diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,137234 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25, + "eval_steps": 500, + "global_step": 196000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5e-05, + "grad_norm": 1.2729493379592896, + "learning_rate": 4.5e-08, + "loss": 1.2644, + "step": 10 + }, + { + "epoch": 0.0001, + "grad_norm": 1.3188505172729492, + "learning_rate": 9.5e-08, + "loss": 1.2658, + "step": 20 + }, + { + "epoch": 0.00015, + "grad_norm": 1.2601258754730225, + "learning_rate": 1.45e-07, + "loss": 1.2636, + "step": 30 + }, + { + "epoch": 0.0002, + "grad_norm": 1.3004794120788574, + "learning_rate": 1.95e-07, + "loss": 1.2627, + "step": 40 + }, + { + "epoch": 0.00025, + "grad_norm": 1.202286720275879, + "learning_rate": 2.45e-07, + "loss": 1.2534, + "step": 50 + }, + { + "epoch": 0.0003, + "grad_norm": 1.0703670978546143, + "learning_rate": 2.95e-07, + "loss": 1.2429, + "step": 60 + }, + { + "epoch": 0.00035, + "grad_norm": 0.9904623031616211, + "learning_rate": 3.4500000000000003e-07, + "loss": 1.223, + "step": 70 + }, + { + "epoch": 0.0004, + "grad_norm": 0.7193464040756226, + "learning_rate": 3.950000000000001e-07, + "loss": 1.2048, + "step": 80 + }, + { + "epoch": 0.00045, + "grad_norm": 0.5247392058372498, + "learning_rate": 4.4500000000000003e-07, + "loss": 1.1592, + "step": 90 + }, + { + "epoch": 0.0005, + "grad_norm": 0.41689035296440125, + "learning_rate": 4.95e-07, + "loss": 1.15, + "step": 100 + }, + { + "epoch": 0.00055, + "grad_norm": 0.29279381036758423, + "learning_rate": 5.450000000000001e-07, + "loss": 1.1358, + "step": 110 + }, + { + "epoch": 0.0006, + "grad_norm": 0.17133663594722748, + "learning_rate": 5.95e-07, + "loss": 1.128, + "step": 120 + }, + { + "epoch": 0.00065, + "grad_norm": 0.16452327370643616, + "learning_rate": 6.450000000000001e-07, + "loss": 1.1251, + "step": 130 + }, + { + "epoch": 0.0007, + "grad_norm": 0.09243213385343552, + "learning_rate": 6.95e-07, + "loss": 1.1193, + "step": 140 + }, + { + "epoch": 0.00075, + "grad_norm": 0.09495005011558533, + "learning_rate": 7.450000000000001e-07, + "loss": 1.1182, + "step": 150 + }, + { + "epoch": 0.0008, + "grad_norm": 0.06381222605705261, + "learning_rate": 7.950000000000001e-07, + "loss": 1.1163, + "step": 160 + }, + { + "epoch": 0.00085, + "grad_norm": 0.07385324686765671, + "learning_rate": 8.45e-07, + "loss": 1.1156, + "step": 170 + }, + { + "epoch": 0.0009, + "grad_norm": 0.052240073680877686, + "learning_rate": 8.95e-07, + "loss": 1.1142, + "step": 180 + }, + { + "epoch": 0.00095, + "grad_norm": 0.07249893993139267, + "learning_rate": 9.450000000000001e-07, + "loss": 1.1135, + "step": 190 + }, + { + "epoch": 0.001, + "grad_norm": 0.05016590654850006, + "learning_rate": 9.95e-07, + "loss": 1.1123, + "step": 200 + }, + { + "epoch": 0.00105, + "grad_norm": 0.0813603475689888, + "learning_rate": 1.045e-06, + "loss": 1.1118, + "step": 210 + }, + { + "epoch": 0.0011, + "grad_norm": 0.04910163953900337, + "learning_rate": 1.095e-06, + "loss": 1.1108, + "step": 220 + }, + { + "epoch": 0.00115, + "grad_norm": 0.07322562485933304, + "learning_rate": 1.145e-06, + "loss": 1.1103, + "step": 230 + }, + { + "epoch": 0.0012, + "grad_norm": 0.05893868952989578, + "learning_rate": 1.1950000000000002e-06, + "loss": 1.1098, + "step": 240 + }, + { + "epoch": 0.00125, + "grad_norm": 0.06626304984092712, + "learning_rate": 1.245e-06, + "loss": 1.11, + "step": 250 + }, + { + "epoch": 0.0013, + "grad_norm": 0.05874939262866974, + "learning_rate": 1.295e-06, + "loss": 1.1075, + "step": 260 + }, + { + "epoch": 0.00135, + "grad_norm": 0.052276477217674255, + "learning_rate": 1.345e-06, + "loss": 1.1041, + "step": 270 + }, + { + "epoch": 0.0014, + "grad_norm": 0.06827472150325775, + "learning_rate": 1.3950000000000002e-06, + "loss": 1.1035, + "step": 280 + }, + { + "epoch": 0.00145, + "grad_norm": 0.04688436537981033, + "learning_rate": 1.445e-06, + "loss": 1.1028, + "step": 290 + }, + { + "epoch": 0.0015, + "grad_norm": 0.061367060989141464, + "learning_rate": 1.495e-06, + "loss": 1.1027, + "step": 300 + }, + { + "epoch": 0.00155, + "grad_norm": 0.05163731053471565, + "learning_rate": 1.545e-06, + "loss": 1.1019, + "step": 310 + }, + { + "epoch": 0.0016, + "grad_norm": 0.04336206987500191, + "learning_rate": 1.595e-06, + "loss": 1.1012, + "step": 320 + }, + { + "epoch": 0.00165, + "grad_norm": 0.04618152230978012, + "learning_rate": 1.645e-06, + "loss": 1.0994, + "step": 330 + }, + { + "epoch": 0.0017, + "grad_norm": 0.04556488245725632, + "learning_rate": 1.695e-06, + "loss": 1.0989, + "step": 340 + }, + { + "epoch": 0.00175, + "grad_norm": 0.04560953006148338, + "learning_rate": 1.745e-06, + "loss": 1.0986, + "step": 350 + }, + { + "epoch": 0.0018, + "grad_norm": 0.05721442773938179, + "learning_rate": 1.7950000000000002e-06, + "loss": 1.0982, + "step": 360 + }, + { + "epoch": 0.00185, + "grad_norm": 0.045500755310058594, + "learning_rate": 1.8450000000000001e-06, + "loss": 1.099, + "step": 370 + }, + { + "epoch": 0.0019, + "grad_norm": 0.04730851575732231, + "learning_rate": 1.8950000000000003e-06, + "loss": 1.0996, + "step": 380 + }, + { + "epoch": 0.00195, + "grad_norm": 0.0756210908293724, + "learning_rate": 1.945e-06, + "loss": 1.098, + "step": 390 + }, + { + "epoch": 0.002, + "grad_norm": 0.043138016015291214, + "learning_rate": 1.995e-06, + "loss": 1.0959, + "step": 400 + }, + { + "epoch": 0.00205, + "grad_norm": 0.055947400629520416, + "learning_rate": 2.045e-06, + "loss": 1.095, + "step": 410 + }, + { + "epoch": 0.0021, + "grad_norm": 0.0793597623705864, + "learning_rate": 2.0950000000000003e-06, + "loss": 1.0927, + "step": 420 + }, + { + "epoch": 0.00215, + "grad_norm": 0.06133449450135231, + "learning_rate": 2.1450000000000002e-06, + "loss": 1.0914, + "step": 430 + }, + { + "epoch": 0.0022, + "grad_norm": 0.238010436296463, + "learning_rate": 2.195e-06, + "loss": 1.0899, + "step": 440 + }, + { + "epoch": 0.00225, + "grad_norm": 0.161293625831604, + "learning_rate": 2.245e-06, + "loss": 1.088, + "step": 450 + }, + { + "epoch": 0.0023, + "grad_norm": 0.4534534811973572, + "learning_rate": 2.2950000000000005e-06, + "loss": 1.0862, + "step": 460 + }, + { + "epoch": 0.00235, + "grad_norm": 0.13972555100917816, + "learning_rate": 2.345e-06, + "loss": 1.0843, + "step": 470 + }, + { + "epoch": 0.0024, + "grad_norm": 0.5878572463989258, + "learning_rate": 2.395e-06, + "loss": 1.0831, + "step": 480 + }, + { + "epoch": 0.00245, + "grad_norm": 1.0253840684890747, + "learning_rate": 2.445e-06, + "loss": 1.0819, + "step": 490 + }, + { + "epoch": 0.0025, + "grad_norm": 0.5768185257911682, + "learning_rate": 2.4950000000000003e-06, + "loss": 1.0797, + "step": 500 + }, + { + "epoch": 0.00255, + "grad_norm": 0.7738291025161743, + "learning_rate": 2.545e-06, + "loss": 1.077, + "step": 510 + }, + { + "epoch": 0.0026, + "grad_norm": 0.8719862103462219, + "learning_rate": 2.595e-06, + "loss": 1.0732, + "step": 520 + }, + { + "epoch": 0.00265, + "grad_norm": 1.0189906358718872, + "learning_rate": 2.645e-06, + "loss": 1.07, + "step": 530 + }, + { + "epoch": 0.0027, + "grad_norm": 0.7507759928703308, + "learning_rate": 2.6950000000000005e-06, + "loss": 1.0668, + "step": 540 + }, + { + "epoch": 0.00275, + "grad_norm": 0.181038960814476, + "learning_rate": 2.745e-06, + "loss": 1.059, + "step": 550 + }, + { + "epoch": 0.0028, + "grad_norm": 0.3252950608730316, + "learning_rate": 2.795e-06, + "loss": 1.0507, + "step": 560 + }, + { + "epoch": 0.00285, + "grad_norm": 0.8660540580749512, + "learning_rate": 2.8450000000000003e-06, + "loss": 1.04, + "step": 570 + }, + { + "epoch": 0.0029, + "grad_norm": 0.36427515745162964, + "learning_rate": 2.8950000000000002e-06, + "loss": 1.0302, + "step": 580 + }, + { + "epoch": 0.00295, + "grad_norm": 0.6166048049926758, + "learning_rate": 2.945e-06, + "loss": 1.0182, + "step": 590 + }, + { + "epoch": 0.003, + "grad_norm": 1.1126497983932495, + "learning_rate": 2.995e-06, + "loss": 0.9998, + "step": 600 + }, + { + "epoch": 0.00305, + "grad_norm": 0.7704527974128723, + "learning_rate": 3.0450000000000005e-06, + "loss": 0.9803, + "step": 610 + }, + { + "epoch": 0.0031, + "grad_norm": 1.032020926475525, + "learning_rate": 3.095e-06, + "loss": 0.9623, + "step": 620 + }, + { + "epoch": 0.00315, + "grad_norm": 0.41123878955841064, + "learning_rate": 3.145e-06, + "loss": 0.9493, + "step": 630 + }, + { + "epoch": 0.0032, + "grad_norm": 0.44400614500045776, + "learning_rate": 3.195e-06, + "loss": 0.9173, + "step": 640 + }, + { + "epoch": 0.00325, + "grad_norm": 0.7372226119041443, + "learning_rate": 3.2450000000000003e-06, + "loss": 0.8879, + "step": 650 + }, + { + "epoch": 0.0033, + "grad_norm": 0.463133841753006, + "learning_rate": 3.2950000000000002e-06, + "loss": 0.8666, + "step": 660 + }, + { + "epoch": 0.00335, + "grad_norm": 0.5725065469741821, + "learning_rate": 3.345e-06, + "loss": 0.8306, + "step": 670 + }, + { + "epoch": 0.0034, + "grad_norm": 0.36958637833595276, + "learning_rate": 3.395e-06, + "loss": 0.8081, + "step": 680 + }, + { + "epoch": 0.00345, + "grad_norm": 0.5271418690681458, + "learning_rate": 3.4450000000000005e-06, + "loss": 0.7744, + "step": 690 + }, + { + "epoch": 0.0035, + "grad_norm": 0.48186737298965454, + "learning_rate": 3.4950000000000004e-06, + "loss": 0.752, + "step": 700 + }, + { + "epoch": 0.00355, + "grad_norm": 0.7156515717506409, + "learning_rate": 3.5450000000000004e-06, + "loss": 0.7249, + "step": 710 + }, + { + "epoch": 0.0036, + "grad_norm": 0.773932695388794, + "learning_rate": 3.5950000000000003e-06, + "loss": 0.7132, + "step": 720 + }, + { + "epoch": 0.00365, + "grad_norm": 0.5371299386024475, + "learning_rate": 3.6450000000000007e-06, + "loss": 0.6724, + "step": 730 + }, + { + "epoch": 0.0037, + "grad_norm": 0.579585075378418, + "learning_rate": 3.6949999999999998e-06, + "loss": 0.656, + "step": 740 + }, + { + "epoch": 0.00375, + "grad_norm": 0.6134310364723206, + "learning_rate": 3.7449999999999997e-06, + "loss": 0.6222, + "step": 750 + }, + { + "epoch": 0.0038, + "grad_norm": 0.4206959307193756, + "learning_rate": 3.795e-06, + "loss": 0.5999, + "step": 760 + }, + { + "epoch": 0.00385, + "grad_norm": 0.5062888860702515, + "learning_rate": 3.845e-06, + "loss": 0.5768, + "step": 770 + }, + { + "epoch": 0.0039, + "grad_norm": 0.470325767993927, + "learning_rate": 3.895e-06, + "loss": 0.5585, + "step": 780 + }, + { + "epoch": 0.00395, + "grad_norm": 0.543121874332428, + "learning_rate": 3.945e-06, + "loss": 0.5445, + "step": 790 + }, + { + "epoch": 0.004, + "grad_norm": 0.5184602737426758, + "learning_rate": 3.995e-06, + "loss": 0.5076, + "step": 800 + }, + { + "epoch": 0.00405, + "grad_norm": 0.6378428339958191, + "learning_rate": 4.045e-06, + "loss": 0.5173, + "step": 810 + }, + { + "epoch": 0.0041, + "grad_norm": 0.5006017684936523, + "learning_rate": 4.095000000000001e-06, + "loss": 0.4775, + "step": 820 + }, + { + "epoch": 0.00415, + "grad_norm": 0.4863206744194031, + "learning_rate": 4.145e-06, + "loss": 0.4889, + "step": 830 + }, + { + "epoch": 0.0042, + "grad_norm": 0.49730730056762695, + "learning_rate": 4.1950000000000005e-06, + "loss": 0.4476, + "step": 840 + }, + { + "epoch": 0.00425, + "grad_norm": 0.7628925442695618, + "learning_rate": 4.245e-06, + "loss": 0.4368, + "step": 850 + }, + { + "epoch": 0.0043, + "grad_norm": 0.49417269229888916, + "learning_rate": 4.295e-06, + "loss": 0.4226, + "step": 860 + }, + { + "epoch": 0.00435, + "grad_norm": 0.6243632435798645, + "learning_rate": 4.345000000000001e-06, + "loss": 0.4138, + "step": 870 + }, + { + "epoch": 0.0044, + "grad_norm": 0.4828900396823883, + "learning_rate": 4.395e-06, + "loss": 0.398, + "step": 880 + }, + { + "epoch": 0.00445, + "grad_norm": 0.7151875495910645, + "learning_rate": 4.445000000000001e-06, + "loss": 0.3762, + "step": 890 + }, + { + "epoch": 0.0045, + "grad_norm": 0.44865331053733826, + "learning_rate": 4.495e-06, + "loss": 0.3935, + "step": 900 + }, + { + "epoch": 0.00455, + "grad_norm": 0.57523113489151, + "learning_rate": 4.545e-06, + "loss": 0.3625, + "step": 910 + }, + { + "epoch": 0.0046, + "grad_norm": 0.4795694053173065, + "learning_rate": 4.595e-06, + "loss": 0.3682, + "step": 920 + }, + { + "epoch": 0.00465, + "grad_norm": 0.8969322443008423, + "learning_rate": 4.645e-06, + "loss": 0.37, + "step": 930 + }, + { + "epoch": 0.0047, + "grad_norm": 0.6106979250907898, + "learning_rate": 4.695e-06, + "loss": 0.3536, + "step": 940 + }, + { + "epoch": 0.00475, + "grad_norm": 0.4854564666748047, + "learning_rate": 4.745e-06, + "loss": 0.3444, + "step": 950 + }, + { + "epoch": 0.0048, + "grad_norm": 0.5885597467422485, + "learning_rate": 4.795e-06, + "loss": 0.328, + "step": 960 + }, + { + "epoch": 0.00485, + "grad_norm": 0.5267831087112427, + "learning_rate": 4.845e-06, + "loss": 0.3361, + "step": 970 + }, + { + "epoch": 0.0049, + "grad_norm": 0.6329275369644165, + "learning_rate": 4.8950000000000006e-06, + "loss": 0.3166, + "step": 980 + }, + { + "epoch": 0.00495, + "grad_norm": 0.7038520574569702, + "learning_rate": 4.945e-06, + "loss": 0.3146, + "step": 990 + }, + { + "epoch": 0.005, + "grad_norm": 0.9382172226905823, + "learning_rate": 4.9950000000000005e-06, + "loss": 0.3026, + "step": 1000 + }, + { + "epoch": 5e-05, + "grad_norm": 0.5049734711647034, + "learning_rate": 5.045000000000001e-06, + "loss": 0.3008, + "step": 1010 + }, + { + "epoch": 0.0001, + "grad_norm": 0.4181530773639679, + "learning_rate": 5.095e-06, + "loss": 0.285, + "step": 1020 + }, + { + "epoch": 0.00015, + "grad_norm": 0.5890700221061707, + "learning_rate": 5.145000000000001e-06, + "loss": 0.2802, + "step": 1030 + }, + { + "epoch": 0.0002, + "grad_norm": 0.7265661358833313, + "learning_rate": 5.195e-06, + "loss": 0.2882, + "step": 1040 + }, + { + "epoch": 0.00025, + "grad_norm": 0.47236159443855286, + "learning_rate": 5.245e-06, + "loss": 0.2769, + "step": 1050 + }, + { + "epoch": 0.0003, + "grad_norm": 0.8550785779953003, + "learning_rate": 5.295e-06, + "loss": 0.272, + "step": 1060 + }, + { + "epoch": 0.00035, + "grad_norm": 0.5752397179603577, + "learning_rate": 5.345e-06, + "loss": 0.2686, + "step": 1070 + }, + { + "epoch": 0.0004, + "grad_norm": 0.40073686838150024, + "learning_rate": 5.395e-06, + "loss": 0.292, + "step": 1080 + }, + { + "epoch": 0.00045, + "grad_norm": 0.623715877532959, + "learning_rate": 5.445e-06, + "loss": 0.2762, + "step": 1090 + }, + { + "epoch": 0.0005, + "grad_norm": 0.8721001744270325, + "learning_rate": 5.495e-06, + "loss": 0.2733, + "step": 1100 + }, + { + "epoch": 0.00055, + "grad_norm": 0.4661286473274231, + "learning_rate": 5.545e-06, + "loss": 0.27, + "step": 1110 + }, + { + "epoch": 0.0006, + "grad_norm": 0.38948312401771545, + "learning_rate": 5.595000000000001e-06, + "loss": 0.257, + "step": 1120 + }, + { + "epoch": 0.00065, + "grad_norm": 0.3550063967704773, + "learning_rate": 5.645e-06, + "loss": 0.2786, + "step": 1130 + }, + { + "epoch": 0.0007, + "grad_norm": 0.35984066128730774, + "learning_rate": 5.6950000000000005e-06, + "loss": 0.2508, + "step": 1140 + }, + { + "epoch": 0.00075, + "grad_norm": 0.7620972990989685, + "learning_rate": 5.745e-06, + "loss": 0.2399, + "step": 1150 + }, + { + "epoch": 0.0008, + "grad_norm": 0.8049952983856201, + "learning_rate": 5.795e-06, + "loss": 0.2466, + "step": 1160 + }, + { + "epoch": 0.00085, + "grad_norm": 0.5977007150650024, + "learning_rate": 5.845000000000001e-06, + "loss": 0.2481, + "step": 1170 + }, + { + "epoch": 0.0009, + "grad_norm": 0.6251347661018372, + "learning_rate": 5.895e-06, + "loss": 0.2506, + "step": 1180 + }, + { + "epoch": 0.00095, + "grad_norm": 0.7045720219612122, + "learning_rate": 5.945000000000001e-06, + "loss": 0.2341, + "step": 1190 + }, + { + "epoch": 0.001, + "grad_norm": 0.45146504044532776, + "learning_rate": 5.995e-06, + "loss": 0.2406, + "step": 1200 + }, + { + "epoch": 0.00105, + "grad_norm": 0.8330421447753906, + "learning_rate": 6.045e-06, + "loss": 0.2667, + "step": 1210 + }, + { + "epoch": 0.0011, + "grad_norm": 0.3268861174583435, + "learning_rate": 6.095e-06, + "loss": 0.2388, + "step": 1220 + }, + { + "epoch": 0.00115, + "grad_norm": 0.45664694905281067, + "learning_rate": 6.1450000000000005e-06, + "loss": 0.2444, + "step": 1230 + }, + { + "epoch": 0.0012, + "grad_norm": 0.48323923349380493, + "learning_rate": 6.195e-06, + "loss": 0.2396, + "step": 1240 + }, + { + "epoch": 0.00125, + "grad_norm": 0.5909716486930847, + "learning_rate": 6.245e-06, + "loss": 0.2285, + "step": 1250 + }, + { + "epoch": 0.0013, + "grad_norm": 0.3466012477874756, + "learning_rate": 6.295000000000001e-06, + "loss": 0.2328, + "step": 1260 + }, + { + "epoch": 0.00135, + "grad_norm": 0.5698915719985962, + "learning_rate": 6.345000000000001e-06, + "loss": 0.2228, + "step": 1270 + }, + { + "epoch": 0.0014, + "grad_norm": 0.5685576796531677, + "learning_rate": 6.395000000000001e-06, + "loss": 0.2153, + "step": 1280 + }, + { + "epoch": 0.00145, + "grad_norm": 0.30952244997024536, + "learning_rate": 6.444999999999999e-06, + "loss": 0.219, + "step": 1290 + }, + { + "epoch": 0.0015, + "grad_norm": 0.41652658581733704, + "learning_rate": 6.495e-06, + "loss": 0.2272, + "step": 1300 + }, + { + "epoch": 0.00155, + "grad_norm": 0.43570876121520996, + "learning_rate": 6.545e-06, + "loss": 0.2158, + "step": 1310 + }, + { + "epoch": 0.0016, + "grad_norm": 0.3096759617328644, + "learning_rate": 6.5949999999999995e-06, + "loss": 0.2184, + "step": 1320 + }, + { + "epoch": 0.00165, + "grad_norm": 0.383800208568573, + "learning_rate": 6.645e-06, + "loss": 0.215, + "step": 1330 + }, + { + "epoch": 0.0017, + "grad_norm": 0.3670880198478699, + "learning_rate": 6.695e-06, + "loss": 0.2012, + "step": 1340 + }, + { + "epoch": 0.00175, + "grad_norm": 0.4803926944732666, + "learning_rate": 6.745e-06, + "loss": 0.2218, + "step": 1350 + }, + { + "epoch": 0.0018, + "grad_norm": 0.3520296812057495, + "learning_rate": 6.795e-06, + "loss": 0.2182, + "step": 1360 + }, + { + "epoch": 0.00185, + "grad_norm": 0.5916102528572083, + "learning_rate": 6.845e-06, + "loss": 0.2272, + "step": 1370 + }, + { + "epoch": 0.0019, + "grad_norm": 0.46707865595817566, + "learning_rate": 6.895e-06, + "loss": 0.2129, + "step": 1380 + }, + { + "epoch": 0.00195, + "grad_norm": 0.4104538559913635, + "learning_rate": 6.945e-06, + "loss": 0.2168, + "step": 1390 + }, + { + "epoch": 0.002, + "grad_norm": 0.8709113001823425, + "learning_rate": 6.995e-06, + "loss": 0.203, + "step": 1400 + }, + { + "epoch": 0.00205, + "grad_norm": 0.4051400125026703, + "learning_rate": 7.045e-06, + "loss": 0.2104, + "step": 1410 + }, + { + "epoch": 0.0021, + "grad_norm": 0.6250728368759155, + "learning_rate": 7.095000000000001e-06, + "loss": 0.1993, + "step": 1420 + }, + { + "epoch": 0.00215, + "grad_norm": 0.4261170029640198, + "learning_rate": 7.145e-06, + "loss": 0.2033, + "step": 1430 + }, + { + "epoch": 0.0022, + "grad_norm": 0.6586616635322571, + "learning_rate": 7.1950000000000006e-06, + "loss": 0.1927, + "step": 1440 + }, + { + "epoch": 0.00225, + "grad_norm": 0.2842698395252228, + "learning_rate": 7.245e-06, + "loss": 0.1917, + "step": 1450 + }, + { + "epoch": 0.0023, + "grad_norm": 0.4493279457092285, + "learning_rate": 7.2950000000000005e-06, + "loss": 0.1979, + "step": 1460 + }, + { + "epoch": 0.00235, + "grad_norm": 0.7227078676223755, + "learning_rate": 7.345000000000001e-06, + "loss": 0.2076, + "step": 1470 + }, + { + "epoch": 0.0024, + "grad_norm": 0.5697221755981445, + "learning_rate": 7.395e-06, + "loss": 0.185, + "step": 1480 + }, + { + "epoch": 0.00245, + "grad_norm": 0.5880600214004517, + "learning_rate": 7.445000000000001e-06, + "loss": 0.2147, + "step": 1490 + }, + { + "epoch": 0.0025, + "grad_norm": 0.4969238042831421, + "learning_rate": 7.495e-06, + "loss": 0.2049, + "step": 1500 + }, + { + "epoch": 0.00255, + "grad_norm": 0.7584699988365173, + "learning_rate": 7.545000000000001e-06, + "loss": 0.1988, + "step": 1510 + }, + { + "epoch": 0.0026, + "grad_norm": 0.643905758857727, + "learning_rate": 7.595000000000001e-06, + "loss": 0.1977, + "step": 1520 + }, + { + "epoch": 0.00265, + "grad_norm": 0.380588561296463, + "learning_rate": 7.645e-06, + "loss": 0.2, + "step": 1530 + }, + { + "epoch": 0.0027, + "grad_norm": 0.5197119116783142, + "learning_rate": 7.695e-06, + "loss": 0.1853, + "step": 1540 + }, + { + "epoch": 0.00275, + "grad_norm": 0.6001980900764465, + "learning_rate": 7.745000000000001e-06, + "loss": 0.1974, + "step": 1550 + }, + { + "epoch": 0.0028, + "grad_norm": 0.32734277844429016, + "learning_rate": 7.795e-06, + "loss": 0.1848, + "step": 1560 + }, + { + "epoch": 0.00285, + "grad_norm": 0.39121389389038086, + "learning_rate": 7.845e-06, + "loss": 0.186, + "step": 1570 + }, + { + "epoch": 0.0029, + "grad_norm": 0.4204869568347931, + "learning_rate": 7.895000000000001e-06, + "loss": 0.1922, + "step": 1580 + }, + { + "epoch": 0.00295, + "grad_norm": 0.5324739813804626, + "learning_rate": 7.945000000000001e-06, + "loss": 0.1857, + "step": 1590 + }, + { + "epoch": 0.003, + "grad_norm": 0.6414231657981873, + "learning_rate": 7.995e-06, + "loss": 0.192, + "step": 1600 + }, + { + "epoch": 0.00305, + "grad_norm": 0.8036292195320129, + "learning_rate": 8.045e-06, + "loss": 0.1826, + "step": 1610 + }, + { + "epoch": 0.0031, + "grad_norm": 0.697265625, + "learning_rate": 8.095e-06, + "loss": 0.1863, + "step": 1620 + }, + { + "epoch": 0.00315, + "grad_norm": 0.519187867641449, + "learning_rate": 8.144999999999999e-06, + "loss": 0.1984, + "step": 1630 + }, + { + "epoch": 0.0032, + "grad_norm": 0.4846292734146118, + "learning_rate": 8.195e-06, + "loss": 0.1886, + "step": 1640 + }, + { + "epoch": 0.00325, + "grad_norm": 0.4025842547416687, + "learning_rate": 8.245e-06, + "loss": 0.183, + "step": 1650 + }, + { + "epoch": 0.0033, + "grad_norm": 0.4674290120601654, + "learning_rate": 8.295e-06, + "loss": 0.1735, + "step": 1660 + }, + { + "epoch": 0.00335, + "grad_norm": 0.482946515083313, + "learning_rate": 8.345e-06, + "loss": 0.1792, + "step": 1670 + }, + { + "epoch": 0.0034, + "grad_norm": 0.5781715512275696, + "learning_rate": 8.395e-06, + "loss": 0.1769, + "step": 1680 + }, + { + "epoch": 0.00345, + "grad_norm": 0.34087538719177246, + "learning_rate": 8.445e-06, + "loss": 0.1768, + "step": 1690 + }, + { + "epoch": 0.0035, + "grad_norm": 0.5387849807739258, + "learning_rate": 8.495e-06, + "loss": 0.1705, + "step": 1700 + }, + { + "epoch": 0.00355, + "grad_norm": 0.3808085024356842, + "learning_rate": 8.545e-06, + "loss": 0.1821, + "step": 1710 + }, + { + "epoch": 0.0036, + "grad_norm": 0.3419106602668762, + "learning_rate": 8.595e-06, + "loss": 0.1702, + "step": 1720 + }, + { + "epoch": 0.00365, + "grad_norm": 0.5243216156959534, + "learning_rate": 8.645000000000001e-06, + "loss": 0.1826, + "step": 1730 + }, + { + "epoch": 0.0037, + "grad_norm": 0.4391971826553345, + "learning_rate": 8.695e-06, + "loss": 0.1635, + "step": 1740 + }, + { + "epoch": 0.00375, + "grad_norm": 0.4228607714176178, + "learning_rate": 8.745e-06, + "loss": 0.1705, + "step": 1750 + }, + { + "epoch": 0.0038, + "grad_norm": 0.6283707022666931, + "learning_rate": 8.795e-06, + "loss": 0.1647, + "step": 1760 + }, + { + "epoch": 0.00385, + "grad_norm": 0.5195528864860535, + "learning_rate": 8.845000000000001e-06, + "loss": 0.1706, + "step": 1770 + }, + { + "epoch": 0.0039, + "grad_norm": 0.6638985276222229, + "learning_rate": 8.895e-06, + "loss": 0.169, + "step": 1780 + }, + { + "epoch": 0.00395, + "grad_norm": 0.6162652969360352, + "learning_rate": 8.945e-06, + "loss": 0.1811, + "step": 1790 + }, + { + "epoch": 0.004, + "grad_norm": 0.6614283919334412, + "learning_rate": 8.995000000000001e-06, + "loss": 0.1776, + "step": 1800 + }, + { + "epoch": 0.00405, + "grad_norm": 0.38888028264045715, + "learning_rate": 9.045e-06, + "loss": 0.1795, + "step": 1810 + }, + { + "epoch": 0.0041, + "grad_norm": 0.4616861045360565, + "learning_rate": 9.095e-06, + "loss": 0.1729, + "step": 1820 + }, + { + "epoch": 0.00415, + "grad_norm": 0.7028610706329346, + "learning_rate": 9.145000000000001e-06, + "loss": 0.1695, + "step": 1830 + }, + { + "epoch": 0.0042, + "grad_norm": 0.5122510194778442, + "learning_rate": 9.195000000000001e-06, + "loss": 0.1696, + "step": 1840 + }, + { + "epoch": 0.00425, + "grad_norm": 0.411853164434433, + "learning_rate": 9.245e-06, + "loss": 0.1583, + "step": 1850 + }, + { + "epoch": 0.0043, + "grad_norm": 0.41692647337913513, + "learning_rate": 9.295000000000002e-06, + "loss": 0.1699, + "step": 1860 + }, + { + "epoch": 0.00435, + "grad_norm": 0.42551809549331665, + "learning_rate": 9.345000000000001e-06, + "loss": 0.1689, + "step": 1870 + }, + { + "epoch": 0.0044, + "grad_norm": 0.587212860584259, + "learning_rate": 9.395e-06, + "loss": 0.1625, + "step": 1880 + }, + { + "epoch": 0.00445, + "grad_norm": 1.0188299417495728, + "learning_rate": 9.445000000000002e-06, + "loss": 0.1639, + "step": 1890 + }, + { + "epoch": 0.0045, + "grad_norm": 0.5616925954818726, + "learning_rate": 9.495000000000001e-06, + "loss": 0.1605, + "step": 1900 + }, + { + "epoch": 0.00455, + "grad_norm": 0.7324318885803223, + "learning_rate": 9.545e-06, + "loss": 0.1598, + "step": 1910 + }, + { + "epoch": 0.0046, + "grad_norm": 0.6417198181152344, + "learning_rate": 9.595e-06, + "loss": 0.1561, + "step": 1920 + }, + { + "epoch": 0.00465, + "grad_norm": 0.4558139443397522, + "learning_rate": 9.645e-06, + "loss": 0.1637, + "step": 1930 + }, + { + "epoch": 0.0047, + "grad_norm": 0.6513067483901978, + "learning_rate": 9.695e-06, + "loss": 0.1573, + "step": 1940 + }, + { + "epoch": 0.00475, + "grad_norm": 0.5894631147384644, + "learning_rate": 9.745e-06, + "loss": 0.1725, + "step": 1950 + }, + { + "epoch": 0.0048, + "grad_norm": 0.4687117040157318, + "learning_rate": 9.795e-06, + "loss": 0.1565, + "step": 1960 + }, + { + "epoch": 0.00485, + "grad_norm": 0.9609802961349487, + "learning_rate": 9.845e-06, + "loss": 0.1537, + "step": 1970 + }, + { + "epoch": 0.0049, + "grad_norm": 0.6374800801277161, + "learning_rate": 9.895e-06, + "loss": 0.1734, + "step": 1980 + }, + { + "epoch": 0.00495, + "grad_norm": 0.659702718257904, + "learning_rate": 9.945e-06, + "loss": 0.1631, + "step": 1990 + }, + { + "epoch": 0.005, + "grad_norm": 0.566866397857666, + "learning_rate": 9.995e-06, + "loss": 0.1729, + "step": 2000 + }, + { + "epoch": 0.00505, + "grad_norm": 0.575867235660553, + "learning_rate": 1.0045e-05, + "loss": 0.1608, + "step": 2010 + }, + { + "epoch": 0.0051, + "grad_norm": 0.5216620564460754, + "learning_rate": 1.0095e-05, + "loss": 0.1549, + "step": 2020 + }, + { + "epoch": 0.00515, + "grad_norm": 0.4505414664745331, + "learning_rate": 1.0145e-05, + "loss": 0.1666, + "step": 2030 + }, + { + "epoch": 0.0052, + "grad_norm": 0.47155532240867615, + "learning_rate": 1.0195e-05, + "loss": 0.156, + "step": 2040 + }, + { + "epoch": 0.00525, + "grad_norm": 0.499719500541687, + "learning_rate": 1.0245000000000001e-05, + "loss": 0.1593, + "step": 2050 + }, + { + "epoch": 0.0053, + "grad_norm": 0.5323207974433899, + "learning_rate": 1.0295e-05, + "loss": 0.1552, + "step": 2060 + }, + { + "epoch": 0.00535, + "grad_norm": 0.48722875118255615, + "learning_rate": 1.0345e-05, + "loss": 0.156, + "step": 2070 + }, + { + "epoch": 0.0054, + "grad_norm": 0.4714648425579071, + "learning_rate": 1.0395000000000001e-05, + "loss": 0.1562, + "step": 2080 + }, + { + "epoch": 0.00545, + "grad_norm": 0.5482438206672668, + "learning_rate": 1.0445e-05, + "loss": 0.1521, + "step": 2090 + }, + { + "epoch": 0.0055, + "grad_norm": 0.5565738081932068, + "learning_rate": 1.0495e-05, + "loss": 0.1519, + "step": 2100 + }, + { + "epoch": 0.00555, + "grad_norm": 0.6969251036643982, + "learning_rate": 1.0545000000000002e-05, + "loss": 0.1658, + "step": 2110 + }, + { + "epoch": 0.0056, + "grad_norm": 0.47207051515579224, + "learning_rate": 1.0595000000000001e-05, + "loss": 0.1571, + "step": 2120 + }, + { + "epoch": 0.00565, + "grad_norm": 0.6089038252830505, + "learning_rate": 1.0645e-05, + "loss": 0.1522, + "step": 2130 + }, + { + "epoch": 0.0057, + "grad_norm": 0.5770543217658997, + "learning_rate": 1.0695e-05, + "loss": 0.1515, + "step": 2140 + }, + { + "epoch": 0.00575, + "grad_norm": 0.7493625283241272, + "learning_rate": 1.0745000000000001e-05, + "loss": 0.1452, + "step": 2150 + }, + { + "epoch": 0.0058, + "grad_norm": 0.6182336807250977, + "learning_rate": 1.0795e-05, + "loss": 0.1439, + "step": 2160 + }, + { + "epoch": 0.00585, + "grad_norm": 0.6780596971511841, + "learning_rate": 1.0845e-05, + "loss": 0.1441, + "step": 2170 + }, + { + "epoch": 0.0059, + "grad_norm": 0.48616987466812134, + "learning_rate": 1.0895000000000002e-05, + "loss": 0.1405, + "step": 2180 + }, + { + "epoch": 0.00595, + "grad_norm": 0.48511484265327454, + "learning_rate": 1.0945000000000001e-05, + "loss": 0.1431, + "step": 2190 + }, + { + "epoch": 0.006, + "grad_norm": 0.5814856886863708, + "learning_rate": 1.0995e-05, + "loss": 0.1495, + "step": 2200 + }, + { + "epoch": 0.00605, + "grad_norm": 0.5197475552558899, + "learning_rate": 1.1045000000000002e-05, + "loss": 0.143, + "step": 2210 + }, + { + "epoch": 0.0061, + "grad_norm": 0.5951067805290222, + "learning_rate": 1.1095e-05, + "loss": 0.1574, + "step": 2220 + }, + { + "epoch": 0.00615, + "grad_norm": 0.5585006475448608, + "learning_rate": 1.1145e-05, + "loss": 0.1489, + "step": 2230 + }, + { + "epoch": 0.0062, + "grad_norm": 0.5013735890388489, + "learning_rate": 1.1195e-05, + "loss": 0.1411, + "step": 2240 + }, + { + "epoch": 0.00625, + "grad_norm": 0.8280708193778992, + "learning_rate": 1.1245e-05, + "loss": 0.1369, + "step": 2250 + }, + { + "epoch": 0.0063, + "grad_norm": 0.48556965589523315, + "learning_rate": 1.1295e-05, + "loss": 0.1497, + "step": 2260 + }, + { + "epoch": 0.00635, + "grad_norm": 0.44708576798439026, + "learning_rate": 1.1345e-05, + "loss": 0.1385, + "step": 2270 + }, + { + "epoch": 0.0064, + "grad_norm": 0.4834116995334625, + "learning_rate": 1.1395e-05, + "loss": 0.1413, + "step": 2280 + }, + { + "epoch": 0.00645, + "grad_norm": 0.5674512386322021, + "learning_rate": 1.1445e-05, + "loss": 0.1512, + "step": 2290 + }, + { + "epoch": 0.0065, + "grad_norm": 0.45389649271965027, + "learning_rate": 1.1495000000000001e-05, + "loss": 0.1555, + "step": 2300 + }, + { + "epoch": 0.00655, + "grad_norm": 0.5489965081214905, + "learning_rate": 1.1545e-05, + "loss": 0.1524, + "step": 2310 + }, + { + "epoch": 0.0066, + "grad_norm": 0.4980386793613434, + "learning_rate": 1.1595e-05, + "loss": 0.1499, + "step": 2320 + }, + { + "epoch": 0.00665, + "grad_norm": 0.6992486715316772, + "learning_rate": 1.1645000000000001e-05, + "loss": 0.1447, + "step": 2330 + }, + { + "epoch": 0.0067, + "grad_norm": 0.4216298460960388, + "learning_rate": 1.1695e-05, + "loss": 0.1414, + "step": 2340 + }, + { + "epoch": 0.00675, + "grad_norm": 0.4966219961643219, + "learning_rate": 1.1745e-05, + "loss": 0.1419, + "step": 2350 + }, + { + "epoch": 0.0068, + "grad_norm": 0.5875610709190369, + "learning_rate": 1.1795e-05, + "loss": 0.1369, + "step": 2360 + }, + { + "epoch": 0.00685, + "grad_norm": 0.910181999206543, + "learning_rate": 1.1845000000000001e-05, + "loss": 0.1439, + "step": 2370 + }, + { + "epoch": 0.0069, + "grad_norm": 0.5701887011528015, + "learning_rate": 1.1895e-05, + "loss": 0.1386, + "step": 2380 + }, + { + "epoch": 0.00695, + "grad_norm": 0.7611976861953735, + "learning_rate": 1.1945e-05, + "loss": 0.1356, + "step": 2390 + }, + { + "epoch": 0.007, + "grad_norm": 0.5266007781028748, + "learning_rate": 1.1995000000000001e-05, + "loss": 0.1357, + "step": 2400 + }, + { + "epoch": 0.00705, + "grad_norm": 0.41671043634414673, + "learning_rate": 1.2045e-05, + "loss": 0.1338, + "step": 2410 + }, + { + "epoch": 0.0071, + "grad_norm": 0.5773289799690247, + "learning_rate": 1.2095e-05, + "loss": 0.1329, + "step": 2420 + }, + { + "epoch": 0.00715, + "grad_norm": 0.5124483108520508, + "learning_rate": 1.2145000000000001e-05, + "loss": 0.1354, + "step": 2430 + }, + { + "epoch": 0.0072, + "grad_norm": 0.6027225255966187, + "learning_rate": 1.2195000000000001e-05, + "loss": 0.1314, + "step": 2440 + }, + { + "epoch": 0.00725, + "grad_norm": 0.5789300203323364, + "learning_rate": 1.2245e-05, + "loss": 0.1377, + "step": 2450 + }, + { + "epoch": 0.0073, + "grad_norm": 0.4970523715019226, + "learning_rate": 1.2295000000000002e-05, + "loss": 0.1434, + "step": 2460 + }, + { + "epoch": 0.00735, + "grad_norm": 0.5187194347381592, + "learning_rate": 1.2345000000000001e-05, + "loss": 0.1349, + "step": 2470 + }, + { + "epoch": 0.0074, + "grad_norm": 0.7817258238792419, + "learning_rate": 1.2395e-05, + "loss": 0.1409, + "step": 2480 + }, + { + "epoch": 0.00745, + "grad_norm": 0.5194874405860901, + "learning_rate": 1.2445e-05, + "loss": 0.1337, + "step": 2490 + }, + { + "epoch": 0.0075, + "grad_norm": 0.47469213604927063, + "learning_rate": 1.2495000000000001e-05, + "loss": 0.1341, + "step": 2500 + }, + { + "epoch": 0.00755, + "grad_norm": 0.5221440196037292, + "learning_rate": 1.2545000000000001e-05, + "loss": 0.1304, + "step": 2510 + }, + { + "epoch": 0.0076, + "grad_norm": 0.4986429214477539, + "learning_rate": 1.2595e-05, + "loss": 0.129, + "step": 2520 + }, + { + "epoch": 0.00765, + "grad_norm": 0.5564189553260803, + "learning_rate": 1.2645000000000002e-05, + "loss": 0.1286, + "step": 2530 + }, + { + "epoch": 0.0077, + "grad_norm": 0.5815272927284241, + "learning_rate": 1.2695000000000001e-05, + "loss": 0.1266, + "step": 2540 + }, + { + "epoch": 0.00775, + "grad_norm": 0.4768863916397095, + "learning_rate": 1.2745e-05, + "loss": 0.1271, + "step": 2550 + }, + { + "epoch": 0.0078, + "grad_norm": 0.5922735333442688, + "learning_rate": 1.2795000000000002e-05, + "loss": 0.1288, + "step": 2560 + }, + { + "epoch": 0.00785, + "grad_norm": 0.8866589069366455, + "learning_rate": 1.2845000000000002e-05, + "loss": 0.1275, + "step": 2570 + }, + { + "epoch": 0.0079, + "grad_norm": 0.5470432639122009, + "learning_rate": 1.2895000000000001e-05, + "loss": 0.1367, + "step": 2580 + }, + { + "epoch": 0.00795, + "grad_norm": 0.45371824502944946, + "learning_rate": 1.2945000000000002e-05, + "loss": 0.1274, + "step": 2590 + }, + { + "epoch": 0.008, + "grad_norm": 0.46164730191230774, + "learning_rate": 1.2995000000000002e-05, + "loss": 0.124, + "step": 2600 + }, + { + "epoch": 0.00805, + "grad_norm": 0.5537484884262085, + "learning_rate": 1.3045000000000001e-05, + "loss": 0.1368, + "step": 2610 + }, + { + "epoch": 0.0081, + "grad_norm": 0.6138302087783813, + "learning_rate": 1.3095000000000003e-05, + "loss": 0.1316, + "step": 2620 + }, + { + "epoch": 0.00815, + "grad_norm": 0.6606197357177734, + "learning_rate": 1.3145000000000002e-05, + "loss": 0.1293, + "step": 2630 + }, + { + "epoch": 0.0082, + "grad_norm": 0.5215857625007629, + "learning_rate": 1.3195000000000002e-05, + "loss": 0.1291, + "step": 2640 + }, + { + "epoch": 0.00825, + "grad_norm": 0.5248004198074341, + "learning_rate": 1.3245000000000001e-05, + "loss": 0.1177, + "step": 2650 + }, + { + "epoch": 0.0083, + "grad_norm": 0.49368399381637573, + "learning_rate": 1.3295000000000002e-05, + "loss": 0.1245, + "step": 2660 + }, + { + "epoch": 0.00835, + "grad_norm": 0.6579505801200867, + "learning_rate": 1.3345000000000002e-05, + "loss": 0.1244, + "step": 2670 + }, + { + "epoch": 0.0084, + "grad_norm": 0.6320722103118896, + "learning_rate": 1.3395000000000001e-05, + "loss": 0.1205, + "step": 2680 + }, + { + "epoch": 0.00845, + "grad_norm": 0.4902285933494568, + "learning_rate": 1.3445e-05, + "loss": 0.1213, + "step": 2690 + }, + { + "epoch": 0.0085, + "grad_norm": 0.6687859296798706, + "learning_rate": 1.3494999999999999e-05, + "loss": 0.1283, + "step": 2700 + }, + { + "epoch": 0.00855, + "grad_norm": 0.5977720022201538, + "learning_rate": 1.3545e-05, + "loss": 0.1281, + "step": 2710 + }, + { + "epoch": 0.0086, + "grad_norm": 0.5987256169319153, + "learning_rate": 1.3595e-05, + "loss": 0.1206, + "step": 2720 + }, + { + "epoch": 0.00865, + "grad_norm": 0.6178602576255798, + "learning_rate": 1.3644999999999999e-05, + "loss": 0.1214, + "step": 2730 + }, + { + "epoch": 0.0087, + "grad_norm": 0.5904673933982849, + "learning_rate": 1.3695e-05, + "loss": 0.1244, + "step": 2740 + }, + { + "epoch": 0.00875, + "grad_norm": 0.7004421353340149, + "learning_rate": 1.3745e-05, + "loss": 0.1252, + "step": 2750 + }, + { + "epoch": 0.0088, + "grad_norm": 0.4697619378566742, + "learning_rate": 1.3795e-05, + "loss": 0.124, + "step": 2760 + }, + { + "epoch": 0.00885, + "grad_norm": 0.5584418773651123, + "learning_rate": 1.3845e-05, + "loss": 0.1318, + "step": 2770 + }, + { + "epoch": 0.0089, + "grad_norm": 0.5180680751800537, + "learning_rate": 1.3895e-05, + "loss": 0.1291, + "step": 2780 + }, + { + "epoch": 0.00895, + "grad_norm": 0.5268616676330566, + "learning_rate": 1.3945e-05, + "loss": 0.13, + "step": 2790 + }, + { + "epoch": 0.009, + "grad_norm": 0.7439182996749878, + "learning_rate": 1.3994999999999999e-05, + "loss": 0.1256, + "step": 2800 + }, + { + "epoch": 0.00905, + "grad_norm": 0.7283898591995239, + "learning_rate": 1.4045e-05, + "loss": 0.1227, + "step": 2810 + }, + { + "epoch": 0.0091, + "grad_norm": 0.6830780506134033, + "learning_rate": 1.4095e-05, + "loss": 0.1164, + "step": 2820 + }, + { + "epoch": 0.00915, + "grad_norm": 0.6911619901657104, + "learning_rate": 1.4145e-05, + "loss": 0.1265, + "step": 2830 + }, + { + "epoch": 0.0092, + "grad_norm": 0.6404842734336853, + "learning_rate": 1.4195e-05, + "loss": 0.1306, + "step": 2840 + }, + { + "epoch": 0.00925, + "grad_norm": 0.4563213586807251, + "learning_rate": 1.4245e-05, + "loss": 0.1255, + "step": 2850 + }, + { + "epoch": 0.0093, + "grad_norm": 0.8771994113922119, + "learning_rate": 1.4295e-05, + "loss": 0.1282, + "step": 2860 + }, + { + "epoch": 0.00935, + "grad_norm": 0.6573876142501831, + "learning_rate": 1.4345e-05, + "loss": 0.129, + "step": 2870 + }, + { + "epoch": 0.0094, + "grad_norm": 0.6409444808959961, + "learning_rate": 1.4395e-05, + "loss": 0.129, + "step": 2880 + }, + { + "epoch": 0.00945, + "grad_norm": 0.5570323467254639, + "learning_rate": 1.4445e-05, + "loss": 0.1206, + "step": 2890 + }, + { + "epoch": 0.0095, + "grad_norm": 0.4876030385494232, + "learning_rate": 1.4495000000000001e-05, + "loss": 0.112, + "step": 2900 + }, + { + "epoch": 0.00955, + "grad_norm": 0.5265260934829712, + "learning_rate": 1.4545e-05, + "loss": 0.121, + "step": 2910 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7372077703475952, + "learning_rate": 1.4595e-05, + "loss": 0.1226, + "step": 2920 + }, + { + "epoch": 0.00965, + "grad_norm": 0.5543709993362427, + "learning_rate": 1.4645e-05, + "loss": 0.1223, + "step": 2930 + }, + { + "epoch": 0.0097, + "grad_norm": 0.6461299061775208, + "learning_rate": 1.4695e-05, + "loss": 0.1272, + "step": 2940 + }, + { + "epoch": 0.00975, + "grad_norm": 0.6065276861190796, + "learning_rate": 1.4745e-05, + "loss": 0.1213, + "step": 2950 + }, + { + "epoch": 0.0098, + "grad_norm": 0.518893837928772, + "learning_rate": 1.4795e-05, + "loss": 0.1231, + "step": 2960 + }, + { + "epoch": 0.00985, + "grad_norm": 0.41890275478363037, + "learning_rate": 1.4845000000000001e-05, + "loss": 0.1227, + "step": 2970 + }, + { + "epoch": 0.0099, + "grad_norm": 0.7018951177597046, + "learning_rate": 1.4895e-05, + "loss": 0.121, + "step": 2980 + }, + { + "epoch": 0.00995, + "grad_norm": 0.5778883695602417, + "learning_rate": 1.4945e-05, + "loss": 0.1131, + "step": 2990 + }, + { + "epoch": 0.01, + "grad_norm": 0.5019136667251587, + "learning_rate": 1.4995000000000001e-05, + "loss": 0.1282, + "step": 3000 + }, + { + "epoch": 0.01005, + "grad_norm": 0.547877311706543, + "learning_rate": 1.5045e-05, + "loss": 0.1088, + "step": 3010 + }, + { + "epoch": 0.0101, + "grad_norm": 0.5180936455726624, + "learning_rate": 1.5095e-05, + "loss": 0.1261, + "step": 3020 + }, + { + "epoch": 0.01015, + "grad_norm": 0.5541945099830627, + "learning_rate": 1.5145000000000002e-05, + "loss": 0.115, + "step": 3030 + }, + { + "epoch": 0.0102, + "grad_norm": 0.4803144037723541, + "learning_rate": 1.5195000000000001e-05, + "loss": 0.1096, + "step": 3040 + }, + { + "epoch": 0.01025, + "grad_norm": 0.5364857912063599, + "learning_rate": 1.5245e-05, + "loss": 0.1177, + "step": 3050 + }, + { + "epoch": 0.0103, + "grad_norm": 0.40823110938072205, + "learning_rate": 1.5295000000000002e-05, + "loss": 0.1219, + "step": 3060 + }, + { + "epoch": 0.01035, + "grad_norm": 0.46944060921669006, + "learning_rate": 1.5345e-05, + "loss": 0.1051, + "step": 3070 + }, + { + "epoch": 0.0104, + "grad_norm": 0.517673909664154, + "learning_rate": 1.5395e-05, + "loss": 0.1183, + "step": 3080 + }, + { + "epoch": 0.01045, + "grad_norm": 0.4942791759967804, + "learning_rate": 1.5445000000000002e-05, + "loss": 0.1111, + "step": 3090 + }, + { + "epoch": 0.0105, + "grad_norm": 0.5908092260360718, + "learning_rate": 1.5495e-05, + "loss": 0.1149, + "step": 3100 + }, + { + "epoch": 0.01055, + "grad_norm": 0.7741053104400635, + "learning_rate": 1.5545e-05, + "loss": 0.1102, + "step": 3110 + }, + { + "epoch": 0.0106, + "grad_norm": 0.6076146364212036, + "learning_rate": 1.5595000000000002e-05, + "loss": 0.1085, + "step": 3120 + }, + { + "epoch": 0.01065, + "grad_norm": 0.6643165349960327, + "learning_rate": 1.5645e-05, + "loss": 0.1132, + "step": 3130 + }, + { + "epoch": 0.0107, + "grad_norm": 0.4119599461555481, + "learning_rate": 1.5695e-05, + "loss": 0.1197, + "step": 3140 + }, + { + "epoch": 0.01075, + "grad_norm": 0.437264621257782, + "learning_rate": 1.5745000000000003e-05, + "loss": 0.1132, + "step": 3150 + }, + { + "epoch": 0.0108, + "grad_norm": 0.44356924295425415, + "learning_rate": 1.5795e-05, + "loss": 0.107, + "step": 3160 + }, + { + "epoch": 0.01085, + "grad_norm": 0.8218526840209961, + "learning_rate": 1.5845e-05, + "loss": 0.1124, + "step": 3170 + }, + { + "epoch": 0.0109, + "grad_norm": 0.6016043424606323, + "learning_rate": 1.5895000000000003e-05, + "loss": 0.1073, + "step": 3180 + }, + { + "epoch": 0.01095, + "grad_norm": 0.5623441934585571, + "learning_rate": 1.5945e-05, + "loss": 0.1037, + "step": 3190 + }, + { + "epoch": 0.011, + "grad_norm": 0.4690852165222168, + "learning_rate": 1.5995000000000002e-05, + "loss": 0.117, + "step": 3200 + }, + { + "epoch": 0.01105, + "grad_norm": 0.5373580455780029, + "learning_rate": 1.6045000000000003e-05, + "loss": 0.1096, + "step": 3210 + }, + { + "epoch": 0.0111, + "grad_norm": 0.6314288973808289, + "learning_rate": 1.6095e-05, + "loss": 0.1077, + "step": 3220 + }, + { + "epoch": 0.01115, + "grad_norm": 0.6033948063850403, + "learning_rate": 1.6145000000000002e-05, + "loss": 0.1107, + "step": 3230 + }, + { + "epoch": 0.0112, + "grad_norm": 0.5538676977157593, + "learning_rate": 1.6195000000000003e-05, + "loss": 0.1038, + "step": 3240 + }, + { + "epoch": 0.01125, + "grad_norm": 0.6222608089447021, + "learning_rate": 1.6245e-05, + "loss": 0.1073, + "step": 3250 + }, + { + "epoch": 0.0113, + "grad_norm": 0.7525963187217712, + "learning_rate": 1.6295000000000002e-05, + "loss": 0.115, + "step": 3260 + }, + { + "epoch": 0.01135, + "grad_norm": 0.6032587885856628, + "learning_rate": 1.6345000000000004e-05, + "loss": 0.1064, + "step": 3270 + }, + { + "epoch": 0.0114, + "grad_norm": 0.5158464908599854, + "learning_rate": 1.6395e-05, + "loss": 0.1027, + "step": 3280 + }, + { + "epoch": 0.01145, + "grad_norm": 0.5689551830291748, + "learning_rate": 1.6445000000000003e-05, + "loss": 0.1138, + "step": 3290 + }, + { + "epoch": 0.0115, + "grad_norm": 0.3941485285758972, + "learning_rate": 1.6495e-05, + "loss": 0.1228, + "step": 3300 + }, + { + "epoch": 0.01155, + "grad_norm": 0.4815775454044342, + "learning_rate": 1.6545e-05, + "loss": 0.1124, + "step": 3310 + }, + { + "epoch": 0.0116, + "grad_norm": 0.6636756658554077, + "learning_rate": 1.6595e-05, + "loss": 0.1066, + "step": 3320 + }, + { + "epoch": 0.01165, + "grad_norm": 0.7142220735549927, + "learning_rate": 1.6645e-05, + "loss": 0.1072, + "step": 3330 + }, + { + "epoch": 0.0117, + "grad_norm": 0.5734850764274597, + "learning_rate": 1.6695e-05, + "loss": 0.1136, + "step": 3340 + }, + { + "epoch": 0.01175, + "grad_norm": 0.4835772216320038, + "learning_rate": 1.6745e-05, + "loss": 0.1061, + "step": 3350 + }, + { + "epoch": 0.0118, + "grad_norm": 0.444543719291687, + "learning_rate": 1.6795e-05, + "loss": 0.109, + "step": 3360 + }, + { + "epoch": 0.01185, + "grad_norm": 0.5176830291748047, + "learning_rate": 1.6845e-05, + "loss": 0.1214, + "step": 3370 + }, + { + "epoch": 0.0119, + "grad_norm": 0.47018465399742126, + "learning_rate": 1.6895e-05, + "loss": 0.1119, + "step": 3380 + }, + { + "epoch": 0.01195, + "grad_norm": 0.41452693939208984, + "learning_rate": 1.6945e-05, + "loss": 0.1116, + "step": 3390 + }, + { + "epoch": 0.012, + "grad_norm": 0.5852661728858948, + "learning_rate": 1.6995e-05, + "loss": 0.1133, + "step": 3400 + }, + { + "epoch": 0.01205, + "grad_norm": 0.8678564429283142, + "learning_rate": 1.7045e-05, + "loss": 0.1123, + "step": 3410 + }, + { + "epoch": 0.0121, + "grad_norm": 0.5551216006278992, + "learning_rate": 1.7095e-05, + "loss": 0.1061, + "step": 3420 + }, + { + "epoch": 0.01215, + "grad_norm": 0.6764218807220459, + "learning_rate": 1.7145e-05, + "loss": 0.1097, + "step": 3430 + }, + { + "epoch": 0.0122, + "grad_norm": 0.623274564743042, + "learning_rate": 1.7195e-05, + "loss": 0.1074, + "step": 3440 + }, + { + "epoch": 0.01225, + "grad_norm": 0.5014644265174866, + "learning_rate": 1.7245e-05, + "loss": 0.1151, + "step": 3450 + }, + { + "epoch": 0.0123, + "grad_norm": 0.7478466629981995, + "learning_rate": 1.7295e-05, + "loss": 0.1125, + "step": 3460 + }, + { + "epoch": 0.01235, + "grad_norm": 0.6174198985099792, + "learning_rate": 1.7345e-05, + "loss": 0.1086, + "step": 3470 + }, + { + "epoch": 0.0124, + "grad_norm": 0.5381947159767151, + "learning_rate": 1.7395e-05, + "loss": 0.1, + "step": 3480 + }, + { + "epoch": 0.01245, + "grad_norm": 0.5227854251861572, + "learning_rate": 1.7445e-05, + "loss": 0.1083, + "step": 3490 + }, + { + "epoch": 0.0125, + "grad_norm": 0.6526561379432678, + "learning_rate": 1.7495e-05, + "loss": 0.1185, + "step": 3500 + }, + { + "epoch": 0.01255, + "grad_norm": 0.4748079478740692, + "learning_rate": 1.7545e-05, + "loss": 0.1052, + "step": 3510 + }, + { + "epoch": 0.0126, + "grad_norm": 0.6691007614135742, + "learning_rate": 1.7595e-05, + "loss": 0.1043, + "step": 3520 + }, + { + "epoch": 0.01265, + "grad_norm": 0.5588632225990295, + "learning_rate": 1.7645e-05, + "loss": 0.1119, + "step": 3530 + }, + { + "epoch": 0.0127, + "grad_norm": 0.6040257215499878, + "learning_rate": 1.7695e-05, + "loss": 0.1086, + "step": 3540 + }, + { + "epoch": 0.01275, + "grad_norm": 0.5624618530273438, + "learning_rate": 1.7745e-05, + "loss": 0.1108, + "step": 3550 + }, + { + "epoch": 0.0128, + "grad_norm": 0.6275506019592285, + "learning_rate": 1.7795e-05, + "loss": 0.1032, + "step": 3560 + }, + { + "epoch": 0.01285, + "grad_norm": 0.46647828817367554, + "learning_rate": 1.7845e-05, + "loss": 0.1, + "step": 3570 + }, + { + "epoch": 0.0129, + "grad_norm": 0.5406060218811035, + "learning_rate": 1.7895e-05, + "loss": 0.1113, + "step": 3580 + }, + { + "epoch": 0.01295, + "grad_norm": 0.4511054754257202, + "learning_rate": 1.7945000000000002e-05, + "loss": 0.1061, + "step": 3590 + }, + { + "epoch": 0.013, + "grad_norm": 0.40886831283569336, + "learning_rate": 1.7995e-05, + "loss": 0.1028, + "step": 3600 + }, + { + "epoch": 0.01305, + "grad_norm": 0.43077781796455383, + "learning_rate": 1.8045e-05, + "loss": 0.1115, + "step": 3610 + }, + { + "epoch": 0.0131, + "grad_norm": 0.6307575106620789, + "learning_rate": 1.8095000000000002e-05, + "loss": 0.0984, + "step": 3620 + }, + { + "epoch": 0.01315, + "grad_norm": 0.39847230911254883, + "learning_rate": 1.8145e-05, + "loss": 0.0988, + "step": 3630 + }, + { + "epoch": 0.0132, + "grad_norm": 0.46950647234916687, + "learning_rate": 1.8195e-05, + "loss": 0.1033, + "step": 3640 + }, + { + "epoch": 0.01325, + "grad_norm": 0.5256922841072083, + "learning_rate": 1.8245000000000002e-05, + "loss": 0.1037, + "step": 3650 + }, + { + "epoch": 0.0133, + "grad_norm": 0.6507235765457153, + "learning_rate": 1.8295e-05, + "loss": 0.1042, + "step": 3660 + }, + { + "epoch": 0.01335, + "grad_norm": 0.8489135503768921, + "learning_rate": 1.8345e-05, + "loss": 0.1016, + "step": 3670 + }, + { + "epoch": 0.0134, + "grad_norm": 0.5893881916999817, + "learning_rate": 1.8395000000000003e-05, + "loss": 0.1077, + "step": 3680 + }, + { + "epoch": 0.01345, + "grad_norm": 0.5649431943893433, + "learning_rate": 1.8445e-05, + "loss": 0.1168, + "step": 3690 + }, + { + "epoch": 0.0135, + "grad_norm": 0.6583290100097656, + "learning_rate": 1.8495e-05, + "loss": 0.1063, + "step": 3700 + }, + { + "epoch": 0.01355, + "grad_norm": 0.5168076157569885, + "learning_rate": 1.8545000000000003e-05, + "loss": 0.1033, + "step": 3710 + }, + { + "epoch": 0.0136, + "grad_norm": 0.8110647201538086, + "learning_rate": 1.8595e-05, + "loss": 0.098, + "step": 3720 + }, + { + "epoch": 0.01365, + "grad_norm": 0.4656333923339844, + "learning_rate": 1.8645000000000002e-05, + "loss": 0.0974, + "step": 3730 + }, + { + "epoch": 0.0137, + "grad_norm": 0.5489840507507324, + "learning_rate": 1.8695e-05, + "loss": 0.0946, + "step": 3740 + }, + { + "epoch": 0.01375, + "grad_norm": 0.9075152277946472, + "learning_rate": 1.8745e-05, + "loss": 0.107, + "step": 3750 + }, + { + "epoch": 0.0138, + "grad_norm": 0.6150771379470825, + "learning_rate": 1.8795000000000002e-05, + "loss": 0.0987, + "step": 3760 + }, + { + "epoch": 0.01385, + "grad_norm": 0.6698026657104492, + "learning_rate": 1.8845e-05, + "loss": 0.1006, + "step": 3770 + }, + { + "epoch": 0.0139, + "grad_norm": 0.7191886901855469, + "learning_rate": 1.8895e-05, + "loss": 0.1032, + "step": 3780 + }, + { + "epoch": 0.01395, + "grad_norm": 0.4110310673713684, + "learning_rate": 1.8945000000000002e-05, + "loss": 0.1016, + "step": 3790 + }, + { + "epoch": 0.014, + "grad_norm": 0.4223337769508362, + "learning_rate": 1.8995e-05, + "loss": 0.1007, + "step": 3800 + }, + { + "epoch": 0.01405, + "grad_norm": 0.5190854668617249, + "learning_rate": 1.9045e-05, + "loss": 0.1016, + "step": 3810 + }, + { + "epoch": 0.0141, + "grad_norm": 0.4307669997215271, + "learning_rate": 1.9095000000000003e-05, + "loss": 0.1039, + "step": 3820 + }, + { + "epoch": 0.01415, + "grad_norm": 0.37840327620506287, + "learning_rate": 1.9145e-05, + "loss": 0.0955, + "step": 3830 + }, + { + "epoch": 0.0142, + "grad_norm": 0.47309333086013794, + "learning_rate": 1.9195000000000002e-05, + "loss": 0.103, + "step": 3840 + }, + { + "epoch": 0.01425, + "grad_norm": 0.43094107508659363, + "learning_rate": 1.9245000000000003e-05, + "loss": 0.1027, + "step": 3850 + }, + { + "epoch": 0.0143, + "grad_norm": 0.9020057916641235, + "learning_rate": 1.9295e-05, + "loss": 0.1007, + "step": 3860 + }, + { + "epoch": 0.01435, + "grad_norm": 1.0408713817596436, + "learning_rate": 1.9345000000000002e-05, + "loss": 0.1092, + "step": 3870 + }, + { + "epoch": 0.0144, + "grad_norm": 0.5754146575927734, + "learning_rate": 1.9395000000000003e-05, + "loss": 0.1043, + "step": 3880 + }, + { + "epoch": 0.01445, + "grad_norm": 0.5601019859313965, + "learning_rate": 1.9445e-05, + "loss": 0.0969, + "step": 3890 + }, + { + "epoch": 0.0145, + "grad_norm": 0.5026534795761108, + "learning_rate": 1.9495000000000002e-05, + "loss": 0.0923, + "step": 3900 + }, + { + "epoch": 0.01455, + "grad_norm": 0.4332098364830017, + "learning_rate": 1.9545000000000003e-05, + "loss": 0.0949, + "step": 3910 + }, + { + "epoch": 0.0146, + "grad_norm": 0.4272383749485016, + "learning_rate": 1.9595e-05, + "loss": 0.0946, + "step": 3920 + }, + { + "epoch": 0.01465, + "grad_norm": 0.4689870774745941, + "learning_rate": 1.9645000000000002e-05, + "loss": 0.0998, + "step": 3930 + }, + { + "epoch": 0.0147, + "grad_norm": 0.41271480917930603, + "learning_rate": 1.9695e-05, + "loss": 0.1012, + "step": 3940 + }, + { + "epoch": 0.01475, + "grad_norm": 0.47752124071121216, + "learning_rate": 1.9744999999999998e-05, + "loss": 0.1015, + "step": 3950 + }, + { + "epoch": 0.0148, + "grad_norm": 0.4925667345523834, + "learning_rate": 1.9795e-05, + "loss": 0.0945, + "step": 3960 + }, + { + "epoch": 0.01485, + "grad_norm": 0.46057838201522827, + "learning_rate": 1.9845e-05, + "loss": 0.0945, + "step": 3970 + }, + { + "epoch": 0.0149, + "grad_norm": 0.5354776382446289, + "learning_rate": 1.9895e-05, + "loss": 0.1022, + "step": 3980 + }, + { + "epoch": 0.01495, + "grad_norm": 0.5683553218841553, + "learning_rate": 1.9945e-05, + "loss": 0.1012, + "step": 3990 + }, + { + "epoch": 0.015, + "grad_norm": 0.5514432191848755, + "learning_rate": 1.9995e-05, + "loss": 0.0933, + "step": 4000 + }, + { + "epoch": 0.01505, + "grad_norm": 0.6688148975372314, + "learning_rate": 2.0045e-05, + "loss": 0.1029, + "step": 4010 + }, + { + "epoch": 0.0151, + "grad_norm": 0.5286933779716492, + "learning_rate": 2.0095e-05, + "loss": 0.094, + "step": 4020 + }, + { + "epoch": 0.01515, + "grad_norm": 0.6586257815361023, + "learning_rate": 2.0145e-05, + "loss": 0.1005, + "step": 4030 + }, + { + "epoch": 0.0152, + "grad_norm": 0.5654017329216003, + "learning_rate": 2.0195e-05, + "loss": 0.1011, + "step": 4040 + }, + { + "epoch": 0.01525, + "grad_norm": 0.6501078009605408, + "learning_rate": 2.0245e-05, + "loss": 0.0976, + "step": 4050 + }, + { + "epoch": 0.0153, + "grad_norm": 0.6477259397506714, + "learning_rate": 2.0295e-05, + "loss": 0.1002, + "step": 4060 + }, + { + "epoch": 0.01535, + "grad_norm": 0.4997469186782837, + "learning_rate": 2.0345e-05, + "loss": 0.095, + "step": 4070 + }, + { + "epoch": 0.0154, + "grad_norm": 0.418266236782074, + "learning_rate": 2.0395e-05, + "loss": 0.1032, + "step": 4080 + }, + { + "epoch": 0.01545, + "grad_norm": 0.5838645100593567, + "learning_rate": 2.0445e-05, + "loss": 0.1019, + "step": 4090 + }, + { + "epoch": 0.0155, + "grad_norm": 0.4280761778354645, + "learning_rate": 2.0495e-05, + "loss": 0.0961, + "step": 4100 + }, + { + "epoch": 0.01555, + "grad_norm": 0.5512828230857849, + "learning_rate": 2.0545e-05, + "loss": 0.1019, + "step": 4110 + }, + { + "epoch": 0.0156, + "grad_norm": 0.5964367985725403, + "learning_rate": 2.0595000000000002e-05, + "loss": 0.0943, + "step": 4120 + }, + { + "epoch": 0.01565, + "grad_norm": 0.5994517803192139, + "learning_rate": 2.0645e-05, + "loss": 0.1026, + "step": 4130 + }, + { + "epoch": 0.0157, + "grad_norm": 0.5050929188728333, + "learning_rate": 2.0695e-05, + "loss": 0.0997, + "step": 4140 + }, + { + "epoch": 0.01575, + "grad_norm": 0.7495620846748352, + "learning_rate": 2.0745000000000002e-05, + "loss": 0.0993, + "step": 4150 + }, + { + "epoch": 0.0158, + "grad_norm": 0.6496561765670776, + "learning_rate": 2.0795e-05, + "loss": 0.0977, + "step": 4160 + }, + { + "epoch": 0.01585, + "grad_norm": 0.5183280110359192, + "learning_rate": 2.0845e-05, + "loss": 0.099, + "step": 4170 + }, + { + "epoch": 0.0159, + "grad_norm": 0.4890337586402893, + "learning_rate": 2.0895e-05, + "loss": 0.1013, + "step": 4180 + }, + { + "epoch": 0.01595, + "grad_norm": 0.48081013560295105, + "learning_rate": 2.0945e-05, + "loss": 0.1021, + "step": 4190 + }, + { + "epoch": 0.016, + "grad_norm": 0.6270437240600586, + "learning_rate": 2.0995e-05, + "loss": 0.0953, + "step": 4200 + }, + { + "epoch": 0.01605, + "grad_norm": 0.5585843920707703, + "learning_rate": 2.1045e-05, + "loss": 0.1001, + "step": 4210 + }, + { + "epoch": 0.0161, + "grad_norm": 0.5349629521369934, + "learning_rate": 2.1095e-05, + "loss": 0.0927, + "step": 4220 + }, + { + "epoch": 0.01615, + "grad_norm": 0.5161903500556946, + "learning_rate": 2.1145e-05, + "loss": 0.0901, + "step": 4230 + }, + { + "epoch": 0.0162, + "grad_norm": 0.5629688501358032, + "learning_rate": 2.1195e-05, + "loss": 0.1021, + "step": 4240 + }, + { + "epoch": 0.01625, + "grad_norm": 0.4266774356365204, + "learning_rate": 2.1245e-05, + "loss": 0.0968, + "step": 4250 + }, + { + "epoch": 0.0163, + "grad_norm": 0.6050424575805664, + "learning_rate": 2.1295000000000002e-05, + "loss": 0.096, + "step": 4260 + }, + { + "epoch": 0.01635, + "grad_norm": 0.48876550793647766, + "learning_rate": 2.1345e-05, + "loss": 0.0954, + "step": 4270 + }, + { + "epoch": 0.0164, + "grad_norm": 0.4919767677783966, + "learning_rate": 2.1395e-05, + "loss": 0.0925, + "step": 4280 + }, + { + "epoch": 0.01645, + "grad_norm": 0.55455482006073, + "learning_rate": 2.1445000000000002e-05, + "loss": 0.0863, + "step": 4290 + }, + { + "epoch": 0.0165, + "grad_norm": 0.5026130676269531, + "learning_rate": 2.1495e-05, + "loss": 0.0923, + "step": 4300 + }, + { + "epoch": 0.01655, + "grad_norm": 0.5634472370147705, + "learning_rate": 2.1545e-05, + "loss": 0.1001, + "step": 4310 + }, + { + "epoch": 0.0166, + "grad_norm": 0.5411179661750793, + "learning_rate": 2.1595000000000002e-05, + "loss": 0.0928, + "step": 4320 + }, + { + "epoch": 0.01665, + "grad_norm": 0.5356360077857971, + "learning_rate": 2.1645e-05, + "loss": 0.0948, + "step": 4330 + }, + { + "epoch": 0.0167, + "grad_norm": 0.5214255452156067, + "learning_rate": 2.1695e-05, + "loss": 0.0913, + "step": 4340 + }, + { + "epoch": 0.01675, + "grad_norm": 0.5843163728713989, + "learning_rate": 2.1745000000000003e-05, + "loss": 0.0923, + "step": 4350 + }, + { + "epoch": 0.0168, + "grad_norm": 0.428416907787323, + "learning_rate": 2.1795e-05, + "loss": 0.0969, + "step": 4360 + }, + { + "epoch": 0.01685, + "grad_norm": 0.5517915487289429, + "learning_rate": 2.1845000000000002e-05, + "loss": 0.0862, + "step": 4370 + }, + { + "epoch": 0.0169, + "grad_norm": 0.48688235878944397, + "learning_rate": 2.1895000000000003e-05, + "loss": 0.096, + "step": 4380 + }, + { + "epoch": 0.01695, + "grad_norm": 0.46745675802230835, + "learning_rate": 2.1945e-05, + "loss": 0.0964, + "step": 4390 + }, + { + "epoch": 0.017, + "grad_norm": 0.42236295342445374, + "learning_rate": 2.1995000000000002e-05, + "loss": 0.0937, + "step": 4400 + }, + { + "epoch": 0.01705, + "grad_norm": 0.5698501467704773, + "learning_rate": 2.2045000000000003e-05, + "loss": 0.0918, + "step": 4410 + }, + { + "epoch": 0.0171, + "grad_norm": 0.4699753224849701, + "learning_rate": 2.2095e-05, + "loss": 0.0988, + "step": 4420 + }, + { + "epoch": 0.01715, + "grad_norm": 0.512039065361023, + "learning_rate": 2.2145000000000002e-05, + "loss": 0.0967, + "step": 4430 + }, + { + "epoch": 0.0172, + "grad_norm": 0.4880082905292511, + "learning_rate": 2.2195000000000003e-05, + "loss": 0.0979, + "step": 4440 + }, + { + "epoch": 0.01725, + "grad_norm": 0.6383131742477417, + "learning_rate": 2.2245e-05, + "loss": 0.0949, + "step": 4450 + }, + { + "epoch": 0.0173, + "grad_norm": 0.7075005173683167, + "learning_rate": 2.2295000000000003e-05, + "loss": 0.094, + "step": 4460 + }, + { + "epoch": 0.01735, + "grad_norm": 0.5859620571136475, + "learning_rate": 2.2345e-05, + "loss": 0.0977, + "step": 4470 + }, + { + "epoch": 0.0174, + "grad_norm": 0.4959677755832672, + "learning_rate": 2.2395e-05, + "loss": 0.0934, + "step": 4480 + }, + { + "epoch": 0.01745, + "grad_norm": 0.5418904423713684, + "learning_rate": 2.2445000000000003e-05, + "loss": 0.0898, + "step": 4490 + }, + { + "epoch": 0.0175, + "grad_norm": 0.5080021023750305, + "learning_rate": 2.2495e-05, + "loss": 0.091, + "step": 4500 + }, + { + "epoch": 0.01755, + "grad_norm": 0.43675655126571655, + "learning_rate": 2.2545000000000002e-05, + "loss": 0.0953, + "step": 4510 + }, + { + "epoch": 0.0176, + "grad_norm": 0.4278501272201538, + "learning_rate": 2.2595000000000003e-05, + "loss": 0.0977, + "step": 4520 + }, + { + "epoch": 0.01765, + "grad_norm": 0.5013251304626465, + "learning_rate": 2.2645e-05, + "loss": 0.0915, + "step": 4530 + }, + { + "epoch": 0.0177, + "grad_norm": 0.40271320939064026, + "learning_rate": 2.2695000000000002e-05, + "loss": 0.0921, + "step": 4540 + }, + { + "epoch": 0.01775, + "grad_norm": 0.6630820631980896, + "learning_rate": 2.2745000000000003e-05, + "loss": 0.087, + "step": 4550 + }, + { + "epoch": 0.0178, + "grad_norm": 0.547572910785675, + "learning_rate": 2.2795e-05, + "loss": 0.0881, + "step": 4560 + }, + { + "epoch": 0.01785, + "grad_norm": 0.4887124300003052, + "learning_rate": 2.2845e-05, + "loss": 0.0897, + "step": 4570 + }, + { + "epoch": 0.0179, + "grad_norm": 0.46305638551712036, + "learning_rate": 2.2895e-05, + "loss": 0.0893, + "step": 4580 + }, + { + "epoch": 0.01795, + "grad_norm": 0.36096814274787903, + "learning_rate": 2.2945e-05, + "loss": 0.0919, + "step": 4590 + }, + { + "epoch": 0.018, + "grad_norm": 0.4254930317401886, + "learning_rate": 2.2995e-05, + "loss": 0.0953, + "step": 4600 + }, + { + "epoch": 0.01805, + "grad_norm": 0.4120861291885376, + "learning_rate": 2.3045e-05, + "loss": 0.0899, + "step": 4610 + }, + { + "epoch": 0.0181, + "grad_norm": 0.519589364528656, + "learning_rate": 2.3095e-05, + "loss": 0.0907, + "step": 4620 + }, + { + "epoch": 0.01815, + "grad_norm": 0.45753464102745056, + "learning_rate": 2.3145e-05, + "loss": 0.0856, + "step": 4630 + }, + { + "epoch": 0.0182, + "grad_norm": 0.4146299362182617, + "learning_rate": 2.3195e-05, + "loss": 0.0892, + "step": 4640 + }, + { + "epoch": 0.01825, + "grad_norm": 0.4657032787799835, + "learning_rate": 2.3245e-05, + "loss": 0.0876, + "step": 4650 + }, + { + "epoch": 0.0183, + "grad_norm": 0.5032997131347656, + "learning_rate": 2.3295e-05, + "loss": 0.0858, + "step": 4660 + }, + { + "epoch": 0.01835, + "grad_norm": 0.42685577273368835, + "learning_rate": 2.3345e-05, + "loss": 0.0925, + "step": 4670 + }, + { + "epoch": 0.0184, + "grad_norm": 0.4389508366584778, + "learning_rate": 2.3395e-05, + "loss": 0.0839, + "step": 4680 + }, + { + "epoch": 0.01845, + "grad_norm": 0.5109202861785889, + "learning_rate": 2.3445e-05, + "loss": 0.0876, + "step": 4690 + }, + { + "epoch": 0.0185, + "grad_norm": 0.537704586982727, + "learning_rate": 2.3495e-05, + "loss": 0.087, + "step": 4700 + }, + { + "epoch": 0.01855, + "grad_norm": 0.42653656005859375, + "learning_rate": 2.3545e-05, + "loss": 0.0847, + "step": 4710 + }, + { + "epoch": 0.0186, + "grad_norm": 0.48544803261756897, + "learning_rate": 2.3595e-05, + "loss": 0.0808, + "step": 4720 + }, + { + "epoch": 0.01865, + "grad_norm": 0.4926588237285614, + "learning_rate": 2.3645e-05, + "loss": 0.0995, + "step": 4730 + }, + { + "epoch": 0.0187, + "grad_norm": 0.4736453890800476, + "learning_rate": 2.3695e-05, + "loss": 0.0862, + "step": 4740 + }, + { + "epoch": 0.01875, + "grad_norm": 0.44785216450691223, + "learning_rate": 2.3745e-05, + "loss": 0.0837, + "step": 4750 + }, + { + "epoch": 0.0188, + "grad_norm": 0.6035889387130737, + "learning_rate": 2.3795000000000002e-05, + "loss": 0.0869, + "step": 4760 + }, + { + "epoch": 0.01885, + "grad_norm": 0.4353933334350586, + "learning_rate": 2.3845e-05, + "loss": 0.0834, + "step": 4770 + }, + { + "epoch": 0.0189, + "grad_norm": 0.5200499296188354, + "learning_rate": 2.3895e-05, + "loss": 0.0907, + "step": 4780 + }, + { + "epoch": 0.01895, + "grad_norm": 0.49414077401161194, + "learning_rate": 2.3945000000000002e-05, + "loss": 0.0863, + "step": 4790 + }, + { + "epoch": 0.019, + "grad_norm": 0.44971001148223877, + "learning_rate": 2.3995e-05, + "loss": 0.0845, + "step": 4800 + }, + { + "epoch": 0.01905, + "grad_norm": 0.4229947626590729, + "learning_rate": 2.4045e-05, + "loss": 0.0878, + "step": 4810 + }, + { + "epoch": 0.0191, + "grad_norm": 0.5139651298522949, + "learning_rate": 2.4095000000000002e-05, + "loss": 0.0879, + "step": 4820 + }, + { + "epoch": 0.01915, + "grad_norm": 0.5074255466461182, + "learning_rate": 2.4145e-05, + "loss": 0.0816, + "step": 4830 + }, + { + "epoch": 0.0192, + "grad_norm": 0.605755090713501, + "learning_rate": 2.4195e-05, + "loss": 0.0921, + "step": 4840 + }, + { + "epoch": 0.01925, + "grad_norm": 0.48389285802841187, + "learning_rate": 2.4245000000000002e-05, + "loss": 0.0829, + "step": 4850 + }, + { + "epoch": 0.0193, + "grad_norm": 0.7070655226707458, + "learning_rate": 2.4295e-05, + "loss": 0.0931, + "step": 4860 + }, + { + "epoch": 0.01935, + "grad_norm": 0.6098916530609131, + "learning_rate": 2.4345e-05, + "loss": 0.0903, + "step": 4870 + }, + { + "epoch": 0.0194, + "grad_norm": 0.5935694575309753, + "learning_rate": 2.4395000000000003e-05, + "loss": 0.0884, + "step": 4880 + }, + { + "epoch": 0.01945, + "grad_norm": 0.53770911693573, + "learning_rate": 2.4445e-05, + "loss": 0.0889, + "step": 4890 + }, + { + "epoch": 0.0195, + "grad_norm": 0.3916023373603821, + "learning_rate": 2.4495000000000002e-05, + "loss": 0.0833, + "step": 4900 + }, + { + "epoch": 0.01955, + "grad_norm": 0.567225456237793, + "learning_rate": 2.4545000000000003e-05, + "loss": 0.0889, + "step": 4910 + }, + { + "epoch": 0.0196, + "grad_norm": 0.5302049517631531, + "learning_rate": 2.4595e-05, + "loss": 0.09, + "step": 4920 + }, + { + "epoch": 0.01965, + "grad_norm": 0.5962851047515869, + "learning_rate": 2.4645000000000002e-05, + "loss": 0.0919, + "step": 4930 + }, + { + "epoch": 0.0197, + "grad_norm": 0.49372929334640503, + "learning_rate": 2.4695e-05, + "loss": 0.0907, + "step": 4940 + }, + { + "epoch": 0.01975, + "grad_norm": 0.5563930869102478, + "learning_rate": 2.4745e-05, + "loss": 0.0855, + "step": 4950 + }, + { + "epoch": 0.0198, + "grad_norm": 0.44720908999443054, + "learning_rate": 2.4795000000000002e-05, + "loss": 0.0912, + "step": 4960 + }, + { + "epoch": 0.01985, + "grad_norm": 0.4346630871295929, + "learning_rate": 2.4845e-05, + "loss": 0.0909, + "step": 4970 + }, + { + "epoch": 0.0199, + "grad_norm": 0.5587462186813354, + "learning_rate": 2.4895e-05, + "loss": 0.092, + "step": 4980 + }, + { + "epoch": 0.01995, + "grad_norm": 0.5791865587234497, + "learning_rate": 2.4945000000000003e-05, + "loss": 0.0893, + "step": 4990 + }, + { + "epoch": 0.02, + "grad_norm": 0.44487252831459045, + "learning_rate": 2.4995e-05, + "loss": 0.0825, + "step": 5000 + }, + { + "epoch": 0.02005, + "grad_norm": 0.5317064523696899, + "learning_rate": 2.5045e-05, + "loss": 0.0863, + "step": 5010 + }, + { + "epoch": 0.0201, + "grad_norm": 0.41652870178222656, + "learning_rate": 2.5095000000000003e-05, + "loss": 0.086, + "step": 5020 + }, + { + "epoch": 0.02015, + "grad_norm": 0.4210962653160095, + "learning_rate": 2.5145e-05, + "loss": 0.085, + "step": 5030 + }, + { + "epoch": 0.0202, + "grad_norm": 0.4132222831249237, + "learning_rate": 2.5195000000000002e-05, + "loss": 0.083, + "step": 5040 + }, + { + "epoch": 0.02025, + "grad_norm": 0.418788880109787, + "learning_rate": 2.5245000000000003e-05, + "loss": 0.0858, + "step": 5050 + }, + { + "epoch": 0.0203, + "grad_norm": 0.4658764898777008, + "learning_rate": 2.5295e-05, + "loss": 0.082, + "step": 5060 + }, + { + "epoch": 0.02035, + "grad_norm": 0.48415932059288025, + "learning_rate": 2.5345000000000002e-05, + "loss": 0.0833, + "step": 5070 + }, + { + "epoch": 0.0204, + "grad_norm": 0.4540814459323883, + "learning_rate": 2.5395000000000003e-05, + "loss": 0.0866, + "step": 5080 + }, + { + "epoch": 0.02045, + "grad_norm": 0.5398637056350708, + "learning_rate": 2.5445e-05, + "loss": 0.0879, + "step": 5090 + }, + { + "epoch": 0.0205, + "grad_norm": 0.46198877692222595, + "learning_rate": 2.5495000000000002e-05, + "loss": 0.0858, + "step": 5100 + }, + { + "epoch": 0.02055, + "grad_norm": 0.5064612627029419, + "learning_rate": 2.5545000000000004e-05, + "loss": 0.0843, + "step": 5110 + }, + { + "epoch": 0.0206, + "grad_norm": 0.43292903900146484, + "learning_rate": 2.5595e-05, + "loss": 0.082, + "step": 5120 + }, + { + "epoch": 0.02065, + "grad_norm": 0.42029690742492676, + "learning_rate": 2.5645000000000003e-05, + "loss": 0.084, + "step": 5130 + }, + { + "epoch": 0.0207, + "grad_norm": 0.5056539177894592, + "learning_rate": 2.5695000000000004e-05, + "loss": 0.0815, + "step": 5140 + }, + { + "epoch": 0.02075, + "grad_norm": 0.5533456206321716, + "learning_rate": 2.5745e-05, + "loss": 0.0833, + "step": 5150 + }, + { + "epoch": 0.0208, + "grad_norm": 0.4594043791294098, + "learning_rate": 2.5795000000000003e-05, + "loss": 0.0848, + "step": 5160 + }, + { + "epoch": 0.02085, + "grad_norm": 0.6164016723632812, + "learning_rate": 2.5845000000000004e-05, + "loss": 0.089, + "step": 5170 + }, + { + "epoch": 0.0209, + "grad_norm": 0.506243884563446, + "learning_rate": 2.5895000000000002e-05, + "loss": 0.0824, + "step": 5180 + }, + { + "epoch": 0.02095, + "grad_norm": 0.7490360736846924, + "learning_rate": 2.5945000000000003e-05, + "loss": 0.082, + "step": 5190 + }, + { + "epoch": 0.021, + "grad_norm": 0.5026156306266785, + "learning_rate": 2.5995000000000004e-05, + "loss": 0.0934, + "step": 5200 + }, + { + "epoch": 0.02105, + "grad_norm": 0.5008965134620667, + "learning_rate": 2.6045000000000002e-05, + "loss": 0.0893, + "step": 5210 + }, + { + "epoch": 0.0211, + "grad_norm": 0.43096840381622314, + "learning_rate": 2.6095000000000003e-05, + "loss": 0.0798, + "step": 5220 + }, + { + "epoch": 0.02115, + "grad_norm": 0.3479880690574646, + "learning_rate": 2.6145e-05, + "loss": 0.0821, + "step": 5230 + }, + { + "epoch": 0.0212, + "grad_norm": 0.3658469617366791, + "learning_rate": 2.6195000000000002e-05, + "loss": 0.0806, + "step": 5240 + }, + { + "epoch": 0.02125, + "grad_norm": 0.5226966142654419, + "learning_rate": 2.6245000000000004e-05, + "loss": 0.0808, + "step": 5250 + }, + { + "epoch": 0.0213, + "grad_norm": 0.5437954068183899, + "learning_rate": 2.6295e-05, + "loss": 0.0851, + "step": 5260 + }, + { + "epoch": 0.02135, + "grad_norm": 0.4487932026386261, + "learning_rate": 2.6345000000000003e-05, + "loss": 0.0831, + "step": 5270 + }, + { + "epoch": 0.0214, + "grad_norm": 0.44730430841445923, + "learning_rate": 2.6395000000000004e-05, + "loss": 0.09, + "step": 5280 + }, + { + "epoch": 0.02145, + "grad_norm": 0.5502302050590515, + "learning_rate": 2.6445000000000002e-05, + "loss": 0.0812, + "step": 5290 + }, + { + "epoch": 0.0215, + "grad_norm": 0.6342505216598511, + "learning_rate": 2.6495000000000003e-05, + "loss": 0.0876, + "step": 5300 + }, + { + "epoch": 0.02155, + "grad_norm": 0.4663097858428955, + "learning_rate": 2.6545000000000004e-05, + "loss": 0.0892, + "step": 5310 + }, + { + "epoch": 0.0216, + "grad_norm": 0.5896298289299011, + "learning_rate": 2.6595000000000002e-05, + "loss": 0.0848, + "step": 5320 + }, + { + "epoch": 0.02165, + "grad_norm": 0.4614022374153137, + "learning_rate": 2.6645000000000003e-05, + "loss": 0.0838, + "step": 5330 + }, + { + "epoch": 0.0217, + "grad_norm": 0.4756334125995636, + "learning_rate": 2.6695000000000004e-05, + "loss": 0.0856, + "step": 5340 + }, + { + "epoch": 0.02175, + "grad_norm": 0.6175363063812256, + "learning_rate": 2.6745000000000002e-05, + "loss": 0.0891, + "step": 5350 + }, + { + "epoch": 0.0218, + "grad_norm": 0.49067890644073486, + "learning_rate": 2.6795000000000003e-05, + "loss": 0.091, + "step": 5360 + }, + { + "epoch": 0.02185, + "grad_norm": 0.4190593957901001, + "learning_rate": 2.6845000000000005e-05, + "loss": 0.0818, + "step": 5370 + }, + { + "epoch": 0.0219, + "grad_norm": 0.42757564783096313, + "learning_rate": 2.6895000000000003e-05, + "loss": 0.0829, + "step": 5380 + }, + { + "epoch": 0.02195, + "grad_norm": 0.4748636782169342, + "learning_rate": 2.6945000000000004e-05, + "loss": 0.086, + "step": 5390 + }, + { + "epoch": 0.022, + "grad_norm": 0.4459311366081238, + "learning_rate": 2.6995000000000005e-05, + "loss": 0.0845, + "step": 5400 + }, + { + "epoch": 0.02205, + "grad_norm": 0.521063506603241, + "learning_rate": 2.7045000000000003e-05, + "loss": 0.0815, + "step": 5410 + }, + { + "epoch": 0.0221, + "grad_norm": 0.5046432614326477, + "learning_rate": 2.7095000000000004e-05, + "loss": 0.0913, + "step": 5420 + }, + { + "epoch": 0.02215, + "grad_norm": 0.6038782000541687, + "learning_rate": 2.7145000000000005e-05, + "loss": 0.0819, + "step": 5430 + }, + { + "epoch": 0.0222, + "grad_norm": 0.46855854988098145, + "learning_rate": 2.7195000000000003e-05, + "loss": 0.0882, + "step": 5440 + }, + { + "epoch": 0.02225, + "grad_norm": 0.43038228154182434, + "learning_rate": 2.7245000000000004e-05, + "loss": 0.0891, + "step": 5450 + }, + { + "epoch": 0.0223, + "grad_norm": 0.4552011787891388, + "learning_rate": 2.7295000000000005e-05, + "loss": 0.0781, + "step": 5460 + }, + { + "epoch": 0.02235, + "grad_norm": 0.4150822162628174, + "learning_rate": 2.7345000000000003e-05, + "loss": 0.0881, + "step": 5470 + }, + { + "epoch": 0.0224, + "grad_norm": 0.4220922887325287, + "learning_rate": 2.7395000000000005e-05, + "loss": 0.0774, + "step": 5480 + }, + { + "epoch": 0.02245, + "grad_norm": 0.39791256189346313, + "learning_rate": 2.7445000000000002e-05, + "loss": 0.0818, + "step": 5490 + }, + { + "epoch": 0.0225, + "grad_norm": 0.4505294859409332, + "learning_rate": 2.7495000000000004e-05, + "loss": 0.0865, + "step": 5500 + }, + { + "epoch": 0.02255, + "grad_norm": 0.6119654774665833, + "learning_rate": 2.7544999999999998e-05, + "loss": 0.0863, + "step": 5510 + }, + { + "epoch": 0.0226, + "grad_norm": 0.5199548602104187, + "learning_rate": 2.7595e-05, + "loss": 0.0825, + "step": 5520 + }, + { + "epoch": 0.02265, + "grad_norm": 0.4721256494522095, + "learning_rate": 2.7644999999999997e-05, + "loss": 0.0815, + "step": 5530 + }, + { + "epoch": 0.0227, + "grad_norm": 0.5406511425971985, + "learning_rate": 2.7694999999999998e-05, + "loss": 0.08, + "step": 5540 + }, + { + "epoch": 0.02275, + "grad_norm": 0.5193853378295898, + "learning_rate": 2.7745e-05, + "loss": 0.0855, + "step": 5550 + }, + { + "epoch": 0.0228, + "grad_norm": 0.4746890962123871, + "learning_rate": 2.7794999999999997e-05, + "loss": 0.0822, + "step": 5560 + }, + { + "epoch": 0.02285, + "grad_norm": 0.3871111273765564, + "learning_rate": 2.7845e-05, + "loss": 0.0829, + "step": 5570 + }, + { + "epoch": 0.0229, + "grad_norm": 0.5385257005691528, + "learning_rate": 2.7895e-05, + "loss": 0.0865, + "step": 5580 + }, + { + "epoch": 0.02295, + "grad_norm": 0.45264989137649536, + "learning_rate": 2.7944999999999998e-05, + "loss": 0.09, + "step": 5590 + }, + { + "epoch": 0.023, + "grad_norm": 0.6268983483314514, + "learning_rate": 2.7995e-05, + "loss": 0.0808, + "step": 5600 + }, + { + "epoch": 0.02305, + "grad_norm": 0.43054303526878357, + "learning_rate": 2.8045e-05, + "loss": 0.0812, + "step": 5610 + }, + { + "epoch": 0.0231, + "grad_norm": 0.71108078956604, + "learning_rate": 2.8094999999999998e-05, + "loss": 0.0818, + "step": 5620 + }, + { + "epoch": 0.02315, + "grad_norm": 0.6452078223228455, + "learning_rate": 2.8145e-05, + "loss": 0.0848, + "step": 5630 + }, + { + "epoch": 0.0232, + "grad_norm": 0.5041331648826599, + "learning_rate": 2.8195e-05, + "loss": 0.081, + "step": 5640 + }, + { + "epoch": 0.02325, + "grad_norm": 0.40575382113456726, + "learning_rate": 2.8244999999999998e-05, + "loss": 0.084, + "step": 5650 + }, + { + "epoch": 0.0233, + "grad_norm": 0.4588361978530884, + "learning_rate": 2.8295e-05, + "loss": 0.0818, + "step": 5660 + }, + { + "epoch": 0.02335, + "grad_norm": 0.47744446992874146, + "learning_rate": 2.8345e-05, + "loss": 0.0856, + "step": 5670 + }, + { + "epoch": 0.0234, + "grad_norm": 0.49122774600982666, + "learning_rate": 2.8395e-05, + "loss": 0.0852, + "step": 5680 + }, + { + "epoch": 0.02345, + "grad_norm": 0.45338350534439087, + "learning_rate": 2.8445e-05, + "loss": 0.0839, + "step": 5690 + }, + { + "epoch": 0.0235, + "grad_norm": 0.4360921084880829, + "learning_rate": 2.8495e-05, + "loss": 0.0788, + "step": 5700 + }, + { + "epoch": 0.02355, + "grad_norm": 0.4302518367767334, + "learning_rate": 2.8545e-05, + "loss": 0.0821, + "step": 5710 + }, + { + "epoch": 0.0236, + "grad_norm": 0.46977418661117554, + "learning_rate": 2.8595e-05, + "loss": 0.0777, + "step": 5720 + }, + { + "epoch": 0.02365, + "grad_norm": 0.4087386131286621, + "learning_rate": 2.8645e-05, + "loss": 0.0813, + "step": 5730 + }, + { + "epoch": 0.0237, + "grad_norm": 0.5564868450164795, + "learning_rate": 2.8695e-05, + "loss": 0.0843, + "step": 5740 + }, + { + "epoch": 0.02375, + "grad_norm": 0.5684245824813843, + "learning_rate": 2.8745e-05, + "loss": 0.0835, + "step": 5750 + }, + { + "epoch": 0.0238, + "grad_norm": 0.498969167470932, + "learning_rate": 2.8795e-05, + "loss": 0.08, + "step": 5760 + }, + { + "epoch": 0.02385, + "grad_norm": 0.43636438250541687, + "learning_rate": 2.8845e-05, + "loss": 0.0804, + "step": 5770 + }, + { + "epoch": 0.0239, + "grad_norm": 0.5187768936157227, + "learning_rate": 2.8895e-05, + "loss": 0.0781, + "step": 5780 + }, + { + "epoch": 0.02395, + "grad_norm": 0.4015440046787262, + "learning_rate": 2.8945e-05, + "loss": 0.0901, + "step": 5790 + }, + { + "epoch": 0.024, + "grad_norm": 0.4077690541744232, + "learning_rate": 2.8995e-05, + "loss": 0.0779, + "step": 5800 + }, + { + "epoch": 0.02405, + "grad_norm": 0.6242225766181946, + "learning_rate": 2.9045e-05, + "loss": 0.0827, + "step": 5810 + }, + { + "epoch": 0.0241, + "grad_norm": 0.4288853108882904, + "learning_rate": 2.9095e-05, + "loss": 0.0851, + "step": 5820 + }, + { + "epoch": 0.02415, + "grad_norm": 0.43137598037719727, + "learning_rate": 2.9145e-05, + "loss": 0.085, + "step": 5830 + }, + { + "epoch": 0.0242, + "grad_norm": 0.540719211101532, + "learning_rate": 2.9195e-05, + "loss": 0.0856, + "step": 5840 + }, + { + "epoch": 0.02425, + "grad_norm": 0.3808692395687103, + "learning_rate": 2.9245e-05, + "loss": 0.0823, + "step": 5850 + }, + { + "epoch": 0.0243, + "grad_norm": 0.4207990765571594, + "learning_rate": 2.9295e-05, + "loss": 0.0817, + "step": 5860 + }, + { + "epoch": 0.02435, + "grad_norm": 0.4244430959224701, + "learning_rate": 2.9345e-05, + "loss": 0.0812, + "step": 5870 + }, + { + "epoch": 0.0244, + "grad_norm": 0.5179128050804138, + "learning_rate": 2.9395e-05, + "loss": 0.0846, + "step": 5880 + }, + { + "epoch": 0.02445, + "grad_norm": 0.5072123408317566, + "learning_rate": 2.9445e-05, + "loss": 0.0811, + "step": 5890 + }, + { + "epoch": 0.0245, + "grad_norm": 0.5344266295433044, + "learning_rate": 2.9495e-05, + "loss": 0.0906, + "step": 5900 + }, + { + "epoch": 0.02455, + "grad_norm": 0.42122307419776917, + "learning_rate": 2.9545e-05, + "loss": 0.0803, + "step": 5910 + }, + { + "epoch": 0.0246, + "grad_norm": 0.3986223638057709, + "learning_rate": 2.9595e-05, + "loss": 0.0807, + "step": 5920 + }, + { + "epoch": 0.02465, + "grad_norm": 0.4583601653575897, + "learning_rate": 2.9645e-05, + "loss": 0.0848, + "step": 5930 + }, + { + "epoch": 0.0247, + "grad_norm": 0.44839999079704285, + "learning_rate": 2.9695e-05, + "loss": 0.0757, + "step": 5940 + }, + { + "epoch": 0.02475, + "grad_norm": 0.41972583532333374, + "learning_rate": 2.9745e-05, + "loss": 0.0897, + "step": 5950 + }, + { + "epoch": 0.0248, + "grad_norm": 0.6261882185935974, + "learning_rate": 2.9795000000000002e-05, + "loss": 0.0822, + "step": 5960 + }, + { + "epoch": 0.02485, + "grad_norm": 0.5626884698867798, + "learning_rate": 2.9845e-05, + "loss": 0.0873, + "step": 5970 + }, + { + "epoch": 0.0249, + "grad_norm": 0.4882666766643524, + "learning_rate": 2.9895e-05, + "loss": 0.0781, + "step": 5980 + }, + { + "epoch": 0.02495, + "grad_norm": 0.40979450941085815, + "learning_rate": 2.9945000000000002e-05, + "loss": 0.0763, + "step": 5990 + }, + { + "epoch": 0.025, + "grad_norm": 0.32592901587486267, + "learning_rate": 2.9995e-05, + "loss": 0.078, + "step": 6000 + }, + { + "epoch": 0.02505, + "grad_norm": 0.5241904258728027, + "learning_rate": 3.0045e-05, + "loss": 0.0852, + "step": 6010 + }, + { + "epoch": 0.0251, + "grad_norm": 0.48974916338920593, + "learning_rate": 3.0095000000000002e-05, + "loss": 0.0791, + "step": 6020 + }, + { + "epoch": 0.02515, + "grad_norm": 0.4012157917022705, + "learning_rate": 3.0145e-05, + "loss": 0.077, + "step": 6030 + }, + { + "epoch": 0.0252, + "grad_norm": 0.4815496802330017, + "learning_rate": 3.0195e-05, + "loss": 0.0832, + "step": 6040 + }, + { + "epoch": 0.02525, + "grad_norm": 0.4560248851776123, + "learning_rate": 3.0245000000000003e-05, + "loss": 0.0769, + "step": 6050 + }, + { + "epoch": 0.0253, + "grad_norm": 0.43385860323905945, + "learning_rate": 3.0295e-05, + "loss": 0.0793, + "step": 6060 + }, + { + "epoch": 0.02535, + "grad_norm": 0.3584393858909607, + "learning_rate": 3.0345e-05, + "loss": 0.0813, + "step": 6070 + }, + { + "epoch": 0.0254, + "grad_norm": 0.426179975271225, + "learning_rate": 3.0395000000000003e-05, + "loss": 0.0804, + "step": 6080 + }, + { + "epoch": 0.02545, + "grad_norm": 0.47968167066574097, + "learning_rate": 3.0445e-05, + "loss": 0.0865, + "step": 6090 + }, + { + "epoch": 0.0255, + "grad_norm": 0.4998861253261566, + "learning_rate": 3.0495000000000002e-05, + "loss": 0.083, + "step": 6100 + }, + { + "epoch": 0.02555, + "grad_norm": 0.38313713669776917, + "learning_rate": 3.0545e-05, + "loss": 0.0756, + "step": 6110 + }, + { + "epoch": 0.0256, + "grad_norm": 0.44031354784965515, + "learning_rate": 3.0595e-05, + "loss": 0.0755, + "step": 6120 + }, + { + "epoch": 0.02565, + "grad_norm": 0.3908822238445282, + "learning_rate": 3.0645e-05, + "loss": 0.0796, + "step": 6130 + }, + { + "epoch": 0.0257, + "grad_norm": 0.39873838424682617, + "learning_rate": 3.0695000000000003e-05, + "loss": 0.0762, + "step": 6140 + }, + { + "epoch": 0.02575, + "grad_norm": 0.3386397063732147, + "learning_rate": 3.0745000000000005e-05, + "loss": 0.0779, + "step": 6150 + }, + { + "epoch": 0.0258, + "grad_norm": 0.4112503230571747, + "learning_rate": 3.0795e-05, + "loss": 0.0791, + "step": 6160 + }, + { + "epoch": 0.02585, + "grad_norm": 0.41174352169036865, + "learning_rate": 3.0845e-05, + "loss": 0.0819, + "step": 6170 + }, + { + "epoch": 0.0259, + "grad_norm": 0.4976944029331207, + "learning_rate": 3.0895e-05, + "loss": 0.0782, + "step": 6180 + }, + { + "epoch": 0.02595, + "grad_norm": 0.42394208908081055, + "learning_rate": 3.0945e-05, + "loss": 0.0785, + "step": 6190 + }, + { + "epoch": 0.026, + "grad_norm": 0.5694761276245117, + "learning_rate": 3.0995000000000004e-05, + "loss": 0.0773, + "step": 6200 + }, + { + "epoch": 0.02605, + "grad_norm": 0.46636876463890076, + "learning_rate": 3.1045000000000005e-05, + "loss": 0.0818, + "step": 6210 + }, + { + "epoch": 0.0261, + "grad_norm": 0.4229767918586731, + "learning_rate": 3.1095e-05, + "loss": 0.0792, + "step": 6220 + }, + { + "epoch": 0.02615, + "grad_norm": 0.41858726739883423, + "learning_rate": 3.1145e-05, + "loss": 0.076, + "step": 6230 + }, + { + "epoch": 0.0262, + "grad_norm": 0.43213996291160583, + "learning_rate": 3.1195e-05, + "loss": 0.076, + "step": 6240 + }, + { + "epoch": 0.02625, + "grad_norm": 0.4471328556537628, + "learning_rate": 3.1245e-05, + "loss": 0.0742, + "step": 6250 + }, + { + "epoch": 0.0263, + "grad_norm": 0.4880267381668091, + "learning_rate": 3.1295000000000004e-05, + "loss": 0.0847, + "step": 6260 + }, + { + "epoch": 0.02635, + "grad_norm": 0.4235740900039673, + "learning_rate": 3.1345e-05, + "loss": 0.081, + "step": 6270 + }, + { + "epoch": 0.0264, + "grad_norm": 0.4439496099948883, + "learning_rate": 3.1395e-05, + "loss": 0.0757, + "step": 6280 + }, + { + "epoch": 0.02645, + "grad_norm": 0.37231093645095825, + "learning_rate": 3.1445e-05, + "loss": 0.0816, + "step": 6290 + }, + { + "epoch": 0.0265, + "grad_norm": 0.5958653688430786, + "learning_rate": 3.1495e-05, + "loss": 0.074, + "step": 6300 + }, + { + "epoch": 0.02655, + "grad_norm": 0.5390610098838806, + "learning_rate": 3.1545000000000004e-05, + "loss": 0.0807, + "step": 6310 + }, + { + "epoch": 0.0266, + "grad_norm": 0.44329580664634705, + "learning_rate": 3.1595000000000005e-05, + "loss": 0.0826, + "step": 6320 + }, + { + "epoch": 0.02665, + "grad_norm": 0.4327351748943329, + "learning_rate": 3.1645e-05, + "loss": 0.0789, + "step": 6330 + }, + { + "epoch": 0.0267, + "grad_norm": 0.4001625180244446, + "learning_rate": 3.1695e-05, + "loss": 0.0794, + "step": 6340 + }, + { + "epoch": 0.02675, + "grad_norm": 0.36040255427360535, + "learning_rate": 3.1745e-05, + "loss": 0.0763, + "step": 6350 + }, + { + "epoch": 0.0268, + "grad_norm": 0.423082560300827, + "learning_rate": 3.1795e-05, + "loss": 0.0804, + "step": 6360 + }, + { + "epoch": 0.02685, + "grad_norm": 0.42768681049346924, + "learning_rate": 3.1845000000000004e-05, + "loss": 0.0818, + "step": 6370 + }, + { + "epoch": 0.0269, + "grad_norm": 0.4242507219314575, + "learning_rate": 3.1895000000000005e-05, + "loss": 0.0769, + "step": 6380 + }, + { + "epoch": 0.02695, + "grad_norm": 0.5090714693069458, + "learning_rate": 3.1945e-05, + "loss": 0.0837, + "step": 6390 + }, + { + "epoch": 0.027, + "grad_norm": 0.45466625690460205, + "learning_rate": 3.1995e-05, + "loss": 0.0751, + "step": 6400 + }, + { + "epoch": 0.02705, + "grad_norm": 0.3438904285430908, + "learning_rate": 3.2045e-05, + "loss": 0.0809, + "step": 6410 + }, + { + "epoch": 0.0271, + "grad_norm": 0.47308188676834106, + "learning_rate": 3.2095000000000004e-05, + "loss": 0.0762, + "step": 6420 + }, + { + "epoch": 0.02715, + "grad_norm": 0.41979026794433594, + "learning_rate": 3.2145000000000005e-05, + "loss": 0.0812, + "step": 6430 + }, + { + "epoch": 0.0272, + "grad_norm": 0.432265043258667, + "learning_rate": 3.2195000000000006e-05, + "loss": 0.0747, + "step": 6440 + }, + { + "epoch": 0.02725, + "grad_norm": 0.39878150820732117, + "learning_rate": 3.2245e-05, + "loss": 0.0744, + "step": 6450 + }, + { + "epoch": 0.0273, + "grad_norm": 0.4525047540664673, + "learning_rate": 3.2295e-05, + "loss": 0.0753, + "step": 6460 + }, + { + "epoch": 0.02735, + "grad_norm": 0.46699413657188416, + "learning_rate": 3.2345e-05, + "loss": 0.0792, + "step": 6470 + }, + { + "epoch": 0.0274, + "grad_norm": 0.48744940757751465, + "learning_rate": 3.2395000000000004e-05, + "loss": 0.0783, + "step": 6480 + }, + { + "epoch": 0.02745, + "grad_norm": 0.4602070748806, + "learning_rate": 3.2445000000000005e-05, + "loss": 0.0746, + "step": 6490 + }, + { + "epoch": 0.0275, + "grad_norm": 0.46310096979141235, + "learning_rate": 3.2495000000000007e-05, + "loss": 0.0779, + "step": 6500 + }, + { + "epoch": 0.02755, + "grad_norm": 0.3661746084690094, + "learning_rate": 3.2545e-05, + "loss": 0.0774, + "step": 6510 + }, + { + "epoch": 0.0276, + "grad_norm": 0.3644971549510956, + "learning_rate": 3.2595e-05, + "loss": 0.0737, + "step": 6520 + }, + { + "epoch": 0.02765, + "grad_norm": 0.3705560863018036, + "learning_rate": 3.2645e-05, + "loss": 0.0748, + "step": 6530 + }, + { + "epoch": 0.0277, + "grad_norm": 0.4240495562553406, + "learning_rate": 3.2695000000000005e-05, + "loss": 0.0767, + "step": 6540 + }, + { + "epoch": 0.02775, + "grad_norm": 0.34379133582115173, + "learning_rate": 3.2745000000000006e-05, + "loss": 0.0715, + "step": 6550 + }, + { + "epoch": 0.0278, + "grad_norm": 0.3436594605445862, + "learning_rate": 3.2795e-05, + "loss": 0.0732, + "step": 6560 + }, + { + "epoch": 0.02785, + "grad_norm": 0.42003849148750305, + "learning_rate": 3.2845e-05, + "loss": 0.0821, + "step": 6570 + }, + { + "epoch": 0.0279, + "grad_norm": 0.4620581865310669, + "learning_rate": 3.2895e-05, + "loss": 0.0711, + "step": 6580 + }, + { + "epoch": 0.02795, + "grad_norm": 0.40812432765960693, + "learning_rate": 3.2945000000000004e-05, + "loss": 0.0781, + "step": 6590 + }, + { + "epoch": 0.028, + "grad_norm": 0.41272908449172974, + "learning_rate": 3.2995000000000005e-05, + "loss": 0.0765, + "step": 6600 + }, + { + "epoch": 0.02805, + "grad_norm": 0.43230175971984863, + "learning_rate": 3.3045000000000006e-05, + "loss": 0.0752, + "step": 6610 + }, + { + "epoch": 0.0281, + "grad_norm": 0.4300050735473633, + "learning_rate": 3.3095e-05, + "loss": 0.0731, + "step": 6620 + }, + { + "epoch": 0.02815, + "grad_norm": 0.44373124837875366, + "learning_rate": 3.3145e-05, + "loss": 0.0753, + "step": 6630 + }, + { + "epoch": 0.0282, + "grad_norm": 0.49943041801452637, + "learning_rate": 3.3195e-05, + "loss": 0.0796, + "step": 6640 + }, + { + "epoch": 0.02825, + "grad_norm": 0.3910506069660187, + "learning_rate": 3.3245000000000004e-05, + "loss": 0.0741, + "step": 6650 + }, + { + "epoch": 0.0283, + "grad_norm": 0.394123911857605, + "learning_rate": 3.3295000000000006e-05, + "loss": 0.0744, + "step": 6660 + }, + { + "epoch": 0.02835, + "grad_norm": 0.431263267993927, + "learning_rate": 3.334500000000001e-05, + "loss": 0.0779, + "step": 6670 + }, + { + "epoch": 0.0284, + "grad_norm": 0.5385127067565918, + "learning_rate": 3.3395e-05, + "loss": 0.0731, + "step": 6680 + }, + { + "epoch": 0.02845, + "grad_norm": 0.3577653765678406, + "learning_rate": 3.3445e-05, + "loss": 0.0731, + "step": 6690 + }, + { + "epoch": 0.0285, + "grad_norm": 0.5011255145072937, + "learning_rate": 3.3495000000000004e-05, + "loss": 0.0797, + "step": 6700 + }, + { + "epoch": 0.02855, + "grad_norm": 0.5084308981895447, + "learning_rate": 3.3545000000000005e-05, + "loss": 0.0772, + "step": 6710 + }, + { + "epoch": 0.0286, + "grad_norm": 0.47546619176864624, + "learning_rate": 3.3595000000000006e-05, + "loss": 0.0739, + "step": 6720 + }, + { + "epoch": 0.02865, + "grad_norm": 0.38033586740493774, + "learning_rate": 3.364500000000001e-05, + "loss": 0.0795, + "step": 6730 + }, + { + "epoch": 0.0287, + "grad_norm": 0.4411650002002716, + "learning_rate": 3.3695e-05, + "loss": 0.0734, + "step": 6740 + }, + { + "epoch": 0.02875, + "grad_norm": 0.32608819007873535, + "learning_rate": 3.3745e-05, + "loss": 0.0713, + "step": 6750 + }, + { + "epoch": 0.0288, + "grad_norm": 0.4803429841995239, + "learning_rate": 3.3795e-05, + "loss": 0.0765, + "step": 6760 + }, + { + "epoch": 0.02885, + "grad_norm": 0.38406142592430115, + "learning_rate": 3.3845e-05, + "loss": 0.0717, + "step": 6770 + }, + { + "epoch": 0.0289, + "grad_norm": 0.4204261004924774, + "learning_rate": 3.3895e-05, + "loss": 0.0737, + "step": 6780 + }, + { + "epoch": 0.02895, + "grad_norm": 0.37816375494003296, + "learning_rate": 3.3945e-05, + "loss": 0.0743, + "step": 6790 + }, + { + "epoch": 0.029, + "grad_norm": 0.3421754240989685, + "learning_rate": 3.3995e-05, + "loss": 0.0693, + "step": 6800 + }, + { + "epoch": 0.02905, + "grad_norm": 0.4057287871837616, + "learning_rate": 3.4045e-05, + "loss": 0.0754, + "step": 6810 + }, + { + "epoch": 0.0291, + "grad_norm": 0.329377144575119, + "learning_rate": 3.4095e-05, + "loss": 0.071, + "step": 6820 + }, + { + "epoch": 0.02915, + "grad_norm": 0.41998857259750366, + "learning_rate": 3.4145e-05, + "loss": 0.0811, + "step": 6830 + }, + { + "epoch": 0.0292, + "grad_norm": 0.5486347675323486, + "learning_rate": 3.4195e-05, + "loss": 0.0773, + "step": 6840 + }, + { + "epoch": 0.02925, + "grad_norm": 0.3410201966762543, + "learning_rate": 3.4245e-05, + "loss": 0.0707, + "step": 6850 + }, + { + "epoch": 0.0293, + "grad_norm": 0.4516546130180359, + "learning_rate": 3.4294999999999996e-05, + "loss": 0.0742, + "step": 6860 + }, + { + "epoch": 0.02935, + "grad_norm": 0.37678903341293335, + "learning_rate": 3.4345e-05, + "loss": 0.0717, + "step": 6870 + }, + { + "epoch": 0.0294, + "grad_norm": 0.46914374828338623, + "learning_rate": 3.4395e-05, + "loss": 0.0718, + "step": 6880 + }, + { + "epoch": 0.02945, + "grad_norm": 0.4036213755607605, + "learning_rate": 3.4445e-05, + "loss": 0.0751, + "step": 6890 + }, + { + "epoch": 0.0295, + "grad_norm": 0.41932258009910583, + "learning_rate": 3.4495e-05, + "loss": 0.0699, + "step": 6900 + }, + { + "epoch": 0.02955, + "grad_norm": 0.37729623913764954, + "learning_rate": 3.4545e-05, + "loss": 0.0725, + "step": 6910 + }, + { + "epoch": 0.0296, + "grad_norm": 0.33412089943885803, + "learning_rate": 3.4594999999999997e-05, + "loss": 0.0677, + "step": 6920 + }, + { + "epoch": 0.02965, + "grad_norm": 0.42308661341667175, + "learning_rate": 3.4645e-05, + "loss": 0.0758, + "step": 6930 + }, + { + "epoch": 0.0297, + "grad_norm": 0.41113191843032837, + "learning_rate": 3.4695e-05, + "loss": 0.0746, + "step": 6940 + }, + { + "epoch": 0.02975, + "grad_norm": 0.5308328866958618, + "learning_rate": 3.4745e-05, + "loss": 0.0713, + "step": 6950 + }, + { + "epoch": 0.0298, + "grad_norm": 0.44157448410987854, + "learning_rate": 3.4795e-05, + "loss": 0.0756, + "step": 6960 + }, + { + "epoch": 0.02985, + "grad_norm": 0.5505300760269165, + "learning_rate": 3.4845e-05, + "loss": 0.0765, + "step": 6970 + }, + { + "epoch": 0.0299, + "grad_norm": 0.39519307017326355, + "learning_rate": 3.4895e-05, + "loss": 0.0755, + "step": 6980 + }, + { + "epoch": 0.02995, + "grad_norm": 0.4539121687412262, + "learning_rate": 3.4945e-05, + "loss": 0.0747, + "step": 6990 + }, + { + "epoch": 0.03, + "grad_norm": 0.5236254930496216, + "learning_rate": 3.4995e-05, + "loss": 0.0763, + "step": 7000 + }, + { + "epoch": 0.03005, + "grad_norm": 0.36866137385368347, + "learning_rate": 3.5045e-05, + "loss": 0.0777, + "step": 7010 + }, + { + "epoch": 0.0301, + "grad_norm": 0.3630446195602417, + "learning_rate": 3.5095e-05, + "loss": 0.0745, + "step": 7020 + }, + { + "epoch": 0.03015, + "grad_norm": 0.4937750995159149, + "learning_rate": 3.5145e-05, + "loss": 0.0762, + "step": 7030 + }, + { + "epoch": 0.0302, + "grad_norm": 0.39128226041793823, + "learning_rate": 3.5195e-05, + "loss": 0.0838, + "step": 7040 + }, + { + "epoch": 0.03025, + "grad_norm": 0.41970768570899963, + "learning_rate": 3.5245e-05, + "loss": 0.0794, + "step": 7050 + }, + { + "epoch": 0.0303, + "grad_norm": 0.4620768129825592, + "learning_rate": 3.5295e-05, + "loss": 0.0764, + "step": 7060 + }, + { + "epoch": 0.03035, + "grad_norm": 0.5133053064346313, + "learning_rate": 3.5345e-05, + "loss": 0.0736, + "step": 7070 + }, + { + "epoch": 0.0304, + "grad_norm": 0.3818191587924957, + "learning_rate": 3.5395e-05, + "loss": 0.0729, + "step": 7080 + }, + { + "epoch": 0.03045, + "grad_norm": 0.4005877375602722, + "learning_rate": 3.5445000000000004e-05, + "loss": 0.0757, + "step": 7090 + }, + { + "epoch": 0.0305, + "grad_norm": 0.4175821840763092, + "learning_rate": 3.5495e-05, + "loss": 0.0756, + "step": 7100 + }, + { + "epoch": 0.03055, + "grad_norm": 0.38189101219177246, + "learning_rate": 3.5545e-05, + "loss": 0.0746, + "step": 7110 + }, + { + "epoch": 0.0306, + "grad_norm": 0.3358469009399414, + "learning_rate": 3.5595e-05, + "loss": 0.0737, + "step": 7120 + }, + { + "epoch": 0.03065, + "grad_norm": 0.33312109112739563, + "learning_rate": 3.5645e-05, + "loss": 0.0712, + "step": 7130 + }, + { + "epoch": 0.0307, + "grad_norm": 0.3312031924724579, + "learning_rate": 3.5695e-05, + "loss": 0.0753, + "step": 7140 + }, + { + "epoch": 0.03075, + "grad_norm": 0.3859456181526184, + "learning_rate": 3.5745e-05, + "loss": 0.074, + "step": 7150 + }, + { + "epoch": 0.0308, + "grad_norm": 0.3346497118473053, + "learning_rate": 3.5795e-05, + "loss": 0.0787, + "step": 7160 + }, + { + "epoch": 0.03085, + "grad_norm": 0.4235798418521881, + "learning_rate": 3.5845e-05, + "loss": 0.0695, + "step": 7170 + }, + { + "epoch": 0.0309, + "grad_norm": 0.41039881110191345, + "learning_rate": 3.5895e-05, + "loss": 0.0694, + "step": 7180 + }, + { + "epoch": 0.03095, + "grad_norm": 0.3948202431201935, + "learning_rate": 3.5945e-05, + "loss": 0.0736, + "step": 7190 + }, + { + "epoch": 0.031, + "grad_norm": 0.37504926323890686, + "learning_rate": 3.5995000000000004e-05, + "loss": 0.0734, + "step": 7200 + }, + { + "epoch": 0.03105, + "grad_norm": 0.34111249446868896, + "learning_rate": 3.6045e-05, + "loss": 0.068, + "step": 7210 + }, + { + "epoch": 0.0311, + "grad_norm": 0.32981258630752563, + "learning_rate": 3.6095e-05, + "loss": 0.0727, + "step": 7220 + }, + { + "epoch": 0.03115, + "grad_norm": 0.3568899631500244, + "learning_rate": 3.6145e-05, + "loss": 0.0717, + "step": 7230 + }, + { + "epoch": 0.0312, + "grad_norm": 0.3659535348415375, + "learning_rate": 3.6195e-05, + "loss": 0.0763, + "step": 7240 + }, + { + "epoch": 0.03125, + "grad_norm": 0.32447150349617004, + "learning_rate": 3.6245e-05, + "loss": 0.0704, + "step": 7250 + }, + { + "epoch": 0.0313, + "grad_norm": 0.516423225402832, + "learning_rate": 3.6295000000000004e-05, + "loss": 0.0727, + "step": 7260 + }, + { + "epoch": 0.03135, + "grad_norm": 0.5282039642333984, + "learning_rate": 3.6345e-05, + "loss": 0.0726, + "step": 7270 + }, + { + "epoch": 0.0314, + "grad_norm": 0.4386370778083801, + "learning_rate": 3.6395e-05, + "loss": 0.0707, + "step": 7280 + }, + { + "epoch": 0.03145, + "grad_norm": 0.41988563537597656, + "learning_rate": 3.6445e-05, + "loss": 0.0733, + "step": 7290 + }, + { + "epoch": 0.0315, + "grad_norm": 0.355773001909256, + "learning_rate": 3.6495e-05, + "loss": 0.0722, + "step": 7300 + }, + { + "epoch": 0.03155, + "grad_norm": 0.33527055382728577, + "learning_rate": 3.6545e-05, + "loss": 0.0695, + "step": 7310 + }, + { + "epoch": 0.0316, + "grad_norm": 0.30877190828323364, + "learning_rate": 3.6595000000000005e-05, + "loss": 0.0696, + "step": 7320 + }, + { + "epoch": 0.03165, + "grad_norm": 0.4743344783782959, + "learning_rate": 3.6645e-05, + "loss": 0.0692, + "step": 7330 + }, + { + "epoch": 0.0317, + "grad_norm": 0.49953508377075195, + "learning_rate": 3.6695e-05, + "loss": 0.0759, + "step": 7340 + }, + { + "epoch": 0.03175, + "grad_norm": 0.4573311507701874, + "learning_rate": 3.6745e-05, + "loss": 0.0721, + "step": 7350 + }, + { + "epoch": 0.0318, + "grad_norm": 0.3666287958621979, + "learning_rate": 3.6795e-05, + "loss": 0.071, + "step": 7360 + }, + { + "epoch": 0.03185, + "grad_norm": 0.3707403242588043, + "learning_rate": 3.6845000000000004e-05, + "loss": 0.0764, + "step": 7370 + }, + { + "epoch": 0.0319, + "grad_norm": 0.36320701241493225, + "learning_rate": 3.6895000000000005e-05, + "loss": 0.0701, + "step": 7380 + }, + { + "epoch": 0.03195, + "grad_norm": 0.34615108370780945, + "learning_rate": 3.6945e-05, + "loss": 0.0679, + "step": 7390 + }, + { + "epoch": 0.032, + "grad_norm": 0.42103487253189087, + "learning_rate": 3.6995e-05, + "loss": 0.0725, + "step": 7400 + }, + { + "epoch": 0.03205, + "grad_norm": 0.5118808746337891, + "learning_rate": 3.7045e-05, + "loss": 0.071, + "step": 7410 + }, + { + "epoch": 0.0321, + "grad_norm": 0.502199649810791, + "learning_rate": 3.7095e-05, + "loss": 0.0749, + "step": 7420 + }, + { + "epoch": 0.03215, + "grad_norm": 0.31548601388931274, + "learning_rate": 3.7145000000000004e-05, + "loss": 0.0739, + "step": 7430 + }, + { + "epoch": 0.0322, + "grad_norm": 0.3683542311191559, + "learning_rate": 3.7195e-05, + "loss": 0.07, + "step": 7440 + }, + { + "epoch": 0.03225, + "grad_norm": 0.3420860469341278, + "learning_rate": 3.7245e-05, + "loss": 0.0708, + "step": 7450 + }, + { + "epoch": 0.0323, + "grad_norm": 0.32849380373954773, + "learning_rate": 3.7295e-05, + "loss": 0.0726, + "step": 7460 + }, + { + "epoch": 0.03235, + "grad_norm": 0.4532844126224518, + "learning_rate": 3.7345e-05, + "loss": 0.07, + "step": 7470 + }, + { + "epoch": 0.0324, + "grad_norm": 0.5196378827095032, + "learning_rate": 3.7395000000000004e-05, + "loss": 0.0726, + "step": 7480 + }, + { + "epoch": 0.03245, + "grad_norm": 0.346918523311615, + "learning_rate": 3.7445000000000005e-05, + "loss": 0.0713, + "step": 7490 + }, + { + "epoch": 0.0325, + "grad_norm": 0.402946412563324, + "learning_rate": 3.7495e-05, + "loss": 0.0727, + "step": 7500 + }, + { + "epoch": 0.03255, + "grad_norm": 0.3576841354370117, + "learning_rate": 3.7545e-05, + "loss": 0.0747, + "step": 7510 + }, + { + "epoch": 0.0326, + "grad_norm": 0.3537071645259857, + "learning_rate": 3.7595e-05, + "loss": 0.072, + "step": 7520 + }, + { + "epoch": 0.03265, + "grad_norm": 0.3819756507873535, + "learning_rate": 3.7645e-05, + "loss": 0.0653, + "step": 7530 + }, + { + "epoch": 0.0327, + "grad_norm": 0.44822728633880615, + "learning_rate": 3.7695000000000004e-05, + "loss": 0.0711, + "step": 7540 + }, + { + "epoch": 0.03275, + "grad_norm": 0.3307861089706421, + "learning_rate": 3.7745000000000005e-05, + "loss": 0.0706, + "step": 7550 + }, + { + "epoch": 0.0328, + "grad_norm": 0.3189656138420105, + "learning_rate": 3.7795e-05, + "loss": 0.0691, + "step": 7560 + }, + { + "epoch": 0.03285, + "grad_norm": 0.45738330483436584, + "learning_rate": 3.7845e-05, + "loss": 0.0713, + "step": 7570 + }, + { + "epoch": 0.0329, + "grad_norm": 0.4482582211494446, + "learning_rate": 3.7895e-05, + "loss": 0.0707, + "step": 7580 + }, + { + "epoch": 0.03295, + "grad_norm": 0.4654321074485779, + "learning_rate": 3.7945000000000003e-05, + "loss": 0.0749, + "step": 7590 + }, + { + "epoch": 0.033, + "grad_norm": 0.4446773827075958, + "learning_rate": 3.7995000000000005e-05, + "loss": 0.0719, + "step": 7600 + }, + { + "epoch": 0.03305, + "grad_norm": 0.3221088647842407, + "learning_rate": 3.8045000000000006e-05, + "loss": 0.0741, + "step": 7610 + }, + { + "epoch": 0.0331, + "grad_norm": 0.33431246876716614, + "learning_rate": 3.8095e-05, + "loss": 0.0727, + "step": 7620 + }, + { + "epoch": 0.03315, + "grad_norm": 0.3938763439655304, + "learning_rate": 3.8145e-05, + "loss": 0.0763, + "step": 7630 + }, + { + "epoch": 0.0332, + "grad_norm": 0.34474390745162964, + "learning_rate": 3.8195e-05, + "loss": 0.0743, + "step": 7640 + }, + { + "epoch": 0.03325, + "grad_norm": 0.3402387797832489, + "learning_rate": 3.8245000000000004e-05, + "loss": 0.071, + "step": 7650 + }, + { + "epoch": 0.0333, + "grad_norm": 0.34557628631591797, + "learning_rate": 3.8295000000000005e-05, + "loss": 0.0764, + "step": 7660 + }, + { + "epoch": 0.03335, + "grad_norm": 0.2767280042171478, + "learning_rate": 3.8345000000000006e-05, + "loss": 0.0712, + "step": 7670 + }, + { + "epoch": 0.0334, + "grad_norm": 0.37553533911705017, + "learning_rate": 3.8395e-05, + "loss": 0.0698, + "step": 7680 + }, + { + "epoch": 0.03345, + "grad_norm": 0.4499758183956146, + "learning_rate": 3.8445e-05, + "loss": 0.0727, + "step": 7690 + }, + { + "epoch": 0.0335, + "grad_norm": 0.4133255183696747, + "learning_rate": 3.8495e-05, + "loss": 0.0702, + "step": 7700 + }, + { + "epoch": 0.03355, + "grad_norm": 0.3341597020626068, + "learning_rate": 3.8545000000000004e-05, + "loss": 0.0682, + "step": 7710 + }, + { + "epoch": 0.0336, + "grad_norm": 0.3567278981208801, + "learning_rate": 3.8595000000000006e-05, + "loss": 0.0674, + "step": 7720 + }, + { + "epoch": 0.03365, + "grad_norm": 0.43361714482307434, + "learning_rate": 3.8645e-05, + "loss": 0.0722, + "step": 7730 + }, + { + "epoch": 0.0337, + "grad_norm": 0.41051551699638367, + "learning_rate": 3.8695e-05, + "loss": 0.0757, + "step": 7740 + }, + { + "epoch": 0.03375, + "grad_norm": 0.4041409492492676, + "learning_rate": 3.8745e-05, + "loss": 0.0752, + "step": 7750 + }, + { + "epoch": 0.0338, + "grad_norm": 0.41050779819488525, + "learning_rate": 3.8795000000000004e-05, + "loss": 0.0677, + "step": 7760 + }, + { + "epoch": 0.03385, + "grad_norm": 0.36437398195266724, + "learning_rate": 3.8845000000000005e-05, + "loss": 0.0712, + "step": 7770 + }, + { + "epoch": 0.0339, + "grad_norm": 0.4467727839946747, + "learning_rate": 3.8895000000000006e-05, + "loss": 0.0737, + "step": 7780 + }, + { + "epoch": 0.03395, + "grad_norm": 0.4188295900821686, + "learning_rate": 3.8945e-05, + "loss": 0.0718, + "step": 7790 + }, + { + "epoch": 0.034, + "grad_norm": 0.3202160894870758, + "learning_rate": 3.8995e-05, + "loss": 0.0675, + "step": 7800 + }, + { + "epoch": 0.03405, + "grad_norm": 0.35536912083625793, + "learning_rate": 3.9045e-05, + "loss": 0.0711, + "step": 7810 + }, + { + "epoch": 0.0341, + "grad_norm": 0.3401013910770416, + "learning_rate": 3.9095000000000004e-05, + "loss": 0.0665, + "step": 7820 + }, + { + "epoch": 0.03415, + "grad_norm": 0.32901763916015625, + "learning_rate": 3.9145000000000006e-05, + "loss": 0.0709, + "step": 7830 + }, + { + "epoch": 0.0342, + "grad_norm": 0.4919268786907196, + "learning_rate": 3.919500000000001e-05, + "loss": 0.0813, + "step": 7840 + }, + { + "epoch": 0.03425, + "grad_norm": 0.33046674728393555, + "learning_rate": 3.9245e-05, + "loss": 0.0723, + "step": 7850 + }, + { + "epoch": 0.0343, + "grad_norm": 0.29371678829193115, + "learning_rate": 3.9295e-05, + "loss": 0.0743, + "step": 7860 + }, + { + "epoch": 0.03435, + "grad_norm": 0.3690441846847534, + "learning_rate": 3.9345000000000004e-05, + "loss": 0.071, + "step": 7870 + }, + { + "epoch": 0.0344, + "grad_norm": 0.30826297402381897, + "learning_rate": 3.9395000000000005e-05, + "loss": 0.0704, + "step": 7880 + }, + { + "epoch": 0.03445, + "grad_norm": 0.3399566113948822, + "learning_rate": 3.9445000000000006e-05, + "loss": 0.0708, + "step": 7890 + }, + { + "epoch": 0.0345, + "grad_norm": 0.35554730892181396, + "learning_rate": 3.949500000000001e-05, + "loss": 0.0726, + "step": 7900 + }, + { + "epoch": 0.03455, + "grad_norm": 0.3770660161972046, + "learning_rate": 3.9545e-05, + "loss": 0.0737, + "step": 7910 + }, + { + "epoch": 0.0346, + "grad_norm": 0.3023923933506012, + "learning_rate": 3.9595e-05, + "loss": 0.0699, + "step": 7920 + }, + { + "epoch": 0.03465, + "grad_norm": 0.31832584738731384, + "learning_rate": 3.9645000000000004e-05, + "loss": 0.0692, + "step": 7930 + }, + { + "epoch": 0.0347, + "grad_norm": 0.30358853936195374, + "learning_rate": 3.9695000000000005e-05, + "loss": 0.0709, + "step": 7940 + }, + { + "epoch": 0.03475, + "grad_norm": 0.3580412268638611, + "learning_rate": 3.9745000000000007e-05, + "loss": 0.0671, + "step": 7950 + }, + { + "epoch": 0.0348, + "grad_norm": 0.36572298407554626, + "learning_rate": 3.979500000000001e-05, + "loss": 0.0712, + "step": 7960 + }, + { + "epoch": 0.03485, + "grad_norm": 0.2903812527656555, + "learning_rate": 3.9845e-05, + "loss": 0.0748, + "step": 7970 + }, + { + "epoch": 0.0349, + "grad_norm": 0.25910863280296326, + "learning_rate": 3.9895000000000003e-05, + "loss": 0.0686, + "step": 7980 + }, + { + "epoch": 0.03495, + "grad_norm": 0.37598222494125366, + "learning_rate": 3.9945000000000005e-05, + "loss": 0.0651, + "step": 7990 + }, + { + "epoch": 0.035, + "grad_norm": 0.32963132858276367, + "learning_rate": 3.9995000000000006e-05, + "loss": 0.0662, + "step": 8000 + }, + { + "epoch": 0.03505, + "grad_norm": 0.36862391233444214, + "learning_rate": 4.0045e-05, + "loss": 0.068, + "step": 8010 + }, + { + "epoch": 0.0351, + "grad_norm": 0.3572233021259308, + "learning_rate": 4.0095e-05, + "loss": 0.0682, + "step": 8020 + }, + { + "epoch": 0.03515, + "grad_norm": 0.33086565136909485, + "learning_rate": 4.0144999999999996e-05, + "loss": 0.0644, + "step": 8030 + }, + { + "epoch": 0.0352, + "grad_norm": 0.39653196930885315, + "learning_rate": 4.0195e-05, + "loss": 0.0727, + "step": 8040 + }, + { + "epoch": 0.03525, + "grad_norm": 0.42832210659980774, + "learning_rate": 4.0245e-05, + "loss": 0.0723, + "step": 8050 + }, + { + "epoch": 0.0353, + "grad_norm": 0.3951289653778076, + "learning_rate": 4.0295e-05, + "loss": 0.0741, + "step": 8060 + }, + { + "epoch": 0.03535, + "grad_norm": 0.40113726258277893, + "learning_rate": 4.0345e-05, + "loss": 0.067, + "step": 8070 + }, + { + "epoch": 0.0354, + "grad_norm": 0.3262624740600586, + "learning_rate": 4.0395e-05, + "loss": 0.0747, + "step": 8080 + }, + { + "epoch": 0.03545, + "grad_norm": 0.35799640417099, + "learning_rate": 4.0444999999999996e-05, + "loss": 0.0733, + "step": 8090 + }, + { + "epoch": 0.0355, + "grad_norm": 0.357382595539093, + "learning_rate": 4.0495e-05, + "loss": 0.0696, + "step": 8100 + }, + { + "epoch": 0.03555, + "grad_norm": 0.36806145310401917, + "learning_rate": 4.0545e-05, + "loss": 0.0689, + "step": 8110 + }, + { + "epoch": 0.0356, + "grad_norm": 0.4190371334552765, + "learning_rate": 4.0595e-05, + "loss": 0.0712, + "step": 8120 + }, + { + "epoch": 0.03565, + "grad_norm": 0.3320857584476471, + "learning_rate": 4.0645e-05, + "loss": 0.0675, + "step": 8130 + }, + { + "epoch": 0.0357, + "grad_norm": 0.4370461106300354, + "learning_rate": 4.0695e-05, + "loss": 0.0741, + "step": 8140 + }, + { + "epoch": 0.03575, + "grad_norm": 0.3096306324005127, + "learning_rate": 4.0745e-05, + "loss": 0.0684, + "step": 8150 + }, + { + "epoch": 0.0358, + "grad_norm": 0.42305925488471985, + "learning_rate": 4.0795e-05, + "loss": 0.0671, + "step": 8160 + }, + { + "epoch": 0.03585, + "grad_norm": 0.4075618386268616, + "learning_rate": 4.0845e-05, + "loss": 0.0694, + "step": 8170 + }, + { + "epoch": 0.0359, + "grad_norm": 0.3317747116088867, + "learning_rate": 4.0895e-05, + "loss": 0.0712, + "step": 8180 + }, + { + "epoch": 0.03595, + "grad_norm": 0.34016817808151245, + "learning_rate": 4.0945e-05, + "loss": 0.0683, + "step": 8190 + }, + { + "epoch": 0.036, + "grad_norm": 0.29989123344421387, + "learning_rate": 4.0995e-05, + "loss": 0.0683, + "step": 8200 + }, + { + "epoch": 0.03605, + "grad_norm": 0.3664803206920624, + "learning_rate": 4.1045e-05, + "loss": 0.0675, + "step": 8210 + }, + { + "epoch": 0.0361, + "grad_norm": 0.40872564911842346, + "learning_rate": 4.1095e-05, + "loss": 0.0663, + "step": 8220 + }, + { + "epoch": 0.03615, + "grad_norm": 0.3482436239719391, + "learning_rate": 4.1145e-05, + "loss": 0.065, + "step": 8230 + }, + { + "epoch": 0.0362, + "grad_norm": 0.3456083834171295, + "learning_rate": 4.1195e-05, + "loss": 0.0669, + "step": 8240 + }, + { + "epoch": 0.03625, + "grad_norm": 0.45253556966781616, + "learning_rate": 4.1245e-05, + "loss": 0.0695, + "step": 8250 + }, + { + "epoch": 0.0363, + "grad_norm": 0.329023540019989, + "learning_rate": 4.1295000000000004e-05, + "loss": 0.0665, + "step": 8260 + }, + { + "epoch": 0.03635, + "grad_norm": 0.3185300827026367, + "learning_rate": 4.1345e-05, + "loss": 0.0655, + "step": 8270 + }, + { + "epoch": 0.0364, + "grad_norm": 0.3669206500053406, + "learning_rate": 4.1395e-05, + "loss": 0.0668, + "step": 8280 + }, + { + "epoch": 0.03645, + "grad_norm": 0.43072745203971863, + "learning_rate": 4.1445e-05, + "loss": 0.0665, + "step": 8290 + }, + { + "epoch": 0.0365, + "grad_norm": 0.28496241569519043, + "learning_rate": 4.1495e-05, + "loss": 0.0677, + "step": 8300 + }, + { + "epoch": 0.03655, + "grad_norm": 0.4361313581466675, + "learning_rate": 4.1545e-05, + "loss": 0.0696, + "step": 8310 + }, + { + "epoch": 0.0366, + "grad_norm": 0.3698161244392395, + "learning_rate": 4.1595e-05, + "loss": 0.0742, + "step": 8320 + }, + { + "epoch": 0.03665, + "grad_norm": 0.32634246349334717, + "learning_rate": 4.1645e-05, + "loss": 0.0704, + "step": 8330 + }, + { + "epoch": 0.0367, + "grad_norm": 0.33289647102355957, + "learning_rate": 4.1695e-05, + "loss": 0.0674, + "step": 8340 + }, + { + "epoch": 0.03675, + "grad_norm": 0.32005950808525085, + "learning_rate": 4.1745e-05, + "loss": 0.0693, + "step": 8350 + }, + { + "epoch": 0.0368, + "grad_norm": 0.5573751926422119, + "learning_rate": 4.1795e-05, + "loss": 0.0707, + "step": 8360 + }, + { + "epoch": 0.03685, + "grad_norm": 0.3519611656665802, + "learning_rate": 4.1845000000000003e-05, + "loss": 0.0673, + "step": 8370 + }, + { + "epoch": 0.0369, + "grad_norm": 0.43575596809387207, + "learning_rate": 4.1895e-05, + "loss": 0.0664, + "step": 8380 + }, + { + "epoch": 0.03695, + "grad_norm": 0.33544063568115234, + "learning_rate": 4.1945e-05, + "loss": 0.0642, + "step": 8390 + }, + { + "epoch": 0.037, + "grad_norm": 0.3813968002796173, + "learning_rate": 4.1995e-05, + "loss": 0.0707, + "step": 8400 + }, + { + "epoch": 0.03705, + "grad_norm": 0.25771671533584595, + "learning_rate": 4.2045e-05, + "loss": 0.0655, + "step": 8410 + }, + { + "epoch": 0.0371, + "grad_norm": 0.3025599420070648, + "learning_rate": 4.2095e-05, + "loss": 0.0637, + "step": 8420 + }, + { + "epoch": 0.03715, + "grad_norm": 0.30885520577430725, + "learning_rate": 4.2145000000000004e-05, + "loss": 0.0681, + "step": 8430 + }, + { + "epoch": 0.0372, + "grad_norm": 0.37138083577156067, + "learning_rate": 4.2195e-05, + "loss": 0.065, + "step": 8440 + }, + { + "epoch": 0.03725, + "grad_norm": 0.33118507266044617, + "learning_rate": 4.2245e-05, + "loss": 0.0632, + "step": 8450 + }, + { + "epoch": 0.0373, + "grad_norm": 0.4132819175720215, + "learning_rate": 4.2295e-05, + "loss": 0.0687, + "step": 8460 + }, + { + "epoch": 0.03735, + "grad_norm": 0.43236279487609863, + "learning_rate": 4.2345e-05, + "loss": 0.0646, + "step": 8470 + }, + { + "epoch": 0.0374, + "grad_norm": 0.3995482921600342, + "learning_rate": 4.2395e-05, + "loss": 0.0656, + "step": 8480 + }, + { + "epoch": 0.03745, + "grad_norm": 0.42754000425338745, + "learning_rate": 4.2445000000000004e-05, + "loss": 0.0671, + "step": 8490 + }, + { + "epoch": 0.0375, + "grad_norm": 0.39753013849258423, + "learning_rate": 4.2495e-05, + "loss": 0.069, + "step": 8500 + }, + { + "epoch": 0.03755, + "grad_norm": 0.38752812147140503, + "learning_rate": 4.2545e-05, + "loss": 0.0651, + "step": 8510 + }, + { + "epoch": 0.0376, + "grad_norm": 0.44947749376296997, + "learning_rate": 4.2595e-05, + "loss": 0.0753, + "step": 8520 + }, + { + "epoch": 0.03765, + "grad_norm": 0.33878234028816223, + "learning_rate": 4.2645e-05, + "loss": 0.0666, + "step": 8530 + }, + { + "epoch": 0.0377, + "grad_norm": 0.3755720555782318, + "learning_rate": 4.2695000000000004e-05, + "loss": 0.0688, + "step": 8540 + }, + { + "epoch": 0.03775, + "grad_norm": 0.3157779574394226, + "learning_rate": 4.2745000000000005e-05, + "loss": 0.0676, + "step": 8550 + }, + { + "epoch": 0.0378, + "grad_norm": 0.29580649733543396, + "learning_rate": 4.2795e-05, + "loss": 0.0677, + "step": 8560 + }, + { + "epoch": 0.03785, + "grad_norm": 0.3034764230251312, + "learning_rate": 4.2845e-05, + "loss": 0.0656, + "step": 8570 + }, + { + "epoch": 0.0379, + "grad_norm": 0.30412721633911133, + "learning_rate": 4.2895e-05, + "loss": 0.0672, + "step": 8580 + }, + { + "epoch": 0.03795, + "grad_norm": 0.3045472502708435, + "learning_rate": 4.2945e-05, + "loss": 0.0663, + "step": 8590 + }, + { + "epoch": 0.038, + "grad_norm": 0.3242882490158081, + "learning_rate": 4.2995000000000004e-05, + "loss": 0.067, + "step": 8600 + }, + { + "epoch": 0.03805, + "grad_norm": 0.3049640357494354, + "learning_rate": 4.3045e-05, + "loss": 0.068, + "step": 8610 + }, + { + "epoch": 0.0381, + "grad_norm": 0.2670859098434448, + "learning_rate": 4.3095e-05, + "loss": 0.0676, + "step": 8620 + }, + { + "epoch": 0.03815, + "grad_norm": 0.34805092215538025, + "learning_rate": 4.3145e-05, + "loss": 0.0645, + "step": 8630 + }, + { + "epoch": 0.0382, + "grad_norm": 0.34513089060783386, + "learning_rate": 4.3195e-05, + "loss": 0.0654, + "step": 8640 + }, + { + "epoch": 0.03825, + "grad_norm": 0.36328983306884766, + "learning_rate": 4.3245000000000004e-05, + "loss": 0.0657, + "step": 8650 + }, + { + "epoch": 0.0383, + "grad_norm": 0.34282222390174866, + "learning_rate": 4.3295000000000005e-05, + "loss": 0.068, + "step": 8660 + }, + { + "epoch": 0.03835, + "grad_norm": 0.3558262586593628, + "learning_rate": 4.3345e-05, + "loss": 0.0724, + "step": 8670 + }, + { + "epoch": 0.0384, + "grad_norm": 0.33850905299186707, + "learning_rate": 4.3395e-05, + "loss": 0.0671, + "step": 8680 + }, + { + "epoch": 0.03845, + "grad_norm": 0.43555590510368347, + "learning_rate": 4.3445e-05, + "loss": 0.0657, + "step": 8690 + }, + { + "epoch": 0.0385, + "grad_norm": 0.4613133668899536, + "learning_rate": 4.3495e-05, + "loss": 0.0696, + "step": 8700 + }, + { + "epoch": 0.03855, + "grad_norm": 0.3406635820865631, + "learning_rate": 4.3545000000000004e-05, + "loss": 0.07, + "step": 8710 + }, + { + "epoch": 0.0386, + "grad_norm": 0.32975125312805176, + "learning_rate": 4.3595000000000005e-05, + "loss": 0.0713, + "step": 8720 + }, + { + "epoch": 0.03865, + "grad_norm": 0.4285775125026703, + "learning_rate": 4.3645e-05, + "loss": 0.069, + "step": 8730 + }, + { + "epoch": 0.0387, + "grad_norm": 0.2535092830657959, + "learning_rate": 4.3695e-05, + "loss": 0.0685, + "step": 8740 + }, + { + "epoch": 0.03875, + "grad_norm": 0.30236580967903137, + "learning_rate": 4.3745e-05, + "loss": 0.0693, + "step": 8750 + }, + { + "epoch": 0.0388, + "grad_norm": 0.3505745828151703, + "learning_rate": 4.3795e-05, + "loss": 0.0655, + "step": 8760 + }, + { + "epoch": 0.03885, + "grad_norm": 0.2992234230041504, + "learning_rate": 4.3845000000000005e-05, + "loss": 0.0661, + "step": 8770 + }, + { + "epoch": 0.0389, + "grad_norm": 0.34178248047828674, + "learning_rate": 4.3895000000000006e-05, + "loss": 0.0703, + "step": 8780 + }, + { + "epoch": 0.03895, + "grad_norm": 0.36263760924339294, + "learning_rate": 4.3945e-05, + "loss": 0.0674, + "step": 8790 + }, + { + "epoch": 0.039, + "grad_norm": 0.26600268483161926, + "learning_rate": 4.3995e-05, + "loss": 0.0649, + "step": 8800 + }, + { + "epoch": 0.03905, + "grad_norm": 0.2860242426395416, + "learning_rate": 4.4045e-05, + "loss": 0.0646, + "step": 8810 + }, + { + "epoch": 0.0391, + "grad_norm": 0.326742023229599, + "learning_rate": 4.4095000000000004e-05, + "loss": 0.0661, + "step": 8820 + }, + { + "epoch": 0.03915, + "grad_norm": 0.32793718576431274, + "learning_rate": 4.4145000000000005e-05, + "loss": 0.0651, + "step": 8830 + }, + { + "epoch": 0.0392, + "grad_norm": 0.3359118103981018, + "learning_rate": 4.4195000000000006e-05, + "loss": 0.0649, + "step": 8840 + }, + { + "epoch": 0.03925, + "grad_norm": 0.3868449032306671, + "learning_rate": 4.4245e-05, + "loss": 0.0645, + "step": 8850 + }, + { + "epoch": 0.0393, + "grad_norm": 0.27781063318252563, + "learning_rate": 4.4295e-05, + "loss": 0.0622, + "step": 8860 + }, + { + "epoch": 0.03935, + "grad_norm": 0.30407023429870605, + "learning_rate": 4.4345e-05, + "loss": 0.0622, + "step": 8870 + }, + { + "epoch": 0.0394, + "grad_norm": 0.3161301612854004, + "learning_rate": 4.4395000000000004e-05, + "loss": 0.0639, + "step": 8880 + }, + { + "epoch": 0.03945, + "grad_norm": 0.33792781829833984, + "learning_rate": 4.4445000000000006e-05, + "loss": 0.0646, + "step": 8890 + }, + { + "epoch": 0.0395, + "grad_norm": 0.30785366892814636, + "learning_rate": 4.4495e-05, + "loss": 0.0652, + "step": 8900 + }, + { + "epoch": 0.03955, + "grad_norm": 0.4145120084285736, + "learning_rate": 4.4545e-05, + "loss": 0.0661, + "step": 8910 + }, + { + "epoch": 0.0396, + "grad_norm": 0.3020155131816864, + "learning_rate": 4.4595e-05, + "loss": 0.0693, + "step": 8920 + }, + { + "epoch": 0.03965, + "grad_norm": 0.4017809331417084, + "learning_rate": 4.4645000000000004e-05, + "loss": 0.0677, + "step": 8930 + }, + { + "epoch": 0.0397, + "grad_norm": 0.3599850535392761, + "learning_rate": 4.4695000000000005e-05, + "loss": 0.0681, + "step": 8940 + }, + { + "epoch": 0.03975, + "grad_norm": 0.3779388666152954, + "learning_rate": 4.4745000000000006e-05, + "loss": 0.0663, + "step": 8950 + }, + { + "epoch": 0.0398, + "grad_norm": 0.30472904443740845, + "learning_rate": 4.4795e-05, + "loss": 0.0685, + "step": 8960 + }, + { + "epoch": 0.03985, + "grad_norm": 0.38446706533432007, + "learning_rate": 4.4845e-05, + "loss": 0.0669, + "step": 8970 + }, + { + "epoch": 0.0399, + "grad_norm": 0.49220919609069824, + "learning_rate": 4.4895e-05, + "loss": 0.0696, + "step": 8980 + }, + { + "epoch": 0.03995, + "grad_norm": 0.353708416223526, + "learning_rate": 4.4945000000000004e-05, + "loss": 0.0688, + "step": 8990 + }, + { + "epoch": 0.04, + "grad_norm": 0.3147698938846588, + "learning_rate": 4.4995000000000005e-05, + "loss": 0.0646, + "step": 9000 + }, + { + "epoch": 0.04005, + "grad_norm": 0.3192857503890991, + "learning_rate": 4.504500000000001e-05, + "loss": 0.069, + "step": 9010 + }, + { + "epoch": 0.0401, + "grad_norm": 0.38566654920578003, + "learning_rate": 4.5095e-05, + "loss": 0.0652, + "step": 9020 + }, + { + "epoch": 0.04015, + "grad_norm": 0.37652164697647095, + "learning_rate": 4.5145e-05, + "loss": 0.0689, + "step": 9030 + }, + { + "epoch": 0.0402, + "grad_norm": 0.32042309641838074, + "learning_rate": 4.5195000000000004e-05, + "loss": 0.0651, + "step": 9040 + }, + { + "epoch": 0.04025, + "grad_norm": 0.3594474792480469, + "learning_rate": 4.5245000000000005e-05, + "loss": 0.0665, + "step": 9050 + }, + { + "epoch": 0.0403, + "grad_norm": 0.33755066990852356, + "learning_rate": 4.5295000000000006e-05, + "loss": 0.0672, + "step": 9060 + }, + { + "epoch": 0.04035, + "grad_norm": 0.30055055022239685, + "learning_rate": 4.534500000000001e-05, + "loss": 0.0671, + "step": 9070 + }, + { + "epoch": 0.0404, + "grad_norm": 0.30003151297569275, + "learning_rate": 4.5395e-05, + "loss": 0.0641, + "step": 9080 + }, + { + "epoch": 0.04045, + "grad_norm": 0.3057911992073059, + "learning_rate": 4.5445e-05, + "loss": 0.0664, + "step": 9090 + }, + { + "epoch": 0.0405, + "grad_norm": 0.30321067571640015, + "learning_rate": 4.5495000000000004e-05, + "loss": 0.0642, + "step": 9100 + }, + { + "epoch": 0.04055, + "grad_norm": 0.2936624586582184, + "learning_rate": 4.5545000000000005e-05, + "loss": 0.0654, + "step": 9110 + }, + { + "epoch": 0.0406, + "grad_norm": 0.3208167850971222, + "learning_rate": 4.5595000000000006e-05, + "loss": 0.0616, + "step": 9120 + }, + { + "epoch": 0.04065, + "grad_norm": 0.3730127513408661, + "learning_rate": 4.564500000000001e-05, + "loss": 0.0649, + "step": 9130 + }, + { + "epoch": 0.0407, + "grad_norm": 0.3969462811946869, + "learning_rate": 4.5695e-05, + "loss": 0.0656, + "step": 9140 + }, + { + "epoch": 0.04075, + "grad_norm": 0.3340781033039093, + "learning_rate": 4.5745e-05, + "loss": 0.0676, + "step": 9150 + }, + { + "epoch": 0.0408, + "grad_norm": 0.3071921169757843, + "learning_rate": 4.5795000000000005e-05, + "loss": 0.0696, + "step": 9160 + }, + { + "epoch": 0.04085, + "grad_norm": 0.31838589906692505, + "learning_rate": 4.5845000000000006e-05, + "loss": 0.0602, + "step": 9170 + }, + { + "epoch": 0.0409, + "grad_norm": 0.3813740909099579, + "learning_rate": 4.589500000000001e-05, + "loss": 0.0678, + "step": 9180 + }, + { + "epoch": 0.04095, + "grad_norm": 0.30758917331695557, + "learning_rate": 4.5945e-05, + "loss": 0.0698, + "step": 9190 + }, + { + "epoch": 0.041, + "grad_norm": 0.41876235604286194, + "learning_rate": 4.5995e-05, + "loss": 0.064, + "step": 9200 + }, + { + "epoch": 0.04105, + "grad_norm": 0.32810330390930176, + "learning_rate": 4.6045000000000004e-05, + "loss": 0.0635, + "step": 9210 + }, + { + "epoch": 0.0411, + "grad_norm": 0.41681545972824097, + "learning_rate": 4.6095000000000005e-05, + "loss": 0.0628, + "step": 9220 + }, + { + "epoch": 0.04115, + "grad_norm": 0.33085331320762634, + "learning_rate": 4.6145000000000006e-05, + "loss": 0.0606, + "step": 9230 + }, + { + "epoch": 0.0412, + "grad_norm": 0.3884361684322357, + "learning_rate": 4.619500000000001e-05, + "loss": 0.0672, + "step": 9240 + }, + { + "epoch": 0.04125, + "grad_norm": 0.3489012122154236, + "learning_rate": 4.6245e-05, + "loss": 0.0677, + "step": 9250 + }, + { + "epoch": 0.0413, + "grad_norm": 0.29971393942832947, + "learning_rate": 4.6294999999999996e-05, + "loss": 0.0643, + "step": 9260 + }, + { + "epoch": 0.04135, + "grad_norm": 0.37529256939888, + "learning_rate": 4.6345e-05, + "loss": 0.065, + "step": 9270 + }, + { + "epoch": 0.0414, + "grad_norm": 0.32133549451828003, + "learning_rate": 4.6395e-05, + "loss": 0.0641, + "step": 9280 + }, + { + "epoch": 0.04145, + "grad_norm": 0.3157539963722229, + "learning_rate": 4.6445e-05, + "loss": 0.0597, + "step": 9290 + }, + { + "epoch": 0.0415, + "grad_norm": 0.28702154755592346, + "learning_rate": 4.6495e-05, + "loss": 0.0635, + "step": 9300 + }, + { + "epoch": 0.04155, + "grad_norm": 0.3248656988143921, + "learning_rate": 4.6545e-05, + "loss": 0.0687, + "step": 9310 + }, + { + "epoch": 0.0416, + "grad_norm": 0.30441874265670776, + "learning_rate": 4.6595e-05, + "loss": 0.0648, + "step": 9320 + }, + { + "epoch": 0.04165, + "grad_norm": 0.3399603068828583, + "learning_rate": 4.6645e-05, + "loss": 0.0654, + "step": 9330 + }, + { + "epoch": 0.0417, + "grad_norm": 0.3140687048435211, + "learning_rate": 4.6695e-05, + "loss": 0.0657, + "step": 9340 + }, + { + "epoch": 0.04175, + "grad_norm": 0.37293708324432373, + "learning_rate": 4.6745e-05, + "loss": 0.0621, + "step": 9350 + }, + { + "epoch": 0.0418, + "grad_norm": 0.3210163414478302, + "learning_rate": 4.6795e-05, + "loss": 0.0633, + "step": 9360 + }, + { + "epoch": 0.04185, + "grad_norm": 0.3903563320636749, + "learning_rate": 4.6845e-05, + "loss": 0.0769, + "step": 9370 + }, + { + "epoch": 0.0419, + "grad_norm": 0.3045681118965149, + "learning_rate": 4.6895e-05, + "loss": 0.0667, + "step": 9380 + }, + { + "epoch": 0.04195, + "grad_norm": 0.3322887718677521, + "learning_rate": 4.6945e-05, + "loss": 0.0637, + "step": 9390 + }, + { + "epoch": 0.042, + "grad_norm": 0.2262994796037674, + "learning_rate": 4.6995e-05, + "loss": 0.0638, + "step": 9400 + }, + { + "epoch": 0.04205, + "grad_norm": 0.2820618152618408, + "learning_rate": 4.7045e-05, + "loss": 0.0638, + "step": 9410 + }, + { + "epoch": 0.0421, + "grad_norm": 0.3317321538925171, + "learning_rate": 4.7095e-05, + "loss": 0.0691, + "step": 9420 + }, + { + "epoch": 0.04215, + "grad_norm": 0.43117380142211914, + "learning_rate": 4.7145000000000003e-05, + "loss": 0.0686, + "step": 9430 + }, + { + "epoch": 0.0422, + "grad_norm": 0.3164169490337372, + "learning_rate": 4.7195e-05, + "loss": 0.0666, + "step": 9440 + }, + { + "epoch": 0.04225, + "grad_norm": 0.27061155438423157, + "learning_rate": 4.7245e-05, + "loss": 0.0652, + "step": 9450 + }, + { + "epoch": 0.0423, + "grad_norm": 0.32976457476615906, + "learning_rate": 4.7295e-05, + "loss": 0.0652, + "step": 9460 + }, + { + "epoch": 0.04235, + "grad_norm": 0.2550129294395447, + "learning_rate": 4.7345e-05, + "loss": 0.064, + "step": 9470 + }, + { + "epoch": 0.0424, + "grad_norm": 0.3532482385635376, + "learning_rate": 4.7395e-05, + "loss": 0.0667, + "step": 9480 + }, + { + "epoch": 0.04245, + "grad_norm": 0.3003195524215698, + "learning_rate": 4.7445e-05, + "loss": 0.0681, + "step": 9490 + }, + { + "epoch": 0.0425, + "grad_norm": 0.42255571484565735, + "learning_rate": 4.7495e-05, + "loss": 0.0643, + "step": 9500 + }, + { + "epoch": 0.04255, + "grad_norm": 0.3724454939365387, + "learning_rate": 4.7545e-05, + "loss": 0.0669, + "step": 9510 + }, + { + "epoch": 0.0426, + "grad_norm": 0.3489012122154236, + "learning_rate": 4.7595e-05, + "loss": 0.0632, + "step": 9520 + }, + { + "epoch": 0.04265, + "grad_norm": 0.33121049404144287, + "learning_rate": 4.7645e-05, + "loss": 0.0667, + "step": 9530 + }, + { + "epoch": 0.0427, + "grad_norm": 0.35532277822494507, + "learning_rate": 4.7695e-05, + "loss": 0.0692, + "step": 9540 + }, + { + "epoch": 0.04275, + "grad_norm": 0.28395581245422363, + "learning_rate": 4.7745e-05, + "loss": 0.0634, + "step": 9550 + }, + { + "epoch": 0.0428, + "grad_norm": 0.25116708874702454, + "learning_rate": 4.7795e-05, + "loss": 0.0647, + "step": 9560 + }, + { + "epoch": 0.04285, + "grad_norm": 0.32771411538124084, + "learning_rate": 4.7845e-05, + "loss": 0.0633, + "step": 9570 + }, + { + "epoch": 0.0429, + "grad_norm": 0.3212730884552002, + "learning_rate": 4.7895e-05, + "loss": 0.0629, + "step": 9580 + }, + { + "epoch": 0.04295, + "grad_norm": 0.2865977883338928, + "learning_rate": 4.7945e-05, + "loss": 0.0633, + "step": 9590 + }, + { + "epoch": 0.043, + "grad_norm": 0.29332104325294495, + "learning_rate": 4.7995000000000004e-05, + "loss": 0.064, + "step": 9600 + }, + { + "epoch": 0.04305, + "grad_norm": 0.28664249181747437, + "learning_rate": 4.8045e-05, + "loss": 0.0602, + "step": 9610 + }, + { + "epoch": 0.0431, + "grad_norm": 0.3150465786457062, + "learning_rate": 4.8095e-05, + "loss": 0.0644, + "step": 9620 + }, + { + "epoch": 0.04315, + "grad_norm": 0.3172779083251953, + "learning_rate": 4.8145e-05, + "loss": 0.065, + "step": 9630 + }, + { + "epoch": 0.0432, + "grad_norm": 0.2557525038719177, + "learning_rate": 4.8195e-05, + "loss": 0.0621, + "step": 9640 + }, + { + "epoch": 0.04325, + "grad_norm": 0.35068729519844055, + "learning_rate": 4.8245e-05, + "loss": 0.0641, + "step": 9650 + }, + { + "epoch": 0.0433, + "grad_norm": 0.36322087049484253, + "learning_rate": 4.8295000000000004e-05, + "loss": 0.0623, + "step": 9660 + }, + { + "epoch": 0.04335, + "grad_norm": 0.2819206416606903, + "learning_rate": 4.8345e-05, + "loss": 0.0635, + "step": 9670 + }, + { + "epoch": 0.0434, + "grad_norm": 0.25681859254837036, + "learning_rate": 4.8395e-05, + "loss": 0.0603, + "step": 9680 + }, + { + "epoch": 0.04345, + "grad_norm": 0.35684749484062195, + "learning_rate": 4.8445e-05, + "loss": 0.0655, + "step": 9690 + }, + { + "epoch": 0.0435, + "grad_norm": 0.3112817704677582, + "learning_rate": 4.8495e-05, + "loss": 0.069, + "step": 9700 + }, + { + "epoch": 0.04355, + "grad_norm": 0.3422347605228424, + "learning_rate": 4.8545000000000004e-05, + "loss": 0.064, + "step": 9710 + }, + { + "epoch": 0.0436, + "grad_norm": 0.2875858247280121, + "learning_rate": 4.8595000000000005e-05, + "loss": 0.0626, + "step": 9720 + }, + { + "epoch": 0.04365, + "grad_norm": 0.25618475675582886, + "learning_rate": 4.8645e-05, + "loss": 0.0648, + "step": 9730 + }, + { + "epoch": 0.0437, + "grad_norm": 0.26029500365257263, + "learning_rate": 4.8695e-05, + "loss": 0.0634, + "step": 9740 + }, + { + "epoch": 0.04375, + "grad_norm": 0.2977071702480316, + "learning_rate": 4.8745e-05, + "loss": 0.0673, + "step": 9750 + }, + { + "epoch": 0.0438, + "grad_norm": 0.21176737546920776, + "learning_rate": 4.8795e-05, + "loss": 0.0613, + "step": 9760 + }, + { + "epoch": 0.04385, + "grad_norm": 0.30652615427970886, + "learning_rate": 4.8845000000000004e-05, + "loss": 0.0639, + "step": 9770 + }, + { + "epoch": 0.0439, + "grad_norm": 0.38696739077568054, + "learning_rate": 4.8895e-05, + "loss": 0.0628, + "step": 9780 + }, + { + "epoch": 0.04395, + "grad_norm": 0.3368676006793976, + "learning_rate": 4.8945e-05, + "loss": 0.0657, + "step": 9790 + }, + { + "epoch": 0.044, + "grad_norm": 0.3058677315711975, + "learning_rate": 4.8995e-05, + "loss": 0.0602, + "step": 9800 + }, + { + "epoch": 0.04405, + "grad_norm": 0.2631063461303711, + "learning_rate": 4.9045e-05, + "loss": 0.0631, + "step": 9810 + }, + { + "epoch": 0.0441, + "grad_norm": 0.2672768533229828, + "learning_rate": 4.9095000000000003e-05, + "loss": 0.0631, + "step": 9820 + }, + { + "epoch": 0.04415, + "grad_norm": 0.3295259177684784, + "learning_rate": 4.9145000000000005e-05, + "loss": 0.0675, + "step": 9830 + }, + { + "epoch": 0.0442, + "grad_norm": 0.32516661286354065, + "learning_rate": 4.9195e-05, + "loss": 0.0616, + "step": 9840 + }, + { + "epoch": 0.04425, + "grad_norm": 0.3430582284927368, + "learning_rate": 4.9245e-05, + "loss": 0.0659, + "step": 9850 + }, + { + "epoch": 0.0443, + "grad_norm": 0.27236413955688477, + "learning_rate": 4.9295e-05, + "loss": 0.0613, + "step": 9860 + }, + { + "epoch": 0.04435, + "grad_norm": 0.30401864647865295, + "learning_rate": 4.9345e-05, + "loss": 0.0637, + "step": 9870 + }, + { + "epoch": 0.0444, + "grad_norm": 0.3249342143535614, + "learning_rate": 4.9395000000000004e-05, + "loss": 0.0651, + "step": 9880 + }, + { + "epoch": 0.04445, + "grad_norm": 0.3392452299594879, + "learning_rate": 4.9445000000000005e-05, + "loss": 0.0622, + "step": 9890 + }, + { + "epoch": 0.0445, + "grad_norm": 0.31362077593803406, + "learning_rate": 4.9495e-05, + "loss": 0.0629, + "step": 9900 + }, + { + "epoch": 0.04455, + "grad_norm": 0.26321685314178467, + "learning_rate": 4.9545e-05, + "loss": 0.0616, + "step": 9910 + }, + { + "epoch": 0.0446, + "grad_norm": 0.3222457766532898, + "learning_rate": 4.9595e-05, + "loss": 0.0646, + "step": 9920 + }, + { + "epoch": 0.04465, + "grad_norm": 0.3142569065093994, + "learning_rate": 4.9645e-05, + "loss": 0.0607, + "step": 9930 + }, + { + "epoch": 0.0447, + "grad_norm": 0.2683854103088379, + "learning_rate": 4.9695000000000004e-05, + "loss": 0.0615, + "step": 9940 + }, + { + "epoch": 0.04475, + "grad_norm": 0.31706681847572327, + "learning_rate": 4.9745000000000006e-05, + "loss": 0.0608, + "step": 9950 + }, + { + "epoch": 0.0448, + "grad_norm": 0.38556215167045593, + "learning_rate": 4.9795e-05, + "loss": 0.0637, + "step": 9960 + }, + { + "epoch": 0.04485, + "grad_norm": 0.29253655672073364, + "learning_rate": 4.9845e-05, + "loss": 0.0634, + "step": 9970 + }, + { + "epoch": 0.0449, + "grad_norm": 0.3062693774700165, + "learning_rate": 4.9895e-05, + "loss": 0.0652, + "step": 9980 + }, + { + "epoch": 0.04495, + "grad_norm": 0.3026590943336487, + "learning_rate": 4.9945000000000004e-05, + "loss": 0.0602, + "step": 9990 + }, + { + "epoch": 0.045, + "grad_norm": 0.35445278882980347, + "learning_rate": 4.9995000000000005e-05, + "loss": 0.0611, + "step": 10000 + }, + { + "epoch": 0.04505, + "grad_norm": 0.3197009265422821, + "learning_rate": 4.999999972318631e-05, + "loss": 0.0633, + "step": 10010 + }, + { + "epoch": 0.0451, + "grad_norm": 0.3079759478569031, + "learning_rate": 4.999999876629946e-05, + "loss": 0.0673, + "step": 10020 + }, + { + "epoch": 0.04515, + "grad_norm": 0.29146572947502136, + "learning_rate": 4.9999997125922047e-05, + "loss": 0.0636, + "step": 10030 + }, + { + "epoch": 0.0452, + "grad_norm": 0.26313549280166626, + "learning_rate": 4.9999994802054094e-05, + "loss": 0.0613, + "step": 10040 + }, + { + "epoch": 0.04525, + "grad_norm": 0.30985012650489807, + "learning_rate": 4.999999179469568e-05, + "loss": 0.0636, + "step": 10050 + }, + { + "epoch": 0.0453, + "grad_norm": 0.3062456250190735, + "learning_rate": 4.9999988103846886e-05, + "loss": 0.0641, + "step": 10060 + }, + { + "epoch": 0.04535, + "grad_norm": 0.32784801721572876, + "learning_rate": 4.9999983729507806e-05, + "loss": 0.0591, + "step": 10070 + }, + { + "epoch": 0.0454, + "grad_norm": 0.32923054695129395, + "learning_rate": 4.999997867167856e-05, + "loss": 0.0635, + "step": 10080 + }, + { + "epoch": 0.04545, + "grad_norm": 0.2901904881000519, + "learning_rate": 4.99999729303593e-05, + "loss": 0.0636, + "step": 10090 + }, + { + "epoch": 0.0455, + "grad_norm": 0.29496315121650696, + "learning_rate": 4.999996650555017e-05, + "loss": 0.0649, + "step": 10100 + }, + { + "epoch": 0.04555, + "grad_norm": 0.33649277687072754, + "learning_rate": 4.999995939725134e-05, + "loss": 0.0634, + "step": 10110 + }, + { + "epoch": 0.0456, + "grad_norm": 0.30145928263664246, + "learning_rate": 4.9999951605463015e-05, + "loss": 0.0625, + "step": 10120 + }, + { + "epoch": 0.04565, + "grad_norm": 0.3132384717464447, + "learning_rate": 4.99999431301854e-05, + "loss": 0.0645, + "step": 10130 + }, + { + "epoch": 0.0457, + "grad_norm": 0.24817843735218048, + "learning_rate": 4.999993397141874e-05, + "loss": 0.0618, + "step": 10140 + }, + { + "epoch": 0.04575, + "grad_norm": 0.2636968195438385, + "learning_rate": 4.999992412916327e-05, + "loss": 0.068, + "step": 10150 + }, + { + "epoch": 0.0458, + "grad_norm": 0.36269840598106384, + "learning_rate": 4.999991360341927e-05, + "loss": 0.0638, + "step": 10160 + }, + { + "epoch": 0.04585, + "grad_norm": 0.43733730912208557, + "learning_rate": 4.9999902394187024e-05, + "loss": 0.0663, + "step": 10170 + }, + { + "epoch": 0.0459, + "grad_norm": 0.32769593596458435, + "learning_rate": 4.999989050146684e-05, + "loss": 0.0639, + "step": 10180 + }, + { + "epoch": 0.04595, + "grad_norm": 0.269916296005249, + "learning_rate": 4.999987792525904e-05, + "loss": 0.0656, + "step": 10190 + }, + { + "epoch": 0.046, + "grad_norm": 0.2898986041545868, + "learning_rate": 4.9999864665563975e-05, + "loss": 0.0661, + "step": 10200 + }, + { + "epoch": 0.04605, + "grad_norm": 0.32004767656326294, + "learning_rate": 4.999985072238199e-05, + "loss": 0.0655, + "step": 10210 + }, + { + "epoch": 0.0461, + "grad_norm": 0.2851164937019348, + "learning_rate": 4.999983609571349e-05, + "loss": 0.0605, + "step": 10220 + }, + { + "epoch": 0.04615, + "grad_norm": 0.22278539836406708, + "learning_rate": 4.999982078555886e-05, + "loss": 0.0658, + "step": 10230 + }, + { + "epoch": 0.0462, + "grad_norm": 0.31230485439300537, + "learning_rate": 4.999980479191852e-05, + "loss": 0.0636, + "step": 10240 + }, + { + "epoch": 0.04625, + "grad_norm": 0.2774222493171692, + "learning_rate": 4.999978811479291e-05, + "loss": 0.0618, + "step": 10250 + }, + { + "epoch": 0.0463, + "grad_norm": 0.22640889883041382, + "learning_rate": 4.999977075418248e-05, + "loss": 0.0625, + "step": 10260 + }, + { + "epoch": 0.04635, + "grad_norm": 0.23389208316802979, + "learning_rate": 4.999975271008772e-05, + "loss": 0.0623, + "step": 10270 + }, + { + "epoch": 0.0464, + "grad_norm": 0.25524914264678955, + "learning_rate": 4.99997339825091e-05, + "loss": 0.0583, + "step": 10280 + }, + { + "epoch": 0.04645, + "grad_norm": 0.26234862208366394, + "learning_rate": 4.999971457144715e-05, + "loss": 0.0585, + "step": 10290 + }, + { + "epoch": 0.0465, + "grad_norm": 0.2599146366119385, + "learning_rate": 4.9999694476902404e-05, + "loss": 0.0618, + "step": 10300 + }, + { + "epoch": 0.04655, + "grad_norm": 0.24796922504901886, + "learning_rate": 4.9999673698875393e-05, + "loss": 0.0576, + "step": 10310 + }, + { + "epoch": 0.0466, + "grad_norm": 0.31469812989234924, + "learning_rate": 4.99996522373667e-05, + "loss": 0.061, + "step": 10320 + }, + { + "epoch": 0.04665, + "grad_norm": 0.2860850989818573, + "learning_rate": 4.99996300923769e-05, + "loss": 0.0617, + "step": 10330 + }, + { + "epoch": 0.0467, + "grad_norm": 0.2586144804954529, + "learning_rate": 4.999960726390662e-05, + "loss": 0.059, + "step": 10340 + }, + { + "epoch": 0.04675, + "grad_norm": 0.2587771415710449, + "learning_rate": 4.9999583751956455e-05, + "loss": 0.0596, + "step": 10350 + }, + { + "epoch": 0.0468, + "grad_norm": 0.22751501202583313, + "learning_rate": 4.999955955652706e-05, + "loss": 0.0582, + "step": 10360 + }, + { + "epoch": 0.04685, + "grad_norm": 0.2889695167541504, + "learning_rate": 4.999953467761911e-05, + "loss": 0.0612, + "step": 10370 + }, + { + "epoch": 0.0469, + "grad_norm": 0.27310308814048767, + "learning_rate": 4.9999509115233275e-05, + "loss": 0.0612, + "step": 10380 + }, + { + "epoch": 0.04695, + "grad_norm": 0.2685157358646393, + "learning_rate": 4.999948286937024e-05, + "loss": 0.0592, + "step": 10390 + }, + { + "epoch": 0.047, + "grad_norm": 0.24792355298995972, + "learning_rate": 4.9999455940030746e-05, + "loss": 0.059, + "step": 10400 + }, + { + "epoch": 0.04705, + "grad_norm": 0.26915544271469116, + "learning_rate": 4.999942832721551e-05, + "loss": 0.0577, + "step": 10410 + }, + { + "epoch": 0.0471, + "grad_norm": 0.28766775131225586, + "learning_rate": 4.99994000309253e-05, + "loss": 0.0606, + "step": 10420 + }, + { + "epoch": 0.04715, + "grad_norm": 0.32877805829048157, + "learning_rate": 4.9999371051160893e-05, + "loss": 0.0605, + "step": 10430 + }, + { + "epoch": 0.0472, + "grad_norm": 0.3041923940181732, + "learning_rate": 4.999934138792306e-05, + "loss": 0.0591, + "step": 10440 + }, + { + "epoch": 0.04725, + "grad_norm": 0.2929264307022095, + "learning_rate": 4.999931104121263e-05, + "loss": 0.0621, + "step": 10450 + }, + { + "epoch": 0.0473, + "grad_norm": 0.2993503510951996, + "learning_rate": 4.999928001103043e-05, + "loss": 0.0598, + "step": 10460 + }, + { + "epoch": 0.04735, + "grad_norm": 0.33387285470962524, + "learning_rate": 4.99992482973773e-05, + "loss": 0.0581, + "step": 10470 + }, + { + "epoch": 0.0474, + "grad_norm": 0.30160704255104065, + "learning_rate": 4.999921590025412e-05, + "loss": 0.0656, + "step": 10480 + }, + { + "epoch": 0.04745, + "grad_norm": 0.2685299217700958, + "learning_rate": 4.9999182819661774e-05, + "loss": 0.0628, + "step": 10490 + }, + { + "epoch": 0.0475, + "grad_norm": 0.2762181758880615, + "learning_rate": 4.999914905560115e-05, + "loss": 0.0636, + "step": 10500 + }, + { + "epoch": 0.04755, + "grad_norm": 0.315581351518631, + "learning_rate": 4.999911460807318e-05, + "loss": 0.0625, + "step": 10510 + }, + { + "epoch": 0.0476, + "grad_norm": 0.34477338194847107, + "learning_rate": 4.999907947707882e-05, + "loss": 0.0626, + "step": 10520 + }, + { + "epoch": 0.04765, + "grad_norm": 0.2868213355541229, + "learning_rate": 4.999904366261902e-05, + "loss": 0.0612, + "step": 10530 + }, + { + "epoch": 0.0477, + "grad_norm": 0.27307435870170593, + "learning_rate": 4.999900716469475e-05, + "loss": 0.0634, + "step": 10540 + }, + { + "epoch": 0.04775, + "grad_norm": 0.3192185163497925, + "learning_rate": 4.999896998330701e-05, + "loss": 0.059, + "step": 10550 + }, + { + "epoch": 0.0478, + "grad_norm": 0.2786751091480255, + "learning_rate": 4.999893211845684e-05, + "loss": 0.0579, + "step": 10560 + }, + { + "epoch": 0.04785, + "grad_norm": 0.26636555790901184, + "learning_rate": 4.9998893570145243e-05, + "loss": 0.0598, + "step": 10570 + }, + { + "epoch": 0.0479, + "grad_norm": 0.28624773025512695, + "learning_rate": 4.999885433837329e-05, + "loss": 0.0639, + "step": 10580 + }, + { + "epoch": 0.04795, + "grad_norm": 0.26311632990837097, + "learning_rate": 4.999881442314206e-05, + "loss": 0.0599, + "step": 10590 + }, + { + "epoch": 0.048, + "grad_norm": 0.33889976143836975, + "learning_rate": 4.999877382445263e-05, + "loss": 0.0602, + "step": 10600 + }, + { + "epoch": 0.04805, + "grad_norm": 0.25893542170524597, + "learning_rate": 4.999873254230611e-05, + "loss": 0.0615, + "step": 10610 + }, + { + "epoch": 0.0481, + "grad_norm": 0.2611432671546936, + "learning_rate": 4.9998690576703635e-05, + "loss": 0.0589, + "step": 10620 + }, + { + "epoch": 0.04815, + "grad_norm": 0.23470397293567657, + "learning_rate": 4.9998647927646355e-05, + "loss": 0.0582, + "step": 10630 + }, + { + "epoch": 0.0482, + "grad_norm": 0.2380250096321106, + "learning_rate": 4.9998604595135435e-05, + "loss": 0.0593, + "step": 10640 + }, + { + "epoch": 0.04825, + "grad_norm": 0.35195785760879517, + "learning_rate": 4.999856057917205e-05, + "loss": 0.0636, + "step": 10650 + }, + { + "epoch": 0.0483, + "grad_norm": 0.3239285349845886, + "learning_rate": 4.9998515879757415e-05, + "loss": 0.0652, + "step": 10660 + }, + { + "epoch": 0.04835, + "grad_norm": 0.32478657364845276, + "learning_rate": 4.9998470496892746e-05, + "loss": 0.0632, + "step": 10670 + }, + { + "epoch": 0.0484, + "grad_norm": 0.2464042752981186, + "learning_rate": 4.999842443057928e-05, + "loss": 0.0584, + "step": 10680 + }, + { + "epoch": 0.04845, + "grad_norm": 0.2592853009700775, + "learning_rate": 4.9998377680818286e-05, + "loss": 0.0595, + "step": 10690 + }, + { + "epoch": 0.0485, + "grad_norm": 0.25503969192504883, + "learning_rate": 4.999833024761104e-05, + "loss": 0.058, + "step": 10700 + }, + { + "epoch": 0.04855, + "grad_norm": 0.2166227400302887, + "learning_rate": 4.999828213095883e-05, + "loss": 0.0581, + "step": 10710 + }, + { + "epoch": 0.0486, + "grad_norm": 0.3099343180656433, + "learning_rate": 4.9998233330862984e-05, + "loss": 0.0616, + "step": 10720 + }, + { + "epoch": 0.04865, + "grad_norm": 0.3154681324958801, + "learning_rate": 4.9998183847324833e-05, + "loss": 0.0577, + "step": 10730 + }, + { + "epoch": 0.0487, + "grad_norm": 0.2956450283527374, + "learning_rate": 4.999813368034572e-05, + "loss": 0.0591, + "step": 10740 + }, + { + "epoch": 0.04875, + "grad_norm": 0.34081724286079407, + "learning_rate": 4.999808282992703e-05, + "loss": 0.0616, + "step": 10750 + }, + { + "epoch": 0.0488, + "grad_norm": 0.24640336632728577, + "learning_rate": 4.9998031296070144e-05, + "loss": 0.0605, + "step": 10760 + }, + { + "epoch": 0.04885, + "grad_norm": 0.25215527415275574, + "learning_rate": 4.999797907877647e-05, + "loss": 0.0593, + "step": 10770 + }, + { + "epoch": 0.0489, + "grad_norm": 0.2916519343852997, + "learning_rate": 4.999792617804744e-05, + "loss": 0.059, + "step": 10780 + }, + { + "epoch": 0.04895, + "grad_norm": 0.30040955543518066, + "learning_rate": 4.99978725938845e-05, + "loss": 0.0611, + "step": 10790 + }, + { + "epoch": 0.049, + "grad_norm": 0.2178073674440384, + "learning_rate": 4.999781832628911e-05, + "loss": 0.0586, + "step": 10800 + }, + { + "epoch": 0.04905, + "grad_norm": 0.2435062974691391, + "learning_rate": 4.999776337526277e-05, + "loss": 0.0605, + "step": 10810 + }, + { + "epoch": 0.0491, + "grad_norm": 0.2754117548465729, + "learning_rate": 4.999770774080696e-05, + "loss": 0.0585, + "step": 10820 + }, + { + "epoch": 0.04915, + "grad_norm": 0.25847530364990234, + "learning_rate": 4.999765142292322e-05, + "loss": 0.0574, + "step": 10830 + }, + { + "epoch": 0.0492, + "grad_norm": 0.2868664860725403, + "learning_rate": 4.999759442161308e-05, + "loss": 0.0591, + "step": 10840 + }, + { + "epoch": 0.04925, + "grad_norm": 0.2413976490497589, + "learning_rate": 4.99975367368781e-05, + "loss": 0.0602, + "step": 10850 + }, + { + "epoch": 0.0493, + "grad_norm": 0.2344127595424652, + "learning_rate": 4.999747836871985e-05, + "loss": 0.0636, + "step": 10860 + }, + { + "epoch": 0.04935, + "grad_norm": 0.24767453968524933, + "learning_rate": 4.999741931713994e-05, + "loss": 0.0589, + "step": 10870 + }, + { + "epoch": 0.0494, + "grad_norm": 0.28867772221565247, + "learning_rate": 4.9997359582139966e-05, + "loss": 0.0627, + "step": 10880 + }, + { + "epoch": 0.04945, + "grad_norm": 0.3197946846485138, + "learning_rate": 4.999729916372159e-05, + "loss": 0.0634, + "step": 10890 + }, + { + "epoch": 0.0495, + "grad_norm": 0.29469308257102966, + "learning_rate": 4.999723806188643e-05, + "loss": 0.0612, + "step": 10900 + }, + { + "epoch": 0.04955, + "grad_norm": 0.23049208521842957, + "learning_rate": 4.999717627663618e-05, + "loss": 0.0595, + "step": 10910 + }, + { + "epoch": 0.0496, + "grad_norm": 0.24875399470329285, + "learning_rate": 4.9997113807972516e-05, + "loss": 0.0623, + "step": 10920 + }, + { + "epoch": 0.04965, + "grad_norm": 0.21667809784412384, + "learning_rate": 4.999705065589716e-05, + "loss": 0.0618, + "step": 10930 + }, + { + "epoch": 0.0497, + "grad_norm": 0.26928454637527466, + "learning_rate": 4.9996986820411824e-05, + "loss": 0.0607, + "step": 10940 + }, + { + "epoch": 0.04975, + "grad_norm": 0.30384913086891174, + "learning_rate": 4.999692230151825e-05, + "loss": 0.0617, + "step": 10950 + }, + { + "epoch": 0.0498, + "grad_norm": 0.3048796057701111, + "learning_rate": 4.999685709921823e-05, + "loss": 0.0608, + "step": 10960 + }, + { + "epoch": 0.04985, + "grad_norm": 0.25846514105796814, + "learning_rate": 4.999679121351352e-05, + "loss": 0.0623, + "step": 10970 + }, + { + "epoch": 0.0499, + "grad_norm": 0.28279492259025574, + "learning_rate": 4.999672464440592e-05, + "loss": 0.0604, + "step": 10980 + }, + { + "epoch": 0.04995, + "grad_norm": 0.24530085921287537, + "learning_rate": 4.999665739189727e-05, + "loss": 0.0569, + "step": 10990 + }, + { + "epoch": 0.05, + "grad_norm": 0.42206844687461853, + "learning_rate": 4.9996589455989404e-05, + "loss": 0.0626, + "step": 11000 + }, + { + "epoch": 0.05005, + "grad_norm": 0.24767158925533295, + "learning_rate": 4.999652083668416e-05, + "loss": 0.0609, + "step": 11010 + }, + { + "epoch": 0.0501, + "grad_norm": 0.273266077041626, + "learning_rate": 4.9996451533983435e-05, + "loss": 0.0619, + "step": 11020 + }, + { + "epoch": 0.05015, + "grad_norm": 0.30322688817977905, + "learning_rate": 4.99963815478891e-05, + "loss": 0.0622, + "step": 11030 + }, + { + "epoch": 0.0502, + "grad_norm": 0.2822934091091156, + "learning_rate": 4.9996310878403106e-05, + "loss": 0.06, + "step": 11040 + }, + { + "epoch": 0.05025, + "grad_norm": 0.3341050446033478, + "learning_rate": 4.9996239525527356e-05, + "loss": 0.0579, + "step": 11050 + }, + { + "epoch": 0.0503, + "grad_norm": 0.2769171893596649, + "learning_rate": 4.99961674892638e-05, + "loss": 0.0567, + "step": 11060 + }, + { + "epoch": 0.05035, + "grad_norm": 0.2524805963039398, + "learning_rate": 4.999609476961442e-05, + "loss": 0.0596, + "step": 11070 + }, + { + "epoch": 0.0504, + "grad_norm": 0.2670571208000183, + "learning_rate": 4.99960213665812e-05, + "loss": 0.0632, + "step": 11080 + }, + { + "epoch": 0.05045, + "grad_norm": 0.26169538497924805, + "learning_rate": 4.999594728016615e-05, + "loss": 0.0636, + "step": 11090 + }, + { + "epoch": 0.0505, + "grad_norm": 0.30789557099342346, + "learning_rate": 4.999587251037129e-05, + "loss": 0.0651, + "step": 11100 + }, + { + "epoch": 0.05055, + "grad_norm": 0.2783627510070801, + "learning_rate": 4.999579705719866e-05, + "loss": 0.0615, + "step": 11110 + }, + { + "epoch": 0.0506, + "grad_norm": 0.3286147117614746, + "learning_rate": 4.999572092065034e-05, + "loss": 0.0612, + "step": 11120 + }, + { + "epoch": 0.05065, + "grad_norm": 0.2502022385597229, + "learning_rate": 4.999564410072839e-05, + "loss": 0.0613, + "step": 11130 + }, + { + "epoch": 0.0507, + "grad_norm": 0.23253153264522552, + "learning_rate": 4.999556659743493e-05, + "loss": 0.0585, + "step": 11140 + }, + { + "epoch": 0.05075, + "grad_norm": 0.21726743876934052, + "learning_rate": 4.999548841077206e-05, + "loss": 0.059, + "step": 11150 + }, + { + "epoch": 0.0508, + "grad_norm": 0.2250111848115921, + "learning_rate": 4.9995409540741934e-05, + "loss": 0.0597, + "step": 11160 + }, + { + "epoch": 0.05085, + "grad_norm": 0.22477279603481293, + "learning_rate": 4.99953299873467e-05, + "loss": 0.0629, + "step": 11170 + }, + { + "epoch": 0.0509, + "grad_norm": 0.2970663011074066, + "learning_rate": 4.9995249750588535e-05, + "loss": 0.0627, + "step": 11180 + }, + { + "epoch": 0.05095, + "grad_norm": 0.2567269802093506, + "learning_rate": 4.999516883046963e-05, + "loss": 0.0585, + "step": 11190 + }, + { + "epoch": 0.051, + "grad_norm": 0.2891165614128113, + "learning_rate": 4.99950872269922e-05, + "loss": 0.06, + "step": 11200 + }, + { + "epoch": 0.05105, + "grad_norm": 0.3167526125907898, + "learning_rate": 4.9995004940158476e-05, + "loss": 0.0616, + "step": 11210 + }, + { + "epoch": 0.0511, + "grad_norm": 0.3454287648200989, + "learning_rate": 4.99949219699707e-05, + "loss": 0.0652, + "step": 11220 + }, + { + "epoch": 0.05115, + "grad_norm": 0.28335583209991455, + "learning_rate": 4.999483831643116e-05, + "loss": 0.0659, + "step": 11230 + }, + { + "epoch": 0.0512, + "grad_norm": 0.28895190358161926, + "learning_rate": 4.999475397954213e-05, + "loss": 0.0603, + "step": 11240 + }, + { + "epoch": 0.05125, + "grad_norm": 0.2975988984107971, + "learning_rate": 4.99946689593059e-05, + "loss": 0.0614, + "step": 11250 + }, + { + "epoch": 0.0513, + "grad_norm": 0.2765286862850189, + "learning_rate": 4.999458325572483e-05, + "loss": 0.0584, + "step": 11260 + }, + { + "epoch": 0.05135, + "grad_norm": 0.2741754651069641, + "learning_rate": 4.999449686880123e-05, + "loss": 0.0663, + "step": 11270 + }, + { + "epoch": 0.0514, + "grad_norm": 0.28087082505226135, + "learning_rate": 4.999440979853749e-05, + "loss": 0.0589, + "step": 11280 + }, + { + "epoch": 0.05145, + "grad_norm": 0.2920054793357849, + "learning_rate": 4.999432204493596e-05, + "loss": 0.0627, + "step": 11290 + }, + { + "epoch": 0.0515, + "grad_norm": 0.25342732667922974, + "learning_rate": 4.9994233607999064e-05, + "loss": 0.0571, + "step": 11300 + }, + { + "epoch": 0.05155, + "grad_norm": 0.258687824010849, + "learning_rate": 4.999414448772921e-05, + "loss": 0.0605, + "step": 11310 + }, + { + "epoch": 0.0516, + "grad_norm": 0.26928308606147766, + "learning_rate": 4.999405468412883e-05, + "loss": 0.0582, + "step": 11320 + }, + { + "epoch": 0.05165, + "grad_norm": 0.20206747949123383, + "learning_rate": 4.9993964197200394e-05, + "loss": 0.0551, + "step": 11330 + }, + { + "epoch": 0.0517, + "grad_norm": 0.2057802826166153, + "learning_rate": 4.999387302694636e-05, + "loss": 0.0537, + "step": 11340 + }, + { + "epoch": 0.05175, + "grad_norm": 0.23069089651107788, + "learning_rate": 4.999378117336924e-05, + "loss": 0.0584, + "step": 11350 + }, + { + "epoch": 0.0518, + "grad_norm": 0.33335962891578674, + "learning_rate": 4.9993688636471516e-05, + "loss": 0.0623, + "step": 11360 + }, + { + "epoch": 0.05185, + "grad_norm": 0.2621992528438568, + "learning_rate": 4.999359541625574e-05, + "loss": 0.0598, + "step": 11370 + }, + { + "epoch": 0.0519, + "grad_norm": 0.2107490450143814, + "learning_rate": 4.9993501512724463e-05, + "loss": 0.0545, + "step": 11380 + }, + { + "epoch": 0.05195, + "grad_norm": 0.24193622171878815, + "learning_rate": 4.9993406925880233e-05, + "loss": 0.0563, + "step": 11390 + }, + { + "epoch": 0.052, + "grad_norm": 0.23652245104312897, + "learning_rate": 4.999331165572565e-05, + "loss": 0.058, + "step": 11400 + }, + { + "epoch": 0.05205, + "grad_norm": 0.29875287413597107, + "learning_rate": 4.9993215702263316e-05, + "loss": 0.0597, + "step": 11410 + }, + { + "epoch": 0.0521, + "grad_norm": 0.2521936297416687, + "learning_rate": 4.999311906549585e-05, + "loss": 0.057, + "step": 11420 + }, + { + "epoch": 0.05215, + "grad_norm": 0.3101290762424469, + "learning_rate": 4.999302174542591e-05, + "loss": 0.0588, + "step": 11430 + }, + { + "epoch": 0.0522, + "grad_norm": 0.2589113712310791, + "learning_rate": 4.999292374205614e-05, + "loss": 0.0603, + "step": 11440 + }, + { + "epoch": 0.05225, + "grad_norm": 0.25411316752433777, + "learning_rate": 4.999282505538922e-05, + "loss": 0.0636, + "step": 11450 + }, + { + "epoch": 0.0523, + "grad_norm": 0.2493228316307068, + "learning_rate": 4.999272568542785e-05, + "loss": 0.0572, + "step": 11460 + }, + { + "epoch": 0.05235, + "grad_norm": 0.27624061703681946, + "learning_rate": 4.999262563217476e-05, + "loss": 0.0577, + "step": 11470 + }, + { + "epoch": 0.0524, + "grad_norm": 0.23362897336483002, + "learning_rate": 4.999252489563267e-05, + "loss": 0.0585, + "step": 11480 + }, + { + "epoch": 0.05245, + "grad_norm": 0.27527329325675964, + "learning_rate": 4.999242347580434e-05, + "loss": 0.0609, + "step": 11490 + }, + { + "epoch": 0.0525, + "grad_norm": 0.2193654328584671, + "learning_rate": 4.9992321372692535e-05, + "loss": 0.0578, + "step": 11500 + }, + { + "epoch": 0.05255, + "grad_norm": 0.2288377285003662, + "learning_rate": 4.999221858630005e-05, + "loss": 0.0581, + "step": 11510 + }, + { + "epoch": 0.0526, + "grad_norm": 0.2176082879304886, + "learning_rate": 4.9992115116629714e-05, + "loss": 0.058, + "step": 11520 + }, + { + "epoch": 0.05265, + "grad_norm": 0.21936215460300446, + "learning_rate": 4.9992010963684325e-05, + "loss": 0.0577, + "step": 11530 + }, + { + "epoch": 0.0527, + "grad_norm": 0.2650482952594757, + "learning_rate": 4.999190612746675e-05, + "loss": 0.0592, + "step": 11540 + }, + { + "epoch": 0.05275, + "grad_norm": 0.2885134816169739, + "learning_rate": 4.999180060797986e-05, + "loss": 0.0575, + "step": 11550 + }, + { + "epoch": 0.0528, + "grad_norm": 0.21044647693634033, + "learning_rate": 4.999169440522652e-05, + "loss": 0.0638, + "step": 11560 + }, + { + "epoch": 0.05285, + "grad_norm": 0.21848969161510468, + "learning_rate": 4.999158751920964e-05, + "loss": 0.0567, + "step": 11570 + }, + { + "epoch": 0.0529, + "grad_norm": 0.2269868552684784, + "learning_rate": 4.999147994993215e-05, + "loss": 0.0605, + "step": 11580 + }, + { + "epoch": 0.05295, + "grad_norm": 0.24068693816661835, + "learning_rate": 4.999137169739699e-05, + "loss": 0.0562, + "step": 11590 + }, + { + "epoch": 0.053, + "grad_norm": 0.2279026210308075, + "learning_rate": 4.999126276160711e-05, + "loss": 0.0551, + "step": 11600 + }, + { + "epoch": 0.05305, + "grad_norm": 0.2190355807542801, + "learning_rate": 4.9991153142565506e-05, + "loss": 0.0563, + "step": 11610 + }, + { + "epoch": 0.0531, + "grad_norm": 0.2589868903160095, + "learning_rate": 4.999104284027516e-05, + "loss": 0.0586, + "step": 11620 + }, + { + "epoch": 0.05315, + "grad_norm": 0.24582260847091675, + "learning_rate": 4.9990931854739084e-05, + "loss": 0.0571, + "step": 11630 + }, + { + "epoch": 0.0532, + "grad_norm": 0.2815797030925751, + "learning_rate": 4.999082018596033e-05, + "loss": 0.0587, + "step": 11640 + }, + { + "epoch": 0.05325, + "grad_norm": 0.3051389753818512, + "learning_rate": 4.999070783394193e-05, + "loss": 0.0615, + "step": 11650 + }, + { + "epoch": 0.0533, + "grad_norm": 0.2518411874771118, + "learning_rate": 4.9990594798686975e-05, + "loss": 0.057, + "step": 11660 + }, + { + "epoch": 0.05335, + "grad_norm": 0.2967738211154938, + "learning_rate": 4.9990481080198546e-05, + "loss": 0.0617, + "step": 11670 + }, + { + "epoch": 0.0534, + "grad_norm": 0.32430994510650635, + "learning_rate": 4.999036667847975e-05, + "loss": 0.0586, + "step": 11680 + }, + { + "epoch": 0.05345, + "grad_norm": 0.2484043687582016, + "learning_rate": 4.999025159353372e-05, + "loss": 0.0581, + "step": 11690 + }, + { + "epoch": 0.0535, + "grad_norm": 0.2939985394477844, + "learning_rate": 4.999013582536359e-05, + "loss": 0.0574, + "step": 11700 + }, + { + "epoch": 0.05355, + "grad_norm": 0.23284538090229034, + "learning_rate": 4.9990019373972554e-05, + "loss": 0.0581, + "step": 11710 + }, + { + "epoch": 0.0536, + "grad_norm": 0.2365046888589859, + "learning_rate": 4.9989902239363765e-05, + "loss": 0.057, + "step": 11720 + }, + { + "epoch": 0.05365, + "grad_norm": 0.2517213821411133, + "learning_rate": 4.998978442154043e-05, + "loss": 0.0563, + "step": 11730 + }, + { + "epoch": 0.0537, + "grad_norm": 0.21513904631137848, + "learning_rate": 4.998966592050579e-05, + "loss": 0.0566, + "step": 11740 + }, + { + "epoch": 0.05375, + "grad_norm": 0.2141963541507721, + "learning_rate": 4.9989546736263066e-05, + "loss": 0.0611, + "step": 11750 + }, + { + "epoch": 0.0538, + "grad_norm": 0.2445666640996933, + "learning_rate": 4.998942686881553e-05, + "loss": 0.0566, + "step": 11760 + }, + { + "epoch": 0.05385, + "grad_norm": 0.3019103705883026, + "learning_rate": 4.998930631816644e-05, + "loss": 0.0597, + "step": 11770 + }, + { + "epoch": 0.0539, + "grad_norm": 0.25654977560043335, + "learning_rate": 4.99891850843191e-05, + "loss": 0.0568, + "step": 11780 + }, + { + "epoch": 0.05395, + "grad_norm": 0.30889660120010376, + "learning_rate": 4.998906316727684e-05, + "loss": 0.0589, + "step": 11790 + }, + { + "epoch": 0.054, + "grad_norm": 0.26153361797332764, + "learning_rate": 4.9988940567042975e-05, + "loss": 0.057, + "step": 11800 + }, + { + "epoch": 0.05405, + "grad_norm": 0.25084272027015686, + "learning_rate": 4.998881728362086e-05, + "loss": 0.0605, + "step": 11810 + }, + { + "epoch": 0.0541, + "grad_norm": 0.25785118341445923, + "learning_rate": 4.998869331701388e-05, + "loss": 0.0651, + "step": 11820 + }, + { + "epoch": 0.05415, + "grad_norm": 0.2896030843257904, + "learning_rate": 4.99885686672254e-05, + "loss": 0.0596, + "step": 11830 + }, + { + "epoch": 0.0542, + "grad_norm": 0.2596687078475952, + "learning_rate": 4.998844333425885e-05, + "loss": 0.0586, + "step": 11840 + }, + { + "epoch": 0.05425, + "grad_norm": 0.28756245970726013, + "learning_rate": 4.998831731811764e-05, + "loss": 0.0591, + "step": 11850 + }, + { + "epoch": 0.0543, + "grad_norm": 0.23252467811107635, + "learning_rate": 4.998819061880523e-05, + "loss": 0.0589, + "step": 11860 + }, + { + "epoch": 0.05435, + "grad_norm": 0.3250674903392792, + "learning_rate": 4.998806323632507e-05, + "loss": 0.0593, + "step": 11870 + }, + { + "epoch": 0.0544, + "grad_norm": 0.23451420664787292, + "learning_rate": 4.998793517068065e-05, + "loss": 0.0575, + "step": 11880 + }, + { + "epoch": 0.05445, + "grad_norm": 0.25425106287002563, + "learning_rate": 4.9987806421875465e-05, + "loss": 0.0572, + "step": 11890 + }, + { + "epoch": 0.0545, + "grad_norm": 0.23875631392002106, + "learning_rate": 4.998767698991304e-05, + "loss": 0.0602, + "step": 11900 + }, + { + "epoch": 0.05455, + "grad_norm": 0.24599017202854156, + "learning_rate": 4.998754687479692e-05, + "loss": 0.0591, + "step": 11910 + }, + { + "epoch": 0.0546, + "grad_norm": 0.2623789310455322, + "learning_rate": 4.998741607653066e-05, + "loss": 0.0617, + "step": 11920 + }, + { + "epoch": 0.05465, + "grad_norm": 0.2282388061285019, + "learning_rate": 4.9987284595117825e-05, + "loss": 0.0566, + "step": 11930 + }, + { + "epoch": 0.0547, + "grad_norm": 0.22660212218761444, + "learning_rate": 4.998715243056201e-05, + "loss": 0.0545, + "step": 11940 + }, + { + "epoch": 0.05475, + "grad_norm": 0.20677420496940613, + "learning_rate": 4.9987019582866844e-05, + "loss": 0.0552, + "step": 11950 + }, + { + "epoch": 0.0548, + "grad_norm": 0.24677255749702454, + "learning_rate": 4.9986886052035954e-05, + "loss": 0.0555, + "step": 11960 + }, + { + "epoch": 0.05485, + "grad_norm": 0.23030756413936615, + "learning_rate": 4.998675183807298e-05, + "loss": 0.0596, + "step": 11970 + }, + { + "epoch": 0.0549, + "grad_norm": 0.2902722656726837, + "learning_rate": 4.99866169409816e-05, + "loss": 0.0591, + "step": 11980 + }, + { + "epoch": 0.05495, + "grad_norm": 0.310017853975296, + "learning_rate": 4.998648136076549e-05, + "loss": 0.0576, + "step": 11990 + }, + { + "epoch": 0.055, + "grad_norm": 0.2971934378147125, + "learning_rate": 4.9986345097428375e-05, + "loss": 0.0583, + "step": 12000 + }, + { + "epoch": 0.05505, + "grad_norm": 0.23468483984470367, + "learning_rate": 4.9986208150973975e-05, + "loss": 0.0587, + "step": 12010 + }, + { + "epoch": 0.0551, + "grad_norm": 0.22812537848949432, + "learning_rate": 4.998607052140603e-05, + "loss": 0.0582, + "step": 12020 + }, + { + "epoch": 0.05515, + "grad_norm": 0.2039913386106491, + "learning_rate": 4.99859322087283e-05, + "loss": 0.0573, + "step": 12030 + }, + { + "epoch": 0.0552, + "grad_norm": 0.21972723305225372, + "learning_rate": 4.998579321294456e-05, + "loss": 0.0598, + "step": 12040 + }, + { + "epoch": 0.05525, + "grad_norm": 0.2677394151687622, + "learning_rate": 4.998565353405864e-05, + "loss": 0.0588, + "step": 12050 + }, + { + "epoch": 0.0553, + "grad_norm": 0.24771089851856232, + "learning_rate": 4.998551317207433e-05, + "loss": 0.0577, + "step": 12060 + }, + { + "epoch": 0.05535, + "grad_norm": 0.22934655845165253, + "learning_rate": 4.9985372126995475e-05, + "loss": 0.059, + "step": 12070 + }, + { + "epoch": 0.0554, + "grad_norm": 0.31099027395248413, + "learning_rate": 4.998523039882594e-05, + "loss": 0.0591, + "step": 12080 + }, + { + "epoch": 0.05545, + "grad_norm": 0.2668203115463257, + "learning_rate": 4.9985087987569586e-05, + "loss": 0.0593, + "step": 12090 + }, + { + "epoch": 0.0555, + "grad_norm": 0.22269988059997559, + "learning_rate": 4.998494489323031e-05, + "loss": 0.0576, + "step": 12100 + }, + { + "epoch": 0.05555, + "grad_norm": 0.27321428060531616, + "learning_rate": 4.998480111581203e-05, + "loss": 0.0588, + "step": 12110 + }, + { + "epoch": 0.0556, + "grad_norm": 0.2842467427253723, + "learning_rate": 4.998465665531868e-05, + "loss": 0.0565, + "step": 12120 + }, + { + "epoch": 0.05565, + "grad_norm": 0.26087847352027893, + "learning_rate": 4.998451151175419e-05, + "loss": 0.0555, + "step": 12130 + }, + { + "epoch": 0.0557, + "grad_norm": 0.2942187190055847, + "learning_rate": 4.998436568512256e-05, + "loss": 0.0597, + "step": 12140 + }, + { + "epoch": 0.05575, + "grad_norm": 0.2344774305820465, + "learning_rate": 4.998421917542775e-05, + "loss": 0.0604, + "step": 12150 + }, + { + "epoch": 0.0558, + "grad_norm": 0.27858108282089233, + "learning_rate": 4.998407198267376e-05, + "loss": 0.0556, + "step": 12160 + }, + { + "epoch": 0.05585, + "grad_norm": 0.2372090369462967, + "learning_rate": 4.998392410686465e-05, + "loss": 0.06, + "step": 12170 + }, + { + "epoch": 0.0559, + "grad_norm": 0.24095700681209564, + "learning_rate": 4.998377554800443e-05, + "loss": 0.0579, + "step": 12180 + }, + { + "epoch": 0.05595, + "grad_norm": 0.22147738933563232, + "learning_rate": 4.998362630609717e-05, + "loss": 0.0578, + "step": 12190 + }, + { + "epoch": 0.056, + "grad_norm": 0.31405651569366455, + "learning_rate": 4.998347638114696e-05, + "loss": 0.0601, + "step": 12200 + }, + { + "epoch": 0.05605, + "grad_norm": 0.28502020239830017, + "learning_rate": 4.9983325773157886e-05, + "loss": 0.0577, + "step": 12210 + }, + { + "epoch": 0.0561, + "grad_norm": 0.3050804138183594, + "learning_rate": 4.9983174482134076e-05, + "loss": 0.0605, + "step": 12220 + }, + { + "epoch": 0.05615, + "grad_norm": 0.22686448693275452, + "learning_rate": 4.9983022508079655e-05, + "loss": 0.0556, + "step": 12230 + }, + { + "epoch": 0.0562, + "grad_norm": 0.2321311980485916, + "learning_rate": 4.998286985099879e-05, + "loss": 0.0586, + "step": 12240 + }, + { + "epoch": 0.05625, + "grad_norm": 0.24789652228355408, + "learning_rate": 4.998271651089564e-05, + "loss": 0.0614, + "step": 12250 + }, + { + "epoch": 0.0563, + "grad_norm": 0.27146685123443604, + "learning_rate": 4.998256248777442e-05, + "loss": 0.0595, + "step": 12260 + }, + { + "epoch": 0.05635, + "grad_norm": 0.2287423312664032, + "learning_rate": 4.998240778163932e-05, + "loss": 0.056, + "step": 12270 + }, + { + "epoch": 0.0564, + "grad_norm": 0.25738126039505005, + "learning_rate": 4.9982252392494576e-05, + "loss": 0.0573, + "step": 12280 + }, + { + "epoch": 0.05645, + "grad_norm": 0.21796013414859772, + "learning_rate": 4.998209632034444e-05, + "loss": 0.0566, + "step": 12290 + }, + { + "epoch": 0.0565, + "grad_norm": 0.18819575011730194, + "learning_rate": 4.998193956519317e-05, + "loss": 0.0569, + "step": 12300 + }, + { + "epoch": 0.05655, + "grad_norm": 0.3370819091796875, + "learning_rate": 4.998178212704506e-05, + "loss": 0.0607, + "step": 12310 + }, + { + "epoch": 0.0566, + "grad_norm": 0.3050325810909271, + "learning_rate": 4.998162400590442e-05, + "loss": 0.0553, + "step": 12320 + }, + { + "epoch": 0.05665, + "grad_norm": 0.3089597821235657, + "learning_rate": 4.9981465201775554e-05, + "loss": 0.0576, + "step": 12330 + }, + { + "epoch": 0.0567, + "grad_norm": 0.23376913368701935, + "learning_rate": 4.998130571466282e-05, + "loss": 0.056, + "step": 12340 + }, + { + "epoch": 0.05675, + "grad_norm": 0.22346380352973938, + "learning_rate": 4.9981145544570565e-05, + "loss": 0.0554, + "step": 12350 + }, + { + "epoch": 0.0568, + "grad_norm": 0.22319258749485016, + "learning_rate": 4.998098469150319e-05, + "loss": 0.057, + "step": 12360 + }, + { + "epoch": 0.05685, + "grad_norm": 0.19363442063331604, + "learning_rate": 4.998082315546506e-05, + "loss": 0.0584, + "step": 12370 + }, + { + "epoch": 0.0569, + "grad_norm": 0.23074665665626526, + "learning_rate": 4.9980660936460624e-05, + "loss": 0.0537, + "step": 12380 + }, + { + "epoch": 0.05695, + "grad_norm": 0.25440314412117004, + "learning_rate": 4.99804980344943e-05, + "loss": 0.0591, + "step": 12390 + }, + { + "epoch": 0.057, + "grad_norm": 0.24444711208343506, + "learning_rate": 4.998033444957054e-05, + "loss": 0.0549, + "step": 12400 + }, + { + "epoch": 0.05705, + "grad_norm": 0.28419366478919983, + "learning_rate": 4.998017018169383e-05, + "loss": 0.0555, + "step": 12410 + }, + { + "epoch": 0.0571, + "grad_norm": 0.24973613023757935, + "learning_rate": 4.998000523086864e-05, + "loss": 0.0555, + "step": 12420 + }, + { + "epoch": 0.05715, + "grad_norm": 0.23273035883903503, + "learning_rate": 4.99798395970995e-05, + "loss": 0.0576, + "step": 12430 + }, + { + "epoch": 0.0572, + "grad_norm": 0.18866272270679474, + "learning_rate": 4.997967328039093e-05, + "loss": 0.0565, + "step": 12440 + }, + { + "epoch": 0.05725, + "grad_norm": 0.22517941892147064, + "learning_rate": 4.997950628074747e-05, + "loss": 0.0588, + "step": 12450 + }, + { + "epoch": 0.0573, + "grad_norm": 0.24999724328517914, + "learning_rate": 4.99793385981737e-05, + "loss": 0.0601, + "step": 12460 + }, + { + "epoch": 0.05735, + "grad_norm": 0.21959833800792694, + "learning_rate": 4.997917023267419e-05, + "loss": 0.0563, + "step": 12470 + }, + { + "epoch": 0.0574, + "grad_norm": 0.2291775792837143, + "learning_rate": 4.997900118425356e-05, + "loss": 0.0549, + "step": 12480 + }, + { + "epoch": 0.05745, + "grad_norm": 0.2492663860321045, + "learning_rate": 4.997883145291641e-05, + "loss": 0.0554, + "step": 12490 + }, + { + "epoch": 0.0575, + "grad_norm": 0.23691225051879883, + "learning_rate": 4.99786610386674e-05, + "loss": 0.0576, + "step": 12500 + }, + { + "epoch": 0.05755, + "grad_norm": 0.21633702516555786, + "learning_rate": 4.9978489941511184e-05, + "loss": 0.0547, + "step": 12510 + }, + { + "epoch": 0.0576, + "grad_norm": 0.23120734095573425, + "learning_rate": 4.997831816145243e-05, + "loss": 0.0547, + "step": 12520 + }, + { + "epoch": 0.05765, + "grad_norm": 0.23655107617378235, + "learning_rate": 4.9978145698495845e-05, + "loss": 0.0542, + "step": 12530 + }, + { + "epoch": 0.0577, + "grad_norm": 0.2183103710412979, + "learning_rate": 4.9977972552646136e-05, + "loss": 0.0552, + "step": 12540 + }, + { + "epoch": 0.05775, + "grad_norm": 0.2202279269695282, + "learning_rate": 4.997779872390805e-05, + "loss": 0.0562, + "step": 12550 + }, + { + "epoch": 0.0578, + "grad_norm": 0.24577485024929047, + "learning_rate": 4.9977624212286324e-05, + "loss": 0.0535, + "step": 12560 + }, + { + "epoch": 0.05785, + "grad_norm": 0.24386066198349, + "learning_rate": 4.997744901778574e-05, + "loss": 0.0607, + "step": 12570 + }, + { + "epoch": 0.0579, + "grad_norm": 0.2942847013473511, + "learning_rate": 4.997727314041107e-05, + "loss": 0.0546, + "step": 12580 + }, + { + "epoch": 0.05795, + "grad_norm": 0.23982101678848267, + "learning_rate": 4.997709658016715e-05, + "loss": 0.0558, + "step": 12590 + }, + { + "epoch": 0.058, + "grad_norm": 0.23010914027690887, + "learning_rate": 4.997691933705879e-05, + "loss": 0.0566, + "step": 12600 + }, + { + "epoch": 0.05805, + "grad_norm": 0.23606491088867188, + "learning_rate": 4.997674141109084e-05, + "loss": 0.0557, + "step": 12610 + }, + { + "epoch": 0.0581, + "grad_norm": 0.22416196763515472, + "learning_rate": 4.997656280226816e-05, + "loss": 0.0553, + "step": 12620 + }, + { + "epoch": 0.05815, + "grad_norm": 0.23933905363082886, + "learning_rate": 4.997638351059564e-05, + "loss": 0.0568, + "step": 12630 + }, + { + "epoch": 0.0582, + "grad_norm": 0.2400108426809311, + "learning_rate": 4.997620353607817e-05, + "loss": 0.0538, + "step": 12640 + }, + { + "epoch": 0.05825, + "grad_norm": 0.20764097571372986, + "learning_rate": 4.9976022878720684e-05, + "loss": 0.0566, + "step": 12650 + }, + { + "epoch": 0.0583, + "grad_norm": 0.22233030200004578, + "learning_rate": 4.997584153852812e-05, + "loss": 0.056, + "step": 12660 + }, + { + "epoch": 0.05835, + "grad_norm": 0.23320798575878143, + "learning_rate": 4.997565951550542e-05, + "loss": 0.0566, + "step": 12670 + }, + { + "epoch": 0.0584, + "grad_norm": 0.2351810187101364, + "learning_rate": 4.997547680965758e-05, + "loss": 0.054, + "step": 12680 + }, + { + "epoch": 0.05845, + "grad_norm": 0.22825945913791656, + "learning_rate": 4.997529342098959e-05, + "loss": 0.0581, + "step": 12690 + }, + { + "epoch": 0.0585, + "grad_norm": 0.21631501615047455, + "learning_rate": 4.9975109349506455e-05, + "loss": 0.0542, + "step": 12700 + }, + { + "epoch": 0.05855, + "grad_norm": 0.26137256622314453, + "learning_rate": 4.997492459521321e-05, + "loss": 0.058, + "step": 12710 + }, + { + "epoch": 0.0586, + "grad_norm": 0.21283724904060364, + "learning_rate": 4.9974739158114916e-05, + "loss": 0.0568, + "step": 12720 + }, + { + "epoch": 0.05865, + "grad_norm": 0.22252531349658966, + "learning_rate": 4.9974553038216635e-05, + "loss": 0.057, + "step": 12730 + }, + { + "epoch": 0.0587, + "grad_norm": 0.21466626226902008, + "learning_rate": 4.997436623552345e-05, + "loss": 0.058, + "step": 12740 + }, + { + "epoch": 0.05875, + "grad_norm": 0.225613072514534, + "learning_rate": 4.997417875004048e-05, + "loss": 0.0541, + "step": 12750 + }, + { + "epoch": 0.0588, + "grad_norm": 0.22769001126289368, + "learning_rate": 4.997399058177284e-05, + "loss": 0.0557, + "step": 12760 + }, + { + "epoch": 0.05885, + "grad_norm": 0.24786868691444397, + "learning_rate": 4.997380173072569e-05, + "loss": 0.054, + "step": 12770 + }, + { + "epoch": 0.0589, + "grad_norm": 0.2872532308101654, + "learning_rate": 4.997361219690417e-05, + "loss": 0.0587, + "step": 12780 + }, + { + "epoch": 0.05895, + "grad_norm": 0.2511669993400574, + "learning_rate": 4.997342198031348e-05, + "loss": 0.0546, + "step": 12790 + }, + { + "epoch": 0.059, + "grad_norm": 0.3343372046947479, + "learning_rate": 4.997323108095883e-05, + "loss": 0.0593, + "step": 12800 + }, + { + "epoch": 0.05905, + "grad_norm": 0.26693952083587646, + "learning_rate": 4.997303949884541e-05, + "loss": 0.0564, + "step": 12810 + }, + { + "epoch": 0.0591, + "grad_norm": 0.23090007901191711, + "learning_rate": 4.997284723397847e-05, + "loss": 0.0598, + "step": 12820 + }, + { + "epoch": 0.05915, + "grad_norm": 0.22998195886611938, + "learning_rate": 4.997265428636328e-05, + "loss": 0.0573, + "step": 12830 + }, + { + "epoch": 0.0592, + "grad_norm": 0.24856556951999664, + "learning_rate": 4.997246065600508e-05, + "loss": 0.0601, + "step": 12840 + }, + { + "epoch": 0.05925, + "grad_norm": 0.2634192705154419, + "learning_rate": 4.997226634290921e-05, + "loss": 0.0584, + "step": 12850 + }, + { + "epoch": 0.0593, + "grad_norm": 0.3157845735549927, + "learning_rate": 4.997207134708095e-05, + "loss": 0.0557, + "step": 12860 + }, + { + "epoch": 0.05935, + "grad_norm": 0.22737300395965576, + "learning_rate": 4.9971875668525646e-05, + "loss": 0.0552, + "step": 12870 + }, + { + "epoch": 0.0594, + "grad_norm": 0.28176745772361755, + "learning_rate": 4.997167930724864e-05, + "loss": 0.0552, + "step": 12880 + }, + { + "epoch": 0.05945, + "grad_norm": 0.3244107663631439, + "learning_rate": 4.99714822632553e-05, + "loss": 0.058, + "step": 12890 + }, + { + "epoch": 0.0595, + "grad_norm": 0.26062488555908203, + "learning_rate": 4.9971284536551025e-05, + "loss": 0.0582, + "step": 12900 + }, + { + "epoch": 0.05955, + "grad_norm": 0.22571077942848206, + "learning_rate": 4.9971086127141206e-05, + "loss": 0.0548, + "step": 12910 + }, + { + "epoch": 0.0596, + "grad_norm": 0.2939530611038208, + "learning_rate": 4.9970887035031274e-05, + "loss": 0.0574, + "step": 12920 + }, + { + "epoch": 0.05965, + "grad_norm": 0.24956828355789185, + "learning_rate": 4.9970687260226665e-05, + "loss": 0.0569, + "step": 12930 + }, + { + "epoch": 0.0597, + "grad_norm": 0.2565845251083374, + "learning_rate": 4.997048680273286e-05, + "loss": 0.0536, + "step": 12940 + }, + { + "epoch": 0.05975, + "grad_norm": 0.22084426879882812, + "learning_rate": 4.9970285662555315e-05, + "loss": 0.0544, + "step": 12950 + }, + { + "epoch": 0.0598, + "grad_norm": 0.2750712037086487, + "learning_rate": 4.997008383969955e-05, + "loss": 0.0571, + "step": 12960 + }, + { + "epoch": 0.05985, + "grad_norm": 0.24832278490066528, + "learning_rate": 4.996988133417107e-05, + "loss": 0.0573, + "step": 12970 + }, + { + "epoch": 0.0599, + "grad_norm": 0.24103182554244995, + "learning_rate": 4.996967814597542e-05, + "loss": 0.054, + "step": 12980 + }, + { + "epoch": 0.05995, + "grad_norm": 0.2249392718076706, + "learning_rate": 4.996947427511814e-05, + "loss": 0.0548, + "step": 12990 + }, + { + "epoch": 0.06, + "grad_norm": 0.23457661271095276, + "learning_rate": 4.996926972160482e-05, + "loss": 0.0537, + "step": 13000 + }, + { + "epoch": 0.06005, + "grad_norm": 0.26607438921928406, + "learning_rate": 4.996906448544105e-05, + "loss": 0.057, + "step": 13010 + }, + { + "epoch": 0.0601, + "grad_norm": 0.24230334162712097, + "learning_rate": 4.9968858566632435e-05, + "loss": 0.0534, + "step": 13020 + }, + { + "epoch": 0.06015, + "grad_norm": 0.2050572633743286, + "learning_rate": 4.99686519651846e-05, + "loss": 0.0542, + "step": 13030 + }, + { + "epoch": 0.0602, + "grad_norm": 0.2492501586675644, + "learning_rate": 4.996844468110321e-05, + "loss": 0.0544, + "step": 13040 + }, + { + "epoch": 0.06025, + "grad_norm": 0.20426131784915924, + "learning_rate": 4.9968236714393916e-05, + "loss": 0.0544, + "step": 13050 + }, + { + "epoch": 0.0603, + "grad_norm": 0.21557781100273132, + "learning_rate": 4.996802806506241e-05, + "loss": 0.0535, + "step": 13060 + }, + { + "epoch": 0.06035, + "grad_norm": 0.22966895997524261, + "learning_rate": 4.9967818733114404e-05, + "loss": 0.0522, + "step": 13070 + }, + { + "epoch": 0.0604, + "grad_norm": 0.16762161254882812, + "learning_rate": 4.996760871855561e-05, + "loss": 0.0532, + "step": 13080 + }, + { + "epoch": 0.06045, + "grad_norm": 0.24853459000587463, + "learning_rate": 4.996739802139177e-05, + "loss": 0.0547, + "step": 13090 + }, + { + "epoch": 0.0605, + "grad_norm": 0.2116885632276535, + "learning_rate": 4.996718664162865e-05, + "loss": 0.0559, + "step": 13100 + }, + { + "epoch": 0.06055, + "grad_norm": 0.23208336532115936, + "learning_rate": 4.996697457927203e-05, + "loss": 0.0565, + "step": 13110 + }, + { + "epoch": 0.0606, + "grad_norm": 0.2170572280883789, + "learning_rate": 4.99667618343277e-05, + "loss": 0.0531, + "step": 13120 + }, + { + "epoch": 0.06065, + "grad_norm": 0.22632868587970734, + "learning_rate": 4.9966548406801486e-05, + "loss": 0.0547, + "step": 13130 + }, + { + "epoch": 0.0607, + "grad_norm": 0.24190941452980042, + "learning_rate": 4.996633429669921e-05, + "loss": 0.0537, + "step": 13140 + }, + { + "epoch": 0.06075, + "grad_norm": 0.1979205310344696, + "learning_rate": 4.996611950402674e-05, + "loss": 0.055, + "step": 13150 + }, + { + "epoch": 0.0608, + "grad_norm": 0.23045705258846283, + "learning_rate": 4.9965904028789945e-05, + "loss": 0.0538, + "step": 13160 + }, + { + "epoch": 0.06085, + "grad_norm": 0.24325931072235107, + "learning_rate": 4.9965687870994716e-05, + "loss": 0.0562, + "step": 13170 + }, + { + "epoch": 0.0609, + "grad_norm": 0.28811779618263245, + "learning_rate": 4.996547103064695e-05, + "loss": 0.055, + "step": 13180 + }, + { + "epoch": 0.06095, + "grad_norm": 0.2513379454612732, + "learning_rate": 4.9965253507752585e-05, + "loss": 0.0536, + "step": 13190 + }, + { + "epoch": 0.061, + "grad_norm": 0.22728721797466278, + "learning_rate": 4.9965035302317574e-05, + "loss": 0.0549, + "step": 13200 + }, + { + "epoch": 0.06105, + "grad_norm": 0.272308886051178, + "learning_rate": 4.9964816414347874e-05, + "loss": 0.0567, + "step": 13210 + }, + { + "epoch": 0.0611, + "grad_norm": 0.26510074734687805, + "learning_rate": 4.9964596843849474e-05, + "loss": 0.0551, + "step": 13220 + }, + { + "epoch": 0.06115, + "grad_norm": 0.2139650285243988, + "learning_rate": 4.996437659082838e-05, + "loss": 0.0543, + "step": 13230 + }, + { + "epoch": 0.0612, + "grad_norm": 0.2072010636329651, + "learning_rate": 4.9964155655290596e-05, + "loss": 0.0553, + "step": 13240 + }, + { + "epoch": 0.06125, + "grad_norm": 0.23949231207370758, + "learning_rate": 4.996393403724218e-05, + "loss": 0.0552, + "step": 13250 + }, + { + "epoch": 0.0613, + "grad_norm": 0.20085765421390533, + "learning_rate": 4.996371173668919e-05, + "loss": 0.0552, + "step": 13260 + }, + { + "epoch": 0.06135, + "grad_norm": 0.20907004177570343, + "learning_rate": 4.9963488753637696e-05, + "loss": 0.0562, + "step": 13270 + }, + { + "epoch": 0.0614, + "grad_norm": 0.18924008309841156, + "learning_rate": 4.99632650880938e-05, + "loss": 0.0541, + "step": 13280 + }, + { + "epoch": 0.06145, + "grad_norm": 0.2130063772201538, + "learning_rate": 4.996304074006361e-05, + "loss": 0.0531, + "step": 13290 + }, + { + "epoch": 0.0615, + "grad_norm": 0.2611365020275116, + "learning_rate": 4.996281570955327e-05, + "loss": 0.0541, + "step": 13300 + }, + { + "epoch": 0.06155, + "grad_norm": 0.25117290019989014, + "learning_rate": 4.996258999656892e-05, + "loss": 0.0538, + "step": 13310 + }, + { + "epoch": 0.0616, + "grad_norm": 0.23625993728637695, + "learning_rate": 4.9962363601116745e-05, + "loss": 0.0554, + "step": 13320 + }, + { + "epoch": 0.06165, + "grad_norm": 0.23082759976387024, + "learning_rate": 4.996213652320292e-05, + "loss": 0.0581, + "step": 13330 + }, + { + "epoch": 0.0617, + "grad_norm": 0.1999133825302124, + "learning_rate": 4.9961908762833666e-05, + "loss": 0.0558, + "step": 13340 + }, + { + "epoch": 0.06175, + "grad_norm": 0.23757296800613403, + "learning_rate": 4.9961680320015205e-05, + "loss": 0.0553, + "step": 13350 + }, + { + "epoch": 0.0618, + "grad_norm": 0.25344860553741455, + "learning_rate": 4.996145119475377e-05, + "loss": 0.0553, + "step": 13360 + }, + { + "epoch": 0.06185, + "grad_norm": 0.24485984444618225, + "learning_rate": 4.996122138705565e-05, + "loss": 0.0537, + "step": 13370 + }, + { + "epoch": 0.0619, + "grad_norm": 0.21683093905448914, + "learning_rate": 4.9960990896927116e-05, + "loss": 0.0539, + "step": 13380 + }, + { + "epoch": 0.06195, + "grad_norm": 0.2736802101135254, + "learning_rate": 4.9960759724374464e-05, + "loss": 0.0562, + "step": 13390 + }, + { + "epoch": 0.062, + "grad_norm": 0.253433495759964, + "learning_rate": 4.996052786940402e-05, + "loss": 0.0579, + "step": 13400 + }, + { + "epoch": 0.06205, + "grad_norm": 0.28744834661483765, + "learning_rate": 4.996029533202211e-05, + "loss": 0.0576, + "step": 13410 + }, + { + "epoch": 0.0621, + "grad_norm": 0.23220312595367432, + "learning_rate": 4.996006211223511e-05, + "loss": 0.0553, + "step": 13420 + }, + { + "epoch": 0.06215, + "grad_norm": 0.24789561331272125, + "learning_rate": 4.99598282100494e-05, + "loss": 0.0562, + "step": 13430 + }, + { + "epoch": 0.0622, + "grad_norm": 0.2644476890563965, + "learning_rate": 4.9959593625471344e-05, + "loss": 0.0572, + "step": 13440 + }, + { + "epoch": 0.06225, + "grad_norm": 0.2106294333934784, + "learning_rate": 4.995935835850739e-05, + "loss": 0.0584, + "step": 13450 + }, + { + "epoch": 0.0623, + "grad_norm": 0.2522731423377991, + "learning_rate": 4.995912240916395e-05, + "loss": 0.0593, + "step": 13460 + }, + { + "epoch": 0.06235, + "grad_norm": 0.2467738538980484, + "learning_rate": 4.995888577744748e-05, + "loss": 0.0532, + "step": 13470 + }, + { + "epoch": 0.0624, + "grad_norm": 0.2429942637681961, + "learning_rate": 4.995864846336445e-05, + "loss": 0.0566, + "step": 13480 + }, + { + "epoch": 0.06245, + "grad_norm": 0.2741338908672333, + "learning_rate": 4.995841046692135e-05, + "loss": 0.0559, + "step": 13490 + }, + { + "epoch": 0.0625, + "grad_norm": 0.21140851080417633, + "learning_rate": 4.995817178812468e-05, + "loss": 0.0524, + "step": 13500 + }, + { + "epoch": 0.06255, + "grad_norm": 0.23873601853847504, + "learning_rate": 4.9957932426980966e-05, + "loss": 0.0565, + "step": 13510 + }, + { + "epoch": 0.0626, + "grad_norm": 0.2380428910255432, + "learning_rate": 4.9957692383496765e-05, + "loss": 0.0527, + "step": 13520 + }, + { + "epoch": 0.06265, + "grad_norm": 0.19534370303153992, + "learning_rate": 4.995745165767863e-05, + "loss": 0.0577, + "step": 13530 + }, + { + "epoch": 0.0627, + "grad_norm": 0.21950581669807434, + "learning_rate": 4.995721024953314e-05, + "loss": 0.0546, + "step": 13540 + }, + { + "epoch": 0.06275, + "grad_norm": 0.26917189359664917, + "learning_rate": 4.9956968159066894e-05, + "loss": 0.0564, + "step": 13550 + }, + { + "epoch": 0.0628, + "grad_norm": 0.24399952590465546, + "learning_rate": 4.995672538628652e-05, + "loss": 0.0546, + "step": 13560 + }, + { + "epoch": 0.06285, + "grad_norm": 0.27379703521728516, + "learning_rate": 4.9956481931198644e-05, + "loss": 0.0544, + "step": 13570 + }, + { + "epoch": 0.0629, + "grad_norm": 0.2227669209241867, + "learning_rate": 4.995623779380993e-05, + "loss": 0.0535, + "step": 13580 + }, + { + "epoch": 0.06295, + "grad_norm": 0.20523680746555328, + "learning_rate": 4.9955992974127055e-05, + "loss": 0.0518, + "step": 13590 + }, + { + "epoch": 0.063, + "grad_norm": 0.21672451496124268, + "learning_rate": 4.99557474721567e-05, + "loss": 0.0522, + "step": 13600 + }, + { + "epoch": 0.06305, + "grad_norm": 0.2435152381658554, + "learning_rate": 4.995550128790559e-05, + "loss": 0.0531, + "step": 13610 + }, + { + "epoch": 0.0631, + "grad_norm": 0.19593331217765808, + "learning_rate": 4.9955254421380446e-05, + "loss": 0.0524, + "step": 13620 + }, + { + "epoch": 0.06315, + "grad_norm": 0.202594593167305, + "learning_rate": 4.995500687258803e-05, + "loss": 0.0524, + "step": 13630 + }, + { + "epoch": 0.0632, + "grad_norm": 0.17885445058345795, + "learning_rate": 4.9954758641535094e-05, + "loss": 0.0539, + "step": 13640 + }, + { + "epoch": 0.06325, + "grad_norm": 0.2962518632411957, + "learning_rate": 4.9954509728228434e-05, + "loss": 0.0588, + "step": 13650 + }, + { + "epoch": 0.0633, + "grad_norm": 0.20670145750045776, + "learning_rate": 4.9954260132674844e-05, + "loss": 0.055, + "step": 13660 + }, + { + "epoch": 0.06335, + "grad_norm": 0.2579496502876282, + "learning_rate": 4.995400985488117e-05, + "loss": 0.0546, + "step": 13670 + }, + { + "epoch": 0.0634, + "grad_norm": 0.2677033841609955, + "learning_rate": 4.995375889485424e-05, + "loss": 0.054, + "step": 13680 + }, + { + "epoch": 0.06345, + "grad_norm": 0.22976121306419373, + "learning_rate": 4.9953507252600906e-05, + "loss": 0.0551, + "step": 13690 + }, + { + "epoch": 0.0635, + "grad_norm": 0.2728902995586395, + "learning_rate": 4.995325492812807e-05, + "loss": 0.0552, + "step": 13700 + }, + { + "epoch": 0.06355, + "grad_norm": 0.2082107663154602, + "learning_rate": 4.9953001921442613e-05, + "loss": 0.0563, + "step": 13710 + }, + { + "epoch": 0.0636, + "grad_norm": 0.20261944830417633, + "learning_rate": 4.995274823255146e-05, + "loss": 0.0587, + "step": 13720 + }, + { + "epoch": 0.06365, + "grad_norm": 0.23577085137367249, + "learning_rate": 4.9952493861461544e-05, + "loss": 0.0554, + "step": 13730 + }, + { + "epoch": 0.0637, + "grad_norm": 0.2136303335428238, + "learning_rate": 4.995223880817982e-05, + "loss": 0.0535, + "step": 13740 + }, + { + "epoch": 0.06375, + "grad_norm": 0.250302255153656, + "learning_rate": 4.995198307271326e-05, + "loss": 0.0571, + "step": 13750 + }, + { + "epoch": 0.0638, + "grad_norm": 0.1985165923833847, + "learning_rate": 4.995172665506886e-05, + "loss": 0.056, + "step": 13760 + }, + { + "epoch": 0.06385, + "grad_norm": 0.2227594256401062, + "learning_rate": 4.9951469555253624e-05, + "loss": 0.0558, + "step": 13770 + }, + { + "epoch": 0.0639, + "grad_norm": 0.2320021539926529, + "learning_rate": 4.995121177327458e-05, + "loss": 0.0586, + "step": 13780 + }, + { + "epoch": 0.06395, + "grad_norm": 0.23888349533081055, + "learning_rate": 4.9950953309138784e-05, + "loss": 0.0597, + "step": 13790 + }, + { + "epoch": 0.064, + "grad_norm": 0.23809459805488586, + "learning_rate": 4.99506941628533e-05, + "loss": 0.0544, + "step": 13800 + }, + { + "epoch": 0.06405, + "grad_norm": 0.23160304129123688, + "learning_rate": 4.995043433442521e-05, + "loss": 0.0554, + "step": 13810 + }, + { + "epoch": 0.0641, + "grad_norm": 0.20707209408283234, + "learning_rate": 4.995017382386162e-05, + "loss": 0.0557, + "step": 13820 + }, + { + "epoch": 0.06415, + "grad_norm": 0.20726770162582397, + "learning_rate": 4.994991263116965e-05, + "loss": 0.0568, + "step": 13830 + }, + { + "epoch": 0.0642, + "grad_norm": 0.2668224573135376, + "learning_rate": 4.9949650756356434e-05, + "loss": 0.0531, + "step": 13840 + }, + { + "epoch": 0.06425, + "grad_norm": 0.23629474639892578, + "learning_rate": 4.994938819942915e-05, + "loss": 0.0547, + "step": 13850 + }, + { + "epoch": 0.0643, + "grad_norm": 0.2216833382844925, + "learning_rate": 4.994912496039496e-05, + "loss": 0.055, + "step": 13860 + }, + { + "epoch": 0.06435, + "grad_norm": 0.20070704817771912, + "learning_rate": 4.9948861039261074e-05, + "loss": 0.058, + "step": 13870 + }, + { + "epoch": 0.0644, + "grad_norm": 0.23889175057411194, + "learning_rate": 4.994859643603469e-05, + "loss": 0.0557, + "step": 13880 + }, + { + "epoch": 0.06445, + "grad_norm": 0.2590092718601227, + "learning_rate": 4.994833115072306e-05, + "loss": 0.0543, + "step": 13890 + }, + { + "epoch": 0.0645, + "grad_norm": 0.1995381861925125, + "learning_rate": 4.994806518333343e-05, + "loss": 0.0558, + "step": 13900 + }, + { + "epoch": 0.06455, + "grad_norm": 0.24994847178459167, + "learning_rate": 4.994779853387307e-05, + "loss": 0.0612, + "step": 13910 + }, + { + "epoch": 0.0646, + "grad_norm": 0.21314527094364166, + "learning_rate": 4.994753120234926e-05, + "loss": 0.0564, + "step": 13920 + }, + { + "epoch": 0.06465, + "grad_norm": 0.24399550259113312, + "learning_rate": 4.9947263188769337e-05, + "loss": 0.0567, + "step": 13930 + }, + { + "epoch": 0.0647, + "grad_norm": 0.20098140835762024, + "learning_rate": 4.9946994493140595e-05, + "loss": 0.0581, + "step": 13940 + }, + { + "epoch": 0.06475, + "grad_norm": 0.2497253566980362, + "learning_rate": 4.99467251154704e-05, + "loss": 0.0528, + "step": 13950 + }, + { + "epoch": 0.0648, + "grad_norm": 0.20846222341060638, + "learning_rate": 4.994645505576612e-05, + "loss": 0.0555, + "step": 13960 + }, + { + "epoch": 0.06485, + "grad_norm": 0.20436640083789825, + "learning_rate": 4.9946184314035116e-05, + "loss": 0.0542, + "step": 13970 + }, + { + "epoch": 0.0649, + "grad_norm": 0.18963970243930817, + "learning_rate": 4.994591289028482e-05, + "loss": 0.052, + "step": 13980 + }, + { + "epoch": 0.06495, + "grad_norm": 0.23356103897094727, + "learning_rate": 4.994564078452262e-05, + "loss": 0.0536, + "step": 13990 + }, + { + "epoch": 0.065, + "grad_norm": 0.23844048380851746, + "learning_rate": 4.994536799675599e-05, + "loss": 0.0544, + "step": 14000 + }, + { + "epoch": 0.06505, + "grad_norm": 0.23758967220783234, + "learning_rate": 4.9945094526992364e-05, + "loss": 0.0543, + "step": 14010 + }, + { + "epoch": 0.0651, + "grad_norm": 0.2923468351364136, + "learning_rate": 4.994482037523922e-05, + "loss": 0.0549, + "step": 14020 + }, + { + "epoch": 0.06515, + "grad_norm": 0.23966705799102783, + "learning_rate": 4.994454554150406e-05, + "loss": 0.0552, + "step": 14030 + }, + { + "epoch": 0.0652, + "grad_norm": 0.18790079653263092, + "learning_rate": 4.99442700257944e-05, + "loss": 0.0546, + "step": 14040 + }, + { + "epoch": 0.06525, + "grad_norm": 0.24292923510074615, + "learning_rate": 4.9943993828117776e-05, + "loss": 0.0547, + "step": 14050 + }, + { + "epoch": 0.0653, + "grad_norm": 0.2541712820529938, + "learning_rate": 4.9943716948481715e-05, + "loss": 0.053, + "step": 14060 + }, + { + "epoch": 0.06535, + "grad_norm": 0.23627446591854095, + "learning_rate": 4.994343938689381e-05, + "loss": 0.0541, + "step": 14070 + }, + { + "epoch": 0.0654, + "grad_norm": 0.23709315061569214, + "learning_rate": 4.994316114336165e-05, + "loss": 0.0539, + "step": 14080 + }, + { + "epoch": 0.06545, + "grad_norm": 0.22304728627204895, + "learning_rate": 4.9942882217892825e-05, + "loss": 0.0524, + "step": 14090 + }, + { + "epoch": 0.0655, + "grad_norm": 0.23598924279212952, + "learning_rate": 4.994260261049498e-05, + "loss": 0.0525, + "step": 14100 + }, + { + "epoch": 0.06555, + "grad_norm": 0.26575416326522827, + "learning_rate": 4.994232232117574e-05, + "loss": 0.0593, + "step": 14110 + }, + { + "epoch": 0.0656, + "grad_norm": 0.23113156855106354, + "learning_rate": 4.9942041349942795e-05, + "loss": 0.053, + "step": 14120 + }, + { + "epoch": 0.06565, + "grad_norm": 0.2666196823120117, + "learning_rate": 4.994175969680379e-05, + "loss": 0.0527, + "step": 14130 + }, + { + "epoch": 0.0657, + "grad_norm": 0.2423446625471115, + "learning_rate": 4.994147736176645e-05, + "loss": 0.0525, + "step": 14140 + }, + { + "epoch": 0.06575, + "grad_norm": 0.27136650681495667, + "learning_rate": 4.9941194344838496e-05, + "loss": 0.0554, + "step": 14150 + }, + { + "epoch": 0.0658, + "grad_norm": 0.31948310136795044, + "learning_rate": 4.994091064602766e-05, + "loss": 0.0557, + "step": 14160 + }, + { + "epoch": 0.06585, + "grad_norm": 0.2882412075996399, + "learning_rate": 4.994062626534169e-05, + "loss": 0.0578, + "step": 14170 + }, + { + "epoch": 0.0659, + "grad_norm": 0.2573373019695282, + "learning_rate": 4.994034120278837e-05, + "loss": 0.057, + "step": 14180 + }, + { + "epoch": 0.06595, + "grad_norm": 0.2526601552963257, + "learning_rate": 4.994005545837549e-05, + "loss": 0.0554, + "step": 14190 + }, + { + "epoch": 0.066, + "grad_norm": 0.24789269268512726, + "learning_rate": 4.9939769032110864e-05, + "loss": 0.0543, + "step": 14200 + }, + { + "epoch": 0.06605, + "grad_norm": 0.24072995781898499, + "learning_rate": 4.993948192400232e-05, + "loss": 0.0529, + "step": 14210 + }, + { + "epoch": 0.0661, + "grad_norm": 0.23017463088035583, + "learning_rate": 4.993919413405772e-05, + "loss": 0.0563, + "step": 14220 + }, + { + "epoch": 0.06615, + "grad_norm": 0.25913599133491516, + "learning_rate": 4.993890566228491e-05, + "loss": 0.0568, + "step": 14230 + }, + { + "epoch": 0.0662, + "grad_norm": 0.25078967213630676, + "learning_rate": 4.993861650869179e-05, + "loss": 0.0588, + "step": 14240 + }, + { + "epoch": 0.06625, + "grad_norm": 0.23285934329032898, + "learning_rate": 4.993832667328626e-05, + "loss": 0.0546, + "step": 14250 + }, + { + "epoch": 0.0663, + "grad_norm": 0.2456798553466797, + "learning_rate": 4.9938036156076256e-05, + "loss": 0.0547, + "step": 14260 + }, + { + "epoch": 0.06635, + "grad_norm": 0.23873870074748993, + "learning_rate": 4.993774495706971e-05, + "loss": 0.0576, + "step": 14270 + }, + { + "epoch": 0.0664, + "grad_norm": 0.2455510050058365, + "learning_rate": 4.9937453076274584e-05, + "loss": 0.0554, + "step": 14280 + }, + { + "epoch": 0.06645, + "grad_norm": 0.23358359932899475, + "learning_rate": 4.993716051369886e-05, + "loss": 0.0566, + "step": 14290 + }, + { + "epoch": 0.0665, + "grad_norm": 0.22073981165885925, + "learning_rate": 4.993686726935054e-05, + "loss": 0.0559, + "step": 14300 + }, + { + "epoch": 0.06655, + "grad_norm": 0.22967039048671722, + "learning_rate": 4.993657334323763e-05, + "loss": 0.0578, + "step": 14310 + }, + { + "epoch": 0.0666, + "grad_norm": 0.2374074012041092, + "learning_rate": 4.993627873536818e-05, + "loss": 0.0556, + "step": 14320 + }, + { + "epoch": 0.06665, + "grad_norm": 0.25563186407089233, + "learning_rate": 4.993598344575023e-05, + "loss": 0.0556, + "step": 14330 + }, + { + "epoch": 0.0667, + "grad_norm": 0.22516918182373047, + "learning_rate": 4.993568747439187e-05, + "loss": 0.0524, + "step": 14340 + }, + { + "epoch": 0.06675, + "grad_norm": 0.2268334925174713, + "learning_rate": 4.993539082130117e-05, + "loss": 0.0524, + "step": 14350 + }, + { + "epoch": 0.0668, + "grad_norm": 0.20074786245822906, + "learning_rate": 4.993509348648626e-05, + "loss": 0.0535, + "step": 14360 + }, + { + "epoch": 0.06685, + "grad_norm": 0.23249930143356323, + "learning_rate": 4.9934795469955266e-05, + "loss": 0.0519, + "step": 14370 + }, + { + "epoch": 0.0669, + "grad_norm": 0.21201986074447632, + "learning_rate": 4.9934496771716326e-05, + "loss": 0.0581, + "step": 14380 + }, + { + "epoch": 0.06695, + "grad_norm": 0.22310949862003326, + "learning_rate": 4.993419739177761e-05, + "loss": 0.0555, + "step": 14390 + }, + { + "epoch": 0.067, + "grad_norm": 0.2250770777463913, + "learning_rate": 4.9933897330147305e-05, + "loss": 0.0571, + "step": 14400 + }, + { + "epoch": 0.06705, + "grad_norm": 0.23023316264152527, + "learning_rate": 4.993359658683362e-05, + "loss": 0.0589, + "step": 14410 + }, + { + "epoch": 0.0671, + "grad_norm": 0.19881965219974518, + "learning_rate": 4.9933295161844765e-05, + "loss": 0.0531, + "step": 14420 + }, + { + "epoch": 0.06715, + "grad_norm": 0.16332776844501495, + "learning_rate": 4.993299305518899e-05, + "loss": 0.0506, + "step": 14430 + }, + { + "epoch": 0.0672, + "grad_norm": 0.23525168001651764, + "learning_rate": 4.993269026687456e-05, + "loss": 0.0556, + "step": 14440 + }, + { + "epoch": 0.06725, + "grad_norm": 0.19342724978923798, + "learning_rate": 4.993238679690974e-05, + "loss": 0.0549, + "step": 14450 + }, + { + "epoch": 0.0673, + "grad_norm": 0.24618756771087646, + "learning_rate": 4.993208264530282e-05, + "loss": 0.0537, + "step": 14460 + }, + { + "epoch": 0.06735, + "grad_norm": 0.21478991210460663, + "learning_rate": 4.9931777812062134e-05, + "loss": 0.0541, + "step": 14470 + }, + { + "epoch": 0.0674, + "grad_norm": 0.2020798772573471, + "learning_rate": 4.9931472297196015e-05, + "loss": 0.0532, + "step": 14480 + }, + { + "epoch": 0.06745, + "grad_norm": 0.20711049437522888, + "learning_rate": 4.99311661007128e-05, + "loss": 0.0528, + "step": 14490 + }, + { + "epoch": 0.0675, + "grad_norm": 0.2520681619644165, + "learning_rate": 4.993085922262088e-05, + "loss": 0.0552, + "step": 14500 + }, + { + "epoch": 0.06755, + "grad_norm": 0.24167174100875854, + "learning_rate": 4.993055166292863e-05, + "loss": 0.0529, + "step": 14510 + }, + { + "epoch": 0.0676, + "grad_norm": 0.2574155628681183, + "learning_rate": 4.9930243421644466e-05, + "loss": 0.0518, + "step": 14520 + }, + { + "epoch": 0.06765, + "grad_norm": 0.27726471424102783, + "learning_rate": 4.992993449877681e-05, + "loss": 0.0527, + "step": 14530 + }, + { + "epoch": 0.0677, + "grad_norm": 0.26100972294807434, + "learning_rate": 4.992962489433411e-05, + "loss": 0.0518, + "step": 14540 + }, + { + "epoch": 0.06775, + "grad_norm": 0.23531347513198853, + "learning_rate": 4.992931460832483e-05, + "loss": 0.0527, + "step": 14550 + }, + { + "epoch": 0.0678, + "grad_norm": 0.23893924057483673, + "learning_rate": 4.992900364075746e-05, + "loss": 0.0563, + "step": 14560 + }, + { + "epoch": 0.06785, + "grad_norm": 0.22128519415855408, + "learning_rate": 4.992869199164048e-05, + "loss": 0.0524, + "step": 14570 + }, + { + "epoch": 0.0679, + "grad_norm": 0.2469291090965271, + "learning_rate": 4.992837966098245e-05, + "loss": 0.0524, + "step": 14580 + }, + { + "epoch": 0.06795, + "grad_norm": 0.24957656860351562, + "learning_rate": 4.992806664879187e-05, + "loss": 0.0527, + "step": 14590 + }, + { + "epoch": 0.068, + "grad_norm": 0.21424804627895355, + "learning_rate": 4.9927752955077314e-05, + "loss": 0.0545, + "step": 14600 + }, + { + "epoch": 0.06805, + "grad_norm": 0.18640349805355072, + "learning_rate": 4.9927438579847364e-05, + "loss": 0.0531, + "step": 14610 + }, + { + "epoch": 0.0681, + "grad_norm": 0.2083619236946106, + "learning_rate": 4.9927123523110595e-05, + "loss": 0.0531, + "step": 14620 + }, + { + "epoch": 0.06815, + "grad_norm": 0.17825178802013397, + "learning_rate": 4.9926807784875654e-05, + "loss": 0.0538, + "step": 14630 + }, + { + "epoch": 0.0682, + "grad_norm": 0.21923069655895233, + "learning_rate": 4.992649136515113e-05, + "loss": 0.0585, + "step": 14640 + }, + { + "epoch": 0.06825, + "grad_norm": 0.21128062903881073, + "learning_rate": 4.992617426394571e-05, + "loss": 0.055, + "step": 14650 + }, + { + "epoch": 0.0683, + "grad_norm": 0.23529167473316193, + "learning_rate": 4.992585648126805e-05, + "loss": 0.052, + "step": 14660 + }, + { + "epoch": 0.06835, + "grad_norm": 0.23567888140678406, + "learning_rate": 4.9925538017126836e-05, + "loss": 0.0549, + "step": 14670 + }, + { + "epoch": 0.0684, + "grad_norm": 0.19339175522327423, + "learning_rate": 4.992521887153078e-05, + "loss": 0.0513, + "step": 14680 + }, + { + "epoch": 0.06845, + "grad_norm": 0.20444408059120178, + "learning_rate": 4.9924899044488594e-05, + "loss": 0.0561, + "step": 14690 + }, + { + "epoch": 0.0685, + "grad_norm": 0.18966902792453766, + "learning_rate": 4.9924578536009035e-05, + "loss": 0.0543, + "step": 14700 + }, + { + "epoch": 0.06855, + "grad_norm": 0.2409667819738388, + "learning_rate": 4.992425734610087e-05, + "loss": 0.0553, + "step": 14710 + }, + { + "epoch": 0.0686, + "grad_norm": 0.19646379351615906, + "learning_rate": 4.9923935474772864e-05, + "loss": 0.0529, + "step": 14720 + }, + { + "epoch": 0.06865, + "grad_norm": 0.21415621042251587, + "learning_rate": 4.9923612922033836e-05, + "loss": 0.0535, + "step": 14730 + }, + { + "epoch": 0.0687, + "grad_norm": 0.18555407226085663, + "learning_rate": 4.992328968789258e-05, + "loss": 0.0533, + "step": 14740 + }, + { + "epoch": 0.06875, + "grad_norm": 0.2298015058040619, + "learning_rate": 4.992296577235796e-05, + "loss": 0.055, + "step": 14750 + }, + { + "epoch": 0.0688, + "grad_norm": 0.1745564043521881, + "learning_rate": 4.9922641175438813e-05, + "loss": 0.0536, + "step": 14760 + }, + { + "epoch": 0.06885, + "grad_norm": 0.21434536576271057, + "learning_rate": 4.992231589714402e-05, + "loss": 0.0532, + "step": 14770 + }, + { + "epoch": 0.0689, + "grad_norm": 0.21135056018829346, + "learning_rate": 4.992198993748247e-05, + "loss": 0.0549, + "step": 14780 + }, + { + "epoch": 0.06895, + "grad_norm": 0.1799994707107544, + "learning_rate": 4.992166329646308e-05, + "loss": 0.0503, + "step": 14790 + }, + { + "epoch": 0.069, + "grad_norm": 0.18514901399612427, + "learning_rate": 4.992133597409478e-05, + "loss": 0.05, + "step": 14800 + }, + { + "epoch": 0.06905, + "grad_norm": 0.26623496413230896, + "learning_rate": 4.992100797038652e-05, + "loss": 0.0557, + "step": 14810 + }, + { + "epoch": 0.0691, + "grad_norm": 0.23814378678798676, + "learning_rate": 4.992067928534726e-05, + "loss": 0.0497, + "step": 14820 + }, + { + "epoch": 0.06915, + "grad_norm": 0.18923845887184143, + "learning_rate": 4.9920349918985995e-05, + "loss": 0.0512, + "step": 14830 + }, + { + "epoch": 0.0692, + "grad_norm": 0.1813831627368927, + "learning_rate": 4.992001987131172e-05, + "loss": 0.0511, + "step": 14840 + }, + { + "epoch": 0.06925, + "grad_norm": 0.23114556074142456, + "learning_rate": 4.991968914233347e-05, + "loss": 0.0534, + "step": 14850 + }, + { + "epoch": 0.0693, + "grad_norm": 0.28188079595565796, + "learning_rate": 4.991935773206027e-05, + "loss": 0.0534, + "step": 14860 + }, + { + "epoch": 0.06935, + "grad_norm": 0.20539860427379608, + "learning_rate": 4.99190256405012e-05, + "loss": 0.0554, + "step": 14870 + }, + { + "epoch": 0.0694, + "grad_norm": 0.1767107993364334, + "learning_rate": 4.9918692867665327e-05, + "loss": 0.0511, + "step": 14880 + }, + { + "epoch": 0.06945, + "grad_norm": 0.16256114840507507, + "learning_rate": 4.991835941356176e-05, + "loss": 0.0519, + "step": 14890 + }, + { + "epoch": 0.0695, + "grad_norm": 0.23473677039146423, + "learning_rate": 4.9918025278199597e-05, + "loss": 0.0538, + "step": 14900 + }, + { + "epoch": 0.06955, + "grad_norm": 0.23456275463104248, + "learning_rate": 4.991769046158799e-05, + "loss": 0.0531, + "step": 14910 + }, + { + "epoch": 0.0696, + "grad_norm": 0.22146765887737274, + "learning_rate": 4.991735496373609e-05, + "loss": 0.053, + "step": 14920 + }, + { + "epoch": 0.06965, + "grad_norm": 0.2178531438112259, + "learning_rate": 4.9917018784653056e-05, + "loss": 0.0513, + "step": 14930 + }, + { + "epoch": 0.0697, + "grad_norm": 0.20697830617427826, + "learning_rate": 4.99166819243481e-05, + "loss": 0.0512, + "step": 14940 + }, + { + "epoch": 0.06975, + "grad_norm": 0.17402079701423645, + "learning_rate": 4.9916344382830414e-05, + "loss": 0.0516, + "step": 14950 + }, + { + "epoch": 0.0698, + "grad_norm": 0.20043763518333435, + "learning_rate": 4.9916006160109235e-05, + "loss": 0.0537, + "step": 14960 + }, + { + "epoch": 0.06985, + "grad_norm": 0.1847904920578003, + "learning_rate": 4.991566725619381e-05, + "loss": 0.0518, + "step": 14970 + }, + { + "epoch": 0.0699, + "grad_norm": 0.2037106603384018, + "learning_rate": 4.99153276710934e-05, + "loss": 0.0561, + "step": 14980 + }, + { + "epoch": 0.06995, + "grad_norm": 0.21404510736465454, + "learning_rate": 4.991498740481729e-05, + "loss": 0.0521, + "step": 14990 + }, + { + "epoch": 0.07, + "grad_norm": 0.2313455194234848, + "learning_rate": 4.991464645737479e-05, + "loss": 0.0522, + "step": 15000 + }, + { + "epoch": 0.07005, + "grad_norm": 0.22190025448799133, + "learning_rate": 4.9914304828775215e-05, + "loss": 0.0533, + "step": 15010 + }, + { + "epoch": 0.0701, + "grad_norm": 0.18861819803714752, + "learning_rate": 4.99139625190279e-05, + "loss": 0.0541, + "step": 15020 + }, + { + "epoch": 0.07015, + "grad_norm": 0.20761419832706451, + "learning_rate": 4.991361952814222e-05, + "loss": 0.0538, + "step": 15030 + }, + { + "epoch": 0.0702, + "grad_norm": 0.2672784626483917, + "learning_rate": 4.9913275856127534e-05, + "loss": 0.0583, + "step": 15040 + }, + { + "epoch": 0.07025, + "grad_norm": 0.22271452844142914, + "learning_rate": 4.991293150299324e-05, + "loss": 0.0536, + "step": 15050 + }, + { + "epoch": 0.0703, + "grad_norm": 0.2635791003704071, + "learning_rate": 4.9912586468748774e-05, + "loss": 0.0535, + "step": 15060 + }, + { + "epoch": 0.07035, + "grad_norm": 0.21073433756828308, + "learning_rate": 4.991224075340355e-05, + "loss": 0.0536, + "step": 15070 + }, + { + "epoch": 0.0704, + "grad_norm": 0.22948896884918213, + "learning_rate": 4.991189435696701e-05, + "loss": 0.053, + "step": 15080 + }, + { + "epoch": 0.07045, + "grad_norm": 0.19976653158664703, + "learning_rate": 4.9911547279448644e-05, + "loss": 0.0546, + "step": 15090 + }, + { + "epoch": 0.0705, + "grad_norm": 0.20821602642536163, + "learning_rate": 4.9911199520857935e-05, + "loss": 0.0512, + "step": 15100 + }, + { + "epoch": 0.07055, + "grad_norm": 0.20193876326084137, + "learning_rate": 4.991085108120439e-05, + "loss": 0.0518, + "step": 15110 + }, + { + "epoch": 0.0706, + "grad_norm": 0.18943262100219727, + "learning_rate": 4.9910501960497536e-05, + "loss": 0.0516, + "step": 15120 + }, + { + "epoch": 0.07065, + "grad_norm": 0.2120610475540161, + "learning_rate": 4.9910152158746914e-05, + "loss": 0.0522, + "step": 15130 + }, + { + "epoch": 0.0707, + "grad_norm": 0.1962938755750656, + "learning_rate": 4.990980167596209e-05, + "loss": 0.0513, + "step": 15140 + }, + { + "epoch": 0.07075, + "grad_norm": 0.1755223572254181, + "learning_rate": 4.990945051215265e-05, + "loss": 0.0517, + "step": 15150 + }, + { + "epoch": 0.0708, + "grad_norm": 0.17874115705490112, + "learning_rate": 4.990909866732819e-05, + "loss": 0.0522, + "step": 15160 + }, + { + "epoch": 0.07085, + "grad_norm": 0.24599440395832062, + "learning_rate": 4.990874614149833e-05, + "loss": 0.0541, + "step": 15170 + }, + { + "epoch": 0.0709, + "grad_norm": 0.22264912724494934, + "learning_rate": 4.9908392934672705e-05, + "loss": 0.0526, + "step": 15180 + }, + { + "epoch": 0.07095, + "grad_norm": 0.1884782463312149, + "learning_rate": 4.990803904686098e-05, + "loss": 0.053, + "step": 15190 + }, + { + "epoch": 0.071, + "grad_norm": 0.1697518229484558, + "learning_rate": 4.990768447807282e-05, + "loss": 0.0518, + "step": 15200 + }, + { + "epoch": 0.07105, + "grad_norm": 0.22185924649238586, + "learning_rate": 4.990732922831792e-05, + "loss": 0.0543, + "step": 15210 + }, + { + "epoch": 0.0711, + "grad_norm": 0.2215665727853775, + "learning_rate": 4.990697329760601e-05, + "loss": 0.0519, + "step": 15220 + }, + { + "epoch": 0.07115, + "grad_norm": 0.21765001118183136, + "learning_rate": 4.99066166859468e-05, + "loss": 0.0511, + "step": 15230 + }, + { + "epoch": 0.0712, + "grad_norm": 0.2361811399459839, + "learning_rate": 4.990625939335004e-05, + "loss": 0.0544, + "step": 15240 + }, + { + "epoch": 0.07125, + "grad_norm": 0.25048092007637024, + "learning_rate": 4.990590141982552e-05, + "loss": 0.0544, + "step": 15250 + }, + { + "epoch": 0.0713, + "grad_norm": 0.2167491912841797, + "learning_rate": 4.9905542765382996e-05, + "loss": 0.0538, + "step": 15260 + }, + { + "epoch": 0.07135, + "grad_norm": 0.2278248518705368, + "learning_rate": 4.9905183430032296e-05, + "loss": 0.0551, + "step": 15270 + }, + { + "epoch": 0.0714, + "grad_norm": 0.2136402279138565, + "learning_rate": 4.990482341378324e-05, + "loss": 0.0535, + "step": 15280 + }, + { + "epoch": 0.07145, + "grad_norm": 0.18827708065509796, + "learning_rate": 4.9904462716645675e-05, + "loss": 0.051, + "step": 15290 + }, + { + "epoch": 0.0715, + "grad_norm": 0.20222033560276031, + "learning_rate": 4.990410133862944e-05, + "loss": 0.0537, + "step": 15300 + }, + { + "epoch": 0.07155, + "grad_norm": 0.2278817594051361, + "learning_rate": 4.9903739279744436e-05, + "loss": 0.0559, + "step": 15310 + }, + { + "epoch": 0.0716, + "grad_norm": 0.26084935665130615, + "learning_rate": 4.9903376540000555e-05, + "loss": 0.0533, + "step": 15320 + }, + { + "epoch": 0.07165, + "grad_norm": 0.24685749411582947, + "learning_rate": 4.990301311940772e-05, + "loss": 0.0535, + "step": 15330 + }, + { + "epoch": 0.0717, + "grad_norm": 0.1969948559999466, + "learning_rate": 4.990264901797586e-05, + "loss": 0.052, + "step": 15340 + }, + { + "epoch": 0.07175, + "grad_norm": 0.19155311584472656, + "learning_rate": 4.990228423571493e-05, + "loss": 0.0532, + "step": 15350 + }, + { + "epoch": 0.0718, + "grad_norm": 0.19653113186359406, + "learning_rate": 4.9901918772634906e-05, + "loss": 0.051, + "step": 15360 + }, + { + "epoch": 0.07185, + "grad_norm": 0.17781168222427368, + "learning_rate": 4.990155262874577e-05, + "loss": 0.0522, + "step": 15370 + }, + { + "epoch": 0.0719, + "grad_norm": 0.1654060333967209, + "learning_rate": 4.990118580405755e-05, + "loss": 0.0514, + "step": 15380 + }, + { + "epoch": 0.07195, + "grad_norm": 0.19361428916454315, + "learning_rate": 4.9900818298580263e-05, + "loss": 0.0523, + "step": 15390 + }, + { + "epoch": 0.072, + "grad_norm": 0.2262229472398758, + "learning_rate": 4.990045011232396e-05, + "loss": 0.0506, + "step": 15400 + }, + { + "epoch": 0.07205, + "grad_norm": 0.20477080345153809, + "learning_rate": 4.9900081245298703e-05, + "loss": 0.0491, + "step": 15410 + }, + { + "epoch": 0.0721, + "grad_norm": 0.20688781142234802, + "learning_rate": 4.9899711697514586e-05, + "loss": 0.0504, + "step": 15420 + }, + { + "epoch": 0.07215, + "grad_norm": 0.24755077064037323, + "learning_rate": 4.9899341468981696e-05, + "loss": 0.0512, + "step": 15430 + }, + { + "epoch": 0.0722, + "grad_norm": 0.2354496866464615, + "learning_rate": 4.9898970559710165e-05, + "loss": 0.051, + "step": 15440 + }, + { + "epoch": 0.07225, + "grad_norm": 0.1773119419813156, + "learning_rate": 4.9898598969710137e-05, + "loss": 0.0505, + "step": 15450 + }, + { + "epoch": 0.0723, + "grad_norm": 0.20547893643379211, + "learning_rate": 4.989822669899177e-05, + "loss": 0.0513, + "step": 15460 + }, + { + "epoch": 0.07235, + "grad_norm": 0.22126734256744385, + "learning_rate": 4.9897853747565225e-05, + "loss": 0.0522, + "step": 15470 + }, + { + "epoch": 0.0724, + "grad_norm": 0.2021513730287552, + "learning_rate": 4.9897480115440724e-05, + "loss": 0.0556, + "step": 15480 + }, + { + "epoch": 0.07245, + "grad_norm": 0.215627521276474, + "learning_rate": 4.989710580262847e-05, + "loss": 0.0513, + "step": 15490 + }, + { + "epoch": 0.0725, + "grad_norm": 0.2203914225101471, + "learning_rate": 4.9896730809138694e-05, + "loss": 0.0567, + "step": 15500 + }, + { + "epoch": 0.07255, + "grad_norm": 0.19085222482681274, + "learning_rate": 4.9896355134981655e-05, + "loss": 0.0541, + "step": 15510 + }, + { + "epoch": 0.0726, + "grad_norm": 0.18488481640815735, + "learning_rate": 4.9895978780167615e-05, + "loss": 0.0518, + "step": 15520 + }, + { + "epoch": 0.07265, + "grad_norm": 0.18811728060245514, + "learning_rate": 4.989560174470687e-05, + "loss": 0.0522, + "step": 15530 + }, + { + "epoch": 0.0727, + "grad_norm": 0.233742818236351, + "learning_rate": 4.989522402860972e-05, + "loss": 0.0525, + "step": 15540 + }, + { + "epoch": 0.07275, + "grad_norm": 0.20568574965000153, + "learning_rate": 4.989484563188651e-05, + "loss": 0.0564, + "step": 15550 + }, + { + "epoch": 0.0728, + "grad_norm": 0.185214564204216, + "learning_rate": 4.9894466554547566e-05, + "loss": 0.0528, + "step": 15560 + }, + { + "epoch": 0.07285, + "grad_norm": 0.17654481530189514, + "learning_rate": 4.989408679660326e-05, + "loss": 0.051, + "step": 15570 + }, + { + "epoch": 0.0729, + "grad_norm": 0.1739576905965805, + "learning_rate": 4.989370635806398e-05, + "loss": 0.0533, + "step": 15580 + }, + { + "epoch": 0.07295, + "grad_norm": 0.197053924202919, + "learning_rate": 4.98933252389401e-05, + "loss": 0.0499, + "step": 15590 + }, + { + "epoch": 0.073, + "grad_norm": 0.2722446918487549, + "learning_rate": 4.9892943439242076e-05, + "loss": 0.0537, + "step": 15600 + }, + { + "epoch": 0.07305, + "grad_norm": 0.20078343152999878, + "learning_rate": 4.9892560958980326e-05, + "loss": 0.0516, + "step": 15610 + }, + { + "epoch": 0.0731, + "grad_norm": 0.21441137790679932, + "learning_rate": 4.989217779816532e-05, + "loss": 0.0568, + "step": 15620 + }, + { + "epoch": 0.07315, + "grad_norm": 0.1925336718559265, + "learning_rate": 4.9891793956807506e-05, + "loss": 0.0512, + "step": 15630 + }, + { + "epoch": 0.0732, + "grad_norm": 0.16830724477767944, + "learning_rate": 4.9891409434917414e-05, + "loss": 0.0505, + "step": 15640 + }, + { + "epoch": 0.07325, + "grad_norm": 0.2086031436920166, + "learning_rate": 4.9891024232505536e-05, + "loss": 0.0504, + "step": 15650 + }, + { + "epoch": 0.0733, + "grad_norm": 0.22745658457279205, + "learning_rate": 4.98906383495824e-05, + "loss": 0.0531, + "step": 15660 + }, + { + "epoch": 0.07335, + "grad_norm": 0.17610715329647064, + "learning_rate": 4.9890251786158565e-05, + "loss": 0.0491, + "step": 15670 + }, + { + "epoch": 0.0734, + "grad_norm": 0.1811055839061737, + "learning_rate": 4.9889864542244594e-05, + "loss": 0.0486, + "step": 15680 + }, + { + "epoch": 0.07345, + "grad_norm": 0.18700256943702698, + "learning_rate": 4.9889476617851085e-05, + "loss": 0.0543, + "step": 15690 + }, + { + "epoch": 0.0735, + "grad_norm": 0.20162612199783325, + "learning_rate": 4.988908801298863e-05, + "loss": 0.0522, + "step": 15700 + }, + { + "epoch": 0.07355, + "grad_norm": 0.16793783009052277, + "learning_rate": 4.988869872766786e-05, + "loss": 0.0527, + "step": 15710 + }, + { + "epoch": 0.0736, + "grad_norm": 0.2012702226638794, + "learning_rate": 4.988830876189942e-05, + "loss": 0.0496, + "step": 15720 + }, + { + "epoch": 0.07365, + "grad_norm": 0.18021146953105927, + "learning_rate": 4.988791811569396e-05, + "loss": 0.0487, + "step": 15730 + }, + { + "epoch": 0.0737, + "grad_norm": 0.2230038046836853, + "learning_rate": 4.988752678906218e-05, + "loss": 0.0497, + "step": 15740 + }, + { + "epoch": 0.07375, + "grad_norm": 0.20034830272197723, + "learning_rate": 4.9887134782014764e-05, + "loss": 0.0515, + "step": 15750 + }, + { + "epoch": 0.0738, + "grad_norm": 0.18192660808563232, + "learning_rate": 4.988674209456243e-05, + "loss": 0.051, + "step": 15760 + }, + { + "epoch": 0.07385, + "grad_norm": 0.21604807674884796, + "learning_rate": 4.988634872671592e-05, + "loss": 0.0529, + "step": 15770 + }, + { + "epoch": 0.0739, + "grad_norm": 0.17603836953639984, + "learning_rate": 4.988595467848598e-05, + "loss": 0.051, + "step": 15780 + }, + { + "epoch": 0.07395, + "grad_norm": 0.1528969407081604, + "learning_rate": 4.988555994988339e-05, + "loss": 0.053, + "step": 15790 + }, + { + "epoch": 0.074, + "grad_norm": 0.18262742459774017, + "learning_rate": 4.988516454091894e-05, + "loss": 0.0488, + "step": 15800 + }, + { + "epoch": 0.07405, + "grad_norm": 0.23504868149757385, + "learning_rate": 4.988476845160345e-05, + "loss": 0.0517, + "step": 15810 + }, + { + "epoch": 0.0741, + "grad_norm": 0.1953057050704956, + "learning_rate": 4.988437168194773e-05, + "loss": 0.0505, + "step": 15820 + }, + { + "epoch": 0.07415, + "grad_norm": 0.19112664461135864, + "learning_rate": 4.988397423196264e-05, + "loss": 0.0496, + "step": 15830 + }, + { + "epoch": 0.0742, + "grad_norm": 0.17466707527637482, + "learning_rate": 4.9883576101659037e-05, + "loss": 0.0486, + "step": 15840 + }, + { + "epoch": 0.07425, + "grad_norm": 0.2254481315612793, + "learning_rate": 4.988317729104781e-05, + "loss": 0.052, + "step": 15850 + }, + { + "epoch": 0.0743, + "grad_norm": 0.20193511247634888, + "learning_rate": 4.9882777800139875e-05, + "loss": 0.0524, + "step": 15860 + }, + { + "epoch": 0.07435, + "grad_norm": 0.22816495597362518, + "learning_rate": 4.988237762894613e-05, + "loss": 0.0542, + "step": 15870 + }, + { + "epoch": 0.0744, + "grad_norm": 0.2185070514678955, + "learning_rate": 4.9881976777477545e-05, + "loss": 0.0574, + "step": 15880 + }, + { + "epoch": 0.07445, + "grad_norm": 0.18639391660690308, + "learning_rate": 4.9881575245745046e-05, + "loss": 0.0568, + "step": 15890 + }, + { + "epoch": 0.0745, + "grad_norm": 0.22813963890075684, + "learning_rate": 4.988117303375964e-05, + "loss": 0.0512, + "step": 15900 + }, + { + "epoch": 0.07455, + "grad_norm": 0.1964694708585739, + "learning_rate": 4.9880770141532304e-05, + "loss": 0.0524, + "step": 15910 + }, + { + "epoch": 0.0746, + "grad_norm": 0.17903338372707367, + "learning_rate": 4.988036656907407e-05, + "loss": 0.0491, + "step": 15920 + }, + { + "epoch": 0.07465, + "grad_norm": 0.21408379077911377, + "learning_rate": 4.987996231639594e-05, + "loss": 0.0503, + "step": 15930 + }, + { + "epoch": 0.0747, + "grad_norm": 0.222556933760643, + "learning_rate": 4.9879557383509005e-05, + "loss": 0.0515, + "step": 15940 + }, + { + "epoch": 0.07475, + "grad_norm": 0.18890510499477386, + "learning_rate": 4.9879151770424314e-05, + "loss": 0.0494, + "step": 15950 + }, + { + "epoch": 0.0748, + "grad_norm": 0.1653035432100296, + "learning_rate": 4.9878745477152955e-05, + "loss": 0.0527, + "step": 15960 + }, + { + "epoch": 0.07485, + "grad_norm": 0.21293434500694275, + "learning_rate": 4.987833850370605e-05, + "loss": 0.0536, + "step": 15970 + }, + { + "epoch": 0.0749, + "grad_norm": 0.17240995168685913, + "learning_rate": 4.9877930850094715e-05, + "loss": 0.0518, + "step": 15980 + }, + { + "epoch": 0.07495, + "grad_norm": 0.1972210854291916, + "learning_rate": 4.987752251633009e-05, + "loss": 0.0486, + "step": 15990 + }, + { + "epoch": 0.075, + "grad_norm": 0.16621819138526917, + "learning_rate": 4.9877113502423345e-05, + "loss": 0.0507, + "step": 16000 + }, + { + "epoch": 0.07505, + "grad_norm": 0.16641293466091156, + "learning_rate": 4.987670380838567e-05, + "loss": 0.0498, + "step": 16010 + }, + { + "epoch": 0.0751, + "grad_norm": 0.21070459485054016, + "learning_rate": 4.987629343422825e-05, + "loss": 0.0531, + "step": 16020 + }, + { + "epoch": 0.07515, + "grad_norm": 0.15260066092014313, + "learning_rate": 4.987588237996232e-05, + "loss": 0.0531, + "step": 16030 + }, + { + "epoch": 0.0752, + "grad_norm": 0.16156025230884552, + "learning_rate": 4.987547064559911e-05, + "loss": 0.0487, + "step": 16040 + }, + { + "epoch": 0.07525, + "grad_norm": 0.19028621912002563, + "learning_rate": 4.987505823114988e-05, + "loss": 0.0475, + "step": 16050 + }, + { + "epoch": 0.0753, + "grad_norm": 0.22921723127365112, + "learning_rate": 4.9874645136625894e-05, + "loss": 0.0516, + "step": 16060 + }, + { + "epoch": 0.07535, + "grad_norm": 0.24922367930412292, + "learning_rate": 4.987423136203847e-05, + "loss": 0.0523, + "step": 16070 + }, + { + "epoch": 0.0754, + "grad_norm": 0.24001844227313995, + "learning_rate": 4.98738169073989e-05, + "loss": 0.0519, + "step": 16080 + }, + { + "epoch": 0.07545, + "grad_norm": 0.18395234644412994, + "learning_rate": 4.987340177271851e-05, + "loss": 0.0501, + "step": 16090 + }, + { + "epoch": 0.0755, + "grad_norm": 0.2519650459289551, + "learning_rate": 4.9872985958008664e-05, + "loss": 0.0514, + "step": 16100 + }, + { + "epoch": 0.07555, + "grad_norm": 0.22571155428886414, + "learning_rate": 4.9872569463280736e-05, + "loss": 0.0567, + "step": 16110 + }, + { + "epoch": 0.0756, + "grad_norm": 0.22015048563480377, + "learning_rate": 4.987215228854609e-05, + "loss": 0.0506, + "step": 16120 + }, + { + "epoch": 0.07565, + "grad_norm": 0.21603775024414062, + "learning_rate": 4.9871734433816156e-05, + "loss": 0.0521, + "step": 16130 + }, + { + "epoch": 0.0757, + "grad_norm": 0.2373482584953308, + "learning_rate": 4.9871315899102345e-05, + "loss": 0.0512, + "step": 16140 + }, + { + "epoch": 0.07575, + "grad_norm": 0.19632349908351898, + "learning_rate": 4.98708966844161e-05, + "loss": 0.0542, + "step": 16150 + }, + { + "epoch": 0.0758, + "grad_norm": 0.2170940786600113, + "learning_rate": 4.987047678976887e-05, + "loss": 0.0484, + "step": 16160 + }, + { + "epoch": 0.07585, + "grad_norm": 0.2200576663017273, + "learning_rate": 4.987005621517217e-05, + "loss": 0.0514, + "step": 16170 + }, + { + "epoch": 0.0759, + "grad_norm": 0.30243274569511414, + "learning_rate": 4.9869634960637454e-05, + "loss": 0.0518, + "step": 16180 + }, + { + "epoch": 0.07595, + "grad_norm": 0.2449166476726532, + "learning_rate": 4.9869213026176275e-05, + "loss": 0.0512, + "step": 16190 + }, + { + "epoch": 0.076, + "grad_norm": 0.22707422077655792, + "learning_rate": 4.986879041180016e-05, + "loss": 0.05, + "step": 16200 + }, + { + "epoch": 0.07605, + "grad_norm": 0.17047467827796936, + "learning_rate": 4.986836711752064e-05, + "loss": 0.0505, + "step": 16210 + }, + { + "epoch": 0.0761, + "grad_norm": 0.19080035388469696, + "learning_rate": 4.986794314334932e-05, + "loss": 0.0497, + "step": 16220 + }, + { + "epoch": 0.07615, + "grad_norm": 0.20476721227169037, + "learning_rate": 4.986751848929777e-05, + "loss": 0.0501, + "step": 16230 + }, + { + "epoch": 0.0762, + "grad_norm": 0.31264495849609375, + "learning_rate": 4.9867093155377606e-05, + "loss": 0.0536, + "step": 16240 + }, + { + "epoch": 0.07625, + "grad_norm": 0.21649867296218872, + "learning_rate": 4.986666714160047e-05, + "loss": 0.0512, + "step": 16250 + }, + { + "epoch": 0.0763, + "grad_norm": 0.2247818261384964, + "learning_rate": 4.986624044797799e-05, + "loss": 0.0525, + "step": 16260 + }, + { + "epoch": 0.07635, + "grad_norm": 0.19337685406208038, + "learning_rate": 4.9865813074521825e-05, + "loss": 0.0508, + "step": 16270 + }, + { + "epoch": 0.0764, + "grad_norm": 0.23544394969940186, + "learning_rate": 4.9865385021243686e-05, + "loss": 0.0517, + "step": 16280 + }, + { + "epoch": 0.07645, + "grad_norm": 0.177505761384964, + "learning_rate": 4.986495628815526e-05, + "loss": 0.0517, + "step": 16290 + }, + { + "epoch": 0.0765, + "grad_norm": 0.19319675862789154, + "learning_rate": 4.986452687526827e-05, + "loss": 0.0524, + "step": 16300 + }, + { + "epoch": 0.07655, + "grad_norm": 0.20353886485099792, + "learning_rate": 4.9864096782594446e-05, + "loss": 0.0526, + "step": 16310 + }, + { + "epoch": 0.0766, + "grad_norm": 0.2161436676979065, + "learning_rate": 4.986366601014557e-05, + "loss": 0.0532, + "step": 16320 + }, + { + "epoch": 0.07665, + "grad_norm": 0.23269261419773102, + "learning_rate": 4.98632345579334e-05, + "loss": 0.0508, + "step": 16330 + }, + { + "epoch": 0.0767, + "grad_norm": 0.16609209775924683, + "learning_rate": 4.9862802425969744e-05, + "loss": 0.0526, + "step": 16340 + }, + { + "epoch": 0.07675, + "grad_norm": 0.24200305342674255, + "learning_rate": 4.9862369614266404e-05, + "loss": 0.0542, + "step": 16350 + }, + { + "epoch": 0.0768, + "grad_norm": 0.21339236199855804, + "learning_rate": 4.9861936122835223e-05, + "loss": 0.0496, + "step": 16360 + }, + { + "epoch": 0.07685, + "grad_norm": 0.218331441283226, + "learning_rate": 4.986150195168805e-05, + "loss": 0.0522, + "step": 16370 + }, + { + "epoch": 0.0769, + "grad_norm": 0.19578105211257935, + "learning_rate": 4.9861067100836744e-05, + "loss": 0.052, + "step": 16380 + }, + { + "epoch": 0.07695, + "grad_norm": 0.24065200984477997, + "learning_rate": 4.9860631570293216e-05, + "loss": 0.0508, + "step": 16390 + }, + { + "epoch": 0.077, + "grad_norm": 0.19160734117031097, + "learning_rate": 4.986019536006935e-05, + "loss": 0.0533, + "step": 16400 + }, + { + "epoch": 0.07705, + "grad_norm": 0.24766413867473602, + "learning_rate": 4.9859758470177084e-05, + "loss": 0.0503, + "step": 16410 + }, + { + "epoch": 0.0771, + "grad_norm": 0.21655277907848358, + "learning_rate": 4.985932090062837e-05, + "loss": 0.0537, + "step": 16420 + }, + { + "epoch": 0.07715, + "grad_norm": 0.22482611238956451, + "learning_rate": 4.985888265143515e-05, + "loss": 0.051, + "step": 16430 + }, + { + "epoch": 0.0772, + "grad_norm": 0.19084882736206055, + "learning_rate": 4.9858443722609426e-05, + "loss": 0.0489, + "step": 16440 + }, + { + "epoch": 0.07725, + "grad_norm": 0.1958894431591034, + "learning_rate": 4.985800411416318e-05, + "loss": 0.0507, + "step": 16450 + }, + { + "epoch": 0.0773, + "grad_norm": 0.19690927863121033, + "learning_rate": 4.9857563826108456e-05, + "loss": 0.0507, + "step": 16460 + }, + { + "epoch": 0.07735, + "grad_norm": 0.19239164888858795, + "learning_rate": 4.985712285845726e-05, + "loss": 0.0521, + "step": 16470 + }, + { + "epoch": 0.0774, + "grad_norm": 0.19204671680927277, + "learning_rate": 4.9856681211221666e-05, + "loss": 0.0536, + "step": 16480 + }, + { + "epoch": 0.07745, + "grad_norm": 0.20469598472118378, + "learning_rate": 4.9856238884413754e-05, + "loss": 0.0493, + "step": 16490 + }, + { + "epoch": 0.0775, + "grad_norm": 0.1652633398771286, + "learning_rate": 4.9855795878045606e-05, + "loss": 0.051, + "step": 16500 + }, + { + "epoch": 0.07755, + "grad_norm": 0.16923579573631287, + "learning_rate": 4.985535219212933e-05, + "loss": 0.0493, + "step": 16510 + }, + { + "epoch": 0.0776, + "grad_norm": 0.1931590586900711, + "learning_rate": 4.9854907826677074e-05, + "loss": 0.0499, + "step": 16520 + }, + { + "epoch": 0.07765, + "grad_norm": 0.17787854373455048, + "learning_rate": 4.985446278170097e-05, + "loss": 0.0499, + "step": 16530 + }, + { + "epoch": 0.0777, + "grad_norm": 0.21355508267879486, + "learning_rate": 4.9854017057213187e-05, + "loss": 0.0539, + "step": 16540 + }, + { + "epoch": 0.07775, + "grad_norm": 0.17724072933197021, + "learning_rate": 4.985357065322592e-05, + "loss": 0.052, + "step": 16550 + }, + { + "epoch": 0.0778, + "grad_norm": 0.19531121850013733, + "learning_rate": 4.985312356975137e-05, + "loss": 0.0519, + "step": 16560 + }, + { + "epoch": 0.07785, + "grad_norm": 0.17170202732086182, + "learning_rate": 4.985267580680175e-05, + "loss": 0.0568, + "step": 16570 + }, + { + "epoch": 0.0779, + "grad_norm": 0.22292238473892212, + "learning_rate": 4.9852227364389316e-05, + "loss": 0.0529, + "step": 16580 + }, + { + "epoch": 0.07795, + "grad_norm": 0.1876860409975052, + "learning_rate": 4.985177824252632e-05, + "loss": 0.0508, + "step": 16590 + }, + { + "epoch": 0.078, + "grad_norm": 0.19387325644493103, + "learning_rate": 4.9851328441225044e-05, + "loss": 0.05, + "step": 16600 + }, + { + "epoch": 0.07805, + "grad_norm": 0.19660423696041107, + "learning_rate": 4.9850877960497786e-05, + "loss": 0.0498, + "step": 16610 + }, + { + "epoch": 0.0781, + "grad_norm": 0.2226826399564743, + "learning_rate": 4.9850426800356855e-05, + "loss": 0.0552, + "step": 16620 + }, + { + "epoch": 0.07815, + "grad_norm": 0.16555924713611603, + "learning_rate": 4.9849974960814606e-05, + "loss": 0.0496, + "step": 16630 + }, + { + "epoch": 0.0782, + "grad_norm": 0.21441705524921417, + "learning_rate": 4.9849522441883364e-05, + "loss": 0.0563, + "step": 16640 + }, + { + "epoch": 0.07825, + "grad_norm": 0.20419728755950928, + "learning_rate": 4.984906924357552e-05, + "loss": 0.0528, + "step": 16650 + }, + { + "epoch": 0.0783, + "grad_norm": 0.2105439454317093, + "learning_rate": 4.984861536590345e-05, + "loss": 0.0516, + "step": 16660 + }, + { + "epoch": 0.07835, + "grad_norm": 0.19244681298732758, + "learning_rate": 4.984816080887958e-05, + "loss": 0.0517, + "step": 16670 + }, + { + "epoch": 0.0784, + "grad_norm": 0.1926605999469757, + "learning_rate": 4.9847705572516326e-05, + "loss": 0.0495, + "step": 16680 + }, + { + "epoch": 0.07845, + "grad_norm": 0.16235984861850739, + "learning_rate": 4.9847249656826136e-05, + "loss": 0.0509, + "step": 16690 + }, + { + "epoch": 0.0785, + "grad_norm": 0.18869003653526306, + "learning_rate": 4.984679306182147e-05, + "loss": 0.0505, + "step": 16700 + }, + { + "epoch": 0.07855, + "grad_norm": 0.1517942249774933, + "learning_rate": 4.984633578751482e-05, + "loss": 0.0498, + "step": 16710 + }, + { + "epoch": 0.0786, + "grad_norm": 0.18166415393352509, + "learning_rate": 4.984587783391869e-05, + "loss": 0.0566, + "step": 16720 + }, + { + "epoch": 0.07865, + "grad_norm": 0.19320838153362274, + "learning_rate": 4.984541920104558e-05, + "loss": 0.0502, + "step": 16730 + }, + { + "epoch": 0.0787, + "grad_norm": 0.21377691626548767, + "learning_rate": 4.984495988890806e-05, + "loss": 0.0522, + "step": 16740 + }, + { + "epoch": 0.07875, + "grad_norm": 0.1650165170431137, + "learning_rate": 4.9844499897518656e-05, + "loss": 0.0476, + "step": 16750 + }, + { + "epoch": 0.0788, + "grad_norm": 0.23033910989761353, + "learning_rate": 4.984403922688997e-05, + "loss": 0.0549, + "step": 16760 + }, + { + "epoch": 0.07885, + "grad_norm": 0.19480597972869873, + "learning_rate": 4.984357787703458e-05, + "loss": 0.0538, + "step": 16770 + }, + { + "epoch": 0.0789, + "grad_norm": 0.21139460802078247, + "learning_rate": 4.98431158479651e-05, + "loss": 0.0529, + "step": 16780 + }, + { + "epoch": 0.07895, + "grad_norm": 0.17839471995830536, + "learning_rate": 4.984265313969417e-05, + "loss": 0.0541, + "step": 16790 + }, + { + "epoch": 0.079, + "grad_norm": 0.18361295759677887, + "learning_rate": 4.9842189752234435e-05, + "loss": 0.0487, + "step": 16800 + }, + { + "epoch": 0.07905, + "grad_norm": 0.1692679226398468, + "learning_rate": 4.9841725685598574e-05, + "loss": 0.0492, + "step": 16810 + }, + { + "epoch": 0.0791, + "grad_norm": 0.1974862962961197, + "learning_rate": 4.984126093979925e-05, + "loss": 0.051, + "step": 16820 + }, + { + "epoch": 0.07915, + "grad_norm": 0.18256103992462158, + "learning_rate": 4.9840795514849196e-05, + "loss": 0.0524, + "step": 16830 + }, + { + "epoch": 0.0792, + "grad_norm": 0.1663244068622589, + "learning_rate": 4.9840329410761124e-05, + "loss": 0.049, + "step": 16840 + }, + { + "epoch": 0.07925, + "grad_norm": 0.18855704367160797, + "learning_rate": 4.983986262754777e-05, + "loss": 0.0506, + "step": 16850 + }, + { + "epoch": 0.0793, + "grad_norm": 0.168840229511261, + "learning_rate": 4.983939516522191e-05, + "loss": 0.0522, + "step": 16860 + }, + { + "epoch": 0.07935, + "grad_norm": 0.183172807097435, + "learning_rate": 4.9838927023796315e-05, + "loss": 0.0504, + "step": 16870 + }, + { + "epoch": 0.0794, + "grad_norm": 0.26104679703712463, + "learning_rate": 4.9838458203283786e-05, + "loss": 0.0545, + "step": 16880 + }, + { + "epoch": 0.07945, + "grad_norm": 0.18943244218826294, + "learning_rate": 4.9837988703697144e-05, + "loss": 0.0493, + "step": 16890 + }, + { + "epoch": 0.0795, + "grad_norm": 0.20340140163898468, + "learning_rate": 4.983751852504922e-05, + "loss": 0.0517, + "step": 16900 + }, + { + "epoch": 0.07955, + "grad_norm": 0.16128084063529968, + "learning_rate": 4.983704766735288e-05, + "loss": 0.0493, + "step": 16910 + }, + { + "epoch": 0.0796, + "grad_norm": 0.17783492803573608, + "learning_rate": 4.983657613062097e-05, + "loss": 0.047, + "step": 16920 + }, + { + "epoch": 0.07965, + "grad_norm": 0.15998443961143494, + "learning_rate": 4.983610391486641e-05, + "loss": 0.0516, + "step": 16930 + }, + { + "epoch": 0.0797, + "grad_norm": 0.1869068592786789, + "learning_rate": 4.9835631020102104e-05, + "loss": 0.0555, + "step": 16940 + }, + { + "epoch": 0.07975, + "grad_norm": 0.21481378376483917, + "learning_rate": 4.9835157446340965e-05, + "loss": 0.0509, + "step": 16950 + }, + { + "epoch": 0.0798, + "grad_norm": 0.19575315713882446, + "learning_rate": 4.983468319359595e-05, + "loss": 0.0519, + "step": 16960 + }, + { + "epoch": 0.07985, + "grad_norm": 0.18808448314666748, + "learning_rate": 4.983420826188004e-05, + "loss": 0.0522, + "step": 16970 + }, + { + "epoch": 0.0799, + "grad_norm": 0.161651149392128, + "learning_rate": 4.98337326512062e-05, + "loss": 0.0492, + "step": 16980 + }, + { + "epoch": 0.07995, + "grad_norm": 0.16255159676074982, + "learning_rate": 4.983325636158744e-05, + "loss": 0.0482, + "step": 16990 + }, + { + "epoch": 0.08, + "grad_norm": 0.17627467215061188, + "learning_rate": 4.9832779393036777e-05, + "loss": 0.0467, + "step": 17000 + }, + { + "epoch": 0.08005, + "grad_norm": 0.17127850651741028, + "learning_rate": 4.983230174556725e-05, + "loss": 0.0503, + "step": 17010 + }, + { + "epoch": 0.0801, + "grad_norm": 0.1601397693157196, + "learning_rate": 4.983182341919194e-05, + "loss": 0.0497, + "step": 17020 + }, + { + "epoch": 0.08015, + "grad_norm": 0.17131473124027252, + "learning_rate": 4.9831344413923885e-05, + "loss": 0.047, + "step": 17030 + }, + { + "epoch": 0.0802, + "grad_norm": 0.20350749790668488, + "learning_rate": 4.983086472977622e-05, + "loss": 0.0493, + "step": 17040 + }, + { + "epoch": 0.08025, + "grad_norm": 0.15694685280323029, + "learning_rate": 4.9830384366762026e-05, + "loss": 0.0508, + "step": 17050 + }, + { + "epoch": 0.0803, + "grad_norm": 0.18018211424350739, + "learning_rate": 4.9829903324894466e-05, + "loss": 0.05, + "step": 17060 + }, + { + "epoch": 0.08035, + "grad_norm": 0.19399897754192352, + "learning_rate": 4.982942160418667e-05, + "loss": 0.0523, + "step": 17070 + }, + { + "epoch": 0.0804, + "grad_norm": 0.18660689890384674, + "learning_rate": 4.982893920465181e-05, + "loss": 0.0497, + "step": 17080 + }, + { + "epoch": 0.08045, + "grad_norm": 0.1875895857810974, + "learning_rate": 4.9828456126303094e-05, + "loss": 0.0493, + "step": 17090 + }, + { + "epoch": 0.0805, + "grad_norm": 0.1983502358198166, + "learning_rate": 4.982797236915371e-05, + "loss": 0.0497, + "step": 17100 + }, + { + "epoch": 0.08055, + "grad_norm": 0.24940334260463715, + "learning_rate": 4.9827487933216884e-05, + "loss": 0.0525, + "step": 17110 + }, + { + "epoch": 0.0806, + "grad_norm": 0.2184525579214096, + "learning_rate": 4.982700281850586e-05, + "loss": 0.0509, + "step": 17120 + }, + { + "epoch": 0.08065, + "grad_norm": 0.21082714200019836, + "learning_rate": 4.982651702503392e-05, + "loss": 0.053, + "step": 17130 + }, + { + "epoch": 0.0807, + "grad_norm": 0.2231767475605011, + "learning_rate": 4.982603055281432e-05, + "loss": 0.0515, + "step": 17140 + }, + { + "epoch": 0.08075, + "grad_norm": 0.250836044549942, + "learning_rate": 4.982554340186038e-05, + "loss": 0.0605, + "step": 17150 + }, + { + "epoch": 0.0808, + "grad_norm": 0.22837120294570923, + "learning_rate": 4.982505557218541e-05, + "loss": 0.0522, + "step": 17160 + }, + { + "epoch": 0.08085, + "grad_norm": 0.2457517683506012, + "learning_rate": 4.9824567063802744e-05, + "loss": 0.0554, + "step": 17170 + }, + { + "epoch": 0.0809, + "grad_norm": 0.23828142881393433, + "learning_rate": 4.982407787672574e-05, + "loss": 0.0516, + "step": 17180 + }, + { + "epoch": 0.08095, + "grad_norm": 0.20121601223945618, + "learning_rate": 4.982358801096777e-05, + "loss": 0.0505, + "step": 17190 + }, + { + "epoch": 0.081, + "grad_norm": 0.20422187447547913, + "learning_rate": 4.9823097466542236e-05, + "loss": 0.0544, + "step": 17200 + }, + { + "epoch": 0.08105, + "grad_norm": 0.15830935537815094, + "learning_rate": 4.9822606243462534e-05, + "loss": 0.0501, + "step": 17210 + }, + { + "epoch": 0.0811, + "grad_norm": 0.166924387216568, + "learning_rate": 4.982211434174211e-05, + "loss": 0.0519, + "step": 17220 + }, + { + "epoch": 0.08115, + "grad_norm": 0.15573611855506897, + "learning_rate": 4.98216217613944e-05, + "loss": 0.0494, + "step": 17230 + }, + { + "epoch": 0.0812, + "grad_norm": 0.15739154815673828, + "learning_rate": 4.982112850243288e-05, + "loss": 0.0528, + "step": 17240 + }, + { + "epoch": 0.08125, + "grad_norm": 0.1668911725282669, + "learning_rate": 4.9820634564871034e-05, + "loss": 0.0516, + "step": 17250 + }, + { + "epoch": 0.0813, + "grad_norm": 0.17671068012714386, + "learning_rate": 4.982013994872236e-05, + "loss": 0.051, + "step": 17260 + }, + { + "epoch": 0.08135, + "grad_norm": 0.16582036018371582, + "learning_rate": 4.9819644654000387e-05, + "loss": 0.0499, + "step": 17270 + }, + { + "epoch": 0.0814, + "grad_norm": 0.1869654655456543, + "learning_rate": 4.981914868071865e-05, + "loss": 0.054, + "step": 17280 + }, + { + "epoch": 0.08145, + "grad_norm": 0.19672545790672302, + "learning_rate": 4.981865202889071e-05, + "loss": 0.0493, + "step": 17290 + }, + { + "epoch": 0.0815, + "grad_norm": 0.20300154387950897, + "learning_rate": 4.981815469853015e-05, + "loss": 0.0507, + "step": 17300 + }, + { + "epoch": 0.08155, + "grad_norm": 0.1864527016878128, + "learning_rate": 4.981765668965057e-05, + "loss": 0.0513, + "step": 17310 + }, + { + "epoch": 0.0816, + "grad_norm": 0.15904299914836884, + "learning_rate": 4.9817158002265576e-05, + "loss": 0.0493, + "step": 17320 + }, + { + "epoch": 0.08165, + "grad_norm": 0.19241668283939362, + "learning_rate": 4.98166586363888e-05, + "loss": 0.0503, + "step": 17330 + }, + { + "epoch": 0.0817, + "grad_norm": 0.17900395393371582, + "learning_rate": 4.98161585920339e-05, + "loss": 0.0508, + "step": 17340 + }, + { + "epoch": 0.08175, + "grad_norm": 0.16683508455753326, + "learning_rate": 4.981565786921456e-05, + "loss": 0.0528, + "step": 17350 + }, + { + "epoch": 0.0818, + "grad_norm": 0.17146515846252441, + "learning_rate": 4.9815156467944446e-05, + "loss": 0.0484, + "step": 17360 + }, + { + "epoch": 0.08185, + "grad_norm": 0.195095032453537, + "learning_rate": 4.9814654388237284e-05, + "loss": 0.0491, + "step": 17370 + }, + { + "epoch": 0.0819, + "grad_norm": 0.16585716605186462, + "learning_rate": 4.981415163010679e-05, + "loss": 0.0496, + "step": 17380 + }, + { + "epoch": 0.08195, + "grad_norm": 0.15615801513195038, + "learning_rate": 4.9813648193566705e-05, + "loss": 0.049, + "step": 17390 + }, + { + "epoch": 0.082, + "grad_norm": 0.20243534445762634, + "learning_rate": 4.98131440786308e-05, + "loss": 0.0562, + "step": 17400 + }, + { + "epoch": 0.08205, + "grad_norm": 0.19609931111335754, + "learning_rate": 4.981263928531287e-05, + "loss": 0.0514, + "step": 17410 + }, + { + "epoch": 0.0821, + "grad_norm": 0.19932952523231506, + "learning_rate": 4.98121338136267e-05, + "loss": 0.0521, + "step": 17420 + }, + { + "epoch": 0.08215, + "grad_norm": 0.2170480191707611, + "learning_rate": 4.981162766358611e-05, + "loss": 0.0506, + "step": 17430 + }, + { + "epoch": 0.0822, + "grad_norm": 0.21314817667007446, + "learning_rate": 4.981112083520494e-05, + "loss": 0.0499, + "step": 17440 + }, + { + "epoch": 0.08225, + "grad_norm": 0.15489475429058075, + "learning_rate": 4.981061332849705e-05, + "loss": 0.0518, + "step": 17450 + }, + { + "epoch": 0.0823, + "grad_norm": 0.18252049386501312, + "learning_rate": 4.9810105143476315e-05, + "loss": 0.0492, + "step": 17460 + }, + { + "epoch": 0.08235, + "grad_norm": 0.1492038071155548, + "learning_rate": 4.980959628015662e-05, + "loss": 0.0495, + "step": 17470 + }, + { + "epoch": 0.0824, + "grad_norm": 0.1855282336473465, + "learning_rate": 4.980908673855189e-05, + "loss": 0.0495, + "step": 17480 + }, + { + "epoch": 0.08245, + "grad_norm": 0.15187287330627441, + "learning_rate": 4.980857651867604e-05, + "loss": 0.0496, + "step": 17490 + }, + { + "epoch": 0.0825, + "grad_norm": 0.1776946783065796, + "learning_rate": 4.980806562054303e-05, + "loss": 0.0515, + "step": 17500 + }, + { + "epoch": 0.08255, + "grad_norm": 0.19719122350215912, + "learning_rate": 4.980755404416684e-05, + "loss": 0.052, + "step": 17510 + }, + { + "epoch": 0.0826, + "grad_norm": 0.1953863650560379, + "learning_rate": 4.980704178956143e-05, + "loss": 0.0538, + "step": 17520 + }, + { + "epoch": 0.08265, + "grad_norm": 0.1708361655473709, + "learning_rate": 4.9806528856740814e-05, + "loss": 0.0519, + "step": 17530 + }, + { + "epoch": 0.0827, + "grad_norm": 0.23801736533641815, + "learning_rate": 4.9806015245719025e-05, + "loss": 0.0522, + "step": 17540 + }, + { + "epoch": 0.08275, + "grad_norm": 0.19790126383304596, + "learning_rate": 4.9805500956510095e-05, + "loss": 0.056, + "step": 17550 + }, + { + "epoch": 0.0828, + "grad_norm": 0.17115181684494019, + "learning_rate": 4.980498598912809e-05, + "loss": 0.049, + "step": 17560 + }, + { + "epoch": 0.08285, + "grad_norm": 0.15244634449481964, + "learning_rate": 4.980447034358708e-05, + "loss": 0.0501, + "step": 17570 + }, + { + "epoch": 0.0829, + "grad_norm": 0.18820776045322418, + "learning_rate": 4.9803954019901175e-05, + "loss": 0.0512, + "step": 17580 + }, + { + "epoch": 0.08295, + "grad_norm": 0.16231533885002136, + "learning_rate": 4.980343701808449e-05, + "loss": 0.051, + "step": 17590 + }, + { + "epoch": 0.083, + "grad_norm": 0.18385298550128937, + "learning_rate": 4.9802919338151154e-05, + "loss": 0.0498, + "step": 17600 + }, + { + "epoch": 0.08305, + "grad_norm": 0.145505890250206, + "learning_rate": 4.980240098011532e-05, + "loss": 0.0531, + "step": 17610 + }, + { + "epoch": 0.0831, + "grad_norm": 0.19038888812065125, + "learning_rate": 4.980188194399116e-05, + "loss": 0.0501, + "step": 17620 + }, + { + "epoch": 0.08315, + "grad_norm": 0.19386428594589233, + "learning_rate": 4.980136222979286e-05, + "loss": 0.05, + "step": 17630 + }, + { + "epoch": 0.0832, + "grad_norm": 0.19215770065784454, + "learning_rate": 4.9800841837534636e-05, + "loss": 0.0495, + "step": 17640 + }, + { + "epoch": 0.08325, + "grad_norm": 0.1821633279323578, + "learning_rate": 4.980032076723073e-05, + "loss": 0.0496, + "step": 17650 + }, + { + "epoch": 0.0833, + "grad_norm": 0.16347461938858032, + "learning_rate": 4.979979901889535e-05, + "loss": 0.0512, + "step": 17660 + }, + { + "epoch": 0.08335, + "grad_norm": 0.19061973690986633, + "learning_rate": 4.979927659254279e-05, + "loss": 0.0493, + "step": 17670 + }, + { + "epoch": 0.0834, + "grad_norm": 0.21797873079776764, + "learning_rate": 4.9798753488187324e-05, + "loss": 0.0508, + "step": 17680 + }, + { + "epoch": 0.08345, + "grad_norm": 0.189432293176651, + "learning_rate": 4.979822970584325e-05, + "loss": 0.0498, + "step": 17690 + }, + { + "epoch": 0.0835, + "grad_norm": 0.23013871908187866, + "learning_rate": 4.979770524552489e-05, + "loss": 0.0541, + "step": 17700 + }, + { + "epoch": 0.08355, + "grad_norm": 0.19550783932209015, + "learning_rate": 4.97971801072466e-05, + "loss": 0.0503, + "step": 17710 + }, + { + "epoch": 0.0836, + "grad_norm": 0.20004889369010925, + "learning_rate": 4.979665429102271e-05, + "loss": 0.0516, + "step": 17720 + }, + { + "epoch": 0.08365, + "grad_norm": 0.2468462437391281, + "learning_rate": 4.979612779686761e-05, + "loss": 0.0506, + "step": 17730 + }, + { + "epoch": 0.0837, + "grad_norm": 0.2049475908279419, + "learning_rate": 4.979560062479569e-05, + "loss": 0.0534, + "step": 17740 + }, + { + "epoch": 0.08375, + "grad_norm": 0.18321344256401062, + "learning_rate": 4.9795072774821366e-05, + "loss": 0.0506, + "step": 17750 + }, + { + "epoch": 0.0838, + "grad_norm": 0.20350444316864014, + "learning_rate": 4.979454424695906e-05, + "loss": 0.0575, + "step": 17760 + }, + { + "epoch": 0.08385, + "grad_norm": 0.18226414918899536, + "learning_rate": 4.979401504122324e-05, + "loss": 0.0484, + "step": 17770 + }, + { + "epoch": 0.0839, + "grad_norm": 0.21651868522167206, + "learning_rate": 4.979348515762836e-05, + "loss": 0.0519, + "step": 17780 + }, + { + "epoch": 0.08395, + "grad_norm": 0.15989641845226288, + "learning_rate": 4.9792954596188914e-05, + "loss": 0.052, + "step": 17790 + }, + { + "epoch": 0.084, + "grad_norm": 0.20612096786499023, + "learning_rate": 4.979242335691939e-05, + "loss": 0.0499, + "step": 17800 + }, + { + "epoch": 0.08405, + "grad_norm": 0.16908518970012665, + "learning_rate": 4.979189143983434e-05, + "loss": 0.0487, + "step": 17810 + }, + { + "epoch": 0.0841, + "grad_norm": 0.18822510540485382, + "learning_rate": 4.979135884494829e-05, + "loss": 0.0504, + "step": 17820 + }, + { + "epoch": 0.08415, + "grad_norm": 0.19432739913463593, + "learning_rate": 4.97908255722758e-05, + "loss": 0.0516, + "step": 17830 + }, + { + "epoch": 0.0842, + "grad_norm": 0.1897604763507843, + "learning_rate": 4.9790291621831456e-05, + "loss": 0.0509, + "step": 17840 + }, + { + "epoch": 0.08425, + "grad_norm": 0.18982402980327606, + "learning_rate": 4.978975699362984e-05, + "loss": 0.0519, + "step": 17850 + }, + { + "epoch": 0.0843, + "grad_norm": 0.187894806265831, + "learning_rate": 4.97892216876856e-05, + "loss": 0.0485, + "step": 17860 + }, + { + "epoch": 0.08435, + "grad_norm": 0.16973808407783508, + "learning_rate": 4.978868570401333e-05, + "loss": 0.0478, + "step": 17870 + }, + { + "epoch": 0.0844, + "grad_norm": 0.15670092403888702, + "learning_rate": 4.978814904262772e-05, + "loss": 0.0495, + "step": 17880 + }, + { + "epoch": 0.08445, + "grad_norm": 0.16168759763240814, + "learning_rate": 4.9787611703543426e-05, + "loss": 0.0487, + "step": 17890 + }, + { + "epoch": 0.0845, + "grad_norm": 0.178312286734581, + "learning_rate": 4.9787073686775136e-05, + "loss": 0.049, + "step": 17900 + }, + { + "epoch": 0.08455, + "grad_norm": 0.15167541801929474, + "learning_rate": 4.978653499233756e-05, + "loss": 0.0492, + "step": 17910 + }, + { + "epoch": 0.0846, + "grad_norm": 0.17063525319099426, + "learning_rate": 4.978599562024544e-05, + "loss": 0.0532, + "step": 17920 + }, + { + "epoch": 0.08465, + "grad_norm": 0.18666213750839233, + "learning_rate": 4.978545557051351e-05, + "loss": 0.0507, + "step": 17930 + }, + { + "epoch": 0.0847, + "grad_norm": 0.1855110377073288, + "learning_rate": 4.978491484315653e-05, + "loss": 0.0475, + "step": 17940 + }, + { + "epoch": 0.08475, + "grad_norm": 0.20270107686519623, + "learning_rate": 4.978437343818929e-05, + "loss": 0.0476, + "step": 17950 + }, + { + "epoch": 0.0848, + "grad_norm": 0.22121216356754303, + "learning_rate": 4.97838313556266e-05, + "loss": 0.0481, + "step": 17960 + }, + { + "epoch": 0.08485, + "grad_norm": 0.17030639946460724, + "learning_rate": 4.978328859548326e-05, + "loss": 0.0468, + "step": 17970 + }, + { + "epoch": 0.0849, + "grad_norm": 0.17538540065288544, + "learning_rate": 4.978274515777413e-05, + "loss": 0.0513, + "step": 17980 + }, + { + "epoch": 0.08495, + "grad_norm": 0.1701241135597229, + "learning_rate": 4.9782201042514056e-05, + "loss": 0.049, + "step": 17990 + }, + { + "epoch": 0.085, + "grad_norm": 0.166990265250206, + "learning_rate": 4.9781656249717914e-05, + "loss": 0.0496, + "step": 18000 + }, + { + "epoch": 0.08505, + "grad_norm": 0.20476819574832916, + "learning_rate": 4.978111077940059e-05, + "loss": 0.0508, + "step": 18010 + }, + { + "epoch": 0.0851, + "grad_norm": 0.19486652314662933, + "learning_rate": 4.978056463157702e-05, + "loss": 0.049, + "step": 18020 + }, + { + "epoch": 0.08515, + "grad_norm": 0.1732991337776184, + "learning_rate": 4.978001780626212e-05, + "loss": 0.0488, + "step": 18030 + }, + { + "epoch": 0.0852, + "grad_norm": 0.19245341420173645, + "learning_rate": 4.977947030347084e-05, + "loss": 0.0484, + "step": 18040 + }, + { + "epoch": 0.08525, + "grad_norm": 0.18052397668361664, + "learning_rate": 4.977892212321815e-05, + "loss": 0.0511, + "step": 18050 + }, + { + "epoch": 0.0853, + "grad_norm": 0.23231765627861023, + "learning_rate": 4.977837326551904e-05, + "loss": 0.0537, + "step": 18060 + }, + { + "epoch": 0.08535, + "grad_norm": 0.1854337900876999, + "learning_rate": 4.977782373038852e-05, + "loss": 0.0499, + "step": 18070 + }, + { + "epoch": 0.0854, + "grad_norm": 0.20114870369434357, + "learning_rate": 4.9777273517841597e-05, + "loss": 0.0483, + "step": 18080 + }, + { + "epoch": 0.08545, + "grad_norm": 0.1717618703842163, + "learning_rate": 4.9776722627893334e-05, + "loss": 0.0489, + "step": 18090 + }, + { + "epoch": 0.0855, + "grad_norm": 0.16750378906726837, + "learning_rate": 4.977617106055878e-05, + "loss": 0.0513, + "step": 18100 + }, + { + "epoch": 0.08555, + "grad_norm": 0.21718497574329376, + "learning_rate": 4.977561881585301e-05, + "loss": 0.0562, + "step": 18110 + }, + { + "epoch": 0.0856, + "grad_norm": 0.19127683341503143, + "learning_rate": 4.977506589379114e-05, + "loss": 0.0526, + "step": 18120 + }, + { + "epoch": 0.08565, + "grad_norm": 0.2382887452840805, + "learning_rate": 4.977451229438827e-05, + "loss": 0.054, + "step": 18130 + }, + { + "epoch": 0.0857, + "grad_norm": 0.19061945378780365, + "learning_rate": 4.977395801765954e-05, + "loss": 0.0521, + "step": 18140 + }, + { + "epoch": 0.08575, + "grad_norm": 0.18431106209754944, + "learning_rate": 4.9773403063620104e-05, + "loss": 0.0506, + "step": 18150 + }, + { + "epoch": 0.0858, + "grad_norm": 0.16501373052597046, + "learning_rate": 4.977284743228514e-05, + "loss": 0.0486, + "step": 18160 + }, + { + "epoch": 0.08585, + "grad_norm": 0.18923909962177277, + "learning_rate": 4.977229112366983e-05, + "loss": 0.0558, + "step": 18170 + }, + { + "epoch": 0.0859, + "grad_norm": 0.17071984708309174, + "learning_rate": 4.9771734137789394e-05, + "loss": 0.0551, + "step": 18180 + }, + { + "epoch": 0.08595, + "grad_norm": 0.20952653884887695, + "learning_rate": 4.9771176474659045e-05, + "loss": 0.0528, + "step": 18190 + }, + { + "epoch": 0.086, + "grad_norm": 0.17564013600349426, + "learning_rate": 4.977061813429404e-05, + "loss": 0.0541, + "step": 18200 + }, + { + "epoch": 0.08605, + "grad_norm": 0.17267075181007385, + "learning_rate": 4.977005911670964e-05, + "loss": 0.0501, + "step": 18210 + }, + { + "epoch": 0.0861, + "grad_norm": 0.206747367978096, + "learning_rate": 4.976949942192114e-05, + "loss": 0.049, + "step": 18220 + }, + { + "epoch": 0.08615, + "grad_norm": 0.19832727313041687, + "learning_rate": 4.976893904994382e-05, + "loss": 0.0493, + "step": 18230 + }, + { + "epoch": 0.0862, + "grad_norm": 0.18825599551200867, + "learning_rate": 4.9768378000793015e-05, + "loss": 0.0542, + "step": 18240 + }, + { + "epoch": 0.08625, + "grad_norm": 0.21697713434696198, + "learning_rate": 4.976781627448406e-05, + "loss": 0.0544, + "step": 18250 + }, + { + "epoch": 0.0863, + "grad_norm": 0.21250379085540771, + "learning_rate": 4.976725387103231e-05, + "loss": 0.0498, + "step": 18260 + }, + { + "epoch": 0.08635, + "grad_norm": 0.2346934974193573, + "learning_rate": 4.9766690790453144e-05, + "loss": 0.0479, + "step": 18270 + }, + { + "epoch": 0.0864, + "grad_norm": 0.2579614222049713, + "learning_rate": 4.976612703276196e-05, + "loss": 0.0515, + "step": 18280 + }, + { + "epoch": 0.08645, + "grad_norm": 0.261024534702301, + "learning_rate": 4.976556259797417e-05, + "loss": 0.0511, + "step": 18290 + }, + { + "epoch": 0.0865, + "grad_norm": 0.2698875963687897, + "learning_rate": 4.976499748610519e-05, + "loss": 0.052, + "step": 18300 + }, + { + "epoch": 0.08655, + "grad_norm": 0.23211123049259186, + "learning_rate": 4.97644316971705e-05, + "loss": 0.0506, + "step": 18310 + }, + { + "epoch": 0.0866, + "grad_norm": 0.26521798968315125, + "learning_rate": 4.976386523118554e-05, + "loss": 0.0496, + "step": 18320 + }, + { + "epoch": 0.08665, + "grad_norm": 0.24303749203681946, + "learning_rate": 4.97632980881658e-05, + "loss": 0.0493, + "step": 18330 + }, + { + "epoch": 0.0867, + "grad_norm": 0.2587723731994629, + "learning_rate": 4.976273026812681e-05, + "loss": 0.0521, + "step": 18340 + }, + { + "epoch": 0.08675, + "grad_norm": 0.20306704938411713, + "learning_rate": 4.976216177108407e-05, + "loss": 0.0495, + "step": 18350 + }, + { + "epoch": 0.0868, + "grad_norm": 0.24386657774448395, + "learning_rate": 4.976159259705313e-05, + "loss": 0.0521, + "step": 18360 + }, + { + "epoch": 0.08685, + "grad_norm": 0.22963173687458038, + "learning_rate": 4.9761022746049544e-05, + "loss": 0.0497, + "step": 18370 + }, + { + "epoch": 0.0869, + "grad_norm": 0.2091069370508194, + "learning_rate": 4.9760452218088915e-05, + "loss": 0.0486, + "step": 18380 + }, + { + "epoch": 0.08695, + "grad_norm": 0.2133539766073227, + "learning_rate": 4.975988101318682e-05, + "loss": 0.0527, + "step": 18390 + }, + { + "epoch": 0.087, + "grad_norm": 0.19983141124248505, + "learning_rate": 4.975930913135887e-05, + "loss": 0.0493, + "step": 18400 + }, + { + "epoch": 0.08705, + "grad_norm": 0.20242683589458466, + "learning_rate": 4.9758736572620714e-05, + "loss": 0.0517, + "step": 18410 + }, + { + "epoch": 0.0871, + "grad_norm": 0.163058802485466, + "learning_rate": 4.9758163336988e-05, + "loss": 0.0484, + "step": 18420 + }, + { + "epoch": 0.08715, + "grad_norm": 0.18098005652427673, + "learning_rate": 4.97575894244764e-05, + "loss": 0.0481, + "step": 18430 + }, + { + "epoch": 0.0872, + "grad_norm": 0.2012663632631302, + "learning_rate": 4.975701483510161e-05, + "loss": 0.0521, + "step": 18440 + }, + { + "epoch": 0.08725, + "grad_norm": 0.19771431386470795, + "learning_rate": 4.9756439568879345e-05, + "loss": 0.0493, + "step": 18450 + }, + { + "epoch": 0.0873, + "grad_norm": 0.1853446364402771, + "learning_rate": 4.975586362582531e-05, + "loss": 0.0492, + "step": 18460 + }, + { + "epoch": 0.08735, + "grad_norm": 0.17247962951660156, + "learning_rate": 4.9755287005955264e-05, + "loss": 0.0506, + "step": 18470 + }, + { + "epoch": 0.0874, + "grad_norm": 0.1908915936946869, + "learning_rate": 4.975470970928498e-05, + "loss": 0.0497, + "step": 18480 + }, + { + "epoch": 0.08745, + "grad_norm": 0.14187689125537872, + "learning_rate": 4.9754131735830223e-05, + "loss": 0.0493, + "step": 18490 + }, + { + "epoch": 0.0875, + "grad_norm": 0.20994983613491058, + "learning_rate": 4.975355308560681e-05, + "loss": 0.0483, + "step": 18500 + }, + { + "epoch": 0.08755, + "grad_norm": 0.21409648656845093, + "learning_rate": 4.975297375863055e-05, + "loss": 0.049, + "step": 18510 + }, + { + "epoch": 0.0876, + "grad_norm": 0.1786034107208252, + "learning_rate": 4.975239375491729e-05, + "loss": 0.0514, + "step": 18520 + }, + { + "epoch": 0.08765, + "grad_norm": 0.20096784830093384, + "learning_rate": 4.9751813074482876e-05, + "loss": 0.0474, + "step": 18530 + }, + { + "epoch": 0.0877, + "grad_norm": 0.18679524958133698, + "learning_rate": 4.975123171734321e-05, + "loss": 0.0492, + "step": 18540 + }, + { + "epoch": 0.08775, + "grad_norm": 0.19444356858730316, + "learning_rate": 4.975064968351415e-05, + "loss": 0.0476, + "step": 18550 + }, + { + "epoch": 0.0878, + "grad_norm": 0.20263616740703583, + "learning_rate": 4.975006697301163e-05, + "loss": 0.0545, + "step": 18560 + }, + { + "epoch": 0.08785, + "grad_norm": 0.23749925196170807, + "learning_rate": 4.974948358585158e-05, + "loss": 0.0511, + "step": 18570 + }, + { + "epoch": 0.0879, + "grad_norm": 0.19112126529216766, + "learning_rate": 4.9748899522049944e-05, + "loss": 0.0504, + "step": 18580 + }, + { + "epoch": 0.08795, + "grad_norm": 0.1887068897485733, + "learning_rate": 4.9748314781622696e-05, + "loss": 0.0495, + "step": 18590 + }, + { + "epoch": 0.088, + "grad_norm": 0.1935853362083435, + "learning_rate": 4.974772936458582e-05, + "loss": 0.0496, + "step": 18600 + }, + { + "epoch": 0.08805, + "grad_norm": 0.1678624302148819, + "learning_rate": 4.9747143270955324e-05, + "loss": 0.0472, + "step": 18610 + }, + { + "epoch": 0.0881, + "grad_norm": 0.21695537865161896, + "learning_rate": 4.974655650074722e-05, + "loss": 0.0474, + "step": 18620 + }, + { + "epoch": 0.08815, + "grad_norm": 0.1442045122385025, + "learning_rate": 4.974596905397756e-05, + "loss": 0.0481, + "step": 18630 + }, + { + "epoch": 0.0882, + "grad_norm": 0.17364618182182312, + "learning_rate": 4.97453809306624e-05, + "loss": 0.0469, + "step": 18640 + }, + { + "epoch": 0.08825, + "grad_norm": 0.16962353885173798, + "learning_rate": 4.974479213081783e-05, + "loss": 0.0521, + "step": 18650 + }, + { + "epoch": 0.0883, + "grad_norm": 0.21104495227336884, + "learning_rate": 4.9744202654459935e-05, + "loss": 0.0505, + "step": 18660 + }, + { + "epoch": 0.08835, + "grad_norm": 0.13724002242088318, + "learning_rate": 4.974361250160483e-05, + "loss": 0.051, + "step": 18670 + }, + { + "epoch": 0.0884, + "grad_norm": 0.19101646542549133, + "learning_rate": 4.974302167226866e-05, + "loss": 0.0493, + "step": 18680 + }, + { + "epoch": 0.08845, + "grad_norm": 0.14986075460910797, + "learning_rate": 4.974243016646758e-05, + "loss": 0.0493, + "step": 18690 + }, + { + "epoch": 0.0885, + "grad_norm": 0.18582309782505035, + "learning_rate": 4.9741837984217746e-05, + "loss": 0.0479, + "step": 18700 + }, + { + "epoch": 0.08855, + "grad_norm": 0.18872737884521484, + "learning_rate": 4.974124512553535e-05, + "loss": 0.0477, + "step": 18710 + }, + { + "epoch": 0.0886, + "grad_norm": 0.20704255998134613, + "learning_rate": 4.9740651590436624e-05, + "loss": 0.0481, + "step": 18720 + }, + { + "epoch": 0.08865, + "grad_norm": 0.2147834151983261, + "learning_rate": 4.9740057378937764e-05, + "loss": 0.0486, + "step": 18730 + }, + { + "epoch": 0.0887, + "grad_norm": 0.166178360581398, + "learning_rate": 4.9739462491055035e-05, + "loss": 0.0483, + "step": 18740 + }, + { + "epoch": 0.08875, + "grad_norm": 0.2481415867805481, + "learning_rate": 4.9738866926804694e-05, + "loss": 0.0486, + "step": 18750 + }, + { + "epoch": 0.0888, + "grad_norm": 0.1964527815580368, + "learning_rate": 4.973827068620303e-05, + "loss": 0.0496, + "step": 18760 + }, + { + "epoch": 0.08885, + "grad_norm": 0.22712422907352448, + "learning_rate": 4.973767376926633e-05, + "loss": 0.0529, + "step": 18770 + }, + { + "epoch": 0.0889, + "grad_norm": 0.21424773335456848, + "learning_rate": 4.9737076176010935e-05, + "loss": 0.0489, + "step": 18780 + }, + { + "epoch": 0.08895, + "grad_norm": 0.19034512341022491, + "learning_rate": 4.973647790645316e-05, + "loss": 0.0487, + "step": 18790 + }, + { + "epoch": 0.089, + "grad_norm": 0.1828261762857437, + "learning_rate": 4.9735878960609385e-05, + "loss": 0.05, + "step": 18800 + }, + { + "epoch": 0.08905, + "grad_norm": 0.210920050740242, + "learning_rate": 4.973527933849596e-05, + "loss": 0.0502, + "step": 18810 + }, + { + "epoch": 0.0891, + "grad_norm": 0.15590353310108185, + "learning_rate": 4.9734679040129296e-05, + "loss": 0.0499, + "step": 18820 + }, + { + "epoch": 0.08915, + "grad_norm": 0.17563462257385254, + "learning_rate": 4.973407806552579e-05, + "loss": 0.0503, + "step": 18830 + }, + { + "epoch": 0.0892, + "grad_norm": 0.23146726191043854, + "learning_rate": 4.97334764147019e-05, + "loss": 0.0536, + "step": 18840 + }, + { + "epoch": 0.08925, + "grad_norm": 0.18807169795036316, + "learning_rate": 4.9732874087674044e-05, + "loss": 0.0483, + "step": 18850 + }, + { + "epoch": 0.0893, + "grad_norm": 0.16290698945522308, + "learning_rate": 4.9732271084458704e-05, + "loss": 0.0481, + "step": 18860 + }, + { + "epoch": 0.08935, + "grad_norm": 0.15102915465831757, + "learning_rate": 4.973166740507236e-05, + "loss": 0.0478, + "step": 18870 + }, + { + "epoch": 0.0894, + "grad_norm": 0.17469939589500427, + "learning_rate": 4.9731063049531527e-05, + "loss": 0.0491, + "step": 18880 + }, + { + "epoch": 0.08945, + "grad_norm": 0.14090858399868011, + "learning_rate": 4.973045801785272e-05, + "loss": 0.0472, + "step": 18890 + }, + { + "epoch": 0.0895, + "grad_norm": 0.16017688810825348, + "learning_rate": 4.9729852310052475e-05, + "loss": 0.0488, + "step": 18900 + }, + { + "epoch": 0.08955, + "grad_norm": 0.24661116302013397, + "learning_rate": 4.9729245926147364e-05, + "loss": 0.0481, + "step": 18910 + }, + { + "epoch": 0.0896, + "grad_norm": 0.19585567712783813, + "learning_rate": 4.9728638866153965e-05, + "loss": 0.0508, + "step": 18920 + }, + { + "epoch": 0.08965, + "grad_norm": 0.1510034054517746, + "learning_rate": 4.972803113008886e-05, + "loss": 0.0456, + "step": 18930 + }, + { + "epoch": 0.0897, + "grad_norm": 0.23765966296195984, + "learning_rate": 4.972742271796868e-05, + "loss": 0.0485, + "step": 18940 + }, + { + "epoch": 0.08975, + "grad_norm": 0.16872632503509521, + "learning_rate": 4.9726813629810056e-05, + "loss": 0.0468, + "step": 18950 + }, + { + "epoch": 0.0898, + "grad_norm": 0.1598125398159027, + "learning_rate": 4.972620386562963e-05, + "loss": 0.048, + "step": 18960 + }, + { + "epoch": 0.08985, + "grad_norm": 0.2012661248445511, + "learning_rate": 4.9725593425444075e-05, + "loss": 0.0475, + "step": 18970 + }, + { + "epoch": 0.0899, + "grad_norm": 0.16737717390060425, + "learning_rate": 4.972498230927009e-05, + "loss": 0.0474, + "step": 18980 + }, + { + "epoch": 0.08995, + "grad_norm": 0.16477149724960327, + "learning_rate": 4.972437051712438e-05, + "loss": 0.0475, + "step": 18990 + }, + { + "epoch": 0.09, + "grad_norm": 0.1504310518503189, + "learning_rate": 4.972375804902366e-05, + "loss": 0.0489, + "step": 19000 + }, + { + "epoch": 0.09005, + "grad_norm": 0.16693048179149628, + "learning_rate": 4.97231449049847e-05, + "loss": 0.0472, + "step": 19010 + }, + { + "epoch": 0.0901, + "grad_norm": 0.15752717852592468, + "learning_rate": 4.9722531085024234e-05, + "loss": 0.0481, + "step": 19020 + }, + { + "epoch": 0.09015, + "grad_norm": 0.1765477955341339, + "learning_rate": 4.972191658915906e-05, + "loss": 0.0471, + "step": 19030 + }, + { + "epoch": 0.0902, + "grad_norm": 0.16440802812576294, + "learning_rate": 4.972130141740597e-05, + "loss": 0.0489, + "step": 19040 + }, + { + "epoch": 0.09025, + "grad_norm": 0.1679811179637909, + "learning_rate": 4.972068556978179e-05, + "loss": 0.0485, + "step": 19050 + }, + { + "epoch": 0.0903, + "grad_norm": 0.14989008009433746, + "learning_rate": 4.9720069046303355e-05, + "loss": 0.0478, + "step": 19060 + }, + { + "epoch": 0.09035, + "grad_norm": 0.19462595880031586, + "learning_rate": 4.971945184698751e-05, + "loss": 0.0519, + "step": 19070 + }, + { + "epoch": 0.0904, + "grad_norm": 0.2007419466972351, + "learning_rate": 4.971883397185114e-05, + "loss": 0.0481, + "step": 19080 + }, + { + "epoch": 0.09045, + "grad_norm": 0.1667453795671463, + "learning_rate": 4.9718215420911145e-05, + "loss": 0.0489, + "step": 19090 + }, + { + "epoch": 0.0905, + "grad_norm": 0.17430023849010468, + "learning_rate": 4.9717596194184426e-05, + "loss": 0.0471, + "step": 19100 + }, + { + "epoch": 0.09055, + "grad_norm": 0.18542641401290894, + "learning_rate": 4.9716976291687904e-05, + "loss": 0.048, + "step": 19110 + }, + { + "epoch": 0.0906, + "grad_norm": 0.17246194183826447, + "learning_rate": 4.9716355713438546e-05, + "loss": 0.0506, + "step": 19120 + }, + { + "epoch": 0.09065, + "grad_norm": 0.19071803987026215, + "learning_rate": 4.97157344594533e-05, + "loss": 0.0481, + "step": 19130 + }, + { + "epoch": 0.0907, + "grad_norm": 0.17150120437145233, + "learning_rate": 4.9715112529749165e-05, + "loss": 0.0511, + "step": 19140 + }, + { + "epoch": 0.09075, + "grad_norm": 0.18714603781700134, + "learning_rate": 4.971448992434313e-05, + "loss": 0.0491, + "step": 19150 + }, + { + "epoch": 0.0908, + "grad_norm": 0.1737683117389679, + "learning_rate": 4.9713866643252235e-05, + "loss": 0.0482, + "step": 19160 + }, + { + "epoch": 0.09085, + "grad_norm": 0.15026921033859253, + "learning_rate": 4.9713242686493504e-05, + "loss": 0.0482, + "step": 19170 + }, + { + "epoch": 0.0909, + "grad_norm": 0.1403944045305252, + "learning_rate": 4.9712618054084e-05, + "loss": 0.0468, + "step": 19180 + }, + { + "epoch": 0.09095, + "grad_norm": 0.17783498764038086, + "learning_rate": 4.9711992746040815e-05, + "loss": 0.0472, + "step": 19190 + }, + { + "epoch": 0.091, + "grad_norm": 0.16668634116649628, + "learning_rate": 4.9711366762381023e-05, + "loss": 0.0489, + "step": 19200 + }, + { + "epoch": 0.09105, + "grad_norm": 0.14376750588417053, + "learning_rate": 4.971074010312175e-05, + "loss": 0.0469, + "step": 19210 + }, + { + "epoch": 0.0911, + "grad_norm": 0.15709319710731506, + "learning_rate": 4.971011276828012e-05, + "loss": 0.0471, + "step": 19220 + }, + { + "epoch": 0.09115, + "grad_norm": 0.15818123519420624, + "learning_rate": 4.9709484757873296e-05, + "loss": 0.0457, + "step": 19230 + }, + { + "epoch": 0.0912, + "grad_norm": 0.20621873438358307, + "learning_rate": 4.9708856071918444e-05, + "loss": 0.0496, + "step": 19240 + }, + { + "epoch": 0.09125, + "grad_norm": 0.16149362921714783, + "learning_rate": 4.970822671043275e-05, + "loss": 0.0461, + "step": 19250 + }, + { + "epoch": 0.0913, + "grad_norm": 0.19152836501598358, + "learning_rate": 4.970759667343341e-05, + "loss": 0.0467, + "step": 19260 + }, + { + "epoch": 0.09135, + "grad_norm": 0.1924281120300293, + "learning_rate": 4.970696596093767e-05, + "loss": 0.0472, + "step": 19270 + }, + { + "epoch": 0.0914, + "grad_norm": 0.2085145264863968, + "learning_rate": 4.9706334572962754e-05, + "loss": 0.0463, + "step": 19280 + }, + { + "epoch": 0.09145, + "grad_norm": 0.2535405457019806, + "learning_rate": 4.970570250952594e-05, + "loss": 0.0542, + "step": 19290 + }, + { + "epoch": 0.0915, + "grad_norm": 0.20979748666286469, + "learning_rate": 4.9705069770644495e-05, + "loss": 0.0497, + "step": 19300 + }, + { + "epoch": 0.09155, + "grad_norm": 0.24943894147872925, + "learning_rate": 4.9704436356335726e-05, + "loss": 0.051, + "step": 19310 + }, + { + "epoch": 0.0916, + "grad_norm": 0.19635538756847382, + "learning_rate": 4.970380226661695e-05, + "loss": 0.0471, + "step": 19320 + }, + { + "epoch": 0.09165, + "grad_norm": 0.1994963139295578, + "learning_rate": 4.97031675015055e-05, + "loss": 0.0484, + "step": 19330 + }, + { + "epoch": 0.0917, + "grad_norm": 0.2685892879962921, + "learning_rate": 4.970253206101873e-05, + "loss": 0.0474, + "step": 19340 + }, + { + "epoch": 0.09175, + "grad_norm": 0.24149328470230103, + "learning_rate": 4.970189594517401e-05, + "loss": 0.052, + "step": 19350 + }, + { + "epoch": 0.0918, + "grad_norm": 0.24114196002483368, + "learning_rate": 4.9701259153988746e-05, + "loss": 0.0479, + "step": 19360 + }, + { + "epoch": 0.09185, + "grad_norm": 0.21207568049430847, + "learning_rate": 4.9700621687480326e-05, + "loss": 0.0475, + "step": 19370 + }, + { + "epoch": 0.0919, + "grad_norm": 0.21124345064163208, + "learning_rate": 4.9699983545666196e-05, + "loss": 0.0492, + "step": 19380 + }, + { + "epoch": 0.09195, + "grad_norm": 0.19619761407375336, + "learning_rate": 4.969934472856379e-05, + "loss": 0.0485, + "step": 19390 + }, + { + "epoch": 0.092, + "grad_norm": 0.18321619927883148, + "learning_rate": 4.9698705236190576e-05, + "loss": 0.0477, + "step": 19400 + }, + { + "epoch": 0.09205, + "grad_norm": 0.17496579885482788, + "learning_rate": 4.9698065068564046e-05, + "loss": 0.0511, + "step": 19410 + }, + { + "epoch": 0.0921, + "grad_norm": 0.18823853135108948, + "learning_rate": 4.9697424225701695e-05, + "loss": 0.0491, + "step": 19420 + }, + { + "epoch": 0.09215, + "grad_norm": 0.1598740518093109, + "learning_rate": 4.9696782707621034e-05, + "loss": 0.0491, + "step": 19430 + }, + { + "epoch": 0.0922, + "grad_norm": 0.21306900680065155, + "learning_rate": 4.969614051433963e-05, + "loss": 0.0517, + "step": 19440 + }, + { + "epoch": 0.09225, + "grad_norm": 0.16243909299373627, + "learning_rate": 4.969549764587501e-05, + "loss": 0.0482, + "step": 19450 + }, + { + "epoch": 0.0923, + "grad_norm": 0.17526249587535858, + "learning_rate": 4.9694854102244756e-05, + "loss": 0.0478, + "step": 19460 + }, + { + "epoch": 0.09235, + "grad_norm": 0.1852334886789322, + "learning_rate": 4.969420988346648e-05, + "loss": 0.0493, + "step": 19470 + }, + { + "epoch": 0.0924, + "grad_norm": 0.14817926287651062, + "learning_rate": 4.9693564989557784e-05, + "loss": 0.0472, + "step": 19480 + }, + { + "epoch": 0.09245, + "grad_norm": 0.15732935070991516, + "learning_rate": 4.9692919420536285e-05, + "loss": 0.0474, + "step": 19490 + }, + { + "epoch": 0.0925, + "grad_norm": 0.19525741040706635, + "learning_rate": 4.969227317641966e-05, + "loss": 0.0541, + "step": 19500 + }, + { + "epoch": 0.09255, + "grad_norm": 0.18633055686950684, + "learning_rate": 4.969162625722556e-05, + "loss": 0.0489, + "step": 19510 + }, + { + "epoch": 0.0926, + "grad_norm": 0.17439649999141693, + "learning_rate": 4.9690978662971674e-05, + "loss": 0.052, + "step": 19520 + }, + { + "epoch": 0.09265, + "grad_norm": 0.172563835978508, + "learning_rate": 4.96903303936757e-05, + "loss": 0.0501, + "step": 19530 + }, + { + "epoch": 0.0927, + "grad_norm": 0.1648489534854889, + "learning_rate": 4.968968144935538e-05, + "loss": 0.0521, + "step": 19540 + }, + { + "epoch": 0.09275, + "grad_norm": 0.17998361587524414, + "learning_rate": 4.968903183002843e-05, + "loss": 0.0495, + "step": 19550 + }, + { + "epoch": 0.0928, + "grad_norm": 0.15992969274520874, + "learning_rate": 4.968838153571263e-05, + "loss": 0.05, + "step": 19560 + }, + { + "epoch": 0.09285, + "grad_norm": 0.17172817885875702, + "learning_rate": 4.968773056642576e-05, + "loss": 0.0472, + "step": 19570 + }, + { + "epoch": 0.0929, + "grad_norm": 0.1717754751443863, + "learning_rate": 4.9687078922185614e-05, + "loss": 0.0484, + "step": 19580 + }, + { + "epoch": 0.09295, + "grad_norm": 0.17016863822937012, + "learning_rate": 4.9686426603009996e-05, + "loss": 0.05, + "step": 19590 + }, + { + "epoch": 0.093, + "grad_norm": 0.20225080847740173, + "learning_rate": 4.968577360891675e-05, + "loss": 0.052, + "step": 19600 + }, + { + "epoch": 0.09305, + "grad_norm": 0.19731438159942627, + "learning_rate": 4.968511993992373e-05, + "loss": 0.0492, + "step": 19610 + }, + { + "epoch": 0.0931, + "grad_norm": 0.16638094186782837, + "learning_rate": 4.9684465596048804e-05, + "loss": 0.0486, + "step": 19620 + }, + { + "epoch": 0.09315, + "grad_norm": 0.19387859106063843, + "learning_rate": 4.968381057730986e-05, + "loss": 0.0484, + "step": 19630 + }, + { + "epoch": 0.0932, + "grad_norm": 0.1559034287929535, + "learning_rate": 4.9683154883724815e-05, + "loss": 0.0477, + "step": 19640 + }, + { + "epoch": 0.09325, + "grad_norm": 0.16176250576972961, + "learning_rate": 4.968249851531158e-05, + "loss": 0.0528, + "step": 19650 + }, + { + "epoch": 0.0933, + "grad_norm": 0.1534813493490219, + "learning_rate": 4.9681841472088116e-05, + "loss": 0.0474, + "step": 19660 + }, + { + "epoch": 0.09335, + "grad_norm": 0.1552479863166809, + "learning_rate": 4.968118375407238e-05, + "loss": 0.0466, + "step": 19670 + }, + { + "epoch": 0.0934, + "grad_norm": 0.13360746204853058, + "learning_rate": 4.9680525361282335e-05, + "loss": 0.0476, + "step": 19680 + }, + { + "epoch": 0.09345, + "grad_norm": 0.15875092148780823, + "learning_rate": 4.9679866293736015e-05, + "loss": 0.0486, + "step": 19690 + }, + { + "epoch": 0.0935, + "grad_norm": 0.18327684700489044, + "learning_rate": 4.9679206551451415e-05, + "loss": 0.048, + "step": 19700 + }, + { + "epoch": 0.09355, + "grad_norm": 0.15746480226516724, + "learning_rate": 4.967854613444659e-05, + "loss": 0.0485, + "step": 19710 + }, + { + "epoch": 0.0936, + "grad_norm": 0.2012982964515686, + "learning_rate": 4.9677885042739575e-05, + "loss": 0.048, + "step": 19720 + }, + { + "epoch": 0.09365, + "grad_norm": 0.17083559930324554, + "learning_rate": 4.967722327634846e-05, + "loss": 0.0466, + "step": 19730 + }, + { + "epoch": 0.0937, + "grad_norm": 0.1629980057477951, + "learning_rate": 4.9676560835291324e-05, + "loss": 0.049, + "step": 19740 + }, + { + "epoch": 0.09375, + "grad_norm": 0.17821356654167175, + "learning_rate": 4.967589771958629e-05, + "loss": 0.0479, + "step": 19750 + }, + { + "epoch": 0.0938, + "grad_norm": 0.1729680448770523, + "learning_rate": 4.9675233929251486e-05, + "loss": 0.0463, + "step": 19760 + }, + { + "epoch": 0.09385, + "grad_norm": 0.13014718890190125, + "learning_rate": 4.967456946430505e-05, + "loss": 0.0476, + "step": 19770 + }, + { + "epoch": 0.0939, + "grad_norm": 0.15263806283473969, + "learning_rate": 4.9673904324765154e-05, + "loss": 0.0481, + "step": 19780 + }, + { + "epoch": 0.09395, + "grad_norm": 0.18089689314365387, + "learning_rate": 4.967323851064999e-05, + "loss": 0.0499, + "step": 19790 + }, + { + "epoch": 0.094, + "grad_norm": 0.17258678376674652, + "learning_rate": 4.9672572021977747e-05, + "loss": 0.0488, + "step": 19800 + }, + { + "epoch": 0.09405, + "grad_norm": 0.1985284686088562, + "learning_rate": 4.967190485876666e-05, + "loss": 0.0516, + "step": 19810 + }, + { + "epoch": 0.0941, + "grad_norm": 0.1818893700838089, + "learning_rate": 4.967123702103496e-05, + "loss": 0.0476, + "step": 19820 + }, + { + "epoch": 0.09415, + "grad_norm": 0.16856974363327026, + "learning_rate": 4.9670568508800905e-05, + "loss": 0.05, + "step": 19830 + }, + { + "epoch": 0.0942, + "grad_norm": 0.17095766961574554, + "learning_rate": 4.966989932208279e-05, + "loss": 0.0499, + "step": 19840 + }, + { + "epoch": 0.09425, + "grad_norm": 0.18125998973846436, + "learning_rate": 4.966922946089888e-05, + "loss": 0.05, + "step": 19850 + }, + { + "epoch": 0.0943, + "grad_norm": 0.14619998633861542, + "learning_rate": 4.966855892526751e-05, + "loss": 0.0472, + "step": 19860 + }, + { + "epoch": 0.09435, + "grad_norm": 0.17567703127861023, + "learning_rate": 4.9667887715207004e-05, + "loss": 0.0479, + "step": 19870 + }, + { + "epoch": 0.0944, + "grad_norm": 0.16288483142852783, + "learning_rate": 4.966721583073572e-05, + "loss": 0.0488, + "step": 19880 + }, + { + "epoch": 0.09445, + "grad_norm": 0.17857293784618378, + "learning_rate": 4.9666543271872017e-05, + "loss": 0.0493, + "step": 19890 + }, + { + "epoch": 0.0945, + "grad_norm": 0.1777510941028595, + "learning_rate": 4.966587003863429e-05, + "loss": 0.0471, + "step": 19900 + }, + { + "epoch": 0.09455, + "grad_norm": 0.20810148119926453, + "learning_rate": 4.9665196131040946e-05, + "loss": 0.0506, + "step": 19910 + }, + { + "epoch": 0.0946, + "grad_norm": 0.18452343344688416, + "learning_rate": 4.96645215491104e-05, + "loss": 0.0525, + "step": 19920 + }, + { + "epoch": 0.09465, + "grad_norm": 0.1990736871957779, + "learning_rate": 4.96638462928611e-05, + "loss": 0.0506, + "step": 19930 + }, + { + "epoch": 0.0947, + "grad_norm": 0.1752295345067978, + "learning_rate": 4.966317036231152e-05, + "loss": 0.048, + "step": 19940 + }, + { + "epoch": 0.09475, + "grad_norm": 0.18605153262615204, + "learning_rate": 4.966249375748012e-05, + "loss": 0.0482, + "step": 19950 + }, + { + "epoch": 0.0948, + "grad_norm": 0.17329007387161255, + "learning_rate": 4.96618164783854e-05, + "loss": 0.0484, + "step": 19960 + }, + { + "epoch": 0.09485, + "grad_norm": 0.2310418337583542, + "learning_rate": 4.966113852504589e-05, + "loss": 0.0499, + "step": 19970 + }, + { + "epoch": 0.0949, + "grad_norm": 0.14862242341041565, + "learning_rate": 4.966045989748011e-05, + "loss": 0.0472, + "step": 19980 + }, + { + "epoch": 0.09495, + "grad_norm": 0.20593217015266418, + "learning_rate": 4.965978059570663e-05, + "loss": 0.0489, + "step": 19990 + }, + { + "epoch": 0.095, + "grad_norm": 0.15908844769001007, + "learning_rate": 4.9659100619744016e-05, + "loss": 0.0468, + "step": 20000 + }, + { + "epoch": 0.09505, + "grad_norm": 0.15577799081802368, + "learning_rate": 4.965841996961084e-05, + "loss": 0.0479, + "step": 20010 + }, + { + "epoch": 0.0951, + "grad_norm": 0.20530954003334045, + "learning_rate": 4.965773864532573e-05, + "loss": 0.0483, + "step": 20020 + }, + { + "epoch": 0.09515, + "grad_norm": 0.16188617050647736, + "learning_rate": 4.965705664690732e-05, + "loss": 0.0471, + "step": 20030 + }, + { + "epoch": 0.0952, + "grad_norm": 0.1475004404783249, + "learning_rate": 4.9656373974374235e-05, + "loss": 0.0474, + "step": 20040 + }, + { + "epoch": 0.09525, + "grad_norm": 0.14627623558044434, + "learning_rate": 4.9655690627745156e-05, + "loss": 0.0475, + "step": 20050 + }, + { + "epoch": 0.0953, + "grad_norm": 0.14917166531085968, + "learning_rate": 4.965500660703875e-05, + "loss": 0.0482, + "step": 20060 + }, + { + "epoch": 0.09535, + "grad_norm": 0.16681663691997528, + "learning_rate": 4.965432191227373e-05, + "loss": 0.0502, + "step": 20070 + }, + { + "epoch": 0.0954, + "grad_norm": 0.18023552000522614, + "learning_rate": 4.965363654346881e-05, + "loss": 0.0507, + "step": 20080 + }, + { + "epoch": 0.09545, + "grad_norm": 0.14802071452140808, + "learning_rate": 4.9652950500642724e-05, + "loss": 0.0491, + "step": 20090 + }, + { + "epoch": 0.0955, + "grad_norm": 0.20631712675094604, + "learning_rate": 4.965226378381424e-05, + "loss": 0.0502, + "step": 20100 + }, + { + "epoch": 0.09555, + "grad_norm": 0.16245928406715393, + "learning_rate": 4.9651576393002124e-05, + "loss": 0.0474, + "step": 20110 + }, + { + "epoch": 0.0956, + "grad_norm": 0.18286171555519104, + "learning_rate": 4.965088832822517e-05, + "loss": 0.0484, + "step": 20120 + }, + { + "epoch": 0.09565, + "grad_norm": 0.1389743834733963, + "learning_rate": 4.965019958950219e-05, + "loss": 0.0474, + "step": 20130 + }, + { + "epoch": 0.0957, + "grad_norm": 0.15614444017410278, + "learning_rate": 4.9649510176852016e-05, + "loss": 0.0488, + "step": 20140 + }, + { + "epoch": 0.09575, + "grad_norm": 0.15054257214069366, + "learning_rate": 4.964882009029349e-05, + "loss": 0.047, + "step": 20150 + }, + { + "epoch": 0.0958, + "grad_norm": 0.14017704129219055, + "learning_rate": 4.964812932984548e-05, + "loss": 0.0482, + "step": 20160 + }, + { + "epoch": 0.09585, + "grad_norm": 0.17021861672401428, + "learning_rate": 4.964743789552688e-05, + "loss": 0.0473, + "step": 20170 + }, + { + "epoch": 0.0959, + "grad_norm": 0.16860070824623108, + "learning_rate": 4.964674578735659e-05, + "loss": 0.0487, + "step": 20180 + }, + { + "epoch": 0.09595, + "grad_norm": 0.1646297574043274, + "learning_rate": 4.964605300535353e-05, + "loss": 0.0475, + "step": 20190 + }, + { + "epoch": 0.096, + "grad_norm": 0.14261919260025024, + "learning_rate": 4.964535954953663e-05, + "loss": 0.047, + "step": 20200 + }, + { + "epoch": 0.09605, + "grad_norm": 0.1402837634086609, + "learning_rate": 4.9644665419924864e-05, + "loss": 0.0445, + "step": 20210 + }, + { + "epoch": 0.0961, + "grad_norm": 0.17717242240905762, + "learning_rate": 4.96439706165372e-05, + "loss": 0.045, + "step": 20220 + }, + { + "epoch": 0.09615, + "grad_norm": 0.16690470278263092, + "learning_rate": 4.9643275139392646e-05, + "loss": 0.0456, + "step": 20230 + }, + { + "epoch": 0.0962, + "grad_norm": 0.20035287737846375, + "learning_rate": 4.96425789885102e-05, + "loss": 0.0485, + "step": 20240 + }, + { + "epoch": 0.09625, + "grad_norm": 0.1991281360387802, + "learning_rate": 4.964188216390891e-05, + "loss": 0.0506, + "step": 20250 + }, + { + "epoch": 0.0963, + "grad_norm": 0.20741082727909088, + "learning_rate": 4.964118466560782e-05, + "loss": 0.0516, + "step": 20260 + }, + { + "epoch": 0.09635, + "grad_norm": 0.18098628520965576, + "learning_rate": 4.9640486493625996e-05, + "loss": 0.0476, + "step": 20270 + }, + { + "epoch": 0.0964, + "grad_norm": 0.16041788458824158, + "learning_rate": 4.9639787647982525e-05, + "loss": 0.0481, + "step": 20280 + }, + { + "epoch": 0.09645, + "grad_norm": 0.15006977319717407, + "learning_rate": 4.963908812869652e-05, + "loss": 0.0477, + "step": 20290 + }, + { + "epoch": 0.0965, + "grad_norm": 0.137539803981781, + "learning_rate": 4.963838793578709e-05, + "loss": 0.0466, + "step": 20300 + }, + { + "epoch": 0.09655, + "grad_norm": 0.1478036642074585, + "learning_rate": 4.96376870692734e-05, + "loss": 0.0462, + "step": 20310 + }, + { + "epoch": 0.0966, + "grad_norm": 0.1680741310119629, + "learning_rate": 4.963698552917461e-05, + "loss": 0.0463, + "step": 20320 + }, + { + "epoch": 0.09665, + "grad_norm": 0.141627699136734, + "learning_rate": 4.963628331550988e-05, + "loss": 0.0454, + "step": 20330 + }, + { + "epoch": 0.0967, + "grad_norm": 0.17239850759506226, + "learning_rate": 4.963558042829842e-05, + "loss": 0.0505, + "step": 20340 + }, + { + "epoch": 0.09675, + "grad_norm": 0.17625150084495544, + "learning_rate": 4.9634876867559445e-05, + "loss": 0.0454, + "step": 20350 + }, + { + "epoch": 0.0968, + "grad_norm": 0.23776133358478546, + "learning_rate": 4.963417263331219e-05, + "loss": 0.0482, + "step": 20360 + }, + { + "epoch": 0.09685, + "grad_norm": 0.20735669136047363, + "learning_rate": 4.963346772557592e-05, + "loss": 0.0483, + "step": 20370 + }, + { + "epoch": 0.0969, + "grad_norm": 0.18635834753513336, + "learning_rate": 4.9632762144369894e-05, + "loss": 0.0464, + "step": 20380 + }, + { + "epoch": 0.09695, + "grad_norm": 0.1918153315782547, + "learning_rate": 4.9632055889713405e-05, + "loss": 0.0472, + "step": 20390 + }, + { + "epoch": 0.097, + "grad_norm": 0.18084578216075897, + "learning_rate": 4.9631348961625756e-05, + "loss": 0.0457, + "step": 20400 + }, + { + "epoch": 0.09705, + "grad_norm": 0.16984711587429047, + "learning_rate": 4.963064136012629e-05, + "loss": 0.0496, + "step": 20410 + }, + { + "epoch": 0.0971, + "grad_norm": 0.18165351450443268, + "learning_rate": 4.9629933085234324e-05, + "loss": 0.0471, + "step": 20420 + }, + { + "epoch": 0.09715, + "grad_norm": 0.18519999086856842, + "learning_rate": 4.9629224136969254e-05, + "loss": 0.0478, + "step": 20430 + }, + { + "epoch": 0.0972, + "grad_norm": 0.1832815706729889, + "learning_rate": 4.962851451535044e-05, + "loss": 0.0478, + "step": 20440 + }, + { + "epoch": 0.09725, + "grad_norm": 0.18598419427871704, + "learning_rate": 4.9627804220397306e-05, + "loss": 0.0482, + "step": 20450 + }, + { + "epoch": 0.0973, + "grad_norm": 0.16671109199523926, + "learning_rate": 4.962709325212925e-05, + "loss": 0.0467, + "step": 20460 + }, + { + "epoch": 0.09735, + "grad_norm": 0.14696188271045685, + "learning_rate": 4.9626381610565714e-05, + "loss": 0.0486, + "step": 20470 + }, + { + "epoch": 0.0974, + "grad_norm": 0.15696905553340912, + "learning_rate": 4.962566929572616e-05, + "loss": 0.0506, + "step": 20480 + }, + { + "epoch": 0.09745, + "grad_norm": 0.18372030556201935, + "learning_rate": 4.9624956307630054e-05, + "loss": 0.0471, + "step": 20490 + }, + { + "epoch": 0.0975, + "grad_norm": 0.18625737726688385, + "learning_rate": 4.96242426462969e-05, + "loss": 0.0475, + "step": 20500 + }, + { + "epoch": 0.09755, + "grad_norm": 0.15121020376682281, + "learning_rate": 4.96235283117462e-05, + "loss": 0.0491, + "step": 20510 + }, + { + "epoch": 0.0976, + "grad_norm": 0.16401106119155884, + "learning_rate": 4.9622813303997486e-05, + "loss": 0.0469, + "step": 20520 + }, + { + "epoch": 0.09765, + "grad_norm": 0.1643715500831604, + "learning_rate": 4.9622097623070306e-05, + "loss": 0.0464, + "step": 20530 + }, + { + "epoch": 0.0977, + "grad_norm": 0.1850992739200592, + "learning_rate": 4.9621381268984224e-05, + "loss": 0.0483, + "step": 20540 + }, + { + "epoch": 0.09775, + "grad_norm": 0.15636497735977173, + "learning_rate": 4.9620664241758835e-05, + "loss": 0.0493, + "step": 20550 + }, + { + "epoch": 0.0978, + "grad_norm": 0.17449866235256195, + "learning_rate": 4.961994654141373e-05, + "loss": 0.0483, + "step": 20560 + }, + { + "epoch": 0.09785, + "grad_norm": 0.17442123591899872, + "learning_rate": 4.961922816796854e-05, + "loss": 0.0467, + "step": 20570 + }, + { + "epoch": 0.0979, + "grad_norm": 0.16407912969589233, + "learning_rate": 4.96185091214429e-05, + "loss": 0.0465, + "step": 20580 + }, + { + "epoch": 0.09795, + "grad_norm": 0.15191154181957245, + "learning_rate": 4.961778940185647e-05, + "loss": 0.0483, + "step": 20590 + }, + { + "epoch": 0.098, + "grad_norm": 0.17933468520641327, + "learning_rate": 4.9617069009228924e-05, + "loss": 0.0454, + "step": 20600 + }, + { + "epoch": 0.09805, + "grad_norm": 0.1651156097650528, + "learning_rate": 4.9616347943579955e-05, + "loss": 0.0482, + "step": 20610 + }, + { + "epoch": 0.0981, + "grad_norm": 0.14247684180736542, + "learning_rate": 4.9615626204929285e-05, + "loss": 0.0484, + "step": 20620 + }, + { + "epoch": 0.09815, + "grad_norm": 0.1793159544467926, + "learning_rate": 4.9614903793296655e-05, + "loss": 0.0483, + "step": 20630 + }, + { + "epoch": 0.0982, + "grad_norm": 0.1951449066400528, + "learning_rate": 4.961418070870178e-05, + "loss": 0.0483, + "step": 20640 + }, + { + "epoch": 0.09825, + "grad_norm": 0.16701920330524445, + "learning_rate": 4.961345695116447e-05, + "loss": 0.0481, + "step": 20650 + }, + { + "epoch": 0.0983, + "grad_norm": 0.18166546523571014, + "learning_rate": 4.9612732520704486e-05, + "loss": 0.0468, + "step": 20660 + }, + { + "epoch": 0.09835, + "grad_norm": 0.17654158174991608, + "learning_rate": 4.9612007417341635e-05, + "loss": 0.0519, + "step": 20670 + }, + { + "epoch": 0.0984, + "grad_norm": 0.17844170331954956, + "learning_rate": 4.9611281641095757e-05, + "loss": 0.0506, + "step": 20680 + }, + { + "epoch": 0.09845, + "grad_norm": 0.1816539615392685, + "learning_rate": 4.9610555191986685e-05, + "loss": 0.0523, + "step": 20690 + }, + { + "epoch": 0.0985, + "grad_norm": 0.1649259477853775, + "learning_rate": 4.9609828070034274e-05, + "loss": 0.0479, + "step": 20700 + }, + { + "epoch": 0.09855, + "grad_norm": 0.13955436646938324, + "learning_rate": 4.960910027525841e-05, + "loss": 0.0491, + "step": 20710 + }, + { + "epoch": 0.0986, + "grad_norm": 0.13358356058597565, + "learning_rate": 4.9608371807679e-05, + "loss": 0.0471, + "step": 20720 + }, + { + "epoch": 0.09865, + "grad_norm": 0.14095593988895416, + "learning_rate": 4.960764266731593e-05, + "loss": 0.0468, + "step": 20730 + }, + { + "epoch": 0.0987, + "grad_norm": 0.16070382297039032, + "learning_rate": 4.960691285418918e-05, + "loss": 0.0479, + "step": 20740 + }, + { + "epoch": 0.09875, + "grad_norm": 0.13572952151298523, + "learning_rate": 4.9606182368318654e-05, + "loss": 0.0496, + "step": 20750 + }, + { + "epoch": 0.0988, + "grad_norm": 0.17128685116767883, + "learning_rate": 4.960545120972436e-05, + "loss": 0.0461, + "step": 20760 + }, + { + "epoch": 0.09885, + "grad_norm": 0.16364675760269165, + "learning_rate": 4.960471937842627e-05, + "loss": 0.0499, + "step": 20770 + }, + { + "epoch": 0.0989, + "grad_norm": 0.21058125793933868, + "learning_rate": 4.9603986874444393e-05, + "loss": 0.0474, + "step": 20780 + }, + { + "epoch": 0.09895, + "grad_norm": 0.16382066905498505, + "learning_rate": 4.960325369779876e-05, + "loss": 0.0479, + "step": 20790 + }, + { + "epoch": 0.099, + "grad_norm": 0.16887398064136505, + "learning_rate": 4.960251984850941e-05, + "loss": 0.047, + "step": 20800 + }, + { + "epoch": 0.09905, + "grad_norm": 0.16802550852298737, + "learning_rate": 4.960178532659642e-05, + "loss": 0.0478, + "step": 20810 + }, + { + "epoch": 0.0991, + "grad_norm": 0.14457744359970093, + "learning_rate": 4.960105013207985e-05, + "loss": 0.047, + "step": 20820 + }, + { + "epoch": 0.09915, + "grad_norm": 0.1769077330827713, + "learning_rate": 4.960031426497982e-05, + "loss": 0.0459, + "step": 20830 + }, + { + "epoch": 0.0992, + "grad_norm": 0.19485503435134888, + "learning_rate": 4.959957772531643e-05, + "loss": 0.0485, + "step": 20840 + }, + { + "epoch": 0.09925, + "grad_norm": 0.17297333478927612, + "learning_rate": 4.959884051310983e-05, + "loss": 0.0462, + "step": 20850 + }, + { + "epoch": 0.0993, + "grad_norm": 0.14237897098064423, + "learning_rate": 4.959810262838018e-05, + "loss": 0.0476, + "step": 20860 + }, + { + "epoch": 0.09935, + "grad_norm": 0.17594468593597412, + "learning_rate": 4.959736407114764e-05, + "loss": 0.0462, + "step": 20870 + }, + { + "epoch": 0.0994, + "grad_norm": 0.17022374272346497, + "learning_rate": 4.9596624841432404e-05, + "loss": 0.0469, + "step": 20880 + }, + { + "epoch": 0.09945, + "grad_norm": 0.14756067097187042, + "learning_rate": 4.959588493925469e-05, + "loss": 0.049, + "step": 20890 + }, + { + "epoch": 0.0995, + "grad_norm": 0.18401427567005157, + "learning_rate": 4.959514436463473e-05, + "loss": 0.0491, + "step": 20900 + }, + { + "epoch": 0.09955, + "grad_norm": 0.15568624436855316, + "learning_rate": 4.9594403117592746e-05, + "loss": 0.0456, + "step": 20910 + }, + { + "epoch": 0.0996, + "grad_norm": 0.18284066021442413, + "learning_rate": 4.959366119814903e-05, + "loss": 0.0474, + "step": 20920 + }, + { + "epoch": 0.09965, + "grad_norm": 0.179499551653862, + "learning_rate": 4.9592918606323856e-05, + "loss": 0.0475, + "step": 20930 + }, + { + "epoch": 0.0997, + "grad_norm": 0.16402794420719147, + "learning_rate": 4.959217534213753e-05, + "loss": 0.049, + "step": 20940 + }, + { + "epoch": 0.09975, + "grad_norm": 0.16333773732185364, + "learning_rate": 4.959143140561036e-05, + "loss": 0.0472, + "step": 20950 + }, + { + "epoch": 0.0998, + "grad_norm": 0.1984231323003769, + "learning_rate": 4.9590686796762695e-05, + "loss": 0.0505, + "step": 20960 + }, + { + "epoch": 0.09985, + "grad_norm": 0.14446966350078583, + "learning_rate": 4.958994151561489e-05, + "loss": 0.0457, + "step": 20970 + }, + { + "epoch": 0.0999, + "grad_norm": 0.15792179107666016, + "learning_rate": 4.958919556218733e-05, + "loss": 0.0461, + "step": 20980 + }, + { + "epoch": 0.09995, + "grad_norm": 0.1875765323638916, + "learning_rate": 4.9588448936500395e-05, + "loss": 0.0473, + "step": 20990 + }, + { + "epoch": 0.1, + "grad_norm": 0.16804379224777222, + "learning_rate": 4.958770163857451e-05, + "loss": 0.0458, + "step": 21000 + }, + { + "epoch": 0.10005, + "grad_norm": 0.17551161348819733, + "learning_rate": 4.958695366843009e-05, + "loss": 0.0473, + "step": 21010 + }, + { + "epoch": 0.1001, + "grad_norm": 0.19064000248908997, + "learning_rate": 4.95862050260876e-05, + "loss": 0.0487, + "step": 21020 + }, + { + "epoch": 0.10015, + "grad_norm": 0.17545157670974731, + "learning_rate": 4.9585455711567495e-05, + "loss": 0.0475, + "step": 21030 + }, + { + "epoch": 0.1002, + "grad_norm": 0.13496017456054688, + "learning_rate": 4.958470572489028e-05, + "loss": 0.0455, + "step": 21040 + }, + { + "epoch": 0.10025, + "grad_norm": 0.1735667735338211, + "learning_rate": 4.958395506607644e-05, + "loss": 0.0461, + "step": 21050 + }, + { + "epoch": 0.1003, + "grad_norm": 0.15615443885326385, + "learning_rate": 4.95832037351465e-05, + "loss": 0.0486, + "step": 21060 + }, + { + "epoch": 0.10035, + "grad_norm": 0.2032114714384079, + "learning_rate": 4.9582451732121e-05, + "loss": 0.0467, + "step": 21070 + }, + { + "epoch": 0.1004, + "grad_norm": 0.173023521900177, + "learning_rate": 4.958169905702052e-05, + "loss": 0.0495, + "step": 21080 + }, + { + "epoch": 0.10045, + "grad_norm": 0.1550341099500656, + "learning_rate": 4.958094570986561e-05, + "loss": 0.0481, + "step": 21090 + }, + { + "epoch": 0.1005, + "grad_norm": 0.16095350682735443, + "learning_rate": 4.958019169067689e-05, + "loss": 0.0457, + "step": 21100 + }, + { + "epoch": 0.10055, + "grad_norm": 0.16583847999572754, + "learning_rate": 4.957943699947496e-05, + "loss": 0.0471, + "step": 21110 + }, + { + "epoch": 0.1006, + "grad_norm": 0.13893641531467438, + "learning_rate": 4.957868163628045e-05, + "loss": 0.0467, + "step": 21120 + }, + { + "epoch": 0.10065, + "grad_norm": 0.1380900740623474, + "learning_rate": 4.957792560111403e-05, + "loss": 0.0466, + "step": 21130 + }, + { + "epoch": 0.1007, + "grad_norm": 0.19369691610336304, + "learning_rate": 4.9577168893996346e-05, + "loss": 0.0469, + "step": 21140 + }, + { + "epoch": 0.10075, + "grad_norm": 0.12918607890605927, + "learning_rate": 4.9576411514948095e-05, + "loss": 0.046, + "step": 21150 + }, + { + "epoch": 0.1008, + "grad_norm": 0.16536854207515717, + "learning_rate": 4.957565346399e-05, + "loss": 0.0496, + "step": 21160 + }, + { + "epoch": 0.10085, + "grad_norm": 0.15015877783298492, + "learning_rate": 4.9574894741142765e-05, + "loss": 0.0468, + "step": 21170 + }, + { + "epoch": 0.1009, + "grad_norm": 0.1411079466342926, + "learning_rate": 4.957413534642714e-05, + "loss": 0.0456, + "step": 21180 + }, + { + "epoch": 0.10095, + "grad_norm": 0.12581391632556915, + "learning_rate": 4.957337527986389e-05, + "loss": 0.0453, + "step": 21190 + }, + { + "epoch": 0.101, + "grad_norm": 0.1657503843307495, + "learning_rate": 4.9572614541473786e-05, + "loss": 0.0476, + "step": 21200 + }, + { + "epoch": 0.10105, + "grad_norm": 0.13735216856002808, + "learning_rate": 4.957185313127763e-05, + "loss": 0.0469, + "step": 21210 + }, + { + "epoch": 0.1011, + "grad_norm": 0.15780487656593323, + "learning_rate": 4.9571091049296246e-05, + "loss": 0.0448, + "step": 21220 + }, + { + "epoch": 0.10115, + "grad_norm": 0.18495288491249084, + "learning_rate": 4.957032829555046e-05, + "loss": 0.051, + "step": 21230 + }, + { + "epoch": 0.1012, + "grad_norm": 0.16596247255802155, + "learning_rate": 4.956956487006114e-05, + "loss": 0.0447, + "step": 21240 + }, + { + "epoch": 0.10125, + "grad_norm": 0.169532909989357, + "learning_rate": 4.9568800772849136e-05, + "loss": 0.045, + "step": 21250 + }, + { + "epoch": 0.1013, + "grad_norm": 0.20555630326271057, + "learning_rate": 4.9568036003935344e-05, + "loss": 0.0473, + "step": 21260 + }, + { + "epoch": 0.10135, + "grad_norm": 0.17142048478126526, + "learning_rate": 4.956727056334068e-05, + "loss": 0.0476, + "step": 21270 + }, + { + "epoch": 0.1014, + "grad_norm": 0.1793658286333084, + "learning_rate": 4.956650445108608e-05, + "loss": 0.0468, + "step": 21280 + }, + { + "epoch": 0.10145, + "grad_norm": 0.1630728393793106, + "learning_rate": 4.956573766719247e-05, + "loss": 0.0474, + "step": 21290 + }, + { + "epoch": 0.1015, + "grad_norm": 0.16996127367019653, + "learning_rate": 4.956497021168082e-05, + "loss": 0.0459, + "step": 21300 + }, + { + "epoch": 0.10155, + "grad_norm": 0.16866938769817352, + "learning_rate": 4.9564202084572114e-05, + "loss": 0.0459, + "step": 21310 + }, + { + "epoch": 0.1016, + "grad_norm": 0.1557689607143402, + "learning_rate": 4.956343328588735e-05, + "loss": 0.0473, + "step": 21320 + }, + { + "epoch": 0.10165, + "grad_norm": 0.1726214587688446, + "learning_rate": 4.9562663815647555e-05, + "loss": 0.0456, + "step": 21330 + }, + { + "epoch": 0.1017, + "grad_norm": 0.1973710060119629, + "learning_rate": 4.956189367387375e-05, + "loss": 0.0473, + "step": 21340 + }, + { + "epoch": 0.10175, + "grad_norm": 0.17411907017230988, + "learning_rate": 4.956112286058701e-05, + "loss": 0.0452, + "step": 21350 + }, + { + "epoch": 0.1018, + "grad_norm": 0.18137136101722717, + "learning_rate": 4.9560351375808386e-05, + "loss": 0.0469, + "step": 21360 + }, + { + "epoch": 0.10185, + "grad_norm": 0.20940333604812622, + "learning_rate": 4.9559579219558985e-05, + "loss": 0.0484, + "step": 21370 + }, + { + "epoch": 0.1019, + "grad_norm": 0.16980507969856262, + "learning_rate": 4.9558806391859925e-05, + "loss": 0.0464, + "step": 21380 + }, + { + "epoch": 0.10195, + "grad_norm": 0.20325854420661926, + "learning_rate": 4.955803289273231e-05, + "loss": 0.0465, + "step": 21390 + }, + { + "epoch": 0.102, + "grad_norm": 0.17460691928863525, + "learning_rate": 4.9557258722197305e-05, + "loss": 0.0482, + "step": 21400 + }, + { + "epoch": 0.10205, + "grad_norm": 0.18925505876541138, + "learning_rate": 4.955648388027608e-05, + "loss": 0.0472, + "step": 21410 + }, + { + "epoch": 0.1021, + "grad_norm": 0.2193133533000946, + "learning_rate": 4.9555708366989804e-05, + "loss": 0.0469, + "step": 21420 + }, + { + "epoch": 0.10215, + "grad_norm": 0.1568610519170761, + "learning_rate": 4.955493218235969e-05, + "loss": 0.0475, + "step": 21430 + }, + { + "epoch": 0.1022, + "grad_norm": 0.18509572744369507, + "learning_rate": 4.9554155326406956e-05, + "loss": 0.0514, + "step": 21440 + }, + { + "epoch": 0.10225, + "grad_norm": 0.20055852830410004, + "learning_rate": 4.955337779915285e-05, + "loss": 0.0488, + "step": 21450 + }, + { + "epoch": 0.1023, + "grad_norm": 0.22387243807315826, + "learning_rate": 4.9552599600618596e-05, + "loss": 0.0503, + "step": 21460 + }, + { + "epoch": 0.10235, + "grad_norm": 0.16432657837867737, + "learning_rate": 4.955182073082551e-05, + "loss": 0.048, + "step": 21470 + }, + { + "epoch": 0.1024, + "grad_norm": 0.169547900557518, + "learning_rate": 4.955104118979487e-05, + "loss": 0.0486, + "step": 21480 + }, + { + "epoch": 0.10245, + "grad_norm": 0.19012103974819183, + "learning_rate": 4.9550260977547974e-05, + "loss": 0.0473, + "step": 21490 + }, + { + "epoch": 0.1025, + "grad_norm": 0.18030716478824615, + "learning_rate": 4.954948009410617e-05, + "loss": 0.048, + "step": 21500 + }, + { + "epoch": 0.10255, + "grad_norm": 0.16843658685684204, + "learning_rate": 4.954869853949081e-05, + "loss": 0.0467, + "step": 21510 + }, + { + "epoch": 0.1026, + "grad_norm": 0.18063050508499146, + "learning_rate": 4.9547916313723254e-05, + "loss": 0.0467, + "step": 21520 + }, + { + "epoch": 0.10265, + "grad_norm": 0.16403870284557343, + "learning_rate": 4.9547133416824875e-05, + "loss": 0.0476, + "step": 21530 + }, + { + "epoch": 0.1027, + "grad_norm": 0.1540970355272293, + "learning_rate": 4.954634984881711e-05, + "loss": 0.0481, + "step": 21540 + }, + { + "epoch": 0.10275, + "grad_norm": 0.1952674686908722, + "learning_rate": 4.9545565609721346e-05, + "loss": 0.0486, + "step": 21550 + }, + { + "epoch": 0.1028, + "grad_norm": 0.17950768768787384, + "learning_rate": 4.954478069955905e-05, + "loss": 0.047, + "step": 21560 + }, + { + "epoch": 0.10285, + "grad_norm": 0.17310073971748352, + "learning_rate": 4.954399511835166e-05, + "loss": 0.046, + "step": 21570 + }, + { + "epoch": 0.1029, + "grad_norm": 0.16321447491645813, + "learning_rate": 4.954320886612067e-05, + "loss": 0.0474, + "step": 21580 + }, + { + "epoch": 0.10295, + "grad_norm": 0.16060872375965118, + "learning_rate": 4.954242194288757e-05, + "loss": 0.049, + "step": 21590 + }, + { + "epoch": 0.103, + "grad_norm": 0.1784420907497406, + "learning_rate": 4.9541634348673875e-05, + "loss": 0.047, + "step": 21600 + }, + { + "epoch": 0.10305, + "grad_norm": 0.17316702008247375, + "learning_rate": 4.9540846083501115e-05, + "loss": 0.0521, + "step": 21610 + }, + { + "epoch": 0.1031, + "grad_norm": 0.19821666181087494, + "learning_rate": 4.954005714739085e-05, + "loss": 0.0487, + "step": 21620 + }, + { + "epoch": 0.10315, + "grad_norm": 0.16755364835262299, + "learning_rate": 4.953926754036463e-05, + "loss": 0.05, + "step": 21630 + }, + { + "epoch": 0.1032, + "grad_norm": 0.1867600977420807, + "learning_rate": 4.953847726244406e-05, + "loss": 0.0486, + "step": 21640 + }, + { + "epoch": 0.10325, + "grad_norm": 0.17544685304164886, + "learning_rate": 4.9537686313650744e-05, + "loss": 0.0488, + "step": 21650 + }, + { + "epoch": 0.1033, + "grad_norm": 0.14893613755702972, + "learning_rate": 4.9536894694006295e-05, + "loss": 0.0482, + "step": 21660 + }, + { + "epoch": 0.10335, + "grad_norm": 0.1764899045228958, + "learning_rate": 4.953610240353237e-05, + "loss": 0.0495, + "step": 21670 + }, + { + "epoch": 0.1034, + "grad_norm": 0.1684151291847229, + "learning_rate": 4.9535309442250624e-05, + "loss": 0.0497, + "step": 21680 + }, + { + "epoch": 0.10345, + "grad_norm": 0.16186705231666565, + "learning_rate": 4.9534515810182724e-05, + "loss": 0.0486, + "step": 21690 + }, + { + "epoch": 0.1035, + "grad_norm": 0.17481642961502075, + "learning_rate": 4.9533721507350395e-05, + "loss": 0.0467, + "step": 21700 + }, + { + "epoch": 0.10355, + "grad_norm": 0.1745694875717163, + "learning_rate": 4.953292653377533e-05, + "loss": 0.05, + "step": 21710 + }, + { + "epoch": 0.1036, + "grad_norm": 0.17581479251384735, + "learning_rate": 4.953213088947926e-05, + "loss": 0.0479, + "step": 21720 + }, + { + "epoch": 0.10365, + "grad_norm": 0.21984413266181946, + "learning_rate": 4.953133457448395e-05, + "loss": 0.0555, + "step": 21730 + }, + { + "epoch": 0.1037, + "grad_norm": 0.20761926472187042, + "learning_rate": 4.953053758881119e-05, + "loss": 0.047, + "step": 21740 + }, + { + "epoch": 0.10375, + "grad_norm": 0.1760006844997406, + "learning_rate": 4.952973993248273e-05, + "loss": 0.0501, + "step": 21750 + }, + { + "epoch": 0.1038, + "grad_norm": 0.14995700120925903, + "learning_rate": 4.952894160552039e-05, + "loss": 0.0475, + "step": 21760 + }, + { + "epoch": 0.10385, + "grad_norm": 0.18868878483772278, + "learning_rate": 4.952814260794602e-05, + "loss": 0.0494, + "step": 21770 + }, + { + "epoch": 0.1039, + "grad_norm": 0.1648269146680832, + "learning_rate": 4.9527342939781426e-05, + "loss": 0.048, + "step": 21780 + }, + { + "epoch": 0.10395, + "grad_norm": 0.16374856233596802, + "learning_rate": 4.952654260104851e-05, + "loss": 0.0517, + "step": 21790 + }, + { + "epoch": 0.104, + "grad_norm": 0.1632968932390213, + "learning_rate": 4.952574159176912e-05, + "loss": 0.0502, + "step": 21800 + }, + { + "epoch": 0.10405, + "grad_norm": 0.15796631574630737, + "learning_rate": 4.9524939911965176e-05, + "loss": 0.0479, + "step": 21810 + }, + { + "epoch": 0.1041, + "grad_norm": 0.16355274617671967, + "learning_rate": 4.9524137561658586e-05, + "loss": 0.049, + "step": 21820 + }, + { + "epoch": 0.10415, + "grad_norm": 0.19500236213207245, + "learning_rate": 4.952333454087128e-05, + "loss": 0.0476, + "step": 21830 + }, + { + "epoch": 0.1042, + "grad_norm": 0.1601325124502182, + "learning_rate": 4.9522530849625235e-05, + "loss": 0.0463, + "step": 21840 + }, + { + "epoch": 0.10425, + "grad_norm": 0.14225871860980988, + "learning_rate": 4.95217264879424e-05, + "loss": 0.0469, + "step": 21850 + }, + { + "epoch": 0.1043, + "grad_norm": 0.14632344245910645, + "learning_rate": 4.952092145584478e-05, + "loss": 0.0511, + "step": 21860 + }, + { + "epoch": 0.10435, + "grad_norm": 0.13829264044761658, + "learning_rate": 4.952011575335438e-05, + "loss": 0.0462, + "step": 21870 + }, + { + "epoch": 0.1044, + "grad_norm": 0.15746837854385376, + "learning_rate": 4.951930938049322e-05, + "loss": 0.0472, + "step": 21880 + }, + { + "epoch": 0.10445, + "grad_norm": 0.15343759953975677, + "learning_rate": 4.951850233728336e-05, + "loss": 0.0462, + "step": 21890 + }, + { + "epoch": 0.1045, + "grad_norm": 0.1506185531616211, + "learning_rate": 4.9517694623746855e-05, + "loss": 0.0467, + "step": 21900 + }, + { + "epoch": 0.10455, + "grad_norm": 0.13890409469604492, + "learning_rate": 4.9516886239905794e-05, + "loss": 0.0459, + "step": 21910 + }, + { + "epoch": 0.1046, + "grad_norm": 0.15097202360630035, + "learning_rate": 4.951607718578226e-05, + "loss": 0.0463, + "step": 21920 + }, + { + "epoch": 0.10465, + "grad_norm": 0.14566144347190857, + "learning_rate": 4.9515267461398396e-05, + "loss": 0.0481, + "step": 21930 + }, + { + "epoch": 0.1047, + "grad_norm": 0.11933682858943939, + "learning_rate": 4.9514457066776334e-05, + "loss": 0.0481, + "step": 21940 + }, + { + "epoch": 0.10475, + "grad_norm": 0.13602997362613678, + "learning_rate": 4.951364600193822e-05, + "loss": 0.044, + "step": 21950 + }, + { + "epoch": 0.1048, + "grad_norm": 0.13590200245380402, + "learning_rate": 4.951283426690623e-05, + "loss": 0.0455, + "step": 21960 + }, + { + "epoch": 0.10485, + "grad_norm": 0.15556122362613678, + "learning_rate": 4.951202186170257e-05, + "loss": 0.0462, + "step": 21970 + }, + { + "epoch": 0.1049, + "grad_norm": 0.2523100972175598, + "learning_rate": 4.951120878634943e-05, + "loss": 0.0474, + "step": 21980 + }, + { + "epoch": 0.10495, + "grad_norm": 0.14426535367965698, + "learning_rate": 4.9510395040869054e-05, + "loss": 0.0466, + "step": 21990 + }, + { + "epoch": 0.105, + "grad_norm": 0.15779918432235718, + "learning_rate": 4.950958062528369e-05, + "loss": 0.0495, + "step": 22000 + }, + { + "epoch": 0.10505, + "grad_norm": 0.1777641624212265, + "learning_rate": 4.95087655396156e-05, + "loss": 0.0462, + "step": 22010 + }, + { + "epoch": 0.1051, + "grad_norm": 0.1580272912979126, + "learning_rate": 4.950794978388706e-05, + "loss": 0.0468, + "step": 22020 + }, + { + "epoch": 0.10515, + "grad_norm": 0.14673081040382385, + "learning_rate": 4.950713335812038e-05, + "loss": 0.0484, + "step": 22030 + }, + { + "epoch": 0.1052, + "grad_norm": 0.1237303763628006, + "learning_rate": 4.9506316262337896e-05, + "loss": 0.0462, + "step": 22040 + }, + { + "epoch": 0.10525, + "grad_norm": 0.1669149547815323, + "learning_rate": 4.950549849656192e-05, + "loss": 0.0468, + "step": 22050 + }, + { + "epoch": 0.1053, + "grad_norm": 0.15633957087993622, + "learning_rate": 4.9504680060814823e-05, + "loss": 0.0452, + "step": 22060 + }, + { + "epoch": 0.10535, + "grad_norm": 0.16523058712482452, + "learning_rate": 4.9503860955118976e-05, + "loss": 0.0507, + "step": 22070 + }, + { + "epoch": 0.1054, + "grad_norm": 0.1778353899717331, + "learning_rate": 4.950304117949678e-05, + "loss": 0.0492, + "step": 22080 + }, + { + "epoch": 0.10545, + "grad_norm": 0.18347936868667603, + "learning_rate": 4.950222073397064e-05, + "loss": 0.0489, + "step": 22090 + }, + { + "epoch": 0.1055, + "grad_norm": 0.20095577836036682, + "learning_rate": 4.950139961856299e-05, + "loss": 0.0491, + "step": 22100 + }, + { + "epoch": 0.10555, + "grad_norm": 0.14817671477794647, + "learning_rate": 4.9500577833296284e-05, + "loss": 0.0481, + "step": 22110 + }, + { + "epoch": 0.1056, + "grad_norm": 0.17643176019191742, + "learning_rate": 4.949975537819298e-05, + "loss": 0.0483, + "step": 22120 + }, + { + "epoch": 0.10565, + "grad_norm": 0.156997412443161, + "learning_rate": 4.949893225327558e-05, + "loss": 0.0486, + "step": 22130 + }, + { + "epoch": 0.1057, + "grad_norm": 0.14414997398853302, + "learning_rate": 4.949810845856656e-05, + "loss": 0.0478, + "step": 22140 + }, + { + "epoch": 0.10575, + "grad_norm": 0.1429523229598999, + "learning_rate": 4.949728399408846e-05, + "loss": 0.0459, + "step": 22150 + }, + { + "epoch": 0.1058, + "grad_norm": 0.16600804030895233, + "learning_rate": 4.9496458859863824e-05, + "loss": 0.0475, + "step": 22160 + }, + { + "epoch": 0.10585, + "grad_norm": 0.1708773672580719, + "learning_rate": 4.949563305591521e-05, + "loss": 0.0451, + "step": 22170 + }, + { + "epoch": 0.1059, + "grad_norm": 0.13618949055671692, + "learning_rate": 4.949480658226518e-05, + "loss": 0.0475, + "step": 22180 + }, + { + "epoch": 0.10595, + "grad_norm": 0.14604583382606506, + "learning_rate": 4.9493979438936356e-05, + "loss": 0.0464, + "step": 22190 + }, + { + "epoch": 0.106, + "grad_norm": 0.1562792807817459, + "learning_rate": 4.9493151625951326e-05, + "loss": 0.0488, + "step": 22200 + }, + { + "epoch": 0.10605, + "grad_norm": 0.1627453714609146, + "learning_rate": 4.949232314333273e-05, + "loss": 0.0472, + "step": 22210 + }, + { + "epoch": 0.1061, + "grad_norm": 0.149214968085289, + "learning_rate": 4.949149399110322e-05, + "loss": 0.0461, + "step": 22220 + }, + { + "epoch": 0.10615, + "grad_norm": 0.16553807258605957, + "learning_rate": 4.949066416928547e-05, + "loss": 0.0458, + "step": 22230 + }, + { + "epoch": 0.1062, + "grad_norm": 0.14975816011428833, + "learning_rate": 4.9489833677902164e-05, + "loss": 0.0442, + "step": 22240 + }, + { + "epoch": 0.10625, + "grad_norm": 0.14777584373950958, + "learning_rate": 4.9489002516976e-05, + "loss": 0.0473, + "step": 22250 + }, + { + "epoch": 0.1063, + "grad_norm": 0.1806098222732544, + "learning_rate": 4.9488170686529714e-05, + "loss": 0.048, + "step": 22260 + }, + { + "epoch": 0.10635, + "grad_norm": 0.17245250940322876, + "learning_rate": 4.948733818658604e-05, + "loss": 0.0505, + "step": 22270 + }, + { + "epoch": 0.1064, + "grad_norm": 0.19378966093063354, + "learning_rate": 4.9486505017167726e-05, + "loss": 0.0458, + "step": 22280 + }, + { + "epoch": 0.10645, + "grad_norm": 0.2093646377325058, + "learning_rate": 4.9485671178297576e-05, + "loss": 0.0481, + "step": 22290 + }, + { + "epoch": 0.1065, + "grad_norm": 0.16001328825950623, + "learning_rate": 4.948483666999838e-05, + "loss": 0.0472, + "step": 22300 + }, + { + "epoch": 0.10655, + "grad_norm": 0.1635342389345169, + "learning_rate": 4.948400149229294e-05, + "loss": 0.0459, + "step": 22310 + }, + { + "epoch": 0.1066, + "grad_norm": 0.16335773468017578, + "learning_rate": 4.9483165645204097e-05, + "loss": 0.0457, + "step": 22320 + }, + { + "epoch": 0.10665, + "grad_norm": 0.1378794014453888, + "learning_rate": 4.94823291287547e-05, + "loss": 0.0461, + "step": 22330 + }, + { + "epoch": 0.1067, + "grad_norm": 0.1463647335767746, + "learning_rate": 4.948149194296763e-05, + "loss": 0.0452, + "step": 22340 + }, + { + "epoch": 0.10675, + "grad_norm": 0.1523444652557373, + "learning_rate": 4.9480654087865766e-05, + "loss": 0.0463, + "step": 22350 + }, + { + "epoch": 0.1068, + "grad_norm": 0.1556294709444046, + "learning_rate": 4.947981556347201e-05, + "loss": 0.0458, + "step": 22360 + }, + { + "epoch": 0.10685, + "grad_norm": 0.19621415436267853, + "learning_rate": 4.9478976369809305e-05, + "loss": 0.0453, + "step": 22370 + }, + { + "epoch": 0.1069, + "grad_norm": 0.2060937136411667, + "learning_rate": 4.9478136506900574e-05, + "loss": 0.0488, + "step": 22380 + }, + { + "epoch": 0.10695, + "grad_norm": 0.18328139185905457, + "learning_rate": 4.947729597476879e-05, + "loss": 0.0487, + "step": 22390 + }, + { + "epoch": 0.107, + "grad_norm": 0.15908265113830566, + "learning_rate": 4.9476454773436925e-05, + "loss": 0.0461, + "step": 22400 + }, + { + "epoch": 0.10705, + "grad_norm": 0.16934293508529663, + "learning_rate": 4.9475612902927985e-05, + "loss": 0.0459, + "step": 22410 + }, + { + "epoch": 0.1071, + "grad_norm": 0.15121278166770935, + "learning_rate": 4.947477036326498e-05, + "loss": 0.0459, + "step": 22420 + }, + { + "epoch": 0.10715, + "grad_norm": 0.1772674322128296, + "learning_rate": 4.9473927154470954e-05, + "loss": 0.0467, + "step": 22430 + }, + { + "epoch": 0.1072, + "grad_norm": 0.17032106220722198, + "learning_rate": 4.9473083276568955e-05, + "loss": 0.0486, + "step": 22440 + }, + { + "epoch": 0.10725, + "grad_norm": 0.16720731556415558, + "learning_rate": 4.9472238729582045e-05, + "loss": 0.0484, + "step": 22450 + }, + { + "epoch": 0.1073, + "grad_norm": 0.19819645583629608, + "learning_rate": 4.9471393513533324e-05, + "loss": 0.046, + "step": 22460 + }, + { + "epoch": 0.10735, + "grad_norm": 0.16138656437397003, + "learning_rate": 4.94705476284459e-05, + "loss": 0.0461, + "step": 22470 + }, + { + "epoch": 0.1074, + "grad_norm": 0.1695665568113327, + "learning_rate": 4.94697010743429e-05, + "loss": 0.0474, + "step": 22480 + }, + { + "epoch": 0.10745, + "grad_norm": 0.13845255970954895, + "learning_rate": 4.9468853851247466e-05, + "loss": 0.0442, + "step": 22490 + }, + { + "epoch": 0.1075, + "grad_norm": 0.1434382200241089, + "learning_rate": 4.946800595918275e-05, + "loss": 0.0445, + "step": 22500 + }, + { + "epoch": 0.10755, + "grad_norm": 0.14327076077461243, + "learning_rate": 4.9467157398171956e-05, + "loss": 0.0495, + "step": 22510 + }, + { + "epoch": 0.1076, + "grad_norm": 0.12793277204036713, + "learning_rate": 4.946630816823826e-05, + "loss": 0.0462, + "step": 22520 + }, + { + "epoch": 0.10765, + "grad_norm": 0.17494246363639832, + "learning_rate": 4.9465458269404895e-05, + "loss": 0.0476, + "step": 22530 + }, + { + "epoch": 0.1077, + "grad_norm": 0.17591674625873566, + "learning_rate": 4.94646077016951e-05, + "loss": 0.0453, + "step": 22540 + }, + { + "epoch": 0.10775, + "grad_norm": 0.15116706490516663, + "learning_rate": 4.94637564651321e-05, + "loss": 0.0468, + "step": 22550 + }, + { + "epoch": 0.1078, + "grad_norm": 0.17800773680210114, + "learning_rate": 4.946290455973921e-05, + "loss": 0.0471, + "step": 22560 + }, + { + "epoch": 0.10785, + "grad_norm": 0.15833675861358643, + "learning_rate": 4.9462051985539695e-05, + "loss": 0.0458, + "step": 22570 + }, + { + "epoch": 0.1079, + "grad_norm": 0.17471368610858917, + "learning_rate": 4.946119874255686e-05, + "loss": 0.0491, + "step": 22580 + }, + { + "epoch": 0.10795, + "grad_norm": 0.18584364652633667, + "learning_rate": 4.946034483081405e-05, + "loss": 0.0482, + "step": 22590 + }, + { + "epoch": 0.108, + "grad_norm": 0.17959827184677124, + "learning_rate": 4.945949025033459e-05, + "loss": 0.0473, + "step": 22600 + }, + { + "epoch": 0.10805, + "grad_norm": 0.13268496096134186, + "learning_rate": 4.945863500114187e-05, + "loss": 0.0467, + "step": 22610 + }, + { + "epoch": 0.1081, + "grad_norm": 0.19379787147045135, + "learning_rate": 4.9457779083259255e-05, + "loss": 0.0461, + "step": 22620 + }, + { + "epoch": 0.10815, + "grad_norm": 0.1832592785358429, + "learning_rate": 4.945692249671015e-05, + "loss": 0.0469, + "step": 22630 + }, + { + "epoch": 0.1082, + "grad_norm": 0.14402779936790466, + "learning_rate": 4.945606524151796e-05, + "loss": 0.0471, + "step": 22640 + }, + { + "epoch": 0.10825, + "grad_norm": 0.15217886865139008, + "learning_rate": 4.945520731770614e-05, + "loss": 0.0475, + "step": 22650 + }, + { + "epoch": 0.1083, + "grad_norm": 0.15967465937137604, + "learning_rate": 4.945434872529814e-05, + "loss": 0.0491, + "step": 22660 + }, + { + "epoch": 0.10835, + "grad_norm": 0.13702714443206787, + "learning_rate": 4.9453489464317434e-05, + "loss": 0.0459, + "step": 22670 + }, + { + "epoch": 0.1084, + "grad_norm": 0.19126775860786438, + "learning_rate": 4.9452629534787506e-05, + "loss": 0.0499, + "step": 22680 + }, + { + "epoch": 0.10845, + "grad_norm": 0.16599732637405396, + "learning_rate": 4.9451768936731885e-05, + "loss": 0.0465, + "step": 22690 + }, + { + "epoch": 0.1085, + "grad_norm": 0.1632414609193802, + "learning_rate": 4.9450907670174084e-05, + "loss": 0.0469, + "step": 22700 + }, + { + "epoch": 0.10855, + "grad_norm": 0.16432011127471924, + "learning_rate": 4.945004573513765e-05, + "loss": 0.0489, + "step": 22710 + }, + { + "epoch": 0.1086, + "grad_norm": 0.16096735000610352, + "learning_rate": 4.9449183131646146e-05, + "loss": 0.0483, + "step": 22720 + }, + { + "epoch": 0.10865, + "grad_norm": 0.18021048605442047, + "learning_rate": 4.944831985972317e-05, + "loss": 0.047, + "step": 22730 + }, + { + "epoch": 0.1087, + "grad_norm": 0.155808225274086, + "learning_rate": 4.94474559193923e-05, + "loss": 0.0478, + "step": 22740 + }, + { + "epoch": 0.10875, + "grad_norm": 0.1347564309835434, + "learning_rate": 4.944659131067719e-05, + "loss": 0.046, + "step": 22750 + }, + { + "epoch": 0.1088, + "grad_norm": 0.1773679256439209, + "learning_rate": 4.944572603360145e-05, + "loss": 0.0458, + "step": 22760 + }, + { + "epoch": 0.10885, + "grad_norm": 0.14633294939994812, + "learning_rate": 4.9444860088188736e-05, + "loss": 0.0457, + "step": 22770 + }, + { + "epoch": 0.1089, + "grad_norm": 0.14980213344097137, + "learning_rate": 4.944399347446274e-05, + "loss": 0.0459, + "step": 22780 + }, + { + "epoch": 0.10895, + "grad_norm": 0.16055136919021606, + "learning_rate": 4.944312619244714e-05, + "loss": 0.0471, + "step": 22790 + }, + { + "epoch": 0.109, + "grad_norm": 0.16401828825473785, + "learning_rate": 4.9442258242165653e-05, + "loss": 0.0452, + "step": 22800 + }, + { + "epoch": 0.10905, + "grad_norm": 0.1338483691215515, + "learning_rate": 4.944138962364201e-05, + "loss": 0.0488, + "step": 22810 + }, + { + "epoch": 0.1091, + "grad_norm": 0.16888396441936493, + "learning_rate": 4.944052033689995e-05, + "loss": 0.0451, + "step": 22820 + }, + { + "epoch": 0.10915, + "grad_norm": 0.1720247119665146, + "learning_rate": 4.943965038196326e-05, + "loss": 0.0456, + "step": 22830 + }, + { + "epoch": 0.1092, + "grad_norm": 0.13762693107128143, + "learning_rate": 4.9438779758855694e-05, + "loss": 0.0467, + "step": 22840 + }, + { + "epoch": 0.10925, + "grad_norm": 0.11534810066223145, + "learning_rate": 4.943790846760108e-05, + "loss": 0.0441, + "step": 22850 + }, + { + "epoch": 0.1093, + "grad_norm": 0.1553107500076294, + "learning_rate": 4.943703650822323e-05, + "loss": 0.0481, + "step": 22860 + }, + { + "epoch": 0.10935, + "grad_norm": 0.1464255452156067, + "learning_rate": 4.9436163880745975e-05, + "loss": 0.0444, + "step": 22870 + }, + { + "epoch": 0.1094, + "grad_norm": 0.167152538895607, + "learning_rate": 4.9435290585193186e-05, + "loss": 0.0435, + "step": 22880 + }, + { + "epoch": 0.10945, + "grad_norm": 0.14588108658790588, + "learning_rate": 4.943441662158874e-05, + "loss": 0.0433, + "step": 22890 + }, + { + "epoch": 0.1095, + "grad_norm": 0.14812380075454712, + "learning_rate": 4.943354198995651e-05, + "loss": 0.0447, + "step": 22900 + }, + { + "epoch": 0.10955, + "grad_norm": 0.2095862329006195, + "learning_rate": 4.9432666690320426e-05, + "loss": 0.0462, + "step": 22910 + }, + { + "epoch": 0.1096, + "grad_norm": 0.1407693475484848, + "learning_rate": 4.943179072270441e-05, + "loss": 0.0461, + "step": 22920 + }, + { + "epoch": 0.10965, + "grad_norm": 0.15722720324993134, + "learning_rate": 4.9430914087132415e-05, + "loss": 0.0452, + "step": 22930 + }, + { + "epoch": 0.1097, + "grad_norm": 0.15718896687030792, + "learning_rate": 4.943003678362842e-05, + "loss": 0.0457, + "step": 22940 + }, + { + "epoch": 0.10975, + "grad_norm": 0.13286322355270386, + "learning_rate": 4.942915881221638e-05, + "loss": 0.0441, + "step": 22950 + }, + { + "epoch": 0.1098, + "grad_norm": 0.16470444202423096, + "learning_rate": 4.942828017292033e-05, + "loss": 0.0468, + "step": 22960 + }, + { + "epoch": 0.10985, + "grad_norm": 0.12710019946098328, + "learning_rate": 4.942740086576427e-05, + "loss": 0.0452, + "step": 22970 + }, + { + "epoch": 0.1099, + "grad_norm": 0.18647213280200958, + "learning_rate": 4.9426520890772245e-05, + "loss": 0.0507, + "step": 22980 + }, + { + "epoch": 0.10995, + "grad_norm": 0.24347060918807983, + "learning_rate": 4.942564024796832e-05, + "loss": 0.046, + "step": 22990 + }, + { + "epoch": 0.11, + "grad_norm": 0.1583353877067566, + "learning_rate": 4.9424758937376567e-05, + "loss": 0.0475, + "step": 23000 + }, + { + "epoch": 0.11005, + "grad_norm": 0.16557100415229797, + "learning_rate": 4.942387695902108e-05, + "loss": 0.0466, + "step": 23010 + }, + { + "epoch": 0.1101, + "grad_norm": 0.1670871376991272, + "learning_rate": 4.942299431292596e-05, + "loss": 0.0463, + "step": 23020 + }, + { + "epoch": 0.11015, + "grad_norm": 0.14393047988414764, + "learning_rate": 4.9422110999115365e-05, + "loss": 0.0445, + "step": 23030 + }, + { + "epoch": 0.1102, + "grad_norm": 0.15277686715126038, + "learning_rate": 4.942122701761343e-05, + "loss": 0.0491, + "step": 23040 + }, + { + "epoch": 0.11025, + "grad_norm": 0.14930570125579834, + "learning_rate": 4.9420342368444316e-05, + "loss": 0.0453, + "step": 23050 + }, + { + "epoch": 0.1103, + "grad_norm": 0.12654705345630646, + "learning_rate": 4.941945705163222e-05, + "loss": 0.048, + "step": 23060 + }, + { + "epoch": 0.11035, + "grad_norm": 0.17418532073497772, + "learning_rate": 4.9418571067201346e-05, + "loss": 0.0463, + "step": 23070 + }, + { + "epoch": 0.1104, + "grad_norm": 0.14700326323509216, + "learning_rate": 4.9417684415175905e-05, + "loss": 0.0509, + "step": 23080 + }, + { + "epoch": 0.11045, + "grad_norm": 0.17009063065052032, + "learning_rate": 4.9416797095580156e-05, + "loss": 0.0495, + "step": 23090 + }, + { + "epoch": 0.1105, + "grad_norm": 0.15119709074497223, + "learning_rate": 4.9415909108438344e-05, + "loss": 0.048, + "step": 23100 + }, + { + "epoch": 0.11055, + "grad_norm": 0.16565102338790894, + "learning_rate": 4.941502045377474e-05, + "loss": 0.045, + "step": 23110 + }, + { + "epoch": 0.1106, + "grad_norm": 0.17975634336471558, + "learning_rate": 4.9414131131613656e-05, + "loss": 0.0494, + "step": 23120 + }, + { + "epoch": 0.11065, + "grad_norm": 0.15956923365592957, + "learning_rate": 4.9413241141979394e-05, + "loss": 0.0459, + "step": 23130 + }, + { + "epoch": 0.1107, + "grad_norm": 0.13292647898197174, + "learning_rate": 4.9412350484896294e-05, + "loss": 0.0451, + "step": 23140 + }, + { + "epoch": 0.11075, + "grad_norm": 0.15606789290905, + "learning_rate": 4.9411459160388705e-05, + "loss": 0.0485, + "step": 23150 + }, + { + "epoch": 0.1108, + "grad_norm": 0.20483602583408356, + "learning_rate": 4.941056716848099e-05, + "loss": 0.0503, + "step": 23160 + }, + { + "epoch": 0.11085, + "grad_norm": 0.167527973651886, + "learning_rate": 4.940967450919755e-05, + "loss": 0.047, + "step": 23170 + }, + { + "epoch": 0.1109, + "grad_norm": 0.16891071200370789, + "learning_rate": 4.940878118256277e-05, + "loss": 0.0488, + "step": 23180 + }, + { + "epoch": 0.11095, + "grad_norm": 0.15802857279777527, + "learning_rate": 4.9407887188601084e-05, + "loss": 0.048, + "step": 23190 + }, + { + "epoch": 0.111, + "grad_norm": 0.1566019505262375, + "learning_rate": 4.9406992527336924e-05, + "loss": 0.0493, + "step": 23200 + }, + { + "epoch": 0.11105, + "grad_norm": 0.1614404320716858, + "learning_rate": 4.940609719879477e-05, + "loss": 0.0458, + "step": 23210 + }, + { + "epoch": 0.1111, + "grad_norm": 0.17096981406211853, + "learning_rate": 4.9405201202999085e-05, + "loss": 0.0499, + "step": 23220 + }, + { + "epoch": 0.11115, + "grad_norm": 0.14519457519054413, + "learning_rate": 4.9404304539974364e-05, + "loss": 0.0445, + "step": 23230 + }, + { + "epoch": 0.1112, + "grad_norm": 0.1501145362854004, + "learning_rate": 4.9403407209745125e-05, + "loss": 0.0479, + "step": 23240 + }, + { + "epoch": 0.11125, + "grad_norm": 0.1739199012517929, + "learning_rate": 4.9402509212335904e-05, + "loss": 0.0486, + "step": 23250 + }, + { + "epoch": 0.1113, + "grad_norm": 0.1648600697517395, + "learning_rate": 4.9401610547771246e-05, + "loss": 0.0467, + "step": 23260 + }, + { + "epoch": 0.11135, + "grad_norm": 0.15994058549404144, + "learning_rate": 4.9400711216075726e-05, + "loss": 0.0492, + "step": 23270 + }, + { + "epoch": 0.1114, + "grad_norm": 0.14188778400421143, + "learning_rate": 4.9399811217273916e-05, + "loss": 0.0483, + "step": 23280 + }, + { + "epoch": 0.11145, + "grad_norm": 0.14371943473815918, + "learning_rate": 4.939891055139045e-05, + "loss": 0.0466, + "step": 23290 + }, + { + "epoch": 0.1115, + "grad_norm": 0.1529754251241684, + "learning_rate": 4.939800921844993e-05, + "loss": 0.0491, + "step": 23300 + }, + { + "epoch": 0.11155, + "grad_norm": 0.18079929053783417, + "learning_rate": 4.9397107218477e-05, + "loss": 0.0471, + "step": 23310 + }, + { + "epoch": 0.1116, + "grad_norm": 0.16525928676128387, + "learning_rate": 4.9396204551496326e-05, + "loss": 0.0452, + "step": 23320 + }, + { + "epoch": 0.11165, + "grad_norm": 0.14274610579013824, + "learning_rate": 4.939530121753259e-05, + "loss": 0.0465, + "step": 23330 + }, + { + "epoch": 0.1117, + "grad_norm": 0.15236027538776398, + "learning_rate": 4.9394397216610476e-05, + "loss": 0.0474, + "step": 23340 + }, + { + "epoch": 0.11175, + "grad_norm": 0.12764261662960052, + "learning_rate": 4.939349254875472e-05, + "loss": 0.0449, + "step": 23350 + }, + { + "epoch": 0.1118, + "grad_norm": 0.13034336268901825, + "learning_rate": 4.939258721399003e-05, + "loss": 0.0468, + "step": 23360 + }, + { + "epoch": 0.11185, + "grad_norm": 0.1476508378982544, + "learning_rate": 4.939168121234117e-05, + "loss": 0.0496, + "step": 23370 + }, + { + "epoch": 0.1119, + "grad_norm": 0.166147843003273, + "learning_rate": 4.9390774543832906e-05, + "loss": 0.0507, + "step": 23380 + }, + { + "epoch": 0.11195, + "grad_norm": 0.17686863243579865, + "learning_rate": 4.9389867208490034e-05, + "loss": 0.048, + "step": 23390 + }, + { + "epoch": 0.112, + "grad_norm": 0.17661166191101074, + "learning_rate": 4.938895920633736e-05, + "loss": 0.0457, + "step": 23400 + }, + { + "epoch": 0.11205, + "grad_norm": 0.16418366134166718, + "learning_rate": 4.93880505373997e-05, + "loss": 0.0461, + "step": 23410 + }, + { + "epoch": 0.1121, + "grad_norm": 0.14816594123840332, + "learning_rate": 4.93871412017019e-05, + "loss": 0.0489, + "step": 23420 + }, + { + "epoch": 0.11215, + "grad_norm": 0.15972666442394257, + "learning_rate": 4.938623119926882e-05, + "loss": 0.0459, + "step": 23430 + }, + { + "epoch": 0.1122, + "grad_norm": 0.1464952528476715, + "learning_rate": 4.9385320530125346e-05, + "loss": 0.0472, + "step": 23440 + }, + { + "epoch": 0.11225, + "grad_norm": 0.15542052686214447, + "learning_rate": 4.938440919429637e-05, + "loss": 0.0467, + "step": 23450 + }, + { + "epoch": 0.1123, + "grad_norm": 0.16439156234264374, + "learning_rate": 4.938349719180679e-05, + "loss": 0.0481, + "step": 23460 + }, + { + "epoch": 0.11235, + "grad_norm": 0.18987365067005157, + "learning_rate": 4.9382584522681575e-05, + "loss": 0.0453, + "step": 23470 + }, + { + "epoch": 0.1124, + "grad_norm": 0.15060585737228394, + "learning_rate": 4.9381671186945656e-05, + "loss": 0.0447, + "step": 23480 + }, + { + "epoch": 0.11245, + "grad_norm": 0.21386857330799103, + "learning_rate": 4.9380757184624006e-05, + "loss": 0.0464, + "step": 23490 + }, + { + "epoch": 0.1125, + "grad_norm": 0.16736474633216858, + "learning_rate": 4.937984251574162e-05, + "loss": 0.0467, + "step": 23500 + }, + { + "epoch": 0.11255, + "grad_norm": 0.17925432324409485, + "learning_rate": 4.9378927180323485e-05, + "loss": 0.0465, + "step": 23510 + }, + { + "epoch": 0.1126, + "grad_norm": 0.1393420398235321, + "learning_rate": 4.937801117839464e-05, + "loss": 0.0447, + "step": 23520 + }, + { + "epoch": 0.11265, + "grad_norm": 0.191842183470726, + "learning_rate": 4.9377094509980135e-05, + "loss": 0.0468, + "step": 23530 + }, + { + "epoch": 0.1127, + "grad_norm": 0.14526723325252533, + "learning_rate": 4.9376177175105035e-05, + "loss": 0.0466, + "step": 23540 + }, + { + "epoch": 0.11275, + "grad_norm": 0.17123140394687653, + "learning_rate": 4.937525917379439e-05, + "loss": 0.0487, + "step": 23550 + }, + { + "epoch": 0.1128, + "grad_norm": 0.1246662586927414, + "learning_rate": 4.937434050607332e-05, + "loss": 0.0452, + "step": 23560 + }, + { + "epoch": 0.11285, + "grad_norm": 0.16733843088150024, + "learning_rate": 4.937342117196695e-05, + "loss": 0.0461, + "step": 23570 + }, + { + "epoch": 0.1129, + "grad_norm": 0.15778127312660217, + "learning_rate": 4.937250117150039e-05, + "loss": 0.0459, + "step": 23580 + }, + { + "epoch": 0.11295, + "grad_norm": 0.1407967209815979, + "learning_rate": 4.9371580504698814e-05, + "loss": 0.0482, + "step": 23590 + }, + { + "epoch": 0.113, + "grad_norm": 0.15934114158153534, + "learning_rate": 4.9370659171587367e-05, + "loss": 0.0485, + "step": 23600 + }, + { + "epoch": 0.11305, + "grad_norm": 0.18262344598770142, + "learning_rate": 4.9369737172191265e-05, + "loss": 0.0468, + "step": 23610 + }, + { + "epoch": 0.1131, + "grad_norm": 0.15956884622573853, + "learning_rate": 4.93688145065357e-05, + "loss": 0.0465, + "step": 23620 + }, + { + "epoch": 0.11315, + "grad_norm": 0.16960874199867249, + "learning_rate": 4.936789117464591e-05, + "loss": 0.0449, + "step": 23630 + }, + { + "epoch": 0.1132, + "grad_norm": 0.1442752331495285, + "learning_rate": 4.936696717654712e-05, + "loss": 0.0473, + "step": 23640 + }, + { + "epoch": 0.11325, + "grad_norm": 0.14218948781490326, + "learning_rate": 4.9366042512264604e-05, + "loss": 0.0468, + "step": 23650 + }, + { + "epoch": 0.1133, + "grad_norm": 0.14883334934711456, + "learning_rate": 4.936511718182364e-05, + "loss": 0.0464, + "step": 23660 + }, + { + "epoch": 0.11335, + "grad_norm": 0.14746984839439392, + "learning_rate": 4.9364191185249534e-05, + "loss": 0.0443, + "step": 23670 + }, + { + "epoch": 0.1134, + "grad_norm": 0.17093515396118164, + "learning_rate": 4.936326452256758e-05, + "loss": 0.0454, + "step": 23680 + }, + { + "epoch": 0.11345, + "grad_norm": 0.15980379283428192, + "learning_rate": 4.936233719380313e-05, + "loss": 0.0472, + "step": 23690 + }, + { + "epoch": 0.1135, + "grad_norm": 0.16157618165016174, + "learning_rate": 4.936140919898155e-05, + "loss": 0.0467, + "step": 23700 + }, + { + "epoch": 0.11355, + "grad_norm": 0.17927232384681702, + "learning_rate": 4.936048053812817e-05, + "loss": 0.0488, + "step": 23710 + }, + { + "epoch": 0.1136, + "grad_norm": 0.19343438744544983, + "learning_rate": 4.9359551211268415e-05, + "loss": 0.0456, + "step": 23720 + }, + { + "epoch": 0.11365, + "grad_norm": 0.1483830362558365, + "learning_rate": 4.935862121842769e-05, + "loss": 0.047, + "step": 23730 + }, + { + "epoch": 0.1137, + "grad_norm": 0.16596491634845734, + "learning_rate": 4.93576905596314e-05, + "loss": 0.0438, + "step": 23740 + }, + { + "epoch": 0.11375, + "grad_norm": 0.15183082222938538, + "learning_rate": 4.9356759234905e-05, + "loss": 0.044, + "step": 23750 + }, + { + "epoch": 0.1138, + "grad_norm": 0.15099778771400452, + "learning_rate": 4.935582724427397e-05, + "loss": 0.0455, + "step": 23760 + }, + { + "epoch": 0.11385, + "grad_norm": 0.19475258886814117, + "learning_rate": 4.935489458776375e-05, + "loss": 0.0448, + "step": 23770 + }, + { + "epoch": 0.1139, + "grad_norm": 0.18847818672657013, + "learning_rate": 4.935396126539988e-05, + "loss": 0.0445, + "step": 23780 + }, + { + "epoch": 0.11395, + "grad_norm": 0.15470103919506073, + "learning_rate": 4.935302727720785e-05, + "loss": 0.0448, + "step": 23790 + }, + { + "epoch": 0.114, + "grad_norm": 0.13757477700710297, + "learning_rate": 4.93520926232132e-05, + "loss": 0.0449, + "step": 23800 + }, + { + "epoch": 0.11405, + "grad_norm": 0.1728510856628418, + "learning_rate": 4.9351157303441495e-05, + "loss": 0.0441, + "step": 23810 + }, + { + "epoch": 0.1141, + "grad_norm": 0.15007427334785461, + "learning_rate": 4.935022131791829e-05, + "loss": 0.0435, + "step": 23820 + }, + { + "epoch": 0.11415, + "grad_norm": 0.1726987063884735, + "learning_rate": 4.934928466666919e-05, + "loss": 0.0461, + "step": 23830 + }, + { + "epoch": 0.1142, + "grad_norm": 0.15952524542808533, + "learning_rate": 4.9348347349719784e-05, + "loss": 0.0432, + "step": 23840 + }, + { + "epoch": 0.11425, + "grad_norm": 0.14444439113140106, + "learning_rate": 4.934740936709572e-05, + "loss": 0.0431, + "step": 23850 + }, + { + "epoch": 0.1143, + "grad_norm": 0.1386149674654007, + "learning_rate": 4.934647071882262e-05, + "loss": 0.0475, + "step": 23860 + }, + { + "epoch": 0.11435, + "grad_norm": 0.14660941064357758, + "learning_rate": 4.934553140492617e-05, + "loss": 0.0451, + "step": 23870 + }, + { + "epoch": 0.1144, + "grad_norm": 0.1509007215499878, + "learning_rate": 4.934459142543203e-05, + "loss": 0.0458, + "step": 23880 + }, + { + "epoch": 0.11445, + "grad_norm": 0.1373247653245926, + "learning_rate": 4.93436507803659e-05, + "loss": 0.0473, + "step": 23890 + }, + { + "epoch": 0.1145, + "grad_norm": 0.1345268040895462, + "learning_rate": 4.934270946975351e-05, + "loss": 0.0441, + "step": 23900 + }, + { + "epoch": 0.11455, + "grad_norm": 0.1353430151939392, + "learning_rate": 4.934176749362059e-05, + "loss": 0.045, + "step": 23910 + }, + { + "epoch": 0.1146, + "grad_norm": 0.14987526834011078, + "learning_rate": 4.9340824851992894e-05, + "loss": 0.0447, + "step": 23920 + }, + { + "epoch": 0.11465, + "grad_norm": 0.12962773442268372, + "learning_rate": 4.933988154489618e-05, + "loss": 0.0453, + "step": 23930 + }, + { + "epoch": 0.1147, + "grad_norm": 0.1771841198205948, + "learning_rate": 4.933893757235626e-05, + "loss": 0.0461, + "step": 23940 + }, + { + "epoch": 0.11475, + "grad_norm": 0.14618121087551117, + "learning_rate": 4.9337992934398926e-05, + "loss": 0.0484, + "step": 23950 + }, + { + "epoch": 0.1148, + "grad_norm": 0.13878297805786133, + "learning_rate": 4.933704763105e-05, + "loss": 0.0494, + "step": 23960 + }, + { + "epoch": 0.11485, + "grad_norm": 0.13771523535251617, + "learning_rate": 4.9336101662335346e-05, + "loss": 0.0458, + "step": 23970 + }, + { + "epoch": 0.1149, + "grad_norm": 0.15850576758384705, + "learning_rate": 4.9335155028280816e-05, + "loss": 0.0484, + "step": 23980 + }, + { + "epoch": 0.11495, + "grad_norm": 0.15937262773513794, + "learning_rate": 4.9334207728912284e-05, + "loss": 0.0465, + "step": 23990 + }, + { + "epoch": 0.115, + "grad_norm": 0.1431184709072113, + "learning_rate": 4.9333259764255655e-05, + "loss": 0.0448, + "step": 24000 + }, + { + "epoch": 0.11505, + "grad_norm": 0.1383841186761856, + "learning_rate": 4.933231113433685e-05, + "loss": 0.0438, + "step": 24010 + }, + { + "epoch": 0.1151, + "grad_norm": 0.15110275149345398, + "learning_rate": 4.93313618391818e-05, + "loss": 0.0476, + "step": 24020 + }, + { + "epoch": 0.11515, + "grad_norm": 0.1487853080034256, + "learning_rate": 4.933041187881645e-05, + "loss": 0.0462, + "step": 24030 + }, + { + "epoch": 0.1152, + "grad_norm": 0.14043785631656647, + "learning_rate": 4.9329461253266784e-05, + "loss": 0.0462, + "step": 24040 + }, + { + "epoch": 0.11525, + "grad_norm": 0.1876734346151352, + "learning_rate": 4.932850996255879e-05, + "loss": 0.0469, + "step": 24050 + }, + { + "epoch": 0.1153, + "grad_norm": 0.13531990349292755, + "learning_rate": 4.932755800671848e-05, + "loss": 0.045, + "step": 24060 + }, + { + "epoch": 0.11535, + "grad_norm": 0.17501574754714966, + "learning_rate": 4.932660538577186e-05, + "loss": 0.0453, + "step": 24070 + }, + { + "epoch": 0.1154, + "grad_norm": 0.17578630149364471, + "learning_rate": 4.932565209974499e-05, + "loss": 0.0455, + "step": 24080 + }, + { + "epoch": 0.11545, + "grad_norm": 0.16083061695098877, + "learning_rate": 4.932469814866394e-05, + "loss": 0.0452, + "step": 24090 + }, + { + "epoch": 0.1155, + "grad_norm": 0.1421496570110321, + "learning_rate": 4.932374353255477e-05, + "loss": 0.045, + "step": 24100 + }, + { + "epoch": 0.11555, + "grad_norm": 0.16017691791057587, + "learning_rate": 4.9322788251443596e-05, + "loss": 0.0463, + "step": 24110 + }, + { + "epoch": 0.1156, + "grad_norm": 0.14793579280376434, + "learning_rate": 4.932183230535653e-05, + "loss": 0.0467, + "step": 24120 + }, + { + "epoch": 0.11565, + "grad_norm": 0.14724218845367432, + "learning_rate": 4.93208756943197e-05, + "loss": 0.0447, + "step": 24130 + }, + { + "epoch": 0.1157, + "grad_norm": 0.12646318972110748, + "learning_rate": 4.931991841835927e-05, + "loss": 0.0437, + "step": 24140 + }, + { + "epoch": 0.11575, + "grad_norm": 0.17053678631782532, + "learning_rate": 4.931896047750141e-05, + "loss": 0.046, + "step": 24150 + }, + { + "epoch": 0.1158, + "grad_norm": 0.16350778937339783, + "learning_rate": 4.931800187177229e-05, + "loss": 0.0492, + "step": 24160 + }, + { + "epoch": 0.11585, + "grad_norm": 0.15240345895290375, + "learning_rate": 4.931704260119815e-05, + "loss": 0.0449, + "step": 24170 + }, + { + "epoch": 0.1159, + "grad_norm": 0.1353459358215332, + "learning_rate": 4.9316082665805204e-05, + "loss": 0.0449, + "step": 24180 + }, + { + "epoch": 0.11595, + "grad_norm": 0.16670545935630798, + "learning_rate": 4.931512206561968e-05, + "loss": 0.0451, + "step": 24190 + }, + { + "epoch": 0.116, + "grad_norm": 0.12886999547481537, + "learning_rate": 4.931416080066786e-05, + "loss": 0.045, + "step": 24200 + }, + { + "epoch": 0.11605, + "grad_norm": 0.134444460272789, + "learning_rate": 4.931319887097602e-05, + "loss": 0.0466, + "step": 24210 + }, + { + "epoch": 0.1161, + "grad_norm": 0.12937898933887482, + "learning_rate": 4.931223627657046e-05, + "loss": 0.048, + "step": 24220 + }, + { + "epoch": 0.11615, + "grad_norm": 0.17327240109443665, + "learning_rate": 4.931127301747749e-05, + "loss": 0.0468, + "step": 24230 + }, + { + "epoch": 0.1162, + "grad_norm": 0.15150600671768188, + "learning_rate": 4.931030909372345e-05, + "loss": 0.0458, + "step": 24240 + }, + { + "epoch": 0.11625, + "grad_norm": 0.17797043919563293, + "learning_rate": 4.9309344505334685e-05, + "loss": 0.0461, + "step": 24250 + }, + { + "epoch": 0.1163, + "grad_norm": 0.13645793497562408, + "learning_rate": 4.9308379252337586e-05, + "loss": 0.0448, + "step": 24260 + }, + { + "epoch": 0.11635, + "grad_norm": 0.13534113764762878, + "learning_rate": 4.9307413334758524e-05, + "loss": 0.0466, + "step": 24270 + }, + { + "epoch": 0.1164, + "grad_norm": 0.1359132081270218, + "learning_rate": 4.930644675262391e-05, + "loss": 0.0464, + "step": 24280 + }, + { + "epoch": 0.11645, + "grad_norm": 0.1498836725950241, + "learning_rate": 4.9305479505960176e-05, + "loss": 0.0451, + "step": 24290 + }, + { + "epoch": 0.1165, + "grad_norm": 0.12932755053043365, + "learning_rate": 4.9304511594793765e-05, + "loss": 0.0454, + "step": 24300 + }, + { + "epoch": 0.11655, + "grad_norm": 0.1804189831018448, + "learning_rate": 4.9303543019151135e-05, + "loss": 0.0481, + "step": 24310 + }, + { + "epoch": 0.1166, + "grad_norm": 0.15709887444972992, + "learning_rate": 4.930257377905877e-05, + "loss": 0.0459, + "step": 24320 + }, + { + "epoch": 0.11665, + "grad_norm": 0.16021974384784698, + "learning_rate": 4.9301603874543165e-05, + "loss": 0.0457, + "step": 24330 + }, + { + "epoch": 0.1167, + "grad_norm": 0.16061711311340332, + "learning_rate": 4.930063330563085e-05, + "loss": 0.0451, + "step": 24340 + }, + { + "epoch": 0.11675, + "grad_norm": 0.14702945947647095, + "learning_rate": 4.929966207234834e-05, + "loss": 0.0489, + "step": 24350 + }, + { + "epoch": 0.1168, + "grad_norm": 0.14569586515426636, + "learning_rate": 4.9298690174722204e-05, + "loss": 0.047, + "step": 24360 + }, + { + "epoch": 0.11685, + "grad_norm": 0.16680438816547394, + "learning_rate": 4.929771761277901e-05, + "loss": 0.0472, + "step": 24370 + }, + { + "epoch": 0.1169, + "grad_norm": 0.16751612722873688, + "learning_rate": 4.9296744386545334e-05, + "loss": 0.0483, + "step": 24380 + }, + { + "epoch": 0.11695, + "grad_norm": 0.15513846278190613, + "learning_rate": 4.92957704960478e-05, + "loss": 0.0484, + "step": 24390 + }, + { + "epoch": 0.117, + "grad_norm": 0.17224083840847015, + "learning_rate": 4.9294795941313034e-05, + "loss": 0.0475, + "step": 24400 + }, + { + "epoch": 0.11705, + "grad_norm": 0.15373164415359497, + "learning_rate": 4.929382072236766e-05, + "loss": 0.0454, + "step": 24410 + }, + { + "epoch": 0.1171, + "grad_norm": 0.1522088199853897, + "learning_rate": 4.9292844839238364e-05, + "loss": 0.0444, + "step": 24420 + }, + { + "epoch": 0.11715, + "grad_norm": 0.12994112074375153, + "learning_rate": 4.9291868291951814e-05, + "loss": 0.0457, + "step": 24430 + }, + { + "epoch": 0.1172, + "grad_norm": 0.17007765173912048, + "learning_rate": 4.9290891080534715e-05, + "loss": 0.044, + "step": 24440 + }, + { + "epoch": 0.11725, + "grad_norm": 0.1421523094177246, + "learning_rate": 4.9289913205013774e-05, + "loss": 0.0474, + "step": 24450 + }, + { + "epoch": 0.1173, + "grad_norm": 0.12650936841964722, + "learning_rate": 4.928893466541573e-05, + "loss": 0.0456, + "step": 24460 + }, + { + "epoch": 0.11735, + "grad_norm": 0.1637522131204605, + "learning_rate": 4.9287955461767346e-05, + "loss": 0.0472, + "step": 24470 + }, + { + "epoch": 0.1174, + "grad_norm": 0.16588808596134186, + "learning_rate": 4.928697559409537e-05, + "loss": 0.0462, + "step": 24480 + }, + { + "epoch": 0.11745, + "grad_norm": 0.1298874318599701, + "learning_rate": 4.928599506242662e-05, + "loss": 0.0455, + "step": 24490 + }, + { + "epoch": 0.1175, + "grad_norm": 0.13552021980285645, + "learning_rate": 4.9285013866787886e-05, + "loss": 0.0458, + "step": 24500 + }, + { + "epoch": 0.11755, + "grad_norm": 0.15007399022579193, + "learning_rate": 4.928403200720599e-05, + "loss": 0.0446, + "step": 24510 + }, + { + "epoch": 0.1176, + "grad_norm": 0.14397795498371124, + "learning_rate": 4.928304948370779e-05, + "loss": 0.0432, + "step": 24520 + }, + { + "epoch": 0.11765, + "grad_norm": 0.13422149419784546, + "learning_rate": 4.9282066296320125e-05, + "loss": 0.0454, + "step": 24530 + }, + { + "epoch": 0.1177, + "grad_norm": 0.15637044608592987, + "learning_rate": 4.928108244506991e-05, + "loss": 0.0445, + "step": 24540 + }, + { + "epoch": 0.11775, + "grad_norm": 0.1382390856742859, + "learning_rate": 4.928009792998401e-05, + "loss": 0.0452, + "step": 24550 + }, + { + "epoch": 0.1178, + "grad_norm": 0.18540625274181366, + "learning_rate": 4.9279112751089356e-05, + "loss": 0.0481, + "step": 24560 + }, + { + "epoch": 0.11785, + "grad_norm": 0.14658458530902863, + "learning_rate": 4.927812690841288e-05, + "loss": 0.0467, + "step": 24570 + }, + { + "epoch": 0.1179, + "grad_norm": 0.16724786162376404, + "learning_rate": 4.9277140401981534e-05, + "loss": 0.0484, + "step": 24580 + }, + { + "epoch": 0.11795, + "grad_norm": 0.15964344143867493, + "learning_rate": 4.92761532318223e-05, + "loss": 0.044, + "step": 24590 + }, + { + "epoch": 0.118, + "grad_norm": 0.1390381157398224, + "learning_rate": 4.927516539796215e-05, + "loss": 0.0463, + "step": 24600 + }, + { + "epoch": 0.11805, + "grad_norm": 0.16976219415664673, + "learning_rate": 4.927417690042809e-05, + "loss": 0.0475, + "step": 24610 + }, + { + "epoch": 0.1181, + "grad_norm": 0.1623208075761795, + "learning_rate": 4.927318773924717e-05, + "loss": 0.0447, + "step": 24620 + }, + { + "epoch": 0.11815, + "grad_norm": 0.1608884483575821, + "learning_rate": 4.9272197914446406e-05, + "loss": 0.0463, + "step": 24630 + }, + { + "epoch": 0.1182, + "grad_norm": 0.16607420146465302, + "learning_rate": 4.9271207426052866e-05, + "loss": 0.0442, + "step": 24640 + }, + { + "epoch": 0.11825, + "grad_norm": 0.16706447303295135, + "learning_rate": 4.927021627409364e-05, + "loss": 0.0455, + "step": 24650 + }, + { + "epoch": 0.1183, + "grad_norm": 0.13739296793937683, + "learning_rate": 4.926922445859581e-05, + "loss": 0.0444, + "step": 24660 + }, + { + "epoch": 0.11835, + "grad_norm": 0.12479973584413528, + "learning_rate": 4.926823197958651e-05, + "loss": 0.0461, + "step": 24670 + }, + { + "epoch": 0.1184, + "grad_norm": 0.14369264245033264, + "learning_rate": 4.9267238837092865e-05, + "loss": 0.045, + "step": 24680 + }, + { + "epoch": 0.11845, + "grad_norm": 0.17088323831558228, + "learning_rate": 4.926624503114202e-05, + "loss": 0.0471, + "step": 24690 + }, + { + "epoch": 0.1185, + "grad_norm": 0.14353309571743011, + "learning_rate": 4.9265250561761155e-05, + "loss": 0.0459, + "step": 24700 + }, + { + "epoch": 0.11855, + "grad_norm": 0.12303953617811203, + "learning_rate": 4.926425542897746e-05, + "loss": 0.0464, + "step": 24710 + }, + { + "epoch": 0.1186, + "grad_norm": 0.1280302107334137, + "learning_rate": 4.926325963281814e-05, + "loss": 0.0454, + "step": 24720 + }, + { + "epoch": 0.11865, + "grad_norm": 0.16849106550216675, + "learning_rate": 4.9262263173310405e-05, + "loss": 0.0488, + "step": 24730 + }, + { + "epoch": 0.1187, + "grad_norm": 0.14628657698631287, + "learning_rate": 4.926126605048152e-05, + "loss": 0.046, + "step": 24740 + }, + { + "epoch": 0.11875, + "grad_norm": 0.15758998692035675, + "learning_rate": 4.926026826435873e-05, + "loss": 0.0452, + "step": 24750 + }, + { + "epoch": 0.1188, + "grad_norm": 0.12086597830057144, + "learning_rate": 4.925926981496932e-05, + "loss": 0.045, + "step": 24760 + }, + { + "epoch": 0.11885, + "grad_norm": 0.14628925919532776, + "learning_rate": 4.925827070234059e-05, + "loss": 0.0453, + "step": 24770 + }, + { + "epoch": 0.1189, + "grad_norm": 0.1637919396162033, + "learning_rate": 4.9257270926499855e-05, + "loss": 0.0437, + "step": 24780 + }, + { + "epoch": 0.11895, + "grad_norm": 0.13955332338809967, + "learning_rate": 4.9256270487474437e-05, + "loss": 0.0469, + "step": 24790 + }, + { + "epoch": 0.119, + "grad_norm": 0.14324098825454712, + "learning_rate": 4.9255269385291704e-05, + "loss": 0.0485, + "step": 24800 + }, + { + "epoch": 0.11905, + "grad_norm": 0.13485878705978394, + "learning_rate": 4.925426761997901e-05, + "loss": 0.044, + "step": 24810 + }, + { + "epoch": 0.1191, + "grad_norm": 0.14601413905620575, + "learning_rate": 4.925326519156376e-05, + "loss": 0.0458, + "step": 24820 + }, + { + "epoch": 0.11915, + "grad_norm": 0.17952613532543182, + "learning_rate": 4.925226210007335e-05, + "loss": 0.0455, + "step": 24830 + }, + { + "epoch": 0.1192, + "grad_norm": 0.19048964977264404, + "learning_rate": 4.92512583455352e-05, + "loss": 0.0462, + "step": 24840 + }, + { + "epoch": 0.11925, + "grad_norm": 0.1312924027442932, + "learning_rate": 4.925025392797676e-05, + "loss": 0.0434, + "step": 24850 + }, + { + "epoch": 0.1193, + "grad_norm": 0.160012885928154, + "learning_rate": 4.924924884742549e-05, + "loss": 0.0429, + "step": 24860 + }, + { + "epoch": 0.11935, + "grad_norm": 0.15358127653598785, + "learning_rate": 4.9248243103908864e-05, + "loss": 0.0444, + "step": 24870 + }, + { + "epoch": 0.1194, + "grad_norm": 0.14759136736392975, + "learning_rate": 4.9247236697454386e-05, + "loss": 0.0444, + "step": 24880 + }, + { + "epoch": 0.11945, + "grad_norm": 0.13033431768417358, + "learning_rate": 4.9246229628089556e-05, + "loss": 0.0436, + "step": 24890 + }, + { + "epoch": 0.1195, + "grad_norm": 0.1593874990940094, + "learning_rate": 4.924522189584193e-05, + "loss": 0.045, + "step": 24900 + }, + { + "epoch": 0.11955, + "grad_norm": 0.1567847579717636, + "learning_rate": 4.924421350073904e-05, + "loss": 0.0451, + "step": 24910 + }, + { + "epoch": 0.1196, + "grad_norm": 0.15744049847126007, + "learning_rate": 4.9243204442808456e-05, + "loss": 0.0455, + "step": 24920 + }, + { + "epoch": 0.11965, + "grad_norm": 0.14594145119190216, + "learning_rate": 4.924219472207778e-05, + "loss": 0.0442, + "step": 24930 + }, + { + "epoch": 0.1197, + "grad_norm": 0.1674627661705017, + "learning_rate": 4.9241184338574595e-05, + "loss": 0.0449, + "step": 24940 + }, + { + "epoch": 0.11975, + "grad_norm": 0.15260593593120575, + "learning_rate": 4.924017329232655e-05, + "loss": 0.0422, + "step": 24950 + }, + { + "epoch": 0.1198, + "grad_norm": 0.13431614637374878, + "learning_rate": 4.923916158336127e-05, + "loss": 0.0456, + "step": 24960 + }, + { + "epoch": 0.11985, + "grad_norm": 0.11683829873800278, + "learning_rate": 4.923814921170641e-05, + "loss": 0.047, + "step": 24970 + }, + { + "epoch": 0.1199, + "grad_norm": 0.12878383696079254, + "learning_rate": 4.923713617738967e-05, + "loss": 0.0444, + "step": 24980 + }, + { + "epoch": 0.11995, + "grad_norm": 0.14457914233207703, + "learning_rate": 4.923612248043872e-05, + "loss": 0.0439, + "step": 24990 + }, + { + "epoch": 0.12, + "grad_norm": 0.16910295188426971, + "learning_rate": 4.92351081208813e-05, + "loss": 0.0469, + "step": 25000 + }, + { + "epoch": 0.12005, + "grad_norm": 0.13894687592983246, + "learning_rate": 4.923409309874511e-05, + "loss": 0.0467, + "step": 25010 + }, + { + "epoch": 0.1201, + "grad_norm": 0.1838841736316681, + "learning_rate": 4.923307741405794e-05, + "loss": 0.0475, + "step": 25020 + }, + { + "epoch": 0.12015, + "grad_norm": 0.1557500958442688, + "learning_rate": 4.923206106684752e-05, + "loss": 0.0459, + "step": 25030 + }, + { + "epoch": 0.1202, + "grad_norm": 0.1988830864429474, + "learning_rate": 4.923104405714166e-05, + "loss": 0.0459, + "step": 25040 + }, + { + "epoch": 0.12025, + "grad_norm": 0.16980133950710297, + "learning_rate": 4.9230026384968166e-05, + "loss": 0.0466, + "step": 25050 + }, + { + "epoch": 0.1203, + "grad_norm": 0.13996870815753937, + "learning_rate": 4.922900805035484e-05, + "loss": 0.0446, + "step": 25060 + }, + { + "epoch": 0.12035, + "grad_norm": 0.11587783694267273, + "learning_rate": 4.922798905332955e-05, + "loss": 0.0434, + "step": 25070 + }, + { + "epoch": 0.1204, + "grad_norm": 0.2113139033317566, + "learning_rate": 4.922696939392013e-05, + "loss": 0.0464, + "step": 25080 + }, + { + "epoch": 0.12045, + "grad_norm": 0.12333756685256958, + "learning_rate": 4.9225949072154474e-05, + "loss": 0.047, + "step": 25090 + }, + { + "epoch": 0.1205, + "grad_norm": 0.1441316455602646, + "learning_rate": 4.922492808806047e-05, + "loss": 0.0457, + "step": 25100 + }, + { + "epoch": 0.12055, + "grad_norm": 0.1563645601272583, + "learning_rate": 4.9223906441666036e-05, + "loss": 0.0473, + "step": 25110 + }, + { + "epoch": 0.1206, + "grad_norm": 0.1893489956855774, + "learning_rate": 4.92228841329991e-05, + "loss": 0.0473, + "step": 25120 + }, + { + "epoch": 0.12065, + "grad_norm": 0.15625514090061188, + "learning_rate": 4.922186116208761e-05, + "loss": 0.0486, + "step": 25130 + }, + { + "epoch": 0.1207, + "grad_norm": 0.13410907983779907, + "learning_rate": 4.9220837528959535e-05, + "loss": 0.0466, + "step": 25140 + }, + { + "epoch": 0.12075, + "grad_norm": 0.1733837127685547, + "learning_rate": 4.9219813233642866e-05, + "loss": 0.0453, + "step": 25150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.12287669628858566, + "learning_rate": 4.9218788276165596e-05, + "loss": 0.0443, + "step": 25160 + }, + { + "epoch": 0.12085, + "grad_norm": 0.13462580740451813, + "learning_rate": 4.9217762656555754e-05, + "loss": 0.0452, + "step": 25170 + }, + { + "epoch": 0.1209, + "grad_norm": 0.17285668849945068, + "learning_rate": 4.921673637484138e-05, + "loss": 0.0456, + "step": 25180 + }, + { + "epoch": 0.12095, + "grad_norm": 0.1468110978603363, + "learning_rate": 4.9215709431050535e-05, + "loss": 0.047, + "step": 25190 + }, + { + "epoch": 0.121, + "grad_norm": 0.15016555786132812, + "learning_rate": 4.921468182521128e-05, + "loss": 0.0468, + "step": 25200 + }, + { + "epoch": 0.12105, + "grad_norm": 0.16271093487739563, + "learning_rate": 4.9213653557351736e-05, + "loss": 0.0462, + "step": 25210 + }, + { + "epoch": 0.1211, + "grad_norm": 0.17403465509414673, + "learning_rate": 4.9212624627499994e-05, + "loss": 0.0442, + "step": 25220 + }, + { + "epoch": 0.12115, + "grad_norm": 0.16199059784412384, + "learning_rate": 4.921159503568419e-05, + "loss": 0.0437, + "step": 25230 + }, + { + "epoch": 0.1212, + "grad_norm": 0.14354346692562103, + "learning_rate": 4.921056478193247e-05, + "loss": 0.0459, + "step": 25240 + }, + { + "epoch": 0.12125, + "grad_norm": 0.12012068182229996, + "learning_rate": 4.920953386627301e-05, + "loss": 0.0438, + "step": 25250 + }, + { + "epoch": 0.1213, + "grad_norm": 0.1600666344165802, + "learning_rate": 4.9208502288733996e-05, + "loss": 0.045, + "step": 25260 + }, + { + "epoch": 0.12135, + "grad_norm": 0.19848820567131042, + "learning_rate": 4.920747004934361e-05, + "loss": 0.0462, + "step": 25270 + }, + { + "epoch": 0.1214, + "grad_norm": 0.14214332401752472, + "learning_rate": 4.920643714813009e-05, + "loss": 0.0461, + "step": 25280 + }, + { + "epoch": 0.12145, + "grad_norm": 0.12816590070724487, + "learning_rate": 4.9205403585121676e-05, + "loss": 0.0442, + "step": 25290 + }, + { + "epoch": 0.1215, + "grad_norm": 0.11601614207029343, + "learning_rate": 4.920436936034663e-05, + "loss": 0.045, + "step": 25300 + }, + { + "epoch": 0.12155, + "grad_norm": 0.14862045645713806, + "learning_rate": 4.920333447383321e-05, + "loss": 0.0455, + "step": 25310 + }, + { + "epoch": 0.1216, + "grad_norm": 0.14009587466716766, + "learning_rate": 4.9202298925609716e-05, + "loss": 0.0443, + "step": 25320 + }, + { + "epoch": 0.12165, + "grad_norm": 0.1675596684217453, + "learning_rate": 4.9201262715704455e-05, + "loss": 0.0434, + "step": 25330 + }, + { + "epoch": 0.1217, + "grad_norm": 0.1344049572944641, + "learning_rate": 4.9200225844145777e-05, + "loss": 0.0453, + "step": 25340 + }, + { + "epoch": 0.12175, + "grad_norm": 0.13212081789970398, + "learning_rate": 4.9199188310962006e-05, + "loss": 0.0432, + "step": 25350 + }, + { + "epoch": 0.1218, + "grad_norm": 0.11753246933221817, + "learning_rate": 4.919815011618153e-05, + "loss": 0.0427, + "step": 25360 + }, + { + "epoch": 0.12185, + "grad_norm": 0.16912850737571716, + "learning_rate": 4.9197111259832703e-05, + "loss": 0.0443, + "step": 25370 + }, + { + "epoch": 0.1219, + "grad_norm": 0.14177367091178894, + "learning_rate": 4.9196071741943964e-05, + "loss": 0.0438, + "step": 25380 + }, + { + "epoch": 0.12195, + "grad_norm": 0.11950483918190002, + "learning_rate": 4.91950315625437e-05, + "loss": 0.0464, + "step": 25390 + }, + { + "epoch": 0.122, + "grad_norm": 0.12239759415388107, + "learning_rate": 4.919399072166037e-05, + "loss": 0.0453, + "step": 25400 + }, + { + "epoch": 0.12205, + "grad_norm": 0.13678602874279022, + "learning_rate": 4.919294921932242e-05, + "loss": 0.0441, + "step": 25410 + }, + { + "epoch": 0.1221, + "grad_norm": 0.13878613710403442, + "learning_rate": 4.9191907055558326e-05, + "loss": 0.0445, + "step": 25420 + }, + { + "epoch": 0.12215, + "grad_norm": 0.1621149778366089, + "learning_rate": 4.9190864230396585e-05, + "loss": 0.0464, + "step": 25430 + }, + { + "epoch": 0.1222, + "grad_norm": 0.1646154224872589, + "learning_rate": 4.91898207438657e-05, + "loss": 0.0443, + "step": 25440 + }, + { + "epoch": 0.12225, + "grad_norm": 0.13514678180217743, + "learning_rate": 4.9188776595994215e-05, + "loss": 0.0462, + "step": 25450 + }, + { + "epoch": 0.1223, + "grad_norm": 0.15032826364040375, + "learning_rate": 4.9187731786810654e-05, + "loss": 0.0455, + "step": 25460 + }, + { + "epoch": 0.12235, + "grad_norm": 0.1921292394399643, + "learning_rate": 4.9186686316343586e-05, + "loss": 0.0461, + "step": 25470 + }, + { + "epoch": 0.1224, + "grad_norm": 0.1746305674314499, + "learning_rate": 4.918564018462162e-05, + "loss": 0.0472, + "step": 25480 + }, + { + "epoch": 0.12245, + "grad_norm": 0.1615629643201828, + "learning_rate": 4.9184593391673325e-05, + "loss": 0.0451, + "step": 25490 + }, + { + "epoch": 0.1225, + "grad_norm": 0.13830015063285828, + "learning_rate": 4.918354593752733e-05, + "loss": 0.0473, + "step": 25500 + }, + { + "epoch": 0.12255, + "grad_norm": 0.16088566184043884, + "learning_rate": 4.9182497822212284e-05, + "loss": 0.0514, + "step": 25510 + }, + { + "epoch": 0.1226, + "grad_norm": 0.1678658276796341, + "learning_rate": 4.9181449045756825e-05, + "loss": 0.0466, + "step": 25520 + }, + { + "epoch": 0.12265, + "grad_norm": 0.21823139488697052, + "learning_rate": 4.918039960818963e-05, + "loss": 0.0472, + "step": 25530 + }, + { + "epoch": 0.1227, + "grad_norm": 0.19542300701141357, + "learning_rate": 4.9179349509539404e-05, + "loss": 0.0481, + "step": 25540 + }, + { + "epoch": 0.12275, + "grad_norm": 0.15862037241458893, + "learning_rate": 4.917829874983484e-05, + "loss": 0.0437, + "step": 25550 + }, + { + "epoch": 0.1228, + "grad_norm": 0.1674732118844986, + "learning_rate": 4.917724732910467e-05, + "loss": 0.0475, + "step": 25560 + }, + { + "epoch": 0.12285, + "grad_norm": 0.17866535484790802, + "learning_rate": 4.917619524737765e-05, + "loss": 0.0442, + "step": 25570 + }, + { + "epoch": 0.1229, + "grad_norm": 0.15923969447612762, + "learning_rate": 4.917514250468252e-05, + "loss": 0.0445, + "step": 25580 + }, + { + "epoch": 0.12295, + "grad_norm": 0.16598527133464813, + "learning_rate": 4.9174089101048094e-05, + "loss": 0.0437, + "step": 25590 + }, + { + "epoch": 0.123, + "grad_norm": 0.17330916225910187, + "learning_rate": 4.917303503650314e-05, + "loss": 0.0479, + "step": 25600 + }, + { + "epoch": 0.12305, + "grad_norm": 0.15099115669727325, + "learning_rate": 4.91719803110765e-05, + "loss": 0.0449, + "step": 25610 + }, + { + "epoch": 0.1231, + "grad_norm": 0.16531068086624146, + "learning_rate": 4.917092492479699e-05, + "loss": 0.0466, + "step": 25620 + }, + { + "epoch": 0.12315, + "grad_norm": 0.18818257749080658, + "learning_rate": 4.9169868877693484e-05, + "loss": 0.0456, + "step": 25630 + }, + { + "epoch": 0.1232, + "grad_norm": 0.17593468725681305, + "learning_rate": 4.916881216979483e-05, + "loss": 0.0451, + "step": 25640 + }, + { + "epoch": 0.12325, + "grad_norm": 0.17095914483070374, + "learning_rate": 4.916775480112994e-05, + "loss": 0.046, + "step": 25650 + }, + { + "epoch": 0.1233, + "grad_norm": 0.1490679830312729, + "learning_rate": 4.916669677172771e-05, + "loss": 0.0454, + "step": 25660 + }, + { + "epoch": 0.12335, + "grad_norm": 0.1847240924835205, + "learning_rate": 4.9165638081617065e-05, + "loss": 0.046, + "step": 25670 + }, + { + "epoch": 0.1234, + "grad_norm": 0.16121570765972137, + "learning_rate": 4.916457873082696e-05, + "loss": 0.0473, + "step": 25680 + }, + { + "epoch": 0.12345, + "grad_norm": 0.15476711094379425, + "learning_rate": 4.916351871938635e-05, + "loss": 0.044, + "step": 25690 + }, + { + "epoch": 0.1235, + "grad_norm": 0.17886728048324585, + "learning_rate": 4.916245804732421e-05, + "loss": 0.047, + "step": 25700 + }, + { + "epoch": 0.12355, + "grad_norm": 0.14230097830295563, + "learning_rate": 4.916139671466955e-05, + "loss": 0.0445, + "step": 25710 + }, + { + "epoch": 0.1236, + "grad_norm": 0.2042488008737564, + "learning_rate": 4.9160334721451386e-05, + "loss": 0.0473, + "step": 25720 + }, + { + "epoch": 0.12365, + "grad_norm": 0.17995183169841766, + "learning_rate": 4.9159272067698734e-05, + "loss": 0.0469, + "step": 25730 + }, + { + "epoch": 0.1237, + "grad_norm": 0.15162166953086853, + "learning_rate": 4.9158208753440674e-05, + "loss": 0.0461, + "step": 25740 + }, + { + "epoch": 0.12375, + "grad_norm": 0.13117767870426178, + "learning_rate": 4.915714477870625e-05, + "loss": 0.0443, + "step": 25750 + }, + { + "epoch": 0.1238, + "grad_norm": 0.12508119642734528, + "learning_rate": 4.915608014352457e-05, + "loss": 0.0453, + "step": 25760 + }, + { + "epoch": 0.12385, + "grad_norm": 0.14855307340621948, + "learning_rate": 4.915501484792473e-05, + "loss": 0.0442, + "step": 25770 + }, + { + "epoch": 0.1239, + "grad_norm": 0.13553832471370697, + "learning_rate": 4.9153948891935866e-05, + "loss": 0.0463, + "step": 25780 + }, + { + "epoch": 0.12395, + "grad_norm": 0.16864003241062164, + "learning_rate": 4.915288227558711e-05, + "loss": 0.0455, + "step": 25790 + }, + { + "epoch": 0.124, + "grad_norm": 0.15773332118988037, + "learning_rate": 4.915181499890762e-05, + "loss": 0.0449, + "step": 25800 + }, + { + "epoch": 0.12405, + "grad_norm": 0.16534097492694855, + "learning_rate": 4.9150747061926584e-05, + "loss": 0.0445, + "step": 25810 + }, + { + "epoch": 0.1241, + "grad_norm": 0.13091982901096344, + "learning_rate": 4.9149678464673196e-05, + "loss": 0.045, + "step": 25820 + }, + { + "epoch": 0.12415, + "grad_norm": 0.13769251108169556, + "learning_rate": 4.914860920717668e-05, + "loss": 0.0476, + "step": 25830 + }, + { + "epoch": 0.1242, + "grad_norm": 0.14034955203533173, + "learning_rate": 4.9147539289466256e-05, + "loss": 0.0444, + "step": 25840 + }, + { + "epoch": 0.12425, + "grad_norm": 0.13416755199432373, + "learning_rate": 4.914646871157118e-05, + "loss": 0.043, + "step": 25850 + }, + { + "epoch": 0.1243, + "grad_norm": 0.14412455260753632, + "learning_rate": 4.9145397473520715e-05, + "loss": 0.0443, + "step": 25860 + }, + { + "epoch": 0.12435, + "grad_norm": 0.14550353586673737, + "learning_rate": 4.9144325575344166e-05, + "loss": 0.0429, + "step": 25870 + }, + { + "epoch": 0.1244, + "grad_norm": 0.1473642885684967, + "learning_rate": 4.914325301707081e-05, + "loss": 0.0471, + "step": 25880 + }, + { + "epoch": 0.12445, + "grad_norm": 0.14089325070381165, + "learning_rate": 4.914217979873e-05, + "loss": 0.0477, + "step": 25890 + }, + { + "epoch": 0.1245, + "grad_norm": 0.1616249531507492, + "learning_rate": 4.914110592035106e-05, + "loss": 0.0454, + "step": 25900 + }, + { + "epoch": 0.12455, + "grad_norm": 0.14541175961494446, + "learning_rate": 4.9140031381963347e-05, + "loss": 0.0489, + "step": 25910 + }, + { + "epoch": 0.1246, + "grad_norm": 0.21233685314655304, + "learning_rate": 4.913895618359625e-05, + "loss": 0.0451, + "step": 25920 + }, + { + "epoch": 0.12465, + "grad_norm": 0.15763972699642181, + "learning_rate": 4.913788032527916e-05, + "loss": 0.0459, + "step": 25930 + }, + { + "epoch": 0.1247, + "grad_norm": 0.1575399935245514, + "learning_rate": 4.91368038070415e-05, + "loss": 0.0458, + "step": 25940 + }, + { + "epoch": 0.12475, + "grad_norm": 0.1579417586326599, + "learning_rate": 4.9135726628912675e-05, + "loss": 0.0481, + "step": 25950 + }, + { + "epoch": 0.1248, + "grad_norm": 0.15620434284210205, + "learning_rate": 4.913464879092216e-05, + "loss": 0.0483, + "step": 25960 + }, + { + "epoch": 0.12485, + "grad_norm": 0.20243607461452484, + "learning_rate": 4.913357029309941e-05, + "loss": 0.0456, + "step": 25970 + }, + { + "epoch": 0.1249, + "grad_norm": 0.1301449090242386, + "learning_rate": 4.913249113547392e-05, + "loss": 0.0469, + "step": 25980 + }, + { + "epoch": 0.12495, + "grad_norm": 0.14186380803585052, + "learning_rate": 4.913141131807518e-05, + "loss": 0.0471, + "step": 25990 + }, + { + "epoch": 0.125, + "grad_norm": 0.13385401666164398, + "learning_rate": 4.913033084093273e-05, + "loss": 0.044, + "step": 26000 + }, + { + "epoch": 0.12505, + "grad_norm": 0.1556403785943985, + "learning_rate": 4.91292497040761e-05, + "loss": 0.0481, + "step": 26010 + }, + { + "epoch": 0.1251, + "grad_norm": 0.14860087633132935, + "learning_rate": 4.912816790753484e-05, + "loss": 0.0444, + "step": 26020 + }, + { + "epoch": 0.12515, + "grad_norm": 0.1349562108516693, + "learning_rate": 4.9127085451338536e-05, + "loss": 0.044, + "step": 26030 + }, + { + "epoch": 0.1252, + "grad_norm": 0.10741005092859268, + "learning_rate": 4.9126002335516787e-05, + "loss": 0.0458, + "step": 26040 + }, + { + "epoch": 0.12525, + "grad_norm": 0.15222665667533875, + "learning_rate": 4.912491856009919e-05, + "loss": 0.0457, + "step": 26050 + }, + { + "epoch": 0.1253, + "grad_norm": 0.14157378673553467, + "learning_rate": 4.9123834125115384e-05, + "loss": 0.0456, + "step": 26060 + }, + { + "epoch": 0.12535, + "grad_norm": 0.15087658166885376, + "learning_rate": 4.9122749030595024e-05, + "loss": 0.0444, + "step": 26070 + }, + { + "epoch": 0.1254, + "grad_norm": 0.16578349471092224, + "learning_rate": 4.912166327656776e-05, + "loss": 0.0454, + "step": 26080 + }, + { + "epoch": 0.12545, + "grad_norm": 0.13934017717838287, + "learning_rate": 4.912057686306328e-05, + "loss": 0.0462, + "step": 26090 + }, + { + "epoch": 0.1255, + "grad_norm": 0.141837477684021, + "learning_rate": 4.91194897901113e-05, + "loss": 0.0441, + "step": 26100 + }, + { + "epoch": 0.12555, + "grad_norm": 0.15003222227096558, + "learning_rate": 4.911840205774153e-05, + "loss": 0.0459, + "step": 26110 + }, + { + "epoch": 0.1256, + "grad_norm": 0.20069071650505066, + "learning_rate": 4.911731366598371e-05, + "loss": 0.045, + "step": 26120 + }, + { + "epoch": 0.12565, + "grad_norm": 0.15775476396083832, + "learning_rate": 4.911622461486759e-05, + "loss": 0.0444, + "step": 26130 + }, + { + "epoch": 0.1257, + "grad_norm": 0.14468349516391754, + "learning_rate": 4.9115134904422946e-05, + "loss": 0.0495, + "step": 26140 + }, + { + "epoch": 0.12575, + "grad_norm": 0.1316310465335846, + "learning_rate": 4.911404453467957e-05, + "loss": 0.0445, + "step": 26150 + }, + { + "epoch": 0.1258, + "grad_norm": 0.12668436765670776, + "learning_rate": 4.9112953505667286e-05, + "loss": 0.0454, + "step": 26160 + }, + { + "epoch": 0.12585, + "grad_norm": 0.14326219260692596, + "learning_rate": 4.9111861817415905e-05, + "loss": 0.0459, + "step": 26170 + }, + { + "epoch": 0.1259, + "grad_norm": 0.14999836683273315, + "learning_rate": 4.9110769469955285e-05, + "loss": 0.0446, + "step": 26180 + }, + { + "epoch": 0.12595, + "grad_norm": 0.13554710149765015, + "learning_rate": 4.910967646331528e-05, + "loss": 0.0434, + "step": 26190 + }, + { + "epoch": 0.126, + "grad_norm": 0.1860104203224182, + "learning_rate": 4.9108582797525786e-05, + "loss": 0.0459, + "step": 26200 + }, + { + "epoch": 0.12605, + "grad_norm": 0.13717034459114075, + "learning_rate": 4.9107488472616694e-05, + "loss": 0.045, + "step": 26210 + }, + { + "epoch": 0.1261, + "grad_norm": 0.15142101049423218, + "learning_rate": 4.910639348861792e-05, + "loss": 0.0478, + "step": 26220 + }, + { + "epoch": 0.12615, + "grad_norm": 0.14934422075748444, + "learning_rate": 4.9105297845559405e-05, + "loss": 0.048, + "step": 26230 + }, + { + "epoch": 0.1262, + "grad_norm": 0.1412418782711029, + "learning_rate": 4.9104201543471104e-05, + "loss": 0.0473, + "step": 26240 + }, + { + "epoch": 0.12625, + "grad_norm": 0.1649186909198761, + "learning_rate": 4.910310458238298e-05, + "loss": 0.0452, + "step": 26250 + }, + { + "epoch": 0.1263, + "grad_norm": 0.1373409926891327, + "learning_rate": 4.9102006962325056e-05, + "loss": 0.0442, + "step": 26260 + }, + { + "epoch": 0.12635, + "grad_norm": 0.14176934957504272, + "learning_rate": 4.9100908683327294e-05, + "loss": 0.0446, + "step": 26270 + }, + { + "epoch": 0.1264, + "grad_norm": 0.13973231613636017, + "learning_rate": 4.909980974541975e-05, + "loss": 0.0457, + "step": 26280 + }, + { + "epoch": 0.12645, + "grad_norm": 0.14384910464286804, + "learning_rate": 4.909871014863246e-05, + "loss": 0.0446, + "step": 26290 + }, + { + "epoch": 0.1265, + "grad_norm": 0.13749195635318756, + "learning_rate": 4.909760989299549e-05, + "loss": 0.0463, + "step": 26300 + }, + { + "epoch": 0.12655, + "grad_norm": 0.1508299857378006, + "learning_rate": 4.9096508978538914e-05, + "loss": 0.0488, + "step": 26310 + }, + { + "epoch": 0.1266, + "grad_norm": 0.1548951119184494, + "learning_rate": 4.9095407405292834e-05, + "loss": 0.0471, + "step": 26320 + }, + { + "epoch": 0.12665, + "grad_norm": 0.150093212723732, + "learning_rate": 4.909430517328738e-05, + "loss": 0.0442, + "step": 26330 + }, + { + "epoch": 0.1267, + "grad_norm": 0.15900050103664398, + "learning_rate": 4.9093202282552666e-05, + "loss": 0.0443, + "step": 26340 + }, + { + "epoch": 0.12675, + "grad_norm": 0.15392284095287323, + "learning_rate": 4.909209873311885e-05, + "loss": 0.0431, + "step": 26350 + }, + { + "epoch": 0.1268, + "grad_norm": 0.13655874133110046, + "learning_rate": 4.909099452501611e-05, + "loss": 0.0493, + "step": 26360 + }, + { + "epoch": 0.12685, + "grad_norm": 0.1434694528579712, + "learning_rate": 4.908988965827463e-05, + "loss": 0.0451, + "step": 26370 + }, + { + "epoch": 0.1269, + "grad_norm": 0.10418988764286041, + "learning_rate": 4.9088784132924616e-05, + "loss": 0.0446, + "step": 26380 + }, + { + "epoch": 0.12695, + "grad_norm": 0.13806277513504028, + "learning_rate": 4.908767794899629e-05, + "loss": 0.0449, + "step": 26390 + }, + { + "epoch": 0.127, + "grad_norm": 0.17036783695220947, + "learning_rate": 4.908657110651991e-05, + "loss": 0.0456, + "step": 26400 + }, + { + "epoch": 0.12705, + "grad_norm": 0.1243971735239029, + "learning_rate": 4.9085463605525716e-05, + "loss": 0.0449, + "step": 26410 + }, + { + "epoch": 0.1271, + "grad_norm": 0.1742110699415207, + "learning_rate": 4.9084355446044e-05, + "loss": 0.0468, + "step": 26420 + }, + { + "epoch": 0.12715, + "grad_norm": 0.17756503820419312, + "learning_rate": 4.908324662810505e-05, + "loss": 0.0442, + "step": 26430 + }, + { + "epoch": 0.1272, + "grad_norm": 0.13110657036304474, + "learning_rate": 4.908213715173918e-05, + "loss": 0.0464, + "step": 26440 + }, + { + "epoch": 0.12725, + "grad_norm": 0.13226915895938873, + "learning_rate": 4.9081027016976736e-05, + "loss": 0.0449, + "step": 26450 + }, + { + "epoch": 0.1273, + "grad_norm": 0.12369966506958008, + "learning_rate": 4.9079916223848055e-05, + "loss": 0.0453, + "step": 26460 + }, + { + "epoch": 0.12735, + "grad_norm": 0.18580640852451324, + "learning_rate": 4.907880477238351e-05, + "loss": 0.0456, + "step": 26470 + }, + { + "epoch": 0.1274, + "grad_norm": 0.17344515025615692, + "learning_rate": 4.9077692662613496e-05, + "loss": 0.0463, + "step": 26480 + }, + { + "epoch": 0.12745, + "grad_norm": 0.17119541764259338, + "learning_rate": 4.90765798945684e-05, + "loss": 0.0495, + "step": 26490 + }, + { + "epoch": 0.1275, + "grad_norm": 0.167933389544487, + "learning_rate": 4.907546646827866e-05, + "loss": 0.0475, + "step": 26500 + }, + { + "epoch": 0.12755, + "grad_norm": 0.16994008421897888, + "learning_rate": 4.907435238377471e-05, + "loss": 0.0449, + "step": 26510 + }, + { + "epoch": 0.1276, + "grad_norm": 0.1789732128381729, + "learning_rate": 4.9073237641087014e-05, + "loss": 0.0442, + "step": 26520 + }, + { + "epoch": 0.12765, + "grad_norm": 0.19679789245128632, + "learning_rate": 4.907212224024604e-05, + "loss": 0.0473, + "step": 26530 + }, + { + "epoch": 0.1277, + "grad_norm": 0.1414823681116104, + "learning_rate": 4.90710061812823e-05, + "loss": 0.0476, + "step": 26540 + }, + { + "epoch": 0.12775, + "grad_norm": 0.14352194964885712, + "learning_rate": 4.906988946422628e-05, + "loss": 0.0489, + "step": 26550 + }, + { + "epoch": 0.1278, + "grad_norm": 0.12210658937692642, + "learning_rate": 4.906877208910853e-05, + "loss": 0.0454, + "step": 26560 + }, + { + "epoch": 0.12785, + "grad_norm": 0.15541048347949982, + "learning_rate": 4.906765405595959e-05, + "loss": 0.0472, + "step": 26570 + }, + { + "epoch": 0.1279, + "grad_norm": 0.14378651976585388, + "learning_rate": 4.9066535364810034e-05, + "loss": 0.0456, + "step": 26580 + }, + { + "epoch": 0.12795, + "grad_norm": 0.15418651700019836, + "learning_rate": 4.906541601569044e-05, + "loss": 0.0465, + "step": 26590 + }, + { + "epoch": 0.128, + "grad_norm": 0.1363755464553833, + "learning_rate": 4.9064296008631414e-05, + "loss": 0.0468, + "step": 26600 + }, + { + "epoch": 0.12805, + "grad_norm": 0.15140806138515472, + "learning_rate": 4.9063175343663574e-05, + "loss": 0.0467, + "step": 26610 + }, + { + "epoch": 0.1281, + "grad_norm": 0.1380670815706253, + "learning_rate": 4.906205402081756e-05, + "loss": 0.0437, + "step": 26620 + }, + { + "epoch": 0.12815, + "grad_norm": 0.13127738237380981, + "learning_rate": 4.906093204012403e-05, + "loss": 0.0439, + "step": 26630 + }, + { + "epoch": 0.1282, + "grad_norm": 0.12910018861293793, + "learning_rate": 4.905980940161366e-05, + "loss": 0.0453, + "step": 26640 + }, + { + "epoch": 0.12825, + "grad_norm": 0.1474410444498062, + "learning_rate": 4.905868610531714e-05, + "loss": 0.045, + "step": 26650 + }, + { + "epoch": 0.1283, + "grad_norm": 0.11991109699010849, + "learning_rate": 4.905756215126518e-05, + "loss": 0.0447, + "step": 26660 + }, + { + "epoch": 0.12835, + "grad_norm": 0.14297989010810852, + "learning_rate": 4.9056437539488506e-05, + "loss": 0.0445, + "step": 26670 + }, + { + "epoch": 0.1284, + "grad_norm": 0.14665597677230835, + "learning_rate": 4.905531227001786e-05, + "loss": 0.0439, + "step": 26680 + }, + { + "epoch": 0.12845, + "grad_norm": 0.14444920420646667, + "learning_rate": 4.905418634288402e-05, + "loss": 0.0458, + "step": 26690 + }, + { + "epoch": 0.1285, + "grad_norm": 0.13890399038791656, + "learning_rate": 4.905305975811777e-05, + "loss": 0.0442, + "step": 26700 + }, + { + "epoch": 0.12855, + "grad_norm": 0.14420518279075623, + "learning_rate": 4.9051932515749896e-05, + "loss": 0.0481, + "step": 26710 + }, + { + "epoch": 0.1286, + "grad_norm": 0.1519700139760971, + "learning_rate": 4.905080461581123e-05, + "loss": 0.0449, + "step": 26720 + }, + { + "epoch": 0.12865, + "grad_norm": 0.14807257056236267, + "learning_rate": 4.904967605833259e-05, + "loss": 0.0488, + "step": 26730 + }, + { + "epoch": 0.1287, + "grad_norm": 0.16089566051959991, + "learning_rate": 4.9048546843344846e-05, + "loss": 0.0439, + "step": 26740 + }, + { + "epoch": 0.12875, + "grad_norm": 0.1584918200969696, + "learning_rate": 4.9047416970878866e-05, + "loss": 0.0474, + "step": 26750 + }, + { + "epoch": 0.1288, + "grad_norm": 0.1363014280796051, + "learning_rate": 4.9046286440965535e-05, + "loss": 0.0443, + "step": 26760 + }, + { + "epoch": 0.12885, + "grad_norm": 0.13531547784805298, + "learning_rate": 4.9045155253635776e-05, + "loss": 0.0441, + "step": 26770 + }, + { + "epoch": 0.1289, + "grad_norm": 0.14951872825622559, + "learning_rate": 4.90440234089205e-05, + "loss": 0.0437, + "step": 26780 + }, + { + "epoch": 0.12895, + "grad_norm": 0.15993520617485046, + "learning_rate": 4.9042890906850655e-05, + "loss": 0.0494, + "step": 26790 + }, + { + "epoch": 0.129, + "grad_norm": 0.13922452926635742, + "learning_rate": 4.9041757747457215e-05, + "loss": 0.0466, + "step": 26800 + }, + { + "epoch": 0.12905, + "grad_norm": 0.15624390542507172, + "learning_rate": 4.904062393077114e-05, + "loss": 0.0479, + "step": 26810 + }, + { + "epoch": 0.1291, + "grad_norm": 0.13383163511753082, + "learning_rate": 4.903948945682344e-05, + "loss": 0.0449, + "step": 26820 + }, + { + "epoch": 0.12915, + "grad_norm": 0.15837447345256805, + "learning_rate": 4.903835432564513e-05, + "loss": 0.0456, + "step": 26830 + }, + { + "epoch": 0.1292, + "grad_norm": 0.13015559315681458, + "learning_rate": 4.903721853726725e-05, + "loss": 0.0443, + "step": 26840 + }, + { + "epoch": 0.12925, + "grad_norm": 0.12030153721570969, + "learning_rate": 4.9036082091720834e-05, + "loss": 0.0445, + "step": 26850 + }, + { + "epoch": 0.1293, + "grad_norm": 0.12426728010177612, + "learning_rate": 4.903494498903698e-05, + "loss": 0.045, + "step": 26860 + }, + { + "epoch": 0.12935, + "grad_norm": 0.1436558961868286, + "learning_rate": 4.903380722924674e-05, + "loss": 0.0435, + "step": 26870 + }, + { + "epoch": 0.1294, + "grad_norm": 0.1367773413658142, + "learning_rate": 4.903266881238126e-05, + "loss": 0.0444, + "step": 26880 + }, + { + "epoch": 0.12945, + "grad_norm": 0.14350031316280365, + "learning_rate": 4.903152973847163e-05, + "loss": 0.0469, + "step": 26890 + }, + { + "epoch": 0.1295, + "grad_norm": 0.17975474894046783, + "learning_rate": 4.9030390007549005e-05, + "loss": 0.0482, + "step": 26900 + }, + { + "epoch": 0.12955, + "grad_norm": 0.1254936158657074, + "learning_rate": 4.902924961964455e-05, + "loss": 0.0461, + "step": 26910 + }, + { + "epoch": 0.1296, + "grad_norm": 0.16277149319648743, + "learning_rate": 4.902810857478943e-05, + "loss": 0.048, + "step": 26920 + }, + { + "epoch": 0.12965, + "grad_norm": 0.16418753564357758, + "learning_rate": 4.902696687301486e-05, + "loss": 0.0451, + "step": 26930 + }, + { + "epoch": 0.1297, + "grad_norm": 0.10756832361221313, + "learning_rate": 4.902582451435203e-05, + "loss": 0.0453, + "step": 26940 + }, + { + "epoch": 0.12975, + "grad_norm": 0.12258588522672653, + "learning_rate": 4.902468149883219e-05, + "loss": 0.0464, + "step": 26950 + }, + { + "epoch": 0.1298, + "grad_norm": 0.14724516868591309, + "learning_rate": 4.902353782648659e-05, + "loss": 0.0438, + "step": 26960 + }, + { + "epoch": 0.12985, + "grad_norm": 0.1251763254404068, + "learning_rate": 4.902239349734648e-05, + "loss": 0.0456, + "step": 26970 + }, + { + "epoch": 0.1299, + "grad_norm": 0.13004091382026672, + "learning_rate": 4.9021248511443165e-05, + "loss": 0.0431, + "step": 26980 + }, + { + "epoch": 0.12995, + "grad_norm": 0.14965547621250153, + "learning_rate": 4.9020102868807936e-05, + "loss": 0.046, + "step": 26990 + }, + { + "epoch": 0.13, + "grad_norm": 0.1758066564798355, + "learning_rate": 4.9018956569472115e-05, + "loss": 0.0449, + "step": 27000 + }, + { + "epoch": 0.13005, + "grad_norm": 0.16269785165786743, + "learning_rate": 4.901780961346705e-05, + "loss": 0.0447, + "step": 27010 + }, + { + "epoch": 0.1301, + "grad_norm": 0.1386614590883255, + "learning_rate": 4.9016662000824086e-05, + "loss": 0.0447, + "step": 27020 + }, + { + "epoch": 0.13015, + "grad_norm": 0.15130813419818878, + "learning_rate": 4.901551373157461e-05, + "loss": 0.0459, + "step": 27030 + }, + { + "epoch": 0.1302, + "grad_norm": 0.142473042011261, + "learning_rate": 4.9014364805750016e-05, + "loss": 0.0471, + "step": 27040 + }, + { + "epoch": 0.13025, + "grad_norm": 0.1500493437051773, + "learning_rate": 4.9013215223381705e-05, + "loss": 0.0451, + "step": 27050 + }, + { + "epoch": 0.1303, + "grad_norm": 0.1761331856250763, + "learning_rate": 4.9012064984501115e-05, + "loss": 0.0439, + "step": 27060 + }, + { + "epoch": 0.13035, + "grad_norm": 0.22045297920703888, + "learning_rate": 4.901091408913968e-05, + "loss": 0.0475, + "step": 27070 + }, + { + "epoch": 0.1304, + "grad_norm": 0.19645242393016815, + "learning_rate": 4.9009762537328885e-05, + "loss": 0.0474, + "step": 27080 + }, + { + "epoch": 0.13045, + "grad_norm": 0.13204947113990784, + "learning_rate": 4.90086103291002e-05, + "loss": 0.0479, + "step": 27090 + }, + { + "epoch": 0.1305, + "grad_norm": 0.1573173999786377, + "learning_rate": 4.900745746448512e-05, + "loss": 0.0456, + "step": 27100 + }, + { + "epoch": 0.13055, + "grad_norm": 0.14312538504600525, + "learning_rate": 4.9006303943515184e-05, + "loss": 0.0455, + "step": 27110 + }, + { + "epoch": 0.1306, + "grad_norm": 0.13176430761814117, + "learning_rate": 4.9005149766221915e-05, + "loss": 0.0445, + "step": 27120 + }, + { + "epoch": 0.13065, + "grad_norm": 0.16872818768024445, + "learning_rate": 4.900399493263686e-05, + "loss": 0.0449, + "step": 27130 + }, + { + "epoch": 0.1307, + "grad_norm": 0.1655738353729248, + "learning_rate": 4.900283944279161e-05, + "loss": 0.0439, + "step": 27140 + }, + { + "epoch": 0.13075, + "grad_norm": 0.15001314878463745, + "learning_rate": 4.9001683296717744e-05, + "loss": 0.0442, + "step": 27150 + }, + { + "epoch": 0.1308, + "grad_norm": 0.13303294777870178, + "learning_rate": 4.9000526494446874e-05, + "loss": 0.045, + "step": 27160 + }, + { + "epoch": 0.13085, + "grad_norm": 0.13751111924648285, + "learning_rate": 4.899936903601062e-05, + "loss": 0.0446, + "step": 27170 + }, + { + "epoch": 0.1309, + "grad_norm": 0.12845827639102936, + "learning_rate": 4.8998210921440647e-05, + "loss": 0.0457, + "step": 27180 + }, + { + "epoch": 0.13095, + "grad_norm": 0.10752175748348236, + "learning_rate": 4.899705215076859e-05, + "loss": 0.0422, + "step": 27190 + }, + { + "epoch": 0.131, + "grad_norm": 0.13293281197547913, + "learning_rate": 4.8995892724026146e-05, + "loss": 0.0424, + "step": 27200 + }, + { + "epoch": 0.13105, + "grad_norm": 0.1374746561050415, + "learning_rate": 4.899473264124501e-05, + "loss": 0.0454, + "step": 27210 + }, + { + "epoch": 0.1311, + "grad_norm": 0.12084544450044632, + "learning_rate": 4.89935719024569e-05, + "loss": 0.0422, + "step": 27220 + }, + { + "epoch": 0.13115, + "grad_norm": 0.13153678178787231, + "learning_rate": 4.8992410507693554e-05, + "loss": 0.044, + "step": 27230 + }, + { + "epoch": 0.1312, + "grad_norm": 0.17095345258712769, + "learning_rate": 4.8991248456986714e-05, + "loss": 0.0478, + "step": 27240 + }, + { + "epoch": 0.13125, + "grad_norm": 0.1500340700149536, + "learning_rate": 4.899008575036815e-05, + "loss": 0.0442, + "step": 27250 + }, + { + "epoch": 0.1313, + "grad_norm": 0.15221814811229706, + "learning_rate": 4.898892238786965e-05, + "loss": 0.0439, + "step": 27260 + }, + { + "epoch": 0.13135, + "grad_norm": 0.1547854095697403, + "learning_rate": 4.898775836952303e-05, + "loss": 0.0457, + "step": 27270 + }, + { + "epoch": 0.1314, + "grad_norm": 0.15095393359661102, + "learning_rate": 4.8986593695360114e-05, + "loss": 0.044, + "step": 27280 + }, + { + "epoch": 0.13145, + "grad_norm": 0.1295640468597412, + "learning_rate": 4.8985428365412734e-05, + "loss": 0.0441, + "step": 27290 + }, + { + "epoch": 0.1315, + "grad_norm": 0.1503930389881134, + "learning_rate": 4.898426237971275e-05, + "loss": 0.0432, + "step": 27300 + }, + { + "epoch": 0.13155, + "grad_norm": 0.1323421448469162, + "learning_rate": 4.898309573829204e-05, + "loss": 0.0432, + "step": 27310 + }, + { + "epoch": 0.1316, + "grad_norm": 0.1202586442232132, + "learning_rate": 4.8981928441182514e-05, + "loss": 0.0425, + "step": 27320 + }, + { + "epoch": 0.13165, + "grad_norm": 0.10859017819166183, + "learning_rate": 4.8980760488416064e-05, + "loss": 0.0452, + "step": 27330 + }, + { + "epoch": 0.1317, + "grad_norm": 0.15065470337867737, + "learning_rate": 4.897959188002463e-05, + "loss": 0.0454, + "step": 27340 + }, + { + "epoch": 0.13175, + "grad_norm": 0.16167475283145905, + "learning_rate": 4.897842261604017e-05, + "loss": 0.0437, + "step": 27350 + }, + { + "epoch": 0.1318, + "grad_norm": 0.17504888772964478, + "learning_rate": 4.897725269649464e-05, + "loss": 0.0452, + "step": 27360 + }, + { + "epoch": 0.13185, + "grad_norm": 0.16473330557346344, + "learning_rate": 4.897608212142003e-05, + "loss": 0.0465, + "step": 27370 + }, + { + "epoch": 0.1319, + "grad_norm": 0.15663087368011475, + "learning_rate": 4.897491089084835e-05, + "loss": 0.044, + "step": 27380 + }, + { + "epoch": 0.13195, + "grad_norm": 0.16694441437721252, + "learning_rate": 4.89737390048116e-05, + "loss": 0.0479, + "step": 27390 + }, + { + "epoch": 0.132, + "grad_norm": 0.15354514122009277, + "learning_rate": 4.897256646334184e-05, + "loss": 0.0429, + "step": 27400 + }, + { + "epoch": 0.13205, + "grad_norm": 0.1537885069847107, + "learning_rate": 4.897139326647111e-05, + "loss": 0.0449, + "step": 27410 + }, + { + "epoch": 0.1321, + "grad_norm": 0.18281573057174683, + "learning_rate": 4.897021941423151e-05, + "loss": 0.0442, + "step": 27420 + }, + { + "epoch": 0.13215, + "grad_norm": 0.14189311861991882, + "learning_rate": 4.896904490665511e-05, + "loss": 0.0459, + "step": 27430 + }, + { + "epoch": 0.1322, + "grad_norm": 0.13066235184669495, + "learning_rate": 4.896786974377401e-05, + "loss": 0.0434, + "step": 27440 + }, + { + "epoch": 0.13225, + "grad_norm": 0.12698237597942352, + "learning_rate": 4.896669392562038e-05, + "loss": 0.0431, + "step": 27450 + }, + { + "epoch": 0.1323, + "grad_norm": 0.1511726677417755, + "learning_rate": 4.896551745222633e-05, + "loss": 0.0453, + "step": 27460 + }, + { + "epoch": 0.13235, + "grad_norm": 0.15052133798599243, + "learning_rate": 4.896434032362404e-05, + "loss": 0.0483, + "step": 27470 + }, + { + "epoch": 0.1324, + "grad_norm": 0.14751042425632477, + "learning_rate": 4.8963162539845676e-05, + "loss": 0.0432, + "step": 27480 + }, + { + "epoch": 0.13245, + "grad_norm": 0.130686953663826, + "learning_rate": 4.896198410092347e-05, + "loss": 0.0454, + "step": 27490 + }, + { + "epoch": 0.1325, + "grad_norm": 0.11741486936807632, + "learning_rate": 4.8960805006889604e-05, + "loss": 0.0431, + "step": 27500 + }, + { + "epoch": 0.13255, + "grad_norm": 0.12329643219709396, + "learning_rate": 4.8959625257776344e-05, + "loss": 0.0431, + "step": 27510 + }, + { + "epoch": 0.1326, + "grad_norm": 0.129413902759552, + "learning_rate": 4.895844485361592e-05, + "loss": 0.0419, + "step": 27520 + }, + { + "epoch": 0.13265, + "grad_norm": 0.12024693936109543, + "learning_rate": 4.895726379444062e-05, + "loss": 0.0432, + "step": 27530 + }, + { + "epoch": 0.1327, + "grad_norm": 0.12062715739011765, + "learning_rate": 4.8956082080282726e-05, + "loss": 0.0425, + "step": 27540 + }, + { + "epoch": 0.13275, + "grad_norm": 0.11159638315439224, + "learning_rate": 4.895489971117455e-05, + "loss": 0.042, + "step": 27550 + }, + { + "epoch": 0.1328, + "grad_norm": 0.15674765408039093, + "learning_rate": 4.895371668714841e-05, + "loss": 0.0422, + "step": 27560 + }, + { + "epoch": 0.13285, + "grad_norm": 0.1360040009021759, + "learning_rate": 4.895253300823667e-05, + "loss": 0.0452, + "step": 27570 + }, + { + "epoch": 0.1329, + "grad_norm": 0.13582608103752136, + "learning_rate": 4.8951348674471666e-05, + "loss": 0.0452, + "step": 27580 + }, + { + "epoch": 0.13295, + "grad_norm": 0.11164236068725586, + "learning_rate": 4.8950163685885786e-05, + "loss": 0.0439, + "step": 27590 + }, + { + "epoch": 0.133, + "grad_norm": 0.11445189267396927, + "learning_rate": 4.8948978042511426e-05, + "loss": 0.0441, + "step": 27600 + }, + { + "epoch": 0.13305, + "grad_norm": 0.10876807570457458, + "learning_rate": 4.8947791744381005e-05, + "loss": 0.041, + "step": 27610 + }, + { + "epoch": 0.1331, + "grad_norm": 0.1405716985464096, + "learning_rate": 4.894660479152696e-05, + "loss": 0.0434, + "step": 27620 + }, + { + "epoch": 0.13315, + "grad_norm": 0.11688645929098129, + "learning_rate": 4.8945417183981737e-05, + "loss": 0.0458, + "step": 27630 + }, + { + "epoch": 0.1332, + "grad_norm": 0.14625436067581177, + "learning_rate": 4.894422892177779e-05, + "loss": 0.0431, + "step": 27640 + }, + { + "epoch": 0.13325, + "grad_norm": 0.1444549858570099, + "learning_rate": 4.894304000494764e-05, + "loss": 0.042, + "step": 27650 + }, + { + "epoch": 0.1333, + "grad_norm": 0.13313430547714233, + "learning_rate": 4.894185043352375e-05, + "loss": 0.0439, + "step": 27660 + }, + { + "epoch": 0.13335, + "grad_norm": 0.11160176992416382, + "learning_rate": 4.894066020753868e-05, + "loss": 0.0434, + "step": 27670 + }, + { + "epoch": 0.1334, + "grad_norm": 0.14903420209884644, + "learning_rate": 4.893946932702494e-05, + "loss": 0.0449, + "step": 27680 + }, + { + "epoch": 0.13345, + "grad_norm": 0.13911889493465424, + "learning_rate": 4.893827779201512e-05, + "loss": 0.0425, + "step": 27690 + }, + { + "epoch": 0.1335, + "grad_norm": 0.13201235234737396, + "learning_rate": 4.893708560254177e-05, + "loss": 0.0424, + "step": 27700 + }, + { + "epoch": 0.13355, + "grad_norm": 0.14347508549690247, + "learning_rate": 4.893589275863749e-05, + "loss": 0.0432, + "step": 27710 + }, + { + "epoch": 0.1336, + "grad_norm": 0.13755393028259277, + "learning_rate": 4.8934699260334893e-05, + "loss": 0.043, + "step": 27720 + }, + { + "epoch": 0.13365, + "grad_norm": 0.1434353142976761, + "learning_rate": 4.893350510766661e-05, + "loss": 0.043, + "step": 27730 + }, + { + "epoch": 0.1337, + "grad_norm": 0.21609467267990112, + "learning_rate": 4.8932310300665295e-05, + "loss": 0.0444, + "step": 27740 + }, + { + "epoch": 0.13375, + "grad_norm": 0.15393678843975067, + "learning_rate": 4.89311148393636e-05, + "loss": 0.0476, + "step": 27750 + }, + { + "epoch": 0.1338, + "grad_norm": 0.14801058173179626, + "learning_rate": 4.8929918723794224e-05, + "loss": 0.0447, + "step": 27760 + }, + { + "epoch": 0.13385, + "grad_norm": 0.17422208189964294, + "learning_rate": 4.892872195398985e-05, + "loss": 0.0451, + "step": 27770 + }, + { + "epoch": 0.1339, + "grad_norm": 0.14667102694511414, + "learning_rate": 4.8927524529983224e-05, + "loss": 0.0447, + "step": 27780 + }, + { + "epoch": 0.13395, + "grad_norm": 0.1609266698360443, + "learning_rate": 4.892632645180705e-05, + "loss": 0.0449, + "step": 27790 + }, + { + "epoch": 0.134, + "grad_norm": 0.15593993663787842, + "learning_rate": 4.892512771949411e-05, + "loss": 0.0447, + "step": 27800 + }, + { + "epoch": 0.13405, + "grad_norm": 0.19725289940834045, + "learning_rate": 4.8923928333077164e-05, + "loss": 0.0444, + "step": 27810 + }, + { + "epoch": 0.1341, + "grad_norm": 0.1465511918067932, + "learning_rate": 4.8922728292589e-05, + "loss": 0.0423, + "step": 27820 + }, + { + "epoch": 0.13415, + "grad_norm": 0.15368938446044922, + "learning_rate": 4.8921527598062435e-05, + "loss": 0.0446, + "step": 27830 + }, + { + "epoch": 0.1342, + "grad_norm": 0.1399412304162979, + "learning_rate": 4.892032624953029e-05, + "loss": 0.0435, + "step": 27840 + }, + { + "epoch": 0.13425, + "grad_norm": 0.11387931555509567, + "learning_rate": 4.891912424702542e-05, + "loss": 0.0436, + "step": 27850 + }, + { + "epoch": 0.1343, + "grad_norm": 0.17357230186462402, + "learning_rate": 4.891792159058066e-05, + "loss": 0.048, + "step": 27860 + }, + { + "epoch": 0.13435, + "grad_norm": 0.15355756878852844, + "learning_rate": 4.891671828022893e-05, + "loss": 0.043, + "step": 27870 + }, + { + "epoch": 0.1344, + "grad_norm": 0.17153047025203705, + "learning_rate": 4.89155143160031e-05, + "loss": 0.0445, + "step": 27880 + }, + { + "epoch": 0.13445, + "grad_norm": 0.2016763985157013, + "learning_rate": 4.891430969793609e-05, + "loss": 0.046, + "step": 27890 + }, + { + "epoch": 0.1345, + "grad_norm": 0.16488516330718994, + "learning_rate": 4.891310442606084e-05, + "loss": 0.0448, + "step": 27900 + }, + { + "epoch": 0.13455, + "grad_norm": 0.15825265645980835, + "learning_rate": 4.8911898500410304e-05, + "loss": 0.0437, + "step": 27910 + }, + { + "epoch": 0.1346, + "grad_norm": 0.1382274627685547, + "learning_rate": 4.8910691921017434e-05, + "loss": 0.0439, + "step": 27920 + }, + { + "epoch": 0.13465, + "grad_norm": 0.15109901130199432, + "learning_rate": 4.890948468791524e-05, + "loss": 0.0433, + "step": 27930 + }, + { + "epoch": 0.1347, + "grad_norm": 0.1563548445701599, + "learning_rate": 4.890827680113671e-05, + "loss": 0.0417, + "step": 27940 + }, + { + "epoch": 0.13475, + "grad_norm": 0.15322072803974152, + "learning_rate": 4.890706826071488e-05, + "loss": 0.0429, + "step": 27950 + }, + { + "epoch": 0.1348, + "grad_norm": 0.14017850160598755, + "learning_rate": 4.890585906668278e-05, + "loss": 0.0441, + "step": 27960 + }, + { + "epoch": 0.13485, + "grad_norm": 0.12772151827812195, + "learning_rate": 4.890464921907348e-05, + "loss": 0.0444, + "step": 27970 + }, + { + "epoch": 0.1349, + "grad_norm": 0.1149485856294632, + "learning_rate": 4.890343871792005e-05, + "loss": 0.0444, + "step": 27980 + }, + { + "epoch": 0.13495, + "grad_norm": 0.1375674158334732, + "learning_rate": 4.890222756325558e-05, + "loss": 0.0453, + "step": 27990 + }, + { + "epoch": 0.135, + "grad_norm": 0.1502043753862381, + "learning_rate": 4.8901015755113195e-05, + "loss": 0.0457, + "step": 28000 + }, + { + "epoch": 0.13505, + "grad_norm": 0.1569092720746994, + "learning_rate": 4.889980329352602e-05, + "loss": 0.0467, + "step": 28010 + }, + { + "epoch": 0.1351, + "grad_norm": 0.1355733722448349, + "learning_rate": 4.8898590178527195e-05, + "loss": 0.0441, + "step": 28020 + }, + { + "epoch": 0.13515, + "grad_norm": 0.13171911239624023, + "learning_rate": 4.8897376410149885e-05, + "loss": 0.0468, + "step": 28030 + }, + { + "epoch": 0.1352, + "grad_norm": 0.1389130800962448, + "learning_rate": 4.889616198842729e-05, + "loss": 0.0472, + "step": 28040 + }, + { + "epoch": 0.13525, + "grad_norm": 0.14203593134880066, + "learning_rate": 4.8894946913392616e-05, + "loss": 0.0444, + "step": 28050 + }, + { + "epoch": 0.1353, + "grad_norm": 0.13637006282806396, + "learning_rate": 4.889373118507905e-05, + "loss": 0.0422, + "step": 28060 + }, + { + "epoch": 0.13535, + "grad_norm": 0.15632040798664093, + "learning_rate": 4.889251480351986e-05, + "loss": 0.0454, + "step": 28070 + }, + { + "epoch": 0.1354, + "grad_norm": 0.14616303145885468, + "learning_rate": 4.889129776874829e-05, + "loss": 0.0458, + "step": 28080 + }, + { + "epoch": 0.13545, + "grad_norm": 0.1338450163602829, + "learning_rate": 4.889008008079762e-05, + "loss": 0.0453, + "step": 28090 + }, + { + "epoch": 0.1355, + "grad_norm": 0.1341494768857956, + "learning_rate": 4.888886173970113e-05, + "loss": 0.0444, + "step": 28100 + }, + { + "epoch": 0.13555, + "grad_norm": 0.13778996467590332, + "learning_rate": 4.888764274549213e-05, + "loss": 0.0446, + "step": 28110 + }, + { + "epoch": 0.1356, + "grad_norm": 0.12898795306682587, + "learning_rate": 4.888642309820396e-05, + "loss": 0.0434, + "step": 28120 + }, + { + "epoch": 0.13565, + "grad_norm": 0.13595975935459137, + "learning_rate": 4.888520279786996e-05, + "loss": 0.0444, + "step": 28130 + }, + { + "epoch": 0.1357, + "grad_norm": 0.1189492866396904, + "learning_rate": 4.8883981844523476e-05, + "loss": 0.0447, + "step": 28140 + }, + { + "epoch": 0.13575, + "grad_norm": 0.15735140442848206, + "learning_rate": 4.8882760238197906e-05, + "loss": 0.0425, + "step": 28150 + }, + { + "epoch": 0.1358, + "grad_norm": 0.1403653472661972, + "learning_rate": 4.888153797892665e-05, + "loss": 0.0424, + "step": 28160 + }, + { + "epoch": 0.13585, + "grad_norm": 0.1440337896347046, + "learning_rate": 4.888031506674311e-05, + "loss": 0.0425, + "step": 28170 + }, + { + "epoch": 0.1359, + "grad_norm": 0.13385984301567078, + "learning_rate": 4.887909150168073e-05, + "loss": 0.0423, + "step": 28180 + }, + { + "epoch": 0.13595, + "grad_norm": 0.1268351823091507, + "learning_rate": 4.8877867283772956e-05, + "loss": 0.0411, + "step": 28190 + }, + { + "epoch": 0.136, + "grad_norm": 0.1349794566631317, + "learning_rate": 4.8876642413053266e-05, + "loss": 0.0444, + "step": 28200 + }, + { + "epoch": 0.13605, + "grad_norm": 0.13911347091197968, + "learning_rate": 4.887541688955514e-05, + "loss": 0.0418, + "step": 28210 + }, + { + "epoch": 0.1361, + "grad_norm": 0.173640638589859, + "learning_rate": 4.8874190713312086e-05, + "loss": 0.0451, + "step": 28220 + }, + { + "epoch": 0.13615, + "grad_norm": 0.1754017323255539, + "learning_rate": 4.887296388435763e-05, + "loss": 0.045, + "step": 28230 + }, + { + "epoch": 0.1362, + "grad_norm": 0.1387481838464737, + "learning_rate": 4.88717364027253e-05, + "loss": 0.0438, + "step": 28240 + }, + { + "epoch": 0.13625, + "grad_norm": 0.1442059576511383, + "learning_rate": 4.8870508268448676e-05, + "loss": 0.0443, + "step": 28250 + }, + { + "epoch": 0.1363, + "grad_norm": 0.1649225652217865, + "learning_rate": 4.8869279481561316e-05, + "loss": 0.0437, + "step": 28260 + }, + { + "epoch": 0.13635, + "grad_norm": 0.13748526573181152, + "learning_rate": 4.886805004209682e-05, + "loss": 0.0432, + "step": 28270 + }, + { + "epoch": 0.1364, + "grad_norm": 0.12064554542303085, + "learning_rate": 4.886681995008881e-05, + "loss": 0.0422, + "step": 28280 + }, + { + "epoch": 0.13645, + "grad_norm": 0.1495683491230011, + "learning_rate": 4.886558920557091e-05, + "loss": 0.0426, + "step": 28290 + }, + { + "epoch": 0.1365, + "grad_norm": 0.1313278079032898, + "learning_rate": 4.8864357808576765e-05, + "loss": 0.0417, + "step": 28300 + }, + { + "epoch": 0.13655, + "grad_norm": 0.11771807074546814, + "learning_rate": 4.8863125759140036e-05, + "loss": 0.044, + "step": 28310 + }, + { + "epoch": 0.1366, + "grad_norm": 0.12219876050949097, + "learning_rate": 4.886189305729443e-05, + "loss": 0.0433, + "step": 28320 + }, + { + "epoch": 0.13665, + "grad_norm": 0.13192905485630035, + "learning_rate": 4.886065970307362e-05, + "loss": 0.045, + "step": 28330 + }, + { + "epoch": 0.1367, + "grad_norm": 0.13180845975875854, + "learning_rate": 4.885942569651134e-05, + "loss": 0.0421, + "step": 28340 + }, + { + "epoch": 0.13675, + "grad_norm": 0.14230282604694366, + "learning_rate": 4.885819103764132e-05, + "loss": 0.0423, + "step": 28350 + }, + { + "epoch": 0.1368, + "grad_norm": 0.14785164594650269, + "learning_rate": 4.8856955726497327e-05, + "loss": 0.043, + "step": 28360 + }, + { + "epoch": 0.13685, + "grad_norm": 0.13133099675178528, + "learning_rate": 4.885571976311313e-05, + "loss": 0.0431, + "step": 28370 + }, + { + "epoch": 0.1369, + "grad_norm": 0.12610967457294464, + "learning_rate": 4.885448314752251e-05, + "loss": 0.0447, + "step": 28380 + }, + { + "epoch": 0.13695, + "grad_norm": 0.15596739947795868, + "learning_rate": 4.885324587975928e-05, + "loss": 0.044, + "step": 28390 + }, + { + "epoch": 0.137, + "grad_norm": 0.1739337146282196, + "learning_rate": 4.885200795985727e-05, + "loss": 0.0476, + "step": 28400 + }, + { + "epoch": 0.13705, + "grad_norm": 0.14911046624183655, + "learning_rate": 4.8850769387850334e-05, + "loss": 0.0464, + "step": 28410 + }, + { + "epoch": 0.1371, + "grad_norm": 0.13654664158821106, + "learning_rate": 4.884953016377232e-05, + "loss": 0.0437, + "step": 28420 + }, + { + "epoch": 0.13715, + "grad_norm": 0.11479999125003815, + "learning_rate": 4.884829028765711e-05, + "loss": 0.0435, + "step": 28430 + }, + { + "epoch": 0.1372, + "grad_norm": 0.13781145215034485, + "learning_rate": 4.884704975953859e-05, + "loss": 0.0442, + "step": 28440 + }, + { + "epoch": 0.13725, + "grad_norm": 0.12538376450538635, + "learning_rate": 4.88458085794507e-05, + "loss": 0.0446, + "step": 28450 + }, + { + "epoch": 0.1373, + "grad_norm": 0.10659784823656082, + "learning_rate": 4.884456674742736e-05, + "loss": 0.0441, + "step": 28460 + }, + { + "epoch": 0.13735, + "grad_norm": 0.12495476007461548, + "learning_rate": 4.8843324263502523e-05, + "loss": 0.044, + "step": 28470 + }, + { + "epoch": 0.1374, + "grad_norm": 0.16172359883785248, + "learning_rate": 4.884208112771016e-05, + "loss": 0.0468, + "step": 28480 + }, + { + "epoch": 0.13745, + "grad_norm": 0.1775505542755127, + "learning_rate": 4.884083734008425e-05, + "loss": 0.0446, + "step": 28490 + }, + { + "epoch": 0.1375, + "grad_norm": 0.22093729674816132, + "learning_rate": 4.883959290065882e-05, + "loss": 0.0441, + "step": 28500 + }, + { + "epoch": 0.13755, + "grad_norm": 0.16403494775295258, + "learning_rate": 4.883834780946786e-05, + "loss": 0.043, + "step": 28510 + }, + { + "epoch": 0.1376, + "grad_norm": 0.13922278583049774, + "learning_rate": 4.883710206654543e-05, + "loss": 0.0466, + "step": 28520 + }, + { + "epoch": 0.13765, + "grad_norm": 0.14404745399951935, + "learning_rate": 4.883585567192559e-05, + "loss": 0.0436, + "step": 28530 + }, + { + "epoch": 0.1377, + "grad_norm": 0.12358921021223068, + "learning_rate": 4.8834608625642404e-05, + "loss": 0.0443, + "step": 28540 + }, + { + "epoch": 0.13775, + "grad_norm": 0.14362011849880219, + "learning_rate": 4.8833360927729976e-05, + "loss": 0.0441, + "step": 28550 + }, + { + "epoch": 0.1378, + "grad_norm": 0.17407487332820892, + "learning_rate": 4.883211257822241e-05, + "loss": 0.0433, + "step": 28560 + }, + { + "epoch": 0.13785, + "grad_norm": 0.1505686640739441, + "learning_rate": 4.883086357715384e-05, + "loss": 0.044, + "step": 28570 + }, + { + "epoch": 0.1379, + "grad_norm": 0.16092127561569214, + "learning_rate": 4.882961392455842e-05, + "loss": 0.0425, + "step": 28580 + }, + { + "epoch": 0.13795, + "grad_norm": 0.1629003882408142, + "learning_rate": 4.88283636204703e-05, + "loss": 0.0444, + "step": 28590 + }, + { + "epoch": 0.138, + "grad_norm": 0.13917267322540283, + "learning_rate": 4.8827112664923674e-05, + "loss": 0.0424, + "step": 28600 + }, + { + "epoch": 0.13805, + "grad_norm": 0.14045603573322296, + "learning_rate": 4.882586105795274e-05, + "loss": 0.0424, + "step": 28610 + }, + { + "epoch": 0.1381, + "grad_norm": 0.14074933528900146, + "learning_rate": 4.882460879959171e-05, + "loss": 0.0424, + "step": 28620 + }, + { + "epoch": 0.13815, + "grad_norm": 0.14256353676319122, + "learning_rate": 4.882335588987483e-05, + "loss": 0.0436, + "step": 28630 + }, + { + "epoch": 0.1382, + "grad_norm": 0.15993767976760864, + "learning_rate": 4.882210232883635e-05, + "loss": 0.044, + "step": 28640 + }, + { + "epoch": 0.13825, + "grad_norm": 0.13555461168289185, + "learning_rate": 4.8820848116510544e-05, + "loss": 0.045, + "step": 28650 + }, + { + "epoch": 0.1383, + "grad_norm": 0.14272552728652954, + "learning_rate": 4.881959325293169e-05, + "loss": 0.0437, + "step": 28660 + }, + { + "epoch": 0.13835, + "grad_norm": 0.14286339282989502, + "learning_rate": 4.8818337738134124e-05, + "loss": 0.0444, + "step": 28670 + }, + { + "epoch": 0.1384, + "grad_norm": 0.12127411365509033, + "learning_rate": 4.881708157215213e-05, + "loss": 0.0442, + "step": 28680 + }, + { + "epoch": 0.13845, + "grad_norm": 0.132398322224617, + "learning_rate": 4.881582475502009e-05, + "loss": 0.0445, + "step": 28690 + }, + { + "epoch": 0.1385, + "grad_norm": 0.13280871510505676, + "learning_rate": 4.8814567286772344e-05, + "loss": 0.0428, + "step": 28700 + }, + { + "epoch": 0.13855, + "grad_norm": 0.11291223764419556, + "learning_rate": 4.881330916744327e-05, + "loss": 0.0451, + "step": 28710 + }, + { + "epoch": 0.1386, + "grad_norm": 0.14127440750598907, + "learning_rate": 4.8812050397067277e-05, + "loss": 0.0457, + "step": 28720 + }, + { + "epoch": 0.13865, + "grad_norm": 0.1400194764137268, + "learning_rate": 4.881079097567877e-05, + "loss": 0.0444, + "step": 28730 + }, + { + "epoch": 0.1387, + "grad_norm": 0.1517164260149002, + "learning_rate": 4.880953090331218e-05, + "loss": 0.0425, + "step": 28740 + }, + { + "epoch": 0.13875, + "grad_norm": 0.1363055408000946, + "learning_rate": 4.880827018000196e-05, + "loss": 0.0434, + "step": 28750 + }, + { + "epoch": 0.1388, + "grad_norm": 0.13896967470645905, + "learning_rate": 4.880700880578258e-05, + "loss": 0.0464, + "step": 28760 + }, + { + "epoch": 0.13885, + "grad_norm": 0.1326436996459961, + "learning_rate": 4.880574678068852e-05, + "loss": 0.0437, + "step": 28770 + }, + { + "epoch": 0.1389, + "grad_norm": 0.11332570761442184, + "learning_rate": 4.880448410475429e-05, + "loss": 0.0413, + "step": 28780 + }, + { + "epoch": 0.13895, + "grad_norm": 0.10437055677175522, + "learning_rate": 4.88032207780144e-05, + "loss": 0.044, + "step": 28790 + }, + { + "epoch": 0.139, + "grad_norm": 0.11086023598909378, + "learning_rate": 4.8801956800503406e-05, + "loss": 0.0429, + "step": 28800 + }, + { + "epoch": 0.13905, + "grad_norm": 0.12515541911125183, + "learning_rate": 4.880069217225585e-05, + "loss": 0.0459, + "step": 28810 + }, + { + "epoch": 0.1391, + "grad_norm": 0.1383202075958252, + "learning_rate": 4.879942689330631e-05, + "loss": 0.0446, + "step": 28820 + }, + { + "epoch": 0.13915, + "grad_norm": 0.17073068022727966, + "learning_rate": 4.879816096368939e-05, + "loss": 0.0488, + "step": 28830 + }, + { + "epoch": 0.1392, + "grad_norm": 0.14519037306308746, + "learning_rate": 4.879689438343968e-05, + "loss": 0.0444, + "step": 28840 + }, + { + "epoch": 0.13925, + "grad_norm": 0.12331254780292511, + "learning_rate": 4.8795627152591825e-05, + "loss": 0.0458, + "step": 28850 + }, + { + "epoch": 0.1393, + "grad_norm": 0.13862133026123047, + "learning_rate": 4.8794359271180454e-05, + "loss": 0.0457, + "step": 28860 + }, + { + "epoch": 0.13935, + "grad_norm": 0.147194504737854, + "learning_rate": 4.8793090739240244e-05, + "loss": 0.0444, + "step": 28870 + }, + { + "epoch": 0.1394, + "grad_norm": 0.15450793504714966, + "learning_rate": 4.879182155680587e-05, + "loss": 0.0465, + "step": 28880 + }, + { + "epoch": 0.13945, + "grad_norm": 0.14073385298252106, + "learning_rate": 4.879055172391204e-05, + "loss": 0.0441, + "step": 28890 + }, + { + "epoch": 0.1395, + "grad_norm": 0.12063471972942352, + "learning_rate": 4.878928124059345e-05, + "loss": 0.044, + "step": 28900 + }, + { + "epoch": 0.13955, + "grad_norm": 0.14246101677417755, + "learning_rate": 4.878801010688486e-05, + "loss": 0.0443, + "step": 28910 + }, + { + "epoch": 0.1396, + "grad_norm": 0.12795647978782654, + "learning_rate": 4.878673832282101e-05, + "loss": 0.0455, + "step": 28920 + }, + { + "epoch": 0.13965, + "grad_norm": 0.1561502069234848, + "learning_rate": 4.878546588843666e-05, + "loss": 0.0458, + "step": 28930 + }, + { + "epoch": 0.1397, + "grad_norm": 0.15751421451568604, + "learning_rate": 4.8784192803766624e-05, + "loss": 0.0451, + "step": 28940 + }, + { + "epoch": 0.13975, + "grad_norm": 0.1431007981300354, + "learning_rate": 4.878291906884568e-05, + "loss": 0.0424, + "step": 28950 + }, + { + "epoch": 0.1398, + "grad_norm": 0.1989883929491043, + "learning_rate": 4.878164468370867e-05, + "loss": 0.0441, + "step": 28960 + }, + { + "epoch": 0.13985, + "grad_norm": 0.14188385009765625, + "learning_rate": 4.8780369648390426e-05, + "loss": 0.0435, + "step": 28970 + }, + { + "epoch": 0.1399, + "grad_norm": 0.13006633520126343, + "learning_rate": 4.87790939629258e-05, + "loss": 0.0436, + "step": 28980 + }, + { + "epoch": 0.13995, + "grad_norm": 0.1398228257894516, + "learning_rate": 4.87778176273497e-05, + "loss": 0.0432, + "step": 28990 + }, + { + "epoch": 0.14, + "grad_norm": 0.1444910317659378, + "learning_rate": 4.877654064169698e-05, + "loss": 0.0425, + "step": 29000 + }, + { + "epoch": 0.14005, + "grad_norm": 0.17665322124958038, + "learning_rate": 4.877526300600258e-05, + "loss": 0.0468, + "step": 29010 + }, + { + "epoch": 0.1401, + "grad_norm": 0.13634498417377472, + "learning_rate": 4.877398472030142e-05, + "loss": 0.0436, + "step": 29020 + }, + { + "epoch": 0.14015, + "grad_norm": 0.12259041517972946, + "learning_rate": 4.877270578462845e-05, + "loss": 0.044, + "step": 29030 + }, + { + "epoch": 0.1402, + "grad_norm": 0.19929362833499908, + "learning_rate": 4.8771426199018634e-05, + "loss": 0.0452, + "step": 29040 + }, + { + "epoch": 0.14025, + "grad_norm": 0.15330860018730164, + "learning_rate": 4.877014596350695e-05, + "loss": 0.0425, + "step": 29050 + }, + { + "epoch": 0.1403, + "grad_norm": 0.145126074552536, + "learning_rate": 4.876886507812841e-05, + "loss": 0.0442, + "step": 29060 + }, + { + "epoch": 0.14035, + "grad_norm": 0.11339390277862549, + "learning_rate": 4.8767583542918037e-05, + "loss": 0.0421, + "step": 29070 + }, + { + "epoch": 0.1404, + "grad_norm": 0.15190169215202332, + "learning_rate": 4.876630135791085e-05, + "loss": 0.0424, + "step": 29080 + }, + { + "epoch": 0.14045, + "grad_norm": 0.12558703124523163, + "learning_rate": 4.8765018523141915e-05, + "loss": 0.0441, + "step": 29090 + }, + { + "epoch": 0.1405, + "grad_norm": 0.14016909897327423, + "learning_rate": 4.8763735038646296e-05, + "loss": 0.0462, + "step": 29100 + }, + { + "epoch": 0.14055, + "grad_norm": 0.1210547462105751, + "learning_rate": 4.87624509044591e-05, + "loss": 0.0436, + "step": 29110 + }, + { + "epoch": 0.1406, + "grad_norm": 0.15262196958065033, + "learning_rate": 4.8761166120615415e-05, + "loss": 0.0445, + "step": 29120 + }, + { + "epoch": 0.14065, + "grad_norm": 0.13301680982112885, + "learning_rate": 4.8759880687150375e-05, + "loss": 0.0479, + "step": 29130 + }, + { + "epoch": 0.1407, + "grad_norm": 0.12849004566669464, + "learning_rate": 4.875859460409913e-05, + "loss": 0.0427, + "step": 29140 + }, + { + "epoch": 0.14075, + "grad_norm": 0.1430385261774063, + "learning_rate": 4.8757307871496825e-05, + "loss": 0.0424, + "step": 29150 + }, + { + "epoch": 0.1408, + "grad_norm": 0.1357356458902359, + "learning_rate": 4.875602048937865e-05, + "loss": 0.0418, + "step": 29160 + }, + { + "epoch": 0.14085, + "grad_norm": 0.12177083641290665, + "learning_rate": 4.875473245777981e-05, + "loss": 0.0436, + "step": 29170 + }, + { + "epoch": 0.1409, + "grad_norm": 0.12450092285871506, + "learning_rate": 4.87534437767355e-05, + "loss": 0.0457, + "step": 29180 + }, + { + "epoch": 0.14095, + "grad_norm": 0.1108444556593895, + "learning_rate": 4.875215444628095e-05, + "loss": 0.0436, + "step": 29190 + }, + { + "epoch": 0.141, + "grad_norm": 0.14912369847297668, + "learning_rate": 4.875086446645144e-05, + "loss": 0.0437, + "step": 29200 + }, + { + "epoch": 0.14105, + "grad_norm": 0.1382071077823639, + "learning_rate": 4.8749573837282207e-05, + "loss": 0.0452, + "step": 29210 + }, + { + "epoch": 0.1411, + "grad_norm": 0.1486228108406067, + "learning_rate": 4.874828255880855e-05, + "loss": 0.0465, + "step": 29220 + }, + { + "epoch": 0.14115, + "grad_norm": 0.12856066226959229, + "learning_rate": 4.874699063106577e-05, + "loss": 0.0428, + "step": 29230 + }, + { + "epoch": 0.1412, + "grad_norm": 0.16187553107738495, + "learning_rate": 4.874569805408919e-05, + "loss": 0.0439, + "step": 29240 + }, + { + "epoch": 0.14125, + "grad_norm": 0.1266612708568573, + "learning_rate": 4.8744404827914144e-05, + "loss": 0.0437, + "step": 29250 + }, + { + "epoch": 0.1413, + "grad_norm": 0.14293614029884338, + "learning_rate": 4.874311095257599e-05, + "loss": 0.0431, + "step": 29260 + }, + { + "epoch": 0.14135, + "grad_norm": 0.16605037450790405, + "learning_rate": 4.87418164281101e-05, + "loss": 0.0438, + "step": 29270 + }, + { + "epoch": 0.1414, + "grad_norm": 0.12890055775642395, + "learning_rate": 4.8740521254551876e-05, + "loss": 0.0463, + "step": 29280 + }, + { + "epoch": 0.14145, + "grad_norm": 0.14314982295036316, + "learning_rate": 4.873922543193671e-05, + "loss": 0.0436, + "step": 29290 + }, + { + "epoch": 0.1415, + "grad_norm": 0.1430131494998932, + "learning_rate": 4.873792896030005e-05, + "loss": 0.0434, + "step": 29300 + }, + { + "epoch": 0.14155, + "grad_norm": 0.19689872860908508, + "learning_rate": 4.873663183967732e-05, + "loss": 0.0447, + "step": 29310 + }, + { + "epoch": 0.1416, + "grad_norm": 0.19659042358398438, + "learning_rate": 4.8735334070104e-05, + "loss": 0.0464, + "step": 29320 + }, + { + "epoch": 0.14165, + "grad_norm": 0.15170709788799286, + "learning_rate": 4.873403565161556e-05, + "loss": 0.0429, + "step": 29330 + }, + { + "epoch": 0.1417, + "grad_norm": 0.1255590319633484, + "learning_rate": 4.873273658424751e-05, + "loss": 0.0446, + "step": 29340 + }, + { + "epoch": 0.14175, + "grad_norm": 0.13071368634700775, + "learning_rate": 4.8731436868035343e-05, + "loss": 0.0472, + "step": 29350 + }, + { + "epoch": 0.1418, + "grad_norm": 0.1392545998096466, + "learning_rate": 4.873013650301461e-05, + "loss": 0.0444, + "step": 29360 + }, + { + "epoch": 0.14185, + "grad_norm": 0.1398293524980545, + "learning_rate": 4.872883548922087e-05, + "loss": 0.045, + "step": 29370 + }, + { + "epoch": 0.1419, + "grad_norm": 0.12076738476753235, + "learning_rate": 4.8727533826689677e-05, + "loss": 0.0431, + "step": 29380 + }, + { + "epoch": 0.14195, + "grad_norm": 0.12129142880439758, + "learning_rate": 4.872623151545662e-05, + "loss": 0.0442, + "step": 29390 + }, + { + "epoch": 0.142, + "grad_norm": 0.13107934594154358, + "learning_rate": 4.872492855555732e-05, + "loss": 0.0474, + "step": 29400 + }, + { + "epoch": 0.14205, + "grad_norm": 0.13777202367782593, + "learning_rate": 4.872362494702737e-05, + "loss": 0.0514, + "step": 29410 + }, + { + "epoch": 0.1421, + "grad_norm": 0.1417369246482849, + "learning_rate": 4.8722320689902434e-05, + "loss": 0.0447, + "step": 29420 + }, + { + "epoch": 0.14215, + "grad_norm": 0.16407637298107147, + "learning_rate": 4.872101578421816e-05, + "loss": 0.0492, + "step": 29430 + }, + { + "epoch": 0.1422, + "grad_norm": 0.16390691697597504, + "learning_rate": 4.871971023001023e-05, + "loss": 0.0459, + "step": 29440 + }, + { + "epoch": 0.14225, + "grad_norm": 0.1474514603614807, + "learning_rate": 4.871840402731432e-05, + "loss": 0.0458, + "step": 29450 + }, + { + "epoch": 0.1423, + "grad_norm": 0.14336465299129486, + "learning_rate": 4.871709717616617e-05, + "loss": 0.047, + "step": 29460 + }, + { + "epoch": 0.14235, + "grad_norm": 0.1408020406961441, + "learning_rate": 4.8715789676601484e-05, + "loss": 0.0437, + "step": 29470 + }, + { + "epoch": 0.1424, + "grad_norm": 0.14137528836727142, + "learning_rate": 4.871448152865603e-05, + "loss": 0.0437, + "step": 29480 + }, + { + "epoch": 0.14245, + "grad_norm": 0.14283783733844757, + "learning_rate": 4.8713172732365554e-05, + "loss": 0.0424, + "step": 29490 + }, + { + "epoch": 0.1425, + "grad_norm": 0.11966720223426819, + "learning_rate": 4.871186328776583e-05, + "loss": 0.0441, + "step": 29500 + }, + { + "epoch": 0.14255, + "grad_norm": 0.12390667200088501, + "learning_rate": 4.871055319489269e-05, + "loss": 0.0433, + "step": 29510 + }, + { + "epoch": 0.1426, + "grad_norm": 0.13050583004951477, + "learning_rate": 4.8709242453781936e-05, + "loss": 0.0425, + "step": 29520 + }, + { + "epoch": 0.14265, + "grad_norm": 0.12697172164916992, + "learning_rate": 4.8707931064469385e-05, + "loss": 0.0456, + "step": 29530 + }, + { + "epoch": 0.1427, + "grad_norm": 0.12325213849544525, + "learning_rate": 4.870661902699092e-05, + "loss": 0.0431, + "step": 29540 + }, + { + "epoch": 0.14275, + "grad_norm": 0.11337971687316895, + "learning_rate": 4.8705306341382385e-05, + "loss": 0.0419, + "step": 29550 + }, + { + "epoch": 0.1428, + "grad_norm": 0.1135648712515831, + "learning_rate": 4.870399300767968e-05, + "loss": 0.0419, + "step": 29560 + }, + { + "epoch": 0.14285, + "grad_norm": 0.16290436685085297, + "learning_rate": 4.870267902591872e-05, + "loss": 0.0449, + "step": 29570 + }, + { + "epoch": 0.1429, + "grad_norm": 0.1278180629014969, + "learning_rate": 4.870136439613542e-05, + "loss": 0.0412, + "step": 29580 + }, + { + "epoch": 0.14295, + "grad_norm": 0.15755663812160492, + "learning_rate": 4.870004911836572e-05, + "loss": 0.0421, + "step": 29590 + }, + { + "epoch": 0.143, + "grad_norm": 0.16184493899345398, + "learning_rate": 4.8698733192645574e-05, + "loss": 0.0428, + "step": 29600 + }, + { + "epoch": 0.14305, + "grad_norm": 0.12744948267936707, + "learning_rate": 4.869741661901097e-05, + "loss": 0.0442, + "step": 29610 + }, + { + "epoch": 0.1431, + "grad_norm": 0.1351950615644455, + "learning_rate": 4.86960993974979e-05, + "loss": 0.0475, + "step": 29620 + }, + { + "epoch": 0.14315, + "grad_norm": 0.12647458910942078, + "learning_rate": 4.869478152814238e-05, + "loss": 0.0445, + "step": 29630 + }, + { + "epoch": 0.1432, + "grad_norm": 0.13030609488487244, + "learning_rate": 4.869346301098042e-05, + "loss": 0.049, + "step": 29640 + }, + { + "epoch": 0.14325, + "grad_norm": 0.15038636326789856, + "learning_rate": 4.869214384604809e-05, + "loss": 0.0433, + "step": 29650 + }, + { + "epoch": 0.1433, + "grad_norm": 0.1297498345375061, + "learning_rate": 4.869082403338145e-05, + "loss": 0.0445, + "step": 29660 + }, + { + "epoch": 0.14335, + "grad_norm": 0.16710205376148224, + "learning_rate": 4.868950357301658e-05, + "loss": 0.0456, + "step": 29670 + }, + { + "epoch": 0.1434, + "grad_norm": 0.11809530109167099, + "learning_rate": 4.868818246498958e-05, + "loss": 0.0441, + "step": 29680 + }, + { + "epoch": 0.14345, + "grad_norm": 0.14866682887077332, + "learning_rate": 4.8686860709336575e-05, + "loss": 0.0444, + "step": 29690 + }, + { + "epoch": 0.1435, + "grad_norm": 0.15026448667049408, + "learning_rate": 4.868553830609369e-05, + "loss": 0.0447, + "step": 29700 + }, + { + "epoch": 0.14355, + "grad_norm": 0.11738848686218262, + "learning_rate": 4.86842152552971e-05, + "loss": 0.0429, + "step": 29710 + }, + { + "epoch": 0.1436, + "grad_norm": 0.15006989240646362, + "learning_rate": 4.868289155698294e-05, + "loss": 0.0443, + "step": 29720 + }, + { + "epoch": 0.14365, + "grad_norm": 0.12709090113639832, + "learning_rate": 4.868156721118744e-05, + "loss": 0.0426, + "step": 29730 + }, + { + "epoch": 0.1437, + "grad_norm": 0.1313161700963974, + "learning_rate": 4.868024221794678e-05, + "loss": 0.0434, + "step": 29740 + }, + { + "epoch": 0.14375, + "grad_norm": 0.1490878313779831, + "learning_rate": 4.8678916577297205e-05, + "loss": 0.0433, + "step": 29750 + }, + { + "epoch": 0.1438, + "grad_norm": 0.13998951017856598, + "learning_rate": 4.867759028927494e-05, + "loss": 0.0424, + "step": 29760 + }, + { + "epoch": 0.14385, + "grad_norm": 0.14876703917980194, + "learning_rate": 4.867626335391625e-05, + "loss": 0.0465, + "step": 29770 + }, + { + "epoch": 0.1439, + "grad_norm": 0.17557351291179657, + "learning_rate": 4.867493577125741e-05, + "loss": 0.0463, + "step": 29780 + }, + { + "epoch": 0.14395, + "grad_norm": 0.205479234457016, + "learning_rate": 4.867360754133473e-05, + "loss": 0.0476, + "step": 29790 + }, + { + "epoch": 0.144, + "grad_norm": 0.12718096375465393, + "learning_rate": 4.867227866418451e-05, + "loss": 0.045, + "step": 29800 + }, + { + "epoch": 0.14405, + "grad_norm": 0.12666594982147217, + "learning_rate": 4.867094913984309e-05, + "loss": 0.0461, + "step": 29810 + }, + { + "epoch": 0.1441, + "grad_norm": 0.15450750291347504, + "learning_rate": 4.866961896834681e-05, + "loss": 0.0441, + "step": 29820 + }, + { + "epoch": 0.14415, + "grad_norm": 0.15755434334278107, + "learning_rate": 4.866828814973203e-05, + "loss": 0.0436, + "step": 29830 + }, + { + "epoch": 0.1442, + "grad_norm": 0.14265063405036926, + "learning_rate": 4.866695668403515e-05, + "loss": 0.0444, + "step": 29840 + }, + { + "epoch": 0.14425, + "grad_norm": 0.12945078313350677, + "learning_rate": 4.866562457129257e-05, + "loss": 0.0435, + "step": 29850 + }, + { + "epoch": 0.1443, + "grad_norm": 0.13932296633720398, + "learning_rate": 4.8664291811540704e-05, + "loss": 0.0431, + "step": 29860 + }, + { + "epoch": 0.14435, + "grad_norm": 0.13899292051792145, + "learning_rate": 4.866295840481598e-05, + "loss": 0.0439, + "step": 29870 + }, + { + "epoch": 0.1444, + "grad_norm": 0.14892756938934326, + "learning_rate": 4.8661624351154877e-05, + "loss": 0.043, + "step": 29880 + }, + { + "epoch": 0.14445, + "grad_norm": 0.14375853538513184, + "learning_rate": 4.8660289650593846e-05, + "loss": 0.0423, + "step": 29890 + }, + { + "epoch": 0.1445, + "grad_norm": 0.1570858210325241, + "learning_rate": 4.865895430316939e-05, + "loss": 0.0421, + "step": 29900 + }, + { + "epoch": 0.14455, + "grad_norm": 0.11134622991085052, + "learning_rate": 4.865761830891801e-05, + "loss": 0.043, + "step": 29910 + }, + { + "epoch": 0.1446, + "grad_norm": 0.13085021078586578, + "learning_rate": 4.865628166787623e-05, + "loss": 0.0436, + "step": 29920 + }, + { + "epoch": 0.14465, + "grad_norm": 0.14486683905124664, + "learning_rate": 4.865494438008059e-05, + "loss": 0.0446, + "step": 29930 + }, + { + "epoch": 0.1447, + "grad_norm": 0.17880284786224365, + "learning_rate": 4.865360644556767e-05, + "loss": 0.0456, + "step": 29940 + }, + { + "epoch": 0.14475, + "grad_norm": 0.15188609063625336, + "learning_rate": 4.865226786437403e-05, + "loss": 0.043, + "step": 29950 + }, + { + "epoch": 0.1448, + "grad_norm": 0.11204826831817627, + "learning_rate": 4.8650928636536277e-05, + "loss": 0.0406, + "step": 29960 + }, + { + "epoch": 0.14485, + "grad_norm": 0.1512330174446106, + "learning_rate": 4.8649588762091016e-05, + "loss": 0.0413, + "step": 29970 + }, + { + "epoch": 0.1449, + "grad_norm": 0.1452956199645996, + "learning_rate": 4.864824824107488e-05, + "loss": 0.0434, + "step": 29980 + }, + { + "epoch": 0.14495, + "grad_norm": 0.16116859018802643, + "learning_rate": 4.864690707352453e-05, + "loss": 0.0427, + "step": 29990 + }, + { + "epoch": 0.145, + "grad_norm": 0.1275971531867981, + "learning_rate": 4.864556525947661e-05, + "loss": 0.0462, + "step": 30000 + }, + { + "epoch": 0.14505, + "grad_norm": 0.142277330160141, + "learning_rate": 4.864422279896783e-05, + "loss": 0.0414, + "step": 30010 + }, + { + "epoch": 0.1451, + "grad_norm": 0.13965527713298798, + "learning_rate": 4.864287969203488e-05, + "loss": 0.0453, + "step": 30020 + }, + { + "epoch": 0.14515, + "grad_norm": 0.14702655375003815, + "learning_rate": 4.8641535938714486e-05, + "loss": 0.0491, + "step": 30030 + }, + { + "epoch": 0.1452, + "grad_norm": 0.14110992848873138, + "learning_rate": 4.864019153904337e-05, + "loss": 0.0435, + "step": 30040 + }, + { + "epoch": 0.14525, + "grad_norm": 0.12232547998428345, + "learning_rate": 4.863884649305831e-05, + "loss": 0.0414, + "step": 30050 + }, + { + "epoch": 0.1453, + "grad_norm": 0.12205258011817932, + "learning_rate": 4.863750080079606e-05, + "loss": 0.0427, + "step": 30060 + }, + { + "epoch": 0.14535, + "grad_norm": 0.1372302770614624, + "learning_rate": 4.863615446229342e-05, + "loss": 0.0414, + "step": 30070 + }, + { + "epoch": 0.1454, + "grad_norm": 0.13404032588005066, + "learning_rate": 4.86348074775872e-05, + "loss": 0.0465, + "step": 30080 + }, + { + "epoch": 0.14545, + "grad_norm": 0.13370977342128754, + "learning_rate": 4.863345984671422e-05, + "loss": 0.043, + "step": 30090 + }, + { + "epoch": 0.1455, + "grad_norm": 0.1394168585538864, + "learning_rate": 4.8632111569711326e-05, + "loss": 0.042, + "step": 30100 + }, + { + "epoch": 0.14555, + "grad_norm": 0.15278661251068115, + "learning_rate": 4.863076264661538e-05, + "loss": 0.0459, + "step": 30110 + }, + { + "epoch": 0.1456, + "grad_norm": 0.15998971462249756, + "learning_rate": 4.862941307746326e-05, + "loss": 0.0436, + "step": 30120 + }, + { + "epoch": 0.14565, + "grad_norm": 0.1253858506679535, + "learning_rate": 4.8628062862291865e-05, + "loss": 0.0484, + "step": 30130 + }, + { + "epoch": 0.1457, + "grad_norm": 0.11512526869773865, + "learning_rate": 4.862671200113811e-05, + "loss": 0.0433, + "step": 30140 + }, + { + "epoch": 0.14575, + "grad_norm": 0.13237643241882324, + "learning_rate": 4.862536049403892e-05, + "loss": 0.0434, + "step": 30150 + }, + { + "epoch": 0.1458, + "grad_norm": 0.14471276104450226, + "learning_rate": 4.862400834103125e-05, + "loss": 0.0473, + "step": 30160 + }, + { + "epoch": 0.14585, + "grad_norm": 0.14546695351600647, + "learning_rate": 4.862265554215207e-05, + "loss": 0.0445, + "step": 30170 + }, + { + "epoch": 0.1459, + "grad_norm": 0.132734015583992, + "learning_rate": 4.862130209743837e-05, + "loss": 0.0443, + "step": 30180 + }, + { + "epoch": 0.14595, + "grad_norm": 0.12827634811401367, + "learning_rate": 4.861994800692713e-05, + "loss": 0.0451, + "step": 30190 + }, + { + "epoch": 0.146, + "grad_norm": 0.1215103417634964, + "learning_rate": 4.861859327065539e-05, + "loss": 0.0444, + "step": 30200 + }, + { + "epoch": 0.14605, + "grad_norm": 0.1259389966726303, + "learning_rate": 4.8617237888660185e-05, + "loss": 0.0427, + "step": 30210 + }, + { + "epoch": 0.1461, + "grad_norm": 0.14394418895244598, + "learning_rate": 4.861588186097858e-05, + "loss": 0.0435, + "step": 30220 + }, + { + "epoch": 0.14615, + "grad_norm": 0.13755978643894196, + "learning_rate": 4.861452518764762e-05, + "loss": 0.0436, + "step": 30230 + }, + { + "epoch": 0.1462, + "grad_norm": 0.16422000527381897, + "learning_rate": 4.8613167868704414e-05, + "loss": 0.0451, + "step": 30240 + }, + { + "epoch": 0.14625, + "grad_norm": 0.15230846405029297, + "learning_rate": 4.8611809904186074e-05, + "loss": 0.0438, + "step": 30250 + }, + { + "epoch": 0.1463, + "grad_norm": 0.14470531046390533, + "learning_rate": 4.861045129412972e-05, + "loss": 0.0425, + "step": 30260 + }, + { + "epoch": 0.14635, + "grad_norm": 0.151280477643013, + "learning_rate": 4.86090920385725e-05, + "loss": 0.0436, + "step": 30270 + }, + { + "epoch": 0.1464, + "grad_norm": 0.15169380605220795, + "learning_rate": 4.860773213755158e-05, + "loss": 0.0443, + "step": 30280 + }, + { + "epoch": 0.14645, + "grad_norm": 0.19994564354419708, + "learning_rate": 4.8606371591104114e-05, + "loss": 0.0429, + "step": 30290 + }, + { + "epoch": 0.1465, + "grad_norm": 0.16692997515201569, + "learning_rate": 4.860501039926734e-05, + "loss": 0.0418, + "step": 30300 + }, + { + "epoch": 0.14655, + "grad_norm": 0.17754624783992767, + "learning_rate": 4.860364856207843e-05, + "loss": 0.0448, + "step": 30310 + }, + { + "epoch": 0.1466, + "grad_norm": 0.148456409573555, + "learning_rate": 4.860228607957464e-05, + "loss": 0.0429, + "step": 30320 + }, + { + "epoch": 0.14665, + "grad_norm": 0.14688019454479218, + "learning_rate": 4.860092295179323e-05, + "loss": 0.0421, + "step": 30330 + }, + { + "epoch": 0.1467, + "grad_norm": 0.2008282095193863, + "learning_rate": 4.8599559178771436e-05, + "loss": 0.0455, + "step": 30340 + }, + { + "epoch": 0.14675, + "grad_norm": 0.21322788298130035, + "learning_rate": 4.859819476054657e-05, + "loss": 0.0446, + "step": 30350 + }, + { + "epoch": 0.1468, + "grad_norm": 0.1635725349187851, + "learning_rate": 4.859682969715592e-05, + "loss": 0.0434, + "step": 30360 + }, + { + "epoch": 0.14685, + "grad_norm": 0.1512279212474823, + "learning_rate": 4.859546398863681e-05, + "loss": 0.0436, + "step": 30370 + }, + { + "epoch": 0.1469, + "grad_norm": 0.13886789977550507, + "learning_rate": 4.859409763502658e-05, + "loss": 0.0431, + "step": 30380 + }, + { + "epoch": 0.14695, + "grad_norm": 0.15730249881744385, + "learning_rate": 4.859273063636258e-05, + "loss": 0.0466, + "step": 30390 + }, + { + "epoch": 0.147, + "grad_norm": 0.13086619973182678, + "learning_rate": 4.85913629926822e-05, + "loss": 0.0429, + "step": 30400 + }, + { + "epoch": 0.14705, + "grad_norm": 0.13605093955993652, + "learning_rate": 4.858999470402281e-05, + "loss": 0.0447, + "step": 30410 + }, + { + "epoch": 0.1471, + "grad_norm": 0.13402029871940613, + "learning_rate": 4.8588625770421825e-05, + "loss": 0.0453, + "step": 30420 + }, + { + "epoch": 0.14715, + "grad_norm": 0.1320749968290329, + "learning_rate": 4.8587256191916674e-05, + "loss": 0.0447, + "step": 30430 + }, + { + "epoch": 0.1472, + "grad_norm": 0.11509065330028534, + "learning_rate": 4.858588596854481e-05, + "loss": 0.0461, + "step": 30440 + }, + { + "epoch": 0.14725, + "grad_norm": 0.14351004362106323, + "learning_rate": 4.858451510034367e-05, + "loss": 0.0456, + "step": 30450 + }, + { + "epoch": 0.1473, + "grad_norm": 0.12643565237522125, + "learning_rate": 4.858314358735076e-05, + "loss": 0.0435, + "step": 30460 + }, + { + "epoch": 0.14735, + "grad_norm": 0.14670643210411072, + "learning_rate": 4.858177142960356e-05, + "loss": 0.0436, + "step": 30470 + }, + { + "epoch": 0.1474, + "grad_norm": 0.14714765548706055, + "learning_rate": 4.858039862713959e-05, + "loss": 0.0443, + "step": 30480 + }, + { + "epoch": 0.14745, + "grad_norm": 0.12171582132577896, + "learning_rate": 4.857902517999638e-05, + "loss": 0.0429, + "step": 30490 + }, + { + "epoch": 0.1475, + "grad_norm": 0.13254272937774658, + "learning_rate": 4.8577651088211475e-05, + "loss": 0.044, + "step": 30500 + }, + { + "epoch": 0.14755, + "grad_norm": 0.11033543199300766, + "learning_rate": 4.8576276351822445e-05, + "loss": 0.0441, + "step": 30510 + }, + { + "epoch": 0.1476, + "grad_norm": 0.13831742107868195, + "learning_rate": 4.857490097086688e-05, + "loss": 0.0423, + "step": 30520 + }, + { + "epoch": 0.14765, + "grad_norm": 0.11719835549592972, + "learning_rate": 4.857352494538239e-05, + "loss": 0.0446, + "step": 30530 + }, + { + "epoch": 0.1477, + "grad_norm": 0.11771178245544434, + "learning_rate": 4.857214827540657e-05, + "loss": 0.0416, + "step": 30540 + }, + { + "epoch": 0.14775, + "grad_norm": 0.10563771426677704, + "learning_rate": 4.857077096097708e-05, + "loss": 0.0425, + "step": 30550 + }, + { + "epoch": 0.1478, + "grad_norm": 0.13564516603946686, + "learning_rate": 4.856939300213156e-05, + "loss": 0.0435, + "step": 30560 + }, + { + "epoch": 0.14785, + "grad_norm": 0.12499434500932693, + "learning_rate": 4.856801439890769e-05, + "loss": 0.0429, + "step": 30570 + }, + { + "epoch": 0.1479, + "grad_norm": 0.12963660061359406, + "learning_rate": 4.8566635151343164e-05, + "loss": 0.0447, + "step": 30580 + }, + { + "epoch": 0.14795, + "grad_norm": 0.14376573264598846, + "learning_rate": 4.8565255259475686e-05, + "loss": 0.0418, + "step": 30590 + }, + { + "epoch": 0.148, + "grad_norm": 0.14322718977928162, + "learning_rate": 4.856387472334298e-05, + "loss": 0.0416, + "step": 30600 + }, + { + "epoch": 0.14805, + "grad_norm": 0.11483057588338852, + "learning_rate": 4.8562493542982796e-05, + "loss": 0.0408, + "step": 30610 + }, + { + "epoch": 0.1481, + "grad_norm": 0.12188133597373962, + "learning_rate": 4.856111171843289e-05, + "loss": 0.042, + "step": 30620 + }, + { + "epoch": 0.14815, + "grad_norm": 0.13449354469776154, + "learning_rate": 4.855972924973104e-05, + "loss": 0.0422, + "step": 30630 + }, + { + "epoch": 0.1482, + "grad_norm": 0.1258065402507782, + "learning_rate": 4.855834613691505e-05, + "loss": 0.0403, + "step": 30640 + }, + { + "epoch": 0.14825, + "grad_norm": 0.12190357595682144, + "learning_rate": 4.855696238002271e-05, + "loss": 0.0412, + "step": 30650 + }, + { + "epoch": 0.1483, + "grad_norm": 0.12858296930789948, + "learning_rate": 4.855557797909188e-05, + "loss": 0.0423, + "step": 30660 + }, + { + "epoch": 0.14835, + "grad_norm": 0.11363779753446579, + "learning_rate": 4.85541929341604e-05, + "loss": 0.0433, + "step": 30670 + }, + { + "epoch": 0.1484, + "grad_norm": 0.1235252171754837, + "learning_rate": 4.855280724526613e-05, + "loss": 0.0422, + "step": 30680 + }, + { + "epoch": 0.14845, + "grad_norm": 0.1475268006324768, + "learning_rate": 4.8551420912446956e-05, + "loss": 0.0411, + "step": 30690 + }, + { + "epoch": 0.1485, + "grad_norm": 0.1395253837108612, + "learning_rate": 4.855003393574079e-05, + "loss": 0.0429, + "step": 30700 + }, + { + "epoch": 0.14855, + "grad_norm": 0.13783404231071472, + "learning_rate": 4.854864631518553e-05, + "loss": 0.0435, + "step": 30710 + }, + { + "epoch": 0.1486, + "grad_norm": 0.12407044321298599, + "learning_rate": 4.854725805081913e-05, + "loss": 0.0442, + "step": 30720 + }, + { + "epoch": 0.14865, + "grad_norm": 0.11461728811264038, + "learning_rate": 4.8545869142679556e-05, + "loss": 0.0418, + "step": 30730 + }, + { + "epoch": 0.1487, + "grad_norm": 0.15319527685642242, + "learning_rate": 4.8544479590804754e-05, + "loss": 0.0417, + "step": 30740 + }, + { + "epoch": 0.14875, + "grad_norm": 0.10197413712739944, + "learning_rate": 4.854308939523272e-05, + "loss": 0.0422, + "step": 30750 + }, + { + "epoch": 0.1488, + "grad_norm": 0.11823474615812302, + "learning_rate": 4.854169855600148e-05, + "loss": 0.0416, + "step": 30760 + }, + { + "epoch": 0.14885, + "grad_norm": 0.1088530421257019, + "learning_rate": 4.854030707314904e-05, + "loss": 0.0416, + "step": 30770 + }, + { + "epoch": 0.1489, + "grad_norm": 0.12142953276634216, + "learning_rate": 4.853891494671344e-05, + "loss": 0.0436, + "step": 30780 + }, + { + "epoch": 0.14895, + "grad_norm": 0.1265615075826645, + "learning_rate": 4.853752217673276e-05, + "loss": 0.0422, + "step": 30790 + }, + { + "epoch": 0.149, + "grad_norm": 0.1514863669872284, + "learning_rate": 4.853612876324506e-05, + "loss": 0.0458, + "step": 30800 + }, + { + "epoch": 0.14905, + "grad_norm": 0.14438602328300476, + "learning_rate": 4.853473470628844e-05, + "loss": 0.0461, + "step": 30810 + }, + { + "epoch": 0.1491, + "grad_norm": 0.12658219039440155, + "learning_rate": 4.853334000590102e-05, + "loss": 0.0449, + "step": 30820 + }, + { + "epoch": 0.14915, + "grad_norm": 0.10492241382598877, + "learning_rate": 4.853194466212093e-05, + "loss": 0.0422, + "step": 30830 + }, + { + "epoch": 0.1492, + "grad_norm": 0.11288110911846161, + "learning_rate": 4.85305486749863e-05, + "loss": 0.0431, + "step": 30840 + }, + { + "epoch": 0.14925, + "grad_norm": 0.1231205090880394, + "learning_rate": 4.852915204453532e-05, + "loss": 0.0439, + "step": 30850 + }, + { + "epoch": 0.1493, + "grad_norm": 0.13926143944263458, + "learning_rate": 4.852775477080616e-05, + "loss": 0.0418, + "step": 30860 + }, + { + "epoch": 0.14935, + "grad_norm": 0.12145378440618515, + "learning_rate": 4.852635685383702e-05, + "loss": 0.043, + "step": 30870 + }, + { + "epoch": 0.1494, + "grad_norm": 0.13048270344734192, + "learning_rate": 4.8524958293666125e-05, + "loss": 0.0447, + "step": 30880 + }, + { + "epoch": 0.14945, + "grad_norm": 0.13639546930789948, + "learning_rate": 4.852355909033171e-05, + "loss": 0.0449, + "step": 30890 + }, + { + "epoch": 0.1495, + "grad_norm": 0.12359599024057388, + "learning_rate": 4.852215924387202e-05, + "loss": 0.043, + "step": 30900 + }, + { + "epoch": 0.14955, + "grad_norm": 0.1215788796544075, + "learning_rate": 4.8520758754325343e-05, + "loss": 0.0432, + "step": 30910 + }, + { + "epoch": 0.1496, + "grad_norm": 0.13396112620830536, + "learning_rate": 4.851935762172995e-05, + "loss": 0.0435, + "step": 30920 + }, + { + "epoch": 0.14965, + "grad_norm": 0.10398834198713303, + "learning_rate": 4.8517955846124164e-05, + "loss": 0.0428, + "step": 30930 + }, + { + "epoch": 0.1497, + "grad_norm": 0.13041123747825623, + "learning_rate": 4.851655342754629e-05, + "loss": 0.0436, + "step": 30940 + }, + { + "epoch": 0.14975, + "grad_norm": 0.13264279067516327, + "learning_rate": 4.851515036603469e-05, + "loss": 0.0433, + "step": 30950 + }, + { + "epoch": 0.1498, + "grad_norm": 0.12627571821212769, + "learning_rate": 4.85137466616277e-05, + "loss": 0.0426, + "step": 30960 + }, + { + "epoch": 0.14985, + "grad_norm": 0.11815209686756134, + "learning_rate": 4.851234231436372e-05, + "loss": 0.0429, + "step": 30970 + }, + { + "epoch": 0.1499, + "grad_norm": 0.13062144815921783, + "learning_rate": 4.8510937324281134e-05, + "loss": 0.0441, + "step": 30980 + }, + { + "epoch": 0.14995, + "grad_norm": 0.11060670763254166, + "learning_rate": 4.850953169141835e-05, + "loss": 0.0422, + "step": 30990 + }, + { + "epoch": 0.15, + "grad_norm": 0.11919166892766953, + "learning_rate": 4.850812541581381e-05, + "loss": 0.0419, + "step": 31000 + }, + { + "epoch": 0.15005, + "grad_norm": 0.1286785900592804, + "learning_rate": 4.8506718497505944e-05, + "loss": 0.0435, + "step": 31010 + }, + { + "epoch": 0.1501, + "grad_norm": 0.1418483853340149, + "learning_rate": 4.8505310936533225e-05, + "loss": 0.0417, + "step": 31020 + }, + { + "epoch": 0.15015, + "grad_norm": 0.14956867694854736, + "learning_rate": 4.8503902732934133e-05, + "loss": 0.0432, + "step": 31030 + }, + { + "epoch": 0.1502, + "grad_norm": 0.12980765104293823, + "learning_rate": 4.850249388674718e-05, + "loss": 0.0424, + "step": 31040 + }, + { + "epoch": 0.15025, + "grad_norm": 0.12441367655992508, + "learning_rate": 4.8501084398010873e-05, + "loss": 0.0417, + "step": 31050 + }, + { + "epoch": 0.1503, + "grad_norm": 0.14082591235637665, + "learning_rate": 4.8499674266763745e-05, + "loss": 0.0417, + "step": 31060 + }, + { + "epoch": 0.15035, + "grad_norm": 0.12361367046833038, + "learning_rate": 4.849826349304435e-05, + "loss": 0.0429, + "step": 31070 + }, + { + "epoch": 0.1504, + "grad_norm": 0.12353827059268951, + "learning_rate": 4.849685207689126e-05, + "loss": 0.0445, + "step": 31080 + }, + { + "epoch": 0.15045, + "grad_norm": 0.12288283556699753, + "learning_rate": 4.849544001834306e-05, + "loss": 0.0426, + "step": 31090 + }, + { + "epoch": 0.1505, + "grad_norm": 0.1343759000301361, + "learning_rate": 4.849402731743836e-05, + "loss": 0.0443, + "step": 31100 + }, + { + "epoch": 0.15055, + "grad_norm": 0.12144505977630615, + "learning_rate": 4.849261397421577e-05, + "loss": 0.0427, + "step": 31110 + }, + { + "epoch": 0.1506, + "grad_norm": 0.13607613742351532, + "learning_rate": 4.849119998871395e-05, + "loss": 0.0452, + "step": 31120 + }, + { + "epoch": 0.15065, + "grad_norm": 0.15730790793895721, + "learning_rate": 4.848978536097154e-05, + "loss": 0.0449, + "step": 31130 + }, + { + "epoch": 0.1507, + "grad_norm": 0.1263347566127777, + "learning_rate": 4.848837009102723e-05, + "loss": 0.0448, + "step": 31140 + }, + { + "epoch": 0.15075, + "grad_norm": 0.16018787026405334, + "learning_rate": 4.8486954178919704e-05, + "loss": 0.0455, + "step": 31150 + }, + { + "epoch": 0.1508, + "grad_norm": 0.14352253079414368, + "learning_rate": 4.848553762468767e-05, + "loss": 0.0434, + "step": 31160 + }, + { + "epoch": 0.15085, + "grad_norm": 0.138255774974823, + "learning_rate": 4.8484120428369864e-05, + "loss": 0.0437, + "step": 31170 + }, + { + "epoch": 0.1509, + "grad_norm": 0.15098395943641663, + "learning_rate": 4.848270259000503e-05, + "loss": 0.0439, + "step": 31180 + }, + { + "epoch": 0.15095, + "grad_norm": 0.1253850758075714, + "learning_rate": 4.848128410963193e-05, + "loss": 0.0436, + "step": 31190 + }, + { + "epoch": 0.151, + "grad_norm": 0.13632526993751526, + "learning_rate": 4.8479864987289336e-05, + "loss": 0.0427, + "step": 31200 + }, + { + "epoch": 0.15105, + "grad_norm": 0.12763665616512299, + "learning_rate": 4.847844522301606e-05, + "loss": 0.0422, + "step": 31210 + }, + { + "epoch": 0.1511, + "grad_norm": 0.12563873827457428, + "learning_rate": 4.8477024816850916e-05, + "loss": 0.045, + "step": 31220 + }, + { + "epoch": 0.15115, + "grad_norm": 0.1258392035961151, + "learning_rate": 4.847560376883272e-05, + "loss": 0.0422, + "step": 31230 + }, + { + "epoch": 0.1512, + "grad_norm": 0.1221856102347374, + "learning_rate": 4.847418207900035e-05, + "loss": 0.0423, + "step": 31240 + }, + { + "epoch": 0.15125, + "grad_norm": 0.1283620297908783, + "learning_rate": 4.847275974739266e-05, + "loss": 0.0414, + "step": 31250 + }, + { + "epoch": 0.1513, + "grad_norm": 0.13173000514507294, + "learning_rate": 4.8471336774048526e-05, + "loss": 0.0446, + "step": 31260 + }, + { + "epoch": 0.15135, + "grad_norm": 0.12331975251436234, + "learning_rate": 4.846991315900687e-05, + "loss": 0.0446, + "step": 31270 + }, + { + "epoch": 0.1514, + "grad_norm": 0.12383377552032471, + "learning_rate": 4.846848890230661e-05, + "loss": 0.0417, + "step": 31280 + }, + { + "epoch": 0.15145, + "grad_norm": 0.14767709374427795, + "learning_rate": 4.8467064003986676e-05, + "loss": 0.043, + "step": 31290 + }, + { + "epoch": 0.1515, + "grad_norm": 0.13305449485778809, + "learning_rate": 4.846563846408602e-05, + "loss": 0.041, + "step": 31300 + }, + { + "epoch": 0.15155, + "grad_norm": 0.1448502540588379, + "learning_rate": 4.846421228264363e-05, + "loss": 0.0448, + "step": 31310 + }, + { + "epoch": 0.1516, + "grad_norm": 0.14288462698459625, + "learning_rate": 4.846278545969849e-05, + "loss": 0.0413, + "step": 31320 + }, + { + "epoch": 0.15165, + "grad_norm": 0.15707068145275116, + "learning_rate": 4.846135799528961e-05, + "loss": 0.0411, + "step": 31330 + }, + { + "epoch": 0.1517, + "grad_norm": 0.13953132927417755, + "learning_rate": 4.845992988945602e-05, + "loss": 0.0427, + "step": 31340 + }, + { + "epoch": 0.15175, + "grad_norm": 0.14889703691005707, + "learning_rate": 4.845850114223677e-05, + "loss": 0.0442, + "step": 31350 + }, + { + "epoch": 0.1518, + "grad_norm": 0.15186628699302673, + "learning_rate": 4.845707175367089e-05, + "loss": 0.0453, + "step": 31360 + }, + { + "epoch": 0.15185, + "grad_norm": 0.14371851086616516, + "learning_rate": 4.8455641723797496e-05, + "loss": 0.0428, + "step": 31370 + }, + { + "epoch": 0.1519, + "grad_norm": 0.1377260982990265, + "learning_rate": 4.8454211052655665e-05, + "loss": 0.044, + "step": 31380 + }, + { + "epoch": 0.15195, + "grad_norm": 0.12451760470867157, + "learning_rate": 4.8452779740284516e-05, + "loss": 0.0416, + "step": 31390 + }, + { + "epoch": 0.152, + "grad_norm": 0.15763983130455017, + "learning_rate": 4.8451347786723175e-05, + "loss": 0.0421, + "step": 31400 + }, + { + "epoch": 0.15205, + "grad_norm": 0.15236034989356995, + "learning_rate": 4.8449915192010795e-05, + "loss": 0.0418, + "step": 31410 + }, + { + "epoch": 0.1521, + "grad_norm": 0.14867082238197327, + "learning_rate": 4.8448481956186556e-05, + "loss": 0.0423, + "step": 31420 + }, + { + "epoch": 0.15215, + "grad_norm": 0.13444784283638, + "learning_rate": 4.844704807928961e-05, + "loss": 0.0432, + "step": 31430 + }, + { + "epoch": 0.1522, + "grad_norm": 0.13199631869792938, + "learning_rate": 4.844561356135919e-05, + "loss": 0.0413, + "step": 31440 + }, + { + "epoch": 0.15225, + "grad_norm": 0.14558719098567963, + "learning_rate": 4.844417840243451e-05, + "loss": 0.0411, + "step": 31450 + }, + { + "epoch": 0.1523, + "grad_norm": 0.1309782713651657, + "learning_rate": 4.8442742602554794e-05, + "loss": 0.0422, + "step": 31460 + }, + { + "epoch": 0.15235, + "grad_norm": 0.11371913552284241, + "learning_rate": 4.84413061617593e-05, + "loss": 0.0422, + "step": 31470 + }, + { + "epoch": 0.1524, + "grad_norm": 0.12032821029424667, + "learning_rate": 4.84398690800873e-05, + "loss": 0.0404, + "step": 31480 + }, + { + "epoch": 0.15245, + "grad_norm": 0.13665400445461273, + "learning_rate": 4.843843135757809e-05, + "loss": 0.042, + "step": 31490 + }, + { + "epoch": 0.1525, + "grad_norm": 0.1554425209760666, + "learning_rate": 4.843699299427097e-05, + "loss": 0.0441, + "step": 31500 + }, + { + "epoch": 0.15255, + "grad_norm": 0.1209154799580574, + "learning_rate": 4.8435553990205265e-05, + "loss": 0.0427, + "step": 31510 + }, + { + "epoch": 0.1526, + "grad_norm": 0.1109633594751358, + "learning_rate": 4.843411434542032e-05, + "loss": 0.0415, + "step": 31520 + }, + { + "epoch": 0.15265, + "grad_norm": 0.10953951627016068, + "learning_rate": 4.8432674059955496e-05, + "loss": 0.0416, + "step": 31530 + }, + { + "epoch": 0.1527, + "grad_norm": 0.10354560613632202, + "learning_rate": 4.843123313385016e-05, + "loss": 0.0416, + "step": 31540 + }, + { + "epoch": 0.15275, + "grad_norm": 0.08530950546264648, + "learning_rate": 4.842979156714372e-05, + "loss": 0.0413, + "step": 31550 + }, + { + "epoch": 0.1528, + "grad_norm": 0.10561627149581909, + "learning_rate": 4.842834935987557e-05, + "loss": 0.0432, + "step": 31560 + }, + { + "epoch": 0.15285, + "grad_norm": 0.11676881462335587, + "learning_rate": 4.842690651208516e-05, + "loss": 0.0417, + "step": 31570 + }, + { + "epoch": 0.1529, + "grad_norm": 0.11251876503229141, + "learning_rate": 4.8425463023811924e-05, + "loss": 0.0417, + "step": 31580 + }, + { + "epoch": 0.15295, + "grad_norm": 0.15875111520290375, + "learning_rate": 4.842401889509532e-05, + "loss": 0.0459, + "step": 31590 + }, + { + "epoch": 0.153, + "grad_norm": 0.1364508420228958, + "learning_rate": 4.8422574125974855e-05, + "loss": 0.0429, + "step": 31600 + }, + { + "epoch": 0.15305, + "grad_norm": 0.13057385385036469, + "learning_rate": 4.8421128716490004e-05, + "loss": 0.0432, + "step": 31610 + }, + { + "epoch": 0.1531, + "grad_norm": 0.12634187936782837, + "learning_rate": 4.84196826666803e-05, + "loss": 0.0414, + "step": 31620 + }, + { + "epoch": 0.15315, + "grad_norm": 0.11812356114387512, + "learning_rate": 4.841823597658527e-05, + "loss": 0.0414, + "step": 31630 + }, + { + "epoch": 0.1532, + "grad_norm": 0.12652729451656342, + "learning_rate": 4.841678864624446e-05, + "loss": 0.0421, + "step": 31640 + }, + { + "epoch": 0.15325, + "grad_norm": 0.11509863287210464, + "learning_rate": 4.841534067569744e-05, + "loss": 0.0451, + "step": 31650 + }, + { + "epoch": 0.1533, + "grad_norm": 0.16723394393920898, + "learning_rate": 4.841389206498381e-05, + "loss": 0.0462, + "step": 31660 + }, + { + "epoch": 0.15335, + "grad_norm": 0.15288612246513367, + "learning_rate": 4.841244281414317e-05, + "loss": 0.0426, + "step": 31670 + }, + { + "epoch": 0.1534, + "grad_norm": 0.13192178308963776, + "learning_rate": 4.841099292321514e-05, + "loss": 0.0443, + "step": 31680 + }, + { + "epoch": 0.15345, + "grad_norm": 0.13673368096351624, + "learning_rate": 4.840954239223935e-05, + "loss": 0.0424, + "step": 31690 + }, + { + "epoch": 0.1535, + "grad_norm": 0.12678951025009155, + "learning_rate": 4.840809122125547e-05, + "loss": 0.0422, + "step": 31700 + }, + { + "epoch": 0.15355, + "grad_norm": 0.11920646578073502, + "learning_rate": 4.840663941030317e-05, + "loss": 0.041, + "step": 31710 + }, + { + "epoch": 0.1536, + "grad_norm": 0.12292204797267914, + "learning_rate": 4.840518695942214e-05, + "loss": 0.0433, + "step": 31720 + }, + { + "epoch": 0.15365, + "grad_norm": 0.14412416517734528, + "learning_rate": 4.8403733868652104e-05, + "loss": 0.044, + "step": 31730 + }, + { + "epoch": 0.1537, + "grad_norm": 0.13721759617328644, + "learning_rate": 4.840228013803276e-05, + "loss": 0.0435, + "step": 31740 + }, + { + "epoch": 0.15375, + "grad_norm": 0.13853123784065247, + "learning_rate": 4.840082576760388e-05, + "loss": 0.0429, + "step": 31750 + }, + { + "epoch": 0.1538, + "grad_norm": 0.1147557869553566, + "learning_rate": 4.839937075740521e-05, + "loss": 0.0429, + "step": 31760 + }, + { + "epoch": 0.15385, + "grad_norm": 0.15210305154323578, + "learning_rate": 4.8397915107476535e-05, + "loss": 0.043, + "step": 31770 + }, + { + "epoch": 0.1539, + "grad_norm": 0.1328851878643036, + "learning_rate": 4.839645881785765e-05, + "loss": 0.0417, + "step": 31780 + }, + { + "epoch": 0.15395, + "grad_norm": 0.10255947709083557, + "learning_rate": 4.8395001888588366e-05, + "loss": 0.0408, + "step": 31790 + }, + { + "epoch": 0.154, + "grad_norm": 0.13945145905017853, + "learning_rate": 4.8393544319708524e-05, + "loss": 0.043, + "step": 31800 + }, + { + "epoch": 0.15405, + "grad_norm": 0.13556864857673645, + "learning_rate": 4.839208611125797e-05, + "loss": 0.0419, + "step": 31810 + }, + { + "epoch": 0.1541, + "grad_norm": 0.12402399629354477, + "learning_rate": 4.839062726327657e-05, + "loss": 0.0444, + "step": 31820 + }, + { + "epoch": 0.15415, + "grad_norm": 0.13532769680023193, + "learning_rate": 4.83891677758042e-05, + "loss": 0.0425, + "step": 31830 + }, + { + "epoch": 0.1542, + "grad_norm": 0.13559764623641968, + "learning_rate": 4.838770764888078e-05, + "loss": 0.0431, + "step": 31840 + }, + { + "epoch": 0.15425, + "grad_norm": 0.12736962735652924, + "learning_rate": 4.838624688254621e-05, + "loss": 0.0422, + "step": 31850 + }, + { + "epoch": 0.1543, + "grad_norm": 0.11483705788850784, + "learning_rate": 4.838478547684045e-05, + "loss": 0.0415, + "step": 31860 + }, + { + "epoch": 0.15435, + "grad_norm": 0.14374835789203644, + "learning_rate": 4.838332343180343e-05, + "loss": 0.0416, + "step": 31870 + }, + { + "epoch": 0.1544, + "grad_norm": 0.14173774421215057, + "learning_rate": 4.8381860747475136e-05, + "loss": 0.0442, + "step": 31880 + }, + { + "epoch": 0.15445, + "grad_norm": 0.1337192803621292, + "learning_rate": 4.838039742389555e-05, + "loss": 0.0448, + "step": 31890 + }, + { + "epoch": 0.1545, + "grad_norm": 0.13194599747657776, + "learning_rate": 4.837893346110469e-05, + "loss": 0.0446, + "step": 31900 + }, + { + "epoch": 0.15455, + "grad_norm": 0.11847416311502457, + "learning_rate": 4.837746885914256e-05, + "loss": 0.0453, + "step": 31910 + }, + { + "epoch": 0.1546, + "grad_norm": 0.15355290472507477, + "learning_rate": 4.8376003618049225e-05, + "loss": 0.042, + "step": 31920 + }, + { + "epoch": 0.15465, + "grad_norm": 0.1322755068540573, + "learning_rate": 4.837453773786472e-05, + "loss": 0.0411, + "step": 31930 + }, + { + "epoch": 0.1547, + "grad_norm": 0.12431639432907104, + "learning_rate": 4.837307121862915e-05, + "loss": 0.042, + "step": 31940 + }, + { + "epoch": 0.15475, + "grad_norm": 0.14812661707401276, + "learning_rate": 4.837160406038258e-05, + "loss": 0.0435, + "step": 31950 + }, + { + "epoch": 0.1548, + "grad_norm": 0.1294979453086853, + "learning_rate": 4.8370136263165146e-05, + "loss": 0.0425, + "step": 31960 + }, + { + "epoch": 0.15485, + "grad_norm": 0.12444666028022766, + "learning_rate": 4.836866782701696e-05, + "loss": 0.0429, + "step": 31970 + }, + { + "epoch": 0.1549, + "grad_norm": 0.1274973303079605, + "learning_rate": 4.836719875197818e-05, + "loss": 0.0446, + "step": 31980 + }, + { + "epoch": 0.15495, + "grad_norm": 0.13861462473869324, + "learning_rate": 4.836572903808896e-05, + "loss": 0.0428, + "step": 31990 + }, + { + "epoch": 0.155, + "grad_norm": 0.13576054573059082, + "learning_rate": 4.836425868538949e-05, + "loss": 0.0446, + "step": 32000 + }, + { + "epoch": 0.15505, + "grad_norm": 0.19204431772232056, + "learning_rate": 4.8362787693919967e-05, + "loss": 0.0435, + "step": 32010 + }, + { + "epoch": 0.1551, + "grad_norm": 0.11911473423242569, + "learning_rate": 4.83613160637206e-05, + "loss": 0.0441, + "step": 32020 + }, + { + "epoch": 0.15515, + "grad_norm": 0.1512276828289032, + "learning_rate": 4.835984379483163e-05, + "loss": 0.0443, + "step": 32030 + }, + { + "epoch": 0.1552, + "grad_norm": 0.1160498857498169, + "learning_rate": 4.83583708872933e-05, + "loss": 0.0431, + "step": 32040 + }, + { + "epoch": 0.15525, + "grad_norm": 0.11876894533634186, + "learning_rate": 4.835689734114589e-05, + "loss": 0.043, + "step": 32050 + }, + { + "epoch": 0.1553, + "grad_norm": 0.12058950960636139, + "learning_rate": 4.835542315642968e-05, + "loss": 0.0414, + "step": 32060 + }, + { + "epoch": 0.15535, + "grad_norm": 0.1484060287475586, + "learning_rate": 4.8353948333184986e-05, + "loss": 0.0445, + "step": 32070 + }, + { + "epoch": 0.1554, + "grad_norm": 0.12514857947826385, + "learning_rate": 4.8352472871452106e-05, + "loss": 0.0443, + "step": 32080 + }, + { + "epoch": 0.15545, + "grad_norm": 0.12500616908073425, + "learning_rate": 4.8350996771271394e-05, + "loss": 0.0467, + "step": 32090 + }, + { + "epoch": 0.1555, + "grad_norm": 0.1326914280653, + "learning_rate": 4.83495200326832e-05, + "loss": 0.0443, + "step": 32100 + }, + { + "epoch": 0.15555, + "grad_norm": 0.1272750347852707, + "learning_rate": 4.834804265572791e-05, + "loss": 0.0423, + "step": 32110 + }, + { + "epoch": 0.1556, + "grad_norm": 0.14041025936603546, + "learning_rate": 4.8346564640445905e-05, + "loss": 0.0421, + "step": 32120 + }, + { + "epoch": 0.15565, + "grad_norm": 0.13470202684402466, + "learning_rate": 4.834508598687758e-05, + "loss": 0.0446, + "step": 32130 + }, + { + "epoch": 0.1557, + "grad_norm": 0.16813085973262787, + "learning_rate": 4.8343606695063384e-05, + "loss": 0.046, + "step": 32140 + }, + { + "epoch": 0.15575, + "grad_norm": 0.1304176151752472, + "learning_rate": 4.8342126765043746e-05, + "loss": 0.0429, + "step": 32150 + }, + { + "epoch": 0.1558, + "grad_norm": 0.11898932605981827, + "learning_rate": 4.834064619685914e-05, + "loss": 0.0412, + "step": 32160 + }, + { + "epoch": 0.15585, + "grad_norm": 0.11131195724010468, + "learning_rate": 4.833916499055003e-05, + "loss": 0.042, + "step": 32170 + }, + { + "epoch": 0.1559, + "grad_norm": 0.12093434482812881, + "learning_rate": 4.833768314615692e-05, + "loss": 0.0413, + "step": 32180 + }, + { + "epoch": 0.15595, + "grad_norm": 0.13842767477035522, + "learning_rate": 4.833620066372031e-05, + "loss": 0.0423, + "step": 32190 + }, + { + "epoch": 0.156, + "grad_norm": 0.1440475434064865, + "learning_rate": 4.833471754328075e-05, + "loss": 0.0424, + "step": 32200 + }, + { + "epoch": 0.15605, + "grad_norm": 0.13751320540905, + "learning_rate": 4.8333233784878785e-05, + "loss": 0.0416, + "step": 32210 + }, + { + "epoch": 0.1561, + "grad_norm": 0.15234899520874023, + "learning_rate": 4.8331749388554956e-05, + "loss": 0.0426, + "step": 32220 + }, + { + "epoch": 0.15615, + "grad_norm": 0.1456148475408554, + "learning_rate": 4.8330264354349886e-05, + "loss": 0.0444, + "step": 32230 + }, + { + "epoch": 0.1562, + "grad_norm": 0.14833270013332367, + "learning_rate": 4.832877868230414e-05, + "loss": 0.0444, + "step": 32240 + }, + { + "epoch": 0.15625, + "grad_norm": 0.14564715325832367, + "learning_rate": 4.832729237245835e-05, + "loss": 0.0434, + "step": 32250 + }, + { + "epoch": 0.1563, + "grad_norm": 0.13104142248630524, + "learning_rate": 4.832580542485316e-05, + "loss": 0.0433, + "step": 32260 + }, + { + "epoch": 0.15635, + "grad_norm": 0.12680700421333313, + "learning_rate": 4.83243178395292e-05, + "loss": 0.0432, + "step": 32270 + }, + { + "epoch": 0.1564, + "grad_norm": 0.1314367651939392, + "learning_rate": 4.832282961652716e-05, + "loss": 0.043, + "step": 32280 + }, + { + "epoch": 0.15645, + "grad_norm": 0.13626080751419067, + "learning_rate": 4.832134075588771e-05, + "loss": 0.0437, + "step": 32290 + }, + { + "epoch": 0.1565, + "grad_norm": 0.16060654819011688, + "learning_rate": 4.831985125765157e-05, + "loss": 0.0452, + "step": 32300 + }, + { + "epoch": 0.15655, + "grad_norm": 0.13280725479125977, + "learning_rate": 4.831836112185946e-05, + "loss": 0.0461, + "step": 32310 + }, + { + "epoch": 0.1566, + "grad_norm": 0.14577241241931915, + "learning_rate": 4.8316870348552116e-05, + "loss": 0.044, + "step": 32320 + }, + { + "epoch": 0.15665, + "grad_norm": 0.12814298272132874, + "learning_rate": 4.83153789377703e-05, + "loss": 0.0439, + "step": 32330 + }, + { + "epoch": 0.1567, + "grad_norm": 0.1262589991092682, + "learning_rate": 4.831388688955478e-05, + "loss": 0.044, + "step": 32340 + }, + { + "epoch": 0.15675, + "grad_norm": 0.13185927271842957, + "learning_rate": 4.8312394203946356e-05, + "loss": 0.0477, + "step": 32350 + }, + { + "epoch": 0.1568, + "grad_norm": 0.11433325707912445, + "learning_rate": 4.831090088098582e-05, + "loss": 0.0425, + "step": 32360 + }, + { + "epoch": 0.15685, + "grad_norm": 0.12537720799446106, + "learning_rate": 4.8309406920714024e-05, + "loss": 0.0455, + "step": 32370 + }, + { + "epoch": 0.1569, + "grad_norm": 0.12167773395776749, + "learning_rate": 4.83079123231718e-05, + "loss": 0.0438, + "step": 32380 + }, + { + "epoch": 0.15695, + "grad_norm": 0.1502913534641266, + "learning_rate": 4.83064170884e-05, + "loss": 0.0436, + "step": 32390 + }, + { + "epoch": 0.157, + "grad_norm": 0.13366125524044037, + "learning_rate": 4.830492121643951e-05, + "loss": 0.0464, + "step": 32400 + }, + { + "epoch": 0.15705, + "grad_norm": 0.10904904454946518, + "learning_rate": 4.830342470733125e-05, + "loss": 0.0436, + "step": 32410 + }, + { + "epoch": 0.1571, + "grad_norm": 0.13437418639659882, + "learning_rate": 4.8301927561116095e-05, + "loss": 0.0448, + "step": 32420 + }, + { + "epoch": 0.15715, + "grad_norm": 0.1309521645307541, + "learning_rate": 4.8300429777835e-05, + "loss": 0.0441, + "step": 32430 + }, + { + "epoch": 0.1572, + "grad_norm": 0.11571266502141953, + "learning_rate": 4.829893135752891e-05, + "loss": 0.0441, + "step": 32440 + }, + { + "epoch": 0.15725, + "grad_norm": 0.13288447260856628, + "learning_rate": 4.829743230023879e-05, + "loss": 0.0431, + "step": 32450 + }, + { + "epoch": 0.1573, + "grad_norm": 0.10984359681606293, + "learning_rate": 4.829593260600561e-05, + "loss": 0.0422, + "step": 32460 + }, + { + "epoch": 0.15735, + "grad_norm": 0.12429521977901459, + "learning_rate": 4.82944322748704e-05, + "loss": 0.0438, + "step": 32470 + }, + { + "epoch": 0.1574, + "grad_norm": 0.11131302267313004, + "learning_rate": 4.829293130687416e-05, + "loss": 0.045, + "step": 32480 + }, + { + "epoch": 0.15745, + "grad_norm": 0.1384342461824417, + "learning_rate": 4.829142970205792e-05, + "loss": 0.0446, + "step": 32490 + }, + { + "epoch": 0.1575, + "grad_norm": 0.12439969182014465, + "learning_rate": 4.828992746046276e-05, + "loss": 0.0443, + "step": 32500 + }, + { + "epoch": 0.15755, + "grad_norm": 0.14163966476917267, + "learning_rate": 4.828842458212972e-05, + "loss": 0.0451, + "step": 32510 + }, + { + "epoch": 0.1576, + "grad_norm": 0.1380494385957718, + "learning_rate": 4.82869210670999e-05, + "loss": 0.044, + "step": 32520 + }, + { + "epoch": 0.15765, + "grad_norm": 0.13433505594730377, + "learning_rate": 4.8285416915414406e-05, + "loss": 0.0427, + "step": 32530 + }, + { + "epoch": 0.1577, + "grad_norm": 0.12882503867149353, + "learning_rate": 4.828391212711437e-05, + "loss": 0.0415, + "step": 32540 + }, + { + "epoch": 0.15775, + "grad_norm": 0.1770928055047989, + "learning_rate": 4.828240670224092e-05, + "loss": 0.0439, + "step": 32550 + }, + { + "epoch": 0.1578, + "grad_norm": 0.1492464393377304, + "learning_rate": 4.828090064083521e-05, + "loss": 0.0416, + "step": 32560 + }, + { + "epoch": 0.15785, + "grad_norm": 0.1461704522371292, + "learning_rate": 4.8279393942938434e-05, + "loss": 0.0427, + "step": 32570 + }, + { + "epoch": 0.1579, + "grad_norm": 0.14312250912189484, + "learning_rate": 4.8277886608591766e-05, + "loss": 0.0421, + "step": 32580 + }, + { + "epoch": 0.15795, + "grad_norm": 0.13308829069137573, + "learning_rate": 4.827637863783643e-05, + "loss": 0.0445, + "step": 32590 + }, + { + "epoch": 0.158, + "grad_norm": 0.13714931905269623, + "learning_rate": 4.827487003071364e-05, + "loss": 0.0423, + "step": 32600 + }, + { + "epoch": 0.15805, + "grad_norm": 0.1418546587228775, + "learning_rate": 4.8273360787264644e-05, + "loss": 0.043, + "step": 32610 + }, + { + "epoch": 0.1581, + "grad_norm": 0.13813413679599762, + "learning_rate": 4.8271850907530715e-05, + "loss": 0.0421, + "step": 32620 + }, + { + "epoch": 0.15815, + "grad_norm": 0.1461115926504135, + "learning_rate": 4.827034039155312e-05, + "loss": 0.0423, + "step": 32630 + }, + { + "epoch": 0.1582, + "grad_norm": 0.15559233725070953, + "learning_rate": 4.826882923937317e-05, + "loss": 0.0451, + "step": 32640 + }, + { + "epoch": 0.15825, + "grad_norm": 0.12452242523431778, + "learning_rate": 4.826731745103216e-05, + "loss": 0.0427, + "step": 32650 + }, + { + "epoch": 0.1583, + "grad_norm": 0.11743653565645218, + "learning_rate": 4.826580502657144e-05, + "loss": 0.0429, + "step": 32660 + }, + { + "epoch": 0.15835, + "grad_norm": 0.12276814877986908, + "learning_rate": 4.826429196603235e-05, + "loss": 0.0436, + "step": 32670 + }, + { + "epoch": 0.1584, + "grad_norm": 0.13034766912460327, + "learning_rate": 4.826277826945625e-05, + "loss": 0.0398, + "step": 32680 + }, + { + "epoch": 0.15845, + "grad_norm": 0.1436816155910492, + "learning_rate": 4.826126393688454e-05, + "loss": 0.0417, + "step": 32690 + }, + { + "epoch": 0.1585, + "grad_norm": 0.14281252026557922, + "learning_rate": 4.825974896835861e-05, + "loss": 0.0448, + "step": 32700 + }, + { + "epoch": 0.15855, + "grad_norm": 0.133723646402359, + "learning_rate": 4.825823336391988e-05, + "loss": 0.0445, + "step": 32710 + }, + { + "epoch": 0.1586, + "grad_norm": 0.12675301730632782, + "learning_rate": 4.825671712360978e-05, + "loss": 0.0416, + "step": 32720 + }, + { + "epoch": 0.15865, + "grad_norm": 0.12376851588487625, + "learning_rate": 4.825520024746978e-05, + "loss": 0.0412, + "step": 32730 + }, + { + "epoch": 0.1587, + "grad_norm": 0.13101379573345184, + "learning_rate": 4.825368273554135e-05, + "loss": 0.0419, + "step": 32740 + }, + { + "epoch": 0.15875, + "grad_norm": 0.12889224290847778, + "learning_rate": 4.825216458786596e-05, + "loss": 0.041, + "step": 32750 + }, + { + "epoch": 0.1588, + "grad_norm": 0.11569836735725403, + "learning_rate": 4.8250645804485125e-05, + "loss": 0.0406, + "step": 32760 + }, + { + "epoch": 0.15885, + "grad_norm": 0.1261492222547531, + "learning_rate": 4.824912638544037e-05, + "loss": 0.0427, + "step": 32770 + }, + { + "epoch": 0.1589, + "grad_norm": 0.10232339799404144, + "learning_rate": 4.824760633077323e-05, + "loss": 0.0427, + "step": 32780 + }, + { + "epoch": 0.15895, + "grad_norm": 0.10576627403497696, + "learning_rate": 4.8246085640525276e-05, + "loss": 0.0413, + "step": 32790 + }, + { + "epoch": 0.159, + "grad_norm": 0.11924322694540024, + "learning_rate": 4.824456431473807e-05, + "loss": 0.0413, + "step": 32800 + }, + { + "epoch": 0.15905, + "grad_norm": 0.13449467718601227, + "learning_rate": 4.82430423534532e-05, + "loss": 0.0421, + "step": 32810 + }, + { + "epoch": 0.1591, + "grad_norm": 0.13242179155349731, + "learning_rate": 4.8241519756712293e-05, + "loss": 0.0422, + "step": 32820 + }, + { + "epoch": 0.15915, + "grad_norm": 0.11021512001752853, + "learning_rate": 4.823999652455696e-05, + "loss": 0.0428, + "step": 32830 + }, + { + "epoch": 0.1592, + "grad_norm": 0.1300233155488968, + "learning_rate": 4.823847265702887e-05, + "loss": 0.0448, + "step": 32840 + }, + { + "epoch": 0.15925, + "grad_norm": 0.1332443505525589, + "learning_rate": 4.823694815416965e-05, + "loss": 0.0426, + "step": 32850 + }, + { + "epoch": 0.1593, + "grad_norm": 0.11915509402751923, + "learning_rate": 4.8235423016021e-05, + "loss": 0.0433, + "step": 32860 + }, + { + "epoch": 0.15935, + "grad_norm": 0.1483246237039566, + "learning_rate": 4.8233897242624616e-05, + "loss": 0.044, + "step": 32870 + }, + { + "epoch": 0.1594, + "grad_norm": 0.10429392009973526, + "learning_rate": 4.823237083402221e-05, + "loss": 0.0404, + "step": 32880 + }, + { + "epoch": 0.15945, + "grad_norm": 0.10859861224889755, + "learning_rate": 4.823084379025552e-05, + "loss": 0.0409, + "step": 32890 + }, + { + "epoch": 0.1595, + "grad_norm": 0.12180684506893158, + "learning_rate": 4.822931611136628e-05, + "loss": 0.0414, + "step": 32900 + }, + { + "epoch": 0.15955, + "grad_norm": 0.10532913357019424, + "learning_rate": 4.8227787797396265e-05, + "loss": 0.0421, + "step": 32910 + }, + { + "epoch": 0.1596, + "grad_norm": 0.11721262335777283, + "learning_rate": 4.822625884838726e-05, + "loss": 0.0426, + "step": 32920 + }, + { + "epoch": 0.15965, + "grad_norm": 0.1321670413017273, + "learning_rate": 4.8224729264381065e-05, + "loss": 0.0426, + "step": 32930 + }, + { + "epoch": 0.1597, + "grad_norm": 0.13177931308746338, + "learning_rate": 4.82231990454195e-05, + "loss": 0.0419, + "step": 32940 + }, + { + "epoch": 0.15975, + "grad_norm": 0.1313125044107437, + "learning_rate": 4.822166819154439e-05, + "loss": 0.0429, + "step": 32950 + }, + { + "epoch": 0.1598, + "grad_norm": 0.12595787644386292, + "learning_rate": 4.8220136702797596e-05, + "loss": 0.0434, + "step": 32960 + }, + { + "epoch": 0.15985, + "grad_norm": 0.13454866409301758, + "learning_rate": 4.8218604579220994e-05, + "loss": 0.0443, + "step": 32970 + }, + { + "epoch": 0.1599, + "grad_norm": 0.12417369335889816, + "learning_rate": 4.821707182085646e-05, + "loss": 0.0434, + "step": 32980 + }, + { + "epoch": 0.15995, + "grad_norm": 0.12248330563306808, + "learning_rate": 4.821553842774591e-05, + "loss": 0.0425, + "step": 32990 + }, + { + "epoch": 0.16, + "grad_norm": 0.12675760686397552, + "learning_rate": 4.8214004399931255e-05, + "loss": 0.0419, + "step": 33000 + }, + { + "epoch": 0.16005, + "grad_norm": 0.13656532764434814, + "learning_rate": 4.8212469737454444e-05, + "loss": 0.0417, + "step": 33010 + }, + { + "epoch": 0.1601, + "grad_norm": 0.12864769995212555, + "learning_rate": 4.821093444035743e-05, + "loss": 0.0426, + "step": 33020 + }, + { + "epoch": 0.16015, + "grad_norm": 0.12323490530252457, + "learning_rate": 4.820939850868219e-05, + "loss": 0.0431, + "step": 33030 + }, + { + "epoch": 0.1602, + "grad_norm": 0.1335224062204361, + "learning_rate": 4.8207861942470714e-05, + "loss": 0.0427, + "step": 33040 + }, + { + "epoch": 0.16025, + "grad_norm": 0.14590322971343994, + "learning_rate": 4.8206324741765006e-05, + "loss": 0.0427, + "step": 33050 + }, + { + "epoch": 0.1603, + "grad_norm": 0.1478782743215561, + "learning_rate": 4.820478690660711e-05, + "loss": 0.0428, + "step": 33060 + }, + { + "epoch": 0.16035, + "grad_norm": 0.15049858391284943, + "learning_rate": 4.820324843703905e-05, + "loss": 0.0451, + "step": 33070 + }, + { + "epoch": 0.1604, + "grad_norm": 0.14486844837665558, + "learning_rate": 4.82017093331029e-05, + "loss": 0.0427, + "step": 33080 + }, + { + "epoch": 0.16045, + "grad_norm": 0.14537589251995087, + "learning_rate": 4.8200169594840713e-05, + "loss": 0.0448, + "step": 33090 + }, + { + "epoch": 0.1605, + "grad_norm": 0.1660872846841812, + "learning_rate": 4.819862922229463e-05, + "loss": 0.0424, + "step": 33100 + }, + { + "epoch": 0.16055, + "grad_norm": 0.12716545164585114, + "learning_rate": 4.8197088215506724e-05, + "loss": 0.0424, + "step": 33110 + }, + { + "epoch": 0.1606, + "grad_norm": 0.13636474311351776, + "learning_rate": 4.819554657451915e-05, + "loss": 0.0434, + "step": 33120 + }, + { + "epoch": 0.16065, + "grad_norm": 0.14646953344345093, + "learning_rate": 4.819400429937404e-05, + "loss": 0.0422, + "step": 33130 + }, + { + "epoch": 0.1607, + "grad_norm": 0.14615385234355927, + "learning_rate": 4.819246139011358e-05, + "loss": 0.0444, + "step": 33140 + }, + { + "epoch": 0.16075, + "grad_norm": 0.13825707137584686, + "learning_rate": 4.819091784677992e-05, + "loss": 0.0434, + "step": 33150 + }, + { + "epoch": 0.1608, + "grad_norm": 0.11570119857788086, + "learning_rate": 4.8189373669415284e-05, + "loss": 0.0407, + "step": 33160 + }, + { + "epoch": 0.16085, + "grad_norm": 0.09911048412322998, + "learning_rate": 4.818782885806189e-05, + "loss": 0.0422, + "step": 33170 + }, + { + "epoch": 0.1609, + "grad_norm": 0.15140579640865326, + "learning_rate": 4.818628341276196e-05, + "loss": 0.046, + "step": 33180 + }, + { + "epoch": 0.16095, + "grad_norm": 0.14800402522087097, + "learning_rate": 4.8184737333557754e-05, + "loss": 0.0451, + "step": 33190 + }, + { + "epoch": 0.161, + "grad_norm": 0.18855224549770355, + "learning_rate": 4.818319062049154e-05, + "loss": 0.0445, + "step": 33200 + }, + { + "epoch": 0.16105, + "grad_norm": 0.15011833608150482, + "learning_rate": 4.8181643273605605e-05, + "loss": 0.048, + "step": 33210 + }, + { + "epoch": 0.1611, + "grad_norm": 0.1251029670238495, + "learning_rate": 4.818009529294225e-05, + "loss": 0.0449, + "step": 33220 + }, + { + "epoch": 0.16115, + "grad_norm": 0.1206798106431961, + "learning_rate": 4.81785466785438e-05, + "loss": 0.0429, + "step": 33230 + }, + { + "epoch": 0.1612, + "grad_norm": 0.1255817860364914, + "learning_rate": 4.817699743045259e-05, + "loss": 0.0415, + "step": 33240 + }, + { + "epoch": 0.16125, + "grad_norm": 0.15104557573795319, + "learning_rate": 4.817544754871098e-05, + "loss": 0.0458, + "step": 33250 + }, + { + "epoch": 0.1613, + "grad_norm": 0.1329016238451004, + "learning_rate": 4.8173897033361336e-05, + "loss": 0.0428, + "step": 33260 + }, + { + "epoch": 0.16135, + "grad_norm": 0.12674003839492798, + "learning_rate": 4.8172345884446056e-05, + "loss": 0.0429, + "step": 33270 + }, + { + "epoch": 0.1614, + "grad_norm": 0.12564903497695923, + "learning_rate": 4.817079410200754e-05, + "loss": 0.0431, + "step": 33280 + }, + { + "epoch": 0.16145, + "grad_norm": 0.13768647611141205, + "learning_rate": 4.816924168608823e-05, + "loss": 0.0435, + "step": 33290 + }, + { + "epoch": 0.1615, + "grad_norm": 0.10589944571256638, + "learning_rate": 4.816768863673055e-05, + "loss": 0.0427, + "step": 33300 + }, + { + "epoch": 0.16155, + "grad_norm": 0.11592059582471848, + "learning_rate": 4.816613495397696e-05, + "loss": 0.0417, + "step": 33310 + }, + { + "epoch": 0.1616, + "grad_norm": 0.10585790127515793, + "learning_rate": 4.8164580637869946e-05, + "loss": 0.041, + "step": 33320 + }, + { + "epoch": 0.16165, + "grad_norm": 0.12283612787723541, + "learning_rate": 4.816302568845201e-05, + "loss": 0.0421, + "step": 33330 + }, + { + "epoch": 0.1617, + "grad_norm": 0.1281306892633438, + "learning_rate": 4.816147010576565e-05, + "loss": 0.0412, + "step": 33340 + }, + { + "epoch": 0.16175, + "grad_norm": 0.1307964026927948, + "learning_rate": 4.815991388985339e-05, + "loss": 0.0421, + "step": 33350 + }, + { + "epoch": 0.1618, + "grad_norm": 0.12150103598833084, + "learning_rate": 4.8158357040757794e-05, + "loss": 0.044, + "step": 33360 + }, + { + "epoch": 0.16185, + "grad_norm": 0.14208216965198517, + "learning_rate": 4.8156799558521406e-05, + "loss": 0.0451, + "step": 33370 + }, + { + "epoch": 0.1619, + "grad_norm": 0.12786491215229034, + "learning_rate": 4.815524144318683e-05, + "loss": 0.0432, + "step": 33380 + }, + { + "epoch": 0.16195, + "grad_norm": 0.1281542181968689, + "learning_rate": 4.815368269479664e-05, + "loss": 0.0424, + "step": 33390 + }, + { + "epoch": 0.162, + "grad_norm": 0.13442762196063995, + "learning_rate": 4.8152123313393475e-05, + "loss": 0.0443, + "step": 33400 + }, + { + "epoch": 0.16205, + "grad_norm": 0.11269143223762512, + "learning_rate": 4.8150563299019955e-05, + "loss": 0.0434, + "step": 33410 + }, + { + "epoch": 0.1621, + "grad_norm": 0.11426067352294922, + "learning_rate": 4.8149002651718725e-05, + "loss": 0.0426, + "step": 33420 + }, + { + "epoch": 0.16215, + "grad_norm": 0.10544957965612411, + "learning_rate": 4.814744137153247e-05, + "loss": 0.0444, + "step": 33430 + }, + { + "epoch": 0.1622, + "grad_norm": 0.15541492402553558, + "learning_rate": 4.814587945850385e-05, + "loss": 0.0466, + "step": 33440 + }, + { + "epoch": 0.16225, + "grad_norm": 0.14596639573574066, + "learning_rate": 4.814431691267559e-05, + "loss": 0.0422, + "step": 33450 + }, + { + "epoch": 0.1623, + "grad_norm": 0.11651241034269333, + "learning_rate": 4.8142753734090395e-05, + "loss": 0.0419, + "step": 33460 + }, + { + "epoch": 0.16235, + "grad_norm": 0.11221083253622055, + "learning_rate": 4.8141189922791014e-05, + "loss": 0.0416, + "step": 33470 + }, + { + "epoch": 0.1624, + "grad_norm": 0.10135234147310257, + "learning_rate": 4.813962547882019e-05, + "loss": 0.0417, + "step": 33480 + }, + { + "epoch": 0.16245, + "grad_norm": 0.132141575217247, + "learning_rate": 4.81380604022207e-05, + "loss": 0.0405, + "step": 33490 + }, + { + "epoch": 0.1625, + "grad_norm": 0.13567662239074707, + "learning_rate": 4.813649469303533e-05, + "loss": 0.0421, + "step": 33500 + }, + { + "epoch": 0.16255, + "grad_norm": 0.13327448070049286, + "learning_rate": 4.813492835130688e-05, + "loss": 0.0421, + "step": 33510 + }, + { + "epoch": 0.1626, + "grad_norm": 0.10723388195037842, + "learning_rate": 4.813336137707819e-05, + "loss": 0.0421, + "step": 33520 + }, + { + "epoch": 0.16265, + "grad_norm": 0.1065554991364479, + "learning_rate": 4.813179377039209e-05, + "loss": 0.0417, + "step": 33530 + }, + { + "epoch": 0.1627, + "grad_norm": 0.11611859500408173, + "learning_rate": 4.813022553129144e-05, + "loss": 0.0405, + "step": 33540 + }, + { + "epoch": 0.16275, + "grad_norm": 0.13964098691940308, + "learning_rate": 4.812865665981911e-05, + "loss": 0.0444, + "step": 33550 + }, + { + "epoch": 0.1628, + "grad_norm": 0.1278601586818695, + "learning_rate": 4.8127087156018e-05, + "loss": 0.0429, + "step": 33560 + }, + { + "epoch": 0.16285, + "grad_norm": 0.14337113499641418, + "learning_rate": 4.812551701993101e-05, + "loss": 0.0407, + "step": 33570 + }, + { + "epoch": 0.1629, + "grad_norm": 0.1198313906788826, + "learning_rate": 4.812394625160107e-05, + "loss": 0.044, + "step": 33580 + }, + { + "epoch": 0.16295, + "grad_norm": 0.13165201246738434, + "learning_rate": 4.8122374851071134e-05, + "loss": 0.0434, + "step": 33590 + }, + { + "epoch": 0.163, + "grad_norm": 0.12109248340129852, + "learning_rate": 4.812080281838415e-05, + "loss": 0.0418, + "step": 33600 + }, + { + "epoch": 0.16305, + "grad_norm": 0.11725418269634247, + "learning_rate": 4.811923015358311e-05, + "loss": 0.044, + "step": 33610 + }, + { + "epoch": 0.1631, + "grad_norm": 0.12260973453521729, + "learning_rate": 4.8117656856711005e-05, + "loss": 0.0429, + "step": 33620 + }, + { + "epoch": 0.16315, + "grad_norm": 0.14538836479187012, + "learning_rate": 4.8116082927810836e-05, + "loss": 0.0427, + "step": 33630 + }, + { + "epoch": 0.1632, + "grad_norm": 0.12341050058603287, + "learning_rate": 4.811450836692565e-05, + "loss": 0.0411, + "step": 33640 + }, + { + "epoch": 0.16325, + "grad_norm": 0.12862537801265717, + "learning_rate": 4.811293317409848e-05, + "loss": 0.043, + "step": 33650 + }, + { + "epoch": 0.1633, + "grad_norm": 0.13257691264152527, + "learning_rate": 4.811135734937242e-05, + "loss": 0.044, + "step": 33660 + }, + { + "epoch": 0.16335, + "grad_norm": 0.1441202312707901, + "learning_rate": 4.810978089279052e-05, + "loss": 0.0436, + "step": 33670 + }, + { + "epoch": 0.1634, + "grad_norm": 0.12990273535251617, + "learning_rate": 4.810820380439589e-05, + "loss": 0.0475, + "step": 33680 + }, + { + "epoch": 0.16345, + "grad_norm": 0.13740640878677368, + "learning_rate": 4.8106626084231656e-05, + "loss": 0.0451, + "step": 33690 + }, + { + "epoch": 0.1635, + "grad_norm": 0.1576330065727234, + "learning_rate": 4.810504773234094e-05, + "loss": 0.0428, + "step": 33700 + }, + { + "epoch": 0.16355, + "grad_norm": 0.13011543452739716, + "learning_rate": 4.81034687487669e-05, + "loss": 0.0445, + "step": 33710 + }, + { + "epoch": 0.1636, + "grad_norm": 0.129585400223732, + "learning_rate": 4.8101889133552706e-05, + "loss": 0.0425, + "step": 33720 + }, + { + "epoch": 0.16365, + "grad_norm": 0.13436222076416016, + "learning_rate": 4.810030888674154e-05, + "loss": 0.0418, + "step": 33730 + }, + { + "epoch": 0.1637, + "grad_norm": 0.11319204419851303, + "learning_rate": 4.809872800837662e-05, + "loss": 0.0415, + "step": 33740 + }, + { + "epoch": 0.16375, + "grad_norm": 0.09980058670043945, + "learning_rate": 4.809714649850113e-05, + "loss": 0.0424, + "step": 33750 + }, + { + "epoch": 0.1638, + "grad_norm": 0.13024428486824036, + "learning_rate": 4.809556435715835e-05, + "loss": 0.0449, + "step": 33760 + }, + { + "epoch": 0.16385, + "grad_norm": 0.13720740377902985, + "learning_rate": 4.809398158439151e-05, + "loss": 0.0429, + "step": 33770 + }, + { + "epoch": 0.1639, + "grad_norm": 0.1280098259449005, + "learning_rate": 4.809239818024389e-05, + "loss": 0.0418, + "step": 33780 + }, + { + "epoch": 0.16395, + "grad_norm": 0.13328638672828674, + "learning_rate": 4.8090814144758787e-05, + "loss": 0.0434, + "step": 33790 + }, + { + "epoch": 0.164, + "grad_norm": 0.1063094437122345, + "learning_rate": 4.808922947797949e-05, + "loss": 0.0433, + "step": 33800 + }, + { + "epoch": 0.16405, + "grad_norm": 0.10085838288068771, + "learning_rate": 4.8087644179949335e-05, + "loss": 0.044, + "step": 33810 + }, + { + "epoch": 0.1641, + "grad_norm": 0.1739586889743805, + "learning_rate": 4.808605825071166e-05, + "loss": 0.0462, + "step": 33820 + }, + { + "epoch": 0.16415, + "grad_norm": 0.12872996926307678, + "learning_rate": 4.808447169030983e-05, + "loss": 0.043, + "step": 33830 + }, + { + "epoch": 0.1642, + "grad_norm": 0.13191847503185272, + "learning_rate": 4.808288449878722e-05, + "loss": 0.0444, + "step": 33840 + }, + { + "epoch": 0.16425, + "grad_norm": 0.13148938119411469, + "learning_rate": 4.8081296676187214e-05, + "loss": 0.0437, + "step": 33850 + }, + { + "epoch": 0.1643, + "grad_norm": 0.11032675951719284, + "learning_rate": 4.807970822255323e-05, + "loss": 0.0432, + "step": 33860 + }, + { + "epoch": 0.16435, + "grad_norm": 0.11404123902320862, + "learning_rate": 4.807811913792869e-05, + "loss": 0.0422, + "step": 33870 + }, + { + "epoch": 0.1644, + "grad_norm": 0.11356104910373688, + "learning_rate": 4.8076529422357054e-05, + "loss": 0.0438, + "step": 33880 + }, + { + "epoch": 0.16445, + "grad_norm": 0.10598156601190567, + "learning_rate": 4.807493907588176e-05, + "loss": 0.043, + "step": 33890 + }, + { + "epoch": 0.1645, + "grad_norm": 0.11783069372177124, + "learning_rate": 4.807334809854631e-05, + "loss": 0.0425, + "step": 33900 + }, + { + "epoch": 0.16455, + "grad_norm": 0.1382354497909546, + "learning_rate": 4.807175649039418e-05, + "loss": 0.0437, + "step": 33910 + }, + { + "epoch": 0.1646, + "grad_norm": 0.11479967087507248, + "learning_rate": 4.807016425146891e-05, + "loss": 0.0424, + "step": 33920 + }, + { + "epoch": 0.16465, + "grad_norm": 0.11492490768432617, + "learning_rate": 4.806857138181401e-05, + "loss": 0.0423, + "step": 33930 + }, + { + "epoch": 0.1647, + "grad_norm": 0.10321938991546631, + "learning_rate": 4.806697788147303e-05, + "loss": 0.0429, + "step": 33940 + }, + { + "epoch": 0.16475, + "grad_norm": 0.13526004552841187, + "learning_rate": 4.8065383750489544e-05, + "loss": 0.0416, + "step": 33950 + }, + { + "epoch": 0.1648, + "grad_norm": 0.14217951893806458, + "learning_rate": 4.806378898890713e-05, + "loss": 0.0429, + "step": 33960 + }, + { + "epoch": 0.16485, + "grad_norm": 0.12112054973840714, + "learning_rate": 4.806219359676939e-05, + "loss": 0.0428, + "step": 33970 + }, + { + "epoch": 0.1649, + "grad_norm": 0.13174232840538025, + "learning_rate": 4.8060597574119945e-05, + "loss": 0.0423, + "step": 33980 + }, + { + "epoch": 0.16495, + "grad_norm": 0.1228984072804451, + "learning_rate": 4.805900092100242e-05, + "loss": 0.0422, + "step": 33990 + }, + { + "epoch": 0.165, + "grad_norm": 0.14152731001377106, + "learning_rate": 4.8057403637460475e-05, + "loss": 0.0421, + "step": 34000 + }, + { + "epoch": 0.16505, + "grad_norm": 0.10901544243097305, + "learning_rate": 4.8055805723537775e-05, + "loss": 0.0439, + "step": 34010 + }, + { + "epoch": 0.1651, + "grad_norm": 0.12598423659801483, + "learning_rate": 4.805420717927802e-05, + "loss": 0.0425, + "step": 34020 + }, + { + "epoch": 0.16515, + "grad_norm": 0.13269755244255066, + "learning_rate": 4.805260800472489e-05, + "loss": 0.0439, + "step": 34030 + }, + { + "epoch": 0.1652, + "grad_norm": 0.11629320681095123, + "learning_rate": 4.8051008199922123e-05, + "loss": 0.0426, + "step": 34040 + }, + { + "epoch": 0.16525, + "grad_norm": 0.1244744211435318, + "learning_rate": 4.804940776491345e-05, + "loss": 0.042, + "step": 34050 + }, + { + "epoch": 0.1653, + "grad_norm": 0.11696765571832657, + "learning_rate": 4.804780669974262e-05, + "loss": 0.0424, + "step": 34060 + }, + { + "epoch": 0.16535, + "grad_norm": 0.12034659087657928, + "learning_rate": 4.804620500445342e-05, + "loss": 0.0424, + "step": 34070 + }, + { + "epoch": 0.1654, + "grad_norm": 0.11395153403282166, + "learning_rate": 4.8044602679089634e-05, + "loss": 0.041, + "step": 34080 + }, + { + "epoch": 0.16545, + "grad_norm": 0.12198033183813095, + "learning_rate": 4.804299972369507e-05, + "loss": 0.0428, + "step": 34090 + }, + { + "epoch": 0.1655, + "grad_norm": 0.11845625191926956, + "learning_rate": 4.804139613831354e-05, + "loss": 0.0435, + "step": 34100 + }, + { + "epoch": 0.16555, + "grad_norm": 0.1283753514289856, + "learning_rate": 4.803979192298891e-05, + "loss": 0.0432, + "step": 34110 + }, + { + "epoch": 0.1656, + "grad_norm": 0.1280210018157959, + "learning_rate": 4.803818707776502e-05, + "loss": 0.0448, + "step": 34120 + }, + { + "epoch": 0.16565, + "grad_norm": 0.165738046169281, + "learning_rate": 4.803658160268575e-05, + "loss": 0.0456, + "step": 34130 + }, + { + "epoch": 0.1657, + "grad_norm": 0.134948268532753, + "learning_rate": 4.803497549779499e-05, + "loss": 0.047, + "step": 34140 + }, + { + "epoch": 0.16575, + "grad_norm": 0.16082508862018585, + "learning_rate": 4.803336876313666e-05, + "loss": 0.0425, + "step": 34150 + }, + { + "epoch": 0.1658, + "grad_norm": 0.12395468354225159, + "learning_rate": 4.803176139875467e-05, + "loss": 0.0436, + "step": 34160 + }, + { + "epoch": 0.16585, + "grad_norm": 0.15877650678157806, + "learning_rate": 4.803015340469299e-05, + "loss": 0.0419, + "step": 34170 + }, + { + "epoch": 0.1659, + "grad_norm": 0.1441933810710907, + "learning_rate": 4.802854478099555e-05, + "loss": 0.0442, + "step": 34180 + }, + { + "epoch": 0.16595, + "grad_norm": 0.11970805376768112, + "learning_rate": 4.802693552770636e-05, + "loss": 0.0432, + "step": 34190 + }, + { + "epoch": 0.166, + "grad_norm": 0.154872864484787, + "learning_rate": 4.802532564486941e-05, + "loss": 0.0426, + "step": 34200 + }, + { + "epoch": 0.16605, + "grad_norm": 0.133418008685112, + "learning_rate": 4.80237151325287e-05, + "loss": 0.0424, + "step": 34210 + }, + { + "epoch": 0.1661, + "grad_norm": 0.12561088800430298, + "learning_rate": 4.802210399072826e-05, + "loss": 0.0421, + "step": 34220 + }, + { + "epoch": 0.16615, + "grad_norm": 0.1357862651348114, + "learning_rate": 4.8020492219512156e-05, + "loss": 0.0431, + "step": 34230 + }, + { + "epoch": 0.1662, + "grad_norm": 0.12643086910247803, + "learning_rate": 4.801887981892444e-05, + "loss": 0.0436, + "step": 34240 + }, + { + "epoch": 0.16625, + "grad_norm": 0.13576337695121765, + "learning_rate": 4.80172667890092e-05, + "loss": 0.0434, + "step": 34250 + }, + { + "epoch": 0.1663, + "grad_norm": 0.10537552833557129, + "learning_rate": 4.801565312981052e-05, + "loss": 0.0419, + "step": 34260 + }, + { + "epoch": 0.16635, + "grad_norm": 0.13234888017177582, + "learning_rate": 4.8014038841372547e-05, + "loss": 0.0417, + "step": 34270 + }, + { + "epoch": 0.1664, + "grad_norm": 0.1402786672115326, + "learning_rate": 4.8012423923739395e-05, + "loss": 0.0425, + "step": 34280 + }, + { + "epoch": 0.16645, + "grad_norm": 0.13282206654548645, + "learning_rate": 4.801080837695521e-05, + "loss": 0.0439, + "step": 34290 + }, + { + "epoch": 0.1665, + "grad_norm": 0.12814287841320038, + "learning_rate": 4.8009192201064174e-05, + "loss": 0.0434, + "step": 34300 + }, + { + "epoch": 0.16655, + "grad_norm": 0.10998068004846573, + "learning_rate": 4.800757539611047e-05, + "loss": 0.0425, + "step": 34310 + }, + { + "epoch": 0.1666, + "grad_norm": 0.132798969745636, + "learning_rate": 4.800595796213829e-05, + "loss": 0.0414, + "step": 34320 + }, + { + "epoch": 0.16665, + "grad_norm": 0.1641465574502945, + "learning_rate": 4.800433989919187e-05, + "loss": 0.0437, + "step": 34330 + }, + { + "epoch": 0.1667, + "grad_norm": 0.1456586867570877, + "learning_rate": 4.800272120731544e-05, + "loss": 0.0423, + "step": 34340 + }, + { + "epoch": 0.16675, + "grad_norm": 0.13222168385982513, + "learning_rate": 4.800110188655325e-05, + "loss": 0.0413, + "step": 34350 + }, + { + "epoch": 0.1668, + "grad_norm": 0.14294667541980743, + "learning_rate": 4.7999481936949575e-05, + "loss": 0.0419, + "step": 34360 + }, + { + "epoch": 0.16685, + "grad_norm": 0.1279636025428772, + "learning_rate": 4.7997861358548704e-05, + "loss": 0.0421, + "step": 34370 + }, + { + "epoch": 0.1669, + "grad_norm": 0.12441389262676239, + "learning_rate": 4.7996240151394945e-05, + "loss": 0.0418, + "step": 34380 + }, + { + "epoch": 0.16695, + "grad_norm": 0.11313042789697647, + "learning_rate": 4.799461831553262e-05, + "loss": 0.0401, + "step": 34390 + }, + { + "epoch": 0.167, + "grad_norm": 0.11300710588693619, + "learning_rate": 4.7992995851006066e-05, + "loss": 0.042, + "step": 34400 + }, + { + "epoch": 0.16705, + "grad_norm": 0.13788188993930817, + "learning_rate": 4.799137275785965e-05, + "loss": 0.0407, + "step": 34410 + }, + { + "epoch": 0.1671, + "grad_norm": 0.12407025694847107, + "learning_rate": 4.798974903613773e-05, + "loss": 0.0386, + "step": 34420 + }, + { + "epoch": 0.16715, + "grad_norm": 0.13035129010677338, + "learning_rate": 4.7988124685884713e-05, + "loss": 0.0419, + "step": 34430 + }, + { + "epoch": 0.1672, + "grad_norm": 0.13006633520126343, + "learning_rate": 4.798649970714501e-05, + "loss": 0.04, + "step": 34440 + }, + { + "epoch": 0.16725, + "grad_norm": 0.14761589467525482, + "learning_rate": 4.798487409996303e-05, + "loss": 0.0404, + "step": 34450 + }, + { + "epoch": 0.1673, + "grad_norm": 0.1414937973022461, + "learning_rate": 4.798324786438324e-05, + "loss": 0.0404, + "step": 34460 + }, + { + "epoch": 0.16735, + "grad_norm": 0.12699708342552185, + "learning_rate": 4.798162100045008e-05, + "loss": 0.0403, + "step": 34470 + }, + { + "epoch": 0.1674, + "grad_norm": 0.1410609930753708, + "learning_rate": 4.797999350820803e-05, + "loss": 0.0419, + "step": 34480 + }, + { + "epoch": 0.16745, + "grad_norm": 0.14010128378868103, + "learning_rate": 4.7978365387701595e-05, + "loss": 0.0416, + "step": 34490 + }, + { + "epoch": 0.1675, + "grad_norm": 0.11545085906982422, + "learning_rate": 4.797673663897529e-05, + "loss": 0.0431, + "step": 34500 + }, + { + "epoch": 0.16755, + "grad_norm": 0.12026017159223557, + "learning_rate": 4.7975107262073634e-05, + "loss": 0.0445, + "step": 34510 + }, + { + "epoch": 0.1676, + "grad_norm": 0.13077577948570251, + "learning_rate": 4.7973477257041175e-05, + "loss": 0.0426, + "step": 34520 + }, + { + "epoch": 0.16765, + "grad_norm": 0.1372055560350418, + "learning_rate": 4.7971846623922476e-05, + "loss": 0.0431, + "step": 34530 + }, + { + "epoch": 0.1677, + "grad_norm": 0.1372019201517105, + "learning_rate": 4.7970215362762125e-05, + "loss": 0.0439, + "step": 34540 + }, + { + "epoch": 0.16775, + "grad_norm": 0.13161469995975494, + "learning_rate": 4.796858347360471e-05, + "loss": 0.0417, + "step": 34550 + }, + { + "epoch": 0.1678, + "grad_norm": 0.14171482622623444, + "learning_rate": 4.796695095649485e-05, + "loss": 0.0419, + "step": 34560 + }, + { + "epoch": 0.16785, + "grad_norm": 0.10291971266269684, + "learning_rate": 4.796531781147719e-05, + "loss": 0.0435, + "step": 34570 + }, + { + "epoch": 0.1679, + "grad_norm": 0.11898507177829742, + "learning_rate": 4.7963684038596356e-05, + "loss": 0.0431, + "step": 34580 + }, + { + "epoch": 0.16795, + "grad_norm": 0.13572126626968384, + "learning_rate": 4.7962049637897036e-05, + "loss": 0.0417, + "step": 34590 + }, + { + "epoch": 0.168, + "grad_norm": 0.1346631646156311, + "learning_rate": 4.796041460942391e-05, + "loss": 0.045, + "step": 34600 + }, + { + "epoch": 0.16805, + "grad_norm": 0.11369791626930237, + "learning_rate": 4.795877895322166e-05, + "loss": 0.0408, + "step": 34610 + }, + { + "epoch": 0.1681, + "grad_norm": 0.20911628007888794, + "learning_rate": 4.7957142669335034e-05, + "loss": 0.0435, + "step": 34620 + }, + { + "epoch": 0.16815, + "grad_norm": 0.1218675747513771, + "learning_rate": 4.795550575780874e-05, + "loss": 0.0414, + "step": 34630 + }, + { + "epoch": 0.1682, + "grad_norm": 0.15563416481018066, + "learning_rate": 4.795386821868755e-05, + "loss": 0.0425, + "step": 34640 + }, + { + "epoch": 0.16825, + "grad_norm": 0.12554652988910675, + "learning_rate": 4.7952230052016226e-05, + "loss": 0.0412, + "step": 34650 + }, + { + "epoch": 0.1683, + "grad_norm": 0.1206367164850235, + "learning_rate": 4.795059125783955e-05, + "loss": 0.0444, + "step": 34660 + }, + { + "epoch": 0.16835, + "grad_norm": 0.1299329549074173, + "learning_rate": 4.794895183620233e-05, + "loss": 0.043, + "step": 34670 + }, + { + "epoch": 0.1684, + "grad_norm": 0.17082592844963074, + "learning_rate": 4.794731178714939e-05, + "loss": 0.046, + "step": 34680 + }, + { + "epoch": 0.16845, + "grad_norm": 0.13465429842472076, + "learning_rate": 4.794567111072557e-05, + "loss": 0.0422, + "step": 34690 + }, + { + "epoch": 0.1685, + "grad_norm": 0.12303932756185532, + "learning_rate": 4.794402980697572e-05, + "loss": 0.0416, + "step": 34700 + }, + { + "epoch": 0.16855, + "grad_norm": 0.10682618618011475, + "learning_rate": 4.794238787594472e-05, + "loss": 0.042, + "step": 34710 + }, + { + "epoch": 0.1686, + "grad_norm": 0.11872848123311996, + "learning_rate": 4.794074531767745e-05, + "loss": 0.0421, + "step": 34720 + }, + { + "epoch": 0.16865, + "grad_norm": 0.12520605325698853, + "learning_rate": 4.7939102132218816e-05, + "loss": 0.0433, + "step": 34730 + }, + { + "epoch": 0.1687, + "grad_norm": 0.13752523064613342, + "learning_rate": 4.793745831961375e-05, + "loss": 0.0421, + "step": 34740 + }, + { + "epoch": 0.16875, + "grad_norm": 0.1419668048620224, + "learning_rate": 4.7935813879907195e-05, + "loss": 0.0436, + "step": 34750 + }, + { + "epoch": 0.1688, + "grad_norm": 0.12771490216255188, + "learning_rate": 4.7934168813144106e-05, + "loss": 0.0418, + "step": 34760 + }, + { + "epoch": 0.16885, + "grad_norm": 0.11459469795227051, + "learning_rate": 4.793252311936945e-05, + "loss": 0.0411, + "step": 34770 + }, + { + "epoch": 0.1689, + "grad_norm": 0.13765087723731995, + "learning_rate": 4.793087679862824e-05, + "loss": 0.0416, + "step": 34780 + }, + { + "epoch": 0.16895, + "grad_norm": 0.11228399723768234, + "learning_rate": 4.7929229850965465e-05, + "loss": 0.0423, + "step": 34790 + }, + { + "epoch": 0.169, + "grad_norm": 0.13364118337631226, + "learning_rate": 4.7927582276426155e-05, + "loss": 0.0412, + "step": 34800 + }, + { + "epoch": 0.16905, + "grad_norm": 0.12470797449350357, + "learning_rate": 4.792593407505537e-05, + "loss": 0.0435, + "step": 34810 + }, + { + "epoch": 0.1691, + "grad_norm": 0.16428443789482117, + "learning_rate": 4.7924285246898154e-05, + "loss": 0.0428, + "step": 34820 + }, + { + "epoch": 0.16915, + "grad_norm": 0.13110315799713135, + "learning_rate": 4.7922635791999594e-05, + "loss": 0.0429, + "step": 34830 + }, + { + "epoch": 0.1692, + "grad_norm": 0.13065670430660248, + "learning_rate": 4.7920985710404774e-05, + "loss": 0.042, + "step": 34840 + }, + { + "epoch": 0.16925, + "grad_norm": 0.15915274620056152, + "learning_rate": 4.7919335002158826e-05, + "loss": 0.0442, + "step": 34850 + }, + { + "epoch": 0.1693, + "grad_norm": 0.11836789548397064, + "learning_rate": 4.7917683667306864e-05, + "loss": 0.043, + "step": 34860 + }, + { + "epoch": 0.16935, + "grad_norm": 0.125947505235672, + "learning_rate": 4.791603170589405e-05, + "loss": 0.0423, + "step": 34870 + }, + { + "epoch": 0.1694, + "grad_norm": 0.14417004585266113, + "learning_rate": 4.791437911796553e-05, + "loss": 0.0441, + "step": 34880 + }, + { + "epoch": 0.16945, + "grad_norm": 0.1497897356748581, + "learning_rate": 4.791272590356649e-05, + "loss": 0.0429, + "step": 34890 + }, + { + "epoch": 0.1695, + "grad_norm": 0.1332363337278366, + "learning_rate": 4.791107206274214e-05, + "loss": 0.0447, + "step": 34900 + }, + { + "epoch": 0.16955, + "grad_norm": 0.1140265166759491, + "learning_rate": 4.790941759553769e-05, + "loss": 0.0434, + "step": 34910 + }, + { + "epoch": 0.1696, + "grad_norm": 0.1542254090309143, + "learning_rate": 4.790776250199836e-05, + "loss": 0.0438, + "step": 34920 + }, + { + "epoch": 0.16965, + "grad_norm": 0.13593624532222748, + "learning_rate": 4.790610678216941e-05, + "loss": 0.0459, + "step": 34930 + }, + { + "epoch": 0.1697, + "grad_norm": 0.11747143417596817, + "learning_rate": 4.7904450436096104e-05, + "loss": 0.0413, + "step": 34940 + }, + { + "epoch": 0.16975, + "grad_norm": 0.10711923241615295, + "learning_rate": 4.7902793463823735e-05, + "loss": 0.0413, + "step": 34950 + }, + { + "epoch": 0.1698, + "grad_norm": 0.11600740998983383, + "learning_rate": 4.7901135865397594e-05, + "loss": 0.0445, + "step": 34960 + }, + { + "epoch": 0.16985, + "grad_norm": 0.12567615509033203, + "learning_rate": 4.7899477640863e-05, + "loss": 0.042, + "step": 34970 + }, + { + "epoch": 0.1699, + "grad_norm": 0.12033237516880035, + "learning_rate": 4.789781879026529e-05, + "loss": 0.0408, + "step": 34980 + }, + { + "epoch": 0.16995, + "grad_norm": 0.1295735239982605, + "learning_rate": 4.789615931364983e-05, + "loss": 0.0429, + "step": 34990 + }, + { + "epoch": 0.17, + "grad_norm": 0.13830392062664032, + "learning_rate": 4.789449921106196e-05, + "loss": 0.0452, + "step": 35000 + }, + { + "epoch": 0.17005, + "grad_norm": 0.14405027031898499, + "learning_rate": 4.789283848254709e-05, + "loss": 0.0408, + "step": 35010 + }, + { + "epoch": 0.1701, + "grad_norm": 0.10081218183040619, + "learning_rate": 4.789117712815061e-05, + "loss": 0.04, + "step": 35020 + }, + { + "epoch": 0.17015, + "grad_norm": 0.09701535850763321, + "learning_rate": 4.788951514791795e-05, + "loss": 0.0409, + "step": 35030 + }, + { + "epoch": 0.1702, + "grad_norm": 0.11014352738857269, + "learning_rate": 4.7887852541894554e-05, + "loss": 0.0415, + "step": 35040 + }, + { + "epoch": 0.17025, + "grad_norm": 0.11594094336032867, + "learning_rate": 4.788618931012585e-05, + "loss": 0.0409, + "step": 35050 + }, + { + "epoch": 0.1703, + "grad_norm": 0.09745623916387558, + "learning_rate": 4.788452545265734e-05, + "loss": 0.04, + "step": 35060 + }, + { + "epoch": 0.17035, + "grad_norm": 0.1125708594918251, + "learning_rate": 4.7882860969534504e-05, + "loss": 0.0401, + "step": 35070 + }, + { + "epoch": 0.1704, + "grad_norm": 0.10175339877605438, + "learning_rate": 4.788119586080284e-05, + "loss": 0.0403, + "step": 35080 + }, + { + "epoch": 0.17045, + "grad_norm": 0.15936651825904846, + "learning_rate": 4.787953012650788e-05, + "loss": 0.0435, + "step": 35090 + }, + { + "epoch": 0.1705, + "grad_norm": 0.1685941219329834, + "learning_rate": 4.787786376669516e-05, + "loss": 0.0415, + "step": 35100 + }, + { + "epoch": 0.17055, + "grad_norm": 0.14762447774410248, + "learning_rate": 4.7876196781410245e-05, + "loss": 0.0436, + "step": 35110 + }, + { + "epoch": 0.1706, + "grad_norm": 0.13409827649593353, + "learning_rate": 4.78745291706987e-05, + "loss": 0.0439, + "step": 35120 + }, + { + "epoch": 0.17065, + "grad_norm": 0.1589771956205368, + "learning_rate": 4.787286093460611e-05, + "loss": 0.0427, + "step": 35130 + }, + { + "epoch": 0.1707, + "grad_norm": 0.12625166773796082, + "learning_rate": 4.7871192073178105e-05, + "loss": 0.0433, + "step": 35140 + }, + { + "epoch": 0.17075, + "grad_norm": 0.1295214742422104, + "learning_rate": 4.78695225864603e-05, + "loss": 0.044, + "step": 35150 + }, + { + "epoch": 0.1708, + "grad_norm": 0.11780333518981934, + "learning_rate": 4.786785247449834e-05, + "loss": 0.043, + "step": 35160 + }, + { + "epoch": 0.17085, + "grad_norm": 0.10930639505386353, + "learning_rate": 4.786618173733789e-05, + "loss": 0.0426, + "step": 35170 + }, + { + "epoch": 0.1709, + "grad_norm": 0.10027613490819931, + "learning_rate": 4.786451037502462e-05, + "loss": 0.0421, + "step": 35180 + }, + { + "epoch": 0.17095, + "grad_norm": 0.13434745371341705, + "learning_rate": 4.786283838760422e-05, + "loss": 0.0443, + "step": 35190 + }, + { + "epoch": 0.171, + "grad_norm": 0.13159210979938507, + "learning_rate": 4.786116577512241e-05, + "loss": 0.0438, + "step": 35200 + }, + { + "epoch": 0.17105, + "grad_norm": 0.1493593007326126, + "learning_rate": 4.785949253762492e-05, + "loss": 0.0427, + "step": 35210 + }, + { + "epoch": 0.1711, + "grad_norm": 0.11078088730573654, + "learning_rate": 4.7857818675157484e-05, + "loss": 0.041, + "step": 35220 + }, + { + "epoch": 0.17115, + "grad_norm": 0.10402943938970566, + "learning_rate": 4.7856144187765884e-05, + "loss": 0.0434, + "step": 35230 + }, + { + "epoch": 0.1712, + "grad_norm": 0.11463809758424759, + "learning_rate": 4.785446907549588e-05, + "loss": 0.0412, + "step": 35240 + }, + { + "epoch": 0.17125, + "grad_norm": 0.12795351445674896, + "learning_rate": 4.785279333839328e-05, + "loss": 0.041, + "step": 35250 + }, + { + "epoch": 0.1713, + "grad_norm": 0.09924820065498352, + "learning_rate": 4.7851116976503895e-05, + "loss": 0.0412, + "step": 35260 + }, + { + "epoch": 0.17135, + "grad_norm": 0.10971652716398239, + "learning_rate": 4.784943998987356e-05, + "loss": 0.0443, + "step": 35270 + }, + { + "epoch": 0.1714, + "grad_norm": 0.09839256852865219, + "learning_rate": 4.784776237854811e-05, + "loss": 0.041, + "step": 35280 + }, + { + "epoch": 0.17145, + "grad_norm": 0.13040214776992798, + "learning_rate": 4.7846084142573425e-05, + "loss": 0.0418, + "step": 35290 + }, + { + "epoch": 0.1715, + "grad_norm": 0.12357919663190842, + "learning_rate": 4.784440528199539e-05, + "loss": 0.0414, + "step": 35300 + }, + { + "epoch": 0.17155, + "grad_norm": 0.13220131397247314, + "learning_rate": 4.784272579685989e-05, + "loss": 0.042, + "step": 35310 + }, + { + "epoch": 0.1716, + "grad_norm": 0.1541513353586197, + "learning_rate": 4.784104568721285e-05, + "loss": 0.0434, + "step": 35320 + }, + { + "epoch": 0.17165, + "grad_norm": 0.1281459629535675, + "learning_rate": 4.78393649531002e-05, + "loss": 0.0413, + "step": 35330 + }, + { + "epoch": 0.1717, + "grad_norm": 0.11172624677419662, + "learning_rate": 4.783768359456789e-05, + "loss": 0.0418, + "step": 35340 + }, + { + "epoch": 0.17175, + "grad_norm": 0.10488735139369965, + "learning_rate": 4.7836001611661895e-05, + "loss": 0.0416, + "step": 35350 + }, + { + "epoch": 0.1718, + "grad_norm": 0.14712375402450562, + "learning_rate": 4.783431900442819e-05, + "loss": 0.0412, + "step": 35360 + }, + { + "epoch": 0.17185, + "grad_norm": 0.12036072462797165, + "learning_rate": 4.783263577291279e-05, + "loss": 0.041, + "step": 35370 + }, + { + "epoch": 0.1719, + "grad_norm": 0.11746695637702942, + "learning_rate": 4.78309519171617e-05, + "loss": 0.0438, + "step": 35380 + }, + { + "epoch": 0.17195, + "grad_norm": 0.1313781887292862, + "learning_rate": 4.782926743722096e-05, + "loss": 0.0436, + "step": 35390 + }, + { + "epoch": 0.172, + "grad_norm": 0.12686990201473236, + "learning_rate": 4.7827582333136635e-05, + "loss": 0.0424, + "step": 35400 + }, + { + "epoch": 0.17205, + "grad_norm": 0.11555877327919006, + "learning_rate": 4.7825896604954774e-05, + "loss": 0.0419, + "step": 35410 + }, + { + "epoch": 0.1721, + "grad_norm": 0.16383430361747742, + "learning_rate": 4.782421025272148e-05, + "loss": 0.0401, + "step": 35420 + }, + { + "epoch": 0.17215, + "grad_norm": 0.14573214948177338, + "learning_rate": 4.7822523276482844e-05, + "loss": 0.0419, + "step": 35430 + }, + { + "epoch": 0.1722, + "grad_norm": 0.1528214067220688, + "learning_rate": 4.782083567628501e-05, + "loss": 0.0407, + "step": 35440 + }, + { + "epoch": 0.17225, + "grad_norm": 0.12795792520046234, + "learning_rate": 4.78191474521741e-05, + "loss": 0.0424, + "step": 35450 + }, + { + "epoch": 0.1723, + "grad_norm": 0.12186912447214127, + "learning_rate": 4.7817458604196264e-05, + "loss": 0.0401, + "step": 35460 + }, + { + "epoch": 0.17235, + "grad_norm": 0.12507309019565582, + "learning_rate": 4.781576913239768e-05, + "loss": 0.0425, + "step": 35470 + }, + { + "epoch": 0.1724, + "grad_norm": 0.11313261091709137, + "learning_rate": 4.781407903682454e-05, + "loss": 0.0417, + "step": 35480 + }, + { + "epoch": 0.17245, + "grad_norm": 0.1325627565383911, + "learning_rate": 4.781238831752306e-05, + "loss": 0.0437, + "step": 35490 + }, + { + "epoch": 0.1725, + "grad_norm": 0.14015568792819977, + "learning_rate": 4.781069697453944e-05, + "loss": 0.0456, + "step": 35500 + }, + { + "epoch": 0.17255, + "grad_norm": 0.1312084048986435, + "learning_rate": 4.780900500791995e-05, + "loss": 0.0416, + "step": 35510 + }, + { + "epoch": 0.1726, + "grad_norm": 0.12628820538520813, + "learning_rate": 4.780731241771082e-05, + "loss": 0.0422, + "step": 35520 + }, + { + "epoch": 0.17265, + "grad_norm": 0.10066360980272293, + "learning_rate": 4.780561920395834e-05, + "loss": 0.0419, + "step": 35530 + }, + { + "epoch": 0.1727, + "grad_norm": 0.129759281873703, + "learning_rate": 4.78039253667088e-05, + "loss": 0.0422, + "step": 35540 + }, + { + "epoch": 0.17275, + "grad_norm": 0.11697951704263687, + "learning_rate": 4.780223090600851e-05, + "loss": 0.0419, + "step": 35550 + }, + { + "epoch": 0.1728, + "grad_norm": 0.101000115275383, + "learning_rate": 4.7800535821903784e-05, + "loss": 0.0398, + "step": 35560 + }, + { + "epoch": 0.17285, + "grad_norm": 0.14083412289619446, + "learning_rate": 4.779884011444098e-05, + "loss": 0.0416, + "step": 35570 + }, + { + "epoch": 0.1729, + "grad_norm": 0.12678256630897522, + "learning_rate": 4.7797143783666454e-05, + "loss": 0.0418, + "step": 35580 + }, + { + "epoch": 0.17295, + "grad_norm": 0.11358196288347244, + "learning_rate": 4.7795446829626575e-05, + "loss": 0.0432, + "step": 35590 + }, + { + "epoch": 0.173, + "grad_norm": 0.1325872242450714, + "learning_rate": 4.779374925236775e-05, + "loss": 0.0436, + "step": 35600 + }, + { + "epoch": 0.17305, + "grad_norm": 0.11626822501420975, + "learning_rate": 4.7792051051936386e-05, + "loss": 0.0432, + "step": 35610 + }, + { + "epoch": 0.1731, + "grad_norm": 0.13823387026786804, + "learning_rate": 4.779035222837891e-05, + "loss": 0.043, + "step": 35620 + }, + { + "epoch": 0.17315, + "grad_norm": 0.13054156303405762, + "learning_rate": 4.778865278174176e-05, + "loss": 0.042, + "step": 35630 + }, + { + "epoch": 0.1732, + "grad_norm": 0.19842328131198883, + "learning_rate": 4.7786952712071404e-05, + "loss": 0.045, + "step": 35640 + }, + { + "epoch": 0.17325, + "grad_norm": 0.1172667071223259, + "learning_rate": 4.778525201941432e-05, + "loss": 0.0428, + "step": 35650 + }, + { + "epoch": 0.1733, + "grad_norm": 0.11237437278032303, + "learning_rate": 4.778355070381701e-05, + "loss": 0.0424, + "step": 35660 + }, + { + "epoch": 0.17335, + "grad_norm": 0.1331014186143875, + "learning_rate": 4.778184876532598e-05, + "loss": 0.0406, + "step": 35670 + }, + { + "epoch": 0.1734, + "grad_norm": 0.12321636825799942, + "learning_rate": 4.7780146203987766e-05, + "loss": 0.0407, + "step": 35680 + }, + { + "epoch": 0.17345, + "grad_norm": 0.1291119009256363, + "learning_rate": 4.777844301984892e-05, + "loss": 0.0433, + "step": 35690 + }, + { + "epoch": 0.1735, + "grad_norm": 0.12867674231529236, + "learning_rate": 4.777673921295599e-05, + "loss": 0.0403, + "step": 35700 + }, + { + "epoch": 0.17355, + "grad_norm": 0.10945740342140198, + "learning_rate": 4.777503478335557e-05, + "loss": 0.0419, + "step": 35710 + }, + { + "epoch": 0.1736, + "grad_norm": 0.14428310096263885, + "learning_rate": 4.777332973109425e-05, + "loss": 0.0463, + "step": 35720 + }, + { + "epoch": 0.17365, + "grad_norm": 0.09689648449420929, + "learning_rate": 4.7771624056218655e-05, + "loss": 0.0412, + "step": 35730 + }, + { + "epoch": 0.1737, + "grad_norm": 0.10113327950239182, + "learning_rate": 4.7769917758775405e-05, + "loss": 0.0405, + "step": 35740 + }, + { + "epoch": 0.17375, + "grad_norm": 0.1302248239517212, + "learning_rate": 4.7768210838811166e-05, + "loss": 0.0423, + "step": 35750 + }, + { + "epoch": 0.1738, + "grad_norm": 0.14287564158439636, + "learning_rate": 4.77665032963726e-05, + "loss": 0.0421, + "step": 35760 + }, + { + "epoch": 0.17385, + "grad_norm": 0.11934467405080795, + "learning_rate": 4.776479513150638e-05, + "loss": 0.042, + "step": 35770 + }, + { + "epoch": 0.1739, + "grad_norm": 0.13890060782432556, + "learning_rate": 4.776308634425922e-05, + "loss": 0.0445, + "step": 35780 + }, + { + "epoch": 0.17395, + "grad_norm": 0.10712046176195145, + "learning_rate": 4.7761376934677826e-05, + "loss": 0.0403, + "step": 35790 + }, + { + "epoch": 0.174, + "grad_norm": 0.11675825715065002, + "learning_rate": 4.775966690280894e-05, + "loss": 0.0408, + "step": 35800 + }, + { + "epoch": 0.17405, + "grad_norm": 0.12506303191184998, + "learning_rate": 4.775795624869931e-05, + "loss": 0.0418, + "step": 35810 + }, + { + "epoch": 0.1741, + "grad_norm": 0.11785845458507538, + "learning_rate": 4.77562449723957e-05, + "loss": 0.0444, + "step": 35820 + }, + { + "epoch": 0.17415, + "grad_norm": 0.1276514232158661, + "learning_rate": 4.775453307394491e-05, + "loss": 0.0427, + "step": 35830 + }, + { + "epoch": 0.1742, + "grad_norm": 0.12888126075267792, + "learning_rate": 4.775282055339373e-05, + "loss": 0.0409, + "step": 35840 + }, + { + "epoch": 0.17425, + "grad_norm": 0.12849290668964386, + "learning_rate": 4.775110741078899e-05, + "loss": 0.0415, + "step": 35850 + }, + { + "epoch": 0.1743, + "grad_norm": 0.1092122346162796, + "learning_rate": 4.774939364617751e-05, + "loss": 0.0416, + "step": 35860 + }, + { + "epoch": 0.17435, + "grad_norm": 0.10579238086938858, + "learning_rate": 4.774767925960616e-05, + "loss": 0.0404, + "step": 35870 + }, + { + "epoch": 0.1744, + "grad_norm": 0.10392007976770401, + "learning_rate": 4.774596425112181e-05, + "loss": 0.0424, + "step": 35880 + }, + { + "epoch": 0.17445, + "grad_norm": 0.13740697503089905, + "learning_rate": 4.774424862077134e-05, + "loss": 0.0418, + "step": 35890 + }, + { + "epoch": 0.1745, + "grad_norm": 0.21001854538917542, + "learning_rate": 4.774253236860165e-05, + "loss": 0.041, + "step": 35900 + }, + { + "epoch": 0.17455, + "grad_norm": 0.15767604112625122, + "learning_rate": 4.7740815494659675e-05, + "loss": 0.0419, + "step": 35910 + }, + { + "epoch": 0.1746, + "grad_norm": 0.15760114789009094, + "learning_rate": 4.773909799899234e-05, + "loss": 0.0407, + "step": 35920 + }, + { + "epoch": 0.17465, + "grad_norm": 0.12130020558834076, + "learning_rate": 4.773737988164663e-05, + "loss": 0.0402, + "step": 35930 + }, + { + "epoch": 0.1747, + "grad_norm": 0.11412428319454193, + "learning_rate": 4.773566114266948e-05, + "loss": 0.0403, + "step": 35940 + }, + { + "epoch": 0.17475, + "grad_norm": 0.10798191279172897, + "learning_rate": 4.773394178210789e-05, + "loss": 0.0413, + "step": 35950 + }, + { + "epoch": 0.1748, + "grad_norm": 0.11256234347820282, + "learning_rate": 4.773222180000888e-05, + "loss": 0.0414, + "step": 35960 + }, + { + "epoch": 0.17485, + "grad_norm": 0.10670135915279388, + "learning_rate": 4.773050119641946e-05, + "loss": 0.0421, + "step": 35970 + }, + { + "epoch": 0.1749, + "grad_norm": 0.1281348466873169, + "learning_rate": 4.7728779971386686e-05, + "loss": 0.0406, + "step": 35980 + }, + { + "epoch": 0.17495, + "grad_norm": 0.14552175998687744, + "learning_rate": 4.7727058124957605e-05, + "loss": 0.0429, + "step": 35990 + }, + { + "epoch": 0.175, + "grad_norm": 0.1336289942264557, + "learning_rate": 4.772533565717929e-05, + "loss": 0.0414, + "step": 36000 + }, + { + "epoch": 0.17505, + "grad_norm": 0.1263270229101181, + "learning_rate": 4.772361256809884e-05, + "loss": 0.0436, + "step": 36010 + }, + { + "epoch": 0.1751, + "grad_norm": 0.11949208378791809, + "learning_rate": 4.772188885776335e-05, + "loss": 0.0445, + "step": 36020 + }, + { + "epoch": 0.17515, + "grad_norm": 0.12418423593044281, + "learning_rate": 4.772016452621997e-05, + "loss": 0.0418, + "step": 36030 + }, + { + "epoch": 0.1752, + "grad_norm": 0.11824215948581696, + "learning_rate": 4.771843957351581e-05, + "loss": 0.0415, + "step": 36040 + }, + { + "epoch": 0.17525, + "grad_norm": 0.13300898671150208, + "learning_rate": 4.771671399969806e-05, + "loss": 0.0433, + "step": 36050 + }, + { + "epoch": 0.1753, + "grad_norm": 0.1046031191945076, + "learning_rate": 4.7714987804813875e-05, + "loss": 0.0433, + "step": 36060 + }, + { + "epoch": 0.17535, + "grad_norm": 0.12247949093580246, + "learning_rate": 4.771326098891046e-05, + "loss": 0.0425, + "step": 36070 + }, + { + "epoch": 0.1754, + "grad_norm": 0.1275743842124939, + "learning_rate": 4.771153355203502e-05, + "loss": 0.0449, + "step": 36080 + }, + { + "epoch": 0.17545, + "grad_norm": 0.11374276876449585, + "learning_rate": 4.7709805494234796e-05, + "loss": 0.041, + "step": 36090 + }, + { + "epoch": 0.1755, + "grad_norm": 0.14593404531478882, + "learning_rate": 4.770807681555701e-05, + "loss": 0.0426, + "step": 36100 + }, + { + "epoch": 0.17555, + "grad_norm": 0.14208447933197021, + "learning_rate": 4.770634751604894e-05, + "loss": 0.0407, + "step": 36110 + }, + { + "epoch": 0.1756, + "grad_norm": 0.131479874253273, + "learning_rate": 4.7704617595757856e-05, + "loss": 0.0415, + "step": 36120 + }, + { + "epoch": 0.17565, + "grad_norm": 0.11647072434425354, + "learning_rate": 4.770288705473106e-05, + "loss": 0.041, + "step": 36130 + }, + { + "epoch": 0.1757, + "grad_norm": 0.11759953945875168, + "learning_rate": 4.770115589301586e-05, + "loss": 0.0411, + "step": 36140 + }, + { + "epoch": 0.17575, + "grad_norm": 0.10574915260076523, + "learning_rate": 4.769942411065959e-05, + "loss": 0.0422, + "step": 36150 + }, + { + "epoch": 0.1758, + "grad_norm": 0.10825838148593903, + "learning_rate": 4.769769170770958e-05, + "loss": 0.0419, + "step": 36160 + }, + { + "epoch": 0.17585, + "grad_norm": 0.09498035907745361, + "learning_rate": 4.7695958684213215e-05, + "loss": 0.0407, + "step": 36170 + }, + { + "epoch": 0.1759, + "grad_norm": 0.1188809871673584, + "learning_rate": 4.7694225040217866e-05, + "loss": 0.0424, + "step": 36180 + }, + { + "epoch": 0.17595, + "grad_norm": 0.11033222079277039, + "learning_rate": 4.7692490775770924e-05, + "loss": 0.0411, + "step": 36190 + }, + { + "epoch": 0.176, + "grad_norm": 0.1235862448811531, + "learning_rate": 4.769075589091982e-05, + "loss": 0.0421, + "step": 36200 + }, + { + "epoch": 0.17605, + "grad_norm": 0.1003691628575325, + "learning_rate": 4.768902038571197e-05, + "loss": 0.0411, + "step": 36210 + }, + { + "epoch": 0.1761, + "grad_norm": 0.1102064847946167, + "learning_rate": 4.768728426019482e-05, + "loss": 0.0426, + "step": 36220 + }, + { + "epoch": 0.17615, + "grad_norm": 0.11367040872573853, + "learning_rate": 4.768554751441585e-05, + "loss": 0.0404, + "step": 36230 + }, + { + "epoch": 0.1762, + "grad_norm": 0.12277144193649292, + "learning_rate": 4.7683810148422534e-05, + "loss": 0.0415, + "step": 36240 + }, + { + "epoch": 0.17625, + "grad_norm": 0.11358033120632172, + "learning_rate": 4.768207216226237e-05, + "loss": 0.0414, + "step": 36250 + }, + { + "epoch": 0.1763, + "grad_norm": 0.10719634592533112, + "learning_rate": 4.768033355598287e-05, + "loss": 0.0439, + "step": 36260 + }, + { + "epoch": 0.17635, + "grad_norm": 0.1190221831202507, + "learning_rate": 4.767859432963157e-05, + "loss": 0.0414, + "step": 36270 + }, + { + "epoch": 0.1764, + "grad_norm": 0.11484932154417038, + "learning_rate": 4.7676854483256025e-05, + "loss": 0.0411, + "step": 36280 + }, + { + "epoch": 0.17645, + "grad_norm": 0.13288499414920807, + "learning_rate": 4.7675114016903795e-05, + "loss": 0.0416, + "step": 36290 + }, + { + "epoch": 0.1765, + "grad_norm": 0.12918755412101746, + "learning_rate": 4.767337293062247e-05, + "loss": 0.0422, + "step": 36300 + }, + { + "epoch": 0.17655, + "grad_norm": 0.12075911462306976, + "learning_rate": 4.767163122445964e-05, + "loss": 0.0426, + "step": 36310 + }, + { + "epoch": 0.1766, + "grad_norm": 0.12961497902870178, + "learning_rate": 4.766988889846294e-05, + "loss": 0.0429, + "step": 36320 + }, + { + "epoch": 0.17665, + "grad_norm": 0.1006336435675621, + "learning_rate": 4.766814595267999e-05, + "loss": 0.0407, + "step": 36330 + }, + { + "epoch": 0.1767, + "grad_norm": 0.12820640206336975, + "learning_rate": 4.766640238715844e-05, + "loss": 0.0408, + "step": 36340 + }, + { + "epoch": 0.17675, + "grad_norm": 0.12266498059034348, + "learning_rate": 4.7664658201945966e-05, + "loss": 0.042, + "step": 36350 + }, + { + "epoch": 0.1768, + "grad_norm": 0.1417977511882782, + "learning_rate": 4.766291339709026e-05, + "loss": 0.0421, + "step": 36360 + }, + { + "epoch": 0.17685, + "grad_norm": 0.14648132026195526, + "learning_rate": 4.766116797263901e-05, + "loss": 0.0439, + "step": 36370 + }, + { + "epoch": 0.1769, + "grad_norm": 0.13662028312683105, + "learning_rate": 4.7659421928639934e-05, + "loss": 0.0426, + "step": 36380 + }, + { + "epoch": 0.17695, + "grad_norm": 0.12682056427001953, + "learning_rate": 4.765767526514079e-05, + "loss": 0.0422, + "step": 36390 + }, + { + "epoch": 0.177, + "grad_norm": 0.11788281798362732, + "learning_rate": 4.7655927982189305e-05, + "loss": 0.0437, + "step": 36400 + }, + { + "epoch": 0.17705, + "grad_norm": 0.09522189944982529, + "learning_rate": 4.765418007983327e-05, + "loss": 0.0411, + "step": 36410 + }, + { + "epoch": 0.1771, + "grad_norm": 0.11930128931999207, + "learning_rate": 4.765243155812045e-05, + "loss": 0.0417, + "step": 36420 + }, + { + "epoch": 0.17715, + "grad_norm": 0.11503031104803085, + "learning_rate": 4.7650682417098666e-05, + "loss": 0.0429, + "step": 36430 + }, + { + "epoch": 0.1772, + "grad_norm": 0.1319715827703476, + "learning_rate": 4.764893265681574e-05, + "loss": 0.0417, + "step": 36440 + }, + { + "epoch": 0.17725, + "grad_norm": 0.13776862621307373, + "learning_rate": 4.76471822773195e-05, + "loss": 0.0428, + "step": 36450 + }, + { + "epoch": 0.1773, + "grad_norm": 0.13260617852210999, + "learning_rate": 4.76454312786578e-05, + "loss": 0.0427, + "step": 36460 + }, + { + "epoch": 0.17735, + "grad_norm": 0.11885460466146469, + "learning_rate": 4.764367966087852e-05, + "loss": 0.042, + "step": 36470 + }, + { + "epoch": 0.1774, + "grad_norm": 0.1272895336151123, + "learning_rate": 4.764192742402955e-05, + "loss": 0.043, + "step": 36480 + }, + { + "epoch": 0.17745, + "grad_norm": 0.11527583003044128, + "learning_rate": 4.764017456815878e-05, + "loss": 0.0429, + "step": 36490 + }, + { + "epoch": 0.1775, + "grad_norm": 0.14333097636699677, + "learning_rate": 4.7638421093314156e-05, + "loss": 0.0426, + "step": 36500 + }, + { + "epoch": 0.17755, + "grad_norm": 0.11847139149904251, + "learning_rate": 4.763666699954359e-05, + "loss": 0.0417, + "step": 36510 + }, + { + "epoch": 0.1776, + "grad_norm": 0.1231340542435646, + "learning_rate": 4.763491228689506e-05, + "loss": 0.0414, + "step": 36520 + }, + { + "epoch": 0.17765, + "grad_norm": 0.09365687519311905, + "learning_rate": 4.7633156955416535e-05, + "loss": 0.0397, + "step": 36530 + }, + { + "epoch": 0.1777, + "grad_norm": 0.13405677676200867, + "learning_rate": 4.763140100515599e-05, + "loss": 0.0412, + "step": 36540 + }, + { + "epoch": 0.17775, + "grad_norm": 0.12576311826705933, + "learning_rate": 4.762964443616146e-05, + "loss": 0.0429, + "step": 36550 + }, + { + "epoch": 0.1778, + "grad_norm": 0.11587604135274887, + "learning_rate": 4.762788724848094e-05, + "loss": 0.0414, + "step": 36560 + }, + { + "epoch": 0.17785, + "grad_norm": 0.11648576706647873, + "learning_rate": 4.7626129442162495e-05, + "loss": 0.0401, + "step": 36570 + }, + { + "epoch": 0.1779, + "grad_norm": 0.12591294944286346, + "learning_rate": 4.762437101725416e-05, + "loss": 0.0421, + "step": 36580 + }, + { + "epoch": 0.17795, + "grad_norm": 0.21443773806095123, + "learning_rate": 4.762261197380402e-05, + "loss": 0.0501, + "step": 36590 + }, + { + "epoch": 0.178, + "grad_norm": 0.1882997751235962, + "learning_rate": 4.7620852311860176e-05, + "loss": 0.0456, + "step": 36600 + }, + { + "epoch": 0.17805, + "grad_norm": 0.12708589434623718, + "learning_rate": 4.761909203147073e-05, + "loss": 0.0427, + "step": 36610 + }, + { + "epoch": 0.1781, + "grad_norm": 0.12208003550767899, + "learning_rate": 4.7617331132683795e-05, + "loss": 0.0424, + "step": 36620 + }, + { + "epoch": 0.17815, + "grad_norm": 0.1320149004459381, + "learning_rate": 4.761556961554753e-05, + "loss": 0.0435, + "step": 36630 + }, + { + "epoch": 0.1782, + "grad_norm": 0.13649402558803558, + "learning_rate": 4.7613807480110086e-05, + "loss": 0.0448, + "step": 36640 + }, + { + "epoch": 0.17825, + "grad_norm": 0.134579598903656, + "learning_rate": 4.761204472641965e-05, + "loss": 0.0432, + "step": 36650 + }, + { + "epoch": 0.1783, + "grad_norm": 0.17320720851421356, + "learning_rate": 4.761028135452439e-05, + "loss": 0.0449, + "step": 36660 + }, + { + "epoch": 0.17835, + "grad_norm": 0.11641421914100647, + "learning_rate": 4.760851736447254e-05, + "loss": 0.0445, + "step": 36670 + }, + { + "epoch": 0.1784, + "grad_norm": 0.12470538914203644, + "learning_rate": 4.7606752756312325e-05, + "loss": 0.0414, + "step": 36680 + }, + { + "epoch": 0.17845, + "grad_norm": 0.13089129328727722, + "learning_rate": 4.760498753009197e-05, + "loss": 0.0444, + "step": 36690 + }, + { + "epoch": 0.1785, + "grad_norm": 0.11784205585718155, + "learning_rate": 4.760322168585976e-05, + "loss": 0.0417, + "step": 36700 + }, + { + "epoch": 0.17855, + "grad_norm": 0.09654921293258667, + "learning_rate": 4.760145522366395e-05, + "loss": 0.0411, + "step": 36710 + }, + { + "epoch": 0.1786, + "grad_norm": 0.11832010000944138, + "learning_rate": 4.759968814355286e-05, + "loss": 0.0419, + "step": 36720 + }, + { + "epoch": 0.17865, + "grad_norm": 0.09841049462556839, + "learning_rate": 4.759792044557477e-05, + "loss": 0.0403, + "step": 36730 + }, + { + "epoch": 0.1787, + "grad_norm": 0.12049797922372818, + "learning_rate": 4.759615212977803e-05, + "loss": 0.0417, + "step": 36740 + }, + { + "epoch": 0.17875, + "grad_norm": 0.11145062744617462, + "learning_rate": 4.759438319621099e-05, + "loss": 0.0434, + "step": 36750 + }, + { + "epoch": 0.1788, + "grad_norm": 0.11987379938364029, + "learning_rate": 4.7592613644921993e-05, + "loss": 0.0409, + "step": 36760 + }, + { + "epoch": 0.17885, + "grad_norm": 0.11805453151464462, + "learning_rate": 4.7590843475959424e-05, + "loss": 0.0407, + "step": 36770 + }, + { + "epoch": 0.1789, + "grad_norm": 0.1306656002998352, + "learning_rate": 4.758907268937168e-05, + "loss": 0.0413, + "step": 36780 + }, + { + "epoch": 0.17895, + "grad_norm": 0.11330852657556534, + "learning_rate": 4.758730128520718e-05, + "loss": 0.042, + "step": 36790 + }, + { + "epoch": 0.179, + "grad_norm": 0.13489657640457153, + "learning_rate": 4.758552926351435e-05, + "loss": 0.0414, + "step": 36800 + }, + { + "epoch": 0.17905, + "grad_norm": 0.10893921554088593, + "learning_rate": 4.758375662434163e-05, + "loss": 0.0428, + "step": 36810 + }, + { + "epoch": 0.1791, + "grad_norm": 0.11990348249673843, + "learning_rate": 4.758198336773749e-05, + "loss": 0.041, + "step": 36820 + }, + { + "epoch": 0.17915, + "grad_norm": 0.09690472483634949, + "learning_rate": 4.758020949375041e-05, + "loss": 0.0405, + "step": 36830 + }, + { + "epoch": 0.1792, + "grad_norm": 0.12950287759304047, + "learning_rate": 4.757843500242889e-05, + "loss": 0.0406, + "step": 36840 + }, + { + "epoch": 0.17925, + "grad_norm": 0.13361723721027374, + "learning_rate": 4.757665989382143e-05, + "loss": 0.0416, + "step": 36850 + }, + { + "epoch": 0.1793, + "grad_norm": 0.11338499933481216, + "learning_rate": 4.7574884167976575e-05, + "loss": 0.0416, + "step": 36860 + }, + { + "epoch": 0.17935, + "grad_norm": 0.12144909054040909, + "learning_rate": 4.757310782494286e-05, + "loss": 0.0409, + "step": 36870 + }, + { + "epoch": 0.1794, + "grad_norm": 0.1445496678352356, + "learning_rate": 4.7571330864768874e-05, + "loss": 0.0424, + "step": 36880 + }, + { + "epoch": 0.17945, + "grad_norm": 0.1517098993062973, + "learning_rate": 4.756955328750317e-05, + "loss": 0.0439, + "step": 36890 + }, + { + "epoch": 0.1795, + "grad_norm": 0.11401533335447311, + "learning_rate": 4.756777509319436e-05, + "loss": 0.0395, + "step": 36900 + }, + { + "epoch": 0.17955, + "grad_norm": 0.1113637238740921, + "learning_rate": 4.7565996281891054e-05, + "loss": 0.0411, + "step": 36910 + }, + { + "epoch": 0.1796, + "grad_norm": 0.1286325603723526, + "learning_rate": 4.7564216853641896e-05, + "loss": 0.0425, + "step": 36920 + }, + { + "epoch": 0.17965, + "grad_norm": 0.11840667575597763, + "learning_rate": 4.756243680849552e-05, + "loss": 0.0445, + "step": 36930 + }, + { + "epoch": 0.1797, + "grad_norm": 0.13859489560127258, + "learning_rate": 4.75606561465006e-05, + "loss": 0.0427, + "step": 36940 + }, + { + "epoch": 0.17975, + "grad_norm": 0.13032551109790802, + "learning_rate": 4.7558874867705815e-05, + "loss": 0.0404, + "step": 36950 + }, + { + "epoch": 0.1798, + "grad_norm": 0.15403352677822113, + "learning_rate": 4.755709297215987e-05, + "loss": 0.0411, + "step": 36960 + }, + { + "epoch": 0.17985, + "grad_norm": 0.12021184712648392, + "learning_rate": 4.7555310459911476e-05, + "loss": 0.0417, + "step": 36970 + }, + { + "epoch": 0.1799, + "grad_norm": 0.09335090965032578, + "learning_rate": 4.755352733100936e-05, + "loss": 0.04, + "step": 36980 + }, + { + "epoch": 0.17995, + "grad_norm": 0.10187531262636185, + "learning_rate": 4.755174358550229e-05, + "loss": 0.0395, + "step": 36990 + }, + { + "epoch": 0.18, + "grad_norm": 0.11029377579689026, + "learning_rate": 4.7549959223439016e-05, + "loss": 0.0408, + "step": 37000 + }, + { + "epoch": 0.18005, + "grad_norm": 0.10429729521274567, + "learning_rate": 4.754817424486833e-05, + "loss": 0.04, + "step": 37010 + }, + { + "epoch": 0.1801, + "grad_norm": 0.10928675532341003, + "learning_rate": 4.754638864983904e-05, + "loss": 0.0411, + "step": 37020 + }, + { + "epoch": 0.18015, + "grad_norm": 0.11169719696044922, + "learning_rate": 4.7544602438399945e-05, + "loss": 0.041, + "step": 37030 + }, + { + "epoch": 0.1802, + "grad_norm": 0.1231222003698349, + "learning_rate": 4.754281561059989e-05, + "loss": 0.042, + "step": 37040 + }, + { + "epoch": 0.18025, + "grad_norm": 0.11430267244577408, + "learning_rate": 4.754102816648772e-05, + "loss": 0.041, + "step": 37050 + }, + { + "epoch": 0.1803, + "grad_norm": 0.11378253251314163, + "learning_rate": 4.753924010611231e-05, + "loss": 0.041, + "step": 37060 + }, + { + "epoch": 0.18035, + "grad_norm": 0.11739642173051834, + "learning_rate": 4.753745142952255e-05, + "loss": 0.0395, + "step": 37070 + }, + { + "epoch": 0.1804, + "grad_norm": 0.10547371953725815, + "learning_rate": 4.7535662136767336e-05, + "loss": 0.0421, + "step": 37080 + }, + { + "epoch": 0.18045, + "grad_norm": 0.11511596292257309, + "learning_rate": 4.753387222789558e-05, + "loss": 0.0406, + "step": 37090 + }, + { + "epoch": 0.1805, + "grad_norm": 0.11022399365901947, + "learning_rate": 4.753208170295622e-05, + "loss": 0.0427, + "step": 37100 + }, + { + "epoch": 0.18055, + "grad_norm": 0.10935037583112717, + "learning_rate": 4.7530290561998216e-05, + "loss": 0.0443, + "step": 37110 + }, + { + "epoch": 0.1806, + "grad_norm": 0.11182756721973419, + "learning_rate": 4.7528498805070534e-05, + "loss": 0.043, + "step": 37120 + }, + { + "epoch": 0.18065, + "grad_norm": 0.10958198457956314, + "learning_rate": 4.752670643222216e-05, + "loss": 0.0403, + "step": 37130 + }, + { + "epoch": 0.1807, + "grad_norm": 0.12529942393302917, + "learning_rate": 4.7524913443502086e-05, + "loss": 0.0441, + "step": 37140 + }, + { + "epoch": 0.18075, + "grad_norm": 0.17277610301971436, + "learning_rate": 4.7523119838959345e-05, + "loss": 0.0424, + "step": 37150 + }, + { + "epoch": 0.1808, + "grad_norm": 0.12355010211467743, + "learning_rate": 4.752132561864297e-05, + "loss": 0.0414, + "step": 37160 + }, + { + "epoch": 0.18085, + "grad_norm": 0.12070120126008987, + "learning_rate": 4.751953078260202e-05, + "loss": 0.0427, + "step": 37170 + }, + { + "epoch": 0.1809, + "grad_norm": 0.12554572522640228, + "learning_rate": 4.751773533088554e-05, + "loss": 0.0446, + "step": 37180 + }, + { + "epoch": 0.18095, + "grad_norm": 0.12956035137176514, + "learning_rate": 4.751593926354265e-05, + "loss": 0.0413, + "step": 37190 + }, + { + "epoch": 0.181, + "grad_norm": 0.10663142800331116, + "learning_rate": 4.751414258062244e-05, + "loss": 0.0421, + "step": 37200 + }, + { + "epoch": 0.18105, + "grad_norm": 0.1439746767282486, + "learning_rate": 4.751234528217402e-05, + "loss": 0.0426, + "step": 37210 + }, + { + "epoch": 0.1811, + "grad_norm": 0.1384759396314621, + "learning_rate": 4.751054736824655e-05, + "loss": 0.0429, + "step": 37220 + }, + { + "epoch": 0.18115, + "grad_norm": 0.14315307140350342, + "learning_rate": 4.750874883888916e-05, + "loss": 0.0426, + "step": 37230 + }, + { + "epoch": 0.1812, + "grad_norm": 0.11895201355218887, + "learning_rate": 4.750694969415105e-05, + "loss": 0.0423, + "step": 37240 + }, + { + "epoch": 0.18125, + "grad_norm": 0.14067918062210083, + "learning_rate": 4.750514993408137e-05, + "loss": 0.0422, + "step": 37250 + }, + { + "epoch": 0.1813, + "grad_norm": 0.13616418838500977, + "learning_rate": 4.7503349558729356e-05, + "loss": 0.0425, + "step": 37260 + }, + { + "epoch": 0.18135, + "grad_norm": 0.1241074725985527, + "learning_rate": 4.750154856814422e-05, + "loss": 0.041, + "step": 37270 + }, + { + "epoch": 0.1814, + "grad_norm": 0.11493717133998871, + "learning_rate": 4.74997469623752e-05, + "loss": 0.0413, + "step": 37280 + }, + { + "epoch": 0.18145, + "grad_norm": 0.11242803186178207, + "learning_rate": 4.7497944741471546e-05, + "loss": 0.0402, + "step": 37290 + }, + { + "epoch": 0.1815, + "grad_norm": 0.10811872780323029, + "learning_rate": 4.749614190548254e-05, + "loss": 0.0425, + "step": 37300 + }, + { + "epoch": 0.18155, + "grad_norm": 0.10886134952306747, + "learning_rate": 4.749433845445746e-05, + "loss": 0.04, + "step": 37310 + }, + { + "epoch": 0.1816, + "grad_norm": 0.12404333055019379, + "learning_rate": 4.749253438844562e-05, + "loss": 0.0424, + "step": 37320 + }, + { + "epoch": 0.18165, + "grad_norm": 0.11045855283737183, + "learning_rate": 4.7490729707496346e-05, + "loss": 0.0427, + "step": 37330 + }, + { + "epoch": 0.1817, + "grad_norm": 0.12336307018995285, + "learning_rate": 4.7488924411658965e-05, + "loss": 0.0427, + "step": 37340 + }, + { + "epoch": 0.18175, + "grad_norm": 0.1207285076379776, + "learning_rate": 4.748711850098284e-05, + "loss": 0.0414, + "step": 37350 + }, + { + "epoch": 0.1818, + "grad_norm": 0.10973604023456573, + "learning_rate": 4.748531197551734e-05, + "loss": 0.0417, + "step": 37360 + }, + { + "epoch": 0.18185, + "grad_norm": 0.11707280576229095, + "learning_rate": 4.7483504835311866e-05, + "loss": 0.0425, + "step": 37370 + }, + { + "epoch": 0.1819, + "grad_norm": 0.12151395529508591, + "learning_rate": 4.748169708041581e-05, + "loss": 0.0432, + "step": 37380 + }, + { + "epoch": 0.18195, + "grad_norm": 0.13998086750507355, + "learning_rate": 4.747988871087861e-05, + "loss": 0.0435, + "step": 37390 + }, + { + "epoch": 0.182, + "grad_norm": 0.10092715173959732, + "learning_rate": 4.7478079726749686e-05, + "loss": 0.0409, + "step": 37400 + }, + { + "epoch": 0.18205, + "grad_norm": 0.1240464299917221, + "learning_rate": 4.7476270128078506e-05, + "loss": 0.0411, + "step": 37410 + }, + { + "epoch": 0.1821, + "grad_norm": 0.11496293544769287, + "learning_rate": 4.7474459914914556e-05, + "loss": 0.0406, + "step": 37420 + }, + { + "epoch": 0.18215, + "grad_norm": 0.14588946104049683, + "learning_rate": 4.747264908730731e-05, + "loss": 0.0423, + "step": 37430 + }, + { + "epoch": 0.1822, + "grad_norm": 0.10504837334156036, + "learning_rate": 4.747083764530628e-05, + "loss": 0.0411, + "step": 37440 + }, + { + "epoch": 0.18225, + "grad_norm": 0.13117945194244385, + "learning_rate": 4.746902558896099e-05, + "loss": 0.0424, + "step": 37450 + }, + { + "epoch": 0.1823, + "grad_norm": 0.13221648335456848, + "learning_rate": 4.746721291832098e-05, + "loss": 0.0436, + "step": 37460 + }, + { + "epoch": 0.18235, + "grad_norm": 0.1333584040403366, + "learning_rate": 4.7465399633435814e-05, + "loss": 0.0418, + "step": 37470 + }, + { + "epoch": 0.1824, + "grad_norm": 0.12405399233102798, + "learning_rate": 4.7463585734355064e-05, + "loss": 0.045, + "step": 37480 + }, + { + "epoch": 0.18245, + "grad_norm": 0.13527528941631317, + "learning_rate": 4.746177122112831e-05, + "loss": 0.0419, + "step": 37490 + }, + { + "epoch": 0.1825, + "grad_norm": 0.14091968536376953, + "learning_rate": 4.745995609380518e-05, + "loss": 0.0444, + "step": 37500 + }, + { + "epoch": 0.18255, + "grad_norm": 0.13996820151805878, + "learning_rate": 4.745814035243528e-05, + "loss": 0.0405, + "step": 37510 + }, + { + "epoch": 0.1826, + "grad_norm": 0.11088625341653824, + "learning_rate": 4.7456323997068264e-05, + "loss": 0.0424, + "step": 37520 + }, + { + "epoch": 0.18265, + "grad_norm": 0.12366887181997299, + "learning_rate": 4.7454507027753784e-05, + "loss": 0.0434, + "step": 37530 + }, + { + "epoch": 0.1827, + "grad_norm": 0.11137080937623978, + "learning_rate": 4.745268944454152e-05, + "loss": 0.0411, + "step": 37540 + }, + { + "epoch": 0.18275, + "grad_norm": 0.1298379898071289, + "learning_rate": 4.745087124748116e-05, + "loss": 0.0406, + "step": 37550 + }, + { + "epoch": 0.1828, + "grad_norm": 0.11690547317266464, + "learning_rate": 4.744905243662241e-05, + "loss": 0.0402, + "step": 37560 + }, + { + "epoch": 0.18285, + "grad_norm": 0.11526573449373245, + "learning_rate": 4.744723301201501e-05, + "loss": 0.0421, + "step": 37570 + }, + { + "epoch": 0.1829, + "grad_norm": 0.12407028675079346, + "learning_rate": 4.7445412973708694e-05, + "loss": 0.0392, + "step": 37580 + }, + { + "epoch": 0.18295, + "grad_norm": 0.13423435389995575, + "learning_rate": 4.74435923217532e-05, + "loss": 0.0395, + "step": 37590 + }, + { + "epoch": 0.183, + "grad_norm": 0.12274365872144699, + "learning_rate": 4.744177105619835e-05, + "loss": 0.0405, + "step": 37600 + }, + { + "epoch": 0.18305, + "grad_norm": 0.10433799028396606, + "learning_rate": 4.743994917709389e-05, + "loss": 0.039, + "step": 37610 + }, + { + "epoch": 0.1831, + "grad_norm": 0.1150219738483429, + "learning_rate": 4.7438126684489656e-05, + "loss": 0.0414, + "step": 37620 + }, + { + "epoch": 0.18315, + "grad_norm": 0.13015680015087128, + "learning_rate": 4.743630357843547e-05, + "loss": 0.04, + "step": 37630 + }, + { + "epoch": 0.1832, + "grad_norm": 0.12152936309576035, + "learning_rate": 4.743447985898117e-05, + "loss": 0.0415, + "step": 37640 + }, + { + "epoch": 0.18325, + "grad_norm": 0.11936400085687637, + "learning_rate": 4.743265552617663e-05, + "loss": 0.04, + "step": 37650 + }, + { + "epoch": 0.1833, + "grad_norm": 0.151456817984581, + "learning_rate": 4.74308305800717e-05, + "loss": 0.0408, + "step": 37660 + }, + { + "epoch": 0.18335, + "grad_norm": 0.13457858562469482, + "learning_rate": 4.74290050207163e-05, + "loss": 0.042, + "step": 37670 + }, + { + "epoch": 0.1834, + "grad_norm": 0.12324851751327515, + "learning_rate": 4.742717884816032e-05, + "loss": 0.0411, + "step": 37680 + }, + { + "epoch": 0.18345, + "grad_norm": 0.12512250244617462, + "learning_rate": 4.74253520624537e-05, + "loss": 0.0397, + "step": 37690 + }, + { + "epoch": 0.1835, + "grad_norm": 0.12226025015115738, + "learning_rate": 4.742352466364638e-05, + "loss": 0.0387, + "step": 37700 + }, + { + "epoch": 0.18355, + "grad_norm": 0.12683923542499542, + "learning_rate": 4.742169665178832e-05, + "loss": 0.0397, + "step": 37710 + }, + { + "epoch": 0.1836, + "grad_norm": 0.1118398904800415, + "learning_rate": 4.741986802692949e-05, + "loss": 0.0389, + "step": 37720 + }, + { + "epoch": 0.18365, + "grad_norm": 0.11483946442604065, + "learning_rate": 4.74180387891199e-05, + "loss": 0.0433, + "step": 37730 + }, + { + "epoch": 0.1837, + "grad_norm": 0.11242949962615967, + "learning_rate": 4.741620893840955e-05, + "loss": 0.0392, + "step": 37740 + }, + { + "epoch": 0.18375, + "grad_norm": 0.1014494001865387, + "learning_rate": 4.7414378474848464e-05, + "loss": 0.0398, + "step": 37750 + }, + { + "epoch": 0.1838, + "grad_norm": 0.10241231322288513, + "learning_rate": 4.741254739848669e-05, + "loss": 0.0395, + "step": 37760 + }, + { + "epoch": 0.18385, + "grad_norm": 0.10721082240343094, + "learning_rate": 4.74107157093743e-05, + "loss": 0.0425, + "step": 37770 + }, + { + "epoch": 0.1839, + "grad_norm": 0.13023872673511505, + "learning_rate": 4.740888340756136e-05, + "loss": 0.0391, + "step": 37780 + }, + { + "epoch": 0.18395, + "grad_norm": 0.11415564268827438, + "learning_rate": 4.740705049309796e-05, + "loss": 0.0416, + "step": 37790 + }, + { + "epoch": 0.184, + "grad_norm": 0.1057114526629448, + "learning_rate": 4.740521696603423e-05, + "loss": 0.0426, + "step": 37800 + }, + { + "epoch": 0.18405, + "grad_norm": 0.12084627896547318, + "learning_rate": 4.740338282642027e-05, + "loss": 0.0413, + "step": 37810 + }, + { + "epoch": 0.1841, + "grad_norm": 0.10022873431444168, + "learning_rate": 4.7401548074306245e-05, + "loss": 0.0388, + "step": 37820 + }, + { + "epoch": 0.18415, + "grad_norm": 0.09546560049057007, + "learning_rate": 4.7399712709742316e-05, + "loss": 0.0412, + "step": 37830 + }, + { + "epoch": 0.1842, + "grad_norm": 0.12398182600736618, + "learning_rate": 4.739787673277865e-05, + "loss": 0.04, + "step": 37840 + }, + { + "epoch": 0.18425, + "grad_norm": 0.11884655058383942, + "learning_rate": 4.739604014346545e-05, + "loss": 0.0412, + "step": 37850 + }, + { + "epoch": 0.1843, + "grad_norm": 0.1322961449623108, + "learning_rate": 4.7394202941852925e-05, + "loss": 0.0416, + "step": 37860 + }, + { + "epoch": 0.18435, + "grad_norm": 0.11267701536417007, + "learning_rate": 4.7392365127991315e-05, + "loss": 0.0417, + "step": 37870 + }, + { + "epoch": 0.1844, + "grad_norm": 0.10018815845251083, + "learning_rate": 4.739052670193085e-05, + "loss": 0.0405, + "step": 37880 + }, + { + "epoch": 0.18445, + "grad_norm": 0.11316210776567459, + "learning_rate": 4.7388687663721784e-05, + "loss": 0.0398, + "step": 37890 + }, + { + "epoch": 0.1845, + "grad_norm": 0.11430251598358154, + "learning_rate": 4.738684801341442e-05, + "loss": 0.0425, + "step": 37900 + }, + { + "epoch": 0.18455, + "grad_norm": 0.12140493839979172, + "learning_rate": 4.738500775105904e-05, + "loss": 0.0406, + "step": 37910 + }, + { + "epoch": 0.1846, + "grad_norm": 0.14046849310398102, + "learning_rate": 4.7383166876705966e-05, + "loss": 0.0402, + "step": 37920 + }, + { + "epoch": 0.18465, + "grad_norm": 0.17947669327259064, + "learning_rate": 4.73813253904055e-05, + "loss": 0.0446, + "step": 37930 + }, + { + "epoch": 0.1847, + "grad_norm": 0.13138681650161743, + "learning_rate": 4.7379483292208026e-05, + "loss": 0.0399, + "step": 37940 + }, + { + "epoch": 0.18475, + "grad_norm": 0.1199428141117096, + "learning_rate": 4.7377640582163876e-05, + "loss": 0.0399, + "step": 37950 + }, + { + "epoch": 0.1848, + "grad_norm": 0.1424848437309265, + "learning_rate": 4.737579726032344e-05, + "loss": 0.0425, + "step": 37960 + }, + { + "epoch": 0.18485, + "grad_norm": 0.1481008380651474, + "learning_rate": 4.7373953326737114e-05, + "loss": 0.0423, + "step": 37970 + }, + { + "epoch": 0.1849, + "grad_norm": 0.15787029266357422, + "learning_rate": 4.7372108781455306e-05, + "loss": 0.0428, + "step": 37980 + }, + { + "epoch": 0.18495, + "grad_norm": 0.16060331463813782, + "learning_rate": 4.737026362452845e-05, + "loss": 0.0415, + "step": 37990 + }, + { + "epoch": 0.185, + "grad_norm": 0.1347510814666748, + "learning_rate": 4.7368417856006996e-05, + "loss": 0.0408, + "step": 38000 + }, + { + "epoch": 0.18505, + "grad_norm": 0.13687683641910553, + "learning_rate": 4.73665714759414e-05, + "loss": 0.0395, + "step": 38010 + }, + { + "epoch": 0.1851, + "grad_norm": 0.13434498012065887, + "learning_rate": 4.7364724484382137e-05, + "loss": 0.0399, + "step": 38020 + }, + { + "epoch": 0.18515, + "grad_norm": 0.12056301534175873, + "learning_rate": 4.7362876881379714e-05, + "loss": 0.0393, + "step": 38030 + }, + { + "epoch": 0.1852, + "grad_norm": 0.10731586813926697, + "learning_rate": 4.736102866698463e-05, + "loss": 0.0402, + "step": 38040 + }, + { + "epoch": 0.18525, + "grad_norm": 0.14883634448051453, + "learning_rate": 4.7359179841247436e-05, + "loss": 0.0407, + "step": 38050 + }, + { + "epoch": 0.1853, + "grad_norm": 0.13408304750919342, + "learning_rate": 4.735733040421866e-05, + "loss": 0.0414, + "step": 38060 + }, + { + "epoch": 0.18535, + "grad_norm": 0.13457036018371582, + "learning_rate": 4.735548035594887e-05, + "loss": 0.0408, + "step": 38070 + }, + { + "epoch": 0.1854, + "grad_norm": 0.12219943106174469, + "learning_rate": 4.7353629696488636e-05, + "loss": 0.0409, + "step": 38080 + }, + { + "epoch": 0.18545, + "grad_norm": 0.10472157597541809, + "learning_rate": 4.735177842588857e-05, + "loss": 0.041, + "step": 38090 + }, + { + "epoch": 0.1855, + "grad_norm": 0.11975611001253128, + "learning_rate": 4.7349926544199285e-05, + "loss": 0.0392, + "step": 38100 + }, + { + "epoch": 0.18555, + "grad_norm": 0.10920672118663788, + "learning_rate": 4.7348074051471404e-05, + "loss": 0.0403, + "step": 38110 + }, + { + "epoch": 0.1856, + "grad_norm": 0.1137668713927269, + "learning_rate": 4.734622094775557e-05, + "loss": 0.0389, + "step": 38120 + }, + { + "epoch": 0.18565, + "grad_norm": 0.11372815817594528, + "learning_rate": 4.734436723310245e-05, + "loss": 0.0413, + "step": 38130 + }, + { + "epoch": 0.1857, + "grad_norm": 0.11171303689479828, + "learning_rate": 4.734251290756272e-05, + "loss": 0.0415, + "step": 38140 + }, + { + "epoch": 0.18575, + "grad_norm": 0.1370186060667038, + "learning_rate": 4.7340657971187094e-05, + "loss": 0.0426, + "step": 38150 + }, + { + "epoch": 0.1858, + "grad_norm": 0.13567283749580383, + "learning_rate": 4.7338802424026266e-05, + "loss": 0.0413, + "step": 38160 + }, + { + "epoch": 0.18585, + "grad_norm": 0.12056778371334076, + "learning_rate": 4.7336946266130965e-05, + "loss": 0.0421, + "step": 38170 + }, + { + "epoch": 0.1859, + "grad_norm": 0.11835743486881256, + "learning_rate": 4.733508949755195e-05, + "loss": 0.0409, + "step": 38180 + }, + { + "epoch": 0.18595, + "grad_norm": 0.1459125429391861, + "learning_rate": 4.733323211833998e-05, + "loss": 0.0417, + "step": 38190 + }, + { + "epoch": 0.186, + "grad_norm": 0.13521605730056763, + "learning_rate": 4.733137412854583e-05, + "loss": 0.0417, + "step": 38200 + }, + { + "epoch": 0.18605, + "grad_norm": 0.11533375829458237, + "learning_rate": 4.7329515528220306e-05, + "loss": 0.0415, + "step": 38210 + }, + { + "epoch": 0.1861, + "grad_norm": 0.11277813464403152, + "learning_rate": 4.732765631741422e-05, + "loss": 0.0423, + "step": 38220 + }, + { + "epoch": 0.18615, + "grad_norm": 0.09624869376420975, + "learning_rate": 4.7325796496178384e-05, + "loss": 0.0419, + "step": 38230 + }, + { + "epoch": 0.1862, + "grad_norm": 0.1310359239578247, + "learning_rate": 4.7323936064563665e-05, + "loss": 0.0435, + "step": 38240 + }, + { + "epoch": 0.18625, + "grad_norm": 0.1489342749118805, + "learning_rate": 4.732207502262093e-05, + "loss": 0.0444, + "step": 38250 + }, + { + "epoch": 0.1863, + "grad_norm": 0.11564663052558899, + "learning_rate": 4.732021337040105e-05, + "loss": 0.0415, + "step": 38260 + }, + { + "epoch": 0.18635, + "grad_norm": 0.11250898241996765, + "learning_rate": 4.731835110795491e-05, + "loss": 0.0413, + "step": 38270 + }, + { + "epoch": 0.1864, + "grad_norm": 0.11310267448425293, + "learning_rate": 4.7316488235333434e-05, + "loss": 0.0428, + "step": 38280 + }, + { + "epoch": 0.18645, + "grad_norm": 0.12901782989501953, + "learning_rate": 4.731462475258757e-05, + "loss": 0.0433, + "step": 38290 + }, + { + "epoch": 0.1865, + "grad_norm": 0.13150332868099213, + "learning_rate": 4.731276065976823e-05, + "loss": 0.0411, + "step": 38300 + }, + { + "epoch": 0.18655, + "grad_norm": 0.13157768547534943, + "learning_rate": 4.7310895956926406e-05, + "loss": 0.0405, + "step": 38310 + }, + { + "epoch": 0.1866, + "grad_norm": 0.12932339310646057, + "learning_rate": 4.730903064411307e-05, + "loss": 0.04, + "step": 38320 + }, + { + "epoch": 0.18665, + "grad_norm": 0.13414748013019562, + "learning_rate": 4.7307164721379216e-05, + "loss": 0.0438, + "step": 38330 + }, + { + "epoch": 0.1867, + "grad_norm": 0.13133932650089264, + "learning_rate": 4.730529818877585e-05, + "loss": 0.0418, + "step": 38340 + }, + { + "epoch": 0.18675, + "grad_norm": 0.12572602927684784, + "learning_rate": 4.730343104635402e-05, + "loss": 0.0417, + "step": 38350 + }, + { + "epoch": 0.1868, + "grad_norm": 0.10463862866163254, + "learning_rate": 4.7301563294164764e-05, + "loss": 0.0409, + "step": 38360 + }, + { + "epoch": 0.18685, + "grad_norm": 0.14042209088802338, + "learning_rate": 4.729969493225914e-05, + "loss": 0.0423, + "step": 38370 + }, + { + "epoch": 0.1869, + "grad_norm": 0.1559218168258667, + "learning_rate": 4.729782596068825e-05, + "loss": 0.0406, + "step": 38380 + }, + { + "epoch": 0.18695, + "grad_norm": 0.12093447893857956, + "learning_rate": 4.729595637950316e-05, + "loss": 0.0442, + "step": 38390 + }, + { + "epoch": 0.187, + "grad_norm": 0.13172774016857147, + "learning_rate": 4.7294086188755e-05, + "loss": 0.0403, + "step": 38400 + }, + { + "epoch": 0.18705, + "grad_norm": 0.1175733283162117, + "learning_rate": 4.7292215388494896e-05, + "loss": 0.0397, + "step": 38410 + }, + { + "epoch": 0.1871, + "grad_norm": 0.12560902535915375, + "learning_rate": 4.729034397877401e-05, + "loss": 0.0412, + "step": 38420 + }, + { + "epoch": 0.18715, + "grad_norm": 0.12575411796569824, + "learning_rate": 4.728847195964349e-05, + "loss": 0.0395, + "step": 38430 + }, + { + "epoch": 0.1872, + "grad_norm": 0.10520323365926743, + "learning_rate": 4.728659933115451e-05, + "loss": 0.0386, + "step": 38440 + }, + { + "epoch": 0.18725, + "grad_norm": 0.1346939504146576, + "learning_rate": 4.728472609335829e-05, + "loss": 0.041, + "step": 38450 + }, + { + "epoch": 0.1873, + "grad_norm": 0.10870691388845444, + "learning_rate": 4.728285224630602e-05, + "loss": 0.0402, + "step": 38460 + }, + { + "epoch": 0.18735, + "grad_norm": 0.13659881055355072, + "learning_rate": 4.7280977790048955e-05, + "loss": 0.041, + "step": 38470 + }, + { + "epoch": 0.1874, + "grad_norm": 0.11378481239080429, + "learning_rate": 4.727910272463831e-05, + "loss": 0.0392, + "step": 38480 + }, + { + "epoch": 0.18745, + "grad_norm": 0.13256292045116425, + "learning_rate": 4.727722705012538e-05, + "loss": 0.0401, + "step": 38490 + }, + { + "epoch": 0.1875, + "grad_norm": 0.13196402788162231, + "learning_rate": 4.7275350766561424e-05, + "loss": 0.0393, + "step": 38500 + }, + { + "epoch": 0.18755, + "grad_norm": 0.14104105532169342, + "learning_rate": 4.727347387399775e-05, + "loss": 0.0395, + "step": 38510 + }, + { + "epoch": 0.1876, + "grad_norm": 0.1576061099767685, + "learning_rate": 4.727159637248567e-05, + "loss": 0.0401, + "step": 38520 + }, + { + "epoch": 0.18765, + "grad_norm": 0.12883985042572021, + "learning_rate": 4.726971826207651e-05, + "loss": 0.0413, + "step": 38530 + }, + { + "epoch": 0.1877, + "grad_norm": 0.14144699275493622, + "learning_rate": 4.7267839542821615e-05, + "loss": 0.0418, + "step": 38540 + }, + { + "epoch": 0.18775, + "grad_norm": 0.13098569214344025, + "learning_rate": 4.7265960214772354e-05, + "loss": 0.0421, + "step": 38550 + }, + { + "epoch": 0.1878, + "grad_norm": 0.1399405300617218, + "learning_rate": 4.726408027798011e-05, + "loss": 0.0401, + "step": 38560 + }, + { + "epoch": 0.18785, + "grad_norm": 0.12670686841011047, + "learning_rate": 4.726219973249627e-05, + "loss": 0.0404, + "step": 38570 + }, + { + "epoch": 0.1879, + "grad_norm": 0.154850035905838, + "learning_rate": 4.7260318578372265e-05, + "loss": 0.0411, + "step": 38580 + }, + { + "epoch": 0.18795, + "grad_norm": 0.12611621618270874, + "learning_rate": 4.7258436815659504e-05, + "loss": 0.0417, + "step": 38590 + }, + { + "epoch": 0.188, + "grad_norm": 0.17699086666107178, + "learning_rate": 4.725655444440944e-05, + "loss": 0.0444, + "step": 38600 + }, + { + "epoch": 0.18805, + "grad_norm": 0.12984317541122437, + "learning_rate": 4.725467146467354e-05, + "loss": 0.0403, + "step": 38610 + }, + { + "epoch": 0.1881, + "grad_norm": 0.13115186989307404, + "learning_rate": 4.725278787650328e-05, + "loss": 0.0431, + "step": 38620 + }, + { + "epoch": 0.18815, + "grad_norm": 0.1118667721748352, + "learning_rate": 4.725090367995016e-05, + "loss": 0.0425, + "step": 38630 + }, + { + "epoch": 0.1882, + "grad_norm": 0.1281067132949829, + "learning_rate": 4.72490188750657e-05, + "loss": 0.0427, + "step": 38640 + }, + { + "epoch": 0.18825, + "grad_norm": 0.13673530519008636, + "learning_rate": 4.724713346190142e-05, + "loss": 0.0412, + "step": 38650 + }, + { + "epoch": 0.1883, + "grad_norm": 0.14275339245796204, + "learning_rate": 4.7245247440508864e-05, + "loss": 0.0405, + "step": 38660 + }, + { + "epoch": 0.18835, + "grad_norm": 0.1330052763223648, + "learning_rate": 4.7243360810939606e-05, + "loss": 0.0429, + "step": 38670 + }, + { + "epoch": 0.1884, + "grad_norm": 0.17282722890377045, + "learning_rate": 4.724147357324522e-05, + "loss": 0.0424, + "step": 38680 + }, + { + "epoch": 0.18845, + "grad_norm": 0.13815313577651978, + "learning_rate": 4.7239585727477296e-05, + "loss": 0.0415, + "step": 38690 + }, + { + "epoch": 0.1885, + "grad_norm": 0.14790645241737366, + "learning_rate": 4.723769727368747e-05, + "loss": 0.0406, + "step": 38700 + }, + { + "epoch": 0.18855, + "grad_norm": 0.12194222956895828, + "learning_rate": 4.723580821192733e-05, + "loss": 0.0395, + "step": 38710 + }, + { + "epoch": 0.1886, + "grad_norm": 0.16369490325450897, + "learning_rate": 4.723391854224857e-05, + "loss": 0.0417, + "step": 38720 + }, + { + "epoch": 0.18865, + "grad_norm": 0.1218780055642128, + "learning_rate": 4.723202826470281e-05, + "loss": 0.041, + "step": 38730 + }, + { + "epoch": 0.1887, + "grad_norm": 0.12561967968940735, + "learning_rate": 4.723013737934176e-05, + "loss": 0.0419, + "step": 38740 + }, + { + "epoch": 0.18875, + "grad_norm": 0.11251445859670639, + "learning_rate": 4.7228245886217104e-05, + "loss": 0.0402, + "step": 38750 + }, + { + "epoch": 0.1888, + "grad_norm": 0.1460985243320465, + "learning_rate": 4.722635378538056e-05, + "loss": 0.0462, + "step": 38760 + }, + { + "epoch": 0.18885, + "grad_norm": 0.15045461058616638, + "learning_rate": 4.722446107688385e-05, + "loss": 0.0412, + "step": 38770 + }, + { + "epoch": 0.1889, + "grad_norm": 0.12663142383098602, + "learning_rate": 4.722256776077872e-05, + "loss": 0.0416, + "step": 38780 + }, + { + "epoch": 0.18895, + "grad_norm": 0.14913487434387207, + "learning_rate": 4.722067383711694e-05, + "loss": 0.043, + "step": 38790 + }, + { + "epoch": 0.189, + "grad_norm": 0.10253901779651642, + "learning_rate": 4.721877930595029e-05, + "loss": 0.0413, + "step": 38800 + }, + { + "epoch": 0.18905, + "grad_norm": 0.13283059000968933, + "learning_rate": 4.721688416733055e-05, + "loss": 0.0429, + "step": 38810 + }, + { + "epoch": 0.1891, + "grad_norm": 0.12368650734424591, + "learning_rate": 4.721498842130955e-05, + "loss": 0.042, + "step": 38820 + }, + { + "epoch": 0.18915, + "grad_norm": 0.13323071599006653, + "learning_rate": 4.721309206793911e-05, + "loss": 0.0411, + "step": 38830 + }, + { + "epoch": 0.1892, + "grad_norm": 0.09104353189468384, + "learning_rate": 4.721119510727108e-05, + "loss": 0.0425, + "step": 38840 + }, + { + "epoch": 0.18925, + "grad_norm": 0.10489680618047714, + "learning_rate": 4.7209297539357324e-05, + "loss": 0.0423, + "step": 38850 + }, + { + "epoch": 0.1893, + "grad_norm": 0.11301996558904648, + "learning_rate": 4.72073993642497e-05, + "loss": 0.0422, + "step": 38860 + }, + { + "epoch": 0.18935, + "grad_norm": 0.14078927040100098, + "learning_rate": 4.720550058200014e-05, + "loss": 0.0424, + "step": 38870 + }, + { + "epoch": 0.1894, + "grad_norm": 0.15084753930568695, + "learning_rate": 4.720360119266053e-05, + "loss": 0.0415, + "step": 38880 + }, + { + "epoch": 0.18945, + "grad_norm": 0.11446460336446762, + "learning_rate": 4.7201701196282804e-05, + "loss": 0.0409, + "step": 38890 + }, + { + "epoch": 0.1895, + "grad_norm": 0.11791659146547318, + "learning_rate": 4.719980059291891e-05, + "loss": 0.0435, + "step": 38900 + }, + { + "epoch": 0.18955, + "grad_norm": 0.11702030152082443, + "learning_rate": 4.71978993826208e-05, + "loss": 0.0411, + "step": 38910 + }, + { + "epoch": 0.1896, + "grad_norm": 0.10994726419448853, + "learning_rate": 4.719599756544047e-05, + "loss": 0.0433, + "step": 38920 + }, + { + "epoch": 0.18965, + "grad_norm": 0.11107087880373001, + "learning_rate": 4.71940951414299e-05, + "loss": 0.0424, + "step": 38930 + }, + { + "epoch": 0.1897, + "grad_norm": 0.11057529598474503, + "learning_rate": 4.719219211064111e-05, + "loss": 0.0429, + "step": 38940 + }, + { + "epoch": 0.18975, + "grad_norm": 0.11657113581895828, + "learning_rate": 4.719028847312612e-05, + "loss": 0.0407, + "step": 38950 + }, + { + "epoch": 0.1898, + "grad_norm": 0.12191347032785416, + "learning_rate": 4.7188384228936986e-05, + "loss": 0.0419, + "step": 38960 + }, + { + "epoch": 0.18985, + "grad_norm": 0.11876232177019119, + "learning_rate": 4.7186479378125756e-05, + "loss": 0.0416, + "step": 38970 + }, + { + "epoch": 0.1899, + "grad_norm": 0.11813126504421234, + "learning_rate": 4.718457392074452e-05, + "loss": 0.0421, + "step": 38980 + }, + { + "epoch": 0.18995, + "grad_norm": 0.11932237446308136, + "learning_rate": 4.7182667856845364e-05, + "loss": 0.042, + "step": 38990 + }, + { + "epoch": 0.19, + "grad_norm": 0.12584024667739868, + "learning_rate": 4.71807611864804e-05, + "loss": 0.0449, + "step": 39000 + }, + { + "epoch": 0.19005, + "grad_norm": 0.11920773237943649, + "learning_rate": 4.717885390970177e-05, + "loss": 0.0468, + "step": 39010 + }, + { + "epoch": 0.1901, + "grad_norm": 0.11415545642375946, + "learning_rate": 4.7176946026561596e-05, + "loss": 0.0428, + "step": 39020 + }, + { + "epoch": 0.19015, + "grad_norm": 0.10740591585636139, + "learning_rate": 4.717503753711205e-05, + "loss": 0.0431, + "step": 39030 + }, + { + "epoch": 0.1902, + "grad_norm": 0.1185695007443428, + "learning_rate": 4.7173128441405315e-05, + "loss": 0.0448, + "step": 39040 + }, + { + "epoch": 0.19025, + "grad_norm": 0.1198916956782341, + "learning_rate": 4.717121873949357e-05, + "loss": 0.0415, + "step": 39050 + }, + { + "epoch": 0.1903, + "grad_norm": 0.14157956838607788, + "learning_rate": 4.716930843142904e-05, + "loss": 0.0443, + "step": 39060 + }, + { + "epoch": 0.19035, + "grad_norm": 0.11701211333274841, + "learning_rate": 4.716739751726394e-05, + "loss": 0.0427, + "step": 39070 + }, + { + "epoch": 0.1904, + "grad_norm": 0.13657964766025543, + "learning_rate": 4.716548599705053e-05, + "loss": 0.0403, + "step": 39080 + }, + { + "epoch": 0.19045, + "grad_norm": 0.13691385090351105, + "learning_rate": 4.716357387084105e-05, + "loss": 0.0441, + "step": 39090 + }, + { + "epoch": 0.1905, + "grad_norm": 0.149797260761261, + "learning_rate": 4.7161661138687794e-05, + "loss": 0.0428, + "step": 39100 + }, + { + "epoch": 0.19055, + "grad_norm": 0.12537500262260437, + "learning_rate": 4.715974780064304e-05, + "loss": 0.0427, + "step": 39110 + }, + { + "epoch": 0.1906, + "grad_norm": 0.11275333166122437, + "learning_rate": 4.7157833856759116e-05, + "loss": 0.0408, + "step": 39120 + }, + { + "epoch": 0.19065, + "grad_norm": 0.12390165030956268, + "learning_rate": 4.715591930708833e-05, + "loss": 0.0425, + "step": 39130 + }, + { + "epoch": 0.1907, + "grad_norm": 0.11583295464515686, + "learning_rate": 4.715400415168304e-05, + "loss": 0.0398, + "step": 39140 + }, + { + "epoch": 0.19075, + "grad_norm": 0.10922762751579285, + "learning_rate": 4.7152088390595595e-05, + "loss": 0.0398, + "step": 39150 + }, + { + "epoch": 0.1908, + "grad_norm": 0.11143206804990768, + "learning_rate": 4.715017202387838e-05, + "loss": 0.0402, + "step": 39160 + }, + { + "epoch": 0.19085, + "grad_norm": 0.11785729974508286, + "learning_rate": 4.714825505158378e-05, + "loss": 0.0423, + "step": 39170 + }, + { + "epoch": 0.1909, + "grad_norm": 0.12262077629566193, + "learning_rate": 4.714633747376421e-05, + "loss": 0.0403, + "step": 39180 + }, + { + "epoch": 0.19095, + "grad_norm": 0.11699897050857544, + "learning_rate": 4.714441929047209e-05, + "loss": 0.0414, + "step": 39190 + }, + { + "epoch": 0.191, + "grad_norm": 0.1262628734111786, + "learning_rate": 4.7142500501759866e-05, + "loss": 0.0421, + "step": 39200 + }, + { + "epoch": 0.19105, + "grad_norm": 0.13855388760566711, + "learning_rate": 4.714058110768e-05, + "loss": 0.0413, + "step": 39210 + }, + { + "epoch": 0.1911, + "grad_norm": 0.1330314725637436, + "learning_rate": 4.713866110828496e-05, + "loss": 0.0405, + "step": 39220 + }, + { + "epoch": 0.19115, + "grad_norm": 0.12841005623340607, + "learning_rate": 4.713674050362724e-05, + "loss": 0.0406, + "step": 39230 + }, + { + "epoch": 0.1912, + "grad_norm": 0.15829530358314514, + "learning_rate": 4.713481929375936e-05, + "loss": 0.0396, + "step": 39240 + }, + { + "epoch": 0.19125, + "grad_norm": 0.11351760476827621, + "learning_rate": 4.7132897478733836e-05, + "loss": 0.0429, + "step": 39250 + }, + { + "epoch": 0.1913, + "grad_norm": 0.12715424597263336, + "learning_rate": 4.71309750586032e-05, + "loss": 0.0424, + "step": 39260 + }, + { + "epoch": 0.19135, + "grad_norm": 0.12090346217155457, + "learning_rate": 4.712905203342003e-05, + "loss": 0.0404, + "step": 39270 + }, + { + "epoch": 0.1914, + "grad_norm": 0.10686006397008896, + "learning_rate": 4.712712840323689e-05, + "loss": 0.0402, + "step": 39280 + }, + { + "epoch": 0.19145, + "grad_norm": 0.1376791000366211, + "learning_rate": 4.7125204168106365e-05, + "loss": 0.043, + "step": 39290 + }, + { + "epoch": 0.1915, + "grad_norm": 0.1357308030128479, + "learning_rate": 4.7123279328081074e-05, + "loss": 0.0411, + "step": 39300 + }, + { + "epoch": 0.19155, + "grad_norm": 0.13313201069831848, + "learning_rate": 4.712135388321364e-05, + "loss": 0.0401, + "step": 39310 + }, + { + "epoch": 0.1916, + "grad_norm": 0.11176084727048874, + "learning_rate": 4.7119427833556696e-05, + "loss": 0.0433, + "step": 39320 + }, + { + "epoch": 0.19165, + "grad_norm": 0.13899466395378113, + "learning_rate": 4.711750117916292e-05, + "loss": 0.0426, + "step": 39330 + }, + { + "epoch": 0.1917, + "grad_norm": 0.11376943439245224, + "learning_rate": 4.711557392008495e-05, + "loss": 0.0447, + "step": 39340 + }, + { + "epoch": 0.19175, + "grad_norm": 0.13883130252361298, + "learning_rate": 4.7113646056375506e-05, + "loss": 0.042, + "step": 39350 + }, + { + "epoch": 0.1918, + "grad_norm": 0.11968471109867096, + "learning_rate": 4.711171758808729e-05, + "loss": 0.043, + "step": 39360 + }, + { + "epoch": 0.19185, + "grad_norm": 0.1307145357131958, + "learning_rate": 4.710978851527302e-05, + "loss": 0.04, + "step": 39370 + }, + { + "epoch": 0.1919, + "grad_norm": 0.11859049648046494, + "learning_rate": 4.710785883798543e-05, + "loss": 0.0402, + "step": 39380 + }, + { + "epoch": 0.19195, + "grad_norm": 0.12955895066261292, + "learning_rate": 4.71059285562773e-05, + "loss": 0.0412, + "step": 39390 + }, + { + "epoch": 0.192, + "grad_norm": 0.11519957333803177, + "learning_rate": 4.7103997670201376e-05, + "loss": 0.0443, + "step": 39400 + }, + { + "epoch": 0.19205, + "grad_norm": 0.13244330883026123, + "learning_rate": 4.710206617981047e-05, + "loss": 0.0402, + "step": 39410 + }, + { + "epoch": 0.1921, + "grad_norm": 0.09134592860937119, + "learning_rate": 4.7100134085157365e-05, + "loss": 0.0393, + "step": 39420 + }, + { + "epoch": 0.19215, + "grad_norm": 0.13230308890342712, + "learning_rate": 4.7098201386294904e-05, + "loss": 0.0402, + "step": 39430 + }, + { + "epoch": 0.1922, + "grad_norm": 0.12215183675289154, + "learning_rate": 4.7096268083275926e-05, + "loss": 0.0409, + "step": 39440 + }, + { + "epoch": 0.19225, + "grad_norm": 0.10658028721809387, + "learning_rate": 4.709433417615327e-05, + "loss": 0.0415, + "step": 39450 + }, + { + "epoch": 0.1923, + "grad_norm": 0.1050388514995575, + "learning_rate": 4.7092399664979824e-05, + "loss": 0.0434, + "step": 39460 + }, + { + "epoch": 0.19235, + "grad_norm": 0.10058680176734924, + "learning_rate": 4.709046454980846e-05, + "loss": 0.0387, + "step": 39470 + }, + { + "epoch": 0.1924, + "grad_norm": 0.10979174822568893, + "learning_rate": 4.708852883069211e-05, + "loss": 0.0392, + "step": 39480 + }, + { + "epoch": 0.19245, + "grad_norm": 0.1021413505077362, + "learning_rate": 4.7086592507683667e-05, + "loss": 0.04, + "step": 39490 + }, + { + "epoch": 0.1925, + "grad_norm": 0.10319457948207855, + "learning_rate": 4.708465558083609e-05, + "loss": 0.0387, + "step": 39500 + }, + { + "epoch": 0.19255, + "grad_norm": 0.10485776513814926, + "learning_rate": 4.7082718050202326e-05, + "loss": 0.0397, + "step": 39510 + }, + { + "epoch": 0.1926, + "grad_norm": 0.10097415745258331, + "learning_rate": 4.708077991583534e-05, + "loss": 0.0395, + "step": 39520 + }, + { + "epoch": 0.19265, + "grad_norm": 0.10246019810438156, + "learning_rate": 4.7078841177788136e-05, + "loss": 0.0409, + "step": 39530 + }, + { + "epoch": 0.1927, + "grad_norm": 0.10946165025234222, + "learning_rate": 4.7076901836113696e-05, + "loss": 0.0433, + "step": 39540 + }, + { + "epoch": 0.19275, + "grad_norm": 0.14613662660121918, + "learning_rate": 4.7074961890865065e-05, + "loss": 0.0423, + "step": 39550 + }, + { + "epoch": 0.1928, + "grad_norm": 0.12220507115125656, + "learning_rate": 4.707302134209527e-05, + "loss": 0.0395, + "step": 39560 + }, + { + "epoch": 0.19285, + "grad_norm": 0.13756348192691803, + "learning_rate": 4.7071080189857356e-05, + "loss": 0.0413, + "step": 39570 + }, + { + "epoch": 0.1929, + "grad_norm": 0.1259576976299286, + "learning_rate": 4.706913843420441e-05, + "loss": 0.0417, + "step": 39580 + }, + { + "epoch": 0.19295, + "grad_norm": 0.1224607303738594, + "learning_rate": 4.70671960751895e-05, + "loss": 0.0416, + "step": 39590 + }, + { + "epoch": 0.193, + "grad_norm": 0.1667010635137558, + "learning_rate": 4.706525311286574e-05, + "loss": 0.0456, + "step": 39600 + }, + { + "epoch": 0.19305, + "grad_norm": 0.13042321801185608, + "learning_rate": 4.706330954728626e-05, + "loss": 0.0442, + "step": 39610 + }, + { + "epoch": 0.1931, + "grad_norm": 0.12576870620250702, + "learning_rate": 4.7061365378504174e-05, + "loss": 0.0432, + "step": 39620 + }, + { + "epoch": 0.19315, + "grad_norm": 0.11800982058048248, + "learning_rate": 4.705942060657266e-05, + "loss": 0.0411, + "step": 39630 + }, + { + "epoch": 0.1932, + "grad_norm": 0.10502568632364273, + "learning_rate": 4.7057475231544865e-05, + "loss": 0.0403, + "step": 39640 + }, + { + "epoch": 0.19325, + "grad_norm": 0.09104301780462265, + "learning_rate": 4.705552925347398e-05, + "loss": 0.0424, + "step": 39650 + }, + { + "epoch": 0.1933, + "grad_norm": 0.11457667499780655, + "learning_rate": 4.705358267241322e-05, + "loss": 0.0426, + "step": 39660 + }, + { + "epoch": 0.19335, + "grad_norm": 0.12124619632959366, + "learning_rate": 4.705163548841579e-05, + "loss": 0.0401, + "step": 39670 + }, + { + "epoch": 0.1934, + "grad_norm": 0.11806228756904602, + "learning_rate": 4.704968770153493e-05, + "loss": 0.0406, + "step": 39680 + }, + { + "epoch": 0.19345, + "grad_norm": 0.11900746077299118, + "learning_rate": 4.704773931182389e-05, + "loss": 0.0408, + "step": 39690 + }, + { + "epoch": 0.1935, + "grad_norm": 0.09312348812818527, + "learning_rate": 4.704579031933595e-05, + "loss": 0.042, + "step": 39700 + }, + { + "epoch": 0.19355, + "grad_norm": 0.10966067761182785, + "learning_rate": 4.7043840724124375e-05, + "loss": 0.0411, + "step": 39710 + }, + { + "epoch": 0.1936, + "grad_norm": 0.11456384509801865, + "learning_rate": 4.704189052624248e-05, + "loss": 0.041, + "step": 39720 + }, + { + "epoch": 0.19365, + "grad_norm": 0.1079399585723877, + "learning_rate": 4.703993972574358e-05, + "loss": 0.0402, + "step": 39730 + }, + { + "epoch": 0.1937, + "grad_norm": 0.10426142066717148, + "learning_rate": 4.7037988322681e-05, + "loss": 0.0403, + "step": 39740 + }, + { + "epoch": 0.19375, + "grad_norm": 0.13527005910873413, + "learning_rate": 4.703603631710811e-05, + "loss": 0.0404, + "step": 39750 + }, + { + "epoch": 0.1938, + "grad_norm": 0.11864572763442993, + "learning_rate": 4.703408370907826e-05, + "loss": 0.0422, + "step": 39760 + }, + { + "epoch": 0.19385, + "grad_norm": 0.1404629349708557, + "learning_rate": 4.7032130498644835e-05, + "loss": 0.0413, + "step": 39770 + }, + { + "epoch": 0.1939, + "grad_norm": 0.10311874002218246, + "learning_rate": 4.703017668586125e-05, + "loss": 0.0425, + "step": 39780 + }, + { + "epoch": 0.19395, + "grad_norm": 0.1279229074716568, + "learning_rate": 4.70282222707809e-05, + "loss": 0.0402, + "step": 39790 + }, + { + "epoch": 0.194, + "grad_norm": 0.12124011665582657, + "learning_rate": 4.702626725345723e-05, + "loss": 0.0448, + "step": 39800 + }, + { + "epoch": 0.19405, + "grad_norm": 0.11050168424844742, + "learning_rate": 4.7024311633943696e-05, + "loss": 0.0402, + "step": 39810 + }, + { + "epoch": 0.1941, + "grad_norm": 0.13088427484035492, + "learning_rate": 4.702235541229375e-05, + "loss": 0.0409, + "step": 39820 + }, + { + "epoch": 0.19415, + "grad_norm": 0.1190885379910469, + "learning_rate": 4.702039858856088e-05, + "loss": 0.0429, + "step": 39830 + }, + { + "epoch": 0.1942, + "grad_norm": 0.10510970652103424, + "learning_rate": 4.701844116279859e-05, + "loss": 0.0402, + "step": 39840 + }, + { + "epoch": 0.19425, + "grad_norm": 0.10281083732843399, + "learning_rate": 4.7016483135060386e-05, + "loss": 0.0401, + "step": 39850 + }, + { + "epoch": 0.1943, + "grad_norm": 0.12347941845655441, + "learning_rate": 4.701452450539981e-05, + "loss": 0.0402, + "step": 39860 + }, + { + "epoch": 0.19435, + "grad_norm": 0.10865604877471924, + "learning_rate": 4.70125652738704e-05, + "loss": 0.0403, + "step": 39870 + }, + { + "epoch": 0.1944, + "grad_norm": 0.14394986629486084, + "learning_rate": 4.701060544052572e-05, + "loss": 0.0422, + "step": 39880 + }, + { + "epoch": 0.19445, + "grad_norm": 0.11788714677095413, + "learning_rate": 4.700864500541936e-05, + "loss": 0.0434, + "step": 39890 + }, + { + "epoch": 0.1945, + "grad_norm": 0.11781013011932373, + "learning_rate": 4.7006683968604915e-05, + "loss": 0.0434, + "step": 39900 + }, + { + "epoch": 0.19455, + "grad_norm": 0.10928132385015488, + "learning_rate": 4.7004722330136005e-05, + "loss": 0.0397, + "step": 39910 + }, + { + "epoch": 0.1946, + "grad_norm": 0.10922566801309586, + "learning_rate": 4.700276009006625e-05, + "loss": 0.0412, + "step": 39920 + }, + { + "epoch": 0.19465, + "grad_norm": 0.10895717144012451, + "learning_rate": 4.700079724844929e-05, + "loss": 0.0403, + "step": 39930 + }, + { + "epoch": 0.1947, + "grad_norm": 0.11466559022665024, + "learning_rate": 4.6998833805338806e-05, + "loss": 0.041, + "step": 39940 + }, + { + "epoch": 0.19475, + "grad_norm": 0.11927556246519089, + "learning_rate": 4.699686976078847e-05, + "loss": 0.0401, + "step": 39950 + }, + { + "epoch": 0.1948, + "grad_norm": 0.10541284084320068, + "learning_rate": 4.6994905114851976e-05, + "loss": 0.0409, + "step": 39960 + }, + { + "epoch": 0.19485, + "grad_norm": 0.11831863224506378, + "learning_rate": 4.699293986758304e-05, + "loss": 0.0418, + "step": 39970 + }, + { + "epoch": 0.1949, + "grad_norm": 0.12878260016441345, + "learning_rate": 4.699097401903539e-05, + "loss": 0.0407, + "step": 39980 + }, + { + "epoch": 0.19495, + "grad_norm": 0.10302649438381195, + "learning_rate": 4.6989007569262776e-05, + "loss": 0.042, + "step": 39990 + }, + { + "epoch": 0.195, + "grad_norm": 0.13494367897510529, + "learning_rate": 4.698704051831896e-05, + "loss": 0.0412, + "step": 40000 + }, + { + "epoch": 0.19505, + "grad_norm": 0.12048853188753128, + "learning_rate": 4.6985072866257704e-05, + "loss": 0.0418, + "step": 40010 + }, + { + "epoch": 0.1951, + "grad_norm": 0.13809038698673248, + "learning_rate": 4.698310461313282e-05, + "loss": 0.0412, + "step": 40020 + }, + { + "epoch": 0.19515, + "grad_norm": 0.13032236695289612, + "learning_rate": 4.6981135758998115e-05, + "loss": 0.0404, + "step": 40030 + }, + { + "epoch": 0.1952, + "grad_norm": 0.14365342259407043, + "learning_rate": 4.6979166303907425e-05, + "loss": 0.0404, + "step": 40040 + }, + { + "epoch": 0.19525, + "grad_norm": 0.15081055462360382, + "learning_rate": 4.697719624791458e-05, + "loss": 0.0398, + "step": 40050 + }, + { + "epoch": 0.1953, + "grad_norm": 0.13427375257015228, + "learning_rate": 4.697522559107344e-05, + "loss": 0.0404, + "step": 40060 + }, + { + "epoch": 0.19535, + "grad_norm": 0.14125995337963104, + "learning_rate": 4.697325433343789e-05, + "loss": 0.0422, + "step": 40070 + }, + { + "epoch": 0.1954, + "grad_norm": 0.14351476728916168, + "learning_rate": 4.697128247506183e-05, + "loss": 0.0415, + "step": 40080 + }, + { + "epoch": 0.19545, + "grad_norm": 0.12644805014133453, + "learning_rate": 4.696931001599914e-05, + "loss": 0.0403, + "step": 40090 + }, + { + "epoch": 0.1955, + "grad_norm": 0.1217808797955513, + "learning_rate": 4.6967336956303794e-05, + "loss": 0.0423, + "step": 40100 + }, + { + "epoch": 0.19555, + "grad_norm": 0.11470723897218704, + "learning_rate": 4.6965363296029695e-05, + "loss": 0.0417, + "step": 40110 + }, + { + "epoch": 0.1956, + "grad_norm": 0.10986042767763138, + "learning_rate": 4.696338903523082e-05, + "loss": 0.0425, + "step": 40120 + }, + { + "epoch": 0.19565, + "grad_norm": 0.122439906001091, + "learning_rate": 4.696141417396114e-05, + "loss": 0.0433, + "step": 40130 + }, + { + "epoch": 0.1957, + "grad_norm": 0.11072283983230591, + "learning_rate": 4.695943871227464e-05, + "loss": 0.0422, + "step": 40140 + }, + { + "epoch": 0.19575, + "grad_norm": 0.14070159196853638, + "learning_rate": 4.695746265022534e-05, + "loss": 0.0445, + "step": 40150 + }, + { + "epoch": 0.1958, + "grad_norm": 0.11827629059553146, + "learning_rate": 4.695548598786726e-05, + "loss": 0.0435, + "step": 40160 + }, + { + "epoch": 0.19585, + "grad_norm": 0.123737633228302, + "learning_rate": 4.695350872525444e-05, + "loss": 0.045, + "step": 40170 + }, + { + "epoch": 0.1959, + "grad_norm": 0.0988842099905014, + "learning_rate": 4.695153086244094e-05, + "loss": 0.0434, + "step": 40180 + }, + { + "epoch": 0.19595, + "grad_norm": 0.12570056319236755, + "learning_rate": 4.6949552399480834e-05, + "loss": 0.0425, + "step": 40190 + }, + { + "epoch": 0.196, + "grad_norm": 0.1427522897720337, + "learning_rate": 4.694757333642821e-05, + "loss": 0.0402, + "step": 40200 + }, + { + "epoch": 0.19605, + "grad_norm": 0.11194559931755066, + "learning_rate": 4.6945593673337173e-05, + "loss": 0.0441, + "step": 40210 + }, + { + "epoch": 0.1961, + "grad_norm": 0.1071145310997963, + "learning_rate": 4.6943613410261856e-05, + "loss": 0.0432, + "step": 40220 + }, + { + "epoch": 0.19615, + "grad_norm": 0.1292775571346283, + "learning_rate": 4.694163254725639e-05, + "loss": 0.0437, + "step": 40230 + }, + { + "epoch": 0.1962, + "grad_norm": 0.11506784707307816, + "learning_rate": 4.693965108437494e-05, + "loss": 0.0403, + "step": 40240 + }, + { + "epoch": 0.19625, + "grad_norm": 0.12157977372407913, + "learning_rate": 4.693766902167166e-05, + "loss": 0.0411, + "step": 40250 + }, + { + "epoch": 0.1963, + "grad_norm": 0.14803488552570343, + "learning_rate": 4.6935686359200754e-05, + "loss": 0.0407, + "step": 40260 + }, + { + "epoch": 0.19635, + "grad_norm": 0.12866628170013428, + "learning_rate": 4.6933703097016425e-05, + "loss": 0.0418, + "step": 40270 + }, + { + "epoch": 0.1964, + "grad_norm": 0.15898150205612183, + "learning_rate": 4.693171923517289e-05, + "loss": 0.041, + "step": 40280 + }, + { + "epoch": 0.19645, + "grad_norm": 0.11652620136737823, + "learning_rate": 4.69297347737244e-05, + "loss": 0.0414, + "step": 40290 + }, + { + "epoch": 0.1965, + "grad_norm": 0.11326843500137329, + "learning_rate": 4.692774971272519e-05, + "loss": 0.0403, + "step": 40300 + }, + { + "epoch": 0.19655, + "grad_norm": 0.11849953234195709, + "learning_rate": 4.692576405222955e-05, + "loss": 0.0389, + "step": 40310 + }, + { + "epoch": 0.1966, + "grad_norm": 0.13824835419654846, + "learning_rate": 4.6923777792291746e-05, + "loss": 0.0454, + "step": 40320 + }, + { + "epoch": 0.19665, + "grad_norm": 0.12199034541845322, + "learning_rate": 4.69217909329661e-05, + "loss": 0.0406, + "step": 40330 + }, + { + "epoch": 0.1967, + "grad_norm": 0.15483516454696655, + "learning_rate": 4.6919803474306926e-05, + "loss": 0.0408, + "step": 40340 + }, + { + "epoch": 0.19675, + "grad_norm": 0.11486873030662537, + "learning_rate": 4.691781541636856e-05, + "loss": 0.0422, + "step": 40350 + }, + { + "epoch": 0.1968, + "grad_norm": 0.12523794174194336, + "learning_rate": 4.6915826759205355e-05, + "loss": 0.0414, + "step": 40360 + }, + { + "epoch": 0.19685, + "grad_norm": 0.11457782983779907, + "learning_rate": 4.691383750287168e-05, + "loss": 0.0414, + "step": 40370 + }, + { + "epoch": 0.1969, + "grad_norm": 0.09785265475511551, + "learning_rate": 4.691184764742192e-05, + "loss": 0.0416, + "step": 40380 + }, + { + "epoch": 0.19695, + "grad_norm": 0.10499918460845947, + "learning_rate": 4.690985719291048e-05, + "loss": 0.0407, + "step": 40390 + }, + { + "epoch": 0.197, + "grad_norm": 0.09804340451955795, + "learning_rate": 4.6907866139391766e-05, + "loss": 0.0418, + "step": 40400 + }, + { + "epoch": 0.19705, + "grad_norm": 0.09513143450021744, + "learning_rate": 4.6905874486920234e-05, + "loss": 0.0396, + "step": 40410 + }, + { + "epoch": 0.1971, + "grad_norm": 0.11613192409276962, + "learning_rate": 4.690388223555031e-05, + "loss": 0.0398, + "step": 40420 + }, + { + "epoch": 0.19715, + "grad_norm": 0.11552929133176804, + "learning_rate": 4.6901889385336486e-05, + "loss": 0.0397, + "step": 40430 + }, + { + "epoch": 0.1972, + "grad_norm": 0.10436779260635376, + "learning_rate": 4.6899895936333226e-05, + "loss": 0.0401, + "step": 40440 + }, + { + "epoch": 0.19725, + "grad_norm": 0.1088162288069725, + "learning_rate": 4.6897901888595044e-05, + "loss": 0.04, + "step": 40450 + }, + { + "epoch": 0.1973, + "grad_norm": 0.0982193872332573, + "learning_rate": 4.689590724217645e-05, + "loss": 0.0403, + "step": 40460 + }, + { + "epoch": 0.19735, + "grad_norm": 0.11795471608638763, + "learning_rate": 4.689391199713198e-05, + "loss": 0.0401, + "step": 40470 + }, + { + "epoch": 0.1974, + "grad_norm": 0.10341308265924454, + "learning_rate": 4.689191615351618e-05, + "loss": 0.0397, + "step": 40480 + }, + { + "epoch": 0.19745, + "grad_norm": 0.10679616034030914, + "learning_rate": 4.6889919711383614e-05, + "loss": 0.0395, + "step": 40490 + }, + { + "epoch": 0.1975, + "grad_norm": 0.11206220835447311, + "learning_rate": 4.6887922670788866e-05, + "loss": 0.0418, + "step": 40500 + }, + { + "epoch": 0.19755, + "grad_norm": 0.13069789111614227, + "learning_rate": 4.688592503178654e-05, + "loss": 0.0401, + "step": 40510 + }, + { + "epoch": 0.1976, + "grad_norm": 0.10349959135055542, + "learning_rate": 4.6883926794431244e-05, + "loss": 0.0394, + "step": 40520 + }, + { + "epoch": 0.19765, + "grad_norm": 0.10999346524477005, + "learning_rate": 4.68819279587776e-05, + "loss": 0.0395, + "step": 40530 + }, + { + "epoch": 0.1977, + "grad_norm": 0.11055387556552887, + "learning_rate": 4.6879928524880284e-05, + "loss": 0.0403, + "step": 40540 + }, + { + "epoch": 0.19775, + "grad_norm": 0.12687133252620697, + "learning_rate": 4.6877928492793933e-05, + "loss": 0.0438, + "step": 40550 + }, + { + "epoch": 0.1978, + "grad_norm": 0.11826013028621674, + "learning_rate": 4.687592786257324e-05, + "loss": 0.0394, + "step": 40560 + }, + { + "epoch": 0.19785, + "grad_norm": 0.11717572808265686, + "learning_rate": 4.687392663427289e-05, + "loss": 0.0408, + "step": 40570 + }, + { + "epoch": 0.1979, + "grad_norm": 0.11580274999141693, + "learning_rate": 4.6871924807947615e-05, + "loss": 0.0412, + "step": 40580 + }, + { + "epoch": 0.19795, + "grad_norm": 0.11562202870845795, + "learning_rate": 4.686992238365212e-05, + "loss": 0.0397, + "step": 40590 + }, + { + "epoch": 0.198, + "grad_norm": 0.09922197461128235, + "learning_rate": 4.6867919361441174e-05, + "loss": 0.0404, + "step": 40600 + }, + { + "epoch": 0.19805, + "grad_norm": 0.10405375063419342, + "learning_rate": 4.6865915741369526e-05, + "loss": 0.0428, + "step": 40610 + }, + { + "epoch": 0.1981, + "grad_norm": 0.12160701304674149, + "learning_rate": 4.6863911523491956e-05, + "loss": 0.0401, + "step": 40620 + }, + { + "epoch": 0.19815, + "grad_norm": 0.12346811592578888, + "learning_rate": 4.6861906707863255e-05, + "loss": 0.0394, + "step": 40630 + }, + { + "epoch": 0.1982, + "grad_norm": 0.12269331514835358, + "learning_rate": 4.6859901294538236e-05, + "loss": 0.0396, + "step": 40640 + }, + { + "epoch": 0.19825, + "grad_norm": 0.11659187078475952, + "learning_rate": 4.685789528357173e-05, + "loss": 0.0417, + "step": 40650 + }, + { + "epoch": 0.1983, + "grad_norm": 0.11113385856151581, + "learning_rate": 4.685588867501858e-05, + "loss": 0.0414, + "step": 40660 + }, + { + "epoch": 0.19835, + "grad_norm": 0.11463132500648499, + "learning_rate": 4.6853881468933645e-05, + "loss": 0.0414, + "step": 40670 + }, + { + "epoch": 0.1984, + "grad_norm": 0.1146690770983696, + "learning_rate": 4.68518736653718e-05, + "loss": 0.0412, + "step": 40680 + }, + { + "epoch": 0.19845, + "grad_norm": 0.11185281723737717, + "learning_rate": 4.6849865264387936e-05, + "loss": 0.0423, + "step": 40690 + }, + { + "epoch": 0.1985, + "grad_norm": 0.11425045132637024, + "learning_rate": 4.684785626603697e-05, + "loss": 0.0438, + "step": 40700 + }, + { + "epoch": 0.19855, + "grad_norm": 0.10859055817127228, + "learning_rate": 4.6845846670373815e-05, + "loss": 0.042, + "step": 40710 + }, + { + "epoch": 0.1986, + "grad_norm": 0.14859603345394135, + "learning_rate": 4.684383647745343e-05, + "loss": 0.0404, + "step": 40720 + }, + { + "epoch": 0.19865, + "grad_norm": 0.12444942444562912, + "learning_rate": 4.684182568733075e-05, + "loss": 0.0455, + "step": 40730 + }, + { + "epoch": 0.1987, + "grad_norm": 0.11537047475576401, + "learning_rate": 4.683981430006077e-05, + "loss": 0.041, + "step": 40740 + }, + { + "epoch": 0.19875, + "grad_norm": 0.11382672190666199, + "learning_rate": 4.683780231569846e-05, + "loss": 0.0402, + "step": 40750 + }, + { + "epoch": 0.1988, + "grad_norm": 0.11329783499240875, + "learning_rate": 4.683578973429885e-05, + "loss": 0.042, + "step": 40760 + }, + { + "epoch": 0.19885, + "grad_norm": 0.10937277972698212, + "learning_rate": 4.683377655591695e-05, + "loss": 0.0404, + "step": 40770 + }, + { + "epoch": 0.1989, + "grad_norm": 0.11984826624393463, + "learning_rate": 4.68317627806078e-05, + "loss": 0.0421, + "step": 40780 + }, + { + "epoch": 0.19895, + "grad_norm": 0.12853102385997772, + "learning_rate": 4.6829748408426454e-05, + "loss": 0.041, + "step": 40790 + }, + { + "epoch": 0.199, + "grad_norm": 0.16592375934123993, + "learning_rate": 4.6827733439428e-05, + "loss": 0.044, + "step": 40800 + }, + { + "epoch": 0.19905, + "grad_norm": 0.12035780400037766, + "learning_rate": 4.682571787366749e-05, + "loss": 0.0422, + "step": 40810 + }, + { + "epoch": 0.1991, + "grad_norm": 0.11223269999027252, + "learning_rate": 4.682370171120008e-05, + "loss": 0.0424, + "step": 40820 + }, + { + "epoch": 0.19915, + "grad_norm": 0.10703121870756149, + "learning_rate": 4.682168495208085e-05, + "loss": 0.0411, + "step": 40830 + }, + { + "epoch": 0.1992, + "grad_norm": 0.12128929793834686, + "learning_rate": 4.681966759636495e-05, + "loss": 0.0417, + "step": 40840 + }, + { + "epoch": 0.19925, + "grad_norm": 0.12266344577074051, + "learning_rate": 4.681764964410754e-05, + "loss": 0.0415, + "step": 40850 + }, + { + "epoch": 0.1993, + "grad_norm": 0.11447182297706604, + "learning_rate": 4.6815631095363785e-05, + "loss": 0.0432, + "step": 40860 + }, + { + "epoch": 0.19935, + "grad_norm": 0.1356915533542633, + "learning_rate": 4.6813611950188874e-05, + "loss": 0.0409, + "step": 40870 + }, + { + "epoch": 0.1994, + "grad_norm": 0.13189062476158142, + "learning_rate": 4.6811592208638e-05, + "loss": 0.0422, + "step": 40880 + }, + { + "epoch": 0.19945, + "grad_norm": 0.1379818320274353, + "learning_rate": 4.68095718707664e-05, + "loss": 0.0432, + "step": 40890 + }, + { + "epoch": 0.1995, + "grad_norm": 0.13345512747764587, + "learning_rate": 4.6807550936629286e-05, + "loss": 0.0405, + "step": 40900 + }, + { + "epoch": 0.19955, + "grad_norm": 0.12089315801858902, + "learning_rate": 4.6805529406281925e-05, + "loss": 0.0435, + "step": 40910 + }, + { + "epoch": 0.1996, + "grad_norm": 0.15447144210338593, + "learning_rate": 4.680350727977959e-05, + "loss": 0.0458, + "step": 40920 + }, + { + "epoch": 0.19965, + "grad_norm": 0.1232617050409317, + "learning_rate": 4.6801484557177546e-05, + "loss": 0.0398, + "step": 40930 + }, + { + "epoch": 0.1997, + "grad_norm": 0.1233721524477005, + "learning_rate": 4.679946123853111e-05, + "loss": 0.0423, + "step": 40940 + }, + { + "epoch": 0.19975, + "grad_norm": 0.121514692902565, + "learning_rate": 4.67974373238956e-05, + "loss": 0.0428, + "step": 40950 + }, + { + "epoch": 0.1998, + "grad_norm": 0.1508697271347046, + "learning_rate": 4.679541281332633e-05, + "loss": 0.0473, + "step": 40960 + }, + { + "epoch": 0.19985, + "grad_norm": 0.12545537948608398, + "learning_rate": 4.679338770687867e-05, + "loss": 0.0423, + "step": 40970 + }, + { + "epoch": 0.1999, + "grad_norm": 0.15301913022994995, + "learning_rate": 4.679136200460798e-05, + "loss": 0.0438, + "step": 40980 + }, + { + "epoch": 0.19995, + "grad_norm": 0.11956794559955597, + "learning_rate": 4.6789335706569635e-05, + "loss": 0.0415, + "step": 40990 + }, + { + "epoch": 0.2, + "grad_norm": 0.11663742363452911, + "learning_rate": 4.678730881281904e-05, + "loss": 0.0434, + "step": 41000 + }, + { + "epoch": 0.20005, + "grad_norm": 0.10982909053564072, + "learning_rate": 4.67852813234116e-05, + "loss": 0.0412, + "step": 41010 + }, + { + "epoch": 0.2001, + "grad_norm": 0.11424484103918076, + "learning_rate": 4.678325323840276e-05, + "loss": 0.0406, + "step": 41020 + }, + { + "epoch": 0.20015, + "grad_norm": 0.125224769115448, + "learning_rate": 4.6781224557847955e-05, + "loss": 0.0409, + "step": 41030 + }, + { + "epoch": 0.2002, + "grad_norm": 0.10726512968540192, + "learning_rate": 4.677919528180266e-05, + "loss": 0.0396, + "step": 41040 + }, + { + "epoch": 0.20025, + "grad_norm": 0.12481468170881271, + "learning_rate": 4.6777165410322344e-05, + "loss": 0.0422, + "step": 41050 + }, + { + "epoch": 0.2003, + "grad_norm": 0.11721587926149368, + "learning_rate": 4.6775134943462504e-05, + "loss": 0.042, + "step": 41060 + }, + { + "epoch": 0.20035, + "grad_norm": 0.1256370097398758, + "learning_rate": 4.6773103881278655e-05, + "loss": 0.043, + "step": 41070 + }, + { + "epoch": 0.2004, + "grad_norm": 0.11127348989248276, + "learning_rate": 4.6771072223826336e-05, + "loss": 0.0403, + "step": 41080 + }, + { + "epoch": 0.20045, + "grad_norm": 0.10558488219976425, + "learning_rate": 4.676903997116107e-05, + "loss": 0.0416, + "step": 41090 + }, + { + "epoch": 0.2005, + "grad_norm": 0.10310178250074387, + "learning_rate": 4.676700712333843e-05, + "loss": 0.0402, + "step": 41100 + }, + { + "epoch": 0.20055, + "grad_norm": 0.1143566370010376, + "learning_rate": 4.6764973680414e-05, + "loss": 0.0415, + "step": 41110 + }, + { + "epoch": 0.2006, + "grad_norm": 0.1191648617386818, + "learning_rate": 4.6762939642443366e-05, + "loss": 0.0416, + "step": 41120 + }, + { + "epoch": 0.20065, + "grad_norm": 0.10538813471794128, + "learning_rate": 4.6760905009482136e-05, + "loss": 0.0409, + "step": 41130 + }, + { + "epoch": 0.2007, + "grad_norm": 0.11145827174186707, + "learning_rate": 4.6758869781585936e-05, + "loss": 0.0424, + "step": 41140 + }, + { + "epoch": 0.20075, + "grad_norm": 0.10465462505817413, + "learning_rate": 4.6756833958810406e-05, + "loss": 0.043, + "step": 41150 + }, + { + "epoch": 0.2008, + "grad_norm": 0.10324224084615707, + "learning_rate": 4.675479754121122e-05, + "loss": 0.0418, + "step": 41160 + }, + { + "epoch": 0.20085, + "grad_norm": 0.1077202707529068, + "learning_rate": 4.675276052884404e-05, + "loss": 0.0418, + "step": 41170 + }, + { + "epoch": 0.2009, + "grad_norm": 0.11636579036712646, + "learning_rate": 4.6750722921764556e-05, + "loss": 0.0408, + "step": 41180 + }, + { + "epoch": 0.20095, + "grad_norm": 0.08367370069026947, + "learning_rate": 4.674868472002848e-05, + "loss": 0.0395, + "step": 41190 + }, + { + "epoch": 0.201, + "grad_norm": 0.10918355733156204, + "learning_rate": 4.674664592369154e-05, + "loss": 0.0419, + "step": 41200 + }, + { + "epoch": 0.20105, + "grad_norm": 0.11053916066884995, + "learning_rate": 4.6744606532809456e-05, + "loss": 0.0446, + "step": 41210 + }, + { + "epoch": 0.2011, + "grad_norm": 0.0952436625957489, + "learning_rate": 4.6742566547438006e-05, + "loss": 0.0414, + "step": 41220 + }, + { + "epoch": 0.20115, + "grad_norm": 0.11707165092229843, + "learning_rate": 4.6740525967632955e-05, + "loss": 0.0427, + "step": 41230 + }, + { + "epoch": 0.2012, + "grad_norm": 0.11166837066411972, + "learning_rate": 4.673848479345009e-05, + "loss": 0.0416, + "step": 41240 + }, + { + "epoch": 0.20125, + "grad_norm": 0.13893011212348938, + "learning_rate": 4.673644302494522e-05, + "loss": 0.0429, + "step": 41250 + }, + { + "epoch": 0.2013, + "grad_norm": 0.11983472853899002, + "learning_rate": 4.6734400662174164e-05, + "loss": 0.0405, + "step": 41260 + }, + { + "epoch": 0.20135, + "grad_norm": 0.12756171822547913, + "learning_rate": 4.673235770519276e-05, + "loss": 0.0413, + "step": 41270 + }, + { + "epoch": 0.2014, + "grad_norm": 0.10711822658777237, + "learning_rate": 4.673031415405686e-05, + "loss": 0.0393, + "step": 41280 + }, + { + "epoch": 0.20145, + "grad_norm": 0.10304751247167587, + "learning_rate": 4.672827000882233e-05, + "loss": 0.0412, + "step": 41290 + }, + { + "epoch": 0.2015, + "grad_norm": 0.1470487266778946, + "learning_rate": 4.672622526954506e-05, + "loss": 0.0415, + "step": 41300 + }, + { + "epoch": 0.20155, + "grad_norm": 0.12710584700107574, + "learning_rate": 4.6724179936280965e-05, + "loss": 0.0395, + "step": 41310 + }, + { + "epoch": 0.2016, + "grad_norm": 0.12694060802459717, + "learning_rate": 4.672213400908595e-05, + "loss": 0.0395, + "step": 41320 + }, + { + "epoch": 0.20165, + "grad_norm": 0.10214198380708694, + "learning_rate": 4.672008748801594e-05, + "loss": 0.0408, + "step": 41330 + }, + { + "epoch": 0.2017, + "grad_norm": 0.10517226159572601, + "learning_rate": 4.67180403731269e-05, + "loss": 0.0407, + "step": 41340 + }, + { + "epoch": 0.20175, + "grad_norm": 0.12303302437067032, + "learning_rate": 4.6715992664474805e-05, + "loss": 0.0406, + "step": 41350 + }, + { + "epoch": 0.2018, + "grad_norm": 0.10291822999715805, + "learning_rate": 4.6713944362115625e-05, + "loss": 0.0399, + "step": 41360 + }, + { + "epoch": 0.20185, + "grad_norm": 0.11955802887678146, + "learning_rate": 4.671189546610536e-05, + "loss": 0.0422, + "step": 41370 + }, + { + "epoch": 0.2019, + "grad_norm": 0.1398523449897766, + "learning_rate": 4.670984597650003e-05, + "loss": 0.0403, + "step": 41380 + }, + { + "epoch": 0.20195, + "grad_norm": 0.11278299987316132, + "learning_rate": 4.6707795893355675e-05, + "loss": 0.0403, + "step": 41390 + }, + { + "epoch": 0.202, + "grad_norm": 0.13503143191337585, + "learning_rate": 4.6705745216728334e-05, + "loss": 0.0399, + "step": 41400 + }, + { + "epoch": 0.20205, + "grad_norm": 0.0978846326470375, + "learning_rate": 4.670369394667407e-05, + "loss": 0.0412, + "step": 41410 + }, + { + "epoch": 0.2021, + "grad_norm": 0.11684785783290863, + "learning_rate": 4.670164208324896e-05, + "loss": 0.0427, + "step": 41420 + }, + { + "epoch": 0.20215, + "grad_norm": 0.12648822367191315, + "learning_rate": 4.669958962650912e-05, + "loss": 0.0409, + "step": 41430 + }, + { + "epoch": 0.2022, + "grad_norm": 0.13909755647182465, + "learning_rate": 4.6697536576510644e-05, + "loss": 0.0412, + "step": 41440 + }, + { + "epoch": 0.20225, + "grad_norm": 0.12606674432754517, + "learning_rate": 4.669548293330967e-05, + "loss": 0.0399, + "step": 41450 + }, + { + "epoch": 0.2023, + "grad_norm": 0.10662326961755753, + "learning_rate": 4.6693428696962344e-05, + "loss": 0.0416, + "step": 41460 + }, + { + "epoch": 0.20235, + "grad_norm": 0.09710852801799774, + "learning_rate": 4.669137386752483e-05, + "loss": 0.0406, + "step": 41470 + }, + { + "epoch": 0.2024, + "grad_norm": 0.1193772554397583, + "learning_rate": 4.66893184450533e-05, + "loss": 0.0417, + "step": 41480 + }, + { + "epoch": 0.20245, + "grad_norm": 0.12737101316452026, + "learning_rate": 4.668726242960395e-05, + "loss": 0.0407, + "step": 41490 + }, + { + "epoch": 0.2025, + "grad_norm": 0.1282685548067093, + "learning_rate": 4.6685205821233e-05, + "loss": 0.0422, + "step": 41500 + }, + { + "epoch": 0.20255, + "grad_norm": 0.11876413971185684, + "learning_rate": 4.668314861999667e-05, + "loss": 0.0416, + "step": 41510 + }, + { + "epoch": 0.2026, + "grad_norm": 0.10370395332574844, + "learning_rate": 4.6681090825951194e-05, + "loss": 0.0424, + "step": 41520 + }, + { + "epoch": 0.20265, + "grad_norm": 0.12113114446401596, + "learning_rate": 4.667903243915285e-05, + "loss": 0.0421, + "step": 41530 + }, + { + "epoch": 0.2027, + "grad_norm": 0.11187760531902313, + "learning_rate": 4.66769734596579e-05, + "loss": 0.0426, + "step": 41540 + }, + { + "epoch": 0.20275, + "grad_norm": 0.11668987572193146, + "learning_rate": 4.667491388752263e-05, + "loss": 0.0423, + "step": 41550 + }, + { + "epoch": 0.2028, + "grad_norm": 0.11089004576206207, + "learning_rate": 4.6672853722803365e-05, + "loss": 0.0426, + "step": 41560 + }, + { + "epoch": 0.20285, + "grad_norm": 0.12049264460802078, + "learning_rate": 4.667079296555642e-05, + "loss": 0.0439, + "step": 41570 + }, + { + "epoch": 0.2029, + "grad_norm": 0.10137578099966049, + "learning_rate": 4.6668731615838144e-05, + "loss": 0.0405, + "step": 41580 + }, + { + "epoch": 0.20295, + "grad_norm": 0.11302396655082703, + "learning_rate": 4.666666967370488e-05, + "loss": 0.0417, + "step": 41590 + }, + { + "epoch": 0.203, + "grad_norm": 0.11328114569187164, + "learning_rate": 4.6664607139213e-05, + "loss": 0.0422, + "step": 41600 + }, + { + "epoch": 0.20305, + "grad_norm": 0.12339196354150772, + "learning_rate": 4.666254401241891e-05, + "loss": 0.0406, + "step": 41610 + }, + { + "epoch": 0.2031, + "grad_norm": 0.09714601933956146, + "learning_rate": 4.6660480293379006e-05, + "loss": 0.0407, + "step": 41620 + }, + { + "epoch": 0.20315, + "grad_norm": 0.13312238454818726, + "learning_rate": 4.66584159821497e-05, + "loss": 0.0432, + "step": 41630 + }, + { + "epoch": 0.2032, + "grad_norm": 0.11992169916629791, + "learning_rate": 4.665635107878744e-05, + "loss": 0.0408, + "step": 41640 + }, + { + "epoch": 0.20325, + "grad_norm": 0.10778101533651352, + "learning_rate": 4.665428558334868e-05, + "loss": 0.0411, + "step": 41650 + }, + { + "epoch": 0.2033, + "grad_norm": 0.12014755606651306, + "learning_rate": 4.665221949588989e-05, + "loss": 0.0446, + "step": 41660 + }, + { + "epoch": 0.20335, + "grad_norm": 0.090846486389637, + "learning_rate": 4.6650152816467545e-05, + "loss": 0.0382, + "step": 41670 + }, + { + "epoch": 0.2034, + "grad_norm": 0.11623945832252502, + "learning_rate": 4.6648085545138164e-05, + "loss": 0.0391, + "step": 41680 + }, + { + "epoch": 0.20345, + "grad_norm": 0.13945305347442627, + "learning_rate": 4.6646017681958254e-05, + "loss": 0.0414, + "step": 41690 + }, + { + "epoch": 0.2035, + "grad_norm": 0.13401776552200317, + "learning_rate": 4.664394922698435e-05, + "loss": 0.041, + "step": 41700 + }, + { + "epoch": 0.20355, + "grad_norm": 0.11550464481115341, + "learning_rate": 4.664188018027301e-05, + "loss": 0.0423, + "step": 41710 + }, + { + "epoch": 0.2036, + "grad_norm": 0.11903069168329239, + "learning_rate": 4.663981054188079e-05, + "loss": 0.04, + "step": 41720 + }, + { + "epoch": 0.20365, + "grad_norm": 0.1341111809015274, + "learning_rate": 4.663774031186429e-05, + "loss": 0.0407, + "step": 41730 + }, + { + "epoch": 0.2037, + "grad_norm": 0.13095654547214508, + "learning_rate": 4.6635669490280085e-05, + "loss": 0.0405, + "step": 41740 + }, + { + "epoch": 0.20375, + "grad_norm": 0.09725949913263321, + "learning_rate": 4.6633598077184815e-05, + "loss": 0.0421, + "step": 41750 + }, + { + "epoch": 0.2038, + "grad_norm": 0.10696309059858322, + "learning_rate": 4.6631526072635095e-05, + "loss": 0.0387, + "step": 41760 + }, + { + "epoch": 0.20385, + "grad_norm": 0.106041319668293, + "learning_rate": 4.662945347668758e-05, + "loss": 0.041, + "step": 41770 + }, + { + "epoch": 0.2039, + "grad_norm": 0.10835712403059006, + "learning_rate": 4.6627380289398936e-05, + "loss": 0.0415, + "step": 41780 + }, + { + "epoch": 0.20395, + "grad_norm": 0.09863156825304031, + "learning_rate": 4.662530651082584e-05, + "loss": 0.0415, + "step": 41790 + }, + { + "epoch": 0.204, + "grad_norm": 0.12385343760251999, + "learning_rate": 4.662323214102499e-05, + "loss": 0.0401, + "step": 41800 + }, + { + "epoch": 0.20405, + "grad_norm": 0.1045091450214386, + "learning_rate": 4.6621157180053085e-05, + "loss": 0.0418, + "step": 41810 + }, + { + "epoch": 0.2041, + "grad_norm": 0.10694091022014618, + "learning_rate": 4.661908162796687e-05, + "loss": 0.0408, + "step": 41820 + }, + { + "epoch": 0.20415, + "grad_norm": 0.11447501182556152, + "learning_rate": 4.661700548482309e-05, + "loss": 0.0402, + "step": 41830 + }, + { + "epoch": 0.2042, + "grad_norm": 0.0982501357793808, + "learning_rate": 4.66149287506785e-05, + "loss": 0.0416, + "step": 41840 + }, + { + "epoch": 0.20425, + "grad_norm": 0.16061526536941528, + "learning_rate": 4.6612851425589876e-05, + "loss": 0.0441, + "step": 41850 + }, + { + "epoch": 0.2043, + "grad_norm": 0.11797483265399933, + "learning_rate": 4.6610773509614016e-05, + "loss": 0.0397, + "step": 41860 + }, + { + "epoch": 0.20435, + "grad_norm": 0.11120536178350449, + "learning_rate": 4.6608695002807724e-05, + "loss": 0.0418, + "step": 41870 + }, + { + "epoch": 0.2044, + "grad_norm": 0.10933781415224075, + "learning_rate": 4.6606615905227834e-05, + "loss": 0.042, + "step": 41880 + }, + { + "epoch": 0.20445, + "grad_norm": 0.10738882422447205, + "learning_rate": 4.6604536216931185e-05, + "loss": 0.0412, + "step": 41890 + }, + { + "epoch": 0.2045, + "grad_norm": 0.1318594068288803, + "learning_rate": 4.660245593797462e-05, + "loss": 0.0395, + "step": 41900 + }, + { + "epoch": 0.20455, + "grad_norm": 0.13262218236923218, + "learning_rate": 4.6600375068415034e-05, + "loss": 0.0407, + "step": 41910 + }, + { + "epoch": 0.2046, + "grad_norm": 0.10908342152833939, + "learning_rate": 4.6598293608309306e-05, + "loss": 0.0394, + "step": 41920 + }, + { + "epoch": 0.20465, + "grad_norm": 0.12204093486070633, + "learning_rate": 4.659621155771434e-05, + "loss": 0.0412, + "step": 41930 + }, + { + "epoch": 0.2047, + "grad_norm": 0.1055048406124115, + "learning_rate": 4.6594128916687074e-05, + "loss": 0.041, + "step": 41940 + }, + { + "epoch": 0.20475, + "grad_norm": 0.12463721632957458, + "learning_rate": 4.659204568528443e-05, + "loss": 0.0419, + "step": 41950 + }, + { + "epoch": 0.2048, + "grad_norm": 0.12951551377773285, + "learning_rate": 4.658996186356337e-05, + "loss": 0.0409, + "step": 41960 + }, + { + "epoch": 0.20485, + "grad_norm": 0.11702567338943481, + "learning_rate": 4.658787745158086e-05, + "loss": 0.0428, + "step": 41970 + }, + { + "epoch": 0.2049, + "grad_norm": 0.09821664541959763, + "learning_rate": 4.6585792449393894e-05, + "loss": 0.0421, + "step": 41980 + }, + { + "epoch": 0.20495, + "grad_norm": 0.1046162024140358, + "learning_rate": 4.6583706857059475e-05, + "loss": 0.0408, + "step": 41990 + }, + { + "epoch": 0.205, + "grad_norm": 0.13040538132190704, + "learning_rate": 4.658162067463461e-05, + "loss": 0.0397, + "step": 42000 + }, + { + "epoch": 0.20505, + "grad_norm": 0.1357438713312149, + "learning_rate": 4.657953390217635e-05, + "loss": 0.0398, + "step": 42010 + }, + { + "epoch": 0.2051, + "grad_norm": 0.13346247375011444, + "learning_rate": 4.6577446539741745e-05, + "loss": 0.0406, + "step": 42020 + }, + { + "epoch": 0.20515, + "grad_norm": 0.12445792555809021, + "learning_rate": 4.657535858738785e-05, + "loss": 0.0404, + "step": 42030 + }, + { + "epoch": 0.2052, + "grad_norm": 0.1608707159757614, + "learning_rate": 4.657327004517176e-05, + "loss": 0.041, + "step": 42040 + }, + { + "epoch": 0.20525, + "grad_norm": 0.12387620657682419, + "learning_rate": 4.657118091315057e-05, + "loss": 0.0402, + "step": 42050 + }, + { + "epoch": 0.2053, + "grad_norm": 0.12562602758407593, + "learning_rate": 4.65690911913814e-05, + "loss": 0.0407, + "step": 42060 + }, + { + "epoch": 0.20535, + "grad_norm": 0.12161588668823242, + "learning_rate": 4.6567000879921376e-05, + "loss": 0.0411, + "step": 42070 + }, + { + "epoch": 0.2054, + "grad_norm": 0.10995645076036453, + "learning_rate": 4.656490997882765e-05, + "loss": 0.0415, + "step": 42080 + }, + { + "epoch": 0.20545, + "grad_norm": 0.09514041990041733, + "learning_rate": 4.656281848815739e-05, + "loss": 0.0406, + "step": 42090 + }, + { + "epoch": 0.2055, + "grad_norm": 0.12755118310451508, + "learning_rate": 4.656072640796777e-05, + "loss": 0.0428, + "step": 42100 + }, + { + "epoch": 0.20555, + "grad_norm": 0.14956414699554443, + "learning_rate": 4.655863373831599e-05, + "loss": 0.0404, + "step": 42110 + }, + { + "epoch": 0.2056, + "grad_norm": 0.11889217048883438, + "learning_rate": 4.655654047925927e-05, + "loss": 0.0423, + "step": 42120 + }, + { + "epoch": 0.20565, + "grad_norm": 0.10934683680534363, + "learning_rate": 4.6554446630854833e-05, + "loss": 0.0406, + "step": 42130 + }, + { + "epoch": 0.2057, + "grad_norm": 0.14071911573410034, + "learning_rate": 4.655235219315991e-05, + "loss": 0.0445, + "step": 42140 + }, + { + "epoch": 0.20575, + "grad_norm": 0.1343953013420105, + "learning_rate": 4.6550257166231784e-05, + "loss": 0.0414, + "step": 42150 + }, + { + "epoch": 0.2058, + "grad_norm": 0.1337887942790985, + "learning_rate": 4.654816155012772e-05, + "loss": 0.0411, + "step": 42160 + }, + { + "epoch": 0.20585, + "grad_norm": 0.10310254991054535, + "learning_rate": 4.6546065344905015e-05, + "loss": 0.0416, + "step": 42170 + }, + { + "epoch": 0.2059, + "grad_norm": 0.12980087101459503, + "learning_rate": 4.654396855062098e-05, + "loss": 0.0419, + "step": 42180 + }, + { + "epoch": 0.20595, + "grad_norm": 0.1000036969780922, + "learning_rate": 4.6541871167332934e-05, + "loss": 0.0403, + "step": 42190 + }, + { + "epoch": 0.206, + "grad_norm": 0.11595522612333298, + "learning_rate": 4.653977319509822e-05, + "loss": 0.0432, + "step": 42200 + }, + { + "epoch": 0.20605, + "grad_norm": 0.12469027191400528, + "learning_rate": 4.653767463397421e-05, + "loss": 0.0438, + "step": 42210 + }, + { + "epoch": 0.2061, + "grad_norm": 0.13064132630825043, + "learning_rate": 4.653557548401827e-05, + "loss": 0.0413, + "step": 42220 + }, + { + "epoch": 0.20615, + "grad_norm": 0.13942815363407135, + "learning_rate": 4.653347574528777e-05, + "loss": 0.0429, + "step": 42230 + }, + { + "epoch": 0.2062, + "grad_norm": 0.11371278017759323, + "learning_rate": 4.6531375417840145e-05, + "loss": 0.0411, + "step": 42240 + }, + { + "epoch": 0.20625, + "grad_norm": 0.12325238436460495, + "learning_rate": 4.65292745017328e-05, + "loss": 0.04, + "step": 42250 + }, + { + "epoch": 0.2063, + "grad_norm": 0.13593898713588715, + "learning_rate": 4.6527172997023184e-05, + "loss": 0.042, + "step": 42260 + }, + { + "epoch": 0.20635, + "grad_norm": 0.14073550701141357, + "learning_rate": 4.652507090376874e-05, + "loss": 0.0419, + "step": 42270 + }, + { + "epoch": 0.2064, + "grad_norm": 0.13174958527088165, + "learning_rate": 4.652296822202694e-05, + "loss": 0.0413, + "step": 42280 + }, + { + "epoch": 0.20645, + "grad_norm": 0.13833463191986084, + "learning_rate": 4.652086495185528e-05, + "loss": 0.0427, + "step": 42290 + }, + { + "epoch": 0.2065, + "grad_norm": 0.10557810217142105, + "learning_rate": 4.6518761093311256e-05, + "loss": 0.0453, + "step": 42300 + }, + { + "epoch": 0.20655, + "grad_norm": 0.11190221458673477, + "learning_rate": 4.6516656646452395e-05, + "loss": 0.0405, + "step": 42310 + }, + { + "epoch": 0.2066, + "grad_norm": 0.1107366532087326, + "learning_rate": 4.651455161133622e-05, + "loss": 0.0416, + "step": 42320 + }, + { + "epoch": 0.20665, + "grad_norm": 0.10672711580991745, + "learning_rate": 4.651244598802028e-05, + "loss": 0.0435, + "step": 42330 + }, + { + "epoch": 0.2067, + "grad_norm": 0.11309855431318283, + "learning_rate": 4.651033977656216e-05, + "loss": 0.0419, + "step": 42340 + }, + { + "epoch": 0.20675, + "grad_norm": 0.11873457580804825, + "learning_rate": 4.650823297701942e-05, + "loss": 0.0413, + "step": 42350 + }, + { + "epoch": 0.2068, + "grad_norm": 0.12466181814670563, + "learning_rate": 4.650612558944968e-05, + "loss": 0.0423, + "step": 42360 + }, + { + "epoch": 0.20685, + "grad_norm": 0.1014702096581459, + "learning_rate": 4.650401761391054e-05, + "loss": 0.0427, + "step": 42370 + }, + { + "epoch": 0.2069, + "grad_norm": 0.12159010767936707, + "learning_rate": 4.6501909050459644e-05, + "loss": 0.0421, + "step": 42380 + }, + { + "epoch": 0.20695, + "grad_norm": 0.1161002367734909, + "learning_rate": 4.649979989915463e-05, + "loss": 0.0453, + "step": 42390 + }, + { + "epoch": 0.207, + "grad_norm": 0.16336022317409515, + "learning_rate": 4.649769016005316e-05, + "loss": 0.0422, + "step": 42400 + }, + { + "epoch": 0.20705, + "grad_norm": 0.11819625645875931, + "learning_rate": 4.649557983321292e-05, + "loss": 0.0401, + "step": 42410 + }, + { + "epoch": 0.2071, + "grad_norm": 0.15638066828250885, + "learning_rate": 4.649346891869159e-05, + "loss": 0.0399, + "step": 42420 + }, + { + "epoch": 0.20715, + "grad_norm": 0.1084650382399559, + "learning_rate": 4.649135741654691e-05, + "loss": 0.0399, + "step": 42430 + }, + { + "epoch": 0.2072, + "grad_norm": 0.12430199235677719, + "learning_rate": 4.648924532683659e-05, + "loss": 0.0405, + "step": 42440 + }, + { + "epoch": 0.20725, + "grad_norm": 0.11965826898813248, + "learning_rate": 4.648713264961838e-05, + "loss": 0.042, + "step": 42450 + }, + { + "epoch": 0.2073, + "grad_norm": 0.11596711724996567, + "learning_rate": 4.648501938495003e-05, + "loss": 0.0405, + "step": 42460 + }, + { + "epoch": 0.20735, + "grad_norm": 0.1251993179321289, + "learning_rate": 4.648290553288932e-05, + "loss": 0.0405, + "step": 42470 + }, + { + "epoch": 0.2074, + "grad_norm": 0.10778885334730148, + "learning_rate": 4.6480791093494046e-05, + "loss": 0.0422, + "step": 42480 + }, + { + "epoch": 0.20745, + "grad_norm": 0.1158812865614891, + "learning_rate": 4.6478676066822016e-05, + "loss": 0.041, + "step": 42490 + }, + { + "epoch": 0.2075, + "grad_norm": 0.107663094997406, + "learning_rate": 4.647656045293104e-05, + "loss": 0.0403, + "step": 42500 + }, + { + "epoch": 0.20755, + "grad_norm": 0.09674108773469925, + "learning_rate": 4.647444425187898e-05, + "loss": 0.0409, + "step": 42510 + }, + { + "epoch": 0.2076, + "grad_norm": 0.10285669565200806, + "learning_rate": 4.6472327463723684e-05, + "loss": 0.0389, + "step": 42520 + }, + { + "epoch": 0.20765, + "grad_norm": 0.09745500981807709, + "learning_rate": 4.6470210088523015e-05, + "loss": 0.0398, + "step": 42530 + }, + { + "epoch": 0.2077, + "grad_norm": 0.11266753077507019, + "learning_rate": 4.646809212633487e-05, + "loss": 0.0416, + "step": 42540 + }, + { + "epoch": 0.20775, + "grad_norm": 0.13525390625, + "learning_rate": 4.6465973577217146e-05, + "loss": 0.0421, + "step": 42550 + }, + { + "epoch": 0.2078, + "grad_norm": 0.14094169437885284, + "learning_rate": 4.6463854441227785e-05, + "loss": 0.0413, + "step": 42560 + }, + { + "epoch": 0.20785, + "grad_norm": 0.10353077203035355, + "learning_rate": 4.6461734718424685e-05, + "loss": 0.0402, + "step": 42570 + }, + { + "epoch": 0.2079, + "grad_norm": 0.11642540246248245, + "learning_rate": 4.6459614408865836e-05, + "loss": 0.0399, + "step": 42580 + }, + { + "epoch": 0.20795, + "grad_norm": 0.13795001804828644, + "learning_rate": 4.645749351260919e-05, + "loss": 0.0413, + "step": 42590 + }, + { + "epoch": 0.208, + "grad_norm": 0.1193564310669899, + "learning_rate": 4.645537202971273e-05, + "loss": 0.0394, + "step": 42600 + }, + { + "epoch": 0.20805, + "grad_norm": 0.09946589171886444, + "learning_rate": 4.645324996023446e-05, + "loss": 0.0408, + "step": 42610 + }, + { + "epoch": 0.2081, + "grad_norm": 0.12816078960895538, + "learning_rate": 4.64511273042324e-05, + "loss": 0.0407, + "step": 42620 + }, + { + "epoch": 0.20815, + "grad_norm": 0.11315611749887466, + "learning_rate": 4.6449004061764565e-05, + "loss": 0.0398, + "step": 42630 + }, + { + "epoch": 0.2082, + "grad_norm": 0.10693423449993134, + "learning_rate": 4.644688023288903e-05, + "loss": 0.0434, + "step": 42640 + }, + { + "epoch": 0.20825, + "grad_norm": 0.09122775495052338, + "learning_rate": 4.6444755817663845e-05, + "loss": 0.0405, + "step": 42650 + }, + { + "epoch": 0.2083, + "grad_norm": 0.1261245608329773, + "learning_rate": 4.644263081614708e-05, + "loss": 0.04, + "step": 42660 + }, + { + "epoch": 0.20835, + "grad_norm": 0.10321059823036194, + "learning_rate": 4.6440505228396855e-05, + "loss": 0.0402, + "step": 42670 + }, + { + "epoch": 0.2084, + "grad_norm": 0.10114654153585434, + "learning_rate": 4.6438379054471274e-05, + "loss": 0.0422, + "step": 42680 + }, + { + "epoch": 0.20845, + "grad_norm": 0.10504954308271408, + "learning_rate": 4.643625229442846e-05, + "loss": 0.0415, + "step": 42690 + }, + { + "epoch": 0.2085, + "grad_norm": 0.11723367124795914, + "learning_rate": 4.6434124948326564e-05, + "loss": 0.0415, + "step": 42700 + }, + { + "epoch": 0.20855, + "grad_norm": 0.11996826529502869, + "learning_rate": 4.643199701622374e-05, + "loss": 0.0427, + "step": 42710 + }, + { + "epoch": 0.2086, + "grad_norm": 0.11167429387569427, + "learning_rate": 4.642986849817817e-05, + "loss": 0.0403, + "step": 42720 + }, + { + "epoch": 0.20865, + "grad_norm": 0.13511034846305847, + "learning_rate": 4.6427739394248046e-05, + "loss": 0.0419, + "step": 42730 + }, + { + "epoch": 0.2087, + "grad_norm": 0.12214132398366928, + "learning_rate": 4.642560970449158e-05, + "loss": 0.0421, + "step": 42740 + }, + { + "epoch": 0.20875, + "grad_norm": 0.1669897884130478, + "learning_rate": 4.642347942896699e-05, + "loss": 0.0427, + "step": 42750 + }, + { + "epoch": 0.2088, + "grad_norm": 0.12708796560764313, + "learning_rate": 4.642134856773253e-05, + "loss": 0.0395, + "step": 42760 + }, + { + "epoch": 0.20885, + "grad_norm": 0.13688746094703674, + "learning_rate": 4.641921712084644e-05, + "loss": 0.044, + "step": 42770 + }, + { + "epoch": 0.2089, + "grad_norm": 0.13669873774051666, + "learning_rate": 4.6417085088366996e-05, + "loss": 0.0404, + "step": 42780 + }, + { + "epoch": 0.20895, + "grad_norm": 0.13903357088565826, + "learning_rate": 4.6414952470352494e-05, + "loss": 0.0403, + "step": 42790 + }, + { + "epoch": 0.209, + "grad_norm": 0.13542260229587555, + "learning_rate": 4.641281926686124e-05, + "loss": 0.0429, + "step": 42800 + }, + { + "epoch": 0.20905, + "grad_norm": 0.14414159953594208, + "learning_rate": 4.641068547795155e-05, + "loss": 0.0428, + "step": 42810 + }, + { + "epoch": 0.2091, + "grad_norm": 0.09341292828321457, + "learning_rate": 4.640855110368177e-05, + "loss": 0.0393, + "step": 42820 + }, + { + "epoch": 0.20915, + "grad_norm": 0.13524645566940308, + "learning_rate": 4.6406416144110236e-05, + "loss": 0.0424, + "step": 42830 + }, + { + "epoch": 0.2092, + "grad_norm": 0.11175638437271118, + "learning_rate": 4.640428059929534e-05, + "loss": 0.0441, + "step": 42840 + }, + { + "epoch": 0.20925, + "grad_norm": 0.10336706787347794, + "learning_rate": 4.640214446929544e-05, + "loss": 0.0397, + "step": 42850 + }, + { + "epoch": 0.2093, + "grad_norm": 0.09097188711166382, + "learning_rate": 4.640000775416895e-05, + "loss": 0.0405, + "step": 42860 + }, + { + "epoch": 0.20935, + "grad_norm": 0.10464566200971603, + "learning_rate": 4.639787045397429e-05, + "loss": 0.04, + "step": 42870 + }, + { + "epoch": 0.2094, + "grad_norm": 0.10859711468219757, + "learning_rate": 4.639573256876989e-05, + "loss": 0.0425, + "step": 42880 + }, + { + "epoch": 0.20945, + "grad_norm": 0.11505578458309174, + "learning_rate": 4.6393594098614204e-05, + "loss": 0.0403, + "step": 42890 + }, + { + "epoch": 0.2095, + "grad_norm": 0.11063778400421143, + "learning_rate": 4.63914550435657e-05, + "loss": 0.0438, + "step": 42900 + }, + { + "epoch": 0.20955, + "grad_norm": 0.11395706981420517, + "learning_rate": 4.6389315403682846e-05, + "loss": 0.0404, + "step": 42910 + }, + { + "epoch": 0.2096, + "grad_norm": 0.11956844478845596, + "learning_rate": 4.6387175179024134e-05, + "loss": 0.0417, + "step": 42920 + }, + { + "epoch": 0.20965, + "grad_norm": 0.11688018590211868, + "learning_rate": 4.6385034369648096e-05, + "loss": 0.0401, + "step": 42930 + }, + { + "epoch": 0.2097, + "grad_norm": 0.10507351905107498, + "learning_rate": 4.6382892975613244e-05, + "loss": 0.0404, + "step": 42940 + }, + { + "epoch": 0.20975, + "grad_norm": 0.10709551721811295, + "learning_rate": 4.638075099697814e-05, + "loss": 0.042, + "step": 42950 + }, + { + "epoch": 0.2098, + "grad_norm": 0.1598290354013443, + "learning_rate": 4.6378608433801336e-05, + "loss": 0.0445, + "step": 42960 + }, + { + "epoch": 0.20985, + "grad_norm": 0.12490461766719818, + "learning_rate": 4.637646528614141e-05, + "loss": 0.0432, + "step": 42970 + }, + { + "epoch": 0.2099, + "grad_norm": 0.13051536679267883, + "learning_rate": 4.637432155405694e-05, + "loss": 0.0441, + "step": 42980 + }, + { + "epoch": 0.20995, + "grad_norm": 0.11199908703565598, + "learning_rate": 4.6372177237606565e-05, + "loss": 0.0405, + "step": 42990 + }, + { + "epoch": 0.21, + "grad_norm": 0.13808445632457733, + "learning_rate": 4.637003233684889e-05, + "loss": 0.0419, + "step": 43000 + }, + { + "epoch": 0.21005, + "grad_norm": 0.163025364279747, + "learning_rate": 4.636788685184256e-05, + "loss": 0.0421, + "step": 43010 + }, + { + "epoch": 0.2101, + "grad_norm": 0.11270739883184433, + "learning_rate": 4.636574078264623e-05, + "loss": 0.0432, + "step": 43020 + }, + { + "epoch": 0.21015, + "grad_norm": 0.15009522438049316, + "learning_rate": 4.636359412931857e-05, + "loss": 0.0429, + "step": 43030 + }, + { + "epoch": 0.2102, + "grad_norm": 0.13547831773757935, + "learning_rate": 4.636144689191827e-05, + "loss": 0.0415, + "step": 43040 + }, + { + "epoch": 0.21025, + "grad_norm": 0.1354401558637619, + "learning_rate": 4.635929907050404e-05, + "loss": 0.0419, + "step": 43050 + }, + { + "epoch": 0.2103, + "grad_norm": 0.11277691274881363, + "learning_rate": 4.63571506651346e-05, + "loss": 0.0428, + "step": 43060 + }, + { + "epoch": 0.21035, + "grad_norm": 0.134175643324852, + "learning_rate": 4.635500167586868e-05, + "loss": 0.0418, + "step": 43070 + }, + { + "epoch": 0.2104, + "grad_norm": 0.11299371719360352, + "learning_rate": 4.635285210276504e-05, + "loss": 0.04, + "step": 43080 + }, + { + "epoch": 0.21045, + "grad_norm": 0.11659647524356842, + "learning_rate": 4.635070194588245e-05, + "loss": 0.0437, + "step": 43090 + }, + { + "epoch": 0.2105, + "grad_norm": 0.11916449666023254, + "learning_rate": 4.6348551205279686e-05, + "loss": 0.0405, + "step": 43100 + }, + { + "epoch": 0.21055, + "grad_norm": 0.12605997920036316, + "learning_rate": 4.634639988101555e-05, + "loss": 0.0406, + "step": 43110 + }, + { + "epoch": 0.2106, + "grad_norm": 0.1151721179485321, + "learning_rate": 4.6344247973148866e-05, + "loss": 0.0438, + "step": 43120 + }, + { + "epoch": 0.21065, + "grad_norm": 0.1175384446978569, + "learning_rate": 4.634209548173846e-05, + "loss": 0.0433, + "step": 43130 + }, + { + "epoch": 0.2107, + "grad_norm": 0.10693112015724182, + "learning_rate": 4.6339942406843174e-05, + "loss": 0.0393, + "step": 43140 + }, + { + "epoch": 0.21075, + "grad_norm": 0.09275225549936295, + "learning_rate": 4.6337788748521886e-05, + "loss": 0.0397, + "step": 43150 + }, + { + "epoch": 0.2108, + "grad_norm": 0.104120172560215, + "learning_rate": 4.633563450683347e-05, + "loss": 0.0395, + "step": 43160 + }, + { + "epoch": 0.21085, + "grad_norm": 0.11659583449363708, + "learning_rate": 4.6333479681836825e-05, + "loss": 0.0399, + "step": 43170 + }, + { + "epoch": 0.2109, + "grad_norm": 0.11879605799913406, + "learning_rate": 4.633132427359085e-05, + "loss": 0.0418, + "step": 43180 + }, + { + "epoch": 0.21095, + "grad_norm": 0.1126229539513588, + "learning_rate": 4.632916828215449e-05, + "loss": 0.0412, + "step": 43190 + }, + { + "epoch": 0.211, + "grad_norm": 0.1250193566083908, + "learning_rate": 4.632701170758668e-05, + "loss": 0.0417, + "step": 43200 + }, + { + "epoch": 0.21105, + "grad_norm": 0.10943544656038284, + "learning_rate": 4.632485454994638e-05, + "loss": 0.0392, + "step": 43210 + }, + { + "epoch": 0.2111, + "grad_norm": 0.10515572130680084, + "learning_rate": 4.632269680929257e-05, + "loss": 0.0413, + "step": 43220 + }, + { + "epoch": 0.21115, + "grad_norm": 0.1227058544754982, + "learning_rate": 4.632053848568425e-05, + "loss": 0.0443, + "step": 43230 + }, + { + "epoch": 0.2112, + "grad_norm": 0.12968966364860535, + "learning_rate": 4.6318379579180404e-05, + "loss": 0.04, + "step": 43240 + }, + { + "epoch": 0.21125, + "grad_norm": 0.13288845121860504, + "learning_rate": 4.631622008984007e-05, + "loss": 0.0412, + "step": 43250 + }, + { + "epoch": 0.2113, + "grad_norm": 0.12562234699726105, + "learning_rate": 4.6314060017722296e-05, + "loss": 0.0422, + "step": 43260 + }, + { + "epoch": 0.21135, + "grad_norm": 0.11604010313749313, + "learning_rate": 4.631189936288612e-05, + "loss": 0.0404, + "step": 43270 + }, + { + "epoch": 0.2114, + "grad_norm": 0.10224141925573349, + "learning_rate": 4.630973812539063e-05, + "loss": 0.0398, + "step": 43280 + }, + { + "epoch": 0.21145, + "grad_norm": 0.11876539885997772, + "learning_rate": 4.63075763052949e-05, + "loss": 0.0416, + "step": 43290 + }, + { + "epoch": 0.2115, + "grad_norm": 0.105568528175354, + "learning_rate": 4.6305413902658036e-05, + "loss": 0.0395, + "step": 43300 + }, + { + "epoch": 0.21155, + "grad_norm": 0.11702471971511841, + "learning_rate": 4.630325091753917e-05, + "loss": 0.0407, + "step": 43310 + }, + { + "epoch": 0.2116, + "grad_norm": 0.10368052870035172, + "learning_rate": 4.6301087349997416e-05, + "loss": 0.0417, + "step": 43320 + }, + { + "epoch": 0.21165, + "grad_norm": 0.11167798191308975, + "learning_rate": 4.629892320009194e-05, + "loss": 0.0426, + "step": 43330 + }, + { + "epoch": 0.2117, + "grad_norm": 0.12406478822231293, + "learning_rate": 4.62967584678819e-05, + "loss": 0.0407, + "step": 43340 + }, + { + "epoch": 0.21175, + "grad_norm": 0.11944945901632309, + "learning_rate": 4.6294593153426496e-05, + "loss": 0.0407, + "step": 43350 + }, + { + "epoch": 0.2118, + "grad_norm": 0.1270545870065689, + "learning_rate": 4.629242725678491e-05, + "loss": 0.0405, + "step": 43360 + }, + { + "epoch": 0.21185, + "grad_norm": 0.12933006882667542, + "learning_rate": 4.629026077801636e-05, + "loss": 0.0397, + "step": 43370 + }, + { + "epoch": 0.2119, + "grad_norm": 0.11296647787094116, + "learning_rate": 4.628809371718008e-05, + "loss": 0.0425, + "step": 43380 + }, + { + "epoch": 0.21195, + "grad_norm": 0.11964956670999527, + "learning_rate": 4.6285926074335315e-05, + "loss": 0.0409, + "step": 43390 + }, + { + "epoch": 0.212, + "grad_norm": 0.1273273080587387, + "learning_rate": 4.628375784954133e-05, + "loss": 0.0403, + "step": 43400 + }, + { + "epoch": 0.21205, + "grad_norm": 0.09842070192098618, + "learning_rate": 4.62815890428574e-05, + "loss": 0.0395, + "step": 43410 + }, + { + "epoch": 0.2121, + "grad_norm": 0.10548171401023865, + "learning_rate": 4.627941965434281e-05, + "loss": 0.043, + "step": 43420 + }, + { + "epoch": 0.21215, + "grad_norm": 0.1085827574133873, + "learning_rate": 4.62772496840569e-05, + "loss": 0.0413, + "step": 43430 + }, + { + "epoch": 0.2122, + "grad_norm": 0.11527638882398605, + "learning_rate": 4.627507913205897e-05, + "loss": 0.0391, + "step": 43440 + }, + { + "epoch": 0.21225, + "grad_norm": 0.10231008380651474, + "learning_rate": 4.627290799840837e-05, + "loss": 0.0396, + "step": 43450 + }, + { + "epoch": 0.2123, + "grad_norm": 0.1240062266588211, + "learning_rate": 4.627073628316445e-05, + "loss": 0.0395, + "step": 43460 + }, + { + "epoch": 0.21235, + "grad_norm": 0.11377868801355362, + "learning_rate": 4.6268563986386596e-05, + "loss": 0.0436, + "step": 43470 + }, + { + "epoch": 0.2124, + "grad_norm": 0.1008082777261734, + "learning_rate": 4.6266391108134195e-05, + "loss": 0.0411, + "step": 43480 + }, + { + "epoch": 0.21245, + "grad_norm": 0.1036459431052208, + "learning_rate": 4.626421764846665e-05, + "loss": 0.0405, + "step": 43490 + }, + { + "epoch": 0.2125, + "grad_norm": 0.11455801874399185, + "learning_rate": 4.626204360744338e-05, + "loss": 0.0409, + "step": 43500 + }, + { + "epoch": 0.21255, + "grad_norm": 0.12558279931545258, + "learning_rate": 4.625986898512382e-05, + "loss": 0.0428, + "step": 43510 + }, + { + "epoch": 0.2126, + "grad_norm": 0.11081899702548981, + "learning_rate": 4.625769378156744e-05, + "loss": 0.0427, + "step": 43520 + }, + { + "epoch": 0.21265, + "grad_norm": 0.11352989077568054, + "learning_rate": 4.6255517996833696e-05, + "loss": 0.0404, + "step": 43530 + }, + { + "epoch": 0.2127, + "grad_norm": 0.11873759329319, + "learning_rate": 4.6253341630982075e-05, + "loss": 0.0397, + "step": 43540 + }, + { + "epoch": 0.21275, + "grad_norm": 0.1015138179063797, + "learning_rate": 4.6251164684072065e-05, + "loss": 0.0426, + "step": 43550 + }, + { + "epoch": 0.2128, + "grad_norm": 0.12051185220479965, + "learning_rate": 4.624898715616322e-05, + "loss": 0.0411, + "step": 43560 + }, + { + "epoch": 0.21285, + "grad_norm": 0.11388301849365234, + "learning_rate": 4.6246809047315034e-05, + "loss": 0.0407, + "step": 43570 + }, + { + "epoch": 0.2129, + "grad_norm": 0.09138743579387665, + "learning_rate": 4.624463035758707e-05, + "loss": 0.04, + "step": 43580 + }, + { + "epoch": 0.21295, + "grad_norm": 0.14637517929077148, + "learning_rate": 4.62424510870389e-05, + "loss": 0.0454, + "step": 43590 + }, + { + "epoch": 0.213, + "grad_norm": 0.11419618129730225, + "learning_rate": 4.6240271235730095e-05, + "loss": 0.0408, + "step": 43600 + }, + { + "epoch": 0.21305, + "grad_norm": 0.1109025701880455, + "learning_rate": 4.623809080372025e-05, + "loss": 0.0409, + "step": 43610 + }, + { + "epoch": 0.2131, + "grad_norm": 0.10209771245718002, + "learning_rate": 4.6235909791068986e-05, + "loss": 0.0412, + "step": 43620 + }, + { + "epoch": 0.21315, + "grad_norm": 0.09903377294540405, + "learning_rate": 4.623372819783592e-05, + "loss": 0.0411, + "step": 43630 + }, + { + "epoch": 0.2132, + "grad_norm": 0.10271365940570831, + "learning_rate": 4.623154602408071e-05, + "loss": 0.0405, + "step": 43640 + }, + { + "epoch": 0.21325, + "grad_norm": 0.09763599932193756, + "learning_rate": 4.622936326986301e-05, + "loss": 0.0442, + "step": 43650 + }, + { + "epoch": 0.2133, + "grad_norm": 0.10397458076477051, + "learning_rate": 4.622717993524249e-05, + "loss": 0.0424, + "step": 43660 + }, + { + "epoch": 0.21335, + "grad_norm": 0.10201514512300491, + "learning_rate": 4.6224996020278844e-05, + "loss": 0.04, + "step": 43670 + }, + { + "epoch": 0.2134, + "grad_norm": 0.10763736069202423, + "learning_rate": 4.622281152503177e-05, + "loss": 0.0406, + "step": 43680 + }, + { + "epoch": 0.21345, + "grad_norm": 0.1229335218667984, + "learning_rate": 4.622062644956102e-05, + "loss": 0.0441, + "step": 43690 + }, + { + "epoch": 0.2135, + "grad_norm": 0.10760986804962158, + "learning_rate": 4.621844079392631e-05, + "loss": 0.0419, + "step": 43700 + }, + { + "epoch": 0.21355, + "grad_norm": 0.10992319136857986, + "learning_rate": 4.6216254558187395e-05, + "loss": 0.0389, + "step": 43710 + }, + { + "epoch": 0.2136, + "grad_norm": 0.10541414469480515, + "learning_rate": 4.6214067742404055e-05, + "loss": 0.0406, + "step": 43720 + }, + { + "epoch": 0.21365, + "grad_norm": 0.111421599984169, + "learning_rate": 4.621188034663607e-05, + "loss": 0.0406, + "step": 43730 + }, + { + "epoch": 0.2137, + "grad_norm": 0.10435806214809418, + "learning_rate": 4.620969237094325e-05, + "loss": 0.0391, + "step": 43740 + }, + { + "epoch": 0.21375, + "grad_norm": 0.12230052053928375, + "learning_rate": 4.62075038153854e-05, + "loss": 0.0418, + "step": 43750 + }, + { + "epoch": 0.2138, + "grad_norm": 0.11830747127532959, + "learning_rate": 4.620531468002237e-05, + "loss": 0.0402, + "step": 43760 + }, + { + "epoch": 0.21385, + "grad_norm": 0.1271902620792389, + "learning_rate": 4.6203124964914005e-05, + "loss": 0.04, + "step": 43770 + }, + { + "epoch": 0.2139, + "grad_norm": 0.11162140220403671, + "learning_rate": 4.620093467012017e-05, + "loss": 0.0406, + "step": 43780 + }, + { + "epoch": 0.21395, + "grad_norm": 0.11397633701562881, + "learning_rate": 4.619874379570074e-05, + "loss": 0.0404, + "step": 43790 + }, + { + "epoch": 0.214, + "grad_norm": 0.11327353864908218, + "learning_rate": 4.6196552341715615e-05, + "loss": 0.0412, + "step": 43800 + }, + { + "epoch": 0.21405, + "grad_norm": 0.10754092037677765, + "learning_rate": 4.6194360308224715e-05, + "loss": 0.0393, + "step": 43810 + }, + { + "epoch": 0.2141, + "grad_norm": 0.10313699394464493, + "learning_rate": 4.619216769528797e-05, + "loss": 0.043, + "step": 43820 + }, + { + "epoch": 0.21415, + "grad_norm": 0.11732255667448044, + "learning_rate": 4.6189974502965324e-05, + "loss": 0.0441, + "step": 43830 + }, + { + "epoch": 0.2142, + "grad_norm": 0.1100316047668457, + "learning_rate": 4.618778073131673e-05, + "loss": 0.0419, + "step": 43840 + }, + { + "epoch": 0.21425, + "grad_norm": 0.11513995379209518, + "learning_rate": 4.6185586380402174e-05, + "loss": 0.0392, + "step": 43850 + }, + { + "epoch": 0.2143, + "grad_norm": 0.11842764168977737, + "learning_rate": 4.618339145028164e-05, + "loss": 0.0384, + "step": 43860 + }, + { + "epoch": 0.21435, + "grad_norm": 0.10758869349956512, + "learning_rate": 4.618119594101515e-05, + "loss": 0.0383, + "step": 43870 + }, + { + "epoch": 0.2144, + "grad_norm": 0.12508603930473328, + "learning_rate": 4.617899985266272e-05, + "loss": 0.0407, + "step": 43880 + }, + { + "epoch": 0.21445, + "grad_norm": 0.12971076369285583, + "learning_rate": 4.617680318528439e-05, + "loss": 0.0412, + "step": 43890 + }, + { + "epoch": 0.2145, + "grad_norm": 0.12210968881845474, + "learning_rate": 4.617460593894021e-05, + "loss": 0.0386, + "step": 43900 + }, + { + "epoch": 0.21455, + "grad_norm": 0.14959664642810822, + "learning_rate": 4.617240811369026e-05, + "loss": 0.0387, + "step": 43910 + }, + { + "epoch": 0.2146, + "grad_norm": 0.11446508765220642, + "learning_rate": 4.617020970959463e-05, + "loss": 0.0388, + "step": 43920 + }, + { + "epoch": 0.21465, + "grad_norm": 0.11315374821424484, + "learning_rate": 4.616801072671342e-05, + "loss": 0.0407, + "step": 43930 + }, + { + "epoch": 0.2147, + "grad_norm": 0.12179291993379593, + "learning_rate": 4.6165811165106746e-05, + "loss": 0.0385, + "step": 43940 + }, + { + "epoch": 0.21475, + "grad_norm": 0.11779157817363739, + "learning_rate": 4.616361102483475e-05, + "loss": 0.0389, + "step": 43950 + }, + { + "epoch": 0.2148, + "grad_norm": 0.11438784748315811, + "learning_rate": 4.616141030595757e-05, + "loss": 0.0399, + "step": 43960 + }, + { + "epoch": 0.21485, + "grad_norm": 0.10303477197885513, + "learning_rate": 4.6159209008535397e-05, + "loss": 0.0388, + "step": 43970 + }, + { + "epoch": 0.2149, + "grad_norm": 0.1258867383003235, + "learning_rate": 4.6157007132628396e-05, + "loss": 0.044, + "step": 43980 + }, + { + "epoch": 0.21495, + "grad_norm": 0.11557912081480026, + "learning_rate": 4.615480467829676e-05, + "loss": 0.0401, + "step": 43990 + }, + { + "epoch": 0.215, + "grad_norm": 0.13071365654468536, + "learning_rate": 4.615260164560071e-05, + "loss": 0.0407, + "step": 44000 + }, + { + "epoch": 0.21505, + "grad_norm": 0.10629302263259888, + "learning_rate": 4.615039803460049e-05, + "loss": 0.0384, + "step": 44010 + }, + { + "epoch": 0.2151, + "grad_norm": 0.11779369413852692, + "learning_rate": 4.6148193845356324e-05, + "loss": 0.0415, + "step": 44020 + }, + { + "epoch": 0.21515, + "grad_norm": 0.11141815036535263, + "learning_rate": 4.6145989077928486e-05, + "loss": 0.0392, + "step": 44030 + }, + { + "epoch": 0.2152, + "grad_norm": 0.14413152635097504, + "learning_rate": 4.614378373237726e-05, + "loss": 0.0399, + "step": 44040 + }, + { + "epoch": 0.21525, + "grad_norm": 0.1350352168083191, + "learning_rate": 4.614157780876292e-05, + "loss": 0.0401, + "step": 44050 + }, + { + "epoch": 0.2153, + "grad_norm": 0.12539725005626678, + "learning_rate": 4.613937130714578e-05, + "loss": 0.0402, + "step": 44060 + }, + { + "epoch": 0.21535, + "grad_norm": 0.13736669719219208, + "learning_rate": 4.6137164227586177e-05, + "loss": 0.0402, + "step": 44070 + }, + { + "epoch": 0.2154, + "grad_norm": 0.11826146394014359, + "learning_rate": 4.613495657014445e-05, + "loss": 0.0438, + "step": 44080 + }, + { + "epoch": 0.21545, + "grad_norm": 0.11898969113826752, + "learning_rate": 4.613274833488094e-05, + "loss": 0.039, + "step": 44090 + }, + { + "epoch": 0.2155, + "grad_norm": 0.12201030552387238, + "learning_rate": 4.613053952185604e-05, + "loss": 0.0408, + "step": 44100 + }, + { + "epoch": 0.21555, + "grad_norm": 0.13165467977523804, + "learning_rate": 4.612833013113012e-05, + "loss": 0.0391, + "step": 44110 + }, + { + "epoch": 0.2156, + "grad_norm": 0.12074373662471771, + "learning_rate": 4.6126120162763595e-05, + "loss": 0.0391, + "step": 44120 + }, + { + "epoch": 0.21565, + "grad_norm": 0.11341175436973572, + "learning_rate": 4.612390961681687e-05, + "loss": 0.0385, + "step": 44130 + }, + { + "epoch": 0.2157, + "grad_norm": 0.11415523290634155, + "learning_rate": 4.612169849335041e-05, + "loss": 0.041, + "step": 44140 + }, + { + "epoch": 0.21575, + "grad_norm": 0.11367049813270569, + "learning_rate": 4.6119486792424645e-05, + "loss": 0.0389, + "step": 44150 + }, + { + "epoch": 0.2158, + "grad_norm": 0.1104145422577858, + "learning_rate": 4.611727451410004e-05, + "loss": 0.0393, + "step": 44160 + }, + { + "epoch": 0.21585, + "grad_norm": 0.125356525182724, + "learning_rate": 4.611506165843708e-05, + "loss": 0.0386, + "step": 44170 + }, + { + "epoch": 0.2159, + "grad_norm": 0.1293518990278244, + "learning_rate": 4.611284822549627e-05, + "loss": 0.0399, + "step": 44180 + }, + { + "epoch": 0.21595, + "grad_norm": 0.1320677399635315, + "learning_rate": 4.611063421533812e-05, + "loss": 0.0422, + "step": 44190 + }, + { + "epoch": 0.216, + "grad_norm": 0.12373978644609451, + "learning_rate": 4.610841962802317e-05, + "loss": 0.0404, + "step": 44200 + }, + { + "epoch": 0.21605, + "grad_norm": 0.12895497679710388, + "learning_rate": 4.6106204463611944e-05, + "loss": 0.0408, + "step": 44210 + }, + { + "epoch": 0.2161, + "grad_norm": 0.11972857266664505, + "learning_rate": 4.610398872216503e-05, + "loss": 0.0396, + "step": 44220 + }, + { + "epoch": 0.21615, + "grad_norm": 0.11606613546609879, + "learning_rate": 4.610177240374299e-05, + "loss": 0.0405, + "step": 44230 + }, + { + "epoch": 0.2162, + "grad_norm": 0.1022796630859375, + "learning_rate": 4.609955550840641e-05, + "loss": 0.0405, + "step": 44240 + }, + { + "epoch": 0.21625, + "grad_norm": 0.11300525069236755, + "learning_rate": 4.609733803621592e-05, + "loss": 0.0432, + "step": 44250 + }, + { + "epoch": 0.2163, + "grad_norm": 0.12350910902023315, + "learning_rate": 4.609511998723213e-05, + "loss": 0.0431, + "step": 44260 + }, + { + "epoch": 0.21635, + "grad_norm": 0.13697876036167145, + "learning_rate": 4.6092901361515684e-05, + "loss": 0.0411, + "step": 44270 + }, + { + "epoch": 0.2164, + "grad_norm": 0.11545902490615845, + "learning_rate": 4.609068215912724e-05, + "loss": 0.0409, + "step": 44280 + }, + { + "epoch": 0.21645, + "grad_norm": 0.13642287254333496, + "learning_rate": 4.6088462380127476e-05, + "loss": 0.0408, + "step": 44290 + }, + { + "epoch": 0.2165, + "grad_norm": 0.11719198524951935, + "learning_rate": 4.608624202457706e-05, + "loss": 0.0423, + "step": 44300 + }, + { + "epoch": 0.21655, + "grad_norm": 0.1083524227142334, + "learning_rate": 4.6084021092536715e-05, + "loss": 0.0405, + "step": 44310 + }, + { + "epoch": 0.2166, + "grad_norm": 0.11146090924739838, + "learning_rate": 4.608179958406715e-05, + "loss": 0.0438, + "step": 44320 + }, + { + "epoch": 0.21665, + "grad_norm": 0.10394291579723358, + "learning_rate": 4.607957749922911e-05, + "loss": 0.0416, + "step": 44330 + }, + { + "epoch": 0.2167, + "grad_norm": 0.11491485685110092, + "learning_rate": 4.607735483808334e-05, + "loss": 0.0396, + "step": 44340 + }, + { + "epoch": 0.21675, + "grad_norm": 0.10888620465993881, + "learning_rate": 4.607513160069061e-05, + "loss": 0.0422, + "step": 44350 + }, + { + "epoch": 0.2168, + "grad_norm": 0.12154927104711533, + "learning_rate": 4.60729077871117e-05, + "loss": 0.041, + "step": 44360 + }, + { + "epoch": 0.21685, + "grad_norm": 0.11862784624099731, + "learning_rate": 4.60706833974074e-05, + "loss": 0.0413, + "step": 44370 + }, + { + "epoch": 0.2169, + "grad_norm": 0.08911058306694031, + "learning_rate": 4.6068458431638537e-05, + "loss": 0.0398, + "step": 44380 + }, + { + "epoch": 0.21695, + "grad_norm": 0.11093780398368835, + "learning_rate": 4.606623288986594e-05, + "loss": 0.0402, + "step": 44390 + }, + { + "epoch": 0.217, + "grad_norm": 0.11029799282550812, + "learning_rate": 4.606400677215044e-05, + "loss": 0.041, + "step": 44400 + }, + { + "epoch": 0.21705, + "grad_norm": 0.12298743426799774, + "learning_rate": 4.6061780078552906e-05, + "loss": 0.0399, + "step": 44410 + }, + { + "epoch": 0.2171, + "grad_norm": 0.11163527518510818, + "learning_rate": 4.6059552809134224e-05, + "loss": 0.0389, + "step": 44420 + }, + { + "epoch": 0.21715, + "grad_norm": 0.09122147411108017, + "learning_rate": 4.6057324963955284e-05, + "loss": 0.0395, + "step": 44430 + }, + { + "epoch": 0.2172, + "grad_norm": 0.10241471230983734, + "learning_rate": 4.605509654307698e-05, + "loss": 0.0403, + "step": 44440 + }, + { + "epoch": 0.21725, + "grad_norm": 0.0955214723944664, + "learning_rate": 4.605286754656025e-05, + "loss": 0.0396, + "step": 44450 + }, + { + "epoch": 0.2173, + "grad_norm": 0.09183744341135025, + "learning_rate": 4.6050637974466036e-05, + "loss": 0.0395, + "step": 44460 + }, + { + "epoch": 0.21735, + "grad_norm": 0.09811728447675705, + "learning_rate": 4.604840782685529e-05, + "loss": 0.0396, + "step": 44470 + }, + { + "epoch": 0.2174, + "grad_norm": 0.11486396938562393, + "learning_rate": 4.604617710378897e-05, + "loss": 0.0387, + "step": 44480 + }, + { + "epoch": 0.21745, + "grad_norm": 0.10211937129497528, + "learning_rate": 4.604394580532808e-05, + "loss": 0.0392, + "step": 44490 + }, + { + "epoch": 0.2175, + "grad_norm": 0.13213109970092773, + "learning_rate": 4.6041713931533624e-05, + "loss": 0.0405, + "step": 44500 + }, + { + "epoch": 0.21755, + "grad_norm": 0.10534148663282394, + "learning_rate": 4.6039481482466606e-05, + "loss": 0.0432, + "step": 44510 + }, + { + "epoch": 0.2176, + "grad_norm": 0.10675963759422302, + "learning_rate": 4.603724845818808e-05, + "loss": 0.0396, + "step": 44520 + }, + { + "epoch": 0.21765, + "grad_norm": 0.13914652168750763, + "learning_rate": 4.603501485875907e-05, + "loss": 0.0417, + "step": 44530 + }, + { + "epoch": 0.2177, + "grad_norm": 0.14863325655460358, + "learning_rate": 4.6032780684240665e-05, + "loss": 0.0415, + "step": 44540 + }, + { + "epoch": 0.21775, + "grad_norm": 0.12211088836193085, + "learning_rate": 4.603054593469393e-05, + "loss": 0.0412, + "step": 44550 + }, + { + "epoch": 0.2178, + "grad_norm": 0.14143238961696625, + "learning_rate": 4.602831061017997e-05, + "loss": 0.0401, + "step": 44560 + }, + { + "epoch": 0.21785, + "grad_norm": 0.13877104222774506, + "learning_rate": 4.60260747107599e-05, + "loss": 0.0396, + "step": 44570 + }, + { + "epoch": 0.2179, + "grad_norm": 0.11739282310009003, + "learning_rate": 4.6023838236494854e-05, + "loss": 0.0403, + "step": 44580 + }, + { + "epoch": 0.21795, + "grad_norm": 0.12237027287483215, + "learning_rate": 4.602160118744596e-05, + "loss": 0.0408, + "step": 44590 + }, + { + "epoch": 0.218, + "grad_norm": 0.10916484892368317, + "learning_rate": 4.601936356367439e-05, + "loss": 0.0383, + "step": 44600 + }, + { + "epoch": 0.21805, + "grad_norm": 0.11514069139957428, + "learning_rate": 4.601712536524132e-05, + "loss": 0.0407, + "step": 44610 + }, + { + "epoch": 0.2181, + "grad_norm": 0.10619265586137772, + "learning_rate": 4.601488659220794e-05, + "loss": 0.0386, + "step": 44620 + }, + { + "epoch": 0.21815, + "grad_norm": 0.09500869363546371, + "learning_rate": 4.601264724463546e-05, + "loss": 0.039, + "step": 44630 + }, + { + "epoch": 0.2182, + "grad_norm": 0.09380833804607391, + "learning_rate": 4.601040732258508e-05, + "loss": 0.0395, + "step": 44640 + }, + { + "epoch": 0.21825, + "grad_norm": 0.12103443592786789, + "learning_rate": 4.600816682611807e-05, + "loss": 0.0395, + "step": 44650 + }, + { + "epoch": 0.2183, + "grad_norm": 0.12043504416942596, + "learning_rate": 4.600592575529566e-05, + "loss": 0.0392, + "step": 44660 + }, + { + "epoch": 0.21835, + "grad_norm": 0.12798574566841125, + "learning_rate": 4.600368411017914e-05, + "loss": 0.0393, + "step": 44670 + }, + { + "epoch": 0.2184, + "grad_norm": 0.1347103863954544, + "learning_rate": 4.600144189082979e-05, + "loss": 0.0385, + "step": 44680 + }, + { + "epoch": 0.21845, + "grad_norm": 0.12477003782987595, + "learning_rate": 4.599919909730891e-05, + "loss": 0.0375, + "step": 44690 + }, + { + "epoch": 0.2185, + "grad_norm": 0.11453071981668472, + "learning_rate": 4.5996955729677803e-05, + "loss": 0.0394, + "step": 44700 + }, + { + "epoch": 0.21855, + "grad_norm": 0.09489106386899948, + "learning_rate": 4.5994711787997826e-05, + "loss": 0.0381, + "step": 44710 + }, + { + "epoch": 0.2186, + "grad_norm": 0.12129834294319153, + "learning_rate": 4.5992467272330315e-05, + "loss": 0.0389, + "step": 44720 + }, + { + "epoch": 0.21865, + "grad_norm": 0.1389053910970688, + "learning_rate": 4.599022218273663e-05, + "loss": 0.0385, + "step": 44730 + }, + { + "epoch": 0.2187, + "grad_norm": 0.10801096260547638, + "learning_rate": 4.5987976519278165e-05, + "loss": 0.0402, + "step": 44740 + }, + { + "epoch": 0.21875, + "grad_norm": 0.11225487291812897, + "learning_rate": 4.59857302820163e-05, + "loss": 0.0394, + "step": 44750 + }, + { + "epoch": 0.2188, + "grad_norm": 0.11907331645488739, + "learning_rate": 4.598348347101245e-05, + "loss": 0.0384, + "step": 44760 + }, + { + "epoch": 0.21885, + "grad_norm": 0.10216663032770157, + "learning_rate": 4.598123608632805e-05, + "loss": 0.0393, + "step": 44770 + }, + { + "epoch": 0.2189, + "grad_norm": 0.10929743945598602, + "learning_rate": 4.597898812802454e-05, + "loss": 0.0387, + "step": 44780 + }, + { + "epoch": 0.21895, + "grad_norm": 0.09658437222242355, + "learning_rate": 4.597673959616337e-05, + "loss": 0.0381, + "step": 44790 + }, + { + "epoch": 0.219, + "grad_norm": 0.12752942740917206, + "learning_rate": 4.597449049080602e-05, + "loss": 0.0386, + "step": 44800 + }, + { + "epoch": 0.21905, + "grad_norm": 0.10357045382261276, + "learning_rate": 4.5972240812013986e-05, + "loss": 0.0383, + "step": 44810 + }, + { + "epoch": 0.2191, + "grad_norm": 0.11409095674753189, + "learning_rate": 4.5969990559848766e-05, + "loss": 0.0387, + "step": 44820 + }, + { + "epoch": 0.21915, + "grad_norm": 0.10450869798660278, + "learning_rate": 4.596773973437187e-05, + "loss": 0.0386, + "step": 44830 + }, + { + "epoch": 0.2192, + "grad_norm": 0.12107925117015839, + "learning_rate": 4.596548833564486e-05, + "loss": 0.0372, + "step": 44840 + }, + { + "epoch": 0.21925, + "grad_norm": 0.10662046819925308, + "learning_rate": 4.5963236363729276e-05, + "loss": 0.0379, + "step": 44850 + }, + { + "epoch": 0.2193, + "grad_norm": 0.11387676745653152, + "learning_rate": 4.5960983818686674e-05, + "loss": 0.0388, + "step": 44860 + }, + { + "epoch": 0.21935, + "grad_norm": 0.14249145984649658, + "learning_rate": 4.595873070057866e-05, + "loss": 0.0391, + "step": 44870 + }, + { + "epoch": 0.2194, + "grad_norm": 0.13589072227478027, + "learning_rate": 4.595647700946682e-05, + "loss": 0.0382, + "step": 44880 + }, + { + "epoch": 0.21945, + "grad_norm": 0.14612525701522827, + "learning_rate": 4.5954222745412766e-05, + "loss": 0.0401, + "step": 44890 + }, + { + "epoch": 0.2195, + "grad_norm": 0.11659722030162811, + "learning_rate": 4.5951967908478147e-05, + "loss": 0.0377, + "step": 44900 + }, + { + "epoch": 0.21955, + "grad_norm": 0.10423924773931503, + "learning_rate": 4.594971249872458e-05, + "loss": 0.0378, + "step": 44910 + }, + { + "epoch": 0.2196, + "grad_norm": 0.12860740721225739, + "learning_rate": 4.594745651621376e-05, + "loss": 0.0389, + "step": 44920 + }, + { + "epoch": 0.21965, + "grad_norm": 0.09772031009197235, + "learning_rate": 4.5945199961007335e-05, + "loss": 0.0385, + "step": 44930 + }, + { + "epoch": 0.2197, + "grad_norm": 0.15080465376377106, + "learning_rate": 4.5942942833167016e-05, + "loss": 0.039, + "step": 44940 + }, + { + "epoch": 0.21975, + "grad_norm": 0.13005079329013824, + "learning_rate": 4.5940685132754516e-05, + "loss": 0.0376, + "step": 44950 + }, + { + "epoch": 0.2198, + "grad_norm": 0.11674488335847855, + "learning_rate": 4.593842685983154e-05, + "loss": 0.0389, + "step": 44960 + }, + { + "epoch": 0.21985, + "grad_norm": 0.10848286002874374, + "learning_rate": 4.593616801445984e-05, + "loss": 0.0391, + "step": 44970 + }, + { + "epoch": 0.2199, + "grad_norm": 0.13445886969566345, + "learning_rate": 4.593390859670118e-05, + "loss": 0.0392, + "step": 44980 + }, + { + "epoch": 0.21995, + "grad_norm": 0.11834866553544998, + "learning_rate": 4.593164860661732e-05, + "loss": 0.0392, + "step": 44990 + }, + { + "epoch": 0.22, + "grad_norm": 0.13557834923267365, + "learning_rate": 4.592938804427005e-05, + "loss": 0.0396, + "step": 45000 + }, + { + "epoch": 0.22005, + "grad_norm": 0.1014893501996994, + "learning_rate": 4.592712690972117e-05, + "loss": 0.0397, + "step": 45010 + }, + { + "epoch": 0.2201, + "grad_norm": 0.11393041908740997, + "learning_rate": 4.592486520303251e-05, + "loss": 0.0407, + "step": 45020 + }, + { + "epoch": 0.22015, + "grad_norm": 0.16742663085460663, + "learning_rate": 4.59226029242659e-05, + "loss": 0.0449, + "step": 45030 + }, + { + "epoch": 0.2202, + "grad_norm": 0.12571029365062714, + "learning_rate": 4.5920340073483175e-05, + "loss": 0.0388, + "step": 45040 + }, + { + "epoch": 0.22025, + "grad_norm": 0.13934175670146942, + "learning_rate": 4.591807665074621e-05, + "loss": 0.0386, + "step": 45050 + }, + { + "epoch": 0.2203, + "grad_norm": 0.11282069236040115, + "learning_rate": 4.5915812656116896e-05, + "loss": 0.0392, + "step": 45060 + }, + { + "epoch": 0.22035, + "grad_norm": 0.14101460576057434, + "learning_rate": 4.591354808965712e-05, + "loss": 0.038, + "step": 45070 + }, + { + "epoch": 0.2204, + "grad_norm": 0.1505534052848816, + "learning_rate": 4.59112829514288e-05, + "loss": 0.0406, + "step": 45080 + }, + { + "epoch": 0.22045, + "grad_norm": 0.1381063014268875, + "learning_rate": 4.5909017241493854e-05, + "loss": 0.0385, + "step": 45090 + }, + { + "epoch": 0.2205, + "grad_norm": 0.14494432508945465, + "learning_rate": 4.590675095991424e-05, + "loss": 0.0378, + "step": 45100 + }, + { + "epoch": 0.22055, + "grad_norm": 0.11994045972824097, + "learning_rate": 4.59044841067519e-05, + "loss": 0.0377, + "step": 45110 + }, + { + "epoch": 0.2206, + "grad_norm": 0.1361813098192215, + "learning_rate": 4.590221668206882e-05, + "loss": 0.037, + "step": 45120 + }, + { + "epoch": 0.22065, + "grad_norm": 0.12299936264753342, + "learning_rate": 4.5899948685926985e-05, + "loss": 0.0392, + "step": 45130 + }, + { + "epoch": 0.2207, + "grad_norm": 0.12199182063341141, + "learning_rate": 4.589768011838841e-05, + "loss": 0.0394, + "step": 45140 + }, + { + "epoch": 0.22075, + "grad_norm": 0.11998183280229568, + "learning_rate": 4.589541097951511e-05, + "loss": 0.0406, + "step": 45150 + }, + { + "epoch": 0.2208, + "grad_norm": 0.10331569612026215, + "learning_rate": 4.589314126936912e-05, + "loss": 0.0411, + "step": 45160 + }, + { + "epoch": 0.22085, + "grad_norm": 0.09803680330514908, + "learning_rate": 4.5890870988012504e-05, + "loss": 0.0413, + "step": 45170 + }, + { + "epoch": 0.2209, + "grad_norm": 0.1218324825167656, + "learning_rate": 4.588860013550732e-05, + "loss": 0.0398, + "step": 45180 + }, + { + "epoch": 0.22095, + "grad_norm": 0.12272092700004578, + "learning_rate": 4.588632871191566e-05, + "loss": 0.0406, + "step": 45190 + }, + { + "epoch": 0.221, + "grad_norm": 0.1250077337026596, + "learning_rate": 4.5884056717299615e-05, + "loss": 0.0399, + "step": 45200 + }, + { + "epoch": 0.22105, + "grad_norm": 0.10727230459451675, + "learning_rate": 4.588178415172131e-05, + "loss": 0.0399, + "step": 45210 + }, + { + "epoch": 0.2211, + "grad_norm": 0.11648436635732651, + "learning_rate": 4.587951101524286e-05, + "loss": 0.044, + "step": 45220 + }, + { + "epoch": 0.22115, + "grad_norm": 0.12675625085830688, + "learning_rate": 4.587723730792644e-05, + "loss": 0.0438, + "step": 45230 + }, + { + "epoch": 0.2212, + "grad_norm": 0.11761613935232162, + "learning_rate": 4.587496302983418e-05, + "loss": 0.0393, + "step": 45240 + }, + { + "epoch": 0.22125, + "grad_norm": 0.11859478801488876, + "learning_rate": 4.587268818102828e-05, + "loss": 0.04, + "step": 45250 + }, + { + "epoch": 0.2213, + "grad_norm": 0.09970647096633911, + "learning_rate": 4.587041276157093e-05, + "loss": 0.0403, + "step": 45260 + }, + { + "epoch": 0.22135, + "grad_norm": 0.1070898249745369, + "learning_rate": 4.5868136771524325e-05, + "loss": 0.0394, + "step": 45270 + }, + { + "epoch": 0.2214, + "grad_norm": 0.12319520860910416, + "learning_rate": 4.5865860210950704e-05, + "loss": 0.0392, + "step": 45280 + }, + { + "epoch": 0.22145, + "grad_norm": 0.10981305688619614, + "learning_rate": 4.5863583079912306e-05, + "loss": 0.0399, + "step": 45290 + }, + { + "epoch": 0.2215, + "grad_norm": 0.12001689523458481, + "learning_rate": 4.5861305378471385e-05, + "loss": 0.0402, + "step": 45300 + }, + { + "epoch": 0.22155, + "grad_norm": 0.11061044037342072, + "learning_rate": 4.585902710669021e-05, + "loss": 0.0401, + "step": 45310 + }, + { + "epoch": 0.2216, + "grad_norm": 0.10832685977220535, + "learning_rate": 4.585674826463108e-05, + "loss": 0.0402, + "step": 45320 + }, + { + "epoch": 0.22165, + "grad_norm": 0.11753688752651215, + "learning_rate": 4.585446885235628e-05, + "loss": 0.0388, + "step": 45330 + }, + { + "epoch": 0.2217, + "grad_norm": 0.11813338100910187, + "learning_rate": 4.5852188869928134e-05, + "loss": 0.0422, + "step": 45340 + }, + { + "epoch": 0.22175, + "grad_norm": 0.1180034875869751, + "learning_rate": 4.584990831740897e-05, + "loss": 0.0397, + "step": 45350 + }, + { + "epoch": 0.2218, + "grad_norm": 0.11829423159360886, + "learning_rate": 4.584762719486117e-05, + "loss": 0.042, + "step": 45360 + }, + { + "epoch": 0.22185, + "grad_norm": 0.09697888791561127, + "learning_rate": 4.5845345502347055e-05, + "loss": 0.0415, + "step": 45370 + }, + { + "epoch": 0.2219, + "grad_norm": 0.10831569880247116, + "learning_rate": 4.584306323992903e-05, + "loss": 0.0402, + "step": 45380 + }, + { + "epoch": 0.22195, + "grad_norm": 0.11756439507007599, + "learning_rate": 4.584078040766949e-05, + "loss": 0.0411, + "step": 45390 + }, + { + "epoch": 0.222, + "grad_norm": 0.11222250014543533, + "learning_rate": 4.5838497005630835e-05, + "loss": 0.0408, + "step": 45400 + }, + { + "epoch": 0.22205, + "grad_norm": 0.12091228365898132, + "learning_rate": 4.5836213033875506e-05, + "loss": 0.0452, + "step": 45410 + }, + { + "epoch": 0.2221, + "grad_norm": 0.10155369341373444, + "learning_rate": 4.583392849246594e-05, + "loss": 0.0407, + "step": 45420 + }, + { + "epoch": 0.22215, + "grad_norm": 0.1056760624051094, + "learning_rate": 4.5831643381464596e-05, + "loss": 0.0418, + "step": 45430 + }, + { + "epoch": 0.2222, + "grad_norm": 0.10789384692907333, + "learning_rate": 4.582935770093395e-05, + "loss": 0.0395, + "step": 45440 + }, + { + "epoch": 0.22225, + "grad_norm": 0.10316034406423569, + "learning_rate": 4.582707145093649e-05, + "loss": 0.04, + "step": 45450 + }, + { + "epoch": 0.2223, + "grad_norm": 0.11070965230464935, + "learning_rate": 4.582478463153472e-05, + "loss": 0.0398, + "step": 45460 + }, + { + "epoch": 0.22235, + "grad_norm": 0.1169808954000473, + "learning_rate": 4.582249724279116e-05, + "loss": 0.042, + "step": 45470 + }, + { + "epoch": 0.2224, + "grad_norm": 0.11864583194255829, + "learning_rate": 4.582020928476835e-05, + "loss": 0.0397, + "step": 45480 + }, + { + "epoch": 0.22245, + "grad_norm": 0.09163357317447662, + "learning_rate": 4.5817920757528834e-05, + "loss": 0.0405, + "step": 45490 + }, + { + "epoch": 0.2225, + "grad_norm": 0.11840710043907166, + "learning_rate": 4.5815631661135196e-05, + "loss": 0.0402, + "step": 45500 + }, + { + "epoch": 0.22255, + "grad_norm": 0.1186244860291481, + "learning_rate": 4.581334199565e-05, + "loss": 0.0404, + "step": 45510 + }, + { + "epoch": 0.2226, + "grad_norm": 0.10652244091033936, + "learning_rate": 4.5811051761135856e-05, + "loss": 0.0397, + "step": 45520 + }, + { + "epoch": 0.22265, + "grad_norm": 0.11250728368759155, + "learning_rate": 4.5808760957655374e-05, + "loss": 0.04, + "step": 45530 + }, + { + "epoch": 0.2227, + "grad_norm": 0.1473996937274933, + "learning_rate": 4.580646958527118e-05, + "loss": 0.0397, + "step": 45540 + }, + { + "epoch": 0.22275, + "grad_norm": 0.09371239691972733, + "learning_rate": 4.5804177644045935e-05, + "loss": 0.0391, + "step": 45550 + }, + { + "epoch": 0.2228, + "grad_norm": 0.11303984373807907, + "learning_rate": 4.5801885134042285e-05, + "loss": 0.0388, + "step": 45560 + }, + { + "epoch": 0.22285, + "grad_norm": 0.09464023262262344, + "learning_rate": 4.579959205532291e-05, + "loss": 0.0435, + "step": 45570 + }, + { + "epoch": 0.2229, + "grad_norm": 0.1317097246646881, + "learning_rate": 4.57972984079505e-05, + "loss": 0.039, + "step": 45580 + }, + { + "epoch": 0.22295, + "grad_norm": 0.12932200729846954, + "learning_rate": 4.5795004191987765e-05, + "loss": 0.041, + "step": 45590 + }, + { + "epoch": 0.223, + "grad_norm": 0.10654744505882263, + "learning_rate": 4.579270940749743e-05, + "loss": 0.0408, + "step": 45600 + }, + { + "epoch": 0.22305, + "grad_norm": 0.11881094425916672, + "learning_rate": 4.579041405454223e-05, + "loss": 0.0388, + "step": 45610 + }, + { + "epoch": 0.2231, + "grad_norm": 0.12884128093719482, + "learning_rate": 4.578811813318492e-05, + "loss": 0.0408, + "step": 45620 + }, + { + "epoch": 0.22315, + "grad_norm": 0.12064798176288605, + "learning_rate": 4.578582164348827e-05, + "loss": 0.0416, + "step": 45630 + }, + { + "epoch": 0.2232, + "grad_norm": 0.13775815069675446, + "learning_rate": 4.578352458551507e-05, + "loss": 0.0392, + "step": 45640 + }, + { + "epoch": 0.22325, + "grad_norm": 0.11223536729812622, + "learning_rate": 4.57812269593281e-05, + "loss": 0.0391, + "step": 45650 + }, + { + "epoch": 0.2233, + "grad_norm": 0.13564331829547882, + "learning_rate": 4.5778928764990217e-05, + "loss": 0.0402, + "step": 45660 + }, + { + "epoch": 0.22335, + "grad_norm": 0.12138502299785614, + "learning_rate": 4.5776630002564206e-05, + "loss": 0.0398, + "step": 45670 + }, + { + "epoch": 0.2234, + "grad_norm": 0.1268160045146942, + "learning_rate": 4.577433067211295e-05, + "loss": 0.0413, + "step": 45680 + }, + { + "epoch": 0.22345, + "grad_norm": 0.11599452048540115, + "learning_rate": 4.577203077369929e-05, + "loss": 0.0387, + "step": 45690 + }, + { + "epoch": 0.2235, + "grad_norm": 0.10990285873413086, + "learning_rate": 4.5769730307386114e-05, + "loss": 0.0389, + "step": 45700 + }, + { + "epoch": 0.22355, + "grad_norm": 0.10611005127429962, + "learning_rate": 4.576742927323632e-05, + "loss": 0.0381, + "step": 45710 + }, + { + "epoch": 0.2236, + "grad_norm": 0.12060553580522537, + "learning_rate": 4.5765127671312805e-05, + "loss": 0.039, + "step": 45720 + }, + { + "epoch": 0.22365, + "grad_norm": 0.12426841259002686, + "learning_rate": 4.5762825501678495e-05, + "loss": 0.0411, + "step": 45730 + }, + { + "epoch": 0.2237, + "grad_norm": 0.1251668632030487, + "learning_rate": 4.576052276439635e-05, + "loss": 0.0404, + "step": 45740 + }, + { + "epoch": 0.22375, + "grad_norm": 0.16352106630802155, + "learning_rate": 4.57582194595293e-05, + "loss": 0.0415, + "step": 45750 + }, + { + "epoch": 0.2238, + "grad_norm": 0.12801645696163177, + "learning_rate": 4.5755915587140336e-05, + "loss": 0.0397, + "step": 45760 + }, + { + "epoch": 0.22385, + "grad_norm": 0.10659578442573547, + "learning_rate": 4.5753611147292435e-05, + "loss": 0.0388, + "step": 45770 + }, + { + "epoch": 0.2239, + "grad_norm": 0.14153005182743073, + "learning_rate": 4.57513061400486e-05, + "loss": 0.0404, + "step": 45780 + }, + { + "epoch": 0.22395, + "grad_norm": 0.110100157558918, + "learning_rate": 4.5749000565471855e-05, + "loss": 0.0392, + "step": 45790 + }, + { + "epoch": 0.224, + "grad_norm": 0.10864286869764328, + "learning_rate": 4.574669442362522e-05, + "loss": 0.0388, + "step": 45800 + }, + { + "epoch": 0.22405, + "grad_norm": 0.1269918531179428, + "learning_rate": 4.5744387714571766e-05, + "loss": 0.0406, + "step": 45810 + }, + { + "epoch": 0.2241, + "grad_norm": 0.11123709380626678, + "learning_rate": 4.5742080438374545e-05, + "loss": 0.0399, + "step": 45820 + }, + { + "epoch": 0.22415, + "grad_norm": 0.10304092615842819, + "learning_rate": 4.573977259509663e-05, + "loss": 0.0399, + "step": 45830 + }, + { + "epoch": 0.2242, + "grad_norm": 0.1020602434873581, + "learning_rate": 4.5737464184801124e-05, + "loss": 0.0403, + "step": 45840 + }, + { + "epoch": 0.22425, + "grad_norm": 0.12396201491355896, + "learning_rate": 4.5735155207551145e-05, + "loss": 0.0409, + "step": 45850 + }, + { + "epoch": 0.2243, + "grad_norm": 0.119930699467659, + "learning_rate": 4.5732845663409804e-05, + "loss": 0.0394, + "step": 45860 + }, + { + "epoch": 0.22435, + "grad_norm": 0.11855699121952057, + "learning_rate": 4.5730535552440256e-05, + "loss": 0.0402, + "step": 45870 + }, + { + "epoch": 0.2244, + "grad_norm": 0.11706940829753876, + "learning_rate": 4.572822487470566e-05, + "loss": 0.0395, + "step": 45880 + }, + { + "epoch": 0.22445, + "grad_norm": 0.09675632417201996, + "learning_rate": 4.572591363026918e-05, + "loss": 0.039, + "step": 45890 + }, + { + "epoch": 0.2245, + "grad_norm": 0.11238475143909454, + "learning_rate": 4.5723601819193996e-05, + "loss": 0.0378, + "step": 45900 + }, + { + "epoch": 0.22455, + "grad_norm": 0.11811317503452301, + "learning_rate": 4.5721289441543336e-05, + "loss": 0.04, + "step": 45910 + }, + { + "epoch": 0.2246, + "grad_norm": 0.10928291827440262, + "learning_rate": 4.5718976497380404e-05, + "loss": 0.0388, + "step": 45920 + }, + { + "epoch": 0.22465, + "grad_norm": 0.11499692499637604, + "learning_rate": 4.571666298676843e-05, + "loss": 0.0382, + "step": 45930 + }, + { + "epoch": 0.2247, + "grad_norm": 0.10858325660228729, + "learning_rate": 4.571434890977069e-05, + "loss": 0.0385, + "step": 45940 + }, + { + "epoch": 0.22475, + "grad_norm": 0.11937263607978821, + "learning_rate": 4.571203426645042e-05, + "loss": 0.038, + "step": 45950 + }, + { + "epoch": 0.2248, + "grad_norm": 0.10898961871862411, + "learning_rate": 4.5709719056870916e-05, + "loss": 0.0419, + "step": 45960 + }, + { + "epoch": 0.22485, + "grad_norm": 0.11576051265001297, + "learning_rate": 4.570740328109547e-05, + "loss": 0.039, + "step": 45970 + }, + { + "epoch": 0.2249, + "grad_norm": 0.11729492247104645, + "learning_rate": 4.5705086939187414e-05, + "loss": 0.0391, + "step": 45980 + }, + { + "epoch": 0.22495, + "grad_norm": 0.13462895154953003, + "learning_rate": 4.5702770031210044e-05, + "loss": 0.041, + "step": 45990 + }, + { + "epoch": 0.225, + "grad_norm": 0.10053347796201706, + "learning_rate": 4.5700452557226726e-05, + "loss": 0.0389, + "step": 46000 + }, + { + "epoch": 0.22505, + "grad_norm": 0.1229424923658371, + "learning_rate": 4.5698134517300804e-05, + "loss": 0.0397, + "step": 46010 + }, + { + "epoch": 0.2251, + "grad_norm": 0.14538830518722534, + "learning_rate": 4.569581591149566e-05, + "loss": 0.0394, + "step": 46020 + }, + { + "epoch": 0.22515, + "grad_norm": 0.12600792944431305, + "learning_rate": 4.5693496739874695e-05, + "loss": 0.0392, + "step": 46030 + }, + { + "epoch": 0.2252, + "grad_norm": 0.12603269517421722, + "learning_rate": 4.56911770025013e-05, + "loss": 0.0388, + "step": 46040 + }, + { + "epoch": 0.22525, + "grad_norm": 0.11895067989826202, + "learning_rate": 4.5688856699438895e-05, + "loss": 0.0402, + "step": 46050 + }, + { + "epoch": 0.2253, + "grad_norm": 0.1792805939912796, + "learning_rate": 4.568653583075093e-05, + "loss": 0.0418, + "step": 46060 + }, + { + "epoch": 0.22535, + "grad_norm": 0.16201379895210266, + "learning_rate": 4.5684214396500836e-05, + "loss": 0.039, + "step": 46070 + }, + { + "epoch": 0.2254, + "grad_norm": 0.13260048627853394, + "learning_rate": 4.568189239675209e-05, + "loss": 0.0392, + "step": 46080 + }, + { + "epoch": 0.22545, + "grad_norm": 0.13739709556102753, + "learning_rate": 4.567956983156818e-05, + "loss": 0.039, + "step": 46090 + }, + { + "epoch": 0.2255, + "grad_norm": 0.11852910369634628, + "learning_rate": 4.56772467010126e-05, + "loss": 0.0391, + "step": 46100 + }, + { + "epoch": 0.22555, + "grad_norm": 0.14033553004264832, + "learning_rate": 4.5674923005148864e-05, + "loss": 0.0379, + "step": 46110 + }, + { + "epoch": 0.2256, + "grad_norm": 0.16816692054271698, + "learning_rate": 4.56725987440405e-05, + "loss": 0.0397, + "step": 46120 + }, + { + "epoch": 0.22565, + "grad_norm": 0.15413282811641693, + "learning_rate": 4.567027391775105e-05, + "loss": 0.0406, + "step": 46130 + }, + { + "epoch": 0.2257, + "grad_norm": 0.10553387552499771, + "learning_rate": 4.5667948526344086e-05, + "loss": 0.0391, + "step": 46140 + }, + { + "epoch": 0.22575, + "grad_norm": 0.10783561319112778, + "learning_rate": 4.566562256988316e-05, + "loss": 0.0411, + "step": 46150 + }, + { + "epoch": 0.2258, + "grad_norm": 0.07838821411132812, + "learning_rate": 4.566329604843188e-05, + "loss": 0.0378, + "step": 46160 + }, + { + "epoch": 0.22585, + "grad_norm": 0.10225919634103775, + "learning_rate": 4.5660968962053856e-05, + "loss": 0.038, + "step": 46170 + }, + { + "epoch": 0.2259, + "grad_norm": 0.1086994856595993, + "learning_rate": 4.56586413108127e-05, + "loss": 0.0385, + "step": 46180 + }, + { + "epoch": 0.22595, + "grad_norm": 0.09652048349380493, + "learning_rate": 4.565631309477205e-05, + "loss": 0.0392, + "step": 46190 + }, + { + "epoch": 0.226, + "grad_norm": 0.11381851881742477, + "learning_rate": 4.565398431399556e-05, + "loss": 0.0414, + "step": 46200 + }, + { + "epoch": 0.22605, + "grad_norm": 0.12784039974212646, + "learning_rate": 4.56516549685469e-05, + "loss": 0.0398, + "step": 46210 + }, + { + "epoch": 0.2261, + "grad_norm": 0.10668402910232544, + "learning_rate": 4.564932505848975e-05, + "loss": 0.0389, + "step": 46220 + }, + { + "epoch": 0.22615, + "grad_norm": 0.10620923340320587, + "learning_rate": 4.5646994583887805e-05, + "loss": 0.0397, + "step": 46230 + }, + { + "epoch": 0.2262, + "grad_norm": 0.11140339076519012, + "learning_rate": 4.5644663544804794e-05, + "loss": 0.0394, + "step": 46240 + }, + { + "epoch": 0.22625, + "grad_norm": 0.11114098131656647, + "learning_rate": 4.564233194130444e-05, + "loss": 0.0423, + "step": 46250 + }, + { + "epoch": 0.2263, + "grad_norm": 0.09415170550346375, + "learning_rate": 4.563999977345047e-05, + "loss": 0.041, + "step": 46260 + }, + { + "epoch": 0.22635, + "grad_norm": 0.12156187742948532, + "learning_rate": 4.5637667041306675e-05, + "loss": 0.0391, + "step": 46270 + }, + { + "epoch": 0.2264, + "grad_norm": 0.1148257926106453, + "learning_rate": 4.563533374493682e-05, + "loss": 0.0404, + "step": 46280 + }, + { + "epoch": 0.22645, + "grad_norm": 0.10641665756702423, + "learning_rate": 4.563299988440467e-05, + "loss": 0.0394, + "step": 46290 + }, + { + "epoch": 0.2265, + "grad_norm": 0.12104397267103195, + "learning_rate": 4.563066545977407e-05, + "loss": 0.0418, + "step": 46300 + }, + { + "epoch": 0.22655, + "grad_norm": 0.11271051317453384, + "learning_rate": 4.562833047110883e-05, + "loss": 0.0425, + "step": 46310 + }, + { + "epoch": 0.2266, + "grad_norm": 0.10609057545661926, + "learning_rate": 4.562599491847278e-05, + "loss": 0.0393, + "step": 46320 + }, + { + "epoch": 0.22665, + "grad_norm": 0.10337311774492264, + "learning_rate": 4.562365880192978e-05, + "loss": 0.0406, + "step": 46330 + }, + { + "epoch": 0.2267, + "grad_norm": 0.1276106983423233, + "learning_rate": 4.562132212154369e-05, + "loss": 0.0404, + "step": 46340 + }, + { + "epoch": 0.22675, + "grad_norm": 0.12957589328289032, + "learning_rate": 4.56189848773784e-05, + "loss": 0.0416, + "step": 46350 + }, + { + "epoch": 0.2268, + "grad_norm": 0.11019352823495865, + "learning_rate": 4.561664706949782e-05, + "loss": 0.0405, + "step": 46360 + }, + { + "epoch": 0.22685, + "grad_norm": 0.12697316706180573, + "learning_rate": 4.5614308697965845e-05, + "loss": 0.0391, + "step": 46370 + }, + { + "epoch": 0.2269, + "grad_norm": 0.13217809796333313, + "learning_rate": 4.5611969762846415e-05, + "loss": 0.04, + "step": 46380 + }, + { + "epoch": 0.22695, + "grad_norm": 0.12451531738042831, + "learning_rate": 4.560963026420349e-05, + "loss": 0.0408, + "step": 46390 + }, + { + "epoch": 0.227, + "grad_norm": 0.10132154077291489, + "learning_rate": 4.5607290202100996e-05, + "loss": 0.0394, + "step": 46400 + }, + { + "epoch": 0.22705, + "grad_norm": 0.10651516169309616, + "learning_rate": 4.560494957660294e-05, + "loss": 0.0396, + "step": 46410 + }, + { + "epoch": 0.2271, + "grad_norm": 0.18300026655197144, + "learning_rate": 4.560260838777331e-05, + "loss": 0.0406, + "step": 46420 + }, + { + "epoch": 0.22715, + "grad_norm": 0.13345034420490265, + "learning_rate": 4.5600266635676094e-05, + "loss": 0.0403, + "step": 46430 + }, + { + "epoch": 0.2272, + "grad_norm": 0.11327176541090012, + "learning_rate": 4.559792432037533e-05, + "loss": 0.0388, + "step": 46440 + }, + { + "epoch": 0.22725, + "grad_norm": 0.09436774998903275, + "learning_rate": 4.559558144193505e-05, + "loss": 0.0407, + "step": 46450 + }, + { + "epoch": 0.2273, + "grad_norm": 0.09983410686254501, + "learning_rate": 4.559323800041932e-05, + "loss": 0.0403, + "step": 46460 + }, + { + "epoch": 0.22735, + "grad_norm": 0.09111980348825455, + "learning_rate": 4.5590893995892196e-05, + "loss": 0.0401, + "step": 46470 + }, + { + "epoch": 0.2274, + "grad_norm": 0.12816768884658813, + "learning_rate": 4.5588549428417765e-05, + "loss": 0.0406, + "step": 46480 + }, + { + "epoch": 0.22745, + "grad_norm": 0.1014213114976883, + "learning_rate": 4.558620429806013e-05, + "loss": 0.0384, + "step": 46490 + }, + { + "epoch": 0.2275, + "grad_norm": 0.10942688584327698, + "learning_rate": 4.55838586048834e-05, + "loss": 0.0414, + "step": 46500 + }, + { + "epoch": 0.22755, + "grad_norm": 0.09891363978385925, + "learning_rate": 4.5581512348951706e-05, + "loss": 0.0395, + "step": 46510 + }, + { + "epoch": 0.2276, + "grad_norm": 0.12483636289834976, + "learning_rate": 4.55791655303292e-05, + "loss": 0.0429, + "step": 46520 + }, + { + "epoch": 0.22765, + "grad_norm": 0.10598792880773544, + "learning_rate": 4.5576818149080045e-05, + "loss": 0.0389, + "step": 46530 + }, + { + "epoch": 0.2277, + "grad_norm": 0.12170213460922241, + "learning_rate": 4.5574470205268406e-05, + "loss": 0.0388, + "step": 46540 + }, + { + "epoch": 0.22775, + "grad_norm": 0.11315925419330597, + "learning_rate": 4.5572121698958484e-05, + "loss": 0.0401, + "step": 46550 + }, + { + "epoch": 0.2278, + "grad_norm": 0.11891046166419983, + "learning_rate": 4.556977263021448e-05, + "loss": 0.041, + "step": 46560 + }, + { + "epoch": 0.22785, + "grad_norm": 0.0966760441660881, + "learning_rate": 4.5567422999100624e-05, + "loss": 0.0424, + "step": 46570 + }, + { + "epoch": 0.2279, + "grad_norm": 0.11074547469615936, + "learning_rate": 4.556507280568114e-05, + "loss": 0.0405, + "step": 46580 + }, + { + "epoch": 0.22795, + "grad_norm": 0.10060471296310425, + "learning_rate": 4.55627220500203e-05, + "loss": 0.0394, + "step": 46590 + }, + { + "epoch": 0.228, + "grad_norm": 0.10925568640232086, + "learning_rate": 4.5560370732182364e-05, + "loss": 0.0397, + "step": 46600 + }, + { + "epoch": 0.22805, + "grad_norm": 0.12439475953578949, + "learning_rate": 4.555801885223162e-05, + "loss": 0.0398, + "step": 46610 + }, + { + "epoch": 0.2281, + "grad_norm": 0.12491118907928467, + "learning_rate": 4.5555666410232356e-05, + "loss": 0.0414, + "step": 46620 + }, + { + "epoch": 0.22815, + "grad_norm": 0.10894277691841125, + "learning_rate": 4.55533134062489e-05, + "loss": 0.04, + "step": 46630 + }, + { + "epoch": 0.2282, + "grad_norm": 0.14506717026233673, + "learning_rate": 4.5550959840345574e-05, + "loss": 0.0398, + "step": 46640 + }, + { + "epoch": 0.22825, + "grad_norm": 0.10716503113508224, + "learning_rate": 4.554860571258673e-05, + "loss": 0.0398, + "step": 46650 + }, + { + "epoch": 0.2283, + "grad_norm": 0.11917222291231155, + "learning_rate": 4.554625102303672e-05, + "loss": 0.0404, + "step": 46660 + }, + { + "epoch": 0.22835, + "grad_norm": 0.1065252348780632, + "learning_rate": 4.554389577175993e-05, + "loss": 0.0397, + "step": 46670 + }, + { + "epoch": 0.2284, + "grad_norm": 0.14224746823310852, + "learning_rate": 4.554153995882074e-05, + "loss": 0.0398, + "step": 46680 + }, + { + "epoch": 0.22845, + "grad_norm": 0.11975722014904022, + "learning_rate": 4.553918358428358e-05, + "loss": 0.0379, + "step": 46690 + }, + { + "epoch": 0.2285, + "grad_norm": 0.12840095162391663, + "learning_rate": 4.5536826648212846e-05, + "loss": 0.04, + "step": 46700 + }, + { + "epoch": 0.22855, + "grad_norm": 0.1120993047952652, + "learning_rate": 4.553446915067299e-05, + "loss": 0.0406, + "step": 46710 + }, + { + "epoch": 0.2286, + "grad_norm": 0.16436734795570374, + "learning_rate": 4.5532111091728465e-05, + "loss": 0.0417, + "step": 46720 + }, + { + "epoch": 0.22865, + "grad_norm": 0.12969239056110382, + "learning_rate": 4.552975247144373e-05, + "loss": 0.0416, + "step": 46730 + }, + { + "epoch": 0.2287, + "grad_norm": 0.12759140133857727, + "learning_rate": 4.552739328988328e-05, + "loss": 0.0413, + "step": 46740 + }, + { + "epoch": 0.22875, + "grad_norm": 0.12827183306217194, + "learning_rate": 4.5525033547111604e-05, + "loss": 0.0407, + "step": 46750 + }, + { + "epoch": 0.2288, + "grad_norm": 0.12281138449907303, + "learning_rate": 4.5522673243193225e-05, + "loss": 0.0403, + "step": 46760 + }, + { + "epoch": 0.22885, + "grad_norm": 0.12596429884433746, + "learning_rate": 4.552031237819267e-05, + "loss": 0.0407, + "step": 46770 + }, + { + "epoch": 0.2289, + "grad_norm": 0.11390097439289093, + "learning_rate": 4.551795095217448e-05, + "loss": 0.0411, + "step": 46780 + }, + { + "epoch": 0.22895, + "grad_norm": 0.1110418513417244, + "learning_rate": 4.551558896520323e-05, + "loss": 0.0405, + "step": 46790 + }, + { + "epoch": 0.229, + "grad_norm": 0.10880877077579498, + "learning_rate": 4.551322641734347e-05, + "loss": 0.0384, + "step": 46800 + }, + { + "epoch": 0.22905, + "grad_norm": 0.09871169924736023, + "learning_rate": 4.551086330865981e-05, + "loss": 0.04, + "step": 46810 + }, + { + "epoch": 0.2291, + "grad_norm": 0.1078561469912529, + "learning_rate": 4.550849963921686e-05, + "loss": 0.0405, + "step": 46820 + }, + { + "epoch": 0.22915, + "grad_norm": 0.09892131388187408, + "learning_rate": 4.5506135409079234e-05, + "loss": 0.0415, + "step": 46830 + }, + { + "epoch": 0.2292, + "grad_norm": 0.11442753672599792, + "learning_rate": 4.550377061831156e-05, + "loss": 0.0411, + "step": 46840 + }, + { + "epoch": 0.22925, + "grad_norm": 0.10793804377317429, + "learning_rate": 4.550140526697851e-05, + "loss": 0.042, + "step": 46850 + }, + { + "epoch": 0.2293, + "grad_norm": 0.10234250128269196, + "learning_rate": 4.549903935514473e-05, + "loss": 0.0391, + "step": 46860 + }, + { + "epoch": 0.22935, + "grad_norm": 0.09293326735496521, + "learning_rate": 4.549667288287493e-05, + "loss": 0.0402, + "step": 46870 + }, + { + "epoch": 0.2294, + "grad_norm": 0.12015996873378754, + "learning_rate": 4.5494305850233786e-05, + "loss": 0.0392, + "step": 46880 + }, + { + "epoch": 0.22945, + "grad_norm": 0.1293002963066101, + "learning_rate": 4.549193825728602e-05, + "loss": 0.0404, + "step": 46890 + }, + { + "epoch": 0.2295, + "grad_norm": 0.09664689749479294, + "learning_rate": 4.548957010409636e-05, + "loss": 0.0383, + "step": 46900 + }, + { + "epoch": 0.22955, + "grad_norm": 0.10771577805280685, + "learning_rate": 4.548720139072955e-05, + "loss": 0.0404, + "step": 46910 + }, + { + "epoch": 0.2296, + "grad_norm": 0.10461598634719849, + "learning_rate": 4.5484832117250356e-05, + "loss": 0.0406, + "step": 46920 + }, + { + "epoch": 0.22965, + "grad_norm": 0.1101800948381424, + "learning_rate": 4.5482462283723545e-05, + "loss": 0.0384, + "step": 46930 + }, + { + "epoch": 0.2297, + "grad_norm": 0.1299954056739807, + "learning_rate": 4.548009189021391e-05, + "loss": 0.0413, + "step": 46940 + }, + { + "epoch": 0.22975, + "grad_norm": 0.1355685144662857, + "learning_rate": 4.547772093678626e-05, + "loss": 0.0403, + "step": 46950 + }, + { + "epoch": 0.2298, + "grad_norm": 0.12462537735700607, + "learning_rate": 4.547534942350541e-05, + "loss": 0.0384, + "step": 46960 + }, + { + "epoch": 0.22985, + "grad_norm": 0.11404204368591309, + "learning_rate": 4.54729773504362e-05, + "loss": 0.0393, + "step": 46970 + }, + { + "epoch": 0.2299, + "grad_norm": 0.11085730791091919, + "learning_rate": 4.547060471764347e-05, + "loss": 0.0412, + "step": 46980 + }, + { + "epoch": 0.22995, + "grad_norm": 0.09980040788650513, + "learning_rate": 4.5468231525192104e-05, + "loss": 0.0379, + "step": 46990 + }, + { + "epoch": 0.23, + "grad_norm": 0.12433990836143494, + "learning_rate": 4.546585777314698e-05, + "loss": 0.0391, + "step": 47000 + }, + { + "epoch": 0.23005, + "grad_norm": 0.11919044703245163, + "learning_rate": 4.5463483461572996e-05, + "loss": 0.0385, + "step": 47010 + }, + { + "epoch": 0.2301, + "grad_norm": 0.10812638700008392, + "learning_rate": 4.546110859053506e-05, + "loss": 0.0386, + "step": 47020 + }, + { + "epoch": 0.23015, + "grad_norm": 0.08290821313858032, + "learning_rate": 4.54587331600981e-05, + "loss": 0.0374, + "step": 47030 + }, + { + "epoch": 0.2302, + "grad_norm": 0.12326608598232269, + "learning_rate": 4.545635717032706e-05, + "loss": 0.0383, + "step": 47040 + }, + { + "epoch": 0.23025, + "grad_norm": 0.0903635025024414, + "learning_rate": 4.5453980621286904e-05, + "loss": 0.0382, + "step": 47050 + }, + { + "epoch": 0.2303, + "grad_norm": 0.11884330213069916, + "learning_rate": 4.54516035130426e-05, + "loss": 0.0393, + "step": 47060 + }, + { + "epoch": 0.23035, + "grad_norm": 0.10627713054418564, + "learning_rate": 4.544922584565914e-05, + "loss": 0.0386, + "step": 47070 + }, + { + "epoch": 0.2304, + "grad_norm": 0.12588046491146088, + "learning_rate": 4.5446847619201524e-05, + "loss": 0.0404, + "step": 47080 + }, + { + "epoch": 0.23045, + "grad_norm": 0.10804393887519836, + "learning_rate": 4.544446883373478e-05, + "loss": 0.0387, + "step": 47090 + }, + { + "epoch": 0.2305, + "grad_norm": 0.13833652436733246, + "learning_rate": 4.5442089489323933e-05, + "loss": 0.0396, + "step": 47100 + }, + { + "epoch": 0.23055, + "grad_norm": 0.11136481165885925, + "learning_rate": 4.543970958603405e-05, + "loss": 0.0397, + "step": 47110 + }, + { + "epoch": 0.2306, + "grad_norm": 0.10702042281627655, + "learning_rate": 4.5437329123930175e-05, + "loss": 0.0399, + "step": 47120 + }, + { + "epoch": 0.23065, + "grad_norm": 0.12892375886440277, + "learning_rate": 4.543494810307741e-05, + "loss": 0.0411, + "step": 47130 + }, + { + "epoch": 0.2307, + "grad_norm": 0.1187572181224823, + "learning_rate": 4.543256652354083e-05, + "loss": 0.0402, + "step": 47140 + }, + { + "epoch": 0.23075, + "grad_norm": 0.1277954876422882, + "learning_rate": 4.5430184385385565e-05, + "loss": 0.04, + "step": 47150 + }, + { + "epoch": 0.2308, + "grad_norm": 0.10876869410276413, + "learning_rate": 4.542780168867673e-05, + "loss": 0.0404, + "step": 47160 + }, + { + "epoch": 0.23085, + "grad_norm": 0.10255390405654907, + "learning_rate": 4.5425418433479475e-05, + "loss": 0.0381, + "step": 47170 + }, + { + "epoch": 0.2309, + "grad_norm": 0.10682419687509537, + "learning_rate": 4.542303461985895e-05, + "loss": 0.0385, + "step": 47180 + }, + { + "epoch": 0.23095, + "grad_norm": 0.0859571322798729, + "learning_rate": 4.5420650247880337e-05, + "loss": 0.0388, + "step": 47190 + }, + { + "epoch": 0.231, + "grad_norm": 0.09608227014541626, + "learning_rate": 4.541826531760881e-05, + "loss": 0.0391, + "step": 47200 + }, + { + "epoch": 0.23105, + "grad_norm": 0.1249164342880249, + "learning_rate": 4.5415879829109584e-05, + "loss": 0.0382, + "step": 47210 + }, + { + "epoch": 0.2311, + "grad_norm": 0.10630123317241669, + "learning_rate": 4.5413493782447866e-05, + "loss": 0.0397, + "step": 47220 + }, + { + "epoch": 0.23115, + "grad_norm": 0.11662591993808746, + "learning_rate": 4.5411107177688914e-05, + "loss": 0.0381, + "step": 47230 + }, + { + "epoch": 0.2312, + "grad_norm": 0.12181032449007034, + "learning_rate": 4.540872001489794e-05, + "loss": 0.0396, + "step": 47240 + }, + { + "epoch": 0.23125, + "grad_norm": 0.10808727890253067, + "learning_rate": 4.540633229414024e-05, + "loss": 0.0402, + "step": 47250 + }, + { + "epoch": 0.2313, + "grad_norm": 0.10237918049097061, + "learning_rate": 4.540394401548108e-05, + "loss": 0.0418, + "step": 47260 + }, + { + "epoch": 0.23135, + "grad_norm": 0.10050293803215027, + "learning_rate": 4.540155517898575e-05, + "loss": 0.0394, + "step": 47270 + }, + { + "epoch": 0.2314, + "grad_norm": 0.10733969509601593, + "learning_rate": 4.5399165784719574e-05, + "loss": 0.038, + "step": 47280 + }, + { + "epoch": 0.23145, + "grad_norm": 0.11772868782281876, + "learning_rate": 4.5396775832747876e-05, + "loss": 0.0408, + "step": 47290 + }, + { + "epoch": 0.2315, + "grad_norm": 0.09678518772125244, + "learning_rate": 4.5394385323135974e-05, + "loss": 0.0399, + "step": 47300 + }, + { + "epoch": 0.23155, + "grad_norm": 0.11706313490867615, + "learning_rate": 4.5391994255949245e-05, + "loss": 0.0388, + "step": 47310 + }, + { + "epoch": 0.2316, + "grad_norm": 0.09137055277824402, + "learning_rate": 4.5389602631253054e-05, + "loss": 0.0386, + "step": 47320 + }, + { + "epoch": 0.23165, + "grad_norm": 0.10179854184389114, + "learning_rate": 4.5387210449112785e-05, + "loss": 0.0392, + "step": 47330 + }, + { + "epoch": 0.2317, + "grad_norm": 0.12545840442180634, + "learning_rate": 4.538481770959384e-05, + "loss": 0.039, + "step": 47340 + }, + { + "epoch": 0.23175, + "grad_norm": 0.11552725732326508, + "learning_rate": 4.5382424412761635e-05, + "loss": 0.0387, + "step": 47350 + }, + { + "epoch": 0.2318, + "grad_norm": 0.09177439659833908, + "learning_rate": 4.538003055868162e-05, + "loss": 0.0393, + "step": 47360 + }, + { + "epoch": 0.23185, + "grad_norm": 0.07980625331401825, + "learning_rate": 4.537763614741921e-05, + "loss": 0.0403, + "step": 47370 + }, + { + "epoch": 0.2319, + "grad_norm": 0.10224027931690216, + "learning_rate": 4.5375241179039886e-05, + "loss": 0.0395, + "step": 47380 + }, + { + "epoch": 0.23195, + "grad_norm": 0.10371386259794235, + "learning_rate": 4.537284565360913e-05, + "loss": 0.0395, + "step": 47390 + }, + { + "epoch": 0.232, + "grad_norm": 0.1289975494146347, + "learning_rate": 4.537044957119242e-05, + "loss": 0.0408, + "step": 47400 + }, + { + "epoch": 0.23205, + "grad_norm": 0.11533493548631668, + "learning_rate": 4.536805293185527e-05, + "loss": 0.0391, + "step": 47410 + }, + { + "epoch": 0.2321, + "grad_norm": 0.12109930068254471, + "learning_rate": 4.5365655735663214e-05, + "loss": 0.0383, + "step": 47420 + }, + { + "epoch": 0.23215, + "grad_norm": 0.1250036507844925, + "learning_rate": 4.536325798268177e-05, + "loss": 0.0394, + "step": 47430 + }, + { + "epoch": 0.2322, + "grad_norm": 0.12080734968185425, + "learning_rate": 4.536085967297651e-05, + "loss": 0.0394, + "step": 47440 + }, + { + "epoch": 0.23225, + "grad_norm": 0.12277339398860931, + "learning_rate": 4.5358460806612996e-05, + "loss": 0.0425, + "step": 47450 + }, + { + "epoch": 0.2323, + "grad_norm": 0.11439141631126404, + "learning_rate": 4.535606138365681e-05, + "loss": 0.042, + "step": 47460 + }, + { + "epoch": 0.23235, + "grad_norm": 0.12071476131677628, + "learning_rate": 4.5353661404173554e-05, + "loss": 0.0405, + "step": 47470 + }, + { + "epoch": 0.2324, + "grad_norm": 0.12388350069522858, + "learning_rate": 4.535126086822884e-05, + "loss": 0.0404, + "step": 47480 + }, + { + "epoch": 0.23245, + "grad_norm": 0.15248411893844604, + "learning_rate": 4.53488597758883e-05, + "loss": 0.0405, + "step": 47490 + }, + { + "epoch": 0.2325, + "grad_norm": 0.12671367824077606, + "learning_rate": 4.534645812721758e-05, + "loss": 0.0388, + "step": 47500 + }, + { + "epoch": 0.23255, + "grad_norm": 0.14130564033985138, + "learning_rate": 4.534405592228233e-05, + "loss": 0.0393, + "step": 47510 + }, + { + "epoch": 0.2326, + "grad_norm": 0.10151517391204834, + "learning_rate": 4.534165316114825e-05, + "loss": 0.04, + "step": 47520 + }, + { + "epoch": 0.23265, + "grad_norm": 0.1347714066505432, + "learning_rate": 4.5339249843881004e-05, + "loss": 0.0425, + "step": 47530 + }, + { + "epoch": 0.2327, + "grad_norm": 0.11208470910787582, + "learning_rate": 4.5336845970546315e-05, + "loss": 0.0385, + "step": 47540 + }, + { + "epoch": 0.23275, + "grad_norm": 0.12243901938199997, + "learning_rate": 4.5334441541209895e-05, + "loss": 0.0394, + "step": 47550 + }, + { + "epoch": 0.2328, + "grad_norm": 0.10456179082393646, + "learning_rate": 4.5332036555937475e-05, + "loss": 0.0401, + "step": 47560 + }, + { + "epoch": 0.23285, + "grad_norm": 0.1470469832420349, + "learning_rate": 4.532963101479482e-05, + "loss": 0.0392, + "step": 47570 + }, + { + "epoch": 0.2329, + "grad_norm": 0.11980633437633514, + "learning_rate": 4.532722491784769e-05, + "loss": 0.0398, + "step": 47580 + }, + { + "epoch": 0.23295, + "grad_norm": 0.11149538308382034, + "learning_rate": 4.5324818265161875e-05, + "loss": 0.0395, + "step": 47590 + }, + { + "epoch": 0.233, + "grad_norm": 0.09943703562021255, + "learning_rate": 4.532241105680315e-05, + "loss": 0.0399, + "step": 47600 + }, + { + "epoch": 0.23305, + "grad_norm": 0.10191832482814789, + "learning_rate": 4.532000329283735e-05, + "loss": 0.0393, + "step": 47610 + }, + { + "epoch": 0.2331, + "grad_norm": 0.116791270673275, + "learning_rate": 4.531759497333029e-05, + "loss": 0.0408, + "step": 47620 + }, + { + "epoch": 0.23315, + "grad_norm": 0.09550857543945312, + "learning_rate": 4.5315186098347814e-05, + "loss": 0.0394, + "step": 47630 + }, + { + "epoch": 0.2332, + "grad_norm": 0.09740550816059113, + "learning_rate": 4.5312776667955795e-05, + "loss": 0.039, + "step": 47640 + }, + { + "epoch": 0.23325, + "grad_norm": 0.11140163987874985, + "learning_rate": 4.531036668222008e-05, + "loss": 0.0386, + "step": 47650 + }, + { + "epoch": 0.2333, + "grad_norm": 0.10831119120121002, + "learning_rate": 4.530795614120657e-05, + "loss": 0.0389, + "step": 47660 + }, + { + "epoch": 0.23335, + "grad_norm": 0.10964003950357437, + "learning_rate": 4.530554504498118e-05, + "loss": 0.0415, + "step": 47670 + }, + { + "epoch": 0.2334, + "grad_norm": 0.1138722375035286, + "learning_rate": 4.530313339360981e-05, + "loss": 0.0414, + "step": 47680 + }, + { + "epoch": 0.23345, + "grad_norm": 0.10521946847438812, + "learning_rate": 4.53007211871584e-05, + "loss": 0.0385, + "step": 47690 + }, + { + "epoch": 0.2335, + "grad_norm": 0.12359435856342316, + "learning_rate": 4.52983084256929e-05, + "loss": 0.0413, + "step": 47700 + }, + { + "epoch": 0.23355, + "grad_norm": 0.13060010969638824, + "learning_rate": 4.529589510927927e-05, + "loss": 0.0409, + "step": 47710 + }, + { + "epoch": 0.2336, + "grad_norm": 0.15324173867702484, + "learning_rate": 4.5293481237983506e-05, + "loss": 0.0411, + "step": 47720 + }, + { + "epoch": 0.23365, + "grad_norm": 0.11896300315856934, + "learning_rate": 4.529106681187158e-05, + "loss": 0.0407, + "step": 47730 + }, + { + "epoch": 0.2337, + "grad_norm": 0.10039672255516052, + "learning_rate": 4.52886518310095e-05, + "loss": 0.0407, + "step": 47740 + }, + { + "epoch": 0.23375, + "grad_norm": 0.11257708072662354, + "learning_rate": 4.528623629546331e-05, + "loss": 0.0395, + "step": 47750 + }, + { + "epoch": 0.2338, + "grad_norm": 0.09427443146705627, + "learning_rate": 4.5283820205299044e-05, + "loss": 0.0396, + "step": 47760 + }, + { + "epoch": 0.23385, + "grad_norm": 0.11360807716846466, + "learning_rate": 4.5281403560582754e-05, + "loss": 0.0382, + "step": 47770 + }, + { + "epoch": 0.2339, + "grad_norm": 0.11459067463874817, + "learning_rate": 4.52789863613805e-05, + "loss": 0.038, + "step": 47780 + }, + { + "epoch": 0.23395, + "grad_norm": 0.11873367428779602, + "learning_rate": 4.527656860775838e-05, + "loss": 0.0388, + "step": 47790 + }, + { + "epoch": 0.234, + "grad_norm": 0.1183047667145729, + "learning_rate": 4.52741502997825e-05, + "loss": 0.0386, + "step": 47800 + }, + { + "epoch": 0.23405, + "grad_norm": 0.10633184015750885, + "learning_rate": 4.527173143751897e-05, + "loss": 0.0396, + "step": 47810 + }, + { + "epoch": 0.2341, + "grad_norm": 0.10609755665063858, + "learning_rate": 4.526931202103391e-05, + "loss": 0.0392, + "step": 47820 + }, + { + "epoch": 0.23415, + "grad_norm": 0.12338245660066605, + "learning_rate": 4.526689205039347e-05, + "loss": 0.0384, + "step": 47830 + }, + { + "epoch": 0.2342, + "grad_norm": 0.13311438262462616, + "learning_rate": 4.526447152566382e-05, + "loss": 0.0386, + "step": 47840 + }, + { + "epoch": 0.23425, + "grad_norm": 0.10245085507631302, + "learning_rate": 4.526205044691114e-05, + "loss": 0.0383, + "step": 47850 + }, + { + "epoch": 0.2343, + "grad_norm": 0.13168932497501373, + "learning_rate": 4.5259628814201604e-05, + "loss": 0.0412, + "step": 47860 + }, + { + "epoch": 0.23435, + "grad_norm": 0.10075122863054276, + "learning_rate": 4.525720662760143e-05, + "loss": 0.0392, + "step": 47870 + }, + { + "epoch": 0.2344, + "grad_norm": 0.11399999260902405, + "learning_rate": 4.525478388717683e-05, + "loss": 0.0382, + "step": 47880 + }, + { + "epoch": 0.23445, + "grad_norm": 0.09355992823839188, + "learning_rate": 4.5252360592994056e-05, + "loss": 0.0408, + "step": 47890 + }, + { + "epoch": 0.2345, + "grad_norm": 0.10210133343935013, + "learning_rate": 4.524993674511935e-05, + "loss": 0.041, + "step": 47900 + }, + { + "epoch": 0.23455, + "grad_norm": 0.10055769979953766, + "learning_rate": 4.524751234361898e-05, + "loss": 0.0393, + "step": 47910 + }, + { + "epoch": 0.2346, + "grad_norm": 0.10071728378534317, + "learning_rate": 4.524508738855924e-05, + "loss": 0.0386, + "step": 47920 + }, + { + "epoch": 0.23465, + "grad_norm": 0.08796606957912445, + "learning_rate": 4.52426618800064e-05, + "loss": 0.0389, + "step": 47930 + }, + { + "epoch": 0.2347, + "grad_norm": 0.09659391641616821, + "learning_rate": 4.52402358180268e-05, + "loss": 0.0391, + "step": 47940 + }, + { + "epoch": 0.23475, + "grad_norm": 0.08772718906402588, + "learning_rate": 4.523780920268675e-05, + "loss": 0.0411, + "step": 47950 + }, + { + "epoch": 0.2348, + "grad_norm": 0.10829611867666245, + "learning_rate": 4.5235382034052596e-05, + "loss": 0.0414, + "step": 47960 + }, + { + "epoch": 0.23485, + "grad_norm": 0.08450762927532196, + "learning_rate": 4.523295431219071e-05, + "loss": 0.0409, + "step": 47970 + }, + { + "epoch": 0.2349, + "grad_norm": 0.10977165400981903, + "learning_rate": 4.5230526037167444e-05, + "loss": 0.0401, + "step": 47980 + }, + { + "epoch": 0.23495, + "grad_norm": 0.11166959255933762, + "learning_rate": 4.52280972090492e-05, + "loss": 0.0385, + "step": 47990 + }, + { + "epoch": 0.235, + "grad_norm": 0.10049823671579361, + "learning_rate": 4.522566782790238e-05, + "loss": 0.0403, + "step": 48000 + }, + { + "epoch": 0.23505, + "grad_norm": 0.10753172636032104, + "learning_rate": 4.52232378937934e-05, + "loss": 0.0402, + "step": 48010 + }, + { + "epoch": 0.2351, + "grad_norm": 0.11045978963375092, + "learning_rate": 4.52208074067887e-05, + "loss": 0.0383, + "step": 48020 + }, + { + "epoch": 0.23515, + "grad_norm": 0.10796971619129181, + "learning_rate": 4.521837636695471e-05, + "loss": 0.0434, + "step": 48030 + }, + { + "epoch": 0.2352, + "grad_norm": 0.18447628617286682, + "learning_rate": 4.521594477435791e-05, + "loss": 0.0419, + "step": 48040 + }, + { + "epoch": 0.23525, + "grad_norm": 0.1373187154531479, + "learning_rate": 4.521351262906478e-05, + "loss": 0.0409, + "step": 48050 + }, + { + "epoch": 0.2353, + "grad_norm": 0.11791710555553436, + "learning_rate": 4.5211079931141795e-05, + "loss": 0.0399, + "step": 48060 + }, + { + "epoch": 0.23535, + "grad_norm": 0.09290598332881927, + "learning_rate": 4.5208646680655495e-05, + "loss": 0.0393, + "step": 48070 + }, + { + "epoch": 0.2354, + "grad_norm": 0.09873346239328384, + "learning_rate": 4.520621287767237e-05, + "loss": 0.0394, + "step": 48080 + }, + { + "epoch": 0.23545, + "grad_norm": 0.09659668058156967, + "learning_rate": 4.520377852225899e-05, + "loss": 0.0392, + "step": 48090 + }, + { + "epoch": 0.2355, + "grad_norm": 0.11689428240060806, + "learning_rate": 4.520134361448189e-05, + "loss": 0.041, + "step": 48100 + }, + { + "epoch": 0.23555, + "grad_norm": 0.09561052173376083, + "learning_rate": 4.519890815440764e-05, + "loss": 0.0405, + "step": 48110 + }, + { + "epoch": 0.2356, + "grad_norm": 0.0902252271771431, + "learning_rate": 4.519647214210284e-05, + "loss": 0.0413, + "step": 48120 + }, + { + "epoch": 0.23565, + "grad_norm": 0.10735762864351273, + "learning_rate": 4.5194035577634075e-05, + "loss": 0.0392, + "step": 48130 + }, + { + "epoch": 0.2357, + "grad_norm": 0.11423599720001221, + "learning_rate": 4.5191598461067955e-05, + "loss": 0.0391, + "step": 48140 + }, + { + "epoch": 0.23575, + "grad_norm": 0.10585816204547882, + "learning_rate": 4.518916079247113e-05, + "loss": 0.039, + "step": 48150 + }, + { + "epoch": 0.2358, + "grad_norm": 0.11375124752521515, + "learning_rate": 4.518672257191023e-05, + "loss": 0.0396, + "step": 48160 + }, + { + "epoch": 0.23585, + "grad_norm": 0.1231110617518425, + "learning_rate": 4.5184283799451916e-05, + "loss": 0.0399, + "step": 48170 + }, + { + "epoch": 0.2359, + "grad_norm": 0.11836814880371094, + "learning_rate": 4.518184447516287e-05, + "loss": 0.041, + "step": 48180 + }, + { + "epoch": 0.23595, + "grad_norm": 0.11588375270366669, + "learning_rate": 4.517940459910978e-05, + "loss": 0.0402, + "step": 48190 + }, + { + "epoch": 0.236, + "grad_norm": 0.12206704169511795, + "learning_rate": 4.517696417135934e-05, + "loss": 0.0434, + "step": 48200 + }, + { + "epoch": 0.23605, + "grad_norm": 0.1477038711309433, + "learning_rate": 4.517452319197828e-05, + "loss": 0.041, + "step": 48210 + }, + { + "epoch": 0.2361, + "grad_norm": 0.13147269189357758, + "learning_rate": 4.5172081661033344e-05, + "loss": 0.0393, + "step": 48220 + }, + { + "epoch": 0.23615, + "grad_norm": 0.09250767529010773, + "learning_rate": 4.5169639578591274e-05, + "loss": 0.039, + "step": 48230 + }, + { + "epoch": 0.2362, + "grad_norm": 0.13637393712997437, + "learning_rate": 4.5167196944718824e-05, + "loss": 0.0393, + "step": 48240 + }, + { + "epoch": 0.23625, + "grad_norm": 0.1100025624036789, + "learning_rate": 4.516475375948279e-05, + "loss": 0.0401, + "step": 48250 + }, + { + "epoch": 0.2363, + "grad_norm": 0.11402478814125061, + "learning_rate": 4.516231002294997e-05, + "loss": 0.0385, + "step": 48260 + }, + { + "epoch": 0.23635, + "grad_norm": 0.0982499048113823, + "learning_rate": 4.5159865735187165e-05, + "loss": 0.0392, + "step": 48270 + }, + { + "epoch": 0.2364, + "grad_norm": 0.11894634366035461, + "learning_rate": 4.5157420896261205e-05, + "loss": 0.0392, + "step": 48280 + }, + { + "epoch": 0.23645, + "grad_norm": 0.08542770147323608, + "learning_rate": 4.5154975506238926e-05, + "loss": 0.0383, + "step": 48290 + }, + { + "epoch": 0.2365, + "grad_norm": 0.13653557002544403, + "learning_rate": 4.51525295651872e-05, + "loss": 0.0401, + "step": 48300 + }, + { + "epoch": 0.23655, + "grad_norm": 0.11575108021497726, + "learning_rate": 4.515008307317288e-05, + "loss": 0.04, + "step": 48310 + }, + { + "epoch": 0.2366, + "grad_norm": 0.08517330139875412, + "learning_rate": 4.5147636030262854e-05, + "loss": 0.0398, + "step": 48320 + }, + { + "epoch": 0.23665, + "grad_norm": 0.10020171105861664, + "learning_rate": 4.514518843652403e-05, + "loss": 0.0402, + "step": 48330 + }, + { + "epoch": 0.2367, + "grad_norm": 0.09228149801492691, + "learning_rate": 4.514274029202333e-05, + "loss": 0.039, + "step": 48340 + }, + { + "epoch": 0.23675, + "grad_norm": 0.09565956890583038, + "learning_rate": 4.514029159682767e-05, + "loss": 0.0373, + "step": 48350 + }, + { + "epoch": 0.2368, + "grad_norm": 0.08942577987909317, + "learning_rate": 4.5137842351004004e-05, + "loss": 0.0384, + "step": 48360 + }, + { + "epoch": 0.23685, + "grad_norm": 0.09256299585103989, + "learning_rate": 4.51353925546193e-05, + "loss": 0.0379, + "step": 48370 + }, + { + "epoch": 0.2369, + "grad_norm": 0.1037282645702362, + "learning_rate": 4.513294220774053e-05, + "loss": 0.0396, + "step": 48380 + }, + { + "epoch": 0.23695, + "grad_norm": 0.09529906511306763, + "learning_rate": 4.513049131043467e-05, + "loss": 0.0388, + "step": 48390 + }, + { + "epoch": 0.237, + "grad_norm": 0.09999002516269684, + "learning_rate": 4.5128039862768745e-05, + "loss": 0.0424, + "step": 48400 + }, + { + "epoch": 0.23705, + "grad_norm": 0.10632119327783585, + "learning_rate": 4.512558786480978e-05, + "loss": 0.0417, + "step": 48410 + }, + { + "epoch": 0.2371, + "grad_norm": 0.10591237992048264, + "learning_rate": 4.5123135316624796e-05, + "loss": 0.0381, + "step": 48420 + }, + { + "epoch": 0.23715, + "grad_norm": 0.09921521693468094, + "learning_rate": 4.512068221828086e-05, + "loss": 0.0399, + "step": 48430 + }, + { + "epoch": 0.2372, + "grad_norm": 0.0894329622387886, + "learning_rate": 4.511822856984502e-05, + "loss": 0.0418, + "step": 48440 + }, + { + "epoch": 0.23725, + "grad_norm": 0.09910974651575089, + "learning_rate": 4.5115774371384375e-05, + "loss": 0.039, + "step": 48450 + }, + { + "epoch": 0.2373, + "grad_norm": 0.09602376818656921, + "learning_rate": 4.511331962296602e-05, + "loss": 0.0397, + "step": 48460 + }, + { + "epoch": 0.23735, + "grad_norm": 0.09119140356779099, + "learning_rate": 4.511086432465705e-05, + "loss": 0.0425, + "step": 48470 + }, + { + "epoch": 0.2374, + "grad_norm": 0.11407288908958435, + "learning_rate": 4.510840847652462e-05, + "loss": 0.0405, + "step": 48480 + }, + { + "epoch": 0.23745, + "grad_norm": 0.11424467712640762, + "learning_rate": 4.510595207863585e-05, + "loss": 0.0391, + "step": 48490 + }, + { + "epoch": 0.2375, + "grad_norm": 0.10439980030059814, + "learning_rate": 4.5103495131057904e-05, + "loss": 0.0394, + "step": 48500 + }, + { + "epoch": 0.23755, + "grad_norm": 0.11426830291748047, + "learning_rate": 4.510103763385795e-05, + "loss": 0.0385, + "step": 48510 + }, + { + "epoch": 0.2376, + "grad_norm": 0.1235620304942131, + "learning_rate": 4.5098579587103186e-05, + "loss": 0.0376, + "step": 48520 + }, + { + "epoch": 0.23765, + "grad_norm": 0.12331556528806686, + "learning_rate": 4.50961209908608e-05, + "loss": 0.0401, + "step": 48530 + }, + { + "epoch": 0.2377, + "grad_norm": 0.08665104955434799, + "learning_rate": 4.509366184519802e-05, + "loss": 0.0386, + "step": 48540 + }, + { + "epoch": 0.23775, + "grad_norm": 0.09209824353456497, + "learning_rate": 4.5091202150182064e-05, + "loss": 0.0411, + "step": 48550 + }, + { + "epoch": 0.2378, + "grad_norm": 0.09111570566892624, + "learning_rate": 4.508874190588021e-05, + "loss": 0.0389, + "step": 48560 + }, + { + "epoch": 0.23785, + "grad_norm": 0.09899560362100601, + "learning_rate": 4.508628111235968e-05, + "loss": 0.0399, + "step": 48570 + }, + { + "epoch": 0.2379, + "grad_norm": 0.10667652636766434, + "learning_rate": 4.5083819769687776e-05, + "loss": 0.0385, + "step": 48580 + }, + { + "epoch": 0.23795, + "grad_norm": 0.10386928170919418, + "learning_rate": 4.508135787793178e-05, + "loss": 0.038, + "step": 48590 + }, + { + "epoch": 0.238, + "grad_norm": 0.10738810151815414, + "learning_rate": 4.5078895437159016e-05, + "loss": 0.0374, + "step": 48600 + }, + { + "epoch": 0.23805, + "grad_norm": 0.11245346814393997, + "learning_rate": 4.507643244743679e-05, + "loss": 0.0384, + "step": 48610 + }, + { + "epoch": 0.2381, + "grad_norm": 0.09302639216184616, + "learning_rate": 4.5073968908832446e-05, + "loss": 0.038, + "step": 48620 + }, + { + "epoch": 0.23815, + "grad_norm": 0.10083436220884323, + "learning_rate": 4.5071504821413326e-05, + "loss": 0.0394, + "step": 48630 + }, + { + "epoch": 0.2382, + "grad_norm": 0.10127535462379456, + "learning_rate": 4.5069040185246805e-05, + "loss": 0.0387, + "step": 48640 + }, + { + "epoch": 0.23825, + "grad_norm": 0.12256845831871033, + "learning_rate": 4.5066575000400265e-05, + "loss": 0.0395, + "step": 48650 + }, + { + "epoch": 0.2383, + "grad_norm": 0.10586626827716827, + "learning_rate": 4.5064109266941104e-05, + "loss": 0.0368, + "step": 48660 + }, + { + "epoch": 0.23835, + "grad_norm": 0.10445740073919296, + "learning_rate": 4.506164298493674e-05, + "loss": 0.0376, + "step": 48670 + }, + { + "epoch": 0.2384, + "grad_norm": 0.12181146442890167, + "learning_rate": 4.5059176154454586e-05, + "loss": 0.0379, + "step": 48680 + }, + { + "epoch": 0.23845, + "grad_norm": 0.11048153042793274, + "learning_rate": 4.5056708775562096e-05, + "loss": 0.0399, + "step": 48690 + }, + { + "epoch": 0.2385, + "grad_norm": 0.1106812134385109, + "learning_rate": 4.505424084832672e-05, + "loss": 0.0387, + "step": 48700 + }, + { + "epoch": 0.23855, + "grad_norm": 0.09739340096712112, + "learning_rate": 4.505177237281594e-05, + "loss": 0.0424, + "step": 48710 + }, + { + "epoch": 0.2386, + "grad_norm": 0.1155017539858818, + "learning_rate": 4.504930334909723e-05, + "loss": 0.0397, + "step": 48720 + }, + { + "epoch": 0.23865, + "grad_norm": 0.13820934295654297, + "learning_rate": 4.50468337772381e-05, + "loss": 0.0391, + "step": 48730 + }, + { + "epoch": 0.2387, + "grad_norm": 0.12262959033250809, + "learning_rate": 4.5044363657306055e-05, + "loss": 0.0398, + "step": 48740 + }, + { + "epoch": 0.23875, + "grad_norm": 0.11455817520618439, + "learning_rate": 4.504189298936865e-05, + "loss": 0.0398, + "step": 48750 + }, + { + "epoch": 0.2388, + "grad_norm": 0.12329459190368652, + "learning_rate": 4.5039421773493417e-05, + "loss": 0.0401, + "step": 48760 + }, + { + "epoch": 0.23885, + "grad_norm": 0.10852232575416565, + "learning_rate": 4.5036950009747925e-05, + "loss": 0.0397, + "step": 48770 + }, + { + "epoch": 0.2389, + "grad_norm": 0.11602942645549774, + "learning_rate": 4.503447769819974e-05, + "loss": 0.0392, + "step": 48780 + }, + { + "epoch": 0.23895, + "grad_norm": 0.11058322340250015, + "learning_rate": 4.503200483891647e-05, + "loss": 0.0391, + "step": 48790 + }, + { + "epoch": 0.239, + "grad_norm": 0.1164764016866684, + "learning_rate": 4.502953143196571e-05, + "loss": 0.0398, + "step": 48800 + }, + { + "epoch": 0.23905, + "grad_norm": 0.10696570575237274, + "learning_rate": 4.502705747741508e-05, + "loss": 0.0387, + "step": 48810 + }, + { + "epoch": 0.2391, + "grad_norm": 0.10800722241401672, + "learning_rate": 4.502458297533223e-05, + "loss": 0.0417, + "step": 48820 + }, + { + "epoch": 0.23915, + "grad_norm": 0.09370379149913788, + "learning_rate": 4.502210792578481e-05, + "loss": 0.039, + "step": 48830 + }, + { + "epoch": 0.2392, + "grad_norm": 0.11608373373746872, + "learning_rate": 4.501963232884047e-05, + "loss": 0.0409, + "step": 48840 + }, + { + "epoch": 0.23925, + "grad_norm": 0.09210273623466492, + "learning_rate": 4.50171561845669e-05, + "loss": 0.0395, + "step": 48850 + }, + { + "epoch": 0.2393, + "grad_norm": 0.12715387344360352, + "learning_rate": 4.501467949303181e-05, + "loss": 0.0441, + "step": 48860 + }, + { + "epoch": 0.23935, + "grad_norm": 0.11636749655008316, + "learning_rate": 4.5012202254302894e-05, + "loss": 0.0417, + "step": 48870 + }, + { + "epoch": 0.2394, + "grad_norm": 0.12367291003465652, + "learning_rate": 4.500972446844789e-05, + "loss": 0.04, + "step": 48880 + }, + { + "epoch": 0.23945, + "grad_norm": 0.11022377759218216, + "learning_rate": 4.500724613553454e-05, + "loss": 0.0403, + "step": 48890 + }, + { + "epoch": 0.2395, + "grad_norm": 0.115224689245224, + "learning_rate": 4.500476725563059e-05, + "loss": 0.0406, + "step": 48900 + }, + { + "epoch": 0.23955, + "grad_norm": 0.12038897722959518, + "learning_rate": 4.500228782880382e-05, + "loss": 0.0408, + "step": 48910 + }, + { + "epoch": 0.2396, + "grad_norm": 0.17151667177677155, + "learning_rate": 4.4999807855122025e-05, + "loss": 0.0406, + "step": 48920 + }, + { + "epoch": 0.23965, + "grad_norm": 0.12789474427700043, + "learning_rate": 4.4997327334652984e-05, + "loss": 0.039, + "step": 48930 + }, + { + "epoch": 0.2397, + "grad_norm": 0.10562655329704285, + "learning_rate": 4.499484626746453e-05, + "loss": 0.0376, + "step": 48940 + }, + { + "epoch": 0.23975, + "grad_norm": 0.11011016368865967, + "learning_rate": 4.4992364653624495e-05, + "loss": 0.0392, + "step": 48950 + }, + { + "epoch": 0.2398, + "grad_norm": 0.09995349496603012, + "learning_rate": 4.498988249320072e-05, + "loss": 0.0413, + "step": 48960 + }, + { + "epoch": 0.23985, + "grad_norm": 0.10026644915342331, + "learning_rate": 4.4987399786261064e-05, + "loss": 0.0396, + "step": 48970 + }, + { + "epoch": 0.2399, + "grad_norm": 0.10839217156171799, + "learning_rate": 4.498491653287341e-05, + "loss": 0.039, + "step": 48980 + }, + { + "epoch": 0.23995, + "grad_norm": 0.1049165427684784, + "learning_rate": 4.4982432733105646e-05, + "loss": 0.0399, + "step": 48990 + }, + { + "epoch": 0.24, + "grad_norm": 0.08821311593055725, + "learning_rate": 4.4979948387025675e-05, + "loss": 0.0386, + "step": 49000 + }, + { + "epoch": 0.24005, + "grad_norm": 0.09977595508098602, + "learning_rate": 4.497746349470142e-05, + "loss": 0.0391, + "step": 49010 + }, + { + "epoch": 0.2401, + "grad_norm": 0.10528219491243362, + "learning_rate": 4.497497805620082e-05, + "loss": 0.0375, + "step": 49020 + }, + { + "epoch": 0.24015, + "grad_norm": 0.11685045063495636, + "learning_rate": 4.497249207159183e-05, + "loss": 0.0407, + "step": 49030 + }, + { + "epoch": 0.2402, + "grad_norm": 0.10669083148241043, + "learning_rate": 4.4970005540942405e-05, + "loss": 0.04, + "step": 49040 + }, + { + "epoch": 0.24025, + "grad_norm": 0.09289010614156723, + "learning_rate": 4.496751846432053e-05, + "loss": 0.0385, + "step": 49050 + }, + { + "epoch": 0.2403, + "grad_norm": 0.13816522061824799, + "learning_rate": 4.496503084179421e-05, + "loss": 0.0393, + "step": 49060 + }, + { + "epoch": 0.24035, + "grad_norm": 0.14262689650058746, + "learning_rate": 4.4962542673431434e-05, + "loss": 0.0404, + "step": 49070 + }, + { + "epoch": 0.2404, + "grad_norm": 0.10469383001327515, + "learning_rate": 4.4960053959300254e-05, + "loss": 0.039, + "step": 49080 + }, + { + "epoch": 0.24045, + "grad_norm": 0.09244794398546219, + "learning_rate": 4.495756469946869e-05, + "loss": 0.0384, + "step": 49090 + }, + { + "epoch": 0.2405, + "grad_norm": 0.10283170640468597, + "learning_rate": 4.49550748940048e-05, + "loss": 0.0388, + "step": 49100 + }, + { + "epoch": 0.24055, + "grad_norm": 0.12514524161815643, + "learning_rate": 4.4952584542976664e-05, + "loss": 0.041, + "step": 49110 + }, + { + "epoch": 0.2406, + "grad_norm": 0.09727492928504944, + "learning_rate": 4.495009364645236e-05, + "loss": 0.0389, + "step": 49120 + }, + { + "epoch": 0.24065, + "grad_norm": 0.11048632115125656, + "learning_rate": 4.49476022045e-05, + "loss": 0.0411, + "step": 49130 + }, + { + "epoch": 0.2407, + "grad_norm": 0.09122204035520554, + "learning_rate": 4.494511021718768e-05, + "loss": 0.0398, + "step": 49140 + }, + { + "epoch": 0.24075, + "grad_norm": 0.1161789745092392, + "learning_rate": 4.4942617684583546e-05, + "loss": 0.0418, + "step": 49150 + }, + { + "epoch": 0.2408, + "grad_norm": 0.11543729901313782, + "learning_rate": 4.4940124606755734e-05, + "loss": 0.0419, + "step": 49160 + }, + { + "epoch": 0.24085, + "grad_norm": 0.12771295011043549, + "learning_rate": 4.493763098377241e-05, + "loss": 0.0404, + "step": 49170 + }, + { + "epoch": 0.2409, + "grad_norm": 0.0894574448466301, + "learning_rate": 4.493513681570174e-05, + "loss": 0.0391, + "step": 49180 + }, + { + "epoch": 0.24095, + "grad_norm": 0.08314885199069977, + "learning_rate": 4.493264210261192e-05, + "loss": 0.0402, + "step": 49190 + }, + { + "epoch": 0.241, + "grad_norm": 0.09199110418558121, + "learning_rate": 4.4930146844571156e-05, + "loss": 0.0448, + "step": 49200 + }, + { + "epoch": 0.24105, + "grad_norm": 0.08660294115543365, + "learning_rate": 4.4927651041647654e-05, + "loss": 0.0389, + "step": 49210 + }, + { + "epoch": 0.2411, + "grad_norm": 0.09824992716312408, + "learning_rate": 4.4925154693909674e-05, + "loss": 0.0403, + "step": 49220 + }, + { + "epoch": 0.24115, + "grad_norm": 0.10209884494543076, + "learning_rate": 4.492265780142544e-05, + "loss": 0.0422, + "step": 49230 + }, + { + "epoch": 0.2412, + "grad_norm": 0.09823673963546753, + "learning_rate": 4.4920160364263234e-05, + "loss": 0.0403, + "step": 49240 + }, + { + "epoch": 0.24125, + "grad_norm": 0.11731412261724472, + "learning_rate": 4.491766238249132e-05, + "loss": 0.0405, + "step": 49250 + }, + { + "epoch": 0.2413, + "grad_norm": 0.11141234636306763, + "learning_rate": 4.4915163856178e-05, + "loss": 0.0391, + "step": 49260 + }, + { + "epoch": 0.24135, + "grad_norm": 0.1311003714799881, + "learning_rate": 4.4912664785391584e-05, + "loss": 0.0393, + "step": 49270 + }, + { + "epoch": 0.2414, + "grad_norm": 0.09752027690410614, + "learning_rate": 4.491016517020039e-05, + "loss": 0.0387, + "step": 49280 + }, + { + "epoch": 0.24145, + "grad_norm": 0.09724754840135574, + "learning_rate": 4.4907665010672765e-05, + "loss": 0.0384, + "step": 49290 + }, + { + "epoch": 0.2415, + "grad_norm": 0.11260055750608444, + "learning_rate": 4.4905164306877055e-05, + "loss": 0.0381, + "step": 49300 + }, + { + "epoch": 0.24155, + "grad_norm": 0.11821656674146652, + "learning_rate": 4.4902663058881636e-05, + "loss": 0.0383, + "step": 49310 + }, + { + "epoch": 0.2416, + "grad_norm": 0.13511382043361664, + "learning_rate": 4.490016126675488e-05, + "loss": 0.0401, + "step": 49320 + }, + { + "epoch": 0.24165, + "grad_norm": 0.11218443512916565, + "learning_rate": 4.4897658930565196e-05, + "loss": 0.0405, + "step": 49330 + }, + { + "epoch": 0.2417, + "grad_norm": 0.11669328063726425, + "learning_rate": 4.4895156050380994e-05, + "loss": 0.0387, + "step": 49340 + }, + { + "epoch": 0.24175, + "grad_norm": 0.09982103109359741, + "learning_rate": 4.489265262627069e-05, + "loss": 0.0397, + "step": 49350 + }, + { + "epoch": 0.2418, + "grad_norm": 0.09687146544456482, + "learning_rate": 4.489014865830274e-05, + "loss": 0.039, + "step": 49360 + }, + { + "epoch": 0.24185, + "grad_norm": 0.10066474229097366, + "learning_rate": 4.4887644146545605e-05, + "loss": 0.0383, + "step": 49370 + }, + { + "epoch": 0.2419, + "grad_norm": 0.09016059339046478, + "learning_rate": 4.488513909106774e-05, + "loss": 0.0382, + "step": 49380 + }, + { + "epoch": 0.24195, + "grad_norm": 0.09374802559614182, + "learning_rate": 4.4882633491937654e-05, + "loss": 0.0399, + "step": 49390 + }, + { + "epoch": 0.242, + "grad_norm": 0.09328625351190567, + "learning_rate": 4.488012734922383e-05, + "loss": 0.038, + "step": 49400 + }, + { + "epoch": 0.24205, + "grad_norm": 0.0893532782793045, + "learning_rate": 4.487762066299479e-05, + "loss": 0.0384, + "step": 49410 + }, + { + "epoch": 0.2421, + "grad_norm": 0.09541701525449753, + "learning_rate": 4.487511343331908e-05, + "loss": 0.0383, + "step": 49420 + }, + { + "epoch": 0.24215, + "grad_norm": 0.08737003803253174, + "learning_rate": 4.4872605660265227e-05, + "loss": 0.0388, + "step": 49430 + }, + { + "epoch": 0.2422, + "grad_norm": 0.10412393510341644, + "learning_rate": 4.48700973439018e-05, + "loss": 0.0385, + "step": 49440 + }, + { + "epoch": 0.24225, + "grad_norm": 0.09979381412267685, + "learning_rate": 4.486758848429738e-05, + "loss": 0.0388, + "step": 49450 + }, + { + "epoch": 0.2423, + "grad_norm": 0.10434600710868835, + "learning_rate": 4.486507908152055e-05, + "loss": 0.0387, + "step": 49460 + }, + { + "epoch": 0.24235, + "grad_norm": 0.0980309322476387, + "learning_rate": 4.4862569135639934e-05, + "loss": 0.0386, + "step": 49470 + }, + { + "epoch": 0.2424, + "grad_norm": 0.10557007789611816, + "learning_rate": 4.486005864672412e-05, + "loss": 0.0379, + "step": 49480 + }, + { + "epoch": 0.24245, + "grad_norm": 0.09704895317554474, + "learning_rate": 4.485754761484178e-05, + "loss": 0.039, + "step": 49490 + }, + { + "epoch": 0.2425, + "grad_norm": 0.08014574646949768, + "learning_rate": 4.485503604006154e-05, + "loss": 0.0383, + "step": 49500 + }, + { + "epoch": 0.24255, + "grad_norm": 0.08936386555433273, + "learning_rate": 4.4852523922452084e-05, + "loss": 0.0371, + "step": 49510 + }, + { + "epoch": 0.2426, + "grad_norm": 0.11614465713500977, + "learning_rate": 4.485001126208207e-05, + "loss": 0.0388, + "step": 49520 + }, + { + "epoch": 0.24265, + "grad_norm": 0.08956684917211533, + "learning_rate": 4.484749805902021e-05, + "loss": 0.0387, + "step": 49530 + }, + { + "epoch": 0.2427, + "grad_norm": 0.08727693557739258, + "learning_rate": 4.484498431333521e-05, + "loss": 0.0381, + "step": 49540 + }, + { + "epoch": 0.24275, + "grad_norm": 0.0980883464217186, + "learning_rate": 4.48424700250958e-05, + "loss": 0.038, + "step": 49550 + }, + { + "epoch": 0.2428, + "grad_norm": 0.11391928046941757, + "learning_rate": 4.483995519437071e-05, + "loss": 0.0379, + "step": 49560 + }, + { + "epoch": 0.24285, + "grad_norm": 0.11534087359905243, + "learning_rate": 4.483743982122869e-05, + "loss": 0.0417, + "step": 49570 + }, + { + "epoch": 0.2429, + "grad_norm": 0.11695587635040283, + "learning_rate": 4.483492390573853e-05, + "loss": 0.0398, + "step": 49580 + }, + { + "epoch": 0.24295, + "grad_norm": 0.12372042238712311, + "learning_rate": 4.4832407447968994e-05, + "loss": 0.0411, + "step": 49590 + }, + { + "epoch": 0.243, + "grad_norm": 0.10684667527675629, + "learning_rate": 4.482989044798889e-05, + "loss": 0.0388, + "step": 49600 + }, + { + "epoch": 0.24305, + "grad_norm": 0.10489478707313538, + "learning_rate": 4.482737290586703e-05, + "loss": 0.0414, + "step": 49610 + }, + { + "epoch": 0.2431, + "grad_norm": 0.11063244938850403, + "learning_rate": 4.4824854821672245e-05, + "loss": 0.0388, + "step": 49620 + }, + { + "epoch": 0.24315, + "grad_norm": 0.1027778759598732, + "learning_rate": 4.482233619547338e-05, + "loss": 0.0416, + "step": 49630 + }, + { + "epoch": 0.2432, + "grad_norm": 0.11320636421442032, + "learning_rate": 4.481981702733929e-05, + "loss": 0.0402, + "step": 49640 + }, + { + "epoch": 0.24325, + "grad_norm": 0.09724336862564087, + "learning_rate": 4.481729731733885e-05, + "loss": 0.0403, + "step": 49650 + }, + { + "epoch": 0.2433, + "grad_norm": 0.11856746673583984, + "learning_rate": 4.4814777065540936e-05, + "loss": 0.0393, + "step": 49660 + }, + { + "epoch": 0.24335, + "grad_norm": 0.10303764790296555, + "learning_rate": 4.481225627201448e-05, + "loss": 0.0402, + "step": 49670 + }, + { + "epoch": 0.2434, + "grad_norm": 0.0987589880824089, + "learning_rate": 4.4809734936828365e-05, + "loss": 0.0414, + "step": 49680 + }, + { + "epoch": 0.24345, + "grad_norm": 0.12305308878421783, + "learning_rate": 4.480721306005154e-05, + "loss": 0.0424, + "step": 49690 + }, + { + "epoch": 0.2435, + "grad_norm": 0.11457843333482742, + "learning_rate": 4.4804690641752955e-05, + "loss": 0.0386, + "step": 49700 + }, + { + "epoch": 0.24355, + "grad_norm": 0.15410658717155457, + "learning_rate": 4.480216768200157e-05, + "loss": 0.0408, + "step": 49710 + }, + { + "epoch": 0.2436, + "grad_norm": 0.13668401539325714, + "learning_rate": 4.479964418086635e-05, + "loss": 0.04, + "step": 49720 + }, + { + "epoch": 0.24365, + "grad_norm": 0.1328498274087906, + "learning_rate": 4.47971201384163e-05, + "loss": 0.039, + "step": 49730 + }, + { + "epoch": 0.2437, + "grad_norm": 0.11546991765499115, + "learning_rate": 4.479459555472043e-05, + "loss": 0.0389, + "step": 49740 + }, + { + "epoch": 0.24375, + "grad_norm": 0.09257245808839798, + "learning_rate": 4.479207042984775e-05, + "loss": 0.0372, + "step": 49750 + }, + { + "epoch": 0.2438, + "grad_norm": 0.11340250819921494, + "learning_rate": 4.4789544763867304e-05, + "loss": 0.039, + "step": 49760 + }, + { + "epoch": 0.24385, + "grad_norm": 0.11558615416288376, + "learning_rate": 4.478701855684814e-05, + "loss": 0.0394, + "step": 49770 + }, + { + "epoch": 0.2439, + "grad_norm": 0.10287030786275864, + "learning_rate": 4.4784491808859314e-05, + "loss": 0.0395, + "step": 49780 + }, + { + "epoch": 0.24395, + "grad_norm": 0.1381189078092575, + "learning_rate": 4.478196451996992e-05, + "loss": 0.0404, + "step": 49790 + }, + { + "epoch": 0.244, + "grad_norm": 0.136036217212677, + "learning_rate": 4.4779436690249045e-05, + "loss": 0.0393, + "step": 49800 + }, + { + "epoch": 0.24405, + "grad_norm": 0.10182151943445206, + "learning_rate": 4.4776908319765797e-05, + "loss": 0.0383, + "step": 49810 + }, + { + "epoch": 0.2441, + "grad_norm": 0.1141929030418396, + "learning_rate": 4.477437940858932e-05, + "loss": 0.0394, + "step": 49820 + }, + { + "epoch": 0.24415, + "grad_norm": 0.10495369881391525, + "learning_rate": 4.477184995678872e-05, + "loss": 0.0383, + "step": 49830 + }, + { + "epoch": 0.2442, + "grad_norm": 0.0963873639702797, + "learning_rate": 4.476931996443319e-05, + "loss": 0.0392, + "step": 49840 + }, + { + "epoch": 0.24425, + "grad_norm": 0.0951836109161377, + "learning_rate": 4.476678943159186e-05, + "loss": 0.0408, + "step": 49850 + }, + { + "epoch": 0.2443, + "grad_norm": 0.09826401621103287, + "learning_rate": 4.476425835833394e-05, + "loss": 0.039, + "step": 49860 + }, + { + "epoch": 0.24435, + "grad_norm": 0.09092506021261215, + "learning_rate": 4.4761726744728626e-05, + "loss": 0.0383, + "step": 49870 + }, + { + "epoch": 0.2444, + "grad_norm": 0.08619531244039536, + "learning_rate": 4.4759194590845136e-05, + "loss": 0.0388, + "step": 49880 + }, + { + "epoch": 0.24445, + "grad_norm": 0.09693659096956253, + "learning_rate": 4.4756661896752675e-05, + "loss": 0.0392, + "step": 49890 + }, + { + "epoch": 0.2445, + "grad_norm": 0.09849540889263153, + "learning_rate": 4.47541286625205e-05, + "loss": 0.0384, + "step": 49900 + }, + { + "epoch": 0.24455, + "grad_norm": 0.09277253597974777, + "learning_rate": 4.475159488821787e-05, + "loss": 0.0395, + "step": 49910 + }, + { + "epoch": 0.2446, + "grad_norm": 0.09384094178676605, + "learning_rate": 4.474906057391406e-05, + "loss": 0.0391, + "step": 49920 + }, + { + "epoch": 0.24465, + "grad_norm": 0.11941733211278915, + "learning_rate": 4.474652571967834e-05, + "loss": 0.0387, + "step": 49930 + }, + { + "epoch": 0.2447, + "grad_norm": 0.14373308420181274, + "learning_rate": 4.474399032558004e-05, + "loss": 0.043, + "step": 49940 + }, + { + "epoch": 0.24475, + "grad_norm": 0.1225065290927887, + "learning_rate": 4.474145439168846e-05, + "loss": 0.0395, + "step": 49950 + }, + { + "epoch": 0.2448, + "grad_norm": 0.10563571751117706, + "learning_rate": 4.473891791807293e-05, + "loss": 0.0391, + "step": 49960 + }, + { + "epoch": 0.24485, + "grad_norm": 0.11376042664051056, + "learning_rate": 4.4736380904802796e-05, + "loss": 0.0396, + "step": 49970 + }, + { + "epoch": 0.2449, + "grad_norm": 0.12583886086940765, + "learning_rate": 4.4733843351947434e-05, + "loss": 0.0397, + "step": 49980 + }, + { + "epoch": 0.24495, + "grad_norm": 0.10607270151376724, + "learning_rate": 4.47313052595762e-05, + "loss": 0.0426, + "step": 49990 + }, + { + "epoch": 0.245, + "grad_norm": 0.10916159301996231, + "learning_rate": 4.47287666277585e-05, + "loss": 0.0398, + "step": 50000 + }, + { + "epoch": 0.24505, + "grad_norm": 0.13387051224708557, + "learning_rate": 4.472622745656372e-05, + "loss": 0.039, + "step": 50010 + }, + { + "epoch": 0.2451, + "grad_norm": 0.10772855579853058, + "learning_rate": 4.4723687746061305e-05, + "loss": 0.0384, + "step": 50020 + }, + { + "epoch": 0.24515, + "grad_norm": 0.11364582926034927, + "learning_rate": 4.472114749632067e-05, + "loss": 0.0382, + "step": 50030 + }, + { + "epoch": 0.2452, + "grad_norm": 0.121206134557724, + "learning_rate": 4.471860670741127e-05, + "loss": 0.0397, + "step": 50040 + }, + { + "epoch": 0.24525, + "grad_norm": 0.13348430395126343, + "learning_rate": 4.471606537940257e-05, + "loss": 0.0404, + "step": 50050 + }, + { + "epoch": 0.2453, + "grad_norm": 0.11775519698858261, + "learning_rate": 4.471352351236406e-05, + "loss": 0.0425, + "step": 50060 + }, + { + "epoch": 0.24535, + "grad_norm": 0.13693532347679138, + "learning_rate": 4.4710981106365214e-05, + "loss": 0.0414, + "step": 50070 + }, + { + "epoch": 0.2454, + "grad_norm": 0.10618642717599869, + "learning_rate": 4.470843816147555e-05, + "loss": 0.0378, + "step": 50080 + }, + { + "epoch": 0.24545, + "grad_norm": 0.10738607496023178, + "learning_rate": 4.470589467776459e-05, + "loss": 0.0412, + "step": 50090 + }, + { + "epoch": 0.2455, + "grad_norm": 0.12106368690729141, + "learning_rate": 4.4703350655301876e-05, + "loss": 0.0415, + "step": 50100 + }, + { + "epoch": 0.24555, + "grad_norm": 0.1166282445192337, + "learning_rate": 4.4700806094156955e-05, + "loss": 0.0393, + "step": 50110 + }, + { + "epoch": 0.2456, + "grad_norm": 0.1051270067691803, + "learning_rate": 4.4698260994399396e-05, + "loss": 0.0388, + "step": 50120 + }, + { + "epoch": 0.24565, + "grad_norm": 0.10988081246614456, + "learning_rate": 4.469571535609879e-05, + "loss": 0.0384, + "step": 50130 + }, + { + "epoch": 0.2457, + "grad_norm": 0.11927267909049988, + "learning_rate": 4.469316917932472e-05, + "loss": 0.0395, + "step": 50140 + }, + { + "epoch": 0.24575, + "grad_norm": 0.10540378093719482, + "learning_rate": 4.46906224641468e-05, + "loss": 0.0396, + "step": 50150 + }, + { + "epoch": 0.2458, + "grad_norm": 0.11323534697294235, + "learning_rate": 4.468807521063466e-05, + "loss": 0.0405, + "step": 50160 + }, + { + "epoch": 0.24585, + "grad_norm": 0.12552185356616974, + "learning_rate": 4.468552741885794e-05, + "loss": 0.0397, + "step": 50170 + }, + { + "epoch": 0.2459, + "grad_norm": 0.1230066567659378, + "learning_rate": 4.4682979088886304e-05, + "loss": 0.0418, + "step": 50180 + }, + { + "epoch": 0.24595, + "grad_norm": 0.14640747010707855, + "learning_rate": 4.4680430220789406e-05, + "loss": 0.0395, + "step": 50190 + }, + { + "epoch": 0.246, + "grad_norm": 0.11146890372037888, + "learning_rate": 4.467788081463694e-05, + "loss": 0.0386, + "step": 50200 + }, + { + "epoch": 0.24605, + "grad_norm": 0.12735052406787872, + "learning_rate": 4.4675330870498604e-05, + "loss": 0.0388, + "step": 50210 + }, + { + "epoch": 0.2461, + "grad_norm": 0.11881628632545471, + "learning_rate": 4.4672780388444114e-05, + "loss": 0.0389, + "step": 50220 + }, + { + "epoch": 0.24615, + "grad_norm": 0.11332988739013672, + "learning_rate": 4.4670229368543206e-05, + "loss": 0.0395, + "step": 50230 + }, + { + "epoch": 0.2462, + "grad_norm": 0.09810035675764084, + "learning_rate": 4.4667677810865606e-05, + "loss": 0.0387, + "step": 50240 + }, + { + "epoch": 0.24625, + "grad_norm": 0.12134762108325958, + "learning_rate": 4.4665125715481096e-05, + "loss": 0.0394, + "step": 50250 + }, + { + "epoch": 0.2463, + "grad_norm": 0.22856345772743225, + "learning_rate": 4.4662573082459424e-05, + "loss": 0.0436, + "step": 50260 + }, + { + "epoch": 0.24635, + "grad_norm": 0.10471013188362122, + "learning_rate": 4.46600199118704e-05, + "loss": 0.0395, + "step": 50270 + }, + { + "epoch": 0.2464, + "grad_norm": 0.10634054243564606, + "learning_rate": 4.465746620378381e-05, + "loss": 0.0391, + "step": 50280 + }, + { + "epoch": 0.24645, + "grad_norm": 0.102584607899189, + "learning_rate": 4.465491195826948e-05, + "loss": 0.0394, + "step": 50290 + }, + { + "epoch": 0.2465, + "grad_norm": 0.10269295424222946, + "learning_rate": 4.465235717539725e-05, + "loss": 0.0392, + "step": 50300 + }, + { + "epoch": 0.24655, + "grad_norm": 0.09852663427591324, + "learning_rate": 4.464980185523695e-05, + "loss": 0.039, + "step": 50310 + }, + { + "epoch": 0.2466, + "grad_norm": 0.1208033636212349, + "learning_rate": 4.464724599785846e-05, + "loss": 0.0392, + "step": 50320 + }, + { + "epoch": 0.24665, + "grad_norm": 0.12018878012895584, + "learning_rate": 4.464468960333163e-05, + "loss": 0.0423, + "step": 50330 + }, + { + "epoch": 0.2467, + "grad_norm": 0.1266999989748001, + "learning_rate": 4.464213267172637e-05, + "loss": 0.0396, + "step": 50340 + }, + { + "epoch": 0.24675, + "grad_norm": 0.12401743978261948, + "learning_rate": 4.463957520311259e-05, + "loss": 0.0402, + "step": 50350 + }, + { + "epoch": 0.2468, + "grad_norm": 0.1187533289194107, + "learning_rate": 4.4637017197560196e-05, + "loss": 0.04, + "step": 50360 + }, + { + "epoch": 0.24685, + "grad_norm": 0.10464418679475784, + "learning_rate": 4.463445865513913e-05, + "loss": 0.0393, + "step": 50370 + }, + { + "epoch": 0.2469, + "grad_norm": 0.1359594464302063, + "learning_rate": 4.4631899575919344e-05, + "loss": 0.0391, + "step": 50380 + }, + { + "epoch": 0.24695, + "grad_norm": 0.11157345026731491, + "learning_rate": 4.4629339959970794e-05, + "loss": 0.0401, + "step": 50390 + }, + { + "epoch": 0.247, + "grad_norm": 0.12209325283765793, + "learning_rate": 4.462677980736346e-05, + "loss": 0.0401, + "step": 50400 + }, + { + "epoch": 0.24705, + "grad_norm": 0.1256990283727646, + "learning_rate": 4.4624219118167355e-05, + "loss": 0.0385, + "step": 50410 + }, + { + "epoch": 0.2471, + "grad_norm": 0.12570686638355255, + "learning_rate": 4.462165789245246e-05, + "loss": 0.0403, + "step": 50420 + }, + { + "epoch": 0.24715, + "grad_norm": 0.12645868957042694, + "learning_rate": 4.461909613028881e-05, + "loss": 0.0398, + "step": 50430 + }, + { + "epoch": 0.2472, + "grad_norm": 0.12722982466220856, + "learning_rate": 4.461653383174644e-05, + "loss": 0.0384, + "step": 50440 + }, + { + "epoch": 0.24725, + "grad_norm": 0.11190466582775116, + "learning_rate": 4.461397099689542e-05, + "loss": 0.0412, + "step": 50450 + }, + { + "epoch": 0.2473, + "grad_norm": 0.12560081481933594, + "learning_rate": 4.461140762580579e-05, + "loss": 0.0393, + "step": 50460 + }, + { + "epoch": 0.24735, + "grad_norm": 0.1073743924498558, + "learning_rate": 4.460884371854764e-05, + "loss": 0.039, + "step": 50470 + }, + { + "epoch": 0.2474, + "grad_norm": 0.12059140205383301, + "learning_rate": 4.460627927519107e-05, + "loss": 0.0393, + "step": 50480 + }, + { + "epoch": 0.24745, + "grad_norm": 0.1306591033935547, + "learning_rate": 4.46037142958062e-05, + "loss": 0.0404, + "step": 50490 + }, + { + "epoch": 0.2475, + "grad_norm": 0.10691185295581818, + "learning_rate": 4.460114878046313e-05, + "loss": 0.0378, + "step": 50500 + }, + { + "epoch": 0.24755, + "grad_norm": 0.11255087703466415, + "learning_rate": 4.459858272923203e-05, + "loss": 0.0385, + "step": 50510 + }, + { + "epoch": 0.2476, + "grad_norm": 0.10249876976013184, + "learning_rate": 4.459601614218304e-05, + "loss": 0.0365, + "step": 50520 + }, + { + "epoch": 0.24765, + "grad_norm": 0.10994745790958405, + "learning_rate": 4.459344901938633e-05, + "loss": 0.039, + "step": 50530 + }, + { + "epoch": 0.2477, + "grad_norm": 0.11549528688192368, + "learning_rate": 4.4590881360912074e-05, + "loss": 0.038, + "step": 50540 + }, + { + "epoch": 0.24775, + "grad_norm": 0.10953694581985474, + "learning_rate": 4.4588313166830495e-05, + "loss": 0.0384, + "step": 50550 + }, + { + "epoch": 0.2478, + "grad_norm": 0.09753890335559845, + "learning_rate": 4.4585744437211786e-05, + "loss": 0.0398, + "step": 50560 + }, + { + "epoch": 0.24785, + "grad_norm": 0.11490300297737122, + "learning_rate": 4.458317517212618e-05, + "loss": 0.0382, + "step": 50570 + }, + { + "epoch": 0.2479, + "grad_norm": 0.11047282069921494, + "learning_rate": 4.458060537164393e-05, + "loss": 0.0386, + "step": 50580 + }, + { + "epoch": 0.24795, + "grad_norm": 0.13541057705879211, + "learning_rate": 4.4578035035835275e-05, + "loss": 0.0401, + "step": 50590 + }, + { + "epoch": 0.248, + "grad_norm": 0.09646723419427872, + "learning_rate": 4.457546416477051e-05, + "loss": 0.037, + "step": 50600 + }, + { + "epoch": 0.24805, + "grad_norm": 0.10674086958169937, + "learning_rate": 4.45728927585199e-05, + "loss": 0.0378, + "step": 50610 + }, + { + "epoch": 0.2481, + "grad_norm": 0.10835059732198715, + "learning_rate": 4.4570320817153756e-05, + "loss": 0.0381, + "step": 50620 + }, + { + "epoch": 0.24815, + "grad_norm": 0.09281093627214432, + "learning_rate": 4.4567748340742396e-05, + "loss": 0.0386, + "step": 50630 + }, + { + "epoch": 0.2482, + "grad_norm": 0.09371732175350189, + "learning_rate": 4.456517532935615e-05, + "loss": 0.0425, + "step": 50640 + }, + { + "epoch": 0.24825, + "grad_norm": 0.10443238168954849, + "learning_rate": 4.456260178306535e-05, + "loss": 0.0409, + "step": 50650 + }, + { + "epoch": 0.2483, + "grad_norm": 0.10375183820724487, + "learning_rate": 4.456002770194038e-05, + "loss": 0.0387, + "step": 50660 + }, + { + "epoch": 0.24835, + "grad_norm": 0.11241745203733444, + "learning_rate": 4.4557453086051595e-05, + "loss": 0.0424, + "step": 50670 + }, + { + "epoch": 0.2484, + "grad_norm": 0.08906576782464981, + "learning_rate": 4.455487793546939e-05, + "loss": 0.0386, + "step": 50680 + }, + { + "epoch": 0.24845, + "grad_norm": 0.11888798326253891, + "learning_rate": 4.455230225026416e-05, + "loss": 0.0394, + "step": 50690 + }, + { + "epoch": 0.2485, + "grad_norm": 0.11634727567434311, + "learning_rate": 4.454972603050634e-05, + "loss": 0.0385, + "step": 50700 + }, + { + "epoch": 0.24855, + "grad_norm": 0.11918650567531586, + "learning_rate": 4.4547149276266355e-05, + "loss": 0.0398, + "step": 50710 + }, + { + "epoch": 0.2486, + "grad_norm": 0.11601907014846802, + "learning_rate": 4.454457198761465e-05, + "loss": 0.0411, + "step": 50720 + }, + { + "epoch": 0.24865, + "grad_norm": 0.10228978842496872, + "learning_rate": 4.454199416462169e-05, + "loss": 0.0402, + "step": 50730 + }, + { + "epoch": 0.2487, + "grad_norm": 0.10430190712213516, + "learning_rate": 4.4539415807357955e-05, + "loss": 0.0396, + "step": 50740 + }, + { + "epoch": 0.24875, + "grad_norm": 0.095687136054039, + "learning_rate": 4.453683691589393e-05, + "loss": 0.0381, + "step": 50750 + }, + { + "epoch": 0.2488, + "grad_norm": 0.10482900589704514, + "learning_rate": 4.453425749030012e-05, + "loss": 0.0395, + "step": 50760 + }, + { + "epoch": 0.24885, + "grad_norm": 0.0979951024055481, + "learning_rate": 4.4531677530647056e-05, + "loss": 0.0393, + "step": 50770 + }, + { + "epoch": 0.2489, + "grad_norm": 0.10559823364019394, + "learning_rate": 4.452909703700526e-05, + "loss": 0.0382, + "step": 50780 + }, + { + "epoch": 0.24895, + "grad_norm": 0.1315240114927292, + "learning_rate": 4.452651600944529e-05, + "loss": 0.0386, + "step": 50790 + }, + { + "epoch": 0.249, + "grad_norm": 0.10066678375005722, + "learning_rate": 4.452393444803771e-05, + "loss": 0.0388, + "step": 50800 + }, + { + "epoch": 0.24905, + "grad_norm": 0.09541074186563492, + "learning_rate": 4.4521352352853095e-05, + "loss": 0.0389, + "step": 50810 + }, + { + "epoch": 0.2491, + "grad_norm": 0.11731182783842087, + "learning_rate": 4.451876972396204e-05, + "loss": 0.039, + "step": 50820 + }, + { + "epoch": 0.24915, + "grad_norm": 0.09840092062950134, + "learning_rate": 4.4516186561435156e-05, + "loss": 0.0384, + "step": 50830 + }, + { + "epoch": 0.2492, + "grad_norm": 0.10836604237556458, + "learning_rate": 4.451360286534306e-05, + "loss": 0.038, + "step": 50840 + }, + { + "epoch": 0.24925, + "grad_norm": 0.13531796634197235, + "learning_rate": 4.45110186357564e-05, + "loss": 0.0392, + "step": 50850 + }, + { + "epoch": 0.2493, + "grad_norm": 0.08943230658769608, + "learning_rate": 4.450843387274581e-05, + "loss": 0.0384, + "step": 50860 + }, + { + "epoch": 0.24935, + "grad_norm": 0.1034252792596817, + "learning_rate": 4.450584857638197e-05, + "loss": 0.0385, + "step": 50870 + }, + { + "epoch": 0.2494, + "grad_norm": 0.09238631278276443, + "learning_rate": 4.4503262746735567e-05, + "loss": 0.0381, + "step": 50880 + }, + { + "epoch": 0.24945, + "grad_norm": 0.0975264385342598, + "learning_rate": 4.450067638387727e-05, + "loss": 0.0374, + "step": 50890 + }, + { + "epoch": 0.2495, + "grad_norm": 0.11952003091573715, + "learning_rate": 4.449808948787782e-05, + "loss": 0.0373, + "step": 50900 + }, + { + "epoch": 0.24955, + "grad_norm": 0.09383943676948547, + "learning_rate": 4.4495502058807925e-05, + "loss": 0.0385, + "step": 50910 + }, + { + "epoch": 0.2496, + "grad_norm": 0.11109983921051025, + "learning_rate": 4.4492914096738326e-05, + "loss": 0.0389, + "step": 50920 + }, + { + "epoch": 0.24965, + "grad_norm": 0.09983723610639572, + "learning_rate": 4.449032560173978e-05, + "loss": 0.0397, + "step": 50930 + }, + { + "epoch": 0.2497, + "grad_norm": 0.1179436445236206, + "learning_rate": 4.448773657388305e-05, + "loss": 0.0375, + "step": 50940 + }, + { + "epoch": 0.24975, + "grad_norm": 0.131612166762352, + "learning_rate": 4.4485147013238936e-05, + "loss": 0.0401, + "step": 50950 + }, + { + "epoch": 0.2498, + "grad_norm": 0.12161412090063095, + "learning_rate": 4.4482556919878214e-05, + "loss": 0.0407, + "step": 50960 + }, + { + "epoch": 0.24985, + "grad_norm": 0.10872988402843475, + "learning_rate": 4.44799662938717e-05, + "loss": 0.0392, + "step": 50970 + }, + { + "epoch": 0.2499, + "grad_norm": 0.1138838529586792, + "learning_rate": 4.447737513529023e-05, + "loss": 0.0403, + "step": 50980 + }, + { + "epoch": 0.24995, + "grad_norm": 0.12414175271987915, + "learning_rate": 4.447478344420465e-05, + "loss": 0.0392, + "step": 50990 + }, + { + "epoch": 0.25, + "grad_norm": 0.13560877740383148, + "learning_rate": 4.44721912206858e-05, + "loss": 0.0402, + "step": 51000 + }, + { + "epoch": 0.25005, + "grad_norm": 0.13603296875953674, + "learning_rate": 4.446959846480456e-05, + "loss": 0.0394, + "step": 51010 + }, + { + "epoch": 0.2501, + "grad_norm": 0.1167522445321083, + "learning_rate": 4.44670051766318e-05, + "loss": 0.0386, + "step": 51020 + }, + { + "epoch": 0.25015, + "grad_norm": 0.1144317165017128, + "learning_rate": 4.4464411356238447e-05, + "loss": 0.0379, + "step": 51030 + }, + { + "epoch": 0.2502, + "grad_norm": 0.13284580409526825, + "learning_rate": 4.4461817003695396e-05, + "loss": 0.0383, + "step": 51040 + }, + { + "epoch": 0.25025, + "grad_norm": 0.1393263041973114, + "learning_rate": 4.445922211907358e-05, + "loss": 0.0382, + "step": 51050 + }, + { + "epoch": 0.2503, + "grad_norm": 0.11129328608512878, + "learning_rate": 4.445662670244394e-05, + "loss": 0.0387, + "step": 51060 + }, + { + "epoch": 0.25035, + "grad_norm": 0.12428943812847137, + "learning_rate": 4.445403075387743e-05, + "loss": 0.0393, + "step": 51070 + }, + { + "epoch": 0.2504, + "grad_norm": 0.10676748305559158, + "learning_rate": 4.4451434273445036e-05, + "loss": 0.0374, + "step": 51080 + }, + { + "epoch": 0.25045, + "grad_norm": 0.10181623697280884, + "learning_rate": 4.444883726121773e-05, + "loss": 0.039, + "step": 51090 + }, + { + "epoch": 0.2505, + "grad_norm": 0.10132446140050888, + "learning_rate": 4.4446239717266525e-05, + "loss": 0.0396, + "step": 51100 + }, + { + "epoch": 0.25055, + "grad_norm": 0.12805144488811493, + "learning_rate": 4.444364164166244e-05, + "loss": 0.0388, + "step": 51110 + }, + { + "epoch": 0.2506, + "grad_norm": 0.1167738139629364, + "learning_rate": 4.444104303447648e-05, + "loss": 0.0385, + "step": 51120 + }, + { + "epoch": 0.25065, + "grad_norm": 0.11137361824512482, + "learning_rate": 4.4438443895779716e-05, + "loss": 0.0386, + "step": 51130 + }, + { + "epoch": 0.2507, + "grad_norm": 0.09908577054738998, + "learning_rate": 4.44358442256432e-05, + "loss": 0.0377, + "step": 51140 + }, + { + "epoch": 0.25075, + "grad_norm": 0.11198722571134567, + "learning_rate": 4.4433244024138e-05, + "loss": 0.038, + "step": 51150 + }, + { + "epoch": 0.2508, + "grad_norm": 0.10560618340969086, + "learning_rate": 4.4430643291335206e-05, + "loss": 0.0381, + "step": 51160 + }, + { + "epoch": 0.25085, + "grad_norm": 0.11533159762620926, + "learning_rate": 4.4428042027305934e-05, + "loss": 0.0411, + "step": 51170 + }, + { + "epoch": 0.2509, + "grad_norm": 0.10088592767715454, + "learning_rate": 4.442544023212129e-05, + "loss": 0.0406, + "step": 51180 + }, + { + "epoch": 0.25095, + "grad_norm": 0.11384643614292145, + "learning_rate": 4.44228379058524e-05, + "loss": 0.0396, + "step": 51190 + }, + { + "epoch": 0.251, + "grad_norm": 0.10306670516729355, + "learning_rate": 4.442023504857042e-05, + "loss": 0.0401, + "step": 51200 + }, + { + "epoch": 0.25105, + "grad_norm": 0.10993543267250061, + "learning_rate": 4.441763166034652e-05, + "loss": 0.0386, + "step": 51210 + }, + { + "epoch": 0.2511, + "grad_norm": 0.0968732088804245, + "learning_rate": 4.441502774125185e-05, + "loss": 0.0386, + "step": 51220 + }, + { + "epoch": 0.25115, + "grad_norm": 0.10479246079921722, + "learning_rate": 4.441242329135763e-05, + "loss": 0.0392, + "step": 51230 + }, + { + "epoch": 0.2512, + "grad_norm": 0.1229248195886612, + "learning_rate": 4.440981831073504e-05, + "loss": 0.0408, + "step": 51240 + }, + { + "epoch": 0.25125, + "grad_norm": 0.0964876189827919, + "learning_rate": 4.4407212799455313e-05, + "loss": 0.0389, + "step": 51250 + }, + { + "epoch": 0.2513, + "grad_norm": 0.13513514399528503, + "learning_rate": 4.440460675758967e-05, + "loss": 0.0406, + "step": 51260 + }, + { + "epoch": 0.25135, + "grad_norm": 0.10261445492506027, + "learning_rate": 4.440200018520938e-05, + "loss": 0.0444, + "step": 51270 + }, + { + "epoch": 0.2514, + "grad_norm": 0.11525224894285202, + "learning_rate": 4.43993930823857e-05, + "loss": 0.0382, + "step": 51280 + }, + { + "epoch": 0.25145, + "grad_norm": 0.12469878047704697, + "learning_rate": 4.439678544918989e-05, + "loss": 0.0389, + "step": 51290 + }, + { + "epoch": 0.2515, + "grad_norm": 0.09355195611715317, + "learning_rate": 4.439417728569325e-05, + "loss": 0.0414, + "step": 51300 + }, + { + "epoch": 0.25155, + "grad_norm": 0.12074048072099686, + "learning_rate": 4.43915685919671e-05, + "loss": 0.0419, + "step": 51310 + }, + { + "epoch": 0.2516, + "grad_norm": 0.1394028216600418, + "learning_rate": 4.438895936808274e-05, + "loss": 0.0411, + "step": 51320 + }, + { + "epoch": 0.25165, + "grad_norm": 0.1064663678407669, + "learning_rate": 4.4386349614111524e-05, + "loss": 0.0394, + "step": 51330 + }, + { + "epoch": 0.2517, + "grad_norm": 0.13321200013160706, + "learning_rate": 4.438373933012478e-05, + "loss": 0.0393, + "step": 51340 + }, + { + "epoch": 0.25175, + "grad_norm": 0.12222933024168015, + "learning_rate": 4.438112851619389e-05, + "loss": 0.0413, + "step": 51350 + }, + { + "epoch": 0.2518, + "grad_norm": 0.11078042536973953, + "learning_rate": 4.4378517172390234e-05, + "loss": 0.0402, + "step": 51360 + }, + { + "epoch": 0.25185, + "grad_norm": 0.10573874413967133, + "learning_rate": 4.437590529878519e-05, + "loss": 0.0402, + "step": 51370 + }, + { + "epoch": 0.2519, + "grad_norm": 0.14229434728622437, + "learning_rate": 4.437329289545018e-05, + "loss": 0.0396, + "step": 51380 + }, + { + "epoch": 0.25195, + "grad_norm": 0.13220266997814178, + "learning_rate": 4.437067996245662e-05, + "loss": 0.0395, + "step": 51390 + }, + { + "epoch": 0.252, + "grad_norm": 0.12287094444036484, + "learning_rate": 4.436806649987595e-05, + "loss": 0.0384, + "step": 51400 + }, + { + "epoch": 0.25205, + "grad_norm": 0.11596930772066116, + "learning_rate": 4.436545250777961e-05, + "loss": 0.0399, + "step": 51410 + }, + { + "epoch": 0.2521, + "grad_norm": 0.12470424175262451, + "learning_rate": 4.436283798623908e-05, + "loss": 0.0406, + "step": 51420 + }, + { + "epoch": 0.25215, + "grad_norm": 0.12757036089897156, + "learning_rate": 4.4360222935325835e-05, + "loss": 0.0411, + "step": 51430 + }, + { + "epoch": 0.2522, + "grad_norm": 0.13404591381549835, + "learning_rate": 4.435760735511136e-05, + "loss": 0.0402, + "step": 51440 + }, + { + "epoch": 0.25225, + "grad_norm": 0.12462472170591354, + "learning_rate": 4.4354991245667175e-05, + "loss": 0.0399, + "step": 51450 + }, + { + "epoch": 0.2523, + "grad_norm": 0.14413760602474213, + "learning_rate": 4.435237460706481e-05, + "loss": 0.0402, + "step": 51460 + }, + { + "epoch": 0.25235, + "grad_norm": 0.10462265461683273, + "learning_rate": 4.4349757439375786e-05, + "loss": 0.0398, + "step": 51470 + }, + { + "epoch": 0.2524, + "grad_norm": 0.10423137247562408, + "learning_rate": 4.434713974267166e-05, + "loss": 0.0391, + "step": 51480 + }, + { + "epoch": 0.25245, + "grad_norm": 0.10511630773544312, + "learning_rate": 4.4344521517024004e-05, + "loss": 0.0405, + "step": 51490 + }, + { + "epoch": 0.2525, + "grad_norm": 0.10550902038812637, + "learning_rate": 4.43419027625044e-05, + "loss": 0.0408, + "step": 51500 + }, + { + "epoch": 0.25255, + "grad_norm": 0.11555942893028259, + "learning_rate": 4.433928347918444e-05, + "loss": 0.0433, + "step": 51510 + }, + { + "epoch": 0.2526, + "grad_norm": 0.12114205956459045, + "learning_rate": 4.433666366713574e-05, + "loss": 0.0413, + "step": 51520 + }, + { + "epoch": 0.25265, + "grad_norm": 0.10090594738721848, + "learning_rate": 4.4334043326429907e-05, + "loss": 0.0402, + "step": 51530 + }, + { + "epoch": 0.2527, + "grad_norm": 0.09123598784208298, + "learning_rate": 4.433142245713861e-05, + "loss": 0.0395, + "step": 51540 + }, + { + "epoch": 0.25275, + "grad_norm": 0.10139287263154984, + "learning_rate": 4.432880105933347e-05, + "loss": 0.0408, + "step": 51550 + }, + { + "epoch": 0.2528, + "grad_norm": 0.08323674649000168, + "learning_rate": 4.4326179133086174e-05, + "loss": 0.0396, + "step": 51560 + }, + { + "epoch": 0.25285, + "grad_norm": 0.11446565389633179, + "learning_rate": 4.43235566784684e-05, + "loss": 0.0416, + "step": 51570 + }, + { + "epoch": 0.2529, + "grad_norm": 0.11079252511262894, + "learning_rate": 4.432093369555185e-05, + "loss": 0.0386, + "step": 51580 + }, + { + "epoch": 0.25295, + "grad_norm": 0.12396615743637085, + "learning_rate": 4.4318310184408234e-05, + "loss": 0.0397, + "step": 51590 + }, + { + "epoch": 0.253, + "grad_norm": 0.12712040543556213, + "learning_rate": 4.431568614510927e-05, + "loss": 0.0399, + "step": 51600 + }, + { + "epoch": 0.25305, + "grad_norm": 0.09145376086235046, + "learning_rate": 4.4313061577726703e-05, + "loss": 0.0381, + "step": 51610 + }, + { + "epoch": 0.2531, + "grad_norm": 0.10613539814949036, + "learning_rate": 4.4310436482332294e-05, + "loss": 0.0394, + "step": 51620 + }, + { + "epoch": 0.25315, + "grad_norm": 0.0854751318693161, + "learning_rate": 4.43078108589978e-05, + "loss": 0.039, + "step": 51630 + }, + { + "epoch": 0.2532, + "grad_norm": 0.09941904991865158, + "learning_rate": 4.430518470779501e-05, + "loss": 0.0386, + "step": 51640 + }, + { + "epoch": 0.25325, + "grad_norm": 0.0964796170592308, + "learning_rate": 4.430255802879573e-05, + "loss": 0.0388, + "step": 51650 + }, + { + "epoch": 0.2533, + "grad_norm": 0.10009407252073288, + "learning_rate": 4.4299930822071755e-05, + "loss": 0.0397, + "step": 51660 + }, + { + "epoch": 0.25335, + "grad_norm": 0.12308847159147263, + "learning_rate": 4.429730308769493e-05, + "loss": 0.0386, + "step": 51670 + }, + { + "epoch": 0.2534, + "grad_norm": 0.10295268893241882, + "learning_rate": 4.4294674825737086e-05, + "loss": 0.0391, + "step": 51680 + }, + { + "epoch": 0.25345, + "grad_norm": 0.10725370049476624, + "learning_rate": 4.429204603627009e-05, + "loss": 0.0399, + "step": 51690 + }, + { + "epoch": 0.2535, + "grad_norm": 0.09342306107282639, + "learning_rate": 4.4289416719365784e-05, + "loss": 0.0382, + "step": 51700 + }, + { + "epoch": 0.25355, + "grad_norm": 0.10227091610431671, + "learning_rate": 4.428678687509609e-05, + "loss": 0.0392, + "step": 51710 + }, + { + "epoch": 0.2536, + "grad_norm": 0.10371004045009613, + "learning_rate": 4.4284156503532876e-05, + "loss": 0.0377, + "step": 51720 + }, + { + "epoch": 0.25365, + "grad_norm": 0.10542786866426468, + "learning_rate": 4.428152560474807e-05, + "loss": 0.0391, + "step": 51730 + }, + { + "epoch": 0.2537, + "grad_norm": 0.0887867733836174, + "learning_rate": 4.42788941788136e-05, + "loss": 0.0392, + "step": 51740 + }, + { + "epoch": 0.25375, + "grad_norm": 0.0874442532658577, + "learning_rate": 4.4276262225801404e-05, + "loss": 0.0388, + "step": 51750 + }, + { + "epoch": 0.2538, + "grad_norm": 0.1287989616394043, + "learning_rate": 4.427362974578344e-05, + "loss": 0.042, + "step": 51760 + }, + { + "epoch": 0.25385, + "grad_norm": 0.11925298720598221, + "learning_rate": 4.4270996738831684e-05, + "loss": 0.0381, + "step": 51770 + }, + { + "epoch": 0.2539, + "grad_norm": 0.11712602525949478, + "learning_rate": 4.4268363205018114e-05, + "loss": 0.0383, + "step": 51780 + }, + { + "epoch": 0.25395, + "grad_norm": 0.13583825528621674, + "learning_rate": 4.426572914441474e-05, + "loss": 0.0392, + "step": 51790 + }, + { + "epoch": 0.254, + "grad_norm": 0.14465183019638062, + "learning_rate": 4.426309455709355e-05, + "loss": 0.0392, + "step": 51800 + }, + { + "epoch": 0.25405, + "grad_norm": 0.12359750270843506, + "learning_rate": 4.426045944312661e-05, + "loss": 0.0379, + "step": 51810 + }, + { + "epoch": 0.2541, + "grad_norm": 0.11113641411066055, + "learning_rate": 4.425782380258594e-05, + "loss": 0.0382, + "step": 51820 + }, + { + "epoch": 0.25415, + "grad_norm": 0.11407049745321274, + "learning_rate": 4.42551876355436e-05, + "loss": 0.0392, + "step": 51830 + }, + { + "epoch": 0.2542, + "grad_norm": 0.11310215294361115, + "learning_rate": 4.425255094207167e-05, + "loss": 0.0419, + "step": 51840 + }, + { + "epoch": 0.25425, + "grad_norm": 0.11849265545606613, + "learning_rate": 4.424991372224222e-05, + "loss": 0.039, + "step": 51850 + }, + { + "epoch": 0.2543, + "grad_norm": 0.09995820373296738, + "learning_rate": 4.4247275976127366e-05, + "loss": 0.0405, + "step": 51860 + }, + { + "epoch": 0.25435, + "grad_norm": 0.1018250435590744, + "learning_rate": 4.4244637703799216e-05, + "loss": 0.0405, + "step": 51870 + }, + { + "epoch": 0.2544, + "grad_norm": 0.10998055338859558, + "learning_rate": 4.4241998905329904e-05, + "loss": 0.0416, + "step": 51880 + }, + { + "epoch": 0.25445, + "grad_norm": 0.11770867556333542, + "learning_rate": 4.423935958079156e-05, + "loss": 0.0394, + "step": 51890 + }, + { + "epoch": 0.2545, + "grad_norm": 0.1142832338809967, + "learning_rate": 4.4236719730256365e-05, + "loss": 0.0407, + "step": 51900 + }, + { + "epoch": 0.25455, + "grad_norm": 0.13255222141742706, + "learning_rate": 4.423407935379647e-05, + "loss": 0.0395, + "step": 51910 + }, + { + "epoch": 0.2546, + "grad_norm": 0.11216466128826141, + "learning_rate": 4.423143845148409e-05, + "loss": 0.0396, + "step": 51920 + }, + { + "epoch": 0.25465, + "grad_norm": 0.12529611587524414, + "learning_rate": 4.422879702339139e-05, + "loss": 0.04, + "step": 51930 + }, + { + "epoch": 0.2547, + "grad_norm": 0.11650749295949936, + "learning_rate": 4.422615506959061e-05, + "loss": 0.0399, + "step": 51940 + }, + { + "epoch": 0.25475, + "grad_norm": 0.11138208210468292, + "learning_rate": 4.422351259015397e-05, + "loss": 0.0395, + "step": 51950 + }, + { + "epoch": 0.2548, + "grad_norm": 0.10220567137002945, + "learning_rate": 4.422086958515372e-05, + "loss": 0.0387, + "step": 51960 + }, + { + "epoch": 0.25485, + "grad_norm": 0.09616690129041672, + "learning_rate": 4.421822605466211e-05, + "loss": 0.0377, + "step": 51970 + }, + { + "epoch": 0.2549, + "grad_norm": 0.09676245599985123, + "learning_rate": 4.4215581998751434e-05, + "loss": 0.0387, + "step": 51980 + }, + { + "epoch": 0.25495, + "grad_norm": 0.11996602267026901, + "learning_rate": 4.4212937417493954e-05, + "loss": 0.0394, + "step": 51990 + }, + { + "epoch": 0.255, + "grad_norm": 0.10802966356277466, + "learning_rate": 4.421029231096199e-05, + "loss": 0.0392, + "step": 52000 + }, + { + "epoch": 0.25505, + "grad_norm": 0.13428770005702972, + "learning_rate": 4.4207646679227846e-05, + "loss": 0.0401, + "step": 52010 + }, + { + "epoch": 0.2551, + "grad_norm": 0.12374002486467361, + "learning_rate": 4.420500052236386e-05, + "loss": 0.0399, + "step": 52020 + }, + { + "epoch": 0.25515, + "grad_norm": 0.14582951366901398, + "learning_rate": 4.420235384044237e-05, + "loss": 0.04, + "step": 52030 + }, + { + "epoch": 0.2552, + "grad_norm": 0.10715020447969437, + "learning_rate": 4.4199706633535744e-05, + "loss": 0.0401, + "step": 52040 + }, + { + "epoch": 0.25525, + "grad_norm": 0.11347389221191406, + "learning_rate": 4.4197058901716347e-05, + "loss": 0.0388, + "step": 52050 + }, + { + "epoch": 0.2553, + "grad_norm": 0.09797408431768417, + "learning_rate": 4.4194410645056585e-05, + "loss": 0.0395, + "step": 52060 + }, + { + "epoch": 0.25535, + "grad_norm": 0.13154490292072296, + "learning_rate": 4.4191761863628836e-05, + "loss": 0.0393, + "step": 52070 + }, + { + "epoch": 0.2554, + "grad_norm": 0.1017269417643547, + "learning_rate": 4.4189112557505525e-05, + "loss": 0.0387, + "step": 52080 + }, + { + "epoch": 0.25545, + "grad_norm": 0.12850724160671234, + "learning_rate": 4.418646272675909e-05, + "loss": 0.0399, + "step": 52090 + }, + { + "epoch": 0.2555, + "grad_norm": 0.10764136165380478, + "learning_rate": 4.418381237146198e-05, + "loss": 0.0403, + "step": 52100 + }, + { + "epoch": 0.25555, + "grad_norm": 0.11821214854717255, + "learning_rate": 4.418116149168663e-05, + "loss": 0.041, + "step": 52110 + }, + { + "epoch": 0.2556, + "grad_norm": 0.10730551928281784, + "learning_rate": 4.417851008750554e-05, + "loss": 0.0394, + "step": 52120 + }, + { + "epoch": 0.25565, + "grad_norm": 0.09921026974916458, + "learning_rate": 4.417585815899119e-05, + "loss": 0.0395, + "step": 52130 + }, + { + "epoch": 0.2557, + "grad_norm": 0.13657888770103455, + "learning_rate": 4.4173205706216084e-05, + "loss": 0.0451, + "step": 52140 + }, + { + "epoch": 0.25575, + "grad_norm": 0.10962523519992828, + "learning_rate": 4.417055272925273e-05, + "loss": 0.0402, + "step": 52150 + }, + { + "epoch": 0.2558, + "grad_norm": 0.09717714041471481, + "learning_rate": 4.416789922817367e-05, + "loss": 0.0386, + "step": 52160 + }, + { + "epoch": 0.25585, + "grad_norm": 0.11585210263729095, + "learning_rate": 4.416524520305145e-05, + "loss": 0.0396, + "step": 52170 + }, + { + "epoch": 0.2559, + "grad_norm": 0.12299904227256775, + "learning_rate": 4.416259065395862e-05, + "loss": 0.0395, + "step": 52180 + }, + { + "epoch": 0.25595, + "grad_norm": 0.10254476219415665, + "learning_rate": 4.415993558096776e-05, + "loss": 0.0381, + "step": 52190 + }, + { + "epoch": 0.256, + "grad_norm": 0.08398805558681488, + "learning_rate": 4.415727998415147e-05, + "loss": 0.0394, + "step": 52200 + }, + { + "epoch": 0.25605, + "grad_norm": 0.10847415030002594, + "learning_rate": 4.415462386358233e-05, + "loss": 0.0402, + "step": 52210 + }, + { + "epoch": 0.2561, + "grad_norm": 0.09870469570159912, + "learning_rate": 4.415196721933298e-05, + "loss": 0.037, + "step": 52220 + }, + { + "epoch": 0.25615, + "grad_norm": 0.11038938909769058, + "learning_rate": 4.414931005147604e-05, + "loss": 0.0386, + "step": 52230 + }, + { + "epoch": 0.2562, + "grad_norm": 0.08587460964918137, + "learning_rate": 4.414665236008414e-05, + "loss": 0.0374, + "step": 52240 + }, + { + "epoch": 0.25625, + "grad_norm": 0.1056613028049469, + "learning_rate": 4.414399414522997e-05, + "loss": 0.0402, + "step": 52250 + }, + { + "epoch": 0.2563, + "grad_norm": 0.10318482667207718, + "learning_rate": 4.41413354069862e-05, + "loss": 0.0385, + "step": 52260 + }, + { + "epoch": 0.25635, + "grad_norm": 0.11092269420623779, + "learning_rate": 4.413867614542551e-05, + "loss": 0.0383, + "step": 52270 + }, + { + "epoch": 0.2564, + "grad_norm": 0.09598180651664734, + "learning_rate": 4.4136016360620594e-05, + "loss": 0.037, + "step": 52280 + }, + { + "epoch": 0.25645, + "grad_norm": 0.10150059312582016, + "learning_rate": 4.413335605264418e-05, + "loss": 0.039, + "step": 52290 + }, + { + "epoch": 0.2565, + "grad_norm": 0.10892947018146515, + "learning_rate": 4.4130695221569007e-05, + "loss": 0.0387, + "step": 52300 + }, + { + "epoch": 0.25655, + "grad_norm": 0.09892286360263824, + "learning_rate": 4.4128033867467805e-05, + "loss": 0.0392, + "step": 52310 + }, + { + "epoch": 0.2566, + "grad_norm": 0.10339682549238205, + "learning_rate": 4.412537199041335e-05, + "loss": 0.0392, + "step": 52320 + }, + { + "epoch": 0.25665, + "grad_norm": 0.10403071343898773, + "learning_rate": 4.4122709590478406e-05, + "loss": 0.0374, + "step": 52330 + }, + { + "epoch": 0.2567, + "grad_norm": 0.10536352545022964, + "learning_rate": 4.412004666773577e-05, + "loss": 0.0425, + "step": 52340 + }, + { + "epoch": 0.25675, + "grad_norm": 0.0953894630074501, + "learning_rate": 4.411738322225823e-05, + "loss": 0.0421, + "step": 52350 + }, + { + "epoch": 0.2568, + "grad_norm": 0.08324065059423447, + "learning_rate": 4.411471925411863e-05, + "loss": 0.039, + "step": 52360 + }, + { + "epoch": 0.25685, + "grad_norm": 0.10176534205675125, + "learning_rate": 4.411205476338978e-05, + "loss": 0.0379, + "step": 52370 + }, + { + "epoch": 0.2569, + "grad_norm": 0.10972446948289871, + "learning_rate": 4.410938975014454e-05, + "loss": 0.0381, + "step": 52380 + }, + { + "epoch": 0.25695, + "grad_norm": 0.11949066072702408, + "learning_rate": 4.4106724214455754e-05, + "loss": 0.0389, + "step": 52390 + }, + { + "epoch": 0.257, + "grad_norm": 0.1242421492934227, + "learning_rate": 4.410405815639631e-05, + "loss": 0.0393, + "step": 52400 + }, + { + "epoch": 0.25705, + "grad_norm": 0.11890441179275513, + "learning_rate": 4.410139157603909e-05, + "loss": 0.0384, + "step": 52410 + }, + { + "epoch": 0.2571, + "grad_norm": 0.124184750020504, + "learning_rate": 4.4098724473457e-05, + "loss": 0.0381, + "step": 52420 + }, + { + "epoch": 0.25715, + "grad_norm": 0.1061735451221466, + "learning_rate": 4.409605684872295e-05, + "loss": 0.0377, + "step": 52430 + }, + { + "epoch": 0.2572, + "grad_norm": 0.11340545862913132, + "learning_rate": 4.40933887019099e-05, + "loss": 0.0376, + "step": 52440 + }, + { + "epoch": 0.25725, + "grad_norm": 0.11279773712158203, + "learning_rate": 4.409072003309077e-05, + "loss": 0.0373, + "step": 52450 + }, + { + "epoch": 0.2573, + "grad_norm": 0.10331476479768753, + "learning_rate": 4.408805084233852e-05, + "loss": 0.0385, + "step": 52460 + }, + { + "epoch": 0.25735, + "grad_norm": 0.11186760663986206, + "learning_rate": 4.4085381129726136e-05, + "loss": 0.0386, + "step": 52470 + }, + { + "epoch": 0.2574, + "grad_norm": 0.119344562292099, + "learning_rate": 4.4082710895326596e-05, + "loss": 0.0402, + "step": 52480 + }, + { + "epoch": 0.25745, + "grad_norm": 0.09493733942508698, + "learning_rate": 4.408004013921291e-05, + "loss": 0.0373, + "step": 52490 + }, + { + "epoch": 0.2575, + "grad_norm": 0.09359245002269745, + "learning_rate": 4.407736886145809e-05, + "loss": 0.0383, + "step": 52500 + }, + { + "epoch": 0.25755, + "grad_norm": 0.1006658598780632, + "learning_rate": 4.4074697062135185e-05, + "loss": 0.0393, + "step": 52510 + }, + { + "epoch": 0.2576, + "grad_norm": 0.1178915798664093, + "learning_rate": 4.4072024741317225e-05, + "loss": 0.0389, + "step": 52520 + }, + { + "epoch": 0.25765, + "grad_norm": 0.09350401163101196, + "learning_rate": 4.406935189907727e-05, + "loss": 0.0385, + "step": 52530 + }, + { + "epoch": 0.2577, + "grad_norm": 0.11241558194160461, + "learning_rate": 4.4066678535488404e-05, + "loss": 0.0416, + "step": 52540 + }, + { + "epoch": 0.25775, + "grad_norm": 0.1266544610261917, + "learning_rate": 4.40640046506237e-05, + "loss": 0.0387, + "step": 52550 + }, + { + "epoch": 0.2578, + "grad_norm": 0.11901448667049408, + "learning_rate": 4.4061330244556274e-05, + "loss": 0.0413, + "step": 52560 + }, + { + "epoch": 0.25785, + "grad_norm": 0.13161858916282654, + "learning_rate": 4.4058655317359246e-05, + "loss": 0.0376, + "step": 52570 + }, + { + "epoch": 0.2579, + "grad_norm": 0.10740600526332855, + "learning_rate": 4.4055979869105734e-05, + "loss": 0.0383, + "step": 52580 + }, + { + "epoch": 0.25795, + "grad_norm": 0.0965193584561348, + "learning_rate": 4.40533038998689e-05, + "loss": 0.0392, + "step": 52590 + }, + { + "epoch": 0.258, + "grad_norm": 0.09509056806564331, + "learning_rate": 4.405062740972189e-05, + "loss": 0.0374, + "step": 52600 + }, + { + "epoch": 0.25805, + "grad_norm": 0.09715612977743149, + "learning_rate": 4.404795039873788e-05, + "loss": 0.038, + "step": 52610 + }, + { + "epoch": 0.2581, + "grad_norm": 0.11560391634702682, + "learning_rate": 4.4045272866990073e-05, + "loss": 0.0379, + "step": 52620 + }, + { + "epoch": 0.25815, + "grad_norm": 0.1066320464015007, + "learning_rate": 4.4042594814551654e-05, + "loss": 0.0403, + "step": 52630 + }, + { + "epoch": 0.2582, + "grad_norm": 0.11784431338310242, + "learning_rate": 4.4039916241495845e-05, + "loss": 0.0395, + "step": 52640 + }, + { + "epoch": 0.25825, + "grad_norm": 0.1124262809753418, + "learning_rate": 4.403723714789588e-05, + "loss": 0.0396, + "step": 52650 + }, + { + "epoch": 0.2583, + "grad_norm": 0.15125466883182526, + "learning_rate": 4.403455753382501e-05, + "loss": 0.0422, + "step": 52660 + }, + { + "epoch": 0.25835, + "grad_norm": 0.11785928905010223, + "learning_rate": 4.4031877399356476e-05, + "loss": 0.0388, + "step": 52670 + }, + { + "epoch": 0.2584, + "grad_norm": 0.10574966669082642, + "learning_rate": 4.4029196744563574e-05, + "loss": 0.0396, + "step": 52680 + }, + { + "epoch": 0.25845, + "grad_norm": 0.10138683766126633, + "learning_rate": 4.4026515569519574e-05, + "loss": 0.0397, + "step": 52690 + }, + { + "epoch": 0.2585, + "grad_norm": 0.09625864028930664, + "learning_rate": 4.4023833874297786e-05, + "loss": 0.0393, + "step": 52700 + }, + { + "epoch": 0.25855, + "grad_norm": 0.09360393136739731, + "learning_rate": 4.402115165897153e-05, + "loss": 0.0405, + "step": 52710 + }, + { + "epoch": 0.2586, + "grad_norm": 0.11491934210062027, + "learning_rate": 4.4018468923614136e-05, + "loss": 0.0417, + "step": 52720 + }, + { + "epoch": 0.25865, + "grad_norm": 0.09822811186313629, + "learning_rate": 4.4015785668298945e-05, + "loss": 0.0411, + "step": 52730 + }, + { + "epoch": 0.2587, + "grad_norm": 0.12619754672050476, + "learning_rate": 4.401310189309932e-05, + "loss": 0.0388, + "step": 52740 + }, + { + "epoch": 0.25875, + "grad_norm": 0.09798824787139893, + "learning_rate": 4.4010417598088624e-05, + "loss": 0.0393, + "step": 52750 + }, + { + "epoch": 0.2588, + "grad_norm": 0.08708862215280533, + "learning_rate": 4.400773278334026e-05, + "loss": 0.0386, + "step": 52760 + }, + { + "epoch": 0.25885, + "grad_norm": 0.10267054289579391, + "learning_rate": 4.400504744892763e-05, + "loss": 0.0387, + "step": 52770 + }, + { + "epoch": 0.2589, + "grad_norm": 0.09680944681167603, + "learning_rate": 4.4002361594924125e-05, + "loss": 0.0411, + "step": 52780 + }, + { + "epoch": 0.25895, + "grad_norm": 0.0983702540397644, + "learning_rate": 4.39996752214032e-05, + "loss": 0.0407, + "step": 52790 + }, + { + "epoch": 0.259, + "grad_norm": 0.09816974401473999, + "learning_rate": 4.39969883284383e-05, + "loss": 0.0374, + "step": 52800 + }, + { + "epoch": 0.25905, + "grad_norm": 0.11069867759943008, + "learning_rate": 4.399430091610287e-05, + "loss": 0.0389, + "step": 52810 + }, + { + "epoch": 0.2591, + "grad_norm": 0.10115727037191391, + "learning_rate": 4.3991612984470386e-05, + "loss": 0.0392, + "step": 52820 + }, + { + "epoch": 0.25915, + "grad_norm": 0.11012036353349686, + "learning_rate": 4.398892453361434e-05, + "loss": 0.0391, + "step": 52830 + }, + { + "epoch": 0.2592, + "grad_norm": 0.10272771865129471, + "learning_rate": 4.3986235563608233e-05, + "loss": 0.0409, + "step": 52840 + }, + { + "epoch": 0.25925, + "grad_norm": 0.10734359920024872, + "learning_rate": 4.398354607452558e-05, + "loss": 0.0372, + "step": 52850 + }, + { + "epoch": 0.2593, + "grad_norm": 0.09947605431079865, + "learning_rate": 4.39808560664399e-05, + "loss": 0.0395, + "step": 52860 + }, + { + "epoch": 0.25935, + "grad_norm": 0.11368408799171448, + "learning_rate": 4.3978165539424756e-05, + "loss": 0.0421, + "step": 52870 + }, + { + "epoch": 0.2594, + "grad_norm": 0.11971694231033325, + "learning_rate": 4.397547449355369e-05, + "loss": 0.0398, + "step": 52880 + }, + { + "epoch": 0.25945, + "grad_norm": 0.10816139727830887, + "learning_rate": 4.397278292890028e-05, + "loss": 0.0407, + "step": 52890 + }, + { + "epoch": 0.2595, + "grad_norm": 0.12127848714590073, + "learning_rate": 4.397009084553812e-05, + "loss": 0.0397, + "step": 52900 + }, + { + "epoch": 0.25955, + "grad_norm": 0.10986391454935074, + "learning_rate": 4.396739824354079e-05, + "loss": 0.0394, + "step": 52910 + }, + { + "epoch": 0.2596, + "grad_norm": 0.10677799582481384, + "learning_rate": 4.396470512298193e-05, + "loss": 0.0378, + "step": 52920 + }, + { + "epoch": 0.25965, + "grad_norm": 0.10446598380804062, + "learning_rate": 4.396201148393515e-05, + "loss": 0.0392, + "step": 52930 + }, + { + "epoch": 0.2597, + "grad_norm": 0.12632323801517487, + "learning_rate": 4.39593173264741e-05, + "loss": 0.0382, + "step": 52940 + }, + { + "epoch": 0.25975, + "grad_norm": 0.09477602690458298, + "learning_rate": 4.395662265067244e-05, + "loss": 0.0377, + "step": 52950 + }, + { + "epoch": 0.2598, + "grad_norm": 0.12109529972076416, + "learning_rate": 4.395392745660384e-05, + "loss": 0.0382, + "step": 52960 + }, + { + "epoch": 0.25985, + "grad_norm": 0.1306433379650116, + "learning_rate": 4.395123174434198e-05, + "loss": 0.0391, + "step": 52970 + }, + { + "epoch": 0.2599, + "grad_norm": 0.11219587177038193, + "learning_rate": 4.394853551396056e-05, + "loss": 0.039, + "step": 52980 + }, + { + "epoch": 0.25995, + "grad_norm": 0.10092771053314209, + "learning_rate": 4.3945838765533307e-05, + "loss": 0.038, + "step": 52990 + }, + { + "epoch": 0.26, + "grad_norm": 0.09246230125427246, + "learning_rate": 4.394314149913393e-05, + "loss": 0.0383, + "step": 53000 + }, + { + "epoch": 0.26005, + "grad_norm": 0.097933329641819, + "learning_rate": 4.394044371483619e-05, + "loss": 0.038, + "step": 53010 + }, + { + "epoch": 0.2601, + "grad_norm": 0.10322330892086029, + "learning_rate": 4.393774541271383e-05, + "loss": 0.0374, + "step": 53020 + }, + { + "epoch": 0.26015, + "grad_norm": 0.10426066815853119, + "learning_rate": 4.393504659284063e-05, + "loss": 0.0381, + "step": 53030 + }, + { + "epoch": 0.2602, + "grad_norm": 0.09258675575256348, + "learning_rate": 4.3932347255290365e-05, + "loss": 0.0375, + "step": 53040 + }, + { + "epoch": 0.26025, + "grad_norm": 0.10530385375022888, + "learning_rate": 4.3929647400136835e-05, + "loss": 0.0394, + "step": 53050 + }, + { + "epoch": 0.2603, + "grad_norm": 0.10726174712181091, + "learning_rate": 4.3926947027453866e-05, + "loss": 0.0373, + "step": 53060 + }, + { + "epoch": 0.26035, + "grad_norm": 0.10103695839643478, + "learning_rate": 4.392424613731527e-05, + "loss": 0.039, + "step": 53070 + }, + { + "epoch": 0.2604, + "grad_norm": 0.11701173335313797, + "learning_rate": 4.39215447297949e-05, + "loss": 0.0385, + "step": 53080 + }, + { + "epoch": 0.26045, + "grad_norm": 0.10399830341339111, + "learning_rate": 4.3918842804966586e-05, + "loss": 0.0387, + "step": 53090 + }, + { + "epoch": 0.2605, + "grad_norm": 0.10775898396968842, + "learning_rate": 4.391614036290423e-05, + "loss": 0.038, + "step": 53100 + }, + { + "epoch": 0.26055, + "grad_norm": 0.10917066037654877, + "learning_rate": 4.391343740368171e-05, + "loss": 0.0392, + "step": 53110 + }, + { + "epoch": 0.2606, + "grad_norm": 0.11006568372249603, + "learning_rate": 4.391073392737291e-05, + "loss": 0.0376, + "step": 53120 + }, + { + "epoch": 0.26065, + "grad_norm": 0.09199454635381699, + "learning_rate": 4.390802993405175e-05, + "loss": 0.0377, + "step": 53130 + }, + { + "epoch": 0.2607, + "grad_norm": 0.1270432323217392, + "learning_rate": 4.3905325423792155e-05, + "loss": 0.039, + "step": 53140 + }, + { + "epoch": 0.26075, + "grad_norm": 0.08557243645191193, + "learning_rate": 4.390262039666807e-05, + "loss": 0.0371, + "step": 53150 + }, + { + "epoch": 0.2608, + "grad_norm": 0.09341472387313843, + "learning_rate": 4.3899914852753436e-05, + "loss": 0.0399, + "step": 53160 + }, + { + "epoch": 0.26085, + "grad_norm": 0.10942547768354416, + "learning_rate": 4.389720879212223e-05, + "loss": 0.0381, + "step": 53170 + }, + { + "epoch": 0.2609, + "grad_norm": 0.09050387144088745, + "learning_rate": 4.389450221484844e-05, + "loss": 0.0366, + "step": 53180 + }, + { + "epoch": 0.26095, + "grad_norm": 0.10948771238327026, + "learning_rate": 4.389179512100606e-05, + "loss": 0.0378, + "step": 53190 + }, + { + "epoch": 0.261, + "grad_norm": 0.09486088156700134, + "learning_rate": 4.3889087510669094e-05, + "loss": 0.0368, + "step": 53200 + }, + { + "epoch": 0.26105, + "grad_norm": 0.10282183438539505, + "learning_rate": 4.3886379383911574e-05, + "loss": 0.0393, + "step": 53210 + }, + { + "epoch": 0.2611, + "grad_norm": 0.1014777421951294, + "learning_rate": 4.3883670740807534e-05, + "loss": 0.0387, + "step": 53220 + }, + { + "epoch": 0.26115, + "grad_norm": 0.13208040595054626, + "learning_rate": 4.388096158143104e-05, + "loss": 0.0401, + "step": 53230 + }, + { + "epoch": 0.2612, + "grad_norm": 0.12502078711986542, + "learning_rate": 4.3878251905856135e-05, + "loss": 0.0377, + "step": 53240 + }, + { + "epoch": 0.26125, + "grad_norm": 0.09379098564386368, + "learning_rate": 4.3875541714156926e-05, + "loss": 0.039, + "step": 53250 + }, + { + "epoch": 0.2613, + "grad_norm": 0.10392840951681137, + "learning_rate": 4.3872831006407495e-05, + "loss": 0.0384, + "step": 53260 + }, + { + "epoch": 0.26135, + "grad_norm": 0.09636663645505905, + "learning_rate": 4.3870119782681954e-05, + "loss": 0.0383, + "step": 53270 + }, + { + "epoch": 0.2614, + "grad_norm": 0.09062938392162323, + "learning_rate": 4.386740804305443e-05, + "loss": 0.0385, + "step": 53280 + }, + { + "epoch": 0.26145, + "grad_norm": 0.09627866744995117, + "learning_rate": 4.386469578759905e-05, + "loss": 0.0402, + "step": 53290 + }, + { + "epoch": 0.2615, + "grad_norm": 0.11795052140951157, + "learning_rate": 4.386198301638999e-05, + "loss": 0.0404, + "step": 53300 + }, + { + "epoch": 0.26155, + "grad_norm": 0.1151648685336113, + "learning_rate": 4.3859269729501383e-05, + "loss": 0.0392, + "step": 53310 + }, + { + "epoch": 0.2616, + "grad_norm": 0.12234436720609665, + "learning_rate": 4.385655592700743e-05, + "loss": 0.0394, + "step": 53320 + }, + { + "epoch": 0.26165, + "grad_norm": 0.11547648161649704, + "learning_rate": 4.385384160898233e-05, + "loss": 0.0396, + "step": 53330 + }, + { + "epoch": 0.2617, + "grad_norm": 0.09618788212537766, + "learning_rate": 4.385112677550027e-05, + "loss": 0.0383, + "step": 53340 + }, + { + "epoch": 0.26175, + "grad_norm": 0.09515061974525452, + "learning_rate": 4.38484114266355e-05, + "loss": 0.0407, + "step": 53350 + }, + { + "epoch": 0.2618, + "grad_norm": 0.10868088155984879, + "learning_rate": 4.384569556246223e-05, + "loss": 0.0415, + "step": 53360 + }, + { + "epoch": 0.26185, + "grad_norm": 0.09142906963825226, + "learning_rate": 4.384297918305474e-05, + "loss": 0.04, + "step": 53370 + }, + { + "epoch": 0.2619, + "grad_norm": 0.09424825012683868, + "learning_rate": 4.384026228848727e-05, + "loss": 0.0399, + "step": 53380 + }, + { + "epoch": 0.26195, + "grad_norm": 0.10501080006361008, + "learning_rate": 4.38375448788341e-05, + "loss": 0.0382, + "step": 53390 + }, + { + "epoch": 0.262, + "grad_norm": 0.10195177793502808, + "learning_rate": 4.383482695416954e-05, + "loss": 0.0383, + "step": 53400 + }, + { + "epoch": 0.26205, + "grad_norm": 0.09079153090715408, + "learning_rate": 4.383210851456788e-05, + "loss": 0.0395, + "step": 53410 + }, + { + "epoch": 0.2621, + "grad_norm": 0.11704879254102707, + "learning_rate": 4.382938956010345e-05, + "loss": 0.0392, + "step": 53420 + }, + { + "epoch": 0.26215, + "grad_norm": 0.10230205208063126, + "learning_rate": 4.382667009085059e-05, + "loss": 0.0396, + "step": 53430 + }, + { + "epoch": 0.2622, + "grad_norm": 0.13031253218650818, + "learning_rate": 4.382395010688364e-05, + "loss": 0.0391, + "step": 53440 + }, + { + "epoch": 0.26225, + "grad_norm": 0.10187526047229767, + "learning_rate": 4.382122960827696e-05, + "loss": 0.0417, + "step": 53450 + }, + { + "epoch": 0.2623, + "grad_norm": 0.09725139290094376, + "learning_rate": 4.3818508595104934e-05, + "loss": 0.0383, + "step": 53460 + }, + { + "epoch": 0.26235, + "grad_norm": 0.11895791441202164, + "learning_rate": 4.381578706744196e-05, + "loss": 0.0387, + "step": 53470 + }, + { + "epoch": 0.2624, + "grad_norm": 0.11287042498588562, + "learning_rate": 4.381306502536243e-05, + "loss": 0.038, + "step": 53480 + }, + { + "epoch": 0.26245, + "grad_norm": 0.11742764711380005, + "learning_rate": 4.381034246894077e-05, + "loss": 0.0409, + "step": 53490 + }, + { + "epoch": 0.2625, + "grad_norm": 0.08674655854701996, + "learning_rate": 4.3807619398251415e-05, + "loss": 0.0397, + "step": 53500 + }, + { + "epoch": 0.26255, + "grad_norm": 0.12602120637893677, + "learning_rate": 4.380489581336881e-05, + "loss": 0.0386, + "step": 53510 + }, + { + "epoch": 0.2626, + "grad_norm": 0.13071084022521973, + "learning_rate": 4.380217171436742e-05, + "loss": 0.039, + "step": 53520 + }, + { + "epoch": 0.26265, + "grad_norm": 0.15403853356838226, + "learning_rate": 4.3799447101321723e-05, + "loss": 0.0402, + "step": 53530 + }, + { + "epoch": 0.2627, + "grad_norm": 0.12097310274839401, + "learning_rate": 4.37967219743062e-05, + "loss": 0.0398, + "step": 53540 + }, + { + "epoch": 0.26275, + "grad_norm": 0.15257878601551056, + "learning_rate": 4.3793996333395356e-05, + "loss": 0.0372, + "step": 53550 + }, + { + "epoch": 0.2628, + "grad_norm": 0.13050629198551178, + "learning_rate": 4.379127017866372e-05, + "loss": 0.038, + "step": 53560 + }, + { + "epoch": 0.26285, + "grad_norm": 0.10135944187641144, + "learning_rate": 4.3788543510185807e-05, + "loss": 0.0366, + "step": 53570 + }, + { + "epoch": 0.2629, + "grad_norm": 0.09226872771978378, + "learning_rate": 4.378581632803618e-05, + "loss": 0.039, + "step": 53580 + }, + { + "epoch": 0.26295, + "grad_norm": 0.11682125180959702, + "learning_rate": 4.378308863228939e-05, + "loss": 0.0382, + "step": 53590 + }, + { + "epoch": 0.263, + "grad_norm": 0.10629759728908539, + "learning_rate": 4.378036042302002e-05, + "loss": 0.0405, + "step": 53600 + }, + { + "epoch": 0.26305, + "grad_norm": 0.1006932407617569, + "learning_rate": 4.377763170030265e-05, + "loss": 0.0395, + "step": 53610 + }, + { + "epoch": 0.2631, + "grad_norm": 0.10320789366960526, + "learning_rate": 4.377490246421187e-05, + "loss": 0.039, + "step": 53620 + }, + { + "epoch": 0.26315, + "grad_norm": 0.14634302258491516, + "learning_rate": 4.377217271482232e-05, + "loss": 0.0388, + "step": 53630 + }, + { + "epoch": 0.2632, + "grad_norm": 0.09399808943271637, + "learning_rate": 4.376944245220863e-05, + "loss": 0.0383, + "step": 53640 + }, + { + "epoch": 0.26325, + "grad_norm": 0.09940533339977264, + "learning_rate": 4.3766711676445423e-05, + "loss": 0.0378, + "step": 53650 + }, + { + "epoch": 0.2633, + "grad_norm": 0.10146481543779373, + "learning_rate": 4.3763980387607374e-05, + "loss": 0.0384, + "step": 53660 + }, + { + "epoch": 0.26335, + "grad_norm": 0.09821083396673203, + "learning_rate": 4.3761248585769147e-05, + "loss": 0.0396, + "step": 53670 + }, + { + "epoch": 0.2634, + "grad_norm": 0.1172725036740303, + "learning_rate": 4.3758516271005435e-05, + "loss": 0.0394, + "step": 53680 + }, + { + "epoch": 0.26345, + "grad_norm": 0.1124570220708847, + "learning_rate": 4.375578344339093e-05, + "loss": 0.0401, + "step": 53690 + }, + { + "epoch": 0.2635, + "grad_norm": 0.11581005156040192, + "learning_rate": 4.375305010300036e-05, + "loss": 0.0397, + "step": 53700 + }, + { + "epoch": 0.26355, + "grad_norm": 0.09203293919563293, + "learning_rate": 4.3750316249908435e-05, + "loss": 0.04, + "step": 53710 + }, + { + "epoch": 0.2636, + "grad_norm": 0.11718445271253586, + "learning_rate": 4.3747581884189913e-05, + "loss": 0.039, + "step": 53720 + }, + { + "epoch": 0.26365, + "grad_norm": 0.10248976945877075, + "learning_rate": 4.374484700591955e-05, + "loss": 0.041, + "step": 53730 + }, + { + "epoch": 0.2637, + "grad_norm": 0.09773720800876617, + "learning_rate": 4.3742111615172104e-05, + "loss": 0.0404, + "step": 53740 + }, + { + "epoch": 0.26375, + "grad_norm": 0.12032605707645416, + "learning_rate": 4.3739375712022375e-05, + "loss": 0.0411, + "step": 53750 + }, + { + "epoch": 0.2638, + "grad_norm": 0.09681548923254013, + "learning_rate": 4.373663929654515e-05, + "loss": 0.0379, + "step": 53760 + }, + { + "epoch": 0.26385, + "grad_norm": 0.13609491288661957, + "learning_rate": 4.3733902368815245e-05, + "loss": 0.0387, + "step": 53770 + }, + { + "epoch": 0.2639, + "grad_norm": 0.10241185128688812, + "learning_rate": 4.3731164928907485e-05, + "loss": 0.0379, + "step": 53780 + }, + { + "epoch": 0.26395, + "grad_norm": 0.1285761594772339, + "learning_rate": 4.372842697689672e-05, + "loss": 0.0407, + "step": 53790 + }, + { + "epoch": 0.264, + "grad_norm": 0.10861656069755554, + "learning_rate": 4.372568851285779e-05, + "loss": 0.04, + "step": 53800 + }, + { + "epoch": 0.26405, + "grad_norm": 0.11549534648656845, + "learning_rate": 4.372294953686558e-05, + "loss": 0.0384, + "step": 53810 + }, + { + "epoch": 0.2641, + "grad_norm": 0.10912875831127167, + "learning_rate": 4.3720210048994957e-05, + "loss": 0.04, + "step": 53820 + }, + { + "epoch": 0.26415, + "grad_norm": 0.10230275243520737, + "learning_rate": 4.3717470049320825e-05, + "loss": 0.0372, + "step": 53830 + }, + { + "epoch": 0.2642, + "grad_norm": 0.10299071669578552, + "learning_rate": 4.3714729537918095e-05, + "loss": 0.0407, + "step": 53840 + }, + { + "epoch": 0.26425, + "grad_norm": 0.10616176575422287, + "learning_rate": 4.371198851486169e-05, + "loss": 0.0376, + "step": 53850 + }, + { + "epoch": 0.2643, + "grad_norm": 0.12114676833152771, + "learning_rate": 4.370924698022655e-05, + "loss": 0.0402, + "step": 53860 + }, + { + "epoch": 0.26435, + "grad_norm": 0.0981198400259018, + "learning_rate": 4.370650493408762e-05, + "loss": 0.0373, + "step": 53870 + }, + { + "epoch": 0.2644, + "grad_norm": 0.1140938326716423, + "learning_rate": 4.3703762376519876e-05, + "loss": 0.0392, + "step": 53880 + }, + { + "epoch": 0.26445, + "grad_norm": 0.12672537565231323, + "learning_rate": 4.37010193075983e-05, + "loss": 0.0385, + "step": 53890 + }, + { + "epoch": 0.2645, + "grad_norm": 0.14617519080638885, + "learning_rate": 4.369827572739788e-05, + "loss": 0.0394, + "step": 53900 + }, + { + "epoch": 0.26455, + "grad_norm": 0.1404951810836792, + "learning_rate": 4.369553163599362e-05, + "loss": 0.0388, + "step": 53910 + }, + { + "epoch": 0.2646, + "grad_norm": 0.12953104078769684, + "learning_rate": 4.369278703346055e-05, + "loss": 0.0417, + "step": 53920 + }, + { + "epoch": 0.26465, + "grad_norm": 0.10081244260072708, + "learning_rate": 4.369004191987371e-05, + "loss": 0.0383, + "step": 53930 + }, + { + "epoch": 0.2647, + "grad_norm": 0.10385037958621979, + "learning_rate": 4.3687296295308144e-05, + "loss": 0.0398, + "step": 53940 + }, + { + "epoch": 0.26475, + "grad_norm": 0.11250067502260208, + "learning_rate": 4.368455015983892e-05, + "loss": 0.0404, + "step": 53950 + }, + { + "epoch": 0.2648, + "grad_norm": 0.13306987285614014, + "learning_rate": 4.368180351354111e-05, + "loss": 0.0397, + "step": 53960 + }, + { + "epoch": 0.26485, + "grad_norm": 0.11967480927705765, + "learning_rate": 4.3679056356489814e-05, + "loss": 0.038, + "step": 53970 + }, + { + "epoch": 0.2649, + "grad_norm": 0.11501402407884598, + "learning_rate": 4.367630868876013e-05, + "loss": 0.0384, + "step": 53980 + }, + { + "epoch": 0.26495, + "grad_norm": 0.130889892578125, + "learning_rate": 4.367356051042718e-05, + "loss": 0.0434, + "step": 53990 + }, + { + "epoch": 0.265, + "grad_norm": 0.16248153150081635, + "learning_rate": 4.367081182156611e-05, + "loss": 0.0408, + "step": 54000 + }, + { + "epoch": 0.26505, + "grad_norm": 0.13356177508831024, + "learning_rate": 4.366806262225206e-05, + "loss": 0.0381, + "step": 54010 + }, + { + "epoch": 0.2651, + "grad_norm": 0.1282336264848709, + "learning_rate": 4.3665312912560185e-05, + "loss": 0.0388, + "step": 54020 + }, + { + "epoch": 0.26515, + "grad_norm": 0.1705833077430725, + "learning_rate": 4.366256269256567e-05, + "loss": 0.0402, + "step": 54030 + }, + { + "epoch": 0.2652, + "grad_norm": 0.13103726506233215, + "learning_rate": 4.36598119623437e-05, + "loss": 0.0395, + "step": 54040 + }, + { + "epoch": 0.26525, + "grad_norm": 0.10220030695199966, + "learning_rate": 4.365706072196948e-05, + "loss": 0.0385, + "step": 54050 + }, + { + "epoch": 0.2653, + "grad_norm": 0.08819513767957687, + "learning_rate": 4.365430897151823e-05, + "loss": 0.0365, + "step": 54060 + }, + { + "epoch": 0.26535, + "grad_norm": 0.10846425592899323, + "learning_rate": 4.365155671106518e-05, + "loss": 0.0391, + "step": 54070 + }, + { + "epoch": 0.2654, + "grad_norm": 0.11298328638076782, + "learning_rate": 4.364880394068558e-05, + "loss": 0.0371, + "step": 54080 + }, + { + "epoch": 0.26545, + "grad_norm": 0.152954563498497, + "learning_rate": 4.364605066045469e-05, + "loss": 0.0382, + "step": 54090 + }, + { + "epoch": 0.2655, + "grad_norm": 0.11766555905342102, + "learning_rate": 4.364329687044777e-05, + "loss": 0.0367, + "step": 54100 + }, + { + "epoch": 0.26555, + "grad_norm": 0.10459113866090775, + "learning_rate": 4.3640542570740115e-05, + "loss": 0.0367, + "step": 54110 + }, + { + "epoch": 0.2656, + "grad_norm": 0.10225178301334381, + "learning_rate": 4.363778776140704e-05, + "loss": 0.0391, + "step": 54120 + }, + { + "epoch": 0.26565, + "grad_norm": 0.11881320923566818, + "learning_rate": 4.363503244252385e-05, + "loss": 0.037, + "step": 54130 + }, + { + "epoch": 0.2657, + "grad_norm": 0.09808455407619476, + "learning_rate": 4.363227661416587e-05, + "loss": 0.039, + "step": 54140 + }, + { + "epoch": 0.26575, + "grad_norm": 0.09890187531709671, + "learning_rate": 4.362952027640844e-05, + "loss": 0.0376, + "step": 54150 + }, + { + "epoch": 0.2658, + "grad_norm": 0.08728187531232834, + "learning_rate": 4.3626763429326936e-05, + "loss": 0.0405, + "step": 54160 + }, + { + "epoch": 0.26585, + "grad_norm": 0.09770821779966354, + "learning_rate": 4.3624006072996714e-05, + "loss": 0.0384, + "step": 54170 + }, + { + "epoch": 0.2659, + "grad_norm": 0.0924566313624382, + "learning_rate": 4.362124820749316e-05, + "loss": 0.037, + "step": 54180 + }, + { + "epoch": 0.26595, + "grad_norm": 0.10226110368967056, + "learning_rate": 4.361848983289167e-05, + "loss": 0.0385, + "step": 54190 + }, + { + "epoch": 0.266, + "grad_norm": 0.08714452385902405, + "learning_rate": 4.3615730949267674e-05, + "loss": 0.0377, + "step": 54200 + }, + { + "epoch": 0.26605, + "grad_norm": 0.11017415672540665, + "learning_rate": 4.361297155669659e-05, + "loss": 0.0396, + "step": 54210 + }, + { + "epoch": 0.2661, + "grad_norm": 0.09896216541528702, + "learning_rate": 4.361021165525384e-05, + "loss": 0.0371, + "step": 54220 + }, + { + "epoch": 0.26615, + "grad_norm": 0.11052402853965759, + "learning_rate": 4.360745124501491e-05, + "loss": 0.0378, + "step": 54230 + }, + { + "epoch": 0.2662, + "grad_norm": 0.08323037624359131, + "learning_rate": 4.360469032605525e-05, + "loss": 0.0385, + "step": 54240 + }, + { + "epoch": 0.26625, + "grad_norm": 0.09808001667261124, + "learning_rate": 4.360192889845034e-05, + "loss": 0.0374, + "step": 54250 + }, + { + "epoch": 0.2663, + "grad_norm": 0.10530462861061096, + "learning_rate": 4.3599166962275684e-05, + "loss": 0.0384, + "step": 54260 + }, + { + "epoch": 0.26635, + "grad_norm": 0.10739065706729889, + "learning_rate": 4.359640451760679e-05, + "loss": 0.0385, + "step": 54270 + }, + { + "epoch": 0.2664, + "grad_norm": 0.09916096925735474, + "learning_rate": 4.359364156451919e-05, + "loss": 0.0399, + "step": 54280 + }, + { + "epoch": 0.26645, + "grad_norm": 0.11747587472200394, + "learning_rate": 4.3590878103088405e-05, + "loss": 0.0373, + "step": 54290 + }, + { + "epoch": 0.2665, + "grad_norm": 0.10611972212791443, + "learning_rate": 4.3588114133390005e-05, + "loss": 0.0387, + "step": 54300 + }, + { + "epoch": 0.26655, + "grad_norm": 0.09637123346328735, + "learning_rate": 4.358534965549954e-05, + "loss": 0.0389, + "step": 54310 + }, + { + "epoch": 0.2666, + "grad_norm": 0.10092896968126297, + "learning_rate": 4.358258466949261e-05, + "loss": 0.0379, + "step": 54320 + }, + { + "epoch": 0.26665, + "grad_norm": 0.09020482748746872, + "learning_rate": 4.3579819175444794e-05, + "loss": 0.0381, + "step": 54330 + }, + { + "epoch": 0.2667, + "grad_norm": 0.09622832387685776, + "learning_rate": 4.3577053173431695e-05, + "loss": 0.0387, + "step": 54340 + }, + { + "epoch": 0.26675, + "grad_norm": 0.10070313513278961, + "learning_rate": 4.357428666352894e-05, + "loss": 0.0397, + "step": 54350 + }, + { + "epoch": 0.2668, + "grad_norm": 0.1218617707490921, + "learning_rate": 4.3571519645812166e-05, + "loss": 0.0377, + "step": 54360 + }, + { + "epoch": 0.26685, + "grad_norm": 0.11019979417324066, + "learning_rate": 4.356875212035702e-05, + "loss": 0.039, + "step": 54370 + }, + { + "epoch": 0.2669, + "grad_norm": 0.10644866526126862, + "learning_rate": 4.3565984087239175e-05, + "loss": 0.0384, + "step": 54380 + }, + { + "epoch": 0.26695, + "grad_norm": 0.0956096425652504, + "learning_rate": 4.35632155465343e-05, + "loss": 0.0389, + "step": 54390 + }, + { + "epoch": 0.267, + "grad_norm": 0.12280172109603882, + "learning_rate": 4.3560446498318085e-05, + "loss": 0.0391, + "step": 54400 + }, + { + "epoch": 0.26705, + "grad_norm": 0.09611979871988297, + "learning_rate": 4.355767694266623e-05, + "loss": 0.0378, + "step": 54410 + }, + { + "epoch": 0.2671, + "grad_norm": 0.09847360104322433, + "learning_rate": 4.3554906879654465e-05, + "loss": 0.0377, + "step": 54420 + }, + { + "epoch": 0.26715, + "grad_norm": 0.09786097705364227, + "learning_rate": 4.3552136309358514e-05, + "loss": 0.0376, + "step": 54430 + }, + { + "epoch": 0.2672, + "grad_norm": 0.09535615146160126, + "learning_rate": 4.3549365231854125e-05, + "loss": 0.0382, + "step": 54440 + }, + { + "epoch": 0.26725, + "grad_norm": 0.10359787940979004, + "learning_rate": 4.354659364721706e-05, + "loss": 0.0399, + "step": 54450 + }, + { + "epoch": 0.2673, + "grad_norm": 0.11358862370252609, + "learning_rate": 4.354382155552309e-05, + "loss": 0.0415, + "step": 54460 + }, + { + "epoch": 0.26735, + "grad_norm": 0.11824797838926315, + "learning_rate": 4.3541048956848004e-05, + "loss": 0.0382, + "step": 54470 + }, + { + "epoch": 0.2674, + "grad_norm": 0.10883744060993195, + "learning_rate": 4.353827585126762e-05, + "loss": 0.0379, + "step": 54480 + }, + { + "epoch": 0.26745, + "grad_norm": 0.09778952598571777, + "learning_rate": 4.353550223885772e-05, + "loss": 0.0382, + "step": 54490 + }, + { + "epoch": 0.2675, + "grad_norm": 0.10703311860561371, + "learning_rate": 4.353272811969416e-05, + "loss": 0.0382, + "step": 54500 + }, + { + "epoch": 0.26755, + "grad_norm": 0.10027310997247696, + "learning_rate": 4.352995349385278e-05, + "loss": 0.0386, + "step": 54510 + }, + { + "epoch": 0.2676, + "grad_norm": 0.10753300040960312, + "learning_rate": 4.352717836140943e-05, + "loss": 0.0392, + "step": 54520 + }, + { + "epoch": 0.26765, + "grad_norm": 0.10875657200813293, + "learning_rate": 4.3524402722439976e-05, + "loss": 0.0381, + "step": 54530 + }, + { + "epoch": 0.2677, + "grad_norm": 0.08940254896879196, + "learning_rate": 4.3521626577020316e-05, + "loss": 0.039, + "step": 54540 + }, + { + "epoch": 0.26775, + "grad_norm": 0.10491291433572769, + "learning_rate": 4.351884992522635e-05, + "loss": 0.0383, + "step": 54550 + }, + { + "epoch": 0.2678, + "grad_norm": 0.11627072840929031, + "learning_rate": 4.3516072767133974e-05, + "loss": 0.0395, + "step": 54560 + }, + { + "epoch": 0.26785, + "grad_norm": 0.16696451604366302, + "learning_rate": 4.351329510281913e-05, + "loss": 0.0408, + "step": 54570 + }, + { + "epoch": 0.2679, + "grad_norm": 0.11519969999790192, + "learning_rate": 4.3510516932357754e-05, + "loss": 0.0381, + "step": 54580 + }, + { + "epoch": 0.26795, + "grad_norm": 0.13323043286800385, + "learning_rate": 4.35077382558258e-05, + "loss": 0.0399, + "step": 54590 + }, + { + "epoch": 0.268, + "grad_norm": 0.10563565790653229, + "learning_rate": 4.3504959073299235e-05, + "loss": 0.0395, + "step": 54600 + }, + { + "epoch": 0.26805, + "grad_norm": 0.12867824733257294, + "learning_rate": 4.3502179384854035e-05, + "loss": 0.0406, + "step": 54610 + }, + { + "epoch": 0.2681, + "grad_norm": 0.1132294312119484, + "learning_rate": 4.34993991905662e-05, + "loss": 0.0415, + "step": 54620 + }, + { + "epoch": 0.26815, + "grad_norm": 0.1161569133400917, + "learning_rate": 4.3496618490511754e-05, + "loss": 0.038, + "step": 54630 + }, + { + "epoch": 0.2682, + "grad_norm": 0.10462360829114914, + "learning_rate": 4.34938372847667e-05, + "loss": 0.0385, + "step": 54640 + }, + { + "epoch": 0.26825, + "grad_norm": 0.1060120090842247, + "learning_rate": 4.349105557340708e-05, + "loss": 0.0407, + "step": 54650 + }, + { + "epoch": 0.2683, + "grad_norm": 0.11183242499828339, + "learning_rate": 4.3488273356508945e-05, + "loss": 0.0385, + "step": 54660 + }, + { + "epoch": 0.26835, + "grad_norm": 0.13913311064243317, + "learning_rate": 4.3485490634148375e-05, + "loss": 0.0386, + "step": 54670 + }, + { + "epoch": 0.2684, + "grad_norm": 0.1398865282535553, + "learning_rate": 4.348270740640142e-05, + "loss": 0.0418, + "step": 54680 + }, + { + "epoch": 0.26845, + "grad_norm": 0.11413382738828659, + "learning_rate": 4.34799236733442e-05, + "loss": 0.0383, + "step": 54690 + }, + { + "epoch": 0.2685, + "grad_norm": 0.10674172639846802, + "learning_rate": 4.34771394350528e-05, + "loss": 0.0411, + "step": 54700 + }, + { + "epoch": 0.26855, + "grad_norm": 0.08494861423969269, + "learning_rate": 4.3474354691603356e-05, + "loss": 0.0395, + "step": 54710 + }, + { + "epoch": 0.2686, + "grad_norm": 0.10439600795507431, + "learning_rate": 4.3471569443072e-05, + "loss": 0.0388, + "step": 54720 + }, + { + "epoch": 0.26865, + "grad_norm": 0.11908277869224548, + "learning_rate": 4.346878368953486e-05, + "loss": 0.0382, + "step": 54730 + }, + { + "epoch": 0.2687, + "grad_norm": 0.11009802669286728, + "learning_rate": 4.346599743106813e-05, + "loss": 0.0407, + "step": 54740 + }, + { + "epoch": 0.26875, + "grad_norm": 0.0999823585152626, + "learning_rate": 4.3463210667747956e-05, + "loss": 0.038, + "step": 54750 + }, + { + "epoch": 0.2688, + "grad_norm": 0.09358491003513336, + "learning_rate": 4.346042339965054e-05, + "loss": 0.0375, + "step": 54760 + }, + { + "epoch": 0.26885, + "grad_norm": 0.10833162814378738, + "learning_rate": 4.3457635626852084e-05, + "loss": 0.0387, + "step": 54770 + }, + { + "epoch": 0.2689, + "grad_norm": 0.09545520693063736, + "learning_rate": 4.3454847349428804e-05, + "loss": 0.038, + "step": 54780 + }, + { + "epoch": 0.26895, + "grad_norm": 0.09593231976032257, + "learning_rate": 4.345205856745693e-05, + "loss": 0.038, + "step": 54790 + }, + { + "epoch": 0.269, + "grad_norm": 0.1229575127363205, + "learning_rate": 4.344926928101271e-05, + "loss": 0.0375, + "step": 54800 + }, + { + "epoch": 0.26905, + "grad_norm": 0.1230328157544136, + "learning_rate": 4.34464794901724e-05, + "loss": 0.0394, + "step": 54810 + }, + { + "epoch": 0.2691, + "grad_norm": 0.16736532747745514, + "learning_rate": 4.344368919501226e-05, + "loss": 0.0425, + "step": 54820 + }, + { + "epoch": 0.26915, + "grad_norm": 0.13367053866386414, + "learning_rate": 4.3440898395608595e-05, + "loss": 0.0431, + "step": 54830 + }, + { + "epoch": 0.2692, + "grad_norm": 0.14473852515220642, + "learning_rate": 4.34381070920377e-05, + "loss": 0.0413, + "step": 54840 + }, + { + "epoch": 0.26925, + "grad_norm": 0.10313988476991653, + "learning_rate": 4.343531528437588e-05, + "loss": 0.0417, + "step": 54850 + }, + { + "epoch": 0.2693, + "grad_norm": 0.10479586571455002, + "learning_rate": 4.343252297269946e-05, + "loss": 0.0397, + "step": 54860 + }, + { + "epoch": 0.26935, + "grad_norm": 0.0953693762421608, + "learning_rate": 4.34297301570848e-05, + "loss": 0.0399, + "step": 54870 + }, + { + "epoch": 0.2694, + "grad_norm": 0.10397922992706299, + "learning_rate": 4.342693683760823e-05, + "loss": 0.0389, + "step": 54880 + }, + { + "epoch": 0.26945, + "grad_norm": 0.09787308424711227, + "learning_rate": 4.342414301434613e-05, + "loss": 0.0386, + "step": 54890 + }, + { + "epoch": 0.2695, + "grad_norm": 0.10238681733608246, + "learning_rate": 4.342134868737488e-05, + "loss": 0.038, + "step": 54900 + }, + { + "epoch": 0.26955, + "grad_norm": 0.09361415356397629, + "learning_rate": 4.341855385677089e-05, + "loss": 0.0402, + "step": 54910 + }, + { + "epoch": 0.2696, + "grad_norm": 0.09431982785463333, + "learning_rate": 4.3415758522610543e-05, + "loss": 0.0415, + "step": 54920 + }, + { + "epoch": 0.26965, + "grad_norm": 0.10237084329128265, + "learning_rate": 4.3412962684970285e-05, + "loss": 0.0373, + "step": 54930 + }, + { + "epoch": 0.2697, + "grad_norm": 0.0966499000787735, + "learning_rate": 4.341016634392654e-05, + "loss": 0.0389, + "step": 54940 + }, + { + "epoch": 0.26975, + "grad_norm": 0.07389848679304123, + "learning_rate": 4.340736949955577e-05, + "loss": 0.0378, + "step": 54950 + }, + { + "epoch": 0.2698, + "grad_norm": 0.08758895099163055, + "learning_rate": 4.3404572151934425e-05, + "loss": 0.0387, + "step": 54960 + }, + { + "epoch": 0.26985, + "grad_norm": 0.10281972587108612, + "learning_rate": 4.3401774301139e-05, + "loss": 0.0396, + "step": 54970 + }, + { + "epoch": 0.2699, + "grad_norm": 0.07996245473623276, + "learning_rate": 4.3398975947245965e-05, + "loss": 0.0386, + "step": 54980 + }, + { + "epoch": 0.26995, + "grad_norm": 0.09434854984283447, + "learning_rate": 4.3396177090331856e-05, + "loss": 0.0378, + "step": 54990 + }, + { + "epoch": 0.27, + "grad_norm": 0.10240872949361801, + "learning_rate": 4.3393377730473164e-05, + "loss": 0.0383, + "step": 55000 + }, + { + "epoch": 0.27005, + "grad_norm": 0.09521150588989258, + "learning_rate": 4.339057786774644e-05, + "loss": 0.038, + "step": 55010 + }, + { + "epoch": 0.2701, + "grad_norm": 0.09409971535205841, + "learning_rate": 4.3387777502228225e-05, + "loss": 0.0393, + "step": 55020 + }, + { + "epoch": 0.27015, + "grad_norm": 0.11078613996505737, + "learning_rate": 4.338497663399509e-05, + "loss": 0.0385, + "step": 55030 + }, + { + "epoch": 0.2702, + "grad_norm": 0.09805911779403687, + "learning_rate": 4.338217526312359e-05, + "loss": 0.0401, + "step": 55040 + }, + { + "epoch": 0.27025, + "grad_norm": 0.09694908559322357, + "learning_rate": 4.337937338969033e-05, + "loss": 0.0385, + "step": 55050 + }, + { + "epoch": 0.2703, + "grad_norm": 0.09191986173391342, + "learning_rate": 4.3376571013771897e-05, + "loss": 0.0394, + "step": 55060 + }, + { + "epoch": 0.27035, + "grad_norm": 0.09870189428329468, + "learning_rate": 4.3373768135444926e-05, + "loss": 0.041, + "step": 55070 + }, + { + "epoch": 0.2704, + "grad_norm": 0.10547038167715073, + "learning_rate": 4.3370964754786035e-05, + "loss": 0.0384, + "step": 55080 + }, + { + "epoch": 0.27045, + "grad_norm": 0.09930089116096497, + "learning_rate": 4.336816087187186e-05, + "loss": 0.0391, + "step": 55090 + }, + { + "epoch": 0.2705, + "grad_norm": 0.11283731460571289, + "learning_rate": 4.3365356486779084e-05, + "loss": 0.0395, + "step": 55100 + }, + { + "epoch": 0.27055, + "grad_norm": 0.1049695685505867, + "learning_rate": 4.336255159958435e-05, + "loss": 0.0416, + "step": 55110 + }, + { + "epoch": 0.2706, + "grad_norm": 0.095323346555233, + "learning_rate": 4.335974621036436e-05, + "loss": 0.0385, + "step": 55120 + }, + { + "epoch": 0.27065, + "grad_norm": 0.11626312881708145, + "learning_rate": 4.33569403191958e-05, + "loss": 0.0439, + "step": 55130 + }, + { + "epoch": 0.2707, + "grad_norm": 0.10035998374223709, + "learning_rate": 4.335413392615539e-05, + "loss": 0.0383, + "step": 55140 + }, + { + "epoch": 0.27075, + "grad_norm": 0.09624806046485901, + "learning_rate": 4.3351327031319856e-05, + "loss": 0.039, + "step": 55150 + }, + { + "epoch": 0.2708, + "grad_norm": 0.08691024035215378, + "learning_rate": 4.3348519634765934e-05, + "loss": 0.0391, + "step": 55160 + }, + { + "epoch": 0.27085, + "grad_norm": 0.09586155414581299, + "learning_rate": 4.334571173657037e-05, + "loss": 0.0372, + "step": 55170 + }, + { + "epoch": 0.2709, + "grad_norm": 0.08697967976331711, + "learning_rate": 4.3342903336809956e-05, + "loss": 0.0389, + "step": 55180 + }, + { + "epoch": 0.27095, + "grad_norm": 0.09988107532262802, + "learning_rate": 4.334009443556144e-05, + "loss": 0.0389, + "step": 55190 + }, + { + "epoch": 0.271, + "grad_norm": 0.11418605595827103, + "learning_rate": 4.333728503290164e-05, + "loss": 0.0383, + "step": 55200 + }, + { + "epoch": 0.27105, + "grad_norm": 0.0969906598329544, + "learning_rate": 4.333447512890736e-05, + "loss": 0.0374, + "step": 55210 + }, + { + "epoch": 0.2711, + "grad_norm": 0.10156113654375076, + "learning_rate": 4.3331664723655414e-05, + "loss": 0.0389, + "step": 55220 + }, + { + "epoch": 0.27115, + "grad_norm": 0.09116654843091965, + "learning_rate": 4.3328853817222635e-05, + "loss": 0.0372, + "step": 55230 + }, + { + "epoch": 0.2712, + "grad_norm": 0.10540885478258133, + "learning_rate": 4.332604240968588e-05, + "loss": 0.0378, + "step": 55240 + }, + { + "epoch": 0.27125, + "grad_norm": 0.09678299725055695, + "learning_rate": 4.332323050112202e-05, + "loss": 0.0379, + "step": 55250 + }, + { + "epoch": 0.2713, + "grad_norm": 0.11525113135576248, + "learning_rate": 4.3320418091607916e-05, + "loss": 0.0388, + "step": 55260 + }, + { + "epoch": 0.27135, + "grad_norm": 0.1061660423874855, + "learning_rate": 4.331760518122046e-05, + "loss": 0.038, + "step": 55270 + }, + { + "epoch": 0.2714, + "grad_norm": 0.11649101972579956, + "learning_rate": 4.3314791770036564e-05, + "loss": 0.039, + "step": 55280 + }, + { + "epoch": 0.27145, + "grad_norm": 0.1300489753484726, + "learning_rate": 4.331197785813314e-05, + "loss": 0.0383, + "step": 55290 + }, + { + "epoch": 0.2715, + "grad_norm": 0.10463863611221313, + "learning_rate": 4.330916344558713e-05, + "loss": 0.0375, + "step": 55300 + }, + { + "epoch": 0.27155, + "grad_norm": 0.1016702651977539, + "learning_rate": 4.330634853247546e-05, + "loss": 0.0381, + "step": 55310 + }, + { + "epoch": 0.2716, + "grad_norm": 0.10234900563955307, + "learning_rate": 4.3303533118875104e-05, + "loss": 0.0387, + "step": 55320 + }, + { + "epoch": 0.27165, + "grad_norm": 0.1014292985200882, + "learning_rate": 4.330071720486302e-05, + "loss": 0.037, + "step": 55330 + }, + { + "epoch": 0.2717, + "grad_norm": 0.08624016493558884, + "learning_rate": 4.329790079051621e-05, + "loss": 0.0376, + "step": 55340 + }, + { + "epoch": 0.27175, + "grad_norm": 0.097990982234478, + "learning_rate": 4.3295083875911667e-05, + "loss": 0.0369, + "step": 55350 + }, + { + "epoch": 0.2718, + "grad_norm": 0.10517366230487823, + "learning_rate": 4.329226646112641e-05, + "loss": 0.0387, + "step": 55360 + }, + { + "epoch": 0.27185, + "grad_norm": 0.1022857278585434, + "learning_rate": 4.3289448546237443e-05, + "loss": 0.041, + "step": 55370 + }, + { + "epoch": 0.2719, + "grad_norm": 0.10659095644950867, + "learning_rate": 4.3286630131321835e-05, + "loss": 0.0398, + "step": 55380 + }, + { + "epoch": 0.27195, + "grad_norm": 0.09299805760383606, + "learning_rate": 4.3283811216456624e-05, + "loss": 0.0384, + "step": 55390 + }, + { + "epoch": 0.272, + "grad_norm": 0.12613967061042786, + "learning_rate": 4.328099180171889e-05, + "loss": 0.0387, + "step": 55400 + }, + { + "epoch": 0.27205, + "grad_norm": 0.10692556947469711, + "learning_rate": 4.3278171887185706e-05, + "loss": 0.0391, + "step": 55410 + }, + { + "epoch": 0.2721, + "grad_norm": 0.0877951979637146, + "learning_rate": 4.3275351472934166e-05, + "loss": 0.0377, + "step": 55420 + }, + { + "epoch": 0.27215, + "grad_norm": 0.10409408807754517, + "learning_rate": 4.3272530559041384e-05, + "loss": 0.0368, + "step": 55430 + }, + { + "epoch": 0.2722, + "grad_norm": 0.09759706258773804, + "learning_rate": 4.326970914558448e-05, + "loss": 0.0377, + "step": 55440 + }, + { + "epoch": 0.27225, + "grad_norm": 0.10468898713588715, + "learning_rate": 4.3266887232640596e-05, + "loss": 0.0383, + "step": 55450 + }, + { + "epoch": 0.2723, + "grad_norm": 0.10428676009178162, + "learning_rate": 4.326406482028688e-05, + "loss": 0.0411, + "step": 55460 + }, + { + "epoch": 0.27235, + "grad_norm": 0.10064087808132172, + "learning_rate": 4.326124190860048e-05, + "loss": 0.0381, + "step": 55470 + }, + { + "epoch": 0.2724, + "grad_norm": 0.08288514614105225, + "learning_rate": 4.32584184976586e-05, + "loss": 0.0393, + "step": 55480 + }, + { + "epoch": 0.27245, + "grad_norm": 0.09690169990062714, + "learning_rate": 4.3255594587538403e-05, + "loss": 0.0401, + "step": 55490 + }, + { + "epoch": 0.2725, + "grad_norm": 0.11273466795682907, + "learning_rate": 4.3252770178317124e-05, + "loss": 0.0394, + "step": 55500 + }, + { + "epoch": 0.27255, + "grad_norm": 0.08136719465255737, + "learning_rate": 4.324994527007196e-05, + "loss": 0.0383, + "step": 55510 + }, + { + "epoch": 0.2726, + "grad_norm": 0.09851282089948654, + "learning_rate": 4.324711986288015e-05, + "loss": 0.0412, + "step": 55520 + }, + { + "epoch": 0.27265, + "grad_norm": 0.12646430730819702, + "learning_rate": 4.324429395681893e-05, + "loss": 0.0405, + "step": 55530 + }, + { + "epoch": 0.2727, + "grad_norm": 0.11000816524028778, + "learning_rate": 4.324146755196558e-05, + "loss": 0.0397, + "step": 55540 + }, + { + "epoch": 0.27275, + "grad_norm": 0.08530110120773315, + "learning_rate": 4.3238640648397344e-05, + "loss": 0.0389, + "step": 55550 + }, + { + "epoch": 0.2728, + "grad_norm": 0.08713596314191818, + "learning_rate": 4.3235813246191535e-05, + "loss": 0.0398, + "step": 55560 + }, + { + "epoch": 0.27285, + "grad_norm": 0.09457286447286606, + "learning_rate": 4.323298534542545e-05, + "loss": 0.0391, + "step": 55570 + }, + { + "epoch": 0.2729, + "grad_norm": 0.11076316237449646, + "learning_rate": 4.323015694617638e-05, + "loss": 0.0383, + "step": 55580 + }, + { + "epoch": 0.27295, + "grad_norm": 0.11398779600858688, + "learning_rate": 4.3227328048521674e-05, + "loss": 0.0417, + "step": 55590 + }, + { + "epoch": 0.273, + "grad_norm": 0.09488118439912796, + "learning_rate": 4.322449865253867e-05, + "loss": 0.0395, + "step": 55600 + }, + { + "epoch": 0.27305, + "grad_norm": 0.11086251586675644, + "learning_rate": 4.322166875830472e-05, + "loss": 0.0383, + "step": 55610 + }, + { + "epoch": 0.2731, + "grad_norm": 0.11592381447553635, + "learning_rate": 4.3218838365897184e-05, + "loss": 0.0373, + "step": 55620 + }, + { + "epoch": 0.27315, + "grad_norm": 0.09241390973329544, + "learning_rate": 4.321600747539346e-05, + "loss": 0.0411, + "step": 55630 + }, + { + "epoch": 0.2732, + "grad_norm": 0.13217687606811523, + "learning_rate": 4.321317608687093e-05, + "loss": 0.042, + "step": 55640 + }, + { + "epoch": 0.27325, + "grad_norm": 0.0901438444852829, + "learning_rate": 4.3210344200407e-05, + "loss": 0.0372, + "step": 55650 + }, + { + "epoch": 0.2733, + "grad_norm": 0.10367929190397263, + "learning_rate": 4.320751181607912e-05, + "loss": 0.0378, + "step": 55660 + }, + { + "epoch": 0.27335, + "grad_norm": 0.10702197253704071, + "learning_rate": 4.32046789339647e-05, + "loss": 0.0378, + "step": 55670 + }, + { + "epoch": 0.2734, + "grad_norm": 0.132543683052063, + "learning_rate": 4.320184555414119e-05, + "loss": 0.0395, + "step": 55680 + }, + { + "epoch": 0.27345, + "grad_norm": 0.11184263229370117, + "learning_rate": 4.319901167668607e-05, + "loss": 0.0396, + "step": 55690 + }, + { + "epoch": 0.2735, + "grad_norm": 0.10397180914878845, + "learning_rate": 4.31961773016768e-05, + "loss": 0.0377, + "step": 55700 + }, + { + "epoch": 0.27355, + "grad_norm": 0.12112978845834732, + "learning_rate": 4.319334242919088e-05, + "loss": 0.0409, + "step": 55710 + }, + { + "epoch": 0.2736, + "grad_norm": 0.10749992728233337, + "learning_rate": 4.3190507059305817e-05, + "loss": 0.0387, + "step": 55720 + }, + { + "epoch": 0.27365, + "grad_norm": 0.1014375239610672, + "learning_rate": 4.3187671192099124e-05, + "loss": 0.0399, + "step": 55730 + }, + { + "epoch": 0.2737, + "grad_norm": 0.12156882882118225, + "learning_rate": 4.318483482764833e-05, + "loss": 0.0387, + "step": 55740 + }, + { + "epoch": 0.27375, + "grad_norm": 0.12430386245250702, + "learning_rate": 4.3181997966030986e-05, + "loss": 0.0397, + "step": 55750 + }, + { + "epoch": 0.2738, + "grad_norm": 0.10626768320798874, + "learning_rate": 4.317916060732465e-05, + "loss": 0.0404, + "step": 55760 + }, + { + "epoch": 0.27385, + "grad_norm": 0.15057958662509918, + "learning_rate": 4.317632275160689e-05, + "loss": 0.0403, + "step": 55770 + }, + { + "epoch": 0.2739, + "grad_norm": 0.10483145713806152, + "learning_rate": 4.317348439895529e-05, + "loss": 0.0383, + "step": 55780 + }, + { + "epoch": 0.27395, + "grad_norm": 0.09630624204874039, + "learning_rate": 4.3170645549447463e-05, + "loss": 0.0383, + "step": 55790 + }, + { + "epoch": 0.274, + "grad_norm": 0.09001373499631882, + "learning_rate": 4.316780620316101e-05, + "loss": 0.0378, + "step": 55800 + }, + { + "epoch": 0.27405, + "grad_norm": 0.10086652636528015, + "learning_rate": 4.316496636017355e-05, + "loss": 0.0413, + "step": 55810 + }, + { + "epoch": 0.2741, + "grad_norm": 0.11561109870672226, + "learning_rate": 4.316212602056276e-05, + "loss": 0.0378, + "step": 55820 + }, + { + "epoch": 0.27415, + "grad_norm": 0.08696012198925018, + "learning_rate": 4.315928518440624e-05, + "loss": 0.038, + "step": 55830 + }, + { + "epoch": 0.2742, + "grad_norm": 0.08141034841537476, + "learning_rate": 4.3156443851781695e-05, + "loss": 0.0383, + "step": 55840 + }, + { + "epoch": 0.27425, + "grad_norm": 0.08150725811719894, + "learning_rate": 4.31536020227668e-05, + "loss": 0.0376, + "step": 55850 + }, + { + "epoch": 0.2743, + "grad_norm": 0.07563025504350662, + "learning_rate": 4.3150759697439246e-05, + "loss": 0.0368, + "step": 55860 + }, + { + "epoch": 0.27435, + "grad_norm": 0.0782807469367981, + "learning_rate": 4.3147916875876734e-05, + "loss": 0.036, + "step": 55870 + }, + { + "epoch": 0.2744, + "grad_norm": 0.09279277920722961, + "learning_rate": 4.3145073558157e-05, + "loss": 0.039, + "step": 55880 + }, + { + "epoch": 0.27445, + "grad_norm": 0.0960644781589508, + "learning_rate": 4.314222974435776e-05, + "loss": 0.0372, + "step": 55890 + }, + { + "epoch": 0.2745, + "grad_norm": 0.10694056004285812, + "learning_rate": 4.313938543455679e-05, + "loss": 0.0402, + "step": 55900 + }, + { + "epoch": 0.27455, + "grad_norm": 0.10038580745458603, + "learning_rate": 4.313654062883183e-05, + "loss": 0.0392, + "step": 55910 + }, + { + "epoch": 0.2746, + "grad_norm": 0.13570621609687805, + "learning_rate": 4.313369532726066e-05, + "loss": 0.0386, + "step": 55920 + }, + { + "epoch": 0.27465, + "grad_norm": 0.10145073384046555, + "learning_rate": 4.313084952992108e-05, + "loss": 0.0402, + "step": 55930 + }, + { + "epoch": 0.2747, + "grad_norm": 0.10217112302780151, + "learning_rate": 4.3128003236890876e-05, + "loss": 0.0375, + "step": 55940 + }, + { + "epoch": 0.27475, + "grad_norm": 0.09518636018037796, + "learning_rate": 4.312515644824788e-05, + "loss": 0.0394, + "step": 55950 + }, + { + "epoch": 0.2748, + "grad_norm": 0.09258914738893509, + "learning_rate": 4.312230916406991e-05, + "loss": 0.038, + "step": 55960 + }, + { + "epoch": 0.27485, + "grad_norm": 0.09878391027450562, + "learning_rate": 4.311946138443482e-05, + "loss": 0.0379, + "step": 55970 + }, + { + "epoch": 0.2749, + "grad_norm": 0.09210311621427536, + "learning_rate": 4.311661310942047e-05, + "loss": 0.0366, + "step": 55980 + }, + { + "epoch": 0.27495, + "grad_norm": 0.09525465220212936, + "learning_rate": 4.311376433910471e-05, + "loss": 0.0375, + "step": 55990 + }, + { + "epoch": 0.275, + "grad_norm": 0.105075404047966, + "learning_rate": 4.3110915073565444e-05, + "loss": 0.0392, + "step": 56000 + }, + { + "epoch": 0.27505, + "grad_norm": 0.09033656865358353, + "learning_rate": 4.3108065312880566e-05, + "loss": 0.0406, + "step": 56010 + }, + { + "epoch": 0.2751, + "grad_norm": 0.09966182708740234, + "learning_rate": 4.3105215057127984e-05, + "loss": 0.0363, + "step": 56020 + }, + { + "epoch": 0.27515, + "grad_norm": 0.09546273201704025, + "learning_rate": 4.3102364306385624e-05, + "loss": 0.0371, + "step": 56030 + }, + { + "epoch": 0.2752, + "grad_norm": 0.09017182886600494, + "learning_rate": 4.309951306073142e-05, + "loss": 0.0371, + "step": 56040 + }, + { + "epoch": 0.27525, + "grad_norm": 0.1062690019607544, + "learning_rate": 4.3096661320243334e-05, + "loss": 0.0365, + "step": 56050 + }, + { + "epoch": 0.2753, + "grad_norm": 0.10952750593423843, + "learning_rate": 4.3093809084999325e-05, + "loss": 0.0377, + "step": 56060 + }, + { + "epoch": 0.27535, + "grad_norm": 0.11846937239170074, + "learning_rate": 4.3090956355077375e-05, + "loss": 0.0364, + "step": 56070 + }, + { + "epoch": 0.2754, + "grad_norm": 0.11436047405004501, + "learning_rate": 4.308810313055547e-05, + "loss": 0.0394, + "step": 56080 + }, + { + "epoch": 0.27545, + "grad_norm": 0.09996142238378525, + "learning_rate": 4.308524941151163e-05, + "loss": 0.0381, + "step": 56090 + }, + { + "epoch": 0.2755, + "grad_norm": 0.0903242826461792, + "learning_rate": 4.3082395198023854e-05, + "loss": 0.037, + "step": 56100 + }, + { + "epoch": 0.27555, + "grad_norm": 0.08647549897432327, + "learning_rate": 4.307954049017019e-05, + "loss": 0.0411, + "step": 56110 + }, + { + "epoch": 0.2756, + "grad_norm": 0.10381969064474106, + "learning_rate": 4.307668528802868e-05, + "loss": 0.0396, + "step": 56120 + }, + { + "epoch": 0.27565, + "grad_norm": 0.09873152524232864, + "learning_rate": 4.3073829591677396e-05, + "loss": 0.0399, + "step": 56130 + }, + { + "epoch": 0.2757, + "grad_norm": 0.09237905591726303, + "learning_rate": 4.307097340119439e-05, + "loss": 0.0366, + "step": 56140 + }, + { + "epoch": 0.27575, + "grad_norm": 0.11224275082349777, + "learning_rate": 4.3068116716657764e-05, + "loss": 0.0367, + "step": 56150 + }, + { + "epoch": 0.2758, + "grad_norm": 0.10116568952798843, + "learning_rate": 4.3065259538145616e-05, + "loss": 0.038, + "step": 56160 + }, + { + "epoch": 0.27585, + "grad_norm": 0.11430336534976959, + "learning_rate": 4.306240186573606e-05, + "loss": 0.0369, + "step": 56170 + }, + { + "epoch": 0.2759, + "grad_norm": 0.09505539387464523, + "learning_rate": 4.305954369950722e-05, + "loss": 0.0383, + "step": 56180 + }, + { + "epoch": 0.27595, + "grad_norm": 0.09002593159675598, + "learning_rate": 4.305668503953724e-05, + "loss": 0.0401, + "step": 56190 + }, + { + "epoch": 0.276, + "grad_norm": 0.10569896548986435, + "learning_rate": 4.3053825885904264e-05, + "loss": 0.0381, + "step": 56200 + }, + { + "epoch": 0.27605, + "grad_norm": 0.10630550235509872, + "learning_rate": 4.3050966238686483e-05, + "loss": 0.0374, + "step": 56210 + }, + { + "epoch": 0.2761, + "grad_norm": 0.09257291257381439, + "learning_rate": 4.3048106097962066e-05, + "loss": 0.0383, + "step": 56220 + }, + { + "epoch": 0.27615, + "grad_norm": 0.1080874353647232, + "learning_rate": 4.30452454638092e-05, + "loss": 0.0379, + "step": 56230 + }, + { + "epoch": 0.2762, + "grad_norm": 0.10577518492937088, + "learning_rate": 4.304238433630612e-05, + "loss": 0.0417, + "step": 56240 + }, + { + "epoch": 0.27625, + "grad_norm": 0.10467962175607681, + "learning_rate": 4.303952271553101e-05, + "loss": 0.0383, + "step": 56250 + }, + { + "epoch": 0.2763, + "grad_norm": 0.12002705782651901, + "learning_rate": 4.303666060156214e-05, + "loss": 0.0397, + "step": 56260 + }, + { + "epoch": 0.27635, + "grad_norm": 0.10973189771175385, + "learning_rate": 4.303379799447774e-05, + "loss": 0.0372, + "step": 56270 + }, + { + "epoch": 0.2764, + "grad_norm": 0.10119590163230896, + "learning_rate": 4.3030934894356076e-05, + "loss": 0.0382, + "step": 56280 + }, + { + "epoch": 0.27645, + "grad_norm": 0.11402632296085358, + "learning_rate": 4.302807130127543e-05, + "loss": 0.0407, + "step": 56290 + }, + { + "epoch": 0.2765, + "grad_norm": 0.10487694293260574, + "learning_rate": 4.302520721531409e-05, + "loss": 0.0393, + "step": 56300 + }, + { + "epoch": 0.27655, + "grad_norm": 0.11392984539270401, + "learning_rate": 4.302234263655035e-05, + "loss": 0.0374, + "step": 56310 + }, + { + "epoch": 0.2766, + "grad_norm": 0.11727987974882126, + "learning_rate": 4.301947756506254e-05, + "loss": 0.0392, + "step": 56320 + }, + { + "epoch": 0.27665, + "grad_norm": 0.14777207374572754, + "learning_rate": 4.301661200092898e-05, + "loss": 0.0384, + "step": 56330 + }, + { + "epoch": 0.2767, + "grad_norm": 0.10190843045711517, + "learning_rate": 4.3013745944228014e-05, + "loss": 0.0386, + "step": 56340 + }, + { + "epoch": 0.27675, + "grad_norm": 0.10420718789100647, + "learning_rate": 4.3010879395038e-05, + "loss": 0.0407, + "step": 56350 + }, + { + "epoch": 0.2768, + "grad_norm": 0.09990087896585464, + "learning_rate": 4.300801235343731e-05, + "loss": 0.0384, + "step": 56360 + }, + { + "epoch": 0.27685, + "grad_norm": 0.11819495260715485, + "learning_rate": 4.3005144819504335e-05, + "loss": 0.0381, + "step": 56370 + }, + { + "epoch": 0.2769, + "grad_norm": 0.08432481437921524, + "learning_rate": 4.300227679331745e-05, + "loss": 0.0369, + "step": 56380 + }, + { + "epoch": 0.27695, + "grad_norm": 0.10276342183351517, + "learning_rate": 4.29994082749551e-05, + "loss": 0.0384, + "step": 56390 + }, + { + "epoch": 0.277, + "grad_norm": 0.10119026154279709, + "learning_rate": 4.2996539264495674e-05, + "loss": 0.0371, + "step": 56400 + }, + { + "epoch": 0.27705, + "grad_norm": 0.11653642356395721, + "learning_rate": 4.2993669762017636e-05, + "loss": 0.0382, + "step": 56410 + }, + { + "epoch": 0.2771, + "grad_norm": 0.09874815493822098, + "learning_rate": 4.299079976759942e-05, + "loss": 0.0366, + "step": 56420 + }, + { + "epoch": 0.27715, + "grad_norm": 0.09425827860832214, + "learning_rate": 4.2987929281319505e-05, + "loss": 0.0373, + "step": 56430 + }, + { + "epoch": 0.2772, + "grad_norm": 0.08276829123497009, + "learning_rate": 4.2985058303256357e-05, + "loss": 0.0382, + "step": 56440 + }, + { + "epoch": 0.27725, + "grad_norm": 0.10070496797561646, + "learning_rate": 4.298218683348846e-05, + "loss": 0.0371, + "step": 56450 + }, + { + "epoch": 0.2773, + "grad_norm": 0.0893116220831871, + "learning_rate": 4.2979314872094343e-05, + "loss": 0.039, + "step": 56460 + }, + { + "epoch": 0.27735, + "grad_norm": 0.13710635900497437, + "learning_rate": 4.297644241915251e-05, + "loss": 0.0398, + "step": 56470 + }, + { + "epoch": 0.2774, + "grad_norm": 0.09827051311731339, + "learning_rate": 4.2973569474741496e-05, + "loss": 0.0376, + "step": 56480 + }, + { + "epoch": 0.27745, + "grad_norm": 0.08752574771642685, + "learning_rate": 4.297069603893984e-05, + "loss": 0.0399, + "step": 56490 + }, + { + "epoch": 0.2775, + "grad_norm": 0.11656755954027176, + "learning_rate": 4.296782211182611e-05, + "loss": 0.039, + "step": 56500 + }, + { + "epoch": 0.27755, + "grad_norm": 0.08865627646446228, + "learning_rate": 4.296494769347887e-05, + "loss": 0.0384, + "step": 56510 + }, + { + "epoch": 0.2776, + "grad_norm": 0.11651672422885895, + "learning_rate": 4.2962072783976714e-05, + "loss": 0.0384, + "step": 56520 + }, + { + "epoch": 0.27765, + "grad_norm": 0.09058934450149536, + "learning_rate": 4.2959197383398234e-05, + "loss": 0.0389, + "step": 56530 + }, + { + "epoch": 0.2777, + "grad_norm": 0.10706375539302826, + "learning_rate": 4.295632149182205e-05, + "loss": 0.0377, + "step": 56540 + }, + { + "epoch": 0.27775, + "grad_norm": 0.09810793399810791, + "learning_rate": 4.295344510932677e-05, + "loss": 0.0379, + "step": 56550 + }, + { + "epoch": 0.2778, + "grad_norm": 0.102114737033844, + "learning_rate": 4.295056823599106e-05, + "loss": 0.0383, + "step": 56560 + }, + { + "epoch": 0.27785, + "grad_norm": 0.09272314608097076, + "learning_rate": 4.294769087189354e-05, + "loss": 0.0363, + "step": 56570 + }, + { + "epoch": 0.2779, + "grad_norm": 0.10033122450113297, + "learning_rate": 4.29448130171129e-05, + "loss": 0.0379, + "step": 56580 + }, + { + "epoch": 0.27795, + "grad_norm": 0.08752158284187317, + "learning_rate": 4.2941934671727826e-05, + "loss": 0.0382, + "step": 56590 + }, + { + "epoch": 0.278, + "grad_norm": 0.0926806703209877, + "learning_rate": 4.293905583581699e-05, + "loss": 0.0366, + "step": 56600 + }, + { + "epoch": 0.27805, + "grad_norm": 0.08347244560718536, + "learning_rate": 4.293617650945911e-05, + "loss": 0.0382, + "step": 56610 + }, + { + "epoch": 0.2781, + "grad_norm": 0.09313614666461945, + "learning_rate": 4.29332966927329e-05, + "loss": 0.0384, + "step": 56620 + }, + { + "epoch": 0.27815, + "grad_norm": 0.08686469495296478, + "learning_rate": 4.2930416385717095e-05, + "loss": 0.0381, + "step": 56630 + }, + { + "epoch": 0.2782, + "grad_norm": 0.13193097710609436, + "learning_rate": 4.292753558849044e-05, + "loss": 0.04, + "step": 56640 + }, + { + "epoch": 0.27825, + "grad_norm": 0.09673771262168884, + "learning_rate": 4.2924654301131705e-05, + "loss": 0.0388, + "step": 56650 + }, + { + "epoch": 0.2783, + "grad_norm": 0.09554016590118408, + "learning_rate": 4.292177252371965e-05, + "loss": 0.0389, + "step": 56660 + }, + { + "epoch": 0.27835, + "grad_norm": 0.09741620719432831, + "learning_rate": 4.291889025633307e-05, + "loss": 0.0389, + "step": 56670 + }, + { + "epoch": 0.2784, + "grad_norm": 0.09482339769601822, + "learning_rate": 4.291600749905076e-05, + "loss": 0.039, + "step": 56680 + }, + { + "epoch": 0.27845, + "grad_norm": 0.0791984349489212, + "learning_rate": 4.291312425195153e-05, + "loss": 0.0416, + "step": 56690 + }, + { + "epoch": 0.2785, + "grad_norm": 0.09493456780910492, + "learning_rate": 4.291024051511422e-05, + "loss": 0.0377, + "step": 56700 + }, + { + "epoch": 0.27855, + "grad_norm": 0.0838426798582077, + "learning_rate": 4.290735628861766e-05, + "loss": 0.0372, + "step": 56710 + }, + { + "epoch": 0.2786, + "grad_norm": 0.10570637881755829, + "learning_rate": 4.29044715725407e-05, + "loss": 0.0377, + "step": 56720 + }, + { + "epoch": 0.27865, + "grad_norm": 0.09676932543516159, + "learning_rate": 4.290158636696223e-05, + "loss": 0.0384, + "step": 56730 + }, + { + "epoch": 0.2787, + "grad_norm": 0.0901820957660675, + "learning_rate": 4.28987006719611e-05, + "loss": 0.038, + "step": 56740 + }, + { + "epoch": 0.27875, + "grad_norm": 0.07807715237140656, + "learning_rate": 4.289581448761623e-05, + "loss": 0.0371, + "step": 56750 + }, + { + "epoch": 0.2788, + "grad_norm": 0.07762596756219864, + "learning_rate": 4.28929278140065e-05, + "loss": 0.0365, + "step": 56760 + }, + { + "epoch": 0.27885, + "grad_norm": 0.10255575180053711, + "learning_rate": 4.2890040651210856e-05, + "loss": 0.0383, + "step": 56770 + }, + { + "epoch": 0.2789, + "grad_norm": 0.10037130117416382, + "learning_rate": 4.288715299930822e-05, + "loss": 0.0397, + "step": 56780 + }, + { + "epoch": 0.27895, + "grad_norm": 0.09931264072656631, + "learning_rate": 4.2884264858377544e-05, + "loss": 0.0363, + "step": 56790 + }, + { + "epoch": 0.279, + "grad_norm": 0.09609793871641159, + "learning_rate": 4.2881376228497776e-05, + "loss": 0.0365, + "step": 56800 + }, + { + "epoch": 0.27905, + "grad_norm": 0.11774688214063644, + "learning_rate": 4.287848710974791e-05, + "loss": 0.0378, + "step": 56810 + }, + { + "epoch": 0.2791, + "grad_norm": 0.11118960380554199, + "learning_rate": 4.287559750220692e-05, + "loss": 0.0378, + "step": 56820 + }, + { + "epoch": 0.27915, + "grad_norm": 0.09443902224302292, + "learning_rate": 4.287270740595381e-05, + "loss": 0.0388, + "step": 56830 + }, + { + "epoch": 0.2792, + "grad_norm": 0.11444979161024094, + "learning_rate": 4.286981682106759e-05, + "loss": 0.0381, + "step": 56840 + }, + { + "epoch": 0.27925, + "grad_norm": 0.10338012874126434, + "learning_rate": 4.286692574762729e-05, + "loss": 0.0369, + "step": 56850 + }, + { + "epoch": 0.2793, + "grad_norm": 0.09579575806856155, + "learning_rate": 4.2864034185711955e-05, + "loss": 0.0377, + "step": 56860 + }, + { + "epoch": 0.27935, + "grad_norm": 0.08403484523296356, + "learning_rate": 4.286114213540063e-05, + "loss": 0.0389, + "step": 56870 + }, + { + "epoch": 0.2794, + "grad_norm": 0.09917088598012924, + "learning_rate": 4.2858249596772404e-05, + "loss": 0.0386, + "step": 56880 + }, + { + "epoch": 0.27945, + "grad_norm": 0.09660632908344269, + "learning_rate": 4.2855356569906335e-05, + "loss": 0.0394, + "step": 56890 + }, + { + "epoch": 0.2795, + "grad_norm": 0.10014674812555313, + "learning_rate": 4.2852463054881523e-05, + "loss": 0.04, + "step": 56900 + }, + { + "epoch": 0.27955, + "grad_norm": 0.10737550258636475, + "learning_rate": 4.2849569051777083e-05, + "loss": 0.0402, + "step": 56910 + }, + { + "epoch": 0.2796, + "grad_norm": 0.11162016540765762, + "learning_rate": 4.284667456067213e-05, + "loss": 0.0381, + "step": 56920 + }, + { + "epoch": 0.27965, + "grad_norm": 0.1165522113442421, + "learning_rate": 4.2843779581645796e-05, + "loss": 0.0428, + "step": 56930 + }, + { + "epoch": 0.2797, + "grad_norm": 0.10214854031801224, + "learning_rate": 4.2840884114777235e-05, + "loss": 0.039, + "step": 56940 + }, + { + "epoch": 0.27975, + "grad_norm": 0.09295105934143066, + "learning_rate": 4.2837988160145605e-05, + "loss": 0.0372, + "step": 56950 + }, + { + "epoch": 0.2798, + "grad_norm": 0.11330553889274597, + "learning_rate": 4.283509171783008e-05, + "loss": 0.038, + "step": 56960 + }, + { + "epoch": 0.27985, + "grad_norm": 0.09909593313932419, + "learning_rate": 4.283219478790984e-05, + "loss": 0.0374, + "step": 56970 + }, + { + "epoch": 0.2799, + "grad_norm": 0.11746770143508911, + "learning_rate": 4.282929737046411e-05, + "loss": 0.038, + "step": 56980 + }, + { + "epoch": 0.27995, + "grad_norm": 0.17033761739730835, + "learning_rate": 4.282639946557208e-05, + "loss": 0.0372, + "step": 56990 + }, + { + "epoch": 0.28, + "grad_norm": 0.09611111134290695, + "learning_rate": 4.2823501073312975e-05, + "loss": 0.0408, + "step": 57000 + }, + { + "epoch": 0.28005, + "grad_norm": 0.09751928597688675, + "learning_rate": 4.282060219376606e-05, + "loss": 0.0407, + "step": 57010 + }, + { + "epoch": 0.2801, + "grad_norm": 0.09460132569074631, + "learning_rate": 4.281770282701057e-05, + "loss": 0.0399, + "step": 57020 + }, + { + "epoch": 0.28015, + "grad_norm": 0.08417051285505295, + "learning_rate": 4.2814802973125776e-05, + "loss": 0.0383, + "step": 57030 + }, + { + "epoch": 0.2802, + "grad_norm": 0.112062469124794, + "learning_rate": 4.281190263219097e-05, + "loss": 0.0379, + "step": 57040 + }, + { + "epoch": 0.28025, + "grad_norm": 0.09878487139940262, + "learning_rate": 4.280900180428543e-05, + "loss": 0.0377, + "step": 57050 + }, + { + "epoch": 0.2803, + "grad_norm": 0.10927791148424149, + "learning_rate": 4.280610048948848e-05, + "loss": 0.0372, + "step": 57060 + }, + { + "epoch": 0.28035, + "grad_norm": 0.10309398174285889, + "learning_rate": 4.280319868787942e-05, + "loss": 0.0376, + "step": 57070 + }, + { + "epoch": 0.2804, + "grad_norm": 0.10313721746206284, + "learning_rate": 4.28002963995376e-05, + "loss": 0.0374, + "step": 57080 + }, + { + "epoch": 0.28045, + "grad_norm": 0.09849844127893448, + "learning_rate": 4.279739362454237e-05, + "loss": 0.0381, + "step": 57090 + }, + { + "epoch": 0.2805, + "grad_norm": 0.09395359456539154, + "learning_rate": 4.2794490362973084e-05, + "loss": 0.0369, + "step": 57100 + }, + { + "epoch": 0.28055, + "grad_norm": 0.1996801346540451, + "learning_rate": 4.2791586614909105e-05, + "loss": 0.0389, + "step": 57110 + }, + { + "epoch": 0.2806, + "grad_norm": 0.14284029603004456, + "learning_rate": 4.278868238042984e-05, + "loss": 0.0392, + "step": 57120 + }, + { + "epoch": 0.28065, + "grad_norm": 0.10570540279150009, + "learning_rate": 4.278577765961469e-05, + "loss": 0.0392, + "step": 57130 + }, + { + "epoch": 0.2807, + "grad_norm": 0.104091115295887, + "learning_rate": 4.2782872452543056e-05, + "loss": 0.0379, + "step": 57140 + }, + { + "epoch": 0.28075, + "grad_norm": 0.10061943531036377, + "learning_rate": 4.277996675929437e-05, + "loss": 0.0384, + "step": 57150 + }, + { + "epoch": 0.2808, + "grad_norm": 0.09896393865346909, + "learning_rate": 4.277706057994806e-05, + "loss": 0.0382, + "step": 57160 + }, + { + "epoch": 0.28085, + "grad_norm": 0.08161722123622894, + "learning_rate": 4.27741539145836e-05, + "loss": 0.0377, + "step": 57170 + }, + { + "epoch": 0.2809, + "grad_norm": 0.11624377965927124, + "learning_rate": 4.277124676328045e-05, + "loss": 0.0387, + "step": 57180 + }, + { + "epoch": 0.28095, + "grad_norm": 0.095398910343647, + "learning_rate": 4.27683391261181e-05, + "loss": 0.0387, + "step": 57190 + }, + { + "epoch": 0.281, + "grad_norm": 0.0983344167470932, + "learning_rate": 4.2765431003176015e-05, + "loss": 0.0386, + "step": 57200 + }, + { + "epoch": 0.28105, + "grad_norm": 0.10947199165821075, + "learning_rate": 4.276252239453373e-05, + "loss": 0.0383, + "step": 57210 + }, + { + "epoch": 0.2811, + "grad_norm": 0.08485335111618042, + "learning_rate": 4.275961330027076e-05, + "loss": 0.0373, + "step": 57220 + }, + { + "epoch": 0.28115, + "grad_norm": 0.12160183489322662, + "learning_rate": 4.2756703720466626e-05, + "loss": 0.0404, + "step": 57230 + }, + { + "epoch": 0.2812, + "grad_norm": 0.09800686687231064, + "learning_rate": 4.275379365520089e-05, + "loss": 0.0391, + "step": 57240 + }, + { + "epoch": 0.28125, + "grad_norm": 0.11026581376791, + "learning_rate": 4.2750883104553096e-05, + "loss": 0.041, + "step": 57250 + }, + { + "epoch": 0.2813, + "grad_norm": 0.10947220027446747, + "learning_rate": 4.274797206860284e-05, + "loss": 0.04, + "step": 57260 + }, + { + "epoch": 0.28135, + "grad_norm": 0.11520034074783325, + "learning_rate": 4.2745060547429685e-05, + "loss": 0.0381, + "step": 57270 + }, + { + "epoch": 0.2814, + "grad_norm": 0.09699587523937225, + "learning_rate": 4.274214854111324e-05, + "loss": 0.0377, + "step": 57280 + }, + { + "epoch": 0.28145, + "grad_norm": 0.08693718910217285, + "learning_rate": 4.2739236049733124e-05, + "loss": 0.0378, + "step": 57290 + }, + { + "epoch": 0.2815, + "grad_norm": 0.09276087582111359, + "learning_rate": 4.273632307336896e-05, + "loss": 0.0374, + "step": 57300 + }, + { + "epoch": 0.28155, + "grad_norm": 0.08943061530590057, + "learning_rate": 4.273340961210038e-05, + "loss": 0.0366, + "step": 57310 + }, + { + "epoch": 0.2816, + "grad_norm": 0.09189813584089279, + "learning_rate": 4.273049566600705e-05, + "loss": 0.0365, + "step": 57320 + }, + { + "epoch": 0.28165, + "grad_norm": 0.09366247057914734, + "learning_rate": 4.272758123516863e-05, + "loss": 0.0389, + "step": 57330 + }, + { + "epoch": 0.2817, + "grad_norm": 0.10908438265323639, + "learning_rate": 4.2724666319664794e-05, + "loss": 0.0388, + "step": 57340 + }, + { + "epoch": 0.28175, + "grad_norm": 0.10651381313800812, + "learning_rate": 4.2721750919575246e-05, + "loss": 0.0384, + "step": 57350 + }, + { + "epoch": 0.2818, + "grad_norm": 0.13720935583114624, + "learning_rate": 4.271883503497967e-05, + "loss": 0.0409, + "step": 57360 + }, + { + "epoch": 0.28185, + "grad_norm": 0.1090945228934288, + "learning_rate": 4.271591866595782e-05, + "loss": 0.0377, + "step": 57370 + }, + { + "epoch": 0.2819, + "grad_norm": 0.11343448609113693, + "learning_rate": 4.27130018125894e-05, + "loss": 0.0387, + "step": 57380 + }, + { + "epoch": 0.28195, + "grad_norm": 0.12336157262325287, + "learning_rate": 4.271008447495417e-05, + "loss": 0.0391, + "step": 57390 + }, + { + "epoch": 0.282, + "grad_norm": 0.0998220294713974, + "learning_rate": 4.270716665313188e-05, + "loss": 0.0383, + "step": 57400 + }, + { + "epoch": 0.28205, + "grad_norm": 0.10613575577735901, + "learning_rate": 4.27042483472023e-05, + "loss": 0.0362, + "step": 57410 + }, + { + "epoch": 0.2821, + "grad_norm": 0.10241192579269409, + "learning_rate": 4.2701329557245225e-05, + "loss": 0.0382, + "step": 57420 + }, + { + "epoch": 0.28215, + "grad_norm": 0.09192397445440292, + "learning_rate": 4.269841028334046e-05, + "loss": 0.0384, + "step": 57430 + }, + { + "epoch": 0.2822, + "grad_norm": 0.10546057671308517, + "learning_rate": 4.26954905255678e-05, + "loss": 0.0386, + "step": 57440 + }, + { + "epoch": 0.28225, + "grad_norm": 0.0989101231098175, + "learning_rate": 4.2692570284007074e-05, + "loss": 0.0385, + "step": 57450 + }, + { + "epoch": 0.2823, + "grad_norm": 0.10276032984256744, + "learning_rate": 4.268964955873813e-05, + "loss": 0.0377, + "step": 57460 + }, + { + "epoch": 0.28235, + "grad_norm": 0.0878804475069046, + "learning_rate": 4.2686728349840805e-05, + "loss": 0.0371, + "step": 57470 + }, + { + "epoch": 0.2824, + "grad_norm": 0.09753140062093735, + "learning_rate": 4.268380665739498e-05, + "loss": 0.0377, + "step": 57480 + }, + { + "epoch": 0.28245, + "grad_norm": 0.09146375954151154, + "learning_rate": 4.268088448148051e-05, + "loss": 0.038, + "step": 57490 + }, + { + "epoch": 0.2825, + "grad_norm": 0.08938898146152496, + "learning_rate": 4.2677961822177315e-05, + "loss": 0.0381, + "step": 57500 + }, + { + "epoch": 0.28255, + "grad_norm": 0.11164992302656174, + "learning_rate": 4.267503867956528e-05, + "loss": 0.0373, + "step": 57510 + }, + { + "epoch": 0.2826, + "grad_norm": 0.09699451923370361, + "learning_rate": 4.267211505372433e-05, + "loss": 0.0425, + "step": 57520 + }, + { + "epoch": 0.28265, + "grad_norm": 0.09400709718465805, + "learning_rate": 4.26691909447344e-05, + "loss": 0.0376, + "step": 57530 + }, + { + "epoch": 0.2827, + "grad_norm": 0.09006451070308685, + "learning_rate": 4.266626635267541e-05, + "loss": 0.0383, + "step": 57540 + }, + { + "epoch": 0.28275, + "grad_norm": 0.08708031475543976, + "learning_rate": 4.266334127762734e-05, + "loss": 0.0395, + "step": 57550 + }, + { + "epoch": 0.2828, + "grad_norm": 0.09977874159812927, + "learning_rate": 4.266041571967016e-05, + "loss": 0.0371, + "step": 57560 + }, + { + "epoch": 0.28285, + "grad_norm": 0.11364461481571198, + "learning_rate": 4.265748967888385e-05, + "loss": 0.0382, + "step": 57570 + }, + { + "epoch": 0.2829, + "grad_norm": 0.08485016971826553, + "learning_rate": 4.2654563155348406e-05, + "loss": 0.0396, + "step": 57580 + }, + { + "epoch": 0.28295, + "grad_norm": 0.10243216156959534, + "learning_rate": 4.2651636149143835e-05, + "loss": 0.0381, + "step": 57590 + }, + { + "epoch": 0.283, + "grad_norm": 0.12172680348157883, + "learning_rate": 4.2648708660350164e-05, + "loss": 0.0382, + "step": 57600 + }, + { + "epoch": 0.28305, + "grad_norm": 0.10779182612895966, + "learning_rate": 4.264578068904742e-05, + "loss": 0.0387, + "step": 57610 + }, + { + "epoch": 0.2831, + "grad_norm": 0.0988893210887909, + "learning_rate": 4.264285223531568e-05, + "loss": 0.0369, + "step": 57620 + }, + { + "epoch": 0.28315, + "grad_norm": 0.0954919382929802, + "learning_rate": 4.2639923299234976e-05, + "loss": 0.0379, + "step": 57630 + }, + { + "epoch": 0.2832, + "grad_norm": 0.1120079979300499, + "learning_rate": 4.263699388088539e-05, + "loss": 0.0402, + "step": 57640 + }, + { + "epoch": 0.28325, + "grad_norm": 0.10489992797374725, + "learning_rate": 4.263406398034703e-05, + "loss": 0.0373, + "step": 57650 + }, + { + "epoch": 0.2833, + "grad_norm": 0.09630849212408066, + "learning_rate": 4.263113359769998e-05, + "loss": 0.0407, + "step": 57660 + }, + { + "epoch": 0.28335, + "grad_norm": 0.0975610762834549, + "learning_rate": 4.262820273302436e-05, + "loss": 0.0382, + "step": 57670 + }, + { + "epoch": 0.2834, + "grad_norm": 0.10288140922784805, + "learning_rate": 4.2625271386400304e-05, + "loss": 0.0385, + "step": 57680 + }, + { + "epoch": 0.28345, + "grad_norm": 0.1018190085887909, + "learning_rate": 4.262233955790794e-05, + "loss": 0.0381, + "step": 57690 + }, + { + "epoch": 0.2835, + "grad_norm": 0.09703332185745239, + "learning_rate": 4.261940724762744e-05, + "loss": 0.0392, + "step": 57700 + }, + { + "epoch": 0.28355, + "grad_norm": 0.08421315252780914, + "learning_rate": 4.261647445563897e-05, + "loss": 0.0377, + "step": 57710 + }, + { + "epoch": 0.2836, + "grad_norm": 0.08458781242370605, + "learning_rate": 4.26135411820227e-05, + "loss": 0.038, + "step": 57720 + }, + { + "epoch": 0.28365, + "grad_norm": 0.12038341164588928, + "learning_rate": 4.261060742685883e-05, + "loss": 0.0395, + "step": 57730 + }, + { + "epoch": 0.2837, + "grad_norm": 0.10978465527296066, + "learning_rate": 4.260767319022757e-05, + "loss": 0.0397, + "step": 57740 + }, + { + "epoch": 0.28375, + "grad_norm": 0.12968258559703827, + "learning_rate": 4.260473847220915e-05, + "loss": 0.0401, + "step": 57750 + }, + { + "epoch": 0.2838, + "grad_norm": 0.11993337422609329, + "learning_rate": 4.2601803272883784e-05, + "loss": 0.0386, + "step": 57760 + }, + { + "epoch": 0.28385, + "grad_norm": 0.12599441409111023, + "learning_rate": 4.259886759233173e-05, + "loss": 0.039, + "step": 57770 + }, + { + "epoch": 0.2839, + "grad_norm": 0.0750201940536499, + "learning_rate": 4.259593143063325e-05, + "loss": 0.0378, + "step": 57780 + }, + { + "epoch": 0.28395, + "grad_norm": 0.0985063910484314, + "learning_rate": 4.259299478786861e-05, + "loss": 0.0394, + "step": 57790 + }, + { + "epoch": 0.284, + "grad_norm": 0.09771952033042908, + "learning_rate": 4.2590057664118106e-05, + "loss": 0.0376, + "step": 57800 + }, + { + "epoch": 0.28405, + "grad_norm": 0.1153845489025116, + "learning_rate": 4.258712005946204e-05, + "loss": 0.0406, + "step": 57810 + }, + { + "epoch": 0.2841, + "grad_norm": 0.09112930297851562, + "learning_rate": 4.258418197398071e-05, + "loss": 0.0398, + "step": 57820 + }, + { + "epoch": 0.28415, + "grad_norm": 0.10500326007604599, + "learning_rate": 4.258124340775445e-05, + "loss": 0.0411, + "step": 57830 + }, + { + "epoch": 0.2842, + "grad_norm": 0.10515421628952026, + "learning_rate": 4.257830436086361e-05, + "loss": 0.0452, + "step": 57840 + }, + { + "epoch": 0.28425, + "grad_norm": 0.09916632622480392, + "learning_rate": 4.257536483338852e-05, + "loss": 0.0395, + "step": 57850 + }, + { + "epoch": 0.2843, + "grad_norm": 0.09431524574756622, + "learning_rate": 4.257242482540956e-05, + "loss": 0.0404, + "step": 57860 + }, + { + "epoch": 0.28435, + "grad_norm": 0.12175924330949783, + "learning_rate": 4.256948433700712e-05, + "loss": 0.0417, + "step": 57870 + }, + { + "epoch": 0.2844, + "grad_norm": 0.09342382103204727, + "learning_rate": 4.2566543368261564e-05, + "loss": 0.0431, + "step": 57880 + }, + { + "epoch": 0.28445, + "grad_norm": 0.09715968370437622, + "learning_rate": 4.256360191925332e-05, + "loss": 0.0387, + "step": 57890 + }, + { + "epoch": 0.2845, + "grad_norm": 0.09505413472652435, + "learning_rate": 4.256065999006279e-05, + "loss": 0.0379, + "step": 57900 + }, + { + "epoch": 0.28455, + "grad_norm": 0.0904054269194603, + "learning_rate": 4.255771758077042e-05, + "loss": 0.0387, + "step": 57910 + }, + { + "epoch": 0.2846, + "grad_norm": 0.0966554805636406, + "learning_rate": 4.255477469145665e-05, + "loss": 0.0395, + "step": 57920 + }, + { + "epoch": 0.28465, + "grad_norm": 0.09629569947719574, + "learning_rate": 4.255183132220192e-05, + "loss": 0.04, + "step": 57930 + }, + { + "epoch": 0.2847, + "grad_norm": 0.10692469030618668, + "learning_rate": 4.254888747308673e-05, + "loss": 0.0395, + "step": 57940 + }, + { + "epoch": 0.28475, + "grad_norm": 0.10003534704446793, + "learning_rate": 4.254594314419155e-05, + "loss": 0.0387, + "step": 57950 + }, + { + "epoch": 0.2848, + "grad_norm": 0.09970128536224365, + "learning_rate": 4.254299833559687e-05, + "loss": 0.0395, + "step": 57960 + }, + { + "epoch": 0.28485, + "grad_norm": 0.11436878144741058, + "learning_rate": 4.2540053047383214e-05, + "loss": 0.0403, + "step": 57970 + }, + { + "epoch": 0.2849, + "grad_norm": 0.10870227962732315, + "learning_rate": 4.2537107279631084e-05, + "loss": 0.0385, + "step": 57980 + }, + { + "epoch": 0.28495, + "grad_norm": 0.09994117170572281, + "learning_rate": 4.2534161032421037e-05, + "loss": 0.04, + "step": 57990 + }, + { + "epoch": 0.285, + "grad_norm": 0.11786071956157684, + "learning_rate": 4.2531214305833614e-05, + "loss": 0.0401, + "step": 58000 + }, + { + "epoch": 0.28505, + "grad_norm": 0.09488601982593536, + "learning_rate": 4.252826709994938e-05, + "loss": 0.0396, + "step": 58010 + }, + { + "epoch": 0.2851, + "grad_norm": 0.10463389754295349, + "learning_rate": 4.252531941484891e-05, + "loss": 0.0393, + "step": 58020 + }, + { + "epoch": 0.28515, + "grad_norm": 0.10832806676626205, + "learning_rate": 4.252237125061279e-05, + "loss": 0.0405, + "step": 58030 + }, + { + "epoch": 0.2852, + "grad_norm": 0.11303062736988068, + "learning_rate": 4.251942260732161e-05, + "loss": 0.0422, + "step": 58040 + }, + { + "epoch": 0.28525, + "grad_norm": 0.10420338064432144, + "learning_rate": 4.251647348505601e-05, + "loss": 0.0391, + "step": 58050 + }, + { + "epoch": 0.2853, + "grad_norm": 0.14655640721321106, + "learning_rate": 4.25135238838966e-05, + "loss": 0.0394, + "step": 58060 + }, + { + "epoch": 0.28535, + "grad_norm": 0.09848672896623611, + "learning_rate": 4.251057380392404e-05, + "loss": 0.0408, + "step": 58070 + }, + { + "epoch": 0.2854, + "grad_norm": 0.09891166538000107, + "learning_rate": 4.250762324521896e-05, + "loss": 0.0392, + "step": 58080 + }, + { + "epoch": 0.28545, + "grad_norm": 0.0940348282456398, + "learning_rate": 4.250467220786204e-05, + "loss": 0.0384, + "step": 58090 + }, + { + "epoch": 0.2855, + "grad_norm": 0.09893086552619934, + "learning_rate": 4.250172069193395e-05, + "loss": 0.0378, + "step": 58100 + }, + { + "epoch": 0.28555, + "grad_norm": 0.1129058450460434, + "learning_rate": 4.24987686975154e-05, + "loss": 0.0369, + "step": 58110 + }, + { + "epoch": 0.2856, + "grad_norm": 0.1151210144162178, + "learning_rate": 4.249581622468709e-05, + "loss": 0.0389, + "step": 58120 + }, + { + "epoch": 0.28565, + "grad_norm": 0.09511999785900116, + "learning_rate": 4.2492863273529734e-05, + "loss": 0.0367, + "step": 58130 + }, + { + "epoch": 0.2857, + "grad_norm": 0.12105721980333328, + "learning_rate": 4.2489909844124066e-05, + "loss": 0.0398, + "step": 58140 + }, + { + "epoch": 0.28575, + "grad_norm": 0.11214801669120789, + "learning_rate": 4.248695593655083e-05, + "loss": 0.0407, + "step": 58150 + }, + { + "epoch": 0.2858, + "grad_norm": 0.10821537673473358, + "learning_rate": 4.248400155089079e-05, + "loss": 0.0377, + "step": 58160 + }, + { + "epoch": 0.28585, + "grad_norm": 0.10529662668704987, + "learning_rate": 4.2481046687224726e-05, + "loss": 0.0382, + "step": 58170 + }, + { + "epoch": 0.2859, + "grad_norm": 0.10574857145547867, + "learning_rate": 4.2478091345633405e-05, + "loss": 0.0388, + "step": 58180 + }, + { + "epoch": 0.28595, + "grad_norm": 0.11062496155500412, + "learning_rate": 4.247513552619763e-05, + "loss": 0.0376, + "step": 58190 + }, + { + "epoch": 0.286, + "grad_norm": 0.10402261465787888, + "learning_rate": 4.247217922899822e-05, + "loss": 0.0382, + "step": 58200 + }, + { + "epoch": 0.28605, + "grad_norm": 0.10761887580156326, + "learning_rate": 4.2469222454115996e-05, + "loss": 0.0392, + "step": 58210 + }, + { + "epoch": 0.2861, + "grad_norm": 0.09106829017400742, + "learning_rate": 4.246626520163179e-05, + "loss": 0.0376, + "step": 58220 + }, + { + "epoch": 0.28615, + "grad_norm": 0.08656405657529831, + "learning_rate": 4.246330747162646e-05, + "loss": 0.0373, + "step": 58230 + }, + { + "epoch": 0.2862, + "grad_norm": 0.09331633150577545, + "learning_rate": 4.246034926418085e-05, + "loss": 0.0396, + "step": 58240 + }, + { + "epoch": 0.28625, + "grad_norm": 0.13578954339027405, + "learning_rate": 4.245739057937586e-05, + "loss": 0.0388, + "step": 58250 + }, + { + "epoch": 0.2863, + "grad_norm": 0.1069922149181366, + "learning_rate": 4.245443141729237e-05, + "loss": 0.0383, + "step": 58260 + }, + { + "epoch": 0.28635, + "grad_norm": 0.09611210972070694, + "learning_rate": 4.245147177801129e-05, + "loss": 0.0386, + "step": 58270 + }, + { + "epoch": 0.2864, + "grad_norm": 0.11979424953460693, + "learning_rate": 4.2448511661613514e-05, + "loss": 0.0383, + "step": 58280 + }, + { + "epoch": 0.28645, + "grad_norm": 0.11691362410783768, + "learning_rate": 4.244555106817999e-05, + "loss": 0.0414, + "step": 58290 + }, + { + "epoch": 0.2865, + "grad_norm": 0.11565831303596497, + "learning_rate": 4.2442589997791655e-05, + "loss": 0.0392, + "step": 58300 + }, + { + "epoch": 0.28655, + "grad_norm": 0.12460681796073914, + "learning_rate": 4.2439628450529455e-05, + "loss": 0.0406, + "step": 58310 + }, + { + "epoch": 0.2866, + "grad_norm": 0.11439589411020279, + "learning_rate": 4.2436666426474374e-05, + "loss": 0.0398, + "step": 58320 + }, + { + "epoch": 0.28665, + "grad_norm": 0.11800641566514969, + "learning_rate": 4.243370392570738e-05, + "loss": 0.0434, + "step": 58330 + }, + { + "epoch": 0.2867, + "grad_norm": 0.11225130409002304, + "learning_rate": 4.2430740948309475e-05, + "loss": 0.0391, + "step": 58340 + }, + { + "epoch": 0.28675, + "grad_norm": 0.12273237109184265, + "learning_rate": 4.2427777494361656e-05, + "loss": 0.0404, + "step": 58350 + }, + { + "epoch": 0.2868, + "grad_norm": 0.11912668496370316, + "learning_rate": 4.242481356394495e-05, + "loss": 0.0397, + "step": 58360 + }, + { + "epoch": 0.28685, + "grad_norm": 0.1070471778512001, + "learning_rate": 4.242184915714038e-05, + "loss": 0.039, + "step": 58370 + }, + { + "epoch": 0.2869, + "grad_norm": 0.09075344353914261, + "learning_rate": 4.241888427402901e-05, + "loss": 0.0382, + "step": 58380 + }, + { + "epoch": 0.28695, + "grad_norm": 0.09854944050312042, + "learning_rate": 4.2415918914691877e-05, + "loss": 0.0386, + "step": 58390 + }, + { + "epoch": 0.287, + "grad_norm": 0.09506674110889435, + "learning_rate": 4.241295307921007e-05, + "loss": 0.0365, + "step": 58400 + }, + { + "epoch": 0.28705, + "grad_norm": 0.1081521064043045, + "learning_rate": 4.240998676766467e-05, + "loss": 0.0374, + "step": 58410 + }, + { + "epoch": 0.2871, + "grad_norm": 0.0927954763174057, + "learning_rate": 4.240701998013677e-05, + "loss": 0.0387, + "step": 58420 + }, + { + "epoch": 0.28715, + "grad_norm": 0.08340541273355484, + "learning_rate": 4.240405271670749e-05, + "loss": 0.0364, + "step": 58430 + }, + { + "epoch": 0.2872, + "grad_norm": 0.07858669012784958, + "learning_rate": 4.240108497745793e-05, + "loss": 0.0377, + "step": 58440 + }, + { + "epoch": 0.28725, + "grad_norm": 0.09130463004112244, + "learning_rate": 4.239811676246925e-05, + "loss": 0.0378, + "step": 58450 + }, + { + "epoch": 0.2873, + "grad_norm": 0.09747853130102158, + "learning_rate": 4.23951480718226e-05, + "loss": 0.0379, + "step": 58460 + }, + { + "epoch": 0.28735, + "grad_norm": 0.11840526014566422, + "learning_rate": 4.239217890559914e-05, + "loss": 0.0384, + "step": 58470 + }, + { + "epoch": 0.2874, + "grad_norm": 0.114509217441082, + "learning_rate": 4.238920926388004e-05, + "loss": 0.0375, + "step": 58480 + }, + { + "epoch": 0.28745, + "grad_norm": 0.10365968942642212, + "learning_rate": 4.2386239146746484e-05, + "loss": 0.0389, + "step": 58490 + }, + { + "epoch": 0.2875, + "grad_norm": 0.11859627813100815, + "learning_rate": 4.238326855427969e-05, + "loss": 0.037, + "step": 58500 + }, + { + "epoch": 0.28755, + "grad_norm": 0.10479515790939331, + "learning_rate": 4.2380297486560855e-05, + "loss": 0.0398, + "step": 58510 + }, + { + "epoch": 0.2876, + "grad_norm": 0.09964556246995926, + "learning_rate": 4.237732594367122e-05, + "loss": 0.038, + "step": 58520 + }, + { + "epoch": 0.28765, + "grad_norm": 0.11377781629562378, + "learning_rate": 4.237435392569203e-05, + "loss": 0.0381, + "step": 58530 + }, + { + "epoch": 0.2877, + "grad_norm": 0.10320976376533508, + "learning_rate": 4.2371381432704525e-05, + "loss": 0.0374, + "step": 58540 + }, + { + "epoch": 0.28775, + "grad_norm": 0.10899948328733444, + "learning_rate": 4.236840846478998e-05, + "loss": 0.0376, + "step": 58550 + }, + { + "epoch": 0.2878, + "grad_norm": 0.10679149627685547, + "learning_rate": 4.236543502202966e-05, + "loss": 0.0387, + "step": 58560 + }, + { + "epoch": 0.28785, + "grad_norm": 0.1140972375869751, + "learning_rate": 4.236246110450488e-05, + "loss": 0.0391, + "step": 58570 + }, + { + "epoch": 0.2879, + "grad_norm": 0.1361556500196457, + "learning_rate": 4.235948671229694e-05, + "loss": 0.0376, + "step": 58580 + }, + { + "epoch": 0.28795, + "grad_norm": 0.11948247998952866, + "learning_rate": 4.2356511845487156e-05, + "loss": 0.0387, + "step": 58590 + }, + { + "epoch": 0.288, + "grad_norm": 0.10225441306829453, + "learning_rate": 4.2353536504156855e-05, + "loss": 0.0386, + "step": 58600 + }, + { + "epoch": 0.28805, + "grad_norm": 0.10306891798973083, + "learning_rate": 4.235056068838738e-05, + "loss": 0.0373, + "step": 58610 + }, + { + "epoch": 0.2881, + "grad_norm": 0.0958067774772644, + "learning_rate": 4.2347584398260096e-05, + "loss": 0.0374, + "step": 58620 + }, + { + "epoch": 0.28815, + "grad_norm": 0.11552973836660385, + "learning_rate": 4.234460763385638e-05, + "loss": 0.0376, + "step": 58630 + }, + { + "epoch": 0.2882, + "grad_norm": 0.10933344811201096, + "learning_rate": 4.2341630395257594e-05, + "loss": 0.037, + "step": 58640 + }, + { + "epoch": 0.28825, + "grad_norm": 0.08934877812862396, + "learning_rate": 4.233865268254516e-05, + "loss": 0.0366, + "step": 58650 + }, + { + "epoch": 0.2883, + "grad_norm": 0.09421779215335846, + "learning_rate": 4.233567449580047e-05, + "loss": 0.0365, + "step": 58660 + }, + { + "epoch": 0.28835, + "grad_norm": 0.09506764262914658, + "learning_rate": 4.233269583510495e-05, + "loss": 0.0376, + "step": 58670 + }, + { + "epoch": 0.2884, + "grad_norm": 0.09250709414482117, + "learning_rate": 4.232971670054005e-05, + "loss": 0.0362, + "step": 58680 + }, + { + "epoch": 0.28845, + "grad_norm": 0.08452494442462921, + "learning_rate": 4.2326737092187194e-05, + "loss": 0.0372, + "step": 58690 + }, + { + "epoch": 0.2885, + "grad_norm": 0.09364336729049683, + "learning_rate": 4.232375701012785e-05, + "loss": 0.036, + "step": 58700 + }, + { + "epoch": 0.28855, + "grad_norm": 0.08511223644018173, + "learning_rate": 4.2320776454443514e-05, + "loss": 0.0369, + "step": 58710 + }, + { + "epoch": 0.2886, + "grad_norm": 0.08995082229375839, + "learning_rate": 4.2317795425215645e-05, + "loss": 0.0355, + "step": 58720 + }, + { + "epoch": 0.28865, + "grad_norm": 0.09742347151041031, + "learning_rate": 4.231481392252576e-05, + "loss": 0.0423, + "step": 58730 + }, + { + "epoch": 0.2887, + "grad_norm": 0.09668943285942078, + "learning_rate": 4.2311831946455366e-05, + "loss": 0.0376, + "step": 58740 + }, + { + "epoch": 0.28875, + "grad_norm": 0.11909755319356918, + "learning_rate": 4.230884949708599e-05, + "loss": 0.0362, + "step": 58750 + }, + { + "epoch": 0.2888, + "grad_norm": 0.09919518232345581, + "learning_rate": 4.2305866574499166e-05, + "loss": 0.0367, + "step": 58760 + }, + { + "epoch": 0.28885, + "grad_norm": 0.11793835461139679, + "learning_rate": 4.230288317877646e-05, + "loss": 0.0414, + "step": 58770 + }, + { + "epoch": 0.2889, + "grad_norm": 0.13174089789390564, + "learning_rate": 4.2299899309999424e-05, + "loss": 0.038, + "step": 58780 + }, + { + "epoch": 0.28895, + "grad_norm": 0.08976200968027115, + "learning_rate": 4.229691496824965e-05, + "loss": 0.0369, + "step": 58790 + }, + { + "epoch": 0.289, + "grad_norm": 0.0908234715461731, + "learning_rate": 4.229393015360871e-05, + "loss": 0.0365, + "step": 58800 + }, + { + "epoch": 0.28905, + "grad_norm": 0.07467728108167648, + "learning_rate": 4.229094486615821e-05, + "loss": 0.0371, + "step": 58810 + }, + { + "epoch": 0.2891, + "grad_norm": 0.08888581395149231, + "learning_rate": 4.228795910597978e-05, + "loss": 0.0385, + "step": 58820 + }, + { + "epoch": 0.28915, + "grad_norm": 0.08418140560388565, + "learning_rate": 4.228497287315504e-05, + "loss": 0.0373, + "step": 58830 + }, + { + "epoch": 0.2892, + "grad_norm": 0.10168585926294327, + "learning_rate": 4.2281986167765644e-05, + "loss": 0.0364, + "step": 58840 + }, + { + "epoch": 0.28925, + "grad_norm": 0.08546897768974304, + "learning_rate": 4.227899898989323e-05, + "loss": 0.0369, + "step": 58850 + }, + { + "epoch": 0.2893, + "grad_norm": 0.10331574082374573, + "learning_rate": 4.2276011339619476e-05, + "loss": 0.0366, + "step": 58860 + }, + { + "epoch": 0.28935, + "grad_norm": 0.08570695668458939, + "learning_rate": 4.2273023217026066e-05, + "loss": 0.0351, + "step": 58870 + }, + { + "epoch": 0.2894, + "grad_norm": 0.10039761662483215, + "learning_rate": 4.2270034622194685e-05, + "loss": 0.0367, + "step": 58880 + }, + { + "epoch": 0.28945, + "grad_norm": 0.09401549398899078, + "learning_rate": 4.226704555520705e-05, + "loss": 0.0345, + "step": 58890 + }, + { + "epoch": 0.2895, + "grad_norm": 0.10001719743013382, + "learning_rate": 4.226405601614487e-05, + "loss": 0.0386, + "step": 58900 + }, + { + "epoch": 0.28955, + "grad_norm": 0.1016169935464859, + "learning_rate": 4.22610660050899e-05, + "loss": 0.0369, + "step": 58910 + }, + { + "epoch": 0.2896, + "grad_norm": 0.09658100455999374, + "learning_rate": 4.2258075522123854e-05, + "loss": 0.0353, + "step": 58920 + }, + { + "epoch": 0.28965, + "grad_norm": 0.09994114190340042, + "learning_rate": 4.225508456732851e-05, + "loss": 0.0366, + "step": 58930 + }, + { + "epoch": 0.2897, + "grad_norm": 0.0924081951379776, + "learning_rate": 4.225209314078564e-05, + "loss": 0.0396, + "step": 58940 + }, + { + "epoch": 0.28975, + "grad_norm": 0.10897225141525269, + "learning_rate": 4.224910124257702e-05, + "loss": 0.0411, + "step": 58950 + }, + { + "epoch": 0.2898, + "grad_norm": 0.08845224976539612, + "learning_rate": 4.224610887278446e-05, + "loss": 0.0387, + "step": 58960 + }, + { + "epoch": 0.28985, + "grad_norm": 0.10787303745746613, + "learning_rate": 4.224311603148976e-05, + "loss": 0.0412, + "step": 58970 + }, + { + "epoch": 0.2899, + "grad_norm": 0.08971309661865234, + "learning_rate": 4.2240122718774747e-05, + "loss": 0.0374, + "step": 58980 + }, + { + "epoch": 0.28995, + "grad_norm": 0.08946974575519562, + "learning_rate": 4.2237128934721246e-05, + "loss": 0.0381, + "step": 58990 + }, + { + "epoch": 0.29, + "grad_norm": 0.08195340633392334, + "learning_rate": 4.223413467941113e-05, + "loss": 0.0377, + "step": 59000 + }, + { + "epoch": 0.29005, + "grad_norm": 0.0776698887348175, + "learning_rate": 4.223113995292624e-05, + "loss": 0.0378, + "step": 59010 + }, + { + "epoch": 0.2901, + "grad_norm": 0.07512795925140381, + "learning_rate": 4.2228144755348444e-05, + "loss": 0.039, + "step": 59020 + }, + { + "epoch": 0.29015, + "grad_norm": 0.097700335085392, + "learning_rate": 4.2225149086759664e-05, + "loss": 0.038, + "step": 59030 + }, + { + "epoch": 0.2902, + "grad_norm": 0.10088939964771271, + "learning_rate": 4.222215294724177e-05, + "loss": 0.0388, + "step": 59040 + }, + { + "epoch": 0.29025, + "grad_norm": 0.09535989910364151, + "learning_rate": 4.221915633687668e-05, + "loss": 0.039, + "step": 59050 + }, + { + "epoch": 0.2903, + "grad_norm": 0.10953981429338455, + "learning_rate": 4.221615925574633e-05, + "loss": 0.039, + "step": 59060 + }, + { + "epoch": 0.29035, + "grad_norm": 0.0990564301609993, + "learning_rate": 4.2213161703932644e-05, + "loss": 0.0361, + "step": 59070 + }, + { + "epoch": 0.2904, + "grad_norm": 0.08613920211791992, + "learning_rate": 4.2210163681517603e-05, + "loss": 0.0392, + "step": 59080 + }, + { + "epoch": 0.29045, + "grad_norm": 0.08893074840307236, + "learning_rate": 4.220716518858314e-05, + "loss": 0.0381, + "step": 59090 + }, + { + "epoch": 0.2905, + "grad_norm": 0.08558481186628342, + "learning_rate": 4.2204166225211246e-05, + "loss": 0.0373, + "step": 59100 + }, + { + "epoch": 0.29055, + "grad_norm": 0.09346235543489456, + "learning_rate": 4.2201166791483915e-05, + "loss": 0.0373, + "step": 59110 + }, + { + "epoch": 0.2906, + "grad_norm": 0.09380681067705154, + "learning_rate": 4.219816688748314e-05, + "loss": 0.038, + "step": 59120 + }, + { + "epoch": 0.29065, + "grad_norm": 0.08615744858980179, + "learning_rate": 4.219516651329095e-05, + "loss": 0.0381, + "step": 59130 + }, + { + "epoch": 0.2907, + "grad_norm": 0.08506422489881516, + "learning_rate": 4.2192165668989356e-05, + "loss": 0.037, + "step": 59140 + }, + { + "epoch": 0.29075, + "grad_norm": 0.10388914495706558, + "learning_rate": 4.218916435466042e-05, + "loss": 0.0418, + "step": 59150 + }, + { + "epoch": 0.2908, + "grad_norm": 0.09521077573299408, + "learning_rate": 4.218616257038619e-05, + "loss": 0.038, + "step": 59160 + }, + { + "epoch": 0.29085, + "grad_norm": 0.08011168241500854, + "learning_rate": 4.2183160316248726e-05, + "loss": 0.0383, + "step": 59170 + }, + { + "epoch": 0.2909, + "grad_norm": 0.09949786216020584, + "learning_rate": 4.218015759233012e-05, + "loss": 0.0383, + "step": 59180 + }, + { + "epoch": 0.29095, + "grad_norm": 0.10403574258089066, + "learning_rate": 4.2177154398712456e-05, + "loss": 0.0377, + "step": 59190 + }, + { + "epoch": 0.291, + "grad_norm": 0.10081654787063599, + "learning_rate": 4.2174150735477844e-05, + "loss": 0.0374, + "step": 59200 + }, + { + "epoch": 0.29105, + "grad_norm": 0.090993233025074, + "learning_rate": 4.217114660270841e-05, + "loss": 0.0378, + "step": 59210 + }, + { + "epoch": 0.2911, + "grad_norm": 0.10115541517734528, + "learning_rate": 4.2168142000486267e-05, + "loss": 0.0388, + "step": 59220 + }, + { + "epoch": 0.29115, + "grad_norm": 0.11779844015836716, + "learning_rate": 4.216513692889358e-05, + "loss": 0.0401, + "step": 59230 + }, + { + "epoch": 0.2912, + "grad_norm": 0.12547267973423004, + "learning_rate": 4.216213138801249e-05, + "loss": 0.0375, + "step": 59240 + }, + { + "epoch": 0.29125, + "grad_norm": 0.10295387357473373, + "learning_rate": 4.215912537792519e-05, + "loss": 0.0367, + "step": 59250 + }, + { + "epoch": 0.2913, + "grad_norm": 0.1021818071603775, + "learning_rate": 4.215611889871384e-05, + "loss": 0.0375, + "step": 59260 + }, + { + "epoch": 0.29135, + "grad_norm": 0.10994240641593933, + "learning_rate": 4.215311195046064e-05, + "loss": 0.0384, + "step": 59270 + }, + { + "epoch": 0.2914, + "grad_norm": 0.10220217704772949, + "learning_rate": 4.21501045332478e-05, + "loss": 0.0392, + "step": 59280 + }, + { + "epoch": 0.29145, + "grad_norm": 0.10150808095932007, + "learning_rate": 4.214709664715756e-05, + "loss": 0.0391, + "step": 59290 + }, + { + "epoch": 0.2915, + "grad_norm": 0.1190982237458229, + "learning_rate": 4.214408829227213e-05, + "loss": 0.0392, + "step": 59300 + }, + { + "epoch": 0.29155, + "grad_norm": 0.10309015214443207, + "learning_rate": 4.214107946867377e-05, + "loss": 0.0371, + "step": 59310 + }, + { + "epoch": 0.2916, + "grad_norm": 0.11924099177122116, + "learning_rate": 4.2138070176444736e-05, + "loss": 0.0378, + "step": 59320 + }, + { + "epoch": 0.29165, + "grad_norm": 0.115843266248703, + "learning_rate": 4.21350604156673e-05, + "loss": 0.0366, + "step": 59330 + }, + { + "epoch": 0.2917, + "grad_norm": 0.09781511127948761, + "learning_rate": 4.213205018642375e-05, + "loss": 0.0378, + "step": 59340 + }, + { + "epoch": 0.29175, + "grad_norm": 0.10209403187036514, + "learning_rate": 4.2129039488796384e-05, + "loss": 0.0379, + "step": 59350 + }, + { + "epoch": 0.2918, + "grad_norm": 0.10987348854541779, + "learning_rate": 4.212602832286752e-05, + "loss": 0.0381, + "step": 59360 + }, + { + "epoch": 0.29185, + "grad_norm": 0.09803928434848785, + "learning_rate": 4.212301668871946e-05, + "loss": 0.0364, + "step": 59370 + }, + { + "epoch": 0.2919, + "grad_norm": 0.09594839811325073, + "learning_rate": 4.212000458643457e-05, + "loss": 0.0365, + "step": 59380 + }, + { + "epoch": 0.29195, + "grad_norm": 0.14262939989566803, + "learning_rate": 4.211699201609518e-05, + "loss": 0.0385, + "step": 59390 + }, + { + "epoch": 0.292, + "grad_norm": 0.16029612720012665, + "learning_rate": 4.211397897778366e-05, + "loss": 0.0402, + "step": 59400 + }, + { + "epoch": 0.29205, + "grad_norm": 0.11073275655508041, + "learning_rate": 4.211096547158239e-05, + "loss": 0.0395, + "step": 59410 + }, + { + "epoch": 0.2921, + "grad_norm": 0.11205213516950607, + "learning_rate": 4.210795149757375e-05, + "loss": 0.0384, + "step": 59420 + }, + { + "epoch": 0.29215, + "grad_norm": 0.1112198457121849, + "learning_rate": 4.2104937055840144e-05, + "loss": 0.0411, + "step": 59430 + }, + { + "epoch": 0.2922, + "grad_norm": 0.09706633538007736, + "learning_rate": 4.210192214646398e-05, + "loss": 0.037, + "step": 59440 + }, + { + "epoch": 0.29225, + "grad_norm": 0.08984319865703583, + "learning_rate": 4.209890676952769e-05, + "loss": 0.0378, + "step": 59450 + }, + { + "epoch": 0.2923, + "grad_norm": 0.09882747381925583, + "learning_rate": 4.2095890925113715e-05, + "loss": 0.0377, + "step": 59460 + }, + { + "epoch": 0.29235, + "grad_norm": 0.10908032208681107, + "learning_rate": 4.2092874613304506e-05, + "loss": 0.0362, + "step": 59470 + }, + { + "epoch": 0.2924, + "grad_norm": 0.11629695445299149, + "learning_rate": 4.208985783418252e-05, + "loss": 0.0376, + "step": 59480 + }, + { + "epoch": 0.29245, + "grad_norm": 0.10098670423030853, + "learning_rate": 4.2086840587830255e-05, + "loss": 0.0382, + "step": 59490 + }, + { + "epoch": 0.2925, + "grad_norm": 0.1128614991903305, + "learning_rate": 4.2083822874330175e-05, + "loss": 0.037, + "step": 59500 + }, + { + "epoch": 0.29255, + "grad_norm": 0.09345178306102753, + "learning_rate": 4.2080804693764805e-05, + "loss": 0.0379, + "step": 59510 + }, + { + "epoch": 0.2926, + "grad_norm": 0.09811430424451828, + "learning_rate": 4.207778604621664e-05, + "loss": 0.0366, + "step": 59520 + }, + { + "epoch": 0.29265, + "grad_norm": 0.09692283719778061, + "learning_rate": 4.2074766931768225e-05, + "loss": 0.0376, + "step": 59530 + }, + { + "epoch": 0.2927, + "grad_norm": 0.09886656701564789, + "learning_rate": 4.20717473505021e-05, + "loss": 0.0375, + "step": 59540 + }, + { + "epoch": 0.29275, + "grad_norm": 0.0967152789235115, + "learning_rate": 4.2068727302500815e-05, + "loss": 0.0374, + "step": 59550 + }, + { + "epoch": 0.2928, + "grad_norm": 0.0911996141076088, + "learning_rate": 4.2065706787846936e-05, + "loss": 0.0375, + "step": 59560 + }, + { + "epoch": 0.29285, + "grad_norm": 0.09192916005849838, + "learning_rate": 4.206268580662305e-05, + "loss": 0.0361, + "step": 59570 + }, + { + "epoch": 0.2929, + "grad_norm": 0.07742298394441605, + "learning_rate": 4.2059664358911734e-05, + "loss": 0.0376, + "step": 59580 + }, + { + "epoch": 0.29295, + "grad_norm": 0.09748908132314682, + "learning_rate": 4.2056642444795616e-05, + "loss": 0.0376, + "step": 59590 + }, + { + "epoch": 0.293, + "grad_norm": 0.09673763811588287, + "learning_rate": 4.2053620064357294e-05, + "loss": 0.037, + "step": 59600 + }, + { + "epoch": 0.29305, + "grad_norm": 0.10387150943279266, + "learning_rate": 4.20505972176794e-05, + "loss": 0.0387, + "step": 59610 + }, + { + "epoch": 0.2931, + "grad_norm": 0.08987797051668167, + "learning_rate": 4.204757390484459e-05, + "loss": 0.0393, + "step": 59620 + }, + { + "epoch": 0.29315, + "grad_norm": 0.10657423734664917, + "learning_rate": 4.2044550125935514e-05, + "loss": 0.0384, + "step": 59630 + }, + { + "epoch": 0.2932, + "grad_norm": 0.10422495007514954, + "learning_rate": 4.204152588103485e-05, + "loss": 0.0383, + "step": 59640 + }, + { + "epoch": 0.29325, + "grad_norm": 0.09066640585660934, + "learning_rate": 4.2038501170225254e-05, + "loss": 0.0398, + "step": 59650 + }, + { + "epoch": 0.2933, + "grad_norm": 0.08937875926494598, + "learning_rate": 4.2035475993589447e-05, + "loss": 0.0389, + "step": 59660 + }, + { + "epoch": 0.29335, + "grad_norm": 0.10382039844989777, + "learning_rate": 4.203245035121012e-05, + "loss": 0.041, + "step": 59670 + }, + { + "epoch": 0.2934, + "grad_norm": 0.10572951287031174, + "learning_rate": 4.202942424317001e-05, + "loss": 0.0403, + "step": 59680 + }, + { + "epoch": 0.29345, + "grad_norm": 0.09443110972642899, + "learning_rate": 4.202639766955183e-05, + "loss": 0.0398, + "step": 59690 + }, + { + "epoch": 0.2935, + "grad_norm": 0.10440312325954437, + "learning_rate": 4.202337063043834e-05, + "loss": 0.0377, + "step": 59700 + }, + { + "epoch": 0.29355, + "grad_norm": 0.0963246151804924, + "learning_rate": 4.202034312591229e-05, + "loss": 0.0377, + "step": 59710 + }, + { + "epoch": 0.2936, + "grad_norm": 0.08949219435453415, + "learning_rate": 4.2017315156056445e-05, + "loss": 0.0377, + "step": 59720 + }, + { + "epoch": 0.29365, + "grad_norm": 0.10338666290044785, + "learning_rate": 4.201428672095361e-05, + "loss": 0.0377, + "step": 59730 + }, + { + "epoch": 0.2937, + "grad_norm": 0.08472144603729248, + "learning_rate": 4.2011257820686554e-05, + "loss": 0.0389, + "step": 59740 + }, + { + "epoch": 0.29375, + "grad_norm": 0.0787237361073494, + "learning_rate": 4.200822845533812e-05, + "loss": 0.0378, + "step": 59750 + }, + { + "epoch": 0.2938, + "grad_norm": 0.08679846674203873, + "learning_rate": 4.200519862499109e-05, + "loss": 0.037, + "step": 59760 + }, + { + "epoch": 0.29385, + "grad_norm": 0.09616634249687195, + "learning_rate": 4.2002168329728325e-05, + "loss": 0.0393, + "step": 59770 + }, + { + "epoch": 0.2939, + "grad_norm": 0.11898311227560043, + "learning_rate": 4.199913756963267e-05, + "loss": 0.0384, + "step": 59780 + }, + { + "epoch": 0.29395, + "grad_norm": 0.10533025115728378, + "learning_rate": 4.1996106344786976e-05, + "loss": 0.038, + "step": 59790 + }, + { + "epoch": 0.294, + "grad_norm": 0.09638418257236481, + "learning_rate": 4.1993074655274126e-05, + "loss": 0.0383, + "step": 59800 + }, + { + "epoch": 0.29405, + "grad_norm": 0.10283535718917847, + "learning_rate": 4.1990042501176985e-05, + "loss": 0.0391, + "step": 59810 + }, + { + "epoch": 0.2941, + "grad_norm": 0.10665203630924225, + "learning_rate": 4.1987009882578476e-05, + "loss": 0.038, + "step": 59820 + }, + { + "epoch": 0.29415, + "grad_norm": 0.09289417415857315, + "learning_rate": 4.198397679956149e-05, + "loss": 0.0388, + "step": 59830 + }, + { + "epoch": 0.2942, + "grad_norm": 0.09182269871234894, + "learning_rate": 4.198094325220897e-05, + "loss": 0.0376, + "step": 59840 + }, + { + "epoch": 0.29425, + "grad_norm": 0.10201194882392883, + "learning_rate": 4.197790924060383e-05, + "loss": 0.0376, + "step": 59850 + }, + { + "epoch": 0.2943, + "grad_norm": 0.08951577544212341, + "learning_rate": 4.197487476482903e-05, + "loss": 0.0382, + "step": 59860 + }, + { + "epoch": 0.29435, + "grad_norm": 0.10263777524232864, + "learning_rate": 4.197183982496754e-05, + "loss": 0.0405, + "step": 59870 + }, + { + "epoch": 0.2944, + "grad_norm": 0.12024150788784027, + "learning_rate": 4.196880442110232e-05, + "loss": 0.0408, + "step": 59880 + }, + { + "epoch": 0.29445, + "grad_norm": 0.09682836383581161, + "learning_rate": 4.196576855331637e-05, + "loss": 0.0386, + "step": 59890 + }, + { + "epoch": 0.2945, + "grad_norm": 0.09489523619413376, + "learning_rate": 4.196273222169267e-05, + "loss": 0.0378, + "step": 59900 + }, + { + "epoch": 0.29455, + "grad_norm": 0.09791134297847748, + "learning_rate": 4.195969542631425e-05, + "loss": 0.0384, + "step": 59910 + }, + { + "epoch": 0.2946, + "grad_norm": 0.08972064405679703, + "learning_rate": 4.195665816726412e-05, + "loss": 0.0372, + "step": 59920 + }, + { + "epoch": 0.29465, + "grad_norm": 0.07786723971366882, + "learning_rate": 4.195362044462534e-05, + "loss": 0.0375, + "step": 59930 + }, + { + "epoch": 0.2947, + "grad_norm": 0.08396299183368683, + "learning_rate": 4.195058225848094e-05, + "loss": 0.0373, + "step": 59940 + }, + { + "epoch": 0.29475, + "grad_norm": 0.07986466586589813, + "learning_rate": 4.194754360891398e-05, + "loss": 0.038, + "step": 59950 + }, + { + "epoch": 0.2948, + "grad_norm": 0.08281272649765015, + "learning_rate": 4.1944504496007555e-05, + "loss": 0.0407, + "step": 59960 + }, + { + "epoch": 0.29485, + "grad_norm": 0.09473776817321777, + "learning_rate": 4.194146491984474e-05, + "loss": 0.0369, + "step": 59970 + }, + { + "epoch": 0.2949, + "grad_norm": 0.10042299330234528, + "learning_rate": 4.193842488050864e-05, + "loss": 0.0401, + "step": 59980 + }, + { + "epoch": 0.29495, + "grad_norm": 0.10005058348178864, + "learning_rate": 4.1935384378082366e-05, + "loss": 0.0396, + "step": 59990 + }, + { + "epoch": 0.295, + "grad_norm": 0.08495144546031952, + "learning_rate": 4.193234341264905e-05, + "loss": 0.0392, + "step": 60000 + }, + { + "epoch": 0.29505, + "grad_norm": 0.09953828155994415, + "learning_rate": 4.1929301984291825e-05, + "loss": 0.0396, + "step": 60010 + }, + { + "epoch": 0.2951, + "grad_norm": 0.08857406675815582, + "learning_rate": 4.1926260093093836e-05, + "loss": 0.0375, + "step": 60020 + }, + { + "epoch": 0.29515, + "grad_norm": 0.0942673310637474, + "learning_rate": 4.192321773913826e-05, + "loss": 0.0386, + "step": 60030 + }, + { + "epoch": 0.2952, + "grad_norm": 0.0972483903169632, + "learning_rate": 4.192017492250827e-05, + "loss": 0.0401, + "step": 60040 + }, + { + "epoch": 0.29525, + "grad_norm": 0.09403170645236969, + "learning_rate": 4.1917131643287056e-05, + "loss": 0.0374, + "step": 60050 + }, + { + "epoch": 0.2953, + "grad_norm": 0.12300402671098709, + "learning_rate": 4.191408790155781e-05, + "loss": 0.0395, + "step": 60060 + }, + { + "epoch": 0.29535, + "grad_norm": 0.13763009011745453, + "learning_rate": 4.191104369740376e-05, + "loss": 0.0392, + "step": 60070 + }, + { + "epoch": 0.2954, + "grad_norm": 0.09596195816993713, + "learning_rate": 4.190799903090813e-05, + "loss": 0.0402, + "step": 60080 + }, + { + "epoch": 0.29545, + "grad_norm": 0.10017865896224976, + "learning_rate": 4.1904953902154156e-05, + "loss": 0.0395, + "step": 60090 + }, + { + "epoch": 0.2955, + "grad_norm": 0.10523372888565063, + "learning_rate": 4.1901908311225094e-05, + "loss": 0.0395, + "step": 60100 + }, + { + "epoch": 0.29555, + "grad_norm": 0.1086510494351387, + "learning_rate": 4.189886225820421e-05, + "loss": 0.0386, + "step": 60110 + }, + { + "epoch": 0.2956, + "grad_norm": 0.09998047351837158, + "learning_rate": 4.189581574317478e-05, + "loss": 0.042, + "step": 60120 + }, + { + "epoch": 0.29565, + "grad_norm": 0.09882159531116486, + "learning_rate": 4.1892768766220094e-05, + "loss": 0.0375, + "step": 60130 + }, + { + "epoch": 0.2957, + "grad_norm": 0.12305544316768646, + "learning_rate": 4.1889721327423456e-05, + "loss": 0.0392, + "step": 60140 + }, + { + "epoch": 0.29575, + "grad_norm": 0.1286933273077011, + "learning_rate": 4.188667342686818e-05, + "loss": 0.0385, + "step": 60150 + }, + { + "epoch": 0.2958, + "grad_norm": 0.11229319125413895, + "learning_rate": 4.188362506463761e-05, + "loss": 0.0387, + "step": 60160 + }, + { + "epoch": 0.29585, + "grad_norm": 0.09351322799921036, + "learning_rate": 4.188057624081506e-05, + "loss": 0.038, + "step": 60170 + }, + { + "epoch": 0.2959, + "grad_norm": 0.10067510604858398, + "learning_rate": 4.18775269554839e-05, + "loss": 0.0406, + "step": 60180 + }, + { + "epoch": 0.29595, + "grad_norm": 0.10245190560817719, + "learning_rate": 4.187447720872749e-05, + "loss": 0.0382, + "step": 60190 + }, + { + "epoch": 0.296, + "grad_norm": 0.10589483380317688, + "learning_rate": 4.187142700062922e-05, + "loss": 0.0391, + "step": 60200 + }, + { + "epoch": 0.29605, + "grad_norm": 0.10421047359704971, + "learning_rate": 4.186837633127247e-05, + "loss": 0.0357, + "step": 60210 + }, + { + "epoch": 0.2961, + "grad_norm": 0.09676158428192139, + "learning_rate": 4.1865325200740644e-05, + "loss": 0.0362, + "step": 60220 + }, + { + "epoch": 0.29615, + "grad_norm": 0.1064562052488327, + "learning_rate": 4.1862273609117174e-05, + "loss": 0.0373, + "step": 60230 + }, + { + "epoch": 0.2962, + "grad_norm": 0.09847971051931381, + "learning_rate": 4.185922155648547e-05, + "loss": 0.0378, + "step": 60240 + }, + { + "epoch": 0.29625, + "grad_norm": 0.1274549812078476, + "learning_rate": 4.185616904292898e-05, + "loss": 0.0395, + "step": 60250 + }, + { + "epoch": 0.2963, + "grad_norm": 0.1018499881029129, + "learning_rate": 4.185311606853117e-05, + "loss": 0.0389, + "step": 60260 + }, + { + "epoch": 0.29635, + "grad_norm": 0.11894772946834564, + "learning_rate": 4.18500626333755e-05, + "loss": 0.037, + "step": 60270 + }, + { + "epoch": 0.2964, + "grad_norm": 0.08830223232507706, + "learning_rate": 4.184700873754544e-05, + "loss": 0.0369, + "step": 60280 + }, + { + "epoch": 0.29645, + "grad_norm": 0.10243300348520279, + "learning_rate": 4.184395438112449e-05, + "loss": 0.0363, + "step": 60290 + }, + { + "epoch": 0.2965, + "grad_norm": 0.1040797159075737, + "learning_rate": 4.1840899564196156e-05, + "loss": 0.0416, + "step": 60300 + }, + { + "epoch": 0.29655, + "grad_norm": 0.1132705807685852, + "learning_rate": 4.1837844286843955e-05, + "loss": 0.038, + "step": 60310 + }, + { + "epoch": 0.2966, + "grad_norm": 0.10648635029792786, + "learning_rate": 4.183478854915142e-05, + "loss": 0.0378, + "step": 60320 + }, + { + "epoch": 0.29665, + "grad_norm": 0.12978768348693848, + "learning_rate": 4.183173235120209e-05, + "loss": 0.0382, + "step": 60330 + }, + { + "epoch": 0.2967, + "grad_norm": 0.10505112260580063, + "learning_rate": 4.182867569307952e-05, + "loss": 0.0374, + "step": 60340 + }, + { + "epoch": 0.29675, + "grad_norm": 0.10616233944892883, + "learning_rate": 4.182561857486727e-05, + "loss": 0.037, + "step": 60350 + }, + { + "epoch": 0.2968, + "grad_norm": 0.0917801707983017, + "learning_rate": 4.182256099664894e-05, + "loss": 0.0367, + "step": 60360 + }, + { + "epoch": 0.29685, + "grad_norm": 0.13801418244838715, + "learning_rate": 4.181950295850811e-05, + "loss": 0.0379, + "step": 60370 + }, + { + "epoch": 0.2969, + "grad_norm": 0.10303942114114761, + "learning_rate": 4.1816444460528393e-05, + "loss": 0.0365, + "step": 60380 + }, + { + "epoch": 0.29695, + "grad_norm": 0.10049451142549515, + "learning_rate": 4.181338550279339e-05, + "loss": 0.0375, + "step": 60390 + }, + { + "epoch": 0.297, + "grad_norm": 0.09753605723381042, + "learning_rate": 4.1810326085386755e-05, + "loss": 0.0368, + "step": 60400 + }, + { + "epoch": 0.29705, + "grad_norm": 0.09278786927461624, + "learning_rate": 4.180726620839212e-05, + "loss": 0.0377, + "step": 60410 + }, + { + "epoch": 0.2971, + "grad_norm": 0.10221299529075623, + "learning_rate": 4.180420587189313e-05, + "loss": 0.0377, + "step": 60420 + }, + { + "epoch": 0.29715, + "grad_norm": 0.09834998100996017, + "learning_rate": 4.180114507597347e-05, + "loss": 0.0379, + "step": 60430 + }, + { + "epoch": 0.2972, + "grad_norm": 0.08894462883472443, + "learning_rate": 4.1798083820716815e-05, + "loss": 0.0387, + "step": 60440 + }, + { + "epoch": 0.29725, + "grad_norm": 0.092961885035038, + "learning_rate": 4.179502210620687e-05, + "loss": 0.0371, + "step": 60450 + }, + { + "epoch": 0.2973, + "grad_norm": 0.10489032417535782, + "learning_rate": 4.179195993252731e-05, + "loss": 0.0381, + "step": 60460 + }, + { + "epoch": 0.29735, + "grad_norm": 0.09162574261426926, + "learning_rate": 4.1788897299761884e-05, + "loss": 0.0385, + "step": 60470 + }, + { + "epoch": 0.2974, + "grad_norm": 0.08949447423219681, + "learning_rate": 4.1785834207994316e-05, + "loss": 0.036, + "step": 60480 + }, + { + "epoch": 0.29745, + "grad_norm": 0.0965668112039566, + "learning_rate": 4.178277065730835e-05, + "loss": 0.0395, + "step": 60490 + }, + { + "epoch": 0.2975, + "grad_norm": 0.06762854009866714, + "learning_rate": 4.177970664778773e-05, + "loss": 0.0358, + "step": 60500 + }, + { + "epoch": 0.29755, + "grad_norm": 0.1038472130894661, + "learning_rate": 4.177664217951624e-05, + "loss": 0.0367, + "step": 60510 + }, + { + "epoch": 0.2976, + "grad_norm": 0.08165507763624191, + "learning_rate": 4.1773577252577656e-05, + "loss": 0.038, + "step": 60520 + }, + { + "epoch": 0.29765, + "grad_norm": 0.11089900881052017, + "learning_rate": 4.1770511867055776e-05, + "loss": 0.0395, + "step": 60530 + }, + { + "epoch": 0.2977, + "grad_norm": 0.07820553332567215, + "learning_rate": 4.1767446023034385e-05, + "loss": 0.0364, + "step": 60540 + }, + { + "epoch": 0.29775, + "grad_norm": 0.09922687709331512, + "learning_rate": 4.176437972059733e-05, + "loss": 0.0371, + "step": 60550 + }, + { + "epoch": 0.2978, + "grad_norm": 0.0939578264951706, + "learning_rate": 4.176131295982843e-05, + "loss": 0.0366, + "step": 60560 + }, + { + "epoch": 0.29785, + "grad_norm": 0.09753625839948654, + "learning_rate": 4.175824574081153e-05, + "loss": 0.037, + "step": 60570 + }, + { + "epoch": 0.2979, + "grad_norm": 0.09230932593345642, + "learning_rate": 4.1755178063630493e-05, + "loss": 0.0367, + "step": 60580 + }, + { + "epoch": 0.29795, + "grad_norm": 0.10648012906312943, + "learning_rate": 4.175210992836918e-05, + "loss": 0.0365, + "step": 60590 + }, + { + "epoch": 0.298, + "grad_norm": 0.10602240264415741, + "learning_rate": 4.1749041335111464e-05, + "loss": 0.0365, + "step": 60600 + }, + { + "epoch": 0.29805, + "grad_norm": 0.10287895053625107, + "learning_rate": 4.174597228394126e-05, + "loss": 0.0366, + "step": 60610 + }, + { + "epoch": 0.2981, + "grad_norm": 0.10624495148658752, + "learning_rate": 4.174290277494246e-05, + "loss": 0.0378, + "step": 60620 + }, + { + "epoch": 0.29815, + "grad_norm": 0.09934336692094803, + "learning_rate": 4.1739832808199e-05, + "loss": 0.0362, + "step": 60630 + }, + { + "epoch": 0.2982, + "grad_norm": 0.1049598678946495, + "learning_rate": 4.1736762383794795e-05, + "loss": 0.0386, + "step": 60640 + }, + { + "epoch": 0.29825, + "grad_norm": 0.08692259341478348, + "learning_rate": 4.1733691501813786e-05, + "loss": 0.0376, + "step": 60650 + }, + { + "epoch": 0.2983, + "grad_norm": 0.094350665807724, + "learning_rate": 4.173062016233994e-05, + "loss": 0.0385, + "step": 60660 + }, + { + "epoch": 0.29835, + "grad_norm": 0.09607906639575958, + "learning_rate": 4.172754836545723e-05, + "loss": 0.0373, + "step": 60670 + }, + { + "epoch": 0.2984, + "grad_norm": 0.10610980540513992, + "learning_rate": 4.172447611124963e-05, + "loss": 0.0368, + "step": 60680 + }, + { + "epoch": 0.29845, + "grad_norm": 0.11003467440605164, + "learning_rate": 4.172140339980114e-05, + "loss": 0.0394, + "step": 60690 + }, + { + "epoch": 0.2985, + "grad_norm": 0.11963500082492828, + "learning_rate": 4.171833023119576e-05, + "loss": 0.0419, + "step": 60700 + }, + { + "epoch": 0.29855, + "grad_norm": 0.08699667453765869, + "learning_rate": 4.1715256605517504e-05, + "loss": 0.0375, + "step": 60710 + }, + { + "epoch": 0.2986, + "grad_norm": 0.09019137918949127, + "learning_rate": 4.171218252285042e-05, + "loss": 0.0395, + "step": 60720 + }, + { + "epoch": 0.29865, + "grad_norm": 0.09844812005758286, + "learning_rate": 4.170910798327854e-05, + "loss": 0.039, + "step": 60730 + }, + { + "epoch": 0.2987, + "grad_norm": 0.11068392544984818, + "learning_rate": 4.170603298688593e-05, + "loss": 0.0394, + "step": 60740 + }, + { + "epoch": 0.29875, + "grad_norm": 0.09484200179576874, + "learning_rate": 4.170295753375665e-05, + "loss": 0.0368, + "step": 60750 + }, + { + "epoch": 0.2988, + "grad_norm": 0.09472393989562988, + "learning_rate": 4.169988162397479e-05, + "loss": 0.0372, + "step": 60760 + }, + { + "epoch": 0.29885, + "grad_norm": 0.09127482771873474, + "learning_rate": 4.169680525762444e-05, + "loss": 0.0376, + "step": 60770 + }, + { + "epoch": 0.2989, + "grad_norm": 0.09734153747558594, + "learning_rate": 4.169372843478971e-05, + "loss": 0.0374, + "step": 60780 + }, + { + "epoch": 0.29895, + "grad_norm": 0.11162359267473221, + "learning_rate": 4.1690651155554704e-05, + "loss": 0.037, + "step": 60790 + }, + { + "epoch": 0.299, + "grad_norm": 0.11358215659856796, + "learning_rate": 4.168757342000358e-05, + "loss": 0.0379, + "step": 60800 + }, + { + "epoch": 0.29905, + "grad_norm": 0.10406457632780075, + "learning_rate": 4.1684495228220454e-05, + "loss": 0.0366, + "step": 60810 + }, + { + "epoch": 0.2991, + "grad_norm": 0.1159927248954773, + "learning_rate": 4.168141658028951e-05, + "loss": 0.0413, + "step": 60820 + }, + { + "epoch": 0.29915, + "grad_norm": 0.09065935015678406, + "learning_rate": 4.16783374762949e-05, + "loss": 0.0374, + "step": 60830 + }, + { + "epoch": 0.2992, + "grad_norm": 0.08514414727687836, + "learning_rate": 4.1675257916320804e-05, + "loss": 0.0381, + "step": 60840 + }, + { + "epoch": 0.29925, + "grad_norm": 0.10681270807981491, + "learning_rate": 4.167217790045143e-05, + "loss": 0.0383, + "step": 60850 + }, + { + "epoch": 0.2993, + "grad_norm": 0.09594421088695526, + "learning_rate": 4.166909742877097e-05, + "loss": 0.038, + "step": 60860 + }, + { + "epoch": 0.29935, + "grad_norm": 0.10875742882490158, + "learning_rate": 4.166601650136364e-05, + "loss": 0.0369, + "step": 60870 + }, + { + "epoch": 0.2994, + "grad_norm": 0.1058308482170105, + "learning_rate": 4.166293511831369e-05, + "loss": 0.0374, + "step": 60880 + }, + { + "epoch": 0.29945, + "grad_norm": 0.12089107185602188, + "learning_rate": 4.165985327970535e-05, + "loss": 0.0371, + "step": 60890 + }, + { + "epoch": 0.2995, + "grad_norm": 0.10122127830982208, + "learning_rate": 4.165677098562288e-05, + "loss": 0.0402, + "step": 60900 + }, + { + "epoch": 0.29955, + "grad_norm": 0.10833931714296341, + "learning_rate": 4.1653688236150554e-05, + "loss": 0.0381, + "step": 60910 + }, + { + "epoch": 0.2996, + "grad_norm": 0.10205668210983276, + "learning_rate": 4.165060503137265e-05, + "loss": 0.0386, + "step": 60920 + }, + { + "epoch": 0.29965, + "grad_norm": 0.10592308640480042, + "learning_rate": 4.164752137137345e-05, + "loss": 0.0376, + "step": 60930 + }, + { + "epoch": 0.2997, + "grad_norm": 0.11171428859233856, + "learning_rate": 4.164443725623728e-05, + "loss": 0.0392, + "step": 60940 + }, + { + "epoch": 0.29975, + "grad_norm": 0.11343561112880707, + "learning_rate": 4.164135268604844e-05, + "loss": 0.0382, + "step": 60950 + }, + { + "epoch": 0.2998, + "grad_norm": 0.12067008018493652, + "learning_rate": 4.163826766089127e-05, + "loss": 0.0369, + "step": 60960 + }, + { + "epoch": 0.29985, + "grad_norm": 0.11105100810527802, + "learning_rate": 4.163518218085012e-05, + "loss": 0.0414, + "step": 60970 + }, + { + "epoch": 0.2999, + "grad_norm": 0.11540385335683823, + "learning_rate": 4.1632096246009335e-05, + "loss": 0.0377, + "step": 60980 + }, + { + "epoch": 0.29995, + "grad_norm": 0.08897673338651657, + "learning_rate": 4.1629009856453284e-05, + "loss": 0.039, + "step": 60990 + }, + { + "epoch": 0.3, + "grad_norm": 0.09491023421287537, + "learning_rate": 4.162592301226635e-05, + "loss": 0.038, + "step": 61000 + }, + { + "epoch": 0.30005, + "grad_norm": 0.09792555868625641, + "learning_rate": 4.162283571353293e-05, + "loss": 0.0419, + "step": 61010 + }, + { + "epoch": 0.3001, + "grad_norm": 0.10005024075508118, + "learning_rate": 4.161974796033743e-05, + "loss": 0.0379, + "step": 61020 + }, + { + "epoch": 0.30015, + "grad_norm": 0.09802912175655365, + "learning_rate": 4.161665975276426e-05, + "loss": 0.0375, + "step": 61030 + }, + { + "epoch": 0.3002, + "grad_norm": 0.11515079438686371, + "learning_rate": 4.1613571090897855e-05, + "loss": 0.0381, + "step": 61040 + }, + { + "epoch": 0.30025, + "grad_norm": 0.10093576461076736, + "learning_rate": 4.161048197482266e-05, + "loss": 0.0387, + "step": 61050 + }, + { + "epoch": 0.3003, + "grad_norm": 0.10900209844112396, + "learning_rate": 4.160739240462312e-05, + "loss": 0.0384, + "step": 61060 + }, + { + "epoch": 0.30035, + "grad_norm": 0.08780990540981293, + "learning_rate": 4.160430238038372e-05, + "loss": 0.0382, + "step": 61070 + }, + { + "epoch": 0.3004, + "grad_norm": 0.09186938405036926, + "learning_rate": 4.160121190218893e-05, + "loss": 0.0387, + "step": 61080 + }, + { + "epoch": 0.30045, + "grad_norm": 0.11051493883132935, + "learning_rate": 4.1598120970123245e-05, + "loss": 0.0389, + "step": 61090 + }, + { + "epoch": 0.3005, + "grad_norm": 0.093411386013031, + "learning_rate": 4.159502958427116e-05, + "loss": 0.0368, + "step": 61100 + }, + { + "epoch": 0.30055, + "grad_norm": 0.11050494015216827, + "learning_rate": 4.159193774471721e-05, + "loss": 0.0405, + "step": 61110 + }, + { + "epoch": 0.3006, + "grad_norm": 0.10686563700437546, + "learning_rate": 4.158884545154591e-05, + "loss": 0.0391, + "step": 61120 + }, + { + "epoch": 0.30065, + "grad_norm": 0.09879803657531738, + "learning_rate": 4.158575270484181e-05, + "loss": 0.038, + "step": 61130 + }, + { + "epoch": 0.3007, + "grad_norm": 0.10461480170488358, + "learning_rate": 4.1582659504689456e-05, + "loss": 0.0376, + "step": 61140 + }, + { + "epoch": 0.30075, + "grad_norm": 0.08467607200145721, + "learning_rate": 4.157956585117343e-05, + "loss": 0.0404, + "step": 61150 + }, + { + "epoch": 0.3008, + "grad_norm": 0.0974237322807312, + "learning_rate": 4.15764717443783e-05, + "loss": 0.0382, + "step": 61160 + }, + { + "epoch": 0.30085, + "grad_norm": 0.09864193946123123, + "learning_rate": 4.157337718438865e-05, + "loss": 0.0407, + "step": 61170 + }, + { + "epoch": 0.3009, + "grad_norm": 0.0968838632106781, + "learning_rate": 4.157028217128911e-05, + "loss": 0.0384, + "step": 61180 + }, + { + "epoch": 0.30095, + "grad_norm": 0.10413113236427307, + "learning_rate": 4.1567186705164265e-05, + "loss": 0.0422, + "step": 61190 + }, + { + "epoch": 0.301, + "grad_norm": 0.0874795988202095, + "learning_rate": 4.1564090786098776e-05, + "loss": 0.0374, + "step": 61200 + }, + { + "epoch": 0.30105, + "grad_norm": 0.13343235850334167, + "learning_rate": 4.156099441417726e-05, + "loss": 0.0412, + "step": 61210 + }, + { + "epoch": 0.3011, + "grad_norm": 0.08971703797578812, + "learning_rate": 4.1557897589484376e-05, + "loss": 0.0375, + "step": 61220 + }, + { + "epoch": 0.30115, + "grad_norm": 0.10634627938270569, + "learning_rate": 4.155480031210479e-05, + "loss": 0.0373, + "step": 61230 + }, + { + "epoch": 0.3012, + "grad_norm": 0.08652137964963913, + "learning_rate": 4.1551702582123186e-05, + "loss": 0.0384, + "step": 61240 + }, + { + "epoch": 0.30125, + "grad_norm": 0.09820088744163513, + "learning_rate": 4.154860439962425e-05, + "loss": 0.0382, + "step": 61250 + }, + { + "epoch": 0.3013, + "grad_norm": 0.08968545496463776, + "learning_rate": 4.154550576469269e-05, + "loss": 0.0392, + "step": 61260 + }, + { + "epoch": 0.30135, + "grad_norm": 0.11606442183256149, + "learning_rate": 4.154240667741322e-05, + "loss": 0.0396, + "step": 61270 + }, + { + "epoch": 0.3014, + "grad_norm": 0.09368910640478134, + "learning_rate": 4.1539307137870567e-05, + "loss": 0.0379, + "step": 61280 + }, + { + "epoch": 0.30145, + "grad_norm": 0.09235761314630508, + "learning_rate": 4.1536207146149467e-05, + "loss": 0.0368, + "step": 61290 + }, + { + "epoch": 0.3015, + "grad_norm": 0.11601971834897995, + "learning_rate": 4.153310670233467e-05, + "loss": 0.0398, + "step": 61300 + }, + { + "epoch": 0.30155, + "grad_norm": 0.10703108459711075, + "learning_rate": 4.153000580651095e-05, + "loss": 0.0405, + "step": 61310 + }, + { + "epoch": 0.3016, + "grad_norm": 0.09087284654378891, + "learning_rate": 4.152690445876308e-05, + "loss": 0.0383, + "step": 61320 + }, + { + "epoch": 0.30165, + "grad_norm": 0.09212212264537811, + "learning_rate": 4.152380265917586e-05, + "loss": 0.0393, + "step": 61330 + }, + { + "epoch": 0.3017, + "grad_norm": 0.11663784086704254, + "learning_rate": 4.1520700407834076e-05, + "loss": 0.0426, + "step": 61340 + }, + { + "epoch": 0.30175, + "grad_norm": 0.10482209920883179, + "learning_rate": 4.1517597704822555e-05, + "loss": 0.0385, + "step": 61350 + }, + { + "epoch": 0.3018, + "grad_norm": 0.10460387915372849, + "learning_rate": 4.151449455022611e-05, + "loss": 0.0391, + "step": 61360 + }, + { + "epoch": 0.30185, + "grad_norm": 0.08725716918706894, + "learning_rate": 4.151139094412959e-05, + "loss": 0.0379, + "step": 61370 + }, + { + "epoch": 0.3019, + "grad_norm": 0.08314737677574158, + "learning_rate": 4.150828688661785e-05, + "loss": 0.0384, + "step": 61380 + }, + { + "epoch": 0.30195, + "grad_norm": 0.09015455842018127, + "learning_rate": 4.150518237777575e-05, + "loss": 0.038, + "step": 61390 + }, + { + "epoch": 0.302, + "grad_norm": 0.07633396238088608, + "learning_rate": 4.1502077417688156e-05, + "loss": 0.0405, + "step": 61400 + }, + { + "epoch": 0.30205, + "grad_norm": 0.10376927256584167, + "learning_rate": 4.149897200643997e-05, + "loss": 0.0393, + "step": 61410 + }, + { + "epoch": 0.3021, + "grad_norm": 0.09429501742124557, + "learning_rate": 4.1495866144116094e-05, + "loss": 0.0374, + "step": 61420 + }, + { + "epoch": 0.30215, + "grad_norm": 0.10373689979314804, + "learning_rate": 4.149275983080142e-05, + "loss": 0.0396, + "step": 61430 + }, + { + "epoch": 0.3022, + "grad_norm": 0.08532211929559708, + "learning_rate": 4.148965306658089e-05, + "loss": 0.0394, + "step": 61440 + }, + { + "epoch": 0.30225, + "grad_norm": 0.11602317541837692, + "learning_rate": 4.148654585153945e-05, + "loss": 0.0383, + "step": 61450 + }, + { + "epoch": 0.3023, + "grad_norm": 0.08118918538093567, + "learning_rate": 4.148343818576204e-05, + "loss": 0.0385, + "step": 61460 + }, + { + "epoch": 0.30235, + "grad_norm": 0.10779253393411636, + "learning_rate": 4.1480330069333616e-05, + "loss": 0.037, + "step": 61470 + }, + { + "epoch": 0.3024, + "grad_norm": 0.08785419166088104, + "learning_rate": 4.147722150233916e-05, + "loss": 0.041, + "step": 61480 + }, + { + "epoch": 0.30245, + "grad_norm": 0.09721635282039642, + "learning_rate": 4.147411248486366e-05, + "loss": 0.0402, + "step": 61490 + }, + { + "epoch": 0.3025, + "grad_norm": 0.10442981123924255, + "learning_rate": 4.1471003016992116e-05, + "loss": 0.0395, + "step": 61500 + }, + { + "epoch": 0.30255, + "grad_norm": 0.0965740829706192, + "learning_rate": 4.146789309880953e-05, + "loss": 0.0387, + "step": 61510 + }, + { + "epoch": 0.3026, + "grad_norm": 0.11614526808261871, + "learning_rate": 4.146478273040094e-05, + "loss": 0.0374, + "step": 61520 + }, + { + "epoch": 0.30265, + "grad_norm": 0.12318149209022522, + "learning_rate": 4.1461671911851375e-05, + "loss": 0.0378, + "step": 61530 + }, + { + "epoch": 0.3027, + "grad_norm": 0.1019563376903534, + "learning_rate": 4.145856064324589e-05, + "loss": 0.0368, + "step": 61540 + }, + { + "epoch": 0.30275, + "grad_norm": 0.0958062931895256, + "learning_rate": 4.145544892466953e-05, + "loss": 0.0365, + "step": 61550 + }, + { + "epoch": 0.3028, + "grad_norm": 0.11094732582569122, + "learning_rate": 4.1452336756207374e-05, + "loss": 0.038, + "step": 61560 + }, + { + "epoch": 0.30285, + "grad_norm": 0.10732755064964294, + "learning_rate": 4.144922413794453e-05, + "loss": 0.036, + "step": 61570 + }, + { + "epoch": 0.3029, + "grad_norm": 0.09656397253274918, + "learning_rate": 4.1446111069966066e-05, + "loss": 0.0374, + "step": 61580 + }, + { + "epoch": 0.30295, + "grad_norm": 0.09352206438779831, + "learning_rate": 4.1442997552357105e-05, + "loss": 0.0373, + "step": 61590 + }, + { + "epoch": 0.303, + "grad_norm": 0.09997440129518509, + "learning_rate": 4.143988358520277e-05, + "loss": 0.0382, + "step": 61600 + }, + { + "epoch": 0.30305, + "grad_norm": 0.11431118845939636, + "learning_rate": 4.14367691685882e-05, + "loss": 0.0411, + "step": 61610 + }, + { + "epoch": 0.3031, + "grad_norm": 0.1606724113225937, + "learning_rate": 4.143365430259852e-05, + "loss": 0.0388, + "step": 61620 + }, + { + "epoch": 0.30315, + "grad_norm": 0.14557507634162903, + "learning_rate": 4.143053898731891e-05, + "loss": 0.0372, + "step": 61630 + }, + { + "epoch": 0.3032, + "grad_norm": 0.11492512375116348, + "learning_rate": 4.1427423222834547e-05, + "loss": 0.0369, + "step": 61640 + }, + { + "epoch": 0.30325, + "grad_norm": 0.08114670217037201, + "learning_rate": 4.1424307009230594e-05, + "loss": 0.0369, + "step": 61650 + }, + { + "epoch": 0.3033, + "grad_norm": 0.09293445199728012, + "learning_rate": 4.1421190346592263e-05, + "loss": 0.0359, + "step": 61660 + }, + { + "epoch": 0.30335, + "grad_norm": 0.09854632616043091, + "learning_rate": 4.141807323500476e-05, + "loss": 0.0379, + "step": 61670 + }, + { + "epoch": 0.3034, + "grad_norm": 0.10344306379556656, + "learning_rate": 4.141495567455329e-05, + "loss": 0.0372, + "step": 61680 + }, + { + "epoch": 0.30345, + "grad_norm": 0.10848110914230347, + "learning_rate": 4.141183766532312e-05, + "loss": 0.0376, + "step": 61690 + }, + { + "epoch": 0.3035, + "grad_norm": 0.10579539835453033, + "learning_rate": 4.1408719207399453e-05, + "loss": 0.0391, + "step": 61700 + }, + { + "epoch": 0.30355, + "grad_norm": 0.09526227414608002, + "learning_rate": 4.1405600300867575e-05, + "loss": 0.0376, + "step": 61710 + }, + { + "epoch": 0.3036, + "grad_norm": 0.0759005919098854, + "learning_rate": 4.140248094581275e-05, + "loss": 0.0374, + "step": 61720 + }, + { + "epoch": 0.30365, + "grad_norm": 0.08883017301559448, + "learning_rate": 4.139936114232026e-05, + "loss": 0.0369, + "step": 61730 + }, + { + "epoch": 0.3037, + "grad_norm": 0.09480241686105728, + "learning_rate": 4.139624089047539e-05, + "loss": 0.0386, + "step": 61740 + }, + { + "epoch": 0.30375, + "grad_norm": 0.09421525150537491, + "learning_rate": 4.139312019036346e-05, + "loss": 0.0379, + "step": 61750 + }, + { + "epoch": 0.3038, + "grad_norm": 0.09258712083101273, + "learning_rate": 4.138999904206978e-05, + "loss": 0.0383, + "step": 61760 + }, + { + "epoch": 0.30385, + "grad_norm": 0.08552666008472443, + "learning_rate": 4.1386877445679686e-05, + "loss": 0.0371, + "step": 61770 + }, + { + "epoch": 0.3039, + "grad_norm": 0.08578373491764069, + "learning_rate": 4.138375540127852e-05, + "loss": 0.0396, + "step": 61780 + }, + { + "epoch": 0.30395, + "grad_norm": 0.10112480074167252, + "learning_rate": 4.1380632908951634e-05, + "loss": 0.0409, + "step": 61790 + }, + { + "epoch": 0.304, + "grad_norm": 0.11652129888534546, + "learning_rate": 4.137750996878439e-05, + "loss": 0.0403, + "step": 61800 + }, + { + "epoch": 0.30405, + "grad_norm": 0.11524508148431778, + "learning_rate": 4.137438658086219e-05, + "loss": 0.0384, + "step": 61810 + }, + { + "epoch": 0.3041, + "grad_norm": 0.10093747079372406, + "learning_rate": 4.13712627452704e-05, + "loss": 0.0375, + "step": 61820 + }, + { + "epoch": 0.30415, + "grad_norm": 0.09673056751489639, + "learning_rate": 4.1368138462094445e-05, + "loss": 0.0381, + "step": 61830 + }, + { + "epoch": 0.3042, + "grad_norm": 0.0936247855424881, + "learning_rate": 4.136501373141973e-05, + "loss": 0.0382, + "step": 61840 + }, + { + "epoch": 0.30425, + "grad_norm": 0.11220092326402664, + "learning_rate": 4.1361888553331695e-05, + "loss": 0.0375, + "step": 61850 + }, + { + "epoch": 0.3043, + "grad_norm": 0.1188722476363182, + "learning_rate": 4.1358762927915775e-05, + "loss": 0.0386, + "step": 61860 + }, + { + "epoch": 0.30435, + "grad_norm": 0.12074930220842361, + "learning_rate": 4.1355636855257406e-05, + "loss": 0.0431, + "step": 61870 + }, + { + "epoch": 0.3044, + "grad_norm": 0.10390708595514297, + "learning_rate": 4.1352510335442084e-05, + "loss": 0.042, + "step": 61880 + }, + { + "epoch": 0.30445, + "grad_norm": 0.09604662656784058, + "learning_rate": 4.1349383368555265e-05, + "loss": 0.039, + "step": 61890 + }, + { + "epoch": 0.3045, + "grad_norm": 0.09382264316082001, + "learning_rate": 4.134625595468246e-05, + "loss": 0.0359, + "step": 61900 + }, + { + "epoch": 0.30455, + "grad_norm": 0.1026986688375473, + "learning_rate": 4.1343128093909144e-05, + "loss": 0.0435, + "step": 61910 + }, + { + "epoch": 0.3046, + "grad_norm": 0.10203906893730164, + "learning_rate": 4.133999978632085e-05, + "loss": 0.0384, + "step": 61920 + }, + { + "epoch": 0.30465, + "grad_norm": 0.1025913804769516, + "learning_rate": 4.13368710320031e-05, + "loss": 0.0384, + "step": 61930 + }, + { + "epoch": 0.3047, + "grad_norm": 0.07931026816368103, + "learning_rate": 4.1333741831041425e-05, + "loss": 0.0374, + "step": 61940 + }, + { + "epoch": 0.30475, + "grad_norm": 0.11639677733182907, + "learning_rate": 4.13306121835214e-05, + "loss": 0.038, + "step": 61950 + }, + { + "epoch": 0.3048, + "grad_norm": 0.10649111866950989, + "learning_rate": 4.132748208952857e-05, + "loss": 0.0375, + "step": 61960 + }, + { + "epoch": 0.30485, + "grad_norm": 0.09318245202302933, + "learning_rate": 4.132435154914851e-05, + "loss": 0.037, + "step": 61970 + }, + { + "epoch": 0.3049, + "grad_norm": 0.09532459825277328, + "learning_rate": 4.132122056246681e-05, + "loss": 0.0373, + "step": 61980 + }, + { + "epoch": 0.30495, + "grad_norm": 0.09357064962387085, + "learning_rate": 4.131808912956907e-05, + "loss": 0.0371, + "step": 61990 + }, + { + "epoch": 0.305, + "grad_norm": 0.10159306228160858, + "learning_rate": 4.131495725054091e-05, + "loss": 0.0362, + "step": 62000 + }, + { + "epoch": 0.30505, + "grad_norm": 0.10437101870775223, + "learning_rate": 4.1311824925467946e-05, + "loss": 0.0372, + "step": 62010 + }, + { + "epoch": 0.3051, + "grad_norm": 0.0874381735920906, + "learning_rate": 4.1308692154435815e-05, + "loss": 0.0378, + "step": 62020 + }, + { + "epoch": 0.30515, + "grad_norm": 0.09519375115633011, + "learning_rate": 4.130555893753016e-05, + "loss": 0.0362, + "step": 62030 + }, + { + "epoch": 0.3052, + "grad_norm": 0.10409439355134964, + "learning_rate": 4.1302425274836666e-05, + "loss": 0.037, + "step": 62040 + }, + { + "epoch": 0.30525, + "grad_norm": 0.11408756673336029, + "learning_rate": 4.129929116644098e-05, + "loss": 0.0373, + "step": 62050 + }, + { + "epoch": 0.3053, + "grad_norm": 0.10046551376581192, + "learning_rate": 4.1296156612428794e-05, + "loss": 0.0367, + "step": 62060 + }, + { + "epoch": 0.30535, + "grad_norm": 0.10562136769294739, + "learning_rate": 4.129302161288582e-05, + "loss": 0.038, + "step": 62070 + }, + { + "epoch": 0.3054, + "grad_norm": 0.11253020167350769, + "learning_rate": 4.128988616789774e-05, + "loss": 0.0402, + "step": 62080 + }, + { + "epoch": 0.30545, + "grad_norm": 0.10092043876647949, + "learning_rate": 4.1286750277550304e-05, + "loss": 0.0367, + "step": 62090 + }, + { + "epoch": 0.3055, + "grad_norm": 0.09704675525426865, + "learning_rate": 4.1283613941929234e-05, + "loss": 0.037, + "step": 62100 + }, + { + "epoch": 0.30555, + "grad_norm": 0.09652874618768692, + "learning_rate": 4.1280477161120265e-05, + "loss": 0.0372, + "step": 62110 + }, + { + "epoch": 0.3056, + "grad_norm": 0.11451993137598038, + "learning_rate": 4.127733993520918e-05, + "loss": 0.037, + "step": 62120 + }, + { + "epoch": 0.30565, + "grad_norm": 0.1000591367483139, + "learning_rate": 4.1274202264281724e-05, + "loss": 0.0378, + "step": 62130 + }, + { + "epoch": 0.3057, + "grad_norm": 0.11066377907991409, + "learning_rate": 4.12710641484237e-05, + "loss": 0.0391, + "step": 62140 + }, + { + "epoch": 0.30575, + "grad_norm": 0.08408109843730927, + "learning_rate": 4.12679255877209e-05, + "loss": 0.0366, + "step": 62150 + }, + { + "epoch": 0.3058, + "grad_norm": 0.08459489792585373, + "learning_rate": 4.126478658225912e-05, + "loss": 0.0352, + "step": 62160 + }, + { + "epoch": 0.30585, + "grad_norm": 0.08361006528139114, + "learning_rate": 4.1261647132124184e-05, + "loss": 0.0361, + "step": 62170 + }, + { + "epoch": 0.3059, + "grad_norm": 0.1002361848950386, + "learning_rate": 4.125850723740192e-05, + "loss": 0.0369, + "step": 62180 + }, + { + "epoch": 0.30595, + "grad_norm": 0.12000847607851028, + "learning_rate": 4.1255366898178184e-05, + "loss": 0.0377, + "step": 62190 + }, + { + "epoch": 0.306, + "grad_norm": 0.10912597924470901, + "learning_rate": 4.125222611453882e-05, + "loss": 0.0377, + "step": 62200 + }, + { + "epoch": 0.30605, + "grad_norm": 0.09594231098890305, + "learning_rate": 4.12490848865697e-05, + "loss": 0.0364, + "step": 62210 + }, + { + "epoch": 0.3061, + "grad_norm": 0.09830238670110703, + "learning_rate": 4.1245943214356705e-05, + "loss": 0.0376, + "step": 62220 + }, + { + "epoch": 0.30615, + "grad_norm": 0.1080409362912178, + "learning_rate": 4.124280109798573e-05, + "loss": 0.0362, + "step": 62230 + }, + { + "epoch": 0.3062, + "grad_norm": 0.10195433348417282, + "learning_rate": 4.123965853754267e-05, + "loss": 0.0357, + "step": 62240 + }, + { + "epoch": 0.30625, + "grad_norm": 0.113552026450634, + "learning_rate": 4.123651553311345e-05, + "loss": 0.0379, + "step": 62250 + }, + { + "epoch": 0.3063, + "grad_norm": 0.10659865289926529, + "learning_rate": 4.123337208478399e-05, + "loss": 0.0385, + "step": 62260 + }, + { + "epoch": 0.30635, + "grad_norm": 0.09573084115982056, + "learning_rate": 4.1230228192640236e-05, + "loss": 0.0377, + "step": 62270 + }, + { + "epoch": 0.3064, + "grad_norm": 0.09120679646730423, + "learning_rate": 4.1227083856768145e-05, + "loss": 0.039, + "step": 62280 + }, + { + "epoch": 0.30645, + "grad_norm": 0.09591097384691238, + "learning_rate": 4.122393907725368e-05, + "loss": 0.0365, + "step": 62290 + }, + { + "epoch": 0.3065, + "grad_norm": 0.09726311266422272, + "learning_rate": 4.1220793854182804e-05, + "loss": 0.0389, + "step": 62300 + }, + { + "epoch": 0.30655, + "grad_norm": 0.10283922404050827, + "learning_rate": 4.121764818764153e-05, + "loss": 0.0378, + "step": 62310 + }, + { + "epoch": 0.3066, + "grad_norm": 0.09351787716150284, + "learning_rate": 4.121450207771584e-05, + "loss": 0.0374, + "step": 62320 + }, + { + "epoch": 0.30665, + "grad_norm": 0.09554977715015411, + "learning_rate": 4.121135552449176e-05, + "loss": 0.0371, + "step": 62330 + }, + { + "epoch": 0.3067, + "grad_norm": 0.09455087035894394, + "learning_rate": 4.1208208528055306e-05, + "loss": 0.0378, + "step": 62340 + }, + { + "epoch": 0.30675, + "grad_norm": 0.11537139862775803, + "learning_rate": 4.1205061088492517e-05, + "loss": 0.0404, + "step": 62350 + }, + { + "epoch": 0.3068, + "grad_norm": 0.11361251771450043, + "learning_rate": 4.1201913205889455e-05, + "loss": 0.0385, + "step": 62360 + }, + { + "epoch": 0.30685, + "grad_norm": 0.0990777239203453, + "learning_rate": 4.119876488033216e-05, + "loss": 0.0378, + "step": 62370 + }, + { + "epoch": 0.3069, + "grad_norm": 0.09623073786497116, + "learning_rate": 4.119561611190673e-05, + "loss": 0.0378, + "step": 62380 + }, + { + "epoch": 0.30695, + "grad_norm": 0.11010082811117172, + "learning_rate": 4.1192466900699236e-05, + "loss": 0.0388, + "step": 62390 + }, + { + "epoch": 0.307, + "grad_norm": 0.09047936648130417, + "learning_rate": 4.1189317246795784e-05, + "loss": 0.0384, + "step": 62400 + }, + { + "epoch": 0.30705, + "grad_norm": 0.11480408161878586, + "learning_rate": 4.1186167150282475e-05, + "loss": 0.0399, + "step": 62410 + }, + { + "epoch": 0.3071, + "grad_norm": 0.11480441689491272, + "learning_rate": 4.118301661124544e-05, + "loss": 0.0375, + "step": 62420 + }, + { + "epoch": 0.30715, + "grad_norm": 0.10686799138784409, + "learning_rate": 4.117986562977081e-05, + "loss": 0.0386, + "step": 62430 + }, + { + "epoch": 0.3072, + "grad_norm": 0.09445648640394211, + "learning_rate": 4.117671420594473e-05, + "loss": 0.0382, + "step": 62440 + }, + { + "epoch": 0.30725, + "grad_norm": 0.11411401629447937, + "learning_rate": 4.117356233985337e-05, + "loss": 0.0404, + "step": 62450 + }, + { + "epoch": 0.3073, + "grad_norm": 0.09053072333335876, + "learning_rate": 4.117041003158288e-05, + "loss": 0.0385, + "step": 62460 + }, + { + "epoch": 0.30735, + "grad_norm": 0.07754967361688614, + "learning_rate": 4.1167257281219455e-05, + "loss": 0.0383, + "step": 62470 + }, + { + "epoch": 0.3074, + "grad_norm": 0.08269201964139938, + "learning_rate": 4.1164104088849296e-05, + "loss": 0.0392, + "step": 62480 + }, + { + "epoch": 0.30745, + "grad_norm": 0.10286622494459152, + "learning_rate": 4.11609504545586e-05, + "loss": 0.0402, + "step": 62490 + }, + { + "epoch": 0.3075, + "grad_norm": 0.09792700409889221, + "learning_rate": 4.1157796378433596e-05, + "loss": 0.0385, + "step": 62500 + }, + { + "epoch": 0.30755, + "grad_norm": 0.08387406915426254, + "learning_rate": 4.11546418605605e-05, + "loss": 0.0359, + "step": 62510 + }, + { + "epoch": 0.3076, + "grad_norm": 0.08836230635643005, + "learning_rate": 4.1151486901025574e-05, + "loss": 0.0397, + "step": 62520 + }, + { + "epoch": 0.30765, + "grad_norm": 0.09530341625213623, + "learning_rate": 4.1148331499915056e-05, + "loss": 0.0373, + "step": 62530 + }, + { + "epoch": 0.3077, + "grad_norm": 0.09608582407236099, + "learning_rate": 4.1145175657315225e-05, + "loss": 0.0372, + "step": 62540 + }, + { + "epoch": 0.30775, + "grad_norm": 0.09119854867458344, + "learning_rate": 4.1142019373312355e-05, + "loss": 0.0402, + "step": 62550 + }, + { + "epoch": 0.3078, + "grad_norm": 0.10277434438467026, + "learning_rate": 4.113886264799275e-05, + "loss": 0.0365, + "step": 62560 + }, + { + "epoch": 0.30785, + "grad_norm": 0.10991762578487396, + "learning_rate": 4.1135705481442685e-05, + "loss": 0.0375, + "step": 62570 + }, + { + "epoch": 0.3079, + "grad_norm": 0.13433682918548584, + "learning_rate": 4.1132547873748503e-05, + "loss": 0.0379, + "step": 62580 + }, + { + "epoch": 0.30795, + "grad_norm": 0.1160324215888977, + "learning_rate": 4.112938982499652e-05, + "loss": 0.0366, + "step": 62590 + }, + { + "epoch": 0.308, + "grad_norm": 0.09958013147115707, + "learning_rate": 4.1126231335273094e-05, + "loss": 0.039, + "step": 62600 + }, + { + "epoch": 0.30805, + "grad_norm": 0.10945141315460205, + "learning_rate": 4.1123072404664545e-05, + "loss": 0.0366, + "step": 62610 + }, + { + "epoch": 0.3081, + "grad_norm": 0.08697067946195602, + "learning_rate": 4.111991303325726e-05, + "loss": 0.0374, + "step": 62620 + }, + { + "epoch": 0.30815, + "grad_norm": 0.0961507260799408, + "learning_rate": 4.1116753221137606e-05, + "loss": 0.0364, + "step": 62630 + }, + { + "epoch": 0.3082, + "grad_norm": 0.09394659847021103, + "learning_rate": 4.1113592968391976e-05, + "loss": 0.037, + "step": 62640 + }, + { + "epoch": 0.30825, + "grad_norm": 0.09779747575521469, + "learning_rate": 4.1110432275106767e-05, + "loss": 0.0365, + "step": 62650 + }, + { + "epoch": 0.3083, + "grad_norm": 0.08492156118154526, + "learning_rate": 4.110727114136839e-05, + "loss": 0.0372, + "step": 62660 + }, + { + "epoch": 0.30835, + "grad_norm": 0.1093636080622673, + "learning_rate": 4.1104109567263274e-05, + "loss": 0.0377, + "step": 62670 + }, + { + "epoch": 0.3084, + "grad_norm": 0.12288608402013779, + "learning_rate": 4.110094755287785e-05, + "loss": 0.0367, + "step": 62680 + }, + { + "epoch": 0.30845, + "grad_norm": 0.10244695097208023, + "learning_rate": 4.109778509829857e-05, + "loss": 0.04, + "step": 62690 + }, + { + "epoch": 0.3085, + "grad_norm": 0.10393958538770676, + "learning_rate": 4.109462220361189e-05, + "loss": 0.0397, + "step": 62700 + }, + { + "epoch": 0.30855, + "grad_norm": 0.11706296354532242, + "learning_rate": 4.109145886890429e-05, + "loss": 0.0397, + "step": 62710 + }, + { + "epoch": 0.3086, + "grad_norm": 0.11853691935539246, + "learning_rate": 4.1088295094262255e-05, + "loss": 0.0395, + "step": 62720 + }, + { + "epoch": 0.30865, + "grad_norm": 0.12326949834823608, + "learning_rate": 4.108513087977227e-05, + "loss": 0.0379, + "step": 62730 + }, + { + "epoch": 0.3087, + "grad_norm": 0.10332608968019485, + "learning_rate": 4.1081966225520846e-05, + "loss": 0.037, + "step": 62740 + }, + { + "epoch": 0.30875, + "grad_norm": 0.11756819486618042, + "learning_rate": 4.107880113159451e-05, + "loss": 0.0384, + "step": 62750 + }, + { + "epoch": 0.3088, + "grad_norm": 0.11053383350372314, + "learning_rate": 4.107563559807979e-05, + "loss": 0.0359, + "step": 62760 + }, + { + "epoch": 0.30885, + "grad_norm": 0.09963206201791763, + "learning_rate": 4.107246962506324e-05, + "loss": 0.0369, + "step": 62770 + }, + { + "epoch": 0.3089, + "grad_norm": 0.09865821152925491, + "learning_rate": 4.106930321263139e-05, + "loss": 0.0376, + "step": 62780 + }, + { + "epoch": 0.30895, + "grad_norm": 0.10817153006792068, + "learning_rate": 4.106613636087085e-05, + "loss": 0.0368, + "step": 62790 + }, + { + "epoch": 0.309, + "grad_norm": 0.10829388350248337, + "learning_rate": 4.106296906986816e-05, + "loss": 0.0376, + "step": 62800 + }, + { + "epoch": 0.30905, + "grad_norm": 0.10261911153793335, + "learning_rate": 4.105980133970995e-05, + "loss": 0.0362, + "step": 62810 + }, + { + "epoch": 0.3091, + "grad_norm": 0.09937945753335953, + "learning_rate": 4.105663317048278e-05, + "loss": 0.0363, + "step": 62820 + }, + { + "epoch": 0.30915, + "grad_norm": 0.10094765573740005, + "learning_rate": 4.10534645622733e-05, + "loss": 0.0377, + "step": 62830 + }, + { + "epoch": 0.3092, + "grad_norm": 0.11168407648801804, + "learning_rate": 4.1050295515168144e-05, + "loss": 0.0373, + "step": 62840 + }, + { + "epoch": 0.30925, + "grad_norm": 0.09110624343156815, + "learning_rate": 4.104712602925392e-05, + "loss": 0.0375, + "step": 62850 + }, + { + "epoch": 0.3093, + "grad_norm": 0.08740352094173431, + "learning_rate": 4.104395610461731e-05, + "loss": 0.0415, + "step": 62860 + }, + { + "epoch": 0.30935, + "grad_norm": 0.08553145825862885, + "learning_rate": 4.104078574134497e-05, + "loss": 0.0381, + "step": 62870 + }, + { + "epoch": 0.3094, + "grad_norm": 0.09509292244911194, + "learning_rate": 4.103761493952357e-05, + "loss": 0.0362, + "step": 62880 + }, + { + "epoch": 0.30945, + "grad_norm": 0.09452133625745773, + "learning_rate": 4.10344436992398e-05, + "loss": 0.0381, + "step": 62890 + }, + { + "epoch": 0.3095, + "grad_norm": 0.10518445074558258, + "learning_rate": 4.103127202058036e-05, + "loss": 0.0384, + "step": 62900 + }, + { + "epoch": 0.30955, + "grad_norm": 0.11385580897331238, + "learning_rate": 4.102809990363197e-05, + "loss": 0.039, + "step": 62910 + }, + { + "epoch": 0.3096, + "grad_norm": 0.09979929029941559, + "learning_rate": 4.102492734848136e-05, + "loss": 0.0381, + "step": 62920 + }, + { + "epoch": 0.30965, + "grad_norm": 0.08985389769077301, + "learning_rate": 4.1021754355215235e-05, + "loss": 0.0361, + "step": 62930 + }, + { + "epoch": 0.3097, + "grad_norm": 0.0899069532752037, + "learning_rate": 4.101858092392038e-05, + "loss": 0.0379, + "step": 62940 + }, + { + "epoch": 0.30975, + "grad_norm": 0.09491459280252457, + "learning_rate": 4.101540705468354e-05, + "loss": 0.0396, + "step": 62950 + }, + { + "epoch": 0.3098, + "grad_norm": 0.10776514559984207, + "learning_rate": 4.1012232747591484e-05, + "loss": 0.0375, + "step": 62960 + }, + { + "epoch": 0.30985, + "grad_norm": 0.09491831809282303, + "learning_rate": 4.1009058002730995e-05, + "loss": 0.0422, + "step": 62970 + }, + { + "epoch": 0.3099, + "grad_norm": 0.11386746913194656, + "learning_rate": 4.1005882820188885e-05, + "loss": 0.0389, + "step": 62980 + }, + { + "epoch": 0.30995, + "grad_norm": 0.10596064478158951, + "learning_rate": 4.100270720005195e-05, + "loss": 0.0375, + "step": 62990 + }, + { + "epoch": 0.31, + "grad_norm": 0.1110059916973114, + "learning_rate": 4.099953114240701e-05, + "loss": 0.0362, + "step": 63000 + }, + { + "epoch": 0.31005, + "grad_norm": 0.1058363988995552, + "learning_rate": 4.09963546473409e-05, + "loss": 0.0379, + "step": 63010 + }, + { + "epoch": 0.3101, + "grad_norm": 0.10297834128141403, + "learning_rate": 4.099317771494046e-05, + "loss": 0.0382, + "step": 63020 + }, + { + "epoch": 0.31015, + "grad_norm": 0.10393298417329788, + "learning_rate": 4.0990000345292546e-05, + "loss": 0.0367, + "step": 63030 + }, + { + "epoch": 0.3102, + "grad_norm": 0.11371879279613495, + "learning_rate": 4.098682253848404e-05, + "loss": 0.0381, + "step": 63040 + }, + { + "epoch": 0.31025, + "grad_norm": 0.11789979785680771, + "learning_rate": 4.0983644294601805e-05, + "loss": 0.0376, + "step": 63050 + }, + { + "epoch": 0.3103, + "grad_norm": 0.10600556433200836, + "learning_rate": 4.098046561373274e-05, + "loss": 0.0383, + "step": 63060 + }, + { + "epoch": 0.31035, + "grad_norm": 0.11378031224012375, + "learning_rate": 4.097728649596376e-05, + "loss": 0.0378, + "step": 63070 + }, + { + "epoch": 0.3104, + "grad_norm": 0.1041090115904808, + "learning_rate": 4.097410694138175e-05, + "loss": 0.0379, + "step": 63080 + }, + { + "epoch": 0.31045, + "grad_norm": 0.11633723229169846, + "learning_rate": 4.0970926950073674e-05, + "loss": 0.0374, + "step": 63090 + }, + { + "epoch": 0.3105, + "grad_norm": 0.10892637819051743, + "learning_rate": 4.096774652212645e-05, + "loss": 0.037, + "step": 63100 + }, + { + "epoch": 0.31055, + "grad_norm": 0.09778828173875809, + "learning_rate": 4.096456565762703e-05, + "loss": 0.0367, + "step": 63110 + }, + { + "epoch": 0.3106, + "grad_norm": 0.10582304745912552, + "learning_rate": 4.096138435666239e-05, + "loss": 0.0375, + "step": 63120 + }, + { + "epoch": 0.31065, + "grad_norm": 0.09552489221096039, + "learning_rate": 4.09582026193195e-05, + "loss": 0.0382, + "step": 63130 + }, + { + "epoch": 0.3107, + "grad_norm": 0.11684516072273254, + "learning_rate": 4.095502044568533e-05, + "loss": 0.0391, + "step": 63140 + }, + { + "epoch": 0.31075, + "grad_norm": 0.09883775562047958, + "learning_rate": 4.0951837835846906e-05, + "loss": 0.0382, + "step": 63150 + }, + { + "epoch": 0.3108, + "grad_norm": 0.08349844813346863, + "learning_rate": 4.0948654789891235e-05, + "loss": 0.0394, + "step": 63160 + }, + { + "epoch": 0.31085, + "grad_norm": 0.1069011241197586, + "learning_rate": 4.094547130790532e-05, + "loss": 0.0392, + "step": 63170 + }, + { + "epoch": 0.3109, + "grad_norm": 0.11482026427984238, + "learning_rate": 4.094228738997622e-05, + "loss": 0.038, + "step": 63180 + }, + { + "epoch": 0.31095, + "grad_norm": 0.08558998256921768, + "learning_rate": 4.093910303619097e-05, + "loss": 0.0377, + "step": 63190 + }, + { + "epoch": 0.311, + "grad_norm": 0.0929851084947586, + "learning_rate": 4.0935918246636626e-05, + "loss": 0.0394, + "step": 63200 + }, + { + "epoch": 0.31105, + "grad_norm": 0.09259190410375595, + "learning_rate": 4.0932733021400266e-05, + "loss": 0.0368, + "step": 63210 + }, + { + "epoch": 0.3111, + "grad_norm": 0.11861695349216461, + "learning_rate": 4.092954736056897e-05, + "loss": 0.0415, + "step": 63220 + }, + { + "epoch": 0.31115, + "grad_norm": 0.09823275357484818, + "learning_rate": 4.092636126422984e-05, + "loss": 0.0384, + "step": 63230 + }, + { + "epoch": 0.3112, + "grad_norm": 0.11202265322208405, + "learning_rate": 4.092317473246997e-05, + "loss": 0.0388, + "step": 63240 + }, + { + "epoch": 0.31125, + "grad_norm": 0.09932032227516174, + "learning_rate": 4.091998776537649e-05, + "loss": 0.0373, + "step": 63250 + }, + { + "epoch": 0.3113, + "grad_norm": 0.10629276931285858, + "learning_rate": 4.091680036303652e-05, + "loss": 0.0387, + "step": 63260 + }, + { + "epoch": 0.31135, + "grad_norm": 0.08321334421634674, + "learning_rate": 4.091361252553721e-05, + "loss": 0.037, + "step": 63270 + }, + { + "epoch": 0.3114, + "grad_norm": 0.08534108102321625, + "learning_rate": 4.091042425296571e-05, + "loss": 0.0366, + "step": 63280 + }, + { + "epoch": 0.31145, + "grad_norm": 0.07628346979618073, + "learning_rate": 4.090723554540919e-05, + "loss": 0.039, + "step": 63290 + }, + { + "epoch": 0.3115, + "grad_norm": 0.08024024963378906, + "learning_rate": 4.090404640295483e-05, + "loss": 0.0353, + "step": 63300 + }, + { + "epoch": 0.31155, + "grad_norm": 0.08957171440124512, + "learning_rate": 4.090085682568982e-05, + "loss": 0.0372, + "step": 63310 + }, + { + "epoch": 0.3116, + "grad_norm": 0.0850076675415039, + "learning_rate": 4.089766681370135e-05, + "loss": 0.0363, + "step": 63320 + }, + { + "epoch": 0.31165, + "grad_norm": 0.115052230656147, + "learning_rate": 4.089447636707664e-05, + "loss": 0.0365, + "step": 63330 + }, + { + "epoch": 0.3117, + "grad_norm": 0.07677609473466873, + "learning_rate": 4.0891285485902915e-05, + "loss": 0.0357, + "step": 63340 + }, + { + "epoch": 0.31175, + "grad_norm": 0.09936800599098206, + "learning_rate": 4.088809417026742e-05, + "loss": 0.0366, + "step": 63350 + }, + { + "epoch": 0.3118, + "grad_norm": 0.09300931543111801, + "learning_rate": 4.08849024202574e-05, + "loss": 0.0369, + "step": 63360 + }, + { + "epoch": 0.31185, + "grad_norm": 0.09681177884340286, + "learning_rate": 4.0881710235960115e-05, + "loss": 0.0381, + "step": 63370 + }, + { + "epoch": 0.3119, + "grad_norm": 0.08285442739725113, + "learning_rate": 4.087851761746284e-05, + "loss": 0.0361, + "step": 63380 + }, + { + "epoch": 0.31195, + "grad_norm": 0.11278854310512543, + "learning_rate": 4.0875324564852856e-05, + "loss": 0.0383, + "step": 63390 + }, + { + "epoch": 0.312, + "grad_norm": 0.10942468047142029, + "learning_rate": 4.0872131078217465e-05, + "loss": 0.0361, + "step": 63400 + }, + { + "epoch": 0.31205, + "grad_norm": 0.08108315616846085, + "learning_rate": 4.086893715764397e-05, + "loss": 0.0371, + "step": 63410 + }, + { + "epoch": 0.3121, + "grad_norm": 0.09134598076343536, + "learning_rate": 4.08657428032197e-05, + "loss": 0.0376, + "step": 63420 + }, + { + "epoch": 0.31215, + "grad_norm": 0.07536963373422623, + "learning_rate": 4.086254801503198e-05, + "loss": 0.0371, + "step": 63430 + }, + { + "epoch": 0.3122, + "grad_norm": 0.09576810896396637, + "learning_rate": 4.085935279316815e-05, + "loss": 0.0373, + "step": 63440 + }, + { + "epoch": 0.31225, + "grad_norm": 0.09800707548856735, + "learning_rate": 4.0856157137715576e-05, + "loss": 0.0386, + "step": 63450 + }, + { + "epoch": 0.3123, + "grad_norm": 0.10401442646980286, + "learning_rate": 4.085296104876163e-05, + "loss": 0.0389, + "step": 63460 + }, + { + "epoch": 0.31235, + "grad_norm": 0.14090527594089508, + "learning_rate": 4.084976452639367e-05, + "loss": 0.0391, + "step": 63470 + }, + { + "epoch": 0.3124, + "grad_norm": 0.10894068330526352, + "learning_rate": 4.084656757069911e-05, + "loss": 0.0393, + "step": 63480 + }, + { + "epoch": 0.31245, + "grad_norm": 0.09572970867156982, + "learning_rate": 4.084337018176535e-05, + "loss": 0.0379, + "step": 63490 + }, + { + "epoch": 0.3125, + "grad_norm": 0.10507770627737045, + "learning_rate": 4.08401723596798e-05, + "loss": 0.0379, + "step": 63500 + }, + { + "epoch": 0.31255, + "grad_norm": 0.10381949692964554, + "learning_rate": 4.083697410452989e-05, + "loss": 0.039, + "step": 63510 + }, + { + "epoch": 0.3126, + "grad_norm": 0.09521272033452988, + "learning_rate": 4.083377541640305e-05, + "loss": 0.0379, + "step": 63520 + }, + { + "epoch": 0.31265, + "grad_norm": 0.10659544169902802, + "learning_rate": 4.0830576295386744e-05, + "loss": 0.0378, + "step": 63530 + }, + { + "epoch": 0.3127, + "grad_norm": 0.09491292387247086, + "learning_rate": 4.082737674156844e-05, + "loss": 0.0357, + "step": 63540 + }, + { + "epoch": 0.31275, + "grad_norm": 0.08107301592826843, + "learning_rate": 4.082417675503558e-05, + "loss": 0.0367, + "step": 63550 + }, + { + "epoch": 0.3128, + "grad_norm": 0.0824798122048378, + "learning_rate": 4.082097633587569e-05, + "loss": 0.0385, + "step": 63560 + }, + { + "epoch": 0.31285, + "grad_norm": 0.10407048463821411, + "learning_rate": 4.081777548417625e-05, + "loss": 0.0386, + "step": 63570 + }, + { + "epoch": 0.3129, + "grad_norm": 0.09542399644851685, + "learning_rate": 4.081457420002476e-05, + "loss": 0.0393, + "step": 63580 + }, + { + "epoch": 0.31295, + "grad_norm": 0.07698172330856323, + "learning_rate": 4.0811372483508745e-05, + "loss": 0.0368, + "step": 63590 + }, + { + "epoch": 0.313, + "grad_norm": 0.08906321972608566, + "learning_rate": 4.080817033471577e-05, + "loss": 0.0376, + "step": 63600 + }, + { + "epoch": 0.31305, + "grad_norm": 0.07813603430986404, + "learning_rate": 4.080496775373334e-05, + "loss": 0.0358, + "step": 63610 + }, + { + "epoch": 0.3131, + "grad_norm": 0.0924825519323349, + "learning_rate": 4.080176474064904e-05, + "loss": 0.0378, + "step": 63620 + }, + { + "epoch": 0.31315, + "grad_norm": 0.10021001100540161, + "learning_rate": 4.079856129555042e-05, + "loss": 0.0374, + "step": 63630 + }, + { + "epoch": 0.3132, + "grad_norm": 0.11586814373731613, + "learning_rate": 4.079535741852507e-05, + "loss": 0.0394, + "step": 63640 + }, + { + "epoch": 0.31325, + "grad_norm": 0.12628233432769775, + "learning_rate": 4.079215310966059e-05, + "loss": 0.038, + "step": 63650 + }, + { + "epoch": 0.3133, + "grad_norm": 0.10121853649616241, + "learning_rate": 4.078894836904457e-05, + "loss": 0.0386, + "step": 63660 + }, + { + "epoch": 0.31335, + "grad_norm": 0.09781887382268906, + "learning_rate": 4.078574319676463e-05, + "loss": 0.0375, + "step": 63670 + }, + { + "epoch": 0.3134, + "grad_norm": 0.1199178546667099, + "learning_rate": 4.078253759290841e-05, + "loss": 0.0387, + "step": 63680 + }, + { + "epoch": 0.31345, + "grad_norm": 0.12737081944942474, + "learning_rate": 4.077933155756354e-05, + "loss": 0.0379, + "step": 63690 + }, + { + "epoch": 0.3135, + "grad_norm": 0.11249277740716934, + "learning_rate": 4.0776125090817666e-05, + "loss": 0.0392, + "step": 63700 + }, + { + "epoch": 0.31355, + "grad_norm": 0.1020110547542572, + "learning_rate": 4.077291819275847e-05, + "loss": 0.0384, + "step": 63710 + }, + { + "epoch": 0.3136, + "grad_norm": 0.11463608592748642, + "learning_rate": 4.0769710863473604e-05, + "loss": 0.0394, + "step": 63720 + }, + { + "epoch": 0.31365, + "grad_norm": 0.10257559269666672, + "learning_rate": 4.076650310305077e-05, + "loss": 0.0418, + "step": 63730 + }, + { + "epoch": 0.3137, + "grad_norm": 0.10356997698545456, + "learning_rate": 4.076329491157768e-05, + "loss": 0.0393, + "step": 63740 + }, + { + "epoch": 0.31375, + "grad_norm": 0.12238980829715729, + "learning_rate": 4.0760086289142006e-05, + "loss": 0.0374, + "step": 63750 + }, + { + "epoch": 0.3138, + "grad_norm": 0.10171011835336685, + "learning_rate": 4.075687723583151e-05, + "loss": 0.0386, + "step": 63760 + }, + { + "epoch": 0.31385, + "grad_norm": 0.15728776156902313, + "learning_rate": 4.0753667751733906e-05, + "loss": 0.0376, + "step": 63770 + }, + { + "epoch": 0.3139, + "grad_norm": 0.12389139831066132, + "learning_rate": 4.075045783693694e-05, + "loss": 0.0383, + "step": 63780 + }, + { + "epoch": 0.31395, + "grad_norm": 0.12089716643095016, + "learning_rate": 4.074724749152837e-05, + "loss": 0.0377, + "step": 63790 + }, + { + "epoch": 0.314, + "grad_norm": 0.08855020999908447, + "learning_rate": 4.074403671559598e-05, + "loss": 0.0391, + "step": 63800 + }, + { + "epoch": 0.31405, + "grad_norm": 0.11645334213972092, + "learning_rate": 4.0740825509227544e-05, + "loss": 0.0407, + "step": 63810 + }, + { + "epoch": 0.3141, + "grad_norm": 0.10966397821903229, + "learning_rate": 4.073761387251084e-05, + "loss": 0.0385, + "step": 63820 + }, + { + "epoch": 0.31415, + "grad_norm": 0.10489311069250107, + "learning_rate": 4.0734401805533696e-05, + "loss": 0.0373, + "step": 63830 + }, + { + "epoch": 0.3142, + "grad_norm": 0.1026468500494957, + "learning_rate": 4.073118930838391e-05, + "loss": 0.0367, + "step": 63840 + }, + { + "epoch": 0.31425, + "grad_norm": 0.09668806195259094, + "learning_rate": 4.072797638114931e-05, + "loss": 0.037, + "step": 63850 + }, + { + "epoch": 0.3143, + "grad_norm": 0.0955677479505539, + "learning_rate": 4.072476302391776e-05, + "loss": 0.0377, + "step": 63860 + }, + { + "epoch": 0.31435, + "grad_norm": 0.11412930488586426, + "learning_rate": 4.072154923677709e-05, + "loss": 0.036, + "step": 63870 + }, + { + "epoch": 0.3144, + "grad_norm": 0.10534878075122833, + "learning_rate": 4.071833501981517e-05, + "loss": 0.0385, + "step": 63880 + }, + { + "epoch": 0.31445, + "grad_norm": 0.10487980395555496, + "learning_rate": 4.071512037311988e-05, + "loss": 0.0378, + "step": 63890 + }, + { + "epoch": 0.3145, + "grad_norm": 0.10983464121818542, + "learning_rate": 4.07119052967791e-05, + "loss": 0.0384, + "step": 63900 + }, + { + "epoch": 0.31455, + "grad_norm": 0.08953792601823807, + "learning_rate": 4.070868979088073e-05, + "loss": 0.0373, + "step": 63910 + }, + { + "epoch": 0.3146, + "grad_norm": 0.10473524034023285, + "learning_rate": 4.070547385551269e-05, + "loss": 0.0386, + "step": 63920 + }, + { + "epoch": 0.31465, + "grad_norm": 0.08791254460811615, + "learning_rate": 4.070225749076289e-05, + "loss": 0.0369, + "step": 63930 + }, + { + "epoch": 0.3147, + "grad_norm": 0.09786912798881531, + "learning_rate": 4.069904069671927e-05, + "loss": 0.0372, + "step": 63940 + }, + { + "epoch": 0.31475, + "grad_norm": 0.09092038869857788, + "learning_rate": 4.069582347346977e-05, + "loss": 0.0379, + "step": 63950 + }, + { + "epoch": 0.3148, + "grad_norm": 0.10886016488075256, + "learning_rate": 4.069260582110236e-05, + "loss": 0.0379, + "step": 63960 + }, + { + "epoch": 0.31485, + "grad_norm": 0.09442399442195892, + "learning_rate": 4.0689387739705e-05, + "loss": 0.0396, + "step": 63970 + }, + { + "epoch": 0.3149, + "grad_norm": 0.07843463867902756, + "learning_rate": 4.0686169229365665e-05, + "loss": 0.0392, + "step": 63980 + }, + { + "epoch": 0.31495, + "grad_norm": 0.089840367436409, + "learning_rate": 4.068295029017236e-05, + "loss": 0.0391, + "step": 63990 + }, + { + "epoch": 0.315, + "grad_norm": 0.08948429673910141, + "learning_rate": 4.0679730922213096e-05, + "loss": 0.0398, + "step": 64000 + }, + { + "epoch": 0.31505, + "grad_norm": 0.08338318765163422, + "learning_rate": 4.067651112557587e-05, + "loss": 0.0381, + "step": 64010 + }, + { + "epoch": 0.3151, + "grad_norm": 0.12214217334985733, + "learning_rate": 4.0673290900348726e-05, + "loss": 0.0402, + "step": 64020 + }, + { + "epoch": 0.31515, + "grad_norm": 0.0753837451338768, + "learning_rate": 4.0670070246619694e-05, + "loss": 0.0362, + "step": 64030 + }, + { + "epoch": 0.3152, + "grad_norm": 0.09924887120723724, + "learning_rate": 4.066684916447682e-05, + "loss": 0.0396, + "step": 64040 + }, + { + "epoch": 0.31525, + "grad_norm": 0.11461825668811798, + "learning_rate": 4.066362765400819e-05, + "loss": 0.0408, + "step": 64050 + }, + { + "epoch": 0.3153, + "grad_norm": 0.11488715559244156, + "learning_rate": 4.066040571530185e-05, + "loss": 0.0398, + "step": 64060 + }, + { + "epoch": 0.31535, + "grad_norm": 0.11786787211894989, + "learning_rate": 4.0657183348445917e-05, + "loss": 0.0383, + "step": 64070 + }, + { + "epoch": 0.3154, + "grad_norm": 0.10072103142738342, + "learning_rate": 4.065396055352846e-05, + "loss": 0.0386, + "step": 64080 + }, + { + "epoch": 0.31545, + "grad_norm": 0.10096098482608795, + "learning_rate": 4.065073733063761e-05, + "loss": 0.0366, + "step": 64090 + }, + { + "epoch": 0.3155, + "grad_norm": 0.08004684001207352, + "learning_rate": 4.064751367986148e-05, + "loss": 0.0386, + "step": 64100 + }, + { + "epoch": 0.31555, + "grad_norm": 0.10719065368175507, + "learning_rate": 4.06442896012882e-05, + "loss": 0.0375, + "step": 64110 + }, + { + "epoch": 0.3156, + "grad_norm": 0.11431484669446945, + "learning_rate": 4.064106509500592e-05, + "loss": 0.0381, + "step": 64120 + }, + { + "epoch": 0.31565, + "grad_norm": 0.09725914150476456, + "learning_rate": 4.06378401611028e-05, + "loss": 0.0382, + "step": 64130 + }, + { + "epoch": 0.3157, + "grad_norm": 0.09252629429101944, + "learning_rate": 4.0634614799667004e-05, + "loss": 0.0374, + "step": 64140 + }, + { + "epoch": 0.31575, + "grad_norm": 0.13072961568832397, + "learning_rate": 4.0631389010786716e-05, + "loss": 0.0384, + "step": 64150 + }, + { + "epoch": 0.3158, + "grad_norm": 0.10663977265357971, + "learning_rate": 4.0628162794550116e-05, + "loss": 0.0385, + "step": 64160 + }, + { + "epoch": 0.31585, + "grad_norm": 0.09481567144393921, + "learning_rate": 4.0624936151045426e-05, + "loss": 0.0389, + "step": 64170 + }, + { + "epoch": 0.3159, + "grad_norm": 0.1195489764213562, + "learning_rate": 4.062170908036085e-05, + "loss": 0.0365, + "step": 64180 + }, + { + "epoch": 0.31595, + "grad_norm": 0.1233643889427185, + "learning_rate": 4.061848158258461e-05, + "loss": 0.0379, + "step": 64190 + }, + { + "epoch": 0.316, + "grad_norm": 0.09934791922569275, + "learning_rate": 4.0615253657804955e-05, + "loss": 0.0367, + "step": 64200 + }, + { + "epoch": 0.31605, + "grad_norm": 0.09175673127174377, + "learning_rate": 4.0612025306110137e-05, + "loss": 0.0385, + "step": 64210 + }, + { + "epoch": 0.3161, + "grad_norm": 0.09873286634683609, + "learning_rate": 4.06087965275884e-05, + "loss": 0.0395, + "step": 64220 + }, + { + "epoch": 0.31615, + "grad_norm": 0.07790535688400269, + "learning_rate": 4.060556732232804e-05, + "loss": 0.0381, + "step": 64230 + }, + { + "epoch": 0.3162, + "grad_norm": 0.10746274143457413, + "learning_rate": 4.0602337690417325e-05, + "loss": 0.0368, + "step": 64240 + }, + { + "epoch": 0.31625, + "grad_norm": 0.0758485198020935, + "learning_rate": 4.0599107631944565e-05, + "loss": 0.0364, + "step": 64250 + }, + { + "epoch": 0.3163, + "grad_norm": 0.08690139651298523, + "learning_rate": 4.059587714699806e-05, + "loss": 0.037, + "step": 64260 + }, + { + "epoch": 0.31635, + "grad_norm": 0.0894489660859108, + "learning_rate": 4.0592646235666136e-05, + "loss": 0.0386, + "step": 64270 + }, + { + "epoch": 0.3164, + "grad_norm": 0.10154829174280167, + "learning_rate": 4.0589414898037124e-05, + "loss": 0.0377, + "step": 64280 + }, + { + "epoch": 0.31645, + "grad_norm": 0.09624433517456055, + "learning_rate": 4.0586183134199355e-05, + "loss": 0.0385, + "step": 64290 + }, + { + "epoch": 0.3165, + "grad_norm": 0.09763739258050919, + "learning_rate": 4.0582950944241204e-05, + "loss": 0.0376, + "step": 64300 + }, + { + "epoch": 0.31655, + "grad_norm": 0.11039437353610992, + "learning_rate": 4.057971832825103e-05, + "loss": 0.0402, + "step": 64310 + }, + { + "epoch": 0.3166, + "grad_norm": 0.09372076392173767, + "learning_rate": 4.05764852863172e-05, + "loss": 0.0405, + "step": 64320 + }, + { + "epoch": 0.31665, + "grad_norm": 0.08872032910585403, + "learning_rate": 4.057325181852812e-05, + "loss": 0.0383, + "step": 64330 + }, + { + "epoch": 0.3167, + "grad_norm": 0.11525247991085052, + "learning_rate": 4.057001792497218e-05, + "loss": 0.0399, + "step": 64340 + }, + { + "epoch": 0.31675, + "grad_norm": 0.08598694950342178, + "learning_rate": 4.0566783605737804e-05, + "loss": 0.0379, + "step": 64350 + }, + { + "epoch": 0.3168, + "grad_norm": 0.08638902008533478, + "learning_rate": 4.0563548860913415e-05, + "loss": 0.037, + "step": 64360 + }, + { + "epoch": 0.31685, + "grad_norm": 0.09434332698583603, + "learning_rate": 4.056031369058745e-05, + "loss": 0.0367, + "step": 64370 + }, + { + "epoch": 0.3169, + "grad_norm": 0.10789995640516281, + "learning_rate": 4.055707809484834e-05, + "loss": 0.0361, + "step": 64380 + }, + { + "epoch": 0.31695, + "grad_norm": 0.09658708423376083, + "learning_rate": 4.055384207378457e-05, + "loss": 0.0364, + "step": 64390 + }, + { + "epoch": 0.317, + "grad_norm": 0.10439179092645645, + "learning_rate": 4.05506056274846e-05, + "loss": 0.0376, + "step": 64400 + }, + { + "epoch": 0.31705, + "grad_norm": 0.11971355229616165, + "learning_rate": 4.054736875603692e-05, + "loss": 0.036, + "step": 64410 + }, + { + "epoch": 0.3171, + "grad_norm": 0.09868525713682175, + "learning_rate": 4.0544131459530006e-05, + "loss": 0.0366, + "step": 64420 + }, + { + "epoch": 0.31715, + "grad_norm": 0.0924951508641243, + "learning_rate": 4.0540893738052385e-05, + "loss": 0.0378, + "step": 64430 + }, + { + "epoch": 0.3172, + "grad_norm": 0.11395670473575592, + "learning_rate": 4.053765559169257e-05, + "loss": 0.0382, + "step": 64440 + }, + { + "epoch": 0.31725, + "grad_norm": 0.09016165137290955, + "learning_rate": 4.053441702053908e-05, + "loss": 0.0371, + "step": 64450 + }, + { + "epoch": 0.3173, + "grad_norm": 0.09708821773529053, + "learning_rate": 4.053117802468047e-05, + "loss": 0.0369, + "step": 64460 + }, + { + "epoch": 0.31735, + "grad_norm": 0.08717033267021179, + "learning_rate": 4.05279386042053e-05, + "loss": 0.0361, + "step": 64470 + }, + { + "epoch": 0.3174, + "grad_norm": 0.1037134975194931, + "learning_rate": 4.05246987592021e-05, + "loss": 0.0377, + "step": 64480 + }, + { + "epoch": 0.31745, + "grad_norm": 0.09291183948516846, + "learning_rate": 4.052145848975948e-05, + "loss": 0.0362, + "step": 64490 + }, + { + "epoch": 0.3175, + "grad_norm": 0.1136283352971077, + "learning_rate": 4.051821779596601e-05, + "loss": 0.0365, + "step": 64500 + }, + { + "epoch": 0.31755, + "grad_norm": 0.09991339594125748, + "learning_rate": 4.05149766779103e-05, + "loss": 0.0406, + "step": 64510 + }, + { + "epoch": 0.3176, + "grad_norm": 0.09657027572393417, + "learning_rate": 4.051173513568096e-05, + "loss": 0.0377, + "step": 64520 + }, + { + "epoch": 0.31765, + "grad_norm": 0.11914631724357605, + "learning_rate": 4.0508493169366604e-05, + "loss": 0.0397, + "step": 64530 + }, + { + "epoch": 0.3177, + "grad_norm": 0.11885827034711838, + "learning_rate": 4.050525077905587e-05, + "loss": 0.0397, + "step": 64540 + }, + { + "epoch": 0.31775, + "grad_norm": 0.09189107269048691, + "learning_rate": 4.050200796483741e-05, + "loss": 0.0365, + "step": 64550 + }, + { + "epoch": 0.3178, + "grad_norm": 0.09286625683307648, + "learning_rate": 4.049876472679987e-05, + "loss": 0.0384, + "step": 64560 + }, + { + "epoch": 0.31785, + "grad_norm": 0.1071949303150177, + "learning_rate": 4.0495521065031926e-05, + "loss": 0.0399, + "step": 64570 + }, + { + "epoch": 0.3179, + "grad_norm": 0.08723582327365875, + "learning_rate": 4.049227697962226e-05, + "loss": 0.0394, + "step": 64580 + }, + { + "epoch": 0.31795, + "grad_norm": 0.1001315489411354, + "learning_rate": 4.048903247065956e-05, + "loss": 0.0359, + "step": 64590 + }, + { + "epoch": 0.318, + "grad_norm": 0.09844236075878143, + "learning_rate": 4.048578753823253e-05, + "loss": 0.0369, + "step": 64600 + }, + { + "epoch": 0.31805, + "grad_norm": 0.10941297560930252, + "learning_rate": 4.048254218242989e-05, + "loss": 0.0378, + "step": 64610 + }, + { + "epoch": 0.3181, + "grad_norm": 0.10665342956781387, + "learning_rate": 4.047929640334036e-05, + "loss": 0.0381, + "step": 64620 + }, + { + "epoch": 0.31815, + "grad_norm": 0.0897960439324379, + "learning_rate": 4.047605020105268e-05, + "loss": 0.0377, + "step": 64630 + }, + { + "epoch": 0.3182, + "grad_norm": 0.07692000269889832, + "learning_rate": 4.04728035756556e-05, + "loss": 0.0372, + "step": 64640 + }, + { + "epoch": 0.31825, + "grad_norm": 0.09068204462528229, + "learning_rate": 4.04695565272379e-05, + "loss": 0.0369, + "step": 64650 + }, + { + "epoch": 0.3183, + "grad_norm": 0.09132274240255356, + "learning_rate": 4.046630905588832e-05, + "loss": 0.0376, + "step": 64660 + }, + { + "epoch": 0.31835, + "grad_norm": 0.10166731476783752, + "learning_rate": 4.046306116169567e-05, + "loss": 0.0387, + "step": 64670 + }, + { + "epoch": 0.3184, + "grad_norm": 0.10080720484256744, + "learning_rate": 4.0459812844748724e-05, + "loss": 0.0388, + "step": 64680 + }, + { + "epoch": 0.31845, + "grad_norm": 0.09786974638700485, + "learning_rate": 4.045656410513631e-05, + "loss": 0.0371, + "step": 64690 + }, + { + "epoch": 0.3185, + "grad_norm": 0.12344319373369217, + "learning_rate": 4.045331494294724e-05, + "loss": 0.0365, + "step": 64700 + }, + { + "epoch": 0.31855, + "grad_norm": 0.10499534755945206, + "learning_rate": 4.045006535827035e-05, + "loss": 0.0372, + "step": 64710 + }, + { + "epoch": 0.3186, + "grad_norm": 0.1006593182682991, + "learning_rate": 4.044681535119447e-05, + "loss": 0.04, + "step": 64720 + }, + { + "epoch": 0.31865, + "grad_norm": 0.10956310480833054, + "learning_rate": 4.044356492180847e-05, + "loss": 0.0373, + "step": 64730 + }, + { + "epoch": 0.3187, + "grad_norm": 0.10918961465358734, + "learning_rate": 4.0440314070201194e-05, + "loss": 0.0372, + "step": 64740 + }, + { + "epoch": 0.31875, + "grad_norm": 0.09064657986164093, + "learning_rate": 4.0437062796461545e-05, + "loss": 0.037, + "step": 64750 + }, + { + "epoch": 0.3188, + "grad_norm": 0.08278244733810425, + "learning_rate": 4.043381110067839e-05, + "loss": 0.0379, + "step": 64760 + }, + { + "epoch": 0.31885, + "grad_norm": 0.08891545236110687, + "learning_rate": 4.043055898294064e-05, + "loss": 0.0382, + "step": 64770 + }, + { + "epoch": 0.3189, + "grad_norm": 0.1029762253165245, + "learning_rate": 4.042730644333721e-05, + "loss": 0.0368, + "step": 64780 + }, + { + "epoch": 0.31895, + "grad_norm": 0.12306119501590729, + "learning_rate": 4.042405348195701e-05, + "loss": 0.0376, + "step": 64790 + }, + { + "epoch": 0.319, + "grad_norm": 0.12125366926193237, + "learning_rate": 4.042080009888899e-05, + "loss": 0.0368, + "step": 64800 + }, + { + "epoch": 0.31905, + "grad_norm": 0.12127557396888733, + "learning_rate": 4.0417546294222074e-05, + "loss": 0.0396, + "step": 64810 + }, + { + "epoch": 0.3191, + "grad_norm": 0.13900983333587646, + "learning_rate": 4.041429206804525e-05, + "loss": 0.0374, + "step": 64820 + }, + { + "epoch": 0.31915, + "grad_norm": 0.12804409861564636, + "learning_rate": 4.041103742044746e-05, + "loss": 0.0385, + "step": 64830 + }, + { + "epoch": 0.3192, + "grad_norm": 0.11414000391960144, + "learning_rate": 4.040778235151771e-05, + "loss": 0.0372, + "step": 64840 + }, + { + "epoch": 0.31925, + "grad_norm": 0.11278680711984634, + "learning_rate": 4.040452686134497e-05, + "loss": 0.0368, + "step": 64850 + }, + { + "epoch": 0.3193, + "grad_norm": 0.11015470325946808, + "learning_rate": 4.0401270950018256e-05, + "loss": 0.037, + "step": 64860 + }, + { + "epoch": 0.31935, + "grad_norm": 0.12102688103914261, + "learning_rate": 4.0398014617626576e-05, + "loss": 0.0377, + "step": 64870 + }, + { + "epoch": 0.3194, + "grad_norm": 0.10942655801773071, + "learning_rate": 4.039475786425896e-05, + "loss": 0.0374, + "step": 64880 + }, + { + "epoch": 0.31945, + "grad_norm": 0.1215989887714386, + "learning_rate": 4.039150069000445e-05, + "loss": 0.0372, + "step": 64890 + }, + { + "epoch": 0.3195, + "grad_norm": 0.13428235054016113, + "learning_rate": 4.03882430949521e-05, + "loss": 0.0379, + "step": 64900 + }, + { + "epoch": 0.31955, + "grad_norm": 0.09986699372529984, + "learning_rate": 4.038498507919096e-05, + "loss": 0.0369, + "step": 64910 + }, + { + "epoch": 0.3196, + "grad_norm": 0.12329811602830887, + "learning_rate": 4.03817266428101e-05, + "loss": 0.0374, + "step": 64920 + }, + { + "epoch": 0.31965, + "grad_norm": 0.1049598753452301, + "learning_rate": 4.037846778589862e-05, + "loss": 0.0372, + "step": 64930 + }, + { + "epoch": 0.3197, + "grad_norm": 0.10766629129648209, + "learning_rate": 4.037520850854561e-05, + "loss": 0.0378, + "step": 64940 + }, + { + "epoch": 0.31975, + "grad_norm": 0.130364328622818, + "learning_rate": 4.0371948810840175e-05, + "loss": 0.0373, + "step": 64950 + }, + { + "epoch": 0.3198, + "grad_norm": 0.12801463901996613, + "learning_rate": 4.036868869287144e-05, + "loss": 0.0362, + "step": 64960 + }, + { + "epoch": 0.31985, + "grad_norm": 0.10812928527593613, + "learning_rate": 4.036542815472851e-05, + "loss": 0.0366, + "step": 64970 + }, + { + "epoch": 0.3199, + "grad_norm": 0.09909074008464813, + "learning_rate": 4.0362167196500566e-05, + "loss": 0.0367, + "step": 64980 + }, + { + "epoch": 0.31995, + "grad_norm": 0.0915384441614151, + "learning_rate": 4.035890581827673e-05, + "loss": 0.0356, + "step": 64990 + }, + { + "epoch": 0.32, + "grad_norm": 0.09630433470010757, + "learning_rate": 4.035564402014619e-05, + "loss": 0.0373, + "step": 65000 + }, + { + "epoch": 0.32005, + "grad_norm": 0.10088414698839188, + "learning_rate": 4.03523818021981e-05, + "loss": 0.0372, + "step": 65010 + }, + { + "epoch": 0.3201, + "grad_norm": 0.08994408696889877, + "learning_rate": 4.034911916452167e-05, + "loss": 0.0366, + "step": 65020 + }, + { + "epoch": 0.32015, + "grad_norm": 0.10887617617845535, + "learning_rate": 4.034585610720608e-05, + "loss": 0.0383, + "step": 65030 + }, + { + "epoch": 0.3202, + "grad_norm": 0.087371826171875, + "learning_rate": 4.034259263034056e-05, + "loss": 0.0378, + "step": 65040 + }, + { + "epoch": 0.32025, + "grad_norm": 0.09730211645364761, + "learning_rate": 4.033932873401431e-05, + "loss": 0.0376, + "step": 65050 + }, + { + "epoch": 0.3203, + "grad_norm": 0.10706541687250137, + "learning_rate": 4.0336064418316575e-05, + "loss": 0.0365, + "step": 65060 + }, + { + "epoch": 0.32035, + "grad_norm": 0.09982568770647049, + "learning_rate": 4.0332799683336605e-05, + "loss": 0.0374, + "step": 65070 + }, + { + "epoch": 0.3204, + "grad_norm": 0.08777441084384918, + "learning_rate": 4.0329534529163654e-05, + "loss": 0.0358, + "step": 65080 + }, + { + "epoch": 0.32045, + "grad_norm": 0.10409170389175415, + "learning_rate": 4.032626895588698e-05, + "loss": 0.0358, + "step": 65090 + }, + { + "epoch": 0.3205, + "grad_norm": 0.09802231192588806, + "learning_rate": 4.032300296359588e-05, + "loss": 0.0365, + "step": 65100 + }, + { + "epoch": 0.32055, + "grad_norm": 0.09521409869194031, + "learning_rate": 4.031973655237963e-05, + "loss": 0.0402, + "step": 65110 + }, + { + "epoch": 0.3206, + "grad_norm": 0.09686526656150818, + "learning_rate": 4.031646972232754e-05, + "loss": 0.0378, + "step": 65120 + }, + { + "epoch": 0.32065, + "grad_norm": 0.1024775430560112, + "learning_rate": 4.031320247352892e-05, + "loss": 0.0367, + "step": 65130 + }, + { + "epoch": 0.3207, + "grad_norm": 0.07837732136249542, + "learning_rate": 4.03099348060731e-05, + "loss": 0.0361, + "step": 65140 + }, + { + "epoch": 0.32075, + "grad_norm": 0.07916653156280518, + "learning_rate": 4.030666672004941e-05, + "loss": 0.0387, + "step": 65150 + }, + { + "epoch": 0.3208, + "grad_norm": 0.08275351673364639, + "learning_rate": 4.030339821554721e-05, + "loss": 0.0352, + "step": 65160 + }, + { + "epoch": 0.32085, + "grad_norm": 0.10535655915737152, + "learning_rate": 4.030012929265585e-05, + "loss": 0.0372, + "step": 65170 + }, + { + "epoch": 0.3209, + "grad_norm": 0.0935136154294014, + "learning_rate": 4.0296859951464695e-05, + "loss": 0.0362, + "step": 65180 + }, + { + "epoch": 0.32095, + "grad_norm": 0.09714420139789581, + "learning_rate": 4.0293590192063145e-05, + "loss": 0.0367, + "step": 65190 + }, + { + "epoch": 0.321, + "grad_norm": 0.0949711725115776, + "learning_rate": 4.029032001454058e-05, + "loss": 0.0398, + "step": 65200 + }, + { + "epoch": 0.32105, + "grad_norm": 0.09688550233840942, + "learning_rate": 4.028704941898641e-05, + "loss": 0.0378, + "step": 65210 + }, + { + "epoch": 0.3211, + "grad_norm": 0.10137657821178436, + "learning_rate": 4.028377840549005e-05, + "loss": 0.0364, + "step": 65220 + }, + { + "epoch": 0.32115, + "grad_norm": 0.1072549819946289, + "learning_rate": 4.028050697414094e-05, + "loss": 0.0395, + "step": 65230 + }, + { + "epoch": 0.3212, + "grad_norm": 0.10294804722070694, + "learning_rate": 4.02772351250285e-05, + "loss": 0.0393, + "step": 65240 + }, + { + "epoch": 0.32125, + "grad_norm": 0.11168816685676575, + "learning_rate": 4.027396285824219e-05, + "loss": 0.0362, + "step": 65250 + }, + { + "epoch": 0.3213, + "grad_norm": 0.08691563457250595, + "learning_rate": 4.027069017387148e-05, + "loss": 0.0372, + "step": 65260 + }, + { + "epoch": 0.32135, + "grad_norm": 0.08215939253568649, + "learning_rate": 4.026741707200584e-05, + "loss": 0.0372, + "step": 65270 + }, + { + "epoch": 0.3214, + "grad_norm": 0.09381280094385147, + "learning_rate": 4.026414355273475e-05, + "loss": 0.0374, + "step": 65280 + }, + { + "epoch": 0.32145, + "grad_norm": 0.09605218470096588, + "learning_rate": 4.02608696161477e-05, + "loss": 0.0376, + "step": 65290 + }, + { + "epoch": 0.3215, + "grad_norm": 0.09198276698589325, + "learning_rate": 4.0257595262334214e-05, + "loss": 0.0396, + "step": 65300 + }, + { + "epoch": 0.32155, + "grad_norm": 0.10652513802051544, + "learning_rate": 4.025432049138381e-05, + "loss": 0.0373, + "step": 65310 + }, + { + "epoch": 0.3216, + "grad_norm": 0.09276243299245834, + "learning_rate": 4.0251045303386013e-05, + "loss": 0.0368, + "step": 65320 + }, + { + "epoch": 0.32165, + "grad_norm": 0.08338119834661484, + "learning_rate": 4.024776969843037e-05, + "loss": 0.0369, + "step": 65330 + }, + { + "epoch": 0.3217, + "grad_norm": 0.07244552671909332, + "learning_rate": 4.024449367660642e-05, + "loss": 0.0366, + "step": 65340 + }, + { + "epoch": 0.32175, + "grad_norm": 0.08666352927684784, + "learning_rate": 4.0241217238003746e-05, + "loss": 0.0377, + "step": 65350 + }, + { + "epoch": 0.3218, + "grad_norm": 0.08412259817123413, + "learning_rate": 4.023794038271193e-05, + "loss": 0.0356, + "step": 65360 + }, + { + "epoch": 0.32185, + "grad_norm": 0.10810157656669617, + "learning_rate": 4.0234663110820534e-05, + "loss": 0.037, + "step": 65370 + }, + { + "epoch": 0.3219, + "grad_norm": 0.08649826794862747, + "learning_rate": 4.0231385422419174e-05, + "loss": 0.0369, + "step": 65380 + }, + { + "epoch": 0.32195, + "grad_norm": 0.09026186913251877, + "learning_rate": 4.0228107317597464e-05, + "loss": 0.0353, + "step": 65390 + }, + { + "epoch": 0.322, + "grad_norm": 0.08323583006858826, + "learning_rate": 4.0224828796445014e-05, + "loss": 0.036, + "step": 65400 + }, + { + "epoch": 0.32205, + "grad_norm": 0.08100961893796921, + "learning_rate": 4.022154985905147e-05, + "loss": 0.036, + "step": 65410 + }, + { + "epoch": 0.3221, + "grad_norm": 0.08172884583473206, + "learning_rate": 4.021827050550647e-05, + "loss": 0.0371, + "step": 65420 + }, + { + "epoch": 0.32215, + "grad_norm": 0.09196305274963379, + "learning_rate": 4.021499073589967e-05, + "loss": 0.0369, + "step": 65430 + }, + { + "epoch": 0.3222, + "grad_norm": 0.10631976276636124, + "learning_rate": 4.021171055032074e-05, + "loss": 0.0363, + "step": 65440 + }, + { + "epoch": 0.32225, + "grad_norm": 0.08054377138614655, + "learning_rate": 4.0208429948859364e-05, + "loss": 0.0366, + "step": 65450 + }, + { + "epoch": 0.3223, + "grad_norm": 0.09362701326608658, + "learning_rate": 4.020514893160522e-05, + "loss": 0.0372, + "step": 65460 + }, + { + "epoch": 0.32235, + "grad_norm": 0.08705075085163116, + "learning_rate": 4.020186749864802e-05, + "loss": 0.0354, + "step": 65470 + }, + { + "epoch": 0.3224, + "grad_norm": 0.11856447160243988, + "learning_rate": 4.019858565007747e-05, + "loss": 0.0364, + "step": 65480 + }, + { + "epoch": 0.32245, + "grad_norm": 0.12213760614395142, + "learning_rate": 4.0195303385983305e-05, + "loss": 0.0359, + "step": 65490 + }, + { + "epoch": 0.3225, + "grad_norm": 0.10082264244556427, + "learning_rate": 4.0192020706455245e-05, + "loss": 0.0373, + "step": 65500 + }, + { + "epoch": 0.32255, + "grad_norm": 0.09375528991222382, + "learning_rate": 4.018873761158305e-05, + "loss": 0.0359, + "step": 65510 + }, + { + "epoch": 0.3226, + "grad_norm": 0.09272167086601257, + "learning_rate": 4.018545410145648e-05, + "loss": 0.0361, + "step": 65520 + }, + { + "epoch": 0.32265, + "grad_norm": 0.10465720295906067, + "learning_rate": 4.018217017616529e-05, + "loss": 0.0379, + "step": 65530 + }, + { + "epoch": 0.3227, + "grad_norm": 0.12206566333770752, + "learning_rate": 4.017888583579928e-05, + "loss": 0.0377, + "step": 65540 + }, + { + "epoch": 0.32275, + "grad_norm": 0.1141703650355339, + "learning_rate": 4.017560108044823e-05, + "loss": 0.0366, + "step": 65550 + }, + { + "epoch": 0.3228, + "grad_norm": 0.10635753720998764, + "learning_rate": 4.017231591020194e-05, + "loss": 0.0386, + "step": 65560 + }, + { + "epoch": 0.32285, + "grad_norm": 0.11412060260772705, + "learning_rate": 4.016903032515025e-05, + "loss": 0.0388, + "step": 65570 + }, + { + "epoch": 0.3229, + "grad_norm": 0.09945717453956604, + "learning_rate": 4.016574432538296e-05, + "loss": 0.0366, + "step": 65580 + }, + { + "epoch": 0.32295, + "grad_norm": 0.11253881454467773, + "learning_rate": 4.0162457910989914e-05, + "loss": 0.0374, + "step": 65590 + }, + { + "epoch": 0.323, + "grad_norm": 0.09585455060005188, + "learning_rate": 4.015917108206097e-05, + "loss": 0.0393, + "step": 65600 + }, + { + "epoch": 0.32305, + "grad_norm": 0.08950097113847733, + "learning_rate": 4.015588383868598e-05, + "loss": 0.0381, + "step": 65610 + }, + { + "epoch": 0.3231, + "grad_norm": 0.09589240700006485, + "learning_rate": 4.015259618095483e-05, + "loss": 0.0385, + "step": 65620 + }, + { + "epoch": 0.32315, + "grad_norm": 0.10538499057292938, + "learning_rate": 4.014930810895738e-05, + "loss": 0.0388, + "step": 65630 + }, + { + "epoch": 0.3232, + "grad_norm": 0.11811120808124542, + "learning_rate": 4.014601962278354e-05, + "loss": 0.0388, + "step": 65640 + }, + { + "epoch": 0.32325, + "grad_norm": 0.1357794851064682, + "learning_rate": 4.014273072252322e-05, + "loss": 0.0386, + "step": 65650 + }, + { + "epoch": 0.3233, + "grad_norm": 0.12051770836114883, + "learning_rate": 4.0139441408266326e-05, + "loss": 0.0388, + "step": 65660 + }, + { + "epoch": 0.32335, + "grad_norm": 0.12320932745933533, + "learning_rate": 4.0136151680102794e-05, + "loss": 0.0372, + "step": 65670 + }, + { + "epoch": 0.3234, + "grad_norm": 0.10769601166248322, + "learning_rate": 4.013286153812256e-05, + "loss": 0.0383, + "step": 65680 + }, + { + "epoch": 0.32345, + "grad_norm": 0.10716307908296585, + "learning_rate": 4.012957098241558e-05, + "loss": 0.0394, + "step": 65690 + }, + { + "epoch": 0.3235, + "grad_norm": 0.10903730243444443, + "learning_rate": 4.0126280013071806e-05, + "loss": 0.0389, + "step": 65700 + }, + { + "epoch": 0.32355, + "grad_norm": 0.11699903011322021, + "learning_rate": 4.0122988630181226e-05, + "loss": 0.038, + "step": 65710 + }, + { + "epoch": 0.3236, + "grad_norm": 0.09041581302881241, + "learning_rate": 4.011969683383381e-05, + "loss": 0.0401, + "step": 65720 + }, + { + "epoch": 0.32365, + "grad_norm": 0.0852922797203064, + "learning_rate": 4.0116404624119576e-05, + "loss": 0.0372, + "step": 65730 + }, + { + "epoch": 0.3237, + "grad_norm": 0.10173098742961884, + "learning_rate": 4.0113112001128505e-05, + "loss": 0.0376, + "step": 65740 + }, + { + "epoch": 0.32375, + "grad_norm": 0.11126632243394852, + "learning_rate": 4.010981896495064e-05, + "loss": 0.0364, + "step": 65750 + }, + { + "epoch": 0.3238, + "grad_norm": 0.09293050318956375, + "learning_rate": 4.0106525515676e-05, + "loss": 0.0362, + "step": 65760 + }, + { + "epoch": 0.32385, + "grad_norm": 0.08062175661325455, + "learning_rate": 4.010323165339462e-05, + "loss": 0.0357, + "step": 65770 + }, + { + "epoch": 0.3239, + "grad_norm": 0.07949662208557129, + "learning_rate": 4.009993737819656e-05, + "loss": 0.0362, + "step": 65780 + }, + { + "epoch": 0.32395, + "grad_norm": 0.08509853482246399, + "learning_rate": 4.009664269017189e-05, + "loss": 0.0376, + "step": 65790 + }, + { + "epoch": 0.324, + "grad_norm": 0.09357540309429169, + "learning_rate": 4.009334758941068e-05, + "loss": 0.0357, + "step": 65800 + }, + { + "epoch": 0.32405, + "grad_norm": 0.08863934129476547, + "learning_rate": 4.009005207600302e-05, + "loss": 0.0367, + "step": 65810 + }, + { + "epoch": 0.3241, + "grad_norm": 0.11384686082601547, + "learning_rate": 4.008675615003901e-05, + "loss": 0.0372, + "step": 65820 + }, + { + "epoch": 0.32415, + "grad_norm": 0.08933109045028687, + "learning_rate": 4.008345981160874e-05, + "loss": 0.0356, + "step": 65830 + }, + { + "epoch": 0.3242, + "grad_norm": 0.09838990122079849, + "learning_rate": 4.008016306080236e-05, + "loss": 0.0371, + "step": 65840 + }, + { + "epoch": 0.32425, + "grad_norm": 0.09985664486885071, + "learning_rate": 4.007686589770997e-05, + "loss": 0.0356, + "step": 65850 + }, + { + "epoch": 0.3243, + "grad_norm": 0.10246867686510086, + "learning_rate": 4.0073568322421744e-05, + "loss": 0.0398, + "step": 65860 + }, + { + "epoch": 0.32435, + "grad_norm": 0.09228799492120743, + "learning_rate": 4.007027033502782e-05, + "loss": 0.0375, + "step": 65870 + }, + { + "epoch": 0.3244, + "grad_norm": 0.0932326391339302, + "learning_rate": 4.006697193561837e-05, + "loss": 0.0364, + "step": 65880 + }, + { + "epoch": 0.32445, + "grad_norm": 0.11617203056812286, + "learning_rate": 4.006367312428356e-05, + "loss": 0.0387, + "step": 65890 + }, + { + "epoch": 0.3245, + "grad_norm": 0.0983414575457573, + "learning_rate": 4.006037390111359e-05, + "loss": 0.0375, + "step": 65900 + }, + { + "epoch": 0.32455, + "grad_norm": 0.08532964438199997, + "learning_rate": 4.005707426619866e-05, + "loss": 0.0369, + "step": 65910 + }, + { + "epoch": 0.3246, + "grad_norm": 0.08981236815452576, + "learning_rate": 4.005377421962897e-05, + "loss": 0.0367, + "step": 65920 + }, + { + "epoch": 0.32465, + "grad_norm": 0.08878609538078308, + "learning_rate": 4.005047376149475e-05, + "loss": 0.0377, + "step": 65930 + }, + { + "epoch": 0.3247, + "grad_norm": 0.11349914222955704, + "learning_rate": 4.004717289188623e-05, + "loss": 0.0384, + "step": 65940 + }, + { + "epoch": 0.32475, + "grad_norm": 0.1028319001197815, + "learning_rate": 4.004387161089365e-05, + "loss": 0.0383, + "step": 65950 + }, + { + "epoch": 0.3248, + "grad_norm": 0.0931456908583641, + "learning_rate": 4.0040569918607285e-05, + "loss": 0.0371, + "step": 65960 + }, + { + "epoch": 0.32485, + "grad_norm": 0.10074255615472794, + "learning_rate": 4.003726781511738e-05, + "loss": 0.038, + "step": 65970 + }, + { + "epoch": 0.3249, + "grad_norm": 0.10310529172420502, + "learning_rate": 4.0033965300514226e-05, + "loss": 0.0379, + "step": 65980 + }, + { + "epoch": 0.32495, + "grad_norm": 0.08930498361587524, + "learning_rate": 4.003066237488811e-05, + "loss": 0.0373, + "step": 65990 + }, + { + "epoch": 0.325, + "grad_norm": 0.12736685574054718, + "learning_rate": 4.002735903832933e-05, + "loss": 0.0419, + "step": 66000 + }, + { + "epoch": 0.32505, + "grad_norm": 0.10972797870635986, + "learning_rate": 4.0024055290928196e-05, + "loss": 0.0367, + "step": 66010 + }, + { + "epoch": 0.3251, + "grad_norm": 0.10036025196313858, + "learning_rate": 4.002075113277504e-05, + "loss": 0.0377, + "step": 66020 + }, + { + "epoch": 0.32515, + "grad_norm": 0.09857061505317688, + "learning_rate": 4.001744656396019e-05, + "loss": 0.0391, + "step": 66030 + }, + { + "epoch": 0.3252, + "grad_norm": 0.09945893287658691, + "learning_rate": 4.001414158457399e-05, + "loss": 0.0374, + "step": 66040 + }, + { + "epoch": 0.32525, + "grad_norm": 0.0812176913022995, + "learning_rate": 4.00108361947068e-05, + "loss": 0.0372, + "step": 66050 + }, + { + "epoch": 0.3253, + "grad_norm": 0.08450525254011154, + "learning_rate": 4.000753039444899e-05, + "loss": 0.0356, + "step": 66060 + }, + { + "epoch": 0.32535, + "grad_norm": 0.09813328087329865, + "learning_rate": 4.000422418389094e-05, + "loss": 0.0389, + "step": 66070 + }, + { + "epoch": 0.3254, + "grad_norm": 0.09411191940307617, + "learning_rate": 4.000091756312302e-05, + "loss": 0.0365, + "step": 66080 + }, + { + "epoch": 0.32545, + "grad_norm": 0.10968726128339767, + "learning_rate": 3.9997610532235665e-05, + "loss": 0.0379, + "step": 66090 + }, + { + "epoch": 0.3255, + "grad_norm": 0.1049523800611496, + "learning_rate": 3.999430309131927e-05, + "loss": 0.0364, + "step": 66100 + }, + { + "epoch": 0.32555, + "grad_norm": 0.11498141288757324, + "learning_rate": 3.999099524046427e-05, + "loss": 0.0367, + "step": 66110 + }, + { + "epoch": 0.3256, + "grad_norm": 0.11578301340341568, + "learning_rate": 3.998768697976108e-05, + "loss": 0.0388, + "step": 66120 + }, + { + "epoch": 0.32565, + "grad_norm": 0.11545883119106293, + "learning_rate": 3.998437830930016e-05, + "loss": 0.0374, + "step": 66130 + }, + { + "epoch": 0.3257, + "grad_norm": 0.11564458906650543, + "learning_rate": 3.9981069229171965e-05, + "loss": 0.0366, + "step": 66140 + }, + { + "epoch": 0.32575, + "grad_norm": 0.11431770771741867, + "learning_rate": 3.997775973946697e-05, + "loss": 0.0395, + "step": 66150 + }, + { + "epoch": 0.3258, + "grad_norm": 0.1111539676785469, + "learning_rate": 3.997444984027565e-05, + "loss": 0.0371, + "step": 66160 + }, + { + "epoch": 0.32585, + "grad_norm": 0.08865462243556976, + "learning_rate": 3.99711395316885e-05, + "loss": 0.0355, + "step": 66170 + }, + { + "epoch": 0.3259, + "grad_norm": 0.0973217785358429, + "learning_rate": 3.9967828813796015e-05, + "loss": 0.0372, + "step": 66180 + }, + { + "epoch": 0.32595, + "grad_norm": 0.08040197193622589, + "learning_rate": 3.9964517686688716e-05, + "loss": 0.0374, + "step": 66190 + }, + { + "epoch": 0.326, + "grad_norm": 0.10195370018482208, + "learning_rate": 3.996120615045712e-05, + "loss": 0.0377, + "step": 66200 + }, + { + "epoch": 0.32605, + "grad_norm": 0.09814627468585968, + "learning_rate": 3.9957894205191776e-05, + "loss": 0.0378, + "step": 66210 + }, + { + "epoch": 0.3261, + "grad_norm": 0.07224318385124207, + "learning_rate": 3.995458185098322e-05, + "loss": 0.0382, + "step": 66220 + }, + { + "epoch": 0.32615, + "grad_norm": 0.09805803000926971, + "learning_rate": 3.995126908792201e-05, + "loss": 0.0376, + "step": 66230 + }, + { + "epoch": 0.3262, + "grad_norm": 0.1111101359128952, + "learning_rate": 3.9947955916098734e-05, + "loss": 0.0373, + "step": 66240 + }, + { + "epoch": 0.32625, + "grad_norm": 0.11266911774873734, + "learning_rate": 3.9944642335603946e-05, + "loss": 0.0377, + "step": 66250 + }, + { + "epoch": 0.3263, + "grad_norm": 0.12269485741853714, + "learning_rate": 3.994132834652825e-05, + "loss": 0.039, + "step": 66260 + }, + { + "epoch": 0.32635, + "grad_norm": 0.08595847338438034, + "learning_rate": 3.993801394896226e-05, + "loss": 0.0364, + "step": 66270 + }, + { + "epoch": 0.3264, + "grad_norm": 0.08507047593593597, + "learning_rate": 3.9934699142996576e-05, + "loss": 0.0376, + "step": 66280 + }, + { + "epoch": 0.32645, + "grad_norm": 0.09251835197210312, + "learning_rate": 3.993138392872182e-05, + "loss": 0.037, + "step": 66290 + }, + { + "epoch": 0.3265, + "grad_norm": 0.10652598738670349, + "learning_rate": 3.992806830622865e-05, + "loss": 0.0398, + "step": 66300 + }, + { + "epoch": 0.32655, + "grad_norm": 0.08765865862369537, + "learning_rate": 3.99247522756077e-05, + "loss": 0.037, + "step": 66310 + }, + { + "epoch": 0.3266, + "grad_norm": 0.07849414646625519, + "learning_rate": 3.992143583694962e-05, + "loss": 0.0367, + "step": 66320 + }, + { + "epoch": 0.32665, + "grad_norm": 0.09718433022499084, + "learning_rate": 3.99181189903451e-05, + "loss": 0.0385, + "step": 66330 + }, + { + "epoch": 0.3267, + "grad_norm": 0.11125581711530685, + "learning_rate": 3.99148017358848e-05, + "loss": 0.0376, + "step": 66340 + }, + { + "epoch": 0.32675, + "grad_norm": 0.09389445185661316, + "learning_rate": 3.991148407365943e-05, + "loss": 0.0373, + "step": 66350 + }, + { + "epoch": 0.3268, + "grad_norm": 0.07964462786912918, + "learning_rate": 3.990816600375969e-05, + "loss": 0.0382, + "step": 66360 + }, + { + "epoch": 0.32685, + "grad_norm": 0.08663036674261093, + "learning_rate": 3.990484752627629e-05, + "loss": 0.0372, + "step": 66370 + }, + { + "epoch": 0.3269, + "grad_norm": 0.09641426801681519, + "learning_rate": 3.9901528641299955e-05, + "loss": 0.0385, + "step": 66380 + }, + { + "epoch": 0.32695, + "grad_norm": 0.08874489367008209, + "learning_rate": 3.989820934892143e-05, + "loss": 0.036, + "step": 66390 + }, + { + "epoch": 0.327, + "grad_norm": 0.08502322435379028, + "learning_rate": 3.9894889649231455e-05, + "loss": 0.0383, + "step": 66400 + }, + { + "epoch": 0.32705, + "grad_norm": 0.12889431416988373, + "learning_rate": 3.98915695423208e-05, + "loss": 0.0377, + "step": 66410 + }, + { + "epoch": 0.3271, + "grad_norm": 0.10460399091243744, + "learning_rate": 3.988824902828022e-05, + "loss": 0.0397, + "step": 66420 + }, + { + "epoch": 0.32715, + "grad_norm": 0.1179177463054657, + "learning_rate": 3.98849281072005e-05, + "loss": 0.0377, + "step": 66430 + }, + { + "epoch": 0.3272, + "grad_norm": 0.10219012945890427, + "learning_rate": 3.988160677917245e-05, + "loss": 0.0378, + "step": 66440 + }, + { + "epoch": 0.32725, + "grad_norm": 0.11003606021404266, + "learning_rate": 3.987828504428685e-05, + "loss": 0.0378, + "step": 66450 + }, + { + "epoch": 0.3273, + "grad_norm": 0.10524841398000717, + "learning_rate": 3.987496290263454e-05, + "loss": 0.0389, + "step": 66460 + }, + { + "epoch": 0.32735, + "grad_norm": 0.09106507152318954, + "learning_rate": 3.987164035430632e-05, + "loss": 0.0364, + "step": 66470 + }, + { + "epoch": 0.3274, + "grad_norm": 0.07917270809412003, + "learning_rate": 3.986831739939305e-05, + "loss": 0.0374, + "step": 66480 + }, + { + "epoch": 0.32745, + "grad_norm": 0.08859952539205551, + "learning_rate": 3.986499403798556e-05, + "loss": 0.0366, + "step": 66490 + }, + { + "epoch": 0.3275, + "grad_norm": 0.1100822240114212, + "learning_rate": 3.986167027017472e-05, + "loss": 0.0398, + "step": 66500 + }, + { + "epoch": 0.32755, + "grad_norm": 0.10014413297176361, + "learning_rate": 3.9858346096051405e-05, + "loss": 0.0352, + "step": 66510 + }, + { + "epoch": 0.3276, + "grad_norm": 0.08480783551931381, + "learning_rate": 3.985502151570648e-05, + "loss": 0.0366, + "step": 66520 + }, + { + "epoch": 0.32765, + "grad_norm": 0.10877691209316254, + "learning_rate": 3.9851696529230847e-05, + "loss": 0.0384, + "step": 66530 + }, + { + "epoch": 0.3277, + "grad_norm": 0.10374827682971954, + "learning_rate": 3.984837113671541e-05, + "loss": 0.037, + "step": 66540 + }, + { + "epoch": 0.32775, + "grad_norm": 0.09818617254495621, + "learning_rate": 3.984504533825109e-05, + "loss": 0.0363, + "step": 66550 + }, + { + "epoch": 0.3278, + "grad_norm": 0.1003548726439476, + "learning_rate": 3.984171913392881e-05, + "loss": 0.0395, + "step": 66560 + }, + { + "epoch": 0.32785, + "grad_norm": 0.09324333071708679, + "learning_rate": 3.9838392523839496e-05, + "loss": 0.0376, + "step": 66570 + }, + { + "epoch": 0.3279, + "grad_norm": 0.07978849112987518, + "learning_rate": 3.983506550807411e-05, + "loss": 0.0365, + "step": 66580 + }, + { + "epoch": 0.32795, + "grad_norm": 0.09464648365974426, + "learning_rate": 3.98317380867236e-05, + "loss": 0.0357, + "step": 66590 + }, + { + "epoch": 0.328, + "grad_norm": 0.08341842144727707, + "learning_rate": 3.982841025987896e-05, + "loss": 0.035, + "step": 66600 + }, + { + "epoch": 0.32805, + "grad_norm": 0.07485797256231308, + "learning_rate": 3.982508202763114e-05, + "loss": 0.0348, + "step": 66610 + }, + { + "epoch": 0.3281, + "grad_norm": 0.09925837814807892, + "learning_rate": 3.982175339007115e-05, + "loss": 0.0356, + "step": 66620 + }, + { + "epoch": 0.32815, + "grad_norm": 0.10949580371379852, + "learning_rate": 3.981842434728999e-05, + "loss": 0.0373, + "step": 66630 + }, + { + "epoch": 0.3282, + "grad_norm": 0.08591295033693314, + "learning_rate": 3.981509489937868e-05, + "loss": 0.0375, + "step": 66640 + }, + { + "epoch": 0.32825, + "grad_norm": 0.10751231759786606, + "learning_rate": 3.981176504642823e-05, + "loss": 0.0384, + "step": 66650 + }, + { + "epoch": 0.3283, + "grad_norm": 0.10512945801019669, + "learning_rate": 3.980843478852969e-05, + "loss": 0.0417, + "step": 66660 + }, + { + "epoch": 0.32835, + "grad_norm": 0.09879367053508759, + "learning_rate": 3.980510412577412e-05, + "loss": 0.0364, + "step": 66670 + }, + { + "epoch": 0.3284, + "grad_norm": 0.10238294303417206, + "learning_rate": 3.980177305825256e-05, + "loss": 0.036, + "step": 66680 + }, + { + "epoch": 0.32845, + "grad_norm": 0.09616518765687943, + "learning_rate": 3.979844158605608e-05, + "loss": 0.0369, + "step": 66690 + }, + { + "epoch": 0.3285, + "grad_norm": 0.08645855635404587, + "learning_rate": 3.979510970927577e-05, + "loss": 0.0374, + "step": 66700 + }, + { + "epoch": 0.32855, + "grad_norm": 0.11920953541994095, + "learning_rate": 3.979177742800271e-05, + "loss": 0.0393, + "step": 66710 + }, + { + "epoch": 0.3286, + "grad_norm": 0.11412829160690308, + "learning_rate": 3.978844474232802e-05, + "loss": 0.042, + "step": 66720 + }, + { + "epoch": 0.32865, + "grad_norm": 0.10297642648220062, + "learning_rate": 3.978511165234281e-05, + "loss": 0.0403, + "step": 66730 + }, + { + "epoch": 0.3287, + "grad_norm": 0.07714718580245972, + "learning_rate": 3.97817781581382e-05, + "loss": 0.0387, + "step": 66740 + }, + { + "epoch": 0.32875, + "grad_norm": 0.12992724776268005, + "learning_rate": 3.977844425980532e-05, + "loss": 0.0427, + "step": 66750 + }, + { + "epoch": 0.3288, + "grad_norm": 0.12231067568063736, + "learning_rate": 3.977510995743533e-05, + "loss": 0.041, + "step": 66760 + }, + { + "epoch": 0.32885, + "grad_norm": 0.09669715911149979, + "learning_rate": 3.977177525111939e-05, + "loss": 0.0374, + "step": 66770 + }, + { + "epoch": 0.3289, + "grad_norm": 0.12006594240665436, + "learning_rate": 3.976844014094866e-05, + "loss": 0.0393, + "step": 66780 + }, + { + "epoch": 0.32895, + "grad_norm": 0.1673240065574646, + "learning_rate": 3.9765104627014324e-05, + "loss": 0.0383, + "step": 66790 + }, + { + "epoch": 0.329, + "grad_norm": 0.10522366315126419, + "learning_rate": 3.976176870940758e-05, + "loss": 0.0381, + "step": 66800 + }, + { + "epoch": 0.32905, + "grad_norm": 0.07761319726705551, + "learning_rate": 3.975843238821961e-05, + "loss": 0.0377, + "step": 66810 + }, + { + "epoch": 0.3291, + "grad_norm": 0.08520624786615372, + "learning_rate": 3.9755095663541665e-05, + "loss": 0.0405, + "step": 66820 + }, + { + "epoch": 0.32915, + "grad_norm": 0.09112141281366348, + "learning_rate": 3.9751758535464935e-05, + "loss": 0.038, + "step": 66830 + }, + { + "epoch": 0.3292, + "grad_norm": 0.08923102170228958, + "learning_rate": 3.9748421004080664e-05, + "loss": 0.0367, + "step": 66840 + }, + { + "epoch": 0.32925, + "grad_norm": 0.0915558859705925, + "learning_rate": 3.9745083069480114e-05, + "loss": 0.0372, + "step": 66850 + }, + { + "epoch": 0.3293, + "grad_norm": 0.10059615224599838, + "learning_rate": 3.974174473175453e-05, + "loss": 0.0374, + "step": 66860 + }, + { + "epoch": 0.32935, + "grad_norm": 0.10322733968496323, + "learning_rate": 3.973840599099518e-05, + "loss": 0.0377, + "step": 66870 + }, + { + "epoch": 0.3294, + "grad_norm": 0.08271342515945435, + "learning_rate": 3.973506684729335e-05, + "loss": 0.0384, + "step": 66880 + }, + { + "epoch": 0.32945, + "grad_norm": 0.10284489393234253, + "learning_rate": 3.973172730074033e-05, + "loss": 0.0375, + "step": 66890 + }, + { + "epoch": 0.3295, + "grad_norm": 0.13020607829093933, + "learning_rate": 3.972838735142741e-05, + "loss": 0.0397, + "step": 66900 + }, + { + "epoch": 0.32955, + "grad_norm": 0.10525212436914444, + "learning_rate": 3.9725046999445924e-05, + "loss": 0.0369, + "step": 66910 + }, + { + "epoch": 0.3296, + "grad_norm": 0.0913553461432457, + "learning_rate": 3.972170624488718e-05, + "loss": 0.0373, + "step": 66920 + }, + { + "epoch": 0.32965, + "grad_norm": 0.10340815782546997, + "learning_rate": 3.971836508784252e-05, + "loss": 0.0382, + "step": 66930 + }, + { + "epoch": 0.3297, + "grad_norm": 0.11414355039596558, + "learning_rate": 3.971502352840328e-05, + "loss": 0.0373, + "step": 66940 + }, + { + "epoch": 0.32975, + "grad_norm": 0.0910719633102417, + "learning_rate": 3.971168156666084e-05, + "loss": 0.0373, + "step": 66950 + }, + { + "epoch": 0.3298, + "grad_norm": 0.10548502206802368, + "learning_rate": 3.9708339202706545e-05, + "loss": 0.0413, + "step": 66960 + }, + { + "epoch": 0.32985, + "grad_norm": 0.10033611953258514, + "learning_rate": 3.970499643663178e-05, + "loss": 0.0366, + "step": 66970 + }, + { + "epoch": 0.3299, + "grad_norm": 0.10521698743104935, + "learning_rate": 3.970165326852794e-05, + "loss": 0.0374, + "step": 66980 + }, + { + "epoch": 0.32995, + "grad_norm": 0.08181363344192505, + "learning_rate": 3.969830969848642e-05, + "loss": 0.0377, + "step": 66990 + }, + { + "epoch": 0.33, + "grad_norm": 0.08763346076011658, + "learning_rate": 3.9694965726598634e-05, + "loss": 0.0365, + "step": 67000 + }, + { + "epoch": 0.33005, + "grad_norm": 0.07829241454601288, + "learning_rate": 3.969162135295601e-05, + "loss": 0.0365, + "step": 67010 + }, + { + "epoch": 0.3301, + "grad_norm": 0.09384607523679733, + "learning_rate": 3.968827657764997e-05, + "loss": 0.0351, + "step": 67020 + }, + { + "epoch": 0.33015, + "grad_norm": 0.09687677025794983, + "learning_rate": 3.9684931400771974e-05, + "loss": 0.0357, + "step": 67030 + }, + { + "epoch": 0.3302, + "grad_norm": 0.09295211732387543, + "learning_rate": 3.968158582241347e-05, + "loss": 0.0358, + "step": 67040 + }, + { + "epoch": 0.33025, + "grad_norm": 0.07670729607343674, + "learning_rate": 3.967823984266592e-05, + "loss": 0.0366, + "step": 67050 + }, + { + "epoch": 0.3303, + "grad_norm": 0.07735750824213028, + "learning_rate": 3.967489346162081e-05, + "loss": 0.0387, + "step": 67060 + }, + { + "epoch": 0.33035, + "grad_norm": 0.0834031030535698, + "learning_rate": 3.967154667936963e-05, + "loss": 0.036, + "step": 67070 + }, + { + "epoch": 0.3304, + "grad_norm": 0.08556082099676132, + "learning_rate": 3.966819949600387e-05, + "loss": 0.0362, + "step": 67080 + }, + { + "epoch": 0.33045, + "grad_norm": 0.07740224897861481, + "learning_rate": 3.9664851911615055e-05, + "loss": 0.0369, + "step": 67090 + }, + { + "epoch": 0.3305, + "grad_norm": 0.08293741941452026, + "learning_rate": 3.966150392629469e-05, + "loss": 0.0357, + "step": 67100 + }, + { + "epoch": 0.33055, + "grad_norm": 0.09485205262899399, + "learning_rate": 3.965815554013431e-05, + "loss": 0.0365, + "step": 67110 + }, + { + "epoch": 0.3306, + "grad_norm": 0.07694607973098755, + "learning_rate": 3.965480675322547e-05, + "loss": 0.0367, + "step": 67120 + }, + { + "epoch": 0.33065, + "grad_norm": 0.10260697454214096, + "learning_rate": 3.965145756565972e-05, + "loss": 0.0373, + "step": 67130 + }, + { + "epoch": 0.3307, + "grad_norm": 0.09534306079149246, + "learning_rate": 3.964810797752863e-05, + "loss": 0.0366, + "step": 67140 + }, + { + "epoch": 0.33075, + "grad_norm": 0.09196845442056656, + "learning_rate": 3.9644757988923766e-05, + "loss": 0.0364, + "step": 67150 + }, + { + "epoch": 0.3308, + "grad_norm": 0.08703939616680145, + "learning_rate": 3.9641407599936715e-05, + "loss": 0.0367, + "step": 67160 + }, + { + "epoch": 0.33085, + "grad_norm": 0.09351355582475662, + "learning_rate": 3.9638056810659085e-05, + "loss": 0.038, + "step": 67170 + }, + { + "epoch": 0.3309, + "grad_norm": 0.09776463359594345, + "learning_rate": 3.963470562118248e-05, + "loss": 0.0378, + "step": 67180 + }, + { + "epoch": 0.33095, + "grad_norm": 0.12001438438892365, + "learning_rate": 3.9631354031598526e-05, + "loss": 0.0382, + "step": 67190 + }, + { + "epoch": 0.331, + "grad_norm": 0.10254254937171936, + "learning_rate": 3.962800204199885e-05, + "loss": 0.0385, + "step": 67200 + }, + { + "epoch": 0.33105, + "grad_norm": 0.09995129704475403, + "learning_rate": 3.962464965247509e-05, + "loss": 0.0381, + "step": 67210 + }, + { + "epoch": 0.3311, + "grad_norm": 0.08643165230751038, + "learning_rate": 3.96212968631189e-05, + "loss": 0.0379, + "step": 67220 + }, + { + "epoch": 0.33115, + "grad_norm": 0.10039006918668747, + "learning_rate": 3.961794367402195e-05, + "loss": 0.0371, + "step": 67230 + }, + { + "epoch": 0.3312, + "grad_norm": 0.0936351865530014, + "learning_rate": 3.9614590085275914e-05, + "loss": 0.039, + "step": 67240 + }, + { + "epoch": 0.33125, + "grad_norm": 0.09664653241634369, + "learning_rate": 3.961123609697247e-05, + "loss": 0.0379, + "step": 67250 + }, + { + "epoch": 0.3313, + "grad_norm": 0.08333840221166611, + "learning_rate": 3.960788170920332e-05, + "loss": 0.0379, + "step": 67260 + }, + { + "epoch": 0.33135, + "grad_norm": 0.08787233382463455, + "learning_rate": 3.960452692206018e-05, + "loss": 0.0378, + "step": 67270 + }, + { + "epoch": 0.3314, + "grad_norm": 0.09200208634138107, + "learning_rate": 3.9601171735634756e-05, + "loss": 0.0382, + "step": 67280 + }, + { + "epoch": 0.33145, + "grad_norm": 0.08844379335641861, + "learning_rate": 3.959781615001878e-05, + "loss": 0.0381, + "step": 67290 + }, + { + "epoch": 0.3315, + "grad_norm": 0.09280557930469513, + "learning_rate": 3.9594460165303995e-05, + "loss": 0.0372, + "step": 67300 + }, + { + "epoch": 0.33155, + "grad_norm": 0.07958632707595825, + "learning_rate": 3.959110378158216e-05, + "loss": 0.0376, + "step": 67310 + }, + { + "epoch": 0.3316, + "grad_norm": 0.09166911244392395, + "learning_rate": 3.958774699894502e-05, + "loss": 0.0384, + "step": 67320 + }, + { + "epoch": 0.33165, + "grad_norm": 0.08712979406118393, + "learning_rate": 3.9584389817484355e-05, + "loss": 0.0369, + "step": 67330 + }, + { + "epoch": 0.3317, + "grad_norm": 0.09324368089437485, + "learning_rate": 3.958103223729196e-05, + "loss": 0.0378, + "step": 67340 + }, + { + "epoch": 0.33175, + "grad_norm": 0.07274219393730164, + "learning_rate": 3.9577674258459616e-05, + "loss": 0.0363, + "step": 67350 + }, + { + "epoch": 0.3318, + "grad_norm": 0.09377812594175339, + "learning_rate": 3.957431588107914e-05, + "loss": 0.0414, + "step": 67360 + }, + { + "epoch": 0.33185, + "grad_norm": 0.08584894984960556, + "learning_rate": 3.957095710524233e-05, + "loss": 0.0374, + "step": 67370 + }, + { + "epoch": 0.3319, + "grad_norm": 0.10434996336698532, + "learning_rate": 3.956759793104105e-05, + "loss": 0.0376, + "step": 67380 + }, + { + "epoch": 0.33195, + "grad_norm": 0.10280624777078629, + "learning_rate": 3.95642383585671e-05, + "loss": 0.0368, + "step": 67390 + }, + { + "epoch": 0.332, + "grad_norm": 0.08698102086782455, + "learning_rate": 3.956087838791235e-05, + "loss": 0.0376, + "step": 67400 + }, + { + "epoch": 0.33205, + "grad_norm": 0.0837353840470314, + "learning_rate": 3.9557518019168645e-05, + "loss": 0.0372, + "step": 67410 + }, + { + "epoch": 0.3321, + "grad_norm": 0.08556614071130753, + "learning_rate": 3.955415725242787e-05, + "loss": 0.0375, + "step": 67420 + }, + { + "epoch": 0.33215, + "grad_norm": 0.10648448020219803, + "learning_rate": 3.955079608778191e-05, + "loss": 0.0368, + "step": 67430 + }, + { + "epoch": 0.3322, + "grad_norm": 0.09454721957445145, + "learning_rate": 3.9547434525322644e-05, + "loss": 0.0374, + "step": 67440 + }, + { + "epoch": 0.33225, + "grad_norm": 0.09219378978013992, + "learning_rate": 3.954407256514199e-05, + "loss": 0.0364, + "step": 67450 + }, + { + "epoch": 0.3323, + "grad_norm": 0.08805567026138306, + "learning_rate": 3.954071020733185e-05, + "loss": 0.0372, + "step": 67460 + }, + { + "epoch": 0.33235, + "grad_norm": 0.08742700517177582, + "learning_rate": 3.953734745198416e-05, + "loss": 0.0369, + "step": 67470 + }, + { + "epoch": 0.3324, + "grad_norm": 0.12774163484573364, + "learning_rate": 3.953398429919085e-05, + "loss": 0.0384, + "step": 67480 + }, + { + "epoch": 0.33245, + "grad_norm": 0.10126247256994247, + "learning_rate": 3.953062074904388e-05, + "loss": 0.0364, + "step": 67490 + }, + { + "epoch": 0.3325, + "grad_norm": 0.0901947095990181, + "learning_rate": 3.952725680163518e-05, + "loss": 0.0371, + "step": 67500 + }, + { + "epoch": 0.33255, + "grad_norm": 0.12030135095119476, + "learning_rate": 3.952389245705674e-05, + "loss": 0.0381, + "step": 67510 + }, + { + "epoch": 0.3326, + "grad_norm": 0.10265327990055084, + "learning_rate": 3.952052771540055e-05, + "loss": 0.0374, + "step": 67520 + }, + { + "epoch": 0.33265, + "grad_norm": 0.08405707776546478, + "learning_rate": 3.951716257675858e-05, + "loss": 0.0364, + "step": 67530 + }, + { + "epoch": 0.3327, + "grad_norm": 0.10454193502664566, + "learning_rate": 3.951379704122283e-05, + "loss": 0.0404, + "step": 67540 + }, + { + "epoch": 0.33275, + "grad_norm": 0.14861930906772614, + "learning_rate": 3.951043110888533e-05, + "loss": 0.0378, + "step": 67550 + }, + { + "epoch": 0.3328, + "grad_norm": 0.10401389747858047, + "learning_rate": 3.9507064779838096e-05, + "loss": 0.0386, + "step": 67560 + }, + { + "epoch": 0.33285, + "grad_norm": 0.10092278569936752, + "learning_rate": 3.950369805417316e-05, + "loss": 0.0396, + "step": 67570 + }, + { + "epoch": 0.3329, + "grad_norm": 0.09623225033283234, + "learning_rate": 3.9500330931982567e-05, + "loss": 0.0381, + "step": 67580 + }, + { + "epoch": 0.33295, + "grad_norm": 0.10712605714797974, + "learning_rate": 3.949696341335838e-05, + "loss": 0.0381, + "step": 67590 + }, + { + "epoch": 0.333, + "grad_norm": 0.10331765562295914, + "learning_rate": 3.9493595498392645e-05, + "loss": 0.0383, + "step": 67600 + }, + { + "epoch": 0.33305, + "grad_norm": 0.09916549921035767, + "learning_rate": 3.949022718717747e-05, + "loss": 0.0371, + "step": 67610 + }, + { + "epoch": 0.3331, + "grad_norm": 0.08550255000591278, + "learning_rate": 3.948685847980491e-05, + "loss": 0.0376, + "step": 67620 + }, + { + "epoch": 0.33315, + "grad_norm": 0.08405419439077377, + "learning_rate": 3.94834893763671e-05, + "loss": 0.0385, + "step": 67630 + }, + { + "epoch": 0.3332, + "grad_norm": 0.12460258603096008, + "learning_rate": 3.948011987695612e-05, + "loss": 0.0383, + "step": 67640 + }, + { + "epoch": 0.33325, + "grad_norm": 0.09573324024677277, + "learning_rate": 3.9476749981664106e-05, + "loss": 0.037, + "step": 67650 + }, + { + "epoch": 0.3333, + "grad_norm": 0.08493708819150925, + "learning_rate": 3.947337969058319e-05, + "loss": 0.037, + "step": 67660 + }, + { + "epoch": 0.33335, + "grad_norm": 0.095456063747406, + "learning_rate": 3.94700090038055e-05, + "loss": 0.0375, + "step": 67670 + }, + { + "epoch": 0.3334, + "grad_norm": 0.08934903144836426, + "learning_rate": 3.946663792142321e-05, + "loss": 0.0372, + "step": 67680 + }, + { + "epoch": 0.33345, + "grad_norm": 0.10696164518594742, + "learning_rate": 3.9463266443528466e-05, + "loss": 0.0393, + "step": 67690 + }, + { + "epoch": 0.3335, + "grad_norm": 0.09020401537418365, + "learning_rate": 3.9459894570213454e-05, + "loss": 0.0374, + "step": 67700 + }, + { + "epoch": 0.33355, + "grad_norm": 0.07880588620901108, + "learning_rate": 3.9456522301570364e-05, + "loss": 0.0365, + "step": 67710 + }, + { + "epoch": 0.3336, + "grad_norm": 0.07263457030057907, + "learning_rate": 3.945314963769138e-05, + "loss": 0.0381, + "step": 67720 + }, + { + "epoch": 0.33365, + "grad_norm": 0.06893815100193024, + "learning_rate": 3.944977657866871e-05, + "loss": 0.0377, + "step": 67730 + }, + { + "epoch": 0.3337, + "grad_norm": 0.1003556102514267, + "learning_rate": 3.9446403124594586e-05, + "loss": 0.0391, + "step": 67740 + }, + { + "epoch": 0.33375, + "grad_norm": 0.0821806788444519, + "learning_rate": 3.944302927556122e-05, + "loss": 0.0379, + "step": 67750 + }, + { + "epoch": 0.3338, + "grad_norm": 0.09835135191679001, + "learning_rate": 3.943965503166086e-05, + "loss": 0.0372, + "step": 67760 + }, + { + "epoch": 0.33385, + "grad_norm": 0.0932500883936882, + "learning_rate": 3.943628039298576e-05, + "loss": 0.0384, + "step": 67770 + }, + { + "epoch": 0.3339, + "grad_norm": 0.09037882089614868, + "learning_rate": 3.943290535962818e-05, + "loss": 0.0379, + "step": 67780 + }, + { + "epoch": 0.33395, + "grad_norm": 0.07879551500082016, + "learning_rate": 3.9429529931680384e-05, + "loss": 0.0349, + "step": 67790 + }, + { + "epoch": 0.334, + "grad_norm": 0.08956615626811981, + "learning_rate": 3.9426154109234656e-05, + "loss": 0.0359, + "step": 67800 + }, + { + "epoch": 0.33405, + "grad_norm": 0.09419052302837372, + "learning_rate": 3.94227778923833e-05, + "loss": 0.0387, + "step": 67810 + }, + { + "epoch": 0.3341, + "grad_norm": 0.08166324347257614, + "learning_rate": 3.941940128121862e-05, + "loss": 0.0366, + "step": 67820 + }, + { + "epoch": 0.33415, + "grad_norm": 0.09600416570901871, + "learning_rate": 3.941602427583292e-05, + "loss": 0.0374, + "step": 67830 + }, + { + "epoch": 0.3342, + "grad_norm": 0.10011889785528183, + "learning_rate": 3.941264687631854e-05, + "loss": 0.0361, + "step": 67840 + }, + { + "epoch": 0.33425, + "grad_norm": 0.09820140898227692, + "learning_rate": 3.94092690827678e-05, + "loss": 0.0379, + "step": 67850 + }, + { + "epoch": 0.3343, + "grad_norm": 0.1091008335351944, + "learning_rate": 3.9405890895273053e-05, + "loss": 0.0373, + "step": 67860 + }, + { + "epoch": 0.33435, + "grad_norm": 0.11982117593288422, + "learning_rate": 3.9402512313926674e-05, + "loss": 0.039, + "step": 67870 + }, + { + "epoch": 0.3344, + "grad_norm": 0.11701437830924988, + "learning_rate": 3.9399133338821016e-05, + "loss": 0.0367, + "step": 67880 + }, + { + "epoch": 0.33445, + "grad_norm": 0.09619835019111633, + "learning_rate": 3.9395753970048455e-05, + "loss": 0.0369, + "step": 67890 + }, + { + "epoch": 0.3345, + "grad_norm": 0.13266895711421967, + "learning_rate": 3.939237420770139e-05, + "loss": 0.0379, + "step": 67900 + }, + { + "epoch": 0.33455, + "grad_norm": 0.10785192251205444, + "learning_rate": 3.938899405187223e-05, + "loss": 0.0395, + "step": 67910 + }, + { + "epoch": 0.3346, + "grad_norm": 0.09762826561927795, + "learning_rate": 3.938561350265336e-05, + "loss": 0.0383, + "step": 67920 + }, + { + "epoch": 0.33465, + "grad_norm": 0.09617773443460464, + "learning_rate": 3.938223256013724e-05, + "loss": 0.0367, + "step": 67930 + }, + { + "epoch": 0.3347, + "grad_norm": 0.11434216797351837, + "learning_rate": 3.937885122441628e-05, + "loss": 0.0371, + "step": 67940 + }, + { + "epoch": 0.33475, + "grad_norm": 0.09391623735427856, + "learning_rate": 3.937546949558293e-05, + "loss": 0.0374, + "step": 67950 + }, + { + "epoch": 0.3348, + "grad_norm": 0.10000456124544144, + "learning_rate": 3.937208737372964e-05, + "loss": 0.0362, + "step": 67960 + }, + { + "epoch": 0.33485, + "grad_norm": 0.10160036385059357, + "learning_rate": 3.936870485894888e-05, + "loss": 0.0375, + "step": 67970 + }, + { + "epoch": 0.3349, + "grad_norm": 0.09816861897706985, + "learning_rate": 3.9365321951333127e-05, + "loss": 0.0377, + "step": 67980 + }, + { + "epoch": 0.33495, + "grad_norm": 0.08815471082925797, + "learning_rate": 3.936193865097487e-05, + "loss": 0.0363, + "step": 67990 + }, + { + "epoch": 0.335, + "grad_norm": 0.19023734331130981, + "learning_rate": 3.935855495796661e-05, + "loss": 0.0384, + "step": 68000 + }, + { + "epoch": 0.33505, + "grad_norm": 0.13372373580932617, + "learning_rate": 3.935517087240085e-05, + "loss": 0.0392, + "step": 68010 + }, + { + "epoch": 0.3351, + "grad_norm": 0.10422195494174957, + "learning_rate": 3.9351786394370104e-05, + "loss": 0.0369, + "step": 68020 + }, + { + "epoch": 0.33515, + "grad_norm": 0.09163201600313187, + "learning_rate": 3.9348401523966924e-05, + "loss": 0.0365, + "step": 68030 + }, + { + "epoch": 0.3352, + "grad_norm": 0.10625466704368591, + "learning_rate": 3.934501626128383e-05, + "loss": 0.0375, + "step": 68040 + }, + { + "epoch": 0.33525, + "grad_norm": 0.11696978658437729, + "learning_rate": 3.934163060641337e-05, + "loss": 0.0379, + "step": 68050 + }, + { + "epoch": 0.3353, + "grad_norm": 0.09626693278551102, + "learning_rate": 3.933824455944813e-05, + "loss": 0.0376, + "step": 68060 + }, + { + "epoch": 0.33535, + "grad_norm": 0.07733645290136337, + "learning_rate": 3.9334858120480666e-05, + "loss": 0.038, + "step": 68070 + }, + { + "epoch": 0.3354, + "grad_norm": 0.09125781804323196, + "learning_rate": 3.9331471289603575e-05, + "loss": 0.0354, + "step": 68080 + }, + { + "epoch": 0.33545, + "grad_norm": 0.08020053058862686, + "learning_rate": 3.932808406690943e-05, + "loss": 0.0363, + "step": 68090 + }, + { + "epoch": 0.3355, + "grad_norm": 0.13144421577453613, + "learning_rate": 3.932469645249086e-05, + "loss": 0.0387, + "step": 68100 + }, + { + "epoch": 0.33555, + "grad_norm": 0.09514191746711731, + "learning_rate": 3.932130844644045e-05, + "loss": 0.0362, + "step": 68110 + }, + { + "epoch": 0.3356, + "grad_norm": 0.0922185406088829, + "learning_rate": 3.931792004885086e-05, + "loss": 0.0388, + "step": 68120 + }, + { + "epoch": 0.33565, + "grad_norm": 0.08498401194810867, + "learning_rate": 3.931453125981472e-05, + "loss": 0.0358, + "step": 68130 + }, + { + "epoch": 0.3357, + "grad_norm": 0.07472465932369232, + "learning_rate": 3.931114207942468e-05, + "loss": 0.0357, + "step": 68140 + }, + { + "epoch": 0.33575, + "grad_norm": 0.10260728746652603, + "learning_rate": 3.930775250777338e-05, + "loss": 0.0364, + "step": 68150 + }, + { + "epoch": 0.3358, + "grad_norm": 0.10084313899278641, + "learning_rate": 3.9304362544953506e-05, + "loss": 0.0363, + "step": 68160 + }, + { + "epoch": 0.33585, + "grad_norm": 0.09109058231115341, + "learning_rate": 3.9300972191057726e-05, + "loss": 0.0359, + "step": 68170 + }, + { + "epoch": 0.3359, + "grad_norm": 0.09064600616693497, + "learning_rate": 3.929758144617874e-05, + "loss": 0.037, + "step": 68180 + }, + { + "epoch": 0.33595, + "grad_norm": 0.10729323327541351, + "learning_rate": 3.9294190310409264e-05, + "loss": 0.0375, + "step": 68190 + }, + { + "epoch": 0.336, + "grad_norm": 0.10835836082696915, + "learning_rate": 3.929079878384198e-05, + "loss": 0.0367, + "step": 68200 + }, + { + "epoch": 0.33605, + "grad_norm": 0.09931907802820206, + "learning_rate": 3.928740686656963e-05, + "loss": 0.0358, + "step": 68210 + }, + { + "epoch": 0.3361, + "grad_norm": 0.10069522261619568, + "learning_rate": 3.9284014558684945e-05, + "loss": 0.0373, + "step": 68220 + }, + { + "epoch": 0.33615, + "grad_norm": 0.0890309140086174, + "learning_rate": 3.928062186028067e-05, + "loss": 0.0371, + "step": 68230 + }, + { + "epoch": 0.3362, + "grad_norm": 0.10961360484361649, + "learning_rate": 3.9277228771449555e-05, + "loss": 0.0358, + "step": 68240 + }, + { + "epoch": 0.33625, + "grad_norm": 0.08743534982204437, + "learning_rate": 3.9273835292284364e-05, + "loss": 0.0373, + "step": 68250 + }, + { + "epoch": 0.3363, + "grad_norm": 0.09339070320129395, + "learning_rate": 3.9270441422877894e-05, + "loss": 0.0404, + "step": 68260 + }, + { + "epoch": 0.33635, + "grad_norm": 0.11863364279270172, + "learning_rate": 3.92670471633229e-05, + "loss": 0.0376, + "step": 68270 + }, + { + "epoch": 0.3364, + "grad_norm": 0.10614442825317383, + "learning_rate": 3.9263652513712205e-05, + "loss": 0.0371, + "step": 68280 + }, + { + "epoch": 0.33645, + "grad_norm": 0.09804320335388184, + "learning_rate": 3.926025747413861e-05, + "loss": 0.038, + "step": 68290 + }, + { + "epoch": 0.3365, + "grad_norm": 0.08753801882266998, + "learning_rate": 3.925686204469492e-05, + "loss": 0.043, + "step": 68300 + }, + { + "epoch": 0.33655, + "grad_norm": 0.08489872515201569, + "learning_rate": 3.9253466225474e-05, + "loss": 0.0368, + "step": 68310 + }, + { + "epoch": 0.3366, + "grad_norm": 0.09966583549976349, + "learning_rate": 3.925007001656865e-05, + "loss": 0.0376, + "step": 68320 + }, + { + "epoch": 0.33665, + "grad_norm": 0.09853795915842056, + "learning_rate": 3.9246673418071743e-05, + "loss": 0.0392, + "step": 68330 + }, + { + "epoch": 0.3367, + "grad_norm": 0.08462931960821152, + "learning_rate": 3.9243276430076146e-05, + "loss": 0.0368, + "step": 68340 + }, + { + "epoch": 0.33675, + "grad_norm": 0.08958669006824493, + "learning_rate": 3.9239879052674715e-05, + "loss": 0.0361, + "step": 68350 + }, + { + "epoch": 0.3368, + "grad_norm": 0.11544979363679886, + "learning_rate": 3.9236481285960347e-05, + "loss": 0.037, + "step": 68360 + }, + { + "epoch": 0.33685, + "grad_norm": 0.08902397751808167, + "learning_rate": 3.9233083130025916e-05, + "loss": 0.0376, + "step": 68370 + }, + { + "epoch": 0.3369, + "grad_norm": 0.08821864426136017, + "learning_rate": 3.9229684584964346e-05, + "loss": 0.0359, + "step": 68380 + }, + { + "epoch": 0.33695, + "grad_norm": 0.07459808886051178, + "learning_rate": 3.9226285650868546e-05, + "loss": 0.0381, + "step": 68390 + }, + { + "epoch": 0.337, + "grad_norm": 0.09547564387321472, + "learning_rate": 3.9222886327831446e-05, + "loss": 0.0363, + "step": 68400 + }, + { + "epoch": 0.33705, + "grad_norm": 0.09121499210596085, + "learning_rate": 3.921948661594597e-05, + "loss": 0.0361, + "step": 68410 + }, + { + "epoch": 0.3371, + "grad_norm": 0.0961172953248024, + "learning_rate": 3.921608651530507e-05, + "loss": 0.0378, + "step": 68420 + }, + { + "epoch": 0.33715, + "grad_norm": 0.09326346218585968, + "learning_rate": 3.921268602600171e-05, + "loss": 0.038, + "step": 68430 + }, + { + "epoch": 0.3372, + "grad_norm": 0.09295550733804703, + "learning_rate": 3.9209285148128854e-05, + "loss": 0.0362, + "step": 68440 + }, + { + "epoch": 0.33725, + "grad_norm": 0.08030777424573898, + "learning_rate": 3.920588388177948e-05, + "loss": 0.035, + "step": 68450 + }, + { + "epoch": 0.3373, + "grad_norm": 0.08462897688150406, + "learning_rate": 3.920248222704658e-05, + "loss": 0.0356, + "step": 68460 + }, + { + "epoch": 0.33735, + "grad_norm": 0.08583926409482956, + "learning_rate": 3.919908018402314e-05, + "loss": 0.0374, + "step": 68470 + }, + { + "epoch": 0.3374, + "grad_norm": 0.10724081099033356, + "learning_rate": 3.91956777528022e-05, + "loss": 0.0371, + "step": 68480 + }, + { + "epoch": 0.33745, + "grad_norm": 0.09373513609170914, + "learning_rate": 3.919227493347675e-05, + "loss": 0.0356, + "step": 68490 + }, + { + "epoch": 0.3375, + "grad_norm": 0.08826597779989243, + "learning_rate": 3.918887172613983e-05, + "loss": 0.0376, + "step": 68500 + }, + { + "epoch": 0.33755, + "grad_norm": 0.10178831964731216, + "learning_rate": 3.91854681308845e-05, + "loss": 0.0359, + "step": 68510 + }, + { + "epoch": 0.3376, + "grad_norm": 0.08865686506032944, + "learning_rate": 3.91820641478038e-05, + "loss": 0.035, + "step": 68520 + }, + { + "epoch": 0.33765, + "grad_norm": 0.09183824807405472, + "learning_rate": 3.917865977699079e-05, + "loss": 0.0362, + "step": 68530 + }, + { + "epoch": 0.3377, + "grad_norm": 0.097674161195755, + "learning_rate": 3.917525501853855e-05, + "loss": 0.0375, + "step": 68540 + }, + { + "epoch": 0.33775, + "grad_norm": 0.10785385221242905, + "learning_rate": 3.917184987254016e-05, + "loss": 0.0371, + "step": 68550 + }, + { + "epoch": 0.3378, + "grad_norm": 0.09957551211118698, + "learning_rate": 3.916844433908872e-05, + "loss": 0.0366, + "step": 68560 + }, + { + "epoch": 0.33785, + "grad_norm": 0.09474816173315048, + "learning_rate": 3.916503841827733e-05, + "loss": 0.0357, + "step": 68570 + }, + { + "epoch": 0.3379, + "grad_norm": 0.0891641154885292, + "learning_rate": 3.916163211019912e-05, + "loss": 0.0366, + "step": 68580 + }, + { + "epoch": 0.33795, + "grad_norm": 0.08394962549209595, + "learning_rate": 3.9158225414947206e-05, + "loss": 0.037, + "step": 68590 + }, + { + "epoch": 0.338, + "grad_norm": 0.09305551648139954, + "learning_rate": 3.915481833261473e-05, + "loss": 0.0367, + "step": 68600 + }, + { + "epoch": 0.33805, + "grad_norm": 0.09584067761898041, + "learning_rate": 3.9151410863294835e-05, + "loss": 0.0374, + "step": 68610 + }, + { + "epoch": 0.3381, + "grad_norm": 0.09269727021455765, + "learning_rate": 3.914800300708068e-05, + "loss": 0.0369, + "step": 68620 + }, + { + "epoch": 0.33815, + "grad_norm": 0.09044751524925232, + "learning_rate": 3.914459476406545e-05, + "loss": 0.0388, + "step": 68630 + }, + { + "epoch": 0.3382, + "grad_norm": 0.09777285903692245, + "learning_rate": 3.91411861343423e-05, + "loss": 0.0366, + "step": 68640 + }, + { + "epoch": 0.33825, + "grad_norm": 0.10088469088077545, + "learning_rate": 3.913777711800444e-05, + "loss": 0.0376, + "step": 68650 + }, + { + "epoch": 0.3383, + "grad_norm": 0.08271917700767517, + "learning_rate": 3.9134367715145065e-05, + "loss": 0.0379, + "step": 68660 + }, + { + "epoch": 0.33835, + "grad_norm": 0.07132899016141891, + "learning_rate": 3.9130957925857384e-05, + "loss": 0.0379, + "step": 68670 + }, + { + "epoch": 0.3384, + "grad_norm": 0.09053319692611694, + "learning_rate": 3.912754775023463e-05, + "loss": 0.0374, + "step": 68680 + }, + { + "epoch": 0.33845, + "grad_norm": 0.10277561098337173, + "learning_rate": 3.912413718837001e-05, + "loss": 0.0368, + "step": 68690 + }, + { + "epoch": 0.3385, + "grad_norm": 0.0785021260380745, + "learning_rate": 3.9120726240356804e-05, + "loss": 0.0374, + "step": 68700 + }, + { + "epoch": 0.33855, + "grad_norm": 0.08113619685173035, + "learning_rate": 3.911731490628824e-05, + "loss": 0.0374, + "step": 68710 + }, + { + "epoch": 0.3386, + "grad_norm": 0.11385636776685715, + "learning_rate": 3.911390318625759e-05, + "loss": 0.0371, + "step": 68720 + }, + { + "epoch": 0.33865, + "grad_norm": 0.08692467212677002, + "learning_rate": 3.911049108035813e-05, + "loss": 0.037, + "step": 68730 + }, + { + "epoch": 0.3387, + "grad_norm": 0.08841327577829361, + "learning_rate": 3.9107078588683145e-05, + "loss": 0.0371, + "step": 68740 + }, + { + "epoch": 0.33875, + "grad_norm": 0.09725911170244217, + "learning_rate": 3.910366571132593e-05, + "loss": 0.0364, + "step": 68750 + }, + { + "epoch": 0.3388, + "grad_norm": 0.0884573757648468, + "learning_rate": 3.9100252448379795e-05, + "loss": 0.0368, + "step": 68760 + }, + { + "epoch": 0.33885, + "grad_norm": 0.08143701404333115, + "learning_rate": 3.909683879993805e-05, + "loss": 0.0359, + "step": 68770 + }, + { + "epoch": 0.3389, + "grad_norm": 0.10178311169147491, + "learning_rate": 3.9093424766094036e-05, + "loss": 0.0373, + "step": 68780 + }, + { + "epoch": 0.33895, + "grad_norm": 0.09373073279857635, + "learning_rate": 3.909001034694108e-05, + "loss": 0.0364, + "step": 68790 + }, + { + "epoch": 0.339, + "grad_norm": 0.0956706777215004, + "learning_rate": 3.908659554257254e-05, + "loss": 0.0367, + "step": 68800 + }, + { + "epoch": 0.33905, + "grad_norm": 0.1166117936372757, + "learning_rate": 3.908318035308176e-05, + "loss": 0.0359, + "step": 68810 + }, + { + "epoch": 0.3391, + "grad_norm": 0.09963870793581009, + "learning_rate": 3.9079764778562124e-05, + "loss": 0.0367, + "step": 68820 + }, + { + "epoch": 0.33915, + "grad_norm": 0.08719000965356827, + "learning_rate": 3.9076348819107005e-05, + "loss": 0.036, + "step": 68830 + }, + { + "epoch": 0.3392, + "grad_norm": 0.08710069954395294, + "learning_rate": 3.9072932474809805e-05, + "loss": 0.0376, + "step": 68840 + }, + { + "epoch": 0.33925, + "grad_norm": 0.09574553370475769, + "learning_rate": 3.9069515745763914e-05, + "loss": 0.0368, + "step": 68850 + }, + { + "epoch": 0.3393, + "grad_norm": 0.08620046824216843, + "learning_rate": 3.9066098632062744e-05, + "loss": 0.0376, + "step": 68860 + }, + { + "epoch": 0.33935, + "grad_norm": 0.10227379202842712, + "learning_rate": 3.9062681133799726e-05, + "loss": 0.0368, + "step": 68870 + }, + { + "epoch": 0.3394, + "grad_norm": 0.08720734715461731, + "learning_rate": 3.905926325106829e-05, + "loss": 0.0381, + "step": 68880 + }, + { + "epoch": 0.33945, + "grad_norm": 0.10980436950922012, + "learning_rate": 3.905584498396188e-05, + "loss": 0.0366, + "step": 68890 + }, + { + "epoch": 0.3395, + "grad_norm": 0.09383834153413773, + "learning_rate": 3.9052426332573945e-05, + "loss": 0.0375, + "step": 68900 + }, + { + "epoch": 0.33955, + "grad_norm": 0.08845657110214233, + "learning_rate": 3.904900729699796e-05, + "loss": 0.0368, + "step": 68910 + }, + { + "epoch": 0.3396, + "grad_norm": 0.08985838294029236, + "learning_rate": 3.904558787732738e-05, + "loss": 0.0378, + "step": 68920 + }, + { + "epoch": 0.33965, + "grad_norm": 0.07143373042345047, + "learning_rate": 3.904216807365572e-05, + "loss": 0.0378, + "step": 68930 + }, + { + "epoch": 0.3397, + "grad_norm": 0.07896678149700165, + "learning_rate": 3.903874788607645e-05, + "loss": 0.0354, + "step": 68940 + }, + { + "epoch": 0.33975, + "grad_norm": 0.10684600472450256, + "learning_rate": 3.903532731468309e-05, + "loss": 0.038, + "step": 68950 + }, + { + "epoch": 0.3398, + "grad_norm": 0.08521618694067001, + "learning_rate": 3.903190635956915e-05, + "loss": 0.0366, + "step": 68960 + }, + { + "epoch": 0.33985, + "grad_norm": 0.09305830299854279, + "learning_rate": 3.902848502082817e-05, + "loss": 0.0378, + "step": 68970 + }, + { + "epoch": 0.3399, + "grad_norm": 0.09355901181697845, + "learning_rate": 3.902506329855367e-05, + "loss": 0.0361, + "step": 68980 + }, + { + "epoch": 0.33995, + "grad_norm": 0.09021936357021332, + "learning_rate": 3.902164119283922e-05, + "loss": 0.0343, + "step": 68990 + }, + { + "epoch": 0.34, + "grad_norm": 0.10385162383317947, + "learning_rate": 3.901821870377836e-05, + "loss": 0.0389, + "step": 69000 + }, + { + "epoch": 0.34005, + "grad_norm": 0.09473024308681488, + "learning_rate": 3.901479583146466e-05, + "loss": 0.0361, + "step": 69010 + }, + { + "epoch": 0.3401, + "grad_norm": 0.10816251486539841, + "learning_rate": 3.9011372575991715e-05, + "loss": 0.0366, + "step": 69020 + }, + { + "epoch": 0.34015, + "grad_norm": 0.09471640735864639, + "learning_rate": 3.900794893745311e-05, + "loss": 0.0344, + "step": 69030 + }, + { + "epoch": 0.3402, + "grad_norm": 0.09427373856306076, + "learning_rate": 3.900452491594244e-05, + "loss": 0.0369, + "step": 69040 + }, + { + "epoch": 0.34025, + "grad_norm": 0.08891633152961731, + "learning_rate": 3.9001100511553326e-05, + "loss": 0.0368, + "step": 69050 + }, + { + "epoch": 0.3403, + "grad_norm": 0.10900906473398209, + "learning_rate": 3.899767572437938e-05, + "loss": 0.0385, + "step": 69060 + }, + { + "epoch": 0.34035, + "grad_norm": 0.11046279221773148, + "learning_rate": 3.8994250554514236e-05, + "loss": 0.0401, + "step": 69070 + }, + { + "epoch": 0.3404, + "grad_norm": 0.08577143400907516, + "learning_rate": 3.899082500205154e-05, + "loss": 0.0383, + "step": 69080 + }, + { + "epoch": 0.34045, + "grad_norm": 0.09973526746034622, + "learning_rate": 3.8987399067084944e-05, + "loss": 0.0365, + "step": 69090 + }, + { + "epoch": 0.3405, + "grad_norm": 0.09257543087005615, + "learning_rate": 3.898397274970811e-05, + "loss": 0.0365, + "step": 69100 + }, + { + "epoch": 0.34055, + "grad_norm": 0.07047468423843384, + "learning_rate": 3.8980546050014724e-05, + "loss": 0.0364, + "step": 69110 + }, + { + "epoch": 0.3406, + "grad_norm": 0.09412377327680588, + "learning_rate": 3.897711896809846e-05, + "loss": 0.0352, + "step": 69120 + }, + { + "epoch": 0.34065, + "grad_norm": 0.08309981226921082, + "learning_rate": 3.8973691504053e-05, + "loss": 0.0382, + "step": 69130 + }, + { + "epoch": 0.3407, + "grad_norm": 0.09098193794488907, + "learning_rate": 3.897026365797208e-05, + "loss": 0.0357, + "step": 69140 + }, + { + "epoch": 0.34075, + "grad_norm": 0.08813491463661194, + "learning_rate": 3.896683542994939e-05, + "loss": 0.039, + "step": 69150 + }, + { + "epoch": 0.3408, + "grad_norm": 0.09963098168373108, + "learning_rate": 3.8963406820078675e-05, + "loss": 0.0391, + "step": 69160 + }, + { + "epoch": 0.34085, + "grad_norm": 0.09404154866933823, + "learning_rate": 3.8959977828453656e-05, + "loss": 0.0375, + "step": 69170 + }, + { + "epoch": 0.3409, + "grad_norm": 0.08720036596059799, + "learning_rate": 3.895654845516809e-05, + "loss": 0.0373, + "step": 69180 + }, + { + "epoch": 0.34095, + "grad_norm": 0.10500133782625198, + "learning_rate": 3.8953118700315735e-05, + "loss": 0.0371, + "step": 69190 + }, + { + "epoch": 0.341, + "grad_norm": 0.09468196332454681, + "learning_rate": 3.894968856399035e-05, + "loss": 0.0374, + "step": 69200 + }, + { + "epoch": 0.34105, + "grad_norm": 0.09113126993179321, + "learning_rate": 3.8946258046285724e-05, + "loss": 0.0372, + "step": 69210 + }, + { + "epoch": 0.3411, + "grad_norm": 0.11324943602085114, + "learning_rate": 3.8942827147295645e-05, + "loss": 0.0369, + "step": 69220 + }, + { + "epoch": 0.34115, + "grad_norm": 0.11532092839479446, + "learning_rate": 3.893939586711391e-05, + "loss": 0.0366, + "step": 69230 + }, + { + "epoch": 0.3412, + "grad_norm": 0.10300014168024063, + "learning_rate": 3.893596420583433e-05, + "loss": 0.0375, + "step": 69240 + }, + { + "epoch": 0.34125, + "grad_norm": 0.09133999794721603, + "learning_rate": 3.893253216355072e-05, + "loss": 0.0376, + "step": 69250 + }, + { + "epoch": 0.3413, + "grad_norm": 0.0787564367055893, + "learning_rate": 3.892909974035691e-05, + "loss": 0.038, + "step": 69260 + }, + { + "epoch": 0.34135, + "grad_norm": 0.08994283527135849, + "learning_rate": 3.892566693634675e-05, + "loss": 0.0365, + "step": 69270 + }, + { + "epoch": 0.3414, + "grad_norm": 0.09112702310085297, + "learning_rate": 3.892223375161409e-05, + "loss": 0.0357, + "step": 69280 + }, + { + "epoch": 0.34145, + "grad_norm": 0.0852375328540802, + "learning_rate": 3.891880018625279e-05, + "loss": 0.0412, + "step": 69290 + }, + { + "epoch": 0.3415, + "grad_norm": 0.09685932844877243, + "learning_rate": 3.891536624035672e-05, + "loss": 0.0396, + "step": 69300 + }, + { + "epoch": 0.34155, + "grad_norm": 0.10497764497995377, + "learning_rate": 3.891193191401977e-05, + "loss": 0.0352, + "step": 69310 + }, + { + "epoch": 0.3416, + "grad_norm": 0.09369406849145889, + "learning_rate": 3.890849720733582e-05, + "loss": 0.0364, + "step": 69320 + }, + { + "epoch": 0.34165, + "grad_norm": 0.10982207208871841, + "learning_rate": 3.8905062120398785e-05, + "loss": 0.0363, + "step": 69330 + }, + { + "epoch": 0.3417, + "grad_norm": 0.10184473544359207, + "learning_rate": 3.890162665330258e-05, + "loss": 0.0367, + "step": 69340 + }, + { + "epoch": 0.34175, + "grad_norm": 0.09807814657688141, + "learning_rate": 3.889819080614112e-05, + "loss": 0.0376, + "step": 69350 + }, + { + "epoch": 0.3418, + "grad_norm": 0.09532096236944199, + "learning_rate": 3.8894754579008344e-05, + "loss": 0.039, + "step": 69360 + }, + { + "epoch": 0.34185, + "grad_norm": 0.08966340869665146, + "learning_rate": 3.8891317971998196e-05, + "loss": 0.0368, + "step": 69370 + }, + { + "epoch": 0.3419, + "grad_norm": 0.07751365005970001, + "learning_rate": 3.888788098520464e-05, + "loss": 0.0365, + "step": 69380 + }, + { + "epoch": 0.34195, + "grad_norm": 0.08647938817739487, + "learning_rate": 3.8884443618721634e-05, + "loss": 0.0365, + "step": 69390 + }, + { + "epoch": 0.342, + "grad_norm": 0.09409962594509125, + "learning_rate": 3.888100587264315e-05, + "loss": 0.0366, + "step": 69400 + }, + { + "epoch": 0.34205, + "grad_norm": 0.08813067525625229, + "learning_rate": 3.887756774706318e-05, + "loss": 0.0357, + "step": 69410 + }, + { + "epoch": 0.3421, + "grad_norm": 0.07670162618160248, + "learning_rate": 3.887412924207573e-05, + "loss": 0.0362, + "step": 69420 + }, + { + "epoch": 0.34215, + "grad_norm": 0.07312174886465073, + "learning_rate": 3.887069035777479e-05, + "loss": 0.0368, + "step": 69430 + }, + { + "epoch": 0.3422, + "grad_norm": 0.09502144902944565, + "learning_rate": 3.886725109425439e-05, + "loss": 0.0368, + "step": 69440 + }, + { + "epoch": 0.34225, + "grad_norm": 0.08605118095874786, + "learning_rate": 3.8863811451608554e-05, + "loss": 0.037, + "step": 69450 + }, + { + "epoch": 0.3423, + "grad_norm": 0.08746170252561569, + "learning_rate": 3.886037142993132e-05, + "loss": 0.0372, + "step": 69460 + }, + { + "epoch": 0.34235, + "grad_norm": 0.08239756524562836, + "learning_rate": 3.885693102931675e-05, + "loss": 0.0367, + "step": 69470 + }, + { + "epoch": 0.3424, + "grad_norm": 0.08249850571155548, + "learning_rate": 3.885349024985888e-05, + "loss": 0.0363, + "step": 69480 + }, + { + "epoch": 0.34245, + "grad_norm": 0.11191985756158829, + "learning_rate": 3.8850049091651794e-05, + "loss": 0.0378, + "step": 69490 + }, + { + "epoch": 0.3425, + "grad_norm": 0.11292050033807755, + "learning_rate": 3.8846607554789566e-05, + "loss": 0.0406, + "step": 69500 + }, + { + "epoch": 0.34255, + "grad_norm": 0.0940447673201561, + "learning_rate": 3.8843165639366285e-05, + "loss": 0.0382, + "step": 69510 + }, + { + "epoch": 0.3426, + "grad_norm": 0.1097087562084198, + "learning_rate": 3.8839723345476065e-05, + "loss": 0.0376, + "step": 69520 + }, + { + "epoch": 0.34265, + "grad_norm": 0.12192397564649582, + "learning_rate": 3.883628067321301e-05, + "loss": 0.0388, + "step": 69530 + }, + { + "epoch": 0.3427, + "grad_norm": 0.10179516673088074, + "learning_rate": 3.883283762267124e-05, + "loss": 0.0385, + "step": 69540 + }, + { + "epoch": 0.34275, + "grad_norm": 0.10461835563182831, + "learning_rate": 3.882939419394488e-05, + "loss": 0.0385, + "step": 69550 + }, + { + "epoch": 0.3428, + "grad_norm": 0.10870037972927094, + "learning_rate": 3.8825950387128074e-05, + "loss": 0.0364, + "step": 69560 + }, + { + "epoch": 0.34285, + "grad_norm": 0.09890930354595184, + "learning_rate": 3.882250620231499e-05, + "loss": 0.0381, + "step": 69570 + }, + { + "epoch": 0.3429, + "grad_norm": 0.09251926839351654, + "learning_rate": 3.8819061639599765e-05, + "loss": 0.038, + "step": 69580 + }, + { + "epoch": 0.34295, + "grad_norm": 0.09087437391281128, + "learning_rate": 3.88156166990766e-05, + "loss": 0.0368, + "step": 69590 + }, + { + "epoch": 0.343, + "grad_norm": 0.09862448275089264, + "learning_rate": 3.8812171380839655e-05, + "loss": 0.0373, + "step": 69600 + }, + { + "epoch": 0.34305, + "grad_norm": 0.08722679316997528, + "learning_rate": 3.880872568498314e-05, + "loss": 0.0367, + "step": 69610 + }, + { + "epoch": 0.3431, + "grad_norm": 0.07909327000379562, + "learning_rate": 3.880527961160125e-05, + "loss": 0.0375, + "step": 69620 + }, + { + "epoch": 0.34315, + "grad_norm": 0.07764890044927597, + "learning_rate": 3.88018331607882e-05, + "loss": 0.0366, + "step": 69630 + }, + { + "epoch": 0.3432, + "grad_norm": 0.0877370536327362, + "learning_rate": 3.879838633263822e-05, + "loss": 0.0382, + "step": 69640 + }, + { + "epoch": 0.34325, + "grad_norm": 0.09489338099956512, + "learning_rate": 3.879493912724554e-05, + "loss": 0.0365, + "step": 69650 + }, + { + "epoch": 0.3433, + "grad_norm": 0.12813325226306915, + "learning_rate": 3.87914915447044e-05, + "loss": 0.0366, + "step": 69660 + }, + { + "epoch": 0.34335, + "grad_norm": 0.10898727178573608, + "learning_rate": 3.878804358510908e-05, + "loss": 0.0368, + "step": 69670 + }, + { + "epoch": 0.3434, + "grad_norm": 0.08357562869787216, + "learning_rate": 3.878459524855381e-05, + "loss": 0.0375, + "step": 69680 + }, + { + "epoch": 0.34345, + "grad_norm": 0.10898575931787491, + "learning_rate": 3.87811465351329e-05, + "loss": 0.0364, + "step": 69690 + }, + { + "epoch": 0.3435, + "grad_norm": 0.10167691111564636, + "learning_rate": 3.877769744494061e-05, + "loss": 0.0369, + "step": 69700 + }, + { + "epoch": 0.34355, + "grad_norm": 0.1086648479104042, + "learning_rate": 3.877424797807125e-05, + "loss": 0.0363, + "step": 69710 + }, + { + "epoch": 0.3436, + "grad_norm": 0.09656655043363571, + "learning_rate": 3.877079813461912e-05, + "loss": 0.0359, + "step": 69720 + }, + { + "epoch": 0.34365, + "grad_norm": 0.1367729753255844, + "learning_rate": 3.8767347914678556e-05, + "loss": 0.0363, + "step": 69730 + }, + { + "epoch": 0.3437, + "grad_norm": 0.11441062390804291, + "learning_rate": 3.8763897318343864e-05, + "loss": 0.0387, + "step": 69740 + }, + { + "epoch": 0.34375, + "grad_norm": 0.09865527600049973, + "learning_rate": 3.876044634570939e-05, + "loss": 0.039, + "step": 69750 + }, + { + "epoch": 0.3438, + "grad_norm": 0.09027211368083954, + "learning_rate": 3.875699499686949e-05, + "loss": 0.0348, + "step": 69760 + }, + { + "epoch": 0.34385, + "grad_norm": 0.08302388340234756, + "learning_rate": 3.87535432719185e-05, + "loss": 0.0364, + "step": 69770 + }, + { + "epoch": 0.3439, + "grad_norm": 0.12716825306415558, + "learning_rate": 3.8750091170950814e-05, + "loss": 0.0383, + "step": 69780 + }, + { + "epoch": 0.34395, + "grad_norm": 0.10267481207847595, + "learning_rate": 3.87466386940608e-05, + "loss": 0.0358, + "step": 69790 + }, + { + "epoch": 0.344, + "grad_norm": 0.10430262982845306, + "learning_rate": 3.874318584134285e-05, + "loss": 0.0368, + "step": 69800 + }, + { + "epoch": 0.34405, + "grad_norm": 0.10200727730989456, + "learning_rate": 3.873973261289136e-05, + "loss": 0.0379, + "step": 69810 + }, + { + "epoch": 0.3441, + "grad_norm": 0.09183640778064728, + "learning_rate": 3.8736279008800747e-05, + "loss": 0.038, + "step": 69820 + }, + { + "epoch": 0.34415, + "grad_norm": 0.08767364174127579, + "learning_rate": 3.8732825029165416e-05, + "loss": 0.0364, + "step": 69830 + }, + { + "epoch": 0.3442, + "grad_norm": 0.10637889057397842, + "learning_rate": 3.872937067407981e-05, + "loss": 0.0363, + "step": 69840 + }, + { + "epoch": 0.34425, + "grad_norm": 0.11616005748510361, + "learning_rate": 3.872591594363837e-05, + "loss": 0.0379, + "step": 69850 + }, + { + "epoch": 0.3443, + "grad_norm": 0.09851089864969254, + "learning_rate": 3.872246083793555e-05, + "loss": 0.0364, + "step": 69860 + }, + { + "epoch": 0.34435, + "grad_norm": 0.0919380709528923, + "learning_rate": 3.8719005357065804e-05, + "loss": 0.0351, + "step": 69870 + }, + { + "epoch": 0.3444, + "grad_norm": 0.0871756300330162, + "learning_rate": 3.8715549501123604e-05, + "loss": 0.0393, + "step": 69880 + }, + { + "epoch": 0.34445, + "grad_norm": 0.10097227245569229, + "learning_rate": 3.871209327020343e-05, + "loss": 0.0365, + "step": 69890 + }, + { + "epoch": 0.3445, + "grad_norm": 0.1065862774848938, + "learning_rate": 3.870863666439978e-05, + "loss": 0.0379, + "step": 69900 + }, + { + "epoch": 0.34455, + "grad_norm": 0.08766256272792816, + "learning_rate": 3.870517968380715e-05, + "loss": 0.0378, + "step": 69910 + }, + { + "epoch": 0.3446, + "grad_norm": 0.07498373836278915, + "learning_rate": 3.8701722328520064e-05, + "loss": 0.0366, + "step": 69920 + }, + { + "epoch": 0.34465, + "grad_norm": 0.0901317447423935, + "learning_rate": 3.869826459863303e-05, + "loss": 0.0373, + "step": 69930 + }, + { + "epoch": 0.3447, + "grad_norm": 0.09419835358858109, + "learning_rate": 3.86948064942406e-05, + "loss": 0.0386, + "step": 69940 + }, + { + "epoch": 0.34475, + "grad_norm": 0.0863010361790657, + "learning_rate": 3.869134801543729e-05, + "loss": 0.0373, + "step": 69950 + }, + { + "epoch": 0.3448, + "grad_norm": 0.08560824394226074, + "learning_rate": 3.868788916231767e-05, + "loss": 0.0379, + "step": 69960 + }, + { + "epoch": 0.34485, + "grad_norm": 0.09437470883131027, + "learning_rate": 3.868442993497631e-05, + "loss": 0.0368, + "step": 69970 + }, + { + "epoch": 0.3449, + "grad_norm": 0.10649093985557556, + "learning_rate": 3.8680970333507774e-05, + "loss": 0.0392, + "step": 69980 + }, + { + "epoch": 0.34495, + "grad_norm": 0.10162971168756485, + "learning_rate": 3.867751035800665e-05, + "loss": 0.0369, + "step": 69990 + }, + { + "epoch": 0.345, + "grad_norm": 0.09044211357831955, + "learning_rate": 3.8674050008567534e-05, + "loss": 0.0373, + "step": 70000 + }, + { + "epoch": 0.34505, + "grad_norm": 0.10452600568532944, + "learning_rate": 3.8670589285285025e-05, + "loss": 0.0367, + "step": 70010 + }, + { + "epoch": 0.3451, + "grad_norm": 0.09597063809633255, + "learning_rate": 3.8667128188253734e-05, + "loss": 0.037, + "step": 70020 + }, + { + "epoch": 0.34515, + "grad_norm": 0.08981821686029434, + "learning_rate": 3.8663666717568306e-05, + "loss": 0.0365, + "step": 70030 + }, + { + "epoch": 0.3452, + "grad_norm": 0.07237569987773895, + "learning_rate": 3.8660204873323356e-05, + "loss": 0.0354, + "step": 70040 + }, + { + "epoch": 0.34525, + "grad_norm": 0.08156467229127884, + "learning_rate": 3.865674265561353e-05, + "loss": 0.0349, + "step": 70050 + }, + { + "epoch": 0.3453, + "grad_norm": 0.08877598494291306, + "learning_rate": 3.8653280064533506e-05, + "loss": 0.036, + "step": 70060 + }, + { + "epoch": 0.34535, + "grad_norm": 0.08256573975086212, + "learning_rate": 3.864981710017792e-05, + "loss": 0.0354, + "step": 70070 + }, + { + "epoch": 0.3454, + "grad_norm": 0.07784508168697357, + "learning_rate": 3.864635376264148e-05, + "loss": 0.0367, + "step": 70080 + }, + { + "epoch": 0.34545, + "grad_norm": 0.08144626766443253, + "learning_rate": 3.864289005201883e-05, + "loss": 0.0372, + "step": 70090 + }, + { + "epoch": 0.3455, + "grad_norm": 0.08631348609924316, + "learning_rate": 3.863942596840471e-05, + "loss": 0.0363, + "step": 70100 + }, + { + "epoch": 0.34555, + "grad_norm": 0.09809038043022156, + "learning_rate": 3.8635961511893805e-05, + "loss": 0.0366, + "step": 70110 + }, + { + "epoch": 0.3456, + "grad_norm": 0.07484088093042374, + "learning_rate": 3.8632496682580825e-05, + "loss": 0.0373, + "step": 70120 + }, + { + "epoch": 0.34565, + "grad_norm": 0.08303824067115784, + "learning_rate": 3.862903148056052e-05, + "loss": 0.0364, + "step": 70130 + }, + { + "epoch": 0.3457, + "grad_norm": 0.07585174590349197, + "learning_rate": 3.8625565905927605e-05, + "loss": 0.0377, + "step": 70140 + }, + { + "epoch": 0.34575, + "grad_norm": 0.10004965960979462, + "learning_rate": 3.8622099958776835e-05, + "loss": 0.036, + "step": 70150 + }, + { + "epoch": 0.3458, + "grad_norm": 0.09887797385454178, + "learning_rate": 3.861863363920298e-05, + "loss": 0.0363, + "step": 70160 + }, + { + "epoch": 0.34585, + "grad_norm": 0.09083712846040726, + "learning_rate": 3.8615166947300794e-05, + "loss": 0.0375, + "step": 70170 + }, + { + "epoch": 0.3459, + "grad_norm": 0.09992069005966187, + "learning_rate": 3.861169988316506e-05, + "loss": 0.0369, + "step": 70180 + }, + { + "epoch": 0.34595, + "grad_norm": 0.1044621393084526, + "learning_rate": 3.860823244689056e-05, + "loss": 0.0387, + "step": 70190 + }, + { + "epoch": 0.346, + "grad_norm": 0.10451257228851318, + "learning_rate": 3.86047646385721e-05, + "loss": 0.0379, + "step": 70200 + }, + { + "epoch": 0.34605, + "grad_norm": 0.1116936206817627, + "learning_rate": 3.860129645830449e-05, + "loss": 0.0395, + "step": 70210 + }, + { + "epoch": 0.3461, + "grad_norm": 0.10671953111886978, + "learning_rate": 3.859782790618254e-05, + "loss": 0.0371, + "step": 70220 + }, + { + "epoch": 0.34615, + "grad_norm": 0.09391382336616516, + "learning_rate": 3.859435898230108e-05, + "loss": 0.0366, + "step": 70230 + }, + { + "epoch": 0.3462, + "grad_norm": 0.10093817859888077, + "learning_rate": 3.859088968675496e-05, + "loss": 0.0382, + "step": 70240 + }, + { + "epoch": 0.34625, + "grad_norm": 0.09761158376932144, + "learning_rate": 3.858742001963902e-05, + "loss": 0.0376, + "step": 70250 + }, + { + "epoch": 0.3463, + "grad_norm": 0.08548399806022644, + "learning_rate": 3.858394998104812e-05, + "loss": 0.0368, + "step": 70260 + }, + { + "epoch": 0.34635, + "grad_norm": 0.12837553024291992, + "learning_rate": 3.858047957107713e-05, + "loss": 0.039, + "step": 70270 + }, + { + "epoch": 0.3464, + "grad_norm": 0.09966098517179489, + "learning_rate": 3.857700878982092e-05, + "loss": 0.0386, + "step": 70280 + }, + { + "epoch": 0.34645, + "grad_norm": 0.09091837704181671, + "learning_rate": 3.857353763737441e-05, + "loss": 0.0366, + "step": 70290 + }, + { + "epoch": 0.3465, + "grad_norm": 0.0735870823264122, + "learning_rate": 3.857006611383247e-05, + "loss": 0.0369, + "step": 70300 + }, + { + "epoch": 0.34655, + "grad_norm": 0.08942056447267532, + "learning_rate": 3.856659421929003e-05, + "loss": 0.0364, + "step": 70310 + }, + { + "epoch": 0.3466, + "grad_norm": 0.09885366261005402, + "learning_rate": 3.856312195384199e-05, + "loss": 0.0368, + "step": 70320 + }, + { + "epoch": 0.34665, + "grad_norm": 0.09341978281736374, + "learning_rate": 3.855964931758329e-05, + "loss": 0.0366, + "step": 70330 + }, + { + "epoch": 0.3467, + "grad_norm": 0.09163305908441544, + "learning_rate": 3.855617631060887e-05, + "loss": 0.0377, + "step": 70340 + }, + { + "epoch": 0.34675, + "grad_norm": 0.12091311812400818, + "learning_rate": 3.8552702933013696e-05, + "loss": 0.0361, + "step": 70350 + }, + { + "epoch": 0.3468, + "grad_norm": 0.07424789667129517, + "learning_rate": 3.854922918489271e-05, + "loss": 0.036, + "step": 70360 + }, + { + "epoch": 0.34685, + "grad_norm": 0.08816181868314743, + "learning_rate": 3.8545755066340884e-05, + "loss": 0.036, + "step": 70370 + }, + { + "epoch": 0.3469, + "grad_norm": 0.09558789432048798, + "learning_rate": 3.85422805774532e-05, + "loss": 0.0361, + "step": 70380 + }, + { + "epoch": 0.34695, + "grad_norm": 0.09851736575365067, + "learning_rate": 3.853880571832466e-05, + "loss": 0.0399, + "step": 70390 + }, + { + "epoch": 0.347, + "grad_norm": 0.09676993638277054, + "learning_rate": 3.853533048905026e-05, + "loss": 0.0374, + "step": 70400 + }, + { + "epoch": 0.34705, + "grad_norm": 0.1089349016547203, + "learning_rate": 3.8531854889725e-05, + "loss": 0.0364, + "step": 70410 + }, + { + "epoch": 0.3471, + "grad_norm": 0.1273079216480255, + "learning_rate": 3.852837892044392e-05, + "loss": 0.0378, + "step": 70420 + }, + { + "epoch": 0.34715, + "grad_norm": 0.11691662669181824, + "learning_rate": 3.8524902581302035e-05, + "loss": 0.0388, + "step": 70430 + }, + { + "epoch": 0.3472, + "grad_norm": 0.09867251664400101, + "learning_rate": 3.85214258723944e-05, + "loss": 0.0391, + "step": 70440 + }, + { + "epoch": 0.34725, + "grad_norm": 0.1259452998638153, + "learning_rate": 3.851794879381606e-05, + "loss": 0.0402, + "step": 70450 + }, + { + "epoch": 0.3473, + "grad_norm": 0.09362509101629257, + "learning_rate": 3.851447134566208e-05, + "loss": 0.038, + "step": 70460 + }, + { + "epoch": 0.34735, + "grad_norm": 0.08717688918113708, + "learning_rate": 3.851099352802753e-05, + "loss": 0.0379, + "step": 70470 + }, + { + "epoch": 0.3474, + "grad_norm": 0.08880654722452164, + "learning_rate": 3.8507515341007494e-05, + "loss": 0.0397, + "step": 70480 + }, + { + "epoch": 0.34745, + "grad_norm": 0.11957395076751709, + "learning_rate": 3.8504036784697056e-05, + "loss": 0.039, + "step": 70490 + }, + { + "epoch": 0.3475, + "grad_norm": 0.09604611247777939, + "learning_rate": 3.850055785919133e-05, + "loss": 0.0376, + "step": 70500 + }, + { + "epoch": 0.34755, + "grad_norm": 0.07652097940444946, + "learning_rate": 3.8497078564585434e-05, + "loss": 0.037, + "step": 70510 + }, + { + "epoch": 0.3476, + "grad_norm": 0.09587130695581436, + "learning_rate": 3.849359890097446e-05, + "loss": 0.0408, + "step": 70520 + }, + { + "epoch": 0.34765, + "grad_norm": 0.08766037970781326, + "learning_rate": 3.849011886845357e-05, + "loss": 0.0369, + "step": 70530 + }, + { + "epoch": 0.3477, + "grad_norm": 0.09857996553182602, + "learning_rate": 3.8486638467117904e-05, + "loss": 0.0369, + "step": 70540 + }, + { + "epoch": 0.34775, + "grad_norm": 0.0813198834657669, + "learning_rate": 3.848315769706261e-05, + "loss": 0.0356, + "step": 70550 + }, + { + "epoch": 0.3478, + "grad_norm": 0.08867272734642029, + "learning_rate": 3.847967655838284e-05, + "loss": 0.0373, + "step": 70560 + }, + { + "epoch": 0.34785, + "grad_norm": 0.09040363878011703, + "learning_rate": 3.847619505117379e-05, + "loss": 0.0387, + "step": 70570 + }, + { + "epoch": 0.3479, + "grad_norm": 0.09970448166131973, + "learning_rate": 3.8472713175530615e-05, + "loss": 0.0369, + "step": 70580 + }, + { + "epoch": 0.34795, + "grad_norm": 0.0712309181690216, + "learning_rate": 3.8469230931548536e-05, + "loss": 0.0367, + "step": 70590 + }, + { + "epoch": 0.348, + "grad_norm": 0.1036083847284317, + "learning_rate": 3.846574831932274e-05, + "loss": 0.037, + "step": 70600 + }, + { + "epoch": 0.34805, + "grad_norm": 0.08488594740629196, + "learning_rate": 3.846226533894844e-05, + "loss": 0.0361, + "step": 70610 + }, + { + "epoch": 0.3481, + "grad_norm": 0.0831713154911995, + "learning_rate": 3.8458781990520864e-05, + "loss": 0.0361, + "step": 70620 + }, + { + "epoch": 0.34815, + "grad_norm": 0.08196142315864563, + "learning_rate": 3.8455298274135246e-05, + "loss": 0.0365, + "step": 70630 + }, + { + "epoch": 0.3482, + "grad_norm": 0.09471315890550613, + "learning_rate": 3.8451814189886825e-05, + "loss": 0.0378, + "step": 70640 + }, + { + "epoch": 0.34825, + "grad_norm": 0.08712545782327652, + "learning_rate": 3.8448329737870867e-05, + "loss": 0.0371, + "step": 70650 + }, + { + "epoch": 0.3483, + "grad_norm": 0.08729460090398788, + "learning_rate": 3.844484491818261e-05, + "loss": 0.0395, + "step": 70660 + }, + { + "epoch": 0.34835, + "grad_norm": 0.0900210365653038, + "learning_rate": 3.8441359730917357e-05, + "loss": 0.0362, + "step": 70670 + }, + { + "epoch": 0.3484, + "grad_norm": 0.09874347597360611, + "learning_rate": 3.8437874176170373e-05, + "loss": 0.0373, + "step": 70680 + }, + { + "epoch": 0.34845, + "grad_norm": 0.11273519694805145, + "learning_rate": 3.843438825403697e-05, + "loss": 0.0391, + "step": 70690 + }, + { + "epoch": 0.3485, + "grad_norm": 0.09694775193929672, + "learning_rate": 3.8430901964612424e-05, + "loss": 0.0359, + "step": 70700 + }, + { + "epoch": 0.34855, + "grad_norm": 0.09418394416570663, + "learning_rate": 3.842741530799207e-05, + "loss": 0.037, + "step": 70710 + }, + { + "epoch": 0.3486, + "grad_norm": 0.09429769963026047, + "learning_rate": 3.842392828427123e-05, + "loss": 0.0369, + "step": 70720 + }, + { + "epoch": 0.34865, + "grad_norm": 0.0869753360748291, + "learning_rate": 3.8420440893545226e-05, + "loss": 0.0392, + "step": 70730 + }, + { + "epoch": 0.3487, + "grad_norm": 0.086611308157444, + "learning_rate": 3.8416953135909404e-05, + "loss": 0.0363, + "step": 70740 + }, + { + "epoch": 0.34875, + "grad_norm": 0.08878615498542786, + "learning_rate": 3.8413465011459134e-05, + "loss": 0.0363, + "step": 70750 + }, + { + "epoch": 0.3488, + "grad_norm": 0.08758122473955154, + "learning_rate": 3.840997652028978e-05, + "loss": 0.0382, + "step": 70760 + }, + { + "epoch": 0.34885, + "grad_norm": 0.0743941143155098, + "learning_rate": 3.8406487662496686e-05, + "loss": 0.0357, + "step": 70770 + }, + { + "epoch": 0.3489, + "grad_norm": 0.08691000938415527, + "learning_rate": 3.840299843817527e-05, + "loss": 0.0364, + "step": 70780 + }, + { + "epoch": 0.34895, + "grad_norm": 0.0793720930814743, + "learning_rate": 3.8399508847420894e-05, + "loss": 0.0367, + "step": 70790 + }, + { + "epoch": 0.349, + "grad_norm": 0.09652606397867203, + "learning_rate": 3.8396018890329e-05, + "loss": 0.039, + "step": 70800 + }, + { + "epoch": 0.34905, + "grad_norm": 0.08942319452762604, + "learning_rate": 3.839252856699497e-05, + "loss": 0.0362, + "step": 70810 + }, + { + "epoch": 0.3491, + "grad_norm": 0.09653190523386002, + "learning_rate": 3.838903787751425e-05, + "loss": 0.0381, + "step": 70820 + }, + { + "epoch": 0.34915, + "grad_norm": 0.10607529431581497, + "learning_rate": 3.838554682198225e-05, + "loss": 0.0376, + "step": 70830 + }, + { + "epoch": 0.3492, + "grad_norm": 0.1037462130188942, + "learning_rate": 3.838205540049445e-05, + "loss": 0.0394, + "step": 70840 + }, + { + "epoch": 0.34925, + "grad_norm": 0.10811994969844818, + "learning_rate": 3.8378563613146264e-05, + "loss": 0.037, + "step": 70850 + }, + { + "epoch": 0.3493, + "grad_norm": 0.08250528573989868, + "learning_rate": 3.837507146003319e-05, + "loss": 0.038, + "step": 70860 + }, + { + "epoch": 0.34935, + "grad_norm": 0.07942062616348267, + "learning_rate": 3.837157894125067e-05, + "loss": 0.0378, + "step": 70870 + }, + { + "epoch": 0.3494, + "grad_norm": 0.09695404022932053, + "learning_rate": 3.836808605689421e-05, + "loss": 0.0358, + "step": 70880 + }, + { + "epoch": 0.34945, + "grad_norm": 0.08719910681247711, + "learning_rate": 3.836459280705931e-05, + "loss": 0.0371, + "step": 70890 + }, + { + "epoch": 0.3495, + "grad_norm": 0.10089585185050964, + "learning_rate": 3.8361099191841455e-05, + "loss": 0.0372, + "step": 70900 + }, + { + "epoch": 0.34955, + "grad_norm": 0.0855243131518364, + "learning_rate": 3.8357605211336164e-05, + "loss": 0.037, + "step": 70910 + }, + { + "epoch": 0.3496, + "grad_norm": 0.09252673387527466, + "learning_rate": 3.835411086563897e-05, + "loss": 0.0386, + "step": 70920 + }, + { + "epoch": 0.34965, + "grad_norm": 0.08542817831039429, + "learning_rate": 3.8350616154845404e-05, + "loss": 0.0377, + "step": 70930 + }, + { + "epoch": 0.3497, + "grad_norm": 0.1125243529677391, + "learning_rate": 3.8347121079051005e-05, + "loss": 0.0364, + "step": 70940 + }, + { + "epoch": 0.34975, + "grad_norm": 0.08776703476905823, + "learning_rate": 3.8343625638351336e-05, + "loss": 0.0403, + "step": 70950 + }, + { + "epoch": 0.3498, + "grad_norm": 0.08082476258277893, + "learning_rate": 3.834012983284194e-05, + "loss": 0.0372, + "step": 70960 + }, + { + "epoch": 0.34985, + "grad_norm": 0.08643815666437149, + "learning_rate": 3.833663366261842e-05, + "loss": 0.0367, + "step": 70970 + }, + { + "epoch": 0.3499, + "grad_norm": 0.08524535596370697, + "learning_rate": 3.8333137127776345e-05, + "loss": 0.0359, + "step": 70980 + }, + { + "epoch": 0.34995, + "grad_norm": 0.09941412508487701, + "learning_rate": 3.83296402284113e-05, + "loss": 0.037, + "step": 70990 + }, + { + "epoch": 0.35, + "grad_norm": 0.09036926180124283, + "learning_rate": 3.832614296461891e-05, + "loss": 0.0367, + "step": 71000 + }, + { + "epoch": 0.35005, + "grad_norm": 0.10156653821468353, + "learning_rate": 3.832264533649477e-05, + "loss": 0.0362, + "step": 71010 + }, + { + "epoch": 0.3501, + "grad_norm": 0.09857209026813507, + "learning_rate": 3.8319147344134523e-05, + "loss": 0.0385, + "step": 71020 + }, + { + "epoch": 0.35015, + "grad_norm": 0.0842348113656044, + "learning_rate": 3.831564898763378e-05, + "loss": 0.0357, + "step": 71030 + }, + { + "epoch": 0.3502, + "grad_norm": 0.0971158817410469, + "learning_rate": 3.831215026708819e-05, + "loss": 0.0371, + "step": 71040 + }, + { + "epoch": 0.35025, + "grad_norm": 0.08740291744470596, + "learning_rate": 3.830865118259342e-05, + "loss": 0.0365, + "step": 71050 + }, + { + "epoch": 0.3503, + "grad_norm": 0.09465018659830093, + "learning_rate": 3.8305151734245136e-05, + "loss": 0.0367, + "step": 71060 + }, + { + "epoch": 0.35035, + "grad_norm": 0.09075374156236649, + "learning_rate": 3.8301651922139e-05, + "loss": 0.0368, + "step": 71070 + }, + { + "epoch": 0.3504, + "grad_norm": 0.09688808023929596, + "learning_rate": 3.829815174637069e-05, + "loss": 0.0378, + "step": 71080 + }, + { + "epoch": 0.35045, + "grad_norm": 0.09882215410470963, + "learning_rate": 3.829465120703592e-05, + "loss": 0.0376, + "step": 71090 + }, + { + "epoch": 0.3505, + "grad_norm": 0.08773043751716614, + "learning_rate": 3.829115030423036e-05, + "loss": 0.0377, + "step": 71100 + }, + { + "epoch": 0.35055, + "grad_norm": 0.09482423216104507, + "learning_rate": 3.828764903804975e-05, + "loss": 0.0401, + "step": 71110 + }, + { + "epoch": 0.3506, + "grad_norm": 0.08655227720737457, + "learning_rate": 3.828414740858981e-05, + "loss": 0.0369, + "step": 71120 + }, + { + "epoch": 0.35065, + "grad_norm": 0.09375464171171188, + "learning_rate": 3.828064541594627e-05, + "loss": 0.0379, + "step": 71130 + }, + { + "epoch": 0.3507, + "grad_norm": 0.08788594603538513, + "learning_rate": 3.827714306021488e-05, + "loss": 0.041, + "step": 71140 + }, + { + "epoch": 0.35075, + "grad_norm": 0.09271853417158127, + "learning_rate": 3.8273640341491384e-05, + "loss": 0.0387, + "step": 71150 + }, + { + "epoch": 0.3508, + "grad_norm": 0.10908250510692596, + "learning_rate": 3.8270137259871544e-05, + "loss": 0.0394, + "step": 71160 + }, + { + "epoch": 0.35085, + "grad_norm": 0.12239451706409454, + "learning_rate": 3.8266633815451135e-05, + "loss": 0.0389, + "step": 71170 + }, + { + "epoch": 0.3509, + "grad_norm": 0.1414772868156433, + "learning_rate": 3.8263130008325946e-05, + "loss": 0.0399, + "step": 71180 + }, + { + "epoch": 0.35095, + "grad_norm": 0.1427716463804245, + "learning_rate": 3.8259625838591766e-05, + "loss": 0.0377, + "step": 71190 + }, + { + "epoch": 0.351, + "grad_norm": 0.11323218792676926, + "learning_rate": 3.825612130634439e-05, + "loss": 0.0383, + "step": 71200 + }, + { + "epoch": 0.35105, + "grad_norm": 0.09754623472690582, + "learning_rate": 3.8252616411679646e-05, + "loss": 0.0379, + "step": 71210 + }, + { + "epoch": 0.3511, + "grad_norm": 0.12253044545650482, + "learning_rate": 3.824911115469335e-05, + "loss": 0.0385, + "step": 71220 + }, + { + "epoch": 0.35115, + "grad_norm": 0.07869323343038559, + "learning_rate": 3.824560553548132e-05, + "loss": 0.0373, + "step": 71230 + }, + { + "epoch": 0.3512, + "grad_norm": 0.09849068522453308, + "learning_rate": 3.824209955413942e-05, + "loss": 0.0374, + "step": 71240 + }, + { + "epoch": 0.35125, + "grad_norm": 0.08278708904981613, + "learning_rate": 3.823859321076349e-05, + "loss": 0.037, + "step": 71250 + }, + { + "epoch": 0.3513, + "grad_norm": 0.09109234064817429, + "learning_rate": 3.82350865054494e-05, + "loss": 0.0364, + "step": 71260 + }, + { + "epoch": 0.35135, + "grad_norm": 0.08854912966489792, + "learning_rate": 3.8231579438293015e-05, + "loss": 0.0381, + "step": 71270 + }, + { + "epoch": 0.3514, + "grad_norm": 0.09258077293634415, + "learning_rate": 3.822807200939022e-05, + "loss": 0.0375, + "step": 71280 + }, + { + "epoch": 0.35145, + "grad_norm": 0.08642219007015228, + "learning_rate": 3.82245642188369e-05, + "loss": 0.0368, + "step": 71290 + }, + { + "epoch": 0.3515, + "grad_norm": 0.09005814790725708, + "learning_rate": 3.822105606672897e-05, + "loss": 0.0372, + "step": 71300 + }, + { + "epoch": 0.35155, + "grad_norm": 0.08447232097387314, + "learning_rate": 3.821754755316233e-05, + "loss": 0.035, + "step": 71310 + }, + { + "epoch": 0.3516, + "grad_norm": 0.08680635690689087, + "learning_rate": 3.82140386782329e-05, + "loss": 0.0368, + "step": 71320 + }, + { + "epoch": 0.35165, + "grad_norm": 0.09256864339113235, + "learning_rate": 3.821052944203663e-05, + "loss": 0.037, + "step": 71330 + }, + { + "epoch": 0.3517, + "grad_norm": 0.12238368391990662, + "learning_rate": 3.8207019844669435e-05, + "loss": 0.0364, + "step": 71340 + }, + { + "epoch": 0.35175, + "grad_norm": 0.1358606368303299, + "learning_rate": 3.820350988622728e-05, + "loss": 0.038, + "step": 71350 + }, + { + "epoch": 0.3518, + "grad_norm": 0.12239467352628708, + "learning_rate": 3.8199999566806134e-05, + "loss": 0.0382, + "step": 71360 + }, + { + "epoch": 0.35185, + "grad_norm": 0.09452878683805466, + "learning_rate": 3.8196488886501945e-05, + "loss": 0.0374, + "step": 71370 + }, + { + "epoch": 0.3519, + "grad_norm": 0.09258794039487839, + "learning_rate": 3.8192977845410725e-05, + "loss": 0.0395, + "step": 71380 + }, + { + "epoch": 0.35195, + "grad_norm": 0.09202852100133896, + "learning_rate": 3.818946644362844e-05, + "loss": 0.037, + "step": 71390 + }, + { + "epoch": 0.352, + "grad_norm": 0.08467541635036469, + "learning_rate": 3.8185954681251094e-05, + "loss": 0.0362, + "step": 71400 + }, + { + "epoch": 0.35205, + "grad_norm": 0.10234204679727554, + "learning_rate": 3.81824425583747e-05, + "loss": 0.0375, + "step": 71410 + }, + { + "epoch": 0.3521, + "grad_norm": 0.09726942330598831, + "learning_rate": 3.817893007509529e-05, + "loss": 0.0384, + "step": 71420 + }, + { + "epoch": 0.35215, + "grad_norm": 0.08774566650390625, + "learning_rate": 3.817541723150887e-05, + "loss": 0.0372, + "step": 71430 + }, + { + "epoch": 0.3522, + "grad_norm": 0.09790322929620743, + "learning_rate": 3.81719040277115e-05, + "loss": 0.0374, + "step": 71440 + }, + { + "epoch": 0.35225, + "grad_norm": 0.10679417103528976, + "learning_rate": 3.816839046379922e-05, + "loss": 0.038, + "step": 71450 + }, + { + "epoch": 0.3523, + "grad_norm": 0.0922967866063118, + "learning_rate": 3.816487653986809e-05, + "loss": 0.0374, + "step": 71460 + }, + { + "epoch": 0.35235, + "grad_norm": 0.08391154557466507, + "learning_rate": 3.816136225601418e-05, + "loss": 0.0361, + "step": 71470 + }, + { + "epoch": 0.3524, + "grad_norm": 0.1068667471408844, + "learning_rate": 3.815784761233357e-05, + "loss": 0.0362, + "step": 71480 + }, + { + "epoch": 0.35245, + "grad_norm": 0.11418743431568146, + "learning_rate": 3.815433260892235e-05, + "loss": 0.0389, + "step": 71490 + }, + { + "epoch": 0.3525, + "grad_norm": 0.10949188470840454, + "learning_rate": 3.815081724587662e-05, + "loss": 0.0387, + "step": 71500 + }, + { + "epoch": 0.35255, + "grad_norm": 0.08463648706674576, + "learning_rate": 3.81473015232925e-05, + "loss": 0.0365, + "step": 71510 + }, + { + "epoch": 0.3526, + "grad_norm": 0.08453572541475296, + "learning_rate": 3.814378544126608e-05, + "loss": 0.0353, + "step": 71520 + }, + { + "epoch": 0.35265, + "grad_norm": 0.0713263601064682, + "learning_rate": 3.814026899989351e-05, + "loss": 0.0353, + "step": 71530 + }, + { + "epoch": 0.3527, + "grad_norm": 0.08317417651414871, + "learning_rate": 3.813675219927092e-05, + "loss": 0.0361, + "step": 71540 + }, + { + "epoch": 0.35275, + "grad_norm": 0.10706380009651184, + "learning_rate": 3.8133235039494455e-05, + "loss": 0.0347, + "step": 71550 + }, + { + "epoch": 0.3528, + "grad_norm": 0.086525097489357, + "learning_rate": 3.812971752066028e-05, + "loss": 0.0338, + "step": 71560 + }, + { + "epoch": 0.35285, + "grad_norm": 0.08943665027618408, + "learning_rate": 3.812619964286457e-05, + "loss": 0.0353, + "step": 71570 + }, + { + "epoch": 0.3529, + "grad_norm": 0.11737718433141708, + "learning_rate": 3.812268140620349e-05, + "loss": 0.036, + "step": 71580 + }, + { + "epoch": 0.35295, + "grad_norm": 0.09202095121145248, + "learning_rate": 3.8119162810773224e-05, + "loss": 0.0359, + "step": 71590 + }, + { + "epoch": 0.353, + "grad_norm": 0.09099660068750381, + "learning_rate": 3.8115643856669976e-05, + "loss": 0.0353, + "step": 71600 + }, + { + "epoch": 0.35305, + "grad_norm": 0.08286184817552567, + "learning_rate": 3.811212454398996e-05, + "loss": 0.0354, + "step": 71610 + }, + { + "epoch": 0.3531, + "grad_norm": 0.08152265846729279, + "learning_rate": 3.810860487282937e-05, + "loss": 0.0382, + "step": 71620 + }, + { + "epoch": 0.35315, + "grad_norm": 0.09711956232786179, + "learning_rate": 3.810508484328446e-05, + "loss": 0.0363, + "step": 71630 + }, + { + "epoch": 0.3532, + "grad_norm": 0.09203144907951355, + "learning_rate": 3.810156445545145e-05, + "loss": 0.0373, + "step": 71640 + }, + { + "epoch": 0.35325, + "grad_norm": 0.09962151199579239, + "learning_rate": 3.809804370942659e-05, + "loss": 0.0374, + "step": 71650 + }, + { + "epoch": 0.3533, + "grad_norm": 0.10103832185268402, + "learning_rate": 3.8094522605306135e-05, + "loss": 0.0384, + "step": 71660 + }, + { + "epoch": 0.35335, + "grad_norm": 0.09982626140117645, + "learning_rate": 3.8091001143186354e-05, + "loss": 0.0375, + "step": 71670 + }, + { + "epoch": 0.3534, + "grad_norm": 0.12380947172641754, + "learning_rate": 3.8087479323163513e-05, + "loss": 0.0396, + "step": 71680 + }, + { + "epoch": 0.35345, + "grad_norm": 0.10245148837566376, + "learning_rate": 3.808395714533391e-05, + "loss": 0.0376, + "step": 71690 + }, + { + "epoch": 0.3535, + "grad_norm": 0.10948903858661652, + "learning_rate": 3.8080434609793834e-05, + "loss": 0.0376, + "step": 71700 + }, + { + "epoch": 0.35355, + "grad_norm": 0.09909265488386154, + "learning_rate": 3.807691171663959e-05, + "loss": 0.0368, + "step": 71710 + }, + { + "epoch": 0.3536, + "grad_norm": 0.09816741943359375, + "learning_rate": 3.8073388465967496e-05, + "loss": 0.0362, + "step": 71720 + }, + { + "epoch": 0.35365, + "grad_norm": 0.08939754217863083, + "learning_rate": 3.8069864857873866e-05, + "loss": 0.0361, + "step": 71730 + }, + { + "epoch": 0.3537, + "grad_norm": 0.11072148382663727, + "learning_rate": 3.806634089245504e-05, + "loss": 0.0388, + "step": 71740 + }, + { + "epoch": 0.35375, + "grad_norm": 0.09038572758436203, + "learning_rate": 3.8062816569807366e-05, + "loss": 0.0359, + "step": 71750 + }, + { + "epoch": 0.3538, + "grad_norm": 0.08297697454690933, + "learning_rate": 3.80592918900272e-05, + "loss": 0.035, + "step": 71760 + }, + { + "epoch": 0.35385, + "grad_norm": 0.07661821693181992, + "learning_rate": 3.805576685321089e-05, + "loss": 0.0356, + "step": 71770 + }, + { + "epoch": 0.3539, + "grad_norm": 0.08732740581035614, + "learning_rate": 3.805224145945483e-05, + "loss": 0.037, + "step": 71780 + }, + { + "epoch": 0.35395, + "grad_norm": 0.09262505173683167, + "learning_rate": 3.804871570885538e-05, + "loss": 0.0358, + "step": 71790 + }, + { + "epoch": 0.354, + "grad_norm": 0.0871446430683136, + "learning_rate": 3.804518960150896e-05, + "loss": 0.0366, + "step": 71800 + }, + { + "epoch": 0.35405, + "grad_norm": 0.1006646603345871, + "learning_rate": 3.8041663137511934e-05, + "loss": 0.0363, + "step": 71810 + }, + { + "epoch": 0.3541, + "grad_norm": 0.09728217124938965, + "learning_rate": 3.8038136316960755e-05, + "loss": 0.0383, + "step": 71820 + }, + { + "epoch": 0.35415, + "grad_norm": 0.09145552664995193, + "learning_rate": 3.803460913995182e-05, + "loss": 0.0367, + "step": 71830 + }, + { + "epoch": 0.3542, + "grad_norm": 0.10197024792432785, + "learning_rate": 3.8031081606581575e-05, + "loss": 0.0369, + "step": 71840 + }, + { + "epoch": 0.35425, + "grad_norm": 0.1022845134139061, + "learning_rate": 3.8027553716946454e-05, + "loss": 0.0408, + "step": 71850 + }, + { + "epoch": 0.3543, + "grad_norm": 0.09232233464717865, + "learning_rate": 3.80240254711429e-05, + "loss": 0.0363, + "step": 71860 + }, + { + "epoch": 0.35435, + "grad_norm": 0.07501015812158585, + "learning_rate": 3.802049686926739e-05, + "loss": 0.0365, + "step": 71870 + }, + { + "epoch": 0.3544, + "grad_norm": 0.09139897674322128, + "learning_rate": 3.801696791141638e-05, + "loss": 0.0379, + "step": 71880 + }, + { + "epoch": 0.35445, + "grad_norm": 0.08283814787864685, + "learning_rate": 3.8013438597686365e-05, + "loss": 0.0369, + "step": 71890 + }, + { + "epoch": 0.3545, + "grad_norm": 0.08500596880912781, + "learning_rate": 3.800990892817382e-05, + "loss": 0.0358, + "step": 71900 + }, + { + "epoch": 0.35455, + "grad_norm": 0.08232578635215759, + "learning_rate": 3.800637890297526e-05, + "loss": 0.0375, + "step": 71910 + }, + { + "epoch": 0.3546, + "grad_norm": 0.08956071734428406, + "learning_rate": 3.8002848522187185e-05, + "loss": 0.0375, + "step": 71920 + }, + { + "epoch": 0.35465, + "grad_norm": 0.09098503738641739, + "learning_rate": 3.799931778590611e-05, + "loss": 0.0375, + "step": 71930 + }, + { + "epoch": 0.3547, + "grad_norm": 0.08801445364952087, + "learning_rate": 3.7995786694228584e-05, + "loss": 0.0393, + "step": 71940 + }, + { + "epoch": 0.35475, + "grad_norm": 0.0851890817284584, + "learning_rate": 3.7992255247251115e-05, + "loss": 0.0373, + "step": 71950 + }, + { + "epoch": 0.3548, + "grad_norm": 0.1011219173669815, + "learning_rate": 3.7988723445070285e-05, + "loss": 0.0392, + "step": 71960 + }, + { + "epoch": 0.35485, + "grad_norm": 0.08286673575639725, + "learning_rate": 3.798519128778263e-05, + "loss": 0.0373, + "step": 71970 + }, + { + "epoch": 0.3549, + "grad_norm": 0.08837667107582092, + "learning_rate": 3.798165877548472e-05, + "loss": 0.0368, + "step": 71980 + }, + { + "epoch": 0.35495, + "grad_norm": 0.08247746527194977, + "learning_rate": 3.797812590827314e-05, + "loss": 0.037, + "step": 71990 + }, + { + "epoch": 0.355, + "grad_norm": 0.08544211834669113, + "learning_rate": 3.797459268624446e-05, + "loss": 0.0377, + "step": 72000 + }, + { + "epoch": 0.35505, + "grad_norm": 0.08988383412361145, + "learning_rate": 3.797105910949531e-05, + "loss": 0.0378, + "step": 72010 + }, + { + "epoch": 0.3551, + "grad_norm": 0.08067493885755539, + "learning_rate": 3.796752517812227e-05, + "loss": 0.0359, + "step": 72020 + }, + { + "epoch": 0.35515, + "grad_norm": 0.08864334225654602, + "learning_rate": 3.796399089222196e-05, + "loss": 0.0367, + "step": 72030 + }, + { + "epoch": 0.3552, + "grad_norm": 0.10649200528860092, + "learning_rate": 3.796045625189101e-05, + "loss": 0.0386, + "step": 72040 + }, + { + "epoch": 0.35525, + "grad_norm": 0.0983104333281517, + "learning_rate": 3.7956921257226064e-05, + "loss": 0.0375, + "step": 72050 + }, + { + "epoch": 0.3553, + "grad_norm": 0.10972260683774948, + "learning_rate": 3.7953385908323744e-05, + "loss": 0.0361, + "step": 72060 + }, + { + "epoch": 0.35535, + "grad_norm": 0.09170223027467728, + "learning_rate": 3.794985020528072e-05, + "loss": 0.0363, + "step": 72070 + }, + { + "epoch": 0.3554, + "grad_norm": 0.1001339927315712, + "learning_rate": 3.794631414819367e-05, + "loss": 0.0378, + "step": 72080 + }, + { + "epoch": 0.35545, + "grad_norm": 0.07550124824047089, + "learning_rate": 3.794277773715925e-05, + "loss": 0.0375, + "step": 72090 + }, + { + "epoch": 0.3555, + "grad_norm": 0.09928284585475922, + "learning_rate": 3.793924097227414e-05, + "loss": 0.0389, + "step": 72100 + }, + { + "epoch": 0.35555, + "grad_norm": 0.09975890815258026, + "learning_rate": 3.793570385363506e-05, + "loss": 0.0386, + "step": 72110 + }, + { + "epoch": 0.3556, + "grad_norm": 0.10257852077484131, + "learning_rate": 3.793216638133869e-05, + "loss": 0.0365, + "step": 72120 + }, + { + "epoch": 0.35565, + "grad_norm": 0.09048584848642349, + "learning_rate": 3.792862855548174e-05, + "loss": 0.0378, + "step": 72130 + }, + { + "epoch": 0.3557, + "grad_norm": 0.0968620702624321, + "learning_rate": 3.792509037616094e-05, + "loss": 0.0393, + "step": 72140 + }, + { + "epoch": 0.35575, + "grad_norm": 0.1037416085600853, + "learning_rate": 3.7921551843473036e-05, + "loss": 0.0395, + "step": 72150 + }, + { + "epoch": 0.3558, + "grad_norm": 0.0851578339934349, + "learning_rate": 3.791801295751476e-05, + "loss": 0.0372, + "step": 72160 + }, + { + "epoch": 0.35585, + "grad_norm": 0.09236549586057663, + "learning_rate": 3.791447371838285e-05, + "loss": 0.0386, + "step": 72170 + }, + { + "epoch": 0.3559, + "grad_norm": 0.10009481757879257, + "learning_rate": 3.791093412617409e-05, + "loss": 0.0377, + "step": 72180 + }, + { + "epoch": 0.35595, + "grad_norm": 0.09812429547309875, + "learning_rate": 3.7907394180985244e-05, + "loss": 0.0362, + "step": 72190 + }, + { + "epoch": 0.356, + "grad_norm": 0.0721142441034317, + "learning_rate": 3.790385388291308e-05, + "loss": 0.0354, + "step": 72200 + }, + { + "epoch": 0.35605, + "grad_norm": 0.10122848302125931, + "learning_rate": 3.790031323205441e-05, + "loss": 0.0381, + "step": 72210 + }, + { + "epoch": 0.3561, + "grad_norm": 0.08593422174453735, + "learning_rate": 3.789677222850602e-05, + "loss": 0.0377, + "step": 72220 + }, + { + "epoch": 0.35615, + "grad_norm": 0.09920187294483185, + "learning_rate": 3.7893230872364715e-05, + "loss": 0.0373, + "step": 72230 + }, + { + "epoch": 0.3562, + "grad_norm": 0.07431119680404663, + "learning_rate": 3.788968916372733e-05, + "loss": 0.0352, + "step": 72240 + }, + { + "epoch": 0.35625, + "grad_norm": 0.0823153406381607, + "learning_rate": 3.7886147102690675e-05, + "loss": 0.0359, + "step": 72250 + }, + { + "epoch": 0.3563, + "grad_norm": 0.08599826693534851, + "learning_rate": 3.788260468935161e-05, + "loss": 0.0367, + "step": 72260 + }, + { + "epoch": 0.35635, + "grad_norm": 0.08760447800159454, + "learning_rate": 3.787906192380697e-05, + "loss": 0.0359, + "step": 72270 + }, + { + "epoch": 0.3564, + "grad_norm": 0.09040172398090363, + "learning_rate": 3.787551880615362e-05, + "loss": 0.0351, + "step": 72280 + }, + { + "epoch": 0.35645, + "grad_norm": 0.08009956032037735, + "learning_rate": 3.7871975336488417e-05, + "loss": 0.0357, + "step": 72290 + }, + { + "epoch": 0.3565, + "grad_norm": 0.08367157727479935, + "learning_rate": 3.786843151490824e-05, + "loss": 0.0364, + "step": 72300 + }, + { + "epoch": 0.35655, + "grad_norm": 0.08577068150043488, + "learning_rate": 3.7864887341509984e-05, + "loss": 0.0355, + "step": 72310 + }, + { + "epoch": 0.3566, + "grad_norm": 0.0743073970079422, + "learning_rate": 3.7861342816390546e-05, + "loss": 0.0342, + "step": 72320 + }, + { + "epoch": 0.35665, + "grad_norm": 0.10048245638608932, + "learning_rate": 3.785779793964682e-05, + "loss": 0.0356, + "step": 72330 + }, + { + "epoch": 0.3567, + "grad_norm": 0.07976207137107849, + "learning_rate": 3.785425271137573e-05, + "loss": 0.0349, + "step": 72340 + }, + { + "epoch": 0.35675, + "grad_norm": 0.10690147429704666, + "learning_rate": 3.78507071316742e-05, + "loss": 0.036, + "step": 72350 + }, + { + "epoch": 0.3568, + "grad_norm": 0.08670288324356079, + "learning_rate": 3.784716120063917e-05, + "loss": 0.0352, + "step": 72360 + }, + { + "epoch": 0.35685, + "grad_norm": 0.09053745865821838, + "learning_rate": 3.784361491836758e-05, + "loss": 0.0367, + "step": 72370 + }, + { + "epoch": 0.3569, + "grad_norm": 0.07990558445453644, + "learning_rate": 3.7840068284956374e-05, + "loss": 0.0352, + "step": 72380 + }, + { + "epoch": 0.35695, + "grad_norm": 0.10328347980976105, + "learning_rate": 3.783652130050252e-05, + "loss": 0.0384, + "step": 72390 + }, + { + "epoch": 0.357, + "grad_norm": 0.09670775383710861, + "learning_rate": 3.783297396510301e-05, + "loss": 0.0373, + "step": 72400 + }, + { + "epoch": 0.35705, + "grad_norm": 0.09959684312343597, + "learning_rate": 3.782942627885482e-05, + "loss": 0.0422, + "step": 72410 + }, + { + "epoch": 0.3571, + "grad_norm": 0.09338296949863434, + "learning_rate": 3.7825878241854916e-05, + "loss": 0.0367, + "step": 72420 + }, + { + "epoch": 0.35715, + "grad_norm": 0.09818318486213684, + "learning_rate": 3.7822329854200335e-05, + "loss": 0.0374, + "step": 72430 + }, + { + "epoch": 0.3572, + "grad_norm": 0.10129484534263611, + "learning_rate": 3.781878111598806e-05, + "loss": 0.0347, + "step": 72440 + }, + { + "epoch": 0.35725, + "grad_norm": 0.09188708662986755, + "learning_rate": 3.781523202731513e-05, + "loss": 0.0366, + "step": 72450 + }, + { + "epoch": 0.3573, + "grad_norm": 0.07468585669994354, + "learning_rate": 3.781168258827857e-05, + "loss": 0.0374, + "step": 72460 + }, + { + "epoch": 0.35735, + "grad_norm": 0.08601050078868866, + "learning_rate": 3.7808132798975424e-05, + "loss": 0.0368, + "step": 72470 + }, + { + "epoch": 0.3574, + "grad_norm": 0.07531297206878662, + "learning_rate": 3.7804582659502744e-05, + "loss": 0.0366, + "step": 72480 + }, + { + "epoch": 0.35745, + "grad_norm": 0.08619679510593414, + "learning_rate": 3.7801032169957575e-05, + "loss": 0.0361, + "step": 72490 + }, + { + "epoch": 0.3575, + "grad_norm": 0.09112090617418289, + "learning_rate": 3.7797481330437e-05, + "loss": 0.0387, + "step": 72500 + }, + { + "epoch": 0.35755, + "grad_norm": 0.08577617257833481, + "learning_rate": 3.779393014103809e-05, + "loss": 0.036, + "step": 72510 + }, + { + "epoch": 0.3576, + "grad_norm": 0.08866028487682343, + "learning_rate": 3.7790378601857936e-05, + "loss": 0.0378, + "step": 72520 + }, + { + "epoch": 0.35765, + "grad_norm": 0.10137222707271576, + "learning_rate": 3.778682671299364e-05, + "loss": 0.0356, + "step": 72530 + }, + { + "epoch": 0.3577, + "grad_norm": 0.09635225683450699, + "learning_rate": 3.7783274474542304e-05, + "loss": 0.0364, + "step": 72540 + }, + { + "epoch": 0.35775, + "grad_norm": 0.07238588482141495, + "learning_rate": 3.777972188660105e-05, + "loss": 0.0373, + "step": 72550 + }, + { + "epoch": 0.3578, + "grad_norm": 0.0964500829577446, + "learning_rate": 3.7776168949267e-05, + "loss": 0.0392, + "step": 72560 + }, + { + "epoch": 0.35785, + "grad_norm": 0.10754073411226273, + "learning_rate": 3.7772615662637276e-05, + "loss": 0.0374, + "step": 72570 + }, + { + "epoch": 0.3579, + "grad_norm": 0.08237577229738235, + "learning_rate": 3.7769062026809054e-05, + "loss": 0.037, + "step": 72580 + }, + { + "epoch": 0.35795, + "grad_norm": 0.07473357766866684, + "learning_rate": 3.776550804187947e-05, + "loss": 0.0376, + "step": 72590 + }, + { + "epoch": 0.358, + "grad_norm": 0.09661019593477249, + "learning_rate": 3.7761953707945685e-05, + "loss": 0.0365, + "step": 72600 + }, + { + "epoch": 0.35805, + "grad_norm": 0.08514625579118729, + "learning_rate": 3.7758399025104896e-05, + "loss": 0.0373, + "step": 72610 + }, + { + "epoch": 0.3581, + "grad_norm": 0.07970965653657913, + "learning_rate": 3.775484399345426e-05, + "loss": 0.0387, + "step": 72620 + }, + { + "epoch": 0.35815, + "grad_norm": 0.0916329026222229, + "learning_rate": 3.775128861309097e-05, + "loss": 0.0373, + "step": 72630 + }, + { + "epoch": 0.3582, + "grad_norm": 0.08543266355991364, + "learning_rate": 3.774773288411226e-05, + "loss": 0.039, + "step": 72640 + }, + { + "epoch": 0.35825, + "grad_norm": 0.09130579233169556, + "learning_rate": 3.774417680661532e-05, + "loss": 0.0368, + "step": 72650 + }, + { + "epoch": 0.3583, + "grad_norm": 0.10344947874546051, + "learning_rate": 3.7740620380697356e-05, + "loss": 0.0369, + "step": 72660 + }, + { + "epoch": 0.35835, + "grad_norm": 0.09073154628276825, + "learning_rate": 3.773706360645563e-05, + "loss": 0.0399, + "step": 72670 + }, + { + "epoch": 0.3584, + "grad_norm": 0.09521952271461487, + "learning_rate": 3.773350648398737e-05, + "loss": 0.0367, + "step": 72680 + }, + { + "epoch": 0.35845, + "grad_norm": 0.129849374294281, + "learning_rate": 3.772994901338983e-05, + "loss": 0.0369, + "step": 72690 + }, + { + "epoch": 0.3585, + "grad_norm": 0.09686827659606934, + "learning_rate": 3.772639119476026e-05, + "loss": 0.0371, + "step": 72700 + }, + { + "epoch": 0.35855, + "grad_norm": 0.10051210969686508, + "learning_rate": 3.772283302819594e-05, + "loss": 0.0369, + "step": 72710 + }, + { + "epoch": 0.3586, + "grad_norm": 0.09047259390354156, + "learning_rate": 3.771927451379414e-05, + "loss": 0.0373, + "step": 72720 + }, + { + "epoch": 0.35865, + "grad_norm": 0.08867276459932327, + "learning_rate": 3.771571565165215e-05, + "loss": 0.0379, + "step": 72730 + }, + { + "epoch": 0.3587, + "grad_norm": 0.08804894238710403, + "learning_rate": 3.771215644186729e-05, + "loss": 0.0378, + "step": 72740 + }, + { + "epoch": 0.35875, + "grad_norm": 0.08896362036466599, + "learning_rate": 3.770859688453683e-05, + "loss": 0.0366, + "step": 72750 + }, + { + "epoch": 0.3588, + "grad_norm": 0.08998987823724747, + "learning_rate": 3.770503697975811e-05, + "loss": 0.0368, + "step": 72760 + }, + { + "epoch": 0.35885, + "grad_norm": 0.09973672777414322, + "learning_rate": 3.7701476727628447e-05, + "loss": 0.0399, + "step": 72770 + }, + { + "epoch": 0.3589, + "grad_norm": 0.08832769840955734, + "learning_rate": 3.7697916128245194e-05, + "loss": 0.0375, + "step": 72780 + }, + { + "epoch": 0.35895, + "grad_norm": 0.09316246956586838, + "learning_rate": 3.769435518170568e-05, + "loss": 0.0379, + "step": 72790 + }, + { + "epoch": 0.359, + "grad_norm": 0.08372035622596741, + "learning_rate": 3.769079388810726e-05, + "loss": 0.0371, + "step": 72800 + }, + { + "epoch": 0.35905, + "grad_norm": 0.07352810353040695, + "learning_rate": 3.7687232247547305e-05, + "loss": 0.0389, + "step": 72810 + }, + { + "epoch": 0.3591, + "grad_norm": 0.08349832147359848, + "learning_rate": 3.768367026012319e-05, + "loss": 0.0363, + "step": 72820 + }, + { + "epoch": 0.35915, + "grad_norm": 0.0870920792222023, + "learning_rate": 3.768010792593228e-05, + "loss": 0.04, + "step": 72830 + }, + { + "epoch": 0.3592, + "grad_norm": 0.0967041403055191, + "learning_rate": 3.7676545245072e-05, + "loss": 0.038, + "step": 72840 + }, + { + "epoch": 0.35925, + "grad_norm": 0.08353424817323685, + "learning_rate": 3.767298221763973e-05, + "loss": 0.0367, + "step": 72850 + }, + { + "epoch": 0.3593, + "grad_norm": 0.09391330182552338, + "learning_rate": 3.76694188437329e-05, + "loss": 0.038, + "step": 72860 + }, + { + "epoch": 0.35935, + "grad_norm": 0.08620428293943405, + "learning_rate": 3.7665855123448904e-05, + "loss": 0.0365, + "step": 72870 + }, + { + "epoch": 0.3594, + "grad_norm": 0.0925588607788086, + "learning_rate": 3.766229105688518e-05, + "loss": 0.0372, + "step": 72880 + }, + { + "epoch": 0.35945, + "grad_norm": 0.0822024792432785, + "learning_rate": 3.7658726644139185e-05, + "loss": 0.0399, + "step": 72890 + }, + { + "epoch": 0.3595, + "grad_norm": 0.08573483675718307, + "learning_rate": 3.7655161885308365e-05, + "loss": 0.0374, + "step": 72900 + }, + { + "epoch": 0.35955, + "grad_norm": 0.08759527653455734, + "learning_rate": 3.765159678049017e-05, + "loss": 0.0367, + "step": 72910 + }, + { + "epoch": 0.3596, + "grad_norm": 0.08639516681432724, + "learning_rate": 3.764803132978206e-05, + "loss": 0.037, + "step": 72920 + }, + { + "epoch": 0.35965, + "grad_norm": 0.08695117384195328, + "learning_rate": 3.764446553328154e-05, + "loss": 0.0363, + "step": 72930 + }, + { + "epoch": 0.3597, + "grad_norm": 0.11545746773481369, + "learning_rate": 3.764089939108608e-05, + "loss": 0.0395, + "step": 72940 + }, + { + "epoch": 0.35975, + "grad_norm": 0.10340339690446854, + "learning_rate": 3.7637332903293174e-05, + "loss": 0.0377, + "step": 72950 + }, + { + "epoch": 0.3598, + "grad_norm": 0.08980807662010193, + "learning_rate": 3.763376607000034e-05, + "loss": 0.0374, + "step": 72960 + }, + { + "epoch": 0.35985, + "grad_norm": 0.0981464833021164, + "learning_rate": 3.763019889130509e-05, + "loss": 0.04, + "step": 72970 + }, + { + "epoch": 0.3599, + "grad_norm": 0.09008200466632843, + "learning_rate": 3.762663136730493e-05, + "loss": 0.0361, + "step": 72980 + }, + { + "epoch": 0.35995, + "grad_norm": 0.09525028616189957, + "learning_rate": 3.7623063498097434e-05, + "loss": 0.0384, + "step": 72990 + }, + { + "epoch": 0.36, + "grad_norm": 0.10193130373954773, + "learning_rate": 3.7619495283780114e-05, + "loss": 0.0391, + "step": 73000 + }, + { + "epoch": 0.36005, + "grad_norm": 0.10165940970182419, + "learning_rate": 3.7615926724450534e-05, + "loss": 0.0384, + "step": 73010 + }, + { + "epoch": 0.3601, + "grad_norm": 0.10303997248411179, + "learning_rate": 3.761235782020626e-05, + "loss": 0.0365, + "step": 73020 + }, + { + "epoch": 0.36015, + "grad_norm": 0.10546267777681351, + "learning_rate": 3.7608788571144855e-05, + "loss": 0.0364, + "step": 73030 + }, + { + "epoch": 0.3602, + "grad_norm": 0.09056802093982697, + "learning_rate": 3.760521897736391e-05, + "loss": 0.0366, + "step": 73040 + }, + { + "epoch": 0.36025, + "grad_norm": 0.11318932473659515, + "learning_rate": 3.760164903896102e-05, + "loss": 0.0374, + "step": 73050 + }, + { + "epoch": 0.3603, + "grad_norm": 0.08870753645896912, + "learning_rate": 3.7598078756033773e-05, + "loss": 0.0391, + "step": 73060 + }, + { + "epoch": 0.36035, + "grad_norm": 0.09608108550310135, + "learning_rate": 3.7594508128679784e-05, + "loss": 0.0383, + "step": 73070 + }, + { + "epoch": 0.3604, + "grad_norm": 0.099686399102211, + "learning_rate": 3.759093715699668e-05, + "loss": 0.0387, + "step": 73080 + }, + { + "epoch": 0.36045, + "grad_norm": 0.09866029024124146, + "learning_rate": 3.7587365841082076e-05, + "loss": 0.0355, + "step": 73090 + }, + { + "epoch": 0.3605, + "grad_norm": 0.09272120893001556, + "learning_rate": 3.758379418103363e-05, + "loss": 0.0377, + "step": 73100 + }, + { + "epoch": 0.36055, + "grad_norm": 0.08696601539850235, + "learning_rate": 3.7580222176948974e-05, + "loss": 0.0363, + "step": 73110 + }, + { + "epoch": 0.3606, + "grad_norm": 0.10454379767179489, + "learning_rate": 3.757664982892577e-05, + "loss": 0.0361, + "step": 73120 + }, + { + "epoch": 0.36065, + "grad_norm": 0.10213061422109604, + "learning_rate": 3.757307713706168e-05, + "loss": 0.037, + "step": 73130 + }, + { + "epoch": 0.3607, + "grad_norm": 0.10697133094072342, + "learning_rate": 3.7569504101454385e-05, + "loss": 0.0365, + "step": 73140 + }, + { + "epoch": 0.36075, + "grad_norm": 0.07808250933885574, + "learning_rate": 3.7565930722201576e-05, + "loss": 0.0361, + "step": 73150 + }, + { + "epoch": 0.3608, + "grad_norm": 0.08140584081411362, + "learning_rate": 3.756235699940094e-05, + "loss": 0.0369, + "step": 73160 + }, + { + "epoch": 0.36085, + "grad_norm": 0.08964422345161438, + "learning_rate": 3.755878293315018e-05, + "loss": 0.0376, + "step": 73170 + }, + { + "epoch": 0.3609, + "grad_norm": 0.09899695217609406, + "learning_rate": 3.755520852354702e-05, + "loss": 0.0383, + "step": 73180 + }, + { + "epoch": 0.36095, + "grad_norm": 0.09381364285945892, + "learning_rate": 3.755163377068917e-05, + "loss": 0.037, + "step": 73190 + }, + { + "epoch": 0.361, + "grad_norm": 0.0851723849773407, + "learning_rate": 3.7548058674674366e-05, + "loss": 0.0377, + "step": 73200 + }, + { + "epoch": 0.36105, + "grad_norm": 0.09449709206819534, + "learning_rate": 3.754448323560035e-05, + "loss": 0.0389, + "step": 73210 + }, + { + "epoch": 0.3611, + "grad_norm": 0.08995748311281204, + "learning_rate": 3.754090745356488e-05, + "loss": 0.0382, + "step": 73220 + }, + { + "epoch": 0.36115, + "grad_norm": 0.08525869995355606, + "learning_rate": 3.753733132866571e-05, + "loss": 0.0398, + "step": 73230 + }, + { + "epoch": 0.3612, + "grad_norm": 0.09252028167247772, + "learning_rate": 3.753375486100061e-05, + "loss": 0.0382, + "step": 73240 + }, + { + "epoch": 0.36125, + "grad_norm": 0.09120626747608185, + "learning_rate": 3.753017805066737e-05, + "loss": 0.0403, + "step": 73250 + }, + { + "epoch": 0.3613, + "grad_norm": 0.08819838613271713, + "learning_rate": 3.7526600897763764e-05, + "loss": 0.0399, + "step": 73260 + }, + { + "epoch": 0.36135, + "grad_norm": 0.11151785403490067, + "learning_rate": 3.752302340238759e-05, + "loss": 0.0385, + "step": 73270 + }, + { + "epoch": 0.3614, + "grad_norm": 0.08899339288473129, + "learning_rate": 3.751944556463667e-05, + "loss": 0.0429, + "step": 73280 + }, + { + "epoch": 0.36145, + "grad_norm": 0.09409180283546448, + "learning_rate": 3.75158673846088e-05, + "loss": 0.0403, + "step": 73290 + }, + { + "epoch": 0.3615, + "grad_norm": 0.09247760474681854, + "learning_rate": 3.7512288862401835e-05, + "loss": 0.0391, + "step": 73300 + }, + { + "epoch": 0.36155, + "grad_norm": 0.10189498215913773, + "learning_rate": 3.750870999811358e-05, + "loss": 0.0369, + "step": 73310 + }, + { + "epoch": 0.3616, + "grad_norm": 0.08997152745723724, + "learning_rate": 3.7505130791841896e-05, + "loss": 0.0385, + "step": 73320 + }, + { + "epoch": 0.36165, + "grad_norm": 0.09530501812696457, + "learning_rate": 3.750155124368463e-05, + "loss": 0.0376, + "step": 73330 + }, + { + "epoch": 0.3617, + "grad_norm": 0.09209480881690979, + "learning_rate": 3.749797135373966e-05, + "loss": 0.0402, + "step": 73340 + }, + { + "epoch": 0.36175, + "grad_norm": 0.09806963801383972, + "learning_rate": 3.7494391122104834e-05, + "loss": 0.0364, + "step": 73350 + }, + { + "epoch": 0.3618, + "grad_norm": 0.09419302642345428, + "learning_rate": 3.7490810548878066e-05, + "loss": 0.038, + "step": 73360 + }, + { + "epoch": 0.36185, + "grad_norm": 0.09023649245500565, + "learning_rate": 3.748722963415722e-05, + "loss": 0.0364, + "step": 73370 + }, + { + "epoch": 0.3619, + "grad_norm": 0.07176271080970764, + "learning_rate": 3.74836483780402e-05, + "loss": 0.035, + "step": 73380 + }, + { + "epoch": 0.36195, + "grad_norm": 0.09162287414073944, + "learning_rate": 3.7480066780624935e-05, + "loss": 0.0364, + "step": 73390 + }, + { + "epoch": 0.362, + "grad_norm": 0.07958874106407166, + "learning_rate": 3.7476484842009326e-05, + "loss": 0.0359, + "step": 73400 + }, + { + "epoch": 0.36205, + "grad_norm": 0.0949300155043602, + "learning_rate": 3.747290256229131e-05, + "loss": 0.0367, + "step": 73410 + }, + { + "epoch": 0.3621, + "grad_norm": 0.08462786674499512, + "learning_rate": 3.7469319941568827e-05, + "loss": 0.0367, + "step": 73420 + }, + { + "epoch": 0.36215, + "grad_norm": 0.10595318675041199, + "learning_rate": 3.746573697993982e-05, + "loss": 0.0364, + "step": 73430 + }, + { + "epoch": 0.3622, + "grad_norm": 0.10279781371355057, + "learning_rate": 3.7462153677502244e-05, + "loss": 0.036, + "step": 73440 + }, + { + "epoch": 0.36225, + "grad_norm": 0.10992176830768585, + "learning_rate": 3.7458570034354076e-05, + "loss": 0.0367, + "step": 73450 + }, + { + "epoch": 0.3623, + "grad_norm": 0.10987858474254608, + "learning_rate": 3.745498605059327e-05, + "loss": 0.037, + "step": 73460 + }, + { + "epoch": 0.36235, + "grad_norm": 0.10498002916574478, + "learning_rate": 3.745140172631784e-05, + "loss": 0.0355, + "step": 73470 + }, + { + "epoch": 0.3624, + "grad_norm": 0.09899154305458069, + "learning_rate": 3.744781706162576e-05, + "loss": 0.0378, + "step": 73480 + }, + { + "epoch": 0.36245, + "grad_norm": 0.10823096334934235, + "learning_rate": 3.7444232056615036e-05, + "loss": 0.0373, + "step": 73490 + }, + { + "epoch": 0.3625, + "grad_norm": 0.11849300563335419, + "learning_rate": 3.744064671138368e-05, + "loss": 0.0371, + "step": 73500 + }, + { + "epoch": 0.36255, + "grad_norm": 0.10961859673261642, + "learning_rate": 3.7437061026029717e-05, + "loss": 0.0364, + "step": 73510 + }, + { + "epoch": 0.3626, + "grad_norm": 0.12412311881780624, + "learning_rate": 3.7433475000651184e-05, + "loss": 0.0384, + "step": 73520 + }, + { + "epoch": 0.36265, + "grad_norm": 0.08983161300420761, + "learning_rate": 3.7429888635346105e-05, + "loss": 0.0361, + "step": 73530 + }, + { + "epoch": 0.3627, + "grad_norm": 0.08294022083282471, + "learning_rate": 3.7426301930212545e-05, + "loss": 0.0385, + "step": 73540 + }, + { + "epoch": 0.36275, + "grad_norm": 0.0846564844250679, + "learning_rate": 3.7422714885348566e-05, + "loss": 0.0375, + "step": 73550 + }, + { + "epoch": 0.3628, + "grad_norm": 0.08093772828578949, + "learning_rate": 3.7419127500852224e-05, + "loss": 0.0367, + "step": 73560 + }, + { + "epoch": 0.36285, + "grad_norm": 0.0944150909781456, + "learning_rate": 3.74155397768216e-05, + "loss": 0.0364, + "step": 73570 + }, + { + "epoch": 0.3629, + "grad_norm": 0.09659276902675629, + "learning_rate": 3.741195171335479e-05, + "loss": 0.0358, + "step": 73580 + }, + { + "epoch": 0.36295, + "grad_norm": 0.10250847786664963, + "learning_rate": 3.740836331054987e-05, + "loss": 0.0383, + "step": 73590 + }, + { + "epoch": 0.363, + "grad_norm": 0.10492464154958725, + "learning_rate": 3.740477456850496e-05, + "loss": 0.0369, + "step": 73600 + }, + { + "epoch": 0.36305, + "grad_norm": 0.08737901598215103, + "learning_rate": 3.740118548731818e-05, + "loss": 0.0368, + "step": 73610 + }, + { + "epoch": 0.3631, + "grad_norm": 0.0963754653930664, + "learning_rate": 3.739759606708765e-05, + "loss": 0.0362, + "step": 73620 + }, + { + "epoch": 0.36315, + "grad_norm": 0.09435886889696121, + "learning_rate": 3.73940063079115e-05, + "loss": 0.0355, + "step": 73630 + }, + { + "epoch": 0.3632, + "grad_norm": 0.09186870604753494, + "learning_rate": 3.739041620988788e-05, + "loss": 0.0375, + "step": 73640 + }, + { + "epoch": 0.36325, + "grad_norm": 0.10190404951572418, + "learning_rate": 3.738682577311492e-05, + "loss": 0.0385, + "step": 73650 + }, + { + "epoch": 0.3633, + "grad_norm": 0.0959816426038742, + "learning_rate": 3.7383234997690806e-05, + "loss": 0.0365, + "step": 73660 + }, + { + "epoch": 0.36335, + "grad_norm": 0.09658969938755035, + "learning_rate": 3.73796438837137e-05, + "loss": 0.0386, + "step": 73670 + }, + { + "epoch": 0.3634, + "grad_norm": 0.10769996047019958, + "learning_rate": 3.737605243128178e-05, + "loss": 0.0372, + "step": 73680 + }, + { + "epoch": 0.36345, + "grad_norm": 0.09856262058019638, + "learning_rate": 3.737246064049323e-05, + "loss": 0.038, + "step": 73690 + }, + { + "epoch": 0.3635, + "grad_norm": 0.08360934257507324, + "learning_rate": 3.7368868511446266e-05, + "loss": 0.0365, + "step": 73700 + }, + { + "epoch": 0.36355, + "grad_norm": 0.09821160137653351, + "learning_rate": 3.7365276044239074e-05, + "loss": 0.0375, + "step": 73710 + }, + { + "epoch": 0.3636, + "grad_norm": 0.09756915271282196, + "learning_rate": 3.736168323896988e-05, + "loss": 0.0418, + "step": 73720 + }, + { + "epoch": 0.36365, + "grad_norm": 0.09500919282436371, + "learning_rate": 3.7358090095736905e-05, + "loss": 0.0364, + "step": 73730 + }, + { + "epoch": 0.3637, + "grad_norm": 0.08789382874965668, + "learning_rate": 3.7354496614638405e-05, + "loss": 0.0377, + "step": 73740 + }, + { + "epoch": 0.36375, + "grad_norm": 0.09167249500751495, + "learning_rate": 3.73509027957726e-05, + "loss": 0.0361, + "step": 73750 + }, + { + "epoch": 0.3638, + "grad_norm": 0.11411943286657333, + "learning_rate": 3.734730863923776e-05, + "loss": 0.0375, + "step": 73760 + }, + { + "epoch": 0.36385, + "grad_norm": 0.09541446715593338, + "learning_rate": 3.734371414513213e-05, + "loss": 0.0369, + "step": 73770 + }, + { + "epoch": 0.3639, + "grad_norm": 0.08876598626375198, + "learning_rate": 3.7340119313554e-05, + "loss": 0.0387, + "step": 73780 + }, + { + "epoch": 0.36395, + "grad_norm": 0.10548962652683258, + "learning_rate": 3.733652414460164e-05, + "loss": 0.0377, + "step": 73790 + }, + { + "epoch": 0.364, + "grad_norm": 0.10141220688819885, + "learning_rate": 3.7332928638373346e-05, + "loss": 0.0376, + "step": 73800 + }, + { + "epoch": 0.36405, + "grad_norm": 0.09801637381315231, + "learning_rate": 3.7329332794967414e-05, + "loss": 0.0369, + "step": 73810 + }, + { + "epoch": 0.3641, + "grad_norm": 0.10043822973966599, + "learning_rate": 3.732573661448215e-05, + "loss": 0.0379, + "step": 73820 + }, + { + "epoch": 0.36415, + "grad_norm": 0.08055119961500168, + "learning_rate": 3.73221400970159e-05, + "loss": 0.0357, + "step": 73830 + }, + { + "epoch": 0.3642, + "grad_norm": 0.09515772014856339, + "learning_rate": 3.7318543242666946e-05, + "loss": 0.0351, + "step": 73840 + }, + { + "epoch": 0.36425, + "grad_norm": 0.09603604674339294, + "learning_rate": 3.731494605153366e-05, + "loss": 0.0359, + "step": 73850 + }, + { + "epoch": 0.3643, + "grad_norm": 0.09765617549419403, + "learning_rate": 3.731134852371436e-05, + "loss": 0.0365, + "step": 73860 + }, + { + "epoch": 0.36435, + "grad_norm": 0.09156297892332077, + "learning_rate": 3.730775065930744e-05, + "loss": 0.0361, + "step": 73870 + }, + { + "epoch": 0.3644, + "grad_norm": 0.08625298738479614, + "learning_rate": 3.7304152458411226e-05, + "loss": 0.0383, + "step": 73880 + }, + { + "epoch": 0.36445, + "grad_norm": 0.08274701237678528, + "learning_rate": 3.730055392112411e-05, + "loss": 0.0373, + "step": 73890 + }, + { + "epoch": 0.3645, + "grad_norm": 0.0882805809378624, + "learning_rate": 3.729695504754447e-05, + "loss": 0.0368, + "step": 73900 + }, + { + "epoch": 0.36455, + "grad_norm": 0.09052780270576477, + "learning_rate": 3.729335583777069e-05, + "loss": 0.0359, + "step": 73910 + }, + { + "epoch": 0.3646, + "grad_norm": 0.0941757932305336, + "learning_rate": 3.728975629190119e-05, + "loss": 0.0377, + "step": 73920 + }, + { + "epoch": 0.36465, + "grad_norm": 0.11285432428121567, + "learning_rate": 3.7286156410034374e-05, + "loss": 0.0397, + "step": 73930 + }, + { + "epoch": 0.3647, + "grad_norm": 0.09500918537378311, + "learning_rate": 3.7282556192268646e-05, + "loss": 0.0376, + "step": 73940 + }, + { + "epoch": 0.36475, + "grad_norm": 0.11643466353416443, + "learning_rate": 3.727895563870245e-05, + "loss": 0.0425, + "step": 73950 + }, + { + "epoch": 0.3648, + "grad_norm": 0.10746884346008301, + "learning_rate": 3.7275354749434226e-05, + "loss": 0.0405, + "step": 73960 + }, + { + "epoch": 0.36485, + "grad_norm": 0.11454451084136963, + "learning_rate": 3.727175352456241e-05, + "loss": 0.0391, + "step": 73970 + }, + { + "epoch": 0.3649, + "grad_norm": 0.10189974308013916, + "learning_rate": 3.726815196418546e-05, + "loss": 0.0396, + "step": 73980 + }, + { + "epoch": 0.36495, + "grad_norm": 0.08168889582157135, + "learning_rate": 3.7264550068401846e-05, + "loss": 0.0369, + "step": 73990 + }, + { + "epoch": 0.365, + "grad_norm": 0.09755031764507294, + "learning_rate": 3.726094783731004e-05, + "loss": 0.037, + "step": 74000 + }, + { + "epoch": 0.36505, + "grad_norm": 0.08322950452566147, + "learning_rate": 3.725734527100854e-05, + "loss": 0.0365, + "step": 74010 + }, + { + "epoch": 0.3651, + "grad_norm": 0.091013602912426, + "learning_rate": 3.725374236959581e-05, + "loss": 0.0378, + "step": 74020 + }, + { + "epoch": 0.36515, + "grad_norm": 0.09770730882883072, + "learning_rate": 3.725013913317037e-05, + "loss": 0.0369, + "step": 74030 + }, + { + "epoch": 0.3652, + "grad_norm": 0.07589123398065567, + "learning_rate": 3.7246535561830725e-05, + "loss": 0.0376, + "step": 74040 + }, + { + "epoch": 0.36525, + "grad_norm": 0.0765521451830864, + "learning_rate": 3.7242931655675404e-05, + "loss": 0.0363, + "step": 74050 + }, + { + "epoch": 0.3653, + "grad_norm": 0.07517808675765991, + "learning_rate": 3.7239327414802925e-05, + "loss": 0.0362, + "step": 74060 + }, + { + "epoch": 0.36535, + "grad_norm": 0.076322041451931, + "learning_rate": 3.7235722839311835e-05, + "loss": 0.0352, + "step": 74070 + }, + { + "epoch": 0.3654, + "grad_norm": 0.07294822484254837, + "learning_rate": 3.723211792930069e-05, + "loss": 0.0369, + "step": 74080 + }, + { + "epoch": 0.36545, + "grad_norm": 0.0938844084739685, + "learning_rate": 3.722851268486802e-05, + "loss": 0.0374, + "step": 74090 + }, + { + "epoch": 0.3655, + "grad_norm": 0.09001727402210236, + "learning_rate": 3.7224907106112414e-05, + "loss": 0.0372, + "step": 74100 + }, + { + "epoch": 0.36555, + "grad_norm": 0.0800582766532898, + "learning_rate": 3.722130119313245e-05, + "loss": 0.0419, + "step": 74110 + }, + { + "epoch": 0.3656, + "grad_norm": 0.07695163041353226, + "learning_rate": 3.7217694946026695e-05, + "loss": 0.0356, + "step": 74120 + }, + { + "epoch": 0.36565, + "grad_norm": 0.09566215425729752, + "learning_rate": 3.7214088364893744e-05, + "loss": 0.0403, + "step": 74130 + }, + { + "epoch": 0.3657, + "grad_norm": 0.07777071744203568, + "learning_rate": 3.7210481449832215e-05, + "loss": 0.0382, + "step": 74140 + }, + { + "epoch": 0.36575, + "grad_norm": 0.10269416868686676, + "learning_rate": 3.7206874200940705e-05, + "loss": 0.037, + "step": 74150 + }, + { + "epoch": 0.3658, + "grad_norm": 0.08395794779062271, + "learning_rate": 3.720326661831784e-05, + "loss": 0.0375, + "step": 74160 + }, + { + "epoch": 0.36585, + "grad_norm": 0.09718802571296692, + "learning_rate": 3.719965870206224e-05, + "loss": 0.0374, + "step": 74170 + }, + { + "epoch": 0.3659, + "grad_norm": 0.08130740374326706, + "learning_rate": 3.719605045227258e-05, + "loss": 0.0385, + "step": 74180 + }, + { + "epoch": 0.36595, + "grad_norm": 0.0812409371137619, + "learning_rate": 3.719244186904747e-05, + "loss": 0.0368, + "step": 74190 + }, + { + "epoch": 0.366, + "grad_norm": 0.0834796354174614, + "learning_rate": 3.7188832952485574e-05, + "loss": 0.0357, + "step": 74200 + }, + { + "epoch": 0.36605, + "grad_norm": 0.07309699803590775, + "learning_rate": 3.718522370268557e-05, + "loss": 0.0373, + "step": 74210 + }, + { + "epoch": 0.3661, + "grad_norm": 0.09182523936033249, + "learning_rate": 3.718161411974613e-05, + "loss": 0.0407, + "step": 74220 + }, + { + "epoch": 0.36615, + "grad_norm": 0.10953807830810547, + "learning_rate": 3.7178004203765925e-05, + "loss": 0.0371, + "step": 74230 + }, + { + "epoch": 0.3662, + "grad_norm": 0.08421969413757324, + "learning_rate": 3.7174393954843675e-05, + "loss": 0.0351, + "step": 74240 + }, + { + "epoch": 0.36625, + "grad_norm": 0.1005755364894867, + "learning_rate": 3.7170783373078054e-05, + "loss": 0.0353, + "step": 74250 + }, + { + "epoch": 0.3663, + "grad_norm": 0.08802418410778046, + "learning_rate": 3.7167172458567804e-05, + "loss": 0.0355, + "step": 74260 + }, + { + "epoch": 0.36635, + "grad_norm": 0.1034860908985138, + "learning_rate": 3.7163561211411615e-05, + "loss": 0.0378, + "step": 74270 + }, + { + "epoch": 0.3664, + "grad_norm": 0.0703473761677742, + "learning_rate": 3.715994963170824e-05, + "loss": 0.0371, + "step": 74280 + }, + { + "epoch": 0.36645, + "grad_norm": 0.09141998738050461, + "learning_rate": 3.71563377195564e-05, + "loss": 0.0381, + "step": 74290 + }, + { + "epoch": 0.3665, + "grad_norm": 0.07497433573007584, + "learning_rate": 3.715272547505487e-05, + "loss": 0.0347, + "step": 74300 + }, + { + "epoch": 0.36655, + "grad_norm": 0.08348723500967026, + "learning_rate": 3.714911289830238e-05, + "loss": 0.0344, + "step": 74310 + }, + { + "epoch": 0.3666, + "grad_norm": 0.0912565067410469, + "learning_rate": 3.71454999893977e-05, + "loss": 0.0344, + "step": 74320 + }, + { + "epoch": 0.36665, + "grad_norm": 0.07660182565450668, + "learning_rate": 3.714188674843963e-05, + "loss": 0.0346, + "step": 74330 + }, + { + "epoch": 0.3667, + "grad_norm": 0.076512411236763, + "learning_rate": 3.7138273175526934e-05, + "loss": 0.0361, + "step": 74340 + }, + { + "epoch": 0.36675, + "grad_norm": 0.07564838975667953, + "learning_rate": 3.71346592707584e-05, + "loss": 0.0355, + "step": 74350 + }, + { + "epoch": 0.3668, + "grad_norm": 0.08536522090435028, + "learning_rate": 3.713104503423285e-05, + "loss": 0.0356, + "step": 74360 + }, + { + "epoch": 0.36685, + "grad_norm": 0.06882265210151672, + "learning_rate": 3.712743046604908e-05, + "loss": 0.036, + "step": 74370 + }, + { + "epoch": 0.3669, + "grad_norm": 0.07784570008516312, + "learning_rate": 3.7123815566305926e-05, + "loss": 0.0358, + "step": 74380 + }, + { + "epoch": 0.36695, + "grad_norm": 0.08287815749645233, + "learning_rate": 3.712020033510221e-05, + "loss": 0.0359, + "step": 74390 + }, + { + "epoch": 0.367, + "grad_norm": 0.08039284497499466, + "learning_rate": 3.711658477253676e-05, + "loss": 0.0374, + "step": 74400 + }, + { + "epoch": 0.36705, + "grad_norm": 0.0996885672211647, + "learning_rate": 3.711296887870844e-05, + "loss": 0.0379, + "step": 74410 + }, + { + "epoch": 0.3671, + "grad_norm": 0.09353972226381302, + "learning_rate": 3.710935265371609e-05, + "loss": 0.0359, + "step": 74420 + }, + { + "epoch": 0.36715, + "grad_norm": 0.08280579000711441, + "learning_rate": 3.710573609765861e-05, + "loss": 0.0371, + "step": 74430 + }, + { + "epoch": 0.3672, + "grad_norm": 0.0952754020690918, + "learning_rate": 3.710211921063483e-05, + "loss": 0.0362, + "step": 74440 + }, + { + "epoch": 0.36725, + "grad_norm": 0.08442886918783188, + "learning_rate": 3.7098501992743675e-05, + "loss": 0.0359, + "step": 74450 + }, + { + "epoch": 0.3673, + "grad_norm": 0.09808932989835739, + "learning_rate": 3.709488444408401e-05, + "loss": 0.0375, + "step": 74460 + }, + { + "epoch": 0.36735, + "grad_norm": 0.10025553405284882, + "learning_rate": 3.7091266564754754e-05, + "loss": 0.0369, + "step": 74470 + }, + { + "epoch": 0.3674, + "grad_norm": 0.1075684130191803, + "learning_rate": 3.70876483548548e-05, + "loss": 0.0386, + "step": 74480 + }, + { + "epoch": 0.36745, + "grad_norm": 0.13738977909088135, + "learning_rate": 3.70840298144831e-05, + "loss": 0.0374, + "step": 74490 + }, + { + "epoch": 0.3675, + "grad_norm": 0.11115943640470505, + "learning_rate": 3.7080410943738555e-05, + "loss": 0.0374, + "step": 74500 + }, + { + "epoch": 0.36755, + "grad_norm": 0.12851819396018982, + "learning_rate": 3.7076791742720114e-05, + "loss": 0.0368, + "step": 74510 + }, + { + "epoch": 0.3676, + "grad_norm": 0.12918046116828918, + "learning_rate": 3.7073172211526725e-05, + "loss": 0.0366, + "step": 74520 + }, + { + "epoch": 0.36765, + "grad_norm": 0.10422967374324799, + "learning_rate": 3.706955235025734e-05, + "loss": 0.0358, + "step": 74530 + }, + { + "epoch": 0.3677, + "grad_norm": 0.10385806113481522, + "learning_rate": 3.706593215901093e-05, + "loss": 0.036, + "step": 74540 + }, + { + "epoch": 0.36775, + "grad_norm": 0.10638242214918137, + "learning_rate": 3.706231163788647e-05, + "loss": 0.0366, + "step": 74550 + }, + { + "epoch": 0.3678, + "grad_norm": 0.1260419636964798, + "learning_rate": 3.705869078698294e-05, + "loss": 0.0356, + "step": 74560 + }, + { + "epoch": 0.36785, + "grad_norm": 0.09771779179573059, + "learning_rate": 3.705506960639933e-05, + "loss": 0.0352, + "step": 74570 + }, + { + "epoch": 0.3679, + "grad_norm": 0.1114879623055458, + "learning_rate": 3.705144809623465e-05, + "loss": 0.037, + "step": 74580 + }, + { + "epoch": 0.36795, + "grad_norm": 0.11442957818508148, + "learning_rate": 3.70478262565879e-05, + "loss": 0.0374, + "step": 74590 + }, + { + "epoch": 0.368, + "grad_norm": 0.08828095346689224, + "learning_rate": 3.704420408755812e-05, + "loss": 0.0349, + "step": 74600 + }, + { + "epoch": 0.36805, + "grad_norm": 0.08895467966794968, + "learning_rate": 3.704058158924431e-05, + "loss": 0.0353, + "step": 74610 + }, + { + "epoch": 0.3681, + "grad_norm": 0.09710677713155746, + "learning_rate": 3.7036958761745535e-05, + "loss": 0.0371, + "step": 74620 + }, + { + "epoch": 0.36815, + "grad_norm": 0.09256210178136826, + "learning_rate": 3.7033335605160825e-05, + "loss": 0.0355, + "step": 74630 + }, + { + "epoch": 0.3682, + "grad_norm": 0.09806990623474121, + "learning_rate": 3.702971211958924e-05, + "loss": 0.0347, + "step": 74640 + }, + { + "epoch": 0.36825, + "grad_norm": 0.08915026485919952, + "learning_rate": 3.7026088305129845e-05, + "loss": 0.0368, + "step": 74650 + }, + { + "epoch": 0.3683, + "grad_norm": 0.07760807871818542, + "learning_rate": 3.702246416188171e-05, + "loss": 0.0352, + "step": 74660 + }, + { + "epoch": 0.36835, + "grad_norm": 0.08770003914833069, + "learning_rate": 3.701883968994392e-05, + "loss": 0.0349, + "step": 74670 + }, + { + "epoch": 0.3684, + "grad_norm": 0.09773367643356323, + "learning_rate": 3.7015214889415585e-05, + "loss": 0.037, + "step": 74680 + }, + { + "epoch": 0.36845, + "grad_norm": 0.10205940157175064, + "learning_rate": 3.701158976039577e-05, + "loss": 0.0359, + "step": 74690 + }, + { + "epoch": 0.3685, + "grad_norm": 0.09153249114751816, + "learning_rate": 3.7007964302983614e-05, + "loss": 0.0388, + "step": 74700 + }, + { + "epoch": 0.36855, + "grad_norm": 0.09360933303833008, + "learning_rate": 3.700433851727822e-05, + "loss": 0.0378, + "step": 74710 + }, + { + "epoch": 0.3686, + "grad_norm": 0.10425516963005066, + "learning_rate": 3.700071240337873e-05, + "loss": 0.0379, + "step": 74720 + }, + { + "epoch": 0.36865, + "grad_norm": 0.08516538888216019, + "learning_rate": 3.6997085961384256e-05, + "loss": 0.0351, + "step": 74730 + }, + { + "epoch": 0.3687, + "grad_norm": 0.0863916203379631, + "learning_rate": 3.699345919139397e-05, + "loss": 0.0354, + "step": 74740 + }, + { + "epoch": 0.36875, + "grad_norm": 0.08580009639263153, + "learning_rate": 3.6989832093507007e-05, + "loss": 0.0363, + "step": 74750 + }, + { + "epoch": 0.3688, + "grad_norm": 0.08505256474018097, + "learning_rate": 3.698620466782255e-05, + "loss": 0.0343, + "step": 74760 + }, + { + "epoch": 0.36885, + "grad_norm": 0.07350530475378036, + "learning_rate": 3.6982576914439756e-05, + "loss": 0.0355, + "step": 74770 + }, + { + "epoch": 0.3689, + "grad_norm": 0.08322486281394958, + "learning_rate": 3.6978948833457805e-05, + "loss": 0.0366, + "step": 74780 + }, + { + "epoch": 0.36895, + "grad_norm": 0.0836733728647232, + "learning_rate": 3.6975320424975904e-05, + "loss": 0.0356, + "step": 74790 + }, + { + "epoch": 0.369, + "grad_norm": 0.08269577473402023, + "learning_rate": 3.697169168909323e-05, + "loss": 0.0359, + "step": 74800 + }, + { + "epoch": 0.36905, + "grad_norm": 0.08767407387495041, + "learning_rate": 3.6968062625909005e-05, + "loss": 0.0352, + "step": 74810 + }, + { + "epoch": 0.3691, + "grad_norm": 0.07590517401695251, + "learning_rate": 3.696443323552244e-05, + "loss": 0.0389, + "step": 74820 + }, + { + "epoch": 0.36915, + "grad_norm": 0.08965615928173065, + "learning_rate": 3.696080351803278e-05, + "loss": 0.0361, + "step": 74830 + }, + { + "epoch": 0.3692, + "grad_norm": 0.08823589980602264, + "learning_rate": 3.6957173473539236e-05, + "loss": 0.0359, + "step": 74840 + }, + { + "epoch": 0.36925, + "grad_norm": 0.07415693253278732, + "learning_rate": 3.695354310214106e-05, + "loss": 0.0356, + "step": 74850 + }, + { + "epoch": 0.3693, + "grad_norm": 0.08243182301521301, + "learning_rate": 3.6949912403937507e-05, + "loss": 0.0353, + "step": 74860 + }, + { + "epoch": 0.36935, + "grad_norm": 0.08238965272903442, + "learning_rate": 3.694628137902785e-05, + "loss": 0.0361, + "step": 74870 + }, + { + "epoch": 0.3694, + "grad_norm": 0.07500246912240982, + "learning_rate": 3.694265002751133e-05, + "loss": 0.0349, + "step": 74880 + }, + { + "epoch": 0.36945, + "grad_norm": 0.09064202755689621, + "learning_rate": 3.693901834948726e-05, + "loss": 0.0375, + "step": 74890 + }, + { + "epoch": 0.3695, + "grad_norm": 0.0864512175321579, + "learning_rate": 3.6935386345054904e-05, + "loss": 0.0369, + "step": 74900 + }, + { + "epoch": 0.36955, + "grad_norm": 0.09356842935085297, + "learning_rate": 3.6931754014313575e-05, + "loss": 0.0358, + "step": 74910 + }, + { + "epoch": 0.3696, + "grad_norm": 0.08547563850879669, + "learning_rate": 3.6928121357362564e-05, + "loss": 0.0349, + "step": 74920 + }, + { + "epoch": 0.36965, + "grad_norm": 0.08710693567991257, + "learning_rate": 3.6924488374301206e-05, + "loss": 0.0366, + "step": 74930 + }, + { + "epoch": 0.3697, + "grad_norm": 0.08611670881509781, + "learning_rate": 3.692085506522881e-05, + "loss": 0.0356, + "step": 74940 + }, + { + "epoch": 0.36975, + "grad_norm": 0.0835421234369278, + "learning_rate": 3.691722143024472e-05, + "loss": 0.0355, + "step": 74950 + }, + { + "epoch": 0.3698, + "grad_norm": 0.07917075604200363, + "learning_rate": 3.691358746944827e-05, + "loss": 0.0355, + "step": 74960 + }, + { + "epoch": 0.36985, + "grad_norm": 0.08740229904651642, + "learning_rate": 3.690995318293882e-05, + "loss": 0.0361, + "step": 74970 + }, + { + "epoch": 0.3699, + "grad_norm": 0.09352272748947144, + "learning_rate": 3.690631857081572e-05, + "loss": 0.0366, + "step": 74980 + }, + { + "epoch": 0.36995, + "grad_norm": 0.10072267055511475, + "learning_rate": 3.690268363317834e-05, + "loss": 0.0367, + "step": 74990 + }, + { + "epoch": 0.37, + "grad_norm": 0.08957238495349884, + "learning_rate": 3.689904837012606e-05, + "loss": 0.0346, + "step": 75000 + }, + { + "epoch": 0.37005, + "grad_norm": 0.08622787892818451, + "learning_rate": 3.6895412781758276e-05, + "loss": 0.0365, + "step": 75010 + }, + { + "epoch": 0.3701, + "grad_norm": 0.11911823600530624, + "learning_rate": 3.689177686817437e-05, + "loss": 0.0388, + "step": 75020 + }, + { + "epoch": 0.37015, + "grad_norm": 0.11229342222213745, + "learning_rate": 3.688814062947375e-05, + "loss": 0.0372, + "step": 75030 + }, + { + "epoch": 0.3702, + "grad_norm": 0.10843716561794281, + "learning_rate": 3.688450406575584e-05, + "loss": 0.0356, + "step": 75040 + }, + { + "epoch": 0.37025, + "grad_norm": 0.09995651245117188, + "learning_rate": 3.688086717712004e-05, + "loss": 0.0355, + "step": 75050 + }, + { + "epoch": 0.3703, + "grad_norm": 0.09323858469724655, + "learning_rate": 3.6877229963665805e-05, + "loss": 0.0347, + "step": 75060 + }, + { + "epoch": 0.37035, + "grad_norm": 0.1024266853928566, + "learning_rate": 3.6873592425492564e-05, + "loss": 0.0363, + "step": 75070 + }, + { + "epoch": 0.3704, + "grad_norm": 0.11296865344047546, + "learning_rate": 3.686995456269977e-05, + "loss": 0.0348, + "step": 75080 + }, + { + "epoch": 0.37045, + "grad_norm": 0.087582528591156, + "learning_rate": 3.686631637538687e-05, + "loss": 0.0345, + "step": 75090 + }, + { + "epoch": 0.3705, + "grad_norm": 0.11643913388252258, + "learning_rate": 3.6862677863653345e-05, + "loss": 0.0372, + "step": 75100 + }, + { + "epoch": 0.37055, + "grad_norm": 0.08716107904911041, + "learning_rate": 3.685903902759866e-05, + "loss": 0.036, + "step": 75110 + }, + { + "epoch": 0.3706, + "grad_norm": 0.0931491106748581, + "learning_rate": 3.68553998673223e-05, + "loss": 0.0361, + "step": 75120 + }, + { + "epoch": 0.37065, + "grad_norm": 0.08382998406887054, + "learning_rate": 3.6851760382923764e-05, + "loss": 0.0344, + "step": 75130 + }, + { + "epoch": 0.3707, + "grad_norm": 0.069289930164814, + "learning_rate": 3.6848120574502555e-05, + "loss": 0.0376, + "step": 75140 + }, + { + "epoch": 0.37075, + "grad_norm": 0.08519791066646576, + "learning_rate": 3.684448044215817e-05, + "loss": 0.035, + "step": 75150 + }, + { + "epoch": 0.3708, + "grad_norm": 0.10274061560630798, + "learning_rate": 3.6840839985990154e-05, + "loss": 0.037, + "step": 75160 + }, + { + "epoch": 0.37085, + "grad_norm": 0.10534089058637619, + "learning_rate": 3.6837199206098015e-05, + "loss": 0.0396, + "step": 75170 + }, + { + "epoch": 0.3709, + "grad_norm": 0.09910701960325241, + "learning_rate": 3.683355810258129e-05, + "loss": 0.0362, + "step": 75180 + }, + { + "epoch": 0.37095, + "grad_norm": 0.07578156143426895, + "learning_rate": 3.682991667553954e-05, + "loss": 0.0359, + "step": 75190 + }, + { + "epoch": 0.371, + "grad_norm": 0.09559353440999985, + "learning_rate": 3.682627492507232e-05, + "loss": 0.0378, + "step": 75200 + }, + { + "epoch": 0.37105, + "grad_norm": 0.08370592445135117, + "learning_rate": 3.6822632851279174e-05, + "loss": 0.0363, + "step": 75210 + }, + { + "epoch": 0.3711, + "grad_norm": 0.12570084631443024, + "learning_rate": 3.68189904542597e-05, + "loss": 0.0376, + "step": 75220 + }, + { + "epoch": 0.37115, + "grad_norm": 0.07416093349456787, + "learning_rate": 3.681534773411345e-05, + "loss": 0.036, + "step": 75230 + }, + { + "epoch": 0.3712, + "grad_norm": 0.08501870185136795, + "learning_rate": 3.681170469094004e-05, + "loss": 0.0383, + "step": 75240 + }, + { + "epoch": 0.37125, + "grad_norm": 0.08055275678634644, + "learning_rate": 3.680806132483906e-05, + "loss": 0.0416, + "step": 75250 + }, + { + "epoch": 0.3713, + "grad_norm": 0.08922593295574188, + "learning_rate": 3.6804417635910123e-05, + "loss": 0.0391, + "step": 75260 + }, + { + "epoch": 0.37135, + "grad_norm": 0.11191854625940323, + "learning_rate": 3.680077362425284e-05, + "loss": 0.0388, + "step": 75270 + }, + { + "epoch": 0.3714, + "grad_norm": 0.08514377474784851, + "learning_rate": 3.6797129289966835e-05, + "loss": 0.0373, + "step": 75280 + }, + { + "epoch": 0.37145, + "grad_norm": 0.09239083528518677, + "learning_rate": 3.679348463315176e-05, + "loss": 0.0364, + "step": 75290 + }, + { + "epoch": 0.3715, + "grad_norm": 0.0984504297375679, + "learning_rate": 3.678983965390723e-05, + "loss": 0.0366, + "step": 75300 + }, + { + "epoch": 0.37155, + "grad_norm": 0.08384273201227188, + "learning_rate": 3.678619435233292e-05, + "loss": 0.0368, + "step": 75310 + }, + { + "epoch": 0.3716, + "grad_norm": 0.07963405549526215, + "learning_rate": 3.6782548728528485e-05, + "loss": 0.0394, + "step": 75320 + }, + { + "epoch": 0.37165, + "grad_norm": 0.10861895978450775, + "learning_rate": 3.6778902782593594e-05, + "loss": 0.0361, + "step": 75330 + }, + { + "epoch": 0.3717, + "grad_norm": 0.10098570585250854, + "learning_rate": 3.6775256514627925e-05, + "loss": 0.0377, + "step": 75340 + }, + { + "epoch": 0.37175, + "grad_norm": 0.08465471863746643, + "learning_rate": 3.677160992473117e-05, + "loss": 0.0385, + "step": 75350 + }, + { + "epoch": 0.3718, + "grad_norm": 0.10758759826421738, + "learning_rate": 3.676796301300302e-05, + "loss": 0.0388, + "step": 75360 + }, + { + "epoch": 0.37185, + "grad_norm": 0.08927234262228012, + "learning_rate": 3.676431577954318e-05, + "loss": 0.0363, + "step": 75370 + }, + { + "epoch": 0.3719, + "grad_norm": 0.07932307571172714, + "learning_rate": 3.6760668224451365e-05, + "loss": 0.0385, + "step": 75380 + }, + { + "epoch": 0.37195, + "grad_norm": 0.08853064477443695, + "learning_rate": 3.675702034782731e-05, + "loss": 0.0363, + "step": 75390 + }, + { + "epoch": 0.372, + "grad_norm": 0.09992717951536179, + "learning_rate": 3.675337214977073e-05, + "loss": 0.0369, + "step": 75400 + }, + { + "epoch": 0.37205, + "grad_norm": 0.13069744408130646, + "learning_rate": 3.674972363038137e-05, + "loss": 0.0369, + "step": 75410 + }, + { + "epoch": 0.3721, + "grad_norm": 0.11567940562963486, + "learning_rate": 3.674607478975898e-05, + "loss": 0.0364, + "step": 75420 + }, + { + "epoch": 0.37215, + "grad_norm": 0.08629253506660461, + "learning_rate": 3.6742425628003316e-05, + "loss": 0.0361, + "step": 75430 + }, + { + "epoch": 0.3722, + "grad_norm": 0.0733034536242485, + "learning_rate": 3.673877614521414e-05, + "loss": 0.0361, + "step": 75440 + }, + { + "epoch": 0.37225, + "grad_norm": 0.08365897089242935, + "learning_rate": 3.6735126341491244e-05, + "loss": 0.0363, + "step": 75450 + }, + { + "epoch": 0.3723, + "grad_norm": 0.0908847376704216, + "learning_rate": 3.67314762169344e-05, + "loss": 0.0353, + "step": 75460 + }, + { + "epoch": 0.37235, + "grad_norm": 0.10139103978872299, + "learning_rate": 3.672782577164341e-05, + "loss": 0.0346, + "step": 75470 + }, + { + "epoch": 0.3724, + "grad_norm": 0.09700726717710495, + "learning_rate": 3.672417500571806e-05, + "loss": 0.0357, + "step": 75480 + }, + { + "epoch": 0.37245, + "grad_norm": 0.09164400398731232, + "learning_rate": 3.672052391925817e-05, + "loss": 0.038, + "step": 75490 + }, + { + "epoch": 0.3725, + "grad_norm": 0.1114017441868782, + "learning_rate": 3.6716872512363566e-05, + "loss": 0.0364, + "step": 75500 + }, + { + "epoch": 0.37255, + "grad_norm": 0.09311135858297348, + "learning_rate": 3.6713220785134064e-05, + "loss": 0.0351, + "step": 75510 + }, + { + "epoch": 0.3726, + "grad_norm": 0.09030146896839142, + "learning_rate": 3.6709568737669505e-05, + "loss": 0.0376, + "step": 75520 + }, + { + "epoch": 0.37265, + "grad_norm": 0.08873016387224197, + "learning_rate": 3.670591637006974e-05, + "loss": 0.0362, + "step": 75530 + }, + { + "epoch": 0.3727, + "grad_norm": 0.10129522532224655, + "learning_rate": 3.6702263682434626e-05, + "loss": 0.0371, + "step": 75540 + }, + { + "epoch": 0.37275, + "grad_norm": 0.08039398491382599, + "learning_rate": 3.6698610674864e-05, + "loss": 0.037, + "step": 75550 + }, + { + "epoch": 0.3728, + "grad_norm": 0.0955590084195137, + "learning_rate": 3.669495734745777e-05, + "loss": 0.0357, + "step": 75560 + }, + { + "epoch": 0.37285, + "grad_norm": 0.08611071854829788, + "learning_rate": 3.6691303700315796e-05, + "loss": 0.0356, + "step": 75570 + }, + { + "epoch": 0.3729, + "grad_norm": 0.08073154091835022, + "learning_rate": 3.6687649733537964e-05, + "loss": 0.0344, + "step": 75580 + }, + { + "epoch": 0.37295, + "grad_norm": 0.10840560495853424, + "learning_rate": 3.668399544722418e-05, + "loss": 0.0367, + "step": 75590 + }, + { + "epoch": 0.373, + "grad_norm": 0.07460062950849533, + "learning_rate": 3.668034084147436e-05, + "loss": 0.0363, + "step": 75600 + }, + { + "epoch": 0.37305, + "grad_norm": 0.08653230220079422, + "learning_rate": 3.667668591638841e-05, + "loss": 0.0346, + "step": 75610 + }, + { + "epoch": 0.3731, + "grad_norm": 0.08363914489746094, + "learning_rate": 3.6673030672066245e-05, + "loss": 0.0363, + "step": 75620 + }, + { + "epoch": 0.37315, + "grad_norm": 0.08001896739006042, + "learning_rate": 3.666937510860781e-05, + "loss": 0.0354, + "step": 75630 + }, + { + "epoch": 0.3732, + "grad_norm": 0.10220906883478165, + "learning_rate": 3.6665719226113035e-05, + "loss": 0.0364, + "step": 75640 + }, + { + "epoch": 0.37325, + "grad_norm": 0.07813125103712082, + "learning_rate": 3.666206302468189e-05, + "loss": 0.0371, + "step": 75650 + }, + { + "epoch": 0.3733, + "grad_norm": 0.08085132390260696, + "learning_rate": 3.6658406504414325e-05, + "loss": 0.0379, + "step": 75660 + }, + { + "epoch": 0.37335, + "grad_norm": 0.10213213413953781, + "learning_rate": 3.66547496654103e-05, + "loss": 0.0393, + "step": 75670 + }, + { + "epoch": 0.3734, + "grad_norm": 0.10202323645353317, + "learning_rate": 3.66510925077698e-05, + "loss": 0.0374, + "step": 75680 + }, + { + "epoch": 0.37345, + "grad_norm": 0.08972320705652237, + "learning_rate": 3.6647435031592804e-05, + "loss": 0.0372, + "step": 75690 + }, + { + "epoch": 0.3735, + "grad_norm": 0.10474187880754471, + "learning_rate": 3.6643777236979314e-05, + "loss": 0.0379, + "step": 75700 + }, + { + "epoch": 0.37355, + "grad_norm": 0.10356242209672928, + "learning_rate": 3.664011912402933e-05, + "loss": 0.0379, + "step": 75710 + }, + { + "epoch": 0.3736, + "grad_norm": 0.11297213286161423, + "learning_rate": 3.6636460692842855e-05, + "loss": 0.0382, + "step": 75720 + }, + { + "epoch": 0.37365, + "grad_norm": 0.10405473411083221, + "learning_rate": 3.663280194351992e-05, + "loss": 0.0365, + "step": 75730 + }, + { + "epoch": 0.3737, + "grad_norm": 0.07534322887659073, + "learning_rate": 3.6629142876160546e-05, + "loss": 0.0388, + "step": 75740 + }, + { + "epoch": 0.37375, + "grad_norm": 0.0884098932147026, + "learning_rate": 3.662548349086478e-05, + "loss": 0.0369, + "step": 75750 + }, + { + "epoch": 0.3738, + "grad_norm": 0.08216243982315063, + "learning_rate": 3.662182378773267e-05, + "loss": 0.0348, + "step": 75760 + }, + { + "epoch": 0.37385, + "grad_norm": 0.07160743325948715, + "learning_rate": 3.661816376686425e-05, + "loss": 0.0363, + "step": 75770 + }, + { + "epoch": 0.3739, + "grad_norm": 0.08498039096593857, + "learning_rate": 3.6614503428359606e-05, + "loss": 0.0339, + "step": 75780 + }, + { + "epoch": 0.37395, + "grad_norm": 0.09113490581512451, + "learning_rate": 3.66108427723188e-05, + "loss": 0.0359, + "step": 75790 + }, + { + "epoch": 0.374, + "grad_norm": 0.08843359351158142, + "learning_rate": 3.660718179884191e-05, + "loss": 0.0351, + "step": 75800 + }, + { + "epoch": 0.37405, + "grad_norm": 0.09002623707056046, + "learning_rate": 3.660352050802904e-05, + "loss": 0.0373, + "step": 75810 + }, + { + "epoch": 0.3741, + "grad_norm": 0.10079097002744675, + "learning_rate": 3.6599858899980265e-05, + "loss": 0.0362, + "step": 75820 + }, + { + "epoch": 0.37415, + "grad_norm": 0.07080857455730438, + "learning_rate": 3.6596196974795714e-05, + "loss": 0.0348, + "step": 75830 + }, + { + "epoch": 0.3742, + "grad_norm": 0.0856006070971489, + "learning_rate": 3.65925347325755e-05, + "loss": 0.0363, + "step": 75840 + }, + { + "epoch": 0.37425, + "grad_norm": 0.07604727894067764, + "learning_rate": 3.658887217341973e-05, + "loss": 0.0351, + "step": 75850 + }, + { + "epoch": 0.3743, + "grad_norm": 0.08135710656642914, + "learning_rate": 3.658520929742855e-05, + "loss": 0.0351, + "step": 75860 + }, + { + "epoch": 0.37435, + "grad_norm": 0.07487417757511139, + "learning_rate": 3.658154610470211e-05, + "loss": 0.0346, + "step": 75870 + }, + { + "epoch": 0.3744, + "grad_norm": 0.08480235934257507, + "learning_rate": 3.657788259534054e-05, + "loss": 0.0359, + "step": 75880 + }, + { + "epoch": 0.37445, + "grad_norm": 0.07889080792665482, + "learning_rate": 3.657421876944401e-05, + "loss": 0.0352, + "step": 75890 + }, + { + "epoch": 0.3745, + "grad_norm": 0.10762711614370346, + "learning_rate": 3.6570554627112693e-05, + "loss": 0.037, + "step": 75900 + }, + { + "epoch": 0.37455, + "grad_norm": 0.11260473728179932, + "learning_rate": 3.656689016844676e-05, + "loss": 0.0369, + "step": 75910 + }, + { + "epoch": 0.3746, + "grad_norm": 0.08627483248710632, + "learning_rate": 3.656322539354639e-05, + "loss": 0.0376, + "step": 75920 + }, + { + "epoch": 0.37465, + "grad_norm": 0.08173470199108124, + "learning_rate": 3.6559560302511785e-05, + "loss": 0.0355, + "step": 75930 + }, + { + "epoch": 0.3747, + "grad_norm": 0.07806728780269623, + "learning_rate": 3.655589489544314e-05, + "loss": 0.0355, + "step": 75940 + }, + { + "epoch": 0.37475, + "grad_norm": 0.11050267517566681, + "learning_rate": 3.655222917244068e-05, + "loss": 0.0366, + "step": 75950 + }, + { + "epoch": 0.3748, + "grad_norm": 0.08523990213871002, + "learning_rate": 3.65485631336046e-05, + "loss": 0.0359, + "step": 75960 + }, + { + "epoch": 0.37485, + "grad_norm": 0.08433665335178375, + "learning_rate": 3.6544896779035154e-05, + "loss": 0.037, + "step": 75970 + }, + { + "epoch": 0.3749, + "grad_norm": 0.09047384560108185, + "learning_rate": 3.654123010883256e-05, + "loss": 0.038, + "step": 75980 + }, + { + "epoch": 0.37495, + "grad_norm": 0.08714261651039124, + "learning_rate": 3.6537563123097075e-05, + "loss": 0.037, + "step": 75990 + }, + { + "epoch": 0.375, + "grad_norm": 0.08978958427906036, + "learning_rate": 3.653389582192895e-05, + "loss": 0.0396, + "step": 76000 + }, + { + "epoch": 0.37505, + "grad_norm": 0.11172284185886383, + "learning_rate": 3.653022820542844e-05, + "loss": 0.0387, + "step": 76010 + }, + { + "epoch": 0.3751, + "grad_norm": 0.08433420956134796, + "learning_rate": 3.652656027369583e-05, + "loss": 0.0369, + "step": 76020 + }, + { + "epoch": 0.37515, + "grad_norm": 0.09933721274137497, + "learning_rate": 3.652289202683138e-05, + "loss": 0.0372, + "step": 76030 + }, + { + "epoch": 0.3752, + "grad_norm": 0.09172473847866058, + "learning_rate": 3.6519223464935406e-05, + "loss": 0.0365, + "step": 76040 + }, + { + "epoch": 0.37525, + "grad_norm": 0.08624282479286194, + "learning_rate": 3.651555458810818e-05, + "loss": 0.0371, + "step": 76050 + }, + { + "epoch": 0.3753, + "grad_norm": 0.10207632929086685, + "learning_rate": 3.651188539645002e-05, + "loss": 0.0376, + "step": 76060 + }, + { + "epoch": 0.37535, + "grad_norm": 0.08815249055624008, + "learning_rate": 3.650821589006124e-05, + "loss": 0.0369, + "step": 76070 + }, + { + "epoch": 0.3754, + "grad_norm": 0.08788152039051056, + "learning_rate": 3.650454606904216e-05, + "loss": 0.0383, + "step": 76080 + }, + { + "epoch": 0.37545, + "grad_norm": 0.08695410192012787, + "learning_rate": 3.650087593349311e-05, + "loss": 0.0359, + "step": 76090 + }, + { + "epoch": 0.3755, + "grad_norm": 0.08386833220720291, + "learning_rate": 3.649720548351444e-05, + "loss": 0.0371, + "step": 76100 + }, + { + "epoch": 0.37555, + "grad_norm": 0.13007734715938568, + "learning_rate": 3.649353471920649e-05, + "loss": 0.0388, + "step": 76110 + }, + { + "epoch": 0.3756, + "grad_norm": 0.07977231591939926, + "learning_rate": 3.648986364066962e-05, + "loss": 0.0389, + "step": 76120 + }, + { + "epoch": 0.37565, + "grad_norm": 0.08852393180131912, + "learning_rate": 3.648619224800419e-05, + "loss": 0.0372, + "step": 76130 + }, + { + "epoch": 0.3757, + "grad_norm": 0.09893860667943954, + "learning_rate": 3.648252054131057e-05, + "loss": 0.0362, + "step": 76140 + }, + { + "epoch": 0.37575, + "grad_norm": 0.0855567455291748, + "learning_rate": 3.647884852068916e-05, + "loss": 0.0367, + "step": 76150 + }, + { + "epoch": 0.3758, + "grad_norm": 0.07578399777412415, + "learning_rate": 3.647517618624035e-05, + "loss": 0.0392, + "step": 76160 + }, + { + "epoch": 0.37585, + "grad_norm": 0.07223352044820786, + "learning_rate": 3.6471503538064527e-05, + "loss": 0.0358, + "step": 76170 + }, + { + "epoch": 0.3759, + "grad_norm": 0.07542915642261505, + "learning_rate": 3.6467830576262114e-05, + "loss": 0.0364, + "step": 76180 + }, + { + "epoch": 0.37595, + "grad_norm": 0.07215103507041931, + "learning_rate": 3.646415730093352e-05, + "loss": 0.0364, + "step": 76190 + }, + { + "epoch": 0.376, + "grad_norm": 0.08985480666160583, + "learning_rate": 3.6460483712179164e-05, + "loss": 0.0364, + "step": 76200 + }, + { + "epoch": 0.37605, + "grad_norm": 0.09463430196046829, + "learning_rate": 3.645680981009949e-05, + "loss": 0.0366, + "step": 76210 + }, + { + "epoch": 0.3761, + "grad_norm": 0.08468139916658401, + "learning_rate": 3.645313559479495e-05, + "loss": 0.0355, + "step": 76220 + }, + { + "epoch": 0.37615, + "grad_norm": 0.10156545788049698, + "learning_rate": 3.644946106636598e-05, + "loss": 0.042, + "step": 76230 + }, + { + "epoch": 0.3762, + "grad_norm": 0.09742777049541473, + "learning_rate": 3.6445786224913036e-05, + "loss": 0.0375, + "step": 76240 + }, + { + "epoch": 0.37625, + "grad_norm": 0.08506327122449875, + "learning_rate": 3.644211107053661e-05, + "loss": 0.0359, + "step": 76250 + }, + { + "epoch": 0.3763, + "grad_norm": 0.08412918448448181, + "learning_rate": 3.643843560333716e-05, + "loss": 0.0355, + "step": 76260 + }, + { + "epoch": 0.37635, + "grad_norm": 0.09755247086286545, + "learning_rate": 3.643475982341518e-05, + "loss": 0.0368, + "step": 76270 + }, + { + "epoch": 0.3764, + "grad_norm": 0.08067236840724945, + "learning_rate": 3.6431083730871165e-05, + "loss": 0.037, + "step": 76280 + }, + { + "epoch": 0.37645, + "grad_norm": 0.09624320268630981, + "learning_rate": 3.6427407325805615e-05, + "loss": 0.0379, + "step": 76290 + }, + { + "epoch": 0.3765, + "grad_norm": 0.07825248688459396, + "learning_rate": 3.6423730608319036e-05, + "loss": 0.0385, + "step": 76300 + }, + { + "epoch": 0.37655, + "grad_norm": 0.08972030133008957, + "learning_rate": 3.642005357851196e-05, + "loss": 0.0359, + "step": 76310 + }, + { + "epoch": 0.3766, + "grad_norm": 0.10110962390899658, + "learning_rate": 3.64163762364849e-05, + "loss": 0.0378, + "step": 76320 + }, + { + "epoch": 0.37665, + "grad_norm": 0.09208884090185165, + "learning_rate": 3.641269858233841e-05, + "loss": 0.0372, + "step": 76330 + }, + { + "epoch": 0.3767, + "grad_norm": 0.07630440592765808, + "learning_rate": 3.6409020616173024e-05, + "loss": 0.0361, + "step": 76340 + }, + { + "epoch": 0.37675, + "grad_norm": 0.08968579024076462, + "learning_rate": 3.640534233808931e-05, + "loss": 0.0359, + "step": 76350 + }, + { + "epoch": 0.3768, + "grad_norm": 0.07915177196264267, + "learning_rate": 3.640166374818781e-05, + "loss": 0.0371, + "step": 76360 + }, + { + "epoch": 0.37685, + "grad_norm": 0.08181800693273544, + "learning_rate": 3.6397984846569114e-05, + "loss": 0.0369, + "step": 76370 + }, + { + "epoch": 0.3769, + "grad_norm": 0.08259347826242447, + "learning_rate": 3.639430563333379e-05, + "loss": 0.035, + "step": 76380 + }, + { + "epoch": 0.37695, + "grad_norm": 0.07583151012659073, + "learning_rate": 3.639062610858243e-05, + "loss": 0.0366, + "step": 76390 + }, + { + "epoch": 0.377, + "grad_norm": 0.08552539348602295, + "learning_rate": 3.6386946272415636e-05, + "loss": 0.0366, + "step": 76400 + }, + { + "epoch": 0.37705, + "grad_norm": 0.09559700638055801, + "learning_rate": 3.638326612493401e-05, + "loss": 0.0376, + "step": 76410 + }, + { + "epoch": 0.3771, + "grad_norm": 0.08619485050439835, + "learning_rate": 3.637958566623816e-05, + "loss": 0.037, + "step": 76420 + }, + { + "epoch": 0.37715, + "grad_norm": 0.080460324883461, + "learning_rate": 3.637590489642871e-05, + "loss": 0.0371, + "step": 76430 + }, + { + "epoch": 0.3772, + "grad_norm": 0.07482301443815231, + "learning_rate": 3.63722238156063e-05, + "loss": 0.0361, + "step": 76440 + }, + { + "epoch": 0.37725, + "grad_norm": 0.08932841569185257, + "learning_rate": 3.636854242387156e-05, + "loss": 0.0375, + "step": 76450 + }, + { + "epoch": 0.3773, + "grad_norm": 0.10034222155809402, + "learning_rate": 3.6364860721325145e-05, + "loss": 0.0365, + "step": 76460 + }, + { + "epoch": 0.37735, + "grad_norm": 0.11923787742853165, + "learning_rate": 3.6361178708067705e-05, + "loss": 0.039, + "step": 76470 + }, + { + "epoch": 0.3774, + "grad_norm": 0.09061778336763382, + "learning_rate": 3.635749638419991e-05, + "loss": 0.0363, + "step": 76480 + }, + { + "epoch": 0.37745, + "grad_norm": 0.08245281875133514, + "learning_rate": 3.6353813749822425e-05, + "loss": 0.0385, + "step": 76490 + }, + { + "epoch": 0.3775, + "grad_norm": 0.0959947481751442, + "learning_rate": 3.635013080503594e-05, + "loss": 0.0379, + "step": 76500 + }, + { + "epoch": 0.37755, + "grad_norm": 0.08744388073682785, + "learning_rate": 3.6346447549941145e-05, + "loss": 0.0365, + "step": 76510 + }, + { + "epoch": 0.3776, + "grad_norm": 0.0963020771741867, + "learning_rate": 3.634276398463873e-05, + "loss": 0.0378, + "step": 76520 + }, + { + "epoch": 0.37765, + "grad_norm": 0.09702512621879578, + "learning_rate": 3.633908010922941e-05, + "loss": 0.0392, + "step": 76530 + }, + { + "epoch": 0.3777, + "grad_norm": 0.09394404292106628, + "learning_rate": 3.6335395923813906e-05, + "loss": 0.0381, + "step": 76540 + }, + { + "epoch": 0.37775, + "grad_norm": 0.100669726729393, + "learning_rate": 3.6331711428492934e-05, + "loss": 0.0378, + "step": 76550 + }, + { + "epoch": 0.3778, + "grad_norm": 0.10889852792024612, + "learning_rate": 3.6328026623367236e-05, + "loss": 0.0382, + "step": 76560 + }, + { + "epoch": 0.37785, + "grad_norm": 0.09627027064561844, + "learning_rate": 3.6324341508537534e-05, + "loss": 0.0389, + "step": 76570 + }, + { + "epoch": 0.3779, + "grad_norm": 0.11300645023584366, + "learning_rate": 3.632065608410459e-05, + "loss": 0.037, + "step": 76580 + }, + { + "epoch": 0.37795, + "grad_norm": 0.09655319899320602, + "learning_rate": 3.631697035016917e-05, + "loss": 0.0367, + "step": 76590 + }, + { + "epoch": 0.378, + "grad_norm": 0.09337843954563141, + "learning_rate": 3.631328430683203e-05, + "loss": 0.0378, + "step": 76600 + }, + { + "epoch": 0.37805, + "grad_norm": 0.08331812173128128, + "learning_rate": 3.630959795419394e-05, + "loss": 0.0402, + "step": 76610 + }, + { + "epoch": 0.3781, + "grad_norm": 0.08409467339515686, + "learning_rate": 3.6305911292355696e-05, + "loss": 0.0361, + "step": 76620 + }, + { + "epoch": 0.37815, + "grad_norm": 0.08916690200567245, + "learning_rate": 3.630222432141808e-05, + "loss": 0.0381, + "step": 76630 + }, + { + "epoch": 0.3782, + "grad_norm": 0.07716446369886398, + "learning_rate": 3.6298537041481907e-05, + "loss": 0.0352, + "step": 76640 + }, + { + "epoch": 0.37825, + "grad_norm": 0.07186071574687958, + "learning_rate": 3.629484945264797e-05, + "loss": 0.0353, + "step": 76650 + }, + { + "epoch": 0.3783, + "grad_norm": 0.09080082923173904, + "learning_rate": 3.629116155501709e-05, + "loss": 0.0341, + "step": 76660 + }, + { + "epoch": 0.37835, + "grad_norm": 0.10306417942047119, + "learning_rate": 3.628747334869009e-05, + "loss": 0.0353, + "step": 76670 + }, + { + "epoch": 0.3784, + "grad_norm": 0.0900692418217659, + "learning_rate": 3.6283784833767824e-05, + "loss": 0.0354, + "step": 76680 + }, + { + "epoch": 0.37845, + "grad_norm": 0.0794641375541687, + "learning_rate": 3.628009601035111e-05, + "loss": 0.0344, + "step": 76690 + }, + { + "epoch": 0.3785, + "grad_norm": 0.0730503723025322, + "learning_rate": 3.627640687854081e-05, + "loss": 0.0338, + "step": 76700 + }, + { + "epoch": 0.37855, + "grad_norm": 0.07156781852245331, + "learning_rate": 3.627271743843779e-05, + "loss": 0.0344, + "step": 76710 + }, + { + "epoch": 0.3786, + "grad_norm": 0.07406982779502869, + "learning_rate": 3.62690276901429e-05, + "loss": 0.0355, + "step": 76720 + }, + { + "epoch": 0.37865, + "grad_norm": 0.06705506891012192, + "learning_rate": 3.626533763375703e-05, + "loss": 0.0375, + "step": 76730 + }, + { + "epoch": 0.3787, + "grad_norm": 0.08933977782726288, + "learning_rate": 3.626164726938106e-05, + "loss": 0.0367, + "step": 76740 + }, + { + "epoch": 0.37875, + "grad_norm": 0.0913522019982338, + "learning_rate": 3.625795659711589e-05, + "loss": 0.035, + "step": 76750 + }, + { + "epoch": 0.3788, + "grad_norm": 0.08894678950309753, + "learning_rate": 3.625426561706241e-05, + "loss": 0.0366, + "step": 76760 + }, + { + "epoch": 0.37885, + "grad_norm": 0.08685548603534698, + "learning_rate": 3.6250574329321535e-05, + "loss": 0.0356, + "step": 76770 + }, + { + "epoch": 0.3789, + "grad_norm": 0.09826608002185822, + "learning_rate": 3.624688273399419e-05, + "loss": 0.0363, + "step": 76780 + }, + { + "epoch": 0.37895, + "grad_norm": 0.08041828870773315, + "learning_rate": 3.624319083118129e-05, + "loss": 0.0363, + "step": 76790 + }, + { + "epoch": 0.379, + "grad_norm": 0.10084494203329086, + "learning_rate": 3.623949862098378e-05, + "loss": 0.0352, + "step": 76800 + }, + { + "epoch": 0.37905, + "grad_norm": 0.08826006203889847, + "learning_rate": 3.623580610350261e-05, + "loss": 0.0351, + "step": 76810 + }, + { + "epoch": 0.3791, + "grad_norm": 0.0710434690117836, + "learning_rate": 3.623211327883871e-05, + "loss": 0.0352, + "step": 76820 + }, + { + "epoch": 0.37915, + "grad_norm": 0.07930386066436768, + "learning_rate": 3.622842014709305e-05, + "loss": 0.0338, + "step": 76830 + }, + { + "epoch": 0.3792, + "grad_norm": 0.10032349079847336, + "learning_rate": 3.622472670836661e-05, + "loss": 0.0365, + "step": 76840 + }, + { + "epoch": 0.37925, + "grad_norm": 0.08352944254875183, + "learning_rate": 3.6221032962760354e-05, + "loss": 0.0355, + "step": 76850 + }, + { + "epoch": 0.3793, + "grad_norm": 0.10487186908721924, + "learning_rate": 3.621733891037527e-05, + "loss": 0.0367, + "step": 76860 + }, + { + "epoch": 0.37935, + "grad_norm": 0.08631854504346848, + "learning_rate": 3.621364455131236e-05, + "loss": 0.036, + "step": 76870 + }, + { + "epoch": 0.3794, + "grad_norm": 0.08653370290994644, + "learning_rate": 3.620994988567262e-05, + "loss": 0.0375, + "step": 76880 + }, + { + "epoch": 0.37945, + "grad_norm": 0.08340851217508316, + "learning_rate": 3.6206254913557065e-05, + "loss": 0.0372, + "step": 76890 + }, + { + "epoch": 0.3795, + "grad_norm": 0.09805815666913986, + "learning_rate": 3.62025596350667e-05, + "loss": 0.0376, + "step": 76900 + }, + { + "epoch": 0.37955, + "grad_norm": 0.08484770357608795, + "learning_rate": 3.6198864050302574e-05, + "loss": 0.0376, + "step": 76910 + }, + { + "epoch": 0.3796, + "grad_norm": 0.07439166307449341, + "learning_rate": 3.61951681593657e-05, + "loss": 0.0396, + "step": 76920 + }, + { + "epoch": 0.37965, + "grad_norm": 0.09397024661302567, + "learning_rate": 3.619147196235715e-05, + "loss": 0.0349, + "step": 76930 + }, + { + "epoch": 0.3797, + "grad_norm": 0.09400299191474915, + "learning_rate": 3.618777545937795e-05, + "loss": 0.0387, + "step": 76940 + }, + { + "epoch": 0.37975, + "grad_norm": 0.06988395750522614, + "learning_rate": 3.6184078650529175e-05, + "loss": 0.0357, + "step": 76950 + }, + { + "epoch": 0.3798, + "grad_norm": 0.0905274897813797, + "learning_rate": 3.618038153591189e-05, + "loss": 0.0394, + "step": 76960 + }, + { + "epoch": 0.37985, + "grad_norm": 0.1146807000041008, + "learning_rate": 3.617668411562717e-05, + "loss": 0.037, + "step": 76970 + }, + { + "epoch": 0.3799, + "grad_norm": 0.08603715151548386, + "learning_rate": 3.617298638977611e-05, + "loss": 0.0379, + "step": 76980 + }, + { + "epoch": 0.37995, + "grad_norm": 0.08420335501432419, + "learning_rate": 3.61692883584598e-05, + "loss": 0.0379, + "step": 76990 + }, + { + "epoch": 0.38, + "grad_norm": 0.08034734427928925, + "learning_rate": 3.616559002177935e-05, + "loss": 0.0377, + "step": 77000 + }, + { + "epoch": 0.38005, + "grad_norm": 0.1009376272559166, + "learning_rate": 3.616189137983586e-05, + "loss": 0.0381, + "step": 77010 + }, + { + "epoch": 0.3801, + "grad_norm": 0.0851750373840332, + "learning_rate": 3.6158192432730444e-05, + "loss": 0.0357, + "step": 77020 + }, + { + "epoch": 0.38015, + "grad_norm": 0.09277315437793732, + "learning_rate": 3.615449318056424e-05, + "loss": 0.0379, + "step": 77030 + }, + { + "epoch": 0.3802, + "grad_norm": 0.07979048788547516, + "learning_rate": 3.615079362343839e-05, + "loss": 0.0365, + "step": 77040 + }, + { + "epoch": 0.38025, + "grad_norm": 0.09772320091724396, + "learning_rate": 3.614709376145402e-05, + "loss": 0.0372, + "step": 77050 + }, + { + "epoch": 0.3803, + "grad_norm": 0.07867799699306488, + "learning_rate": 3.614339359471231e-05, + "loss": 0.0368, + "step": 77060 + }, + { + "epoch": 0.38035, + "grad_norm": 0.08486375212669373, + "learning_rate": 3.61396931233144e-05, + "loss": 0.0365, + "step": 77070 + }, + { + "epoch": 0.3804, + "grad_norm": 0.09690821915864944, + "learning_rate": 3.613599234736146e-05, + "loss": 0.0368, + "step": 77080 + }, + { + "epoch": 0.38045, + "grad_norm": 0.0905151292681694, + "learning_rate": 3.613229126695467e-05, + "loss": 0.0355, + "step": 77090 + }, + { + "epoch": 0.3805, + "grad_norm": 0.08214130252599716, + "learning_rate": 3.612858988219523e-05, + "loss": 0.0354, + "step": 77100 + }, + { + "epoch": 0.38055, + "grad_norm": 0.09367689490318298, + "learning_rate": 3.612488819318431e-05, + "loss": 0.0356, + "step": 77110 + }, + { + "epoch": 0.3806, + "grad_norm": 0.08685409277677536, + "learning_rate": 3.612118620002314e-05, + "loss": 0.0354, + "step": 77120 + }, + { + "epoch": 0.38065, + "grad_norm": 0.09349635243415833, + "learning_rate": 3.6117483902812914e-05, + "loss": 0.0369, + "step": 77130 + }, + { + "epoch": 0.3807, + "grad_norm": 0.0709528923034668, + "learning_rate": 3.611378130165486e-05, + "loss": 0.0358, + "step": 77140 + }, + { + "epoch": 0.38075, + "grad_norm": 0.07947801798582077, + "learning_rate": 3.6110078396650186e-05, + "loss": 0.0336, + "step": 77150 + }, + { + "epoch": 0.3808, + "grad_norm": 0.06851892918348312, + "learning_rate": 3.6106375187900146e-05, + "loss": 0.0336, + "step": 77160 + }, + { + "epoch": 0.38085, + "grad_norm": 0.0975281223654747, + "learning_rate": 3.610267167550599e-05, + "loss": 0.0362, + "step": 77170 + }, + { + "epoch": 0.3809, + "grad_norm": 0.10145972669124603, + "learning_rate": 3.609896785956896e-05, + "loss": 0.0353, + "step": 77180 + }, + { + "epoch": 0.38095, + "grad_norm": 0.08614010363817215, + "learning_rate": 3.609526374019031e-05, + "loss": 0.035, + "step": 77190 + }, + { + "epoch": 0.381, + "grad_norm": 0.09822831302881241, + "learning_rate": 3.6091559317471316e-05, + "loss": 0.0351, + "step": 77200 + }, + { + "epoch": 0.38105, + "grad_norm": 0.0786934494972229, + "learning_rate": 3.608785459151327e-05, + "loss": 0.0372, + "step": 77210 + }, + { + "epoch": 0.3811, + "grad_norm": 0.09421947598457336, + "learning_rate": 3.608414956241743e-05, + "loss": 0.036, + "step": 77220 + }, + { + "epoch": 0.38115, + "grad_norm": 0.08654823154211044, + "learning_rate": 3.608044423028511e-05, + "loss": 0.0358, + "step": 77230 + }, + { + "epoch": 0.3812, + "grad_norm": 0.0809483677148819, + "learning_rate": 3.607673859521762e-05, + "loss": 0.0349, + "step": 77240 + }, + { + "epoch": 0.38125, + "grad_norm": 0.10821547359228134, + "learning_rate": 3.607303265731625e-05, + "loss": 0.0373, + "step": 77250 + }, + { + "epoch": 0.3813, + "grad_norm": 0.08119744807481766, + "learning_rate": 3.606932641668232e-05, + "loss": 0.0359, + "step": 77260 + }, + { + "epoch": 0.38135, + "grad_norm": 0.15310417115688324, + "learning_rate": 3.606561987341718e-05, + "loss": 0.0369, + "step": 77270 + }, + { + "epoch": 0.3814, + "grad_norm": 0.09876388311386108, + "learning_rate": 3.606191302762213e-05, + "loss": 0.0394, + "step": 77280 + }, + { + "epoch": 0.38145, + "grad_norm": 0.09586326032876968, + "learning_rate": 3.6058205879398544e-05, + "loss": 0.0361, + "step": 77290 + }, + { + "epoch": 0.3815, + "grad_norm": 0.07784045487642288, + "learning_rate": 3.605449842884776e-05, + "loss": 0.0372, + "step": 77300 + }, + { + "epoch": 0.38155, + "grad_norm": 0.08723390102386475, + "learning_rate": 3.605079067607115e-05, + "loss": 0.0367, + "step": 77310 + }, + { + "epoch": 0.3816, + "grad_norm": 0.09609778970479965, + "learning_rate": 3.604708262117007e-05, + "loss": 0.0377, + "step": 77320 + }, + { + "epoch": 0.38165, + "grad_norm": 0.08643398433923721, + "learning_rate": 3.6043374264245904e-05, + "loss": 0.0352, + "step": 77330 + }, + { + "epoch": 0.3817, + "grad_norm": 0.10599758476018906, + "learning_rate": 3.603966560540003e-05, + "loss": 0.0369, + "step": 77340 + }, + { + "epoch": 0.38175, + "grad_norm": 0.10870613902807236, + "learning_rate": 3.603595664473385e-05, + "loss": 0.0369, + "step": 77350 + }, + { + "epoch": 0.3818, + "grad_norm": 0.10345200449228287, + "learning_rate": 3.603224738234875e-05, + "loss": 0.0376, + "step": 77360 + }, + { + "epoch": 0.38185, + "grad_norm": 0.09829385578632355, + "learning_rate": 3.602853781834616e-05, + "loss": 0.0383, + "step": 77370 + }, + { + "epoch": 0.3819, + "grad_norm": 0.09776751697063446, + "learning_rate": 3.6024827952827486e-05, + "loss": 0.0345, + "step": 77380 + }, + { + "epoch": 0.38195, + "grad_norm": 0.10886778682470322, + "learning_rate": 3.602111778589417e-05, + "loss": 0.0357, + "step": 77390 + }, + { + "epoch": 0.382, + "grad_norm": 0.11069987714290619, + "learning_rate": 3.6017407317647626e-05, + "loss": 0.0373, + "step": 77400 + }, + { + "epoch": 0.38205, + "grad_norm": 0.12858431041240692, + "learning_rate": 3.60136965481893e-05, + "loss": 0.0374, + "step": 77410 + }, + { + "epoch": 0.3821, + "grad_norm": 0.0967574417591095, + "learning_rate": 3.600998547762065e-05, + "loss": 0.0404, + "step": 77420 + }, + { + "epoch": 0.38215, + "grad_norm": 0.11484010517597198, + "learning_rate": 3.6006274106043135e-05, + "loss": 0.0388, + "step": 77430 + }, + { + "epoch": 0.3822, + "grad_norm": 0.0994647741317749, + "learning_rate": 3.600256243355822e-05, + "loss": 0.0355, + "step": 77440 + }, + { + "epoch": 0.38225, + "grad_norm": 0.11798536032438278, + "learning_rate": 3.599885046026738e-05, + "loss": 0.0362, + "step": 77450 + }, + { + "epoch": 0.3823, + "grad_norm": 0.08284584432840347, + "learning_rate": 3.599513818627211e-05, + "loss": 0.037, + "step": 77460 + }, + { + "epoch": 0.38235, + "grad_norm": 0.08958155661821365, + "learning_rate": 3.5991425611673876e-05, + "loss": 0.0375, + "step": 77470 + }, + { + "epoch": 0.3824, + "grad_norm": 0.08036845177412033, + "learning_rate": 3.598771273657421e-05, + "loss": 0.0359, + "step": 77480 + }, + { + "epoch": 0.38245, + "grad_norm": 0.09704770147800446, + "learning_rate": 3.59839995610746e-05, + "loss": 0.0355, + "step": 77490 + }, + { + "epoch": 0.3825, + "grad_norm": 0.07127789407968521, + "learning_rate": 3.5980286085276574e-05, + "loss": 0.0345, + "step": 77500 + }, + { + "epoch": 0.38255, + "grad_norm": 0.07797723263502121, + "learning_rate": 3.597657230928164e-05, + "loss": 0.0368, + "step": 77510 + }, + { + "epoch": 0.3826, + "grad_norm": 0.08957751095294952, + "learning_rate": 3.5972858233191356e-05, + "loss": 0.0367, + "step": 77520 + }, + { + "epoch": 0.38265, + "grad_norm": 0.07110311836004257, + "learning_rate": 3.596914385710724e-05, + "loss": 0.0349, + "step": 77530 + }, + { + "epoch": 0.3827, + "grad_norm": 0.07258278876543045, + "learning_rate": 3.596542918113085e-05, + "loss": 0.0366, + "step": 77540 + }, + { + "epoch": 0.38275, + "grad_norm": 0.07738789170980453, + "learning_rate": 3.596171420536375e-05, + "loss": 0.0352, + "step": 77550 + }, + { + "epoch": 0.3828, + "grad_norm": 0.06839203089475632, + "learning_rate": 3.595799892990751e-05, + "loss": 0.0357, + "step": 77560 + }, + { + "epoch": 0.38285, + "grad_norm": 0.08227694034576416, + "learning_rate": 3.595428335486368e-05, + "loss": 0.035, + "step": 77570 + }, + { + "epoch": 0.3829, + "grad_norm": 0.0727728009223938, + "learning_rate": 3.5950567480333876e-05, + "loss": 0.0357, + "step": 77580 + }, + { + "epoch": 0.38295, + "grad_norm": 0.08525092154741287, + "learning_rate": 3.594685130641966e-05, + "loss": 0.0371, + "step": 77590 + }, + { + "epoch": 0.383, + "grad_norm": 0.08446183800697327, + "learning_rate": 3.594313483322264e-05, + "loss": 0.0367, + "step": 77600 + }, + { + "epoch": 0.38305, + "grad_norm": 0.08879677951335907, + "learning_rate": 3.593941806084443e-05, + "loss": 0.0374, + "step": 77610 + }, + { + "epoch": 0.3831, + "grad_norm": 0.08682093769311905, + "learning_rate": 3.593570098938664e-05, + "loss": 0.0381, + "step": 77620 + }, + { + "epoch": 0.38315, + "grad_norm": 0.08730120211839676, + "learning_rate": 3.5931983618950896e-05, + "loss": 0.0369, + "step": 77630 + }, + { + "epoch": 0.3832, + "grad_norm": 0.08197631686925888, + "learning_rate": 3.5928265949638816e-05, + "loss": 0.0368, + "step": 77640 + }, + { + "epoch": 0.38325, + "grad_norm": 0.08786528557538986, + "learning_rate": 3.592454798155206e-05, + "loss": 0.0375, + "step": 77650 + }, + { + "epoch": 0.3833, + "grad_norm": 0.08104783296585083, + "learning_rate": 3.592082971479226e-05, + "loss": 0.0364, + "step": 77660 + }, + { + "epoch": 0.38335, + "grad_norm": 0.07689512521028519, + "learning_rate": 3.591711114946108e-05, + "loss": 0.0372, + "step": 77670 + }, + { + "epoch": 0.3834, + "grad_norm": 0.11725229769945145, + "learning_rate": 3.591339228566019e-05, + "loss": 0.0411, + "step": 77680 + }, + { + "epoch": 0.38345, + "grad_norm": 0.09530025720596313, + "learning_rate": 3.590967312349125e-05, + "loss": 0.0371, + "step": 77690 + }, + { + "epoch": 0.3835, + "grad_norm": 0.0992274358868599, + "learning_rate": 3.5905953663055944e-05, + "loss": 0.0372, + "step": 77700 + }, + { + "epoch": 0.38355, + "grad_norm": 0.08710917085409164, + "learning_rate": 3.590223390445596e-05, + "loss": 0.0372, + "step": 77710 + }, + { + "epoch": 0.3836, + "grad_norm": 0.1039763018488884, + "learning_rate": 3.5898513847793004e-05, + "loss": 0.0362, + "step": 77720 + }, + { + "epoch": 0.38365, + "grad_norm": 0.09274699538946152, + "learning_rate": 3.589479349316877e-05, + "loss": 0.0376, + "step": 77730 + }, + { + "epoch": 0.3837, + "grad_norm": 0.11407878249883652, + "learning_rate": 3.589107284068497e-05, + "loss": 0.0374, + "step": 77740 + }, + { + "epoch": 0.38375, + "grad_norm": 0.0961870476603508, + "learning_rate": 3.5887351890443336e-05, + "loss": 0.0371, + "step": 77750 + }, + { + "epoch": 0.3838, + "grad_norm": 0.12014513462781906, + "learning_rate": 3.5883630642545586e-05, + "loss": 0.0379, + "step": 77760 + }, + { + "epoch": 0.38385, + "grad_norm": 0.0941246747970581, + "learning_rate": 3.5879909097093476e-05, + "loss": 0.0364, + "step": 77770 + }, + { + "epoch": 0.3839, + "grad_norm": 0.07910443842411041, + "learning_rate": 3.587618725418872e-05, + "loss": 0.0357, + "step": 77780 + }, + { + "epoch": 0.38395, + "grad_norm": 0.09617926180362701, + "learning_rate": 3.5872465113933104e-05, + "loss": 0.0361, + "step": 77790 + }, + { + "epoch": 0.384, + "grad_norm": 0.10334715247154236, + "learning_rate": 3.586874267642837e-05, + "loss": 0.0358, + "step": 77800 + }, + { + "epoch": 0.38405, + "grad_norm": 0.09399249404668808, + "learning_rate": 3.58650199417763e-05, + "loss": 0.0364, + "step": 77810 + }, + { + "epoch": 0.3841, + "grad_norm": 0.08352718502283096, + "learning_rate": 3.5861296910078664e-05, + "loss": 0.0361, + "step": 77820 + }, + { + "epoch": 0.38415, + "grad_norm": 0.0855955183506012, + "learning_rate": 3.585757358143725e-05, + "loss": 0.0362, + "step": 77830 + }, + { + "epoch": 0.3842, + "grad_norm": 0.10338416695594788, + "learning_rate": 3.5853849955953855e-05, + "loss": 0.0366, + "step": 77840 + }, + { + "epoch": 0.38425, + "grad_norm": 0.09872971475124359, + "learning_rate": 3.585012603373028e-05, + "loss": 0.038, + "step": 77850 + }, + { + "epoch": 0.3843, + "grad_norm": 0.09367211908102036, + "learning_rate": 3.584640181486833e-05, + "loss": 0.036, + "step": 77860 + }, + { + "epoch": 0.38435, + "grad_norm": 0.09950553625822067, + "learning_rate": 3.584267729946983e-05, + "loss": 0.0377, + "step": 77870 + }, + { + "epoch": 0.3844, + "grad_norm": 0.0915476605296135, + "learning_rate": 3.583895248763661e-05, + "loss": 0.0352, + "step": 77880 + }, + { + "epoch": 0.38445, + "grad_norm": 0.09203360974788666, + "learning_rate": 3.58352273794705e-05, + "loss": 0.0364, + "step": 77890 + }, + { + "epoch": 0.3845, + "grad_norm": 0.08658530563116074, + "learning_rate": 3.583150197507335e-05, + "loss": 0.0366, + "step": 77900 + }, + { + "epoch": 0.38455, + "grad_norm": 0.1105838268995285, + "learning_rate": 3.582777627454699e-05, + "loss": 0.0355, + "step": 77910 + }, + { + "epoch": 0.3846, + "grad_norm": 0.09125398844480515, + "learning_rate": 3.5824050277993304e-05, + "loss": 0.0378, + "step": 77920 + }, + { + "epoch": 0.38465, + "grad_norm": 0.07785584032535553, + "learning_rate": 3.5820323985514157e-05, + "loss": 0.0351, + "step": 77930 + }, + { + "epoch": 0.3847, + "grad_norm": 0.09558206051588058, + "learning_rate": 3.58165973972114e-05, + "loss": 0.0371, + "step": 77940 + }, + { + "epoch": 0.38475, + "grad_norm": 0.08811909705400467, + "learning_rate": 3.581287051318695e-05, + "loss": 0.0357, + "step": 77950 + }, + { + "epoch": 0.3848, + "grad_norm": 0.08579441159963608, + "learning_rate": 3.5809143333542676e-05, + "loss": 0.0361, + "step": 77960 + }, + { + "epoch": 0.38485, + "grad_norm": 0.09081538766622543, + "learning_rate": 3.580541585838049e-05, + "loss": 0.0368, + "step": 77970 + }, + { + "epoch": 0.3849, + "grad_norm": 0.10513069480657578, + "learning_rate": 3.580168808780228e-05, + "loss": 0.0372, + "step": 77980 + }, + { + "epoch": 0.38495, + "grad_norm": 0.0886191725730896, + "learning_rate": 3.579796002190998e-05, + "loss": 0.0357, + "step": 77990 + }, + { + "epoch": 0.385, + "grad_norm": 0.0755455270409584, + "learning_rate": 3.579423166080552e-05, + "loss": 0.0367, + "step": 78000 + }, + { + "epoch": 0.38505, + "grad_norm": 0.10791072249412537, + "learning_rate": 3.5790503004590814e-05, + "loss": 0.0373, + "step": 78010 + }, + { + "epoch": 0.3851, + "grad_norm": 0.1349969208240509, + "learning_rate": 3.578677405336781e-05, + "loss": 0.0386, + "step": 78020 + }, + { + "epoch": 0.38515, + "grad_norm": 0.11925962567329407, + "learning_rate": 3.578304480723845e-05, + "loss": 0.0387, + "step": 78030 + }, + { + "epoch": 0.3852, + "grad_norm": 0.09497813880443573, + "learning_rate": 3.577931526630471e-05, + "loss": 0.0392, + "step": 78040 + }, + { + "epoch": 0.38525, + "grad_norm": 0.09541451185941696, + "learning_rate": 3.5775585430668524e-05, + "loss": 0.0368, + "step": 78050 + }, + { + "epoch": 0.3853, + "grad_norm": 0.10846823453903198, + "learning_rate": 3.577185530043189e-05, + "loss": 0.0371, + "step": 78060 + }, + { + "epoch": 0.38535, + "grad_norm": 0.14890998601913452, + "learning_rate": 3.576812487569678e-05, + "loss": 0.0375, + "step": 78070 + }, + { + "epoch": 0.3854, + "grad_norm": 0.10274876654148102, + "learning_rate": 3.576439415656519e-05, + "loss": 0.0382, + "step": 78080 + }, + { + "epoch": 0.38545, + "grad_norm": 0.08878965675830841, + "learning_rate": 3.576066314313909e-05, + "loss": 0.0376, + "step": 78090 + }, + { + "epoch": 0.3855, + "grad_norm": 0.08467261493206024, + "learning_rate": 3.575693183552051e-05, + "loss": 0.0385, + "step": 78100 + }, + { + "epoch": 0.38555, + "grad_norm": 0.10923632234334946, + "learning_rate": 3.5753200233811446e-05, + "loss": 0.036, + "step": 78110 + }, + { + "epoch": 0.3856, + "grad_norm": 0.09093783795833588, + "learning_rate": 3.574946833811394e-05, + "loss": 0.0378, + "step": 78120 + }, + { + "epoch": 0.38565, + "grad_norm": 0.08795181661844254, + "learning_rate": 3.574573614853e-05, + "loss": 0.0405, + "step": 78130 + }, + { + "epoch": 0.3857, + "grad_norm": 0.10429956018924713, + "learning_rate": 3.5742003665161684e-05, + "loss": 0.0376, + "step": 78140 + }, + { + "epoch": 0.38575, + "grad_norm": 0.0812741219997406, + "learning_rate": 3.573827088811101e-05, + "loss": 0.0361, + "step": 78150 + }, + { + "epoch": 0.3858, + "grad_norm": 0.0807657390832901, + "learning_rate": 3.573453781748004e-05, + "loss": 0.0361, + "step": 78160 + }, + { + "epoch": 0.38585, + "grad_norm": 0.08330751955509186, + "learning_rate": 3.5730804453370846e-05, + "loss": 0.0364, + "step": 78170 + }, + { + "epoch": 0.3859, + "grad_norm": 0.08211551606655121, + "learning_rate": 3.5727070795885496e-05, + "loss": 0.0368, + "step": 78180 + }, + { + "epoch": 0.38595, + "grad_norm": 0.08663376420736313, + "learning_rate": 3.5723336845126055e-05, + "loss": 0.0367, + "step": 78190 + }, + { + "epoch": 0.386, + "grad_norm": 0.10587425529956818, + "learning_rate": 3.5719602601194606e-05, + "loss": 0.0423, + "step": 78200 + }, + { + "epoch": 0.38605, + "grad_norm": 0.07704432308673859, + "learning_rate": 3.571586806419326e-05, + "loss": 0.0373, + "step": 78210 + }, + { + "epoch": 0.3861, + "grad_norm": 0.08284270763397217, + "learning_rate": 3.57121332342241e-05, + "loss": 0.0365, + "step": 78220 + }, + { + "epoch": 0.38615, + "grad_norm": 0.09966335445642471, + "learning_rate": 3.570839811138925e-05, + "loss": 0.0385, + "step": 78230 + }, + { + "epoch": 0.3862, + "grad_norm": 0.09196805208921432, + "learning_rate": 3.570466269579081e-05, + "loss": 0.0375, + "step": 78240 + }, + { + "epoch": 0.38625, + "grad_norm": 0.08560289442539215, + "learning_rate": 3.570092698753091e-05, + "loss": 0.0372, + "step": 78250 + }, + { + "epoch": 0.3863, + "grad_norm": 0.08833914995193481, + "learning_rate": 3.5697190986711696e-05, + "loss": 0.0368, + "step": 78260 + }, + { + "epoch": 0.38635, + "grad_norm": 0.09780129790306091, + "learning_rate": 3.56934546934353e-05, + "loss": 0.0362, + "step": 78270 + }, + { + "epoch": 0.3864, + "grad_norm": 0.09053563326597214, + "learning_rate": 3.568971810780386e-05, + "loss": 0.0365, + "step": 78280 + }, + { + "epoch": 0.38645, + "grad_norm": 0.08348232507705688, + "learning_rate": 3.5685981229919557e-05, + "loss": 0.0365, + "step": 78290 + }, + { + "epoch": 0.3865, + "grad_norm": 0.0905507355928421, + "learning_rate": 3.568224405988453e-05, + "loss": 0.0372, + "step": 78300 + }, + { + "epoch": 0.38655, + "grad_norm": 0.0896824523806572, + "learning_rate": 3.567850659780097e-05, + "loss": 0.0368, + "step": 78310 + }, + { + "epoch": 0.3866, + "grad_norm": 0.08112866431474686, + "learning_rate": 3.567476884377104e-05, + "loss": 0.0363, + "step": 78320 + }, + { + "epoch": 0.38665, + "grad_norm": 0.10949297994375229, + "learning_rate": 3.567103079789695e-05, + "loss": 0.0362, + "step": 78330 + }, + { + "epoch": 0.3867, + "grad_norm": 0.09558262676000595, + "learning_rate": 3.566729246028089e-05, + "loss": 0.0356, + "step": 78340 + }, + { + "epoch": 0.38675, + "grad_norm": 0.0879255011677742, + "learning_rate": 3.566355383102506e-05, + "loss": 0.037, + "step": 78350 + }, + { + "epoch": 0.3868, + "grad_norm": 0.08192962408065796, + "learning_rate": 3.565981491023167e-05, + "loss": 0.036, + "step": 78360 + }, + { + "epoch": 0.38685, + "grad_norm": 0.09210727363824844, + "learning_rate": 3.5656075698002946e-05, + "loss": 0.0351, + "step": 78370 + }, + { + "epoch": 0.3869, + "grad_norm": 0.09961491823196411, + "learning_rate": 3.565233619444111e-05, + "loss": 0.0358, + "step": 78380 + }, + { + "epoch": 0.38695, + "grad_norm": 0.09111356735229492, + "learning_rate": 3.564859639964841e-05, + "loss": 0.0354, + "step": 78390 + }, + { + "epoch": 0.387, + "grad_norm": 0.10473669320344925, + "learning_rate": 3.564485631372709e-05, + "loss": 0.0348, + "step": 78400 + }, + { + "epoch": 0.38705, + "grad_norm": 0.0872640311717987, + "learning_rate": 3.564111593677939e-05, + "loss": 0.036, + "step": 78410 + }, + { + "epoch": 0.3871, + "grad_norm": 0.08784716576337814, + "learning_rate": 3.563737526890759e-05, + "loss": 0.0356, + "step": 78420 + }, + { + "epoch": 0.38715, + "grad_norm": 0.09517963230609894, + "learning_rate": 3.563363431021393e-05, + "loss": 0.0361, + "step": 78430 + }, + { + "epoch": 0.3872, + "grad_norm": 0.09568300098180771, + "learning_rate": 3.562989306080071e-05, + "loss": 0.0367, + "step": 78440 + }, + { + "epoch": 0.38725, + "grad_norm": 0.07990391552448273, + "learning_rate": 3.562615152077021e-05, + "loss": 0.0354, + "step": 78450 + }, + { + "epoch": 0.3873, + "grad_norm": 0.08299369364976883, + "learning_rate": 3.562240969022471e-05, + "loss": 0.0362, + "step": 78460 + }, + { + "epoch": 0.38735, + "grad_norm": 0.07802381366491318, + "learning_rate": 3.5618667569266525e-05, + "loss": 0.0355, + "step": 78470 + }, + { + "epoch": 0.3874, + "grad_norm": 0.08770909160375595, + "learning_rate": 3.561492515799797e-05, + "loss": 0.0371, + "step": 78480 + }, + { + "epoch": 0.38745, + "grad_norm": 0.08272002637386322, + "learning_rate": 3.561118245652133e-05, + "loss": 0.0373, + "step": 78490 + }, + { + "epoch": 0.3875, + "grad_norm": 0.08167009800672531, + "learning_rate": 3.560743946493896e-05, + "loss": 0.0366, + "step": 78500 + }, + { + "epoch": 0.38755, + "grad_norm": 0.10057704895734787, + "learning_rate": 3.560369618335317e-05, + "loss": 0.0386, + "step": 78510 + }, + { + "epoch": 0.3876, + "grad_norm": 0.09390366822481155, + "learning_rate": 3.5599952611866325e-05, + "loss": 0.0368, + "step": 78520 + }, + { + "epoch": 0.38765, + "grad_norm": 0.09164083003997803, + "learning_rate": 3.559620875058075e-05, + "loss": 0.0368, + "step": 78530 + }, + { + "epoch": 0.3877, + "grad_norm": 0.10917727649211884, + "learning_rate": 3.5592464599598804e-05, + "loss": 0.036, + "step": 78540 + }, + { + "epoch": 0.38775, + "grad_norm": 0.11053070425987244, + "learning_rate": 3.558872015902286e-05, + "loss": 0.0403, + "step": 78550 + }, + { + "epoch": 0.3878, + "grad_norm": 0.08020950853824615, + "learning_rate": 3.558497542895528e-05, + "loss": 0.0366, + "step": 78560 + }, + { + "epoch": 0.38785, + "grad_norm": 0.09129568189382553, + "learning_rate": 3.558123040949846e-05, + "loss": 0.0365, + "step": 78570 + }, + { + "epoch": 0.3879, + "grad_norm": 0.09383866935968399, + "learning_rate": 3.557748510075477e-05, + "loss": 0.0369, + "step": 78580 + }, + { + "epoch": 0.38795, + "grad_norm": 0.07243139296770096, + "learning_rate": 3.55737395028266e-05, + "loss": 0.0361, + "step": 78590 + }, + { + "epoch": 0.388, + "grad_norm": 0.07961437851190567, + "learning_rate": 3.5569993615816386e-05, + "loss": 0.0369, + "step": 78600 + }, + { + "epoch": 0.38805, + "grad_norm": 0.08032999187707901, + "learning_rate": 3.55662474398265e-05, + "loss": 0.0367, + "step": 78610 + }, + { + "epoch": 0.3881, + "grad_norm": 0.08743961155414581, + "learning_rate": 3.556250097495938e-05, + "loss": 0.036, + "step": 78620 + }, + { + "epoch": 0.38815, + "grad_norm": 0.10869049280881882, + "learning_rate": 3.5558754221317454e-05, + "loss": 0.0388, + "step": 78630 + }, + { + "epoch": 0.3882, + "grad_norm": 0.08218946307897568, + "learning_rate": 3.555500717900316e-05, + "loss": 0.0351, + "step": 78640 + }, + { + "epoch": 0.38825, + "grad_norm": 0.0724598839879036, + "learning_rate": 3.5551259848118926e-05, + "loss": 0.035, + "step": 78650 + }, + { + "epoch": 0.3883, + "grad_norm": 0.09266900271177292, + "learning_rate": 3.554751222876722e-05, + "loss": 0.0363, + "step": 78660 + }, + { + "epoch": 0.38835, + "grad_norm": 0.08385990560054779, + "learning_rate": 3.554376432105049e-05, + "loss": 0.0368, + "step": 78670 + }, + { + "epoch": 0.3884, + "grad_norm": 0.08530783653259277, + "learning_rate": 3.55400161250712e-05, + "loss": 0.0353, + "step": 78680 + }, + { + "epoch": 0.38845, + "grad_norm": 0.07914736121892929, + "learning_rate": 3.553626764093183e-05, + "loss": 0.0354, + "step": 78690 + }, + { + "epoch": 0.3885, + "grad_norm": 0.10238605737686157, + "learning_rate": 3.553251886873486e-05, + "loss": 0.0364, + "step": 78700 + }, + { + "epoch": 0.38855, + "grad_norm": 0.1394563615322113, + "learning_rate": 3.552876980858279e-05, + "loss": 0.0363, + "step": 78710 + }, + { + "epoch": 0.3886, + "grad_norm": 0.10636939108371735, + "learning_rate": 3.552502046057809e-05, + "loss": 0.0364, + "step": 78720 + }, + { + "epoch": 0.38865, + "grad_norm": 0.08263157308101654, + "learning_rate": 3.552127082482331e-05, + "loss": 0.0345, + "step": 78730 + }, + { + "epoch": 0.3887, + "grad_norm": 0.09180624783039093, + "learning_rate": 3.551752090142093e-05, + "loss": 0.0354, + "step": 78740 + }, + { + "epoch": 0.38875, + "grad_norm": 0.09710685163736343, + "learning_rate": 3.5513770690473466e-05, + "loss": 0.0349, + "step": 78750 + }, + { + "epoch": 0.3888, + "grad_norm": 0.08567915111780167, + "learning_rate": 3.551002019208347e-05, + "loss": 0.0349, + "step": 78760 + }, + { + "epoch": 0.38885, + "grad_norm": 0.10945023596286774, + "learning_rate": 3.5506269406353476e-05, + "loss": 0.0369, + "step": 78770 + }, + { + "epoch": 0.3889, + "grad_norm": 0.08565888553857803, + "learning_rate": 3.550251833338601e-05, + "loss": 0.0356, + "step": 78780 + }, + { + "epoch": 0.38895, + "grad_norm": 0.09337818622589111, + "learning_rate": 3.549876697328366e-05, + "loss": 0.0354, + "step": 78790 + }, + { + "epoch": 0.389, + "grad_norm": 0.07999156415462494, + "learning_rate": 3.5495015326148945e-05, + "loss": 0.0341, + "step": 78800 + }, + { + "epoch": 0.38905, + "grad_norm": 0.07796759903430939, + "learning_rate": 3.549126339208446e-05, + "loss": 0.035, + "step": 78810 + }, + { + "epoch": 0.3891, + "grad_norm": 0.09235970675945282, + "learning_rate": 3.548751117119278e-05, + "loss": 0.0347, + "step": 78820 + }, + { + "epoch": 0.38915, + "grad_norm": 0.08895865827798843, + "learning_rate": 3.5483758663576486e-05, + "loss": 0.034, + "step": 78830 + }, + { + "epoch": 0.3892, + "grad_norm": 0.0840078666806221, + "learning_rate": 3.548000586933816e-05, + "loss": 0.0349, + "step": 78840 + }, + { + "epoch": 0.38925, + "grad_norm": 0.10279300808906555, + "learning_rate": 3.547625278858041e-05, + "loss": 0.0364, + "step": 78850 + }, + { + "epoch": 0.3893, + "grad_norm": 0.11641772836446762, + "learning_rate": 3.5472499421405844e-05, + "loss": 0.036, + "step": 78860 + }, + { + "epoch": 0.38935, + "grad_norm": 0.08842886239290237, + "learning_rate": 3.5468745767917086e-05, + "loss": 0.0355, + "step": 78870 + }, + { + "epoch": 0.3894, + "grad_norm": 0.10167177766561508, + "learning_rate": 3.546499182821675e-05, + "loss": 0.0356, + "step": 78880 + }, + { + "epoch": 0.38945, + "grad_norm": 0.08147701621055603, + "learning_rate": 3.5461237602407474e-05, + "loss": 0.0351, + "step": 78890 + }, + { + "epoch": 0.3895, + "grad_norm": 0.0973169133067131, + "learning_rate": 3.545748309059188e-05, + "loss": 0.0358, + "step": 78900 + }, + { + "epoch": 0.38955, + "grad_norm": 0.08725616335868835, + "learning_rate": 3.545372829287263e-05, + "loss": 0.0347, + "step": 78910 + }, + { + "epoch": 0.3896, + "grad_norm": 0.09617139399051666, + "learning_rate": 3.5449973209352386e-05, + "loss": 0.0355, + "step": 78920 + }, + { + "epoch": 0.38965, + "grad_norm": 0.10085910558700562, + "learning_rate": 3.544621784013378e-05, + "loss": 0.0353, + "step": 78930 + }, + { + "epoch": 0.3897, + "grad_norm": 0.09641211479902267, + "learning_rate": 3.544246218531952e-05, + "loss": 0.0346, + "step": 78940 + }, + { + "epoch": 0.38975, + "grad_norm": 0.09419838339090347, + "learning_rate": 3.543870624501226e-05, + "loss": 0.0346, + "step": 78950 + }, + { + "epoch": 0.3898, + "grad_norm": 0.09785092622041702, + "learning_rate": 3.5434950019314694e-05, + "loss": 0.0353, + "step": 78960 + }, + { + "epoch": 0.38985, + "grad_norm": 0.08658993244171143, + "learning_rate": 3.543119350832952e-05, + "loss": 0.0343, + "step": 78970 + }, + { + "epoch": 0.3899, + "grad_norm": 0.11044695228338242, + "learning_rate": 3.542743671215943e-05, + "loss": 0.0366, + "step": 78980 + }, + { + "epoch": 0.38995, + "grad_norm": 0.08109579980373383, + "learning_rate": 3.542367963090714e-05, + "loss": 0.0365, + "step": 78990 + }, + { + "epoch": 0.39, + "grad_norm": 0.09335202723741531, + "learning_rate": 3.5419922264675356e-05, + "loss": 0.0352, + "step": 79000 + }, + { + "epoch": 0.39005, + "grad_norm": 0.0846087858080864, + "learning_rate": 3.541616461356682e-05, + "loss": 0.0415, + "step": 79010 + }, + { + "epoch": 0.3901, + "grad_norm": 0.09359367191791534, + "learning_rate": 3.541240667768426e-05, + "loss": 0.0343, + "step": 79020 + }, + { + "epoch": 0.39015, + "grad_norm": 0.09477211534976959, + "learning_rate": 3.54086484571304e-05, + "loss": 0.0373, + "step": 79030 + }, + { + "epoch": 0.3902, + "grad_norm": 0.09294027835130692, + "learning_rate": 3.540488995200801e-05, + "loss": 0.036, + "step": 79040 + }, + { + "epoch": 0.39025, + "grad_norm": 0.08735240995883942, + "learning_rate": 3.540113116241984e-05, + "loss": 0.0352, + "step": 79050 + }, + { + "epoch": 0.3903, + "grad_norm": 0.08991049975156784, + "learning_rate": 3.539737208846865e-05, + "loss": 0.0363, + "step": 79060 + }, + { + "epoch": 0.39035, + "grad_norm": 0.08595713973045349, + "learning_rate": 3.539361273025721e-05, + "loss": 0.0367, + "step": 79070 + }, + { + "epoch": 0.3904, + "grad_norm": 0.10878205299377441, + "learning_rate": 3.538985308788831e-05, + "loss": 0.0381, + "step": 79080 + }, + { + "epoch": 0.39045, + "grad_norm": 0.08834918588399887, + "learning_rate": 3.538609316146472e-05, + "loss": 0.0367, + "step": 79090 + }, + { + "epoch": 0.3905, + "grad_norm": 0.08705242723226547, + "learning_rate": 3.538233295108925e-05, + "loss": 0.0386, + "step": 79100 + }, + { + "epoch": 0.39055, + "grad_norm": 0.09597699344158173, + "learning_rate": 3.53785724568647e-05, + "loss": 0.0376, + "step": 79110 + }, + { + "epoch": 0.3906, + "grad_norm": 0.07092789560556412, + "learning_rate": 3.5374811678893874e-05, + "loss": 0.0386, + "step": 79120 + }, + { + "epoch": 0.39065, + "grad_norm": 0.10917045921087265, + "learning_rate": 3.537105061727959e-05, + "loss": 0.0374, + "step": 79130 + }, + { + "epoch": 0.3907, + "grad_norm": 0.09935753792524338, + "learning_rate": 3.536728927212469e-05, + "loss": 0.0362, + "step": 79140 + }, + { + "epoch": 0.39075, + "grad_norm": 0.08676110953092575, + "learning_rate": 3.536352764353198e-05, + "loss": 0.0364, + "step": 79150 + }, + { + "epoch": 0.3908, + "grad_norm": 0.08344744145870209, + "learning_rate": 3.5359765731604336e-05, + "loss": 0.036, + "step": 79160 + }, + { + "epoch": 0.39085, + "grad_norm": 0.08429134637117386, + "learning_rate": 3.535600353644458e-05, + "loss": 0.0349, + "step": 79170 + }, + { + "epoch": 0.3909, + "grad_norm": 0.10815691202878952, + "learning_rate": 3.535224105815558e-05, + "loss": 0.0383, + "step": 79180 + }, + { + "epoch": 0.39095, + "grad_norm": 0.08904128521680832, + "learning_rate": 3.534847829684019e-05, + "loss": 0.0354, + "step": 79190 + }, + { + "epoch": 0.391, + "grad_norm": 0.0823027566075325, + "learning_rate": 3.53447152526013e-05, + "loss": 0.0358, + "step": 79200 + }, + { + "epoch": 0.39105, + "grad_norm": 0.08951200544834137, + "learning_rate": 3.534095192554178e-05, + "loss": 0.0351, + "step": 79210 + }, + { + "epoch": 0.3911, + "grad_norm": 0.07517030835151672, + "learning_rate": 3.5337188315764516e-05, + "loss": 0.0349, + "step": 79220 + }, + { + "epoch": 0.39115, + "grad_norm": 0.10150893032550812, + "learning_rate": 3.533342442337241e-05, + "loss": 0.0369, + "step": 79230 + }, + { + "epoch": 0.3912, + "grad_norm": 0.08564486354589462, + "learning_rate": 3.5329660248468366e-05, + "loss": 0.0368, + "step": 79240 + }, + { + "epoch": 0.39125, + "grad_norm": 0.08716096729040146, + "learning_rate": 3.532589579115529e-05, + "loss": 0.0358, + "step": 79250 + }, + { + "epoch": 0.3913, + "grad_norm": 0.08219984173774719, + "learning_rate": 3.53221310515361e-05, + "loss": 0.0389, + "step": 79260 + }, + { + "epoch": 0.39135, + "grad_norm": 0.08610589057207108, + "learning_rate": 3.5318366029713724e-05, + "loss": 0.0367, + "step": 79270 + }, + { + "epoch": 0.3914, + "grad_norm": 0.06737655401229858, + "learning_rate": 3.531460072579109e-05, + "loss": 0.0351, + "step": 79280 + }, + { + "epoch": 0.39145, + "grad_norm": 0.09205567836761475, + "learning_rate": 3.5310835139871164e-05, + "loss": 0.0379, + "step": 79290 + }, + { + "epoch": 0.3915, + "grad_norm": 0.08664903789758682, + "learning_rate": 3.530706927205687e-05, + "loss": 0.0369, + "step": 79300 + }, + { + "epoch": 0.39155, + "grad_norm": 0.08422086387872696, + "learning_rate": 3.530330312245117e-05, + "loss": 0.0386, + "step": 79310 + }, + { + "epoch": 0.3916, + "grad_norm": 0.06627576798200607, + "learning_rate": 3.529953669115703e-05, + "loss": 0.036, + "step": 79320 + }, + { + "epoch": 0.39165, + "grad_norm": 0.096906378865242, + "learning_rate": 3.529576997827744e-05, + "loss": 0.0384, + "step": 79330 + }, + { + "epoch": 0.3917, + "grad_norm": 0.08024526387453079, + "learning_rate": 3.529200298391536e-05, + "loss": 0.0356, + "step": 79340 + }, + { + "epoch": 0.39175, + "grad_norm": 0.08066499978303909, + "learning_rate": 3.528823570817379e-05, + "loss": 0.0367, + "step": 79350 + }, + { + "epoch": 0.3918, + "grad_norm": 0.09209758043289185, + "learning_rate": 3.5284468151155716e-05, + "loss": 0.0391, + "step": 79360 + }, + { + "epoch": 0.39185, + "grad_norm": 0.096739761531353, + "learning_rate": 3.528070031296414e-05, + "loss": 0.0382, + "step": 79370 + }, + { + "epoch": 0.3919, + "grad_norm": 0.0762137770652771, + "learning_rate": 3.527693219370209e-05, + "loss": 0.0354, + "step": 79380 + }, + { + "epoch": 0.39195, + "grad_norm": 0.07095940411090851, + "learning_rate": 3.527316379347257e-05, + "loss": 0.0367, + "step": 79390 + }, + { + "epoch": 0.392, + "grad_norm": 0.07851160317659378, + "learning_rate": 3.526939511237861e-05, + "loss": 0.0361, + "step": 79400 + }, + { + "epoch": 0.39205, + "grad_norm": 0.08974769711494446, + "learning_rate": 3.526562615052325e-05, + "loss": 0.0363, + "step": 79410 + }, + { + "epoch": 0.3921, + "grad_norm": 0.0686379075050354, + "learning_rate": 3.526185690800953e-05, + "loss": 0.0358, + "step": 79420 + }, + { + "epoch": 0.39215, + "grad_norm": 0.07639919221401215, + "learning_rate": 3.525808738494049e-05, + "loss": 0.0341, + "step": 79430 + }, + { + "epoch": 0.3922, + "grad_norm": 0.09294627606868744, + "learning_rate": 3.52543175814192e-05, + "loss": 0.036, + "step": 79440 + }, + { + "epoch": 0.39225, + "grad_norm": 0.10250154882669449, + "learning_rate": 3.525054749754871e-05, + "loss": 0.0383, + "step": 79450 + }, + { + "epoch": 0.3923, + "grad_norm": 0.08065023273229599, + "learning_rate": 3.524677713343212e-05, + "loss": 0.0373, + "step": 79460 + }, + { + "epoch": 0.39235, + "grad_norm": 0.07971781492233276, + "learning_rate": 3.5243006489172475e-05, + "loss": 0.0385, + "step": 79470 + }, + { + "epoch": 0.3924, + "grad_norm": 0.08311796188354492, + "learning_rate": 3.523923556487289e-05, + "loss": 0.0356, + "step": 79480 + }, + { + "epoch": 0.39245, + "grad_norm": 0.08565957099199295, + "learning_rate": 3.523546436063645e-05, + "loss": 0.0364, + "step": 79490 + }, + { + "epoch": 0.3925, + "grad_norm": 0.08758903294801712, + "learning_rate": 3.5231692876566264e-05, + "loss": 0.0364, + "step": 79500 + }, + { + "epoch": 0.39255, + "grad_norm": 0.07650836557149887, + "learning_rate": 3.522792111276543e-05, + "loss": 0.0376, + "step": 79510 + }, + { + "epoch": 0.3926, + "grad_norm": 0.08982399851083755, + "learning_rate": 3.522414906933708e-05, + "loss": 0.0354, + "step": 79520 + }, + { + "epoch": 0.39265, + "grad_norm": 0.09377458691596985, + "learning_rate": 3.522037674638433e-05, + "loss": 0.0366, + "step": 79530 + }, + { + "epoch": 0.3927, + "grad_norm": 0.0800488144159317, + "learning_rate": 3.521660414401033e-05, + "loss": 0.0365, + "step": 79540 + }, + { + "epoch": 0.39275, + "grad_norm": 0.16223900020122528, + "learning_rate": 3.5212831262318204e-05, + "loss": 0.041, + "step": 79550 + }, + { + "epoch": 0.3928, + "grad_norm": 0.12490296363830566, + "learning_rate": 3.5209058101411114e-05, + "loss": 0.0371, + "step": 79560 + }, + { + "epoch": 0.39285, + "grad_norm": 0.11788595467805862, + "learning_rate": 3.52052846613922e-05, + "loss": 0.0366, + "step": 79570 + }, + { + "epoch": 0.3929, + "grad_norm": 0.1282113492488861, + "learning_rate": 3.520151094236465e-05, + "loss": 0.0383, + "step": 79580 + }, + { + "epoch": 0.39295, + "grad_norm": 0.08661283552646637, + "learning_rate": 3.519773694443161e-05, + "loss": 0.0383, + "step": 79590 + }, + { + "epoch": 0.393, + "grad_norm": 0.07716389745473862, + "learning_rate": 3.519396266769628e-05, + "loss": 0.0366, + "step": 79600 + }, + { + "epoch": 0.39305, + "grad_norm": 0.09312919527292252, + "learning_rate": 3.519018811226184e-05, + "loss": 0.0387, + "step": 79610 + }, + { + "epoch": 0.3931, + "grad_norm": 0.08408086746931076, + "learning_rate": 3.5186413278231487e-05, + "loss": 0.036, + "step": 79620 + }, + { + "epoch": 0.39315, + "grad_norm": 0.08217661082744598, + "learning_rate": 3.518263816570842e-05, + "loss": 0.0368, + "step": 79630 + }, + { + "epoch": 0.3932, + "grad_norm": 0.08453497290611267, + "learning_rate": 3.517886277479585e-05, + "loss": 0.0347, + "step": 79640 + }, + { + "epoch": 0.39325, + "grad_norm": 0.08217451721429825, + "learning_rate": 3.5175087105596995e-05, + "loss": 0.0357, + "step": 79650 + }, + { + "epoch": 0.3933, + "grad_norm": 0.09013067930936813, + "learning_rate": 3.517131115821508e-05, + "loss": 0.0364, + "step": 79660 + }, + { + "epoch": 0.39335, + "grad_norm": 0.0868317037820816, + "learning_rate": 3.5167534932753344e-05, + "loss": 0.0369, + "step": 79670 + }, + { + "epoch": 0.3934, + "grad_norm": 0.09239938110113144, + "learning_rate": 3.516375842931502e-05, + "loss": 0.0371, + "step": 79680 + }, + { + "epoch": 0.39345, + "grad_norm": 0.08693993091583252, + "learning_rate": 3.515998164800336e-05, + "loss": 0.0351, + "step": 79690 + }, + { + "epoch": 0.3935, + "grad_norm": 0.10249830782413483, + "learning_rate": 3.515620458892162e-05, + "loss": 0.0357, + "step": 79700 + }, + { + "epoch": 0.39355, + "grad_norm": 0.08632895350456238, + "learning_rate": 3.515242725217306e-05, + "loss": 0.0357, + "step": 79710 + }, + { + "epoch": 0.3936, + "grad_norm": 0.09208270907402039, + "learning_rate": 3.514864963786095e-05, + "loss": 0.0364, + "step": 79720 + }, + { + "epoch": 0.39365, + "grad_norm": 0.08991848677396774, + "learning_rate": 3.514487174608858e-05, + "loss": 0.0351, + "step": 79730 + }, + { + "epoch": 0.3937, + "grad_norm": 0.08170048147439957, + "learning_rate": 3.5141093576959225e-05, + "loss": 0.0357, + "step": 79740 + }, + { + "epoch": 0.39375, + "grad_norm": 0.10216086357831955, + "learning_rate": 3.5137315130576174e-05, + "loss": 0.0356, + "step": 79750 + }, + { + "epoch": 0.3938, + "grad_norm": 0.10113737732172012, + "learning_rate": 3.5133536407042743e-05, + "loss": 0.0353, + "step": 79760 + }, + { + "epoch": 0.39385, + "grad_norm": 0.09250824898481369, + "learning_rate": 3.512975740646223e-05, + "loss": 0.0367, + "step": 79770 + }, + { + "epoch": 0.3939, + "grad_norm": 0.10215970128774643, + "learning_rate": 3.512597812893795e-05, + "loss": 0.0363, + "step": 79780 + }, + { + "epoch": 0.39395, + "grad_norm": 0.08758533746004105, + "learning_rate": 3.512219857457325e-05, + "loss": 0.0356, + "step": 79790 + }, + { + "epoch": 0.394, + "grad_norm": 0.08108478039503098, + "learning_rate": 3.511841874347143e-05, + "loss": 0.0358, + "step": 79800 + }, + { + "epoch": 0.39405, + "grad_norm": 0.07901604473590851, + "learning_rate": 3.5114638635735843e-05, + "loss": 0.0366, + "step": 79810 + }, + { + "epoch": 0.3941, + "grad_norm": 0.09510098397731781, + "learning_rate": 3.511085825146984e-05, + "loss": 0.0353, + "step": 79820 + }, + { + "epoch": 0.39415, + "grad_norm": 0.09487070143222809, + "learning_rate": 3.510707759077677e-05, + "loss": 0.0365, + "step": 79830 + }, + { + "epoch": 0.3942, + "grad_norm": 0.09669601172208786, + "learning_rate": 3.510329665375999e-05, + "loss": 0.0345, + "step": 79840 + }, + { + "epoch": 0.39425, + "grad_norm": 0.09767556935548782, + "learning_rate": 3.509951544052288e-05, + "loss": 0.0355, + "step": 79850 + }, + { + "epoch": 0.3943, + "grad_norm": 0.11945126950740814, + "learning_rate": 3.509573395116881e-05, + "loss": 0.0354, + "step": 79860 + }, + { + "epoch": 0.39435, + "grad_norm": 0.08112930506467819, + "learning_rate": 3.509195218580117e-05, + "loss": 0.0365, + "step": 79870 + }, + { + "epoch": 0.3944, + "grad_norm": 0.07552963495254517, + "learning_rate": 3.508817014452335e-05, + "loss": 0.0351, + "step": 79880 + }, + { + "epoch": 0.39445, + "grad_norm": 0.0858389362692833, + "learning_rate": 3.5084387827438734e-05, + "loss": 0.0352, + "step": 79890 + }, + { + "epoch": 0.3945, + "grad_norm": 0.0821891501545906, + "learning_rate": 3.508060523465076e-05, + "loss": 0.0345, + "step": 79900 + }, + { + "epoch": 0.39455, + "grad_norm": 0.08184736222028732, + "learning_rate": 3.5076822366262816e-05, + "loss": 0.034, + "step": 79910 + }, + { + "epoch": 0.3946, + "grad_norm": 0.09934855997562408, + "learning_rate": 3.5073039222378344e-05, + "loss": 0.035, + "step": 79920 + }, + { + "epoch": 0.39465, + "grad_norm": 0.10376731306314468, + "learning_rate": 3.506925580310076e-05, + "loss": 0.0353, + "step": 79930 + }, + { + "epoch": 0.3947, + "grad_norm": 0.10761922597885132, + "learning_rate": 3.5065472108533505e-05, + "loss": 0.0346, + "step": 79940 + }, + { + "epoch": 0.39475, + "grad_norm": 0.10313154757022858, + "learning_rate": 3.506168813878002e-05, + "loss": 0.0351, + "step": 79950 + }, + { + "epoch": 0.3948, + "grad_norm": 0.091402068734169, + "learning_rate": 3.505790389394377e-05, + "loss": 0.0349, + "step": 79960 + }, + { + "epoch": 0.39485, + "grad_norm": 0.08425392955541611, + "learning_rate": 3.505411937412819e-05, + "loss": 0.0348, + "step": 79970 + }, + { + "epoch": 0.3949, + "grad_norm": 0.070528544485569, + "learning_rate": 3.505033457943678e-05, + "loss": 0.0361, + "step": 79980 + }, + { + "epoch": 0.39495, + "grad_norm": 0.08500799536705017, + "learning_rate": 3.504654950997299e-05, + "loss": 0.0353, + "step": 79990 + }, + { + "epoch": 0.395, + "grad_norm": 0.07569961994886398, + "learning_rate": 3.5042764165840314e-05, + "loss": 0.0368, + "step": 80000 + }, + { + "epoch": 5e-05, + "grad_norm": 0.07905327528715134, + "learning_rate": 3.5038978547142234e-05, + "loss": 0.0368, + "step": 80010 + }, + { + "epoch": 0.0001, + "grad_norm": 0.08307768404483795, + "learning_rate": 3.503519265398226e-05, + "loss": 0.0351, + "step": 80020 + }, + { + "epoch": 0.00015, + "grad_norm": 0.07752068340778351, + "learning_rate": 3.503140648646388e-05, + "loss": 0.0345, + "step": 80030 + }, + { + "epoch": 0.0002, + "grad_norm": 0.07796391099691391, + "learning_rate": 3.502762004469062e-05, + "loss": 0.0358, + "step": 80040 + }, + { + "epoch": 0.00025, + "grad_norm": 0.0741119384765625, + "learning_rate": 3.502383332876599e-05, + "loss": 0.0373, + "step": 80050 + }, + { + "epoch": 0.0003, + "grad_norm": 0.08147093653678894, + "learning_rate": 3.502004633879353e-05, + "loss": 0.035, + "step": 80060 + }, + { + "epoch": 0.00035, + "grad_norm": 0.08216322958469391, + "learning_rate": 3.5016259074876764e-05, + "loss": 0.035, + "step": 80070 + }, + { + "epoch": 0.0004, + "grad_norm": 0.07296330481767654, + "learning_rate": 3.501247153711924e-05, + "loss": 0.0356, + "step": 80080 + }, + { + "epoch": 0.00045, + "grad_norm": 0.09399837255477905, + "learning_rate": 3.5008683725624506e-05, + "loss": 0.0352, + "step": 80090 + }, + { + "epoch": 0.0005, + "grad_norm": 0.0898161381483078, + "learning_rate": 3.5004895640496113e-05, + "loss": 0.0351, + "step": 80100 + }, + { + "epoch": 0.00055, + "grad_norm": 0.09958512336015701, + "learning_rate": 3.5001107281837635e-05, + "loss": 0.0355, + "step": 80110 + }, + { + "epoch": 0.0006, + "grad_norm": 0.0894375592470169, + "learning_rate": 3.499731864975264e-05, + "loss": 0.0405, + "step": 80120 + }, + { + "epoch": 0.00065, + "grad_norm": 0.1024933010339737, + "learning_rate": 3.499352974434472e-05, + "loss": 0.0381, + "step": 80130 + }, + { + "epoch": 0.0007, + "grad_norm": 0.07921604067087173, + "learning_rate": 3.498974056571744e-05, + "loss": 0.0367, + "step": 80140 + }, + { + "epoch": 0.00075, + "grad_norm": 0.09752549976110458, + "learning_rate": 3.49859511139744e-05, + "loss": 0.0367, + "step": 80150 + }, + { + "epoch": 0.0008, + "grad_norm": 0.09751075506210327, + "learning_rate": 3.4982161389219214e-05, + "loss": 0.0393, + "step": 80160 + }, + { + "epoch": 0.00085, + "grad_norm": 0.08204904943704605, + "learning_rate": 3.49783713915555e-05, + "loss": 0.0363, + "step": 80170 + }, + { + "epoch": 0.0009, + "grad_norm": 0.09174011647701263, + "learning_rate": 3.497458112108684e-05, + "loss": 0.0348, + "step": 80180 + }, + { + "epoch": 0.00095, + "grad_norm": 0.12619981169700623, + "learning_rate": 3.49707905779169e-05, + "loss": 0.0387, + "step": 80190 + }, + { + "epoch": 0.001, + "grad_norm": 0.09585075080394745, + "learning_rate": 3.496699976214927e-05, + "loss": 0.038, + "step": 80200 + }, + { + "epoch": 0.00105, + "grad_norm": 0.09432283043861389, + "learning_rate": 3.496320867388762e-05, + "loss": 0.0357, + "step": 80210 + }, + { + "epoch": 0.0011, + "grad_norm": 0.09583482891321182, + "learning_rate": 3.4959417313235585e-05, + "loss": 0.0366, + "step": 80220 + }, + { + "epoch": 0.00115, + "grad_norm": 0.09143300354480743, + "learning_rate": 3.495562568029683e-05, + "loss": 0.0375, + "step": 80230 + }, + { + "epoch": 0.0012, + "grad_norm": 0.08289297670125961, + "learning_rate": 3.4951833775175005e-05, + "loss": 0.0367, + "step": 80240 + }, + { + "epoch": 0.00125, + "grad_norm": 0.07733283191919327, + "learning_rate": 3.494804159797378e-05, + "loss": 0.0375, + "step": 80250 + }, + { + "epoch": 0.0013, + "grad_norm": 0.0847741961479187, + "learning_rate": 3.4944249148796845e-05, + "loss": 0.04, + "step": 80260 + }, + { + "epoch": 0.00135, + "grad_norm": 0.08987481892108917, + "learning_rate": 3.4940456427747866e-05, + "loss": 0.0353, + "step": 80270 + }, + { + "epoch": 0.0014, + "grad_norm": 0.0879490077495575, + "learning_rate": 3.493666343493054e-05, + "loss": 0.0368, + "step": 80280 + }, + { + "epoch": 0.00145, + "grad_norm": 0.09618838876485825, + "learning_rate": 3.493287017044857e-05, + "loss": 0.0369, + "step": 80290 + }, + { + "epoch": 0.0015, + "grad_norm": 0.0725872665643692, + "learning_rate": 3.4929076634405667e-05, + "loss": 0.0345, + "step": 80300 + }, + { + "epoch": 0.00155, + "grad_norm": 0.08574867993593216, + "learning_rate": 3.4925282826905533e-05, + "loss": 0.0349, + "step": 80310 + }, + { + "epoch": 0.0016, + "grad_norm": 0.0960630550980568, + "learning_rate": 3.49214887480519e-05, + "loss": 0.0364, + "step": 80320 + }, + { + "epoch": 0.00165, + "grad_norm": 0.08120320737361908, + "learning_rate": 3.491769439794849e-05, + "loss": 0.035, + "step": 80330 + }, + { + "epoch": 0.0017, + "grad_norm": 0.09600193053483963, + "learning_rate": 3.491389977669904e-05, + "loss": 0.0357, + "step": 80340 + }, + { + "epoch": 0.00175, + "grad_norm": 0.07628503441810608, + "learning_rate": 3.4910104884407294e-05, + "loss": 0.0355, + "step": 80350 + }, + { + "epoch": 0.0018, + "grad_norm": 0.08768440037965775, + "learning_rate": 3.490630972117701e-05, + "loss": 0.0362, + "step": 80360 + }, + { + "epoch": 0.00185, + "grad_norm": 0.09640726447105408, + "learning_rate": 3.490251428711193e-05, + "loss": 0.0387, + "step": 80370 + }, + { + "epoch": 0.0019, + "grad_norm": 0.08039938658475876, + "learning_rate": 3.489871858231584e-05, + "loss": 0.0367, + "step": 80380 + }, + { + "epoch": 0.00195, + "grad_norm": 0.08057080209255219, + "learning_rate": 3.48949226068925e-05, + "loss": 0.0372, + "step": 80390 + }, + { + "epoch": 0.002, + "grad_norm": 0.08435969054698944, + "learning_rate": 3.489112636094569e-05, + "loss": 0.0377, + "step": 80400 + }, + { + "epoch": 0.00205, + "grad_norm": 0.08451063185930252, + "learning_rate": 3.4887329844579194e-05, + "loss": 0.038, + "step": 80410 + }, + { + "epoch": 0.0021, + "grad_norm": 0.07690353691577911, + "learning_rate": 3.4883533057896826e-05, + "loss": 0.0355, + "step": 80420 + }, + { + "epoch": 0.00215, + "grad_norm": 0.09554172307252884, + "learning_rate": 3.4879736001002375e-05, + "loss": 0.0375, + "step": 80430 + }, + { + "epoch": 0.0022, + "grad_norm": 0.07848838716745377, + "learning_rate": 3.4875938673999654e-05, + "loss": 0.0371, + "step": 80440 + }, + { + "epoch": 0.00225, + "grad_norm": 0.09010570496320724, + "learning_rate": 3.4872141076992476e-05, + "loss": 0.0355, + "step": 80450 + }, + { + "epoch": 0.0023, + "grad_norm": 0.07799280434846878, + "learning_rate": 3.486834321008467e-05, + "loss": 0.0383, + "step": 80460 + }, + { + "epoch": 0.00235, + "grad_norm": 0.10257743299007416, + "learning_rate": 3.4864545073380065e-05, + "loss": 0.0356, + "step": 80470 + }, + { + "epoch": 0.0024, + "grad_norm": 0.07939010113477707, + "learning_rate": 3.486074666698251e-05, + "loss": 0.0355, + "step": 80480 + }, + { + "epoch": 0.00245, + "grad_norm": 0.09207401424646378, + "learning_rate": 3.485694799099585e-05, + "loss": 0.0356, + "step": 80490 + }, + { + "epoch": 0.0025, + "grad_norm": 0.08484499156475067, + "learning_rate": 3.485314904552392e-05, + "loss": 0.036, + "step": 80500 + }, + { + "epoch": 0.00255, + "grad_norm": 0.1318163424730301, + "learning_rate": 3.4849349830670615e-05, + "loss": 0.0367, + "step": 80510 + }, + { + "epoch": 0.0026, + "grad_norm": 0.1633630394935608, + "learning_rate": 3.484555034653977e-05, + "loss": 0.0361, + "step": 80520 + }, + { + "epoch": 0.00265, + "grad_norm": 0.09611942619085312, + "learning_rate": 3.4841750593235285e-05, + "loss": 0.037, + "step": 80530 + }, + { + "epoch": 0.0027, + "grad_norm": 0.08366899192333221, + "learning_rate": 3.483795057086104e-05, + "loss": 0.0356, + "step": 80540 + }, + { + "epoch": 0.00275, + "grad_norm": 0.08930250257253647, + "learning_rate": 3.4834150279520916e-05, + "loss": 0.0359, + "step": 80550 + }, + { + "epoch": 0.0028, + "grad_norm": 0.08847183734178543, + "learning_rate": 3.4830349719318815e-05, + "loss": 0.0367, + "step": 80560 + }, + { + "epoch": 0.00285, + "grad_norm": 0.08116517961025238, + "learning_rate": 3.4826548890358656e-05, + "loss": 0.0367, + "step": 80570 + }, + { + "epoch": 0.0029, + "grad_norm": 0.08152367919683456, + "learning_rate": 3.482274779274433e-05, + "loss": 0.0348, + "step": 80580 + }, + { + "epoch": 0.00295, + "grad_norm": 0.08631303906440735, + "learning_rate": 3.481894642657977e-05, + "loss": 0.0366, + "step": 80590 + }, + { + "epoch": 0.003, + "grad_norm": 0.09753107279539108, + "learning_rate": 3.481514479196891e-05, + "loss": 0.0377, + "step": 80600 + }, + { + "epoch": 0.00305, + "grad_norm": 0.07813764363527298, + "learning_rate": 3.4811342889015686e-05, + "loss": 0.0379, + "step": 80610 + }, + { + "epoch": 0.0031, + "grad_norm": 0.08773379027843475, + "learning_rate": 3.4807540717824025e-05, + "loss": 0.0362, + "step": 80620 + }, + { + "epoch": 0.00315, + "grad_norm": 0.10971876233816147, + "learning_rate": 3.480373827849789e-05, + "loss": 0.0367, + "step": 80630 + }, + { + "epoch": 0.0032, + "grad_norm": 0.08488216251134872, + "learning_rate": 3.4799935571141226e-05, + "loss": 0.0375, + "step": 80640 + }, + { + "epoch": 0.00325, + "grad_norm": 0.08824058622121811, + "learning_rate": 3.479613259585801e-05, + "loss": 0.0355, + "step": 80650 + }, + { + "epoch": 0.0033, + "grad_norm": 0.1005680039525032, + "learning_rate": 3.47923293527522e-05, + "loss": 0.0363, + "step": 80660 + }, + { + "epoch": 0.00335, + "grad_norm": 0.0813353955745697, + "learning_rate": 3.4788525841927794e-05, + "loss": 0.0354, + "step": 80670 + }, + { + "epoch": 0.0034, + "grad_norm": 0.08333186060190201, + "learning_rate": 3.4784722063488764e-05, + "loss": 0.0359, + "step": 80680 + }, + { + "epoch": 0.00345, + "grad_norm": 0.11254861950874329, + "learning_rate": 3.478091801753912e-05, + "loss": 0.0354, + "step": 80690 + }, + { + "epoch": 0.0035, + "grad_norm": 0.09645597636699677, + "learning_rate": 3.477711370418284e-05, + "loss": 0.0355, + "step": 80700 + }, + { + "epoch": 0.00355, + "grad_norm": 0.08780481666326523, + "learning_rate": 3.4773309123523946e-05, + "loss": 0.0373, + "step": 80710 + }, + { + "epoch": 0.0036, + "grad_norm": 0.08916894346475601, + "learning_rate": 3.476950427566645e-05, + "loss": 0.0356, + "step": 80720 + }, + { + "epoch": 0.00365, + "grad_norm": 0.10412748157978058, + "learning_rate": 3.476569916071438e-05, + "loss": 0.0356, + "step": 80730 + }, + { + "epoch": 0.0037, + "grad_norm": 0.10716842114925385, + "learning_rate": 3.4761893778771766e-05, + "loss": 0.0375, + "step": 80740 + }, + { + "epoch": 0.00375, + "grad_norm": 0.10552069544792175, + "learning_rate": 3.475808812994264e-05, + "loss": 0.038, + "step": 80750 + }, + { + "epoch": 0.0038, + "grad_norm": 0.10847306251525879, + "learning_rate": 3.475428221433106e-05, + "loss": 0.0364, + "step": 80760 + }, + { + "epoch": 0.00385, + "grad_norm": 0.09877172857522964, + "learning_rate": 3.4750476032041054e-05, + "loss": 0.0377, + "step": 80770 + }, + { + "epoch": 0.0039, + "grad_norm": 0.06513556838035583, + "learning_rate": 3.474666958317671e-05, + "loss": 0.0366, + "step": 80780 + }, + { + "epoch": 0.00395, + "grad_norm": 0.07210590690374374, + "learning_rate": 3.474286286784207e-05, + "loss": 0.0359, + "step": 80790 + }, + { + "epoch": 0.004, + "grad_norm": 0.07605480402708054, + "learning_rate": 3.473905588614122e-05, + "loss": 0.0365, + "step": 80800 + }, + { + "epoch": 0.00405, + "grad_norm": 0.07498470693826675, + "learning_rate": 3.4735248638178255e-05, + "loss": 0.0371, + "step": 80810 + }, + { + "epoch": 0.0041, + "grad_norm": 0.08058614283800125, + "learning_rate": 3.473144112405724e-05, + "loss": 0.0356, + "step": 80820 + }, + { + "epoch": 0.00415, + "grad_norm": 0.0745462104678154, + "learning_rate": 3.472763334388228e-05, + "loss": 0.0359, + "step": 80830 + }, + { + "epoch": 0.0042, + "grad_norm": 0.07761227339506149, + "learning_rate": 3.472382529775748e-05, + "loss": 0.0355, + "step": 80840 + }, + { + "epoch": 0.00425, + "grad_norm": 0.07811295241117477, + "learning_rate": 3.472001698578694e-05, + "loss": 0.0343, + "step": 80850 + }, + { + "epoch": 0.0043, + "grad_norm": 0.08239776641130447, + "learning_rate": 3.47162084080748e-05, + "loss": 0.0365, + "step": 80860 + }, + { + "epoch": 0.00435, + "grad_norm": 0.07351667433977127, + "learning_rate": 3.471239956472517e-05, + "loss": 0.0348, + "step": 80870 + }, + { + "epoch": 0.0044, + "grad_norm": 0.07278984785079956, + "learning_rate": 3.470859045584218e-05, + "loss": 0.036, + "step": 80880 + }, + { + "epoch": 0.00445, + "grad_norm": 0.08412481844425201, + "learning_rate": 3.470478108152998e-05, + "loss": 0.0371, + "step": 80890 + }, + { + "epoch": 0.0045, + "grad_norm": 0.10147181153297424, + "learning_rate": 3.470097144189272e-05, + "loss": 0.0368, + "step": 80900 + }, + { + "epoch": 0.00455, + "grad_norm": 0.09770499169826508, + "learning_rate": 3.469716153703452e-05, + "loss": 0.0354, + "step": 80910 + }, + { + "epoch": 0.0046, + "grad_norm": 0.10202199220657349, + "learning_rate": 3.469335136705958e-05, + "loss": 0.0372, + "step": 80920 + }, + { + "epoch": 0.00465, + "grad_norm": 0.10253550112247467, + "learning_rate": 3.4689540932072057e-05, + "loss": 0.0377, + "step": 80930 + }, + { + "epoch": 0.0047, + "grad_norm": 0.09305887669324875, + "learning_rate": 3.468573023217613e-05, + "loss": 0.0363, + "step": 80940 + }, + { + "epoch": 0.00475, + "grad_norm": 0.11431948095560074, + "learning_rate": 3.468191926747597e-05, + "loss": 0.036, + "step": 80950 + }, + { + "epoch": 0.0048, + "grad_norm": 0.08740732073783875, + "learning_rate": 3.4678108038075775e-05, + "loss": 0.0361, + "step": 80960 + }, + { + "epoch": 0.00485, + "grad_norm": 0.07707460969686508, + "learning_rate": 3.4674296544079745e-05, + "loss": 0.0352, + "step": 80970 + }, + { + "epoch": 0.0049, + "grad_norm": 0.06795619428157806, + "learning_rate": 3.467048478559208e-05, + "loss": 0.0357, + "step": 80980 + }, + { + "epoch": 0.00495, + "grad_norm": 0.0804039016366005, + "learning_rate": 3.466667276271699e-05, + "loss": 0.0358, + "step": 80990 + }, + { + "epoch": 0.005, + "grad_norm": 0.08920106291770935, + "learning_rate": 3.46628604755587e-05, + "loss": 0.0372, + "step": 81000 + }, + { + "epoch": 0.00505, + "grad_norm": 0.09149488061666489, + "learning_rate": 3.465904792422144e-05, + "loss": 0.0382, + "step": 81010 + }, + { + "epoch": 0.0051, + "grad_norm": 0.08531561493873596, + "learning_rate": 3.465523510880943e-05, + "loss": 0.0373, + "step": 81020 + }, + { + "epoch": 0.00515, + "grad_norm": 0.09696276485919952, + "learning_rate": 3.4651422029426926e-05, + "loss": 0.036, + "step": 81030 + }, + { + "epoch": 0.0052, + "grad_norm": 0.06882733106613159, + "learning_rate": 3.464760868617817e-05, + "loss": 0.0363, + "step": 81040 + }, + { + "epoch": 0.00525, + "grad_norm": 0.07693909108638763, + "learning_rate": 3.4643795079167414e-05, + "loss": 0.0365, + "step": 81050 + }, + { + "epoch": 0.0053, + "grad_norm": 0.0854329913854599, + "learning_rate": 3.4639981208498924e-05, + "loss": 0.0354, + "step": 81060 + }, + { + "epoch": 0.00535, + "grad_norm": 0.09463875740766525, + "learning_rate": 3.463616707427697e-05, + "loss": 0.0382, + "step": 81070 + }, + { + "epoch": 0.0054, + "grad_norm": 0.08794492483139038, + "learning_rate": 3.463235267660583e-05, + "loss": 0.0367, + "step": 81080 + }, + { + "epoch": 0.00545, + "grad_norm": 0.08704029768705368, + "learning_rate": 3.462853801558979e-05, + "loss": 0.0351, + "step": 81090 + }, + { + "epoch": 0.0055, + "grad_norm": 0.08887019008398056, + "learning_rate": 3.462472309133312e-05, + "loss": 0.0382, + "step": 81100 + }, + { + "epoch": 0.00555, + "grad_norm": 0.08907011151313782, + "learning_rate": 3.4620907903940156e-05, + "loss": 0.0365, + "step": 81110 + }, + { + "epoch": 0.0056, + "grad_norm": 0.11149367690086365, + "learning_rate": 3.461709245351518e-05, + "loss": 0.0345, + "step": 81120 + }, + { + "epoch": 0.00565, + "grad_norm": 0.09168368577957153, + "learning_rate": 3.4613276740162506e-05, + "loss": 0.0363, + "step": 81130 + }, + { + "epoch": 0.0057, + "grad_norm": 0.10839654505252838, + "learning_rate": 3.4609460763986454e-05, + "loss": 0.0356, + "step": 81140 + }, + { + "epoch": 0.00575, + "grad_norm": 0.08748756349086761, + "learning_rate": 3.460564452509137e-05, + "loss": 0.0372, + "step": 81150 + }, + { + "epoch": 0.0058, + "grad_norm": 0.11382316797971725, + "learning_rate": 3.4601828023581554e-05, + "loss": 0.0354, + "step": 81160 + }, + { + "epoch": 0.00585, + "grad_norm": 0.10300473123788834, + "learning_rate": 3.459801125956138e-05, + "loss": 0.0359, + "step": 81170 + }, + { + "epoch": 0.0059, + "grad_norm": 0.08995317667722702, + "learning_rate": 3.459419423313517e-05, + "loss": 0.0375, + "step": 81180 + }, + { + "epoch": 0.00595, + "grad_norm": 0.07771133631467819, + "learning_rate": 3.459037694440731e-05, + "loss": 0.0351, + "step": 81190 + }, + { + "epoch": 0.006, + "grad_norm": 0.08672454953193665, + "learning_rate": 3.458655939348214e-05, + "loss": 0.0359, + "step": 81200 + }, + { + "epoch": 0.00605, + "grad_norm": 0.09423719346523285, + "learning_rate": 3.4582741580464044e-05, + "loss": 0.0367, + "step": 81210 + }, + { + "epoch": 0.0061, + "grad_norm": 0.07885994762182236, + "learning_rate": 3.457892350545738e-05, + "loss": 0.0368, + "step": 81220 + }, + { + "epoch": 0.00615, + "grad_norm": 0.07632511854171753, + "learning_rate": 3.457510516856656e-05, + "loss": 0.0359, + "step": 81230 + }, + { + "epoch": 0.0062, + "grad_norm": 0.07641930133104324, + "learning_rate": 3.457128656989596e-05, + "loss": 0.0371, + "step": 81240 + }, + { + "epoch": 0.00625, + "grad_norm": 0.09359002113342285, + "learning_rate": 3.456746770954997e-05, + "loss": 0.0367, + "step": 81250 + }, + { + "epoch": 0.0063, + "grad_norm": 0.09460229426622391, + "learning_rate": 3.456364858763301e-05, + "loss": 0.0391, + "step": 81260 + }, + { + "epoch": 0.00635, + "grad_norm": 0.09273259341716766, + "learning_rate": 3.45598292042495e-05, + "loss": 0.035, + "step": 81270 + }, + { + "epoch": 0.0064, + "grad_norm": 0.0678936317563057, + "learning_rate": 3.455600955950385e-05, + "loss": 0.035, + "step": 81280 + }, + { + "epoch": 0.00645, + "grad_norm": 0.0818251222372055, + "learning_rate": 3.4552189653500475e-05, + "loss": 0.0355, + "step": 81290 + }, + { + "epoch": 0.0065, + "grad_norm": 0.07798295468091965, + "learning_rate": 3.454836948634383e-05, + "loss": 0.0348, + "step": 81300 + }, + { + "epoch": 0.00655, + "grad_norm": 0.07539451122283936, + "learning_rate": 3.454454905813835e-05, + "loss": 0.0358, + "step": 81310 + }, + { + "epoch": 0.0066, + "grad_norm": 0.07479606568813324, + "learning_rate": 3.454072836898849e-05, + "loss": 0.0356, + "step": 81320 + }, + { + "epoch": 0.00665, + "grad_norm": 0.08967671543359756, + "learning_rate": 3.45369074189987e-05, + "loss": 0.0353, + "step": 81330 + }, + { + "epoch": 0.0067, + "grad_norm": 0.07154927402734756, + "learning_rate": 3.4533086208273447e-05, + "loss": 0.0355, + "step": 81340 + }, + { + "epoch": 0.00675, + "grad_norm": 0.08140010386705399, + "learning_rate": 3.45292647369172e-05, + "loss": 0.0361, + "step": 81350 + }, + { + "epoch": 0.0068, + "grad_norm": 0.08675054460763931, + "learning_rate": 3.452544300503442e-05, + "loss": 0.0365, + "step": 81360 + }, + { + "epoch": 0.00685, + "grad_norm": 0.08151241391897202, + "learning_rate": 3.452162101272961e-05, + "loss": 0.034, + "step": 81370 + }, + { + "epoch": 0.0069, + "grad_norm": 0.07963025569915771, + "learning_rate": 3.451779876010727e-05, + "loss": 0.037, + "step": 81380 + }, + { + "epoch": 0.00695, + "grad_norm": 0.08118466287851334, + "learning_rate": 3.4513976247271885e-05, + "loss": 0.0343, + "step": 81390 + }, + { + "epoch": 0.007, + "grad_norm": 0.07270652055740356, + "learning_rate": 3.451015347432796e-05, + "loss": 0.036, + "step": 81400 + }, + { + "epoch": 0.00705, + "grad_norm": 0.09753786772489548, + "learning_rate": 3.450633044138001e-05, + "loss": 0.037, + "step": 81410 + }, + { + "epoch": 0.0071, + "grad_norm": 0.07925489544868469, + "learning_rate": 3.4502507148532556e-05, + "loss": 0.0366, + "step": 81420 + }, + { + "epoch": 0.00715, + "grad_norm": 0.09283397346735, + "learning_rate": 3.449868359589013e-05, + "loss": 0.0363, + "step": 81430 + }, + { + "epoch": 0.0072, + "grad_norm": 0.08941236138343811, + "learning_rate": 3.449485978355726e-05, + "loss": 0.0379, + "step": 81440 + }, + { + "epoch": 0.00725, + "grad_norm": 0.09721534699201584, + "learning_rate": 3.44910357116385e-05, + "loss": 0.0376, + "step": 81450 + }, + { + "epoch": 0.0073, + "grad_norm": 0.08483276516199112, + "learning_rate": 3.448721138023838e-05, + "loss": 0.0367, + "step": 81460 + }, + { + "epoch": 0.00735, + "grad_norm": 0.09143737703561783, + "learning_rate": 3.448338678946147e-05, + "loss": 0.0388, + "step": 81470 + }, + { + "epoch": 0.0074, + "grad_norm": 0.0824364721775055, + "learning_rate": 3.447956193941233e-05, + "loss": 0.0359, + "step": 81480 + }, + { + "epoch": 0.00745, + "grad_norm": 0.07745369523763657, + "learning_rate": 3.4475736830195516e-05, + "loss": 0.0373, + "step": 81490 + }, + { + "epoch": 0.0075, + "grad_norm": 0.07666676491498947, + "learning_rate": 3.447191146191563e-05, + "loss": 0.0362, + "step": 81500 + }, + { + "epoch": 0.00755, + "grad_norm": 0.09171445667743683, + "learning_rate": 3.446808583467723e-05, + "loss": 0.0386, + "step": 81510 + }, + { + "epoch": 0.0076, + "grad_norm": 0.09327006340026855, + "learning_rate": 3.446425994858493e-05, + "loss": 0.0387, + "step": 81520 + }, + { + "epoch": 0.00765, + "grad_norm": 0.08528904616832733, + "learning_rate": 3.446043380374332e-05, + "loss": 0.0375, + "step": 81530 + }, + { + "epoch": 0.0077, + "grad_norm": 0.08111163973808289, + "learning_rate": 3.4456607400256994e-05, + "loss": 0.0363, + "step": 81540 + }, + { + "epoch": 0.00775, + "grad_norm": 0.07948674261569977, + "learning_rate": 3.4452780738230584e-05, + "loss": 0.0379, + "step": 81550 + }, + { + "epoch": 0.0078, + "grad_norm": 0.1013888418674469, + "learning_rate": 3.444895381776869e-05, + "loss": 0.0381, + "step": 81560 + }, + { + "epoch": 0.00785, + "grad_norm": 0.06877487152814865, + "learning_rate": 3.444512663897596e-05, + "loss": 0.0358, + "step": 81570 + }, + { + "epoch": 0.0079, + "grad_norm": 0.10422961413860321, + "learning_rate": 3.444129920195701e-05, + "loss": 0.037, + "step": 81580 + }, + { + "epoch": 0.00795, + "grad_norm": 0.07992278784513474, + "learning_rate": 3.4437471506816497e-05, + "loss": 0.037, + "step": 81590 + }, + { + "epoch": 0.008, + "grad_norm": 0.06682839244604111, + "learning_rate": 3.443364355365905e-05, + "loss": 0.0365, + "step": 81600 + }, + { + "epoch": 0.00805, + "grad_norm": 0.09197988361120224, + "learning_rate": 3.442981534258932e-05, + "loss": 0.0364, + "step": 81610 + }, + { + "epoch": 0.0081, + "grad_norm": 0.08035583049058914, + "learning_rate": 3.442598687371199e-05, + "loss": 0.0375, + "step": 81620 + }, + { + "epoch": 0.00815, + "grad_norm": 0.08478248864412308, + "learning_rate": 3.4422158147131726e-05, + "loss": 0.0372, + "step": 81630 + }, + { + "epoch": 0.0082, + "grad_norm": 0.08741386979818344, + "learning_rate": 3.4418329162953196e-05, + "loss": 0.0377, + "step": 81640 + }, + { + "epoch": 0.00825, + "grad_norm": 0.07935051620006561, + "learning_rate": 3.441449992128108e-05, + "loss": 0.0366, + "step": 81650 + }, + { + "epoch": 0.0083, + "grad_norm": 0.08112239092588425, + "learning_rate": 3.441067042222008e-05, + "loss": 0.0366, + "step": 81660 + }, + { + "epoch": 0.00835, + "grad_norm": 0.08402236551046371, + "learning_rate": 3.440684066587489e-05, + "loss": 0.0358, + "step": 81670 + }, + { + "epoch": 0.0084, + "grad_norm": 0.07969480007886887, + "learning_rate": 3.440301065235019e-05, + "loss": 0.0377, + "step": 81680 + }, + { + "epoch": 0.00845, + "grad_norm": 0.09947582334280014, + "learning_rate": 3.439918038175073e-05, + "loss": 0.0362, + "step": 81690 + }, + { + "epoch": 0.0085, + "grad_norm": 0.08369084447622299, + "learning_rate": 3.43953498541812e-05, + "loss": 0.0366, + "step": 81700 + }, + { + "epoch": 0.00855, + "grad_norm": 0.07206384837627411, + "learning_rate": 3.439151906974635e-05, + "loss": 0.0364, + "step": 81710 + }, + { + "epoch": 0.0086, + "grad_norm": 0.10137312859296799, + "learning_rate": 3.438768802855088e-05, + "loss": 0.0365, + "step": 81720 + }, + { + "epoch": 0.00865, + "grad_norm": 0.07239537686109543, + "learning_rate": 3.4383856730699546e-05, + "loss": 0.0392, + "step": 81730 + }, + { + "epoch": 0.0087, + "grad_norm": 0.07529709488153458, + "learning_rate": 3.4380025176297095e-05, + "loss": 0.0352, + "step": 81740 + }, + { + "epoch": 0.00875, + "grad_norm": 0.10035303235054016, + "learning_rate": 3.43761933654483e-05, + "loss": 0.0355, + "step": 81750 + }, + { + "epoch": 0.0088, + "grad_norm": 0.09971463680267334, + "learning_rate": 3.4372361298257875e-05, + "loss": 0.0349, + "step": 81760 + }, + { + "epoch": 0.00885, + "grad_norm": 0.09327927976846695, + "learning_rate": 3.436852897483062e-05, + "loss": 0.0392, + "step": 81770 + }, + { + "epoch": 0.0089, + "grad_norm": 0.08059550821781158, + "learning_rate": 3.4364696395271315e-05, + "loss": 0.036, + "step": 81780 + }, + { + "epoch": 0.00895, + "grad_norm": 0.07822158932685852, + "learning_rate": 3.4360863559684715e-05, + "loss": 0.0355, + "step": 81790 + }, + { + "epoch": 0.009, + "grad_norm": 0.08221601694822311, + "learning_rate": 3.435703046817562e-05, + "loss": 0.0353, + "step": 81800 + }, + { + "epoch": 0.00905, + "grad_norm": 0.08299103379249573, + "learning_rate": 3.4353197120848833e-05, + "loss": 0.0358, + "step": 81810 + }, + { + "epoch": 0.0091, + "grad_norm": 0.08895883709192276, + "learning_rate": 3.4349363517809156e-05, + "loss": 0.0367, + "step": 81820 + }, + { + "epoch": 0.00915, + "grad_norm": 0.09609355032444, + "learning_rate": 3.434552965916138e-05, + "loss": 0.0366, + "step": 81830 + }, + { + "epoch": 0.0092, + "grad_norm": 0.08394278585910797, + "learning_rate": 3.434169554501035e-05, + "loss": 0.0369, + "step": 81840 + }, + { + "epoch": 0.00925, + "grad_norm": 0.08705271780490875, + "learning_rate": 3.4337861175460864e-05, + "loss": 0.0373, + "step": 81850 + }, + { + "epoch": 0.0093, + "grad_norm": 0.0820394977927208, + "learning_rate": 3.433402655061777e-05, + "loss": 0.0349, + "step": 81860 + }, + { + "epoch": 0.00935, + "grad_norm": 0.08074338734149933, + "learning_rate": 3.433019167058588e-05, + "loss": 0.0355, + "step": 81870 + }, + { + "epoch": 0.0094, + "grad_norm": 0.07878132909536362, + "learning_rate": 3.432635653547007e-05, + "loss": 0.035, + "step": 81880 + }, + { + "epoch": 0.00945, + "grad_norm": 0.09017588943243027, + "learning_rate": 3.4322521145375167e-05, + "loss": 0.0357, + "step": 81890 + }, + { + "epoch": 0.0095, + "grad_norm": 0.0714225172996521, + "learning_rate": 3.4318685500406045e-05, + "loss": 0.0354, + "step": 81900 + }, + { + "epoch": 0.00955, + "grad_norm": 0.077426977455616, + "learning_rate": 3.431484960066756e-05, + "loss": 0.0366, + "step": 81910 + }, + { + "epoch": 0.0096, + "grad_norm": 0.08480240404605865, + "learning_rate": 3.4311013446264586e-05, + "loss": 0.0359, + "step": 81920 + }, + { + "epoch": 0.00965, + "grad_norm": 0.08634501695632935, + "learning_rate": 3.4307177037301996e-05, + "loss": 0.0348, + "step": 81930 + }, + { + "epoch": 0.0097, + "grad_norm": 0.08254533261060715, + "learning_rate": 3.430334037388469e-05, + "loss": 0.0353, + "step": 81940 + }, + { + "epoch": 0.00975, + "grad_norm": 0.08382163196802139, + "learning_rate": 3.4299503456117546e-05, + "loss": 0.0374, + "step": 81950 + }, + { + "epoch": 0.0098, + "grad_norm": 0.09046601504087448, + "learning_rate": 3.429566628410548e-05, + "loss": 0.0361, + "step": 81960 + }, + { + "epoch": 0.00985, + "grad_norm": 0.08730943500995636, + "learning_rate": 3.429182885795339e-05, + "loss": 0.0351, + "step": 81970 + }, + { + "epoch": 0.0099, + "grad_norm": 0.09844104200601578, + "learning_rate": 3.4287991177766184e-05, + "loss": 0.0366, + "step": 81980 + }, + { + "epoch": 0.00995, + "grad_norm": 0.07887846976518631, + "learning_rate": 3.428415324364879e-05, + "loss": 0.0367, + "step": 81990 + }, + { + "epoch": 0.01, + "grad_norm": 0.09441222995519638, + "learning_rate": 3.428031505570614e-05, + "loss": 0.0361, + "step": 82000 + }, + { + "epoch": 0.01005, + "grad_norm": 0.08640168607234955, + "learning_rate": 3.427647661404315e-05, + "loss": 0.0362, + "step": 82010 + }, + { + "epoch": 0.0101, + "grad_norm": 0.07743245363235474, + "learning_rate": 3.427263791876478e-05, + "loss": 0.0362, + "step": 82020 + }, + { + "epoch": 0.01015, + "grad_norm": 0.07744090259075165, + "learning_rate": 3.426879896997598e-05, + "loss": 0.035, + "step": 82030 + }, + { + "epoch": 0.0102, + "grad_norm": 0.08554510772228241, + "learning_rate": 3.42649597677817e-05, + "loss": 0.0359, + "step": 82040 + }, + { + "epoch": 0.01025, + "grad_norm": 0.08902224898338318, + "learning_rate": 3.426112031228689e-05, + "loss": 0.0355, + "step": 82050 + }, + { + "epoch": 0.0103, + "grad_norm": 0.09727823734283447, + "learning_rate": 3.425728060359653e-05, + "loss": 0.0356, + "step": 82060 + }, + { + "epoch": 0.01035, + "grad_norm": 0.0850490853190422, + "learning_rate": 3.42534406418156e-05, + "loss": 0.0354, + "step": 82070 + }, + { + "epoch": 0.0104, + "grad_norm": 0.06924112141132355, + "learning_rate": 3.424960042704908e-05, + "loss": 0.0352, + "step": 82080 + }, + { + "epoch": 0.01045, + "grad_norm": 0.07539879530668259, + "learning_rate": 3.424575995940196e-05, + "loss": 0.0344, + "step": 82090 + }, + { + "epoch": 0.0105, + "grad_norm": 0.07704594731330872, + "learning_rate": 3.424191923897923e-05, + "loss": 0.0365, + "step": 82100 + }, + { + "epoch": 0.01055, + "grad_norm": 0.07837717980146408, + "learning_rate": 3.423807826588591e-05, + "loss": 0.0354, + "step": 82110 + }, + { + "epoch": 0.0106, + "grad_norm": 0.08570121973752975, + "learning_rate": 3.423423704022699e-05, + "loss": 0.0363, + "step": 82120 + }, + { + "epoch": 0.01065, + "grad_norm": 0.07615642249584198, + "learning_rate": 3.4230395562107506e-05, + "loss": 0.0365, + "step": 82130 + }, + { + "epoch": 0.0107, + "grad_norm": 0.09024906158447266, + "learning_rate": 3.422655383163247e-05, + "loss": 0.0367, + "step": 82140 + }, + { + "epoch": 0.01075, + "grad_norm": 0.0853065475821495, + "learning_rate": 3.4222711848906927e-05, + "loss": 0.0368, + "step": 82150 + }, + { + "epoch": 0.0108, + "grad_norm": 0.07461123168468475, + "learning_rate": 3.42188696140359e-05, + "loss": 0.0371, + "step": 82160 + }, + { + "epoch": 0.01085, + "grad_norm": 0.08725110441446304, + "learning_rate": 3.421502712712445e-05, + "loss": 0.035, + "step": 82170 + }, + { + "epoch": 0.0109, + "grad_norm": 0.09073107689619064, + "learning_rate": 3.4211184388277604e-05, + "loss": 0.0345, + "step": 82180 + }, + { + "epoch": 0.01095, + "grad_norm": 0.08718758821487427, + "learning_rate": 3.420734139760045e-05, + "loss": 0.039, + "step": 82190 + }, + { + "epoch": 0.011, + "grad_norm": 0.07554249465465546, + "learning_rate": 3.420349815519803e-05, + "loss": 0.0363, + "step": 82200 + }, + { + "epoch": 0.01105, + "grad_norm": 0.0913548395037651, + "learning_rate": 3.4199654661175445e-05, + "loss": 0.0366, + "step": 82210 + }, + { + "epoch": 0.0111, + "grad_norm": 0.07682806253433228, + "learning_rate": 3.419581091563775e-05, + "loss": 0.0352, + "step": 82220 + }, + { + "epoch": 0.01115, + "grad_norm": 0.07206662744283676, + "learning_rate": 3.419196691869003e-05, + "loss": 0.0353, + "step": 82230 + }, + { + "epoch": 0.0112, + "grad_norm": 0.07625097036361694, + "learning_rate": 3.41881226704374e-05, + "loss": 0.039, + "step": 82240 + }, + { + "epoch": 0.01125, + "grad_norm": 0.09000785648822784, + "learning_rate": 3.418427817098494e-05, + "loss": 0.036, + "step": 82250 + }, + { + "epoch": 0.0113, + "grad_norm": 0.07559432089328766, + "learning_rate": 3.4180433420437766e-05, + "loss": 0.0344, + "step": 82260 + }, + { + "epoch": 0.01135, + "grad_norm": 0.11739441752433777, + "learning_rate": 3.417658841890099e-05, + "loss": 0.0352, + "step": 82270 + }, + { + "epoch": 0.0114, + "grad_norm": 0.09553413838148117, + "learning_rate": 3.417274316647974e-05, + "loss": 0.0348, + "step": 82280 + }, + { + "epoch": 0.01145, + "grad_norm": 0.09647861123085022, + "learning_rate": 3.416889766327914e-05, + "loss": 0.0366, + "step": 82290 + }, + { + "epoch": 0.0115, + "grad_norm": 0.12469710409641266, + "learning_rate": 3.416505190940432e-05, + "loss": 0.0361, + "step": 82300 + }, + { + "epoch": 0.01155, + "grad_norm": 0.10431114584207535, + "learning_rate": 3.4161205904960414e-05, + "loss": 0.0345, + "step": 82310 + }, + { + "epoch": 0.0116, + "grad_norm": 0.08366325497627258, + "learning_rate": 3.415735965005259e-05, + "loss": 0.0342, + "step": 82320 + }, + { + "epoch": 0.01165, + "grad_norm": 0.08749702572822571, + "learning_rate": 3.415351314478599e-05, + "loss": 0.0365, + "step": 82330 + }, + { + "epoch": 0.0117, + "grad_norm": 0.07440687716007233, + "learning_rate": 3.414966638926579e-05, + "loss": 0.0361, + "step": 82340 + }, + { + "epoch": 0.01175, + "grad_norm": 0.09167435765266418, + "learning_rate": 3.414581938359713e-05, + "loss": 0.0357, + "step": 82350 + }, + { + "epoch": 0.0118, + "grad_norm": 0.08168090879917145, + "learning_rate": 3.414197212788522e-05, + "loss": 0.0366, + "step": 82360 + }, + { + "epoch": 0.01185, + "grad_norm": 0.12418495118618011, + "learning_rate": 3.413812462223522e-05, + "loss": 0.0372, + "step": 82370 + }, + { + "epoch": 0.0119, + "grad_norm": 0.10299114882946014, + "learning_rate": 3.4134276866752325e-05, + "loss": 0.0356, + "step": 82380 + }, + { + "epoch": 0.01195, + "grad_norm": 0.08420941233634949, + "learning_rate": 3.413042886154173e-05, + "loss": 0.0377, + "step": 82390 + }, + { + "epoch": 0.012, + "grad_norm": 0.0910545364022255, + "learning_rate": 3.4126580606708644e-05, + "loss": 0.0382, + "step": 82400 + }, + { + "epoch": 0.01205, + "grad_norm": 0.08392889052629471, + "learning_rate": 3.4122732102358265e-05, + "loss": 0.0356, + "step": 82410 + }, + { + "epoch": 0.0121, + "grad_norm": 0.0799657553434372, + "learning_rate": 3.411888334859583e-05, + "loss": 0.0368, + "step": 82420 + }, + { + "epoch": 0.01215, + "grad_norm": 0.09799450635910034, + "learning_rate": 3.411503434552654e-05, + "loss": 0.0376, + "step": 82430 + }, + { + "epoch": 0.0122, + "grad_norm": 0.09583955258131027, + "learning_rate": 3.411118509325564e-05, + "loss": 0.0371, + "step": 82440 + }, + { + "epoch": 0.01225, + "grad_norm": 0.08460697531700134, + "learning_rate": 3.410733559188836e-05, + "loss": 0.0375, + "step": 82450 + }, + { + "epoch": 0.0123, + "grad_norm": 0.0918983668088913, + "learning_rate": 3.410348584152996e-05, + "loss": 0.0366, + "step": 82460 + }, + { + "epoch": 0.01235, + "grad_norm": 0.10554148256778717, + "learning_rate": 3.4099635842285657e-05, + "loss": 0.0398, + "step": 82470 + }, + { + "epoch": 0.0124, + "grad_norm": 0.09713000804185867, + "learning_rate": 3.409578559426074e-05, + "loss": 0.0364, + "step": 82480 + }, + { + "epoch": 0.01245, + "grad_norm": 0.10049746930599213, + "learning_rate": 3.409193509756046e-05, + "loss": 0.0364, + "step": 82490 + }, + { + "epoch": 0.0125, + "grad_norm": 0.0884561762213707, + "learning_rate": 3.408808435229009e-05, + "loss": 0.0362, + "step": 82500 + }, + { + "epoch": 0.01255, + "grad_norm": 0.0997234433889389, + "learning_rate": 3.4084233358554906e-05, + "loss": 0.0363, + "step": 82510 + }, + { + "epoch": 0.0126, + "grad_norm": 0.0873696431517601, + "learning_rate": 3.408038211646019e-05, + "loss": 0.0362, + "step": 82520 + }, + { + "epoch": 0.01265, + "grad_norm": 0.0935036689043045, + "learning_rate": 3.4076530626111244e-05, + "loss": 0.0375, + "step": 82530 + }, + { + "epoch": 0.0127, + "grad_norm": 0.09640396386384964, + "learning_rate": 3.4072678887613364e-05, + "loss": 0.0364, + "step": 82540 + }, + { + "epoch": 0.01275, + "grad_norm": 0.09741134196519852, + "learning_rate": 3.406882690107185e-05, + "loss": 0.038, + "step": 82550 + }, + { + "epoch": 0.0128, + "grad_norm": 0.10588259249925613, + "learning_rate": 3.4064974666592014e-05, + "loss": 0.0367, + "step": 82560 + }, + { + "epoch": 0.01285, + "grad_norm": 0.09171366691589355, + "learning_rate": 3.406112218427918e-05, + "loss": 0.0364, + "step": 82570 + }, + { + "epoch": 0.0129, + "grad_norm": 0.09563100337982178, + "learning_rate": 3.405726945423866e-05, + "loss": 0.0376, + "step": 82580 + }, + { + "epoch": 0.01295, + "grad_norm": 0.08102994412183762, + "learning_rate": 3.405341647657581e-05, + "loss": 0.0358, + "step": 82590 + }, + { + "epoch": 0.013, + "grad_norm": 0.0844772458076477, + "learning_rate": 3.404956325139594e-05, + "loss": 0.0394, + "step": 82600 + }, + { + "epoch": 0.01305, + "grad_norm": 0.09395796060562134, + "learning_rate": 3.4045709778804426e-05, + "loss": 0.037, + "step": 82610 + }, + { + "epoch": 0.0131, + "grad_norm": 0.08102398365736008, + "learning_rate": 3.404185605890659e-05, + "loss": 0.0362, + "step": 82620 + }, + { + "epoch": 0.01315, + "grad_norm": 0.0828714370727539, + "learning_rate": 3.403800209180781e-05, + "loss": 0.036, + "step": 82630 + }, + { + "epoch": 0.0132, + "grad_norm": 0.08016139268875122, + "learning_rate": 3.403414787761345e-05, + "loss": 0.0366, + "step": 82640 + }, + { + "epoch": 0.01325, + "grad_norm": 0.0881897360086441, + "learning_rate": 3.403029341642888e-05, + "loss": 0.0375, + "step": 82650 + }, + { + "epoch": 0.0133, + "grad_norm": 0.09445564448833466, + "learning_rate": 3.402643870835948e-05, + "loss": 0.0371, + "step": 82660 + }, + { + "epoch": 0.01335, + "grad_norm": 0.08610563725233078, + "learning_rate": 3.4022583753510646e-05, + "loss": 0.0366, + "step": 82670 + }, + { + "epoch": 0.0134, + "grad_norm": 0.08738268911838531, + "learning_rate": 3.4018728551987746e-05, + "loss": 0.0376, + "step": 82680 + }, + { + "epoch": 0.01345, + "grad_norm": 0.0908346176147461, + "learning_rate": 3.4014873103896205e-05, + "loss": 0.0384, + "step": 82690 + }, + { + "epoch": 0.0135, + "grad_norm": 0.08275371044874191, + "learning_rate": 3.4011017409341414e-05, + "loss": 0.037, + "step": 82700 + }, + { + "epoch": 0.01355, + "grad_norm": 0.08076354116201401, + "learning_rate": 3.4007161468428805e-05, + "loss": 0.0364, + "step": 82710 + }, + { + "epoch": 0.0136, + "grad_norm": 0.09224753081798553, + "learning_rate": 3.4003305281263776e-05, + "loss": 0.0364, + "step": 82720 + }, + { + "epoch": 0.01365, + "grad_norm": 0.08687998354434967, + "learning_rate": 3.3999448847951764e-05, + "loss": 0.0363, + "step": 82730 + }, + { + "epoch": 0.0137, + "grad_norm": 0.08099762350320816, + "learning_rate": 3.39955921685982e-05, + "loss": 0.0364, + "step": 82740 + }, + { + "epoch": 0.01375, + "grad_norm": 0.09829870611429214, + "learning_rate": 3.399173524330853e-05, + "loss": 0.0378, + "step": 82750 + }, + { + "epoch": 0.0138, + "grad_norm": 0.10540544241666794, + "learning_rate": 3.398787807218819e-05, + "loss": 0.0377, + "step": 82760 + }, + { + "epoch": 0.01385, + "grad_norm": 0.0992199257016182, + "learning_rate": 3.398402065534265e-05, + "loss": 0.0369, + "step": 82770 + }, + { + "epoch": 0.0139, + "grad_norm": 0.1026124507188797, + "learning_rate": 3.398016299287736e-05, + "loss": 0.0371, + "step": 82780 + }, + { + "epoch": 0.01395, + "grad_norm": 0.09222318977117538, + "learning_rate": 3.3976305084897776e-05, + "loss": 0.0363, + "step": 82790 + }, + { + "epoch": 0.014, + "grad_norm": 0.08628864586353302, + "learning_rate": 3.397244693150939e-05, + "loss": 0.0375, + "step": 82800 + }, + { + "epoch": 0.01405, + "grad_norm": 0.09229279309511185, + "learning_rate": 3.396858853281767e-05, + "loss": 0.0367, + "step": 82810 + }, + { + "epoch": 0.0141, + "grad_norm": 0.10248692333698273, + "learning_rate": 3.3964729888928115e-05, + "loss": 0.0361, + "step": 82820 + }, + { + "epoch": 0.01415, + "grad_norm": 0.09319385886192322, + "learning_rate": 3.396087099994621e-05, + "loss": 0.0357, + "step": 82830 + }, + { + "epoch": 0.0142, + "grad_norm": 0.08986565470695496, + "learning_rate": 3.3957011865977466e-05, + "loss": 0.0369, + "step": 82840 + }, + { + "epoch": 0.01425, + "grad_norm": 0.06687970459461212, + "learning_rate": 3.3953152487127375e-05, + "loss": 0.0357, + "step": 82850 + }, + { + "epoch": 0.0143, + "grad_norm": 0.08596998453140259, + "learning_rate": 3.3949292863501465e-05, + "loss": 0.0373, + "step": 82860 + }, + { + "epoch": 0.01435, + "grad_norm": 0.0879545584321022, + "learning_rate": 3.394543299520524e-05, + "loss": 0.0393, + "step": 82870 + }, + { + "epoch": 0.0144, + "grad_norm": 0.08258802443742752, + "learning_rate": 3.3941572882344244e-05, + "loss": 0.0355, + "step": 82880 + }, + { + "epoch": 0.01445, + "grad_norm": 0.08050991594791412, + "learning_rate": 3.3937712525024e-05, + "loss": 0.0361, + "step": 82890 + }, + { + "epoch": 0.0145, + "grad_norm": 0.08885800838470459, + "learning_rate": 3.393385192335006e-05, + "loss": 0.0358, + "step": 82900 + }, + { + "epoch": 0.01455, + "grad_norm": 0.07728556543588638, + "learning_rate": 3.392999107742796e-05, + "loss": 0.0377, + "step": 82910 + }, + { + "epoch": 0.0146, + "grad_norm": 0.08573156595230103, + "learning_rate": 3.392612998736327e-05, + "loss": 0.0364, + "step": 82920 + }, + { + "epoch": 0.01465, + "grad_norm": 0.07444633543491364, + "learning_rate": 3.392226865326153e-05, + "loss": 0.0357, + "step": 82930 + }, + { + "epoch": 0.0147, + "grad_norm": 0.07533413171768188, + "learning_rate": 3.3918407075228306e-05, + "loss": 0.0348, + "step": 82940 + }, + { + "epoch": 0.01475, + "grad_norm": 0.09161031991243362, + "learning_rate": 3.3914545253369196e-05, + "loss": 0.0356, + "step": 82950 + }, + { + "epoch": 0.0148, + "grad_norm": 0.08216315507888794, + "learning_rate": 3.3910683187789766e-05, + "loss": 0.0347, + "step": 82960 + }, + { + "epoch": 0.01485, + "grad_norm": 0.08883273601531982, + "learning_rate": 3.3906820878595604e-05, + "loss": 0.0364, + "step": 82970 + }, + { + "epoch": 0.0149, + "grad_norm": 0.08381297439336777, + "learning_rate": 3.3902958325892303e-05, + "loss": 0.0353, + "step": 82980 + }, + { + "epoch": 0.01495, + "grad_norm": 0.08417104184627533, + "learning_rate": 3.389909552978547e-05, + "loss": 0.0358, + "step": 82990 + }, + { + "epoch": 0.015, + "grad_norm": 0.08534098416566849, + "learning_rate": 3.3895232490380714e-05, + "loss": 0.0348, + "step": 83000 + }, + { + "epoch": 0.01505, + "grad_norm": 0.08981525152921677, + "learning_rate": 3.389136920778363e-05, + "loss": 0.0361, + "step": 83010 + }, + { + "epoch": 0.0151, + "grad_norm": 0.08870881050825119, + "learning_rate": 3.388750568209986e-05, + "loss": 0.0346, + "step": 83020 + }, + { + "epoch": 0.01515, + "grad_norm": 0.07469086349010468, + "learning_rate": 3.3883641913435025e-05, + "loss": 0.0359, + "step": 83030 + }, + { + "epoch": 0.0152, + "grad_norm": 0.10804679989814758, + "learning_rate": 3.3879777901894754e-05, + "loss": 0.0363, + "step": 83040 + }, + { + "epoch": 0.01525, + "grad_norm": 0.08663041144609451, + "learning_rate": 3.3875913647584695e-05, + "loss": 0.038, + "step": 83050 + }, + { + "epoch": 0.0153, + "grad_norm": 0.07422086596488953, + "learning_rate": 3.3872049150610486e-05, + "loss": 0.0349, + "step": 83060 + }, + { + "epoch": 0.01535, + "grad_norm": 0.09427531063556671, + "learning_rate": 3.38681844110778e-05, + "loss": 0.0372, + "step": 83070 + }, + { + "epoch": 0.0154, + "grad_norm": 0.08496449887752533, + "learning_rate": 3.386431942909226e-05, + "loss": 0.0358, + "step": 83080 + }, + { + "epoch": 0.01545, + "grad_norm": 0.09647180885076523, + "learning_rate": 3.3860454204759576e-05, + "loss": 0.0374, + "step": 83090 + }, + { + "epoch": 0.0155, + "grad_norm": 0.09570591151714325, + "learning_rate": 3.385658873818539e-05, + "loss": 0.0342, + "step": 83100 + }, + { + "epoch": 0.01555, + "grad_norm": 0.10772760957479477, + "learning_rate": 3.385272302947541e-05, + "loss": 0.037, + "step": 83110 + }, + { + "epoch": 0.0156, + "grad_norm": 0.09298603236675262, + "learning_rate": 3.384885707873529e-05, + "loss": 0.0353, + "step": 83120 + }, + { + "epoch": 0.01565, + "grad_norm": 0.0879300981760025, + "learning_rate": 3.384499088607076e-05, + "loss": 0.0352, + "step": 83130 + }, + { + "epoch": 0.0157, + "grad_norm": 0.09325937926769257, + "learning_rate": 3.3841124451587494e-05, + "loss": 0.0358, + "step": 83140 + }, + { + "epoch": 0.01575, + "grad_norm": 0.09831637144088745, + "learning_rate": 3.383725777539121e-05, + "loss": 0.0364, + "step": 83150 + }, + { + "epoch": 0.0158, + "grad_norm": 0.09208468347787857, + "learning_rate": 3.383339085758761e-05, + "loss": 0.0362, + "step": 83160 + }, + { + "epoch": 0.01585, + "grad_norm": 0.08796915411949158, + "learning_rate": 3.382952369828243e-05, + "loss": 0.0352, + "step": 83170 + }, + { + "epoch": 0.0159, + "grad_norm": 0.07426141202449799, + "learning_rate": 3.382565629758139e-05, + "loss": 0.0351, + "step": 83180 + }, + { + "epoch": 0.01595, + "grad_norm": 0.07396486401557922, + "learning_rate": 3.3821788655590215e-05, + "loss": 0.0349, + "step": 83190 + }, + { + "epoch": 0.016, + "grad_norm": 0.07119892537593842, + "learning_rate": 3.381792077241466e-05, + "loss": 0.0352, + "step": 83200 + }, + { + "epoch": 0.01605, + "grad_norm": 0.08340752124786377, + "learning_rate": 3.381405264816046e-05, + "loss": 0.0377, + "step": 83210 + }, + { + "epoch": 0.0161, + "grad_norm": 0.08224661648273468, + "learning_rate": 3.381018428293337e-05, + "loss": 0.0359, + "step": 83220 + }, + { + "epoch": 0.01615, + "grad_norm": 0.09233871102333069, + "learning_rate": 3.380631567683915e-05, + "loss": 0.0366, + "step": 83230 + }, + { + "epoch": 0.0162, + "grad_norm": 0.0836290791630745, + "learning_rate": 3.380244682998358e-05, + "loss": 0.0352, + "step": 83240 + }, + { + "epoch": 0.01625, + "grad_norm": 0.09171456098556519, + "learning_rate": 3.379857774247241e-05, + "loss": 0.0352, + "step": 83250 + }, + { + "epoch": 0.0163, + "grad_norm": 0.08661381900310516, + "learning_rate": 3.379470841441144e-05, + "loss": 0.0355, + "step": 83260 + }, + { + "epoch": 0.01635, + "grad_norm": 0.09041456878185272, + "learning_rate": 3.3790838845906426e-05, + "loss": 0.0357, + "step": 83270 + }, + { + "epoch": 0.0164, + "grad_norm": 0.07483222335577011, + "learning_rate": 3.3786969037063196e-05, + "loss": 0.0353, + "step": 83280 + }, + { + "epoch": 0.01645, + "grad_norm": 0.10447685420513153, + "learning_rate": 3.378309898798753e-05, + "loss": 0.0364, + "step": 83290 + }, + { + "epoch": 0.0165, + "grad_norm": 0.089652419090271, + "learning_rate": 3.377922869878524e-05, + "loss": 0.0344, + "step": 83300 + }, + { + "epoch": 0.01655, + "grad_norm": 0.11374127864837646, + "learning_rate": 3.377535816956213e-05, + "loss": 0.0368, + "step": 83310 + }, + { + "epoch": 0.0166, + "grad_norm": 0.0918031632900238, + "learning_rate": 3.3771487400424036e-05, + "loss": 0.0352, + "step": 83320 + }, + { + "epoch": 0.01665, + "grad_norm": 0.0891224667429924, + "learning_rate": 3.376761639147675e-05, + "loss": 0.0358, + "step": 83330 + }, + { + "epoch": 0.0167, + "grad_norm": 0.08948148041963577, + "learning_rate": 3.3763745142826146e-05, + "loss": 0.0376, + "step": 83340 + }, + { + "epoch": 0.01675, + "grad_norm": 0.09376305341720581, + "learning_rate": 3.375987365457804e-05, + "loss": 0.0351, + "step": 83350 + }, + { + "epoch": 0.0168, + "grad_norm": 0.08524085581302643, + "learning_rate": 3.3756001926838273e-05, + "loss": 0.0365, + "step": 83360 + }, + { + "epoch": 0.01685, + "grad_norm": 0.08106539398431778, + "learning_rate": 3.37521299597127e-05, + "loss": 0.0352, + "step": 83370 + }, + { + "epoch": 0.0169, + "grad_norm": 0.06964035332202911, + "learning_rate": 3.374825775330719e-05, + "loss": 0.0359, + "step": 83380 + }, + { + "epoch": 0.01695, + "grad_norm": 0.08054011315107346, + "learning_rate": 3.37443853077276e-05, + "loss": 0.0367, + "step": 83390 + }, + { + "epoch": 0.017, + "grad_norm": 0.07989324629306793, + "learning_rate": 3.3740512623079794e-05, + "loss": 0.0351, + "step": 83400 + }, + { + "epoch": 0.01705, + "grad_norm": 0.09449871629476547, + "learning_rate": 3.3736639699469655e-05, + "loss": 0.0367, + "step": 83410 + }, + { + "epoch": 0.0171, + "grad_norm": 0.07510863244533539, + "learning_rate": 3.373276653700308e-05, + "loss": 0.0355, + "step": 83420 + }, + { + "epoch": 0.01715, + "grad_norm": 0.08826517313718796, + "learning_rate": 3.3728893135785937e-05, + "loss": 0.0362, + "step": 83430 + }, + { + "epoch": 0.0172, + "grad_norm": 0.09773914515972137, + "learning_rate": 3.3725019495924135e-05, + "loss": 0.0363, + "step": 83440 + }, + { + "epoch": 0.01725, + "grad_norm": 0.08512328565120697, + "learning_rate": 3.372114561752359e-05, + "loss": 0.0366, + "step": 83450 + }, + { + "epoch": 0.0173, + "grad_norm": 0.10157383233308792, + "learning_rate": 3.37172715006902e-05, + "loss": 0.0369, + "step": 83460 + }, + { + "epoch": 0.01735, + "grad_norm": 0.09032903611660004, + "learning_rate": 3.371339714552987e-05, + "loss": 0.0353, + "step": 83470 + }, + { + "epoch": 0.0174, + "grad_norm": 0.07454940676689148, + "learning_rate": 3.370952255214853e-05, + "loss": 0.0363, + "step": 83480 + }, + { + "epoch": 0.01745, + "grad_norm": 0.07773549109697342, + "learning_rate": 3.3705647720652135e-05, + "loss": 0.0359, + "step": 83490 + }, + { + "epoch": 0.0175, + "grad_norm": 0.07942163944244385, + "learning_rate": 3.370177265114659e-05, + "loss": 0.0352, + "step": 83500 + }, + { + "epoch": 0.01755, + "grad_norm": 0.10010170936584473, + "learning_rate": 3.3697897343737855e-05, + "loss": 0.0358, + "step": 83510 + }, + { + "epoch": 0.0176, + "grad_norm": 0.09081415086984634, + "learning_rate": 3.3694021798531865e-05, + "loss": 0.0352, + "step": 83520 + }, + { + "epoch": 0.01765, + "grad_norm": 0.08686517924070358, + "learning_rate": 3.369014601563459e-05, + "loss": 0.0359, + "step": 83530 + }, + { + "epoch": 0.0177, + "grad_norm": 0.07877647876739502, + "learning_rate": 3.3686269995152e-05, + "loss": 0.0354, + "step": 83540 + }, + { + "epoch": 0.01775, + "grad_norm": 0.09665057063102722, + "learning_rate": 3.3682393737190035e-05, + "loss": 0.0372, + "step": 83550 + }, + { + "epoch": 0.0178, + "grad_norm": 0.0790608748793602, + "learning_rate": 3.367851724185469e-05, + "loss": 0.034, + "step": 83560 + }, + { + "epoch": 0.01785, + "grad_norm": 0.0802990272641182, + "learning_rate": 3.3674640509251956e-05, + "loss": 0.0364, + "step": 83570 + }, + { + "epoch": 0.0179, + "grad_norm": 0.09536537528038025, + "learning_rate": 3.36707635394878e-05, + "loss": 0.0362, + "step": 83580 + }, + { + "epoch": 0.01795, + "grad_norm": 0.089724600315094, + "learning_rate": 3.366688633266822e-05, + "loss": 0.0365, + "step": 83590 + }, + { + "epoch": 0.018, + "grad_norm": 0.10376014560461044, + "learning_rate": 3.366300888889923e-05, + "loss": 0.036, + "step": 83600 + }, + { + "epoch": 0.01805, + "grad_norm": 0.08771280199289322, + "learning_rate": 3.365913120828684e-05, + "loss": 0.0355, + "step": 83610 + }, + { + "epoch": 0.0181, + "grad_norm": 0.09692392498254776, + "learning_rate": 3.365525329093705e-05, + "loss": 0.0365, + "step": 83620 + }, + { + "epoch": 0.01815, + "grad_norm": 0.08540845662355423, + "learning_rate": 3.365137513695589e-05, + "loss": 0.0373, + "step": 83630 + }, + { + "epoch": 0.0182, + "grad_norm": 0.08605222404003143, + "learning_rate": 3.364749674644937e-05, + "loss": 0.037, + "step": 83640 + }, + { + "epoch": 0.01825, + "grad_norm": 0.10747594386339188, + "learning_rate": 3.3643618119523545e-05, + "loss": 0.0387, + "step": 83650 + }, + { + "epoch": 0.0183, + "grad_norm": 0.09305700659751892, + "learning_rate": 3.363973925628445e-05, + "loss": 0.0371, + "step": 83660 + }, + { + "epoch": 0.01835, + "grad_norm": 0.0908249244093895, + "learning_rate": 3.3635860156838137e-05, + "loss": 0.0356, + "step": 83670 + }, + { + "epoch": 0.0184, + "grad_norm": 0.07326623052358627, + "learning_rate": 3.363198082129064e-05, + "loss": 0.038, + "step": 83680 + }, + { + "epoch": 0.01845, + "grad_norm": 0.0957440733909607, + "learning_rate": 3.362810124974803e-05, + "loss": 0.0377, + "step": 83690 + }, + { + "epoch": 0.0185, + "grad_norm": 0.09219150990247726, + "learning_rate": 3.3624221442316376e-05, + "loss": 0.039, + "step": 83700 + }, + { + "epoch": 0.01855, + "grad_norm": 0.08684364706277847, + "learning_rate": 3.362034139910175e-05, + "loss": 0.0367, + "step": 83710 + }, + { + "epoch": 0.0186, + "grad_norm": 0.07927060127258301, + "learning_rate": 3.3616461120210224e-05, + "loss": 0.0369, + "step": 83720 + }, + { + "epoch": 0.01865, + "grad_norm": 0.08206473290920258, + "learning_rate": 3.361258060574789e-05, + "loss": 0.037, + "step": 83730 + }, + { + "epoch": 0.0187, + "grad_norm": 0.09019777923822403, + "learning_rate": 3.3608699855820846e-05, + "loss": 0.0394, + "step": 83740 + }, + { + "epoch": 0.01875, + "grad_norm": 0.08604075014591217, + "learning_rate": 3.3604818870535174e-05, + "loss": 0.0358, + "step": 83750 + }, + { + "epoch": 0.0188, + "grad_norm": 0.07271555811166763, + "learning_rate": 3.360093764999699e-05, + "loss": 0.0357, + "step": 83760 + }, + { + "epoch": 0.01885, + "grad_norm": 0.07016167789697647, + "learning_rate": 3.35970561943124e-05, + "loss": 0.0349, + "step": 83770 + }, + { + "epoch": 0.0189, + "grad_norm": 0.08462268859148026, + "learning_rate": 3.359317450358752e-05, + "loss": 0.035, + "step": 83780 + }, + { + "epoch": 0.01895, + "grad_norm": 0.08154400438070297, + "learning_rate": 3.358929257792848e-05, + "loss": 0.037, + "step": 83790 + }, + { + "epoch": 0.019, + "grad_norm": 0.0706799328327179, + "learning_rate": 3.358541041744141e-05, + "loss": 0.036, + "step": 83800 + }, + { + "epoch": 0.01905, + "grad_norm": 0.09226647764444351, + "learning_rate": 3.358152802223244e-05, + "loss": 0.037, + "step": 83810 + }, + { + "epoch": 0.0191, + "grad_norm": 0.09303418546915054, + "learning_rate": 3.357764539240772e-05, + "loss": 0.0363, + "step": 83820 + }, + { + "epoch": 0.01915, + "grad_norm": 0.0904071107506752, + "learning_rate": 3.3573762528073404e-05, + "loss": 0.0348, + "step": 83830 + }, + { + "epoch": 0.0192, + "grad_norm": 0.0851183757185936, + "learning_rate": 3.356987942933563e-05, + "loss": 0.0357, + "step": 83840 + }, + { + "epoch": 0.01925, + "grad_norm": 0.06624419242143631, + "learning_rate": 3.356599609630058e-05, + "loss": 0.0339, + "step": 83850 + }, + { + "epoch": 0.0193, + "grad_norm": 0.08188678324222565, + "learning_rate": 3.356211252907441e-05, + "loss": 0.0366, + "step": 83860 + }, + { + "epoch": 0.01935, + "grad_norm": 0.0802663192152977, + "learning_rate": 3.3558228727763305e-05, + "loss": 0.035, + "step": 83870 + }, + { + "epoch": 0.0194, + "grad_norm": 0.08806353062391281, + "learning_rate": 3.355434469247344e-05, + "loss": 0.0356, + "step": 83880 + }, + { + "epoch": 0.01945, + "grad_norm": 0.10073021054267883, + "learning_rate": 3.3550460423311004e-05, + "loss": 0.038, + "step": 83890 + }, + { + "epoch": 0.0195, + "grad_norm": 0.08883056789636612, + "learning_rate": 3.354657592038219e-05, + "loss": 0.0352, + "step": 83900 + }, + { + "epoch": 0.01955, + "grad_norm": 0.07281231135129929, + "learning_rate": 3.354269118379321e-05, + "loss": 0.0355, + "step": 83910 + }, + { + "epoch": 0.0196, + "grad_norm": 0.07044170051813126, + "learning_rate": 3.353880621365025e-05, + "loss": 0.0348, + "step": 83920 + }, + { + "epoch": 0.01965, + "grad_norm": 0.06740438938140869, + "learning_rate": 3.353492101005955e-05, + "loss": 0.0353, + "step": 83930 + }, + { + "epoch": 0.0197, + "grad_norm": 0.08475062996149063, + "learning_rate": 3.3531035573127304e-05, + "loss": 0.035, + "step": 83940 + }, + { + "epoch": 0.01975, + "grad_norm": 0.06419086456298828, + "learning_rate": 3.3527149902959755e-05, + "loss": 0.0367, + "step": 83950 + }, + { + "epoch": 0.0198, + "grad_norm": 0.07238471508026123, + "learning_rate": 3.3523263999663124e-05, + "loss": 0.0352, + "step": 83960 + }, + { + "epoch": 0.01985, + "grad_norm": 0.07790805399417877, + "learning_rate": 3.3519377863343664e-05, + "loss": 0.0366, + "step": 83970 + }, + { + "epoch": 0.0199, + "grad_norm": 0.08158902823925018, + "learning_rate": 3.351549149410761e-05, + "loss": 0.0361, + "step": 83980 + }, + { + "epoch": 0.01995, + "grad_norm": 0.07997786998748779, + "learning_rate": 3.351160489206123e-05, + "loss": 0.0353, + "step": 83990 + }, + { + "epoch": 0.02, + "grad_norm": 0.07322206348180771, + "learning_rate": 3.350771805731076e-05, + "loss": 0.0353, + "step": 84000 + }, + { + "epoch": 0.02005, + "grad_norm": 0.08143064379692078, + "learning_rate": 3.350383098996248e-05, + "loss": 0.0363, + "step": 84010 + }, + { + "epoch": 0.0201, + "grad_norm": 0.08571472018957138, + "learning_rate": 3.349994369012265e-05, + "loss": 0.0367, + "step": 84020 + }, + { + "epoch": 0.02015, + "grad_norm": 0.08176877349615097, + "learning_rate": 3.3496056157897545e-05, + "loss": 0.0364, + "step": 84030 + }, + { + "epoch": 0.0202, + "grad_norm": 0.0745140090584755, + "learning_rate": 3.3492168393393465e-05, + "loss": 0.0363, + "step": 84040 + }, + { + "epoch": 0.02025, + "grad_norm": 0.08554243296384811, + "learning_rate": 3.34882803967167e-05, + "loss": 0.035, + "step": 84050 + }, + { + "epoch": 0.0203, + "grad_norm": 0.08013322949409485, + "learning_rate": 3.348439216797353e-05, + "loss": 0.0368, + "step": 84060 + }, + { + "epoch": 0.02035, + "grad_norm": 0.07512438297271729, + "learning_rate": 3.348050370727027e-05, + "loss": 0.0349, + "step": 84070 + }, + { + "epoch": 0.0204, + "grad_norm": 0.08043927699327469, + "learning_rate": 3.347661501471321e-05, + "loss": 0.0353, + "step": 84080 + }, + { + "epoch": 0.02045, + "grad_norm": 0.09401516616344452, + "learning_rate": 3.34727260904087e-05, + "loss": 0.036, + "step": 84090 + }, + { + "epoch": 0.0205, + "grad_norm": 0.07855821400880814, + "learning_rate": 3.346883693446302e-05, + "loss": 0.0362, + "step": 84100 + }, + { + "epoch": 0.02055, + "grad_norm": 0.07647678256034851, + "learning_rate": 3.346494754698254e-05, + "loss": 0.0345, + "step": 84110 + }, + { + "epoch": 0.0206, + "grad_norm": 0.07470206916332245, + "learning_rate": 3.3461057928073556e-05, + "loss": 0.035, + "step": 84120 + }, + { + "epoch": 0.02065, + "grad_norm": 0.08800902217626572, + "learning_rate": 3.3457168077842444e-05, + "loss": 0.0372, + "step": 84130 + }, + { + "epoch": 0.0207, + "grad_norm": 0.09285101294517517, + "learning_rate": 3.345327799639553e-05, + "loss": 0.0362, + "step": 84140 + }, + { + "epoch": 0.02075, + "grad_norm": 0.09030649811029434, + "learning_rate": 3.3449387683839165e-05, + "loss": 0.0358, + "step": 84150 + }, + { + "epoch": 0.0208, + "grad_norm": 0.08570466935634613, + "learning_rate": 3.344549714027971e-05, + "loss": 0.0356, + "step": 84160 + }, + { + "epoch": 0.02085, + "grad_norm": 0.10109095275402069, + "learning_rate": 3.3441606365823553e-05, + "loss": 0.0365, + "step": 84170 + }, + { + "epoch": 0.0209, + "grad_norm": 0.07555997371673584, + "learning_rate": 3.343771536057704e-05, + "loss": 0.0357, + "step": 84180 + }, + { + "epoch": 0.02095, + "grad_norm": 0.09468481689691544, + "learning_rate": 3.3433824124646554e-05, + "loss": 0.0369, + "step": 84190 + }, + { + "epoch": 0.021, + "grad_norm": 0.09233641624450684, + "learning_rate": 3.34299326581385e-05, + "loss": 0.0366, + "step": 84200 + }, + { + "epoch": 0.02105, + "grad_norm": 0.08957118541002274, + "learning_rate": 3.342604096115923e-05, + "loss": 0.0367, + "step": 84210 + }, + { + "epoch": 0.0211, + "grad_norm": 0.07740267366170883, + "learning_rate": 3.342214903381519e-05, + "loss": 0.0361, + "step": 84220 + }, + { + "epoch": 0.02115, + "grad_norm": 0.0924314484000206, + "learning_rate": 3.341825687621274e-05, + "loss": 0.0367, + "step": 84230 + }, + { + "epoch": 0.0212, + "grad_norm": 0.08203538507223129, + "learning_rate": 3.3414364488458325e-05, + "loss": 0.0367, + "step": 84240 + }, + { + "epoch": 0.02125, + "grad_norm": 0.09372982382774353, + "learning_rate": 3.341047187065834e-05, + "loss": 0.0386, + "step": 84250 + }, + { + "epoch": 0.0213, + "grad_norm": 0.09793376922607422, + "learning_rate": 3.3406579022919216e-05, + "loss": 0.0371, + "step": 84260 + }, + { + "epoch": 0.02135, + "grad_norm": 0.08053075522184372, + "learning_rate": 3.3402685945347374e-05, + "loss": 0.0371, + "step": 84270 + }, + { + "epoch": 0.0214, + "grad_norm": 0.09939832240343094, + "learning_rate": 3.339879263804926e-05, + "loss": 0.0363, + "step": 84280 + }, + { + "epoch": 0.02145, + "grad_norm": 0.08278929442167282, + "learning_rate": 3.339489910113131e-05, + "loss": 0.0396, + "step": 84290 + }, + { + "epoch": 0.0215, + "grad_norm": 0.1037469282746315, + "learning_rate": 3.3391005334699966e-05, + "loss": 0.0382, + "step": 84300 + }, + { + "epoch": 0.02155, + "grad_norm": 0.07742772996425629, + "learning_rate": 3.338711133886169e-05, + "loss": 0.0391, + "step": 84310 + }, + { + "epoch": 0.0216, + "grad_norm": 0.08141908049583435, + "learning_rate": 3.338321711372295e-05, + "loss": 0.0364, + "step": 84320 + }, + { + "epoch": 0.02165, + "grad_norm": 0.08545833081007004, + "learning_rate": 3.33793226593902e-05, + "loss": 0.0376, + "step": 84330 + }, + { + "epoch": 0.0217, + "grad_norm": 0.08423074334859848, + "learning_rate": 3.337542797596992e-05, + "loss": 0.0374, + "step": 84340 + }, + { + "epoch": 0.02175, + "grad_norm": 0.09167063236236572, + "learning_rate": 3.337153306356857e-05, + "loss": 0.0374, + "step": 84350 + }, + { + "epoch": 0.0218, + "grad_norm": 0.072408527135849, + "learning_rate": 3.336763792229267e-05, + "loss": 0.0362, + "step": 84360 + }, + { + "epoch": 0.02185, + "grad_norm": 0.08044564723968506, + "learning_rate": 3.336374255224868e-05, + "loss": 0.0359, + "step": 84370 + }, + { + "epoch": 0.0219, + "grad_norm": 0.07284487038850784, + "learning_rate": 3.3359846953543117e-05, + "loss": 0.0364, + "step": 84380 + }, + { + "epoch": 0.02195, + "grad_norm": 0.06727509200572968, + "learning_rate": 3.335595112628248e-05, + "loss": 0.0354, + "step": 84390 + }, + { + "epoch": 0.022, + "grad_norm": 0.07404442131519318, + "learning_rate": 3.3352055070573266e-05, + "loss": 0.0373, + "step": 84400 + }, + { + "epoch": 0.02205, + "grad_norm": 0.07334660738706589, + "learning_rate": 3.334815878652202e-05, + "loss": 0.0374, + "step": 84410 + }, + { + "epoch": 0.0221, + "grad_norm": 0.08427029848098755, + "learning_rate": 3.334426227423524e-05, + "loss": 0.0371, + "step": 84420 + }, + { + "epoch": 0.02215, + "grad_norm": 0.08811043202877045, + "learning_rate": 3.334036553381946e-05, + "loss": 0.038, + "step": 84430 + }, + { + "epoch": 0.0222, + "grad_norm": 0.07698401808738708, + "learning_rate": 3.333646856538123e-05, + "loss": 0.0358, + "step": 84440 + }, + { + "epoch": 0.02225, + "grad_norm": 0.0758872851729393, + "learning_rate": 3.333257136902708e-05, + "loss": 0.0384, + "step": 84450 + }, + { + "epoch": 0.0223, + "grad_norm": 0.13882263004779816, + "learning_rate": 3.3328673944863556e-05, + "loss": 0.0379, + "step": 84460 + }, + { + "epoch": 0.02235, + "grad_norm": 0.14099398255348206, + "learning_rate": 3.332477629299722e-05, + "loss": 0.0353, + "step": 84470 + }, + { + "epoch": 0.0224, + "grad_norm": 0.09753715991973877, + "learning_rate": 3.332087841353462e-05, + "loss": 0.0357, + "step": 84480 + }, + { + "epoch": 0.02245, + "grad_norm": 0.10037767887115479, + "learning_rate": 3.3316980306582333e-05, + "loss": 0.0374, + "step": 84490 + }, + { + "epoch": 0.0225, + "grad_norm": 0.08424913883209229, + "learning_rate": 3.331308197224693e-05, + "loss": 0.0362, + "step": 84500 + }, + { + "epoch": 0.02255, + "grad_norm": 0.08143261820077896, + "learning_rate": 3.330918341063499e-05, + "loss": 0.0348, + "step": 84510 + }, + { + "epoch": 0.0226, + "grad_norm": 0.1272321194410324, + "learning_rate": 3.330528462185309e-05, + "loss": 0.0358, + "step": 84520 + }, + { + "epoch": 0.02265, + "grad_norm": 0.1025131568312645, + "learning_rate": 3.3301385606007837e-05, + "loss": 0.0363, + "step": 84530 + }, + { + "epoch": 0.0227, + "grad_norm": 0.10091119259595871, + "learning_rate": 3.3297486363205816e-05, + "loss": 0.0352, + "step": 84540 + }, + { + "epoch": 0.02275, + "grad_norm": 0.09058336168527603, + "learning_rate": 3.329358689355364e-05, + "loss": 0.0373, + "step": 84550 + }, + { + "epoch": 0.0228, + "grad_norm": 0.08595266938209534, + "learning_rate": 3.328968719715791e-05, + "loss": 0.0347, + "step": 84560 + }, + { + "epoch": 0.02285, + "grad_norm": 0.09240694344043732, + "learning_rate": 3.328578727412525e-05, + "loss": 0.036, + "step": 84570 + }, + { + "epoch": 0.0229, + "grad_norm": 0.08921545743942261, + "learning_rate": 3.3281887124562275e-05, + "loss": 0.035, + "step": 84580 + }, + { + "epoch": 0.02295, + "grad_norm": 0.08916866779327393, + "learning_rate": 3.3277986748575624e-05, + "loss": 0.0364, + "step": 84590 + }, + { + "epoch": 0.023, + "grad_norm": 0.08710570633411407, + "learning_rate": 3.327408614627191e-05, + "loss": 0.0396, + "step": 84600 + }, + { + "epoch": 0.02305, + "grad_norm": 0.08730561286211014, + "learning_rate": 3.32701853177578e-05, + "loss": 0.0362, + "step": 84610 + }, + { + "epoch": 0.0231, + "grad_norm": 0.08374005556106567, + "learning_rate": 3.326628426313993e-05, + "loss": 0.0373, + "step": 84620 + }, + { + "epoch": 0.02315, + "grad_norm": 0.10687735676765442, + "learning_rate": 3.3262382982524953e-05, + "loss": 0.0373, + "step": 84630 + }, + { + "epoch": 0.0232, + "grad_norm": 0.07977034151554108, + "learning_rate": 3.3258481476019535e-05, + "loss": 0.0352, + "step": 84640 + }, + { + "epoch": 0.02325, + "grad_norm": 0.08451544493436813, + "learning_rate": 3.325457974373032e-05, + "loss": 0.0374, + "step": 84650 + }, + { + "epoch": 0.0233, + "grad_norm": 0.08229358494281769, + "learning_rate": 3.325067778576401e-05, + "loss": 0.0355, + "step": 84660 + }, + { + "epoch": 0.02335, + "grad_norm": 0.08253395557403564, + "learning_rate": 3.3246775602227266e-05, + "loss": 0.035, + "step": 84670 + }, + { + "epoch": 0.0234, + "grad_norm": 0.10140329599380493, + "learning_rate": 3.3242873193226775e-05, + "loss": 0.0359, + "step": 84680 + }, + { + "epoch": 0.02345, + "grad_norm": 0.0771637111902237, + "learning_rate": 3.323897055886922e-05, + "loss": 0.0355, + "step": 84690 + }, + { + "epoch": 0.0235, + "grad_norm": 0.08327921479940414, + "learning_rate": 3.323506769926132e-05, + "loss": 0.0374, + "step": 84700 + }, + { + "epoch": 0.02355, + "grad_norm": 0.08944735676050186, + "learning_rate": 3.3231164614509755e-05, + "loss": 0.0369, + "step": 84710 + }, + { + "epoch": 0.0236, + "grad_norm": 0.09130865335464478, + "learning_rate": 3.322726130472124e-05, + "loss": 0.0379, + "step": 84720 + }, + { + "epoch": 0.02365, + "grad_norm": 0.08416770398616791, + "learning_rate": 3.322335777000249e-05, + "loss": 0.0362, + "step": 84730 + }, + { + "epoch": 0.0237, + "grad_norm": 0.07621768862009048, + "learning_rate": 3.321945401046023e-05, + "loss": 0.036, + "step": 84740 + }, + { + "epoch": 0.02375, + "grad_norm": 0.09680519998073578, + "learning_rate": 3.3215550026201186e-05, + "loss": 0.0377, + "step": 84750 + }, + { + "epoch": 0.0238, + "grad_norm": 0.10799897462129593, + "learning_rate": 3.321164581733209e-05, + "loss": 0.0358, + "step": 84760 + }, + { + "epoch": 0.02385, + "grad_norm": 0.0748513713479042, + "learning_rate": 3.320774138395969e-05, + "loss": 0.0358, + "step": 84770 + }, + { + "epoch": 0.0239, + "grad_norm": 0.08447539806365967, + "learning_rate": 3.3203836726190715e-05, + "loss": 0.0363, + "step": 84780 + }, + { + "epoch": 0.02395, + "grad_norm": 0.09034404903650284, + "learning_rate": 3.319993184413193e-05, + "loss": 0.0355, + "step": 84790 + }, + { + "epoch": 0.024, + "grad_norm": 0.09197676926851273, + "learning_rate": 3.3196026737890085e-05, + "loss": 0.0363, + "step": 84800 + }, + { + "epoch": 0.02405, + "grad_norm": 0.09282320737838745, + "learning_rate": 3.3192121407571954e-05, + "loss": 0.0357, + "step": 84810 + }, + { + "epoch": 0.0241, + "grad_norm": 0.10092207789421082, + "learning_rate": 3.31882158532843e-05, + "loss": 0.0355, + "step": 84820 + }, + { + "epoch": 0.02415, + "grad_norm": 0.06371040642261505, + "learning_rate": 3.31843100751339e-05, + "loss": 0.0357, + "step": 84830 + }, + { + "epoch": 0.0242, + "grad_norm": 0.0727243646979332, + "learning_rate": 3.318040407322753e-05, + "loss": 0.0354, + "step": 84840 + }, + { + "epoch": 0.02425, + "grad_norm": 0.08339813351631165, + "learning_rate": 3.317649784767199e-05, + "loss": 0.0377, + "step": 84850 + }, + { + "epoch": 0.0243, + "grad_norm": 0.07810098677873611, + "learning_rate": 3.3172591398574074e-05, + "loss": 0.0372, + "step": 84860 + }, + { + "epoch": 0.02435, + "grad_norm": 0.08262559771537781, + "learning_rate": 3.3168684726040575e-05, + "loss": 0.0353, + "step": 84870 + }, + { + "epoch": 0.0244, + "grad_norm": 0.09877780079841614, + "learning_rate": 3.3164777830178315e-05, + "loss": 0.0349, + "step": 84880 + }, + { + "epoch": 0.02445, + "grad_norm": 0.1113976314663887, + "learning_rate": 3.316087071109408e-05, + "loss": 0.0366, + "step": 84890 + }, + { + "epoch": 0.0245, + "grad_norm": 0.10309489816427231, + "learning_rate": 3.3156963368894714e-05, + "loss": 0.0366, + "step": 84900 + }, + { + "epoch": 0.02455, + "grad_norm": 0.09552957117557526, + "learning_rate": 3.315305580368704e-05, + "loss": 0.037, + "step": 84910 + }, + { + "epoch": 0.0246, + "grad_norm": 0.10593295842409134, + "learning_rate": 3.314914801557788e-05, + "loss": 0.0369, + "step": 84920 + }, + { + "epoch": 0.02465, + "grad_norm": 0.09463546425104141, + "learning_rate": 3.314524000467407e-05, + "loss": 0.0373, + "step": 84930 + }, + { + "epoch": 0.0247, + "grad_norm": 0.07869092375040054, + "learning_rate": 3.3141331771082456e-05, + "loss": 0.0357, + "step": 84940 + }, + { + "epoch": 0.02475, + "grad_norm": 0.07995082437992096, + "learning_rate": 3.31374233149099e-05, + "loss": 0.0359, + "step": 84950 + }, + { + "epoch": 0.0248, + "grad_norm": 0.07646608352661133, + "learning_rate": 3.313351463626324e-05, + "loss": 0.0379, + "step": 84960 + }, + { + "epoch": 0.02485, + "grad_norm": 0.09957541525363922, + "learning_rate": 3.3129605735249354e-05, + "loss": 0.0381, + "step": 84970 + }, + { + "epoch": 0.0249, + "grad_norm": 0.08227454870939255, + "learning_rate": 3.312569661197509e-05, + "loss": 0.0373, + "step": 84980 + }, + { + "epoch": 0.02495, + "grad_norm": 0.07015284895896912, + "learning_rate": 3.312178726654734e-05, + "loss": 0.0367, + "step": 84990 + }, + { + "epoch": 0.025, + "grad_norm": 0.09884293377399445, + "learning_rate": 3.3117877699072975e-05, + "loss": 0.0368, + "step": 85000 + }, + { + "epoch": 0.02505, + "grad_norm": 0.10437457263469696, + "learning_rate": 3.311396790965888e-05, + "loss": 0.0359, + "step": 85010 + }, + { + "epoch": 0.0251, + "grad_norm": 0.10808396339416504, + "learning_rate": 3.311005789841196e-05, + "loss": 0.0375, + "step": 85020 + }, + { + "epoch": 0.02515, + "grad_norm": 0.06980592757463455, + "learning_rate": 3.3106147665439105e-05, + "loss": 0.0344, + "step": 85030 + }, + { + "epoch": 0.0252, + "grad_norm": 0.07518026977777481, + "learning_rate": 3.3102237210847206e-05, + "loss": 0.0368, + "step": 85040 + }, + { + "epoch": 0.02525, + "grad_norm": 0.08038965612649918, + "learning_rate": 3.309832653474319e-05, + "loss": 0.0359, + "step": 85050 + }, + { + "epoch": 0.0253, + "grad_norm": 0.06596371531486511, + "learning_rate": 3.3094415637233966e-05, + "loss": 0.0358, + "step": 85060 + }, + { + "epoch": 0.02535, + "grad_norm": 0.06973963230848312, + "learning_rate": 3.309050451842647e-05, + "loss": 0.0352, + "step": 85070 + }, + { + "epoch": 0.0254, + "grad_norm": 0.08272168040275574, + "learning_rate": 3.308659317842761e-05, + "loss": 0.0379, + "step": 85080 + }, + { + "epoch": 0.02545, + "grad_norm": 0.0812903344631195, + "learning_rate": 3.308268161734434e-05, + "loss": 0.0357, + "step": 85090 + }, + { + "epoch": 0.0255, + "grad_norm": 0.08610591292381287, + "learning_rate": 3.3078769835283585e-05, + "loss": 0.0352, + "step": 85100 + }, + { + "epoch": 0.02555, + "grad_norm": 0.09931080788373947, + "learning_rate": 3.3074857832352294e-05, + "loss": 0.0363, + "step": 85110 + }, + { + "epoch": 0.0256, + "grad_norm": 0.10501144081354141, + "learning_rate": 3.307094560865743e-05, + "loss": 0.0352, + "step": 85120 + }, + { + "epoch": 0.02565, + "grad_norm": 0.10684654116630554, + "learning_rate": 3.3067033164305944e-05, + "loss": 0.0384, + "step": 85130 + }, + { + "epoch": 0.0257, + "grad_norm": 0.08528265357017517, + "learning_rate": 3.30631204994048e-05, + "loss": 0.0367, + "step": 85140 + }, + { + "epoch": 0.02575, + "grad_norm": 0.07739049941301346, + "learning_rate": 3.305920761406097e-05, + "loss": 0.0358, + "step": 85150 + }, + { + "epoch": 0.0258, + "grad_norm": 0.0894133523106575, + "learning_rate": 3.3055294508381435e-05, + "loss": 0.0367, + "step": 85160 + }, + { + "epoch": 0.02585, + "grad_norm": 0.07886653393507004, + "learning_rate": 3.3051381182473165e-05, + "loss": 0.0374, + "step": 85170 + }, + { + "epoch": 0.0259, + "grad_norm": 0.08629165589809418, + "learning_rate": 3.304746763644317e-05, + "loss": 0.0361, + "step": 85180 + }, + { + "epoch": 0.02595, + "grad_norm": 0.06818588823080063, + "learning_rate": 3.304355387039843e-05, + "loss": 0.0346, + "step": 85190 + }, + { + "epoch": 0.026, + "grad_norm": 0.08322655409574509, + "learning_rate": 3.3039639884445947e-05, + "loss": 0.0349, + "step": 85200 + }, + { + "epoch": 0.02605, + "grad_norm": 0.08271487057209015, + "learning_rate": 3.303572567869273e-05, + "loss": 0.0349, + "step": 85210 + }, + { + "epoch": 0.0261, + "grad_norm": 0.08312942832708359, + "learning_rate": 3.303181125324579e-05, + "loss": 0.0342, + "step": 85220 + }, + { + "epoch": 0.02615, + "grad_norm": 0.06256101280450821, + "learning_rate": 3.302789660821215e-05, + "loss": 0.0353, + "step": 85230 + }, + { + "epoch": 0.0262, + "grad_norm": 0.08462122827768326, + "learning_rate": 3.302398174369883e-05, + "loss": 0.0362, + "step": 85240 + }, + { + "epoch": 0.02625, + "grad_norm": 0.08205302804708481, + "learning_rate": 3.302006665981287e-05, + "loss": 0.0356, + "step": 85250 + }, + { + "epoch": 0.0263, + "grad_norm": 0.08983128517866135, + "learning_rate": 3.30161513566613e-05, + "loss": 0.0377, + "step": 85260 + }, + { + "epoch": 0.02635, + "grad_norm": 0.11347589641809464, + "learning_rate": 3.3012235834351154e-05, + "loss": 0.0361, + "step": 85270 + }, + { + "epoch": 0.0264, + "grad_norm": 0.08793602138757706, + "learning_rate": 3.30083200929895e-05, + "loss": 0.0371, + "step": 85280 + }, + { + "epoch": 0.02645, + "grad_norm": 0.09794158488512039, + "learning_rate": 3.3004404132683384e-05, + "loss": 0.0366, + "step": 85290 + }, + { + "epoch": 0.0265, + "grad_norm": 0.08309174329042435, + "learning_rate": 3.300048795353986e-05, + "loss": 0.0363, + "step": 85300 + }, + { + "epoch": 0.02655, + "grad_norm": 0.07576657831668854, + "learning_rate": 3.2996571555666e-05, + "loss": 0.0357, + "step": 85310 + }, + { + "epoch": 0.0266, + "grad_norm": 0.08200152963399887, + "learning_rate": 3.299265493916888e-05, + "loss": 0.0374, + "step": 85320 + }, + { + "epoch": 0.02665, + "grad_norm": 0.07354395091533661, + "learning_rate": 3.298873810415558e-05, + "loss": 0.0368, + "step": 85330 + }, + { + "epoch": 0.0267, + "grad_norm": 0.09424217790365219, + "learning_rate": 3.298482105073318e-05, + "loss": 0.036, + "step": 85340 + }, + { + "epoch": 0.02675, + "grad_norm": 0.09676692634820938, + "learning_rate": 3.298090377900877e-05, + "loss": 0.0382, + "step": 85350 + }, + { + "epoch": 0.0268, + "grad_norm": 0.09819674491882324, + "learning_rate": 3.297698628908945e-05, + "loss": 0.0376, + "step": 85360 + }, + { + "epoch": 0.02685, + "grad_norm": 0.11677680909633636, + "learning_rate": 3.297306858108232e-05, + "loss": 0.0384, + "step": 85370 + }, + { + "epoch": 0.0269, + "grad_norm": 0.07928648591041565, + "learning_rate": 3.296915065509449e-05, + "loss": 0.0361, + "step": 85380 + }, + { + "epoch": 0.02695, + "grad_norm": 0.11051487922668457, + "learning_rate": 3.296523251123308e-05, + "loss": 0.0397, + "step": 85390 + }, + { + "epoch": 0.027, + "grad_norm": 0.08754760771989822, + "learning_rate": 3.29613141496052e-05, + "loss": 0.0364, + "step": 85400 + }, + { + "epoch": 0.02705, + "grad_norm": 0.09275832772254944, + "learning_rate": 3.295739557031799e-05, + "loss": 0.037, + "step": 85410 + }, + { + "epoch": 0.0271, + "grad_norm": 0.08891261368989944, + "learning_rate": 3.295347677347857e-05, + "loss": 0.039, + "step": 85420 + }, + { + "epoch": 0.02715, + "grad_norm": 0.08942827582359314, + "learning_rate": 3.2949557759194075e-05, + "loss": 0.0369, + "step": 85430 + }, + { + "epoch": 0.0272, + "grad_norm": 0.0818541869521141, + "learning_rate": 3.294563852757167e-05, + "loss": 0.0367, + "step": 85440 + }, + { + "epoch": 0.02725, + "grad_norm": 0.08699609339237213, + "learning_rate": 3.294171907871849e-05, + "loss": 0.0374, + "step": 85450 + }, + { + "epoch": 0.0273, + "grad_norm": 0.10503701120615005, + "learning_rate": 3.2937799412741685e-05, + "loss": 0.0367, + "step": 85460 + }, + { + "epoch": 0.02735, + "grad_norm": 0.08131738752126694, + "learning_rate": 3.2933879529748435e-05, + "loss": 0.0356, + "step": 85470 + }, + { + "epoch": 0.0274, + "grad_norm": 0.07324840873479843, + "learning_rate": 3.2929959429845896e-05, + "loss": 0.0355, + "step": 85480 + }, + { + "epoch": 0.02745, + "grad_norm": 0.08623120933771133, + "learning_rate": 3.292603911314125e-05, + "loss": 0.0358, + "step": 85490 + }, + { + "epoch": 0.0275, + "grad_norm": 0.13020314276218414, + "learning_rate": 3.292211857974166e-05, + "loss": 0.0406, + "step": 85500 + }, + { + "epoch": 0.02755, + "grad_norm": 0.09622704982757568, + "learning_rate": 3.291819782975434e-05, + "loss": 0.0346, + "step": 85510 + }, + { + "epoch": 0.0276, + "grad_norm": 0.09420084953308105, + "learning_rate": 3.291427686328645e-05, + "loss": 0.0365, + "step": 85520 + }, + { + "epoch": 0.02765, + "grad_norm": 0.09245433658361435, + "learning_rate": 3.291035568044522e-05, + "loss": 0.0355, + "step": 85530 + }, + { + "epoch": 0.0277, + "grad_norm": 0.08546995371580124, + "learning_rate": 3.2906434281337826e-05, + "loss": 0.0373, + "step": 85540 + }, + { + "epoch": 0.02775, + "grad_norm": 0.07339838892221451, + "learning_rate": 3.29025126660715e-05, + "loss": 0.0364, + "step": 85550 + }, + { + "epoch": 0.0278, + "grad_norm": 0.0915692150592804, + "learning_rate": 3.289859083475343e-05, + "loss": 0.0353, + "step": 85560 + }, + { + "epoch": 0.02785, + "grad_norm": 0.10731060057878494, + "learning_rate": 3.289466878749087e-05, + "loss": 0.0368, + "step": 85570 + }, + { + "epoch": 0.0279, + "grad_norm": 0.09024395048618317, + "learning_rate": 3.289074652439102e-05, + "loss": 0.0366, + "step": 85580 + }, + { + "epoch": 0.02795, + "grad_norm": 0.11369603127241135, + "learning_rate": 3.2886824045561134e-05, + "loss": 0.0356, + "step": 85590 + }, + { + "epoch": 0.028, + "grad_norm": 0.11600227653980255, + "learning_rate": 3.288290135110844e-05, + "loss": 0.0355, + "step": 85600 + }, + { + "epoch": 0.02805, + "grad_norm": 0.10826078802347183, + "learning_rate": 3.2878978441140174e-05, + "loss": 0.0369, + "step": 85610 + }, + { + "epoch": 0.0281, + "grad_norm": 0.09068696945905685, + "learning_rate": 3.2875055315763606e-05, + "loss": 0.0353, + "step": 85620 + }, + { + "epoch": 0.02815, + "grad_norm": 0.08113870024681091, + "learning_rate": 3.287113197508598e-05, + "loss": 0.0351, + "step": 85630 + }, + { + "epoch": 0.0282, + "grad_norm": 0.08663254231214523, + "learning_rate": 3.286720841921457e-05, + "loss": 0.0348, + "step": 85640 + }, + { + "epoch": 0.02825, + "grad_norm": 0.07541149854660034, + "learning_rate": 3.286328464825663e-05, + "loss": 0.0353, + "step": 85650 + }, + { + "epoch": 0.0283, + "grad_norm": 0.06735502183437347, + "learning_rate": 3.285936066231945e-05, + "loss": 0.035, + "step": 85660 + }, + { + "epoch": 0.02835, + "grad_norm": 0.09373398870229721, + "learning_rate": 3.2855436461510295e-05, + "loss": 0.035, + "step": 85670 + }, + { + "epoch": 0.0284, + "grad_norm": 0.0879872515797615, + "learning_rate": 3.285151204593646e-05, + "loss": 0.0386, + "step": 85680 + }, + { + "epoch": 0.02845, + "grad_norm": 0.07386450469493866, + "learning_rate": 3.2847587415705236e-05, + "loss": 0.0356, + "step": 85690 + }, + { + "epoch": 0.0285, + "grad_norm": 0.08008229732513428, + "learning_rate": 3.284366257092392e-05, + "loss": 0.0367, + "step": 85700 + }, + { + "epoch": 0.02855, + "grad_norm": 0.08670288324356079, + "learning_rate": 3.283973751169981e-05, + "loss": 0.0342, + "step": 85710 + }, + { + "epoch": 0.0286, + "grad_norm": 0.07803010940551758, + "learning_rate": 3.283581223814024e-05, + "loss": 0.0366, + "step": 85720 + }, + { + "epoch": 0.02865, + "grad_norm": 0.0688256099820137, + "learning_rate": 3.283188675035249e-05, + "loss": 0.0347, + "step": 85730 + }, + { + "epoch": 0.0287, + "grad_norm": 0.07695078104734421, + "learning_rate": 3.2827961048443906e-05, + "loss": 0.0343, + "step": 85740 + }, + { + "epoch": 0.02875, + "grad_norm": 0.0665489062666893, + "learning_rate": 3.28240351325218e-05, + "loss": 0.0357, + "step": 85750 + }, + { + "epoch": 0.0288, + "grad_norm": 0.06531370431184769, + "learning_rate": 3.282010900269352e-05, + "loss": 0.036, + "step": 85760 + }, + { + "epoch": 0.02885, + "grad_norm": 0.08302236348390579, + "learning_rate": 3.281618265906639e-05, + "loss": 0.0353, + "step": 85770 + }, + { + "epoch": 0.0289, + "grad_norm": 0.08833914250135422, + "learning_rate": 3.281225610174778e-05, + "loss": 0.0345, + "step": 85780 + }, + { + "epoch": 0.02895, + "grad_norm": 0.08267134428024292, + "learning_rate": 3.2808329330845006e-05, + "loss": 0.0357, + "step": 85790 + }, + { + "epoch": 0.029, + "grad_norm": 0.0864887535572052, + "learning_rate": 3.280440234646544e-05, + "loss": 0.0357, + "step": 85800 + }, + { + "epoch": 0.02905, + "grad_norm": 0.0825200080871582, + "learning_rate": 3.280047514871645e-05, + "loss": 0.0354, + "step": 85810 + }, + { + "epoch": 0.0291, + "grad_norm": 0.08538206666707993, + "learning_rate": 3.2796547737705414e-05, + "loss": 0.0362, + "step": 85820 + }, + { + "epoch": 0.02915, + "grad_norm": 0.06769226491451263, + "learning_rate": 3.2792620113539674e-05, + "loss": 0.0351, + "step": 85830 + }, + { + "epoch": 0.0292, + "grad_norm": 0.08536852896213531, + "learning_rate": 3.2788692276326635e-05, + "loss": 0.038, + "step": 85840 + }, + { + "epoch": 0.02925, + "grad_norm": 0.08903807401657104, + "learning_rate": 3.2784764226173673e-05, + "loss": 0.0373, + "step": 85850 + }, + { + "epoch": 0.0293, + "grad_norm": 0.07940450310707092, + "learning_rate": 3.278083596318819e-05, + "loss": 0.0359, + "step": 85860 + }, + { + "epoch": 0.02935, + "grad_norm": 0.08667632192373276, + "learning_rate": 3.277690748747757e-05, + "loss": 0.0371, + "step": 85870 + }, + { + "epoch": 0.0294, + "grad_norm": 0.0772915631532669, + "learning_rate": 3.277297879914921e-05, + "loss": 0.0369, + "step": 85880 + }, + { + "epoch": 0.02945, + "grad_norm": 0.08793382346630096, + "learning_rate": 3.2769049898310545e-05, + "loss": 0.036, + "step": 85890 + }, + { + "epoch": 0.0295, + "grad_norm": 0.07650678604841232, + "learning_rate": 3.276512078506897e-05, + "loss": 0.0369, + "step": 85900 + }, + { + "epoch": 0.02955, + "grad_norm": 0.08649063855409622, + "learning_rate": 3.2761191459531904e-05, + "loss": 0.0386, + "step": 85910 + }, + { + "epoch": 0.0296, + "grad_norm": 0.08880212903022766, + "learning_rate": 3.275726192180678e-05, + "loss": 0.0377, + "step": 85920 + }, + { + "epoch": 0.02965, + "grad_norm": 0.08839485049247742, + "learning_rate": 3.2753332172001036e-05, + "loss": 0.0356, + "step": 85930 + }, + { + "epoch": 0.0297, + "grad_norm": 0.08877583593130112, + "learning_rate": 3.27494022102221e-05, + "loss": 0.0374, + "step": 85940 + }, + { + "epoch": 0.02975, + "grad_norm": 0.08329977095127106, + "learning_rate": 3.274547203657742e-05, + "loss": 0.0371, + "step": 85950 + }, + { + "epoch": 0.0298, + "grad_norm": 0.08751018345355988, + "learning_rate": 3.274154165117444e-05, + "loss": 0.036, + "step": 85960 + }, + { + "epoch": 0.02985, + "grad_norm": 0.07727228850126266, + "learning_rate": 3.273761105412063e-05, + "loss": 0.0375, + "step": 85970 + }, + { + "epoch": 0.0299, + "grad_norm": 0.07666754722595215, + "learning_rate": 3.273368024552343e-05, + "loss": 0.0362, + "step": 85980 + }, + { + "epoch": 0.02995, + "grad_norm": 0.09056229144334793, + "learning_rate": 3.272974922549032e-05, + "loss": 0.0368, + "step": 85990 + }, + { + "epoch": 0.03, + "grad_norm": 0.09175769984722137, + "learning_rate": 3.2725817994128774e-05, + "loss": 0.0374, + "step": 86000 + }, + { + "epoch": 0.03005, + "grad_norm": 0.08103155344724655, + "learning_rate": 3.272188655154626e-05, + "loss": 0.0366, + "step": 86010 + }, + { + "epoch": 0.0301, + "grad_norm": 0.10952797532081604, + "learning_rate": 3.2717954897850264e-05, + "loss": 0.0378, + "step": 86020 + }, + { + "epoch": 0.03015, + "grad_norm": 0.0774063915014267, + "learning_rate": 3.27140230331483e-05, + "loss": 0.0365, + "step": 86030 + }, + { + "epoch": 0.0302, + "grad_norm": 0.08491382747888565, + "learning_rate": 3.2710090957547826e-05, + "loss": 0.0368, + "step": 86040 + }, + { + "epoch": 0.03025, + "grad_norm": 0.08859048038721085, + "learning_rate": 3.2706158671156375e-05, + "loss": 0.0349, + "step": 86050 + }, + { + "epoch": 0.0303, + "grad_norm": 0.06936676055192947, + "learning_rate": 3.270222617408144e-05, + "loss": 0.0356, + "step": 86060 + }, + { + "epoch": 0.03035, + "grad_norm": 0.08609030395746231, + "learning_rate": 3.269829346643052e-05, + "loss": 0.0352, + "step": 86070 + }, + { + "epoch": 0.0304, + "grad_norm": 0.09073681384325027, + "learning_rate": 3.269436054831116e-05, + "loss": 0.0381, + "step": 86080 + }, + { + "epoch": 0.03045, + "grad_norm": 0.10690750181674957, + "learning_rate": 3.269042741983087e-05, + "loss": 0.0381, + "step": 86090 + }, + { + "epoch": 0.0305, + "grad_norm": 0.08611635863780975, + "learning_rate": 3.268649408109719e-05, + "loss": 0.0371, + "step": 86100 + }, + { + "epoch": 0.03055, + "grad_norm": 0.09616200625896454, + "learning_rate": 3.268256053221764e-05, + "loss": 0.0362, + "step": 86110 + }, + { + "epoch": 0.0306, + "grad_norm": 0.09797549247741699, + "learning_rate": 3.267862677329978e-05, + "loss": 0.0364, + "step": 86120 + }, + { + "epoch": 0.03065, + "grad_norm": 0.0829409658908844, + "learning_rate": 3.267469280445114e-05, + "loss": 0.0355, + "step": 86130 + }, + { + "epoch": 0.0307, + "grad_norm": 0.08753553777933121, + "learning_rate": 3.267075862577929e-05, + "loss": 0.0353, + "step": 86140 + }, + { + "epoch": 0.03075, + "grad_norm": 0.0914936363697052, + "learning_rate": 3.2666824237391774e-05, + "loss": 0.036, + "step": 86150 + }, + { + "epoch": 0.0308, + "grad_norm": 0.09489694237709045, + "learning_rate": 3.2662889639396175e-05, + "loss": 0.0377, + "step": 86160 + }, + { + "epoch": 0.03085, + "grad_norm": 0.08153481781482697, + "learning_rate": 3.265895483190004e-05, + "loss": 0.0364, + "step": 86170 + }, + { + "epoch": 0.0309, + "grad_norm": 0.08516906201839447, + "learning_rate": 3.2655019815010965e-05, + "loss": 0.0356, + "step": 86180 + }, + { + "epoch": 0.03095, + "grad_norm": 0.08353491872549057, + "learning_rate": 3.265108458883652e-05, + "loss": 0.0376, + "step": 86190 + }, + { + "epoch": 0.031, + "grad_norm": 0.08986058086156845, + "learning_rate": 3.2647149153484296e-05, + "loss": 0.0367, + "step": 86200 + }, + { + "epoch": 0.03105, + "grad_norm": 0.09755656123161316, + "learning_rate": 3.264321350906189e-05, + "loss": 0.037, + "step": 86210 + }, + { + "epoch": 0.0311, + "grad_norm": 0.08106609433889389, + "learning_rate": 3.2639277655676896e-05, + "loss": 0.0382, + "step": 86220 + }, + { + "epoch": 0.03115, + "grad_norm": 0.0735519528388977, + "learning_rate": 3.263534159343692e-05, + "loss": 0.0353, + "step": 86230 + }, + { + "epoch": 0.0312, + "grad_norm": 0.08236845582723618, + "learning_rate": 3.263140532244958e-05, + "loss": 0.036, + "step": 86240 + }, + { + "epoch": 0.03125, + "grad_norm": 0.09598668664693832, + "learning_rate": 3.262746884282248e-05, + "loss": 0.0378, + "step": 86250 + }, + { + "epoch": 0.0313, + "grad_norm": 0.07468762993812561, + "learning_rate": 3.262353215466325e-05, + "loss": 0.037, + "step": 86260 + }, + { + "epoch": 0.03135, + "grad_norm": 0.10694070905447006, + "learning_rate": 3.26195952580795e-05, + "loss": 0.0358, + "step": 86270 + }, + { + "epoch": 0.0314, + "grad_norm": 0.0785185769200325, + "learning_rate": 3.2615658153178894e-05, + "loss": 0.0364, + "step": 86280 + }, + { + "epoch": 0.03145, + "grad_norm": 0.07416651397943497, + "learning_rate": 3.2611720840069055e-05, + "loss": 0.0351, + "step": 86290 + }, + { + "epoch": 0.0315, + "grad_norm": 0.07013098150491714, + "learning_rate": 3.260778331885762e-05, + "loss": 0.0354, + "step": 86300 + }, + { + "epoch": 0.03155, + "grad_norm": 0.07015841454267502, + "learning_rate": 3.260384558965226e-05, + "loss": 0.0348, + "step": 86310 + }, + { + "epoch": 0.0316, + "grad_norm": 0.07380936294794083, + "learning_rate": 3.2599907652560605e-05, + "loss": 0.0346, + "step": 86320 + }, + { + "epoch": 0.03165, + "grad_norm": 0.06309569627046585, + "learning_rate": 3.259596950769033e-05, + "loss": 0.0355, + "step": 86330 + }, + { + "epoch": 0.0317, + "grad_norm": 0.06355613470077515, + "learning_rate": 3.259203115514911e-05, + "loss": 0.0358, + "step": 86340 + }, + { + "epoch": 0.03175, + "grad_norm": 0.08989034593105316, + "learning_rate": 3.258809259504461e-05, + "loss": 0.0357, + "step": 86350 + }, + { + "epoch": 0.0318, + "grad_norm": 0.08825557678937912, + "learning_rate": 3.258415382748451e-05, + "loss": 0.0411, + "step": 86360 + }, + { + "epoch": 0.03185, + "grad_norm": 0.08263460546731949, + "learning_rate": 3.258021485257649e-05, + "loss": 0.035, + "step": 86370 + }, + { + "epoch": 0.0319, + "grad_norm": 0.08822799474000931, + "learning_rate": 3.2576275670428245e-05, + "loss": 0.037, + "step": 86380 + }, + { + "epoch": 0.03195, + "grad_norm": 0.07840435951948166, + "learning_rate": 3.2572336281147466e-05, + "loss": 0.0353, + "step": 86390 + }, + { + "epoch": 0.032, + "grad_norm": 0.08038538694381714, + "learning_rate": 3.256839668484186e-05, + "loss": 0.038, + "step": 86400 + }, + { + "epoch": 0.03205, + "grad_norm": 0.08051291853189468, + "learning_rate": 3.2564456881619135e-05, + "loss": 0.0369, + "step": 86410 + }, + { + "epoch": 0.0321, + "grad_norm": 0.08047017455101013, + "learning_rate": 3.2560516871587e-05, + "loss": 0.0376, + "step": 86420 + }, + { + "epoch": 0.03215, + "grad_norm": 0.08687380701303482, + "learning_rate": 3.255657665485317e-05, + "loss": 0.0366, + "step": 86430 + }, + { + "epoch": 0.0322, + "grad_norm": 0.09004174917936325, + "learning_rate": 3.255263623152537e-05, + "loss": 0.0359, + "step": 86440 + }, + { + "epoch": 0.03225, + "grad_norm": 0.0855182409286499, + "learning_rate": 3.2548695601711344e-05, + "loss": 0.0374, + "step": 86450 + }, + { + "epoch": 0.0323, + "grad_norm": 0.09575967490673065, + "learning_rate": 3.25447547655188e-05, + "loss": 0.0379, + "step": 86460 + }, + { + "epoch": 0.03235, + "grad_norm": 0.09607132524251938, + "learning_rate": 3.254081372305552e-05, + "loss": 0.0371, + "step": 86470 + }, + { + "epoch": 0.0324, + "grad_norm": 0.08806268125772476, + "learning_rate": 3.2536872474429205e-05, + "loss": 0.0376, + "step": 86480 + }, + { + "epoch": 0.03245, + "grad_norm": 0.09159974753856659, + "learning_rate": 3.253293101974764e-05, + "loss": 0.0372, + "step": 86490 + }, + { + "epoch": 0.0325, + "grad_norm": 0.10884981602430344, + "learning_rate": 3.252898935911856e-05, + "loss": 0.0376, + "step": 86500 + }, + { + "epoch": 0.03255, + "grad_norm": 0.12202293425798416, + "learning_rate": 3.2525047492649744e-05, + "loss": 0.0363, + "step": 86510 + }, + { + "epoch": 0.0326, + "grad_norm": 0.09730051457881927, + "learning_rate": 3.252110542044896e-05, + "loss": 0.0388, + "step": 86520 + }, + { + "epoch": 0.03265, + "grad_norm": 0.0869157612323761, + "learning_rate": 3.251716314262398e-05, + "loss": 0.0352, + "step": 86530 + }, + { + "epoch": 0.0327, + "grad_norm": 0.09806448221206665, + "learning_rate": 3.251322065928257e-05, + "loss": 0.0349, + "step": 86540 + }, + { + "epoch": 0.03275, + "grad_norm": 0.07501520216464996, + "learning_rate": 3.250927797053254e-05, + "loss": 0.0348, + "step": 86550 + }, + { + "epoch": 0.0328, + "grad_norm": 0.08077821880578995, + "learning_rate": 3.250533507648168e-05, + "loss": 0.0343, + "step": 86560 + }, + { + "epoch": 0.03285, + "grad_norm": 0.08169900625944138, + "learning_rate": 3.250139197723776e-05, + "loss": 0.0348, + "step": 86570 + }, + { + "epoch": 0.0329, + "grad_norm": 0.07048002630472183, + "learning_rate": 3.249744867290862e-05, + "loss": 0.036, + "step": 86580 + }, + { + "epoch": 0.03295, + "grad_norm": 0.0722208097577095, + "learning_rate": 3.249350516360203e-05, + "loss": 0.0386, + "step": 86590 + }, + { + "epoch": 0.033, + "grad_norm": 0.11111637204885483, + "learning_rate": 3.2489561449425844e-05, + "loss": 0.0345, + "step": 86600 + }, + { + "epoch": 0.03305, + "grad_norm": 0.11488772928714752, + "learning_rate": 3.248561753048786e-05, + "loss": 0.0353, + "step": 86610 + }, + { + "epoch": 0.0331, + "grad_norm": 0.07998430728912354, + "learning_rate": 3.2481673406895895e-05, + "loss": 0.0352, + "step": 86620 + }, + { + "epoch": 0.03315, + "grad_norm": 0.07691726833581924, + "learning_rate": 3.247772907875779e-05, + "loss": 0.0353, + "step": 86630 + }, + { + "epoch": 0.0332, + "grad_norm": 0.08369085937738419, + "learning_rate": 3.247378454618138e-05, + "loss": 0.0343, + "step": 86640 + }, + { + "epoch": 0.03325, + "grad_norm": 0.09220634400844574, + "learning_rate": 3.2469839809274514e-05, + "loss": 0.0373, + "step": 86650 + }, + { + "epoch": 0.0333, + "grad_norm": 0.09541697800159454, + "learning_rate": 3.2465894868145034e-05, + "loss": 0.0345, + "step": 86660 + }, + { + "epoch": 0.03335, + "grad_norm": 0.09451382607221603, + "learning_rate": 3.246194972290079e-05, + "loss": 0.0351, + "step": 86670 + }, + { + "epoch": 0.0334, + "grad_norm": 0.09029912203550339, + "learning_rate": 3.2458004373649656e-05, + "loss": 0.0345, + "step": 86680 + }, + { + "epoch": 0.03345, + "grad_norm": 0.0942051112651825, + "learning_rate": 3.245405882049947e-05, + "loss": 0.0349, + "step": 86690 + }, + { + "epoch": 0.0335, + "grad_norm": 0.07905034720897675, + "learning_rate": 3.245011306355812e-05, + "loss": 0.0341, + "step": 86700 + }, + { + "epoch": 0.03355, + "grad_norm": 0.07603990286588669, + "learning_rate": 3.2446167102933474e-05, + "loss": 0.0356, + "step": 86710 + }, + { + "epoch": 0.0336, + "grad_norm": 0.09684465080499649, + "learning_rate": 3.244222093873342e-05, + "loss": 0.0356, + "step": 86720 + }, + { + "epoch": 0.03365, + "grad_norm": 0.08049037307500839, + "learning_rate": 3.243827457106584e-05, + "loss": 0.0362, + "step": 86730 + }, + { + "epoch": 0.0337, + "grad_norm": 0.07895974814891815, + "learning_rate": 3.243432800003863e-05, + "loss": 0.0347, + "step": 86740 + }, + { + "epoch": 0.03375, + "grad_norm": 0.06666752696037292, + "learning_rate": 3.2430381225759686e-05, + "loss": 0.0337, + "step": 86750 + }, + { + "epoch": 0.0338, + "grad_norm": 0.0748344138264656, + "learning_rate": 3.242643424833691e-05, + "loss": 0.0354, + "step": 86760 + }, + { + "epoch": 0.03385, + "grad_norm": 0.07513680309057236, + "learning_rate": 3.242248706787821e-05, + "loss": 0.0355, + "step": 86770 + }, + { + "epoch": 0.0339, + "grad_norm": 0.07731368392705917, + "learning_rate": 3.241853968449151e-05, + "loss": 0.0353, + "step": 86780 + }, + { + "epoch": 0.03395, + "grad_norm": 0.08503787219524384, + "learning_rate": 3.241459209828471e-05, + "loss": 0.0391, + "step": 86790 + }, + { + "epoch": 0.034, + "grad_norm": 0.08694326132535934, + "learning_rate": 3.241064430936575e-05, + "loss": 0.0373, + "step": 86800 + }, + { + "epoch": 0.03405, + "grad_norm": 0.07162105292081833, + "learning_rate": 3.2406696317842566e-05, + "loss": 0.035, + "step": 86810 + }, + { + "epoch": 0.0341, + "grad_norm": 0.08332312852144241, + "learning_rate": 3.2402748123823076e-05, + "loss": 0.036, + "step": 86820 + }, + { + "epoch": 0.03415, + "grad_norm": 0.08905179798603058, + "learning_rate": 3.239879972741524e-05, + "loss": 0.0381, + "step": 86830 + }, + { + "epoch": 0.0342, + "grad_norm": 0.08517122268676758, + "learning_rate": 3.2394851128727e-05, + "loss": 0.0362, + "step": 86840 + }, + { + "epoch": 0.03425, + "grad_norm": 0.07143902778625488, + "learning_rate": 3.2390902327866315e-05, + "loss": 0.0348, + "step": 86850 + }, + { + "epoch": 0.0343, + "grad_norm": 0.08254444599151611, + "learning_rate": 3.238695332494113e-05, + "loss": 0.0351, + "step": 86860 + }, + { + "epoch": 0.03435, + "grad_norm": 0.08892206102609634, + "learning_rate": 3.2383004120059415e-05, + "loss": 0.037, + "step": 86870 + }, + { + "epoch": 0.0344, + "grad_norm": 0.08615243434906006, + "learning_rate": 3.237905471332914e-05, + "loss": 0.0355, + "step": 86880 + }, + { + "epoch": 0.03445, + "grad_norm": 0.06337711960077286, + "learning_rate": 3.237510510485828e-05, + "loss": 0.0354, + "step": 86890 + }, + { + "epoch": 0.0345, + "grad_norm": 0.08787225186824799, + "learning_rate": 3.237115529475482e-05, + "loss": 0.0366, + "step": 86900 + }, + { + "epoch": 0.03455, + "grad_norm": 0.07330426573753357, + "learning_rate": 3.2367205283126744e-05, + "loss": 0.0344, + "step": 86910 + }, + { + "epoch": 0.0346, + "grad_norm": 0.07716590911149979, + "learning_rate": 3.236325507008204e-05, + "loss": 0.0365, + "step": 86920 + }, + { + "epoch": 0.03465, + "grad_norm": 0.0840856209397316, + "learning_rate": 3.235930465572872e-05, + "loss": 0.0349, + "step": 86930 + }, + { + "epoch": 0.0347, + "grad_norm": 0.07107097655534744, + "learning_rate": 3.2355354040174765e-05, + "loss": 0.037, + "step": 86940 + }, + { + "epoch": 0.03475, + "grad_norm": 0.07819947600364685, + "learning_rate": 3.23514032235282e-05, + "loss": 0.0347, + "step": 86950 + }, + { + "epoch": 0.0348, + "grad_norm": 0.0675472691655159, + "learning_rate": 3.234745220589702e-05, + "loss": 0.0344, + "step": 86960 + }, + { + "epoch": 0.03485, + "grad_norm": 0.08264216035604477, + "learning_rate": 3.234350098738927e-05, + "loss": 0.0339, + "step": 86970 + }, + { + "epoch": 0.0349, + "grad_norm": 0.09070836007595062, + "learning_rate": 3.233954956811295e-05, + "loss": 0.0346, + "step": 86980 + }, + { + "epoch": 0.03495, + "grad_norm": 0.07672609388828278, + "learning_rate": 3.2335597948176116e-05, + "loss": 0.0346, + "step": 86990 + }, + { + "epoch": 0.035, + "grad_norm": 0.08686935156583786, + "learning_rate": 3.233164612768678e-05, + "loss": 0.0352, + "step": 87000 + }, + { + "epoch": 0.03505, + "grad_norm": 0.07569900155067444, + "learning_rate": 3.2327694106753e-05, + "loss": 0.0353, + "step": 87010 + }, + { + "epoch": 0.0351, + "grad_norm": 0.09087596088647842, + "learning_rate": 3.232374188548281e-05, + "loss": 0.0352, + "step": 87020 + }, + { + "epoch": 0.03515, + "grad_norm": 0.09324821829795837, + "learning_rate": 3.231978946398427e-05, + "loss": 0.0368, + "step": 87030 + }, + { + "epoch": 0.0352, + "grad_norm": 0.08531796187162399, + "learning_rate": 3.2315836842365435e-05, + "loss": 0.0354, + "step": 87040 + }, + { + "epoch": 0.03525, + "grad_norm": 0.08075599372386932, + "learning_rate": 3.231188402073437e-05, + "loss": 0.0365, + "step": 87050 + }, + { + "epoch": 0.0353, + "grad_norm": 0.08507949113845825, + "learning_rate": 3.2307930999199155e-05, + "loss": 0.0371, + "step": 87060 + }, + { + "epoch": 0.03535, + "grad_norm": 0.07883370667695999, + "learning_rate": 3.230397777786783e-05, + "loss": 0.0348, + "step": 87070 + }, + { + "epoch": 0.0354, + "grad_norm": 0.07218769192695618, + "learning_rate": 3.2300024356848514e-05, + "loss": 0.035, + "step": 87080 + }, + { + "epoch": 0.03545, + "grad_norm": 0.09903611242771149, + "learning_rate": 3.229607073624926e-05, + "loss": 0.0353, + "step": 87090 + }, + { + "epoch": 0.0355, + "grad_norm": 0.08573555201292038, + "learning_rate": 3.229211691617819e-05, + "loss": 0.0364, + "step": 87100 + }, + { + "epoch": 0.03555, + "grad_norm": 0.06599839776754379, + "learning_rate": 3.228816289674337e-05, + "loss": 0.0362, + "step": 87110 + }, + { + "epoch": 0.0356, + "grad_norm": 0.0798148587346077, + "learning_rate": 3.2284208678052924e-05, + "loss": 0.0357, + "step": 87120 + }, + { + "epoch": 0.03565, + "grad_norm": 0.0782741829752922, + "learning_rate": 3.2280254260214936e-05, + "loss": 0.0343, + "step": 87130 + }, + { + "epoch": 0.0357, + "grad_norm": 0.06847995519638062, + "learning_rate": 3.227629964333755e-05, + "loss": 0.0346, + "step": 87140 + }, + { + "epoch": 0.03575, + "grad_norm": 0.07769029587507248, + "learning_rate": 3.227234482752884e-05, + "loss": 0.0344, + "step": 87150 + }, + { + "epoch": 0.0358, + "grad_norm": 0.09033706784248352, + "learning_rate": 3.226838981289698e-05, + "loss": 0.0361, + "step": 87160 + }, + { + "epoch": 0.03585, + "grad_norm": 0.0827813595533371, + "learning_rate": 3.226443459955006e-05, + "loss": 0.0345, + "step": 87170 + }, + { + "epoch": 0.0359, + "grad_norm": 0.0844188928604126, + "learning_rate": 3.226047918759623e-05, + "loss": 0.0346, + "step": 87180 + }, + { + "epoch": 0.03595, + "grad_norm": 0.073844313621521, + "learning_rate": 3.225652357714363e-05, + "loss": 0.036, + "step": 87190 + }, + { + "epoch": 0.036, + "grad_norm": 0.07636504620313644, + "learning_rate": 3.2252567768300394e-05, + "loss": 0.0355, + "step": 87200 + }, + { + "epoch": 0.03605, + "grad_norm": 0.08958642184734344, + "learning_rate": 3.2248611761174684e-05, + "loss": 0.0346, + "step": 87210 + }, + { + "epoch": 0.0361, + "grad_norm": 0.06772074848413467, + "learning_rate": 3.2244655555874645e-05, + "loss": 0.0344, + "step": 87220 + }, + { + "epoch": 0.03615, + "grad_norm": 0.11023818701505661, + "learning_rate": 3.224069915250846e-05, + "loss": 0.0362, + "step": 87230 + }, + { + "epoch": 0.0362, + "grad_norm": 0.08262085169553757, + "learning_rate": 3.2236742551184265e-05, + "loss": 0.0357, + "step": 87240 + }, + { + "epoch": 0.03625, + "grad_norm": 0.10467635840177536, + "learning_rate": 3.223278575201026e-05, + "loss": 0.0362, + "step": 87250 + }, + { + "epoch": 0.0363, + "grad_norm": 0.09873107075691223, + "learning_rate": 3.22288287550946e-05, + "loss": 0.0374, + "step": 87260 + }, + { + "epoch": 0.03635, + "grad_norm": 0.08017779141664505, + "learning_rate": 3.2224871560545484e-05, + "loss": 0.0358, + "step": 87270 + }, + { + "epoch": 0.0364, + "grad_norm": 0.0882391408085823, + "learning_rate": 3.222091416847109e-05, + "loss": 0.0353, + "step": 87280 + }, + { + "epoch": 0.03645, + "grad_norm": 0.09055408090353012, + "learning_rate": 3.221695657897961e-05, + "loss": 0.0355, + "step": 87290 + }, + { + "epoch": 0.0365, + "grad_norm": 0.08563832938671112, + "learning_rate": 3.2212998792179255e-05, + "loss": 0.0354, + "step": 87300 + }, + { + "epoch": 0.03655, + "grad_norm": 0.08165394514799118, + "learning_rate": 3.2209040808178223e-05, + "loss": 0.0353, + "step": 87310 + }, + { + "epoch": 0.0366, + "grad_norm": 0.08915353566408157, + "learning_rate": 3.220508262708473e-05, + "loss": 0.0387, + "step": 87320 + }, + { + "epoch": 0.03665, + "grad_norm": 0.11730699241161346, + "learning_rate": 3.2201124249006976e-05, + "loss": 0.0355, + "step": 87330 + }, + { + "epoch": 0.0367, + "grad_norm": 0.1185787171125412, + "learning_rate": 3.219716567405319e-05, + "loss": 0.0375, + "step": 87340 + }, + { + "epoch": 0.03675, + "grad_norm": 0.1295836716890335, + "learning_rate": 3.21932069023316e-05, + "loss": 0.0376, + "step": 87350 + }, + { + "epoch": 0.0368, + "grad_norm": 0.12778916954994202, + "learning_rate": 3.2189247933950436e-05, + "loss": 0.0368, + "step": 87360 + }, + { + "epoch": 0.03685, + "grad_norm": 0.09289523214101791, + "learning_rate": 3.218528876901794e-05, + "loss": 0.0353, + "step": 87370 + }, + { + "epoch": 0.0369, + "grad_norm": 0.08766157180070877, + "learning_rate": 3.218132940764234e-05, + "loss": 0.0355, + "step": 87380 + }, + { + "epoch": 0.03695, + "grad_norm": 0.08578447997570038, + "learning_rate": 3.21773698499319e-05, + "loss": 0.0362, + "step": 87390 + }, + { + "epoch": 0.037, + "grad_norm": 0.07584724575281143, + "learning_rate": 3.2173410095994854e-05, + "loss": 0.0374, + "step": 87400 + }, + { + "epoch": 0.03705, + "grad_norm": 0.07478147000074387, + "learning_rate": 3.216945014593948e-05, + "loss": 0.0402, + "step": 87410 + }, + { + "epoch": 0.0371, + "grad_norm": 0.09229233115911484, + "learning_rate": 3.2165489999874024e-05, + "loss": 0.0364, + "step": 87420 + }, + { + "epoch": 0.03715, + "grad_norm": 0.07323060184717178, + "learning_rate": 3.216152965790677e-05, + "loss": 0.0362, + "step": 87430 + }, + { + "epoch": 0.0372, + "grad_norm": 0.08562704920768738, + "learning_rate": 3.2157569120145986e-05, + "loss": 0.0357, + "step": 87440 + }, + { + "epoch": 0.03725, + "grad_norm": 0.07262744754552841, + "learning_rate": 3.2153608386699955e-05, + "loss": 0.0377, + "step": 87450 + }, + { + "epoch": 0.0373, + "grad_norm": 0.0773356705904007, + "learning_rate": 3.214964745767694e-05, + "loss": 0.0359, + "step": 87460 + }, + { + "epoch": 0.03735, + "grad_norm": 0.08478112518787384, + "learning_rate": 3.214568633318526e-05, + "loss": 0.0369, + "step": 87470 + }, + { + "epoch": 0.0374, + "grad_norm": 0.08348061889410019, + "learning_rate": 3.2141725013333206e-05, + "loss": 0.036, + "step": 87480 + }, + { + "epoch": 0.03745, + "grad_norm": 0.0778733417391777, + "learning_rate": 3.213776349822907e-05, + "loss": 0.0365, + "step": 87490 + }, + { + "epoch": 0.0375, + "grad_norm": 0.0955268070101738, + "learning_rate": 3.213380178798117e-05, + "loss": 0.0356, + "step": 87500 + }, + { + "epoch": 0.03755, + "grad_norm": 0.09940584748983383, + "learning_rate": 3.212983988269779e-05, + "loss": 0.0361, + "step": 87510 + }, + { + "epoch": 0.0376, + "grad_norm": 0.08237015455961227, + "learning_rate": 3.212587778248728e-05, + "loss": 0.0372, + "step": 87520 + }, + { + "epoch": 0.03765, + "grad_norm": 0.08107157051563263, + "learning_rate": 3.212191548745794e-05, + "loss": 0.0358, + "step": 87530 + }, + { + "epoch": 0.0377, + "grad_norm": 0.09527316689491272, + "learning_rate": 3.211795299771812e-05, + "loss": 0.0369, + "step": 87540 + }, + { + "epoch": 0.03775, + "grad_norm": 0.08648992329835892, + "learning_rate": 3.211399031337612e-05, + "loss": 0.0367, + "step": 87550 + }, + { + "epoch": 0.0378, + "grad_norm": 0.0921984314918518, + "learning_rate": 3.211002743454031e-05, + "loss": 0.0359, + "step": 87560 + }, + { + "epoch": 0.03785, + "grad_norm": 0.07162953913211823, + "learning_rate": 3.210606436131902e-05, + "loss": 0.0371, + "step": 87570 + }, + { + "epoch": 0.0379, + "grad_norm": 0.0776718333363533, + "learning_rate": 3.21021010938206e-05, + "loss": 0.0349, + "step": 87580 + }, + { + "epoch": 0.03795, + "grad_norm": 0.06633836776018143, + "learning_rate": 3.20981376321534e-05, + "loss": 0.0368, + "step": 87590 + }, + { + "epoch": 0.038, + "grad_norm": 0.09633494913578033, + "learning_rate": 3.209417397642579e-05, + "loss": 0.0383, + "step": 87600 + }, + { + "epoch": 0.03805, + "grad_norm": 0.08774057030677795, + "learning_rate": 3.209021012674612e-05, + "loss": 0.038, + "step": 87610 + }, + { + "epoch": 0.0381, + "grad_norm": 0.08426745235919952, + "learning_rate": 3.208624608322277e-05, + "loss": 0.0359, + "step": 87620 + }, + { + "epoch": 0.03815, + "grad_norm": 0.09102395176887512, + "learning_rate": 3.2082281845964125e-05, + "loss": 0.0345, + "step": 87630 + }, + { + "epoch": 0.0382, + "grad_norm": 0.07798417657613754, + "learning_rate": 3.207831741507855e-05, + "loss": 0.0355, + "step": 87640 + }, + { + "epoch": 0.03825, + "grad_norm": 0.07958195358514786, + "learning_rate": 3.207435279067443e-05, + "loss": 0.0356, + "step": 87650 + }, + { + "epoch": 0.0383, + "grad_norm": 0.07889354974031448, + "learning_rate": 3.207038797286017e-05, + "loss": 0.0351, + "step": 87660 + }, + { + "epoch": 0.03835, + "grad_norm": 0.08104643225669861, + "learning_rate": 3.2066422961744155e-05, + "loss": 0.0383, + "step": 87670 + }, + { + "epoch": 0.0384, + "grad_norm": 0.07670079171657562, + "learning_rate": 3.2062457757434794e-05, + "loss": 0.0374, + "step": 87680 + }, + { + "epoch": 0.03845, + "grad_norm": 0.08696895092725754, + "learning_rate": 3.2058492360040485e-05, + "loss": 0.0365, + "step": 87690 + }, + { + "epoch": 0.0385, + "grad_norm": 0.07794123142957687, + "learning_rate": 3.2054526769669654e-05, + "loss": 0.0366, + "step": 87700 + }, + { + "epoch": 0.03855, + "grad_norm": 0.0759347528219223, + "learning_rate": 3.20505609864307e-05, + "loss": 0.036, + "step": 87710 + }, + { + "epoch": 0.0386, + "grad_norm": 0.07720106095075607, + "learning_rate": 3.204659501043207e-05, + "loss": 0.0349, + "step": 87720 + }, + { + "epoch": 0.03865, + "grad_norm": 0.07206979393959045, + "learning_rate": 3.204262884178218e-05, + "loss": 0.0354, + "step": 87730 + }, + { + "epoch": 0.0387, + "grad_norm": 0.07867118716239929, + "learning_rate": 3.203866248058946e-05, + "loss": 0.0355, + "step": 87740 + }, + { + "epoch": 0.03875, + "grad_norm": 0.07198004424571991, + "learning_rate": 3.2034695926962344e-05, + "loss": 0.0344, + "step": 87750 + }, + { + "epoch": 0.0388, + "grad_norm": 0.06714322417974472, + "learning_rate": 3.203072918100929e-05, + "loss": 0.035, + "step": 87760 + }, + { + "epoch": 0.03885, + "grad_norm": 0.08932408690452576, + "learning_rate": 3.202676224283874e-05, + "loss": 0.035, + "step": 87770 + }, + { + "epoch": 0.0389, + "grad_norm": 0.08211679756641388, + "learning_rate": 3.202279511255915e-05, + "loss": 0.0346, + "step": 87780 + }, + { + "epoch": 0.03895, + "grad_norm": 0.07144937664270401, + "learning_rate": 3.201882779027898e-05, + "loss": 0.0366, + "step": 87790 + }, + { + "epoch": 0.039, + "grad_norm": 0.08687329292297363, + "learning_rate": 3.20148602761067e-05, + "loss": 0.0336, + "step": 87800 + }, + { + "epoch": 0.03905, + "grad_norm": 0.08850160986185074, + "learning_rate": 3.201089257015077e-05, + "loss": 0.0359, + "step": 87810 + }, + { + "epoch": 0.0391, + "grad_norm": 0.09064152091741562, + "learning_rate": 3.2006924672519677e-05, + "loss": 0.0339, + "step": 87820 + }, + { + "epoch": 0.03915, + "grad_norm": 0.09283946454524994, + "learning_rate": 3.2002956583321895e-05, + "loss": 0.0362, + "step": 87830 + }, + { + "epoch": 0.0392, + "grad_norm": 0.09552500396966934, + "learning_rate": 3.19989883026659e-05, + "loss": 0.0353, + "step": 87840 + }, + { + "epoch": 0.03925, + "grad_norm": 0.07619480788707733, + "learning_rate": 3.1995019830660213e-05, + "loss": 0.034, + "step": 87850 + }, + { + "epoch": 0.0393, + "grad_norm": 0.07461630553007126, + "learning_rate": 3.1991051167413296e-05, + "loss": 0.0345, + "step": 87860 + }, + { + "epoch": 0.03935, + "grad_norm": 0.07883042097091675, + "learning_rate": 3.198708231303367e-05, + "loss": 0.0353, + "step": 87870 + }, + { + "epoch": 0.0394, + "grad_norm": 0.09476449340581894, + "learning_rate": 3.1983113267629835e-05, + "loss": 0.035, + "step": 87880 + }, + { + "epoch": 0.03945, + "grad_norm": 0.08893871307373047, + "learning_rate": 3.197914403131032e-05, + "loss": 0.0353, + "step": 87890 + }, + { + "epoch": 0.0395, + "grad_norm": 0.09010148793458939, + "learning_rate": 3.197517460418362e-05, + "loss": 0.0337, + "step": 87900 + }, + { + "epoch": 0.03955, + "grad_norm": 0.08100104331970215, + "learning_rate": 3.1971204986358274e-05, + "loss": 0.0349, + "step": 87910 + }, + { + "epoch": 0.0396, + "grad_norm": 0.07950063794851303, + "learning_rate": 3.196723517794279e-05, + "loss": 0.0365, + "step": 87920 + }, + { + "epoch": 0.03965, + "grad_norm": 0.09475506097078323, + "learning_rate": 3.196326517904572e-05, + "loss": 0.0348, + "step": 87930 + }, + { + "epoch": 0.0397, + "grad_norm": 0.07854641228914261, + "learning_rate": 3.19592949897756e-05, + "loss": 0.0363, + "step": 87940 + }, + { + "epoch": 0.03975, + "grad_norm": 0.08651375770568848, + "learning_rate": 3.1955324610240965e-05, + "loss": 0.0362, + "step": 87950 + }, + { + "epoch": 0.0398, + "grad_norm": 0.08224500715732574, + "learning_rate": 3.195135404055037e-05, + "loss": 0.0378, + "step": 87960 + }, + { + "epoch": 0.03985, + "grad_norm": 0.07783657312393188, + "learning_rate": 3.194738328081236e-05, + "loss": 0.0362, + "step": 87970 + }, + { + "epoch": 0.0399, + "grad_norm": 0.08973323553800583, + "learning_rate": 3.1943412331135506e-05, + "loss": 0.0363, + "step": 87980 + }, + { + "epoch": 0.03995, + "grad_norm": 0.07899042963981628, + "learning_rate": 3.193944119162837e-05, + "loss": 0.036, + "step": 87990 + }, + { + "epoch": 0.04, + "grad_norm": 0.0829811543226242, + "learning_rate": 3.1935469862399515e-05, + "loss": 0.0406, + "step": 88000 + }, + { + "epoch": 0.04005, + "grad_norm": 0.08840011805295944, + "learning_rate": 3.193149834355752e-05, + "loss": 0.0369, + "step": 88010 + }, + { + "epoch": 0.0401, + "grad_norm": 0.08670192956924438, + "learning_rate": 3.1927526635210966e-05, + "loss": 0.0356, + "step": 88020 + }, + { + "epoch": 0.04015, + "grad_norm": 0.08463533222675323, + "learning_rate": 3.1923554737468444e-05, + "loss": 0.0361, + "step": 88030 + }, + { + "epoch": 0.0402, + "grad_norm": 0.08968721330165863, + "learning_rate": 3.191958265043852e-05, + "loss": 0.0366, + "step": 88040 + }, + { + "epoch": 0.04025, + "grad_norm": 0.08073660731315613, + "learning_rate": 3.191561037422981e-05, + "loss": 0.0361, + "step": 88050 + }, + { + "epoch": 0.0403, + "grad_norm": 0.07392221689224243, + "learning_rate": 3.191163790895092e-05, + "loss": 0.0349, + "step": 88060 + }, + { + "epoch": 0.04035, + "grad_norm": 0.0711127445101738, + "learning_rate": 3.190766525471045e-05, + "loss": 0.036, + "step": 88070 + }, + { + "epoch": 0.0404, + "grad_norm": 0.07902902364730835, + "learning_rate": 3.190369241161699e-05, + "loss": 0.0374, + "step": 88080 + }, + { + "epoch": 0.04045, + "grad_norm": 0.07067114859819412, + "learning_rate": 3.189971937977918e-05, + "loss": 0.0352, + "step": 88090 + }, + { + "epoch": 0.0405, + "grad_norm": 0.07859675586223602, + "learning_rate": 3.1895746159305646e-05, + "loss": 0.0355, + "step": 88100 + }, + { + "epoch": 0.04055, + "grad_norm": 0.06850699335336685, + "learning_rate": 3.1891772750304985e-05, + "loss": 0.037, + "step": 88110 + }, + { + "epoch": 0.0406, + "grad_norm": 0.07936318963766098, + "learning_rate": 3.1887799152885856e-05, + "loss": 0.036, + "step": 88120 + }, + { + "epoch": 0.04065, + "grad_norm": 0.11133728921413422, + "learning_rate": 3.188382536715688e-05, + "loss": 0.0359, + "step": 88130 + }, + { + "epoch": 0.0407, + "grad_norm": 0.11879123747348785, + "learning_rate": 3.187985139322671e-05, + "loss": 0.0376, + "step": 88140 + }, + { + "epoch": 0.04075, + "grad_norm": 0.06967378407716751, + "learning_rate": 3.187587723120399e-05, + "loss": 0.0364, + "step": 88150 + }, + { + "epoch": 0.0408, + "grad_norm": 0.08400028944015503, + "learning_rate": 3.1871902881197365e-05, + "loss": 0.037, + "step": 88160 + }, + { + "epoch": 0.04085, + "grad_norm": 0.08613050729036331, + "learning_rate": 3.186792834331549e-05, + "loss": 0.0354, + "step": 88170 + }, + { + "epoch": 0.0409, + "grad_norm": 0.08761177211999893, + "learning_rate": 3.186395361766704e-05, + "loss": 0.0354, + "step": 88180 + }, + { + "epoch": 0.04095, + "grad_norm": 0.07430850714445114, + "learning_rate": 3.185997870436068e-05, + "loss": 0.0353, + "step": 88190 + }, + { + "epoch": 0.041, + "grad_norm": 0.0727161318063736, + "learning_rate": 3.185600360350508e-05, + "loss": 0.0355, + "step": 88200 + }, + { + "epoch": 0.04105, + "grad_norm": 0.08402503281831741, + "learning_rate": 3.1852028315208914e-05, + "loss": 0.0361, + "step": 88210 + }, + { + "epoch": 0.0411, + "grad_norm": 0.08046665042638779, + "learning_rate": 3.1848052839580866e-05, + "loss": 0.035, + "step": 88220 + }, + { + "epoch": 0.04115, + "grad_norm": 0.07845325022935867, + "learning_rate": 3.184407717672962e-05, + "loss": 0.0366, + "step": 88230 + }, + { + "epoch": 0.0412, + "grad_norm": 0.08222614973783493, + "learning_rate": 3.1840101326763894e-05, + "loss": 0.0362, + "step": 88240 + }, + { + "epoch": 0.04125, + "grad_norm": 0.07744602859020233, + "learning_rate": 3.183612528979235e-05, + "loss": 0.0352, + "step": 88250 + }, + { + "epoch": 0.0413, + "grad_norm": 0.07237857580184937, + "learning_rate": 3.183214906592372e-05, + "loss": 0.0342, + "step": 88260 + }, + { + "epoch": 0.04135, + "grad_norm": 0.07495765388011932, + "learning_rate": 3.18281726552667e-05, + "loss": 0.0346, + "step": 88270 + }, + { + "epoch": 0.0414, + "grad_norm": 0.08611702919006348, + "learning_rate": 3.182419605793e-05, + "loss": 0.035, + "step": 88280 + }, + { + "epoch": 0.04145, + "grad_norm": 0.0782887414097786, + "learning_rate": 3.182021927402235e-05, + "loss": 0.0367, + "step": 88290 + }, + { + "epoch": 0.0415, + "grad_norm": 0.07414274662733078, + "learning_rate": 3.181624230365245e-05, + "loss": 0.0342, + "step": 88300 + }, + { + "epoch": 0.04155, + "grad_norm": 0.07089847326278687, + "learning_rate": 3.1812265146929064e-05, + "loss": 0.0349, + "step": 88310 + }, + { + "epoch": 0.0416, + "grad_norm": 0.08510833233594894, + "learning_rate": 3.18082878039609e-05, + "loss": 0.0354, + "step": 88320 + }, + { + "epoch": 0.04165, + "grad_norm": 0.0712892934679985, + "learning_rate": 3.180431027485672e-05, + "loss": 0.0343, + "step": 88330 + }, + { + "epoch": 0.0417, + "grad_norm": 0.10102806985378265, + "learning_rate": 3.1800332559725235e-05, + "loss": 0.0368, + "step": 88340 + }, + { + "epoch": 0.04175, + "grad_norm": 0.07337898015975952, + "learning_rate": 3.179635465867522e-05, + "loss": 0.035, + "step": 88350 + }, + { + "epoch": 0.0418, + "grad_norm": 0.0787159651517868, + "learning_rate": 3.179237657181542e-05, + "loss": 0.0357, + "step": 88360 + }, + { + "epoch": 0.04185, + "grad_norm": 0.07552842795848846, + "learning_rate": 3.1788398299254596e-05, + "loss": 0.0348, + "step": 88370 + }, + { + "epoch": 0.0419, + "grad_norm": 0.06384848058223724, + "learning_rate": 3.178441984110151e-05, + "loss": 0.0343, + "step": 88380 + }, + { + "epoch": 0.04195, + "grad_norm": 0.0709843784570694, + "learning_rate": 3.178044119746495e-05, + "loss": 0.0344, + "step": 88390 + }, + { + "epoch": 0.042, + "grad_norm": 0.0840144157409668, + "learning_rate": 3.177646236845366e-05, + "loss": 0.0343, + "step": 88400 + }, + { + "epoch": 0.04205, + "grad_norm": 0.0872381180524826, + "learning_rate": 3.177248335417644e-05, + "loss": 0.0372, + "step": 88410 + }, + { + "epoch": 0.0421, + "grad_norm": 0.08309582620859146, + "learning_rate": 3.176850415474206e-05, + "loss": 0.0351, + "step": 88420 + }, + { + "epoch": 0.04215, + "grad_norm": 0.07073325663805008, + "learning_rate": 3.176452477025933e-05, + "loss": 0.0359, + "step": 88430 + }, + { + "epoch": 0.0422, + "grad_norm": 0.06448944658041, + "learning_rate": 3.176054520083703e-05, + "loss": 0.0363, + "step": 88440 + }, + { + "epoch": 0.04225, + "grad_norm": 0.08094968646764755, + "learning_rate": 3.175656544658397e-05, + "loss": 0.0349, + "step": 88450 + }, + { + "epoch": 0.0423, + "grad_norm": 0.08361298590898514, + "learning_rate": 3.175258550760894e-05, + "loss": 0.0391, + "step": 88460 + }, + { + "epoch": 0.04235, + "grad_norm": 0.09900790452957153, + "learning_rate": 3.174860538402076e-05, + "loss": 0.0365, + "step": 88470 + }, + { + "epoch": 0.0424, + "grad_norm": 0.12086690217256546, + "learning_rate": 3.174462507592825e-05, + "loss": 0.0369, + "step": 88480 + }, + { + "epoch": 0.04245, + "grad_norm": 0.0957237109541893, + "learning_rate": 3.1740644583440224e-05, + "loss": 0.0377, + "step": 88490 + }, + { + "epoch": 0.0425, + "grad_norm": 0.11632449179887772, + "learning_rate": 3.17366639066655e-05, + "loss": 0.0373, + "step": 88500 + }, + { + "epoch": 0.04255, + "grad_norm": 0.10616426169872284, + "learning_rate": 3.173268304571292e-05, + "loss": 0.0375, + "step": 88510 + }, + { + "epoch": 0.0426, + "grad_norm": 0.09581998735666275, + "learning_rate": 3.172870200069132e-05, + "loss": 0.0363, + "step": 88520 + }, + { + "epoch": 0.04265, + "grad_norm": 0.07968276739120483, + "learning_rate": 3.1724720771709525e-05, + "loss": 0.0352, + "step": 88530 + }, + { + "epoch": 0.0427, + "grad_norm": 0.07968457788228989, + "learning_rate": 3.17207393588764e-05, + "loss": 0.0361, + "step": 88540 + }, + { + "epoch": 0.04275, + "grad_norm": 0.07705254852771759, + "learning_rate": 3.1716757762300775e-05, + "loss": 0.0362, + "step": 88550 + }, + { + "epoch": 0.0428, + "grad_norm": 0.07352401316165924, + "learning_rate": 3.171277598209153e-05, + "loss": 0.0351, + "step": 88560 + }, + { + "epoch": 0.04285, + "grad_norm": 0.07496006041765213, + "learning_rate": 3.17087940183575e-05, + "loss": 0.0362, + "step": 88570 + }, + { + "epoch": 0.0429, + "grad_norm": 0.09419265389442444, + "learning_rate": 3.170481187120757e-05, + "loss": 0.0353, + "step": 88580 + }, + { + "epoch": 0.04295, + "grad_norm": 0.0760672315955162, + "learning_rate": 3.1700829540750596e-05, + "loss": 0.0363, + "step": 88590 + }, + { + "epoch": 0.043, + "grad_norm": 0.07824535667896271, + "learning_rate": 3.1696847027095466e-05, + "loss": 0.0357, + "step": 88600 + }, + { + "epoch": 0.04305, + "grad_norm": 0.07357686758041382, + "learning_rate": 3.1692864330351046e-05, + "loss": 0.0356, + "step": 88610 + }, + { + "epoch": 0.0431, + "grad_norm": 0.08512439578771591, + "learning_rate": 3.168888145062623e-05, + "loss": 0.0367, + "step": 88620 + }, + { + "epoch": 0.04315, + "grad_norm": 0.08869446069002151, + "learning_rate": 3.168489838802991e-05, + "loss": 0.0353, + "step": 88630 + }, + { + "epoch": 0.0432, + "grad_norm": 0.0901620015501976, + "learning_rate": 3.168091514267099e-05, + "loss": 0.0358, + "step": 88640 + }, + { + "epoch": 0.04325, + "grad_norm": 0.08479337394237518, + "learning_rate": 3.167693171465835e-05, + "loss": 0.0361, + "step": 88650 + }, + { + "epoch": 0.0433, + "grad_norm": 0.09710962325334549, + "learning_rate": 3.167294810410091e-05, + "loss": 0.0364, + "step": 88660 + }, + { + "epoch": 0.04335, + "grad_norm": 0.14224158227443695, + "learning_rate": 3.166896431110757e-05, + "loss": 0.0361, + "step": 88670 + }, + { + "epoch": 0.0434, + "grad_norm": 0.10792012512683868, + "learning_rate": 3.166498033578725e-05, + "loss": 0.0358, + "step": 88680 + }, + { + "epoch": 0.04345, + "grad_norm": 0.09018728882074356, + "learning_rate": 3.166099617824888e-05, + "loss": 0.038, + "step": 88690 + }, + { + "epoch": 0.0435, + "grad_norm": 0.09542609006166458, + "learning_rate": 3.165701183860137e-05, + "loss": 0.037, + "step": 88700 + }, + { + "epoch": 0.04355, + "grad_norm": 0.08831940591335297, + "learning_rate": 3.165302731695366e-05, + "loss": 0.0362, + "step": 88710 + }, + { + "epoch": 0.0436, + "grad_norm": 0.0845828652381897, + "learning_rate": 3.1649042613414684e-05, + "loss": 0.0361, + "step": 88720 + }, + { + "epoch": 0.04365, + "grad_norm": 0.07160351425409317, + "learning_rate": 3.164505772809338e-05, + "loss": 0.0358, + "step": 88730 + }, + { + "epoch": 0.0437, + "grad_norm": 0.07241127640008926, + "learning_rate": 3.164107266109869e-05, + "loss": 0.035, + "step": 88740 + }, + { + "epoch": 0.04375, + "grad_norm": 0.06150011345744133, + "learning_rate": 3.163708741253957e-05, + "loss": 0.034, + "step": 88750 + }, + { + "epoch": 0.0438, + "grad_norm": 0.0823177844285965, + "learning_rate": 3.163310198252497e-05, + "loss": 0.035, + "step": 88760 + }, + { + "epoch": 0.04385, + "grad_norm": 0.07345187664031982, + "learning_rate": 3.162911637116386e-05, + "loss": 0.0364, + "step": 88770 + }, + { + "epoch": 0.0439, + "grad_norm": 0.07987566292285919, + "learning_rate": 3.1625130578565196e-05, + "loss": 0.0355, + "step": 88780 + }, + { + "epoch": 0.04395, + "grad_norm": 0.07271317392587662, + "learning_rate": 3.162114460483796e-05, + "loss": 0.0362, + "step": 88790 + }, + { + "epoch": 0.044, + "grad_norm": 0.08566244691610336, + "learning_rate": 3.1617158450091114e-05, + "loss": 0.0349, + "step": 88800 + }, + { + "epoch": 0.04405, + "grad_norm": 0.0695837214589119, + "learning_rate": 3.161317211443363e-05, + "loss": 0.0359, + "step": 88810 + }, + { + "epoch": 0.0441, + "grad_norm": 0.07219763100147247, + "learning_rate": 3.160918559797451e-05, + "loss": 0.0358, + "step": 88820 + }, + { + "epoch": 0.04415, + "grad_norm": 0.10157402604818344, + "learning_rate": 3.160519890082275e-05, + "loss": 0.0359, + "step": 88830 + }, + { + "epoch": 0.0442, + "grad_norm": 0.0994025319814682, + "learning_rate": 3.1601212023087324e-05, + "loss": 0.0358, + "step": 88840 + }, + { + "epoch": 0.04425, + "grad_norm": 0.09215868264436722, + "learning_rate": 3.159722496487725e-05, + "loss": 0.0359, + "step": 88850 + }, + { + "epoch": 0.0443, + "grad_norm": 0.08054187148809433, + "learning_rate": 3.159323772630151e-05, + "loss": 0.0351, + "step": 88860 + }, + { + "epoch": 0.04435, + "grad_norm": 0.0746442899107933, + "learning_rate": 3.1589250307469134e-05, + "loss": 0.0367, + "step": 88870 + }, + { + "epoch": 0.0444, + "grad_norm": 0.09109540283679962, + "learning_rate": 3.158526270848913e-05, + "loss": 0.0352, + "step": 88880 + }, + { + "epoch": 0.04445, + "grad_norm": 0.08202842622995377, + "learning_rate": 3.1581274929470514e-05, + "loss": 0.036, + "step": 88890 + }, + { + "epoch": 0.0445, + "grad_norm": 0.07617323100566864, + "learning_rate": 3.1577286970522316e-05, + "loss": 0.0364, + "step": 88900 + }, + { + "epoch": 0.04455, + "grad_norm": 0.08100539445877075, + "learning_rate": 3.157329883175357e-05, + "loss": 0.0353, + "step": 88910 + }, + { + "epoch": 0.0446, + "grad_norm": 0.0730210468173027, + "learning_rate": 3.15693105132733e-05, + "loss": 0.036, + "step": 88920 + }, + { + "epoch": 0.04465, + "grad_norm": 0.07562186568975449, + "learning_rate": 3.156532201519055e-05, + "loss": 0.0352, + "step": 88930 + }, + { + "epoch": 0.0447, + "grad_norm": 0.09780492633581161, + "learning_rate": 3.156133333761435e-05, + "loss": 0.0356, + "step": 88940 + }, + { + "epoch": 0.04475, + "grad_norm": 0.08263695985078812, + "learning_rate": 3.1557344480653776e-05, + "loss": 0.0358, + "step": 88950 + }, + { + "epoch": 0.0448, + "grad_norm": 0.08889447152614594, + "learning_rate": 3.155335544441786e-05, + "loss": 0.0369, + "step": 88960 + }, + { + "epoch": 0.04485, + "grad_norm": 0.07156159728765488, + "learning_rate": 3.154936622901567e-05, + "loss": 0.0358, + "step": 88970 + }, + { + "epoch": 0.0449, + "grad_norm": 0.07655026763677597, + "learning_rate": 3.154537683455627e-05, + "loss": 0.0373, + "step": 88980 + }, + { + "epoch": 0.04495, + "grad_norm": 0.07747284322977066, + "learning_rate": 3.154138726114872e-05, + "loss": 0.0365, + "step": 88990 + }, + { + "epoch": 0.045, + "grad_norm": 0.0795535296201706, + "learning_rate": 3.15373975089021e-05, + "loss": 0.0366, + "step": 89000 + }, + { + "epoch": 0.04505, + "grad_norm": 0.0787782073020935, + "learning_rate": 3.15334075779255e-05, + "loss": 0.0363, + "step": 89010 + }, + { + "epoch": 0.0451, + "grad_norm": 0.07431310415267944, + "learning_rate": 3.152941746832798e-05, + "loss": 0.0365, + "step": 89020 + }, + { + "epoch": 0.04515, + "grad_norm": 0.09488285332918167, + "learning_rate": 3.152542718021865e-05, + "loss": 0.0364, + "step": 89030 + }, + { + "epoch": 0.0452, + "grad_norm": 0.08887805789709091, + "learning_rate": 3.1521436713706585e-05, + "loss": 0.0362, + "step": 89040 + }, + { + "epoch": 0.04525, + "grad_norm": 0.0670175775885582, + "learning_rate": 3.151744606890089e-05, + "loss": 0.0374, + "step": 89050 + }, + { + "epoch": 0.0453, + "grad_norm": 0.09125831723213196, + "learning_rate": 3.1513455245910666e-05, + "loss": 0.0363, + "step": 89060 + }, + { + "epoch": 0.04535, + "grad_norm": 0.09820882230997086, + "learning_rate": 3.150946424484502e-05, + "loss": 0.0363, + "step": 89070 + }, + { + "epoch": 0.0454, + "grad_norm": 0.09063718467950821, + "learning_rate": 3.150547306581308e-05, + "loss": 0.0353, + "step": 89080 + }, + { + "epoch": 0.04545, + "grad_norm": 0.07643202692270279, + "learning_rate": 3.150148170892394e-05, + "loss": 0.0346, + "step": 89090 + }, + { + "epoch": 0.0455, + "grad_norm": 0.07743243128061295, + "learning_rate": 3.149749017428674e-05, + "loss": 0.0341, + "step": 89100 + }, + { + "epoch": 0.04555, + "grad_norm": 0.07924839109182358, + "learning_rate": 3.149349846201059e-05, + "loss": 0.0363, + "step": 89110 + }, + { + "epoch": 0.0456, + "grad_norm": 0.0933968722820282, + "learning_rate": 3.1489506572204644e-05, + "loss": 0.0378, + "step": 89120 + }, + { + "epoch": 0.04565, + "grad_norm": 0.08461254835128784, + "learning_rate": 3.148551450497801e-05, + "loss": 0.0385, + "step": 89130 + }, + { + "epoch": 0.0457, + "grad_norm": 0.08032810688018799, + "learning_rate": 3.1481522260439856e-05, + "loss": 0.0353, + "step": 89140 + }, + { + "epoch": 0.04575, + "grad_norm": 0.08250021189451218, + "learning_rate": 3.147752983869931e-05, + "loss": 0.0366, + "step": 89150 + }, + { + "epoch": 0.0458, + "grad_norm": 0.09803785383701324, + "learning_rate": 3.1473537239865545e-05, + "loss": 0.0358, + "step": 89160 + }, + { + "epoch": 0.04585, + "grad_norm": 0.09414341300725937, + "learning_rate": 3.14695444640477e-05, + "loss": 0.0363, + "step": 89170 + }, + { + "epoch": 0.0459, + "grad_norm": 0.08637914806604385, + "learning_rate": 3.1465551511354934e-05, + "loss": 0.0375, + "step": 89180 + }, + { + "epoch": 0.04595, + "grad_norm": 0.08976668864488602, + "learning_rate": 3.146155838189642e-05, + "loss": 0.037, + "step": 89190 + }, + { + "epoch": 0.046, + "grad_norm": 0.07519635558128357, + "learning_rate": 3.1457565075781333e-05, + "loss": 0.0343, + "step": 89200 + }, + { + "epoch": 0.04605, + "grad_norm": 0.09900743514299393, + "learning_rate": 3.145357159311884e-05, + "loss": 0.0381, + "step": 89210 + }, + { + "epoch": 0.0461, + "grad_norm": 0.10613662749528885, + "learning_rate": 3.144957793401812e-05, + "loss": 0.0364, + "step": 89220 + }, + { + "epoch": 0.04615, + "grad_norm": 0.09278790652751923, + "learning_rate": 3.144558409858837e-05, + "loss": 0.0348, + "step": 89230 + }, + { + "epoch": 0.0462, + "grad_norm": 0.08505608141422272, + "learning_rate": 3.1441590086938764e-05, + "loss": 0.0354, + "step": 89240 + }, + { + "epoch": 0.04625, + "grad_norm": 0.07010231912136078, + "learning_rate": 3.143759589917851e-05, + "loss": 0.0353, + "step": 89250 + }, + { + "epoch": 0.0463, + "grad_norm": 0.0737927183508873, + "learning_rate": 3.14336015354168e-05, + "loss": 0.0364, + "step": 89260 + }, + { + "epoch": 0.04635, + "grad_norm": 0.08519606292247772, + "learning_rate": 3.1429606995762844e-05, + "loss": 0.0364, + "step": 89270 + }, + { + "epoch": 0.0464, + "grad_norm": 0.0799202173948288, + "learning_rate": 3.1425612280325844e-05, + "loss": 0.0348, + "step": 89280 + }, + { + "epoch": 0.04645, + "grad_norm": 0.07240907847881317, + "learning_rate": 3.1421617389215025e-05, + "loss": 0.0366, + "step": 89290 + }, + { + "epoch": 0.0465, + "grad_norm": 0.09216149151325226, + "learning_rate": 3.14176223225396e-05, + "loss": 0.038, + "step": 89300 + }, + { + "epoch": 0.04655, + "grad_norm": 0.08080775290727615, + "learning_rate": 3.1413627080408784e-05, + "loss": 0.0351, + "step": 89310 + }, + { + "epoch": 0.0466, + "grad_norm": 0.07827922701835632, + "learning_rate": 3.140963166293181e-05, + "loss": 0.0395, + "step": 89320 + }, + { + "epoch": 0.04665, + "grad_norm": 0.0770445168018341, + "learning_rate": 3.140563607021793e-05, + "loss": 0.0349, + "step": 89330 + }, + { + "epoch": 0.0467, + "grad_norm": 0.08079741895198822, + "learning_rate": 3.1401640302376346e-05, + "loss": 0.0346, + "step": 89340 + }, + { + "epoch": 0.04675, + "grad_norm": 0.07338516414165497, + "learning_rate": 3.139764435951634e-05, + "loss": 0.0357, + "step": 89350 + }, + { + "epoch": 0.0468, + "grad_norm": 0.06986889243125916, + "learning_rate": 3.139364824174713e-05, + "loss": 0.0337, + "step": 89360 + }, + { + "epoch": 0.04685, + "grad_norm": 0.09464634954929352, + "learning_rate": 3.1389651949177987e-05, + "loss": 0.036, + "step": 89370 + }, + { + "epoch": 0.0469, + "grad_norm": 0.09366074949502945, + "learning_rate": 3.138565548191814e-05, + "loss": 0.0372, + "step": 89380 + }, + { + "epoch": 0.04695, + "grad_norm": 0.07965853065252304, + "learning_rate": 3.138165884007689e-05, + "loss": 0.0354, + "step": 89390 + }, + { + "epoch": 0.047, + "grad_norm": 0.08097198605537415, + "learning_rate": 3.137766202376348e-05, + "loss": 0.0368, + "step": 89400 + }, + { + "epoch": 0.04705, + "grad_norm": 0.07096298038959503, + "learning_rate": 3.137366503308719e-05, + "loss": 0.0372, + "step": 89410 + }, + { + "epoch": 0.0471, + "grad_norm": 0.06730058044195175, + "learning_rate": 3.136966786815729e-05, + "loss": 0.0347, + "step": 89420 + }, + { + "epoch": 0.04715, + "grad_norm": 0.07121149450540543, + "learning_rate": 3.136567052908306e-05, + "loss": 0.0361, + "step": 89430 + }, + { + "epoch": 0.0472, + "grad_norm": 0.07624838501214981, + "learning_rate": 3.136167301597379e-05, + "loss": 0.0359, + "step": 89440 + }, + { + "epoch": 0.04725, + "grad_norm": 0.0810113474726677, + "learning_rate": 3.135767532893877e-05, + "loss": 0.0366, + "step": 89450 + }, + { + "epoch": 0.0473, + "grad_norm": 0.08344297111034393, + "learning_rate": 3.13536774680873e-05, + "loss": 0.0356, + "step": 89460 + }, + { + "epoch": 0.04735, + "grad_norm": 0.07102300971746445, + "learning_rate": 3.1349679433528666e-05, + "loss": 0.0354, + "step": 89470 + }, + { + "epoch": 0.0474, + "grad_norm": 0.07442963868379593, + "learning_rate": 3.134568122537219e-05, + "loss": 0.0356, + "step": 89480 + }, + { + "epoch": 0.04745, + "grad_norm": 0.08615221083164215, + "learning_rate": 3.134168284372717e-05, + "loss": 0.0364, + "step": 89490 + }, + { + "epoch": 0.0475, + "grad_norm": 0.06951931864023209, + "learning_rate": 3.1337684288702926e-05, + "loss": 0.0368, + "step": 89500 + }, + { + "epoch": 0.04755, + "grad_norm": 0.07635512948036194, + "learning_rate": 3.133368556040877e-05, + "loss": 0.0348, + "step": 89510 + }, + { + "epoch": 0.0476, + "grad_norm": 0.0834120437502861, + "learning_rate": 3.132968665895404e-05, + "loss": 0.0364, + "step": 89520 + }, + { + "epoch": 0.04765, + "grad_norm": 0.10034742951393127, + "learning_rate": 3.1325687584448046e-05, + "loss": 0.0383, + "step": 89530 + }, + { + "epoch": 0.0477, + "grad_norm": 0.09734785556793213, + "learning_rate": 3.132168833700013e-05, + "loss": 0.0361, + "step": 89540 + }, + { + "epoch": 0.04775, + "grad_norm": 0.07262781262397766, + "learning_rate": 3.1317688916719636e-05, + "loss": 0.0361, + "step": 89550 + }, + { + "epoch": 0.0478, + "grad_norm": 0.08736280351877213, + "learning_rate": 3.1313689323715895e-05, + "loss": 0.0361, + "step": 89560 + }, + { + "epoch": 0.04785, + "grad_norm": 0.08496066927909851, + "learning_rate": 3.130968955809825e-05, + "loss": 0.0375, + "step": 89570 + }, + { + "epoch": 0.0479, + "grad_norm": 0.07764924317598343, + "learning_rate": 3.130568961997608e-05, + "loss": 0.0346, + "step": 89580 + }, + { + "epoch": 0.04795, + "grad_norm": 0.08531037718057632, + "learning_rate": 3.1301689509458715e-05, + "loss": 0.0355, + "step": 89590 + }, + { + "epoch": 0.048, + "grad_norm": 0.10499384999275208, + "learning_rate": 3.1297689226655534e-05, + "loss": 0.0374, + "step": 89600 + }, + { + "epoch": 0.04805, + "grad_norm": 0.09910687059164047, + "learning_rate": 3.129368877167589e-05, + "loss": 0.0368, + "step": 89610 + }, + { + "epoch": 0.0481, + "grad_norm": 0.10891690850257874, + "learning_rate": 3.128968814462916e-05, + "loss": 0.0368, + "step": 89620 + }, + { + "epoch": 0.04815, + "grad_norm": 0.08418095111846924, + "learning_rate": 3.128568734562472e-05, + "loss": 0.0366, + "step": 89630 + }, + { + "epoch": 0.0482, + "grad_norm": 0.09086181968450546, + "learning_rate": 3.128168637477195e-05, + "loss": 0.037, + "step": 89640 + }, + { + "epoch": 0.04825, + "grad_norm": 0.11261242628097534, + "learning_rate": 3.1277685232180234e-05, + "loss": 0.0377, + "step": 89650 + }, + { + "epoch": 0.0483, + "grad_norm": 0.08868662267923355, + "learning_rate": 3.1273683917958965e-05, + "loss": 0.04, + "step": 89660 + }, + { + "epoch": 0.04835, + "grad_norm": 0.0898551195859909, + "learning_rate": 3.126968243221752e-05, + "loss": 0.0366, + "step": 89670 + }, + { + "epoch": 0.0484, + "grad_norm": 0.08138702809810638, + "learning_rate": 3.126568077506533e-05, + "loss": 0.0362, + "step": 89680 + }, + { + "epoch": 0.04845, + "grad_norm": 0.09012820571660995, + "learning_rate": 3.126167894661177e-05, + "loss": 0.0355, + "step": 89690 + }, + { + "epoch": 0.0485, + "grad_norm": 0.08225119113922119, + "learning_rate": 3.125767694696627e-05, + "loss": 0.035, + "step": 89700 + }, + { + "epoch": 0.04855, + "grad_norm": 0.08970760554075241, + "learning_rate": 3.125367477623822e-05, + "loss": 0.0363, + "step": 89710 + }, + { + "epoch": 0.0486, + "grad_norm": 0.08274450898170471, + "learning_rate": 3.124967243453707e-05, + "loss": 0.0351, + "step": 89720 + }, + { + "epoch": 0.04865, + "grad_norm": 0.08255013078451157, + "learning_rate": 3.124566992197221e-05, + "loss": 0.0354, + "step": 89730 + }, + { + "epoch": 0.0487, + "grad_norm": 0.06928831338882446, + "learning_rate": 3.1241667238653084e-05, + "loss": 0.0347, + "step": 89740 + }, + { + "epoch": 0.04875, + "grad_norm": 0.08170973509550095, + "learning_rate": 3.123766438468912e-05, + "loss": 0.0367, + "step": 89750 + }, + { + "epoch": 0.0488, + "grad_norm": 0.06689556688070297, + "learning_rate": 3.123366136018975e-05, + "loss": 0.0345, + "step": 89760 + }, + { + "epoch": 0.04885, + "grad_norm": 0.07722654193639755, + "learning_rate": 3.1229658165264424e-05, + "loss": 0.0362, + "step": 89770 + }, + { + "epoch": 0.0489, + "grad_norm": 0.08215050399303436, + "learning_rate": 3.122565480002259e-05, + "loss": 0.0339, + "step": 89780 + }, + { + "epoch": 0.04895, + "grad_norm": 0.10155977308750153, + "learning_rate": 3.122165126457369e-05, + "loss": 0.0363, + "step": 89790 + }, + { + "epoch": 0.049, + "grad_norm": 0.08723064512014389, + "learning_rate": 3.1217647559027176e-05, + "loss": 0.0344, + "step": 89800 + }, + { + "epoch": 0.04905, + "grad_norm": 0.09421028941869736, + "learning_rate": 3.121364368349252e-05, + "loss": 0.0371, + "step": 89810 + }, + { + "epoch": 0.0491, + "grad_norm": 0.08751319348812103, + "learning_rate": 3.120963963807918e-05, + "loss": 0.0356, + "step": 89820 + }, + { + "epoch": 0.04915, + "grad_norm": 0.07526911795139313, + "learning_rate": 3.1205635422896616e-05, + "loss": 0.0345, + "step": 89830 + }, + { + "epoch": 0.0492, + "grad_norm": 0.08347878605127335, + "learning_rate": 3.120163103805432e-05, + "loss": 0.0356, + "step": 89840 + }, + { + "epoch": 0.04925, + "grad_norm": 0.07796986401081085, + "learning_rate": 3.119762648366176e-05, + "loss": 0.035, + "step": 89850 + }, + { + "epoch": 0.0493, + "grad_norm": 0.08017401397228241, + "learning_rate": 3.1193621759828415e-05, + "loss": 0.037, + "step": 89860 + }, + { + "epoch": 0.04935, + "grad_norm": 0.09265467524528503, + "learning_rate": 3.118961686666379e-05, + "loss": 0.0356, + "step": 89870 + }, + { + "epoch": 0.0494, + "grad_norm": 0.11003389209508896, + "learning_rate": 3.118561180427736e-05, + "loss": 0.0362, + "step": 89880 + }, + { + "epoch": 0.04945, + "grad_norm": 0.08867444843053818, + "learning_rate": 3.1181606572778623e-05, + "loss": 0.0364, + "step": 89890 + }, + { + "epoch": 0.0495, + "grad_norm": 0.08906946331262589, + "learning_rate": 3.117760117227708e-05, + "loss": 0.0362, + "step": 89900 + }, + { + "epoch": 0.04955, + "grad_norm": 0.08210599422454834, + "learning_rate": 3.1173595602882255e-05, + "loss": 0.0361, + "step": 89910 + }, + { + "epoch": 0.0496, + "grad_norm": 0.08254941552877426, + "learning_rate": 3.116958986470364e-05, + "loss": 0.0346, + "step": 89920 + }, + { + "epoch": 0.04965, + "grad_norm": 0.08105908334255219, + "learning_rate": 3.116558395785075e-05, + "loss": 0.0361, + "step": 89930 + }, + { + "epoch": 0.0497, + "grad_norm": 0.08087185770273209, + "learning_rate": 3.116157788243311e-05, + "loss": 0.0354, + "step": 89940 + }, + { + "epoch": 0.04975, + "grad_norm": 0.09427832067012787, + "learning_rate": 3.115757163856026e-05, + "loss": 0.0355, + "step": 89950 + }, + { + "epoch": 0.0498, + "grad_norm": 0.09030015021562576, + "learning_rate": 3.1153565226341695e-05, + "loss": 0.0348, + "step": 89960 + }, + { + "epoch": 0.04985, + "grad_norm": 0.08178424835205078, + "learning_rate": 3.114955864588698e-05, + "loss": 0.034, + "step": 89970 + }, + { + "epoch": 0.0499, + "grad_norm": 0.07544025033712387, + "learning_rate": 3.114555189730565e-05, + "loss": 0.0357, + "step": 89980 + }, + { + "epoch": 0.04995, + "grad_norm": 0.07285235822200775, + "learning_rate": 3.114154498070723e-05, + "loss": 0.0348, + "step": 89990 + }, + { + "epoch": 0.05, + "grad_norm": 0.08107560873031616, + "learning_rate": 3.113753789620127e-05, + "loss": 0.0355, + "step": 90000 + }, + { + "epoch": 0.05005, + "grad_norm": 0.06690838932991028, + "learning_rate": 3.113353064389734e-05, + "loss": 0.0339, + "step": 90010 + }, + { + "epoch": 0.0501, + "grad_norm": 0.09131324291229248, + "learning_rate": 3.1129523223904984e-05, + "loss": 0.0355, + "step": 90020 + }, + { + "epoch": 0.05015, + "grad_norm": 0.08263172954320908, + "learning_rate": 3.1125515636333766e-05, + "loss": 0.0353, + "step": 90030 + }, + { + "epoch": 0.0502, + "grad_norm": 0.08971139043569565, + "learning_rate": 3.112150788129326e-05, + "loss": 0.0381, + "step": 90040 + }, + { + "epoch": 0.05025, + "grad_norm": 0.08543914556503296, + "learning_rate": 3.111749995889302e-05, + "loss": 0.0386, + "step": 90050 + }, + { + "epoch": 0.0503, + "grad_norm": 0.07980793714523315, + "learning_rate": 3.111349186924263e-05, + "loss": 0.0346, + "step": 90060 + }, + { + "epoch": 0.05035, + "grad_norm": 0.08420425653457642, + "learning_rate": 3.110948361245166e-05, + "loss": 0.0339, + "step": 90070 + }, + { + "epoch": 0.0504, + "grad_norm": 0.08622614294290543, + "learning_rate": 3.110547518862971e-05, + "loss": 0.0373, + "step": 90080 + }, + { + "epoch": 0.05045, + "grad_norm": 0.06477276235818863, + "learning_rate": 3.1101466597886365e-05, + "loss": 0.0362, + "step": 90090 + }, + { + "epoch": 0.0505, + "grad_norm": 0.08431649208068848, + "learning_rate": 3.1097457840331217e-05, + "loss": 0.037, + "step": 90100 + }, + { + "epoch": 0.05055, + "grad_norm": 0.06587330996990204, + "learning_rate": 3.109344891607386e-05, + "loss": 0.0356, + "step": 90110 + }, + { + "epoch": 0.0506, + "grad_norm": 0.10042694956064224, + "learning_rate": 3.10894398252239e-05, + "loss": 0.0385, + "step": 90120 + }, + { + "epoch": 0.05065, + "grad_norm": 0.08202438056468964, + "learning_rate": 3.1085430567890937e-05, + "loss": 0.036, + "step": 90130 + }, + { + "epoch": 0.0507, + "grad_norm": 0.07736998051404953, + "learning_rate": 3.10814211441846e-05, + "loss": 0.0359, + "step": 90140 + }, + { + "epoch": 0.05075, + "grad_norm": 0.09101607650518417, + "learning_rate": 3.107741155421448e-05, + "loss": 0.0351, + "step": 90150 + }, + { + "epoch": 0.0508, + "grad_norm": 0.07944773882627487, + "learning_rate": 3.107340179809022e-05, + "loss": 0.0352, + "step": 90160 + }, + { + "epoch": 0.05085, + "grad_norm": 0.07260844856500626, + "learning_rate": 3.106939187592143e-05, + "loss": 0.0361, + "step": 90170 + }, + { + "epoch": 0.0509, + "grad_norm": 0.07529579102993011, + "learning_rate": 3.106538178781775e-05, + "loss": 0.0352, + "step": 90180 + }, + { + "epoch": 0.05095, + "grad_norm": 0.10861407220363617, + "learning_rate": 3.106137153388882e-05, + "loss": 0.0381, + "step": 90190 + }, + { + "epoch": 0.051, + "grad_norm": 0.079187773168087, + "learning_rate": 3.105736111424425e-05, + "loss": 0.0344, + "step": 90200 + }, + { + "epoch": 0.05105, + "grad_norm": 0.06789105385541916, + "learning_rate": 3.105335052899372e-05, + "loss": 0.0348, + "step": 90210 + }, + { + "epoch": 0.0511, + "grad_norm": 0.07160353660583496, + "learning_rate": 3.104933977824685e-05, + "loss": 0.0357, + "step": 90220 + }, + { + "epoch": 0.05115, + "grad_norm": 0.0799093246459961, + "learning_rate": 3.104532886211331e-05, + "loss": 0.0363, + "step": 90230 + }, + { + "epoch": 0.0512, + "grad_norm": 0.08041369915008545, + "learning_rate": 3.104131778070274e-05, + "loss": 0.0346, + "step": 90240 + }, + { + "epoch": 0.05125, + "grad_norm": 0.06926840543746948, + "learning_rate": 3.1037306534124826e-05, + "loss": 0.0355, + "step": 90250 + }, + { + "epoch": 0.0513, + "grad_norm": 0.07907085865736008, + "learning_rate": 3.103329512248922e-05, + "loss": 0.0358, + "step": 90260 + }, + { + "epoch": 0.05135, + "grad_norm": 0.08070918172597885, + "learning_rate": 3.102928354590558e-05, + "loss": 0.0365, + "step": 90270 + }, + { + "epoch": 0.0514, + "grad_norm": 0.08280420303344727, + "learning_rate": 3.102527180448359e-05, + "loss": 0.0375, + "step": 90280 + }, + { + "epoch": 0.05145, + "grad_norm": 0.08545567095279694, + "learning_rate": 3.1021259898332944e-05, + "loss": 0.0378, + "step": 90290 + }, + { + "epoch": 0.0515, + "grad_norm": 0.10459499806165695, + "learning_rate": 3.1017247827563306e-05, + "loss": 0.0344, + "step": 90300 + }, + { + "epoch": 0.05155, + "grad_norm": 0.08787772804498672, + "learning_rate": 3.1013235592284386e-05, + "loss": 0.0357, + "step": 90310 + }, + { + "epoch": 0.0516, + "grad_norm": 0.08612469583749771, + "learning_rate": 3.100922319260585e-05, + "loss": 0.0365, + "step": 90320 + }, + { + "epoch": 0.05165, + "grad_norm": 0.08855035156011581, + "learning_rate": 3.1005210628637414e-05, + "loss": 0.0367, + "step": 90330 + }, + { + "epoch": 0.0517, + "grad_norm": 0.09178449958562851, + "learning_rate": 3.100119790048877e-05, + "loss": 0.0355, + "step": 90340 + }, + { + "epoch": 0.05175, + "grad_norm": 0.08951833844184875, + "learning_rate": 3.0997185008269645e-05, + "loss": 0.0383, + "step": 90350 + }, + { + "epoch": 0.0518, + "grad_norm": 0.07153066247701645, + "learning_rate": 3.099317195208972e-05, + "loss": 0.036, + "step": 90360 + }, + { + "epoch": 0.05185, + "grad_norm": 0.08133627474308014, + "learning_rate": 3.098915873205874e-05, + "loss": 0.0354, + "step": 90370 + }, + { + "epoch": 0.0519, + "grad_norm": 0.07581649720668793, + "learning_rate": 3.0985145348286394e-05, + "loss": 0.0362, + "step": 90380 + }, + { + "epoch": 0.05195, + "grad_norm": 0.07978670299053192, + "learning_rate": 3.098113180088243e-05, + "loss": 0.0354, + "step": 90390 + }, + { + "epoch": 0.052, + "grad_norm": 0.07078913599252701, + "learning_rate": 3.097711808995657e-05, + "loss": 0.0343, + "step": 90400 + }, + { + "epoch": 0.05205, + "grad_norm": 0.07472700625658035, + "learning_rate": 3.0973104215618546e-05, + "loss": 0.0353, + "step": 90410 + }, + { + "epoch": 0.0521, + "grad_norm": 0.10212170332670212, + "learning_rate": 3.09690901779781e-05, + "loss": 0.0377, + "step": 90420 + }, + { + "epoch": 0.05215, + "grad_norm": 0.09118767082691193, + "learning_rate": 3.0965075977144964e-05, + "loss": 0.0366, + "step": 90430 + }, + { + "epoch": 0.0522, + "grad_norm": 0.0685313269495964, + "learning_rate": 3.09610616132289e-05, + "loss": 0.0362, + "step": 90440 + }, + { + "epoch": 0.05225, + "grad_norm": 0.07697869837284088, + "learning_rate": 3.0957047086339644e-05, + "loss": 0.0354, + "step": 90450 + }, + { + "epoch": 0.0523, + "grad_norm": 0.07973739504814148, + "learning_rate": 3.095303239658696e-05, + "loss": 0.036, + "step": 90460 + }, + { + "epoch": 0.05235, + "grad_norm": 0.08885246515274048, + "learning_rate": 3.094901754408061e-05, + "loss": 0.0348, + "step": 90470 + }, + { + "epoch": 0.0524, + "grad_norm": 0.07255177944898605, + "learning_rate": 3.0945002528930356e-05, + "loss": 0.0355, + "step": 90480 + }, + { + "epoch": 0.05245, + "grad_norm": 0.0941619724035263, + "learning_rate": 3.094098735124596e-05, + "loss": 0.0363, + "step": 90490 + }, + { + "epoch": 0.0525, + "grad_norm": 0.11020806431770325, + "learning_rate": 3.093697201113721e-05, + "loss": 0.0369, + "step": 90500 + }, + { + "epoch": 0.05255, + "grad_norm": 0.0922388806939125, + "learning_rate": 3.093295650871387e-05, + "loss": 0.0358, + "step": 90510 + }, + { + "epoch": 0.0526, + "grad_norm": 0.09871993213891983, + "learning_rate": 3.092894084408573e-05, + "loss": 0.0378, + "step": 90520 + }, + { + "epoch": 0.05265, + "grad_norm": 0.10309197753667831, + "learning_rate": 3.0924925017362564e-05, + "loss": 0.0359, + "step": 90530 + }, + { + "epoch": 0.0527, + "grad_norm": 0.09653574228286743, + "learning_rate": 3.092090902865419e-05, + "loss": 0.0357, + "step": 90540 + }, + { + "epoch": 0.05275, + "grad_norm": 0.08390119671821594, + "learning_rate": 3.091689287807038e-05, + "loss": 0.0352, + "step": 90550 + }, + { + "epoch": 0.0528, + "grad_norm": 0.08636614680290222, + "learning_rate": 3.091287656572095e-05, + "loss": 0.0345, + "step": 90560 + }, + { + "epoch": 0.05285, + "grad_norm": 0.07649514079093933, + "learning_rate": 3.0908860091715686e-05, + "loss": 0.0356, + "step": 90570 + }, + { + "epoch": 0.0529, + "grad_norm": 0.07269894331693649, + "learning_rate": 3.090484345616441e-05, + "loss": 0.0343, + "step": 90580 + }, + { + "epoch": 0.05295, + "grad_norm": 0.09663021564483643, + "learning_rate": 3.090082665917693e-05, + "loss": 0.0358, + "step": 90590 + }, + { + "epoch": 0.053, + "grad_norm": 0.08514353632926941, + "learning_rate": 3.089680970086307e-05, + "loss": 0.0351, + "step": 90600 + }, + { + "epoch": 0.05305, + "grad_norm": 0.08956708759069443, + "learning_rate": 3.0892792581332645e-05, + "loss": 0.0357, + "step": 90610 + }, + { + "epoch": 0.0531, + "grad_norm": 0.08290702104568481, + "learning_rate": 3.088877530069549e-05, + "loss": 0.035, + "step": 90620 + }, + { + "epoch": 0.05315, + "grad_norm": 0.0854518711566925, + "learning_rate": 3.088475785906143e-05, + "loss": 0.0365, + "step": 90630 + }, + { + "epoch": 0.0532, + "grad_norm": 0.07518326491117477, + "learning_rate": 3.088074025654029e-05, + "loss": 0.0349, + "step": 90640 + }, + { + "epoch": 0.05325, + "grad_norm": 0.0811287984251976, + "learning_rate": 3.0876722493241924e-05, + "loss": 0.0356, + "step": 90650 + }, + { + "epoch": 0.0533, + "grad_norm": 0.08146429806947708, + "learning_rate": 3.0872704569276184e-05, + "loss": 0.0347, + "step": 90660 + }, + { + "epoch": 0.05335, + "grad_norm": 0.08974656462669373, + "learning_rate": 3.0868686484752897e-05, + "loss": 0.0356, + "step": 90670 + }, + { + "epoch": 0.0534, + "grad_norm": 0.07963674515485764, + "learning_rate": 3.086466823978193e-05, + "loss": 0.0354, + "step": 90680 + }, + { + "epoch": 0.05345, + "grad_norm": 0.07970944792032242, + "learning_rate": 3.086064983447314e-05, + "loss": 0.0357, + "step": 90690 + }, + { + "epoch": 0.0535, + "grad_norm": 0.09441101551055908, + "learning_rate": 3.085663126893637e-05, + "loss": 0.036, + "step": 90700 + }, + { + "epoch": 0.05355, + "grad_norm": 0.08753246068954468, + "learning_rate": 3.085261254328152e-05, + "loss": 0.0361, + "step": 90710 + }, + { + "epoch": 0.0536, + "grad_norm": 0.0795416384935379, + "learning_rate": 3.084859365761843e-05, + "loss": 0.0356, + "step": 90720 + }, + { + "epoch": 0.05365, + "grad_norm": 0.07005535811185837, + "learning_rate": 3.0844574612057e-05, + "loss": 0.0358, + "step": 90730 + }, + { + "epoch": 0.0537, + "grad_norm": 0.10531385987997055, + "learning_rate": 3.0840555406707086e-05, + "loss": 0.0361, + "step": 90740 + }, + { + "epoch": 0.05375, + "grad_norm": 0.07994677871465683, + "learning_rate": 3.083653604167858e-05, + "loss": 0.0345, + "step": 90750 + }, + { + "epoch": 0.0538, + "grad_norm": 0.09184665232896805, + "learning_rate": 3.083251651708137e-05, + "loss": 0.0349, + "step": 90760 + }, + { + "epoch": 0.05385, + "grad_norm": 0.10555008798837662, + "learning_rate": 3.082849683302536e-05, + "loss": 0.034, + "step": 90770 + }, + { + "epoch": 0.0539, + "grad_norm": 0.07258718460798264, + "learning_rate": 3.0824476989620424e-05, + "loss": 0.0366, + "step": 90780 + }, + { + "epoch": 0.05395, + "grad_norm": 0.09504424780607224, + "learning_rate": 3.082045698697648e-05, + "loss": 0.0343, + "step": 90790 + }, + { + "epoch": 0.054, + "grad_norm": 0.09034702926874161, + "learning_rate": 3.0816436825203435e-05, + "loss": 0.0367, + "step": 90800 + }, + { + "epoch": 0.05405, + "grad_norm": 0.07364467531442642, + "learning_rate": 3.081241650441118e-05, + "loss": 0.0372, + "step": 90810 + }, + { + "epoch": 0.0541, + "grad_norm": 0.06732908636331558, + "learning_rate": 3.080839602470965e-05, + "loss": 0.0346, + "step": 90820 + }, + { + "epoch": 0.05415, + "grad_norm": 0.07207812368869781, + "learning_rate": 3.080437538620876e-05, + "loss": 0.0347, + "step": 90830 + }, + { + "epoch": 0.0542, + "grad_norm": 0.08139686286449432, + "learning_rate": 3.080035458901842e-05, + "loss": 0.0353, + "step": 90840 + }, + { + "epoch": 0.05425, + "grad_norm": 0.07417777180671692, + "learning_rate": 3.0796333633248566e-05, + "loss": 0.034, + "step": 90850 + }, + { + "epoch": 0.0543, + "grad_norm": 0.07608595490455627, + "learning_rate": 3.079231251900912e-05, + "loss": 0.0354, + "step": 90860 + }, + { + "epoch": 0.05435, + "grad_norm": 0.08691065013408661, + "learning_rate": 3.0788291246410036e-05, + "loss": 0.0341, + "step": 90870 + }, + { + "epoch": 0.0544, + "grad_norm": 0.08706577867269516, + "learning_rate": 3.078426981556124e-05, + "loss": 0.0361, + "step": 90880 + }, + { + "epoch": 0.05445, + "grad_norm": 0.09268978983163834, + "learning_rate": 3.0780248226572686e-05, + "loss": 0.0369, + "step": 90890 + }, + { + "epoch": 0.0545, + "grad_norm": 0.10335977375507355, + "learning_rate": 3.077622647955431e-05, + "loss": 0.0356, + "step": 90900 + }, + { + "epoch": 0.05455, + "grad_norm": 0.07753488421440125, + "learning_rate": 3.077220457461607e-05, + "loss": 0.0347, + "step": 90910 + }, + { + "epoch": 0.0546, + "grad_norm": 0.07556873559951782, + "learning_rate": 3.076818251186793e-05, + "loss": 0.0377, + "step": 90920 + }, + { + "epoch": 0.05465, + "grad_norm": 0.08519326895475388, + "learning_rate": 3.0764160291419846e-05, + "loss": 0.0358, + "step": 90930 + }, + { + "epoch": 0.0547, + "grad_norm": 0.09992724657058716, + "learning_rate": 3.0760137913381795e-05, + "loss": 0.0359, + "step": 90940 + }, + { + "epoch": 0.05475, + "grad_norm": 0.08147275447845459, + "learning_rate": 3.075611537786372e-05, + "loss": 0.0368, + "step": 90950 + }, + { + "epoch": 0.0548, + "grad_norm": 0.09403491765260696, + "learning_rate": 3.075209268497563e-05, + "loss": 0.0364, + "step": 90960 + }, + { + "epoch": 0.05485, + "grad_norm": 0.09434369951486588, + "learning_rate": 3.0748069834827467e-05, + "loss": 0.0346, + "step": 90970 + }, + { + "epoch": 0.0549, + "grad_norm": 0.07880568504333496, + "learning_rate": 3.074404682752925e-05, + "loss": 0.035, + "step": 90980 + }, + { + "epoch": 0.05495, + "grad_norm": 0.08967886865139008, + "learning_rate": 3.074002366319094e-05, + "loss": 0.0365, + "step": 90990 + }, + { + "epoch": 0.055, + "grad_norm": 0.08391301333904266, + "learning_rate": 3.0736000341922554e-05, + "loss": 0.0363, + "step": 91000 + }, + { + "epoch": 0.05505, + "grad_norm": 0.10305073857307434, + "learning_rate": 3.073197686383406e-05, + "loss": 0.0359, + "step": 91010 + }, + { + "epoch": 0.0551, + "grad_norm": 0.09799043834209442, + "learning_rate": 3.0727953229035486e-05, + "loss": 0.0369, + "step": 91020 + }, + { + "epoch": 0.05515, + "grad_norm": 0.09323476999998093, + "learning_rate": 3.072392943763681e-05, + "loss": 0.0354, + "step": 91030 + }, + { + "epoch": 0.0552, + "grad_norm": 0.08354030549526215, + "learning_rate": 3.071990548974806e-05, + "loss": 0.0351, + "step": 91040 + }, + { + "epoch": 0.05525, + "grad_norm": 0.07392622530460358, + "learning_rate": 3.0715881385479236e-05, + "loss": 0.0354, + "step": 91050 + }, + { + "epoch": 0.0553, + "grad_norm": 0.0685829147696495, + "learning_rate": 3.071185712494037e-05, + "loss": 0.0363, + "step": 91060 + }, + { + "epoch": 0.05535, + "grad_norm": 0.095224529504776, + "learning_rate": 3.070783270824147e-05, + "loss": 0.0363, + "step": 91070 + }, + { + "epoch": 0.0554, + "grad_norm": 0.09711430221796036, + "learning_rate": 3.0703808135492574e-05, + "loss": 0.0399, + "step": 91080 + }, + { + "epoch": 0.05545, + "grad_norm": 0.09328602254390717, + "learning_rate": 3.06997834068037e-05, + "loss": 0.0368, + "step": 91090 + }, + { + "epoch": 0.0555, + "grad_norm": 0.0859946459531784, + "learning_rate": 3.069575852228489e-05, + "loss": 0.0343, + "step": 91100 + }, + { + "epoch": 0.05555, + "grad_norm": 0.08314044773578644, + "learning_rate": 3.069173348204618e-05, + "loss": 0.0365, + "step": 91110 + }, + { + "epoch": 0.0556, + "grad_norm": 0.08494190871715546, + "learning_rate": 3.068770828619762e-05, + "loss": 0.035, + "step": 91120 + }, + { + "epoch": 0.05565, + "grad_norm": 0.06117646396160126, + "learning_rate": 3.0683682934849254e-05, + "loss": 0.0352, + "step": 91130 + }, + { + "epoch": 0.0557, + "grad_norm": 0.07039974629878998, + "learning_rate": 3.0679657428111125e-05, + "loss": 0.0361, + "step": 91140 + }, + { + "epoch": 0.05575, + "grad_norm": 0.06723567098379135, + "learning_rate": 3.0675631766093304e-05, + "loss": 0.0358, + "step": 91150 + }, + { + "epoch": 0.0558, + "grad_norm": 0.06547726690769196, + "learning_rate": 3.067160594890583e-05, + "loss": 0.0368, + "step": 91160 + }, + { + "epoch": 0.05585, + "grad_norm": 0.09160833805799484, + "learning_rate": 3.06675799766588e-05, + "loss": 0.0381, + "step": 91170 + }, + { + "epoch": 0.0559, + "grad_norm": 0.08084623515605927, + "learning_rate": 3.0663553849462245e-05, + "loss": 0.0353, + "step": 91180 + }, + { + "epoch": 0.05595, + "grad_norm": 0.07233183830976486, + "learning_rate": 3.065952756742626e-05, + "loss": 0.0371, + "step": 91190 + }, + { + "epoch": 0.056, + "grad_norm": 0.0768621563911438, + "learning_rate": 3.065550113066092e-05, + "loss": 0.0351, + "step": 91200 + }, + { + "epoch": 0.05605, + "grad_norm": 0.07012893259525299, + "learning_rate": 3.0651474539276304e-05, + "loss": 0.0353, + "step": 91210 + }, + { + "epoch": 0.0561, + "grad_norm": 0.07694104313850403, + "learning_rate": 3.064744779338249e-05, + "loss": 0.0367, + "step": 91220 + }, + { + "epoch": 0.05615, + "grad_norm": 0.0797204077243805, + "learning_rate": 3.0643420893089585e-05, + "loss": 0.0367, + "step": 91230 + }, + { + "epoch": 0.0562, + "grad_norm": 0.06652817875146866, + "learning_rate": 3.063939383850767e-05, + "loss": 0.0345, + "step": 91240 + }, + { + "epoch": 0.05625, + "grad_norm": 0.07184158265590668, + "learning_rate": 3.063536662974684e-05, + "loss": 0.035, + "step": 91250 + }, + { + "epoch": 0.0563, + "grad_norm": 0.0723545253276825, + "learning_rate": 3.063133926691721e-05, + "loss": 0.037, + "step": 91260 + }, + { + "epoch": 0.05635, + "grad_norm": 0.0889274924993515, + "learning_rate": 3.062731175012888e-05, + "loss": 0.0362, + "step": 91270 + }, + { + "epoch": 0.0564, + "grad_norm": 0.08235359936952591, + "learning_rate": 3.062328407949196e-05, + "loss": 0.0353, + "step": 91280 + }, + { + "epoch": 0.05645, + "grad_norm": 0.07834339141845703, + "learning_rate": 3.0619256255116566e-05, + "loss": 0.0337, + "step": 91290 + }, + { + "epoch": 0.0565, + "grad_norm": 0.08131270110607147, + "learning_rate": 3.061522827711281e-05, + "loss": 0.0355, + "step": 91300 + }, + { + "epoch": 0.05655, + "grad_norm": 0.0728047713637352, + "learning_rate": 3.061120014559084e-05, + "loss": 0.0347, + "step": 91310 + }, + { + "epoch": 0.0566, + "grad_norm": 0.07191229611635208, + "learning_rate": 3.0607171860660746e-05, + "loss": 0.035, + "step": 91320 + }, + { + "epoch": 0.05665, + "grad_norm": 0.06673692911863327, + "learning_rate": 3.060314342243269e-05, + "loss": 0.0336, + "step": 91330 + }, + { + "epoch": 0.0567, + "grad_norm": 0.07257112115621567, + "learning_rate": 3.0599114831016796e-05, + "loss": 0.0348, + "step": 91340 + }, + { + "epoch": 0.05675, + "grad_norm": 0.07849828898906708, + "learning_rate": 3.0595086086523206e-05, + "loss": 0.0351, + "step": 91350 + }, + { + "epoch": 0.0568, + "grad_norm": 0.0824127122759819, + "learning_rate": 3.059105718906206e-05, + "loss": 0.0359, + "step": 91360 + }, + { + "epoch": 0.05685, + "grad_norm": 0.0766623467206955, + "learning_rate": 3.0587028138743516e-05, + "loss": 0.0356, + "step": 91370 + }, + { + "epoch": 0.0569, + "grad_norm": 0.07816511392593384, + "learning_rate": 3.0582998935677726e-05, + "loss": 0.0363, + "step": 91380 + }, + { + "epoch": 0.05695, + "grad_norm": 0.09526880830526352, + "learning_rate": 3.057896957997484e-05, + "loss": 0.0356, + "step": 91390 + }, + { + "epoch": 0.057, + "grad_norm": 0.08799227327108383, + "learning_rate": 3.057494007174502e-05, + "loss": 0.0343, + "step": 91400 + }, + { + "epoch": 0.05705, + "grad_norm": 0.09132442623376846, + "learning_rate": 3.057091041109843e-05, + "loss": 0.036, + "step": 91410 + }, + { + "epoch": 0.0571, + "grad_norm": 0.12402817606925964, + "learning_rate": 3.0566880598145244e-05, + "loss": 0.0367, + "step": 91420 + }, + { + "epoch": 0.05715, + "grad_norm": 0.12232159078121185, + "learning_rate": 3.0562850632995624e-05, + "loss": 0.0368, + "step": 91430 + }, + { + "epoch": 0.0572, + "grad_norm": 0.09972596168518066, + "learning_rate": 3.0558820515759775e-05, + "loss": 0.0374, + "step": 91440 + }, + { + "epoch": 0.05725, + "grad_norm": 0.1160351112484932, + "learning_rate": 3.055479024654785e-05, + "loss": 0.0355, + "step": 91450 + }, + { + "epoch": 0.0573, + "grad_norm": 0.07318606972694397, + "learning_rate": 3.055075982547006e-05, + "loss": 0.0365, + "step": 91460 + }, + { + "epoch": 0.05735, + "grad_norm": 0.0797891616821289, + "learning_rate": 3.054672925263657e-05, + "loss": 0.0351, + "step": 91470 + }, + { + "epoch": 0.0574, + "grad_norm": 0.0765325278043747, + "learning_rate": 3.0542698528157585e-05, + "loss": 0.0363, + "step": 91480 + }, + { + "epoch": 0.05745, + "grad_norm": 0.09091998636722565, + "learning_rate": 3.0538667652143306e-05, + "loss": 0.035, + "step": 91490 + }, + { + "epoch": 0.0575, + "grad_norm": 0.07104264199733734, + "learning_rate": 3.053463662470394e-05, + "loss": 0.0355, + "step": 91500 + }, + { + "epoch": 0.05755, + "grad_norm": 0.06663914024829865, + "learning_rate": 3.053060544594968e-05, + "loss": 0.0354, + "step": 91510 + }, + { + "epoch": 0.0576, + "grad_norm": 0.06117236614227295, + "learning_rate": 3.052657411599075e-05, + "loss": 0.0343, + "step": 91520 + }, + { + "epoch": 0.05765, + "grad_norm": 0.08269451558589935, + "learning_rate": 3.052254263493736e-05, + "loss": 0.0341, + "step": 91530 + }, + { + "epoch": 0.0577, + "grad_norm": 0.08021652698516846, + "learning_rate": 3.051851100289973e-05, + "loss": 0.0344, + "step": 91540 + }, + { + "epoch": 0.05775, + "grad_norm": 0.08179452270269394, + "learning_rate": 3.0514479219988073e-05, + "loss": 0.035, + "step": 91550 + }, + { + "epoch": 0.0578, + "grad_norm": 0.08777602016925812, + "learning_rate": 3.0510447286312637e-05, + "loss": 0.0346, + "step": 91560 + }, + { + "epoch": 0.05785, + "grad_norm": 0.06952960789203644, + "learning_rate": 3.050641520198364e-05, + "loss": 0.0348, + "step": 91570 + }, + { + "epoch": 0.0579, + "grad_norm": 0.08493976294994354, + "learning_rate": 3.0502382967111315e-05, + "loss": 0.0355, + "step": 91580 + }, + { + "epoch": 0.05795, + "grad_norm": 0.08295717090368271, + "learning_rate": 3.049835058180591e-05, + "loss": 0.0339, + "step": 91590 + }, + { + "epoch": 0.058, + "grad_norm": 0.09477686136960983, + "learning_rate": 3.0494318046177668e-05, + "loss": 0.0359, + "step": 91600 + }, + { + "epoch": 0.05805, + "grad_norm": 0.07548397779464722, + "learning_rate": 3.0490285360336836e-05, + "loss": 0.0349, + "step": 91610 + }, + { + "epoch": 0.0581, + "grad_norm": 0.07475226372480392, + "learning_rate": 3.0486252524393654e-05, + "loss": 0.0341, + "step": 91620 + }, + { + "epoch": 0.05815, + "grad_norm": 0.0728631541132927, + "learning_rate": 3.04822195384584e-05, + "loss": 0.0336, + "step": 91630 + }, + { + "epoch": 0.0582, + "grad_norm": 0.07909126579761505, + "learning_rate": 3.0478186402641317e-05, + "loss": 0.0355, + "step": 91640 + }, + { + "epoch": 0.05825, + "grad_norm": 0.06893298774957657, + "learning_rate": 3.0474153117052684e-05, + "loss": 0.0346, + "step": 91650 + }, + { + "epoch": 0.0583, + "grad_norm": 0.06769859045743942, + "learning_rate": 3.0470119681802756e-05, + "loss": 0.0346, + "step": 91660 + }, + { + "epoch": 0.05835, + "grad_norm": 0.08800292015075684, + "learning_rate": 3.046608609700181e-05, + "loss": 0.0352, + "step": 91670 + }, + { + "epoch": 0.0584, + "grad_norm": 0.07564336806535721, + "learning_rate": 3.0462052362760125e-05, + "loss": 0.0345, + "step": 91680 + }, + { + "epoch": 0.05845, + "grad_norm": 0.06514771282672882, + "learning_rate": 3.0458018479187983e-05, + "loss": 0.035, + "step": 91690 + }, + { + "epoch": 0.0585, + "grad_norm": 0.07323320209980011, + "learning_rate": 3.045398444639566e-05, + "loss": 0.0351, + "step": 91700 + }, + { + "epoch": 0.05855, + "grad_norm": 0.06429063528776169, + "learning_rate": 3.044995026449346e-05, + "loss": 0.0339, + "step": 91710 + }, + { + "epoch": 0.0586, + "grad_norm": 0.07759048789739609, + "learning_rate": 3.0445915933591658e-05, + "loss": 0.0352, + "step": 91720 + }, + { + "epoch": 0.05865, + "grad_norm": 0.07607323676347733, + "learning_rate": 3.044188145380057e-05, + "loss": 0.0335, + "step": 91730 + }, + { + "epoch": 0.0587, + "grad_norm": 0.07866210490465164, + "learning_rate": 3.0437846825230476e-05, + "loss": 0.0348, + "step": 91740 + }, + { + "epoch": 0.05875, + "grad_norm": 0.08538687229156494, + "learning_rate": 3.04338120479917e-05, + "loss": 0.0351, + "step": 91750 + }, + { + "epoch": 0.0588, + "grad_norm": 0.08252998441457748, + "learning_rate": 3.042977712219454e-05, + "loss": 0.0354, + "step": 91760 + }, + { + "epoch": 0.05885, + "grad_norm": 0.08144179731607437, + "learning_rate": 3.042574204794932e-05, + "loss": 0.035, + "step": 91770 + }, + { + "epoch": 0.0589, + "grad_norm": 0.07281932234764099, + "learning_rate": 3.0421706825366343e-05, + "loss": 0.0342, + "step": 91780 + }, + { + "epoch": 0.05895, + "grad_norm": 0.09572433680295944, + "learning_rate": 3.041767145455594e-05, + "loss": 0.0368, + "step": 91790 + }, + { + "epoch": 0.059, + "grad_norm": 0.08905182778835297, + "learning_rate": 3.041363593562844e-05, + "loss": 0.0376, + "step": 91800 + }, + { + "epoch": 0.05905, + "grad_norm": 0.08899427205324173, + "learning_rate": 3.040960026869416e-05, + "loss": 0.0354, + "step": 91810 + }, + { + "epoch": 0.0591, + "grad_norm": 0.09908939152956009, + "learning_rate": 3.0405564453863445e-05, + "loss": 0.0355, + "step": 91820 + }, + { + "epoch": 0.05915, + "grad_norm": 0.0857677087187767, + "learning_rate": 3.0401528491246628e-05, + "loss": 0.0363, + "step": 91830 + }, + { + "epoch": 0.0592, + "grad_norm": 0.08181668072938919, + "learning_rate": 3.0397492380954057e-05, + "loss": 0.0349, + "step": 91840 + }, + { + "epoch": 0.05925, + "grad_norm": 0.08873151242733002, + "learning_rate": 3.0393456123096065e-05, + "loss": 0.0382, + "step": 91850 + }, + { + "epoch": 0.0593, + "grad_norm": 0.0892871618270874, + "learning_rate": 3.0389419717783007e-05, + "loss": 0.0382, + "step": 91860 + }, + { + "epoch": 0.05935, + "grad_norm": 0.07593189179897308, + "learning_rate": 3.0385383165125243e-05, + "loss": 0.0341, + "step": 91870 + }, + { + "epoch": 0.0594, + "grad_norm": 0.09186351299285889, + "learning_rate": 3.038134646523313e-05, + "loss": 0.0357, + "step": 91880 + }, + { + "epoch": 0.05945, + "grad_norm": 0.10620171576738358, + "learning_rate": 3.0377309618217015e-05, + "loss": 0.0358, + "step": 91890 + }, + { + "epoch": 0.0595, + "grad_norm": 0.07334204763174057, + "learning_rate": 3.037327262418729e-05, + "loss": 0.0354, + "step": 91900 + }, + { + "epoch": 0.05955, + "grad_norm": 0.08394617587327957, + "learning_rate": 3.03692354832543e-05, + "loss": 0.0367, + "step": 91910 + }, + { + "epoch": 0.0596, + "grad_norm": 0.08659309893846512, + "learning_rate": 3.036519819552843e-05, + "loss": 0.0374, + "step": 91920 + }, + { + "epoch": 0.05965, + "grad_norm": 0.07451268285512924, + "learning_rate": 3.0361160761120056e-05, + "loss": 0.0348, + "step": 91930 + }, + { + "epoch": 0.0597, + "grad_norm": 0.0766073614358902, + "learning_rate": 3.035712318013956e-05, + "loss": 0.0344, + "step": 91940 + }, + { + "epoch": 0.05975, + "grad_norm": 0.06007981672883034, + "learning_rate": 3.0353085452697333e-05, + "loss": 0.0333, + "step": 91950 + }, + { + "epoch": 0.0598, + "grad_norm": 0.06408718973398209, + "learning_rate": 3.0349047578903765e-05, + "loss": 0.0344, + "step": 91960 + }, + { + "epoch": 0.05985, + "grad_norm": 0.10001283884048462, + "learning_rate": 3.0345009558869235e-05, + "loss": 0.0345, + "step": 91970 + }, + { + "epoch": 0.0599, + "grad_norm": 0.08132842183113098, + "learning_rate": 3.0340971392704167e-05, + "loss": 0.0349, + "step": 91980 + }, + { + "epoch": 0.05995, + "grad_norm": 0.10986791551113129, + "learning_rate": 3.0336933080518926e-05, + "loss": 0.035, + "step": 91990 + }, + { + "epoch": 0.06, + "grad_norm": 0.11718198657035828, + "learning_rate": 3.0332894622423956e-05, + "loss": 0.0342, + "step": 92000 + }, + { + "epoch": 0.06005, + "grad_norm": 0.07450106739997864, + "learning_rate": 3.0328856018529645e-05, + "loss": 0.0358, + "step": 92010 + }, + { + "epoch": 0.0601, + "grad_norm": 0.07318412512540817, + "learning_rate": 3.0324817268946416e-05, + "loss": 0.0371, + "step": 92020 + }, + { + "epoch": 0.06015, + "grad_norm": 0.07544299960136414, + "learning_rate": 3.032077837378468e-05, + "loss": 0.0355, + "step": 92030 + }, + { + "epoch": 0.0602, + "grad_norm": 0.08277773857116699, + "learning_rate": 3.0316739333154857e-05, + "loss": 0.0347, + "step": 92040 + }, + { + "epoch": 0.06025, + "grad_norm": 0.08776474744081497, + "learning_rate": 3.0312700147167382e-05, + "loss": 0.0343, + "step": 92050 + }, + { + "epoch": 0.0603, + "grad_norm": 0.07797855138778687, + "learning_rate": 3.0308660815932686e-05, + "loss": 0.0368, + "step": 92060 + }, + { + "epoch": 0.06035, + "grad_norm": 0.11843080073595047, + "learning_rate": 3.0304621339561196e-05, + "loss": 0.0373, + "step": 92070 + }, + { + "epoch": 0.0604, + "grad_norm": 0.0953860804438591, + "learning_rate": 3.0300581718163346e-05, + "loss": 0.036, + "step": 92080 + }, + { + "epoch": 0.06045, + "grad_norm": 0.08200633525848389, + "learning_rate": 3.0296541951849594e-05, + "loss": 0.0352, + "step": 92090 + }, + { + "epoch": 0.0605, + "grad_norm": 0.0750812292098999, + "learning_rate": 3.0292502040730362e-05, + "loss": 0.0375, + "step": 92100 + }, + { + "epoch": 0.06055, + "grad_norm": 0.07450171560049057, + "learning_rate": 3.028846198491612e-05, + "loss": 0.0347, + "step": 92110 + }, + { + "epoch": 0.0606, + "grad_norm": 0.07821846008300781, + "learning_rate": 3.028442178451731e-05, + "loss": 0.0349, + "step": 92120 + }, + { + "epoch": 0.06065, + "grad_norm": 0.07577892392873764, + "learning_rate": 3.02803814396444e-05, + "loss": 0.0362, + "step": 92130 + }, + { + "epoch": 0.0607, + "grad_norm": 0.09052744507789612, + "learning_rate": 3.027634095040784e-05, + "loss": 0.0365, + "step": 92140 + }, + { + "epoch": 0.06075, + "grad_norm": 0.10810858756303787, + "learning_rate": 3.0272300316918107e-05, + "loss": 0.0372, + "step": 92150 + }, + { + "epoch": 0.0608, + "grad_norm": 0.1269952356815338, + "learning_rate": 3.0268259539285654e-05, + "loss": 0.0371, + "step": 92160 + }, + { + "epoch": 0.06085, + "grad_norm": 0.09791803359985352, + "learning_rate": 3.0264218617620975e-05, + "loss": 0.0365, + "step": 92170 + }, + { + "epoch": 0.0609, + "grad_norm": 0.09965765476226807, + "learning_rate": 3.0260177552034525e-05, + "loss": 0.0364, + "step": 92180 + }, + { + "epoch": 0.06095, + "grad_norm": 0.09902802109718323, + "learning_rate": 3.0256136342636803e-05, + "loss": 0.0356, + "step": 92190 + }, + { + "epoch": 0.061, + "grad_norm": 0.08540318161249161, + "learning_rate": 3.0252094989538287e-05, + "loss": 0.0354, + "step": 92200 + }, + { + "epoch": 0.06105, + "grad_norm": 0.08087259531021118, + "learning_rate": 3.0248053492849472e-05, + "loss": 0.0357, + "step": 92210 + }, + { + "epoch": 0.0611, + "grad_norm": 0.07754070311784744, + "learning_rate": 3.024401185268084e-05, + "loss": 0.0355, + "step": 92220 + }, + { + "epoch": 0.06115, + "grad_norm": 0.0839327871799469, + "learning_rate": 3.02399700691429e-05, + "loss": 0.0376, + "step": 92230 + }, + { + "epoch": 0.0612, + "grad_norm": 0.09323536604642868, + "learning_rate": 3.0235928142346143e-05, + "loss": 0.0356, + "step": 92240 + }, + { + "epoch": 0.06125, + "grad_norm": 0.07739691436290741, + "learning_rate": 3.0231886072401072e-05, + "loss": 0.0337, + "step": 92250 + }, + { + "epoch": 0.0613, + "grad_norm": 0.08068235963582993, + "learning_rate": 3.022784385941821e-05, + "loss": 0.0364, + "step": 92260 + }, + { + "epoch": 0.06135, + "grad_norm": 0.08458296209573746, + "learning_rate": 3.022380150350806e-05, + "loss": 0.0357, + "step": 92270 + }, + { + "epoch": 0.0614, + "grad_norm": 0.07799829542636871, + "learning_rate": 3.0219759004781134e-05, + "loss": 0.035, + "step": 92280 + }, + { + "epoch": 0.06145, + "grad_norm": 0.08659221231937408, + "learning_rate": 3.0215716363347956e-05, + "loss": 0.0364, + "step": 92290 + }, + { + "epoch": 0.0615, + "grad_norm": 0.08793576806783676, + "learning_rate": 3.0211673579319067e-05, + "loss": 0.0353, + "step": 92300 + }, + { + "epoch": 0.06155, + "grad_norm": 0.07005950063467026, + "learning_rate": 3.0207630652804963e-05, + "loss": 0.036, + "step": 92310 + }, + { + "epoch": 0.0616, + "grad_norm": 0.07392650842666626, + "learning_rate": 3.0203587583916204e-05, + "loss": 0.0346, + "step": 92320 + }, + { + "epoch": 0.06165, + "grad_norm": 0.0666443482041359, + "learning_rate": 3.0199544372763304e-05, + "loss": 0.034, + "step": 92330 + }, + { + "epoch": 0.0617, + "grad_norm": 0.06908619403839111, + "learning_rate": 3.019550101945683e-05, + "loss": 0.0356, + "step": 92340 + }, + { + "epoch": 0.06175, + "grad_norm": 0.07427336275577545, + "learning_rate": 3.0191457524107304e-05, + "loss": 0.0365, + "step": 92350 + }, + { + "epoch": 0.0618, + "grad_norm": 0.07534376531839371, + "learning_rate": 3.018741388682528e-05, + "loss": 0.0381, + "step": 92360 + }, + { + "epoch": 0.06185, + "grad_norm": 0.08381146192550659, + "learning_rate": 3.0183370107721297e-05, + "loss": 0.0359, + "step": 92370 + }, + { + "epoch": 0.0619, + "grad_norm": 0.08099161833524704, + "learning_rate": 3.0179326186905936e-05, + "loss": 0.0348, + "step": 92380 + }, + { + "epoch": 0.06195, + "grad_norm": 0.08048680424690247, + "learning_rate": 3.017528212448974e-05, + "loss": 0.0346, + "step": 92390 + }, + { + "epoch": 0.062, + "grad_norm": 0.0848207175731659, + "learning_rate": 3.017123792058328e-05, + "loss": 0.0352, + "step": 92400 + }, + { + "epoch": 0.06205, + "grad_norm": 0.07833609730005264, + "learning_rate": 3.016719357529711e-05, + "loss": 0.0351, + "step": 92410 + }, + { + "epoch": 0.0621, + "grad_norm": 0.0627160593867302, + "learning_rate": 3.0163149088741816e-05, + "loss": 0.0345, + "step": 92420 + }, + { + "epoch": 0.06215, + "grad_norm": 0.06662797927856445, + "learning_rate": 3.0159104461027953e-05, + "loss": 0.0366, + "step": 92430 + }, + { + "epoch": 0.0622, + "grad_norm": 0.09291603416204453, + "learning_rate": 3.015505969226612e-05, + "loss": 0.0365, + "step": 92440 + }, + { + "epoch": 0.06225, + "grad_norm": 0.09162125736474991, + "learning_rate": 3.0151014782566887e-05, + "loss": 0.0344, + "step": 92450 + }, + { + "epoch": 0.0623, + "grad_norm": 0.10630109906196594, + "learning_rate": 3.014696973204086e-05, + "loss": 0.0365, + "step": 92460 + }, + { + "epoch": 0.06235, + "grad_norm": 0.08084352314472198, + "learning_rate": 3.01429245407986e-05, + "loss": 0.0358, + "step": 92470 + }, + { + "epoch": 0.0624, + "grad_norm": 0.062248535454273224, + "learning_rate": 3.0138879208950722e-05, + "loss": 0.0365, + "step": 92480 + }, + { + "epoch": 0.06245, + "grad_norm": 0.08606372773647308, + "learning_rate": 3.0134833736607815e-05, + "loss": 0.0358, + "step": 92490 + }, + { + "epoch": 0.0625, + "grad_norm": 0.07619146257638931, + "learning_rate": 3.0130788123880476e-05, + "loss": 0.0352, + "step": 92500 + }, + { + "epoch": 0.06255, + "grad_norm": 0.07813157886266708, + "learning_rate": 3.0126742370879324e-05, + "loss": 0.0353, + "step": 92510 + }, + { + "epoch": 0.0626, + "grad_norm": 0.07503335922956467, + "learning_rate": 3.0122696477714962e-05, + "loss": 0.0347, + "step": 92520 + }, + { + "epoch": 0.06265, + "grad_norm": 0.06751769781112671, + "learning_rate": 3.0118650444498005e-05, + "loss": 0.0365, + "step": 92530 + }, + { + "epoch": 0.0627, + "grad_norm": 0.07441738992929459, + "learning_rate": 3.011460427133906e-05, + "loss": 0.0354, + "step": 92540 + }, + { + "epoch": 0.06275, + "grad_norm": 0.06860997527837753, + "learning_rate": 3.0110557958348762e-05, + "loss": 0.036, + "step": 92550 + }, + { + "epoch": 0.0628, + "grad_norm": 0.06549837440252304, + "learning_rate": 3.0106511505637725e-05, + "loss": 0.0378, + "step": 92560 + }, + { + "epoch": 0.06285, + "grad_norm": 0.06085606664419174, + "learning_rate": 3.0102464913316586e-05, + "loss": 0.0358, + "step": 92570 + }, + { + "epoch": 0.0629, + "grad_norm": 0.07938232272863388, + "learning_rate": 3.0098418181495968e-05, + "loss": 0.0342, + "step": 92580 + }, + { + "epoch": 0.06295, + "grad_norm": 0.06906241178512573, + "learning_rate": 3.0094371310286517e-05, + "loss": 0.0333, + "step": 92590 + }, + { + "epoch": 0.063, + "grad_norm": 0.07146913558244705, + "learning_rate": 3.0090324299798866e-05, + "loss": 0.0348, + "step": 92600 + }, + { + "epoch": 0.06305, + "grad_norm": 0.09440640360116959, + "learning_rate": 3.0086277150143665e-05, + "loss": 0.0347, + "step": 92610 + }, + { + "epoch": 0.0631, + "grad_norm": 0.0820523127913475, + "learning_rate": 3.0082229861431556e-05, + "loss": 0.0354, + "step": 92620 + }, + { + "epoch": 0.06315, + "grad_norm": 0.08139200508594513, + "learning_rate": 3.007818243377319e-05, + "loss": 0.0357, + "step": 92630 + }, + { + "epoch": 0.0632, + "grad_norm": 0.07425463199615479, + "learning_rate": 3.007413486727922e-05, + "loss": 0.0349, + "step": 92640 + }, + { + "epoch": 0.06325, + "grad_norm": 0.07356837391853333, + "learning_rate": 3.0070087162060316e-05, + "loss": 0.034, + "step": 92650 + }, + { + "epoch": 0.0633, + "grad_norm": 0.0619027353823185, + "learning_rate": 3.006603931822713e-05, + "loss": 0.0344, + "step": 92660 + }, + { + "epoch": 0.06335, + "grad_norm": 0.06232372298836708, + "learning_rate": 3.006199133589034e-05, + "loss": 0.0336, + "step": 92670 + }, + { + "epoch": 0.0634, + "grad_norm": 0.06282227486371994, + "learning_rate": 3.00579432151606e-05, + "loss": 0.0331, + "step": 92680 + }, + { + "epoch": 0.06345, + "grad_norm": 0.0717713013291359, + "learning_rate": 3.0053894956148593e-05, + "loss": 0.0347, + "step": 92690 + }, + { + "epoch": 0.0635, + "grad_norm": 0.055683765560388565, + "learning_rate": 3.0049846558964995e-05, + "loss": 0.0335, + "step": 92700 + }, + { + "epoch": 0.06355, + "grad_norm": 0.05818936973810196, + "learning_rate": 3.0045798023720494e-05, + "loss": 0.0346, + "step": 92710 + }, + { + "epoch": 0.0636, + "grad_norm": 0.0628824457526207, + "learning_rate": 3.0041749350525772e-05, + "loss": 0.0346, + "step": 92720 + }, + { + "epoch": 0.06365, + "grad_norm": 0.07777944952249527, + "learning_rate": 3.0037700539491515e-05, + "loss": 0.0341, + "step": 92730 + }, + { + "epoch": 0.0637, + "grad_norm": 0.07659552246332169, + "learning_rate": 3.0033651590728417e-05, + "loss": 0.0346, + "step": 92740 + }, + { + "epoch": 0.06375, + "grad_norm": 0.06526902318000793, + "learning_rate": 3.002960250434717e-05, + "loss": 0.0329, + "step": 92750 + }, + { + "epoch": 0.0638, + "grad_norm": 0.07887840270996094, + "learning_rate": 3.0025553280458485e-05, + "loss": 0.0344, + "step": 92760 + }, + { + "epoch": 0.06385, + "grad_norm": 0.067435123026371, + "learning_rate": 3.0021503919173065e-05, + "loss": 0.0357, + "step": 92770 + }, + { + "epoch": 0.0639, + "grad_norm": 0.0651765689253807, + "learning_rate": 3.001745442060161e-05, + "loss": 0.0347, + "step": 92780 + }, + { + "epoch": 0.06395, + "grad_norm": 0.07981258630752563, + "learning_rate": 3.0013404784854838e-05, + "loss": 0.0339, + "step": 92790 + }, + { + "epoch": 0.064, + "grad_norm": 0.07509902119636536, + "learning_rate": 3.000935501204346e-05, + "loss": 0.0348, + "step": 92800 + }, + { + "epoch": 0.06405, + "grad_norm": 0.06714314222335815, + "learning_rate": 3.0005305102278204e-05, + "loss": 0.0354, + "step": 92810 + }, + { + "epoch": 0.0641, + "grad_norm": 0.07060767710208893, + "learning_rate": 3.000125505566978e-05, + "loss": 0.0349, + "step": 92820 + }, + { + "epoch": 0.06415, + "grad_norm": 0.07698815315961838, + "learning_rate": 2.999720487232892e-05, + "loss": 0.0336, + "step": 92830 + }, + { + "epoch": 0.0642, + "grad_norm": 0.08230160176753998, + "learning_rate": 2.9993154552366363e-05, + "loss": 0.036, + "step": 92840 + }, + { + "epoch": 0.06425, + "grad_norm": 0.08095083385705948, + "learning_rate": 2.9989104095892835e-05, + "loss": 0.0361, + "step": 92850 + }, + { + "epoch": 0.0643, + "grad_norm": 0.07354466617107391, + "learning_rate": 2.9985053503019078e-05, + "loss": 0.035, + "step": 92860 + }, + { + "epoch": 0.06435, + "grad_norm": 0.07451581209897995, + "learning_rate": 2.9981002773855825e-05, + "loss": 0.0341, + "step": 92870 + }, + { + "epoch": 0.0644, + "grad_norm": 0.08016319572925568, + "learning_rate": 2.9976951908513828e-05, + "loss": 0.0343, + "step": 92880 + }, + { + "epoch": 0.06445, + "grad_norm": 0.07381189614534378, + "learning_rate": 2.9972900907103835e-05, + "loss": 0.0339, + "step": 92890 + }, + { + "epoch": 0.0645, + "grad_norm": 0.06846525520086288, + "learning_rate": 2.9968849769736608e-05, + "loss": 0.0347, + "step": 92900 + }, + { + "epoch": 0.06455, + "grad_norm": 0.07587923109531403, + "learning_rate": 2.9964798496522883e-05, + "loss": 0.0343, + "step": 92910 + }, + { + "epoch": 0.0646, + "grad_norm": 0.07672179490327835, + "learning_rate": 2.9960747087573443e-05, + "loss": 0.0352, + "step": 92920 + }, + { + "epoch": 0.06465, + "grad_norm": 0.0883919820189476, + "learning_rate": 2.9956695542999036e-05, + "loss": 0.0349, + "step": 92930 + }, + { + "epoch": 0.0647, + "grad_norm": 0.07688490301370621, + "learning_rate": 2.9952643862910434e-05, + "loss": 0.035, + "step": 92940 + }, + { + "epoch": 0.06475, + "grad_norm": 0.07831470668315887, + "learning_rate": 2.9948592047418407e-05, + "loss": 0.0343, + "step": 92950 + }, + { + "epoch": 0.0648, + "grad_norm": 0.07938302308320999, + "learning_rate": 2.9944540096633737e-05, + "loss": 0.0354, + "step": 92960 + }, + { + "epoch": 0.06485, + "grad_norm": 0.07706853747367859, + "learning_rate": 2.99404880106672e-05, + "loss": 0.0363, + "step": 92970 + }, + { + "epoch": 0.0649, + "grad_norm": 0.08308210223913193, + "learning_rate": 2.9936435789629575e-05, + "loss": 0.0348, + "step": 92980 + }, + { + "epoch": 0.06495, + "grad_norm": 0.06706178188323975, + "learning_rate": 2.993238343363165e-05, + "loss": 0.0352, + "step": 92990 + }, + { + "epoch": 0.065, + "grad_norm": 0.06068303436040878, + "learning_rate": 2.992833094278421e-05, + "loss": 0.0355, + "step": 93000 + }, + { + "epoch": 0.06505, + "grad_norm": 0.11114989221096039, + "learning_rate": 2.9924278317198057e-05, + "loss": 0.0375, + "step": 93010 + }, + { + "epoch": 0.0651, + "grad_norm": 0.07859257608652115, + "learning_rate": 2.9920225556983987e-05, + "loss": 0.0354, + "step": 93020 + }, + { + "epoch": 0.06515, + "grad_norm": 0.0722615048289299, + "learning_rate": 2.99161726622528e-05, + "loss": 0.035, + "step": 93030 + }, + { + "epoch": 0.0652, + "grad_norm": 0.08284981548786163, + "learning_rate": 2.9912119633115295e-05, + "loss": 0.0358, + "step": 93040 + }, + { + "epoch": 0.06525, + "grad_norm": 0.07400976121425629, + "learning_rate": 2.990806646968229e-05, + "loss": 0.0369, + "step": 93050 + }, + { + "epoch": 0.0653, + "grad_norm": 0.07793234288692474, + "learning_rate": 2.990401317206458e-05, + "loss": 0.0363, + "step": 93060 + }, + { + "epoch": 0.06535, + "grad_norm": 0.09183067083358765, + "learning_rate": 2.9899959740373003e-05, + "loss": 0.0351, + "step": 93070 + }, + { + "epoch": 0.0654, + "grad_norm": 0.0798955112695694, + "learning_rate": 2.9895906174718363e-05, + "loss": 0.0356, + "step": 93080 + }, + { + "epoch": 0.06545, + "grad_norm": 0.07224183529615402, + "learning_rate": 2.989185247521149e-05, + "loss": 0.0356, + "step": 93090 + }, + { + "epoch": 0.0655, + "grad_norm": 0.07504131644964218, + "learning_rate": 2.9887798641963204e-05, + "loss": 0.0359, + "step": 93100 + }, + { + "epoch": 0.06555, + "grad_norm": 0.0735727921128273, + "learning_rate": 2.988374467508435e-05, + "loss": 0.0347, + "step": 93110 + }, + { + "epoch": 0.0656, + "grad_norm": 0.06349542737007141, + "learning_rate": 2.9879690574685743e-05, + "loss": 0.0345, + "step": 93120 + }, + { + "epoch": 0.06565, + "grad_norm": 0.08468075096607208, + "learning_rate": 2.9875636340878233e-05, + "loss": 0.0352, + "step": 93130 + }, + { + "epoch": 0.0657, + "grad_norm": 0.07736873626708984, + "learning_rate": 2.9871581973772646e-05, + "loss": 0.0354, + "step": 93140 + }, + { + "epoch": 0.06575, + "grad_norm": 0.12461365759372711, + "learning_rate": 2.986752747347985e-05, + "loss": 0.0351, + "step": 93150 + }, + { + "epoch": 0.0658, + "grad_norm": 0.10645054280757904, + "learning_rate": 2.9863472840110672e-05, + "loss": 0.035, + "step": 93160 + }, + { + "epoch": 0.06585, + "grad_norm": 0.09245552867650986, + "learning_rate": 2.9859418073775987e-05, + "loss": 0.0364, + "step": 93170 + }, + { + "epoch": 0.0659, + "grad_norm": 0.09452535957098007, + "learning_rate": 2.9855363174586627e-05, + "loss": 0.0355, + "step": 93180 + }, + { + "epoch": 0.06595, + "grad_norm": 0.10148017853498459, + "learning_rate": 2.9851308142653466e-05, + "loss": 0.0374, + "step": 93190 + }, + { + "epoch": 0.066, + "grad_norm": 0.08406191319227219, + "learning_rate": 2.984725297808736e-05, + "loss": 0.036, + "step": 93200 + }, + { + "epoch": 0.06605, + "grad_norm": 0.0741802453994751, + "learning_rate": 2.984319768099918e-05, + "loss": 0.0349, + "step": 93210 + }, + { + "epoch": 0.0661, + "grad_norm": 0.06948202103376389, + "learning_rate": 2.9839142251499798e-05, + "loss": 0.0352, + "step": 93220 + }, + { + "epoch": 0.06615, + "grad_norm": 0.09487444907426834, + "learning_rate": 2.983508668970008e-05, + "loss": 0.0348, + "step": 93230 + }, + { + "epoch": 0.0662, + "grad_norm": 0.07550612837076187, + "learning_rate": 2.983103099571091e-05, + "loss": 0.0351, + "step": 93240 + }, + { + "epoch": 0.06625, + "grad_norm": 0.07216605544090271, + "learning_rate": 2.9826975169643168e-05, + "loss": 0.0349, + "step": 93250 + }, + { + "epoch": 0.0663, + "grad_norm": 0.08113081008195877, + "learning_rate": 2.9822919211607737e-05, + "loss": 0.0381, + "step": 93260 + }, + { + "epoch": 0.06635, + "grad_norm": 0.08273231238126755, + "learning_rate": 2.9818863121715506e-05, + "loss": 0.0354, + "step": 93270 + }, + { + "epoch": 0.0664, + "grad_norm": 0.082130566239357, + "learning_rate": 2.981480690007737e-05, + "loss": 0.0372, + "step": 93280 + }, + { + "epoch": 0.06645, + "grad_norm": 0.09722493588924408, + "learning_rate": 2.9810750546804222e-05, + "loss": 0.0356, + "step": 93290 + }, + { + "epoch": 0.0665, + "grad_norm": 0.11571363359689713, + "learning_rate": 2.9806694062006963e-05, + "loss": 0.0393, + "step": 93300 + }, + { + "epoch": 0.06655, + "grad_norm": 0.09508045017719269, + "learning_rate": 2.980263744579649e-05, + "loss": 0.035, + "step": 93310 + }, + { + "epoch": 0.0666, + "grad_norm": 0.07783970981836319, + "learning_rate": 2.9798580698283718e-05, + "loss": 0.0375, + "step": 93320 + }, + { + "epoch": 0.06665, + "grad_norm": 0.07939450442790985, + "learning_rate": 2.9794523819579546e-05, + "loss": 0.0362, + "step": 93330 + }, + { + "epoch": 0.0667, + "grad_norm": 0.07583706825971603, + "learning_rate": 2.9790466809794898e-05, + "loss": 0.0353, + "step": 93340 + }, + { + "epoch": 0.06675, + "grad_norm": 0.09335724264383316, + "learning_rate": 2.9786409669040687e-05, + "loss": 0.036, + "step": 93350 + }, + { + "epoch": 0.0668, + "grad_norm": 0.08478909730911255, + "learning_rate": 2.978235239742784e-05, + "loss": 0.0368, + "step": 93360 + }, + { + "epoch": 0.06685, + "grad_norm": 0.09881918877363205, + "learning_rate": 2.9778294995067262e-05, + "loss": 0.0356, + "step": 93370 + }, + { + "epoch": 0.0669, + "grad_norm": 0.08156245946884155, + "learning_rate": 2.9774237462069905e-05, + "loss": 0.0354, + "step": 93380 + }, + { + "epoch": 0.06695, + "grad_norm": 0.09070413559675217, + "learning_rate": 2.9770179798546678e-05, + "loss": 0.0389, + "step": 93390 + }, + { + "epoch": 0.067, + "grad_norm": 0.08894050866365433, + "learning_rate": 2.9766122004608537e-05, + "loss": 0.0362, + "step": 93400 + }, + { + "epoch": 0.06705, + "grad_norm": 0.08819899708032608, + "learning_rate": 2.9762064080366407e-05, + "loss": 0.0354, + "step": 93410 + }, + { + "epoch": 0.0671, + "grad_norm": 0.11669158190488815, + "learning_rate": 2.975800602593124e-05, + "loss": 0.0399, + "step": 93420 + }, + { + "epoch": 0.06715, + "grad_norm": 0.10961344093084335, + "learning_rate": 2.975394784141397e-05, + "loss": 0.0352, + "step": 93430 + }, + { + "epoch": 0.0672, + "grad_norm": 0.09197092801332474, + "learning_rate": 2.9749889526925557e-05, + "loss": 0.0357, + "step": 93440 + }, + { + "epoch": 0.06725, + "grad_norm": 0.10645048320293427, + "learning_rate": 2.9745831082576948e-05, + "loss": 0.0349, + "step": 93450 + }, + { + "epoch": 0.0673, + "grad_norm": 0.10623519122600555, + "learning_rate": 2.9741772508479093e-05, + "loss": 0.0356, + "step": 93460 + }, + { + "epoch": 0.06735, + "grad_norm": 0.07466422021389008, + "learning_rate": 2.973771380474296e-05, + "loss": 0.0347, + "step": 93470 + }, + { + "epoch": 0.0674, + "grad_norm": 0.07325384020805359, + "learning_rate": 2.973365497147952e-05, + "loss": 0.0351, + "step": 93480 + }, + { + "epoch": 0.06745, + "grad_norm": 0.08341530710458755, + "learning_rate": 2.972959600879972e-05, + "loss": 0.0368, + "step": 93490 + }, + { + "epoch": 0.0675, + "grad_norm": 0.09049838036298752, + "learning_rate": 2.972553691681455e-05, + "loss": 0.0362, + "step": 93500 + }, + { + "epoch": 0.06755, + "grad_norm": 0.07634222507476807, + "learning_rate": 2.9721477695634977e-05, + "loss": 0.0336, + "step": 93510 + }, + { + "epoch": 0.0676, + "grad_norm": 0.07836073637008667, + "learning_rate": 2.9717418345371972e-05, + "loss": 0.0342, + "step": 93520 + }, + { + "epoch": 0.06765, + "grad_norm": 0.09737568348646164, + "learning_rate": 2.9713358866136526e-05, + "loss": 0.0347, + "step": 93530 + }, + { + "epoch": 0.0677, + "grad_norm": 0.07100161910057068, + "learning_rate": 2.9709299258039613e-05, + "loss": 0.0349, + "step": 93540 + }, + { + "epoch": 0.06775, + "grad_norm": 0.07358980178833008, + "learning_rate": 2.9705239521192235e-05, + "loss": 0.0332, + "step": 93550 + }, + { + "epoch": 0.0678, + "grad_norm": 0.07812168449163437, + "learning_rate": 2.9701179655705374e-05, + "loss": 0.0342, + "step": 93560 + }, + { + "epoch": 0.06785, + "grad_norm": 0.09706774353981018, + "learning_rate": 2.9697119661690032e-05, + "loss": 0.0364, + "step": 93570 + }, + { + "epoch": 0.0679, + "grad_norm": 0.08430415391921997, + "learning_rate": 2.969305953925719e-05, + "loss": 0.0342, + "step": 93580 + }, + { + "epoch": 0.06795, + "grad_norm": 0.0800105482339859, + "learning_rate": 2.968899928851787e-05, + "loss": 0.0342, + "step": 93590 + }, + { + "epoch": 0.068, + "grad_norm": 0.08123991638422012, + "learning_rate": 2.9684938909583073e-05, + "loss": 0.0337, + "step": 93600 + }, + { + "epoch": 0.06805, + "grad_norm": 0.09043265879154205, + "learning_rate": 2.968087840256381e-05, + "loss": 0.0341, + "step": 93610 + }, + { + "epoch": 0.0681, + "grad_norm": 0.08123215287923813, + "learning_rate": 2.9676817767571086e-05, + "loss": 0.0339, + "step": 93620 + }, + { + "epoch": 0.06815, + "grad_norm": 0.07657421380281448, + "learning_rate": 2.9672757004715924e-05, + "loss": 0.0345, + "step": 93630 + }, + { + "epoch": 0.0682, + "grad_norm": 0.07973584532737732, + "learning_rate": 2.9668696114109333e-05, + "loss": 0.0353, + "step": 93640 + }, + { + "epoch": 0.06825, + "grad_norm": 0.07709518820047379, + "learning_rate": 2.9664635095862347e-05, + "loss": 0.0348, + "step": 93650 + }, + { + "epoch": 0.0683, + "grad_norm": 0.08385272324085236, + "learning_rate": 2.966057395008599e-05, + "loss": 0.036, + "step": 93660 + }, + { + "epoch": 0.06835, + "grad_norm": 0.07511667162179947, + "learning_rate": 2.9656512676891295e-05, + "loss": 0.0344, + "step": 93670 + }, + { + "epoch": 0.0684, + "grad_norm": 0.07143174856901169, + "learning_rate": 2.9652451276389294e-05, + "loss": 0.0345, + "step": 93680 + }, + { + "epoch": 0.06845, + "grad_norm": 0.062331750988960266, + "learning_rate": 2.9648389748691025e-05, + "loss": 0.0344, + "step": 93690 + }, + { + "epoch": 0.0685, + "grad_norm": 0.06511224806308746, + "learning_rate": 2.964432809390752e-05, + "loss": 0.0338, + "step": 93700 + }, + { + "epoch": 0.06855, + "grad_norm": 0.0728011503815651, + "learning_rate": 2.9640266312149827e-05, + "loss": 0.0347, + "step": 93710 + }, + { + "epoch": 0.0686, + "grad_norm": 0.06844881922006607, + "learning_rate": 2.9636204403529e-05, + "loss": 0.0339, + "step": 93720 + }, + { + "epoch": 0.06865, + "grad_norm": 0.0771973580121994, + "learning_rate": 2.963214236815609e-05, + "loss": 0.0352, + "step": 93730 + }, + { + "epoch": 0.0687, + "grad_norm": 0.08221804350614548, + "learning_rate": 2.9628080206142145e-05, + "loss": 0.0363, + "step": 93740 + }, + { + "epoch": 0.06875, + "grad_norm": 0.07745331525802612, + "learning_rate": 2.9624017917598225e-05, + "loss": 0.0368, + "step": 93750 + }, + { + "epoch": 0.0688, + "grad_norm": 0.09254749119281769, + "learning_rate": 2.96199555026354e-05, + "loss": 0.0346, + "step": 93760 + }, + { + "epoch": 0.06885, + "grad_norm": 0.08585920929908752, + "learning_rate": 2.9615892961364716e-05, + "loss": 0.0337, + "step": 93770 + }, + { + "epoch": 0.0689, + "grad_norm": 0.06769262999296188, + "learning_rate": 2.9611830293897253e-05, + "loss": 0.034, + "step": 93780 + }, + { + "epoch": 0.06895, + "grad_norm": 0.07950049638748169, + "learning_rate": 2.9607767500344084e-05, + "loss": 0.0347, + "step": 93790 + }, + { + "epoch": 0.069, + "grad_norm": 0.07528354972600937, + "learning_rate": 2.9603704580816293e-05, + "loss": 0.0364, + "step": 93800 + }, + { + "epoch": 0.06905, + "grad_norm": 0.07296212017536163, + "learning_rate": 2.9599641535424938e-05, + "loss": 0.0343, + "step": 93810 + }, + { + "epoch": 0.0691, + "grad_norm": 0.07152281701564789, + "learning_rate": 2.9595578364281117e-05, + "loss": 0.0353, + "step": 93820 + }, + { + "epoch": 0.06915, + "grad_norm": 0.062432125210762024, + "learning_rate": 2.9591515067495906e-05, + "loss": 0.0345, + "step": 93830 + }, + { + "epoch": 0.0692, + "grad_norm": 0.09543795883655548, + "learning_rate": 2.95874516451804e-05, + "loss": 0.0356, + "step": 93840 + }, + { + "epoch": 0.06925, + "grad_norm": 0.08136715739965439, + "learning_rate": 2.958338809744568e-05, + "loss": 0.0353, + "step": 93850 + }, + { + "epoch": 0.0693, + "grad_norm": 0.09028245508670807, + "learning_rate": 2.9579324424402865e-05, + "loss": 0.0346, + "step": 93860 + }, + { + "epoch": 0.06935, + "grad_norm": 0.07663322985172272, + "learning_rate": 2.9575260626163037e-05, + "loss": 0.0346, + "step": 93870 + }, + { + "epoch": 0.0694, + "grad_norm": 0.07476460933685303, + "learning_rate": 2.95711967028373e-05, + "loss": 0.0359, + "step": 93880 + }, + { + "epoch": 0.06945, + "grad_norm": 0.09127458930015564, + "learning_rate": 2.9567132654536763e-05, + "loss": 0.0377, + "step": 93890 + }, + { + "epoch": 0.0695, + "grad_norm": 0.07893301546573639, + "learning_rate": 2.9563068481372535e-05, + "loss": 0.036, + "step": 93900 + }, + { + "epoch": 0.06955, + "grad_norm": 0.08074024319648743, + "learning_rate": 2.9559004183455726e-05, + "loss": 0.0361, + "step": 93910 + }, + { + "epoch": 0.0696, + "grad_norm": 0.07430429756641388, + "learning_rate": 2.955493976089746e-05, + "loss": 0.0353, + "step": 93920 + }, + { + "epoch": 0.06965, + "grad_norm": 0.08006362617015839, + "learning_rate": 2.955087521380885e-05, + "loss": 0.0364, + "step": 93930 + }, + { + "epoch": 0.0697, + "grad_norm": 0.07894095778465271, + "learning_rate": 2.9546810542301024e-05, + "loss": 0.0356, + "step": 93940 + }, + { + "epoch": 0.06975, + "grad_norm": 0.0702204704284668, + "learning_rate": 2.9542745746485102e-05, + "loss": 0.0353, + "step": 93950 + }, + { + "epoch": 0.0698, + "grad_norm": 0.07127805799245834, + "learning_rate": 2.953868082647222e-05, + "loss": 0.0352, + "step": 93960 + }, + { + "epoch": 0.06985, + "grad_norm": 0.07629313319921494, + "learning_rate": 2.953461578237351e-05, + "loss": 0.0348, + "step": 93970 + }, + { + "epoch": 0.0699, + "grad_norm": 0.08887936174869537, + "learning_rate": 2.9530550614300106e-05, + "loss": 0.036, + "step": 93980 + }, + { + "epoch": 0.06995, + "grad_norm": 0.0876452699303627, + "learning_rate": 2.952648532236315e-05, + "loss": 0.036, + "step": 93990 + }, + { + "epoch": 0.07, + "grad_norm": 0.07844098657369614, + "learning_rate": 2.9522419906673786e-05, + "loss": 0.0353, + "step": 94000 + }, + { + "epoch": 0.07005, + "grad_norm": 0.09425082057714462, + "learning_rate": 2.9518354367343166e-05, + "loss": 0.0341, + "step": 94010 + }, + { + "epoch": 0.0701, + "grad_norm": 0.09124142676591873, + "learning_rate": 2.951428870448243e-05, + "loss": 0.0352, + "step": 94020 + }, + { + "epoch": 0.07015, + "grad_norm": 0.07855840772390366, + "learning_rate": 2.9510222918202733e-05, + "loss": 0.0352, + "step": 94030 + }, + { + "epoch": 0.0702, + "grad_norm": 0.0941929966211319, + "learning_rate": 2.9506157008615244e-05, + "loss": 0.0348, + "step": 94040 + }, + { + "epoch": 0.07025, + "grad_norm": 0.087624192237854, + "learning_rate": 2.9502090975831116e-05, + "loss": 0.0343, + "step": 94050 + }, + { + "epoch": 0.0703, + "grad_norm": 0.06899592280387878, + "learning_rate": 2.94980248199615e-05, + "loss": 0.0335, + "step": 94060 + }, + { + "epoch": 0.07035, + "grad_norm": 0.08209431916475296, + "learning_rate": 2.949395854111759e-05, + "loss": 0.0353, + "step": 94070 + }, + { + "epoch": 0.0704, + "grad_norm": 0.08980611711740494, + "learning_rate": 2.9489892139410536e-05, + "loss": 0.0346, + "step": 94080 + }, + { + "epoch": 0.07045, + "grad_norm": 0.07221923768520355, + "learning_rate": 2.9485825614951513e-05, + "loss": 0.0355, + "step": 94090 + }, + { + "epoch": 0.0705, + "grad_norm": 0.08524159342050552, + "learning_rate": 2.9481758967851702e-05, + "loss": 0.0345, + "step": 94100 + }, + { + "epoch": 0.07055, + "grad_norm": 0.0812133252620697, + "learning_rate": 2.9477692198222297e-05, + "loss": 0.0348, + "step": 94110 + }, + { + "epoch": 0.0706, + "grad_norm": 0.093808114528656, + "learning_rate": 2.947362530617446e-05, + "loss": 0.0375, + "step": 94120 + }, + { + "epoch": 0.07065, + "grad_norm": 0.08137372881174088, + "learning_rate": 2.946955829181939e-05, + "loss": 0.0352, + "step": 94130 + }, + { + "epoch": 0.0707, + "grad_norm": 0.08073768019676208, + "learning_rate": 2.9465491155268278e-05, + "loss": 0.0368, + "step": 94140 + }, + { + "epoch": 0.07075, + "grad_norm": 0.09060634672641754, + "learning_rate": 2.9461423896632312e-05, + "loss": 0.0348, + "step": 94150 + }, + { + "epoch": 0.0708, + "grad_norm": 0.08813582360744476, + "learning_rate": 2.9457356516022683e-05, + "loss": 0.0352, + "step": 94160 + }, + { + "epoch": 0.07085, + "grad_norm": 0.08457762002944946, + "learning_rate": 2.9453289013550618e-05, + "loss": 0.0381, + "step": 94170 + }, + { + "epoch": 0.0709, + "grad_norm": 0.072766974568367, + "learning_rate": 2.9449221389327297e-05, + "loss": 0.0359, + "step": 94180 + }, + { + "epoch": 0.07095, + "grad_norm": 0.0687379390001297, + "learning_rate": 2.9445153643463942e-05, + "loss": 0.0347, + "step": 94190 + }, + { + "epoch": 0.071, + "grad_norm": 0.076286181807518, + "learning_rate": 2.9441085776071743e-05, + "loss": 0.0363, + "step": 94200 + }, + { + "epoch": 0.07105, + "grad_norm": 0.07225333899259567, + "learning_rate": 2.9437017787261935e-05, + "loss": 0.0349, + "step": 94210 + }, + { + "epoch": 0.0711, + "grad_norm": 0.06537607312202454, + "learning_rate": 2.9432949677145722e-05, + "loss": 0.0355, + "step": 94220 + }, + { + "epoch": 0.07115, + "grad_norm": 0.06774396449327469, + "learning_rate": 2.942888144583434e-05, + "loss": 0.0341, + "step": 94230 + }, + { + "epoch": 0.0712, + "grad_norm": 0.07002709805965424, + "learning_rate": 2.9424813093438995e-05, + "loss": 0.0349, + "step": 94240 + }, + { + "epoch": 0.07125, + "grad_norm": 0.09582309424877167, + "learning_rate": 2.9420744620070928e-05, + "loss": 0.0372, + "step": 94250 + }, + { + "epoch": 0.0713, + "grad_norm": 0.09352750331163406, + "learning_rate": 2.9416676025841363e-05, + "loss": 0.0375, + "step": 94260 + }, + { + "epoch": 0.07135, + "grad_norm": 0.08576199412345886, + "learning_rate": 2.9412607310861528e-05, + "loss": 0.0347, + "step": 94270 + }, + { + "epoch": 0.0714, + "grad_norm": 0.08172550797462463, + "learning_rate": 2.9408538475242674e-05, + "loss": 0.0378, + "step": 94280 + }, + { + "epoch": 0.07145, + "grad_norm": 0.07877188175916672, + "learning_rate": 2.940446951909603e-05, + "loss": 0.036, + "step": 94290 + }, + { + "epoch": 0.0715, + "grad_norm": 0.08867704123258591, + "learning_rate": 2.940040044253285e-05, + "loss": 0.0361, + "step": 94300 + }, + { + "epoch": 0.07155, + "grad_norm": 0.064987413585186, + "learning_rate": 2.9396331245664366e-05, + "loss": 0.0356, + "step": 94310 + }, + { + "epoch": 0.0716, + "grad_norm": 0.0645001232624054, + "learning_rate": 2.939226192860185e-05, + "loss": 0.0354, + "step": 94320 + }, + { + "epoch": 0.07165, + "grad_norm": 0.06392812728881836, + "learning_rate": 2.9388192491456535e-05, + "loss": 0.0356, + "step": 94330 + }, + { + "epoch": 0.0717, + "grad_norm": 0.07957509905099869, + "learning_rate": 2.9384122934339685e-05, + "loss": 0.035, + "step": 94340 + }, + { + "epoch": 0.07175, + "grad_norm": 0.066752128303051, + "learning_rate": 2.938005325736256e-05, + "loss": 0.0356, + "step": 94350 + }, + { + "epoch": 0.0718, + "grad_norm": 0.09556171298027039, + "learning_rate": 2.937598346063643e-05, + "loss": 0.0353, + "step": 94360 + }, + { + "epoch": 0.07185, + "grad_norm": 0.06822418421506882, + "learning_rate": 2.937191354427255e-05, + "loss": 0.0355, + "step": 94370 + }, + { + "epoch": 0.0719, + "grad_norm": 0.06926630437374115, + "learning_rate": 2.9367843508382203e-05, + "loss": 0.0355, + "step": 94380 + }, + { + "epoch": 0.07195, + "grad_norm": 0.0884789526462555, + "learning_rate": 2.9363773353076652e-05, + "loss": 0.036, + "step": 94390 + }, + { + "epoch": 0.072, + "grad_norm": 0.07868551462888718, + "learning_rate": 2.9359703078467178e-05, + "loss": 0.0386, + "step": 94400 + }, + { + "epoch": 0.07205, + "grad_norm": 0.07347268611192703, + "learning_rate": 2.935563268466505e-05, + "loss": 0.0358, + "step": 94410 + }, + { + "epoch": 0.0721, + "grad_norm": 0.09271713346242905, + "learning_rate": 2.9351562171781576e-05, + "loss": 0.0364, + "step": 94420 + }, + { + "epoch": 0.07215, + "grad_norm": 0.08625374734401703, + "learning_rate": 2.934749153992802e-05, + "loss": 0.0349, + "step": 94430 + }, + { + "epoch": 0.0722, + "grad_norm": 0.08623244613409042, + "learning_rate": 2.934342078921568e-05, + "loss": 0.035, + "step": 94440 + }, + { + "epoch": 0.07225, + "grad_norm": 0.09188719093799591, + "learning_rate": 2.933934991975584e-05, + "loss": 0.0355, + "step": 94450 + }, + { + "epoch": 0.0723, + "grad_norm": 0.0780775398015976, + "learning_rate": 2.933527893165981e-05, + "loss": 0.0366, + "step": 94460 + }, + { + "epoch": 0.07235, + "grad_norm": 0.07540776580572128, + "learning_rate": 2.933120782503888e-05, + "loss": 0.0368, + "step": 94470 + }, + { + "epoch": 0.0724, + "grad_norm": 0.0667237788438797, + "learning_rate": 2.932713660000436e-05, + "loss": 0.0373, + "step": 94480 + }, + { + "epoch": 0.07245, + "grad_norm": 0.08438503742218018, + "learning_rate": 2.932306525666755e-05, + "loss": 0.0376, + "step": 94490 + }, + { + "epoch": 0.0725, + "grad_norm": 0.06112914904952049, + "learning_rate": 2.9318993795139754e-05, + "loss": 0.036, + "step": 94500 + }, + { + "epoch": 0.07255, + "grad_norm": 0.10258391499519348, + "learning_rate": 2.9314922215532304e-05, + "loss": 0.0371, + "step": 94510 + }, + { + "epoch": 0.0726, + "grad_norm": 0.09032279998064041, + "learning_rate": 2.931085051795649e-05, + "loss": 0.0382, + "step": 94520 + }, + { + "epoch": 0.07265, + "grad_norm": 0.09368538856506348, + "learning_rate": 2.9306778702523646e-05, + "loss": 0.0359, + "step": 94530 + }, + { + "epoch": 0.0727, + "grad_norm": 0.09324487298727036, + "learning_rate": 2.9302706769345077e-05, + "loss": 0.0349, + "step": 94540 + }, + { + "epoch": 0.07275, + "grad_norm": 0.08473429083824158, + "learning_rate": 2.929863471853214e-05, + "loss": 0.0342, + "step": 94550 + }, + { + "epoch": 0.0728, + "grad_norm": 0.08495772629976273, + "learning_rate": 2.9294562550196138e-05, + "loss": 0.0363, + "step": 94560 + }, + { + "epoch": 0.07285, + "grad_norm": 0.10111489146947861, + "learning_rate": 2.9290490264448412e-05, + "loss": 0.0355, + "step": 94570 + }, + { + "epoch": 0.0729, + "grad_norm": 0.07820268720388412, + "learning_rate": 2.928641786140029e-05, + "loss": 0.0388, + "step": 94580 + }, + { + "epoch": 0.07295, + "grad_norm": 0.08622400462627411, + "learning_rate": 2.9282345341163118e-05, + "loss": 0.0351, + "step": 94590 + }, + { + "epoch": 0.073, + "grad_norm": 0.08051776885986328, + "learning_rate": 2.9278272703848225e-05, + "loss": 0.037, + "step": 94600 + }, + { + "epoch": 0.07305, + "grad_norm": 0.08742921054363251, + "learning_rate": 2.9274199949566972e-05, + "loss": 0.0373, + "step": 94610 + }, + { + "epoch": 0.0731, + "grad_norm": 0.09345852583646774, + "learning_rate": 2.9270127078430694e-05, + "loss": 0.0367, + "step": 94620 + }, + { + "epoch": 0.07315, + "grad_norm": 0.0913676917552948, + "learning_rate": 2.9266054090550748e-05, + "loss": 0.0355, + "step": 94630 + }, + { + "epoch": 0.0732, + "grad_norm": 0.08236590772867203, + "learning_rate": 2.9261980986038485e-05, + "loss": 0.0362, + "step": 94640 + }, + { + "epoch": 0.07325, + "grad_norm": 0.07883656024932861, + "learning_rate": 2.9257907765005266e-05, + "loss": 0.0338, + "step": 94650 + }, + { + "epoch": 0.0733, + "grad_norm": 0.08704619109630585, + "learning_rate": 2.9253834427562442e-05, + "loss": 0.0361, + "step": 94660 + }, + { + "epoch": 0.07335, + "grad_norm": 0.10479659587144852, + "learning_rate": 2.9249760973821382e-05, + "loss": 0.0359, + "step": 94670 + }, + { + "epoch": 0.0734, + "grad_norm": 0.08075715601444244, + "learning_rate": 2.9245687403893456e-05, + "loss": 0.0349, + "step": 94680 + }, + { + "epoch": 0.07345, + "grad_norm": 0.07828488945960999, + "learning_rate": 2.924161371789004e-05, + "loss": 0.0357, + "step": 94690 + }, + { + "epoch": 0.0735, + "grad_norm": 0.07711922377347946, + "learning_rate": 2.9237539915922492e-05, + "loss": 0.0352, + "step": 94700 + }, + { + "epoch": 0.07355, + "grad_norm": 0.07504051923751831, + "learning_rate": 2.923346599810219e-05, + "loss": 0.0351, + "step": 94710 + }, + { + "epoch": 0.0736, + "grad_norm": 0.08983557671308517, + "learning_rate": 2.9229391964540524e-05, + "loss": 0.0357, + "step": 94720 + }, + { + "epoch": 0.07365, + "grad_norm": 0.07545730471611023, + "learning_rate": 2.922531781534887e-05, + "loss": 0.0347, + "step": 94730 + }, + { + "epoch": 0.0737, + "grad_norm": 0.08545920997858047, + "learning_rate": 2.922124355063861e-05, + "loss": 0.0351, + "step": 94740 + }, + { + "epoch": 0.07375, + "grad_norm": 0.07113443315029144, + "learning_rate": 2.9217169170521136e-05, + "loss": 0.034, + "step": 94750 + }, + { + "epoch": 0.0738, + "grad_norm": 0.09056176990270615, + "learning_rate": 2.9213094675107848e-05, + "loss": 0.0353, + "step": 94760 + }, + { + "epoch": 0.07385, + "grad_norm": 0.07598128914833069, + "learning_rate": 2.9209020064510133e-05, + "loss": 0.0354, + "step": 94770 + }, + { + "epoch": 0.0739, + "grad_norm": 0.08315098285675049, + "learning_rate": 2.920494533883939e-05, + "loss": 0.0355, + "step": 94780 + }, + { + "epoch": 0.07395, + "grad_norm": 0.0804397314786911, + "learning_rate": 2.9200870498207017e-05, + "loss": 0.0394, + "step": 94790 + }, + { + "epoch": 0.074, + "grad_norm": 0.07506150007247925, + "learning_rate": 2.9196795542724432e-05, + "loss": 0.0356, + "step": 94800 + }, + { + "epoch": 0.07405, + "grad_norm": 0.08508822321891785, + "learning_rate": 2.9192720472503022e-05, + "loss": 0.0388, + "step": 94810 + }, + { + "epoch": 0.0741, + "grad_norm": 0.08116913586854935, + "learning_rate": 2.9188645287654222e-05, + "loss": 0.0367, + "step": 94820 + }, + { + "epoch": 0.07415, + "grad_norm": 0.0697377398610115, + "learning_rate": 2.9184569988289424e-05, + "loss": 0.0362, + "step": 94830 + }, + { + "epoch": 0.0742, + "grad_norm": 0.07976461946964264, + "learning_rate": 2.9180494574520063e-05, + "loss": 0.0365, + "step": 94840 + }, + { + "epoch": 0.07425, + "grad_norm": 0.07997091114521027, + "learning_rate": 2.9176419046457544e-05, + "loss": 0.0373, + "step": 94850 + }, + { + "epoch": 0.0743, + "grad_norm": 0.07684572786092758, + "learning_rate": 2.9172343404213294e-05, + "loss": 0.0352, + "step": 94860 + }, + { + "epoch": 0.07435, + "grad_norm": 0.0832042247056961, + "learning_rate": 2.9168267647898743e-05, + "loss": 0.0345, + "step": 94870 + }, + { + "epoch": 0.0744, + "grad_norm": 0.08436644077301025, + "learning_rate": 2.9164191777625326e-05, + "loss": 0.0348, + "step": 94880 + }, + { + "epoch": 0.07445, + "grad_norm": 0.08339640498161316, + "learning_rate": 2.9160115793504473e-05, + "loss": 0.0376, + "step": 94890 + }, + { + "epoch": 0.0745, + "grad_norm": 0.08925966173410416, + "learning_rate": 2.9156039695647614e-05, + "loss": 0.0372, + "step": 94900 + }, + { + "epoch": 0.07455, + "grad_norm": 0.07819242775440216, + "learning_rate": 2.9151963484166188e-05, + "loss": 0.035, + "step": 94910 + }, + { + "epoch": 0.0746, + "grad_norm": 0.08378436416387558, + "learning_rate": 2.914788715917164e-05, + "loss": 0.0352, + "step": 94920 + }, + { + "epoch": 0.07465, + "grad_norm": 0.07194891571998596, + "learning_rate": 2.9143810720775417e-05, + "loss": 0.0347, + "step": 94930 + }, + { + "epoch": 0.0747, + "grad_norm": 0.0768907442688942, + "learning_rate": 2.9139734169088974e-05, + "loss": 0.0354, + "step": 94940 + }, + { + "epoch": 0.07475, + "grad_norm": 0.08474749326705933, + "learning_rate": 2.913565750422374e-05, + "loss": 0.0358, + "step": 94950 + }, + { + "epoch": 0.0748, + "grad_norm": 0.07622528076171875, + "learning_rate": 2.9131580726291192e-05, + "loss": 0.0364, + "step": 94960 + }, + { + "epoch": 0.07485, + "grad_norm": 0.06749384105205536, + "learning_rate": 2.9127503835402782e-05, + "loss": 0.0358, + "step": 94970 + }, + { + "epoch": 0.0749, + "grad_norm": 0.07656152546405792, + "learning_rate": 2.912342683166996e-05, + "loss": 0.0345, + "step": 94980 + }, + { + "epoch": 0.07495, + "grad_norm": 0.07424613833427429, + "learning_rate": 2.9119349715204197e-05, + "loss": 0.035, + "step": 94990 + }, + { + "epoch": 0.075, + "grad_norm": 0.06224847212433815, + "learning_rate": 2.911527248611696e-05, + "loss": 0.0384, + "step": 95000 + }, + { + "epoch": 0.07505, + "grad_norm": 0.08580512553453445, + "learning_rate": 2.9111195144519728e-05, + "loss": 0.0376, + "step": 95010 + }, + { + "epoch": 0.0751, + "grad_norm": 0.08321086317300797, + "learning_rate": 2.9107117690523957e-05, + "loss": 0.0353, + "step": 95020 + }, + { + "epoch": 0.07515, + "grad_norm": 0.08063076436519623, + "learning_rate": 2.9103040124241138e-05, + "loss": 0.0375, + "step": 95030 + }, + { + "epoch": 0.0752, + "grad_norm": 0.08435116708278656, + "learning_rate": 2.909896244578274e-05, + "loss": 0.0363, + "step": 95040 + }, + { + "epoch": 0.07525, + "grad_norm": 0.07457546889781952, + "learning_rate": 2.9094884655260247e-05, + "loss": 0.0357, + "step": 95050 + }, + { + "epoch": 0.0753, + "grad_norm": 0.07458683103322983, + "learning_rate": 2.9090806752785142e-05, + "loss": 0.0358, + "step": 95060 + }, + { + "epoch": 0.07535, + "grad_norm": 0.07121626287698746, + "learning_rate": 2.9086728738468928e-05, + "loss": 0.0338, + "step": 95070 + }, + { + "epoch": 0.0754, + "grad_norm": 0.0680113285779953, + "learning_rate": 2.9082650612423078e-05, + "loss": 0.036, + "step": 95080 + }, + { + "epoch": 0.07545, + "grad_norm": 0.07844933867454529, + "learning_rate": 2.90785723747591e-05, + "loss": 0.0351, + "step": 95090 + }, + { + "epoch": 0.0755, + "grad_norm": 0.08018365502357483, + "learning_rate": 2.9074494025588477e-05, + "loss": 0.0342, + "step": 95100 + }, + { + "epoch": 0.07555, + "grad_norm": 0.11203812807798386, + "learning_rate": 2.9070415565022722e-05, + "loss": 0.0397, + "step": 95110 + }, + { + "epoch": 0.0756, + "grad_norm": 0.1107054129242897, + "learning_rate": 2.906633699317334e-05, + "loss": 0.0356, + "step": 95120 + }, + { + "epoch": 0.07565, + "grad_norm": 0.11537740379571915, + "learning_rate": 2.906225831015183e-05, + "loss": 0.0365, + "step": 95130 + }, + { + "epoch": 0.0757, + "grad_norm": 0.08692065626382828, + "learning_rate": 2.9058179516069695e-05, + "loss": 0.0357, + "step": 95140 + }, + { + "epoch": 0.07575, + "grad_norm": 0.09226984530687332, + "learning_rate": 2.9054100611038472e-05, + "loss": 0.0355, + "step": 95150 + }, + { + "epoch": 0.0758, + "grad_norm": 0.09094180911779404, + "learning_rate": 2.9050021595169647e-05, + "loss": 0.0361, + "step": 95160 + }, + { + "epoch": 0.07585, + "grad_norm": 0.08938279002904892, + "learning_rate": 2.904594246857476e-05, + "loss": 0.0364, + "step": 95170 + }, + { + "epoch": 0.0759, + "grad_norm": 0.08622892200946808, + "learning_rate": 2.9041863231365318e-05, + "loss": 0.0353, + "step": 95180 + }, + { + "epoch": 0.07595, + "grad_norm": 0.08137310296297073, + "learning_rate": 2.903778388365287e-05, + "loss": 0.0361, + "step": 95190 + }, + { + "epoch": 0.076, + "grad_norm": 0.07452932745218277, + "learning_rate": 2.9033704425548913e-05, + "loss": 0.0361, + "step": 95200 + }, + { + "epoch": 0.07605, + "grad_norm": 0.0670681893825531, + "learning_rate": 2.9029624857164996e-05, + "loss": 0.0355, + "step": 95210 + }, + { + "epoch": 0.0761, + "grad_norm": 0.09448492527008057, + "learning_rate": 2.9025545178612657e-05, + "loss": 0.0381, + "step": 95220 + }, + { + "epoch": 0.07615, + "grad_norm": 0.060585349798202515, + "learning_rate": 2.9021465390003416e-05, + "loss": 0.0369, + "step": 95230 + }, + { + "epoch": 0.0762, + "grad_norm": 0.07551992684602737, + "learning_rate": 2.9017385491448824e-05, + "loss": 0.0389, + "step": 95240 + }, + { + "epoch": 0.07625, + "grad_norm": 0.07179474830627441, + "learning_rate": 2.9013305483060416e-05, + "loss": 0.036, + "step": 95250 + }, + { + "epoch": 0.0763, + "grad_norm": 0.07472289353609085, + "learning_rate": 2.9009225364949754e-05, + "loss": 0.039, + "step": 95260 + }, + { + "epoch": 0.07635, + "grad_norm": 0.07139992713928223, + "learning_rate": 2.9005145137228375e-05, + "loss": 0.0362, + "step": 95270 + }, + { + "epoch": 0.0764, + "grad_norm": 0.07775798439979553, + "learning_rate": 2.9001064800007834e-05, + "loss": 0.0392, + "step": 95280 + }, + { + "epoch": 0.07645, + "grad_norm": 0.08753141015768051, + "learning_rate": 2.8996984353399682e-05, + "loss": 0.0374, + "step": 95290 + }, + { + "epoch": 0.0765, + "grad_norm": 0.09571100771427155, + "learning_rate": 2.8992903797515475e-05, + "loss": 0.0362, + "step": 95300 + }, + { + "epoch": 0.07655, + "grad_norm": 0.061570536345243454, + "learning_rate": 2.898882313246678e-05, + "loss": 0.0372, + "step": 95310 + }, + { + "epoch": 0.0766, + "grad_norm": 0.06046005338430405, + "learning_rate": 2.8984742358365165e-05, + "loss": 0.0351, + "step": 95320 + }, + { + "epoch": 0.07665, + "grad_norm": 0.0736611932516098, + "learning_rate": 2.8980661475322186e-05, + "loss": 0.0375, + "step": 95330 + }, + { + "epoch": 0.0767, + "grad_norm": 0.08278156071901321, + "learning_rate": 2.8976580483449423e-05, + "loss": 0.0366, + "step": 95340 + }, + { + "epoch": 0.07675, + "grad_norm": 0.0938616693019867, + "learning_rate": 2.8972499382858433e-05, + "loss": 0.0361, + "step": 95350 + }, + { + "epoch": 0.0768, + "grad_norm": 0.07551280409097672, + "learning_rate": 2.8968418173660817e-05, + "loss": 0.0366, + "step": 95360 + }, + { + "epoch": 0.07685, + "grad_norm": 0.06807341426610947, + "learning_rate": 2.8964336855968123e-05, + "loss": 0.0379, + "step": 95370 + }, + { + "epoch": 0.0769, + "grad_norm": 0.0739855021238327, + "learning_rate": 2.8960255429891957e-05, + "loss": 0.0379, + "step": 95380 + }, + { + "epoch": 0.07695, + "grad_norm": 0.09074331820011139, + "learning_rate": 2.895617389554389e-05, + "loss": 0.0351, + "step": 95390 + }, + { + "epoch": 0.077, + "grad_norm": 0.07602587342262268, + "learning_rate": 2.8952092253035523e-05, + "loss": 0.0368, + "step": 95400 + }, + { + "epoch": 0.07705, + "grad_norm": 0.08063099533319473, + "learning_rate": 2.8948010502478435e-05, + "loss": 0.0365, + "step": 95410 + }, + { + "epoch": 0.0771, + "grad_norm": 0.08302353322505951, + "learning_rate": 2.894392864398422e-05, + "loss": 0.0365, + "step": 95420 + }, + { + "epoch": 0.07715, + "grad_norm": 0.08162595331668854, + "learning_rate": 2.8939846677664477e-05, + "loss": 0.0366, + "step": 95430 + }, + { + "epoch": 0.0772, + "grad_norm": 0.08327838033437729, + "learning_rate": 2.8935764603630816e-05, + "loss": 0.0355, + "step": 95440 + }, + { + "epoch": 0.07725, + "grad_norm": 0.12089723348617554, + "learning_rate": 2.893168242199482e-05, + "loss": 0.0362, + "step": 95450 + }, + { + "epoch": 0.0773, + "grad_norm": 0.0806155577301979, + "learning_rate": 2.8927600132868106e-05, + "loss": 0.0361, + "step": 95460 + }, + { + "epoch": 0.07735, + "grad_norm": 0.08697088062763214, + "learning_rate": 2.8923517736362284e-05, + "loss": 0.0365, + "step": 95470 + }, + { + "epoch": 0.0774, + "grad_norm": 0.07858090102672577, + "learning_rate": 2.8919435232588954e-05, + "loss": 0.0365, + "step": 95480 + }, + { + "epoch": 0.07745, + "grad_norm": 0.0615830160677433, + "learning_rate": 2.8915352621659743e-05, + "loss": 0.0345, + "step": 95490 + }, + { + "epoch": 0.0775, + "grad_norm": 0.08191350847482681, + "learning_rate": 2.8911269903686255e-05, + "loss": 0.036, + "step": 95500 + }, + { + "epoch": 0.07755, + "grad_norm": 0.06817498803138733, + "learning_rate": 2.890718707878013e-05, + "loss": 0.0351, + "step": 95510 + }, + { + "epoch": 0.0776, + "grad_norm": 0.0775030255317688, + "learning_rate": 2.890310414705297e-05, + "loss": 0.0354, + "step": 95520 + }, + { + "epoch": 0.07765, + "grad_norm": 0.10623601824045181, + "learning_rate": 2.889902110861641e-05, + "loss": 0.0369, + "step": 95530 + }, + { + "epoch": 0.0777, + "grad_norm": 0.08662627637386322, + "learning_rate": 2.8894937963582073e-05, + "loss": 0.039, + "step": 95540 + }, + { + "epoch": 0.07775, + "grad_norm": 0.08613138645887375, + "learning_rate": 2.8890854712061604e-05, + "loss": 0.036, + "step": 95550 + }, + { + "epoch": 0.0778, + "grad_norm": 0.06717013567686081, + "learning_rate": 2.888677135416662e-05, + "loss": 0.0351, + "step": 95560 + }, + { + "epoch": 0.07785, + "grad_norm": 0.09230372309684753, + "learning_rate": 2.8882687890008775e-05, + "loss": 0.0356, + "step": 95570 + }, + { + "epoch": 0.0779, + "grad_norm": 0.11337987333536148, + "learning_rate": 2.8878604319699693e-05, + "loss": 0.0383, + "step": 95580 + }, + { + "epoch": 0.07795, + "grad_norm": 0.10768648236989975, + "learning_rate": 2.887452064335103e-05, + "loss": 0.0371, + "step": 95590 + }, + { + "epoch": 0.078, + "grad_norm": 0.0736958459019661, + "learning_rate": 2.8870436861074435e-05, + "loss": 0.0359, + "step": 95600 + }, + { + "epoch": 0.07805, + "grad_norm": 0.07899592816829681, + "learning_rate": 2.8866352972981546e-05, + "loss": 0.0354, + "step": 95610 + }, + { + "epoch": 0.0781, + "grad_norm": 0.06220125034451485, + "learning_rate": 2.8862268979184015e-05, + "loss": 0.0349, + "step": 95620 + }, + { + "epoch": 0.07815, + "grad_norm": 0.10204660147428513, + "learning_rate": 2.8858184879793506e-05, + "loss": 0.0369, + "step": 95630 + }, + { + "epoch": 0.0782, + "grad_norm": 0.08213819563388824, + "learning_rate": 2.885410067492167e-05, + "loss": 0.0346, + "step": 95640 + }, + { + "epoch": 0.07825, + "grad_norm": 0.08925966173410416, + "learning_rate": 2.8850016364680173e-05, + "loss": 0.0365, + "step": 95650 + }, + { + "epoch": 0.0783, + "grad_norm": 0.07410264760255814, + "learning_rate": 2.8845931949180664e-05, + "loss": 0.0348, + "step": 95660 + }, + { + "epoch": 0.07835, + "grad_norm": 0.08311299234628677, + "learning_rate": 2.884184742853483e-05, + "loss": 0.036, + "step": 95670 + }, + { + "epoch": 0.0784, + "grad_norm": 0.07730239629745483, + "learning_rate": 2.8837762802854324e-05, + "loss": 0.036, + "step": 95680 + }, + { + "epoch": 0.07845, + "grad_norm": 0.07230732589960098, + "learning_rate": 2.8833678072250838e-05, + "loss": 0.0372, + "step": 95690 + }, + { + "epoch": 0.0785, + "grad_norm": 0.06986280530691147, + "learning_rate": 2.8829593236836016e-05, + "loss": 0.0355, + "step": 95700 + }, + { + "epoch": 0.07855, + "grad_norm": 0.08273430913686752, + "learning_rate": 2.8825508296721566e-05, + "loss": 0.0369, + "step": 95710 + }, + { + "epoch": 0.0786, + "grad_norm": 0.0761200562119484, + "learning_rate": 2.8821423252019154e-05, + "loss": 0.0353, + "step": 95720 + }, + { + "epoch": 0.07865, + "grad_norm": 0.07793602347373962, + "learning_rate": 2.8817338102840462e-05, + "loss": 0.035, + "step": 95730 + }, + { + "epoch": 0.0787, + "grad_norm": 0.06937800347805023, + "learning_rate": 2.881325284929719e-05, + "loss": 0.0339, + "step": 95740 + }, + { + "epoch": 0.07875, + "grad_norm": 0.07704199850559235, + "learning_rate": 2.8809167491501003e-05, + "loss": 0.0345, + "step": 95750 + }, + { + "epoch": 0.0788, + "grad_norm": 0.09481144696474075, + "learning_rate": 2.880508202956362e-05, + "loss": 0.0352, + "step": 95760 + }, + { + "epoch": 0.07885, + "grad_norm": 0.06698376685380936, + "learning_rate": 2.8800996463596717e-05, + "loss": 0.0359, + "step": 95770 + }, + { + "epoch": 0.0789, + "grad_norm": 0.06863502413034439, + "learning_rate": 2.8796910793712006e-05, + "loss": 0.0343, + "step": 95780 + }, + { + "epoch": 0.07895, + "grad_norm": 0.06124764680862427, + "learning_rate": 2.8792825020021174e-05, + "loss": 0.0341, + "step": 95790 + }, + { + "epoch": 0.079, + "grad_norm": 0.06939634680747986, + "learning_rate": 2.8788739142635935e-05, + "loss": 0.037, + "step": 95800 + }, + { + "epoch": 0.07905, + "grad_norm": 0.07363869994878769, + "learning_rate": 2.878465316166798e-05, + "loss": 0.0358, + "step": 95810 + }, + { + "epoch": 0.0791, + "grad_norm": 0.06982981413602829, + "learning_rate": 2.8780567077229042e-05, + "loss": 0.0371, + "step": 95820 + }, + { + "epoch": 0.07915, + "grad_norm": 0.07453575730323792, + "learning_rate": 2.8776480889430812e-05, + "loss": 0.0357, + "step": 95830 + }, + { + "epoch": 0.0792, + "grad_norm": 0.0722508579492569, + "learning_rate": 2.8772394598385022e-05, + "loss": 0.0352, + "step": 95840 + }, + { + "epoch": 0.07925, + "grad_norm": 0.06787462532520294, + "learning_rate": 2.8768308204203376e-05, + "loss": 0.0354, + "step": 95850 + }, + { + "epoch": 0.0793, + "grad_norm": 0.06790974736213684, + "learning_rate": 2.87642217069976e-05, + "loss": 0.0351, + "step": 95860 + }, + { + "epoch": 0.07935, + "grad_norm": 0.07508399337530136, + "learning_rate": 2.8760135106879415e-05, + "loss": 0.0358, + "step": 95870 + }, + { + "epoch": 0.0794, + "grad_norm": 0.0850457027554512, + "learning_rate": 2.875604840396055e-05, + "loss": 0.0348, + "step": 95880 + }, + { + "epoch": 0.07945, + "grad_norm": 0.07781939208507538, + "learning_rate": 2.8751961598352732e-05, + "loss": 0.035, + "step": 95890 + }, + { + "epoch": 0.0795, + "grad_norm": 0.08043967932462692, + "learning_rate": 2.87478746901677e-05, + "loss": 0.0353, + "step": 95900 + }, + { + "epoch": 0.07955, + "grad_norm": 0.07491166889667511, + "learning_rate": 2.8743787679517174e-05, + "loss": 0.0353, + "step": 95910 + }, + { + "epoch": 0.0796, + "grad_norm": 0.08218996971845627, + "learning_rate": 2.87397005665129e-05, + "loss": 0.0356, + "step": 95920 + }, + { + "epoch": 0.07965, + "grad_norm": 0.08948659151792526, + "learning_rate": 2.8735613351266622e-05, + "loss": 0.0343, + "step": 95930 + }, + { + "epoch": 0.0797, + "grad_norm": 0.07800249010324478, + "learning_rate": 2.873152603389008e-05, + "loss": 0.0357, + "step": 95940 + }, + { + "epoch": 0.07975, + "grad_norm": 0.07463128864765167, + "learning_rate": 2.8727438614495006e-05, + "loss": 0.035, + "step": 95950 + }, + { + "epoch": 0.0798, + "grad_norm": 0.07149926573038101, + "learning_rate": 2.872335109319317e-05, + "loss": 0.035, + "step": 95960 + }, + { + "epoch": 0.07985, + "grad_norm": 0.06871454417705536, + "learning_rate": 2.8719263470096313e-05, + "loss": 0.0347, + "step": 95970 + }, + { + "epoch": 0.0799, + "grad_norm": 0.09457767009735107, + "learning_rate": 2.8715175745316187e-05, + "loss": 0.0368, + "step": 95980 + }, + { + "epoch": 0.07995, + "grad_norm": 0.08058516681194305, + "learning_rate": 2.871108791896456e-05, + "loss": 0.0354, + "step": 95990 + }, + { + "epoch": 0.08, + "grad_norm": 0.08109599351882935, + "learning_rate": 2.8706999991153173e-05, + "loss": 0.0363, + "step": 96000 + }, + { + "epoch": 0.08005, + "grad_norm": 0.07671806216239929, + "learning_rate": 2.8702911961993807e-05, + "loss": 0.0356, + "step": 96010 + }, + { + "epoch": 0.0801, + "grad_norm": 0.0780465304851532, + "learning_rate": 2.8698823831598208e-05, + "loss": 0.0372, + "step": 96020 + }, + { + "epoch": 0.08015, + "grad_norm": 0.10541101545095444, + "learning_rate": 2.869473560007817e-05, + "loss": 0.0373, + "step": 96030 + }, + { + "epoch": 0.0802, + "grad_norm": 0.10399451106786728, + "learning_rate": 2.869064726754544e-05, + "loss": 0.0356, + "step": 96040 + }, + { + "epoch": 0.08025, + "grad_norm": 0.07450667768716812, + "learning_rate": 2.8686558834111804e-05, + "loss": 0.0363, + "step": 96050 + }, + { + "epoch": 0.0803, + "grad_norm": 0.09118582308292389, + "learning_rate": 2.868247029988903e-05, + "loss": 0.0362, + "step": 96060 + }, + { + "epoch": 0.08035, + "grad_norm": 0.09464675188064575, + "learning_rate": 2.8678381664988902e-05, + "loss": 0.0358, + "step": 96070 + }, + { + "epoch": 0.0804, + "grad_norm": 0.07721760869026184, + "learning_rate": 2.86742929295232e-05, + "loss": 0.0372, + "step": 96080 + }, + { + "epoch": 0.08045, + "grad_norm": 0.09013506770133972, + "learning_rate": 2.8670204093603713e-05, + "loss": 0.036, + "step": 96090 + }, + { + "epoch": 0.0805, + "grad_norm": 0.07364276796579361, + "learning_rate": 2.8666115157342226e-05, + "loss": 0.0365, + "step": 96100 + }, + { + "epoch": 0.08055, + "grad_norm": 0.08563251793384552, + "learning_rate": 2.8662026120850526e-05, + "loss": 0.0348, + "step": 96110 + }, + { + "epoch": 0.0806, + "grad_norm": 0.09776796400547028, + "learning_rate": 2.8657936984240407e-05, + "loss": 0.036, + "step": 96120 + }, + { + "epoch": 0.08065, + "grad_norm": 0.09072226285934448, + "learning_rate": 2.8653847747623665e-05, + "loss": 0.0345, + "step": 96130 + }, + { + "epoch": 0.0807, + "grad_norm": 0.10321402549743652, + "learning_rate": 2.86497584111121e-05, + "loss": 0.035, + "step": 96140 + }, + { + "epoch": 0.08075, + "grad_norm": 0.08187662065029144, + "learning_rate": 2.864566897481751e-05, + "loss": 0.0371, + "step": 96150 + }, + { + "epoch": 0.0808, + "grad_norm": 0.0743849128484726, + "learning_rate": 2.86415794388517e-05, + "loss": 0.0374, + "step": 96160 + }, + { + "epoch": 0.08085, + "grad_norm": 0.09088624268770218, + "learning_rate": 2.8637489803326472e-05, + "loss": 0.0373, + "step": 96170 + }, + { + "epoch": 0.0809, + "grad_norm": 0.09648296236991882, + "learning_rate": 2.863340006835365e-05, + "loss": 0.0368, + "step": 96180 + }, + { + "epoch": 0.08095, + "grad_norm": 0.11575803905725479, + "learning_rate": 2.8629310234045027e-05, + "loss": 0.0365, + "step": 96190 + }, + { + "epoch": 0.081, + "grad_norm": 0.10136217623949051, + "learning_rate": 2.8625220300512422e-05, + "loss": 0.0358, + "step": 96200 + }, + { + "epoch": 0.08105, + "grad_norm": 0.08919744193553925, + "learning_rate": 2.8621130267867663e-05, + "loss": 0.0353, + "step": 96210 + }, + { + "epoch": 0.0811, + "grad_norm": 0.08605343848466873, + "learning_rate": 2.8617040136222566e-05, + "loss": 0.0364, + "step": 96220 + }, + { + "epoch": 0.08115, + "grad_norm": 0.08273573219776154, + "learning_rate": 2.861294990568894e-05, + "loss": 0.0375, + "step": 96230 + }, + { + "epoch": 0.0812, + "grad_norm": 0.08137491345405579, + "learning_rate": 2.8608859576378634e-05, + "loss": 0.0395, + "step": 96240 + }, + { + "epoch": 0.08125, + "grad_norm": 0.09367059916257858, + "learning_rate": 2.8604769148403455e-05, + "loss": 0.0383, + "step": 96250 + }, + { + "epoch": 0.0813, + "grad_norm": 0.08180638402700424, + "learning_rate": 2.8600678621875237e-05, + "loss": 0.0362, + "step": 96260 + }, + { + "epoch": 0.08135, + "grad_norm": 0.07581344246864319, + "learning_rate": 2.8596587996905823e-05, + "loss": 0.0348, + "step": 96270 + }, + { + "epoch": 0.0814, + "grad_norm": 0.07765164971351624, + "learning_rate": 2.859249727360705e-05, + "loss": 0.0344, + "step": 96280 + }, + { + "epoch": 0.08145, + "grad_norm": 0.07539539784193039, + "learning_rate": 2.8588406452090742e-05, + "loss": 0.0344, + "step": 96290 + }, + { + "epoch": 0.0815, + "grad_norm": 0.06976207345724106, + "learning_rate": 2.8584315532468757e-05, + "loss": 0.0356, + "step": 96300 + }, + { + "epoch": 0.08155, + "grad_norm": 0.0770931988954544, + "learning_rate": 2.8580224514852928e-05, + "loss": 0.0351, + "step": 96310 + }, + { + "epoch": 0.0816, + "grad_norm": 0.0735212191939354, + "learning_rate": 2.8576133399355105e-05, + "loss": 0.0351, + "step": 96320 + }, + { + "epoch": 0.08165, + "grad_norm": 0.08054427802562714, + "learning_rate": 2.857204218608714e-05, + "loss": 0.0335, + "step": 96330 + }, + { + "epoch": 0.0817, + "grad_norm": 0.07628805190324783, + "learning_rate": 2.8567950875160887e-05, + "loss": 0.0343, + "step": 96340 + }, + { + "epoch": 0.08175, + "grad_norm": 0.0641375333070755, + "learning_rate": 2.8563859466688192e-05, + "loss": 0.0336, + "step": 96350 + }, + { + "epoch": 0.0818, + "grad_norm": 0.08200865238904953, + "learning_rate": 2.855976796078092e-05, + "loss": 0.036, + "step": 96360 + }, + { + "epoch": 0.08185, + "grad_norm": 0.09143612533807755, + "learning_rate": 2.8555676357550933e-05, + "loss": 0.0347, + "step": 96370 + }, + { + "epoch": 0.0819, + "grad_norm": 0.07617480307817459, + "learning_rate": 2.855158465711008e-05, + "loss": 0.0342, + "step": 96380 + }, + { + "epoch": 0.08195, + "grad_norm": 0.08174902945756912, + "learning_rate": 2.854749285957024e-05, + "loss": 0.0362, + "step": 96390 + }, + { + "epoch": 0.082, + "grad_norm": 0.07885648310184479, + "learning_rate": 2.8543400965043287e-05, + "loss": 0.0366, + "step": 96400 + }, + { + "epoch": 0.08205, + "grad_norm": 0.07714606821537018, + "learning_rate": 2.8539308973641078e-05, + "loss": 0.0351, + "step": 96410 + }, + { + "epoch": 0.0821, + "grad_norm": 0.07244784384965897, + "learning_rate": 2.8535216885475485e-05, + "loss": 0.0347, + "step": 96420 + }, + { + "epoch": 0.08215, + "grad_norm": 0.0732586458325386, + "learning_rate": 2.85311247006584e-05, + "loss": 0.0362, + "step": 96430 + }, + { + "epoch": 0.0822, + "grad_norm": 0.05604038015007973, + "learning_rate": 2.8527032419301686e-05, + "loss": 0.0335, + "step": 96440 + }, + { + "epoch": 0.08225, + "grad_norm": 0.06942863017320633, + "learning_rate": 2.8522940041517232e-05, + "loss": 0.0343, + "step": 96450 + }, + { + "epoch": 0.0823, + "grad_norm": 0.06616475433111191, + "learning_rate": 2.8518847567416916e-05, + "loss": 0.034, + "step": 96460 + }, + { + "epoch": 0.08235, + "grad_norm": 0.06547105312347412, + "learning_rate": 2.851475499711264e-05, + "loss": 0.0369, + "step": 96470 + }, + { + "epoch": 0.0824, + "grad_norm": 0.0807575210928917, + "learning_rate": 2.8510662330716276e-05, + "loss": 0.0357, + "step": 96480 + }, + { + "epoch": 0.08245, + "grad_norm": 0.08193361759185791, + "learning_rate": 2.8506569568339732e-05, + "loss": 0.035, + "step": 96490 + }, + { + "epoch": 0.0825, + "grad_norm": 0.07796820998191833, + "learning_rate": 2.8502476710094884e-05, + "loss": 0.0347, + "step": 96500 + }, + { + "epoch": 0.08255, + "grad_norm": 0.07326173037290573, + "learning_rate": 2.849838375609364e-05, + "loss": 0.0358, + "step": 96510 + }, + { + "epoch": 0.0826, + "grad_norm": 0.08830005675554276, + "learning_rate": 2.8494290706447896e-05, + "loss": 0.0351, + "step": 96520 + }, + { + "epoch": 0.08265, + "grad_norm": 0.09686273336410522, + "learning_rate": 2.849019756126956e-05, + "loss": 0.0358, + "step": 96530 + }, + { + "epoch": 0.0827, + "grad_norm": 0.08772691339254379, + "learning_rate": 2.8486104320670532e-05, + "loss": 0.035, + "step": 96540 + }, + { + "epoch": 0.08275, + "grad_norm": 0.07770222425460815, + "learning_rate": 2.848201098476273e-05, + "loss": 0.0358, + "step": 96550 + }, + { + "epoch": 0.0828, + "grad_norm": 0.07158953696489334, + "learning_rate": 2.8477917553658045e-05, + "loss": 0.0361, + "step": 96560 + }, + { + "epoch": 0.08285, + "grad_norm": 0.07134388387203217, + "learning_rate": 2.847382402746841e-05, + "loss": 0.0358, + "step": 96570 + }, + { + "epoch": 0.0829, + "grad_norm": 0.06280628591775894, + "learning_rate": 2.8469730406305718e-05, + "loss": 0.0349, + "step": 96580 + }, + { + "epoch": 0.08295, + "grad_norm": 0.08362621068954468, + "learning_rate": 2.846563669028191e-05, + "loss": 0.0347, + "step": 96590 + }, + { + "epoch": 0.083, + "grad_norm": 0.07226254791021347, + "learning_rate": 2.8461542879508895e-05, + "loss": 0.0341, + "step": 96600 + }, + { + "epoch": 0.08305, + "grad_norm": 0.07029656320810318, + "learning_rate": 2.84574489740986e-05, + "loss": 0.0347, + "step": 96610 + }, + { + "epoch": 0.0831, + "grad_norm": 0.059351012110710144, + "learning_rate": 2.8453354974162945e-05, + "loss": 0.0354, + "step": 96620 + }, + { + "epoch": 0.08315, + "grad_norm": 0.08634937554597855, + "learning_rate": 2.844926087981386e-05, + "loss": 0.0351, + "step": 96630 + }, + { + "epoch": 0.0832, + "grad_norm": 0.07627951353788376, + "learning_rate": 2.8445166691163283e-05, + "loss": 0.0355, + "step": 96640 + }, + { + "epoch": 0.08325, + "grad_norm": 0.07544269412755966, + "learning_rate": 2.8441072408323143e-05, + "loss": 0.0353, + "step": 96650 + }, + { + "epoch": 0.0833, + "grad_norm": 0.08564888685941696, + "learning_rate": 2.8436978031405375e-05, + "loss": 0.0349, + "step": 96660 + }, + { + "epoch": 0.08335, + "grad_norm": 0.064504474401474, + "learning_rate": 2.8432883560521915e-05, + "loss": 0.0333, + "step": 96670 + }, + { + "epoch": 0.0834, + "grad_norm": 0.09171956032514572, + "learning_rate": 2.842878899578472e-05, + "loss": 0.0344, + "step": 96680 + }, + { + "epoch": 0.08345, + "grad_norm": 0.08900267630815506, + "learning_rate": 2.8424694337305714e-05, + "loss": 0.0355, + "step": 96690 + }, + { + "epoch": 0.0835, + "grad_norm": 0.14382585883140564, + "learning_rate": 2.842059958519685e-05, + "loss": 0.0348, + "step": 96700 + }, + { + "epoch": 0.08355, + "grad_norm": 0.09673803299665451, + "learning_rate": 2.8416504739570076e-05, + "loss": 0.0389, + "step": 96710 + }, + { + "epoch": 0.0836, + "grad_norm": 0.1290545016527176, + "learning_rate": 2.8412409800537354e-05, + "loss": 0.0362, + "step": 96720 + }, + { + "epoch": 0.08365, + "grad_norm": 0.07535991072654724, + "learning_rate": 2.8408314768210625e-05, + "loss": 0.0352, + "step": 96730 + }, + { + "epoch": 0.0837, + "grad_norm": 0.07553493976593018, + "learning_rate": 2.8404219642701858e-05, + "loss": 0.0339, + "step": 96740 + }, + { + "epoch": 0.08375, + "grad_norm": 0.09884503483772278, + "learning_rate": 2.8400124424123e-05, + "loss": 0.0342, + "step": 96750 + }, + { + "epoch": 0.0838, + "grad_norm": 0.07270557433366776, + "learning_rate": 2.839602911258602e-05, + "loss": 0.0328, + "step": 96760 + }, + { + "epoch": 0.08385, + "grad_norm": 0.08007807284593582, + "learning_rate": 2.8391933708202867e-05, + "loss": 0.0351, + "step": 96770 + }, + { + "epoch": 0.0839, + "grad_norm": 0.08742036670446396, + "learning_rate": 2.8387838211085534e-05, + "loss": 0.0349, + "step": 96780 + }, + { + "epoch": 0.08395, + "grad_norm": 0.07782687991857529, + "learning_rate": 2.838374262134597e-05, + "loss": 0.0352, + "step": 96790 + }, + { + "epoch": 0.084, + "grad_norm": 0.07308624684810638, + "learning_rate": 2.837964693909616e-05, + "loss": 0.0341, + "step": 96800 + }, + { + "epoch": 0.08405, + "grad_norm": 0.06709211319684982, + "learning_rate": 2.837555116444807e-05, + "loss": 0.0346, + "step": 96810 + }, + { + "epoch": 0.0841, + "grad_norm": 0.07162594050168991, + "learning_rate": 2.8371455297513683e-05, + "loss": 0.0346, + "step": 96820 + }, + { + "epoch": 0.08415, + "grad_norm": 0.0674961507320404, + "learning_rate": 2.8367359338404963e-05, + "loss": 0.0329, + "step": 96830 + }, + { + "epoch": 0.0842, + "grad_norm": 0.061539795249700546, + "learning_rate": 2.8363263287233916e-05, + "loss": 0.0339, + "step": 96840 + }, + { + "epoch": 0.08425, + "grad_norm": 0.07633615285158157, + "learning_rate": 2.835916714411251e-05, + "loss": 0.0341, + "step": 96850 + }, + { + "epoch": 0.0843, + "grad_norm": 0.07183895260095596, + "learning_rate": 2.8355070909152738e-05, + "loss": 0.0344, + "step": 96860 + }, + { + "epoch": 0.08435, + "grad_norm": 0.07868233323097229, + "learning_rate": 2.8350974582466583e-05, + "loss": 0.0401, + "step": 96870 + }, + { + "epoch": 0.0844, + "grad_norm": 0.10423652827739716, + "learning_rate": 2.8346878164166042e-05, + "loss": 0.0372, + "step": 96880 + }, + { + "epoch": 0.08445, + "grad_norm": 0.09205686300992966, + "learning_rate": 2.834278165436311e-05, + "loss": 0.0349, + "step": 96890 + }, + { + "epoch": 0.0845, + "grad_norm": 0.08706831187009811, + "learning_rate": 2.833868505316979e-05, + "loss": 0.0368, + "step": 96900 + }, + { + "epoch": 0.08455, + "grad_norm": 0.06951024383306503, + "learning_rate": 2.8334588360698066e-05, + "loss": 0.0343, + "step": 96910 + }, + { + "epoch": 0.0846, + "grad_norm": 0.09689069539308548, + "learning_rate": 2.8330491577059953e-05, + "loss": 0.0352, + "step": 96920 + }, + { + "epoch": 0.08465, + "grad_norm": 0.07028918713331223, + "learning_rate": 2.8326394702367452e-05, + "loss": 0.0347, + "step": 96930 + }, + { + "epoch": 0.0847, + "grad_norm": 0.0676390528678894, + "learning_rate": 2.832229773673257e-05, + "loss": 0.0346, + "step": 96940 + }, + { + "epoch": 0.08475, + "grad_norm": 0.06127481535077095, + "learning_rate": 2.831820068026732e-05, + "loss": 0.0344, + "step": 96950 + }, + { + "epoch": 0.0848, + "grad_norm": 0.07048854976892471, + "learning_rate": 2.8314103533083698e-05, + "loss": 0.0359, + "step": 96960 + }, + { + "epoch": 0.08485, + "grad_norm": 0.08029826730489731, + "learning_rate": 2.831000629529374e-05, + "loss": 0.0371, + "step": 96970 + }, + { + "epoch": 0.0849, + "grad_norm": 0.08408576250076294, + "learning_rate": 2.8305908967009446e-05, + "loss": 0.0358, + "step": 96980 + }, + { + "epoch": 0.08495, + "grad_norm": 0.0695815235376358, + "learning_rate": 2.8301811548342856e-05, + "loss": 0.0371, + "step": 96990 + }, + { + "epoch": 0.085, + "grad_norm": 0.07242295891046524, + "learning_rate": 2.8297714039405965e-05, + "loss": 0.0345, + "step": 97000 + }, + { + "epoch": 0.08505, + "grad_norm": 0.06739848107099533, + "learning_rate": 2.8293616440310823e-05, + "loss": 0.0365, + "step": 97010 + }, + { + "epoch": 0.0851, + "grad_norm": 0.0829663798213005, + "learning_rate": 2.828951875116943e-05, + "loss": 0.0365, + "step": 97020 + }, + { + "epoch": 0.08515, + "grad_norm": 0.07622525840997696, + "learning_rate": 2.828542097209384e-05, + "loss": 0.0365, + "step": 97030 + }, + { + "epoch": 0.0852, + "grad_norm": 0.09016449749469757, + "learning_rate": 2.8281323103196073e-05, + "loss": 0.0362, + "step": 97040 + }, + { + "epoch": 0.08525, + "grad_norm": 0.09533847868442535, + "learning_rate": 2.827722514458817e-05, + "loss": 0.0361, + "step": 97050 + }, + { + "epoch": 0.0853, + "grad_norm": 0.08968351781368256, + "learning_rate": 2.8273127096382157e-05, + "loss": 0.036, + "step": 97060 + }, + { + "epoch": 0.08535, + "grad_norm": 0.0862366333603859, + "learning_rate": 2.8269028958690087e-05, + "loss": 0.035, + "step": 97070 + }, + { + "epoch": 0.0854, + "grad_norm": 0.08487775176763535, + "learning_rate": 2.8264930731623983e-05, + "loss": 0.0371, + "step": 97080 + }, + { + "epoch": 0.08545, + "grad_norm": 0.09566416591405869, + "learning_rate": 2.82608324152959e-05, + "loss": 0.0344, + "step": 97090 + }, + { + "epoch": 0.0855, + "grad_norm": 0.07718129456043243, + "learning_rate": 2.8256734009817887e-05, + "loss": 0.0355, + "step": 97100 + }, + { + "epoch": 0.08555, + "grad_norm": 0.0778222531080246, + "learning_rate": 2.825263551530199e-05, + "loss": 0.0343, + "step": 97110 + }, + { + "epoch": 0.0856, + "grad_norm": 0.07489711046218872, + "learning_rate": 2.824853693186026e-05, + "loss": 0.0342, + "step": 97120 + }, + { + "epoch": 0.08565, + "grad_norm": 0.07497640699148178, + "learning_rate": 2.8244438259604744e-05, + "loss": 0.0373, + "step": 97130 + }, + { + "epoch": 0.0857, + "grad_norm": 0.07032839953899384, + "learning_rate": 2.8240339498647518e-05, + "loss": 0.0336, + "step": 97140 + }, + { + "epoch": 0.08575, + "grad_norm": 0.0681866928935051, + "learning_rate": 2.823624064910061e-05, + "loss": 0.0351, + "step": 97150 + }, + { + "epoch": 0.0858, + "grad_norm": 0.06538959592580795, + "learning_rate": 2.8232141711076115e-05, + "loss": 0.0333, + "step": 97160 + }, + { + "epoch": 0.08585, + "grad_norm": 0.09290921688079834, + "learning_rate": 2.8228042684686072e-05, + "loss": 0.0348, + "step": 97170 + }, + { + "epoch": 0.0859, + "grad_norm": 0.08239532262086868, + "learning_rate": 2.822394357004256e-05, + "loss": 0.0347, + "step": 97180 + }, + { + "epoch": 0.08595, + "grad_norm": 0.08641859143972397, + "learning_rate": 2.8219844367257637e-05, + "loss": 0.0348, + "step": 97190 + }, + { + "epoch": 0.086, + "grad_norm": 0.0688556358218193, + "learning_rate": 2.8215745076443383e-05, + "loss": 0.0341, + "step": 97200 + }, + { + "epoch": 0.08605, + "grad_norm": 0.058573488146066666, + "learning_rate": 2.821164569771186e-05, + "loss": 0.0339, + "step": 97210 + }, + { + "epoch": 0.0861, + "grad_norm": 0.05978013575077057, + "learning_rate": 2.8207546231175152e-05, + "loss": 0.0338, + "step": 97220 + }, + { + "epoch": 0.08615, + "grad_norm": 0.0704859048128128, + "learning_rate": 2.8203446676945337e-05, + "loss": 0.0379, + "step": 97230 + }, + { + "epoch": 0.0862, + "grad_norm": 0.07969992607831955, + "learning_rate": 2.8199347035134498e-05, + "loss": 0.0349, + "step": 97240 + }, + { + "epoch": 0.08625, + "grad_norm": 0.07378263771533966, + "learning_rate": 2.8195247305854706e-05, + "loss": 0.0358, + "step": 97250 + }, + { + "epoch": 0.0863, + "grad_norm": 0.0759364664554596, + "learning_rate": 2.8191147489218062e-05, + "loss": 0.0348, + "step": 97260 + }, + { + "epoch": 0.08635, + "grad_norm": 0.06266133487224579, + "learning_rate": 2.8187047585336634e-05, + "loss": 0.0341, + "step": 97270 + }, + { + "epoch": 0.0864, + "grad_norm": 0.06443309783935547, + "learning_rate": 2.8182947594322524e-05, + "loss": 0.0342, + "step": 97280 + }, + { + "epoch": 0.08645, + "grad_norm": 0.09148667752742767, + "learning_rate": 2.8178847516287822e-05, + "loss": 0.0341, + "step": 97290 + }, + { + "epoch": 0.0865, + "grad_norm": 0.07785467058420181, + "learning_rate": 2.8174747351344633e-05, + "loss": 0.035, + "step": 97300 + }, + { + "epoch": 0.08655, + "grad_norm": 0.08465701341629028, + "learning_rate": 2.817064709960503e-05, + "loss": 0.0359, + "step": 97310 + }, + { + "epoch": 0.0866, + "grad_norm": 0.07838905602693558, + "learning_rate": 2.8166546761181138e-05, + "loss": 0.0343, + "step": 97320 + }, + { + "epoch": 0.08665, + "grad_norm": 0.08312281221151352, + "learning_rate": 2.8162446336185045e-05, + "loss": 0.0347, + "step": 97330 + }, + { + "epoch": 0.0867, + "grad_norm": 0.06643358618021011, + "learning_rate": 2.815834582472885e-05, + "loss": 0.0343, + "step": 97340 + }, + { + "epoch": 0.08675, + "grad_norm": 0.06893420219421387, + "learning_rate": 2.815424522692467e-05, + "loss": 0.0342, + "step": 97350 + }, + { + "epoch": 0.0868, + "grad_norm": 0.07925647497177124, + "learning_rate": 2.815014454288461e-05, + "loss": 0.0344, + "step": 97360 + }, + { + "epoch": 0.08685, + "grad_norm": 0.10124651342630386, + "learning_rate": 2.8146043772720787e-05, + "loss": 0.038, + "step": 97370 + }, + { + "epoch": 0.0869, + "grad_norm": 0.10052763670682907, + "learning_rate": 2.8141942916545306e-05, + "loss": 0.034, + "step": 97380 + }, + { + "epoch": 0.08695, + "grad_norm": 0.10913696885108948, + "learning_rate": 2.8137841974470286e-05, + "loss": 0.0359, + "step": 97390 + }, + { + "epoch": 0.087, + "grad_norm": 0.11984412372112274, + "learning_rate": 2.813374094660784e-05, + "loss": 0.0402, + "step": 97400 + }, + { + "epoch": 0.08705, + "grad_norm": 0.09089337289333344, + "learning_rate": 2.8129639833070103e-05, + "loss": 0.035, + "step": 97410 + }, + { + "epoch": 0.0871, + "grad_norm": 0.07296092063188553, + "learning_rate": 2.8125538633969183e-05, + "loss": 0.0367, + "step": 97420 + }, + { + "epoch": 0.08715, + "grad_norm": 0.07328640669584274, + "learning_rate": 2.8121437349417218e-05, + "loss": 0.0335, + "step": 97430 + }, + { + "epoch": 0.0872, + "grad_norm": 0.07069529592990875, + "learning_rate": 2.811733597952632e-05, + "loss": 0.0342, + "step": 97440 + }, + { + "epoch": 0.08725, + "grad_norm": 0.11309466511011124, + "learning_rate": 2.811323452440863e-05, + "loss": 0.0365, + "step": 97450 + }, + { + "epoch": 0.0873, + "grad_norm": 0.07509417086839676, + "learning_rate": 2.8109132984176278e-05, + "loss": 0.0356, + "step": 97460 + }, + { + "epoch": 0.08735, + "grad_norm": 0.08614693582057953, + "learning_rate": 2.8105031358941397e-05, + "loss": 0.0334, + "step": 97470 + }, + { + "epoch": 0.0874, + "grad_norm": 0.08508718758821487, + "learning_rate": 2.8100929648816128e-05, + "loss": 0.0357, + "step": 97480 + }, + { + "epoch": 0.08745, + "grad_norm": 0.07222725450992584, + "learning_rate": 2.8096827853912612e-05, + "loss": 0.0346, + "step": 97490 + }, + { + "epoch": 0.0875, + "grad_norm": 0.08393757790327072, + "learning_rate": 2.8092725974342976e-05, + "loss": 0.0343, + "step": 97500 + }, + { + "epoch": 0.08755, + "grad_norm": 0.07629408687353134, + "learning_rate": 2.8088624010219378e-05, + "loss": 0.0363, + "step": 97510 + }, + { + "epoch": 0.0876, + "grad_norm": 0.0961424857378006, + "learning_rate": 2.808452196165396e-05, + "loss": 0.0354, + "step": 97520 + }, + { + "epoch": 0.08765, + "grad_norm": 0.09951150417327881, + "learning_rate": 2.808041982875887e-05, + "loss": 0.0348, + "step": 97530 + }, + { + "epoch": 0.0877, + "grad_norm": 0.11493387818336487, + "learning_rate": 2.8076317611646253e-05, + "loss": 0.0355, + "step": 97540 + }, + { + "epoch": 0.08775, + "grad_norm": 0.09941182285547256, + "learning_rate": 2.8072215310428278e-05, + "loss": 0.0347, + "step": 97550 + }, + { + "epoch": 0.0878, + "grad_norm": 0.11129660159349442, + "learning_rate": 2.806811292521709e-05, + "loss": 0.0386, + "step": 97560 + }, + { + "epoch": 0.08785, + "grad_norm": 0.07355191558599472, + "learning_rate": 2.8064010456124838e-05, + "loss": 0.0346, + "step": 97570 + }, + { + "epoch": 0.0879, + "grad_norm": 0.07286284118890762, + "learning_rate": 2.8059907903263705e-05, + "loss": 0.0342, + "step": 97580 + }, + { + "epoch": 0.08795, + "grad_norm": 0.06354943662881851, + "learning_rate": 2.8055805266745827e-05, + "loss": 0.0346, + "step": 97590 + }, + { + "epoch": 0.088, + "grad_norm": 0.05854567885398865, + "learning_rate": 2.8051702546683385e-05, + "loss": 0.0349, + "step": 97600 + }, + { + "epoch": 0.08805, + "grad_norm": 0.07032527029514313, + "learning_rate": 2.804759974318854e-05, + "loss": 0.0346, + "step": 97610 + }, + { + "epoch": 0.0881, + "grad_norm": 0.07550732046365738, + "learning_rate": 2.804349685637347e-05, + "loss": 0.035, + "step": 97620 + }, + { + "epoch": 0.08815, + "grad_norm": 0.06810349971055984, + "learning_rate": 2.8039393886350335e-05, + "loss": 0.0345, + "step": 97630 + }, + { + "epoch": 0.0882, + "grad_norm": 0.07226649671792984, + "learning_rate": 2.8035290833231316e-05, + "loss": 0.034, + "step": 97640 + }, + { + "epoch": 0.08825, + "grad_norm": 0.07276780903339386, + "learning_rate": 2.803118769712858e-05, + "loss": 0.0354, + "step": 97650 + }, + { + "epoch": 0.0883, + "grad_norm": 0.07253876328468323, + "learning_rate": 2.8027084478154315e-05, + "loss": 0.0339, + "step": 97660 + }, + { + "epoch": 0.08835, + "grad_norm": 0.10958126932382584, + "learning_rate": 2.8022981176420694e-05, + "loss": 0.035, + "step": 97670 + }, + { + "epoch": 0.0884, + "grad_norm": 0.09185132384300232, + "learning_rate": 2.801887779203991e-05, + "loss": 0.0354, + "step": 97680 + }, + { + "epoch": 0.08845, + "grad_norm": 0.08397877961397171, + "learning_rate": 2.801477432512413e-05, + "loss": 0.0345, + "step": 97690 + }, + { + "epoch": 0.0885, + "grad_norm": 0.07449831068515778, + "learning_rate": 2.8010670775785568e-05, + "loss": 0.0366, + "step": 97700 + }, + { + "epoch": 0.08855, + "grad_norm": 0.0895819440484047, + "learning_rate": 2.8006567144136385e-05, + "loss": 0.0354, + "step": 97710 + }, + { + "epoch": 0.0886, + "grad_norm": 0.10191086679697037, + "learning_rate": 2.8002463430288794e-05, + "loss": 0.0356, + "step": 97720 + }, + { + "epoch": 0.08865, + "grad_norm": 0.07982117682695389, + "learning_rate": 2.7998359634354976e-05, + "loss": 0.0338, + "step": 97730 + }, + { + "epoch": 0.0887, + "grad_norm": 0.07811693847179413, + "learning_rate": 2.7994255756447135e-05, + "loss": 0.0349, + "step": 97740 + }, + { + "epoch": 0.08875, + "grad_norm": 0.07560224831104279, + "learning_rate": 2.799015179667746e-05, + "loss": 0.0347, + "step": 97750 + }, + { + "epoch": 0.0888, + "grad_norm": 0.06786207854747772, + "learning_rate": 2.7986047755158168e-05, + "loss": 0.0339, + "step": 97760 + }, + { + "epoch": 0.08885, + "grad_norm": 0.06226731091737747, + "learning_rate": 2.798194363200145e-05, + "loss": 0.0341, + "step": 97770 + }, + { + "epoch": 0.0889, + "grad_norm": 0.06887473911046982, + "learning_rate": 2.7977839427319508e-05, + "loss": 0.035, + "step": 97780 + }, + { + "epoch": 0.08895, + "grad_norm": 0.06426707655191422, + "learning_rate": 2.7973735141224555e-05, + "loss": 0.0346, + "step": 97790 + }, + { + "epoch": 0.089, + "grad_norm": 0.06857171654701233, + "learning_rate": 2.7969630773828802e-05, + "loss": 0.0362, + "step": 97800 + }, + { + "epoch": 0.08905, + "grad_norm": 0.07259707897901535, + "learning_rate": 2.7965526325244463e-05, + "loss": 0.0366, + "step": 97810 + }, + { + "epoch": 0.0891, + "grad_norm": 0.0817384123802185, + "learning_rate": 2.7961421795583743e-05, + "loss": 0.0374, + "step": 97820 + }, + { + "epoch": 0.08915, + "grad_norm": 0.0681479424238205, + "learning_rate": 2.795731718495887e-05, + "loss": 0.036, + "step": 97830 + }, + { + "epoch": 0.0892, + "grad_norm": 0.05921397730708122, + "learning_rate": 2.795321249348205e-05, + "loss": 0.0339, + "step": 97840 + }, + { + "epoch": 0.08925, + "grad_norm": 0.06496920436620712, + "learning_rate": 2.794910772126551e-05, + "loss": 0.0353, + "step": 97850 + }, + { + "epoch": 0.0893, + "grad_norm": 0.07719899713993073, + "learning_rate": 2.7945002868421478e-05, + "loss": 0.0359, + "step": 97860 + }, + { + "epoch": 0.08935, + "grad_norm": 0.08060154318809509, + "learning_rate": 2.794089793506217e-05, + "loss": 0.0356, + "step": 97870 + }, + { + "epoch": 0.0894, + "grad_norm": 0.09261590242385864, + "learning_rate": 2.793679292129982e-05, + "loss": 0.0351, + "step": 97880 + }, + { + "epoch": 0.08945, + "grad_norm": 0.10092183202505112, + "learning_rate": 2.7932687827246656e-05, + "loss": 0.0374, + "step": 97890 + }, + { + "epoch": 0.0895, + "grad_norm": 0.0752311423420906, + "learning_rate": 2.79285826530149e-05, + "loss": 0.0341, + "step": 97900 + }, + { + "epoch": 0.08955, + "grad_norm": 0.11677830666303635, + "learning_rate": 2.7924477398716803e-05, + "loss": 0.0354, + "step": 97910 + }, + { + "epoch": 0.0896, + "grad_norm": 0.07338506728410721, + "learning_rate": 2.792037206446459e-05, + "loss": 0.0361, + "step": 97920 + }, + { + "epoch": 0.08965, + "grad_norm": 0.06555341184139252, + "learning_rate": 2.7916266650370504e-05, + "loss": 0.0354, + "step": 97930 + }, + { + "epoch": 0.0897, + "grad_norm": 0.0656813457608223, + "learning_rate": 2.791216115654678e-05, + "loss": 0.035, + "step": 97940 + }, + { + "epoch": 0.08975, + "grad_norm": 0.07416542619466782, + "learning_rate": 2.7908055583105668e-05, + "loss": 0.0358, + "step": 97950 + }, + { + "epoch": 0.0898, + "grad_norm": 0.0651547834277153, + "learning_rate": 2.7903949930159402e-05, + "loss": 0.035, + "step": 97960 + }, + { + "epoch": 0.08985, + "grad_norm": 0.0780981034040451, + "learning_rate": 2.7899844197820246e-05, + "loss": 0.0353, + "step": 97970 + }, + { + "epoch": 0.0899, + "grad_norm": 0.0633014664053917, + "learning_rate": 2.7895738386200425e-05, + "loss": 0.0356, + "step": 97980 + }, + { + "epoch": 0.08995, + "grad_norm": 0.06496810168027878, + "learning_rate": 2.7891632495412217e-05, + "loss": 0.0347, + "step": 97990 + }, + { + "epoch": 0.09, + "grad_norm": 0.08917754143476486, + "learning_rate": 2.788752652556785e-05, + "loss": 0.0379, + "step": 98000 + }, + { + "epoch": 0.09005, + "grad_norm": 0.08220583200454712, + "learning_rate": 2.788342047677961e-05, + "loss": 0.037, + "step": 98010 + }, + { + "epoch": 0.0901, + "grad_norm": 0.083226278424263, + "learning_rate": 2.7879314349159724e-05, + "loss": 0.0346, + "step": 98020 + }, + { + "epoch": 0.09015, + "grad_norm": 0.07575386017560959, + "learning_rate": 2.787520814282047e-05, + "loss": 0.0368, + "step": 98030 + }, + { + "epoch": 0.0902, + "grad_norm": 0.07754475623369217, + "learning_rate": 2.7871101857874106e-05, + "loss": 0.0373, + "step": 98040 + }, + { + "epoch": 0.09025, + "grad_norm": 0.08634736388921738, + "learning_rate": 2.7866995494432897e-05, + "loss": 0.0363, + "step": 98050 + }, + { + "epoch": 0.0903, + "grad_norm": 0.08915960788726807, + "learning_rate": 2.7862889052609105e-05, + "loss": 0.0379, + "step": 98060 + }, + { + "epoch": 0.09035, + "grad_norm": 0.07953356206417084, + "learning_rate": 2.7858782532515e-05, + "loss": 0.0371, + "step": 98070 + }, + { + "epoch": 0.0904, + "grad_norm": 0.06254126876592636, + "learning_rate": 2.7854675934262864e-05, + "loss": 0.0357, + "step": 98080 + }, + { + "epoch": 0.09045, + "grad_norm": 0.06277334690093994, + "learning_rate": 2.7850569257964954e-05, + "loss": 0.0364, + "step": 98090 + }, + { + "epoch": 0.0905, + "grad_norm": 0.08272846043109894, + "learning_rate": 2.7846462503733544e-05, + "loss": 0.0369, + "step": 98100 + }, + { + "epoch": 0.09055, + "grad_norm": 0.0841066911816597, + "learning_rate": 2.7842355671680925e-05, + "loss": 0.0362, + "step": 98110 + }, + { + "epoch": 0.0906, + "grad_norm": 0.07980770617723465, + "learning_rate": 2.783824876191938e-05, + "loss": 0.0358, + "step": 98120 + }, + { + "epoch": 0.09065, + "grad_norm": 0.08716199547052383, + "learning_rate": 2.7834141774561168e-05, + "loss": 0.0347, + "step": 98130 + }, + { + "epoch": 0.0907, + "grad_norm": 0.07199542969465256, + "learning_rate": 2.783003470971859e-05, + "loss": 0.0344, + "step": 98140 + }, + { + "epoch": 0.09075, + "grad_norm": 0.058281343430280685, + "learning_rate": 2.7825927567503924e-05, + "loss": 0.0356, + "step": 98150 + }, + { + "epoch": 0.0908, + "grad_norm": 0.07014793157577515, + "learning_rate": 2.782182034802946e-05, + "loss": 0.0359, + "step": 98160 + }, + { + "epoch": 0.09085, + "grad_norm": 0.07957247644662857, + "learning_rate": 2.781771305140748e-05, + "loss": 0.0341, + "step": 98170 + }, + { + "epoch": 0.0909, + "grad_norm": 0.07609190791845322, + "learning_rate": 2.7813605677750297e-05, + "loss": 0.035, + "step": 98180 + }, + { + "epoch": 0.09095, + "grad_norm": 0.09158436208963394, + "learning_rate": 2.7809498227170184e-05, + "loss": 0.0345, + "step": 98190 + }, + { + "epoch": 0.091, + "grad_norm": 0.07892835885286331, + "learning_rate": 2.780539069977945e-05, + "loss": 0.0353, + "step": 98200 + }, + { + "epoch": 0.09105, + "grad_norm": 0.08445654064416885, + "learning_rate": 2.7801283095690384e-05, + "loss": 0.0335, + "step": 98210 + }, + { + "epoch": 0.0911, + "grad_norm": 0.09106568247079849, + "learning_rate": 2.77971754150153e-05, + "loss": 0.0344, + "step": 98220 + }, + { + "epoch": 0.09115, + "grad_norm": 0.0864868089556694, + "learning_rate": 2.779306765786647e-05, + "loss": 0.0352, + "step": 98230 + }, + { + "epoch": 0.0912, + "grad_norm": 0.07761655747890472, + "learning_rate": 2.7788959824356238e-05, + "loss": 0.0338, + "step": 98240 + }, + { + "epoch": 0.09125, + "grad_norm": 0.07849200069904327, + "learning_rate": 2.778485191459688e-05, + "loss": 0.0347, + "step": 98250 + }, + { + "epoch": 0.0913, + "grad_norm": 0.07718189060688019, + "learning_rate": 2.778074392870073e-05, + "loss": 0.0348, + "step": 98260 + }, + { + "epoch": 0.09135, + "grad_norm": 0.08885542303323746, + "learning_rate": 2.7776635866780077e-05, + "loss": 0.0337, + "step": 98270 + }, + { + "epoch": 0.0914, + "grad_norm": 0.07728283107280731, + "learning_rate": 2.7772527728947247e-05, + "loss": 0.0349, + "step": 98280 + }, + { + "epoch": 0.09145, + "grad_norm": 0.06739954650402069, + "learning_rate": 2.7768419515314542e-05, + "loss": 0.0345, + "step": 98290 + }, + { + "epoch": 0.0915, + "grad_norm": 0.07996700704097748, + "learning_rate": 2.77643112259943e-05, + "loss": 0.037, + "step": 98300 + }, + { + "epoch": 0.09155, + "grad_norm": 0.07308963686227798, + "learning_rate": 2.7760202861098815e-05, + "loss": 0.0334, + "step": 98310 + }, + { + "epoch": 0.0916, + "grad_norm": 0.07305646687746048, + "learning_rate": 2.7756094420740432e-05, + "loss": 0.0352, + "step": 98320 + }, + { + "epoch": 0.09165, + "grad_norm": 0.085787333548069, + "learning_rate": 2.775198590503146e-05, + "loss": 0.0351, + "step": 98330 + }, + { + "epoch": 0.0917, + "grad_norm": 0.06632562726736069, + "learning_rate": 2.774787731408422e-05, + "loss": 0.035, + "step": 98340 + }, + { + "epoch": 0.09175, + "grad_norm": 0.07978537678718567, + "learning_rate": 2.7743768648011053e-05, + "loss": 0.0341, + "step": 98350 + }, + { + "epoch": 0.0918, + "grad_norm": 0.07276801019906998, + "learning_rate": 2.7739659906924274e-05, + "loss": 0.0357, + "step": 98360 + }, + { + "epoch": 0.09185, + "grad_norm": 0.08967699110507965, + "learning_rate": 2.7735551090936236e-05, + "loss": 0.0361, + "step": 98370 + }, + { + "epoch": 0.0919, + "grad_norm": 0.07864616811275482, + "learning_rate": 2.7731442200159247e-05, + "loss": 0.0346, + "step": 98380 + }, + { + "epoch": 0.09195, + "grad_norm": 0.11499129235744476, + "learning_rate": 2.7727333234705665e-05, + "loss": 0.0399, + "step": 98390 + }, + { + "epoch": 0.092, + "grad_norm": 0.10010307282209396, + "learning_rate": 2.7723224194687807e-05, + "loss": 0.035, + "step": 98400 + }, + { + "epoch": 0.09205, + "grad_norm": 0.08330327272415161, + "learning_rate": 2.7719115080218033e-05, + "loss": 0.042, + "step": 98410 + }, + { + "epoch": 0.0921, + "grad_norm": 0.09687450528144836, + "learning_rate": 2.7715005891408663e-05, + "loss": 0.0343, + "step": 98420 + }, + { + "epoch": 0.09215, + "grad_norm": 0.07836932688951492, + "learning_rate": 2.7710896628372058e-05, + "loss": 0.0383, + "step": 98430 + }, + { + "epoch": 0.0922, + "grad_norm": 0.090105801820755, + "learning_rate": 2.7706787291220554e-05, + "loss": 0.0366, + "step": 98440 + }, + { + "epoch": 0.09225, + "grad_norm": 0.07002262026071548, + "learning_rate": 2.770267788006651e-05, + "loss": 0.0348, + "step": 98450 + }, + { + "epoch": 0.0923, + "grad_norm": 0.09269098192453384, + "learning_rate": 2.7698568395022263e-05, + "loss": 0.0347, + "step": 98460 + }, + { + "epoch": 0.09235, + "grad_norm": 0.07973234355449677, + "learning_rate": 2.769445883620017e-05, + "loss": 0.0354, + "step": 98470 + }, + { + "epoch": 0.0924, + "grad_norm": 0.07400687038898468, + "learning_rate": 2.7690349203712585e-05, + "loss": 0.0368, + "step": 98480 + }, + { + "epoch": 0.09245, + "grad_norm": 0.10803262144327164, + "learning_rate": 2.7686239497671863e-05, + "loss": 0.0363, + "step": 98490 + }, + { + "epoch": 0.0925, + "grad_norm": 0.07510066777467728, + "learning_rate": 2.768212971819036e-05, + "loss": 0.0341, + "step": 98500 + }, + { + "epoch": 0.09255, + "grad_norm": 0.07289294898509979, + "learning_rate": 2.7678019865380443e-05, + "loss": 0.0344, + "step": 98510 + }, + { + "epoch": 0.0926, + "grad_norm": 0.06677382439374924, + "learning_rate": 2.7673909939354464e-05, + "loss": 0.0343, + "step": 98520 + }, + { + "epoch": 0.09265, + "grad_norm": 0.057776324450969696, + "learning_rate": 2.7669799940224794e-05, + "loss": 0.0341, + "step": 98530 + }, + { + "epoch": 0.0927, + "grad_norm": 0.07983548939228058, + "learning_rate": 2.76656898681038e-05, + "loss": 0.0368, + "step": 98540 + }, + { + "epoch": 0.09275, + "grad_norm": 0.08397910743951797, + "learning_rate": 2.7661579723103844e-05, + "loss": 0.0349, + "step": 98550 + }, + { + "epoch": 0.0928, + "grad_norm": 0.0787234827876091, + "learning_rate": 2.765746950533729e-05, + "loss": 0.0339, + "step": 98560 + }, + { + "epoch": 0.09285, + "grad_norm": 0.0803915411233902, + "learning_rate": 2.7653359214916524e-05, + "loss": 0.0346, + "step": 98570 + }, + { + "epoch": 0.0929, + "grad_norm": 0.10380898416042328, + "learning_rate": 2.7649248851953925e-05, + "loss": 0.0346, + "step": 98580 + }, + { + "epoch": 0.09295, + "grad_norm": 0.08209935575723648, + "learning_rate": 2.7645138416561843e-05, + "loss": 0.0341, + "step": 98590 + }, + { + "epoch": 0.093, + "grad_norm": 0.07948790490627289, + "learning_rate": 2.764102790885268e-05, + "loss": 0.0349, + "step": 98600 + }, + { + "epoch": 0.09305, + "grad_norm": 0.0787932276725769, + "learning_rate": 2.7636917328938794e-05, + "loss": 0.0353, + "step": 98610 + }, + { + "epoch": 0.0931, + "grad_norm": 0.0815957635641098, + "learning_rate": 2.7632806676932594e-05, + "loss": 0.0344, + "step": 98620 + }, + { + "epoch": 0.09315, + "grad_norm": 0.061056435108184814, + "learning_rate": 2.7628695952946436e-05, + "loss": 0.0371, + "step": 98630 + }, + { + "epoch": 0.0932, + "grad_norm": 0.0791785940527916, + "learning_rate": 2.762458515709273e-05, + "loss": 0.0341, + "step": 98640 + }, + { + "epoch": 0.09325, + "grad_norm": 0.09801524877548218, + "learning_rate": 2.7620474289483843e-05, + "loss": 0.036, + "step": 98650 + }, + { + "epoch": 0.0933, + "grad_norm": 0.07026061415672302, + "learning_rate": 2.7616363350232177e-05, + "loss": 0.0352, + "step": 98660 + }, + { + "epoch": 0.09335, + "grad_norm": 0.07985706627368927, + "learning_rate": 2.761225233945012e-05, + "loss": 0.0353, + "step": 98670 + }, + { + "epoch": 0.0934, + "grad_norm": 0.07724630832672119, + "learning_rate": 2.760814125725006e-05, + "loss": 0.0351, + "step": 98680 + }, + { + "epoch": 0.09345, + "grad_norm": 0.09305231273174286, + "learning_rate": 2.76040301037444e-05, + "loss": 0.0349, + "step": 98690 + }, + { + "epoch": 0.0935, + "grad_norm": 0.09154807776212692, + "learning_rate": 2.759991887904554e-05, + "loss": 0.0363, + "step": 98700 + }, + { + "epoch": 0.09355, + "grad_norm": 0.08939708769321442, + "learning_rate": 2.759580758326587e-05, + "loss": 0.0374, + "step": 98710 + }, + { + "epoch": 0.0936, + "grad_norm": 0.07384146004915237, + "learning_rate": 2.7591696216517804e-05, + "loss": 0.0381, + "step": 98720 + }, + { + "epoch": 0.09365, + "grad_norm": 0.07931797206401825, + "learning_rate": 2.7587584778913727e-05, + "loss": 0.0349, + "step": 98730 + }, + { + "epoch": 0.0937, + "grad_norm": 0.06376132369041443, + "learning_rate": 2.7583473270566058e-05, + "loss": 0.0344, + "step": 98740 + }, + { + "epoch": 0.09375, + "grad_norm": 0.0623442605137825, + "learning_rate": 2.7579361691587198e-05, + "loss": 0.0367, + "step": 98750 + }, + { + "epoch": 0.0938, + "grad_norm": 0.06312055140733719, + "learning_rate": 2.7575250042089562e-05, + "loss": 0.0365, + "step": 98760 + }, + { + "epoch": 0.09385, + "grad_norm": 0.06792078167200089, + "learning_rate": 2.7571138322185558e-05, + "loss": 0.0358, + "step": 98770 + }, + { + "epoch": 0.0939, + "grad_norm": 0.06359875947237015, + "learning_rate": 2.7567026531987594e-05, + "loss": 0.0342, + "step": 98780 + }, + { + "epoch": 0.09395, + "grad_norm": 0.06873776018619537, + "learning_rate": 2.7562914671608092e-05, + "loss": 0.0342, + "step": 98790 + }, + { + "epoch": 0.094, + "grad_norm": 0.07305468618869781, + "learning_rate": 2.7558802741159463e-05, + "loss": 0.0357, + "step": 98800 + }, + { + "epoch": 0.09405, + "grad_norm": 0.07306445389986038, + "learning_rate": 2.755469074075413e-05, + "loss": 0.0355, + "step": 98810 + }, + { + "epoch": 0.0941, + "grad_norm": 0.08633474260568619, + "learning_rate": 2.755057867050451e-05, + "loss": 0.0363, + "step": 98820 + }, + { + "epoch": 0.09415, + "grad_norm": 0.08300592750310898, + "learning_rate": 2.7546466530523035e-05, + "loss": 0.0331, + "step": 98830 + }, + { + "epoch": 0.0942, + "grad_norm": 0.08110862970352173, + "learning_rate": 2.7542354320922115e-05, + "loss": 0.0357, + "step": 98840 + }, + { + "epoch": 0.09425, + "grad_norm": 0.07080795615911484, + "learning_rate": 2.753824204181419e-05, + "loss": 0.0344, + "step": 98850 + }, + { + "epoch": 0.0943, + "grad_norm": 0.0670158788561821, + "learning_rate": 2.7534129693311674e-05, + "loss": 0.0342, + "step": 98860 + }, + { + "epoch": 0.09435, + "grad_norm": 0.0654955580830574, + "learning_rate": 2.7530017275527e-05, + "loss": 0.0354, + "step": 98870 + }, + { + "epoch": 0.0944, + "grad_norm": 0.08094431459903717, + "learning_rate": 2.7525904788572608e-05, + "loss": 0.0341, + "step": 98880 + }, + { + "epoch": 0.09445, + "grad_norm": 0.09253891557455063, + "learning_rate": 2.7521792232560932e-05, + "loss": 0.0382, + "step": 98890 + }, + { + "epoch": 0.0945, + "grad_norm": 0.10331499576568604, + "learning_rate": 2.7517679607604402e-05, + "loss": 0.0353, + "step": 98900 + }, + { + "epoch": 0.09455, + "grad_norm": 0.07507817447185516, + "learning_rate": 2.7513566913815458e-05, + "loss": 0.0349, + "step": 98910 + }, + { + "epoch": 0.0946, + "grad_norm": 0.08221866935491562, + "learning_rate": 2.7509454151306534e-05, + "loss": 0.0353, + "step": 98920 + }, + { + "epoch": 0.09465, + "grad_norm": 0.08716592192649841, + "learning_rate": 2.750534132019008e-05, + "loss": 0.0347, + "step": 98930 + }, + { + "epoch": 0.0947, + "grad_norm": 0.0685884952545166, + "learning_rate": 2.7501228420578533e-05, + "loss": 0.037, + "step": 98940 + }, + { + "epoch": 0.09475, + "grad_norm": 0.08891511708498001, + "learning_rate": 2.749711545258435e-05, + "loss": 0.0363, + "step": 98950 + }, + { + "epoch": 0.0948, + "grad_norm": 0.08393566310405731, + "learning_rate": 2.7493002416319958e-05, + "loss": 0.0364, + "step": 98960 + }, + { + "epoch": 0.09485, + "grad_norm": 0.070647694170475, + "learning_rate": 2.7488889311897826e-05, + "loss": 0.0357, + "step": 98970 + }, + { + "epoch": 0.0949, + "grad_norm": 0.10159338265657425, + "learning_rate": 2.748477613943039e-05, + "loss": 0.0366, + "step": 98980 + }, + { + "epoch": 0.09495, + "grad_norm": 0.0961875468492508, + "learning_rate": 2.7480662899030103e-05, + "loss": 0.0354, + "step": 98990 + }, + { + "epoch": 0.095, + "grad_norm": 0.07758517563343048, + "learning_rate": 2.7476549590809425e-05, + "loss": 0.0367, + "step": 99000 + }, + { + "epoch": 0.09505, + "grad_norm": 0.07822514325380325, + "learning_rate": 2.747243621488082e-05, + "loss": 0.0361, + "step": 99010 + }, + { + "epoch": 0.0951, + "grad_norm": 0.0853765457868576, + "learning_rate": 2.7468322771356736e-05, + "loss": 0.0357, + "step": 99020 + }, + { + "epoch": 0.09515, + "grad_norm": 0.10141955316066742, + "learning_rate": 2.746420926034963e-05, + "loss": 0.0353, + "step": 99030 + }, + { + "epoch": 0.0952, + "grad_norm": 0.0946832075715065, + "learning_rate": 2.746009568197197e-05, + "loss": 0.0354, + "step": 99040 + }, + { + "epoch": 0.09525, + "grad_norm": 0.08281733095645905, + "learning_rate": 2.745598203633622e-05, + "loss": 0.0351, + "step": 99050 + }, + { + "epoch": 0.0953, + "grad_norm": 0.09299663454294205, + "learning_rate": 2.7451868323554842e-05, + "loss": 0.0351, + "step": 99060 + }, + { + "epoch": 0.09535, + "grad_norm": 0.11017463356256485, + "learning_rate": 2.74477545437403e-05, + "loss": 0.0364, + "step": 99070 + }, + { + "epoch": 0.0954, + "grad_norm": 0.09078861027956009, + "learning_rate": 2.744364069700508e-05, + "loss": 0.0367, + "step": 99080 + }, + { + "epoch": 0.09545, + "grad_norm": 0.10564927011728287, + "learning_rate": 2.7439526783461632e-05, + "loss": 0.0346, + "step": 99090 + }, + { + "epoch": 0.0955, + "grad_norm": 0.08630510419607162, + "learning_rate": 2.7435412803222443e-05, + "loss": 0.0348, + "step": 99100 + }, + { + "epoch": 0.09555, + "grad_norm": 0.08209118992090225, + "learning_rate": 2.7431298756399982e-05, + "loss": 0.0336, + "step": 99110 + }, + { + "epoch": 0.0956, + "grad_norm": 0.07582660764455795, + "learning_rate": 2.7427184643106723e-05, + "loss": 0.0343, + "step": 99120 + }, + { + "epoch": 0.09565, + "grad_norm": 0.07559537142515182, + "learning_rate": 2.7423070463455147e-05, + "loss": 0.0345, + "step": 99130 + }, + { + "epoch": 0.0957, + "grad_norm": 0.07149489223957062, + "learning_rate": 2.7418956217557745e-05, + "loss": 0.0345, + "step": 99140 + }, + { + "epoch": 0.09575, + "grad_norm": 0.08505064249038696, + "learning_rate": 2.741484190552698e-05, + "loss": 0.035, + "step": 99150 + }, + { + "epoch": 0.0958, + "grad_norm": 0.06394463777542114, + "learning_rate": 2.741072752747535e-05, + "loss": 0.0339, + "step": 99160 + }, + { + "epoch": 0.09585, + "grad_norm": 0.06664594262838364, + "learning_rate": 2.7406613083515333e-05, + "loss": 0.0342, + "step": 99170 + }, + { + "epoch": 0.0959, + "grad_norm": 0.0785508006811142, + "learning_rate": 2.7402498573759415e-05, + "loss": 0.0347, + "step": 99180 + }, + { + "epoch": 0.09595, + "grad_norm": 0.06014531850814819, + "learning_rate": 2.7398383998320088e-05, + "loss": 0.0376, + "step": 99190 + }, + { + "epoch": 0.096, + "grad_norm": 0.07316498458385468, + "learning_rate": 2.739426935730985e-05, + "loss": 0.0345, + "step": 99200 + }, + { + "epoch": 0.09605, + "grad_norm": 0.06742087006568909, + "learning_rate": 2.7390154650841182e-05, + "loss": 0.0345, + "step": 99210 + }, + { + "epoch": 0.0961, + "grad_norm": 0.07355812937021255, + "learning_rate": 2.7386039879026586e-05, + "loss": 0.0348, + "step": 99220 + }, + { + "epoch": 0.09615, + "grad_norm": 0.06187834590673447, + "learning_rate": 2.7381925041978558e-05, + "loss": 0.0346, + "step": 99230 + }, + { + "epoch": 0.0962, + "grad_norm": 0.07339289784431458, + "learning_rate": 2.737781013980959e-05, + "loss": 0.0354, + "step": 99240 + }, + { + "epoch": 0.09625, + "grad_norm": 0.06592538952827454, + "learning_rate": 2.7373695172632184e-05, + "loss": 0.0356, + "step": 99250 + }, + { + "epoch": 0.0963, + "grad_norm": 0.0785292387008667, + "learning_rate": 2.7369580140558855e-05, + "loss": 0.0354, + "step": 99260 + }, + { + "epoch": 0.09635, + "grad_norm": 0.06731037050485611, + "learning_rate": 2.736546504370208e-05, + "loss": 0.0348, + "step": 99270 + }, + { + "epoch": 0.0964, + "grad_norm": 0.0630384087562561, + "learning_rate": 2.7361349882174385e-05, + "loss": 0.0356, + "step": 99280 + }, + { + "epoch": 0.09645, + "grad_norm": 0.08435707539319992, + "learning_rate": 2.735723465608828e-05, + "loss": 0.0355, + "step": 99290 + }, + { + "epoch": 0.0965, + "grad_norm": 0.1166800782084465, + "learning_rate": 2.7353119365556258e-05, + "loss": 0.0379, + "step": 99300 + }, + { + "epoch": 0.09655, + "grad_norm": 0.0964205339550972, + "learning_rate": 2.7349004010690833e-05, + "loss": 0.0372, + "step": 99310 + }, + { + "epoch": 0.0966, + "grad_norm": 0.10768742859363556, + "learning_rate": 2.7344888591604524e-05, + "loss": 0.0367, + "step": 99320 + }, + { + "epoch": 0.09665, + "grad_norm": 0.08268402516841888, + "learning_rate": 2.7340773108409847e-05, + "loss": 0.0355, + "step": 99330 + }, + { + "epoch": 0.0967, + "grad_norm": 0.08508584648370743, + "learning_rate": 2.7336657561219302e-05, + "loss": 0.0349, + "step": 99340 + }, + { + "epoch": 0.09675, + "grad_norm": 0.07057398557662964, + "learning_rate": 2.733254195014543e-05, + "loss": 0.0338, + "step": 99350 + }, + { + "epoch": 0.0968, + "grad_norm": 0.08485107123851776, + "learning_rate": 2.732842627530073e-05, + "loss": 0.0344, + "step": 99360 + }, + { + "epoch": 0.09685, + "grad_norm": 0.09654036164283752, + "learning_rate": 2.732431053679773e-05, + "loss": 0.0353, + "step": 99370 + }, + { + "epoch": 0.0969, + "grad_norm": 0.09786161035299301, + "learning_rate": 2.732019473474895e-05, + "loss": 0.0372, + "step": 99380 + }, + { + "epoch": 0.09695, + "grad_norm": 0.10060153901576996, + "learning_rate": 2.7316078869266926e-05, + "loss": 0.0353, + "step": 99390 + }, + { + "epoch": 0.097, + "grad_norm": 0.07460450381040573, + "learning_rate": 2.731196294046417e-05, + "loss": 0.0362, + "step": 99400 + }, + { + "epoch": 0.09705, + "grad_norm": 0.07266578823328018, + "learning_rate": 2.730784694845322e-05, + "loss": 0.0349, + "step": 99410 + }, + { + "epoch": 0.0971, + "grad_norm": 0.06460300087928772, + "learning_rate": 2.7303730893346598e-05, + "loss": 0.0349, + "step": 99420 + }, + { + "epoch": 0.09715, + "grad_norm": 0.07580089569091797, + "learning_rate": 2.7299614775256843e-05, + "loss": 0.0341, + "step": 99430 + }, + { + "epoch": 0.0972, + "grad_norm": 0.07017458230257034, + "learning_rate": 2.7295498594296477e-05, + "loss": 0.0345, + "step": 99440 + }, + { + "epoch": 0.09725, + "grad_norm": 0.09122201800346375, + "learning_rate": 2.7291382350578048e-05, + "loss": 0.0372, + "step": 99450 + }, + { + "epoch": 0.0973, + "grad_norm": 0.06226756423711777, + "learning_rate": 2.7287266044214082e-05, + "loss": 0.034, + "step": 99460 + }, + { + "epoch": 0.09735, + "grad_norm": 0.07291603088378906, + "learning_rate": 2.7283149675317126e-05, + "loss": 0.0349, + "step": 99470 + }, + { + "epoch": 0.0974, + "grad_norm": 0.06803841888904572, + "learning_rate": 2.7279033243999714e-05, + "loss": 0.0359, + "step": 99480 + }, + { + "epoch": 0.09745, + "grad_norm": 0.0804368108510971, + "learning_rate": 2.7274916750374385e-05, + "loss": 0.0372, + "step": 99490 + }, + { + "epoch": 0.0975, + "grad_norm": 0.06407159566879272, + "learning_rate": 2.7270800194553686e-05, + "loss": 0.0352, + "step": 99500 + }, + { + "epoch": 0.09755, + "grad_norm": 0.07560861855745316, + "learning_rate": 2.726668357665017e-05, + "loss": 0.0361, + "step": 99510 + }, + { + "epoch": 0.0976, + "grad_norm": 0.08951292186975479, + "learning_rate": 2.7262566896776376e-05, + "loss": 0.0357, + "step": 99520 + }, + { + "epoch": 0.09765, + "grad_norm": 0.09539582580327988, + "learning_rate": 2.7258450155044844e-05, + "loss": 0.0356, + "step": 99530 + }, + { + "epoch": 0.0977, + "grad_norm": 0.07857454568147659, + "learning_rate": 2.7254333351568144e-05, + "loss": 0.0351, + "step": 99540 + }, + { + "epoch": 0.09775, + "grad_norm": 0.08996032178401947, + "learning_rate": 2.7250216486458813e-05, + "loss": 0.036, + "step": 99550 + }, + { + "epoch": 0.0978, + "grad_norm": 0.08443517982959747, + "learning_rate": 2.7246099559829412e-05, + "loss": 0.0358, + "step": 99560 + }, + { + "epoch": 0.09785, + "grad_norm": 0.07626023143529892, + "learning_rate": 2.7241982571792486e-05, + "loss": 0.0355, + "step": 99570 + }, + { + "epoch": 0.0979, + "grad_norm": 0.10047277808189392, + "learning_rate": 2.7237865522460604e-05, + "loss": 0.035, + "step": 99580 + }, + { + "epoch": 0.09795, + "grad_norm": 0.12941400706768036, + "learning_rate": 2.7233748411946313e-05, + "loss": 0.0361, + "step": 99590 + }, + { + "epoch": 0.098, + "grad_norm": 0.11946405470371246, + "learning_rate": 2.722963124036219e-05, + "loss": 0.0361, + "step": 99600 + }, + { + "epoch": 0.09805, + "grad_norm": 0.088363878428936, + "learning_rate": 2.722551400782078e-05, + "loss": 0.0353, + "step": 99610 + }, + { + "epoch": 0.0981, + "grad_norm": 0.09197662025690079, + "learning_rate": 2.7221396714434655e-05, + "loss": 0.0344, + "step": 99620 + }, + { + "epoch": 0.09815, + "grad_norm": 0.08126228302717209, + "learning_rate": 2.721727936031637e-05, + "loss": 0.0364, + "step": 99630 + }, + { + "epoch": 0.0982, + "grad_norm": 0.10342596471309662, + "learning_rate": 2.7213161945578514e-05, + "loss": 0.0358, + "step": 99640 + }, + { + "epoch": 0.09825, + "grad_norm": 0.0790046751499176, + "learning_rate": 2.7209044470333635e-05, + "loss": 0.0352, + "step": 99650 + }, + { + "epoch": 0.0983, + "grad_norm": 0.09566021710634232, + "learning_rate": 2.7204926934694307e-05, + "loss": 0.0358, + "step": 99660 + }, + { + "epoch": 0.09835, + "grad_norm": 0.07163716107606888, + "learning_rate": 2.7200809338773108e-05, + "loss": 0.0351, + "step": 99670 + }, + { + "epoch": 0.0984, + "grad_norm": 0.07070885598659515, + "learning_rate": 2.719669168268261e-05, + "loss": 0.0363, + "step": 99680 + }, + { + "epoch": 0.09845, + "grad_norm": 0.07439655065536499, + "learning_rate": 2.7192573966535385e-05, + "loss": 0.036, + "step": 99690 + }, + { + "epoch": 0.0985, + "grad_norm": 0.07881123572587967, + "learning_rate": 2.718845619044401e-05, + "loss": 0.0344, + "step": 99700 + }, + { + "epoch": 0.09855, + "grad_norm": 0.08434942364692688, + "learning_rate": 2.7184338354521067e-05, + "loss": 0.0356, + "step": 99710 + }, + { + "epoch": 0.0986, + "grad_norm": 0.0887083187699318, + "learning_rate": 2.7180220458879136e-05, + "loss": 0.0351, + "step": 99720 + }, + { + "epoch": 0.09865, + "grad_norm": 0.07735418528318405, + "learning_rate": 2.7176102503630796e-05, + "loss": 0.0344, + "step": 99730 + }, + { + "epoch": 0.0987, + "grad_norm": 0.07166688144207001, + "learning_rate": 2.7171984488888623e-05, + "loss": 0.0368, + "step": 99740 + }, + { + "epoch": 0.09875, + "grad_norm": 0.07416993379592896, + "learning_rate": 2.7167866414765226e-05, + "loss": 0.0346, + "step": 99750 + }, + { + "epoch": 0.0988, + "grad_norm": 0.07947028428316116, + "learning_rate": 2.7163748281373164e-05, + "loss": 0.0331, + "step": 99760 + }, + { + "epoch": 0.09885, + "grad_norm": 0.06419810652732849, + "learning_rate": 2.7159630088825034e-05, + "loss": 0.0353, + "step": 99770 + }, + { + "epoch": 0.0989, + "grad_norm": 0.07942546904087067, + "learning_rate": 2.715551183723343e-05, + "loss": 0.0341, + "step": 99780 + }, + { + "epoch": 0.09895, + "grad_norm": 0.08818191289901733, + "learning_rate": 2.7151393526710955e-05, + "loss": 0.0348, + "step": 99790 + }, + { + "epoch": 0.099, + "grad_norm": 0.07038391381502151, + "learning_rate": 2.714727515737018e-05, + "loss": 0.0342, + "step": 99800 + }, + { + "epoch": 0.09905, + "grad_norm": 0.06655339896678925, + "learning_rate": 2.714315672932371e-05, + "loss": 0.0346, + "step": 99810 + }, + { + "epoch": 0.0991, + "grad_norm": 0.07684661448001862, + "learning_rate": 2.7139038242684127e-05, + "loss": 0.0356, + "step": 99820 + }, + { + "epoch": 0.09915, + "grad_norm": 0.06352897733449936, + "learning_rate": 2.713491969756406e-05, + "loss": 0.0351, + "step": 99830 + }, + { + "epoch": 0.0992, + "grad_norm": 0.0849921852350235, + "learning_rate": 2.7130801094076088e-05, + "loss": 0.035, + "step": 99840 + }, + { + "epoch": 0.09925, + "grad_norm": 0.08481822907924652, + "learning_rate": 2.7126682432332812e-05, + "loss": 0.0338, + "step": 99850 + }, + { + "epoch": 0.0993, + "grad_norm": 0.0884605348110199, + "learning_rate": 2.7122563712446834e-05, + "loss": 0.0362, + "step": 99860 + }, + { + "epoch": 0.09935, + "grad_norm": 0.0767376720905304, + "learning_rate": 2.7118444934530768e-05, + "loss": 0.0345, + "step": 99870 + }, + { + "epoch": 0.0994, + "grad_norm": 0.07019027322530746, + "learning_rate": 2.7114326098697207e-05, + "loss": 0.0357, + "step": 99880 + }, + { + "epoch": 0.09945, + "grad_norm": 0.08582523465156555, + "learning_rate": 2.7110207205058768e-05, + "loss": 0.0348, + "step": 99890 + }, + { + "epoch": 0.0995, + "grad_norm": 0.087223120033741, + "learning_rate": 2.710608825372805e-05, + "loss": 0.0356, + "step": 99900 + }, + { + "epoch": 0.09955, + "grad_norm": 0.07381154596805573, + "learning_rate": 2.7101969244817683e-05, + "loss": 0.0354, + "step": 99910 + }, + { + "epoch": 0.0996, + "grad_norm": 0.07284123450517654, + "learning_rate": 2.709785017844026e-05, + "loss": 0.0358, + "step": 99920 + }, + { + "epoch": 0.09965, + "grad_norm": 0.0720820426940918, + "learning_rate": 2.7093731054708404e-05, + "loss": 0.0354, + "step": 99930 + }, + { + "epoch": 0.0997, + "grad_norm": 0.08696381002664566, + "learning_rate": 2.708961187373472e-05, + "loss": 0.0393, + "step": 99940 + }, + { + "epoch": 0.09975, + "grad_norm": 0.09174282103776932, + "learning_rate": 2.7085492635631838e-05, + "loss": 0.0354, + "step": 99950 + }, + { + "epoch": 0.0998, + "grad_norm": 0.07005579024553299, + "learning_rate": 2.708137334051237e-05, + "loss": 0.0352, + "step": 99960 + }, + { + "epoch": 0.09985, + "grad_norm": 0.05808929353952408, + "learning_rate": 2.7077253988488937e-05, + "loss": 0.0348, + "step": 99970 + }, + { + "epoch": 0.0999, + "grad_norm": 0.05459505692124367, + "learning_rate": 2.707313457967416e-05, + "loss": 0.0362, + "step": 99980 + }, + { + "epoch": 0.09995, + "grad_norm": 0.07056388258934021, + "learning_rate": 2.7069015114180664e-05, + "loss": 0.0358, + "step": 99990 + }, + { + "epoch": 0.1, + "grad_norm": 0.08420798927545547, + "learning_rate": 2.706489559212107e-05, + "loss": 0.0364, + "step": 100000 + }, + { + "epoch": 0.10005, + "grad_norm": 0.06554892659187317, + "learning_rate": 2.706077601360801e-05, + "loss": 0.0351, + "step": 100010 + }, + { + "epoch": 0.1001, + "grad_norm": 0.07413670420646667, + "learning_rate": 2.7056656378754097e-05, + "loss": 0.035, + "step": 100020 + }, + { + "epoch": 0.10015, + "grad_norm": 0.0762542188167572, + "learning_rate": 2.705253668767198e-05, + "loss": 0.0359, + "step": 100030 + }, + { + "epoch": 0.1002, + "grad_norm": 0.0821673646569252, + "learning_rate": 2.7048416940474285e-05, + "loss": 0.0356, + "step": 100040 + }, + { + "epoch": 0.10025, + "grad_norm": 0.0915229469537735, + "learning_rate": 2.7044297137273632e-05, + "loss": 0.0357, + "step": 100050 + }, + { + "epoch": 0.1003, + "grad_norm": 0.07093614339828491, + "learning_rate": 2.7040177278182672e-05, + "loss": 0.0341, + "step": 100060 + }, + { + "epoch": 0.10035, + "grad_norm": 0.0898672342300415, + "learning_rate": 2.7036057363314026e-05, + "loss": 0.0348, + "step": 100070 + }, + { + "epoch": 0.1004, + "grad_norm": 0.07349665462970734, + "learning_rate": 2.7031937392780334e-05, + "loss": 0.0333, + "step": 100080 + }, + { + "epoch": 0.10045, + "grad_norm": 0.06879694014787674, + "learning_rate": 2.7027817366694236e-05, + "loss": 0.0359, + "step": 100090 + }, + { + "epoch": 0.1005, + "grad_norm": 0.06934312731027603, + "learning_rate": 2.7023697285168382e-05, + "loss": 0.0334, + "step": 100100 + }, + { + "epoch": 0.10055, + "grad_norm": 0.06713636219501495, + "learning_rate": 2.70195771483154e-05, + "loss": 0.0328, + "step": 100110 + }, + { + "epoch": 0.1006, + "grad_norm": 0.0665070191025734, + "learning_rate": 2.701545695624794e-05, + "loss": 0.0329, + "step": 100120 + }, + { + "epoch": 0.10065, + "grad_norm": 0.07813754677772522, + "learning_rate": 2.7011336709078638e-05, + "loss": 0.0342, + "step": 100130 + }, + { + "epoch": 0.1007, + "grad_norm": 0.06740997731685638, + "learning_rate": 2.700721640692015e-05, + "loss": 0.0331, + "step": 100140 + }, + { + "epoch": 0.10075, + "grad_norm": 0.08873384445905685, + "learning_rate": 2.7003096049885112e-05, + "loss": 0.0393, + "step": 100150 + }, + { + "epoch": 0.1008, + "grad_norm": 0.08748091757297516, + "learning_rate": 2.6998975638086194e-05, + "loss": 0.0328, + "step": 100160 + }, + { + "epoch": 0.10085, + "grad_norm": 0.07819615304470062, + "learning_rate": 2.6994855171636026e-05, + "loss": 0.035, + "step": 100170 + }, + { + "epoch": 0.1009, + "grad_norm": 0.06912656128406525, + "learning_rate": 2.699073465064727e-05, + "loss": 0.0342, + "step": 100180 + }, + { + "epoch": 0.10095, + "grad_norm": 0.07246105372905731, + "learning_rate": 2.6986614075232574e-05, + "loss": 0.0359, + "step": 100190 + }, + { + "epoch": 0.101, + "grad_norm": 0.08377740532159805, + "learning_rate": 2.698249344550459e-05, + "loss": 0.0362, + "step": 100200 + }, + { + "epoch": 0.10105, + "grad_norm": 0.07372310757637024, + "learning_rate": 2.697837276157599e-05, + "loss": 0.0352, + "step": 100210 + }, + { + "epoch": 0.1011, + "grad_norm": 0.0842214971780777, + "learning_rate": 2.6974252023559414e-05, + "loss": 0.0364, + "step": 100220 + }, + { + "epoch": 0.10115, + "grad_norm": 0.07575521618127823, + "learning_rate": 2.697013123156753e-05, + "loss": 0.0356, + "step": 100230 + }, + { + "epoch": 0.1012, + "grad_norm": 0.07804497331380844, + "learning_rate": 2.6966010385713003e-05, + "loss": 0.0356, + "step": 100240 + }, + { + "epoch": 0.10125, + "grad_norm": 0.10025062412023544, + "learning_rate": 2.6961889486108495e-05, + "loss": 0.0376, + "step": 100250 + }, + { + "epoch": 0.1013, + "grad_norm": 0.07742762565612793, + "learning_rate": 2.6957768532866656e-05, + "loss": 0.0349, + "step": 100260 + }, + { + "epoch": 0.10135, + "grad_norm": 0.10000407695770264, + "learning_rate": 2.695364752610016e-05, + "loss": 0.0352, + "step": 100270 + }, + { + "epoch": 0.1014, + "grad_norm": 0.08434217423200607, + "learning_rate": 2.6949526465921675e-05, + "loss": 0.0338, + "step": 100280 + }, + { + "epoch": 0.10145, + "grad_norm": 0.10215528309345245, + "learning_rate": 2.6945405352443875e-05, + "loss": 0.0353, + "step": 100290 + }, + { + "epoch": 0.1015, + "grad_norm": 0.08367814868688583, + "learning_rate": 2.694128418577942e-05, + "loss": 0.0352, + "step": 100300 + }, + { + "epoch": 0.10155, + "grad_norm": 0.08544318377971649, + "learning_rate": 2.693716296604099e-05, + "loss": 0.0332, + "step": 100310 + }, + { + "epoch": 0.1016, + "grad_norm": 0.08193808048963547, + "learning_rate": 2.6933041693341248e-05, + "loss": 0.034, + "step": 100320 + }, + { + "epoch": 0.10165, + "grad_norm": 0.08198003470897675, + "learning_rate": 2.692892036779287e-05, + "loss": 0.0351, + "step": 100330 + }, + { + "epoch": 0.1017, + "grad_norm": 0.07437553256750107, + "learning_rate": 2.6924798989508532e-05, + "loss": 0.0343, + "step": 100340 + }, + { + "epoch": 0.10175, + "grad_norm": 0.07319196313619614, + "learning_rate": 2.692067755860092e-05, + "loss": 0.0386, + "step": 100350 + }, + { + "epoch": 0.1018, + "grad_norm": 0.06065535545349121, + "learning_rate": 2.6916556075182704e-05, + "loss": 0.0349, + "step": 100360 + }, + { + "epoch": 0.10185, + "grad_norm": 0.0815400555729866, + "learning_rate": 2.6912434539366565e-05, + "loss": 0.0347, + "step": 100370 + }, + { + "epoch": 0.1019, + "grad_norm": 0.07775052636861801, + "learning_rate": 2.690831295126518e-05, + "loss": 0.0371, + "step": 100380 + }, + { + "epoch": 0.10195, + "grad_norm": 0.07136549055576324, + "learning_rate": 2.6904191310991238e-05, + "loss": 0.0365, + "step": 100390 + }, + { + "epoch": 0.102, + "grad_norm": 0.07335707545280457, + "learning_rate": 2.6900069618657413e-05, + "loss": 0.0366, + "step": 100400 + }, + { + "epoch": 0.10205, + "grad_norm": 0.07020821422338486, + "learning_rate": 2.689594787437641e-05, + "loss": 0.0346, + "step": 100410 + }, + { + "epoch": 0.1021, + "grad_norm": 0.07441741228103638, + "learning_rate": 2.68918260782609e-05, + "loss": 0.0364, + "step": 100420 + }, + { + "epoch": 0.10215, + "grad_norm": 0.07880008220672607, + "learning_rate": 2.688770423042358e-05, + "loss": 0.0378, + "step": 100430 + }, + { + "epoch": 0.1022, + "grad_norm": 0.07149319350719452, + "learning_rate": 2.688358233097713e-05, + "loss": 0.0356, + "step": 100440 + }, + { + "epoch": 0.10225, + "grad_norm": 0.08383669704198837, + "learning_rate": 2.687946038003425e-05, + "loss": 0.0337, + "step": 100450 + }, + { + "epoch": 0.1023, + "grad_norm": 0.07485045492649078, + "learning_rate": 2.687533837770762e-05, + "loss": 0.0368, + "step": 100460 + }, + { + "epoch": 0.10235, + "grad_norm": 0.07422316074371338, + "learning_rate": 2.6871216324109956e-05, + "loss": 0.0338, + "step": 100470 + }, + { + "epoch": 0.1024, + "grad_norm": 0.09465152025222778, + "learning_rate": 2.6867094219353933e-05, + "loss": 0.0352, + "step": 100480 + }, + { + "epoch": 0.10245, + "grad_norm": 0.10185496509075165, + "learning_rate": 2.6862972063552262e-05, + "loss": 0.0352, + "step": 100490 + }, + { + "epoch": 0.1025, + "grad_norm": 0.09141545742750168, + "learning_rate": 2.685884985681763e-05, + "loss": 0.0353, + "step": 100500 + }, + { + "epoch": 0.10255, + "grad_norm": 0.08120530843734741, + "learning_rate": 2.685472759926274e-05, + "loss": 0.0346, + "step": 100510 + }, + { + "epoch": 0.1026, + "grad_norm": 0.0668620839715004, + "learning_rate": 2.6850605291000297e-05, + "loss": 0.0346, + "step": 100520 + }, + { + "epoch": 0.10265, + "grad_norm": 0.060053206980228424, + "learning_rate": 2.6846482932142996e-05, + "loss": 0.0366, + "step": 100530 + }, + { + "epoch": 0.1027, + "grad_norm": 0.07062922418117523, + "learning_rate": 2.6842360522803554e-05, + "loss": 0.0347, + "step": 100540 + }, + { + "epoch": 0.10275, + "grad_norm": 0.05922499671578407, + "learning_rate": 2.683823806309466e-05, + "loss": 0.0339, + "step": 100550 + }, + { + "epoch": 0.1028, + "grad_norm": 0.059927552938461304, + "learning_rate": 2.6834115553129034e-05, + "loss": 0.0354, + "step": 100560 + }, + { + "epoch": 0.10285, + "grad_norm": 0.06442233920097351, + "learning_rate": 2.682999299301937e-05, + "loss": 0.0351, + "step": 100570 + }, + { + "epoch": 0.1029, + "grad_norm": 0.07545732706785202, + "learning_rate": 2.68258703828784e-05, + "loss": 0.0357, + "step": 100580 + }, + { + "epoch": 0.10295, + "grad_norm": 0.06960097700357437, + "learning_rate": 2.6821747722818797e-05, + "loss": 0.0357, + "step": 100590 + }, + { + "epoch": 0.103, + "grad_norm": 0.07541335374116898, + "learning_rate": 2.6817625012953313e-05, + "loss": 0.0358, + "step": 100600 + }, + { + "epoch": 0.10305, + "grad_norm": 0.06176866218447685, + "learning_rate": 2.6813502253394635e-05, + "loss": 0.0376, + "step": 100610 + }, + { + "epoch": 0.1031, + "grad_norm": 0.06761564314365387, + "learning_rate": 2.6809379444255493e-05, + "loss": 0.0343, + "step": 100620 + }, + { + "epoch": 0.10315, + "grad_norm": 0.07260114699602127, + "learning_rate": 2.6805256585648597e-05, + "loss": 0.0369, + "step": 100630 + }, + { + "epoch": 0.1032, + "grad_norm": 0.06497721374034882, + "learning_rate": 2.6801133677686663e-05, + "loss": 0.035, + "step": 100640 + }, + { + "epoch": 0.10325, + "grad_norm": 0.07391471415758133, + "learning_rate": 2.67970107204824e-05, + "loss": 0.0358, + "step": 100650 + }, + { + "epoch": 0.1033, + "grad_norm": 0.06881912797689438, + "learning_rate": 2.679288771414855e-05, + "loss": 0.0352, + "step": 100660 + }, + { + "epoch": 0.10335, + "grad_norm": 0.06782495975494385, + "learning_rate": 2.6788764658797827e-05, + "loss": 0.0353, + "step": 100670 + }, + { + "epoch": 0.1034, + "grad_norm": 0.061936330050230026, + "learning_rate": 2.6784641554542943e-05, + "loss": 0.0365, + "step": 100680 + }, + { + "epoch": 0.10345, + "grad_norm": 0.09320276975631714, + "learning_rate": 2.6780518401496634e-05, + "loss": 0.0353, + "step": 100690 + }, + { + "epoch": 0.1035, + "grad_norm": 0.0687861442565918, + "learning_rate": 2.6776395199771616e-05, + "loss": 0.0351, + "step": 100700 + }, + { + "epoch": 0.10355, + "grad_norm": 0.06580278277397156, + "learning_rate": 2.6772271949480622e-05, + "loss": 0.0348, + "step": 100710 + }, + { + "epoch": 0.1036, + "grad_norm": 0.08795535564422607, + "learning_rate": 2.676814865073638e-05, + "loss": 0.0367, + "step": 100720 + }, + { + "epoch": 0.10365, + "grad_norm": 0.06928795576095581, + "learning_rate": 2.676402530365162e-05, + "loss": 0.0347, + "step": 100730 + }, + { + "epoch": 0.1037, + "grad_norm": 0.0687493160367012, + "learning_rate": 2.6759901908339065e-05, + "loss": 0.0353, + "step": 100740 + }, + { + "epoch": 0.10375, + "grad_norm": 0.09345578402280807, + "learning_rate": 2.6755778464911457e-05, + "loss": 0.0356, + "step": 100750 + }, + { + "epoch": 0.1038, + "grad_norm": 0.07706046849489212, + "learning_rate": 2.6751654973481526e-05, + "loss": 0.036, + "step": 100760 + }, + { + "epoch": 0.10385, + "grad_norm": 0.08092194050550461, + "learning_rate": 2.674753143416201e-05, + "loss": 0.036, + "step": 100770 + }, + { + "epoch": 0.1039, + "grad_norm": 0.08409049361944199, + "learning_rate": 2.6743407847065627e-05, + "loss": 0.0376, + "step": 100780 + }, + { + "epoch": 0.10395, + "grad_norm": 0.08491519838571548, + "learning_rate": 2.673928421230514e-05, + "loss": 0.0338, + "step": 100790 + }, + { + "epoch": 0.104, + "grad_norm": 0.11122860014438629, + "learning_rate": 2.673516052999327e-05, + "loss": 0.0419, + "step": 100800 + }, + { + "epoch": 0.10405, + "grad_norm": 0.088767871260643, + "learning_rate": 2.673103680024277e-05, + "loss": 0.0369, + "step": 100810 + }, + { + "epoch": 0.1041, + "grad_norm": 0.08989717811346054, + "learning_rate": 2.6726913023166374e-05, + "loss": 0.038, + "step": 100820 + }, + { + "epoch": 0.10415, + "grad_norm": 0.08206108957529068, + "learning_rate": 2.6722789198876825e-05, + "loss": 0.0353, + "step": 100830 + }, + { + "epoch": 0.1042, + "grad_norm": 0.07449140399694443, + "learning_rate": 2.6718665327486854e-05, + "loss": 0.0371, + "step": 100840 + }, + { + "epoch": 0.10425, + "grad_norm": 0.09066558629274368, + "learning_rate": 2.6714541409109228e-05, + "loss": 0.0348, + "step": 100850 + }, + { + "epoch": 0.1043, + "grad_norm": 0.07385815680027008, + "learning_rate": 2.6710417443856683e-05, + "loss": 0.0349, + "step": 100860 + }, + { + "epoch": 0.10435, + "grad_norm": 0.07193297892808914, + "learning_rate": 2.6706293431841974e-05, + "loss": 0.0341, + "step": 100870 + }, + { + "epoch": 0.1044, + "grad_norm": 0.07011745125055313, + "learning_rate": 2.670216937317784e-05, + "loss": 0.034, + "step": 100880 + }, + { + "epoch": 0.10445, + "grad_norm": 0.06763279438018799, + "learning_rate": 2.6698045267977034e-05, + "loss": 0.0334, + "step": 100890 + }, + { + "epoch": 0.1045, + "grad_norm": 0.0637497678399086, + "learning_rate": 2.6693921116352304e-05, + "loss": 0.0344, + "step": 100900 + }, + { + "epoch": 0.10455, + "grad_norm": 0.07138977944850922, + "learning_rate": 2.668979691841641e-05, + "loss": 0.035, + "step": 100910 + }, + { + "epoch": 0.1046, + "grad_norm": 0.0984807088971138, + "learning_rate": 2.6685672674282097e-05, + "loss": 0.0362, + "step": 100920 + }, + { + "epoch": 0.10465, + "grad_norm": 0.06861315667629242, + "learning_rate": 2.668154838406214e-05, + "loss": 0.0354, + "step": 100930 + }, + { + "epoch": 0.1047, + "grad_norm": 0.09650956094264984, + "learning_rate": 2.667742404786927e-05, + "loss": 0.0346, + "step": 100940 + }, + { + "epoch": 0.10475, + "grad_norm": 0.07507655769586563, + "learning_rate": 2.667329966581626e-05, + "loss": 0.0348, + "step": 100950 + }, + { + "epoch": 0.1048, + "grad_norm": 0.0630788579583168, + "learning_rate": 2.666917523801587e-05, + "loss": 0.0368, + "step": 100960 + }, + { + "epoch": 0.10485, + "grad_norm": 0.06309845298528671, + "learning_rate": 2.6665050764580852e-05, + "loss": 0.0362, + "step": 100970 + }, + { + "epoch": 0.1049, + "grad_norm": 0.07771171629428864, + "learning_rate": 2.6660926245623968e-05, + "loss": 0.0348, + "step": 100980 + }, + { + "epoch": 0.10495, + "grad_norm": 0.06262598931789398, + "learning_rate": 2.6656801681257986e-05, + "loss": 0.0348, + "step": 100990 + }, + { + "epoch": 0.105, + "grad_norm": 0.06783615052700043, + "learning_rate": 2.6652677071595677e-05, + "loss": 0.0342, + "step": 101000 + }, + { + "epoch": 0.10505, + "grad_norm": 0.06547072529792786, + "learning_rate": 2.6648552416749795e-05, + "loss": 0.0357, + "step": 101010 + }, + { + "epoch": 0.1051, + "grad_norm": 0.058621667325496674, + "learning_rate": 2.664442771683311e-05, + "loss": 0.0348, + "step": 101020 + }, + { + "epoch": 0.10515, + "grad_norm": 0.08480167388916016, + "learning_rate": 2.6640302971958376e-05, + "loss": 0.0382, + "step": 101030 + }, + { + "epoch": 0.1052, + "grad_norm": 0.0638345256447792, + "learning_rate": 2.6636178182238387e-05, + "loss": 0.035, + "step": 101040 + }, + { + "epoch": 0.10525, + "grad_norm": 0.07020757347345352, + "learning_rate": 2.6632053347785897e-05, + "loss": 0.035, + "step": 101050 + }, + { + "epoch": 0.1053, + "grad_norm": 0.0668468028306961, + "learning_rate": 2.6627928468713687e-05, + "loss": 0.0352, + "step": 101060 + }, + { + "epoch": 0.10535, + "grad_norm": 0.0771222710609436, + "learning_rate": 2.6623803545134517e-05, + "loss": 0.0348, + "step": 101070 + }, + { + "epoch": 0.1054, + "grad_norm": 0.0790410116314888, + "learning_rate": 2.6619678577161178e-05, + "loss": 0.0347, + "step": 101080 + }, + { + "epoch": 0.10545, + "grad_norm": 0.0950852632522583, + "learning_rate": 2.6615553564906426e-05, + "loss": 0.0346, + "step": 101090 + }, + { + "epoch": 0.1055, + "grad_norm": 0.06712023168802261, + "learning_rate": 2.661142850848305e-05, + "loss": 0.0344, + "step": 101100 + }, + { + "epoch": 0.10555, + "grad_norm": 0.0824480876326561, + "learning_rate": 2.660730340800382e-05, + "loss": 0.0357, + "step": 101110 + }, + { + "epoch": 0.1056, + "grad_norm": 0.08160246163606644, + "learning_rate": 2.6603178263581525e-05, + "loss": 0.0357, + "step": 101120 + }, + { + "epoch": 0.10565, + "grad_norm": 0.07077962905168533, + "learning_rate": 2.6599053075328933e-05, + "loss": 0.035, + "step": 101130 + }, + { + "epoch": 0.1057, + "grad_norm": 0.1023697629570961, + "learning_rate": 2.6594927843358836e-05, + "loss": 0.0352, + "step": 101140 + }, + { + "epoch": 0.10575, + "grad_norm": 0.0759579986333847, + "learning_rate": 2.6590802567784008e-05, + "loss": 0.0343, + "step": 101150 + }, + { + "epoch": 0.1058, + "grad_norm": 0.07697603106498718, + "learning_rate": 2.6586677248717233e-05, + "loss": 0.0356, + "step": 101160 + }, + { + "epoch": 0.10585, + "grad_norm": 0.07262375950813293, + "learning_rate": 2.65825518862713e-05, + "loss": 0.0352, + "step": 101170 + }, + { + "epoch": 0.1059, + "grad_norm": 0.07879820466041565, + "learning_rate": 2.6578426480558993e-05, + "loss": 0.0363, + "step": 101180 + }, + { + "epoch": 0.10595, + "grad_norm": 0.0667225569486618, + "learning_rate": 2.65743010316931e-05, + "loss": 0.0342, + "step": 101190 + }, + { + "epoch": 0.106, + "grad_norm": 0.08262769877910614, + "learning_rate": 2.6570175539786406e-05, + "loss": 0.0353, + "step": 101200 + }, + { + "epoch": 0.10605, + "grad_norm": 0.07676971703767776, + "learning_rate": 2.656605000495171e-05, + "loss": 0.0353, + "step": 101210 + }, + { + "epoch": 0.1061, + "grad_norm": 0.07954535633325577, + "learning_rate": 2.656192442730179e-05, + "loss": 0.0347, + "step": 101220 + }, + { + "epoch": 0.10615, + "grad_norm": 0.07525185495615005, + "learning_rate": 2.6557798806949437e-05, + "loss": 0.034, + "step": 101230 + }, + { + "epoch": 0.1062, + "grad_norm": 0.06477613747119904, + "learning_rate": 2.6553673144007452e-05, + "loss": 0.0344, + "step": 101240 + }, + { + "epoch": 0.10625, + "grad_norm": 0.07150783389806747, + "learning_rate": 2.6549547438588635e-05, + "loss": 0.0345, + "step": 101250 + }, + { + "epoch": 0.1063, + "grad_norm": 0.07217303663492203, + "learning_rate": 2.6545421690805766e-05, + "loss": 0.0344, + "step": 101260 + }, + { + "epoch": 0.10635, + "grad_norm": 0.07395917177200317, + "learning_rate": 2.6541295900771657e-05, + "loss": 0.0351, + "step": 101270 + }, + { + "epoch": 0.1064, + "grad_norm": 0.09638627618551254, + "learning_rate": 2.6537170068599086e-05, + "loss": 0.0357, + "step": 101280 + }, + { + "epoch": 0.10645, + "grad_norm": 0.08601228892803192, + "learning_rate": 2.653304419440087e-05, + "loss": 0.0347, + "step": 101290 + }, + { + "epoch": 0.1065, + "grad_norm": 0.056638069450855255, + "learning_rate": 2.65289182782898e-05, + "loss": 0.0338, + "step": 101300 + }, + { + "epoch": 0.10655, + "grad_norm": 0.07380425930023193, + "learning_rate": 2.6524792320378678e-05, + "loss": 0.0348, + "step": 101310 + }, + { + "epoch": 0.1066, + "grad_norm": 0.07186898589134216, + "learning_rate": 2.6520666320780307e-05, + "loss": 0.0339, + "step": 101320 + }, + { + "epoch": 0.10665, + "grad_norm": 0.10297045111656189, + "learning_rate": 2.6516540279607492e-05, + "loss": 0.0351, + "step": 101330 + }, + { + "epoch": 0.1067, + "grad_norm": 0.07520277053117752, + "learning_rate": 2.6512414196973035e-05, + "loss": 0.0334, + "step": 101340 + }, + { + "epoch": 0.10675, + "grad_norm": 0.0703219398856163, + "learning_rate": 2.6508288072989736e-05, + "loss": 0.0354, + "step": 101350 + }, + { + "epoch": 0.1068, + "grad_norm": 0.06466661393642426, + "learning_rate": 2.6504161907770413e-05, + "loss": 0.0334, + "step": 101360 + }, + { + "epoch": 0.10685, + "grad_norm": 0.0602884441614151, + "learning_rate": 2.650003570142787e-05, + "loss": 0.0333, + "step": 101370 + }, + { + "epoch": 0.1069, + "grad_norm": 0.05375294014811516, + "learning_rate": 2.6495909454074915e-05, + "loss": 0.0334, + "step": 101380 + }, + { + "epoch": 0.10695, + "grad_norm": 0.06414522975683212, + "learning_rate": 2.649178316582435e-05, + "loss": 0.0333, + "step": 101390 + }, + { + "epoch": 0.107, + "grad_norm": 0.06261350959539413, + "learning_rate": 2.6487656836789e-05, + "loss": 0.0323, + "step": 101400 + }, + { + "epoch": 0.10705, + "grad_norm": 0.0568736270070076, + "learning_rate": 2.648353046708167e-05, + "loss": 0.0324, + "step": 101410 + }, + { + "epoch": 0.1071, + "grad_norm": 0.06497329473495483, + "learning_rate": 2.6479404056815172e-05, + "loss": 0.033, + "step": 101420 + }, + { + "epoch": 0.10715, + "grad_norm": 0.06477135419845581, + "learning_rate": 2.6475277606102327e-05, + "loss": 0.0342, + "step": 101430 + }, + { + "epoch": 0.1072, + "grad_norm": 0.06530822813510895, + "learning_rate": 2.6471151115055942e-05, + "loss": 0.0338, + "step": 101440 + }, + { + "epoch": 0.10725, + "grad_norm": 0.06552419811487198, + "learning_rate": 2.646702458378884e-05, + "loss": 0.035, + "step": 101450 + }, + { + "epoch": 0.1073, + "grad_norm": 0.07599837332963943, + "learning_rate": 2.646289801241384e-05, + "loss": 0.0372, + "step": 101460 + }, + { + "epoch": 0.10735, + "grad_norm": 0.07614503055810928, + "learning_rate": 2.6458771401043753e-05, + "loss": 0.0343, + "step": 101470 + }, + { + "epoch": 0.1074, + "grad_norm": 0.07338558882474899, + "learning_rate": 2.6454644749791406e-05, + "loss": 0.0338, + "step": 101480 + }, + { + "epoch": 0.10745, + "grad_norm": 0.06588335335254669, + "learning_rate": 2.645051805876962e-05, + "loss": 0.0343, + "step": 101490 + }, + { + "epoch": 0.1075, + "grad_norm": 0.07893166691064835, + "learning_rate": 2.6446391328091212e-05, + "loss": 0.0347, + "step": 101500 + }, + { + "epoch": 0.10755, + "grad_norm": 0.07759903371334076, + "learning_rate": 2.6442264557869012e-05, + "loss": 0.0342, + "step": 101510 + }, + { + "epoch": 0.1076, + "grad_norm": 0.08482649177312851, + "learning_rate": 2.6438137748215842e-05, + "loss": 0.0354, + "step": 101520 + }, + { + "epoch": 0.10765, + "grad_norm": 0.07509204745292664, + "learning_rate": 2.643401089924452e-05, + "loss": 0.0355, + "step": 101530 + }, + { + "epoch": 0.1077, + "grad_norm": 0.09492380917072296, + "learning_rate": 2.642988401106788e-05, + "loss": 0.0359, + "step": 101540 + }, + { + "epoch": 0.10775, + "grad_norm": 0.08142454922199249, + "learning_rate": 2.642575708379875e-05, + "loss": 0.0346, + "step": 101550 + }, + { + "epoch": 0.1078, + "grad_norm": 0.07955773919820786, + "learning_rate": 2.6421630117549962e-05, + "loss": 0.0345, + "step": 101560 + }, + { + "epoch": 0.10785, + "grad_norm": 0.0736236572265625, + "learning_rate": 2.6417503112434334e-05, + "loss": 0.034, + "step": 101570 + }, + { + "epoch": 0.1079, + "grad_norm": 0.0750240609049797, + "learning_rate": 2.641337606856471e-05, + "loss": 0.0348, + "step": 101580 + }, + { + "epoch": 0.10795, + "grad_norm": 0.06797299534082413, + "learning_rate": 2.640924898605391e-05, + "loss": 0.035, + "step": 101590 + }, + { + "epoch": 0.108, + "grad_norm": 0.07420060783624649, + "learning_rate": 2.640512186501477e-05, + "loss": 0.0338, + "step": 101600 + }, + { + "epoch": 0.10805, + "grad_norm": 0.06877487897872925, + "learning_rate": 2.6400994705560122e-05, + "loss": 0.0341, + "step": 101610 + }, + { + "epoch": 0.1081, + "grad_norm": 0.080462247133255, + "learning_rate": 2.639686750780282e-05, + "loss": 0.0354, + "step": 101620 + }, + { + "epoch": 0.10815, + "grad_norm": 0.10589322447776794, + "learning_rate": 2.6392740271855677e-05, + "loss": 0.0336, + "step": 101630 + }, + { + "epoch": 0.1082, + "grad_norm": 0.08398301899433136, + "learning_rate": 2.6388612997831537e-05, + "loss": 0.0342, + "step": 101640 + }, + { + "epoch": 0.10825, + "grad_norm": 0.0867784395813942, + "learning_rate": 2.638448568584324e-05, + "loss": 0.038, + "step": 101650 + }, + { + "epoch": 0.1083, + "grad_norm": 0.07799968123435974, + "learning_rate": 2.6380358336003626e-05, + "loss": 0.0338, + "step": 101660 + }, + { + "epoch": 0.10835, + "grad_norm": 0.0743381604552269, + "learning_rate": 2.6376230948425527e-05, + "loss": 0.0344, + "step": 101670 + }, + { + "epoch": 0.1084, + "grad_norm": 0.07239367812871933, + "learning_rate": 2.6372103523221802e-05, + "loss": 0.0348, + "step": 101680 + }, + { + "epoch": 0.10845, + "grad_norm": 0.0794605016708374, + "learning_rate": 2.6367976060505274e-05, + "loss": 0.0345, + "step": 101690 + }, + { + "epoch": 0.1085, + "grad_norm": 0.11208935081958771, + "learning_rate": 2.6363848560388793e-05, + "loss": 0.0347, + "step": 101700 + }, + { + "epoch": 0.10855, + "grad_norm": 0.0862400084733963, + "learning_rate": 2.6359721022985217e-05, + "loss": 0.035, + "step": 101710 + }, + { + "epoch": 0.1086, + "grad_norm": 0.08822376281023026, + "learning_rate": 2.6355593448407367e-05, + "loss": 0.0361, + "step": 101720 + }, + { + "epoch": 0.10865, + "grad_norm": 0.07624153047800064, + "learning_rate": 2.63514658367681e-05, + "loss": 0.0347, + "step": 101730 + }, + { + "epoch": 0.1087, + "grad_norm": 0.07173407822847366, + "learning_rate": 2.634733818818027e-05, + "loss": 0.0341, + "step": 101740 + }, + { + "epoch": 0.10875, + "grad_norm": 0.09655409306287766, + "learning_rate": 2.6343210502756727e-05, + "loss": 0.0398, + "step": 101750 + }, + { + "epoch": 0.1088, + "grad_norm": 0.08558224141597748, + "learning_rate": 2.63390827806103e-05, + "loss": 0.0344, + "step": 101760 + }, + { + "epoch": 0.10885, + "grad_norm": 0.07107986509799957, + "learning_rate": 2.6334955021853868e-05, + "loss": 0.0347, + "step": 101770 + }, + { + "epoch": 0.1089, + "grad_norm": 0.08177818357944489, + "learning_rate": 2.633082722660026e-05, + "loss": 0.0351, + "step": 101780 + }, + { + "epoch": 0.10895, + "grad_norm": 0.07822753489017487, + "learning_rate": 2.6326699394962333e-05, + "loss": 0.034, + "step": 101790 + }, + { + "epoch": 0.109, + "grad_norm": 0.07757072150707245, + "learning_rate": 2.6322571527052934e-05, + "loss": 0.0351, + "step": 101800 + }, + { + "epoch": 0.10905, + "grad_norm": 0.08093541860580444, + "learning_rate": 2.6318443622984946e-05, + "loss": 0.0349, + "step": 101810 + }, + { + "epoch": 0.1091, + "grad_norm": 0.08605824410915375, + "learning_rate": 2.6314315682871193e-05, + "loss": 0.0371, + "step": 101820 + }, + { + "epoch": 0.10915, + "grad_norm": 0.07828135788440704, + "learning_rate": 2.6310187706824548e-05, + "loss": 0.0345, + "step": 101830 + }, + { + "epoch": 0.1092, + "grad_norm": 0.09517630934715271, + "learning_rate": 2.6306059694957858e-05, + "loss": 0.0346, + "step": 101840 + }, + { + "epoch": 0.10925, + "grad_norm": 0.11697806417942047, + "learning_rate": 2.6301931647383993e-05, + "loss": 0.0349, + "step": 101850 + }, + { + "epoch": 0.1093, + "grad_norm": 0.0877174437046051, + "learning_rate": 2.6297803564215794e-05, + "loss": 0.0342, + "step": 101860 + }, + { + "epoch": 0.10935, + "grad_norm": 0.07876308262348175, + "learning_rate": 2.6293675445566148e-05, + "loss": 0.0337, + "step": 101870 + }, + { + "epoch": 0.1094, + "grad_norm": 0.0758930966258049, + "learning_rate": 2.6289547291547888e-05, + "loss": 0.0341, + "step": 101880 + }, + { + "epoch": 0.10945, + "grad_norm": 0.08109204471111298, + "learning_rate": 2.62854191022739e-05, + "loss": 0.0348, + "step": 101890 + }, + { + "epoch": 0.1095, + "grad_norm": 0.07649962604045868, + "learning_rate": 2.6281290877857033e-05, + "loss": 0.0372, + "step": 101900 + }, + { + "epoch": 0.10955, + "grad_norm": 0.08296927809715271, + "learning_rate": 2.627716261841015e-05, + "loss": 0.0336, + "step": 101910 + }, + { + "epoch": 0.1096, + "grad_norm": 0.08457915484905243, + "learning_rate": 2.6273034324046125e-05, + "loss": 0.0338, + "step": 101920 + }, + { + "epoch": 0.10965, + "grad_norm": 0.06991255283355713, + "learning_rate": 2.6268905994877824e-05, + "loss": 0.0344, + "step": 101930 + }, + { + "epoch": 0.1097, + "grad_norm": 0.07757412642240524, + "learning_rate": 2.6264777631018106e-05, + "loss": 0.0338, + "step": 101940 + }, + { + "epoch": 0.10975, + "grad_norm": 0.06705652922391891, + "learning_rate": 2.6260649232579836e-05, + "loss": 0.0334, + "step": 101950 + }, + { + "epoch": 0.1098, + "grad_norm": 0.07722798734903336, + "learning_rate": 2.6256520799675904e-05, + "loss": 0.0334, + "step": 101960 + }, + { + "epoch": 0.10985, + "grad_norm": 0.06761211901903152, + "learning_rate": 2.6252392332419155e-05, + "loss": 0.0344, + "step": 101970 + }, + { + "epoch": 0.1099, + "grad_norm": 0.07083146274089813, + "learning_rate": 2.6248263830922475e-05, + "loss": 0.0356, + "step": 101980 + }, + { + "epoch": 0.10995, + "grad_norm": 0.06778588891029358, + "learning_rate": 2.6244135295298722e-05, + "loss": 0.0361, + "step": 101990 + }, + { + "epoch": 0.11, + "grad_norm": 0.07052619755268097, + "learning_rate": 2.6240006725660786e-05, + "loss": 0.0332, + "step": 102000 + }, + { + "epoch": 0.11005, + "grad_norm": 0.08210963755846024, + "learning_rate": 2.623587812212153e-05, + "loss": 0.0332, + "step": 102010 + }, + { + "epoch": 0.1101, + "grad_norm": 0.07508064806461334, + "learning_rate": 2.623174948479383e-05, + "loss": 0.0347, + "step": 102020 + }, + { + "epoch": 0.11015, + "grad_norm": 0.0741884708404541, + "learning_rate": 2.6227620813790564e-05, + "loss": 0.0348, + "step": 102030 + }, + { + "epoch": 0.1102, + "grad_norm": 0.06254855543375015, + "learning_rate": 2.6223492109224613e-05, + "loss": 0.0369, + "step": 102040 + }, + { + "epoch": 0.11025, + "grad_norm": 0.08497337996959686, + "learning_rate": 2.621936337120883e-05, + "loss": 0.0357, + "step": 102050 + }, + { + "epoch": 0.1103, + "grad_norm": 0.07627248018980026, + "learning_rate": 2.621523459985612e-05, + "loss": 0.0338, + "step": 102060 + }, + { + "epoch": 0.11035, + "grad_norm": 0.07429230958223343, + "learning_rate": 2.621110579527935e-05, + "loss": 0.0346, + "step": 102070 + }, + { + "epoch": 0.1104, + "grad_norm": 0.0647251307964325, + "learning_rate": 2.620697695759141e-05, + "loss": 0.0366, + "step": 102080 + }, + { + "epoch": 0.11045, + "grad_norm": 0.10145459324121475, + "learning_rate": 2.6202848086905164e-05, + "loss": 0.0386, + "step": 102090 + }, + { + "epoch": 0.1105, + "grad_norm": 0.07797590643167496, + "learning_rate": 2.6198719183333508e-05, + "loss": 0.0338, + "step": 102100 + }, + { + "epoch": 0.11055, + "grad_norm": 0.07635772973299026, + "learning_rate": 2.619459024698932e-05, + "loss": 0.0371, + "step": 102110 + }, + { + "epoch": 0.1106, + "grad_norm": 0.0660516545176506, + "learning_rate": 2.619046127798548e-05, + "loss": 0.035, + "step": 102120 + }, + { + "epoch": 0.11065, + "grad_norm": 0.05850354954600334, + "learning_rate": 2.618633227643488e-05, + "loss": 0.0356, + "step": 102130 + }, + { + "epoch": 0.1107, + "grad_norm": 0.06701915711164474, + "learning_rate": 2.6182203242450397e-05, + "loss": 0.0349, + "step": 102140 + }, + { + "epoch": 0.11075, + "grad_norm": 0.09488559514284134, + "learning_rate": 2.6178074176144924e-05, + "loss": 0.0341, + "step": 102150 + }, + { + "epoch": 0.1108, + "grad_norm": 0.06919229030609131, + "learning_rate": 2.6173945077631345e-05, + "loss": 0.0346, + "step": 102160 + }, + { + "epoch": 0.11085, + "grad_norm": 0.0965726301074028, + "learning_rate": 2.6169815947022553e-05, + "loss": 0.0343, + "step": 102170 + }, + { + "epoch": 0.1109, + "grad_norm": 0.08869483321905136, + "learning_rate": 2.6165686784431426e-05, + "loss": 0.0346, + "step": 102180 + }, + { + "epoch": 0.11095, + "grad_norm": 0.07211389392614365, + "learning_rate": 2.6161557589970865e-05, + "loss": 0.0339, + "step": 102190 + }, + { + "epoch": 0.111, + "grad_norm": 0.07438324391841888, + "learning_rate": 2.615742836375375e-05, + "loss": 0.0343, + "step": 102200 + }, + { + "epoch": 0.11105, + "grad_norm": 0.07350186258554459, + "learning_rate": 2.6153299105892986e-05, + "loss": 0.0334, + "step": 102210 + }, + { + "epoch": 0.1111, + "grad_norm": 0.08213651180267334, + "learning_rate": 2.614916981650145e-05, + "loss": 0.0356, + "step": 102220 + }, + { + "epoch": 0.11115, + "grad_norm": 0.05933769419789314, + "learning_rate": 2.6145040495692053e-05, + "loss": 0.0365, + "step": 102230 + }, + { + "epoch": 0.1112, + "grad_norm": 0.061010610312223434, + "learning_rate": 2.614091114357766e-05, + "loss": 0.0335, + "step": 102240 + }, + { + "epoch": 0.11125, + "grad_norm": 0.06389537453651428, + "learning_rate": 2.6136781760271205e-05, + "loss": 0.0337, + "step": 102250 + }, + { + "epoch": 0.1113, + "grad_norm": 0.06789553910493851, + "learning_rate": 2.6132652345885555e-05, + "loss": 0.0346, + "step": 102260 + }, + { + "epoch": 0.11135, + "grad_norm": 0.06623226404190063, + "learning_rate": 2.612852290053362e-05, + "loss": 0.0357, + "step": 102270 + }, + { + "epoch": 0.1114, + "grad_norm": 0.07182657718658447, + "learning_rate": 2.6124393424328285e-05, + "loss": 0.0357, + "step": 102280 + }, + { + "epoch": 0.11145, + "grad_norm": 0.0692882090806961, + "learning_rate": 2.612026391738247e-05, + "loss": 0.0349, + "step": 102290 + }, + { + "epoch": 0.1115, + "grad_norm": 0.08696920424699783, + "learning_rate": 2.6116134379809047e-05, + "loss": 0.0362, + "step": 102300 + }, + { + "epoch": 0.11155, + "grad_norm": 0.08429858833551407, + "learning_rate": 2.611200481172093e-05, + "loss": 0.0353, + "step": 102310 + }, + { + "epoch": 0.1116, + "grad_norm": 0.10219788551330566, + "learning_rate": 2.6107875213231027e-05, + "loss": 0.0356, + "step": 102320 + }, + { + "epoch": 0.11165, + "grad_norm": 0.09618718922138214, + "learning_rate": 2.6103745584452227e-05, + "loss": 0.0336, + "step": 102330 + }, + { + "epoch": 0.1117, + "grad_norm": 0.07313793152570724, + "learning_rate": 2.609961592549744e-05, + "loss": 0.0334, + "step": 102340 + }, + { + "epoch": 0.11175, + "grad_norm": 0.07702039182186127, + "learning_rate": 2.6095486236479567e-05, + "loss": 0.0371, + "step": 102350 + }, + { + "epoch": 0.1118, + "grad_norm": 0.09838972240686417, + "learning_rate": 2.6091356517511505e-05, + "loss": 0.0364, + "step": 102360 + }, + { + "epoch": 0.11185, + "grad_norm": 0.08678235858678818, + "learning_rate": 2.608722676870617e-05, + "loss": 0.0355, + "step": 102370 + }, + { + "epoch": 0.1119, + "grad_norm": 0.08802623301744461, + "learning_rate": 2.6083096990176464e-05, + "loss": 0.0349, + "step": 102380 + }, + { + "epoch": 0.11195, + "grad_norm": 0.11088254302740097, + "learning_rate": 2.6078967182035297e-05, + "loss": 0.0363, + "step": 102390 + }, + { + "epoch": 0.112, + "grad_norm": 0.0822426900267601, + "learning_rate": 2.6074837344395564e-05, + "loss": 0.0348, + "step": 102400 + }, + { + "epoch": 0.11205, + "grad_norm": 0.07988717406988144, + "learning_rate": 2.6070707477370188e-05, + "loss": 0.0351, + "step": 102410 + }, + { + "epoch": 0.1121, + "grad_norm": 0.08314771950244904, + "learning_rate": 2.6066577581072072e-05, + "loss": 0.0345, + "step": 102420 + }, + { + "epoch": 0.11215, + "grad_norm": 0.07573059946298599, + "learning_rate": 2.6062447655614125e-05, + "loss": 0.0349, + "step": 102430 + }, + { + "epoch": 0.1122, + "grad_norm": 0.07272529602050781, + "learning_rate": 2.6058317701109253e-05, + "loss": 0.0348, + "step": 102440 + }, + { + "epoch": 0.11225, + "grad_norm": 0.08205297589302063, + "learning_rate": 2.6054187717670375e-05, + "loss": 0.0393, + "step": 102450 + }, + { + "epoch": 0.1123, + "grad_norm": 0.08683769404888153, + "learning_rate": 2.6050057705410406e-05, + "loss": 0.0367, + "step": 102460 + }, + { + "epoch": 0.11235, + "grad_norm": 0.082497738301754, + "learning_rate": 2.604592766444225e-05, + "loss": 0.0362, + "step": 102470 + }, + { + "epoch": 0.1124, + "grad_norm": 0.07083296775817871, + "learning_rate": 2.6041797594878832e-05, + "loss": 0.0349, + "step": 102480 + }, + { + "epoch": 0.11245, + "grad_norm": 0.0852193683385849, + "learning_rate": 2.6037667496833046e-05, + "loss": 0.0357, + "step": 102490 + }, + { + "epoch": 0.1125, + "grad_norm": 0.08967921882867813, + "learning_rate": 2.6033537370417827e-05, + "loss": 0.0352, + "step": 102500 + }, + { + "epoch": 0.11255, + "grad_norm": 0.09028714150190353, + "learning_rate": 2.6029407215746082e-05, + "loss": 0.0353, + "step": 102510 + }, + { + "epoch": 0.1126, + "grad_norm": 0.08273370563983917, + "learning_rate": 2.6025277032930734e-05, + "loss": 0.0346, + "step": 102520 + }, + { + "epoch": 0.11265, + "grad_norm": 0.07858886569738388, + "learning_rate": 2.6021146822084696e-05, + "loss": 0.0344, + "step": 102530 + }, + { + "epoch": 0.1127, + "grad_norm": 0.07406245172023773, + "learning_rate": 2.601701658332089e-05, + "loss": 0.0353, + "step": 102540 + }, + { + "epoch": 0.11275, + "grad_norm": 0.0656527429819107, + "learning_rate": 2.6012886316752227e-05, + "loss": 0.0362, + "step": 102550 + }, + { + "epoch": 0.1128, + "grad_norm": 0.0866030603647232, + "learning_rate": 2.6008756022491636e-05, + "loss": 0.0358, + "step": 102560 + }, + { + "epoch": 0.11285, + "grad_norm": 0.0703306645154953, + "learning_rate": 2.6004625700652037e-05, + "loss": 0.0361, + "step": 102570 + }, + { + "epoch": 0.1129, + "grad_norm": 0.06990797817707062, + "learning_rate": 2.6000495351346342e-05, + "loss": 0.035, + "step": 102580 + }, + { + "epoch": 0.11295, + "grad_norm": 0.06496477872133255, + "learning_rate": 2.5996364974687486e-05, + "loss": 0.0354, + "step": 102590 + }, + { + "epoch": 0.113, + "grad_norm": 0.07002412527799606, + "learning_rate": 2.5992234570788386e-05, + "loss": 0.0348, + "step": 102600 + }, + { + "epoch": 0.11305, + "grad_norm": 0.07898403704166412, + "learning_rate": 2.5988104139761965e-05, + "loss": 0.0366, + "step": 102610 + }, + { + "epoch": 0.1131, + "grad_norm": 0.08444248139858246, + "learning_rate": 2.5983973681721142e-05, + "loss": 0.0352, + "step": 102620 + }, + { + "epoch": 0.11315, + "grad_norm": 0.08479222655296326, + "learning_rate": 2.597984319677885e-05, + "loss": 0.0349, + "step": 102630 + }, + { + "epoch": 0.1132, + "grad_norm": 0.09469138085842133, + "learning_rate": 2.5975712685048022e-05, + "loss": 0.0348, + "step": 102640 + }, + { + "epoch": 0.11325, + "grad_norm": 0.08109046518802643, + "learning_rate": 2.5971582146641564e-05, + "loss": 0.0361, + "step": 102650 + }, + { + "epoch": 0.1133, + "grad_norm": 0.07394376397132874, + "learning_rate": 2.596745158167242e-05, + "loss": 0.0351, + "step": 102660 + }, + { + "epoch": 0.11335, + "grad_norm": 0.09283407777547836, + "learning_rate": 2.596332099025352e-05, + "loss": 0.0376, + "step": 102670 + }, + { + "epoch": 0.1134, + "grad_norm": 0.08426985889673233, + "learning_rate": 2.5959190372497778e-05, + "loss": 0.0349, + "step": 102680 + }, + { + "epoch": 0.11345, + "grad_norm": 0.082867331802845, + "learning_rate": 2.5955059728518126e-05, + "loss": 0.0361, + "step": 102690 + }, + { + "epoch": 0.1135, + "grad_norm": 0.07347674667835236, + "learning_rate": 2.5950929058427508e-05, + "loss": 0.0348, + "step": 102700 + }, + { + "epoch": 0.11355, + "grad_norm": 0.06538400053977966, + "learning_rate": 2.5946798362338853e-05, + "loss": 0.035, + "step": 102710 + }, + { + "epoch": 0.1136, + "grad_norm": 0.07394007593393326, + "learning_rate": 2.5942667640365075e-05, + "loss": 0.0347, + "step": 102720 + }, + { + "epoch": 0.11365, + "grad_norm": 0.07163910567760468, + "learning_rate": 2.5938536892619126e-05, + "loss": 0.0362, + "step": 102730 + }, + { + "epoch": 0.1137, + "grad_norm": 0.0803307518362999, + "learning_rate": 2.5934406119213928e-05, + "loss": 0.0367, + "step": 102740 + }, + { + "epoch": 0.11375, + "grad_norm": 0.09824536740779877, + "learning_rate": 2.5930275320262415e-05, + "loss": 0.0354, + "step": 102750 + }, + { + "epoch": 0.1138, + "grad_norm": 0.07299476116895676, + "learning_rate": 2.5926144495877525e-05, + "loss": 0.0355, + "step": 102760 + }, + { + "epoch": 0.11385, + "grad_norm": 0.06663201004266739, + "learning_rate": 2.5922013646172195e-05, + "loss": 0.035, + "step": 102770 + }, + { + "epoch": 0.1139, + "grad_norm": 0.06026874855160713, + "learning_rate": 2.5917882771259354e-05, + "loss": 0.0347, + "step": 102780 + }, + { + "epoch": 0.11395, + "grad_norm": 0.07106168568134308, + "learning_rate": 2.5913751871251952e-05, + "loss": 0.0348, + "step": 102790 + }, + { + "epoch": 0.114, + "grad_norm": 0.06715033948421478, + "learning_rate": 2.590962094626291e-05, + "loss": 0.0346, + "step": 102800 + }, + { + "epoch": 0.11405, + "grad_norm": 0.059849146753549576, + "learning_rate": 2.5905489996405176e-05, + "loss": 0.0352, + "step": 102810 + }, + { + "epoch": 0.1141, + "grad_norm": 0.05644575506448746, + "learning_rate": 2.5901359021791678e-05, + "loss": 0.0356, + "step": 102820 + }, + { + "epoch": 0.11415, + "grad_norm": 0.07059363275766373, + "learning_rate": 2.589722802253537e-05, + "loss": 0.0355, + "step": 102830 + }, + { + "epoch": 0.1142, + "grad_norm": 0.07363907992839813, + "learning_rate": 2.5893096998749183e-05, + "loss": 0.0372, + "step": 102840 + }, + { + "epoch": 0.11425, + "grad_norm": 0.07004018127918243, + "learning_rate": 2.5888965950546062e-05, + "loss": 0.0345, + "step": 102850 + }, + { + "epoch": 0.1143, + "grad_norm": 0.07058753073215485, + "learning_rate": 2.5884834878038944e-05, + "loss": 0.0353, + "step": 102860 + }, + { + "epoch": 0.11435, + "grad_norm": 0.07271018624305725, + "learning_rate": 2.588070378134077e-05, + "loss": 0.0344, + "step": 102870 + }, + { + "epoch": 0.1144, + "grad_norm": 0.07091826945543289, + "learning_rate": 2.5876572660564484e-05, + "loss": 0.0392, + "step": 102880 + }, + { + "epoch": 0.11445, + "grad_norm": 0.06540842354297638, + "learning_rate": 2.5872441515823043e-05, + "loss": 0.035, + "step": 102890 + }, + { + "epoch": 0.1145, + "grad_norm": 0.07548410445451736, + "learning_rate": 2.5868310347229368e-05, + "loss": 0.0351, + "step": 102900 + }, + { + "epoch": 0.11455, + "grad_norm": 0.07375458627939224, + "learning_rate": 2.586417915489642e-05, + "loss": 0.0354, + "step": 102910 + }, + { + "epoch": 0.1146, + "grad_norm": 0.07141046226024628, + "learning_rate": 2.586004793893713e-05, + "loss": 0.0344, + "step": 102920 + }, + { + "epoch": 0.11465, + "grad_norm": 0.07697925716638565, + "learning_rate": 2.585591669946446e-05, + "loss": 0.036, + "step": 102930 + }, + { + "epoch": 0.1147, + "grad_norm": 0.08221516013145447, + "learning_rate": 2.5851785436591346e-05, + "loss": 0.0363, + "step": 102940 + }, + { + "epoch": 0.11475, + "grad_norm": 0.07305683940649033, + "learning_rate": 2.5847654150430738e-05, + "loss": 0.0376, + "step": 102950 + }, + { + "epoch": 0.1148, + "grad_norm": 0.07690191268920898, + "learning_rate": 2.584352284109559e-05, + "loss": 0.035, + "step": 102960 + }, + { + "epoch": 0.11485, + "grad_norm": 0.0880003422498703, + "learning_rate": 2.5839391508698834e-05, + "loss": 0.0342, + "step": 102970 + }, + { + "epoch": 0.1149, + "grad_norm": 0.06833967566490173, + "learning_rate": 2.5835260153353442e-05, + "loss": 0.0351, + "step": 102980 + }, + { + "epoch": 0.11495, + "grad_norm": 0.08681900799274445, + "learning_rate": 2.5831128775172343e-05, + "loss": 0.0359, + "step": 102990 + }, + { + "epoch": 0.115, + "grad_norm": 0.06930039823055267, + "learning_rate": 2.5826997374268498e-05, + "loss": 0.0345, + "step": 103000 + }, + { + "epoch": 0.11505, + "grad_norm": 0.07829985022544861, + "learning_rate": 2.582286595075485e-05, + "loss": 0.0359, + "step": 103010 + }, + { + "epoch": 0.1151, + "grad_norm": 0.072901152074337, + "learning_rate": 2.5818734504744362e-05, + "loss": 0.0347, + "step": 103020 + }, + { + "epoch": 0.11515, + "grad_norm": 0.06215919554233551, + "learning_rate": 2.581460303634998e-05, + "loss": 0.0342, + "step": 103030 + }, + { + "epoch": 0.1152, + "grad_norm": 0.07041651010513306, + "learning_rate": 2.5810471545684656e-05, + "loss": 0.0354, + "step": 103040 + }, + { + "epoch": 0.11525, + "grad_norm": 0.0670313835144043, + "learning_rate": 2.580634003286134e-05, + "loss": 0.0356, + "step": 103050 + }, + { + "epoch": 0.1153, + "grad_norm": 0.09243426471948624, + "learning_rate": 2.5802208497993e-05, + "loss": 0.0353, + "step": 103060 + }, + { + "epoch": 0.11535, + "grad_norm": 0.11291596293449402, + "learning_rate": 2.5798076941192573e-05, + "loss": 0.0356, + "step": 103070 + }, + { + "epoch": 0.1154, + "grad_norm": 0.08283979445695877, + "learning_rate": 2.5793945362573026e-05, + "loss": 0.0366, + "step": 103080 + }, + { + "epoch": 0.11545, + "grad_norm": 0.0757361352443695, + "learning_rate": 2.5789813762247305e-05, + "loss": 0.0353, + "step": 103090 + }, + { + "epoch": 0.1155, + "grad_norm": 0.07541199773550034, + "learning_rate": 2.5785682140328382e-05, + "loss": 0.0344, + "step": 103100 + }, + { + "epoch": 0.11555, + "grad_norm": 0.07456483691930771, + "learning_rate": 2.5781550496929203e-05, + "loss": 0.0352, + "step": 103110 + }, + { + "epoch": 0.1156, + "grad_norm": 0.0701032429933548, + "learning_rate": 2.577741883216272e-05, + "loss": 0.0356, + "step": 103120 + }, + { + "epoch": 0.11565, + "grad_norm": 0.07182919979095459, + "learning_rate": 2.5773287146141902e-05, + "loss": 0.034, + "step": 103130 + }, + { + "epoch": 0.1157, + "grad_norm": 0.06604748964309692, + "learning_rate": 2.5769155438979698e-05, + "loss": 0.034, + "step": 103140 + }, + { + "epoch": 0.11575, + "grad_norm": 0.07194375991821289, + "learning_rate": 2.576502371078908e-05, + "loss": 0.0346, + "step": 103150 + }, + { + "epoch": 0.1158, + "grad_norm": 0.07760827988386154, + "learning_rate": 2.5760891961683005e-05, + "loss": 0.0354, + "step": 103160 + }, + { + "epoch": 0.11585, + "grad_norm": 0.07550985366106033, + "learning_rate": 2.5756760191774427e-05, + "loss": 0.0343, + "step": 103170 + }, + { + "epoch": 0.1159, + "grad_norm": 0.07745255529880524, + "learning_rate": 2.5752628401176303e-05, + "loss": 0.037, + "step": 103180 + }, + { + "epoch": 0.11595, + "grad_norm": 0.06956346333026886, + "learning_rate": 2.5748496590001614e-05, + "loss": 0.0348, + "step": 103190 + }, + { + "epoch": 0.116, + "grad_norm": 0.06828167289495468, + "learning_rate": 2.5744364758363294e-05, + "loss": 0.0341, + "step": 103200 + }, + { + "epoch": 0.11605, + "grad_norm": 0.06441890448331833, + "learning_rate": 2.574023290637433e-05, + "loss": 0.0347, + "step": 103210 + }, + { + "epoch": 0.1161, + "grad_norm": 0.06448233872652054, + "learning_rate": 2.5736101034147674e-05, + "loss": 0.0339, + "step": 103220 + }, + { + "epoch": 0.11615, + "grad_norm": 0.08343735337257385, + "learning_rate": 2.5731969141796296e-05, + "loss": 0.0347, + "step": 103230 + }, + { + "epoch": 0.1162, + "grad_norm": 0.08863923698663712, + "learning_rate": 2.572783722943315e-05, + "loss": 0.0372, + "step": 103240 + }, + { + "epoch": 0.11625, + "grad_norm": 0.08323957026004791, + "learning_rate": 2.572370529717122e-05, + "loss": 0.0338, + "step": 103250 + }, + { + "epoch": 0.1163, + "grad_norm": 0.07746366411447525, + "learning_rate": 2.571957334512344e-05, + "loss": 0.0339, + "step": 103260 + }, + { + "epoch": 0.11635, + "grad_norm": 0.07018949091434479, + "learning_rate": 2.571544137340281e-05, + "loss": 0.0334, + "step": 103270 + }, + { + "epoch": 0.1164, + "grad_norm": 0.06720632314682007, + "learning_rate": 2.5711309382122272e-05, + "loss": 0.0343, + "step": 103280 + }, + { + "epoch": 0.11645, + "grad_norm": 0.07699345797300339, + "learning_rate": 2.5707177371394813e-05, + "loss": 0.0343, + "step": 103290 + }, + { + "epoch": 0.1165, + "grad_norm": 0.07368099689483643, + "learning_rate": 2.5703045341333387e-05, + "loss": 0.0347, + "step": 103300 + }, + { + "epoch": 0.11655, + "grad_norm": 0.08487499505281448, + "learning_rate": 2.5698913292050964e-05, + "loss": 0.0368, + "step": 103310 + }, + { + "epoch": 0.1166, + "grad_norm": 0.07324866950511932, + "learning_rate": 2.5694781223660515e-05, + "loss": 0.035, + "step": 103320 + }, + { + "epoch": 0.11665, + "grad_norm": 0.06309445202350616, + "learning_rate": 2.5690649136275002e-05, + "loss": 0.0346, + "step": 103330 + }, + { + "epoch": 0.1167, + "grad_norm": 0.05569295585155487, + "learning_rate": 2.5686517030007408e-05, + "loss": 0.0336, + "step": 103340 + }, + { + "epoch": 0.11675, + "grad_norm": 0.06275855004787445, + "learning_rate": 2.568238490497069e-05, + "loss": 0.0375, + "step": 103350 + }, + { + "epoch": 0.1168, + "grad_norm": 0.07405540347099304, + "learning_rate": 2.5678252761277834e-05, + "loss": 0.0352, + "step": 103360 + }, + { + "epoch": 0.11685, + "grad_norm": 0.0736590251326561, + "learning_rate": 2.5674120599041795e-05, + "loss": 0.0358, + "step": 103370 + }, + { + "epoch": 0.1169, + "grad_norm": 0.09769752621650696, + "learning_rate": 2.5669988418375563e-05, + "loss": 0.036, + "step": 103380 + }, + { + "epoch": 0.11695, + "grad_norm": 0.09497690200805664, + "learning_rate": 2.566585621939208e-05, + "loss": 0.036, + "step": 103390 + }, + { + "epoch": 0.117, + "grad_norm": 0.08184391260147095, + "learning_rate": 2.5661724002204357e-05, + "loss": 0.0365, + "step": 103400 + }, + { + "epoch": 0.11705, + "grad_norm": 0.06988545507192612, + "learning_rate": 2.5657591766925337e-05, + "loss": 0.0356, + "step": 103410 + }, + { + "epoch": 0.1171, + "grad_norm": 0.07233225554227829, + "learning_rate": 2.5653459513668015e-05, + "loss": 0.0346, + "step": 103420 + }, + { + "epoch": 0.11715, + "grad_norm": 0.09395267814397812, + "learning_rate": 2.5649327242545346e-05, + "loss": 0.0329, + "step": 103430 + }, + { + "epoch": 0.1172, + "grad_norm": 0.07595948129892349, + "learning_rate": 2.564519495367032e-05, + "loss": 0.0348, + "step": 103440 + }, + { + "epoch": 0.11725, + "grad_norm": 0.08757159113883972, + "learning_rate": 2.56410626471559e-05, + "loss": 0.0351, + "step": 103450 + }, + { + "epoch": 0.1173, + "grad_norm": 0.08131173998117447, + "learning_rate": 2.563693032311507e-05, + "loss": 0.0338, + "step": 103460 + }, + { + "epoch": 0.11735, + "grad_norm": 0.07512043416500092, + "learning_rate": 2.5632797981660813e-05, + "loss": 0.036, + "step": 103470 + }, + { + "epoch": 0.1174, + "grad_norm": 0.06512127071619034, + "learning_rate": 2.562866562290609e-05, + "loss": 0.034, + "step": 103480 + }, + { + "epoch": 0.11745, + "grad_norm": 0.0664801150560379, + "learning_rate": 2.5624533246963883e-05, + "loss": 0.0341, + "step": 103490 + }, + { + "epoch": 0.1175, + "grad_norm": 0.07180184870958328, + "learning_rate": 2.562040085394718e-05, + "loss": 0.0352, + "step": 103500 + }, + { + "epoch": 0.11755, + "grad_norm": 0.07850921154022217, + "learning_rate": 2.5616268443968938e-05, + "loss": 0.0343, + "step": 103510 + }, + { + "epoch": 0.1176, + "grad_norm": 0.07511237263679504, + "learning_rate": 2.5612136017142158e-05, + "loss": 0.0335, + "step": 103520 + }, + { + "epoch": 0.11765, + "grad_norm": 0.0689835473895073, + "learning_rate": 2.5608003573579803e-05, + "loss": 0.0338, + "step": 103530 + }, + { + "epoch": 0.1177, + "grad_norm": 0.06506875157356262, + "learning_rate": 2.560387111339486e-05, + "loss": 0.0358, + "step": 103540 + }, + { + "epoch": 0.11775, + "grad_norm": 0.06760787963867188, + "learning_rate": 2.559973863670031e-05, + "loss": 0.0381, + "step": 103550 + }, + { + "epoch": 0.1178, + "grad_norm": 0.0856146365404129, + "learning_rate": 2.559560614360913e-05, + "loss": 0.0366, + "step": 103560 + }, + { + "epoch": 0.11785, + "grad_norm": 0.07530411332845688, + "learning_rate": 2.559147363423431e-05, + "loss": 0.034, + "step": 103570 + }, + { + "epoch": 0.1179, + "grad_norm": 0.07682434469461441, + "learning_rate": 2.558734110868881e-05, + "loss": 0.0371, + "step": 103580 + }, + { + "epoch": 0.11795, + "grad_norm": 0.07229023426771164, + "learning_rate": 2.5583208567085625e-05, + "loss": 0.034, + "step": 103590 + }, + { + "epoch": 0.118, + "grad_norm": 0.07774133235216141, + "learning_rate": 2.5579076009537745e-05, + "loss": 0.034, + "step": 103600 + }, + { + "epoch": 0.11805, + "grad_norm": 0.11243210732936859, + "learning_rate": 2.5574943436158145e-05, + "loss": 0.0343, + "step": 103610 + }, + { + "epoch": 0.1181, + "grad_norm": 0.08603610843420029, + "learning_rate": 2.55708108470598e-05, + "loss": 0.0357, + "step": 103620 + }, + { + "epoch": 0.11815, + "grad_norm": 0.08804672956466675, + "learning_rate": 2.5566678242355706e-05, + "loss": 0.0345, + "step": 103630 + }, + { + "epoch": 0.1182, + "grad_norm": 0.07011974602937698, + "learning_rate": 2.5562545622158833e-05, + "loss": 0.0347, + "step": 103640 + }, + { + "epoch": 0.11825, + "grad_norm": 0.08410502970218658, + "learning_rate": 2.5558412986582186e-05, + "loss": 0.0372, + "step": 103650 + }, + { + "epoch": 0.1183, + "grad_norm": 0.0645311176776886, + "learning_rate": 2.5554280335738733e-05, + "loss": 0.035, + "step": 103660 + }, + { + "epoch": 0.11835, + "grad_norm": 0.06442949175834656, + "learning_rate": 2.555014766974147e-05, + "loss": 0.0344, + "step": 103670 + }, + { + "epoch": 0.1184, + "grad_norm": 0.07331974059343338, + "learning_rate": 2.5546014988703366e-05, + "loss": 0.037, + "step": 103680 + }, + { + "epoch": 0.11845, + "grad_norm": 0.06093655526638031, + "learning_rate": 2.554188229273743e-05, + "loss": 0.0353, + "step": 103690 + }, + { + "epoch": 0.1185, + "grad_norm": 0.07783251255750656, + "learning_rate": 2.553774958195662e-05, + "loss": 0.036, + "step": 103700 + }, + { + "epoch": 0.11855, + "grad_norm": 0.0731554701924324, + "learning_rate": 2.5533616856473945e-05, + "loss": 0.0374, + "step": 103710 + }, + { + "epoch": 0.1186, + "grad_norm": 0.06992392987012863, + "learning_rate": 2.5529484116402384e-05, + "loss": 0.0366, + "step": 103720 + }, + { + "epoch": 0.11865, + "grad_norm": 0.07954194396734238, + "learning_rate": 2.5525351361854932e-05, + "loss": 0.0357, + "step": 103730 + }, + { + "epoch": 0.1187, + "grad_norm": 0.07312312722206116, + "learning_rate": 2.552121859294457e-05, + "loss": 0.0356, + "step": 103740 + }, + { + "epoch": 0.11875, + "grad_norm": 0.0759984701871872, + "learning_rate": 2.5517085809784286e-05, + "loss": 0.0367, + "step": 103750 + }, + { + "epoch": 0.1188, + "grad_norm": 0.07431000471115112, + "learning_rate": 2.5512953012487067e-05, + "loss": 0.0354, + "step": 103760 + }, + { + "epoch": 0.11885, + "grad_norm": 0.08185004442930222, + "learning_rate": 2.5508820201165907e-05, + "loss": 0.0366, + "step": 103770 + }, + { + "epoch": 0.1189, + "grad_norm": 0.0904598981142044, + "learning_rate": 2.5504687375933796e-05, + "loss": 0.036, + "step": 103780 + }, + { + "epoch": 0.11895, + "grad_norm": 0.07650790363550186, + "learning_rate": 2.550055453690372e-05, + "loss": 0.0349, + "step": 103790 + }, + { + "epoch": 0.119, + "grad_norm": 0.06640942394733429, + "learning_rate": 2.549642168418867e-05, + "loss": 0.0344, + "step": 103800 + }, + { + "epoch": 0.11905, + "grad_norm": 0.07499713450670242, + "learning_rate": 2.5492288817901638e-05, + "loss": 0.0348, + "step": 103810 + }, + { + "epoch": 0.1191, + "grad_norm": 0.08475736528635025, + "learning_rate": 2.548815593815562e-05, + "loss": 0.0358, + "step": 103820 + }, + { + "epoch": 0.11915, + "grad_norm": 0.07885356992483139, + "learning_rate": 2.5484023045063598e-05, + "loss": 0.0377, + "step": 103830 + }, + { + "epoch": 0.1192, + "grad_norm": 0.07656852900981903, + "learning_rate": 2.5479890138738565e-05, + "loss": 0.0334, + "step": 103840 + }, + { + "epoch": 0.11925, + "grad_norm": 0.06738829612731934, + "learning_rate": 2.5475757219293516e-05, + "loss": 0.0358, + "step": 103850 + }, + { + "epoch": 0.1193, + "grad_norm": 0.07586340606212616, + "learning_rate": 2.547162428684145e-05, + "loss": 0.0363, + "step": 103860 + }, + { + "epoch": 0.11935, + "grad_norm": 0.06756033003330231, + "learning_rate": 2.5467491341495348e-05, + "loss": 0.0342, + "step": 103870 + }, + { + "epoch": 0.1194, + "grad_norm": 0.07476552575826645, + "learning_rate": 2.5463358383368212e-05, + "loss": 0.0337, + "step": 103880 + }, + { + "epoch": 0.11945, + "grad_norm": 0.07246913760900497, + "learning_rate": 2.545922541257303e-05, + "loss": 0.0349, + "step": 103890 + }, + { + "epoch": 0.1195, + "grad_norm": 0.06302965432405472, + "learning_rate": 2.5455092429222793e-05, + "loss": 0.0342, + "step": 103900 + }, + { + "epoch": 0.11955, + "grad_norm": 0.07466553896665573, + "learning_rate": 2.5450959433430505e-05, + "loss": 0.0348, + "step": 103910 + }, + { + "epoch": 0.1196, + "grad_norm": 0.09401459246873856, + "learning_rate": 2.5446826425309157e-05, + "loss": 0.0348, + "step": 103920 + }, + { + "epoch": 0.11965, + "grad_norm": 0.08614931255578995, + "learning_rate": 2.5442693404971735e-05, + "loss": 0.0351, + "step": 103930 + }, + { + "epoch": 0.1197, + "grad_norm": 0.11952122300863266, + "learning_rate": 2.543856037253125e-05, + "loss": 0.0359, + "step": 103940 + }, + { + "epoch": 0.11975, + "grad_norm": 0.08765014261007309, + "learning_rate": 2.5434427328100684e-05, + "loss": 0.0356, + "step": 103950 + }, + { + "epoch": 0.1198, + "grad_norm": 0.06619185209274292, + "learning_rate": 2.5430294271793042e-05, + "loss": 0.0352, + "step": 103960 + }, + { + "epoch": 0.11985, + "grad_norm": 0.0630049780011177, + "learning_rate": 2.542616120372131e-05, + "loss": 0.0355, + "step": 103970 + }, + { + "epoch": 0.1199, + "grad_norm": 0.06189363822340965, + "learning_rate": 2.54220281239985e-05, + "loss": 0.0344, + "step": 103980 + }, + { + "epoch": 0.11995, + "grad_norm": 0.0778106078505516, + "learning_rate": 2.5417895032737592e-05, + "loss": 0.036, + "step": 103990 + }, + { + "epoch": 0.12, + "grad_norm": 0.06497669219970703, + "learning_rate": 2.54137619300516e-05, + "loss": 0.0342, + "step": 104000 + }, + { + "epoch": 0.12005, + "grad_norm": 0.07563336193561554, + "learning_rate": 2.5409628816053498e-05, + "loss": 0.0347, + "step": 104010 + }, + { + "epoch": 0.1201, + "grad_norm": 0.07418262213468552, + "learning_rate": 2.5405495690856307e-05, + "loss": 0.0341, + "step": 104020 + }, + { + "epoch": 0.12015, + "grad_norm": 0.06178402155637741, + "learning_rate": 2.540136255457301e-05, + "loss": 0.0341, + "step": 104030 + }, + { + "epoch": 0.1202, + "grad_norm": 0.05717574805021286, + "learning_rate": 2.5397229407316624e-05, + "loss": 0.0338, + "step": 104040 + }, + { + "epoch": 0.12025, + "grad_norm": 0.08341573178768158, + "learning_rate": 2.5393096249200127e-05, + "loss": 0.0355, + "step": 104050 + }, + { + "epoch": 0.1203, + "grad_norm": 0.06537025421857834, + "learning_rate": 2.538896308033652e-05, + "loss": 0.0347, + "step": 104060 + }, + { + "epoch": 0.12035, + "grad_norm": 0.06877518445253372, + "learning_rate": 2.538482990083882e-05, + "loss": 0.0363, + "step": 104070 + }, + { + "epoch": 0.1204, + "grad_norm": 0.0737723708152771, + "learning_rate": 2.5380696710820012e-05, + "loss": 0.0337, + "step": 104080 + }, + { + "epoch": 0.12045, + "grad_norm": 0.08432099223136902, + "learning_rate": 2.537656351039309e-05, + "loss": 0.0331, + "step": 104090 + }, + { + "epoch": 0.1205, + "grad_norm": 0.07213162630796432, + "learning_rate": 2.5372430299671075e-05, + "loss": 0.0341, + "step": 104100 + }, + { + "epoch": 0.12055, + "grad_norm": 0.08251155912876129, + "learning_rate": 2.536829707876695e-05, + "loss": 0.0351, + "step": 104110 + }, + { + "epoch": 0.1206, + "grad_norm": 0.07315324246883392, + "learning_rate": 2.536416384779372e-05, + "loss": 0.0336, + "step": 104120 + }, + { + "epoch": 0.12065, + "grad_norm": 0.0627518743276596, + "learning_rate": 2.5360030606864392e-05, + "loss": 0.0324, + "step": 104130 + }, + { + "epoch": 0.1207, + "grad_norm": 0.07793736457824707, + "learning_rate": 2.535589735609196e-05, + "loss": 0.035, + "step": 104140 + }, + { + "epoch": 0.12075, + "grad_norm": 0.092694953083992, + "learning_rate": 2.5351764095589425e-05, + "loss": 0.033, + "step": 104150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.07844278216362, + "learning_rate": 2.5347630825469792e-05, + "loss": 0.0353, + "step": 104160 + }, + { + "epoch": 0.12085, + "grad_norm": 0.08317578583955765, + "learning_rate": 2.5343497545846074e-05, + "loss": 0.0349, + "step": 104170 + }, + { + "epoch": 0.1209, + "grad_norm": 0.08206788450479507, + "learning_rate": 2.533936425683125e-05, + "loss": 0.0342, + "step": 104180 + }, + { + "epoch": 0.12095, + "grad_norm": 0.07000011950731277, + "learning_rate": 2.5335230958538343e-05, + "loss": 0.0348, + "step": 104190 + }, + { + "epoch": 0.121, + "grad_norm": 0.08563758432865143, + "learning_rate": 2.533109765108034e-05, + "loss": 0.0331, + "step": 104200 + }, + { + "epoch": 0.12105, + "grad_norm": 0.09002260118722916, + "learning_rate": 2.5326964334570257e-05, + "loss": 0.0349, + "step": 104210 + }, + { + "epoch": 0.1211, + "grad_norm": 0.0715671107172966, + "learning_rate": 2.5322831009121084e-05, + "loss": 0.0345, + "step": 104220 + }, + { + "epoch": 0.12115, + "grad_norm": 0.07427511364221573, + "learning_rate": 2.5318697674845844e-05, + "loss": 0.0351, + "step": 104230 + }, + { + "epoch": 0.1212, + "grad_norm": 0.06503819674253464, + "learning_rate": 2.5314564331857515e-05, + "loss": 0.0348, + "step": 104240 + }, + { + "epoch": 0.12125, + "grad_norm": 0.07333590090274811, + "learning_rate": 2.531043098026913e-05, + "loss": 0.034, + "step": 104250 + }, + { + "epoch": 0.1213, + "grad_norm": 0.06548633426427841, + "learning_rate": 2.530629762019367e-05, + "loss": 0.0333, + "step": 104260 + }, + { + "epoch": 0.12135, + "grad_norm": 0.058450475335121155, + "learning_rate": 2.530216425174415e-05, + "loss": 0.0336, + "step": 104270 + }, + { + "epoch": 0.1214, + "grad_norm": 0.07402554154396057, + "learning_rate": 2.529803087503357e-05, + "loss": 0.0353, + "step": 104280 + }, + { + "epoch": 0.12145, + "grad_norm": 0.06446702778339386, + "learning_rate": 2.5293897490174945e-05, + "loss": 0.0335, + "step": 104290 + }, + { + "epoch": 0.1215, + "grad_norm": 0.06754470616579056, + "learning_rate": 2.5289764097281264e-05, + "loss": 0.0344, + "step": 104300 + }, + { + "epoch": 0.12155, + "grad_norm": 0.05972554534673691, + "learning_rate": 2.5285630696465546e-05, + "loss": 0.034, + "step": 104310 + }, + { + "epoch": 0.1216, + "grad_norm": 0.06952910125255585, + "learning_rate": 2.52814972878408e-05, + "loss": 0.0354, + "step": 104320 + }, + { + "epoch": 0.12165, + "grad_norm": 0.07992678135633469, + "learning_rate": 2.527736387152001e-05, + "loss": 0.0366, + "step": 104330 + }, + { + "epoch": 0.1217, + "grad_norm": 0.09057517349720001, + "learning_rate": 2.5273230447616203e-05, + "loss": 0.0362, + "step": 104340 + }, + { + "epoch": 0.12175, + "grad_norm": 0.0814778134226799, + "learning_rate": 2.5269097016242375e-05, + "loss": 0.0362, + "step": 104350 + }, + { + "epoch": 0.1218, + "grad_norm": 0.08909647166728973, + "learning_rate": 2.526496357751154e-05, + "loss": 0.0352, + "step": 104360 + }, + { + "epoch": 0.12185, + "grad_norm": 0.08012577146291733, + "learning_rate": 2.52608301315367e-05, + "loss": 0.0346, + "step": 104370 + }, + { + "epoch": 0.1219, + "grad_norm": 0.08825207501649857, + "learning_rate": 2.5256696678430864e-05, + "loss": 0.0342, + "step": 104380 + }, + { + "epoch": 0.12195, + "grad_norm": 0.07060939818620682, + "learning_rate": 2.525256321830703e-05, + "loss": 0.035, + "step": 104390 + }, + { + "epoch": 0.122, + "grad_norm": 0.08157352358102798, + "learning_rate": 2.5248429751278224e-05, + "loss": 0.0349, + "step": 104400 + }, + { + "epoch": 0.12205, + "grad_norm": 0.07402607053518295, + "learning_rate": 2.5244296277457423e-05, + "loss": 0.0336, + "step": 104410 + }, + { + "epoch": 0.1221, + "grad_norm": 0.07185854017734528, + "learning_rate": 2.5240162796957673e-05, + "loss": 0.0345, + "step": 104420 + }, + { + "epoch": 0.12215, + "grad_norm": 0.08592725545167923, + "learning_rate": 2.523602930989195e-05, + "loss": 0.0345, + "step": 104430 + }, + { + "epoch": 0.1222, + "grad_norm": 0.0761452242732048, + "learning_rate": 2.523189581637328e-05, + "loss": 0.034, + "step": 104440 + }, + { + "epoch": 0.12225, + "grad_norm": 0.07353299856185913, + "learning_rate": 2.5227762316514662e-05, + "loss": 0.034, + "step": 104450 + }, + { + "epoch": 0.1223, + "grad_norm": 0.07914377748966217, + "learning_rate": 2.5223628810429117e-05, + "loss": 0.036, + "step": 104460 + }, + { + "epoch": 0.12235, + "grad_norm": 0.0832958072423935, + "learning_rate": 2.521949529822963e-05, + "loss": 0.0361, + "step": 104470 + }, + { + "epoch": 0.1224, + "grad_norm": 0.10569994151592255, + "learning_rate": 2.5215361780029235e-05, + "loss": 0.0355, + "step": 104480 + }, + { + "epoch": 0.12245, + "grad_norm": 0.09755893796682358, + "learning_rate": 2.5211228255940922e-05, + "loss": 0.0353, + "step": 104490 + }, + { + "epoch": 0.1225, + "grad_norm": 0.08682700991630554, + "learning_rate": 2.5207094726077718e-05, + "loss": 0.037, + "step": 104500 + }, + { + "epoch": 0.12255, + "grad_norm": 0.09625197947025299, + "learning_rate": 2.5202961190552617e-05, + "loss": 0.0355, + "step": 104510 + }, + { + "epoch": 0.1226, + "grad_norm": 0.0831693559885025, + "learning_rate": 2.519882764947863e-05, + "loss": 0.0348, + "step": 104520 + }, + { + "epoch": 0.12265, + "grad_norm": 0.07173638045787811, + "learning_rate": 2.519469410296877e-05, + "loss": 0.0343, + "step": 104530 + }, + { + "epoch": 0.1227, + "grad_norm": 0.08513516187667847, + "learning_rate": 2.519056055113605e-05, + "loss": 0.0353, + "step": 104540 + }, + { + "epoch": 0.12275, + "grad_norm": 0.08899994939565659, + "learning_rate": 2.518642699409347e-05, + "loss": 0.0356, + "step": 104550 + }, + { + "epoch": 0.1228, + "grad_norm": 0.0777038037776947, + "learning_rate": 2.5182293431954052e-05, + "loss": 0.0355, + "step": 104560 + }, + { + "epoch": 0.12285, + "grad_norm": 0.09456444531679153, + "learning_rate": 2.5178159864830804e-05, + "loss": 0.0348, + "step": 104570 + }, + { + "epoch": 0.1229, + "grad_norm": 0.08388503640890121, + "learning_rate": 2.5174026292836723e-05, + "loss": 0.034, + "step": 104580 + }, + { + "epoch": 0.12295, + "grad_norm": 0.08418978005647659, + "learning_rate": 2.5169892716084838e-05, + "loss": 0.0355, + "step": 104590 + }, + { + "epoch": 0.123, + "grad_norm": 0.07934024184942245, + "learning_rate": 2.5165759134688132e-05, + "loss": 0.0349, + "step": 104600 + }, + { + "epoch": 0.12305, + "grad_norm": 0.07490506768226624, + "learning_rate": 2.5161625548759654e-05, + "loss": 0.0348, + "step": 104610 + }, + { + "epoch": 0.1231, + "grad_norm": 0.08375281095504761, + "learning_rate": 2.5157491958412382e-05, + "loss": 0.0355, + "step": 104620 + }, + { + "epoch": 0.12315, + "grad_norm": 0.08525901287794113, + "learning_rate": 2.515335836375935e-05, + "loss": 0.0364, + "step": 104630 + }, + { + "epoch": 0.1232, + "grad_norm": 0.0929773822426796, + "learning_rate": 2.514922476491355e-05, + "loss": 0.0354, + "step": 104640 + }, + { + "epoch": 0.12325, + "grad_norm": 0.08704689145088196, + "learning_rate": 2.5145091161988003e-05, + "loss": 0.0353, + "step": 104650 + }, + { + "epoch": 0.1233, + "grad_norm": 0.06518177688121796, + "learning_rate": 2.514095755509571e-05, + "loss": 0.0347, + "step": 104660 + }, + { + "epoch": 0.12335, + "grad_norm": 0.07102639973163605, + "learning_rate": 2.5136823944349704e-05, + "loss": 0.0336, + "step": 104670 + }, + { + "epoch": 0.1234, + "grad_norm": 0.07148510962724686, + "learning_rate": 2.5132690329862975e-05, + "loss": 0.0344, + "step": 104680 + }, + { + "epoch": 0.12345, + "grad_norm": 0.06739726662635803, + "learning_rate": 2.5128556711748546e-05, + "loss": 0.0362, + "step": 104690 + }, + { + "epoch": 0.1235, + "grad_norm": 0.0750051736831665, + "learning_rate": 2.512442309011942e-05, + "loss": 0.0346, + "step": 104700 + }, + { + "epoch": 0.12355, + "grad_norm": 0.07953881472349167, + "learning_rate": 2.512028946508862e-05, + "loss": 0.0365, + "step": 104710 + }, + { + "epoch": 0.1236, + "grad_norm": 0.07243786752223969, + "learning_rate": 2.5116155836769146e-05, + "loss": 0.0345, + "step": 104720 + }, + { + "epoch": 0.12365, + "grad_norm": 0.07352124899625778, + "learning_rate": 2.511202220527401e-05, + "loss": 0.0358, + "step": 104730 + }, + { + "epoch": 0.1237, + "grad_norm": 0.073238305747509, + "learning_rate": 2.5107888570716235e-05, + "loss": 0.0349, + "step": 104740 + }, + { + "epoch": 0.12375, + "grad_norm": 0.07899244874715805, + "learning_rate": 2.510375493320883e-05, + "loss": 0.036, + "step": 104750 + }, + { + "epoch": 0.1238, + "grad_norm": 0.09344808012247086, + "learning_rate": 2.50996212928648e-05, + "loss": 0.0359, + "step": 104760 + }, + { + "epoch": 0.12385, + "grad_norm": 0.09741408377885818, + "learning_rate": 2.509548764979716e-05, + "loss": 0.0359, + "step": 104770 + }, + { + "epoch": 0.1239, + "grad_norm": 0.07594095170497894, + "learning_rate": 2.5091354004118934e-05, + "loss": 0.0348, + "step": 104780 + }, + { + "epoch": 0.12395, + "grad_norm": 0.06269504874944687, + "learning_rate": 2.508722035594311e-05, + "loss": 0.0351, + "step": 104790 + }, + { + "epoch": 0.124, + "grad_norm": 0.07669692486524582, + "learning_rate": 2.5083086705382718e-05, + "loss": 0.0365, + "step": 104800 + }, + { + "epoch": 0.12405, + "grad_norm": 0.06946543604135513, + "learning_rate": 2.5078953052550767e-05, + "loss": 0.0358, + "step": 104810 + }, + { + "epoch": 0.1241, + "grad_norm": 0.13907350599765778, + "learning_rate": 2.507481939756028e-05, + "loss": 0.0369, + "step": 104820 + }, + { + "epoch": 0.12415, + "grad_norm": 0.0800219401717186, + "learning_rate": 2.5070685740524246e-05, + "loss": 0.0371, + "step": 104830 + }, + { + "epoch": 0.1242, + "grad_norm": 0.07465817779302597, + "learning_rate": 2.5066552081555693e-05, + "loss": 0.0343, + "step": 104840 + }, + { + "epoch": 0.12425, + "grad_norm": 0.06579568982124329, + "learning_rate": 2.506241842076763e-05, + "loss": 0.0341, + "step": 104850 + }, + { + "epoch": 0.1243, + "grad_norm": 0.06052146852016449, + "learning_rate": 2.5058284758273076e-05, + "loss": 0.0335, + "step": 104860 + }, + { + "epoch": 0.12435, + "grad_norm": 0.06922436505556107, + "learning_rate": 2.5054151094185036e-05, + "loss": 0.0328, + "step": 104870 + }, + { + "epoch": 0.1244, + "grad_norm": 0.06804629415273666, + "learning_rate": 2.5050017428616522e-05, + "loss": 0.0348, + "step": 104880 + }, + { + "epoch": 0.12445, + "grad_norm": 0.06646064668893814, + "learning_rate": 2.5045883761680555e-05, + "loss": 0.0346, + "step": 104890 + }, + { + "epoch": 0.1245, + "grad_norm": 0.07009711861610413, + "learning_rate": 2.5041750093490145e-05, + "loss": 0.0331, + "step": 104900 + }, + { + "epoch": 0.12455, + "grad_norm": 0.06541627645492554, + "learning_rate": 2.50376164241583e-05, + "loss": 0.0333, + "step": 104910 + }, + { + "epoch": 0.1246, + "grad_norm": 0.06215963512659073, + "learning_rate": 2.5033482753798033e-05, + "loss": 0.0339, + "step": 104920 + }, + { + "epoch": 0.12465, + "grad_norm": 0.06351916491985321, + "learning_rate": 2.5029349082522363e-05, + "loss": 0.034, + "step": 104930 + }, + { + "epoch": 0.1247, + "grad_norm": 0.06076859310269356, + "learning_rate": 2.5025215410444302e-05, + "loss": 0.0341, + "step": 104940 + }, + { + "epoch": 0.12475, + "grad_norm": 0.07077761739492416, + "learning_rate": 2.5021081737676855e-05, + "loss": 0.0336, + "step": 104950 + }, + { + "epoch": 0.1248, + "grad_norm": 0.0637740045785904, + "learning_rate": 2.5016948064333053e-05, + "loss": 0.0381, + "step": 104960 + }, + { + "epoch": 0.12485, + "grad_norm": 0.093325674533844, + "learning_rate": 2.5012814390525886e-05, + "loss": 0.0357, + "step": 104970 + }, + { + "epoch": 0.1249, + "grad_norm": 0.07910604029893875, + "learning_rate": 2.5008680716368383e-05, + "loss": 0.0341, + "step": 104980 + }, + { + "epoch": 0.12495, + "grad_norm": 0.07720960676670074, + "learning_rate": 2.5004547041973548e-05, + "loss": 0.0338, + "step": 104990 + }, + { + "epoch": 0.125, + "grad_norm": 0.06617166846990585, + "learning_rate": 2.5000413367454406e-05, + "loss": 0.0344, + "step": 105000 + }, + { + "epoch": 0.12505, + "grad_norm": 0.07431846857070923, + "learning_rate": 2.499627969292395e-05, + "loss": 0.0334, + "step": 105010 + }, + { + "epoch": 0.1251, + "grad_norm": 0.0698188915848732, + "learning_rate": 2.499214601849522e-05, + "loss": 0.0343, + "step": 105020 + }, + { + "epoch": 0.12515, + "grad_norm": 0.0604122169315815, + "learning_rate": 2.4988012344281205e-05, + "loss": 0.0354, + "step": 105030 + }, + { + "epoch": 0.1252, + "grad_norm": 0.06649839878082275, + "learning_rate": 2.498387867039494e-05, + "loss": 0.0347, + "step": 105040 + }, + { + "epoch": 0.12525, + "grad_norm": 0.06715084612369537, + "learning_rate": 2.4979744996949415e-05, + "loss": 0.0341, + "step": 105050 + }, + { + "epoch": 0.1253, + "grad_norm": 0.07764974236488342, + "learning_rate": 2.4975611324057664e-05, + "loss": 0.0341, + "step": 105060 + }, + { + "epoch": 0.12535, + "grad_norm": 0.061199646443128586, + "learning_rate": 2.497147765183268e-05, + "loss": 0.0349, + "step": 105070 + }, + { + "epoch": 0.1254, + "grad_norm": 0.06322883814573288, + "learning_rate": 2.496734398038749e-05, + "loss": 0.0337, + "step": 105080 + }, + { + "epoch": 0.12545, + "grad_norm": 0.06549254059791565, + "learning_rate": 2.4963210309835107e-05, + "loss": 0.0333, + "step": 105090 + }, + { + "epoch": 0.1255, + "grad_norm": 0.06285182386636734, + "learning_rate": 2.495907664028853e-05, + "loss": 0.0338, + "step": 105100 + }, + { + "epoch": 0.12555, + "grad_norm": 0.059575922787189484, + "learning_rate": 2.4954942971860798e-05, + "loss": 0.0328, + "step": 105110 + }, + { + "epoch": 0.1256, + "grad_norm": 0.07021772861480713, + "learning_rate": 2.495080930466489e-05, + "loss": 0.0341, + "step": 105120 + }, + { + "epoch": 0.12565, + "grad_norm": 0.07550730556249619, + "learning_rate": 2.4946675638813856e-05, + "loss": 0.0335, + "step": 105130 + }, + { + "epoch": 0.1257, + "grad_norm": 0.08097176253795624, + "learning_rate": 2.4942541974420674e-05, + "loss": 0.0351, + "step": 105140 + }, + { + "epoch": 0.12575, + "grad_norm": 0.07972034811973572, + "learning_rate": 2.4938408311598376e-05, + "loss": 0.0342, + "step": 105150 + }, + { + "epoch": 0.1258, + "grad_norm": 0.07270655035972595, + "learning_rate": 2.493427465045998e-05, + "loss": 0.0332, + "step": 105160 + }, + { + "epoch": 0.12585, + "grad_norm": 0.07965975254774094, + "learning_rate": 2.4930140991118483e-05, + "loss": 0.034, + "step": 105170 + }, + { + "epoch": 0.1259, + "grad_norm": 0.07292885333299637, + "learning_rate": 2.4926007333686912e-05, + "loss": 0.035, + "step": 105180 + }, + { + "epoch": 0.12595, + "grad_norm": 0.06530830264091492, + "learning_rate": 2.4921873678278267e-05, + "loss": 0.0355, + "step": 105190 + }, + { + "epoch": 0.126, + "grad_norm": 0.07178696990013123, + "learning_rate": 2.491774002500556e-05, + "loss": 0.0372, + "step": 105200 + }, + { + "epoch": 0.12605, + "grad_norm": 0.0647277906537056, + "learning_rate": 2.4913606373981825e-05, + "loss": 0.034, + "step": 105210 + }, + { + "epoch": 0.1261, + "grad_norm": 0.06109518185257912, + "learning_rate": 2.4909472725320045e-05, + "loss": 0.0358, + "step": 105220 + }, + { + "epoch": 0.12615, + "grad_norm": 0.08750329911708832, + "learning_rate": 2.4905339079133257e-05, + "loss": 0.0363, + "step": 105230 + }, + { + "epoch": 0.1262, + "grad_norm": 0.11697772890329361, + "learning_rate": 2.4901205435534457e-05, + "loss": 0.0351, + "step": 105240 + }, + { + "epoch": 0.12625, + "grad_norm": 0.08934105932712555, + "learning_rate": 2.489707179463667e-05, + "loss": 0.0361, + "step": 105250 + }, + { + "epoch": 0.1263, + "grad_norm": 0.08917882293462753, + "learning_rate": 2.4892938156552896e-05, + "loss": 0.0362, + "step": 105260 + }, + { + "epoch": 0.12635, + "grad_norm": 0.06422018259763718, + "learning_rate": 2.488880452139615e-05, + "loss": 0.036, + "step": 105270 + }, + { + "epoch": 0.1264, + "grad_norm": 0.07000827044248581, + "learning_rate": 2.4884670889279455e-05, + "loss": 0.0366, + "step": 105280 + }, + { + "epoch": 0.12645, + "grad_norm": 0.08107402920722961, + "learning_rate": 2.4880537260315808e-05, + "loss": 0.0354, + "step": 105290 + }, + { + "epoch": 0.1265, + "grad_norm": 0.07664652168750763, + "learning_rate": 2.4876403634618232e-05, + "loss": 0.0351, + "step": 105300 + }, + { + "epoch": 0.12655, + "grad_norm": 0.06906727701425552, + "learning_rate": 2.4872270012299725e-05, + "loss": 0.0354, + "step": 105310 + }, + { + "epoch": 0.1266, + "grad_norm": 0.07458998262882233, + "learning_rate": 2.4868136393473325e-05, + "loss": 0.0341, + "step": 105320 + }, + { + "epoch": 0.12665, + "grad_norm": 0.08547483384609222, + "learning_rate": 2.486400277825201e-05, + "loss": 0.0366, + "step": 105330 + }, + { + "epoch": 0.1267, + "grad_norm": 0.08772281557321548, + "learning_rate": 2.4859869166748808e-05, + "loss": 0.0359, + "step": 105340 + }, + { + "epoch": 0.12675, + "grad_norm": 0.06818553060293198, + "learning_rate": 2.485573555907674e-05, + "loss": 0.0394, + "step": 105350 + }, + { + "epoch": 0.1268, + "grad_norm": 0.08064989745616913, + "learning_rate": 2.4851601955348804e-05, + "loss": 0.0358, + "step": 105360 + }, + { + "epoch": 0.12685, + "grad_norm": 0.07183156907558441, + "learning_rate": 2.4847468355678016e-05, + "loss": 0.0344, + "step": 105370 + }, + { + "epoch": 0.1269, + "grad_norm": 0.08261267840862274, + "learning_rate": 2.4843334760177382e-05, + "loss": 0.0359, + "step": 105380 + }, + { + "epoch": 0.12695, + "grad_norm": 0.07098808884620667, + "learning_rate": 2.4839201168959912e-05, + "loss": 0.0355, + "step": 105390 + }, + { + "epoch": 0.127, + "grad_norm": 0.07491979748010635, + "learning_rate": 2.4835067582138638e-05, + "loss": 0.0357, + "step": 105400 + }, + { + "epoch": 0.12705, + "grad_norm": 0.07213471829891205, + "learning_rate": 2.483093399982654e-05, + "loss": 0.0345, + "step": 105410 + }, + { + "epoch": 0.1271, + "grad_norm": 0.06732694804668427, + "learning_rate": 2.4826800422136658e-05, + "loss": 0.0346, + "step": 105420 + }, + { + "epoch": 0.12715, + "grad_norm": 0.07045652717351913, + "learning_rate": 2.4822666849181967e-05, + "loss": 0.035, + "step": 105430 + }, + { + "epoch": 0.1272, + "grad_norm": 0.06555169820785522, + "learning_rate": 2.4818533281075513e-05, + "loss": 0.0339, + "step": 105440 + }, + { + "epoch": 0.12725, + "grad_norm": 0.06159425154328346, + "learning_rate": 2.4814399717930284e-05, + "loss": 0.0356, + "step": 105450 + }, + { + "epoch": 0.1273, + "grad_norm": 0.06212705001235008, + "learning_rate": 2.4810266159859297e-05, + "loss": 0.0397, + "step": 105460 + }, + { + "epoch": 0.12735, + "grad_norm": 0.07616399973630905, + "learning_rate": 2.4806132606975567e-05, + "loss": 0.0357, + "step": 105470 + }, + { + "epoch": 0.1274, + "grad_norm": 0.07089634239673615, + "learning_rate": 2.4801999059392095e-05, + "loss": 0.0354, + "step": 105480 + }, + { + "epoch": 0.12745, + "grad_norm": 0.0662955790758133, + "learning_rate": 2.4797865517221895e-05, + "loss": 0.0357, + "step": 105490 + }, + { + "epoch": 0.1275, + "grad_norm": 0.07061684131622314, + "learning_rate": 2.479373198057798e-05, + "loss": 0.0346, + "step": 105500 + }, + { + "epoch": 0.12755, + "grad_norm": 0.07761121541261673, + "learning_rate": 2.478959844957335e-05, + "loss": 0.0351, + "step": 105510 + }, + { + "epoch": 0.1276, + "grad_norm": 0.07609353214502335, + "learning_rate": 2.4785464924321014e-05, + "loss": 0.0361, + "step": 105520 + }, + { + "epoch": 0.12765, + "grad_norm": 0.09143120050430298, + "learning_rate": 2.478133140493399e-05, + "loss": 0.0346, + "step": 105530 + }, + { + "epoch": 0.1277, + "grad_norm": 0.06405118852853775, + "learning_rate": 2.4777197891525293e-05, + "loss": 0.0346, + "step": 105540 + }, + { + "epoch": 0.12775, + "grad_norm": 0.07845189422369003, + "learning_rate": 2.477306438420791e-05, + "loss": 0.0358, + "step": 105550 + }, + { + "epoch": 0.1278, + "grad_norm": 0.10201088339090347, + "learning_rate": 2.476893088309487e-05, + "loss": 0.0354, + "step": 105560 + }, + { + "epoch": 0.12785, + "grad_norm": 0.07596845179796219, + "learning_rate": 2.4764797388299167e-05, + "loss": 0.0337, + "step": 105570 + }, + { + "epoch": 0.1279, + "grad_norm": 0.09855609387159348, + "learning_rate": 2.476066389993382e-05, + "loss": 0.035, + "step": 105580 + }, + { + "epoch": 0.12795, + "grad_norm": 0.08643155544996262, + "learning_rate": 2.475653041811183e-05, + "loss": 0.0353, + "step": 105590 + }, + { + "epoch": 0.128, + "grad_norm": 0.07075414806604385, + "learning_rate": 2.4752396942946197e-05, + "loss": 0.0343, + "step": 105600 + }, + { + "epoch": 0.12805, + "grad_norm": 0.08212490379810333, + "learning_rate": 2.4748263474549958e-05, + "loss": 0.0353, + "step": 105610 + }, + { + "epoch": 0.1281, + "grad_norm": 0.05706174671649933, + "learning_rate": 2.4744130013036082e-05, + "loss": 0.0336, + "step": 105620 + }, + { + "epoch": 0.12815, + "grad_norm": 0.0789404883980751, + "learning_rate": 2.4739996558517614e-05, + "loss": 0.034, + "step": 105630 + }, + { + "epoch": 0.1282, + "grad_norm": 0.08181510120630264, + "learning_rate": 2.4735863111107528e-05, + "loss": 0.0352, + "step": 105640 + }, + { + "epoch": 0.12825, + "grad_norm": 0.0744834840297699, + "learning_rate": 2.4731729670918845e-05, + "loss": 0.034, + "step": 105650 + }, + { + "epoch": 0.1283, + "grad_norm": 0.07355952262878418, + "learning_rate": 2.4727596238064582e-05, + "loss": 0.0345, + "step": 105660 + }, + { + "epoch": 0.12835, + "grad_norm": 0.07162206619977951, + "learning_rate": 2.472346281265773e-05, + "loss": 0.0343, + "step": 105670 + }, + { + "epoch": 0.1284, + "grad_norm": 0.07498904317617416, + "learning_rate": 2.47193293948113e-05, + "loss": 0.0348, + "step": 105680 + }, + { + "epoch": 0.12845, + "grad_norm": 0.07180491834878922, + "learning_rate": 2.47151959846383e-05, + "loss": 0.0358, + "step": 105690 + }, + { + "epoch": 0.1285, + "grad_norm": 0.06147145479917526, + "learning_rate": 2.471106258225174e-05, + "loss": 0.0341, + "step": 105700 + }, + { + "epoch": 0.12855, + "grad_norm": 0.07467851787805557, + "learning_rate": 2.4706929187764614e-05, + "loss": 0.0352, + "step": 105710 + }, + { + "epoch": 0.1286, + "grad_norm": 0.08296094834804535, + "learning_rate": 2.4702795801289925e-05, + "loss": 0.0367, + "step": 105720 + }, + { + "epoch": 0.12865, + "grad_norm": 0.07414154708385468, + "learning_rate": 2.4698662422940702e-05, + "loss": 0.0359, + "step": 105730 + }, + { + "epoch": 0.1287, + "grad_norm": 0.08976276218891144, + "learning_rate": 2.4694529052829928e-05, + "loss": 0.0358, + "step": 105740 + }, + { + "epoch": 0.12875, + "grad_norm": 0.07030557096004486, + "learning_rate": 2.4690395691070624e-05, + "loss": 0.0345, + "step": 105750 + }, + { + "epoch": 0.1288, + "grad_norm": 0.08556876331567764, + "learning_rate": 2.4686262337775774e-05, + "loss": 0.0349, + "step": 105760 + }, + { + "epoch": 0.12885, + "grad_norm": 0.06981374323368073, + "learning_rate": 2.4682128993058404e-05, + "loss": 0.034, + "step": 105770 + }, + { + "epoch": 0.1289, + "grad_norm": 0.07156495004892349, + "learning_rate": 2.46779956570315e-05, + "loss": 0.0383, + "step": 105780 + }, + { + "epoch": 0.12895, + "grad_norm": 0.07469389587640762, + "learning_rate": 2.4673862329808077e-05, + "loss": 0.0342, + "step": 105790 + }, + { + "epoch": 0.129, + "grad_norm": 0.07833770662546158, + "learning_rate": 2.4669729011501137e-05, + "loss": 0.0358, + "step": 105800 + }, + { + "epoch": 0.12905, + "grad_norm": 0.07273399084806442, + "learning_rate": 2.466559570222367e-05, + "loss": 0.034, + "step": 105810 + }, + { + "epoch": 0.1291, + "grad_norm": 0.06519411504268646, + "learning_rate": 2.4661462402088712e-05, + "loss": 0.0343, + "step": 105820 + }, + { + "epoch": 0.12915, + "grad_norm": 0.07139147073030472, + "learning_rate": 2.4657329111209224e-05, + "loss": 0.0348, + "step": 105830 + }, + { + "epoch": 0.1292, + "grad_norm": 0.07328125089406967, + "learning_rate": 2.4653195829698238e-05, + "loss": 0.0354, + "step": 105840 + }, + { + "epoch": 0.12925, + "grad_norm": 0.0851602628827095, + "learning_rate": 2.464906255766875e-05, + "loss": 0.0348, + "step": 105850 + }, + { + "epoch": 0.1293, + "grad_norm": 0.07176851481199265, + "learning_rate": 2.464492929523376e-05, + "loss": 0.0345, + "step": 105860 + }, + { + "epoch": 0.12935, + "grad_norm": 0.07033860683441162, + "learning_rate": 2.464079604250627e-05, + "loss": 0.0346, + "step": 105870 + }, + { + "epoch": 0.1294, + "grad_norm": 0.07072217017412186, + "learning_rate": 2.4636662799599275e-05, + "loss": 0.0357, + "step": 105880 + }, + { + "epoch": 0.12945, + "grad_norm": 0.08237253874540329, + "learning_rate": 2.463252956662579e-05, + "loss": 0.0365, + "step": 105890 + }, + { + "epoch": 0.1295, + "grad_norm": 0.07988428324460983, + "learning_rate": 2.4628396343698803e-05, + "loss": 0.0361, + "step": 105900 + }, + { + "epoch": 0.12955, + "grad_norm": 0.072598896920681, + "learning_rate": 2.4624263130931317e-05, + "loss": 0.0355, + "step": 105910 + }, + { + "epoch": 0.1296, + "grad_norm": 0.07299260050058365, + "learning_rate": 2.462012992843635e-05, + "loss": 0.0356, + "step": 105920 + }, + { + "epoch": 0.12965, + "grad_norm": 0.0583757609128952, + "learning_rate": 2.461599673632687e-05, + "loss": 0.0346, + "step": 105930 + }, + { + "epoch": 0.1297, + "grad_norm": 0.06796620786190033, + "learning_rate": 2.461186355471591e-05, + "loss": 0.0345, + "step": 105940 + }, + { + "epoch": 0.12975, + "grad_norm": 0.06865128129720688, + "learning_rate": 2.460773038371645e-05, + "loss": 0.0354, + "step": 105950 + }, + { + "epoch": 0.1298, + "grad_norm": 0.06750306487083435, + "learning_rate": 2.4603597223441492e-05, + "loss": 0.0362, + "step": 105960 + }, + { + "epoch": 0.12985, + "grad_norm": 0.06639142334461212, + "learning_rate": 2.4599464074004037e-05, + "loss": 0.0344, + "step": 105970 + }, + { + "epoch": 0.1299, + "grad_norm": 0.08696747571229935, + "learning_rate": 2.4595330935517082e-05, + "loss": 0.0351, + "step": 105980 + }, + { + "epoch": 0.12995, + "grad_norm": 0.10079227387905121, + "learning_rate": 2.4591197808093634e-05, + "loss": 0.0341, + "step": 105990 + }, + { + "epoch": 0.13, + "grad_norm": 0.06885421276092529, + "learning_rate": 2.4587064691846678e-05, + "loss": 0.0355, + "step": 106000 + }, + { + "epoch": 0.13005, + "grad_norm": 0.10788242518901825, + "learning_rate": 2.4582931586889223e-05, + "loss": 0.0354, + "step": 106010 + }, + { + "epoch": 0.1301, + "grad_norm": 0.07853377610445023, + "learning_rate": 2.4578798493334256e-05, + "loss": 0.0357, + "step": 106020 + }, + { + "epoch": 0.13015, + "grad_norm": 0.07444460690021515, + "learning_rate": 2.457466541129478e-05, + "loss": 0.0347, + "step": 106030 + }, + { + "epoch": 0.1302, + "grad_norm": 0.07578030973672867, + "learning_rate": 2.4570532340883797e-05, + "loss": 0.0361, + "step": 106040 + }, + { + "epoch": 0.13025, + "grad_norm": 0.06895167380571365, + "learning_rate": 2.4566399282214295e-05, + "loss": 0.0346, + "step": 106050 + }, + { + "epoch": 0.1303, + "grad_norm": 0.06211972236633301, + "learning_rate": 2.456226623539928e-05, + "loss": 0.0359, + "step": 106060 + }, + { + "epoch": 0.13035, + "grad_norm": 0.0613601990044117, + "learning_rate": 2.4558133200551735e-05, + "loss": 0.0341, + "step": 106070 + }, + { + "epoch": 0.1304, + "grad_norm": 0.06245096027851105, + "learning_rate": 2.4554000177784666e-05, + "loss": 0.0347, + "step": 106080 + }, + { + "epoch": 0.13045, + "grad_norm": 0.059820763766765594, + "learning_rate": 2.454986716721106e-05, + "loss": 0.038, + "step": 106090 + }, + { + "epoch": 0.1305, + "grad_norm": 0.06739622354507446, + "learning_rate": 2.4545734168943914e-05, + "loss": 0.0354, + "step": 106100 + }, + { + "epoch": 0.13055, + "grad_norm": 0.06598929315805435, + "learning_rate": 2.454160118309624e-05, + "loss": 0.0336, + "step": 106110 + }, + { + "epoch": 0.1306, + "grad_norm": 0.08986051380634308, + "learning_rate": 2.4537468209781e-05, + "loss": 0.0359, + "step": 106120 + }, + { + "epoch": 0.13065, + "grad_norm": 0.07956880331039429, + "learning_rate": 2.453333524911122e-05, + "loss": 0.0345, + "step": 106130 + }, + { + "epoch": 0.1307, + "grad_norm": 0.06792153418064117, + "learning_rate": 2.4529202301199865e-05, + "loss": 0.0333, + "step": 106140 + }, + { + "epoch": 0.13075, + "grad_norm": 0.08238748461008072, + "learning_rate": 2.4525069366159955e-05, + "loss": 0.0334, + "step": 106150 + }, + { + "epoch": 0.1308, + "grad_norm": 0.07251271605491638, + "learning_rate": 2.4520936444104463e-05, + "loss": 0.0325, + "step": 106160 + }, + { + "epoch": 0.13085, + "grad_norm": 0.06597399711608887, + "learning_rate": 2.4516803535146387e-05, + "loss": 0.033, + "step": 106170 + }, + { + "epoch": 0.1309, + "grad_norm": 0.057592928409576416, + "learning_rate": 2.451267063939873e-05, + "loss": 0.0342, + "step": 106180 + }, + { + "epoch": 0.13095, + "grad_norm": 0.07524098455905914, + "learning_rate": 2.4508537756974465e-05, + "loss": 0.0346, + "step": 106190 + }, + { + "epoch": 0.131, + "grad_norm": 0.07812030613422394, + "learning_rate": 2.45044048879866e-05, + "loss": 0.0366, + "step": 106200 + }, + { + "epoch": 0.13105, + "grad_norm": 0.10690612345933914, + "learning_rate": 2.450027203254811e-05, + "loss": 0.0377, + "step": 106210 + }, + { + "epoch": 0.1311, + "grad_norm": 0.1581622064113617, + "learning_rate": 2.4496139190771997e-05, + "loss": 0.036, + "step": 106220 + }, + { + "epoch": 0.13115, + "grad_norm": 0.09735545516014099, + "learning_rate": 2.4492006362771257e-05, + "loss": 0.0357, + "step": 106230 + }, + { + "epoch": 0.1312, + "grad_norm": 0.07339781522750854, + "learning_rate": 2.4487873548658865e-05, + "loss": 0.0358, + "step": 106240 + }, + { + "epoch": 0.13125, + "grad_norm": 0.08246419578790665, + "learning_rate": 2.4483740748547827e-05, + "loss": 0.0369, + "step": 106250 + }, + { + "epoch": 0.1313, + "grad_norm": 0.08868840336799622, + "learning_rate": 2.4479607962551115e-05, + "loss": 0.0365, + "step": 106260 + }, + { + "epoch": 0.13135, + "grad_norm": 0.07506828755140305, + "learning_rate": 2.4475475190781728e-05, + "loss": 0.0386, + "step": 106270 + }, + { + "epoch": 0.1314, + "grad_norm": 0.07542876154184341, + "learning_rate": 2.4471342433352653e-05, + "loss": 0.0358, + "step": 106280 + }, + { + "epoch": 0.13145, + "grad_norm": 0.07146627455949783, + "learning_rate": 2.4467209690376873e-05, + "loss": 0.0361, + "step": 106290 + }, + { + "epoch": 0.1315, + "grad_norm": 0.08321649581193924, + "learning_rate": 2.4463076961967387e-05, + "loss": 0.0343, + "step": 106300 + }, + { + "epoch": 0.13155, + "grad_norm": 0.08326517790555954, + "learning_rate": 2.4458944248237165e-05, + "loss": 0.0368, + "step": 106310 + }, + { + "epoch": 0.1316, + "grad_norm": 0.06860451400279999, + "learning_rate": 2.4454811549299218e-05, + "loss": 0.034, + "step": 106320 + }, + { + "epoch": 0.13165, + "grad_norm": 0.0711694061756134, + "learning_rate": 2.4450678865266503e-05, + "loss": 0.0346, + "step": 106330 + }, + { + "epoch": 0.1317, + "grad_norm": 0.06888732314109802, + "learning_rate": 2.444654619625204e-05, + "loss": 0.0335, + "step": 106340 + }, + { + "epoch": 0.13175, + "grad_norm": 0.08941102027893066, + "learning_rate": 2.4442413542368776e-05, + "loss": 0.0369, + "step": 106350 + }, + { + "epoch": 0.1318, + "grad_norm": 0.07423651218414307, + "learning_rate": 2.4438280903729722e-05, + "loss": 0.0346, + "step": 106360 + }, + { + "epoch": 0.13185, + "grad_norm": 0.0659366324543953, + "learning_rate": 2.4434148280447867e-05, + "loss": 0.0345, + "step": 106370 + }, + { + "epoch": 0.1319, + "grad_norm": 0.08936917036771774, + "learning_rate": 2.4430015672636178e-05, + "loss": 0.0354, + "step": 106380 + }, + { + "epoch": 0.13195, + "grad_norm": 0.07357953488826752, + "learning_rate": 2.4425883080407648e-05, + "loss": 0.0354, + "step": 106390 + }, + { + "epoch": 0.132, + "grad_norm": 0.06603465229272842, + "learning_rate": 2.442175050387526e-05, + "loss": 0.0344, + "step": 106400 + }, + { + "epoch": 0.13205, + "grad_norm": 0.07264716923236847, + "learning_rate": 2.4417617943151984e-05, + "loss": 0.0359, + "step": 106410 + }, + { + "epoch": 0.1321, + "grad_norm": 0.07183219492435455, + "learning_rate": 2.4413485398350835e-05, + "loss": 0.0352, + "step": 106420 + }, + { + "epoch": 0.13215, + "grad_norm": 0.06507682055234909, + "learning_rate": 2.4409352869584758e-05, + "loss": 0.0351, + "step": 106430 + }, + { + "epoch": 0.1322, + "grad_norm": 0.08611390739679337, + "learning_rate": 2.440522035696676e-05, + "loss": 0.0348, + "step": 106440 + }, + { + "epoch": 0.13225, + "grad_norm": 0.07270409166812897, + "learning_rate": 2.440108786060981e-05, + "loss": 0.0337, + "step": 106450 + }, + { + "epoch": 0.1323, + "grad_norm": 0.0700516477227211, + "learning_rate": 2.43969553806269e-05, + "loss": 0.0344, + "step": 106460 + }, + { + "epoch": 0.13235, + "grad_norm": 0.08884414285421371, + "learning_rate": 2.4392822917130997e-05, + "loss": 0.0338, + "step": 106470 + }, + { + "epoch": 0.1324, + "grad_norm": 0.0819149985909462, + "learning_rate": 2.438869047023509e-05, + "loss": 0.0338, + "step": 106480 + }, + { + "epoch": 0.13245, + "grad_norm": 0.08014719188213348, + "learning_rate": 2.4384558040052158e-05, + "loss": 0.0335, + "step": 106490 + }, + { + "epoch": 0.1325, + "grad_norm": 0.07839412987232208, + "learning_rate": 2.438042562669517e-05, + "loss": 0.0334, + "step": 106500 + }, + { + "epoch": 0.13255, + "grad_norm": 0.06804198771715164, + "learning_rate": 2.437629323027712e-05, + "loss": 0.0338, + "step": 106510 + }, + { + "epoch": 0.1326, + "grad_norm": 0.07780171930789948, + "learning_rate": 2.4372160850910973e-05, + "loss": 0.0339, + "step": 106520 + }, + { + "epoch": 0.13265, + "grad_norm": 0.08501968532800674, + "learning_rate": 2.4368028488709724e-05, + "loss": 0.0344, + "step": 106530 + }, + { + "epoch": 0.1327, + "grad_norm": 0.08473043888807297, + "learning_rate": 2.436389614378632e-05, + "loss": 0.0331, + "step": 106540 + }, + { + "epoch": 0.13275, + "grad_norm": 0.08621985465288162, + "learning_rate": 2.4359763816253768e-05, + "loss": 0.0334, + "step": 106550 + }, + { + "epoch": 0.1328, + "grad_norm": 0.06827308982610703, + "learning_rate": 2.4355631506225035e-05, + "loss": 0.0324, + "step": 106560 + }, + { + "epoch": 0.13285, + "grad_norm": 0.06201021745800972, + "learning_rate": 2.435149921381309e-05, + "loss": 0.0331, + "step": 106570 + }, + { + "epoch": 0.1329, + "grad_norm": 0.07178918272256851, + "learning_rate": 2.4347366939130918e-05, + "loss": 0.0343, + "step": 106580 + }, + { + "epoch": 0.13295, + "grad_norm": 0.07109556347131729, + "learning_rate": 2.4343234682291484e-05, + "loss": 0.0334, + "step": 106590 + }, + { + "epoch": 0.133, + "grad_norm": 0.08138655871152878, + "learning_rate": 2.4339102443407756e-05, + "loss": 0.0349, + "step": 106600 + }, + { + "epoch": 0.13305, + "grad_norm": 0.05992733687162399, + "learning_rate": 2.433497022259274e-05, + "loss": 0.0347, + "step": 106610 + }, + { + "epoch": 0.1331, + "grad_norm": 0.0833921730518341, + "learning_rate": 2.4330838019959374e-05, + "loss": 0.0363, + "step": 106620 + }, + { + "epoch": 0.13315, + "grad_norm": 0.07941515743732452, + "learning_rate": 2.4326705835620658e-05, + "loss": 0.0351, + "step": 106630 + }, + { + "epoch": 0.1332, + "grad_norm": 0.06688013672828674, + "learning_rate": 2.4322573669689536e-05, + "loss": 0.0335, + "step": 106640 + }, + { + "epoch": 0.13325, + "grad_norm": 0.08466209471225739, + "learning_rate": 2.4318441522279007e-05, + "loss": 0.035, + "step": 106650 + }, + { + "epoch": 0.1333, + "grad_norm": 0.08238755911588669, + "learning_rate": 2.4314309393502024e-05, + "loss": 0.0346, + "step": 106660 + }, + { + "epoch": 0.13335, + "grad_norm": 0.06920625269412994, + "learning_rate": 2.4310177283471567e-05, + "loss": 0.0373, + "step": 106670 + }, + { + "epoch": 0.1334, + "grad_norm": 0.07812193781137466, + "learning_rate": 2.430604519230061e-05, + "loss": 0.0339, + "step": 106680 + }, + { + "epoch": 0.13345, + "grad_norm": 0.06610584259033203, + "learning_rate": 2.4301913120102107e-05, + "loss": 0.0355, + "step": 106690 + }, + { + "epoch": 0.1335, + "grad_norm": 0.06607034802436829, + "learning_rate": 2.4297781066989045e-05, + "loss": 0.0369, + "step": 106700 + }, + { + "epoch": 0.13355, + "grad_norm": 0.06498633325099945, + "learning_rate": 2.4293649033074378e-05, + "loss": 0.0348, + "step": 106710 + }, + { + "epoch": 0.1336, + "grad_norm": 0.07355812937021255, + "learning_rate": 2.4289517018471087e-05, + "loss": 0.0354, + "step": 106720 + }, + { + "epoch": 0.13365, + "grad_norm": 0.06755322217941284, + "learning_rate": 2.4285385023292124e-05, + "loss": 0.0357, + "step": 106730 + }, + { + "epoch": 0.1337, + "grad_norm": 0.07552862912416458, + "learning_rate": 2.428125304765047e-05, + "loss": 0.0347, + "step": 106740 + }, + { + "epoch": 0.13375, + "grad_norm": 0.06588675826787949, + "learning_rate": 2.4277121091659095e-05, + "loss": 0.0356, + "step": 106750 + }, + { + "epoch": 0.1338, + "grad_norm": 0.06582847237586975, + "learning_rate": 2.4272989155430952e-05, + "loss": 0.0342, + "step": 106760 + }, + { + "epoch": 0.13385, + "grad_norm": 0.06949807703495026, + "learning_rate": 2.4268857239079017e-05, + "loss": 0.0356, + "step": 106770 + }, + { + "epoch": 0.1339, + "grad_norm": 0.06784377247095108, + "learning_rate": 2.4264725342716242e-05, + "loss": 0.0348, + "step": 106780 + }, + { + "epoch": 0.13395, + "grad_norm": 0.09562524408102036, + "learning_rate": 2.426059346645561e-05, + "loss": 0.0353, + "step": 106790 + }, + { + "epoch": 0.134, + "grad_norm": 0.07481709867715836, + "learning_rate": 2.4256461610410066e-05, + "loss": 0.0339, + "step": 106800 + }, + { + "epoch": 0.13405, + "grad_norm": 0.07583610713481903, + "learning_rate": 2.425232977469258e-05, + "loss": 0.0338, + "step": 106810 + }, + { + "epoch": 0.1341, + "grad_norm": 0.08295128494501114, + "learning_rate": 2.424819795941613e-05, + "loss": 0.0338, + "step": 106820 + }, + { + "epoch": 0.13415, + "grad_norm": 0.07903057336807251, + "learning_rate": 2.424406616469365e-05, + "loss": 0.0351, + "step": 106830 + }, + { + "epoch": 0.1342, + "grad_norm": 0.07755926251411438, + "learning_rate": 2.4239934390638135e-05, + "loss": 0.033, + "step": 106840 + }, + { + "epoch": 0.13425, + "grad_norm": 0.06308278441429138, + "learning_rate": 2.4235802637362516e-05, + "loss": 0.0328, + "step": 106850 + }, + { + "epoch": 0.1343, + "grad_norm": 0.09073317795991898, + "learning_rate": 2.4231670904979764e-05, + "loss": 0.0349, + "step": 106860 + }, + { + "epoch": 0.13435, + "grad_norm": 0.08459851890802383, + "learning_rate": 2.422753919360285e-05, + "loss": 0.0336, + "step": 106870 + }, + { + "epoch": 0.1344, + "grad_norm": 0.07097344845533371, + "learning_rate": 2.4223407503344716e-05, + "loss": 0.0335, + "step": 106880 + }, + { + "epoch": 0.13445, + "grad_norm": 0.07137028127908707, + "learning_rate": 2.4219275834318338e-05, + "loss": 0.0343, + "step": 106890 + }, + { + "epoch": 0.1345, + "grad_norm": 0.07252298295497894, + "learning_rate": 2.4215144186636658e-05, + "loss": 0.0334, + "step": 106900 + }, + { + "epoch": 0.13455, + "grad_norm": 0.09245292842388153, + "learning_rate": 2.4211012560412643e-05, + "loss": 0.0349, + "step": 106910 + }, + { + "epoch": 0.1346, + "grad_norm": 0.07488303631544113, + "learning_rate": 2.4206880955759247e-05, + "loss": 0.0338, + "step": 106920 + }, + { + "epoch": 0.13465, + "grad_norm": 0.06381545215845108, + "learning_rate": 2.4202749372789424e-05, + "loss": 0.0362, + "step": 106930 + }, + { + "epoch": 0.1347, + "grad_norm": 0.07639806717634201, + "learning_rate": 2.419861781161614e-05, + "loss": 0.0341, + "step": 106940 + }, + { + "epoch": 0.13475, + "grad_norm": 0.10640095174312592, + "learning_rate": 2.419448627235234e-05, + "loss": 0.035, + "step": 106950 + }, + { + "epoch": 0.1348, + "grad_norm": 0.09375517815351486, + "learning_rate": 2.419035475511099e-05, + "loss": 0.0348, + "step": 106960 + }, + { + "epoch": 0.13485, + "grad_norm": 0.05909932032227516, + "learning_rate": 2.4186223260005032e-05, + "loss": 0.0339, + "step": 106970 + }, + { + "epoch": 0.1349, + "grad_norm": 0.0712037906050682, + "learning_rate": 2.4182091787147425e-05, + "loss": 0.0348, + "step": 106980 + }, + { + "epoch": 0.13495, + "grad_norm": 0.05575518310070038, + "learning_rate": 2.417796033665112e-05, + "loss": 0.0341, + "step": 106990 + }, + { + "epoch": 0.135, + "grad_norm": 0.07256089895963669, + "learning_rate": 2.417382890862907e-05, + "loss": 0.0341, + "step": 107000 + }, + { + "epoch": 0.13505, + "grad_norm": 0.07149246335029602, + "learning_rate": 2.416969750319423e-05, + "loss": 0.0355, + "step": 107010 + }, + { + "epoch": 0.1351, + "grad_norm": 0.06963690370321274, + "learning_rate": 2.416556612045954e-05, + "loss": 0.0335, + "step": 107020 + }, + { + "epoch": 0.13515, + "grad_norm": 0.06591261178255081, + "learning_rate": 2.4161434760537976e-05, + "loss": 0.0349, + "step": 107030 + }, + { + "epoch": 0.1352, + "grad_norm": 0.09097351133823395, + "learning_rate": 2.4157303423542452e-05, + "loss": 0.0359, + "step": 107040 + }, + { + "epoch": 0.13525, + "grad_norm": 0.06748231500387192, + "learning_rate": 2.4153172109585942e-05, + "loss": 0.0333, + "step": 107050 + }, + { + "epoch": 0.1353, + "grad_norm": 0.0742628425359726, + "learning_rate": 2.4149040818781395e-05, + "loss": 0.0352, + "step": 107060 + }, + { + "epoch": 0.13535, + "grad_norm": 0.09796442836523056, + "learning_rate": 2.4144909551241745e-05, + "loss": 0.0365, + "step": 107070 + }, + { + "epoch": 0.1354, + "grad_norm": 0.08070221543312073, + "learning_rate": 2.4140778307079954e-05, + "loss": 0.0349, + "step": 107080 + }, + { + "epoch": 0.13545, + "grad_norm": 0.08046849071979523, + "learning_rate": 2.4136647086408952e-05, + "loss": 0.0347, + "step": 107090 + }, + { + "epoch": 0.1355, + "grad_norm": 0.06781762838363647, + "learning_rate": 2.41325158893417e-05, + "loss": 0.0357, + "step": 107100 + }, + { + "epoch": 0.13555, + "grad_norm": 0.08092023432254791, + "learning_rate": 2.412838471599114e-05, + "loss": 0.0357, + "step": 107110 + }, + { + "epoch": 0.1356, + "grad_norm": 0.07321129739284515, + "learning_rate": 2.4124253566470204e-05, + "loss": 0.0336, + "step": 107120 + }, + { + "epoch": 0.13565, + "grad_norm": 0.07404232025146484, + "learning_rate": 2.412012244089186e-05, + "loss": 0.0368, + "step": 107130 + }, + { + "epoch": 0.1357, + "grad_norm": 0.07883698493242264, + "learning_rate": 2.4115991339369025e-05, + "loss": 0.0345, + "step": 107140 + }, + { + "epoch": 0.13575, + "grad_norm": 0.07799618691205978, + "learning_rate": 2.4111860262014666e-05, + "loss": 0.0344, + "step": 107150 + }, + { + "epoch": 0.1358, + "grad_norm": 0.06702414900064468, + "learning_rate": 2.4107729208941705e-05, + "loss": 0.0334, + "step": 107160 + }, + { + "epoch": 0.13585, + "grad_norm": 0.0739661380648613, + "learning_rate": 2.41035981802631e-05, + "loss": 0.0334, + "step": 107170 + }, + { + "epoch": 0.1359, + "grad_norm": 0.07123866677284241, + "learning_rate": 2.409946717609178e-05, + "loss": 0.0332, + "step": 107180 + }, + { + "epoch": 0.13595, + "grad_norm": 0.07094275206327438, + "learning_rate": 2.4095336196540685e-05, + "loss": 0.0348, + "step": 107190 + }, + { + "epoch": 0.136, + "grad_norm": 0.06380019336938858, + "learning_rate": 2.4091205241722767e-05, + "loss": 0.0331, + "step": 107200 + }, + { + "epoch": 0.13605, + "grad_norm": 0.066954106092453, + "learning_rate": 2.408707431175095e-05, + "loss": 0.0342, + "step": 107210 + }, + { + "epoch": 0.1361, + "grad_norm": 0.06480351090431213, + "learning_rate": 2.4082943406738185e-05, + "loss": 0.0332, + "step": 107220 + }, + { + "epoch": 0.13615, + "grad_norm": 0.07648269832134247, + "learning_rate": 2.407881252679739e-05, + "loss": 0.0369, + "step": 107230 + }, + { + "epoch": 0.1362, + "grad_norm": 0.10895515233278275, + "learning_rate": 2.407468167204152e-05, + "loss": 0.0338, + "step": 107240 + }, + { + "epoch": 0.13625, + "grad_norm": 0.062433335930109024, + "learning_rate": 2.407055084258351e-05, + "loss": 0.0339, + "step": 107250 + }, + { + "epoch": 0.1363, + "grad_norm": 0.08131091296672821, + "learning_rate": 2.4066420038536288e-05, + "loss": 0.0346, + "step": 107260 + }, + { + "epoch": 0.13635, + "grad_norm": 0.07425548881292343, + "learning_rate": 2.4062289260012797e-05, + "loss": 0.0343, + "step": 107270 + }, + { + "epoch": 0.1364, + "grad_norm": 0.0837615355849266, + "learning_rate": 2.405815850712596e-05, + "loss": 0.0336, + "step": 107280 + }, + { + "epoch": 0.13645, + "grad_norm": 0.06968902796506882, + "learning_rate": 2.405402777998872e-05, + "loss": 0.0329, + "step": 107290 + }, + { + "epoch": 0.1365, + "grad_norm": 0.06833193451166153, + "learning_rate": 2.4049897078714e-05, + "loss": 0.0323, + "step": 107300 + }, + { + "epoch": 0.13655, + "grad_norm": 0.06843312829732895, + "learning_rate": 2.4045766403414728e-05, + "loss": 0.0336, + "step": 107310 + }, + { + "epoch": 0.1366, + "grad_norm": 0.06134680286049843, + "learning_rate": 2.404163575420386e-05, + "loss": 0.0367, + "step": 107320 + }, + { + "epoch": 0.13665, + "grad_norm": 0.07994901388883591, + "learning_rate": 2.40375051311943e-05, + "loss": 0.0356, + "step": 107330 + }, + { + "epoch": 0.1367, + "grad_norm": 0.07322155684232712, + "learning_rate": 2.4033374534499004e-05, + "loss": 0.0344, + "step": 107340 + }, + { + "epoch": 0.13675, + "grad_norm": 0.07866465300321579, + "learning_rate": 2.4029243964230867e-05, + "loss": 0.0348, + "step": 107350 + }, + { + "epoch": 0.1368, + "grad_norm": 0.06857824325561523, + "learning_rate": 2.4025113420502843e-05, + "loss": 0.0338, + "step": 107360 + }, + { + "epoch": 0.13685, + "grad_norm": 0.05879867821931839, + "learning_rate": 2.402098290342785e-05, + "loss": 0.0346, + "step": 107370 + }, + { + "epoch": 0.1369, + "grad_norm": 0.0671166256070137, + "learning_rate": 2.4016852413118815e-05, + "loss": 0.0374, + "step": 107380 + }, + { + "epoch": 0.13695, + "grad_norm": 0.07220599055290222, + "learning_rate": 2.401272194968867e-05, + "loss": 0.0331, + "step": 107390 + }, + { + "epoch": 0.137, + "grad_norm": 0.06513206660747528, + "learning_rate": 2.4008591513250332e-05, + "loss": 0.0358, + "step": 107400 + }, + { + "epoch": 0.13705, + "grad_norm": 0.058335594832897186, + "learning_rate": 2.4004461103916736e-05, + "loss": 0.0379, + "step": 107410 + }, + { + "epoch": 0.1371, + "grad_norm": 0.06773054599761963, + "learning_rate": 2.4000330721800796e-05, + "loss": 0.0377, + "step": 107420 + }, + { + "epoch": 0.13715, + "grad_norm": 0.07019204646348953, + "learning_rate": 2.3996200367015428e-05, + "loss": 0.0358, + "step": 107430 + }, + { + "epoch": 0.1372, + "grad_norm": 0.07152324169874191, + "learning_rate": 2.399207003967358e-05, + "loss": 0.0362, + "step": 107440 + }, + { + "epoch": 0.13725, + "grad_norm": 0.06750397384166718, + "learning_rate": 2.3987939739888153e-05, + "loss": 0.0377, + "step": 107450 + }, + { + "epoch": 0.1373, + "grad_norm": 0.07339220494031906, + "learning_rate": 2.3983809467772075e-05, + "loss": 0.0348, + "step": 107460 + }, + { + "epoch": 0.13735, + "grad_norm": 0.06790155917406082, + "learning_rate": 2.397967922343826e-05, + "loss": 0.0351, + "step": 107470 + }, + { + "epoch": 0.1374, + "grad_norm": 0.06944229453802109, + "learning_rate": 2.3975549006999638e-05, + "loss": 0.0341, + "step": 107480 + }, + { + "epoch": 0.13745, + "grad_norm": 0.06574300676584244, + "learning_rate": 2.3971418818569115e-05, + "loss": 0.0356, + "step": 107490 + }, + { + "epoch": 0.1375, + "grad_norm": 0.07261010259389877, + "learning_rate": 2.3967288658259617e-05, + "loss": 0.0351, + "step": 107500 + }, + { + "epoch": 0.13755, + "grad_norm": 0.06218640133738518, + "learning_rate": 2.3963158526184066e-05, + "loss": 0.0369, + "step": 107510 + }, + { + "epoch": 0.1376, + "grad_norm": 0.06601933389902115, + "learning_rate": 2.3959028422455357e-05, + "loss": 0.0349, + "step": 107520 + }, + { + "epoch": 0.13765, + "grad_norm": 0.07100588083267212, + "learning_rate": 2.3954898347186436e-05, + "loss": 0.0345, + "step": 107530 + }, + { + "epoch": 0.1377, + "grad_norm": 0.0939004197716713, + "learning_rate": 2.3950768300490187e-05, + "loss": 0.0363, + "step": 107540 + }, + { + "epoch": 0.13775, + "grad_norm": 0.06147817522287369, + "learning_rate": 2.394663828247955e-05, + "loss": 0.0364, + "step": 107550 + }, + { + "epoch": 0.1378, + "grad_norm": 0.0856638178229332, + "learning_rate": 2.394250829326742e-05, + "loss": 0.0352, + "step": 107560 + }, + { + "epoch": 0.13785, + "grad_norm": 0.07360093295574188, + "learning_rate": 2.3938378332966714e-05, + "loss": 0.0371, + "step": 107570 + }, + { + "epoch": 0.1379, + "grad_norm": 0.07323596626520157, + "learning_rate": 2.3934248401690356e-05, + "loss": 0.0358, + "step": 107580 + }, + { + "epoch": 0.13795, + "grad_norm": 0.07091102004051208, + "learning_rate": 2.3930118499551236e-05, + "loss": 0.0387, + "step": 107590 + }, + { + "epoch": 0.138, + "grad_norm": 0.06781253218650818, + "learning_rate": 2.392598862666228e-05, + "loss": 0.035, + "step": 107600 + }, + { + "epoch": 0.13805, + "grad_norm": 0.0743185356259346, + "learning_rate": 2.3921858783136387e-05, + "loss": 0.0354, + "step": 107610 + }, + { + "epoch": 0.1381, + "grad_norm": 0.11321161687374115, + "learning_rate": 2.3917728969086468e-05, + "loss": 0.0359, + "step": 107620 + }, + { + "epoch": 0.13815, + "grad_norm": 0.11684630066156387, + "learning_rate": 2.3913599184625442e-05, + "loss": 0.0356, + "step": 107630 + }, + { + "epoch": 0.1382, + "grad_norm": 0.12119896709918976, + "learning_rate": 2.3909469429866192e-05, + "loss": 0.036, + "step": 107640 + }, + { + "epoch": 0.13825, + "grad_norm": 0.0739327073097229, + "learning_rate": 2.3905339704921652e-05, + "loss": 0.0354, + "step": 107650 + }, + { + "epoch": 0.1383, + "grad_norm": 0.07066735625267029, + "learning_rate": 2.3901210009904707e-05, + "loss": 0.0348, + "step": 107660 + }, + { + "epoch": 0.13835, + "grad_norm": 0.08110303431749344, + "learning_rate": 2.3897080344928273e-05, + "loss": 0.0358, + "step": 107670 + }, + { + "epoch": 0.1384, + "grad_norm": 0.08074736595153809, + "learning_rate": 2.3892950710105243e-05, + "loss": 0.0351, + "step": 107680 + }, + { + "epoch": 0.13845, + "grad_norm": 0.0737789049744606, + "learning_rate": 2.3888821105548523e-05, + "loss": 0.0348, + "step": 107690 + }, + { + "epoch": 0.1385, + "grad_norm": 0.06845972687005997, + "learning_rate": 2.3884691531371023e-05, + "loss": 0.0346, + "step": 107700 + }, + { + "epoch": 0.13855, + "grad_norm": 0.07794822752475739, + "learning_rate": 2.3880561987685627e-05, + "loss": 0.0337, + "step": 107710 + }, + { + "epoch": 0.1386, + "grad_norm": 0.07079590857028961, + "learning_rate": 2.387643247460526e-05, + "loss": 0.0337, + "step": 107720 + }, + { + "epoch": 0.13865, + "grad_norm": 0.07472711056470871, + "learning_rate": 2.387230299224279e-05, + "loss": 0.0365, + "step": 107730 + }, + { + "epoch": 0.1387, + "grad_norm": 0.0704125314950943, + "learning_rate": 2.386817354071115e-05, + "loss": 0.0336, + "step": 107740 + }, + { + "epoch": 0.13875, + "grad_norm": 0.06483296304941177, + "learning_rate": 2.3864044120123205e-05, + "loss": 0.0346, + "step": 107750 + }, + { + "epoch": 0.1388, + "grad_norm": 0.061506740748882294, + "learning_rate": 2.3859914730591873e-05, + "loss": 0.0339, + "step": 107760 + }, + { + "epoch": 0.13885, + "grad_norm": 0.08290746808052063, + "learning_rate": 2.385578537223005e-05, + "loss": 0.0332, + "step": 107770 + }, + { + "epoch": 0.1389, + "grad_norm": 0.08351372927427292, + "learning_rate": 2.3851656045150617e-05, + "loss": 0.0342, + "step": 107780 + }, + { + "epoch": 0.13895, + "grad_norm": 0.06477633863687515, + "learning_rate": 2.384752674946648e-05, + "loss": 0.0341, + "step": 107790 + }, + { + "epoch": 0.139, + "grad_norm": 0.07235559821128845, + "learning_rate": 2.3843397485290527e-05, + "loss": 0.0335, + "step": 107800 + }, + { + "epoch": 0.13905, + "grad_norm": 0.06576807051897049, + "learning_rate": 2.3839268252735647e-05, + "loss": 0.0339, + "step": 107810 + }, + { + "epoch": 0.1391, + "grad_norm": 0.0647667944431305, + "learning_rate": 2.3835139051914753e-05, + "loss": 0.0357, + "step": 107820 + }, + { + "epoch": 0.13915, + "grad_norm": 0.09135240316390991, + "learning_rate": 2.3831009882940704e-05, + "loss": 0.0353, + "step": 107830 + }, + { + "epoch": 0.1392, + "grad_norm": 0.07542411983013153, + "learning_rate": 2.3826880745926418e-05, + "loss": 0.0341, + "step": 107840 + }, + { + "epoch": 0.13925, + "grad_norm": 0.0774625837802887, + "learning_rate": 2.3822751640984757e-05, + "loss": 0.0339, + "step": 107850 + }, + { + "epoch": 0.1393, + "grad_norm": 0.07527747005224228, + "learning_rate": 2.3818622568228633e-05, + "loss": 0.0335, + "step": 107860 + }, + { + "epoch": 0.13935, + "grad_norm": 0.0802069753408432, + "learning_rate": 2.3814493527770923e-05, + "loss": 0.034, + "step": 107870 + }, + { + "epoch": 0.1394, + "grad_norm": 0.07678884267807007, + "learning_rate": 2.3810364519724515e-05, + "loss": 0.0349, + "step": 107880 + }, + { + "epoch": 0.13945, + "grad_norm": 0.09053342044353485, + "learning_rate": 2.3806235544202295e-05, + "loss": 0.0341, + "step": 107890 + }, + { + "epoch": 0.1395, + "grad_norm": 0.07176068425178528, + "learning_rate": 2.3802106601317146e-05, + "loss": 0.0368, + "step": 107900 + }, + { + "epoch": 0.13955, + "grad_norm": 0.09751512855291367, + "learning_rate": 2.3797977691181957e-05, + "loss": 0.0366, + "step": 107910 + }, + { + "epoch": 0.1396, + "grad_norm": 0.06735493242740631, + "learning_rate": 2.3793848813909596e-05, + "loss": 0.0348, + "step": 107920 + }, + { + "epoch": 0.13965, + "grad_norm": 0.06756073981523514, + "learning_rate": 2.3789719969612966e-05, + "loss": 0.0352, + "step": 107930 + }, + { + "epoch": 0.1397, + "grad_norm": 0.07313332706689835, + "learning_rate": 2.3785591158404922e-05, + "loss": 0.0353, + "step": 107940 + }, + { + "epoch": 0.13975, + "grad_norm": 0.0708153247833252, + "learning_rate": 2.378146238039837e-05, + "loss": 0.0356, + "step": 107950 + }, + { + "epoch": 0.1398, + "grad_norm": 0.0632934644818306, + "learning_rate": 2.377733363570618e-05, + "loss": 0.0342, + "step": 107960 + }, + { + "epoch": 0.13985, + "grad_norm": 0.06964541226625443, + "learning_rate": 2.3773204924441227e-05, + "loss": 0.0332, + "step": 107970 + }, + { + "epoch": 0.1399, + "grad_norm": 0.08246681839227676, + "learning_rate": 2.3769076246716395e-05, + "loss": 0.0349, + "step": 107980 + }, + { + "epoch": 0.13995, + "grad_norm": 0.08017262071371078, + "learning_rate": 2.376494760264455e-05, + "loss": 0.0342, + "step": 107990 + }, + { + "epoch": 0.14, + "grad_norm": 0.07744333148002625, + "learning_rate": 2.3760818992338573e-05, + "loss": 0.0338, + "step": 108000 + }, + { + "epoch": 0.14005, + "grad_norm": 0.08109390735626221, + "learning_rate": 2.3756690415911346e-05, + "loss": 0.0349, + "step": 108010 + }, + { + "epoch": 0.1401, + "grad_norm": 0.07111233472824097, + "learning_rate": 2.3752561873475724e-05, + "loss": 0.0332, + "step": 108020 + }, + { + "epoch": 0.14015, + "grad_norm": 0.08744429051876068, + "learning_rate": 2.3748433365144606e-05, + "loss": 0.0332, + "step": 108030 + }, + { + "epoch": 0.1402, + "grad_norm": 0.08111266046762466, + "learning_rate": 2.3744304891030837e-05, + "loss": 0.0334, + "step": 108040 + }, + { + "epoch": 0.14025, + "grad_norm": 0.0656583309173584, + "learning_rate": 2.3740176451247314e-05, + "loss": 0.0333, + "step": 108050 + }, + { + "epoch": 0.1403, + "grad_norm": 0.08800430595874786, + "learning_rate": 2.3736048045906877e-05, + "loss": 0.0372, + "step": 108060 + }, + { + "epoch": 0.14035, + "grad_norm": 0.07889630645513535, + "learning_rate": 2.373191967512242e-05, + "loss": 0.033, + "step": 108070 + }, + { + "epoch": 0.1404, + "grad_norm": 0.0784083753824234, + "learning_rate": 2.372779133900681e-05, + "loss": 0.0347, + "step": 108080 + }, + { + "epoch": 0.14045, + "grad_norm": 0.06280580163002014, + "learning_rate": 2.3723663037672898e-05, + "loss": 0.0343, + "step": 108090 + }, + { + "epoch": 0.1405, + "grad_norm": 0.08633331209421158, + "learning_rate": 2.3719534771233563e-05, + "loss": 0.0373, + "step": 108100 + }, + { + "epoch": 0.14055, + "grad_norm": 0.07899964600801468, + "learning_rate": 2.3715406539801663e-05, + "loss": 0.0347, + "step": 108110 + }, + { + "epoch": 0.1406, + "grad_norm": 0.08867514878511429, + "learning_rate": 2.371127834349007e-05, + "loss": 0.035, + "step": 108120 + }, + { + "epoch": 0.14065, + "grad_norm": 0.06929611414670944, + "learning_rate": 2.3707150182411637e-05, + "loss": 0.0343, + "step": 108130 + }, + { + "epoch": 0.1407, + "grad_norm": 0.07136829197406769, + "learning_rate": 2.3703022056679227e-05, + "loss": 0.0376, + "step": 108140 + }, + { + "epoch": 0.14075, + "grad_norm": 0.06537345796823502, + "learning_rate": 2.369889396640572e-05, + "loss": 0.037, + "step": 108150 + }, + { + "epoch": 0.1408, + "grad_norm": 0.06904958933591843, + "learning_rate": 2.3694765911703957e-05, + "loss": 0.0349, + "step": 108160 + }, + { + "epoch": 0.14085, + "grad_norm": 0.07893108576536179, + "learning_rate": 2.3690637892686808e-05, + "loss": 0.0357, + "step": 108170 + }, + { + "epoch": 0.1409, + "grad_norm": 0.08251333981752396, + "learning_rate": 2.368650990946712e-05, + "loss": 0.0353, + "step": 108180 + }, + { + "epoch": 0.14095, + "grad_norm": 0.0642128735780716, + "learning_rate": 2.3682381962157766e-05, + "loss": 0.0335, + "step": 108190 + }, + { + "epoch": 0.141, + "grad_norm": 0.059158291667699814, + "learning_rate": 2.3678254050871587e-05, + "loss": 0.0351, + "step": 108200 + }, + { + "epoch": 0.14105, + "grad_norm": 0.07299425452947617, + "learning_rate": 2.367412617572145e-05, + "loss": 0.0354, + "step": 108210 + }, + { + "epoch": 0.1411, + "grad_norm": 0.06661834567785263, + "learning_rate": 2.3669998336820205e-05, + "loss": 0.0349, + "step": 108220 + }, + { + "epoch": 0.14115, + "grad_norm": 0.07236451655626297, + "learning_rate": 2.3665870534280696e-05, + "loss": 0.0351, + "step": 108230 + }, + { + "epoch": 0.1412, + "grad_norm": 0.07977017015218735, + "learning_rate": 2.3661742768215802e-05, + "loss": 0.0339, + "step": 108240 + }, + { + "epoch": 0.14125, + "grad_norm": 0.0703221783041954, + "learning_rate": 2.3657615038738343e-05, + "loss": 0.0342, + "step": 108250 + }, + { + "epoch": 0.1413, + "grad_norm": 0.0639897808432579, + "learning_rate": 2.365348734596119e-05, + "loss": 0.035, + "step": 108260 + }, + { + "epoch": 0.14135, + "grad_norm": 0.08083923906087875, + "learning_rate": 2.364935968999719e-05, + "loss": 0.0371, + "step": 108270 + }, + { + "epoch": 0.1414, + "grad_norm": 0.06952771544456482, + "learning_rate": 2.3645232070959185e-05, + "loss": 0.0355, + "step": 108280 + }, + { + "epoch": 0.14145, + "grad_norm": 0.06466667354106903, + "learning_rate": 2.3641104488960032e-05, + "loss": 0.0383, + "step": 108290 + }, + { + "epoch": 0.1415, + "grad_norm": 0.07003355026245117, + "learning_rate": 2.3636976944112568e-05, + "loss": 0.0353, + "step": 108300 + }, + { + "epoch": 0.14155, + "grad_norm": 0.06676148623228073, + "learning_rate": 2.3632849436529643e-05, + "loss": 0.0356, + "step": 108310 + }, + { + "epoch": 0.1416, + "grad_norm": 0.09525799751281738, + "learning_rate": 2.36287219663241e-05, + "loss": 0.0376, + "step": 108320 + }, + { + "epoch": 0.14165, + "grad_norm": 0.07232654094696045, + "learning_rate": 2.3624594533608776e-05, + "loss": 0.0359, + "step": 108330 + }, + { + "epoch": 0.1417, + "grad_norm": 0.06810037791728973, + "learning_rate": 2.362046713849654e-05, + "loss": 0.0357, + "step": 108340 + }, + { + "epoch": 0.14175, + "grad_norm": 0.06049606204032898, + "learning_rate": 2.361633978110019e-05, + "loss": 0.0352, + "step": 108350 + }, + { + "epoch": 0.1418, + "grad_norm": 0.06264954805374146, + "learning_rate": 2.361221246153261e-05, + "loss": 0.036, + "step": 108360 + }, + { + "epoch": 0.14185, + "grad_norm": 0.08460656553506851, + "learning_rate": 2.3608085179906607e-05, + "loss": 0.039, + "step": 108370 + }, + { + "epoch": 0.1419, + "grad_norm": 0.07102148234844208, + "learning_rate": 2.3603957936335043e-05, + "loss": 0.0375, + "step": 108380 + }, + { + "epoch": 0.14195, + "grad_norm": 0.07672837376594543, + "learning_rate": 2.359983073093074e-05, + "loss": 0.0358, + "step": 108390 + }, + { + "epoch": 0.142, + "grad_norm": 0.05874524638056755, + "learning_rate": 2.3595703563806536e-05, + "loss": 0.0349, + "step": 108400 + }, + { + "epoch": 0.14205, + "grad_norm": 0.07055836915969849, + "learning_rate": 2.3591576435075276e-05, + "loss": 0.0349, + "step": 108410 + }, + { + "epoch": 0.1421, + "grad_norm": 0.055572111159563065, + "learning_rate": 2.358744934484978e-05, + "loss": 0.0343, + "step": 108420 + }, + { + "epoch": 0.14215, + "grad_norm": 0.06013401970267296, + "learning_rate": 2.3583322293242893e-05, + "loss": 0.0346, + "step": 108430 + }, + { + "epoch": 0.1422, + "grad_norm": 0.07167872786521912, + "learning_rate": 2.3579195280367434e-05, + "loss": 0.0342, + "step": 108440 + }, + { + "epoch": 0.14225, + "grad_norm": 0.07439404726028442, + "learning_rate": 2.3575068306336245e-05, + "loss": 0.0391, + "step": 108450 + }, + { + "epoch": 0.1423, + "grad_norm": 0.08786989748477936, + "learning_rate": 2.3570941371262158e-05, + "loss": 0.0361, + "step": 108460 + }, + { + "epoch": 0.14235, + "grad_norm": 0.0756273940205574, + "learning_rate": 2.3566814475257994e-05, + "loss": 0.0346, + "step": 108470 + }, + { + "epoch": 0.1424, + "grad_norm": 0.07808423787355423, + "learning_rate": 2.356268761843659e-05, + "loss": 0.0333, + "step": 108480 + }, + { + "epoch": 0.14245, + "grad_norm": 0.07155963033437729, + "learning_rate": 2.355856080091076e-05, + "loss": 0.0335, + "step": 108490 + }, + { + "epoch": 0.1425, + "grad_norm": 0.08156801760196686, + "learning_rate": 2.3554434022793344e-05, + "loss": 0.0361, + "step": 108500 + }, + { + "epoch": 0.14255, + "grad_norm": 0.06574303656816483, + "learning_rate": 2.3550307284197148e-05, + "loss": 0.0346, + "step": 108510 + }, + { + "epoch": 0.1426, + "grad_norm": 0.07332491129636765, + "learning_rate": 2.3546180585235003e-05, + "loss": 0.0381, + "step": 108520 + }, + { + "epoch": 0.14265, + "grad_norm": 0.07091228663921356, + "learning_rate": 2.3542053926019753e-05, + "loss": 0.034, + "step": 108530 + }, + { + "epoch": 0.1427, + "grad_norm": 0.06543724983930588, + "learning_rate": 2.353792730666418e-05, + "loss": 0.037, + "step": 108540 + }, + { + "epoch": 0.14275, + "grad_norm": 0.07887712121009827, + "learning_rate": 2.3533800727281145e-05, + "loss": 0.0356, + "step": 108550 + }, + { + "epoch": 0.1428, + "grad_norm": 0.08726233243942261, + "learning_rate": 2.3529674187983433e-05, + "loss": 0.0362, + "step": 108560 + }, + { + "epoch": 0.14285, + "grad_norm": 0.07006168365478516, + "learning_rate": 2.3525547688883885e-05, + "loss": 0.0351, + "step": 108570 + }, + { + "epoch": 0.1429, + "grad_norm": 0.07025811076164246, + "learning_rate": 2.3521421230095303e-05, + "loss": 0.0352, + "step": 108580 + }, + { + "epoch": 0.14295, + "grad_norm": 0.0629371628165245, + "learning_rate": 2.351729481173051e-05, + "loss": 0.0356, + "step": 108590 + }, + { + "epoch": 0.143, + "grad_norm": 0.0688789114356041, + "learning_rate": 2.3513168433902324e-05, + "loss": 0.0349, + "step": 108600 + }, + { + "epoch": 0.14305, + "grad_norm": 0.07087711989879608, + "learning_rate": 2.3509042096723552e-05, + "loss": 0.0352, + "step": 108610 + }, + { + "epoch": 0.1431, + "grad_norm": 0.07730893045663834, + "learning_rate": 2.3504915800307012e-05, + "loss": 0.0348, + "step": 108620 + }, + { + "epoch": 0.14315, + "grad_norm": 0.06419000029563904, + "learning_rate": 2.350078954476551e-05, + "loss": 0.0368, + "step": 108630 + }, + { + "epoch": 0.1432, + "grad_norm": 0.09525299072265625, + "learning_rate": 2.3496663330211848e-05, + "loss": 0.0405, + "step": 108640 + }, + { + "epoch": 0.14325, + "grad_norm": 0.07910957932472229, + "learning_rate": 2.349253715675886e-05, + "loss": 0.0345, + "step": 108650 + }, + { + "epoch": 0.1433, + "grad_norm": 0.07841334491968155, + "learning_rate": 2.3488411024519334e-05, + "loss": 0.0353, + "step": 108660 + }, + { + "epoch": 0.14335, + "grad_norm": 0.06821152567863464, + "learning_rate": 2.3484284933606085e-05, + "loss": 0.0353, + "step": 108670 + }, + { + "epoch": 0.1434, + "grad_norm": 0.07697838544845581, + "learning_rate": 2.3480158884131914e-05, + "loss": 0.0349, + "step": 108680 + }, + { + "epoch": 0.14345, + "grad_norm": 0.0828322321176529, + "learning_rate": 2.3476032876209632e-05, + "loss": 0.0363, + "step": 108690 + }, + { + "epoch": 0.1435, + "grad_norm": 0.07963103801012039, + "learning_rate": 2.3471906909952036e-05, + "loss": 0.0348, + "step": 108700 + }, + { + "epoch": 0.14355, + "grad_norm": 0.07140455394983292, + "learning_rate": 2.346778098547193e-05, + "loss": 0.0347, + "step": 108710 + }, + { + "epoch": 0.1436, + "grad_norm": 0.0864250510931015, + "learning_rate": 2.346365510288212e-05, + "loss": 0.0361, + "step": 108720 + }, + { + "epoch": 0.14365, + "grad_norm": 0.08561394363641739, + "learning_rate": 2.3459529262295394e-05, + "loss": 0.0343, + "step": 108730 + }, + { + "epoch": 0.1437, + "grad_norm": 0.09716397523880005, + "learning_rate": 2.3455403463824573e-05, + "loss": 0.0347, + "step": 108740 + }, + { + "epoch": 0.14375, + "grad_norm": 0.07353556901216507, + "learning_rate": 2.3451277707582427e-05, + "loss": 0.0362, + "step": 108750 + }, + { + "epoch": 0.1438, + "grad_norm": 0.06440164148807526, + "learning_rate": 2.344715199368178e-05, + "loss": 0.0341, + "step": 108760 + }, + { + "epoch": 0.14385, + "grad_norm": 0.07270939648151398, + "learning_rate": 2.34430263222354e-05, + "loss": 0.0351, + "step": 108770 + }, + { + "epoch": 0.1439, + "grad_norm": 0.06414466351270676, + "learning_rate": 2.3438900693356102e-05, + "loss": 0.035, + "step": 108780 + }, + { + "epoch": 0.14395, + "grad_norm": 0.06215673312544823, + "learning_rate": 2.3434775107156674e-05, + "loss": 0.0347, + "step": 108790 + }, + { + "epoch": 0.144, + "grad_norm": 0.059777650982141495, + "learning_rate": 2.3430649563749906e-05, + "loss": 0.0382, + "step": 108800 + }, + { + "epoch": 0.14405, + "grad_norm": 0.06394518911838531, + "learning_rate": 2.3426524063248593e-05, + "loss": 0.0384, + "step": 108810 + }, + { + "epoch": 0.1441, + "grad_norm": 0.053943321108818054, + "learning_rate": 2.3422398605765515e-05, + "loss": 0.0333, + "step": 108820 + }, + { + "epoch": 0.14415, + "grad_norm": 0.06608572602272034, + "learning_rate": 2.341827319141346e-05, + "loss": 0.0349, + "step": 108830 + }, + { + "epoch": 0.1442, + "grad_norm": 0.07171956449747086, + "learning_rate": 2.3414147820305238e-05, + "loss": 0.0348, + "step": 108840 + }, + { + "epoch": 0.14425, + "grad_norm": 0.06831841915845871, + "learning_rate": 2.3410022492553604e-05, + "loss": 0.0365, + "step": 108850 + }, + { + "epoch": 0.1443, + "grad_norm": 0.06574524939060211, + "learning_rate": 2.340589720827137e-05, + "loss": 0.0345, + "step": 108860 + }, + { + "epoch": 0.14435, + "grad_norm": 0.07829328626394272, + "learning_rate": 2.34017719675713e-05, + "loss": 0.0351, + "step": 108870 + }, + { + "epoch": 0.1444, + "grad_norm": 0.08291122317314148, + "learning_rate": 2.339764677056619e-05, + "loss": 0.0358, + "step": 108880 + }, + { + "epoch": 0.14445, + "grad_norm": 0.06285149604082108, + "learning_rate": 2.3393521617368806e-05, + "loss": 0.0333, + "step": 108890 + }, + { + "epoch": 0.1445, + "grad_norm": 0.0621388703584671, + "learning_rate": 2.3389396508091943e-05, + "loss": 0.0346, + "step": 108900 + }, + { + "epoch": 0.14455, + "grad_norm": 0.0737023651599884, + "learning_rate": 2.3385271442848376e-05, + "loss": 0.034, + "step": 108910 + }, + { + "epoch": 0.1446, + "grad_norm": 0.06401995569467545, + "learning_rate": 2.338114642175088e-05, + "loss": 0.0349, + "step": 108920 + }, + { + "epoch": 0.14465, + "grad_norm": 0.06761576980352402, + "learning_rate": 2.3377021444912235e-05, + "loss": 0.0342, + "step": 108930 + }, + { + "epoch": 0.1447, + "grad_norm": 0.07421578466892242, + "learning_rate": 2.3372896512445203e-05, + "loss": 0.0354, + "step": 108940 + }, + { + "epoch": 0.14475, + "grad_norm": 0.060112569481134415, + "learning_rate": 2.3368771624462585e-05, + "loss": 0.0337, + "step": 108950 + }, + { + "epoch": 0.1448, + "grad_norm": 0.06732338666915894, + "learning_rate": 2.3364646781077123e-05, + "loss": 0.0345, + "step": 108960 + }, + { + "epoch": 0.14485, + "grad_norm": 0.07164648175239563, + "learning_rate": 2.3360521982401608e-05, + "loss": 0.0342, + "step": 108970 + }, + { + "epoch": 0.1449, + "grad_norm": 0.0799083486199379, + "learning_rate": 2.3356397228548812e-05, + "loss": 0.0341, + "step": 108980 + }, + { + "epoch": 0.14495, + "grad_norm": 0.07315156608819962, + "learning_rate": 2.3352272519631494e-05, + "loss": 0.0347, + "step": 108990 + }, + { + "epoch": 0.145, + "grad_norm": 0.07438869029283524, + "learning_rate": 2.334814785576243e-05, + "loss": 0.035, + "step": 109000 + }, + { + "epoch": 0.14505, + "grad_norm": 0.0785011276602745, + "learning_rate": 2.334402323705438e-05, + "loss": 0.0342, + "step": 109010 + }, + { + "epoch": 0.1451, + "grad_norm": 0.06625154614448547, + "learning_rate": 2.3339898663620103e-05, + "loss": 0.0348, + "step": 109020 + }, + { + "epoch": 0.14515, + "grad_norm": 0.06950251013040543, + "learning_rate": 2.3335774135572394e-05, + "loss": 0.0357, + "step": 109030 + }, + { + "epoch": 0.1452, + "grad_norm": 0.0790690928697586, + "learning_rate": 2.3331649653023977e-05, + "loss": 0.0366, + "step": 109040 + }, + { + "epoch": 0.14525, + "grad_norm": 0.0788382738828659, + "learning_rate": 2.3327525216087644e-05, + "loss": 0.0369, + "step": 109050 + }, + { + "epoch": 0.1453, + "grad_norm": 0.07070305198431015, + "learning_rate": 2.332340082487613e-05, + "loss": 0.0351, + "step": 109060 + }, + { + "epoch": 0.14535, + "grad_norm": 0.06878872215747833, + "learning_rate": 2.331927647950222e-05, + "loss": 0.0356, + "step": 109070 + }, + { + "epoch": 0.1454, + "grad_norm": 0.06110698729753494, + "learning_rate": 2.3315152180078654e-05, + "loss": 0.035, + "step": 109080 + }, + { + "epoch": 0.14545, + "grad_norm": 0.09910532087087631, + "learning_rate": 2.3311027926718193e-05, + "loss": 0.0365, + "step": 109090 + }, + { + "epoch": 0.1455, + "grad_norm": 0.07843577861785889, + "learning_rate": 2.3306903719533598e-05, + "loss": 0.0379, + "step": 109100 + }, + { + "epoch": 0.14555, + "grad_norm": 0.06486725062131882, + "learning_rate": 2.3302779558637615e-05, + "loss": 0.0356, + "step": 109110 + }, + { + "epoch": 0.1456, + "grad_norm": 0.1011667549610138, + "learning_rate": 2.3298655444143008e-05, + "loss": 0.0364, + "step": 109120 + }, + { + "epoch": 0.14565, + "grad_norm": 0.0909094363451004, + "learning_rate": 2.329453137616251e-05, + "loss": 0.0394, + "step": 109130 + }, + { + "epoch": 0.1457, + "grad_norm": 0.07500338554382324, + "learning_rate": 2.32904073548089e-05, + "loss": 0.0349, + "step": 109140 + }, + { + "epoch": 0.14575, + "grad_norm": 0.07224991917610168, + "learning_rate": 2.3286283380194897e-05, + "loss": 0.0363, + "step": 109150 + }, + { + "epoch": 0.1458, + "grad_norm": 0.06772609055042267, + "learning_rate": 2.3282159452433267e-05, + "loss": 0.0343, + "step": 109160 + }, + { + "epoch": 0.14585, + "grad_norm": 0.06794916838407516, + "learning_rate": 2.3278035571636755e-05, + "loss": 0.0354, + "step": 109170 + }, + { + "epoch": 0.1459, + "grad_norm": 0.061266761273145676, + "learning_rate": 2.3273911737918096e-05, + "loss": 0.0359, + "step": 109180 + }, + { + "epoch": 0.14595, + "grad_norm": 0.07602766156196594, + "learning_rate": 2.3269787951390056e-05, + "loss": 0.0351, + "step": 109190 + }, + { + "epoch": 0.146, + "grad_norm": 0.07578182220458984, + "learning_rate": 2.326566421216535e-05, + "loss": 0.035, + "step": 109200 + }, + { + "epoch": 0.14605, + "grad_norm": 0.06298764050006866, + "learning_rate": 2.3261540520356727e-05, + "loss": 0.0342, + "step": 109210 + }, + { + "epoch": 0.1461, + "grad_norm": 0.057033587247133255, + "learning_rate": 2.3257416876076954e-05, + "loss": 0.0336, + "step": 109220 + }, + { + "epoch": 0.14615, + "grad_norm": 0.07515522837638855, + "learning_rate": 2.325329327943873e-05, + "loss": 0.0344, + "step": 109230 + }, + { + "epoch": 0.1462, + "grad_norm": 0.06341442465782166, + "learning_rate": 2.324916973055483e-05, + "loss": 0.0345, + "step": 109240 + }, + { + "epoch": 0.14625, + "grad_norm": 0.08179578930139542, + "learning_rate": 2.3245046229537954e-05, + "loss": 0.035, + "step": 109250 + }, + { + "epoch": 0.1463, + "grad_norm": 0.06408336013555527, + "learning_rate": 2.324092277650087e-05, + "loss": 0.0356, + "step": 109260 + }, + { + "epoch": 0.14635, + "grad_norm": 0.08868936449289322, + "learning_rate": 2.3236799371556282e-05, + "loss": 0.0353, + "step": 109270 + }, + { + "epoch": 0.1464, + "grad_norm": 0.07364244759082794, + "learning_rate": 2.323267601481694e-05, + "loss": 0.0358, + "step": 109280 + }, + { + "epoch": 0.14645, + "grad_norm": 0.07817887514829636, + "learning_rate": 2.322855270639558e-05, + "loss": 0.0361, + "step": 109290 + }, + { + "epoch": 0.1465, + "grad_norm": 0.06925657391548157, + "learning_rate": 2.3224429446404917e-05, + "loss": 0.0357, + "step": 109300 + }, + { + "epoch": 0.14655, + "grad_norm": 0.06357885152101517, + "learning_rate": 2.322030623495769e-05, + "loss": 0.0373, + "step": 109310 + }, + { + "epoch": 0.1466, + "grad_norm": 0.08716066926717758, + "learning_rate": 2.321618307216662e-05, + "loss": 0.0357, + "step": 109320 + }, + { + "epoch": 0.14665, + "grad_norm": 0.08204200118780136, + "learning_rate": 2.3212059958144434e-05, + "loss": 0.0357, + "step": 109330 + }, + { + "epoch": 0.1467, + "grad_norm": 0.07030654698610306, + "learning_rate": 2.320793689300385e-05, + "loss": 0.0356, + "step": 109340 + }, + { + "epoch": 0.14675, + "grad_norm": 0.07193808257579803, + "learning_rate": 2.3203813876857606e-05, + "loss": 0.0351, + "step": 109350 + }, + { + "epoch": 0.1468, + "grad_norm": 0.08544500917196274, + "learning_rate": 2.319969090981842e-05, + "loss": 0.037, + "step": 109360 + }, + { + "epoch": 0.14685, + "grad_norm": 0.09021718055009842, + "learning_rate": 2.3195567991999002e-05, + "loss": 0.0353, + "step": 109370 + }, + { + "epoch": 0.1469, + "grad_norm": 0.07073055952787399, + "learning_rate": 2.319144512351208e-05, + "loss": 0.0353, + "step": 109380 + }, + { + "epoch": 0.14695, + "grad_norm": 0.07055257260799408, + "learning_rate": 2.3187322304470365e-05, + "loss": 0.0358, + "step": 109390 + }, + { + "epoch": 0.147, + "grad_norm": 0.07493556290864944, + "learning_rate": 2.318319953498658e-05, + "loss": 0.036, + "step": 109400 + }, + { + "epoch": 0.14705, + "grad_norm": 0.0694839134812355, + "learning_rate": 2.317907681517344e-05, + "loss": 0.0357, + "step": 109410 + }, + { + "epoch": 0.1471, + "grad_norm": 0.07084786146879196, + "learning_rate": 2.3174954145143644e-05, + "loss": 0.0367, + "step": 109420 + }, + { + "epoch": 0.14715, + "grad_norm": 0.09428457170724869, + "learning_rate": 2.3170831525009933e-05, + "loss": 0.0361, + "step": 109430 + }, + { + "epoch": 0.1472, + "grad_norm": 0.06578890979290009, + "learning_rate": 2.3166708954884987e-05, + "loss": 0.0359, + "step": 109440 + }, + { + "epoch": 0.14725, + "grad_norm": 0.07545687258243561, + "learning_rate": 2.316258643488154e-05, + "loss": 0.0354, + "step": 109450 + }, + { + "epoch": 0.1473, + "grad_norm": 0.07363058626651764, + "learning_rate": 2.315846396511228e-05, + "loss": 0.0358, + "step": 109460 + }, + { + "epoch": 0.14735, + "grad_norm": 0.07134675234556198, + "learning_rate": 2.3154341545689926e-05, + "loss": 0.0332, + "step": 109470 + }, + { + "epoch": 0.1474, + "grad_norm": 0.07839695364236832, + "learning_rate": 2.3150219176727186e-05, + "loss": 0.0353, + "step": 109480 + }, + { + "epoch": 0.14745, + "grad_norm": 0.07358036935329437, + "learning_rate": 2.3146096858336752e-05, + "loss": 0.0334, + "step": 109490 + }, + { + "epoch": 0.1475, + "grad_norm": 0.0781262069940567, + "learning_rate": 2.314197459063134e-05, + "loss": 0.0344, + "step": 109500 + }, + { + "epoch": 0.14755, + "grad_norm": 0.07102091610431671, + "learning_rate": 2.313785237372364e-05, + "loss": 0.0327, + "step": 109510 + }, + { + "epoch": 0.1476, + "grad_norm": 0.08903536200523376, + "learning_rate": 2.313373020772636e-05, + "loss": 0.0351, + "step": 109520 + }, + { + "epoch": 0.14765, + "grad_norm": 0.08091005682945251, + "learning_rate": 2.3129608092752196e-05, + "loss": 0.0347, + "step": 109530 + }, + { + "epoch": 0.1477, + "grad_norm": 0.08431022614240646, + "learning_rate": 2.312548602891383e-05, + "loss": 0.0352, + "step": 109540 + }, + { + "epoch": 0.14775, + "grad_norm": 0.08247928321361542, + "learning_rate": 2.312136401632399e-05, + "loss": 0.035, + "step": 109550 + }, + { + "epoch": 0.1478, + "grad_norm": 0.08493194729089737, + "learning_rate": 2.3117242055095344e-05, + "loss": 0.0365, + "step": 109560 + }, + { + "epoch": 0.14785, + "grad_norm": 0.061399515718221664, + "learning_rate": 2.3113120145340593e-05, + "loss": 0.0339, + "step": 109570 + }, + { + "epoch": 0.1479, + "grad_norm": 0.06967747211456299, + "learning_rate": 2.310899828717243e-05, + "loss": 0.034, + "step": 109580 + }, + { + "epoch": 0.14795, + "grad_norm": 0.07984711974859238, + "learning_rate": 2.3104876480703545e-05, + "loss": 0.0347, + "step": 109590 + }, + { + "epoch": 0.148, + "grad_norm": 0.08287534862756729, + "learning_rate": 2.310075472604662e-05, + "loss": 0.0336, + "step": 109600 + }, + { + "epoch": 0.14805, + "grad_norm": 0.06081758812069893, + "learning_rate": 2.309663302331435e-05, + "loss": 0.033, + "step": 109610 + }, + { + "epoch": 0.1481, + "grad_norm": 0.07051486521959305, + "learning_rate": 2.3092511372619422e-05, + "loss": 0.0342, + "step": 109620 + }, + { + "epoch": 0.14815, + "grad_norm": 0.07883277535438538, + "learning_rate": 2.3088389774074504e-05, + "loss": 0.0336, + "step": 109630 + }, + { + "epoch": 0.1482, + "grad_norm": 0.06091853231191635, + "learning_rate": 2.3084268227792307e-05, + "loss": 0.035, + "step": 109640 + }, + { + "epoch": 0.14825, + "grad_norm": 0.07201207429170609, + "learning_rate": 2.3080146733885482e-05, + "loss": 0.0341, + "step": 109650 + }, + { + "epoch": 0.1483, + "grad_norm": 0.0676903948187828, + "learning_rate": 2.3076025292466733e-05, + "loss": 0.033, + "step": 109660 + }, + { + "epoch": 0.14835, + "grad_norm": 0.07095537334680557, + "learning_rate": 2.307190390364873e-05, + "loss": 0.0352, + "step": 109670 + }, + { + "epoch": 0.1484, + "grad_norm": 0.06302513927221298, + "learning_rate": 2.3067782567544147e-05, + "loss": 0.0364, + "step": 109680 + }, + { + "epoch": 0.14845, + "grad_norm": 0.06290469318628311, + "learning_rate": 2.3063661284265667e-05, + "loss": 0.0346, + "step": 109690 + }, + { + "epoch": 0.1485, + "grad_norm": 0.0665312334895134, + "learning_rate": 2.3059540053925957e-05, + "loss": 0.0348, + "step": 109700 + }, + { + "epoch": 0.14855, + "grad_norm": 0.11684829741716385, + "learning_rate": 2.3055418876637696e-05, + "loss": 0.0365, + "step": 109710 + }, + { + "epoch": 0.1486, + "grad_norm": 0.09129486232995987, + "learning_rate": 2.305129775251355e-05, + "loss": 0.035, + "step": 109720 + }, + { + "epoch": 0.14865, + "grad_norm": 0.08456933498382568, + "learning_rate": 2.304717668166618e-05, + "loss": 0.0337, + "step": 109730 + }, + { + "epoch": 0.1487, + "grad_norm": 0.07299253344535828, + "learning_rate": 2.304305566420829e-05, + "loss": 0.0341, + "step": 109740 + }, + { + "epoch": 0.14875, + "grad_norm": 0.05998293682932854, + "learning_rate": 2.30389347002525e-05, + "loss": 0.0338, + "step": 109750 + }, + { + "epoch": 0.1488, + "grad_norm": 0.06796760112047195, + "learning_rate": 2.303481378991151e-05, + "loss": 0.0343, + "step": 109760 + }, + { + "epoch": 0.14885, + "grad_norm": 0.10017074644565582, + "learning_rate": 2.3030692933297972e-05, + "loss": 0.0362, + "step": 109770 + }, + { + "epoch": 0.1489, + "grad_norm": 0.08416993916034698, + "learning_rate": 2.302657213052455e-05, + "loss": 0.0338, + "step": 109780 + }, + { + "epoch": 0.14895, + "grad_norm": 0.07039602100849152, + "learning_rate": 2.3022451381703903e-05, + "loss": 0.0335, + "step": 109790 + }, + { + "epoch": 0.149, + "grad_norm": 0.08664463460445404, + "learning_rate": 2.301833068694869e-05, + "loss": 0.0354, + "step": 109800 + }, + { + "epoch": 0.14905, + "grad_norm": 0.09198068827390671, + "learning_rate": 2.3014210046371576e-05, + "loss": 0.0357, + "step": 109810 + }, + { + "epoch": 0.1491, + "grad_norm": 0.06888342648744583, + "learning_rate": 2.301008946008521e-05, + "loss": 0.0338, + "step": 109820 + }, + { + "epoch": 0.14915, + "grad_norm": 0.0643400326371193, + "learning_rate": 2.3005968928202253e-05, + "loss": 0.035, + "step": 109830 + }, + { + "epoch": 0.1492, + "grad_norm": 0.06421161442995071, + "learning_rate": 2.3001848450835348e-05, + "loss": 0.0347, + "step": 109840 + }, + { + "epoch": 0.14925, + "grad_norm": 0.06878542900085449, + "learning_rate": 2.299772802809716e-05, + "loss": 0.0337, + "step": 109850 + }, + { + "epoch": 0.1493, + "grad_norm": 0.06569793075323105, + "learning_rate": 2.299360766010034e-05, + "loss": 0.0358, + "step": 109860 + }, + { + "epoch": 0.14935, + "grad_norm": 0.0654708594083786, + "learning_rate": 2.298948734695753e-05, + "loss": 0.0339, + "step": 109870 + }, + { + "epoch": 0.1494, + "grad_norm": 0.057217568159103394, + "learning_rate": 2.2985367088781387e-05, + "loss": 0.0342, + "step": 109880 + }, + { + "epoch": 0.14945, + "grad_norm": 0.07576543837785721, + "learning_rate": 2.2981246885684543e-05, + "loss": 0.0345, + "step": 109890 + }, + { + "epoch": 0.1495, + "grad_norm": 0.08619561791419983, + "learning_rate": 2.2977126737779658e-05, + "loss": 0.0375, + "step": 109900 + }, + { + "epoch": 0.14955, + "grad_norm": 0.09843472391366959, + "learning_rate": 2.297300664517936e-05, + "loss": 0.037, + "step": 109910 + }, + { + "epoch": 0.1496, + "grad_norm": 0.08414553105831146, + "learning_rate": 2.2968886607996297e-05, + "loss": 0.0354, + "step": 109920 + }, + { + "epoch": 0.14965, + "grad_norm": 0.0866297110915184, + "learning_rate": 2.2964766626343122e-05, + "loss": 0.0358, + "step": 109930 + }, + { + "epoch": 0.1497, + "grad_norm": 0.08381444960832596, + "learning_rate": 2.296064670033245e-05, + "loss": 0.035, + "step": 109940 + }, + { + "epoch": 0.14975, + "grad_norm": 0.07397166639566422, + "learning_rate": 2.295652683007695e-05, + "loss": 0.0338, + "step": 109950 + }, + { + "epoch": 0.1498, + "grad_norm": 0.07220173627138138, + "learning_rate": 2.295240701568922e-05, + "loss": 0.0349, + "step": 109960 + }, + { + "epoch": 0.14985, + "grad_norm": 0.09002058953046799, + "learning_rate": 2.294828725728192e-05, + "loss": 0.0363, + "step": 109970 + }, + { + "epoch": 0.1499, + "grad_norm": 0.06266094744205475, + "learning_rate": 2.2944167554967675e-05, + "loss": 0.0341, + "step": 109980 + }, + { + "epoch": 0.14995, + "grad_norm": 0.06673440337181091, + "learning_rate": 2.2940047908859114e-05, + "loss": 0.0334, + "step": 109990 + }, + { + "epoch": 0.15, + "grad_norm": 0.07416488975286484, + "learning_rate": 2.2935928319068876e-05, + "loss": 0.0364, + "step": 110000 + }, + { + "epoch": 0.15005, + "grad_norm": 0.06353218108415604, + "learning_rate": 2.2931808785709576e-05, + "loss": 0.0338, + "step": 110010 + }, + { + "epoch": 0.1501, + "grad_norm": 0.060413651168346405, + "learning_rate": 2.2927689308893855e-05, + "loss": 0.0335, + "step": 110020 + }, + { + "epoch": 0.15015, + "grad_norm": 0.07217101007699966, + "learning_rate": 2.292356988873432e-05, + "loss": 0.0339, + "step": 110030 + }, + { + "epoch": 0.1502, + "grad_norm": 0.06962756812572479, + "learning_rate": 2.2919450525343603e-05, + "loss": 0.0358, + "step": 110040 + }, + { + "epoch": 0.15025, + "grad_norm": 0.06547567248344421, + "learning_rate": 2.2915331218834335e-05, + "loss": 0.0334, + "step": 110050 + }, + { + "epoch": 0.1503, + "grad_norm": 0.06108114868402481, + "learning_rate": 2.2911211969319123e-05, + "loss": 0.0335, + "step": 110060 + }, + { + "epoch": 0.15035, + "grad_norm": 0.06455700099468231, + "learning_rate": 2.29070927769106e-05, + "loss": 0.0345, + "step": 110070 + }, + { + "epoch": 0.1504, + "grad_norm": 0.07985526323318481, + "learning_rate": 2.2902973641721363e-05, + "loss": 0.0338, + "step": 110080 + }, + { + "epoch": 0.15045, + "grad_norm": 0.06909330934286118, + "learning_rate": 2.289885456386405e-05, + "loss": 0.0333, + "step": 110090 + }, + { + "epoch": 0.1505, + "grad_norm": 0.07609910517930984, + "learning_rate": 2.2894735543451255e-05, + "loss": 0.0366, + "step": 110100 + }, + { + "epoch": 0.15055, + "grad_norm": 0.0777604803442955, + "learning_rate": 2.28906165805956e-05, + "loss": 0.0344, + "step": 110110 + }, + { + "epoch": 0.1506, + "grad_norm": 0.07123269885778427, + "learning_rate": 2.28864976754097e-05, + "loss": 0.036, + "step": 110120 + }, + { + "epoch": 0.15065, + "grad_norm": 0.09285328537225723, + "learning_rate": 2.288237882800615e-05, + "loss": 0.0346, + "step": 110130 + }, + { + "epoch": 0.1507, + "grad_norm": 0.07575418800115585, + "learning_rate": 2.2878260038497584e-05, + "loss": 0.0342, + "step": 110140 + }, + { + "epoch": 0.15075, + "grad_norm": 0.07250441610813141, + "learning_rate": 2.2874141306996576e-05, + "loss": 0.0355, + "step": 110150 + }, + { + "epoch": 0.1508, + "grad_norm": 0.06579921394586563, + "learning_rate": 2.287002263361576e-05, + "loss": 0.033, + "step": 110160 + }, + { + "epoch": 0.15085, + "grad_norm": 0.10029726475477219, + "learning_rate": 2.286590401846771e-05, + "loss": 0.0351, + "step": 110170 + }, + { + "epoch": 0.1509, + "grad_norm": 0.05899001285433769, + "learning_rate": 2.2861785461665046e-05, + "loss": 0.0337, + "step": 110180 + }, + { + "epoch": 0.15095, + "grad_norm": 0.0677531361579895, + "learning_rate": 2.285766696332037e-05, + "loss": 0.0336, + "step": 110190 + }, + { + "epoch": 0.151, + "grad_norm": 0.07853901386260986, + "learning_rate": 2.285354852354627e-05, + "loss": 0.0333, + "step": 110200 + }, + { + "epoch": 0.15105, + "grad_norm": 0.07537727802991867, + "learning_rate": 2.2849430142455353e-05, + "loss": 0.0347, + "step": 110210 + }, + { + "epoch": 0.1511, + "grad_norm": 0.09130075573921204, + "learning_rate": 2.28453118201602e-05, + "loss": 0.0366, + "step": 110220 + }, + { + "epoch": 0.15115, + "grad_norm": 0.07668808102607727, + "learning_rate": 2.2841193556773406e-05, + "loss": 0.0351, + "step": 110230 + }, + { + "epoch": 0.1512, + "grad_norm": 0.08927728235721588, + "learning_rate": 2.2837075352407587e-05, + "loss": 0.0372, + "step": 110240 + }, + { + "epoch": 0.15125, + "grad_norm": 0.09870357811450958, + "learning_rate": 2.2832957207175303e-05, + "loss": 0.0354, + "step": 110250 + }, + { + "epoch": 0.1513, + "grad_norm": 0.12028441578149796, + "learning_rate": 2.2828839121189162e-05, + "loss": 0.0345, + "step": 110260 + }, + { + "epoch": 0.15135, + "grad_norm": 0.11618024855852127, + "learning_rate": 2.282472109456174e-05, + "loss": 0.0358, + "step": 110270 + }, + { + "epoch": 0.1514, + "grad_norm": 0.10039499402046204, + "learning_rate": 2.282060312740563e-05, + "loss": 0.0356, + "step": 110280 + }, + { + "epoch": 0.15145, + "grad_norm": 0.06601977348327637, + "learning_rate": 2.2816485219833405e-05, + "loss": 0.0334, + "step": 110290 + }, + { + "epoch": 0.1515, + "grad_norm": 0.07388697564601898, + "learning_rate": 2.2812367371957658e-05, + "loss": 0.0383, + "step": 110300 + }, + { + "epoch": 0.15155, + "grad_norm": 0.06000300496816635, + "learning_rate": 2.2808249583890968e-05, + "loss": 0.0347, + "step": 110310 + }, + { + "epoch": 0.1516, + "grad_norm": 0.06344135105609894, + "learning_rate": 2.2804131855745906e-05, + "loss": 0.0346, + "step": 110320 + }, + { + "epoch": 0.15165, + "grad_norm": 0.0786777138710022, + "learning_rate": 2.2800014187635057e-05, + "loss": 0.0352, + "step": 110330 + }, + { + "epoch": 0.1517, + "grad_norm": 0.07005493342876434, + "learning_rate": 2.2795896579670987e-05, + "loss": 0.0352, + "step": 110340 + }, + { + "epoch": 0.15175, + "grad_norm": 0.08562647551298141, + "learning_rate": 2.279177903196629e-05, + "loss": 0.0342, + "step": 110350 + }, + { + "epoch": 0.1518, + "grad_norm": 0.07134796679019928, + "learning_rate": 2.278766154463351e-05, + "loss": 0.0372, + "step": 110360 + }, + { + "epoch": 0.15185, + "grad_norm": 0.0698230117559433, + "learning_rate": 2.2783544117785237e-05, + "loss": 0.0373, + "step": 110370 + }, + { + "epoch": 0.1519, + "grad_norm": 0.06944792717695236, + "learning_rate": 2.277942675153404e-05, + "loss": 0.034, + "step": 110380 + }, + { + "epoch": 0.15195, + "grad_norm": 0.07428590208292007, + "learning_rate": 2.2775309445992476e-05, + "loss": 0.0367, + "step": 110390 + }, + { + "epoch": 0.152, + "grad_norm": 0.0738653689622879, + "learning_rate": 2.2771192201273122e-05, + "loss": 0.0352, + "step": 110400 + }, + { + "epoch": 0.15205, + "grad_norm": 0.06859836727380753, + "learning_rate": 2.276707501748853e-05, + "loss": 0.0338, + "step": 110410 + }, + { + "epoch": 0.1521, + "grad_norm": 0.070834681391716, + "learning_rate": 2.2762957894751264e-05, + "loss": 0.0341, + "step": 110420 + }, + { + "epoch": 0.15215, + "grad_norm": 0.07761197537183762, + "learning_rate": 2.27588408331739e-05, + "loss": 0.0348, + "step": 110430 + }, + { + "epoch": 0.1522, + "grad_norm": 0.05937939137220383, + "learning_rate": 2.2754723832868978e-05, + "loss": 0.0349, + "step": 110440 + }, + { + "epoch": 0.15225, + "grad_norm": 0.0805535688996315, + "learning_rate": 2.2750606893949074e-05, + "loss": 0.0358, + "step": 110450 + }, + { + "epoch": 0.1523, + "grad_norm": 0.07439429312944412, + "learning_rate": 2.2746490016526713e-05, + "loss": 0.0354, + "step": 110460 + }, + { + "epoch": 0.15235, + "grad_norm": 0.057942844927310944, + "learning_rate": 2.2742373200714483e-05, + "loss": 0.0338, + "step": 110470 + }, + { + "epoch": 0.1524, + "grad_norm": 0.0685528889298439, + "learning_rate": 2.2738256446624917e-05, + "loss": 0.0328, + "step": 110480 + }, + { + "epoch": 0.15245, + "grad_norm": 0.06431593745946884, + "learning_rate": 2.273413975437057e-05, + "loss": 0.0345, + "step": 110490 + }, + { + "epoch": 0.1525, + "grad_norm": 0.07077065110206604, + "learning_rate": 2.2730023124063995e-05, + "loss": 0.0345, + "step": 110500 + }, + { + "epoch": 0.15255, + "grad_norm": 0.07308831065893173, + "learning_rate": 2.272590655581773e-05, + "loss": 0.0349, + "step": 110510 + }, + { + "epoch": 0.1526, + "grad_norm": 0.05925657972693443, + "learning_rate": 2.2721790049744333e-05, + "loss": 0.0352, + "step": 110520 + }, + { + "epoch": 0.15265, + "grad_norm": 0.0709601491689682, + "learning_rate": 2.271767360595633e-05, + "loss": 0.0352, + "step": 110530 + }, + { + "epoch": 0.1527, + "grad_norm": 0.08541488647460938, + "learning_rate": 2.271355722456628e-05, + "loss": 0.0358, + "step": 110540 + }, + { + "epoch": 0.15275, + "grad_norm": 0.08020167052745819, + "learning_rate": 2.270944090568671e-05, + "loss": 0.0358, + "step": 110550 + }, + { + "epoch": 0.1528, + "grad_norm": 0.0742366760969162, + "learning_rate": 2.2705324649430166e-05, + "loss": 0.0362, + "step": 110560 + }, + { + "epoch": 0.15285, + "grad_norm": 0.07816912233829498, + "learning_rate": 2.270120845590919e-05, + "loss": 0.0357, + "step": 110570 + }, + { + "epoch": 0.1529, + "grad_norm": 0.0812477096915245, + "learning_rate": 2.2697092325236306e-05, + "loss": 0.0359, + "step": 110580 + }, + { + "epoch": 0.15295, + "grad_norm": 0.0714782103896141, + "learning_rate": 2.269297625752406e-05, + "loss": 0.0353, + "step": 110590 + }, + { + "epoch": 0.153, + "grad_norm": 0.07697834074497223, + "learning_rate": 2.268886025288497e-05, + "loss": 0.0356, + "step": 110600 + }, + { + "epoch": 0.15305, + "grad_norm": 0.0860663428902626, + "learning_rate": 2.268474431143158e-05, + "loss": 0.035, + "step": 110610 + }, + { + "epoch": 0.1531, + "grad_norm": 0.07616297155618668, + "learning_rate": 2.2680628433276398e-05, + "loss": 0.0356, + "step": 110620 + }, + { + "epoch": 0.15315, + "grad_norm": 0.07121719419956207, + "learning_rate": 2.2676512618531964e-05, + "loss": 0.034, + "step": 110630 + }, + { + "epoch": 0.1532, + "grad_norm": 0.08533298224210739, + "learning_rate": 2.2672396867310817e-05, + "loss": 0.0364, + "step": 110640 + }, + { + "epoch": 0.15325, + "grad_norm": 0.07880530506372452, + "learning_rate": 2.266828117972545e-05, + "loss": 0.0345, + "step": 110650 + }, + { + "epoch": 0.1533, + "grad_norm": 0.07123337686061859, + "learning_rate": 2.2664165555888414e-05, + "loss": 0.0347, + "step": 110660 + }, + { + "epoch": 0.15335, + "grad_norm": 0.0665459856390953, + "learning_rate": 2.26600499959122e-05, + "loss": 0.0341, + "step": 110670 + }, + { + "epoch": 0.1534, + "grad_norm": 0.06696517020463943, + "learning_rate": 2.2655934499909342e-05, + "loss": 0.0344, + "step": 110680 + }, + { + "epoch": 0.15345, + "grad_norm": 0.06726125627756119, + "learning_rate": 2.2651819067992362e-05, + "loss": 0.0342, + "step": 110690 + }, + { + "epoch": 0.1535, + "grad_norm": 0.059936072677373886, + "learning_rate": 2.264770370027376e-05, + "loss": 0.0326, + "step": 110700 + }, + { + "epoch": 0.15355, + "grad_norm": 0.0649225041270256, + "learning_rate": 2.264358839686606e-05, + "loss": 0.0332, + "step": 110710 + }, + { + "epoch": 0.1536, + "grad_norm": 0.06835932284593582, + "learning_rate": 2.2639473157881766e-05, + "loss": 0.0339, + "step": 110720 + }, + { + "epoch": 0.15365, + "grad_norm": 0.07362768799066544, + "learning_rate": 2.2635357983433393e-05, + "loss": 0.0334, + "step": 110730 + }, + { + "epoch": 0.1537, + "grad_norm": 0.07793676108121872, + "learning_rate": 2.2631242873633437e-05, + "loss": 0.034, + "step": 110740 + }, + { + "epoch": 0.15375, + "grad_norm": 0.08216965198516846, + "learning_rate": 2.2627127828594408e-05, + "loss": 0.0339, + "step": 110750 + }, + { + "epoch": 0.1538, + "grad_norm": 0.08607140928506851, + "learning_rate": 2.262301284842882e-05, + "loss": 0.0349, + "step": 110760 + }, + { + "epoch": 0.15385, + "grad_norm": 0.06718862056732178, + "learning_rate": 2.2618897933249168e-05, + "loss": 0.0344, + "step": 110770 + }, + { + "epoch": 0.1539, + "grad_norm": 0.05782028287649155, + "learning_rate": 2.2614783083167952e-05, + "loss": 0.0334, + "step": 110780 + }, + { + "epoch": 0.15395, + "grad_norm": 0.07843133807182312, + "learning_rate": 2.2610668298297665e-05, + "loss": 0.034, + "step": 110790 + }, + { + "epoch": 0.154, + "grad_norm": 0.07167660444974899, + "learning_rate": 2.260655357875082e-05, + "loss": 0.035, + "step": 110800 + }, + { + "epoch": 0.15405, + "grad_norm": 0.07105650752782822, + "learning_rate": 2.260243892463989e-05, + "loss": 0.0341, + "step": 110810 + }, + { + "epoch": 0.1541, + "grad_norm": 0.06884513050317764, + "learning_rate": 2.259832433607738e-05, + "loss": 0.035, + "step": 110820 + }, + { + "epoch": 0.15415, + "grad_norm": 0.06763109564781189, + "learning_rate": 2.259420981317579e-05, + "loss": 0.0343, + "step": 110830 + }, + { + "epoch": 0.1542, + "grad_norm": 0.06715264171361923, + "learning_rate": 2.2590095356047584e-05, + "loss": 0.0339, + "step": 110840 + }, + { + "epoch": 0.15425, + "grad_norm": 0.08633338660001755, + "learning_rate": 2.2585980964805285e-05, + "loss": 0.0331, + "step": 110850 + }, + { + "epoch": 0.1543, + "grad_norm": 0.0647883340716362, + "learning_rate": 2.2581866639561343e-05, + "loss": 0.033, + "step": 110860 + }, + { + "epoch": 0.15435, + "grad_norm": 0.07531040161848068, + "learning_rate": 2.2577752380428265e-05, + "loss": 0.0333, + "step": 110870 + }, + { + "epoch": 0.1544, + "grad_norm": 0.06588154286146164, + "learning_rate": 2.257363818751853e-05, + "loss": 0.0347, + "step": 110880 + }, + { + "epoch": 0.15445, + "grad_norm": 0.07107117772102356, + "learning_rate": 2.2569524060944607e-05, + "loss": 0.0351, + "step": 110890 + }, + { + "epoch": 0.1545, + "grad_norm": 0.07523241639137268, + "learning_rate": 2.2565410000818993e-05, + "loss": 0.034, + "step": 110900 + }, + { + "epoch": 0.15455, + "grad_norm": 0.08571995049715042, + "learning_rate": 2.256129600725415e-05, + "loss": 0.0344, + "step": 110910 + }, + { + "epoch": 0.1546, + "grad_norm": 0.07107695937156677, + "learning_rate": 2.255718208036256e-05, + "loss": 0.0341, + "step": 110920 + }, + { + "epoch": 0.15465, + "grad_norm": 0.07713331282138824, + "learning_rate": 2.2553068220256693e-05, + "loss": 0.0342, + "step": 110930 + }, + { + "epoch": 0.1547, + "grad_norm": 0.081510029733181, + "learning_rate": 2.2548954427049012e-05, + "loss": 0.0328, + "step": 110940 + }, + { + "epoch": 0.15475, + "grad_norm": 0.0826130211353302, + "learning_rate": 2.2544840700852016e-05, + "loss": 0.0345, + "step": 110950 + }, + { + "epoch": 0.1548, + "grad_norm": 0.07099715620279312, + "learning_rate": 2.2540727041778135e-05, + "loss": 0.0325, + "step": 110960 + }, + { + "epoch": 0.15485, + "grad_norm": 0.09759236872196198, + "learning_rate": 2.2536613449939866e-05, + "loss": 0.0359, + "step": 110970 + }, + { + "epoch": 0.1549, + "grad_norm": 0.07107224315404892, + "learning_rate": 2.253249992544965e-05, + "loss": 0.0331, + "step": 110980 + }, + { + "epoch": 0.15495, + "grad_norm": 0.06958385556936264, + "learning_rate": 2.2528386468419965e-05, + "loss": 0.0342, + "step": 110990 + }, + { + "epoch": 0.155, + "grad_norm": 0.07820618152618408, + "learning_rate": 2.252427307896326e-05, + "loss": 0.0335, + "step": 111000 + }, + { + "epoch": 0.15505, + "grad_norm": 0.06739425659179688, + "learning_rate": 2.2520159757192004e-05, + "loss": 0.0373, + "step": 111010 + }, + { + "epoch": 0.1551, + "grad_norm": 0.07322181016206741, + "learning_rate": 2.251604650321865e-05, + "loss": 0.035, + "step": 111020 + }, + { + "epoch": 0.15515, + "grad_norm": 0.06647425889968872, + "learning_rate": 2.2511933317155645e-05, + "loss": 0.0333, + "step": 111030 + }, + { + "epoch": 0.1552, + "grad_norm": 0.06537932902574539, + "learning_rate": 2.2507820199115457e-05, + "loss": 0.0349, + "step": 111040 + }, + { + "epoch": 0.15525, + "grad_norm": 0.06186918169260025, + "learning_rate": 2.2503707149210514e-05, + "loss": 0.0331, + "step": 111050 + }, + { + "epoch": 0.1553, + "grad_norm": 0.06334685534238815, + "learning_rate": 2.2499594167553286e-05, + "loss": 0.0348, + "step": 111060 + }, + { + "epoch": 0.15535, + "grad_norm": 0.07704874128103256, + "learning_rate": 2.249548125425622e-05, + "loss": 0.0355, + "step": 111070 + }, + { + "epoch": 0.1554, + "grad_norm": 0.07260606437921524, + "learning_rate": 2.249136840943175e-05, + "loss": 0.0344, + "step": 111080 + }, + { + "epoch": 0.15545, + "grad_norm": 0.08095771074295044, + "learning_rate": 2.2487255633192335e-05, + "loss": 0.0341, + "step": 111090 + }, + { + "epoch": 0.1555, + "grad_norm": 0.07667740434408188, + "learning_rate": 2.2483142925650398e-05, + "loss": 0.0333, + "step": 111100 + }, + { + "epoch": 0.15555, + "grad_norm": 0.08815068751573563, + "learning_rate": 2.2479030286918392e-05, + "loss": 0.0351, + "step": 111110 + }, + { + "epoch": 0.1556, + "grad_norm": 0.07412435114383698, + "learning_rate": 2.2474917717108746e-05, + "loss": 0.0354, + "step": 111120 + }, + { + "epoch": 0.15565, + "grad_norm": 0.06973432749509811, + "learning_rate": 2.2470805216333894e-05, + "loss": 0.0341, + "step": 111130 + }, + { + "epoch": 0.1557, + "grad_norm": 0.08697531372308731, + "learning_rate": 2.24666927847063e-05, + "loss": 0.0384, + "step": 111140 + }, + { + "epoch": 0.15575, + "grad_norm": 0.08577197045087814, + "learning_rate": 2.2462580422338352e-05, + "loss": 0.034, + "step": 111150 + }, + { + "epoch": 0.1558, + "grad_norm": 0.1036408320069313, + "learning_rate": 2.2458468129342518e-05, + "loss": 0.0351, + "step": 111160 + }, + { + "epoch": 0.15585, + "grad_norm": 0.08290373533964157, + "learning_rate": 2.2454355905831196e-05, + "loss": 0.0346, + "step": 111170 + }, + { + "epoch": 0.1559, + "grad_norm": 0.06540827453136444, + "learning_rate": 2.2450243751916838e-05, + "loss": 0.0333, + "step": 111180 + }, + { + "epoch": 0.15595, + "grad_norm": 0.07136257737874985, + "learning_rate": 2.244613166771185e-05, + "loss": 0.0353, + "step": 111190 + }, + { + "epoch": 0.156, + "grad_norm": 0.07136812061071396, + "learning_rate": 2.2442019653328667e-05, + "loss": 0.0351, + "step": 111200 + }, + { + "epoch": 0.15605, + "grad_norm": 0.07507219165563583, + "learning_rate": 2.243790770887971e-05, + "loss": 0.037, + "step": 111210 + }, + { + "epoch": 0.1561, + "grad_norm": 0.0679161325097084, + "learning_rate": 2.2433795834477386e-05, + "loss": 0.0344, + "step": 111220 + }, + { + "epoch": 0.15615, + "grad_norm": 0.062479760497808456, + "learning_rate": 2.2429684030234125e-05, + "loss": 0.0349, + "step": 111230 + }, + { + "epoch": 0.1562, + "grad_norm": 0.07537438720464706, + "learning_rate": 2.2425572296262334e-05, + "loss": 0.035, + "step": 111240 + }, + { + "epoch": 0.15625, + "grad_norm": 0.07191035896539688, + "learning_rate": 2.2421460632674424e-05, + "loss": 0.0349, + "step": 111250 + }, + { + "epoch": 0.1563, + "grad_norm": 0.0660267248749733, + "learning_rate": 2.2417349039582822e-05, + "loss": 0.0347, + "step": 111260 + }, + { + "epoch": 0.15635, + "grad_norm": 0.08792657405138016, + "learning_rate": 2.2413237517099918e-05, + "loss": 0.0344, + "step": 111270 + }, + { + "epoch": 0.1564, + "grad_norm": 0.06152603402733803, + "learning_rate": 2.2409126065338136e-05, + "loss": 0.0336, + "step": 111280 + }, + { + "epoch": 0.15645, + "grad_norm": 0.06601186096668243, + "learning_rate": 2.2405014684409873e-05, + "loss": 0.0346, + "step": 111290 + }, + { + "epoch": 0.1565, + "grad_norm": 0.08689270168542862, + "learning_rate": 2.2400903374427536e-05, + "loss": 0.0351, + "step": 111300 + }, + { + "epoch": 0.15655, + "grad_norm": 0.07292336970567703, + "learning_rate": 2.2396792135503517e-05, + "loss": 0.0333, + "step": 111310 + }, + { + "epoch": 0.1566, + "grad_norm": 0.07466064393520355, + "learning_rate": 2.2392680967750225e-05, + "loss": 0.0333, + "step": 111320 + }, + { + "epoch": 0.15665, + "grad_norm": 0.0883336216211319, + "learning_rate": 2.238856987128006e-05, + "loss": 0.0347, + "step": 111330 + }, + { + "epoch": 0.1567, + "grad_norm": 0.06872241199016571, + "learning_rate": 2.2384458846205404e-05, + "loss": 0.0356, + "step": 111340 + }, + { + "epoch": 0.15675, + "grad_norm": 0.08283447474241257, + "learning_rate": 2.2380347892638677e-05, + "loss": 0.0355, + "step": 111350 + }, + { + "epoch": 0.1568, + "grad_norm": 0.09631136804819107, + "learning_rate": 2.2376237010692235e-05, + "loss": 0.0342, + "step": 111360 + }, + { + "epoch": 0.15685, + "grad_norm": 0.0883079394698143, + "learning_rate": 2.237212620047851e-05, + "loss": 0.034, + "step": 111370 + }, + { + "epoch": 0.1569, + "grad_norm": 0.0943932831287384, + "learning_rate": 2.2368015462109844e-05, + "loss": 0.0343, + "step": 111380 + }, + { + "epoch": 0.15695, + "grad_norm": 0.07425844669342041, + "learning_rate": 2.2363904795698653e-05, + "loss": 0.0336, + "step": 111390 + }, + { + "epoch": 0.157, + "grad_norm": 0.08522672951221466, + "learning_rate": 2.2359794201357322e-05, + "loss": 0.0352, + "step": 111400 + }, + { + "epoch": 0.15705, + "grad_norm": 0.08357825130224228, + "learning_rate": 2.235568367919822e-05, + "loss": 0.0347, + "step": 111410 + }, + { + "epoch": 0.1571, + "grad_norm": 0.07961345463991165, + "learning_rate": 2.235157322933374e-05, + "loss": 0.0344, + "step": 111420 + }, + { + "epoch": 0.15715, + "grad_norm": 0.08275941759347916, + "learning_rate": 2.2347462851876246e-05, + "loss": 0.0341, + "step": 111430 + }, + { + "epoch": 0.1572, + "grad_norm": 0.08862300217151642, + "learning_rate": 2.2343352546938118e-05, + "loss": 0.0351, + "step": 111440 + }, + { + "epoch": 0.15725, + "grad_norm": 0.06781277805566788, + "learning_rate": 2.2339242314631746e-05, + "loss": 0.0339, + "step": 111450 + }, + { + "epoch": 0.1573, + "grad_norm": 0.07251814752817154, + "learning_rate": 2.2335132155069476e-05, + "loss": 0.0333, + "step": 111460 + }, + { + "epoch": 0.15735, + "grad_norm": 0.07582741230726242, + "learning_rate": 2.23310220683637e-05, + "loss": 0.0349, + "step": 111470 + }, + { + "epoch": 0.1574, + "grad_norm": 0.06999965757131577, + "learning_rate": 2.2326912054626772e-05, + "loss": 0.0335, + "step": 111480 + }, + { + "epoch": 0.15745, + "grad_norm": 0.07859835028648376, + "learning_rate": 2.2322802113971073e-05, + "loss": 0.0338, + "step": 111490 + }, + { + "epoch": 0.1575, + "grad_norm": 0.05699661374092102, + "learning_rate": 2.231869224650895e-05, + "loss": 0.033, + "step": 111500 + }, + { + "epoch": 0.15755, + "grad_norm": 0.08140408247709274, + "learning_rate": 2.2314582452352774e-05, + "loss": 0.0333, + "step": 111510 + }, + { + "epoch": 0.1576, + "grad_norm": 0.07481715828180313, + "learning_rate": 2.2310472731614912e-05, + "loss": 0.035, + "step": 111520 + }, + { + "epoch": 0.15765, + "grad_norm": 0.07342035323381424, + "learning_rate": 2.230636308440771e-05, + "loss": 0.0335, + "step": 111530 + }, + { + "epoch": 0.1577, + "grad_norm": 0.08008212596178055, + "learning_rate": 2.2302253510843534e-05, + "loss": 0.0346, + "step": 111540 + }, + { + "epoch": 0.15775, + "grad_norm": 0.09975617378950119, + "learning_rate": 2.229814401103472e-05, + "loss": 0.0374, + "step": 111550 + }, + { + "epoch": 0.1578, + "grad_norm": 0.0929156094789505, + "learning_rate": 2.2294034585093653e-05, + "loss": 0.0363, + "step": 111560 + }, + { + "epoch": 0.15785, + "grad_norm": 0.07880815863609314, + "learning_rate": 2.228992523313265e-05, + "loss": 0.035, + "step": 111570 + }, + { + "epoch": 0.1579, + "grad_norm": 0.06929337233304977, + "learning_rate": 2.2285815955264077e-05, + "loss": 0.0346, + "step": 111580 + }, + { + "epoch": 0.15795, + "grad_norm": 0.08029261231422424, + "learning_rate": 2.228170675160028e-05, + "loss": 0.0356, + "step": 111590 + }, + { + "epoch": 0.158, + "grad_norm": 0.06501136720180511, + "learning_rate": 2.227759762225359e-05, + "loss": 0.0385, + "step": 111600 + }, + { + "epoch": 0.15805, + "grad_norm": 0.0692141130566597, + "learning_rate": 2.2273488567336366e-05, + "loss": 0.0355, + "step": 111610 + }, + { + "epoch": 0.1581, + "grad_norm": 0.0631387010216713, + "learning_rate": 2.226937958696094e-05, + "loss": 0.035, + "step": 111620 + }, + { + "epoch": 0.15815, + "grad_norm": 0.0704304650425911, + "learning_rate": 2.2265270681239637e-05, + "loss": 0.0339, + "step": 111630 + }, + { + "epoch": 0.1582, + "grad_norm": 0.078705333173275, + "learning_rate": 2.2261161850284828e-05, + "loss": 0.0346, + "step": 111640 + }, + { + "epoch": 0.15825, + "grad_norm": 0.061205726116895676, + "learning_rate": 2.2257053094208806e-05, + "loss": 0.0349, + "step": 111650 + }, + { + "epoch": 0.1583, + "grad_norm": 0.06317484378814697, + "learning_rate": 2.225294441312394e-05, + "loss": 0.0369, + "step": 111660 + }, + { + "epoch": 0.15835, + "grad_norm": 0.06498158723115921, + "learning_rate": 2.2248835807142525e-05, + "loss": 0.0346, + "step": 111670 + }, + { + "epoch": 0.1584, + "grad_norm": 0.07051656395196915, + "learning_rate": 2.2244727276376918e-05, + "loss": 0.0358, + "step": 111680 + }, + { + "epoch": 0.15845, + "grad_norm": 0.0564606674015522, + "learning_rate": 2.2240618820939424e-05, + "loss": 0.034, + "step": 111690 + }, + { + "epoch": 0.1585, + "grad_norm": 0.06933465600013733, + "learning_rate": 2.2236510440942378e-05, + "loss": 0.034, + "step": 111700 + }, + { + "epoch": 0.15855, + "grad_norm": 0.06537031382322311, + "learning_rate": 2.2232402136498102e-05, + "loss": 0.035, + "step": 111710 + }, + { + "epoch": 0.1586, + "grad_norm": 0.06910958886146545, + "learning_rate": 2.2228293907718907e-05, + "loss": 0.0348, + "step": 111720 + }, + { + "epoch": 0.15865, + "grad_norm": 0.0797191634774208, + "learning_rate": 2.2224185754717115e-05, + "loss": 0.0345, + "step": 111730 + }, + { + "epoch": 0.1587, + "grad_norm": 0.08333103358745575, + "learning_rate": 2.2220077677605044e-05, + "loss": 0.0346, + "step": 111740 + }, + { + "epoch": 0.15875, + "grad_norm": 0.0689079686999321, + "learning_rate": 2.2215969676495007e-05, + "loss": 0.0339, + "step": 111750 + }, + { + "epoch": 0.1588, + "grad_norm": 0.06846673786640167, + "learning_rate": 2.2211861751499303e-05, + "loss": 0.0346, + "step": 111760 + }, + { + "epoch": 0.15885, + "grad_norm": 0.0739859864115715, + "learning_rate": 2.2207753902730255e-05, + "loss": 0.0333, + "step": 111770 + }, + { + "epoch": 0.1589, + "grad_norm": 0.06391892582178116, + "learning_rate": 2.2203646130300174e-05, + "loss": 0.0336, + "step": 111780 + }, + { + "epoch": 0.15895, + "grad_norm": 0.060105256736278534, + "learning_rate": 2.2199538434321348e-05, + "loss": 0.0343, + "step": 111790 + }, + { + "epoch": 0.159, + "grad_norm": 0.06684567779302597, + "learning_rate": 2.21954308149061e-05, + "loss": 0.0328, + "step": 111800 + }, + { + "epoch": 0.15905, + "grad_norm": 0.049714989960193634, + "learning_rate": 2.219132327216671e-05, + "loss": 0.0338, + "step": 111810 + }, + { + "epoch": 0.1591, + "grad_norm": 0.06189383566379547, + "learning_rate": 2.2187215806215494e-05, + "loss": 0.0322, + "step": 111820 + }, + { + "epoch": 0.15915, + "grad_norm": 0.05540673807263374, + "learning_rate": 2.2183108417164736e-05, + "loss": 0.0318, + "step": 111830 + }, + { + "epoch": 0.1592, + "grad_norm": 0.06381874531507492, + "learning_rate": 2.2179001105126728e-05, + "loss": 0.0328, + "step": 111840 + }, + { + "epoch": 0.15925, + "grad_norm": 0.06503371894359589, + "learning_rate": 2.217489387021379e-05, + "loss": 0.033, + "step": 111850 + }, + { + "epoch": 0.1593, + "grad_norm": 0.09525395184755325, + "learning_rate": 2.2170786712538176e-05, + "loss": 0.0335, + "step": 111860 + }, + { + "epoch": 0.15935, + "grad_norm": 0.06650565564632416, + "learning_rate": 2.2166679632212203e-05, + "loss": 0.0341, + "step": 111870 + }, + { + "epoch": 0.1594, + "grad_norm": 0.06843181699514389, + "learning_rate": 2.216257262934813e-05, + "loss": 0.0337, + "step": 111880 + }, + { + "epoch": 0.15945, + "grad_norm": 0.07026353478431702, + "learning_rate": 2.215846570405826e-05, + "loss": 0.0341, + "step": 111890 + }, + { + "epoch": 0.1595, + "grad_norm": 0.0896812453866005, + "learning_rate": 2.2154358856454875e-05, + "loss": 0.0332, + "step": 111900 + }, + { + "epoch": 0.15955, + "grad_norm": 0.07158301770687103, + "learning_rate": 2.2150252086650246e-05, + "loss": 0.0377, + "step": 111910 + }, + { + "epoch": 0.1596, + "grad_norm": 0.07504840195178986, + "learning_rate": 2.214614539475666e-05, + "loss": 0.033, + "step": 111920 + }, + { + "epoch": 0.15965, + "grad_norm": 0.06362754851579666, + "learning_rate": 2.2142038780886382e-05, + "loss": 0.0347, + "step": 111930 + }, + { + "epoch": 0.1597, + "grad_norm": 0.059492312371730804, + "learning_rate": 2.2137932245151692e-05, + "loss": 0.0333, + "step": 111940 + }, + { + "epoch": 0.15975, + "grad_norm": 0.05978335440158844, + "learning_rate": 2.2133825787664855e-05, + "loss": 0.0329, + "step": 111950 + }, + { + "epoch": 0.1598, + "grad_norm": 0.07219391316175461, + "learning_rate": 2.212971940853814e-05, + "loss": 0.0345, + "step": 111960 + }, + { + "epoch": 0.15985, + "grad_norm": 0.06794709712266922, + "learning_rate": 2.212561310788383e-05, + "loss": 0.0346, + "step": 111970 + }, + { + "epoch": 0.1599, + "grad_norm": 0.06835256516933441, + "learning_rate": 2.212150688581417e-05, + "loss": 0.0338, + "step": 111980 + }, + { + "epoch": 0.15995, + "grad_norm": 0.07163048535585403, + "learning_rate": 2.2117400742441437e-05, + "loss": 0.0351, + "step": 111990 + }, + { + "epoch": 0.16, + "grad_norm": 0.06418144702911377, + "learning_rate": 2.2113294677877877e-05, + "loss": 0.0356, + "step": 112000 + }, + { + "epoch": 0.16005, + "grad_norm": 0.06732147932052612, + "learning_rate": 2.2109188692235767e-05, + "loss": 0.0353, + "step": 112010 + }, + { + "epoch": 0.1601, + "grad_norm": 0.07495000958442688, + "learning_rate": 2.210508278562734e-05, + "loss": 0.0352, + "step": 112020 + }, + { + "epoch": 0.16015, + "grad_norm": 0.07209627330303192, + "learning_rate": 2.2100976958164864e-05, + "loss": 0.0357, + "step": 112030 + }, + { + "epoch": 0.1602, + "grad_norm": 0.0804656371474266, + "learning_rate": 2.2096871209960598e-05, + "loss": 0.0347, + "step": 112040 + }, + { + "epoch": 0.16025, + "grad_norm": 0.10187839716672897, + "learning_rate": 2.209276554112677e-05, + "loss": 0.0365, + "step": 112050 + }, + { + "epoch": 0.1603, + "grad_norm": 0.0867825597524643, + "learning_rate": 2.2088659951775654e-05, + "loss": 0.0365, + "step": 112060 + }, + { + "epoch": 0.16035, + "grad_norm": 0.1051550805568695, + "learning_rate": 2.2084554442019467e-05, + "loss": 0.0368, + "step": 112070 + }, + { + "epoch": 0.1604, + "grad_norm": 0.0879400372505188, + "learning_rate": 2.208044901197047e-05, + "loss": 0.0378, + "step": 112080 + }, + { + "epoch": 0.16045, + "grad_norm": 0.11434191465377808, + "learning_rate": 2.2076343661740907e-05, + "loss": 0.0352, + "step": 112090 + }, + { + "epoch": 0.1605, + "grad_norm": 0.06594059616327286, + "learning_rate": 2.2072238391443004e-05, + "loss": 0.0351, + "step": 112100 + }, + { + "epoch": 0.16055, + "grad_norm": 0.0907931923866272, + "learning_rate": 2.206813320118901e-05, + "loss": 0.0352, + "step": 112110 + }, + { + "epoch": 0.1606, + "grad_norm": 0.06998275220394135, + "learning_rate": 2.2064028091091144e-05, + "loss": 0.033, + "step": 112120 + }, + { + "epoch": 0.16065, + "grad_norm": 0.07440738379955292, + "learning_rate": 2.2059923061261656e-05, + "loss": 0.0333, + "step": 112130 + }, + { + "epoch": 0.1607, + "grad_norm": 0.06170496344566345, + "learning_rate": 2.205581811181276e-05, + "loss": 0.0348, + "step": 112140 + }, + { + "epoch": 0.16075, + "grad_norm": 0.06256181746721268, + "learning_rate": 2.2051713242856682e-05, + "loss": 0.0356, + "step": 112150 + }, + { + "epoch": 0.1608, + "grad_norm": 0.07359161972999573, + "learning_rate": 2.204760845450568e-05, + "loss": 0.0348, + "step": 112160 + }, + { + "epoch": 0.16085, + "grad_norm": 0.06576690077781677, + "learning_rate": 2.2043503746871933e-05, + "loss": 0.0338, + "step": 112170 + }, + { + "epoch": 0.1609, + "grad_norm": 0.06027120724320412, + "learning_rate": 2.2039399120067694e-05, + "loss": 0.0344, + "step": 112180 + }, + { + "epoch": 0.16095, + "grad_norm": 0.06426198035478592, + "learning_rate": 2.2035294574205166e-05, + "loss": 0.0362, + "step": 112190 + }, + { + "epoch": 0.161, + "grad_norm": 0.0731712356209755, + "learning_rate": 2.2031190109396575e-05, + "loss": 0.0366, + "step": 112200 + }, + { + "epoch": 0.16105, + "grad_norm": 0.0613652728497982, + "learning_rate": 2.2027085725754126e-05, + "loss": 0.0342, + "step": 112210 + }, + { + "epoch": 0.1611, + "grad_norm": 0.061248842626810074, + "learning_rate": 2.202298142339004e-05, + "loss": 0.0336, + "step": 112220 + }, + { + "epoch": 0.16115, + "grad_norm": 0.06405367702245712, + "learning_rate": 2.201887720241652e-05, + "loss": 0.0366, + "step": 112230 + }, + { + "epoch": 0.1612, + "grad_norm": 0.062023404985666275, + "learning_rate": 2.2014773062945777e-05, + "loss": 0.0338, + "step": 112240 + }, + { + "epoch": 0.16125, + "grad_norm": 0.06777895987033844, + "learning_rate": 2.2010669005090025e-05, + "loss": 0.0346, + "step": 112250 + }, + { + "epoch": 0.1613, + "grad_norm": 0.06935904920101166, + "learning_rate": 2.2006565028961447e-05, + "loss": 0.036, + "step": 112260 + }, + { + "epoch": 0.16135, + "grad_norm": 0.08053556829690933, + "learning_rate": 2.200246113467226e-05, + "loss": 0.035, + "step": 112270 + }, + { + "epoch": 0.1614, + "grad_norm": 0.0714808776974678, + "learning_rate": 2.1998357322334666e-05, + "loss": 0.0351, + "step": 112280 + }, + { + "epoch": 0.16145, + "grad_norm": 0.07520432770252228, + "learning_rate": 2.199425359206085e-05, + "loss": 0.0362, + "step": 112290 + }, + { + "epoch": 0.1615, + "grad_norm": 0.06644560396671295, + "learning_rate": 2.199014994396302e-05, + "loss": 0.0346, + "step": 112300 + }, + { + "epoch": 0.16155, + "grad_norm": 0.07510702311992645, + "learning_rate": 2.1986046378153348e-05, + "loss": 0.035, + "step": 112310 + }, + { + "epoch": 0.1616, + "grad_norm": 0.05435965955257416, + "learning_rate": 2.1981942894744045e-05, + "loss": 0.0331, + "step": 112320 + }, + { + "epoch": 0.16165, + "grad_norm": 0.06645798683166504, + "learning_rate": 2.1977839493847284e-05, + "loss": 0.034, + "step": 112330 + }, + { + "epoch": 0.1617, + "grad_norm": 0.06540507078170776, + "learning_rate": 2.197373617557525e-05, + "loss": 0.0335, + "step": 112340 + }, + { + "epoch": 0.16175, + "grad_norm": 0.06752929836511612, + "learning_rate": 2.196963294004015e-05, + "loss": 0.0339, + "step": 112350 + }, + { + "epoch": 0.1618, + "grad_norm": 0.05909738317131996, + "learning_rate": 2.1965529787354127e-05, + "loss": 0.0327, + "step": 112360 + }, + { + "epoch": 0.16185, + "grad_norm": 0.06119058281183243, + "learning_rate": 2.19614267176294e-05, + "loss": 0.0332, + "step": 112370 + }, + { + "epoch": 0.1619, + "grad_norm": 0.06323596090078354, + "learning_rate": 2.1957323730978104e-05, + "loss": 0.0332, + "step": 112380 + }, + { + "epoch": 0.16195, + "grad_norm": 0.07488784939050674, + "learning_rate": 2.195322082751245e-05, + "loss": 0.0338, + "step": 112390 + }, + { + "epoch": 0.162, + "grad_norm": 0.06295579671859741, + "learning_rate": 2.1949118007344584e-05, + "loss": 0.0335, + "step": 112400 + }, + { + "epoch": 0.16205, + "grad_norm": 0.05650020018219948, + "learning_rate": 2.194501527058669e-05, + "loss": 0.0331, + "step": 112410 + }, + { + "epoch": 0.1621, + "grad_norm": 0.07044333964586258, + "learning_rate": 2.1940912617350932e-05, + "loss": 0.0334, + "step": 112420 + }, + { + "epoch": 0.16215, + "grad_norm": 0.06065429747104645, + "learning_rate": 2.193681004774947e-05, + "loss": 0.0338, + "step": 112430 + }, + { + "epoch": 0.1622, + "grad_norm": 0.06744283437728882, + "learning_rate": 2.1932707561894474e-05, + "loss": 0.0343, + "step": 112440 + }, + { + "epoch": 0.16225, + "grad_norm": 0.06047586351633072, + "learning_rate": 2.1928605159898098e-05, + "loss": 0.0337, + "step": 112450 + }, + { + "epoch": 0.1623, + "grad_norm": 0.07155334949493408, + "learning_rate": 2.19245028418725e-05, + "loss": 0.0359, + "step": 112460 + }, + { + "epoch": 0.16235, + "grad_norm": 0.08805252611637115, + "learning_rate": 2.1920400607929845e-05, + "loss": 0.0356, + "step": 112470 + }, + { + "epoch": 0.1624, + "grad_norm": 0.07390435039997101, + "learning_rate": 2.1916298458182276e-05, + "loss": 0.035, + "step": 112480 + }, + { + "epoch": 0.16245, + "grad_norm": 0.09888370335102081, + "learning_rate": 2.1912196392741956e-05, + "loss": 0.0361, + "step": 112490 + }, + { + "epoch": 0.1625, + "grad_norm": 0.06713777035474777, + "learning_rate": 2.190809441172102e-05, + "loss": 0.0349, + "step": 112500 + }, + { + "epoch": 0.16255, + "grad_norm": 0.07125592231750488, + "learning_rate": 2.1903992515231626e-05, + "loss": 0.0342, + "step": 112510 + }, + { + "epoch": 0.1626, + "grad_norm": 0.0708683654665947, + "learning_rate": 2.189989070338591e-05, + "loss": 0.0356, + "step": 112520 + }, + { + "epoch": 0.16265, + "grad_norm": 0.0808732882142067, + "learning_rate": 2.1895788976296018e-05, + "loss": 0.0346, + "step": 112530 + }, + { + "epoch": 0.1627, + "grad_norm": 0.08089447021484375, + "learning_rate": 2.189168733407409e-05, + "loss": 0.0346, + "step": 112540 + }, + { + "epoch": 0.16275, + "grad_norm": 0.07411898672580719, + "learning_rate": 2.1887585776832255e-05, + "loss": 0.0346, + "step": 112550 + }, + { + "epoch": 0.1628, + "grad_norm": 0.06882666796445847, + "learning_rate": 2.1883484304682672e-05, + "loss": 0.0356, + "step": 112560 + }, + { + "epoch": 0.16285, + "grad_norm": 0.0912306159734726, + "learning_rate": 2.187938291773744e-05, + "loss": 0.0365, + "step": 112570 + }, + { + "epoch": 0.1629, + "grad_norm": 0.06964357942342758, + "learning_rate": 2.1875281616108725e-05, + "loss": 0.0334, + "step": 112580 + }, + { + "epoch": 0.16295, + "grad_norm": 0.06867384910583496, + "learning_rate": 2.187118039990862e-05, + "loss": 0.037, + "step": 112590 + }, + { + "epoch": 0.163, + "grad_norm": 0.06882866472005844, + "learning_rate": 2.186707926924927e-05, + "loss": 0.0346, + "step": 112600 + }, + { + "epoch": 0.16305, + "grad_norm": 0.06393932551145554, + "learning_rate": 2.18629782242428e-05, + "loss": 0.034, + "step": 112610 + }, + { + "epoch": 0.1631, + "grad_norm": 0.07947162538766861, + "learning_rate": 2.1858877265001327e-05, + "loss": 0.0337, + "step": 112620 + }, + { + "epoch": 0.16315, + "grad_norm": 0.06633418053388596, + "learning_rate": 2.1854776391636973e-05, + "loss": 0.0347, + "step": 112630 + }, + { + "epoch": 0.1632, + "grad_norm": 0.06981424242258072, + "learning_rate": 2.1850675604261845e-05, + "loss": 0.036, + "step": 112640 + }, + { + "epoch": 0.16325, + "grad_norm": 0.06863073259592056, + "learning_rate": 2.1846574902988056e-05, + "loss": 0.0348, + "step": 112650 + }, + { + "epoch": 0.1633, + "grad_norm": 0.06586696952581406, + "learning_rate": 2.1842474287927744e-05, + "loss": 0.0347, + "step": 112660 + }, + { + "epoch": 0.16335, + "grad_norm": 0.08072131872177124, + "learning_rate": 2.1838373759192978e-05, + "loss": 0.0362, + "step": 112670 + }, + { + "epoch": 0.1634, + "grad_norm": 0.1065782681107521, + "learning_rate": 2.18342733168959e-05, + "loss": 0.0379, + "step": 112680 + }, + { + "epoch": 0.16345, + "grad_norm": 0.08189694583415985, + "learning_rate": 2.1830172961148594e-05, + "loss": 0.0363, + "step": 112690 + }, + { + "epoch": 0.1635, + "grad_norm": 0.07649283111095428, + "learning_rate": 2.1826072692063175e-05, + "loss": 0.0345, + "step": 112700 + }, + { + "epoch": 0.16355, + "grad_norm": 0.08806269615888596, + "learning_rate": 2.1821972509751728e-05, + "loss": 0.0339, + "step": 112710 + }, + { + "epoch": 0.1636, + "grad_norm": 0.10222853720188141, + "learning_rate": 2.181787241432636e-05, + "loss": 0.0358, + "step": 112720 + }, + { + "epoch": 0.16365, + "grad_norm": 0.07381557673215866, + "learning_rate": 2.181377240589917e-05, + "loss": 0.0348, + "step": 112730 + }, + { + "epoch": 0.1637, + "grad_norm": 0.08653289079666138, + "learning_rate": 2.180967248458224e-05, + "loss": 0.0346, + "step": 112740 + }, + { + "epoch": 0.16375, + "grad_norm": 0.06406396627426147, + "learning_rate": 2.180557265048767e-05, + "loss": 0.0342, + "step": 112750 + }, + { + "epoch": 0.1638, + "grad_norm": 0.07045432925224304, + "learning_rate": 2.1801472903727534e-05, + "loss": 0.0339, + "step": 112760 + }, + { + "epoch": 0.16385, + "grad_norm": 0.07248934358358383, + "learning_rate": 2.1797373244413947e-05, + "loss": 0.0348, + "step": 112770 + }, + { + "epoch": 0.1639, + "grad_norm": 0.07276508212089539, + "learning_rate": 2.179327367265895e-05, + "loss": 0.0369, + "step": 112780 + }, + { + "epoch": 0.16395, + "grad_norm": 0.06122051179409027, + "learning_rate": 2.1789174188574654e-05, + "loss": 0.0345, + "step": 112790 + }, + { + "epoch": 0.164, + "grad_norm": 0.07430427521467209, + "learning_rate": 2.1785074792273136e-05, + "loss": 0.0349, + "step": 112800 + }, + { + "epoch": 0.16405, + "grad_norm": 0.07259289920330048, + "learning_rate": 2.178097548386646e-05, + "loss": 0.0337, + "step": 112810 + }, + { + "epoch": 0.1641, + "grad_norm": 0.06403262168169022, + "learning_rate": 2.1776876263466707e-05, + "loss": 0.0349, + "step": 112820 + }, + { + "epoch": 0.16415, + "grad_norm": 0.07015439867973328, + "learning_rate": 2.1772777131185945e-05, + "loss": 0.0346, + "step": 112830 + }, + { + "epoch": 0.1642, + "grad_norm": 0.06901204586029053, + "learning_rate": 2.1768678087136235e-05, + "loss": 0.0338, + "step": 112840 + }, + { + "epoch": 0.16425, + "grad_norm": 0.07864746451377869, + "learning_rate": 2.1764579131429668e-05, + "loss": 0.0347, + "step": 112850 + }, + { + "epoch": 0.1643, + "grad_norm": 0.06460756063461304, + "learning_rate": 2.1760480264178278e-05, + "loss": 0.0333, + "step": 112860 + }, + { + "epoch": 0.16435, + "grad_norm": 0.06273296475410461, + "learning_rate": 2.1756381485494158e-05, + "loss": 0.0336, + "step": 112870 + }, + { + "epoch": 0.1644, + "grad_norm": 0.06931041181087494, + "learning_rate": 2.175228279548933e-05, + "loss": 0.0341, + "step": 112880 + }, + { + "epoch": 0.16445, + "grad_norm": 0.07009255886077881, + "learning_rate": 2.1748184194275882e-05, + "loss": 0.0343, + "step": 112890 + }, + { + "epoch": 0.1645, + "grad_norm": 0.08870544284582138, + "learning_rate": 2.1744085681965853e-05, + "loss": 0.0334, + "step": 112900 + }, + { + "epoch": 0.16455, + "grad_norm": 0.08789685368537903, + "learning_rate": 2.1739987258671295e-05, + "loss": 0.0347, + "step": 112910 + }, + { + "epoch": 0.1646, + "grad_norm": 0.0757160559296608, + "learning_rate": 2.173588892450427e-05, + "loss": 0.0338, + "step": 112920 + }, + { + "epoch": 0.16465, + "grad_norm": 0.05919770896434784, + "learning_rate": 2.1731790679576807e-05, + "loss": 0.0334, + "step": 112930 + }, + { + "epoch": 0.1647, + "grad_norm": 0.0561647042632103, + "learning_rate": 2.1727692524000968e-05, + "loss": 0.0348, + "step": 112940 + }, + { + "epoch": 0.16475, + "grad_norm": 0.06535935401916504, + "learning_rate": 2.172359445788878e-05, + "loss": 0.0335, + "step": 112950 + }, + { + "epoch": 0.1648, + "grad_norm": 0.06926766037940979, + "learning_rate": 2.1719496481352293e-05, + "loss": 0.0336, + "step": 112960 + }, + { + "epoch": 0.16485, + "grad_norm": 0.07666823267936707, + "learning_rate": 2.1715398594503525e-05, + "loss": 0.0333, + "step": 112970 + }, + { + "epoch": 0.1649, + "grad_norm": 0.07533027976751328, + "learning_rate": 2.1711300797454533e-05, + "loss": 0.0349, + "step": 112980 + }, + { + "epoch": 0.16495, + "grad_norm": 0.06927791237831116, + "learning_rate": 2.170720309031735e-05, + "loss": 0.0343, + "step": 112990 + }, + { + "epoch": 0.165, + "grad_norm": 0.06924304366111755, + "learning_rate": 2.1703105473203988e-05, + "loss": 0.0338, + "step": 113000 + }, + { + "epoch": 0.16505, + "grad_norm": 0.0706871822476387, + "learning_rate": 2.1699007946226495e-05, + "loss": 0.0362, + "step": 113010 + }, + { + "epoch": 0.1651, + "grad_norm": 0.08440165221691132, + "learning_rate": 2.1694910509496872e-05, + "loss": 0.0342, + "step": 113020 + }, + { + "epoch": 0.16515, + "grad_norm": 0.07027032971382141, + "learning_rate": 2.1690813163127166e-05, + "loss": 0.0348, + "step": 113030 + }, + { + "epoch": 0.1652, + "grad_norm": 0.07992295175790787, + "learning_rate": 2.1686715907229378e-05, + "loss": 0.0332, + "step": 113040 + }, + { + "epoch": 0.16525, + "grad_norm": 0.07327957451343536, + "learning_rate": 2.1682618741915522e-05, + "loss": 0.0338, + "step": 113050 + }, + { + "epoch": 0.1653, + "grad_norm": 0.07747048139572144, + "learning_rate": 2.1678521667297648e-05, + "loss": 0.0337, + "step": 113060 + }, + { + "epoch": 0.16535, + "grad_norm": 0.07693877816200256, + "learning_rate": 2.167442468348772e-05, + "loss": 0.0336, + "step": 113070 + }, + { + "epoch": 0.1654, + "grad_norm": 0.0580328106880188, + "learning_rate": 2.167032779059779e-05, + "loss": 0.0337, + "step": 113080 + }, + { + "epoch": 0.16545, + "grad_norm": 0.0614556185901165, + "learning_rate": 2.1666230988739833e-05, + "loss": 0.0328, + "step": 113090 + }, + { + "epoch": 0.1655, + "grad_norm": 0.06618588417768478, + "learning_rate": 2.1662134278025873e-05, + "loss": 0.0332, + "step": 113100 + }, + { + "epoch": 0.16555, + "grad_norm": 0.06605345755815506, + "learning_rate": 2.1658037658567913e-05, + "loss": 0.0326, + "step": 113110 + }, + { + "epoch": 0.1656, + "grad_norm": 0.11311843246221542, + "learning_rate": 2.165394113047794e-05, + "loss": 0.0351, + "step": 113120 + }, + { + "epoch": 0.16565, + "grad_norm": 0.06903208792209625, + "learning_rate": 2.1649844693867968e-05, + "loss": 0.0332, + "step": 113130 + }, + { + "epoch": 0.1657, + "grad_norm": 0.07519102841615677, + "learning_rate": 2.1645748348849977e-05, + "loss": 0.0329, + "step": 113140 + }, + { + "epoch": 0.16575, + "grad_norm": 0.07155120372772217, + "learning_rate": 2.164165209553597e-05, + "loss": 0.0331, + "step": 113150 + }, + { + "epoch": 0.1658, + "grad_norm": 0.0691385492682457, + "learning_rate": 2.163755593403793e-05, + "loss": 0.0339, + "step": 113160 + }, + { + "epoch": 0.16585, + "grad_norm": 0.08293487131595612, + "learning_rate": 2.1633459864467843e-05, + "loss": 0.0344, + "step": 113170 + }, + { + "epoch": 0.1659, + "grad_norm": 0.07103066146373749, + "learning_rate": 2.1629363886937705e-05, + "loss": 0.0353, + "step": 113180 + }, + { + "epoch": 0.16595, + "grad_norm": 0.07060660421848297, + "learning_rate": 2.162526800155949e-05, + "loss": 0.0347, + "step": 113190 + }, + { + "epoch": 0.166, + "grad_norm": 0.06417767703533173, + "learning_rate": 2.162117220844519e-05, + "loss": 0.0338, + "step": 113200 + }, + { + "epoch": 0.16605, + "grad_norm": 0.0670657530426979, + "learning_rate": 2.1617076507706756e-05, + "loss": 0.0343, + "step": 113210 + }, + { + "epoch": 0.1661, + "grad_norm": 0.06350884586572647, + "learning_rate": 2.1612980899456192e-05, + "loss": 0.035, + "step": 113220 + }, + { + "epoch": 0.16615, + "grad_norm": 0.06385102868080139, + "learning_rate": 2.1608885383805453e-05, + "loss": 0.0354, + "step": 113230 + }, + { + "epoch": 0.1662, + "grad_norm": 0.06831283122301102, + "learning_rate": 2.160478996086651e-05, + "loss": 0.0354, + "step": 113240 + }, + { + "epoch": 0.16625, + "grad_norm": 0.06666000932455063, + "learning_rate": 2.1600694630751343e-05, + "loss": 0.0345, + "step": 113250 + }, + { + "epoch": 0.1663, + "grad_norm": 0.06278067827224731, + "learning_rate": 2.1596599393571894e-05, + "loss": 0.0339, + "step": 113260 + }, + { + "epoch": 0.16635, + "grad_norm": 0.05852845311164856, + "learning_rate": 2.1592504249440156e-05, + "loss": 0.0377, + "step": 113270 + }, + { + "epoch": 0.1664, + "grad_norm": 0.05839408561587334, + "learning_rate": 2.1588409198468056e-05, + "loss": 0.0342, + "step": 113280 + }, + { + "epoch": 0.16645, + "grad_norm": 0.06353548914194107, + "learning_rate": 2.158431424076757e-05, + "loss": 0.0339, + "step": 113290 + }, + { + "epoch": 0.1665, + "grad_norm": 0.0663408488035202, + "learning_rate": 2.1580219376450657e-05, + "loss": 0.0339, + "step": 113300 + }, + { + "epoch": 0.16655, + "grad_norm": 0.07321648299694061, + "learning_rate": 2.1576124605629256e-05, + "loss": 0.0346, + "step": 113310 + }, + { + "epoch": 0.1666, + "grad_norm": 0.07605119794607162, + "learning_rate": 2.1572029928415326e-05, + "loss": 0.035, + "step": 113320 + }, + { + "epoch": 0.16665, + "grad_norm": 0.060462381690740585, + "learning_rate": 2.1567935344920805e-05, + "loss": 0.0342, + "step": 113330 + }, + { + "epoch": 0.1667, + "grad_norm": 0.0616709403693676, + "learning_rate": 2.156384085525765e-05, + "loss": 0.0334, + "step": 113340 + }, + { + "epoch": 0.16675, + "grad_norm": 0.06495338678359985, + "learning_rate": 2.155974645953779e-05, + "loss": 0.0331, + "step": 113350 + }, + { + "epoch": 0.1668, + "grad_norm": 0.06545952707529068, + "learning_rate": 2.155565215787316e-05, + "loss": 0.0332, + "step": 113360 + }, + { + "epoch": 0.16685, + "grad_norm": 0.07933729887008667, + "learning_rate": 2.1551557950375725e-05, + "loss": 0.0334, + "step": 113370 + }, + { + "epoch": 0.1669, + "grad_norm": 0.07859945297241211, + "learning_rate": 2.1547463837157382e-05, + "loss": 0.0336, + "step": 113380 + }, + { + "epoch": 0.16695, + "grad_norm": 0.05592895671725273, + "learning_rate": 2.1543369818330094e-05, + "loss": 0.0331, + "step": 113390 + }, + { + "epoch": 0.167, + "grad_norm": 0.07539980113506317, + "learning_rate": 2.153927589400577e-05, + "loss": 0.0339, + "step": 113400 + }, + { + "epoch": 0.16705, + "grad_norm": 0.05995357781648636, + "learning_rate": 2.1535182064296347e-05, + "loss": 0.0325, + "step": 113410 + }, + { + "epoch": 0.1671, + "grad_norm": 0.06396167725324631, + "learning_rate": 2.1531088329313743e-05, + "loss": 0.033, + "step": 113420 + }, + { + "epoch": 0.16715, + "grad_norm": 0.06876359134912491, + "learning_rate": 2.1526994689169878e-05, + "loss": 0.0344, + "step": 113430 + }, + { + "epoch": 0.1672, + "grad_norm": 0.07557900995016098, + "learning_rate": 2.1522901143976675e-05, + "loss": 0.0345, + "step": 113440 + }, + { + "epoch": 0.16725, + "grad_norm": 0.0743618831038475, + "learning_rate": 2.151880769384605e-05, + "loss": 0.0339, + "step": 113450 + }, + { + "epoch": 0.1673, + "grad_norm": 0.06382055580615997, + "learning_rate": 2.1514714338889914e-05, + "loss": 0.0354, + "step": 113460 + }, + { + "epoch": 0.16735, + "grad_norm": 0.06299655139446259, + "learning_rate": 2.1510621079220174e-05, + "loss": 0.0335, + "step": 113470 + }, + { + "epoch": 0.1674, + "grad_norm": 0.05774754658341408, + "learning_rate": 2.1506527914948746e-05, + "loss": 0.0338, + "step": 113480 + }, + { + "epoch": 0.16745, + "grad_norm": 0.05452101677656174, + "learning_rate": 2.1502434846187536e-05, + "loss": 0.0376, + "step": 113490 + }, + { + "epoch": 0.1675, + "grad_norm": 0.06228433549404144, + "learning_rate": 2.149834187304844e-05, + "loss": 0.0349, + "step": 113500 + }, + { + "epoch": 0.16755, + "grad_norm": 0.06229841336607933, + "learning_rate": 2.1494248995643366e-05, + "loss": 0.0345, + "step": 113510 + }, + { + "epoch": 0.1676, + "grad_norm": 0.05689483880996704, + "learning_rate": 2.1490156214084202e-05, + "loss": 0.0351, + "step": 113520 + }, + { + "epoch": 0.16765, + "grad_norm": 0.060247309505939484, + "learning_rate": 2.1486063528482853e-05, + "loss": 0.0349, + "step": 113530 + }, + { + "epoch": 0.1677, + "grad_norm": 0.06565812230110168, + "learning_rate": 2.1481970938951204e-05, + "loss": 0.0345, + "step": 113540 + }, + { + "epoch": 0.16775, + "grad_norm": 0.05560121685266495, + "learning_rate": 2.147787844560114e-05, + "loss": 0.033, + "step": 113550 + }, + { + "epoch": 0.1678, + "grad_norm": 0.07955660670995712, + "learning_rate": 2.1473786048544576e-05, + "loss": 0.0343, + "step": 113560 + }, + { + "epoch": 0.16785, + "grad_norm": 0.06279899179935455, + "learning_rate": 2.1469693747893355e-05, + "loss": 0.0341, + "step": 113570 + }, + { + "epoch": 0.1679, + "grad_norm": 0.07042830437421799, + "learning_rate": 2.14656015437594e-05, + "loss": 0.0356, + "step": 113580 + }, + { + "epoch": 0.16795, + "grad_norm": 0.08455251157283783, + "learning_rate": 2.1461509436254557e-05, + "loss": 0.0355, + "step": 113590 + }, + { + "epoch": 0.168, + "grad_norm": 0.07407843321561813, + "learning_rate": 2.1457417425490723e-05, + "loss": 0.0361, + "step": 113600 + }, + { + "epoch": 0.16805, + "grad_norm": 0.06888583302497864, + "learning_rate": 2.1453325511579764e-05, + "loss": 0.0351, + "step": 113610 + }, + { + "epoch": 0.1681, + "grad_norm": 0.0661529004573822, + "learning_rate": 2.144923369463355e-05, + "loss": 0.0342, + "step": 113620 + }, + { + "epoch": 0.16815, + "grad_norm": 0.07152344286441803, + "learning_rate": 2.1445141974763962e-05, + "loss": 0.0369, + "step": 113630 + }, + { + "epoch": 0.1682, + "grad_norm": 0.07802043855190277, + "learning_rate": 2.1441050352082848e-05, + "loss": 0.0364, + "step": 113640 + }, + { + "epoch": 0.16825, + "grad_norm": 0.06816740334033966, + "learning_rate": 2.1436958826702086e-05, + "loss": 0.035, + "step": 113650 + }, + { + "epoch": 0.1683, + "grad_norm": 0.07369918376207352, + "learning_rate": 2.143286739873353e-05, + "loss": 0.0359, + "step": 113660 + }, + { + "epoch": 0.16835, + "grad_norm": 0.07201619446277618, + "learning_rate": 2.1428776068289028e-05, + "loss": 0.0348, + "step": 113670 + }, + { + "epoch": 0.1684, + "grad_norm": 0.06485801190137863, + "learning_rate": 2.1424684835480456e-05, + "loss": 0.0348, + "step": 113680 + }, + { + "epoch": 0.16845, + "grad_norm": 0.07644911110401154, + "learning_rate": 2.142059370041966e-05, + "loss": 0.0398, + "step": 113690 + }, + { + "epoch": 0.1685, + "grad_norm": 0.08884541690349579, + "learning_rate": 2.141650266321849e-05, + "loss": 0.0362, + "step": 113700 + }, + { + "epoch": 0.16855, + "grad_norm": 0.060787659138441086, + "learning_rate": 2.1412411723988783e-05, + "loss": 0.0345, + "step": 113710 + }, + { + "epoch": 0.1686, + "grad_norm": 0.05763450637459755, + "learning_rate": 2.1408320882842398e-05, + "loss": 0.0342, + "step": 113720 + }, + { + "epoch": 0.16865, + "grad_norm": 0.07546335458755493, + "learning_rate": 2.1404230139891167e-05, + "loss": 0.0378, + "step": 113730 + }, + { + "epoch": 0.1687, + "grad_norm": 0.06813129037618637, + "learning_rate": 2.140013949524693e-05, + "loss": 0.034, + "step": 113740 + }, + { + "epoch": 0.16875, + "grad_norm": 0.06949824094772339, + "learning_rate": 2.1396048949021536e-05, + "loss": 0.0341, + "step": 113750 + }, + { + "epoch": 0.1688, + "grad_norm": 0.07007627934217453, + "learning_rate": 2.1391958501326793e-05, + "loss": 0.0343, + "step": 113760 + }, + { + "epoch": 0.16885, + "grad_norm": 0.0673021450638771, + "learning_rate": 2.138786815227457e-05, + "loss": 0.0342, + "step": 113770 + }, + { + "epoch": 0.1689, + "grad_norm": 0.058130063116550446, + "learning_rate": 2.1383777901976658e-05, + "loss": 0.0336, + "step": 113780 + }, + { + "epoch": 0.16895, + "grad_norm": 0.05610980838537216, + "learning_rate": 2.137968775054492e-05, + "loss": 0.0341, + "step": 113790 + }, + { + "epoch": 0.169, + "grad_norm": 0.07827328145503998, + "learning_rate": 2.1375597698091136e-05, + "loss": 0.0361, + "step": 113800 + }, + { + "epoch": 0.16905, + "grad_norm": 0.06286213546991348, + "learning_rate": 2.1371507744727155e-05, + "loss": 0.0342, + "step": 113810 + }, + { + "epoch": 0.1691, + "grad_norm": 0.0825994536280632, + "learning_rate": 2.1367417890564797e-05, + "loss": 0.0357, + "step": 113820 + }, + { + "epoch": 0.16915, + "grad_norm": 0.07677712291479111, + "learning_rate": 2.1363328135715867e-05, + "loss": 0.0332, + "step": 113830 + }, + { + "epoch": 0.1692, + "grad_norm": 0.09539990872144699, + "learning_rate": 2.135923848029218e-05, + "loss": 0.036, + "step": 113840 + }, + { + "epoch": 0.16925, + "grad_norm": 0.08900941163301468, + "learning_rate": 2.1355148924405537e-05, + "loss": 0.0348, + "step": 113850 + }, + { + "epoch": 0.1693, + "grad_norm": 0.08057425916194916, + "learning_rate": 2.1351059468167755e-05, + "loss": 0.0345, + "step": 113860 + }, + { + "epoch": 0.16935, + "grad_norm": 0.0812041163444519, + "learning_rate": 2.1346970111690647e-05, + "loss": 0.0352, + "step": 113870 + }, + { + "epoch": 0.1694, + "grad_norm": 0.07820922881364822, + "learning_rate": 2.1342880855085988e-05, + "loss": 0.035, + "step": 113880 + }, + { + "epoch": 0.16945, + "grad_norm": 0.08901207149028778, + "learning_rate": 2.1338791698465606e-05, + "loss": 0.034, + "step": 113890 + }, + { + "epoch": 0.1695, + "grad_norm": 0.07280625402927399, + "learning_rate": 2.133470264194128e-05, + "loss": 0.0339, + "step": 113900 + }, + { + "epoch": 0.16955, + "grad_norm": 0.06772835552692413, + "learning_rate": 2.133061368562481e-05, + "loss": 0.0344, + "step": 113910 + }, + { + "epoch": 0.1696, + "grad_norm": 0.07976733148097992, + "learning_rate": 2.1326524829627974e-05, + "loss": 0.0348, + "step": 113920 + }, + { + "epoch": 0.16965, + "grad_norm": 0.0896020233631134, + "learning_rate": 2.1322436074062575e-05, + "loss": 0.0364, + "step": 113930 + }, + { + "epoch": 0.1697, + "grad_norm": 0.0783344954252243, + "learning_rate": 2.1318347419040393e-05, + "loss": 0.0344, + "step": 113940 + }, + { + "epoch": 0.16975, + "grad_norm": 0.06524500995874405, + "learning_rate": 2.1314258864673207e-05, + "loss": 0.0337, + "step": 113950 + }, + { + "epoch": 0.1698, + "grad_norm": 0.0689595490694046, + "learning_rate": 2.1310170411072803e-05, + "loss": 0.0343, + "step": 113960 + }, + { + "epoch": 0.16985, + "grad_norm": 0.06631206721067429, + "learning_rate": 2.1306082058350944e-05, + "loss": 0.033, + "step": 113970 + }, + { + "epoch": 0.1699, + "grad_norm": 0.06748275458812714, + "learning_rate": 2.130199380661943e-05, + "loss": 0.034, + "step": 113980 + }, + { + "epoch": 0.16995, + "grad_norm": 0.07014484703540802, + "learning_rate": 2.1297905655989997e-05, + "loss": 0.0343, + "step": 113990 + }, + { + "epoch": 0.17, + "grad_norm": 0.0744910016655922, + "learning_rate": 2.129381760657444e-05, + "loss": 0.0331, + "step": 114000 + }, + { + "epoch": 0.17005, + "grad_norm": 0.06807012856006622, + "learning_rate": 2.128972965848452e-05, + "loss": 0.0353, + "step": 114010 + }, + { + "epoch": 0.1701, + "grad_norm": 0.07064221054315567, + "learning_rate": 2.1285641811831997e-05, + "loss": 0.0336, + "step": 114020 + }, + { + "epoch": 0.17015, + "grad_norm": 0.08048601448535919, + "learning_rate": 2.1281554066728636e-05, + "loss": 0.0342, + "step": 114030 + }, + { + "epoch": 0.1702, + "grad_norm": 0.08655247092247009, + "learning_rate": 2.1277466423286183e-05, + "loss": 0.0351, + "step": 114040 + }, + { + "epoch": 0.17025, + "grad_norm": 0.06995043903589249, + "learning_rate": 2.1273378881616393e-05, + "loss": 0.0334, + "step": 114050 + }, + { + "epoch": 0.1703, + "grad_norm": 0.05340658500790596, + "learning_rate": 2.1269291441831042e-05, + "loss": 0.0335, + "step": 114060 + }, + { + "epoch": 0.17035, + "grad_norm": 0.06602095067501068, + "learning_rate": 2.1265204104041845e-05, + "loss": 0.0329, + "step": 114070 + }, + { + "epoch": 0.1704, + "grad_norm": 0.06772585213184357, + "learning_rate": 2.1261116868360582e-05, + "loss": 0.0338, + "step": 114080 + }, + { + "epoch": 0.17045, + "grad_norm": 0.07455842196941376, + "learning_rate": 2.1257029734898957e-05, + "loss": 0.0342, + "step": 114090 + }, + { + "epoch": 0.1705, + "grad_norm": 0.08810683339834213, + "learning_rate": 2.1252942703768752e-05, + "loss": 0.035, + "step": 114100 + }, + { + "epoch": 0.17055, + "grad_norm": 0.0923193097114563, + "learning_rate": 2.1248855775081675e-05, + "loss": 0.0359, + "step": 114110 + }, + { + "epoch": 0.1706, + "grad_norm": 0.08906389772891998, + "learning_rate": 2.1244768948949472e-05, + "loss": 0.0332, + "step": 114120 + }, + { + "epoch": 0.17065, + "grad_norm": 0.07592211663722992, + "learning_rate": 2.1240682225483886e-05, + "loss": 0.0339, + "step": 114130 + }, + { + "epoch": 0.1707, + "grad_norm": 0.07437780499458313, + "learning_rate": 2.1236595604796624e-05, + "loss": 0.033, + "step": 114140 + }, + { + "epoch": 0.17075, + "grad_norm": 0.08547678589820862, + "learning_rate": 2.1232509086999433e-05, + "loss": 0.0324, + "step": 114150 + }, + { + "epoch": 0.1708, + "grad_norm": 0.07600551843643188, + "learning_rate": 2.122842267220402e-05, + "loss": 0.0333, + "step": 114160 + }, + { + "epoch": 0.17085, + "grad_norm": 0.0664534717798233, + "learning_rate": 2.1224336360522123e-05, + "loss": 0.0328, + "step": 114170 + }, + { + "epoch": 0.1709, + "grad_norm": 0.08195389807224274, + "learning_rate": 2.122025015206544e-05, + "loss": 0.0335, + "step": 114180 + }, + { + "epoch": 0.17095, + "grad_norm": 0.08264941722154617, + "learning_rate": 2.1216164046945703e-05, + "loss": 0.0341, + "step": 114190 + }, + { + "epoch": 0.171, + "grad_norm": 0.06394562870264053, + "learning_rate": 2.1212078045274622e-05, + "loss": 0.0337, + "step": 114200 + }, + { + "epoch": 0.17105, + "grad_norm": 0.06541323661804199, + "learning_rate": 2.1207992147163903e-05, + "loss": 0.0336, + "step": 114210 + }, + { + "epoch": 0.1711, + "grad_norm": 0.07025276869535446, + "learning_rate": 2.1203906352725256e-05, + "loss": 0.0332, + "step": 114220 + }, + { + "epoch": 0.17115, + "grad_norm": 0.08636878430843353, + "learning_rate": 2.119982066207038e-05, + "loss": 0.0341, + "step": 114230 + }, + { + "epoch": 0.1712, + "grad_norm": 0.06825531274080276, + "learning_rate": 2.119573507531098e-05, + "loss": 0.0359, + "step": 114240 + }, + { + "epoch": 0.17125, + "grad_norm": 0.06317528337240219, + "learning_rate": 2.1191649592558757e-05, + "loss": 0.0333, + "step": 114250 + }, + { + "epoch": 0.1713, + "grad_norm": 0.06706538796424866, + "learning_rate": 2.1187564213925393e-05, + "loss": 0.0329, + "step": 114260 + }, + { + "epoch": 0.17135, + "grad_norm": 0.06396844983100891, + "learning_rate": 2.1183478939522607e-05, + "loss": 0.033, + "step": 114270 + }, + { + "epoch": 0.1714, + "grad_norm": 0.061988480389118195, + "learning_rate": 2.117939376946206e-05, + "loss": 0.0346, + "step": 114280 + }, + { + "epoch": 0.17145, + "grad_norm": 0.06188317760825157, + "learning_rate": 2.1175308703855464e-05, + "loss": 0.0355, + "step": 114290 + }, + { + "epoch": 0.1715, + "grad_norm": 0.06005796045064926, + "learning_rate": 2.117122374281448e-05, + "loss": 0.0353, + "step": 114300 + }, + { + "epoch": 0.17155, + "grad_norm": 0.059571754187345505, + "learning_rate": 2.1167138886450798e-05, + "loss": 0.0343, + "step": 114310 + }, + { + "epoch": 0.1716, + "grad_norm": 0.0594683401286602, + "learning_rate": 2.116305413487611e-05, + "loss": 0.0336, + "step": 114320 + }, + { + "epoch": 0.17165, + "grad_norm": 0.10240515321493149, + "learning_rate": 2.1158969488202073e-05, + "loss": 0.0351, + "step": 114330 + }, + { + "epoch": 0.1717, + "grad_norm": 0.07099906355142593, + "learning_rate": 2.1154884946540378e-05, + "loss": 0.0346, + "step": 114340 + }, + { + "epoch": 0.17175, + "grad_norm": 0.09732221066951752, + "learning_rate": 2.1150800510002672e-05, + "loss": 0.036, + "step": 114350 + }, + { + "epoch": 0.1718, + "grad_norm": 0.06488008052110672, + "learning_rate": 2.1146716178700644e-05, + "loss": 0.0387, + "step": 114360 + }, + { + "epoch": 0.17185, + "grad_norm": 0.06984580308198929, + "learning_rate": 2.114263195274594e-05, + "loss": 0.0357, + "step": 114370 + }, + { + "epoch": 0.1719, + "grad_norm": 0.09493076056241989, + "learning_rate": 2.1138547832250223e-05, + "loss": 0.0379, + "step": 114380 + }, + { + "epoch": 0.17195, + "grad_norm": 0.11810750514268875, + "learning_rate": 2.1134463817325172e-05, + "loss": 0.0357, + "step": 114390 + }, + { + "epoch": 0.172, + "grad_norm": 0.08848319947719574, + "learning_rate": 2.113037990808242e-05, + "loss": 0.0349, + "step": 114400 + }, + { + "epoch": 0.17205, + "grad_norm": 0.07288021594285965, + "learning_rate": 2.112629610463363e-05, + "loss": 0.0351, + "step": 114410 + }, + { + "epoch": 0.1721, + "grad_norm": 0.0753333792090416, + "learning_rate": 2.1122212407090447e-05, + "loss": 0.0356, + "step": 114420 + }, + { + "epoch": 0.17215, + "grad_norm": 0.1015729233622551, + "learning_rate": 2.1118128815564525e-05, + "loss": 0.0355, + "step": 114430 + }, + { + "epoch": 0.1722, + "grad_norm": 0.07044872641563416, + "learning_rate": 2.1114045330167498e-05, + "loss": 0.0355, + "step": 114440 + }, + { + "epoch": 0.17225, + "grad_norm": 0.06678333878517151, + "learning_rate": 2.110996195101101e-05, + "loss": 0.0337, + "step": 114450 + }, + { + "epoch": 0.1723, + "grad_norm": 0.07907833904027939, + "learning_rate": 2.110587867820671e-05, + "loss": 0.0345, + "step": 114460 + }, + { + "epoch": 0.17235, + "grad_norm": 0.07277830690145493, + "learning_rate": 2.1101795511866213e-05, + "loss": 0.0348, + "step": 114470 + }, + { + "epoch": 0.1724, + "grad_norm": 0.0655599981546402, + "learning_rate": 2.1097712452101175e-05, + "loss": 0.0349, + "step": 114480 + }, + { + "epoch": 0.17245, + "grad_norm": 0.07155159115791321, + "learning_rate": 2.10936294990232e-05, + "loss": 0.0336, + "step": 114490 + }, + { + "epoch": 0.1725, + "grad_norm": 0.060851093381643295, + "learning_rate": 2.1089546652743926e-05, + "loss": 0.034, + "step": 114500 + }, + { + "epoch": 0.17255, + "grad_norm": 0.06356131285429001, + "learning_rate": 2.108546391337499e-05, + "loss": 0.0335, + "step": 114510 + }, + { + "epoch": 0.1726, + "grad_norm": 0.06545457243919373, + "learning_rate": 2.108138128102799e-05, + "loss": 0.0321, + "step": 114520 + }, + { + "epoch": 0.17265, + "grad_norm": 0.06263414770364761, + "learning_rate": 2.1077298755814563e-05, + "loss": 0.0338, + "step": 114530 + }, + { + "epoch": 0.1727, + "grad_norm": 0.07915288954973221, + "learning_rate": 2.1073216337846305e-05, + "loss": 0.0343, + "step": 114540 + }, + { + "epoch": 0.17275, + "grad_norm": 0.07472743093967438, + "learning_rate": 2.1069134027234844e-05, + "loss": 0.0347, + "step": 114550 + }, + { + "epoch": 0.1728, + "grad_norm": 0.08519298583269119, + "learning_rate": 2.1065051824091773e-05, + "loss": 0.036, + "step": 114560 + }, + { + "epoch": 0.17285, + "grad_norm": 0.06525319069623947, + "learning_rate": 2.1060969728528707e-05, + "loss": 0.0365, + "step": 114570 + }, + { + "epoch": 0.1729, + "grad_norm": 0.07866008579730988, + "learning_rate": 2.1056887740657264e-05, + "loss": 0.0357, + "step": 114580 + }, + { + "epoch": 0.17295, + "grad_norm": 0.06493719667196274, + "learning_rate": 2.105280586058901e-05, + "loss": 0.0327, + "step": 114590 + }, + { + "epoch": 0.173, + "grad_norm": 0.0681166872382164, + "learning_rate": 2.1048724088435576e-05, + "loss": 0.0339, + "step": 114600 + }, + { + "epoch": 0.17305, + "grad_norm": 0.08641801029443741, + "learning_rate": 2.104464242430853e-05, + "loss": 0.0356, + "step": 114610 + }, + { + "epoch": 0.1731, + "grad_norm": 0.08253999799489975, + "learning_rate": 2.1040560868319485e-05, + "loss": 0.0334, + "step": 114620 + }, + { + "epoch": 0.17315, + "grad_norm": 0.08029817044734955, + "learning_rate": 2.103647942058001e-05, + "loss": 0.0343, + "step": 114630 + }, + { + "epoch": 0.1732, + "grad_norm": 0.08552084118127823, + "learning_rate": 2.1032398081201698e-05, + "loss": 0.034, + "step": 114640 + }, + { + "epoch": 0.17325, + "grad_norm": 0.06611842662096024, + "learning_rate": 2.102831685029614e-05, + "loss": 0.0333, + "step": 114650 + }, + { + "epoch": 0.1733, + "grad_norm": 0.09615202993154526, + "learning_rate": 2.10242357279749e-05, + "loss": 0.0348, + "step": 114660 + }, + { + "epoch": 0.17335, + "grad_norm": 0.06889908015727997, + "learning_rate": 2.1020154714349566e-05, + "loss": 0.0333, + "step": 114670 + }, + { + "epoch": 0.1734, + "grad_norm": 0.08466773480176926, + "learning_rate": 2.1016073809531698e-05, + "loss": 0.036, + "step": 114680 + }, + { + "epoch": 0.17345, + "grad_norm": 0.09294584393501282, + "learning_rate": 2.101199301363288e-05, + "loss": 0.0339, + "step": 114690 + }, + { + "epoch": 0.1735, + "grad_norm": 0.1027880311012268, + "learning_rate": 2.100791232676468e-05, + "loss": 0.0343, + "step": 114700 + }, + { + "epoch": 0.17355, + "grad_norm": 0.07762772589921951, + "learning_rate": 2.1003831749038654e-05, + "loss": 0.0363, + "step": 114710 + }, + { + "epoch": 0.1736, + "grad_norm": 0.08277373015880585, + "learning_rate": 2.099975128056637e-05, + "loss": 0.0333, + "step": 114720 + }, + { + "epoch": 0.17365, + "grad_norm": 0.07976562529802322, + "learning_rate": 2.0995670921459375e-05, + "loss": 0.0332, + "step": 114730 + }, + { + "epoch": 0.1737, + "grad_norm": 0.0763949528336525, + "learning_rate": 2.099159067182924e-05, + "loss": 0.0339, + "step": 114740 + }, + { + "epoch": 0.17375, + "grad_norm": 0.0686565712094307, + "learning_rate": 2.0987510531787507e-05, + "loss": 0.0351, + "step": 114750 + }, + { + "epoch": 0.1738, + "grad_norm": 0.06873409450054169, + "learning_rate": 2.0983430501445722e-05, + "loss": 0.0336, + "step": 114760 + }, + { + "epoch": 0.17385, + "grad_norm": 0.07115428894758224, + "learning_rate": 2.0979350580915454e-05, + "loss": 0.034, + "step": 114770 + }, + { + "epoch": 0.1739, + "grad_norm": 0.05651280656456947, + "learning_rate": 2.0975270770308215e-05, + "loss": 0.0333, + "step": 114780 + }, + { + "epoch": 0.17395, + "grad_norm": 0.06276542693376541, + "learning_rate": 2.0971191069735578e-05, + "loss": 0.0367, + "step": 114790 + }, + { + "epoch": 0.174, + "grad_norm": 0.061557210981845856, + "learning_rate": 2.0967111479309044e-05, + "loss": 0.0349, + "step": 114800 + }, + { + "epoch": 0.17405, + "grad_norm": 0.05692186579108238, + "learning_rate": 2.096303199914018e-05, + "loss": 0.0343, + "step": 114810 + }, + { + "epoch": 0.1741, + "grad_norm": 0.06463827937841415, + "learning_rate": 2.0958952629340502e-05, + "loss": 0.0358, + "step": 114820 + }, + { + "epoch": 0.17415, + "grad_norm": 0.075020931661129, + "learning_rate": 2.095487337002154e-05, + "loss": 0.0353, + "step": 114830 + }, + { + "epoch": 0.1742, + "grad_norm": 0.06175791844725609, + "learning_rate": 2.095079422129482e-05, + "loss": 0.0362, + "step": 114840 + }, + { + "epoch": 0.17425, + "grad_norm": 0.0771561786532402, + "learning_rate": 2.0946715183271863e-05, + "loss": 0.0364, + "step": 114850 + }, + { + "epoch": 0.1743, + "grad_norm": 0.07464942336082458, + "learning_rate": 2.09426362560642e-05, + "loss": 0.0344, + "step": 114860 + }, + { + "epoch": 0.17435, + "grad_norm": 0.07180729508399963, + "learning_rate": 2.0938557439783327e-05, + "loss": 0.0335, + "step": 114870 + }, + { + "epoch": 0.1744, + "grad_norm": 0.07695086300373077, + "learning_rate": 2.0934478734540762e-05, + "loss": 0.0361, + "step": 114880 + }, + { + "epoch": 0.17445, + "grad_norm": 0.06530208885669708, + "learning_rate": 2.0930400140448033e-05, + "loss": 0.0354, + "step": 114890 + }, + { + "epoch": 0.1745, + "grad_norm": 0.07088170200586319, + "learning_rate": 2.092632165761663e-05, + "loss": 0.0336, + "step": 114900 + }, + { + "epoch": 0.17455, + "grad_norm": 0.06989553570747375, + "learning_rate": 2.092224328615807e-05, + "loss": 0.0337, + "step": 114910 + }, + { + "epoch": 0.1746, + "grad_norm": 0.0927966833114624, + "learning_rate": 2.0918165026183838e-05, + "loss": 0.0351, + "step": 114920 + }, + { + "epoch": 0.17465, + "grad_norm": 0.06884682178497314, + "learning_rate": 2.091408687780545e-05, + "loss": 0.0337, + "step": 114930 + }, + { + "epoch": 0.1747, + "grad_norm": 0.06937593966722488, + "learning_rate": 2.0910008841134383e-05, + "loss": 0.0349, + "step": 114940 + }, + { + "epoch": 0.17475, + "grad_norm": 0.07575168460607529, + "learning_rate": 2.090593091628213e-05, + "loss": 0.0326, + "step": 114950 + }, + { + "epoch": 0.1748, + "grad_norm": 0.07248397171497345, + "learning_rate": 2.0901853103360207e-05, + "loss": 0.0339, + "step": 114960 + }, + { + "epoch": 0.17485, + "grad_norm": 0.0727832019329071, + "learning_rate": 2.0897775402480065e-05, + "loss": 0.0331, + "step": 114970 + }, + { + "epoch": 0.1749, + "grad_norm": 0.06162170693278313, + "learning_rate": 2.089369781375322e-05, + "loss": 0.0349, + "step": 114980 + }, + { + "epoch": 0.17495, + "grad_norm": 0.056191302835941315, + "learning_rate": 2.0889620337291117e-05, + "loss": 0.0324, + "step": 114990 + }, + { + "epoch": 0.175, + "grad_norm": 0.06705120950937271, + "learning_rate": 2.0885542973205264e-05, + "loss": 0.0331, + "step": 115000 + }, + { + "epoch": 0.17505, + "grad_norm": 0.057472214102745056, + "learning_rate": 2.0881465721607104e-05, + "loss": 0.0325, + "step": 115010 + }, + { + "epoch": 0.1751, + "grad_norm": 0.06392459571361542, + "learning_rate": 2.087738858260813e-05, + "loss": 0.0331, + "step": 115020 + }, + { + "epoch": 0.17515, + "grad_norm": 0.057952750474214554, + "learning_rate": 2.087331155631981e-05, + "loss": 0.0331, + "step": 115030 + }, + { + "epoch": 0.1752, + "grad_norm": 0.06256795674562454, + "learning_rate": 2.08692346428536e-05, + "loss": 0.0341, + "step": 115040 + }, + { + "epoch": 0.17525, + "grad_norm": 0.07768010348081589, + "learning_rate": 2.0865157842320958e-05, + "loss": 0.0341, + "step": 115050 + }, + { + "epoch": 0.1753, + "grad_norm": 0.06745045632123947, + "learning_rate": 2.0861081154833348e-05, + "loss": 0.0336, + "step": 115060 + }, + { + "epoch": 0.17535, + "grad_norm": 0.06342687457799911, + "learning_rate": 2.0857004580502217e-05, + "loss": 0.0329, + "step": 115070 + }, + { + "epoch": 0.1754, + "grad_norm": 0.06732375174760818, + "learning_rate": 2.0852928119439043e-05, + "loss": 0.0331, + "step": 115080 + }, + { + "epoch": 0.17545, + "grad_norm": 0.06038998067378998, + "learning_rate": 2.084885177175524e-05, + "loss": 0.0347, + "step": 115090 + }, + { + "epoch": 0.1755, + "grad_norm": 0.0717553123831749, + "learning_rate": 2.084477553756228e-05, + "loss": 0.0349, + "step": 115100 + }, + { + "epoch": 0.17555, + "grad_norm": 0.06540452688932419, + "learning_rate": 2.084069941697159e-05, + "loss": 0.0355, + "step": 115110 + }, + { + "epoch": 0.1756, + "grad_norm": 0.07494036853313446, + "learning_rate": 2.0836623410094623e-05, + "loss": 0.0359, + "step": 115120 + }, + { + "epoch": 0.17565, + "grad_norm": 0.07349668443202972, + "learning_rate": 2.08325475170428e-05, + "loss": 0.0368, + "step": 115130 + }, + { + "epoch": 0.1757, + "grad_norm": 0.06400984525680542, + "learning_rate": 2.082847173792756e-05, + "loss": 0.035, + "step": 115140 + }, + { + "epoch": 0.17575, + "grad_norm": 0.061655063182115555, + "learning_rate": 2.0824396072860343e-05, + "loss": 0.0366, + "step": 115150 + }, + { + "epoch": 0.1758, + "grad_norm": 0.07127080112695694, + "learning_rate": 2.082032052195256e-05, + "loss": 0.0341, + "step": 115160 + }, + { + "epoch": 0.17585, + "grad_norm": 0.06664267182350159, + "learning_rate": 2.081624508531566e-05, + "loss": 0.035, + "step": 115170 + }, + { + "epoch": 0.1759, + "grad_norm": 0.06681042164564133, + "learning_rate": 2.081216976306103e-05, + "loss": 0.034, + "step": 115180 + }, + { + "epoch": 0.17595, + "grad_norm": 0.07515434920787811, + "learning_rate": 2.080809455530012e-05, + "loss": 0.035, + "step": 115190 + }, + { + "epoch": 0.176, + "grad_norm": 0.06582251936197281, + "learning_rate": 2.0804019462144315e-05, + "loss": 0.034, + "step": 115200 + }, + { + "epoch": 0.17605, + "grad_norm": 0.06440650671720505, + "learning_rate": 2.0799944483705047e-05, + "loss": 0.0353, + "step": 115210 + }, + { + "epoch": 0.1761, + "grad_norm": 0.06627684086561203, + "learning_rate": 2.0795869620093726e-05, + "loss": 0.0335, + "step": 115220 + }, + { + "epoch": 0.17615, + "grad_norm": 0.061812873929739, + "learning_rate": 2.0791794871421743e-05, + "loss": 0.0346, + "step": 115230 + }, + { + "epoch": 0.1762, + "grad_norm": 0.06207577511668205, + "learning_rate": 2.078772023780051e-05, + "loss": 0.0332, + "step": 115240 + }, + { + "epoch": 0.17625, + "grad_norm": 0.07133428752422333, + "learning_rate": 2.0783645719341424e-05, + "loss": 0.0355, + "step": 115250 + }, + { + "epoch": 0.1763, + "grad_norm": 0.06522677093744278, + "learning_rate": 2.077957131615587e-05, + "loss": 0.0333, + "step": 115260 + }, + { + "epoch": 0.17635, + "grad_norm": 0.06236393749713898, + "learning_rate": 2.0775497028355268e-05, + "loss": 0.0338, + "step": 115270 + }, + { + "epoch": 0.1764, + "grad_norm": 0.08299710601568222, + "learning_rate": 2.0771422856050978e-05, + "loss": 0.0374, + "step": 115280 + }, + { + "epoch": 0.17645, + "grad_norm": 0.08262594044208527, + "learning_rate": 2.076734879935441e-05, + "loss": 0.0345, + "step": 115290 + }, + { + "epoch": 0.1765, + "grad_norm": 0.07234200090169907, + "learning_rate": 2.0763274858376918e-05, + "loss": 0.0343, + "step": 115300 + }, + { + "epoch": 0.17655, + "grad_norm": 0.06531044840812683, + "learning_rate": 2.0759201033229914e-05, + "loss": 0.034, + "step": 115310 + }, + { + "epoch": 0.1766, + "grad_norm": 0.06469815969467163, + "learning_rate": 2.0755127324024754e-05, + "loss": 0.0331, + "step": 115320 + }, + { + "epoch": 0.17665, + "grad_norm": 0.06525961309671402, + "learning_rate": 2.0751053730872817e-05, + "loss": 0.0325, + "step": 115330 + }, + { + "epoch": 0.1767, + "grad_norm": 0.058061715215444565, + "learning_rate": 2.0746980253885483e-05, + "loss": 0.0342, + "step": 115340 + }, + { + "epoch": 0.17675, + "grad_norm": 0.09244202822446823, + "learning_rate": 2.0742906893174102e-05, + "loss": 0.0345, + "step": 115350 + }, + { + "epoch": 0.1768, + "grad_norm": 0.08636010438203812, + "learning_rate": 2.0738833648850056e-05, + "loss": 0.0335, + "step": 115360 + }, + { + "epoch": 0.17685, + "grad_norm": 0.06512778997421265, + "learning_rate": 2.0734760521024685e-05, + "loss": 0.0339, + "step": 115370 + }, + { + "epoch": 0.1769, + "grad_norm": 0.06986937671899796, + "learning_rate": 2.0730687509809377e-05, + "loss": 0.0346, + "step": 115380 + }, + { + "epoch": 0.17695, + "grad_norm": 0.07900271564722061, + "learning_rate": 2.0726614615315447e-05, + "loss": 0.0347, + "step": 115390 + }, + { + "epoch": 0.177, + "grad_norm": 0.06363274157047272, + "learning_rate": 2.072254183765428e-05, + "loss": 0.0323, + "step": 115400 + }, + { + "epoch": 0.17705, + "grad_norm": 0.09011182934045792, + "learning_rate": 2.0718469176937214e-05, + "loss": 0.037, + "step": 115410 + }, + { + "epoch": 0.1771, + "grad_norm": 0.11731145530939102, + "learning_rate": 2.0714396633275586e-05, + "loss": 0.0343, + "step": 115420 + }, + { + "epoch": 0.17715, + "grad_norm": 0.09642758220434189, + "learning_rate": 2.0710324206780756e-05, + "loss": 0.0338, + "step": 115430 + }, + { + "epoch": 0.1772, + "grad_norm": 0.10255391150712967, + "learning_rate": 2.0706251897564037e-05, + "loss": 0.035, + "step": 115440 + }, + { + "epoch": 0.17725, + "grad_norm": 0.08662714809179306, + "learning_rate": 2.0702179705736778e-05, + "loss": 0.0334, + "step": 115450 + }, + { + "epoch": 0.1773, + "grad_norm": 0.06695021688938141, + "learning_rate": 2.0698107631410323e-05, + "loss": 0.033, + "step": 115460 + }, + { + "epoch": 0.17735, + "grad_norm": 0.06570626050233841, + "learning_rate": 2.0694035674695974e-05, + "loss": 0.0349, + "step": 115470 + }, + { + "epoch": 0.1774, + "grad_norm": 0.06275024265050888, + "learning_rate": 2.068996383570509e-05, + "loss": 0.033, + "step": 115480 + }, + { + "epoch": 0.17745, + "grad_norm": 0.055382829159498215, + "learning_rate": 2.068589211454896e-05, + "loss": 0.0328, + "step": 115490 + }, + { + "epoch": 0.1775, + "grad_norm": 0.062211498618125916, + "learning_rate": 2.0681820511338927e-05, + "loss": 0.0337, + "step": 115500 + }, + { + "epoch": 0.17755, + "grad_norm": 0.062481772154569626, + "learning_rate": 2.0677749026186296e-05, + "loss": 0.0328, + "step": 115510 + }, + { + "epoch": 0.1776, + "grad_norm": 0.06326182186603546, + "learning_rate": 2.067367765920238e-05, + "loss": 0.0325, + "step": 115520 + }, + { + "epoch": 0.17765, + "grad_norm": 0.061724428087472916, + "learning_rate": 2.0669606410498498e-05, + "loss": 0.0328, + "step": 115530 + }, + { + "epoch": 0.1777, + "grad_norm": 0.06887421011924744, + "learning_rate": 2.066553528018595e-05, + "loss": 0.0337, + "step": 115540 + }, + { + "epoch": 0.17775, + "grad_norm": 0.06955559551715851, + "learning_rate": 2.0661464268376037e-05, + "loss": 0.0332, + "step": 115550 + }, + { + "epoch": 0.1778, + "grad_norm": 0.06526818871498108, + "learning_rate": 2.0657393375180058e-05, + "loss": 0.0344, + "step": 115560 + }, + { + "epoch": 0.17785, + "grad_norm": 0.07432231307029724, + "learning_rate": 2.065332260070932e-05, + "loss": 0.0338, + "step": 115570 + }, + { + "epoch": 0.1779, + "grad_norm": 0.09075212478637695, + "learning_rate": 2.0649251945075095e-05, + "loss": 0.0333, + "step": 115580 + }, + { + "epoch": 0.17795, + "grad_norm": 0.06763681769371033, + "learning_rate": 2.0645181408388694e-05, + "loss": 0.0333, + "step": 115590 + }, + { + "epoch": 0.178, + "grad_norm": 0.07734362781047821, + "learning_rate": 2.0641110990761403e-05, + "loss": 0.0332, + "step": 115600 + }, + { + "epoch": 0.17805, + "grad_norm": 0.06649279594421387, + "learning_rate": 2.0637040692304492e-05, + "loss": 0.0346, + "step": 115610 + }, + { + "epoch": 0.1781, + "grad_norm": 0.06801921129226685, + "learning_rate": 2.063297051312926e-05, + "loss": 0.0339, + "step": 115620 + }, + { + "epoch": 0.17815, + "grad_norm": 0.08945854008197784, + "learning_rate": 2.0628900453346967e-05, + "loss": 0.0337, + "step": 115630 + }, + { + "epoch": 0.1782, + "grad_norm": 0.07723744213581085, + "learning_rate": 2.0624830513068895e-05, + "loss": 0.035, + "step": 115640 + }, + { + "epoch": 0.17825, + "grad_norm": 0.06963032484054565, + "learning_rate": 2.062076069240631e-05, + "loss": 0.0333, + "step": 115650 + }, + { + "epoch": 0.1783, + "grad_norm": 0.06304597109556198, + "learning_rate": 2.0616690991470477e-05, + "loss": 0.0359, + "step": 115660 + }, + { + "epoch": 0.17835, + "grad_norm": 0.06315404921770096, + "learning_rate": 2.0612621410372685e-05, + "loss": 0.034, + "step": 115670 + }, + { + "epoch": 0.1784, + "grad_norm": 0.05911766365170479, + "learning_rate": 2.0608551949224152e-05, + "loss": 0.0342, + "step": 115680 + }, + { + "epoch": 0.17845, + "grad_norm": 0.06626007705926895, + "learning_rate": 2.0604482608136185e-05, + "loss": 0.0332, + "step": 115690 + }, + { + "epoch": 0.1785, + "grad_norm": 0.0709712877869606, + "learning_rate": 2.0600413387219986e-05, + "loss": 0.0341, + "step": 115700 + }, + { + "epoch": 0.17855, + "grad_norm": 0.06101493537425995, + "learning_rate": 2.059634428658684e-05, + "loss": 0.0338, + "step": 115710 + }, + { + "epoch": 0.1786, + "grad_norm": 0.07519076019525528, + "learning_rate": 2.0592275306347996e-05, + "loss": 0.0365, + "step": 115720 + }, + { + "epoch": 0.17865, + "grad_norm": 0.06127850338816643, + "learning_rate": 2.0588206446614683e-05, + "loss": 0.0342, + "step": 115730 + }, + { + "epoch": 0.1787, + "grad_norm": 0.06362968683242798, + "learning_rate": 2.0584137707498153e-05, + "loss": 0.0337, + "step": 115740 + }, + { + "epoch": 0.17875, + "grad_norm": 0.05607949197292328, + "learning_rate": 2.0580069089109633e-05, + "loss": 0.0337, + "step": 115750 + }, + { + "epoch": 0.1788, + "grad_norm": 0.08142589032649994, + "learning_rate": 2.0576000591560368e-05, + "loss": 0.0339, + "step": 115760 + }, + { + "epoch": 0.17885, + "grad_norm": 0.06536273658275604, + "learning_rate": 2.0571932214961583e-05, + "loss": 0.0344, + "step": 115770 + }, + { + "epoch": 0.1789, + "grad_norm": 0.07052770256996155, + "learning_rate": 2.0567863959424498e-05, + "loss": 0.035, + "step": 115780 + }, + { + "epoch": 0.17895, + "grad_norm": 0.07326888293027878, + "learning_rate": 2.0563795825060358e-05, + "loss": 0.0358, + "step": 115790 + }, + { + "epoch": 0.179, + "grad_norm": 0.06202247738838196, + "learning_rate": 2.055972781198037e-05, + "loss": 0.0341, + "step": 115800 + }, + { + "epoch": 0.17905, + "grad_norm": 0.06690514087677002, + "learning_rate": 2.0555659920295763e-05, + "loss": 0.0357, + "step": 115810 + }, + { + "epoch": 0.1791, + "grad_norm": 0.06660515069961548, + "learning_rate": 2.0551592150117735e-05, + "loss": 0.0345, + "step": 115820 + }, + { + "epoch": 0.17915, + "grad_norm": 0.07260756194591522, + "learning_rate": 2.0547524501557514e-05, + "loss": 0.0359, + "step": 115830 + }, + { + "epoch": 0.1792, + "grad_norm": 0.070611372590065, + "learning_rate": 2.0543456974726295e-05, + "loss": 0.0366, + "step": 115840 + }, + { + "epoch": 0.17925, + "grad_norm": 0.09465611726045609, + "learning_rate": 2.0539389569735287e-05, + "loss": 0.0358, + "step": 115850 + }, + { + "epoch": 0.1793, + "grad_norm": 0.08132003247737885, + "learning_rate": 2.05353222866957e-05, + "loss": 0.0369, + "step": 115860 + }, + { + "epoch": 0.17935, + "grad_norm": 0.12759603559970856, + "learning_rate": 2.0531255125718708e-05, + "loss": 0.0373, + "step": 115870 + }, + { + "epoch": 0.1794, + "grad_norm": 0.07082096487283707, + "learning_rate": 2.0527188086915544e-05, + "loss": 0.0357, + "step": 115880 + }, + { + "epoch": 0.17945, + "grad_norm": 0.07586158066987991, + "learning_rate": 2.052312117039736e-05, + "loss": 0.0361, + "step": 115890 + }, + { + "epoch": 0.1795, + "grad_norm": 0.07408928126096725, + "learning_rate": 2.0519054376275365e-05, + "loss": 0.0378, + "step": 115900 + }, + { + "epoch": 0.17955, + "grad_norm": 0.0980948954820633, + "learning_rate": 2.051498770466075e-05, + "loss": 0.0371, + "step": 115910 + }, + { + "epoch": 0.1796, + "grad_norm": 0.0799083560705185, + "learning_rate": 2.0510921155664674e-05, + "loss": 0.0364, + "step": 115920 + }, + { + "epoch": 0.17965, + "grad_norm": 0.07427407801151276, + "learning_rate": 2.0506854729398336e-05, + "loss": 0.0367, + "step": 115930 + }, + { + "epoch": 0.1797, + "grad_norm": 0.07316645979881287, + "learning_rate": 2.0502788425972896e-05, + "loss": 0.0351, + "step": 115940 + }, + { + "epoch": 0.17975, + "grad_norm": 0.06873858720064163, + "learning_rate": 2.0498722245499534e-05, + "loss": 0.0383, + "step": 115950 + }, + { + "epoch": 0.1798, + "grad_norm": 0.0818914845585823, + "learning_rate": 2.0494656188089414e-05, + "loss": 0.0361, + "step": 115960 + }, + { + "epoch": 0.17985, + "grad_norm": 0.05910930410027504, + "learning_rate": 2.0490590253853693e-05, + "loss": 0.035, + "step": 115970 + }, + { + "epoch": 0.1799, + "grad_norm": 0.07244950532913208, + "learning_rate": 2.048652444290356e-05, + "loss": 0.0355, + "step": 115980 + }, + { + "epoch": 0.17995, + "grad_norm": 0.07029040157794952, + "learning_rate": 2.0482458755350132e-05, + "loss": 0.0348, + "step": 115990 + }, + { + "epoch": 0.18, + "grad_norm": 0.06852120161056519, + "learning_rate": 2.0478393191304598e-05, + "loss": 0.0349, + "step": 116000 + }, + { + "epoch": 0.18005, + "grad_norm": 0.07397285103797913, + "learning_rate": 2.0474327750878088e-05, + "loss": 0.0353, + "step": 116010 + }, + { + "epoch": 0.1801, + "grad_norm": 0.06840717792510986, + "learning_rate": 2.0470262434181762e-05, + "loss": 0.0343, + "step": 116020 + }, + { + "epoch": 0.18015, + "grad_norm": 0.07946541160345078, + "learning_rate": 2.0466197241326757e-05, + "loss": 0.0359, + "step": 116030 + }, + { + "epoch": 0.1802, + "grad_norm": 0.07225006818771362, + "learning_rate": 2.0462132172424218e-05, + "loss": 0.0348, + "step": 116040 + }, + { + "epoch": 0.18025, + "grad_norm": 0.06738066673278809, + "learning_rate": 2.045806722758528e-05, + "loss": 0.0347, + "step": 116050 + }, + { + "epoch": 0.1803, + "grad_norm": 0.06892020255327225, + "learning_rate": 2.0454002406921075e-05, + "loss": 0.0342, + "step": 116060 + }, + { + "epoch": 0.18035, + "grad_norm": 0.07145651429891586, + "learning_rate": 2.0449937710542743e-05, + "loss": 0.0332, + "step": 116070 + }, + { + "epoch": 0.1804, + "grad_norm": 0.06717827171087265, + "learning_rate": 2.0445873138561393e-05, + "loss": 0.0344, + "step": 116080 + }, + { + "epoch": 0.18045, + "grad_norm": 0.07199730724096298, + "learning_rate": 2.0441808691088164e-05, + "loss": 0.0336, + "step": 116090 + }, + { + "epoch": 0.1805, + "grad_norm": 0.060535430908203125, + "learning_rate": 2.043774436823418e-05, + "loss": 0.0332, + "step": 116100 + }, + { + "epoch": 0.18055, + "grad_norm": 0.05504748225212097, + "learning_rate": 2.0433680170110548e-05, + "loss": 0.0335, + "step": 116110 + }, + { + "epoch": 0.1806, + "grad_norm": 0.0665460079908371, + "learning_rate": 2.0429616096828387e-05, + "loss": 0.0354, + "step": 116120 + }, + { + "epoch": 0.18065, + "grad_norm": 0.05895334109663963, + "learning_rate": 2.04255521484988e-05, + "loss": 0.0335, + "step": 116130 + }, + { + "epoch": 0.1807, + "grad_norm": 0.05527698993682861, + "learning_rate": 2.0421488325232904e-05, + "loss": 0.0333, + "step": 116140 + }, + { + "epoch": 0.18075, + "grad_norm": 0.06963811814785004, + "learning_rate": 2.041742462714179e-05, + "loss": 0.0344, + "step": 116150 + }, + { + "epoch": 0.1808, + "grad_norm": 0.05897054448723793, + "learning_rate": 2.0413361054336564e-05, + "loss": 0.0336, + "step": 116160 + }, + { + "epoch": 0.18085, + "grad_norm": 0.05657659471035004, + "learning_rate": 2.040929760692834e-05, + "loss": 0.0336, + "step": 116170 + }, + { + "epoch": 0.1809, + "grad_norm": 0.05925530940294266, + "learning_rate": 2.0405234285028174e-05, + "loss": 0.0333, + "step": 116180 + }, + { + "epoch": 0.18095, + "grad_norm": 0.06751128286123276, + "learning_rate": 2.0401171088747194e-05, + "loss": 0.0362, + "step": 116190 + }, + { + "epoch": 0.181, + "grad_norm": 0.08569232374429703, + "learning_rate": 2.0397108018196453e-05, + "loss": 0.0338, + "step": 116200 + }, + { + "epoch": 0.18105, + "grad_norm": 0.07641440629959106, + "learning_rate": 2.039304507348706e-05, + "loss": 0.034, + "step": 116210 + }, + { + "epoch": 0.1811, + "grad_norm": 0.0684174969792366, + "learning_rate": 2.038898225473008e-05, + "loss": 0.0351, + "step": 116220 + }, + { + "epoch": 0.18115, + "grad_norm": 0.07047194242477417, + "learning_rate": 2.0384919562036593e-05, + "loss": 0.0334, + "step": 116230 + }, + { + "epoch": 0.1812, + "grad_norm": 0.06977847218513489, + "learning_rate": 2.0380856995517673e-05, + "loss": 0.0342, + "step": 116240 + }, + { + "epoch": 0.18125, + "grad_norm": 0.080093152821064, + "learning_rate": 2.0376794555284386e-05, + "loss": 0.0343, + "step": 116250 + }, + { + "epoch": 0.1813, + "grad_norm": 0.05909072235226631, + "learning_rate": 2.0372732241447802e-05, + "loss": 0.0344, + "step": 116260 + }, + { + "epoch": 0.18135, + "grad_norm": 0.0653337761759758, + "learning_rate": 2.0368670054118976e-05, + "loss": 0.0332, + "step": 116270 + }, + { + "epoch": 0.1814, + "grad_norm": 0.07813318073749542, + "learning_rate": 2.036460799340897e-05, + "loss": 0.0329, + "step": 116280 + }, + { + "epoch": 0.18145, + "grad_norm": 0.06988602876663208, + "learning_rate": 2.0360546059428843e-05, + "loss": 0.0341, + "step": 116290 + }, + { + "epoch": 0.1815, + "grad_norm": 0.06033988296985626, + "learning_rate": 2.035648425228964e-05, + "loss": 0.0332, + "step": 116300 + }, + { + "epoch": 0.18155, + "grad_norm": 0.07113895565271378, + "learning_rate": 2.0352422572102423e-05, + "loss": 0.0357, + "step": 116310 + }, + { + "epoch": 0.1816, + "grad_norm": 0.0769420638680458, + "learning_rate": 2.0348361018978217e-05, + "loss": 0.0349, + "step": 116320 + }, + { + "epoch": 0.18165, + "grad_norm": 0.06596451252698898, + "learning_rate": 2.0344299593028083e-05, + "loss": 0.0336, + "step": 116330 + }, + { + "epoch": 0.1817, + "grad_norm": 0.07285913079977036, + "learning_rate": 2.034023829436304e-05, + "loss": 0.0329, + "step": 116340 + }, + { + "epoch": 0.18175, + "grad_norm": 0.07244844734668732, + "learning_rate": 2.033617712309413e-05, + "loss": 0.0349, + "step": 116350 + }, + { + "epoch": 0.1818, + "grad_norm": 0.09303736686706543, + "learning_rate": 2.0332116079332396e-05, + "loss": 0.0348, + "step": 116360 + }, + { + "epoch": 0.18185, + "grad_norm": 0.07680649310350418, + "learning_rate": 2.032805516318884e-05, + "loss": 0.0347, + "step": 116370 + }, + { + "epoch": 0.1819, + "grad_norm": 0.06392459571361542, + "learning_rate": 2.0323994374774516e-05, + "loss": 0.0337, + "step": 116380 + }, + { + "epoch": 0.18195, + "grad_norm": 0.059849537909030914, + "learning_rate": 2.0319933714200416e-05, + "loss": 0.0357, + "step": 116390 + }, + { + "epoch": 0.182, + "grad_norm": 0.05902472510933876, + "learning_rate": 2.031587318157758e-05, + "loss": 0.0342, + "step": 116400 + }, + { + "epoch": 0.18205, + "grad_norm": 0.07465706765651703, + "learning_rate": 2.0311812777017004e-05, + "loss": 0.0353, + "step": 116410 + }, + { + "epoch": 0.1821, + "grad_norm": 0.0838695615530014, + "learning_rate": 2.0307752500629707e-05, + "loss": 0.0368, + "step": 116420 + }, + { + "epoch": 0.18215, + "grad_norm": 0.06150379031896591, + "learning_rate": 2.0303692352526698e-05, + "loss": 0.0359, + "step": 116430 + }, + { + "epoch": 0.1822, + "grad_norm": 0.07571075856685638, + "learning_rate": 2.0299632332818973e-05, + "loss": 0.0357, + "step": 116440 + }, + { + "epoch": 0.18225, + "grad_norm": 0.10019955784082413, + "learning_rate": 2.029557244161754e-05, + "loss": 0.0346, + "step": 116450 + }, + { + "epoch": 0.1823, + "grad_norm": 0.07792889326810837, + "learning_rate": 2.029151267903338e-05, + "loss": 0.0343, + "step": 116460 + }, + { + "epoch": 0.18235, + "grad_norm": 0.07031518220901489, + "learning_rate": 2.028745304517749e-05, + "loss": 0.0331, + "step": 116470 + }, + { + "epoch": 0.1824, + "grad_norm": 0.06752107292413712, + "learning_rate": 2.028339354016088e-05, + "loss": 0.0358, + "step": 116480 + }, + { + "epoch": 0.18245, + "grad_norm": 0.06869736313819885, + "learning_rate": 2.0279334164094504e-05, + "loss": 0.0359, + "step": 116490 + }, + { + "epoch": 0.1825, + "grad_norm": 0.07195311039686203, + "learning_rate": 2.027527491708937e-05, + "loss": 0.035, + "step": 116500 + }, + { + "epoch": 0.18255, + "grad_norm": 0.07395985722541809, + "learning_rate": 2.0271215799256434e-05, + "loss": 0.0341, + "step": 116510 + }, + { + "epoch": 0.1826, + "grad_norm": 0.05496805161237717, + "learning_rate": 2.026715681070669e-05, + "loss": 0.0337, + "step": 116520 + }, + { + "epoch": 0.18265, + "grad_norm": 0.07406099140644073, + "learning_rate": 2.0263097951551098e-05, + "loss": 0.0342, + "step": 116530 + }, + { + "epoch": 0.1827, + "grad_norm": 0.05828932300209999, + "learning_rate": 2.0259039221900627e-05, + "loss": 0.034, + "step": 116540 + }, + { + "epoch": 0.18275, + "grad_norm": 0.06917819380760193, + "learning_rate": 2.0254980621866247e-05, + "loss": 0.0352, + "step": 116550 + }, + { + "epoch": 0.1828, + "grad_norm": 0.06745157390832901, + "learning_rate": 2.025092215155891e-05, + "loss": 0.0357, + "step": 116560 + }, + { + "epoch": 0.18285, + "grad_norm": 0.07505171000957489, + "learning_rate": 2.024686381108958e-05, + "loss": 0.0351, + "step": 116570 + }, + { + "epoch": 0.1829, + "grad_norm": 0.06549634039402008, + "learning_rate": 2.0242805600569198e-05, + "loss": 0.0339, + "step": 116580 + }, + { + "epoch": 0.18295, + "grad_norm": 0.06527920067310333, + "learning_rate": 2.023874752010874e-05, + "loss": 0.0354, + "step": 116590 + }, + { + "epoch": 0.183, + "grad_norm": 0.052015628665685654, + "learning_rate": 2.023468956981912e-05, + "loss": 0.0344, + "step": 116600 + }, + { + "epoch": 0.18305, + "grad_norm": 0.05962091684341431, + "learning_rate": 2.0230631749811306e-05, + "loss": 0.0349, + "step": 116610 + }, + { + "epoch": 0.1831, + "grad_norm": 0.062450990080833435, + "learning_rate": 2.022657406019623e-05, + "loss": 0.035, + "step": 116620 + }, + { + "epoch": 0.18315, + "grad_norm": 0.06071823462843895, + "learning_rate": 2.022251650108482e-05, + "loss": 0.0352, + "step": 116630 + }, + { + "epoch": 0.1832, + "grad_norm": 0.059769246727228165, + "learning_rate": 2.021845907258802e-05, + "loss": 0.0348, + "step": 116640 + }, + { + "epoch": 0.18325, + "grad_norm": 0.056287750601768494, + "learning_rate": 2.0214401774816748e-05, + "loss": 0.0377, + "step": 116650 + }, + { + "epoch": 0.1833, + "grad_norm": 0.07899390161037445, + "learning_rate": 2.0210344607881925e-05, + "loss": 0.0351, + "step": 116660 + }, + { + "epoch": 0.18335, + "grad_norm": 0.0634530559182167, + "learning_rate": 2.02062875718945e-05, + "loss": 0.0357, + "step": 116670 + }, + { + "epoch": 0.1834, + "grad_norm": 0.060274887830019, + "learning_rate": 2.0202230666965354e-05, + "loss": 0.0346, + "step": 116680 + }, + { + "epoch": 0.18345, + "grad_norm": 0.061985548585653305, + "learning_rate": 2.019817389320544e-05, + "loss": 0.0348, + "step": 116690 + }, + { + "epoch": 0.1835, + "grad_norm": 0.05522892624139786, + "learning_rate": 2.019411725072563e-05, + "loss": 0.0343, + "step": 116700 + }, + { + "epoch": 0.18355, + "grad_norm": 0.06568529456853867, + "learning_rate": 2.0190060739636856e-05, + "loss": 0.0356, + "step": 116710 + }, + { + "epoch": 0.1836, + "grad_norm": 0.06055418774485588, + "learning_rate": 2.0186004360050013e-05, + "loss": 0.0343, + "step": 116720 + }, + { + "epoch": 0.18365, + "grad_norm": 0.07213877886533737, + "learning_rate": 2.0181948112076e-05, + "loss": 0.0335, + "step": 116730 + }, + { + "epoch": 0.1837, + "grad_norm": 0.08859606087207794, + "learning_rate": 2.017789199582572e-05, + "loss": 0.0371, + "step": 116740 + }, + { + "epoch": 0.18375, + "grad_norm": 0.0729413777589798, + "learning_rate": 2.0173836011410057e-05, + "loss": 0.0339, + "step": 116750 + }, + { + "epoch": 0.1838, + "grad_norm": 0.05296160280704498, + "learning_rate": 2.016978015893991e-05, + "loss": 0.0341, + "step": 116760 + }, + { + "epoch": 0.18385, + "grad_norm": 0.06408056616783142, + "learning_rate": 2.0165724438526153e-05, + "loss": 0.034, + "step": 116770 + }, + { + "epoch": 0.1839, + "grad_norm": 0.05741770565509796, + "learning_rate": 2.0161668850279682e-05, + "loss": 0.0343, + "step": 116780 + }, + { + "epoch": 0.18395, + "grad_norm": 0.11421132832765579, + "learning_rate": 2.015761339431135e-05, + "loss": 0.0364, + "step": 116790 + }, + { + "epoch": 0.184, + "grad_norm": 0.08587351441383362, + "learning_rate": 2.015355807073206e-05, + "loss": 0.035, + "step": 116800 + }, + { + "epoch": 0.18405, + "grad_norm": 0.06537584215402603, + "learning_rate": 2.0149502879652674e-05, + "loss": 0.0352, + "step": 116810 + }, + { + "epoch": 0.1841, + "grad_norm": 0.057796284556388855, + "learning_rate": 2.0145447821184053e-05, + "loss": 0.0352, + "step": 116820 + }, + { + "epoch": 0.18415, + "grad_norm": 0.0686817318201065, + "learning_rate": 2.0141392895437067e-05, + "loss": 0.0359, + "step": 116830 + }, + { + "epoch": 0.1842, + "grad_norm": 0.05911717191338539, + "learning_rate": 2.0137338102522573e-05, + "loss": 0.0345, + "step": 116840 + }, + { + "epoch": 0.18425, + "grad_norm": 0.09625113755464554, + "learning_rate": 2.013328344255143e-05, + "loss": 0.0364, + "step": 116850 + }, + { + "epoch": 0.1843, + "grad_norm": 0.059197183698415756, + "learning_rate": 2.0129228915634485e-05, + "loss": 0.0351, + "step": 116860 + }, + { + "epoch": 0.18435, + "grad_norm": 0.0632222518324852, + "learning_rate": 2.012517452188259e-05, + "loss": 0.0348, + "step": 116870 + }, + { + "epoch": 0.1844, + "grad_norm": 0.0603092722594738, + "learning_rate": 2.0121120261406603e-05, + "loss": 0.0352, + "step": 116880 + }, + { + "epoch": 0.18445, + "grad_norm": 0.07025669515132904, + "learning_rate": 2.0117066134317343e-05, + "loss": 0.0351, + "step": 116890 + }, + { + "epoch": 0.1845, + "grad_norm": 0.0625699982047081, + "learning_rate": 2.0113012140725673e-05, + "loss": 0.0362, + "step": 116900 + }, + { + "epoch": 0.18455, + "grad_norm": 0.06732776015996933, + "learning_rate": 2.01089582807424e-05, + "loss": 0.0353, + "step": 116910 + }, + { + "epoch": 0.1846, + "grad_norm": 0.07922066003084183, + "learning_rate": 2.0104904554478378e-05, + "loss": 0.0355, + "step": 116920 + }, + { + "epoch": 0.18465, + "grad_norm": 0.07065048813819885, + "learning_rate": 2.0100850962044432e-05, + "loss": 0.0353, + "step": 116930 + }, + { + "epoch": 0.1847, + "grad_norm": 0.06404469907283783, + "learning_rate": 2.0096797503551372e-05, + "loss": 0.0338, + "step": 116940 + }, + { + "epoch": 0.18475, + "grad_norm": 0.06592314690351486, + "learning_rate": 2.009274417911003e-05, + "loss": 0.0344, + "step": 116950 + }, + { + "epoch": 0.1848, + "grad_norm": 0.0722537413239479, + "learning_rate": 2.008869098883122e-05, + "loss": 0.0347, + "step": 116960 + }, + { + "epoch": 0.18485, + "grad_norm": 0.06831130385398865, + "learning_rate": 2.0084637932825752e-05, + "loss": 0.0336, + "step": 116970 + }, + { + "epoch": 0.1849, + "grad_norm": 0.09930170327425003, + "learning_rate": 2.0080585011204434e-05, + "loss": 0.0356, + "step": 116980 + }, + { + "epoch": 0.18495, + "grad_norm": 0.07288338989019394, + "learning_rate": 2.0076532224078068e-05, + "loss": 0.0358, + "step": 116990 + }, + { + "epoch": 0.185, + "grad_norm": 0.09408794343471527, + "learning_rate": 2.007247957155747e-05, + "loss": 0.0347, + "step": 117000 + }, + { + "epoch": 0.18505, + "grad_norm": 0.07589706778526306, + "learning_rate": 2.006842705375343e-05, + "loss": 0.0343, + "step": 117010 + }, + { + "epoch": 0.1851, + "grad_norm": 0.05950513482093811, + "learning_rate": 2.006437467077674e-05, + "loss": 0.0364, + "step": 117020 + }, + { + "epoch": 0.18515, + "grad_norm": 0.0669105276465416, + "learning_rate": 2.006032242273819e-05, + "loss": 0.0356, + "step": 117030 + }, + { + "epoch": 0.1852, + "grad_norm": 0.07175637781620026, + "learning_rate": 2.0056270309748572e-05, + "loss": 0.0354, + "step": 117040 + }, + { + "epoch": 0.18525, + "grad_norm": 0.07348711043596268, + "learning_rate": 2.0052218331918666e-05, + "loss": 0.0347, + "step": 117050 + }, + { + "epoch": 0.1853, + "grad_norm": 0.07550489157438278, + "learning_rate": 2.0048166489359247e-05, + "loss": 0.0358, + "step": 117060 + }, + { + "epoch": 0.18535, + "grad_norm": 0.0652247816324234, + "learning_rate": 2.0044114782181105e-05, + "loss": 0.035, + "step": 117070 + }, + { + "epoch": 0.1854, + "grad_norm": 0.13588404655456543, + "learning_rate": 2.0040063210494992e-05, + "loss": 0.0342, + "step": 117080 + }, + { + "epoch": 0.18545, + "grad_norm": 0.10506972670555115, + "learning_rate": 2.00360117744117e-05, + "loss": 0.0346, + "step": 117090 + }, + { + "epoch": 0.1855, + "grad_norm": 0.08438332378864288, + "learning_rate": 2.0031960474041966e-05, + "loss": 0.033, + "step": 117100 + }, + { + "epoch": 0.18555, + "grad_norm": 0.08282383531332016, + "learning_rate": 2.0027909309496576e-05, + "loss": 0.0331, + "step": 117110 + }, + { + "epoch": 0.1856, + "grad_norm": 0.07057241350412369, + "learning_rate": 2.0023858280886278e-05, + "loss": 0.0332, + "step": 117120 + }, + { + "epoch": 0.18565, + "grad_norm": 0.07022110372781754, + "learning_rate": 2.0019807388321825e-05, + "loss": 0.0335, + "step": 117130 + }, + { + "epoch": 0.1857, + "grad_norm": 0.06743114441633224, + "learning_rate": 2.0015756631913967e-05, + "loss": 0.0342, + "step": 117140 + }, + { + "epoch": 0.18575, + "grad_norm": 0.06218687444925308, + "learning_rate": 2.0011706011773446e-05, + "loss": 0.0337, + "step": 117150 + }, + { + "epoch": 0.1858, + "grad_norm": 0.06828916817903519, + "learning_rate": 2.0007655528011017e-05, + "loss": 0.0328, + "step": 117160 + }, + { + "epoch": 0.18585, + "grad_norm": 0.06903926283121109, + "learning_rate": 2.0003605180737403e-05, + "loss": 0.0339, + "step": 117170 + }, + { + "epoch": 0.1859, + "grad_norm": 0.06969159841537476, + "learning_rate": 1.999955497006334e-05, + "loss": 0.0334, + "step": 117180 + }, + { + "epoch": 0.18595, + "grad_norm": 0.06109282001852989, + "learning_rate": 1.9995504896099583e-05, + "loss": 0.0329, + "step": 117190 + }, + { + "epoch": 0.186, + "grad_norm": 0.07328418642282486, + "learning_rate": 1.9991454958956822e-05, + "loss": 0.0332, + "step": 117200 + }, + { + "epoch": 0.18605, + "grad_norm": 0.06142791733145714, + "learning_rate": 1.9987405158745817e-05, + "loss": 0.0334, + "step": 117210 + }, + { + "epoch": 0.1861, + "grad_norm": 0.06849481165409088, + "learning_rate": 1.9983355495577266e-05, + "loss": 0.0333, + "step": 117220 + }, + { + "epoch": 0.18615, + "grad_norm": 0.06934063136577606, + "learning_rate": 1.9979305969561895e-05, + "loss": 0.0323, + "step": 117230 + }, + { + "epoch": 0.1862, + "grad_norm": 0.06079070642590523, + "learning_rate": 1.9975256580810405e-05, + "loss": 0.034, + "step": 117240 + }, + { + "epoch": 0.18625, + "grad_norm": 0.06963855028152466, + "learning_rate": 1.9971207329433518e-05, + "loss": 0.0332, + "step": 117250 + }, + { + "epoch": 0.1863, + "grad_norm": 0.0564182884991169, + "learning_rate": 1.9967158215541936e-05, + "loss": 0.0344, + "step": 117260 + }, + { + "epoch": 0.18635, + "grad_norm": 0.06805352121591568, + "learning_rate": 1.9963109239246346e-05, + "loss": 0.0329, + "step": 117270 + }, + { + "epoch": 0.1864, + "grad_norm": 0.058724112808704376, + "learning_rate": 1.995906040065747e-05, + "loss": 0.0327, + "step": 117280 + }, + { + "epoch": 0.18645, + "grad_norm": 0.058111634105443954, + "learning_rate": 1.995501169988598e-05, + "loss": 0.0326, + "step": 117290 + }, + { + "epoch": 0.1865, + "grad_norm": 0.05567874759435654, + "learning_rate": 1.9950963137042573e-05, + "loss": 0.0354, + "step": 117300 + }, + { + "epoch": 0.18655, + "grad_norm": 0.06355543434619904, + "learning_rate": 1.9946914712237946e-05, + "loss": 0.0359, + "step": 117310 + }, + { + "epoch": 0.1866, + "grad_norm": 0.08021455258131027, + "learning_rate": 1.994286642558277e-05, + "loss": 0.0339, + "step": 117320 + }, + { + "epoch": 0.18665, + "grad_norm": 0.07066422700881958, + "learning_rate": 1.9938818277187726e-05, + "loss": 0.0337, + "step": 117330 + }, + { + "epoch": 0.1867, + "grad_norm": 0.08102439343929291, + "learning_rate": 1.9934770267163484e-05, + "loss": 0.0349, + "step": 117340 + }, + { + "epoch": 0.18675, + "grad_norm": 0.07358285784721375, + "learning_rate": 1.9930722395620727e-05, + "loss": 0.0336, + "step": 117350 + }, + { + "epoch": 0.1868, + "grad_norm": 0.06806259602308273, + "learning_rate": 1.992667466267011e-05, + "loss": 0.0338, + "step": 117360 + }, + { + "epoch": 0.18685, + "grad_norm": 0.056118763983249664, + "learning_rate": 1.9922627068422297e-05, + "loss": 0.0327, + "step": 117370 + }, + { + "epoch": 0.1869, + "grad_norm": 0.06797992438077927, + "learning_rate": 1.9918579612987968e-05, + "loss": 0.0342, + "step": 117380 + }, + { + "epoch": 0.18695, + "grad_norm": 0.07334133982658386, + "learning_rate": 1.991453229647775e-05, + "loss": 0.0344, + "step": 117390 + }, + { + "epoch": 0.187, + "grad_norm": 0.06543901562690735, + "learning_rate": 1.991048511900232e-05, + "loss": 0.0339, + "step": 117400 + }, + { + "epoch": 0.18705, + "grad_norm": 0.07021824270486832, + "learning_rate": 1.99064380806723e-05, + "loss": 0.0355, + "step": 117410 + }, + { + "epoch": 0.1871, + "grad_norm": 0.06691295653581619, + "learning_rate": 1.9902391181598358e-05, + "loss": 0.0348, + "step": 117420 + }, + { + "epoch": 0.18715, + "grad_norm": 0.06111136078834534, + "learning_rate": 1.9898344421891125e-05, + "loss": 0.0356, + "step": 117430 + }, + { + "epoch": 0.1872, + "grad_norm": 0.07992721349000931, + "learning_rate": 1.9894297801661236e-05, + "loss": 0.0354, + "step": 117440 + }, + { + "epoch": 0.18725, + "grad_norm": 0.07154236733913422, + "learning_rate": 1.9890251321019335e-05, + "loss": 0.034, + "step": 117450 + }, + { + "epoch": 0.1873, + "grad_norm": 0.07538168132305145, + "learning_rate": 1.9886204980076033e-05, + "loss": 0.0334, + "step": 117460 + }, + { + "epoch": 0.18735, + "grad_norm": 0.05986643582582474, + "learning_rate": 1.9882158778941977e-05, + "loss": 0.0337, + "step": 117470 + }, + { + "epoch": 0.1874, + "grad_norm": 0.06718599796295166, + "learning_rate": 1.987811271772777e-05, + "loss": 0.0333, + "step": 117480 + }, + { + "epoch": 0.18745, + "grad_norm": 0.06104372441768646, + "learning_rate": 1.987406679654403e-05, + "loss": 0.0337, + "step": 117490 + }, + { + "epoch": 0.1875, + "grad_norm": 0.065663181245327, + "learning_rate": 1.987002101550139e-05, + "loss": 0.0328, + "step": 117500 + }, + { + "epoch": 0.18755, + "grad_norm": 0.1018490120768547, + "learning_rate": 1.9865975374710443e-05, + "loss": 0.0346, + "step": 117510 + }, + { + "epoch": 0.1876, + "grad_norm": 0.08078235387802124, + "learning_rate": 1.9861929874281804e-05, + "loss": 0.0376, + "step": 117520 + }, + { + "epoch": 0.18765, + "grad_norm": 0.08387861400842667, + "learning_rate": 1.985788451432607e-05, + "loss": 0.0364, + "step": 117530 + }, + { + "epoch": 0.1877, + "grad_norm": 0.06840310990810394, + "learning_rate": 1.9853839294953843e-05, + "loss": 0.0333, + "step": 117540 + }, + { + "epoch": 0.18775, + "grad_norm": 0.07385700941085815, + "learning_rate": 1.9849794216275712e-05, + "loss": 0.0344, + "step": 117550 + }, + { + "epoch": 0.1878, + "grad_norm": 0.07153363525867462, + "learning_rate": 1.9845749278402277e-05, + "loss": 0.0332, + "step": 117560 + }, + { + "epoch": 0.18785, + "grad_norm": 0.07106166332960129, + "learning_rate": 1.984170448144412e-05, + "loss": 0.0325, + "step": 117570 + }, + { + "epoch": 0.1879, + "grad_norm": 0.0803474560379982, + "learning_rate": 1.9837659825511818e-05, + "loss": 0.034, + "step": 117580 + }, + { + "epoch": 0.18795, + "grad_norm": 0.08528460562229156, + "learning_rate": 1.9833615310715968e-05, + "loss": 0.0342, + "step": 117590 + }, + { + "epoch": 0.188, + "grad_norm": 0.06444571167230606, + "learning_rate": 1.982957093716712e-05, + "loss": 0.0348, + "step": 117600 + }, + { + "epoch": 0.18805, + "grad_norm": 0.07973705232143402, + "learning_rate": 1.982552670497588e-05, + "loss": 0.0343, + "step": 117610 + }, + { + "epoch": 0.1881, + "grad_norm": 0.07987213879823685, + "learning_rate": 1.982148261425278e-05, + "loss": 0.0364, + "step": 117620 + }, + { + "epoch": 0.18815, + "grad_norm": 0.08558136969804764, + "learning_rate": 1.9817438665108402e-05, + "loss": 0.0364, + "step": 117630 + }, + { + "epoch": 0.1882, + "grad_norm": 0.06126723811030388, + "learning_rate": 1.981339485765331e-05, + "loss": 0.0335, + "step": 117640 + }, + { + "epoch": 0.18825, + "grad_norm": 0.06406763195991516, + "learning_rate": 1.9809351191998045e-05, + "loss": 0.034, + "step": 117650 + }, + { + "epoch": 0.1883, + "grad_norm": 0.04996425285935402, + "learning_rate": 1.980530766825318e-05, + "loss": 0.0331, + "step": 117660 + }, + { + "epoch": 0.18835, + "grad_norm": 0.0618036724627018, + "learning_rate": 1.980126428652924e-05, + "loss": 0.0363, + "step": 117670 + }, + { + "epoch": 0.1884, + "grad_norm": 0.06212411820888519, + "learning_rate": 1.979722104693678e-05, + "loss": 0.033, + "step": 117680 + }, + { + "epoch": 0.18845, + "grad_norm": 0.06862057745456696, + "learning_rate": 1.9793177949586363e-05, + "loss": 0.0342, + "step": 117690 + }, + { + "epoch": 0.1885, + "grad_norm": 0.06919129192829132, + "learning_rate": 1.9789134994588482e-05, + "loss": 0.0345, + "step": 117700 + }, + { + "epoch": 0.18855, + "grad_norm": 0.06631922721862793, + "learning_rate": 1.9785092182053702e-05, + "loss": 0.0336, + "step": 117710 + }, + { + "epoch": 0.1886, + "grad_norm": 0.054118335247039795, + "learning_rate": 1.9781049512092542e-05, + "loss": 0.0348, + "step": 117720 + }, + { + "epoch": 0.18865, + "grad_norm": 0.06398317962884903, + "learning_rate": 1.977700698481553e-05, + "loss": 0.0328, + "step": 117730 + }, + { + "epoch": 0.1887, + "grad_norm": 0.07057011872529984, + "learning_rate": 1.977296460033318e-05, + "loss": 0.0344, + "step": 117740 + }, + { + "epoch": 0.18875, + "grad_norm": 0.0695476233959198, + "learning_rate": 1.9768922358756014e-05, + "loss": 0.0338, + "step": 117750 + }, + { + "epoch": 0.1888, + "grad_norm": 0.06410462409257889, + "learning_rate": 1.9764880260194552e-05, + "loss": 0.0339, + "step": 117760 + }, + { + "epoch": 0.18885, + "grad_norm": 0.08577097207307816, + "learning_rate": 1.976083830475929e-05, + "loss": 0.0346, + "step": 117770 + }, + { + "epoch": 0.1889, + "grad_norm": 0.06604179739952087, + "learning_rate": 1.9756796492560748e-05, + "loss": 0.0337, + "step": 117780 + }, + { + "epoch": 0.18895, + "grad_norm": 0.068792924284935, + "learning_rate": 1.9752754823709406e-05, + "loss": 0.0341, + "step": 117790 + }, + { + "epoch": 0.189, + "grad_norm": 0.07170477509498596, + "learning_rate": 1.9748713298315797e-05, + "loss": 0.0345, + "step": 117800 + }, + { + "epoch": 0.18905, + "grad_norm": 0.06775487214326859, + "learning_rate": 1.9744671916490376e-05, + "loss": 0.0342, + "step": 117810 + }, + { + "epoch": 0.1891, + "grad_norm": 0.06951591372489929, + "learning_rate": 1.9740630678343653e-05, + "loss": 0.0329, + "step": 117820 + }, + { + "epoch": 0.18915, + "grad_norm": 0.07820066064596176, + "learning_rate": 1.973658958398612e-05, + "loss": 0.0334, + "step": 117830 + }, + { + "epoch": 0.1892, + "grad_norm": 0.06221409887075424, + "learning_rate": 1.9732548633528243e-05, + "loss": 0.0332, + "step": 117840 + }, + { + "epoch": 0.18925, + "grad_norm": 0.06415358930826187, + "learning_rate": 1.9728507827080512e-05, + "loss": 0.0335, + "step": 117850 + }, + { + "epoch": 0.1893, + "grad_norm": 0.052510686218738556, + "learning_rate": 1.9724467164753394e-05, + "loss": 0.033, + "step": 117860 + }, + { + "epoch": 0.18935, + "grad_norm": 0.06607841700315475, + "learning_rate": 1.9720426646657352e-05, + "loss": 0.0336, + "step": 117870 + }, + { + "epoch": 0.1894, + "grad_norm": 0.060421060770750046, + "learning_rate": 1.971638627290288e-05, + "loss": 0.0337, + "step": 117880 + }, + { + "epoch": 0.18945, + "grad_norm": 0.07931596040725708, + "learning_rate": 1.971234604360041e-05, + "loss": 0.0354, + "step": 117890 + }, + { + "epoch": 0.1895, + "grad_norm": 0.08308328688144684, + "learning_rate": 1.9708305958860425e-05, + "loss": 0.0349, + "step": 117900 + }, + { + "epoch": 0.18955, + "grad_norm": 0.08788150548934937, + "learning_rate": 1.9704266018793354e-05, + "loss": 0.0338, + "step": 117910 + }, + { + "epoch": 0.1896, + "grad_norm": 0.07126295566558838, + "learning_rate": 1.970022622350967e-05, + "loss": 0.0333, + "step": 117920 + }, + { + "epoch": 0.18965, + "grad_norm": 0.06478600949048996, + "learning_rate": 1.96961865731198e-05, + "loss": 0.032, + "step": 117930 + }, + { + "epoch": 0.1897, + "grad_norm": 0.07265316694974899, + "learning_rate": 1.9692147067734202e-05, + "loss": 0.0345, + "step": 117940 + }, + { + "epoch": 0.18975, + "grad_norm": 0.06458255648612976, + "learning_rate": 1.968810770746331e-05, + "loss": 0.0345, + "step": 117950 + }, + { + "epoch": 0.1898, + "grad_norm": 0.06969352811574936, + "learning_rate": 1.9684068492417558e-05, + "loss": 0.0343, + "step": 117960 + }, + { + "epoch": 0.18985, + "grad_norm": 0.05881544575095177, + "learning_rate": 1.968002942270738e-05, + "loss": 0.0334, + "step": 117970 + }, + { + "epoch": 0.1899, + "grad_norm": 0.06733879446983337, + "learning_rate": 1.967599049844319e-05, + "loss": 0.0345, + "step": 117980 + }, + { + "epoch": 0.18995, + "grad_norm": 0.06642694771289825, + "learning_rate": 1.967195171973543e-05, + "loss": 0.0326, + "step": 117990 + }, + { + "epoch": 0.19, + "grad_norm": 0.06816085427999496, + "learning_rate": 1.9667913086694494e-05, + "loss": 0.0328, + "step": 118000 + }, + { + "epoch": 0.19005, + "grad_norm": 0.07988379895687103, + "learning_rate": 1.966387459943082e-05, + "loss": 0.0346, + "step": 118010 + }, + { + "epoch": 0.1901, + "grad_norm": 0.05612655729055405, + "learning_rate": 1.965983625805481e-05, + "loss": 0.0331, + "step": 118020 + }, + { + "epoch": 0.19015, + "grad_norm": 0.07056666165590286, + "learning_rate": 1.965579806267687e-05, + "loss": 0.0331, + "step": 118030 + }, + { + "epoch": 0.1902, + "grad_norm": 0.07695815712213516, + "learning_rate": 1.9651760013407404e-05, + "loss": 0.0368, + "step": 118040 + }, + { + "epoch": 0.19025, + "grad_norm": 0.06521902233362198, + "learning_rate": 1.9647722110356807e-05, + "loss": 0.0338, + "step": 118050 + }, + { + "epoch": 0.1903, + "grad_norm": 0.06370382755994797, + "learning_rate": 1.9643684353635482e-05, + "loss": 0.0341, + "step": 118060 + }, + { + "epoch": 0.19035, + "grad_norm": 0.07350403070449829, + "learning_rate": 1.9639646743353814e-05, + "loss": 0.0337, + "step": 118070 + }, + { + "epoch": 0.1904, + "grad_norm": 0.07703772187232971, + "learning_rate": 1.9635609279622178e-05, + "loss": 0.0341, + "step": 118080 + }, + { + "epoch": 0.19045, + "grad_norm": 0.07223615795373917, + "learning_rate": 1.9631571962550986e-05, + "loss": 0.0343, + "step": 118090 + }, + { + "epoch": 0.1905, + "grad_norm": 0.06532876193523407, + "learning_rate": 1.9627534792250584e-05, + "loss": 0.034, + "step": 118100 + }, + { + "epoch": 0.19055, + "grad_norm": 0.06084360554814339, + "learning_rate": 1.962349776883138e-05, + "loss": 0.0337, + "step": 118110 + }, + { + "epoch": 0.1906, + "grad_norm": 0.06823130697011948, + "learning_rate": 1.9619460892403713e-05, + "loss": 0.0349, + "step": 118120 + }, + { + "epoch": 0.19065, + "grad_norm": 0.06761247664690018, + "learning_rate": 1.9615424163077963e-05, + "loss": 0.0346, + "step": 118130 + }, + { + "epoch": 0.1907, + "grad_norm": 0.08065170049667358, + "learning_rate": 1.9611387580964504e-05, + "loss": 0.0348, + "step": 118140 + }, + { + "epoch": 0.19075, + "grad_norm": 0.062395017594099045, + "learning_rate": 1.960735114617368e-05, + "loss": 0.0356, + "step": 118150 + }, + { + "epoch": 0.1908, + "grad_norm": 0.07153302431106567, + "learning_rate": 1.960331485881585e-05, + "loss": 0.0351, + "step": 118160 + }, + { + "epoch": 0.19085, + "grad_norm": 0.06883693486452103, + "learning_rate": 1.9599278719001363e-05, + "loss": 0.0354, + "step": 118170 + }, + { + "epoch": 0.1909, + "grad_norm": 0.06124531850218773, + "learning_rate": 1.9595242726840568e-05, + "loss": 0.0352, + "step": 118180 + }, + { + "epoch": 0.19095, + "grad_norm": 0.05614418163895607, + "learning_rate": 1.9591206882443806e-05, + "loss": 0.0338, + "step": 118190 + }, + { + "epoch": 0.191, + "grad_norm": 0.07188361138105392, + "learning_rate": 1.9587171185921406e-05, + "loss": 0.035, + "step": 118200 + }, + { + "epoch": 0.19105, + "grad_norm": 0.06130637601017952, + "learning_rate": 1.9583135637383726e-05, + "loss": 0.0374, + "step": 118210 + }, + { + "epoch": 0.1911, + "grad_norm": 0.06994622200727463, + "learning_rate": 1.9579100236941076e-05, + "loss": 0.0351, + "step": 118220 + }, + { + "epoch": 0.19115, + "grad_norm": 0.06938885897397995, + "learning_rate": 1.9575064984703794e-05, + "loss": 0.0363, + "step": 118230 + }, + { + "epoch": 0.1912, + "grad_norm": 0.06537797302007675, + "learning_rate": 1.9571029880782195e-05, + "loss": 0.0345, + "step": 118240 + }, + { + "epoch": 0.19125, + "grad_norm": 0.07701186090707779, + "learning_rate": 1.9566994925286602e-05, + "loss": 0.0363, + "step": 118250 + }, + { + "epoch": 0.1913, + "grad_norm": 0.06954272836446762, + "learning_rate": 1.956296011832732e-05, + "loss": 0.0336, + "step": 118260 + }, + { + "epoch": 0.19135, + "grad_norm": 0.06378397345542908, + "learning_rate": 1.9558925460014668e-05, + "loss": 0.0356, + "step": 118270 + }, + { + "epoch": 0.1914, + "grad_norm": 0.058148663491010666, + "learning_rate": 1.9554890950458954e-05, + "loss": 0.0347, + "step": 118280 + }, + { + "epoch": 0.19145, + "grad_norm": 0.05619463324546814, + "learning_rate": 1.9550856589770467e-05, + "loss": 0.0343, + "step": 118290 + }, + { + "epoch": 0.1915, + "grad_norm": 0.07490137219429016, + "learning_rate": 1.954682237805953e-05, + "loss": 0.0343, + "step": 118300 + }, + { + "epoch": 0.19155, + "grad_norm": 0.06550087779760361, + "learning_rate": 1.95427883154364e-05, + "loss": 0.0342, + "step": 118310 + }, + { + "epoch": 0.1916, + "grad_norm": 0.058537062257528305, + "learning_rate": 1.9538754402011396e-05, + "loss": 0.0343, + "step": 118320 + }, + { + "epoch": 0.19165, + "grad_norm": 0.07029842585325241, + "learning_rate": 1.95347206378948e-05, + "loss": 0.0339, + "step": 118330 + }, + { + "epoch": 0.1917, + "grad_norm": 0.07698415964841843, + "learning_rate": 1.9530687023196885e-05, + "loss": 0.0338, + "step": 118340 + }, + { + "epoch": 0.19175, + "grad_norm": 0.07064062356948853, + "learning_rate": 1.9526653558027937e-05, + "loss": 0.0364, + "step": 118350 + }, + { + "epoch": 0.1918, + "grad_norm": 0.06120840460062027, + "learning_rate": 1.9522620242498214e-05, + "loss": 0.0353, + "step": 118360 + }, + { + "epoch": 0.19185, + "grad_norm": 0.06712406128644943, + "learning_rate": 1.9518587076718008e-05, + "loss": 0.0343, + "step": 118370 + }, + { + "epoch": 0.1919, + "grad_norm": 0.06007279083132744, + "learning_rate": 1.951455406079756e-05, + "loss": 0.0344, + "step": 118380 + }, + { + "epoch": 0.19195, + "grad_norm": 0.07222392410039902, + "learning_rate": 1.9510521194847142e-05, + "loss": 0.0339, + "step": 118390 + }, + { + "epoch": 0.192, + "grad_norm": 0.06543446332216263, + "learning_rate": 1.9506488478977027e-05, + "loss": 0.0343, + "step": 118400 + }, + { + "epoch": 0.19205, + "grad_norm": 0.06642667949199677, + "learning_rate": 1.9502455913297438e-05, + "loss": 0.0361, + "step": 118410 + }, + { + "epoch": 0.1921, + "grad_norm": 0.06374597549438477, + "learning_rate": 1.949842349791865e-05, + "loss": 0.033, + "step": 118420 + }, + { + "epoch": 0.19215, + "grad_norm": 0.08575686067342758, + "learning_rate": 1.949439123295089e-05, + "loss": 0.0327, + "step": 118430 + }, + { + "epoch": 0.1922, + "grad_norm": 0.08806584030389786, + "learning_rate": 1.9490359118504412e-05, + "loss": 0.0366, + "step": 118440 + }, + { + "epoch": 0.19225, + "grad_norm": 0.0780394971370697, + "learning_rate": 1.948632715468944e-05, + "loss": 0.0351, + "step": 118450 + }, + { + "epoch": 0.1923, + "grad_norm": 0.07681611180305481, + "learning_rate": 1.9482295341616212e-05, + "loss": 0.034, + "step": 118460 + }, + { + "epoch": 0.19235, + "grad_norm": 0.06136760860681534, + "learning_rate": 1.947826367939496e-05, + "loss": 0.0347, + "step": 118470 + }, + { + "epoch": 0.1924, + "grad_norm": 0.09613508731126785, + "learning_rate": 1.9474232168135903e-05, + "loss": 0.0344, + "step": 118480 + }, + { + "epoch": 0.19245, + "grad_norm": 0.061935920268297195, + "learning_rate": 1.9470200807949267e-05, + "loss": 0.0345, + "step": 118490 + }, + { + "epoch": 0.1925, + "grad_norm": 0.06192437931895256, + "learning_rate": 1.946616959894525e-05, + "loss": 0.0345, + "step": 118500 + }, + { + "epoch": 0.19255, + "grad_norm": 0.08948417007923126, + "learning_rate": 1.946213854123409e-05, + "loss": 0.035, + "step": 118510 + }, + { + "epoch": 0.1926, + "grad_norm": 0.0753794014453888, + "learning_rate": 1.9458107634925975e-05, + "loss": 0.034, + "step": 118520 + }, + { + "epoch": 0.19265, + "grad_norm": 0.05915827676653862, + "learning_rate": 1.945407688013112e-05, + "loss": 0.0347, + "step": 118530 + }, + { + "epoch": 0.1927, + "grad_norm": 0.06349233537912369, + "learning_rate": 1.945004627695972e-05, + "loss": 0.0347, + "step": 118540 + }, + { + "epoch": 0.19275, + "grad_norm": 0.06719525903463364, + "learning_rate": 1.9446015825521967e-05, + "loss": 0.0346, + "step": 118550 + }, + { + "epoch": 0.1928, + "grad_norm": 0.06673227250576019, + "learning_rate": 1.944198552592806e-05, + "loss": 0.037, + "step": 118560 + }, + { + "epoch": 0.19285, + "grad_norm": 0.06972498446702957, + "learning_rate": 1.9437955378288173e-05, + "loss": 0.0353, + "step": 118570 + }, + { + "epoch": 0.1929, + "grad_norm": 0.06662328541278839, + "learning_rate": 1.9433925382712493e-05, + "loss": 0.0337, + "step": 118580 + }, + { + "epoch": 0.19295, + "grad_norm": 0.05945248901844025, + "learning_rate": 1.9429895539311215e-05, + "loss": 0.034, + "step": 118590 + }, + { + "epoch": 0.193, + "grad_norm": 0.06642909348011017, + "learning_rate": 1.9425865848194488e-05, + "loss": 0.0345, + "step": 118600 + }, + { + "epoch": 0.19305, + "grad_norm": 0.052793245762586594, + "learning_rate": 1.942183630947251e-05, + "loss": 0.0364, + "step": 118610 + }, + { + "epoch": 0.1931, + "grad_norm": 0.05456728860735893, + "learning_rate": 1.9417806923255415e-05, + "loss": 0.0339, + "step": 118620 + }, + { + "epoch": 0.19315, + "grad_norm": 0.06805785000324249, + "learning_rate": 1.9413777689653393e-05, + "loss": 0.0339, + "step": 118630 + }, + { + "epoch": 0.1932, + "grad_norm": 0.07911752909421921, + "learning_rate": 1.9409748608776585e-05, + "loss": 0.034, + "step": 118640 + }, + { + "epoch": 0.19325, + "grad_norm": 0.07075954228639603, + "learning_rate": 1.9405719680735146e-05, + "loss": 0.0352, + "step": 118650 + }, + { + "epoch": 0.1933, + "grad_norm": 0.06585540622472763, + "learning_rate": 1.940169090563924e-05, + "loss": 0.0342, + "step": 118660 + }, + { + "epoch": 0.19335, + "grad_norm": 0.06112068518996239, + "learning_rate": 1.9397662283598996e-05, + "loss": 0.0338, + "step": 118670 + }, + { + "epoch": 0.1934, + "grad_norm": 0.06887775659561157, + "learning_rate": 1.939363381472456e-05, + "loss": 0.0345, + "step": 118680 + }, + { + "epoch": 0.19345, + "grad_norm": 0.058120857924222946, + "learning_rate": 1.938960549912607e-05, + "loss": 0.0344, + "step": 118690 + }, + { + "epoch": 0.1935, + "grad_norm": 0.06258076429367065, + "learning_rate": 1.938557733691365e-05, + "loss": 0.0333, + "step": 118700 + }, + { + "epoch": 0.19355, + "grad_norm": 0.06609176099300385, + "learning_rate": 1.9381549328197445e-05, + "loss": 0.0355, + "step": 118710 + }, + { + "epoch": 0.1936, + "grad_norm": 0.0720101147890091, + "learning_rate": 1.937752147308757e-05, + "loss": 0.039, + "step": 118720 + }, + { + "epoch": 0.19365, + "grad_norm": 0.07010449469089508, + "learning_rate": 1.9373493771694145e-05, + "loss": 0.0346, + "step": 118730 + }, + { + "epoch": 0.1937, + "grad_norm": 0.07113895565271378, + "learning_rate": 1.9369466224127285e-05, + "loss": 0.0346, + "step": 118740 + }, + { + "epoch": 0.19375, + "grad_norm": 0.06821592897176743, + "learning_rate": 1.936543883049711e-05, + "loss": 0.0351, + "step": 118750 + }, + { + "epoch": 0.1938, + "grad_norm": 0.06012248992919922, + "learning_rate": 1.9361411590913715e-05, + "loss": 0.035, + "step": 118760 + }, + { + "epoch": 0.19385, + "grad_norm": 0.06248101964592934, + "learning_rate": 1.9357384505487204e-05, + "loss": 0.0357, + "step": 118770 + }, + { + "epoch": 0.1939, + "grad_norm": 0.07085489481687546, + "learning_rate": 1.935335757432769e-05, + "loss": 0.0343, + "step": 118780 + }, + { + "epoch": 0.19395, + "grad_norm": 0.06061448156833649, + "learning_rate": 1.9349330797545247e-05, + "loss": 0.0342, + "step": 118790 + }, + { + "epoch": 0.194, + "grad_norm": 0.06255852431058884, + "learning_rate": 1.9345304175249996e-05, + "loss": 0.0356, + "step": 118800 + }, + { + "epoch": 0.19405, + "grad_norm": 0.06098503991961479, + "learning_rate": 1.9341277707551982e-05, + "loss": 0.0356, + "step": 118810 + }, + { + "epoch": 0.1941, + "grad_norm": 0.06389370560646057, + "learning_rate": 1.933725139456133e-05, + "loss": 0.0365, + "step": 118820 + }, + { + "epoch": 0.19415, + "grad_norm": 0.07680466026067734, + "learning_rate": 1.933322523638808e-05, + "loss": 0.0354, + "step": 118830 + }, + { + "epoch": 0.1942, + "grad_norm": 0.0719519779086113, + "learning_rate": 1.932919923314233e-05, + "loss": 0.0341, + "step": 118840 + }, + { + "epoch": 0.19425, + "grad_norm": 0.08088349550962448, + "learning_rate": 1.932517338493415e-05, + "loss": 0.0346, + "step": 118850 + }, + { + "epoch": 0.1943, + "grad_norm": 0.06553579121828079, + "learning_rate": 1.9321147691873586e-05, + "loss": 0.0364, + "step": 118860 + }, + { + "epoch": 0.19435, + "grad_norm": 0.06444582343101501, + "learning_rate": 1.931712215407072e-05, + "loss": 0.034, + "step": 118870 + }, + { + "epoch": 0.1944, + "grad_norm": 0.06452842801809311, + "learning_rate": 1.9313096771635596e-05, + "loss": 0.0343, + "step": 118880 + }, + { + "epoch": 0.19445, + "grad_norm": 0.0822884812951088, + "learning_rate": 1.930907154467826e-05, + "loss": 0.0347, + "step": 118890 + }, + { + "epoch": 0.1945, + "grad_norm": 0.06797374784946442, + "learning_rate": 1.9305046473308792e-05, + "loss": 0.034, + "step": 118900 + }, + { + "epoch": 0.19455, + "grad_norm": 0.06760447472333908, + "learning_rate": 1.9301021557637193e-05, + "loss": 0.0346, + "step": 118910 + }, + { + "epoch": 0.1946, + "grad_norm": 0.06659888476133347, + "learning_rate": 1.9296996797773534e-05, + "loss": 0.0348, + "step": 118920 + }, + { + "epoch": 0.19465, + "grad_norm": 0.05706968903541565, + "learning_rate": 1.9292972193827837e-05, + "loss": 0.0344, + "step": 118930 + }, + { + "epoch": 0.1947, + "grad_norm": 0.07194396108388901, + "learning_rate": 1.928894774591014e-05, + "loss": 0.0343, + "step": 118940 + }, + { + "epoch": 0.19475, + "grad_norm": 0.09228664636611938, + "learning_rate": 1.928492345413046e-05, + "loss": 0.0347, + "step": 118950 + }, + { + "epoch": 0.1948, + "grad_norm": 0.06826335191726685, + "learning_rate": 1.9280899318598827e-05, + "loss": 0.0337, + "step": 118960 + }, + { + "epoch": 0.19485, + "grad_norm": 0.053651031106710434, + "learning_rate": 1.9276875339425262e-05, + "loss": 0.0345, + "step": 118970 + }, + { + "epoch": 0.1949, + "grad_norm": 0.07270044088363647, + "learning_rate": 1.9272851516719773e-05, + "loss": 0.0343, + "step": 118980 + }, + { + "epoch": 0.19495, + "grad_norm": 0.08653170615434647, + "learning_rate": 1.9268827850592374e-05, + "loss": 0.0353, + "step": 118990 + }, + { + "epoch": 0.195, + "grad_norm": 0.09220685809850693, + "learning_rate": 1.926480434115306e-05, + "loss": 0.0355, + "step": 119000 + }, + { + "epoch": 0.19505, + "grad_norm": 0.09737882763147354, + "learning_rate": 1.9260780988511856e-05, + "loss": 0.0357, + "step": 119010 + }, + { + "epoch": 0.1951, + "grad_norm": 0.0808955505490303, + "learning_rate": 1.925675779277873e-05, + "loss": 0.0353, + "step": 119020 + }, + { + "epoch": 0.19515, + "grad_norm": 0.06855889409780502, + "learning_rate": 1.925273475406369e-05, + "loss": 0.0344, + "step": 119030 + }, + { + "epoch": 0.1952, + "grad_norm": 0.07284665107727051, + "learning_rate": 1.9248711872476727e-05, + "loss": 0.0358, + "step": 119040 + }, + { + "epoch": 0.19525, + "grad_norm": 0.06209972873330116, + "learning_rate": 1.924468914812782e-05, + "loss": 0.0328, + "step": 119050 + }, + { + "epoch": 0.1953, + "grad_norm": 0.06067826226353645, + "learning_rate": 1.924066658112695e-05, + "loss": 0.0341, + "step": 119060 + }, + { + "epoch": 0.19535, + "grad_norm": 0.0795523151755333, + "learning_rate": 1.923664417158409e-05, + "loss": 0.0334, + "step": 119070 + }, + { + "epoch": 0.1954, + "grad_norm": 0.06257842481136322, + "learning_rate": 1.9232621919609207e-05, + "loss": 0.0377, + "step": 119080 + }, + { + "epoch": 0.19545, + "grad_norm": 0.060376379638910294, + "learning_rate": 1.922859982531229e-05, + "loss": 0.0329, + "step": 119090 + }, + { + "epoch": 0.1955, + "grad_norm": 0.06914056837558746, + "learning_rate": 1.922457788880327e-05, + "loss": 0.0348, + "step": 119100 + }, + { + "epoch": 0.19555, + "grad_norm": 0.08333203196525574, + "learning_rate": 1.9220556110192136e-05, + "loss": 0.0335, + "step": 119110 + }, + { + "epoch": 0.1956, + "grad_norm": 0.06205040588974953, + "learning_rate": 1.9216534489588812e-05, + "loss": 0.0336, + "step": 119120 + }, + { + "epoch": 0.19565, + "grad_norm": 0.06603963673114777, + "learning_rate": 1.921251302710327e-05, + "loss": 0.0338, + "step": 119130 + }, + { + "epoch": 0.1957, + "grad_norm": 0.057853810489177704, + "learning_rate": 1.9208491722845445e-05, + "loss": 0.0331, + "step": 119140 + }, + { + "epoch": 0.19575, + "grad_norm": 0.09958021342754364, + "learning_rate": 1.920447057692528e-05, + "loss": 0.0332, + "step": 119150 + }, + { + "epoch": 0.1958, + "grad_norm": 0.06875363737344742, + "learning_rate": 1.920044958945272e-05, + "loss": 0.0332, + "step": 119160 + }, + { + "epoch": 0.19585, + "grad_norm": 0.060864198952913284, + "learning_rate": 1.919642876053768e-05, + "loss": 0.0354, + "step": 119170 + }, + { + "epoch": 0.1959, + "grad_norm": 0.08430393040180206, + "learning_rate": 1.9192408090290105e-05, + "loss": 0.0354, + "step": 119180 + }, + { + "epoch": 0.19595, + "grad_norm": 0.11097610741853714, + "learning_rate": 1.9188387578819902e-05, + "loss": 0.039, + "step": 119190 + }, + { + "epoch": 0.196, + "grad_norm": 0.08503743261098862, + "learning_rate": 1.918436722623701e-05, + "loss": 0.0346, + "step": 119200 + }, + { + "epoch": 0.19605, + "grad_norm": 0.07787574082612991, + "learning_rate": 1.918034703265132e-05, + "loss": 0.0334, + "step": 119210 + }, + { + "epoch": 0.1961, + "grad_norm": 0.06750308722257614, + "learning_rate": 1.917632699817276e-05, + "loss": 0.0345, + "step": 119220 + }, + { + "epoch": 0.19615, + "grad_norm": 0.07580902427434921, + "learning_rate": 1.917230712291124e-05, + "loss": 0.0341, + "step": 119230 + }, + { + "epoch": 0.1962, + "grad_norm": 0.07479807734489441, + "learning_rate": 1.9168287406976646e-05, + "loss": 0.0347, + "step": 119240 + }, + { + "epoch": 0.19625, + "grad_norm": 0.060865797102451324, + "learning_rate": 1.916426785047889e-05, + "loss": 0.0345, + "step": 119250 + }, + { + "epoch": 0.1963, + "grad_norm": 0.06803198158740997, + "learning_rate": 1.9160248453527852e-05, + "loss": 0.034, + "step": 119260 + }, + { + "epoch": 0.19635, + "grad_norm": 0.07381574809551239, + "learning_rate": 1.9156229216233434e-05, + "loss": 0.0348, + "step": 119270 + }, + { + "epoch": 0.1964, + "grad_norm": 0.05769243463873863, + "learning_rate": 1.9152210138705508e-05, + "loss": 0.0331, + "step": 119280 + }, + { + "epoch": 0.19645, + "grad_norm": 0.06551354378461838, + "learning_rate": 1.9148191221053955e-05, + "loss": 0.0342, + "step": 119290 + }, + { + "epoch": 0.1965, + "grad_norm": 0.06820987910032272, + "learning_rate": 1.914417246338867e-05, + "loss": 0.0338, + "step": 119300 + }, + { + "epoch": 0.19655, + "grad_norm": 0.05691422149538994, + "learning_rate": 1.9140153865819496e-05, + "loss": 0.0323, + "step": 119310 + }, + { + "epoch": 0.1966, + "grad_norm": 0.06343129277229309, + "learning_rate": 1.913613542845633e-05, + "loss": 0.0338, + "step": 119320 + }, + { + "epoch": 0.19665, + "grad_norm": 0.058741576969623566, + "learning_rate": 1.9132117151409002e-05, + "loss": 0.0327, + "step": 119330 + }, + { + "epoch": 0.1967, + "grad_norm": 0.06788462400436401, + "learning_rate": 1.912809903478739e-05, + "loss": 0.0332, + "step": 119340 + }, + { + "epoch": 0.19675, + "grad_norm": 0.07411598414182663, + "learning_rate": 1.912408107870135e-05, + "loss": 0.0347, + "step": 119350 + }, + { + "epoch": 0.1968, + "grad_norm": 0.06857981532812119, + "learning_rate": 1.9120063283260722e-05, + "loss": 0.0332, + "step": 119360 + }, + { + "epoch": 0.19685, + "grad_norm": 0.05873365327715874, + "learning_rate": 1.9116045648575358e-05, + "loss": 0.0333, + "step": 119370 + }, + { + "epoch": 0.1969, + "grad_norm": 0.06224021315574646, + "learning_rate": 1.9112028174755094e-05, + "loss": 0.0343, + "step": 119380 + }, + { + "epoch": 0.19695, + "grad_norm": 0.06388100236654282, + "learning_rate": 1.910801086190977e-05, + "loss": 0.0345, + "step": 119390 + }, + { + "epoch": 0.197, + "grad_norm": 0.06213835999369621, + "learning_rate": 1.910399371014921e-05, + "loss": 0.0334, + "step": 119400 + }, + { + "epoch": 0.19705, + "grad_norm": 0.054965998977422714, + "learning_rate": 1.9099976719583245e-05, + "loss": 0.0341, + "step": 119410 + }, + { + "epoch": 0.1971, + "grad_norm": 0.06047357618808746, + "learning_rate": 1.909595989032171e-05, + "loss": 0.0348, + "step": 119420 + }, + { + "epoch": 0.19715, + "grad_norm": 0.062203872948884964, + "learning_rate": 1.9091943222474407e-05, + "loss": 0.0351, + "step": 119430 + }, + { + "epoch": 0.1972, + "grad_norm": 0.07518782466650009, + "learning_rate": 1.908792671615116e-05, + "loss": 0.0356, + "step": 119440 + }, + { + "epoch": 0.19725, + "grad_norm": 0.08392120152711868, + "learning_rate": 1.9083910371461772e-05, + "loss": 0.0355, + "step": 119450 + }, + { + "epoch": 0.1973, + "grad_norm": 0.09785937517881393, + "learning_rate": 1.9079894188516056e-05, + "loss": 0.0341, + "step": 119460 + }, + { + "epoch": 0.19735, + "grad_norm": 0.07539359480142593, + "learning_rate": 1.9075878167423805e-05, + "loss": 0.0342, + "step": 119470 + }, + { + "epoch": 0.1974, + "grad_norm": 0.10306426137685776, + "learning_rate": 1.907186230829482e-05, + "loss": 0.0349, + "step": 119480 + }, + { + "epoch": 0.19745, + "grad_norm": 0.08121484518051147, + "learning_rate": 1.90678466112389e-05, + "loss": 0.0355, + "step": 119490 + }, + { + "epoch": 0.1975, + "grad_norm": 0.07452220469713211, + "learning_rate": 1.9063831076365807e-05, + "loss": 0.0348, + "step": 119500 + }, + { + "epoch": 0.19755, + "grad_norm": 0.0740089938044548, + "learning_rate": 1.9059815703785362e-05, + "loss": 0.0342, + "step": 119510 + }, + { + "epoch": 0.1976, + "grad_norm": 0.09367933869361877, + "learning_rate": 1.905580049360731e-05, + "loss": 0.035, + "step": 119520 + }, + { + "epoch": 0.19765, + "grad_norm": 0.07209538668394089, + "learning_rate": 1.9051785445941446e-05, + "loss": 0.0346, + "step": 119530 + }, + { + "epoch": 0.1977, + "grad_norm": 0.06270518898963928, + "learning_rate": 1.9047770560897532e-05, + "loss": 0.0346, + "step": 119540 + }, + { + "epoch": 0.19775, + "grad_norm": 0.06139799952507019, + "learning_rate": 1.9043755838585334e-05, + "loss": 0.0335, + "step": 119550 + }, + { + "epoch": 0.1978, + "grad_norm": 0.05551351234316826, + "learning_rate": 1.9039741279114617e-05, + "loss": 0.0334, + "step": 119560 + }, + { + "epoch": 0.19785, + "grad_norm": 0.06037990376353264, + "learning_rate": 1.903572688259513e-05, + "loss": 0.0344, + "step": 119570 + }, + { + "epoch": 0.1979, + "grad_norm": 0.05958827584981918, + "learning_rate": 1.9031712649136634e-05, + "loss": 0.0334, + "step": 119580 + }, + { + "epoch": 0.19795, + "grad_norm": 0.0590079165995121, + "learning_rate": 1.9027698578848867e-05, + "loss": 0.034, + "step": 119590 + }, + { + "epoch": 0.198, + "grad_norm": 0.058510467410087585, + "learning_rate": 1.9023684671841575e-05, + "loss": 0.0345, + "step": 119600 + }, + { + "epoch": 0.19805, + "grad_norm": 0.09045732766389847, + "learning_rate": 1.9019670928224513e-05, + "loss": 0.0336, + "step": 119610 + }, + { + "epoch": 0.1981, + "grad_norm": 0.08185352385044098, + "learning_rate": 1.9015657348107384e-05, + "loss": 0.0345, + "step": 119620 + }, + { + "epoch": 0.19815, + "grad_norm": 0.07754584401845932, + "learning_rate": 1.901164393159994e-05, + "loss": 0.0327, + "step": 119630 + }, + { + "epoch": 0.1982, + "grad_norm": 0.09113860875368118, + "learning_rate": 1.9007630678811905e-05, + "loss": 0.0325, + "step": 119640 + }, + { + "epoch": 0.19825, + "grad_norm": 0.06712755560874939, + "learning_rate": 1.9003617589852998e-05, + "loss": 0.0332, + "step": 119650 + }, + { + "epoch": 0.1983, + "grad_norm": 0.08282347023487091, + "learning_rate": 1.899960466483293e-05, + "loss": 0.033, + "step": 119660 + }, + { + "epoch": 0.19835, + "grad_norm": 0.06524205207824707, + "learning_rate": 1.899559190386141e-05, + "loss": 0.0317, + "step": 119670 + }, + { + "epoch": 0.1984, + "grad_norm": 0.055411096662282944, + "learning_rate": 1.899157930704816e-05, + "loss": 0.0329, + "step": 119680 + }, + { + "epoch": 0.19845, + "grad_norm": 0.07240267843008041, + "learning_rate": 1.8987566874502874e-05, + "loss": 0.0329, + "step": 119690 + }, + { + "epoch": 0.1985, + "grad_norm": 0.055038850754499435, + "learning_rate": 1.8983554606335254e-05, + "loss": 0.0374, + "step": 119700 + }, + { + "epoch": 0.19855, + "grad_norm": 0.08657161891460419, + "learning_rate": 1.897954250265498e-05, + "loss": 0.0331, + "step": 119710 + }, + { + "epoch": 0.1986, + "grad_norm": 0.08538049459457397, + "learning_rate": 1.8975530563571752e-05, + "loss": 0.0341, + "step": 119720 + }, + { + "epoch": 0.19865, + "grad_norm": 0.06875836849212646, + "learning_rate": 1.8971518789195266e-05, + "loss": 0.036, + "step": 119730 + }, + { + "epoch": 0.1987, + "grad_norm": 0.07377111911773682, + "learning_rate": 1.8967507179635187e-05, + "loss": 0.0337, + "step": 119740 + }, + { + "epoch": 0.19875, + "grad_norm": 0.07042787224054337, + "learning_rate": 1.8963495735001197e-05, + "loss": 0.0348, + "step": 119750 + }, + { + "epoch": 0.1988, + "grad_norm": 0.06096513941884041, + "learning_rate": 1.895948445540296e-05, + "loss": 0.0331, + "step": 119760 + }, + { + "epoch": 0.19885, + "grad_norm": 0.07015082240104675, + "learning_rate": 1.895547334095016e-05, + "loss": 0.0348, + "step": 119770 + }, + { + "epoch": 0.1989, + "grad_norm": 0.06189883127808571, + "learning_rate": 1.8951462391752436e-05, + "loss": 0.0342, + "step": 119780 + }, + { + "epoch": 0.19895, + "grad_norm": 0.06042921170592308, + "learning_rate": 1.8947451607919457e-05, + "loss": 0.0357, + "step": 119790 + }, + { + "epoch": 0.199, + "grad_norm": 0.06005643680691719, + "learning_rate": 1.894344098956089e-05, + "loss": 0.0346, + "step": 119800 + }, + { + "epoch": 0.19905, + "grad_norm": 0.06529024243354797, + "learning_rate": 1.8939430536786357e-05, + "loss": 0.0354, + "step": 119810 + }, + { + "epoch": 0.1991, + "grad_norm": 0.06161453202366829, + "learning_rate": 1.8935420249705533e-05, + "loss": 0.0336, + "step": 119820 + }, + { + "epoch": 0.19915, + "grad_norm": 0.06204698234796524, + "learning_rate": 1.8931410128428024e-05, + "loss": 0.034, + "step": 119830 + }, + { + "epoch": 0.1992, + "grad_norm": 0.06497624516487122, + "learning_rate": 1.8927400173063493e-05, + "loss": 0.034, + "step": 119840 + }, + { + "epoch": 0.19925, + "grad_norm": 0.07811807096004486, + "learning_rate": 1.892339038372155e-05, + "loss": 0.0326, + "step": 119850 + }, + { + "epoch": 0.1993, + "grad_norm": 0.08065193146467209, + "learning_rate": 1.8919380760511838e-05, + "loss": 0.0332, + "step": 119860 + }, + { + "epoch": 0.19935, + "grad_norm": 0.06446090340614319, + "learning_rate": 1.8915371303543973e-05, + "loss": 0.0318, + "step": 119870 + }, + { + "epoch": 0.1994, + "grad_norm": 0.06551788002252579, + "learning_rate": 1.8911362012927565e-05, + "loss": 0.0327, + "step": 119880 + }, + { + "epoch": 0.19945, + "grad_norm": 0.08348044008016586, + "learning_rate": 1.890735288877224e-05, + "loss": 0.0337, + "step": 119890 + }, + { + "epoch": 0.1995, + "grad_norm": 0.08474072068929672, + "learning_rate": 1.890334393118759e-05, + "loss": 0.0334, + "step": 119900 + }, + { + "epoch": 0.19955, + "grad_norm": 0.09361223876476288, + "learning_rate": 1.8899335140283225e-05, + "loss": 0.0335, + "step": 119910 + }, + { + "epoch": 0.1996, + "grad_norm": 0.0642770305275917, + "learning_rate": 1.8895326516168755e-05, + "loss": 0.0327, + "step": 119920 + }, + { + "epoch": 0.19965, + "grad_norm": 0.06142275780439377, + "learning_rate": 1.8891318058953756e-05, + "loss": 0.0329, + "step": 119930 + }, + { + "epoch": 0.1997, + "grad_norm": 0.07025183737277985, + "learning_rate": 1.8887309768747834e-05, + "loss": 0.0321, + "step": 119940 + }, + { + "epoch": 0.19975, + "grad_norm": 0.05500560998916626, + "learning_rate": 1.8883301645660563e-05, + "loss": 0.0328, + "step": 119950 + }, + { + "epoch": 0.1998, + "grad_norm": 0.08264268934726715, + "learning_rate": 1.887929368980153e-05, + "loss": 0.0342, + "step": 119960 + }, + { + "epoch": 0.19985, + "grad_norm": 0.07302163541316986, + "learning_rate": 1.8875285901280303e-05, + "loss": 0.032, + "step": 119970 + }, + { + "epoch": 0.1999, + "grad_norm": 0.07754736393690109, + "learning_rate": 1.8871278280206458e-05, + "loss": 0.0341, + "step": 119980 + }, + { + "epoch": 0.19995, + "grad_norm": 0.06747692078351974, + "learning_rate": 1.886727082668957e-05, + "loss": 0.035, + "step": 119990 + }, + { + "epoch": 0.2, + "grad_norm": 0.07290555536746979, + "learning_rate": 1.886326354083918e-05, + "loss": 0.0361, + "step": 120000 + }, + { + "epoch": 0.20005, + "grad_norm": 0.06327662616968155, + "learning_rate": 1.8859256422764878e-05, + "loss": 0.0329, + "step": 120010 + }, + { + "epoch": 0.2001, + "grad_norm": 0.12843845784664154, + "learning_rate": 1.885524947257618e-05, + "loss": 0.0368, + "step": 120020 + }, + { + "epoch": 0.20015, + "grad_norm": 0.1008138656616211, + "learning_rate": 1.8851242690382672e-05, + "loss": 0.0337, + "step": 120030 + }, + { + "epoch": 0.2002, + "grad_norm": 0.09233039617538452, + "learning_rate": 1.884723607629386e-05, + "loss": 0.0358, + "step": 120040 + }, + { + "epoch": 0.20025, + "grad_norm": 0.07306820154190063, + "learning_rate": 1.884322963041931e-05, + "loss": 0.034, + "step": 120050 + }, + { + "epoch": 0.2003, + "grad_norm": 0.09731920063495636, + "learning_rate": 1.8839223352868553e-05, + "loss": 0.035, + "step": 120060 + }, + { + "epoch": 0.20035, + "grad_norm": 0.07172708213329315, + "learning_rate": 1.8835217243751107e-05, + "loss": 0.0332, + "step": 120070 + }, + { + "epoch": 0.2004, + "grad_norm": 0.0703640952706337, + "learning_rate": 1.8831211303176514e-05, + "loss": 0.0332, + "step": 120080 + }, + { + "epoch": 0.20045, + "grad_norm": 0.06962746381759644, + "learning_rate": 1.8827205531254282e-05, + "loss": 0.0334, + "step": 120090 + }, + { + "epoch": 0.2005, + "grad_norm": 0.07666989415884018, + "learning_rate": 1.8823199928093923e-05, + "loss": 0.037, + "step": 120100 + }, + { + "epoch": 0.20055, + "grad_norm": 0.05687066167593002, + "learning_rate": 1.8819194493804976e-05, + "loss": 0.0333, + "step": 120110 + }, + { + "epoch": 0.2006, + "grad_norm": 0.05135420337319374, + "learning_rate": 1.881518922849691e-05, + "loss": 0.0344, + "step": 120120 + }, + { + "epoch": 0.20065, + "grad_norm": 0.06644924730062485, + "learning_rate": 1.8811184132279265e-05, + "loss": 0.0347, + "step": 120130 + }, + { + "epoch": 0.2007, + "grad_norm": 0.07128880172967911, + "learning_rate": 1.880717920526151e-05, + "loss": 0.0342, + "step": 120140 + }, + { + "epoch": 0.20075, + "grad_norm": 0.06110058352351189, + "learning_rate": 1.8803174447553157e-05, + "loss": 0.0369, + "step": 120150 + }, + { + "epoch": 0.2008, + "grad_norm": 0.062450435012578964, + "learning_rate": 1.8799169859263676e-05, + "loss": 0.0347, + "step": 120160 + }, + { + "epoch": 0.20085, + "grad_norm": 0.056189533323049545, + "learning_rate": 1.8795165440502564e-05, + "loss": 0.0338, + "step": 120170 + }, + { + "epoch": 0.2009, + "grad_norm": 0.05827772989869118, + "learning_rate": 1.879116119137931e-05, + "loss": 0.0341, + "step": 120180 + }, + { + "epoch": 0.20095, + "grad_norm": 0.0684100016951561, + "learning_rate": 1.878715711200336e-05, + "loss": 0.0334, + "step": 120190 + }, + { + "epoch": 0.201, + "grad_norm": 0.06133823096752167, + "learning_rate": 1.8783153202484213e-05, + "loss": 0.0363, + "step": 120200 + }, + { + "epoch": 0.20105, + "grad_norm": 0.06145724281668663, + "learning_rate": 1.877914946293131e-05, + "loss": 0.0353, + "step": 120210 + }, + { + "epoch": 0.2011, + "grad_norm": 0.07646960020065308, + "learning_rate": 1.877514589345414e-05, + "loss": 0.0344, + "step": 120220 + }, + { + "epoch": 0.20115, + "grad_norm": 0.057070545852184296, + "learning_rate": 1.8771142494162124e-05, + "loss": 0.0343, + "step": 120230 + }, + { + "epoch": 0.2012, + "grad_norm": 0.06414657086133957, + "learning_rate": 1.876713926516474e-05, + "loss": 0.0347, + "step": 120240 + }, + { + "epoch": 0.20125, + "grad_norm": 0.0708075612783432, + "learning_rate": 1.8763136206571432e-05, + "loss": 0.0346, + "step": 120250 + }, + { + "epoch": 0.2013, + "grad_norm": 0.06626040488481522, + "learning_rate": 1.875913331849163e-05, + "loss": 0.0348, + "step": 120260 + }, + { + "epoch": 0.20135, + "grad_norm": 0.06647317111492157, + "learning_rate": 1.8755130601034787e-05, + "loss": 0.0344, + "step": 120270 + }, + { + "epoch": 0.2014, + "grad_norm": 0.06419264525175095, + "learning_rate": 1.875112805431032e-05, + "loss": 0.0334, + "step": 120280 + }, + { + "epoch": 0.20145, + "grad_norm": 0.07849488407373428, + "learning_rate": 1.8747125678427658e-05, + "loss": 0.0345, + "step": 120290 + }, + { + "epoch": 0.2015, + "grad_norm": 0.06297898292541504, + "learning_rate": 1.874312347349625e-05, + "loss": 0.0344, + "step": 120300 + }, + { + "epoch": 0.20155, + "grad_norm": 0.0669475868344307, + "learning_rate": 1.8739121439625474e-05, + "loss": 0.0341, + "step": 120310 + }, + { + "epoch": 0.2016, + "grad_norm": 0.07643059641122818, + "learning_rate": 1.8735119576924787e-05, + "loss": 0.0361, + "step": 120320 + }, + { + "epoch": 0.20165, + "grad_norm": 0.06809680163860321, + "learning_rate": 1.8731117885503558e-05, + "loss": 0.0358, + "step": 120330 + }, + { + "epoch": 0.2017, + "grad_norm": 0.06518039107322693, + "learning_rate": 1.8727116365471226e-05, + "loss": 0.0361, + "step": 120340 + }, + { + "epoch": 0.20175, + "grad_norm": 0.05954527109861374, + "learning_rate": 1.8723115016937164e-05, + "loss": 0.0353, + "step": 120350 + }, + { + "epoch": 0.2018, + "grad_norm": 0.07108394801616669, + "learning_rate": 1.8719113840010784e-05, + "loss": 0.0341, + "step": 120360 + }, + { + "epoch": 0.20185, + "grad_norm": 0.07770578563213348, + "learning_rate": 1.8715112834801476e-05, + "loss": 0.0333, + "step": 120370 + }, + { + "epoch": 0.2019, + "grad_norm": 0.07711070030927658, + "learning_rate": 1.8711112001418618e-05, + "loss": 0.0334, + "step": 120380 + }, + { + "epoch": 0.20195, + "grad_norm": 0.07317481189966202, + "learning_rate": 1.87071113399716e-05, + "loss": 0.0357, + "step": 120390 + }, + { + "epoch": 0.202, + "grad_norm": 0.06111655384302139, + "learning_rate": 1.870311085056979e-05, + "loss": 0.0348, + "step": 120400 + }, + { + "epoch": 0.20205, + "grad_norm": 0.062063030898571014, + "learning_rate": 1.8699110533322565e-05, + "loss": 0.0332, + "step": 120410 + }, + { + "epoch": 0.2021, + "grad_norm": 0.05810019373893738, + "learning_rate": 1.869511038833928e-05, + "loss": 0.0327, + "step": 120420 + }, + { + "epoch": 0.20215, + "grad_norm": 0.05780354142189026, + "learning_rate": 1.869111041572932e-05, + "loss": 0.0339, + "step": 120430 + }, + { + "epoch": 0.2022, + "grad_norm": 0.054892897605895996, + "learning_rate": 1.868711061560203e-05, + "loss": 0.0333, + "step": 120440 + }, + { + "epoch": 0.20225, + "grad_norm": 0.05134418606758118, + "learning_rate": 1.868311098806676e-05, + "loss": 0.0324, + "step": 120450 + }, + { + "epoch": 0.2023, + "grad_norm": 0.05112555995583534, + "learning_rate": 1.8679111533232867e-05, + "loss": 0.0313, + "step": 120460 + }, + { + "epoch": 0.20235, + "grad_norm": 0.06146746128797531, + "learning_rate": 1.867511225120969e-05, + "loss": 0.0342, + "step": 120470 + }, + { + "epoch": 0.2024, + "grad_norm": 0.06157961115241051, + "learning_rate": 1.8671113142106566e-05, + "loss": 0.0327, + "step": 120480 + }, + { + "epoch": 0.20245, + "grad_norm": 0.07634248584508896, + "learning_rate": 1.866711420603283e-05, + "loss": 0.0336, + "step": 120490 + }, + { + "epoch": 0.2025, + "grad_norm": 0.0633772686123848, + "learning_rate": 1.866311544309781e-05, + "loss": 0.0331, + "step": 120500 + }, + { + "epoch": 0.20255, + "grad_norm": 0.08006856590509415, + "learning_rate": 1.8659116853410847e-05, + "loss": 0.0356, + "step": 120510 + }, + { + "epoch": 0.2026, + "grad_norm": 0.06903557479381561, + "learning_rate": 1.8655118437081225e-05, + "loss": 0.0347, + "step": 120520 + }, + { + "epoch": 0.20265, + "grad_norm": 0.05871882662177086, + "learning_rate": 1.8651120194218305e-05, + "loss": 0.0335, + "step": 120530 + }, + { + "epoch": 0.2027, + "grad_norm": 0.07219228148460388, + "learning_rate": 1.8647122124931356e-05, + "loss": 0.0332, + "step": 120540 + }, + { + "epoch": 0.20275, + "grad_norm": 0.06969571113586426, + "learning_rate": 1.8643124229329705e-05, + "loss": 0.0323, + "step": 120550 + }, + { + "epoch": 0.2028, + "grad_norm": 0.06328929215669632, + "learning_rate": 1.8639126507522654e-05, + "loss": 0.0332, + "step": 120560 + }, + { + "epoch": 0.20285, + "grad_norm": 0.09350479394197464, + "learning_rate": 1.863512895961949e-05, + "loss": 0.0354, + "step": 120570 + }, + { + "epoch": 0.2029, + "grad_norm": 0.05679807439446449, + "learning_rate": 1.8631131585729517e-05, + "loss": 0.0338, + "step": 120580 + }, + { + "epoch": 0.20295, + "grad_norm": 0.05669408664107323, + "learning_rate": 1.8627134385962007e-05, + "loss": 0.0337, + "step": 120590 + }, + { + "epoch": 0.203, + "grad_norm": 0.07240969687700272, + "learning_rate": 1.862313736042625e-05, + "loss": 0.0353, + "step": 120600 + }, + { + "epoch": 0.20305, + "grad_norm": 0.07339102029800415, + "learning_rate": 1.861914050923152e-05, + "loss": 0.0381, + "step": 120610 + }, + { + "epoch": 0.2031, + "grad_norm": 0.06361488997936249, + "learning_rate": 1.8615143832487086e-05, + "loss": 0.0336, + "step": 120620 + }, + { + "epoch": 0.20315, + "grad_norm": 0.07316234707832336, + "learning_rate": 1.8611147330302233e-05, + "loss": 0.0333, + "step": 120630 + }, + { + "epoch": 0.2032, + "grad_norm": 0.058388516306877136, + "learning_rate": 1.8607151002786206e-05, + "loss": 0.0333, + "step": 120640 + }, + { + "epoch": 0.20325, + "grad_norm": 0.06330445408821106, + "learning_rate": 1.8603154850048275e-05, + "loss": 0.0344, + "step": 120650 + }, + { + "epoch": 0.2033, + "grad_norm": 0.07335477322340012, + "learning_rate": 1.859915887219768e-05, + "loss": 0.0343, + "step": 120660 + }, + { + "epoch": 0.20335, + "grad_norm": 0.07023430615663528, + "learning_rate": 1.859516306934368e-05, + "loss": 0.0334, + "step": 120670 + }, + { + "epoch": 0.2034, + "grad_norm": 0.06397784501314163, + "learning_rate": 1.8591167441595513e-05, + "loss": 0.0345, + "step": 120680 + }, + { + "epoch": 0.20345, + "grad_norm": 0.13211436569690704, + "learning_rate": 1.858717198906242e-05, + "loss": 0.0368, + "step": 120690 + }, + { + "epoch": 0.2035, + "grad_norm": 0.08479201048612595, + "learning_rate": 1.858317671185364e-05, + "loss": 0.0345, + "step": 120700 + }, + { + "epoch": 0.20355, + "grad_norm": 0.0865594744682312, + "learning_rate": 1.857918161007839e-05, + "loss": 0.0335, + "step": 120710 + }, + { + "epoch": 0.2036, + "grad_norm": 0.07507319748401642, + "learning_rate": 1.8575186683845917e-05, + "loss": 0.0335, + "step": 120720 + }, + { + "epoch": 0.20365, + "grad_norm": 0.07103493064641953, + "learning_rate": 1.857119193326541e-05, + "loss": 0.0353, + "step": 120730 + }, + { + "epoch": 0.2037, + "grad_norm": 0.062294647097587585, + "learning_rate": 1.8567197358446108e-05, + "loss": 0.0376, + "step": 120740 + }, + { + "epoch": 0.20375, + "grad_norm": 0.06791318953037262, + "learning_rate": 1.8563202959497212e-05, + "loss": 0.0356, + "step": 120750 + }, + { + "epoch": 0.2038, + "grad_norm": 0.06171085312962532, + "learning_rate": 1.855920873652793e-05, + "loss": 0.0342, + "step": 120760 + }, + { + "epoch": 0.20385, + "grad_norm": 0.07706128060817719, + "learning_rate": 1.8555214689647466e-05, + "loss": 0.0351, + "step": 120770 + }, + { + "epoch": 0.2039, + "grad_norm": 0.07189098745584488, + "learning_rate": 1.8551220818965004e-05, + "loss": 0.034, + "step": 120780 + }, + { + "epoch": 0.20395, + "grad_norm": 0.08183571696281433, + "learning_rate": 1.854722712458975e-05, + "loss": 0.035, + "step": 120790 + }, + { + "epoch": 0.204, + "grad_norm": 0.07607299834489822, + "learning_rate": 1.8543233606630874e-05, + "loss": 0.0364, + "step": 120800 + }, + { + "epoch": 0.20405, + "grad_norm": 0.08183683454990387, + "learning_rate": 1.8539240265197562e-05, + "loss": 0.0356, + "step": 120810 + }, + { + "epoch": 0.2041, + "grad_norm": 0.08728110045194626, + "learning_rate": 1.8535247100399012e-05, + "loss": 0.0353, + "step": 120820 + }, + { + "epoch": 0.20415, + "grad_norm": 0.07391119748353958, + "learning_rate": 1.8531254112344356e-05, + "loss": 0.0372, + "step": 120830 + }, + { + "epoch": 0.2042, + "grad_norm": 0.0634394958615303, + "learning_rate": 1.8527261301142796e-05, + "loss": 0.0354, + "step": 120840 + }, + { + "epoch": 0.20425, + "grad_norm": 0.06386538594961166, + "learning_rate": 1.8523268666903475e-05, + "loss": 0.0355, + "step": 120850 + }, + { + "epoch": 0.2043, + "grad_norm": 0.061842963099479675, + "learning_rate": 1.851927620973556e-05, + "loss": 0.0341, + "step": 120860 + }, + { + "epoch": 0.20435, + "grad_norm": 0.07959350943565369, + "learning_rate": 1.8515283929748194e-05, + "loss": 0.0402, + "step": 120870 + }, + { + "epoch": 0.2044, + "grad_norm": 0.06257916241884232, + "learning_rate": 1.851129182705053e-05, + "loss": 0.0342, + "step": 120880 + }, + { + "epoch": 0.20445, + "grad_norm": 0.06070181727409363, + "learning_rate": 1.8507299901751718e-05, + "loss": 0.0342, + "step": 120890 + }, + { + "epoch": 0.2045, + "grad_norm": 0.07804949581623077, + "learning_rate": 1.850330815396087e-05, + "loss": 0.0354, + "step": 120900 + }, + { + "epoch": 0.20455, + "grad_norm": 0.06420771032571793, + "learning_rate": 1.8499316583787157e-05, + "loss": 0.0331, + "step": 120910 + }, + { + "epoch": 0.2046, + "grad_norm": 0.05669880285859108, + "learning_rate": 1.8495325191339668e-05, + "loss": 0.0356, + "step": 120920 + }, + { + "epoch": 0.20465, + "grad_norm": 0.0698152482509613, + "learning_rate": 1.8491333976727553e-05, + "loss": 0.0335, + "step": 120930 + }, + { + "epoch": 0.2047, + "grad_norm": 0.06292180716991425, + "learning_rate": 1.8487342940059926e-05, + "loss": 0.0333, + "step": 120940 + }, + { + "epoch": 0.20475, + "grad_norm": 0.06293244659900665, + "learning_rate": 1.8483352081445886e-05, + "loss": 0.0332, + "step": 120950 + }, + { + "epoch": 0.2048, + "grad_norm": 0.059083491563797, + "learning_rate": 1.8479361400994567e-05, + "loss": 0.0343, + "step": 120960 + }, + { + "epoch": 0.20485, + "grad_norm": 0.07200410962104797, + "learning_rate": 1.8475370898815043e-05, + "loss": 0.0327, + "step": 120970 + }, + { + "epoch": 0.2049, + "grad_norm": 0.06585627049207687, + "learning_rate": 1.847138057501644e-05, + "loss": 0.034, + "step": 120980 + }, + { + "epoch": 0.20495, + "grad_norm": 0.07143794000148773, + "learning_rate": 1.8467390429707825e-05, + "loss": 0.034, + "step": 120990 + }, + { + "epoch": 0.205, + "grad_norm": 0.08228740096092224, + "learning_rate": 1.8463400462998302e-05, + "loss": 0.0336, + "step": 121000 + }, + { + "epoch": 0.20505, + "grad_norm": 0.06945434957742691, + "learning_rate": 1.8459410674996973e-05, + "loss": 0.0336, + "step": 121010 + }, + { + "epoch": 0.2051, + "grad_norm": 0.0655989870429039, + "learning_rate": 1.845542106581288e-05, + "loss": 0.0346, + "step": 121020 + }, + { + "epoch": 0.20515, + "grad_norm": 0.060558054596185684, + "learning_rate": 1.845143163555513e-05, + "loss": 0.0348, + "step": 121030 + }, + { + "epoch": 0.2052, + "grad_norm": 0.06830138713121414, + "learning_rate": 1.844744238433277e-05, + "loss": 0.0343, + "step": 121040 + }, + { + "epoch": 0.20525, + "grad_norm": 0.07038040459156036, + "learning_rate": 1.8443453312254876e-05, + "loss": 0.0331, + "step": 121050 + }, + { + "epoch": 0.2053, + "grad_norm": 0.06014655902981758, + "learning_rate": 1.843946441943051e-05, + "loss": 0.0358, + "step": 121060 + }, + { + "epoch": 0.20535, + "grad_norm": 0.06151202321052551, + "learning_rate": 1.8435475705968712e-05, + "loss": 0.0351, + "step": 121070 + }, + { + "epoch": 0.2054, + "grad_norm": 0.05301975831389427, + "learning_rate": 1.843148717197855e-05, + "loss": 0.0339, + "step": 121080 + }, + { + "epoch": 0.20545, + "grad_norm": 0.05595868080854416, + "learning_rate": 1.842749881756906e-05, + "loss": 0.0346, + "step": 121090 + }, + { + "epoch": 0.2055, + "grad_norm": 0.062360938638448715, + "learning_rate": 1.8423510642849284e-05, + "loss": 0.0355, + "step": 121100 + }, + { + "epoch": 0.20555, + "grad_norm": 0.06301844865083694, + "learning_rate": 1.8419522647928243e-05, + "loss": 0.0351, + "step": 121110 + }, + { + "epoch": 0.2056, + "grad_norm": 0.060403723269701004, + "learning_rate": 1.8415534832914995e-05, + "loss": 0.0329, + "step": 121120 + }, + { + "epoch": 0.20565, + "grad_norm": 0.0839466080069542, + "learning_rate": 1.841154719791855e-05, + "loss": 0.04, + "step": 121130 + }, + { + "epoch": 0.2057, + "grad_norm": 0.06863157451152802, + "learning_rate": 1.8407559743047924e-05, + "loss": 0.034, + "step": 121140 + }, + { + "epoch": 0.20575, + "grad_norm": 0.06865214556455612, + "learning_rate": 1.8403572468412145e-05, + "loss": 0.0341, + "step": 121150 + }, + { + "epoch": 0.2058, + "grad_norm": 0.06753120571374893, + "learning_rate": 1.8399585374120214e-05, + "loss": 0.0337, + "step": 121160 + }, + { + "epoch": 0.20585, + "grad_norm": 0.06102924793958664, + "learning_rate": 1.8395598460281137e-05, + "loss": 0.0339, + "step": 121170 + }, + { + "epoch": 0.2059, + "grad_norm": 0.07939987629652023, + "learning_rate": 1.839161172700392e-05, + "loss": 0.0342, + "step": 121180 + }, + { + "epoch": 0.20595, + "grad_norm": 0.06291405856609344, + "learning_rate": 1.8387625174397543e-05, + "loss": 0.0335, + "step": 121190 + }, + { + "epoch": 0.206, + "grad_norm": 0.05723979324102402, + "learning_rate": 1.8383638802571028e-05, + "loss": 0.0332, + "step": 121200 + }, + { + "epoch": 0.20605, + "grad_norm": 0.062456369400024414, + "learning_rate": 1.837965261163333e-05, + "loss": 0.0331, + "step": 121210 + }, + { + "epoch": 0.2061, + "grad_norm": 0.058690495789051056, + "learning_rate": 1.837566660169346e-05, + "loss": 0.0325, + "step": 121220 + }, + { + "epoch": 0.20615, + "grad_norm": 0.06099254637956619, + "learning_rate": 1.8371680772860353e-05, + "loss": 0.0339, + "step": 121230 + }, + { + "epoch": 0.2062, + "grad_norm": 0.05650908127427101, + "learning_rate": 1.8367695125243023e-05, + "loss": 0.0333, + "step": 121240 + }, + { + "epoch": 0.20625, + "grad_norm": 0.05411923676729202, + "learning_rate": 1.8363709658950402e-05, + "loss": 0.0345, + "step": 121250 + }, + { + "epoch": 0.2063, + "grad_norm": 0.07875292003154755, + "learning_rate": 1.835972437409147e-05, + "loss": 0.0344, + "step": 121260 + }, + { + "epoch": 0.20635, + "grad_norm": 0.07497681677341461, + "learning_rate": 1.8355739270775184e-05, + "loss": 0.0342, + "step": 121270 + }, + { + "epoch": 0.2064, + "grad_norm": 0.07842659205198288, + "learning_rate": 1.8351754349110484e-05, + "loss": 0.0345, + "step": 121280 + }, + { + "epoch": 0.20645, + "grad_norm": 0.05954407900571823, + "learning_rate": 1.834776960920633e-05, + "loss": 0.0342, + "step": 121290 + }, + { + "epoch": 0.2065, + "grad_norm": 0.057787492871284485, + "learning_rate": 1.8343785051171647e-05, + "loss": 0.0329, + "step": 121300 + }, + { + "epoch": 0.20655, + "grad_norm": 0.06499254703521729, + "learning_rate": 1.8339800675115376e-05, + "loss": 0.0359, + "step": 121310 + }, + { + "epoch": 0.2066, + "grad_norm": 0.06317613273859024, + "learning_rate": 1.8335816481146466e-05, + "loss": 0.033, + "step": 121320 + }, + { + "epoch": 0.20665, + "grad_norm": 0.055468276143074036, + "learning_rate": 1.833183246937382e-05, + "loss": 0.0328, + "step": 121330 + }, + { + "epoch": 0.2067, + "grad_norm": 0.05523278936743736, + "learning_rate": 1.832784863990638e-05, + "loss": 0.0318, + "step": 121340 + }, + { + "epoch": 0.20675, + "grad_norm": 0.06277237832546234, + "learning_rate": 1.832386499285304e-05, + "loss": 0.0326, + "step": 121350 + }, + { + "epoch": 0.2068, + "grad_norm": 0.05014962702989578, + "learning_rate": 1.8319881528322735e-05, + "loss": 0.0316, + "step": 121360 + }, + { + "epoch": 0.20685, + "grad_norm": 0.050259605050086975, + "learning_rate": 1.831589824642435e-05, + "loss": 0.0325, + "step": 121370 + }, + { + "epoch": 0.2069, + "grad_norm": 0.06408250331878662, + "learning_rate": 1.8311915147266796e-05, + "loss": 0.0358, + "step": 121380 + }, + { + "epoch": 0.20695, + "grad_norm": 0.06439071148633957, + "learning_rate": 1.8307932230958975e-05, + "loss": 0.0329, + "step": 121390 + }, + { + "epoch": 0.207, + "grad_norm": 0.06621982157230377, + "learning_rate": 1.8303949497609763e-05, + "loss": 0.0338, + "step": 121400 + }, + { + "epoch": 0.20705, + "grad_norm": 0.06697060167789459, + "learning_rate": 1.829996694732807e-05, + "loss": 0.0343, + "step": 121410 + }, + { + "epoch": 0.2071, + "grad_norm": 0.06745675951242447, + "learning_rate": 1.829598458022275e-05, + "loss": 0.0334, + "step": 121420 + }, + { + "epoch": 0.20715, + "grad_norm": 0.07169412821531296, + "learning_rate": 1.8292002396402708e-05, + "loss": 0.0345, + "step": 121430 + }, + { + "epoch": 0.2072, + "grad_norm": 0.053990427404642105, + "learning_rate": 1.8288020395976786e-05, + "loss": 0.0324, + "step": 121440 + }, + { + "epoch": 0.20725, + "grad_norm": 0.057537153363227844, + "learning_rate": 1.8284038579053865e-05, + "loss": 0.0339, + "step": 121450 + }, + { + "epoch": 0.2073, + "grad_norm": 0.06586366146802902, + "learning_rate": 1.8280056945742817e-05, + "loss": 0.0346, + "step": 121460 + }, + { + "epoch": 0.20735, + "grad_norm": 0.05934857577085495, + "learning_rate": 1.8276075496152477e-05, + "loss": 0.0337, + "step": 121470 + }, + { + "epoch": 0.2074, + "grad_norm": 0.050682567059993744, + "learning_rate": 1.8272094230391716e-05, + "loss": 0.0334, + "step": 121480 + }, + { + "epoch": 0.20745, + "grad_norm": 0.05052315071225166, + "learning_rate": 1.8268113148569367e-05, + "loss": 0.0342, + "step": 121490 + }, + { + "epoch": 0.2075, + "grad_norm": 0.06679129600524902, + "learning_rate": 1.826413225079427e-05, + "loss": 0.0345, + "step": 121500 + }, + { + "epoch": 0.20755, + "grad_norm": 0.06831441074609756, + "learning_rate": 1.826015153717528e-05, + "loss": 0.0334, + "step": 121510 + }, + { + "epoch": 0.2076, + "grad_norm": 0.0557723343372345, + "learning_rate": 1.8256171007821198e-05, + "loss": 0.033, + "step": 121520 + }, + { + "epoch": 0.20765, + "grad_norm": 0.07006657123565674, + "learning_rate": 1.825219066284088e-05, + "loss": 0.0347, + "step": 121530 + }, + { + "epoch": 0.2077, + "grad_norm": 0.061841513961553574, + "learning_rate": 1.8248210502343128e-05, + "loss": 0.0332, + "step": 121540 + }, + { + "epoch": 0.20775, + "grad_norm": 0.07461114972829819, + "learning_rate": 1.824423052643677e-05, + "loss": 0.0321, + "step": 121550 + }, + { + "epoch": 0.2078, + "grad_norm": 0.0667690858244896, + "learning_rate": 1.8240250735230607e-05, + "loss": 0.0332, + "step": 121560 + }, + { + "epoch": 0.20785, + "grad_norm": 0.07005876302719116, + "learning_rate": 1.8236271128833448e-05, + "loss": 0.0352, + "step": 121570 + }, + { + "epoch": 0.2079, + "grad_norm": 0.07365118712186813, + "learning_rate": 1.82322917073541e-05, + "loss": 0.0332, + "step": 121580 + }, + { + "epoch": 0.20795, + "grad_norm": 0.05793159827589989, + "learning_rate": 1.8228312470901356e-05, + "loss": 0.0326, + "step": 121590 + }, + { + "epoch": 0.208, + "grad_norm": 0.06920211762189865, + "learning_rate": 1.8224333419584e-05, + "loss": 0.0323, + "step": 121600 + }, + { + "epoch": 0.20805, + "grad_norm": 0.06781196594238281, + "learning_rate": 1.822035455351082e-05, + "loss": 0.0337, + "step": 121610 + }, + { + "epoch": 0.2081, + "grad_norm": 0.06084830313920975, + "learning_rate": 1.8216375872790608e-05, + "loss": 0.0326, + "step": 121620 + }, + { + "epoch": 0.20815, + "grad_norm": 0.06041372939944267, + "learning_rate": 1.821239737753212e-05, + "loss": 0.0342, + "step": 121630 + }, + { + "epoch": 0.2082, + "grad_norm": 0.06792076677083969, + "learning_rate": 1.8208419067844146e-05, + "loss": 0.0325, + "step": 121640 + }, + { + "epoch": 0.20825, + "grad_norm": 0.06462553143501282, + "learning_rate": 1.8204440943835444e-05, + "loss": 0.0337, + "step": 121650 + }, + { + "epoch": 0.2083, + "grad_norm": 0.05725152790546417, + "learning_rate": 1.8200463005614766e-05, + "loss": 0.0333, + "step": 121660 + }, + { + "epoch": 0.20835, + "grad_norm": 0.060946546494960785, + "learning_rate": 1.8196485253290885e-05, + "loss": 0.0337, + "step": 121670 + }, + { + "epoch": 0.2084, + "grad_norm": 0.06666068732738495, + "learning_rate": 1.8192507686972534e-05, + "loss": 0.0335, + "step": 121680 + }, + { + "epoch": 0.20845, + "grad_norm": 0.052066754549741745, + "learning_rate": 1.818853030676846e-05, + "loss": 0.0328, + "step": 121690 + }, + { + "epoch": 0.2085, + "grad_norm": 0.06224596127867699, + "learning_rate": 1.8184553112787428e-05, + "loss": 0.0332, + "step": 121700 + }, + { + "epoch": 0.20855, + "grad_norm": 0.057339563965797424, + "learning_rate": 1.8180576105138135e-05, + "loss": 0.033, + "step": 121710 + }, + { + "epoch": 0.2086, + "grad_norm": 0.05593106895685196, + "learning_rate": 1.8176599283929342e-05, + "loss": 0.0341, + "step": 121720 + }, + { + "epoch": 0.20865, + "grad_norm": 0.06946161389350891, + "learning_rate": 1.817262264926975e-05, + "loss": 0.0341, + "step": 121730 + }, + { + "epoch": 0.2087, + "grad_norm": 0.0783376544713974, + "learning_rate": 1.8168646201268096e-05, + "loss": 0.0337, + "step": 121740 + }, + { + "epoch": 0.20875, + "grad_norm": 0.06264694035053253, + "learning_rate": 1.8164669940033087e-05, + "loss": 0.0376, + "step": 121750 + }, + { + "epoch": 0.2088, + "grad_norm": 0.05206596851348877, + "learning_rate": 1.8160693865673433e-05, + "loss": 0.0337, + "step": 121760 + }, + { + "epoch": 0.20885, + "grad_norm": 0.060494888573884964, + "learning_rate": 1.8156717978297845e-05, + "loss": 0.0324, + "step": 121770 + }, + { + "epoch": 0.2089, + "grad_norm": 0.06030144542455673, + "learning_rate": 1.815274227801501e-05, + "loss": 0.0345, + "step": 121780 + }, + { + "epoch": 0.20895, + "grad_norm": 0.057702165096998215, + "learning_rate": 1.8148766764933634e-05, + "loss": 0.0332, + "step": 121790 + }, + { + "epoch": 0.209, + "grad_norm": 0.06220393255352974, + "learning_rate": 1.8144791439162397e-05, + "loss": 0.0337, + "step": 121800 + }, + { + "epoch": 0.20905, + "grad_norm": 0.061017636209726334, + "learning_rate": 1.8140816300809987e-05, + "loss": 0.0346, + "step": 121810 + }, + { + "epoch": 0.2091, + "grad_norm": 0.07539673149585724, + "learning_rate": 1.8136841349985077e-05, + "loss": 0.0342, + "step": 121820 + }, + { + "epoch": 0.20915, + "grad_norm": 0.06629452854394913, + "learning_rate": 1.813286658679635e-05, + "loss": 0.0346, + "step": 121830 + }, + { + "epoch": 0.2092, + "grad_norm": 0.0765426754951477, + "learning_rate": 1.8128892011352478e-05, + "loss": 0.0362, + "step": 121840 + }, + { + "epoch": 0.20925, + "grad_norm": 0.12590640783309937, + "learning_rate": 1.812491762376211e-05, + "loss": 0.0351, + "step": 121850 + }, + { + "epoch": 0.2093, + "grad_norm": 0.0840090662240982, + "learning_rate": 1.8120943424133915e-05, + "loss": 0.0357, + "step": 121860 + }, + { + "epoch": 0.20935, + "grad_norm": 0.08788575232028961, + "learning_rate": 1.811696941257654e-05, + "loss": 0.0352, + "step": 121870 + }, + { + "epoch": 0.2094, + "grad_norm": 0.06937284022569656, + "learning_rate": 1.811299558919864e-05, + "loss": 0.0346, + "step": 121880 + }, + { + "epoch": 0.20945, + "grad_norm": 0.05698763206601143, + "learning_rate": 1.8109021954108845e-05, + "loss": 0.034, + "step": 121890 + }, + { + "epoch": 0.2095, + "grad_norm": 0.06224209442734718, + "learning_rate": 1.8105048507415797e-05, + "loss": 0.0341, + "step": 121900 + }, + { + "epoch": 0.20955, + "grad_norm": 0.09045542776584625, + "learning_rate": 1.810107524922815e-05, + "loss": 0.0346, + "step": 121910 + }, + { + "epoch": 0.2096, + "grad_norm": 0.0724298283457756, + "learning_rate": 1.8097102179654498e-05, + "loss": 0.0359, + "step": 121920 + }, + { + "epoch": 0.20965, + "grad_norm": 0.06703891605138779, + "learning_rate": 1.8093129298803494e-05, + "loss": 0.0341, + "step": 121930 + }, + { + "epoch": 0.2097, + "grad_norm": 0.06555094569921494, + "learning_rate": 1.8089156606783726e-05, + "loss": 0.0342, + "step": 121940 + }, + { + "epoch": 0.20975, + "grad_norm": 0.055910781025886536, + "learning_rate": 1.8085184103703823e-05, + "loss": 0.0328, + "step": 121950 + }, + { + "epoch": 0.2098, + "grad_norm": 0.07043515890836716, + "learning_rate": 1.8081211789672393e-05, + "loss": 0.0337, + "step": 121960 + }, + { + "epoch": 0.20985, + "grad_norm": 0.07362595200538635, + "learning_rate": 1.807723966479803e-05, + "loss": 0.0349, + "step": 121970 + }, + { + "epoch": 0.2099, + "grad_norm": 0.06403699517250061, + "learning_rate": 1.807326772918934e-05, + "loss": 0.0334, + "step": 121980 + }, + { + "epoch": 0.20995, + "grad_norm": 0.06171891465783119, + "learning_rate": 1.8069295982954904e-05, + "loss": 0.0334, + "step": 121990 + }, + { + "epoch": 0.21, + "grad_norm": 0.0766458809375763, + "learning_rate": 1.8065324426203313e-05, + "loss": 0.0327, + "step": 122000 + }, + { + "epoch": 0.21005, + "grad_norm": 0.08834163844585419, + "learning_rate": 1.8061353059043144e-05, + "loss": 0.0341, + "step": 122010 + }, + { + "epoch": 0.2101, + "grad_norm": 0.08196330815553665, + "learning_rate": 1.8057381881582973e-05, + "loss": 0.0336, + "step": 122020 + }, + { + "epoch": 0.21015, + "grad_norm": 0.057843003422021866, + "learning_rate": 1.805341089393138e-05, + "loss": 0.0322, + "step": 122030 + }, + { + "epoch": 0.2102, + "grad_norm": 0.056796714663505554, + "learning_rate": 1.804944009619692e-05, + "loss": 0.0342, + "step": 122040 + }, + { + "epoch": 0.21025, + "grad_norm": 0.06158505007624626, + "learning_rate": 1.804546948848816e-05, + "loss": 0.0347, + "step": 122050 + }, + { + "epoch": 0.2103, + "grad_norm": 0.07728610932826996, + "learning_rate": 1.8041499070913646e-05, + "loss": 0.0342, + "step": 122060 + }, + { + "epoch": 0.21035, + "grad_norm": 0.06384309381246567, + "learning_rate": 1.803752884358194e-05, + "loss": 0.033, + "step": 122070 + }, + { + "epoch": 0.2104, + "grad_norm": 0.05880480259656906, + "learning_rate": 1.8033558806601576e-05, + "loss": 0.0334, + "step": 122080 + }, + { + "epoch": 0.21045, + "grad_norm": 0.053635504096746445, + "learning_rate": 1.8029588960081097e-05, + "loss": 0.0326, + "step": 122090 + }, + { + "epoch": 0.2105, + "grad_norm": 0.05860619246959686, + "learning_rate": 1.8025619304129037e-05, + "loss": 0.0341, + "step": 122100 + }, + { + "epoch": 0.21055, + "grad_norm": 0.06333498656749725, + "learning_rate": 1.802164983885392e-05, + "loss": 0.0341, + "step": 122110 + }, + { + "epoch": 0.2106, + "grad_norm": 0.06388422846794128, + "learning_rate": 1.801768056436429e-05, + "loss": 0.033, + "step": 122120 + }, + { + "epoch": 0.21065, + "grad_norm": 0.06269916892051697, + "learning_rate": 1.801371148076863e-05, + "loss": 0.033, + "step": 122130 + }, + { + "epoch": 0.2107, + "grad_norm": 0.06834321469068527, + "learning_rate": 1.8009742588175484e-05, + "loss": 0.0333, + "step": 122140 + }, + { + "epoch": 0.21075, + "grad_norm": 0.05867002159357071, + "learning_rate": 1.8005773886693353e-05, + "loss": 0.0341, + "step": 122150 + }, + { + "epoch": 0.2108, + "grad_norm": 0.07019494473934174, + "learning_rate": 1.800180537643073e-05, + "loss": 0.0328, + "step": 122160 + }, + { + "epoch": 0.21085, + "grad_norm": 0.06627378612756729, + "learning_rate": 1.7997837057496126e-05, + "loss": 0.0337, + "step": 122170 + }, + { + "epoch": 0.2109, + "grad_norm": 0.05703491345047951, + "learning_rate": 1.7993868929998022e-05, + "loss": 0.0328, + "step": 122180 + }, + { + "epoch": 0.21095, + "grad_norm": 0.05551630258560181, + "learning_rate": 1.7989900994044913e-05, + "loss": 0.0325, + "step": 122190 + }, + { + "epoch": 0.211, + "grad_norm": 0.05379296839237213, + "learning_rate": 1.798593324974527e-05, + "loss": 0.0352, + "step": 122200 + }, + { + "epoch": 0.21105, + "grad_norm": 0.06293535977602005, + "learning_rate": 1.7981965697207573e-05, + "loss": 0.033, + "step": 122210 + }, + { + "epoch": 0.2111, + "grad_norm": 0.06423439085483551, + "learning_rate": 1.7977998336540313e-05, + "loss": 0.0328, + "step": 122220 + }, + { + "epoch": 0.21115, + "grad_norm": 0.06156112998723984, + "learning_rate": 1.7974031167851924e-05, + "loss": 0.0336, + "step": 122230 + }, + { + "epoch": 0.2112, + "grad_norm": 0.06134914979338646, + "learning_rate": 1.7970064191250896e-05, + "loss": 0.0337, + "step": 122240 + }, + { + "epoch": 0.21125, + "grad_norm": 0.06287740916013718, + "learning_rate": 1.796609740684567e-05, + "loss": 0.0333, + "step": 122250 + }, + { + "epoch": 0.2113, + "grad_norm": 0.055571284145116806, + "learning_rate": 1.7962130814744696e-05, + "loss": 0.0331, + "step": 122260 + }, + { + "epoch": 0.21135, + "grad_norm": 0.059910066425800323, + "learning_rate": 1.795816441505642e-05, + "loss": 0.0354, + "step": 122270 + }, + { + "epoch": 0.2114, + "grad_norm": 0.056272391229867935, + "learning_rate": 1.7954198207889285e-05, + "loss": 0.0338, + "step": 122280 + }, + { + "epoch": 0.21145, + "grad_norm": 0.050958115607500076, + "learning_rate": 1.7950232193351724e-05, + "loss": 0.033, + "step": 122290 + }, + { + "epoch": 0.2115, + "grad_norm": 0.05666881054639816, + "learning_rate": 1.7946266371552166e-05, + "loss": 0.0334, + "step": 122300 + }, + { + "epoch": 0.21155, + "grad_norm": 0.09566666930913925, + "learning_rate": 1.794230074259904e-05, + "loss": 0.0351, + "step": 122310 + }, + { + "epoch": 0.2116, + "grad_norm": 0.07883096486330032, + "learning_rate": 1.7938335306600746e-05, + "loss": 0.0347, + "step": 122320 + }, + { + "epoch": 0.21165, + "grad_norm": 0.06914518028497696, + "learning_rate": 1.793437006366572e-05, + "loss": 0.0341, + "step": 122330 + }, + { + "epoch": 0.2117, + "grad_norm": 0.08925186842679977, + "learning_rate": 1.793040501390237e-05, + "loss": 0.0349, + "step": 122340 + }, + { + "epoch": 0.21175, + "grad_norm": 0.06953421980142593, + "learning_rate": 1.792644015741908e-05, + "loss": 0.0347, + "step": 122350 + }, + { + "epoch": 0.2118, + "grad_norm": 0.0675453245639801, + "learning_rate": 1.7922475494324266e-05, + "loss": 0.0344, + "step": 122360 + }, + { + "epoch": 0.21185, + "grad_norm": 0.07842274755239487, + "learning_rate": 1.791851102472631e-05, + "loss": 0.0346, + "step": 122370 + }, + { + "epoch": 0.2119, + "grad_norm": 0.07493267953395844, + "learning_rate": 1.7914546748733607e-05, + "loss": 0.035, + "step": 122380 + }, + { + "epoch": 0.21195, + "grad_norm": 0.061492398381233215, + "learning_rate": 1.7910582666454527e-05, + "loss": 0.0345, + "step": 122390 + }, + { + "epoch": 0.212, + "grad_norm": 0.06904582679271698, + "learning_rate": 1.7906618777997446e-05, + "loss": 0.0367, + "step": 122400 + }, + { + "epoch": 0.21205, + "grad_norm": 0.0825248658657074, + "learning_rate": 1.7902655083470764e-05, + "loss": 0.0349, + "step": 122410 + }, + { + "epoch": 0.2121, + "grad_norm": 0.06248489394783974, + "learning_rate": 1.7898691582982807e-05, + "loss": 0.0336, + "step": 122420 + }, + { + "epoch": 0.21215, + "grad_norm": 0.08642593026161194, + "learning_rate": 1.789472827664197e-05, + "loss": 0.034, + "step": 122430 + }, + { + "epoch": 0.2122, + "grad_norm": 0.059294890612363815, + "learning_rate": 1.7890765164556576e-05, + "loss": 0.0341, + "step": 122440 + }, + { + "epoch": 0.21225, + "grad_norm": 0.0638987347483635, + "learning_rate": 1.7886802246835005e-05, + "loss": 0.0341, + "step": 122450 + }, + { + "epoch": 0.2123, + "grad_norm": 0.05983492732048035, + "learning_rate": 1.788283952358558e-05, + "loss": 0.0341, + "step": 122460 + }, + { + "epoch": 0.21235, + "grad_norm": 0.06589211523532867, + "learning_rate": 1.7878876994916653e-05, + "loss": 0.0348, + "step": 122470 + }, + { + "epoch": 0.2124, + "grad_norm": 0.054842691868543625, + "learning_rate": 1.7874914660936555e-05, + "loss": 0.0341, + "step": 122480 + }, + { + "epoch": 0.21245, + "grad_norm": 0.06396602094173431, + "learning_rate": 1.7870952521753607e-05, + "loss": 0.0356, + "step": 122490 + }, + { + "epoch": 0.2125, + "grad_norm": 0.06473995745182037, + "learning_rate": 1.7866990577476146e-05, + "loss": 0.0351, + "step": 122500 + }, + { + "epoch": 0.21255, + "grad_norm": 0.07215896993875504, + "learning_rate": 1.786302882821248e-05, + "loss": 0.0349, + "step": 122510 + }, + { + "epoch": 0.2126, + "grad_norm": 0.062243200838565826, + "learning_rate": 1.7859067274070916e-05, + "loss": 0.0346, + "step": 122520 + }, + { + "epoch": 0.21265, + "grad_norm": 0.06540673226118088, + "learning_rate": 1.785510591515978e-05, + "loss": 0.0333, + "step": 122530 + }, + { + "epoch": 0.2127, + "grad_norm": 0.06943630427122116, + "learning_rate": 1.7851144751587363e-05, + "loss": 0.034, + "step": 122540 + }, + { + "epoch": 0.21275, + "grad_norm": 0.05753122642636299, + "learning_rate": 1.7847183783461967e-05, + "loss": 0.0347, + "step": 122550 + }, + { + "epoch": 0.2128, + "grad_norm": 0.06939942389726639, + "learning_rate": 1.784322301089187e-05, + "loss": 0.0344, + "step": 122560 + }, + { + "epoch": 0.21285, + "grad_norm": 0.0671052634716034, + "learning_rate": 1.7839262433985377e-05, + "loss": 0.0352, + "step": 122570 + }, + { + "epoch": 0.2129, + "grad_norm": 0.07446414977312088, + "learning_rate": 1.783530205285075e-05, + "loss": 0.0344, + "step": 122580 + }, + { + "epoch": 0.21295, + "grad_norm": 0.056451063603162766, + "learning_rate": 1.7831341867596273e-05, + "loss": 0.0336, + "step": 122590 + }, + { + "epoch": 0.213, + "grad_norm": 0.05866613984107971, + "learning_rate": 1.7827381878330225e-05, + "loss": 0.036, + "step": 122600 + }, + { + "epoch": 0.21305, + "grad_norm": 0.07412475347518921, + "learning_rate": 1.782342208516085e-05, + "loss": 0.0336, + "step": 122610 + }, + { + "epoch": 0.2131, + "grad_norm": 0.0759148970246315, + "learning_rate": 1.7819462488196435e-05, + "loss": 0.0346, + "step": 122620 + }, + { + "epoch": 0.21315, + "grad_norm": 0.06466677784919739, + "learning_rate": 1.7815503087545203e-05, + "loss": 0.0334, + "step": 122630 + }, + { + "epoch": 0.2132, + "grad_norm": 0.08377153426408768, + "learning_rate": 1.781154388331543e-05, + "loss": 0.0359, + "step": 122640 + }, + { + "epoch": 0.21325, + "grad_norm": 0.06291453540325165, + "learning_rate": 1.7807584875615334e-05, + "loss": 0.0356, + "step": 122650 + }, + { + "epoch": 0.2133, + "grad_norm": 0.05173429474234581, + "learning_rate": 1.7803626064553168e-05, + "loss": 0.0345, + "step": 122660 + }, + { + "epoch": 0.21335, + "grad_norm": 0.0746605321764946, + "learning_rate": 1.7799667450237166e-05, + "loss": 0.0342, + "step": 122670 + }, + { + "epoch": 0.2134, + "grad_norm": 0.06961067020893097, + "learning_rate": 1.779570903277555e-05, + "loss": 0.0351, + "step": 122680 + }, + { + "epoch": 0.21345, + "grad_norm": 0.08081655949354172, + "learning_rate": 1.7791750812276547e-05, + "loss": 0.037, + "step": 122690 + }, + { + "epoch": 0.2135, + "grad_norm": 0.06566688418388367, + "learning_rate": 1.778779278884836e-05, + "loss": 0.0342, + "step": 122700 + }, + { + "epoch": 0.21355, + "grad_norm": 0.06508004665374756, + "learning_rate": 1.778383496259921e-05, + "loss": 0.034, + "step": 122710 + }, + { + "epoch": 0.2136, + "grad_norm": 0.0744289830327034, + "learning_rate": 1.7779877333637312e-05, + "loss": 0.0354, + "step": 122720 + }, + { + "epoch": 0.21365, + "grad_norm": 0.06905969232320786, + "learning_rate": 1.7775919902070836e-05, + "loss": 0.0348, + "step": 122730 + }, + { + "epoch": 0.2137, + "grad_norm": 0.06146163120865822, + "learning_rate": 1.7771962668008012e-05, + "loss": 0.0349, + "step": 122740 + }, + { + "epoch": 0.21375, + "grad_norm": 0.06307521462440491, + "learning_rate": 1.776800563155701e-05, + "loss": 0.0347, + "step": 122750 + }, + { + "epoch": 0.2138, + "grad_norm": 0.053822193294763565, + "learning_rate": 1.7764048792826016e-05, + "loss": 0.034, + "step": 122760 + }, + { + "epoch": 0.21385, + "grad_norm": 0.06739028543233871, + "learning_rate": 1.7760092151923207e-05, + "loss": 0.035, + "step": 122770 + }, + { + "epoch": 0.2139, + "grad_norm": 0.047717057168483734, + "learning_rate": 1.775613570895676e-05, + "loss": 0.0342, + "step": 122780 + }, + { + "epoch": 0.21395, + "grad_norm": 0.13976827263832092, + "learning_rate": 1.7752179464034845e-05, + "loss": 0.0379, + "step": 122790 + }, + { + "epoch": 0.214, + "grad_norm": 0.09999356418848038, + "learning_rate": 1.7748223417265618e-05, + "loss": 0.035, + "step": 122800 + }, + { + "epoch": 0.21405, + "grad_norm": 0.062376927584409714, + "learning_rate": 1.774426756875724e-05, + "loss": 0.0338, + "step": 122810 + }, + { + "epoch": 0.2141, + "grad_norm": 0.08710140734910965, + "learning_rate": 1.7740311918617853e-05, + "loss": 0.0352, + "step": 122820 + }, + { + "epoch": 0.21415, + "grad_norm": 0.06661096215248108, + "learning_rate": 1.773635646695562e-05, + "loss": 0.0338, + "step": 122830 + }, + { + "epoch": 0.2142, + "grad_norm": 0.0639859214425087, + "learning_rate": 1.7732401213878664e-05, + "loss": 0.0341, + "step": 122840 + }, + { + "epoch": 0.21425, + "grad_norm": 0.06335785239934921, + "learning_rate": 1.7728446159495132e-05, + "loss": 0.0336, + "step": 122850 + }, + { + "epoch": 0.2143, + "grad_norm": 0.06624232232570648, + "learning_rate": 1.7724491303913156e-05, + "loss": 0.0352, + "step": 122860 + }, + { + "epoch": 0.21435, + "grad_norm": 0.06434208154678345, + "learning_rate": 1.772053664724085e-05, + "loss": 0.0352, + "step": 122870 + }, + { + "epoch": 0.2144, + "grad_norm": 0.06184322386980057, + "learning_rate": 1.771658218958634e-05, + "loss": 0.036, + "step": 122880 + }, + { + "epoch": 0.21445, + "grad_norm": 0.07136630266904831, + "learning_rate": 1.7712627931057732e-05, + "loss": 0.0352, + "step": 122890 + }, + { + "epoch": 0.2145, + "grad_norm": 0.09077708423137665, + "learning_rate": 1.7708673871763136e-05, + "loss": 0.035, + "step": 122900 + }, + { + "epoch": 0.21455, + "grad_norm": 0.07337837666273117, + "learning_rate": 1.770472001181067e-05, + "loss": 0.035, + "step": 122910 + }, + { + "epoch": 0.2146, + "grad_norm": 0.06152607500553131, + "learning_rate": 1.77007663513084e-05, + "loss": 0.0344, + "step": 122920 + }, + { + "epoch": 0.21465, + "grad_norm": 0.08395437896251678, + "learning_rate": 1.7696812890364455e-05, + "loss": 0.0358, + "step": 122930 + }, + { + "epoch": 0.2147, + "grad_norm": 0.0598633699119091, + "learning_rate": 1.769285962908689e-05, + "loss": 0.0377, + "step": 122940 + }, + { + "epoch": 0.21475, + "grad_norm": 0.07186885178089142, + "learning_rate": 1.76889065675838e-05, + "loss": 0.034, + "step": 122950 + }, + { + "epoch": 0.2148, + "grad_norm": 0.05980784073472023, + "learning_rate": 1.7684953705963258e-05, + "loss": 0.0324, + "step": 122960 + }, + { + "epoch": 0.21485, + "grad_norm": 0.050015125423669815, + "learning_rate": 1.768100104433333e-05, + "loss": 0.0333, + "step": 122970 + }, + { + "epoch": 0.2149, + "grad_norm": 0.0592203326523304, + "learning_rate": 1.767704858280209e-05, + "loss": 0.0337, + "step": 122980 + }, + { + "epoch": 0.21495, + "grad_norm": 0.0700206533074379, + "learning_rate": 1.767309632147759e-05, + "loss": 0.0363, + "step": 122990 + }, + { + "epoch": 0.215, + "grad_norm": 0.06486654281616211, + "learning_rate": 1.7669144260467883e-05, + "loss": 0.0338, + "step": 123000 + }, + { + "epoch": 0.21505, + "grad_norm": 0.06818637251853943, + "learning_rate": 1.7665192399881015e-05, + "loss": 0.0343, + "step": 123010 + }, + { + "epoch": 0.2151, + "grad_norm": 0.06002725660800934, + "learning_rate": 1.7661240739825036e-05, + "loss": 0.0325, + "step": 123020 + }, + { + "epoch": 0.21515, + "grad_norm": 0.06803982704877853, + "learning_rate": 1.7657289280407968e-05, + "loss": 0.0352, + "step": 123030 + }, + { + "epoch": 0.2152, + "grad_norm": 0.06577999144792557, + "learning_rate": 1.7653338021737857e-05, + "loss": 0.0331, + "step": 123040 + }, + { + "epoch": 0.21525, + "grad_norm": 0.06379947811365128, + "learning_rate": 1.764938696392273e-05, + "loss": 0.0328, + "step": 123050 + }, + { + "epoch": 0.2153, + "grad_norm": 0.05803842097520828, + "learning_rate": 1.76454361070706e-05, + "loss": 0.0337, + "step": 123060 + }, + { + "epoch": 0.21535, + "grad_norm": 0.08368424326181412, + "learning_rate": 1.7641485451289484e-05, + "loss": 0.0349, + "step": 123070 + }, + { + "epoch": 0.2154, + "grad_norm": 0.07134660333395004, + "learning_rate": 1.7637534996687394e-05, + "loss": 0.0337, + "step": 123080 + }, + { + "epoch": 0.21545, + "grad_norm": 0.07949186861515045, + "learning_rate": 1.7633584743372326e-05, + "loss": 0.035, + "step": 123090 + }, + { + "epoch": 0.2155, + "grad_norm": 0.061419837176799774, + "learning_rate": 1.7629634691452285e-05, + "loss": 0.0328, + "step": 123100 + }, + { + "epoch": 0.21555, + "grad_norm": 0.054267480969429016, + "learning_rate": 1.7625684841035255e-05, + "loss": 0.0339, + "step": 123110 + }, + { + "epoch": 0.2156, + "grad_norm": 0.05243111029267311, + "learning_rate": 1.762173519222925e-05, + "loss": 0.0332, + "step": 123120 + }, + { + "epoch": 0.21565, + "grad_norm": 0.06539767235517502, + "learning_rate": 1.7617785745142214e-05, + "loss": 0.0339, + "step": 123130 + }, + { + "epoch": 0.2157, + "grad_norm": 0.07243485003709793, + "learning_rate": 1.7613836499882158e-05, + "loss": 0.0346, + "step": 123140 + }, + { + "epoch": 0.21575, + "grad_norm": 0.07234058529138565, + "learning_rate": 1.7609887456557023e-05, + "loss": 0.0345, + "step": 123150 + }, + { + "epoch": 0.2158, + "grad_norm": 0.11803265661001205, + "learning_rate": 1.7605938615274793e-05, + "loss": 0.0338, + "step": 123160 + }, + { + "epoch": 0.21585, + "grad_norm": 0.11944836378097534, + "learning_rate": 1.760198997614343e-05, + "loss": 0.0354, + "step": 123170 + }, + { + "epoch": 0.2159, + "grad_norm": 0.08904100209474564, + "learning_rate": 1.7598041539270877e-05, + "loss": 0.0345, + "step": 123180 + }, + { + "epoch": 0.21595, + "grad_norm": 0.0659065842628479, + "learning_rate": 1.7594093304765093e-05, + "loss": 0.0358, + "step": 123190 + }, + { + "epoch": 0.216, + "grad_norm": 0.08313240110874176, + "learning_rate": 1.7590145272734007e-05, + "loss": 0.0356, + "step": 123200 + }, + { + "epoch": 0.21605, + "grad_norm": 0.05321460962295532, + "learning_rate": 1.7586197443285575e-05, + "loss": 0.0342, + "step": 123210 + }, + { + "epoch": 0.2161, + "grad_norm": 0.06692170351743698, + "learning_rate": 1.7582249816527706e-05, + "loss": 0.0352, + "step": 123220 + }, + { + "epoch": 0.21615, + "grad_norm": 0.06521233916282654, + "learning_rate": 1.7578302392568342e-05, + "loss": 0.0351, + "step": 123230 + }, + { + "epoch": 0.2162, + "grad_norm": 0.06535640358924866, + "learning_rate": 1.7574355171515413e-05, + "loss": 0.0343, + "step": 123240 + }, + { + "epoch": 0.21625, + "grad_norm": 0.06495814770460129, + "learning_rate": 1.757040815347682e-05, + "loss": 0.0377, + "step": 123250 + }, + { + "epoch": 0.2163, + "grad_norm": 0.07342529296875, + "learning_rate": 1.756646133856048e-05, + "loss": 0.0357, + "step": 123260 + }, + { + "epoch": 0.21635, + "grad_norm": 0.06682030856609344, + "learning_rate": 1.7562514726874288e-05, + "loss": 0.0338, + "step": 123270 + }, + { + "epoch": 0.2164, + "grad_norm": 0.05626925081014633, + "learning_rate": 1.7558568318526154e-05, + "loss": 0.0341, + "step": 123280 + }, + { + "epoch": 0.21645, + "grad_norm": 0.069165900349617, + "learning_rate": 1.7554622113623964e-05, + "loss": 0.0372, + "step": 123290 + }, + { + "epoch": 0.2165, + "grad_norm": 0.05860066041350365, + "learning_rate": 1.7550676112275605e-05, + "loss": 0.0335, + "step": 123300 + }, + { + "epoch": 0.21655, + "grad_norm": 0.057062942534685135, + "learning_rate": 1.7546730314588973e-05, + "loss": 0.034, + "step": 123310 + }, + { + "epoch": 0.2166, + "grad_norm": 0.05711813271045685, + "learning_rate": 1.754278472067192e-05, + "loss": 0.0342, + "step": 123320 + }, + { + "epoch": 0.21665, + "grad_norm": 0.06175006553530693, + "learning_rate": 1.7538839330632344e-05, + "loss": 0.0333, + "step": 123330 + }, + { + "epoch": 0.2167, + "grad_norm": 0.058298323303461075, + "learning_rate": 1.7534894144578086e-05, + "loss": 0.0337, + "step": 123340 + }, + { + "epoch": 0.21675, + "grad_norm": 0.06874939799308777, + "learning_rate": 1.7530949162617023e-05, + "loss": 0.0345, + "step": 123350 + }, + { + "epoch": 0.2168, + "grad_norm": 0.05856994166970253, + "learning_rate": 1.752700438485701e-05, + "loss": 0.0366, + "step": 123360 + }, + { + "epoch": 0.21685, + "grad_norm": 0.07017688453197479, + "learning_rate": 1.7523059811405877e-05, + "loss": 0.0352, + "step": 123370 + }, + { + "epoch": 0.2169, + "grad_norm": 0.07073337584733963, + "learning_rate": 1.7519115442371496e-05, + "loss": 0.0346, + "step": 123380 + }, + { + "epoch": 0.21695, + "grad_norm": 0.06469359248876572, + "learning_rate": 1.7515171277861676e-05, + "loss": 0.0335, + "step": 123390 + }, + { + "epoch": 0.217, + "grad_norm": 0.0709473267197609, + "learning_rate": 1.751122731798427e-05, + "loss": 0.0358, + "step": 123400 + }, + { + "epoch": 0.21705, + "grad_norm": 0.07505214214324951, + "learning_rate": 1.750728356284709e-05, + "loss": 0.0351, + "step": 123410 + }, + { + "epoch": 0.2171, + "grad_norm": 0.06235995888710022, + "learning_rate": 1.7503340012557953e-05, + "loss": 0.0347, + "step": 123420 + }, + { + "epoch": 0.21715, + "grad_norm": 0.0804169625043869, + "learning_rate": 1.7499396667224705e-05, + "loss": 0.0362, + "step": 123430 + }, + { + "epoch": 0.2172, + "grad_norm": 0.07404050976037979, + "learning_rate": 1.7495453526955114e-05, + "loss": 0.0362, + "step": 123440 + }, + { + "epoch": 0.21725, + "grad_norm": 0.07752832025289536, + "learning_rate": 1.7491510591857015e-05, + "loss": 0.0386, + "step": 123450 + }, + { + "epoch": 0.2173, + "grad_norm": 0.0669313594698906, + "learning_rate": 1.7487567862038195e-05, + "loss": 0.0365, + "step": 123460 + }, + { + "epoch": 0.21735, + "grad_norm": 0.06261171400547028, + "learning_rate": 1.748362533760645e-05, + "loss": 0.035, + "step": 123470 + }, + { + "epoch": 0.2174, + "grad_norm": 0.0605204738676548, + "learning_rate": 1.7479683018669556e-05, + "loss": 0.0347, + "step": 123480 + }, + { + "epoch": 0.21745, + "grad_norm": 0.06800848990678787, + "learning_rate": 1.7475740905335308e-05, + "loss": 0.0346, + "step": 123490 + }, + { + "epoch": 0.2175, + "grad_norm": 0.06989282369613647, + "learning_rate": 1.747179899771148e-05, + "loss": 0.0361, + "step": 123500 + }, + { + "epoch": 0.21755, + "grad_norm": 0.07043875008821487, + "learning_rate": 1.7467857295905836e-05, + "loss": 0.0341, + "step": 123510 + }, + { + "epoch": 0.2176, + "grad_norm": 0.05971217155456543, + "learning_rate": 1.7463915800026144e-05, + "loss": 0.0333, + "step": 123520 + }, + { + "epoch": 0.21765, + "grad_norm": 0.05468961223959923, + "learning_rate": 1.7459974510180156e-05, + "loss": 0.0329, + "step": 123530 + }, + { + "epoch": 0.2177, + "grad_norm": 0.08345212787389755, + "learning_rate": 1.7456033426475638e-05, + "loss": 0.0352, + "step": 123540 + }, + { + "epoch": 0.21775, + "grad_norm": 0.07348014414310455, + "learning_rate": 1.745209254902034e-05, + "loss": 0.034, + "step": 123550 + }, + { + "epoch": 0.2178, + "grad_norm": 0.07218588143587112, + "learning_rate": 1.7448151877921985e-05, + "loss": 0.0336, + "step": 123560 + }, + { + "epoch": 0.21785, + "grad_norm": 0.06379861384630203, + "learning_rate": 1.7444211413288325e-05, + "loss": 0.0347, + "step": 123570 + }, + { + "epoch": 0.2179, + "grad_norm": 0.07845243811607361, + "learning_rate": 1.7440271155227082e-05, + "loss": 0.0351, + "step": 123580 + }, + { + "epoch": 0.21795, + "grad_norm": 0.05751929059624672, + "learning_rate": 1.7436331103845995e-05, + "loss": 0.0341, + "step": 123590 + }, + { + "epoch": 0.218, + "grad_norm": 0.06091049313545227, + "learning_rate": 1.7432391259252768e-05, + "loss": 0.0335, + "step": 123600 + }, + { + "epoch": 0.21805, + "grad_norm": 0.06685923784971237, + "learning_rate": 1.7428451621555115e-05, + "loss": 0.0371, + "step": 123610 + }, + { + "epoch": 0.2181, + "grad_norm": 0.062011849135160446, + "learning_rate": 1.7424512190860764e-05, + "loss": 0.0326, + "step": 123620 + }, + { + "epoch": 0.21815, + "grad_norm": 0.06231192871928215, + "learning_rate": 1.742057296727739e-05, + "loss": 0.0337, + "step": 123630 + }, + { + "epoch": 0.2182, + "grad_norm": 0.07024695724248886, + "learning_rate": 1.741663395091272e-05, + "loss": 0.0342, + "step": 123640 + }, + { + "epoch": 0.21825, + "grad_norm": 0.07106582075357437, + "learning_rate": 1.741269514187441e-05, + "loss": 0.0337, + "step": 123650 + }, + { + "epoch": 0.2183, + "grad_norm": 0.06409791857004166, + "learning_rate": 1.740875654027018e-05, + "loss": 0.0336, + "step": 123660 + }, + { + "epoch": 0.21835, + "grad_norm": 0.052089840173721313, + "learning_rate": 1.7404818146207684e-05, + "loss": 0.0336, + "step": 123670 + }, + { + "epoch": 0.2184, + "grad_norm": 0.05470779910683632, + "learning_rate": 1.740087995979461e-05, + "loss": 0.0332, + "step": 123680 + }, + { + "epoch": 0.21845, + "grad_norm": 0.057459279894828796, + "learning_rate": 1.7396941981138624e-05, + "loss": 0.0347, + "step": 123690 + }, + { + "epoch": 0.2185, + "grad_norm": 0.06484170258045197, + "learning_rate": 1.7393004210347387e-05, + "loss": 0.0334, + "step": 123700 + }, + { + "epoch": 0.21855, + "grad_norm": 0.06167706847190857, + "learning_rate": 1.7389066647528556e-05, + "loss": 0.0347, + "step": 123710 + }, + { + "epoch": 0.2186, + "grad_norm": 0.05439428612589836, + "learning_rate": 1.7385129292789786e-05, + "loss": 0.0346, + "step": 123720 + }, + { + "epoch": 0.21865, + "grad_norm": 0.052550043910741806, + "learning_rate": 1.738119214623871e-05, + "loss": 0.0346, + "step": 123730 + }, + { + "epoch": 0.2187, + "grad_norm": 0.049942102283239365, + "learning_rate": 1.737725520798299e-05, + "loss": 0.0349, + "step": 123740 + }, + { + "epoch": 0.21875, + "grad_norm": 0.0705181211233139, + "learning_rate": 1.7373318478130245e-05, + "loss": 0.0351, + "step": 123750 + }, + { + "epoch": 0.2188, + "grad_norm": 0.08208610117435455, + "learning_rate": 1.7369381956788114e-05, + "loss": 0.0338, + "step": 123760 + }, + { + "epoch": 0.21885, + "grad_norm": 0.06833874434232712, + "learning_rate": 1.7365445644064207e-05, + "loss": 0.0342, + "step": 123770 + }, + { + "epoch": 0.2189, + "grad_norm": 0.07863493263721466, + "learning_rate": 1.736150954006615e-05, + "loss": 0.0345, + "step": 123780 + }, + { + "epoch": 0.21895, + "grad_norm": 0.05749135464429855, + "learning_rate": 1.7357573644901552e-05, + "loss": 0.0335, + "step": 123790 + }, + { + "epoch": 0.219, + "grad_norm": 0.060620639473199844, + "learning_rate": 1.735363795867802e-05, + "loss": 0.0356, + "step": 123800 + }, + { + "epoch": 0.21905, + "grad_norm": 0.06407640129327774, + "learning_rate": 1.7349702481503156e-05, + "loss": 0.034, + "step": 123810 + }, + { + "epoch": 0.2191, + "grad_norm": 0.056964900344610214, + "learning_rate": 1.734576721348454e-05, + "loss": 0.0333, + "step": 123820 + }, + { + "epoch": 0.21915, + "grad_norm": 0.10963110625743866, + "learning_rate": 1.7341832154729794e-05, + "loss": 0.035, + "step": 123830 + }, + { + "epoch": 0.2192, + "grad_norm": 0.08259830623865128, + "learning_rate": 1.7337897305346457e-05, + "loss": 0.0341, + "step": 123840 + }, + { + "epoch": 0.21925, + "grad_norm": 0.07771535217761993, + "learning_rate": 1.7333962665442148e-05, + "loss": 0.0338, + "step": 123850 + }, + { + "epoch": 0.2193, + "grad_norm": 0.065434031188488, + "learning_rate": 1.7330028235124408e-05, + "loss": 0.0341, + "step": 123860 + }, + { + "epoch": 0.21935, + "grad_norm": 0.06774911284446716, + "learning_rate": 1.7326094014500815e-05, + "loss": 0.0341, + "step": 123870 + }, + { + "epoch": 0.2194, + "grad_norm": 0.0658387616276741, + "learning_rate": 1.7322160003678934e-05, + "loss": 0.0344, + "step": 123880 + }, + { + "epoch": 0.21945, + "grad_norm": 0.06059543788433075, + "learning_rate": 1.7318226202766312e-05, + "loss": 0.0326, + "step": 123890 + }, + { + "epoch": 0.2195, + "grad_norm": 0.06279711425304413, + "learning_rate": 1.73142926118705e-05, + "loss": 0.0337, + "step": 123900 + }, + { + "epoch": 0.21955, + "grad_norm": 0.07527292519807816, + "learning_rate": 1.7310359231099042e-05, + "loss": 0.0337, + "step": 123910 + }, + { + "epoch": 0.2196, + "grad_norm": 0.07258503139019012, + "learning_rate": 1.7306426060559463e-05, + "loss": 0.035, + "step": 123920 + }, + { + "epoch": 0.21965, + "grad_norm": 0.07843124866485596, + "learning_rate": 1.7302493100359323e-05, + "loss": 0.0359, + "step": 123930 + }, + { + "epoch": 0.2197, + "grad_norm": 0.07263282686471939, + "learning_rate": 1.7298560350606115e-05, + "loss": 0.0358, + "step": 123940 + }, + { + "epoch": 0.21975, + "grad_norm": 0.0668807402253151, + "learning_rate": 1.729462781140738e-05, + "loss": 0.0336, + "step": 123950 + }, + { + "epoch": 0.2198, + "grad_norm": 0.0571887269616127, + "learning_rate": 1.7290695482870627e-05, + "loss": 0.0349, + "step": 123960 + }, + { + "epoch": 0.21985, + "grad_norm": 0.0678820088505745, + "learning_rate": 1.7286763365103364e-05, + "loss": 0.0349, + "step": 123970 + }, + { + "epoch": 0.2199, + "grad_norm": 0.06259248405694962, + "learning_rate": 1.728283145821309e-05, + "loss": 0.0336, + "step": 123980 + }, + { + "epoch": 0.21995, + "grad_norm": 0.05824877321720123, + "learning_rate": 1.7278899762307303e-05, + "loss": 0.0342, + "step": 123990 + }, + { + "epoch": 0.22, + "grad_norm": 0.07384226471185684, + "learning_rate": 1.72749682774935e-05, + "loss": 0.0339, + "step": 124000 + }, + { + "epoch": 0.22005, + "grad_norm": 0.06599892675876617, + "learning_rate": 1.727103700387916e-05, + "loss": 0.033, + "step": 124010 + }, + { + "epoch": 0.2201, + "grad_norm": 0.06363017857074738, + "learning_rate": 1.726710594157177e-05, + "loss": 0.0335, + "step": 124020 + }, + { + "epoch": 0.22015, + "grad_norm": 0.06069604679942131, + "learning_rate": 1.7263175090678786e-05, + "loss": 0.0338, + "step": 124030 + }, + { + "epoch": 0.2202, + "grad_norm": 0.06854899972677231, + "learning_rate": 1.7259244451307705e-05, + "loss": 0.0336, + "step": 124040 + }, + { + "epoch": 0.22025, + "grad_norm": 0.07456790655851364, + "learning_rate": 1.7255314023565956e-05, + "loss": 0.0343, + "step": 124050 + }, + { + "epoch": 0.2203, + "grad_norm": 0.06498544663190842, + "learning_rate": 1.7251383807561018e-05, + "loss": 0.0331, + "step": 124060 + }, + { + "epoch": 0.22035, + "grad_norm": 0.05230363830924034, + "learning_rate": 1.724745380340034e-05, + "loss": 0.0331, + "step": 124070 + }, + { + "epoch": 0.2204, + "grad_norm": 0.05821483954787254, + "learning_rate": 1.724352401119136e-05, + "loss": 0.034, + "step": 124080 + }, + { + "epoch": 0.22045, + "grad_norm": 0.055370673537254333, + "learning_rate": 1.723959443104152e-05, + "loss": 0.0334, + "step": 124090 + }, + { + "epoch": 0.2205, + "grad_norm": 0.05448485538363457, + "learning_rate": 1.723566506305825e-05, + "loss": 0.0346, + "step": 124100 + }, + { + "epoch": 0.22055, + "grad_norm": 0.06155163422226906, + "learning_rate": 1.723173590734898e-05, + "loss": 0.0336, + "step": 124110 + }, + { + "epoch": 0.2206, + "grad_norm": 0.05619033798575401, + "learning_rate": 1.722780696402114e-05, + "loss": 0.0331, + "step": 124120 + }, + { + "epoch": 0.22065, + "grad_norm": 0.07716576755046844, + "learning_rate": 1.7223878233182125e-05, + "loss": 0.0345, + "step": 124130 + }, + { + "epoch": 0.2207, + "grad_norm": 0.06452678143978119, + "learning_rate": 1.7219949714939374e-05, + "loss": 0.034, + "step": 124140 + }, + { + "epoch": 0.22075, + "grad_norm": 0.06073189154267311, + "learning_rate": 1.7216021409400256e-05, + "loss": 0.0335, + "step": 124150 + }, + { + "epoch": 0.2208, + "grad_norm": 0.06564535200595856, + "learning_rate": 1.7212093316672203e-05, + "loss": 0.0334, + "step": 124160 + }, + { + "epoch": 0.22085, + "grad_norm": 0.06574677675962448, + "learning_rate": 1.720816543686259e-05, + "loss": 0.0345, + "step": 124170 + }, + { + "epoch": 0.2209, + "grad_norm": 0.06860848516225815, + "learning_rate": 1.7204237770078803e-05, + "loss": 0.0344, + "step": 124180 + }, + { + "epoch": 0.22095, + "grad_norm": 0.07889265567064285, + "learning_rate": 1.7200310316428232e-05, + "loss": 0.0361, + "step": 124190 + }, + { + "epoch": 0.221, + "grad_norm": 0.07528276741504669, + "learning_rate": 1.719638307601824e-05, + "loss": 0.0351, + "step": 124200 + }, + { + "epoch": 0.22105, + "grad_norm": 0.0553547702729702, + "learning_rate": 1.719245604895621e-05, + "loss": 0.0347, + "step": 124210 + }, + { + "epoch": 0.2211, + "grad_norm": 0.06014566123485565, + "learning_rate": 1.7188529235349493e-05, + "loss": 0.0337, + "step": 124220 + }, + { + "epoch": 0.22115, + "grad_norm": 0.060287296772003174, + "learning_rate": 1.7184602635305455e-05, + "loss": 0.0341, + "step": 124230 + }, + { + "epoch": 0.2212, + "grad_norm": 0.06788293272256851, + "learning_rate": 1.7180676248931437e-05, + "loss": 0.0338, + "step": 124240 + }, + { + "epoch": 0.22125, + "grad_norm": 0.06208663061261177, + "learning_rate": 1.7176750076334797e-05, + "loss": 0.0337, + "step": 124250 + }, + { + "epoch": 0.2213, + "grad_norm": 0.057153813540935516, + "learning_rate": 1.7172824117622876e-05, + "loss": 0.0346, + "step": 124260 + }, + { + "epoch": 0.22135, + "grad_norm": 0.061345912516117096, + "learning_rate": 1.7168898372903e-05, + "loss": 0.0331, + "step": 124270 + }, + { + "epoch": 0.2214, + "grad_norm": 0.055477313697338104, + "learning_rate": 1.7164972842282504e-05, + "loss": 0.0328, + "step": 124280 + }, + { + "epoch": 0.22145, + "grad_norm": 0.06002333015203476, + "learning_rate": 1.7161047525868702e-05, + "loss": 0.035, + "step": 124290 + }, + { + "epoch": 0.2215, + "grad_norm": 0.06831765919923782, + "learning_rate": 1.715712242376892e-05, + "loss": 0.0367, + "step": 124300 + }, + { + "epoch": 0.22155, + "grad_norm": 0.059336576610803604, + "learning_rate": 1.7153197536090458e-05, + "loss": 0.0349, + "step": 124310 + }, + { + "epoch": 0.2216, + "grad_norm": 0.06508708000183105, + "learning_rate": 1.7149272862940628e-05, + "loss": 0.0338, + "step": 124320 + }, + { + "epoch": 0.22165, + "grad_norm": 0.06287387758493423, + "learning_rate": 1.714534840442674e-05, + "loss": 0.0352, + "step": 124330 + }, + { + "epoch": 0.2217, + "grad_norm": 0.05168507993221283, + "learning_rate": 1.7141424160656062e-05, + "loss": 0.0346, + "step": 124340 + }, + { + "epoch": 0.22175, + "grad_norm": 0.07266610860824585, + "learning_rate": 1.7137500131735907e-05, + "loss": 0.0378, + "step": 124350 + }, + { + "epoch": 0.2218, + "grad_norm": 0.10454844683408737, + "learning_rate": 1.713357631777353e-05, + "loss": 0.0372, + "step": 124360 + }, + { + "epoch": 0.22185, + "grad_norm": 0.0826743096113205, + "learning_rate": 1.712965271887623e-05, + "loss": 0.0376, + "step": 124370 + }, + { + "epoch": 0.2219, + "grad_norm": 0.06038236618041992, + "learning_rate": 1.712572933515127e-05, + "loss": 0.0363, + "step": 124380 + }, + { + "epoch": 0.22195, + "grad_norm": 0.06790652126073837, + "learning_rate": 1.712180616670591e-05, + "loss": 0.0346, + "step": 124390 + }, + { + "epoch": 0.222, + "grad_norm": 0.06319904327392578, + "learning_rate": 1.7117883213647413e-05, + "loss": 0.0348, + "step": 124400 + }, + { + "epoch": 0.22205, + "grad_norm": 0.07048717141151428, + "learning_rate": 1.711396047608302e-05, + "loss": 0.0357, + "step": 124410 + }, + { + "epoch": 0.2221, + "grad_norm": 0.06790277361869812, + "learning_rate": 1.7110037954119994e-05, + "loss": 0.0347, + "step": 124420 + }, + { + "epoch": 0.22215, + "grad_norm": 0.05983925983309746, + "learning_rate": 1.7106115647865557e-05, + "loss": 0.0378, + "step": 124430 + }, + { + "epoch": 0.2222, + "grad_norm": 0.06110047176480293, + "learning_rate": 1.710219355742695e-05, + "loss": 0.035, + "step": 124440 + }, + { + "epoch": 0.22225, + "grad_norm": 0.08017834275960922, + "learning_rate": 1.7098271682911416e-05, + "loss": 0.0343, + "step": 124450 + }, + { + "epoch": 0.2223, + "grad_norm": 0.06500820070505142, + "learning_rate": 1.7094350024426157e-05, + "loss": 0.0341, + "step": 124460 + }, + { + "epoch": 0.22235, + "grad_norm": 0.05860864371061325, + "learning_rate": 1.7090428582078403e-05, + "loss": 0.0343, + "step": 124470 + }, + { + "epoch": 0.2224, + "grad_norm": 0.07563772797584534, + "learning_rate": 1.7086507355975356e-05, + "loss": 0.0338, + "step": 124480 + }, + { + "epoch": 0.22245, + "grad_norm": 0.05746683478355408, + "learning_rate": 1.7082586346224232e-05, + "loss": 0.0336, + "step": 124490 + }, + { + "epoch": 0.2225, + "grad_norm": 0.0540003776550293, + "learning_rate": 1.7078665552932216e-05, + "loss": 0.0344, + "step": 124500 + }, + { + "epoch": 0.22255, + "grad_norm": 0.05797993019223213, + "learning_rate": 1.7074744976206506e-05, + "loss": 0.0329, + "step": 124510 + }, + { + "epoch": 0.2226, + "grad_norm": 0.06687459349632263, + "learning_rate": 1.7070824616154302e-05, + "loss": 0.0339, + "step": 124520 + }, + { + "epoch": 0.22265, + "grad_norm": 0.05583649501204491, + "learning_rate": 1.7066904472882762e-05, + "loss": 0.0332, + "step": 124530 + }, + { + "epoch": 0.2227, + "grad_norm": 0.06575287133455276, + "learning_rate": 1.7062984546499087e-05, + "loss": 0.0338, + "step": 124540 + }, + { + "epoch": 0.22275, + "grad_norm": 0.055947426706552505, + "learning_rate": 1.7059064837110416e-05, + "loss": 0.0341, + "step": 124550 + }, + { + "epoch": 0.2228, + "grad_norm": 0.06190335005521774, + "learning_rate": 1.7055145344823937e-05, + "loss": 0.034, + "step": 124560 + }, + { + "epoch": 0.22285, + "grad_norm": 0.06709223985671997, + "learning_rate": 1.7051226069746805e-05, + "loss": 0.0351, + "step": 124570 + }, + { + "epoch": 0.2229, + "grad_norm": 0.059285055845975876, + "learning_rate": 1.7047307011986158e-05, + "loss": 0.0336, + "step": 124580 + }, + { + "epoch": 0.22295, + "grad_norm": 0.0603620707988739, + "learning_rate": 1.7043388171649154e-05, + "loss": 0.0354, + "step": 124590 + }, + { + "epoch": 0.223, + "grad_norm": 0.0807933658361435, + "learning_rate": 1.703946954884293e-05, + "loss": 0.0358, + "step": 124600 + }, + { + "epoch": 0.22305, + "grad_norm": 0.06559517234563828, + "learning_rate": 1.7035551143674615e-05, + "loss": 0.0344, + "step": 124610 + }, + { + "epoch": 0.2231, + "grad_norm": 0.06602256000041962, + "learning_rate": 1.7031632956251336e-05, + "loss": 0.0373, + "step": 124620 + }, + { + "epoch": 0.22315, + "grad_norm": 0.06404702365398407, + "learning_rate": 1.7027714986680214e-05, + "loss": 0.0344, + "step": 124630 + }, + { + "epoch": 0.2232, + "grad_norm": 0.06762048602104187, + "learning_rate": 1.702379723506839e-05, + "loss": 0.0362, + "step": 124640 + }, + { + "epoch": 0.22325, + "grad_norm": 0.0721345841884613, + "learning_rate": 1.701987970152293e-05, + "loss": 0.0334, + "step": 124650 + }, + { + "epoch": 0.2233, + "grad_norm": 0.06913810223340988, + "learning_rate": 1.7015962386150978e-05, + "loss": 0.035, + "step": 124660 + }, + { + "epoch": 0.22335, + "grad_norm": 0.07465165853500366, + "learning_rate": 1.7012045289059604e-05, + "loss": 0.0342, + "step": 124670 + }, + { + "epoch": 0.2234, + "grad_norm": 0.06738516688346863, + "learning_rate": 1.700812841035592e-05, + "loss": 0.0332, + "step": 124680 + }, + { + "epoch": 0.22345, + "grad_norm": 0.06403343379497528, + "learning_rate": 1.7004211750146993e-05, + "loss": 0.0341, + "step": 124690 + }, + { + "epoch": 0.2235, + "grad_norm": 0.05525503680109978, + "learning_rate": 1.7000295308539917e-05, + "loss": 0.0345, + "step": 124700 + }, + { + "epoch": 0.22355, + "grad_norm": 0.06048808991909027, + "learning_rate": 1.6996379085641768e-05, + "loss": 0.0337, + "step": 124710 + }, + { + "epoch": 0.2236, + "grad_norm": 0.05922717973589897, + "learning_rate": 1.6992463081559602e-05, + "loss": 0.034, + "step": 124720 + }, + { + "epoch": 0.22365, + "grad_norm": 0.05505324527621269, + "learning_rate": 1.6988547296400488e-05, + "loss": 0.0328, + "step": 124730 + }, + { + "epoch": 0.2237, + "grad_norm": 0.05941491946578026, + "learning_rate": 1.6984631730271476e-05, + "loss": 0.0343, + "step": 124740 + }, + { + "epoch": 0.22375, + "grad_norm": 0.058640215545892715, + "learning_rate": 1.6980716383279622e-05, + "loss": 0.0325, + "step": 124750 + }, + { + "epoch": 0.2238, + "grad_norm": 0.05964535102248192, + "learning_rate": 1.6976801255531977e-05, + "loss": 0.0351, + "step": 124760 + }, + { + "epoch": 0.22385, + "grad_norm": 0.05463533103466034, + "learning_rate": 1.6972886347135565e-05, + "loss": 0.0333, + "step": 124770 + }, + { + "epoch": 0.2239, + "grad_norm": 0.06249071657657623, + "learning_rate": 1.696897165819743e-05, + "loss": 0.0344, + "step": 124780 + }, + { + "epoch": 0.22395, + "grad_norm": 0.07721731066703796, + "learning_rate": 1.696505718882459e-05, + "loss": 0.0338, + "step": 124790 + }, + { + "epoch": 0.224, + "grad_norm": 0.06688842922449112, + "learning_rate": 1.696114293912407e-05, + "loss": 0.0348, + "step": 124800 + }, + { + "epoch": 0.22405, + "grad_norm": 0.06384867429733276, + "learning_rate": 1.6957228909202883e-05, + "loss": 0.0343, + "step": 124810 + }, + { + "epoch": 0.2241, + "grad_norm": 0.06892948597669601, + "learning_rate": 1.6953315099168022e-05, + "loss": 0.0358, + "step": 124820 + }, + { + "epoch": 0.22415, + "grad_norm": 0.06952431052923203, + "learning_rate": 1.6949401509126524e-05, + "loss": 0.0341, + "step": 124830 + }, + { + "epoch": 0.2242, + "grad_norm": 0.06377065181732178, + "learning_rate": 1.694548813918535e-05, + "loss": 0.033, + "step": 124840 + }, + { + "epoch": 0.22425, + "grad_norm": 0.06937036663293839, + "learning_rate": 1.6941574989451518e-05, + "loss": 0.0334, + "step": 124850 + }, + { + "epoch": 0.2243, + "grad_norm": 0.06026379391551018, + "learning_rate": 1.693766206003198e-05, + "loss": 0.0359, + "step": 124860 + }, + { + "epoch": 0.22435, + "grad_norm": 0.06354683637619019, + "learning_rate": 1.6933749351033744e-05, + "loss": 0.036, + "step": 124870 + }, + { + "epoch": 0.2244, + "grad_norm": 0.06264355033636093, + "learning_rate": 1.692983686256377e-05, + "loss": 0.0342, + "step": 124880 + }, + { + "epoch": 0.22445, + "grad_norm": 0.067869171500206, + "learning_rate": 1.692592459472902e-05, + "loss": 0.0352, + "step": 124890 + }, + { + "epoch": 0.2245, + "grad_norm": 0.0714171975851059, + "learning_rate": 1.692201254763646e-05, + "loss": 0.0349, + "step": 124900 + }, + { + "epoch": 0.22455, + "grad_norm": 0.08100499957799911, + "learning_rate": 1.6918100721393045e-05, + "loss": 0.034, + "step": 124910 + }, + { + "epoch": 0.2246, + "grad_norm": 0.07563242316246033, + "learning_rate": 1.691418911610572e-05, + "loss": 0.0381, + "step": 124920 + }, + { + "epoch": 0.22465, + "grad_norm": 0.0738987997174263, + "learning_rate": 1.6910277731881424e-05, + "loss": 0.0353, + "step": 124930 + }, + { + "epoch": 0.2247, + "grad_norm": 0.06895702332258224, + "learning_rate": 1.6906366568827088e-05, + "loss": 0.036, + "step": 124940 + }, + { + "epoch": 0.22475, + "grad_norm": 0.09274575859308243, + "learning_rate": 1.690245562704966e-05, + "loss": 0.035, + "step": 124950 + }, + { + "epoch": 0.2248, + "grad_norm": 0.07356208562850952, + "learning_rate": 1.6898544906656052e-05, + "loss": 0.0351, + "step": 124960 + }, + { + "epoch": 0.22485, + "grad_norm": 0.06885560601949692, + "learning_rate": 1.6894634407753186e-05, + "loss": 0.0374, + "step": 124970 + }, + { + "epoch": 0.2249, + "grad_norm": 0.06371462345123291, + "learning_rate": 1.6890724130447963e-05, + "loss": 0.0353, + "step": 124980 + }, + { + "epoch": 0.22495, + "grad_norm": 0.06379080563783646, + "learning_rate": 1.68868140748473e-05, + "loss": 0.0341, + "step": 124990 + }, + { + "epoch": 0.225, + "grad_norm": 0.06789926439523697, + "learning_rate": 1.688290424105809e-05, + "loss": 0.0345, + "step": 125000 + }, + { + "epoch": 0.22505, + "grad_norm": 0.06488768756389618, + "learning_rate": 1.687899462918723e-05, + "loss": 0.0338, + "step": 125010 + }, + { + "epoch": 0.2251, + "grad_norm": 0.06970378756523132, + "learning_rate": 1.687508523934161e-05, + "loss": 0.0346, + "step": 125020 + }, + { + "epoch": 0.22515, + "grad_norm": 0.054153922945261, + "learning_rate": 1.687117607162809e-05, + "loss": 0.0353, + "step": 125030 + }, + { + "epoch": 0.2252, + "grad_norm": 0.06287054717540741, + "learning_rate": 1.6867267126153584e-05, + "loss": 0.0348, + "step": 125040 + }, + { + "epoch": 0.22525, + "grad_norm": 0.05829636752605438, + "learning_rate": 1.6863358403024928e-05, + "loss": 0.0344, + "step": 125050 + }, + { + "epoch": 0.2253, + "grad_norm": 0.049323271960020065, + "learning_rate": 1.6859449902349007e-05, + "loss": 0.0338, + "step": 125060 + }, + { + "epoch": 0.22535, + "grad_norm": 0.05087485536932945, + "learning_rate": 1.685554162423265e-05, + "loss": 0.034, + "step": 125070 + }, + { + "epoch": 0.2254, + "grad_norm": 0.06357801705598831, + "learning_rate": 1.6851633568782733e-05, + "loss": 0.034, + "step": 125080 + }, + { + "epoch": 0.22545, + "grad_norm": 0.06831253319978714, + "learning_rate": 1.68477257361061e-05, + "loss": 0.0351, + "step": 125090 + }, + { + "epoch": 0.2255, + "grad_norm": 0.049976907670497894, + "learning_rate": 1.6843818126309576e-05, + "loss": 0.0336, + "step": 125100 + }, + { + "epoch": 0.22555, + "grad_norm": 0.1038125678896904, + "learning_rate": 1.6839910739500002e-05, + "loss": 0.0362, + "step": 125110 + }, + { + "epoch": 0.2256, + "grad_norm": 0.07736409455537796, + "learning_rate": 1.68360035757842e-05, + "loss": 0.0344, + "step": 125120 + }, + { + "epoch": 0.22565, + "grad_norm": 0.06509806215763092, + "learning_rate": 1.683209663526899e-05, + "loss": 0.0342, + "step": 125130 + }, + { + "epoch": 0.2257, + "grad_norm": 0.06010761111974716, + "learning_rate": 1.6828189918061206e-05, + "loss": 0.0332, + "step": 125140 + }, + { + "epoch": 0.22575, + "grad_norm": 0.09686113148927689, + "learning_rate": 1.6824283424267617e-05, + "loss": 0.0341, + "step": 125150 + }, + { + "epoch": 0.2258, + "grad_norm": 0.06265415996313095, + "learning_rate": 1.6820377153995065e-05, + "loss": 0.0329, + "step": 125160 + }, + { + "epoch": 0.22585, + "grad_norm": 0.06570495665073395, + "learning_rate": 1.681647110735032e-05, + "loss": 0.0334, + "step": 125170 + }, + { + "epoch": 0.2259, + "grad_norm": 0.06195457652211189, + "learning_rate": 1.681256528444019e-05, + "loss": 0.0336, + "step": 125180 + }, + { + "epoch": 0.22595, + "grad_norm": 0.07338821887969971, + "learning_rate": 1.680865968537144e-05, + "loss": 0.0344, + "step": 125190 + }, + { + "epoch": 0.226, + "grad_norm": 0.07165399193763733, + "learning_rate": 1.6804754310250858e-05, + "loss": 0.0344, + "step": 125200 + }, + { + "epoch": 0.22605, + "grad_norm": 0.07292645424604416, + "learning_rate": 1.6800849159185217e-05, + "loss": 0.0343, + "step": 125210 + }, + { + "epoch": 0.2261, + "grad_norm": 0.06303286552429199, + "learning_rate": 1.6796944232281278e-05, + "loss": 0.0327, + "step": 125220 + }, + { + "epoch": 0.22615, + "grad_norm": 0.06282463669776917, + "learning_rate": 1.6793039529645806e-05, + "loss": 0.0348, + "step": 125230 + }, + { + "epoch": 0.2262, + "grad_norm": 0.07054274529218674, + "learning_rate": 1.678913505138554e-05, + "loss": 0.0351, + "step": 125240 + }, + { + "epoch": 0.22625, + "grad_norm": 0.0869477391242981, + "learning_rate": 1.6785230797607252e-05, + "loss": 0.034, + "step": 125250 + }, + { + "epoch": 0.2263, + "grad_norm": 0.07772769033908844, + "learning_rate": 1.678132676841765e-05, + "loss": 0.0339, + "step": 125260 + }, + { + "epoch": 0.22635, + "grad_norm": 0.0869736522436142, + "learning_rate": 1.6777422963923494e-05, + "loss": 0.0337, + "step": 125270 + }, + { + "epoch": 0.2264, + "grad_norm": 0.09080974757671356, + "learning_rate": 1.6773519384231512e-05, + "loss": 0.0334, + "step": 125280 + }, + { + "epoch": 0.22645, + "grad_norm": 0.05822908505797386, + "learning_rate": 1.6769616029448415e-05, + "loss": 0.0332, + "step": 125290 + }, + { + "epoch": 0.2265, + "grad_norm": 0.06880011409521103, + "learning_rate": 1.6765712899680924e-05, + "loss": 0.0357, + "step": 125300 + }, + { + "epoch": 0.22655, + "grad_norm": 0.0638493001461029, + "learning_rate": 1.676180999503575e-05, + "loss": 0.0325, + "step": 125310 + }, + { + "epoch": 0.2266, + "grad_norm": 0.0685199722647667, + "learning_rate": 1.6757907315619587e-05, + "loss": 0.0349, + "step": 125320 + }, + { + "epoch": 0.22665, + "grad_norm": 0.056751806288957596, + "learning_rate": 1.6754004861539156e-05, + "loss": 0.0337, + "step": 125330 + }, + { + "epoch": 0.2267, + "grad_norm": 0.056441500782966614, + "learning_rate": 1.6750102632901117e-05, + "loss": 0.0351, + "step": 125340 + }, + { + "epoch": 0.22675, + "grad_norm": 0.04892340302467346, + "learning_rate": 1.674620062981219e-05, + "loss": 0.0329, + "step": 125350 + }, + { + "epoch": 0.2268, + "grad_norm": 0.056955333799123764, + "learning_rate": 1.6742298852379025e-05, + "loss": 0.0333, + "step": 125360 + }, + { + "epoch": 0.22685, + "grad_norm": 0.05275273323059082, + "learning_rate": 1.6738397300708315e-05, + "loss": 0.0347, + "step": 125370 + }, + { + "epoch": 0.2269, + "grad_norm": 0.05845008045434952, + "learning_rate": 1.6734495974906713e-05, + "loss": 0.034, + "step": 125380 + }, + { + "epoch": 0.22695, + "grad_norm": 0.05458725616335869, + "learning_rate": 1.6730594875080887e-05, + "loss": 0.0333, + "step": 125390 + }, + { + "epoch": 0.227, + "grad_norm": 0.05155658721923828, + "learning_rate": 1.6726694001337496e-05, + "loss": 0.0338, + "step": 125400 + }, + { + "epoch": 0.22705, + "grad_norm": 0.06531024724245071, + "learning_rate": 1.6722793353783178e-05, + "loss": 0.0361, + "step": 125410 + }, + { + "epoch": 0.2271, + "grad_norm": 0.05535675212740898, + "learning_rate": 1.6718892932524584e-05, + "loss": 0.0336, + "step": 125420 + }, + { + "epoch": 0.22715, + "grad_norm": 0.05717598274350166, + "learning_rate": 1.671499273766834e-05, + "loss": 0.0318, + "step": 125430 + }, + { + "epoch": 0.2272, + "grad_norm": 0.06368177384138107, + "learning_rate": 1.6711092769321088e-05, + "loss": 0.0336, + "step": 125440 + }, + { + "epoch": 0.22725, + "grad_norm": 0.06778300553560257, + "learning_rate": 1.6707193027589434e-05, + "loss": 0.0327, + "step": 125450 + }, + { + "epoch": 0.2273, + "grad_norm": 0.06283332407474518, + "learning_rate": 1.6703293512580013e-05, + "loss": 0.0318, + "step": 125460 + }, + { + "epoch": 0.22735, + "grad_norm": 0.060217004269361496, + "learning_rate": 1.669939422439944e-05, + "loss": 0.0343, + "step": 125470 + }, + { + "epoch": 0.2274, + "grad_norm": 0.0580129511654377, + "learning_rate": 1.66954951631543e-05, + "loss": 0.0328, + "step": 125480 + }, + { + "epoch": 0.22745, + "grad_norm": 0.049048274755477905, + "learning_rate": 1.6691596328951212e-05, + "loss": 0.0313, + "step": 125490 + }, + { + "epoch": 0.2275, + "grad_norm": 0.05264052376151085, + "learning_rate": 1.668769772189675e-05, + "loss": 0.0329, + "step": 125500 + }, + { + "epoch": 0.22755, + "grad_norm": 0.05675099417567253, + "learning_rate": 1.6683799342097517e-05, + "loss": 0.0331, + "step": 125510 + }, + { + "epoch": 0.2276, + "grad_norm": 0.060881178826093674, + "learning_rate": 1.667990118966008e-05, + "loss": 0.0335, + "step": 125520 + }, + { + "epoch": 0.22765, + "grad_norm": 0.06495590507984161, + "learning_rate": 1.6676003264691015e-05, + "loss": 0.0355, + "step": 125530 + }, + { + "epoch": 0.2277, + "grad_norm": 0.05344432219862938, + "learning_rate": 1.6672105567296904e-05, + "loss": 0.0327, + "step": 125540 + }, + { + "epoch": 0.22775, + "grad_norm": 0.0638977512717247, + "learning_rate": 1.6668208097584287e-05, + "loss": 0.0351, + "step": 125550 + }, + { + "epoch": 0.2278, + "grad_norm": 0.07072903960943222, + "learning_rate": 1.6664310855659747e-05, + "loss": 0.0344, + "step": 125560 + }, + { + "epoch": 0.22785, + "grad_norm": 0.05785040929913521, + "learning_rate": 1.6660413841629795e-05, + "loss": 0.0349, + "step": 125570 + }, + { + "epoch": 0.2279, + "grad_norm": 0.061877842992544174, + "learning_rate": 1.6656517055601007e-05, + "loss": 0.0342, + "step": 125580 + }, + { + "epoch": 0.22795, + "grad_norm": 0.07650292664766312, + "learning_rate": 1.665262049767991e-05, + "loss": 0.035, + "step": 125590 + }, + { + "epoch": 0.228, + "grad_norm": 0.049784474074840546, + "learning_rate": 1.6648724167973028e-05, + "loss": 0.0326, + "step": 125600 + }, + { + "epoch": 0.22805, + "grad_norm": 0.0595824271440506, + "learning_rate": 1.6644828066586897e-05, + "loss": 0.0335, + "step": 125610 + }, + { + "epoch": 0.2281, + "grad_norm": 0.056988898664712906, + "learning_rate": 1.664093219362802e-05, + "loss": 0.0327, + "step": 125620 + }, + { + "epoch": 0.22815, + "grad_norm": 0.05611162260174751, + "learning_rate": 1.6637036549202924e-05, + "loss": 0.0349, + "step": 125630 + }, + { + "epoch": 0.2282, + "grad_norm": 0.05163775011897087, + "learning_rate": 1.66331411334181e-05, + "loss": 0.0338, + "step": 125640 + }, + { + "epoch": 0.22825, + "grad_norm": 0.04976727068424225, + "learning_rate": 1.6629245946380052e-05, + "loss": 0.0339, + "step": 125650 + }, + { + "epoch": 0.2283, + "grad_norm": 0.0533292330801487, + "learning_rate": 1.6625350988195282e-05, + "loss": 0.0329, + "step": 125660 + }, + { + "epoch": 0.22835, + "grad_norm": 0.05242196097970009, + "learning_rate": 1.6621456258970264e-05, + "loss": 0.0338, + "step": 125670 + }, + { + "epoch": 0.2284, + "grad_norm": 0.06851650774478912, + "learning_rate": 1.6617561758811493e-05, + "loss": 0.0355, + "step": 125680 + }, + { + "epoch": 0.22845, + "grad_norm": 0.05392906069755554, + "learning_rate": 1.6613667487825427e-05, + "loss": 0.0339, + "step": 125690 + }, + { + "epoch": 0.2285, + "grad_norm": 0.05616411939263344, + "learning_rate": 1.660977344611855e-05, + "loss": 0.0338, + "step": 125700 + }, + { + "epoch": 0.22855, + "grad_norm": 0.05417593568563461, + "learning_rate": 1.6605879633797304e-05, + "loss": 0.0345, + "step": 125710 + }, + { + "epoch": 0.2286, + "grad_norm": 0.051290225237607956, + "learning_rate": 1.6601986050968154e-05, + "loss": 0.0343, + "step": 125720 + }, + { + "epoch": 0.22865, + "grad_norm": 0.05213925987482071, + "learning_rate": 1.659809269773756e-05, + "loss": 0.0341, + "step": 125730 + }, + { + "epoch": 0.2287, + "grad_norm": 0.06550920009613037, + "learning_rate": 1.6594199574211944e-05, + "loss": 0.035, + "step": 125740 + }, + { + "epoch": 0.22875, + "grad_norm": 0.09685634821653366, + "learning_rate": 1.659030668049777e-05, + "loss": 0.0359, + "step": 125750 + }, + { + "epoch": 0.2288, + "grad_norm": 0.07793646305799484, + "learning_rate": 1.658641401670144e-05, + "loss": 0.0345, + "step": 125760 + }, + { + "epoch": 0.22885, + "grad_norm": 0.07697339355945587, + "learning_rate": 1.658252158292939e-05, + "loss": 0.0343, + "step": 125770 + }, + { + "epoch": 0.2289, + "grad_norm": 0.0588192492723465, + "learning_rate": 1.6578629379288042e-05, + "loss": 0.0359, + "step": 125780 + }, + { + "epoch": 0.22895, + "grad_norm": 0.0640011578798294, + "learning_rate": 1.65747374058838e-05, + "loss": 0.0344, + "step": 125790 + }, + { + "epoch": 0.229, + "grad_norm": 0.0636245459318161, + "learning_rate": 1.6570845662823075e-05, + "loss": 0.0354, + "step": 125800 + }, + { + "epoch": 0.22905, + "grad_norm": 0.06315630674362183, + "learning_rate": 1.656695415021226e-05, + "loss": 0.0341, + "step": 125810 + }, + { + "epoch": 0.2291, + "grad_norm": 0.05824105069041252, + "learning_rate": 1.6563062868157756e-05, + "loss": 0.0384, + "step": 125820 + }, + { + "epoch": 0.22915, + "grad_norm": 0.06384194642305374, + "learning_rate": 1.6559171816765936e-05, + "loss": 0.0337, + "step": 125830 + }, + { + "epoch": 0.2292, + "grad_norm": 0.04800541698932648, + "learning_rate": 1.6555280996143186e-05, + "loss": 0.0362, + "step": 125840 + }, + { + "epoch": 0.22925, + "grad_norm": 0.07251149415969849, + "learning_rate": 1.6551390406395896e-05, + "loss": 0.0389, + "step": 125850 + }, + { + "epoch": 0.2293, + "grad_norm": 0.06954000890254974, + "learning_rate": 1.6547500047630398e-05, + "loss": 0.0384, + "step": 125860 + }, + { + "epoch": 0.22935, + "grad_norm": 0.05920116603374481, + "learning_rate": 1.654360991995309e-05, + "loss": 0.0346, + "step": 125870 + }, + { + "epoch": 0.2294, + "grad_norm": 0.06064040586352348, + "learning_rate": 1.65397200234703e-05, + "loss": 0.0348, + "step": 125880 + }, + { + "epoch": 0.22945, + "grad_norm": 0.06895332783460617, + "learning_rate": 1.653583035828839e-05, + "loss": 0.0352, + "step": 125890 + }, + { + "epoch": 0.2295, + "grad_norm": 0.06560982763767242, + "learning_rate": 1.6531940924513697e-05, + "loss": 0.0351, + "step": 125900 + }, + { + "epoch": 0.22955, + "grad_norm": 0.059170350432395935, + "learning_rate": 1.6528051722252557e-05, + "loss": 0.0345, + "step": 125910 + }, + { + "epoch": 0.2296, + "grad_norm": 0.07745416462421417, + "learning_rate": 1.6524162751611304e-05, + "loss": 0.0339, + "step": 125920 + }, + { + "epoch": 0.22965, + "grad_norm": 0.05844829976558685, + "learning_rate": 1.6520274012696252e-05, + "loss": 0.0343, + "step": 125930 + }, + { + "epoch": 0.2297, + "grad_norm": 0.051176246255636215, + "learning_rate": 1.6516385505613728e-05, + "loss": 0.0339, + "step": 125940 + }, + { + "epoch": 0.22975, + "grad_norm": 0.05116390809416771, + "learning_rate": 1.651249723047003e-05, + "loss": 0.0323, + "step": 125950 + }, + { + "epoch": 0.2298, + "grad_norm": 0.06636460870504379, + "learning_rate": 1.650860918737147e-05, + "loss": 0.0338, + "step": 125960 + }, + { + "epoch": 0.22985, + "grad_norm": 0.07166604697704315, + "learning_rate": 1.6504721376424354e-05, + "loss": 0.0343, + "step": 125970 + }, + { + "epoch": 0.2299, + "grad_norm": 0.06095893681049347, + "learning_rate": 1.6500833797734955e-05, + "loss": 0.0338, + "step": 125980 + }, + { + "epoch": 0.22995, + "grad_norm": 0.05798349529504776, + "learning_rate": 1.6496946451409577e-05, + "loss": 0.0327, + "step": 125990 + }, + { + "epoch": 0.23, + "grad_norm": 0.05991566926240921, + "learning_rate": 1.649305933755448e-05, + "loss": 0.033, + "step": 126000 + }, + { + "epoch": 0.23005, + "grad_norm": 0.06393137574195862, + "learning_rate": 1.648917245627595e-05, + "loss": 0.0348, + "step": 126010 + }, + { + "epoch": 0.2301, + "grad_norm": 0.058520495891571045, + "learning_rate": 1.648528580768024e-05, + "loss": 0.0335, + "step": 126020 + }, + { + "epoch": 0.23015, + "grad_norm": 0.05948096886277199, + "learning_rate": 1.6481399391873615e-05, + "loss": 0.0339, + "step": 126030 + }, + { + "epoch": 0.2302, + "grad_norm": 0.060224246233701706, + "learning_rate": 1.647751320896235e-05, + "loss": 0.033, + "step": 126040 + }, + { + "epoch": 0.23025, + "grad_norm": 0.0713365450501442, + "learning_rate": 1.6473627259052648e-05, + "loss": 0.0353, + "step": 126050 + }, + { + "epoch": 0.2303, + "grad_norm": 0.07686593383550644, + "learning_rate": 1.6469741542250792e-05, + "loss": 0.0345, + "step": 126060 + }, + { + "epoch": 0.23035, + "grad_norm": 0.059691183269023895, + "learning_rate": 1.646585605866299e-05, + "loss": 0.034, + "step": 126070 + }, + { + "epoch": 0.2304, + "grad_norm": 0.06376127898693085, + "learning_rate": 1.6461970808395476e-05, + "loss": 0.0349, + "step": 126080 + }, + { + "epoch": 0.23045, + "grad_norm": 0.06100200489163399, + "learning_rate": 1.6458085791554474e-05, + "loss": 0.0337, + "step": 126090 + }, + { + "epoch": 0.2305, + "grad_norm": 0.05868079885840416, + "learning_rate": 1.6454201008246196e-05, + "loss": 0.0332, + "step": 126100 + }, + { + "epoch": 0.23055, + "grad_norm": 0.07140611112117767, + "learning_rate": 1.6450316458576852e-05, + "loss": 0.0352, + "step": 126110 + }, + { + "epoch": 0.2306, + "grad_norm": 0.07169882208108902, + "learning_rate": 1.6446432142652647e-05, + "loss": 0.0346, + "step": 126120 + }, + { + "epoch": 0.23065, + "grad_norm": 0.058701857924461365, + "learning_rate": 1.6442548060579778e-05, + "loss": 0.0332, + "step": 126130 + }, + { + "epoch": 0.2307, + "grad_norm": 0.06083898991346359, + "learning_rate": 1.643866421246442e-05, + "loss": 0.0329, + "step": 126140 + }, + { + "epoch": 0.23075, + "grad_norm": 0.056007515639066696, + "learning_rate": 1.6434780598412764e-05, + "loss": 0.0342, + "step": 126150 + }, + { + "epoch": 0.2308, + "grad_norm": 0.05735059827566147, + "learning_rate": 1.6430897218530998e-05, + "loss": 0.0328, + "step": 126160 + }, + { + "epoch": 0.23085, + "grad_norm": 0.06010760739445686, + "learning_rate": 1.642701407292528e-05, + "loss": 0.0334, + "step": 126170 + }, + { + "epoch": 0.2309, + "grad_norm": 0.0675911083817482, + "learning_rate": 1.6423131161701778e-05, + "loss": 0.0329, + "step": 126180 + }, + { + "epoch": 0.23095, + "grad_norm": 0.05605832487344742, + "learning_rate": 1.6419248484966642e-05, + "loss": 0.0344, + "step": 126190 + }, + { + "epoch": 0.231, + "grad_norm": 0.05773789808154106, + "learning_rate": 1.6415366042826036e-05, + "loss": 0.0339, + "step": 126200 + }, + { + "epoch": 0.23105, + "grad_norm": 0.0519493967294693, + "learning_rate": 1.6411483835386092e-05, + "loss": 0.0339, + "step": 126210 + }, + { + "epoch": 0.2311, + "grad_norm": 0.058896906673908234, + "learning_rate": 1.640760186275296e-05, + "loss": 0.0332, + "step": 126220 + }, + { + "epoch": 0.23115, + "grad_norm": 0.08163320273160934, + "learning_rate": 1.640372012503276e-05, + "loss": 0.0341, + "step": 126230 + }, + { + "epoch": 0.2312, + "grad_norm": 0.09805915504693985, + "learning_rate": 1.6399838622331616e-05, + "loss": 0.0341, + "step": 126240 + }, + { + "epoch": 0.23125, + "grad_norm": 0.08248712122440338, + "learning_rate": 1.639595735475567e-05, + "loss": 0.0342, + "step": 126250 + }, + { + "epoch": 0.2313, + "grad_norm": 0.08971148729324341, + "learning_rate": 1.6392076322411e-05, + "loss": 0.0329, + "step": 126260 + }, + { + "epoch": 0.23135, + "grad_norm": 0.06334342062473297, + "learning_rate": 1.6388195525403746e-05, + "loss": 0.0323, + "step": 126270 + }, + { + "epoch": 0.2314, + "grad_norm": 0.06715748459100723, + "learning_rate": 1.6384314963839976e-05, + "loss": 0.0319, + "step": 126280 + }, + { + "epoch": 0.23145, + "grad_norm": 0.06183129921555519, + "learning_rate": 1.6380434637825804e-05, + "loss": 0.033, + "step": 126290 + }, + { + "epoch": 0.2315, + "grad_norm": 0.07641464471817017, + "learning_rate": 1.637655454746731e-05, + "loss": 0.0342, + "step": 126300 + }, + { + "epoch": 0.23155, + "grad_norm": 0.06532736867666245, + "learning_rate": 1.6372674692870578e-05, + "loss": 0.0331, + "step": 126310 + }, + { + "epoch": 0.2316, + "grad_norm": 0.06653792411088943, + "learning_rate": 1.636879507414168e-05, + "loss": 0.0338, + "step": 126320 + }, + { + "epoch": 0.23165, + "grad_norm": 0.05246128514409065, + "learning_rate": 1.6364915691386677e-05, + "loss": 0.0323, + "step": 126330 + }, + { + "epoch": 0.2317, + "grad_norm": 0.07467667013406754, + "learning_rate": 1.6361036544711628e-05, + "loss": 0.0332, + "step": 126340 + }, + { + "epoch": 0.23175, + "grad_norm": 0.06301455944776535, + "learning_rate": 1.6357157634222613e-05, + "loss": 0.0329, + "step": 126350 + }, + { + "epoch": 0.2318, + "grad_norm": 0.08649516105651855, + "learning_rate": 1.6353278960025646e-05, + "loss": 0.0346, + "step": 126360 + }, + { + "epoch": 0.23185, + "grad_norm": 0.06692379713058472, + "learning_rate": 1.634940052222679e-05, + "loss": 0.033, + "step": 126370 + }, + { + "epoch": 0.2319, + "grad_norm": 0.06434053182601929, + "learning_rate": 1.634552232093207e-05, + "loss": 0.0334, + "step": 126380 + }, + { + "epoch": 0.23195, + "grad_norm": 0.06479530781507492, + "learning_rate": 1.6341644356247526e-05, + "loss": 0.0341, + "step": 126390 + }, + { + "epoch": 0.232, + "grad_norm": 0.05729609355330467, + "learning_rate": 1.6337766628279165e-05, + "loss": 0.0318, + "step": 126400 + }, + { + "epoch": 0.23205, + "grad_norm": 0.06309373676776886, + "learning_rate": 1.6333889137133014e-05, + "loss": 0.0323, + "step": 126410 + }, + { + "epoch": 0.2321, + "grad_norm": 0.06207401677966118, + "learning_rate": 1.633001188291508e-05, + "loss": 0.0345, + "step": 126420 + }, + { + "epoch": 0.23215, + "grad_norm": 0.06494017690420151, + "learning_rate": 1.632613486573136e-05, + "loss": 0.033, + "step": 126430 + }, + { + "epoch": 0.2322, + "grad_norm": 0.06067967042326927, + "learning_rate": 1.632225808568786e-05, + "loss": 0.0348, + "step": 126440 + }, + { + "epoch": 0.23225, + "grad_norm": 0.05891747027635574, + "learning_rate": 1.6318381542890552e-05, + "loss": 0.0337, + "step": 126450 + }, + { + "epoch": 0.2323, + "grad_norm": 0.07398000359535217, + "learning_rate": 1.6314505237445448e-05, + "loss": 0.0352, + "step": 126460 + }, + { + "epoch": 0.23235, + "grad_norm": 0.14084388315677643, + "learning_rate": 1.631062916945849e-05, + "loss": 0.0342, + "step": 126470 + }, + { + "epoch": 0.2324, + "grad_norm": 0.09354683756828308, + "learning_rate": 1.6306753339035673e-05, + "loss": 0.0333, + "step": 126480 + }, + { + "epoch": 0.23245, + "grad_norm": 0.09307026118040085, + "learning_rate": 1.630287774628296e-05, + "loss": 0.0362, + "step": 126490 + }, + { + "epoch": 0.2325, + "grad_norm": 0.07109540700912476, + "learning_rate": 1.6299002391306294e-05, + "loss": 0.0351, + "step": 126500 + }, + { + "epoch": 0.23255, + "grad_norm": 0.0916818305850029, + "learning_rate": 1.6295127274211643e-05, + "loss": 0.0361, + "step": 126510 + }, + { + "epoch": 0.2326, + "grad_norm": 0.05944633483886719, + "learning_rate": 1.6291252395104935e-05, + "loss": 0.0331, + "step": 126520 + }, + { + "epoch": 0.23265, + "grad_norm": 0.07088175415992737, + "learning_rate": 1.6287377754092108e-05, + "loss": 0.034, + "step": 126530 + }, + { + "epoch": 0.2327, + "grad_norm": 0.07546090334653854, + "learning_rate": 1.6283503351279118e-05, + "loss": 0.0341, + "step": 126540 + }, + { + "epoch": 0.23275, + "grad_norm": 0.06843554973602295, + "learning_rate": 1.627962918677185e-05, + "loss": 0.0345, + "step": 126550 + }, + { + "epoch": 0.2328, + "grad_norm": 0.06755559891462326, + "learning_rate": 1.6275755260676268e-05, + "loss": 0.034, + "step": 126560 + }, + { + "epoch": 0.23285, + "grad_norm": 0.05856577306985855, + "learning_rate": 1.627188157309824e-05, + "loss": 0.0328, + "step": 126570 + }, + { + "epoch": 0.2329, + "grad_norm": 0.054877739399671555, + "learning_rate": 1.6268008124143703e-05, + "loss": 0.0335, + "step": 126580 + }, + { + "epoch": 0.23295, + "grad_norm": 0.0606815367937088, + "learning_rate": 1.6264134913918537e-05, + "loss": 0.0336, + "step": 126590 + }, + { + "epoch": 0.233, + "grad_norm": 0.05075656995177269, + "learning_rate": 1.626026194252864e-05, + "loss": 0.0335, + "step": 126600 + }, + { + "epoch": 0.23305, + "grad_norm": 0.05685050040483475, + "learning_rate": 1.6256389210079904e-05, + "loss": 0.0329, + "step": 126610 + }, + { + "epoch": 0.2331, + "grad_norm": 0.0628228485584259, + "learning_rate": 1.6252516716678196e-05, + "loss": 0.0325, + "step": 126620 + }, + { + "epoch": 0.23315, + "grad_norm": 0.06094030663371086, + "learning_rate": 1.62486444624294e-05, + "loss": 0.0351, + "step": 126630 + }, + { + "epoch": 0.2332, + "grad_norm": 0.05392427742481232, + "learning_rate": 1.624477244743937e-05, + "loss": 0.0334, + "step": 126640 + }, + { + "epoch": 0.23325, + "grad_norm": 0.05599977821111679, + "learning_rate": 1.624090067181398e-05, + "loss": 0.0319, + "step": 126650 + }, + { + "epoch": 0.2333, + "grad_norm": 0.051868144422769547, + "learning_rate": 1.6237029135659065e-05, + "loss": 0.0357, + "step": 126660 + }, + { + "epoch": 0.23335, + "grad_norm": 0.06395290791988373, + "learning_rate": 1.6233157839080485e-05, + "loss": 0.0342, + "step": 126670 + }, + { + "epoch": 0.2334, + "grad_norm": 0.0637071505188942, + "learning_rate": 1.6229286782184083e-05, + "loss": 0.0335, + "step": 126680 + }, + { + "epoch": 0.23345, + "grad_norm": 0.057250093668699265, + "learning_rate": 1.6225415965075676e-05, + "loss": 0.0331, + "step": 126690 + }, + { + "epoch": 0.2335, + "grad_norm": 0.05252157896757126, + "learning_rate": 1.622154538786111e-05, + "loss": 0.0368, + "step": 126700 + }, + { + "epoch": 0.23355, + "grad_norm": 0.057001277804374695, + "learning_rate": 1.6217675050646188e-05, + "loss": 0.0348, + "step": 126710 + }, + { + "epoch": 0.2336, + "grad_norm": 0.060486093163490295, + "learning_rate": 1.6213804953536727e-05, + "loss": 0.0336, + "step": 126720 + }, + { + "epoch": 0.23365, + "grad_norm": 0.05644745007157326, + "learning_rate": 1.6209935096638553e-05, + "loss": 0.0329, + "step": 126730 + }, + { + "epoch": 0.2337, + "grad_norm": 0.06537935882806778, + "learning_rate": 1.6206065480057432e-05, + "loss": 0.0342, + "step": 126740 + }, + { + "epoch": 0.23375, + "grad_norm": 0.06667554378509521, + "learning_rate": 1.6202196103899197e-05, + "loss": 0.0336, + "step": 126750 + }, + { + "epoch": 0.2338, + "grad_norm": 0.0648476704955101, + "learning_rate": 1.6198326968269594e-05, + "loss": 0.0338, + "step": 126760 + }, + { + "epoch": 0.23385, + "grad_norm": 0.05586965009570122, + "learning_rate": 1.619445807327445e-05, + "loss": 0.0325, + "step": 126770 + }, + { + "epoch": 0.2339, + "grad_norm": 0.0503368154168129, + "learning_rate": 1.619058941901949e-05, + "loss": 0.0333, + "step": 126780 + }, + { + "epoch": 0.23395, + "grad_norm": 0.06135008856654167, + "learning_rate": 1.6186721005610515e-05, + "loss": 0.0337, + "step": 126790 + }, + { + "epoch": 0.234, + "grad_norm": 0.061872679740190506, + "learning_rate": 1.618285283315328e-05, + "loss": 0.0338, + "step": 126800 + }, + { + "epoch": 0.23405, + "grad_norm": 0.06412120163440704, + "learning_rate": 1.6178984901753534e-05, + "loss": 0.0335, + "step": 126810 + }, + { + "epoch": 0.2341, + "grad_norm": 0.048076044768095016, + "learning_rate": 1.617511721151703e-05, + "loss": 0.0329, + "step": 126820 + }, + { + "epoch": 0.23415, + "grad_norm": 0.060390032827854156, + "learning_rate": 1.61712497625495e-05, + "loss": 0.0343, + "step": 126830 + }, + { + "epoch": 0.2342, + "grad_norm": 0.06160585582256317, + "learning_rate": 1.616738255495669e-05, + "loss": 0.0326, + "step": 126840 + }, + { + "epoch": 0.23425, + "grad_norm": 0.06219357252120972, + "learning_rate": 1.6163515588844318e-05, + "loss": 0.033, + "step": 126850 + }, + { + "epoch": 0.2343, + "grad_norm": 0.06169546768069267, + "learning_rate": 1.6159648864318106e-05, + "loss": 0.0341, + "step": 126860 + }, + { + "epoch": 0.23435, + "grad_norm": 0.06924784928560257, + "learning_rate": 1.6155782381483784e-05, + "loss": 0.033, + "step": 126870 + }, + { + "epoch": 0.2344, + "grad_norm": 0.0668037161231041, + "learning_rate": 1.6151916140447042e-05, + "loss": 0.0333, + "step": 126880 + }, + { + "epoch": 0.23445, + "grad_norm": 0.06414536386728287, + "learning_rate": 1.6148050141313592e-05, + "loss": 0.0343, + "step": 126890 + }, + { + "epoch": 0.2345, + "grad_norm": 0.06171473115682602, + "learning_rate": 1.6144184384189127e-05, + "loss": 0.0332, + "step": 126900 + }, + { + "epoch": 0.23455, + "grad_norm": 0.06489849835634232, + "learning_rate": 1.6140318869179333e-05, + "loss": 0.034, + "step": 126910 + }, + { + "epoch": 0.2346, + "grad_norm": 0.059951625764369965, + "learning_rate": 1.613645359638989e-05, + "loss": 0.0328, + "step": 126920 + }, + { + "epoch": 0.23465, + "grad_norm": 0.05820494145154953, + "learning_rate": 1.613258856592647e-05, + "loss": 0.033, + "step": 126930 + }, + { + "epoch": 0.2347, + "grad_norm": 0.05218276381492615, + "learning_rate": 1.612872377789476e-05, + "loss": 0.0333, + "step": 126940 + }, + { + "epoch": 0.23475, + "grad_norm": 0.05372566729784012, + "learning_rate": 1.6124859232400396e-05, + "loss": 0.0325, + "step": 126950 + }, + { + "epoch": 0.2348, + "grad_norm": 0.05991847440600395, + "learning_rate": 1.6120994929549065e-05, + "loss": 0.0335, + "step": 126960 + }, + { + "epoch": 0.23485, + "grad_norm": 0.05379624664783478, + "learning_rate": 1.6117130869446378e-05, + "loss": 0.0341, + "step": 126970 + }, + { + "epoch": 0.2349, + "grad_norm": 0.07068346440792084, + "learning_rate": 1.6113267052198e-05, + "loss": 0.034, + "step": 126980 + }, + { + "epoch": 0.23495, + "grad_norm": 0.06268243491649628, + "learning_rate": 1.6109403477909572e-05, + "loss": 0.0355, + "step": 126990 + }, + { + "epoch": 0.235, + "grad_norm": 0.06141556426882744, + "learning_rate": 1.6105540146686706e-05, + "loss": 0.0336, + "step": 127000 + }, + { + "epoch": 0.23505, + "grad_norm": 0.06479032337665558, + "learning_rate": 1.6101677058635035e-05, + "loss": 0.0336, + "step": 127010 + }, + { + "epoch": 0.2351, + "grad_norm": 0.07502786815166473, + "learning_rate": 1.6097814213860165e-05, + "loss": 0.0345, + "step": 127020 + }, + { + "epoch": 0.23515, + "grad_norm": 0.05763748288154602, + "learning_rate": 1.6093951612467713e-05, + "loss": 0.0322, + "step": 127030 + }, + { + "epoch": 0.2352, + "grad_norm": 0.06246546283364296, + "learning_rate": 1.6090089254563274e-05, + "loss": 0.0349, + "step": 127040 + }, + { + "epoch": 0.23525, + "grad_norm": 0.06013244017958641, + "learning_rate": 1.6086227140252443e-05, + "loss": 0.0331, + "step": 127050 + }, + { + "epoch": 0.2353, + "grad_norm": 0.05250510200858116, + "learning_rate": 1.608236526964083e-05, + "loss": 0.0337, + "step": 127060 + }, + { + "epoch": 0.23535, + "grad_norm": 0.07325278222560883, + "learning_rate": 1.6078503642833985e-05, + "loss": 0.0353, + "step": 127070 + }, + { + "epoch": 0.2354, + "grad_norm": 0.08081185817718506, + "learning_rate": 1.6074642259937507e-05, + "loss": 0.0357, + "step": 127080 + }, + { + "epoch": 0.23545, + "grad_norm": 0.06401636451482773, + "learning_rate": 1.6070781121056953e-05, + "loss": 0.0347, + "step": 127090 + }, + { + "epoch": 0.2355, + "grad_norm": 0.05674474313855171, + "learning_rate": 1.6066920226297894e-05, + "loss": 0.036, + "step": 127100 + }, + { + "epoch": 0.23555, + "grad_norm": 0.07341116666793823, + "learning_rate": 1.6063059575765872e-05, + "loss": 0.0341, + "step": 127110 + }, + { + "epoch": 0.2356, + "grad_norm": 0.07966278493404388, + "learning_rate": 1.6059199169566446e-05, + "loss": 0.0338, + "step": 127120 + }, + { + "epoch": 0.23565, + "grad_norm": 0.06760095804929733, + "learning_rate": 1.605533900780516e-05, + "loss": 0.0345, + "step": 127130 + }, + { + "epoch": 0.2357, + "grad_norm": 0.06217208877205849, + "learning_rate": 1.6051479090587534e-05, + "loss": 0.0347, + "step": 127140 + }, + { + "epoch": 0.23575, + "grad_norm": 0.07714894413948059, + "learning_rate": 1.604761941801913e-05, + "loss": 0.0352, + "step": 127150 + }, + { + "epoch": 0.2358, + "grad_norm": 0.06739632785320282, + "learning_rate": 1.6043759990205427e-05, + "loss": 0.0365, + "step": 127160 + }, + { + "epoch": 0.23585, + "grad_norm": 0.05616482347249985, + "learning_rate": 1.6039900807251962e-05, + "loss": 0.0371, + "step": 127170 + }, + { + "epoch": 0.2359, + "grad_norm": 0.07639485597610474, + "learning_rate": 1.6036041869264254e-05, + "loss": 0.0337, + "step": 127180 + }, + { + "epoch": 0.23595, + "grad_norm": 0.06982140243053436, + "learning_rate": 1.6032183176347786e-05, + "loss": 0.0335, + "step": 127190 + }, + { + "epoch": 0.236, + "grad_norm": 0.08310776203870773, + "learning_rate": 1.6028324728608067e-05, + "loss": 0.0341, + "step": 127200 + }, + { + "epoch": 0.23605, + "grad_norm": 0.06467178463935852, + "learning_rate": 1.6024466526150574e-05, + "loss": 0.0345, + "step": 127210 + }, + { + "epoch": 0.2361, + "grad_norm": 0.066603884100914, + "learning_rate": 1.6020608569080802e-05, + "loss": 0.0376, + "step": 127220 + }, + { + "epoch": 0.23615, + "grad_norm": 0.058768756687641144, + "learning_rate": 1.6016750857504208e-05, + "loss": 0.0349, + "step": 127230 + }, + { + "epoch": 0.2362, + "grad_norm": 0.05835096910595894, + "learning_rate": 1.601289339152627e-05, + "loss": 0.0328, + "step": 127240 + }, + { + "epoch": 0.23625, + "grad_norm": 0.06757104396820068, + "learning_rate": 1.6009036171252465e-05, + "loss": 0.0339, + "step": 127250 + }, + { + "epoch": 0.2363, + "grad_norm": 0.07333344221115112, + "learning_rate": 1.6005179196788217e-05, + "loss": 0.0348, + "step": 127260 + }, + { + "epoch": 0.23635, + "grad_norm": 0.07218988239765167, + "learning_rate": 1.6001322468239e-05, + "loss": 0.0342, + "step": 127270 + }, + { + "epoch": 0.2364, + "grad_norm": 0.06297601759433746, + "learning_rate": 1.599746598571024e-05, + "loss": 0.0342, + "step": 127280 + }, + { + "epoch": 0.23645, + "grad_norm": 0.05608023330569267, + "learning_rate": 1.5993609749307385e-05, + "loss": 0.0355, + "step": 127290 + }, + { + "epoch": 0.2365, + "grad_norm": 0.07831545919179916, + "learning_rate": 1.5989753759135853e-05, + "loss": 0.0344, + "step": 127300 + }, + { + "epoch": 0.23655, + "grad_norm": 0.06702210754156113, + "learning_rate": 1.5985898015301064e-05, + "loss": 0.0341, + "step": 127310 + }, + { + "epoch": 0.2366, + "grad_norm": 0.05965721979737282, + "learning_rate": 1.5982042517908445e-05, + "loss": 0.0349, + "step": 127320 + }, + { + "epoch": 0.23665, + "grad_norm": 0.07125255465507507, + "learning_rate": 1.597818726706339e-05, + "loss": 0.0391, + "step": 127330 + }, + { + "epoch": 0.2367, + "grad_norm": 0.06022556126117706, + "learning_rate": 1.597433226287131e-05, + "loss": 0.034, + "step": 127340 + }, + { + "epoch": 0.23675, + "grad_norm": 0.053169671446084976, + "learning_rate": 1.5970477505437586e-05, + "loss": 0.0339, + "step": 127350 + }, + { + "epoch": 0.2368, + "grad_norm": 0.05481424927711487, + "learning_rate": 1.596662299486762e-05, + "loss": 0.0348, + "step": 127360 + }, + { + "epoch": 0.23685, + "grad_norm": 0.06159573793411255, + "learning_rate": 1.596276873126679e-05, + "loss": 0.0347, + "step": 127370 + }, + { + "epoch": 0.2369, + "grad_norm": 0.06082068756222725, + "learning_rate": 1.5958914714740464e-05, + "loss": 0.0341, + "step": 127380 + }, + { + "epoch": 0.23695, + "grad_norm": 0.07267327606678009, + "learning_rate": 1.595506094539402e-05, + "loss": 0.0356, + "step": 127390 + }, + { + "epoch": 0.237, + "grad_norm": 0.06275376677513123, + "learning_rate": 1.5951207423332806e-05, + "loss": 0.0342, + "step": 127400 + }, + { + "epoch": 0.23705, + "grad_norm": 0.04976017028093338, + "learning_rate": 1.5947354148662187e-05, + "loss": 0.0334, + "step": 127410 + }, + { + "epoch": 0.2371, + "grad_norm": 0.05717877671122551, + "learning_rate": 1.5943501121487496e-05, + "loss": 0.0356, + "step": 127420 + }, + { + "epoch": 0.23715, + "grad_norm": 0.057774148881435394, + "learning_rate": 1.5939648341914082e-05, + "loss": 0.0335, + "step": 127430 + }, + { + "epoch": 0.2372, + "grad_norm": 0.07764162123203278, + "learning_rate": 1.593579581004729e-05, + "loss": 0.0352, + "step": 127440 + }, + { + "epoch": 0.23725, + "grad_norm": 0.07691578567028046, + "learning_rate": 1.593194352599242e-05, + "loss": 0.0349, + "step": 127450 + }, + { + "epoch": 0.2373, + "grad_norm": 0.07152264565229416, + "learning_rate": 1.5928091489854823e-05, + "loss": 0.0344, + "step": 127460 + }, + { + "epoch": 0.23735, + "grad_norm": 0.0600624606013298, + "learning_rate": 1.5924239701739786e-05, + "loss": 0.033, + "step": 127470 + }, + { + "epoch": 0.2374, + "grad_norm": 0.05317433178424835, + "learning_rate": 1.5920388161752632e-05, + "loss": 0.033, + "step": 127480 + }, + { + "epoch": 0.23745, + "grad_norm": 0.05922538787126541, + "learning_rate": 1.591653686999865e-05, + "loss": 0.0325, + "step": 127490 + }, + { + "epoch": 0.2375, + "grad_norm": 0.06978096812963486, + "learning_rate": 1.5912685826583136e-05, + "loss": 0.0333, + "step": 127500 + }, + { + "epoch": 0.23755, + "grad_norm": 0.06716527044773102, + "learning_rate": 1.5908835031611386e-05, + "loss": 0.034, + "step": 127510 + }, + { + "epoch": 0.2376, + "grad_norm": 0.06381786614656448, + "learning_rate": 1.5904984485188662e-05, + "loss": 0.0353, + "step": 127520 + }, + { + "epoch": 0.23765, + "grad_norm": 0.058379948139190674, + "learning_rate": 1.5901134187420252e-05, + "loss": 0.0351, + "step": 127530 + }, + { + "epoch": 0.2377, + "grad_norm": 0.054788608103990555, + "learning_rate": 1.589728413841141e-05, + "loss": 0.0328, + "step": 127540 + }, + { + "epoch": 0.23775, + "grad_norm": 0.06037406250834465, + "learning_rate": 1.5893434338267394e-05, + "loss": 0.0331, + "step": 127550 + }, + { + "epoch": 0.2378, + "grad_norm": 0.07420245558023453, + "learning_rate": 1.588958478709347e-05, + "loss": 0.033, + "step": 127560 + }, + { + "epoch": 0.23785, + "grad_norm": 0.06018362194299698, + "learning_rate": 1.5885735484994876e-05, + "loss": 0.0331, + "step": 127570 + }, + { + "epoch": 0.2379, + "grad_norm": 0.05091716721653938, + "learning_rate": 1.5881886432076852e-05, + "loss": 0.0321, + "step": 127580 + }, + { + "epoch": 0.23795, + "grad_norm": 0.051424361765384674, + "learning_rate": 1.5878037628444624e-05, + "loss": 0.0319, + "step": 127590 + }, + { + "epoch": 0.238, + "grad_norm": 0.06559579819440842, + "learning_rate": 1.587418907420342e-05, + "loss": 0.0346, + "step": 127600 + }, + { + "epoch": 0.23805, + "grad_norm": 0.056557316333055496, + "learning_rate": 1.5870340769458457e-05, + "loss": 0.0345, + "step": 127610 + }, + { + "epoch": 0.2381, + "grad_norm": 0.06059598922729492, + "learning_rate": 1.5866492714314952e-05, + "loss": 0.0334, + "step": 127620 + }, + { + "epoch": 0.23815, + "grad_norm": 0.06248987838625908, + "learning_rate": 1.5862644908878106e-05, + "loss": 0.0338, + "step": 127630 + }, + { + "epoch": 0.2382, + "grad_norm": 0.06059509888291359, + "learning_rate": 1.58587973532531e-05, + "loss": 0.0326, + "step": 127640 + }, + { + "epoch": 0.23825, + "grad_norm": 0.056693557649850845, + "learning_rate": 1.5854950047545165e-05, + "loss": 0.0356, + "step": 127650 + }, + { + "epoch": 0.2383, + "grad_norm": 0.06219499930739403, + "learning_rate": 1.5851102991859437e-05, + "loss": 0.0327, + "step": 127660 + }, + { + "epoch": 0.23835, + "grad_norm": 0.05717054754495621, + "learning_rate": 1.5847256186301135e-05, + "loss": 0.0337, + "step": 127670 + }, + { + "epoch": 0.2384, + "grad_norm": 0.07006093859672546, + "learning_rate": 1.5843409630975394e-05, + "loss": 0.0345, + "step": 127680 + }, + { + "epoch": 0.23845, + "grad_norm": 0.06150520220398903, + "learning_rate": 1.58395633259874e-05, + "loss": 0.0365, + "step": 127690 + }, + { + "epoch": 0.2385, + "grad_norm": 0.06333986669778824, + "learning_rate": 1.5835717271442307e-05, + "loss": 0.0349, + "step": 127700 + }, + { + "epoch": 0.23855, + "grad_norm": 0.05847545713186264, + "learning_rate": 1.583187146744526e-05, + "loss": 0.0355, + "step": 127710 + }, + { + "epoch": 0.2386, + "grad_norm": 0.0587812140583992, + "learning_rate": 1.5828025914101402e-05, + "loss": 0.0345, + "step": 127720 + }, + { + "epoch": 0.23865, + "grad_norm": 0.05848672240972519, + "learning_rate": 1.5824180611515865e-05, + "loss": 0.0333, + "step": 127730 + }, + { + "epoch": 0.2387, + "grad_norm": 0.05444684997200966, + "learning_rate": 1.5820335559793782e-05, + "loss": 0.0348, + "step": 127740 + }, + { + "epoch": 0.23875, + "grad_norm": 0.06529933959245682, + "learning_rate": 1.5816490759040288e-05, + "loss": 0.0345, + "step": 127750 + }, + { + "epoch": 0.2388, + "grad_norm": 0.05455807223916054, + "learning_rate": 1.581264620936047e-05, + "loss": 0.0344, + "step": 127760 + }, + { + "epoch": 0.23885, + "grad_norm": 0.055863041430711746, + "learning_rate": 1.5808801910859468e-05, + "loss": 0.0379, + "step": 127770 + }, + { + "epoch": 0.2389, + "grad_norm": 0.05264187976717949, + "learning_rate": 1.580495786364236e-05, + "loss": 0.0332, + "step": 127780 + }, + { + "epoch": 0.23895, + "grad_norm": 0.05669938027858734, + "learning_rate": 1.580111406781426e-05, + "loss": 0.0332, + "step": 127790 + }, + { + "epoch": 0.239, + "grad_norm": 0.05038554593920708, + "learning_rate": 1.5797270523480236e-05, + "loss": 0.0322, + "step": 127800 + }, + { + "epoch": 0.23905, + "grad_norm": 0.05440721660852432, + "learning_rate": 1.579342723074538e-05, + "loss": 0.0337, + "step": 127810 + }, + { + "epoch": 0.2391, + "grad_norm": 0.05540277063846588, + "learning_rate": 1.578958418971477e-05, + "loss": 0.0337, + "step": 127820 + }, + { + "epoch": 0.23915, + "grad_norm": 0.050372201949357986, + "learning_rate": 1.578574140049346e-05, + "loss": 0.0321, + "step": 127830 + }, + { + "epoch": 0.2392, + "grad_norm": 0.05455819144845009, + "learning_rate": 1.5781898863186526e-05, + "loss": 0.0325, + "step": 127840 + }, + { + "epoch": 0.23925, + "grad_norm": 0.04976142942905426, + "learning_rate": 1.5778056577899003e-05, + "loss": 0.033, + "step": 127850 + }, + { + "epoch": 0.2393, + "grad_norm": 0.055015236139297485, + "learning_rate": 1.5774214544735962e-05, + "loss": 0.0366, + "step": 127860 + }, + { + "epoch": 0.23935, + "grad_norm": 0.05450107902288437, + "learning_rate": 1.577037276380242e-05, + "loss": 0.0337, + "step": 127870 + }, + { + "epoch": 0.2394, + "grad_norm": 0.06425601989030838, + "learning_rate": 1.5766531235203418e-05, + "loss": 0.0337, + "step": 127880 + }, + { + "epoch": 0.23945, + "grad_norm": 0.05643454194068909, + "learning_rate": 1.5762689959043992e-05, + "loss": 0.0323, + "step": 127890 + }, + { + "epoch": 0.2395, + "grad_norm": 0.056932345032691956, + "learning_rate": 1.5758848935429147e-05, + "loss": 0.0337, + "step": 127900 + }, + { + "epoch": 0.23955, + "grad_norm": 0.0619305819272995, + "learning_rate": 1.5755008164463904e-05, + "loss": 0.0333, + "step": 127910 + }, + { + "epoch": 0.2396, + "grad_norm": 0.08095294237136841, + "learning_rate": 1.575116764625326e-05, + "loss": 0.035, + "step": 127920 + }, + { + "epoch": 0.23965, + "grad_norm": 0.09136935323476791, + "learning_rate": 1.574732738090221e-05, + "loss": 0.0342, + "step": 127930 + }, + { + "epoch": 0.2397, + "grad_norm": 0.06716940551996231, + "learning_rate": 1.5743487368515775e-05, + "loss": 0.0343, + "step": 127940 + }, + { + "epoch": 0.23975, + "grad_norm": 0.05612089857459068, + "learning_rate": 1.57396476091989e-05, + "loss": 0.0335, + "step": 127950 + }, + { + "epoch": 0.2398, + "grad_norm": 0.06548678129911423, + "learning_rate": 1.5735808103056592e-05, + "loss": 0.0354, + "step": 127960 + }, + { + "epoch": 0.23985, + "grad_norm": 0.06602530926465988, + "learning_rate": 1.57319688501938e-05, + "loss": 0.0346, + "step": 127970 + }, + { + "epoch": 0.2399, + "grad_norm": 0.06799127161502838, + "learning_rate": 1.5728129850715503e-05, + "loss": 0.0346, + "step": 127980 + }, + { + "epoch": 0.23995, + "grad_norm": 0.06980675458908081, + "learning_rate": 1.5724291104726652e-05, + "loss": 0.0344, + "step": 127990 + }, + { + "epoch": 0.24, + "grad_norm": 0.05383969470858574, + "learning_rate": 1.572045261233219e-05, + "loss": 0.035, + "step": 128000 + }, + { + "epoch": 0.24005, + "grad_norm": 0.06383443623781204, + "learning_rate": 1.5716614373637085e-05, + "loss": 0.0333, + "step": 128010 + }, + { + "epoch": 0.2401, + "grad_norm": 0.0596088282763958, + "learning_rate": 1.5712776388746243e-05, + "loss": 0.0334, + "step": 128020 + }, + { + "epoch": 0.24015, + "grad_norm": 0.08484054356813431, + "learning_rate": 1.570893865776461e-05, + "loss": 0.0351, + "step": 128030 + }, + { + "epoch": 0.2402, + "grad_norm": 0.06879259645938873, + "learning_rate": 1.5705101180797098e-05, + "loss": 0.0331, + "step": 128040 + }, + { + "epoch": 0.24025, + "grad_norm": 0.06585878133773804, + "learning_rate": 1.5701263957948636e-05, + "loss": 0.0335, + "step": 128050 + }, + { + "epoch": 0.2403, + "grad_norm": 0.0568404421210289, + "learning_rate": 1.569742698932411e-05, + "loss": 0.0344, + "step": 128060 + }, + { + "epoch": 0.24035, + "grad_norm": 0.057048458606004715, + "learning_rate": 1.5693590275028445e-05, + "loss": 0.0326, + "step": 128070 + }, + { + "epoch": 0.2404, + "grad_norm": 0.05349932610988617, + "learning_rate": 1.5689753815166526e-05, + "loss": 0.0325, + "step": 128080 + }, + { + "epoch": 0.24045, + "grad_norm": 0.050432320684194565, + "learning_rate": 1.5685917609843236e-05, + "loss": 0.0326, + "step": 128090 + }, + { + "epoch": 0.2405, + "grad_norm": 0.05716854706406593, + "learning_rate": 1.5682081659163467e-05, + "loss": 0.0325, + "step": 128100 + }, + { + "epoch": 0.24055, + "grad_norm": 0.04774298518896103, + "learning_rate": 1.567824596323208e-05, + "loss": 0.0333, + "step": 128110 + }, + { + "epoch": 0.2406, + "grad_norm": 0.058883074671030045, + "learning_rate": 1.567441052215395e-05, + "loss": 0.0349, + "step": 128120 + }, + { + "epoch": 0.24065, + "grad_norm": 0.05763725936412811, + "learning_rate": 1.567057533603393e-05, + "loss": 0.0336, + "step": 128130 + }, + { + "epoch": 0.2407, + "grad_norm": 0.05150453373789787, + "learning_rate": 1.5666740404976864e-05, + "loss": 0.0331, + "step": 128140 + }, + { + "epoch": 0.24075, + "grad_norm": 0.08317604660987854, + "learning_rate": 1.566290572908763e-05, + "loss": 0.0342, + "step": 128150 + }, + { + "epoch": 0.2408, + "grad_norm": 0.058748021721839905, + "learning_rate": 1.565907130847103e-05, + "loss": 0.0339, + "step": 128160 + }, + { + "epoch": 0.24085, + "grad_norm": 0.06249953806400299, + "learning_rate": 1.565523714323192e-05, + "loss": 0.0343, + "step": 128170 + }, + { + "epoch": 0.2409, + "grad_norm": 0.07479379326105118, + "learning_rate": 1.565140323347511e-05, + "loss": 0.0361, + "step": 128180 + }, + { + "epoch": 0.24095, + "grad_norm": 0.0653737485408783, + "learning_rate": 1.564756957930542e-05, + "loss": 0.0328, + "step": 128190 + }, + { + "epoch": 0.241, + "grad_norm": 0.05890415981411934, + "learning_rate": 1.5643736180827676e-05, + "loss": 0.0342, + "step": 128200 + }, + { + "epoch": 0.24105, + "grad_norm": 0.06277629733085632, + "learning_rate": 1.5639903038146665e-05, + "loss": 0.0338, + "step": 128210 + }, + { + "epoch": 0.2411, + "grad_norm": 0.06385780870914459, + "learning_rate": 1.563607015136719e-05, + "loss": 0.0348, + "step": 128220 + }, + { + "epoch": 0.24115, + "grad_norm": 0.05908294767141342, + "learning_rate": 1.5632237520594036e-05, + "loss": 0.0339, + "step": 128230 + }, + { + "epoch": 0.2412, + "grad_norm": 0.05373472720384598, + "learning_rate": 1.562840514593199e-05, + "loss": 0.0335, + "step": 128240 + }, + { + "epoch": 0.24125, + "grad_norm": 0.050608761608600616, + "learning_rate": 1.562457302748582e-05, + "loss": 0.0339, + "step": 128250 + }, + { + "epoch": 0.2413, + "grad_norm": 0.06089496240019798, + "learning_rate": 1.5620741165360303e-05, + "loss": 0.0331, + "step": 128260 + }, + { + "epoch": 0.24135, + "grad_norm": 0.05857257544994354, + "learning_rate": 1.561690955966021e-05, + "loss": 0.0337, + "step": 128270 + }, + { + "epoch": 0.2414, + "grad_norm": 0.06435168534517288, + "learning_rate": 1.5613078210490274e-05, + "loss": 0.033, + "step": 128280 + }, + { + "epoch": 0.24145, + "grad_norm": 0.05921395868062973, + "learning_rate": 1.5609247117955262e-05, + "loss": 0.0329, + "step": 128290 + }, + { + "epoch": 0.2415, + "grad_norm": 0.052547890692949295, + "learning_rate": 1.5605416282159897e-05, + "loss": 0.0327, + "step": 128300 + }, + { + "epoch": 0.24155, + "grad_norm": 0.08207894116640091, + "learning_rate": 1.560158570320893e-05, + "loss": 0.0339, + "step": 128310 + }, + { + "epoch": 0.2416, + "grad_norm": 0.06959673017263412, + "learning_rate": 1.5597755381207075e-05, + "loss": 0.0338, + "step": 128320 + }, + { + "epoch": 0.24165, + "grad_norm": 0.06273122131824493, + "learning_rate": 1.559392531625905e-05, + "loss": 0.0347, + "step": 128330 + }, + { + "epoch": 0.2417, + "grad_norm": 0.0634051263332367, + "learning_rate": 1.5590095508469583e-05, + "loss": 0.0335, + "step": 128340 + }, + { + "epoch": 0.24175, + "grad_norm": 0.06408239901065826, + "learning_rate": 1.5586265957943358e-05, + "loss": 0.0328, + "step": 128350 + }, + { + "epoch": 0.2418, + "grad_norm": 0.06315138936042786, + "learning_rate": 1.5582436664785098e-05, + "loss": 0.0322, + "step": 128360 + }, + { + "epoch": 0.24185, + "grad_norm": 0.07049424201250076, + "learning_rate": 1.557860762909947e-05, + "loss": 0.0329, + "step": 128370 + }, + { + "epoch": 0.2419, + "grad_norm": 0.06051500141620636, + "learning_rate": 1.557477885099117e-05, + "loss": 0.0325, + "step": 128380 + }, + { + "epoch": 0.24195, + "grad_norm": 0.05313899368047714, + "learning_rate": 1.5570950330564888e-05, + "loss": 0.0327, + "step": 128390 + }, + { + "epoch": 0.242, + "grad_norm": 0.05960242077708244, + "learning_rate": 1.5567122067925272e-05, + "loss": 0.0336, + "step": 128400 + }, + { + "epoch": 0.24205, + "grad_norm": 0.07312949746847153, + "learning_rate": 1.5563294063177004e-05, + "loss": 0.0333, + "step": 128410 + }, + { + "epoch": 0.2421, + "grad_norm": 0.06439289450645447, + "learning_rate": 1.555946631642472e-05, + "loss": 0.0344, + "step": 128420 + }, + { + "epoch": 0.24215, + "grad_norm": 0.06231061741709709, + "learning_rate": 1.555563882777309e-05, + "loss": 0.0322, + "step": 128430 + }, + { + "epoch": 0.2422, + "grad_norm": 0.07115618884563446, + "learning_rate": 1.555181159732674e-05, + "loss": 0.0327, + "step": 128440 + }, + { + "epoch": 0.24225, + "grad_norm": 0.05875179171562195, + "learning_rate": 1.5547984625190303e-05, + "loss": 0.032, + "step": 128450 + }, + { + "epoch": 0.2423, + "grad_norm": 0.07768935710191727, + "learning_rate": 1.5544157911468433e-05, + "loss": 0.0342, + "step": 128460 + }, + { + "epoch": 0.24235, + "grad_norm": 0.07130517065525055, + "learning_rate": 1.554033145626572e-05, + "loss": 0.0339, + "step": 128470 + }, + { + "epoch": 0.2424, + "grad_norm": 0.07714162021875381, + "learning_rate": 1.55365052596868e-05, + "loss": 0.0334, + "step": 128480 + }, + { + "epoch": 0.24245, + "grad_norm": 0.05913243442773819, + "learning_rate": 1.5532679321836264e-05, + "loss": 0.0319, + "step": 128490 + }, + { + "epoch": 0.2425, + "grad_norm": 0.06145475059747696, + "learning_rate": 1.5528853642818726e-05, + "loss": 0.034, + "step": 128500 + }, + { + "epoch": 0.24255, + "grad_norm": 0.07546162605285645, + "learning_rate": 1.5525028222738763e-05, + "loss": 0.0333, + "step": 128510 + }, + { + "epoch": 0.2426, + "grad_norm": 0.06005942448973656, + "learning_rate": 1.5521203061700975e-05, + "loss": 0.0326, + "step": 128520 + }, + { + "epoch": 0.24265, + "grad_norm": 0.06266207247972488, + "learning_rate": 1.5517378159809935e-05, + "loss": 0.0326, + "step": 128530 + }, + { + "epoch": 0.2427, + "grad_norm": 0.05349930003285408, + "learning_rate": 1.551355351717021e-05, + "loss": 0.0327, + "step": 128540 + }, + { + "epoch": 0.24275, + "grad_norm": 0.06674955040216446, + "learning_rate": 1.550972913388637e-05, + "loss": 0.0333, + "step": 128550 + }, + { + "epoch": 0.2428, + "grad_norm": 0.06313853710889816, + "learning_rate": 1.5505905010062962e-05, + "loss": 0.0363, + "step": 128560 + }, + { + "epoch": 0.24285, + "grad_norm": 0.05368867889046669, + "learning_rate": 1.550208114580455e-05, + "loss": 0.0326, + "step": 128570 + }, + { + "epoch": 0.2429, + "grad_norm": 0.06399517506361008, + "learning_rate": 1.549825754121568e-05, + "loss": 0.0334, + "step": 128580 + }, + { + "epoch": 0.24295, + "grad_norm": 0.06687916070222855, + "learning_rate": 1.5494434196400864e-05, + "loss": 0.0369, + "step": 128590 + }, + { + "epoch": 0.243, + "grad_norm": 0.05454210191965103, + "learning_rate": 1.5490611111464657e-05, + "loss": 0.0341, + "step": 128600 + }, + { + "epoch": 0.24305, + "grad_norm": 0.07353127002716064, + "learning_rate": 1.5486788286511567e-05, + "loss": 0.0357, + "step": 128610 + }, + { + "epoch": 0.2431, + "grad_norm": 0.058172594755887985, + "learning_rate": 1.5482965721646113e-05, + "loss": 0.034, + "step": 128620 + }, + { + "epoch": 0.24315, + "grad_norm": 0.05430532246828079, + "learning_rate": 1.5479143416972795e-05, + "loss": 0.0334, + "step": 128630 + }, + { + "epoch": 0.2432, + "grad_norm": 0.04920189082622528, + "learning_rate": 1.5475321372596117e-05, + "loss": 0.0331, + "step": 128640 + }, + { + "epoch": 0.24325, + "grad_norm": 0.060522664338350296, + "learning_rate": 1.5471499588620593e-05, + "loss": 0.0346, + "step": 128650 + }, + { + "epoch": 0.2433, + "grad_norm": 0.058444004505872726, + "learning_rate": 1.5467678065150668e-05, + "loss": 0.0344, + "step": 128660 + }, + { + "epoch": 0.24335, + "grad_norm": 0.07578611373901367, + "learning_rate": 1.546385680229086e-05, + "loss": 0.0348, + "step": 128670 + }, + { + "epoch": 0.2434, + "grad_norm": 0.05158187448978424, + "learning_rate": 1.546003580014561e-05, + "loss": 0.0328, + "step": 128680 + }, + { + "epoch": 0.24345, + "grad_norm": 0.05328892916440964, + "learning_rate": 1.5456215058819412e-05, + "loss": 0.034, + "step": 128690 + }, + { + "epoch": 0.2435, + "grad_norm": 0.04662049934267998, + "learning_rate": 1.5452394578416697e-05, + "loss": 0.033, + "step": 128700 + }, + { + "epoch": 0.24355, + "grad_norm": 0.08248545974493027, + "learning_rate": 1.5448574359041934e-05, + "loss": 0.0334, + "step": 128710 + }, + { + "epoch": 0.2436, + "grad_norm": 0.07601383328437805, + "learning_rate": 1.544475440079956e-05, + "loss": 0.0336, + "step": 128720 + }, + { + "epoch": 0.24365, + "grad_norm": 0.082184799015522, + "learning_rate": 1.5440934703794007e-05, + "loss": 0.0333, + "step": 128730 + }, + { + "epoch": 0.2437, + "grad_norm": 0.0712849572300911, + "learning_rate": 1.5437115268129715e-05, + "loss": 0.0331, + "step": 128740 + }, + { + "epoch": 0.24375, + "grad_norm": 0.0680183470249176, + "learning_rate": 1.543329609391109e-05, + "loss": 0.033, + "step": 128750 + }, + { + "epoch": 0.2438, + "grad_norm": 0.06785959750413895, + "learning_rate": 1.5429477181242552e-05, + "loss": 0.0335, + "step": 128760 + }, + { + "epoch": 0.24385, + "grad_norm": 0.06210260093212128, + "learning_rate": 1.5425658530228522e-05, + "loss": 0.0356, + "step": 128770 + }, + { + "epoch": 0.2439, + "grad_norm": 0.06129412353038788, + "learning_rate": 1.5421840140973385e-05, + "loss": 0.0351, + "step": 128780 + }, + { + "epoch": 0.24395, + "grad_norm": 0.061416443437337875, + "learning_rate": 1.541802201358155e-05, + "loss": 0.034, + "step": 128790 + }, + { + "epoch": 0.244, + "grad_norm": 0.07154708355665207, + "learning_rate": 1.5414204148157385e-05, + "loss": 0.0325, + "step": 128800 + }, + { + "epoch": 0.24405, + "grad_norm": 0.06849026679992676, + "learning_rate": 1.5410386544805282e-05, + "loss": 0.0341, + "step": 128810 + }, + { + "epoch": 0.2441, + "grad_norm": 0.06167871132493019, + "learning_rate": 1.5406569203629605e-05, + "loss": 0.0333, + "step": 128820 + }, + { + "epoch": 0.24415, + "grad_norm": 0.06029805541038513, + "learning_rate": 1.5402752124734722e-05, + "loss": 0.0329, + "step": 128830 + }, + { + "epoch": 0.2442, + "grad_norm": 0.06149749830365181, + "learning_rate": 1.5398935308224995e-05, + "loss": 0.0329, + "step": 128840 + }, + { + "epoch": 0.24425, + "grad_norm": 0.060887884348630905, + "learning_rate": 1.539511875420476e-05, + "loss": 0.0327, + "step": 128850 + }, + { + "epoch": 0.2443, + "grad_norm": 0.06271271407604218, + "learning_rate": 1.5391302462778384e-05, + "loss": 0.0338, + "step": 128860 + }, + { + "epoch": 0.24435, + "grad_norm": 0.050711505115032196, + "learning_rate": 1.5387486434050175e-05, + "loss": 0.0328, + "step": 128870 + }, + { + "epoch": 0.2444, + "grad_norm": 0.05109580606222153, + "learning_rate": 1.538367066812449e-05, + "loss": 0.0327, + "step": 128880 + }, + { + "epoch": 0.24445, + "grad_norm": 0.048492636531591415, + "learning_rate": 1.537985516510562e-05, + "loss": 0.0331, + "step": 128890 + }, + { + "epoch": 0.2445, + "grad_norm": 0.06411636620759964, + "learning_rate": 1.5376039925097902e-05, + "loss": 0.0337, + "step": 128900 + }, + { + "epoch": 0.24455, + "grad_norm": 0.0920451357960701, + "learning_rate": 1.537222494820564e-05, + "loss": 0.034, + "step": 128910 + }, + { + "epoch": 0.2446, + "grad_norm": 0.05714166909456253, + "learning_rate": 1.5368410234533127e-05, + "loss": 0.0331, + "step": 128920 + }, + { + "epoch": 0.24465, + "grad_norm": 0.059067945927381516, + "learning_rate": 1.5364595784184666e-05, + "loss": 0.0332, + "step": 128930 + }, + { + "epoch": 0.2447, + "grad_norm": 0.08221178501844406, + "learning_rate": 1.536078159726453e-05, + "loss": 0.0346, + "step": 128940 + }, + { + "epoch": 0.24475, + "grad_norm": 0.07259626686573029, + "learning_rate": 1.5356967673877e-05, + "loss": 0.0324, + "step": 128950 + }, + { + "epoch": 0.2448, + "grad_norm": 0.0738881304860115, + "learning_rate": 1.5353154014126363e-05, + "loss": 0.0327, + "step": 128960 + }, + { + "epoch": 0.24485, + "grad_norm": 0.05740315094590187, + "learning_rate": 1.5349340618116857e-05, + "loss": 0.0323, + "step": 128970 + }, + { + "epoch": 0.2449, + "grad_norm": 0.054173581302165985, + "learning_rate": 1.5345527485952768e-05, + "loss": 0.0335, + "step": 128980 + }, + { + "epoch": 0.24495, + "grad_norm": 0.05699336901307106, + "learning_rate": 1.5341714617738324e-05, + "loss": 0.0338, + "step": 128990 + }, + { + "epoch": 0.245, + "grad_norm": 0.05243121460080147, + "learning_rate": 1.5337902013577775e-05, + "loss": 0.0348, + "step": 129000 + }, + { + "epoch": 0.24505, + "grad_norm": 0.054811857640743256, + "learning_rate": 1.533408967357535e-05, + "loss": 0.0338, + "step": 129010 + }, + { + "epoch": 0.2451, + "grad_norm": 0.06116772070527077, + "learning_rate": 1.5330277597835287e-05, + "loss": 0.0343, + "step": 129020 + }, + { + "epoch": 0.24515, + "grad_norm": 0.05350358039140701, + "learning_rate": 1.53264657864618e-05, + "loss": 0.0354, + "step": 129030 + }, + { + "epoch": 0.2452, + "grad_norm": 0.053333185613155365, + "learning_rate": 1.5322654239559104e-05, + "loss": 0.0367, + "step": 129040 + }, + { + "epoch": 0.24525, + "grad_norm": 0.06097253039479256, + "learning_rate": 1.531884295723141e-05, + "loss": 0.0332, + "step": 129050 + }, + { + "epoch": 0.2453, + "grad_norm": 0.06263768672943115, + "learning_rate": 1.53150319395829e-05, + "loss": 0.0347, + "step": 129060 + }, + { + "epoch": 0.24535, + "grad_norm": 0.07106564193964005, + "learning_rate": 1.53112211867178e-05, + "loss": 0.0354, + "step": 129070 + }, + { + "epoch": 0.2454, + "grad_norm": 0.06520446389913559, + "learning_rate": 1.530741069874025e-05, + "loss": 0.0341, + "step": 129080 + }, + { + "epoch": 0.24545, + "grad_norm": 0.05361152067780495, + "learning_rate": 1.530360047575446e-05, + "loss": 0.0379, + "step": 129090 + }, + { + "epoch": 0.2455, + "grad_norm": 0.0815877839922905, + "learning_rate": 1.5299790517864592e-05, + "loss": 0.0365, + "step": 129100 + }, + { + "epoch": 0.24555, + "grad_norm": 0.0634656473994255, + "learning_rate": 1.5295980825174804e-05, + "loss": 0.0341, + "step": 129110 + }, + { + "epoch": 0.2456, + "grad_norm": 0.05546913295984268, + "learning_rate": 1.529217139778926e-05, + "loss": 0.0343, + "step": 129120 + }, + { + "epoch": 0.24565, + "grad_norm": 0.07225493341684341, + "learning_rate": 1.5288362235812096e-05, + "loss": 0.0342, + "step": 129130 + }, + { + "epoch": 0.2457, + "grad_norm": 0.06344477832317352, + "learning_rate": 1.5284553339347458e-05, + "loss": 0.0344, + "step": 129140 + }, + { + "epoch": 0.24575, + "grad_norm": 0.08348975330591202, + "learning_rate": 1.5280744708499494e-05, + "loss": 0.0347, + "step": 129150 + }, + { + "epoch": 0.2458, + "grad_norm": 0.06194521114230156, + "learning_rate": 1.5276936343372304e-05, + "loss": 0.0341, + "step": 129160 + }, + { + "epoch": 0.24585, + "grad_norm": 0.06985343992710114, + "learning_rate": 1.5273128244070034e-05, + "loss": 0.0355, + "step": 129170 + }, + { + "epoch": 0.2459, + "grad_norm": 0.06529099494218826, + "learning_rate": 1.5269320410696773e-05, + "loss": 0.0339, + "step": 129180 + }, + { + "epoch": 0.24595, + "grad_norm": 0.05931119620800018, + "learning_rate": 1.5265512843356646e-05, + "loss": 0.0332, + "step": 129190 + }, + { + "epoch": 0.246, + "grad_norm": 0.06843093782663345, + "learning_rate": 1.526170554215373e-05, + "loss": 0.0343, + "step": 129200 + }, + { + "epoch": 0.24605, + "grad_norm": 0.062442880123853683, + "learning_rate": 1.525789850719213e-05, + "loss": 0.0331, + "step": 129210 + }, + { + "epoch": 0.2461, + "grad_norm": 0.06340550631284714, + "learning_rate": 1.5254091738575932e-05, + "loss": 0.0338, + "step": 129220 + }, + { + "epoch": 0.24615, + "grad_norm": 0.07777202129364014, + "learning_rate": 1.5250285236409199e-05, + "loss": 0.0349, + "step": 129230 + }, + { + "epoch": 0.2462, + "grad_norm": 0.07286890596151352, + "learning_rate": 1.524647900079601e-05, + "loss": 0.0353, + "step": 129240 + }, + { + "epoch": 0.24625, + "grad_norm": 0.0640026405453682, + "learning_rate": 1.5242673031840412e-05, + "loss": 0.0332, + "step": 129250 + }, + { + "epoch": 0.2463, + "grad_norm": 0.06555936485528946, + "learning_rate": 1.5238867329646479e-05, + "loss": 0.0335, + "step": 129260 + }, + { + "epoch": 0.24635, + "grad_norm": 0.06518343091011047, + "learning_rate": 1.5235061894318229e-05, + "loss": 0.0332, + "step": 129270 + }, + { + "epoch": 0.2464, + "grad_norm": 0.05131986364722252, + "learning_rate": 1.5231256725959725e-05, + "loss": 0.0333, + "step": 129280 + }, + { + "epoch": 0.24645, + "grad_norm": 0.062407419085502625, + "learning_rate": 1.5227451824674998e-05, + "loss": 0.0346, + "step": 129290 + }, + { + "epoch": 0.2465, + "grad_norm": 0.05796368047595024, + "learning_rate": 1.5223647190568064e-05, + "loss": 0.0321, + "step": 129300 + }, + { + "epoch": 0.24655, + "grad_norm": 0.06182119622826576, + "learning_rate": 1.5219842823742947e-05, + "loss": 0.0336, + "step": 129310 + }, + { + "epoch": 0.2466, + "grad_norm": 0.05594813823699951, + "learning_rate": 1.5216038724303647e-05, + "loss": 0.033, + "step": 129320 + }, + { + "epoch": 0.24665, + "grad_norm": 0.05082995444536209, + "learning_rate": 1.5212234892354176e-05, + "loss": 0.033, + "step": 129330 + }, + { + "epoch": 0.2467, + "grad_norm": 0.07302447408437729, + "learning_rate": 1.5208431327998523e-05, + "loss": 0.0348, + "step": 129340 + }, + { + "epoch": 0.24675, + "grad_norm": 0.09341346472501755, + "learning_rate": 1.5204628031340676e-05, + "loss": 0.0346, + "step": 129350 + }, + { + "epoch": 0.2468, + "grad_norm": 0.0673801451921463, + "learning_rate": 1.520082500248463e-05, + "loss": 0.0332, + "step": 129360 + }, + { + "epoch": 0.24685, + "grad_norm": 0.06581735610961914, + "learning_rate": 1.5197022241534337e-05, + "loss": 0.033, + "step": 129370 + }, + { + "epoch": 0.2469, + "grad_norm": 0.053973015397787094, + "learning_rate": 1.5193219748593784e-05, + "loss": 0.0324, + "step": 129380 + }, + { + "epoch": 0.24695, + "grad_norm": 0.06393645703792572, + "learning_rate": 1.5189417523766903e-05, + "loss": 0.0344, + "step": 129390 + }, + { + "epoch": 0.247, + "grad_norm": 0.05542540177702904, + "learning_rate": 1.5185615567157668e-05, + "loss": 0.0328, + "step": 129400 + }, + { + "epoch": 0.24705, + "grad_norm": 0.061920009553432465, + "learning_rate": 1.5181813878870022e-05, + "loss": 0.0389, + "step": 129410 + }, + { + "epoch": 0.2471, + "grad_norm": 0.05536089465022087, + "learning_rate": 1.517801245900789e-05, + "loss": 0.0339, + "step": 129420 + }, + { + "epoch": 0.24715, + "grad_norm": 0.06831495463848114, + "learning_rate": 1.5174211307675212e-05, + "loss": 0.0331, + "step": 129430 + }, + { + "epoch": 0.2472, + "grad_norm": 0.07440666854381561, + "learning_rate": 1.51704104249759e-05, + "loss": 0.0334, + "step": 129440 + }, + { + "epoch": 0.24725, + "grad_norm": 0.073429174721241, + "learning_rate": 1.5166609811013882e-05, + "loss": 0.0335, + "step": 129450 + }, + { + "epoch": 0.2473, + "grad_norm": 0.05541494861245155, + "learning_rate": 1.5162809465893052e-05, + "loss": 0.0329, + "step": 129460 + }, + { + "epoch": 0.24735, + "grad_norm": 0.05493704602122307, + "learning_rate": 1.5159009389717307e-05, + "loss": 0.0325, + "step": 129470 + }, + { + "epoch": 0.2474, + "grad_norm": 0.052072275429964066, + "learning_rate": 1.5155209582590562e-05, + "loss": 0.0329, + "step": 129480 + }, + { + "epoch": 0.24745, + "grad_norm": 0.0666222795844078, + "learning_rate": 1.5151410044616682e-05, + "loss": 0.0353, + "step": 129490 + }, + { + "epoch": 0.2475, + "grad_norm": 0.051309410482645035, + "learning_rate": 1.5147610775899557e-05, + "loss": 0.0331, + "step": 129500 + }, + { + "epoch": 0.24755, + "grad_norm": 0.05242369323968887, + "learning_rate": 1.5143811776543044e-05, + "loss": 0.0323, + "step": 129510 + }, + { + "epoch": 0.2476, + "grad_norm": 0.05651107430458069, + "learning_rate": 1.5140013046651022e-05, + "loss": 0.0338, + "step": 129520 + }, + { + "epoch": 0.24765, + "grad_norm": 0.06189333274960518, + "learning_rate": 1.5136214586327335e-05, + "loss": 0.0341, + "step": 129530 + }, + { + "epoch": 0.2477, + "grad_norm": 0.0606662854552269, + "learning_rate": 1.5132416395675834e-05, + "loss": 0.0374, + "step": 129540 + }, + { + "epoch": 0.24775, + "grad_norm": 0.05297665670514107, + "learning_rate": 1.5128618474800365e-05, + "loss": 0.0346, + "step": 129550 + }, + { + "epoch": 0.2478, + "grad_norm": 0.053852248936891556, + "learning_rate": 1.5124820823804754e-05, + "loss": 0.0333, + "step": 129560 + }, + { + "epoch": 0.24785, + "grad_norm": 0.06997379660606384, + "learning_rate": 1.5121023442792842e-05, + "loss": 0.0336, + "step": 129570 + }, + { + "epoch": 0.2479, + "grad_norm": 0.05877318233251572, + "learning_rate": 1.5117226331868423e-05, + "loss": 0.034, + "step": 129580 + }, + { + "epoch": 0.24795, + "grad_norm": 0.050678376108407974, + "learning_rate": 1.5113429491135328e-05, + "loss": 0.0338, + "step": 129590 + }, + { + "epoch": 0.248, + "grad_norm": 0.06394968181848526, + "learning_rate": 1.5109632920697364e-05, + "loss": 0.0338, + "step": 129600 + }, + { + "epoch": 0.24805, + "grad_norm": 0.06775014102458954, + "learning_rate": 1.5105836620658315e-05, + "loss": 0.033, + "step": 129610 + }, + { + "epoch": 0.2481, + "grad_norm": 0.062414124608039856, + "learning_rate": 1.510204059112198e-05, + "loss": 0.035, + "step": 129620 + }, + { + "epoch": 0.24815, + "grad_norm": 0.08363818377256393, + "learning_rate": 1.509824483219213e-05, + "loss": 0.035, + "step": 129630 + }, + { + "epoch": 0.2482, + "grad_norm": 0.08240839838981628, + "learning_rate": 1.5094449343972553e-05, + "loss": 0.0342, + "step": 129640 + }, + { + "epoch": 0.24825, + "grad_norm": 0.06142202019691467, + "learning_rate": 1.5090654126567006e-05, + "loss": 0.0351, + "step": 129650 + }, + { + "epoch": 0.2483, + "grad_norm": 0.064139723777771, + "learning_rate": 1.5086859180079244e-05, + "loss": 0.0345, + "step": 129660 + }, + { + "epoch": 0.24835, + "grad_norm": 0.06331875175237656, + "learning_rate": 1.5083064504613042e-05, + "loss": 0.034, + "step": 129670 + }, + { + "epoch": 0.2484, + "grad_norm": 0.055469829589128494, + "learning_rate": 1.5079270100272119e-05, + "loss": 0.0349, + "step": 129680 + }, + { + "epoch": 0.24845, + "grad_norm": 0.07021180540323257, + "learning_rate": 1.5075475967160235e-05, + "loss": 0.0348, + "step": 129690 + }, + { + "epoch": 0.2485, + "grad_norm": 0.09408631920814514, + "learning_rate": 1.5071682105381101e-05, + "loss": 0.0336, + "step": 129700 + }, + { + "epoch": 0.24855, + "grad_norm": 0.0889461413025856, + "learning_rate": 1.5067888515038459e-05, + "loss": 0.0333, + "step": 129710 + }, + { + "epoch": 0.2486, + "grad_norm": 0.05832533910870552, + "learning_rate": 1.5064095196236006e-05, + "loss": 0.0348, + "step": 129720 + }, + { + "epoch": 0.24865, + "grad_norm": 0.06746535748243332, + "learning_rate": 1.5060302149077454e-05, + "loss": 0.0356, + "step": 129730 + }, + { + "epoch": 0.2487, + "grad_norm": 0.06575343012809753, + "learning_rate": 1.5056509373666516e-05, + "loss": 0.0351, + "step": 129740 + }, + { + "epoch": 0.24875, + "grad_norm": 0.056073229759931564, + "learning_rate": 1.505271687010687e-05, + "loss": 0.0354, + "step": 129750 + }, + { + "epoch": 0.2488, + "grad_norm": 0.05148507282137871, + "learning_rate": 1.5048924638502216e-05, + "loss": 0.0323, + "step": 129760 + }, + { + "epoch": 0.24885, + "grad_norm": 0.05382775515317917, + "learning_rate": 1.5045132678956208e-05, + "loss": 0.0326, + "step": 129770 + }, + { + "epoch": 0.2489, + "grad_norm": 0.050694435834884644, + "learning_rate": 1.5041340991572542e-05, + "loss": 0.0326, + "step": 129780 + }, + { + "epoch": 0.24895, + "grad_norm": 0.06192651763558388, + "learning_rate": 1.5037549576454874e-05, + "loss": 0.0333, + "step": 129790 + }, + { + "epoch": 0.249, + "grad_norm": 0.06613980233669281, + "learning_rate": 1.5033758433706858e-05, + "loss": 0.034, + "step": 129800 + }, + { + "epoch": 0.24905, + "grad_norm": 0.0578388012945652, + "learning_rate": 1.502996756343214e-05, + "loss": 0.0335, + "step": 129810 + }, + { + "epoch": 0.2491, + "grad_norm": 0.056715864688158035, + "learning_rate": 1.5026176965734362e-05, + "loss": 0.0334, + "step": 129820 + }, + { + "epoch": 0.24915, + "grad_norm": 0.05182795599102974, + "learning_rate": 1.5022386640717165e-05, + "loss": 0.0341, + "step": 129830 + }, + { + "epoch": 0.2492, + "grad_norm": 0.05253973975777626, + "learning_rate": 1.5018596588484163e-05, + "loss": 0.0324, + "step": 129840 + }, + { + "epoch": 0.24925, + "grad_norm": 0.05492791533470154, + "learning_rate": 1.5014806809138975e-05, + "loss": 0.0351, + "step": 129850 + }, + { + "epoch": 0.2493, + "grad_norm": 0.05340440198779106, + "learning_rate": 1.5011017302785233e-05, + "loss": 0.0328, + "step": 129860 + }, + { + "epoch": 0.24935, + "grad_norm": 0.05057377368211746, + "learning_rate": 1.5007228069526508e-05, + "loss": 0.033, + "step": 129870 + }, + { + "epoch": 0.2494, + "grad_norm": 0.05610264837741852, + "learning_rate": 1.5003439109466433e-05, + "loss": 0.0324, + "step": 129880 + }, + { + "epoch": 0.24945, + "grad_norm": 0.04865795373916626, + "learning_rate": 1.4999650422708558e-05, + "loss": 0.033, + "step": 129890 + }, + { + "epoch": 0.2495, + "grad_norm": 0.0512232780456543, + "learning_rate": 1.4995862009356496e-05, + "loss": 0.0327, + "step": 129900 + }, + { + "epoch": 0.24955, + "grad_norm": 0.04978105425834656, + "learning_rate": 1.49920738695138e-05, + "loss": 0.0321, + "step": 129910 + }, + { + "epoch": 0.2496, + "grad_norm": 0.056026410311460495, + "learning_rate": 1.4988286003284047e-05, + "loss": 0.0339, + "step": 129920 + }, + { + "epoch": 0.24965, + "grad_norm": 0.05365905165672302, + "learning_rate": 1.4984498410770801e-05, + "loss": 0.0345, + "step": 129930 + }, + { + "epoch": 0.2497, + "grad_norm": 0.06317032873630524, + "learning_rate": 1.4980711092077598e-05, + "loss": 0.0334, + "step": 129940 + }, + { + "epoch": 0.24975, + "grad_norm": 0.04971380531787872, + "learning_rate": 1.4976924047307994e-05, + "loss": 0.0366, + "step": 129950 + }, + { + "epoch": 0.2498, + "grad_norm": 0.0540299117565155, + "learning_rate": 1.4973137276565519e-05, + "loss": 0.0336, + "step": 129960 + }, + { + "epoch": 0.24985, + "grad_norm": 0.058525823056697845, + "learning_rate": 1.4969350779953695e-05, + "loss": 0.0365, + "step": 129970 + }, + { + "epoch": 0.2499, + "grad_norm": 0.07276928424835205, + "learning_rate": 1.4965564557576064e-05, + "loss": 0.0346, + "step": 129980 + }, + { + "epoch": 0.24995, + "grad_norm": 0.0586562342941761, + "learning_rate": 1.4961778609536123e-05, + "loss": 0.0339, + "step": 129990 + }, + { + "epoch": 0.25, + "grad_norm": 0.0669587254524231, + "learning_rate": 1.495799293593739e-05, + "loss": 0.035, + "step": 130000 + }, + { + "epoch": 0.25005, + "grad_norm": 0.06829706579446793, + "learning_rate": 1.4954207536883352e-05, + "loss": 0.0353, + "step": 130010 + }, + { + "epoch": 0.2501, + "grad_norm": 0.055767159909009933, + "learning_rate": 1.495042241247751e-05, + "loss": 0.0363, + "step": 130020 + }, + { + "epoch": 0.25015, + "grad_norm": 0.059968505054712296, + "learning_rate": 1.494663756282334e-05, + "loss": 0.0329, + "step": 130030 + }, + { + "epoch": 0.2502, + "grad_norm": 0.06776098906993866, + "learning_rate": 1.494285298802432e-05, + "loss": 0.0338, + "step": 130040 + }, + { + "epoch": 0.25025, + "grad_norm": 0.07966256886720657, + "learning_rate": 1.4939068688183927e-05, + "loss": 0.0348, + "step": 130050 + }, + { + "epoch": 0.2503, + "grad_norm": 0.07551705837249756, + "learning_rate": 1.4935284663405608e-05, + "loss": 0.0371, + "step": 130060 + }, + { + "epoch": 0.25035, + "grad_norm": 0.06012747809290886, + "learning_rate": 1.493150091379284e-05, + "loss": 0.0356, + "step": 130070 + }, + { + "epoch": 0.2504, + "grad_norm": 0.053942855447530746, + "learning_rate": 1.4927717439449036e-05, + "loss": 0.0353, + "step": 130080 + }, + { + "epoch": 0.25045, + "grad_norm": 0.06574242562055588, + "learning_rate": 1.4923934240477672e-05, + "loss": 0.0343, + "step": 130090 + }, + { + "epoch": 0.2505, + "grad_norm": 0.06035936623811722, + "learning_rate": 1.4920151316982146e-05, + "loss": 0.0345, + "step": 130100 + }, + { + "epoch": 0.25055, + "grad_norm": 0.05731990560889244, + "learning_rate": 1.4916368669065895e-05, + "loss": 0.0341, + "step": 130110 + }, + { + "epoch": 0.2506, + "grad_norm": 0.05266165733337402, + "learning_rate": 1.4912586296832348e-05, + "loss": 0.0327, + "step": 130120 + }, + { + "epoch": 0.25065, + "grad_norm": 0.058082010596990585, + "learning_rate": 1.4908804200384893e-05, + "loss": 0.034, + "step": 130130 + }, + { + "epoch": 0.2507, + "grad_norm": 0.06456492096185684, + "learning_rate": 1.4905022379826947e-05, + "loss": 0.0347, + "step": 130140 + }, + { + "epoch": 0.25075, + "grad_norm": 0.06144573166966438, + "learning_rate": 1.4901240835261893e-05, + "loss": 0.0344, + "step": 130150 + }, + { + "epoch": 0.2508, + "grad_norm": 0.06580778956413269, + "learning_rate": 1.4897459566793112e-05, + "loss": 0.0347, + "step": 130160 + }, + { + "epoch": 0.25085, + "grad_norm": 0.06285262852907181, + "learning_rate": 1.4893678574524009e-05, + "loss": 0.0344, + "step": 130170 + }, + { + "epoch": 0.2509, + "grad_norm": 0.06998418271541595, + "learning_rate": 1.4889897858557921e-05, + "loss": 0.0362, + "step": 130180 + }, + { + "epoch": 0.25095, + "grad_norm": 0.061762843281030655, + "learning_rate": 1.4886117418998235e-05, + "loss": 0.0335, + "step": 130190 + }, + { + "epoch": 0.251, + "grad_norm": 0.05425383895635605, + "learning_rate": 1.4882337255948297e-05, + "loss": 0.035, + "step": 130200 + }, + { + "epoch": 0.25105, + "grad_norm": 0.10530871897935867, + "learning_rate": 1.4878557369511465e-05, + "loss": 0.0374, + "step": 130210 + }, + { + "epoch": 0.2511, + "grad_norm": 0.09325791895389557, + "learning_rate": 1.4874777759791065e-05, + "loss": 0.0347, + "step": 130220 + }, + { + "epoch": 0.25115, + "grad_norm": 0.07867307960987091, + "learning_rate": 1.4870998426890435e-05, + "loss": 0.0335, + "step": 130230 + }, + { + "epoch": 0.2512, + "grad_norm": 0.06272491812705994, + "learning_rate": 1.4867219370912908e-05, + "loss": 0.0337, + "step": 130240 + }, + { + "epoch": 0.25125, + "grad_norm": 0.06531760096549988, + "learning_rate": 1.4863440591961791e-05, + "loss": 0.033, + "step": 130250 + }, + { + "epoch": 0.2513, + "grad_norm": 0.05865396186709404, + "learning_rate": 1.4859662090140408e-05, + "loss": 0.0337, + "step": 130260 + }, + { + "epoch": 0.25135, + "grad_norm": 0.05396764725446701, + "learning_rate": 1.4855883865552042e-05, + "loss": 0.0333, + "step": 130270 + }, + { + "epoch": 0.2514, + "grad_norm": 0.0740916058421135, + "learning_rate": 1.4852105918300016e-05, + "loss": 0.0356, + "step": 130280 + }, + { + "epoch": 0.25145, + "grad_norm": 0.06599223613739014, + "learning_rate": 1.4848328248487586e-05, + "loss": 0.0361, + "step": 130290 + }, + { + "epoch": 0.2515, + "grad_norm": 0.053400177508592606, + "learning_rate": 1.4844550856218054e-05, + "loss": 0.0329, + "step": 130300 + }, + { + "epoch": 0.25155, + "grad_norm": 0.05594543740153313, + "learning_rate": 1.484077374159469e-05, + "loss": 0.033, + "step": 130310 + }, + { + "epoch": 0.2516, + "grad_norm": 0.06213982775807381, + "learning_rate": 1.483699690472075e-05, + "loss": 0.034, + "step": 130320 + }, + { + "epoch": 0.25165, + "grad_norm": 0.052713699638843536, + "learning_rate": 1.4833220345699506e-05, + "loss": 0.034, + "step": 130330 + }, + { + "epoch": 0.2517, + "grad_norm": 0.05229334905743599, + "learning_rate": 1.4829444064634187e-05, + "loss": 0.0334, + "step": 130340 + }, + { + "epoch": 0.25175, + "grad_norm": 0.061390116810798645, + "learning_rate": 1.4825668061628046e-05, + "loss": 0.0337, + "step": 130350 + }, + { + "epoch": 0.2518, + "grad_norm": 0.06338523328304291, + "learning_rate": 1.4821892336784333e-05, + "loss": 0.0348, + "step": 130360 + }, + { + "epoch": 0.25185, + "grad_norm": 0.055219992995262146, + "learning_rate": 1.481811689020624e-05, + "loss": 0.0332, + "step": 130370 + }, + { + "epoch": 0.2519, + "grad_norm": 0.06830650568008423, + "learning_rate": 1.4814341721997024e-05, + "loss": 0.0343, + "step": 130380 + }, + { + "epoch": 0.25195, + "grad_norm": 0.06185084208846092, + "learning_rate": 1.4810566832259865e-05, + "loss": 0.0349, + "step": 130390 + }, + { + "epoch": 0.252, + "grad_norm": 0.05709722265601158, + "learning_rate": 1.4806792221097986e-05, + "loss": 0.0337, + "step": 130400 + }, + { + "epoch": 0.25205, + "grad_norm": 0.07840575277805328, + "learning_rate": 1.480301788861458e-05, + "loss": 0.0341, + "step": 130410 + }, + { + "epoch": 0.2521, + "grad_norm": 0.06489013135433197, + "learning_rate": 1.4799243834912829e-05, + "loss": 0.0349, + "step": 130420 + }, + { + "epoch": 0.25215, + "grad_norm": 0.0650085061788559, + "learning_rate": 1.4795470060095928e-05, + "loss": 0.0348, + "step": 130430 + }, + { + "epoch": 0.2522, + "grad_norm": 0.06238774210214615, + "learning_rate": 1.4791696564267036e-05, + "loss": 0.0328, + "step": 130440 + }, + { + "epoch": 0.25225, + "grad_norm": 0.05862411484122276, + "learning_rate": 1.4787923347529328e-05, + "loss": 0.0353, + "step": 130450 + }, + { + "epoch": 0.2523, + "grad_norm": 0.061414580792188644, + "learning_rate": 1.4784150409985952e-05, + "loss": 0.034, + "step": 130460 + }, + { + "epoch": 0.25235, + "grad_norm": 0.06263222545385361, + "learning_rate": 1.4780377751740076e-05, + "loss": 0.0337, + "step": 130470 + }, + { + "epoch": 0.2524, + "grad_norm": 0.07081720232963562, + "learning_rate": 1.4776605372894819e-05, + "loss": 0.0325, + "step": 130480 + }, + { + "epoch": 0.25245, + "grad_norm": 0.0483192540705204, + "learning_rate": 1.4772833273553338e-05, + "loss": 0.0347, + "step": 130490 + }, + { + "epoch": 0.2525, + "grad_norm": 0.045618437230587006, + "learning_rate": 1.4769061453818761e-05, + "loss": 0.0317, + "step": 130500 + }, + { + "epoch": 0.25255, + "grad_norm": 0.0476597435772419, + "learning_rate": 1.4765289913794189e-05, + "loss": 0.0331, + "step": 130510 + }, + { + "epoch": 0.2526, + "grad_norm": 0.0782722607254982, + "learning_rate": 1.4761518653582753e-05, + "loss": 0.034, + "step": 130520 + }, + { + "epoch": 0.25265, + "grad_norm": 0.053563639521598816, + "learning_rate": 1.4757747673287548e-05, + "loss": 0.0333, + "step": 130530 + }, + { + "epoch": 0.2527, + "grad_norm": 0.05494308099150658, + "learning_rate": 1.4753976973011677e-05, + "loss": 0.0323, + "step": 130540 + }, + { + "epoch": 0.25275, + "grad_norm": 0.05890059843659401, + "learning_rate": 1.475020655285822e-05, + "loss": 0.0323, + "step": 130550 + }, + { + "epoch": 0.2528, + "grad_norm": 0.05034961178898811, + "learning_rate": 1.4746436412930261e-05, + "loss": 0.0339, + "step": 130560 + }, + { + "epoch": 0.25285, + "grad_norm": 0.05406641960144043, + "learning_rate": 1.4742666553330894e-05, + "loss": 0.0326, + "step": 130570 + }, + { + "epoch": 0.2529, + "grad_norm": 0.06735268235206604, + "learning_rate": 1.4738896974163154e-05, + "loss": 0.0332, + "step": 130580 + }, + { + "epoch": 0.25295, + "grad_norm": 0.062144964933395386, + "learning_rate": 1.473512767553013e-05, + "loss": 0.0329, + "step": 130590 + }, + { + "epoch": 0.253, + "grad_norm": 0.04975804686546326, + "learning_rate": 1.473135865753485e-05, + "loss": 0.0332, + "step": 130600 + }, + { + "epoch": 0.25305, + "grad_norm": 0.05743909627199173, + "learning_rate": 1.4727589920280366e-05, + "loss": 0.0319, + "step": 130610 + }, + { + "epoch": 0.2531, + "grad_norm": 0.06325464695692062, + "learning_rate": 1.472382146386972e-05, + "loss": 0.0335, + "step": 130620 + }, + { + "epoch": 0.25315, + "grad_norm": 0.07747869938611984, + "learning_rate": 1.4720053288405928e-05, + "loss": 0.0324, + "step": 130630 + }, + { + "epoch": 0.2532, + "grad_norm": 0.06305349618196487, + "learning_rate": 1.4716285393992025e-05, + "loss": 0.032, + "step": 130640 + }, + { + "epoch": 0.25325, + "grad_norm": 0.06549563258886337, + "learning_rate": 1.4712517780731011e-05, + "loss": 0.0324, + "step": 130650 + }, + { + "epoch": 0.2533, + "grad_norm": 0.0979926735162735, + "learning_rate": 1.4708750448725899e-05, + "loss": 0.0346, + "step": 130660 + }, + { + "epoch": 0.25335, + "grad_norm": 0.0753621831536293, + "learning_rate": 1.470498339807968e-05, + "loss": 0.034, + "step": 130670 + }, + { + "epoch": 0.2534, + "grad_norm": 0.06919003278017044, + "learning_rate": 1.4701216628895342e-05, + "loss": 0.0343, + "step": 130680 + }, + { + "epoch": 0.25345, + "grad_norm": 0.053296275436878204, + "learning_rate": 1.4697450141275881e-05, + "loss": 0.034, + "step": 130690 + }, + { + "epoch": 0.2535, + "grad_norm": 0.06867097318172455, + "learning_rate": 1.469368393532426e-05, + "loss": 0.0338, + "step": 130700 + }, + { + "epoch": 0.25355, + "grad_norm": 0.05448020622134209, + "learning_rate": 1.468991801114345e-05, + "loss": 0.0321, + "step": 130710 + }, + { + "epoch": 0.2536, + "grad_norm": 0.05940767377614975, + "learning_rate": 1.4686152368836406e-05, + "loss": 0.0331, + "step": 130720 + }, + { + "epoch": 0.25365, + "grad_norm": 0.053210772573947906, + "learning_rate": 1.4682387008506087e-05, + "loss": 0.0347, + "step": 130730 + }, + { + "epoch": 0.2537, + "grad_norm": 0.053768180310726166, + "learning_rate": 1.4678621930255426e-05, + "loss": 0.0334, + "step": 130740 + }, + { + "epoch": 0.25375, + "grad_norm": 0.05573287233710289, + "learning_rate": 1.467485713418736e-05, + "loss": 0.0328, + "step": 130750 + }, + { + "epoch": 0.2538, + "grad_norm": 0.05512506514787674, + "learning_rate": 1.4671092620404831e-05, + "loss": 0.0338, + "step": 130760 + }, + { + "epoch": 0.25385, + "grad_norm": 0.059196557849645615, + "learning_rate": 1.4667328389010738e-05, + "loss": 0.0336, + "step": 130770 + }, + { + "epoch": 0.2539, + "grad_norm": 0.05658409744501114, + "learning_rate": 1.4663564440108019e-05, + "loss": 0.035, + "step": 130780 + }, + { + "epoch": 0.25395, + "grad_norm": 0.05933243781328201, + "learning_rate": 1.4659800773799547e-05, + "loss": 0.0348, + "step": 130790 + }, + { + "epoch": 0.254, + "grad_norm": 0.06367258727550507, + "learning_rate": 1.4656037390188246e-05, + "loss": 0.0325, + "step": 130800 + }, + { + "epoch": 0.25405, + "grad_norm": 0.057351671159267426, + "learning_rate": 1.4652274289377e-05, + "loss": 0.0343, + "step": 130810 + }, + { + "epoch": 0.2541, + "grad_norm": 0.061499468982219696, + "learning_rate": 1.4648511471468685e-05, + "loss": 0.0325, + "step": 130820 + }, + { + "epoch": 0.25415, + "grad_norm": 0.059906553477048874, + "learning_rate": 1.464474893656618e-05, + "loss": 0.0341, + "step": 130830 + }, + { + "epoch": 0.2542, + "grad_norm": 0.05713615193963051, + "learning_rate": 1.4640986684772345e-05, + "loss": 0.0353, + "step": 130840 + }, + { + "epoch": 0.25425, + "grad_norm": 0.079178087413311, + "learning_rate": 1.4637224716190045e-05, + "loss": 0.035, + "step": 130850 + }, + { + "epoch": 0.2543, + "grad_norm": 0.0858449786901474, + "learning_rate": 1.4633463030922129e-05, + "loss": 0.0354, + "step": 130860 + }, + { + "epoch": 0.25435, + "grad_norm": 0.06844604760408401, + "learning_rate": 1.462970162907143e-05, + "loss": 0.0339, + "step": 130870 + }, + { + "epoch": 0.2544, + "grad_norm": 0.06488780677318573, + "learning_rate": 1.4625940510740807e-05, + "loss": 0.0338, + "step": 130880 + }, + { + "epoch": 0.25445, + "grad_norm": 0.06646611541509628, + "learning_rate": 1.462217967603306e-05, + "loss": 0.0356, + "step": 130890 + }, + { + "epoch": 0.2545, + "grad_norm": 0.06702245771884918, + "learning_rate": 1.461841912505103e-05, + "loss": 0.0338, + "step": 130900 + }, + { + "epoch": 0.25455, + "grad_norm": 0.06354524940252304, + "learning_rate": 1.4614658857897518e-05, + "loss": 0.0333, + "step": 130910 + }, + { + "epoch": 0.2546, + "grad_norm": 0.05211248621344566, + "learning_rate": 1.4610898874675333e-05, + "loss": 0.0334, + "step": 130920 + }, + { + "epoch": 0.25465, + "grad_norm": 0.06046607345342636, + "learning_rate": 1.4607139175487267e-05, + "loss": 0.0332, + "step": 130930 + }, + { + "epoch": 0.2547, + "grad_norm": 0.06311957538127899, + "learning_rate": 1.4603379760436111e-05, + "loss": 0.0336, + "step": 130940 + }, + { + "epoch": 0.25475, + "grad_norm": 0.05592069774866104, + "learning_rate": 1.4599620629624655e-05, + "loss": 0.0356, + "step": 130950 + }, + { + "epoch": 0.2548, + "grad_norm": 0.05934546887874603, + "learning_rate": 1.4595861783155654e-05, + "loss": 0.0335, + "step": 130960 + }, + { + "epoch": 0.25485, + "grad_norm": 0.06932884454727173, + "learning_rate": 1.459210322113189e-05, + "loss": 0.0343, + "step": 130970 + }, + { + "epoch": 0.2549, + "grad_norm": 0.05560052767395973, + "learning_rate": 1.4588344943656102e-05, + "loss": 0.0349, + "step": 130980 + }, + { + "epoch": 0.25495, + "grad_norm": 0.07452212274074554, + "learning_rate": 1.4584586950831064e-05, + "loss": 0.0339, + "step": 130990 + }, + { + "epoch": 0.255, + "grad_norm": 0.05970346927642822, + "learning_rate": 1.4580829242759506e-05, + "loss": 0.0349, + "step": 131000 + }, + { + "epoch": 0.25505, + "grad_norm": 0.05696116015315056, + "learning_rate": 1.4577071819544152e-05, + "loss": 0.0345, + "step": 131010 + }, + { + "epoch": 0.2551, + "grad_norm": 0.06986770778894424, + "learning_rate": 1.4573314681287747e-05, + "loss": 0.0342, + "step": 131020 + }, + { + "epoch": 0.25515, + "grad_norm": 0.06303253024816513, + "learning_rate": 1.4569557828092995e-05, + "loss": 0.0343, + "step": 131030 + }, + { + "epoch": 0.2552, + "grad_norm": 0.055824995040893555, + "learning_rate": 1.4565801260062628e-05, + "loss": 0.0331, + "step": 131040 + }, + { + "epoch": 0.25525, + "grad_norm": 0.054569143801927567, + "learning_rate": 1.4562044977299322e-05, + "loss": 0.0333, + "step": 131050 + }, + { + "epoch": 0.2553, + "grad_norm": 0.054146453738212585, + "learning_rate": 1.4558288979905779e-05, + "loss": 0.0336, + "step": 131060 + }, + { + "epoch": 0.25535, + "grad_norm": 0.058401964604854584, + "learning_rate": 1.4554533267984705e-05, + "loss": 0.0337, + "step": 131070 + }, + { + "epoch": 0.2554, + "grad_norm": 0.06326146423816681, + "learning_rate": 1.4550777841638765e-05, + "loss": 0.0338, + "step": 131080 + }, + { + "epoch": 0.25545, + "grad_norm": 0.06221143156290054, + "learning_rate": 1.4547022700970637e-05, + "loss": 0.0344, + "step": 131090 + }, + { + "epoch": 0.2555, + "grad_norm": 0.061658527702093124, + "learning_rate": 1.454326784608297e-05, + "loss": 0.0348, + "step": 131100 + }, + { + "epoch": 0.25555, + "grad_norm": 0.07470380514860153, + "learning_rate": 1.4539513277078437e-05, + "loss": 0.0343, + "step": 131110 + }, + { + "epoch": 0.2556, + "grad_norm": 0.061844952404499054, + "learning_rate": 1.4535758994059687e-05, + "loss": 0.0352, + "step": 131120 + }, + { + "epoch": 0.25565, + "grad_norm": 0.07039070874452591, + "learning_rate": 1.4532004997129342e-05, + "loss": 0.0338, + "step": 131130 + }, + { + "epoch": 0.2557, + "grad_norm": 0.05499129742383957, + "learning_rate": 1.452825128639006e-05, + "loss": 0.0342, + "step": 131140 + }, + { + "epoch": 0.25575, + "grad_norm": 0.05033637210726738, + "learning_rate": 1.4524497861944442e-05, + "loss": 0.0356, + "step": 131150 + }, + { + "epoch": 0.2558, + "grad_norm": 0.06897959858179092, + "learning_rate": 1.4520744723895128e-05, + "loss": 0.0369, + "step": 131160 + }, + { + "epoch": 0.25585, + "grad_norm": 0.06327641010284424, + "learning_rate": 1.4516991872344715e-05, + "loss": 0.036, + "step": 131170 + }, + { + "epoch": 0.2559, + "grad_norm": 0.06402651965618134, + "learning_rate": 1.4513239307395799e-05, + "loss": 0.0334, + "step": 131180 + }, + { + "epoch": 0.25595, + "grad_norm": 0.06561446934938431, + "learning_rate": 1.4509487029150986e-05, + "loss": 0.0358, + "step": 131190 + }, + { + "epoch": 0.256, + "grad_norm": 0.08165391534566879, + "learning_rate": 1.450573503771286e-05, + "loss": 0.0345, + "step": 131200 + }, + { + "epoch": 0.25605, + "grad_norm": 0.07116862386465073, + "learning_rate": 1.4501983333183994e-05, + "loss": 0.0346, + "step": 131210 + }, + { + "epoch": 0.2561, + "grad_norm": 0.05546938627958298, + "learning_rate": 1.4498231915666949e-05, + "loss": 0.0333, + "step": 131220 + }, + { + "epoch": 0.25615, + "grad_norm": 0.047905996441841125, + "learning_rate": 1.4494480785264308e-05, + "loss": 0.0343, + "step": 131230 + }, + { + "epoch": 0.2562, + "grad_norm": 0.0557168684899807, + "learning_rate": 1.4490729942078607e-05, + "loss": 0.034, + "step": 131240 + }, + { + "epoch": 0.25625, + "grad_norm": 0.06065616011619568, + "learning_rate": 1.4486979386212412e-05, + "loss": 0.0324, + "step": 131250 + }, + { + "epoch": 0.2563, + "grad_norm": 0.06020541861653328, + "learning_rate": 1.4483229117768249e-05, + "loss": 0.034, + "step": 131260 + }, + { + "epoch": 0.25635, + "grad_norm": 0.051777441054582596, + "learning_rate": 1.4479479136848645e-05, + "loss": 0.0331, + "step": 131270 + }, + { + "epoch": 0.2564, + "grad_norm": 0.05713462457060814, + "learning_rate": 1.4475729443556135e-05, + "loss": 0.0344, + "step": 131280 + }, + { + "epoch": 0.25645, + "grad_norm": 0.05230337381362915, + "learning_rate": 1.4471980037993229e-05, + "loss": 0.0316, + "step": 131290 + }, + { + "epoch": 0.2565, + "grad_norm": 0.06317819654941559, + "learning_rate": 1.4468230920262432e-05, + "loss": 0.0347, + "step": 131300 + }, + { + "epoch": 0.25655, + "grad_norm": 0.06475378572940826, + "learning_rate": 1.4464482090466238e-05, + "loss": 0.034, + "step": 131310 + }, + { + "epoch": 0.2566, + "grad_norm": 0.05910594016313553, + "learning_rate": 1.4460733548707144e-05, + "loss": 0.0356, + "step": 131320 + }, + { + "epoch": 0.25665, + "grad_norm": 0.07089439779520035, + "learning_rate": 1.4456985295087657e-05, + "loss": 0.0349, + "step": 131330 + }, + { + "epoch": 0.2567, + "grad_norm": 0.06080051511526108, + "learning_rate": 1.4453237329710213e-05, + "loss": 0.0334, + "step": 131340 + }, + { + "epoch": 0.25675, + "grad_norm": 0.06926887482404709, + "learning_rate": 1.4449489652677303e-05, + "loss": 0.0345, + "step": 131350 + }, + { + "epoch": 0.2568, + "grad_norm": 0.06788626313209534, + "learning_rate": 1.4445742264091374e-05, + "loss": 0.0328, + "step": 131360 + }, + { + "epoch": 0.25685, + "grad_norm": 0.0659489706158638, + "learning_rate": 1.4441995164054898e-05, + "loss": 0.0357, + "step": 131370 + }, + { + "epoch": 0.2569, + "grad_norm": 0.05504168942570686, + "learning_rate": 1.4438248352670309e-05, + "loss": 0.0337, + "step": 131380 + }, + { + "epoch": 0.25695, + "grad_norm": 0.05858158692717552, + "learning_rate": 1.4434501830040026e-05, + "loss": 0.034, + "step": 131390 + }, + { + "epoch": 0.257, + "grad_norm": 0.05563119798898697, + "learning_rate": 1.443075559626651e-05, + "loss": 0.0342, + "step": 131400 + }, + { + "epoch": 0.25705, + "grad_norm": 0.06597186625003815, + "learning_rate": 1.4427009651452161e-05, + "loss": 0.0378, + "step": 131410 + }, + { + "epoch": 0.2571, + "grad_norm": 0.06534725427627563, + "learning_rate": 1.4423263995699398e-05, + "loss": 0.0364, + "step": 131420 + }, + { + "epoch": 0.25715, + "grad_norm": 0.058977410197257996, + "learning_rate": 1.4419518629110615e-05, + "loss": 0.0372, + "step": 131430 + }, + { + "epoch": 0.2572, + "grad_norm": 0.056536633521318436, + "learning_rate": 1.4415773551788214e-05, + "loss": 0.0355, + "step": 131440 + }, + { + "epoch": 0.25725, + "grad_norm": 0.056408364325761795, + "learning_rate": 1.4412028763834602e-05, + "loss": 0.0351, + "step": 131450 + }, + { + "epoch": 0.2573, + "grad_norm": 0.06189712509512901, + "learning_rate": 1.4408284265352146e-05, + "loss": 0.0328, + "step": 131460 + }, + { + "epoch": 0.25735, + "grad_norm": 0.05862760916352272, + "learning_rate": 1.4404540056443216e-05, + "loss": 0.0336, + "step": 131470 + }, + { + "epoch": 0.2574, + "grad_norm": 0.06858477741479874, + "learning_rate": 1.4400796137210171e-05, + "loss": 0.0338, + "step": 131480 + }, + { + "epoch": 0.25745, + "grad_norm": 0.05762804299592972, + "learning_rate": 1.4397052507755388e-05, + "loss": 0.0334, + "step": 131490 + }, + { + "epoch": 0.2575, + "grad_norm": 0.06049410253763199, + "learning_rate": 1.4393309168181207e-05, + "loss": 0.0339, + "step": 131500 + }, + { + "epoch": 0.25755, + "grad_norm": 0.06922336667776108, + "learning_rate": 1.438956611858996e-05, + "loss": 0.0343, + "step": 131510 + }, + { + "epoch": 0.2576, + "grad_norm": 0.05931337550282478, + "learning_rate": 1.4385823359083994e-05, + "loss": 0.0327, + "step": 131520 + }, + { + "epoch": 0.25765, + "grad_norm": 0.05266860872507095, + "learning_rate": 1.4382080889765625e-05, + "loss": 0.0327, + "step": 131530 + }, + { + "epoch": 0.2577, + "grad_norm": 0.052181702107191086, + "learning_rate": 1.4378338710737194e-05, + "loss": 0.0338, + "step": 131540 + }, + { + "epoch": 0.25775, + "grad_norm": 0.057044558227062225, + "learning_rate": 1.4374596822100972e-05, + "loss": 0.0334, + "step": 131550 + }, + { + "epoch": 0.2578, + "grad_norm": 0.06613699346780777, + "learning_rate": 1.4370855223959279e-05, + "loss": 0.0331, + "step": 131560 + }, + { + "epoch": 0.25785, + "grad_norm": 0.04941616579890251, + "learning_rate": 1.4367113916414423e-05, + "loss": 0.0322, + "step": 131570 + }, + { + "epoch": 0.2579, + "grad_norm": 0.04932907968759537, + "learning_rate": 1.4363372899568677e-05, + "loss": 0.0337, + "step": 131580 + }, + { + "epoch": 0.25795, + "grad_norm": 0.0558835044503212, + "learning_rate": 1.4359632173524323e-05, + "loss": 0.0353, + "step": 131590 + }, + { + "epoch": 0.258, + "grad_norm": 0.054416995495557785, + "learning_rate": 1.4355891738383614e-05, + "loss": 0.0344, + "step": 131600 + }, + { + "epoch": 0.25805, + "grad_norm": 0.05548791214823723, + "learning_rate": 1.435215159424884e-05, + "loss": 0.0338, + "step": 131610 + }, + { + "epoch": 0.2581, + "grad_norm": 0.045173488557338715, + "learning_rate": 1.434841174122224e-05, + "loss": 0.0323, + "step": 131620 + }, + { + "epoch": 0.25815, + "grad_norm": 0.06165101379156113, + "learning_rate": 1.4344672179406049e-05, + "loss": 0.034, + "step": 131630 + }, + { + "epoch": 0.2582, + "grad_norm": 0.06000905483961105, + "learning_rate": 1.4340932908902527e-05, + "loss": 0.0339, + "step": 131640 + }, + { + "epoch": 0.25825, + "grad_norm": 0.07140598446130753, + "learning_rate": 1.4337193929813889e-05, + "loss": 0.0346, + "step": 131650 + }, + { + "epoch": 0.2583, + "grad_norm": 0.06754638999700546, + "learning_rate": 1.4333455242242371e-05, + "loss": 0.034, + "step": 131660 + }, + { + "epoch": 0.25835, + "grad_norm": 0.05807989090681076, + "learning_rate": 1.432971684629018e-05, + "loss": 0.0331, + "step": 131670 + }, + { + "epoch": 0.2584, + "grad_norm": 0.061242345720529556, + "learning_rate": 1.432597874205952e-05, + "loss": 0.0333, + "step": 131680 + }, + { + "epoch": 0.25845, + "grad_norm": 0.05244876816868782, + "learning_rate": 1.4322240929652581e-05, + "loss": 0.0332, + "step": 131690 + }, + { + "epoch": 0.2585, + "grad_norm": 0.05072590336203575, + "learning_rate": 1.4318503409171578e-05, + "loss": 0.033, + "step": 131700 + }, + { + "epoch": 0.25855, + "grad_norm": 0.07088343799114227, + "learning_rate": 1.4314766180718675e-05, + "loss": 0.0345, + "step": 131710 + }, + { + "epoch": 0.2586, + "grad_norm": 0.066108338534832, + "learning_rate": 1.4311029244396041e-05, + "loss": 0.0338, + "step": 131720 + }, + { + "epoch": 0.25865, + "grad_norm": 0.05816994979977608, + "learning_rate": 1.430729260030586e-05, + "loss": 0.0339, + "step": 131730 + }, + { + "epoch": 0.2587, + "grad_norm": 0.07457224279642105, + "learning_rate": 1.4303556248550276e-05, + "loss": 0.0331, + "step": 131740 + }, + { + "epoch": 0.25875, + "grad_norm": 0.05543922260403633, + "learning_rate": 1.4299820189231456e-05, + "loss": 0.0332, + "step": 131750 + }, + { + "epoch": 0.2588, + "grad_norm": 0.07165886461734772, + "learning_rate": 1.4296084422451528e-05, + "loss": 0.0353, + "step": 131760 + }, + { + "epoch": 0.25885, + "grad_norm": 0.06840430945158005, + "learning_rate": 1.4292348948312623e-05, + "loss": 0.0324, + "step": 131770 + }, + { + "epoch": 0.2589, + "grad_norm": 0.06386654078960419, + "learning_rate": 1.4288613766916883e-05, + "loss": 0.0328, + "step": 131780 + }, + { + "epoch": 0.25895, + "grad_norm": 0.05158427730202675, + "learning_rate": 1.428487887836642e-05, + "loss": 0.0319, + "step": 131790 + }, + { + "epoch": 0.259, + "grad_norm": 0.05500097945332527, + "learning_rate": 1.4281144282763342e-05, + "loss": 0.0351, + "step": 131800 + }, + { + "epoch": 0.25905, + "grad_norm": 0.06515280157327652, + "learning_rate": 1.4277409980209747e-05, + "loss": 0.0326, + "step": 131810 + }, + { + "epoch": 0.2591, + "grad_norm": 0.06642050296068192, + "learning_rate": 1.4273675970807732e-05, + "loss": 0.0323, + "step": 131820 + }, + { + "epoch": 0.25915, + "grad_norm": 0.06143985316157341, + "learning_rate": 1.4269942254659406e-05, + "loss": 0.0341, + "step": 131830 + }, + { + "epoch": 0.2592, + "grad_norm": 0.05663907155394554, + "learning_rate": 1.4266208831866806e-05, + "loss": 0.0321, + "step": 131840 + }, + { + "epoch": 0.25925, + "grad_norm": 0.08131860196590424, + "learning_rate": 1.4262475702532036e-05, + "loss": 0.0346, + "step": 131850 + }, + { + "epoch": 0.2593, + "grad_norm": 0.057754553854465485, + "learning_rate": 1.4258742866757135e-05, + "loss": 0.0327, + "step": 131860 + }, + { + "epoch": 0.25935, + "grad_norm": 0.05802265927195549, + "learning_rate": 1.425501032464418e-05, + "loss": 0.0336, + "step": 131870 + }, + { + "epoch": 0.2594, + "grad_norm": 0.059007786214351654, + "learning_rate": 1.4251278076295205e-05, + "loss": 0.0332, + "step": 131880 + }, + { + "epoch": 0.25945, + "grad_norm": 0.05896070599555969, + "learning_rate": 1.4247546121812238e-05, + "loss": 0.0333, + "step": 131890 + }, + { + "epoch": 0.2595, + "grad_norm": 0.07986626774072647, + "learning_rate": 1.4243814461297334e-05, + "loss": 0.0352, + "step": 131900 + }, + { + "epoch": 0.25955, + "grad_norm": 0.07115952670574188, + "learning_rate": 1.4240083094852502e-05, + "loss": 0.0326, + "step": 131910 + }, + { + "epoch": 0.2596, + "grad_norm": 0.061306633055210114, + "learning_rate": 1.4236352022579752e-05, + "loss": 0.0337, + "step": 131920 + }, + { + "epoch": 0.25965, + "grad_norm": 0.055750228464603424, + "learning_rate": 1.4232621244581088e-05, + "loss": 0.0337, + "step": 131930 + }, + { + "epoch": 0.2597, + "grad_norm": 0.054329417645931244, + "learning_rate": 1.4228890760958521e-05, + "loss": 0.0341, + "step": 131940 + }, + { + "epoch": 0.25975, + "grad_norm": 0.06293036788702011, + "learning_rate": 1.4225160571814026e-05, + "loss": 0.0336, + "step": 131950 + }, + { + "epoch": 0.2598, + "grad_norm": 0.054844535887241364, + "learning_rate": 1.4221430677249604e-05, + "loss": 0.0339, + "step": 131960 + }, + { + "epoch": 0.25985, + "grad_norm": 0.06715855002403259, + "learning_rate": 1.4217701077367212e-05, + "loss": 0.0336, + "step": 131970 + }, + { + "epoch": 0.2599, + "grad_norm": 0.057781148701906204, + "learning_rate": 1.4213971772268819e-05, + "loss": 0.0338, + "step": 131980 + }, + { + "epoch": 0.25995, + "grad_norm": 0.05987885966897011, + "learning_rate": 1.4210242762056391e-05, + "loss": 0.033, + "step": 131990 + }, + { + "epoch": 0.26, + "grad_norm": 0.05983780324459076, + "learning_rate": 1.4206514046831876e-05, + "loss": 0.0336, + "step": 132000 + }, + { + "epoch": 0.26005, + "grad_norm": 0.057271696627140045, + "learning_rate": 1.4202785626697199e-05, + "loss": 0.0339, + "step": 132010 + }, + { + "epoch": 0.2601, + "grad_norm": 0.06230291724205017, + "learning_rate": 1.4199057501754317e-05, + "loss": 0.0335, + "step": 132020 + }, + { + "epoch": 0.26015, + "grad_norm": 0.05530672147870064, + "learning_rate": 1.4195329672105135e-05, + "loss": 0.0333, + "step": 132030 + }, + { + "epoch": 0.2602, + "grad_norm": 0.1007639616727829, + "learning_rate": 1.41916021378516e-05, + "loss": 0.0343, + "step": 132040 + }, + { + "epoch": 0.26025, + "grad_norm": 0.08219780027866364, + "learning_rate": 1.4187874899095587e-05, + "loss": 0.0333, + "step": 132050 + }, + { + "epoch": 0.2603, + "grad_norm": 0.06372511386871338, + "learning_rate": 1.4184147955939018e-05, + "loss": 0.036, + "step": 132060 + }, + { + "epoch": 0.26035, + "grad_norm": 0.07079017162322998, + "learning_rate": 1.4180421308483771e-05, + "loss": 0.0387, + "step": 132070 + }, + { + "epoch": 0.2604, + "grad_norm": 0.07121309638023376, + "learning_rate": 1.417669495683175e-05, + "loss": 0.0336, + "step": 132080 + }, + { + "epoch": 0.26045, + "grad_norm": 0.07378543168306351, + "learning_rate": 1.4172968901084827e-05, + "loss": 0.0355, + "step": 132090 + }, + { + "epoch": 0.2605, + "grad_norm": 0.05647118389606476, + "learning_rate": 1.4169243141344851e-05, + "loss": 0.0326, + "step": 132100 + }, + { + "epoch": 0.26055, + "grad_norm": 0.07248450815677643, + "learning_rate": 1.4165517677713714e-05, + "loss": 0.0352, + "step": 132110 + }, + { + "epoch": 0.2606, + "grad_norm": 0.06483283638954163, + "learning_rate": 1.4161792510293248e-05, + "loss": 0.0335, + "step": 132120 + }, + { + "epoch": 0.26065, + "grad_norm": 0.06354866921901703, + "learning_rate": 1.4158067639185308e-05, + "loss": 0.0341, + "step": 132130 + }, + { + "epoch": 0.2607, + "grad_norm": 0.06080995872616768, + "learning_rate": 1.415434306449171e-05, + "loss": 0.0346, + "step": 132140 + }, + { + "epoch": 0.26075, + "grad_norm": 0.06035395339131355, + "learning_rate": 1.41506187863143e-05, + "loss": 0.0349, + "step": 132150 + }, + { + "epoch": 0.2608, + "grad_norm": 0.05553651601076126, + "learning_rate": 1.4146894804754912e-05, + "loss": 0.0344, + "step": 132160 + }, + { + "epoch": 0.26085, + "grad_norm": 0.05365028232336044, + "learning_rate": 1.4143171119915338e-05, + "loss": 0.0341, + "step": 132170 + }, + { + "epoch": 0.2609, + "grad_norm": 0.049796558916568756, + "learning_rate": 1.4139447731897387e-05, + "loss": 0.0338, + "step": 132180 + }, + { + "epoch": 0.26095, + "grad_norm": 0.056830406188964844, + "learning_rate": 1.4135724640802844e-05, + "loss": 0.0344, + "step": 132190 + }, + { + "epoch": 0.261, + "grad_norm": 0.05340270698070526, + "learning_rate": 1.413200184673352e-05, + "loss": 0.0346, + "step": 132200 + }, + { + "epoch": 0.26105, + "grad_norm": 0.05918886139988899, + "learning_rate": 1.4128279349791179e-05, + "loss": 0.0338, + "step": 132210 + }, + { + "epoch": 0.2611, + "grad_norm": 0.06484489142894745, + "learning_rate": 1.4124557150077588e-05, + "loss": 0.0362, + "step": 132220 + }, + { + "epoch": 0.26115, + "grad_norm": 0.06631860136985779, + "learning_rate": 1.412083524769453e-05, + "loss": 0.0346, + "step": 132230 + }, + { + "epoch": 0.2612, + "grad_norm": 0.05315980687737465, + "learning_rate": 1.411711364274374e-05, + "loss": 0.0334, + "step": 132240 + }, + { + "epoch": 0.26125, + "grad_norm": 0.049241699278354645, + "learning_rate": 1.4113392335326994e-05, + "loss": 0.0328, + "step": 132250 + }, + { + "epoch": 0.2613, + "grad_norm": 0.04963986948132515, + "learning_rate": 1.4109671325545992e-05, + "loss": 0.0321, + "step": 132260 + }, + { + "epoch": 0.26135, + "grad_norm": 0.05320208892226219, + "learning_rate": 1.4105950613502482e-05, + "loss": 0.0342, + "step": 132270 + }, + { + "epoch": 0.2614, + "grad_norm": 0.0504610501229763, + "learning_rate": 1.4102230199298203e-05, + "loss": 0.0328, + "step": 132280 + }, + { + "epoch": 0.26145, + "grad_norm": 0.06167743355035782, + "learning_rate": 1.4098510083034855e-05, + "loss": 0.0328, + "step": 132290 + }, + { + "epoch": 0.2615, + "grad_norm": 0.05889127030968666, + "learning_rate": 1.4094790264814145e-05, + "loss": 0.0329, + "step": 132300 + }, + { + "epoch": 0.26155, + "grad_norm": 0.0674830749630928, + "learning_rate": 1.4091070744737762e-05, + "loss": 0.0329, + "step": 132310 + }, + { + "epoch": 0.2616, + "grad_norm": 0.0638226568698883, + "learning_rate": 1.4087351522907416e-05, + "loss": 0.0317, + "step": 132320 + }, + { + "epoch": 0.26165, + "grad_norm": 0.06432203948497772, + "learning_rate": 1.408363259942478e-05, + "loss": 0.0333, + "step": 132330 + }, + { + "epoch": 0.2617, + "grad_norm": 0.06924125552177429, + "learning_rate": 1.407991397439152e-05, + "loss": 0.0325, + "step": 132340 + }, + { + "epoch": 0.26175, + "grad_norm": 0.06363802403211594, + "learning_rate": 1.4076195647909319e-05, + "loss": 0.0342, + "step": 132350 + }, + { + "epoch": 0.2618, + "grad_norm": 0.05731474980711937, + "learning_rate": 1.4072477620079813e-05, + "loss": 0.0317, + "step": 132360 + }, + { + "epoch": 0.26185, + "grad_norm": 0.05997426062822342, + "learning_rate": 1.4068759891004673e-05, + "loss": 0.0328, + "step": 132370 + }, + { + "epoch": 0.2619, + "grad_norm": 0.06531939655542374, + "learning_rate": 1.4065042460785532e-05, + "loss": 0.0354, + "step": 132380 + }, + { + "epoch": 0.26195, + "grad_norm": 0.05618453025817871, + "learning_rate": 1.4061325329524015e-05, + "loss": 0.0332, + "step": 132390 + }, + { + "epoch": 0.262, + "grad_norm": 0.06673513352870941, + "learning_rate": 1.4057608497321762e-05, + "loss": 0.0328, + "step": 132400 + }, + { + "epoch": 0.26205, + "grad_norm": 0.05080614238977432, + "learning_rate": 1.4053891964280381e-05, + "loss": 0.0326, + "step": 132410 + }, + { + "epoch": 0.2621, + "grad_norm": 0.0496988408267498, + "learning_rate": 1.405017573050148e-05, + "loss": 0.0334, + "step": 132420 + }, + { + "epoch": 0.26215, + "grad_norm": 0.0615084283053875, + "learning_rate": 1.4046459796086653e-05, + "loss": 0.0336, + "step": 132430 + }, + { + "epoch": 0.2622, + "grad_norm": 0.06069965288043022, + "learning_rate": 1.404274416113751e-05, + "loss": 0.0337, + "step": 132440 + }, + { + "epoch": 0.26225, + "grad_norm": 0.0644589215517044, + "learning_rate": 1.4039028825755616e-05, + "loss": 0.0346, + "step": 132450 + }, + { + "epoch": 0.2623, + "grad_norm": 0.06995908915996552, + "learning_rate": 1.4035313790042565e-05, + "loss": 0.0343, + "step": 132460 + }, + { + "epoch": 0.26235, + "grad_norm": 0.06145939975976944, + "learning_rate": 1.4031599054099915e-05, + "loss": 0.0334, + "step": 132470 + }, + { + "epoch": 0.2624, + "grad_norm": 0.061161722987890244, + "learning_rate": 1.4027884618029216e-05, + "loss": 0.0342, + "step": 132480 + }, + { + "epoch": 0.26245, + "grad_norm": 0.05761851370334625, + "learning_rate": 1.4024170481932041e-05, + "loss": 0.034, + "step": 132490 + }, + { + "epoch": 0.2625, + "grad_norm": 0.06380891054868698, + "learning_rate": 1.4020456645909921e-05, + "loss": 0.033, + "step": 132500 + }, + { + "epoch": 0.26255, + "grad_norm": 0.07063276320695877, + "learning_rate": 1.401674311006439e-05, + "loss": 0.034, + "step": 132510 + }, + { + "epoch": 0.2626, + "grad_norm": 0.06349416822195053, + "learning_rate": 1.401302987449697e-05, + "loss": 0.0333, + "step": 132520 + }, + { + "epoch": 0.26265, + "grad_norm": 0.09622623026371002, + "learning_rate": 1.4009316939309183e-05, + "loss": 0.0376, + "step": 132530 + }, + { + "epoch": 0.2627, + "grad_norm": 0.08947856724262238, + "learning_rate": 1.4005604304602563e-05, + "loss": 0.0359, + "step": 132540 + }, + { + "epoch": 0.26275, + "grad_norm": 0.0602882094681263, + "learning_rate": 1.400189197047857e-05, + "loss": 0.0336, + "step": 132550 + }, + { + "epoch": 0.2628, + "grad_norm": 0.060876134783029556, + "learning_rate": 1.3998179937038728e-05, + "loss": 0.0349, + "step": 132560 + }, + { + "epoch": 0.26285, + "grad_norm": 0.07160362601280212, + "learning_rate": 1.3994468204384504e-05, + "loss": 0.0337, + "step": 132570 + }, + { + "epoch": 0.2629, + "grad_norm": 0.05875960737466812, + "learning_rate": 1.3990756772617394e-05, + "loss": 0.0342, + "step": 132580 + }, + { + "epoch": 0.26295, + "grad_norm": 0.058581165969371796, + "learning_rate": 1.3987045641838858e-05, + "loss": 0.033, + "step": 132590 + }, + { + "epoch": 0.263, + "grad_norm": 0.05444763973355293, + "learning_rate": 1.398333481215035e-05, + "loss": 0.034, + "step": 132600 + }, + { + "epoch": 0.26305, + "grad_norm": 0.05249619483947754, + "learning_rate": 1.3979624283653336e-05, + "loss": 0.0329, + "step": 132610 + }, + { + "epoch": 0.2631, + "grad_norm": 0.05937279388308525, + "learning_rate": 1.3975914056449255e-05, + "loss": 0.0332, + "step": 132620 + }, + { + "epoch": 0.26315, + "grad_norm": 0.05373000726103783, + "learning_rate": 1.3972204130639544e-05, + "loss": 0.033, + "step": 132630 + }, + { + "epoch": 0.2632, + "grad_norm": 0.06635235249996185, + "learning_rate": 1.3968494506325613e-05, + "loss": 0.0342, + "step": 132640 + }, + { + "epoch": 0.26325, + "grad_norm": 0.054282885044813156, + "learning_rate": 1.3964785183608902e-05, + "loss": 0.0339, + "step": 132650 + }, + { + "epoch": 0.2633, + "grad_norm": 0.07432691007852554, + "learning_rate": 1.396107616259083e-05, + "loss": 0.0342, + "step": 132660 + }, + { + "epoch": 0.26335, + "grad_norm": 0.04971253499388695, + "learning_rate": 1.3957367443372787e-05, + "loss": 0.0345, + "step": 132670 + }, + { + "epoch": 0.2634, + "grad_norm": 0.060743436217308044, + "learning_rate": 1.3953659026056171e-05, + "loss": 0.0362, + "step": 132680 + }, + { + "epoch": 0.26345, + "grad_norm": 0.05153289809823036, + "learning_rate": 1.3949950910742354e-05, + "loss": 0.0331, + "step": 132690 + }, + { + "epoch": 0.2635, + "grad_norm": 0.059696514159440994, + "learning_rate": 1.394624309753274e-05, + "loss": 0.0332, + "step": 132700 + }, + { + "epoch": 0.26355, + "grad_norm": 0.054209042340517044, + "learning_rate": 1.3942535586528688e-05, + "loss": 0.0329, + "step": 132710 + }, + { + "epoch": 0.2636, + "grad_norm": 0.05692094564437866, + "learning_rate": 1.3938828377831548e-05, + "loss": 0.0368, + "step": 132720 + }, + { + "epoch": 0.26365, + "grad_norm": 0.058496423065662384, + "learning_rate": 1.3935121471542695e-05, + "loss": 0.033, + "step": 132730 + }, + { + "epoch": 0.2637, + "grad_norm": 0.06355134397745132, + "learning_rate": 1.393141486776345e-05, + "loss": 0.0345, + "step": 132740 + }, + { + "epoch": 0.26375, + "grad_norm": 0.06379279494285583, + "learning_rate": 1.3927708566595188e-05, + "loss": 0.0321, + "step": 132750 + }, + { + "epoch": 0.2638, + "grad_norm": 0.0828419178724289, + "learning_rate": 1.3924002568139194e-05, + "loss": 0.0345, + "step": 132760 + }, + { + "epoch": 0.26385, + "grad_norm": 0.07966157048940659, + "learning_rate": 1.3920296872496808e-05, + "loss": 0.0338, + "step": 132770 + }, + { + "epoch": 0.2639, + "grad_norm": 0.06958355009555817, + "learning_rate": 1.3916591479769353e-05, + "loss": 0.0337, + "step": 132780 + }, + { + "epoch": 0.26395, + "grad_norm": 0.06073891371488571, + "learning_rate": 1.3912886390058125e-05, + "loss": 0.0336, + "step": 132790 + }, + { + "epoch": 0.264, + "grad_norm": 0.047526225447654724, + "learning_rate": 1.3909181603464415e-05, + "loss": 0.0329, + "step": 132800 + }, + { + "epoch": 0.26405, + "grad_norm": 0.05731450021266937, + "learning_rate": 1.3905477120089505e-05, + "loss": 0.033, + "step": 132810 + }, + { + "epoch": 0.2641, + "grad_norm": 0.057753946632146835, + "learning_rate": 1.3901772940034693e-05, + "loss": 0.0337, + "step": 132820 + }, + { + "epoch": 0.26415, + "grad_norm": 0.05305123329162598, + "learning_rate": 1.389806906340124e-05, + "loss": 0.0326, + "step": 132830 + }, + { + "epoch": 0.2642, + "grad_norm": 0.06059778109192848, + "learning_rate": 1.3894365490290395e-05, + "loss": 0.0337, + "step": 132840 + }, + { + "epoch": 0.26425, + "grad_norm": 0.06575316190719604, + "learning_rate": 1.3890662220803437e-05, + "loss": 0.033, + "step": 132850 + }, + { + "epoch": 0.2643, + "grad_norm": 0.05115986987948418, + "learning_rate": 1.3886959255041592e-05, + "loss": 0.0341, + "step": 132860 + }, + { + "epoch": 0.26435, + "grad_norm": 0.060626786202192307, + "learning_rate": 1.3883256593106115e-05, + "loss": 0.0344, + "step": 132870 + }, + { + "epoch": 0.2644, + "grad_norm": 0.07723161578178406, + "learning_rate": 1.3879554235098224e-05, + "loss": 0.0348, + "step": 132880 + }, + { + "epoch": 0.26445, + "grad_norm": 0.05626492574810982, + "learning_rate": 1.3875852181119142e-05, + "loss": 0.0345, + "step": 132890 + }, + { + "epoch": 0.2645, + "grad_norm": 0.058993149548769, + "learning_rate": 1.3872150431270075e-05, + "loss": 0.0347, + "step": 132900 + }, + { + "epoch": 0.26455, + "grad_norm": 0.06206396594643593, + "learning_rate": 1.3868448985652244e-05, + "loss": 0.0332, + "step": 132910 + }, + { + "epoch": 0.2646, + "grad_norm": 0.05647966265678406, + "learning_rate": 1.3864747844366838e-05, + "loss": 0.0325, + "step": 132920 + }, + { + "epoch": 0.26465, + "grad_norm": 0.06411507725715637, + "learning_rate": 1.3861047007515029e-05, + "loss": 0.0336, + "step": 132930 + }, + { + "epoch": 0.2647, + "grad_norm": 0.07062128931283951, + "learning_rate": 1.3857346475198024e-05, + "loss": 0.034, + "step": 132940 + }, + { + "epoch": 0.26475, + "grad_norm": 0.04719579219818115, + "learning_rate": 1.3853646247516966e-05, + "loss": 0.0325, + "step": 132950 + }, + { + "epoch": 0.2648, + "grad_norm": 0.05616036430001259, + "learning_rate": 1.3849946324573042e-05, + "loss": 0.0341, + "step": 132960 + }, + { + "epoch": 0.26485, + "grad_norm": 0.05677328258752823, + "learning_rate": 1.38462467064674e-05, + "loss": 0.0325, + "step": 132970 + }, + { + "epoch": 0.2649, + "grad_norm": 0.05568123236298561, + "learning_rate": 1.3842547393301172e-05, + "loss": 0.0353, + "step": 132980 + }, + { + "epoch": 0.26495, + "grad_norm": 0.06068773567676544, + "learning_rate": 1.3838848385175515e-05, + "loss": 0.0343, + "step": 132990 + }, + { + "epoch": 0.265, + "grad_norm": 0.05522076413035393, + "learning_rate": 1.383514968219155e-05, + "loss": 0.034, + "step": 133000 + }, + { + "epoch": 0.26505, + "grad_norm": 0.05242743715643883, + "learning_rate": 1.3831451284450403e-05, + "loss": 0.035, + "step": 133010 + }, + { + "epoch": 0.2651, + "grad_norm": 0.05325956270098686, + "learning_rate": 1.3827753192053167e-05, + "loss": 0.035, + "step": 133020 + }, + { + "epoch": 0.26515, + "grad_norm": 0.05957451090216637, + "learning_rate": 1.3824055405100961e-05, + "loss": 0.0344, + "step": 133030 + }, + { + "epoch": 0.2652, + "grad_norm": 0.0515049509704113, + "learning_rate": 1.3820357923694904e-05, + "loss": 0.0341, + "step": 133040 + }, + { + "epoch": 0.26525, + "grad_norm": 0.059082720428705215, + "learning_rate": 1.3816660747936042e-05, + "loss": 0.0346, + "step": 133050 + }, + { + "epoch": 0.2653, + "grad_norm": 0.047861386090517044, + "learning_rate": 1.3812963877925482e-05, + "loss": 0.034, + "step": 133060 + }, + { + "epoch": 0.26535, + "grad_norm": 0.047886017709970474, + "learning_rate": 1.380926731376428e-05, + "loss": 0.0339, + "step": 133070 + }, + { + "epoch": 0.2654, + "grad_norm": 0.047841113060712814, + "learning_rate": 1.3805571055553509e-05, + "loss": 0.0329, + "step": 133080 + }, + { + "epoch": 0.26545, + "grad_norm": 0.05151384696364403, + "learning_rate": 1.3801875103394224e-05, + "loss": 0.034, + "step": 133090 + }, + { + "epoch": 0.2655, + "grad_norm": 0.06597917526960373, + "learning_rate": 1.3798179457387456e-05, + "loss": 0.0341, + "step": 133100 + }, + { + "epoch": 0.26555, + "grad_norm": 0.052481696009635925, + "learning_rate": 1.3794484117634265e-05, + "loss": 0.0338, + "step": 133110 + }, + { + "epoch": 0.2656, + "grad_norm": 0.051191624253988266, + "learning_rate": 1.3790789084235667e-05, + "loss": 0.0335, + "step": 133120 + }, + { + "epoch": 0.26565, + "grad_norm": 0.05215437337756157, + "learning_rate": 1.3787094357292684e-05, + "loss": 0.0343, + "step": 133130 + }, + { + "epoch": 0.2657, + "grad_norm": 0.05486556515097618, + "learning_rate": 1.378339993690632e-05, + "loss": 0.0328, + "step": 133140 + }, + { + "epoch": 0.26575, + "grad_norm": 0.05595598742365837, + "learning_rate": 1.3779705823177597e-05, + "loss": 0.0334, + "step": 133150 + }, + { + "epoch": 0.2658, + "grad_norm": 0.05401711165904999, + "learning_rate": 1.377601201620749e-05, + "loss": 0.033, + "step": 133160 + }, + { + "epoch": 0.26585, + "grad_norm": 0.05710318684577942, + "learning_rate": 1.377231851609701e-05, + "loss": 0.0351, + "step": 133170 + }, + { + "epoch": 0.2659, + "grad_norm": 0.05453859269618988, + "learning_rate": 1.3768625322947126e-05, + "loss": 0.0333, + "step": 133180 + }, + { + "epoch": 0.26595, + "grad_norm": 0.05498124659061432, + "learning_rate": 1.3764932436858791e-05, + "loss": 0.0334, + "step": 133190 + }, + { + "epoch": 0.266, + "grad_norm": 0.05131393298506737, + "learning_rate": 1.3761239857932995e-05, + "loss": 0.0331, + "step": 133200 + }, + { + "epoch": 0.26605, + "grad_norm": 0.07144580781459808, + "learning_rate": 1.375754758627068e-05, + "loss": 0.037, + "step": 133210 + }, + { + "epoch": 0.2661, + "grad_norm": 0.06492135673761368, + "learning_rate": 1.3753855621972777e-05, + "loss": 0.0333, + "step": 133220 + }, + { + "epoch": 0.26615, + "grad_norm": 0.05913819745182991, + "learning_rate": 1.375016396514025e-05, + "loss": 0.0319, + "step": 133230 + }, + { + "epoch": 0.2662, + "grad_norm": 0.05563320964574814, + "learning_rate": 1.3746472615874002e-05, + "loss": 0.033, + "step": 133240 + }, + { + "epoch": 0.26625, + "grad_norm": 0.053804319351911545, + "learning_rate": 1.3742781574274987e-05, + "loss": 0.0339, + "step": 133250 + }, + { + "epoch": 0.2663, + "grad_norm": 0.05768200010061264, + "learning_rate": 1.3739090840444072e-05, + "loss": 0.0341, + "step": 133260 + }, + { + "epoch": 0.26635, + "grad_norm": 0.06974507868289948, + "learning_rate": 1.3735400414482194e-05, + "loss": 0.0334, + "step": 133270 + }, + { + "epoch": 0.2664, + "grad_norm": 0.0580982081592083, + "learning_rate": 1.3731710296490227e-05, + "loss": 0.0339, + "step": 133280 + }, + { + "epoch": 0.26645, + "grad_norm": 0.07492054253816605, + "learning_rate": 1.3728020486569077e-05, + "loss": 0.0336, + "step": 133290 + }, + { + "epoch": 0.2665, + "grad_norm": 0.06081545352935791, + "learning_rate": 1.3724330984819611e-05, + "loss": 0.0328, + "step": 133300 + }, + { + "epoch": 0.26655, + "grad_norm": 0.05259373411536217, + "learning_rate": 1.3720641791342692e-05, + "loss": 0.0333, + "step": 133310 + }, + { + "epoch": 0.2666, + "grad_norm": 0.053279753774404526, + "learning_rate": 1.3716952906239196e-05, + "loss": 0.0322, + "step": 133320 + }, + { + "epoch": 0.26665, + "grad_norm": 0.052714549005031586, + "learning_rate": 1.371326432960997e-05, + "loss": 0.0336, + "step": 133330 + }, + { + "epoch": 0.2667, + "grad_norm": 0.04930153861641884, + "learning_rate": 1.3709576061555859e-05, + "loss": 0.0314, + "step": 133340 + }, + { + "epoch": 0.26675, + "grad_norm": 0.05692337453365326, + "learning_rate": 1.3705888102177685e-05, + "loss": 0.035, + "step": 133350 + }, + { + "epoch": 0.2668, + "grad_norm": 0.048727333545684814, + "learning_rate": 1.3702200451576289e-05, + "loss": 0.0316, + "step": 133360 + }, + { + "epoch": 0.26685, + "grad_norm": 0.050843335688114166, + "learning_rate": 1.3698513109852496e-05, + "loss": 0.0321, + "step": 133370 + }, + { + "epoch": 0.2669, + "grad_norm": 0.0695711225271225, + "learning_rate": 1.369482607710711e-05, + "loss": 0.0336, + "step": 133380 + }, + { + "epoch": 0.26695, + "grad_norm": 0.05299427732825279, + "learning_rate": 1.369113935344093e-05, + "loss": 0.0322, + "step": 133390 + }, + { + "epoch": 0.267, + "grad_norm": 0.045001398772001266, + "learning_rate": 1.3687452938954746e-05, + "loss": 0.0318, + "step": 133400 + }, + { + "epoch": 0.26705, + "grad_norm": 0.052017249166965485, + "learning_rate": 1.3683766833749356e-05, + "loss": 0.0313, + "step": 133410 + }, + { + "epoch": 0.2671, + "grad_norm": 0.04774123430252075, + "learning_rate": 1.368008103792553e-05, + "loss": 0.0325, + "step": 133420 + }, + { + "epoch": 0.26715, + "grad_norm": 0.058756422251462936, + "learning_rate": 1.3676395551584023e-05, + "loss": 0.0353, + "step": 133430 + }, + { + "epoch": 0.2672, + "grad_norm": 0.08225872367620468, + "learning_rate": 1.3672710374825621e-05, + "loss": 0.033, + "step": 133440 + }, + { + "epoch": 0.26725, + "grad_norm": 0.07365263253450394, + "learning_rate": 1.3669025507751048e-05, + "loss": 0.0322, + "step": 133450 + }, + { + "epoch": 0.2673, + "grad_norm": 0.0715201273560524, + "learning_rate": 1.3665340950461083e-05, + "loss": 0.0325, + "step": 133460 + }, + { + "epoch": 0.26735, + "grad_norm": 0.055179111659526825, + "learning_rate": 1.366165670305642e-05, + "loss": 0.0332, + "step": 133470 + }, + { + "epoch": 0.2674, + "grad_norm": 0.04986432567238808, + "learning_rate": 1.36579727656378e-05, + "loss": 0.0307, + "step": 133480 + }, + { + "epoch": 0.26745, + "grad_norm": 0.060405880212783813, + "learning_rate": 1.3654289138305956e-05, + "loss": 0.0328, + "step": 133490 + }, + { + "epoch": 0.2675, + "grad_norm": 0.06796519458293915, + "learning_rate": 1.3650605821161583e-05, + "loss": 0.0326, + "step": 133500 + }, + { + "epoch": 0.26755, + "grad_norm": 0.06855150312185287, + "learning_rate": 1.3646922814305385e-05, + "loss": 0.032, + "step": 133510 + }, + { + "epoch": 0.2676, + "grad_norm": 0.060563866049051285, + "learning_rate": 1.364324011783804e-05, + "loss": 0.0319, + "step": 133520 + }, + { + "epoch": 0.26765, + "grad_norm": 0.053877994418144226, + "learning_rate": 1.3639557731860252e-05, + "loss": 0.0319, + "step": 133530 + }, + { + "epoch": 0.2677, + "grad_norm": 0.05480059236288071, + "learning_rate": 1.3635875656472693e-05, + "loss": 0.0334, + "step": 133540 + }, + { + "epoch": 0.26775, + "grad_norm": 0.053848497569561005, + "learning_rate": 1.3632193891776013e-05, + "loss": 0.0324, + "step": 133550 + }, + { + "epoch": 0.2678, + "grad_norm": 0.06216348335146904, + "learning_rate": 1.3628512437870888e-05, + "loss": 0.0318, + "step": 133560 + }, + { + "epoch": 0.26785, + "grad_norm": 0.06599042564630508, + "learning_rate": 1.3624831294857954e-05, + "loss": 0.0348, + "step": 133570 + }, + { + "epoch": 0.2679, + "grad_norm": 0.07977837324142456, + "learning_rate": 1.3621150462837867e-05, + "loss": 0.0351, + "step": 133580 + }, + { + "epoch": 0.26795, + "grad_norm": 0.06804382801055908, + "learning_rate": 1.3617469941911254e-05, + "loss": 0.0328, + "step": 133590 + }, + { + "epoch": 0.268, + "grad_norm": 0.06853941828012466, + "learning_rate": 1.3613789732178722e-05, + "loss": 0.0342, + "step": 133600 + }, + { + "epoch": 0.26805, + "grad_norm": 0.06310584396123886, + "learning_rate": 1.3610109833740913e-05, + "loss": 0.0334, + "step": 133610 + }, + { + "epoch": 0.2681, + "grad_norm": 0.05592683330178261, + "learning_rate": 1.3606430246698427e-05, + "loss": 0.0345, + "step": 133620 + }, + { + "epoch": 0.26815, + "grad_norm": 0.0685850977897644, + "learning_rate": 1.360275097115185e-05, + "loss": 0.0333, + "step": 133630 + }, + { + "epoch": 0.2682, + "grad_norm": 0.060806307941675186, + "learning_rate": 1.3599072007201774e-05, + "loss": 0.0353, + "step": 133640 + }, + { + "epoch": 0.26825, + "grad_norm": 0.050386618822813034, + "learning_rate": 1.3595393354948797e-05, + "loss": 0.0344, + "step": 133650 + }, + { + "epoch": 0.2683, + "grad_norm": 0.051887303590774536, + "learning_rate": 1.3591715014493467e-05, + "loss": 0.0365, + "step": 133660 + }, + { + "epoch": 0.26835, + "grad_norm": 0.05312123894691467, + "learning_rate": 1.3588036985936375e-05, + "loss": 0.0343, + "step": 133670 + }, + { + "epoch": 0.2684, + "grad_norm": 0.07142849266529083, + "learning_rate": 1.3584359269378066e-05, + "loss": 0.0334, + "step": 133680 + }, + { + "epoch": 0.26845, + "grad_norm": 0.05248669534921646, + "learning_rate": 1.3580681864919076e-05, + "loss": 0.0338, + "step": 133690 + }, + { + "epoch": 0.2685, + "grad_norm": 0.06165587157011032, + "learning_rate": 1.3577004772659963e-05, + "loss": 0.0332, + "step": 133700 + }, + { + "epoch": 0.26855, + "grad_norm": 0.07028713077306747, + "learning_rate": 1.3573327992701245e-05, + "loss": 0.0347, + "step": 133710 + }, + { + "epoch": 0.2686, + "grad_norm": 0.07004103809595108, + "learning_rate": 1.356965152514345e-05, + "loss": 0.034, + "step": 133720 + }, + { + "epoch": 0.26865, + "grad_norm": 0.054050881415605545, + "learning_rate": 1.356597537008708e-05, + "loss": 0.0338, + "step": 133730 + }, + { + "epoch": 0.2687, + "grad_norm": 0.05609206482768059, + "learning_rate": 1.3562299527632643e-05, + "loss": 0.0325, + "step": 133740 + }, + { + "epoch": 0.26875, + "grad_norm": 0.049644213169813156, + "learning_rate": 1.3558623997880666e-05, + "loss": 0.0332, + "step": 133750 + }, + { + "epoch": 0.2688, + "grad_norm": 0.07129288464784622, + "learning_rate": 1.3554948780931586e-05, + "loss": 0.0326, + "step": 133760 + }, + { + "epoch": 0.26885, + "grad_norm": 0.048589419573545456, + "learning_rate": 1.3551273876885917e-05, + "loss": 0.0322, + "step": 133770 + }, + { + "epoch": 0.2689, + "grad_norm": 0.05963992327451706, + "learning_rate": 1.3547599285844109e-05, + "loss": 0.0331, + "step": 133780 + }, + { + "epoch": 0.26895, + "grad_norm": 0.07128456979990005, + "learning_rate": 1.3543925007906644e-05, + "loss": 0.0349, + "step": 133790 + }, + { + "epoch": 0.269, + "grad_norm": 0.05785830318927765, + "learning_rate": 1.3540251043173968e-05, + "loss": 0.0342, + "step": 133800 + }, + { + "epoch": 0.26905, + "grad_norm": 0.06798305362462997, + "learning_rate": 1.3536577391746511e-05, + "loss": 0.0322, + "step": 133810 + }, + { + "epoch": 0.2691, + "grad_norm": 0.0627092495560646, + "learning_rate": 1.3532904053724731e-05, + "loss": 0.0331, + "step": 133820 + }, + { + "epoch": 0.26915, + "grad_norm": 0.07346533983945847, + "learning_rate": 1.352923102920905e-05, + "loss": 0.0325, + "step": 133830 + }, + { + "epoch": 0.2692, + "grad_norm": 0.053694840520620346, + "learning_rate": 1.352555831829988e-05, + "loss": 0.0317, + "step": 133840 + }, + { + "epoch": 0.26925, + "grad_norm": 0.054381098598241806, + "learning_rate": 1.3521885921097624e-05, + "loss": 0.0327, + "step": 133850 + }, + { + "epoch": 0.2693, + "grad_norm": 0.06548306345939636, + "learning_rate": 1.3518213837702697e-05, + "loss": 0.0334, + "step": 133860 + }, + { + "epoch": 0.26935, + "grad_norm": 0.05894751101732254, + "learning_rate": 1.3514542068215496e-05, + "loss": 0.0337, + "step": 133870 + }, + { + "epoch": 0.2694, + "grad_norm": 0.05426390469074249, + "learning_rate": 1.3510870612736403e-05, + "loss": 0.0347, + "step": 133880 + }, + { + "epoch": 0.26945, + "grad_norm": 0.05677363649010658, + "learning_rate": 1.3507199471365794e-05, + "loss": 0.0342, + "step": 133890 + }, + { + "epoch": 0.2695, + "grad_norm": 0.055190473794937134, + "learning_rate": 1.3503528644204022e-05, + "loss": 0.0348, + "step": 133900 + }, + { + "epoch": 0.26955, + "grad_norm": 0.06938653439283371, + "learning_rate": 1.3499858131351467e-05, + "loss": 0.0359, + "step": 133910 + }, + { + "epoch": 0.2696, + "grad_norm": 0.06496905535459518, + "learning_rate": 1.3496187932908472e-05, + "loss": 0.0359, + "step": 133920 + }, + { + "epoch": 0.26965, + "grad_norm": 0.053804852068424225, + "learning_rate": 1.3492518048975365e-05, + "loss": 0.0338, + "step": 133930 + }, + { + "epoch": 0.2697, + "grad_norm": 0.050974227488040924, + "learning_rate": 1.34888484796525e-05, + "loss": 0.034, + "step": 133940 + }, + { + "epoch": 0.26975, + "grad_norm": 0.05593269690871239, + "learning_rate": 1.3485179225040185e-05, + "loss": 0.0342, + "step": 133950 + }, + { + "epoch": 0.2698, + "grad_norm": 0.04842517897486687, + "learning_rate": 1.3481510285238764e-05, + "loss": 0.0339, + "step": 133960 + }, + { + "epoch": 0.26985, + "grad_norm": 0.05278436839580536, + "learning_rate": 1.3477841660348506e-05, + "loss": 0.0353, + "step": 133970 + }, + { + "epoch": 0.2699, + "grad_norm": 0.06895551085472107, + "learning_rate": 1.3474173350469726e-05, + "loss": 0.0345, + "step": 133980 + }, + { + "epoch": 0.26995, + "grad_norm": 0.06069585680961609, + "learning_rate": 1.3470505355702729e-05, + "loss": 0.033, + "step": 133990 + }, + { + "epoch": 0.27, + "grad_norm": 0.08514784276485443, + "learning_rate": 1.3466837676147781e-05, + "loss": 0.0333, + "step": 134000 + }, + { + "epoch": 0.27005, + "grad_norm": 0.061267901211977005, + "learning_rate": 1.346317031190516e-05, + "loss": 0.0361, + "step": 134010 + }, + { + "epoch": 0.2701, + "grad_norm": 0.05103262886404991, + "learning_rate": 1.3459503263075118e-05, + "loss": 0.0346, + "step": 134020 + }, + { + "epoch": 0.27015, + "grad_norm": 0.044751379638910294, + "learning_rate": 1.3455836529757932e-05, + "loss": 0.0325, + "step": 134030 + }, + { + "epoch": 0.2702, + "grad_norm": 0.0550481379032135, + "learning_rate": 1.3452170112053835e-05, + "loss": 0.0333, + "step": 134040 + }, + { + "epoch": 0.27025, + "grad_norm": 0.05319865047931671, + "learning_rate": 1.3448504010063062e-05, + "loss": 0.0331, + "step": 134050 + }, + { + "epoch": 0.2703, + "grad_norm": 0.049767591059207916, + "learning_rate": 1.3444838223885859e-05, + "loss": 0.033, + "step": 134060 + }, + { + "epoch": 0.27035, + "grad_norm": 0.06488999724388123, + "learning_rate": 1.3441172753622428e-05, + "loss": 0.0331, + "step": 134070 + }, + { + "epoch": 0.2704, + "grad_norm": 0.05793136730790138, + "learning_rate": 1.3437507599373001e-05, + "loss": 0.0333, + "step": 134080 + }, + { + "epoch": 0.27045, + "grad_norm": 0.07698115706443787, + "learning_rate": 1.3433842761237774e-05, + "loss": 0.0335, + "step": 134090 + }, + { + "epoch": 0.2705, + "grad_norm": 0.08233670890331268, + "learning_rate": 1.343017823931694e-05, + "loss": 0.0333, + "step": 134100 + }, + { + "epoch": 0.27055, + "grad_norm": 0.08557845652103424, + "learning_rate": 1.3426514033710674e-05, + "loss": 0.0341, + "step": 134110 + }, + { + "epoch": 0.2706, + "grad_norm": 0.08992201834917068, + "learning_rate": 1.342285014451918e-05, + "loss": 0.0336, + "step": 134120 + }, + { + "epoch": 0.27065, + "grad_norm": 0.06437687575817108, + "learning_rate": 1.3419186571842612e-05, + "loss": 0.0345, + "step": 134130 + }, + { + "epoch": 0.2707, + "grad_norm": 0.06617096811532974, + "learning_rate": 1.3415523315781123e-05, + "loss": 0.034, + "step": 134140 + }, + { + "epoch": 0.27075, + "grad_norm": 0.061990637332201004, + "learning_rate": 1.3411860376434884e-05, + "loss": 0.0341, + "step": 134150 + }, + { + "epoch": 0.2708, + "grad_norm": 0.05145447701215744, + "learning_rate": 1.3408197753904018e-05, + "loss": 0.0337, + "step": 134160 + }, + { + "epoch": 0.27085, + "grad_norm": 0.05686485022306442, + "learning_rate": 1.3404535448288683e-05, + "loss": 0.033, + "step": 134170 + }, + { + "epoch": 0.2709, + "grad_norm": 0.06566409021615982, + "learning_rate": 1.3400873459688989e-05, + "loss": 0.0334, + "step": 134180 + }, + { + "epoch": 0.27095, + "grad_norm": 0.0592525377869606, + "learning_rate": 1.339721178820505e-05, + "loss": 0.0346, + "step": 134190 + }, + { + "epoch": 0.271, + "grad_norm": 0.0546598806977272, + "learning_rate": 1.3393550433936991e-05, + "loss": 0.0325, + "step": 134200 + }, + { + "epoch": 0.27105, + "grad_norm": 0.053363386541604996, + "learning_rate": 1.33898893969849e-05, + "loss": 0.0342, + "step": 134210 + }, + { + "epoch": 0.2711, + "grad_norm": 0.06242294982075691, + "learning_rate": 1.3386228677448876e-05, + "loss": 0.0353, + "step": 134220 + }, + { + "epoch": 0.27115, + "grad_norm": 0.05819866061210632, + "learning_rate": 1.3382568275428986e-05, + "loss": 0.0336, + "step": 134230 + }, + { + "epoch": 0.2712, + "grad_norm": 0.056766338646411896, + "learning_rate": 1.3378908191025313e-05, + "loss": 0.0331, + "step": 134240 + }, + { + "epoch": 0.27125, + "grad_norm": 0.06343390792608261, + "learning_rate": 1.3375248424337945e-05, + "loss": 0.0338, + "step": 134250 + }, + { + "epoch": 0.2713, + "grad_norm": 0.05885003134608269, + "learning_rate": 1.3371588975466898e-05, + "loss": 0.0322, + "step": 134260 + }, + { + "epoch": 0.27135, + "grad_norm": 0.0626772791147232, + "learning_rate": 1.3367929844512247e-05, + "loss": 0.0329, + "step": 134270 + }, + { + "epoch": 0.2714, + "grad_norm": 0.06084499880671501, + "learning_rate": 1.3364271031574016e-05, + "loss": 0.0375, + "step": 134280 + }, + { + "epoch": 0.27145, + "grad_norm": 0.05387156084179878, + "learning_rate": 1.3360612536752254e-05, + "loss": 0.0322, + "step": 134290 + }, + { + "epoch": 0.2715, + "grad_norm": 0.06268797814846039, + "learning_rate": 1.3356954360146973e-05, + "loss": 0.0332, + "step": 134300 + }, + { + "epoch": 0.27155, + "grad_norm": 0.07965473085641861, + "learning_rate": 1.3353296501858175e-05, + "loss": 0.0344, + "step": 134310 + }, + { + "epoch": 0.2716, + "grad_norm": 0.057444483041763306, + "learning_rate": 1.3349638961985888e-05, + "loss": 0.0324, + "step": 134320 + }, + { + "epoch": 0.27165, + "grad_norm": 0.07856761664152145, + "learning_rate": 1.3345981740630092e-05, + "loss": 0.0349, + "step": 134330 + }, + { + "epoch": 0.2717, + "grad_norm": 0.06308524310588837, + "learning_rate": 1.3342324837890777e-05, + "loss": 0.0336, + "step": 134340 + }, + { + "epoch": 0.27175, + "grad_norm": 0.05933582782745361, + "learning_rate": 1.3338668253867911e-05, + "loss": 0.0355, + "step": 134350 + }, + { + "epoch": 0.2718, + "grad_norm": 0.052174169570207596, + "learning_rate": 1.3335011988661486e-05, + "loss": 0.0338, + "step": 134360 + }, + { + "epoch": 0.27185, + "grad_norm": 0.06952715665102005, + "learning_rate": 1.333135604237144e-05, + "loss": 0.0346, + "step": 134370 + }, + { + "epoch": 0.2719, + "grad_norm": 0.06507941335439682, + "learning_rate": 1.3327700415097743e-05, + "loss": 0.0343, + "step": 134380 + }, + { + "epoch": 0.27195, + "grad_norm": 0.05496826767921448, + "learning_rate": 1.3324045106940337e-05, + "loss": 0.034, + "step": 134390 + }, + { + "epoch": 0.272, + "grad_norm": 0.06886152178049088, + "learning_rate": 1.3320390117999138e-05, + "loss": 0.035, + "step": 134400 + }, + { + "epoch": 0.27205, + "grad_norm": 0.051142431795597076, + "learning_rate": 1.3316735448374095e-05, + "loss": 0.0334, + "step": 134410 + }, + { + "epoch": 0.2721, + "grad_norm": 0.0529659204185009, + "learning_rate": 1.3313081098165118e-05, + "loss": 0.0336, + "step": 134420 + }, + { + "epoch": 0.27215, + "grad_norm": 0.05416841432452202, + "learning_rate": 1.3309427067472102e-05, + "loss": 0.0332, + "step": 134430 + }, + { + "epoch": 0.2722, + "grad_norm": 0.06381898373365402, + "learning_rate": 1.3305773356394969e-05, + "loss": 0.0341, + "step": 134440 + }, + { + "epoch": 0.27225, + "grad_norm": 0.0496458001434803, + "learning_rate": 1.3302119965033588e-05, + "loss": 0.0329, + "step": 134450 + }, + { + "epoch": 0.2723, + "grad_norm": 0.055828552693128586, + "learning_rate": 1.3298466893487871e-05, + "loss": 0.0345, + "step": 134460 + }, + { + "epoch": 0.27235, + "grad_norm": 0.060162320733070374, + "learning_rate": 1.3294814141857653e-05, + "loss": 0.0328, + "step": 134470 + }, + { + "epoch": 0.2724, + "grad_norm": 0.06394720822572708, + "learning_rate": 1.3291161710242833e-05, + "loss": 0.0328, + "step": 134480 + }, + { + "epoch": 0.27245, + "grad_norm": 0.05539621040225029, + "learning_rate": 1.3287509598743239e-05, + "loss": 0.0342, + "step": 134490 + }, + { + "epoch": 0.2725, + "grad_norm": 0.053097691386938095, + "learning_rate": 1.3283857807458744e-05, + "loss": 0.0348, + "step": 134500 + }, + { + "epoch": 0.27255, + "grad_norm": 0.04288434982299805, + "learning_rate": 1.3280206336489176e-05, + "loss": 0.0338, + "step": 134510 + }, + { + "epoch": 0.2726, + "grad_norm": 0.055250134319067, + "learning_rate": 1.3276555185934353e-05, + "loss": 0.0327, + "step": 134520 + }, + { + "epoch": 0.27265, + "grad_norm": 0.059500452131032944, + "learning_rate": 1.3272904355894117e-05, + "loss": 0.033, + "step": 134530 + }, + { + "epoch": 0.2727, + "grad_norm": 0.04469634220004082, + "learning_rate": 1.326925384646827e-05, + "loss": 0.0316, + "step": 134540 + }, + { + "epoch": 0.27275, + "grad_norm": 0.05874725431203842, + "learning_rate": 1.3265603657756615e-05, + "loss": 0.0323, + "step": 134550 + }, + { + "epoch": 0.2728, + "grad_norm": 0.05500799044966698, + "learning_rate": 1.3261953789858939e-05, + "loss": 0.0318, + "step": 134560 + }, + { + "epoch": 0.27285, + "grad_norm": 0.05438421666622162, + "learning_rate": 1.3258304242875036e-05, + "loss": 0.0337, + "step": 134570 + }, + { + "epoch": 0.2729, + "grad_norm": 0.060280703008174896, + "learning_rate": 1.3254655016904693e-05, + "loss": 0.0324, + "step": 134580 + }, + { + "epoch": 0.27295, + "grad_norm": 0.06587333232164383, + "learning_rate": 1.325100611204767e-05, + "loss": 0.0352, + "step": 134590 + }, + { + "epoch": 0.273, + "grad_norm": 0.06733546406030655, + "learning_rate": 1.3247357528403725e-05, + "loss": 0.0328, + "step": 134600 + }, + { + "epoch": 0.27305, + "grad_norm": 0.07355872541666031, + "learning_rate": 1.3243709266072601e-05, + "loss": 0.0329, + "step": 134610 + }, + { + "epoch": 0.2731, + "grad_norm": 0.11602780967950821, + "learning_rate": 1.324006132515406e-05, + "loss": 0.0361, + "step": 134620 + }, + { + "epoch": 0.27315, + "grad_norm": 0.09205681830644608, + "learning_rate": 1.3236413705747824e-05, + "loss": 0.0362, + "step": 134630 + }, + { + "epoch": 0.2732, + "grad_norm": 0.10276363044977188, + "learning_rate": 1.3232766407953607e-05, + "loss": 0.0344, + "step": 134640 + }, + { + "epoch": 0.27325, + "grad_norm": 0.07067776471376419, + "learning_rate": 1.3229119431871145e-05, + "loss": 0.0336, + "step": 134650 + }, + { + "epoch": 0.2733, + "grad_norm": 0.08137717097997665, + "learning_rate": 1.322547277760013e-05, + "loss": 0.04, + "step": 134660 + }, + { + "epoch": 0.27335, + "grad_norm": 0.09185691922903061, + "learning_rate": 1.3221826445240279e-05, + "loss": 0.0343, + "step": 134670 + }, + { + "epoch": 0.2734, + "grad_norm": 0.07215512543916702, + "learning_rate": 1.3218180434891252e-05, + "loss": 0.0359, + "step": 134680 + }, + { + "epoch": 0.27345, + "grad_norm": 0.06042398139834404, + "learning_rate": 1.3214534746652746e-05, + "loss": 0.0339, + "step": 134690 + }, + { + "epoch": 0.2735, + "grad_norm": 0.06867025047540665, + "learning_rate": 1.3210889380624442e-05, + "loss": 0.0366, + "step": 134700 + }, + { + "epoch": 0.27355, + "grad_norm": 0.07023248821496964, + "learning_rate": 1.3207244336905994e-05, + "loss": 0.0345, + "step": 134710 + }, + { + "epoch": 0.2736, + "grad_norm": 0.06673972308635712, + "learning_rate": 1.3203599615597052e-05, + "loss": 0.0356, + "step": 134720 + }, + { + "epoch": 0.27365, + "grad_norm": 0.0651073008775711, + "learning_rate": 1.3199955216797257e-05, + "loss": 0.0339, + "step": 134730 + }, + { + "epoch": 0.2737, + "grad_norm": 0.05058138817548752, + "learning_rate": 1.3196311140606266e-05, + "loss": 0.0343, + "step": 134740 + }, + { + "epoch": 0.27375, + "grad_norm": 0.051638465374708176, + "learning_rate": 1.319266738712369e-05, + "loss": 0.0334, + "step": 134750 + }, + { + "epoch": 0.2738, + "grad_norm": 0.06750901788473129, + "learning_rate": 1.3189023956449143e-05, + "loss": 0.0356, + "step": 134760 + }, + { + "epoch": 0.27385, + "grad_norm": 0.05553903803229332, + "learning_rate": 1.3185380848682255e-05, + "loss": 0.0339, + "step": 134770 + }, + { + "epoch": 0.2739, + "grad_norm": 0.0543627068400383, + "learning_rate": 1.3181738063922605e-05, + "loss": 0.0332, + "step": 134780 + }, + { + "epoch": 0.27395, + "grad_norm": 0.05075423792004585, + "learning_rate": 1.3178095602269807e-05, + "loss": 0.0334, + "step": 134790 + }, + { + "epoch": 0.274, + "grad_norm": 0.06586143374443054, + "learning_rate": 1.3174453463823433e-05, + "loss": 0.0337, + "step": 134800 + }, + { + "epoch": 0.27405, + "grad_norm": 0.05968514084815979, + "learning_rate": 1.3170811648683052e-05, + "loss": 0.0337, + "step": 134810 + }, + { + "epoch": 0.2741, + "grad_norm": 0.0509415827691555, + "learning_rate": 1.3167170156948242e-05, + "loss": 0.0335, + "step": 134820 + }, + { + "epoch": 0.27415, + "grad_norm": 0.05163281783461571, + "learning_rate": 1.3163528988718554e-05, + "loss": 0.0338, + "step": 134830 + }, + { + "epoch": 0.2742, + "grad_norm": 0.06403462588787079, + "learning_rate": 1.3159888144093541e-05, + "loss": 0.0348, + "step": 134840 + }, + { + "epoch": 0.27425, + "grad_norm": 0.08456991612911224, + "learning_rate": 1.3156247623172727e-05, + "loss": 0.035, + "step": 134850 + }, + { + "epoch": 0.2743, + "grad_norm": 0.06916992366313934, + "learning_rate": 1.3152607426055662e-05, + "loss": 0.0343, + "step": 134860 + }, + { + "epoch": 0.27435, + "grad_norm": 0.059461891651153564, + "learning_rate": 1.314896755284185e-05, + "loss": 0.0329, + "step": 134870 + }, + { + "epoch": 0.2744, + "grad_norm": 0.05513419583439827, + "learning_rate": 1.3145328003630821e-05, + "loss": 0.0362, + "step": 134880 + }, + { + "epoch": 0.27445, + "grad_norm": 0.05778443440794945, + "learning_rate": 1.3141688778522072e-05, + "loss": 0.0338, + "step": 134890 + }, + { + "epoch": 0.2745, + "grad_norm": 0.046256016939878464, + "learning_rate": 1.3138049877615088e-05, + "loss": 0.0349, + "step": 134900 + }, + { + "epoch": 0.27455, + "grad_norm": 0.05146146938204765, + "learning_rate": 1.3134411301009374e-05, + "loss": 0.0335, + "step": 134910 + }, + { + "epoch": 0.2746, + "grad_norm": 0.06435507535934448, + "learning_rate": 1.3130773048804396e-05, + "loss": 0.0357, + "step": 134920 + }, + { + "epoch": 0.27465, + "grad_norm": 0.06181373447179794, + "learning_rate": 1.3127135121099624e-05, + "loss": 0.0351, + "step": 134930 + }, + { + "epoch": 0.2747, + "grad_norm": 0.05738835781812668, + "learning_rate": 1.3123497517994509e-05, + "loss": 0.0323, + "step": 134940 + }, + { + "epoch": 0.27475, + "grad_norm": 0.053311824798583984, + "learning_rate": 1.3119860239588507e-05, + "loss": 0.0341, + "step": 134950 + }, + { + "epoch": 0.2748, + "grad_norm": 0.04415374621748924, + "learning_rate": 1.3116223285981086e-05, + "loss": 0.0331, + "step": 134960 + }, + { + "epoch": 0.27485, + "grad_norm": 0.050012920051813126, + "learning_rate": 1.3112586657271633e-05, + "loss": 0.0329, + "step": 134970 + }, + { + "epoch": 0.2749, + "grad_norm": 0.061650365591049194, + "learning_rate": 1.3108950353559607e-05, + "loss": 0.0346, + "step": 134980 + }, + { + "epoch": 0.27495, + "grad_norm": 0.07193632423877716, + "learning_rate": 1.3105314374944399e-05, + "loss": 0.0337, + "step": 134990 + }, + { + "epoch": 0.275, + "grad_norm": 0.05671504884958267, + "learning_rate": 1.310167872152544e-05, + "loss": 0.0343, + "step": 135000 + }, + { + "epoch": 0.27505, + "grad_norm": 0.06242790445685387, + "learning_rate": 1.3098043393402114e-05, + "loss": 0.0332, + "step": 135010 + }, + { + "epoch": 0.2751, + "grad_norm": 0.06047506630420685, + "learning_rate": 1.30944083906738e-05, + "loss": 0.0327, + "step": 135020 + }, + { + "epoch": 0.27515, + "grad_norm": 0.06618639826774597, + "learning_rate": 1.3090773713439896e-05, + "loss": 0.0341, + "step": 135030 + }, + { + "epoch": 0.2752, + "grad_norm": 0.055529411882162094, + "learning_rate": 1.3087139361799766e-05, + "loss": 0.0335, + "step": 135040 + }, + { + "epoch": 0.27525, + "grad_norm": 0.05129161477088928, + "learning_rate": 1.3083505335852771e-05, + "loss": 0.0351, + "step": 135050 + }, + { + "epoch": 0.2753, + "grad_norm": 0.05257735773921013, + "learning_rate": 1.3079871635698255e-05, + "loss": 0.0334, + "step": 135060 + }, + { + "epoch": 0.27535, + "grad_norm": 0.057310450822114944, + "learning_rate": 1.307623826143557e-05, + "loss": 0.0341, + "step": 135070 + }, + { + "epoch": 0.2754, + "grad_norm": 0.06880602985620499, + "learning_rate": 1.3072605213164057e-05, + "loss": 0.035, + "step": 135080 + }, + { + "epoch": 0.27545, + "grad_norm": 0.07616132497787476, + "learning_rate": 1.3068972490983039e-05, + "loss": 0.034, + "step": 135090 + }, + { + "epoch": 0.2755, + "grad_norm": 0.06144719943404198, + "learning_rate": 1.3065340094991832e-05, + "loss": 0.0333, + "step": 135100 + }, + { + "epoch": 0.27555, + "grad_norm": 0.05728575214743614, + "learning_rate": 1.3061708025289731e-05, + "loss": 0.0336, + "step": 135110 + }, + { + "epoch": 0.2756, + "grad_norm": 0.07311052083969116, + "learning_rate": 1.3058076281976059e-05, + "loss": 0.033, + "step": 135120 + }, + { + "epoch": 0.27565, + "grad_norm": 0.06681732088327408, + "learning_rate": 1.3054444865150095e-05, + "loss": 0.0325, + "step": 135130 + }, + { + "epoch": 0.2757, + "grad_norm": 0.06032009422779083, + "learning_rate": 1.3050813774911112e-05, + "loss": 0.0312, + "step": 135140 + }, + { + "epoch": 0.27575, + "grad_norm": 0.047618068754673004, + "learning_rate": 1.30471830113584e-05, + "loss": 0.0314, + "step": 135150 + }, + { + "epoch": 0.2758, + "grad_norm": 0.05425645038485527, + "learning_rate": 1.3043552574591204e-05, + "loss": 0.0337, + "step": 135160 + }, + { + "epoch": 0.27585, + "grad_norm": 0.054841794073581696, + "learning_rate": 1.3039922464708806e-05, + "loss": 0.0322, + "step": 135170 + }, + { + "epoch": 0.2759, + "grad_norm": 0.06096262484788895, + "learning_rate": 1.303629268181042e-05, + "loss": 0.0321, + "step": 135180 + }, + { + "epoch": 0.27595, + "grad_norm": 0.05185467004776001, + "learning_rate": 1.3032663225995292e-05, + "loss": 0.0325, + "step": 135190 + }, + { + "epoch": 0.276, + "grad_norm": 0.05274657532572746, + "learning_rate": 1.302903409736267e-05, + "loss": 0.0326, + "step": 135200 + }, + { + "epoch": 0.27605, + "grad_norm": 0.0590011291205883, + "learning_rate": 1.3025405296011756e-05, + "loss": 0.0318, + "step": 135210 + }, + { + "epoch": 0.2761, + "grad_norm": 0.05683165043592453, + "learning_rate": 1.3021776822041764e-05, + "loss": 0.0324, + "step": 135220 + }, + { + "epoch": 0.27615, + "grad_norm": 0.06452971696853638, + "learning_rate": 1.3018148675551884e-05, + "loss": 0.0325, + "step": 135230 + }, + { + "epoch": 0.2762, + "grad_norm": 0.058156803250312805, + "learning_rate": 1.3014520856641327e-05, + "loss": 0.0322, + "step": 135240 + }, + { + "epoch": 0.27625, + "grad_norm": 0.05621333420276642, + "learning_rate": 1.3010893365409265e-05, + "loss": 0.0334, + "step": 135250 + }, + { + "epoch": 0.2763, + "grad_norm": 0.051411014050245285, + "learning_rate": 1.3007266201954866e-05, + "loss": 0.0326, + "step": 135260 + }, + { + "epoch": 0.27635, + "grad_norm": 0.04615700989961624, + "learning_rate": 1.3003639366377312e-05, + "loss": 0.0341, + "step": 135270 + }, + { + "epoch": 0.2764, + "grad_norm": 0.0606042705476284, + "learning_rate": 1.3000012858775745e-05, + "loss": 0.0358, + "step": 135280 + }, + { + "epoch": 0.27645, + "grad_norm": 0.07736486196517944, + "learning_rate": 1.2996386679249325e-05, + "loss": 0.0347, + "step": 135290 + }, + { + "epoch": 0.2765, + "grad_norm": 0.04653066769242287, + "learning_rate": 1.2992760827897183e-05, + "loss": 0.0337, + "step": 135300 + }, + { + "epoch": 0.27655, + "grad_norm": 0.0559215284883976, + "learning_rate": 1.298913530481845e-05, + "loss": 0.0337, + "step": 135310 + }, + { + "epoch": 0.2766, + "grad_norm": 0.06434296071529388, + "learning_rate": 1.2985510110112237e-05, + "loss": 0.0349, + "step": 135320 + }, + { + "epoch": 0.27665, + "grad_norm": 0.04896357282996178, + "learning_rate": 1.2981885243877673e-05, + "loss": 0.0355, + "step": 135330 + }, + { + "epoch": 0.2767, + "grad_norm": 0.05654190108180046, + "learning_rate": 1.2978260706213852e-05, + "loss": 0.0342, + "step": 135340 + }, + { + "epoch": 0.27675, + "grad_norm": 0.05086608976125717, + "learning_rate": 1.2974636497219856e-05, + "loss": 0.0333, + "step": 135350 + }, + { + "epoch": 0.2768, + "grad_norm": 0.05549168214201927, + "learning_rate": 1.2971012616994794e-05, + "loss": 0.0345, + "step": 135360 + }, + { + "epoch": 0.27685, + "grad_norm": 0.051113519817590714, + "learning_rate": 1.2967389065637713e-05, + "loss": 0.0333, + "step": 135370 + }, + { + "epoch": 0.2769, + "grad_norm": 0.04853764921426773, + "learning_rate": 1.2963765843247705e-05, + "loss": 0.0335, + "step": 135380 + }, + { + "epoch": 0.27695, + "grad_norm": 0.05270055681467056, + "learning_rate": 1.296014294992382e-05, + "loss": 0.0354, + "step": 135390 + }, + { + "epoch": 0.277, + "grad_norm": 0.056161247193813324, + "learning_rate": 1.2956520385765091e-05, + "loss": 0.034, + "step": 135400 + }, + { + "epoch": 0.27705, + "grad_norm": 0.05111818388104439, + "learning_rate": 1.295289815087058e-05, + "loss": 0.0341, + "step": 135410 + }, + { + "epoch": 0.2771, + "grad_norm": 0.06116397678852081, + "learning_rate": 1.294927624533931e-05, + "loss": 0.0353, + "step": 135420 + }, + { + "epoch": 0.27715, + "grad_norm": 0.06604544073343277, + "learning_rate": 1.29456546692703e-05, + "loss": 0.0326, + "step": 135430 + }, + { + "epoch": 0.2772, + "grad_norm": 0.06811371445655823, + "learning_rate": 1.2942033422762551e-05, + "loss": 0.032, + "step": 135440 + }, + { + "epoch": 0.27725, + "grad_norm": 0.06051434203982353, + "learning_rate": 1.2938412505915079e-05, + "loss": 0.0315, + "step": 135450 + }, + { + "epoch": 0.2773, + "grad_norm": 0.05439218506217003, + "learning_rate": 1.2934791918826896e-05, + "loss": 0.0318, + "step": 135460 + }, + { + "epoch": 0.27735, + "grad_norm": 0.05119166523218155, + "learning_rate": 1.2931171661596952e-05, + "loss": 0.0331, + "step": 135470 + }, + { + "epoch": 0.2774, + "grad_norm": 0.04811709374189377, + "learning_rate": 1.292755173432425e-05, + "loss": 0.0326, + "step": 135480 + }, + { + "epoch": 0.27745, + "grad_norm": 0.06304600089788437, + "learning_rate": 1.2923932137107737e-05, + "loss": 0.0333, + "step": 135490 + }, + { + "epoch": 0.2775, + "grad_norm": 0.06395766139030457, + "learning_rate": 1.2920312870046394e-05, + "loss": 0.0323, + "step": 135500 + }, + { + "epoch": 0.27755, + "grad_norm": 0.06287093460559845, + "learning_rate": 1.2916693933239157e-05, + "loss": 0.0332, + "step": 135510 + }, + { + "epoch": 0.2776, + "grad_norm": 0.05557131767272949, + "learning_rate": 1.2913075326784962e-05, + "loss": 0.0347, + "step": 135520 + }, + { + "epoch": 0.27765, + "grad_norm": 0.05720607191324234, + "learning_rate": 1.2909457050782752e-05, + "loss": 0.0327, + "step": 135530 + }, + { + "epoch": 0.2777, + "grad_norm": 0.061198148876428604, + "learning_rate": 1.2905839105331447e-05, + "loss": 0.0343, + "step": 135540 + }, + { + "epoch": 0.27775, + "grad_norm": 0.059358034282922745, + "learning_rate": 1.2902221490529959e-05, + "loss": 0.0336, + "step": 135550 + }, + { + "epoch": 0.2778, + "grad_norm": 0.06045711785554886, + "learning_rate": 1.2898604206477178e-05, + "loss": 0.0323, + "step": 135560 + }, + { + "epoch": 0.27785, + "grad_norm": 0.12279672920703888, + "learning_rate": 1.2894987253272023e-05, + "loss": 0.0349, + "step": 135570 + }, + { + "epoch": 0.2779, + "grad_norm": 0.09237831830978394, + "learning_rate": 1.2891370631013355e-05, + "loss": 0.0327, + "step": 135580 + }, + { + "epoch": 0.27795, + "grad_norm": 0.06324107199907303, + "learning_rate": 1.2887754339800079e-05, + "loss": 0.0338, + "step": 135590 + }, + { + "epoch": 0.278, + "grad_norm": 0.0558076836168766, + "learning_rate": 1.2884138379731048e-05, + "loss": 0.0331, + "step": 135600 + }, + { + "epoch": 0.27805, + "grad_norm": 0.06076362729072571, + "learning_rate": 1.2880522750905111e-05, + "loss": 0.0327, + "step": 135610 + }, + { + "epoch": 0.2781, + "grad_norm": 0.050440818071365356, + "learning_rate": 1.2876907453421139e-05, + "loss": 0.0328, + "step": 135620 + }, + { + "epoch": 0.27815, + "grad_norm": 0.06723838299512863, + "learning_rate": 1.2873292487377964e-05, + "loss": 0.0335, + "step": 135630 + }, + { + "epoch": 0.2782, + "grad_norm": 0.055715836584568024, + "learning_rate": 1.2869677852874407e-05, + "loss": 0.0337, + "step": 135640 + }, + { + "epoch": 0.27825, + "grad_norm": 0.05138610675930977, + "learning_rate": 1.286606355000931e-05, + "loss": 0.0327, + "step": 135650 + }, + { + "epoch": 0.2783, + "grad_norm": 0.04878625646233559, + "learning_rate": 1.2862449578881466e-05, + "loss": 0.0315, + "step": 135660 + }, + { + "epoch": 0.27835, + "grad_norm": 0.06936074793338776, + "learning_rate": 1.2858835939589712e-05, + "loss": 0.0337, + "step": 135670 + }, + { + "epoch": 0.2784, + "grad_norm": 0.059619609266519547, + "learning_rate": 1.28552226322328e-05, + "loss": 0.0332, + "step": 135680 + }, + { + "epoch": 0.27845, + "grad_norm": 0.051202233880758286, + "learning_rate": 1.2851609656909552e-05, + "loss": 0.0323, + "step": 135690 + }, + { + "epoch": 0.2785, + "grad_norm": 0.048660289496183395, + "learning_rate": 1.2847997013718722e-05, + "loss": 0.0337, + "step": 135700 + }, + { + "epoch": 0.27855, + "grad_norm": 0.06189000606536865, + "learning_rate": 1.2844384702759094e-05, + "loss": 0.0336, + "step": 135710 + }, + { + "epoch": 0.2786, + "grad_norm": 0.06334463506937027, + "learning_rate": 1.2840772724129425e-05, + "loss": 0.034, + "step": 135720 + }, + { + "epoch": 0.27865, + "grad_norm": 0.057740218937397, + "learning_rate": 1.283716107792845e-05, + "loss": 0.0327, + "step": 135730 + }, + { + "epoch": 0.2787, + "grad_norm": 0.05583374947309494, + "learning_rate": 1.2833549764254932e-05, + "loss": 0.0328, + "step": 135740 + }, + { + "epoch": 0.27875, + "grad_norm": 0.053421925753355026, + "learning_rate": 1.2829938783207593e-05, + "loss": 0.0331, + "step": 135750 + }, + { + "epoch": 0.2788, + "grad_norm": 0.05022544041275978, + "learning_rate": 1.2826328134885156e-05, + "loss": 0.0332, + "step": 135760 + }, + { + "epoch": 0.27885, + "grad_norm": 0.06130515784025192, + "learning_rate": 1.2822717819386324e-05, + "loss": 0.0336, + "step": 135770 + }, + { + "epoch": 0.2789, + "grad_norm": 0.055185750126838684, + "learning_rate": 1.2819107836809813e-05, + "loss": 0.033, + "step": 135780 + }, + { + "epoch": 0.27895, + "grad_norm": 0.05438302829861641, + "learning_rate": 1.2815498187254327e-05, + "loss": 0.0342, + "step": 135790 + }, + { + "epoch": 0.279, + "grad_norm": 0.05572971701622009, + "learning_rate": 1.2811888870818543e-05, + "loss": 0.0335, + "step": 135800 + }, + { + "epoch": 0.27905, + "grad_norm": 0.05455252528190613, + "learning_rate": 1.2808279887601138e-05, + "loss": 0.0331, + "step": 135810 + }, + { + "epoch": 0.2791, + "grad_norm": 0.06497219204902649, + "learning_rate": 1.280467123770077e-05, + "loss": 0.0332, + "step": 135820 + }, + { + "epoch": 0.27915, + "grad_norm": 0.05847322940826416, + "learning_rate": 1.2801062921216111e-05, + "loss": 0.0338, + "step": 135830 + }, + { + "epoch": 0.2792, + "grad_norm": 0.06104997545480728, + "learning_rate": 1.2797454938245826e-05, + "loss": 0.0348, + "step": 135840 + }, + { + "epoch": 0.27925, + "grad_norm": 0.055407196283340454, + "learning_rate": 1.2793847288888521e-05, + "loss": 0.032, + "step": 135850 + }, + { + "epoch": 0.2793, + "grad_norm": 0.061831653118133545, + "learning_rate": 1.2790239973242854e-05, + "loss": 0.034, + "step": 135860 + }, + { + "epoch": 0.27935, + "grad_norm": 0.05441982299089432, + "learning_rate": 1.278663299140743e-05, + "loss": 0.0317, + "step": 135870 + }, + { + "epoch": 0.2794, + "grad_norm": 0.05456315353512764, + "learning_rate": 1.2783026343480892e-05, + "loss": 0.0326, + "step": 135880 + }, + { + "epoch": 0.27945, + "grad_norm": 0.07653749734163284, + "learning_rate": 1.27794200295618e-05, + "loss": 0.0341, + "step": 135890 + }, + { + "epoch": 0.2795, + "grad_norm": 0.0711459368467331, + "learning_rate": 1.277581404974878e-05, + "loss": 0.0333, + "step": 135900 + }, + { + "epoch": 0.27955, + "grad_norm": 0.06660649180412292, + "learning_rate": 1.2772208404140418e-05, + "loss": 0.0335, + "step": 135910 + }, + { + "epoch": 0.2796, + "grad_norm": 0.06644999980926514, + "learning_rate": 1.2768603092835285e-05, + "loss": 0.034, + "step": 135920 + }, + { + "epoch": 0.27965, + "grad_norm": 0.06661864370107651, + "learning_rate": 1.2764998115931948e-05, + "loss": 0.0347, + "step": 135930 + }, + { + "epoch": 0.2797, + "grad_norm": 0.05995258688926697, + "learning_rate": 1.2761393473528955e-05, + "loss": 0.0339, + "step": 135940 + }, + { + "epoch": 0.27975, + "grad_norm": 0.054902464151382446, + "learning_rate": 1.275778916572488e-05, + "loss": 0.0334, + "step": 135950 + }, + { + "epoch": 0.2798, + "grad_norm": 0.0524166114628315, + "learning_rate": 1.2754185192618238e-05, + "loss": 0.0341, + "step": 135960 + }, + { + "epoch": 0.27985, + "grad_norm": 0.06843091547489166, + "learning_rate": 1.275058155430758e-05, + "loss": 0.0378, + "step": 135970 + }, + { + "epoch": 0.2799, + "grad_norm": 0.06823035329580307, + "learning_rate": 1.2746978250891423e-05, + "loss": 0.036, + "step": 135980 + }, + { + "epoch": 0.27995, + "grad_norm": 0.05720892921090126, + "learning_rate": 1.2743375282468267e-05, + "loss": 0.0338, + "step": 135990 + }, + { + "epoch": 0.28, + "grad_norm": 0.06375530362129211, + "learning_rate": 1.2739772649136636e-05, + "loss": 0.0376, + "step": 136000 + }, + { + "epoch": 0.28005, + "grad_norm": 0.06734709441661835, + "learning_rate": 1.2736170350995013e-05, + "loss": 0.0342, + "step": 136010 + }, + { + "epoch": 0.2801, + "grad_norm": 0.07103891670703888, + "learning_rate": 1.273256838814188e-05, + "loss": 0.0341, + "step": 136020 + }, + { + "epoch": 0.28015, + "grad_norm": 0.07042978703975677, + "learning_rate": 1.2728966760675726e-05, + "loss": 0.0342, + "step": 136030 + }, + { + "epoch": 0.2802, + "grad_norm": 0.059580136090517044, + "learning_rate": 1.2725365468695e-05, + "loss": 0.0339, + "step": 136040 + }, + { + "epoch": 0.28025, + "grad_norm": 0.05698537081480026, + "learning_rate": 1.272176451229819e-05, + "loss": 0.035, + "step": 136050 + }, + { + "epoch": 0.2803, + "grad_norm": 0.05858423933386803, + "learning_rate": 1.2718163891583706e-05, + "loss": 0.0332, + "step": 136060 + }, + { + "epoch": 0.28035, + "grad_norm": 0.05761784315109253, + "learning_rate": 1.271456360665002e-05, + "loss": 0.035, + "step": 136070 + }, + { + "epoch": 0.2804, + "grad_norm": 0.09265638142824173, + "learning_rate": 1.2710963657595538e-05, + "loss": 0.0362, + "step": 136080 + }, + { + "epoch": 0.28045, + "grad_norm": 0.0590679869055748, + "learning_rate": 1.27073640445187e-05, + "loss": 0.0348, + "step": 136090 + }, + { + "epoch": 0.2805, + "grad_norm": 0.05699898675084114, + "learning_rate": 1.2703764767517914e-05, + "loss": 0.0337, + "step": 136100 + }, + { + "epoch": 0.28055, + "grad_norm": 0.07581443339586258, + "learning_rate": 1.2700165826691568e-05, + "loss": 0.0351, + "step": 136110 + }, + { + "epoch": 0.2806, + "grad_norm": 0.0731777548789978, + "learning_rate": 1.2696567222138078e-05, + "loss": 0.0336, + "step": 136120 + }, + { + "epoch": 0.28065, + "grad_norm": 0.08558888733386993, + "learning_rate": 1.269296895395582e-05, + "loss": 0.0341, + "step": 136130 + }, + { + "epoch": 0.2807, + "grad_norm": 0.07193329185247421, + "learning_rate": 1.2689371022243166e-05, + "loss": 0.0345, + "step": 136140 + }, + { + "epoch": 0.28075, + "grad_norm": 0.06691916286945343, + "learning_rate": 1.2685773427098474e-05, + "loss": 0.035, + "step": 136150 + }, + { + "epoch": 0.2808, + "grad_norm": 0.05623365938663483, + "learning_rate": 1.2682176168620107e-05, + "loss": 0.0335, + "step": 136160 + }, + { + "epoch": 0.28085, + "grad_norm": 0.05890646204352379, + "learning_rate": 1.267857924690643e-05, + "loss": 0.0332, + "step": 136170 + }, + { + "epoch": 0.2809, + "grad_norm": 0.053852204233407974, + "learning_rate": 1.2674982662055765e-05, + "loss": 0.0349, + "step": 136180 + }, + { + "epoch": 0.28095, + "grad_norm": 0.0638689175248146, + "learning_rate": 1.2671386414166445e-05, + "loss": 0.0339, + "step": 136190 + }, + { + "epoch": 0.281, + "grad_norm": 0.0775456428527832, + "learning_rate": 1.266779050333678e-05, + "loss": 0.0354, + "step": 136200 + }, + { + "epoch": 0.28105, + "grad_norm": 0.0630122721195221, + "learning_rate": 1.2664194929665096e-05, + "loss": 0.0343, + "step": 136210 + }, + { + "epoch": 0.2811, + "grad_norm": 0.0455157645046711, + "learning_rate": 1.2660599693249688e-05, + "loss": 0.0338, + "step": 136220 + }, + { + "epoch": 0.28115, + "grad_norm": 0.0488881878554821, + "learning_rate": 1.2657004794188842e-05, + "loss": 0.033, + "step": 136230 + }, + { + "epoch": 0.2812, + "grad_norm": 0.04977099597454071, + "learning_rate": 1.2653410232580857e-05, + "loss": 0.0338, + "step": 136240 + }, + { + "epoch": 0.28125, + "grad_norm": 0.07122388482093811, + "learning_rate": 1.2649816008523988e-05, + "loss": 0.0341, + "step": 136250 + }, + { + "epoch": 0.2813, + "grad_norm": 0.0568055734038353, + "learning_rate": 1.264622212211653e-05, + "loss": 0.0325, + "step": 136260 + }, + { + "epoch": 0.28135, + "grad_norm": 0.05512743070721626, + "learning_rate": 1.2642628573456694e-05, + "loss": 0.0346, + "step": 136270 + }, + { + "epoch": 0.2814, + "grad_norm": 0.07318541407585144, + "learning_rate": 1.2639035362642755e-05, + "loss": 0.0361, + "step": 136280 + }, + { + "epoch": 0.28145, + "grad_norm": 0.09015218913555145, + "learning_rate": 1.2635442489772954e-05, + "loss": 0.0336, + "step": 136290 + }, + { + "epoch": 0.2815, + "grad_norm": 0.060024112462997437, + "learning_rate": 1.263184995494551e-05, + "loss": 0.0334, + "step": 136300 + }, + { + "epoch": 0.28155, + "grad_norm": 0.05561055243015289, + "learning_rate": 1.2628257758258644e-05, + "loss": 0.0335, + "step": 136310 + }, + { + "epoch": 0.2816, + "grad_norm": 0.063319131731987, + "learning_rate": 1.2624665899810551e-05, + "loss": 0.0342, + "step": 136320 + }, + { + "epoch": 0.28165, + "grad_norm": 0.052624449133872986, + "learning_rate": 1.262107437969945e-05, + "loss": 0.0334, + "step": 136330 + }, + { + "epoch": 0.2817, + "grad_norm": 0.05468215420842171, + "learning_rate": 1.2617483198023527e-05, + "loss": 0.033, + "step": 136340 + }, + { + "epoch": 0.28175, + "grad_norm": 0.055670272558927536, + "learning_rate": 1.2613892354880955e-05, + "loss": 0.034, + "step": 136350 + }, + { + "epoch": 0.2818, + "grad_norm": 0.053894419223070145, + "learning_rate": 1.2610301850369921e-05, + "loss": 0.0342, + "step": 136360 + }, + { + "epoch": 0.28185, + "grad_norm": 0.06553252786397934, + "learning_rate": 1.2606711684588568e-05, + "loss": 0.0336, + "step": 136370 + }, + { + "epoch": 0.2819, + "grad_norm": 0.05735878273844719, + "learning_rate": 1.2603121857635073e-05, + "loss": 0.0347, + "step": 136380 + }, + { + "epoch": 0.28195, + "grad_norm": 0.04910058155655861, + "learning_rate": 1.2599532369607566e-05, + "loss": 0.0338, + "step": 136390 + }, + { + "epoch": 0.282, + "grad_norm": 0.06366072595119476, + "learning_rate": 1.2595943220604178e-05, + "loss": 0.0357, + "step": 136400 + }, + { + "epoch": 0.28205, + "grad_norm": 0.07096876204013824, + "learning_rate": 1.2592354410723053e-05, + "loss": 0.0349, + "step": 136410 + }, + { + "epoch": 0.2821, + "grad_norm": 0.0628928542137146, + "learning_rate": 1.2588765940062298e-05, + "loss": 0.0346, + "step": 136420 + }, + { + "epoch": 0.28215, + "grad_norm": 0.05668439716100693, + "learning_rate": 1.2585177808720017e-05, + "loss": 0.0348, + "step": 136430 + }, + { + "epoch": 0.2822, + "grad_norm": 0.06567050516605377, + "learning_rate": 1.2581590016794303e-05, + "loss": 0.0347, + "step": 136440 + }, + { + "epoch": 0.28225, + "grad_norm": 0.0673997700214386, + "learning_rate": 1.2578002564383263e-05, + "loss": 0.0358, + "step": 136450 + }, + { + "epoch": 0.2823, + "grad_norm": 0.06713567674160004, + "learning_rate": 1.2574415451584954e-05, + "loss": 0.0384, + "step": 136460 + }, + { + "epoch": 0.28235, + "grad_norm": 0.04711003229022026, + "learning_rate": 1.2570828678497465e-05, + "loss": 0.0342, + "step": 136470 + }, + { + "epoch": 0.2824, + "grad_norm": 0.05185955390334129, + "learning_rate": 1.2567242245218858e-05, + "loss": 0.0329, + "step": 136480 + }, + { + "epoch": 0.28245, + "grad_norm": 0.05066725239157677, + "learning_rate": 1.2563656151847162e-05, + "loss": 0.0338, + "step": 136490 + }, + { + "epoch": 0.2825, + "grad_norm": 0.05633728951215744, + "learning_rate": 1.2560070398480445e-05, + "loss": 0.0345, + "step": 136500 + }, + { + "epoch": 0.28255, + "grad_norm": 0.0669638141989708, + "learning_rate": 1.2556484985216732e-05, + "loss": 0.0343, + "step": 136510 + }, + { + "epoch": 0.2826, + "grad_norm": 0.05856877937912941, + "learning_rate": 1.2552899912154042e-05, + "loss": 0.0337, + "step": 136520 + }, + { + "epoch": 0.28265, + "grad_norm": 0.05611797422170639, + "learning_rate": 1.2549315179390387e-05, + "loss": 0.0344, + "step": 136530 + }, + { + "epoch": 0.2827, + "grad_norm": 0.06687197834253311, + "learning_rate": 1.2545730787023775e-05, + "loss": 0.0379, + "step": 136540 + }, + { + "epoch": 0.28275, + "grad_norm": 0.051813483238220215, + "learning_rate": 1.2542146735152222e-05, + "loss": 0.0333, + "step": 136550 + }, + { + "epoch": 0.2828, + "grad_norm": 0.0661899670958519, + "learning_rate": 1.2538563023873679e-05, + "loss": 0.0342, + "step": 136560 + }, + { + "epoch": 0.28285, + "grad_norm": 0.06310340762138367, + "learning_rate": 1.2534979653286153e-05, + "loss": 0.0328, + "step": 136570 + }, + { + "epoch": 0.2829, + "grad_norm": 0.0555579736828804, + "learning_rate": 1.253139662348759e-05, + "loss": 0.0343, + "step": 136580 + }, + { + "epoch": 0.28295, + "grad_norm": 0.05000456050038338, + "learning_rate": 1.2527813934575967e-05, + "loss": 0.033, + "step": 136590 + }, + { + "epoch": 0.283, + "grad_norm": 0.059967171400785446, + "learning_rate": 1.2524231586649227e-05, + "loss": 0.0331, + "step": 136600 + }, + { + "epoch": 0.28305, + "grad_norm": 0.05584190413355827, + "learning_rate": 1.2520649579805297e-05, + "loss": 0.0341, + "step": 136610 + }, + { + "epoch": 0.2831, + "grad_norm": 0.05312740430235863, + "learning_rate": 1.2517067914142128e-05, + "loss": 0.0333, + "step": 136620 + }, + { + "epoch": 0.28315, + "grad_norm": 0.05799239128828049, + "learning_rate": 1.2513486589757636e-05, + "loss": 0.0327, + "step": 136630 + }, + { + "epoch": 0.2832, + "grad_norm": 0.06061725690960884, + "learning_rate": 1.2509905606749728e-05, + "loss": 0.0332, + "step": 136640 + }, + { + "epoch": 0.28325, + "grad_norm": 0.06276709586381912, + "learning_rate": 1.2506324965216298e-05, + "loss": 0.0347, + "step": 136650 + }, + { + "epoch": 0.2833, + "grad_norm": 0.05817681550979614, + "learning_rate": 1.250274466525525e-05, + "loss": 0.0326, + "step": 136660 + }, + { + "epoch": 0.28335, + "grad_norm": 0.05032667890191078, + "learning_rate": 1.2499164706964481e-05, + "loss": 0.0327, + "step": 136670 + }, + { + "epoch": 0.2834, + "grad_norm": 0.05516530200839043, + "learning_rate": 1.249558509044185e-05, + "loss": 0.033, + "step": 136680 + }, + { + "epoch": 0.28345, + "grad_norm": 0.05928047373890877, + "learning_rate": 1.2492005815785225e-05, + "loss": 0.0359, + "step": 136690 + }, + { + "epoch": 0.2835, + "grad_norm": 0.05928775295615196, + "learning_rate": 1.2488426883092453e-05, + "loss": 0.0331, + "step": 136700 + }, + { + "epoch": 0.28355, + "grad_norm": 0.07499240338802338, + "learning_rate": 1.2484848292461396e-05, + "loss": 0.0354, + "step": 136710 + }, + { + "epoch": 0.2836, + "grad_norm": 0.048563044518232346, + "learning_rate": 1.2481270043989887e-05, + "loss": 0.034, + "step": 136720 + }, + { + "epoch": 0.28365, + "grad_norm": 0.051543883979320526, + "learning_rate": 1.2477692137775742e-05, + "loss": 0.033, + "step": 136730 + }, + { + "epoch": 0.2837, + "grad_norm": 0.05584711208939552, + "learning_rate": 1.24741145739168e-05, + "loss": 0.0364, + "step": 136740 + }, + { + "epoch": 0.28375, + "grad_norm": 0.060850005596876144, + "learning_rate": 1.2470537352510853e-05, + "loss": 0.0353, + "step": 136750 + }, + { + "epoch": 0.2838, + "grad_norm": 0.058700792491436005, + "learning_rate": 1.2466960473655723e-05, + "loss": 0.0323, + "step": 136760 + }, + { + "epoch": 0.28385, + "grad_norm": 0.0639333575963974, + "learning_rate": 1.2463383937449166e-05, + "loss": 0.0343, + "step": 136770 + }, + { + "epoch": 0.2839, + "grad_norm": 0.05942584574222565, + "learning_rate": 1.2459807743988993e-05, + "loss": 0.0338, + "step": 136780 + }, + { + "epoch": 0.28395, + "grad_norm": 0.05773216113448143, + "learning_rate": 1.2456231893372955e-05, + "loss": 0.0338, + "step": 136790 + }, + { + "epoch": 0.284, + "grad_norm": 0.05086157098412514, + "learning_rate": 1.2452656385698836e-05, + "loss": 0.0327, + "step": 136800 + }, + { + "epoch": 0.28405, + "grad_norm": 0.056690383702516556, + "learning_rate": 1.2449081221064377e-05, + "loss": 0.0331, + "step": 136810 + }, + { + "epoch": 0.2841, + "grad_norm": 0.06598389148712158, + "learning_rate": 1.2445506399567311e-05, + "loss": 0.0321, + "step": 136820 + }, + { + "epoch": 0.28415, + "grad_norm": 0.06285768747329712, + "learning_rate": 1.2441931921305394e-05, + "loss": 0.0323, + "step": 136830 + }, + { + "epoch": 0.2842, + "grad_norm": 0.05682032182812691, + "learning_rate": 1.243835778637634e-05, + "loss": 0.0315, + "step": 136840 + }, + { + "epoch": 0.28425, + "grad_norm": 0.062499918043613434, + "learning_rate": 1.2434783994877856e-05, + "loss": 0.0344, + "step": 136850 + }, + { + "epoch": 0.2843, + "grad_norm": 0.07428952306509018, + "learning_rate": 1.2431210546907666e-05, + "loss": 0.0324, + "step": 136860 + }, + { + "epoch": 0.28435, + "grad_norm": 0.06071348115801811, + "learning_rate": 1.2427637442563447e-05, + "loss": 0.0335, + "step": 136870 + }, + { + "epoch": 0.2844, + "grad_norm": 0.048171207308769226, + "learning_rate": 1.2424064681942909e-05, + "loss": 0.0313, + "step": 136880 + }, + { + "epoch": 0.28445, + "grad_norm": 0.056041669100522995, + "learning_rate": 1.2420492265143719e-05, + "loss": 0.034, + "step": 136890 + }, + { + "epoch": 0.2845, + "grad_norm": 0.0565711185336113, + "learning_rate": 1.2416920192263542e-05, + "loss": 0.0316, + "step": 136900 + }, + { + "epoch": 0.28455, + "grad_norm": 0.05370723828673363, + "learning_rate": 1.241334846340003e-05, + "loss": 0.0328, + "step": 136910 + }, + { + "epoch": 0.2846, + "grad_norm": 0.05528026446700096, + "learning_rate": 1.2409777078650853e-05, + "loss": 0.0311, + "step": 136920 + }, + { + "epoch": 0.28465, + "grad_norm": 0.06202398240566254, + "learning_rate": 1.2406206038113641e-05, + "loss": 0.0326, + "step": 136930 + }, + { + "epoch": 0.2847, + "grad_norm": 0.06248015537858009, + "learning_rate": 1.2402635341886016e-05, + "loss": 0.0326, + "step": 136940 + }, + { + "epoch": 0.28475, + "grad_norm": 0.052109282463788986, + "learning_rate": 1.2399064990065615e-05, + "loss": 0.0326, + "step": 136950 + }, + { + "epoch": 0.2848, + "grad_norm": 0.051801566034555435, + "learning_rate": 1.2395494982750037e-05, + "loss": 0.0324, + "step": 136960 + }, + { + "epoch": 0.28485, + "grad_norm": 0.07044409215450287, + "learning_rate": 1.2391925320036907e-05, + "loss": 0.0347, + "step": 136970 + }, + { + "epoch": 0.2849, + "grad_norm": 0.05475321784615517, + "learning_rate": 1.2388356002023785e-05, + "loss": 0.0332, + "step": 136980 + }, + { + "epoch": 0.28495, + "grad_norm": 0.06268125027418137, + "learning_rate": 1.238478702880827e-05, + "loss": 0.0346, + "step": 136990 + }, + { + "epoch": 0.285, + "grad_norm": 0.058081746101379395, + "learning_rate": 1.2381218400487949e-05, + "loss": 0.0325, + "step": 137000 + }, + { + "epoch": 0.28505, + "grad_norm": 0.05382139980792999, + "learning_rate": 1.2377650117160374e-05, + "loss": 0.0333, + "step": 137010 + }, + { + "epoch": 0.2851, + "grad_norm": 0.06895588338375092, + "learning_rate": 1.2374082178923108e-05, + "loss": 0.0329, + "step": 137020 + }, + { + "epoch": 0.28515, + "grad_norm": 0.05296281725168228, + "learning_rate": 1.2370514585873677e-05, + "loss": 0.0332, + "step": 137030 + }, + { + "epoch": 0.2852, + "grad_norm": 0.058857038617134094, + "learning_rate": 1.2366947338109635e-05, + "loss": 0.0327, + "step": 137040 + }, + { + "epoch": 0.28525, + "grad_norm": 0.05026755481958389, + "learning_rate": 1.2363380435728528e-05, + "loss": 0.0315, + "step": 137050 + }, + { + "epoch": 0.2853, + "grad_norm": 0.06389336287975311, + "learning_rate": 1.235981387882783e-05, + "loss": 0.0341, + "step": 137060 + }, + { + "epoch": 0.28535, + "grad_norm": 0.05075691267848015, + "learning_rate": 1.2356247667505084e-05, + "loss": 0.0324, + "step": 137070 + }, + { + "epoch": 0.2854, + "grad_norm": 0.05633467063307762, + "learning_rate": 1.2352681801857766e-05, + "loss": 0.0337, + "step": 137080 + }, + { + "epoch": 0.28545, + "grad_norm": 0.05485008284449577, + "learning_rate": 1.2349116281983383e-05, + "loss": 0.0343, + "step": 137090 + }, + { + "epoch": 0.2855, + "grad_norm": 0.06466303020715714, + "learning_rate": 1.2345551107979411e-05, + "loss": 0.0337, + "step": 137100 + }, + { + "epoch": 0.28555, + "grad_norm": 0.04850131645798683, + "learning_rate": 1.2341986279943308e-05, + "loss": 0.0337, + "step": 137110 + }, + { + "epoch": 0.2856, + "grad_norm": 0.05085092410445213, + "learning_rate": 1.2338421797972554e-05, + "loss": 0.0343, + "step": 137120 + }, + { + "epoch": 0.28565, + "grad_norm": 0.059733033180236816, + "learning_rate": 1.2334857662164593e-05, + "loss": 0.0335, + "step": 137130 + }, + { + "epoch": 0.2857, + "grad_norm": 0.05192878842353821, + "learning_rate": 1.2331293872616862e-05, + "loss": 0.0329, + "step": 137140 + }, + { + "epoch": 0.28575, + "grad_norm": 0.06230088695883751, + "learning_rate": 1.232773042942679e-05, + "loss": 0.0347, + "step": 137150 + }, + { + "epoch": 0.2858, + "grad_norm": 0.06391609460115433, + "learning_rate": 1.2324167332691817e-05, + "loss": 0.0352, + "step": 137160 + }, + { + "epoch": 0.28585, + "grad_norm": 0.06976404041051865, + "learning_rate": 1.2320604582509335e-05, + "loss": 0.0336, + "step": 137170 + }, + { + "epoch": 0.2859, + "grad_norm": 0.05778951942920685, + "learning_rate": 1.2317042178976773e-05, + "loss": 0.0332, + "step": 137180 + }, + { + "epoch": 0.28595, + "grad_norm": 0.05799651890993118, + "learning_rate": 1.2313480122191511e-05, + "loss": 0.034, + "step": 137190 + }, + { + "epoch": 0.286, + "grad_norm": 0.0600292906165123, + "learning_rate": 1.2309918412250927e-05, + "loss": 0.0352, + "step": 137200 + }, + { + "epoch": 0.28605, + "grad_norm": 0.0597301684319973, + "learning_rate": 1.2306357049252415e-05, + "loss": 0.0337, + "step": 137210 + }, + { + "epoch": 0.2861, + "grad_norm": 0.06079234182834625, + "learning_rate": 1.2302796033293334e-05, + "loss": 0.0339, + "step": 137220 + }, + { + "epoch": 0.28615, + "grad_norm": 0.06248587369918823, + "learning_rate": 1.229923536447103e-05, + "loss": 0.0337, + "step": 137230 + }, + { + "epoch": 0.2862, + "grad_norm": 0.05634044110774994, + "learning_rate": 1.2295675042882867e-05, + "loss": 0.0335, + "step": 137240 + }, + { + "epoch": 0.28625, + "grad_norm": 0.05484062433242798, + "learning_rate": 1.229211506862617e-05, + "loss": 0.0334, + "step": 137250 + }, + { + "epoch": 0.2863, + "grad_norm": 0.05344822630286217, + "learning_rate": 1.2288555441798289e-05, + "loss": 0.0349, + "step": 137260 + }, + { + "epoch": 0.28635, + "grad_norm": 0.055607929825782776, + "learning_rate": 1.2284996162496507e-05, + "loss": 0.0331, + "step": 137270 + }, + { + "epoch": 0.2864, + "grad_norm": 0.06643947213888168, + "learning_rate": 1.2281437230818166e-05, + "loss": 0.034, + "step": 137280 + }, + { + "epoch": 0.28645, + "grad_norm": 0.07382363826036453, + "learning_rate": 1.2277878646860542e-05, + "loss": 0.0349, + "step": 137290 + }, + { + "epoch": 0.2865, + "grad_norm": 0.08346658945083618, + "learning_rate": 1.2274320410720943e-05, + "loss": 0.0334, + "step": 137300 + }, + { + "epoch": 0.28655, + "grad_norm": 0.04902971163392067, + "learning_rate": 1.2270762522496645e-05, + "loss": 0.0328, + "step": 137310 + }, + { + "epoch": 0.2866, + "grad_norm": 0.07123240828514099, + "learning_rate": 1.2267204982284908e-05, + "loss": 0.0334, + "step": 137320 + }, + { + "epoch": 0.28665, + "grad_norm": 0.06636399775743484, + "learning_rate": 1.2263647790183014e-05, + "loss": 0.033, + "step": 137330 + }, + { + "epoch": 0.2867, + "grad_norm": 0.059328123927116394, + "learning_rate": 1.2260090946288203e-05, + "loss": 0.0338, + "step": 137340 + }, + { + "epoch": 0.28675, + "grad_norm": 0.04988183453679085, + "learning_rate": 1.225653445069772e-05, + "loss": 0.0336, + "step": 137350 + }, + { + "epoch": 0.2868, + "grad_norm": 0.06137142330408096, + "learning_rate": 1.2252978303508789e-05, + "loss": 0.0337, + "step": 137360 + }, + { + "epoch": 0.28685, + "grad_norm": 0.06146937236189842, + "learning_rate": 1.2249422504818642e-05, + "loss": 0.0348, + "step": 137370 + }, + { + "epoch": 0.2869, + "grad_norm": 0.056695591658353806, + "learning_rate": 1.2245867054724502e-05, + "loss": 0.0365, + "step": 137380 + }, + { + "epoch": 0.28695, + "grad_norm": 0.061989981681108475, + "learning_rate": 1.2242311953323566e-05, + "loss": 0.034, + "step": 137390 + }, + { + "epoch": 0.287, + "grad_norm": 0.059609219431877136, + "learning_rate": 1.2238757200713027e-05, + "loss": 0.033, + "step": 137400 + }, + { + "epoch": 0.28705, + "grad_norm": 0.05324563756585121, + "learning_rate": 1.2235202796990064e-05, + "loss": 0.0341, + "step": 137410 + }, + { + "epoch": 0.2871, + "grad_norm": 0.05062618479132652, + "learning_rate": 1.223164874225187e-05, + "loss": 0.0344, + "step": 137420 + }, + { + "epoch": 0.28715, + "grad_norm": 0.042948681861162186, + "learning_rate": 1.2228095036595601e-05, + "loss": 0.0329, + "step": 137430 + }, + { + "epoch": 0.2872, + "grad_norm": 0.05113916099071503, + "learning_rate": 1.2224541680118408e-05, + "loss": 0.033, + "step": 137440 + }, + { + "epoch": 0.28725, + "grad_norm": 0.07776989787817001, + "learning_rate": 1.2220988672917457e-05, + "loss": 0.0358, + "step": 137450 + }, + { + "epoch": 0.2873, + "grad_norm": 0.06589024513959885, + "learning_rate": 1.2217436015089864e-05, + "loss": 0.0342, + "step": 137460 + }, + { + "epoch": 0.28735, + "grad_norm": 0.06623056530952454, + "learning_rate": 1.2213883706732785e-05, + "loss": 0.0339, + "step": 137470 + }, + { + "epoch": 0.2874, + "grad_norm": 0.05212220549583435, + "learning_rate": 1.2210331747943305e-05, + "loss": 0.0335, + "step": 137480 + }, + { + "epoch": 0.28745, + "grad_norm": 0.0750085711479187, + "learning_rate": 1.220678013881855e-05, + "loss": 0.036, + "step": 137490 + }, + { + "epoch": 0.2875, + "grad_norm": 0.052169449627399445, + "learning_rate": 1.2203228879455627e-05, + "loss": 0.0335, + "step": 137500 + }, + { + "epoch": 0.28755, + "grad_norm": 0.05590628832578659, + "learning_rate": 1.2199677969951622e-05, + "loss": 0.0331, + "step": 137510 + }, + { + "epoch": 0.2876, + "grad_norm": 0.06221117451786995, + "learning_rate": 1.2196127410403613e-05, + "loss": 0.032, + "step": 137520 + }, + { + "epoch": 0.28765, + "grad_norm": 0.05918494984507561, + "learning_rate": 1.2192577200908659e-05, + "loss": 0.0332, + "step": 137530 + }, + { + "epoch": 0.2877, + "grad_norm": 0.048740535974502563, + "learning_rate": 1.218902734156384e-05, + "loss": 0.0336, + "step": 137540 + }, + { + "epoch": 0.28775, + "grad_norm": 0.05140310525894165, + "learning_rate": 1.2185477832466206e-05, + "loss": 0.0329, + "step": 137550 + }, + { + "epoch": 0.2878, + "grad_norm": 0.051502104848623276, + "learning_rate": 1.218192867371278e-05, + "loss": 0.0334, + "step": 137560 + }, + { + "epoch": 0.28785, + "grad_norm": 0.06754551827907562, + "learning_rate": 1.2178379865400622e-05, + "loss": 0.0344, + "step": 137570 + }, + { + "epoch": 0.2879, + "grad_norm": 0.05078601837158203, + "learning_rate": 1.2174831407626727e-05, + "loss": 0.0363, + "step": 137580 + }, + { + "epoch": 0.28795, + "grad_norm": 0.0703398659825325, + "learning_rate": 1.2171283300488136e-05, + "loss": 0.0344, + "step": 137590 + }, + { + "epoch": 0.288, + "grad_norm": 0.06634822487831116, + "learning_rate": 1.216773554408184e-05, + "loss": 0.0349, + "step": 137600 + }, + { + "epoch": 0.28805, + "grad_norm": 0.05775686725974083, + "learning_rate": 1.2164188138504823e-05, + "loss": 0.0326, + "step": 137610 + }, + { + "epoch": 0.2881, + "grad_norm": 0.057913362979888916, + "learning_rate": 1.216064108385409e-05, + "loss": 0.0339, + "step": 137620 + }, + { + "epoch": 0.28815, + "grad_norm": 0.05447633937001228, + "learning_rate": 1.2157094380226605e-05, + "loss": 0.0336, + "step": 137630 + }, + { + "epoch": 0.2882, + "grad_norm": 0.06147785484790802, + "learning_rate": 1.2153548027719336e-05, + "loss": 0.0338, + "step": 137640 + }, + { + "epoch": 0.28825, + "grad_norm": 0.0486493855714798, + "learning_rate": 1.215000202642923e-05, + "loss": 0.0334, + "step": 137650 + }, + { + "epoch": 0.2883, + "grad_norm": 0.04371405765414238, + "learning_rate": 1.2146456376453247e-05, + "loss": 0.0338, + "step": 137660 + }, + { + "epoch": 0.28835, + "grad_norm": 0.04846899211406708, + "learning_rate": 1.214291107788831e-05, + "loss": 0.0335, + "step": 137670 + }, + { + "epoch": 0.2884, + "grad_norm": 0.053997546434402466, + "learning_rate": 1.2139366130831364e-05, + "loss": 0.0348, + "step": 137680 + }, + { + "epoch": 0.28845, + "grad_norm": 0.060016434639692307, + "learning_rate": 1.2135821535379316e-05, + "loss": 0.0333, + "step": 137690 + }, + { + "epoch": 0.2885, + "grad_norm": 0.0512930192053318, + "learning_rate": 1.2132277291629066e-05, + "loss": 0.0325, + "step": 137700 + }, + { + "epoch": 0.28855, + "grad_norm": 0.047423042356967926, + "learning_rate": 1.2128733399677527e-05, + "loss": 0.0331, + "step": 137710 + }, + { + "epoch": 0.2886, + "grad_norm": 0.05766147002577782, + "learning_rate": 1.2125189859621583e-05, + "loss": 0.0347, + "step": 137720 + }, + { + "epoch": 0.28865, + "grad_norm": 0.07152627408504486, + "learning_rate": 1.2121646671558112e-05, + "loss": 0.0341, + "step": 137730 + }, + { + "epoch": 0.2887, + "grad_norm": 0.05913759395480156, + "learning_rate": 1.2118103835583974e-05, + "loss": 0.0325, + "step": 137740 + }, + { + "epoch": 0.28875, + "grad_norm": 0.09551774710416794, + "learning_rate": 1.2114561351796037e-05, + "loss": 0.0354, + "step": 137750 + }, + { + "epoch": 0.2888, + "grad_norm": 0.07812321931123734, + "learning_rate": 1.211101922029117e-05, + "loss": 0.035, + "step": 137760 + }, + { + "epoch": 0.28885, + "grad_norm": 0.07728440314531326, + "learning_rate": 1.2107477441166176e-05, + "loss": 0.0343, + "step": 137770 + }, + { + "epoch": 0.2889, + "grad_norm": 0.0633360967040062, + "learning_rate": 1.2103936014517917e-05, + "loss": 0.0351, + "step": 137780 + }, + { + "epoch": 0.28895, + "grad_norm": 0.05036721006035805, + "learning_rate": 1.210039494044319e-05, + "loss": 0.0332, + "step": 137790 + }, + { + "epoch": 0.289, + "grad_norm": 0.053151726722717285, + "learning_rate": 1.209685421903883e-05, + "loss": 0.0341, + "step": 137800 + }, + { + "epoch": 0.28905, + "grad_norm": 0.05260220915079117, + "learning_rate": 1.209331385040163e-05, + "loss": 0.0332, + "step": 137810 + }, + { + "epoch": 0.2891, + "grad_norm": 0.07106596976518631, + "learning_rate": 1.208977383462837e-05, + "loss": 0.0346, + "step": 137820 + }, + { + "epoch": 0.28915, + "grad_norm": 0.05681164190173149, + "learning_rate": 1.2086234171815852e-05, + "loss": 0.0331, + "step": 137830 + }, + { + "epoch": 0.2892, + "grad_norm": 0.06736712902784348, + "learning_rate": 1.2082694862060839e-05, + "loss": 0.0334, + "step": 137840 + }, + { + "epoch": 0.28925, + "grad_norm": 0.0499461404979229, + "learning_rate": 1.2079155905460099e-05, + "loss": 0.0329, + "step": 137850 + }, + { + "epoch": 0.2893, + "grad_norm": 0.04850183427333832, + "learning_rate": 1.207561730211037e-05, + "loss": 0.0334, + "step": 137860 + }, + { + "epoch": 0.28935, + "grad_norm": 0.06345546245574951, + "learning_rate": 1.207207905210841e-05, + "loss": 0.0332, + "step": 137870 + }, + { + "epoch": 0.2894, + "grad_norm": 0.05960962548851967, + "learning_rate": 1.206854115555096e-05, + "loss": 0.0353, + "step": 137880 + }, + { + "epoch": 0.28945, + "grad_norm": 0.05867240950465202, + "learning_rate": 1.206500361253474e-05, + "loss": 0.034, + "step": 137890 + }, + { + "epoch": 0.2895, + "grad_norm": 0.05325555056333542, + "learning_rate": 1.206146642315646e-05, + "loss": 0.0353, + "step": 137900 + }, + { + "epoch": 0.28955, + "grad_norm": 0.05943364277482033, + "learning_rate": 1.2057929587512814e-05, + "loss": 0.0336, + "step": 137910 + }, + { + "epoch": 0.2896, + "grad_norm": 0.05827326700091362, + "learning_rate": 1.2054393105700523e-05, + "loss": 0.0331, + "step": 137920 + }, + { + "epoch": 0.28965, + "grad_norm": 0.046118587255477905, + "learning_rate": 1.2050856977816264e-05, + "loss": 0.0334, + "step": 137930 + }, + { + "epoch": 0.2897, + "grad_norm": 0.04549780115485191, + "learning_rate": 1.2047321203956699e-05, + "loss": 0.0326, + "step": 137940 + }, + { + "epoch": 0.28975, + "grad_norm": 0.0508422777056694, + "learning_rate": 1.2043785784218514e-05, + "loss": 0.033, + "step": 137950 + }, + { + "epoch": 0.2898, + "grad_norm": 0.07315465062856674, + "learning_rate": 1.204025071869835e-05, + "loss": 0.0373, + "step": 137960 + }, + { + "epoch": 0.28985, + "grad_norm": 0.06244395673274994, + "learning_rate": 1.2036716007492882e-05, + "loss": 0.0355, + "step": 137970 + }, + { + "epoch": 0.2899, + "grad_norm": 0.061203405261039734, + "learning_rate": 1.2033181650698708e-05, + "loss": 0.0332, + "step": 137980 + }, + { + "epoch": 0.28995, + "grad_norm": 0.054324351251125336, + "learning_rate": 1.2029647648412479e-05, + "loss": 0.0349, + "step": 137990 + }, + { + "epoch": 0.29, + "grad_norm": 0.051710255444049835, + "learning_rate": 1.2026114000730818e-05, + "loss": 0.0326, + "step": 138000 + }, + { + "epoch": 0.29005, + "grad_norm": 0.05411173775792122, + "learning_rate": 1.2022580707750325e-05, + "loss": 0.0334, + "step": 138010 + }, + { + "epoch": 0.2901, + "grad_norm": 0.051477011293172836, + "learning_rate": 1.2019047769567601e-05, + "loss": 0.0371, + "step": 138020 + }, + { + "epoch": 0.29015, + "grad_norm": 0.06500280648469925, + "learning_rate": 1.2015515186279225e-05, + "loss": 0.0333, + "step": 138030 + }, + { + "epoch": 0.2902, + "grad_norm": 0.06152309849858284, + "learning_rate": 1.2011982957981795e-05, + "loss": 0.0339, + "step": 138040 + }, + { + "epoch": 0.29025, + "grad_norm": 0.046150896698236465, + "learning_rate": 1.2008451084771873e-05, + "loss": 0.0337, + "step": 138050 + }, + { + "epoch": 0.2903, + "grad_norm": 0.07280026376247406, + "learning_rate": 1.2004919566746009e-05, + "loss": 0.034, + "step": 138060 + }, + { + "epoch": 0.29035, + "grad_norm": 0.06372442096471786, + "learning_rate": 1.2001388404000769e-05, + "loss": 0.0314, + "step": 138070 + }, + { + "epoch": 0.2904, + "grad_norm": 0.053509101271629333, + "learning_rate": 1.1997857596632678e-05, + "loss": 0.0327, + "step": 138080 + }, + { + "epoch": 0.29045, + "grad_norm": 0.0671224296092987, + "learning_rate": 1.1994327144738285e-05, + "loss": 0.0334, + "step": 138090 + }, + { + "epoch": 0.2905, + "grad_norm": 0.055848028510808945, + "learning_rate": 1.1990797048414102e-05, + "loss": 0.0325, + "step": 138100 + }, + { + "epoch": 0.29055, + "grad_norm": 0.07784879952669144, + "learning_rate": 1.1987267307756639e-05, + "loss": 0.033, + "step": 138110 + }, + { + "epoch": 0.2906, + "grad_norm": 0.05662766471505165, + "learning_rate": 1.1983737922862392e-05, + "loss": 0.033, + "step": 138120 + }, + { + "epoch": 0.29065, + "grad_norm": 0.05109269171953201, + "learning_rate": 1.1980208893827868e-05, + "loss": 0.0322, + "step": 138130 + }, + { + "epoch": 0.2907, + "grad_norm": 0.045039620250463486, + "learning_rate": 1.1976680220749543e-05, + "loss": 0.0328, + "step": 138140 + }, + { + "epoch": 0.29075, + "grad_norm": 0.044065602123737335, + "learning_rate": 1.1973151903723875e-05, + "loss": 0.0347, + "step": 138150 + }, + { + "epoch": 0.2908, + "grad_norm": 0.05793699622154236, + "learning_rate": 1.1969623942847355e-05, + "loss": 0.0349, + "step": 138160 + }, + { + "epoch": 0.29085, + "grad_norm": 0.058388493955135345, + "learning_rate": 1.1966096338216406e-05, + "loss": 0.0336, + "step": 138170 + }, + { + "epoch": 0.2909, + "grad_norm": 0.06332320719957352, + "learning_rate": 1.1962569089927511e-05, + "loss": 0.0358, + "step": 138180 + }, + { + "epoch": 0.29095, + "grad_norm": 0.06636647880077362, + "learning_rate": 1.1959042198077056e-05, + "loss": 0.0356, + "step": 138190 + }, + { + "epoch": 0.291, + "grad_norm": 0.05746784806251526, + "learning_rate": 1.195551566276149e-05, + "loss": 0.0349, + "step": 138200 + }, + { + "epoch": 0.29105, + "grad_norm": 0.05651381239295006, + "learning_rate": 1.1951989484077234e-05, + "loss": 0.0339, + "step": 138210 + }, + { + "epoch": 0.2911, + "grad_norm": 0.05743367224931717, + "learning_rate": 1.1948463662120684e-05, + "loss": 0.0352, + "step": 138220 + }, + { + "epoch": 0.29115, + "grad_norm": 0.06609483063220978, + "learning_rate": 1.1944938196988234e-05, + "loss": 0.0343, + "step": 138230 + }, + { + "epoch": 0.2912, + "grad_norm": 0.06054598465561867, + "learning_rate": 1.194141308877626e-05, + "loss": 0.033, + "step": 138240 + }, + { + "epoch": 0.29125, + "grad_norm": 0.04975799098610878, + "learning_rate": 1.1937888337581146e-05, + "loss": 0.0338, + "step": 138250 + }, + { + "epoch": 0.2913, + "grad_norm": 0.053071774542331696, + "learning_rate": 1.1934363943499277e-05, + "loss": 0.0345, + "step": 138260 + }, + { + "epoch": 0.29135, + "grad_norm": 0.04644394293427467, + "learning_rate": 1.193083990662697e-05, + "loss": 0.034, + "step": 138270 + }, + { + "epoch": 0.2914, + "grad_norm": 0.05511296167969704, + "learning_rate": 1.19273162270606e-05, + "loss": 0.0347, + "step": 138280 + }, + { + "epoch": 0.29145, + "grad_norm": 0.05640757456421852, + "learning_rate": 1.1923792904896482e-05, + "loss": 0.0342, + "step": 138290 + }, + { + "epoch": 0.2915, + "grad_norm": 0.06079962104558945, + "learning_rate": 1.1920269940230963e-05, + "loss": 0.0337, + "step": 138300 + }, + { + "epoch": 0.29155, + "grad_norm": 0.0591752864420414, + "learning_rate": 1.1916747333160353e-05, + "loss": 0.0336, + "step": 138310 + }, + { + "epoch": 0.2916, + "grad_norm": 0.060306381434202194, + "learning_rate": 1.1913225083780943e-05, + "loss": 0.0346, + "step": 138320 + }, + { + "epoch": 0.29165, + "grad_norm": 0.08270762115716934, + "learning_rate": 1.1909703192189054e-05, + "loss": 0.0338, + "step": 138330 + }, + { + "epoch": 0.2917, + "grad_norm": 0.047715380787849426, + "learning_rate": 1.1906181658480961e-05, + "loss": 0.0327, + "step": 138340 + }, + { + "epoch": 0.29175, + "grad_norm": 0.04644634574651718, + "learning_rate": 1.1902660482752945e-05, + "loss": 0.0334, + "step": 138350 + }, + { + "epoch": 0.2918, + "grad_norm": 0.04178638383746147, + "learning_rate": 1.1899139665101259e-05, + "loss": 0.0325, + "step": 138360 + }, + { + "epoch": 0.29185, + "grad_norm": 0.05462385714054108, + "learning_rate": 1.1895619205622183e-05, + "loss": 0.0339, + "step": 138370 + }, + { + "epoch": 0.2919, + "grad_norm": 0.05796770751476288, + "learning_rate": 1.1892099104411944e-05, + "loss": 0.0327, + "step": 138380 + }, + { + "epoch": 0.29195, + "grad_norm": 0.05922938510775566, + "learning_rate": 1.18885793615668e-05, + "loss": 0.0346, + "step": 138390 + }, + { + "epoch": 0.292, + "grad_norm": 0.06663983315229416, + "learning_rate": 1.1885059977182975e-05, + "loss": 0.0348, + "step": 138400 + }, + { + "epoch": 0.29205, + "grad_norm": 0.04856368899345398, + "learning_rate": 1.188154095135667e-05, + "loss": 0.033, + "step": 138410 + }, + { + "epoch": 0.2921, + "grad_norm": 0.04834192246198654, + "learning_rate": 1.1878022284184118e-05, + "loss": 0.0338, + "step": 138420 + }, + { + "epoch": 0.29215, + "grad_norm": 0.04955565929412842, + "learning_rate": 1.1874503975761506e-05, + "loss": 0.034, + "step": 138430 + }, + { + "epoch": 0.2922, + "grad_norm": 0.04974037781357765, + "learning_rate": 1.1870986026185013e-05, + "loss": 0.0332, + "step": 138440 + }, + { + "epoch": 0.29225, + "grad_norm": 0.06445062160491943, + "learning_rate": 1.1867468435550844e-05, + "loss": 0.0332, + "step": 138450 + }, + { + "epoch": 0.2923, + "grad_norm": 0.056080423295497894, + "learning_rate": 1.186395120395514e-05, + "loss": 0.0329, + "step": 138460 + }, + { + "epoch": 0.29235, + "grad_norm": 0.0647350624203682, + "learning_rate": 1.1860434331494096e-05, + "loss": 0.0356, + "step": 138470 + }, + { + "epoch": 0.2924, + "grad_norm": 0.06235770508646965, + "learning_rate": 1.1856917818263824e-05, + "loss": 0.033, + "step": 138480 + }, + { + "epoch": 0.29245, + "grad_norm": 0.058304354548454285, + "learning_rate": 1.1853401664360489e-05, + "loss": 0.0322, + "step": 138490 + }, + { + "epoch": 0.2925, + "grad_norm": 0.049607373774051666, + "learning_rate": 1.1849885869880203e-05, + "loss": 0.0345, + "step": 138500 + }, + { + "epoch": 0.29255, + "grad_norm": 0.0553838312625885, + "learning_rate": 1.1846370434919108e-05, + "loss": 0.0332, + "step": 138510 + }, + { + "epoch": 0.2926, + "grad_norm": 0.04994002357125282, + "learning_rate": 1.1842855359573304e-05, + "loss": 0.0329, + "step": 138520 + }, + { + "epoch": 0.29265, + "grad_norm": 0.09766009449958801, + "learning_rate": 1.1839340643938881e-05, + "loss": 0.0326, + "step": 138530 + }, + { + "epoch": 0.2927, + "grad_norm": 0.057872503995895386, + "learning_rate": 1.183582628811195e-05, + "loss": 0.033, + "step": 138540 + }, + { + "epoch": 0.29275, + "grad_norm": 0.05053986236453056, + "learning_rate": 1.1832312292188582e-05, + "loss": 0.0326, + "step": 138550 + }, + { + "epoch": 0.2928, + "grad_norm": 0.048096589744091034, + "learning_rate": 1.1828798656264853e-05, + "loss": 0.0325, + "step": 138560 + }, + { + "epoch": 0.29285, + "grad_norm": 0.051040418446063995, + "learning_rate": 1.1825285380436807e-05, + "loss": 0.0327, + "step": 138570 + }, + { + "epoch": 0.2929, + "grad_norm": 0.04403144493699074, + "learning_rate": 1.182177246480051e-05, + "loss": 0.0313, + "step": 138580 + }, + { + "epoch": 0.29295, + "grad_norm": 0.048360489308834076, + "learning_rate": 1.1818259909452014e-05, + "loss": 0.0322, + "step": 138590 + }, + { + "epoch": 0.293, + "grad_norm": 0.049854207783937454, + "learning_rate": 1.1814747714487337e-05, + "loss": 0.0327, + "step": 138600 + }, + { + "epoch": 0.29305, + "grad_norm": 0.059948597103357315, + "learning_rate": 1.1811235880002507e-05, + "loss": 0.0337, + "step": 138610 + }, + { + "epoch": 0.2931, + "grad_norm": 0.04575120285153389, + "learning_rate": 1.1807724406093524e-05, + "loss": 0.0328, + "step": 138620 + }, + { + "epoch": 0.29315, + "grad_norm": 0.0619422048330307, + "learning_rate": 1.1804213292856405e-05, + "loss": 0.034, + "step": 138630 + }, + { + "epoch": 0.2932, + "grad_norm": 0.055875059217214584, + "learning_rate": 1.1800702540387143e-05, + "loss": 0.0336, + "step": 138640 + }, + { + "epoch": 0.29325, + "grad_norm": 0.06045487895607948, + "learning_rate": 1.1797192148781702e-05, + "loss": 0.0337, + "step": 138650 + }, + { + "epoch": 0.2933, + "grad_norm": 0.056943390518426895, + "learning_rate": 1.1793682118136076e-05, + "loss": 0.0333, + "step": 138660 + }, + { + "epoch": 0.29335, + "grad_norm": 0.05551246553659439, + "learning_rate": 1.179017244854621e-05, + "loss": 0.0342, + "step": 138670 + }, + { + "epoch": 0.2934, + "grad_norm": 0.05962681025266647, + "learning_rate": 1.178666314010809e-05, + "loss": 0.0336, + "step": 138680 + }, + { + "epoch": 0.29345, + "grad_norm": 0.05065612867474556, + "learning_rate": 1.1783154192917612e-05, + "loss": 0.0341, + "step": 138690 + }, + { + "epoch": 0.2935, + "grad_norm": 0.05642010271549225, + "learning_rate": 1.1779645607070736e-05, + "loss": 0.0339, + "step": 138700 + }, + { + "epoch": 0.29355, + "grad_norm": 0.05656618997454643, + "learning_rate": 1.1776137382663389e-05, + "loss": 0.0356, + "step": 138710 + }, + { + "epoch": 0.2936, + "grad_norm": 0.056820448487997055, + "learning_rate": 1.1772629519791481e-05, + "loss": 0.0339, + "step": 138720 + }, + { + "epoch": 0.29365, + "grad_norm": 0.05296377092599869, + "learning_rate": 1.176912201855091e-05, + "loss": 0.0331, + "step": 138730 + }, + { + "epoch": 0.2937, + "grad_norm": 0.05938102677464485, + "learning_rate": 1.1765614879037565e-05, + "loss": 0.0348, + "step": 138740 + }, + { + "epoch": 0.29375, + "grad_norm": 0.0567857027053833, + "learning_rate": 1.1762108101347344e-05, + "loss": 0.034, + "step": 138750 + }, + { + "epoch": 0.2938, + "grad_norm": 0.05834837257862091, + "learning_rate": 1.1758601685576118e-05, + "loss": 0.0345, + "step": 138760 + }, + { + "epoch": 0.29385, + "grad_norm": 0.05760972946882248, + "learning_rate": 1.1755095631819734e-05, + "loss": 0.0346, + "step": 138770 + }, + { + "epoch": 0.2939, + "grad_norm": 0.057558294385671616, + "learning_rate": 1.1751589940174074e-05, + "loss": 0.0319, + "step": 138780 + }, + { + "epoch": 0.29395, + "grad_norm": 0.059915948659181595, + "learning_rate": 1.1748084610734954e-05, + "loss": 0.0322, + "step": 138790 + }, + { + "epoch": 0.294, + "grad_norm": 0.06128979101777077, + "learning_rate": 1.1744579643598232e-05, + "loss": 0.0326, + "step": 138800 + }, + { + "epoch": 0.29405, + "grad_norm": 0.06308026611804962, + "learning_rate": 1.1741075038859725e-05, + "loss": 0.0327, + "step": 138810 + }, + { + "epoch": 0.2941, + "grad_norm": 0.0532807894051075, + "learning_rate": 1.1737570796615236e-05, + "loss": 0.0325, + "step": 138820 + }, + { + "epoch": 0.29415, + "grad_norm": 0.046811792999506, + "learning_rate": 1.1734066916960584e-05, + "loss": 0.0324, + "step": 138830 + }, + { + "epoch": 0.2942, + "grad_norm": 0.04966428130865097, + "learning_rate": 1.1730563399991563e-05, + "loss": 0.0316, + "step": 138840 + }, + { + "epoch": 0.29425, + "grad_norm": 0.04455842822790146, + "learning_rate": 1.1727060245803952e-05, + "loss": 0.032, + "step": 138850 + }, + { + "epoch": 0.2943, + "grad_norm": 0.05419816076755524, + "learning_rate": 1.172355745449352e-05, + "loss": 0.0331, + "step": 138860 + }, + { + "epoch": 0.29435, + "grad_norm": 0.05135956034064293, + "learning_rate": 1.1720055026156045e-05, + "loss": 0.0324, + "step": 138870 + }, + { + "epoch": 0.2944, + "grad_norm": 0.05116800218820572, + "learning_rate": 1.171655296088727e-05, + "loss": 0.0332, + "step": 138880 + }, + { + "epoch": 0.29445, + "grad_norm": 0.045259881764650345, + "learning_rate": 1.1713051258782955e-05, + "loss": 0.032, + "step": 138890 + }, + { + "epoch": 0.2945, + "grad_norm": 0.05142403393983841, + "learning_rate": 1.1709549919938827e-05, + "loss": 0.0328, + "step": 138900 + }, + { + "epoch": 0.29455, + "grad_norm": 0.05417029187083244, + "learning_rate": 1.1706048944450604e-05, + "loss": 0.0325, + "step": 138910 + }, + { + "epoch": 0.2946, + "grad_norm": 0.05212869867682457, + "learning_rate": 1.1702548332414014e-05, + "loss": 0.0328, + "step": 138920 + }, + { + "epoch": 0.29465, + "grad_norm": 0.044010862708091736, + "learning_rate": 1.169904808392476e-05, + "loss": 0.0329, + "step": 138930 + }, + { + "epoch": 0.2947, + "grad_norm": 0.05676640197634697, + "learning_rate": 1.1695548199078534e-05, + "loss": 0.0331, + "step": 138940 + }, + { + "epoch": 0.29475, + "grad_norm": 0.05736074596643448, + "learning_rate": 1.1692048677971013e-05, + "loss": 0.0332, + "step": 138950 + }, + { + "epoch": 0.2948, + "grad_norm": 0.054421745240688324, + "learning_rate": 1.168854952069788e-05, + "loss": 0.0336, + "step": 138960 + }, + { + "epoch": 0.29485, + "grad_norm": 0.04825567454099655, + "learning_rate": 1.1685050727354821e-05, + "loss": 0.032, + "step": 138970 + }, + { + "epoch": 0.2949, + "grad_norm": 0.056606877595186234, + "learning_rate": 1.1681552298037457e-05, + "loss": 0.0321, + "step": 138980 + }, + { + "epoch": 0.29495, + "grad_norm": 0.05282709375023842, + "learning_rate": 1.1678054232841456e-05, + "loss": 0.0333, + "step": 138990 + }, + { + "epoch": 0.295, + "grad_norm": 0.06271221488714218, + "learning_rate": 1.1674556531862438e-05, + "loss": 0.035, + "step": 139000 + }, + { + "epoch": 0.29505, + "grad_norm": 0.059332944452762604, + "learning_rate": 1.167105919519605e-05, + "loss": 0.0342, + "step": 139010 + }, + { + "epoch": 0.2951, + "grad_norm": 0.05737001448869705, + "learning_rate": 1.1667562222937895e-05, + "loss": 0.0331, + "step": 139020 + }, + { + "epoch": 0.29515, + "grad_norm": 0.05790884792804718, + "learning_rate": 1.166406561518357e-05, + "loss": 0.0344, + "step": 139030 + }, + { + "epoch": 0.2952, + "grad_norm": 0.06033563241362572, + "learning_rate": 1.166056937202869e-05, + "loss": 0.0341, + "step": 139040 + }, + { + "epoch": 0.29525, + "grad_norm": 0.056599847972393036, + "learning_rate": 1.1657073493568834e-05, + "loss": 0.0334, + "step": 139050 + }, + { + "epoch": 0.2953, + "grad_norm": 0.06243237853050232, + "learning_rate": 1.1653577979899574e-05, + "loss": 0.0333, + "step": 139060 + }, + { + "epoch": 0.29535, + "grad_norm": 0.0513954721391201, + "learning_rate": 1.1650082831116471e-05, + "loss": 0.0326, + "step": 139070 + }, + { + "epoch": 0.2954, + "grad_norm": 0.04541872814297676, + "learning_rate": 1.1646588047315084e-05, + "loss": 0.0329, + "step": 139080 + }, + { + "epoch": 0.29545, + "grad_norm": 0.04447149485349655, + "learning_rate": 1.1643093628590976e-05, + "loss": 0.033, + "step": 139090 + }, + { + "epoch": 0.2955, + "grad_norm": 0.05670711025595665, + "learning_rate": 1.163959957503967e-05, + "loss": 0.0354, + "step": 139100 + }, + { + "epoch": 0.29555, + "grad_norm": 0.04876406863331795, + "learning_rate": 1.1636105886756692e-05, + "loss": 0.0331, + "step": 139110 + }, + { + "epoch": 0.2956, + "grad_norm": 0.05532897636294365, + "learning_rate": 1.163261256383755e-05, + "loss": 0.0329, + "step": 139120 + }, + { + "epoch": 0.29565, + "grad_norm": 0.061281319707632065, + "learning_rate": 1.1629119606377764e-05, + "loss": 0.0342, + "step": 139130 + }, + { + "epoch": 0.2957, + "grad_norm": 0.06245240569114685, + "learning_rate": 1.1625627014472828e-05, + "loss": 0.0351, + "step": 139140 + }, + { + "epoch": 0.29575, + "grad_norm": 0.057373180985450745, + "learning_rate": 1.1622134788218217e-05, + "loss": 0.0323, + "step": 139150 + }, + { + "epoch": 0.2958, + "grad_norm": 0.05703739821910858, + "learning_rate": 1.1618642927709423e-05, + "loss": 0.0334, + "step": 139160 + }, + { + "epoch": 0.29585, + "grad_norm": 0.05991936847567558, + "learning_rate": 1.1615151433041894e-05, + "loss": 0.0335, + "step": 139170 + }, + { + "epoch": 0.2959, + "grad_norm": 0.07308800518512726, + "learning_rate": 1.1611660304311114e-05, + "loss": 0.0339, + "step": 139180 + }, + { + "epoch": 0.29595, + "grad_norm": 0.06480929255485535, + "learning_rate": 1.1608169541612493e-05, + "loss": 0.0351, + "step": 139190 + }, + { + "epoch": 0.296, + "grad_norm": 0.05141975358128548, + "learning_rate": 1.1604679145041489e-05, + "loss": 0.034, + "step": 139200 + }, + { + "epoch": 0.29605, + "grad_norm": 0.04355557635426521, + "learning_rate": 1.1601189114693531e-05, + "loss": 0.0326, + "step": 139210 + }, + { + "epoch": 0.2961, + "grad_norm": 0.054277196526527405, + "learning_rate": 1.1597699450664028e-05, + "loss": 0.0321, + "step": 139220 + }, + { + "epoch": 0.29615, + "grad_norm": 0.058067500591278076, + "learning_rate": 1.159421015304839e-05, + "loss": 0.0333, + "step": 139230 + }, + { + "epoch": 0.2962, + "grad_norm": 0.05233805626630783, + "learning_rate": 1.1590721221942e-05, + "loss": 0.0328, + "step": 139240 + }, + { + "epoch": 0.29625, + "grad_norm": 0.05005316063761711, + "learning_rate": 1.1587232657440264e-05, + "loss": 0.033, + "step": 139250 + }, + { + "epoch": 0.2963, + "grad_norm": 0.06589703261852264, + "learning_rate": 1.1583744459638545e-05, + "loss": 0.0327, + "step": 139260 + }, + { + "epoch": 0.29635, + "grad_norm": 0.05952505022287369, + "learning_rate": 1.1580256628632208e-05, + "loss": 0.0337, + "step": 139270 + }, + { + "epoch": 0.2964, + "grad_norm": 0.0663764625787735, + "learning_rate": 1.1576769164516618e-05, + "loss": 0.034, + "step": 139280 + }, + { + "epoch": 0.29645, + "grad_norm": 0.06101415678858757, + "learning_rate": 1.157328206738711e-05, + "loss": 0.0343, + "step": 139290 + }, + { + "epoch": 0.2965, + "grad_norm": 0.060911625623703, + "learning_rate": 1.1569795337339035e-05, + "loss": 0.0331, + "step": 139300 + }, + { + "epoch": 0.29655, + "grad_norm": 0.05092538520693779, + "learning_rate": 1.1566308974467707e-05, + "loss": 0.0335, + "step": 139310 + }, + { + "epoch": 0.2966, + "grad_norm": 0.056884728372097015, + "learning_rate": 1.1562822978868449e-05, + "loss": 0.0323, + "step": 139320 + }, + { + "epoch": 0.29665, + "grad_norm": 0.06848115473985672, + "learning_rate": 1.1559337350636552e-05, + "loss": 0.0338, + "step": 139330 + }, + { + "epoch": 0.2967, + "grad_norm": 0.05323183164000511, + "learning_rate": 1.1555852089867329e-05, + "loss": 0.0321, + "step": 139340 + }, + { + "epoch": 0.29675, + "grad_norm": 0.05719597637653351, + "learning_rate": 1.155236719665606e-05, + "loss": 0.0328, + "step": 139350 + }, + { + "epoch": 0.2968, + "grad_norm": 0.06035812571644783, + "learning_rate": 1.1548882671098014e-05, + "loss": 0.0357, + "step": 139360 + }, + { + "epoch": 0.29685, + "grad_norm": 0.061590977013111115, + "learning_rate": 1.1545398513288469e-05, + "loss": 0.0325, + "step": 139370 + }, + { + "epoch": 0.2969, + "grad_norm": 0.06258758902549744, + "learning_rate": 1.1541914723322664e-05, + "loss": 0.0324, + "step": 139380 + }, + { + "epoch": 0.29695, + "grad_norm": 0.060489144176244736, + "learning_rate": 1.1538431301295873e-05, + "loss": 0.0333, + "step": 139390 + }, + { + "epoch": 0.297, + "grad_norm": 0.10881593078374863, + "learning_rate": 1.1534948247303295e-05, + "loss": 0.0337, + "step": 139400 + }, + { + "epoch": 0.29705, + "grad_norm": 0.05411689355969429, + "learning_rate": 1.1531465561440174e-05, + "loss": 0.0333, + "step": 139410 + }, + { + "epoch": 0.2971, + "grad_norm": 0.0653669610619545, + "learning_rate": 1.1527983243801734e-05, + "loss": 0.0333, + "step": 139420 + }, + { + "epoch": 0.29715, + "grad_norm": 0.06009123474359512, + "learning_rate": 1.1524501294483173e-05, + "loss": 0.0326, + "step": 139430 + }, + { + "epoch": 0.2972, + "grad_norm": 0.05364300683140755, + "learning_rate": 1.1521019713579682e-05, + "loss": 0.0328, + "step": 139440 + }, + { + "epoch": 0.29725, + "grad_norm": 0.058414604514837265, + "learning_rate": 1.1517538501186437e-05, + "loss": 0.0319, + "step": 139450 + }, + { + "epoch": 0.2973, + "grad_norm": 0.06450974941253662, + "learning_rate": 1.1514057657398624e-05, + "loss": 0.0333, + "step": 139460 + }, + { + "epoch": 0.29735, + "grad_norm": 0.06400681287050247, + "learning_rate": 1.151057718231143e-05, + "loss": 0.0328, + "step": 139470 + }, + { + "epoch": 0.2974, + "grad_norm": 0.053194813430309296, + "learning_rate": 1.1507097076019967e-05, + "loss": 0.0318, + "step": 139480 + }, + { + "epoch": 0.29745, + "grad_norm": 0.06602397561073303, + "learning_rate": 1.1503617338619413e-05, + "loss": 0.033, + "step": 139490 + }, + { + "epoch": 0.2975, + "grad_norm": 0.07012414187192917, + "learning_rate": 1.150013797020488e-05, + "loss": 0.0345, + "step": 139500 + }, + { + "epoch": 0.29755, + "grad_norm": 0.059558626264333725, + "learning_rate": 1.1496658970871513e-05, + "loss": 0.0328, + "step": 139510 + }, + { + "epoch": 0.2976, + "grad_norm": 0.05656155198812485, + "learning_rate": 1.1493180340714416e-05, + "loss": 0.0327, + "step": 139520 + }, + { + "epoch": 0.29765, + "grad_norm": 0.059396471828222275, + "learning_rate": 1.1489702079828684e-05, + "loss": 0.0345, + "step": 139530 + }, + { + "epoch": 0.2977, + "grad_norm": 0.04921974241733551, + "learning_rate": 1.148622418830943e-05, + "loss": 0.0358, + "step": 139540 + }, + { + "epoch": 0.29775, + "grad_norm": 0.05693204700946808, + "learning_rate": 1.1482746666251734e-05, + "loss": 0.0336, + "step": 139550 + }, + { + "epoch": 0.2978, + "grad_norm": 0.055195748805999756, + "learning_rate": 1.1479269513750662e-05, + "loss": 0.035, + "step": 139560 + }, + { + "epoch": 0.29785, + "grad_norm": 0.04758436605334282, + "learning_rate": 1.1475792730901275e-05, + "loss": 0.0343, + "step": 139570 + }, + { + "epoch": 0.2979, + "grad_norm": 0.06030312925577164, + "learning_rate": 1.1472316317798643e-05, + "loss": 0.035, + "step": 139580 + }, + { + "epoch": 0.29795, + "grad_norm": 0.052978452295064926, + "learning_rate": 1.146884027453779e-05, + "loss": 0.0358, + "step": 139590 + }, + { + "epoch": 0.298, + "grad_norm": 0.054513029754161835, + "learning_rate": 1.1465364601213771e-05, + "loss": 0.0328, + "step": 139600 + }, + { + "epoch": 0.29805, + "grad_norm": 0.05603921785950661, + "learning_rate": 1.1461889297921599e-05, + "loss": 0.0343, + "step": 139610 + }, + { + "epoch": 0.2981, + "grad_norm": 0.057258982211351395, + "learning_rate": 1.1458414364756275e-05, + "loss": 0.0338, + "step": 139620 + }, + { + "epoch": 0.29815, + "grad_norm": 0.05240438133478165, + "learning_rate": 1.145493980181283e-05, + "loss": 0.0329, + "step": 139630 + }, + { + "epoch": 0.2982, + "grad_norm": 0.04709576070308685, + "learning_rate": 1.1451465609186238e-05, + "loss": 0.0323, + "step": 139640 + }, + { + "epoch": 0.29825, + "grad_norm": 0.044889263808727264, + "learning_rate": 1.1447991786971479e-05, + "loss": 0.0321, + "step": 139650 + }, + { + "epoch": 0.2983, + "grad_norm": 0.0566890612244606, + "learning_rate": 1.1444518335263543e-05, + "loss": 0.0319, + "step": 139660 + }, + { + "epoch": 0.29835, + "grad_norm": 0.050611212849617004, + "learning_rate": 1.1441045254157373e-05, + "loss": 0.0325, + "step": 139670 + }, + { + "epoch": 0.2984, + "grad_norm": 0.05062335729598999, + "learning_rate": 1.143757254374795e-05, + "loss": 0.033, + "step": 139680 + }, + { + "epoch": 0.29845, + "grad_norm": 0.0611569844186306, + "learning_rate": 1.143410020413018e-05, + "loss": 0.0323, + "step": 139690 + }, + { + "epoch": 0.2985, + "grad_norm": 0.056232936680316925, + "learning_rate": 1.1430628235399025e-05, + "loss": 0.0329, + "step": 139700 + }, + { + "epoch": 0.29855, + "grad_norm": 0.0519404299557209, + "learning_rate": 1.1427156637649384e-05, + "loss": 0.0345, + "step": 139710 + }, + { + "epoch": 0.2986, + "grad_norm": 0.07048199325799942, + "learning_rate": 1.1423685410976193e-05, + "loss": 0.033, + "step": 139720 + }, + { + "epoch": 0.29865, + "grad_norm": 0.06612701714038849, + "learning_rate": 1.142021455547434e-05, + "loss": 0.0351, + "step": 139730 + }, + { + "epoch": 0.2987, + "grad_norm": 0.07865101844072342, + "learning_rate": 1.141674407123871e-05, + "loss": 0.0331, + "step": 139740 + }, + { + "epoch": 0.29875, + "grad_norm": 0.07165993750095367, + "learning_rate": 1.1413273958364207e-05, + "loss": 0.0313, + "step": 139750 + }, + { + "epoch": 0.2988, + "grad_norm": 0.05868015065789223, + "learning_rate": 1.1409804216945688e-05, + "loss": 0.0324, + "step": 139760 + }, + { + "epoch": 0.29885, + "grad_norm": 0.052018482238054276, + "learning_rate": 1.1406334847078015e-05, + "loss": 0.032, + "step": 139770 + }, + { + "epoch": 0.2989, + "grad_norm": 0.05405759438872337, + "learning_rate": 1.1402865848856031e-05, + "loss": 0.032, + "step": 139780 + }, + { + "epoch": 0.29895, + "grad_norm": 0.046009961515665054, + "learning_rate": 1.1399397222374588e-05, + "loss": 0.0312, + "step": 139790 + }, + { + "epoch": 0.299, + "grad_norm": 0.0600450374186039, + "learning_rate": 1.1395928967728526e-05, + "loss": 0.0327, + "step": 139800 + }, + { + "epoch": 0.29905, + "grad_norm": 0.053973618894815445, + "learning_rate": 1.1392461085012655e-05, + "loss": 0.032, + "step": 139810 + }, + { + "epoch": 0.2991, + "grad_norm": 0.05563103035092354, + "learning_rate": 1.1388993574321782e-05, + "loss": 0.0308, + "step": 139820 + }, + { + "epoch": 0.29915, + "grad_norm": 0.05406641215085983, + "learning_rate": 1.1385526435750705e-05, + "loss": 0.0336, + "step": 139830 + }, + { + "epoch": 0.2992, + "grad_norm": 0.05502977594733238, + "learning_rate": 1.138205966939423e-05, + "loss": 0.0314, + "step": 139840 + }, + { + "epoch": 0.29925, + "grad_norm": 0.06398873031139374, + "learning_rate": 1.1378593275347123e-05, + "loss": 0.033, + "step": 139850 + }, + { + "epoch": 0.2993, + "grad_norm": 0.06901369243860245, + "learning_rate": 1.1375127253704155e-05, + "loss": 0.0348, + "step": 139860 + }, + { + "epoch": 0.29935, + "grad_norm": 0.08909549564123154, + "learning_rate": 1.1371661604560096e-05, + "loss": 0.0333, + "step": 139870 + }, + { + "epoch": 0.2994, + "grad_norm": 0.06168742850422859, + "learning_rate": 1.1368196328009682e-05, + "loss": 0.032, + "step": 139880 + }, + { + "epoch": 0.29945, + "grad_norm": 0.05655650794506073, + "learning_rate": 1.1364731424147674e-05, + "loss": 0.0357, + "step": 139890 + }, + { + "epoch": 0.2995, + "grad_norm": 0.06828747689723969, + "learning_rate": 1.136126689306877e-05, + "loss": 0.0348, + "step": 139900 + }, + { + "epoch": 0.29955, + "grad_norm": 0.05395118519663811, + "learning_rate": 1.1357802734867703e-05, + "loss": 0.0331, + "step": 139910 + }, + { + "epoch": 0.2996, + "grad_norm": 0.05931980162858963, + "learning_rate": 1.1354338949639196e-05, + "loss": 0.0378, + "step": 139920 + }, + { + "epoch": 0.29965, + "grad_norm": 0.05989324674010277, + "learning_rate": 1.1350875537477935e-05, + "loss": 0.0328, + "step": 139930 + }, + { + "epoch": 0.2997, + "grad_norm": 0.04802591726183891, + "learning_rate": 1.134741249847861e-05, + "loss": 0.0321, + "step": 139940 + }, + { + "epoch": 0.29975, + "grad_norm": 0.05060574784874916, + "learning_rate": 1.1343949832735887e-05, + "loss": 0.0326, + "step": 139950 + }, + { + "epoch": 0.2998, + "grad_norm": 0.05629558488726616, + "learning_rate": 1.1340487540344455e-05, + "loss": 0.0331, + "step": 139960 + }, + { + "epoch": 0.29985, + "grad_norm": 0.04951554164290428, + "learning_rate": 1.133702562139896e-05, + "loss": 0.032, + "step": 139970 + }, + { + "epoch": 0.2999, + "grad_norm": 0.06243675947189331, + "learning_rate": 1.1333564075994047e-05, + "loss": 0.0333, + "step": 139980 + }, + { + "epoch": 0.29995, + "grad_norm": 0.055823612958192825, + "learning_rate": 1.1330102904224365e-05, + "loss": 0.0327, + "step": 139990 + }, + { + "epoch": 0.3, + "grad_norm": 0.05091789364814758, + "learning_rate": 1.1326642106184524e-05, + "loss": 0.0323, + "step": 140000 + }, + { + "epoch": 0.30005, + "grad_norm": 0.04933144152164459, + "learning_rate": 1.1323181681969162e-05, + "loss": 0.0329, + "step": 140010 + }, + { + "epoch": 0.3001, + "grad_norm": 0.05192696675658226, + "learning_rate": 1.1319721631672872e-05, + "loss": 0.0332, + "step": 140020 + }, + { + "epoch": 0.30015, + "grad_norm": 0.04468753933906555, + "learning_rate": 1.1316261955390246e-05, + "loss": 0.032, + "step": 140030 + }, + { + "epoch": 0.3002, + "grad_norm": 0.04795127734541893, + "learning_rate": 1.1312802653215886e-05, + "loss": 0.0326, + "step": 140040 + }, + { + "epoch": 0.30025, + "grad_norm": 0.05080784112215042, + "learning_rate": 1.130934372524436e-05, + "loss": 0.0331, + "step": 140050 + }, + { + "epoch": 0.3003, + "grad_norm": 0.0521916002035141, + "learning_rate": 1.1305885171570232e-05, + "loss": 0.0332, + "step": 140060 + }, + { + "epoch": 0.30035, + "grad_norm": 0.05924517661333084, + "learning_rate": 1.130242699228805e-05, + "loss": 0.0341, + "step": 140070 + }, + { + "epoch": 0.3004, + "grad_norm": 0.07030452787876129, + "learning_rate": 1.1298969187492378e-05, + "loss": 0.0325, + "step": 140080 + }, + { + "epoch": 0.30045, + "grad_norm": 0.05708425119519234, + "learning_rate": 1.1295511757277732e-05, + "loss": 0.0325, + "step": 140090 + }, + { + "epoch": 0.3005, + "grad_norm": 0.06188962981104851, + "learning_rate": 1.1292054701738656e-05, + "loss": 0.034, + "step": 140100 + }, + { + "epoch": 0.30055, + "grad_norm": 0.06431712955236435, + "learning_rate": 1.1288598020969651e-05, + "loss": 0.0336, + "step": 140110 + }, + { + "epoch": 0.3006, + "grad_norm": 0.06019863858819008, + "learning_rate": 1.128514171506522e-05, + "loss": 0.0342, + "step": 140120 + }, + { + "epoch": 0.30065, + "grad_norm": 0.05877012386918068, + "learning_rate": 1.128168578411987e-05, + "loss": 0.0336, + "step": 140130 + }, + { + "epoch": 0.3007, + "grad_norm": 0.05326351523399353, + "learning_rate": 1.1278230228228076e-05, + "loss": 0.034, + "step": 140140 + }, + { + "epoch": 0.30075, + "grad_norm": 0.056856438517570496, + "learning_rate": 1.1274775047484312e-05, + "loss": 0.0333, + "step": 140150 + }, + { + "epoch": 0.3008, + "grad_norm": 0.05016437917947769, + "learning_rate": 1.1271320241983033e-05, + "loss": 0.034, + "step": 140160 + }, + { + "epoch": 0.30085, + "grad_norm": 0.05082874372601509, + "learning_rate": 1.1267865811818701e-05, + "loss": 0.0347, + "step": 140170 + }, + { + "epoch": 0.3009, + "grad_norm": 0.057839758694171906, + "learning_rate": 1.126441175708578e-05, + "loss": 0.0344, + "step": 140180 + }, + { + "epoch": 0.30095, + "grad_norm": 0.04288686066865921, + "learning_rate": 1.1260958077878658e-05, + "loss": 0.0341, + "step": 140190 + }, + { + "epoch": 0.301, + "grad_norm": 0.05189139395952225, + "learning_rate": 1.1257504774291793e-05, + "loss": 0.0334, + "step": 140200 + }, + { + "epoch": 0.30105, + "grad_norm": 0.051603466272354126, + "learning_rate": 1.1254051846419576e-05, + "loss": 0.0351, + "step": 140210 + }, + { + "epoch": 0.3011, + "grad_norm": 0.054475102573633194, + "learning_rate": 1.1250599294356425e-05, + "loss": 0.0322, + "step": 140220 + }, + { + "epoch": 0.30115, + "grad_norm": 0.04744827747344971, + "learning_rate": 1.1247147118196724e-05, + "loss": 0.0329, + "step": 140230 + }, + { + "epoch": 0.3012, + "grad_norm": 0.07402417808771133, + "learning_rate": 1.1243695318034848e-05, + "loss": 0.0352, + "step": 140240 + }, + { + "epoch": 0.30125, + "grad_norm": 0.05516153201460838, + "learning_rate": 1.124024389396518e-05, + "loss": 0.034, + "step": 140250 + }, + { + "epoch": 0.3013, + "grad_norm": 0.04782964661717415, + "learning_rate": 1.1236792846082072e-05, + "loss": 0.034, + "step": 140260 + }, + { + "epoch": 0.30135, + "grad_norm": 0.058844517916440964, + "learning_rate": 1.1233342174479883e-05, + "loss": 0.0338, + "step": 140270 + }, + { + "epoch": 0.3014, + "grad_norm": 0.05295547470450401, + "learning_rate": 1.1229891879252935e-05, + "loss": 0.0329, + "step": 140280 + }, + { + "epoch": 0.30145, + "grad_norm": 0.05227420851588249, + "learning_rate": 1.1226441960495567e-05, + "loss": 0.033, + "step": 140290 + }, + { + "epoch": 0.3015, + "grad_norm": 0.048177167773246765, + "learning_rate": 1.1222992418302114e-05, + "loss": 0.033, + "step": 140300 + }, + { + "epoch": 0.30155, + "grad_norm": 0.057371679693460464, + "learning_rate": 1.1219543252766874e-05, + "loss": 0.0367, + "step": 140310 + }, + { + "epoch": 0.3016, + "grad_norm": 0.04433856159448624, + "learning_rate": 1.1216094463984141e-05, + "loss": 0.0329, + "step": 140320 + }, + { + "epoch": 0.30165, + "grad_norm": 0.07420659810304642, + "learning_rate": 1.1212646052048198e-05, + "loss": 0.0339, + "step": 140330 + }, + { + "epoch": 0.3017, + "grad_norm": 0.05632463097572327, + "learning_rate": 1.1209198017053344e-05, + "loss": 0.0329, + "step": 140340 + }, + { + "epoch": 0.30175, + "grad_norm": 0.05505290627479553, + "learning_rate": 1.1205750359093833e-05, + "loss": 0.0328, + "step": 140350 + }, + { + "epoch": 0.3018, + "grad_norm": 0.048898130655288696, + "learning_rate": 1.1202303078263917e-05, + "loss": 0.0328, + "step": 140360 + }, + { + "epoch": 0.30185, + "grad_norm": 0.05216413736343384, + "learning_rate": 1.119885617465786e-05, + "loss": 0.0339, + "step": 140370 + }, + { + "epoch": 0.3019, + "grad_norm": 0.058374982327222824, + "learning_rate": 1.1195409648369881e-05, + "loss": 0.0329, + "step": 140380 + }, + { + "epoch": 0.30195, + "grad_norm": 0.04453812912106514, + "learning_rate": 1.1191963499494234e-05, + "loss": 0.033, + "step": 140390 + }, + { + "epoch": 0.302, + "grad_norm": 0.051837850362062454, + "learning_rate": 1.11885177281251e-05, + "loss": 0.0331, + "step": 140400 + }, + { + "epoch": 0.30205, + "grad_norm": 0.05339343100786209, + "learning_rate": 1.1185072334356702e-05, + "loss": 0.0338, + "step": 140410 + }, + { + "epoch": 0.3021, + "grad_norm": 0.05379300191998482, + "learning_rate": 1.1181627318283247e-05, + "loss": 0.0328, + "step": 140420 + }, + { + "epoch": 0.30215, + "grad_norm": 0.05971743166446686, + "learning_rate": 1.1178182679998909e-05, + "loss": 0.0342, + "step": 140430 + }, + { + "epoch": 0.3022, + "grad_norm": 0.07061241567134857, + "learning_rate": 1.1174738419597863e-05, + "loss": 0.0326, + "step": 140440 + }, + { + "epoch": 0.30225, + "grad_norm": 0.06061761453747749, + "learning_rate": 1.1171294537174264e-05, + "loss": 0.0336, + "step": 140450 + }, + { + "epoch": 0.3023, + "grad_norm": 0.06953947991132736, + "learning_rate": 1.116785103282229e-05, + "loss": 0.0344, + "step": 140460 + }, + { + "epoch": 0.30235, + "grad_norm": 0.05975327268242836, + "learning_rate": 1.116440790663607e-05, + "loss": 0.0324, + "step": 140470 + }, + { + "epoch": 0.3024, + "grad_norm": 0.05084272101521492, + "learning_rate": 1.1160965158709732e-05, + "loss": 0.0338, + "step": 140480 + }, + { + "epoch": 0.30245, + "grad_norm": 0.051011331379413605, + "learning_rate": 1.1157522789137415e-05, + "loss": 0.0328, + "step": 140490 + }, + { + "epoch": 0.3025, + "grad_norm": 0.05884523317217827, + "learning_rate": 1.1154080798013217e-05, + "loss": 0.0331, + "step": 140500 + }, + { + "epoch": 0.30255, + "grad_norm": 0.0666908249258995, + "learning_rate": 1.1150639185431258e-05, + "loss": 0.0338, + "step": 140510 + }, + { + "epoch": 0.3026, + "grad_norm": 0.07132396847009659, + "learning_rate": 1.1147197951485619e-05, + "loss": 0.033, + "step": 140520 + }, + { + "epoch": 0.30265, + "grad_norm": 0.06169174239039421, + "learning_rate": 1.1143757096270386e-05, + "loss": 0.0331, + "step": 140530 + }, + { + "epoch": 0.3027, + "grad_norm": 0.11681129783391953, + "learning_rate": 1.1140316619879615e-05, + "loss": 0.034, + "step": 140540 + }, + { + "epoch": 0.30275, + "grad_norm": 0.06157153099775314, + "learning_rate": 1.1136876522407393e-05, + "loss": 0.0337, + "step": 140550 + }, + { + "epoch": 0.3028, + "grad_norm": 0.06677278131246567, + "learning_rate": 1.1133436803947758e-05, + "loss": 0.0335, + "step": 140560 + }, + { + "epoch": 0.30285, + "grad_norm": 0.0701901838183403, + "learning_rate": 1.1129997464594743e-05, + "loss": 0.0346, + "step": 140570 + }, + { + "epoch": 0.3029, + "grad_norm": 0.07152388244867325, + "learning_rate": 1.1126558504442397e-05, + "loss": 0.0336, + "step": 140580 + }, + { + "epoch": 0.30295, + "grad_norm": 0.0734538659453392, + "learning_rate": 1.1123119923584718e-05, + "loss": 0.035, + "step": 140590 + }, + { + "epoch": 0.303, + "grad_norm": 0.07059818506240845, + "learning_rate": 1.1119681722115746e-05, + "loss": 0.036, + "step": 140600 + }, + { + "epoch": 0.30305, + "grad_norm": 0.0695488229393959, + "learning_rate": 1.1116243900129441e-05, + "loss": 0.0341, + "step": 140610 + }, + { + "epoch": 0.3031, + "grad_norm": 0.04940784350037575, + "learning_rate": 1.1112806457719816e-05, + "loss": 0.0337, + "step": 140620 + }, + { + "epoch": 0.30315, + "grad_norm": 0.05950392410159111, + "learning_rate": 1.1109369394980851e-05, + "loss": 0.0339, + "step": 140630 + }, + { + "epoch": 0.3032, + "grad_norm": 0.04839714616537094, + "learning_rate": 1.110593271200651e-05, + "loss": 0.0344, + "step": 140640 + }, + { + "epoch": 0.30325, + "grad_norm": 0.04441726580262184, + "learning_rate": 1.1102496408890747e-05, + "loss": 0.0338, + "step": 140650 + }, + { + "epoch": 0.3033, + "grad_norm": 0.04511004686355591, + "learning_rate": 1.1099060485727502e-05, + "loss": 0.0345, + "step": 140660 + }, + { + "epoch": 0.30335, + "grad_norm": 0.07027915120124817, + "learning_rate": 1.1095624942610725e-05, + "loss": 0.0347, + "step": 140670 + }, + { + "epoch": 0.3034, + "grad_norm": 0.05691104754805565, + "learning_rate": 1.1092189779634355e-05, + "loss": 0.0336, + "step": 140680 + }, + { + "epoch": 0.30345, + "grad_norm": 0.05413910001516342, + "learning_rate": 1.108875499689227e-05, + "loss": 0.0342, + "step": 140690 + }, + { + "epoch": 0.3035, + "grad_norm": 0.05556439980864525, + "learning_rate": 1.108532059447841e-05, + "loss": 0.0346, + "step": 140700 + }, + { + "epoch": 0.30355, + "grad_norm": 0.05685647204518318, + "learning_rate": 1.1081886572486646e-05, + "loss": 0.0355, + "step": 140710 + }, + { + "epoch": 0.3036, + "grad_norm": 0.061345312744379044, + "learning_rate": 1.1078452931010883e-05, + "loss": 0.0332, + "step": 140720 + }, + { + "epoch": 0.30365, + "grad_norm": 0.05181057006120682, + "learning_rate": 1.107501967014499e-05, + "loss": 0.0339, + "step": 140730 + }, + { + "epoch": 0.3037, + "grad_norm": 0.05965716391801834, + "learning_rate": 1.1071586789982816e-05, + "loss": 0.0364, + "step": 140740 + }, + { + "epoch": 0.30375, + "grad_norm": 0.06506678462028503, + "learning_rate": 1.1068154290618235e-05, + "loss": 0.0343, + "step": 140750 + }, + { + "epoch": 0.3038, + "grad_norm": 0.060299888253211975, + "learning_rate": 1.1064722172145084e-05, + "loss": 0.0332, + "step": 140760 + }, + { + "epoch": 0.30385, + "grad_norm": 0.05024135485291481, + "learning_rate": 1.1061290434657193e-05, + "loss": 0.0323, + "step": 140770 + }, + { + "epoch": 0.3039, + "grad_norm": 0.053525201976299286, + "learning_rate": 1.1057859078248376e-05, + "loss": 0.035, + "step": 140780 + }, + { + "epoch": 0.30395, + "grad_norm": 0.05458205193281174, + "learning_rate": 1.1054428103012463e-05, + "loss": 0.0346, + "step": 140790 + }, + { + "epoch": 0.304, + "grad_norm": 0.07601752132177353, + "learning_rate": 1.1050997509043237e-05, + "loss": 0.0326, + "step": 140800 + }, + { + "epoch": 0.30405, + "grad_norm": 0.06448765844106674, + "learning_rate": 1.1047567296434508e-05, + "loss": 0.0341, + "step": 140810 + }, + { + "epoch": 0.3041, + "grad_norm": 0.044361311942338943, + "learning_rate": 1.1044137465280047e-05, + "loss": 0.034, + "step": 140820 + }, + { + "epoch": 0.30415, + "grad_norm": 0.06594926118850708, + "learning_rate": 1.1040708015673616e-05, + "loss": 0.0331, + "step": 140830 + }, + { + "epoch": 0.3042, + "grad_norm": 0.05299568921327591, + "learning_rate": 1.1037278947708993e-05, + "loss": 0.0343, + "step": 140840 + }, + { + "epoch": 0.30425, + "grad_norm": 0.05918257683515549, + "learning_rate": 1.1033850261479917e-05, + "loss": 0.0329, + "step": 140850 + }, + { + "epoch": 0.3043, + "grad_norm": 0.05580601096153259, + "learning_rate": 1.103042195708012e-05, + "loss": 0.0324, + "step": 140860 + }, + { + "epoch": 0.30435, + "grad_norm": 0.051413439214229584, + "learning_rate": 1.1026994034603347e-05, + "loss": 0.0326, + "step": 140870 + }, + { + "epoch": 0.3044, + "grad_norm": 0.05384482815861702, + "learning_rate": 1.1023566494143298e-05, + "loss": 0.0333, + "step": 140880 + }, + { + "epoch": 0.30445, + "grad_norm": 0.04275573790073395, + "learning_rate": 1.1020139335793711e-05, + "loss": 0.0325, + "step": 140890 + }, + { + "epoch": 0.3045, + "grad_norm": 0.047934915870428085, + "learning_rate": 1.101671255964824e-05, + "loss": 0.0338, + "step": 140900 + }, + { + "epoch": 0.30455, + "grad_norm": 0.04720642417669296, + "learning_rate": 1.1013286165800608e-05, + "loss": 0.0321, + "step": 140910 + }, + { + "epoch": 0.3046, + "grad_norm": 0.051188092678785324, + "learning_rate": 1.1009860154344467e-05, + "loss": 0.0328, + "step": 140920 + }, + { + "epoch": 0.30465, + "grad_norm": 0.04356950521469116, + "learning_rate": 1.1006434525373502e-05, + "loss": 0.0309, + "step": 140930 + }, + { + "epoch": 0.3047, + "grad_norm": 0.06769690662622452, + "learning_rate": 1.1003009278981361e-05, + "loss": 0.0337, + "step": 140940 + }, + { + "epoch": 0.30475, + "grad_norm": 0.05538792163133621, + "learning_rate": 1.0999584415261677e-05, + "loss": 0.0326, + "step": 140950 + }, + { + "epoch": 0.3048, + "grad_norm": 0.05651751160621643, + "learning_rate": 1.0996159934308106e-05, + "loss": 0.0339, + "step": 140960 + }, + { + "epoch": 0.30485, + "grad_norm": 0.04621144384145737, + "learning_rate": 1.0992735836214261e-05, + "loss": 0.0335, + "step": 140970 + }, + { + "epoch": 0.3049, + "grad_norm": 0.05354321002960205, + "learning_rate": 1.0989312121073756e-05, + "loss": 0.0365, + "step": 140980 + }, + { + "epoch": 0.30495, + "grad_norm": 0.06050272285938263, + "learning_rate": 1.0985888788980184e-05, + "loss": 0.0332, + "step": 140990 + }, + { + "epoch": 0.305, + "grad_norm": 0.06105947494506836, + "learning_rate": 1.0982465840027147e-05, + "loss": 0.0331, + "step": 141000 + }, + { + "epoch": 0.30505, + "grad_norm": 0.06364921480417252, + "learning_rate": 1.097904327430824e-05, + "loss": 0.0343, + "step": 141010 + }, + { + "epoch": 0.3051, + "grad_norm": 0.06211453303694725, + "learning_rate": 1.0975621091917022e-05, + "loss": 0.0326, + "step": 141020 + }, + { + "epoch": 0.30515, + "grad_norm": 0.05600810796022415, + "learning_rate": 1.0972199292947052e-05, + "loss": 0.0324, + "step": 141030 + }, + { + "epoch": 0.3052, + "grad_norm": 0.049081169068813324, + "learning_rate": 1.0968777877491875e-05, + "loss": 0.0318, + "step": 141040 + }, + { + "epoch": 0.30525, + "grad_norm": 0.0588548518717289, + "learning_rate": 1.096535684564505e-05, + "loss": 0.0337, + "step": 141050 + }, + { + "epoch": 0.3053, + "grad_norm": 0.05001838132739067, + "learning_rate": 1.0961936197500097e-05, + "loss": 0.0334, + "step": 141060 + }, + { + "epoch": 0.30535, + "grad_norm": 0.05359674617648125, + "learning_rate": 1.0958515933150524e-05, + "loss": 0.0327, + "step": 141070 + }, + { + "epoch": 0.3054, + "grad_norm": 0.05704985931515694, + "learning_rate": 1.095509605268986e-05, + "loss": 0.0337, + "step": 141080 + }, + { + "epoch": 0.30545, + "grad_norm": 0.05883404240012169, + "learning_rate": 1.0951676556211583e-05, + "loss": 0.0324, + "step": 141090 + }, + { + "epoch": 0.3055, + "grad_norm": 0.058890387415885925, + "learning_rate": 1.094825744380921e-05, + "loss": 0.0329, + "step": 141100 + }, + { + "epoch": 0.30555, + "grad_norm": 0.07341659069061279, + "learning_rate": 1.0944838715576181e-05, + "loss": 0.0344, + "step": 141110 + }, + { + "epoch": 0.3056, + "grad_norm": 0.058523885905742645, + "learning_rate": 1.0941420371605981e-05, + "loss": 0.0339, + "step": 141120 + }, + { + "epoch": 0.30565, + "grad_norm": 0.045745741575956345, + "learning_rate": 1.0938002411992077e-05, + "loss": 0.0339, + "step": 141130 + }, + { + "epoch": 0.3057, + "grad_norm": 0.049728427082300186, + "learning_rate": 1.0934584836827904e-05, + "loss": 0.0338, + "step": 141140 + }, + { + "epoch": 0.30575, + "grad_norm": 0.07706481218338013, + "learning_rate": 1.0931167646206896e-05, + "loss": 0.0339, + "step": 141150 + }, + { + "epoch": 0.3058, + "grad_norm": 0.05314839258790016, + "learning_rate": 1.0927750840222473e-05, + "loss": 0.0366, + "step": 141160 + }, + { + "epoch": 0.30585, + "grad_norm": 0.046150967478752136, + "learning_rate": 1.0924334418968064e-05, + "loss": 0.0338, + "step": 141170 + }, + { + "epoch": 0.3059, + "grad_norm": 0.05311834439635277, + "learning_rate": 1.092091838253706e-05, + "loss": 0.036, + "step": 141180 + }, + { + "epoch": 0.30595, + "grad_norm": 0.05920974910259247, + "learning_rate": 1.0917502731022853e-05, + "loss": 0.035, + "step": 141190 + }, + { + "epoch": 0.306, + "grad_norm": 0.04775369539856911, + "learning_rate": 1.0914087464518839e-05, + "loss": 0.0347, + "step": 141200 + }, + { + "epoch": 0.30605, + "grad_norm": 0.05612358823418617, + "learning_rate": 1.091067258311837e-05, + "loss": 0.0338, + "step": 141210 + }, + { + "epoch": 0.3061, + "grad_norm": 0.06597428023815155, + "learning_rate": 1.0907258086914832e-05, + "loss": 0.035, + "step": 141220 + }, + { + "epoch": 0.30615, + "grad_norm": 0.05252828449010849, + "learning_rate": 1.0903843976001562e-05, + "loss": 0.0327, + "step": 141230 + }, + { + "epoch": 0.3062, + "grad_norm": 0.05416768416762352, + "learning_rate": 1.0900430250471893e-05, + "loss": 0.0335, + "step": 141240 + }, + { + "epoch": 0.30625, + "grad_norm": 0.062133170664310455, + "learning_rate": 1.0897016910419172e-05, + "loss": 0.0327, + "step": 141250 + }, + { + "epoch": 0.3063, + "grad_norm": 0.05661081522703171, + "learning_rate": 1.0893603955936712e-05, + "loss": 0.0336, + "step": 141260 + }, + { + "epoch": 0.30635, + "grad_norm": 0.05456758290529251, + "learning_rate": 1.0890191387117821e-05, + "loss": 0.0334, + "step": 141270 + }, + { + "epoch": 0.3064, + "grad_norm": 0.060421284288167953, + "learning_rate": 1.0886779204055786e-05, + "loss": 0.0364, + "step": 141280 + }, + { + "epoch": 0.30645, + "grad_norm": 0.06277312338352203, + "learning_rate": 1.0883367406843914e-05, + "loss": 0.0325, + "step": 141290 + }, + { + "epoch": 0.3065, + "grad_norm": 0.051737893372774124, + "learning_rate": 1.0879955995575466e-05, + "loss": 0.0319, + "step": 141300 + }, + { + "epoch": 0.30655, + "grad_norm": 0.04213999956846237, + "learning_rate": 1.0876544970343728e-05, + "loss": 0.0316, + "step": 141310 + }, + { + "epoch": 0.3066, + "grad_norm": 0.048217009752988815, + "learning_rate": 1.0873134331241942e-05, + "loss": 0.032, + "step": 141320 + }, + { + "epoch": 0.30665, + "grad_norm": 0.04515504091978073, + "learning_rate": 1.0869724078363344e-05, + "loss": 0.0329, + "step": 141330 + }, + { + "epoch": 0.3067, + "grad_norm": 0.058983009308576584, + "learning_rate": 1.0866314211801193e-05, + "loss": 0.0322, + "step": 141340 + }, + { + "epoch": 0.30675, + "grad_norm": 0.06672867387533188, + "learning_rate": 1.0862904731648705e-05, + "loss": 0.0337, + "step": 141350 + }, + { + "epoch": 0.3068, + "grad_norm": 0.05784101039171219, + "learning_rate": 1.0859495637999086e-05, + "loss": 0.0339, + "step": 141360 + }, + { + "epoch": 0.30685, + "grad_norm": 0.05413747578859329, + "learning_rate": 1.0856086930945536e-05, + "loss": 0.0344, + "step": 141370 + }, + { + "epoch": 0.3069, + "grad_norm": 0.0900455191731453, + "learning_rate": 1.0852678610581257e-05, + "loss": 0.0333, + "step": 141380 + }, + { + "epoch": 0.30695, + "grad_norm": 0.06914813816547394, + "learning_rate": 1.0849270676999446e-05, + "loss": 0.0332, + "step": 141390 + }, + { + "epoch": 0.307, + "grad_norm": 0.07659492641687393, + "learning_rate": 1.084586313029324e-05, + "loss": 0.0329, + "step": 141400 + }, + { + "epoch": 0.30705, + "grad_norm": 0.06035493686795235, + "learning_rate": 1.0842455970555832e-05, + "loss": 0.0333, + "step": 141410 + }, + { + "epoch": 0.3071, + "grad_norm": 0.07281692326068878, + "learning_rate": 1.0839049197880347e-05, + "loss": 0.0338, + "step": 141420 + }, + { + "epoch": 0.30715, + "grad_norm": 0.0637575164437294, + "learning_rate": 1.0835642812359945e-05, + "loss": 0.0331, + "step": 141430 + }, + { + "epoch": 0.3072, + "grad_norm": 0.06447659432888031, + "learning_rate": 1.0832236814087748e-05, + "loss": 0.0332, + "step": 141440 + }, + { + "epoch": 0.30725, + "grad_norm": 0.06269050389528275, + "learning_rate": 1.0828831203156865e-05, + "loss": 0.034, + "step": 141450 + }, + { + "epoch": 0.3073, + "grad_norm": 0.04924452304840088, + "learning_rate": 1.0825425979660422e-05, + "loss": 0.0324, + "step": 141460 + }, + { + "epoch": 0.30735, + "grad_norm": 0.0564417727291584, + "learning_rate": 1.0822021143691508e-05, + "loss": 0.034, + "step": 141470 + }, + { + "epoch": 0.3074, + "grad_norm": 0.04732852429151535, + "learning_rate": 1.081861669534321e-05, + "loss": 0.0321, + "step": 141480 + }, + { + "epoch": 0.30745, + "grad_norm": 0.048619452863931656, + "learning_rate": 1.0815212634708593e-05, + "loss": 0.0333, + "step": 141490 + }, + { + "epoch": 0.3075, + "grad_norm": 0.05323924869298935, + "learning_rate": 1.0811808961880734e-05, + "loss": 0.0338, + "step": 141500 + }, + { + "epoch": 0.30755, + "grad_norm": 0.12367910891771317, + "learning_rate": 1.0808405676952699e-05, + "loss": 0.034, + "step": 141510 + }, + { + "epoch": 0.3076, + "grad_norm": 0.05822361260652542, + "learning_rate": 1.0805002780017518e-05, + "loss": 0.0338, + "step": 141520 + }, + { + "epoch": 0.30765, + "grad_norm": 0.05467531830072403, + "learning_rate": 1.080160027116823e-05, + "loss": 0.0335, + "step": 141530 + }, + { + "epoch": 0.3077, + "grad_norm": 0.047766610980033875, + "learning_rate": 1.0798198150497848e-05, + "loss": 0.0337, + "step": 141540 + }, + { + "epoch": 0.30775, + "grad_norm": 0.06274426728487015, + "learning_rate": 1.0794796418099401e-05, + "loss": 0.0371, + "step": 141550 + }, + { + "epoch": 0.3078, + "grad_norm": 0.05022158846259117, + "learning_rate": 1.0791395074065883e-05, + "loss": 0.0354, + "step": 141560 + }, + { + "epoch": 0.30785, + "grad_norm": 0.07846273481845856, + "learning_rate": 1.0787994118490278e-05, + "loss": 0.0356, + "step": 141570 + }, + { + "epoch": 0.3079, + "grad_norm": 0.05049419030547142, + "learning_rate": 1.0784593551465582e-05, + "loss": 0.0344, + "step": 141580 + }, + { + "epoch": 0.30795, + "grad_norm": 0.06500475108623505, + "learning_rate": 1.078119337308475e-05, + "loss": 0.034, + "step": 141590 + }, + { + "epoch": 0.308, + "grad_norm": 0.05722379684448242, + "learning_rate": 1.0777793583440768e-05, + "loss": 0.0334, + "step": 141600 + }, + { + "epoch": 0.30805, + "grad_norm": 0.050137463957071304, + "learning_rate": 1.077439418262655e-05, + "loss": 0.0338, + "step": 141610 + }, + { + "epoch": 0.3081, + "grad_norm": 0.05086752027273178, + "learning_rate": 1.0770995170735046e-05, + "loss": 0.0328, + "step": 141620 + }, + { + "epoch": 0.30815, + "grad_norm": 0.04671629145741463, + "learning_rate": 1.0767596547859202e-05, + "loss": 0.0343, + "step": 141630 + }, + { + "epoch": 0.3082, + "grad_norm": 0.06504970788955688, + "learning_rate": 1.076419831409192e-05, + "loss": 0.0336, + "step": 141640 + }, + { + "epoch": 0.30825, + "grad_norm": 0.04972356930375099, + "learning_rate": 1.0760800469526106e-05, + "loss": 0.0334, + "step": 141650 + }, + { + "epoch": 0.3083, + "grad_norm": 0.061417948454618454, + "learning_rate": 1.075740301425465e-05, + "loss": 0.0329, + "step": 141660 + }, + { + "epoch": 0.30835, + "grad_norm": 0.0457824282348156, + "learning_rate": 1.0754005948370454e-05, + "loss": 0.0337, + "step": 141670 + }, + { + "epoch": 0.3084, + "grad_norm": 0.046346407383680344, + "learning_rate": 1.0750609271966384e-05, + "loss": 0.0332, + "step": 141680 + }, + { + "epoch": 0.30845, + "grad_norm": 0.05328997224569321, + "learning_rate": 1.0747212985135293e-05, + "loss": 0.034, + "step": 141690 + }, + { + "epoch": 0.3085, + "grad_norm": 0.054923467338085175, + "learning_rate": 1.0743817087970054e-05, + "loss": 0.0314, + "step": 141700 + }, + { + "epoch": 0.30855, + "grad_norm": 0.0563756600022316, + "learning_rate": 1.0740421580563493e-05, + "loss": 0.0343, + "step": 141710 + }, + { + "epoch": 0.3086, + "grad_norm": 0.06679999828338623, + "learning_rate": 1.0737026463008453e-05, + "loss": 0.0339, + "step": 141720 + }, + { + "epoch": 0.30865, + "grad_norm": 0.059505343437194824, + "learning_rate": 1.0733631735397755e-05, + "loss": 0.0327, + "step": 141730 + }, + { + "epoch": 0.3087, + "grad_norm": 0.05167857185006142, + "learning_rate": 1.0730237397824205e-05, + "loss": 0.0327, + "step": 141740 + }, + { + "epoch": 0.30875, + "grad_norm": 0.06931980699300766, + "learning_rate": 1.0726843450380594e-05, + "loss": 0.0324, + "step": 141750 + }, + { + "epoch": 0.3088, + "grad_norm": 0.05788620561361313, + "learning_rate": 1.0723449893159731e-05, + "loss": 0.0325, + "step": 141760 + }, + { + "epoch": 0.30885, + "grad_norm": 0.06421661376953125, + "learning_rate": 1.0720056726254384e-05, + "loss": 0.0333, + "step": 141770 + }, + { + "epoch": 0.3089, + "grad_norm": 0.04746703803539276, + "learning_rate": 1.0716663949757314e-05, + "loss": 0.0321, + "step": 141780 + }, + { + "epoch": 0.30895, + "grad_norm": 0.05642994865775108, + "learning_rate": 1.0713271563761293e-05, + "loss": 0.0332, + "step": 141790 + }, + { + "epoch": 0.309, + "grad_norm": 0.04913509637117386, + "learning_rate": 1.070987956835905e-05, + "loss": 0.0332, + "step": 141800 + }, + { + "epoch": 0.30905, + "grad_norm": 0.05893111974000931, + "learning_rate": 1.0706487963643349e-05, + "loss": 0.0348, + "step": 141810 + }, + { + "epoch": 0.3091, + "grad_norm": 0.059092987328767776, + "learning_rate": 1.0703096749706881e-05, + "loss": 0.034, + "step": 141820 + }, + { + "epoch": 0.30915, + "grad_norm": 0.05287367105484009, + "learning_rate": 1.0699705926642378e-05, + "loss": 0.0351, + "step": 141830 + }, + { + "epoch": 0.3092, + "grad_norm": 0.05104648694396019, + "learning_rate": 1.069631549454255e-05, + "loss": 0.0351, + "step": 141840 + }, + { + "epoch": 0.30925, + "grad_norm": 0.048764459788799286, + "learning_rate": 1.0692925453500082e-05, + "loss": 0.034, + "step": 141850 + }, + { + "epoch": 0.3093, + "grad_norm": 0.05129481479525566, + "learning_rate": 1.068953580360766e-05, + "loss": 0.0344, + "step": 141860 + }, + { + "epoch": 0.30935, + "grad_norm": 0.046603891998529434, + "learning_rate": 1.0686146544957939e-05, + "loss": 0.0322, + "step": 141870 + }, + { + "epoch": 0.3094, + "grad_norm": 0.04391618072986603, + "learning_rate": 1.0682757677643596e-05, + "loss": 0.033, + "step": 141880 + }, + { + "epoch": 0.30945, + "grad_norm": 0.06269185245037079, + "learning_rate": 1.06793692017573e-05, + "loss": 0.0345, + "step": 141890 + }, + { + "epoch": 0.3095, + "grad_norm": 0.04640522226691246, + "learning_rate": 1.0675981117391648e-05, + "loss": 0.0324, + "step": 141900 + }, + { + "epoch": 0.30955, + "grad_norm": 0.05266639217734337, + "learning_rate": 1.0672593424639301e-05, + "loss": 0.0348, + "step": 141910 + }, + { + "epoch": 0.3096, + "grad_norm": 0.052203577011823654, + "learning_rate": 1.0669206123592862e-05, + "loss": 0.0317, + "step": 141920 + }, + { + "epoch": 0.30965, + "grad_norm": 0.05076339840888977, + "learning_rate": 1.0665819214344949e-05, + "loss": 0.0339, + "step": 141930 + }, + { + "epoch": 0.3097, + "grad_norm": 0.04586315155029297, + "learning_rate": 1.0662432696988153e-05, + "loss": 0.0329, + "step": 141940 + }, + { + "epoch": 0.30975, + "grad_norm": 0.04728645458817482, + "learning_rate": 1.0659046571615055e-05, + "loss": 0.0346, + "step": 141950 + }, + { + "epoch": 0.3098, + "grad_norm": 0.04644077643752098, + "learning_rate": 1.0655660838318243e-05, + "loss": 0.0334, + "step": 141960 + }, + { + "epoch": 0.30985, + "grad_norm": 0.049580082297325134, + "learning_rate": 1.0652275497190276e-05, + "loss": 0.0337, + "step": 141970 + }, + { + "epoch": 0.3099, + "grad_norm": 0.053173281252384186, + "learning_rate": 1.0648890548323705e-05, + "loss": 0.0342, + "step": 141980 + }, + { + "epoch": 0.30995, + "grad_norm": 0.04500650241971016, + "learning_rate": 1.0645505991811066e-05, + "loss": 0.0348, + "step": 141990 + }, + { + "epoch": 0.31, + "grad_norm": 0.049792829900979996, + "learning_rate": 1.0642121827744911e-05, + "loss": 0.0342, + "step": 142000 + }, + { + "epoch": 0.31005, + "grad_norm": 0.04503854736685753, + "learning_rate": 1.0638738056217742e-05, + "loss": 0.0337, + "step": 142010 + }, + { + "epoch": 0.3101, + "grad_norm": 0.040192991495132446, + "learning_rate": 1.0635354677322087e-05, + "loss": 0.0331, + "step": 142020 + }, + { + "epoch": 0.31015, + "grad_norm": 0.05078494921326637, + "learning_rate": 1.0631971691150438e-05, + "loss": 0.0329, + "step": 142030 + }, + { + "epoch": 0.3102, + "grad_norm": 0.04828430712223053, + "learning_rate": 1.0628589097795277e-05, + "loss": 0.0342, + "step": 142040 + }, + { + "epoch": 0.31025, + "grad_norm": 0.04082063212990761, + "learning_rate": 1.0625206897349102e-05, + "loss": 0.0334, + "step": 142050 + }, + { + "epoch": 0.3103, + "grad_norm": 0.0471375398337841, + "learning_rate": 1.0621825089904369e-05, + "loss": 0.0343, + "step": 142060 + }, + { + "epoch": 0.31035, + "grad_norm": 0.05987219512462616, + "learning_rate": 1.0618443675553527e-05, + "loss": 0.037, + "step": 142070 + }, + { + "epoch": 0.3104, + "grad_norm": 0.11808999627828598, + "learning_rate": 1.0615062654389041e-05, + "loss": 0.0344, + "step": 142080 + }, + { + "epoch": 0.31045, + "grad_norm": 0.05503125488758087, + "learning_rate": 1.0611682026503328e-05, + "loss": 0.0347, + "step": 142090 + }, + { + "epoch": 0.3105, + "grad_norm": 0.0595247745513916, + "learning_rate": 1.0608301791988842e-05, + "loss": 0.0359, + "step": 142100 + }, + { + "epoch": 0.31055, + "grad_norm": 0.05358587205410004, + "learning_rate": 1.060492195093796e-05, + "loss": 0.0326, + "step": 142110 + }, + { + "epoch": 0.3106, + "grad_norm": 0.06559716165065765, + "learning_rate": 1.0601542503443112e-05, + "loss": 0.0333, + "step": 142120 + }, + { + "epoch": 0.31065, + "grad_norm": 0.05155010148882866, + "learning_rate": 1.0598163449596676e-05, + "loss": 0.0327, + "step": 142130 + }, + { + "epoch": 0.3107, + "grad_norm": 0.05211557820439339, + "learning_rate": 1.0594784789491047e-05, + "loss": 0.0329, + "step": 142140 + }, + { + "epoch": 0.31075, + "grad_norm": 0.05778401345014572, + "learning_rate": 1.0591406523218592e-05, + "loss": 0.0335, + "step": 142150 + }, + { + "epoch": 0.3108, + "grad_norm": 0.05356336012482643, + "learning_rate": 1.058802865087166e-05, + "loss": 0.0326, + "step": 142160 + }, + { + "epoch": 0.31085, + "grad_norm": 0.053282007575035095, + "learning_rate": 1.058465117254262e-05, + "loss": 0.0342, + "step": 142170 + }, + { + "epoch": 0.3109, + "grad_norm": 0.08436008542776108, + "learning_rate": 1.05812740883238e-05, + "loss": 0.0336, + "step": 142180 + }, + { + "epoch": 0.31095, + "grad_norm": 0.08286771178245544, + "learning_rate": 1.0577897398307529e-05, + "loss": 0.0328, + "step": 142190 + }, + { + "epoch": 0.311, + "grad_norm": 0.0891069695353508, + "learning_rate": 1.0574521102586118e-05, + "loss": 0.0328, + "step": 142200 + }, + { + "epoch": 0.31105, + "grad_norm": 0.06336149573326111, + "learning_rate": 1.0571145201251882e-05, + "loss": 0.0315, + "step": 142210 + }, + { + "epoch": 0.3111, + "grad_norm": 0.06335395574569702, + "learning_rate": 1.056776969439712e-05, + "loss": 0.0327, + "step": 142220 + }, + { + "epoch": 0.31115, + "grad_norm": 0.05544976890087128, + "learning_rate": 1.0564394582114115e-05, + "loss": 0.0329, + "step": 142230 + }, + { + "epoch": 0.3112, + "grad_norm": 0.04723066836595535, + "learning_rate": 1.056101986449514e-05, + "loss": 0.0323, + "step": 142240 + }, + { + "epoch": 0.31125, + "grad_norm": 0.045306917279958725, + "learning_rate": 1.0557645541632447e-05, + "loss": 0.0331, + "step": 142250 + }, + { + "epoch": 0.3113, + "grad_norm": 0.059146132320165634, + "learning_rate": 1.0554271613618308e-05, + "loss": 0.035, + "step": 142260 + }, + { + "epoch": 0.31135, + "grad_norm": 0.05136910825967789, + "learning_rate": 1.0550898080544958e-05, + "loss": 0.0329, + "step": 142270 + }, + { + "epoch": 0.3114, + "grad_norm": 0.05314375460147858, + "learning_rate": 1.0547524942504617e-05, + "loss": 0.0335, + "step": 142280 + }, + { + "epoch": 0.31145, + "grad_norm": 0.04912213236093521, + "learning_rate": 1.0544152199589521e-05, + "loss": 0.0319, + "step": 142290 + }, + { + "epoch": 0.3115, + "grad_norm": 0.04902366176247597, + "learning_rate": 1.0540779851891865e-05, + "loss": 0.0322, + "step": 142300 + }, + { + "epoch": 0.31155, + "grad_norm": 0.05074556544423103, + "learning_rate": 1.0537407899503876e-05, + "loss": 0.0314, + "step": 142310 + }, + { + "epoch": 0.3116, + "grad_norm": 0.06239822879433632, + "learning_rate": 1.05340363425177e-05, + "loss": 0.0322, + "step": 142320 + }, + { + "epoch": 0.31165, + "grad_norm": 0.058451853692531586, + "learning_rate": 1.053066518102554e-05, + "loss": 0.0344, + "step": 142330 + }, + { + "epoch": 0.3117, + "grad_norm": 0.0508386604487896, + "learning_rate": 1.0527294415119562e-05, + "loss": 0.0311, + "step": 142340 + }, + { + "epoch": 0.31175, + "grad_norm": 0.05742720514535904, + "learning_rate": 1.0523924044891923e-05, + "loss": 0.0331, + "step": 142350 + }, + { + "epoch": 0.3118, + "grad_norm": 0.05304501950740814, + "learning_rate": 1.0520554070434757e-05, + "loss": 0.0333, + "step": 142360 + }, + { + "epoch": 0.31185, + "grad_norm": 0.04958457872271538, + "learning_rate": 1.0517184491840199e-05, + "loss": 0.0318, + "step": 142370 + }, + { + "epoch": 0.3119, + "grad_norm": 0.05456435680389404, + "learning_rate": 1.051381530920038e-05, + "loss": 0.0328, + "step": 142380 + }, + { + "epoch": 0.31195, + "grad_norm": 0.05465429276227951, + "learning_rate": 1.0510446522607412e-05, + "loss": 0.0337, + "step": 142390 + }, + { + "epoch": 0.312, + "grad_norm": 0.059179335832595825, + "learning_rate": 1.0507078132153384e-05, + "loss": 0.0353, + "step": 142400 + }, + { + "epoch": 0.31205, + "grad_norm": 0.055974896997213364, + "learning_rate": 1.0503710137930401e-05, + "loss": 0.037, + "step": 142410 + }, + { + "epoch": 0.3121, + "grad_norm": 0.06301740556955338, + "learning_rate": 1.0500342540030531e-05, + "loss": 0.0362, + "step": 142420 + }, + { + "epoch": 0.31215, + "grad_norm": 0.06001967191696167, + "learning_rate": 1.0496975338545857e-05, + "loss": 0.0337, + "step": 142430 + }, + { + "epoch": 0.3122, + "grad_norm": 0.05626402050256729, + "learning_rate": 1.0493608533568424e-05, + "loss": 0.0344, + "step": 142440 + }, + { + "epoch": 0.31225, + "grad_norm": 0.049546778202056885, + "learning_rate": 1.049024212519028e-05, + "loss": 0.0325, + "step": 142450 + }, + { + "epoch": 0.3123, + "grad_norm": 0.05693064257502556, + "learning_rate": 1.0486876113503474e-05, + "loss": 0.0337, + "step": 142460 + }, + { + "epoch": 0.31235, + "grad_norm": 0.05737036466598511, + "learning_rate": 1.0483510498600021e-05, + "loss": 0.0346, + "step": 142470 + }, + { + "epoch": 0.3124, + "grad_norm": 0.046279098838567734, + "learning_rate": 1.0480145280571937e-05, + "loss": 0.0338, + "step": 142480 + }, + { + "epoch": 0.31245, + "grad_norm": 0.05355663597583771, + "learning_rate": 1.0476780459511218e-05, + "loss": 0.0343, + "step": 142490 + }, + { + "epoch": 0.3125, + "grad_norm": 0.0570637546479702, + "learning_rate": 1.0473416035509875e-05, + "loss": 0.0346, + "step": 142500 + }, + { + "epoch": 0.31255, + "grad_norm": 0.051324110478162766, + "learning_rate": 1.047005200865987e-05, + "loss": 0.0344, + "step": 142510 + }, + { + "epoch": 0.3126, + "grad_norm": 0.04968645051121712, + "learning_rate": 1.0466688379053193e-05, + "loss": 0.0336, + "step": 142520 + }, + { + "epoch": 0.31265, + "grad_norm": 0.05262758210301399, + "learning_rate": 1.04633251467818e-05, + "loss": 0.0354, + "step": 142530 + }, + { + "epoch": 0.3127, + "grad_norm": 0.055360324680805206, + "learning_rate": 1.0459962311937624e-05, + "loss": 0.0366, + "step": 142540 + }, + { + "epoch": 0.31275, + "grad_norm": 0.0528523214161396, + "learning_rate": 1.0456599874612624e-05, + "loss": 0.0337, + "step": 142550 + }, + { + "epoch": 0.3128, + "grad_norm": 0.05057939141988754, + "learning_rate": 1.0453237834898722e-05, + "loss": 0.0344, + "step": 142560 + }, + { + "epoch": 0.31285, + "grad_norm": 0.05711035802960396, + "learning_rate": 1.0449876192887831e-05, + "loss": 0.0347, + "step": 142570 + }, + { + "epoch": 0.3129, + "grad_norm": 0.05266747996211052, + "learning_rate": 1.0446514948671848e-05, + "loss": 0.0338, + "step": 142580 + }, + { + "epoch": 0.31295, + "grad_norm": 0.06965702772140503, + "learning_rate": 1.044315410234268e-05, + "loss": 0.0354, + "step": 142590 + }, + { + "epoch": 0.313, + "grad_norm": 0.05463392287492752, + "learning_rate": 1.043979365399223e-05, + "loss": 0.0344, + "step": 142600 + }, + { + "epoch": 0.31305, + "grad_norm": 0.043276332318782806, + "learning_rate": 1.0436433603712334e-05, + "loss": 0.0332, + "step": 142610 + }, + { + "epoch": 0.3131, + "grad_norm": 0.06321835517883301, + "learning_rate": 1.043307395159488e-05, + "loss": 0.0336, + "step": 142620 + }, + { + "epoch": 0.31315, + "grad_norm": 0.052459266036748886, + "learning_rate": 1.0429714697731702e-05, + "loss": 0.0348, + "step": 142630 + }, + { + "epoch": 0.3132, + "grad_norm": 0.04685082286596298, + "learning_rate": 1.0426355842214657e-05, + "loss": 0.0341, + "step": 142640 + }, + { + "epoch": 0.31325, + "grad_norm": 0.05648628994822502, + "learning_rate": 1.0422997385135571e-05, + "loss": 0.0355, + "step": 142650 + }, + { + "epoch": 0.3133, + "grad_norm": 0.05795169994235039, + "learning_rate": 1.0419639326586253e-05, + "loss": 0.0333, + "step": 142660 + }, + { + "epoch": 0.31335, + "grad_norm": 0.06540055572986603, + "learning_rate": 1.0416281666658523e-05, + "loss": 0.0328, + "step": 142670 + }, + { + "epoch": 0.3134, + "grad_norm": 0.05244702100753784, + "learning_rate": 1.0412924405444177e-05, + "loss": 0.0325, + "step": 142680 + }, + { + "epoch": 0.31345, + "grad_norm": 0.05591924488544464, + "learning_rate": 1.0409567543034995e-05, + "loss": 0.0355, + "step": 142690 + }, + { + "epoch": 0.3135, + "grad_norm": 0.062114112079143524, + "learning_rate": 1.0406211079522749e-05, + "loss": 0.0327, + "step": 142700 + }, + { + "epoch": 0.31355, + "grad_norm": 0.04614487290382385, + "learning_rate": 1.0402855014999205e-05, + "loss": 0.0316, + "step": 142710 + }, + { + "epoch": 0.3136, + "grad_norm": 0.050484973937273026, + "learning_rate": 1.0399499349556133e-05, + "loss": 0.0326, + "step": 142720 + }, + { + "epoch": 0.31365, + "grad_norm": 0.05026634410023689, + "learning_rate": 1.0396144083285264e-05, + "loss": 0.0327, + "step": 142730 + }, + { + "epoch": 0.3137, + "grad_norm": 0.05540686473250389, + "learning_rate": 1.0392789216278328e-05, + "loss": 0.0326, + "step": 142740 + }, + { + "epoch": 0.31375, + "grad_norm": 0.06008157134056091, + "learning_rate": 1.0389434748627038e-05, + "loss": 0.033, + "step": 142750 + }, + { + "epoch": 0.3138, + "grad_norm": 0.06368424743413925, + "learning_rate": 1.0386080680423124e-05, + "loss": 0.0332, + "step": 142760 + }, + { + "epoch": 0.31385, + "grad_norm": 0.062105316668748856, + "learning_rate": 1.0382727011758273e-05, + "loss": 0.033, + "step": 142770 + }, + { + "epoch": 0.3139, + "grad_norm": 0.0804833397269249, + "learning_rate": 1.0379373742724164e-05, + "loss": 0.0323, + "step": 142780 + }, + { + "epoch": 0.31395, + "grad_norm": 0.06516914814710617, + "learning_rate": 1.0376020873412492e-05, + "loss": 0.0331, + "step": 142790 + }, + { + "epoch": 0.314, + "grad_norm": 0.059499580413103104, + "learning_rate": 1.0372668403914909e-05, + "loss": 0.0329, + "step": 142800 + }, + { + "epoch": 0.31405, + "grad_norm": 0.057656656950712204, + "learning_rate": 1.0369316334323096e-05, + "loss": 0.0331, + "step": 142810 + }, + { + "epoch": 0.3141, + "grad_norm": 0.06045324355363846, + "learning_rate": 1.0365964664728655e-05, + "loss": 0.0343, + "step": 142820 + }, + { + "epoch": 0.31415, + "grad_norm": 0.054806213825941086, + "learning_rate": 1.0362613395223247e-05, + "loss": 0.0314, + "step": 142830 + }, + { + "epoch": 0.3142, + "grad_norm": 0.053731124848127365, + "learning_rate": 1.0359262525898497e-05, + "loss": 0.0331, + "step": 142840 + }, + { + "epoch": 0.31425, + "grad_norm": 0.05551392585039139, + "learning_rate": 1.0355912056846009e-05, + "loss": 0.0328, + "step": 142850 + }, + { + "epoch": 0.3143, + "grad_norm": 0.049054939299821854, + "learning_rate": 1.0352561988157382e-05, + "loss": 0.0314, + "step": 142860 + }, + { + "epoch": 0.31435, + "grad_norm": 0.05738339200615883, + "learning_rate": 1.03492123199242e-05, + "loss": 0.0332, + "step": 142870 + }, + { + "epoch": 0.3144, + "grad_norm": 0.04844846576452255, + "learning_rate": 1.0345863052238061e-05, + "loss": 0.0329, + "step": 142880 + }, + { + "epoch": 0.31445, + "grad_norm": 0.06852875649929047, + "learning_rate": 1.034251418519052e-05, + "loss": 0.0336, + "step": 142890 + }, + { + "epoch": 0.3145, + "grad_norm": 0.060325898230075836, + "learning_rate": 1.0339165718873122e-05, + "loss": 0.0325, + "step": 142900 + }, + { + "epoch": 0.31455, + "grad_norm": 0.05586904659867287, + "learning_rate": 1.0335817653377436e-05, + "loss": 0.0334, + "step": 142910 + }, + { + "epoch": 0.3146, + "grad_norm": 0.04665564000606537, + "learning_rate": 1.0332469988794977e-05, + "loss": 0.0335, + "step": 142920 + }, + { + "epoch": 0.31465, + "grad_norm": 0.05273238942027092, + "learning_rate": 1.0329122725217288e-05, + "loss": 0.0333, + "step": 142930 + }, + { + "epoch": 0.3147, + "grad_norm": 0.05633804574608803, + "learning_rate": 1.0325775862735873e-05, + "loss": 0.032, + "step": 142940 + }, + { + "epoch": 0.31475, + "grad_norm": 0.044203322380781174, + "learning_rate": 1.0322429401442232e-05, + "loss": 0.033, + "step": 142950 + }, + { + "epoch": 0.3148, + "grad_norm": 0.05241499841213226, + "learning_rate": 1.0319083341427849e-05, + "loss": 0.0342, + "step": 142960 + }, + { + "epoch": 0.31485, + "grad_norm": 0.04896755516529083, + "learning_rate": 1.0315737682784219e-05, + "loss": 0.0334, + "step": 142970 + }, + { + "epoch": 0.3149, + "grad_norm": 0.04177512973546982, + "learning_rate": 1.0312392425602805e-05, + "loss": 0.0323, + "step": 142980 + }, + { + "epoch": 0.31495, + "grad_norm": 0.05134735628962517, + "learning_rate": 1.0309047569975056e-05, + "loss": 0.0331, + "step": 142990 + }, + { + "epoch": 0.315, + "grad_norm": 0.04518589749932289, + "learning_rate": 1.0305703115992434e-05, + "loss": 0.0325, + "step": 143000 + }, + { + "epoch": 0.31505, + "grad_norm": 0.06465231627225876, + "learning_rate": 1.0302359063746364e-05, + "loss": 0.0339, + "step": 143010 + }, + { + "epoch": 0.3151, + "grad_norm": 0.05051697790622711, + "learning_rate": 1.0299015413328289e-05, + "loss": 0.0325, + "step": 143020 + }, + { + "epoch": 0.31515, + "grad_norm": 0.05085299164056778, + "learning_rate": 1.0295672164829595e-05, + "loss": 0.0327, + "step": 143030 + }, + { + "epoch": 0.3152, + "grad_norm": 0.06277789920568466, + "learning_rate": 1.0292329318341698e-05, + "loss": 0.0337, + "step": 143040 + }, + { + "epoch": 0.31525, + "grad_norm": 0.05738025903701782, + "learning_rate": 1.0288986873955999e-05, + "loss": 0.0344, + "step": 143050 + }, + { + "epoch": 0.3153, + "grad_norm": 0.04553530365228653, + "learning_rate": 1.0285644831763876e-05, + "loss": 0.0347, + "step": 143060 + }, + { + "epoch": 0.31535, + "grad_norm": 0.06229349598288536, + "learning_rate": 1.0282303191856696e-05, + "loss": 0.0327, + "step": 143070 + }, + { + "epoch": 0.3154, + "grad_norm": 0.055931445211172104, + "learning_rate": 1.0278961954325805e-05, + "loss": 0.0332, + "step": 143080 + }, + { + "epoch": 0.31545, + "grad_norm": 0.05285884067416191, + "learning_rate": 1.0275621119262565e-05, + "loss": 0.0326, + "step": 143090 + }, + { + "epoch": 0.3155, + "grad_norm": 0.055675894021987915, + "learning_rate": 1.0272280686758332e-05, + "loss": 0.0326, + "step": 143100 + }, + { + "epoch": 0.31555, + "grad_norm": 0.054981812834739685, + "learning_rate": 1.0268940656904392e-05, + "loss": 0.0329, + "step": 143110 + }, + { + "epoch": 0.3156, + "grad_norm": 0.05291634425520897, + "learning_rate": 1.0265601029792088e-05, + "loss": 0.0324, + "step": 143120 + }, + { + "epoch": 0.31565, + "grad_norm": 0.05016390606760979, + "learning_rate": 1.026226180551271e-05, + "loss": 0.036, + "step": 143130 + }, + { + "epoch": 0.3157, + "grad_norm": 0.05711352080106735, + "learning_rate": 1.0258922984157566e-05, + "loss": 0.0338, + "step": 143140 + }, + { + "epoch": 0.31575, + "grad_norm": 0.04906103014945984, + "learning_rate": 1.0255584565817928e-05, + "loss": 0.0339, + "step": 143150 + }, + { + "epoch": 0.3158, + "grad_norm": 0.049137938767671585, + "learning_rate": 1.0252246550585059e-05, + "loss": 0.0336, + "step": 143160 + }, + { + "epoch": 0.31585, + "grad_norm": 0.0545496866106987, + "learning_rate": 1.0248908938550242e-05, + "loss": 0.0343, + "step": 143170 + }, + { + "epoch": 0.3159, + "grad_norm": 0.05843012407422066, + "learning_rate": 1.024557172980471e-05, + "loss": 0.0325, + "step": 143180 + }, + { + "epoch": 0.31595, + "grad_norm": 0.058799244463443756, + "learning_rate": 1.0242234924439703e-05, + "loss": 0.0346, + "step": 143190 + }, + { + "epoch": 0.316, + "grad_norm": 0.057011742144823074, + "learning_rate": 1.0238898522546442e-05, + "loss": 0.0341, + "step": 143200 + }, + { + "epoch": 0.31605, + "grad_norm": 0.05914317071437836, + "learning_rate": 1.0235562524216158e-05, + "loss": 0.0356, + "step": 143210 + }, + { + "epoch": 0.3161, + "grad_norm": 0.05741539224982262, + "learning_rate": 1.023222692954004e-05, + "loss": 0.0342, + "step": 143220 + }, + { + "epoch": 0.31615, + "grad_norm": 0.06138933077454567, + "learning_rate": 1.0228891738609298e-05, + "loss": 0.035, + "step": 143230 + }, + { + "epoch": 0.3162, + "grad_norm": 0.057428572326898575, + "learning_rate": 1.0225556951515106e-05, + "loss": 0.034, + "step": 143240 + }, + { + "epoch": 0.31625, + "grad_norm": 0.05353344604372978, + "learning_rate": 1.0222222568348627e-05, + "loss": 0.0354, + "step": 143250 + }, + { + "epoch": 0.3163, + "grad_norm": 0.05361321568489075, + "learning_rate": 1.0218888589201043e-05, + "loss": 0.0338, + "step": 143260 + }, + { + "epoch": 0.31635, + "grad_norm": 0.04561053216457367, + "learning_rate": 1.0215555014163488e-05, + "loss": 0.0341, + "step": 143270 + }, + { + "epoch": 0.3164, + "grad_norm": 0.05260631814599037, + "learning_rate": 1.02122218433271e-05, + "loss": 0.034, + "step": 143280 + }, + { + "epoch": 0.31645, + "grad_norm": 0.0508575513958931, + "learning_rate": 1.0208889076783015e-05, + "loss": 0.0338, + "step": 143290 + }, + { + "epoch": 0.3165, + "grad_norm": 0.05368515104055405, + "learning_rate": 1.0205556714622342e-05, + "loss": 0.0344, + "step": 143300 + }, + { + "epoch": 0.31655, + "grad_norm": 0.04952288419008255, + "learning_rate": 1.0202224756936205e-05, + "loss": 0.0349, + "step": 143310 + }, + { + "epoch": 0.3166, + "grad_norm": 0.051482073962688446, + "learning_rate": 1.0198893203815669e-05, + "loss": 0.0343, + "step": 143320 + }, + { + "epoch": 0.31665, + "grad_norm": 0.08468296378850937, + "learning_rate": 1.0195562055351837e-05, + "loss": 0.0346, + "step": 143330 + }, + { + "epoch": 0.3167, + "grad_norm": 0.059160616248846054, + "learning_rate": 1.0192231311635771e-05, + "loss": 0.0341, + "step": 143340 + }, + { + "epoch": 0.31675, + "grad_norm": 0.05456196144223213, + "learning_rate": 1.0188900972758547e-05, + "loss": 0.0349, + "step": 143350 + }, + { + "epoch": 0.3168, + "grad_norm": 0.05162947624921799, + "learning_rate": 1.0185571038811204e-05, + "loss": 0.0328, + "step": 143360 + }, + { + "epoch": 0.31685, + "grad_norm": 0.058702047914266586, + "learning_rate": 1.0182241509884777e-05, + "loss": 0.0333, + "step": 143370 + }, + { + "epoch": 0.3169, + "grad_norm": 0.0639418363571167, + "learning_rate": 1.0178912386070307e-05, + "loss": 0.0339, + "step": 143380 + }, + { + "epoch": 0.31695, + "grad_norm": 0.06946256756782532, + "learning_rate": 1.0175583667458804e-05, + "loss": 0.0348, + "step": 143390 + }, + { + "epoch": 0.317, + "grad_norm": 0.05914067104458809, + "learning_rate": 1.0172255354141278e-05, + "loss": 0.0336, + "step": 143400 + }, + { + "epoch": 0.31705, + "grad_norm": 0.05531926080584526, + "learning_rate": 1.0168927446208707e-05, + "loss": 0.0333, + "step": 143410 + }, + { + "epoch": 0.3171, + "grad_norm": 0.047997161746025085, + "learning_rate": 1.016559994375209e-05, + "loss": 0.0332, + "step": 143420 + }, + { + "epoch": 0.31715, + "grad_norm": 0.05001961439847946, + "learning_rate": 1.0162272846862405e-05, + "loss": 0.0317, + "step": 143430 + }, + { + "epoch": 0.3172, + "grad_norm": 0.056277453899383545, + "learning_rate": 1.0158946155630608e-05, + "loss": 0.0338, + "step": 143440 + }, + { + "epoch": 0.31725, + "grad_norm": 0.057324331253767014, + "learning_rate": 1.0155619870147645e-05, + "loss": 0.0327, + "step": 143450 + }, + { + "epoch": 0.3173, + "grad_norm": 0.05637969449162483, + "learning_rate": 1.0152293990504452e-05, + "loss": 0.0325, + "step": 143460 + }, + { + "epoch": 0.31735, + "grad_norm": 0.05383256450295448, + "learning_rate": 1.014896851679197e-05, + "loss": 0.0339, + "step": 143470 + }, + { + "epoch": 0.3174, + "grad_norm": 0.05018642917275429, + "learning_rate": 1.0145643449101111e-05, + "loss": 0.0322, + "step": 143480 + }, + { + "epoch": 0.31745, + "grad_norm": 0.04998883977532387, + "learning_rate": 1.014231878752277e-05, + "loss": 0.0312, + "step": 143490 + }, + { + "epoch": 0.3175, + "grad_norm": 0.04939829930663109, + "learning_rate": 1.013899453214786e-05, + "loss": 0.0317, + "step": 143500 + }, + { + "epoch": 0.31755, + "grad_norm": 0.07261301577091217, + "learning_rate": 1.0135670683067247e-05, + "loss": 0.0336, + "step": 143510 + }, + { + "epoch": 0.3176, + "grad_norm": 0.06238599866628647, + "learning_rate": 1.0132347240371835e-05, + "loss": 0.0334, + "step": 143520 + }, + { + "epoch": 0.31765, + "grad_norm": 0.052265483886003494, + "learning_rate": 1.012902420415244e-05, + "loss": 0.0343, + "step": 143530 + }, + { + "epoch": 0.3177, + "grad_norm": 0.057238973677158356, + "learning_rate": 1.012570157449994e-05, + "loss": 0.032, + "step": 143540 + }, + { + "epoch": 0.31775, + "grad_norm": 0.06168012320995331, + "learning_rate": 1.0122379351505179e-05, + "loss": 0.0331, + "step": 143550 + }, + { + "epoch": 0.3178, + "grad_norm": 0.048543546348810196, + "learning_rate": 1.011905753525898e-05, + "loss": 0.0331, + "step": 143560 + }, + { + "epoch": 0.31785, + "grad_norm": 0.05441056936979294, + "learning_rate": 1.0115736125852154e-05, + "loss": 0.0339, + "step": 143570 + }, + { + "epoch": 0.3179, + "grad_norm": 0.05617294833064079, + "learning_rate": 1.0112415123375505e-05, + "loss": 0.0325, + "step": 143580 + }, + { + "epoch": 0.31795, + "grad_norm": 0.05055122449994087, + "learning_rate": 1.0109094527919838e-05, + "loss": 0.0324, + "step": 143590 + }, + { + "epoch": 0.318, + "grad_norm": 0.05427250638604164, + "learning_rate": 1.0105774339575935e-05, + "loss": 0.0322, + "step": 143600 + }, + { + "epoch": 0.31805, + "grad_norm": 0.045203424990177155, + "learning_rate": 1.0102454558434558e-05, + "loss": 0.0317, + "step": 143610 + }, + { + "epoch": 0.3181, + "grad_norm": 0.05851052328944206, + "learning_rate": 1.0099135184586484e-05, + "loss": 0.034, + "step": 143620 + }, + { + "epoch": 0.31815, + "grad_norm": 0.05924437940120697, + "learning_rate": 1.0095816218122447e-05, + "loss": 0.0374, + "step": 143630 + }, + { + "epoch": 0.3182, + "grad_norm": 0.05943276733160019, + "learning_rate": 1.0092497659133205e-05, + "loss": 0.0329, + "step": 143640 + }, + { + "epoch": 0.31825, + "grad_norm": 0.05054665356874466, + "learning_rate": 1.0089179507709476e-05, + "loss": 0.0314, + "step": 143650 + }, + { + "epoch": 0.3183, + "grad_norm": 0.0640970915555954, + "learning_rate": 1.008586176394197e-05, + "loss": 0.0339, + "step": 143660 + }, + { + "epoch": 0.31835, + "grad_norm": 0.04974190518260002, + "learning_rate": 1.0082544427921407e-05, + "loss": 0.0319, + "step": 143670 + }, + { + "epoch": 0.3184, + "grad_norm": 0.057464223355054855, + "learning_rate": 1.0079227499738475e-05, + "loss": 0.032, + "step": 143680 + }, + { + "epoch": 0.31845, + "grad_norm": 0.05096564441919327, + "learning_rate": 1.007591097948386e-05, + "loss": 0.0332, + "step": 143690 + }, + { + "epoch": 0.3185, + "grad_norm": 0.06046581640839577, + "learning_rate": 1.0072594867248223e-05, + "loss": 0.0327, + "step": 143700 + }, + { + "epoch": 0.31855, + "grad_norm": 0.05026427283883095, + "learning_rate": 1.0069279163122241e-05, + "loss": 0.0323, + "step": 143710 + }, + { + "epoch": 0.3186, + "grad_norm": 0.05407608672976494, + "learning_rate": 1.0065963867196552e-05, + "loss": 0.0348, + "step": 143720 + }, + { + "epoch": 0.31865, + "grad_norm": 0.046073343604803085, + "learning_rate": 1.0062648979561806e-05, + "loss": 0.0326, + "step": 143730 + }, + { + "epoch": 0.3187, + "grad_norm": 0.055437296628952026, + "learning_rate": 1.0059334500308626e-05, + "loss": 0.0349, + "step": 143740 + }, + { + "epoch": 0.31875, + "grad_norm": 0.06322011351585388, + "learning_rate": 1.005602042952762e-05, + "loss": 0.0345, + "step": 143750 + }, + { + "epoch": 0.3188, + "grad_norm": 0.05501188337802887, + "learning_rate": 1.0052706767309411e-05, + "loss": 0.0333, + "step": 143760 + }, + { + "epoch": 0.31885, + "grad_norm": 0.04927259311079979, + "learning_rate": 1.0049393513744581e-05, + "loss": 0.0338, + "step": 143770 + }, + { + "epoch": 0.3189, + "grad_norm": 0.05020604282617569, + "learning_rate": 1.0046080668923717e-05, + "loss": 0.035, + "step": 143780 + }, + { + "epoch": 0.31895, + "grad_norm": 0.054454952478408813, + "learning_rate": 1.004276823293738e-05, + "loss": 0.0348, + "step": 143790 + }, + { + "epoch": 0.319, + "grad_norm": 0.05578719824552536, + "learning_rate": 1.003945620587614e-05, + "loss": 0.0346, + "step": 143800 + }, + { + "epoch": 0.31905, + "grad_norm": 0.05327814072370529, + "learning_rate": 1.0036144587830568e-05, + "loss": 0.0354, + "step": 143810 + }, + { + "epoch": 0.3191, + "grad_norm": 0.059739600867033005, + "learning_rate": 1.003283337889116e-05, + "loss": 0.0355, + "step": 143820 + }, + { + "epoch": 0.31915, + "grad_norm": 0.05959959328174591, + "learning_rate": 1.0029522579148474e-05, + "loss": 0.0344, + "step": 143830 + }, + { + "epoch": 0.3192, + "grad_norm": 0.057411860674619675, + "learning_rate": 1.0026212188693006e-05, + "loss": 0.0336, + "step": 143840 + }, + { + "epoch": 0.31925, + "grad_norm": 0.06283937394618988, + "learning_rate": 1.0022902207615284e-05, + "loss": 0.036, + "step": 143850 + }, + { + "epoch": 0.3193, + "grad_norm": 0.05707542225718498, + "learning_rate": 1.0019592636005787e-05, + "loss": 0.0334, + "step": 143860 + }, + { + "epoch": 0.31935, + "grad_norm": 0.058691900223493576, + "learning_rate": 1.0016283473954993e-05, + "loss": 0.034, + "step": 143870 + }, + { + "epoch": 0.3194, + "grad_norm": 0.06506255269050598, + "learning_rate": 1.0012974721553386e-05, + "loss": 0.034, + "step": 143880 + }, + { + "epoch": 0.31945, + "grad_norm": 0.052746180444955826, + "learning_rate": 1.0009666378891419e-05, + "loss": 0.033, + "step": 143890 + }, + { + "epoch": 0.3195, + "grad_norm": 0.0566536970436573, + "learning_rate": 1.0006358446059544e-05, + "loss": 0.0337, + "step": 143900 + }, + { + "epoch": 0.31955, + "grad_norm": 0.07616173475980759, + "learning_rate": 1.0003050923148186e-05, + "loss": 0.0343, + "step": 143910 + }, + { + "epoch": 0.3196, + "grad_norm": 0.08153429627418518, + "learning_rate": 9.999743810247783e-06, + "loss": 0.0344, + "step": 143920 + }, + { + "epoch": 0.31965, + "grad_norm": 0.057439543306827545, + "learning_rate": 9.996437107448756e-06, + "loss": 0.0338, + "step": 143930 + }, + { + "epoch": 0.3197, + "grad_norm": 0.057741016149520874, + "learning_rate": 9.993130814841504e-06, + "loss": 0.0329, + "step": 143940 + }, + { + "epoch": 0.31975, + "grad_norm": 0.04677318036556244, + "learning_rate": 9.989824932516415e-06, + "loss": 0.0347, + "step": 143950 + }, + { + "epoch": 0.3198, + "grad_norm": 0.05746036767959595, + "learning_rate": 9.986519460563864e-06, + "loss": 0.0326, + "step": 143960 + }, + { + "epoch": 0.31985, + "grad_norm": 0.04599759727716446, + "learning_rate": 9.983214399074241e-06, + "loss": 0.0318, + "step": 143970 + }, + { + "epoch": 0.3199, + "grad_norm": 0.0466180220246315, + "learning_rate": 9.979909748137897e-06, + "loss": 0.0328, + "step": 143980 + }, + { + "epoch": 0.31995, + "grad_norm": 0.05364130064845085, + "learning_rate": 9.976605507845165e-06, + "loss": 0.0329, + "step": 143990 + }, + { + "epoch": 0.32, + "grad_norm": 0.050435539335012436, + "learning_rate": 9.973301678286406e-06, + "loss": 0.034, + "step": 144000 + }, + { + "epoch": 0.32005, + "grad_norm": 0.05588700622320175, + "learning_rate": 9.969998259551924e-06, + "loss": 0.0341, + "step": 144010 + }, + { + "epoch": 0.3201, + "grad_norm": 0.05198369920253754, + "learning_rate": 9.966695251732061e-06, + "loss": 0.0336, + "step": 144020 + }, + { + "epoch": 0.32015, + "grad_norm": 0.05904610827565193, + "learning_rate": 9.963392654917084e-06, + "loss": 0.0356, + "step": 144030 + }, + { + "epoch": 0.3202, + "grad_norm": 0.0924912765622139, + "learning_rate": 9.960090469197303e-06, + "loss": 0.0347, + "step": 144040 + }, + { + "epoch": 0.32025, + "grad_norm": 0.06023856997489929, + "learning_rate": 9.956788694663007e-06, + "loss": 0.0333, + "step": 144050 + }, + { + "epoch": 0.3203, + "grad_norm": 0.06063982471823692, + "learning_rate": 9.953487331404456e-06, + "loss": 0.0341, + "step": 144060 + }, + { + "epoch": 0.32035, + "grad_norm": 0.06193577125668526, + "learning_rate": 9.950186379511911e-06, + "loss": 0.0334, + "step": 144070 + }, + { + "epoch": 0.3204, + "grad_norm": 0.060056522488594055, + "learning_rate": 9.946885839075607e-06, + "loss": 0.0346, + "step": 144080 + }, + { + "epoch": 0.32045, + "grad_norm": 0.05272936448454857, + "learning_rate": 9.943585710185796e-06, + "loss": 0.0339, + "step": 144090 + }, + { + "epoch": 0.3205, + "grad_norm": 0.05106258764863014, + "learning_rate": 9.940285992932696e-06, + "loss": 0.0342, + "step": 144100 + }, + { + "epoch": 0.32055, + "grad_norm": 0.05771468207240105, + "learning_rate": 9.93698668740651e-06, + "loss": 0.0339, + "step": 144110 + }, + { + "epoch": 0.3206, + "grad_norm": 0.060430724173784256, + "learning_rate": 9.93368779369746e-06, + "loss": 0.0346, + "step": 144120 + }, + { + "epoch": 0.32065, + "grad_norm": 0.05532369762659073, + "learning_rate": 9.930389311895716e-06, + "loss": 0.033, + "step": 144130 + }, + { + "epoch": 0.3207, + "grad_norm": 0.050530027598142624, + "learning_rate": 9.927091242091475e-06, + "loss": 0.0333, + "step": 144140 + }, + { + "epoch": 0.32075, + "grad_norm": 0.046072810888290405, + "learning_rate": 9.923793584374897e-06, + "loss": 0.0323, + "step": 144150 + }, + { + "epoch": 0.3208, + "grad_norm": 0.05474727600812912, + "learning_rate": 9.920496338836135e-06, + "loss": 0.0338, + "step": 144160 + }, + { + "epoch": 0.32085, + "grad_norm": 0.07041291892528534, + "learning_rate": 9.917199505565333e-06, + "loss": 0.0327, + "step": 144170 + }, + { + "epoch": 0.3209, + "grad_norm": 0.05222819000482559, + "learning_rate": 9.91390308465264e-06, + "loss": 0.0336, + "step": 144180 + }, + { + "epoch": 0.32095, + "grad_norm": 0.05012571066617966, + "learning_rate": 9.910607076188166e-06, + "loss": 0.0333, + "step": 144190 + }, + { + "epoch": 0.321, + "grad_norm": 0.05344976484775543, + "learning_rate": 9.907311480262019e-06, + "loss": 0.0331, + "step": 144200 + }, + { + "epoch": 0.32105, + "grad_norm": 0.06622076779603958, + "learning_rate": 9.904016296964314e-06, + "loss": 0.0343, + "step": 144210 + }, + { + "epoch": 0.3211, + "grad_norm": 0.06994125247001648, + "learning_rate": 9.900721526385122e-06, + "loss": 0.0333, + "step": 144220 + }, + { + "epoch": 0.32115, + "grad_norm": 0.06810478121042252, + "learning_rate": 9.897427168614542e-06, + "loss": 0.0336, + "step": 144230 + }, + { + "epoch": 0.3212, + "grad_norm": 0.06874731183052063, + "learning_rate": 9.894133223742629e-06, + "loss": 0.035, + "step": 144240 + }, + { + "epoch": 0.32125, + "grad_norm": 0.08079435676336288, + "learning_rate": 9.89083969185943e-06, + "loss": 0.0346, + "step": 144250 + }, + { + "epoch": 0.3213, + "grad_norm": 0.055063385516405106, + "learning_rate": 9.887546573055006e-06, + "loss": 0.0331, + "step": 144260 + }, + { + "epoch": 0.32135, + "grad_norm": 0.04513436555862427, + "learning_rate": 9.884253867419383e-06, + "loss": 0.0326, + "step": 144270 + }, + { + "epoch": 0.3214, + "grad_norm": 0.0472201369702816, + "learning_rate": 9.880961575042578e-06, + "loss": 0.0323, + "step": 144280 + }, + { + "epoch": 0.32145, + "grad_norm": 0.04506751522421837, + "learning_rate": 9.877669696014593e-06, + "loss": 0.0338, + "step": 144290 + }, + { + "epoch": 0.3215, + "grad_norm": 0.046989090740680695, + "learning_rate": 9.87437823042544e-06, + "loss": 0.034, + "step": 144300 + }, + { + "epoch": 0.32155, + "grad_norm": 0.05014864727854729, + "learning_rate": 9.871087178365124e-06, + "loss": 0.0323, + "step": 144310 + }, + { + "epoch": 0.3216, + "grad_norm": 0.05255356431007385, + "learning_rate": 9.86779653992358e-06, + "loss": 0.0344, + "step": 144320 + }, + { + "epoch": 0.32165, + "grad_norm": 0.05546373128890991, + "learning_rate": 9.864506315190802e-06, + "loss": 0.0338, + "step": 144330 + }, + { + "epoch": 0.3217, + "grad_norm": 0.059057652950286865, + "learning_rate": 9.861216504256728e-06, + "loss": 0.0321, + "step": 144340 + }, + { + "epoch": 0.32175, + "grad_norm": 0.0549306720495224, + "learning_rate": 9.857927107211315e-06, + "loss": 0.0324, + "step": 144350 + }, + { + "epoch": 0.3218, + "grad_norm": 0.05145009234547615, + "learning_rate": 9.854638124144489e-06, + "loss": 0.0326, + "step": 144360 + }, + { + "epoch": 0.32185, + "grad_norm": 0.04620020091533661, + "learning_rate": 9.851349555146153e-06, + "loss": 0.0336, + "step": 144370 + }, + { + "epoch": 0.3219, + "grad_norm": 0.06289909034967422, + "learning_rate": 9.848061400306241e-06, + "loss": 0.0349, + "step": 144380 + }, + { + "epoch": 0.32195, + "grad_norm": 0.060509249567985535, + "learning_rate": 9.844773659714637e-06, + "loss": 0.0338, + "step": 144390 + }, + { + "epoch": 0.322, + "grad_norm": 0.05424680933356285, + "learning_rate": 9.84148633346123e-06, + "loss": 0.0343, + "step": 144400 + }, + { + "epoch": 0.32205, + "grad_norm": 0.04965484142303467, + "learning_rate": 9.838199421635883e-06, + "loss": 0.0335, + "step": 144410 + }, + { + "epoch": 0.3221, + "grad_norm": 0.05325651541352272, + "learning_rate": 9.834912924328474e-06, + "loss": 0.034, + "step": 144420 + }, + { + "epoch": 0.32215, + "grad_norm": 0.05906248837709427, + "learning_rate": 9.831626841628842e-06, + "loss": 0.0342, + "step": 144430 + }, + { + "epoch": 0.3222, + "grad_norm": 0.05295049399137497, + "learning_rate": 9.82834117362684e-06, + "loss": 0.033, + "step": 144440 + }, + { + "epoch": 0.32225, + "grad_norm": 0.048718854784965515, + "learning_rate": 9.825055920412291e-06, + "loss": 0.0327, + "step": 144450 + }, + { + "epoch": 0.3223, + "grad_norm": 0.04378309100866318, + "learning_rate": 9.821771082075004e-06, + "loss": 0.0326, + "step": 144460 + }, + { + "epoch": 0.32235, + "grad_norm": 0.0534726157784462, + "learning_rate": 9.818486658704801e-06, + "loss": 0.0336, + "step": 144470 + }, + { + "epoch": 0.3224, + "grad_norm": 0.04605276510119438, + "learning_rate": 9.815202650391473e-06, + "loss": 0.0318, + "step": 144480 + }, + { + "epoch": 0.32245, + "grad_norm": 0.057808876037597656, + "learning_rate": 9.811919057224786e-06, + "loss": 0.0329, + "step": 144490 + }, + { + "epoch": 0.3225, + "grad_norm": 0.052570734173059464, + "learning_rate": 9.808635879294539e-06, + "loss": 0.0345, + "step": 144500 + }, + { + "epoch": 0.32255, + "grad_norm": 0.04239342734217644, + "learning_rate": 9.805353116690468e-06, + "loss": 0.0327, + "step": 144510 + }, + { + "epoch": 0.3226, + "grad_norm": 0.06058270484209061, + "learning_rate": 9.802070769502355e-06, + "loss": 0.0329, + "step": 144520 + }, + { + "epoch": 0.32265, + "grad_norm": 0.05767593905329704, + "learning_rate": 9.798788837819898e-06, + "loss": 0.0316, + "step": 144530 + }, + { + "epoch": 0.3227, + "grad_norm": 0.06306783109903336, + "learning_rate": 9.795507321732853e-06, + "loss": 0.0346, + "step": 144540 + }, + { + "epoch": 0.32275, + "grad_norm": 0.053637877106666565, + "learning_rate": 9.792226221330916e-06, + "loss": 0.0321, + "step": 144550 + }, + { + "epoch": 0.3228, + "grad_norm": 0.0671757236123085, + "learning_rate": 9.788945536703811e-06, + "loss": 0.0334, + "step": 144560 + }, + { + "epoch": 0.32285, + "grad_norm": 0.04795819893479347, + "learning_rate": 9.785665267941218e-06, + "loss": 0.0319, + "step": 144570 + }, + { + "epoch": 0.3229, + "grad_norm": 0.06958995759487152, + "learning_rate": 9.782385415132816e-06, + "loss": 0.0355, + "step": 144580 + }, + { + "epoch": 0.32295, + "grad_norm": 0.056665509939193726, + "learning_rate": 9.779105978368285e-06, + "loss": 0.0321, + "step": 144590 + }, + { + "epoch": 0.323, + "grad_norm": 0.05132095888257027, + "learning_rate": 9.77582695773728e-06, + "loss": 0.0332, + "step": 144600 + }, + { + "epoch": 0.32305, + "grad_norm": 0.06006599962711334, + "learning_rate": 9.772548353329446e-06, + "loss": 0.0334, + "step": 144610 + }, + { + "epoch": 0.3231, + "grad_norm": 0.05475315824151039, + "learning_rate": 9.769270165234413e-06, + "loss": 0.0342, + "step": 144620 + }, + { + "epoch": 0.32315, + "grad_norm": 0.0589086152613163, + "learning_rate": 9.76599239354181e-06, + "loss": 0.033, + "step": 144630 + }, + { + "epoch": 0.3232, + "grad_norm": 0.05416763201355934, + "learning_rate": 9.762715038341258e-06, + "loss": 0.0355, + "step": 144640 + }, + { + "epoch": 0.32325, + "grad_norm": 0.04965779185295105, + "learning_rate": 9.759438099722353e-06, + "loss": 0.0329, + "step": 144650 + }, + { + "epoch": 0.3233, + "grad_norm": 0.04819157347083092, + "learning_rate": 9.756161577774688e-06, + "loss": 0.0324, + "step": 144660 + }, + { + "epoch": 0.32335, + "grad_norm": 0.04757603630423546, + "learning_rate": 9.752885472587828e-06, + "loss": 0.032, + "step": 144670 + }, + { + "epoch": 0.3234, + "grad_norm": 0.05028046295046806, + "learning_rate": 9.74960978425136e-06, + "loss": 0.0323, + "step": 144680 + }, + { + "epoch": 0.32345, + "grad_norm": 0.05156344920396805, + "learning_rate": 9.746334512854832e-06, + "loss": 0.037, + "step": 144690 + }, + { + "epoch": 0.3235, + "grad_norm": 0.049711503088474274, + "learning_rate": 9.743059658487777e-06, + "loss": 0.0326, + "step": 144700 + }, + { + "epoch": 0.32355, + "grad_norm": 0.06333158165216446, + "learning_rate": 9.73978522123975e-06, + "loss": 0.0336, + "step": 144710 + }, + { + "epoch": 0.3236, + "grad_norm": 0.06581209599971771, + "learning_rate": 9.736511201200251e-06, + "loss": 0.0347, + "step": 144720 + }, + { + "epoch": 0.32365, + "grad_norm": 0.06127002835273743, + "learning_rate": 9.733237598458821e-06, + "loss": 0.033, + "step": 144730 + }, + { + "epoch": 0.3237, + "grad_norm": 0.05636242777109146, + "learning_rate": 9.729964413104922e-06, + "loss": 0.033, + "step": 144740 + }, + { + "epoch": 0.32375, + "grad_norm": 0.06685664504766464, + "learning_rate": 9.72669164522806e-06, + "loss": 0.0327, + "step": 144750 + }, + { + "epoch": 0.3238, + "grad_norm": 0.051527686417102814, + "learning_rate": 9.723419294917719e-06, + "loss": 0.0315, + "step": 144760 + }, + { + "epoch": 0.32385, + "grad_norm": 0.04837772250175476, + "learning_rate": 9.720147362263354e-06, + "loss": 0.0321, + "step": 144770 + }, + { + "epoch": 0.3239, + "grad_norm": 0.05207151919603348, + "learning_rate": 9.71687584735442e-06, + "loss": 0.0316, + "step": 144780 + }, + { + "epoch": 0.32395, + "grad_norm": 0.045554231852293015, + "learning_rate": 9.71360475028035e-06, + "loss": 0.0314, + "step": 144790 + }, + { + "epoch": 0.324, + "grad_norm": 0.04364077001810074, + "learning_rate": 9.710334071130595e-06, + "loss": 0.0338, + "step": 144800 + }, + { + "epoch": 0.32405, + "grad_norm": 0.04708629101514816, + "learning_rate": 9.707063809994557e-06, + "loss": 0.0355, + "step": 144810 + }, + { + "epoch": 0.3241, + "grad_norm": 0.04998410493135452, + "learning_rate": 9.703793966961645e-06, + "loss": 0.0324, + "step": 144820 + }, + { + "epoch": 0.32415, + "grad_norm": 0.05050116032361984, + "learning_rate": 9.700524542121266e-06, + "loss": 0.0326, + "step": 144830 + }, + { + "epoch": 0.3242, + "grad_norm": 0.047319792211055756, + "learning_rate": 9.697255535562791e-06, + "loss": 0.0326, + "step": 144840 + }, + { + "epoch": 0.32425, + "grad_norm": 0.05137254297733307, + "learning_rate": 9.693986947375608e-06, + "loss": 0.0317, + "step": 144850 + }, + { + "epoch": 0.3243, + "grad_norm": 0.05128312483429909, + "learning_rate": 9.690718777649071e-06, + "loss": 0.0326, + "step": 144860 + }, + { + "epoch": 0.32435, + "grad_norm": 0.05278492346405983, + "learning_rate": 9.687451026472525e-06, + "loss": 0.0318, + "step": 144870 + }, + { + "epoch": 0.3244, + "grad_norm": 0.05346215143799782, + "learning_rate": 9.68418369393532e-06, + "loss": 0.0324, + "step": 144880 + }, + { + "epoch": 0.32445, + "grad_norm": 0.04384399205446243, + "learning_rate": 9.680916780126783e-06, + "loss": 0.0333, + "step": 144890 + }, + { + "epoch": 0.3245, + "grad_norm": 0.050685182213783264, + "learning_rate": 9.677650285136225e-06, + "loss": 0.0343, + "step": 144900 + }, + { + "epoch": 0.32455, + "grad_norm": 0.0474894642829895, + "learning_rate": 9.674384209052942e-06, + "loss": 0.0332, + "step": 144910 + }, + { + "epoch": 0.3246, + "grad_norm": 0.04734433814883232, + "learning_rate": 9.671118551966246e-06, + "loss": 0.0334, + "step": 144920 + }, + { + "epoch": 0.32465, + "grad_norm": 0.04418141022324562, + "learning_rate": 9.667853313965402e-06, + "loss": 0.0322, + "step": 144930 + }, + { + "epoch": 0.3247, + "grad_norm": 0.0511958971619606, + "learning_rate": 9.664588495139695e-06, + "loss": 0.0316, + "step": 144940 + }, + { + "epoch": 0.32475, + "grad_norm": 0.05163023620843887, + "learning_rate": 9.66132409557838e-06, + "loss": 0.0329, + "step": 144950 + }, + { + "epoch": 0.3248, + "grad_norm": 0.05425497889518738, + "learning_rate": 9.658060115370692e-06, + "loss": 0.0341, + "step": 144960 + }, + { + "epoch": 0.32485, + "grad_norm": 0.0633278489112854, + "learning_rate": 9.654796554605886e-06, + "loss": 0.0336, + "step": 144970 + }, + { + "epoch": 0.3249, + "grad_norm": 0.056204576045274734, + "learning_rate": 9.651533413373179e-06, + "loss": 0.035, + "step": 144980 + }, + { + "epoch": 0.32495, + "grad_norm": 0.05580423027276993, + "learning_rate": 9.64827069176178e-06, + "loss": 0.0334, + "step": 144990 + }, + { + "epoch": 0.325, + "grad_norm": 0.04533779248595238, + "learning_rate": 9.645008389860882e-06, + "loss": 0.0347, + "step": 145000 + }, + { + "epoch": 0.32505, + "grad_norm": 0.04800092428922653, + "learning_rate": 9.64174650775969e-06, + "loss": 0.0349, + "step": 145010 + }, + { + "epoch": 0.3251, + "grad_norm": 0.051888130605220795, + "learning_rate": 9.638485045547394e-06, + "loss": 0.0325, + "step": 145020 + }, + { + "epoch": 0.32515, + "grad_norm": 0.0473293699324131, + "learning_rate": 9.635224003313128e-06, + "loss": 0.0327, + "step": 145030 + }, + { + "epoch": 0.3252, + "grad_norm": 0.05297275632619858, + "learning_rate": 9.631963381146075e-06, + "loss": 0.0326, + "step": 145040 + }, + { + "epoch": 0.32525, + "grad_norm": 0.06034578010439873, + "learning_rate": 9.628703179135362e-06, + "loss": 0.0339, + "step": 145050 + }, + { + "epoch": 0.3253, + "grad_norm": 0.05310709774494171, + "learning_rate": 9.625443397370137e-06, + "loss": 0.0335, + "step": 145060 + }, + { + "epoch": 0.32535, + "grad_norm": 0.05877617746591568, + "learning_rate": 9.622184035939513e-06, + "loss": 0.0342, + "step": 145070 + }, + { + "epoch": 0.3254, + "grad_norm": 0.06464612483978271, + "learning_rate": 9.618925094932594e-06, + "loss": 0.0332, + "step": 145080 + }, + { + "epoch": 0.32545, + "grad_norm": 0.048762571066617966, + "learning_rate": 9.615666574438492e-06, + "loss": 0.0334, + "step": 145090 + }, + { + "epoch": 0.3255, + "grad_norm": 0.04977282136678696, + "learning_rate": 9.612408474546287e-06, + "loss": 0.0337, + "step": 145100 + }, + { + "epoch": 0.32555, + "grad_norm": 0.048385247588157654, + "learning_rate": 9.609150795345051e-06, + "loss": 0.0325, + "step": 145110 + }, + { + "epoch": 0.3256, + "grad_norm": 0.04673107713460922, + "learning_rate": 9.605893536923841e-06, + "loss": 0.0329, + "step": 145120 + }, + { + "epoch": 0.32565, + "grad_norm": 0.04676985740661621, + "learning_rate": 9.60263669937172e-06, + "loss": 0.033, + "step": 145130 + }, + { + "epoch": 0.3257, + "grad_norm": 0.048509273678064346, + "learning_rate": 9.599380282777737e-06, + "loss": 0.0325, + "step": 145140 + }, + { + "epoch": 0.32575, + "grad_norm": 0.07794582843780518, + "learning_rate": 9.596124287230909e-06, + "loss": 0.0328, + "step": 145150 + }, + { + "epoch": 0.3258, + "grad_norm": 0.06084432825446129, + "learning_rate": 9.592868712820258e-06, + "loss": 0.034, + "step": 145160 + }, + { + "epoch": 0.32585, + "grad_norm": 0.050786975771188736, + "learning_rate": 9.589613559634778e-06, + "loss": 0.0334, + "step": 145170 + }, + { + "epoch": 0.3259, + "grad_norm": 0.050088442862033844, + "learning_rate": 9.586358827763483e-06, + "loss": 0.0336, + "step": 145180 + }, + { + "epoch": 0.32595, + "grad_norm": 0.058238424360752106, + "learning_rate": 9.58310451729535e-06, + "loss": 0.0325, + "step": 145190 + }, + { + "epoch": 0.326, + "grad_norm": 0.04916360601782799, + "learning_rate": 9.579850628319334e-06, + "loss": 0.0336, + "step": 145200 + }, + { + "epoch": 0.32605, + "grad_norm": 0.07092466205358505, + "learning_rate": 9.57659716092442e-06, + "loss": 0.0329, + "step": 145210 + }, + { + "epoch": 0.3261, + "grad_norm": 0.056777022778987885, + "learning_rate": 9.573344115199538e-06, + "loss": 0.0327, + "step": 145220 + }, + { + "epoch": 0.32615, + "grad_norm": 0.05091371014714241, + "learning_rate": 9.570091491233646e-06, + "loss": 0.0328, + "step": 145230 + }, + { + "epoch": 0.3262, + "grad_norm": 0.05490479990839958, + "learning_rate": 9.566839289115643e-06, + "loss": 0.0329, + "step": 145240 + }, + { + "epoch": 0.32625, + "grad_norm": 0.06113282963633537, + "learning_rate": 9.563587508934451e-06, + "loss": 0.033, + "step": 145250 + }, + { + "epoch": 0.3263, + "grad_norm": 0.049865081906318665, + "learning_rate": 9.560336150778989e-06, + "loss": 0.0334, + "step": 145260 + }, + { + "epoch": 0.32635, + "grad_norm": 0.06664572656154633, + "learning_rate": 9.557085214738135e-06, + "loss": 0.0341, + "step": 145270 + }, + { + "epoch": 0.3264, + "grad_norm": 0.06306932121515274, + "learning_rate": 9.553834700900771e-06, + "loss": 0.0332, + "step": 145280 + }, + { + "epoch": 0.32645, + "grad_norm": 0.06288663297891617, + "learning_rate": 9.550584609355754e-06, + "loss": 0.034, + "step": 145290 + }, + { + "epoch": 0.3265, + "grad_norm": 0.07462607324123383, + "learning_rate": 9.547334940191957e-06, + "loss": 0.0357, + "step": 145300 + }, + { + "epoch": 0.32655, + "grad_norm": 0.05369102209806442, + "learning_rate": 9.54408569349822e-06, + "loss": 0.0319, + "step": 145310 + }, + { + "epoch": 0.3266, + "grad_norm": 0.04382164403796196, + "learning_rate": 9.540836869363365e-06, + "loss": 0.0321, + "step": 145320 + }, + { + "epoch": 0.32665, + "grad_norm": 0.04398827627301216, + "learning_rate": 9.53758846787623e-06, + "loss": 0.0315, + "step": 145330 + }, + { + "epoch": 0.3267, + "grad_norm": 0.046040408313274384, + "learning_rate": 9.534340489125607e-06, + "loss": 0.0324, + "step": 145340 + }, + { + "epoch": 0.32675, + "grad_norm": 0.048852741718292236, + "learning_rate": 9.531092933200314e-06, + "loss": 0.0322, + "step": 145350 + }, + { + "epoch": 0.3268, + "grad_norm": 0.05130552500486374, + "learning_rate": 9.52784580018913e-06, + "loss": 0.0319, + "step": 145360 + }, + { + "epoch": 0.32685, + "grad_norm": 0.055272601544857025, + "learning_rate": 9.52459909018083e-06, + "loss": 0.0336, + "step": 145370 + }, + { + "epoch": 0.3269, + "grad_norm": 0.049018390476703644, + "learning_rate": 9.521352803264167e-06, + "loss": 0.0313, + "step": 145380 + }, + { + "epoch": 0.32695, + "grad_norm": 0.045241452753543854, + "learning_rate": 9.518106939527913e-06, + "loss": 0.0313, + "step": 145390 + }, + { + "epoch": 0.327, + "grad_norm": 0.04454249516129494, + "learning_rate": 9.514861499060795e-06, + "loss": 0.0319, + "step": 145400 + }, + { + "epoch": 0.32705, + "grad_norm": 0.08072825521230698, + "learning_rate": 9.511616481951543e-06, + "loss": 0.0325, + "step": 145410 + }, + { + "epoch": 0.3271, + "grad_norm": 0.06192736327648163, + "learning_rate": 9.508371888288884e-06, + "loss": 0.034, + "step": 145420 + }, + { + "epoch": 0.32715, + "grad_norm": 0.05790010839700699, + "learning_rate": 9.505127718161508e-06, + "loss": 0.0338, + "step": 145430 + }, + { + "epoch": 0.3272, + "grad_norm": 0.06184834986925125, + "learning_rate": 9.50188397165813e-06, + "loss": 0.0339, + "step": 145440 + }, + { + "epoch": 0.32725, + "grad_norm": 0.05743928998708725, + "learning_rate": 9.498640648867418e-06, + "loss": 0.033, + "step": 145450 + }, + { + "epoch": 0.3273, + "grad_norm": 0.059182487428188324, + "learning_rate": 9.495397749878044e-06, + "loss": 0.0349, + "step": 145460 + }, + { + "epoch": 0.32735, + "grad_norm": 0.05375044420361519, + "learning_rate": 9.492155274778675e-06, + "loss": 0.0318, + "step": 145470 + }, + { + "epoch": 0.3274, + "grad_norm": 0.04980049282312393, + "learning_rate": 9.488913223657956e-06, + "loss": 0.0317, + "step": 145480 + }, + { + "epoch": 0.32745, + "grad_norm": 0.04385017976164818, + "learning_rate": 9.485671596604523e-06, + "loss": 0.0327, + "step": 145490 + }, + { + "epoch": 0.3275, + "grad_norm": 0.05428243428468704, + "learning_rate": 9.48243039370699e-06, + "loss": 0.0341, + "step": 145500 + }, + { + "epoch": 0.32755, + "grad_norm": 0.04685281217098236, + "learning_rate": 9.47918961505398e-06, + "loss": 0.0318, + "step": 145510 + }, + { + "epoch": 0.3276, + "grad_norm": 0.04714469239115715, + "learning_rate": 9.475949260734115e-06, + "loss": 0.0328, + "step": 145520 + }, + { + "epoch": 0.32765, + "grad_norm": 0.04670100286602974, + "learning_rate": 9.472709330835946e-06, + "loss": 0.033, + "step": 145530 + }, + { + "epoch": 0.3277, + "grad_norm": 0.06322925537824631, + "learning_rate": 9.46946982544808e-06, + "loss": 0.0334, + "step": 145540 + }, + { + "epoch": 0.32775, + "grad_norm": 0.05059336498379707, + "learning_rate": 9.466230744659063e-06, + "loss": 0.0334, + "step": 145550 + }, + { + "epoch": 0.3278, + "grad_norm": 0.04755368083715439, + "learning_rate": 9.462992088557473e-06, + "loss": 0.0327, + "step": 145560 + }, + { + "epoch": 0.32785, + "grad_norm": 0.04885302484035492, + "learning_rate": 9.459753857231843e-06, + "loss": 0.0325, + "step": 145570 + }, + { + "epoch": 0.3279, + "grad_norm": 0.0527208149433136, + "learning_rate": 9.456516050770695e-06, + "loss": 0.0328, + "step": 145580 + }, + { + "epoch": 0.32795, + "grad_norm": 0.0456591434776783, + "learning_rate": 9.453278669262566e-06, + "loss": 0.0332, + "step": 145590 + }, + { + "epoch": 0.328, + "grad_norm": 0.04801170900464058, + "learning_rate": 9.450041712795957e-06, + "loss": 0.0322, + "step": 145600 + }, + { + "epoch": 0.32805, + "grad_norm": 0.04043445363640785, + "learning_rate": 9.446805181459368e-06, + "loss": 0.034, + "step": 145610 + }, + { + "epoch": 0.3281, + "grad_norm": 0.05507500097155571, + "learning_rate": 9.443569075341274e-06, + "loss": 0.0347, + "step": 145620 + }, + { + "epoch": 0.32815, + "grad_norm": 0.05315954610705376, + "learning_rate": 9.440333394530163e-06, + "loss": 0.0345, + "step": 145630 + }, + { + "epoch": 0.3282, + "grad_norm": 0.05816284567117691, + "learning_rate": 9.437098139114487e-06, + "loss": 0.0332, + "step": 145640 + }, + { + "epoch": 0.32825, + "grad_norm": 0.0643046498298645, + "learning_rate": 9.433863309182706e-06, + "loss": 0.0351, + "step": 145650 + }, + { + "epoch": 0.3283, + "grad_norm": 0.05089773237705231, + "learning_rate": 9.430628904823255e-06, + "loss": 0.0337, + "step": 145660 + }, + { + "epoch": 0.32835, + "grad_norm": 0.05307083949446678, + "learning_rate": 9.427394926124553e-06, + "loss": 0.0379, + "step": 145670 + }, + { + "epoch": 0.3284, + "grad_norm": 0.056122709065675735, + "learning_rate": 9.42416137317503e-06, + "loss": 0.0353, + "step": 145680 + }, + { + "epoch": 0.32845, + "grad_norm": 0.0508899986743927, + "learning_rate": 9.420928246063085e-06, + "loss": 0.0341, + "step": 145690 + }, + { + "epoch": 0.3285, + "grad_norm": 0.052487220615148544, + "learning_rate": 9.4176955448771e-06, + "loss": 0.0333, + "step": 145700 + }, + { + "epoch": 0.32855, + "grad_norm": 0.05465936288237572, + "learning_rate": 9.414463269705475e-06, + "loss": 0.0342, + "step": 145710 + }, + { + "epoch": 0.3286, + "grad_norm": 0.04915636032819748, + "learning_rate": 9.41123142063656e-06, + "loss": 0.0332, + "step": 145720 + }, + { + "epoch": 0.32865, + "grad_norm": 0.05120699107646942, + "learning_rate": 9.407999997758738e-06, + "loss": 0.0321, + "step": 145730 + }, + { + "epoch": 0.3287, + "grad_norm": 0.047415148466825485, + "learning_rate": 9.404769001160322e-06, + "loss": 0.0325, + "step": 145740 + }, + { + "epoch": 0.32875, + "grad_norm": 0.06523957848548889, + "learning_rate": 9.401538430929669e-06, + "loss": 0.0351, + "step": 145750 + }, + { + "epoch": 0.3288, + "grad_norm": 0.05517665669322014, + "learning_rate": 9.398308287155085e-06, + "loss": 0.032, + "step": 145760 + }, + { + "epoch": 0.32885, + "grad_norm": 0.05780434235930443, + "learning_rate": 9.395078569924906e-06, + "loss": 0.0354, + "step": 145770 + }, + { + "epoch": 0.3289, + "grad_norm": 0.05136639624834061, + "learning_rate": 9.391849279327411e-06, + "loss": 0.0347, + "step": 145780 + }, + { + "epoch": 0.32895, + "grad_norm": 0.07062689960002899, + "learning_rate": 9.388620415450889e-06, + "loss": 0.0334, + "step": 145790 + }, + { + "epoch": 0.329, + "grad_norm": 0.0605822429060936, + "learning_rate": 9.385391978383626e-06, + "loss": 0.0329, + "step": 145800 + }, + { + "epoch": 0.32905, + "grad_norm": 0.0673321783542633, + "learning_rate": 9.382163968213879e-06, + "loss": 0.0335, + "step": 145810 + }, + { + "epoch": 0.3291, + "grad_norm": 0.05488608032464981, + "learning_rate": 9.378936385029901e-06, + "loss": 0.0339, + "step": 145820 + }, + { + "epoch": 0.32915, + "grad_norm": 0.05616806447505951, + "learning_rate": 9.37570922891993e-06, + "loss": 0.0333, + "step": 145830 + }, + { + "epoch": 0.3292, + "grad_norm": 0.039877749979496, + "learning_rate": 9.372482499972196e-06, + "loss": 0.0326, + "step": 145840 + }, + { + "epoch": 0.32925, + "grad_norm": 0.05098028853535652, + "learning_rate": 9.369256198274926e-06, + "loss": 0.0351, + "step": 145850 + }, + { + "epoch": 0.3293, + "grad_norm": 0.04647049680352211, + "learning_rate": 9.366030323916323e-06, + "loss": 0.0342, + "step": 145860 + }, + { + "epoch": 0.32935, + "grad_norm": 0.05865845829248428, + "learning_rate": 9.362804876984573e-06, + "loss": 0.0327, + "step": 145870 + }, + { + "epoch": 0.3294, + "grad_norm": 0.05891422927379608, + "learning_rate": 9.35957985756786e-06, + "loss": 0.0333, + "step": 145880 + }, + { + "epoch": 0.32945, + "grad_norm": 0.06232016533613205, + "learning_rate": 9.356355265754362e-06, + "loss": 0.0314, + "step": 145890 + }, + { + "epoch": 0.3295, + "grad_norm": 0.048157431185245514, + "learning_rate": 9.353131101632238e-06, + "loss": 0.0317, + "step": 145900 + }, + { + "epoch": 0.32955, + "grad_norm": 0.045827120542526245, + "learning_rate": 9.34990736528962e-06, + "loss": 0.0317, + "step": 145910 + }, + { + "epoch": 0.3296, + "grad_norm": 0.05135310813784599, + "learning_rate": 9.346684056814665e-06, + "loss": 0.0344, + "step": 145920 + }, + { + "epoch": 0.32965, + "grad_norm": 0.05288746580481529, + "learning_rate": 9.343461176295476e-06, + "loss": 0.0323, + "step": 145930 + }, + { + "epoch": 0.3297, + "grad_norm": 0.05296698585152626, + "learning_rate": 9.340238723820197e-06, + "loss": 0.0326, + "step": 145940 + }, + { + "epoch": 0.32975, + "grad_norm": 0.054329708218574524, + "learning_rate": 9.337016699476888e-06, + "loss": 0.0318, + "step": 145950 + }, + { + "epoch": 0.3298, + "grad_norm": 0.041853830218315125, + "learning_rate": 9.333795103353659e-06, + "loss": 0.0319, + "step": 145960 + }, + { + "epoch": 0.32985, + "grad_norm": 0.0503230020403862, + "learning_rate": 9.330573935538598e-06, + "loss": 0.0333, + "step": 145970 + }, + { + "epoch": 0.3299, + "grad_norm": 0.0444575697183609, + "learning_rate": 9.327353196119757e-06, + "loss": 0.0324, + "step": 145980 + }, + { + "epoch": 0.32995, + "grad_norm": 0.042210306972265244, + "learning_rate": 9.324132885185192e-06, + "loss": 0.0323, + "step": 145990 + }, + { + "epoch": 0.33, + "grad_norm": 0.04618427902460098, + "learning_rate": 9.320913002822934e-06, + "loss": 0.0314, + "step": 146000 + }, + { + "epoch": 5e-05, + "grad_norm": 0.045645494014024734, + "learning_rate": 9.317693549121034e-06, + "loss": 0.0326, + "step": 146010 + }, + { + "epoch": 0.0001, + "grad_norm": 0.044545263051986694, + "learning_rate": 9.314474524167502e-06, + "loss": 0.0339, + "step": 146020 + }, + { + "epoch": 0.00015, + "grad_norm": 0.05003022029995918, + "learning_rate": 9.311255928050333e-06, + "loss": 0.0341, + "step": 146030 + }, + { + "epoch": 0.0002, + "grad_norm": 0.04548483341932297, + "learning_rate": 9.308037760857544e-06, + "loss": 0.0333, + "step": 146040 + }, + { + "epoch": 0.00025, + "grad_norm": 0.04014163464307785, + "learning_rate": 9.304820022677097e-06, + "loss": 0.0327, + "step": 146050 + }, + { + "epoch": 0.0003, + "grad_norm": 0.0486493743956089, + "learning_rate": 9.301602713596982e-06, + "loss": 0.0332, + "step": 146060 + }, + { + "epoch": 0.00035, + "grad_norm": 0.056167442351579666, + "learning_rate": 9.298385833705153e-06, + "loss": 0.0326, + "step": 146070 + }, + { + "epoch": 0.0004, + "grad_norm": 0.045112937688827515, + "learning_rate": 9.295169383089547e-06, + "loss": 0.0334, + "step": 146080 + }, + { + "epoch": 0.00045, + "grad_norm": 0.05014557018876076, + "learning_rate": 9.291953361838121e-06, + "loss": 0.0329, + "step": 146090 + }, + { + "epoch": 0.0005, + "grad_norm": 0.05027300864458084, + "learning_rate": 9.288737770038785e-06, + "loss": 0.032, + "step": 146100 + }, + { + "epoch": 0.00055, + "grad_norm": 0.05353415012359619, + "learning_rate": 9.28552260777946e-06, + "loss": 0.0334, + "step": 146110 + }, + { + "epoch": 0.0006, + "grad_norm": 0.05639102682471275, + "learning_rate": 9.28230787514803e-06, + "loss": 0.0334, + "step": 146120 + }, + { + "epoch": 0.00065, + "grad_norm": 0.04886871203780174, + "learning_rate": 9.279093572232411e-06, + "loss": 0.0372, + "step": 146130 + }, + { + "epoch": 0.0007, + "grad_norm": 0.0415855348110199, + "learning_rate": 9.275879699120457e-06, + "loss": 0.0322, + "step": 146140 + }, + { + "epoch": 0.00075, + "grad_norm": 0.04586352780461311, + "learning_rate": 9.272666255900053e-06, + "loss": 0.0334, + "step": 146150 + }, + { + "epoch": 0.0008, + "grad_norm": 0.047350455075502396, + "learning_rate": 9.269453242659045e-06, + "loss": 0.0343, + "step": 146160 + }, + { + "epoch": 0.00085, + "grad_norm": 0.04921210557222366, + "learning_rate": 9.266240659485267e-06, + "loss": 0.0346, + "step": 146170 + }, + { + "epoch": 0.0009, + "grad_norm": 0.044031646102666855, + "learning_rate": 9.263028506466567e-06, + "loss": 0.0344, + "step": 146180 + }, + { + "epoch": 0.00095, + "grad_norm": 0.04523542895913124, + "learning_rate": 9.259816783690756e-06, + "loss": 0.0344, + "step": 146190 + }, + { + "epoch": 0.001, + "grad_norm": 0.048036910593509674, + "learning_rate": 9.256605491245642e-06, + "loss": 0.0341, + "step": 146200 + }, + { + "epoch": 0.00105, + "grad_norm": 0.05942752957344055, + "learning_rate": 9.253394629219008e-06, + "loss": 0.0356, + "step": 146210 + }, + { + "epoch": 0.0011, + "grad_norm": 0.05776863545179367, + "learning_rate": 9.250184197698653e-06, + "loss": 0.0336, + "step": 146220 + }, + { + "epoch": 0.00115, + "grad_norm": 0.05581040307879448, + "learning_rate": 9.246974196772359e-06, + "loss": 0.0339, + "step": 146230 + }, + { + "epoch": 0.0012, + "grad_norm": 0.04974944889545441, + "learning_rate": 9.243764626527856e-06, + "loss": 0.0341, + "step": 146240 + }, + { + "epoch": 0.00125, + "grad_norm": 0.054227203130722046, + "learning_rate": 9.240555487052918e-06, + "loss": 0.0351, + "step": 146250 + }, + { + "epoch": 0.0013, + "grad_norm": 0.04993123561143875, + "learning_rate": 9.237346778435264e-06, + "loss": 0.0331, + "step": 146260 + }, + { + "epoch": 0.00135, + "grad_norm": 0.04671796038746834, + "learning_rate": 9.234138500762635e-06, + "loss": 0.0325, + "step": 146270 + }, + { + "epoch": 0.0014, + "grad_norm": 0.06007940322160721, + "learning_rate": 9.230930654122736e-06, + "loss": 0.0346, + "step": 146280 + }, + { + "epoch": 0.00145, + "grad_norm": 0.059051770716905594, + "learning_rate": 9.227723238603262e-06, + "loss": 0.0332, + "step": 146290 + }, + { + "epoch": 0.0015, + "grad_norm": 0.05878492444753647, + "learning_rate": 9.224516254291915e-06, + "loss": 0.0331, + "step": 146300 + }, + { + "epoch": 0.00155, + "grad_norm": 0.05282504856586456, + "learning_rate": 9.22130970127637e-06, + "loss": 0.0324, + "step": 146310 + }, + { + "epoch": 0.0016, + "grad_norm": 0.04789440706372261, + "learning_rate": 9.218103579644289e-06, + "loss": 0.0331, + "step": 146320 + }, + { + "epoch": 0.00165, + "grad_norm": 0.06352198868989944, + "learning_rate": 9.214897889483317e-06, + "loss": 0.0337, + "step": 146330 + }, + { + "epoch": 0.0017, + "grad_norm": 0.06606133282184601, + "learning_rate": 9.211692630881108e-06, + "loss": 0.0331, + "step": 146340 + }, + { + "epoch": 0.00175, + "grad_norm": 0.05790630355477333, + "learning_rate": 9.2084878039253e-06, + "loss": 0.0339, + "step": 146350 + }, + { + "epoch": 0.0018, + "grad_norm": 0.10246631503105164, + "learning_rate": 9.205283408703502e-06, + "loss": 0.0348, + "step": 146360 + }, + { + "epoch": 0.00185, + "grad_norm": 0.1105370968580246, + "learning_rate": 9.202079445303322e-06, + "loss": 0.0325, + "step": 146370 + }, + { + "epoch": 0.0019, + "grad_norm": 0.06171422451734543, + "learning_rate": 9.198875913812346e-06, + "loss": 0.0344, + "step": 146380 + }, + { + "epoch": 0.00195, + "grad_norm": 0.059385087341070175, + "learning_rate": 9.195672814318176e-06, + "loss": 0.0336, + "step": 146390 + }, + { + "epoch": 0.002, + "grad_norm": 0.06413570791482925, + "learning_rate": 9.192470146908375e-06, + "loss": 0.035, + "step": 146400 + }, + { + "epoch": 0.00205, + "grad_norm": 0.0679282695055008, + "learning_rate": 9.18926791167049e-06, + "loss": 0.0338, + "step": 146410 + }, + { + "epoch": 0.0021, + "grad_norm": 0.0589129775762558, + "learning_rate": 9.18606610869209e-06, + "loss": 0.034, + "step": 146420 + }, + { + "epoch": 0.00215, + "grad_norm": 0.05278779938817024, + "learning_rate": 9.182864738060693e-06, + "loss": 0.0327, + "step": 146430 + }, + { + "epoch": 0.0022, + "grad_norm": 0.04730449244379997, + "learning_rate": 9.179663799863849e-06, + "loss": 0.0318, + "step": 146440 + }, + { + "epoch": 0.00225, + "grad_norm": 0.058954108506441116, + "learning_rate": 9.176463294189037e-06, + "loss": 0.0347, + "step": 146450 + }, + { + "epoch": 0.0023, + "grad_norm": 0.050698429346084595, + "learning_rate": 9.173263221123773e-06, + "loss": 0.034, + "step": 146460 + }, + { + "epoch": 0.00235, + "grad_norm": 0.05391349270939827, + "learning_rate": 9.170063580755556e-06, + "loss": 0.0334, + "step": 146470 + }, + { + "epoch": 0.0024, + "grad_norm": 0.05194341391324997, + "learning_rate": 9.166864373171854e-06, + "loss": 0.0326, + "step": 146480 + }, + { + "epoch": 0.00245, + "grad_norm": 0.05862382799386978, + "learning_rate": 9.16366559846013e-06, + "loss": 0.0333, + "step": 146490 + }, + { + "epoch": 0.0025, + "grad_norm": 0.05081118643283844, + "learning_rate": 9.160467256707834e-06, + "loss": 0.0336, + "step": 146500 + }, + { + "epoch": 0.00255, + "grad_norm": 0.051504164934158325, + "learning_rate": 9.157269348002417e-06, + "loss": 0.0342, + "step": 146510 + }, + { + "epoch": 0.0026, + "grad_norm": 0.06455394625663757, + "learning_rate": 9.154071872431305e-06, + "loss": 0.0358, + "step": 146520 + }, + { + "epoch": 0.00265, + "grad_norm": 0.06332694739103317, + "learning_rate": 9.15087483008191e-06, + "loss": 0.033, + "step": 146530 + }, + { + "epoch": 0.0027, + "grad_norm": 0.053663916885852814, + "learning_rate": 9.147678221041647e-06, + "loss": 0.0329, + "step": 146540 + }, + { + "epoch": 0.00275, + "grad_norm": 0.0548773817718029, + "learning_rate": 9.1444820453979e-06, + "loss": 0.033, + "step": 146550 + }, + { + "epoch": 0.0028, + "grad_norm": 0.05587553232908249, + "learning_rate": 9.141286303238065e-06, + "loss": 0.0321, + "step": 146560 + }, + { + "epoch": 0.00285, + "grad_norm": 0.046682775020599365, + "learning_rate": 9.138090994649503e-06, + "loss": 0.0332, + "step": 146570 + }, + { + "epoch": 0.0029, + "grad_norm": 0.05073460564017296, + "learning_rate": 9.134896119719578e-06, + "loss": 0.0319, + "step": 146580 + }, + { + "epoch": 0.00295, + "grad_norm": 0.09987479448318481, + "learning_rate": 9.131701678535621e-06, + "loss": 0.0333, + "step": 146590 + }, + { + "epoch": 0.003, + "grad_norm": 0.058221518993377686, + "learning_rate": 9.128507671184989e-06, + "loss": 0.0319, + "step": 146600 + }, + { + "epoch": 0.00305, + "grad_norm": 0.05034530907869339, + "learning_rate": 9.125314097754994e-06, + "loss": 0.0336, + "step": 146610 + }, + { + "epoch": 0.0031, + "grad_norm": 0.04986129701137543, + "learning_rate": 9.12212095833294e-06, + "loss": 0.0335, + "step": 146620 + }, + { + "epoch": 0.00315, + "grad_norm": 0.056024134159088135, + "learning_rate": 9.11892825300614e-06, + "loss": 0.0345, + "step": 146630 + }, + { + "epoch": 0.0032, + "grad_norm": 0.041960589587688446, + "learning_rate": 9.11573598186187e-06, + "loss": 0.0333, + "step": 146640 + }, + { + "epoch": 0.00325, + "grad_norm": 0.04810328409075737, + "learning_rate": 9.112544144987417e-06, + "loss": 0.0338, + "step": 146650 + }, + { + "epoch": 0.0033, + "grad_norm": 0.052023597061634064, + "learning_rate": 9.10935274247004e-06, + "loss": 0.0326, + "step": 146660 + }, + { + "epoch": 0.00335, + "grad_norm": 0.05089326947927475, + "learning_rate": 9.10616177439698e-06, + "loss": 0.0323, + "step": 146670 + }, + { + "epoch": 0.0034, + "grad_norm": 0.046460434794425964, + "learning_rate": 9.102971240855494e-06, + "loss": 0.0333, + "step": 146680 + }, + { + "epoch": 0.00345, + "grad_norm": 0.04194452613592148, + "learning_rate": 9.099781141932803e-06, + "loss": 0.0326, + "step": 146690 + }, + { + "epoch": 0.0035, + "grad_norm": 0.03830978274345398, + "learning_rate": 9.09659147771612e-06, + "loss": 0.0323, + "step": 146700 + }, + { + "epoch": 0.00355, + "grad_norm": 0.05357314646244049, + "learning_rate": 9.093402248292645e-06, + "loss": 0.0339, + "step": 146710 + }, + { + "epoch": 0.0036, + "grad_norm": 0.040867824107408524, + "learning_rate": 9.090213453749577e-06, + "loss": 0.0322, + "step": 146720 + }, + { + "epoch": 0.00365, + "grad_norm": 0.03825710713863373, + "learning_rate": 9.087025094174112e-06, + "loss": 0.032, + "step": 146730 + }, + { + "epoch": 0.0037, + "grad_norm": 0.047729622572660446, + "learning_rate": 9.083837169653387e-06, + "loss": 0.0337, + "step": 146740 + }, + { + "epoch": 0.00375, + "grad_norm": 0.05512430518865585, + "learning_rate": 9.08064968027458e-06, + "loss": 0.0336, + "step": 146750 + }, + { + "epoch": 0.0038, + "grad_norm": 0.04865284264087677, + "learning_rate": 9.077462626124825e-06, + "loss": 0.0337, + "step": 146760 + }, + { + "epoch": 0.00385, + "grad_norm": 0.052600521594285965, + "learning_rate": 9.074276007291266e-06, + "loss": 0.0332, + "step": 146770 + }, + { + "epoch": 0.0039, + "grad_norm": 0.07956542819738388, + "learning_rate": 9.071089823861021e-06, + "loss": 0.0351, + "step": 146780 + }, + { + "epoch": 0.00395, + "grad_norm": 0.06006759777665138, + "learning_rate": 9.067904075921182e-06, + "loss": 0.0329, + "step": 146790 + }, + { + "epoch": 0.004, + "grad_norm": 0.05700773000717163, + "learning_rate": 9.064718763558874e-06, + "loss": 0.0335, + "step": 146800 + }, + { + "epoch": 0.00405, + "grad_norm": 0.05110654979944229, + "learning_rate": 9.061533886861168e-06, + "loss": 0.032, + "step": 146810 + }, + { + "epoch": 0.0041, + "grad_norm": 0.04639684036374092, + "learning_rate": 9.058349445915135e-06, + "loss": 0.0324, + "step": 146820 + }, + { + "epoch": 0.00415, + "grad_norm": 0.053005319088697433, + "learning_rate": 9.05516544080783e-06, + "loss": 0.0336, + "step": 146830 + }, + { + "epoch": 0.0042, + "grad_norm": 0.08806686848402023, + "learning_rate": 9.051981871626326e-06, + "loss": 0.0348, + "step": 146840 + }, + { + "epoch": 0.00425, + "grad_norm": 0.052222974598407745, + "learning_rate": 9.048798738457632e-06, + "loss": 0.0333, + "step": 146850 + }, + { + "epoch": 0.0043, + "grad_norm": 0.049556005746126175, + "learning_rate": 9.045616041388799e-06, + "loss": 0.0341, + "step": 146860 + }, + { + "epoch": 0.00435, + "grad_norm": 0.04521545395255089, + "learning_rate": 9.042433780506829e-06, + "loss": 0.0336, + "step": 146870 + }, + { + "epoch": 0.0044, + "grad_norm": 0.0510425828397274, + "learning_rate": 9.039251955898715e-06, + "loss": 0.0335, + "step": 146880 + }, + { + "epoch": 0.00445, + "grad_norm": 0.05148714408278465, + "learning_rate": 9.036070567651463e-06, + "loss": 0.0352, + "step": 146890 + }, + { + "epoch": 0.0045, + "grad_norm": 0.06311694532632828, + "learning_rate": 9.03288961585205e-06, + "loss": 0.035, + "step": 146900 + }, + { + "epoch": 0.00455, + "grad_norm": 0.04238202050328255, + "learning_rate": 9.029709100587425e-06, + "loss": 0.0329, + "step": 146910 + }, + { + "epoch": 0.0046, + "grad_norm": 0.047373171895742416, + "learning_rate": 9.02652902194456e-06, + "loss": 0.0343, + "step": 146920 + }, + { + "epoch": 0.00465, + "grad_norm": 0.04213150963187218, + "learning_rate": 9.023349380010384e-06, + "loss": 0.0329, + "step": 146930 + }, + { + "epoch": 0.0047, + "grad_norm": 0.04656418040394783, + "learning_rate": 9.020170174871851e-06, + "loss": 0.0329, + "step": 146940 + }, + { + "epoch": 0.00475, + "grad_norm": 0.05302516743540764, + "learning_rate": 9.016991406615843e-06, + "loss": 0.0325, + "step": 146950 + }, + { + "epoch": 0.0048, + "grad_norm": 0.04817183315753937, + "learning_rate": 9.013813075329297e-06, + "loss": 0.0334, + "step": 146960 + }, + { + "epoch": 0.00485, + "grad_norm": 0.06088642403483391, + "learning_rate": 9.010635181099087e-06, + "loss": 0.0333, + "step": 146970 + }, + { + "epoch": 0.0049, + "grad_norm": 0.042321301996707916, + "learning_rate": 9.007457724012112e-06, + "loss": 0.0339, + "step": 146980 + }, + { + "epoch": 0.00495, + "grad_norm": 0.05898347496986389, + "learning_rate": 9.004280704155233e-06, + "loss": 0.0336, + "step": 146990 + }, + { + "epoch": 0.005, + "grad_norm": 0.04829243943095207, + "learning_rate": 9.001104121615303e-06, + "loss": 0.0327, + "step": 147000 + }, + { + "epoch": 0.00505, + "grad_norm": 0.05537727102637291, + "learning_rate": 8.997927976479185e-06, + "loss": 0.0333, + "step": 147010 + }, + { + "epoch": 0.0051, + "grad_norm": 0.04421060532331467, + "learning_rate": 8.994752268833703e-06, + "loss": 0.0317, + "step": 147020 + }, + { + "epoch": 0.00515, + "grad_norm": 0.05640105530619621, + "learning_rate": 8.991576998765682e-06, + "loss": 0.0318, + "step": 147030 + }, + { + "epoch": 0.0052, + "grad_norm": 0.05159401521086693, + "learning_rate": 8.988402166361923e-06, + "loss": 0.0324, + "step": 147040 + }, + { + "epoch": 0.00525, + "grad_norm": 0.040277011692523956, + "learning_rate": 8.985227771709233e-06, + "loss": 0.0323, + "step": 147050 + }, + { + "epoch": 0.0053, + "grad_norm": 0.04992297664284706, + "learning_rate": 8.98205381489441e-06, + "loss": 0.0329, + "step": 147060 + }, + { + "epoch": 0.00535, + "grad_norm": 0.04317447543144226, + "learning_rate": 8.978880296004217e-06, + "loss": 0.033, + "step": 147070 + }, + { + "epoch": 0.0054, + "grad_norm": 0.044240791350603104, + "learning_rate": 8.975707215125417e-06, + "loss": 0.0323, + "step": 147080 + }, + { + "epoch": 0.00545, + "grad_norm": 0.055999599397182465, + "learning_rate": 8.97253457234475e-06, + "loss": 0.0322, + "step": 147090 + }, + { + "epoch": 0.0055, + "grad_norm": 0.053453151136636734, + "learning_rate": 8.969362367748982e-06, + "loss": 0.0328, + "step": 147100 + }, + { + "epoch": 0.00555, + "grad_norm": 0.04140254482626915, + "learning_rate": 8.96619060142482e-06, + "loss": 0.0318, + "step": 147110 + }, + { + "epoch": 0.0056, + "grad_norm": 0.04652618616819382, + "learning_rate": 8.963019273458975e-06, + "loss": 0.0325, + "step": 147120 + }, + { + "epoch": 0.00565, + "grad_norm": 0.049115218222141266, + "learning_rate": 8.959848383938168e-06, + "loss": 0.0326, + "step": 147130 + }, + { + "epoch": 0.0057, + "grad_norm": 0.063838429749012, + "learning_rate": 8.95667793294907e-06, + "loss": 0.0348, + "step": 147140 + }, + { + "epoch": 0.00575, + "grad_norm": 0.055249787867069244, + "learning_rate": 8.953507920578391e-06, + "loss": 0.0324, + "step": 147150 + }, + { + "epoch": 0.0058, + "grad_norm": 0.06927811354398727, + "learning_rate": 8.950338346912757e-06, + "loss": 0.0333, + "step": 147160 + }, + { + "epoch": 0.00585, + "grad_norm": 0.05433012172579765, + "learning_rate": 8.947169212038842e-06, + "loss": 0.0341, + "step": 147170 + }, + { + "epoch": 0.0059, + "grad_norm": 0.0582391656935215, + "learning_rate": 8.9440005160433e-06, + "loss": 0.033, + "step": 147180 + }, + { + "epoch": 0.00595, + "grad_norm": 0.04783938080072403, + "learning_rate": 8.940832259012751e-06, + "loss": 0.0327, + "step": 147190 + }, + { + "epoch": 0.006, + "grad_norm": 0.05929884314537048, + "learning_rate": 8.937664441033817e-06, + "loss": 0.0318, + "step": 147200 + }, + { + "epoch": 0.00605, + "grad_norm": 0.06661206483840942, + "learning_rate": 8.934497062193092e-06, + "loss": 0.0329, + "step": 147210 + }, + { + "epoch": 0.0061, + "grad_norm": 0.05749279633164406, + "learning_rate": 8.931330122577191e-06, + "loss": 0.032, + "step": 147220 + }, + { + "epoch": 0.00615, + "grad_norm": 0.05884834751486778, + "learning_rate": 8.92816362227269e-06, + "loss": 0.0334, + "step": 147230 + }, + { + "epoch": 0.0062, + "grad_norm": 0.05903388932347298, + "learning_rate": 8.924997561366146e-06, + "loss": 0.0331, + "step": 147240 + }, + { + "epoch": 0.00625, + "grad_norm": 0.06084084510803223, + "learning_rate": 8.92183193994414e-06, + "loss": 0.0342, + "step": 147250 + }, + { + "epoch": 0.0063, + "grad_norm": 0.0606980137526989, + "learning_rate": 8.9186667580932e-06, + "loss": 0.0335, + "step": 147260 + }, + { + "epoch": 0.00635, + "grad_norm": 0.057444553822278976, + "learning_rate": 8.915502015899876e-06, + "loss": 0.0333, + "step": 147270 + }, + { + "epoch": 0.0064, + "grad_norm": 0.07764684408903122, + "learning_rate": 8.912337713450685e-06, + "loss": 0.0337, + "step": 147280 + }, + { + "epoch": 0.00645, + "grad_norm": 0.05538986250758171, + "learning_rate": 8.909173850832131e-06, + "loss": 0.0336, + "step": 147290 + }, + { + "epoch": 0.0065, + "grad_norm": 0.07165313512086868, + "learning_rate": 8.906010428130723e-06, + "loss": 0.0342, + "step": 147300 + }, + { + "epoch": 0.00655, + "grad_norm": 0.060678672045469284, + "learning_rate": 8.902847445432947e-06, + "loss": 0.0336, + "step": 147310 + }, + { + "epoch": 0.0066, + "grad_norm": 0.054712288081645966, + "learning_rate": 8.899684902825273e-06, + "loss": 0.0333, + "step": 147320 + }, + { + "epoch": 0.00665, + "grad_norm": 0.059218235313892365, + "learning_rate": 8.896522800394158e-06, + "loss": 0.0347, + "step": 147330 + }, + { + "epoch": 0.0067, + "grad_norm": 0.050863150507211685, + "learning_rate": 8.893361138226067e-06, + "loss": 0.0336, + "step": 147340 + }, + { + "epoch": 0.00675, + "grad_norm": 0.047979872673749924, + "learning_rate": 8.890199916407425e-06, + "loss": 0.0343, + "step": 147350 + }, + { + "epoch": 0.0068, + "grad_norm": 0.050607260316610336, + "learning_rate": 8.88703913502467e-06, + "loss": 0.0328, + "step": 147360 + }, + { + "epoch": 0.00685, + "grad_norm": 0.04506528750061989, + "learning_rate": 8.883878794164213e-06, + "loss": 0.0334, + "step": 147370 + }, + { + "epoch": 0.0069, + "grad_norm": 0.04153824225068092, + "learning_rate": 8.880718893912449e-06, + "loss": 0.0335, + "step": 147380 + }, + { + "epoch": 0.00695, + "grad_norm": 0.044839829206466675, + "learning_rate": 8.87755943435578e-06, + "loss": 0.0342, + "step": 147390 + }, + { + "epoch": 0.007, + "grad_norm": 0.046069853007793427, + "learning_rate": 8.87440041558058e-06, + "loss": 0.0323, + "step": 147400 + }, + { + "epoch": 0.00705, + "grad_norm": 0.04640653729438782, + "learning_rate": 8.871241837673216e-06, + "loss": 0.0315, + "step": 147410 + }, + { + "epoch": 0.0071, + "grad_norm": 0.054289527237415314, + "learning_rate": 8.868083700720034e-06, + "loss": 0.0316, + "step": 147420 + }, + { + "epoch": 0.00715, + "grad_norm": 0.045878611505031586, + "learning_rate": 8.864926004807381e-06, + "loss": 0.0325, + "step": 147430 + }, + { + "epoch": 0.0072, + "grad_norm": 0.05043681710958481, + "learning_rate": 8.861768750021604e-06, + "loss": 0.0326, + "step": 147440 + }, + { + "epoch": 0.00725, + "grad_norm": 0.0385061539709568, + "learning_rate": 8.858611936448993e-06, + "loss": 0.032, + "step": 147450 + }, + { + "epoch": 0.0073, + "grad_norm": 0.04848558083176613, + "learning_rate": 8.855455564175875e-06, + "loss": 0.0324, + "step": 147460 + }, + { + "epoch": 0.00735, + "grad_norm": 0.04834940657019615, + "learning_rate": 8.852299633288527e-06, + "loss": 0.033, + "step": 147470 + }, + { + "epoch": 0.0074, + "grad_norm": 0.05229799076914787, + "learning_rate": 8.84914414387325e-06, + "loss": 0.0323, + "step": 147480 + }, + { + "epoch": 0.00745, + "grad_norm": 0.06077688932418823, + "learning_rate": 8.845989096016305e-06, + "loss": 0.0329, + "step": 147490 + }, + { + "epoch": 0.0075, + "grad_norm": 0.04313872382044792, + "learning_rate": 8.84283448980394e-06, + "loss": 0.0317, + "step": 147500 + }, + { + "epoch": 0.00755, + "grad_norm": 0.049466077238321304, + "learning_rate": 8.839680325322419e-06, + "loss": 0.032, + "step": 147510 + }, + { + "epoch": 0.0076, + "grad_norm": 0.060395073145627975, + "learning_rate": 8.836526602657966e-06, + "loss": 0.0328, + "step": 147520 + }, + { + "epoch": 0.00765, + "grad_norm": 0.04946276545524597, + "learning_rate": 8.833373321896805e-06, + "loss": 0.0304, + "step": 147530 + }, + { + "epoch": 0.0077, + "grad_norm": 0.0486181303858757, + "learning_rate": 8.830220483125135e-06, + "loss": 0.0307, + "step": 147540 + }, + { + "epoch": 0.00775, + "grad_norm": 0.0499403178691864, + "learning_rate": 8.827068086429161e-06, + "loss": 0.0309, + "step": 147550 + }, + { + "epoch": 0.0078, + "grad_norm": 0.042571984231472015, + "learning_rate": 8.823916131895079e-06, + "loss": 0.0319, + "step": 147560 + }, + { + "epoch": 0.00785, + "grad_norm": 0.06789781153202057, + "learning_rate": 8.820764619609053e-06, + "loss": 0.032, + "step": 147570 + }, + { + "epoch": 0.0079, + "grad_norm": 0.05580562725663185, + "learning_rate": 8.817613549657244e-06, + "loss": 0.0321, + "step": 147580 + }, + { + "epoch": 0.00795, + "grad_norm": 0.04523137956857681, + "learning_rate": 8.81446292212579e-06, + "loss": 0.0302, + "step": 147590 + }, + { + "epoch": 0.008, + "grad_norm": 0.04392389580607414, + "learning_rate": 8.811312737100852e-06, + "loss": 0.031, + "step": 147600 + }, + { + "epoch": 0.00805, + "grad_norm": 0.045671090483665466, + "learning_rate": 8.80816299466854e-06, + "loss": 0.0317, + "step": 147610 + }, + { + "epoch": 0.0081, + "grad_norm": 0.04727155715227127, + "learning_rate": 8.80501369491496e-06, + "loss": 0.032, + "step": 147620 + }, + { + "epoch": 0.00815, + "grad_norm": 0.05655229836702347, + "learning_rate": 8.80186483792623e-06, + "loss": 0.0312, + "step": 147630 + }, + { + "epoch": 0.0082, + "grad_norm": 0.04339873418211937, + "learning_rate": 8.798716423788422e-06, + "loss": 0.0321, + "step": 147640 + }, + { + "epoch": 0.00825, + "grad_norm": 0.05079496279358864, + "learning_rate": 8.795568452587639e-06, + "loss": 0.033, + "step": 147650 + }, + { + "epoch": 0.0083, + "grad_norm": 0.05334087833762169, + "learning_rate": 8.792420924409905e-06, + "loss": 0.0337, + "step": 147660 + }, + { + "epoch": 0.00835, + "grad_norm": 0.04979987442493439, + "learning_rate": 8.789273839341297e-06, + "loss": 0.0343, + "step": 147670 + }, + { + "epoch": 0.0084, + "grad_norm": 0.05215590074658394, + "learning_rate": 8.78612719746786e-06, + "loss": 0.0324, + "step": 147680 + }, + { + "epoch": 0.00845, + "grad_norm": 0.04747818037867546, + "learning_rate": 8.782980998875615e-06, + "loss": 0.032, + "step": 147690 + }, + { + "epoch": 0.0085, + "grad_norm": 0.04803787171840668, + "learning_rate": 8.779835243650578e-06, + "loss": 0.032, + "step": 147700 + }, + { + "epoch": 0.00855, + "grad_norm": 0.04493388906121254, + "learning_rate": 8.776689931878744e-06, + "loss": 0.0323, + "step": 147710 + }, + { + "epoch": 0.0086, + "grad_norm": 0.06509993970394135, + "learning_rate": 8.773545063646116e-06, + "loss": 0.0327, + "step": 147720 + }, + { + "epoch": 0.00865, + "grad_norm": 0.046299874782562256, + "learning_rate": 8.770400639038672e-06, + "loss": 0.0326, + "step": 147730 + }, + { + "epoch": 0.0087, + "grad_norm": 0.052435941994190216, + "learning_rate": 8.76725665814237e-06, + "loss": 0.0318, + "step": 147740 + }, + { + "epoch": 0.00875, + "grad_norm": 0.055246591567993164, + "learning_rate": 8.764113121043182e-06, + "loss": 0.0333, + "step": 147750 + }, + { + "epoch": 0.0088, + "grad_norm": 0.04831727594137192, + "learning_rate": 8.76097002782703e-06, + "loss": 0.0324, + "step": 147760 + }, + { + "epoch": 0.00885, + "grad_norm": 0.04088793694972992, + "learning_rate": 8.75782737857987e-06, + "loss": 0.0322, + "step": 147770 + }, + { + "epoch": 0.0089, + "grad_norm": 0.05812996253371239, + "learning_rate": 8.754685173387604e-06, + "loss": 0.0331, + "step": 147780 + }, + { + "epoch": 0.00895, + "grad_norm": 0.06011068448424339, + "learning_rate": 8.751543412336144e-06, + "loss": 0.0329, + "step": 147790 + }, + { + "epoch": 0.009, + "grad_norm": 0.058193374425172806, + "learning_rate": 8.748402095511374e-06, + "loss": 0.034, + "step": 147800 + }, + { + "epoch": 0.00905, + "grad_norm": 0.05291305109858513, + "learning_rate": 8.745261222999181e-06, + "loss": 0.0333, + "step": 147810 + }, + { + "epoch": 0.0091, + "grad_norm": 0.0512242391705513, + "learning_rate": 8.742120794885464e-06, + "loss": 0.0337, + "step": 147820 + }, + { + "epoch": 0.00915, + "grad_norm": 0.04407033696770668, + "learning_rate": 8.738980811256033e-06, + "loss": 0.0331, + "step": 147830 + }, + { + "epoch": 0.0092, + "grad_norm": 0.047220148146152496, + "learning_rate": 8.735841272196771e-06, + "loss": 0.0338, + "step": 147840 + }, + { + "epoch": 0.00925, + "grad_norm": 0.04800218343734741, + "learning_rate": 8.732702177793487e-06, + "loss": 0.0337, + "step": 147850 + }, + { + "epoch": 0.0093, + "grad_norm": 0.044987574219703674, + "learning_rate": 8.729563528132023e-06, + "loss": 0.0332, + "step": 147860 + }, + { + "epoch": 0.00935, + "grad_norm": 0.05002535879611969, + "learning_rate": 8.726425323298181e-06, + "loss": 0.0325, + "step": 147870 + }, + { + "epoch": 0.0094, + "grad_norm": 0.05462311580777168, + "learning_rate": 8.723287563377749e-06, + "loss": 0.0334, + "step": 147880 + }, + { + "epoch": 0.00945, + "grad_norm": 0.048291947692632675, + "learning_rate": 8.720150248456527e-06, + "loss": 0.0332, + "step": 147890 + }, + { + "epoch": 0.0095, + "grad_norm": 0.06289687007665634, + "learning_rate": 8.717013378620282e-06, + "loss": 0.0339, + "step": 147900 + }, + { + "epoch": 0.00955, + "grad_norm": 0.04649645835161209, + "learning_rate": 8.713876953954772e-06, + "loss": 0.0323, + "step": 147910 + }, + { + "epoch": 0.0096, + "grad_norm": 0.047846417874097824, + "learning_rate": 8.710740974545742e-06, + "loss": 0.0318, + "step": 147920 + }, + { + "epoch": 0.00965, + "grad_norm": 0.04475260153412819, + "learning_rate": 8.707605440478933e-06, + "loss": 0.0315, + "step": 147930 + }, + { + "epoch": 0.0097, + "grad_norm": 0.04678916186094284, + "learning_rate": 8.704470351840077e-06, + "loss": 0.0333, + "step": 147940 + }, + { + "epoch": 0.00975, + "grad_norm": 0.05469123646616936, + "learning_rate": 8.701335708714883e-06, + "loss": 0.0325, + "step": 147950 + }, + { + "epoch": 0.0098, + "grad_norm": 0.05012594163417816, + "learning_rate": 8.698201511189048e-06, + "loss": 0.0325, + "step": 147960 + }, + { + "epoch": 0.00985, + "grad_norm": 0.05331389605998993, + "learning_rate": 8.695067759348246e-06, + "loss": 0.0362, + "step": 147970 + }, + { + "epoch": 0.0099, + "grad_norm": 0.041243359446525574, + "learning_rate": 8.691934453278178e-06, + "loss": 0.0322, + "step": 147980 + }, + { + "epoch": 0.00995, + "grad_norm": 0.04781245440244675, + "learning_rate": 8.688801593064494e-06, + "loss": 0.0338, + "step": 147990 + }, + { + "epoch": 0.01, + "grad_norm": 0.056681521236896515, + "learning_rate": 8.68566917879284e-06, + "loss": 0.0333, + "step": 148000 + }, + { + "epoch": 0.01005, + "grad_norm": 0.056346699595451355, + "learning_rate": 8.682537210548868e-06, + "loss": 0.0327, + "step": 148010 + }, + { + "epoch": 0.0101, + "grad_norm": 0.050917401909828186, + "learning_rate": 8.679405688418193e-06, + "loss": 0.0323, + "step": 148020 + }, + { + "epoch": 0.01015, + "grad_norm": 0.04824027791619301, + "learning_rate": 8.676274612486449e-06, + "loss": 0.032, + "step": 148030 + }, + { + "epoch": 0.0102, + "grad_norm": 0.04922124743461609, + "learning_rate": 8.67314398283921e-06, + "loss": 0.0334, + "step": 148040 + }, + { + "epoch": 0.01025, + "grad_norm": 0.05176009237766266, + "learning_rate": 8.670013799562087e-06, + "loss": 0.035, + "step": 148050 + }, + { + "epoch": 0.0103, + "grad_norm": 0.0421922504901886, + "learning_rate": 8.666884062740646e-06, + "loss": 0.0328, + "step": 148060 + }, + { + "epoch": 0.01035, + "grad_norm": 0.04595519229769707, + "learning_rate": 8.663754772460464e-06, + "loss": 0.0324, + "step": 148070 + }, + { + "epoch": 0.0104, + "grad_norm": 0.06706260144710541, + "learning_rate": 8.660625928807092e-06, + "loss": 0.0337, + "step": 148080 + }, + { + "epoch": 0.01045, + "grad_norm": 0.044981446117162704, + "learning_rate": 8.657497531866057e-06, + "loss": 0.0327, + "step": 148090 + }, + { + "epoch": 0.0105, + "grad_norm": 0.06317142397165298, + "learning_rate": 8.654369581722912e-06, + "loss": 0.0332, + "step": 148100 + }, + { + "epoch": 0.01055, + "grad_norm": 0.057659924030303955, + "learning_rate": 8.65124207846316e-06, + "loss": 0.0344, + "step": 148110 + }, + { + "epoch": 0.0106, + "grad_norm": 0.05683725327253342, + "learning_rate": 8.648115022172299e-06, + "loss": 0.0333, + "step": 148120 + }, + { + "epoch": 0.01065, + "grad_norm": 0.04943874478340149, + "learning_rate": 8.64498841293584e-06, + "loss": 0.0341, + "step": 148130 + }, + { + "epoch": 0.0107, + "grad_norm": 0.05976469814777374, + "learning_rate": 8.641862250839245e-06, + "loss": 0.032, + "step": 148140 + }, + { + "epoch": 0.01075, + "grad_norm": 0.052283961325883865, + "learning_rate": 8.638736535967998e-06, + "loss": 0.0317, + "step": 148150 + }, + { + "epoch": 0.0108, + "grad_norm": 0.05344764515757561, + "learning_rate": 8.635611268407545e-06, + "loss": 0.0339, + "step": 148160 + }, + { + "epoch": 0.01085, + "grad_norm": 0.04558238387107849, + "learning_rate": 8.632486448243335e-06, + "loss": 0.0327, + "step": 148170 + }, + { + "epoch": 0.0109, + "grad_norm": 0.05175260826945305, + "learning_rate": 8.62936207556079e-06, + "loss": 0.0326, + "step": 148180 + }, + { + "epoch": 0.01095, + "grad_norm": 0.04138151556253433, + "learning_rate": 8.62623815044534e-06, + "loss": 0.0312, + "step": 148190 + }, + { + "epoch": 0.011, + "grad_norm": 0.054857030510902405, + "learning_rate": 8.62311467298239e-06, + "loss": 0.032, + "step": 148200 + }, + { + "epoch": 0.01105, + "grad_norm": 0.04743338003754616, + "learning_rate": 8.619991643257324e-06, + "loss": 0.0316, + "step": 148210 + }, + { + "epoch": 0.0111, + "grad_norm": 0.04671315848827362, + "learning_rate": 8.61686906135554e-06, + "loss": 0.0323, + "step": 148220 + }, + { + "epoch": 0.01115, + "grad_norm": 0.04476676881313324, + "learning_rate": 8.613746927362392e-06, + "loss": 0.0322, + "step": 148230 + }, + { + "epoch": 0.0112, + "grad_norm": 0.04818776994943619, + "learning_rate": 8.610625241363265e-06, + "loss": 0.0317, + "step": 148240 + }, + { + "epoch": 0.01125, + "grad_norm": 0.048856962472200394, + "learning_rate": 8.607504003443473e-06, + "loss": 0.0315, + "step": 148250 + }, + { + "epoch": 0.0113, + "grad_norm": 0.048947639763355255, + "learning_rate": 8.604383213688358e-06, + "loss": 0.035, + "step": 148260 + }, + { + "epoch": 0.01135, + "grad_norm": 0.059466104954481125, + "learning_rate": 8.601262872183257e-06, + "loss": 0.0344, + "step": 148270 + }, + { + "epoch": 0.0114, + "grad_norm": 0.04785541072487831, + "learning_rate": 8.598142979013469e-06, + "loss": 0.0339, + "step": 148280 + }, + { + "epoch": 0.01145, + "grad_norm": 0.05641579627990723, + "learning_rate": 8.595023534264291e-06, + "loss": 0.0336, + "step": 148290 + }, + { + "epoch": 0.0115, + "grad_norm": 0.05350305512547493, + "learning_rate": 8.591904538020995e-06, + "loss": 0.0333, + "step": 148300 + }, + { + "epoch": 0.01155, + "grad_norm": 0.057271573692560196, + "learning_rate": 8.588785990368866e-06, + "loss": 0.0335, + "step": 148310 + }, + { + "epoch": 0.0116, + "grad_norm": 0.06025751307606697, + "learning_rate": 8.585667891393179e-06, + "loss": 0.0352, + "step": 148320 + }, + { + "epoch": 0.01165, + "grad_norm": 0.061855290085077286, + "learning_rate": 8.58255024117915e-06, + "loss": 0.0346, + "step": 148330 + }, + { + "epoch": 0.0117, + "grad_norm": 0.05323246493935585, + "learning_rate": 8.579433039812037e-06, + "loss": 0.0347, + "step": 148340 + }, + { + "epoch": 0.01175, + "grad_norm": 0.09866755455732346, + "learning_rate": 8.576316287377047e-06, + "loss": 0.0343, + "step": 148350 + }, + { + "epoch": 0.0118, + "grad_norm": 0.09101955592632294, + "learning_rate": 8.57319998395941e-06, + "loss": 0.0334, + "step": 148360 + }, + { + "epoch": 0.01185, + "grad_norm": 0.09026189893484116, + "learning_rate": 8.570084129644312e-06, + "loss": 0.0334, + "step": 148370 + }, + { + "epoch": 0.0119, + "grad_norm": 0.05150333791971207, + "learning_rate": 8.566968724516933e-06, + "loss": 0.0345, + "step": 148380 + }, + { + "epoch": 0.01195, + "grad_norm": 0.051717933267354965, + "learning_rate": 8.563853768662463e-06, + "loss": 0.0339, + "step": 148390 + }, + { + "epoch": 0.012, + "grad_norm": 0.04366070404648781, + "learning_rate": 8.56073926216606e-06, + "loss": 0.0327, + "step": 148400 + }, + { + "epoch": 0.01205, + "grad_norm": 0.04918728768825531, + "learning_rate": 8.557625205112864e-06, + "loss": 0.0336, + "step": 148410 + }, + { + "epoch": 0.0121, + "grad_norm": 0.04548975080251694, + "learning_rate": 8.554511597588017e-06, + "loss": 0.032, + "step": 148420 + }, + { + "epoch": 0.01215, + "grad_norm": 0.048331983387470245, + "learning_rate": 8.551398439676648e-06, + "loss": 0.0334, + "step": 148430 + }, + { + "epoch": 0.0122, + "grad_norm": 0.05829129368066788, + "learning_rate": 8.548285731463856e-06, + "loss": 0.0335, + "step": 148440 + }, + { + "epoch": 0.01225, + "grad_norm": 0.05428208038210869, + "learning_rate": 8.545173473034765e-06, + "loss": 0.0335, + "step": 148450 + }, + { + "epoch": 0.0123, + "grad_norm": 0.044460829347372055, + "learning_rate": 8.542061664474446e-06, + "loss": 0.0346, + "step": 148460 + }, + { + "epoch": 0.01235, + "grad_norm": 0.04665042832493782, + "learning_rate": 8.53895030586797e-06, + "loss": 0.0335, + "step": 148470 + }, + { + "epoch": 0.0124, + "grad_norm": 0.04890317842364311, + "learning_rate": 8.535839397300418e-06, + "loss": 0.0342, + "step": 148480 + }, + { + "epoch": 0.01245, + "grad_norm": 0.04238392040133476, + "learning_rate": 8.532728938856832e-06, + "loss": 0.0349, + "step": 148490 + }, + { + "epoch": 0.0125, + "grad_norm": 0.03974468261003494, + "learning_rate": 8.529618930622241e-06, + "loss": 0.033, + "step": 148500 + }, + { + "epoch": 0.01255, + "grad_norm": 0.05076686665415764, + "learning_rate": 8.52650937268169e-06, + "loss": 0.036, + "step": 148510 + }, + { + "epoch": 0.0126, + "grad_norm": 0.043793726712465286, + "learning_rate": 8.523400265120174e-06, + "loss": 0.0347, + "step": 148520 + }, + { + "epoch": 0.01265, + "grad_norm": 0.04453187808394432, + "learning_rate": 8.520291608022724e-06, + "loss": 0.0344, + "step": 148530 + }, + { + "epoch": 0.0127, + "grad_norm": 0.0442209355533123, + "learning_rate": 8.51718340147429e-06, + "loss": 0.034, + "step": 148540 + }, + { + "epoch": 0.01275, + "grad_norm": 0.05199456587433815, + "learning_rate": 8.51407564555988e-06, + "loss": 0.0331, + "step": 148550 + }, + { + "epoch": 0.0128, + "grad_norm": 0.05228140205144882, + "learning_rate": 8.51096834036444e-06, + "loss": 0.0335, + "step": 148560 + }, + { + "epoch": 0.01285, + "grad_norm": 0.0460548959672451, + "learning_rate": 8.50786148597294e-06, + "loss": 0.0337, + "step": 148570 + }, + { + "epoch": 0.0129, + "grad_norm": 0.05205879732966423, + "learning_rate": 8.504755082470308e-06, + "loss": 0.0336, + "step": 148580 + }, + { + "epoch": 0.01295, + "grad_norm": 0.053257234394550323, + "learning_rate": 8.501649129941472e-06, + "loss": 0.0329, + "step": 148590 + }, + { + "epoch": 0.013, + "grad_norm": 0.06522348523139954, + "learning_rate": 8.498543628471353e-06, + "loss": 0.0351, + "step": 148600 + }, + { + "epoch": 0.01305, + "grad_norm": 0.054856497794389725, + "learning_rate": 8.495438578144856e-06, + "loss": 0.0328, + "step": 148610 + }, + { + "epoch": 0.0131, + "grad_norm": 0.04815627261996269, + "learning_rate": 8.492333979046868e-06, + "loss": 0.0327, + "step": 148620 + }, + { + "epoch": 0.01315, + "grad_norm": 0.04261309280991554, + "learning_rate": 8.489229831262257e-06, + "loss": 0.0315, + "step": 148630 + }, + { + "epoch": 0.0132, + "grad_norm": 0.0457463413476944, + "learning_rate": 8.4861261348759e-06, + "loss": 0.0338, + "step": 148640 + }, + { + "epoch": 0.01325, + "grad_norm": 0.044380463659763336, + "learning_rate": 8.483022889972658e-06, + "loss": 0.0332, + "step": 148650 + }, + { + "epoch": 0.0133, + "grad_norm": 0.05065517500042915, + "learning_rate": 8.479920096637367e-06, + "loss": 0.0334, + "step": 148660 + }, + { + "epoch": 0.01335, + "grad_norm": 0.04468074440956116, + "learning_rate": 8.476817754954855e-06, + "loss": 0.0324, + "step": 148670 + }, + { + "epoch": 0.0134, + "grad_norm": 0.05222933366894722, + "learning_rate": 8.473715865009927e-06, + "loss": 0.0337, + "step": 148680 + }, + { + "epoch": 0.01345, + "grad_norm": 0.04466511681675911, + "learning_rate": 8.470614426887407e-06, + "loss": 0.033, + "step": 148690 + }, + { + "epoch": 0.0135, + "grad_norm": 0.04284243658185005, + "learning_rate": 8.467513440672081e-06, + "loss": 0.0323, + "step": 148700 + }, + { + "epoch": 0.01355, + "grad_norm": 0.04450090974569321, + "learning_rate": 8.464412906448718e-06, + "loss": 0.0326, + "step": 148710 + }, + { + "epoch": 0.0136, + "grad_norm": 0.0457986555993557, + "learning_rate": 8.4613128243021e-06, + "loss": 0.0338, + "step": 148720 + }, + { + "epoch": 0.01365, + "grad_norm": 0.047325409948825836, + "learning_rate": 8.458213194316972e-06, + "loss": 0.0325, + "step": 148730 + }, + { + "epoch": 0.0137, + "grad_norm": 0.05123257637023926, + "learning_rate": 8.455114016578095e-06, + "loss": 0.037, + "step": 148740 + }, + { + "epoch": 0.01375, + "grad_norm": 0.061940617859363556, + "learning_rate": 8.452015291170168e-06, + "loss": 0.0332, + "step": 148750 + }, + { + "epoch": 0.0138, + "grad_norm": 0.04933027923107147, + "learning_rate": 8.448917018177923e-06, + "loss": 0.033, + "step": 148760 + }, + { + "epoch": 0.01385, + "grad_norm": 0.0544595904648304, + "learning_rate": 8.445819197686084e-06, + "loss": 0.0326, + "step": 148770 + }, + { + "epoch": 0.0139, + "grad_norm": 0.04772160202264786, + "learning_rate": 8.442721829779324e-06, + "loss": 0.0338, + "step": 148780 + }, + { + "epoch": 0.01395, + "grad_norm": 0.047685492783784866, + "learning_rate": 8.439624914542329e-06, + "loss": 0.0354, + "step": 148790 + }, + { + "epoch": 0.014, + "grad_norm": 0.047835707664489746, + "learning_rate": 8.43652845205976e-06, + "loss": 0.034, + "step": 148800 + }, + { + "epoch": 0.01405, + "grad_norm": 0.0465179979801178, + "learning_rate": 8.433432442416291e-06, + "loss": 0.0326, + "step": 148810 + }, + { + "epoch": 0.0141, + "grad_norm": 0.0501941442489624, + "learning_rate": 8.430336885696555e-06, + "loss": 0.0331, + "step": 148820 + }, + { + "epoch": 0.01415, + "grad_norm": 0.053617771714925766, + "learning_rate": 8.427241781985174e-06, + "loss": 0.0323, + "step": 148830 + }, + { + "epoch": 0.0142, + "grad_norm": 0.056008417159318924, + "learning_rate": 8.424147131366783e-06, + "loss": 0.0335, + "step": 148840 + }, + { + "epoch": 0.01425, + "grad_norm": 0.05336494371294975, + "learning_rate": 8.421052933925976e-06, + "loss": 0.0336, + "step": 148850 + }, + { + "epoch": 0.0143, + "grad_norm": 0.04270797222852707, + "learning_rate": 8.417959189747363e-06, + "loss": 0.033, + "step": 148860 + }, + { + "epoch": 0.01435, + "grad_norm": 0.04849123954772949, + "learning_rate": 8.414865898915513e-06, + "loss": 0.0339, + "step": 148870 + }, + { + "epoch": 0.0144, + "grad_norm": 0.057978514581918716, + "learning_rate": 8.411773061514991e-06, + "loss": 0.0347, + "step": 148880 + }, + { + "epoch": 0.01445, + "grad_norm": 0.05593280866742134, + "learning_rate": 8.408680677630371e-06, + "loss": 0.0344, + "step": 148890 + }, + { + "epoch": 0.0145, + "grad_norm": 0.05761462450027466, + "learning_rate": 8.405588747346189e-06, + "loss": 0.0352, + "step": 148900 + }, + { + "epoch": 0.01455, + "grad_norm": 0.04773632436990738, + "learning_rate": 8.402497270746976e-06, + "loss": 0.033, + "step": 148910 + }, + { + "epoch": 0.0146, + "grad_norm": 0.055619977414608, + "learning_rate": 8.399406247917241e-06, + "loss": 0.0347, + "step": 148920 + }, + { + "epoch": 0.01465, + "grad_norm": 0.047743767499923706, + "learning_rate": 8.396315678941513e-06, + "loss": 0.0345, + "step": 148930 + }, + { + "epoch": 0.0147, + "grad_norm": 0.05026392266154289, + "learning_rate": 8.39322556390427e-06, + "loss": 0.0345, + "step": 148940 + }, + { + "epoch": 0.01475, + "grad_norm": 0.044830456376075745, + "learning_rate": 8.390135902890007e-06, + "loss": 0.0335, + "step": 148950 + }, + { + "epoch": 0.0148, + "grad_norm": 0.05488893389701843, + "learning_rate": 8.387046695983188e-06, + "loss": 0.0355, + "step": 148960 + }, + { + "epoch": 0.01485, + "grad_norm": 0.04811578243970871, + "learning_rate": 8.383957943268267e-06, + "loss": 0.035, + "step": 148970 + }, + { + "epoch": 0.0149, + "grad_norm": 0.04665228724479675, + "learning_rate": 8.380869644829698e-06, + "loss": 0.0332, + "step": 148980 + }, + { + "epoch": 0.01495, + "grad_norm": 0.04551544412970543, + "learning_rate": 8.37778180075191e-06, + "loss": 0.033, + "step": 148990 + }, + { + "epoch": 0.015, + "grad_norm": 0.047599222511053085, + "learning_rate": 8.374694411119325e-06, + "loss": 0.0334, + "step": 149000 + }, + { + "epoch": 0.01505, + "grad_norm": 0.04838322475552559, + "learning_rate": 8.371607476016338e-06, + "loss": 0.0335, + "step": 149010 + }, + { + "epoch": 0.0151, + "grad_norm": 0.051446493715047836, + "learning_rate": 8.368520995527356e-06, + "loss": 0.0324, + "step": 149020 + }, + { + "epoch": 0.01515, + "grad_norm": 0.060208600014448166, + "learning_rate": 8.36543496973678e-06, + "loss": 0.0327, + "step": 149030 + }, + { + "epoch": 0.0152, + "grad_norm": 0.06569566577672958, + "learning_rate": 8.362349398728944e-06, + "loss": 0.0333, + "step": 149040 + }, + { + "epoch": 0.01525, + "grad_norm": 0.050883982330560684, + "learning_rate": 8.359264282588233e-06, + "loss": 0.0338, + "step": 149050 + }, + { + "epoch": 0.0153, + "grad_norm": 0.062197163701057434, + "learning_rate": 8.35617962139898e-06, + "loss": 0.0329, + "step": 149060 + }, + { + "epoch": 0.01535, + "grad_norm": 0.06242142245173454, + "learning_rate": 8.353095415245527e-06, + "loss": 0.0318, + "step": 149070 + }, + { + "epoch": 0.0154, + "grad_norm": 0.06413523107767105, + "learning_rate": 8.350011664212195e-06, + "loss": 0.0336, + "step": 149080 + }, + { + "epoch": 0.01545, + "grad_norm": 0.058268480002880096, + "learning_rate": 8.34692836838328e-06, + "loss": 0.0322, + "step": 149090 + }, + { + "epoch": 0.0155, + "grad_norm": 0.06240219622850418, + "learning_rate": 8.343845527843094e-06, + "loss": 0.0314, + "step": 149100 + }, + { + "epoch": 0.01555, + "grad_norm": 0.05119459331035614, + "learning_rate": 8.340763142675914e-06, + "loss": 0.0331, + "step": 149110 + }, + { + "epoch": 0.0156, + "grad_norm": 0.04056113213300705, + "learning_rate": 8.337681212966014e-06, + "loss": 0.0317, + "step": 149120 + }, + { + "epoch": 0.01565, + "grad_norm": 0.06105110049247742, + "learning_rate": 8.334599738797638e-06, + "loss": 0.0317, + "step": 149130 + }, + { + "epoch": 0.0157, + "grad_norm": 0.056628599762916565, + "learning_rate": 8.331518720255047e-06, + "loss": 0.0325, + "step": 149140 + }, + { + "epoch": 0.01575, + "grad_norm": 0.050194285809993744, + "learning_rate": 8.328438157422478e-06, + "loss": 0.0313, + "step": 149150 + }, + { + "epoch": 0.0158, + "grad_norm": 0.04470765218138695, + "learning_rate": 8.325358050384149e-06, + "loss": 0.0309, + "step": 149160 + }, + { + "epoch": 0.01585, + "grad_norm": 0.0473356693983078, + "learning_rate": 8.322278399224265e-06, + "loss": 0.0321, + "step": 149170 + }, + { + "epoch": 0.0159, + "grad_norm": 0.04666948691010475, + "learning_rate": 8.319199204027015e-06, + "loss": 0.0318, + "step": 149180 + }, + { + "epoch": 0.01595, + "grad_norm": 0.05364212766289711, + "learning_rate": 8.3161204648766e-06, + "loss": 0.0313, + "step": 149190 + }, + { + "epoch": 0.016, + "grad_norm": 0.04709921032190323, + "learning_rate": 8.313042181857186e-06, + "loss": 0.0319, + "step": 149200 + }, + { + "epoch": 0.01605, + "grad_norm": 0.04465370252728462, + "learning_rate": 8.309964355052921e-06, + "loss": 0.0311, + "step": 149210 + }, + { + "epoch": 0.0161, + "grad_norm": 0.04493457078933716, + "learning_rate": 8.306886984547969e-06, + "loss": 0.0321, + "step": 149220 + }, + { + "epoch": 0.01615, + "grad_norm": 0.049775637686252594, + "learning_rate": 8.303810070426447e-06, + "loss": 0.0327, + "step": 149230 + }, + { + "epoch": 0.0162, + "grad_norm": 0.04455522075295448, + "learning_rate": 8.3007336127725e-06, + "loss": 0.0324, + "step": 149240 + }, + { + "epoch": 0.01625, + "grad_norm": 0.04167736694216728, + "learning_rate": 8.297657611670207e-06, + "loss": 0.0311, + "step": 149250 + }, + { + "epoch": 0.0163, + "grad_norm": 0.05651344731450081, + "learning_rate": 8.294582067203688e-06, + "loss": 0.033, + "step": 149260 + }, + { + "epoch": 0.01635, + "grad_norm": 0.043863266706466675, + "learning_rate": 8.291506979457011e-06, + "loss": 0.0323, + "step": 149270 + }, + { + "epoch": 0.0164, + "grad_norm": 0.04590892791748047, + "learning_rate": 8.288432348514267e-06, + "loss": 0.0321, + "step": 149280 + }, + { + "epoch": 0.01645, + "grad_norm": 0.05240105465054512, + "learning_rate": 8.2853581744595e-06, + "loss": 0.032, + "step": 149290 + }, + { + "epoch": 0.0165, + "grad_norm": 0.04963365197181702, + "learning_rate": 8.282284457376758e-06, + "loss": 0.0325, + "step": 149300 + }, + { + "epoch": 0.01655, + "grad_norm": 0.05475384742021561, + "learning_rate": 8.279211197350081e-06, + "loss": 0.0316, + "step": 149310 + }, + { + "epoch": 0.0166, + "grad_norm": 0.06097935512661934, + "learning_rate": 8.276138394463492e-06, + "loss": 0.0341, + "step": 149320 + }, + { + "epoch": 0.01665, + "grad_norm": 0.05405963957309723, + "learning_rate": 8.273066048800988e-06, + "loss": 0.0325, + "step": 149330 + }, + { + "epoch": 0.0167, + "grad_norm": 0.054827336221933365, + "learning_rate": 8.269994160446579e-06, + "loss": 0.0325, + "step": 149340 + }, + { + "epoch": 0.01675, + "grad_norm": 0.04705236107110977, + "learning_rate": 8.26692272948424e-06, + "loss": 0.0326, + "step": 149350 + }, + { + "epoch": 0.0168, + "grad_norm": 0.05563652142882347, + "learning_rate": 8.263851755997954e-06, + "loss": 0.0342, + "step": 149360 + }, + { + "epoch": 0.01685, + "grad_norm": 0.06205492094159126, + "learning_rate": 8.260781240071674e-06, + "loss": 0.0338, + "step": 149370 + }, + { + "epoch": 0.0169, + "grad_norm": 0.05830933526158333, + "learning_rate": 8.257711181789346e-06, + "loss": 0.0321, + "step": 149380 + }, + { + "epoch": 0.01695, + "grad_norm": 0.04731369391083717, + "learning_rate": 8.254641581234895e-06, + "loss": 0.033, + "step": 149390 + }, + { + "epoch": 0.017, + "grad_norm": 0.04741034284234047, + "learning_rate": 8.251572438492261e-06, + "loss": 0.034, + "step": 149400 + }, + { + "epoch": 0.01705, + "grad_norm": 0.05141216516494751, + "learning_rate": 8.248503753645345e-06, + "loss": 0.0335, + "step": 149410 + }, + { + "epoch": 0.0171, + "grad_norm": 0.04521205648779869, + "learning_rate": 8.245435526778036e-06, + "loss": 0.034, + "step": 149420 + }, + { + "epoch": 0.01715, + "grad_norm": 0.05473974347114563, + "learning_rate": 8.242367757974233e-06, + "loss": 0.0335, + "step": 149430 + }, + { + "epoch": 0.0172, + "grad_norm": 0.05049604922533035, + "learning_rate": 8.23930044731779e-06, + "loss": 0.0345, + "step": 149440 + }, + { + "epoch": 0.01725, + "grad_norm": 0.04677336663007736, + "learning_rate": 8.236233594892595e-06, + "loss": 0.0337, + "step": 149450 + }, + { + "epoch": 0.0173, + "grad_norm": 0.05898798257112503, + "learning_rate": 8.233167200782458e-06, + "loss": 0.0338, + "step": 149460 + }, + { + "epoch": 0.01735, + "grad_norm": 0.05927513912320137, + "learning_rate": 8.23010126507123e-06, + "loss": 0.0333, + "step": 149470 + }, + { + "epoch": 0.0174, + "grad_norm": 0.04218801110982895, + "learning_rate": 8.227035787842744e-06, + "loss": 0.0334, + "step": 149480 + }, + { + "epoch": 0.01745, + "grad_norm": 0.0435371994972229, + "learning_rate": 8.223970769180796e-06, + "loss": 0.0335, + "step": 149490 + }, + { + "epoch": 0.0175, + "grad_norm": 0.04960029572248459, + "learning_rate": 8.220906209169185e-06, + "loss": 0.0341, + "step": 149500 + }, + { + "epoch": 0.01755, + "grad_norm": 0.0509033203125, + "learning_rate": 8.217842107891688e-06, + "loss": 0.0318, + "step": 149510 + }, + { + "epoch": 0.0176, + "grad_norm": 0.046687591820955276, + "learning_rate": 8.21477846543208e-06, + "loss": 0.0324, + "step": 149520 + }, + { + "epoch": 0.01765, + "grad_norm": 0.07635864615440369, + "learning_rate": 8.211715281874141e-06, + "loss": 0.0337, + "step": 149530 + }, + { + "epoch": 0.0177, + "grad_norm": 0.0632171630859375, + "learning_rate": 8.208652557301582e-06, + "loss": 0.0332, + "step": 149540 + }, + { + "epoch": 0.01775, + "grad_norm": 0.12133687734603882, + "learning_rate": 8.205590291798162e-06, + "loss": 0.0332, + "step": 149550 + }, + { + "epoch": 0.0178, + "grad_norm": 0.057598382234573364, + "learning_rate": 8.202528485447589e-06, + "loss": 0.0358, + "step": 149560 + }, + { + "epoch": 0.01785, + "grad_norm": 0.07385782152414322, + "learning_rate": 8.19946713833358e-06, + "loss": 0.0325, + "step": 149570 + }, + { + "epoch": 0.0179, + "grad_norm": 0.064969003200531, + "learning_rate": 8.196406250539831e-06, + "loss": 0.0349, + "step": 149580 + }, + { + "epoch": 0.01795, + "grad_norm": 0.05651523545384407, + "learning_rate": 8.193345822150014e-06, + "loss": 0.0344, + "step": 149590 + }, + { + "epoch": 0.018, + "grad_norm": 0.0455944761633873, + "learning_rate": 8.190285853247815e-06, + "loss": 0.0324, + "step": 149600 + }, + { + "epoch": 0.01805, + "grad_norm": 0.05375504493713379, + "learning_rate": 8.187226343916887e-06, + "loss": 0.0334, + "step": 149610 + }, + { + "epoch": 0.0181, + "grad_norm": 0.053510844707489014, + "learning_rate": 8.184167294240874e-06, + "loss": 0.0339, + "step": 149620 + }, + { + "epoch": 0.01815, + "grad_norm": 0.050734080374240875, + "learning_rate": 8.1811087043034e-06, + "loss": 0.0332, + "step": 149630 + }, + { + "epoch": 0.0182, + "grad_norm": 0.05367788299918175, + "learning_rate": 8.178050574188106e-06, + "loss": 0.0343, + "step": 149640 + }, + { + "epoch": 0.01825, + "grad_norm": 0.07095736265182495, + "learning_rate": 8.174992903978581e-06, + "loss": 0.0353, + "step": 149650 + }, + { + "epoch": 0.0183, + "grad_norm": 0.06505496054887772, + "learning_rate": 8.171935693758437e-06, + "loss": 0.0341, + "step": 149660 + }, + { + "epoch": 0.01835, + "grad_norm": 0.05861425772309303, + "learning_rate": 8.16887894361125e-06, + "loss": 0.0348, + "step": 149670 + }, + { + "epoch": 0.0184, + "grad_norm": 0.05692286789417267, + "learning_rate": 8.165822653620578e-06, + "loss": 0.0341, + "step": 149680 + }, + { + "epoch": 0.01845, + "grad_norm": 0.04605856165289879, + "learning_rate": 8.162766823870002e-06, + "loss": 0.0325, + "step": 149690 + }, + { + "epoch": 0.0185, + "grad_norm": 0.042736127972602844, + "learning_rate": 8.159711454443054e-06, + "loss": 0.0323, + "step": 149700 + }, + { + "epoch": 0.01855, + "grad_norm": 0.04387698322534561, + "learning_rate": 8.15665654542326e-06, + "loss": 0.0346, + "step": 149710 + }, + { + "epoch": 0.0186, + "grad_norm": 0.04287761077284813, + "learning_rate": 8.153602096894159e-06, + "loss": 0.0331, + "step": 149720 + }, + { + "epoch": 0.01865, + "grad_norm": 0.048668548464775085, + "learning_rate": 8.150548108939236e-06, + "loss": 0.0342, + "step": 149730 + }, + { + "epoch": 0.0187, + "grad_norm": 0.045768819749355316, + "learning_rate": 8.147494581642015e-06, + "loss": 0.0338, + "step": 149740 + }, + { + "epoch": 0.01875, + "grad_norm": 0.04629748314619064, + "learning_rate": 8.144441515085946e-06, + "loss": 0.0317, + "step": 149750 + }, + { + "epoch": 0.0188, + "grad_norm": 0.050622209906578064, + "learning_rate": 8.14138890935452e-06, + "loss": 0.0338, + "step": 149760 + }, + { + "epoch": 0.01885, + "grad_norm": 0.05139302834868431, + "learning_rate": 8.138336764531182e-06, + "loss": 0.0323, + "step": 149770 + }, + { + "epoch": 0.0189, + "grad_norm": 0.04795796051621437, + "learning_rate": 8.135285080699387e-06, + "loss": 0.032, + "step": 149780 + }, + { + "epoch": 0.01895, + "grad_norm": 0.04131333902478218, + "learning_rate": 8.132233857942564e-06, + "loss": 0.0322, + "step": 149790 + }, + { + "epoch": 0.019, + "grad_norm": 0.056618932634592056, + "learning_rate": 8.129183096344123e-06, + "loss": 0.0338, + "step": 149800 + }, + { + "epoch": 0.01905, + "grad_norm": 0.05299549549818039, + "learning_rate": 8.126132795987485e-06, + "loss": 0.0322, + "step": 149810 + }, + { + "epoch": 0.0191, + "grad_norm": 0.05240718647837639, + "learning_rate": 8.123082956956037e-06, + "loss": 0.0358, + "step": 149820 + }, + { + "epoch": 0.01915, + "grad_norm": 0.04959771782159805, + "learning_rate": 8.120033579333162e-06, + "loss": 0.0336, + "step": 149830 + }, + { + "epoch": 0.0192, + "grad_norm": 0.05134819075465202, + "learning_rate": 8.116984663202218e-06, + "loss": 0.0323, + "step": 149840 + }, + { + "epoch": 0.01925, + "grad_norm": 0.04623432829976082, + "learning_rate": 8.113936208646572e-06, + "loss": 0.0337, + "step": 149850 + }, + { + "epoch": 0.0193, + "grad_norm": 0.04831528291106224, + "learning_rate": 8.110888215749574e-06, + "loss": 0.0338, + "step": 149860 + }, + { + "epoch": 0.01935, + "grad_norm": 0.03964311257004738, + "learning_rate": 8.107840684594547e-06, + "loss": 0.0327, + "step": 149870 + }, + { + "epoch": 0.0194, + "grad_norm": 0.07050962001085281, + "learning_rate": 8.104793615264807e-06, + "loss": 0.0331, + "step": 149880 + }, + { + "epoch": 0.01945, + "grad_norm": 0.07801316678524017, + "learning_rate": 8.101747007843658e-06, + "loss": 0.0337, + "step": 149890 + }, + { + "epoch": 0.0195, + "grad_norm": 0.061165932565927505, + "learning_rate": 8.098700862414404e-06, + "loss": 0.0323, + "step": 149900 + }, + { + "epoch": 0.01955, + "grad_norm": 0.0446687787771225, + "learning_rate": 8.095655179060318e-06, + "loss": 0.0332, + "step": 149910 + }, + { + "epoch": 0.0196, + "grad_norm": 0.05215480923652649, + "learning_rate": 8.092609957864663e-06, + "loss": 0.0329, + "step": 149920 + }, + { + "epoch": 0.01965, + "grad_norm": 0.04582972824573517, + "learning_rate": 8.089565198910706e-06, + "loss": 0.032, + "step": 149930 + }, + { + "epoch": 0.0197, + "grad_norm": 0.05249819532036781, + "learning_rate": 8.086520902281677e-06, + "loss": 0.033, + "step": 149940 + }, + { + "epoch": 0.01975, + "grad_norm": 0.04421620070934296, + "learning_rate": 8.083477068060827e-06, + "loss": 0.0328, + "step": 149950 + }, + { + "epoch": 0.0198, + "grad_norm": 0.043347086757421494, + "learning_rate": 8.080433696331344e-06, + "loss": 0.0355, + "step": 149960 + }, + { + "epoch": 0.01985, + "grad_norm": 0.04572188854217529, + "learning_rate": 8.077390787176447e-06, + "loss": 0.0329, + "step": 149970 + }, + { + "epoch": 0.0199, + "grad_norm": 0.04811607301235199, + "learning_rate": 8.074348340679336e-06, + "loss": 0.0338, + "step": 149980 + }, + { + "epoch": 0.01995, + "grad_norm": 0.045632511377334595, + "learning_rate": 8.071306356923184e-06, + "loss": 0.0343, + "step": 149990 + }, + { + "epoch": 0.02, + "grad_norm": 0.047678910195827484, + "learning_rate": 8.068264835991155e-06, + "loss": 0.0326, + "step": 150000 + }, + { + "epoch": 0.02005, + "grad_norm": 0.046015266329050064, + "learning_rate": 8.065223777966394e-06, + "loss": 0.0344, + "step": 150010 + }, + { + "epoch": 0.0201, + "grad_norm": 0.04278041422367096, + "learning_rate": 8.062183182932065e-06, + "loss": 0.0341, + "step": 150020 + }, + { + "epoch": 0.02015, + "grad_norm": 0.04753004014492035, + "learning_rate": 8.059143050971283e-06, + "loss": 0.0357, + "step": 150030 + }, + { + "epoch": 0.0202, + "grad_norm": 0.04136762022972107, + "learning_rate": 8.056103382167156e-06, + "loss": 0.0331, + "step": 150040 + }, + { + "epoch": 0.02025, + "grad_norm": 0.04396098479628563, + "learning_rate": 8.053064176602806e-06, + "loss": 0.0351, + "step": 150050 + }, + { + "epoch": 0.0203, + "grad_norm": 0.047903914004564285, + "learning_rate": 8.050025434361308e-06, + "loss": 0.0329, + "step": 150060 + }, + { + "epoch": 0.02035, + "grad_norm": 0.046780530363321304, + "learning_rate": 8.046987155525754e-06, + "loss": 0.0335, + "step": 150070 + }, + { + "epoch": 0.0204, + "grad_norm": 0.05217145010828972, + "learning_rate": 8.043949340179203e-06, + "loss": 0.0341, + "step": 150080 + }, + { + "epoch": 0.02045, + "grad_norm": 0.045180562883615494, + "learning_rate": 8.040911988404697e-06, + "loss": 0.033, + "step": 150090 + }, + { + "epoch": 0.0205, + "grad_norm": 0.05050816759467125, + "learning_rate": 8.037875100285297e-06, + "loss": 0.0332, + "step": 150100 + }, + { + "epoch": 0.02055, + "grad_norm": 0.052226655185222626, + "learning_rate": 8.034838675904017e-06, + "loss": 0.0337, + "step": 150110 + }, + { + "epoch": 0.0206, + "grad_norm": 0.052902113646268845, + "learning_rate": 8.031802715343875e-06, + "loss": 0.0327, + "step": 150120 + }, + { + "epoch": 0.02065, + "grad_norm": 0.04634511470794678, + "learning_rate": 8.028767218687864e-06, + "loss": 0.0336, + "step": 150130 + }, + { + "epoch": 0.0207, + "grad_norm": 0.050081513822078705, + "learning_rate": 8.025732186018989e-06, + "loss": 0.0324, + "step": 150140 + }, + { + "epoch": 0.02075, + "grad_norm": 0.05681074783205986, + "learning_rate": 8.02269761742021e-06, + "loss": 0.0349, + "step": 150150 + }, + { + "epoch": 0.0208, + "grad_norm": 0.057326339185237885, + "learning_rate": 8.019663512974509e-06, + "loss": 0.034, + "step": 150160 + }, + { + "epoch": 0.02085, + "grad_norm": 0.05329889431595802, + "learning_rate": 8.01662987276483e-06, + "loss": 0.0344, + "step": 150170 + }, + { + "epoch": 0.0209, + "grad_norm": 0.048233937472105026, + "learning_rate": 8.0135966968741e-06, + "loss": 0.0335, + "step": 150180 + }, + { + "epoch": 0.02095, + "grad_norm": 0.0592784509062767, + "learning_rate": 8.010563985385264e-06, + "loss": 0.0342, + "step": 150190 + }, + { + "epoch": 0.021, + "grad_norm": 0.05796947330236435, + "learning_rate": 8.007531738381225e-06, + "loss": 0.0366, + "step": 150200 + }, + { + "epoch": 0.02105, + "grad_norm": 0.05200716853141785, + "learning_rate": 8.004499955944886e-06, + "loss": 0.0342, + "step": 150210 + }, + { + "epoch": 0.0211, + "grad_norm": 0.0459669791162014, + "learning_rate": 8.001468638159124e-06, + "loss": 0.0353, + "step": 150220 + }, + { + "epoch": 0.02115, + "grad_norm": 0.04280899465084076, + "learning_rate": 7.998437785106825e-06, + "loss": 0.0348, + "step": 150230 + }, + { + "epoch": 0.0212, + "grad_norm": 0.05121898651123047, + "learning_rate": 7.995407396870862e-06, + "loss": 0.0344, + "step": 150240 + }, + { + "epoch": 0.02125, + "grad_norm": 0.07777711749076843, + "learning_rate": 7.992377473534061e-06, + "loss": 0.036, + "step": 150250 + }, + { + "epoch": 0.0213, + "grad_norm": 0.04437794163823128, + "learning_rate": 7.989348015179274e-06, + "loss": 0.0342, + "step": 150260 + }, + { + "epoch": 0.02135, + "grad_norm": 0.049214769154787064, + "learning_rate": 7.986319021889316e-06, + "loss": 0.0354, + "step": 150270 + }, + { + "epoch": 0.0214, + "grad_norm": 0.049523212015628815, + "learning_rate": 7.983290493747012e-06, + "loss": 0.036, + "step": 150280 + }, + { + "epoch": 0.02145, + "grad_norm": 0.05269530043005943, + "learning_rate": 7.980262430835153e-06, + "loss": 0.0352, + "step": 150290 + }, + { + "epoch": 0.0215, + "grad_norm": 0.049670156091451645, + "learning_rate": 7.977234833236519e-06, + "loss": 0.0346, + "step": 150300 + }, + { + "epoch": 0.02155, + "grad_norm": 0.06403318047523499, + "learning_rate": 7.974207701033895e-06, + "loss": 0.0348, + "step": 150310 + }, + { + "epoch": 0.0216, + "grad_norm": 0.05004489794373512, + "learning_rate": 7.971181034310037e-06, + "loss": 0.0346, + "step": 150320 + }, + { + "epoch": 0.02165, + "grad_norm": 0.05563574284315109, + "learning_rate": 7.968154833147692e-06, + "loss": 0.0337, + "step": 150330 + }, + { + "epoch": 0.0217, + "grad_norm": 0.04923945292830467, + "learning_rate": 7.965129097629587e-06, + "loss": 0.0338, + "step": 150340 + }, + { + "epoch": 0.02175, + "grad_norm": 0.054012149572372437, + "learning_rate": 7.962103827838455e-06, + "loss": 0.0337, + "step": 150350 + }, + { + "epoch": 0.0218, + "grad_norm": 0.06014170125126839, + "learning_rate": 7.959079023857007e-06, + "loss": 0.0352, + "step": 150360 + }, + { + "epoch": 0.02185, + "grad_norm": 0.05869707465171814, + "learning_rate": 7.956054685767941e-06, + "loss": 0.0339, + "step": 150370 + }, + { + "epoch": 0.0219, + "grad_norm": 0.04561547935009003, + "learning_rate": 7.953030813653934e-06, + "loss": 0.0336, + "step": 150380 + }, + { + "epoch": 0.02195, + "grad_norm": 0.053486719727516174, + "learning_rate": 7.950007407597654e-06, + "loss": 0.0328, + "step": 150390 + }, + { + "epoch": 0.022, + "grad_norm": 0.047152649611234665, + "learning_rate": 7.946984467681773e-06, + "loss": 0.0328, + "step": 150400 + }, + { + "epoch": 0.02205, + "grad_norm": 0.05373353883624077, + "learning_rate": 7.94396199398893e-06, + "loss": 0.0321, + "step": 150410 + }, + { + "epoch": 0.0221, + "grad_norm": 0.04691655933856964, + "learning_rate": 7.94093998660175e-06, + "loss": 0.033, + "step": 150420 + }, + { + "epoch": 0.02215, + "grad_norm": 0.047290802001953125, + "learning_rate": 7.937918445602871e-06, + "loss": 0.0329, + "step": 150430 + }, + { + "epoch": 0.0222, + "grad_norm": 0.05117656663060188, + "learning_rate": 7.934897371074884e-06, + "loss": 0.0354, + "step": 150440 + }, + { + "epoch": 0.02225, + "grad_norm": 0.04447808116674423, + "learning_rate": 7.931876763100407e-06, + "loss": 0.0323, + "step": 150450 + }, + { + "epoch": 0.0223, + "grad_norm": 0.048859499394893646, + "learning_rate": 7.928856621761993e-06, + "loss": 0.0339, + "step": 150460 + }, + { + "epoch": 0.02235, + "grad_norm": 0.06283494830131531, + "learning_rate": 7.925836947142223e-06, + "loss": 0.033, + "step": 150470 + }, + { + "epoch": 0.0224, + "grad_norm": 0.05105786770582199, + "learning_rate": 7.922817739323665e-06, + "loss": 0.0327, + "step": 150480 + }, + { + "epoch": 0.02245, + "grad_norm": 0.045580655336380005, + "learning_rate": 7.919798998388856e-06, + "loss": 0.0335, + "step": 150490 + }, + { + "epoch": 0.0225, + "grad_norm": 0.060102153569459915, + "learning_rate": 7.916780724420326e-06, + "loss": 0.0322, + "step": 150500 + }, + { + "epoch": 0.02255, + "grad_norm": 0.042073022574186325, + "learning_rate": 7.91376291750058e-06, + "loss": 0.0317, + "step": 150510 + }, + { + "epoch": 0.0226, + "grad_norm": 0.04824558645486832, + "learning_rate": 7.91074557771215e-06, + "loss": 0.0324, + "step": 150520 + }, + { + "epoch": 0.02265, + "grad_norm": 0.04602494835853577, + "learning_rate": 7.907728705137516e-06, + "loss": 0.0317, + "step": 150530 + }, + { + "epoch": 0.0227, + "grad_norm": 0.04034003987908363, + "learning_rate": 7.904712299859145e-06, + "loss": 0.0313, + "step": 150540 + }, + { + "epoch": 0.02275, + "grad_norm": 0.0565539188683033, + "learning_rate": 7.901696361959532e-06, + "loss": 0.0337, + "step": 150550 + }, + { + "epoch": 0.0228, + "grad_norm": 0.04368201643228531, + "learning_rate": 7.898680891521105e-06, + "loss": 0.0324, + "step": 150560 + }, + { + "epoch": 0.02285, + "grad_norm": 0.046120114624500275, + "learning_rate": 7.895665888626325e-06, + "loss": 0.0312, + "step": 150570 + }, + { + "epoch": 0.0229, + "grad_norm": 0.049742378294467926, + "learning_rate": 7.892651353357616e-06, + "loss": 0.0337, + "step": 150580 + }, + { + "epoch": 0.02295, + "grad_norm": 0.050557803362607956, + "learning_rate": 7.889637285797391e-06, + "loss": 0.0341, + "step": 150590 + }, + { + "epoch": 0.023, + "grad_norm": 0.05375564098358154, + "learning_rate": 7.886623686028047e-06, + "loss": 0.0331, + "step": 150600 + }, + { + "epoch": 0.02305, + "grad_norm": 0.05111253634095192, + "learning_rate": 7.883610554131989e-06, + "loss": 0.0335, + "step": 150610 + }, + { + "epoch": 0.0231, + "grad_norm": 0.04323218762874603, + "learning_rate": 7.880597890191587e-06, + "loss": 0.0328, + "step": 150620 + }, + { + "epoch": 0.02315, + "grad_norm": 0.05458337441086769, + "learning_rate": 7.877585694289203e-06, + "loss": 0.0339, + "step": 150630 + }, + { + "epoch": 0.0232, + "grad_norm": 0.051620274782180786, + "learning_rate": 7.8745739665072e-06, + "loss": 0.0337, + "step": 150640 + }, + { + "epoch": 0.02325, + "grad_norm": 0.04495498538017273, + "learning_rate": 7.871562706927904e-06, + "loss": 0.0325, + "step": 150650 + }, + { + "epoch": 0.0233, + "grad_norm": 0.03468136861920357, + "learning_rate": 7.868551915633662e-06, + "loss": 0.0323, + "step": 150660 + }, + { + "epoch": 0.02335, + "grad_norm": 0.053658053278923035, + "learning_rate": 7.86554159270676e-06, + "loss": 0.0334, + "step": 150670 + }, + { + "epoch": 0.0234, + "grad_norm": 0.04565673694014549, + "learning_rate": 7.862531738229515e-06, + "loss": 0.0335, + "step": 150680 + }, + { + "epoch": 0.02345, + "grad_norm": 0.053873706609010696, + "learning_rate": 7.859522352284222e-06, + "loss": 0.0338, + "step": 150690 + }, + { + "epoch": 0.0235, + "grad_norm": 0.047571588307619095, + "learning_rate": 7.856513434953147e-06, + "loss": 0.033, + "step": 150700 + }, + { + "epoch": 0.02355, + "grad_norm": 0.05546896904706955, + "learning_rate": 7.853504986318555e-06, + "loss": 0.0329, + "step": 150710 + }, + { + "epoch": 0.0236, + "grad_norm": 0.05743944272398949, + "learning_rate": 7.850497006462684e-06, + "loss": 0.0333, + "step": 150720 + }, + { + "epoch": 0.02365, + "grad_norm": 0.056781135499477386, + "learning_rate": 7.847489495467786e-06, + "loss": 0.0347, + "step": 150730 + }, + { + "epoch": 0.0237, + "grad_norm": 0.08583921939134598, + "learning_rate": 7.844482453416096e-06, + "loss": 0.0334, + "step": 150740 + }, + { + "epoch": 0.02375, + "grad_norm": 0.058244261890649796, + "learning_rate": 7.841475880389795e-06, + "loss": 0.033, + "step": 150750 + }, + { + "epoch": 0.0238, + "grad_norm": 0.045430902391672134, + "learning_rate": 7.838469776471105e-06, + "loss": 0.0336, + "step": 150760 + }, + { + "epoch": 0.02385, + "grad_norm": 0.04519680514931679, + "learning_rate": 7.835464141742197e-06, + "loss": 0.0345, + "step": 150770 + }, + { + "epoch": 0.0239, + "grad_norm": 0.049604613333940506, + "learning_rate": 7.832458976285256e-06, + "loss": 0.034, + "step": 150780 + }, + { + "epoch": 0.02395, + "grad_norm": 0.04675721377134323, + "learning_rate": 7.829454280182442e-06, + "loss": 0.034, + "step": 150790 + }, + { + "epoch": 0.024, + "grad_norm": 0.05836396664381027, + "learning_rate": 7.826450053515886e-06, + "loss": 0.0334, + "step": 150800 + }, + { + "epoch": 0.02405, + "grad_norm": 0.047985147684812546, + "learning_rate": 7.823446296367739e-06, + "loss": 0.0339, + "step": 150810 + }, + { + "epoch": 0.0241, + "grad_norm": 0.05170302093029022, + "learning_rate": 7.820443008820122e-06, + "loss": 0.0331, + "step": 150820 + }, + { + "epoch": 0.02415, + "grad_norm": 0.04719945415854454, + "learning_rate": 7.817440190955137e-06, + "loss": 0.0354, + "step": 150830 + }, + { + "epoch": 0.0242, + "grad_norm": 0.06532011181116104, + "learning_rate": 7.814437842854875e-06, + "loss": 0.0369, + "step": 150840 + }, + { + "epoch": 0.02425, + "grad_norm": 0.04977961257100105, + "learning_rate": 7.811435964601432e-06, + "loss": 0.0324, + "step": 150850 + }, + { + "epoch": 0.0243, + "grad_norm": 0.0519782118499279, + "learning_rate": 7.808434556276866e-06, + "loss": 0.0347, + "step": 150860 + }, + { + "epoch": 0.02435, + "grad_norm": 0.07689446210861206, + "learning_rate": 7.805433617963251e-06, + "loss": 0.0338, + "step": 150870 + }, + { + "epoch": 0.0244, + "grad_norm": 0.06940881907939911, + "learning_rate": 7.802433149742617e-06, + "loss": 0.0349, + "step": 150880 + }, + { + "epoch": 0.02445, + "grad_norm": 0.06524743884801865, + "learning_rate": 7.799433151696995e-06, + "loss": 0.0339, + "step": 150890 + }, + { + "epoch": 0.0245, + "grad_norm": 0.04952292516827583, + "learning_rate": 7.796433623908413e-06, + "loss": 0.034, + "step": 150900 + }, + { + "epoch": 0.02455, + "grad_norm": 0.05044098570942879, + "learning_rate": 7.793434566458876e-06, + "loss": 0.0331, + "step": 150910 + }, + { + "epoch": 0.0246, + "grad_norm": 0.04742221534252167, + "learning_rate": 7.790435979430363e-06, + "loss": 0.0331, + "step": 150920 + }, + { + "epoch": 0.02465, + "grad_norm": 0.04695028066635132, + "learning_rate": 7.787437862904875e-06, + "loss": 0.0332, + "step": 150930 + }, + { + "epoch": 0.0247, + "grad_norm": 0.052516527473926544, + "learning_rate": 7.784440216964361e-06, + "loss": 0.0336, + "step": 150940 + }, + { + "epoch": 0.02475, + "grad_norm": 0.048497218638658524, + "learning_rate": 7.7814430416908e-06, + "loss": 0.0322, + "step": 150950 + }, + { + "epoch": 0.0248, + "grad_norm": 0.06365183740854263, + "learning_rate": 7.7784463371661e-06, + "loss": 0.0351, + "step": 150960 + }, + { + "epoch": 0.02485, + "grad_norm": 0.05415652319788933, + "learning_rate": 7.775450103472217e-06, + "loss": 0.0334, + "step": 150970 + }, + { + "epoch": 0.0249, + "grad_norm": 0.06245667114853859, + "learning_rate": 7.772454340691052e-06, + "loss": 0.037, + "step": 150980 + }, + { + "epoch": 0.02495, + "grad_norm": 0.06386829167604446, + "learning_rate": 7.769459048904518e-06, + "loss": 0.0342, + "step": 150990 + }, + { + "epoch": 0.025, + "grad_norm": 0.043057940900325775, + "learning_rate": 7.7664642281945e-06, + "loss": 0.0335, + "step": 151000 + }, + { + "epoch": 0.02505, + "grad_norm": 0.05063904449343681, + "learning_rate": 7.763469878642868e-06, + "loss": 0.0336, + "step": 151010 + }, + { + "epoch": 0.0251, + "grad_norm": 0.04828261956572533, + "learning_rate": 7.7604760003315e-06, + "loss": 0.0342, + "step": 151020 + }, + { + "epoch": 0.02515, + "grad_norm": 0.05346516892313957, + "learning_rate": 7.757482593342243e-06, + "loss": 0.034, + "step": 151030 + }, + { + "epoch": 0.0252, + "grad_norm": 0.049246352165937424, + "learning_rate": 7.754489657756938e-06, + "loss": 0.0339, + "step": 151040 + }, + { + "epoch": 0.02525, + "grad_norm": 0.06279134750366211, + "learning_rate": 7.751497193657396e-06, + "loss": 0.0335, + "step": 151050 + }, + { + "epoch": 0.0253, + "grad_norm": 0.04387279972434044, + "learning_rate": 7.748505201125438e-06, + "loss": 0.0324, + "step": 151060 + }, + { + "epoch": 0.02535, + "grad_norm": 0.05643988773226738, + "learning_rate": 7.74551368024288e-06, + "loss": 0.0357, + "step": 151070 + }, + { + "epoch": 0.0254, + "grad_norm": 0.0686245784163475, + "learning_rate": 7.742522631091492e-06, + "loss": 0.0348, + "step": 151080 + }, + { + "epoch": 0.02545, + "grad_norm": 0.05688413232564926, + "learning_rate": 7.739532053753055e-06, + "loss": 0.0325, + "step": 151090 + }, + { + "epoch": 0.0255, + "grad_norm": 0.04766567051410675, + "learning_rate": 7.736541948309314e-06, + "loss": 0.0336, + "step": 151100 + }, + { + "epoch": 0.02555, + "grad_norm": 0.06140110269188881, + "learning_rate": 7.733552314842043e-06, + "loss": 0.0345, + "step": 151110 + }, + { + "epoch": 0.0256, + "grad_norm": 0.09793264418840408, + "learning_rate": 7.730563153432965e-06, + "loss": 0.0324, + "step": 151120 + }, + { + "epoch": 0.02565, + "grad_norm": 0.05823006108403206, + "learning_rate": 7.727574464163792e-06, + "loss": 0.0318, + "step": 151130 + }, + { + "epoch": 0.0257, + "grad_norm": 0.04784746095538139, + "learning_rate": 7.724586247116256e-06, + "loss": 0.0326, + "step": 151140 + }, + { + "epoch": 0.02575, + "grad_norm": 0.0745970755815506, + "learning_rate": 7.72159850237203e-06, + "loss": 0.0349, + "step": 151150 + }, + { + "epoch": 0.0258, + "grad_norm": 0.053788185119628906, + "learning_rate": 7.718611230012826e-06, + "loss": 0.0333, + "step": 151160 + }, + { + "epoch": 0.02585, + "grad_norm": 0.05982402712106705, + "learning_rate": 7.715624430120286e-06, + "loss": 0.0347, + "step": 151170 + }, + { + "epoch": 0.0259, + "grad_norm": 0.04564383253455162, + "learning_rate": 7.712638102776076e-06, + "loss": 0.0315, + "step": 151180 + }, + { + "epoch": 0.02595, + "grad_norm": 0.040683455765247345, + "learning_rate": 7.709652248061858e-06, + "loss": 0.0322, + "step": 151190 + }, + { + "epoch": 0.026, + "grad_norm": 0.03503594174981117, + "learning_rate": 7.706666866059251e-06, + "loss": 0.0322, + "step": 151200 + }, + { + "epoch": 0.02605, + "grad_norm": 0.038955919444561005, + "learning_rate": 7.703681956849873e-06, + "loss": 0.0325, + "step": 151210 + }, + { + "epoch": 0.0261, + "grad_norm": 0.0518600158393383, + "learning_rate": 7.700697520515327e-06, + "loss": 0.0325, + "step": 151220 + }, + { + "epoch": 0.02615, + "grad_norm": 0.03950066119432449, + "learning_rate": 7.697713557137218e-06, + "loss": 0.0322, + "step": 151230 + }, + { + "epoch": 0.0262, + "grad_norm": 0.041293129324913025, + "learning_rate": 7.694730066797121e-06, + "loss": 0.0313, + "step": 151240 + }, + { + "epoch": 0.02625, + "grad_norm": 0.04547581821680069, + "learning_rate": 7.691747049576593e-06, + "loss": 0.0315, + "step": 151250 + }, + { + "epoch": 0.0263, + "grad_norm": 0.04586722329258919, + "learning_rate": 7.688764505557208e-06, + "loss": 0.0317, + "step": 151260 + }, + { + "epoch": 0.02635, + "grad_norm": 0.05034118890762329, + "learning_rate": 7.685782434820488e-06, + "loss": 0.0324, + "step": 151270 + }, + { + "epoch": 0.0264, + "grad_norm": 0.04623064771294594, + "learning_rate": 7.682800837447982e-06, + "loss": 0.0326, + "step": 151280 + }, + { + "epoch": 0.02645, + "grad_norm": 0.04324917495250702, + "learning_rate": 7.679819713521194e-06, + "loss": 0.0325, + "step": 151290 + }, + { + "epoch": 0.0265, + "grad_norm": 0.04792845994234085, + "learning_rate": 7.676839063121621e-06, + "loss": 0.0324, + "step": 151300 + }, + { + "epoch": 0.02655, + "grad_norm": 0.04959246516227722, + "learning_rate": 7.673858886330768e-06, + "loss": 0.0358, + "step": 151310 + }, + { + "epoch": 0.0266, + "grad_norm": 0.04334568977355957, + "learning_rate": 7.6708791832301e-06, + "loss": 0.0334, + "step": 151320 + }, + { + "epoch": 0.02665, + "grad_norm": 0.04518502950668335, + "learning_rate": 7.667899953901089e-06, + "loss": 0.0335, + "step": 151330 + }, + { + "epoch": 0.0267, + "grad_norm": 0.04649922251701355, + "learning_rate": 7.664921198425173e-06, + "loss": 0.0338, + "step": 151340 + }, + { + "epoch": 0.02675, + "grad_norm": 0.054697297513484955, + "learning_rate": 7.661942916883807e-06, + "loss": 0.0337, + "step": 151350 + }, + { + "epoch": 0.0268, + "grad_norm": 0.05046942085027695, + "learning_rate": 7.658965109358401e-06, + "loss": 0.0348, + "step": 151360 + }, + { + "epoch": 0.02685, + "grad_norm": 0.050941936671733856, + "learning_rate": 7.655987775930381e-06, + "loss": 0.034, + "step": 151370 + }, + { + "epoch": 0.0269, + "grad_norm": 0.04394199326634407, + "learning_rate": 7.653010916681141e-06, + "loss": 0.0353, + "step": 151380 + }, + { + "epoch": 0.02695, + "grad_norm": 0.049377329647541046, + "learning_rate": 7.650034531692055e-06, + "loss": 0.0354, + "step": 151390 + }, + { + "epoch": 0.027, + "grad_norm": 0.05509459972381592, + "learning_rate": 7.647058621044516e-06, + "loss": 0.0375, + "step": 151400 + }, + { + "epoch": 0.02705, + "grad_norm": 0.04569574445486069, + "learning_rate": 7.644083184819876e-06, + "loss": 0.0347, + "step": 151410 + }, + { + "epoch": 0.0271, + "grad_norm": 0.04505246505141258, + "learning_rate": 7.64110822309948e-06, + "loss": 0.0338, + "step": 151420 + }, + { + "epoch": 0.02715, + "grad_norm": 0.04855041578412056, + "learning_rate": 7.638133735964655e-06, + "loss": 0.0341, + "step": 151430 + }, + { + "epoch": 0.0272, + "grad_norm": 0.04252030327916145, + "learning_rate": 7.635159723496735e-06, + "loss": 0.0342, + "step": 151440 + }, + { + "epoch": 0.02725, + "grad_norm": 0.04364844784140587, + "learning_rate": 7.632186185777037e-06, + "loss": 0.0335, + "step": 151450 + }, + { + "epoch": 0.0273, + "grad_norm": 0.041523970663547516, + "learning_rate": 7.6292131228868305e-06, + "loss": 0.0361, + "step": 151460 + }, + { + "epoch": 0.02735, + "grad_norm": 0.03853632137179375, + "learning_rate": 7.626240534907417e-06, + "loss": 0.0336, + "step": 151470 + }, + { + "epoch": 0.0274, + "grad_norm": 0.045850805938243866, + "learning_rate": 7.6232684219200515e-06, + "loss": 0.0333, + "step": 151480 + }, + { + "epoch": 0.02745, + "grad_norm": 0.050423864275217056, + "learning_rate": 7.620296784006009e-06, + "loss": 0.035, + "step": 151490 + }, + { + "epoch": 0.0275, + "grad_norm": 0.059781789779663086, + "learning_rate": 7.617325621246523e-06, + "loss": 0.0349, + "step": 151500 + }, + { + "epoch": 0.02755, + "grad_norm": 0.054336484521627426, + "learning_rate": 7.6143549337228175e-06, + "loss": 0.0334, + "step": 151510 + }, + { + "epoch": 0.0276, + "grad_norm": 0.05191996321082115, + "learning_rate": 7.611384721516121e-06, + "loss": 0.0344, + "step": 151520 + }, + { + "epoch": 0.02765, + "grad_norm": 0.051657989621162415, + "learning_rate": 7.608414984707635e-06, + "loss": 0.0324, + "step": 151530 + }, + { + "epoch": 0.0277, + "grad_norm": 0.045168012380599976, + "learning_rate": 7.605445723378552e-06, + "loss": 0.0315, + "step": 151540 + }, + { + "epoch": 0.02775, + "grad_norm": 0.0468955934047699, + "learning_rate": 7.602476937610037e-06, + "loss": 0.0339, + "step": 151550 + }, + { + "epoch": 0.0278, + "grad_norm": 0.040408361703157425, + "learning_rate": 7.599508627483268e-06, + "loss": 0.0334, + "step": 151560 + }, + { + "epoch": 0.02785, + "grad_norm": 0.04691183194518089, + "learning_rate": 7.596540793079404e-06, + "loss": 0.0326, + "step": 151570 + }, + { + "epoch": 0.0279, + "grad_norm": 0.0571589358150959, + "learning_rate": 7.593573434479579e-06, + "loss": 0.0334, + "step": 151580 + }, + { + "epoch": 0.02795, + "grad_norm": 0.06184757128357887, + "learning_rate": 7.590606551764912e-06, + "loss": 0.0346, + "step": 151590 + }, + { + "epoch": 0.028, + "grad_norm": 0.047520771622657776, + "learning_rate": 7.5876401450165165e-06, + "loss": 0.0328, + "step": 151600 + }, + { + "epoch": 0.02805, + "grad_norm": 0.05671709403395653, + "learning_rate": 7.584674214315507e-06, + "loss": 0.0347, + "step": 151610 + }, + { + "epoch": 0.0281, + "grad_norm": 0.051435671746730804, + "learning_rate": 7.581708759742959e-06, + "loss": 0.0331, + "step": 151620 + }, + { + "epoch": 0.02815, + "grad_norm": 0.05739184468984604, + "learning_rate": 7.578743781379944e-06, + "loss": 0.0343, + "step": 151630 + }, + { + "epoch": 0.0282, + "grad_norm": 0.05141328275203705, + "learning_rate": 7.575779279307535e-06, + "loss": 0.0324, + "step": 151640 + }, + { + "epoch": 0.02825, + "grad_norm": 0.049890533089637756, + "learning_rate": 7.5728152536067686e-06, + "loss": 0.0337, + "step": 151650 + }, + { + "epoch": 0.0283, + "grad_norm": 0.043297071009874344, + "learning_rate": 7.569851704358699e-06, + "loss": 0.0326, + "step": 151660 + }, + { + "epoch": 0.02835, + "grad_norm": 0.04433068633079529, + "learning_rate": 7.566888631644323e-06, + "loss": 0.0331, + "step": 151670 + }, + { + "epoch": 0.0284, + "grad_norm": 0.054903291165828705, + "learning_rate": 7.56392603554466e-06, + "loss": 0.0335, + "step": 151680 + }, + { + "epoch": 0.02845, + "grad_norm": 0.04608643427491188, + "learning_rate": 7.56096391614072e-06, + "loss": 0.0324, + "step": 151690 + }, + { + "epoch": 0.0285, + "grad_norm": 0.041108906269073486, + "learning_rate": 7.5580022735134735e-06, + "loss": 0.0325, + "step": 151700 + }, + { + "epoch": 0.02855, + "grad_norm": 0.04551394283771515, + "learning_rate": 7.55504110774389e-06, + "loss": 0.0336, + "step": 151710 + }, + { + "epoch": 0.0286, + "grad_norm": 0.04845035448670387, + "learning_rate": 7.5520804189129245e-06, + "loss": 0.0335, + "step": 151720 + }, + { + "epoch": 0.02865, + "grad_norm": 0.04496077448129654, + "learning_rate": 7.549120207101532e-06, + "loss": 0.0333, + "step": 151730 + }, + { + "epoch": 0.0287, + "grad_norm": 0.05059627443552017, + "learning_rate": 7.546160472390634e-06, + "loss": 0.0339, + "step": 151740 + }, + { + "epoch": 0.02875, + "grad_norm": 0.048855770379304886, + "learning_rate": 7.543201214861148e-06, + "loss": 0.0336, + "step": 151750 + }, + { + "epoch": 0.0288, + "grad_norm": 0.06655757129192352, + "learning_rate": 7.5402424345939884e-06, + "loss": 0.0371, + "step": 151760 + }, + { + "epoch": 0.02885, + "grad_norm": 0.0655401349067688, + "learning_rate": 7.5372841316700335e-06, + "loss": 0.0346, + "step": 151770 + }, + { + "epoch": 0.0289, + "grad_norm": 0.05282272398471832, + "learning_rate": 7.534326306170178e-06, + "loss": 0.0344, + "step": 151780 + }, + { + "epoch": 0.02895, + "grad_norm": 0.04365290328860283, + "learning_rate": 7.531368958175281e-06, + "loss": 0.0329, + "step": 151790 + }, + { + "epoch": 0.029, + "grad_norm": 0.03897915780544281, + "learning_rate": 7.528412087766193e-06, + "loss": 0.0328, + "step": 151800 + }, + { + "epoch": 0.02905, + "grad_norm": 0.058162543922662735, + "learning_rate": 7.525455695023745e-06, + "loss": 0.0331, + "step": 151810 + }, + { + "epoch": 0.0291, + "grad_norm": 0.04760391265153885, + "learning_rate": 7.522499780028783e-06, + "loss": 0.0331, + "step": 151820 + }, + { + "epoch": 0.02915, + "grad_norm": 0.04888239502906799, + "learning_rate": 7.519544342862112e-06, + "loss": 0.0324, + "step": 151830 + }, + { + "epoch": 0.0292, + "grad_norm": 0.05259674787521362, + "learning_rate": 7.516589383604522e-06, + "loss": 0.0347, + "step": 151840 + }, + { + "epoch": 0.02925, + "grad_norm": 0.04942569509148598, + "learning_rate": 7.513634902336819e-06, + "loss": 0.0325, + "step": 151850 + }, + { + "epoch": 0.0293, + "grad_norm": 0.04997270554304123, + "learning_rate": 7.510680899139761e-06, + "loss": 0.0339, + "step": 151860 + }, + { + "epoch": 0.02935, + "grad_norm": 0.04699475318193436, + "learning_rate": 7.507727374094131e-06, + "loss": 0.0325, + "step": 151870 + }, + { + "epoch": 0.0294, + "grad_norm": 0.05285327881574631, + "learning_rate": 7.504774327280648e-06, + "loss": 0.0339, + "step": 151880 + }, + { + "epoch": 0.02945, + "grad_norm": 0.04937724769115448, + "learning_rate": 7.501821758780062e-06, + "loss": 0.0335, + "step": 151890 + }, + { + "epoch": 0.0295, + "grad_norm": 0.07506319880485535, + "learning_rate": 7.498869668673106e-06, + "loss": 0.0333, + "step": 151900 + }, + { + "epoch": 0.02955, + "grad_norm": 0.056446999311447144, + "learning_rate": 7.495918057040474e-06, + "loss": 0.033, + "step": 151910 + }, + { + "epoch": 0.0296, + "grad_norm": 0.04971238970756531, + "learning_rate": 7.492966923962869e-06, + "loss": 0.0325, + "step": 151920 + }, + { + "epoch": 0.02965, + "grad_norm": 0.052909985184669495, + "learning_rate": 7.490016269520963e-06, + "loss": 0.0335, + "step": 151930 + }, + { + "epoch": 0.0297, + "grad_norm": 0.04486185684800148, + "learning_rate": 7.487066093795434e-06, + "loss": 0.0324, + "step": 151940 + }, + { + "epoch": 0.02975, + "grad_norm": 0.05712229385972023, + "learning_rate": 7.4841163968669524e-06, + "loss": 0.034, + "step": 151950 + }, + { + "epoch": 0.0298, + "grad_norm": 0.050738777965307236, + "learning_rate": 7.481167178816134e-06, + "loss": 0.0323, + "step": 151960 + }, + { + "epoch": 0.02985, + "grad_norm": 0.059770889580249786, + "learning_rate": 7.478218439723633e-06, + "loss": 0.0336, + "step": 151970 + }, + { + "epoch": 0.0299, + "grad_norm": 0.04169420152902603, + "learning_rate": 7.475270179670046e-06, + "loss": 0.0329, + "step": 151980 + }, + { + "epoch": 0.02995, + "grad_norm": 0.05187474563717842, + "learning_rate": 7.472322398735998e-06, + "loss": 0.0339, + "step": 151990 + }, + { + "epoch": 0.03, + "grad_norm": 0.034226927906274796, + "learning_rate": 7.469375097002071e-06, + "loss": 0.0348, + "step": 152000 + }, + { + "epoch": 0.03005, + "grad_norm": 0.045822881162166595, + "learning_rate": 7.466428274548837e-06, + "loss": 0.0347, + "step": 152010 + }, + { + "epoch": 0.0301, + "grad_norm": 0.051734428852796555, + "learning_rate": 7.463481931456873e-06, + "loss": 0.0355, + "step": 152020 + }, + { + "epoch": 0.03015, + "grad_norm": 0.05155622959136963, + "learning_rate": 7.460536067806722e-06, + "loss": 0.0335, + "step": 152030 + }, + { + "epoch": 0.0302, + "grad_norm": 0.10209552198648453, + "learning_rate": 7.457590683678928e-06, + "loss": 0.0371, + "step": 152040 + }, + { + "epoch": 0.03025, + "grad_norm": 0.06496402621269226, + "learning_rate": 7.454645779154007e-06, + "loss": 0.0353, + "step": 152050 + }, + { + "epoch": 0.0303, + "grad_norm": 0.059463124722242355, + "learning_rate": 7.451701354312487e-06, + "loss": 0.0352, + "step": 152060 + }, + { + "epoch": 0.03035, + "grad_norm": 0.04688674584031105, + "learning_rate": 7.448757409234852e-06, + "loss": 0.0346, + "step": 152070 + }, + { + "epoch": 0.0304, + "grad_norm": 0.05447123199701309, + "learning_rate": 7.445813944001601e-06, + "loss": 0.034, + "step": 152080 + }, + { + "epoch": 0.03045, + "grad_norm": 0.05676489695906639, + "learning_rate": 7.442870958693204e-06, + "loss": 0.0338, + "step": 152090 + }, + { + "epoch": 0.0305, + "grad_norm": 0.04234858974814415, + "learning_rate": 7.439928453390111e-06, + "loss": 0.0347, + "step": 152100 + }, + { + "epoch": 0.03055, + "grad_norm": 0.03998962789773941, + "learning_rate": 7.436986428172785e-06, + "loss": 0.0328, + "step": 152110 + }, + { + "epoch": 0.0306, + "grad_norm": 0.046995196491479874, + "learning_rate": 7.434044883121652e-06, + "loss": 0.0336, + "step": 152120 + }, + { + "epoch": 0.03065, + "grad_norm": 0.0382130928337574, + "learning_rate": 7.431103818317123e-06, + "loss": 0.0313, + "step": 152130 + }, + { + "epoch": 0.0307, + "grad_norm": 0.04772825539112091, + "learning_rate": 7.428163233839624e-06, + "loss": 0.0323, + "step": 152140 + }, + { + "epoch": 0.03075, + "grad_norm": 0.04025093838572502, + "learning_rate": 7.4252231297695345e-06, + "loss": 0.0319, + "step": 152150 + }, + { + "epoch": 0.0308, + "grad_norm": 0.0439000241458416, + "learning_rate": 7.4222835061872554e-06, + "loss": 0.0316, + "step": 152160 + }, + { + "epoch": 0.03085, + "grad_norm": 0.04240845516324043, + "learning_rate": 7.419344363173128e-06, + "loss": 0.0307, + "step": 152170 + }, + { + "epoch": 0.0309, + "grad_norm": 0.06312000006437302, + "learning_rate": 7.416405700807527e-06, + "loss": 0.0328, + "step": 152180 + }, + { + "epoch": 0.03095, + "grad_norm": 0.04434799775481224, + "learning_rate": 7.413467519170783e-06, + "loss": 0.0328, + "step": 152190 + }, + { + "epoch": 0.031, + "grad_norm": 0.046847231686115265, + "learning_rate": 7.410529818343237e-06, + "loss": 0.0323, + "step": 152200 + }, + { + "epoch": 0.03105, + "grad_norm": 0.04772993549704552, + "learning_rate": 7.407592598405197e-06, + "loss": 0.0326, + "step": 152210 + }, + { + "epoch": 0.0311, + "grad_norm": 0.05239289253950119, + "learning_rate": 7.404655859436957e-06, + "loss": 0.0325, + "step": 152220 + }, + { + "epoch": 0.03115, + "grad_norm": 0.042153894901275635, + "learning_rate": 7.401719601518825e-06, + "loss": 0.0327, + "step": 152230 + }, + { + "epoch": 0.0312, + "grad_norm": 0.04499209299683571, + "learning_rate": 7.398783824731067e-06, + "loss": 0.0328, + "step": 152240 + }, + { + "epoch": 0.03125, + "grad_norm": 0.042803965508937836, + "learning_rate": 7.395848529153948e-06, + "loss": 0.0333, + "step": 152250 + }, + { + "epoch": 0.0313, + "grad_norm": 0.039181217551231384, + "learning_rate": 7.392913714867708e-06, + "loss": 0.0332, + "step": 152260 + }, + { + "epoch": 0.03135, + "grad_norm": 0.040339015424251556, + "learning_rate": 7.3899793819525945e-06, + "loss": 0.0377, + "step": 152270 + }, + { + "epoch": 0.0314, + "grad_norm": 0.05206860229372978, + "learning_rate": 7.387045530488834e-06, + "loss": 0.0331, + "step": 152280 + }, + { + "epoch": 0.03145, + "grad_norm": 0.04714643210172653, + "learning_rate": 7.384112160556633e-06, + "loss": 0.0345, + "step": 152290 + }, + { + "epoch": 0.0315, + "grad_norm": 0.053952787071466446, + "learning_rate": 7.381179272236186e-06, + "loss": 0.0344, + "step": 152300 + }, + { + "epoch": 0.03155, + "grad_norm": 0.04658818617463112, + "learning_rate": 7.378246865607672e-06, + "loss": 0.0345, + "step": 152310 + }, + { + "epoch": 0.0316, + "grad_norm": 0.05002017691731453, + "learning_rate": 7.375314940751277e-06, + "loss": 0.0329, + "step": 152320 + }, + { + "epoch": 0.03165, + "grad_norm": 0.04997474327683449, + "learning_rate": 7.372383497747149e-06, + "loss": 0.0338, + "step": 152330 + }, + { + "epoch": 0.0317, + "grad_norm": 0.04623773321509361, + "learning_rate": 7.369452536675425e-06, + "loss": 0.0321, + "step": 152340 + }, + { + "epoch": 0.03175, + "grad_norm": 0.044432058930397034, + "learning_rate": 7.366522057616257e-06, + "loss": 0.0336, + "step": 152350 + }, + { + "epoch": 0.0318, + "grad_norm": 0.05239921808242798, + "learning_rate": 7.363592060649741e-06, + "loss": 0.0328, + "step": 152360 + }, + { + "epoch": 0.03185, + "grad_norm": 0.04295680671930313, + "learning_rate": 7.360662545856006e-06, + "loss": 0.0331, + "step": 152370 + }, + { + "epoch": 0.0319, + "grad_norm": 0.05377604812383652, + "learning_rate": 7.357733513315118e-06, + "loss": 0.0334, + "step": 152380 + }, + { + "epoch": 0.03195, + "grad_norm": 0.047571294009685516, + "learning_rate": 7.354804963107165e-06, + "loss": 0.0328, + "step": 152390 + }, + { + "epoch": 0.032, + "grad_norm": 0.05396561324596405, + "learning_rate": 7.351876895312226e-06, + "loss": 0.0365, + "step": 152400 + }, + { + "epoch": 0.03205, + "grad_norm": 0.04686145484447479, + "learning_rate": 7.348949310010339e-06, + "loss": 0.0333, + "step": 152410 + }, + { + "epoch": 0.0321, + "grad_norm": 0.04809960350394249, + "learning_rate": 7.34602220728155e-06, + "loss": 0.0337, + "step": 152420 + }, + { + "epoch": 0.03215, + "grad_norm": 0.03867340087890625, + "learning_rate": 7.3430955872058724e-06, + "loss": 0.0337, + "step": 152430 + }, + { + "epoch": 0.0322, + "grad_norm": 0.09577205032110214, + "learning_rate": 7.340169449863335e-06, + "loss": 0.0329, + "step": 152440 + }, + { + "epoch": 0.03225, + "grad_norm": 0.04986964166164398, + "learning_rate": 7.337243795333931e-06, + "loss": 0.0346, + "step": 152450 + }, + { + "epoch": 0.0323, + "grad_norm": 0.05069958046078682, + "learning_rate": 7.334318623697639e-06, + "loss": 0.0334, + "step": 152460 + }, + { + "epoch": 0.03235, + "grad_norm": 0.042942311614751816, + "learning_rate": 7.3313939350344475e-06, + "loss": 0.0362, + "step": 152470 + }, + { + "epoch": 0.0324, + "grad_norm": 0.05629931390285492, + "learning_rate": 7.328469729424301e-06, + "loss": 0.0342, + "step": 152480 + }, + { + "epoch": 0.03245, + "grad_norm": 0.06817825883626938, + "learning_rate": 7.325546006947156e-06, + "loss": 0.037, + "step": 152490 + }, + { + "epoch": 0.0325, + "grad_norm": 0.060083985328674316, + "learning_rate": 7.322622767682949e-06, + "loss": 0.0351, + "step": 152500 + }, + { + "epoch": 0.03255, + "grad_norm": 0.044535864144563675, + "learning_rate": 7.319700011711584e-06, + "loss": 0.0335, + "step": 152510 + }, + { + "epoch": 0.0326, + "grad_norm": 0.0461667962372303, + "learning_rate": 7.316777739112985e-06, + "loss": 0.033, + "step": 152520 + }, + { + "epoch": 0.03265, + "grad_norm": 0.05126021057367325, + "learning_rate": 7.313855949967041e-06, + "loss": 0.0336, + "step": 152530 + }, + { + "epoch": 0.0327, + "grad_norm": 0.04119449481368065, + "learning_rate": 7.310934644353632e-06, + "loss": 0.0373, + "step": 152540 + }, + { + "epoch": 0.03275, + "grad_norm": 0.05020667612552643, + "learning_rate": 7.308013822352614e-06, + "loss": 0.0315, + "step": 152550 + }, + { + "epoch": 0.0328, + "grad_norm": 0.040357209742069244, + "learning_rate": 7.30509348404386e-06, + "loss": 0.0321, + "step": 152560 + }, + { + "epoch": 0.03285, + "grad_norm": 0.04429003596305847, + "learning_rate": 7.3021736295071975e-06, + "loss": 0.0324, + "step": 152570 + }, + { + "epoch": 0.0329, + "grad_norm": 0.07690481096506119, + "learning_rate": 7.2992542588224635e-06, + "loss": 0.0341, + "step": 152580 + }, + { + "epoch": 0.03295, + "grad_norm": 0.05730977654457092, + "learning_rate": 7.2963353720694685e-06, + "loss": 0.034, + "step": 152590 + }, + { + "epoch": 0.033, + "grad_norm": 0.047369591891765594, + "learning_rate": 7.293416969328007e-06, + "loss": 0.0321, + "step": 152600 + }, + { + "epoch": 0.03305, + "grad_norm": 0.06182011216878891, + "learning_rate": 7.290499050677882e-06, + "loss": 0.0337, + "step": 152610 + }, + { + "epoch": 0.0331, + "grad_norm": 0.06516768783330917, + "learning_rate": 7.287581616198858e-06, + "loss": 0.0329, + "step": 152620 + }, + { + "epoch": 0.03315, + "grad_norm": 0.05089006945490837, + "learning_rate": 7.2846646659707005e-06, + "loss": 0.0323, + "step": 152630 + }, + { + "epoch": 0.0332, + "grad_norm": 0.04354912415146828, + "learning_rate": 7.281748200073146e-06, + "loss": 0.0333, + "step": 152640 + }, + { + "epoch": 0.03325, + "grad_norm": 0.039361096918582916, + "learning_rate": 7.27883221858594e-06, + "loss": 0.0312, + "step": 152650 + }, + { + "epoch": 0.0333, + "grad_norm": 0.050111912190914154, + "learning_rate": 7.275916721588818e-06, + "loss": 0.0316, + "step": 152660 + }, + { + "epoch": 0.03335, + "grad_norm": 0.04241053760051727, + "learning_rate": 7.273001709161459e-06, + "loss": 0.0323, + "step": 152670 + }, + { + "epoch": 0.0334, + "grad_norm": 0.052290454506874084, + "learning_rate": 7.270087181383583e-06, + "loss": 0.0337, + "step": 152680 + }, + { + "epoch": 0.03345, + "grad_norm": 0.048461172729730606, + "learning_rate": 7.267173138334854e-06, + "loss": 0.0337, + "step": 152690 + }, + { + "epoch": 0.0335, + "grad_norm": 0.047181010246276855, + "learning_rate": 7.264259580094956e-06, + "loss": 0.034, + "step": 152700 + }, + { + "epoch": 0.03355, + "grad_norm": 0.058403100818395615, + "learning_rate": 7.261346506743538e-06, + "loss": 0.0337, + "step": 152710 + }, + { + "epoch": 0.0336, + "grad_norm": 0.0491732694208622, + "learning_rate": 7.258433918360238e-06, + "loss": 0.0338, + "step": 152720 + }, + { + "epoch": 0.03365, + "grad_norm": 0.04675913229584694, + "learning_rate": 7.255521815024694e-06, + "loss": 0.0339, + "step": 152730 + }, + { + "epoch": 0.0337, + "grad_norm": 0.03744709864258766, + "learning_rate": 7.252610196816517e-06, + "loss": 0.0322, + "step": 152740 + }, + { + "epoch": 0.03375, + "grad_norm": 0.04835504665970802, + "learning_rate": 7.249699063815313e-06, + "loss": 0.0326, + "step": 152750 + }, + { + "epoch": 0.0338, + "grad_norm": 0.04576469957828522, + "learning_rate": 7.246788416100658e-06, + "loss": 0.0333, + "step": 152760 + }, + { + "epoch": 0.03385, + "grad_norm": 0.04791948199272156, + "learning_rate": 7.24387825375214e-06, + "loss": 0.0327, + "step": 152770 + }, + { + "epoch": 0.0339, + "grad_norm": 0.0478530153632164, + "learning_rate": 7.240968576849324e-06, + "loss": 0.0324, + "step": 152780 + }, + { + "epoch": 0.03395, + "grad_norm": 0.04098542034626007, + "learning_rate": 7.23805938547176e-06, + "loss": 0.0334, + "step": 152790 + }, + { + "epoch": 0.034, + "grad_norm": 0.047090351581573486, + "learning_rate": 7.235150679698977e-06, + "loss": 0.0328, + "step": 152800 + }, + { + "epoch": 0.03405, + "grad_norm": 0.04444635286927223, + "learning_rate": 7.232242459610491e-06, + "loss": 0.0318, + "step": 152810 + }, + { + "epoch": 0.0341, + "grad_norm": 0.04326888918876648, + "learning_rate": 7.2293347252858305e-06, + "loss": 0.0322, + "step": 152820 + }, + { + "epoch": 0.03415, + "grad_norm": 0.050240468233823776, + "learning_rate": 7.226427476804484e-06, + "loss": 0.0344, + "step": 152830 + }, + { + "epoch": 0.0342, + "grad_norm": 0.04395769163966179, + "learning_rate": 7.223520714245924e-06, + "loss": 0.0321, + "step": 152840 + }, + { + "epoch": 0.03425, + "grad_norm": 0.04159471392631531, + "learning_rate": 7.220614437689638e-06, + "loss": 0.0328, + "step": 152850 + }, + { + "epoch": 0.0343, + "grad_norm": 0.05443045496940613, + "learning_rate": 7.217708647215063e-06, + "loss": 0.0343, + "step": 152860 + }, + { + "epoch": 0.03435, + "grad_norm": 0.042212799191474915, + "learning_rate": 7.214803342901671e-06, + "loss": 0.033, + "step": 152870 + }, + { + "epoch": 0.0344, + "grad_norm": 0.04545920342206955, + "learning_rate": 7.211898524828859e-06, + "loss": 0.0319, + "step": 152880 + }, + { + "epoch": 0.03445, + "grad_norm": 0.04602941870689392, + "learning_rate": 7.208994193076057e-06, + "loss": 0.0328, + "step": 152890 + }, + { + "epoch": 0.0345, + "grad_norm": 0.050577692687511444, + "learning_rate": 7.20609034772268e-06, + "loss": 0.0336, + "step": 152900 + }, + { + "epoch": 0.03455, + "grad_norm": 0.03855586424469948, + "learning_rate": 7.203186988848107e-06, + "loss": 0.0316, + "step": 152910 + }, + { + "epoch": 0.0346, + "grad_norm": 0.048223525285720825, + "learning_rate": 7.200284116531716e-06, + "loss": 0.0312, + "step": 152920 + }, + { + "epoch": 0.03465, + "grad_norm": 0.0466330461204052, + "learning_rate": 7.197381730852862e-06, + "loss": 0.0327, + "step": 152930 + }, + { + "epoch": 0.0347, + "grad_norm": 0.047722309827804565, + "learning_rate": 7.19447983189091e-06, + "loss": 0.031, + "step": 152940 + }, + { + "epoch": 0.03475, + "grad_norm": 0.04887279495596886, + "learning_rate": 7.191578419725192e-06, + "loss": 0.0313, + "step": 152950 + }, + { + "epoch": 0.0348, + "grad_norm": 0.0436398871243, + "learning_rate": 7.18867749443502e-06, + "loss": 0.0312, + "step": 152960 + }, + { + "epoch": 0.03485, + "grad_norm": 0.040157947689294815, + "learning_rate": 7.185777056099724e-06, + "loss": 0.0329, + "step": 152970 + }, + { + "epoch": 0.0349, + "grad_norm": 0.044679731130599976, + "learning_rate": 7.182877104798583e-06, + "loss": 0.0313, + "step": 152980 + }, + { + "epoch": 0.03495, + "grad_norm": 0.04023885726928711, + "learning_rate": 7.179977640610894e-06, + "loss": 0.0307, + "step": 152990 + }, + { + "epoch": 0.035, + "grad_norm": 0.04457961022853851, + "learning_rate": 7.177078663615921e-06, + "loss": 0.0306, + "step": 153000 + }, + { + "epoch": 0.03505, + "grad_norm": 0.04377239570021629, + "learning_rate": 7.174180173892925e-06, + "loss": 0.0322, + "step": 153010 + }, + { + "epoch": 0.0351, + "grad_norm": 0.042049601674079895, + "learning_rate": 7.171282171521138e-06, + "loss": 0.0329, + "step": 153020 + }, + { + "epoch": 0.03515, + "grad_norm": 0.04242038354277611, + "learning_rate": 7.168384656579804e-06, + "loss": 0.0319, + "step": 153030 + }, + { + "epoch": 0.0352, + "grad_norm": 0.052859652787446976, + "learning_rate": 7.165487629148135e-06, + "loss": 0.0329, + "step": 153040 + }, + { + "epoch": 0.03525, + "grad_norm": 0.05696389451622963, + "learning_rate": 7.162591089305326e-06, + "loss": 0.0321, + "step": 153050 + }, + { + "epoch": 0.0353, + "grad_norm": 0.05143987387418747, + "learning_rate": 7.1596950371305845e-06, + "loss": 0.0311, + "step": 153060 + }, + { + "epoch": 0.03535, + "grad_norm": 0.047393981367349625, + "learning_rate": 7.156799472703072e-06, + "loss": 0.0322, + "step": 153070 + }, + { + "epoch": 0.0354, + "grad_norm": 0.042954448610544205, + "learning_rate": 7.1539043961019706e-06, + "loss": 0.0318, + "step": 153080 + }, + { + "epoch": 0.03545, + "grad_norm": 0.05207926407456398, + "learning_rate": 7.151009807406403e-06, + "loss": 0.0321, + "step": 153090 + }, + { + "epoch": 0.0355, + "grad_norm": 0.04027773067355156, + "learning_rate": 7.148115706695524e-06, + "loss": 0.0329, + "step": 153100 + }, + { + "epoch": 0.03555, + "grad_norm": 0.0464700311422348, + "learning_rate": 7.145222094048462e-06, + "loss": 0.0325, + "step": 153110 + }, + { + "epoch": 0.0356, + "grad_norm": 0.043472904711961746, + "learning_rate": 7.142328969544321e-06, + "loss": 0.0321, + "step": 153120 + }, + { + "epoch": 0.03565, + "grad_norm": 0.04800725355744362, + "learning_rate": 7.139436333262195e-06, + "loss": 0.033, + "step": 153130 + }, + { + "epoch": 0.0357, + "grad_norm": 0.04517852142453194, + "learning_rate": 7.136544185281163e-06, + "loss": 0.0322, + "step": 153140 + }, + { + "epoch": 0.03575, + "grad_norm": 0.047509852796792984, + "learning_rate": 7.1336525256803034e-06, + "loss": 0.0332, + "step": 153150 + }, + { + "epoch": 0.0358, + "grad_norm": 0.05477967858314514, + "learning_rate": 7.130761354538687e-06, + "loss": 0.0331, + "step": 153160 + }, + { + "epoch": 0.03585, + "grad_norm": 0.0509810596704483, + "learning_rate": 7.127870671935324e-06, + "loss": 0.0336, + "step": 153170 + }, + { + "epoch": 0.0359, + "grad_norm": 0.04239951819181442, + "learning_rate": 7.124980477949272e-06, + "loss": 0.0328, + "step": 153180 + }, + { + "epoch": 0.03595, + "grad_norm": 0.07508710771799088, + "learning_rate": 7.122090772659531e-06, + "loss": 0.0355, + "step": 153190 + }, + { + "epoch": 0.036, + "grad_norm": 0.05007263273000717, + "learning_rate": 7.119201556145119e-06, + "loss": 0.0331, + "step": 153200 + }, + { + "epoch": 0.03605, + "grad_norm": 0.08754939585924149, + "learning_rate": 7.11631282848502e-06, + "loss": 0.0339, + "step": 153210 + }, + { + "epoch": 0.0361, + "grad_norm": 0.06672363728284836, + "learning_rate": 7.1134245897582e-06, + "loss": 0.0332, + "step": 153220 + }, + { + "epoch": 0.03615, + "grad_norm": 0.04547875002026558, + "learning_rate": 7.110536840043641e-06, + "loss": 0.0334, + "step": 153230 + }, + { + "epoch": 0.0362, + "grad_norm": 0.04429539293050766, + "learning_rate": 7.107649579420283e-06, + "loss": 0.0317, + "step": 153240 + }, + { + "epoch": 0.03625, + "grad_norm": 0.06021294370293617, + "learning_rate": 7.104762807967066e-06, + "loss": 0.0328, + "step": 153250 + }, + { + "epoch": 0.0363, + "grad_norm": 0.04863046109676361, + "learning_rate": 7.101876525762901e-06, + "loss": 0.0321, + "step": 153260 + }, + { + "epoch": 0.03635, + "grad_norm": 0.04178769141435623, + "learning_rate": 7.098990732886718e-06, + "loss": 0.0335, + "step": 153270 + }, + { + "epoch": 0.0364, + "grad_norm": 0.03681156039237976, + "learning_rate": 7.096105429417393e-06, + "loss": 0.0358, + "step": 153280 + }, + { + "epoch": 0.03645, + "grad_norm": 0.049410223960876465, + "learning_rate": 7.093220615433827e-06, + "loss": 0.0334, + "step": 153290 + }, + { + "epoch": 0.0365, + "grad_norm": 0.04368637502193451, + "learning_rate": 7.090336291014884e-06, + "loss": 0.0322, + "step": 153300 + }, + { + "epoch": 0.03655, + "grad_norm": 0.04722609370946884, + "learning_rate": 7.08745245623941e-06, + "loss": 0.0328, + "step": 153310 + }, + { + "epoch": 0.0366, + "grad_norm": 0.057430315762758255, + "learning_rate": 7.084569111186262e-06, + "loss": 0.0339, + "step": 153320 + }, + { + "epoch": 0.03665, + "grad_norm": 0.054572589695453644, + "learning_rate": 7.0816862559342664e-06, + "loss": 0.033, + "step": 153330 + }, + { + "epoch": 0.0367, + "grad_norm": 0.05039558559656143, + "learning_rate": 7.07880389056223e-06, + "loss": 0.0359, + "step": 153340 + }, + { + "epoch": 0.03675, + "grad_norm": 0.06452251225709915, + "learning_rate": 7.075922015148967e-06, + "loss": 0.0326, + "step": 153350 + }, + { + "epoch": 0.0368, + "grad_norm": 0.04780831187963486, + "learning_rate": 7.073040629773259e-06, + "loss": 0.0332, + "step": 153360 + }, + { + "epoch": 0.03685, + "grad_norm": 0.051232628524303436, + "learning_rate": 7.070159734513898e-06, + "loss": 0.0323, + "step": 153370 + }, + { + "epoch": 0.0369, + "grad_norm": 0.05242280662059784, + "learning_rate": 7.067279329449616e-06, + "loss": 0.0331, + "step": 153380 + }, + { + "epoch": 0.03695, + "grad_norm": 0.04547596722841263, + "learning_rate": 7.064399414659193e-06, + "loss": 0.0322, + "step": 153390 + }, + { + "epoch": 0.037, + "grad_norm": 0.051696255803108215, + "learning_rate": 7.061519990221341e-06, + "loss": 0.0337, + "step": 153400 + }, + { + "epoch": 0.03705, + "grad_norm": 0.0777956172823906, + "learning_rate": 7.058641056214801e-06, + "loss": 0.0332, + "step": 153410 + }, + { + "epoch": 0.0371, + "grad_norm": 0.04949126020073891, + "learning_rate": 7.055762612718275e-06, + "loss": 0.0323, + "step": 153420 + }, + { + "epoch": 0.03715, + "grad_norm": 0.05280499532818794, + "learning_rate": 7.052884659810452e-06, + "loss": 0.0318, + "step": 153430 + }, + { + "epoch": 0.0372, + "grad_norm": 0.050723519176244736, + "learning_rate": 7.050007197570024e-06, + "loss": 0.0333, + "step": 153440 + }, + { + "epoch": 0.03725, + "grad_norm": 0.05991830676794052, + "learning_rate": 7.04713022607566e-06, + "loss": 0.0326, + "step": 153450 + }, + { + "epoch": 0.0373, + "grad_norm": 0.04820108786225319, + "learning_rate": 7.044253745406007e-06, + "loss": 0.0328, + "step": 153460 + }, + { + "epoch": 0.03735, + "grad_norm": 0.04432525113224983, + "learning_rate": 7.0413777556397055e-06, + "loss": 0.0323, + "step": 153470 + }, + { + "epoch": 0.0374, + "grad_norm": 0.04415155202150345, + "learning_rate": 7.038502256855389e-06, + "loss": 0.0316, + "step": 153480 + }, + { + "epoch": 0.03745, + "grad_norm": 0.049692247062921524, + "learning_rate": 7.035627249131682e-06, + "loss": 0.0334, + "step": 153490 + }, + { + "epoch": 0.0375, + "grad_norm": 0.05337144061923027, + "learning_rate": 7.032752732547174e-06, + "loss": 0.0322, + "step": 153500 + }, + { + "epoch": 0.03755, + "grad_norm": 0.03996986150741577, + "learning_rate": 7.02987870718046e-06, + "loss": 0.0308, + "step": 153510 + }, + { + "epoch": 0.0376, + "grad_norm": 0.038624316453933716, + "learning_rate": 7.027005173110099e-06, + "loss": 0.0317, + "step": 153520 + }, + { + "epoch": 0.03765, + "grad_norm": 0.040363065898418427, + "learning_rate": 7.0241321304146765e-06, + "loss": 0.0328, + "step": 153530 + }, + { + "epoch": 0.0377, + "grad_norm": 0.04164469242095947, + "learning_rate": 7.021259579172726e-06, + "loss": 0.0323, + "step": 153540 + }, + { + "epoch": 0.03775, + "grad_norm": 0.046082183718681335, + "learning_rate": 7.018387519462777e-06, + "loss": 0.0325, + "step": 153550 + }, + { + "epoch": 0.0378, + "grad_norm": 0.04982241988182068, + "learning_rate": 7.0155159513633635e-06, + "loss": 0.0333, + "step": 153560 + }, + { + "epoch": 0.03785, + "grad_norm": 0.052763279527425766, + "learning_rate": 7.0126448749529836e-06, + "loss": 0.0333, + "step": 153570 + }, + { + "epoch": 0.0379, + "grad_norm": 0.04755253344774246, + "learning_rate": 7.009774290310148e-06, + "loss": 0.0326, + "step": 153580 + }, + { + "epoch": 0.03795, + "grad_norm": 0.07079523801803589, + "learning_rate": 7.006904197513308e-06, + "loss": 0.0348, + "step": 153590 + }, + { + "epoch": 0.038, + "grad_norm": 0.08428723365068436, + "learning_rate": 7.0040345966409514e-06, + "loss": 0.0342, + "step": 153600 + }, + { + "epoch": 0.03805, + "grad_norm": 0.04555227607488632, + "learning_rate": 7.001165487771536e-06, + "loss": 0.0327, + "step": 153610 + }, + { + "epoch": 0.0381, + "grad_norm": 0.04715776443481445, + "learning_rate": 6.998296870983489e-06, + "loss": 0.0338, + "step": 153620 + }, + { + "epoch": 0.03815, + "grad_norm": 0.04404429718852043, + "learning_rate": 6.995428746355248e-06, + "loss": 0.0347, + "step": 153630 + }, + { + "epoch": 0.0382, + "grad_norm": 0.045779477804899216, + "learning_rate": 6.99256111396521e-06, + "loss": 0.0329, + "step": 153640 + }, + { + "epoch": 0.03825, + "grad_norm": 0.046304021030664444, + "learning_rate": 6.989693973891795e-06, + "loss": 0.0338, + "step": 153650 + }, + { + "epoch": 0.0383, + "grad_norm": 0.051491059362888336, + "learning_rate": 6.986827326213383e-06, + "loss": 0.0345, + "step": 153660 + }, + { + "epoch": 0.03835, + "grad_norm": 0.048529427498579025, + "learning_rate": 6.9839611710083325e-06, + "loss": 0.034, + "step": 153670 + }, + { + "epoch": 0.0384, + "grad_norm": 0.04330417141318321, + "learning_rate": 6.981095508355026e-06, + "loss": 0.0332, + "step": 153680 + }, + { + "epoch": 0.03845, + "grad_norm": 0.044940460473299026, + "learning_rate": 6.97823033833179e-06, + "loss": 0.033, + "step": 153690 + }, + { + "epoch": 0.0385, + "grad_norm": 0.03975803032517433, + "learning_rate": 6.9753656610169745e-06, + "loss": 0.0313, + "step": 153700 + }, + { + "epoch": 0.03855, + "grad_norm": 0.04785052686929703, + "learning_rate": 6.972501476488891e-06, + "loss": 0.032, + "step": 153710 + }, + { + "epoch": 0.0386, + "grad_norm": 0.04156506061553955, + "learning_rate": 6.969637784825836e-06, + "loss": 0.0321, + "step": 153720 + }, + { + "epoch": 0.03865, + "grad_norm": 0.0429970882833004, + "learning_rate": 6.966774586106117e-06, + "loss": 0.0318, + "step": 153730 + }, + { + "epoch": 0.0387, + "grad_norm": 0.046473126858472824, + "learning_rate": 6.963911880408006e-06, + "loss": 0.0313, + "step": 153740 + }, + { + "epoch": 0.03875, + "grad_norm": 0.04150541126728058, + "learning_rate": 6.961049667809768e-06, + "loss": 0.0317, + "step": 153750 + }, + { + "epoch": 0.0388, + "grad_norm": 0.04941317439079285, + "learning_rate": 6.958187948389649e-06, + "loss": 0.0332, + "step": 153760 + }, + { + "epoch": 0.03885, + "grad_norm": 0.05646712705492973, + "learning_rate": 6.955326722225902e-06, + "loss": 0.0325, + "step": 153770 + }, + { + "epoch": 0.0389, + "grad_norm": 0.052297193557024, + "learning_rate": 6.952465989396733e-06, + "loss": 0.0322, + "step": 153780 + }, + { + "epoch": 0.03895, + "grad_norm": 0.047596145421266556, + "learning_rate": 6.949605749980375e-06, + "loss": 0.033, + "step": 153790 + }, + { + "epoch": 0.039, + "grad_norm": 0.04238370433449745, + "learning_rate": 6.9467460040550134e-06, + "loss": 0.0339, + "step": 153800 + }, + { + "epoch": 0.03905, + "grad_norm": 0.04646526649594307, + "learning_rate": 6.943886751698825e-06, + "loss": 0.0323, + "step": 153810 + }, + { + "epoch": 0.0391, + "grad_norm": 0.04616091027855873, + "learning_rate": 6.941027992989996e-06, + "loss": 0.0334, + "step": 153820 + }, + { + "epoch": 0.03915, + "grad_norm": 0.05092548951506615, + "learning_rate": 6.938169728006677e-06, + "loss": 0.0351, + "step": 153830 + }, + { + "epoch": 0.0392, + "grad_norm": 0.0540512315928936, + "learning_rate": 6.935311956827015e-06, + "loss": 0.0336, + "step": 153840 + }, + { + "epoch": 0.03925, + "grad_norm": 0.04341093450784683, + "learning_rate": 6.932454679529129e-06, + "loss": 0.0337, + "step": 153850 + }, + { + "epoch": 0.0393, + "grad_norm": 0.04935673996806145, + "learning_rate": 6.929597896191142e-06, + "loss": 0.0352, + "step": 153860 + }, + { + "epoch": 0.03935, + "grad_norm": 0.052990105003118515, + "learning_rate": 6.926741606891179e-06, + "loss": 0.0352, + "step": 153870 + }, + { + "epoch": 0.0394, + "grad_norm": 0.042551975697278976, + "learning_rate": 6.923885811707292e-06, + "loss": 0.0339, + "step": 153880 + }, + { + "epoch": 0.03945, + "grad_norm": 0.04354416951537132, + "learning_rate": 6.921030510717585e-06, + "loss": 0.035, + "step": 153890 + }, + { + "epoch": 0.0395, + "grad_norm": 0.05921914428472519, + "learning_rate": 6.918175704000104e-06, + "loss": 0.0327, + "step": 153900 + }, + { + "epoch": 0.03955, + "grad_norm": 0.05651251971721649, + "learning_rate": 6.915321391632915e-06, + "loss": 0.0337, + "step": 153910 + }, + { + "epoch": 0.0396, + "grad_norm": 0.07422874867916107, + "learning_rate": 6.912467573694042e-06, + "loss": 0.0349, + "step": 153920 + }, + { + "epoch": 0.03965, + "grad_norm": 0.04788966849446297, + "learning_rate": 6.909614250261507e-06, + "loss": 0.0325, + "step": 153930 + }, + { + "epoch": 0.0397, + "grad_norm": 0.0516754575073719, + "learning_rate": 6.906761421413327e-06, + "loss": 0.0345, + "step": 153940 + }, + { + "epoch": 0.03975, + "grad_norm": 0.04991249740123749, + "learning_rate": 6.903909087227495e-06, + "loss": 0.0336, + "step": 153950 + }, + { + "epoch": 0.0398, + "grad_norm": 0.04570896178483963, + "learning_rate": 6.901057247781986e-06, + "loss": 0.0346, + "step": 153960 + }, + { + "epoch": 0.03985, + "grad_norm": 0.04673990607261658, + "learning_rate": 6.898205903154767e-06, + "loss": 0.0334, + "step": 153970 + }, + { + "epoch": 0.0399, + "grad_norm": 0.04619419947266579, + "learning_rate": 6.895355053423799e-06, + "loss": 0.0342, + "step": 153980 + }, + { + "epoch": 0.03995, + "grad_norm": 0.050655726343393326, + "learning_rate": 6.8925046986670295e-06, + "loss": 0.0338, + "step": 153990 + }, + { + "epoch": 0.04, + "grad_norm": 0.047669846564531326, + "learning_rate": 6.889654838962379e-06, + "loss": 0.0349, + "step": 154000 + }, + { + "epoch": 0.04005, + "grad_norm": 0.042578499764204025, + "learning_rate": 6.886805474387759e-06, + "loss": 0.034, + "step": 154010 + }, + { + "epoch": 0.0401, + "grad_norm": 0.04635762423276901, + "learning_rate": 6.883956605021066e-06, + "loss": 0.0331, + "step": 154020 + }, + { + "epoch": 0.04015, + "grad_norm": 0.04816887900233269, + "learning_rate": 6.881108230940203e-06, + "loss": 0.0351, + "step": 154030 + }, + { + "epoch": 0.0402, + "grad_norm": 0.0424954779446125, + "learning_rate": 6.8782603522230314e-06, + "loss": 0.0336, + "step": 154040 + }, + { + "epoch": 0.04025, + "grad_norm": 0.04485035315155983, + "learning_rate": 6.875412968947409e-06, + "loss": 0.0339, + "step": 154050 + }, + { + "epoch": 0.0403, + "grad_norm": 0.044789139181375504, + "learning_rate": 6.8725660811911924e-06, + "loss": 0.0344, + "step": 154060 + }, + { + "epoch": 0.04035, + "grad_norm": 0.04421795904636383, + "learning_rate": 6.869719689032203e-06, + "loss": 0.0338, + "step": 154070 + }, + { + "epoch": 0.0404, + "grad_norm": 0.047407958656549454, + "learning_rate": 6.866873792548281e-06, + "loss": 0.0349, + "step": 154080 + }, + { + "epoch": 0.04045, + "grad_norm": 0.04338439926505089, + "learning_rate": 6.864028391817201e-06, + "loss": 0.0356, + "step": 154090 + }, + { + "epoch": 0.0405, + "grad_norm": 0.057887546718120575, + "learning_rate": 6.861183486916773e-06, + "loss": 0.0339, + "step": 154100 + }, + { + "epoch": 0.04055, + "grad_norm": 0.05058251693844795, + "learning_rate": 6.858339077924777e-06, + "loss": 0.0337, + "step": 154110 + }, + { + "epoch": 0.0406, + "grad_norm": 0.10525360703468323, + "learning_rate": 6.855495164918979e-06, + "loss": 0.0354, + "step": 154120 + }, + { + "epoch": 0.04065, + "grad_norm": 0.04736180230975151, + "learning_rate": 6.852651747977126e-06, + "loss": 0.0337, + "step": 154130 + }, + { + "epoch": 0.0407, + "grad_norm": 0.04907584190368652, + "learning_rate": 6.849808827176951e-06, + "loss": 0.0335, + "step": 154140 + }, + { + "epoch": 0.04075, + "grad_norm": 0.04502156376838684, + "learning_rate": 6.846966402596189e-06, + "loss": 0.0336, + "step": 154150 + }, + { + "epoch": 0.0408, + "grad_norm": 0.04660884663462639, + "learning_rate": 6.8441244743125466e-06, + "loss": 0.033, + "step": 154160 + }, + { + "epoch": 0.04085, + "grad_norm": 0.0450296513736248, + "learning_rate": 6.841283042403712e-06, + "loss": 0.0334, + "step": 154170 + }, + { + "epoch": 0.0409, + "grad_norm": 0.043547023087739944, + "learning_rate": 6.838442106947385e-06, + "loss": 0.0321, + "step": 154180 + }, + { + "epoch": 0.04095, + "grad_norm": 0.05435403436422348, + "learning_rate": 6.8356016680212215e-06, + "loss": 0.0336, + "step": 154190 + }, + { + "epoch": 0.041, + "grad_norm": 0.053517408668994904, + "learning_rate": 6.832761725702891e-06, + "loss": 0.0331, + "step": 154200 + }, + { + "epoch": 0.04105, + "grad_norm": 0.05222579836845398, + "learning_rate": 6.829922280070028e-06, + "loss": 0.0327, + "step": 154210 + }, + { + "epoch": 0.0411, + "grad_norm": 0.04833085462450981, + "learning_rate": 6.827083331200265e-06, + "loss": 0.0324, + "step": 154220 + }, + { + "epoch": 0.04115, + "grad_norm": 0.04768088459968567, + "learning_rate": 6.82424487917121e-06, + "loss": 0.0331, + "step": 154230 + }, + { + "epoch": 0.0412, + "grad_norm": 0.057974107563495636, + "learning_rate": 6.821406924060478e-06, + "loss": 0.034, + "step": 154240 + }, + { + "epoch": 0.04125, + "grad_norm": 0.05497260019183159, + "learning_rate": 6.818569465945654e-06, + "loss": 0.0324, + "step": 154250 + }, + { + "epoch": 0.0413, + "grad_norm": 0.05150937661528587, + "learning_rate": 6.815732504904298e-06, + "loss": 0.0322, + "step": 154260 + }, + { + "epoch": 0.04135, + "grad_norm": 0.0442415289580822, + "learning_rate": 6.812896041013994e-06, + "loss": 0.0325, + "step": 154270 + }, + { + "epoch": 0.0414, + "grad_norm": 0.04612816125154495, + "learning_rate": 6.810060074352273e-06, + "loss": 0.0333, + "step": 154280 + }, + { + "epoch": 0.04145, + "grad_norm": 0.054583415389060974, + "learning_rate": 6.80722460499669e-06, + "loss": 0.0339, + "step": 154290 + }, + { + "epoch": 0.0415, + "grad_norm": 0.05033816769719124, + "learning_rate": 6.804389633024738e-06, + "loss": 0.0352, + "step": 154300 + }, + { + "epoch": 0.04155, + "grad_norm": 0.04901561886072159, + "learning_rate": 6.801555158513937e-06, + "loss": 0.0346, + "step": 154310 + }, + { + "epoch": 0.0416, + "grad_norm": 0.053737394511699677, + "learning_rate": 6.798721181541787e-06, + "loss": 0.0341, + "step": 154320 + }, + { + "epoch": 0.04165, + "grad_norm": 0.05055680498480797, + "learning_rate": 6.795887702185763e-06, + "loss": 0.0339, + "step": 154330 + }, + { + "epoch": 0.0417, + "grad_norm": 0.04323967546224594, + "learning_rate": 6.793054720523329e-06, + "loss": 0.0324, + "step": 154340 + }, + { + "epoch": 0.04175, + "grad_norm": 0.047659508883953094, + "learning_rate": 6.790222236631933e-06, + "loss": 0.0325, + "step": 154350 + }, + { + "epoch": 0.0418, + "grad_norm": 0.03829479217529297, + "learning_rate": 6.787390250589018e-06, + "loss": 0.0328, + "step": 154360 + }, + { + "epoch": 0.04185, + "grad_norm": 0.06552759557962418, + "learning_rate": 6.784558762472029e-06, + "loss": 0.034, + "step": 154370 + }, + { + "epoch": 0.0419, + "grad_norm": 0.054503217339515686, + "learning_rate": 6.781727772358346e-06, + "loss": 0.0362, + "step": 154380 + }, + { + "epoch": 0.04195, + "grad_norm": 0.07738342881202698, + "learning_rate": 6.778897280325386e-06, + "loss": 0.0356, + "step": 154390 + }, + { + "epoch": 0.042, + "grad_norm": 0.03825148195028305, + "learning_rate": 6.776067286450521e-06, + "loss": 0.0333, + "step": 154400 + }, + { + "epoch": 0.04205, + "grad_norm": 0.04342617094516754, + "learning_rate": 6.773237790811141e-06, + "loss": 0.0334, + "step": 154410 + }, + { + "epoch": 0.0421, + "grad_norm": 0.04140163213014603, + "learning_rate": 6.77040879348459e-06, + "loss": 0.0331, + "step": 154420 + }, + { + "epoch": 0.04215, + "grad_norm": 0.047371137887239456, + "learning_rate": 6.767580294548207e-06, + "loss": 0.0327, + "step": 154430 + }, + { + "epoch": 0.0422, + "grad_norm": 0.03845300152897835, + "learning_rate": 6.764752294079335e-06, + "loss": 0.0321, + "step": 154440 + }, + { + "epoch": 0.04225, + "grad_norm": 0.0411493293941021, + "learning_rate": 6.7619247921552854e-06, + "loss": 0.0314, + "step": 154450 + }, + { + "epoch": 0.0423, + "grad_norm": 0.07412539422512054, + "learning_rate": 6.759097788853364e-06, + "loss": 0.0325, + "step": 154460 + }, + { + "epoch": 0.04235, + "grad_norm": 0.0695980116724968, + "learning_rate": 6.756271284250845e-06, + "loss": 0.0325, + "step": 154470 + }, + { + "epoch": 0.0424, + "grad_norm": 0.050232771784067154, + "learning_rate": 6.753445278425022e-06, + "loss": 0.0323, + "step": 154480 + }, + { + "epoch": 0.04245, + "grad_norm": 0.04724445194005966, + "learning_rate": 6.750619771453146e-06, + "loss": 0.0329, + "step": 154490 + }, + { + "epoch": 0.0425, + "grad_norm": 0.051420923322439194, + "learning_rate": 6.7477947634124735e-06, + "loss": 0.0329, + "step": 154500 + }, + { + "epoch": 0.04255, + "grad_norm": 0.04676588997244835, + "learning_rate": 6.744970254380237e-06, + "loss": 0.0329, + "step": 154510 + }, + { + "epoch": 0.0426, + "grad_norm": 0.04401226341724396, + "learning_rate": 6.742146244433648e-06, + "loss": 0.0328, + "step": 154520 + }, + { + "epoch": 0.04265, + "grad_norm": 0.042951542884111404, + "learning_rate": 6.73932273364993e-06, + "loss": 0.0328, + "step": 154530 + }, + { + "epoch": 0.0427, + "grad_norm": 0.03603766858577728, + "learning_rate": 6.736499722106266e-06, + "loss": 0.0336, + "step": 154540 + }, + { + "epoch": 0.04275, + "grad_norm": 0.04851701110601425, + "learning_rate": 6.733677209879832e-06, + "loss": 0.033, + "step": 154550 + }, + { + "epoch": 0.0428, + "grad_norm": 0.04410025477409363, + "learning_rate": 6.7308551970478085e-06, + "loss": 0.035, + "step": 154560 + }, + { + "epoch": 0.04285, + "grad_norm": 0.04761466011404991, + "learning_rate": 6.728033683687332e-06, + "loss": 0.034, + "step": 154570 + }, + { + "epoch": 0.0429, + "grad_norm": 0.051995132118463516, + "learning_rate": 6.725212669875567e-06, + "loss": 0.0344, + "step": 154580 + }, + { + "epoch": 0.04295, + "grad_norm": 0.04637273773550987, + "learning_rate": 6.722392155689605e-06, + "loss": 0.0343, + "step": 154590 + }, + { + "epoch": 0.043, + "grad_norm": 0.04322204366326332, + "learning_rate": 6.719572141206584e-06, + "loss": 0.0354, + "step": 154600 + }, + { + "epoch": 0.04305, + "grad_norm": 0.04716825112700462, + "learning_rate": 6.716752626503586e-06, + "loss": 0.0342, + "step": 154610 + }, + { + "epoch": 0.0431, + "grad_norm": 0.04313306137919426, + "learning_rate": 6.713933611657708e-06, + "loss": 0.0336, + "step": 154620 + }, + { + "epoch": 0.04315, + "grad_norm": 0.05106494948267937, + "learning_rate": 6.7111150967460155e-06, + "loss": 0.0349, + "step": 154630 + }, + { + "epoch": 0.0432, + "grad_norm": 0.04225478321313858, + "learning_rate": 6.7082970818455605e-06, + "loss": 0.0339, + "step": 154640 + }, + { + "epoch": 0.04325, + "grad_norm": 0.04658160358667374, + "learning_rate": 6.705479567033396e-06, + "loss": 0.034, + "step": 154650 + }, + { + "epoch": 0.0433, + "grad_norm": 0.04925062134861946, + "learning_rate": 6.7026625523865485e-06, + "loss": 0.0331, + "step": 154660 + }, + { + "epoch": 0.04335, + "grad_norm": 0.04897158220410347, + "learning_rate": 6.699846037982033e-06, + "loss": 0.0332, + "step": 154670 + }, + { + "epoch": 0.0434, + "grad_norm": 0.044098395854234695, + "learning_rate": 6.697030023896844e-06, + "loss": 0.0328, + "step": 154680 + }, + { + "epoch": 0.04345, + "grad_norm": 0.04288400709629059, + "learning_rate": 6.694214510207978e-06, + "loss": 0.034, + "step": 154690 + }, + { + "epoch": 0.0435, + "grad_norm": 0.04962920770049095, + "learning_rate": 6.6913994969924176e-06, + "loss": 0.0339, + "step": 154700 + }, + { + "epoch": 0.04355, + "grad_norm": 0.044396474957466125, + "learning_rate": 6.688584984327115e-06, + "loss": 0.0345, + "step": 154710 + }, + { + "epoch": 0.0436, + "grad_norm": 0.04685245454311371, + "learning_rate": 6.685770972289021e-06, + "loss": 0.0327, + "step": 154720 + }, + { + "epoch": 0.04365, + "grad_norm": 0.050125036388635635, + "learning_rate": 6.682957460955061e-06, + "loss": 0.0342, + "step": 154730 + }, + { + "epoch": 0.0437, + "grad_norm": 0.048011697828769684, + "learning_rate": 6.68014445040217e-06, + "loss": 0.0327, + "step": 154740 + }, + { + "epoch": 0.04375, + "grad_norm": 0.03696242347359657, + "learning_rate": 6.677331940707249e-06, + "loss": 0.0325, + "step": 154750 + }, + { + "epoch": 0.0438, + "grad_norm": 0.03834865242242813, + "learning_rate": 6.674519931947179e-06, + "loss": 0.0327, + "step": 154760 + }, + { + "epoch": 0.04385, + "grad_norm": 0.038775499910116196, + "learning_rate": 6.671708424198858e-06, + "loss": 0.0329, + "step": 154770 + }, + { + "epoch": 0.0439, + "grad_norm": 0.0430787093937397, + "learning_rate": 6.668897417539136e-06, + "loss": 0.0323, + "step": 154780 + }, + { + "epoch": 0.04395, + "grad_norm": 0.04222596436738968, + "learning_rate": 6.666086912044889e-06, + "loss": 0.0317, + "step": 154790 + }, + { + "epoch": 0.044, + "grad_norm": 0.04960113391280174, + "learning_rate": 6.663276907792921e-06, + "loss": 0.0329, + "step": 154800 + }, + { + "epoch": 0.04405, + "grad_norm": 0.04817935824394226, + "learning_rate": 6.660467404860071e-06, + "loss": 0.0314, + "step": 154810 + }, + { + "epoch": 0.0441, + "grad_norm": 0.057387106120586395, + "learning_rate": 6.657658403323164e-06, + "loss": 0.0339, + "step": 154820 + }, + { + "epoch": 0.04415, + "grad_norm": 0.05020061880350113, + "learning_rate": 6.654849903258983e-06, + "loss": 0.032, + "step": 154830 + }, + { + "epoch": 0.0442, + "grad_norm": 0.05041590705513954, + "learning_rate": 6.652041904744314e-06, + "loss": 0.0337, + "step": 154840 + }, + { + "epoch": 0.04425, + "grad_norm": 0.048387397080659866, + "learning_rate": 6.64923440785592e-06, + "loss": 0.0318, + "step": 154850 + }, + { + "epoch": 0.0443, + "grad_norm": 0.04516212269663811, + "learning_rate": 6.646427412670569e-06, + "loss": 0.0328, + "step": 154860 + }, + { + "epoch": 0.04435, + "grad_norm": 0.04717303439974785, + "learning_rate": 6.643620919264998e-06, + "loss": 0.0315, + "step": 154870 + }, + { + "epoch": 0.0444, + "grad_norm": 0.04056984931230545, + "learning_rate": 6.640814927715927e-06, + "loss": 0.0315, + "step": 154880 + }, + { + "epoch": 0.04445, + "grad_norm": 0.04171687737107277, + "learning_rate": 6.6380094381000875e-06, + "loss": 0.0314, + "step": 154890 + }, + { + "epoch": 0.0445, + "grad_norm": 0.039897263050079346, + "learning_rate": 6.635204450494162e-06, + "loss": 0.0317, + "step": 154900 + }, + { + "epoch": 0.04455, + "grad_norm": 0.03952673822641373, + "learning_rate": 6.632399964974856e-06, + "loss": 0.0331, + "step": 154910 + }, + { + "epoch": 0.0446, + "grad_norm": 0.04094167798757553, + "learning_rate": 6.629595981618836e-06, + "loss": 0.0326, + "step": 154920 + }, + { + "epoch": 0.04465, + "grad_norm": 0.04107446223497391, + "learning_rate": 6.6267925005027515e-06, + "loss": 0.0342, + "step": 154930 + }, + { + "epoch": 0.0447, + "grad_norm": 0.03734170272946358, + "learning_rate": 6.623989521703267e-06, + "loss": 0.0332, + "step": 154940 + }, + { + "epoch": 0.04475, + "grad_norm": 0.0554497204720974, + "learning_rate": 6.621187045297003e-06, + "loss": 0.0332, + "step": 154950 + }, + { + "epoch": 0.0448, + "grad_norm": 0.04572724923491478, + "learning_rate": 6.618385071360581e-06, + "loss": 0.0343, + "step": 154960 + }, + { + "epoch": 0.04485, + "grad_norm": 0.045753978192806244, + "learning_rate": 6.615583599970598e-06, + "loss": 0.0332, + "step": 154970 + }, + { + "epoch": 0.0449, + "grad_norm": 0.061113327741622925, + "learning_rate": 6.612782631203662e-06, + "loss": 0.0339, + "step": 154980 + }, + { + "epoch": 0.04495, + "grad_norm": 0.0401221327483654, + "learning_rate": 6.609982165136331e-06, + "loss": 0.0333, + "step": 154990 + }, + { + "epoch": 0.045, + "grad_norm": 0.05207759141921997, + "learning_rate": 6.607182201845188e-06, + "loss": 0.0333, + "step": 155000 + }, + { + "epoch": 0.04505, + "grad_norm": 0.048382531851530075, + "learning_rate": 6.604382741406773e-06, + "loss": 0.0345, + "step": 155010 + }, + { + "epoch": 0.0451, + "grad_norm": 0.050435684621334076, + "learning_rate": 6.601583783897617e-06, + "loss": 0.0339, + "step": 155020 + }, + { + "epoch": 0.04515, + "grad_norm": 0.052984338253736496, + "learning_rate": 6.598785329394252e-06, + "loss": 0.033, + "step": 155030 + }, + { + "epoch": 0.0452, + "grad_norm": 0.04428156837821007, + "learning_rate": 6.595987377973187e-06, + "loss": 0.0335, + "step": 155040 + }, + { + "epoch": 0.04525, + "grad_norm": 0.041596777737140656, + "learning_rate": 6.5931899297109106e-06, + "loss": 0.0336, + "step": 155050 + }, + { + "epoch": 0.0453, + "grad_norm": 0.03791414573788643, + "learning_rate": 6.590392984683902e-06, + "loss": 0.0329, + "step": 155060 + }, + { + "epoch": 0.04535, + "grad_norm": 0.050665635615587234, + "learning_rate": 6.58759654296863e-06, + "loss": 0.0341, + "step": 155070 + }, + { + "epoch": 0.0454, + "grad_norm": 0.05314168706536293, + "learning_rate": 6.584800604641567e-06, + "loss": 0.0333, + "step": 155080 + }, + { + "epoch": 0.04545, + "grad_norm": 0.045230139046907425, + "learning_rate": 6.582005169779123e-06, + "loss": 0.0336, + "step": 155090 + }, + { + "epoch": 0.0455, + "grad_norm": 0.03916275128722191, + "learning_rate": 6.579210238457745e-06, + "loss": 0.0324, + "step": 155100 + }, + { + "epoch": 0.04555, + "grad_norm": 0.05019238591194153, + "learning_rate": 6.576415810753833e-06, + "loss": 0.0336, + "step": 155110 + }, + { + "epoch": 0.0456, + "grad_norm": 0.041514333337545395, + "learning_rate": 6.5736218867437974e-06, + "loss": 0.0331, + "step": 155120 + }, + { + "epoch": 0.04565, + "grad_norm": 0.0469856783747673, + "learning_rate": 6.570828466504017e-06, + "loss": 0.0325, + "step": 155130 + }, + { + "epoch": 0.0457, + "grad_norm": 0.045561276376247406, + "learning_rate": 6.568035550110855e-06, + "loss": 0.0336, + "step": 155140 + }, + { + "epoch": 0.04575, + "grad_norm": 0.044542096555233, + "learning_rate": 6.565243137640683e-06, + "loss": 0.0334, + "step": 155150 + }, + { + "epoch": 0.0458, + "grad_norm": 0.04612474516034126, + "learning_rate": 6.562451229169839e-06, + "loss": 0.033, + "step": 155160 + }, + { + "epoch": 0.04585, + "grad_norm": 0.052123814821243286, + "learning_rate": 6.559659824774652e-06, + "loss": 0.0335, + "step": 155170 + }, + { + "epoch": 0.0459, + "grad_norm": 0.051325369626283646, + "learning_rate": 6.556868924531431e-06, + "loss": 0.0314, + "step": 155180 + }, + { + "epoch": 0.04595, + "grad_norm": 0.04750853776931763, + "learning_rate": 6.554078528516483e-06, + "loss": 0.0328, + "step": 155190 + }, + { + "epoch": 0.046, + "grad_norm": 0.04768137261271477, + "learning_rate": 6.551288636806108e-06, + "loss": 0.0327, + "step": 155200 + }, + { + "epoch": 0.04605, + "grad_norm": 0.03860845789313316, + "learning_rate": 6.54849924947657e-06, + "loss": 0.0339, + "step": 155210 + }, + { + "epoch": 0.0461, + "grad_norm": 0.05007299780845642, + "learning_rate": 6.54571036660413e-06, + "loss": 0.0331, + "step": 155220 + }, + { + "epoch": 0.04615, + "grad_norm": 0.07463549077510834, + "learning_rate": 6.5429219882650275e-06, + "loss": 0.0351, + "step": 155230 + }, + { + "epoch": 0.0462, + "grad_norm": 0.07337648421525955, + "learning_rate": 6.540134114535512e-06, + "loss": 0.0346, + "step": 155240 + }, + { + "epoch": 0.04625, + "grad_norm": 0.039757709950208664, + "learning_rate": 6.5373467454917955e-06, + "loss": 0.0334, + "step": 155250 + }, + { + "epoch": 0.0463, + "grad_norm": 0.058458272367715836, + "learning_rate": 6.534559881210073e-06, + "loss": 0.034, + "step": 155260 + }, + { + "epoch": 0.04635, + "grad_norm": 0.04493442922830582, + "learning_rate": 6.531773521766557e-06, + "loss": 0.0322, + "step": 155270 + }, + { + "epoch": 0.0464, + "grad_norm": 0.04561833664774895, + "learning_rate": 6.528987667237405e-06, + "loss": 0.0322, + "step": 155280 + }, + { + "epoch": 0.04645, + "grad_norm": 0.04375355690717697, + "learning_rate": 6.5262023176988065e-06, + "loss": 0.0322, + "step": 155290 + }, + { + "epoch": 0.0465, + "grad_norm": 0.043602511286735535, + "learning_rate": 6.5234174732268814e-06, + "loss": 0.0317, + "step": 155300 + }, + { + "epoch": 0.04655, + "grad_norm": 0.04093132168054581, + "learning_rate": 6.520633133897783e-06, + "loss": 0.032, + "step": 155310 + }, + { + "epoch": 0.0466, + "grad_norm": 0.06170346215367317, + "learning_rate": 6.517849299787637e-06, + "loss": 0.0348, + "step": 155320 + }, + { + "epoch": 0.04665, + "grad_norm": 0.04812345653772354, + "learning_rate": 6.515065970972548e-06, + "loss": 0.0332, + "step": 155330 + }, + { + "epoch": 0.0467, + "grad_norm": 0.058116111904382706, + "learning_rate": 6.5122831475286115e-06, + "loss": 0.0336, + "step": 155340 + }, + { + "epoch": 0.04675, + "grad_norm": 0.05342204123735428, + "learning_rate": 6.509500829531901e-06, + "loss": 0.0325, + "step": 155350 + }, + { + "epoch": 0.0468, + "grad_norm": 0.046791404485702515, + "learning_rate": 6.506719017058499e-06, + "loss": 0.033, + "step": 155360 + }, + { + "epoch": 0.04685, + "grad_norm": 0.0552666075527668, + "learning_rate": 6.503937710184452e-06, + "loss": 0.0333, + "step": 155370 + }, + { + "epoch": 0.0469, + "grad_norm": 0.05266944691538811, + "learning_rate": 6.501156908985792e-06, + "loss": 0.0367, + "step": 155380 + }, + { + "epoch": 0.04695, + "grad_norm": 0.06553314626216888, + "learning_rate": 6.498376613538556e-06, + "loss": 0.0331, + "step": 155390 + }, + { + "epoch": 0.047, + "grad_norm": 0.04647262021899223, + "learning_rate": 6.4955968239187505e-06, + "loss": 0.034, + "step": 155400 + }, + { + "epoch": 0.04705, + "grad_norm": 0.05105951428413391, + "learning_rate": 6.492817540202381e-06, + "loss": 0.0336, + "step": 155410 + }, + { + "epoch": 0.0471, + "grad_norm": 0.05529676377773285, + "learning_rate": 6.490038762465428e-06, + "loss": 0.0338, + "step": 155420 + }, + { + "epoch": 0.04715, + "grad_norm": 0.05912640690803528, + "learning_rate": 6.487260490783859e-06, + "loss": 0.0333, + "step": 155430 + }, + { + "epoch": 0.0472, + "grad_norm": 0.047453779727220535, + "learning_rate": 6.484482725233629e-06, + "loss": 0.0316, + "step": 155440 + }, + { + "epoch": 0.04725, + "grad_norm": 0.04527302831411362, + "learning_rate": 6.481705465890689e-06, + "loss": 0.0322, + "step": 155450 + }, + { + "epoch": 0.0473, + "grad_norm": 0.04401553422212601, + "learning_rate": 6.478928712830967e-06, + "loss": 0.0317, + "step": 155460 + }, + { + "epoch": 0.04735, + "grad_norm": 0.045882705599069595, + "learning_rate": 6.4761524661303695e-06, + "loss": 0.0319, + "step": 155470 + }, + { + "epoch": 0.0474, + "grad_norm": 0.04246285557746887, + "learning_rate": 6.47337672586481e-06, + "loss": 0.032, + "step": 155480 + }, + { + "epoch": 0.04745, + "grad_norm": 0.0423617921769619, + "learning_rate": 6.4706014921101645e-06, + "loss": 0.0322, + "step": 155490 + }, + { + "epoch": 0.0475, + "grad_norm": 0.043872471898794174, + "learning_rate": 6.467826764942319e-06, + "loss": 0.032, + "step": 155500 + }, + { + "epoch": 0.04755, + "grad_norm": 0.04625878110527992, + "learning_rate": 6.465052544437131e-06, + "loss": 0.0328, + "step": 155510 + }, + { + "epoch": 0.0476, + "grad_norm": 0.046431832015514374, + "learning_rate": 6.462278830670432e-06, + "loss": 0.0314, + "step": 155520 + }, + { + "epoch": 0.04765, + "grad_norm": 0.04099517688155174, + "learning_rate": 6.459505623718073e-06, + "loss": 0.0321, + "step": 155530 + }, + { + "epoch": 0.0477, + "grad_norm": 0.04131109267473221, + "learning_rate": 6.456732923655867e-06, + "loss": 0.0324, + "step": 155540 + }, + { + "epoch": 0.04775, + "grad_norm": 0.040093787014484406, + "learning_rate": 6.453960730559616e-06, + "loss": 0.032, + "step": 155550 + }, + { + "epoch": 0.0478, + "grad_norm": 0.04762173816561699, + "learning_rate": 6.451189044505104e-06, + "loss": 0.0334, + "step": 155560 + }, + { + "epoch": 0.04785, + "grad_norm": 0.05464419350028038, + "learning_rate": 6.4484178655681125e-06, + "loss": 0.0322, + "step": 155570 + }, + { + "epoch": 0.0479, + "grad_norm": 0.04581758379936218, + "learning_rate": 6.445647193824425e-06, + "loss": 0.0329, + "step": 155580 + }, + { + "epoch": 0.04795, + "grad_norm": 0.05991151183843613, + "learning_rate": 6.442877029349756e-06, + "loss": 0.0352, + "step": 155590 + }, + { + "epoch": 0.048, + "grad_norm": 0.06271591037511826, + "learning_rate": 6.4401073722198665e-06, + "loss": 0.0339, + "step": 155600 + }, + { + "epoch": 0.04805, + "grad_norm": 0.04929928481578827, + "learning_rate": 6.437338222510461e-06, + "loss": 0.0324, + "step": 155610 + }, + { + "epoch": 0.0481, + "grad_norm": 0.04830869287252426, + "learning_rate": 6.4345695802972615e-06, + "loss": 0.0336, + "step": 155620 + }, + { + "epoch": 0.04815, + "grad_norm": 0.05835934728384018, + "learning_rate": 6.431801445655955e-06, + "loss": 0.0332, + "step": 155630 + }, + { + "epoch": 0.0482, + "grad_norm": 0.0470610111951828, + "learning_rate": 6.429033818662214e-06, + "loss": 0.0326, + "step": 155640 + }, + { + "epoch": 0.04825, + "grad_norm": 0.04287794232368469, + "learning_rate": 6.426266699391717e-06, + "loss": 0.0321, + "step": 155650 + }, + { + "epoch": 0.0483, + "grad_norm": 0.03915727511048317, + "learning_rate": 6.4235000879201145e-06, + "loss": 0.0317, + "step": 155660 + }, + { + "epoch": 0.04835, + "grad_norm": 0.0421532541513443, + "learning_rate": 6.420733984323038e-06, + "loss": 0.0327, + "step": 155670 + }, + { + "epoch": 0.0484, + "grad_norm": 0.04445471987128258, + "learning_rate": 6.417968388676107e-06, + "loss": 0.0327, + "step": 155680 + }, + { + "epoch": 0.04845, + "grad_norm": 0.0533364862203598, + "learning_rate": 6.415203301054948e-06, + "loss": 0.033, + "step": 155690 + }, + { + "epoch": 0.0485, + "grad_norm": 0.04205596446990967, + "learning_rate": 6.412438721535141e-06, + "loss": 0.0324, + "step": 155700 + }, + { + "epoch": 0.04855, + "grad_norm": 0.0469808354973793, + "learning_rate": 6.409674650192282e-06, + "loss": 0.032, + "step": 155710 + }, + { + "epoch": 0.0486, + "grad_norm": 0.05807473510503769, + "learning_rate": 6.406911087101938e-06, + "loss": 0.0322, + "step": 155720 + }, + { + "epoch": 0.04865, + "grad_norm": 0.05198590084910393, + "learning_rate": 6.404148032339649e-06, + "loss": 0.0324, + "step": 155730 + }, + { + "epoch": 0.0487, + "grad_norm": 0.08055607229471207, + "learning_rate": 6.401385485980976e-06, + "loss": 0.0339, + "step": 155740 + }, + { + "epoch": 0.04875, + "grad_norm": 0.048523034900426865, + "learning_rate": 6.398623448101434e-06, + "loss": 0.0349, + "step": 155750 + }, + { + "epoch": 0.0488, + "grad_norm": 0.0461469441652298, + "learning_rate": 6.395861918776533e-06, + "loss": 0.0327, + "step": 155760 + }, + { + "epoch": 0.04885, + "grad_norm": 0.04415708780288696, + "learning_rate": 6.393100898081786e-06, + "loss": 0.0333, + "step": 155770 + }, + { + "epoch": 0.0489, + "grad_norm": 0.04208403080701828, + "learning_rate": 6.390340386092664e-06, + "loss": 0.0333, + "step": 155780 + }, + { + "epoch": 0.04895, + "grad_norm": 0.03819586709141731, + "learning_rate": 6.387580382884656e-06, + "loss": 0.0326, + "step": 155790 + }, + { + "epoch": 0.049, + "grad_norm": 0.049763914197683334, + "learning_rate": 6.384820888533194e-06, + "loss": 0.0337, + "step": 155800 + }, + { + "epoch": 0.04905, + "grad_norm": 0.04622012749314308, + "learning_rate": 6.382061903113743e-06, + "loss": 0.0335, + "step": 155810 + }, + { + "epoch": 0.0491, + "grad_norm": 0.039324529469013214, + "learning_rate": 6.379303426701719e-06, + "loss": 0.0327, + "step": 155820 + }, + { + "epoch": 0.04915, + "grad_norm": 0.0509813018143177, + "learning_rate": 6.376545459372552e-06, + "loss": 0.0343, + "step": 155830 + }, + { + "epoch": 0.0492, + "grad_norm": 0.06558476388454437, + "learning_rate": 6.3737880012016335e-06, + "loss": 0.0355, + "step": 155840 + }, + { + "epoch": 0.04925, + "grad_norm": 0.05903024226427078, + "learning_rate": 6.3710310522643455e-06, + "loss": 0.0338, + "step": 155850 + }, + { + "epoch": 0.0493, + "grad_norm": 0.04996471479535103, + "learning_rate": 6.36827461263608e-06, + "loss": 0.0347, + "step": 155860 + }, + { + "epoch": 0.04935, + "grad_norm": 0.05509728565812111, + "learning_rate": 6.365518682392186e-06, + "loss": 0.034, + "step": 155870 + }, + { + "epoch": 0.0494, + "grad_norm": 0.047669917345047, + "learning_rate": 6.3627632616080095e-06, + "loss": 0.0344, + "step": 155880 + }, + { + "epoch": 0.04945, + "grad_norm": 0.04750572144985199, + "learning_rate": 6.360008350358876e-06, + "loss": 0.0334, + "step": 155890 + }, + { + "epoch": 0.0495, + "grad_norm": 0.04436175152659416, + "learning_rate": 6.357253948720113e-06, + "loss": 0.0359, + "step": 155900 + }, + { + "epoch": 0.04955, + "grad_norm": 0.05103078484535217, + "learning_rate": 6.354500056767029e-06, + "loss": 0.0332, + "step": 155910 + }, + { + "epoch": 0.0496, + "grad_norm": 0.04148725047707558, + "learning_rate": 6.35174667457491e-06, + "loss": 0.0343, + "step": 155920 + }, + { + "epoch": 0.04965, + "grad_norm": 0.04589519277215004, + "learning_rate": 6.348993802219031e-06, + "loss": 0.0331, + "step": 155930 + }, + { + "epoch": 0.0497, + "grad_norm": 0.04733560234308243, + "learning_rate": 6.346241439774648e-06, + "loss": 0.0326, + "step": 155940 + }, + { + "epoch": 0.04975, + "grad_norm": 0.047574255615472794, + "learning_rate": 6.343489587317022e-06, + "loss": 0.0341, + "step": 155950 + }, + { + "epoch": 0.0498, + "grad_norm": 0.04523150995373726, + "learning_rate": 6.340738244921382e-06, + "loss": 0.0331, + "step": 155960 + }, + { + "epoch": 0.04985, + "grad_norm": 0.04262368753552437, + "learning_rate": 6.337987412662941e-06, + "loss": 0.0326, + "step": 155970 + }, + { + "epoch": 0.0499, + "grad_norm": 0.05411456897854805, + "learning_rate": 6.335237090616922e-06, + "loss": 0.0344, + "step": 155980 + }, + { + "epoch": 0.04995, + "grad_norm": 0.04779418185353279, + "learning_rate": 6.332487278858498e-06, + "loss": 0.0322, + "step": 155990 + }, + { + "epoch": 0.05, + "grad_norm": 0.04428921639919281, + "learning_rate": 6.329737977462877e-06, + "loss": 0.0326, + "step": 156000 + }, + { + "epoch": 0.05005, + "grad_norm": 0.051774270832538605, + "learning_rate": 6.326989186505192e-06, + "loss": 0.0349, + "step": 156010 + }, + { + "epoch": 0.0501, + "grad_norm": 0.05182502418756485, + "learning_rate": 6.324240906060602e-06, + "loss": 0.0343, + "step": 156020 + }, + { + "epoch": 0.05015, + "grad_norm": 0.047747816890478134, + "learning_rate": 6.321493136204262e-06, + "loss": 0.034, + "step": 156030 + }, + { + "epoch": 0.0502, + "grad_norm": 0.043513353914022446, + "learning_rate": 6.318745877011281e-06, + "loss": 0.0317, + "step": 156040 + }, + { + "epoch": 0.05025, + "grad_norm": 0.04383932426571846, + "learning_rate": 6.315999128556768e-06, + "loss": 0.0328, + "step": 156050 + }, + { + "epoch": 0.0503, + "grad_norm": 0.04224616289138794, + "learning_rate": 6.313252890915813e-06, + "loss": 0.0327, + "step": 156060 + }, + { + "epoch": 0.05035, + "grad_norm": 0.04971592128276825, + "learning_rate": 6.310507164163512e-06, + "loss": 0.0346, + "step": 156070 + }, + { + "epoch": 0.0504, + "grad_norm": 0.060726605355739594, + "learning_rate": 6.307761948374924e-06, + "loss": 0.0324, + "step": 156080 + }, + { + "epoch": 0.05045, + "grad_norm": 0.04588589072227478, + "learning_rate": 6.305017243625094e-06, + "loss": 0.032, + "step": 156090 + }, + { + "epoch": 0.0505, + "grad_norm": 0.04747506603598595, + "learning_rate": 6.302273049989077e-06, + "loss": 0.0326, + "step": 156100 + }, + { + "epoch": 0.05055, + "grad_norm": 0.043740060180425644, + "learning_rate": 6.299529367541882e-06, + "loss": 0.0337, + "step": 156110 + }, + { + "epoch": 0.0506, + "grad_norm": 0.03844473138451576, + "learning_rate": 6.296786196358537e-06, + "loss": 0.0325, + "step": 156120 + }, + { + "epoch": 0.05065, + "grad_norm": 0.045568566769361496, + "learning_rate": 6.29404353651403e-06, + "loss": 0.0335, + "step": 156130 + }, + { + "epoch": 0.0507, + "grad_norm": 0.04699577018618584, + "learning_rate": 6.291301388083337e-06, + "loss": 0.0328, + "step": 156140 + }, + { + "epoch": 0.05075, + "grad_norm": 0.04970989748835564, + "learning_rate": 6.288559751141443e-06, + "loss": 0.0323, + "step": 156150 + }, + { + "epoch": 0.0508, + "grad_norm": 0.046929433941841125, + "learning_rate": 6.285818625763299e-06, + "loss": 0.0334, + "step": 156160 + }, + { + "epoch": 0.05085, + "grad_norm": 0.050449028611183167, + "learning_rate": 6.283078012023841e-06, + "loss": 0.0338, + "step": 156170 + }, + { + "epoch": 0.0509, + "grad_norm": 0.05461689457297325, + "learning_rate": 6.280337909997991e-06, + "loss": 0.0355, + "step": 156180 + }, + { + "epoch": 0.05095, + "grad_norm": 0.04760809615254402, + "learning_rate": 6.277598319760677e-06, + "loss": 0.0343, + "step": 156190 + }, + { + "epoch": 0.051, + "grad_norm": 0.04256792366504669, + "learning_rate": 6.2748592413867854e-06, + "loss": 0.0336, + "step": 156200 + }, + { + "epoch": 0.05105, + "grad_norm": 0.039392486214637756, + "learning_rate": 6.272120674951212e-06, + "loss": 0.0327, + "step": 156210 + }, + { + "epoch": 0.0511, + "grad_norm": 0.03896992281079292, + "learning_rate": 6.269382620528827e-06, + "loss": 0.0339, + "step": 156220 + }, + { + "epoch": 0.05115, + "grad_norm": 0.04479774087667465, + "learning_rate": 6.266645078194475e-06, + "loss": 0.0329, + "step": 156230 + }, + { + "epoch": 0.0512, + "grad_norm": 0.045208126306533813, + "learning_rate": 6.263908048023015e-06, + "loss": 0.0339, + "step": 156240 + }, + { + "epoch": 0.05125, + "grad_norm": 0.048391927033662796, + "learning_rate": 6.2611715300892715e-06, + "loss": 0.0336, + "step": 156250 + }, + { + "epoch": 0.0513, + "grad_norm": 0.048994030803442, + "learning_rate": 6.258435524468059e-06, + "loss": 0.033, + "step": 156260 + }, + { + "epoch": 0.05135, + "grad_norm": 0.05166760832071304, + "learning_rate": 6.2557000312341715e-06, + "loss": 0.0328, + "step": 156270 + }, + { + "epoch": 0.0514, + "grad_norm": 0.08172396570444107, + "learning_rate": 6.252965050462403e-06, + "loss": 0.032, + "step": 156280 + }, + { + "epoch": 0.05145, + "grad_norm": 0.057565782219171524, + "learning_rate": 6.250230582227539e-06, + "loss": 0.0335, + "step": 156290 + }, + { + "epoch": 0.0515, + "grad_norm": 0.047792620956897736, + "learning_rate": 6.247496626604316e-06, + "loss": 0.0331, + "step": 156300 + }, + { + "epoch": 0.05155, + "grad_norm": 0.04930257424712181, + "learning_rate": 6.244763183667496e-06, + "loss": 0.0315, + "step": 156310 + }, + { + "epoch": 0.0516, + "grad_norm": 0.053147315979003906, + "learning_rate": 6.242030253491798e-06, + "loss": 0.0344, + "step": 156320 + }, + { + "epoch": 0.05165, + "grad_norm": 0.0465032234787941, + "learning_rate": 6.2392978361519525e-06, + "loss": 0.0316, + "step": 156330 + }, + { + "epoch": 0.0517, + "grad_norm": 0.05579762905836105, + "learning_rate": 6.2365659317226545e-06, + "loss": 0.0348, + "step": 156340 + }, + { + "epoch": 0.05175, + "grad_norm": 0.04690273851156235, + "learning_rate": 6.233834540278591e-06, + "loss": 0.0325, + "step": 156350 + }, + { + "epoch": 0.0518, + "grad_norm": 0.03898358345031738, + "learning_rate": 6.2311036618944464e-06, + "loss": 0.0333, + "step": 156360 + }, + { + "epoch": 0.05185, + "grad_norm": 0.05056428536772728, + "learning_rate": 6.228373296644877e-06, + "loss": 0.0361, + "step": 156370 + }, + { + "epoch": 0.0519, + "grad_norm": 0.04741507023572922, + "learning_rate": 6.225643444604529e-06, + "loss": 0.0335, + "step": 156380 + }, + { + "epoch": 0.05195, + "grad_norm": 0.0468575656414032, + "learning_rate": 6.2229141058480265e-06, + "loss": 0.0338, + "step": 156390 + }, + { + "epoch": 0.052, + "grad_norm": 0.046106282621622086, + "learning_rate": 6.2201852804500025e-06, + "loss": 0.0337, + "step": 156400 + }, + { + "epoch": 0.05205, + "grad_norm": 0.050913646817207336, + "learning_rate": 6.217456968485061e-06, + "loss": 0.0339, + "step": 156410 + }, + { + "epoch": 0.0521, + "grad_norm": 0.04370098561048508, + "learning_rate": 6.214729170027792e-06, + "loss": 0.0328, + "step": 156420 + }, + { + "epoch": 0.05215, + "grad_norm": 0.04934019222855568, + "learning_rate": 6.212001885152771e-06, + "loss": 0.0344, + "step": 156430 + }, + { + "epoch": 0.0522, + "grad_norm": 0.04041753336787224, + "learning_rate": 6.209275113934552e-06, + "loss": 0.0334, + "step": 156440 + }, + { + "epoch": 0.05225, + "grad_norm": 0.05130622163414955, + "learning_rate": 6.206548856447697e-06, + "loss": 0.0351, + "step": 156450 + }, + { + "epoch": 0.0523, + "grad_norm": 0.04814887419342995, + "learning_rate": 6.203823112766738e-06, + "loss": 0.0332, + "step": 156460 + }, + { + "epoch": 0.05235, + "grad_norm": 0.050084177404642105, + "learning_rate": 6.201097882966186e-06, + "loss": 0.0332, + "step": 156470 + }, + { + "epoch": 0.0524, + "grad_norm": 0.05311131477355957, + "learning_rate": 6.198373167120564e-06, + "loss": 0.035, + "step": 156480 + }, + { + "epoch": 0.05245, + "grad_norm": 0.054679013788700104, + "learning_rate": 6.195648965304348e-06, + "loss": 0.035, + "step": 156490 + }, + { + "epoch": 0.0525, + "grad_norm": 0.042142126709222794, + "learning_rate": 6.19292527759204e-06, + "loss": 0.034, + "step": 156500 + }, + { + "epoch": 0.05255, + "grad_norm": 0.04101485759019852, + "learning_rate": 6.190202104058074e-06, + "loss": 0.033, + "step": 156510 + }, + { + "epoch": 0.0526, + "grad_norm": 0.05309950187802315, + "learning_rate": 6.187479444776914e-06, + "loss": 0.0327, + "step": 156520 + }, + { + "epoch": 0.05265, + "grad_norm": 0.05121513828635216, + "learning_rate": 6.18475729982301e-06, + "loss": 0.0348, + "step": 156530 + }, + { + "epoch": 0.0527, + "grad_norm": 0.04467151314020157, + "learning_rate": 6.182035669270769e-06, + "loss": 0.0332, + "step": 156540 + }, + { + "epoch": 0.05275, + "grad_norm": 0.04641894996166229, + "learning_rate": 6.179314553194607e-06, + "loss": 0.0321, + "step": 156550 + }, + { + "epoch": 0.0528, + "grad_norm": 0.05059409141540527, + "learning_rate": 6.1765939516689045e-06, + "loss": 0.0331, + "step": 156560 + }, + { + "epoch": 0.05285, + "grad_norm": 0.049843620508909225, + "learning_rate": 6.173873864768059e-06, + "loss": 0.0334, + "step": 156570 + }, + { + "epoch": 0.0529, + "grad_norm": 0.03690807521343231, + "learning_rate": 6.1711542925664305e-06, + "loss": 0.0313, + "step": 156580 + }, + { + "epoch": 0.05295, + "grad_norm": 0.03983112797141075, + "learning_rate": 6.168435235138362e-06, + "loss": 0.0328, + "step": 156590 + }, + { + "epoch": 0.053, + "grad_norm": 0.04444614425301552, + "learning_rate": 6.1657166925582075e-06, + "loss": 0.0355, + "step": 156600 + }, + { + "epoch": 0.05305, + "grad_norm": 0.04253576695919037, + "learning_rate": 6.162998664900274e-06, + "loss": 0.0324, + "step": 156610 + }, + { + "epoch": 0.0531, + "grad_norm": 0.06054367870092392, + "learning_rate": 6.160281152238889e-06, + "loss": 0.0344, + "step": 156620 + }, + { + "epoch": 0.05315, + "grad_norm": 0.038273777812719345, + "learning_rate": 6.1575641546483405e-06, + "loss": 0.0319, + "step": 156630 + }, + { + "epoch": 0.0532, + "grad_norm": 0.049611713737249374, + "learning_rate": 6.154847672202907e-06, + "loss": 0.0311, + "step": 156640 + }, + { + "epoch": 0.05325, + "grad_norm": 0.04181080684065819, + "learning_rate": 6.1521317049768515e-06, + "loss": 0.0318, + "step": 156650 + }, + { + "epoch": 0.0533, + "grad_norm": 0.0436883345246315, + "learning_rate": 6.149416253044443e-06, + "loss": 0.0322, + "step": 156660 + }, + { + "epoch": 0.05335, + "grad_norm": 0.0463520772755146, + "learning_rate": 6.146701316479911e-06, + "loss": 0.0325, + "step": 156670 + }, + { + "epoch": 0.0534, + "grad_norm": 0.052116844803094864, + "learning_rate": 6.143986895357476e-06, + "loss": 0.0327, + "step": 156680 + }, + { + "epoch": 0.05345, + "grad_norm": 0.03883455693721771, + "learning_rate": 6.141272989751362e-06, + "loss": 0.0327, + "step": 156690 + }, + { + "epoch": 0.0535, + "grad_norm": 0.04669365659356117, + "learning_rate": 6.138559599735752e-06, + "loss": 0.0325, + "step": 156700 + }, + { + "epoch": 0.05355, + "grad_norm": 0.048563260585069656, + "learning_rate": 6.135846725384844e-06, + "loss": 0.0342, + "step": 156710 + }, + { + "epoch": 0.0536, + "grad_norm": 0.04328960180282593, + "learning_rate": 6.1331343667728e-06, + "loss": 0.0333, + "step": 156720 + }, + { + "epoch": 0.05365, + "grad_norm": 0.051987145096063614, + "learning_rate": 6.130422523973766e-06, + "loss": 0.0331, + "step": 156730 + }, + { + "epoch": 0.0537, + "grad_norm": 0.04236971586942673, + "learning_rate": 6.1277111970619e-06, + "loss": 0.0328, + "step": 156740 + }, + { + "epoch": 0.05375, + "grad_norm": 0.050633545964956284, + "learning_rate": 6.125000386111321e-06, + "loss": 0.0344, + "step": 156750 + }, + { + "epoch": 0.0538, + "grad_norm": 0.06444554030895233, + "learning_rate": 6.122290091196137e-06, + "loss": 0.0337, + "step": 156760 + }, + { + "epoch": 0.05385, + "grad_norm": 0.04705757647752762, + "learning_rate": 6.119580312390447e-06, + "loss": 0.034, + "step": 156770 + }, + { + "epoch": 0.0539, + "grad_norm": 0.049717262387275696, + "learning_rate": 6.116871049768333e-06, + "loss": 0.0346, + "step": 156780 + }, + { + "epoch": 0.05395, + "grad_norm": 0.056450214236974716, + "learning_rate": 6.114162303403889e-06, + "loss": 0.0346, + "step": 156790 + }, + { + "epoch": 0.054, + "grad_norm": 0.053649645298719406, + "learning_rate": 6.111454073371136e-06, + "loss": 0.0349, + "step": 156800 + }, + { + "epoch": 0.05405, + "grad_norm": 0.04505099356174469, + "learning_rate": 6.108746359744141e-06, + "loss": 0.0354, + "step": 156810 + }, + { + "epoch": 0.0541, + "grad_norm": 0.05334670469164848, + "learning_rate": 6.106039162596916e-06, + "loss": 0.0331, + "step": 156820 + }, + { + "epoch": 0.05415, + "grad_norm": 0.0388176292181015, + "learning_rate": 6.103332482003488e-06, + "loss": 0.0326, + "step": 156830 + }, + { + "epoch": 0.0542, + "grad_norm": 0.04762693867087364, + "learning_rate": 6.100626318037853e-06, + "loss": 0.0333, + "step": 156840 + }, + { + "epoch": 0.05425, + "grad_norm": 0.0472024604678154, + "learning_rate": 6.097920670773985e-06, + "loss": 0.0339, + "step": 156850 + }, + { + "epoch": 0.0543, + "grad_norm": 0.04920189082622528, + "learning_rate": 6.095215540285873e-06, + "loss": 0.0352, + "step": 156860 + }, + { + "epoch": 0.05435, + "grad_norm": 0.049344856292009354, + "learning_rate": 6.092510926647466e-06, + "loss": 0.0332, + "step": 156870 + }, + { + "epoch": 0.0544, + "grad_norm": 0.042951151728630066, + "learning_rate": 6.089806829932707e-06, + "loss": 0.0332, + "step": 156880 + }, + { + "epoch": 0.05445, + "grad_norm": 0.04782388359308243, + "learning_rate": 6.087103250215518e-06, + "loss": 0.0338, + "step": 156890 + }, + { + "epoch": 0.0545, + "grad_norm": 0.04540665075182915, + "learning_rate": 6.0844001875698275e-06, + "loss": 0.034, + "step": 156900 + }, + { + "epoch": 0.05455, + "grad_norm": 0.05436317250132561, + "learning_rate": 6.081697642069523e-06, + "loss": 0.0345, + "step": 156910 + }, + { + "epoch": 0.0546, + "grad_norm": 0.04602791741490364, + "learning_rate": 6.0789956137885045e-06, + "loss": 0.0341, + "step": 156920 + }, + { + "epoch": 0.05465, + "grad_norm": 0.04309307783842087, + "learning_rate": 6.0762941028006365e-06, + "loss": 0.0332, + "step": 156930 + }, + { + "epoch": 0.0547, + "grad_norm": 0.040035031735897064, + "learning_rate": 6.0735931091797735e-06, + "loss": 0.0343, + "step": 156940 + }, + { + "epoch": 0.05475, + "grad_norm": 0.04369068890810013, + "learning_rate": 6.070892632999769e-06, + "loss": 0.0346, + "step": 156950 + }, + { + "epoch": 0.0548, + "grad_norm": 0.04811226949095726, + "learning_rate": 6.068192674334449e-06, + "loss": 0.0339, + "step": 156960 + }, + { + "epoch": 0.05485, + "grad_norm": 0.048526640981435776, + "learning_rate": 6.065493233257624e-06, + "loss": 0.0355, + "step": 156970 + }, + { + "epoch": 0.0549, + "grad_norm": 0.0485498420894146, + "learning_rate": 6.062794309843106e-06, + "loss": 0.0344, + "step": 156980 + }, + { + "epoch": 0.05495, + "grad_norm": 0.05765537545084953, + "learning_rate": 6.060095904164673e-06, + "loss": 0.0348, + "step": 156990 + }, + { + "epoch": 0.055, + "grad_norm": 0.056350041180849075, + "learning_rate": 6.0573980162961145e-06, + "loss": 0.0336, + "step": 157000 + }, + { + "epoch": 0.05505, + "grad_norm": 0.05552685633301735, + "learning_rate": 6.0547006463111625e-06, + "loss": 0.0353, + "step": 157010 + }, + { + "epoch": 0.0551, + "grad_norm": 0.05745597183704376, + "learning_rate": 6.0520037942835865e-06, + "loss": 0.0361, + "step": 157020 + }, + { + "epoch": 0.05515, + "grad_norm": 0.04678814485669136, + "learning_rate": 6.049307460287101e-06, + "loss": 0.0348, + "step": 157030 + }, + { + "epoch": 0.0552, + "grad_norm": 0.04759768024086952, + "learning_rate": 6.046611644395437e-06, + "loss": 0.0343, + "step": 157040 + }, + { + "epoch": 0.05525, + "grad_norm": 0.04795956611633301, + "learning_rate": 6.043916346682288e-06, + "loss": 0.0338, + "step": 157050 + }, + { + "epoch": 0.0553, + "grad_norm": 0.0424167737364769, + "learning_rate": 6.041221567221339e-06, + "loss": 0.0335, + "step": 157060 + }, + { + "epoch": 0.05535, + "grad_norm": 0.04473331198096275, + "learning_rate": 6.0385273060862775e-06, + "loss": 0.0334, + "step": 157070 + }, + { + "epoch": 0.0554, + "grad_norm": 0.04596247524023056, + "learning_rate": 6.035833563350757e-06, + "loss": 0.0337, + "step": 157080 + }, + { + "epoch": 0.05545, + "grad_norm": 0.05837064981460571, + "learning_rate": 6.033140339088422e-06, + "loss": 0.0325, + "step": 157090 + }, + { + "epoch": 0.0555, + "grad_norm": 0.04107905924320221, + "learning_rate": 6.030447633372896e-06, + "loss": 0.0337, + "step": 157100 + }, + { + "epoch": 0.05555, + "grad_norm": 0.05001448467373848, + "learning_rate": 6.027755446277805e-06, + "loss": 0.0328, + "step": 157110 + }, + { + "epoch": 0.0556, + "grad_norm": 0.04254784807562828, + "learning_rate": 6.025063777876761e-06, + "loss": 0.0314, + "step": 157120 + }, + { + "epoch": 0.05565, + "grad_norm": 0.043021947145462036, + "learning_rate": 6.0223726282433445e-06, + "loss": 0.0329, + "step": 157130 + }, + { + "epoch": 0.0557, + "grad_norm": 0.039636269211769104, + "learning_rate": 6.019681997451132e-06, + "loss": 0.0358, + "step": 157140 + }, + { + "epoch": 0.05575, + "grad_norm": 0.04664872959256172, + "learning_rate": 6.016991885573672e-06, + "loss": 0.0316, + "step": 157150 + }, + { + "epoch": 0.0558, + "grad_norm": 0.04457870125770569, + "learning_rate": 6.014302292684534e-06, + "loss": 0.0323, + "step": 157160 + }, + { + "epoch": 0.05585, + "grad_norm": 0.05264793708920479, + "learning_rate": 6.011613218857237e-06, + "loss": 0.0331, + "step": 157170 + }, + { + "epoch": 0.0559, + "grad_norm": 0.04360983520746231, + "learning_rate": 6.0089246641652916e-06, + "loss": 0.0317, + "step": 157180 + }, + { + "epoch": 0.05595, + "grad_norm": 0.04845651239156723, + "learning_rate": 6.006236628682221e-06, + "loss": 0.0336, + "step": 157190 + }, + { + "epoch": 0.056, + "grad_norm": 0.04868089035153389, + "learning_rate": 6.003549112481496e-06, + "loss": 0.0322, + "step": 157200 + }, + { + "epoch": 0.05605, + "grad_norm": 0.03788581117987633, + "learning_rate": 6.0008621156366184e-06, + "loss": 0.0335, + "step": 157210 + }, + { + "epoch": 0.0561, + "grad_norm": 0.03969200700521469, + "learning_rate": 5.998175638221018e-06, + "loss": 0.0322, + "step": 157220 + }, + { + "epoch": 0.05615, + "grad_norm": 0.050981346517801285, + "learning_rate": 5.9954896803081585e-06, + "loss": 0.0323, + "step": 157230 + }, + { + "epoch": 0.0562, + "grad_norm": 0.043794598430395126, + "learning_rate": 5.992804241971475e-06, + "loss": 0.034, + "step": 157240 + }, + { + "epoch": 0.05625, + "grad_norm": 0.04196935519576073, + "learning_rate": 5.990119323284385e-06, + "loss": 0.0321, + "step": 157250 + }, + { + "epoch": 0.0563, + "grad_norm": 0.05080387368798256, + "learning_rate": 5.98743492432029e-06, + "loss": 0.0323, + "step": 157260 + }, + { + "epoch": 0.05635, + "grad_norm": 0.04499037191271782, + "learning_rate": 5.984751045152576e-06, + "loss": 0.0325, + "step": 157270 + }, + { + "epoch": 0.0564, + "grad_norm": 0.05216510593891144, + "learning_rate": 5.982067685854631e-06, + "loss": 0.0325, + "step": 157280 + }, + { + "epoch": 0.05645, + "grad_norm": 0.04326760023832321, + "learning_rate": 5.979384846499811e-06, + "loss": 0.0328, + "step": 157290 + }, + { + "epoch": 0.0565, + "grad_norm": 0.052308179438114166, + "learning_rate": 5.976702527161457e-06, + "loss": 0.0344, + "step": 157300 + }, + { + "epoch": 0.05655, + "grad_norm": 0.042604342103004456, + "learning_rate": 5.974020727912913e-06, + "loss": 0.0325, + "step": 157310 + }, + { + "epoch": 0.0566, + "grad_norm": 0.047163888812065125, + "learning_rate": 5.97133944882749e-06, + "loss": 0.0336, + "step": 157320 + }, + { + "epoch": 0.05665, + "grad_norm": 0.04162364825606346, + "learning_rate": 5.9686586899785065e-06, + "loss": 0.0366, + "step": 157330 + }, + { + "epoch": 0.0567, + "grad_norm": 0.041386570781469345, + "learning_rate": 5.965978451439242e-06, + "loss": 0.0344, + "step": 157340 + }, + { + "epoch": 0.05675, + "grad_norm": 0.0403309091925621, + "learning_rate": 5.963298733282968e-06, + "loss": 0.0338, + "step": 157350 + }, + { + "epoch": 0.0568, + "grad_norm": 0.04396276921033859, + "learning_rate": 5.960619535582965e-06, + "loss": 0.0346, + "step": 157360 + }, + { + "epoch": 0.05685, + "grad_norm": 0.04074764624238014, + "learning_rate": 5.957940858412469e-06, + "loss": 0.0346, + "step": 157370 + }, + { + "epoch": 0.0569, + "grad_norm": 0.053959768265485764, + "learning_rate": 5.9552627018447185e-06, + "loss": 0.0358, + "step": 157380 + }, + { + "epoch": 0.05695, + "grad_norm": 0.04601474478840828, + "learning_rate": 5.952585065952923e-06, + "loss": 0.0332, + "step": 157390 + }, + { + "epoch": 0.057, + "grad_norm": 0.050543323159217834, + "learning_rate": 5.949907950810302e-06, + "loss": 0.0347, + "step": 157400 + }, + { + "epoch": 0.05705, + "grad_norm": 0.045483339577913284, + "learning_rate": 5.9472313564900325e-06, + "loss": 0.0338, + "step": 157410 + }, + { + "epoch": 0.0571, + "grad_norm": 0.039692364633083344, + "learning_rate": 5.944555283065309e-06, + "loss": 0.034, + "step": 157420 + }, + { + "epoch": 0.05715, + "grad_norm": 0.03985406085848808, + "learning_rate": 5.941879730609284e-06, + "loss": 0.0333, + "step": 157430 + }, + { + "epoch": 0.0572, + "grad_norm": 0.03966673091053963, + "learning_rate": 5.939204699195103e-06, + "loss": 0.0339, + "step": 157440 + }, + { + "epoch": 0.05725, + "grad_norm": 0.05533109977841377, + "learning_rate": 5.936530188895908e-06, + "loss": 0.0345, + "step": 157450 + }, + { + "epoch": 0.0573, + "grad_norm": 0.04460675269365311, + "learning_rate": 5.933856199784821e-06, + "loss": 0.0347, + "step": 157460 + }, + { + "epoch": 0.05735, + "grad_norm": 0.0479511059820652, + "learning_rate": 5.931182731934939e-06, + "loss": 0.0345, + "step": 157470 + }, + { + "epoch": 0.0574, + "grad_norm": 0.054211050271987915, + "learning_rate": 5.92850978541935e-06, + "loss": 0.0335, + "step": 157480 + }, + { + "epoch": 0.05745, + "grad_norm": 0.05536212399601936, + "learning_rate": 5.92583736031114e-06, + "loss": 0.0364, + "step": 157490 + }, + { + "epoch": 0.0575, + "grad_norm": 0.04610919579863548, + "learning_rate": 5.923165456683383e-06, + "loss": 0.0335, + "step": 157500 + }, + { + "epoch": 0.05755, + "grad_norm": 0.04443402588367462, + "learning_rate": 5.920494074609104e-06, + "loss": 0.0351, + "step": 157510 + }, + { + "epoch": 0.0576, + "grad_norm": 0.038978736847639084, + "learning_rate": 5.917823214161356e-06, + "loss": 0.0325, + "step": 157520 + }, + { + "epoch": 0.05765, + "grad_norm": 0.04635247588157654, + "learning_rate": 5.915152875413144e-06, + "loss": 0.0336, + "step": 157530 + }, + { + "epoch": 0.0577, + "grad_norm": 0.05298512056469917, + "learning_rate": 5.912483058437487e-06, + "loss": 0.0337, + "step": 157540 + }, + { + "epoch": 0.05775, + "grad_norm": 0.04913019761443138, + "learning_rate": 5.909813763307376e-06, + "loss": 0.0324, + "step": 157550 + }, + { + "epoch": 0.0578, + "grad_norm": 0.04847853630781174, + "learning_rate": 5.907144990095778e-06, + "loss": 0.0339, + "step": 157560 + }, + { + "epoch": 0.05785, + "grad_norm": 0.048599254339933395, + "learning_rate": 5.9044767388756695e-06, + "loss": 0.0323, + "step": 157570 + }, + { + "epoch": 0.0579, + "grad_norm": 0.044404398649930954, + "learning_rate": 5.9018090097199915e-06, + "loss": 0.0332, + "step": 157580 + }, + { + "epoch": 0.05795, + "grad_norm": 0.05398423969745636, + "learning_rate": 5.899141802701683e-06, + "loss": 0.0321, + "step": 157590 + }, + { + "epoch": 0.058, + "grad_norm": 0.0505392923951149, + "learning_rate": 5.8964751178936516e-06, + "loss": 0.0327, + "step": 157600 + }, + { + "epoch": 0.05805, + "grad_norm": 0.06475657224655151, + "learning_rate": 5.893808955368818e-06, + "loss": 0.0341, + "step": 157610 + }, + { + "epoch": 0.0581, + "grad_norm": 0.05430496111512184, + "learning_rate": 5.891143315200073e-06, + "loss": 0.0329, + "step": 157620 + }, + { + "epoch": 0.05815, + "grad_norm": 0.045613840222358704, + "learning_rate": 5.888478197460293e-06, + "loss": 0.0308, + "step": 157630 + }, + { + "epoch": 0.0582, + "grad_norm": 0.0437619574368, + "learning_rate": 5.885813602222337e-06, + "loss": 0.0318, + "step": 157640 + }, + { + "epoch": 0.05825, + "grad_norm": 0.04621758684515953, + "learning_rate": 5.883149529559051e-06, + "loss": 0.0322, + "step": 157650 + }, + { + "epoch": 0.0583, + "grad_norm": 0.048031993210315704, + "learning_rate": 5.880485979543282e-06, + "loss": 0.0331, + "step": 157660 + }, + { + "epoch": 0.05835, + "grad_norm": 0.04733674228191376, + "learning_rate": 5.877822952247841e-06, + "loss": 0.0322, + "step": 157670 + }, + { + "epoch": 0.0584, + "grad_norm": 0.04080720245838165, + "learning_rate": 5.875160447745534e-06, + "loss": 0.0322, + "step": 157680 + }, + { + "epoch": 0.05845, + "grad_norm": 0.04310221970081329, + "learning_rate": 5.872498466109158e-06, + "loss": 0.0342, + "step": 157690 + }, + { + "epoch": 0.0585, + "grad_norm": 0.046025704592466354, + "learning_rate": 5.869837007411483e-06, + "loss": 0.0326, + "step": 157700 + }, + { + "epoch": 0.05855, + "grad_norm": 0.0485345833003521, + "learning_rate": 5.867176071725292e-06, + "loss": 0.0331, + "step": 157710 + }, + { + "epoch": 0.0586, + "grad_norm": 0.04319790005683899, + "learning_rate": 5.864515659123304e-06, + "loss": 0.0322, + "step": 157720 + }, + { + "epoch": 0.05865, + "grad_norm": 0.047216981649398804, + "learning_rate": 5.861855769678271e-06, + "loss": 0.0323, + "step": 157730 + }, + { + "epoch": 0.0587, + "grad_norm": 0.045297492295503616, + "learning_rate": 5.859196403462916e-06, + "loss": 0.0323, + "step": 157740 + }, + { + "epoch": 0.05875, + "grad_norm": 0.03981221094727516, + "learning_rate": 5.856537560549943e-06, + "loss": 0.0324, + "step": 157750 + }, + { + "epoch": 0.0588, + "grad_norm": 0.04106670618057251, + "learning_rate": 5.853879241012039e-06, + "loss": 0.0333, + "step": 157760 + }, + { + "epoch": 0.05885, + "grad_norm": 0.04203006625175476, + "learning_rate": 5.851221444921878e-06, + "loss": 0.0335, + "step": 157770 + }, + { + "epoch": 0.0589, + "grad_norm": 0.04400645196437836, + "learning_rate": 5.8485641723521364e-06, + "loss": 0.0338, + "step": 157780 + }, + { + "epoch": 0.05895, + "grad_norm": 0.04924129322171211, + "learning_rate": 5.845907423375455e-06, + "loss": 0.0328, + "step": 157790 + }, + { + "epoch": 0.059, + "grad_norm": 0.05753031000494957, + "learning_rate": 5.843251198064459e-06, + "loss": 0.0362, + "step": 157800 + }, + { + "epoch": 0.05905, + "grad_norm": 0.08049297332763672, + "learning_rate": 5.840595496491788e-06, + "loss": 0.0344, + "step": 157810 + }, + { + "epoch": 0.0591, + "grad_norm": 0.05114581808447838, + "learning_rate": 5.837940318730031e-06, + "loss": 0.0337, + "step": 157820 + }, + { + "epoch": 0.05915, + "grad_norm": 0.05438638851046562, + "learning_rate": 5.8352856648517945e-06, + "loss": 0.0325, + "step": 157830 + }, + { + "epoch": 0.0592, + "grad_norm": 0.053465284407138824, + "learning_rate": 5.8326315349296476e-06, + "loss": 0.0321, + "step": 157840 + }, + { + "epoch": 0.05925, + "grad_norm": 0.044169776141643524, + "learning_rate": 5.829977929036154e-06, + "loss": 0.0323, + "step": 157850 + }, + { + "epoch": 0.0593, + "grad_norm": 0.041343994438648224, + "learning_rate": 5.827324847243853e-06, + "loss": 0.0341, + "step": 157860 + }, + { + "epoch": 0.05935, + "grad_norm": 0.04701950401067734, + "learning_rate": 5.824672289625297e-06, + "loss": 0.0324, + "step": 157870 + }, + { + "epoch": 0.0594, + "grad_norm": 0.043092407286167145, + "learning_rate": 5.822020256252997e-06, + "loss": 0.0326, + "step": 157880 + }, + { + "epoch": 0.05945, + "grad_norm": 0.042744092643260956, + "learning_rate": 5.81936874719945e-06, + "loss": 0.0322, + "step": 157890 + }, + { + "epoch": 0.0595, + "grad_norm": 0.04643106460571289, + "learning_rate": 5.816717762537163e-06, + "loss": 0.0335, + "step": 157900 + }, + { + "epoch": 0.05955, + "grad_norm": 0.042287472635507584, + "learning_rate": 5.8140673023385965e-06, + "loss": 0.0325, + "step": 157910 + }, + { + "epoch": 0.0596, + "grad_norm": 0.04261939972639084, + "learning_rate": 5.8114173666762335e-06, + "loss": 0.0339, + "step": 157920 + }, + { + "epoch": 0.05965, + "grad_norm": 0.04192541539669037, + "learning_rate": 5.8087679556225075e-06, + "loss": 0.034, + "step": 157930 + }, + { + "epoch": 0.0597, + "grad_norm": 0.045936793088912964, + "learning_rate": 5.806119069249849e-06, + "loss": 0.0338, + "step": 157940 + }, + { + "epoch": 0.05975, + "grad_norm": 0.03764326870441437, + "learning_rate": 5.803470707630692e-06, + "loss": 0.0325, + "step": 157950 + }, + { + "epoch": 0.0598, + "grad_norm": 0.05140896141529083, + "learning_rate": 5.800822870837436e-06, + "loss": 0.0358, + "step": 157960 + }, + { + "epoch": 0.05985, + "grad_norm": 0.042791955173015594, + "learning_rate": 5.798175558942468e-06, + "loss": 0.0345, + "step": 157970 + }, + { + "epoch": 0.0599, + "grad_norm": 0.05092308670282364, + "learning_rate": 5.7955287720181574e-06, + "loss": 0.0329, + "step": 157980 + }, + { + "epoch": 0.05995, + "grad_norm": 0.04748637229204178, + "learning_rate": 5.792882510136879e-06, + "loss": 0.0333, + "step": 157990 + }, + { + "epoch": 0.06, + "grad_norm": 0.05431441217660904, + "learning_rate": 5.7902367733709915e-06, + "loss": 0.0342, + "step": 158000 + }, + { + "epoch": 0.06005, + "grad_norm": 0.04721912741661072, + "learning_rate": 5.787591561792796e-06, + "loss": 0.036, + "step": 158010 + }, + { + "epoch": 0.0601, + "grad_norm": 0.046072278171777725, + "learning_rate": 5.784946875474639e-06, + "loss": 0.0349, + "step": 158020 + }, + { + "epoch": 0.06015, + "grad_norm": 0.05052720382809639, + "learning_rate": 5.7823027144888075e-06, + "loss": 0.0341, + "step": 158030 + }, + { + "epoch": 0.0602, + "grad_norm": 0.043350934982299805, + "learning_rate": 5.779659078907607e-06, + "loss": 0.0332, + "step": 158040 + }, + { + "epoch": 0.06025, + "grad_norm": 0.04384405538439751, + "learning_rate": 5.777015968803307e-06, + "loss": 0.0332, + "step": 158050 + }, + { + "epoch": 0.0603, + "grad_norm": 0.050363317131996155, + "learning_rate": 5.774373384248163e-06, + "loss": 0.0332, + "step": 158060 + }, + { + "epoch": 0.06035, + "grad_norm": 0.04257839918136597, + "learning_rate": 5.771731325314433e-06, + "loss": 0.032, + "step": 158070 + }, + { + "epoch": 0.0604, + "grad_norm": 0.058831073343753815, + "learning_rate": 5.769089792074345e-06, + "loss": 0.0332, + "step": 158080 + }, + { + "epoch": 0.06045, + "grad_norm": 0.044406186789274216, + "learning_rate": 5.766448784600117e-06, + "loss": 0.034, + "step": 158090 + }, + { + "epoch": 0.0605, + "grad_norm": 0.05047066509723663, + "learning_rate": 5.763808302963949e-06, + "loss": 0.0356, + "step": 158100 + }, + { + "epoch": 0.06055, + "grad_norm": 0.0494023896753788, + "learning_rate": 5.761168347238041e-06, + "loss": 0.0341, + "step": 158110 + }, + { + "epoch": 0.0606, + "grad_norm": 0.05668467655777931, + "learning_rate": 5.758528917494554e-06, + "loss": 0.0344, + "step": 158120 + }, + { + "epoch": 0.06065, + "grad_norm": 0.04482339695096016, + "learning_rate": 5.7558900138056676e-06, + "loss": 0.0346, + "step": 158130 + }, + { + "epoch": 0.0607, + "grad_norm": 0.04861469194293022, + "learning_rate": 5.753251636243518e-06, + "loss": 0.0322, + "step": 158140 + }, + { + "epoch": 0.06075, + "grad_norm": 0.04743332415819168, + "learning_rate": 5.7506137848802295e-06, + "loss": 0.0345, + "step": 158150 + }, + { + "epoch": 0.0608, + "grad_norm": 0.038775719702243805, + "learning_rate": 5.747976459787935e-06, + "loss": 0.0335, + "step": 158160 + }, + { + "epoch": 0.06085, + "grad_norm": 0.048766493797302246, + "learning_rate": 5.745339661038732e-06, + "loss": 0.0332, + "step": 158170 + }, + { + "epoch": 0.0609, + "grad_norm": 0.04896395280957222, + "learning_rate": 5.7427033887047036e-06, + "loss": 0.0341, + "step": 158180 + }, + { + "epoch": 0.06095, + "grad_norm": 0.04900505766272545, + "learning_rate": 5.7400676428579356e-06, + "loss": 0.0339, + "step": 158190 + }, + { + "epoch": 0.061, + "grad_norm": 0.04926890507340431, + "learning_rate": 5.737432423570477e-06, + "loss": 0.0348, + "step": 158200 + }, + { + "epoch": 0.06105, + "grad_norm": 0.04470258206129074, + "learning_rate": 5.7347977309143904e-06, + "loss": 0.0334, + "step": 158210 + }, + { + "epoch": 0.0611, + "grad_norm": 0.04631470888853073, + "learning_rate": 5.732163564961684e-06, + "loss": 0.0334, + "step": 158220 + }, + { + "epoch": 0.06115, + "grad_norm": 0.03767970949411392, + "learning_rate": 5.729529925784394e-06, + "loss": 0.0331, + "step": 158230 + }, + { + "epoch": 0.0612, + "grad_norm": 0.044212449342012405, + "learning_rate": 5.726896813454511e-06, + "loss": 0.0328, + "step": 158240 + }, + { + "epoch": 0.06125, + "grad_norm": 0.04186420887708664, + "learning_rate": 5.724264228044032e-06, + "loss": 0.0326, + "step": 158250 + }, + { + "epoch": 0.0613, + "grad_norm": 0.042783111333847046, + "learning_rate": 5.72163216962493e-06, + "loss": 0.0328, + "step": 158260 + }, + { + "epoch": 0.06135, + "grad_norm": 0.04458148777484894, + "learning_rate": 5.719000638269154e-06, + "loss": 0.0328, + "step": 158270 + }, + { + "epoch": 0.0614, + "grad_norm": 0.053216926753520966, + "learning_rate": 5.716369634048665e-06, + "loss": 0.0331, + "step": 158280 + }, + { + "epoch": 0.06145, + "grad_norm": 0.04896605387330055, + "learning_rate": 5.713739157035386e-06, + "loss": 0.0336, + "step": 158290 + }, + { + "epoch": 0.0615, + "grad_norm": 0.04810934141278267, + "learning_rate": 5.711109207301232e-06, + "loss": 0.0333, + "step": 158300 + }, + { + "epoch": 0.06155, + "grad_norm": 0.04081054776906967, + "learning_rate": 5.708479784918097e-06, + "loss": 0.0326, + "step": 158310 + }, + { + "epoch": 0.0616, + "grad_norm": 0.04219655692577362, + "learning_rate": 5.705850889957881e-06, + "loss": 0.0323, + "step": 158320 + }, + { + "epoch": 0.06165, + "grad_norm": 0.03747668117284775, + "learning_rate": 5.703222522492457e-06, + "loss": 0.0356, + "step": 158330 + }, + { + "epoch": 0.0617, + "grad_norm": 0.040792036801576614, + "learning_rate": 5.700594682593682e-06, + "loss": 0.0341, + "step": 158340 + }, + { + "epoch": 0.06175, + "grad_norm": 0.04115452989935875, + "learning_rate": 5.697967370333396e-06, + "loss": 0.033, + "step": 158350 + }, + { + "epoch": 0.0618, + "grad_norm": 0.04136979579925537, + "learning_rate": 5.695340585783424e-06, + "loss": 0.0339, + "step": 158360 + }, + { + "epoch": 0.06185, + "grad_norm": 0.040849775075912476, + "learning_rate": 5.692714329015597e-06, + "loss": 0.0333, + "step": 158370 + }, + { + "epoch": 0.0619, + "grad_norm": 0.047011423856019974, + "learning_rate": 5.690088600101703e-06, + "loss": 0.034, + "step": 158380 + }, + { + "epoch": 0.06195, + "grad_norm": 0.04129017889499664, + "learning_rate": 5.68746339911353e-06, + "loss": 0.0357, + "step": 158390 + }, + { + "epoch": 0.062, + "grad_norm": 0.045220695436000824, + "learning_rate": 5.684838726122854e-06, + "loss": 0.0331, + "step": 158400 + }, + { + "epoch": 0.06205, + "grad_norm": 0.043388549238443375, + "learning_rate": 5.6822145812014285e-06, + "loss": 0.0341, + "step": 158410 + }, + { + "epoch": 0.0621, + "grad_norm": 0.04896579682826996, + "learning_rate": 5.6795909644210114e-06, + "loss": 0.0332, + "step": 158420 + }, + { + "epoch": 0.06215, + "grad_norm": 0.040435150265693665, + "learning_rate": 5.676967875853303e-06, + "loss": 0.0336, + "step": 158430 + }, + { + "epoch": 0.0622, + "grad_norm": 0.03940362483263016, + "learning_rate": 5.674345315570037e-06, + "loss": 0.0334, + "step": 158440 + }, + { + "epoch": 0.06225, + "grad_norm": 0.047289952635765076, + "learning_rate": 5.671723283642916e-06, + "loss": 0.0353, + "step": 158450 + }, + { + "epoch": 0.0623, + "grad_norm": 0.04400908201932907, + "learning_rate": 5.669101780143618e-06, + "loss": 0.0326, + "step": 158460 + }, + { + "epoch": 0.06235, + "grad_norm": 0.03765032812952995, + "learning_rate": 5.666480805143815e-06, + "loss": 0.0329, + "step": 158470 + }, + { + "epoch": 0.0624, + "grad_norm": 0.05119500309228897, + "learning_rate": 5.663860358715156e-06, + "loss": 0.0322, + "step": 158480 + }, + { + "epoch": 0.06245, + "grad_norm": 0.03913963586091995, + "learning_rate": 5.6612404409293e-06, + "loss": 0.0334, + "step": 158490 + }, + { + "epoch": 0.0625, + "grad_norm": 0.03637481853365898, + "learning_rate": 5.658621051857863e-06, + "loss": 0.0342, + "step": 158500 + }, + { + "epoch": 0.06255, + "grad_norm": 0.04477520287036896, + "learning_rate": 5.656002191572452e-06, + "loss": 0.0338, + "step": 158510 + }, + { + "epoch": 0.0626, + "grad_norm": 0.06019911915063858, + "learning_rate": 5.6533838601446845e-06, + "loss": 0.0331, + "step": 158520 + }, + { + "epoch": 0.06265, + "grad_norm": 0.04236951097846031, + "learning_rate": 5.650766057646123e-06, + "loss": 0.0328, + "step": 158530 + }, + { + "epoch": 0.0627, + "grad_norm": 0.05303535982966423, + "learning_rate": 5.648148784148358e-06, + "loss": 0.0345, + "step": 158540 + }, + { + "epoch": 0.06275, + "grad_norm": 0.043178990483284, + "learning_rate": 5.6455320397229336e-06, + "loss": 0.033, + "step": 158550 + }, + { + "epoch": 0.0628, + "grad_norm": 0.04464222490787506, + "learning_rate": 5.642915824441386e-06, + "loss": 0.0339, + "step": 158560 + }, + { + "epoch": 0.06285, + "grad_norm": 0.047780852764844894, + "learning_rate": 5.640300138375257e-06, + "loss": 0.0334, + "step": 158570 + }, + { + "epoch": 0.0629, + "grad_norm": 0.04555336758494377, + "learning_rate": 5.637684981596045e-06, + "loss": 0.0341, + "step": 158580 + }, + { + "epoch": 0.06295, + "grad_norm": 0.05541256442666054, + "learning_rate": 5.635070354175254e-06, + "loss": 0.034, + "step": 158590 + }, + { + "epoch": 0.063, + "grad_norm": 0.04075656086206436, + "learning_rate": 5.632456256184357e-06, + "loss": 0.0342, + "step": 158600 + }, + { + "epoch": 0.06305, + "grad_norm": 0.04417794942855835, + "learning_rate": 5.629842687694837e-06, + "loss": 0.0338, + "step": 158610 + }, + { + "epoch": 0.0631, + "grad_norm": 0.04109339043498039, + "learning_rate": 5.627229648778132e-06, + "loss": 0.0331, + "step": 158620 + }, + { + "epoch": 0.06315, + "grad_norm": 0.0533117949962616, + "learning_rate": 5.6246171395057e-06, + "loss": 0.0337, + "step": 158630 + }, + { + "epoch": 0.0632, + "grad_norm": 0.04230556637048721, + "learning_rate": 5.622005159948957e-06, + "loss": 0.0328, + "step": 158640 + }, + { + "epoch": 0.06325, + "grad_norm": 0.04738219827413559, + "learning_rate": 5.619393710179302e-06, + "loss": 0.0346, + "step": 158650 + }, + { + "epoch": 0.0633, + "grad_norm": 0.057871103286743164, + "learning_rate": 5.616782790268152e-06, + "loss": 0.0343, + "step": 158660 + }, + { + "epoch": 0.06335, + "grad_norm": 0.056884873658418655, + "learning_rate": 5.614172400286877e-06, + "loss": 0.0343, + "step": 158670 + }, + { + "epoch": 0.0634, + "grad_norm": 0.044703949242830276, + "learning_rate": 5.611562540306847e-06, + "loss": 0.0363, + "step": 158680 + }, + { + "epoch": 0.06345, + "grad_norm": 0.04357363283634186, + "learning_rate": 5.608953210399406e-06, + "loss": 0.0368, + "step": 158690 + }, + { + "epoch": 0.0635, + "grad_norm": 0.050504423677921295, + "learning_rate": 5.6063444106359e-06, + "loss": 0.0342, + "step": 158700 + }, + { + "epoch": 0.06355, + "grad_norm": 0.04960758984088898, + "learning_rate": 5.6037361410876645e-06, + "loss": 0.0331, + "step": 158710 + }, + { + "epoch": 0.0636, + "grad_norm": 0.04468027502298355, + "learning_rate": 5.601128401825984e-06, + "loss": 0.0335, + "step": 158720 + }, + { + "epoch": 0.06365, + "grad_norm": 0.045513805001974106, + "learning_rate": 5.598521192922171e-06, + "loss": 0.0335, + "step": 158730 + }, + { + "epoch": 0.0637, + "grad_norm": 0.04335379600524902, + "learning_rate": 5.595914514447493e-06, + "loss": 0.0328, + "step": 158740 + }, + { + "epoch": 0.06375, + "grad_norm": 0.04896742105484009, + "learning_rate": 5.593308366473227e-06, + "loss": 0.0331, + "step": 158750 + }, + { + "epoch": 0.0638, + "grad_norm": 0.04479243606328964, + "learning_rate": 5.5907027490706225e-06, + "loss": 0.0326, + "step": 158760 + }, + { + "epoch": 0.06385, + "grad_norm": 0.04396402835845947, + "learning_rate": 5.5880976623109036e-06, + "loss": 0.0327, + "step": 158770 + }, + { + "epoch": 0.0639, + "grad_norm": 0.043852224946022034, + "learning_rate": 5.5854931062653105e-06, + "loss": 0.033, + "step": 158780 + }, + { + "epoch": 0.06395, + "grad_norm": 0.043481361120939255, + "learning_rate": 5.582889081005044e-06, + "loss": 0.0325, + "step": 158790 + }, + { + "epoch": 0.064, + "grad_norm": 0.04225054010748863, + "learning_rate": 5.5802855866012915e-06, + "loss": 0.0322, + "step": 158800 + }, + { + "epoch": 0.06405, + "grad_norm": 0.03953962028026581, + "learning_rate": 5.577682623125233e-06, + "loss": 0.0316, + "step": 158810 + }, + { + "epoch": 0.0641, + "grad_norm": 0.03687303513288498, + "learning_rate": 5.57508019064803e-06, + "loss": 0.0313, + "step": 158820 + }, + { + "epoch": 0.06415, + "grad_norm": 0.04551881551742554, + "learning_rate": 5.572478289240849e-06, + "loss": 0.0343, + "step": 158830 + }, + { + "epoch": 0.0642, + "grad_norm": 0.05536499246954918, + "learning_rate": 5.569876918974809e-06, + "loss": 0.0339, + "step": 158840 + }, + { + "epoch": 0.06425, + "grad_norm": 0.049355749040842056, + "learning_rate": 5.567276079921036e-06, + "loss": 0.0326, + "step": 158850 + }, + { + "epoch": 0.0643, + "grad_norm": 0.04733234643936157, + "learning_rate": 5.564675772150626e-06, + "loss": 0.033, + "step": 158860 + }, + { + "epoch": 0.06435, + "grad_norm": 0.06434501707553864, + "learning_rate": 5.562075995734689e-06, + "loss": 0.0345, + "step": 158870 + }, + { + "epoch": 0.0644, + "grad_norm": 0.04427826777100563, + "learning_rate": 5.559476750744288e-06, + "loss": 0.0323, + "step": 158880 + }, + { + "epoch": 0.06445, + "grad_norm": 0.04419584572315216, + "learning_rate": 5.5568780372504845e-06, + "loss": 0.0329, + "step": 158890 + }, + { + "epoch": 0.0645, + "grad_norm": 0.05121339112520218, + "learning_rate": 5.554279855324337e-06, + "loss": 0.0328, + "step": 158900 + }, + { + "epoch": 0.06455, + "grad_norm": 0.05606408789753914, + "learning_rate": 5.551682205036867e-06, + "loss": 0.0343, + "step": 158910 + }, + { + "epoch": 0.0646, + "grad_norm": 0.051648642867803574, + "learning_rate": 5.549085086459113e-06, + "loss": 0.032, + "step": 158920 + }, + { + "epoch": 0.06465, + "grad_norm": 0.05112677812576294, + "learning_rate": 5.54648849966205e-06, + "loss": 0.0316, + "step": 158930 + }, + { + "epoch": 0.0647, + "grad_norm": 0.07420983165502548, + "learning_rate": 5.543892444716686e-06, + "loss": 0.0328, + "step": 158940 + }, + { + "epoch": 0.06475, + "grad_norm": 0.05548356845974922, + "learning_rate": 5.541296921693998e-06, + "loss": 0.0328, + "step": 158950 + }, + { + "epoch": 0.0648, + "grad_norm": 0.04403974860906601, + "learning_rate": 5.538701930664941e-06, + "loss": 0.0327, + "step": 158960 + }, + { + "epoch": 0.06485, + "grad_norm": 0.04643482714891434, + "learning_rate": 5.536107471700463e-06, + "loss": 0.0321, + "step": 158970 + }, + { + "epoch": 0.0649, + "grad_norm": 0.04126209765672684, + "learning_rate": 5.533513544871488e-06, + "loss": 0.0318, + "step": 158980 + }, + { + "epoch": 0.06495, + "grad_norm": 0.04286893084645271, + "learning_rate": 5.5309201502489475e-06, + "loss": 0.0321, + "step": 158990 + }, + { + "epoch": 0.065, + "grad_norm": 0.05262665078043938, + "learning_rate": 5.528327287903734e-06, + "loss": 0.0329, + "step": 159000 + }, + { + "epoch": 0.06505, + "grad_norm": 0.051432929933071136, + "learning_rate": 5.525734957906731e-06, + "loss": 0.0332, + "step": 159010 + }, + { + "epoch": 0.0651, + "grad_norm": 0.06794150918722153, + "learning_rate": 5.523143160328823e-06, + "loss": 0.0355, + "step": 159020 + }, + { + "epoch": 0.06515, + "grad_norm": 0.06131485849618912, + "learning_rate": 5.520551895240858e-06, + "loss": 0.0338, + "step": 159030 + }, + { + "epoch": 0.0652, + "grad_norm": 0.09646933525800705, + "learning_rate": 5.517961162713695e-06, + "loss": 0.0347, + "step": 159040 + }, + { + "epoch": 0.06525, + "grad_norm": 0.052614159882068634, + "learning_rate": 5.5153709628181534e-06, + "loss": 0.0328, + "step": 159050 + }, + { + "epoch": 0.0653, + "grad_norm": 0.0494559109210968, + "learning_rate": 5.51278129562505e-06, + "loss": 0.0336, + "step": 159060 + }, + { + "epoch": 0.06535, + "grad_norm": 0.04907679557800293, + "learning_rate": 5.510192161205177e-06, + "loss": 0.0345, + "step": 159070 + }, + { + "epoch": 0.0654, + "grad_norm": 0.0441729798913002, + "learning_rate": 5.507603559629337e-06, + "loss": 0.0333, + "step": 159080 + }, + { + "epoch": 0.06545, + "grad_norm": 0.05236222967505455, + "learning_rate": 5.505015490968291e-06, + "loss": 0.0328, + "step": 159090 + }, + { + "epoch": 0.0655, + "grad_norm": 0.038976993411779404, + "learning_rate": 5.502427955292791e-06, + "loss": 0.0338, + "step": 159100 + }, + { + "epoch": 0.06555, + "grad_norm": 0.040054868906736374, + "learning_rate": 5.499840952673593e-06, + "loss": 0.0332, + "step": 159110 + }, + { + "epoch": 0.0656, + "grad_norm": 0.0409533828496933, + "learning_rate": 5.497254483181413e-06, + "loss": 0.0335, + "step": 159120 + }, + { + "epoch": 0.06565, + "grad_norm": 0.051507771015167236, + "learning_rate": 5.4946685468869715e-06, + "loss": 0.034, + "step": 159130 + }, + { + "epoch": 0.0657, + "grad_norm": 0.05494391545653343, + "learning_rate": 5.492083143860966e-06, + "loss": 0.0344, + "step": 159140 + }, + { + "epoch": 0.06575, + "grad_norm": 0.039874982088804245, + "learning_rate": 5.489498274174071e-06, + "loss": 0.0329, + "step": 159150 + }, + { + "epoch": 0.0658, + "grad_norm": 0.04216877371072769, + "learning_rate": 5.486913937896973e-06, + "loss": 0.0344, + "step": 159160 + }, + { + "epoch": 0.06585, + "grad_norm": 0.04714996740221977, + "learning_rate": 5.484330135100313e-06, + "loss": 0.0337, + "step": 159170 + }, + { + "epoch": 0.0659, + "grad_norm": 0.04491034150123596, + "learning_rate": 5.48174686585474e-06, + "loss": 0.0338, + "step": 159180 + }, + { + "epoch": 0.06595, + "grad_norm": 0.04163685441017151, + "learning_rate": 5.479164130230862e-06, + "loss": 0.0338, + "step": 159190 + }, + { + "epoch": 0.066, + "grad_norm": 0.04118207469582558, + "learning_rate": 5.47658192829931e-06, + "loss": 0.0337, + "step": 159200 + }, + { + "epoch": 0.06605, + "grad_norm": 0.04150707647204399, + "learning_rate": 5.474000260130682e-06, + "loss": 0.0332, + "step": 159210 + }, + { + "epoch": 0.0661, + "grad_norm": 0.041581131517887115, + "learning_rate": 5.471419125795541e-06, + "loss": 0.0335, + "step": 159220 + }, + { + "epoch": 0.06615, + "grad_norm": 0.044462401419878006, + "learning_rate": 5.46883852536447e-06, + "loss": 0.0326, + "step": 159230 + }, + { + "epoch": 0.0662, + "grad_norm": 0.04261643439531326, + "learning_rate": 5.466258458908008e-06, + "loss": 0.0332, + "step": 159240 + }, + { + "epoch": 0.06625, + "grad_norm": 0.046739377081394196, + "learning_rate": 5.46367892649671e-06, + "loss": 0.0334, + "step": 159250 + }, + { + "epoch": 0.0663, + "grad_norm": 0.036309320479631424, + "learning_rate": 5.461099928201088e-06, + "loss": 0.0325, + "step": 159260 + }, + { + "epoch": 0.06635, + "grad_norm": 0.04956991970539093, + "learning_rate": 5.458521464091648e-06, + "loss": 0.0333, + "step": 159270 + }, + { + "epoch": 0.0664, + "grad_norm": 0.042923565953969955, + "learning_rate": 5.4559435342389e-06, + "loss": 0.0324, + "step": 159280 + }, + { + "epoch": 0.06645, + "grad_norm": 0.040057938545942307, + "learning_rate": 5.453366138713309e-06, + "loss": 0.0327, + "step": 159290 + }, + { + "epoch": 0.0665, + "grad_norm": 0.03960050642490387, + "learning_rate": 5.450789277585347e-06, + "loss": 0.0331, + "step": 159300 + }, + { + "epoch": 0.06655, + "grad_norm": 0.04826389253139496, + "learning_rate": 5.448212950925455e-06, + "loss": 0.0332, + "step": 159310 + }, + { + "epoch": 0.0666, + "grad_norm": 0.03730017691850662, + "learning_rate": 5.445637158804082e-06, + "loss": 0.0331, + "step": 159320 + }, + { + "epoch": 0.06665, + "grad_norm": 0.03824429586529732, + "learning_rate": 5.443061901291635e-06, + "loss": 0.0332, + "step": 159330 + }, + { + "epoch": 0.0667, + "grad_norm": 0.04533061012625694, + "learning_rate": 5.440487178458533e-06, + "loss": 0.0335, + "step": 159340 + }, + { + "epoch": 0.06675, + "grad_norm": 0.0460333377122879, + "learning_rate": 5.437912990375169e-06, + "loss": 0.034, + "step": 159350 + }, + { + "epoch": 0.0668, + "grad_norm": 0.04362259805202484, + "learning_rate": 5.435339337111905e-06, + "loss": 0.034, + "step": 159360 + }, + { + "epoch": 0.06685, + "grad_norm": 0.04969523474574089, + "learning_rate": 5.432766218739118e-06, + "loss": 0.0331, + "step": 159370 + }, + { + "epoch": 0.0669, + "grad_norm": 0.04067446291446686, + "learning_rate": 5.430193635327155e-06, + "loss": 0.0322, + "step": 159380 + }, + { + "epoch": 0.06695, + "grad_norm": 0.04759836196899414, + "learning_rate": 5.427621586946338e-06, + "loss": 0.034, + "step": 159390 + }, + { + "epoch": 0.067, + "grad_norm": 0.04831007868051529, + "learning_rate": 5.425050073667002e-06, + "loss": 0.033, + "step": 159400 + }, + { + "epoch": 0.06705, + "grad_norm": 0.04356855899095535, + "learning_rate": 5.422479095559435e-06, + "loss": 0.033, + "step": 159410 + }, + { + "epoch": 0.0671, + "grad_norm": 0.03892563655972481, + "learning_rate": 5.419908652693947e-06, + "loss": 0.0338, + "step": 159420 + }, + { + "epoch": 0.06715, + "grad_norm": 0.05179993808269501, + "learning_rate": 5.417338745140788e-06, + "loss": 0.0343, + "step": 159430 + }, + { + "epoch": 0.0672, + "grad_norm": 0.041009191423654556, + "learning_rate": 5.414769372970238e-06, + "loss": 0.0319, + "step": 159440 + }, + { + "epoch": 0.06725, + "grad_norm": 0.04942528158426285, + "learning_rate": 5.412200536252529e-06, + "loss": 0.0351, + "step": 159450 + }, + { + "epoch": 0.0673, + "grad_norm": 0.0511196106672287, + "learning_rate": 5.409632235057904e-06, + "loss": 0.0343, + "step": 159460 + }, + { + "epoch": 0.06735, + "grad_norm": 0.04779931902885437, + "learning_rate": 5.4070644694565745e-06, + "loss": 0.0322, + "step": 159470 + }, + { + "epoch": 0.0674, + "grad_norm": 0.040881089866161346, + "learning_rate": 5.404497239518735e-06, + "loss": 0.0328, + "step": 159480 + }, + { + "epoch": 0.06745, + "grad_norm": 0.04905702546238899, + "learning_rate": 5.401930545314587e-06, + "loss": 0.0342, + "step": 159490 + }, + { + "epoch": 0.0675, + "grad_norm": 0.04856644570827484, + "learning_rate": 5.399364386914294e-06, + "loss": 0.0331, + "step": 159500 + }, + { + "epoch": 0.06755, + "grad_norm": 0.04014168679714203, + "learning_rate": 5.396798764388017e-06, + "loss": 0.033, + "step": 159510 + }, + { + "epoch": 0.0676, + "grad_norm": 0.036229848861694336, + "learning_rate": 5.3942336778058875e-06, + "loss": 0.0323, + "step": 159520 + }, + { + "epoch": 0.06765, + "grad_norm": 0.05225673317909241, + "learning_rate": 5.391669127238044e-06, + "loss": 0.0342, + "step": 159530 + }, + { + "epoch": 0.0677, + "grad_norm": 0.0477435439825058, + "learning_rate": 5.389105112754609e-06, + "loss": 0.0331, + "step": 159540 + }, + { + "epoch": 0.06775, + "grad_norm": 0.04617225006222725, + "learning_rate": 5.3865416344256705e-06, + "loss": 0.0336, + "step": 159550 + }, + { + "epoch": 0.0678, + "grad_norm": 0.05308155715465546, + "learning_rate": 5.3839786923213175e-06, + "loss": 0.0374, + "step": 159560 + }, + { + "epoch": 0.06785, + "grad_norm": 0.03811059147119522, + "learning_rate": 5.3814162865116094e-06, + "loss": 0.0335, + "step": 159570 + }, + { + "epoch": 0.0679, + "grad_norm": 0.04400903731584549, + "learning_rate": 5.378854417066612e-06, + "loss": 0.0346, + "step": 159580 + }, + { + "epoch": 0.06795, + "grad_norm": 0.0434289425611496, + "learning_rate": 5.376293084056375e-06, + "loss": 0.0348, + "step": 159590 + }, + { + "epoch": 0.068, + "grad_norm": 0.04518948495388031, + "learning_rate": 5.373732287550897e-06, + "loss": 0.0364, + "step": 159600 + }, + { + "epoch": 0.06805, + "grad_norm": 0.04921133071184158, + "learning_rate": 5.371172027620213e-06, + "loss": 0.0354, + "step": 159610 + }, + { + "epoch": 0.0681, + "grad_norm": 0.044258587062358856, + "learning_rate": 5.368612304334308e-06, + "loss": 0.034, + "step": 159620 + }, + { + "epoch": 0.06815, + "grad_norm": 0.03922766447067261, + "learning_rate": 5.366053117763179e-06, + "loss": 0.0336, + "step": 159630 + }, + { + "epoch": 0.0682, + "grad_norm": 0.04011528939008713, + "learning_rate": 5.363494467976768e-06, + "loss": 0.0344, + "step": 159640 + }, + { + "epoch": 0.06825, + "grad_norm": 0.04395853728055954, + "learning_rate": 5.360936355045041e-06, + "loss": 0.033, + "step": 159650 + }, + { + "epoch": 0.0683, + "grad_norm": 0.04406457394361496, + "learning_rate": 5.3583787790379424e-06, + "loss": 0.0346, + "step": 159660 + }, + { + "epoch": 0.06835, + "grad_norm": 0.041080668568611145, + "learning_rate": 5.355821740025391e-06, + "loss": 0.0343, + "step": 159670 + }, + { + "epoch": 0.0684, + "grad_norm": 0.04846780747175217, + "learning_rate": 5.3532652380772904e-06, + "loss": 0.0346, + "step": 159680 + }, + { + "epoch": 0.06845, + "grad_norm": 0.05273908004164696, + "learning_rate": 5.350709273263533e-06, + "loss": 0.0334, + "step": 159690 + }, + { + "epoch": 0.0685, + "grad_norm": 0.04335326701402664, + "learning_rate": 5.348153845654008e-06, + "loss": 0.0331, + "step": 159700 + }, + { + "epoch": 0.06855, + "grad_norm": 0.049266718327999115, + "learning_rate": 5.345598955318565e-06, + "loss": 0.0328, + "step": 159710 + }, + { + "epoch": 0.0686, + "grad_norm": 0.07993786036968231, + "learning_rate": 5.343044602327072e-06, + "loss": 0.033, + "step": 159720 + }, + { + "epoch": 0.06865, + "grad_norm": 0.05523889511823654, + "learning_rate": 5.340490786749355e-06, + "loss": 0.0325, + "step": 159730 + }, + { + "epoch": 0.0687, + "grad_norm": 0.046398960053920746, + "learning_rate": 5.337937508655228e-06, + "loss": 0.0335, + "step": 159740 + }, + { + "epoch": 0.06875, + "grad_norm": 0.05909277871251106, + "learning_rate": 5.3353847681145066e-06, + "loss": 0.0319, + "step": 159750 + }, + { + "epoch": 0.0688, + "grad_norm": 0.05883355066180229, + "learning_rate": 5.3328325651969795e-06, + "loss": 0.0329, + "step": 159760 + }, + { + "epoch": 0.06885, + "grad_norm": 0.044400133192539215, + "learning_rate": 5.330280899972415e-06, + "loss": 0.0325, + "step": 159770 + }, + { + "epoch": 0.0689, + "grad_norm": 0.050274547189474106, + "learning_rate": 5.327729772510587e-06, + "loss": 0.0328, + "step": 159780 + }, + { + "epoch": 0.06895, + "grad_norm": 0.06526787579059601, + "learning_rate": 5.325179182881232e-06, + "loss": 0.0331, + "step": 159790 + }, + { + "epoch": 0.069, + "grad_norm": 0.05052199214696884, + "learning_rate": 5.322629131154097e-06, + "loss": 0.0333, + "step": 159800 + }, + { + "epoch": 0.06905, + "grad_norm": 0.04053528234362602, + "learning_rate": 5.320079617398879e-06, + "loss": 0.033, + "step": 159810 + }, + { + "epoch": 0.0691, + "grad_norm": 0.05289068445563316, + "learning_rate": 5.3175306416852945e-06, + "loss": 0.0334, + "step": 159820 + }, + { + "epoch": 0.06915, + "grad_norm": 0.037741102278232574, + "learning_rate": 5.314982204083025e-06, + "loss": 0.032, + "step": 159830 + }, + { + "epoch": 0.0692, + "grad_norm": 0.03996310755610466, + "learning_rate": 5.312434304661748e-06, + "loss": 0.0335, + "step": 159840 + }, + { + "epoch": 0.06925, + "grad_norm": 0.03911514952778816, + "learning_rate": 5.3098869434911245e-06, + "loss": 0.0324, + "step": 159850 + }, + { + "epoch": 0.0693, + "grad_norm": 0.04355086758732796, + "learning_rate": 5.307340120640789e-06, + "loss": 0.032, + "step": 159860 + }, + { + "epoch": 0.06935, + "grad_norm": 0.038841210305690765, + "learning_rate": 5.3047938361803804e-06, + "loss": 0.0339, + "step": 159870 + }, + { + "epoch": 0.0694, + "grad_norm": 0.04024119675159454, + "learning_rate": 5.3022480901795096e-06, + "loss": 0.0321, + "step": 159880 + }, + { + "epoch": 0.06945, + "grad_norm": 0.05228933319449425, + "learning_rate": 5.299702882707777e-06, + "loss": 0.0341, + "step": 159890 + }, + { + "epoch": 0.0695, + "grad_norm": 0.052483391016721725, + "learning_rate": 5.29715821383476e-06, + "loss": 0.0331, + "step": 159900 + }, + { + "epoch": 0.06955, + "grad_norm": 0.042383596301078796, + "learning_rate": 5.294614083630034e-06, + "loss": 0.0333, + "step": 159910 + }, + { + "epoch": 0.0696, + "grad_norm": 0.04163794964551926, + "learning_rate": 5.292070492163165e-06, + "loss": 0.0328, + "step": 159920 + }, + { + "epoch": 0.06965, + "grad_norm": 0.04140407219529152, + "learning_rate": 5.289527439503683e-06, + "loss": 0.032, + "step": 159930 + }, + { + "epoch": 0.0697, + "grad_norm": 0.04909062013030052, + "learning_rate": 5.286984925721117e-06, + "loss": 0.0339, + "step": 159940 + }, + { + "epoch": 0.06975, + "grad_norm": 0.040237706154584885, + "learning_rate": 5.284442950884969e-06, + "loss": 0.0319, + "step": 159950 + }, + { + "epoch": 0.0698, + "grad_norm": 0.0394199937582016, + "learning_rate": 5.28190151506475e-06, + "loss": 0.0324, + "step": 159960 + }, + { + "epoch": 0.06985, + "grad_norm": 0.040862519294023514, + "learning_rate": 5.279360618329937e-06, + "loss": 0.0329, + "step": 159970 + }, + { + "epoch": 0.0699, + "grad_norm": 0.03927301615476608, + "learning_rate": 5.27682026074999e-06, + "loss": 0.0317, + "step": 159980 + }, + { + "epoch": 0.06995, + "grad_norm": 0.042789362370967865, + "learning_rate": 5.274280442394375e-06, + "loss": 0.0327, + "step": 159990 + }, + { + "epoch": 0.07, + "grad_norm": 0.042050816118717194, + "learning_rate": 5.271741163332514e-06, + "loss": 0.0324, + "step": 160000 + }, + { + "epoch": 0.07005, + "grad_norm": 0.04403001442551613, + "learning_rate": 5.269202423633851e-06, + "loss": 0.0322, + "step": 160010 + }, + { + "epoch": 0.0701, + "grad_norm": 0.03922681510448456, + "learning_rate": 5.2666642233677676e-06, + "loss": 0.032, + "step": 160020 + }, + { + "epoch": 0.07015, + "grad_norm": 0.044261377304792404, + "learning_rate": 5.264126562603672e-06, + "loss": 0.033, + "step": 160030 + }, + { + "epoch": 0.0702, + "grad_norm": 0.03984180465340614, + "learning_rate": 5.26158944141095e-06, + "loss": 0.0327, + "step": 160040 + }, + { + "epoch": 0.07025, + "grad_norm": 0.04177449643611908, + "learning_rate": 5.259052859858954e-06, + "loss": 0.0331, + "step": 160050 + }, + { + "epoch": 0.0703, + "grad_norm": 0.04058413580060005, + "learning_rate": 5.2565168180170374e-06, + "loss": 0.0329, + "step": 160060 + }, + { + "epoch": 0.07035, + "grad_norm": 0.04518529027700424, + "learning_rate": 5.253981315954528e-06, + "loss": 0.0328, + "step": 160070 + }, + { + "epoch": 0.0704, + "grad_norm": 0.04935334622859955, + "learning_rate": 5.2514463537407576e-06, + "loss": 0.0318, + "step": 160080 + }, + { + "epoch": 0.07045, + "grad_norm": 0.04204292595386505, + "learning_rate": 5.248911931445024e-06, + "loss": 0.032, + "step": 160090 + }, + { + "epoch": 0.0705, + "grad_norm": 0.040166597813367844, + "learning_rate": 5.24637804913661e-06, + "loss": 0.0312, + "step": 160100 + }, + { + "epoch": 0.07055, + "grad_norm": 0.06826286762952805, + "learning_rate": 5.24384470688481e-06, + "loss": 0.0332, + "step": 160110 + }, + { + "epoch": 0.0706, + "grad_norm": 0.041277870535850525, + "learning_rate": 5.241311904758864e-06, + "loss": 0.0321, + "step": 160120 + }, + { + "epoch": 0.07065, + "grad_norm": 0.04410373792052269, + "learning_rate": 5.238779642828034e-06, + "loss": 0.0341, + "step": 160130 + }, + { + "epoch": 0.0707, + "grad_norm": 0.046558964997529984, + "learning_rate": 5.2362479211615466e-06, + "loss": 0.0325, + "step": 160140 + }, + { + "epoch": 0.07075, + "grad_norm": 0.03963978961110115, + "learning_rate": 5.233716739828606e-06, + "loss": 0.0327, + "step": 160150 + }, + { + "epoch": 0.0708, + "grad_norm": 0.03820747137069702, + "learning_rate": 5.231186098898433e-06, + "loss": 0.0319, + "step": 160160 + }, + { + "epoch": 0.07085, + "grad_norm": 0.03716480731964111, + "learning_rate": 5.2286559984402075e-06, + "loss": 0.0323, + "step": 160170 + }, + { + "epoch": 0.0709, + "grad_norm": 0.04506603255867958, + "learning_rate": 5.2261264385230964e-06, + "loss": 0.0328, + "step": 160180 + }, + { + "epoch": 0.07095, + "grad_norm": 0.05314488336443901, + "learning_rate": 5.223597419216253e-06, + "loss": 0.0323, + "step": 160190 + }, + { + "epoch": 0.071, + "grad_norm": 0.043797120451927185, + "learning_rate": 5.221068940588833e-06, + "loss": 0.0339, + "step": 160200 + }, + { + "epoch": 0.07105, + "grad_norm": 0.03779615834355354, + "learning_rate": 5.21854100270995e-06, + "loss": 0.0319, + "step": 160210 + }, + { + "epoch": 0.0711, + "grad_norm": 0.03833272308111191, + "learning_rate": 5.216013605648734e-06, + "loss": 0.0319, + "step": 160220 + }, + { + "epoch": 0.07115, + "grad_norm": 0.04615391418337822, + "learning_rate": 5.213486749474273e-06, + "loss": 0.0327, + "step": 160230 + }, + { + "epoch": 0.0712, + "grad_norm": 0.04216158762574196, + "learning_rate": 5.210960434255643e-06, + "loss": 0.0328, + "step": 160240 + }, + { + "epoch": 0.07125, + "grad_norm": 0.039154306054115295, + "learning_rate": 5.208434660061928e-06, + "loss": 0.0327, + "step": 160250 + }, + { + "epoch": 0.0713, + "grad_norm": 0.04261749982833862, + "learning_rate": 5.2059094269621715e-06, + "loss": 0.0336, + "step": 160260 + }, + { + "epoch": 0.07135, + "grad_norm": 0.04285505414009094, + "learning_rate": 5.203384735025418e-06, + "loss": 0.0342, + "step": 160270 + }, + { + "epoch": 0.0714, + "grad_norm": 0.04513030871748924, + "learning_rate": 5.20086058432068e-06, + "loss": 0.0327, + "step": 160280 + }, + { + "epoch": 0.07145, + "grad_norm": 0.042512886226177216, + "learning_rate": 5.198336974916976e-06, + "loss": 0.0339, + "step": 160290 + }, + { + "epoch": 0.0715, + "grad_norm": 0.038196902722120285, + "learning_rate": 5.195813906883315e-06, + "loss": 0.0348, + "step": 160300 + }, + { + "epoch": 0.07155, + "grad_norm": 0.040633756667375565, + "learning_rate": 5.193291380288648e-06, + "loss": 0.0332, + "step": 160310 + }, + { + "epoch": 0.0716, + "grad_norm": 0.03849083557724953, + "learning_rate": 5.1907693952019585e-06, + "loss": 0.0336, + "step": 160320 + }, + { + "epoch": 0.07165, + "grad_norm": 0.04616985470056534, + "learning_rate": 5.188247951692185e-06, + "loss": 0.0335, + "step": 160330 + }, + { + "epoch": 0.0717, + "grad_norm": 0.04377584904432297, + "learning_rate": 5.185727049828276e-06, + "loss": 0.0338, + "step": 160340 + }, + { + "epoch": 0.07175, + "grad_norm": 0.04809568449854851, + "learning_rate": 5.183206689679148e-06, + "loss": 0.0323, + "step": 160350 + }, + { + "epoch": 0.0718, + "grad_norm": 0.04436049982905388, + "learning_rate": 5.180686871313695e-06, + "loss": 0.0328, + "step": 160360 + }, + { + "epoch": 0.07185, + "grad_norm": 0.041805848479270935, + "learning_rate": 5.178167594800825e-06, + "loss": 0.0324, + "step": 160370 + }, + { + "epoch": 0.0719, + "grad_norm": 0.03261794522404671, + "learning_rate": 5.175648860209406e-06, + "loss": 0.0331, + "step": 160380 + }, + { + "epoch": 0.07195, + "grad_norm": 0.03945595771074295, + "learning_rate": 5.1731306676083e-06, + "loss": 0.0337, + "step": 160390 + }, + { + "epoch": 0.072, + "grad_norm": 0.044130872935056686, + "learning_rate": 5.1706130170663474e-06, + "loss": 0.0331, + "step": 160400 + }, + { + "epoch": 0.07205, + "grad_norm": 0.04020753875374794, + "learning_rate": 5.1680959086523845e-06, + "loss": 0.0353, + "step": 160410 + }, + { + "epoch": 0.0721, + "grad_norm": 0.03573004901409149, + "learning_rate": 5.165579342435234e-06, + "loss": 0.0323, + "step": 160420 + }, + { + "epoch": 0.07215, + "grad_norm": 0.05447396636009216, + "learning_rate": 5.163063318483694e-06, + "loss": 0.0343, + "step": 160430 + }, + { + "epoch": 0.0722, + "grad_norm": 0.041657350957393646, + "learning_rate": 5.16054783686655e-06, + "loss": 0.0323, + "step": 160440 + }, + { + "epoch": 0.07225, + "grad_norm": 0.053743381053209305, + "learning_rate": 5.15803289765257e-06, + "loss": 0.0342, + "step": 160450 + }, + { + "epoch": 0.0723, + "grad_norm": 0.045378465205430984, + "learning_rate": 5.155518500910522e-06, + "loss": 0.0334, + "step": 160460 + }, + { + "epoch": 0.07235, + "grad_norm": 0.04740902781486511, + "learning_rate": 5.153004646709142e-06, + "loss": 0.0344, + "step": 160470 + }, + { + "epoch": 0.0724, + "grad_norm": 0.04553137347102165, + "learning_rate": 5.150491335117153e-06, + "loss": 0.0333, + "step": 160480 + }, + { + "epoch": 0.07245, + "grad_norm": 0.03902588039636612, + "learning_rate": 5.1479785662032795e-06, + "loss": 0.0341, + "step": 160490 + }, + { + "epoch": 0.0725, + "grad_norm": 0.04232773184776306, + "learning_rate": 5.145466340036206e-06, + "loss": 0.0339, + "step": 160500 + }, + { + "epoch": 0.07255, + "grad_norm": 0.039230186492204666, + "learning_rate": 5.14295465668464e-06, + "loss": 0.0346, + "step": 160510 + }, + { + "epoch": 0.0726, + "grad_norm": 0.04060075804591179, + "learning_rate": 5.14044351621722e-06, + "loss": 0.0353, + "step": 160520 + }, + { + "epoch": 0.07265, + "grad_norm": 0.03841876611113548, + "learning_rate": 5.13793291870262e-06, + "loss": 0.0332, + "step": 160530 + }, + { + "epoch": 0.0727, + "grad_norm": 0.04524306207895279, + "learning_rate": 5.1354228642094635e-06, + "loss": 0.0336, + "step": 160540 + }, + { + "epoch": 0.07275, + "grad_norm": 0.040125150233507156, + "learning_rate": 5.132913352806393e-06, + "loss": 0.0342, + "step": 160550 + }, + { + "epoch": 0.0728, + "grad_norm": 0.051557350903749466, + "learning_rate": 5.1304043845620045e-06, + "loss": 0.0344, + "step": 160560 + }, + { + "epoch": 0.07285, + "grad_norm": 0.04546048492193222, + "learning_rate": 5.12789595954489e-06, + "loss": 0.0338, + "step": 160570 + }, + { + "epoch": 0.0729, + "grad_norm": 0.04418913647532463, + "learning_rate": 5.125388077823642e-06, + "loss": 0.0341, + "step": 160580 + }, + { + "epoch": 0.07295, + "grad_norm": 0.04851749911904335, + "learning_rate": 5.122880739466818e-06, + "loss": 0.0345, + "step": 160590 + }, + { + "epoch": 0.073, + "grad_norm": 0.047638460993766785, + "learning_rate": 5.120373944542958e-06, + "loss": 0.0329, + "step": 160600 + }, + { + "epoch": 0.07305, + "grad_norm": 0.0418444387614727, + "learning_rate": 5.117867693120612e-06, + "loss": 0.033, + "step": 160610 + }, + { + "epoch": 0.0731, + "grad_norm": 0.03957684710621834, + "learning_rate": 5.115361985268291e-06, + "loss": 0.0333, + "step": 160620 + }, + { + "epoch": 0.07315, + "grad_norm": 0.04226352646946907, + "learning_rate": 5.112856821054507e-06, + "loss": 0.0331, + "step": 160630 + }, + { + "epoch": 0.0732, + "grad_norm": 0.0388200469315052, + "learning_rate": 5.110352200547747e-06, + "loss": 0.0327, + "step": 160640 + }, + { + "epoch": 0.07325, + "grad_norm": 0.053701531141996384, + "learning_rate": 5.107848123816486e-06, + "loss": 0.034, + "step": 160650 + }, + { + "epoch": 0.0733, + "grad_norm": 0.05368814989924431, + "learning_rate": 5.105344590929176e-06, + "loss": 0.0372, + "step": 160660 + }, + { + "epoch": 0.07335, + "grad_norm": 0.05174095556139946, + "learning_rate": 5.102841601954278e-06, + "loss": 0.0325, + "step": 160670 + }, + { + "epoch": 0.0734, + "grad_norm": 0.041240040212869644, + "learning_rate": 5.100339156960218e-06, + "loss": 0.0335, + "step": 160680 + }, + { + "epoch": 0.07345, + "grad_norm": 0.05635536462068558, + "learning_rate": 5.0978372560154e-06, + "loss": 0.0335, + "step": 160690 + }, + { + "epoch": 0.0735, + "grad_norm": 0.04730790853500366, + "learning_rate": 5.095335899188241e-06, + "loss": 0.0334, + "step": 160700 + }, + { + "epoch": 0.07355, + "grad_norm": 0.043954696506261826, + "learning_rate": 5.092835086547115e-06, + "loss": 0.0338, + "step": 160710 + }, + { + "epoch": 0.0736, + "grad_norm": 0.0432947538793087, + "learning_rate": 5.090334818160414e-06, + "loss": 0.0329, + "step": 160720 + }, + { + "epoch": 0.07365, + "grad_norm": 0.05044640228152275, + "learning_rate": 5.087835094096463e-06, + "loss": 0.034, + "step": 160730 + }, + { + "epoch": 0.0737, + "grad_norm": 0.04494147375226021, + "learning_rate": 5.085335914423622e-06, + "loss": 0.0332, + "step": 160740 + }, + { + "epoch": 0.07375, + "grad_norm": 0.04129369556903839, + "learning_rate": 5.08283727921022e-06, + "loss": 0.0324, + "step": 160750 + }, + { + "epoch": 0.0738, + "grad_norm": 0.03822697326540947, + "learning_rate": 5.080339188524566e-06, + "loss": 0.0311, + "step": 160760 + }, + { + "epoch": 0.07385, + "grad_norm": 0.053277526050806046, + "learning_rate": 5.077841642434955e-06, + "loss": 0.0333, + "step": 160770 + }, + { + "epoch": 0.0739, + "grad_norm": 0.054511070251464844, + "learning_rate": 5.075344641009663e-06, + "loss": 0.0319, + "step": 160780 + }, + { + "epoch": 0.07395, + "grad_norm": 0.04316005855798721, + "learning_rate": 5.072848184316964e-06, + "loss": 0.0328, + "step": 160790 + }, + { + "epoch": 0.074, + "grad_norm": 0.041936952620744705, + "learning_rate": 5.070352272425119e-06, + "loss": 0.0329, + "step": 160800 + }, + { + "epoch": 0.07405, + "grad_norm": 0.04728446528315544, + "learning_rate": 5.067856905402346e-06, + "loss": 0.0341, + "step": 160810 + }, + { + "epoch": 0.0741, + "grad_norm": 0.04272538423538208, + "learning_rate": 5.065362083316882e-06, + "loss": 0.0324, + "step": 160820 + }, + { + "epoch": 0.07415, + "grad_norm": 0.042639367282390594, + "learning_rate": 5.062867806236923e-06, + "loss": 0.0333, + "step": 160830 + }, + { + "epoch": 0.0742, + "grad_norm": 0.03823878616094589, + "learning_rate": 5.0603740742306755e-06, + "loss": 0.0322, + "step": 160840 + }, + { + "epoch": 0.07425, + "grad_norm": 0.0475417897105217, + "learning_rate": 5.057880887366309e-06, + "loss": 0.0345, + "step": 160850 + }, + { + "epoch": 0.0743, + "grad_norm": 0.04159359261393547, + "learning_rate": 5.055388245711978e-06, + "loss": 0.036, + "step": 160860 + }, + { + "epoch": 0.07435, + "grad_norm": 0.044668398797512054, + "learning_rate": 5.052896149335851e-06, + "loss": 0.0321, + "step": 160870 + }, + { + "epoch": 0.0744, + "grad_norm": 0.03785253316164017, + "learning_rate": 5.0504045983060465e-06, + "loss": 0.0326, + "step": 160880 + }, + { + "epoch": 0.07445, + "grad_norm": 0.044448185712099075, + "learning_rate": 5.0479135926906865e-06, + "loss": 0.0319, + "step": 160890 + }, + { + "epoch": 0.0745, + "grad_norm": 0.04127410799264908, + "learning_rate": 5.0454231325578666e-06, + "loss": 0.0325, + "step": 160900 + }, + { + "epoch": 0.07455, + "grad_norm": 0.04340953379869461, + "learning_rate": 5.042933217975687e-06, + "loss": 0.0328, + "step": 160910 + }, + { + "epoch": 0.0746, + "grad_norm": 0.0437263585627079, + "learning_rate": 5.040443849012211e-06, + "loss": 0.0327, + "step": 160920 + }, + { + "epoch": 0.07465, + "grad_norm": 0.07725926488637924, + "learning_rate": 5.037955025735508e-06, + "loss": 0.036, + "step": 160930 + }, + { + "epoch": 0.0747, + "grad_norm": 0.03931087255477905, + "learning_rate": 5.035466748213616e-06, + "loss": 0.033, + "step": 160940 + }, + { + "epoch": 0.07475, + "grad_norm": 0.04674782231450081, + "learning_rate": 5.032979016514555e-06, + "loss": 0.0341, + "step": 160950 + }, + { + "epoch": 0.0748, + "grad_norm": 0.0424048937857151, + "learning_rate": 5.030491830706352e-06, + "loss": 0.0331, + "step": 160960 + }, + { + "epoch": 0.07485, + "grad_norm": 0.04696732386946678, + "learning_rate": 5.028005190857002e-06, + "loss": 0.0326, + "step": 160970 + }, + { + "epoch": 0.0749, + "grad_norm": 0.03869684040546417, + "learning_rate": 5.025519097034478e-06, + "loss": 0.034, + "step": 160980 + }, + { + "epoch": 0.07495, + "grad_norm": 0.04522455111145973, + "learning_rate": 5.023033549306766e-06, + "loss": 0.0329, + "step": 160990 + }, + { + "epoch": 0.075, + "grad_norm": 0.04113109037280083, + "learning_rate": 5.020548547741805e-06, + "loss": 0.0335, + "step": 161000 + }, + { + "epoch": 0.07505, + "grad_norm": 0.0423266626894474, + "learning_rate": 5.018064092407554e-06, + "loss": 0.0338, + "step": 161010 + }, + { + "epoch": 0.0751, + "grad_norm": 0.05098946765065193, + "learning_rate": 5.015580183371912e-06, + "loss": 0.0346, + "step": 161020 + }, + { + "epoch": 0.07515, + "grad_norm": 0.04282857105135918, + "learning_rate": 5.013096820702804e-06, + "loss": 0.0341, + "step": 161030 + }, + { + "epoch": 0.0752, + "grad_norm": 0.03867142274975777, + "learning_rate": 5.010614004468114e-06, + "loss": 0.0343, + "step": 161040 + }, + { + "epoch": 0.07525, + "grad_norm": 0.04410089552402496, + "learning_rate": 5.008131734735735e-06, + "loss": 0.0327, + "step": 161050 + }, + { + "epoch": 0.0753, + "grad_norm": 0.043044958263635635, + "learning_rate": 5.005650011573526e-06, + "loss": 0.0331, + "step": 161060 + }, + { + "epoch": 0.07535, + "grad_norm": 0.041100747883319855, + "learning_rate": 5.003168835049324e-06, + "loss": 0.0323, + "step": 161070 + }, + { + "epoch": 0.0754, + "grad_norm": 0.04438444972038269, + "learning_rate": 5.000688205230983e-06, + "loss": 0.0333, + "step": 161080 + }, + { + "epoch": 0.07545, + "grad_norm": 0.04496739059686661, + "learning_rate": 4.9982081221863094e-06, + "loss": 0.0333, + "step": 161090 + }, + { + "epoch": 0.0755, + "grad_norm": 0.04447561874985695, + "learning_rate": 4.995728585983114e-06, + "loss": 0.0326, + "step": 161100 + }, + { + "epoch": 0.07555, + "grad_norm": 0.04621170461177826, + "learning_rate": 4.993249596689179e-06, + "loss": 0.0325, + "step": 161110 + }, + { + "epoch": 0.0756, + "grad_norm": 0.04111267998814583, + "learning_rate": 4.990771154372281e-06, + "loss": 0.0331, + "step": 161120 + }, + { + "epoch": 0.07565, + "grad_norm": 0.04611721634864807, + "learning_rate": 4.988293259100188e-06, + "loss": 0.0335, + "step": 161130 + }, + { + "epoch": 0.0757, + "grad_norm": 0.040240366011857986, + "learning_rate": 4.985815910940641e-06, + "loss": 0.033, + "step": 161140 + }, + { + "epoch": 0.07575, + "grad_norm": 0.04606426879763603, + "learning_rate": 4.9833391099613685e-06, + "loss": 0.0343, + "step": 161150 + }, + { + "epoch": 0.0758, + "grad_norm": 0.04275064542889595, + "learning_rate": 4.980862856230079e-06, + "loss": 0.0337, + "step": 161160 + }, + { + "epoch": 0.07585, + "grad_norm": 0.04866638407111168, + "learning_rate": 4.978387149814481e-06, + "loss": 0.0338, + "step": 161170 + }, + { + "epoch": 0.0759, + "grad_norm": 0.04923534020781517, + "learning_rate": 4.975911990782262e-06, + "loss": 0.0335, + "step": 161180 + }, + { + "epoch": 0.07595, + "grad_norm": 0.036708369851112366, + "learning_rate": 4.973437379201076e-06, + "loss": 0.0326, + "step": 161190 + }, + { + "epoch": 0.076, + "grad_norm": 0.040371738374233246, + "learning_rate": 4.970963315138596e-06, + "loss": 0.0322, + "step": 161200 + }, + { + "epoch": 0.07605, + "grad_norm": 0.040207087993621826, + "learning_rate": 4.968489798662445e-06, + "loss": 0.0326, + "step": 161210 + }, + { + "epoch": 0.0761, + "grad_norm": 0.0415315218269825, + "learning_rate": 4.966016829840276e-06, + "loss": 0.0323, + "step": 161220 + }, + { + "epoch": 0.07615, + "grad_norm": 0.038596540689468384, + "learning_rate": 4.963544408739665e-06, + "loss": 0.0326, + "step": 161230 + }, + { + "epoch": 0.0762, + "grad_norm": 0.0413893461227417, + "learning_rate": 4.9610725354282216e-06, + "loss": 0.0335, + "step": 161240 + }, + { + "epoch": 0.07625, + "grad_norm": 0.03666406869888306, + "learning_rate": 4.958601209973535e-06, + "loss": 0.034, + "step": 161250 + }, + { + "epoch": 0.0763, + "grad_norm": 0.03784109652042389, + "learning_rate": 4.956130432443159e-06, + "loss": 0.0315, + "step": 161260 + }, + { + "epoch": 0.07635, + "grad_norm": 0.03671599552035332, + "learning_rate": 4.953660202904651e-06, + "loss": 0.0319, + "step": 161270 + }, + { + "epoch": 0.0764, + "grad_norm": 0.03958893194794655, + "learning_rate": 4.951190521425531e-06, + "loss": 0.0331, + "step": 161280 + }, + { + "epoch": 0.07645, + "grad_norm": 0.042154353111982346, + "learning_rate": 4.948721388073341e-06, + "loss": 0.0341, + "step": 161290 + }, + { + "epoch": 0.0765, + "grad_norm": 0.03948592394590378, + "learning_rate": 4.9462528029155715e-06, + "loss": 0.0324, + "step": 161300 + }, + { + "epoch": 0.07655, + "grad_norm": 0.04275766387581825, + "learning_rate": 4.943784766019713e-06, + "loss": 0.032, + "step": 161310 + }, + { + "epoch": 0.0766, + "grad_norm": 0.04145859181880951, + "learning_rate": 4.94131727745325e-06, + "loss": 0.0339, + "step": 161320 + }, + { + "epoch": 0.07665, + "grad_norm": 0.0472242496907711, + "learning_rate": 4.938850337283629e-06, + "loss": 0.033, + "step": 161330 + }, + { + "epoch": 0.0767, + "grad_norm": 0.04166737571358681, + "learning_rate": 4.9363839455783104e-06, + "loss": 0.0329, + "step": 161340 + }, + { + "epoch": 0.07675, + "grad_norm": 0.04382137209177017, + "learning_rate": 4.933918102404717e-06, + "loss": 0.033, + "step": 161350 + }, + { + "epoch": 0.0768, + "grad_norm": 0.04431990534067154, + "learning_rate": 4.931452807830259e-06, + "loss": 0.0345, + "step": 161360 + }, + { + "epoch": 0.07685, + "grad_norm": 0.057345159351825714, + "learning_rate": 4.928988061922349e-06, + "loss": 0.0325, + "step": 161370 + }, + { + "epoch": 0.0769, + "grad_norm": 0.048131149262189865, + "learning_rate": 4.926523864748362e-06, + "loss": 0.0322, + "step": 161380 + }, + { + "epoch": 0.07695, + "grad_norm": 0.04350270330905914, + "learning_rate": 4.924060216375672e-06, + "loss": 0.0333, + "step": 161390 + }, + { + "epoch": 0.077, + "grad_norm": 0.04744592681527138, + "learning_rate": 4.921597116871629e-06, + "loss": 0.0325, + "step": 161400 + }, + { + "epoch": 0.07705, + "grad_norm": 0.043828800320625305, + "learning_rate": 4.919134566303582e-06, + "loss": 0.0319, + "step": 161410 + }, + { + "epoch": 0.0771, + "grad_norm": 0.052718572318553925, + "learning_rate": 4.916672564738847e-06, + "loss": 0.0317, + "step": 161420 + }, + { + "epoch": 0.07715, + "grad_norm": 0.04408946633338928, + "learning_rate": 4.914211112244746e-06, + "loss": 0.0329, + "step": 161430 + }, + { + "epoch": 0.0772, + "grad_norm": 0.040304381400346756, + "learning_rate": 4.9117502088885654e-06, + "loss": 0.0319, + "step": 161440 + }, + { + "epoch": 0.07725, + "grad_norm": 0.0368029847741127, + "learning_rate": 4.909289854737581e-06, + "loss": 0.0324, + "step": 161450 + }, + { + "epoch": 0.0773, + "grad_norm": 0.040707238018512726, + "learning_rate": 4.906830049859074e-06, + "loss": 0.0304, + "step": 161460 + }, + { + "epoch": 0.07735, + "grad_norm": 0.03966325893998146, + "learning_rate": 4.9043707943202815e-06, + "loss": 0.0314, + "step": 161470 + }, + { + "epoch": 0.0774, + "grad_norm": 0.041993141174316406, + "learning_rate": 4.901912088188443e-06, + "loss": 0.0325, + "step": 161480 + }, + { + "epoch": 0.07745, + "grad_norm": 0.03906678408384323, + "learning_rate": 4.8994539315307705e-06, + "loss": 0.0328, + "step": 161490 + }, + { + "epoch": 0.0775, + "grad_norm": 0.03417155519127846, + "learning_rate": 4.896996324414477e-06, + "loss": 0.0316, + "step": 161500 + }, + { + "epoch": 0.07755, + "grad_norm": 0.03894183784723282, + "learning_rate": 4.894539266906764e-06, + "loss": 0.0318, + "step": 161510 + }, + { + "epoch": 0.0776, + "grad_norm": 0.047191206365823746, + "learning_rate": 4.892082759074781e-06, + "loss": 0.032, + "step": 161520 + }, + { + "epoch": 0.07765, + "grad_norm": 0.03669824078679085, + "learning_rate": 4.889626800985708e-06, + "loss": 0.0316, + "step": 161530 + }, + { + "epoch": 0.0777, + "grad_norm": 0.049897853285074234, + "learning_rate": 4.8871713927066745e-06, + "loss": 0.0326, + "step": 161540 + }, + { + "epoch": 0.07775, + "grad_norm": 0.0515248104929924, + "learning_rate": 4.884716534304829e-06, + "loss": 0.0321, + "step": 161550 + }, + { + "epoch": 0.0778, + "grad_norm": 0.04020237550139427, + "learning_rate": 4.8822622258472755e-06, + "loss": 0.0334, + "step": 161560 + }, + { + "epoch": 0.07785, + "grad_norm": 0.0410909429192543, + "learning_rate": 4.879808467401106e-06, + "loss": 0.0328, + "step": 161570 + }, + { + "epoch": 0.0779, + "grad_norm": 0.04371938109397888, + "learning_rate": 4.877355259033423e-06, + "loss": 0.034, + "step": 161580 + }, + { + "epoch": 0.07795, + "grad_norm": 0.04580468684434891, + "learning_rate": 4.874902600811287e-06, + "loss": 0.0336, + "step": 161590 + }, + { + "epoch": 0.078, + "grad_norm": 0.04708908125758171, + "learning_rate": 4.872450492801753e-06, + "loss": 0.0332, + "step": 161600 + }, + { + "epoch": 0.07805, + "grad_norm": 0.04521835222840309, + "learning_rate": 4.869998935071856e-06, + "loss": 0.0335, + "step": 161610 + }, + { + "epoch": 0.0781, + "grad_norm": 0.03202797472476959, + "learning_rate": 4.867547927688623e-06, + "loss": 0.0325, + "step": 161620 + }, + { + "epoch": 0.07815, + "grad_norm": 0.05318346619606018, + "learning_rate": 4.8650974707190765e-06, + "loss": 0.034, + "step": 161630 + }, + { + "epoch": 0.0782, + "grad_norm": 0.039052218198776245, + "learning_rate": 4.8626475642301964e-06, + "loss": 0.0319, + "step": 161640 + }, + { + "epoch": 0.07825, + "grad_norm": 0.03733101114630699, + "learning_rate": 4.860198208288969e-06, + "loss": 0.0335, + "step": 161650 + }, + { + "epoch": 0.0783, + "grad_norm": 0.0385499969124794, + "learning_rate": 4.85774940296235e-06, + "loss": 0.0331, + "step": 161660 + }, + { + "epoch": 0.07835, + "grad_norm": 0.043056417256593704, + "learning_rate": 4.855301148317301e-06, + "loss": 0.0331, + "step": 161670 + }, + { + "epoch": 0.0784, + "grad_norm": 0.046939749270677567, + "learning_rate": 4.852853444420752e-06, + "loss": 0.0324, + "step": 161680 + }, + { + "epoch": 0.07845, + "grad_norm": 0.042191725224256516, + "learning_rate": 4.850406291339612e-06, + "loss": 0.0333, + "step": 161690 + }, + { + "epoch": 0.0785, + "grad_norm": 0.04209177568554878, + "learning_rate": 4.847959689140802e-06, + "loss": 0.0318, + "step": 161700 + }, + { + "epoch": 0.07855, + "grad_norm": 0.03851882740855217, + "learning_rate": 4.845513637891197e-06, + "loss": 0.0316, + "step": 161710 + }, + { + "epoch": 0.0786, + "grad_norm": 0.03582446649670601, + "learning_rate": 4.843068137657692e-06, + "loss": 0.0351, + "step": 161720 + }, + { + "epoch": 0.07865, + "grad_norm": 0.04331406578421593, + "learning_rate": 4.840623188507115e-06, + "loss": 0.0327, + "step": 161730 + }, + { + "epoch": 0.0787, + "grad_norm": 0.0433661974966526, + "learning_rate": 4.838178790506328e-06, + "loss": 0.0322, + "step": 161740 + }, + { + "epoch": 0.07875, + "grad_norm": 0.043242111802101135, + "learning_rate": 4.835734943722167e-06, + "loss": 0.0318, + "step": 161750 + }, + { + "epoch": 0.0788, + "grad_norm": 0.04517514258623123, + "learning_rate": 4.833291648221436e-06, + "loss": 0.0334, + "step": 161760 + }, + { + "epoch": 0.07885, + "grad_norm": 0.03985520452260971, + "learning_rate": 4.830848904070934e-06, + "loss": 0.0318, + "step": 161770 + }, + { + "epoch": 0.0789, + "grad_norm": 0.03926889970898628, + "learning_rate": 4.828406711337441e-06, + "loss": 0.0326, + "step": 161780 + }, + { + "epoch": 0.07895, + "grad_norm": 0.05253654345870018, + "learning_rate": 4.825965070087735e-06, + "loss": 0.0333, + "step": 161790 + }, + { + "epoch": 0.079, + "grad_norm": 0.04187209531664848, + "learning_rate": 4.823523980388564e-06, + "loss": 0.0342, + "step": 161800 + }, + { + "epoch": 0.07905, + "grad_norm": 0.05304960161447525, + "learning_rate": 4.821083442306665e-06, + "loss": 0.0332, + "step": 161810 + }, + { + "epoch": 0.0791, + "grad_norm": 0.042362842708826065, + "learning_rate": 4.818643455908767e-06, + "loss": 0.0319, + "step": 161820 + }, + { + "epoch": 0.07915, + "grad_norm": 0.044127993285655975, + "learning_rate": 4.8162040212615695e-06, + "loss": 0.033, + "step": 161830 + }, + { + "epoch": 0.0792, + "grad_norm": 0.04105303809046745, + "learning_rate": 4.8137651384317775e-06, + "loss": 0.0319, + "step": 161840 + }, + { + "epoch": 0.07925, + "grad_norm": 0.04213591665029526, + "learning_rate": 4.8113268074860634e-06, + "loss": 0.032, + "step": 161850 + }, + { + "epoch": 0.0793, + "grad_norm": 0.04188720881938934, + "learning_rate": 4.80888902849109e-06, + "loss": 0.0324, + "step": 161860 + }, + { + "epoch": 0.07935, + "grad_norm": 0.04474461078643799, + "learning_rate": 4.806451801513498e-06, + "loss": 0.0314, + "step": 161870 + }, + { + "epoch": 0.0794, + "grad_norm": 0.05128539726138115, + "learning_rate": 4.804015126619934e-06, + "loss": 0.0333, + "step": 161880 + }, + { + "epoch": 0.07945, + "grad_norm": 0.08220715075731277, + "learning_rate": 4.80157900387701e-06, + "loss": 0.0331, + "step": 161890 + }, + { + "epoch": 0.0795, + "grad_norm": 0.04189962521195412, + "learning_rate": 4.799143433351322e-06, + "loss": 0.0319, + "step": 161900 + }, + { + "epoch": 0.07955, + "grad_norm": 0.0444892942905426, + "learning_rate": 4.796708415109469e-06, + "loss": 0.0323, + "step": 161910 + }, + { + "epoch": 0.0796, + "grad_norm": 0.041526589542627335, + "learning_rate": 4.794273949218009e-06, + "loss": 0.0325, + "step": 161920 + }, + { + "epoch": 0.07965, + "grad_norm": 0.044409725815057755, + "learning_rate": 4.791840035743525e-06, + "loss": 0.0329, + "step": 161930 + }, + { + "epoch": 0.0797, + "grad_norm": 0.037172187119722366, + "learning_rate": 4.789406674752528e-06, + "loss": 0.0334, + "step": 161940 + }, + { + "epoch": 0.07975, + "grad_norm": 0.04312952980399132, + "learning_rate": 4.786973866311559e-06, + "loss": 0.0324, + "step": 161950 + }, + { + "epoch": 0.0798, + "grad_norm": 0.05106675252318382, + "learning_rate": 4.784541610487139e-06, + "loss": 0.0349, + "step": 161960 + }, + { + "epoch": 0.07985, + "grad_norm": 0.04369065910577774, + "learning_rate": 4.7821099073457554e-06, + "loss": 0.0332, + "step": 161970 + }, + { + "epoch": 0.0799, + "grad_norm": 0.040114372968673706, + "learning_rate": 4.779678756953893e-06, + "loss": 0.0328, + "step": 161980 + }, + { + "epoch": 0.07995, + "grad_norm": 0.04935789853334427, + "learning_rate": 4.7772481593780084e-06, + "loss": 0.0329, + "step": 161990 + }, + { + "epoch": 0.08, + "grad_norm": 0.04562026634812355, + "learning_rate": 4.7748181146845626e-06, + "loss": 0.0338, + "step": 162000 + }, + { + "epoch": 0.08005, + "grad_norm": 0.04056199640035629, + "learning_rate": 4.772388622940005e-06, + "loss": 0.0328, + "step": 162010 + }, + { + "epoch": 0.0801, + "grad_norm": 0.03721732646226883, + "learning_rate": 4.769959684210728e-06, + "loss": 0.0342, + "step": 162020 + }, + { + "epoch": 0.08015, + "grad_norm": 0.047930192202329636, + "learning_rate": 4.767531298563163e-06, + "loss": 0.0331, + "step": 162030 + }, + { + "epoch": 0.0802, + "grad_norm": 0.043639253824949265, + "learning_rate": 4.765103466063683e-06, + "loss": 0.0324, + "step": 162040 + }, + { + "epoch": 0.08025, + "grad_norm": 0.05039869248867035, + "learning_rate": 4.762676186778678e-06, + "loss": 0.0343, + "step": 162050 + }, + { + "epoch": 0.0803, + "grad_norm": 0.05747959762811661, + "learning_rate": 4.760249460774505e-06, + "loss": 0.0343, + "step": 162060 + }, + { + "epoch": 0.08035, + "grad_norm": 0.042816489934921265, + "learning_rate": 4.757823288117502e-06, + "loss": 0.0342, + "step": 162070 + }, + { + "epoch": 0.0804, + "grad_norm": 0.04397408664226532, + "learning_rate": 4.755397668874009e-06, + "loss": 0.0335, + "step": 162080 + }, + { + "epoch": 0.08045, + "grad_norm": 0.040711622685194016, + "learning_rate": 4.752972603110342e-06, + "loss": 0.033, + "step": 162090 + }, + { + "epoch": 0.0805, + "grad_norm": 0.04967189207673073, + "learning_rate": 4.750548090892795e-06, + "loss": 0.0342, + "step": 162100 + }, + { + "epoch": 0.08055, + "grad_norm": 0.046030282974243164, + "learning_rate": 4.748124132287651e-06, + "loss": 0.0344, + "step": 162110 + }, + { + "epoch": 0.0806, + "grad_norm": 0.04683135077357292, + "learning_rate": 4.745700727361191e-06, + "loss": 0.0337, + "step": 162120 + }, + { + "epoch": 0.08065, + "grad_norm": 0.04129005968570709, + "learning_rate": 4.7432778761796554e-06, + "loss": 0.0328, + "step": 162130 + }, + { + "epoch": 0.0807, + "grad_norm": 0.04156533256173134, + "learning_rate": 4.7408555788093e-06, + "loss": 0.033, + "step": 162140 + }, + { + "epoch": 0.08075, + "grad_norm": 0.046787671744823456, + "learning_rate": 4.738433835316344e-06, + "loss": 0.0331, + "step": 162150 + }, + { + "epoch": 0.0808, + "grad_norm": 0.0452694408595562, + "learning_rate": 4.7360126457669876e-06, + "loss": 0.034, + "step": 162160 + }, + { + "epoch": 0.08085, + "grad_norm": 0.05058307200670242, + "learning_rate": 4.733592010227439e-06, + "loss": 0.0327, + "step": 162170 + }, + { + "epoch": 0.0809, + "grad_norm": 0.05251476168632507, + "learning_rate": 4.73117192876387e-06, + "loss": 0.0327, + "step": 162180 + }, + { + "epoch": 0.08095, + "grad_norm": 0.04287990555167198, + "learning_rate": 4.728752401442441e-06, + "loss": 0.0325, + "step": 162190 + }, + { + "epoch": 0.081, + "grad_norm": 0.039758551865816116, + "learning_rate": 4.72633342832931e-06, + "loss": 0.0327, + "step": 162200 + }, + { + "epoch": 0.08105, + "grad_norm": 0.03664357587695122, + "learning_rate": 4.723915009490601e-06, + "loss": 0.0317, + "step": 162210 + }, + { + "epoch": 0.0811, + "grad_norm": 0.0380556657910347, + "learning_rate": 4.7214971449924535e-06, + "loss": 0.0321, + "step": 162220 + }, + { + "epoch": 0.08115, + "grad_norm": 0.04451322183012962, + "learning_rate": 4.719079834900941e-06, + "loss": 0.0321, + "step": 162230 + }, + { + "epoch": 0.0812, + "grad_norm": 0.040741708129644394, + "learning_rate": 4.716663079282174e-06, + "loss": 0.0316, + "step": 162240 + }, + { + "epoch": 0.08125, + "grad_norm": 0.0428212434053421, + "learning_rate": 4.714246878202211e-06, + "loss": 0.0321, + "step": 162250 + }, + { + "epoch": 0.0813, + "grad_norm": 0.03966105729341507, + "learning_rate": 4.711831231727123e-06, + "loss": 0.0314, + "step": 162260 + }, + { + "epoch": 0.08135, + "grad_norm": 0.04452461376786232, + "learning_rate": 4.709416139922948e-06, + "loss": 0.0321, + "step": 162270 + }, + { + "epoch": 0.0814, + "grad_norm": 0.049640312790870667, + "learning_rate": 4.707001602855707e-06, + "loss": 0.0351, + "step": 162280 + }, + { + "epoch": 0.08145, + "grad_norm": 0.05360918492078781, + "learning_rate": 4.704587620591425e-06, + "loss": 0.0322, + "step": 162290 + }, + { + "epoch": 0.0815, + "grad_norm": 0.04629068076610565, + "learning_rate": 4.70217419319609e-06, + "loss": 0.0332, + "step": 162300 + }, + { + "epoch": 0.08155, + "grad_norm": 0.04976212605834007, + "learning_rate": 4.69976132073569e-06, + "loss": 0.0338, + "step": 162310 + }, + { + "epoch": 0.0816, + "grad_norm": 0.046391867101192474, + "learning_rate": 4.697349003276183e-06, + "loss": 0.0352, + "step": 162320 + }, + { + "epoch": 0.08165, + "grad_norm": 0.04320315644145012, + "learning_rate": 4.694937240883527e-06, + "loss": 0.0324, + "step": 162330 + }, + { + "epoch": 0.0817, + "grad_norm": 0.044961974024772644, + "learning_rate": 4.692526033623662e-06, + "loss": 0.0319, + "step": 162340 + }, + { + "epoch": 0.08175, + "grad_norm": 0.046283621340990067, + "learning_rate": 4.6901153815625095e-06, + "loss": 0.0358, + "step": 162350 + }, + { + "epoch": 0.0818, + "grad_norm": 0.05020609870553017, + "learning_rate": 4.6877052847659695e-06, + "loss": 0.0322, + "step": 162360 + }, + { + "epoch": 0.08185, + "grad_norm": 0.03718167915940285, + "learning_rate": 4.68529574329993e-06, + "loss": 0.0312, + "step": 162370 + }, + { + "epoch": 0.0819, + "grad_norm": 0.03635135665535927, + "learning_rate": 4.682886757230282e-06, + "loss": 0.0311, + "step": 162380 + }, + { + "epoch": 0.08195, + "grad_norm": 0.03812402859330177, + "learning_rate": 4.680478326622875e-06, + "loss": 0.0318, + "step": 162390 + }, + { + "epoch": 0.082, + "grad_norm": 0.04369513317942619, + "learning_rate": 4.678070451543551e-06, + "loss": 0.0336, + "step": 162400 + }, + { + "epoch": 0.08205, + "grad_norm": 0.041224800050258636, + "learning_rate": 4.675663132058153e-06, + "loss": 0.0314, + "step": 162410 + }, + { + "epoch": 0.0821, + "grad_norm": 0.04262978956103325, + "learning_rate": 4.673256368232482e-06, + "loss": 0.0315, + "step": 162420 + }, + { + "epoch": 0.08215, + "grad_norm": 0.045711975544691086, + "learning_rate": 4.670850160132359e-06, + "loss": 0.0326, + "step": 162430 + }, + { + "epoch": 0.0822, + "grad_norm": 0.05547575652599335, + "learning_rate": 4.668444507823544e-06, + "loss": 0.0319, + "step": 162440 + }, + { + "epoch": 0.08225, + "grad_norm": 0.03910272195935249, + "learning_rate": 4.666039411371817e-06, + "loss": 0.0317, + "step": 162450 + }, + { + "epoch": 0.0823, + "grad_norm": 0.03833167999982834, + "learning_rate": 4.6636348708429394e-06, + "loss": 0.0322, + "step": 162460 + }, + { + "epoch": 0.08235, + "grad_norm": 0.04456144571304321, + "learning_rate": 4.661230886302642e-06, + "loss": 0.0341, + "step": 162470 + }, + { + "epoch": 0.0824, + "grad_norm": 0.04156813398003578, + "learning_rate": 4.658827457816656e-06, + "loss": 0.0318, + "step": 162480 + }, + { + "epoch": 0.08245, + "grad_norm": 0.03893129900097847, + "learning_rate": 4.656424585450675e-06, + "loss": 0.0332, + "step": 162490 + }, + { + "epoch": 0.0825, + "grad_norm": 0.04058993235230446, + "learning_rate": 4.654022269270411e-06, + "loss": 0.0342, + "step": 162500 + }, + { + "epoch": 0.08255, + "grad_norm": 0.04272441565990448, + "learning_rate": 4.651620509341537e-06, + "loss": 0.0328, + "step": 162510 + }, + { + "epoch": 0.0826, + "grad_norm": 0.04472669959068298, + "learning_rate": 4.649219305729705e-06, + "loss": 0.0327, + "step": 162520 + }, + { + "epoch": 0.08265, + "grad_norm": 0.04321610555052757, + "learning_rate": 4.646818658500576e-06, + "loss": 0.0322, + "step": 162530 + }, + { + "epoch": 0.0827, + "grad_norm": 0.034631673246622086, + "learning_rate": 4.644418567719774e-06, + "loss": 0.0326, + "step": 162540 + }, + { + "epoch": 0.08275, + "grad_norm": 0.04054947569966316, + "learning_rate": 4.642019033452929e-06, + "loss": 0.0325, + "step": 162550 + }, + { + "epoch": 0.0828, + "grad_norm": 0.043881386518478394, + "learning_rate": 4.639620055765634e-06, + "loss": 0.0352, + "step": 162560 + }, + { + "epoch": 0.08285, + "grad_norm": 0.04789385944604874, + "learning_rate": 4.637221634723471e-06, + "loss": 0.0334, + "step": 162570 + }, + { + "epoch": 0.0829, + "grad_norm": 0.047630563378334045, + "learning_rate": 4.634823770392027e-06, + "loss": 0.0325, + "step": 162580 + }, + { + "epoch": 0.08295, + "grad_norm": 0.043394166976213455, + "learning_rate": 4.632426462836848e-06, + "loss": 0.0352, + "step": 162590 + }, + { + "epoch": 0.083, + "grad_norm": 0.04159924015402794, + "learning_rate": 4.6300297121234795e-06, + "loss": 0.0331, + "step": 162600 + }, + { + "epoch": 0.08305, + "grad_norm": 0.04577537253499031, + "learning_rate": 4.627633518317439e-06, + "loss": 0.0332, + "step": 162610 + }, + { + "epoch": 0.0831, + "grad_norm": 0.04137266427278519, + "learning_rate": 4.625237881484251e-06, + "loss": 0.0331, + "step": 162620 + }, + { + "epoch": 0.08315, + "grad_norm": 0.05932014808058739, + "learning_rate": 4.622842801689397e-06, + "loss": 0.034, + "step": 162630 + }, + { + "epoch": 0.0832, + "grad_norm": 0.044636886566877365, + "learning_rate": 4.620448278998374e-06, + "loss": 0.0352, + "step": 162640 + }, + { + "epoch": 0.08325, + "grad_norm": 0.04746469110250473, + "learning_rate": 4.618054313476639e-06, + "loss": 0.034, + "step": 162650 + }, + { + "epoch": 0.0833, + "grad_norm": 0.04262920096516609, + "learning_rate": 4.615660905189633e-06, + "loss": 0.0352, + "step": 162660 + }, + { + "epoch": 0.08335, + "grad_norm": 0.04448793828487396, + "learning_rate": 4.6132680542028075e-06, + "loss": 0.0339, + "step": 162670 + }, + { + "epoch": 0.0834, + "grad_norm": 0.061389073729515076, + "learning_rate": 4.610875760581573e-06, + "loss": 0.0351, + "step": 162680 + }, + { + "epoch": 0.08345, + "grad_norm": 0.047155145555734634, + "learning_rate": 4.608484024391338e-06, + "loss": 0.0363, + "step": 162690 + }, + { + "epoch": 0.0835, + "grad_norm": 0.05749020725488663, + "learning_rate": 4.6060928456974825e-06, + "loss": 0.0354, + "step": 162700 + }, + { + "epoch": 0.08355, + "grad_norm": 0.04922579973936081, + "learning_rate": 4.603702224565384e-06, + "loss": 0.0324, + "step": 162710 + }, + { + "epoch": 0.0836, + "grad_norm": 0.048341743648052216, + "learning_rate": 4.6013121610604196e-06, + "loss": 0.0359, + "step": 162720 + }, + { + "epoch": 0.08365, + "grad_norm": 0.05521691218018532, + "learning_rate": 4.598922655247906e-06, + "loss": 0.0341, + "step": 162730 + }, + { + "epoch": 0.0837, + "grad_norm": 0.061152759939432144, + "learning_rate": 4.596533707193185e-06, + "loss": 0.0356, + "step": 162740 + }, + { + "epoch": 0.08375, + "grad_norm": 0.05486699938774109, + "learning_rate": 4.594145316961562e-06, + "loss": 0.0361, + "step": 162750 + }, + { + "epoch": 0.0838, + "grad_norm": 0.0427870899438858, + "learning_rate": 4.591757484618348e-06, + "loss": 0.0343, + "step": 162760 + }, + { + "epoch": 0.08385, + "grad_norm": 0.05687882378697395, + "learning_rate": 4.589370210228816e-06, + "loss": 0.0374, + "step": 162770 + }, + { + "epoch": 0.0839, + "grad_norm": 0.04917405545711517, + "learning_rate": 4.5869834938582295e-06, + "loss": 0.0335, + "step": 162780 + }, + { + "epoch": 0.08395, + "grad_norm": 0.04779008403420448, + "learning_rate": 4.58459733557185e-06, + "loss": 0.034, + "step": 162790 + }, + { + "epoch": 0.084, + "grad_norm": 0.04037446901202202, + "learning_rate": 4.582211735434911e-06, + "loss": 0.034, + "step": 162800 + }, + { + "epoch": 0.08405, + "grad_norm": 0.038166724145412445, + "learning_rate": 4.579826693512632e-06, + "loss": 0.0332, + "step": 162810 + }, + { + "epoch": 0.0841, + "grad_norm": 0.04677105322480202, + "learning_rate": 4.577442209870214e-06, + "loss": 0.0341, + "step": 162820 + }, + { + "epoch": 0.08415, + "grad_norm": 0.037814054638147354, + "learning_rate": 4.575058284572853e-06, + "loss": 0.0343, + "step": 162830 + }, + { + "epoch": 0.0842, + "grad_norm": 0.048276208341121674, + "learning_rate": 4.572674917685732e-06, + "loss": 0.0336, + "step": 162840 + }, + { + "epoch": 0.08425, + "grad_norm": 0.04171053692698479, + "learning_rate": 4.570292109274005e-06, + "loss": 0.0338, + "step": 162850 + }, + { + "epoch": 0.0843, + "grad_norm": 0.04667214676737785, + "learning_rate": 4.5679098594028135e-06, + "loss": 0.0335, + "step": 162860 + }, + { + "epoch": 0.08435, + "grad_norm": 0.04163886606693268, + "learning_rate": 4.5655281681372865e-06, + "loss": 0.0341, + "step": 162870 + }, + { + "epoch": 0.0844, + "grad_norm": 0.043504420667886734, + "learning_rate": 4.56314703554255e-06, + "loss": 0.0376, + "step": 162880 + }, + { + "epoch": 0.08445, + "grad_norm": 0.040289878845214844, + "learning_rate": 4.5607664616836935e-06, + "loss": 0.0327, + "step": 162890 + }, + { + "epoch": 0.0845, + "grad_norm": 0.0437614843249321, + "learning_rate": 4.558386446625798e-06, + "loss": 0.034, + "step": 162900 + }, + { + "epoch": 0.08455, + "grad_norm": 0.038885943591594696, + "learning_rate": 4.5560069904339445e-06, + "loss": 0.0332, + "step": 162910 + }, + { + "epoch": 0.0846, + "grad_norm": 0.04464210197329521, + "learning_rate": 4.553628093173173e-06, + "loss": 0.033, + "step": 162920 + }, + { + "epoch": 0.08465, + "grad_norm": 0.03692776337265968, + "learning_rate": 4.551249754908541e-06, + "loss": 0.0318, + "step": 162930 + }, + { + "epoch": 0.0847, + "grad_norm": 0.039213284850120544, + "learning_rate": 4.548871975705043e-06, + "loss": 0.033, + "step": 162940 + }, + { + "epoch": 0.08475, + "grad_norm": 0.038259051740169525, + "learning_rate": 4.546494755627703e-06, + "loss": 0.0313, + "step": 162950 + }, + { + "epoch": 0.0848, + "grad_norm": 0.0368780642747879, + "learning_rate": 4.54411809474152e-06, + "loss": 0.0321, + "step": 162960 + }, + { + "epoch": 0.08485, + "grad_norm": 0.03567549213767052, + "learning_rate": 4.541741993111465e-06, + "loss": 0.0313, + "step": 162970 + }, + { + "epoch": 0.0849, + "grad_norm": 0.043377235531806946, + "learning_rate": 4.539366450802496e-06, + "loss": 0.0325, + "step": 162980 + }, + { + "epoch": 0.08495, + "grad_norm": 0.05000419542193413, + "learning_rate": 4.5369914678795535e-06, + "loss": 0.0317, + "step": 162990 + }, + { + "epoch": 0.085, + "grad_norm": 0.04062369838356972, + "learning_rate": 4.534617044407586e-06, + "loss": 0.0318, + "step": 163000 + }, + { + "epoch": 0.08505, + "grad_norm": 0.03969515115022659, + "learning_rate": 4.532243180451498e-06, + "loss": 0.0329, + "step": 163010 + }, + { + "epoch": 0.0851, + "grad_norm": 0.0338253490626812, + "learning_rate": 4.529869876076187e-06, + "loss": 0.0323, + "step": 163020 + }, + { + "epoch": 0.08515, + "grad_norm": 0.03509035333991051, + "learning_rate": 4.52749713134655e-06, + "loss": 0.0331, + "step": 163030 + }, + { + "epoch": 0.0852, + "grad_norm": 0.04538458585739136, + "learning_rate": 4.525124946327444e-06, + "loss": 0.0325, + "step": 163040 + }, + { + "epoch": 0.08525, + "grad_norm": 0.045619990676641464, + "learning_rate": 4.522753321083734e-06, + "loss": 0.0348, + "step": 163050 + }, + { + "epoch": 0.0853, + "grad_norm": 0.0390336811542511, + "learning_rate": 4.5203822556802586e-06, + "loss": 0.0327, + "step": 163060 + }, + { + "epoch": 0.08535, + "grad_norm": 0.04070988669991493, + "learning_rate": 4.518011750181836e-06, + "loss": 0.0343, + "step": 163070 + }, + { + "epoch": 0.0854, + "grad_norm": 0.04247087240219116, + "learning_rate": 4.51564180465327e-06, + "loss": 0.0344, + "step": 163080 + }, + { + "epoch": 0.08545, + "grad_norm": 0.08123894035816193, + "learning_rate": 4.51327241915937e-06, + "loss": 0.0335, + "step": 163090 + }, + { + "epoch": 0.0855, + "grad_norm": 0.05599669739603996, + "learning_rate": 4.510903593764906e-06, + "loss": 0.0319, + "step": 163100 + }, + { + "epoch": 0.08555, + "grad_norm": 0.03611220419406891, + "learning_rate": 4.508535328534632e-06, + "loss": 0.032, + "step": 163110 + }, + { + "epoch": 0.0856, + "grad_norm": 0.04675504192709923, + "learning_rate": 4.506167623533311e-06, + "loss": 0.0332, + "step": 163120 + }, + { + "epoch": 0.08565, + "grad_norm": 0.03999366983771324, + "learning_rate": 4.5038004788256625e-06, + "loss": 0.0316, + "step": 163130 + }, + { + "epoch": 0.0857, + "grad_norm": 0.04325946420431137, + "learning_rate": 4.50143389447642e-06, + "loss": 0.0325, + "step": 163140 + }, + { + "epoch": 0.08575, + "grad_norm": 0.03891367092728615, + "learning_rate": 4.4990678705502635e-06, + "loss": 0.0327, + "step": 163150 + }, + { + "epoch": 0.0858, + "grad_norm": 0.0469384640455246, + "learning_rate": 4.496702407111888e-06, + "loss": 0.0319, + "step": 163160 + }, + { + "epoch": 0.08585, + "grad_norm": 0.04036073386669159, + "learning_rate": 4.494337504225971e-06, + "loss": 0.0314, + "step": 163170 + }, + { + "epoch": 0.0859, + "grad_norm": 0.03913462162017822, + "learning_rate": 4.491973161957167e-06, + "loss": 0.0325, + "step": 163180 + }, + { + "epoch": 0.08595, + "grad_norm": 0.039492640644311905, + "learning_rate": 4.4896093803701076e-06, + "loss": 0.0318, + "step": 163190 + }, + { + "epoch": 0.086, + "grad_norm": 0.043633848428726196, + "learning_rate": 4.487246159529418e-06, + "loss": 0.031, + "step": 163200 + }, + { + "epoch": 0.08605, + "grad_norm": 0.03938918560743332, + "learning_rate": 4.484883499499712e-06, + "loss": 0.0338, + "step": 163210 + }, + { + "epoch": 0.0861, + "grad_norm": 0.036930423229932785, + "learning_rate": 4.482521400345599e-06, + "loss": 0.032, + "step": 163220 + }, + { + "epoch": 0.08615, + "grad_norm": 0.0479891262948513, + "learning_rate": 4.4801598621316274e-06, + "loss": 0.0336, + "step": 163230 + }, + { + "epoch": 0.0862, + "grad_norm": 0.04100598394870758, + "learning_rate": 4.477798884922382e-06, + "loss": 0.0351, + "step": 163240 + }, + { + "epoch": 0.08625, + "grad_norm": 0.0451224111020565, + "learning_rate": 4.4754384687824e-06, + "loss": 0.0335, + "step": 163250 + }, + { + "epoch": 0.0863, + "grad_norm": 0.03889250010251999, + "learning_rate": 4.473078613776227e-06, + "loss": 0.035, + "step": 163260 + }, + { + "epoch": 0.08635, + "grad_norm": 0.04930736497044563, + "learning_rate": 4.470719319968372e-06, + "loss": 0.0342, + "step": 163270 + }, + { + "epoch": 0.0864, + "grad_norm": 0.04826205223798752, + "learning_rate": 4.4683605874233315e-06, + "loss": 0.0338, + "step": 163280 + }, + { + "epoch": 0.08645, + "grad_norm": 0.04418352618813515, + "learning_rate": 4.466002416205606e-06, + "loss": 0.0324, + "step": 163290 + }, + { + "epoch": 0.0865, + "grad_norm": 0.04168399050831795, + "learning_rate": 4.4636448063796605e-06, + "loss": 0.0321, + "step": 163300 + }, + { + "epoch": 0.08655, + "grad_norm": 0.044045589864254, + "learning_rate": 4.461287758009949e-06, + "loss": 0.0341, + "step": 163310 + }, + { + "epoch": 0.0866, + "grad_norm": 0.03900770843029022, + "learning_rate": 4.45893127116091e-06, + "loss": 0.0329, + "step": 163320 + }, + { + "epoch": 0.08665, + "grad_norm": 0.0450579933822155, + "learning_rate": 4.45657534589698e-06, + "loss": 0.0336, + "step": 163330 + }, + { + "epoch": 0.0867, + "grad_norm": 0.04815605282783508, + "learning_rate": 4.454219982282554e-06, + "loss": 0.0338, + "step": 163340 + }, + { + "epoch": 0.08675, + "grad_norm": 0.043602343648672104, + "learning_rate": 4.451865180382042e-06, + "loss": 0.0337, + "step": 163350 + }, + { + "epoch": 0.0868, + "grad_norm": 0.044232890009880066, + "learning_rate": 4.449510940259819e-06, + "loss": 0.0341, + "step": 163360 + }, + { + "epoch": 0.08685, + "grad_norm": 0.041185762733221054, + "learning_rate": 4.447157261980237e-06, + "loss": 0.0337, + "step": 163370 + }, + { + "epoch": 0.0869, + "grad_norm": 0.043885987251996994, + "learning_rate": 4.444804145607659e-06, + "loss": 0.0331, + "step": 163380 + }, + { + "epoch": 0.08695, + "grad_norm": 0.04258694499731064, + "learning_rate": 4.442451591206417e-06, + "loss": 0.0328, + "step": 163390 + }, + { + "epoch": 0.087, + "grad_norm": 0.04276786372065544, + "learning_rate": 4.440099598840816e-06, + "loss": 0.0336, + "step": 163400 + }, + { + "epoch": 0.08705, + "grad_norm": 0.04454897344112396, + "learning_rate": 4.437748168575176e-06, + "loss": 0.0334, + "step": 163410 + }, + { + "epoch": 0.0871, + "grad_norm": 0.039518099278211594, + "learning_rate": 4.43539730047377e-06, + "loss": 0.0322, + "step": 163420 + }, + { + "epoch": 0.08715, + "grad_norm": 0.04390877112746239, + "learning_rate": 4.433046994600889e-06, + "loss": 0.0335, + "step": 163430 + }, + { + "epoch": 0.0872, + "grad_norm": 0.03864302486181259, + "learning_rate": 4.4306972510207625e-06, + "loss": 0.0318, + "step": 163440 + }, + { + "epoch": 0.08725, + "grad_norm": 0.04346649348735809, + "learning_rate": 4.428348069797653e-06, + "loss": 0.0339, + "step": 163450 + }, + { + "epoch": 0.0873, + "grad_norm": 0.05340910330414772, + "learning_rate": 4.425999450995771e-06, + "loss": 0.0348, + "step": 163460 + }, + { + "epoch": 0.08735, + "grad_norm": 0.04619944468140602, + "learning_rate": 4.423651394679343e-06, + "loss": 0.0316, + "step": 163470 + }, + { + "epoch": 0.0874, + "grad_norm": 0.0388994924724102, + "learning_rate": 4.421303900912555e-06, + "loss": 0.0323, + "step": 163480 + }, + { + "epoch": 0.08745, + "grad_norm": 0.043405961245298386, + "learning_rate": 4.418956969759583e-06, + "loss": 0.0348, + "step": 163490 + }, + { + "epoch": 0.0875, + "grad_norm": 0.04131999611854553, + "learning_rate": 4.416610601284599e-06, + "loss": 0.0321, + "step": 163500 + }, + { + "epoch": 0.08755, + "grad_norm": 0.03781171143054962, + "learning_rate": 4.414264795551748e-06, + "loss": 0.0328, + "step": 163510 + }, + { + "epoch": 0.0876, + "grad_norm": 0.040955789387226105, + "learning_rate": 4.411919552625165e-06, + "loss": 0.0339, + "step": 163520 + }, + { + "epoch": 0.08765, + "grad_norm": 0.040484536439180374, + "learning_rate": 4.409574872568961e-06, + "loss": 0.0349, + "step": 163530 + }, + { + "epoch": 0.0877, + "grad_norm": 0.04398057982325554, + "learning_rate": 4.407230755447245e-06, + "loss": 0.0346, + "step": 163540 + }, + { + "epoch": 0.08775, + "grad_norm": 0.038379691541194916, + "learning_rate": 4.404887201324107e-06, + "loss": 0.0326, + "step": 163550 + }, + { + "epoch": 0.0878, + "grad_norm": 0.04793104901909828, + "learning_rate": 4.402544210263618e-06, + "loss": 0.0342, + "step": 163560 + }, + { + "epoch": 0.08785, + "grad_norm": 0.04331078752875328, + "learning_rate": 4.400201782329833e-06, + "loss": 0.0329, + "step": 163570 + }, + { + "epoch": 0.0879, + "grad_norm": 0.036909881979227066, + "learning_rate": 4.3978599175867855e-06, + "loss": 0.0324, + "step": 163580 + }, + { + "epoch": 0.08795, + "grad_norm": 0.038852840662002563, + "learning_rate": 4.395518616098513e-06, + "loss": 0.0332, + "step": 163590 + }, + { + "epoch": 0.088, + "grad_norm": 0.04239456355571747, + "learning_rate": 4.393177877929022e-06, + "loss": 0.033, + "step": 163600 + }, + { + "epoch": 0.08805, + "grad_norm": 0.036402180790901184, + "learning_rate": 4.390837703142298e-06, + "loss": 0.0368, + "step": 163610 + }, + { + "epoch": 0.0881, + "grad_norm": 0.04131398722529411, + "learning_rate": 4.388498091802337e-06, + "loss": 0.033, + "step": 163620 + }, + { + "epoch": 0.08815, + "grad_norm": 0.04325365275144577, + "learning_rate": 4.386159043973087e-06, + "loss": 0.0324, + "step": 163630 + }, + { + "epoch": 0.0882, + "grad_norm": 0.04191657900810242, + "learning_rate": 4.3838205597185186e-06, + "loss": 0.0317, + "step": 163640 + }, + { + "epoch": 0.08825, + "grad_norm": 0.04028930142521858, + "learning_rate": 4.381482639102538e-06, + "loss": 0.0334, + "step": 163650 + }, + { + "epoch": 0.0883, + "grad_norm": 0.0353885218501091, + "learning_rate": 4.379145282189076e-06, + "loss": 0.0324, + "step": 163660 + }, + { + "epoch": 0.08835, + "grad_norm": 0.037858471274375916, + "learning_rate": 4.376808489042042e-06, + "loss": 0.0319, + "step": 163670 + }, + { + "epoch": 0.0884, + "grad_norm": 0.03909361734986305, + "learning_rate": 4.374472259725315e-06, + "loss": 0.0323, + "step": 163680 + }, + { + "epoch": 0.08845, + "grad_norm": 0.03300325945019722, + "learning_rate": 4.372136594302767e-06, + "loss": 0.0315, + "step": 163690 + }, + { + "epoch": 0.0885, + "grad_norm": 0.04488392919301987, + "learning_rate": 4.369801492838249e-06, + "loss": 0.032, + "step": 163700 + }, + { + "epoch": 0.08855, + "grad_norm": 0.04392409324645996, + "learning_rate": 4.367466955395616e-06, + "loss": 0.0326, + "step": 163710 + }, + { + "epoch": 0.0886, + "grad_norm": 0.041952718049287796, + "learning_rate": 4.3651329820386835e-06, + "loss": 0.0325, + "step": 163720 + }, + { + "epoch": 0.08865, + "grad_norm": 0.035865992307662964, + "learning_rate": 4.362799572831258e-06, + "loss": 0.0317, + "step": 163730 + }, + { + "epoch": 0.0887, + "grad_norm": 0.042515747249126434, + "learning_rate": 4.360466727837146e-06, + "loss": 0.0318, + "step": 163740 + }, + { + "epoch": 0.08875, + "grad_norm": 0.03912314772605896, + "learning_rate": 4.35813444712011e-06, + "loss": 0.0339, + "step": 163750 + }, + { + "epoch": 0.0888, + "grad_norm": 0.041423387825489044, + "learning_rate": 4.355802730743932e-06, + "loss": 0.0322, + "step": 163760 + }, + { + "epoch": 0.08885, + "grad_norm": 0.04592496156692505, + "learning_rate": 4.3534715787723525e-06, + "loss": 0.0325, + "step": 163770 + }, + { + "epoch": 0.0889, + "grad_norm": 0.04569490626454353, + "learning_rate": 4.3511409912690955e-06, + "loss": 0.0336, + "step": 163780 + }, + { + "epoch": 0.08895, + "grad_norm": 0.04563876986503601, + "learning_rate": 4.348810968297895e-06, + "loss": 0.0333, + "step": 163790 + }, + { + "epoch": 0.089, + "grad_norm": 0.04631095752120018, + "learning_rate": 4.346481509922443e-06, + "loss": 0.0334, + "step": 163800 + }, + { + "epoch": 0.08905, + "grad_norm": 0.04735285043716431, + "learning_rate": 4.344152616206426e-06, + "loss": 0.0329, + "step": 163810 + }, + { + "epoch": 0.0891, + "grad_norm": 0.04486202821135521, + "learning_rate": 4.341824287213511e-06, + "loss": 0.0325, + "step": 163820 + }, + { + "epoch": 0.08915, + "grad_norm": 0.04026487097144127, + "learning_rate": 4.3394965230073665e-06, + "loss": 0.0329, + "step": 163830 + }, + { + "epoch": 0.0892, + "grad_norm": 0.03889838606119156, + "learning_rate": 4.337169323651619e-06, + "loss": 0.0344, + "step": 163840 + }, + { + "epoch": 0.08925, + "grad_norm": 0.047240667045116425, + "learning_rate": 4.334842689209903e-06, + "loss": 0.0336, + "step": 163850 + }, + { + "epoch": 0.0893, + "grad_norm": 0.04033409059047699, + "learning_rate": 4.332516619745828e-06, + "loss": 0.0347, + "step": 163860 + }, + { + "epoch": 0.08935, + "grad_norm": 0.05238153040409088, + "learning_rate": 4.330191115322973e-06, + "loss": 0.034, + "step": 163870 + }, + { + "epoch": 0.0894, + "grad_norm": 0.05487070977687836, + "learning_rate": 4.327866176004938e-06, + "loss": 0.0341, + "step": 163880 + }, + { + "epoch": 0.08945, + "grad_norm": 0.05616867542266846, + "learning_rate": 4.325541801855276e-06, + "loss": 0.0332, + "step": 163890 + }, + { + "epoch": 0.0895, + "grad_norm": 0.040924083441495895, + "learning_rate": 4.323217992937531e-06, + "loss": 0.037, + "step": 163900 + }, + { + "epoch": 0.08955, + "grad_norm": 0.04166407510638237, + "learning_rate": 4.320894749315235e-06, + "loss": 0.0334, + "step": 163910 + }, + { + "epoch": 0.0896, + "grad_norm": 0.03643600270152092, + "learning_rate": 4.3185720710519075e-06, + "loss": 0.0338, + "step": 163920 + }, + { + "epoch": 0.08965, + "grad_norm": 0.03127395734190941, + "learning_rate": 4.316249958211061e-06, + "loss": 0.0316, + "step": 163930 + }, + { + "epoch": 0.0897, + "grad_norm": 0.036043621599674225, + "learning_rate": 4.313928410856158e-06, + "loss": 0.0315, + "step": 163940 + }, + { + "epoch": 0.08975, + "grad_norm": 0.03948104754090309, + "learning_rate": 4.311607429050687e-06, + "loss": 0.0323, + "step": 163950 + }, + { + "epoch": 0.0898, + "grad_norm": 0.03400971367955208, + "learning_rate": 4.30928701285809e-06, + "loss": 0.0316, + "step": 163960 + }, + { + "epoch": 0.08985, + "grad_norm": 0.037716370075941086, + "learning_rate": 4.306967162341818e-06, + "loss": 0.0331, + "step": 163970 + }, + { + "epoch": 0.0899, + "grad_norm": 0.03730543330311775, + "learning_rate": 4.304647877565293e-06, + "loss": 0.032, + "step": 163980 + }, + { + "epoch": 0.08995, + "grad_norm": 0.039050523191690445, + "learning_rate": 4.302329158591911e-06, + "loss": 0.0309, + "step": 163990 + }, + { + "epoch": 0.09, + "grad_norm": 0.038181960582733154, + "learning_rate": 4.3000110054850826e-06, + "loss": 0.0312, + "step": 164000 + }, + { + "epoch": 0.09005, + "grad_norm": 0.03886967524886131, + "learning_rate": 4.297693418308177e-06, + "loss": 0.0338, + "step": 164010 + }, + { + "epoch": 0.0901, + "grad_norm": 0.03755145147442818, + "learning_rate": 4.295376397124554e-06, + "loss": 0.0316, + "step": 164020 + }, + { + "epoch": 0.09015, + "grad_norm": 0.03620903566479683, + "learning_rate": 4.293059941997557e-06, + "loss": 0.0324, + "step": 164030 + }, + { + "epoch": 0.0902, + "grad_norm": 0.04417108744382858, + "learning_rate": 4.29074405299052e-06, + "loss": 0.0318, + "step": 164040 + }, + { + "epoch": 0.09025, + "grad_norm": 0.04349370300769806, + "learning_rate": 4.288428730166768e-06, + "loss": 0.0335, + "step": 164050 + }, + { + "epoch": 0.0903, + "grad_norm": 0.04132212698459625, + "learning_rate": 4.286113973589595e-06, + "loss": 0.0329, + "step": 164060 + }, + { + "epoch": 0.09035, + "grad_norm": 0.04030095413327217, + "learning_rate": 4.283799783322282e-06, + "loss": 0.0328, + "step": 164070 + }, + { + "epoch": 0.0904, + "grad_norm": 0.044824227690696716, + "learning_rate": 4.2814861594280946e-06, + "loss": 0.0339, + "step": 164080 + }, + { + "epoch": 0.09045, + "grad_norm": 0.04444180056452751, + "learning_rate": 4.279173101970296e-06, + "loss": 0.0331, + "step": 164090 + }, + { + "epoch": 0.0905, + "grad_norm": 0.03922867774963379, + "learning_rate": 4.276860611012124e-06, + "loss": 0.0316, + "step": 164100 + }, + { + "epoch": 0.09055, + "grad_norm": 0.03677331656217575, + "learning_rate": 4.274548686616789e-06, + "loss": 0.0335, + "step": 164110 + }, + { + "epoch": 0.0906, + "grad_norm": 0.036875706166028976, + "learning_rate": 4.272237328847514e-06, + "loss": 0.0321, + "step": 164120 + }, + { + "epoch": 0.09065, + "grad_norm": 0.035596031695604324, + "learning_rate": 4.269926537767477e-06, + "loss": 0.0327, + "step": 164130 + }, + { + "epoch": 0.0907, + "grad_norm": 0.04016328975558281, + "learning_rate": 4.267616313439873e-06, + "loss": 0.0322, + "step": 164140 + }, + { + "epoch": 0.09075, + "grad_norm": 0.05132593587040901, + "learning_rate": 4.26530665592784e-06, + "loss": 0.0347, + "step": 164150 + }, + { + "epoch": 0.0908, + "grad_norm": 0.041420597583055496, + "learning_rate": 4.2629975652945295e-06, + "loss": 0.032, + "step": 164160 + }, + { + "epoch": 0.09085, + "grad_norm": 0.047631099820137024, + "learning_rate": 4.260689041603083e-06, + "loss": 0.0325, + "step": 164170 + }, + { + "epoch": 0.0909, + "grad_norm": 0.044410839676856995, + "learning_rate": 4.2583810849166076e-06, + "loss": 0.0316, + "step": 164180 + }, + { + "epoch": 0.09095, + "grad_norm": 0.054925624281167984, + "learning_rate": 4.2560736952981986e-06, + "loss": 0.0329, + "step": 164190 + }, + { + "epoch": 0.091, + "grad_norm": 0.04730357602238655, + "learning_rate": 4.253766872810938e-06, + "loss": 0.0339, + "step": 164200 + }, + { + "epoch": 0.09105, + "grad_norm": 0.04951634630560875, + "learning_rate": 4.251460617517903e-06, + "loss": 0.0336, + "step": 164210 + }, + { + "epoch": 0.0911, + "grad_norm": 0.05603445693850517, + "learning_rate": 4.249154929482138e-06, + "loss": 0.0339, + "step": 164220 + }, + { + "epoch": 0.09115, + "grad_norm": 0.045728061348199844, + "learning_rate": 4.246849808766676e-06, + "loss": 0.0332, + "step": 164230 + }, + { + "epoch": 0.0912, + "grad_norm": 0.040378279983997345, + "learning_rate": 4.244545255434551e-06, + "loss": 0.0335, + "step": 164240 + }, + { + "epoch": 0.09125, + "grad_norm": 0.043054450303316116, + "learning_rate": 4.242241269548752e-06, + "loss": 0.0325, + "step": 164250 + }, + { + "epoch": 0.0913, + "grad_norm": 0.03699095547199249, + "learning_rate": 4.239937851172287e-06, + "loss": 0.0326, + "step": 164260 + }, + { + "epoch": 0.09135, + "grad_norm": 0.044853974133729935, + "learning_rate": 4.237635000368123e-06, + "loss": 0.0333, + "step": 164270 + }, + { + "epoch": 0.0914, + "grad_norm": 0.037403397262096405, + "learning_rate": 4.235332717199217e-06, + "loss": 0.0327, + "step": 164280 + }, + { + "epoch": 0.09145, + "grad_norm": 0.04466768726706505, + "learning_rate": 4.233031001728508e-06, + "loss": 0.0342, + "step": 164290 + }, + { + "epoch": 0.0915, + "grad_norm": 0.04541900008916855, + "learning_rate": 4.230729854018933e-06, + "loss": 0.0344, + "step": 164300 + }, + { + "epoch": 0.09155, + "grad_norm": 0.04904589429497719, + "learning_rate": 4.228429274133403e-06, + "loss": 0.0351, + "step": 164310 + }, + { + "epoch": 0.0916, + "grad_norm": 0.03982043266296387, + "learning_rate": 4.226129262134807e-06, + "loss": 0.0335, + "step": 164320 + }, + { + "epoch": 0.09165, + "grad_norm": 0.04510324448347092, + "learning_rate": 4.2238298180860396e-06, + "loss": 0.033, + "step": 164330 + }, + { + "epoch": 0.0917, + "grad_norm": 0.05479760095477104, + "learning_rate": 4.221530942049953e-06, + "loss": 0.0336, + "step": 164340 + }, + { + "epoch": 0.09175, + "grad_norm": 0.04232299327850342, + "learning_rate": 4.219232634089415e-06, + "loss": 0.0324, + "step": 164350 + }, + { + "epoch": 0.0918, + "grad_norm": 0.03701121360063553, + "learning_rate": 4.2169348942672406e-06, + "loss": 0.0317, + "step": 164360 + }, + { + "epoch": 0.09185, + "grad_norm": 0.04614982753992081, + "learning_rate": 4.214637722646256e-06, + "loss": 0.0331, + "step": 164370 + }, + { + "epoch": 0.0919, + "grad_norm": 0.04048416391015053, + "learning_rate": 4.212341119289273e-06, + "loss": 0.0336, + "step": 164380 + }, + { + "epoch": 0.09195, + "grad_norm": 0.040577538311481476, + "learning_rate": 4.210045084259076e-06, + "loss": 0.0324, + "step": 164390 + }, + { + "epoch": 0.092, + "grad_norm": 0.040321774780750275, + "learning_rate": 4.207749617618437e-06, + "loss": 0.0336, + "step": 164400 + }, + { + "epoch": 0.09205, + "grad_norm": 0.04228787124156952, + "learning_rate": 4.205454719430105e-06, + "loss": 0.0325, + "step": 164410 + }, + { + "epoch": 0.0921, + "grad_norm": 0.036766789853572845, + "learning_rate": 4.203160389756827e-06, + "loss": 0.0332, + "step": 164420 + }, + { + "epoch": 0.09215, + "grad_norm": 0.03712713345885277, + "learning_rate": 4.200866628661346e-06, + "loss": 0.032, + "step": 164430 + }, + { + "epoch": 0.0922, + "grad_norm": 0.03988231346011162, + "learning_rate": 4.198573436206344e-06, + "loss": 0.0325, + "step": 164440 + }, + { + "epoch": 0.09225, + "grad_norm": 0.03686943277716637, + "learning_rate": 4.196280812454534e-06, + "loss": 0.0318, + "step": 164450 + }, + { + "epoch": 0.0923, + "grad_norm": 0.04354991391301155, + "learning_rate": 4.193988757468587e-06, + "loss": 0.0316, + "step": 164460 + }, + { + "epoch": 0.09235, + "grad_norm": 0.043063972145318985, + "learning_rate": 4.191697271311176e-06, + "loss": 0.0329, + "step": 164470 + }, + { + "epoch": 0.0924, + "grad_norm": 0.045166417956352234, + "learning_rate": 4.1894063540449445e-06, + "loss": 0.0311, + "step": 164480 + }, + { + "epoch": 0.09245, + "grad_norm": 0.041872259229421616, + "learning_rate": 4.18711600573252e-06, + "loss": 0.0324, + "step": 164490 + }, + { + "epoch": 0.0925, + "grad_norm": 0.04205867648124695, + "learning_rate": 4.184826226436528e-06, + "loss": 0.034, + "step": 164500 + }, + { + "epoch": 0.09255, + "grad_norm": 0.04708458483219147, + "learning_rate": 4.18253701621957e-06, + "loss": 0.0329, + "step": 164510 + }, + { + "epoch": 0.0926, + "grad_norm": 0.0457821786403656, + "learning_rate": 4.180248375144227e-06, + "loss": 0.0333, + "step": 164520 + }, + { + "epoch": 0.09265, + "grad_norm": 0.04448466747999191, + "learning_rate": 4.177960303273068e-06, + "loss": 0.0333, + "step": 164530 + }, + { + "epoch": 0.0927, + "grad_norm": 0.03960685804486275, + "learning_rate": 4.175672800668656e-06, + "loss": 0.0324, + "step": 164540 + }, + { + "epoch": 0.09275, + "grad_norm": 0.04607471451163292, + "learning_rate": 4.173385867393522e-06, + "loss": 0.0322, + "step": 164550 + }, + { + "epoch": 0.0928, + "grad_norm": 0.053429655730724335, + "learning_rate": 4.171099503510198e-06, + "loss": 0.0347, + "step": 164560 + }, + { + "epoch": 0.09285, + "grad_norm": 0.055426646023988724, + "learning_rate": 4.16881370908119e-06, + "loss": 0.0339, + "step": 164570 + }, + { + "epoch": 0.0929, + "grad_norm": 0.039910197257995605, + "learning_rate": 4.166528484168986e-06, + "loss": 0.0325, + "step": 164580 + }, + { + "epoch": 0.09295, + "grad_norm": 0.05889817699790001, + "learning_rate": 4.164243828836067e-06, + "loss": 0.0335, + "step": 164590 + }, + { + "epoch": 0.093, + "grad_norm": 0.04975072294473648, + "learning_rate": 4.161959743144897e-06, + "loss": 0.0321, + "step": 164600 + }, + { + "epoch": 0.09305, + "grad_norm": 0.04804937541484833, + "learning_rate": 4.159676227157913e-06, + "loss": 0.033, + "step": 164610 + }, + { + "epoch": 0.0931, + "grad_norm": 0.042053185403347015, + "learning_rate": 4.1573932809375574e-06, + "loss": 0.0335, + "step": 164620 + }, + { + "epoch": 0.09315, + "grad_norm": 0.04230290278792381, + "learning_rate": 4.155110904546233e-06, + "loss": 0.0317, + "step": 164630 + }, + { + "epoch": 0.0932, + "grad_norm": 0.036810606718063354, + "learning_rate": 4.1528290980463596e-06, + "loss": 0.035, + "step": 164640 + }, + { + "epoch": 0.09325, + "grad_norm": 0.04115741327404976, + "learning_rate": 4.150547861500293e-06, + "loss": 0.0315, + "step": 164650 + }, + { + "epoch": 0.0933, + "grad_norm": 0.04060962423682213, + "learning_rate": 4.14826719497042e-06, + "loss": 0.0321, + "step": 164660 + }, + { + "epoch": 0.09335, + "grad_norm": 0.04310805723071098, + "learning_rate": 4.145987098519083e-06, + "loss": 0.0322, + "step": 164670 + }, + { + "epoch": 0.0934, + "grad_norm": 0.0482821948826313, + "learning_rate": 4.14370757220863e-06, + "loss": 0.0338, + "step": 164680 + }, + { + "epoch": 0.09345, + "grad_norm": 0.03712339699268341, + "learning_rate": 4.141428616101378e-06, + "loss": 0.0316, + "step": 164690 + }, + { + "epoch": 0.0935, + "grad_norm": 0.033921223133802414, + "learning_rate": 4.139150230259625e-06, + "loss": 0.0326, + "step": 164700 + }, + { + "epoch": 0.09355, + "grad_norm": 0.03595450893044472, + "learning_rate": 4.136872414745674e-06, + "loss": 0.0323, + "step": 164710 + }, + { + "epoch": 0.0936, + "grad_norm": 0.0412093885242939, + "learning_rate": 4.134595169621791e-06, + "loss": 0.0317, + "step": 164720 + }, + { + "epoch": 0.09365, + "grad_norm": 0.04304290562868118, + "learning_rate": 4.132318494950241e-06, + "loss": 0.0327, + "step": 164730 + }, + { + "epoch": 0.0937, + "grad_norm": 0.03768237680196762, + "learning_rate": 4.130042390793254e-06, + "loss": 0.032, + "step": 164740 + }, + { + "epoch": 0.09375, + "grad_norm": 0.0378873385488987, + "learning_rate": 4.1277668572130695e-06, + "loss": 0.031, + "step": 164750 + }, + { + "epoch": 0.0938, + "grad_norm": 0.03945872187614441, + "learning_rate": 4.125491894271902e-06, + "loss": 0.0328, + "step": 164760 + }, + { + "epoch": 0.09385, + "grad_norm": 0.04004974663257599, + "learning_rate": 4.123217502031945e-06, + "loss": 0.0328, + "step": 164770 + }, + { + "epoch": 0.0939, + "grad_norm": 0.042620837688446045, + "learning_rate": 4.120943680555381e-06, + "loss": 0.0327, + "step": 164780 + }, + { + "epoch": 0.09395, + "grad_norm": 0.03914150223135948, + "learning_rate": 4.118670429904365e-06, + "loss": 0.0335, + "step": 164790 + }, + { + "epoch": 0.094, + "grad_norm": 0.03904701769351959, + "learning_rate": 4.11639775014106e-06, + "loss": 0.0345, + "step": 164800 + }, + { + "epoch": 0.09405, + "grad_norm": 0.03798987716436386, + "learning_rate": 4.114125641327593e-06, + "loss": 0.0338, + "step": 164810 + }, + { + "epoch": 0.0941, + "grad_norm": 0.03474852442741394, + "learning_rate": 4.111854103526083e-06, + "loss": 0.0338, + "step": 164820 + }, + { + "epoch": 0.09415, + "grad_norm": 0.03531309589743614, + "learning_rate": 4.109583136798636e-06, + "loss": 0.034, + "step": 164830 + }, + { + "epoch": 0.0942, + "grad_norm": 0.042505595833063126, + "learning_rate": 4.107312741207337e-06, + "loss": 0.0344, + "step": 164840 + }, + { + "epoch": 0.09425, + "grad_norm": 0.03951616212725639, + "learning_rate": 4.105042916814267e-06, + "loss": 0.0344, + "step": 164850 + }, + { + "epoch": 0.0943, + "grad_norm": 0.04498439282178879, + "learning_rate": 4.1027736636814615e-06, + "loss": 0.0337, + "step": 164860 + }, + { + "epoch": 0.09435, + "grad_norm": 0.05997840687632561, + "learning_rate": 4.100504981870975e-06, + "loss": 0.0355, + "step": 164870 + }, + { + "epoch": 0.0944, + "grad_norm": 0.0440804548561573, + "learning_rate": 4.098236871444836e-06, + "loss": 0.0325, + "step": 164880 + }, + { + "epoch": 0.09445, + "grad_norm": 0.04230072349309921, + "learning_rate": 4.095969332465047e-06, + "loss": 0.0326, + "step": 164890 + }, + { + "epoch": 0.0945, + "grad_norm": 0.04409044608473778, + "learning_rate": 4.093702364993607e-06, + "loss": 0.0338, + "step": 164900 + }, + { + "epoch": 0.09455, + "grad_norm": 0.03817106410861015, + "learning_rate": 4.091435969092481e-06, + "loss": 0.0336, + "step": 164910 + }, + { + "epoch": 0.0946, + "grad_norm": 0.03945619985461235, + "learning_rate": 4.089170144823648e-06, + "loss": 0.0332, + "step": 164920 + }, + { + "epoch": 0.09465, + "grad_norm": 0.0455905981361866, + "learning_rate": 4.0869048922490465e-06, + "loss": 0.0328, + "step": 164930 + }, + { + "epoch": 0.0947, + "grad_norm": 0.0419369712471962, + "learning_rate": 4.084640211430601e-06, + "loss": 0.0331, + "step": 164940 + }, + { + "epoch": 0.09475, + "grad_norm": 0.03600061684846878, + "learning_rate": 4.082376102430244e-06, + "loss": 0.033, + "step": 164950 + }, + { + "epoch": 0.0948, + "grad_norm": 0.045839328318834305, + "learning_rate": 4.08011256530986e-06, + "loss": 0.0344, + "step": 164960 + }, + { + "epoch": 0.09485, + "grad_norm": 0.04063069820404053, + "learning_rate": 4.077849600131342e-06, + "loss": 0.0334, + "step": 164970 + }, + { + "epoch": 0.0949, + "grad_norm": 0.04238218441605568, + "learning_rate": 4.075587206956558e-06, + "loss": 0.036, + "step": 164980 + }, + { + "epoch": 0.09495, + "grad_norm": 0.04314802214503288, + "learning_rate": 4.0733253858473545e-06, + "loss": 0.0333, + "step": 164990 + }, + { + "epoch": 0.095, + "grad_norm": 0.03600018471479416, + "learning_rate": 4.071064136865576e-06, + "loss": 0.0328, + "step": 165000 + }, + { + "epoch": 0.09505, + "grad_norm": 0.039877258241176605, + "learning_rate": 4.068803460073042e-06, + "loss": 0.0325, + "step": 165010 + }, + { + "epoch": 0.0951, + "grad_norm": 0.037514206022024155, + "learning_rate": 4.066543355531557e-06, + "loss": 0.0326, + "step": 165020 + }, + { + "epoch": 0.09515, + "grad_norm": 0.04040644317865372, + "learning_rate": 4.064283823302909e-06, + "loss": 0.0335, + "step": 165030 + }, + { + "epoch": 0.0952, + "grad_norm": 0.04327954351902008, + "learning_rate": 4.062024863448882e-06, + "loss": 0.038, + "step": 165040 + }, + { + "epoch": 0.09525, + "grad_norm": 0.04183371737599373, + "learning_rate": 4.059766476031221e-06, + "loss": 0.0336, + "step": 165050 + }, + { + "epoch": 0.0953, + "grad_norm": 0.03946535289287567, + "learning_rate": 4.057508661111686e-06, + "loss": 0.0343, + "step": 165060 + }, + { + "epoch": 0.09535, + "grad_norm": 0.041227441281080246, + "learning_rate": 4.055251418751993e-06, + "loss": 0.0345, + "step": 165070 + }, + { + "epoch": 0.0954, + "grad_norm": 0.044941313564777374, + "learning_rate": 4.052994749013855e-06, + "loss": 0.0335, + "step": 165080 + }, + { + "epoch": 0.09545, + "grad_norm": 0.06171907112002373, + "learning_rate": 4.0507386519589766e-06, + "loss": 0.0336, + "step": 165090 + }, + { + "epoch": 0.0955, + "grad_norm": 0.05504198744893074, + "learning_rate": 4.048483127649033e-06, + "loss": 0.0379, + "step": 165100 + }, + { + "epoch": 0.09555, + "grad_norm": 0.04573937878012657, + "learning_rate": 4.046228176145689e-06, + "loss": 0.0339, + "step": 165110 + }, + { + "epoch": 0.0956, + "grad_norm": 0.044137127697467804, + "learning_rate": 4.043973797510589e-06, + "loss": 0.033, + "step": 165120 + }, + { + "epoch": 0.09565, + "grad_norm": 0.043332889676094055, + "learning_rate": 4.041719991805371e-06, + "loss": 0.0365, + "step": 165130 + }, + { + "epoch": 0.0957, + "grad_norm": 0.06066054478287697, + "learning_rate": 4.039466759091667e-06, + "loss": 0.034, + "step": 165140 + }, + { + "epoch": 0.09575, + "grad_norm": 0.04395994171500206, + "learning_rate": 4.037214099431058e-06, + "loss": 0.0329, + "step": 165150 + }, + { + "epoch": 0.0958, + "grad_norm": 0.03807613253593445, + "learning_rate": 4.034962012885144e-06, + "loss": 0.0336, + "step": 165160 + }, + { + "epoch": 0.09585, + "grad_norm": 0.038211889564991, + "learning_rate": 4.032710499515488e-06, + "loss": 0.0341, + "step": 165170 + }, + { + "epoch": 0.0959, + "grad_norm": 0.036341212689876556, + "learning_rate": 4.0304595593836536e-06, + "loss": 0.0331, + "step": 165180 + }, + { + "epoch": 0.09595, + "grad_norm": 0.035370923578739166, + "learning_rate": 4.02820919255118e-06, + "loss": 0.0329, + "step": 165190 + }, + { + "epoch": 0.096, + "grad_norm": 0.038010936230421066, + "learning_rate": 4.0259593990795795e-06, + "loss": 0.0329, + "step": 165200 + }, + { + "epoch": 0.09605, + "grad_norm": 0.03992505371570587, + "learning_rate": 4.023710179030377e-06, + "loss": 0.034, + "step": 165210 + }, + { + "epoch": 0.0961, + "grad_norm": 0.049580223858356476, + "learning_rate": 4.021461532465057e-06, + "loss": 0.0352, + "step": 165220 + }, + { + "epoch": 0.09615, + "grad_norm": 0.04979586601257324, + "learning_rate": 4.019213459445098e-06, + "loss": 0.0331, + "step": 165230 + }, + { + "epoch": 0.0962, + "grad_norm": 0.05107463523745537, + "learning_rate": 4.016965960031954e-06, + "loss": 0.0327, + "step": 165240 + }, + { + "epoch": 0.09625, + "grad_norm": 0.04189491271972656, + "learning_rate": 4.014719034287079e-06, + "loss": 0.0324, + "step": 165250 + }, + { + "epoch": 0.0963, + "grad_norm": 0.048803169280290604, + "learning_rate": 4.012472682271906e-06, + "loss": 0.0329, + "step": 165260 + }, + { + "epoch": 0.09635, + "grad_norm": 0.04566030949354172, + "learning_rate": 4.0102269040478475e-06, + "loss": 0.0339, + "step": 165270 + }, + { + "epoch": 0.0964, + "grad_norm": 0.04436716064810753, + "learning_rate": 4.0079816996763e-06, + "loss": 0.0341, + "step": 165280 + }, + { + "epoch": 0.09645, + "grad_norm": 0.037450190633535385, + "learning_rate": 4.005737069218637e-06, + "loss": 0.0321, + "step": 165290 + }, + { + "epoch": 0.0965, + "grad_norm": 0.041611459106206894, + "learning_rate": 4.003493012736246e-06, + "loss": 0.0329, + "step": 165300 + }, + { + "epoch": 0.09655, + "grad_norm": 0.038101986050605774, + "learning_rate": 4.001249530290466e-06, + "loss": 0.0324, + "step": 165310 + }, + { + "epoch": 0.0966, + "grad_norm": 0.04389223828911781, + "learning_rate": 3.999006621942628e-06, + "loss": 0.0321, + "step": 165320 + }, + { + "epoch": 0.09665, + "grad_norm": 0.048132430762052536, + "learning_rate": 3.996764287754065e-06, + "loss": 0.0355, + "step": 165330 + }, + { + "epoch": 0.0967, + "grad_norm": 0.03650198504328728, + "learning_rate": 3.994522527786071e-06, + "loss": 0.032, + "step": 165340 + }, + { + "epoch": 0.09675, + "grad_norm": 0.03952125832438469, + "learning_rate": 3.992281342099952e-06, + "loss": 0.0339, + "step": 165350 + }, + { + "epoch": 0.0968, + "grad_norm": 0.044009458273649216, + "learning_rate": 3.990040730756955e-06, + "loss": 0.034, + "step": 165360 + }, + { + "epoch": 0.09685, + "grad_norm": 0.04283880069851875, + "learning_rate": 3.9878006938183525e-06, + "loss": 0.0331, + "step": 165370 + }, + { + "epoch": 0.0969, + "grad_norm": 0.04398813471198082, + "learning_rate": 3.985561231345391e-06, + "loss": 0.0327, + "step": 165380 + }, + { + "epoch": 0.09695, + "grad_norm": 0.041674233973026276, + "learning_rate": 3.983322343399293e-06, + "loss": 0.0329, + "step": 165390 + }, + { + "epoch": 0.097, + "grad_norm": 0.03643820062279701, + "learning_rate": 3.981084030041263e-06, + "loss": 0.032, + "step": 165400 + }, + { + "epoch": 0.09705, + "grad_norm": 0.045983344316482544, + "learning_rate": 3.9788462913324945e-06, + "loss": 0.0323, + "step": 165410 + }, + { + "epoch": 0.0971, + "grad_norm": 0.042985949665308, + "learning_rate": 3.976609127334177e-06, + "loss": 0.0317, + "step": 165420 + }, + { + "epoch": 0.09715, + "grad_norm": 0.04276910051703453, + "learning_rate": 3.974372538107468e-06, + "loss": 0.0332, + "step": 165430 + }, + { + "epoch": 0.0972, + "grad_norm": 0.037295885384082794, + "learning_rate": 3.97213652371351e-06, + "loss": 0.0324, + "step": 165440 + }, + { + "epoch": 0.09725, + "grad_norm": 0.042068447917699814, + "learning_rate": 3.9699010842134455e-06, + "loss": 0.0331, + "step": 165450 + }, + { + "epoch": 0.0973, + "grad_norm": 0.042526569217443466, + "learning_rate": 3.967666219668376e-06, + "loss": 0.0345, + "step": 165460 + }, + { + "epoch": 0.09735, + "grad_norm": 0.04848969727754593, + "learning_rate": 3.965431930139418e-06, + "loss": 0.0336, + "step": 165470 + }, + { + "epoch": 0.0974, + "grad_norm": 0.03859826177358627, + "learning_rate": 3.96319821568765e-06, + "loss": 0.0324, + "step": 165480 + }, + { + "epoch": 0.09745, + "grad_norm": 0.04887937381863594, + "learning_rate": 3.960965076374138e-06, + "loss": 0.0331, + "step": 165490 + }, + { + "epoch": 0.0975, + "grad_norm": 0.04081129655241966, + "learning_rate": 3.9587325122599325e-06, + "loss": 0.0326, + "step": 165500 + }, + { + "epoch": 0.09755, + "grad_norm": 0.03952052816748619, + "learning_rate": 3.956500523406079e-06, + "loss": 0.0355, + "step": 165510 + }, + { + "epoch": 0.0976, + "grad_norm": 0.04575337842106819, + "learning_rate": 3.9542691098735985e-06, + "loss": 0.0327, + "step": 165520 + }, + { + "epoch": 0.09765, + "grad_norm": 0.04624428600072861, + "learning_rate": 3.952038271723485e-06, + "loss": 0.032, + "step": 165530 + }, + { + "epoch": 0.0977, + "grad_norm": 0.03500010445713997, + "learning_rate": 3.949808009016745e-06, + "loss": 0.0315, + "step": 165540 + }, + { + "epoch": 0.09775, + "grad_norm": 0.03959944471716881, + "learning_rate": 3.947578321814341e-06, + "loss": 0.0327, + "step": 165550 + }, + { + "epoch": 0.0978, + "grad_norm": 0.04318464919924736, + "learning_rate": 3.945349210177249e-06, + "loss": 0.032, + "step": 165560 + }, + { + "epoch": 0.09785, + "grad_norm": 0.039310380816459656, + "learning_rate": 3.943120674166384e-06, + "loss": 0.0328, + "step": 165570 + }, + { + "epoch": 0.0979, + "grad_norm": 0.048084914684295654, + "learning_rate": 3.940892713842692e-06, + "loss": 0.0328, + "step": 165580 + }, + { + "epoch": 0.09795, + "grad_norm": 0.05158539488911629, + "learning_rate": 3.938665329267088e-06, + "loss": 0.0325, + "step": 165590 + }, + { + "epoch": 0.098, + "grad_norm": 0.047085583209991455, + "learning_rate": 3.93643852050046e-06, + "loss": 0.0342, + "step": 165600 + }, + { + "epoch": 0.09805, + "grad_norm": 0.04157368093729019, + "learning_rate": 3.9342122876036894e-06, + "loss": 0.0322, + "step": 165610 + }, + { + "epoch": 0.0981, + "grad_norm": 0.046697210520505905, + "learning_rate": 3.931986630637635e-06, + "loss": 0.0346, + "step": 165620 + }, + { + "epoch": 0.09815, + "grad_norm": 0.0374649278819561, + "learning_rate": 3.9297615496631525e-06, + "loss": 0.0343, + "step": 165630 + }, + { + "epoch": 0.0982, + "grad_norm": 0.042161110788583755, + "learning_rate": 3.927537044741086e-06, + "loss": 0.0334, + "step": 165640 + }, + { + "epoch": 0.09825, + "grad_norm": 0.0364234521985054, + "learning_rate": 3.925313115932227e-06, + "loss": 0.0333, + "step": 165650 + }, + { + "epoch": 0.0983, + "grad_norm": 0.04255906492471695, + "learning_rate": 3.923089763297397e-06, + "loss": 0.0343, + "step": 165660 + }, + { + "epoch": 0.09835, + "grad_norm": 0.03991887718439102, + "learning_rate": 3.920866986897367e-06, + "loss": 0.0336, + "step": 165670 + }, + { + "epoch": 0.0984, + "grad_norm": 0.041006337851285934, + "learning_rate": 3.918644786792922e-06, + "loss": 0.0336, + "step": 165680 + }, + { + "epoch": 0.09845, + "grad_norm": 0.0471954271197319, + "learning_rate": 3.916423163044808e-06, + "loss": 0.033, + "step": 165690 + }, + { + "epoch": 0.0985, + "grad_norm": 0.03990200161933899, + "learning_rate": 3.914202115713756e-06, + "loss": 0.034, + "step": 165700 + }, + { + "epoch": 0.09855, + "grad_norm": 0.04307753965258598, + "learning_rate": 3.911981644860505e-06, + "loss": 0.0346, + "step": 165710 + }, + { + "epoch": 0.0986, + "grad_norm": 0.03951175883412361, + "learning_rate": 3.909761750545754e-06, + "loss": 0.0348, + "step": 165720 + }, + { + "epoch": 0.09865, + "grad_norm": 0.041647281497716904, + "learning_rate": 3.9075424328301914e-06, + "loss": 0.0346, + "step": 165730 + }, + { + "epoch": 0.0987, + "grad_norm": 0.04150126501917839, + "learning_rate": 3.90532369177449e-06, + "loss": 0.0322, + "step": 165740 + }, + { + "epoch": 0.09875, + "grad_norm": 0.04288269206881523, + "learning_rate": 3.903105527439319e-06, + "loss": 0.0326, + "step": 165750 + }, + { + "epoch": 0.0988, + "grad_norm": 0.038329627364873886, + "learning_rate": 3.900887939885312e-06, + "loss": 0.0316, + "step": 165760 + }, + { + "epoch": 0.09885, + "grad_norm": 0.03365486487746239, + "learning_rate": 3.898670929173107e-06, + "loss": 0.0316, + "step": 165770 + }, + { + "epoch": 0.0989, + "grad_norm": 0.04152948781847954, + "learning_rate": 3.896454495363313e-06, + "loss": 0.0327, + "step": 165780 + }, + { + "epoch": 0.09895, + "grad_norm": 0.036098282784223557, + "learning_rate": 3.894238638516518e-06, + "loss": 0.0335, + "step": 165790 + }, + { + "epoch": 0.099, + "grad_norm": 0.0432005412876606, + "learning_rate": 3.892023358693317e-06, + "loss": 0.0344, + "step": 165800 + }, + { + "epoch": 0.09905, + "grad_norm": 0.036867767572402954, + "learning_rate": 3.889808655954263e-06, + "loss": 0.0334, + "step": 165810 + }, + { + "epoch": 0.0991, + "grad_norm": 0.03740301728248596, + "learning_rate": 3.887594530359909e-06, + "loss": 0.0328, + "step": 165820 + }, + { + "epoch": 0.09915, + "grad_norm": 0.03603968769311905, + "learning_rate": 3.885380981970793e-06, + "loss": 0.0324, + "step": 165830 + }, + { + "epoch": 0.0992, + "grad_norm": 0.03937996178865433, + "learning_rate": 3.883168010847421e-06, + "loss": 0.0343, + "step": 165840 + }, + { + "epoch": 0.09925, + "grad_norm": 0.038710664957761765, + "learning_rate": 3.880955617050316e-06, + "loss": 0.0332, + "step": 165850 + }, + { + "epoch": 0.0993, + "grad_norm": 0.04181608557701111, + "learning_rate": 3.878743800639939e-06, + "loss": 0.0327, + "step": 165860 + }, + { + "epoch": 0.09935, + "grad_norm": 0.039431218057870865, + "learning_rate": 3.876532561676777e-06, + "loss": 0.0338, + "step": 165870 + }, + { + "epoch": 0.0994, + "grad_norm": 0.044270072132349014, + "learning_rate": 3.874321900221273e-06, + "loss": 0.0326, + "step": 165880 + }, + { + "epoch": 0.09945, + "grad_norm": 0.0367770753800869, + "learning_rate": 3.872111816333876e-06, + "loss": 0.0324, + "step": 165890 + }, + { + "epoch": 0.0995, + "grad_norm": 0.04619767516851425, + "learning_rate": 3.869902310075005e-06, + "loss": 0.0328, + "step": 165900 + }, + { + "epoch": 0.09955, + "grad_norm": 0.041448675096035004, + "learning_rate": 3.867693381505064e-06, + "loss": 0.0326, + "step": 165910 + }, + { + "epoch": 0.0996, + "grad_norm": 0.042945943772792816, + "learning_rate": 3.865485030684449e-06, + "loss": 0.0317, + "step": 165920 + }, + { + "epoch": 0.09965, + "grad_norm": 0.039436765015125275, + "learning_rate": 3.863277257673533e-06, + "loss": 0.0326, + "step": 165930 + }, + { + "epoch": 0.0997, + "grad_norm": 0.04146011918783188, + "learning_rate": 3.861070062532679e-06, + "loss": 0.0317, + "step": 165940 + }, + { + "epoch": 0.09975, + "grad_norm": 0.04633131995797157, + "learning_rate": 3.858863445322222e-06, + "loss": 0.0316, + "step": 165950 + }, + { + "epoch": 0.0998, + "grad_norm": 0.03417607769370079, + "learning_rate": 3.856657406102496e-06, + "loss": 0.032, + "step": 165960 + }, + { + "epoch": 0.09985, + "grad_norm": 0.04359418526291847, + "learning_rate": 3.854451944933818e-06, + "loss": 0.033, + "step": 165970 + }, + { + "epoch": 0.0999, + "grad_norm": 0.04120621457695961, + "learning_rate": 3.85224706187648e-06, + "loss": 0.0327, + "step": 165980 + }, + { + "epoch": 0.09995, + "grad_norm": 0.0409977026283741, + "learning_rate": 3.850042756990763e-06, + "loss": 0.0334, + "step": 165990 + }, + { + "epoch": 0.1, + "grad_norm": 0.03260594978928566, + "learning_rate": 3.847839030336925e-06, + "loss": 0.0315, + "step": 166000 + }, + { + "epoch": 0.10005, + "grad_norm": 0.03696081042289734, + "learning_rate": 3.845635881975226e-06, + "loss": 0.033, + "step": 166010 + }, + { + "epoch": 0.1001, + "grad_norm": 0.03451487794518471, + "learning_rate": 3.843433311965897e-06, + "loss": 0.0317, + "step": 166020 + }, + { + "epoch": 0.10015, + "grad_norm": 0.03411698341369629, + "learning_rate": 3.841231320369146e-06, + "loss": 0.0316, + "step": 166030 + }, + { + "epoch": 0.1002, + "grad_norm": 0.03840222954750061, + "learning_rate": 3.8390299072451866e-06, + "loss": 0.0333, + "step": 166040 + }, + { + "epoch": 0.10025, + "grad_norm": 0.04946882650256157, + "learning_rate": 3.836829072654196e-06, + "loss": 0.0332, + "step": 166050 + }, + { + "epoch": 0.1003, + "grad_norm": 0.050360143184661865, + "learning_rate": 3.834628816656357e-06, + "loss": 0.0337, + "step": 166060 + }, + { + "epoch": 0.10035, + "grad_norm": 0.04284515231847763, + "learning_rate": 3.832429139311805e-06, + "loss": 0.0329, + "step": 166070 + }, + { + "epoch": 0.1004, + "grad_norm": 0.03992254287004471, + "learning_rate": 3.830230040680688e-06, + "loss": 0.0356, + "step": 166080 + }, + { + "epoch": 0.10045, + "grad_norm": 0.044528309255838394, + "learning_rate": 3.828031520823136e-06, + "loss": 0.0326, + "step": 166090 + }, + { + "epoch": 0.1005, + "grad_norm": 0.04010489583015442, + "learning_rate": 3.825833579799246e-06, + "loss": 0.0335, + "step": 166100 + }, + { + "epoch": 0.10055, + "grad_norm": 0.04283710569143295, + "learning_rate": 3.823636217669111e-06, + "loss": 0.0328, + "step": 166110 + }, + { + "epoch": 0.1006, + "grad_norm": 0.0381898432970047, + "learning_rate": 3.821439434492802e-06, + "loss": 0.032, + "step": 166120 + }, + { + "epoch": 0.10065, + "grad_norm": 0.0465182326734066, + "learning_rate": 3.819243230330385e-06, + "loss": 0.0333, + "step": 166130 + }, + { + "epoch": 0.1007, + "grad_norm": 0.04123750701546669, + "learning_rate": 3.817047605241905e-06, + "loss": 0.0327, + "step": 166140 + }, + { + "epoch": 0.10075, + "grad_norm": 0.042871516197919846, + "learning_rate": 3.814852559287377e-06, + "loss": 0.0318, + "step": 166150 + }, + { + "epoch": 0.1008, + "grad_norm": 0.05040085315704346, + "learning_rate": 3.8126580925268273e-06, + "loss": 0.0323, + "step": 166160 + }, + { + "epoch": 0.10085, + "grad_norm": 0.05391421169042587, + "learning_rate": 3.8104642050202393e-06, + "loss": 0.0334, + "step": 166170 + }, + { + "epoch": 0.1009, + "grad_norm": 0.04254022613167763, + "learning_rate": 3.8082708968276066e-06, + "loss": 0.0328, + "step": 166180 + }, + { + "epoch": 0.10095, + "grad_norm": 0.05270838364958763, + "learning_rate": 3.8060781680088865e-06, + "loss": 0.0339, + "step": 166190 + }, + { + "epoch": 0.101, + "grad_norm": 0.039037276059389114, + "learning_rate": 3.8038860186240198e-06, + "loss": 0.0323, + "step": 166200 + }, + { + "epoch": 0.10105, + "grad_norm": 0.03755092993378639, + "learning_rate": 3.801694448732954e-06, + "loss": 0.0329, + "step": 166210 + }, + { + "epoch": 0.1011, + "grad_norm": 0.037635862827301025, + "learning_rate": 3.799503458395598e-06, + "loss": 0.0328, + "step": 166220 + }, + { + "epoch": 0.10115, + "grad_norm": 0.03773429989814758, + "learning_rate": 3.7973130476718492e-06, + "loss": 0.0325, + "step": 166230 + }, + { + "epoch": 0.1012, + "grad_norm": 0.04328387975692749, + "learning_rate": 3.7951232166215933e-06, + "loss": 0.0341, + "step": 166240 + }, + { + "epoch": 0.10125, + "grad_norm": 0.04198687896132469, + "learning_rate": 3.7929339653047095e-06, + "loss": 0.0335, + "step": 166250 + }, + { + "epoch": 0.1013, + "grad_norm": 0.04273358732461929, + "learning_rate": 3.7907452937810366e-06, + "loss": 0.0343, + "step": 166260 + }, + { + "epoch": 0.10135, + "grad_norm": 0.0433444119989872, + "learning_rate": 3.788557202110424e-06, + "loss": 0.0326, + "step": 166270 + }, + { + "epoch": 0.1014, + "grad_norm": 0.039447490125894547, + "learning_rate": 3.7863696903526895e-06, + "loss": 0.0338, + "step": 166280 + }, + { + "epoch": 0.10145, + "grad_norm": 0.049982041120529175, + "learning_rate": 3.7841827585676337e-06, + "loss": 0.0336, + "step": 166290 + }, + { + "epoch": 0.1015, + "grad_norm": 0.047276243567466736, + "learning_rate": 3.7819964068150556e-06, + "loss": 0.033, + "step": 166300 + }, + { + "epoch": 0.10155, + "grad_norm": 0.04317307844758034, + "learning_rate": 3.7798106351547236e-06, + "loss": 0.0329, + "step": 166310 + }, + { + "epoch": 0.1016, + "grad_norm": 0.05367967113852501, + "learning_rate": 3.7776254436463985e-06, + "loss": 0.0339, + "step": 166320 + }, + { + "epoch": 0.10165, + "grad_norm": 0.05572677403688431, + "learning_rate": 3.775440832349814e-06, + "loss": 0.0334, + "step": 166330 + }, + { + "epoch": 0.1017, + "grad_norm": 0.05006201192736626, + "learning_rate": 3.773256801324704e-06, + "loss": 0.0323, + "step": 166340 + }, + { + "epoch": 0.10175, + "grad_norm": 0.04554805904626846, + "learning_rate": 3.7710733506307883e-06, + "loss": 0.0324, + "step": 166350 + }, + { + "epoch": 0.1018, + "grad_norm": 0.04822637140750885, + "learning_rate": 3.7688904803277414e-06, + "loss": 0.0328, + "step": 166360 + }, + { + "epoch": 0.10185, + "grad_norm": 0.05482174828648567, + "learning_rate": 3.7667081904752597e-06, + "loss": 0.0331, + "step": 166370 + }, + { + "epoch": 0.1019, + "grad_norm": 0.038746532052755356, + "learning_rate": 3.7645264811329934e-06, + "loss": 0.034, + "step": 166380 + }, + { + "epoch": 0.10195, + "grad_norm": 0.04358692839741707, + "learning_rate": 3.7623453523605994e-06, + "loss": 0.032, + "step": 166390 + }, + { + "epoch": 0.102, + "grad_norm": 0.04856157302856445, + "learning_rate": 3.7601648042177055e-06, + "loss": 0.0343, + "step": 166400 + }, + { + "epoch": 0.10205, + "grad_norm": 0.042550262063741684, + "learning_rate": 3.75798483676392e-06, + "loss": 0.0333, + "step": 166410 + }, + { + "epoch": 0.1021, + "grad_norm": 0.05196889862418175, + "learning_rate": 3.755805450058855e-06, + "loss": 0.0333, + "step": 166420 + }, + { + "epoch": 0.10215, + "grad_norm": 0.04230838268995285, + "learning_rate": 3.753626644162089e-06, + "loss": 0.0324, + "step": 166430 + }, + { + "epoch": 0.1022, + "grad_norm": 0.0412224642932415, + "learning_rate": 3.7514484191331885e-06, + "loss": 0.0325, + "step": 166440 + }, + { + "epoch": 0.10225, + "grad_norm": 0.04116053506731987, + "learning_rate": 3.749270775031699e-06, + "loss": 0.0313, + "step": 166450 + }, + { + "epoch": 0.1023, + "grad_norm": 0.036984339356422424, + "learning_rate": 3.747093711917163e-06, + "loss": 0.0318, + "step": 166460 + }, + { + "epoch": 0.10235, + "grad_norm": 0.03305768221616745, + "learning_rate": 3.744917229849107e-06, + "loss": 0.0312, + "step": 166470 + }, + { + "epoch": 0.1024, + "grad_norm": 0.04337235167622566, + "learning_rate": 3.7427413288870283e-06, + "loss": 0.0331, + "step": 166480 + }, + { + "epoch": 0.10245, + "grad_norm": 0.034650687128305435, + "learning_rate": 3.7405660090904153e-06, + "loss": 0.0328, + "step": 166490 + }, + { + "epoch": 0.1025, + "grad_norm": 0.0390767827630043, + "learning_rate": 3.738391270518735e-06, + "loss": 0.0314, + "step": 166500 + }, + { + "epoch": 0.10255, + "grad_norm": 0.03500323370099068, + "learning_rate": 3.7362171132314548e-06, + "loss": 0.0313, + "step": 166510 + }, + { + "epoch": 0.1026, + "grad_norm": 0.04510362446308136, + "learning_rate": 3.7340435372880124e-06, + "loss": 0.0322, + "step": 166520 + }, + { + "epoch": 0.10265, + "grad_norm": 0.044195231050252914, + "learning_rate": 3.731870542747823e-06, + "loss": 0.0322, + "step": 166530 + }, + { + "epoch": 0.1027, + "grad_norm": 0.04079239070415497, + "learning_rate": 3.7296981296703088e-06, + "loss": 0.0323, + "step": 166540 + }, + { + "epoch": 0.10275, + "grad_norm": 0.06017296016216278, + "learning_rate": 3.727526298114853e-06, + "loss": 0.0323, + "step": 166550 + }, + { + "epoch": 0.1028, + "grad_norm": 0.04101934656500816, + "learning_rate": 3.7253550481408467e-06, + "loss": 0.0331, + "step": 166560 + }, + { + "epoch": 0.10285, + "grad_norm": 0.034680336713790894, + "learning_rate": 3.723184379807629e-06, + "loss": 0.0312, + "step": 166570 + }, + { + "epoch": 0.1029, + "grad_norm": 0.045201726257801056, + "learning_rate": 3.7210142931745575e-06, + "loss": 0.0326, + "step": 166580 + }, + { + "epoch": 0.10295, + "grad_norm": 0.037214379757642746, + "learning_rate": 3.7188447883009653e-06, + "loss": 0.0306, + "step": 166590 + }, + { + "epoch": 0.103, + "grad_norm": 0.03503154218196869, + "learning_rate": 3.716675865246164e-06, + "loss": 0.0324, + "step": 166600 + }, + { + "epoch": 0.10305, + "grad_norm": 0.04241985082626343, + "learning_rate": 3.7145075240694465e-06, + "loss": 0.0329, + "step": 166610 + }, + { + "epoch": 0.1031, + "grad_norm": 0.037117183208465576, + "learning_rate": 3.7123397648300917e-06, + "loss": 0.0328, + "step": 166620 + }, + { + "epoch": 0.10315, + "grad_norm": 0.06032340228557587, + "learning_rate": 3.7101725875873765e-06, + "loss": 0.0337, + "step": 166630 + }, + { + "epoch": 0.1032, + "grad_norm": 0.04760419949889183, + "learning_rate": 3.7080059924005454e-06, + "loss": 0.032, + "step": 166640 + }, + { + "epoch": 0.10325, + "grad_norm": 0.04192169010639191, + "learning_rate": 3.7058399793288263e-06, + "loss": 0.0334, + "step": 166650 + }, + { + "epoch": 0.1033, + "grad_norm": 0.0428856760263443, + "learning_rate": 3.703674548431446e-06, + "loss": 0.0321, + "step": 166660 + }, + { + "epoch": 0.10335, + "grad_norm": 0.03872525319457054, + "learning_rate": 3.7015096997675967e-06, + "loss": 0.0325, + "step": 166670 + }, + { + "epoch": 0.1034, + "grad_norm": 0.04189387336373329, + "learning_rate": 3.699345433396478e-06, + "loss": 0.032, + "step": 166680 + }, + { + "epoch": 0.10345, + "grad_norm": 0.03546981140971184, + "learning_rate": 3.6971817493772517e-06, + "loss": 0.0335, + "step": 166690 + }, + { + "epoch": 0.1035, + "grad_norm": 0.03853558376431465, + "learning_rate": 3.6950186477690748e-06, + "loss": 0.0317, + "step": 166700 + }, + { + "epoch": 0.10355, + "grad_norm": 0.03602823242545128, + "learning_rate": 3.692856128631078e-06, + "loss": 0.0323, + "step": 166710 + }, + { + "epoch": 0.1036, + "grad_norm": 0.0376969650387764, + "learning_rate": 3.6906941920223953e-06, + "loss": 0.0313, + "step": 166720 + }, + { + "epoch": 0.10365, + "grad_norm": 0.03826133906841278, + "learning_rate": 3.688532838002129e-06, + "loss": 0.0318, + "step": 166730 + }, + { + "epoch": 0.1037, + "grad_norm": 0.045628514140844345, + "learning_rate": 3.6863720666293595e-06, + "loss": 0.0333, + "step": 166740 + }, + { + "epoch": 0.10375, + "grad_norm": 0.03853847086429596, + "learning_rate": 3.6842118779631785e-06, + "loss": 0.0326, + "step": 166750 + }, + { + "epoch": 0.1038, + "grad_norm": 0.036480020731687546, + "learning_rate": 3.6820522720626304e-06, + "loss": 0.0327, + "step": 166760 + }, + { + "epoch": 0.10385, + "grad_norm": 0.03850627318024635, + "learning_rate": 3.679893248986779e-06, + "loss": 0.0328, + "step": 166770 + }, + { + "epoch": 0.1039, + "grad_norm": 0.04077065363526344, + "learning_rate": 3.6777348087946224e-06, + "loss": 0.0325, + "step": 166780 + }, + { + "epoch": 0.10395, + "grad_norm": 0.04662526771426201, + "learning_rate": 3.6755769515451842e-06, + "loss": 0.0333, + "step": 166790 + }, + { + "epoch": 0.104, + "grad_norm": 0.03916983678936958, + "learning_rate": 3.673419677297468e-06, + "loss": 0.0336, + "step": 166800 + }, + { + "epoch": 0.10405, + "grad_norm": 0.039329130202531815, + "learning_rate": 3.6712629861104464e-06, + "loss": 0.0325, + "step": 166810 + }, + { + "epoch": 0.1041, + "grad_norm": 0.04693764075636864, + "learning_rate": 3.6691068780430825e-06, + "loss": 0.034, + "step": 166820 + }, + { + "epoch": 0.10415, + "grad_norm": 0.03850613906979561, + "learning_rate": 3.666951353154316e-06, + "loss": 0.0327, + "step": 166830 + }, + { + "epoch": 0.1042, + "grad_norm": 0.04074549302458763, + "learning_rate": 3.6647964115030853e-06, + "loss": 0.0328, + "step": 166840 + }, + { + "epoch": 0.10425, + "grad_norm": 0.04204528406262398, + "learning_rate": 3.6626420531483187e-06, + "loss": 0.0333, + "step": 166850 + }, + { + "epoch": 0.1043, + "grad_norm": 0.0419759526848793, + "learning_rate": 3.660488278148888e-06, + "loss": 0.0325, + "step": 166860 + }, + { + "epoch": 0.10435, + "grad_norm": 0.03967071697115898, + "learning_rate": 3.658335086563697e-06, + "loss": 0.0324, + "step": 166870 + }, + { + "epoch": 0.1044, + "grad_norm": 0.03214019164443016, + "learning_rate": 3.656182478451603e-06, + "loss": 0.0323, + "step": 166880 + }, + { + "epoch": 0.10445, + "grad_norm": 0.03751469403505325, + "learning_rate": 3.6540304538714655e-06, + "loss": 0.0317, + "step": 166890 + }, + { + "epoch": 0.1045, + "grad_norm": 0.038455113768577576, + "learning_rate": 3.6518790128821173e-06, + "loss": 0.0321, + "step": 166900 + }, + { + "epoch": 0.10455, + "grad_norm": 0.04511810466647148, + "learning_rate": 3.64972815554237e-06, + "loss": 0.0329, + "step": 166910 + }, + { + "epoch": 0.1046, + "grad_norm": 0.04041874781250954, + "learning_rate": 3.647577881911041e-06, + "loss": 0.0337, + "step": 166920 + }, + { + "epoch": 0.10465, + "grad_norm": 0.032743386924266815, + "learning_rate": 3.6454281920469126e-06, + "loss": 0.035, + "step": 166930 + }, + { + "epoch": 0.1047, + "grad_norm": 0.045992206782102585, + "learning_rate": 3.6432790860087525e-06, + "loss": 0.0324, + "step": 166940 + }, + { + "epoch": 0.10475, + "grad_norm": 0.07870320975780487, + "learning_rate": 3.6411305638553133e-06, + "loss": 0.0363, + "step": 166950 + }, + { + "epoch": 0.1048, + "grad_norm": 0.043390434235334396, + "learning_rate": 3.6389826256453457e-06, + "loss": 0.0345, + "step": 166960 + }, + { + "epoch": 0.10485, + "grad_norm": 0.04987112060189247, + "learning_rate": 3.636835271437561e-06, + "loss": 0.0332, + "step": 166970 + }, + { + "epoch": 0.1049, + "grad_norm": 0.048528384417295456, + "learning_rate": 3.634688501290684e-06, + "loss": 0.0364, + "step": 166980 + }, + { + "epoch": 0.10495, + "grad_norm": 0.05022319406270981, + "learning_rate": 3.632542315263393e-06, + "loss": 0.0315, + "step": 166990 + }, + { + "epoch": 0.105, + "grad_norm": 0.04441463574767113, + "learning_rate": 3.6303967134143637e-06, + "loss": 0.0348, + "step": 167000 + }, + { + "epoch": 0.10505, + "grad_norm": 0.05608077347278595, + "learning_rate": 3.628251695802265e-06, + "loss": 0.0337, + "step": 167010 + }, + { + "epoch": 0.1051, + "grad_norm": 0.04835597053170204, + "learning_rate": 3.6261072624857367e-06, + "loss": 0.033, + "step": 167020 + }, + { + "epoch": 0.10515, + "grad_norm": 0.03941258788108826, + "learning_rate": 3.6239634135234012e-06, + "loss": 0.0323, + "step": 167030 + }, + { + "epoch": 0.1052, + "grad_norm": 0.052929267287254333, + "learning_rate": 3.6218201489738783e-06, + "loss": 0.0332, + "step": 167040 + }, + { + "epoch": 0.10525, + "grad_norm": 0.032524507492780685, + "learning_rate": 3.6196774688957575e-06, + "loss": 0.0327, + "step": 167050 + }, + { + "epoch": 0.1053, + "grad_norm": 0.04127586632966995, + "learning_rate": 3.617535373347636e-06, + "loss": 0.0318, + "step": 167060 + }, + { + "epoch": 0.10535, + "grad_norm": 0.04529045522212982, + "learning_rate": 3.61539386238805e-06, + "loss": 0.0331, + "step": 167070 + }, + { + "epoch": 0.1054, + "grad_norm": 0.038415782153606415, + "learning_rate": 3.6132529360755674e-06, + "loss": 0.0311, + "step": 167080 + }, + { + "epoch": 0.10545, + "grad_norm": 0.03427970036864281, + "learning_rate": 3.61111259446871e-06, + "loss": 0.0315, + "step": 167090 + }, + { + "epoch": 0.1055, + "grad_norm": 0.033244382590055466, + "learning_rate": 3.608972837626004e-06, + "loss": 0.0315, + "step": 167100 + }, + { + "epoch": 0.10555, + "grad_norm": 0.038616396486759186, + "learning_rate": 3.6068336656059466e-06, + "loss": 0.032, + "step": 167110 + }, + { + "epoch": 0.1056, + "grad_norm": 0.039937783032655716, + "learning_rate": 3.6046950784670105e-06, + "loss": 0.0322, + "step": 167120 + }, + { + "epoch": 0.10565, + "grad_norm": 0.04301230236887932, + "learning_rate": 3.602557076267682e-06, + "loss": 0.0322, + "step": 167130 + }, + { + "epoch": 0.1057, + "grad_norm": 0.035067368298769, + "learning_rate": 3.6004196590664037e-06, + "loss": 0.0309, + "step": 167140 + }, + { + "epoch": 0.10575, + "grad_norm": 0.039692506194114685, + "learning_rate": 3.5982828269216117e-06, + "loss": 0.0325, + "step": 167150 + }, + { + "epoch": 0.1058, + "grad_norm": 0.045233264565467834, + "learning_rate": 3.596146579891721e-06, + "loss": 0.0328, + "step": 167160 + }, + { + "epoch": 0.10585, + "grad_norm": 0.04296007379889488, + "learning_rate": 3.594010918035143e-06, + "loss": 0.0328, + "step": 167170 + }, + { + "epoch": 0.1059, + "grad_norm": 0.04041620343923569, + "learning_rate": 3.5918758414102695e-06, + "loss": 0.032, + "step": 167180 + }, + { + "epoch": 0.10595, + "grad_norm": 0.041898369789123535, + "learning_rate": 3.589741350075465e-06, + "loss": 0.0313, + "step": 167190 + }, + { + "epoch": 0.106, + "grad_norm": 0.0341411828994751, + "learning_rate": 3.587607444089092e-06, + "loss": 0.0326, + "step": 167200 + }, + { + "epoch": 0.10605, + "grad_norm": 0.03707229718565941, + "learning_rate": 3.585474123509483e-06, + "loss": 0.0325, + "step": 167210 + }, + { + "epoch": 0.1061, + "grad_norm": 0.037575431168079376, + "learning_rate": 3.5833413883949675e-06, + "loss": 0.0324, + "step": 167220 + }, + { + "epoch": 0.10615, + "grad_norm": 0.034515380859375, + "learning_rate": 3.5812092388038567e-06, + "loss": 0.0329, + "step": 167230 + }, + { + "epoch": 0.1062, + "grad_norm": 0.03806985169649124, + "learning_rate": 3.5790776747944316e-06, + "loss": 0.033, + "step": 167240 + }, + { + "epoch": 0.10625, + "grad_norm": 0.03956698253750801, + "learning_rate": 3.5769466964249793e-06, + "loss": 0.0326, + "step": 167250 + }, + { + "epoch": 0.1063, + "grad_norm": 0.04012298211455345, + "learning_rate": 3.574816303753753e-06, + "loss": 0.0332, + "step": 167260 + }, + { + "epoch": 0.10635, + "grad_norm": 0.03527490049600601, + "learning_rate": 3.572686496839009e-06, + "loss": 0.0331, + "step": 167270 + }, + { + "epoch": 0.1064, + "grad_norm": 0.03796212002635002, + "learning_rate": 3.570557275738956e-06, + "loss": 0.0335, + "step": 167280 + }, + { + "epoch": 0.10645, + "grad_norm": 0.03673234581947327, + "learning_rate": 3.5684286405118173e-06, + "loss": 0.0331, + "step": 167290 + }, + { + "epoch": 0.1065, + "grad_norm": 0.03789430111646652, + "learning_rate": 3.5663005912157933e-06, + "loss": 0.0327, + "step": 167300 + }, + { + "epoch": 0.10655, + "grad_norm": 0.033841654658317566, + "learning_rate": 3.5641731279090596e-06, + "loss": 0.0336, + "step": 167310 + }, + { + "epoch": 0.1066, + "grad_norm": 0.03266311064362526, + "learning_rate": 3.5620462506497782e-06, + "loss": 0.0335, + "step": 167320 + }, + { + "epoch": 0.10665, + "grad_norm": 0.03473520651459694, + "learning_rate": 3.559919959496091e-06, + "loss": 0.0328, + "step": 167330 + }, + { + "epoch": 0.1067, + "grad_norm": 0.03881165385246277, + "learning_rate": 3.5577942545061473e-06, + "loss": 0.0338, + "step": 167340 + }, + { + "epoch": 0.10675, + "grad_norm": 0.03510915860533714, + "learning_rate": 3.555669135738049e-06, + "loss": 0.0331, + "step": 167350 + }, + { + "epoch": 0.1068, + "grad_norm": 0.04461480677127838, + "learning_rate": 3.5535446032498977e-06, + "loss": 0.0335, + "step": 167360 + }, + { + "epoch": 0.10685, + "grad_norm": 0.046035848557949066, + "learning_rate": 3.5514206570997854e-06, + "loss": 0.0329, + "step": 167370 + }, + { + "epoch": 0.1069, + "grad_norm": 0.03814137354493141, + "learning_rate": 3.549297297345766e-06, + "loss": 0.0337, + "step": 167380 + }, + { + "epoch": 0.10695, + "grad_norm": 0.03890189900994301, + "learning_rate": 3.5471745240459096e-06, + "loss": 0.0333, + "step": 167390 + }, + { + "epoch": 0.107, + "grad_norm": 0.04086184874176979, + "learning_rate": 3.5450523372582395e-06, + "loss": 0.0327, + "step": 167400 + }, + { + "epoch": 0.10705, + "grad_norm": 0.03729187697172165, + "learning_rate": 3.5429307370407728e-06, + "loss": 0.0335, + "step": 167410 + }, + { + "epoch": 0.1071, + "grad_norm": 0.03804780915379524, + "learning_rate": 3.5408097234515243e-06, + "loss": 0.0313, + "step": 167420 + }, + { + "epoch": 0.10715, + "grad_norm": 0.04238668456673622, + "learning_rate": 3.538689296548478e-06, + "loss": 0.0331, + "step": 167430 + }, + { + "epoch": 0.1072, + "grad_norm": 0.042996007949113846, + "learning_rate": 3.5365694563896016e-06, + "loss": 0.0316, + "step": 167440 + }, + { + "epoch": 0.10725, + "grad_norm": 0.03444478660821915, + "learning_rate": 3.5344502030328463e-06, + "loss": 0.0346, + "step": 167450 + }, + { + "epoch": 0.1073, + "grad_norm": 0.043042588979005814, + "learning_rate": 3.532331536536165e-06, + "loss": 0.0328, + "step": 167460 + }, + { + "epoch": 0.10735, + "grad_norm": 0.03950345143675804, + "learning_rate": 3.5302134569574706e-06, + "loss": 0.0335, + "step": 167470 + }, + { + "epoch": 0.1074, + "grad_norm": 0.041202180087566376, + "learning_rate": 3.52809596435468e-06, + "loss": 0.0344, + "step": 167480 + }, + { + "epoch": 0.10745, + "grad_norm": 0.035815853625535965, + "learning_rate": 3.525979058785678e-06, + "loss": 0.0324, + "step": 167490 + }, + { + "epoch": 0.1075, + "grad_norm": 0.036990031599998474, + "learning_rate": 3.523862740308334e-06, + "loss": 0.0319, + "step": 167500 + }, + { + "epoch": 0.10755, + "grad_norm": 0.03876454383134842, + "learning_rate": 3.5217470089805223e-06, + "loss": 0.0328, + "step": 167510 + }, + { + "epoch": 0.1076, + "grad_norm": 0.038854099810123444, + "learning_rate": 3.519631864860076e-06, + "loss": 0.0326, + "step": 167520 + }, + { + "epoch": 0.10765, + "grad_norm": 0.037550996989011765, + "learning_rate": 3.517517308004828e-06, + "loss": 0.032, + "step": 167530 + }, + { + "epoch": 0.1077, + "grad_norm": 0.035221610218286514, + "learning_rate": 3.515403338472578e-06, + "loss": 0.0344, + "step": 167540 + }, + { + "epoch": 0.10775, + "grad_norm": 0.03795564919710159, + "learning_rate": 3.513289956321131e-06, + "loss": 0.0326, + "step": 167550 + }, + { + "epoch": 0.1078, + "grad_norm": 0.03376448526978493, + "learning_rate": 3.511177161608273e-06, + "loss": 0.0326, + "step": 167560 + }, + { + "epoch": 0.10785, + "grad_norm": 0.04067350551486015, + "learning_rate": 3.50906495439175e-06, + "loss": 0.033, + "step": 167570 + }, + { + "epoch": 0.1079, + "grad_norm": 0.0399472676217556, + "learning_rate": 3.506953334729321e-06, + "loss": 0.0325, + "step": 167580 + }, + { + "epoch": 0.10795, + "grad_norm": 0.03734531253576279, + "learning_rate": 3.5048423026787095e-06, + "loss": 0.0328, + "step": 167590 + }, + { + "epoch": 0.108, + "grad_norm": 0.04117441922426224, + "learning_rate": 3.5027318582976394e-06, + "loss": 0.0323, + "step": 167600 + }, + { + "epoch": 0.10805, + "grad_norm": 0.0442042350769043, + "learning_rate": 3.5006220016438023e-06, + "loss": 0.0317, + "step": 167610 + }, + { + "epoch": 0.1081, + "grad_norm": 0.039410967379808426, + "learning_rate": 3.498512732774878e-06, + "loss": 0.0324, + "step": 167620 + }, + { + "epoch": 0.10815, + "grad_norm": 0.0426187664270401, + "learning_rate": 3.4964040517485447e-06, + "loss": 0.0332, + "step": 167630 + }, + { + "epoch": 0.1082, + "grad_norm": 0.0367325022816658, + "learning_rate": 3.4942959586224457e-06, + "loss": 0.0317, + "step": 167640 + }, + { + "epoch": 0.10825, + "grad_norm": 0.037765733897686005, + "learning_rate": 3.4921884534542148e-06, + "loss": 0.0327, + "step": 167650 + }, + { + "epoch": 0.1083, + "grad_norm": 0.040944699198007584, + "learning_rate": 3.4900815363014677e-06, + "loss": 0.032, + "step": 167660 + }, + { + "epoch": 0.10835, + "grad_norm": 0.03851859271526337, + "learning_rate": 3.487975207221808e-06, + "loss": 0.0334, + "step": 167670 + }, + { + "epoch": 0.1084, + "grad_norm": 0.038580868393182755, + "learning_rate": 3.4858694662728314e-06, + "loss": 0.0316, + "step": 167680 + }, + { + "epoch": 0.10845, + "grad_norm": 0.04467671737074852, + "learning_rate": 3.4837643135121e-06, + "loss": 0.0335, + "step": 167690 + }, + { + "epoch": 0.1085, + "grad_norm": 0.03394944965839386, + "learning_rate": 3.481659748997171e-06, + "loss": 0.0323, + "step": 167700 + }, + { + "epoch": 0.10855, + "grad_norm": 0.03451866656541824, + "learning_rate": 3.4795557727855754e-06, + "loss": 0.0311, + "step": 167710 + }, + { + "epoch": 0.1086, + "grad_norm": 0.038766369223594666, + "learning_rate": 3.477452384934843e-06, + "loss": 0.0325, + "step": 167720 + }, + { + "epoch": 0.10865, + "grad_norm": 0.0374983511865139, + "learning_rate": 3.475349585502477e-06, + "loss": 0.0339, + "step": 167730 + }, + { + "epoch": 0.1087, + "grad_norm": 0.04100382328033447, + "learning_rate": 3.4732473745459625e-06, + "loss": 0.0329, + "step": 167740 + }, + { + "epoch": 0.10875, + "grad_norm": 0.04044672101736069, + "learning_rate": 3.4711457521227843e-06, + "loss": 0.0335, + "step": 167750 + }, + { + "epoch": 0.1088, + "grad_norm": 0.035386454313993454, + "learning_rate": 3.4690447182903844e-06, + "loss": 0.033, + "step": 167760 + }, + { + "epoch": 0.10885, + "grad_norm": 0.04271038994193077, + "learning_rate": 3.466944273106226e-06, + "loss": 0.0326, + "step": 167770 + }, + { + "epoch": 0.1089, + "grad_norm": 0.03882293403148651, + "learning_rate": 3.4648444166277107e-06, + "loss": 0.0343, + "step": 167780 + }, + { + "epoch": 0.10895, + "grad_norm": 0.03273117542266846, + "learning_rate": 3.462745148912258e-06, + "loss": 0.0312, + "step": 167790 + }, + { + "epoch": 0.109, + "grad_norm": 0.03577882796525955, + "learning_rate": 3.4606464700172703e-06, + "loss": 0.0333, + "step": 167800 + }, + { + "epoch": 0.10905, + "grad_norm": 0.03669364005327225, + "learning_rate": 3.4585483800001124e-06, + "loss": 0.0333, + "step": 167810 + }, + { + "epoch": 0.1091, + "grad_norm": 0.0384441576898098, + "learning_rate": 3.456450878918149e-06, + "loss": 0.0328, + "step": 167820 + }, + { + "epoch": 0.10915, + "grad_norm": 0.037700068205595016, + "learning_rate": 3.4543539668287218e-06, + "loss": 0.0344, + "step": 167830 + }, + { + "epoch": 0.1092, + "grad_norm": 0.03969224914908409, + "learning_rate": 3.4522576437891668e-06, + "loss": 0.0331, + "step": 167840 + }, + { + "epoch": 0.10925, + "grad_norm": 0.04010225459933281, + "learning_rate": 3.4501619098567944e-06, + "loss": 0.0321, + "step": 167850 + }, + { + "epoch": 0.1093, + "grad_norm": 0.03746872395277023, + "learning_rate": 3.448066765088892e-06, + "loss": 0.0331, + "step": 167860 + }, + { + "epoch": 0.10935, + "grad_norm": 0.04125736281275749, + "learning_rate": 3.4459722095427554e-06, + "loss": 0.0333, + "step": 167870 + }, + { + "epoch": 0.1094, + "grad_norm": 0.036100760102272034, + "learning_rate": 3.4438782432756336e-06, + "loss": 0.032, + "step": 167880 + }, + { + "epoch": 0.10945, + "grad_norm": 0.040355175733566284, + "learning_rate": 3.4417848663447883e-06, + "loss": 0.0341, + "step": 167890 + }, + { + "epoch": 0.1095, + "grad_norm": 0.032878391444683075, + "learning_rate": 3.4396920788074436e-06, + "loss": 0.0348, + "step": 167900 + }, + { + "epoch": 0.10955, + "grad_norm": 0.0392109714448452, + "learning_rate": 3.437599880720821e-06, + "loss": 0.0326, + "step": 167910 + }, + { + "epoch": 0.1096, + "grad_norm": 0.03616342693567276, + "learning_rate": 3.43550827214211e-06, + "loss": 0.0338, + "step": 167920 + }, + { + "epoch": 0.10965, + "grad_norm": 0.04273064434528351, + "learning_rate": 3.433417253128507e-06, + "loss": 0.034, + "step": 167930 + }, + { + "epoch": 0.1097, + "grad_norm": 0.050068605691194534, + "learning_rate": 3.431326823737174e-06, + "loss": 0.0337, + "step": 167940 + }, + { + "epoch": 0.10975, + "grad_norm": 0.03763910382986069, + "learning_rate": 3.429236984025258e-06, + "loss": 0.0321, + "step": 167950 + }, + { + "epoch": 0.1098, + "grad_norm": 0.03633598983287811, + "learning_rate": 3.427147734049904e-06, + "loss": 0.0331, + "step": 167960 + }, + { + "epoch": 0.10985, + "grad_norm": 0.03945426270365715, + "learning_rate": 3.4250590738682224e-06, + "loss": 0.0322, + "step": 167970 + }, + { + "epoch": 0.1099, + "grad_norm": 0.03770596906542778, + "learning_rate": 3.422971003537323e-06, + "loss": 0.0326, + "step": 167980 + }, + { + "epoch": 0.10995, + "grad_norm": 0.036587655544281006, + "learning_rate": 3.4208835231142933e-06, + "loss": 0.0326, + "step": 167990 + }, + { + "epoch": 0.11, + "grad_norm": 0.038484178483486176, + "learning_rate": 3.4187966326561933e-06, + "loss": 0.0349, + "step": 168000 + }, + { + "epoch": 0.11005, + "grad_norm": 0.040012579411268234, + "learning_rate": 3.416710332220094e-06, + "loss": 0.0325, + "step": 168010 + }, + { + "epoch": 0.1101, + "grad_norm": 0.03485150635242462, + "learning_rate": 3.4146246218630217e-06, + "loss": 0.0325, + "step": 168020 + }, + { + "epoch": 0.11015, + "grad_norm": 0.0435153990983963, + "learning_rate": 3.4125395016420065e-06, + "loss": 0.032, + "step": 168030 + }, + { + "epoch": 0.1102, + "grad_norm": 0.032235775142908096, + "learning_rate": 3.410454971614044e-06, + "loss": 0.031, + "step": 168040 + }, + { + "epoch": 0.11025, + "grad_norm": 0.03393120318651199, + "learning_rate": 3.408371031836133e-06, + "loss": 0.0325, + "step": 168050 + }, + { + "epoch": 0.1103, + "grad_norm": 0.03405354917049408, + "learning_rate": 3.4062876823652557e-06, + "loss": 0.0321, + "step": 168060 + }, + { + "epoch": 0.11035, + "grad_norm": 0.038526881486177444, + "learning_rate": 3.4042049232583503e-06, + "loss": 0.0319, + "step": 168070 + }, + { + "epoch": 0.1104, + "grad_norm": 0.03656435385346413, + "learning_rate": 3.402122754572376e-06, + "loss": 0.0315, + "step": 168080 + }, + { + "epoch": 0.11045, + "grad_norm": 0.04226868599653244, + "learning_rate": 3.4000411763642436e-06, + "loss": 0.0326, + "step": 168090 + }, + { + "epoch": 0.1105, + "grad_norm": 0.03305479511618614, + "learning_rate": 3.397960188690877e-06, + "loss": 0.0316, + "step": 168100 + }, + { + "epoch": 0.11055, + "grad_norm": 0.03760003671050072, + "learning_rate": 3.3958797916091662e-06, + "loss": 0.0319, + "step": 168110 + }, + { + "epoch": 0.1106, + "grad_norm": 0.03340890631079674, + "learning_rate": 3.3937999851759773e-06, + "loss": 0.0333, + "step": 168120 + }, + { + "epoch": 0.11065, + "grad_norm": 0.033915311098098755, + "learning_rate": 3.3917207694481866e-06, + "loss": 0.0321, + "step": 168130 + }, + { + "epoch": 0.1107, + "grad_norm": 0.03152468055486679, + "learning_rate": 3.3896421444826297e-06, + "loss": 0.0321, + "step": 168140 + }, + { + "epoch": 0.11075, + "grad_norm": 0.03944316506385803, + "learning_rate": 3.3875641103361417e-06, + "loss": 0.0325, + "step": 168150 + }, + { + "epoch": 0.1108, + "grad_norm": 0.03829282894730568, + "learning_rate": 3.385486667065524e-06, + "loss": 0.0326, + "step": 168160 + }, + { + "epoch": 0.11085, + "grad_norm": 0.04128112271428108, + "learning_rate": 3.383409814727584e-06, + "loss": 0.0326, + "step": 168170 + }, + { + "epoch": 0.1109, + "grad_norm": 0.03964536637067795, + "learning_rate": 3.3813335533790957e-06, + "loss": 0.0329, + "step": 168180 + }, + { + "epoch": 0.11095, + "grad_norm": 0.03742802515625954, + "learning_rate": 3.3792578830768333e-06, + "loss": 0.0347, + "step": 168190 + }, + { + "epoch": 0.111, + "grad_norm": 0.04376475140452385, + "learning_rate": 3.377182803877535e-06, + "loss": 0.0351, + "step": 168200 + }, + { + "epoch": 0.11105, + "grad_norm": 0.03592308238148689, + "learning_rate": 3.3751083158379298e-06, + "loss": 0.0347, + "step": 168210 + }, + { + "epoch": 0.1111, + "grad_norm": 0.03608611598610878, + "learning_rate": 3.373034419014748e-06, + "loss": 0.0345, + "step": 168220 + }, + { + "epoch": 0.11115, + "grad_norm": 0.0484461635351181, + "learning_rate": 3.3709611134646766e-06, + "loss": 0.0336, + "step": 168230 + }, + { + "epoch": 0.1112, + "grad_norm": 0.044795531779527664, + "learning_rate": 3.368888399244399e-06, + "loss": 0.0338, + "step": 168240 + }, + { + "epoch": 0.11125, + "grad_norm": 0.03537564352154732, + "learning_rate": 3.366816276410592e-06, + "loss": 0.0322, + "step": 168250 + }, + { + "epoch": 0.1113, + "grad_norm": 0.03821335732936859, + "learning_rate": 3.364744745019893e-06, + "loss": 0.0332, + "step": 168260 + }, + { + "epoch": 0.11135, + "grad_norm": 0.03966094180941582, + "learning_rate": 3.36267380512896e-06, + "loss": 0.0325, + "step": 168270 + }, + { + "epoch": 0.1114, + "grad_norm": 0.036077141761779785, + "learning_rate": 3.3606034567943813e-06, + "loss": 0.0339, + "step": 168280 + }, + { + "epoch": 0.11145, + "grad_norm": 0.042035218328237534, + "learning_rate": 3.358533700072783e-06, + "loss": 0.0325, + "step": 168290 + }, + { + "epoch": 0.1115, + "grad_norm": 0.04042064771056175, + "learning_rate": 3.3564645350207343e-06, + "loss": 0.0327, + "step": 168300 + }, + { + "epoch": 0.11155, + "grad_norm": 0.03909675404429436, + "learning_rate": 3.354395961694823e-06, + "loss": 0.0333, + "step": 168310 + }, + { + "epoch": 0.1116, + "grad_norm": 0.03649524971842766, + "learning_rate": 3.3523279801515926e-06, + "loss": 0.0335, + "step": 168320 + }, + { + "epoch": 0.11165, + "grad_norm": 0.035712581127882004, + "learning_rate": 3.3502605904475763e-06, + "loss": 0.0327, + "step": 168330 + }, + { + "epoch": 0.1117, + "grad_norm": 0.055029258131980896, + "learning_rate": 3.348193792639309e-06, + "loss": 0.0334, + "step": 168340 + }, + { + "epoch": 0.11175, + "grad_norm": 0.042117055505514145, + "learning_rate": 3.3461275867832877e-06, + "loss": 0.0319, + "step": 168350 + }, + { + "epoch": 0.1118, + "grad_norm": 0.038168080151081085, + "learning_rate": 3.3440619729360053e-06, + "loss": 0.0313, + "step": 168360 + }, + { + "epoch": 0.11185, + "grad_norm": 0.042006079107522964, + "learning_rate": 3.341996951153925e-06, + "loss": 0.0344, + "step": 168370 + }, + { + "epoch": 0.1119, + "grad_norm": 0.04365299269556999, + "learning_rate": 3.3399325214935133e-06, + "loss": 0.0318, + "step": 168380 + }, + { + "epoch": 0.11195, + "grad_norm": 0.03829650580883026, + "learning_rate": 3.337868684011214e-06, + "loss": 0.0333, + "step": 168390 + }, + { + "epoch": 0.112, + "grad_norm": 0.05951496213674545, + "learning_rate": 3.335805438763445e-06, + "loss": 0.0333, + "step": 168400 + }, + { + "epoch": 0.11205, + "grad_norm": 0.044234320521354675, + "learning_rate": 3.3337427858066177e-06, + "loss": 0.0332, + "step": 168410 + }, + { + "epoch": 0.1121, + "grad_norm": 0.05862995237112045, + "learning_rate": 3.33168072519712e-06, + "loss": 0.0341, + "step": 168420 + }, + { + "epoch": 0.11215, + "grad_norm": 0.05722839757800102, + "learning_rate": 3.329619256991334e-06, + "loss": 0.0338, + "step": 168430 + }, + { + "epoch": 0.1122, + "grad_norm": 0.04409719631075859, + "learning_rate": 3.3275583812456157e-06, + "loss": 0.0338, + "step": 168440 + }, + { + "epoch": 0.11225, + "grad_norm": 0.045837193727493286, + "learning_rate": 3.3254980980163052e-06, + "loss": 0.0338, + "step": 168450 + }, + { + "epoch": 0.1123, + "grad_norm": 0.04249592497944832, + "learning_rate": 3.3234384073597386e-06, + "loss": 0.0333, + "step": 168460 + }, + { + "epoch": 0.11235, + "grad_norm": 0.04318466782569885, + "learning_rate": 3.3213793093322176e-06, + "loss": 0.0352, + "step": 168470 + }, + { + "epoch": 0.1124, + "grad_norm": 0.049135416746139526, + "learning_rate": 3.319320803990053e-06, + "loss": 0.0338, + "step": 168480 + }, + { + "epoch": 0.11245, + "grad_norm": 0.03898349404335022, + "learning_rate": 3.3172628913894998e-06, + "loss": 0.0377, + "step": 168490 + }, + { + "epoch": 0.1125, + "grad_norm": 0.0410807728767395, + "learning_rate": 3.315205571586835e-06, + "loss": 0.035, + "step": 168500 + }, + { + "epoch": 0.11255, + "grad_norm": 0.04127427935600281, + "learning_rate": 3.3131488446383086e-06, + "loss": 0.0344, + "step": 168510 + }, + { + "epoch": 0.1126, + "grad_norm": 0.03689667582511902, + "learning_rate": 3.311092710600147e-06, + "loss": 0.0335, + "step": 168520 + }, + { + "epoch": 0.11265, + "grad_norm": 0.04671632871031761, + "learning_rate": 3.3090371695285617e-06, + "loss": 0.0351, + "step": 168530 + }, + { + "epoch": 0.1127, + "grad_norm": 0.05275014415383339, + "learning_rate": 3.306982221479743e-06, + "loss": 0.0346, + "step": 168540 + }, + { + "epoch": 0.11275, + "grad_norm": 0.037315089255571365, + "learning_rate": 3.304927866509888e-06, + "loss": 0.0332, + "step": 168550 + }, + { + "epoch": 0.1128, + "grad_norm": 0.04311274737119675, + "learning_rate": 3.302874104675155e-06, + "loss": 0.0328, + "step": 168560 + }, + { + "epoch": 0.11285, + "grad_norm": 0.03334786370396614, + "learning_rate": 3.300820936031687e-06, + "loss": 0.0333, + "step": 168570 + }, + { + "epoch": 0.1129, + "grad_norm": 0.03692523390054703, + "learning_rate": 3.298768360635629e-06, + "loss": 0.033, + "step": 168580 + }, + { + "epoch": 0.11295, + "grad_norm": 0.03456854075193405, + "learning_rate": 3.2967163785430854e-06, + "loss": 0.0323, + "step": 168590 + }, + { + "epoch": 0.113, + "grad_norm": 0.03847216069698334, + "learning_rate": 3.294664989810167e-06, + "loss": 0.0323, + "step": 168600 + }, + { + "epoch": 0.11305, + "grad_norm": 0.03631163015961647, + "learning_rate": 3.2926141944929546e-06, + "loss": 0.0317, + "step": 168610 + }, + { + "epoch": 0.1131, + "grad_norm": 0.03581319749355316, + "learning_rate": 3.290563992647508e-06, + "loss": 0.031, + "step": 168620 + }, + { + "epoch": 0.11315, + "grad_norm": 0.03114577941596508, + "learning_rate": 3.288514384329894e-06, + "loss": 0.0315, + "step": 168630 + }, + { + "epoch": 0.1132, + "grad_norm": 0.04888668656349182, + "learning_rate": 3.2864653695961366e-06, + "loss": 0.0352, + "step": 168640 + }, + { + "epoch": 0.11325, + "grad_norm": 0.04437430948019028, + "learning_rate": 3.2844169485022612e-06, + "loss": 0.0316, + "step": 168650 + }, + { + "epoch": 0.1133, + "grad_norm": 0.045641057193279266, + "learning_rate": 3.282369121104262e-06, + "loss": 0.0328, + "step": 168660 + }, + { + "epoch": 0.11335, + "grad_norm": 0.03782769665122032, + "learning_rate": 3.2803218874581377e-06, + "loss": 0.0308, + "step": 168670 + }, + { + "epoch": 0.1134, + "grad_norm": 0.046148475259542465, + "learning_rate": 3.278275247619847e-06, + "loss": 0.0335, + "step": 168680 + }, + { + "epoch": 0.11345, + "grad_norm": 0.03975826874375343, + "learning_rate": 3.2762292016453587e-06, + "loss": 0.0339, + "step": 168690 + }, + { + "epoch": 0.1135, + "grad_norm": 0.04578711465001106, + "learning_rate": 3.2741837495906007e-06, + "loss": 0.032, + "step": 168700 + }, + { + "epoch": 0.11355, + "grad_norm": 0.043910104781389236, + "learning_rate": 3.272138891511492e-06, + "loss": 0.0325, + "step": 168710 + }, + { + "epoch": 0.1136, + "grad_norm": 0.03680054470896721, + "learning_rate": 3.2700946274639487e-06, + "loss": 0.0337, + "step": 168720 + }, + { + "epoch": 0.11365, + "grad_norm": 0.040630705654621124, + "learning_rate": 3.268050957503854e-06, + "loss": 0.0325, + "step": 168730 + }, + { + "epoch": 0.1137, + "grad_norm": 0.05036550760269165, + "learning_rate": 3.2660078816870804e-06, + "loss": 0.0338, + "step": 168740 + }, + { + "epoch": 0.11375, + "grad_norm": 0.045405495911836624, + "learning_rate": 3.2639654000694854e-06, + "loss": 0.0322, + "step": 168750 + }, + { + "epoch": 0.1138, + "grad_norm": 0.04269585758447647, + "learning_rate": 3.2619235127069077e-06, + "loss": 0.0331, + "step": 168760 + }, + { + "epoch": 0.11385, + "grad_norm": 0.04223855957388878, + "learning_rate": 3.2598822196551833e-06, + "loss": 0.0347, + "step": 168770 + }, + { + "epoch": 0.1139, + "grad_norm": 0.04172206297516823, + "learning_rate": 3.2578415209701037e-06, + "loss": 0.0336, + "step": 168780 + }, + { + "epoch": 0.11395, + "grad_norm": 0.04562797024846077, + "learning_rate": 3.255801416707474e-06, + "loss": 0.0319, + "step": 168790 + }, + { + "epoch": 0.114, + "grad_norm": 0.040666863322257996, + "learning_rate": 3.2537619069230586e-06, + "loss": 0.0317, + "step": 168800 + }, + { + "epoch": 0.11405, + "grad_norm": 0.03749188035726547, + "learning_rate": 3.2517229916726287e-06, + "loss": 0.0325, + "step": 168810 + }, + { + "epoch": 0.1141, + "grad_norm": 0.03828401491045952, + "learning_rate": 3.249684671011921e-06, + "loss": 0.033, + "step": 168820 + }, + { + "epoch": 0.11415, + "grad_norm": 0.03732697665691376, + "learning_rate": 3.247646944996657e-06, + "loss": 0.0319, + "step": 168830 + }, + { + "epoch": 0.1142, + "grad_norm": 0.04065719619393349, + "learning_rate": 3.2456098136825623e-06, + "loss": 0.0334, + "step": 168840 + }, + { + "epoch": 0.11425, + "grad_norm": 0.043520860373973846, + "learning_rate": 3.2435732771253195e-06, + "loss": 0.0326, + "step": 168850 + }, + { + "epoch": 0.1143, + "grad_norm": 0.056450847536325455, + "learning_rate": 3.2415373353806124e-06, + "loss": 0.0325, + "step": 168860 + }, + { + "epoch": 0.11435, + "grad_norm": 0.03849427029490471, + "learning_rate": 3.2395019885040927e-06, + "loss": 0.0319, + "step": 168870 + }, + { + "epoch": 0.1144, + "grad_norm": 0.03648701682686806, + "learning_rate": 3.2374672365514143e-06, + "loss": 0.0315, + "step": 168880 + }, + { + "epoch": 0.11445, + "grad_norm": 0.04408055916428566, + "learning_rate": 3.2354330795782095e-06, + "loss": 0.0325, + "step": 168890 + }, + { + "epoch": 0.1145, + "grad_norm": 0.03254362940788269, + "learning_rate": 3.23339951764009e-06, + "loss": 0.032, + "step": 168900 + }, + { + "epoch": 0.11455, + "grad_norm": 0.04348884895443916, + "learning_rate": 3.23136655079265e-06, + "loss": 0.0316, + "step": 168910 + }, + { + "epoch": 0.1146, + "grad_norm": 0.03716294839978218, + "learning_rate": 3.2293341790914645e-06, + "loss": 0.0322, + "step": 168920 + }, + { + "epoch": 0.11465, + "grad_norm": 0.034980643540620804, + "learning_rate": 3.2273024025921082e-06, + "loss": 0.033, + "step": 168930 + }, + { + "epoch": 0.1147, + "grad_norm": 0.03570462390780449, + "learning_rate": 3.225271221350126e-06, + "loss": 0.032, + "step": 168940 + }, + { + "epoch": 0.11475, + "grad_norm": 0.03555014356970787, + "learning_rate": 3.223240635421043e-06, + "loss": 0.035, + "step": 168950 + }, + { + "epoch": 0.1148, + "grad_norm": 0.0335003100335598, + "learning_rate": 3.221210644860384e-06, + "loss": 0.0329, + "step": 168960 + }, + { + "epoch": 0.11485, + "grad_norm": 0.035364456474781036, + "learning_rate": 3.2191812497236407e-06, + "loss": 0.0344, + "step": 168970 + }, + { + "epoch": 0.1149, + "grad_norm": 0.04259192943572998, + "learning_rate": 3.2171524500663074e-06, + "loss": 0.0341, + "step": 168980 + }, + { + "epoch": 0.11495, + "grad_norm": 0.041103146970272064, + "learning_rate": 3.2151242459438345e-06, + "loss": 0.0335, + "step": 168990 + }, + { + "epoch": 0.115, + "grad_norm": 0.038075946271419525, + "learning_rate": 3.2130966374116806e-06, + "loss": 0.033, + "step": 169000 + }, + { + "epoch": 0.11505, + "grad_norm": 0.03837242349982262, + "learning_rate": 3.211069624525284e-06, + "loss": 0.0329, + "step": 169010 + }, + { + "epoch": 0.1151, + "grad_norm": 0.03916725143790245, + "learning_rate": 3.209043207340057e-06, + "loss": 0.0324, + "step": 169020 + }, + { + "epoch": 0.11515, + "grad_norm": 0.0411614254117012, + "learning_rate": 3.207017385911404e-06, + "loss": 0.0322, + "step": 169030 + }, + { + "epoch": 0.1152, + "grad_norm": 0.03601289540529251, + "learning_rate": 3.2049921602947007e-06, + "loss": 0.0313, + "step": 169040 + }, + { + "epoch": 0.11525, + "grad_norm": 0.04232946038246155, + "learning_rate": 3.202967530545331e-06, + "loss": 0.0347, + "step": 169050 + }, + { + "epoch": 0.1153, + "grad_norm": 0.041129209101200104, + "learning_rate": 3.2009434967186418e-06, + "loss": 0.0323, + "step": 169060 + }, + { + "epoch": 0.11535, + "grad_norm": 0.04336433857679367, + "learning_rate": 3.1989200588699584e-06, + "loss": 0.0331, + "step": 169070 + }, + { + "epoch": 0.1154, + "grad_norm": 0.034335821866989136, + "learning_rate": 3.1968972170546203e-06, + "loss": 0.0324, + "step": 169080 + }, + { + "epoch": 0.11545, + "grad_norm": 0.03699544072151184, + "learning_rate": 3.194874971327913e-06, + "loss": 0.0323, + "step": 169090 + }, + { + "epoch": 0.1155, + "grad_norm": 0.03071075677871704, + "learning_rate": 3.1928533217451374e-06, + "loss": 0.0317, + "step": 169100 + }, + { + "epoch": 0.11555, + "grad_norm": 0.038801129907369614, + "learning_rate": 3.19083226836156e-06, + "loss": 0.0307, + "step": 169110 + }, + { + "epoch": 0.1156, + "grad_norm": 0.036969684064388275, + "learning_rate": 3.188811811232434e-06, + "loss": 0.0324, + "step": 169120 + }, + { + "epoch": 0.11565, + "grad_norm": 0.04446541890501976, + "learning_rate": 3.1867919504129954e-06, + "loss": 0.0315, + "step": 169130 + }, + { + "epoch": 0.1157, + "grad_norm": 0.03564651310443878, + "learning_rate": 3.1847726859584757e-06, + "loss": 0.0322, + "step": 169140 + }, + { + "epoch": 0.11575, + "grad_norm": 0.036352917551994324, + "learning_rate": 3.182754017924075e-06, + "loss": 0.0334, + "step": 169150 + }, + { + "epoch": 0.1158, + "grad_norm": 0.048761919140815735, + "learning_rate": 3.180735946364977e-06, + "loss": 0.0319, + "step": 169160 + }, + { + "epoch": 0.11585, + "grad_norm": 0.048298221081495285, + "learning_rate": 3.1787184713363643e-06, + "loss": 0.0323, + "step": 169170 + }, + { + "epoch": 0.1159, + "grad_norm": 0.04164647310972214, + "learning_rate": 3.1767015928933884e-06, + "loss": 0.0309, + "step": 169180 + }, + { + "epoch": 0.11595, + "grad_norm": 0.039654441177845, + "learning_rate": 3.1746853110911967e-06, + "loss": 0.0317, + "step": 169190 + }, + { + "epoch": 0.116, + "grad_norm": 0.03504246473312378, + "learning_rate": 3.1726696259849087e-06, + "loss": 0.0323, + "step": 169200 + }, + { + "epoch": 0.11605, + "grad_norm": 0.03481772914528847, + "learning_rate": 3.1706545376296277e-06, + "loss": 0.0321, + "step": 169210 + }, + { + "epoch": 0.1161, + "grad_norm": 0.039279524236917496, + "learning_rate": 3.1686400460804565e-06, + "loss": 0.0324, + "step": 169220 + }, + { + "epoch": 0.11615, + "grad_norm": 0.041676539927721024, + "learning_rate": 3.1666261513924655e-06, + "loss": 0.0331, + "step": 169230 + }, + { + "epoch": 0.1162, + "grad_norm": 0.04180776700377464, + "learning_rate": 3.164612853620713e-06, + "loss": 0.0316, + "step": 169240 + }, + { + "epoch": 0.11625, + "grad_norm": 0.03256388381123543, + "learning_rate": 3.162600152820236e-06, + "loss": 0.0325, + "step": 169250 + }, + { + "epoch": 0.1163, + "grad_norm": 0.03377092257142067, + "learning_rate": 3.1605880490460676e-06, + "loss": 0.032, + "step": 169260 + }, + { + "epoch": 0.11635, + "grad_norm": 0.035713210701942444, + "learning_rate": 3.1585765423532284e-06, + "loss": 0.0317, + "step": 169270 + }, + { + "epoch": 0.1164, + "grad_norm": 0.03613951802253723, + "learning_rate": 3.1565656327966877e-06, + "loss": 0.0332, + "step": 169280 + }, + { + "epoch": 0.11645, + "grad_norm": 0.034064631909132004, + "learning_rate": 3.1545553204314434e-06, + "loss": 0.0332, + "step": 169290 + }, + { + "epoch": 0.1165, + "grad_norm": 0.04030144587159157, + "learning_rate": 3.152545605312443e-06, + "loss": 0.0335, + "step": 169300 + }, + { + "epoch": 0.11655, + "grad_norm": 0.03883133456110954, + "learning_rate": 3.1505364874946457e-06, + "loss": 0.0341, + "step": 169310 + }, + { + "epoch": 0.1166, + "grad_norm": 0.042704951018095016, + "learning_rate": 3.1485279670329715e-06, + "loss": 0.0342, + "step": 169320 + }, + { + "epoch": 0.11665, + "grad_norm": 0.038515184074640274, + "learning_rate": 3.146520043982329e-06, + "loss": 0.0345, + "step": 169330 + }, + { + "epoch": 0.1167, + "grad_norm": 0.03892954811453819, + "learning_rate": 3.1445127183976213e-06, + "loss": 0.0348, + "step": 169340 + }, + { + "epoch": 0.11675, + "grad_norm": 0.03511323407292366, + "learning_rate": 3.142505990333727e-06, + "loss": 0.0324, + "step": 169350 + }, + { + "epoch": 0.1168, + "grad_norm": 0.04509086534380913, + "learning_rate": 3.140499859845508e-06, + "loss": 0.0341, + "step": 169360 + }, + { + "epoch": 0.11685, + "grad_norm": 0.037586696445941925, + "learning_rate": 3.1384943269878035e-06, + "loss": 0.0329, + "step": 169370 + }, + { + "epoch": 0.1169, + "grad_norm": 0.033918607980012894, + "learning_rate": 3.1364893918154586e-06, + "loss": 0.0319, + "step": 169380 + }, + { + "epoch": 0.11695, + "grad_norm": 0.036432284861803055, + "learning_rate": 3.134485054383271e-06, + "loss": 0.0328, + "step": 169390 + }, + { + "epoch": 0.117, + "grad_norm": 0.035951100289821625, + "learning_rate": 3.132481314746055e-06, + "loss": 0.0327, + "step": 169400 + }, + { + "epoch": 0.11705, + "grad_norm": 0.04071575775742531, + "learning_rate": 3.1304781729585867e-06, + "loss": 0.0319, + "step": 169410 + }, + { + "epoch": 0.1171, + "grad_norm": 0.03830753266811371, + "learning_rate": 3.1284756290756225e-06, + "loss": 0.0328, + "step": 169420 + }, + { + "epoch": 0.11715, + "grad_norm": 0.03831455111503601, + "learning_rate": 3.1264736831519204e-06, + "loss": 0.0326, + "step": 169430 + }, + { + "epoch": 0.1172, + "grad_norm": 0.03554108738899231, + "learning_rate": 3.124472335242215e-06, + "loss": 0.0333, + "step": 169440 + }, + { + "epoch": 0.11725, + "grad_norm": 0.047145262360572815, + "learning_rate": 3.1224715854012098e-06, + "loss": 0.0326, + "step": 169450 + }, + { + "epoch": 0.1173, + "grad_norm": 0.05718984082341194, + "learning_rate": 3.1204714336836166e-06, + "loss": 0.0325, + "step": 169460 + }, + { + "epoch": 0.11735, + "grad_norm": 0.06556253880262375, + "learning_rate": 3.118471880144111e-06, + "loss": 0.0337, + "step": 169470 + }, + { + "epoch": 0.1174, + "grad_norm": 0.04458180442452431, + "learning_rate": 3.116472924837374e-06, + "loss": 0.0321, + "step": 169480 + }, + { + "epoch": 0.11745, + "grad_norm": 0.04007299244403839, + "learning_rate": 3.114474567818035e-06, + "loss": 0.0321, + "step": 169490 + }, + { + "epoch": 0.1175, + "grad_norm": 0.03834760934114456, + "learning_rate": 3.1124768091407463e-06, + "loss": 0.031, + "step": 169500 + }, + { + "epoch": 0.11755, + "grad_norm": 0.05196467414498329, + "learning_rate": 3.1104796488601094e-06, + "loss": 0.0336, + "step": 169510 + }, + { + "epoch": 0.1176, + "grad_norm": 0.04448392242193222, + "learning_rate": 3.1084830870307445e-06, + "loss": 0.0328, + "step": 169520 + }, + { + "epoch": 0.11765, + "grad_norm": 0.0343671552836895, + "learning_rate": 3.1064871237072274e-06, + "loss": 0.0338, + "step": 169530 + }, + { + "epoch": 0.1177, + "grad_norm": 0.044008392840623856, + "learning_rate": 3.1044917589441195e-06, + "loss": 0.0322, + "step": 169540 + }, + { + "epoch": 0.11775, + "grad_norm": 0.038599561899900436, + "learning_rate": 3.102496992795989e-06, + "loss": 0.0335, + "step": 169550 + }, + { + "epoch": 0.1178, + "grad_norm": 0.04106726497411728, + "learning_rate": 3.100502825317364e-06, + "loss": 0.0321, + "step": 169560 + }, + { + "epoch": 0.11785, + "grad_norm": 0.040985990315675735, + "learning_rate": 3.098509256562765e-06, + "loss": 0.0331, + "step": 169570 + }, + { + "epoch": 0.1179, + "grad_norm": 0.0400564968585968, + "learning_rate": 3.096516286586687e-06, + "loss": 0.0329, + "step": 169580 + }, + { + "epoch": 0.11795, + "grad_norm": 0.03802260756492615, + "learning_rate": 3.0945239154436282e-06, + "loss": 0.0326, + "step": 169590 + }, + { + "epoch": 0.118, + "grad_norm": 0.04563620686531067, + "learning_rate": 3.0925321431880594e-06, + "loss": 0.0328, + "step": 169600 + }, + { + "epoch": 0.11805, + "grad_norm": 0.037531446665525436, + "learning_rate": 3.090540969874431e-06, + "loss": 0.0317, + "step": 169610 + }, + { + "epoch": 0.1181, + "grad_norm": 0.045137230306863785, + "learning_rate": 3.0885503955571826e-06, + "loss": 0.0317, + "step": 169620 + }, + { + "epoch": 0.11815, + "grad_norm": 0.03471638262271881, + "learning_rate": 3.0865604202907295e-06, + "loss": 0.0319, + "step": 169630 + }, + { + "epoch": 0.1182, + "grad_norm": 0.03710561618208885, + "learning_rate": 3.084571044129486e-06, + "loss": 0.0309, + "step": 169640 + }, + { + "epoch": 0.11825, + "grad_norm": 0.0388769693672657, + "learning_rate": 3.08258226712784e-06, + "loss": 0.0331, + "step": 169650 + }, + { + "epoch": 0.1183, + "grad_norm": 0.04335852339863777, + "learning_rate": 3.0805940893401526e-06, + "loss": 0.0324, + "step": 169660 + }, + { + "epoch": 0.11835, + "grad_norm": 0.04328613728284836, + "learning_rate": 3.0786065108207946e-06, + "loss": 0.0338, + "step": 169670 + }, + { + "epoch": 0.1184, + "grad_norm": 0.05562340095639229, + "learning_rate": 3.076619531624092e-06, + "loss": 0.0339, + "step": 169680 + }, + { + "epoch": 0.11845, + "grad_norm": 0.04683903604745865, + "learning_rate": 3.0746331518043876e-06, + "loss": 0.0317, + "step": 169690 + }, + { + "epoch": 0.1185, + "grad_norm": 0.047938767820596695, + "learning_rate": 3.072647371415965e-06, + "loss": 0.0342, + "step": 169700 + }, + { + "epoch": 0.11855, + "grad_norm": 0.047772448509931564, + "learning_rate": 3.070662190513124e-06, + "loss": 0.0323, + "step": 169710 + }, + { + "epoch": 0.1186, + "grad_norm": 0.05084957182407379, + "learning_rate": 3.0686776091501475e-06, + "loss": 0.031, + "step": 169720 + }, + { + "epoch": 0.11865, + "grad_norm": 0.11979934573173523, + "learning_rate": 3.066693627381284e-06, + "loss": 0.0334, + "step": 169730 + }, + { + "epoch": 0.1187, + "grad_norm": 0.06551992148160934, + "learning_rate": 3.0647102452607797e-06, + "loss": 0.0311, + "step": 169740 + }, + { + "epoch": 0.11875, + "grad_norm": 0.05387571081519127, + "learning_rate": 3.062727462842849e-06, + "loss": 0.0333, + "step": 169750 + }, + { + "epoch": 0.1188, + "grad_norm": 0.05842439830303192, + "learning_rate": 3.060745280181715e-06, + "loss": 0.0332, + "step": 169760 + }, + { + "epoch": 0.11885, + "grad_norm": 0.038283172994852066, + "learning_rate": 3.05876369733156e-06, + "loss": 0.0325, + "step": 169770 + }, + { + "epoch": 0.1189, + "grad_norm": 0.040976520627737045, + "learning_rate": 3.0567827143465596e-06, + "loss": 0.0329, + "step": 169780 + }, + { + "epoch": 0.11895, + "grad_norm": 0.03876696154475212, + "learning_rate": 3.0548023312808817e-06, + "loss": 0.0316, + "step": 169790 + }, + { + "epoch": 0.119, + "grad_norm": 0.03440241515636444, + "learning_rate": 3.052822548188658e-06, + "loss": 0.0326, + "step": 169800 + }, + { + "epoch": 0.11905, + "grad_norm": 0.037959929555654526, + "learning_rate": 3.050843365124026e-06, + "loss": 0.0329, + "step": 169810 + }, + { + "epoch": 0.1191, + "grad_norm": 0.03727288544178009, + "learning_rate": 3.048864782141089e-06, + "loss": 0.0317, + "step": 169820 + }, + { + "epoch": 0.11915, + "grad_norm": 0.036096200346946716, + "learning_rate": 3.0468867992939383e-06, + "loss": 0.0328, + "step": 169830 + }, + { + "epoch": 0.1192, + "grad_norm": 0.03416402265429497, + "learning_rate": 3.0449094166366597e-06, + "loss": 0.0337, + "step": 169840 + }, + { + "epoch": 0.11925, + "grad_norm": 0.036379892379045486, + "learning_rate": 3.0429326342233085e-06, + "loss": 0.033, + "step": 169850 + }, + { + "epoch": 0.1193, + "grad_norm": 0.037294864654541016, + "learning_rate": 3.040956452107932e-06, + "loss": 0.0336, + "step": 169860 + }, + { + "epoch": 0.11935, + "grad_norm": 0.0385604128241539, + "learning_rate": 3.0389808703445517e-06, + "loss": 0.0363, + "step": 169870 + }, + { + "epoch": 0.1194, + "grad_norm": 0.03919385373592377, + "learning_rate": 3.0370058889871878e-06, + "loss": 0.0331, + "step": 169880 + }, + { + "epoch": 0.11945, + "grad_norm": 0.03545527160167694, + "learning_rate": 3.035031508089828e-06, + "loss": 0.0339, + "step": 169890 + }, + { + "epoch": 0.1195, + "grad_norm": 0.03262300789356232, + "learning_rate": 3.0330577277064563e-06, + "loss": 0.0337, + "step": 169900 + }, + { + "epoch": 0.11955, + "grad_norm": 0.035723332315683365, + "learning_rate": 3.031084547891039e-06, + "loss": 0.0334, + "step": 169910 + }, + { + "epoch": 0.1196, + "grad_norm": 0.03940999507904053, + "learning_rate": 3.0291119686975093e-06, + "loss": 0.0344, + "step": 169920 + }, + { + "epoch": 0.11965, + "grad_norm": 0.038978319615125656, + "learning_rate": 3.027139990179809e-06, + "loss": 0.0338, + "step": 169930 + }, + { + "epoch": 0.1197, + "grad_norm": 0.03864939138293266, + "learning_rate": 3.025168612391846e-06, + "loss": 0.0338, + "step": 169940 + }, + { + "epoch": 0.11975, + "grad_norm": 0.03584860637784004, + "learning_rate": 3.023197835387517e-06, + "loss": 0.0328, + "step": 169950 + }, + { + "epoch": 0.1198, + "grad_norm": 0.04506705328822136, + "learning_rate": 3.0212276592207017e-06, + "loss": 0.0342, + "step": 169960 + }, + { + "epoch": 0.11985, + "grad_norm": 0.03933991119265556, + "learning_rate": 3.019258083945262e-06, + "loss": 0.0339, + "step": 169970 + }, + { + "epoch": 0.1199, + "grad_norm": 0.04669029638171196, + "learning_rate": 3.0172891096150595e-06, + "loss": 0.0354, + "step": 169980 + }, + { + "epoch": 0.11995, + "grad_norm": 0.03818805143237114, + "learning_rate": 3.0153207362839026e-06, + "loss": 0.0338, + "step": 169990 + }, + { + "epoch": 0.12, + "grad_norm": 0.03293650597333908, + "learning_rate": 3.013352964005625e-06, + "loss": 0.0329, + "step": 170000 + }, + { + "epoch": 0.12005, + "grad_norm": 0.03547142818570137, + "learning_rate": 3.011385792834012e-06, + "loss": 0.0339, + "step": 170010 + }, + { + "epoch": 0.1201, + "grad_norm": 0.0389530323445797, + "learning_rate": 3.009419222822854e-06, + "loss": 0.0333, + "step": 170020 + }, + { + "epoch": 0.12015, + "grad_norm": 0.04693255200982094, + "learning_rate": 3.0074532540259133e-06, + "loss": 0.0333, + "step": 170030 + }, + { + "epoch": 0.1202, + "grad_norm": 0.0418180488049984, + "learning_rate": 3.0054878864969353e-06, + "loss": 0.0343, + "step": 170040 + }, + { + "epoch": 0.12025, + "grad_norm": 0.03619196265935898, + "learning_rate": 3.0035231202896587e-06, + "loss": 0.0339, + "step": 170050 + }, + { + "epoch": 0.1203, + "grad_norm": 0.03492094203829765, + "learning_rate": 3.0015589554577977e-06, + "loss": 0.0331, + "step": 170060 + }, + { + "epoch": 0.12035, + "grad_norm": 0.035028375685214996, + "learning_rate": 2.999595392055049e-06, + "loss": 0.0324, + "step": 170070 + }, + { + "epoch": 0.1204, + "grad_norm": 0.03487279638648033, + "learning_rate": 2.9976324301350916e-06, + "loss": 0.0329, + "step": 170080 + }, + { + "epoch": 0.12045, + "grad_norm": 0.033144522458314896, + "learning_rate": 2.9956700697515996e-06, + "loss": 0.0316, + "step": 170090 + }, + { + "epoch": 0.1205, + "grad_norm": 0.03214790299534798, + "learning_rate": 2.9937083109582265e-06, + "loss": 0.0316, + "step": 170100 + }, + { + "epoch": 0.12055, + "grad_norm": 0.029983768239617348, + "learning_rate": 2.9917471538085996e-06, + "loss": 0.0321, + "step": 170110 + }, + { + "epoch": 0.1206, + "grad_norm": 0.03648291155695915, + "learning_rate": 2.9897865983563366e-06, + "loss": 0.0321, + "step": 170120 + }, + { + "epoch": 0.12065, + "grad_norm": 0.03465743735432625, + "learning_rate": 2.987826644655034e-06, + "loss": 0.0329, + "step": 170130 + }, + { + "epoch": 0.1207, + "grad_norm": 0.039328765124082565, + "learning_rate": 2.9858672927582876e-06, + "loss": 0.0324, + "step": 170140 + }, + { + "epoch": 0.12075, + "grad_norm": 0.039982233196496964, + "learning_rate": 2.98390854271966e-06, + "loss": 0.0349, + "step": 170150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.04412681609392166, + "learning_rate": 2.9819503945926946e-06, + "loss": 0.0323, + "step": 170160 + }, + { + "epoch": 0.12085, + "grad_norm": 0.0323239229619503, + "learning_rate": 2.9799928484309405e-06, + "loss": 0.0316, + "step": 170170 + }, + { + "epoch": 0.1209, + "grad_norm": 0.03849190101027489, + "learning_rate": 2.9780359042879043e-06, + "loss": 0.0327, + "step": 170180 + }, + { + "epoch": 0.12095, + "grad_norm": 0.03475498780608177, + "learning_rate": 2.9760795622171017e-06, + "loss": 0.0334, + "step": 170190 + }, + { + "epoch": 0.121, + "grad_norm": 0.0383419468998909, + "learning_rate": 2.974123822272001e-06, + "loss": 0.034, + "step": 170200 + }, + { + "epoch": 0.12105, + "grad_norm": 0.03523703292012215, + "learning_rate": 2.9721686845060797e-06, + "loss": 0.032, + "step": 170210 + }, + { + "epoch": 0.1211, + "grad_norm": 0.03748098015785217, + "learning_rate": 2.970214148972797e-06, + "loss": 0.0325, + "step": 170220 + }, + { + "epoch": 0.12115, + "grad_norm": 0.038272153586149216, + "learning_rate": 2.968260215725582e-06, + "loss": 0.034, + "step": 170230 + }, + { + "epoch": 0.1212, + "grad_norm": 0.03523880988359451, + "learning_rate": 2.966306884817857e-06, + "loss": 0.033, + "step": 170240 + }, + { + "epoch": 0.12125, + "grad_norm": 0.045698393136262894, + "learning_rate": 2.9643541563030177e-06, + "loss": 0.0343, + "step": 170250 + }, + { + "epoch": 0.1213, + "grad_norm": 0.03664017468690872, + "learning_rate": 2.9624020302344623e-06, + "loss": 0.0329, + "step": 170260 + }, + { + "epoch": 0.12135, + "grad_norm": 0.03602268174290657, + "learning_rate": 2.9604505066655575e-06, + "loss": 0.0337, + "step": 170270 + }, + { + "epoch": 0.1214, + "grad_norm": 0.03589862212538719, + "learning_rate": 2.9584995856496516e-06, + "loss": 0.0332, + "step": 170280 + }, + { + "epoch": 0.12145, + "grad_norm": 0.03870958834886551, + "learning_rate": 2.9565492672400886e-06, + "loss": 0.0327, + "step": 170290 + }, + { + "epoch": 0.1215, + "grad_norm": 0.039397165179252625, + "learning_rate": 2.9545995514901837e-06, + "loss": 0.0348, + "step": 170300 + }, + { + "epoch": 0.12155, + "grad_norm": 0.036325667053461075, + "learning_rate": 2.9526504384532525e-06, + "loss": 0.0335, + "step": 170310 + }, + { + "epoch": 0.1216, + "grad_norm": 0.03910687193274498, + "learning_rate": 2.9507019281825726e-06, + "loss": 0.0338, + "step": 170320 + }, + { + "epoch": 0.12165, + "grad_norm": 0.03571851924061775, + "learning_rate": 2.9487540207314203e-06, + "loss": 0.0341, + "step": 170330 + }, + { + "epoch": 0.1217, + "grad_norm": 0.033882223069667816, + "learning_rate": 2.9468067161530416e-06, + "loss": 0.0337, + "step": 170340 + }, + { + "epoch": 0.12175, + "grad_norm": 0.029706673696637154, + "learning_rate": 2.9448600145006888e-06, + "loss": 0.0343, + "step": 170350 + }, + { + "epoch": 0.1218, + "grad_norm": 0.03674902021884918, + "learning_rate": 2.9429139158275774e-06, + "loss": 0.0339, + "step": 170360 + }, + { + "epoch": 0.12185, + "grad_norm": 0.03714902698993683, + "learning_rate": 2.940968420186907e-06, + "loss": 0.0334, + "step": 170370 + }, + { + "epoch": 0.1219, + "grad_norm": 0.038870666176080704, + "learning_rate": 2.939023527631879e-06, + "loss": 0.0346, + "step": 170380 + }, + { + "epoch": 0.12195, + "grad_norm": 0.03690927103161812, + "learning_rate": 2.937079238215654e-06, + "loss": 0.0323, + "step": 170390 + }, + { + "epoch": 0.122, + "grad_norm": 0.03617848455905914, + "learning_rate": 2.9351355519914e-06, + "loss": 0.0333, + "step": 170400 + }, + { + "epoch": 0.12205, + "grad_norm": 0.03987789526581764, + "learning_rate": 2.9331924690122496e-06, + "loss": 0.0331, + "step": 170410 + }, + { + "epoch": 0.1221, + "grad_norm": 0.03884678706526756, + "learning_rate": 2.9312499893313224e-06, + "loss": 0.0333, + "step": 170420 + }, + { + "epoch": 0.12215, + "grad_norm": 0.03599599748849869, + "learning_rate": 2.929308113001733e-06, + "loss": 0.0343, + "step": 170430 + }, + { + "epoch": 0.1222, + "grad_norm": 0.034484054893255234, + "learning_rate": 2.92736684007657e-06, + "loss": 0.0327, + "step": 170440 + }, + { + "epoch": 0.12225, + "grad_norm": 0.04852869361639023, + "learning_rate": 2.925426170608905e-06, + "loss": 0.0321, + "step": 170450 + }, + { + "epoch": 0.1223, + "grad_norm": 0.037036336958408356, + "learning_rate": 2.92348610465179e-06, + "loss": 0.0341, + "step": 170460 + }, + { + "epoch": 0.12235, + "grad_norm": 0.04164846986532211, + "learning_rate": 2.9215466422582717e-06, + "loss": 0.0325, + "step": 170470 + }, + { + "epoch": 0.1224, + "grad_norm": 0.038174912333488464, + "learning_rate": 2.9196077834813844e-06, + "loss": 0.0326, + "step": 170480 + }, + { + "epoch": 0.12245, + "grad_norm": 0.03971322998404503, + "learning_rate": 2.9176695283741146e-06, + "loss": 0.0333, + "step": 170490 + }, + { + "epoch": 0.1225, + "grad_norm": 0.044817980378866196, + "learning_rate": 2.915731876989469e-06, + "loss": 0.0341, + "step": 170500 + }, + { + "epoch": 0.12255, + "grad_norm": 0.049287863075733185, + "learning_rate": 2.913794829380412e-06, + "loss": 0.0328, + "step": 170510 + }, + { + "epoch": 0.1226, + "grad_norm": 0.046644117683172226, + "learning_rate": 2.911858385599911e-06, + "loss": 0.0333, + "step": 170520 + }, + { + "epoch": 0.12265, + "grad_norm": 0.03821752965450287, + "learning_rate": 2.9099225457009047e-06, + "loss": 0.0324, + "step": 170530 + }, + { + "epoch": 0.1227, + "grad_norm": 0.04043368250131607, + "learning_rate": 2.9079873097363126e-06, + "loss": 0.0326, + "step": 170540 + }, + { + "epoch": 0.12275, + "grad_norm": 0.0473080575466156, + "learning_rate": 2.9060526777590522e-06, + "loss": 0.0323, + "step": 170550 + }, + { + "epoch": 0.1228, + "grad_norm": 0.03708431124687195, + "learning_rate": 2.9041186498220104e-06, + "loss": 0.0325, + "step": 170560 + }, + { + "epoch": 0.12285, + "grad_norm": 0.04041951522231102, + "learning_rate": 2.9021852259780656e-06, + "loss": 0.0368, + "step": 170570 + }, + { + "epoch": 0.1229, + "grad_norm": 0.03459900617599487, + "learning_rate": 2.9002524062800684e-06, + "loss": 0.0328, + "step": 170580 + }, + { + "epoch": 0.12295, + "grad_norm": 0.0452764630317688, + "learning_rate": 2.8983201907808734e-06, + "loss": 0.0356, + "step": 170590 + }, + { + "epoch": 0.123, + "grad_norm": 0.03793106600642204, + "learning_rate": 2.8963885795332963e-06, + "loss": 0.0335, + "step": 170600 + }, + { + "epoch": 0.12305, + "grad_norm": 0.0353790745139122, + "learning_rate": 2.8944575725901565e-06, + "loss": 0.0326, + "step": 170610 + }, + { + "epoch": 0.1231, + "grad_norm": 0.04083758220076561, + "learning_rate": 2.892527170004242e-06, + "loss": 0.0348, + "step": 170620 + }, + { + "epoch": 0.12315, + "grad_norm": 0.04061799496412277, + "learning_rate": 2.8905973718283243e-06, + "loss": 0.0329, + "step": 170630 + }, + { + "epoch": 0.1232, + "grad_norm": 0.041940487921237946, + "learning_rate": 2.8886681781151727e-06, + "loss": 0.0324, + "step": 170640 + }, + { + "epoch": 0.12325, + "grad_norm": 0.036815203726291656, + "learning_rate": 2.886739588917528e-06, + "loss": 0.0336, + "step": 170650 + }, + { + "epoch": 0.1233, + "grad_norm": 0.03948887810111046, + "learning_rate": 2.8848116042881094e-06, + "loss": 0.0338, + "step": 170660 + }, + { + "epoch": 0.12335, + "grad_norm": 0.033183470368385315, + "learning_rate": 2.882884224279636e-06, + "loss": 0.0342, + "step": 170670 + }, + { + "epoch": 0.1234, + "grad_norm": 0.030717089772224426, + "learning_rate": 2.880957448944796e-06, + "loss": 0.0327, + "step": 170680 + }, + { + "epoch": 0.12345, + "grad_norm": 0.03915680572390556, + "learning_rate": 2.8790312783362806e-06, + "loss": 0.0334, + "step": 170690 + }, + { + "epoch": 0.1235, + "grad_norm": 0.03605576604604721, + "learning_rate": 2.877105712506728e-06, + "loss": 0.0336, + "step": 170700 + }, + { + "epoch": 0.12355, + "grad_norm": 0.039679888635873795, + "learning_rate": 2.8751807515087993e-06, + "loss": 0.0322, + "step": 170710 + }, + { + "epoch": 0.1236, + "grad_norm": 0.033951517194509506, + "learning_rate": 2.873256395395113e-06, + "loss": 0.0329, + "step": 170720 + }, + { + "epoch": 0.12365, + "grad_norm": 0.04974079132080078, + "learning_rate": 2.8713326442182886e-06, + "loss": 0.0329, + "step": 170730 + }, + { + "epoch": 0.1237, + "grad_norm": 0.04222653806209564, + "learning_rate": 2.869409498030917e-06, + "loss": 0.0333, + "step": 170740 + }, + { + "epoch": 0.12375, + "grad_norm": 0.036745402961969376, + "learning_rate": 2.8674869568855676e-06, + "loss": 0.0324, + "step": 170750 + }, + { + "epoch": 0.1238, + "grad_norm": 0.030236944556236267, + "learning_rate": 2.8655650208348178e-06, + "loss": 0.0321, + "step": 170760 + }, + { + "epoch": 0.12385, + "grad_norm": 0.033882755786180496, + "learning_rate": 2.863643689931206e-06, + "loss": 0.0334, + "step": 170770 + }, + { + "epoch": 0.1239, + "grad_norm": 0.0467345230281353, + "learning_rate": 2.8617229642272563e-06, + "loss": 0.0339, + "step": 170780 + }, + { + "epoch": 0.12395, + "grad_norm": 0.03583429381251335, + "learning_rate": 2.8598028437754802e-06, + "loss": 0.0344, + "step": 170790 + }, + { + "epoch": 0.124, + "grad_norm": 0.033289577811956406, + "learning_rate": 2.857883328628377e-06, + "loss": 0.0325, + "step": 170800 + }, + { + "epoch": 0.12405, + "grad_norm": 0.03858331963419914, + "learning_rate": 2.8559644188384306e-06, + "loss": 0.0328, + "step": 170810 + }, + { + "epoch": 0.1241, + "grad_norm": 0.03308607637882233, + "learning_rate": 2.8540461144580978e-06, + "loss": 0.0331, + "step": 170820 + }, + { + "epoch": 0.12415, + "grad_norm": 0.04164617508649826, + "learning_rate": 2.852128415539823e-06, + "loss": 0.0345, + "step": 170830 + }, + { + "epoch": 0.1242, + "grad_norm": 0.0402013398706913, + "learning_rate": 2.8502113221360314e-06, + "loss": 0.0336, + "step": 170840 + }, + { + "epoch": 0.12425, + "grad_norm": 0.036746375262737274, + "learning_rate": 2.848294834299148e-06, + "loss": 0.0352, + "step": 170850 + }, + { + "epoch": 0.1243, + "grad_norm": 0.034505799412727356, + "learning_rate": 2.8463789520815602e-06, + "loss": 0.0345, + "step": 170860 + }, + { + "epoch": 0.12435, + "grad_norm": 0.036910947412252426, + "learning_rate": 2.8444636755356442e-06, + "loss": 0.0349, + "step": 170870 + }, + { + "epoch": 0.1244, + "grad_norm": 0.05080193281173706, + "learning_rate": 2.8425490047137737e-06, + "loss": 0.0341, + "step": 170880 + }, + { + "epoch": 0.12445, + "grad_norm": 0.041805725544691086, + "learning_rate": 2.840634939668285e-06, + "loss": 0.0332, + "step": 170890 + }, + { + "epoch": 0.1245, + "grad_norm": 0.0434776172041893, + "learning_rate": 2.83872148045152e-06, + "loss": 0.0347, + "step": 170900 + }, + { + "epoch": 0.12455, + "grad_norm": 0.035198379307985306, + "learning_rate": 2.836808627115775e-06, + "loss": 0.0343, + "step": 170910 + }, + { + "epoch": 0.1246, + "grad_norm": 0.03682961314916611, + "learning_rate": 2.834896379713356e-06, + "loss": 0.0321, + "step": 170920 + }, + { + "epoch": 0.12465, + "grad_norm": 0.038385991007089615, + "learning_rate": 2.8329847382965485e-06, + "loss": 0.0329, + "step": 170930 + }, + { + "epoch": 0.1247, + "grad_norm": 0.038753096014261246, + "learning_rate": 2.831073702917611e-06, + "loss": 0.0324, + "step": 170940 + }, + { + "epoch": 0.12475, + "grad_norm": 0.03721785545349121, + "learning_rate": 2.8291632736287877e-06, + "loss": 0.0323, + "step": 170950 + }, + { + "epoch": 0.1248, + "grad_norm": 0.04255424067378044, + "learning_rate": 2.827253450482309e-06, + "loss": 0.0337, + "step": 170960 + }, + { + "epoch": 0.12485, + "grad_norm": 0.03699033707380295, + "learning_rate": 2.8253442335303944e-06, + "loss": 0.0315, + "step": 170970 + }, + { + "epoch": 0.1249, + "grad_norm": 0.0359467975795269, + "learning_rate": 2.8234356228252377e-06, + "loss": 0.0328, + "step": 170980 + }, + { + "epoch": 0.12495, + "grad_norm": 0.03315534070134163, + "learning_rate": 2.821527618419015e-06, + "loss": 0.0319, + "step": 170990 + }, + { + "epoch": 0.125, + "grad_norm": 0.04848819971084595, + "learning_rate": 2.8196202203639e-06, + "loss": 0.0341, + "step": 171000 + }, + { + "epoch": 0.12505, + "grad_norm": 0.03591757267713547, + "learning_rate": 2.817713428712029e-06, + "loss": 0.0317, + "step": 171010 + }, + { + "epoch": 0.1251, + "grad_norm": 0.04786156117916107, + "learning_rate": 2.8158072435155474e-06, + "loss": 0.0351, + "step": 171020 + }, + { + "epoch": 0.12515, + "grad_norm": 0.04991556331515312, + "learning_rate": 2.8139016648265597e-06, + "loss": 0.0329, + "step": 171030 + }, + { + "epoch": 0.1252, + "grad_norm": 0.036221716552972794, + "learning_rate": 2.811996692697161e-06, + "loss": 0.0332, + "step": 171040 + }, + { + "epoch": 0.12525, + "grad_norm": 0.03783911094069481, + "learning_rate": 2.810092327179442e-06, + "loss": 0.0318, + "step": 171050 + }, + { + "epoch": 0.1253, + "grad_norm": 0.038371000438928604, + "learning_rate": 2.8081885683254623e-06, + "loss": 0.033, + "step": 171060 + }, + { + "epoch": 0.12535, + "grad_norm": 0.04190398380160332, + "learning_rate": 2.806285416187271e-06, + "loss": 0.0322, + "step": 171070 + }, + { + "epoch": 0.1254, + "grad_norm": 0.038543831557035446, + "learning_rate": 2.8043828708168935e-06, + "loss": 0.0321, + "step": 171080 + }, + { + "epoch": 0.12545, + "grad_norm": 0.03184407949447632, + "learning_rate": 2.8024809322663547e-06, + "loss": 0.0319, + "step": 171090 + }, + { + "epoch": 0.1255, + "grad_norm": 0.038300253450870514, + "learning_rate": 2.800579600587641e-06, + "loss": 0.0327, + "step": 171100 + }, + { + "epoch": 0.12555, + "grad_norm": 0.04688066616654396, + "learning_rate": 2.798678875832747e-06, + "loss": 0.0366, + "step": 171110 + }, + { + "epoch": 0.1256, + "grad_norm": 0.03489735722541809, + "learning_rate": 2.7967787580536336e-06, + "loss": 0.031, + "step": 171120 + }, + { + "epoch": 0.12565, + "grad_norm": 0.04473022371530533, + "learning_rate": 2.79487924730224e-06, + "loss": 0.0334, + "step": 171130 + }, + { + "epoch": 0.1257, + "grad_norm": 0.037319961935281754, + "learning_rate": 2.7929803436305137e-06, + "loss": 0.0329, + "step": 171140 + }, + { + "epoch": 0.12575, + "grad_norm": 0.0370020754635334, + "learning_rate": 2.79108204709036e-06, + "loss": 0.0329, + "step": 171150 + }, + { + "epoch": 0.1258, + "grad_norm": 0.03408210724592209, + "learning_rate": 2.789184357733679e-06, + "loss": 0.0329, + "step": 171160 + }, + { + "epoch": 0.12585, + "grad_norm": 0.03917255252599716, + "learning_rate": 2.787287275612349e-06, + "loss": 0.0318, + "step": 171170 + }, + { + "epoch": 0.1259, + "grad_norm": 0.03344712778925896, + "learning_rate": 2.7853908007782426e-06, + "loss": 0.0323, + "step": 171180 + }, + { + "epoch": 0.12595, + "grad_norm": 0.036113545298576355, + "learning_rate": 2.783494933283212e-06, + "loss": 0.0353, + "step": 171190 + }, + { + "epoch": 0.126, + "grad_norm": 0.042464982718229294, + "learning_rate": 2.7815996731790778e-06, + "loss": 0.0347, + "step": 171200 + }, + { + "epoch": 0.12605, + "grad_norm": 0.03898061439394951, + "learning_rate": 2.7797050205176644e-06, + "loss": 0.033, + "step": 171210 + }, + { + "epoch": 0.1261, + "grad_norm": 0.03752130642533302, + "learning_rate": 2.7778109753507614e-06, + "loss": 0.0324, + "step": 171220 + }, + { + "epoch": 0.12615, + "grad_norm": 0.033893827348947525, + "learning_rate": 2.775917537730163e-06, + "loss": 0.032, + "step": 171230 + }, + { + "epoch": 0.1262, + "grad_norm": 0.03558017686009407, + "learning_rate": 2.7740247077076316e-06, + "loss": 0.0316, + "step": 171240 + }, + { + "epoch": 0.12625, + "grad_norm": 0.039208702743053436, + "learning_rate": 2.7721324853349106e-06, + "loss": 0.0301, + "step": 171250 + }, + { + "epoch": 0.1263, + "grad_norm": 0.036127764731645584, + "learning_rate": 2.77024087066374e-06, + "loss": 0.0318, + "step": 171260 + }, + { + "epoch": 0.12635, + "grad_norm": 0.03924562782049179, + "learning_rate": 2.7683498637458336e-06, + "loss": 0.0316, + "step": 171270 + }, + { + "epoch": 0.1264, + "grad_norm": 0.03675839304924011, + "learning_rate": 2.7664594646328895e-06, + "loss": 0.032, + "step": 171280 + }, + { + "epoch": 0.12645, + "grad_norm": 0.03594402223825455, + "learning_rate": 2.764569673376585e-06, + "loss": 0.0315, + "step": 171290 + }, + { + "epoch": 0.1265, + "grad_norm": 0.03455478325486183, + "learning_rate": 2.7626804900285937e-06, + "loss": 0.031, + "step": 171300 + }, + { + "epoch": 0.12655, + "grad_norm": 0.03500540927052498, + "learning_rate": 2.7607919146405674e-06, + "loss": 0.033, + "step": 171310 + }, + { + "epoch": 0.1266, + "grad_norm": 0.03529797121882439, + "learning_rate": 2.7589039472641354e-06, + "loss": 0.0318, + "step": 171320 + }, + { + "epoch": 0.12665, + "grad_norm": 0.034966934472322464, + "learning_rate": 2.757016587950914e-06, + "loss": 0.0302, + "step": 171330 + }, + { + "epoch": 0.1267, + "grad_norm": 0.031065698713064194, + "learning_rate": 2.7551298367524985e-06, + "loss": 0.0317, + "step": 171340 + }, + { + "epoch": 0.12675, + "grad_norm": 0.03381740301847458, + "learning_rate": 2.753243693720481e-06, + "loss": 0.0325, + "step": 171350 + }, + { + "epoch": 0.1268, + "grad_norm": 0.03633000701665878, + "learning_rate": 2.7513581589064223e-06, + "loss": 0.0316, + "step": 171360 + }, + { + "epoch": 0.12685, + "grad_norm": 0.041332636028528214, + "learning_rate": 2.749473232361871e-06, + "loss": 0.0312, + "step": 171370 + }, + { + "epoch": 0.1269, + "grad_norm": 0.03545716404914856, + "learning_rate": 2.7475889141383632e-06, + "loss": 0.032, + "step": 171380 + }, + { + "epoch": 0.12695, + "grad_norm": 0.03463119640946388, + "learning_rate": 2.745705204287413e-06, + "loss": 0.0335, + "step": 171390 + }, + { + "epoch": 0.127, + "grad_norm": 0.03852127864956856, + "learning_rate": 2.74382210286053e-06, + "loss": 0.0338, + "step": 171400 + }, + { + "epoch": 0.12705, + "grad_norm": 0.03509530797600746, + "learning_rate": 2.7419396099091806e-06, + "loss": 0.0329, + "step": 171410 + }, + { + "epoch": 0.1271, + "grad_norm": 0.032109301537275314, + "learning_rate": 2.7400577254848405e-06, + "loss": 0.0336, + "step": 171420 + }, + { + "epoch": 0.12715, + "grad_norm": 0.03659934550523758, + "learning_rate": 2.7381764496389626e-06, + "loss": 0.0333, + "step": 171430 + }, + { + "epoch": 0.1272, + "grad_norm": 0.03949648514389992, + "learning_rate": 2.736295782422979e-06, + "loss": 0.0351, + "step": 171440 + }, + { + "epoch": 0.12725, + "grad_norm": 0.03775806725025177, + "learning_rate": 2.7344157238883034e-06, + "loss": 0.0332, + "step": 171450 + }, + { + "epoch": 0.1273, + "grad_norm": 0.03968925401568413, + "learning_rate": 2.7325362740863336e-06, + "loss": 0.0354, + "step": 171460 + }, + { + "epoch": 0.12735, + "grad_norm": 0.038549553602933884, + "learning_rate": 2.7306574330684593e-06, + "loss": 0.0326, + "step": 171470 + }, + { + "epoch": 0.1274, + "grad_norm": 0.03686949238181114, + "learning_rate": 2.7287792008860472e-06, + "loss": 0.0349, + "step": 171480 + }, + { + "epoch": 0.12745, + "grad_norm": 0.03963692486286163, + "learning_rate": 2.7269015775904377e-06, + "loss": 0.0329, + "step": 171490 + }, + { + "epoch": 0.1275, + "grad_norm": 0.03710221126675606, + "learning_rate": 2.725024563232978e-06, + "loss": 0.0349, + "step": 171500 + }, + { + "epoch": 0.12755, + "grad_norm": 0.039931006729602814, + "learning_rate": 2.7231481578649743e-06, + "loss": 0.0343, + "step": 171510 + }, + { + "epoch": 0.1276, + "grad_norm": 0.03956283628940582, + "learning_rate": 2.7212723615377326e-06, + "loss": 0.035, + "step": 171520 + }, + { + "epoch": 0.12765, + "grad_norm": 0.03812943771481514, + "learning_rate": 2.7193971743025376e-06, + "loss": 0.0352, + "step": 171530 + }, + { + "epoch": 0.1277, + "grad_norm": 0.03685388341546059, + "learning_rate": 2.7175225962106533e-06, + "loss": 0.033, + "step": 171540 + }, + { + "epoch": 0.12775, + "grad_norm": 0.040714338421821594, + "learning_rate": 2.715648627313322e-06, + "loss": 0.0321, + "step": 171550 + }, + { + "epoch": 0.1278, + "grad_norm": 0.0388384610414505, + "learning_rate": 2.713775267661789e-06, + "loss": 0.0326, + "step": 171560 + }, + { + "epoch": 0.12785, + "grad_norm": 0.03131807595491409, + "learning_rate": 2.7119025173072772e-06, + "loss": 0.033, + "step": 171570 + }, + { + "epoch": 0.1279, + "grad_norm": 0.03585648536682129, + "learning_rate": 2.7100303763009647e-06, + "loss": 0.0328, + "step": 171580 + }, + { + "epoch": 0.12795, + "grad_norm": 0.03749939799308777, + "learning_rate": 2.708158844694056e-06, + "loss": 0.0383, + "step": 171590 + }, + { + "epoch": 0.128, + "grad_norm": 0.037364646792411804, + "learning_rate": 2.7062879225377006e-06, + "loss": 0.0366, + "step": 171600 + }, + { + "epoch": 0.12805, + "grad_norm": 0.04412858933210373, + "learning_rate": 2.704417609883067e-06, + "loss": 0.0348, + "step": 171610 + }, + { + "epoch": 0.1281, + "grad_norm": 0.039236828684806824, + "learning_rate": 2.7025479067812777e-06, + "loss": 0.0332, + "step": 171620 + }, + { + "epoch": 0.12815, + "grad_norm": 0.04682591184973717, + "learning_rate": 2.7006788132834498e-06, + "loss": 0.033, + "step": 171630 + }, + { + "epoch": 0.1282, + "grad_norm": 0.04099787026643753, + "learning_rate": 2.6988103294406875e-06, + "loss": 0.0323, + "step": 171640 + }, + { + "epoch": 0.12825, + "grad_norm": 0.036896705627441406, + "learning_rate": 2.6969424553040746e-06, + "loss": 0.0331, + "step": 171650 + }, + { + "epoch": 0.1283, + "grad_norm": 0.038666076958179474, + "learning_rate": 2.6950751909246756e-06, + "loss": 0.0334, + "step": 171660 + }, + { + "epoch": 0.12835, + "grad_norm": 0.03882349282503128, + "learning_rate": 2.6932085363535362e-06, + "loss": 0.0328, + "step": 171670 + }, + { + "epoch": 0.1284, + "grad_norm": 0.04096510261297226, + "learning_rate": 2.6913424916416936e-06, + "loss": 0.0325, + "step": 171680 + }, + { + "epoch": 0.12845, + "grad_norm": 0.03649698942899704, + "learning_rate": 2.689477056840173e-06, + "loss": 0.0333, + "step": 171690 + }, + { + "epoch": 0.1285, + "grad_norm": 0.042621366679668427, + "learning_rate": 2.6876122319999668e-06, + "loss": 0.0331, + "step": 171700 + }, + { + "epoch": 0.12855, + "grad_norm": 0.03946394473314285, + "learning_rate": 2.685748017172063e-06, + "loss": 0.0326, + "step": 171710 + }, + { + "epoch": 0.1286, + "grad_norm": 0.034220434725284576, + "learning_rate": 2.683884412407417e-06, + "loss": 0.0328, + "step": 171720 + }, + { + "epoch": 0.12865, + "grad_norm": 0.039184972643852234, + "learning_rate": 2.682021417756994e-06, + "loss": 0.0341, + "step": 171730 + }, + { + "epoch": 0.1287, + "grad_norm": 0.03642386570572853, + "learning_rate": 2.6801590332717203e-06, + "loss": 0.0324, + "step": 171740 + }, + { + "epoch": 0.12875, + "grad_norm": 0.03386127948760986, + "learning_rate": 2.6782972590025074e-06, + "loss": 0.033, + "step": 171750 + }, + { + "epoch": 0.1288, + "grad_norm": 0.036138761788606644, + "learning_rate": 2.6764360950002677e-06, + "loss": 0.0336, + "step": 171760 + }, + { + "epoch": 0.12885, + "grad_norm": 0.036520663648843765, + "learning_rate": 2.674575541315874e-06, + "loss": 0.0332, + "step": 171770 + }, + { + "epoch": 0.1289, + "grad_norm": 0.03316280245780945, + "learning_rate": 2.6727155980002057e-06, + "loss": 0.0339, + "step": 171780 + }, + { + "epoch": 0.12895, + "grad_norm": 0.04363163560628891, + "learning_rate": 2.6708562651040963e-06, + "loss": 0.0328, + "step": 171790 + }, + { + "epoch": 0.129, + "grad_norm": 0.03942793607711792, + "learning_rate": 2.6689975426783946e-06, + "loss": 0.0343, + "step": 171800 + }, + { + "epoch": 0.12905, + "grad_norm": 0.038066670298576355, + "learning_rate": 2.6671394307739043e-06, + "loss": 0.0341, + "step": 171810 + }, + { + "epoch": 0.1291, + "grad_norm": 0.04269417002797127, + "learning_rate": 2.6652819294414377e-06, + "loss": 0.0318, + "step": 171820 + }, + { + "epoch": 0.12915, + "grad_norm": 0.03263647109270096, + "learning_rate": 2.6634250387317706e-06, + "loss": 0.0336, + "step": 171830 + }, + { + "epoch": 0.1292, + "grad_norm": 0.04190134257078171, + "learning_rate": 2.6615687586956678e-06, + "loss": 0.0337, + "step": 171840 + }, + { + "epoch": 0.12925, + "grad_norm": 0.03180139511823654, + "learning_rate": 2.659713089383886e-06, + "loss": 0.033, + "step": 171850 + }, + { + "epoch": 0.1293, + "grad_norm": 0.0395943745970726, + "learning_rate": 2.6578580308471575e-06, + "loss": 0.0334, + "step": 171860 + }, + { + "epoch": 0.12935, + "grad_norm": 0.0389542281627655, + "learning_rate": 2.6560035831361936e-06, + "loss": 0.0329, + "step": 171870 + }, + { + "epoch": 0.1294, + "grad_norm": 0.041193362325429916, + "learning_rate": 2.6541497463016983e-06, + "loss": 0.0354, + "step": 171880 + }, + { + "epoch": 0.12945, + "grad_norm": 0.04503790661692619, + "learning_rate": 2.652296520394351e-06, + "loss": 0.0344, + "step": 171890 + }, + { + "epoch": 0.1295, + "grad_norm": 0.04351415857672691, + "learning_rate": 2.650443905464828e-06, + "loss": 0.0347, + "step": 171900 + }, + { + "epoch": 0.12955, + "grad_norm": 0.03476227447390556, + "learning_rate": 2.6485919015637712e-06, + "loss": 0.034, + "step": 171910 + }, + { + "epoch": 0.1296, + "grad_norm": 0.04246963933110237, + "learning_rate": 2.646740508741813e-06, + "loss": 0.0343, + "step": 171920 + }, + { + "epoch": 0.12965, + "grad_norm": 0.03715596720576286, + "learning_rate": 2.6448897270495683e-06, + "loss": 0.0343, + "step": 171930 + }, + { + "epoch": 0.1297, + "grad_norm": 0.039296895265579224, + "learning_rate": 2.643039556537644e-06, + "loss": 0.0356, + "step": 171940 + }, + { + "epoch": 0.12975, + "grad_norm": 0.03285042941570282, + "learning_rate": 2.6411899972566186e-06, + "loss": 0.0333, + "step": 171950 + }, + { + "epoch": 0.1298, + "grad_norm": 0.03390447795391083, + "learning_rate": 2.6393410492570546e-06, + "loss": 0.0336, + "step": 171960 + }, + { + "epoch": 0.12985, + "grad_norm": 0.030247660353779793, + "learning_rate": 2.637492712589512e-06, + "loss": 0.033, + "step": 171970 + }, + { + "epoch": 0.1299, + "grad_norm": 0.034210652112960815, + "learning_rate": 2.6356449873045114e-06, + "loss": 0.0332, + "step": 171980 + }, + { + "epoch": 0.12995, + "grad_norm": 0.031560979783535004, + "learning_rate": 2.6337978734525844e-06, + "loss": 0.0337, + "step": 171990 + }, + { + "epoch": 0.13, + "grad_norm": 0.043251339346170425, + "learning_rate": 2.6319513710842127e-06, + "loss": 0.0336, + "step": 172000 + }, + { + "epoch": 0.13005, + "grad_norm": 0.03139464929699898, + "learning_rate": 2.6301054802498866e-06, + "loss": 0.0324, + "step": 172010 + }, + { + "epoch": 0.1301, + "grad_norm": 0.039893459528684616, + "learning_rate": 2.628260201000077e-06, + "loss": 0.0324, + "step": 172020 + }, + { + "epoch": 0.13015, + "grad_norm": 0.033400699496269226, + "learning_rate": 2.626415533385229e-06, + "loss": 0.0327, + "step": 172030 + }, + { + "epoch": 0.1302, + "grad_norm": 0.03817545995116234, + "learning_rate": 2.6245714774557728e-06, + "loss": 0.0342, + "step": 172040 + }, + { + "epoch": 0.13025, + "grad_norm": 0.03130049630999565, + "learning_rate": 2.6227280332621256e-06, + "loss": 0.0321, + "step": 172050 + }, + { + "epoch": 0.1303, + "grad_norm": 0.041443757712841034, + "learning_rate": 2.6208852008546836e-06, + "loss": 0.0345, + "step": 172060 + }, + { + "epoch": 0.13035, + "grad_norm": 0.037647463381290436, + "learning_rate": 2.6190429802838453e-06, + "loss": 0.0334, + "step": 172070 + }, + { + "epoch": 0.1304, + "grad_norm": 0.038211289793252945, + "learning_rate": 2.617201371599953e-06, + "loss": 0.0329, + "step": 172080 + }, + { + "epoch": 0.13045, + "grad_norm": 0.041644494980573654, + "learning_rate": 2.6153603748533705e-06, + "loss": 0.0333, + "step": 172090 + }, + { + "epoch": 0.1305, + "grad_norm": 0.03600402921438217, + "learning_rate": 2.613519990094421e-06, + "loss": 0.0332, + "step": 172100 + }, + { + "epoch": 0.13055, + "grad_norm": 0.03201232850551605, + "learning_rate": 2.61168021737343e-06, + "loss": 0.0335, + "step": 172110 + }, + { + "epoch": 0.1306, + "grad_norm": 0.04101625829935074, + "learning_rate": 2.6098410567406916e-06, + "loss": 0.0368, + "step": 172120 + }, + { + "epoch": 0.13065, + "grad_norm": 0.037356119602918625, + "learning_rate": 2.608002508246482e-06, + "loss": 0.0338, + "step": 172130 + }, + { + "epoch": 0.1307, + "grad_norm": 0.046279631555080414, + "learning_rate": 2.606164571941078e-06, + "loss": 0.0332, + "step": 172140 + }, + { + "epoch": 0.13075, + "grad_norm": 0.03496434912085533, + "learning_rate": 2.6043272478747187e-06, + "loss": 0.0329, + "step": 172150 + }, + { + "epoch": 0.1308, + "grad_norm": 0.04666680842638016, + "learning_rate": 2.6024905360976405e-06, + "loss": 0.0349, + "step": 172160 + }, + { + "epoch": 0.13085, + "grad_norm": 0.04172470420598984, + "learning_rate": 2.6006544366600494e-06, + "loss": 0.0326, + "step": 172170 + }, + { + "epoch": 0.1309, + "grad_norm": 0.035661906003952026, + "learning_rate": 2.5988189496121584e-06, + "loss": 0.0335, + "step": 172180 + }, + { + "epoch": 0.13095, + "grad_norm": 0.055430054664611816, + "learning_rate": 2.596984075004136e-06, + "loss": 0.0338, + "step": 172190 + }, + { + "epoch": 0.131, + "grad_norm": 0.07867436110973358, + "learning_rate": 2.595149812886158e-06, + "loss": 0.0327, + "step": 172200 + }, + { + "epoch": 0.13105, + "grad_norm": 0.05331834405660629, + "learning_rate": 2.5933161633083654e-06, + "loss": 0.0332, + "step": 172210 + }, + { + "epoch": 0.1311, + "grad_norm": 0.04124901071190834, + "learning_rate": 2.591483126320887e-06, + "loss": 0.0325, + "step": 172220 + }, + { + "epoch": 0.13115, + "grad_norm": 0.04230903834104538, + "learning_rate": 2.5896507019738475e-06, + "loss": 0.0342, + "step": 172230 + }, + { + "epoch": 0.1312, + "grad_norm": 0.03651657700538635, + "learning_rate": 2.587818890317337e-06, + "loss": 0.033, + "step": 172240 + }, + { + "epoch": 0.13125, + "grad_norm": 0.03932705521583557, + "learning_rate": 2.585987691401434e-06, + "loss": 0.0342, + "step": 172250 + }, + { + "epoch": 0.1313, + "grad_norm": 0.039938826113939285, + "learning_rate": 2.5841571052762135e-06, + "loss": 0.0348, + "step": 172260 + }, + { + "epoch": 0.13135, + "grad_norm": 0.03369868919253349, + "learning_rate": 2.58232713199171e-06, + "loss": 0.0329, + "step": 172270 + }, + { + "epoch": 0.1314, + "grad_norm": 0.03550058230757713, + "learning_rate": 2.58049777159797e-06, + "loss": 0.0329, + "step": 172280 + }, + { + "epoch": 0.13145, + "grad_norm": 0.03386114537715912, + "learning_rate": 2.578669024144989e-06, + "loss": 0.0335, + "step": 172290 + }, + { + "epoch": 0.1315, + "grad_norm": 0.040460165590047836, + "learning_rate": 2.576840889682777e-06, + "loss": 0.0336, + "step": 172300 + }, + { + "epoch": 0.13155, + "grad_norm": 0.0648966133594513, + "learning_rate": 2.5750133682613085e-06, + "loss": 0.0349, + "step": 172310 + }, + { + "epoch": 0.1316, + "grad_norm": 0.035546429455280304, + "learning_rate": 2.5731864599305545e-06, + "loss": 0.0312, + "step": 172320 + }, + { + "epoch": 0.13165, + "grad_norm": 0.03535797819495201, + "learning_rate": 2.571360164740455e-06, + "loss": 0.0332, + "step": 172330 + }, + { + "epoch": 0.1317, + "grad_norm": 0.07054276019334793, + "learning_rate": 2.56953448274094e-06, + "loss": 0.0331, + "step": 172340 + }, + { + "epoch": 0.13175, + "grad_norm": 0.04104088246822357, + "learning_rate": 2.5677094139819307e-06, + "loss": 0.0319, + "step": 172350 + }, + { + "epoch": 0.1318, + "grad_norm": 0.030941542237997055, + "learning_rate": 2.5658849585133178e-06, + "loss": 0.0319, + "step": 172360 + }, + { + "epoch": 0.13185, + "grad_norm": 0.042520422488451004, + "learning_rate": 2.564061116384983e-06, + "loss": 0.0325, + "step": 172370 + }, + { + "epoch": 0.1319, + "grad_norm": 0.035734474658966064, + "learning_rate": 2.5622378876467818e-06, + "loss": 0.0321, + "step": 172380 + }, + { + "epoch": 0.13195, + "grad_norm": 0.035758309066295624, + "learning_rate": 2.560415272348568e-06, + "loss": 0.0328, + "step": 172390 + }, + { + "epoch": 0.132, + "grad_norm": 0.029714185744524002, + "learning_rate": 2.5585932705401744e-06, + "loss": 0.0318, + "step": 172400 + }, + { + "epoch": 0.13205, + "grad_norm": 0.045044753700494766, + "learning_rate": 2.556771882271411e-06, + "loss": 0.0334, + "step": 172410 + }, + { + "epoch": 0.1321, + "grad_norm": 0.04619543254375458, + "learning_rate": 2.5549511075920742e-06, + "loss": 0.0322, + "step": 172420 + }, + { + "epoch": 0.13215, + "grad_norm": 0.03538672253489494, + "learning_rate": 2.5531309465519347e-06, + "loss": 0.0312, + "step": 172430 + }, + { + "epoch": 0.1322, + "grad_norm": 0.034619834274053574, + "learning_rate": 2.5513113992007675e-06, + "loss": 0.0323, + "step": 172440 + }, + { + "epoch": 0.13225, + "grad_norm": 0.03356612101197243, + "learning_rate": 2.549492465588313e-06, + "loss": 0.0327, + "step": 172450 + }, + { + "epoch": 0.1323, + "grad_norm": 0.03605108708143234, + "learning_rate": 2.5476741457642976e-06, + "loss": 0.0334, + "step": 172460 + }, + { + "epoch": 0.13235, + "grad_norm": 0.04438095539808273, + "learning_rate": 2.545856439778438e-06, + "loss": 0.0333, + "step": 172470 + }, + { + "epoch": 0.1324, + "grad_norm": 0.03273903578519821, + "learning_rate": 2.5440393476804243e-06, + "loss": 0.0324, + "step": 172480 + }, + { + "epoch": 0.13245, + "grad_norm": 0.03685907647013664, + "learning_rate": 2.542222869519947e-06, + "loss": 0.0311, + "step": 172490 + }, + { + "epoch": 0.1325, + "grad_norm": 0.03596313297748566, + "learning_rate": 2.540407005346651e-06, + "loss": 0.0321, + "step": 172500 + }, + { + "epoch": 0.13255, + "grad_norm": 0.030287181958556175, + "learning_rate": 2.53859175521019e-06, + "loss": 0.0315, + "step": 172510 + }, + { + "epoch": 0.1326, + "grad_norm": 0.034024789929389954, + "learning_rate": 2.536777119160197e-06, + "loss": 0.0321, + "step": 172520 + }, + { + "epoch": 0.13265, + "grad_norm": 0.032543737441301346, + "learning_rate": 2.5349630972462792e-06, + "loss": 0.0318, + "step": 172530 + }, + { + "epoch": 0.1327, + "grad_norm": 0.03257584199309349, + "learning_rate": 2.5331496895180272e-06, + "loss": 0.0311, + "step": 172540 + }, + { + "epoch": 0.13275, + "grad_norm": 0.03539401665329933, + "learning_rate": 2.5313368960250216e-06, + "loss": 0.0323, + "step": 172550 + }, + { + "epoch": 0.1328, + "grad_norm": 0.04302481189370155, + "learning_rate": 2.529524716816825e-06, + "loss": 0.0334, + "step": 172560 + }, + { + "epoch": 0.13285, + "grad_norm": 0.03241045027971268, + "learning_rate": 2.5277131519429843e-06, + "loss": 0.0321, + "step": 172570 + }, + { + "epoch": 0.1329, + "grad_norm": 0.0356929786503315, + "learning_rate": 2.5259022014530174e-06, + "loss": 0.0326, + "step": 172580 + }, + { + "epoch": 0.13295, + "grad_norm": 0.03583104535937309, + "learning_rate": 2.5240918653964467e-06, + "loss": 0.0323, + "step": 172590 + }, + { + "epoch": 0.133, + "grad_norm": 0.03354700654745102, + "learning_rate": 2.5222821438227545e-06, + "loss": 0.0342, + "step": 172600 + }, + { + "epoch": 0.13305, + "grad_norm": 0.042298246175050735, + "learning_rate": 2.520473036781429e-06, + "loss": 0.0324, + "step": 172610 + }, + { + "epoch": 0.1331, + "grad_norm": 0.04196954146027565, + "learning_rate": 2.5186645443219274e-06, + "loss": 0.0325, + "step": 172620 + }, + { + "epoch": 0.13315, + "grad_norm": 0.037745051085948944, + "learning_rate": 2.516856666493683e-06, + "loss": 0.0321, + "step": 172630 + }, + { + "epoch": 0.1332, + "grad_norm": 0.03741823136806488, + "learning_rate": 2.51504940334614e-06, + "loss": 0.0338, + "step": 172640 + }, + { + "epoch": 0.13325, + "grad_norm": 0.03566833585500717, + "learning_rate": 2.5132427549286965e-06, + "loss": 0.0321, + "step": 172650 + }, + { + "epoch": 0.1333, + "grad_norm": 0.03631792217493057, + "learning_rate": 2.511436721290747e-06, + "loss": 0.0321, + "step": 172660 + }, + { + "epoch": 0.13335, + "grad_norm": 0.03379756212234497, + "learning_rate": 2.509631302481666e-06, + "loss": 0.0318, + "step": 172670 + }, + { + "epoch": 0.1334, + "grad_norm": 0.038850437849760056, + "learning_rate": 2.5078264985508193e-06, + "loss": 0.0338, + "step": 172680 + }, + { + "epoch": 0.13345, + "grad_norm": 0.0350588895380497, + "learning_rate": 2.5060223095475426e-06, + "loss": 0.0329, + "step": 172690 + }, + { + "epoch": 0.1335, + "grad_norm": 0.0503779798746109, + "learning_rate": 2.5042187355211717e-06, + "loss": 0.0336, + "step": 172700 + }, + { + "epoch": 0.13355, + "grad_norm": 0.037332683801651, + "learning_rate": 2.5024157765210083e-06, + "loss": 0.0324, + "step": 172710 + }, + { + "epoch": 0.1336, + "grad_norm": 0.038964446634054184, + "learning_rate": 2.500613432596338e-06, + "loss": 0.0342, + "step": 172720 + }, + { + "epoch": 0.13365, + "grad_norm": 0.03178824484348297, + "learning_rate": 2.4988117037964527e-06, + "loss": 0.0325, + "step": 172730 + }, + { + "epoch": 0.1337, + "grad_norm": 0.037879884243011475, + "learning_rate": 2.497010590170598e-06, + "loss": 0.0334, + "step": 172740 + }, + { + "epoch": 0.13375, + "grad_norm": 0.04096490517258644, + "learning_rate": 2.4952100917680244e-06, + "loss": 0.0332, + "step": 172750 + }, + { + "epoch": 0.1338, + "grad_norm": 0.03576594591140747, + "learning_rate": 2.4934102086379445e-06, + "loss": 0.0329, + "step": 172760 + }, + { + "epoch": 0.13385, + "grad_norm": 0.0467950701713562, + "learning_rate": 2.491610940829575e-06, + "loss": 0.0351, + "step": 172770 + }, + { + "epoch": 0.1339, + "grad_norm": 0.05011410638689995, + "learning_rate": 2.4898122883921152e-06, + "loss": 0.0331, + "step": 172780 + }, + { + "epoch": 0.13395, + "grad_norm": 0.04678485170006752, + "learning_rate": 2.4880142513747205e-06, + "loss": 0.0328, + "step": 172790 + }, + { + "epoch": 0.134, + "grad_norm": 0.03232264891266823, + "learning_rate": 2.4862168298265652e-06, + "loss": 0.033, + "step": 172800 + }, + { + "epoch": 0.13405, + "grad_norm": 0.04205932840704918, + "learning_rate": 2.4844200237967795e-06, + "loss": 0.0345, + "step": 172810 + }, + { + "epoch": 0.1341, + "grad_norm": 0.03812059760093689, + "learning_rate": 2.482623833334494e-06, + "loss": 0.0316, + "step": 172820 + }, + { + "epoch": 0.13415, + "grad_norm": 0.04027124494314194, + "learning_rate": 2.4808282584888153e-06, + "loss": 0.0329, + "step": 172830 + }, + { + "epoch": 0.1342, + "grad_norm": 0.04119805246591568, + "learning_rate": 2.479033299308828e-06, + "loss": 0.0315, + "step": 172840 + }, + { + "epoch": 0.13425, + "grad_norm": 0.04910222068428993, + "learning_rate": 2.477238955843611e-06, + "loss": 0.0336, + "step": 172850 + }, + { + "epoch": 0.1343, + "grad_norm": 0.03819654881954193, + "learning_rate": 2.475445228142223e-06, + "loss": 0.0315, + "step": 172860 + }, + { + "epoch": 0.13435, + "grad_norm": 0.03676626831293106, + "learning_rate": 2.4736521162536996e-06, + "loss": 0.0326, + "step": 172870 + }, + { + "epoch": 0.1344, + "grad_norm": 0.0349026694893837, + "learning_rate": 2.4718596202270574e-06, + "loss": 0.0327, + "step": 172880 + }, + { + "epoch": 0.13445, + "grad_norm": 0.034219786524772644, + "learning_rate": 2.470067740111312e-06, + "loss": 0.0329, + "step": 172890 + }, + { + "epoch": 0.1345, + "grad_norm": 0.033093199133872986, + "learning_rate": 2.4682764759554553e-06, + "loss": 0.0319, + "step": 172900 + }, + { + "epoch": 0.13455, + "grad_norm": 0.036882225424051285, + "learning_rate": 2.466485827808454e-06, + "loss": 0.034, + "step": 172910 + }, + { + "epoch": 0.1346, + "grad_norm": 0.03655955195426941, + "learning_rate": 2.4646957957192656e-06, + "loss": 0.0326, + "step": 172920 + }, + { + "epoch": 0.13465, + "grad_norm": 0.03251729905605316, + "learning_rate": 2.4629063797368235e-06, + "loss": 0.0326, + "step": 172930 + }, + { + "epoch": 0.1347, + "grad_norm": 0.0346374586224556, + "learning_rate": 2.4611175799100577e-06, + "loss": 0.0321, + "step": 172940 + }, + { + "epoch": 0.13475, + "grad_norm": 0.030120186507701874, + "learning_rate": 2.459329396287871e-06, + "loss": 0.032, + "step": 172950 + }, + { + "epoch": 0.1348, + "grad_norm": 0.03634950518608093, + "learning_rate": 2.4575418289191437e-06, + "loss": 0.0325, + "step": 172960 + }, + { + "epoch": 0.13485, + "grad_norm": 0.034011442214250565, + "learning_rate": 2.455754877852762e-06, + "loss": 0.0342, + "step": 172970 + }, + { + "epoch": 0.1349, + "grad_norm": 0.03213994950056076, + "learning_rate": 2.4539685431375663e-06, + "loss": 0.0325, + "step": 172980 + }, + { + "epoch": 0.13495, + "grad_norm": 0.03639134019613266, + "learning_rate": 2.4521828248224075e-06, + "loss": 0.033, + "step": 172990 + }, + { + "epoch": 0.135, + "grad_norm": 0.036853231489658356, + "learning_rate": 2.450397722956094e-06, + "loss": 0.0339, + "step": 173000 + }, + { + "epoch": 0.13505, + "grad_norm": 0.0340038537979126, + "learning_rate": 2.4486132375874383e-06, + "loss": 0.0343, + "step": 173010 + }, + { + "epoch": 0.1351, + "grad_norm": 0.0397774763405323, + "learning_rate": 2.4468293687652187e-06, + "loss": 0.0336, + "step": 173020 + }, + { + "epoch": 0.13515, + "grad_norm": 0.03534870222210884, + "learning_rate": 2.445046116538216e-06, + "loss": 0.0341, + "step": 173030 + }, + { + "epoch": 0.1352, + "grad_norm": 0.03766913339495659, + "learning_rate": 2.4432634809551796e-06, + "loss": 0.0337, + "step": 173040 + }, + { + "epoch": 0.13525, + "grad_norm": 0.04208486154675484, + "learning_rate": 2.441481462064843e-06, + "loss": 0.0338, + "step": 173050 + }, + { + "epoch": 0.1353, + "grad_norm": 0.0469144769012928, + "learning_rate": 2.439700059915931e-06, + "loss": 0.0351, + "step": 173060 + }, + { + "epoch": 0.13535, + "grad_norm": 0.03179687634110451, + "learning_rate": 2.4379192745571434e-06, + "loss": 0.0341, + "step": 173070 + }, + { + "epoch": 0.1354, + "grad_norm": 0.04159516468644142, + "learning_rate": 2.4361391060371606e-06, + "loss": 0.0338, + "step": 173080 + }, + { + "epoch": 0.13545, + "grad_norm": 0.03519073873758316, + "learning_rate": 2.4343595544046666e-06, + "loss": 0.0322, + "step": 173090 + }, + { + "epoch": 0.1355, + "grad_norm": 0.033475492149591446, + "learning_rate": 2.432580619708297e-06, + "loss": 0.0328, + "step": 173100 + }, + { + "epoch": 0.13555, + "grad_norm": 0.031248176470398903, + "learning_rate": 2.4308023019967014e-06, + "loss": 0.0317, + "step": 173110 + }, + { + "epoch": 0.1356, + "grad_norm": 0.04228196665644646, + "learning_rate": 2.4290246013184913e-06, + "loss": 0.0328, + "step": 173120 + }, + { + "epoch": 0.13565, + "grad_norm": 0.037378840148448944, + "learning_rate": 2.4272475177222698e-06, + "loss": 0.0316, + "step": 173130 + }, + { + "epoch": 0.1357, + "grad_norm": 0.05218826234340668, + "learning_rate": 2.425471051256614e-06, + "loss": 0.0351, + "step": 173140 + }, + { + "epoch": 0.13575, + "grad_norm": 0.037467218935489655, + "learning_rate": 2.423695201970105e-06, + "loss": 0.0324, + "step": 173150 + }, + { + "epoch": 0.1358, + "grad_norm": 0.03451351821422577, + "learning_rate": 2.4219199699112876e-06, + "loss": 0.0325, + "step": 173160 + }, + { + "epoch": 0.13585, + "grad_norm": 0.03146947920322418, + "learning_rate": 2.4201453551286916e-06, + "loss": 0.0325, + "step": 173170 + }, + { + "epoch": 0.1359, + "grad_norm": 0.030889015644788742, + "learning_rate": 2.4183713576708427e-06, + "loss": 0.0335, + "step": 173180 + }, + { + "epoch": 0.13595, + "grad_norm": 0.032551318407058716, + "learning_rate": 2.4165979775862354e-06, + "loss": 0.0339, + "step": 173190 + }, + { + "epoch": 0.136, + "grad_norm": 0.04587157815694809, + "learning_rate": 2.4148252149233607e-06, + "loss": 0.0341, + "step": 173200 + }, + { + "epoch": 0.13605, + "grad_norm": 0.03541712835431099, + "learning_rate": 2.4130530697306724e-06, + "loss": 0.0331, + "step": 173210 + }, + { + "epoch": 0.1361, + "grad_norm": 0.03498711436986923, + "learning_rate": 2.4112815420566287e-06, + "loss": 0.0335, + "step": 173220 + }, + { + "epoch": 0.13615, + "grad_norm": 0.03785178065299988, + "learning_rate": 2.409510631949666e-06, + "loss": 0.0349, + "step": 173230 + }, + { + "epoch": 0.1362, + "grad_norm": 0.0386972650885582, + "learning_rate": 2.407740339458192e-06, + "loss": 0.0334, + "step": 173240 + }, + { + "epoch": 0.13625, + "grad_norm": 0.03768829628825188, + "learning_rate": 2.4059706646306133e-06, + "loss": 0.0339, + "step": 173250 + }, + { + "epoch": 0.1363, + "grad_norm": 0.034488171339035034, + "learning_rate": 2.4042016075153024e-06, + "loss": 0.0342, + "step": 173260 + }, + { + "epoch": 0.13635, + "grad_norm": 0.04211435094475746, + "learning_rate": 2.402433168160631e-06, + "loss": 0.0339, + "step": 173270 + }, + { + "epoch": 0.1364, + "grad_norm": 0.03679462522268295, + "learning_rate": 2.4006653466149582e-06, + "loss": 0.0335, + "step": 173280 + }, + { + "epoch": 0.13645, + "grad_norm": 0.042969267815351486, + "learning_rate": 2.398898142926592e-06, + "loss": 0.0335, + "step": 173290 + }, + { + "epoch": 0.1365, + "grad_norm": 0.03657511621713638, + "learning_rate": 2.3971315571438668e-06, + "loss": 0.0334, + "step": 173300 + }, + { + "epoch": 0.13655, + "grad_norm": 0.03540303185582161, + "learning_rate": 2.3953655893150683e-06, + "loss": 0.0324, + "step": 173310 + }, + { + "epoch": 0.1366, + "grad_norm": 0.03584011644124985, + "learning_rate": 2.393600239488486e-06, + "loss": 0.0345, + "step": 173320 + }, + { + "epoch": 0.13665, + "grad_norm": 0.03523749113082886, + "learning_rate": 2.3918355077123812e-06, + "loss": 0.0331, + "step": 173330 + }, + { + "epoch": 0.1367, + "grad_norm": 0.036041587591171265, + "learning_rate": 2.3900713940349956e-06, + "loss": 0.0326, + "step": 173340 + }, + { + "epoch": 0.13675, + "grad_norm": 0.036619171500205994, + "learning_rate": 2.3883078985045688e-06, + "loss": 0.0339, + "step": 173350 + }, + { + "epoch": 0.1368, + "grad_norm": 0.03686472028493881, + "learning_rate": 2.3865450211693093e-06, + "loss": 0.0316, + "step": 173360 + }, + { + "epoch": 0.13685, + "grad_norm": 0.0400625579059124, + "learning_rate": 2.3847827620774116e-06, + "loss": 0.0338, + "step": 173370 + }, + { + "epoch": 0.1369, + "grad_norm": 0.04308782517910004, + "learning_rate": 2.383021121277054e-06, + "loss": 0.0326, + "step": 173380 + }, + { + "epoch": 0.13695, + "grad_norm": 0.041462741792201996, + "learning_rate": 2.381260098816407e-06, + "loss": 0.0334, + "step": 173390 + }, + { + "epoch": 0.137, + "grad_norm": 0.04413997381925583, + "learning_rate": 2.379499694743606e-06, + "loss": 0.035, + "step": 173400 + }, + { + "epoch": 0.13705, + "grad_norm": 0.03597474470734596, + "learning_rate": 2.377739909106791e-06, + "loss": 0.0327, + "step": 173410 + }, + { + "epoch": 0.1371, + "grad_norm": 0.0335271991789341, + "learning_rate": 2.3759807419540675e-06, + "loss": 0.0327, + "step": 173420 + }, + { + "epoch": 0.13715, + "grad_norm": 0.03556426987051964, + "learning_rate": 2.374222193333525e-06, + "loss": 0.0324, + "step": 173430 + }, + { + "epoch": 0.1372, + "grad_norm": 0.03495519608259201, + "learning_rate": 2.372464263293253e-06, + "loss": 0.0331, + "step": 173440 + }, + { + "epoch": 0.13725, + "grad_norm": 0.03677154332399368, + "learning_rate": 2.3707069518813072e-06, + "loss": 0.034, + "step": 173450 + }, + { + "epoch": 0.1373, + "grad_norm": 0.03947559744119644, + "learning_rate": 2.3689502591457276e-06, + "loss": 0.0332, + "step": 173460 + }, + { + "epoch": 0.13735, + "grad_norm": 0.0355621762573719, + "learning_rate": 2.3671941851345524e-06, + "loss": 0.0338, + "step": 173470 + }, + { + "epoch": 0.1374, + "grad_norm": 0.03790060058236122, + "learning_rate": 2.3654387298957776e-06, + "loss": 0.0322, + "step": 173480 + }, + { + "epoch": 0.13745, + "grad_norm": 0.03127957880496979, + "learning_rate": 2.363683893477417e-06, + "loss": 0.0332, + "step": 173490 + }, + { + "epoch": 0.1375, + "grad_norm": 0.03315692022442818, + "learning_rate": 2.361929675927424e-06, + "loss": 0.0326, + "step": 173500 + }, + { + "epoch": 0.13755, + "grad_norm": 0.034898389130830765, + "learning_rate": 2.3601760772937716e-06, + "loss": 0.0335, + "step": 173510 + }, + { + "epoch": 0.1376, + "grad_norm": 0.03737428039312363, + "learning_rate": 2.358423097624396e-06, + "loss": 0.0339, + "step": 173520 + }, + { + "epoch": 0.13765, + "grad_norm": 0.03121230937540531, + "learning_rate": 2.356670736967234e-06, + "loss": 0.0325, + "step": 173530 + }, + { + "epoch": 0.1377, + "grad_norm": 0.03580779209733009, + "learning_rate": 2.354918995370184e-06, + "loss": 0.0317, + "step": 173540 + }, + { + "epoch": 0.13775, + "grad_norm": 0.035638175904750824, + "learning_rate": 2.35316787288114e-06, + "loss": 0.0328, + "step": 173550 + }, + { + "epoch": 0.1378, + "grad_norm": 0.032858967781066895, + "learning_rate": 2.3514173695479815e-06, + "loss": 0.0324, + "step": 173560 + }, + { + "epoch": 0.13785, + "grad_norm": 0.03643010929226875, + "learning_rate": 2.3496674854185637e-06, + "loss": 0.0324, + "step": 173570 + }, + { + "epoch": 0.1379, + "grad_norm": 0.036192405968904495, + "learning_rate": 2.3479182205407264e-06, + "loss": 0.0311, + "step": 173580 + }, + { + "epoch": 0.13795, + "grad_norm": 0.033632777631282806, + "learning_rate": 2.3461695749622896e-06, + "loss": 0.0329, + "step": 173590 + }, + { + "epoch": 0.138, + "grad_norm": 0.03389897570014, + "learning_rate": 2.3444215487310684e-06, + "loss": 0.0314, + "step": 173600 + }, + { + "epoch": 0.13805, + "grad_norm": 0.03711351007223129, + "learning_rate": 2.3426741418948545e-06, + "loss": 0.033, + "step": 173610 + }, + { + "epoch": 0.1381, + "grad_norm": 0.042397286742925644, + "learning_rate": 2.3409273545014183e-06, + "loss": 0.0336, + "step": 173620 + }, + { + "epoch": 0.13815, + "grad_norm": 0.037497516721487045, + "learning_rate": 2.339181186598513e-06, + "loss": 0.0325, + "step": 173630 + }, + { + "epoch": 0.1382, + "grad_norm": 0.0302271768450737, + "learning_rate": 2.3374356382338787e-06, + "loss": 0.0313, + "step": 173640 + }, + { + "epoch": 0.13825, + "grad_norm": 0.032058198004961014, + "learning_rate": 2.3356907094552434e-06, + "loss": 0.0313, + "step": 173650 + }, + { + "epoch": 0.1383, + "grad_norm": 0.033630333840847015, + "learning_rate": 2.3339464003103105e-06, + "loss": 0.0309, + "step": 173660 + }, + { + "epoch": 0.13835, + "grad_norm": 0.031055064871907234, + "learning_rate": 2.332202710846762e-06, + "loss": 0.0323, + "step": 173670 + }, + { + "epoch": 0.1384, + "grad_norm": 0.043439898639917374, + "learning_rate": 2.3304596411122814e-06, + "loss": 0.0322, + "step": 173680 + }, + { + "epoch": 0.13845, + "grad_norm": 0.03847711905837059, + "learning_rate": 2.3287171911545116e-06, + "loss": 0.032, + "step": 173690 + }, + { + "epoch": 0.1385, + "grad_norm": 0.03554611653089523, + "learning_rate": 2.3269753610211083e-06, + "loss": 0.0332, + "step": 173700 + }, + { + "epoch": 0.13855, + "grad_norm": 0.04088981822133064, + "learning_rate": 2.3252341507596697e-06, + "loss": 0.0334, + "step": 173710 + }, + { + "epoch": 0.1386, + "grad_norm": 0.041101280599832535, + "learning_rate": 2.323493560417811e-06, + "loss": 0.0337, + "step": 173720 + }, + { + "epoch": 0.13865, + "grad_norm": 0.040162887424230576, + "learning_rate": 2.321753590043124e-06, + "loss": 0.0325, + "step": 173730 + }, + { + "epoch": 0.1387, + "grad_norm": 0.033348675817251205, + "learning_rate": 2.3200142396831743e-06, + "loss": 0.0332, + "step": 173740 + }, + { + "epoch": 0.13875, + "grad_norm": 0.037173010408878326, + "learning_rate": 2.3182755093855146e-06, + "loss": 0.0331, + "step": 173750 + }, + { + "epoch": 0.1388, + "grad_norm": 0.03944842144846916, + "learning_rate": 2.3165373991976767e-06, + "loss": 0.034, + "step": 173760 + }, + { + "epoch": 0.13885, + "grad_norm": 0.047941941767930984, + "learning_rate": 2.3147999091671897e-06, + "loss": 0.0388, + "step": 173770 + }, + { + "epoch": 0.1389, + "grad_norm": 0.05058452859520912, + "learning_rate": 2.313063039341548e-06, + "loss": 0.0354, + "step": 173780 + }, + { + "epoch": 0.13895, + "grad_norm": 0.03964913263916969, + "learning_rate": 2.3113267897682393e-06, + "loss": 0.0329, + "step": 173790 + }, + { + "epoch": 0.139, + "grad_norm": 0.046650372445583344, + "learning_rate": 2.309591160494734e-06, + "loss": 0.0338, + "step": 173800 + }, + { + "epoch": 0.13905, + "grad_norm": 0.032496377825737, + "learning_rate": 2.307856151568477e-06, + "loss": 0.0325, + "step": 173810 + }, + { + "epoch": 0.1391, + "grad_norm": 0.0319688655436039, + "learning_rate": 2.3061217630369142e-06, + "loss": 0.0325, + "step": 173820 + }, + { + "epoch": 0.13915, + "grad_norm": 0.03517069295048714, + "learning_rate": 2.3043879949474574e-06, + "loss": 0.0324, + "step": 173830 + }, + { + "epoch": 0.1392, + "grad_norm": 0.03652153164148331, + "learning_rate": 2.302654847347499e-06, + "loss": 0.0318, + "step": 173840 + }, + { + "epoch": 0.13925, + "grad_norm": 0.03405037149786949, + "learning_rate": 2.300922320284438e-06, + "loss": 0.0329, + "step": 173850 + }, + { + "epoch": 0.1393, + "grad_norm": 0.041154876351356506, + "learning_rate": 2.2991904138056323e-06, + "loss": 0.0325, + "step": 173860 + }, + { + "epoch": 0.13935, + "grad_norm": 0.029424719512462616, + "learning_rate": 2.297459127958432e-06, + "loss": 0.0334, + "step": 173870 + }, + { + "epoch": 0.1394, + "grad_norm": 0.03557342290878296, + "learning_rate": 2.2957284627901644e-06, + "loss": 0.0334, + "step": 173880 + }, + { + "epoch": 0.13945, + "grad_norm": 0.033025480806827545, + "learning_rate": 2.293998418348156e-06, + "loss": 0.0333, + "step": 173890 + }, + { + "epoch": 0.1395, + "grad_norm": 0.03407977148890495, + "learning_rate": 2.292268994679697e-06, + "loss": 0.0319, + "step": 173900 + }, + { + "epoch": 0.13955, + "grad_norm": 0.03547189012169838, + "learning_rate": 2.2905401918320745e-06, + "loss": 0.0316, + "step": 173910 + }, + { + "epoch": 0.1396, + "grad_norm": 0.03738795220851898, + "learning_rate": 2.2888120098525563e-06, + "loss": 0.0326, + "step": 173920 + }, + { + "epoch": 0.13965, + "grad_norm": 0.03844847157597542, + "learning_rate": 2.2870844487883763e-06, + "loss": 0.0337, + "step": 173930 + }, + { + "epoch": 0.1397, + "grad_norm": 0.0328495129942894, + "learning_rate": 2.2853575086867834e-06, + "loss": 0.0328, + "step": 173940 + }, + { + "epoch": 0.13975, + "grad_norm": 0.03362671658396721, + "learning_rate": 2.2836311895949813e-06, + "loss": 0.0344, + "step": 173950 + }, + { + "epoch": 0.1398, + "grad_norm": 0.034235890954732895, + "learning_rate": 2.2819054915601656e-06, + "loss": 0.0341, + "step": 173960 + }, + { + "epoch": 0.13985, + "grad_norm": 0.04000657796859741, + "learning_rate": 2.280180414629518e-06, + "loss": 0.0345, + "step": 173970 + }, + { + "epoch": 0.1399, + "grad_norm": 0.03184925392270088, + "learning_rate": 2.278455958850201e-06, + "loss": 0.0334, + "step": 173980 + }, + { + "epoch": 0.13995, + "grad_norm": 0.036327533423900604, + "learning_rate": 2.2767321242693707e-06, + "loss": 0.0339, + "step": 173990 + }, + { + "epoch": 0.14, + "grad_norm": 0.03221013396978378, + "learning_rate": 2.27500891093414e-06, + "loss": 0.0334, + "step": 174000 + }, + { + "epoch": 0.14005, + "grad_norm": 0.038224413990974426, + "learning_rate": 2.2732863188916293e-06, + "loss": 0.0345, + "step": 174010 + }, + { + "epoch": 0.1401, + "grad_norm": 0.03446537256240845, + "learning_rate": 2.2715643481889314e-06, + "loss": 0.0339, + "step": 174020 + }, + { + "epoch": 0.14015, + "grad_norm": 0.03583173453807831, + "learning_rate": 2.2698429988731285e-06, + "loss": 0.0344, + "step": 174030 + }, + { + "epoch": 0.1402, + "grad_norm": 0.039286285638809204, + "learning_rate": 2.2681222709912797e-06, + "loss": 0.0339, + "step": 174040 + }, + { + "epoch": 0.14025, + "grad_norm": 0.038437262177467346, + "learning_rate": 2.2664021645904224e-06, + "loss": 0.0353, + "step": 174050 + }, + { + "epoch": 0.1403, + "grad_norm": 0.03841080889105797, + "learning_rate": 2.2646826797175945e-06, + "loss": 0.0344, + "step": 174060 + }, + { + "epoch": 0.14035, + "grad_norm": 0.043520137667655945, + "learning_rate": 2.2629638164198025e-06, + "loss": 0.0336, + "step": 174070 + }, + { + "epoch": 0.1404, + "grad_norm": 0.041831016540527344, + "learning_rate": 2.2612455747440363e-06, + "loss": 0.034, + "step": 174080 + }, + { + "epoch": 0.14045, + "grad_norm": 0.03334524482488632, + "learning_rate": 2.2595279547372673e-06, + "loss": 0.035, + "step": 174090 + }, + { + "epoch": 0.1405, + "grad_norm": 0.03944189473986626, + "learning_rate": 2.2578109564464627e-06, + "loss": 0.0371, + "step": 174100 + }, + { + "epoch": 0.14055, + "grad_norm": 0.03718682378530502, + "learning_rate": 2.2560945799185664e-06, + "loss": 0.0337, + "step": 174110 + }, + { + "epoch": 0.1406, + "grad_norm": 0.03292892500758171, + "learning_rate": 2.254378825200501e-06, + "loss": 0.0332, + "step": 174120 + }, + { + "epoch": 0.14065, + "grad_norm": 0.03702680766582489, + "learning_rate": 2.252663692339174e-06, + "loss": 0.033, + "step": 174130 + }, + { + "epoch": 0.1407, + "grad_norm": 0.03774098679423332, + "learning_rate": 2.2509491813814676e-06, + "loss": 0.0322, + "step": 174140 + }, + { + "epoch": 0.14075, + "grad_norm": 0.038331057876348495, + "learning_rate": 2.2492352923742714e-06, + "loss": 0.035, + "step": 174150 + }, + { + "epoch": 0.1408, + "grad_norm": 0.036840472370386124, + "learning_rate": 2.2475220253644374e-06, + "loss": 0.0352, + "step": 174160 + }, + { + "epoch": 0.14085, + "grad_norm": 0.04416101798415184, + "learning_rate": 2.2458093803987944e-06, + "loss": 0.0315, + "step": 174170 + }, + { + "epoch": 0.1409, + "grad_norm": 0.03244810178875923, + "learning_rate": 2.2440973575241832e-06, + "loss": 0.033, + "step": 174180 + }, + { + "epoch": 0.14095, + "grad_norm": 0.04091142490506172, + "learning_rate": 2.2423859567873936e-06, + "loss": 0.0331, + "step": 174190 + }, + { + "epoch": 0.141, + "grad_norm": 0.05770982801914215, + "learning_rate": 2.2406751782352324e-06, + "loss": 0.0325, + "step": 174200 + }, + { + "epoch": 0.14105, + "grad_norm": 0.04321189224720001, + "learning_rate": 2.238965021914452e-06, + "loss": 0.0341, + "step": 174210 + }, + { + "epoch": 0.1411, + "grad_norm": 0.03503815457224846, + "learning_rate": 2.23725548787182e-06, + "loss": 0.0326, + "step": 174220 + }, + { + "epoch": 0.14115, + "grad_norm": 0.041832130402326584, + "learning_rate": 2.235546576154071e-06, + "loss": 0.0323, + "step": 174230 + }, + { + "epoch": 0.1412, + "grad_norm": 0.06155143678188324, + "learning_rate": 2.2338382868079295e-06, + "loss": 0.0326, + "step": 174240 + }, + { + "epoch": 0.14125, + "grad_norm": 0.04963129758834839, + "learning_rate": 2.232130619880096e-06, + "loss": 0.034, + "step": 174250 + }, + { + "epoch": 0.1413, + "grad_norm": 0.03338255733251572, + "learning_rate": 2.230423575417254e-06, + "loss": 0.0317, + "step": 174260 + }, + { + "epoch": 0.14135, + "grad_norm": 0.03729334473609924, + "learning_rate": 2.228717153466081e-06, + "loss": 0.0316, + "step": 174270 + }, + { + "epoch": 0.1414, + "grad_norm": 0.03284437581896782, + "learning_rate": 2.227011354073227e-06, + "loss": 0.0322, + "step": 174280 + }, + { + "epoch": 0.14145, + "grad_norm": 0.04066002741456032, + "learning_rate": 2.2253061772853217e-06, + "loss": 0.0326, + "step": 174290 + }, + { + "epoch": 0.1415, + "grad_norm": 0.046504199504852295, + "learning_rate": 2.2236016231489938e-06, + "loss": 0.033, + "step": 174300 + }, + { + "epoch": 0.14155, + "grad_norm": 0.03390634059906006, + "learning_rate": 2.2218976917108365e-06, + "loss": 0.0324, + "step": 174310 + }, + { + "epoch": 0.1416, + "grad_norm": 0.042621005326509476, + "learning_rate": 2.2201943830174434e-06, + "loss": 0.0344, + "step": 174320 + }, + { + "epoch": 0.14165, + "grad_norm": 0.03581277281045914, + "learning_rate": 2.2184916971153797e-06, + "loss": 0.0332, + "step": 174330 + }, + { + "epoch": 0.1417, + "grad_norm": 0.0325859859585762, + "learning_rate": 2.2167896340511917e-06, + "loss": 0.0325, + "step": 174340 + }, + { + "epoch": 0.14175, + "grad_norm": 0.03300931304693222, + "learning_rate": 2.215088193871412e-06, + "loss": 0.0345, + "step": 174350 + }, + { + "epoch": 0.1418, + "grad_norm": 0.036072760820388794, + "learning_rate": 2.2133873766225665e-06, + "loss": 0.0338, + "step": 174360 + }, + { + "epoch": 0.14185, + "grad_norm": 0.03882085159420967, + "learning_rate": 2.211687182351149e-06, + "loss": 0.0322, + "step": 174370 + }, + { + "epoch": 0.1419, + "grad_norm": 0.03754011169075966, + "learning_rate": 2.2099876111036392e-06, + "loss": 0.0351, + "step": 174380 + }, + { + "epoch": 0.14195, + "grad_norm": 0.04201170802116394, + "learning_rate": 2.2082886629265107e-06, + "loss": 0.0341, + "step": 174390 + }, + { + "epoch": 0.142, + "grad_norm": 0.03877422213554382, + "learning_rate": 2.206590337866202e-06, + "loss": 0.0335, + "step": 174400 + }, + { + "epoch": 0.14205, + "grad_norm": 0.03186605125665665, + "learning_rate": 2.2048926359691606e-06, + "loss": 0.0337, + "step": 174410 + }, + { + "epoch": 0.1421, + "grad_norm": 0.03477894142270088, + "learning_rate": 2.2031955572817814e-06, + "loss": 0.0328, + "step": 174420 + }, + { + "epoch": 0.14215, + "grad_norm": 0.040825922042131424, + "learning_rate": 2.201499101850471e-06, + "loss": 0.0334, + "step": 174430 + }, + { + "epoch": 0.1422, + "grad_norm": 0.03556324541568756, + "learning_rate": 2.199803269721615e-06, + "loss": 0.0345, + "step": 174440 + }, + { + "epoch": 0.14225, + "grad_norm": 0.031087305396795273, + "learning_rate": 2.1981080609415726e-06, + "loss": 0.0325, + "step": 174450 + }, + { + "epoch": 0.1423, + "grad_norm": 0.04095204919576645, + "learning_rate": 2.1964134755566884e-06, + "loss": 0.0338, + "step": 174460 + }, + { + "epoch": 0.14235, + "grad_norm": 0.03513423725962639, + "learning_rate": 2.1947195136132886e-06, + "loss": 0.0333, + "step": 174470 + }, + { + "epoch": 0.1424, + "grad_norm": 0.03837606683373451, + "learning_rate": 2.1930261751576894e-06, + "loss": 0.0329, + "step": 174480 + }, + { + "epoch": 0.14245, + "grad_norm": 0.03757372498512268, + "learning_rate": 2.1913334602361946e-06, + "loss": 0.0324, + "step": 174490 + }, + { + "epoch": 0.1425, + "grad_norm": 0.03373987227678299, + "learning_rate": 2.189641368895068e-06, + "loss": 0.0325, + "step": 174500 + }, + { + "epoch": 0.14255, + "grad_norm": 0.042526379227638245, + "learning_rate": 2.1879499011805806e-06, + "loss": 0.0331, + "step": 174510 + }, + { + "epoch": 0.1426, + "grad_norm": 0.032664705067873, + "learning_rate": 2.1862590571389674e-06, + "loss": 0.0328, + "step": 174520 + }, + { + "epoch": 0.14265, + "grad_norm": 0.03375755250453949, + "learning_rate": 2.184568836816464e-06, + "loss": 0.032, + "step": 174530 + }, + { + "epoch": 0.1427, + "grad_norm": 0.03368820250034332, + "learning_rate": 2.1828792402592806e-06, + "loss": 0.0317, + "step": 174540 + }, + { + "epoch": 0.14275, + "grad_norm": 0.029644666239619255, + "learning_rate": 2.1811902675135996e-06, + "loss": 0.0336, + "step": 174550 + }, + { + "epoch": 0.1428, + "grad_norm": 0.03478449210524559, + "learning_rate": 2.1795019186256092e-06, + "loss": 0.0319, + "step": 174560 + }, + { + "epoch": 0.14285, + "grad_norm": 0.032810889184474945, + "learning_rate": 2.1778141936414643e-06, + "loss": 0.0326, + "step": 174570 + }, + { + "epoch": 0.1429, + "grad_norm": 0.03178194910287857, + "learning_rate": 2.1761270926073025e-06, + "loss": 0.0319, + "step": 174580 + }, + { + "epoch": 0.14295, + "grad_norm": 0.042895201593637466, + "learning_rate": 2.174440615569251e-06, + "loss": 0.0346, + "step": 174590 + }, + { + "epoch": 0.143, + "grad_norm": 0.04192337766289711, + "learning_rate": 2.1727547625734203e-06, + "loss": 0.0329, + "step": 174600 + }, + { + "epoch": 0.14305, + "grad_norm": 0.0405057892203331, + "learning_rate": 2.1710695336658926e-06, + "loss": 0.0334, + "step": 174610 + }, + { + "epoch": 0.1431, + "grad_norm": 0.04190431907773018, + "learning_rate": 2.1693849288927536e-06, + "loss": 0.0326, + "step": 174620 + }, + { + "epoch": 0.14315, + "grad_norm": 0.03621964529156685, + "learning_rate": 2.1677009483000526e-06, + "loss": 0.0316, + "step": 174630 + }, + { + "epoch": 0.1432, + "grad_norm": 0.03936958685517311, + "learning_rate": 2.1660175919338276e-06, + "loss": 0.034, + "step": 174640 + }, + { + "epoch": 0.14325, + "grad_norm": 0.042062707245349884, + "learning_rate": 2.164334859840106e-06, + "loss": 0.0327, + "step": 174650 + }, + { + "epoch": 0.1433, + "grad_norm": 0.03852769359946251, + "learning_rate": 2.1626527520648922e-06, + "loss": 0.0329, + "step": 174660 + }, + { + "epoch": 0.14335, + "grad_norm": 0.03222566843032837, + "learning_rate": 2.160971268654166e-06, + "loss": 0.0337, + "step": 174670 + }, + { + "epoch": 0.1434, + "grad_norm": 0.03322441130876541, + "learning_rate": 2.1592904096539108e-06, + "loss": 0.0322, + "step": 174680 + }, + { + "epoch": 0.14345, + "grad_norm": 0.03481338173151016, + "learning_rate": 2.15761017511007e-06, + "loss": 0.034, + "step": 174690 + }, + { + "epoch": 0.1435, + "grad_norm": 0.034989435225725174, + "learning_rate": 2.1559305650685925e-06, + "loss": 0.0345, + "step": 174700 + }, + { + "epoch": 0.14355, + "grad_norm": 0.040841348469257355, + "learning_rate": 2.1542515795753866e-06, + "loss": 0.0333, + "step": 174710 + }, + { + "epoch": 0.1436, + "grad_norm": 0.035034745931625366, + "learning_rate": 2.1525732186763596e-06, + "loss": 0.0336, + "step": 174720 + }, + { + "epoch": 0.14365, + "grad_norm": 0.03477044776082039, + "learning_rate": 2.150895482417395e-06, + "loss": 0.0333, + "step": 174730 + }, + { + "epoch": 0.1437, + "grad_norm": 0.038191020488739014, + "learning_rate": 2.149218370844369e-06, + "loss": 0.0332, + "step": 174740 + }, + { + "epoch": 0.14375, + "grad_norm": 0.0353204570710659, + "learning_rate": 2.147541884003129e-06, + "loss": 0.0333, + "step": 174750 + }, + { + "epoch": 0.1438, + "grad_norm": 0.03748689219355583, + "learning_rate": 2.1458660219395022e-06, + "loss": 0.0329, + "step": 174760 + }, + { + "epoch": 0.14385, + "grad_norm": 0.036651611328125, + "learning_rate": 2.144190784699318e-06, + "loss": 0.0333, + "step": 174770 + }, + { + "epoch": 0.1439, + "grad_norm": 0.036774635314941406, + "learning_rate": 2.142516172328371e-06, + "loss": 0.0324, + "step": 174780 + }, + { + "epoch": 0.14395, + "grad_norm": 0.05032498016953468, + "learning_rate": 2.1408421848724435e-06, + "loss": 0.0336, + "step": 174790 + }, + { + "epoch": 0.144, + "grad_norm": 0.05237219110131264, + "learning_rate": 2.1391688223772994e-06, + "loss": 0.0327, + "step": 174800 + }, + { + "epoch": 0.14405, + "grad_norm": 0.03800968453288078, + "learning_rate": 2.1374960848886905e-06, + "loss": 0.0329, + "step": 174810 + }, + { + "epoch": 0.1441, + "grad_norm": 0.05860413610935211, + "learning_rate": 2.1358239724523555e-06, + "loss": 0.0322, + "step": 174820 + }, + { + "epoch": 0.14415, + "grad_norm": 0.04500029981136322, + "learning_rate": 2.1341524851140023e-06, + "loss": 0.0308, + "step": 174830 + }, + { + "epoch": 0.1442, + "grad_norm": 0.02946821227669716, + "learning_rate": 2.13248162291933e-06, + "loss": 0.0334, + "step": 174840 + }, + { + "epoch": 0.14425, + "grad_norm": 0.04804980754852295, + "learning_rate": 2.130811385914017e-06, + "loss": 0.0333, + "step": 174850 + }, + { + "epoch": 0.1443, + "grad_norm": 0.04131940379738808, + "learning_rate": 2.1291417741437307e-06, + "loss": 0.0327, + "step": 174860 + }, + { + "epoch": 0.14435, + "grad_norm": 0.04331108182668686, + "learning_rate": 2.1274727876541166e-06, + "loss": 0.0314, + "step": 174870 + }, + { + "epoch": 0.1444, + "grad_norm": 0.035809844732284546, + "learning_rate": 2.1258044264908016e-06, + "loss": 0.0309, + "step": 174880 + }, + { + "epoch": 0.14445, + "grad_norm": 0.03436094522476196, + "learning_rate": 2.124136690699402e-06, + "loss": 0.033, + "step": 174890 + }, + { + "epoch": 0.1445, + "grad_norm": 0.029962128028273582, + "learning_rate": 2.122469580325506e-06, + "loss": 0.0323, + "step": 174900 + }, + { + "epoch": 0.14455, + "grad_norm": 0.038812559098005295, + "learning_rate": 2.1208030954147086e-06, + "loss": 0.032, + "step": 174910 + }, + { + "epoch": 0.1446, + "grad_norm": 0.037531036883592606, + "learning_rate": 2.1191372360125498e-06, + "loss": 0.0348, + "step": 174920 + }, + { + "epoch": 0.14465, + "grad_norm": 0.037080105394124985, + "learning_rate": 2.1174720021645805e-06, + "loss": 0.0316, + "step": 174930 + }, + { + "epoch": 0.1447, + "grad_norm": 0.03403100371360779, + "learning_rate": 2.1158073939163386e-06, + "loss": 0.0333, + "step": 174940 + }, + { + "epoch": 0.14475, + "grad_norm": 0.04135388135910034, + "learning_rate": 2.114143411313321e-06, + "loss": 0.033, + "step": 174950 + }, + { + "epoch": 0.1448, + "grad_norm": 0.03619159385561943, + "learning_rate": 2.112480054401028e-06, + "loss": 0.0323, + "step": 174960 + }, + { + "epoch": 0.14485, + "grad_norm": 0.03448307514190674, + "learning_rate": 2.110817323224926e-06, + "loss": 0.0344, + "step": 174970 + }, + { + "epoch": 0.1449, + "grad_norm": 0.030624793842434883, + "learning_rate": 2.109155217830483e-06, + "loss": 0.0323, + "step": 174980 + }, + { + "epoch": 0.14495, + "grad_norm": 0.03290669247508049, + "learning_rate": 2.107493738263139e-06, + "loss": 0.0344, + "step": 174990 + }, + { + "epoch": 0.145, + "grad_norm": 0.033714376389980316, + "learning_rate": 2.1058328845683096e-06, + "loss": 0.0318, + "step": 175000 + }, + { + "epoch": 0.14505, + "grad_norm": 0.04691474139690399, + "learning_rate": 2.1041726567914143e-06, + "loss": 0.0331, + "step": 175010 + }, + { + "epoch": 0.1451, + "grad_norm": 0.03821582719683647, + "learning_rate": 2.102513054977831e-06, + "loss": 0.0324, + "step": 175020 + }, + { + "epoch": 0.14515, + "grad_norm": 0.034602485597133636, + "learning_rate": 2.1008540791729426e-06, + "loss": 0.0357, + "step": 175030 + }, + { + "epoch": 0.1452, + "grad_norm": 0.031218018382787704, + "learning_rate": 2.0991957294221013e-06, + "loss": 0.0329, + "step": 175040 + }, + { + "epoch": 0.14525, + "grad_norm": 0.03488103300333023, + "learning_rate": 2.097538005770641e-06, + "loss": 0.0325, + "step": 175050 + }, + { + "epoch": 0.1453, + "grad_norm": 0.03420497477054596, + "learning_rate": 2.0958809082638947e-06, + "loss": 0.0334, + "step": 175060 + }, + { + "epoch": 0.14535, + "grad_norm": 0.03363852947950363, + "learning_rate": 2.0942244369471564e-06, + "loss": 0.0324, + "step": 175070 + }, + { + "epoch": 0.1454, + "grad_norm": 0.036308009177446365, + "learning_rate": 2.092568591865718e-06, + "loss": 0.0331, + "step": 175080 + }, + { + "epoch": 0.14545, + "grad_norm": 0.03419158235192299, + "learning_rate": 2.0909133730648428e-06, + "loss": 0.0327, + "step": 175090 + }, + { + "epoch": 0.1455, + "grad_norm": 0.03830074891448021, + "learning_rate": 2.089258780589795e-06, + "loss": 0.0335, + "step": 175100 + }, + { + "epoch": 0.14555, + "grad_norm": 0.038069311529397964, + "learning_rate": 2.0876048144857997e-06, + "loss": 0.0333, + "step": 175110 + }, + { + "epoch": 0.1456, + "grad_norm": 0.03244706615805626, + "learning_rate": 2.0859514747980867e-06, + "loss": 0.0356, + "step": 175120 + }, + { + "epoch": 0.14565, + "grad_norm": 0.038728900253772736, + "learning_rate": 2.084298761571851e-06, + "loss": 0.0335, + "step": 175130 + }, + { + "epoch": 0.1457, + "grad_norm": 0.03410856053233147, + "learning_rate": 2.0826466748522734e-06, + "loss": 0.0336, + "step": 175140 + }, + { + "epoch": 0.14575, + "grad_norm": 0.030474839732050896, + "learning_rate": 2.080995214684531e-06, + "loss": 0.0345, + "step": 175150 + }, + { + "epoch": 0.1458, + "grad_norm": 0.035134457051754, + "learning_rate": 2.079344381113768e-06, + "loss": 0.0341, + "step": 175160 + }, + { + "epoch": 0.14585, + "grad_norm": 0.03453769162297249, + "learning_rate": 2.0776941741851215e-06, + "loss": 0.0326, + "step": 175170 + }, + { + "epoch": 0.1459, + "grad_norm": 0.036479029804468155, + "learning_rate": 2.076044593943699e-06, + "loss": 0.0333, + "step": 175180 + }, + { + "epoch": 0.14595, + "grad_norm": 0.03493402525782585, + "learning_rate": 2.074395640434604e-06, + "loss": 0.0337, + "step": 175190 + }, + { + "epoch": 0.146, + "grad_norm": 0.03477869927883148, + "learning_rate": 2.0727473137029303e-06, + "loss": 0.033, + "step": 175200 + }, + { + "epoch": 0.14605, + "grad_norm": 0.03139925375580788, + "learning_rate": 2.07109961379372e-06, + "loss": 0.0331, + "step": 175210 + }, + { + "epoch": 0.1461, + "grad_norm": 0.031562916934490204, + "learning_rate": 2.06945254075204e-06, + "loss": 0.032, + "step": 175220 + }, + { + "epoch": 0.14615, + "grad_norm": 0.032667577266693115, + "learning_rate": 2.0678060946229066e-06, + "loss": 0.0324, + "step": 175230 + }, + { + "epoch": 0.1462, + "grad_norm": 0.03880128264427185, + "learning_rate": 2.0661602754513426e-06, + "loss": 0.0341, + "step": 175240 + }, + { + "epoch": 0.14625, + "grad_norm": 0.03329472988843918, + "learning_rate": 2.0645150832823424e-06, + "loss": 0.0316, + "step": 175250 + }, + { + "epoch": 0.1463, + "grad_norm": 0.030931759625673294, + "learning_rate": 2.062870518160878e-06, + "loss": 0.0326, + "step": 175260 + }, + { + "epoch": 0.14635, + "grad_norm": 0.03219376504421234, + "learning_rate": 2.0612265801319225e-06, + "loss": 0.0333, + "step": 175270 + }, + { + "epoch": 0.1464, + "grad_norm": 0.032046496868133545, + "learning_rate": 2.059583269240414e-06, + "loss": 0.0315, + "step": 175280 + }, + { + "epoch": 0.14645, + "grad_norm": 0.03174709901213646, + "learning_rate": 2.0579405855312815e-06, + "loss": 0.0324, + "step": 175290 + }, + { + "epoch": 0.1465, + "grad_norm": 0.02962440438568592, + "learning_rate": 2.0562985290494275e-06, + "loss": 0.0312, + "step": 175300 + }, + { + "epoch": 0.14655, + "grad_norm": 0.03580990433692932, + "learning_rate": 2.0546570998397547e-06, + "loss": 0.0331, + "step": 175310 + }, + { + "epoch": 0.1466, + "grad_norm": 0.03146926686167717, + "learning_rate": 2.0530162979471385e-06, + "loss": 0.0316, + "step": 175320 + }, + { + "epoch": 0.14665, + "grad_norm": 0.03137435391545296, + "learning_rate": 2.0513761234164377e-06, + "loss": 0.0343, + "step": 175330 + }, + { + "epoch": 0.1467, + "grad_norm": 0.03477836027741432, + "learning_rate": 2.0497365762924938e-06, + "loss": 0.0317, + "step": 175340 + }, + { + "epoch": 0.14675, + "grad_norm": 0.03239109739661217, + "learning_rate": 2.0480976566201237e-06, + "loss": 0.0327, + "step": 175350 + }, + { + "epoch": 0.1468, + "grad_norm": 0.03294882923364639, + "learning_rate": 2.046459364444145e-06, + "loss": 0.0317, + "step": 175360 + }, + { + "epoch": 0.14685, + "grad_norm": 0.032854992896318436, + "learning_rate": 2.0448216998093433e-06, + "loss": 0.0321, + "step": 175370 + }, + { + "epoch": 0.1469, + "grad_norm": 0.03243003040552139, + "learning_rate": 2.043184662760489e-06, + "loss": 0.0306, + "step": 175380 + }, + { + "epoch": 0.14695, + "grad_norm": 0.029420357197523117, + "learning_rate": 2.0415482533423486e-06, + "loss": 0.032, + "step": 175390 + }, + { + "epoch": 0.147, + "grad_norm": 0.039761677384376526, + "learning_rate": 2.039912471599645e-06, + "loss": 0.0344, + "step": 175400 + }, + { + "epoch": 0.14705, + "grad_norm": 0.03798593208193779, + "learning_rate": 2.0382773175771198e-06, + "loss": 0.0321, + "step": 175410 + }, + { + "epoch": 0.1471, + "grad_norm": 0.03551090881228447, + "learning_rate": 2.036642791319457e-06, + "loss": 0.0324, + "step": 175420 + }, + { + "epoch": 0.14715, + "grad_norm": 0.03543001040816307, + "learning_rate": 2.035008892871354e-06, + "loss": 0.0323, + "step": 175430 + }, + { + "epoch": 0.1472, + "grad_norm": 0.040055036544799805, + "learning_rate": 2.0333756222774865e-06, + "loss": 0.0329, + "step": 175440 + }, + { + "epoch": 0.14725, + "grad_norm": 0.029678745195269585, + "learning_rate": 2.0317429795824987e-06, + "loss": 0.032, + "step": 175450 + }, + { + "epoch": 0.1473, + "grad_norm": 0.036483991891145706, + "learning_rate": 2.0301109648310306e-06, + "loss": 0.0347, + "step": 175460 + }, + { + "epoch": 0.14735, + "grad_norm": 0.03847810998558998, + "learning_rate": 2.0284795780676936e-06, + "loss": 0.0335, + "step": 175470 + }, + { + "epoch": 0.1474, + "grad_norm": 0.03432368487119675, + "learning_rate": 2.0268488193370988e-06, + "loss": 0.0328, + "step": 175480 + }, + { + "epoch": 0.14745, + "grad_norm": 0.03539138287305832, + "learning_rate": 2.0252186886838276e-06, + "loss": 0.0336, + "step": 175490 + }, + { + "epoch": 0.1475, + "grad_norm": 0.03191179782152176, + "learning_rate": 2.0235891861524413e-06, + "loss": 0.0324, + "step": 175500 + }, + { + "epoch": 0.14755, + "grad_norm": 0.036091215908527374, + "learning_rate": 2.0219603117874992e-06, + "loss": 0.034, + "step": 175510 + }, + { + "epoch": 0.1476, + "grad_norm": 0.030849860981106758, + "learning_rate": 2.020332065633526e-06, + "loss": 0.0342, + "step": 175520 + }, + { + "epoch": 0.14765, + "grad_norm": 0.03476780652999878, + "learning_rate": 2.0187044477350454e-06, + "loss": 0.0321, + "step": 175530 + }, + { + "epoch": 0.1477, + "grad_norm": 0.03580397740006447, + "learning_rate": 2.0170774581365513e-06, + "loss": 0.0316, + "step": 175540 + }, + { + "epoch": 0.14775, + "grad_norm": 0.03447788953781128, + "learning_rate": 2.015451096882526e-06, + "loss": 0.0329, + "step": 175550 + }, + { + "epoch": 0.1478, + "grad_norm": 0.031128806993365288, + "learning_rate": 2.01382536401743e-06, + "loss": 0.035, + "step": 175560 + }, + { + "epoch": 0.14785, + "grad_norm": 0.039009448140859604, + "learning_rate": 2.012200259585714e-06, + "loss": 0.0334, + "step": 175570 + }, + { + "epoch": 0.1479, + "grad_norm": 0.0299491286277771, + "learning_rate": 2.010575783631807e-06, + "loss": 0.0331, + "step": 175580 + }, + { + "epoch": 0.14795, + "grad_norm": 0.0313674621284008, + "learning_rate": 2.008951936200118e-06, + "loss": 0.0345, + "step": 175590 + }, + { + "epoch": 0.148, + "grad_norm": 0.03528376296162605, + "learning_rate": 2.0073287173350525e-06, + "loss": 0.0323, + "step": 175600 + }, + { + "epoch": 0.14805, + "grad_norm": 0.029887860640883446, + "learning_rate": 2.005706127080975e-06, + "loss": 0.0327, + "step": 175610 + }, + { + "epoch": 0.1481, + "grad_norm": 0.0347294919192791, + "learning_rate": 2.0040841654822617e-06, + "loss": 0.0342, + "step": 175620 + }, + { + "epoch": 0.14815, + "grad_norm": 0.034214939922094345, + "learning_rate": 2.002462832583241e-06, + "loss": 0.0327, + "step": 175630 + }, + { + "epoch": 0.1482, + "grad_norm": 0.03237602487206459, + "learning_rate": 2.0008421284282432e-06, + "loss": 0.0328, + "step": 175640 + }, + { + "epoch": 0.14825, + "grad_norm": 0.03369561582803726, + "learning_rate": 1.999222053061589e-06, + "loss": 0.0328, + "step": 175650 + }, + { + "epoch": 0.1483, + "grad_norm": 0.031222902238368988, + "learning_rate": 1.9976026065275627e-06, + "loss": 0.0338, + "step": 175660 + }, + { + "epoch": 0.14835, + "grad_norm": 0.037712518125772476, + "learning_rate": 1.9959837888704395e-06, + "loss": 0.0324, + "step": 175670 + }, + { + "epoch": 0.1484, + "grad_norm": 0.03585483506321907, + "learning_rate": 1.9943656001344708e-06, + "loss": 0.0326, + "step": 175680 + }, + { + "epoch": 0.14845, + "grad_norm": 0.03238583356142044, + "learning_rate": 1.9927480403639063e-06, + "loss": 0.0325, + "step": 175690 + }, + { + "epoch": 0.1485, + "grad_norm": 0.03418656438589096, + "learning_rate": 1.9911311096029726e-06, + "loss": 0.0356, + "step": 175700 + }, + { + "epoch": 0.14855, + "grad_norm": 0.03242915868759155, + "learning_rate": 1.9895148078958646e-06, + "loss": 0.0323, + "step": 175710 + }, + { + "epoch": 0.1486, + "grad_norm": 0.032259389758110046, + "learning_rate": 1.9878991352867804e-06, + "loss": 0.0327, + "step": 175720 + }, + { + "epoch": 0.14865, + "grad_norm": 0.029926873743534088, + "learning_rate": 1.986284091819884e-06, + "loss": 0.0323, + "step": 175730 + }, + { + "epoch": 0.1487, + "grad_norm": 0.034845590591430664, + "learning_rate": 1.9846696775393413e-06, + "loss": 0.0314, + "step": 175740 + }, + { + "epoch": 0.14875, + "grad_norm": 0.030692042782902718, + "learning_rate": 1.98305589248928e-06, + "loss": 0.0323, + "step": 175750 + }, + { + "epoch": 0.1488, + "grad_norm": 0.04207862541079521, + "learning_rate": 1.9814427367138207e-06, + "loss": 0.032, + "step": 175760 + }, + { + "epoch": 0.14885, + "grad_norm": 0.033570546656847, + "learning_rate": 1.9798302102570747e-06, + "loss": 0.032, + "step": 175770 + }, + { + "epoch": 0.1489, + "grad_norm": 0.03618871420621872, + "learning_rate": 1.978218313163119e-06, + "loss": 0.0347, + "step": 175780 + }, + { + "epoch": 0.14895, + "grad_norm": 0.03668270260095596, + "learning_rate": 1.976607045476028e-06, + "loss": 0.0326, + "step": 175790 + }, + { + "epoch": 0.149, + "grad_norm": 0.03377063572406769, + "learning_rate": 1.9749964072398487e-06, + "loss": 0.0345, + "step": 175800 + }, + { + "epoch": 0.14905, + "grad_norm": 0.03670993819832802, + "learning_rate": 1.9733863984986195e-06, + "loss": 0.0334, + "step": 175810 + }, + { + "epoch": 0.1491, + "grad_norm": 0.03634348511695862, + "learning_rate": 1.9717770192963502e-06, + "loss": 0.0338, + "step": 175820 + }, + { + "epoch": 0.14915, + "grad_norm": 0.037743836641311646, + "learning_rate": 1.97016826967705e-06, + "loss": 0.0337, + "step": 175830 + }, + { + "epoch": 0.1492, + "grad_norm": 0.03787601739168167, + "learning_rate": 1.9685601496847006e-06, + "loss": 0.0341, + "step": 175840 + }, + { + "epoch": 0.14925, + "grad_norm": 0.03569841384887695, + "learning_rate": 1.966952659363258e-06, + "loss": 0.0346, + "step": 175850 + }, + { + "epoch": 0.1493, + "grad_norm": 0.032588619738817215, + "learning_rate": 1.965345798756679e-06, + "loss": 0.0333, + "step": 175860 + }, + { + "epoch": 0.14935, + "grad_norm": 0.039640285074710846, + "learning_rate": 1.9637395679088956e-06, + "loss": 0.0333, + "step": 175870 + }, + { + "epoch": 0.1494, + "grad_norm": 0.03261444345116615, + "learning_rate": 1.9621339668638105e-06, + "loss": 0.0331, + "step": 175880 + }, + { + "epoch": 0.14945, + "grad_norm": 0.03778470307588577, + "learning_rate": 1.9605289956653337e-06, + "loss": 0.0351, + "step": 175890 + }, + { + "epoch": 0.1495, + "grad_norm": 0.03554508462548256, + "learning_rate": 1.958924654357336e-06, + "loss": 0.0336, + "step": 175900 + }, + { + "epoch": 0.14955, + "grad_norm": 0.03847133368253708, + "learning_rate": 1.9573209429836897e-06, + "loss": 0.033, + "step": 175910 + }, + { + "epoch": 0.1496, + "grad_norm": 0.03790833801031113, + "learning_rate": 1.9557178615882217e-06, + "loss": 0.0327, + "step": 175920 + }, + { + "epoch": 0.14965, + "grad_norm": 0.05213003233075142, + "learning_rate": 1.954115410214777e-06, + "loss": 0.034, + "step": 175930 + }, + { + "epoch": 0.1497, + "grad_norm": 0.03753867745399475, + "learning_rate": 1.9525135889071538e-06, + "loss": 0.0338, + "step": 175940 + }, + { + "epoch": 0.14975, + "grad_norm": 0.03490155190229416, + "learning_rate": 1.9509123977091536e-06, + "loss": 0.0341, + "step": 175950 + }, + { + "epoch": 0.1498, + "grad_norm": 0.03734445571899414, + "learning_rate": 1.9493118366645497e-06, + "loss": 0.0337, + "step": 175960 + }, + { + "epoch": 0.14985, + "grad_norm": 0.03365827724337578, + "learning_rate": 1.9477119058170957e-06, + "loss": 0.0333, + "step": 175970 + }, + { + "epoch": 0.1499, + "grad_norm": 0.03264923766255379, + "learning_rate": 1.946112605210543e-06, + "loss": 0.0324, + "step": 175980 + }, + { + "epoch": 0.14995, + "grad_norm": 0.03164662420749664, + "learning_rate": 1.9445139348886114e-06, + "loss": 0.0327, + "step": 175990 + }, + { + "epoch": 0.15, + "grad_norm": 0.032765284180641174, + "learning_rate": 1.942915894895006e-06, + "loss": 0.0333, + "step": 176000 + }, + { + "epoch": 0.15005, + "grad_norm": 0.03914589062333107, + "learning_rate": 1.9413184852734163e-06, + "loss": 0.0342, + "step": 176010 + }, + { + "epoch": 0.1501, + "grad_norm": 0.04127218574285507, + "learning_rate": 1.9397217060675128e-06, + "loss": 0.0325, + "step": 176020 + }, + { + "epoch": 0.15015, + "grad_norm": 0.03312616050243378, + "learning_rate": 1.9381255573209607e-06, + "loss": 0.0328, + "step": 176030 + }, + { + "epoch": 0.1502, + "grad_norm": 0.03647003322839737, + "learning_rate": 1.9365300390773915e-06, + "loss": 0.0317, + "step": 176040 + }, + { + "epoch": 0.15025, + "grad_norm": 0.037491753697395325, + "learning_rate": 1.9349351513804267e-06, + "loss": 0.0317, + "step": 176050 + }, + { + "epoch": 0.1503, + "grad_norm": 0.04122772812843323, + "learning_rate": 1.9333408942736663e-06, + "loss": 0.0329, + "step": 176060 + }, + { + "epoch": 0.15035, + "grad_norm": 0.02853936143219471, + "learning_rate": 1.9317472678007016e-06, + "loss": 0.0325, + "step": 176070 + }, + { + "epoch": 0.1504, + "grad_norm": 0.03218531236052513, + "learning_rate": 1.9301542720051024e-06, + "loss": 0.0352, + "step": 176080 + }, + { + "epoch": 0.15045, + "grad_norm": 0.035726893693208694, + "learning_rate": 1.928561906930415e-06, + "loss": 0.0326, + "step": 176090 + }, + { + "epoch": 0.1505, + "grad_norm": 0.033531490713357925, + "learning_rate": 1.9269701726201795e-06, + "loss": 0.032, + "step": 176100 + }, + { + "epoch": 0.15055, + "grad_norm": 0.03677698224782944, + "learning_rate": 1.925379069117908e-06, + "loss": 0.0334, + "step": 176110 + }, + { + "epoch": 0.1506, + "grad_norm": 0.031199825927615166, + "learning_rate": 1.923788596467113e-06, + "loss": 0.0308, + "step": 176120 + }, + { + "epoch": 0.15065, + "grad_norm": 0.03129459545016289, + "learning_rate": 1.9221987547112603e-06, + "loss": 0.0317, + "step": 176130 + }, + { + "epoch": 0.1507, + "grad_norm": 0.030460912734270096, + "learning_rate": 1.9206095438938225e-06, + "loss": 0.0314, + "step": 176140 + }, + { + "epoch": 0.15075, + "grad_norm": 0.030488573014736176, + "learning_rate": 1.9190209640582547e-06, + "loss": 0.0316, + "step": 176150 + }, + { + "epoch": 0.1508, + "grad_norm": 0.0343417227268219, + "learning_rate": 1.9174330152479825e-06, + "loss": 0.0325, + "step": 176160 + }, + { + "epoch": 0.15085, + "grad_norm": 0.033270884305238724, + "learning_rate": 1.9158456975064186e-06, + "loss": 0.0328, + "step": 176170 + }, + { + "epoch": 0.1509, + "grad_norm": 0.030955199152231216, + "learning_rate": 1.9142590108769593e-06, + "loss": 0.032, + "step": 176180 + }, + { + "epoch": 0.15095, + "grad_norm": 0.03370071202516556, + "learning_rate": 1.912672955402986e-06, + "loss": 0.0316, + "step": 176190 + }, + { + "epoch": 0.151, + "grad_norm": 0.03444530814886093, + "learning_rate": 1.9110875311278643e-06, + "loss": 0.0325, + "step": 176200 + }, + { + "epoch": 0.15105, + "grad_norm": 0.032148417085409164, + "learning_rate": 1.9095027380949288e-06, + "loss": 0.0326, + "step": 176210 + }, + { + "epoch": 0.1511, + "grad_norm": 0.035337161272764206, + "learning_rate": 1.90791857634752e-06, + "loss": 0.0324, + "step": 176220 + }, + { + "epoch": 0.15115, + "grad_norm": 0.03444439545273781, + "learning_rate": 1.9063350459289364e-06, + "loss": 0.033, + "step": 176230 + }, + { + "epoch": 0.1512, + "grad_norm": 0.031171441078186035, + "learning_rate": 1.9047521468824826e-06, + "loss": 0.0327, + "step": 176240 + }, + { + "epoch": 0.15125, + "grad_norm": 0.03326416015625, + "learning_rate": 1.9031698792514262e-06, + "loss": 0.0326, + "step": 176250 + }, + { + "epoch": 0.1513, + "grad_norm": 0.03335323929786682, + "learning_rate": 1.901588243079025e-06, + "loss": 0.0322, + "step": 176260 + }, + { + "epoch": 0.15135, + "grad_norm": 0.03669118508696556, + "learning_rate": 1.9000072384085272e-06, + "loss": 0.0338, + "step": 176270 + }, + { + "epoch": 0.1514, + "grad_norm": 0.039805181324481964, + "learning_rate": 1.8984268652831538e-06, + "loss": 0.0309, + "step": 176280 + }, + { + "epoch": 0.15145, + "grad_norm": 0.03643699735403061, + "learning_rate": 1.896847123746112e-06, + "loss": 0.0319, + "step": 176290 + }, + { + "epoch": 0.1515, + "grad_norm": 0.033225320279598236, + "learning_rate": 1.8952680138405843e-06, + "loss": 0.0314, + "step": 176300 + }, + { + "epoch": 0.15155, + "grad_norm": 0.03185262903571129, + "learning_rate": 1.8936895356097551e-06, + "loss": 0.0321, + "step": 176310 + }, + { + "epoch": 0.1516, + "grad_norm": 0.03353235498070717, + "learning_rate": 1.8921116890967678e-06, + "loss": 0.0333, + "step": 176320 + }, + { + "epoch": 0.15165, + "grad_norm": 0.036939650774002075, + "learning_rate": 1.890534474344771e-06, + "loss": 0.0328, + "step": 176330 + }, + { + "epoch": 0.1517, + "grad_norm": 0.03296613693237305, + "learning_rate": 1.8889578913968807e-06, + "loss": 0.0306, + "step": 176340 + }, + { + "epoch": 0.15175, + "grad_norm": 0.03160223364830017, + "learning_rate": 1.8873819402961929e-06, + "loss": 0.0332, + "step": 176350 + }, + { + "epoch": 0.1518, + "grad_norm": 0.03642089292407036, + "learning_rate": 1.8858066210858061e-06, + "loss": 0.0319, + "step": 176360 + }, + { + "epoch": 0.15185, + "grad_norm": 0.03370288014411926, + "learning_rate": 1.8842319338087805e-06, + "loss": 0.0324, + "step": 176370 + }, + { + "epoch": 0.1519, + "grad_norm": 0.03192954510450363, + "learning_rate": 1.8826578785081705e-06, + "loss": 0.0313, + "step": 176380 + }, + { + "epoch": 0.15195, + "grad_norm": 0.03451680764555931, + "learning_rate": 1.8810844552270058e-06, + "loss": 0.0328, + "step": 176390 + }, + { + "epoch": 0.152, + "grad_norm": 0.03376030921936035, + "learning_rate": 1.8795116640083044e-06, + "loss": 0.0333, + "step": 176400 + }, + { + "epoch": 0.15205, + "grad_norm": 0.035202499479055405, + "learning_rate": 1.8779395048950794e-06, + "loss": 0.0325, + "step": 176410 + }, + { + "epoch": 0.1521, + "grad_norm": 0.03488573059439659, + "learning_rate": 1.8763679779302935e-06, + "loss": 0.0318, + "step": 176420 + }, + { + "epoch": 0.15215, + "grad_norm": 0.03220974653959274, + "learning_rate": 1.8747970831569206e-06, + "loss": 0.0324, + "step": 176430 + }, + { + "epoch": 0.1522, + "grad_norm": 0.03496704250574112, + "learning_rate": 1.8732268206179072e-06, + "loss": 0.0321, + "step": 176440 + }, + { + "epoch": 0.15225, + "grad_norm": 0.030801020562648773, + "learning_rate": 1.8716571903561853e-06, + "loss": 0.0321, + "step": 176450 + }, + { + "epoch": 0.1523, + "grad_norm": 0.03100545145571232, + "learning_rate": 1.8700881924146707e-06, + "loss": 0.0331, + "step": 176460 + }, + { + "epoch": 0.15235, + "grad_norm": 0.03517313301563263, + "learning_rate": 1.8685198268362486e-06, + "loss": 0.0331, + "step": 176470 + }, + { + "epoch": 0.1524, + "grad_norm": 0.035738781094551086, + "learning_rate": 1.8669520936638096e-06, + "loss": 0.0333, + "step": 176480 + }, + { + "epoch": 0.15245, + "grad_norm": 0.040103767067193985, + "learning_rate": 1.8653849929402084e-06, + "loss": 0.0342, + "step": 176490 + }, + { + "epoch": 0.1525, + "grad_norm": 0.03443825617432594, + "learning_rate": 1.8638185247082912e-06, + "loss": 0.0328, + "step": 176500 + }, + { + "epoch": 0.15255, + "grad_norm": 0.03138561546802521, + "learning_rate": 1.8622526890108795e-06, + "loss": 0.0328, + "step": 176510 + }, + { + "epoch": 0.1526, + "grad_norm": 0.03944150358438492, + "learning_rate": 1.8606874858907859e-06, + "loss": 0.033, + "step": 176520 + }, + { + "epoch": 0.15265, + "grad_norm": 0.03146817907691002, + "learning_rate": 1.859122915390807e-06, + "loss": 0.0327, + "step": 176530 + }, + { + "epoch": 0.1527, + "grad_norm": 0.03628481552004814, + "learning_rate": 1.8575589775537166e-06, + "loss": 0.0327, + "step": 176540 + }, + { + "epoch": 0.15275, + "grad_norm": 0.030864441767334938, + "learning_rate": 1.8559956724222672e-06, + "loss": 0.0332, + "step": 176550 + }, + { + "epoch": 0.1528, + "grad_norm": 0.03527746722102165, + "learning_rate": 1.854433000039199e-06, + "loss": 0.0318, + "step": 176560 + }, + { + "epoch": 0.15285, + "grad_norm": 0.030624518170952797, + "learning_rate": 1.8528709604472393e-06, + "loss": 0.0325, + "step": 176570 + }, + { + "epoch": 0.1529, + "grad_norm": 0.034199099987745285, + "learning_rate": 1.8513095536890928e-06, + "loss": 0.032, + "step": 176580 + }, + { + "epoch": 0.15295, + "grad_norm": 0.032137248665094376, + "learning_rate": 1.849748779807442e-06, + "loss": 0.0329, + "step": 176590 + }, + { + "epoch": 0.153, + "grad_norm": 0.0312294140458107, + "learning_rate": 1.8481886388449694e-06, + "loss": 0.0314, + "step": 176600 + }, + { + "epoch": 0.15305, + "grad_norm": 0.03156765550374985, + "learning_rate": 1.846629130844313e-06, + "loss": 0.0322, + "step": 176610 + }, + { + "epoch": 0.1531, + "grad_norm": 0.03706734627485275, + "learning_rate": 1.8450702558481304e-06, + "loss": 0.0305, + "step": 176620 + }, + { + "epoch": 0.15315, + "grad_norm": 0.03096882998943329, + "learning_rate": 1.8435120138990152e-06, + "loss": 0.0319, + "step": 176630 + }, + { + "epoch": 0.1532, + "grad_norm": 0.034606993198394775, + "learning_rate": 1.8419544050395865e-06, + "loss": 0.0313, + "step": 176640 + }, + { + "epoch": 0.15325, + "grad_norm": 0.031098688021302223, + "learning_rate": 1.8403974293124265e-06, + "loss": 0.0318, + "step": 176650 + }, + { + "epoch": 0.1533, + "grad_norm": 0.03658682480454445, + "learning_rate": 1.8388410867600986e-06, + "loss": 0.0319, + "step": 176660 + }, + { + "epoch": 0.15335, + "grad_norm": 0.03677098825573921, + "learning_rate": 1.8372853774251548e-06, + "loss": 0.0329, + "step": 176670 + }, + { + "epoch": 0.1534, + "grad_norm": 0.0359707847237587, + "learning_rate": 1.8357303013501248e-06, + "loss": 0.0323, + "step": 176680 + }, + { + "epoch": 0.15345, + "grad_norm": 0.03360195830464363, + "learning_rate": 1.8341758585775276e-06, + "loss": 0.0323, + "step": 176690 + }, + { + "epoch": 0.1535, + "grad_norm": 0.0349896065890789, + "learning_rate": 1.8326220491498624e-06, + "loss": 0.0311, + "step": 176700 + }, + { + "epoch": 0.15355, + "grad_norm": 0.03248278796672821, + "learning_rate": 1.831068873109601e-06, + "loss": 0.0328, + "step": 176710 + }, + { + "epoch": 0.1536, + "grad_norm": 0.03948421776294708, + "learning_rate": 1.8295163304992147e-06, + "loss": 0.0327, + "step": 176720 + }, + { + "epoch": 0.15365, + "grad_norm": 0.03639651834964752, + "learning_rate": 1.8279644213611447e-06, + "loss": 0.0326, + "step": 176730 + }, + { + "epoch": 0.1537, + "grad_norm": 0.03521611541509628, + "learning_rate": 1.8264131457378264e-06, + "loss": 0.0335, + "step": 176740 + }, + { + "epoch": 0.15375, + "grad_norm": 0.02922460436820984, + "learning_rate": 1.8248625036716676e-06, + "loss": 0.0321, + "step": 176750 + }, + { + "epoch": 0.1538, + "grad_norm": 0.03481606766581535, + "learning_rate": 1.8233124952050596e-06, + "loss": 0.0363, + "step": 176760 + }, + { + "epoch": 0.15385, + "grad_norm": 0.03618673235177994, + "learning_rate": 1.8217631203803791e-06, + "loss": 0.0339, + "step": 176770 + }, + { + "epoch": 0.1539, + "grad_norm": 0.03688351437449455, + "learning_rate": 1.82021437923999e-06, + "loss": 0.0325, + "step": 176780 + }, + { + "epoch": 0.15395, + "grad_norm": 0.030665552243590355, + "learning_rate": 1.8186662718262304e-06, + "loss": 0.0337, + "step": 176790 + }, + { + "epoch": 0.154, + "grad_norm": 0.03393794968724251, + "learning_rate": 1.8171187981814247e-06, + "loss": 0.0323, + "step": 176800 + }, + { + "epoch": 0.15405, + "grad_norm": 0.03129062056541443, + "learning_rate": 1.8155719583478836e-06, + "loss": 0.0334, + "step": 176810 + }, + { + "epoch": 0.1541, + "grad_norm": 0.036447811871767044, + "learning_rate": 1.8140257523678928e-06, + "loss": 0.033, + "step": 176820 + }, + { + "epoch": 0.15415, + "grad_norm": 0.03580206260085106, + "learning_rate": 1.812480180283735e-06, + "loss": 0.0348, + "step": 176830 + }, + { + "epoch": 0.1542, + "grad_norm": 0.03224453702569008, + "learning_rate": 1.8109352421376486e-06, + "loss": 0.0325, + "step": 176840 + }, + { + "epoch": 0.15425, + "grad_norm": 0.03797399252653122, + "learning_rate": 1.8093909379718804e-06, + "loss": 0.0328, + "step": 176850 + }, + { + "epoch": 0.1543, + "grad_norm": 0.0390145443379879, + "learning_rate": 1.807847267828658e-06, + "loss": 0.0334, + "step": 176860 + }, + { + "epoch": 0.15435, + "grad_norm": 0.03498688340187073, + "learning_rate": 1.8063042317501777e-06, + "loss": 0.0331, + "step": 176870 + }, + { + "epoch": 0.1544, + "grad_norm": 0.03264835849404335, + "learning_rate": 1.8047618297786257e-06, + "loss": 0.0336, + "step": 176880 + }, + { + "epoch": 0.15445, + "grad_norm": 0.04160667210817337, + "learning_rate": 1.803220061956165e-06, + "loss": 0.0355, + "step": 176890 + }, + { + "epoch": 0.1545, + "grad_norm": 0.036346692591905594, + "learning_rate": 1.8016789283249568e-06, + "loss": 0.0336, + "step": 176900 + }, + { + "epoch": 0.15455, + "grad_norm": 0.03496379032731056, + "learning_rate": 1.8001384289271394e-06, + "loss": 0.0331, + "step": 176910 + }, + { + "epoch": 0.1546, + "grad_norm": 0.03790908679366112, + "learning_rate": 1.7985985638048098e-06, + "loss": 0.0336, + "step": 176920 + }, + { + "epoch": 0.15465, + "grad_norm": 0.03414648398756981, + "learning_rate": 1.7970593330000867e-06, + "loss": 0.0331, + "step": 176930 + }, + { + "epoch": 0.1547, + "grad_norm": 0.03520461916923523, + "learning_rate": 1.7955207365550398e-06, + "loss": 0.032, + "step": 176940 + }, + { + "epoch": 0.15475, + "grad_norm": 0.03670840337872505, + "learning_rate": 1.7939827745117433e-06, + "loss": 0.0345, + "step": 176950 + }, + { + "epoch": 0.1548, + "grad_norm": 0.040852636098861694, + "learning_rate": 1.7924454469122387e-06, + "loss": 0.0339, + "step": 176960 + }, + { + "epoch": 0.15485, + "grad_norm": 0.04101041704416275, + "learning_rate": 1.7909087537985509e-06, + "loss": 0.0334, + "step": 176970 + }, + { + "epoch": 0.1549, + "grad_norm": 0.0339178703725338, + "learning_rate": 1.7893726952127043e-06, + "loss": 0.0329, + "step": 176980 + }, + { + "epoch": 0.15495, + "grad_norm": 0.035663388669490814, + "learning_rate": 1.7878372711966879e-06, + "loss": 0.0333, + "step": 176990 + }, + { + "epoch": 0.155, + "grad_norm": 0.038831040263175964, + "learning_rate": 1.7863024817924816e-06, + "loss": 0.0332, + "step": 177000 + }, + { + "epoch": 0.15505, + "grad_norm": 0.03730317950248718, + "learning_rate": 1.7847683270420385e-06, + "loss": 0.0335, + "step": 177010 + }, + { + "epoch": 0.1551, + "grad_norm": 0.03230489417910576, + "learning_rate": 1.7832348069873134e-06, + "loss": 0.0326, + "step": 177020 + }, + { + "epoch": 0.15515, + "grad_norm": 0.038198865950107574, + "learning_rate": 1.781701921670223e-06, + "loss": 0.033, + "step": 177030 + }, + { + "epoch": 0.1552, + "grad_norm": 0.03649018704891205, + "learning_rate": 1.780169671132681e-06, + "loss": 0.0317, + "step": 177040 + }, + { + "epoch": 0.15525, + "grad_norm": 0.03312964364886284, + "learning_rate": 1.7786380554165787e-06, + "loss": 0.0336, + "step": 177050 + }, + { + "epoch": 0.1553, + "grad_norm": 0.03336400166153908, + "learning_rate": 1.777107074563783e-06, + "loss": 0.0325, + "step": 177060 + }, + { + "epoch": 0.15535, + "grad_norm": 0.0388801284134388, + "learning_rate": 1.77557672861616e-06, + "loss": 0.033, + "step": 177070 + }, + { + "epoch": 0.1554, + "grad_norm": 0.0346393957734108, + "learning_rate": 1.7740470176155434e-06, + "loss": 0.0321, + "step": 177080 + }, + { + "epoch": 0.15545, + "grad_norm": 0.03467172011733055, + "learning_rate": 1.7725179416037546e-06, + "loss": 0.0326, + "step": 177090 + }, + { + "epoch": 0.1555, + "grad_norm": 0.03163683041930199, + "learning_rate": 1.7709895006225996e-06, + "loss": 0.0321, + "step": 177100 + }, + { + "epoch": 0.15555, + "grad_norm": 0.03598109260201454, + "learning_rate": 1.7694616947138642e-06, + "loss": 0.0316, + "step": 177110 + }, + { + "epoch": 0.1556, + "grad_norm": 0.033809930086135864, + "learning_rate": 1.7679345239193234e-06, + "loss": 0.0316, + "step": 177120 + }, + { + "epoch": 0.15565, + "grad_norm": 0.03225461021065712, + "learning_rate": 1.766407988280719e-06, + "loss": 0.0331, + "step": 177130 + }, + { + "epoch": 0.1557, + "grad_norm": 0.0304391048848629, + "learning_rate": 1.7648820878397948e-06, + "loss": 0.0342, + "step": 177140 + }, + { + "epoch": 0.15575, + "grad_norm": 0.03615155071020126, + "learning_rate": 1.7633568226382624e-06, + "loss": 0.0339, + "step": 177150 + }, + { + "epoch": 0.1558, + "grad_norm": 0.030978847295045853, + "learning_rate": 1.761832192717827e-06, + "loss": 0.0337, + "step": 177160 + }, + { + "epoch": 0.15585, + "grad_norm": 0.031113559380173683, + "learning_rate": 1.7603081981201696e-06, + "loss": 0.0332, + "step": 177170 + }, + { + "epoch": 0.1559, + "grad_norm": 0.03291467949748039, + "learning_rate": 1.7587848388869539e-06, + "loss": 0.0332, + "step": 177180 + }, + { + "epoch": 0.15595, + "grad_norm": 0.036255598068237305, + "learning_rate": 1.7572621150598323e-06, + "loss": 0.0329, + "step": 177190 + }, + { + "epoch": 0.156, + "grad_norm": 0.03551039472222328, + "learning_rate": 1.7557400266804302e-06, + "loss": 0.0345, + "step": 177200 + }, + { + "epoch": 0.15605, + "grad_norm": 0.040075451135635376, + "learning_rate": 1.7542185737903643e-06, + "loss": 0.0359, + "step": 177210 + }, + { + "epoch": 0.1561, + "grad_norm": 0.03435356914997101, + "learning_rate": 1.7526977564312263e-06, + "loss": 0.0351, + "step": 177220 + }, + { + "epoch": 0.15615, + "grad_norm": 0.03493310511112213, + "learning_rate": 1.7511775746445997e-06, + "loss": 0.0342, + "step": 177230 + }, + { + "epoch": 0.1562, + "grad_norm": 0.029415188357234, + "learning_rate": 1.7496580284720455e-06, + "loss": 0.0331, + "step": 177240 + }, + { + "epoch": 0.15625, + "grad_norm": 0.035070475190877914, + "learning_rate": 1.7481391179551082e-06, + "loss": 0.0328, + "step": 177250 + }, + { + "epoch": 0.1563, + "grad_norm": 0.03617245703935623, + "learning_rate": 1.7466208431353104e-06, + "loss": 0.033, + "step": 177260 + }, + { + "epoch": 0.15635, + "grad_norm": 0.033768750727176666, + "learning_rate": 1.7451032040541576e-06, + "loss": 0.0346, + "step": 177270 + }, + { + "epoch": 0.1564, + "grad_norm": 0.03762581944465637, + "learning_rate": 1.7435862007531527e-06, + "loss": 0.0343, + "step": 177280 + }, + { + "epoch": 0.15645, + "grad_norm": 0.031112609431147575, + "learning_rate": 1.742069833273763e-06, + "loss": 0.0328, + "step": 177290 + }, + { + "epoch": 0.1565, + "grad_norm": 0.037205617874860764, + "learning_rate": 1.7405541016574434e-06, + "loss": 0.0349, + "step": 177300 + }, + { + "epoch": 0.15655, + "grad_norm": 0.030738942325115204, + "learning_rate": 1.739039005945642e-06, + "loss": 0.033, + "step": 177310 + }, + { + "epoch": 0.1566, + "grad_norm": 0.03419323265552521, + "learning_rate": 1.7375245461797696e-06, + "loss": 0.0336, + "step": 177320 + }, + { + "epoch": 0.15665, + "grad_norm": 0.038969021290540695, + "learning_rate": 1.7360107224012434e-06, + "loss": 0.0347, + "step": 177330 + }, + { + "epoch": 0.1567, + "grad_norm": 0.03191017359495163, + "learning_rate": 1.7344975346514387e-06, + "loss": 0.0325, + "step": 177340 + }, + { + "epoch": 0.15675, + "grad_norm": 0.03612587973475456, + "learning_rate": 1.7329849829717303e-06, + "loss": 0.0333, + "step": 177350 + }, + { + "epoch": 0.1568, + "grad_norm": 0.0304668340831995, + "learning_rate": 1.7314730674034745e-06, + "loss": 0.0355, + "step": 177360 + }, + { + "epoch": 0.15685, + "grad_norm": 0.03494877740740776, + "learning_rate": 1.7299617879880048e-06, + "loss": 0.0318, + "step": 177370 + }, + { + "epoch": 0.1569, + "grad_norm": 0.03373005986213684, + "learning_rate": 1.7284511447666352e-06, + "loss": 0.0333, + "step": 177380 + }, + { + "epoch": 0.15695, + "grad_norm": 0.03489000350236893, + "learning_rate": 1.726941137780666e-06, + "loss": 0.0343, + "step": 177390 + }, + { + "epoch": 0.157, + "grad_norm": 0.03359324857592583, + "learning_rate": 1.7254317670713894e-06, + "loss": 0.0339, + "step": 177400 + }, + { + "epoch": 0.15705, + "grad_norm": 0.036254171282052994, + "learning_rate": 1.723923032680061e-06, + "loss": 0.0337, + "step": 177410 + }, + { + "epoch": 0.1571, + "grad_norm": 0.031140070408582687, + "learning_rate": 1.7224149346479285e-06, + "loss": 0.0312, + "step": 177420 + }, + { + "epoch": 0.15715, + "grad_norm": 0.03355412557721138, + "learning_rate": 1.7209074730162338e-06, + "loss": 0.0328, + "step": 177430 + }, + { + "epoch": 0.1572, + "grad_norm": 0.03463577479124069, + "learning_rate": 1.7194006478261776e-06, + "loss": 0.0318, + "step": 177440 + }, + { + "epoch": 0.15725, + "grad_norm": 0.03454165533185005, + "learning_rate": 1.7178944591189656e-06, + "loss": 0.0331, + "step": 177450 + }, + { + "epoch": 0.1573, + "grad_norm": 0.03468501567840576, + "learning_rate": 1.7163889069357702e-06, + "loss": 0.0333, + "step": 177460 + }, + { + "epoch": 0.15735, + "grad_norm": 0.03777414560317993, + "learning_rate": 1.7148839913177533e-06, + "loss": 0.0321, + "step": 177470 + }, + { + "epoch": 0.1574, + "grad_norm": 0.038221944123506546, + "learning_rate": 1.7133797123060625e-06, + "loss": 0.0347, + "step": 177480 + }, + { + "epoch": 0.15745, + "grad_norm": 0.031114550307393074, + "learning_rate": 1.711876069941823e-06, + "loss": 0.0339, + "step": 177490 + }, + { + "epoch": 0.1575, + "grad_norm": 0.037087131291627884, + "learning_rate": 1.7103730642661436e-06, + "loss": 0.0331, + "step": 177500 + }, + { + "epoch": 0.15755, + "grad_norm": 0.036503974348306656, + "learning_rate": 1.708870695320111e-06, + "loss": 0.0334, + "step": 177510 + }, + { + "epoch": 0.1576, + "grad_norm": 0.0365588515996933, + "learning_rate": 1.7073689631448063e-06, + "loss": 0.0333, + "step": 177520 + }, + { + "epoch": 0.15765, + "grad_norm": 0.03332650661468506, + "learning_rate": 1.70586786778128e-06, + "loss": 0.0337, + "step": 177530 + }, + { + "epoch": 0.1577, + "grad_norm": 0.044379010796546936, + "learning_rate": 1.7043674092705798e-06, + "loss": 0.0328, + "step": 177540 + }, + { + "epoch": 0.15775, + "grad_norm": 0.03675961121916771, + "learning_rate": 1.7028675876537225e-06, + "loss": 0.0329, + "step": 177550 + }, + { + "epoch": 0.1578, + "grad_norm": 0.03883613646030426, + "learning_rate": 1.7013684029717093e-06, + "loss": 0.0329, + "step": 177560 + }, + { + "epoch": 0.15785, + "grad_norm": 0.03839869052171707, + "learning_rate": 1.6998698552655345e-06, + "loss": 0.0329, + "step": 177570 + }, + { + "epoch": 0.1579, + "grad_norm": 0.03880983963608742, + "learning_rate": 1.698371944576163e-06, + "loss": 0.036, + "step": 177580 + }, + { + "epoch": 0.15795, + "grad_norm": 0.037065938115119934, + "learning_rate": 1.696874670944551e-06, + "loss": 0.0322, + "step": 177590 + }, + { + "epoch": 0.158, + "grad_norm": 0.0350380539894104, + "learning_rate": 1.6953780344116265e-06, + "loss": 0.0335, + "step": 177600 + }, + { + "epoch": 0.15805, + "grad_norm": 0.0355491079390049, + "learning_rate": 1.6938820350183098e-06, + "loss": 0.0332, + "step": 177610 + }, + { + "epoch": 0.1581, + "grad_norm": 0.033673424273729324, + "learning_rate": 1.6923866728055127e-06, + "loss": 0.0349, + "step": 177620 + }, + { + "epoch": 0.15815, + "grad_norm": 0.033299632370471954, + "learning_rate": 1.6908919478140966e-06, + "loss": 0.0327, + "step": 177630 + }, + { + "epoch": 0.1582, + "grad_norm": 0.031211506575345993, + "learning_rate": 1.689397860084946e-06, + "loss": 0.0366, + "step": 177640 + }, + { + "epoch": 0.15825, + "grad_norm": 0.03407136723399162, + "learning_rate": 1.6879044096588942e-06, + "loss": 0.0331, + "step": 177650 + }, + { + "epoch": 0.1583, + "grad_norm": 0.038370538502931595, + "learning_rate": 1.6864115965767814e-06, + "loss": 0.0355, + "step": 177660 + }, + { + "epoch": 0.15835, + "grad_norm": 0.03854278847575188, + "learning_rate": 1.6849194208794162e-06, + "loss": 0.0327, + "step": 177670 + }, + { + "epoch": 0.1584, + "grad_norm": 0.04024885594844818, + "learning_rate": 1.6834278826075939e-06, + "loss": 0.0336, + "step": 177680 + }, + { + "epoch": 0.15845, + "grad_norm": 0.03746865317225456, + "learning_rate": 1.6819369818020959e-06, + "loss": 0.0327, + "step": 177690 + }, + { + "epoch": 0.1585, + "grad_norm": 0.041079357266426086, + "learning_rate": 1.680446718503681e-06, + "loss": 0.0353, + "step": 177700 + }, + { + "epoch": 0.15855, + "grad_norm": 0.039807822555303574, + "learning_rate": 1.6789570927530918e-06, + "loss": 0.0331, + "step": 177710 + }, + { + "epoch": 0.1586, + "grad_norm": 0.03857456520199776, + "learning_rate": 1.6774681045910513e-06, + "loss": 0.0343, + "step": 177720 + }, + { + "epoch": 0.15865, + "grad_norm": 0.03497397527098656, + "learning_rate": 1.6759797540582684e-06, + "loss": 0.0327, + "step": 177730 + }, + { + "epoch": 0.1587, + "grad_norm": 0.034522783011198044, + "learning_rate": 1.674492041195444e-06, + "loss": 0.0324, + "step": 177740 + }, + { + "epoch": 0.15875, + "grad_norm": 0.04140979051589966, + "learning_rate": 1.6730049660432429e-06, + "loss": 0.033, + "step": 177750 + }, + { + "epoch": 0.1588, + "grad_norm": 0.04125241935253143, + "learning_rate": 1.6715185286423213e-06, + "loss": 0.0333, + "step": 177760 + }, + { + "epoch": 0.15885, + "grad_norm": 0.031144708395004272, + "learning_rate": 1.6700327290333163e-06, + "loss": 0.0326, + "step": 177770 + }, + { + "epoch": 0.1589, + "grad_norm": 0.03373735770583153, + "learning_rate": 1.6685475672568562e-06, + "loss": 0.0341, + "step": 177780 + }, + { + "epoch": 0.15895, + "grad_norm": 0.043694883584976196, + "learning_rate": 1.6670630433535395e-06, + "loss": 0.032, + "step": 177790 + }, + { + "epoch": 0.159, + "grad_norm": 0.03402690961956978, + "learning_rate": 1.6655791573639473e-06, + "loss": 0.0318, + "step": 177800 + }, + { + "epoch": 0.15905, + "grad_norm": 0.03880279138684273, + "learning_rate": 1.6640959093286612e-06, + "loss": 0.0317, + "step": 177810 + }, + { + "epoch": 0.1591, + "grad_norm": 0.032201413065195084, + "learning_rate": 1.6626132992882238e-06, + "loss": 0.0318, + "step": 177820 + }, + { + "epoch": 0.15915, + "grad_norm": 0.03442845121026039, + "learning_rate": 1.6611313272831747e-06, + "loss": 0.0318, + "step": 177830 + }, + { + "epoch": 0.1592, + "grad_norm": 0.029142117127776146, + "learning_rate": 1.6596499933540233e-06, + "loss": 0.034, + "step": 177840 + }, + { + "epoch": 0.15925, + "grad_norm": 0.030788760632276535, + "learning_rate": 1.6581692975412705e-06, + "loss": 0.0317, + "step": 177850 + }, + { + "epoch": 0.1593, + "grad_norm": 0.028324414044618607, + "learning_rate": 1.6566892398854033e-06, + "loss": 0.0327, + "step": 177860 + }, + { + "epoch": 0.15935, + "grad_norm": 0.03501060605049133, + "learning_rate": 1.655209820426884e-06, + "loss": 0.0336, + "step": 177870 + }, + { + "epoch": 0.1594, + "grad_norm": 0.03039061650633812, + "learning_rate": 1.6537310392061578e-06, + "loss": 0.0339, + "step": 177880 + }, + { + "epoch": 0.15945, + "grad_norm": 0.03187539055943489, + "learning_rate": 1.6522528962636507e-06, + "loss": 0.0353, + "step": 177890 + }, + { + "epoch": 0.1595, + "grad_norm": 0.03798288479447365, + "learning_rate": 1.6507753916397806e-06, + "loss": 0.0337, + "step": 177900 + }, + { + "epoch": 0.15955, + "grad_norm": 0.04895629361271858, + "learning_rate": 1.6492985253749399e-06, + "loss": 0.0355, + "step": 177910 + }, + { + "epoch": 0.1596, + "grad_norm": 0.04290872439742088, + "learning_rate": 1.647822297509502e-06, + "loss": 0.0342, + "step": 177920 + }, + { + "epoch": 0.15965, + "grad_norm": 0.0384073369204998, + "learning_rate": 1.6463467080838347e-06, + "loss": 0.0336, + "step": 177930 + }, + { + "epoch": 0.1597, + "grad_norm": 0.032967161387205124, + "learning_rate": 1.6448717571382665e-06, + "loss": 0.0328, + "step": 177940 + }, + { + "epoch": 0.15975, + "grad_norm": 0.030712133273482323, + "learning_rate": 1.6433974447131378e-06, + "loss": 0.0351, + "step": 177950 + }, + { + "epoch": 0.1598, + "grad_norm": 0.03501797094941139, + "learning_rate": 1.6419237708487466e-06, + "loss": 0.0344, + "step": 177960 + }, + { + "epoch": 0.15985, + "grad_norm": 0.0321439765393734, + "learning_rate": 1.640450735585386e-06, + "loss": 0.0323, + "step": 177970 + }, + { + "epoch": 0.1599, + "grad_norm": 0.03721248731017113, + "learning_rate": 1.6389783389633207e-06, + "loss": 0.0327, + "step": 177980 + }, + { + "epoch": 0.15995, + "grad_norm": 0.030178798362612724, + "learning_rate": 1.6375065810228157e-06, + "loss": 0.0325, + "step": 177990 + }, + { + "epoch": 0.16, + "grad_norm": 0.03577028214931488, + "learning_rate": 1.6360354618041058e-06, + "loss": 0.0333, + "step": 178000 + }, + { + "epoch": 0.16005, + "grad_norm": 0.02578200027346611, + "learning_rate": 1.6345649813474028e-06, + "loss": 0.0327, + "step": 178010 + }, + { + "epoch": 0.1601, + "grad_norm": 0.03544124960899353, + "learning_rate": 1.6330951396929195e-06, + "loss": 0.0344, + "step": 178020 + }, + { + "epoch": 0.16015, + "grad_norm": 0.03315258026123047, + "learning_rate": 1.6316259368808345e-06, + "loss": 0.0341, + "step": 178030 + }, + { + "epoch": 0.1602, + "grad_norm": 0.03840891271829605, + "learning_rate": 1.6301573729513241e-06, + "loss": 0.0328, + "step": 178040 + }, + { + "epoch": 0.16025, + "grad_norm": 0.03329712152481079, + "learning_rate": 1.628689447944523e-06, + "loss": 0.0316, + "step": 178050 + }, + { + "epoch": 0.1603, + "grad_norm": 0.03092890791594982, + "learning_rate": 1.6272221619005712e-06, + "loss": 0.0312, + "step": 178060 + }, + { + "epoch": 0.16035, + "grad_norm": 0.030587226152420044, + "learning_rate": 1.6257555148595893e-06, + "loss": 0.0322, + "step": 178070 + }, + { + "epoch": 0.1604, + "grad_norm": 0.03487320616841316, + "learning_rate": 1.6242895068616704e-06, + "loss": 0.032, + "step": 178080 + }, + { + "epoch": 0.16045, + "grad_norm": 0.032729100435972214, + "learning_rate": 1.6228241379468962e-06, + "loss": 0.0337, + "step": 178090 + }, + { + "epoch": 0.1605, + "grad_norm": 0.03575211390852928, + "learning_rate": 1.6213594081553236e-06, + "loss": 0.0324, + "step": 178100 + }, + { + "epoch": 0.16055, + "grad_norm": 0.031156299635767937, + "learning_rate": 1.619895317527001e-06, + "loss": 0.0324, + "step": 178110 + }, + { + "epoch": 0.1606, + "grad_norm": 0.033754050731658936, + "learning_rate": 1.618431866101963e-06, + "loss": 0.0333, + "step": 178120 + }, + { + "epoch": 0.16065, + "grad_norm": 0.03337110951542854, + "learning_rate": 1.6169690539202088e-06, + "loss": 0.0323, + "step": 178130 + }, + { + "epoch": 0.1607, + "grad_norm": 0.03375527262687683, + "learning_rate": 1.615506881021736e-06, + "loss": 0.0319, + "step": 178140 + }, + { + "epoch": 0.16075, + "grad_norm": 0.03295344114303589, + "learning_rate": 1.614045347446519e-06, + "loss": 0.0321, + "step": 178150 + }, + { + "epoch": 0.1608, + "grad_norm": 0.032287366688251495, + "learning_rate": 1.6125844532345225e-06, + "loss": 0.0317, + "step": 178160 + }, + { + "epoch": 0.16085, + "grad_norm": 0.0348057895898819, + "learning_rate": 1.6111241984256758e-06, + "loss": 0.034, + "step": 178170 + }, + { + "epoch": 0.1609, + "grad_norm": 0.030505768954753876, + "learning_rate": 1.6096645830599055e-06, + "loss": 0.0323, + "step": 178180 + }, + { + "epoch": 0.16095, + "grad_norm": 0.03195841610431671, + "learning_rate": 1.608205607177124e-06, + "loss": 0.032, + "step": 178190 + }, + { + "epoch": 0.161, + "grad_norm": 0.038009725511074066, + "learning_rate": 1.6067472708172104e-06, + "loss": 0.0319, + "step": 178200 + }, + { + "epoch": 0.16105, + "grad_norm": 0.04068221524357796, + "learning_rate": 1.6052895740200385e-06, + "loss": 0.0328, + "step": 178210 + }, + { + "epoch": 0.1611, + "grad_norm": 0.03707307204604149, + "learning_rate": 1.603832516825457e-06, + "loss": 0.0332, + "step": 178220 + }, + { + "epoch": 0.16115, + "grad_norm": 0.03900527209043503, + "learning_rate": 1.6023760992733089e-06, + "loss": 0.0329, + "step": 178230 + }, + { + "epoch": 0.1612, + "grad_norm": 0.03804875165224075, + "learning_rate": 1.600920321403404e-06, + "loss": 0.0336, + "step": 178240 + }, + { + "epoch": 0.16125, + "grad_norm": 0.028278328478336334, + "learning_rate": 1.5994651832555523e-06, + "loss": 0.0332, + "step": 178250 + }, + { + "epoch": 0.1613, + "grad_norm": 0.03762136399745941, + "learning_rate": 1.5980106848695303e-06, + "loss": 0.032, + "step": 178260 + }, + { + "epoch": 0.16135, + "grad_norm": 0.033716052770614624, + "learning_rate": 1.5965568262851005e-06, + "loss": 0.0332, + "step": 178270 + }, + { + "epoch": 0.1614, + "grad_norm": 0.03807881101965904, + "learning_rate": 1.5951036075420173e-06, + "loss": 0.0332, + "step": 178280 + }, + { + "epoch": 0.16145, + "grad_norm": 0.03636297583580017, + "learning_rate": 1.59365102868001e-06, + "loss": 0.0312, + "step": 178290 + }, + { + "epoch": 0.1615, + "grad_norm": 0.03612672910094261, + "learning_rate": 1.5921990897387857e-06, + "loss": 0.0324, + "step": 178300 + }, + { + "epoch": 0.16155, + "grad_norm": 0.036949120461940765, + "learning_rate": 1.5907477907580488e-06, + "loss": 0.0314, + "step": 178310 + }, + { + "epoch": 0.1616, + "grad_norm": 0.03454224765300751, + "learning_rate": 1.5892971317774702e-06, + "loss": 0.0327, + "step": 178320 + }, + { + "epoch": 0.16165, + "grad_norm": 0.03373921290040016, + "learning_rate": 1.5878471128367183e-06, + "loss": 0.0319, + "step": 178330 + }, + { + "epoch": 0.1617, + "grad_norm": 0.028942229226231575, + "learning_rate": 1.5863977339754255e-06, + "loss": 0.0324, + "step": 178340 + }, + { + "epoch": 0.16175, + "grad_norm": 0.03300546854734421, + "learning_rate": 1.5849489952332263e-06, + "loss": 0.0328, + "step": 178350 + }, + { + "epoch": 0.1618, + "grad_norm": 0.03184133768081665, + "learning_rate": 1.58350089664972e-06, + "loss": 0.0334, + "step": 178360 + }, + { + "epoch": 0.16185, + "grad_norm": 0.033677779138088226, + "learning_rate": 1.5820534382645052e-06, + "loss": 0.0333, + "step": 178370 + }, + { + "epoch": 0.1619, + "grad_norm": 0.030481450259685516, + "learning_rate": 1.5806066201171532e-06, + "loss": 0.0336, + "step": 178380 + }, + { + "epoch": 0.16195, + "grad_norm": 0.03218537196516991, + "learning_rate": 1.5791604422472128e-06, + "loss": 0.0331, + "step": 178390 + }, + { + "epoch": 0.162, + "grad_norm": 0.028280923143029213, + "learning_rate": 1.5777149046942301e-06, + "loss": 0.0327, + "step": 178400 + }, + { + "epoch": 0.16205, + "grad_norm": 0.033612944185733795, + "learning_rate": 1.5762700074977238e-06, + "loss": 0.0339, + "step": 178410 + }, + { + "epoch": 0.1621, + "grad_norm": 0.03201251104474068, + "learning_rate": 1.5748257506971953e-06, + "loss": 0.0327, + "step": 178420 + }, + { + "epoch": 0.16215, + "grad_norm": 0.037322528660297394, + "learning_rate": 1.573382134332127e-06, + "loss": 0.033, + "step": 178430 + }, + { + "epoch": 0.1622, + "grad_norm": 0.036024387925863266, + "learning_rate": 1.5719391584419902e-06, + "loss": 0.0339, + "step": 178440 + }, + { + "epoch": 0.16225, + "grad_norm": 0.029813559725880623, + "learning_rate": 1.5704968230662393e-06, + "loss": 0.0326, + "step": 178450 + }, + { + "epoch": 0.1623, + "grad_norm": 0.033420126885175705, + "learning_rate": 1.569055128244304e-06, + "loss": 0.0347, + "step": 178460 + }, + { + "epoch": 0.16235, + "grad_norm": 0.04035967215895653, + "learning_rate": 1.5676140740155971e-06, + "loss": 0.0339, + "step": 178470 + }, + { + "epoch": 0.1624, + "grad_norm": 0.03336472064256668, + "learning_rate": 1.5661736604195148e-06, + "loss": 0.0318, + "step": 178480 + }, + { + "epoch": 0.16245, + "grad_norm": 0.031709253787994385, + "learning_rate": 1.5647338874954453e-06, + "loss": 0.0324, + "step": 178490 + }, + { + "epoch": 0.1625, + "grad_norm": 0.028425998985767365, + "learning_rate": 1.5632947552827488e-06, + "loss": 0.0356, + "step": 178500 + }, + { + "epoch": 0.16255, + "grad_norm": 0.031041646376252174, + "learning_rate": 1.561856263820763e-06, + "loss": 0.0318, + "step": 178510 + }, + { + "epoch": 0.1626, + "grad_norm": 0.03370223194360733, + "learning_rate": 1.560418413148826e-06, + "loss": 0.0317, + "step": 178520 + }, + { + "epoch": 0.16265, + "grad_norm": 0.03401322662830353, + "learning_rate": 1.5589812033062396e-06, + "loss": 0.031, + "step": 178530 + }, + { + "epoch": 0.1627, + "grad_norm": 0.03218592703342438, + "learning_rate": 1.5575446343323086e-06, + "loss": 0.0325, + "step": 178540 + }, + { + "epoch": 0.16275, + "grad_norm": 0.03417234867811203, + "learning_rate": 1.5561087062662905e-06, + "loss": 0.0305, + "step": 178550 + }, + { + "epoch": 0.1628, + "grad_norm": 0.029766710475087166, + "learning_rate": 1.5546734191474565e-06, + "loss": 0.0343, + "step": 178560 + }, + { + "epoch": 0.16285, + "grad_norm": 0.03576730191707611, + "learning_rate": 1.553238773015045e-06, + "loss": 0.0331, + "step": 178570 + }, + { + "epoch": 0.1629, + "grad_norm": 0.03527417033910751, + "learning_rate": 1.551804767908277e-06, + "loss": 0.0332, + "step": 178580 + }, + { + "epoch": 0.16295, + "grad_norm": 0.030761733651161194, + "learning_rate": 1.5503714038663575e-06, + "loss": 0.0333, + "step": 178590 + }, + { + "epoch": 0.163, + "grad_norm": 0.03658977523446083, + "learning_rate": 1.5489386809284718e-06, + "loss": 0.0325, + "step": 178600 + }, + { + "epoch": 0.16305, + "grad_norm": 0.03047502413392067, + "learning_rate": 1.5475065991337966e-06, + "loss": 0.0321, + "step": 178610 + }, + { + "epoch": 0.1631, + "grad_norm": 0.03343275189399719, + "learning_rate": 1.5460751585214788e-06, + "loss": 0.0326, + "step": 178620 + }, + { + "epoch": 0.16315, + "grad_norm": 0.03201655298471451, + "learning_rate": 1.5446443591306509e-06, + "loss": 0.0339, + "step": 178630 + }, + { + "epoch": 0.1632, + "grad_norm": 0.03290139138698578, + "learning_rate": 1.543214201000437e-06, + "loss": 0.0325, + "step": 178640 + }, + { + "epoch": 0.16325, + "grad_norm": 0.03751504793763161, + "learning_rate": 1.5417846841699336e-06, + "loss": 0.0325, + "step": 178650 + }, + { + "epoch": 0.1633, + "grad_norm": 0.035507798194885254, + "learning_rate": 1.540355808678226e-06, + "loss": 0.032, + "step": 178660 + }, + { + "epoch": 0.16335, + "grad_norm": 0.032653480768203735, + "learning_rate": 1.5389275745643777e-06, + "loss": 0.0338, + "step": 178670 + }, + { + "epoch": 0.1634, + "grad_norm": 0.03228415176272392, + "learning_rate": 1.5374999818674324e-06, + "loss": 0.0338, + "step": 178680 + }, + { + "epoch": 0.16345, + "grad_norm": 0.033933766186237335, + "learning_rate": 1.536073030626428e-06, + "loss": 0.0321, + "step": 178690 + }, + { + "epoch": 0.1635, + "grad_norm": 0.03191066533327103, + "learning_rate": 1.53464672088037e-06, + "loss": 0.0328, + "step": 178700 + }, + { + "epoch": 0.16355, + "grad_norm": 0.038408685475587845, + "learning_rate": 1.5332210526682545e-06, + "loss": 0.0331, + "step": 178710 + }, + { + "epoch": 0.1636, + "grad_norm": 0.03488050028681755, + "learning_rate": 1.5317960260290559e-06, + "loss": 0.0334, + "step": 178720 + }, + { + "epoch": 0.16365, + "grad_norm": 0.02847726084291935, + "learning_rate": 1.5303716410017433e-06, + "loss": 0.0316, + "step": 178730 + }, + { + "epoch": 0.1637, + "grad_norm": 0.03325439989566803, + "learning_rate": 1.5289478976252491e-06, + "loss": 0.033, + "step": 178740 + }, + { + "epoch": 0.16375, + "grad_norm": 0.03413159027695656, + "learning_rate": 1.5275247959385037e-06, + "loss": 0.0327, + "step": 178750 + }, + { + "epoch": 0.1638, + "grad_norm": 0.03945057466626167, + "learning_rate": 1.5261023359804116e-06, + "loss": 0.037, + "step": 178760 + }, + { + "epoch": 0.16385, + "grad_norm": 0.030923176556825638, + "learning_rate": 1.5246805177898615e-06, + "loss": 0.033, + "step": 178770 + }, + { + "epoch": 0.1639, + "grad_norm": 0.03191499784588814, + "learning_rate": 1.5232593414057278e-06, + "loss": 0.0345, + "step": 178780 + }, + { + "epoch": 0.16395, + "grad_norm": 0.03918515890836716, + "learning_rate": 1.5218388068668655e-06, + "loss": 0.0322, + "step": 178790 + }, + { + "epoch": 0.164, + "grad_norm": 0.03057541884481907, + "learning_rate": 1.52041891421211e-06, + "loss": 0.0326, + "step": 178800 + }, + { + "epoch": 0.16405, + "grad_norm": 0.03401718661189079, + "learning_rate": 1.518999663480275e-06, + "loss": 0.0337, + "step": 178810 + }, + { + "epoch": 0.1641, + "grad_norm": 0.03624304383993149, + "learning_rate": 1.517581054710171e-06, + "loss": 0.0336, + "step": 178820 + }, + { + "epoch": 0.16415, + "grad_norm": 0.03786566108465195, + "learning_rate": 1.5161630879405835e-06, + "loss": 0.0331, + "step": 178830 + }, + { + "epoch": 0.1642, + "grad_norm": 0.03679925948381424, + "learning_rate": 1.5147457632102708e-06, + "loss": 0.0334, + "step": 178840 + }, + { + "epoch": 0.16425, + "grad_norm": 0.036222558468580246, + "learning_rate": 1.5133290805579847e-06, + "loss": 0.033, + "step": 178850 + }, + { + "epoch": 0.1643, + "grad_norm": 0.0338745042681694, + "learning_rate": 1.5119130400224584e-06, + "loss": 0.0347, + "step": 178860 + }, + { + "epoch": 0.16435, + "grad_norm": 0.03509068489074707, + "learning_rate": 1.5104976416424082e-06, + "loss": 0.0332, + "step": 178870 + }, + { + "epoch": 0.1644, + "grad_norm": 0.03290260210633278, + "learning_rate": 1.509082885456528e-06, + "loss": 0.0324, + "step": 178880 + }, + { + "epoch": 0.16445, + "grad_norm": 0.02996455878019333, + "learning_rate": 1.5076687715034926e-06, + "loss": 0.0334, + "step": 178890 + }, + { + "epoch": 0.1645, + "grad_norm": 0.03558652848005295, + "learning_rate": 1.506255299821968e-06, + "loss": 0.0356, + "step": 178900 + }, + { + "epoch": 0.16455, + "grad_norm": 0.0318191722035408, + "learning_rate": 1.5048424704506015e-06, + "loss": 0.0326, + "step": 178910 + }, + { + "epoch": 0.1646, + "grad_norm": 0.02919379062950611, + "learning_rate": 1.5034302834280146e-06, + "loss": 0.0322, + "step": 178920 + }, + { + "epoch": 0.16465, + "grad_norm": 0.03439747542142868, + "learning_rate": 1.5020187387928124e-06, + "loss": 0.0336, + "step": 178930 + }, + { + "epoch": 0.1647, + "grad_norm": 0.035345617681741714, + "learning_rate": 1.5006078365835895e-06, + "loss": 0.0328, + "step": 178940 + }, + { + "epoch": 0.16475, + "grad_norm": 0.033006761223077774, + "learning_rate": 1.4991975768389233e-06, + "loss": 0.0327, + "step": 178950 + }, + { + "epoch": 0.1648, + "grad_norm": 0.03655276447534561, + "learning_rate": 1.4977879595973659e-06, + "loss": 0.0335, + "step": 178960 + }, + { + "epoch": 0.16485, + "grad_norm": 0.030718335881829262, + "learning_rate": 1.4963789848974591e-06, + "loss": 0.0322, + "step": 178970 + }, + { + "epoch": 0.1649, + "grad_norm": 0.0313151478767395, + "learning_rate": 1.4949706527777136e-06, + "loss": 0.0331, + "step": 178980 + }, + { + "epoch": 0.16495, + "grad_norm": 0.029872264713048935, + "learning_rate": 1.4935629632766456e-06, + "loss": 0.0321, + "step": 178990 + }, + { + "epoch": 0.165, + "grad_norm": 0.03434133902192116, + "learning_rate": 1.4921559164327358e-06, + "loss": 0.034, + "step": 179000 + }, + { + "epoch": 0.16505, + "grad_norm": 0.02828471176326275, + "learning_rate": 1.4907495122844479e-06, + "loss": 0.0326, + "step": 179010 + }, + { + "epoch": 0.1651, + "grad_norm": 0.035396866500377655, + "learning_rate": 1.4893437508702395e-06, + "loss": 0.0362, + "step": 179020 + }, + { + "epoch": 0.16515, + "grad_norm": 0.030432626605033875, + "learning_rate": 1.4879386322285387e-06, + "loss": 0.0329, + "step": 179030 + }, + { + "epoch": 0.1652, + "grad_norm": 0.03829565644264221, + "learning_rate": 1.4865341563977674e-06, + "loss": 0.0331, + "step": 179040 + }, + { + "epoch": 0.16525, + "grad_norm": 0.032867953181266785, + "learning_rate": 1.4851303234163116e-06, + "loss": 0.0338, + "step": 179050 + }, + { + "epoch": 0.1653, + "grad_norm": 0.03387406840920448, + "learning_rate": 1.48372713332256e-06, + "loss": 0.035, + "step": 179060 + }, + { + "epoch": 0.16535, + "grad_norm": 0.029376495629549026, + "learning_rate": 1.4823245861548762e-06, + "loss": 0.0319, + "step": 179070 + }, + { + "epoch": 0.1654, + "grad_norm": 0.03476952761411667, + "learning_rate": 1.4809226819516048e-06, + "loss": 0.0322, + "step": 179080 + }, + { + "epoch": 0.16545, + "grad_norm": 0.035101015120744705, + "learning_rate": 1.4795214207510704e-06, + "loss": 0.0329, + "step": 179090 + }, + { + "epoch": 0.1655, + "grad_norm": 0.03613230586051941, + "learning_rate": 1.4781208025915788e-06, + "loss": 0.0318, + "step": 179100 + }, + { + "epoch": 0.16555, + "grad_norm": 0.0309486985206604, + "learning_rate": 1.4767208275114354e-06, + "loss": 0.033, + "step": 179110 + }, + { + "epoch": 0.1656, + "grad_norm": 0.029845857992768288, + "learning_rate": 1.4753214955489036e-06, + "loss": 0.0314, + "step": 179120 + }, + { + "epoch": 0.16565, + "grad_norm": 0.03276577591896057, + "learning_rate": 1.4739228067422422e-06, + "loss": 0.0318, + "step": 179130 + }, + { + "epoch": 0.1657, + "grad_norm": 0.031203771010041237, + "learning_rate": 1.4725247611296956e-06, + "loss": 0.0321, + "step": 179140 + }, + { + "epoch": 0.16575, + "grad_norm": 0.03170618787407875, + "learning_rate": 1.47112735874948e-06, + "loss": 0.0353, + "step": 179150 + }, + { + "epoch": 0.1658, + "grad_norm": 0.035838086158037186, + "learning_rate": 1.469730599639807e-06, + "loss": 0.0316, + "step": 179160 + }, + { + "epoch": 0.16585, + "grad_norm": 0.029705338180065155, + "learning_rate": 1.4683344838388595e-06, + "loss": 0.0322, + "step": 179170 + }, + { + "epoch": 0.1659, + "grad_norm": 0.03481472656130791, + "learning_rate": 1.4669390113848075e-06, + "loss": 0.0318, + "step": 179180 + }, + { + "epoch": 0.16595, + "grad_norm": 0.03514959290623665, + "learning_rate": 1.4655441823157978e-06, + "loss": 0.0327, + "step": 179190 + }, + { + "epoch": 0.166, + "grad_norm": 0.03326687961816788, + "learning_rate": 1.4641499966699723e-06, + "loss": 0.0329, + "step": 179200 + }, + { + "epoch": 0.16605, + "grad_norm": 0.03179854527115822, + "learning_rate": 1.4627564544854422e-06, + "loss": 0.031, + "step": 179210 + }, + { + "epoch": 0.1661, + "grad_norm": 0.029857762157917023, + "learning_rate": 1.4613635558003076e-06, + "loss": 0.0324, + "step": 179220 + }, + { + "epoch": 0.16615, + "grad_norm": 0.030423279851675034, + "learning_rate": 1.4599713006526517e-06, + "loss": 0.0338, + "step": 179230 + }, + { + "epoch": 0.1662, + "grad_norm": 0.03567027300596237, + "learning_rate": 1.4585796890805332e-06, + "loss": 0.0323, + "step": 179240 + }, + { + "epoch": 0.16625, + "grad_norm": 0.03131220489740372, + "learning_rate": 1.4571887211220075e-06, + "loss": 0.0356, + "step": 179250 + }, + { + "epoch": 0.1663, + "grad_norm": 0.02795344404876232, + "learning_rate": 1.4557983968150945e-06, + "loss": 0.0341, + "step": 179260 + }, + { + "epoch": 0.16635, + "grad_norm": 0.03564042970538139, + "learning_rate": 1.454408716197808e-06, + "loss": 0.0326, + "step": 179270 + }, + { + "epoch": 0.1664, + "grad_norm": 0.03069239854812622, + "learning_rate": 1.453019679308143e-06, + "loss": 0.033, + "step": 179280 + }, + { + "epoch": 0.16645, + "grad_norm": 0.03263924643397331, + "learning_rate": 1.4516312861840742e-06, + "loss": 0.0328, + "step": 179290 + }, + { + "epoch": 0.1665, + "grad_norm": 0.03687722980976105, + "learning_rate": 1.4502435368635603e-06, + "loss": 0.0341, + "step": 179300 + }, + { + "epoch": 0.16655, + "grad_norm": 0.03473667800426483, + "learning_rate": 1.4488564313845348e-06, + "loss": 0.033, + "step": 179310 + }, + { + "epoch": 0.1666, + "grad_norm": 0.03266061097383499, + "learning_rate": 1.4474699697849286e-06, + "loss": 0.0342, + "step": 179320 + }, + { + "epoch": 0.16665, + "grad_norm": 0.033977825194597244, + "learning_rate": 1.4460841521026504e-06, + "loss": 0.0329, + "step": 179330 + }, + { + "epoch": 0.1667, + "grad_norm": 0.03571394085884094, + "learning_rate": 1.4446989783755776e-06, + "loss": 0.0331, + "step": 179340 + }, + { + "epoch": 0.16675, + "grad_norm": 0.034471821039915085, + "learning_rate": 1.443314448641589e-06, + "loss": 0.032, + "step": 179350 + }, + { + "epoch": 0.1668, + "grad_norm": 0.03545355424284935, + "learning_rate": 1.4419305629385288e-06, + "loss": 0.032, + "step": 179360 + }, + { + "epoch": 0.16685, + "grad_norm": 0.033533014357089996, + "learning_rate": 1.440547321304242e-06, + "loss": 0.0331, + "step": 179370 + }, + { + "epoch": 0.1669, + "grad_norm": 0.03619558736681938, + "learning_rate": 1.4391647237765399e-06, + "loss": 0.0334, + "step": 179380 + }, + { + "epoch": 0.16695, + "grad_norm": 0.03412100672721863, + "learning_rate": 1.4377827703932172e-06, + "loss": 0.0325, + "step": 179390 + }, + { + "epoch": 0.167, + "grad_norm": 0.03337492421269417, + "learning_rate": 1.4364014611920662e-06, + "loss": 0.032, + "step": 179400 + }, + { + "epoch": 0.16705, + "grad_norm": 0.03705546259880066, + "learning_rate": 1.435020796210848e-06, + "loss": 0.0316, + "step": 179410 + }, + { + "epoch": 0.1671, + "grad_norm": 0.03405716270208359, + "learning_rate": 1.4336407754873077e-06, + "loss": 0.0324, + "step": 179420 + }, + { + "epoch": 0.16715, + "grad_norm": 0.03192787617444992, + "learning_rate": 1.4322613990591704e-06, + "loss": 0.0317, + "step": 179430 + }, + { + "epoch": 0.1672, + "grad_norm": 0.035241540521383286, + "learning_rate": 1.430882666964159e-06, + "loss": 0.0314, + "step": 179440 + }, + { + "epoch": 0.16725, + "grad_norm": 0.028237996622920036, + "learning_rate": 1.429504579239954e-06, + "loss": 0.0329, + "step": 179450 + }, + { + "epoch": 0.1673, + "grad_norm": 0.030574286356568336, + "learning_rate": 1.4281271359242455e-06, + "loss": 0.0319, + "step": 179460 + }, + { + "epoch": 0.16735, + "grad_norm": 0.030079707503318787, + "learning_rate": 1.4267503370546832e-06, + "loss": 0.0325, + "step": 179470 + }, + { + "epoch": 0.1674, + "grad_norm": 0.03491288796067238, + "learning_rate": 1.4253741826689094e-06, + "loss": 0.033, + "step": 179480 + }, + { + "epoch": 0.16745, + "grad_norm": 0.03250223770737648, + "learning_rate": 1.4239986728045495e-06, + "loss": 0.0331, + "step": 179490 + }, + { + "epoch": 0.1675, + "grad_norm": 0.031394198536872864, + "learning_rate": 1.4226238074992099e-06, + "loss": 0.0322, + "step": 179500 + }, + { + "epoch": 0.16755, + "grad_norm": 0.03774748370051384, + "learning_rate": 1.421249586790474e-06, + "loss": 0.0335, + "step": 179510 + }, + { + "epoch": 0.1676, + "grad_norm": 0.04410416632890701, + "learning_rate": 1.4198760107159203e-06, + "loss": 0.0331, + "step": 179520 + }, + { + "epoch": 0.16765, + "grad_norm": 0.034346163272857666, + "learning_rate": 1.4185030793130965e-06, + "loss": 0.0324, + "step": 179530 + }, + { + "epoch": 0.1677, + "grad_norm": 0.03225886449217796, + "learning_rate": 1.417130792619542e-06, + "loss": 0.0333, + "step": 179540 + }, + { + "epoch": 0.16775, + "grad_norm": 0.0325591005384922, + "learning_rate": 1.4157591506727685e-06, + "loss": 0.0319, + "step": 179550 + }, + { + "epoch": 0.1678, + "grad_norm": 0.03200984373688698, + "learning_rate": 1.4143881535102822e-06, + "loss": 0.0336, + "step": 179560 + }, + { + "epoch": 0.16785, + "grad_norm": 0.03078301064670086, + "learning_rate": 1.4130178011695584e-06, + "loss": 0.0329, + "step": 179570 + }, + { + "epoch": 0.1679, + "grad_norm": 0.03340752795338631, + "learning_rate": 1.4116480936880699e-06, + "loss": 0.0328, + "step": 179580 + }, + { + "epoch": 0.16795, + "grad_norm": 0.03492152690887451, + "learning_rate": 1.4102790311032621e-06, + "loss": 0.0343, + "step": 179590 + }, + { + "epoch": 0.168, + "grad_norm": 0.040707699954509735, + "learning_rate": 1.4089106134525604e-06, + "loss": 0.0335, + "step": 179600 + }, + { + "epoch": 0.16805, + "grad_norm": 0.03249933570623398, + "learning_rate": 1.407542840773382e-06, + "loss": 0.0328, + "step": 179610 + }, + { + "epoch": 0.1681, + "grad_norm": 0.03230295330286026, + "learning_rate": 1.4061757131031196e-06, + "loss": 0.0344, + "step": 179620 + }, + { + "epoch": 0.16815, + "grad_norm": 0.035333674401044846, + "learning_rate": 1.404809230479151e-06, + "loss": 0.0326, + "step": 179630 + }, + { + "epoch": 0.1682, + "grad_norm": 0.03570147976279259, + "learning_rate": 1.4034433929388274e-06, + "loss": 0.0339, + "step": 179640 + }, + { + "epoch": 0.16825, + "grad_norm": 0.03937431797385216, + "learning_rate": 1.4020782005194965e-06, + "loss": 0.0339, + "step": 179650 + }, + { + "epoch": 0.1683, + "grad_norm": 0.037766024470329285, + "learning_rate": 1.4007136532584897e-06, + "loss": 0.0325, + "step": 179660 + }, + { + "epoch": 0.16835, + "grad_norm": 0.034978706389665604, + "learning_rate": 1.399349751193102e-06, + "loss": 0.033, + "step": 179670 + }, + { + "epoch": 0.1684, + "grad_norm": 0.03416333720088005, + "learning_rate": 1.3979864943606259e-06, + "loss": 0.0329, + "step": 179680 + }, + { + "epoch": 0.16845, + "grad_norm": 0.032717134803533554, + "learning_rate": 1.3966238827983314e-06, + "loss": 0.0339, + "step": 179690 + }, + { + "epoch": 0.1685, + "grad_norm": 0.035884711891412735, + "learning_rate": 1.3952619165434721e-06, + "loss": 0.0322, + "step": 179700 + }, + { + "epoch": 0.16855, + "grad_norm": 0.031024346128106117, + "learning_rate": 1.3939005956332878e-06, + "loss": 0.0326, + "step": 179710 + }, + { + "epoch": 0.1686, + "grad_norm": 0.034197259694337845, + "learning_rate": 1.3925399201049876e-06, + "loss": 0.0331, + "step": 179720 + }, + { + "epoch": 0.16865, + "grad_norm": 0.035327374935150146, + "learning_rate": 1.3911798899957807e-06, + "loss": 0.0329, + "step": 179730 + }, + { + "epoch": 0.1687, + "grad_norm": 0.03364065662026405, + "learning_rate": 1.3898205053428425e-06, + "loss": 0.0329, + "step": 179740 + }, + { + "epoch": 0.16875, + "grad_norm": 0.03280908986926079, + "learning_rate": 1.3884617661833493e-06, + "loss": 0.0322, + "step": 179750 + }, + { + "epoch": 0.1688, + "grad_norm": 0.03385601565241814, + "learning_rate": 1.3871036725544352e-06, + "loss": 0.033, + "step": 179760 + }, + { + "epoch": 0.16885, + "grad_norm": 0.031862739473581314, + "learning_rate": 1.385746224493234e-06, + "loss": 0.0332, + "step": 179770 + }, + { + "epoch": 0.1689, + "grad_norm": 0.028963228687644005, + "learning_rate": 1.3843894220368637e-06, + "loss": 0.0325, + "step": 179780 + }, + { + "epoch": 0.16895, + "grad_norm": 0.035909876227378845, + "learning_rate": 1.3830332652224137e-06, + "loss": 0.0327, + "step": 179790 + }, + { + "epoch": 0.169, + "grad_norm": 0.03678746894001961, + "learning_rate": 1.381677754086963e-06, + "loss": 0.0362, + "step": 179800 + }, + { + "epoch": 0.16905, + "grad_norm": 0.03247828036546707, + "learning_rate": 1.380322888667565e-06, + "loss": 0.0327, + "step": 179810 + }, + { + "epoch": 0.1691, + "grad_norm": 0.03818798065185547, + "learning_rate": 1.3789686690012682e-06, + "loss": 0.033, + "step": 179820 + }, + { + "epoch": 0.16915, + "grad_norm": 0.03432374820113182, + "learning_rate": 1.3776150951250955e-06, + "loss": 0.0325, + "step": 179830 + }, + { + "epoch": 0.1692, + "grad_norm": 0.03608046472072601, + "learning_rate": 1.3762621670760478e-06, + "loss": 0.0323, + "step": 179840 + }, + { + "epoch": 0.16925, + "grad_norm": 0.03437166288495064, + "learning_rate": 1.3749098848911206e-06, + "loss": 0.0337, + "step": 179850 + }, + { + "epoch": 0.1693, + "grad_norm": 0.0373493991792202, + "learning_rate": 1.373558248607279e-06, + "loss": 0.0325, + "step": 179860 + }, + { + "epoch": 0.16935, + "grad_norm": 0.03960481658577919, + "learning_rate": 1.372207258261482e-06, + "loss": 0.0331, + "step": 179870 + }, + { + "epoch": 0.1694, + "grad_norm": 0.03167131170630455, + "learning_rate": 1.3708569138906612e-06, + "loss": 0.0332, + "step": 179880 + }, + { + "epoch": 0.16945, + "grad_norm": 0.03516445681452751, + "learning_rate": 1.3695072155317345e-06, + "loss": 0.0333, + "step": 179890 + }, + { + "epoch": 0.1695, + "grad_norm": 0.034307535737752914, + "learning_rate": 1.368158163221603e-06, + "loss": 0.0344, + "step": 179900 + }, + { + "epoch": 0.16955, + "grad_norm": 0.03313758969306946, + "learning_rate": 1.3668097569971505e-06, + "loss": 0.0325, + "step": 179910 + }, + { + "epoch": 0.1696, + "grad_norm": 0.035713810473680496, + "learning_rate": 1.3654619968952426e-06, + "loss": 0.0344, + "step": 179920 + }, + { + "epoch": 0.16965, + "grad_norm": 0.03464226424694061, + "learning_rate": 1.364114882952719e-06, + "loss": 0.0326, + "step": 179930 + }, + { + "epoch": 0.1697, + "grad_norm": 0.032023943960666656, + "learning_rate": 1.3627684152064196e-06, + "loss": 0.0332, + "step": 179940 + }, + { + "epoch": 0.16975, + "grad_norm": 0.030161544680595398, + "learning_rate": 1.3614225936931458e-06, + "loss": 0.0336, + "step": 179950 + }, + { + "epoch": 0.1698, + "grad_norm": 0.03181074932217598, + "learning_rate": 1.360077418449704e-06, + "loss": 0.032, + "step": 179960 + }, + { + "epoch": 0.16985, + "grad_norm": 0.03367370739579201, + "learning_rate": 1.3587328895128621e-06, + "loss": 0.0331, + "step": 179970 + }, + { + "epoch": 0.1699, + "grad_norm": 0.026204094290733337, + "learning_rate": 1.3573890069193796e-06, + "loss": 0.0319, + "step": 179980 + }, + { + "epoch": 0.16995, + "grad_norm": 0.029326176270842552, + "learning_rate": 1.356045770706002e-06, + "loss": 0.033, + "step": 179990 + }, + { + "epoch": 0.17, + "grad_norm": 0.03619501367211342, + "learning_rate": 1.3547031809094502e-06, + "loss": 0.0327, + "step": 180000 + }, + { + "epoch": 0.17005, + "grad_norm": 0.0314396396279335, + "learning_rate": 1.3533612375664333e-06, + "loss": 0.0333, + "step": 180010 + }, + { + "epoch": 0.1701, + "grad_norm": 0.03336181491613388, + "learning_rate": 1.3520199407136308e-06, + "loss": 0.0312, + "step": 180020 + }, + { + "epoch": 0.17015, + "grad_norm": 0.03133557736873627, + "learning_rate": 1.3506792903877186e-06, + "loss": 0.0326, + "step": 180030 + }, + { + "epoch": 0.1702, + "grad_norm": 0.03235197439789772, + "learning_rate": 1.3493392866253563e-06, + "loss": 0.0333, + "step": 180040 + }, + { + "epoch": 0.17025, + "grad_norm": 0.03352617099881172, + "learning_rate": 1.3479999294631673e-06, + "loss": 0.0332, + "step": 180050 + }, + { + "epoch": 0.1703, + "grad_norm": 0.03173385187983513, + "learning_rate": 1.346661218937778e-06, + "loss": 0.0311, + "step": 180060 + }, + { + "epoch": 0.17035, + "grad_norm": 0.03343489393591881, + "learning_rate": 1.3453231550857787e-06, + "loss": 0.0329, + "step": 180070 + }, + { + "epoch": 0.1704, + "grad_norm": 0.0289906058460474, + "learning_rate": 1.3439857379437647e-06, + "loss": 0.033, + "step": 180080 + }, + { + "epoch": 0.17045, + "grad_norm": 0.03115617111325264, + "learning_rate": 1.3426489675482907e-06, + "loss": 0.033, + "step": 180090 + }, + { + "epoch": 0.1705, + "grad_norm": 0.03450518101453781, + "learning_rate": 1.3413128439359046e-06, + "loss": 0.0331, + "step": 180100 + }, + { + "epoch": 0.17055, + "grad_norm": 0.035788483917713165, + "learning_rate": 1.3399773671431414e-06, + "loss": 0.032, + "step": 180110 + }, + { + "epoch": 0.1706, + "grad_norm": 0.035771444439888, + "learning_rate": 1.3386425372065081e-06, + "loss": 0.0336, + "step": 180120 + }, + { + "epoch": 0.17065, + "grad_norm": 0.03090623952448368, + "learning_rate": 1.3373083541624975e-06, + "loss": 0.0331, + "step": 180130 + }, + { + "epoch": 0.1707, + "grad_norm": 0.03492768108844757, + "learning_rate": 1.3359748180475835e-06, + "loss": 0.0337, + "step": 180140 + }, + { + "epoch": 0.17075, + "grad_norm": 0.03133808448910713, + "learning_rate": 1.3346419288982282e-06, + "loss": 0.0327, + "step": 180150 + }, + { + "epoch": 0.1708, + "grad_norm": 0.03177190199494362, + "learning_rate": 1.3333096867508748e-06, + "loss": 0.0333, + "step": 180160 + }, + { + "epoch": 0.17085, + "grad_norm": 0.034788765013217926, + "learning_rate": 1.3319780916419417e-06, + "loss": 0.0321, + "step": 180170 + }, + { + "epoch": 0.1709, + "grad_norm": 0.03276115655899048, + "learning_rate": 1.3306471436078383e-06, + "loss": 0.0339, + "step": 180180 + }, + { + "epoch": 0.17095, + "grad_norm": 0.0364110991358757, + "learning_rate": 1.3293168426849467e-06, + "loss": 0.0326, + "step": 180190 + }, + { + "epoch": 0.171, + "grad_norm": 0.030001336708664894, + "learning_rate": 1.3279871889096434e-06, + "loss": 0.0325, + "step": 180200 + }, + { + "epoch": 0.17105, + "grad_norm": 0.03211287036538124, + "learning_rate": 1.3266581823182771e-06, + "loss": 0.033, + "step": 180210 + }, + { + "epoch": 0.1711, + "grad_norm": 0.0314616933465004, + "learning_rate": 1.3253298229471772e-06, + "loss": 0.0323, + "step": 180220 + }, + { + "epoch": 0.17115, + "grad_norm": 0.03480513393878937, + "learning_rate": 1.32400211083267e-06, + "loss": 0.0336, + "step": 180230 + }, + { + "epoch": 0.1712, + "grad_norm": 0.03739199787378311, + "learning_rate": 1.3226750460110487e-06, + "loss": 0.0335, + "step": 180240 + }, + { + "epoch": 0.17125, + "grad_norm": 0.036488406360149384, + "learning_rate": 1.3213486285186012e-06, + "loss": 0.0342, + "step": 180250 + }, + { + "epoch": 0.1713, + "grad_norm": 0.03254292532801628, + "learning_rate": 1.3200228583915814e-06, + "loss": 0.0343, + "step": 180260 + }, + { + "epoch": 0.17135, + "grad_norm": 0.03373872861266136, + "learning_rate": 1.3186977356662383e-06, + "loss": 0.0333, + "step": 180270 + }, + { + "epoch": 0.1714, + "grad_norm": 0.028090765699744225, + "learning_rate": 1.317373260378807e-06, + "loss": 0.0319, + "step": 180280 + }, + { + "epoch": 0.17145, + "grad_norm": 0.032359834760427475, + "learning_rate": 1.3160494325654944e-06, + "loss": 0.0328, + "step": 180290 + }, + { + "epoch": 0.1715, + "grad_norm": 0.03507302701473236, + "learning_rate": 1.3147262522624936e-06, + "loss": 0.0317, + "step": 180300 + }, + { + "epoch": 0.17155, + "grad_norm": 0.03586224839091301, + "learning_rate": 1.3134037195059735e-06, + "loss": 0.0325, + "step": 180310 + }, + { + "epoch": 0.1716, + "grad_norm": 0.03436509892344475, + "learning_rate": 1.3120818343321018e-06, + "loss": 0.0334, + "step": 180320 + }, + { + "epoch": 0.17165, + "grad_norm": 0.030631018802523613, + "learning_rate": 1.3107605967770109e-06, + "loss": 0.0326, + "step": 180330 + }, + { + "epoch": 0.1717, + "grad_norm": 0.035087957978248596, + "learning_rate": 1.3094400068768248e-06, + "loss": 0.0322, + "step": 180340 + }, + { + "epoch": 0.17175, + "grad_norm": 0.03104361705482006, + "learning_rate": 1.3081200646676506e-06, + "loss": 0.0332, + "step": 180350 + }, + { + "epoch": 0.1718, + "grad_norm": 0.02846512943506241, + "learning_rate": 1.3068007701855705e-06, + "loss": 0.0324, + "step": 180360 + }, + { + "epoch": 0.17185, + "grad_norm": 0.033627621829509735, + "learning_rate": 1.3054821234666615e-06, + "loss": 0.0341, + "step": 180370 + }, + { + "epoch": 0.1719, + "grad_norm": 0.043194908648729324, + "learning_rate": 1.3041641245469665e-06, + "loss": 0.0323, + "step": 180380 + }, + { + "epoch": 0.17195, + "grad_norm": 0.04160348325967789, + "learning_rate": 1.3028467734625238e-06, + "loss": 0.0322, + "step": 180390 + }, + { + "epoch": 0.172, + "grad_norm": 0.03457237780094147, + "learning_rate": 1.301530070249346e-06, + "loss": 0.0338, + "step": 180400 + }, + { + "epoch": 0.17205, + "grad_norm": 0.0446646511554718, + "learning_rate": 1.3002140149434321e-06, + "loss": 0.0346, + "step": 180410 + }, + { + "epoch": 0.1721, + "grad_norm": 0.0351555310189724, + "learning_rate": 1.2988986075807674e-06, + "loss": 0.0346, + "step": 180420 + }, + { + "epoch": 0.17215, + "grad_norm": 0.03978744521737099, + "learning_rate": 1.2975838481973063e-06, + "loss": 0.0325, + "step": 180430 + }, + { + "epoch": 0.1722, + "grad_norm": 0.02988533116877079, + "learning_rate": 1.2962697368290006e-06, + "loss": 0.0347, + "step": 180440 + }, + { + "epoch": 0.17225, + "grad_norm": 0.031685106456279755, + "learning_rate": 1.2949562735117716e-06, + "loss": 0.0328, + "step": 180450 + }, + { + "epoch": 0.1723, + "grad_norm": 0.028254924342036247, + "learning_rate": 1.2936434582815377e-06, + "loss": 0.0322, + "step": 180460 + }, + { + "epoch": 0.17235, + "grad_norm": 0.02911999821662903, + "learning_rate": 1.292331291174184e-06, + "loss": 0.033, + "step": 180470 + }, + { + "epoch": 0.1724, + "grad_norm": 0.02961915358901024, + "learning_rate": 1.2910197722255824e-06, + "loss": 0.0326, + "step": 180480 + }, + { + "epoch": 0.17245, + "grad_norm": 0.03134278580546379, + "learning_rate": 1.289708901471598e-06, + "loss": 0.0327, + "step": 180490 + }, + { + "epoch": 0.1725, + "grad_norm": 0.033643536269664764, + "learning_rate": 1.2883986789480663e-06, + "loss": 0.0326, + "step": 180500 + }, + { + "epoch": 0.17255, + "grad_norm": 0.03249495476484299, + "learning_rate": 1.2870891046908028e-06, + "loss": 0.0327, + "step": 180510 + }, + { + "epoch": 0.1726, + "grad_norm": 0.03298050910234451, + "learning_rate": 1.2857801787356127e-06, + "loss": 0.0333, + "step": 180520 + }, + { + "epoch": 0.17265, + "grad_norm": 0.036723580211400986, + "learning_rate": 1.2844719011182837e-06, + "loss": 0.0337, + "step": 180530 + }, + { + "epoch": 0.1727, + "grad_norm": 0.03416711091995239, + "learning_rate": 1.28316427187459e-06, + "loss": 0.0316, + "step": 180540 + }, + { + "epoch": 0.17275, + "grad_norm": 0.03185368701815605, + "learning_rate": 1.2818572910402698e-06, + "loss": 0.0315, + "step": 180550 + }, + { + "epoch": 0.1728, + "grad_norm": 0.028836287558078766, + "learning_rate": 1.2805509586510639e-06, + "loss": 0.0324, + "step": 180560 + }, + { + "epoch": 0.17285, + "grad_norm": 0.0385720320045948, + "learning_rate": 1.2792452747426798e-06, + "loss": 0.0336, + "step": 180570 + }, + { + "epoch": 0.1729, + "grad_norm": 0.033134475350379944, + "learning_rate": 1.2779402393508195e-06, + "loss": 0.0344, + "step": 180580 + }, + { + "epoch": 0.17295, + "grad_norm": 0.03713807463645935, + "learning_rate": 1.2766358525111656e-06, + "loss": 0.0329, + "step": 180590 + }, + { + "epoch": 0.173, + "grad_norm": 0.032080747187137604, + "learning_rate": 1.2753321142593671e-06, + "loss": 0.0324, + "step": 180600 + }, + { + "epoch": 0.17305, + "grad_norm": 0.031095851212739944, + "learning_rate": 1.2740290246310821e-06, + "loss": 0.033, + "step": 180610 + }, + { + "epoch": 0.1731, + "grad_norm": 0.030296696349978447, + "learning_rate": 1.272726583661929e-06, + "loss": 0.0321, + "step": 180620 + }, + { + "epoch": 0.17315, + "grad_norm": 0.03122873604297638, + "learning_rate": 1.2714247913875183e-06, + "loss": 0.0332, + "step": 180630 + }, + { + "epoch": 0.1732, + "grad_norm": 0.03570917621254921, + "learning_rate": 1.2701236478434352e-06, + "loss": 0.0337, + "step": 180640 + }, + { + "epoch": 0.17325, + "grad_norm": 0.028863271698355675, + "learning_rate": 1.26882315306526e-06, + "loss": 0.0328, + "step": 180650 + }, + { + "epoch": 0.1733, + "grad_norm": 0.03672472760081291, + "learning_rate": 1.267523307088539e-06, + "loss": 0.0332, + "step": 180660 + }, + { + "epoch": 0.17335, + "grad_norm": 0.02857218310236931, + "learning_rate": 1.2662241099488215e-06, + "loss": 0.033, + "step": 180670 + }, + { + "epoch": 0.1734, + "grad_norm": 0.03267050161957741, + "learning_rate": 1.264925561681618e-06, + "loss": 0.0327, + "step": 180680 + }, + { + "epoch": 0.17345, + "grad_norm": 0.036747854202985764, + "learning_rate": 1.2636276623224308e-06, + "loss": 0.0346, + "step": 180690 + }, + { + "epoch": 0.1735, + "grad_norm": 0.03358514979481697, + "learning_rate": 1.2623304119067507e-06, + "loss": 0.0328, + "step": 180700 + }, + { + "epoch": 0.17355, + "grad_norm": 0.035015739500522614, + "learning_rate": 1.2610338104700359e-06, + "loss": 0.0342, + "step": 180710 + }, + { + "epoch": 0.1736, + "grad_norm": 0.03652586415410042, + "learning_rate": 1.2597378580477382e-06, + "loss": 0.0353, + "step": 180720 + }, + { + "epoch": 0.17365, + "grad_norm": 0.03134174644947052, + "learning_rate": 1.2584425546752903e-06, + "loss": 0.033, + "step": 180730 + }, + { + "epoch": 0.1737, + "grad_norm": 0.029100047424435616, + "learning_rate": 1.2571479003881004e-06, + "loss": 0.0342, + "step": 180740 + }, + { + "epoch": 0.17375, + "grad_norm": 0.029299162328243256, + "learning_rate": 1.2558538952215758e-06, + "loss": 0.0345, + "step": 180750 + }, + { + "epoch": 0.1738, + "grad_norm": 0.02790946140885353, + "learning_rate": 1.2545605392110776e-06, + "loss": 0.0348, + "step": 180760 + }, + { + "epoch": 0.17385, + "grad_norm": 0.035933081060647964, + "learning_rate": 1.2532678323919744e-06, + "loss": 0.0327, + "step": 180770 + }, + { + "epoch": 0.1739, + "grad_norm": 0.030573425814509392, + "learning_rate": 1.2519757747996074e-06, + "loss": 0.0323, + "step": 180780 + }, + { + "epoch": 0.17395, + "grad_norm": 0.031238127499818802, + "learning_rate": 1.2506843664693013e-06, + "loss": 0.0343, + "step": 180790 + }, + { + "epoch": 0.174, + "grad_norm": 0.03557777777314186, + "learning_rate": 1.2493936074363667e-06, + "loss": 0.0351, + "step": 180800 + }, + { + "epoch": 0.17405, + "grad_norm": 0.037748608738183975, + "learning_rate": 1.2481034977360806e-06, + "loss": 0.0338, + "step": 180810 + }, + { + "epoch": 0.1741, + "grad_norm": 0.03454018756747246, + "learning_rate": 1.2468140374037262e-06, + "loss": 0.0331, + "step": 180820 + }, + { + "epoch": 0.17415, + "grad_norm": 0.03299016132950783, + "learning_rate": 1.2455252264745532e-06, + "loss": 0.0331, + "step": 180830 + }, + { + "epoch": 0.1742, + "grad_norm": 0.031141648069024086, + "learning_rate": 1.244237064983797e-06, + "loss": 0.0338, + "step": 180840 + }, + { + "epoch": 0.17425, + "grad_norm": 0.03504917770624161, + "learning_rate": 1.2429495529666712e-06, + "loss": 0.0349, + "step": 180850 + }, + { + "epoch": 0.1743, + "grad_norm": 0.03259625658392906, + "learning_rate": 1.2416626904583783e-06, + "loss": 0.0326, + "step": 180860 + }, + { + "epoch": 0.17435, + "grad_norm": 0.0345325767993927, + "learning_rate": 1.240376477494104e-06, + "loss": 0.0317, + "step": 180870 + }, + { + "epoch": 0.1744, + "grad_norm": 0.029554620385169983, + "learning_rate": 1.2390909141090146e-06, + "loss": 0.0324, + "step": 180880 + }, + { + "epoch": 0.17445, + "grad_norm": 0.029579907655715942, + "learning_rate": 1.2378060003382486e-06, + "loss": 0.0328, + "step": 180890 + }, + { + "epoch": 0.1745, + "grad_norm": 0.0344732403755188, + "learning_rate": 1.236521736216939e-06, + "loss": 0.0332, + "step": 180900 + }, + { + "epoch": 0.17455, + "grad_norm": 0.029749970883131027, + "learning_rate": 1.2352381217802022e-06, + "loss": 0.0329, + "step": 180910 + }, + { + "epoch": 0.1746, + "grad_norm": 0.030175866559147835, + "learning_rate": 1.233955157063124e-06, + "loss": 0.0325, + "step": 180920 + }, + { + "epoch": 0.17465, + "grad_norm": 0.028932299464941025, + "learning_rate": 1.2326728421007821e-06, + "loss": 0.0323, + "step": 180930 + }, + { + "epoch": 0.1747, + "grad_norm": 0.032971736043691635, + "learning_rate": 1.231391176928237e-06, + "loss": 0.0321, + "step": 180940 + }, + { + "epoch": 0.17475, + "grad_norm": 0.03181711211800575, + "learning_rate": 1.2301101615805278e-06, + "loss": 0.0326, + "step": 180950 + }, + { + "epoch": 0.1748, + "grad_norm": 0.0331878699362278, + "learning_rate": 1.2288297960926814e-06, + "loss": 0.0316, + "step": 180960 + }, + { + "epoch": 0.17485, + "grad_norm": 0.03228962421417236, + "learning_rate": 1.2275500804996898e-06, + "loss": 0.0343, + "step": 180970 + }, + { + "epoch": 0.1749, + "grad_norm": 0.03149693086743355, + "learning_rate": 1.2262710148365498e-06, + "loss": 0.0328, + "step": 180980 + }, + { + "epoch": 0.17495, + "grad_norm": 0.030987145379185677, + "learning_rate": 1.2249925991382306e-06, + "loss": 0.0327, + "step": 180990 + }, + { + "epoch": 0.175, + "grad_norm": 0.0314842164516449, + "learning_rate": 1.2237148334396848e-06, + "loss": 0.0326, + "step": 181000 + }, + { + "epoch": 0.17505, + "grad_norm": 0.028250273317098618, + "learning_rate": 1.22243771777584e-06, + "loss": 0.0321, + "step": 181010 + }, + { + "epoch": 0.1751, + "grad_norm": 0.03365689516067505, + "learning_rate": 1.2211612521816156e-06, + "loss": 0.0326, + "step": 181020 + }, + { + "epoch": 0.17515, + "grad_norm": 0.03639967739582062, + "learning_rate": 1.2198854366919089e-06, + "loss": 0.033, + "step": 181030 + }, + { + "epoch": 0.1752, + "grad_norm": 0.03515635058283806, + "learning_rate": 1.2186102713416026e-06, + "loss": 0.0338, + "step": 181040 + }, + { + "epoch": 0.17525, + "grad_norm": 0.03461417555809021, + "learning_rate": 1.2173357561655525e-06, + "loss": 0.0314, + "step": 181050 + }, + { + "epoch": 0.1753, + "grad_norm": 0.03033815324306488, + "learning_rate": 1.2160618911986138e-06, + "loss": 0.0339, + "step": 181060 + }, + { + "epoch": 0.17535, + "grad_norm": 0.028576888144016266, + "learning_rate": 1.2147886764756033e-06, + "loss": 0.0317, + "step": 181070 + }, + { + "epoch": 0.1754, + "grad_norm": 0.035641852766275406, + "learning_rate": 1.2135161120313376e-06, + "loss": 0.0331, + "step": 181080 + }, + { + "epoch": 0.17545, + "grad_norm": 0.0301712267100811, + "learning_rate": 1.2122441979006056e-06, + "loss": 0.032, + "step": 181090 + }, + { + "epoch": 0.1755, + "grad_norm": 0.03008922189474106, + "learning_rate": 1.2109729341181763e-06, + "loss": 0.0328, + "step": 181100 + }, + { + "epoch": 0.17555, + "grad_norm": 0.03754622861742973, + "learning_rate": 1.2097023207188142e-06, + "loss": 0.0339, + "step": 181110 + }, + { + "epoch": 0.1756, + "grad_norm": 0.03148207813501358, + "learning_rate": 1.2084323577372519e-06, + "loss": 0.033, + "step": 181120 + }, + { + "epoch": 0.17565, + "grad_norm": 0.03381652757525444, + "learning_rate": 1.2071630452082123e-06, + "loss": 0.0318, + "step": 181130 + }, + { + "epoch": 0.1757, + "grad_norm": 0.03082115948200226, + "learning_rate": 1.2058943831663922e-06, + "loss": 0.033, + "step": 181140 + }, + { + "epoch": 0.17575, + "grad_norm": 0.029486792162060738, + "learning_rate": 1.2046263716464834e-06, + "loss": 0.0329, + "step": 181150 + }, + { + "epoch": 0.1758, + "grad_norm": 0.03202378749847412, + "learning_rate": 1.20335901068315e-06, + "loss": 0.0327, + "step": 181160 + }, + { + "epoch": 0.17585, + "grad_norm": 0.03240067511796951, + "learning_rate": 1.2020923003110418e-06, + "loss": 0.0326, + "step": 181170 + }, + { + "epoch": 0.1759, + "grad_norm": 0.03265637159347534, + "learning_rate": 1.2008262405647896e-06, + "loss": 0.0346, + "step": 181180 + }, + { + "epoch": 0.17595, + "grad_norm": 0.039439257234334946, + "learning_rate": 1.1995608314790046e-06, + "loss": 0.0329, + "step": 181190 + }, + { + "epoch": 0.176, + "grad_norm": 0.03045968897640705, + "learning_rate": 1.198296073088287e-06, + "loss": 0.0341, + "step": 181200 + }, + { + "epoch": 0.17605, + "grad_norm": 0.033822670578956604, + "learning_rate": 1.1970319654272144e-06, + "loss": 0.0343, + "step": 181210 + }, + { + "epoch": 0.1761, + "grad_norm": 0.03462981432676315, + "learning_rate": 1.1957685085303455e-06, + "loss": 0.0337, + "step": 181220 + }, + { + "epoch": 0.17615, + "grad_norm": 0.032089609652757645, + "learning_rate": 1.1945057024322192e-06, + "loss": 0.0352, + "step": 181230 + }, + { + "epoch": 0.1762, + "grad_norm": 0.03550237789750099, + "learning_rate": 1.1932435471673637e-06, + "loss": 0.0315, + "step": 181240 + }, + { + "epoch": 0.17625, + "grad_norm": 0.02741037867963314, + "learning_rate": 1.1919820427702927e-06, + "loss": 0.0338, + "step": 181250 + }, + { + "epoch": 0.1763, + "grad_norm": 0.03825649991631508, + "learning_rate": 1.1907211892754788e-06, + "loss": 0.0325, + "step": 181260 + }, + { + "epoch": 0.17635, + "grad_norm": 0.027359571307897568, + "learning_rate": 1.1894609867174112e-06, + "loss": 0.0331, + "step": 181270 + }, + { + "epoch": 0.1764, + "grad_norm": 0.030887959524989128, + "learning_rate": 1.1882014351305288e-06, + "loss": 0.033, + "step": 181280 + }, + { + "epoch": 0.17645, + "grad_norm": 0.03310007229447365, + "learning_rate": 1.1869425345492762e-06, + "loss": 0.0329, + "step": 181290 + }, + { + "epoch": 0.1765, + "grad_norm": 0.030865758657455444, + "learning_rate": 1.1856842850080707e-06, + "loss": 0.0344, + "step": 181300 + }, + { + "epoch": 0.17655, + "grad_norm": 0.033761125057935715, + "learning_rate": 1.1844266865413039e-06, + "loss": 0.0339, + "step": 181310 + }, + { + "epoch": 0.1766, + "grad_norm": 0.03483714908361435, + "learning_rate": 1.1831697391833708e-06, + "loss": 0.0328, + "step": 181320 + }, + { + "epoch": 0.17665, + "grad_norm": 0.033422935754060745, + "learning_rate": 1.1819134429686268e-06, + "loss": 0.0334, + "step": 181330 + }, + { + "epoch": 0.1767, + "grad_norm": 0.03466103971004486, + "learning_rate": 1.1806577979314225e-06, + "loss": 0.0331, + "step": 181340 + }, + { + "epoch": 0.17675, + "grad_norm": 0.03309163823723793, + "learning_rate": 1.1794028041060834e-06, + "loss": 0.0314, + "step": 181350 + }, + { + "epoch": 0.1768, + "grad_norm": 0.035105571150779724, + "learning_rate": 1.1781484615269207e-06, + "loss": 0.0325, + "step": 181360 + }, + { + "epoch": 0.17685, + "grad_norm": 0.02798105590045452, + "learning_rate": 1.1768947702282345e-06, + "loss": 0.0345, + "step": 181370 + }, + { + "epoch": 0.1769, + "grad_norm": 0.04057719185948372, + "learning_rate": 1.175641730244295e-06, + "loss": 0.0347, + "step": 181380 + }, + { + "epoch": 0.17695, + "grad_norm": 0.03164273500442505, + "learning_rate": 1.1743893416093582e-06, + "loss": 0.0311, + "step": 181390 + }, + { + "epoch": 0.177, + "grad_norm": 0.03630835935473442, + "learning_rate": 1.1731376043576659e-06, + "loss": 0.033, + "step": 181400 + }, + { + "epoch": 0.17705, + "grad_norm": 0.03291347995400429, + "learning_rate": 1.1718865185234407e-06, + "loss": 0.0326, + "step": 181410 + }, + { + "epoch": 0.1771, + "grad_norm": 0.037380918860435486, + "learning_rate": 1.1706360841408886e-06, + "loss": 0.0347, + "step": 181420 + }, + { + "epoch": 0.17715, + "grad_norm": 0.03159075975418091, + "learning_rate": 1.169386301244188e-06, + "loss": 0.033, + "step": 181430 + }, + { + "epoch": 0.1772, + "grad_norm": 0.032524630427360535, + "learning_rate": 1.1681371698675169e-06, + "loss": 0.0321, + "step": 181440 + }, + { + "epoch": 0.17725, + "grad_norm": 0.0323689766228199, + "learning_rate": 1.1668886900450205e-06, + "loss": 0.0347, + "step": 181450 + }, + { + "epoch": 0.1773, + "grad_norm": 0.034034889191389084, + "learning_rate": 1.1656408618108405e-06, + "loss": 0.033, + "step": 181460 + }, + { + "epoch": 0.17735, + "grad_norm": 0.03076021373271942, + "learning_rate": 1.1643936851990778e-06, + "loss": 0.0324, + "step": 181470 + }, + { + "epoch": 0.1774, + "grad_norm": 0.03190213441848755, + "learning_rate": 1.1631471602438355e-06, + "loss": 0.033, + "step": 181480 + }, + { + "epoch": 0.17745, + "grad_norm": 0.034344565123319626, + "learning_rate": 1.1619012869792e-06, + "loss": 0.0326, + "step": 181490 + }, + { + "epoch": 0.1775, + "grad_norm": 0.03356689587235451, + "learning_rate": 1.1606560654392278e-06, + "loss": 0.0349, + "step": 181500 + }, + { + "epoch": 0.17755, + "grad_norm": 0.035489995032548904, + "learning_rate": 1.159411495657961e-06, + "loss": 0.0355, + "step": 181510 + }, + { + "epoch": 0.1776, + "grad_norm": 0.03316465765237808, + "learning_rate": 1.158167577669428e-06, + "loss": 0.0344, + "step": 181520 + }, + { + "epoch": 0.17765, + "grad_norm": 0.03325631096959114, + "learning_rate": 1.1569243115076346e-06, + "loss": 0.0327, + "step": 181530 + }, + { + "epoch": 0.1777, + "grad_norm": 0.031159063801169395, + "learning_rate": 1.1556816972065759e-06, + "loss": 0.0332, + "step": 181540 + }, + { + "epoch": 0.17775, + "grad_norm": 0.03643181920051575, + "learning_rate": 1.1544397348002196e-06, + "loss": 0.034, + "step": 181550 + }, + { + "epoch": 0.1778, + "grad_norm": 0.0365658663213253, + "learning_rate": 1.1531984243225241e-06, + "loss": 0.0333, + "step": 181560 + }, + { + "epoch": 0.17785, + "grad_norm": 0.036051541566848755, + "learning_rate": 1.151957765807421e-06, + "loss": 0.0336, + "step": 181570 + }, + { + "epoch": 0.1779, + "grad_norm": 0.02893468365073204, + "learning_rate": 1.1507177592888384e-06, + "loss": 0.0331, + "step": 181580 + }, + { + "epoch": 0.17795, + "grad_norm": 0.03419188782572746, + "learning_rate": 1.1494784048006718e-06, + "loss": 0.0326, + "step": 181590 + }, + { + "epoch": 0.178, + "grad_norm": 0.03470698371529579, + "learning_rate": 1.1482397023768048e-06, + "loss": 0.0333, + "step": 181600 + }, + { + "epoch": 0.17805, + "grad_norm": 0.03663039579987526, + "learning_rate": 1.1470016520510996e-06, + "loss": 0.0316, + "step": 181610 + }, + { + "epoch": 0.1781, + "grad_norm": 0.031225159764289856, + "learning_rate": 1.145764253857412e-06, + "loss": 0.0334, + "step": 181620 + }, + { + "epoch": 0.17815, + "grad_norm": 0.03354804217815399, + "learning_rate": 1.1445275078295709e-06, + "loss": 0.0323, + "step": 181630 + }, + { + "epoch": 0.1782, + "grad_norm": 0.026633795350790024, + "learning_rate": 1.1432914140013795e-06, + "loss": 0.0319, + "step": 181640 + }, + { + "epoch": 0.17825, + "grad_norm": 0.030485892668366432, + "learning_rate": 1.1420559724066415e-06, + "loss": 0.0322, + "step": 181650 + }, + { + "epoch": 0.1783, + "grad_norm": 0.028709067031741142, + "learning_rate": 1.140821183079127e-06, + "loss": 0.0333, + "step": 181660 + }, + { + "epoch": 0.17835, + "grad_norm": 0.03147530183196068, + "learning_rate": 1.1395870460526008e-06, + "loss": 0.0329, + "step": 181670 + }, + { + "epoch": 0.1784, + "grad_norm": 0.028239740058779716, + "learning_rate": 1.1383535613608026e-06, + "loss": 0.0361, + "step": 181680 + }, + { + "epoch": 0.17845, + "grad_norm": 0.03355034440755844, + "learning_rate": 1.1371207290374497e-06, + "loss": 0.0314, + "step": 181690 + }, + { + "epoch": 0.1785, + "grad_norm": 0.03325127437710762, + "learning_rate": 1.135888549116254e-06, + "loss": 0.033, + "step": 181700 + }, + { + "epoch": 0.17855, + "grad_norm": 0.0414295494556427, + "learning_rate": 1.1346570216309e-06, + "loss": 0.0338, + "step": 181710 + }, + { + "epoch": 0.1786, + "grad_norm": 0.03711015358567238, + "learning_rate": 1.1334261466150575e-06, + "loss": 0.0325, + "step": 181720 + }, + { + "epoch": 0.17865, + "grad_norm": 0.036904025822877884, + "learning_rate": 1.132195924102375e-06, + "loss": 0.0327, + "step": 181730 + }, + { + "epoch": 0.1787, + "grad_norm": 0.02809218503534794, + "learning_rate": 1.1309663541264893e-06, + "loss": 0.0314, + "step": 181740 + }, + { + "epoch": 0.17875, + "grad_norm": 0.030258994549512863, + "learning_rate": 1.1297374367210205e-06, + "loss": 0.0324, + "step": 181750 + }, + { + "epoch": 0.1788, + "grad_norm": 0.033731479197740555, + "learning_rate": 1.1285091719195589e-06, + "loss": 0.0326, + "step": 181760 + }, + { + "epoch": 0.17885, + "grad_norm": 0.029932241886854172, + "learning_rate": 1.1272815597556914e-06, + "loss": 0.0323, + "step": 181770 + }, + { + "epoch": 0.1789, + "grad_norm": 0.030889419838786125, + "learning_rate": 1.126054600262974e-06, + "loss": 0.0332, + "step": 181780 + }, + { + "epoch": 0.17895, + "grad_norm": 0.03132836893200874, + "learning_rate": 1.1248282934749554e-06, + "loss": 0.0322, + "step": 181790 + }, + { + "epoch": 0.179, + "grad_norm": 0.03017721325159073, + "learning_rate": 1.1236026394251642e-06, + "loss": 0.0333, + "step": 181800 + }, + { + "epoch": 0.17905, + "grad_norm": 0.03249027952551842, + "learning_rate": 1.122377638147104e-06, + "loss": 0.0332, + "step": 181810 + }, + { + "epoch": 0.1791, + "grad_norm": 0.03272115811705589, + "learning_rate": 1.1211532896742704e-06, + "loss": 0.034, + "step": 181820 + }, + { + "epoch": 0.17915, + "grad_norm": 0.03992825374007225, + "learning_rate": 1.1199295940401367e-06, + "loss": 0.034, + "step": 181830 + }, + { + "epoch": 0.1792, + "grad_norm": 0.033313509076833725, + "learning_rate": 1.1187065512781564e-06, + "loss": 0.0338, + "step": 181840 + }, + { + "epoch": 0.17925, + "grad_norm": 0.03310203179717064, + "learning_rate": 1.117484161421764e-06, + "loss": 0.0328, + "step": 181850 + }, + { + "epoch": 0.1793, + "grad_norm": 0.03238173946738243, + "learning_rate": 1.1162624245043857e-06, + "loss": 0.0346, + "step": 181860 + }, + { + "epoch": 0.17935, + "grad_norm": 0.032731927931308746, + "learning_rate": 1.115041340559414e-06, + "loss": 0.033, + "step": 181870 + }, + { + "epoch": 0.1794, + "grad_norm": 0.03442100062966347, + "learning_rate": 1.1138209096202445e-06, + "loss": 0.0333, + "step": 181880 + }, + { + "epoch": 0.17945, + "grad_norm": 0.033178988844156265, + "learning_rate": 1.1126011317202367e-06, + "loss": 0.0345, + "step": 181890 + }, + { + "epoch": 0.1795, + "grad_norm": 0.03198530524969101, + "learning_rate": 1.1113820068927389e-06, + "loss": 0.0328, + "step": 181900 + }, + { + "epoch": 0.17955, + "grad_norm": 0.03042939305305481, + "learning_rate": 1.1101635351710826e-06, + "loss": 0.0327, + "step": 181910 + }, + { + "epoch": 0.1796, + "grad_norm": 0.03228777274489403, + "learning_rate": 1.108945716588583e-06, + "loss": 0.0342, + "step": 181920 + }, + { + "epoch": 0.17965, + "grad_norm": 0.029075900092720985, + "learning_rate": 1.1077285511785274e-06, + "loss": 0.0347, + "step": 181930 + }, + { + "epoch": 0.1797, + "grad_norm": 0.031394172459840775, + "learning_rate": 1.1065120389742e-06, + "loss": 0.0339, + "step": 181940 + }, + { + "epoch": 0.17975, + "grad_norm": 0.035124484449625015, + "learning_rate": 1.1052961800088552e-06, + "loss": 0.0315, + "step": 181950 + }, + { + "epoch": 0.1798, + "grad_norm": 0.033435892313718796, + "learning_rate": 1.1040809743157438e-06, + "loss": 0.0335, + "step": 181960 + }, + { + "epoch": 0.17985, + "grad_norm": 0.034077659249305725, + "learning_rate": 1.1028664219280727e-06, + "loss": 0.0317, + "step": 181970 + }, + { + "epoch": 0.1799, + "grad_norm": 0.034195058047771454, + "learning_rate": 1.1016525228790598e-06, + "loss": 0.0327, + "step": 181980 + }, + { + "epoch": 0.17995, + "grad_norm": 0.02897937037050724, + "learning_rate": 1.1004392772018841e-06, + "loss": 0.0315, + "step": 181990 + }, + { + "epoch": 0.18, + "grad_norm": 0.03581666946411133, + "learning_rate": 1.0992266849297246e-06, + "loss": 0.0315, + "step": 182000 + }, + { + "epoch": 0.18005, + "grad_norm": 0.02805798500776291, + "learning_rate": 1.0980147460957268e-06, + "loss": 0.0321, + "step": 182010 + }, + { + "epoch": 0.1801, + "grad_norm": 0.03290720656514168, + "learning_rate": 1.0968034607330258e-06, + "loss": 0.0322, + "step": 182020 + }, + { + "epoch": 0.18015, + "grad_norm": 0.03115527331829071, + "learning_rate": 1.0955928288747392e-06, + "loss": 0.0337, + "step": 182030 + }, + { + "epoch": 0.1802, + "grad_norm": 0.03652266785502434, + "learning_rate": 1.0943828505539656e-06, + "loss": 0.0327, + "step": 182040 + }, + { + "epoch": 0.18025, + "grad_norm": 0.03430403023958206, + "learning_rate": 1.093173525803781e-06, + "loss": 0.0329, + "step": 182050 + }, + { + "epoch": 0.1803, + "grad_norm": 0.032786864787340164, + "learning_rate": 1.0919648546572515e-06, + "loss": 0.0321, + "step": 182060 + }, + { + "epoch": 0.18035, + "grad_norm": 0.02922545000910759, + "learning_rate": 1.0907568371474191e-06, + "loss": 0.0318, + "step": 182070 + }, + { + "epoch": 0.1804, + "grad_norm": 0.029249820858240128, + "learning_rate": 1.0895494733073164e-06, + "loss": 0.0353, + "step": 182080 + }, + { + "epoch": 0.18045, + "grad_norm": 0.029231199994683266, + "learning_rate": 1.0883427631699472e-06, + "loss": 0.0321, + "step": 182090 + }, + { + "epoch": 0.1805, + "grad_norm": 0.029502786695957184, + "learning_rate": 1.0871367067683047e-06, + "loss": 0.0312, + "step": 182100 + }, + { + "epoch": 0.18055, + "grad_norm": 0.028941819444298744, + "learning_rate": 1.085931304135357e-06, + "loss": 0.033, + "step": 182110 + }, + { + "epoch": 0.1806, + "grad_norm": 0.03209478035569191, + "learning_rate": 1.0847265553040665e-06, + "loss": 0.0321, + "step": 182120 + }, + { + "epoch": 0.18065, + "grad_norm": 0.030377700924873352, + "learning_rate": 1.083522460307368e-06, + "loss": 0.0319, + "step": 182130 + }, + { + "epoch": 0.1807, + "grad_norm": 0.029901035130023956, + "learning_rate": 1.0823190191781768e-06, + "loss": 0.0318, + "step": 182140 + }, + { + "epoch": 0.18075, + "grad_norm": 0.02963991090655327, + "learning_rate": 1.0811162319494028e-06, + "loss": 0.0327, + "step": 182150 + }, + { + "epoch": 0.1808, + "grad_norm": 0.033136945217847824, + "learning_rate": 1.0799140986539197e-06, + "loss": 0.0326, + "step": 182160 + }, + { + "epoch": 0.18085, + "grad_norm": 0.030957898125052452, + "learning_rate": 1.0787126193246066e-06, + "loss": 0.0309, + "step": 182170 + }, + { + "epoch": 0.1809, + "grad_norm": 0.03250615671277046, + "learning_rate": 1.0775117939942957e-06, + "loss": 0.0337, + "step": 182180 + }, + { + "epoch": 0.18095, + "grad_norm": 0.032300982624292374, + "learning_rate": 1.0763116226958276e-06, + "loss": 0.0335, + "step": 182190 + }, + { + "epoch": 0.181, + "grad_norm": 0.03462289273738861, + "learning_rate": 1.0751121054620144e-06, + "loss": 0.0323, + "step": 182200 + }, + { + "epoch": 0.18105, + "grad_norm": 0.03301731124520302, + "learning_rate": 1.073913242325647e-06, + "loss": 0.0333, + "step": 182210 + }, + { + "epoch": 0.1811, + "grad_norm": 0.03348396345973015, + "learning_rate": 1.0727150333195046e-06, + "loss": 0.0326, + "step": 182220 + }, + { + "epoch": 0.18115, + "grad_norm": 0.028879957273602486, + "learning_rate": 1.0715174784763388e-06, + "loss": 0.031, + "step": 182230 + }, + { + "epoch": 0.1812, + "grad_norm": 0.03134356439113617, + "learning_rate": 1.070320577828901e-06, + "loss": 0.0326, + "step": 182240 + }, + { + "epoch": 0.18125, + "grad_norm": 0.03427322953939438, + "learning_rate": 1.069124331409907e-06, + "loss": 0.0324, + "step": 182250 + }, + { + "epoch": 0.1813, + "grad_norm": 0.0308916624635458, + "learning_rate": 1.0679287392520608e-06, + "loss": 0.0319, + "step": 182260 + }, + { + "epoch": 0.18135, + "grad_norm": 0.030325835570693016, + "learning_rate": 1.0667338013880563e-06, + "loss": 0.0324, + "step": 182270 + }, + { + "epoch": 0.1814, + "grad_norm": 0.03367728367447853, + "learning_rate": 1.0655395178505529e-06, + "loss": 0.0335, + "step": 182280 + }, + { + "epoch": 0.18145, + "grad_norm": 0.03004777617752552, + "learning_rate": 1.0643458886722108e-06, + "loss": 0.0318, + "step": 182290 + }, + { + "epoch": 0.1815, + "grad_norm": 0.031628742814064026, + "learning_rate": 1.0631529138856621e-06, + "loss": 0.0326, + "step": 182300 + }, + { + "epoch": 0.18155, + "grad_norm": 0.03036637231707573, + "learning_rate": 1.0619605935235145e-06, + "loss": 0.0319, + "step": 182310 + }, + { + "epoch": 0.1816, + "grad_norm": 0.034068137407302856, + "learning_rate": 1.0607689276183746e-06, + "loss": 0.0324, + "step": 182320 + }, + { + "epoch": 0.18165, + "grad_norm": 0.031157853081822395, + "learning_rate": 1.0595779162028196e-06, + "loss": 0.0321, + "step": 182330 + }, + { + "epoch": 0.1817, + "grad_norm": 0.03150942549109459, + "learning_rate": 1.0583875593094122e-06, + "loss": 0.0327, + "step": 182340 + }, + { + "epoch": 0.18175, + "grad_norm": 0.029292112216353416, + "learning_rate": 1.0571978569706876e-06, + "loss": 0.0319, + "step": 182350 + }, + { + "epoch": 0.1818, + "grad_norm": 0.03450557589530945, + "learning_rate": 1.0560088092191833e-06, + "loss": 0.0341, + "step": 182360 + }, + { + "epoch": 0.18185, + "grad_norm": 0.03337952122092247, + "learning_rate": 1.0548204160874015e-06, + "loss": 0.0329, + "step": 182370 + }, + { + "epoch": 0.1819, + "grad_norm": 0.028782257810235023, + "learning_rate": 1.0536326776078353e-06, + "loss": 0.0349, + "step": 182380 + }, + { + "epoch": 0.18195, + "grad_norm": 0.03170391544699669, + "learning_rate": 1.0524455938129534e-06, + "loss": 0.0332, + "step": 182390 + }, + { + "epoch": 0.182, + "grad_norm": 0.029747601598501205, + "learning_rate": 1.0512591647352133e-06, + "loss": 0.0338, + "step": 182400 + }, + { + "epoch": 0.18205, + "grad_norm": 0.0323491096496582, + "learning_rate": 1.0500733904070497e-06, + "loss": 0.0343, + "step": 182410 + }, + { + "epoch": 0.1821, + "grad_norm": 0.03273355960845947, + "learning_rate": 1.0488882708608843e-06, + "loss": 0.0337, + "step": 182420 + }, + { + "epoch": 0.18215, + "grad_norm": 0.0317191444337368, + "learning_rate": 1.0477038061291162e-06, + "loss": 0.0312, + "step": 182430 + }, + { + "epoch": 0.1822, + "grad_norm": 0.03376864641904831, + "learning_rate": 1.0465199962441246e-06, + "loss": 0.0352, + "step": 182440 + }, + { + "epoch": 0.18225, + "grad_norm": 0.03369827941060066, + "learning_rate": 1.0453368412382758e-06, + "loss": 0.0317, + "step": 182450 + }, + { + "epoch": 0.1823, + "grad_norm": 0.032915305346250534, + "learning_rate": 1.0441543411439242e-06, + "loss": 0.0323, + "step": 182460 + }, + { + "epoch": 0.18235, + "grad_norm": 0.031523942947387695, + "learning_rate": 1.0429724959933885e-06, + "loss": 0.0327, + "step": 182470 + }, + { + "epoch": 0.1824, + "grad_norm": 0.03458832949399948, + "learning_rate": 1.041791305818987e-06, + "loss": 0.0324, + "step": 182480 + }, + { + "epoch": 0.18245, + "grad_norm": 0.028763752430677414, + "learning_rate": 1.0406107706530056e-06, + "loss": 0.0343, + "step": 182490 + }, + { + "epoch": 0.1825, + "grad_norm": 0.03280990570783615, + "learning_rate": 1.0394308905277316e-06, + "loss": 0.0328, + "step": 182500 + }, + { + "epoch": 0.18255, + "grad_norm": 0.027918484061956406, + "learning_rate": 1.038251665475412e-06, + "loss": 0.0331, + "step": 182510 + }, + { + "epoch": 0.1826, + "grad_norm": 0.034678444266319275, + "learning_rate": 1.0370730955282876e-06, + "loss": 0.0345, + "step": 182520 + }, + { + "epoch": 0.18265, + "grad_norm": 0.02536662481725216, + "learning_rate": 1.035895180718588e-06, + "loss": 0.0335, + "step": 182530 + }, + { + "epoch": 0.1827, + "grad_norm": 0.031071780249476433, + "learning_rate": 1.034717921078507e-06, + "loss": 0.0329, + "step": 182540 + }, + { + "epoch": 0.18275, + "grad_norm": 0.03006831556558609, + "learning_rate": 1.033541316640238e-06, + "loss": 0.0349, + "step": 182550 + }, + { + "epoch": 0.1828, + "grad_norm": 0.03078848123550415, + "learning_rate": 1.0323653674359417e-06, + "loss": 0.0327, + "step": 182560 + }, + { + "epoch": 0.18285, + "grad_norm": 0.025866178795695305, + "learning_rate": 1.0311900734977702e-06, + "loss": 0.033, + "step": 182570 + }, + { + "epoch": 0.1829, + "grad_norm": 0.03372535854578018, + "learning_rate": 1.0300154348578616e-06, + "loss": 0.0345, + "step": 182580 + }, + { + "epoch": 0.18295, + "grad_norm": 0.03365950658917427, + "learning_rate": 1.0288414515483263e-06, + "loss": 0.0317, + "step": 182590 + }, + { + "epoch": 0.183, + "grad_norm": 0.03343362361192703, + "learning_rate": 1.0276681236012608e-06, + "loss": 0.0318, + "step": 182600 + }, + { + "epoch": 0.18305, + "grad_norm": 0.03235025331377983, + "learning_rate": 1.0264954510487395e-06, + "loss": 0.0321, + "step": 182610 + }, + { + "epoch": 0.1831, + "grad_norm": 0.0361965037882328, + "learning_rate": 1.0253234339228286e-06, + "loss": 0.0328, + "step": 182620 + }, + { + "epoch": 0.18315, + "grad_norm": 0.029026083648204803, + "learning_rate": 1.0241520722555685e-06, + "loss": 0.0323, + "step": 182630 + }, + { + "epoch": 0.1832, + "grad_norm": 0.02814497798681259, + "learning_rate": 1.0229813660789817e-06, + "loss": 0.032, + "step": 182640 + }, + { + "epoch": 0.18325, + "grad_norm": 0.031243901699781418, + "learning_rate": 1.021811315425078e-06, + "loss": 0.0329, + "step": 182650 + }, + { + "epoch": 0.1833, + "grad_norm": 0.032474394887685776, + "learning_rate": 1.0206419203258405e-06, + "loss": 0.0316, + "step": 182660 + }, + { + "epoch": 0.18335, + "grad_norm": 0.035039883106946945, + "learning_rate": 1.0194731808132518e-06, + "loss": 0.0353, + "step": 182670 + }, + { + "epoch": 0.1834, + "grad_norm": 0.028590889647603035, + "learning_rate": 1.0183050969192532e-06, + "loss": 0.0318, + "step": 182680 + }, + { + "epoch": 0.18345, + "grad_norm": 0.03609303757548332, + "learning_rate": 1.01713766867578e-06, + "loss": 0.0328, + "step": 182690 + }, + { + "epoch": 0.1835, + "grad_norm": 0.03724497929215431, + "learning_rate": 1.0159708961147596e-06, + "loss": 0.0317, + "step": 182700 + }, + { + "epoch": 0.18355, + "grad_norm": 0.03723419830203056, + "learning_rate": 1.0148047792680803e-06, + "loss": 0.0321, + "step": 182710 + }, + { + "epoch": 0.1836, + "grad_norm": 0.0317065566778183, + "learning_rate": 1.0136393181676306e-06, + "loss": 0.032, + "step": 182720 + }, + { + "epoch": 0.18365, + "grad_norm": 0.03472043573856354, + "learning_rate": 1.0124745128452685e-06, + "loss": 0.0315, + "step": 182730 + }, + { + "epoch": 0.1837, + "grad_norm": 0.031878240406513214, + "learning_rate": 1.011310363332843e-06, + "loss": 0.0325, + "step": 182740 + }, + { + "epoch": 0.18375, + "grad_norm": 0.033075589686632156, + "learning_rate": 1.0101468696621792e-06, + "loss": 0.0327, + "step": 182750 + }, + { + "epoch": 0.1838, + "grad_norm": 0.03156924620270729, + "learning_rate": 1.0089840318650845e-06, + "loss": 0.0338, + "step": 182760 + }, + { + "epoch": 0.18385, + "grad_norm": 0.03170720487833023, + "learning_rate": 1.0078218499733589e-06, + "loss": 0.0311, + "step": 182770 + }, + { + "epoch": 0.1839, + "grad_norm": 0.031086206436157227, + "learning_rate": 1.0066603240187655e-06, + "loss": 0.0324, + "step": 182780 + }, + { + "epoch": 0.18395, + "grad_norm": 0.030343493446707726, + "learning_rate": 1.005499454033068e-06, + "loss": 0.0332, + "step": 182790 + }, + { + "epoch": 0.184, + "grad_norm": 0.0303870290517807, + "learning_rate": 1.0043392400479996e-06, + "loss": 0.0336, + "step": 182800 + }, + { + "epoch": 0.18405, + "grad_norm": 0.029981570318341255, + "learning_rate": 1.0031796820952844e-06, + "loss": 0.0334, + "step": 182810 + }, + { + "epoch": 0.1841, + "grad_norm": 0.035858385264873505, + "learning_rate": 1.0020207802066166e-06, + "loss": 0.0319, + "step": 182820 + }, + { + "epoch": 0.18415, + "grad_norm": 0.030471090227365494, + "learning_rate": 1.0008625344136907e-06, + "loss": 0.0319, + "step": 182830 + }, + { + "epoch": 0.1842, + "grad_norm": 0.030062668025493622, + "learning_rate": 9.997049447481644e-07, + "loss": 0.0325, + "step": 182840 + }, + { + "epoch": 0.18425, + "grad_norm": 0.026811838150024414, + "learning_rate": 9.985480112416846e-07, + "loss": 0.0328, + "step": 182850 + }, + { + "epoch": 0.1843, + "grad_norm": 0.03334164619445801, + "learning_rate": 9.973917339258898e-07, + "loss": 0.032, + "step": 182860 + }, + { + "epoch": 0.18435, + "grad_norm": 0.03380081430077553, + "learning_rate": 9.962361128323854e-07, + "loss": 0.0326, + "step": 182870 + }, + { + "epoch": 0.1844, + "grad_norm": 0.030175121501088142, + "learning_rate": 9.950811479927712e-07, + "loss": 0.033, + "step": 182880 + }, + { + "epoch": 0.18445, + "grad_norm": 0.030168289318680763, + "learning_rate": 9.939268394386193e-07, + "loss": 0.0321, + "step": 182890 + }, + { + "epoch": 0.1845, + "grad_norm": 0.029061466455459595, + "learning_rate": 9.927731872014845e-07, + "loss": 0.031, + "step": 182900 + }, + { + "epoch": 0.18455, + "grad_norm": 0.030922871083021164, + "learning_rate": 9.916201913129169e-07, + "loss": 0.031, + "step": 182910 + }, + { + "epoch": 0.1846, + "grad_norm": 0.03239313140511513, + "learning_rate": 9.90467851804433e-07, + "loss": 0.0308, + "step": 182920 + }, + { + "epoch": 0.18465, + "grad_norm": 0.0330660305917263, + "learning_rate": 9.89316168707538e-07, + "loss": 0.0324, + "step": 182930 + }, + { + "epoch": 0.1847, + "grad_norm": 0.032215192914009094, + "learning_rate": 9.881651420537153e-07, + "loss": 0.0328, + "step": 182940 + }, + { + "epoch": 0.18475, + "grad_norm": 0.03356137499213219, + "learning_rate": 9.870147718744365e-07, + "loss": 0.0322, + "step": 182950 + }, + { + "epoch": 0.1848, + "grad_norm": 0.029361480847001076, + "learning_rate": 9.858650582011602e-07, + "loss": 0.0318, + "step": 182960 + }, + { + "epoch": 0.18485, + "grad_norm": 0.031547416001558304, + "learning_rate": 9.847160010653028e-07, + "loss": 0.0319, + "step": 182970 + }, + { + "epoch": 0.1849, + "grad_norm": 0.02946125902235508, + "learning_rate": 9.83567600498292e-07, + "loss": 0.0337, + "step": 182980 + }, + { + "epoch": 0.18495, + "grad_norm": 0.03201250731945038, + "learning_rate": 9.82419856531519e-07, + "loss": 0.0317, + "step": 182990 + }, + { + "epoch": 0.185, + "grad_norm": 0.03201431408524513, + "learning_rate": 9.812727691963647e-07, + "loss": 0.0327, + "step": 183000 + }, + { + "epoch": 0.18505, + "grad_norm": 0.03448358178138733, + "learning_rate": 9.801263385241927e-07, + "loss": 0.0318, + "step": 183010 + }, + { + "epoch": 0.1851, + "grad_norm": 0.03170866519212723, + "learning_rate": 9.789805645463363e-07, + "loss": 0.0331, + "step": 183020 + }, + { + "epoch": 0.18515, + "grad_norm": 0.03106853738427162, + "learning_rate": 9.778354472941315e-07, + "loss": 0.0329, + "step": 183030 + }, + { + "epoch": 0.1852, + "grad_norm": 0.02936870977282524, + "learning_rate": 9.76690986798881e-07, + "loss": 0.0324, + "step": 183040 + }, + { + "epoch": 0.18525, + "grad_norm": 0.02852589264512062, + "learning_rate": 9.755471830918738e-07, + "loss": 0.0324, + "step": 183050 + }, + { + "epoch": 0.1853, + "grad_norm": 0.032404638826847076, + "learning_rate": 9.744040362043765e-07, + "loss": 0.0334, + "step": 183060 + }, + { + "epoch": 0.18535, + "grad_norm": 0.026101185008883476, + "learning_rate": 9.732615461676531e-07, + "loss": 0.0332, + "step": 183070 + }, + { + "epoch": 0.1854, + "grad_norm": 0.03185758367180824, + "learning_rate": 9.721197130129255e-07, + "loss": 0.0315, + "step": 183080 + }, + { + "epoch": 0.18545, + "grad_norm": 0.031807247549295425, + "learning_rate": 9.709785367714246e-07, + "loss": 0.0325, + "step": 183090 + }, + { + "epoch": 0.1855, + "grad_norm": 0.03422261402010918, + "learning_rate": 9.69838017474342e-07, + "loss": 0.0329, + "step": 183100 + }, + { + "epoch": 0.18555, + "grad_norm": 0.03144890442490578, + "learning_rate": 9.686981551528584e-07, + "loss": 0.0326, + "step": 183110 + }, + { + "epoch": 0.1856, + "grad_norm": 0.03239867463707924, + "learning_rate": 9.675589498381405e-07, + "loss": 0.0338, + "step": 183120 + }, + { + "epoch": 0.18565, + "grad_norm": 0.03229037672281265, + "learning_rate": 9.664204015613327e-07, + "loss": 0.0327, + "step": 183130 + }, + { + "epoch": 0.1857, + "grad_norm": 0.033264774829149246, + "learning_rate": 9.652825103535572e-07, + "loss": 0.0327, + "step": 183140 + }, + { + "epoch": 0.18575, + "grad_norm": 0.03592655062675476, + "learning_rate": 9.64145276245934e-07, + "loss": 0.031, + "step": 183150 + }, + { + "epoch": 0.1858, + "grad_norm": 0.033310163766145706, + "learning_rate": 9.630086992695465e-07, + "loss": 0.0368, + "step": 183160 + }, + { + "epoch": 0.18585, + "grad_norm": 0.02804112248122692, + "learning_rate": 9.61872779455475e-07, + "loss": 0.0325, + "step": 183170 + }, + { + "epoch": 0.1859, + "grad_norm": 0.02908741869032383, + "learning_rate": 9.607375168347672e-07, + "loss": 0.0326, + "step": 183180 + }, + { + "epoch": 0.18595, + "grad_norm": 0.03357316181063652, + "learning_rate": 9.596029114384647e-07, + "loss": 0.0332, + "step": 183190 + }, + { + "epoch": 0.186, + "grad_norm": 0.03149103373289108, + "learning_rate": 9.584689632975874e-07, + "loss": 0.0332, + "step": 183200 + }, + { + "epoch": 0.18605, + "grad_norm": 0.033084675669670105, + "learning_rate": 9.573356724431381e-07, + "loss": 0.0318, + "step": 183210 + }, + { + "epoch": 0.1861, + "grad_norm": 0.03316180780529976, + "learning_rate": 9.562030389060977e-07, + "loss": 0.0325, + "step": 183220 + }, + { + "epoch": 0.18615, + "grad_norm": 0.02788355201482773, + "learning_rate": 9.550710627174304e-07, + "loss": 0.0328, + "step": 183230 + }, + { + "epoch": 0.1862, + "grad_norm": 0.028513340279459953, + "learning_rate": 9.539397439080917e-07, + "loss": 0.0332, + "step": 183240 + }, + { + "epoch": 0.18625, + "grad_norm": 0.029110228642821312, + "learning_rate": 9.528090825090069e-07, + "loss": 0.033, + "step": 183250 + }, + { + "epoch": 0.1863, + "grad_norm": 0.030120152980089188, + "learning_rate": 9.516790785510876e-07, + "loss": 0.033, + "step": 183260 + }, + { + "epoch": 0.18635, + "grad_norm": 0.027336476370692253, + "learning_rate": 9.505497320652229e-07, + "loss": 0.0317, + "step": 183270 + }, + { + "epoch": 0.1864, + "grad_norm": 0.02947922609746456, + "learning_rate": 9.494210430822937e-07, + "loss": 0.0319, + "step": 183280 + }, + { + "epoch": 0.18645, + "grad_norm": 0.02891182340681553, + "learning_rate": 9.48293011633164e-07, + "loss": 0.0328, + "step": 183290 + }, + { + "epoch": 0.1865, + "grad_norm": 0.03284899890422821, + "learning_rate": 9.471656377486649e-07, + "loss": 0.033, + "step": 183300 + }, + { + "epoch": 0.18655, + "grad_norm": 0.027189364656805992, + "learning_rate": 9.460389214596215e-07, + "loss": 0.032, + "step": 183310 + }, + { + "epoch": 0.1866, + "grad_norm": 0.03087041899561882, + "learning_rate": 9.449128627968345e-07, + "loss": 0.0323, + "step": 183320 + }, + { + "epoch": 0.18665, + "grad_norm": 0.03247639164328575, + "learning_rate": 9.43787461791093e-07, + "loss": 0.0322, + "step": 183330 + }, + { + "epoch": 0.1867, + "grad_norm": 0.03126617893576622, + "learning_rate": 9.426627184731696e-07, + "loss": 0.032, + "step": 183340 + }, + { + "epoch": 0.18675, + "grad_norm": 0.03199268504977226, + "learning_rate": 9.415386328738035e-07, + "loss": 0.0332, + "step": 183350 + }, + { + "epoch": 0.1868, + "grad_norm": 0.0347495973110199, + "learning_rate": 9.40415205023737e-07, + "loss": 0.0339, + "step": 183360 + }, + { + "epoch": 0.18685, + "grad_norm": 0.03300970792770386, + "learning_rate": 9.392924349536758e-07, + "loss": 0.033, + "step": 183370 + }, + { + "epoch": 0.1869, + "grad_norm": 0.03369023650884628, + "learning_rate": 9.381703226943289e-07, + "loss": 0.0328, + "step": 183380 + }, + { + "epoch": 0.18695, + "grad_norm": 0.031055910512804985, + "learning_rate": 9.370488682763579e-07, + "loss": 0.0321, + "step": 183390 + }, + { + "epoch": 0.187, + "grad_norm": 0.03379061818122864, + "learning_rate": 9.359280717304297e-07, + "loss": 0.035, + "step": 183400 + }, + { + "epoch": 0.18705, + "grad_norm": 0.030245963484048843, + "learning_rate": 9.348079330871923e-07, + "loss": 0.0325, + "step": 183410 + }, + { + "epoch": 0.1871, + "grad_norm": 0.03331426531076431, + "learning_rate": 9.336884523772654e-07, + "loss": 0.033, + "step": 183420 + }, + { + "epoch": 0.18715, + "grad_norm": 0.03207800164818764, + "learning_rate": 9.325696296312552e-07, + "loss": 0.034, + "step": 183430 + }, + { + "epoch": 0.1872, + "grad_norm": 0.03171722963452339, + "learning_rate": 9.314514648797457e-07, + "loss": 0.0328, + "step": 183440 + }, + { + "epoch": 0.18725, + "grad_norm": 0.030682936310768127, + "learning_rate": 9.303339581533122e-07, + "loss": 0.0324, + "step": 183450 + }, + { + "epoch": 0.1873, + "grad_norm": 0.033495258539915085, + "learning_rate": 9.292171094825053e-07, + "loss": 0.0335, + "step": 183460 + }, + { + "epoch": 0.18735, + "grad_norm": 0.03323593735694885, + "learning_rate": 9.281009188978618e-07, + "loss": 0.0345, + "step": 183470 + }, + { + "epoch": 0.1874, + "grad_norm": 0.03410422056913376, + "learning_rate": 9.269853864298961e-07, + "loss": 0.0336, + "step": 183480 + }, + { + "epoch": 0.18745, + "grad_norm": 0.031951263546943665, + "learning_rate": 9.258705121091032e-07, + "loss": 0.0335, + "step": 183490 + }, + { + "epoch": 0.1875, + "grad_norm": 0.0321340374648571, + "learning_rate": 9.247562959659673e-07, + "loss": 0.033, + "step": 183500 + }, + { + "epoch": 0.18755, + "grad_norm": 0.02933504246175289, + "learning_rate": 9.236427380309526e-07, + "loss": 0.0344, + "step": 183510 + }, + { + "epoch": 0.1876, + "grad_norm": 0.02984052337706089, + "learning_rate": 9.22529838334496e-07, + "loss": 0.0325, + "step": 183520 + }, + { + "epoch": 0.18765, + "grad_norm": 0.031184837222099304, + "learning_rate": 9.214175969070288e-07, + "loss": 0.0344, + "step": 183530 + }, + { + "epoch": 0.1877, + "grad_norm": 0.03433763235807419, + "learning_rate": 9.203060137789599e-07, + "loss": 0.0328, + "step": 183540 + }, + { + "epoch": 0.18775, + "grad_norm": 0.029375756159424782, + "learning_rate": 9.191950889806816e-07, + "loss": 0.0324, + "step": 183550 + }, + { + "epoch": 0.1878, + "grad_norm": 0.03221710026264191, + "learning_rate": 9.180848225425586e-07, + "loss": 0.0334, + "step": 183560 + }, + { + "epoch": 0.18785, + "grad_norm": 0.027692032977938652, + "learning_rate": 9.169752144949501e-07, + "loss": 0.0323, + "step": 183570 + }, + { + "epoch": 0.1879, + "grad_norm": 0.02899179421365261, + "learning_rate": 9.158662648681898e-07, + "loss": 0.0327, + "step": 183580 + }, + { + "epoch": 0.18795, + "grad_norm": 0.032324861735105515, + "learning_rate": 9.14757973692601e-07, + "loss": 0.0335, + "step": 183590 + }, + { + "epoch": 0.188, + "grad_norm": 0.029175706207752228, + "learning_rate": 9.136503409984815e-07, + "loss": 0.0329, + "step": 183600 + }, + { + "epoch": 0.18805, + "grad_norm": 0.03108326904475689, + "learning_rate": 9.125433668161071e-07, + "loss": 0.0317, + "step": 183610 + }, + { + "epoch": 0.1881, + "grad_norm": 0.029612571001052856, + "learning_rate": 9.114370511757536e-07, + "loss": 0.032, + "step": 183620 + }, + { + "epoch": 0.18815, + "grad_norm": 0.030926868319511414, + "learning_rate": 9.103313941076608e-07, + "loss": 0.0341, + "step": 183630 + }, + { + "epoch": 0.1882, + "grad_norm": 0.03476545587182045, + "learning_rate": 9.092263956420572e-07, + "loss": 0.0339, + "step": 183640 + }, + { + "epoch": 0.18825, + "grad_norm": 0.03209243714809418, + "learning_rate": 9.081220558091518e-07, + "loss": 0.0318, + "step": 183650 + }, + { + "epoch": 0.1883, + "grad_norm": 0.02719273418188095, + "learning_rate": 9.070183746391375e-07, + "loss": 0.0315, + "step": 183660 + }, + { + "epoch": 0.18835, + "grad_norm": 0.03333199396729469, + "learning_rate": 9.0591535216219e-07, + "loss": 0.0339, + "step": 183670 + }, + { + "epoch": 0.1884, + "grad_norm": 0.02819048799574375, + "learning_rate": 9.048129884084683e-07, + "loss": 0.033, + "step": 183680 + }, + { + "epoch": 0.18845, + "grad_norm": 0.03179158270359039, + "learning_rate": 9.037112834081068e-07, + "loss": 0.0335, + "step": 183690 + }, + { + "epoch": 0.1885, + "grad_norm": 0.029788820073008537, + "learning_rate": 9.026102371912232e-07, + "loss": 0.0322, + "step": 183700 + }, + { + "epoch": 0.18855, + "grad_norm": 0.02990236133337021, + "learning_rate": 9.015098497879265e-07, + "loss": 0.0331, + "step": 183710 + }, + { + "epoch": 0.1886, + "grad_norm": 0.031020531430840492, + "learning_rate": 9.004101212282956e-07, + "loss": 0.0349, + "step": 183720 + }, + { + "epoch": 0.18865, + "grad_norm": 0.030472010374069214, + "learning_rate": 8.993110515423953e-07, + "loss": 0.0346, + "step": 183730 + }, + { + "epoch": 0.1887, + "grad_norm": 0.04369658604264259, + "learning_rate": 8.982126407602792e-07, + "loss": 0.0384, + "step": 183740 + }, + { + "epoch": 0.18875, + "grad_norm": 0.038215216249227524, + "learning_rate": 8.971148889119734e-07, + "loss": 0.0313, + "step": 183750 + }, + { + "epoch": 0.1888, + "grad_norm": 0.029607990756630898, + "learning_rate": 8.960177960274957e-07, + "loss": 0.0333, + "step": 183760 + }, + { + "epoch": 0.18885, + "grad_norm": 0.03382629528641701, + "learning_rate": 8.94921362136833e-07, + "loss": 0.0323, + "step": 183770 + }, + { + "epoch": 0.1889, + "grad_norm": 0.03388132527470589, + "learning_rate": 8.938255872699613e-07, + "loss": 0.0315, + "step": 183780 + }, + { + "epoch": 0.18895, + "grad_norm": 0.03242596983909607, + "learning_rate": 8.927304714568458e-07, + "loss": 0.0331, + "step": 183790 + }, + { + "epoch": 0.189, + "grad_norm": 0.029478929936885834, + "learning_rate": 8.916360147274233e-07, + "loss": 0.0319, + "step": 183800 + }, + { + "epoch": 0.18905, + "grad_norm": 0.029674693942070007, + "learning_rate": 8.905422171116145e-07, + "loss": 0.031, + "step": 183810 + }, + { + "epoch": 0.1891, + "grad_norm": 0.029971517622470856, + "learning_rate": 8.894490786393206e-07, + "loss": 0.0313, + "step": 183820 + }, + { + "epoch": 0.18915, + "grad_norm": 0.03268228843808174, + "learning_rate": 8.883565993404341e-07, + "loss": 0.0324, + "step": 183830 + }, + { + "epoch": 0.1892, + "grad_norm": 0.03221019729971886, + "learning_rate": 8.872647792448203e-07, + "loss": 0.032, + "step": 183840 + }, + { + "epoch": 0.18925, + "grad_norm": 0.0357881523668766, + "learning_rate": 8.861736183823272e-07, + "loss": 0.0327, + "step": 183850 + }, + { + "epoch": 0.1893, + "grad_norm": 0.03263729438185692, + "learning_rate": 8.850831167827895e-07, + "loss": 0.0332, + "step": 183860 + }, + { + "epoch": 0.18935, + "grad_norm": 0.03233606740832329, + "learning_rate": 8.839932744760165e-07, + "loss": 0.0328, + "step": 183870 + }, + { + "epoch": 0.1894, + "grad_norm": 0.03519522398710251, + "learning_rate": 8.829040914918096e-07, + "loss": 0.0317, + "step": 183880 + }, + { + "epoch": 0.18945, + "grad_norm": 0.02882656268775463, + "learning_rate": 8.818155678599477e-07, + "loss": 0.0338, + "step": 183890 + }, + { + "epoch": 0.1895, + "grad_norm": 0.03132500872015953, + "learning_rate": 8.807277036101819e-07, + "loss": 0.0323, + "step": 183900 + }, + { + "epoch": 0.18955, + "grad_norm": 0.033128950744867325, + "learning_rate": 8.796404987722634e-07, + "loss": 0.0348, + "step": 183910 + }, + { + "epoch": 0.1896, + "grad_norm": 0.030885368585586548, + "learning_rate": 8.785539533759101e-07, + "loss": 0.0344, + "step": 183920 + }, + { + "epoch": 0.18965, + "grad_norm": 0.03386777639389038, + "learning_rate": 8.774680674508318e-07, + "loss": 0.0342, + "step": 183930 + }, + { + "epoch": 0.1897, + "grad_norm": 0.03264821693301201, + "learning_rate": 8.7638284102671e-07, + "loss": 0.0339, + "step": 183940 + }, + { + "epoch": 0.18975, + "grad_norm": 0.03148859739303589, + "learning_rate": 8.752982741332239e-07, + "loss": 0.0337, + "step": 183950 + }, + { + "epoch": 0.1898, + "grad_norm": 0.03271210193634033, + "learning_rate": 8.742143668000136e-07, + "loss": 0.0342, + "step": 183960 + }, + { + "epoch": 0.18985, + "grad_norm": 0.03562767803668976, + "learning_rate": 8.731311190567248e-07, + "loss": 0.0338, + "step": 183970 + }, + { + "epoch": 0.1899, + "grad_norm": 0.030357353389263153, + "learning_rate": 8.720485309329646e-07, + "loss": 0.0332, + "step": 183980 + }, + { + "epoch": 0.18995, + "grad_norm": 0.032088980078697205, + "learning_rate": 8.709666024583313e-07, + "loss": 0.0373, + "step": 183990 + }, + { + "epoch": 0.19, + "grad_norm": 0.03076958656311035, + "learning_rate": 8.698853336624097e-07, + "loss": 0.0333, + "step": 184000 + }, + { + "epoch": 0.19005, + "grad_norm": 0.03482131287455559, + "learning_rate": 8.688047245747566e-07, + "loss": 0.0355, + "step": 184010 + }, + { + "epoch": 0.1901, + "grad_norm": 0.03692816570401192, + "learning_rate": 8.677247752249151e-07, + "loss": 0.0338, + "step": 184020 + }, + { + "epoch": 0.19015, + "grad_norm": 0.029213665053248405, + "learning_rate": 8.666454856424116e-07, + "loss": 0.0331, + "step": 184030 + }, + { + "epoch": 0.1902, + "grad_norm": 0.035306889563798904, + "learning_rate": 8.655668558567559e-07, + "loss": 0.0332, + "step": 184040 + }, + { + "epoch": 0.19025, + "grad_norm": 0.029386315494775772, + "learning_rate": 8.644888858974381e-07, + "loss": 0.0332, + "step": 184050 + }, + { + "epoch": 0.1903, + "grad_norm": 0.031330134719610214, + "learning_rate": 8.634115757939209e-07, + "loss": 0.0344, + "step": 184060 + }, + { + "epoch": 0.19035, + "grad_norm": 0.029435602948069572, + "learning_rate": 8.623349255756697e-07, + "loss": 0.0333, + "step": 184070 + }, + { + "epoch": 0.1904, + "grad_norm": 0.02990766242146492, + "learning_rate": 8.612589352721079e-07, + "loss": 0.0318, + "step": 184080 + }, + { + "epoch": 0.19045, + "grad_norm": 0.03036910854279995, + "learning_rate": 8.601836049126622e-07, + "loss": 0.0319, + "step": 184090 + }, + { + "epoch": 0.1905, + "grad_norm": 0.02997252345085144, + "learning_rate": 8.591089345267284e-07, + "loss": 0.0343, + "step": 184100 + }, + { + "epoch": 0.19055, + "grad_norm": 0.03387441858649254, + "learning_rate": 8.58034924143683e-07, + "loss": 0.0327, + "step": 184110 + }, + { + "epoch": 0.1906, + "grad_norm": 0.03145996108651161, + "learning_rate": 8.569615737928944e-07, + "loss": 0.0321, + "step": 184120 + }, + { + "epoch": 0.19065, + "grad_norm": 0.03035629726946354, + "learning_rate": 8.558888835037082e-07, + "loss": 0.0314, + "step": 184130 + }, + { + "epoch": 0.1907, + "grad_norm": 0.0315399244427681, + "learning_rate": 8.548168533054513e-07, + "loss": 0.0318, + "step": 184140 + }, + { + "epoch": 0.19075, + "grad_norm": 0.03548549860715866, + "learning_rate": 8.53745483227425e-07, + "loss": 0.0314, + "step": 184150 + }, + { + "epoch": 0.1908, + "grad_norm": 0.02948744036257267, + "learning_rate": 8.526747732989254e-07, + "loss": 0.0328, + "step": 184160 + }, + { + "epoch": 0.19085, + "grad_norm": 0.030535975471138954, + "learning_rate": 8.516047235492292e-07, + "loss": 0.0318, + "step": 184170 + }, + { + "epoch": 0.1909, + "grad_norm": 0.03062056377530098, + "learning_rate": 8.505353340075906e-07, + "loss": 0.0312, + "step": 184180 + }, + { + "epoch": 0.19095, + "grad_norm": 0.028339235112071037, + "learning_rate": 8.49466604703239e-07, + "loss": 0.0332, + "step": 184190 + }, + { + "epoch": 0.191, + "grad_norm": 0.026632152497768402, + "learning_rate": 8.483985356653984e-07, + "loss": 0.0318, + "step": 184200 + }, + { + "epoch": 0.19105, + "grad_norm": 0.027862658724188805, + "learning_rate": 8.473311269232703e-07, + "loss": 0.0326, + "step": 184210 + }, + { + "epoch": 0.1911, + "grad_norm": 0.03462366759777069, + "learning_rate": 8.462643785060342e-07, + "loss": 0.0334, + "step": 184220 + }, + { + "epoch": 0.19115, + "grad_norm": 0.03214826062321663, + "learning_rate": 8.451982904428529e-07, + "loss": 0.035, + "step": 184230 + }, + { + "epoch": 0.1912, + "grad_norm": 0.030515849590301514, + "learning_rate": 8.441328627628808e-07, + "loss": 0.0318, + "step": 184240 + }, + { + "epoch": 0.19125, + "grad_norm": 0.033372119069099426, + "learning_rate": 8.430680954952364e-07, + "loss": 0.0323, + "step": 184250 + }, + { + "epoch": 0.1913, + "grad_norm": 0.03293319046497345, + "learning_rate": 8.420039886690434e-07, + "loss": 0.0334, + "step": 184260 + }, + { + "epoch": 0.19135, + "grad_norm": 0.03229885548353195, + "learning_rate": 8.409405423133759e-07, + "loss": 0.0319, + "step": 184270 + }, + { + "epoch": 0.1914, + "grad_norm": 0.029228324070572853, + "learning_rate": 8.398777564573246e-07, + "loss": 0.0319, + "step": 184280 + }, + { + "epoch": 0.19145, + "grad_norm": 0.03142399340867996, + "learning_rate": 8.388156311299328e-07, + "loss": 0.0332, + "step": 184290 + }, + { + "epoch": 0.1915, + "grad_norm": 0.0325978584587574, + "learning_rate": 8.377541663602495e-07, + "loss": 0.0334, + "step": 184300 + }, + { + "epoch": 0.19155, + "grad_norm": 0.03429727628827095, + "learning_rate": 8.366933621772905e-07, + "loss": 0.0318, + "step": 184310 + }, + { + "epoch": 0.1916, + "grad_norm": 0.03160303458571434, + "learning_rate": 8.356332186100519e-07, + "loss": 0.0326, + "step": 184320 + }, + { + "epoch": 0.19165, + "grad_norm": 0.028069067746400833, + "learning_rate": 8.345737356875272e-07, + "loss": 0.0338, + "step": 184330 + }, + { + "epoch": 0.1917, + "grad_norm": 0.03231353685259819, + "learning_rate": 8.335149134386794e-07, + "loss": 0.0335, + "step": 184340 + }, + { + "epoch": 0.19175, + "grad_norm": 0.029980802908539772, + "learning_rate": 8.324567518924492e-07, + "loss": 0.032, + "step": 184350 + }, + { + "epoch": 0.1918, + "grad_norm": 0.02924294024705887, + "learning_rate": 8.313992510777773e-07, + "loss": 0.0336, + "step": 184360 + }, + { + "epoch": 0.19185, + "grad_norm": 0.0314185693860054, + "learning_rate": 8.303424110235659e-07, + "loss": 0.0323, + "step": 184370 + }, + { + "epoch": 0.1919, + "grad_norm": 0.03167622163891792, + "learning_rate": 8.292862317587163e-07, + "loss": 0.0335, + "step": 184380 + }, + { + "epoch": 0.19195, + "grad_norm": 0.02776946686208248, + "learning_rate": 8.282307133121003e-07, + "loss": 0.0323, + "step": 184390 + }, + { + "epoch": 0.192, + "grad_norm": 0.028174638748168945, + "learning_rate": 8.271758557125752e-07, + "loss": 0.0328, + "step": 184400 + }, + { + "epoch": 0.19205, + "grad_norm": 0.02484823204576969, + "learning_rate": 8.261216589889792e-07, + "loss": 0.0319, + "step": 184410 + }, + { + "epoch": 0.1921, + "grad_norm": 0.0313238799571991, + "learning_rate": 8.25068123170139e-07, + "loss": 0.0316, + "step": 184420 + }, + { + "epoch": 0.19215, + "grad_norm": 0.0305357463657856, + "learning_rate": 8.240152482848513e-07, + "loss": 0.0302, + "step": 184430 + }, + { + "epoch": 0.1922, + "grad_norm": 0.031983766704797745, + "learning_rate": 8.229630343619038e-07, + "loss": 0.0329, + "step": 184440 + }, + { + "epoch": 0.19225, + "grad_norm": 0.028229843825101852, + "learning_rate": 8.219114814300655e-07, + "loss": 0.0309, + "step": 184450 + }, + { + "epoch": 0.1923, + "grad_norm": 0.029529288411140442, + "learning_rate": 8.208605895180826e-07, + "loss": 0.0304, + "step": 184460 + }, + { + "epoch": 0.19235, + "grad_norm": 0.03323786333203316, + "learning_rate": 8.198103586546934e-07, + "loss": 0.031, + "step": 184470 + }, + { + "epoch": 0.1924, + "grad_norm": 0.028421130031347275, + "learning_rate": 8.187607888685972e-07, + "loss": 0.0309, + "step": 184480 + }, + { + "epoch": 0.19245, + "grad_norm": 0.03185752406716347, + "learning_rate": 8.177118801884986e-07, + "loss": 0.0311, + "step": 184490 + }, + { + "epoch": 0.1925, + "grad_norm": 0.030270256102085114, + "learning_rate": 8.166636326430749e-07, + "loss": 0.0307, + "step": 184500 + }, + { + "epoch": 0.19255, + "grad_norm": 0.028725991025567055, + "learning_rate": 8.156160462609807e-07, + "loss": 0.0312, + "step": 184510 + }, + { + "epoch": 0.1926, + "grad_norm": 0.027295365929603577, + "learning_rate": 8.1456912107086e-07, + "loss": 0.0321, + "step": 184520 + }, + { + "epoch": 0.19265, + "grad_norm": 0.030689042061567307, + "learning_rate": 8.135228571013287e-07, + "loss": 0.0326, + "step": 184530 + }, + { + "epoch": 0.1927, + "grad_norm": 0.03691000118851662, + "learning_rate": 8.124772543809972e-07, + "loss": 0.0327, + "step": 184540 + }, + { + "epoch": 0.19275, + "grad_norm": 0.030952926725149155, + "learning_rate": 8.114323129384566e-07, + "loss": 0.0327, + "step": 184550 + }, + { + "epoch": 0.1928, + "grad_norm": 0.029755594208836555, + "learning_rate": 8.103880328022618e-07, + "loss": 0.0328, + "step": 184560 + }, + { + "epoch": 0.19285, + "grad_norm": 0.03075193241238594, + "learning_rate": 8.09344414000976e-07, + "loss": 0.0332, + "step": 184570 + }, + { + "epoch": 0.1929, + "grad_norm": 0.03310278430581093, + "learning_rate": 8.083014565631209e-07, + "loss": 0.0324, + "step": 184580 + }, + { + "epoch": 0.19295, + "grad_norm": 0.030689707025885582, + "learning_rate": 8.072591605172208e-07, + "loss": 0.033, + "step": 184590 + }, + { + "epoch": 0.193, + "grad_norm": 0.028939155861735344, + "learning_rate": 8.062175258917643e-07, + "loss": 0.0335, + "step": 184600 + }, + { + "epoch": 0.19305, + "grad_norm": 0.030596747994422913, + "learning_rate": 8.051765527152283e-07, + "loss": 0.032, + "step": 184610 + }, + { + "epoch": 0.1931, + "grad_norm": 0.027125678956508636, + "learning_rate": 8.041362410160819e-07, + "loss": 0.0336, + "step": 184620 + }, + { + "epoch": 0.19315, + "grad_norm": 0.029070226475596428, + "learning_rate": 8.030965908227578e-07, + "loss": 0.0329, + "step": 184630 + }, + { + "epoch": 0.1932, + "grad_norm": 0.032745059579610825, + "learning_rate": 8.020576021636834e-07, + "loss": 0.032, + "step": 184640 + }, + { + "epoch": 0.19325, + "grad_norm": 0.029334556311368942, + "learning_rate": 8.010192750672607e-07, + "loss": 0.0326, + "step": 184650 + }, + { + "epoch": 0.1933, + "grad_norm": 0.033059775829315186, + "learning_rate": 7.999816095618812e-07, + "loss": 0.0327, + "step": 184660 + }, + { + "epoch": 0.19335, + "grad_norm": 0.032330144196748734, + "learning_rate": 7.989446056759137e-07, + "loss": 0.0312, + "step": 184670 + }, + { + "epoch": 0.1934, + "grad_norm": 0.030641792342066765, + "learning_rate": 7.979082634377078e-07, + "loss": 0.0319, + "step": 184680 + }, + { + "epoch": 0.19345, + "grad_norm": 0.028173161670565605, + "learning_rate": 7.96872582875599e-07, + "loss": 0.0313, + "step": 184690 + }, + { + "epoch": 0.1935, + "grad_norm": 0.03235101327300072, + "learning_rate": 7.958375640178983e-07, + "loss": 0.0333, + "step": 184700 + }, + { + "epoch": 0.19355, + "grad_norm": 0.030581427738070488, + "learning_rate": 7.948032068929079e-07, + "loss": 0.0317, + "step": 184710 + }, + { + "epoch": 0.1936, + "grad_norm": 0.0303508210927248, + "learning_rate": 7.937695115289051e-07, + "loss": 0.032, + "step": 184720 + }, + { + "epoch": 0.19365, + "grad_norm": 0.032854851335287094, + "learning_rate": 7.927364779541479e-07, + "loss": 0.0314, + "step": 184730 + }, + { + "epoch": 0.1937, + "grad_norm": 0.029865741729736328, + "learning_rate": 7.917041061968833e-07, + "loss": 0.031, + "step": 184740 + }, + { + "epoch": 0.19375, + "grad_norm": 0.030726918950676918, + "learning_rate": 7.906723962853302e-07, + "loss": 0.032, + "step": 184750 + }, + { + "epoch": 0.1938, + "grad_norm": 0.02736191637814045, + "learning_rate": 7.896413482477049e-07, + "loss": 0.0326, + "step": 184760 + }, + { + "epoch": 0.19385, + "grad_norm": 0.029535965994000435, + "learning_rate": 7.886109621121851e-07, + "loss": 0.0316, + "step": 184770 + }, + { + "epoch": 0.1939, + "grad_norm": 0.028387896716594696, + "learning_rate": 7.87581237906948e-07, + "loss": 0.0318, + "step": 184780 + }, + { + "epoch": 0.19395, + "grad_norm": 0.03226856514811516, + "learning_rate": 7.865521756601407e-07, + "loss": 0.0323, + "step": 184790 + }, + { + "epoch": 0.194, + "grad_norm": 0.029670629650354385, + "learning_rate": 7.855237753999017e-07, + "loss": 0.0316, + "step": 184800 + }, + { + "epoch": 0.19405, + "grad_norm": 0.03363959863781929, + "learning_rate": 7.844960371543475e-07, + "loss": 0.0333, + "step": 184810 + }, + { + "epoch": 0.1941, + "grad_norm": 0.028326643630862236, + "learning_rate": 7.834689609515722e-07, + "loss": 0.0325, + "step": 184820 + }, + { + "epoch": 0.19415, + "grad_norm": 0.030526431277394295, + "learning_rate": 7.824425468196589e-07, + "loss": 0.0334, + "step": 184830 + }, + { + "epoch": 0.1942, + "grad_norm": 0.029290050268173218, + "learning_rate": 7.814167947866685e-07, + "loss": 0.0326, + "step": 184840 + }, + { + "epoch": 0.19425, + "grad_norm": 0.03072735294699669, + "learning_rate": 7.803917048806453e-07, + "loss": 0.0337, + "step": 184850 + }, + { + "epoch": 0.1943, + "grad_norm": 0.02674674801528454, + "learning_rate": 7.793672771296112e-07, + "loss": 0.0319, + "step": 184860 + }, + { + "epoch": 0.19435, + "grad_norm": 0.030022749677300453, + "learning_rate": 7.783435115615745e-07, + "loss": 0.0335, + "step": 184870 + }, + { + "epoch": 0.1944, + "grad_norm": 0.030002307146787643, + "learning_rate": 7.773204082045321e-07, + "loss": 0.032, + "step": 184880 + }, + { + "epoch": 0.19445, + "grad_norm": 0.02685355767607689, + "learning_rate": 7.762979670864479e-07, + "loss": 0.0323, + "step": 184890 + }, + { + "epoch": 0.1945, + "grad_norm": 0.03253389894962311, + "learning_rate": 7.752761882352771e-07, + "loss": 0.032, + "step": 184900 + }, + { + "epoch": 0.19455, + "grad_norm": 0.03180722892284393, + "learning_rate": 7.742550716789531e-07, + "loss": 0.0334, + "step": 184910 + }, + { + "epoch": 0.1946, + "grad_norm": 0.029725681990385056, + "learning_rate": 7.732346174453953e-07, + "loss": 0.0314, + "step": 184920 + }, + { + "epoch": 0.19465, + "grad_norm": 0.027232058346271515, + "learning_rate": 7.722148255625006e-07, + "loss": 0.0321, + "step": 184930 + }, + { + "epoch": 0.1947, + "grad_norm": 0.03249523788690567, + "learning_rate": 7.711956960581495e-07, + "loss": 0.035, + "step": 184940 + }, + { + "epoch": 0.19475, + "grad_norm": 0.03233838453888893, + "learning_rate": 7.701772289602089e-07, + "loss": 0.0326, + "step": 184950 + }, + { + "epoch": 0.1948, + "grad_norm": 0.028998851776123047, + "learning_rate": 7.691594242965172e-07, + "loss": 0.0326, + "step": 184960 + }, + { + "epoch": 0.19485, + "grad_norm": 0.031819239258766174, + "learning_rate": 7.68142282094908e-07, + "loss": 0.033, + "step": 184970 + }, + { + "epoch": 0.1949, + "grad_norm": 0.028010355308651924, + "learning_rate": 7.671258023831812e-07, + "loss": 0.0323, + "step": 184980 + }, + { + "epoch": 0.19495, + "grad_norm": 0.032065752893686295, + "learning_rate": 7.661099851891312e-07, + "loss": 0.0314, + "step": 184990 + }, + { + "epoch": 0.195, + "grad_norm": 0.0302036851644516, + "learning_rate": 7.650948305405303e-07, + "loss": 0.0332, + "step": 185000 + }, + { + "epoch": 0.19505, + "grad_norm": 0.02926947921514511, + "learning_rate": 7.64080338465134e-07, + "loss": 0.0325, + "step": 185010 + }, + { + "epoch": 0.1951, + "grad_norm": 0.03630632534623146, + "learning_rate": 7.630665089906758e-07, + "loss": 0.0326, + "step": 185020 + }, + { + "epoch": 0.19515, + "grad_norm": 0.03500113636255264, + "learning_rate": 7.620533421448722e-07, + "loss": 0.0316, + "step": 185030 + }, + { + "epoch": 0.1952, + "grad_norm": 0.027939170598983765, + "learning_rate": 7.610408379554263e-07, + "loss": 0.0326, + "step": 185040 + }, + { + "epoch": 0.19525, + "grad_norm": 0.027237216010689735, + "learning_rate": 7.600289964500184e-07, + "loss": 0.0329, + "step": 185050 + }, + { + "epoch": 0.1953, + "grad_norm": 0.031244980171322823, + "learning_rate": 7.590178176563073e-07, + "loss": 0.0318, + "step": 185060 + }, + { + "epoch": 0.19535, + "grad_norm": 0.031116962432861328, + "learning_rate": 7.580073016019457e-07, + "loss": 0.034, + "step": 185070 + }, + { + "epoch": 0.1954, + "grad_norm": 0.030840104445815086, + "learning_rate": 7.569974483145531e-07, + "loss": 0.0329, + "step": 185080 + }, + { + "epoch": 0.19545, + "grad_norm": 0.029891157522797585, + "learning_rate": 7.559882578217464e-07, + "loss": 0.0336, + "step": 185090 + }, + { + "epoch": 0.1955, + "grad_norm": 0.0267768744379282, + "learning_rate": 7.549797301511146e-07, + "loss": 0.0319, + "step": 185100 + }, + { + "epoch": 0.19555, + "grad_norm": 0.02854592353105545, + "learning_rate": 7.539718653302247e-07, + "loss": 0.0325, + "step": 185110 + }, + { + "epoch": 0.1956, + "grad_norm": 0.030489597469568253, + "learning_rate": 7.529646633866349e-07, + "loss": 0.033, + "step": 185120 + }, + { + "epoch": 0.19565, + "grad_norm": 0.028737762942910194, + "learning_rate": 7.519581243478846e-07, + "loss": 0.0336, + "step": 185130 + }, + { + "epoch": 0.1957, + "grad_norm": 0.030285654589533806, + "learning_rate": 7.509522482414905e-07, + "loss": 0.0321, + "step": 185140 + }, + { + "epoch": 0.19575, + "grad_norm": 0.029154222458600998, + "learning_rate": 7.499470350949473e-07, + "loss": 0.0334, + "step": 185150 + }, + { + "epoch": 0.1958, + "grad_norm": 0.032220274209976196, + "learning_rate": 7.489424849357441e-07, + "loss": 0.031, + "step": 185160 + }, + { + "epoch": 0.19585, + "grad_norm": 0.027242610231041908, + "learning_rate": 7.479385977913422e-07, + "loss": 0.0327, + "step": 185170 + }, + { + "epoch": 0.1959, + "grad_norm": 0.027455078437924385, + "learning_rate": 7.469353736891893e-07, + "loss": 0.0316, + "step": 185180 + }, + { + "epoch": 0.19595, + "grad_norm": 0.030220970511436462, + "learning_rate": 7.459328126567134e-07, + "loss": 0.0336, + "step": 185190 + }, + { + "epoch": 0.196, + "grad_norm": 0.030868202447891235, + "learning_rate": 7.449309147213173e-07, + "loss": 0.031, + "step": 185200 + }, + { + "epoch": 0.19605, + "grad_norm": 0.034092117100954056, + "learning_rate": 7.439296799104018e-07, + "loss": 0.0329, + "step": 185210 + }, + { + "epoch": 0.1961, + "grad_norm": 0.030522791668772697, + "learning_rate": 7.429291082513362e-07, + "loss": 0.0327, + "step": 185220 + }, + { + "epoch": 0.19615, + "grad_norm": 0.02995004877448082, + "learning_rate": 7.419291997714766e-07, + "loss": 0.0324, + "step": 185230 + }, + { + "epoch": 0.1962, + "grad_norm": 0.03135078400373459, + "learning_rate": 7.409299544981568e-07, + "loss": 0.034, + "step": 185240 + }, + { + "epoch": 0.19625, + "grad_norm": 0.03217240795493126, + "learning_rate": 7.399313724586965e-07, + "loss": 0.0343, + "step": 185250 + }, + { + "epoch": 0.1963, + "grad_norm": 0.03078630194067955, + "learning_rate": 7.389334536804044e-07, + "loss": 0.0332, + "step": 185260 + }, + { + "epoch": 0.19635, + "grad_norm": 0.02970891259610653, + "learning_rate": 7.379361981905531e-07, + "loss": 0.0335, + "step": 185270 + }, + { + "epoch": 0.1964, + "grad_norm": 0.029535263776779175, + "learning_rate": 7.369396060164124e-07, + "loss": 0.0353, + "step": 185280 + }, + { + "epoch": 0.19645, + "grad_norm": 0.03212609142065048, + "learning_rate": 7.359436771852274e-07, + "loss": 0.0327, + "step": 185290 + }, + { + "epoch": 0.1965, + "grad_norm": 0.024767275899648666, + "learning_rate": 7.349484117242261e-07, + "loss": 0.0338, + "step": 185300 + }, + { + "epoch": 0.19655, + "grad_norm": 0.03457412123680115, + "learning_rate": 7.339538096606202e-07, + "loss": 0.0321, + "step": 185310 + }, + { + "epoch": 0.1966, + "grad_norm": 0.024502325803041458, + "learning_rate": 7.32959871021599e-07, + "loss": 0.0324, + "step": 185320 + }, + { + "epoch": 0.19665, + "grad_norm": 0.03082755208015442, + "learning_rate": 7.319665958343408e-07, + "loss": 0.033, + "step": 185330 + }, + { + "epoch": 0.1967, + "grad_norm": 0.03430657833814621, + "learning_rate": 7.309739841259988e-07, + "loss": 0.0324, + "step": 185340 + }, + { + "epoch": 0.19675, + "grad_norm": 0.03185485675930977, + "learning_rate": 7.299820359237097e-07, + "loss": 0.0316, + "step": 185350 + }, + { + "epoch": 0.1968, + "grad_norm": 0.028983809053897858, + "learning_rate": 7.289907512545935e-07, + "loss": 0.0329, + "step": 185360 + }, + { + "epoch": 0.19685, + "grad_norm": 0.027429502457380295, + "learning_rate": 7.280001301457507e-07, + "loss": 0.0324, + "step": 185370 + }, + { + "epoch": 0.1969, + "grad_norm": 0.033588334918022156, + "learning_rate": 7.270101726242679e-07, + "loss": 0.0308, + "step": 185380 + }, + { + "epoch": 0.19695, + "grad_norm": 0.028332434594631195, + "learning_rate": 7.260208787172068e-07, + "loss": 0.0325, + "step": 185390 + }, + { + "epoch": 0.197, + "grad_norm": 0.02579881250858307, + "learning_rate": 7.250322484516181e-07, + "loss": 0.0325, + "step": 185400 + }, + { + "epoch": 0.19705, + "grad_norm": 0.027061356231570244, + "learning_rate": 7.240442818545245e-07, + "loss": 0.0327, + "step": 185410 + }, + { + "epoch": 0.1971, + "grad_norm": 0.027370747178792953, + "learning_rate": 7.230569789529434e-07, + "loss": 0.0323, + "step": 185420 + }, + { + "epoch": 0.19715, + "grad_norm": 0.02994256466627121, + "learning_rate": 7.220703397738615e-07, + "loss": 0.032, + "step": 185430 + }, + { + "epoch": 0.1972, + "grad_norm": 0.02895674854516983, + "learning_rate": 7.210843643442572e-07, + "loss": 0.0318, + "step": 185440 + }, + { + "epoch": 0.19725, + "grad_norm": 0.028868872672319412, + "learning_rate": 7.200990526910839e-07, + "loss": 0.0319, + "step": 185450 + }, + { + "epoch": 0.1973, + "grad_norm": 0.02675914578139782, + "learning_rate": 7.191144048412812e-07, + "loss": 0.0324, + "step": 185460 + }, + { + "epoch": 0.19735, + "grad_norm": 0.030706174671649933, + "learning_rate": 7.181304208217721e-07, + "loss": 0.0323, + "step": 185470 + }, + { + "epoch": 0.1974, + "grad_norm": 0.027158884331583977, + "learning_rate": 7.171471006594515e-07, + "loss": 0.032, + "step": 185480 + }, + { + "epoch": 0.19745, + "grad_norm": 0.03242984041571617, + "learning_rate": 7.161644443812065e-07, + "loss": 0.0323, + "step": 185490 + }, + { + "epoch": 0.1975, + "grad_norm": 0.02678975835442543, + "learning_rate": 7.151824520139044e-07, + "loss": 0.0299, + "step": 185500 + }, + { + "epoch": 0.19755, + "grad_norm": 0.031223347410559654, + "learning_rate": 7.142011235843904e-07, + "loss": 0.0314, + "step": 185510 + }, + { + "epoch": 0.1976, + "grad_norm": 0.03180424124002457, + "learning_rate": 7.13220459119493e-07, + "loss": 0.0319, + "step": 185520 + }, + { + "epoch": 0.19765, + "grad_norm": 0.03262518346309662, + "learning_rate": 7.122404586460213e-07, + "loss": 0.0316, + "step": 185530 + }, + { + "epoch": 0.1977, + "grad_norm": 0.03049931675195694, + "learning_rate": 7.112611221907761e-07, + "loss": 0.0333, + "step": 185540 + }, + { + "epoch": 0.19775, + "grad_norm": 0.030612872913479805, + "learning_rate": 7.10282449780525e-07, + "loss": 0.0319, + "step": 185550 + }, + { + "epoch": 0.1978, + "grad_norm": 0.029911888763308525, + "learning_rate": 7.093044414420241e-07, + "loss": 0.0315, + "step": 185560 + }, + { + "epoch": 0.19785, + "grad_norm": 0.03132949769496918, + "learning_rate": 7.083270972020189e-07, + "loss": 0.0329, + "step": 185570 + }, + { + "epoch": 0.1979, + "grad_norm": 0.027829304337501526, + "learning_rate": 7.073504170872213e-07, + "loss": 0.0317, + "step": 185580 + }, + { + "epoch": 0.19795, + "grad_norm": 0.027748405933380127, + "learning_rate": 7.063744011243378e-07, + "loss": 0.0318, + "step": 185590 + }, + { + "epoch": 0.198, + "grad_norm": 0.028777683153748512, + "learning_rate": 7.053990493400525e-07, + "loss": 0.033, + "step": 185600 + }, + { + "epoch": 0.19805, + "grad_norm": 0.03167557343840599, + "learning_rate": 7.044243617610302e-07, + "loss": 0.0332, + "step": 185610 + }, + { + "epoch": 0.1981, + "grad_norm": 0.03244497627019882, + "learning_rate": 7.034503384139163e-07, + "loss": 0.0349, + "step": 185620 + }, + { + "epoch": 0.19815, + "grad_norm": 0.02962498925626278, + "learning_rate": 7.024769793253449e-07, + "loss": 0.0321, + "step": 185630 + }, + { + "epoch": 0.1982, + "grad_norm": 0.03409123048186302, + "learning_rate": 7.015042845219256e-07, + "loss": 0.0331, + "step": 185640 + }, + { + "epoch": 0.19825, + "grad_norm": 0.032920096069574356, + "learning_rate": 7.00532254030245e-07, + "loss": 0.0325, + "step": 185650 + }, + { + "epoch": 0.1983, + "grad_norm": 0.030276518315076828, + "learning_rate": 6.995608878768906e-07, + "loss": 0.032, + "step": 185660 + }, + { + "epoch": 0.19835, + "grad_norm": 0.035315632820129395, + "learning_rate": 6.985901860884048e-07, + "loss": 0.0337, + "step": 185670 + }, + { + "epoch": 0.1984, + "grad_norm": 0.031692810356616974, + "learning_rate": 6.976201486913414e-07, + "loss": 0.0335, + "step": 185680 + }, + { + "epoch": 0.19845, + "grad_norm": 0.030826926231384277, + "learning_rate": 6.966507757122099e-07, + "loss": 0.0323, + "step": 185690 + }, + { + "epoch": 0.1985, + "grad_norm": 0.03321309760212898, + "learning_rate": 6.956820671775138e-07, + "loss": 0.0335, + "step": 185700 + }, + { + "epoch": 0.19855, + "grad_norm": 0.02920692041516304, + "learning_rate": 6.947140231137406e-07, + "loss": 0.0335, + "step": 185710 + }, + { + "epoch": 0.1986, + "grad_norm": 0.029470006003975868, + "learning_rate": 6.937466435473577e-07, + "loss": 0.0325, + "step": 185720 + }, + { + "epoch": 0.19865, + "grad_norm": 0.029567958787083626, + "learning_rate": 6.927799285048081e-07, + "loss": 0.0328, + "step": 185730 + }, + { + "epoch": 0.1987, + "grad_norm": 0.031060943379998207, + "learning_rate": 6.918138780125206e-07, + "loss": 0.0337, + "step": 185740 + }, + { + "epoch": 0.19875, + "grad_norm": 0.03221989423036575, + "learning_rate": 6.908484920969099e-07, + "loss": 0.0332, + "step": 185750 + }, + { + "epoch": 0.1988, + "grad_norm": 0.030383775010704994, + "learning_rate": 6.898837707843747e-07, + "loss": 0.0317, + "step": 185760 + }, + { + "epoch": 0.19885, + "grad_norm": 0.030970165506005287, + "learning_rate": 6.889197141012799e-07, + "loss": 0.0331, + "step": 185770 + }, + { + "epoch": 0.1989, + "grad_norm": 0.0318511463701725, + "learning_rate": 6.879563220739877e-07, + "loss": 0.0332, + "step": 185780 + }, + { + "epoch": 0.19895, + "grad_norm": 0.029180170968174934, + "learning_rate": 6.869935947288353e-07, + "loss": 0.0323, + "step": 185790 + }, + { + "epoch": 0.199, + "grad_norm": 0.03175966814160347, + "learning_rate": 6.860315320921462e-07, + "loss": 0.0333, + "step": 185800 + }, + { + "epoch": 0.19905, + "grad_norm": 0.027447475120425224, + "learning_rate": 6.850701341902188e-07, + "loss": 0.0332, + "step": 185810 + }, + { + "epoch": 0.1991, + "grad_norm": 0.026209700852632523, + "learning_rate": 6.841094010493376e-07, + "loss": 0.0323, + "step": 185820 + }, + { + "epoch": 0.19915, + "grad_norm": 0.03160052374005318, + "learning_rate": 6.831493326957733e-07, + "loss": 0.0327, + "step": 185830 + }, + { + "epoch": 0.1992, + "grad_norm": 0.03303055837750435, + "learning_rate": 6.821899291557715e-07, + "loss": 0.0335, + "step": 185840 + }, + { + "epoch": 0.19925, + "grad_norm": 0.03005937859416008, + "learning_rate": 6.812311904555613e-07, + "loss": 0.0329, + "step": 185850 + }, + { + "epoch": 0.1993, + "grad_norm": 0.02981853485107422, + "learning_rate": 6.802731166213495e-07, + "loss": 0.0345, + "step": 185860 + }, + { + "epoch": 0.19935, + "grad_norm": 0.027913715690374374, + "learning_rate": 6.793157076793399e-07, + "loss": 0.0331, + "step": 185870 + }, + { + "epoch": 0.1994, + "grad_norm": 0.03380822017788887, + "learning_rate": 6.783589636556981e-07, + "loss": 0.0325, + "step": 185880 + }, + { + "epoch": 0.19945, + "grad_norm": 0.032246727496385574, + "learning_rate": 6.774028845765862e-07, + "loss": 0.0335, + "step": 185890 + }, + { + "epoch": 0.1995, + "grad_norm": 0.029604503884911537, + "learning_rate": 6.764474704681417e-07, + "loss": 0.0323, + "step": 185900 + }, + { + "epoch": 0.19955, + "grad_norm": 0.032075826078653336, + "learning_rate": 6.754927213564855e-07, + "loss": 0.0326, + "step": 185910 + }, + { + "epoch": 0.1996, + "grad_norm": 0.024991866201162338, + "learning_rate": 6.745386372677215e-07, + "loss": 0.0314, + "step": 185920 + }, + { + "epoch": 0.19965, + "grad_norm": 0.02830740064382553, + "learning_rate": 6.735852182279318e-07, + "loss": 0.0318, + "step": 185930 + }, + { + "epoch": 0.1997, + "grad_norm": 0.027261871844530106, + "learning_rate": 6.726324642631814e-07, + "loss": 0.0331, + "step": 185940 + }, + { + "epoch": 0.19975, + "grad_norm": 0.03271022439002991, + "learning_rate": 6.716803753995221e-07, + "loss": 0.035, + "step": 185950 + }, + { + "epoch": 0.1998, + "grad_norm": 0.033689193427562714, + "learning_rate": 6.707289516629772e-07, + "loss": 0.0323, + "step": 185960 + }, + { + "epoch": 0.19985, + "grad_norm": 0.02958158403635025, + "learning_rate": 6.697781930795705e-07, + "loss": 0.0315, + "step": 185970 + }, + { + "epoch": 0.1999, + "grad_norm": 0.02819625288248062, + "learning_rate": 6.688280996752811e-07, + "loss": 0.0318, + "step": 185980 + }, + { + "epoch": 0.19995, + "grad_norm": 0.027729539200663567, + "learning_rate": 6.678786714760937e-07, + "loss": 0.0311, + "step": 185990 + }, + { + "epoch": 0.2, + "grad_norm": 0.02650776319205761, + "learning_rate": 6.6692990850796e-07, + "loss": 0.0319, + "step": 186000 + }, + { + "epoch": 0.20005, + "grad_norm": 0.028526129201054573, + "learning_rate": 6.65981810796823e-07, + "loss": 0.0311, + "step": 186010 + }, + { + "epoch": 0.2001, + "grad_norm": 0.033663127571344376, + "learning_rate": 6.650343783686036e-07, + "loss": 0.032, + "step": 186020 + }, + { + "epoch": 0.20015, + "grad_norm": 0.030074715614318848, + "learning_rate": 6.640876112491978e-07, + "loss": 0.0327, + "step": 186030 + }, + { + "epoch": 0.2002, + "grad_norm": 0.02809826284646988, + "learning_rate": 6.63141509464496e-07, + "loss": 0.0329, + "step": 186040 + }, + { + "epoch": 0.20025, + "grad_norm": 0.03013697639107704, + "learning_rate": 6.621960730403637e-07, + "loss": 0.0321, + "step": 186050 + }, + { + "epoch": 0.2003, + "grad_norm": 0.029662052169442177, + "learning_rate": 6.612513020026467e-07, + "loss": 0.0333, + "step": 186060 + }, + { + "epoch": 0.20035, + "grad_norm": 0.034348465502262115, + "learning_rate": 6.603071963771717e-07, + "loss": 0.0325, + "step": 186070 + }, + { + "epoch": 0.2004, + "grad_norm": 0.025963271036744118, + "learning_rate": 6.59363756189757e-07, + "loss": 0.0323, + "step": 186080 + }, + { + "epoch": 0.20045, + "grad_norm": 0.03223303705453873, + "learning_rate": 6.58420981466193e-07, + "loss": 0.0324, + "step": 186090 + }, + { + "epoch": 0.2005, + "grad_norm": 0.02908337488770485, + "learning_rate": 6.574788722322561e-07, + "loss": 0.032, + "step": 186100 + }, + { + "epoch": 0.20055, + "grad_norm": 0.029895760118961334, + "learning_rate": 6.56537428513701e-07, + "loss": 0.0347, + "step": 186110 + }, + { + "epoch": 0.2006, + "grad_norm": 0.02662333846092224, + "learning_rate": 6.555966503362626e-07, + "loss": 0.0319, + "step": 186120 + }, + { + "epoch": 0.20065, + "grad_norm": 0.033412255346775055, + "learning_rate": 6.546565377256731e-07, + "loss": 0.033, + "step": 186130 + }, + { + "epoch": 0.2007, + "grad_norm": 0.028197573497891426, + "learning_rate": 6.537170907076229e-07, + "loss": 0.033, + "step": 186140 + }, + { + "epoch": 0.20075, + "grad_norm": 0.03171047568321228, + "learning_rate": 6.527783093078027e-07, + "loss": 0.0333, + "step": 186150 + }, + { + "epoch": 0.2008, + "grad_norm": 0.028691383078694344, + "learning_rate": 6.518401935518753e-07, + "loss": 0.0353, + "step": 186160 + }, + { + "epoch": 0.20085, + "grad_norm": 0.03228580579161644, + "learning_rate": 6.509027434654896e-07, + "loss": 0.0325, + "step": 186170 + }, + { + "epoch": 0.2009, + "grad_norm": 0.02812507189810276, + "learning_rate": 6.499659590742807e-07, + "loss": 0.0331, + "step": 186180 + }, + { + "epoch": 0.20095, + "grad_norm": 0.034292567521333694, + "learning_rate": 6.490298404038503e-07, + "loss": 0.0319, + "step": 186190 + }, + { + "epoch": 0.201, + "grad_norm": 0.03362613543868065, + "learning_rate": 6.480943874797946e-07, + "loss": 0.0322, + "step": 186200 + }, + { + "epoch": 0.20105, + "grad_norm": 0.029850492253899574, + "learning_rate": 6.471596003276903e-07, + "loss": 0.0324, + "step": 186210 + }, + { + "epoch": 0.2011, + "grad_norm": 0.032299816608428955, + "learning_rate": 6.462254789730976e-07, + "loss": 0.0331, + "step": 186220 + }, + { + "epoch": 0.20115, + "grad_norm": 0.031973760575056076, + "learning_rate": 6.452920234415488e-07, + "loss": 0.0325, + "step": 186230 + }, + { + "epoch": 0.2012, + "grad_norm": 0.029282039031386375, + "learning_rate": 6.443592337585624e-07, + "loss": 0.0327, + "step": 186240 + }, + { + "epoch": 0.20125, + "grad_norm": 0.025493284687399864, + "learning_rate": 6.434271099496486e-07, + "loss": 0.0327, + "step": 186250 + }, + { + "epoch": 0.2013, + "grad_norm": 0.03137873485684395, + "learning_rate": 6.424956520402869e-07, + "loss": 0.0334, + "step": 186260 + }, + { + "epoch": 0.20135, + "grad_norm": 0.030143195763230324, + "learning_rate": 6.415648600559432e-07, + "loss": 0.034, + "step": 186270 + }, + { + "epoch": 0.2014, + "grad_norm": 0.02951633930206299, + "learning_rate": 6.406347340220664e-07, + "loss": 0.0336, + "step": 186280 + }, + { + "epoch": 0.20145, + "grad_norm": 0.030829209834337234, + "learning_rate": 6.397052739640808e-07, + "loss": 0.0321, + "step": 186290 + }, + { + "epoch": 0.2015, + "grad_norm": 0.03237324580550194, + "learning_rate": 6.387764799074047e-07, + "loss": 0.0318, + "step": 186300 + }, + { + "epoch": 0.20155, + "grad_norm": 0.028324754908680916, + "learning_rate": 6.378483518774264e-07, + "loss": 0.0327, + "step": 186310 + }, + { + "epoch": 0.2016, + "grad_norm": 0.03085837885737419, + "learning_rate": 6.369208898995199e-07, + "loss": 0.0319, + "step": 186320 + }, + { + "epoch": 0.20165, + "grad_norm": 0.028847992420196533, + "learning_rate": 6.359940939990484e-07, + "loss": 0.0338, + "step": 186330 + }, + { + "epoch": 0.2017, + "grad_norm": 0.030875183641910553, + "learning_rate": 6.350679642013413e-07, + "loss": 0.0319, + "step": 186340 + }, + { + "epoch": 0.20175, + "grad_norm": 0.03190063312649727, + "learning_rate": 6.341425005317259e-07, + "loss": 0.0327, + "step": 186350 + }, + { + "epoch": 0.2018, + "grad_norm": 0.02719941921532154, + "learning_rate": 6.332177030154957e-07, + "loss": 0.0323, + "step": 186360 + }, + { + "epoch": 0.20185, + "grad_norm": 0.031786005944013596, + "learning_rate": 6.322935716779416e-07, + "loss": 0.0322, + "step": 186370 + }, + { + "epoch": 0.2019, + "grad_norm": 0.027638213708996773, + "learning_rate": 6.313701065443268e-07, + "loss": 0.032, + "step": 186380 + }, + { + "epoch": 0.20195, + "grad_norm": 0.032468460500240326, + "learning_rate": 6.304473076399004e-07, + "loss": 0.0325, + "step": 186390 + }, + { + "epoch": 0.202, + "grad_norm": 0.028046119958162308, + "learning_rate": 6.295251749898868e-07, + "loss": 0.0319, + "step": 186400 + }, + { + "epoch": 0.20205, + "grad_norm": 0.032322533428668976, + "learning_rate": 6.28603708619499e-07, + "loss": 0.0313, + "step": 186410 + }, + { + "epoch": 0.2021, + "grad_norm": 0.028440384194254875, + "learning_rate": 6.276829085539337e-07, + "loss": 0.033, + "step": 186420 + }, + { + "epoch": 0.20215, + "grad_norm": 0.031794071197509766, + "learning_rate": 6.267627748183597e-07, + "loss": 0.0314, + "step": 186430 + }, + { + "epoch": 0.2022, + "grad_norm": 0.039318595081567764, + "learning_rate": 6.258433074379344e-07, + "loss": 0.0333, + "step": 186440 + }, + { + "epoch": 0.20225, + "grad_norm": 0.028141120448708534, + "learning_rate": 6.249245064377934e-07, + "loss": 0.0326, + "step": 186450 + }, + { + "epoch": 0.2023, + "grad_norm": 0.03346704691648483, + "learning_rate": 6.240063718430611e-07, + "loss": 0.0329, + "step": 186460 + }, + { + "epoch": 0.20235, + "grad_norm": 0.02704657055437565, + "learning_rate": 6.230889036788395e-07, + "loss": 0.0317, + "step": 186470 + }, + { + "epoch": 0.2024, + "grad_norm": 0.025984736159443855, + "learning_rate": 6.221721019702059e-07, + "loss": 0.0333, + "step": 186480 + }, + { + "epoch": 0.20245, + "grad_norm": 0.03137907013297081, + "learning_rate": 6.212559667422291e-07, + "loss": 0.0308, + "step": 186490 + }, + { + "epoch": 0.2025, + "grad_norm": 0.029576167464256287, + "learning_rate": 6.203404980199556e-07, + "loss": 0.0321, + "step": 186500 + }, + { + "epoch": 0.20255, + "grad_norm": 0.029019024223089218, + "learning_rate": 6.194256958284156e-07, + "loss": 0.0322, + "step": 186510 + }, + { + "epoch": 0.2026, + "grad_norm": 0.030817793682217598, + "learning_rate": 6.185115601926167e-07, + "loss": 0.0323, + "step": 186520 + }, + { + "epoch": 0.20265, + "grad_norm": 0.031037267297506332, + "learning_rate": 6.175980911375528e-07, + "loss": 0.0321, + "step": 186530 + }, + { + "epoch": 0.2027, + "grad_norm": 0.02664116770029068, + "learning_rate": 6.166852886881958e-07, + "loss": 0.0315, + "step": 186540 + }, + { + "epoch": 0.20275, + "grad_norm": 0.029405994340777397, + "learning_rate": 6.157731528695033e-07, + "loss": 0.0324, + "step": 186550 + }, + { + "epoch": 0.2028, + "grad_norm": 0.029494572430849075, + "learning_rate": 6.148616837064136e-07, + "loss": 0.0328, + "step": 186560 + }, + { + "epoch": 0.20285, + "grad_norm": 0.031170979142189026, + "learning_rate": 6.139508812238404e-07, + "loss": 0.0327, + "step": 186570 + }, + { + "epoch": 0.2029, + "grad_norm": 0.0288227628916502, + "learning_rate": 6.130407454466913e-07, + "loss": 0.0342, + "step": 186580 + }, + { + "epoch": 0.20295, + "grad_norm": 0.02931247465312481, + "learning_rate": 6.121312763998465e-07, + "loss": 0.0325, + "step": 186590 + }, + { + "epoch": 0.203, + "grad_norm": 0.03437415510416031, + "learning_rate": 6.112224741081696e-07, + "loss": 0.0336, + "step": 186600 + }, + { + "epoch": 0.20305, + "grad_norm": 0.02927682362496853, + "learning_rate": 6.103143385965099e-07, + "loss": 0.0343, + "step": 186610 + }, + { + "epoch": 0.2031, + "grad_norm": 0.03057851642370224, + "learning_rate": 6.094068698896893e-07, + "loss": 0.0335, + "step": 186620 + }, + { + "epoch": 0.20315, + "grad_norm": 0.03229416161775589, + "learning_rate": 6.085000680125269e-07, + "loss": 0.0343, + "step": 186630 + }, + { + "epoch": 0.2032, + "grad_norm": 0.031911615282297134, + "learning_rate": 6.075939329898056e-07, + "loss": 0.0333, + "step": 186640 + }, + { + "epoch": 0.20325, + "grad_norm": 0.033049266785383224, + "learning_rate": 6.066884648463028e-07, + "loss": 0.0333, + "step": 186650 + }, + { + "epoch": 0.2033, + "grad_norm": 0.030126668512821198, + "learning_rate": 6.057836636067738e-07, + "loss": 0.0348, + "step": 186660 + }, + { + "epoch": 0.20335, + "grad_norm": 0.027441836893558502, + "learning_rate": 6.048795292959541e-07, + "loss": 0.0332, + "step": 186670 + }, + { + "epoch": 0.2034, + "grad_norm": 0.03227389603853226, + "learning_rate": 6.039760619385687e-07, + "loss": 0.0348, + "step": 186680 + }, + { + "epoch": 0.20345, + "grad_norm": 0.0290833692997694, + "learning_rate": 6.03073261559306e-07, + "loss": 0.0332, + "step": 186690 + }, + { + "epoch": 0.2035, + "grad_norm": 0.03083917498588562, + "learning_rate": 6.021711281828546e-07, + "loss": 0.0341, + "step": 186700 + }, + { + "epoch": 0.20355, + "grad_norm": 0.031132884323596954, + "learning_rate": 6.012696618338809e-07, + "loss": 0.033, + "step": 186710 + }, + { + "epoch": 0.2036, + "grad_norm": 0.03095533512532711, + "learning_rate": 6.003688625370291e-07, + "loss": 0.0332, + "step": 186720 + }, + { + "epoch": 0.20365, + "grad_norm": 0.03213070333003998, + "learning_rate": 5.994687303169266e-07, + "loss": 0.0332, + "step": 186730 + }, + { + "epoch": 0.2037, + "grad_norm": 0.027768230065703392, + "learning_rate": 5.985692651981816e-07, + "loss": 0.0348, + "step": 186740 + }, + { + "epoch": 0.20375, + "grad_norm": 0.033160459250211716, + "learning_rate": 5.976704672053856e-07, + "loss": 0.0345, + "step": 186750 + }, + { + "epoch": 0.2038, + "grad_norm": 0.031500443816185, + "learning_rate": 5.967723363631106e-07, + "loss": 0.0332, + "step": 186760 + }, + { + "epoch": 0.20385, + "grad_norm": 0.0280007254332304, + "learning_rate": 5.958748726959118e-07, + "loss": 0.034, + "step": 186770 + }, + { + "epoch": 0.2039, + "grad_norm": 0.028425922617316246, + "learning_rate": 5.94978076228328e-07, + "loss": 0.0328, + "step": 186780 + }, + { + "epoch": 0.20395, + "grad_norm": 0.030183615162968636, + "learning_rate": 5.940819469848702e-07, + "loss": 0.0328, + "step": 186790 + }, + { + "epoch": 0.204, + "grad_norm": 0.033598169684410095, + "learning_rate": 5.931864849900493e-07, + "loss": 0.0337, + "step": 186800 + }, + { + "epoch": 0.20405, + "grad_norm": 0.030246658250689507, + "learning_rate": 5.922916902683373e-07, + "loss": 0.0322, + "step": 186810 + }, + { + "epoch": 0.2041, + "grad_norm": 0.030375244095921516, + "learning_rate": 5.913975628442037e-07, + "loss": 0.0339, + "step": 186820 + }, + { + "epoch": 0.20415, + "grad_norm": 0.031668275594711304, + "learning_rate": 5.905041027420871e-07, + "loss": 0.0326, + "step": 186830 + }, + { + "epoch": 0.2042, + "grad_norm": 0.03186219185590744, + "learning_rate": 5.896113099864209e-07, + "loss": 0.0327, + "step": 186840 + }, + { + "epoch": 0.20425, + "grad_norm": 0.03156822919845581, + "learning_rate": 5.887191846016104e-07, + "loss": 0.0332, + "step": 186850 + }, + { + "epoch": 0.2043, + "grad_norm": 0.02741374634206295, + "learning_rate": 5.878277266120419e-07, + "loss": 0.0331, + "step": 186860 + }, + { + "epoch": 0.20435, + "grad_norm": 0.030092798173427582, + "learning_rate": 5.869369360420985e-07, + "loss": 0.0333, + "step": 186870 + }, + { + "epoch": 0.2044, + "grad_norm": 0.033063847571611404, + "learning_rate": 5.860468129161218e-07, + "loss": 0.0344, + "step": 186880 + }, + { + "epoch": 0.20445, + "grad_norm": 0.03125067055225372, + "learning_rate": 5.851573572584618e-07, + "loss": 0.0342, + "step": 186890 + }, + { + "epoch": 0.2045, + "grad_norm": 0.034990180283784866, + "learning_rate": 5.842685690934214e-07, + "loss": 0.0338, + "step": 186900 + }, + { + "epoch": 0.20455, + "grad_norm": 0.03362009674310684, + "learning_rate": 5.833804484453031e-07, + "loss": 0.0321, + "step": 186910 + }, + { + "epoch": 0.2046, + "grad_norm": 0.028897780925035477, + "learning_rate": 5.824929953383962e-07, + "loss": 0.0345, + "step": 186920 + }, + { + "epoch": 0.20465, + "grad_norm": 0.02927793189883232, + "learning_rate": 5.81606209796956e-07, + "loss": 0.0344, + "step": 186930 + }, + { + "epoch": 0.2047, + "grad_norm": 0.031953468918800354, + "learning_rate": 5.807200918452299e-07, + "loss": 0.0347, + "step": 186940 + }, + { + "epoch": 0.20475, + "grad_norm": 0.034804243594408035, + "learning_rate": 5.798346415074373e-07, + "loss": 0.0316, + "step": 186950 + }, + { + "epoch": 0.2048, + "grad_norm": 0.027775434777140617, + "learning_rate": 5.789498588077924e-07, + "loss": 0.032, + "step": 186960 + }, + { + "epoch": 0.20485, + "grad_norm": 0.03166566044092178, + "learning_rate": 5.780657437704895e-07, + "loss": 0.0316, + "step": 186970 + }, + { + "epoch": 0.2049, + "grad_norm": 0.029515348374843597, + "learning_rate": 5.771822964196899e-07, + "loss": 0.0319, + "step": 186980 + }, + { + "epoch": 0.20495, + "grad_norm": 0.03157178685069084, + "learning_rate": 5.762995167795521e-07, + "loss": 0.0332, + "step": 186990 + }, + { + "epoch": 0.205, + "grad_norm": 0.033914707601070404, + "learning_rate": 5.754174048742094e-07, + "loss": 0.0321, + "step": 187000 + }, + { + "epoch": 0.20505, + "grad_norm": 0.025544434785842896, + "learning_rate": 5.745359607277789e-07, + "loss": 0.0329, + "step": 187010 + }, + { + "epoch": 0.2051, + "grad_norm": 0.028234345838427544, + "learning_rate": 5.736551843643606e-07, + "loss": 0.0334, + "step": 187020 + }, + { + "epoch": 0.20515, + "grad_norm": 0.02860359102487564, + "learning_rate": 5.727750758080324e-07, + "loss": 0.0319, + "step": 187030 + }, + { + "epoch": 0.2052, + "grad_norm": 0.03399662300944328, + "learning_rate": 5.718956350828558e-07, + "loss": 0.0335, + "step": 187040 + }, + { + "epoch": 0.20525, + "grad_norm": 0.02993389591574669, + "learning_rate": 5.710168622128781e-07, + "loss": 0.0333, + "step": 187050 + }, + { + "epoch": 0.2053, + "grad_norm": 0.028217710554599762, + "learning_rate": 5.70138757222119e-07, + "loss": 0.0323, + "step": 187060 + }, + { + "epoch": 0.20535, + "grad_norm": 0.026970872655510902, + "learning_rate": 5.692613201345869e-07, + "loss": 0.0331, + "step": 187070 + }, + { + "epoch": 0.2054, + "grad_norm": 0.02764004096388817, + "learning_rate": 5.683845509742769e-07, + "loss": 0.0325, + "step": 187080 + }, + { + "epoch": 0.20545, + "grad_norm": 0.03004232421517372, + "learning_rate": 5.675084497651501e-07, + "loss": 0.0325, + "step": 187090 + }, + { + "epoch": 0.2055, + "grad_norm": 0.027504999190568924, + "learning_rate": 5.666330165311651e-07, + "loss": 0.0329, + "step": 187100 + }, + { + "epoch": 0.20555, + "grad_norm": 0.02881404012441635, + "learning_rate": 5.657582512962556e-07, + "loss": 0.032, + "step": 187110 + }, + { + "epoch": 0.2056, + "grad_norm": 0.030582886189222336, + "learning_rate": 5.64884154084333e-07, + "loss": 0.0329, + "step": 187120 + }, + { + "epoch": 0.20565, + "grad_norm": 0.02659016102552414, + "learning_rate": 5.640107249193005e-07, + "loss": 0.0341, + "step": 187130 + }, + { + "epoch": 0.2057, + "grad_norm": 0.027498317882418633, + "learning_rate": 5.631379638250362e-07, + "loss": 0.0336, + "step": 187140 + }, + { + "epoch": 0.20575, + "grad_norm": 0.026438282802700996, + "learning_rate": 5.622658708253959e-07, + "loss": 0.0344, + "step": 187150 + }, + { + "epoch": 0.2058, + "grad_norm": 0.030266601592302322, + "learning_rate": 5.613944459442272e-07, + "loss": 0.0339, + "step": 187160 + }, + { + "epoch": 0.20585, + "grad_norm": 0.032785773277282715, + "learning_rate": 5.6052368920535e-07, + "loss": 0.0344, + "step": 187170 + }, + { + "epoch": 0.2059, + "grad_norm": 0.027709029614925385, + "learning_rate": 5.596536006325814e-07, + "loss": 0.0322, + "step": 187180 + }, + { + "epoch": 0.20595, + "grad_norm": 0.03300001472234726, + "learning_rate": 5.58784180249694e-07, + "loss": 0.0352, + "step": 187190 + }, + { + "epoch": 0.206, + "grad_norm": 0.03350808471441269, + "learning_rate": 5.579154280804688e-07, + "loss": 0.0352, + "step": 187200 + }, + { + "epoch": 0.20605, + "grad_norm": 0.03382578864693642, + "learning_rate": 5.570473441486507e-07, + "loss": 0.0345, + "step": 187210 + }, + { + "epoch": 0.2061, + "grad_norm": 0.033224329352378845, + "learning_rate": 5.56179928477979e-07, + "loss": 0.034, + "step": 187220 + }, + { + "epoch": 0.20615, + "grad_norm": 0.03182445093989372, + "learning_rate": 5.553131810921624e-07, + "loss": 0.0338, + "step": 187230 + }, + { + "epoch": 0.2062, + "grad_norm": 0.0331844724714756, + "learning_rate": 5.544471020148989e-07, + "loss": 0.0379, + "step": 187240 + }, + { + "epoch": 0.20625, + "grad_norm": 0.029393313452601433, + "learning_rate": 5.535816912698722e-07, + "loss": 0.0342, + "step": 187250 + }, + { + "epoch": 0.2063, + "grad_norm": 0.026721693575382233, + "learning_rate": 5.527169488807354e-07, + "loss": 0.037, + "step": 187260 + }, + { + "epoch": 0.20635, + "grad_norm": 0.0304440725594759, + "learning_rate": 5.518528748711338e-07, + "loss": 0.034, + "step": 187270 + }, + { + "epoch": 0.2064, + "grad_norm": 0.02855251356959343, + "learning_rate": 5.509894692646872e-07, + "loss": 0.0355, + "step": 187280 + }, + { + "epoch": 0.20645, + "grad_norm": 0.03019746206700802, + "learning_rate": 5.501267320850018e-07, + "loss": 0.0334, + "step": 187290 + }, + { + "epoch": 0.2065, + "grad_norm": 0.031135357916355133, + "learning_rate": 5.492646633556698e-07, + "loss": 0.0351, + "step": 187300 + }, + { + "epoch": 0.20655, + "grad_norm": 0.02638038992881775, + "learning_rate": 5.484032631002583e-07, + "loss": 0.0331, + "step": 187310 + }, + { + "epoch": 0.2066, + "grad_norm": 0.029533961787819862, + "learning_rate": 5.475425313423127e-07, + "loss": 0.0341, + "step": 187320 + }, + { + "epoch": 0.20665, + "grad_norm": 0.03587843477725983, + "learning_rate": 5.466824681053667e-07, + "loss": 0.0336, + "step": 187330 + }, + { + "epoch": 0.2067, + "grad_norm": 0.02860553003847599, + "learning_rate": 5.458230734129378e-07, + "loss": 0.0344, + "step": 187340 + }, + { + "epoch": 0.20675, + "grad_norm": 0.03146568313241005, + "learning_rate": 5.44964347288518e-07, + "loss": 0.0326, + "step": 187350 + }, + { + "epoch": 0.2068, + "grad_norm": 0.031543321907520294, + "learning_rate": 5.44106289755586e-07, + "loss": 0.033, + "step": 187360 + }, + { + "epoch": 0.20685, + "grad_norm": 0.03184448182582855, + "learning_rate": 5.432489008376007e-07, + "loss": 0.0328, + "step": 187370 + }, + { + "epoch": 0.2069, + "grad_norm": 0.03035517781972885, + "learning_rate": 5.42392180557999e-07, + "loss": 0.0325, + "step": 187380 + }, + { + "epoch": 0.20695, + "grad_norm": 0.027728291228413582, + "learning_rate": 5.415361289402148e-07, + "loss": 0.0345, + "step": 187390 + }, + { + "epoch": 0.207, + "grad_norm": 0.02883809804916382, + "learning_rate": 5.406807460076379e-07, + "loss": 0.0323, + "step": 187400 + }, + { + "epoch": 0.20705, + "grad_norm": 0.031016673892736435, + "learning_rate": 5.398260317836578e-07, + "loss": 0.0328, + "step": 187410 + }, + { + "epoch": 0.2071, + "grad_norm": 0.030551131814718246, + "learning_rate": 5.389719862916504e-07, + "loss": 0.0339, + "step": 187420 + }, + { + "epoch": 0.20715, + "grad_norm": 0.03497905284166336, + "learning_rate": 5.381186095549578e-07, + "loss": 0.0346, + "step": 187430 + }, + { + "epoch": 0.2072, + "grad_norm": 0.028034161776304245, + "learning_rate": 5.372659015969145e-07, + "loss": 0.0337, + "step": 187440 + }, + { + "epoch": 0.20725, + "grad_norm": 0.027271872386336327, + "learning_rate": 5.364138624408266e-07, + "loss": 0.0333, + "step": 187450 + }, + { + "epoch": 0.2073, + "grad_norm": 0.027147216722369194, + "learning_rate": 5.35562492109995e-07, + "loss": 0.0345, + "step": 187460 + }, + { + "epoch": 0.20735, + "grad_norm": 0.030328378081321716, + "learning_rate": 5.347117906276955e-07, + "loss": 0.0342, + "step": 187470 + }, + { + "epoch": 0.2074, + "grad_norm": 0.03240053728222847, + "learning_rate": 5.338617580171817e-07, + "loss": 0.0329, + "step": 187480 + }, + { + "epoch": 0.20745, + "grad_norm": 0.0268976092338562, + "learning_rate": 5.33012394301699e-07, + "loss": 0.0357, + "step": 187490 + }, + { + "epoch": 0.2075, + "grad_norm": 0.032217223197221756, + "learning_rate": 5.321636995044649e-07, + "loss": 0.0351, + "step": 187500 + }, + { + "epoch": 0.20755, + "grad_norm": 0.03143179416656494, + "learning_rate": 5.313156736486829e-07, + "loss": 0.034, + "step": 187510 + }, + { + "epoch": 0.2076, + "grad_norm": 0.03546081483364105, + "learning_rate": 5.304683167575374e-07, + "loss": 0.0321, + "step": 187520 + }, + { + "epoch": 0.20765, + "grad_norm": 0.029902616515755653, + "learning_rate": 5.296216288541933e-07, + "loss": 0.0338, + "step": 187530 + }, + { + "epoch": 0.2077, + "grad_norm": 0.03386787325143814, + "learning_rate": 5.287756099618041e-07, + "loss": 0.0337, + "step": 187540 + }, + { + "epoch": 0.20775, + "grad_norm": 0.031185338273644447, + "learning_rate": 5.279302601034958e-07, + "loss": 0.035, + "step": 187550 + }, + { + "epoch": 0.2078, + "grad_norm": 0.03265991061925888, + "learning_rate": 5.270855793023805e-07, + "loss": 0.0344, + "step": 187560 + }, + { + "epoch": 0.20785, + "grad_norm": 0.031603556126356125, + "learning_rate": 5.262415675815507e-07, + "loss": 0.0356, + "step": 187570 + }, + { + "epoch": 0.2079, + "grad_norm": 0.02581261657178402, + "learning_rate": 5.253982249640826e-07, + "loss": 0.0332, + "step": 187580 + }, + { + "epoch": 0.20795, + "grad_norm": 0.028013408184051514, + "learning_rate": 5.245555514730299e-07, + "loss": 0.0324, + "step": 187590 + }, + { + "epoch": 0.208, + "grad_norm": 0.03117271140217781, + "learning_rate": 5.237135471314352e-07, + "loss": 0.0327, + "step": 187600 + }, + { + "epoch": 0.20805, + "grad_norm": 0.02825392223894596, + "learning_rate": 5.228722119623192e-07, + "loss": 0.0329, + "step": 187610 + }, + { + "epoch": 0.2081, + "grad_norm": 0.02971552312374115, + "learning_rate": 5.220315459886771e-07, + "loss": 0.0322, + "step": 187620 + }, + { + "epoch": 0.20815, + "grad_norm": 0.02579326182603836, + "learning_rate": 5.21191549233499e-07, + "loss": 0.032, + "step": 187630 + }, + { + "epoch": 0.2082, + "grad_norm": 0.030249234288930893, + "learning_rate": 5.203522217197499e-07, + "loss": 0.0337, + "step": 187640 + }, + { + "epoch": 0.20825, + "grad_norm": 0.030575744807720184, + "learning_rate": 5.195135634703724e-07, + "loss": 0.0335, + "step": 187650 + }, + { + "epoch": 0.2083, + "grad_norm": 0.027355728670954704, + "learning_rate": 5.186755745082955e-07, + "loss": 0.0324, + "step": 187660 + }, + { + "epoch": 0.20835, + "grad_norm": 0.027793534100055695, + "learning_rate": 5.178382548564287e-07, + "loss": 0.0341, + "step": 187670 + }, + { + "epoch": 0.2084, + "grad_norm": 0.028333058580756187, + "learning_rate": 5.17001604537673e-07, + "loss": 0.0332, + "step": 187680 + }, + { + "epoch": 0.20845, + "grad_norm": 0.03142617270350456, + "learning_rate": 5.161656235748935e-07, + "loss": 0.0321, + "step": 187690 + }, + { + "epoch": 0.2085, + "grad_norm": 0.029200812801718712, + "learning_rate": 5.153303119909469e-07, + "loss": 0.0327, + "step": 187700 + }, + { + "epoch": 0.20855, + "grad_norm": 0.02589821256697178, + "learning_rate": 5.144956698086706e-07, + "loss": 0.0313, + "step": 187710 + }, + { + "epoch": 0.2086, + "grad_norm": 0.026966974139213562, + "learning_rate": 5.136616970508851e-07, + "loss": 0.0324, + "step": 187720 + }, + { + "epoch": 0.20865, + "grad_norm": 0.02764982171356678, + "learning_rate": 5.128283937403888e-07, + "loss": 0.0314, + "step": 187730 + }, + { + "epoch": 0.2087, + "grad_norm": 0.02868589200079441, + "learning_rate": 5.119957598999636e-07, + "loss": 0.0316, + "step": 187740 + }, + { + "epoch": 0.20875, + "grad_norm": 0.02683471143245697, + "learning_rate": 5.111637955523773e-07, + "loss": 0.0316, + "step": 187750 + }, + { + "epoch": 0.2088, + "grad_norm": 0.028452489525079727, + "learning_rate": 5.10332500720373e-07, + "loss": 0.0327, + "step": 187760 + }, + { + "epoch": 0.20885, + "grad_norm": 0.02829771302640438, + "learning_rate": 5.095018754266767e-07, + "loss": 0.0317, + "step": 187770 + }, + { + "epoch": 0.2089, + "grad_norm": 0.02697194740176201, + "learning_rate": 5.086719196939954e-07, + "loss": 0.0317, + "step": 187780 + }, + { + "epoch": 0.20895, + "grad_norm": 0.02591918595135212, + "learning_rate": 5.078426335450248e-07, + "loss": 0.0313, + "step": 187790 + }, + { + "epoch": 0.209, + "grad_norm": 0.026268985122442245, + "learning_rate": 5.070140170024384e-07, + "loss": 0.0326, + "step": 187800 + }, + { + "epoch": 0.20905, + "grad_norm": 0.026967208832502365, + "learning_rate": 5.061860700888849e-07, + "loss": 0.0365, + "step": 187810 + }, + { + "epoch": 0.2091, + "grad_norm": 0.02764206752181053, + "learning_rate": 5.053587928270014e-07, + "loss": 0.0323, + "step": 187820 + }, + { + "epoch": 0.20915, + "grad_norm": 0.030433472245931625, + "learning_rate": 5.045321852394064e-07, + "loss": 0.0324, + "step": 187830 + }, + { + "epoch": 0.2092, + "grad_norm": 0.031036920845508575, + "learning_rate": 5.037062473487009e-07, + "loss": 0.0336, + "step": 187840 + }, + { + "epoch": 0.20925, + "grad_norm": 0.03053821064531803, + "learning_rate": 5.028809791774641e-07, + "loss": 0.0335, + "step": 187850 + }, + { + "epoch": 0.2093, + "grad_norm": 0.036653995513916016, + "learning_rate": 5.020563807482559e-07, + "loss": 0.0335, + "step": 187860 + }, + { + "epoch": 0.20935, + "grad_norm": 0.030879056081175804, + "learning_rate": 5.012324520836248e-07, + "loss": 0.0331, + "step": 187870 + }, + { + "epoch": 0.2094, + "grad_norm": 0.03588445857167244, + "learning_rate": 5.004091932060917e-07, + "loss": 0.0333, + "step": 187880 + }, + { + "epoch": 0.20945, + "grad_norm": 0.03237863630056381, + "learning_rate": 4.995866041381719e-07, + "loss": 0.0332, + "step": 187890 + }, + { + "epoch": 0.2095, + "grad_norm": 0.032769039273262024, + "learning_rate": 4.987646849023447e-07, + "loss": 0.0343, + "step": 187900 + }, + { + "epoch": 0.20955, + "grad_norm": 0.029265888035297394, + "learning_rate": 4.979434355210866e-07, + "loss": 0.035, + "step": 187910 + }, + { + "epoch": 0.2096, + "grad_norm": 0.031995292752981186, + "learning_rate": 4.971228560168545e-07, + "loss": 0.034, + "step": 187920 + }, + { + "epoch": 0.20965, + "grad_norm": 0.031782425940036774, + "learning_rate": 4.96302946412075e-07, + "loss": 0.0334, + "step": 187930 + }, + { + "epoch": 0.2097, + "grad_norm": 0.02957136556506157, + "learning_rate": 4.95483706729169e-07, + "loss": 0.0343, + "step": 187940 + }, + { + "epoch": 0.20975, + "grad_norm": 0.03448570892214775, + "learning_rate": 4.946651369905297e-07, + "loss": 0.0333, + "step": 187950 + }, + { + "epoch": 0.2098, + "grad_norm": 0.03404470160603523, + "learning_rate": 4.938472372185449e-07, + "loss": 0.0347, + "step": 187960 + }, + { + "epoch": 0.20985, + "grad_norm": 0.030334487557411194, + "learning_rate": 4.930300074355659e-07, + "loss": 0.0343, + "step": 187970 + }, + { + "epoch": 0.2099, + "grad_norm": 0.0322144441306591, + "learning_rate": 4.922134476639389e-07, + "loss": 0.033, + "step": 187980 + }, + { + "epoch": 0.20995, + "grad_norm": 0.031088093295693398, + "learning_rate": 4.913975579259905e-07, + "loss": 0.0322, + "step": 187990 + }, + { + "epoch": 0.21, + "grad_norm": 0.03156421706080437, + "learning_rate": 4.90582338244025e-07, + "loss": 0.0323, + "step": 188000 + }, + { + "epoch": 0.21005, + "grad_norm": 0.030572090297937393, + "learning_rate": 4.897677886403301e-07, + "loss": 0.0331, + "step": 188010 + }, + { + "epoch": 0.2101, + "grad_norm": 0.03554949909448624, + "learning_rate": 4.889539091371797e-07, + "loss": 0.0341, + "step": 188020 + }, + { + "epoch": 0.21015, + "grad_norm": 0.03122507408261299, + "learning_rate": 4.881406997568172e-07, + "loss": 0.0339, + "step": 188030 + }, + { + "epoch": 0.2102, + "grad_norm": 0.030517447739839554, + "learning_rate": 4.873281605214802e-07, + "loss": 0.0317, + "step": 188040 + }, + { + "epoch": 0.21025, + "grad_norm": 0.026050496846437454, + "learning_rate": 4.865162914533816e-07, + "loss": 0.0338, + "step": 188050 + }, + { + "epoch": 0.2103, + "grad_norm": 0.03257044404745102, + "learning_rate": 4.857050925747203e-07, + "loss": 0.0337, + "step": 188060 + }, + { + "epoch": 0.21035, + "grad_norm": 0.03048430196940899, + "learning_rate": 4.848945639076702e-07, + "loss": 0.034, + "step": 188070 + }, + { + "epoch": 0.2104, + "grad_norm": 0.0283748097717762, + "learning_rate": 4.840847054743941e-07, + "loss": 0.0335, + "step": 188080 + }, + { + "epoch": 0.21045, + "grad_norm": 0.030453065410256386, + "learning_rate": 4.832755172970299e-07, + "loss": 0.0332, + "step": 188090 + }, + { + "epoch": 0.2105, + "grad_norm": 0.027233123779296875, + "learning_rate": 4.824669993977071e-07, + "loss": 0.0329, + "step": 188100 + }, + { + "epoch": 0.21055, + "grad_norm": 0.0295072291046381, + "learning_rate": 4.816591517985192e-07, + "loss": 0.033, + "step": 188110 + }, + { + "epoch": 0.2106, + "grad_norm": 0.027236513793468475, + "learning_rate": 4.808519745215623e-07, + "loss": 0.0332, + "step": 188120 + }, + { + "epoch": 0.21065, + "grad_norm": 0.02930319309234619, + "learning_rate": 4.800454675889021e-07, + "loss": 0.0345, + "step": 188130 + }, + { + "epoch": 0.2107, + "grad_norm": 0.03257892280817032, + "learning_rate": 4.79239631022585e-07, + "loss": 0.0335, + "step": 188140 + }, + { + "epoch": 0.21075, + "grad_norm": 0.02791167050600052, + "learning_rate": 4.784344648446487e-07, + "loss": 0.0355, + "step": 188150 + }, + { + "epoch": 0.2108, + "grad_norm": 0.0288220401853323, + "learning_rate": 4.776299690770952e-07, + "loss": 0.0331, + "step": 188160 + }, + { + "epoch": 0.21085, + "grad_norm": 0.03018355555832386, + "learning_rate": 4.7682614374192913e-07, + "loss": 0.0347, + "step": 188170 + }, + { + "epoch": 0.2109, + "grad_norm": 0.030637793242931366, + "learning_rate": 4.760229888611245e-07, + "loss": 0.0343, + "step": 188180 + }, + { + "epoch": 0.21095, + "grad_norm": 0.02879856899380684, + "learning_rate": 4.75220504456636e-07, + "loss": 0.033, + "step": 188190 + }, + { + "epoch": 0.211, + "grad_norm": 0.030380692332983017, + "learning_rate": 4.744186905504072e-07, + "loss": 0.0335, + "step": 188200 + }, + { + "epoch": 0.21105, + "grad_norm": 0.028760310262441635, + "learning_rate": 4.736175471643567e-07, + "loss": 0.0323, + "step": 188210 + }, + { + "epoch": 0.2111, + "grad_norm": 0.02885890007019043, + "learning_rate": 4.7281707432038915e-07, + "loss": 0.0349, + "step": 188220 + }, + { + "epoch": 0.21115, + "grad_norm": 0.02876354567706585, + "learning_rate": 4.72017272040387e-07, + "loss": 0.0335, + "step": 188230 + }, + { + "epoch": 0.2112, + "grad_norm": 0.02850225754082203, + "learning_rate": 4.7121814034621623e-07, + "loss": 0.0333, + "step": 188240 + }, + { + "epoch": 0.21125, + "grad_norm": 0.029061682522296906, + "learning_rate": 4.7041967925972873e-07, + "loss": 0.0323, + "step": 188250 + }, + { + "epoch": 0.2113, + "grad_norm": 0.028914617374539375, + "learning_rate": 4.6962188880275426e-07, + "loss": 0.0319, + "step": 188260 + }, + { + "epoch": 0.21135, + "grad_norm": 0.0294718686491251, + "learning_rate": 4.6882476899710037e-07, + "loss": 0.0324, + "step": 188270 + }, + { + "epoch": 0.2114, + "grad_norm": 0.026144810020923615, + "learning_rate": 4.68028319864558e-07, + "loss": 0.0324, + "step": 188280 + }, + { + "epoch": 0.21145, + "grad_norm": 0.026472182944417, + "learning_rate": 4.6723254142690687e-07, + "loss": 0.0316, + "step": 188290 + }, + { + "epoch": 0.2115, + "grad_norm": 0.029723865911364555, + "learning_rate": 4.664374337059019e-07, + "loss": 0.0329, + "step": 188300 + }, + { + "epoch": 0.21155, + "grad_norm": 0.02758902497589588, + "learning_rate": 4.6564299672328116e-07, + "loss": 0.0326, + "step": 188310 + }, + { + "epoch": 0.2116, + "grad_norm": 0.03302999958395958, + "learning_rate": 4.6484923050076344e-07, + "loss": 0.0339, + "step": 188320 + }, + { + "epoch": 0.21165, + "grad_norm": 0.03390069305896759, + "learning_rate": 4.640561350600509e-07, + "loss": 0.0315, + "step": 188330 + }, + { + "epoch": 0.2117, + "grad_norm": 0.029498016461730003, + "learning_rate": 4.6326371042282603e-07, + "loss": 0.0314, + "step": 188340 + }, + { + "epoch": 0.21175, + "grad_norm": 0.029174910858273506, + "learning_rate": 4.6247195661075214e-07, + "loss": 0.0326, + "step": 188350 + }, + { + "epoch": 0.2118, + "grad_norm": 0.028577126562595367, + "learning_rate": 4.616808736454759e-07, + "loss": 0.033, + "step": 188360 + }, + { + "epoch": 0.21185, + "grad_norm": 0.029439343139529228, + "learning_rate": 4.60890461548627e-07, + "loss": 0.0313, + "step": 188370 + }, + { + "epoch": 0.2119, + "grad_norm": 0.02873920649290085, + "learning_rate": 4.601007203418134e-07, + "loss": 0.0319, + "step": 188380 + }, + { + "epoch": 0.21195, + "grad_norm": 0.02306370809674263, + "learning_rate": 4.593116500466288e-07, + "loss": 0.0324, + "step": 188390 + }, + { + "epoch": 0.212, + "grad_norm": 0.027125053107738495, + "learning_rate": 4.5852325068464206e-07, + "loss": 0.0326, + "step": 188400 + }, + { + "epoch": 0.21205, + "grad_norm": 0.030318230390548706, + "learning_rate": 4.57735522277411e-07, + "loss": 0.0342, + "step": 188410 + }, + { + "epoch": 0.2121, + "grad_norm": 0.02977856434881687, + "learning_rate": 4.569484648464711e-07, + "loss": 0.0317, + "step": 188420 + }, + { + "epoch": 0.21215, + "grad_norm": 0.02663765288889408, + "learning_rate": 4.561620784133386e-07, + "loss": 0.0324, + "step": 188430 + }, + { + "epoch": 0.2122, + "grad_norm": 0.029476555064320564, + "learning_rate": 4.553763629995156e-07, + "loss": 0.0333, + "step": 188440 + }, + { + "epoch": 0.21225, + "grad_norm": 0.031408656388521194, + "learning_rate": 4.545913186264794e-07, + "loss": 0.0315, + "step": 188450 + }, + { + "epoch": 0.2123, + "grad_norm": 0.0317244715988636, + "learning_rate": 4.538069453156962e-07, + "loss": 0.0342, + "step": 188460 + }, + { + "epoch": 0.21235, + "grad_norm": 0.030181189998984337, + "learning_rate": 4.5302324308861275e-07, + "loss": 0.0344, + "step": 188470 + }, + { + "epoch": 0.2124, + "grad_norm": 0.029961835592985153, + "learning_rate": 4.5224021196664803e-07, + "loss": 0.0331, + "step": 188480 + }, + { + "epoch": 0.21245, + "grad_norm": 0.027818555012345314, + "learning_rate": 4.5145785197121537e-07, + "loss": 0.0335, + "step": 188490 + }, + { + "epoch": 0.2125, + "grad_norm": 0.03190042823553085, + "learning_rate": 4.5067616312370055e-07, + "loss": 0.0355, + "step": 188500 + }, + { + "epoch": 0.21255, + "grad_norm": 0.035076189786195755, + "learning_rate": 4.4989514544547807e-07, + "loss": 0.0343, + "step": 188510 + }, + { + "epoch": 0.2126, + "grad_norm": 0.03411129489541054, + "learning_rate": 4.491147989579003e-07, + "loss": 0.0335, + "step": 188520 + }, + { + "epoch": 0.21265, + "grad_norm": 0.028793184086680412, + "learning_rate": 4.483351236823002e-07, + "loss": 0.0332, + "step": 188530 + }, + { + "epoch": 0.2127, + "grad_norm": 0.02898499369621277, + "learning_rate": 4.4755611963999414e-07, + "loss": 0.0327, + "step": 188540 + }, + { + "epoch": 0.21275, + "grad_norm": 0.027540000155568123, + "learning_rate": 4.467777868522788e-07, + "loss": 0.0332, + "step": 188550 + }, + { + "epoch": 0.2128, + "grad_norm": 0.031743086874485016, + "learning_rate": 4.4600012534043723e-07, + "loss": 0.0379, + "step": 188560 + }, + { + "epoch": 0.21285, + "grad_norm": 0.030596960335969925, + "learning_rate": 4.452231351257247e-07, + "loss": 0.0338, + "step": 188570 + }, + { + "epoch": 0.2129, + "grad_norm": 0.030939722433686256, + "learning_rate": 4.44446816229388e-07, + "loss": 0.0333, + "step": 188580 + }, + { + "epoch": 0.21295, + "grad_norm": 0.028294682502746582, + "learning_rate": 4.4367116867264914e-07, + "loss": 0.0331, + "step": 188590 + }, + { + "epoch": 0.213, + "grad_norm": 0.03085498698055744, + "learning_rate": 4.4289619247671886e-07, + "loss": 0.033, + "step": 188600 + }, + { + "epoch": 0.21305, + "grad_norm": 0.03062901645898819, + "learning_rate": 4.421218876627775e-07, + "loss": 0.0327, + "step": 188610 + }, + { + "epoch": 0.2131, + "grad_norm": 0.03261803835630417, + "learning_rate": 4.41348254251997e-07, + "loss": 0.0325, + "step": 188620 + }, + { + "epoch": 0.21315, + "grad_norm": 0.03610049933195114, + "learning_rate": 4.4057529226552986e-07, + "loss": 0.0346, + "step": 188630 + }, + { + "epoch": 0.2132, + "grad_norm": 0.026045652106404305, + "learning_rate": 4.3980300172450914e-07, + "loss": 0.0335, + "step": 188640 + }, + { + "epoch": 0.21325, + "grad_norm": 0.027834657579660416, + "learning_rate": 4.390313826500486e-07, + "loss": 0.0329, + "step": 188650 + }, + { + "epoch": 0.2133, + "grad_norm": 0.03452204167842865, + "learning_rate": 4.3826043506323964e-07, + "loss": 0.0342, + "step": 188660 + }, + { + "epoch": 0.21335, + "grad_norm": 0.028179027140140533, + "learning_rate": 4.374901589851654e-07, + "loss": 0.0342, + "step": 188670 + }, + { + "epoch": 0.2134, + "grad_norm": 0.03150675445795059, + "learning_rate": 4.3672055443688134e-07, + "loss": 0.0325, + "step": 188680 + }, + { + "epoch": 0.21345, + "grad_norm": 0.028220821171998978, + "learning_rate": 4.359516214394288e-07, + "loss": 0.0325, + "step": 188690 + }, + { + "epoch": 0.2135, + "grad_norm": 0.029086345806717873, + "learning_rate": 4.3518336001382995e-07, + "loss": 0.0329, + "step": 188700 + }, + { + "epoch": 0.21355, + "grad_norm": 0.029618870466947556, + "learning_rate": 4.3441577018109025e-07, + "loss": 0.0339, + "step": 188710 + }, + { + "epoch": 0.2136, + "grad_norm": 0.02819829247891903, + "learning_rate": 4.3364885196219564e-07, + "loss": 0.033, + "step": 188720 + }, + { + "epoch": 0.21365, + "grad_norm": 0.026339873671531677, + "learning_rate": 4.3288260537811267e-07, + "loss": 0.0328, + "step": 188730 + }, + { + "epoch": 0.2137, + "grad_norm": 0.03084726259112358, + "learning_rate": 4.321170304497885e-07, + "loss": 0.0322, + "step": 188740 + }, + { + "epoch": 0.21375, + "grad_norm": 0.028051769360899925, + "learning_rate": 4.313521271981563e-07, + "loss": 0.0325, + "step": 188750 + }, + { + "epoch": 0.2138, + "grad_norm": 0.03308770805597305, + "learning_rate": 4.3058789564412724e-07, + "loss": 0.0319, + "step": 188760 + }, + { + "epoch": 0.21385, + "grad_norm": 0.028255965560674667, + "learning_rate": 4.298243358085929e-07, + "loss": 0.0322, + "step": 188770 + }, + { + "epoch": 0.2139, + "grad_norm": 0.030764950439333916, + "learning_rate": 4.2906144771243106e-07, + "loss": 0.0326, + "step": 188780 + }, + { + "epoch": 0.21395, + "grad_norm": 0.027962734922766685, + "learning_rate": 4.2829923137649996e-07, + "loss": 0.0319, + "step": 188790 + }, + { + "epoch": 0.214, + "grad_norm": 0.03067231923341751, + "learning_rate": 4.2753768682163297e-07, + "loss": 0.0328, + "step": 188800 + }, + { + "epoch": 0.21405, + "grad_norm": 0.031174374744296074, + "learning_rate": 4.267768140686579e-07, + "loss": 0.0323, + "step": 188810 + }, + { + "epoch": 0.2141, + "grad_norm": 0.02567133493721485, + "learning_rate": 4.2601661313837193e-07, + "loss": 0.0337, + "step": 188820 + }, + { + "epoch": 0.21415, + "grad_norm": 0.02617892250418663, + "learning_rate": 4.252570840515585e-07, + "loss": 0.0328, + "step": 188830 + }, + { + "epoch": 0.2142, + "grad_norm": 0.02718042954802513, + "learning_rate": 4.244982268289843e-07, + "loss": 0.0325, + "step": 188840 + }, + { + "epoch": 0.21425, + "grad_norm": 0.027024347335100174, + "learning_rate": 4.2374004149139944e-07, + "loss": 0.0331, + "step": 188850 + }, + { + "epoch": 0.2143, + "grad_norm": 0.023870961740612984, + "learning_rate": 4.229825280595262e-07, + "loss": 0.0316, + "step": 188860 + }, + { + "epoch": 0.21435, + "grad_norm": 0.030253371223807335, + "learning_rate": 4.222256865540758e-07, + "loss": 0.0346, + "step": 188870 + }, + { + "epoch": 0.2144, + "grad_norm": 0.029407694935798645, + "learning_rate": 4.214695169957428e-07, + "loss": 0.0311, + "step": 188880 + }, + { + "epoch": 0.21445, + "grad_norm": 0.029314350336790085, + "learning_rate": 4.207140194052023e-07, + "loss": 0.0318, + "step": 188890 + }, + { + "epoch": 0.2145, + "grad_norm": 0.02597719244658947, + "learning_rate": 4.1995919380310444e-07, + "loss": 0.033, + "step": 188900 + }, + { + "epoch": 0.21455, + "grad_norm": 0.027614757418632507, + "learning_rate": 4.192050402100883e-07, + "loss": 0.0322, + "step": 188910 + }, + { + "epoch": 0.2146, + "grad_norm": 0.027147000655531883, + "learning_rate": 4.184515586467708e-07, + "loss": 0.0316, + "step": 188920 + }, + { + "epoch": 0.21465, + "grad_norm": 0.02751532383263111, + "learning_rate": 4.1769874913375196e-07, + "loss": 0.0319, + "step": 188930 + }, + { + "epoch": 0.2147, + "grad_norm": 0.024400847032666206, + "learning_rate": 4.169466116916182e-07, + "loss": 0.0338, + "step": 188940 + }, + { + "epoch": 0.21475, + "grad_norm": 0.029047008603811264, + "learning_rate": 4.1619514634092526e-07, + "loss": 0.0316, + "step": 188950 + }, + { + "epoch": 0.2148, + "grad_norm": 0.02734926901757717, + "learning_rate": 4.154443531022206e-07, + "loss": 0.0376, + "step": 188960 + }, + { + "epoch": 0.21485, + "grad_norm": 0.025272736325860023, + "learning_rate": 4.1469423199603505e-07, + "loss": 0.0319, + "step": 188970 + }, + { + "epoch": 0.2149, + "grad_norm": 0.02764223702251911, + "learning_rate": 4.1394478304287167e-07, + "loss": 0.0322, + "step": 188980 + }, + { + "epoch": 0.21495, + "grad_norm": 0.029302928596735, + "learning_rate": 4.1319600626321955e-07, + "loss": 0.0319, + "step": 188990 + }, + { + "epoch": 0.215, + "grad_norm": 0.025961730629205704, + "learning_rate": 4.124479016775512e-07, + "loss": 0.032, + "step": 189000 + }, + { + "epoch": 0.21505, + "grad_norm": 0.03143499791622162, + "learning_rate": 4.1170046930632255e-07, + "loss": 0.0312, + "step": 189010 + }, + { + "epoch": 0.2151, + "grad_norm": 0.030058082193136215, + "learning_rate": 4.1095370916996443e-07, + "loss": 0.0323, + "step": 189020 + }, + { + "epoch": 0.21515, + "grad_norm": 0.026301292702555656, + "learning_rate": 4.102076212888939e-07, + "loss": 0.0321, + "step": 189030 + }, + { + "epoch": 0.2152, + "grad_norm": 0.027706298977136612, + "learning_rate": 4.0946220568350844e-07, + "loss": 0.0327, + "step": 189040 + }, + { + "epoch": 0.21525, + "grad_norm": 0.02647777646780014, + "learning_rate": 4.087174623741918e-07, + "loss": 0.032, + "step": 189050 + }, + { + "epoch": 0.2153, + "grad_norm": 0.03239917755126953, + "learning_rate": 4.079733913812972e-07, + "loss": 0.0317, + "step": 189060 + }, + { + "epoch": 0.21535, + "grad_norm": 0.026101669296622276, + "learning_rate": 4.0722999272517217e-07, + "loss": 0.0314, + "step": 189070 + }, + { + "epoch": 0.2154, + "grad_norm": 0.0336785614490509, + "learning_rate": 4.064872664261421e-07, + "loss": 0.0316, + "step": 189080 + }, + { + "epoch": 0.21545, + "grad_norm": 0.02944769896566868, + "learning_rate": 4.057452125045075e-07, + "loss": 0.032, + "step": 189090 + }, + { + "epoch": 0.2155, + "grad_norm": 0.028852440416812897, + "learning_rate": 4.0500383098056315e-07, + "loss": 0.0309, + "step": 189100 + }, + { + "epoch": 0.21555, + "grad_norm": 0.026862692087888718, + "learning_rate": 4.042631218745707e-07, + "loss": 0.0325, + "step": 189110 + }, + { + "epoch": 0.2156, + "grad_norm": 0.02777714654803276, + "learning_rate": 4.035230852067862e-07, + "loss": 0.0326, + "step": 189120 + }, + { + "epoch": 0.21565, + "grad_norm": 0.026612672954797745, + "learning_rate": 4.0278372099744054e-07, + "loss": 0.0312, + "step": 189130 + }, + { + "epoch": 0.2157, + "grad_norm": 0.032957401126623154, + "learning_rate": 4.020450292667455e-07, + "loss": 0.0319, + "step": 189140 + }, + { + "epoch": 0.21575, + "grad_norm": 0.028477706015110016, + "learning_rate": 4.013070100348987e-07, + "loss": 0.033, + "step": 189150 + }, + { + "epoch": 0.2158, + "grad_norm": 0.027616405859589577, + "learning_rate": 4.0056966332207844e-07, + "loss": 0.0309, + "step": 189160 + }, + { + "epoch": 0.21585, + "grad_norm": 0.028580589219927788, + "learning_rate": 3.9983298914844093e-07, + "loss": 0.0317, + "step": 189170 + }, + { + "epoch": 0.2159, + "grad_norm": 0.03125343471765518, + "learning_rate": 3.990969875341283e-07, + "loss": 0.0318, + "step": 189180 + }, + { + "epoch": 0.21595, + "grad_norm": 0.028869450092315674, + "learning_rate": 3.983616584992578e-07, + "loss": 0.0316, + "step": 189190 + }, + { + "epoch": 0.216, + "grad_norm": 0.028730357065796852, + "learning_rate": 3.9762700206394387e-07, + "loss": 0.0341, + "step": 189200 + }, + { + "epoch": 0.21605, + "grad_norm": 0.03006094880402088, + "learning_rate": 3.968930182482594e-07, + "loss": 0.0327, + "step": 189210 + }, + { + "epoch": 0.2161, + "grad_norm": 0.03059045970439911, + "learning_rate": 3.9615970707228e-07, + "loss": 0.0309, + "step": 189220 + }, + { + "epoch": 0.21615, + "grad_norm": 0.028501491993665695, + "learning_rate": 3.954270685560507e-07, + "loss": 0.0327, + "step": 189230 + }, + { + "epoch": 0.2162, + "grad_norm": 0.027690019458532333, + "learning_rate": 3.9469510271960274e-07, + "loss": 0.0315, + "step": 189240 + }, + { + "epoch": 0.21625, + "grad_norm": 0.026815390214323997, + "learning_rate": 3.9396380958294233e-07, + "loss": 0.0341, + "step": 189250 + }, + { + "epoch": 0.2163, + "grad_norm": 0.028432907536625862, + "learning_rate": 3.932331891660701e-07, + "loss": 0.0314, + "step": 189260 + }, + { + "epoch": 0.21635, + "grad_norm": 0.02925742045044899, + "learning_rate": 3.925032414889618e-07, + "loss": 0.0336, + "step": 189270 + }, + { + "epoch": 0.2164, + "grad_norm": 0.02542842924594879, + "learning_rate": 3.917739665715653e-07, + "loss": 0.036, + "step": 189280 + }, + { + "epoch": 0.21645, + "grad_norm": 0.02953684702515602, + "learning_rate": 3.910453644338258e-07, + "loss": 0.0329, + "step": 189290 + }, + { + "epoch": 0.2165, + "grad_norm": 0.02706819586455822, + "learning_rate": 3.903174350956579e-07, + "loss": 0.0339, + "step": 189300 + }, + { + "epoch": 0.21655, + "grad_norm": 0.02872304804623127, + "learning_rate": 3.895901785769707e-07, + "loss": 0.0324, + "step": 189310 + }, + { + "epoch": 0.2166, + "grad_norm": 0.030689561739563942, + "learning_rate": 3.888635948976399e-07, + "loss": 0.0344, + "step": 189320 + }, + { + "epoch": 0.21665, + "grad_norm": 0.028889385983347893, + "learning_rate": 3.8813768407753025e-07, + "loss": 0.033, + "step": 189330 + }, + { + "epoch": 0.2167, + "grad_norm": 0.03080740012228489, + "learning_rate": 3.874124461364925e-07, + "loss": 0.0323, + "step": 189340 + }, + { + "epoch": 0.21675, + "grad_norm": 0.028255589306354523, + "learning_rate": 3.866878810943525e-07, + "loss": 0.0322, + "step": 189350 + }, + { + "epoch": 0.2168, + "grad_norm": 0.030432872474193573, + "learning_rate": 3.8596398897091936e-07, + "loss": 0.0334, + "step": 189360 + }, + { + "epoch": 0.21685, + "grad_norm": 0.027476327493786812, + "learning_rate": 3.852407697859828e-07, + "loss": 0.0324, + "step": 189370 + }, + { + "epoch": 0.2169, + "grad_norm": 0.028619499877095222, + "learning_rate": 3.8451822355931313e-07, + "loss": 0.0333, + "step": 189380 + }, + { + "epoch": 0.21695, + "grad_norm": 0.02922292985022068, + "learning_rate": 3.837963503106723e-07, + "loss": 0.0315, + "step": 189390 + }, + { + "epoch": 0.217, + "grad_norm": 0.03195172920823097, + "learning_rate": 3.83075150059789e-07, + "loss": 0.0314, + "step": 189400 + }, + { + "epoch": 0.21705, + "grad_norm": 0.028518592938780785, + "learning_rate": 3.8235462282638357e-07, + "loss": 0.032, + "step": 189410 + }, + { + "epoch": 0.2171, + "grad_norm": 0.02795417048037052, + "learning_rate": 3.816347686301541e-07, + "loss": 0.0319, + "step": 189420 + }, + { + "epoch": 0.21715, + "grad_norm": 0.02762647718191147, + "learning_rate": 3.8091558749078214e-07, + "loss": 0.032, + "step": 189430 + }, + { + "epoch": 0.2172, + "grad_norm": 0.02550506219267845, + "learning_rate": 3.801970794279297e-07, + "loss": 0.0325, + "step": 189440 + }, + { + "epoch": 0.21725, + "grad_norm": 0.028216617181897163, + "learning_rate": 3.794792444612366e-07, + "loss": 0.032, + "step": 189450 + }, + { + "epoch": 0.2173, + "grad_norm": 0.028356458991765976, + "learning_rate": 3.7876208261033443e-07, + "loss": 0.0321, + "step": 189460 + }, + { + "epoch": 0.21735, + "grad_norm": 0.029298197478055954, + "learning_rate": 3.7804559389482686e-07, + "loss": 0.0334, + "step": 189470 + }, + { + "epoch": 0.2174, + "grad_norm": 0.02883121185004711, + "learning_rate": 3.7732977833430383e-07, + "loss": 0.0319, + "step": 189480 + }, + { + "epoch": 0.21745, + "grad_norm": 0.03034260682761669, + "learning_rate": 3.7661463594833027e-07, + "loss": 0.0336, + "step": 189490 + }, + { + "epoch": 0.2175, + "grad_norm": 0.03522493317723274, + "learning_rate": 3.759001667564654e-07, + "loss": 0.0354, + "step": 189500 + }, + { + "epoch": 0.21755, + "grad_norm": 0.029949499294161797, + "learning_rate": 3.751863707782383e-07, + "loss": 0.0342, + "step": 189510 + }, + { + "epoch": 0.2176, + "grad_norm": 0.031180361285805702, + "learning_rate": 3.7447324803316364e-07, + "loss": 0.0329, + "step": 189520 + }, + { + "epoch": 0.21765, + "grad_norm": 0.03215509653091431, + "learning_rate": 3.737607985407426e-07, + "loss": 0.0336, + "step": 189530 + }, + { + "epoch": 0.2177, + "grad_norm": 0.027371907606720924, + "learning_rate": 3.730490223204458e-07, + "loss": 0.0333, + "step": 189540 + }, + { + "epoch": 0.21775, + "grad_norm": 0.030672768130898476, + "learning_rate": 3.723379193917381e-07, + "loss": 0.0346, + "step": 189550 + }, + { + "epoch": 0.2178, + "grad_norm": 0.027337929233908653, + "learning_rate": 3.7162748977405957e-07, + "loss": 0.0336, + "step": 189560 + }, + { + "epoch": 0.21785, + "grad_norm": 0.03581344336271286, + "learning_rate": 3.709177334868308e-07, + "loss": 0.0332, + "step": 189570 + }, + { + "epoch": 0.2179, + "grad_norm": 0.030147524550557137, + "learning_rate": 3.7020865054946117e-07, + "loss": 0.0333, + "step": 189580 + }, + { + "epoch": 0.21795, + "grad_norm": 0.030416414141654968, + "learning_rate": 3.695002409813325e-07, + "loss": 0.0343, + "step": 189590 + }, + { + "epoch": 0.218, + "grad_norm": 0.030246717855334282, + "learning_rate": 3.6879250480181816e-07, + "loss": 0.0336, + "step": 189600 + }, + { + "epoch": 0.21805, + "grad_norm": 0.028833702206611633, + "learning_rate": 3.680854420302582e-07, + "loss": 0.0339, + "step": 189610 + }, + { + "epoch": 0.2181, + "grad_norm": 0.02759072370827198, + "learning_rate": 3.673790526859899e-07, + "loss": 0.0327, + "step": 189620 + }, + { + "epoch": 0.21815, + "grad_norm": 0.027316806837916374, + "learning_rate": 3.6667333678832294e-07, + "loss": 0.0345, + "step": 189630 + }, + { + "epoch": 0.2182, + "grad_norm": 0.03333732485771179, + "learning_rate": 3.6596829435655565e-07, + "loss": 0.0339, + "step": 189640 + }, + { + "epoch": 0.21825, + "grad_norm": 0.029696255922317505, + "learning_rate": 3.652639254099616e-07, + "loss": 0.0323, + "step": 189650 + }, + { + "epoch": 0.2183, + "grad_norm": 0.031996987760066986, + "learning_rate": 3.645602299677919e-07, + "loss": 0.0331, + "step": 189660 + }, + { + "epoch": 0.21835, + "grad_norm": 0.030974699184298515, + "learning_rate": 3.638572080492952e-07, + "loss": 0.0323, + "step": 189670 + }, + { + "epoch": 0.2184, + "grad_norm": 0.02718987688422203, + "learning_rate": 3.631548596736839e-07, + "loss": 0.0327, + "step": 189680 + }, + { + "epoch": 0.21845, + "grad_norm": 0.03376190364360809, + "learning_rate": 3.624531848601648e-07, + "loss": 0.0334, + "step": 189690 + }, + { + "epoch": 0.2185, + "grad_norm": 0.029311642050743103, + "learning_rate": 3.6175218362791976e-07, + "loss": 0.0319, + "step": 189700 + }, + { + "epoch": 0.21855, + "grad_norm": 0.02694571390748024, + "learning_rate": 3.6105185599611125e-07, + "loss": 0.0305, + "step": 189710 + }, + { + "epoch": 0.2186, + "grad_norm": 0.03201557695865631, + "learning_rate": 3.603522019838906e-07, + "loss": 0.0322, + "step": 189720 + }, + { + "epoch": 0.21865, + "grad_norm": 0.028358617797493935, + "learning_rate": 3.596532216103843e-07, + "loss": 0.0321, + "step": 189730 + }, + { + "epoch": 0.2187, + "grad_norm": 0.029142500832676888, + "learning_rate": 3.589549148947019e-07, + "loss": 0.0307, + "step": 189740 + }, + { + "epoch": 0.21875, + "grad_norm": 0.02805599942803383, + "learning_rate": 3.5825728185593374e-07, + "loss": 0.0313, + "step": 189750 + }, + { + "epoch": 0.2188, + "grad_norm": 0.02762032300233841, + "learning_rate": 3.575603225131563e-07, + "loss": 0.0319, + "step": 189760 + }, + { + "epoch": 0.21885, + "grad_norm": 0.02900952659547329, + "learning_rate": 3.568640368854209e-07, + "loss": 0.0328, + "step": 189770 + }, + { + "epoch": 0.2189, + "grad_norm": 0.02901539020240307, + "learning_rate": 3.561684249917652e-07, + "loss": 0.032, + "step": 189780 + }, + { + "epoch": 0.21895, + "grad_norm": 0.03188694640994072, + "learning_rate": 3.554734868512044e-07, + "loss": 0.0331, + "step": 189790 + }, + { + "epoch": 0.219, + "grad_norm": 0.028981653973460197, + "learning_rate": 3.547792224827401e-07, + "loss": 0.0328, + "step": 189800 + }, + { + "epoch": 0.21905, + "grad_norm": 0.02720548026263714, + "learning_rate": 3.5408563190535704e-07, + "loss": 0.0338, + "step": 189810 + }, + { + "epoch": 0.2191, + "grad_norm": 0.028782818466424942, + "learning_rate": 3.5339271513800953e-07, + "loss": 0.0324, + "step": 189820 + }, + { + "epoch": 0.21915, + "grad_norm": 0.034867823123931885, + "learning_rate": 3.527004721996463e-07, + "loss": 0.0333, + "step": 189830 + }, + { + "epoch": 0.2192, + "grad_norm": 0.027969233691692352, + "learning_rate": 3.5200890310919385e-07, + "loss": 0.0328, + "step": 189840 + }, + { + "epoch": 0.21925, + "grad_norm": 0.026636585593223572, + "learning_rate": 3.5131800788555936e-07, + "loss": 0.0338, + "step": 189850 + }, + { + "epoch": 0.2193, + "grad_norm": 0.027630211785435677, + "learning_rate": 3.506277865476304e-07, + "loss": 0.0325, + "step": 189860 + }, + { + "epoch": 0.21935, + "grad_norm": 0.026662291958928108, + "learning_rate": 3.4993823911427527e-07, + "loss": 0.0322, + "step": 189870 + }, + { + "epoch": 0.2194, + "grad_norm": 0.02801119163632393, + "learning_rate": 3.492493656043483e-07, + "loss": 0.0348, + "step": 189880 + }, + { + "epoch": 0.21945, + "grad_norm": 0.029345333576202393, + "learning_rate": 3.485611660366844e-07, + "loss": 0.0323, + "step": 189890 + }, + { + "epoch": 0.2195, + "grad_norm": 0.028748812153935432, + "learning_rate": 3.478736404300964e-07, + "loss": 0.0336, + "step": 189900 + }, + { + "epoch": 0.21955, + "grad_norm": 0.028934547677636147, + "learning_rate": 3.471867888033803e-07, + "loss": 0.0308, + "step": 189910 + }, + { + "epoch": 0.2196, + "grad_norm": 0.026555748656392097, + "learning_rate": 3.465006111753155e-07, + "loss": 0.0317, + "step": 189920 + }, + { + "epoch": 0.21965, + "grad_norm": 0.027853377163410187, + "learning_rate": 3.458151075646648e-07, + "loss": 0.0322, + "step": 189930 + }, + { + "epoch": 0.2197, + "grad_norm": 0.028755880892276764, + "learning_rate": 3.451302779901661e-07, + "loss": 0.0345, + "step": 189940 + }, + { + "epoch": 0.21975, + "grad_norm": 0.025807704776525497, + "learning_rate": 3.444461224705431e-07, + "loss": 0.0323, + "step": 189950 + }, + { + "epoch": 0.2198, + "grad_norm": 0.029136665165424347, + "learning_rate": 3.437626410245004e-07, + "loss": 0.0344, + "step": 189960 + }, + { + "epoch": 0.21985, + "grad_norm": 0.02834586426615715, + "learning_rate": 3.4307983367072304e-07, + "loss": 0.0321, + "step": 189970 + }, + { + "epoch": 0.2199, + "grad_norm": 0.028064792975783348, + "learning_rate": 3.423977004278822e-07, + "loss": 0.0321, + "step": 189980 + }, + { + "epoch": 0.21995, + "grad_norm": 0.025715850293636322, + "learning_rate": 3.417162413146213e-07, + "loss": 0.0322, + "step": 189990 + }, + { + "epoch": 0.22, + "grad_norm": 0.026166275143623352, + "learning_rate": 3.4103545634957825e-07, + "loss": 0.0315, + "step": 190000 + }, + { + "epoch": 0.22005, + "grad_norm": 0.02789056859910488, + "learning_rate": 3.4035534555135753e-07, + "loss": 0.0321, + "step": 190010 + }, + { + "epoch": 0.2201, + "grad_norm": 0.02760968543589115, + "learning_rate": 3.3967590893856096e-07, + "loss": 0.0325, + "step": 190020 + }, + { + "epoch": 0.22015, + "grad_norm": 0.030100341886281967, + "learning_rate": 3.389971465297598e-07, + "loss": 0.0321, + "step": 190030 + }, + { + "epoch": 0.2202, + "grad_norm": 0.027394311502575874, + "learning_rate": 3.383190583435114e-07, + "loss": 0.0317, + "step": 190040 + }, + { + "epoch": 0.22025, + "grad_norm": 0.031111113727092743, + "learning_rate": 3.3764164439835656e-07, + "loss": 0.0336, + "step": 190050 + }, + { + "epoch": 0.2203, + "grad_norm": 0.032798249274492264, + "learning_rate": 3.369649047128137e-07, + "loss": 0.0323, + "step": 190060 + }, + { + "epoch": 0.22035, + "grad_norm": 0.025414522737264633, + "learning_rate": 3.362888393053848e-07, + "loss": 0.0336, + "step": 190070 + }, + { + "epoch": 0.2204, + "grad_norm": 0.03104369156062603, + "learning_rate": 3.3561344819455223e-07, + "loss": 0.0341, + "step": 190080 + }, + { + "epoch": 0.22045, + "grad_norm": 0.027488548308610916, + "learning_rate": 3.3493873139878174e-07, + "loss": 0.032, + "step": 190090 + }, + { + "epoch": 0.2205, + "grad_norm": 0.026049386709928513, + "learning_rate": 3.342646889365225e-07, + "loss": 0.0333, + "step": 190100 + }, + { + "epoch": 0.22055, + "grad_norm": 0.02731347270309925, + "learning_rate": 3.335913208261959e-07, + "loss": 0.0328, + "step": 190110 + }, + { + "epoch": 0.2206, + "grad_norm": 0.027487874031066895, + "learning_rate": 3.329186270862206e-07, + "loss": 0.0324, + "step": 190120 + }, + { + "epoch": 0.22065, + "grad_norm": 0.030462585389614105, + "learning_rate": 3.3224660773497896e-07, + "loss": 0.0331, + "step": 190130 + }, + { + "epoch": 0.2207, + "grad_norm": 0.031835246831178665, + "learning_rate": 3.315752627908508e-07, + "loss": 0.0335, + "step": 190140 + }, + { + "epoch": 0.22075, + "grad_norm": 0.029755624011158943, + "learning_rate": 3.309045922721854e-07, + "loss": 0.032, + "step": 190150 + }, + { + "epoch": 0.2208, + "grad_norm": 0.031246056780219078, + "learning_rate": 3.302345961973208e-07, + "loss": 0.0351, + "step": 190160 + }, + { + "epoch": 0.22085, + "grad_norm": 0.030124295502901077, + "learning_rate": 3.295652745845756e-07, + "loss": 0.0352, + "step": 190170 + }, + { + "epoch": 0.2209, + "grad_norm": 0.030465351417660713, + "learning_rate": 3.2889662745224924e-07, + "loss": 0.0336, + "step": 190180 + }, + { + "epoch": 0.22095, + "grad_norm": 0.029054658487439156, + "learning_rate": 3.2822865481861865e-07, + "loss": 0.0333, + "step": 190190 + }, + { + "epoch": 0.221, + "grad_norm": 0.023691993206739426, + "learning_rate": 3.27561356701947e-07, + "loss": 0.033, + "step": 190200 + }, + { + "epoch": 0.22105, + "grad_norm": 0.03325042873620987, + "learning_rate": 3.2689473312047805e-07, + "loss": 0.0331, + "step": 190210 + }, + { + "epoch": 0.2211, + "grad_norm": 0.03363754227757454, + "learning_rate": 3.2622878409243884e-07, + "loss": 0.0332, + "step": 190220 + }, + { + "epoch": 0.22115, + "grad_norm": 0.028941981494426727, + "learning_rate": 3.255635096360371e-07, + "loss": 0.0322, + "step": 190230 + }, + { + "epoch": 0.2212, + "grad_norm": 0.030986396595835686, + "learning_rate": 3.2489890976945825e-07, + "loss": 0.0335, + "step": 190240 + }, + { + "epoch": 0.22125, + "grad_norm": 0.028466373682022095, + "learning_rate": 3.242349845108711e-07, + "loss": 0.034, + "step": 190250 + }, + { + "epoch": 0.2213, + "grad_norm": 0.02595651149749756, + "learning_rate": 3.235717338784333e-07, + "loss": 0.0323, + "step": 190260 + }, + { + "epoch": 0.22135, + "grad_norm": 0.026936156675219536, + "learning_rate": 3.229091578902693e-07, + "loss": 0.0311, + "step": 190270 + }, + { + "epoch": 0.2214, + "grad_norm": 0.028447262942790985, + "learning_rate": 3.2224725656450073e-07, + "loss": 0.0333, + "step": 190280 + }, + { + "epoch": 0.22145, + "grad_norm": 0.033287130296230316, + "learning_rate": 3.215860299192214e-07, + "loss": 0.0317, + "step": 190290 + }, + { + "epoch": 0.2215, + "grad_norm": 0.03119870461523533, + "learning_rate": 3.209254779725057e-07, + "loss": 0.0323, + "step": 190300 + }, + { + "epoch": 0.22155, + "grad_norm": 0.027340726926922798, + "learning_rate": 3.202656007424226e-07, + "loss": 0.0321, + "step": 190310 + }, + { + "epoch": 0.2216, + "grad_norm": 0.028995640575885773, + "learning_rate": 3.1960639824699936e-07, + "loss": 0.0353, + "step": 190320 + }, + { + "epoch": 0.22165, + "grad_norm": 0.028964687138795853, + "learning_rate": 3.189478705042659e-07, + "loss": 0.0307, + "step": 190330 + }, + { + "epoch": 0.2217, + "grad_norm": 0.03010116145014763, + "learning_rate": 3.1829001753223006e-07, + "loss": 0.032, + "step": 190340 + }, + { + "epoch": 0.22175, + "grad_norm": 0.026755282655358315, + "learning_rate": 3.176328393488692e-07, + "loss": 0.033, + "step": 190350 + }, + { + "epoch": 0.2218, + "grad_norm": 0.02934892289340496, + "learning_rate": 3.1697633597215503e-07, + "loss": 0.0332, + "step": 190360 + }, + { + "epoch": 0.22185, + "grad_norm": 0.027839114889502525, + "learning_rate": 3.1632050742003427e-07, + "loss": 0.0311, + "step": 190370 + }, + { + "epoch": 0.2219, + "grad_norm": 0.024980947375297546, + "learning_rate": 3.15665353710437e-07, + "loss": 0.0327, + "step": 190380 + }, + { + "epoch": 0.22195, + "grad_norm": 0.026112347841262817, + "learning_rate": 3.1501087486127677e-07, + "loss": 0.0317, + "step": 190390 + }, + { + "epoch": 0.222, + "grad_norm": 0.031251177191734314, + "learning_rate": 3.1435707089044474e-07, + "loss": 0.0329, + "step": 190400 + }, + { + "epoch": 0.22205, + "grad_norm": 0.02733309008181095, + "learning_rate": 3.137039418158155e-07, + "loss": 0.0328, + "step": 190410 + }, + { + "epoch": 0.2221, + "grad_norm": 0.030722726136446, + "learning_rate": 3.1305148765524707e-07, + "loss": 0.032, + "step": 190420 + }, + { + "epoch": 0.22215, + "grad_norm": 0.029925871640443802, + "learning_rate": 3.1239970842657783e-07, + "loss": 0.033, + "step": 190430 + }, + { + "epoch": 0.2222, + "grad_norm": 0.02639448270201683, + "learning_rate": 3.1174860414762417e-07, + "loss": 0.0329, + "step": 190440 + }, + { + "epoch": 0.22225, + "grad_norm": 0.03083515726029873, + "learning_rate": 3.110981748361913e-07, + "loss": 0.034, + "step": 190450 + }, + { + "epoch": 0.2223, + "grad_norm": 0.027510441839694977, + "learning_rate": 3.104484205100539e-07, + "loss": 0.0347, + "step": 190460 + }, + { + "epoch": 0.22235, + "grad_norm": 0.033879805356264114, + "learning_rate": 3.097993411869865e-07, + "loss": 0.032, + "step": 190470 + }, + { + "epoch": 0.2224, + "grad_norm": 0.03048882633447647, + "learning_rate": 3.0915093688472787e-07, + "loss": 0.0324, + "step": 190480 + }, + { + "epoch": 0.22245, + "grad_norm": 0.026504499837756157, + "learning_rate": 3.0850320762100536e-07, + "loss": 0.034, + "step": 190490 + }, + { + "epoch": 0.2225, + "grad_norm": 0.02800612710416317, + "learning_rate": 3.0785615341352993e-07, + "loss": 0.0346, + "step": 190500 + }, + { + "epoch": 0.22255, + "grad_norm": 0.02819383330643177, + "learning_rate": 3.072097742799901e-07, + "loss": 0.033, + "step": 190510 + }, + { + "epoch": 0.2226, + "grad_norm": 0.029371775686740875, + "learning_rate": 3.065640702380607e-07, + "loss": 0.0334, + "step": 190520 + }, + { + "epoch": 0.22265, + "grad_norm": 0.03021889552474022, + "learning_rate": 3.059190413053914e-07, + "loss": 0.0327, + "step": 190530 + }, + { + "epoch": 0.2227, + "grad_norm": 0.03006213903427124, + "learning_rate": 3.052746874996154e-07, + "loss": 0.0325, + "step": 190540 + }, + { + "epoch": 0.22275, + "grad_norm": 0.026731068268418312, + "learning_rate": 3.046310088383575e-07, + "loss": 0.0317, + "step": 190550 + }, + { + "epoch": 0.2228, + "grad_norm": 0.028391912579536438, + "learning_rate": 3.0398800533920633e-07, + "loss": 0.033, + "step": 190560 + }, + { + "epoch": 0.22285, + "grad_norm": 0.030019372701644897, + "learning_rate": 3.033456770197479e-07, + "loss": 0.0315, + "step": 190570 + }, + { + "epoch": 0.2229, + "grad_norm": 0.023982156068086624, + "learning_rate": 3.027040238975376e-07, + "loss": 0.0343, + "step": 190580 + }, + { + "epoch": 0.22295, + "grad_norm": 0.03015189617872238, + "learning_rate": 3.0206304599012246e-07, + "loss": 0.0326, + "step": 190590 + }, + { + "epoch": 0.223, + "grad_norm": 0.028634555637836456, + "learning_rate": 3.014227433150274e-07, + "loss": 0.0336, + "step": 190600 + }, + { + "epoch": 0.22305, + "grad_norm": 0.024594226852059364, + "learning_rate": 3.0078311588975225e-07, + "loss": 0.0323, + "step": 190610 + }, + { + "epoch": 0.2231, + "grad_norm": 0.027328481897711754, + "learning_rate": 3.0014416373178866e-07, + "loss": 0.0324, + "step": 190620 + }, + { + "epoch": 0.22315, + "grad_norm": 0.030780017375946045, + "learning_rate": 2.9950588685860317e-07, + "loss": 0.0326, + "step": 190630 + }, + { + "epoch": 0.2232, + "grad_norm": 0.028242947533726692, + "learning_rate": 2.988682852876484e-07, + "loss": 0.034, + "step": 190640 + }, + { + "epoch": 0.22325, + "grad_norm": 0.03165299445390701, + "learning_rate": 2.98231359036355e-07, + "loss": 0.0324, + "step": 190650 + }, + { + "epoch": 0.2233, + "grad_norm": 0.03017549030482769, + "learning_rate": 2.9759510812213676e-07, + "loss": 0.0347, + "step": 190660 + }, + { + "epoch": 0.22335, + "grad_norm": 0.028321314603090286, + "learning_rate": 2.969595325623881e-07, + "loss": 0.0347, + "step": 190670 + }, + { + "epoch": 0.2234, + "grad_norm": 0.03147047013044357, + "learning_rate": 2.963246323744839e-07, + "loss": 0.0335, + "step": 190680 + }, + { + "epoch": 0.22345, + "grad_norm": 0.02470092847943306, + "learning_rate": 2.956904075757855e-07, + "loss": 0.0325, + "step": 190690 + }, + { + "epoch": 0.2235, + "grad_norm": 0.0233890600502491, + "learning_rate": 2.9505685818362884e-07, + "loss": 0.0323, + "step": 190700 + }, + { + "epoch": 0.22355, + "grad_norm": 0.026280872523784637, + "learning_rate": 2.944239842153362e-07, + "loss": 0.0365, + "step": 190710 + }, + { + "epoch": 0.2236, + "grad_norm": 0.03087473474442959, + "learning_rate": 2.937917856882105e-07, + "loss": 0.0326, + "step": 190720 + }, + { + "epoch": 0.22365, + "grad_norm": 0.024799011647701263, + "learning_rate": 2.931602626195351e-07, + "loss": 0.0318, + "step": 190730 + }, + { + "epoch": 0.2237, + "grad_norm": 0.028520086780190468, + "learning_rate": 2.925294150265795e-07, + "loss": 0.0328, + "step": 190740 + }, + { + "epoch": 0.22375, + "grad_norm": 0.031669143587350845, + "learning_rate": 2.9189924292658265e-07, + "loss": 0.0328, + "step": 190750 + }, + { + "epoch": 0.2238, + "grad_norm": 0.03020283207297325, + "learning_rate": 2.9126974633678085e-07, + "loss": 0.0311, + "step": 190760 + }, + { + "epoch": 0.22385, + "grad_norm": 0.02963424101471901, + "learning_rate": 2.906409252743825e-07, + "loss": 0.0311, + "step": 190770 + }, + { + "epoch": 0.2239, + "grad_norm": 0.027471086010336876, + "learning_rate": 2.900127797565766e-07, + "loss": 0.0323, + "step": 190780 + }, + { + "epoch": 0.22395, + "grad_norm": 0.02767067588865757, + "learning_rate": 2.8938530980053836e-07, + "loss": 0.0348, + "step": 190790 + }, + { + "epoch": 0.224, + "grad_norm": 0.02803168073296547, + "learning_rate": 2.8875851542342347e-07, + "loss": 0.0313, + "step": 190800 + }, + { + "epoch": 0.22405, + "grad_norm": 0.02731022797524929, + "learning_rate": 2.881323966423682e-07, + "loss": 0.0323, + "step": 190810 + }, + { + "epoch": 0.2241, + "grad_norm": 0.024566948413848877, + "learning_rate": 2.8750695347448676e-07, + "loss": 0.032, + "step": 190820 + }, + { + "epoch": 0.22415, + "grad_norm": 0.028864016756415367, + "learning_rate": 2.86882185936882e-07, + "loss": 0.0356, + "step": 190830 + }, + { + "epoch": 0.2242, + "grad_norm": 0.02808845229446888, + "learning_rate": 2.862580940466347e-07, + "loss": 0.0334, + "step": 190840 + }, + { + "epoch": 0.22425, + "grad_norm": 0.029763944447040558, + "learning_rate": 2.8563467782080634e-07, + "loss": 0.0338, + "step": 190850 + }, + { + "epoch": 0.2243, + "grad_norm": 0.029420167207717896, + "learning_rate": 2.850119372764415e-07, + "loss": 0.033, + "step": 190860 + }, + { + "epoch": 0.22435, + "grad_norm": 0.029386799782514572, + "learning_rate": 2.843898724305627e-07, + "loss": 0.0331, + "step": 190870 + }, + { + "epoch": 0.2244, + "grad_norm": 0.02391725219786167, + "learning_rate": 2.8376848330018134e-07, + "loss": 0.0327, + "step": 190880 + }, + { + "epoch": 0.22445, + "grad_norm": 0.02884662337601185, + "learning_rate": 2.8314776990228665e-07, + "loss": 0.033, + "step": 190890 + }, + { + "epoch": 0.2245, + "grad_norm": 0.031461041420698166, + "learning_rate": 2.8252773225384276e-07, + "loss": 0.0338, + "step": 190900 + }, + { + "epoch": 0.22455, + "grad_norm": 0.027679523453116417, + "learning_rate": 2.819083703718056e-07, + "loss": 0.0317, + "step": 190910 + }, + { + "epoch": 0.2246, + "grad_norm": 0.027436112985014915, + "learning_rate": 2.812896842731061e-07, + "loss": 0.0322, + "step": 190920 + }, + { + "epoch": 0.22465, + "grad_norm": 0.03533259406685829, + "learning_rate": 2.80671673974664e-07, + "loss": 0.0328, + "step": 190930 + }, + { + "epoch": 0.2247, + "grad_norm": 0.031700585037469864, + "learning_rate": 2.8005433949336857e-07, + "loss": 0.0328, + "step": 190940 + }, + { + "epoch": 0.22475, + "grad_norm": 0.026392122730612755, + "learning_rate": 2.7943768084610356e-07, + "loss": 0.0323, + "step": 190950 + }, + { + "epoch": 0.2248, + "grad_norm": 0.027039388194680214, + "learning_rate": 2.7882169804972213e-07, + "loss": 0.0333, + "step": 190960 + }, + { + "epoch": 0.22485, + "grad_norm": 0.03179239109158516, + "learning_rate": 2.7820639112106916e-07, + "loss": 0.0338, + "step": 190970 + }, + { + "epoch": 0.2249, + "grad_norm": 0.028280850499868393, + "learning_rate": 2.775917600769673e-07, + "loss": 0.0325, + "step": 190980 + }, + { + "epoch": 0.22495, + "grad_norm": 0.029927313327789307, + "learning_rate": 2.76977804934217e-07, + "loss": 0.0327, + "step": 190990 + }, + { + "epoch": 0.225, + "grad_norm": 0.029478423297405243, + "learning_rate": 2.763645257096076e-07, + "loss": 0.0322, + "step": 191000 + }, + { + "epoch": 0.22505, + "grad_norm": 0.025449639186263084, + "learning_rate": 2.7575192241990065e-07, + "loss": 0.0331, + "step": 191010 + }, + { + "epoch": 0.2251, + "grad_norm": 0.02880786545574665, + "learning_rate": 2.751399950818523e-07, + "loss": 0.0335, + "step": 191020 + }, + { + "epoch": 0.22515, + "grad_norm": 0.027858871966600418, + "learning_rate": 2.745287437121824e-07, + "loss": 0.033, + "step": 191030 + }, + { + "epoch": 0.2252, + "grad_norm": 0.030391762033104897, + "learning_rate": 2.73918168327611e-07, + "loss": 0.0315, + "step": 191040 + }, + { + "epoch": 0.22525, + "grad_norm": 0.028253132477402687, + "learning_rate": 2.733082689448274e-07, + "loss": 0.0311, + "step": 191050 + }, + { + "epoch": 0.2253, + "grad_norm": 0.02949906326830387, + "learning_rate": 2.726990455805073e-07, + "loss": 0.0326, + "step": 191060 + }, + { + "epoch": 0.22535, + "grad_norm": 0.02909819222986698, + "learning_rate": 2.7209049825130393e-07, + "loss": 0.033, + "step": 191070 + }, + { + "epoch": 0.2254, + "grad_norm": 0.029378492385149002, + "learning_rate": 2.7148262697385685e-07, + "loss": 0.0325, + "step": 191080 + }, + { + "epoch": 0.22545, + "grad_norm": 0.027026867493987083, + "learning_rate": 2.7087543176478324e-07, + "loss": 0.0324, + "step": 191090 + }, + { + "epoch": 0.2255, + "grad_norm": 0.027683105319738388, + "learning_rate": 2.7026891264068934e-07, + "loss": 0.034, + "step": 191100 + }, + { + "epoch": 0.22555, + "grad_norm": 0.027874600142240524, + "learning_rate": 2.696630696181479e-07, + "loss": 0.0334, + "step": 191110 + }, + { + "epoch": 0.2256, + "grad_norm": 0.02619648166000843, + "learning_rate": 2.690579027137319e-07, + "loss": 0.0333, + "step": 191120 + }, + { + "epoch": 0.22565, + "grad_norm": 0.028711529448628426, + "learning_rate": 2.6845341194397797e-07, + "loss": 0.0319, + "step": 191130 + }, + { + "epoch": 0.2257, + "grad_norm": 0.027037620544433594, + "learning_rate": 2.678495973254175e-07, + "loss": 0.0327, + "step": 191140 + }, + { + "epoch": 0.22575, + "grad_norm": 0.0235857293009758, + "learning_rate": 2.672464588745593e-07, + "loss": 0.0313, + "step": 191150 + }, + { + "epoch": 0.2258, + "grad_norm": 0.027867760509252548, + "learning_rate": 2.666439966078904e-07, + "loss": 0.0325, + "step": 191160 + }, + { + "epoch": 0.22585, + "grad_norm": 0.027430761605501175, + "learning_rate": 2.6604221054188085e-07, + "loss": 0.0347, + "step": 191170 + }, + { + "epoch": 0.2259, + "grad_norm": 0.029700936749577522, + "learning_rate": 2.654411006929897e-07, + "loss": 0.0325, + "step": 191180 + }, + { + "epoch": 0.22595, + "grad_norm": 0.028554104268550873, + "learning_rate": 2.6484066707764266e-07, + "loss": 0.0309, + "step": 191190 + }, + { + "epoch": 0.226, + "grad_norm": 0.027590837329626083, + "learning_rate": 2.6424090971226e-07, + "loss": 0.0331, + "step": 191200 + }, + { + "epoch": 0.22605, + "grad_norm": 0.026260821148753166, + "learning_rate": 2.6364182861323694e-07, + "loss": 0.0317, + "step": 191210 + }, + { + "epoch": 0.2261, + "grad_norm": 0.02568810060620308, + "learning_rate": 2.630434237969548e-07, + "loss": 0.0311, + "step": 191220 + }, + { + "epoch": 0.22615, + "grad_norm": 0.02667641080915928, + "learning_rate": 2.624456952797727e-07, + "loss": 0.0319, + "step": 191230 + }, + { + "epoch": 0.2262, + "grad_norm": 0.026591245085000992, + "learning_rate": 2.6184864307803035e-07, + "loss": 0.032, + "step": 191240 + }, + { + "epoch": 0.22625, + "grad_norm": 0.029083015397191048, + "learning_rate": 2.6125226720805364e-07, + "loss": 0.032, + "step": 191250 + }, + { + "epoch": 0.2263, + "grad_norm": 0.029713058844208717, + "learning_rate": 2.6065656768614613e-07, + "loss": 0.0312, + "step": 191260 + }, + { + "epoch": 0.22635, + "grad_norm": 0.02492072619497776, + "learning_rate": 2.600615445285948e-07, + "loss": 0.0325, + "step": 191270 + }, + { + "epoch": 0.2264, + "grad_norm": 0.028060585260391235, + "learning_rate": 2.5946719775166437e-07, + "loss": 0.0323, + "step": 191280 + }, + { + "epoch": 0.22645, + "grad_norm": 0.025646554306149483, + "learning_rate": 2.5887352737160587e-07, + "loss": 0.033, + "step": 191290 + }, + { + "epoch": 0.2265, + "grad_norm": 0.028695549815893173, + "learning_rate": 2.5828053340465065e-07, + "loss": 0.0337, + "step": 191300 + }, + { + "epoch": 0.22655, + "grad_norm": 0.027269084006547928, + "learning_rate": 2.576882158670135e-07, + "loss": 0.0326, + "step": 191310 + }, + { + "epoch": 0.2266, + "grad_norm": 0.031050991266965866, + "learning_rate": 2.570965747748816e-07, + "loss": 0.0319, + "step": 191320 + }, + { + "epoch": 0.22665, + "grad_norm": 0.028266821056604385, + "learning_rate": 2.565056101444363e-07, + "loss": 0.0332, + "step": 191330 + }, + { + "epoch": 0.2267, + "grad_norm": 0.02769509144127369, + "learning_rate": 2.559153219918287e-07, + "loss": 0.0324, + "step": 191340 + }, + { + "epoch": 0.22675, + "grad_norm": 0.02997642755508423, + "learning_rate": 2.5532571033320407e-07, + "loss": 0.0321, + "step": 191350 + }, + { + "epoch": 0.2268, + "grad_norm": 0.032755471765995026, + "learning_rate": 2.5473677518467745e-07, + "loss": 0.0326, + "step": 191360 + }, + { + "epoch": 0.22685, + "grad_norm": 0.027477024123072624, + "learning_rate": 2.541485165623497e-07, + "loss": 0.0339, + "step": 191370 + }, + { + "epoch": 0.2269, + "grad_norm": 0.028355475515127182, + "learning_rate": 2.5356093448230533e-07, + "loss": 0.0335, + "step": 191380 + }, + { + "epoch": 0.22695, + "grad_norm": 0.0293252132833004, + "learning_rate": 2.529740289606092e-07, + "loss": 0.0337, + "step": 191390 + }, + { + "epoch": 0.227, + "grad_norm": 0.030119840055704117, + "learning_rate": 2.5238780001330674e-07, + "loss": 0.034, + "step": 191400 + }, + { + "epoch": 0.22705, + "grad_norm": 0.02708440274000168, + "learning_rate": 2.5180224765642133e-07, + "loss": 0.0345, + "step": 191410 + }, + { + "epoch": 0.2271, + "grad_norm": 0.03187808766961098, + "learning_rate": 2.5121737190596515e-07, + "loss": 0.0329, + "step": 191420 + }, + { + "epoch": 0.22715, + "grad_norm": 0.026619307696819305, + "learning_rate": 2.50633172777931e-07, + "loss": 0.0333, + "step": 191430 + }, + { + "epoch": 0.2272, + "grad_norm": 0.028194934129714966, + "learning_rate": 2.500496502882893e-07, + "loss": 0.0347, + "step": 191440 + }, + { + "epoch": 0.22725, + "grad_norm": 0.03038596548140049, + "learning_rate": 2.4946680445298853e-07, + "loss": 0.0335, + "step": 191450 + }, + { + "epoch": 0.2273, + "grad_norm": 0.02857760339975357, + "learning_rate": 2.4888463528796867e-07, + "loss": 0.0336, + "step": 191460 + }, + { + "epoch": 0.22735, + "grad_norm": 0.026745213195681572, + "learning_rate": 2.483031428091448e-07, + "loss": 0.0332, + "step": 191470 + }, + { + "epoch": 0.2274, + "grad_norm": 0.026955388486385345, + "learning_rate": 2.477223270324125e-07, + "loss": 0.033, + "step": 191480 + }, + { + "epoch": 0.22745, + "grad_norm": 0.030583078041672707, + "learning_rate": 2.4714218797365354e-07, + "loss": 0.0327, + "step": 191490 + }, + { + "epoch": 0.2275, + "grad_norm": 0.026133380830287933, + "learning_rate": 2.465627256487274e-07, + "loss": 0.0345, + "step": 191500 + }, + { + "epoch": 0.22755, + "grad_norm": 0.02993203140795231, + "learning_rate": 2.45983940073477e-07, + "loss": 0.0331, + "step": 191510 + }, + { + "epoch": 0.2276, + "grad_norm": 0.026168562471866608, + "learning_rate": 2.454058312637286e-07, + "loss": 0.0321, + "step": 191520 + }, + { + "epoch": 0.22765, + "grad_norm": 0.02790575847029686, + "learning_rate": 2.4482839923528343e-07, + "loss": 0.0325, + "step": 191530 + }, + { + "epoch": 0.2277, + "grad_norm": 0.028191763907670975, + "learning_rate": 2.4425164400392607e-07, + "loss": 0.0341, + "step": 191540 + }, + { + "epoch": 0.22775, + "grad_norm": 0.027023592963814735, + "learning_rate": 2.4367556558543283e-07, + "loss": 0.0316, + "step": 191550 + }, + { + "epoch": 0.2278, + "grad_norm": 0.023854469880461693, + "learning_rate": 2.431001639955494e-07, + "loss": 0.0317, + "step": 191560 + }, + { + "epoch": 0.22785, + "grad_norm": 0.028772985562682152, + "learning_rate": 2.425254392500048e-07, + "loss": 0.0334, + "step": 191570 + }, + { + "epoch": 0.2279, + "grad_norm": 0.027192121371626854, + "learning_rate": 2.4195139136451436e-07, + "loss": 0.0333, + "step": 191580 + }, + { + "epoch": 0.22795, + "grad_norm": 0.030033499002456665, + "learning_rate": 2.413780203547711e-07, + "loss": 0.0333, + "step": 191590 + }, + { + "epoch": 0.228, + "grad_norm": 0.02756202407181263, + "learning_rate": 2.4080532623645124e-07, + "loss": 0.0332, + "step": 191600 + }, + { + "epoch": 0.22805, + "grad_norm": 0.028734860941767693, + "learning_rate": 2.402333090252118e-07, + "loss": 0.0329, + "step": 191610 + }, + { + "epoch": 0.2281, + "grad_norm": 0.03284359350800514, + "learning_rate": 2.39661968736693e-07, + "loss": 0.0321, + "step": 191620 + }, + { + "epoch": 0.22815, + "grad_norm": 0.028583502396941185, + "learning_rate": 2.390913053865129e-07, + "loss": 0.0318, + "step": 191630 + }, + { + "epoch": 0.2282, + "grad_norm": 0.029747819527983665, + "learning_rate": 2.3852131899027576e-07, + "loss": 0.0339, + "step": 191640 + }, + { + "epoch": 0.22825, + "grad_norm": 0.030449246987700462, + "learning_rate": 2.3795200956356344e-07, + "loss": 0.032, + "step": 191650 + }, + { + "epoch": 0.2283, + "grad_norm": 0.028850451111793518, + "learning_rate": 2.3738337712194137e-07, + "loss": 0.0317, + "step": 191660 + }, + { + "epoch": 0.22835, + "grad_norm": 0.02912822924554348, + "learning_rate": 2.3681542168095262e-07, + "loss": 0.0323, + "step": 191670 + }, + { + "epoch": 0.2284, + "grad_norm": 0.02906673774123192, + "learning_rate": 2.3624814325612643e-07, + "loss": 0.0339, + "step": 191680 + }, + { + "epoch": 0.22845, + "grad_norm": 0.02683148719370365, + "learning_rate": 2.356815418629754e-07, + "loss": 0.0336, + "step": 191690 + }, + { + "epoch": 0.2285, + "grad_norm": 0.03201993927359581, + "learning_rate": 2.351156175169844e-07, + "loss": 0.0329, + "step": 191700 + }, + { + "epoch": 0.22855, + "grad_norm": 0.028824683278799057, + "learning_rate": 2.345503702336327e-07, + "loss": 0.0342, + "step": 191710 + }, + { + "epoch": 0.2286, + "grad_norm": 0.03363632410764694, + "learning_rate": 2.3398580002836624e-07, + "loss": 0.0342, + "step": 191720 + }, + { + "epoch": 0.22865, + "grad_norm": 0.02888336591422558, + "learning_rate": 2.3342190691662545e-07, + "loss": 0.0323, + "step": 191730 + }, + { + "epoch": 0.2287, + "grad_norm": 0.03185093775391579, + "learning_rate": 2.3285869091382583e-07, + "loss": 0.0327, + "step": 191740 + }, + { + "epoch": 0.22875, + "grad_norm": 0.026476627215743065, + "learning_rate": 2.3229615203536336e-07, + "loss": 0.0363, + "step": 191750 + }, + { + "epoch": 0.2288, + "grad_norm": 0.02874642238020897, + "learning_rate": 2.317342902966202e-07, + "loss": 0.0337, + "step": 191760 + }, + { + "epoch": 0.22885, + "grad_norm": 0.02916344441473484, + "learning_rate": 2.3117310571295625e-07, + "loss": 0.033, + "step": 191770 + }, + { + "epoch": 0.2289, + "grad_norm": 0.0316338911652565, + "learning_rate": 2.306125982997176e-07, + "loss": 0.0352, + "step": 191780 + }, + { + "epoch": 0.22895, + "grad_norm": 0.030737783759832382, + "learning_rate": 2.3005276807221976e-07, + "loss": 0.0348, + "step": 191790 + }, + { + "epoch": 0.229, + "grad_norm": 0.03237530589103699, + "learning_rate": 2.294936150457755e-07, + "loss": 0.0326, + "step": 191800 + }, + { + "epoch": 0.22905, + "grad_norm": 0.02888074517250061, + "learning_rate": 2.2893513923567255e-07, + "loss": 0.0335, + "step": 191810 + }, + { + "epoch": 0.2291, + "grad_norm": 0.02907225489616394, + "learning_rate": 2.2837734065717376e-07, + "loss": 0.0332, + "step": 191820 + }, + { + "epoch": 0.22915, + "grad_norm": 0.024844203144311905, + "learning_rate": 2.2782021932553354e-07, + "loss": 0.0322, + "step": 191830 + }, + { + "epoch": 0.2292, + "grad_norm": 0.02850308269262314, + "learning_rate": 2.2726377525598142e-07, + "loss": 0.0332, + "step": 191840 + }, + { + "epoch": 0.22925, + "grad_norm": 0.027192220091819763, + "learning_rate": 2.2670800846373018e-07, + "loss": 0.0317, + "step": 191850 + }, + { + "epoch": 0.2293, + "grad_norm": 0.02539575845003128, + "learning_rate": 2.2615291896397884e-07, + "loss": 0.0332, + "step": 191860 + }, + { + "epoch": 0.22935, + "grad_norm": 0.027498632669448853, + "learning_rate": 2.2559850677189577e-07, + "loss": 0.0331, + "step": 191870 + }, + { + "epoch": 0.2294, + "grad_norm": 0.028753872960805893, + "learning_rate": 2.2504477190264384e-07, + "loss": 0.0329, + "step": 191880 + }, + { + "epoch": 0.22945, + "grad_norm": 0.028393583372235298, + "learning_rate": 2.2449171437136097e-07, + "loss": 0.0327, + "step": 191890 + }, + { + "epoch": 0.2295, + "grad_norm": 0.028692688792943954, + "learning_rate": 2.239393341931656e-07, + "loss": 0.0325, + "step": 191900 + }, + { + "epoch": 0.22955, + "grad_norm": 0.025882720947265625, + "learning_rate": 2.2338763138315954e-07, + "loss": 0.033, + "step": 191910 + }, + { + "epoch": 0.2296, + "grad_norm": 0.030540117993950844, + "learning_rate": 2.2283660595643074e-07, + "loss": 0.0319, + "step": 191920 + }, + { + "epoch": 0.22965, + "grad_norm": 0.025634469464421272, + "learning_rate": 2.222862579280366e-07, + "loss": 0.0326, + "step": 191930 + }, + { + "epoch": 0.2297, + "grad_norm": 0.02588530071079731, + "learning_rate": 2.217365873130317e-07, + "loss": 0.0318, + "step": 191940 + }, + { + "epoch": 0.22975, + "grad_norm": 0.028533879667520523, + "learning_rate": 2.2118759412643742e-07, + "loss": 0.0329, + "step": 191950 + }, + { + "epoch": 0.2298, + "grad_norm": 0.027502888813614845, + "learning_rate": 2.206392783832667e-07, + "loss": 0.0332, + "step": 191960 + }, + { + "epoch": 0.22985, + "grad_norm": 0.028430012986063957, + "learning_rate": 2.2009164009850758e-07, + "loss": 0.0353, + "step": 191970 + }, + { + "epoch": 0.2299, + "grad_norm": 0.030735129490494728, + "learning_rate": 2.1954467928713697e-07, + "loss": 0.0331, + "step": 191980 + }, + { + "epoch": 0.22995, + "grad_norm": 0.029734564945101738, + "learning_rate": 2.189983959641012e-07, + "loss": 0.0332, + "step": 191990 + }, + { + "epoch": 0.23, + "grad_norm": 0.02846161276102066, + "learning_rate": 2.1845279014434117e-07, + "loss": 0.0321, + "step": 192000 + }, + { + "epoch": 0.23005, + "grad_norm": 0.03232620656490326, + "learning_rate": 2.179078618427699e-07, + "loss": 0.0334, + "step": 192010 + }, + { + "epoch": 0.2301, + "grad_norm": 0.028150519356131554, + "learning_rate": 2.1736361107429215e-07, + "loss": 0.0317, + "step": 192020 + }, + { + "epoch": 0.23015, + "grad_norm": 0.024552926421165466, + "learning_rate": 2.1682003785377936e-07, + "loss": 0.0307, + "step": 192030 + }, + { + "epoch": 0.2302, + "grad_norm": 0.02875414490699768, + "learning_rate": 2.162771421960974e-07, + "loss": 0.0328, + "step": 192040 + }, + { + "epoch": 0.23025, + "grad_norm": 0.030259743332862854, + "learning_rate": 2.157349241160872e-07, + "loss": 0.0328, + "step": 192050 + }, + { + "epoch": 0.2303, + "grad_norm": 0.02931458130478859, + "learning_rate": 2.1519338362857299e-07, + "loss": 0.0329, + "step": 192060 + }, + { + "epoch": 0.23035, + "grad_norm": 0.027251889929175377, + "learning_rate": 2.1465252074835963e-07, + "loss": 0.032, + "step": 192070 + }, + { + "epoch": 0.2304, + "grad_norm": 0.031506650149822235, + "learning_rate": 2.1411233549023525e-07, + "loss": 0.0337, + "step": 192080 + }, + { + "epoch": 0.23045, + "grad_norm": 0.028512097895145416, + "learning_rate": 2.135728278689686e-07, + "loss": 0.033, + "step": 192090 + }, + { + "epoch": 0.2305, + "grad_norm": 0.02668013609945774, + "learning_rate": 2.13033997899309e-07, + "loss": 0.0327, + "step": 192100 + }, + { + "epoch": 0.23055, + "grad_norm": 0.027364933863282204, + "learning_rate": 2.1249584559598913e-07, + "loss": 0.0327, + "step": 192110 + }, + { + "epoch": 0.2306, + "grad_norm": 0.02794736810028553, + "learning_rate": 2.1195837097371661e-07, + "loss": 0.0334, + "step": 192120 + }, + { + "epoch": 0.23065, + "grad_norm": 0.02584332786500454, + "learning_rate": 2.1142157404719365e-07, + "loss": 0.0315, + "step": 192130 + }, + { + "epoch": 0.2307, + "grad_norm": 0.029982447624206543, + "learning_rate": 2.108854548310918e-07, + "loss": 0.0325, + "step": 192140 + }, + { + "epoch": 0.23075, + "grad_norm": 0.02743016555905342, + "learning_rate": 2.10350013340066e-07, + "loss": 0.0319, + "step": 192150 + }, + { + "epoch": 0.2308, + "grad_norm": 0.026245078071951866, + "learning_rate": 2.098152495887601e-07, + "loss": 0.0321, + "step": 192160 + }, + { + "epoch": 0.23085, + "grad_norm": 0.027262257412075996, + "learning_rate": 2.0928116359179295e-07, + "loss": 0.0314, + "step": 192170 + }, + { + "epoch": 0.2309, + "grad_norm": 0.02742786332964897, + "learning_rate": 2.08747755363764e-07, + "loss": 0.0318, + "step": 192180 + }, + { + "epoch": 0.23095, + "grad_norm": 0.027811231091618538, + "learning_rate": 2.082150249192588e-07, + "loss": 0.0317, + "step": 192190 + }, + { + "epoch": 0.231, + "grad_norm": 0.027997516095638275, + "learning_rate": 2.0768297227283794e-07, + "loss": 0.0319, + "step": 192200 + }, + { + "epoch": 0.23105, + "grad_norm": 0.026771126314997673, + "learning_rate": 2.0715159743905365e-07, + "loss": 0.0309, + "step": 192210 + }, + { + "epoch": 0.2311, + "grad_norm": 0.03103078342974186, + "learning_rate": 2.0662090043242765e-07, + "loss": 0.034, + "step": 192220 + }, + { + "epoch": 0.23115, + "grad_norm": 0.026181593537330627, + "learning_rate": 2.060908812674761e-07, + "loss": 0.0338, + "step": 192230 + }, + { + "epoch": 0.2312, + "grad_norm": 0.03078121319413185, + "learning_rate": 2.055615399586819e-07, + "loss": 0.0343, + "step": 192240 + }, + { + "epoch": 0.23125, + "grad_norm": 0.02652091160416603, + "learning_rate": 2.0503287652051951e-07, + "loss": 0.0323, + "step": 192250 + }, + { + "epoch": 0.2313, + "grad_norm": 0.028631998226046562, + "learning_rate": 2.0450489096744685e-07, + "loss": 0.0353, + "step": 192260 + }, + { + "epoch": 0.23135, + "grad_norm": 0.028035033494234085, + "learning_rate": 2.0397758331389405e-07, + "loss": 0.0323, + "step": 192270 + }, + { + "epoch": 0.2314, + "grad_norm": 0.02855309657752514, + "learning_rate": 2.0345095357427735e-07, + "loss": 0.0316, + "step": 192280 + }, + { + "epoch": 0.23145, + "grad_norm": 0.02751971408724785, + "learning_rate": 2.0292500176299635e-07, + "loss": 0.0326, + "step": 192290 + }, + { + "epoch": 0.2315, + "grad_norm": 0.02798202820122242, + "learning_rate": 2.023997278944284e-07, + "loss": 0.0309, + "step": 192300 + }, + { + "epoch": 0.23155, + "grad_norm": 0.03205485641956329, + "learning_rate": 2.0187513198293983e-07, + "loss": 0.0315, + "step": 192310 + }, + { + "epoch": 0.2316, + "grad_norm": 0.02981637232005596, + "learning_rate": 2.0135121404286637e-07, + "loss": 0.0353, + "step": 192320 + }, + { + "epoch": 0.23165, + "grad_norm": 0.02659517154097557, + "learning_rate": 2.0082797408853272e-07, + "loss": 0.0312, + "step": 192330 + }, + { + "epoch": 0.2317, + "grad_norm": 0.026026614010334015, + "learning_rate": 2.003054121342468e-07, + "loss": 0.0314, + "step": 192340 + }, + { + "epoch": 0.23175, + "grad_norm": 0.030751172453165054, + "learning_rate": 1.9978352819429725e-07, + "loss": 0.0324, + "step": 192350 + }, + { + "epoch": 0.2318, + "grad_norm": 0.025679949671030045, + "learning_rate": 1.992623222829476e-07, + "loss": 0.0304, + "step": 192360 + }, + { + "epoch": 0.23185, + "grad_norm": 0.02781066484749317, + "learning_rate": 1.9874179441444484e-07, + "loss": 0.0321, + "step": 192370 + }, + { + "epoch": 0.2319, + "grad_norm": 0.03086954541504383, + "learning_rate": 1.9822194460302755e-07, + "loss": 0.0318, + "step": 192380 + }, + { + "epoch": 0.23195, + "grad_norm": 0.02711847797036171, + "learning_rate": 1.977027728629066e-07, + "loss": 0.0343, + "step": 192390 + }, + { + "epoch": 0.232, + "grad_norm": 0.027603790163993835, + "learning_rate": 1.9718427920827064e-07, + "loss": 0.0347, + "step": 192400 + }, + { + "epoch": 0.23205, + "grad_norm": 0.028913889080286026, + "learning_rate": 1.966664636533e-07, + "loss": 0.033, + "step": 192410 + }, + { + "epoch": 0.2321, + "grad_norm": 0.030028890818357468, + "learning_rate": 1.9614932621215e-07, + "loss": 0.0347, + "step": 192420 + }, + { + "epoch": 0.23215, + "grad_norm": 0.02870725467801094, + "learning_rate": 1.9563286689895655e-07, + "loss": 0.0326, + "step": 192430 + }, + { + "epoch": 0.2322, + "grad_norm": 0.026102574542164803, + "learning_rate": 1.9511708572784448e-07, + "loss": 0.0322, + "step": 192440 + }, + { + "epoch": 0.23225, + "grad_norm": 0.027219127863645554, + "learning_rate": 1.9460198271291364e-07, + "loss": 0.0319, + "step": 192450 + }, + { + "epoch": 0.2323, + "grad_norm": 0.026555761694908142, + "learning_rate": 1.940875578682416e-07, + "loss": 0.0322, + "step": 192460 + }, + { + "epoch": 0.23235, + "grad_norm": 0.02638009563088417, + "learning_rate": 1.9357381120790052e-07, + "loss": 0.0324, + "step": 192470 + }, + { + "epoch": 0.2324, + "grad_norm": 0.029163340106606483, + "learning_rate": 1.930607427459319e-07, + "loss": 0.0331, + "step": 192480 + }, + { + "epoch": 0.23245, + "grad_norm": 0.02803208865225315, + "learning_rate": 1.925483524963606e-07, + "loss": 0.0329, + "step": 192490 + }, + { + "epoch": 0.2325, + "grad_norm": 0.02882474660873413, + "learning_rate": 1.920366404731977e-07, + "loss": 0.0312, + "step": 192500 + }, + { + "epoch": 0.23255, + "grad_norm": 0.028602832928299904, + "learning_rate": 1.9152560669043473e-07, + "loss": 0.0336, + "step": 192510 + }, + { + "epoch": 0.2326, + "grad_norm": 0.027052734047174454, + "learning_rate": 1.910152511620439e-07, + "loss": 0.032, + "step": 192520 + }, + { + "epoch": 0.23265, + "grad_norm": 0.029143305495381355, + "learning_rate": 1.9050557390196956e-07, + "loss": 0.0335, + "step": 192530 + }, + { + "epoch": 0.2327, + "grad_norm": 0.026500994339585304, + "learning_rate": 1.899965749241561e-07, + "loss": 0.0319, + "step": 192540 + }, + { + "epoch": 0.23275, + "grad_norm": 0.025927657261490822, + "learning_rate": 1.894882542425147e-07, + "loss": 0.0317, + "step": 192550 + }, + { + "epoch": 0.2328, + "grad_norm": 0.025806518271565437, + "learning_rate": 1.8898061187094528e-07, + "loss": 0.0316, + "step": 192560 + }, + { + "epoch": 0.23285, + "grad_norm": 0.026212098076939583, + "learning_rate": 1.8847364782332567e-07, + "loss": 0.0322, + "step": 192570 + }, + { + "epoch": 0.2329, + "grad_norm": 0.02708258107304573, + "learning_rate": 1.879673621135114e-07, + "loss": 0.0329, + "step": 192580 + }, + { + "epoch": 0.23295, + "grad_norm": 0.026623349636793137, + "learning_rate": 1.8746175475534978e-07, + "loss": 0.0316, + "step": 192590 + }, + { + "epoch": 0.233, + "grad_norm": 0.02785472758114338, + "learning_rate": 1.8695682576266304e-07, + "loss": 0.0314, + "step": 192600 + }, + { + "epoch": 0.23305, + "grad_norm": 0.026384657248854637, + "learning_rate": 1.8645257514925406e-07, + "loss": 0.0321, + "step": 192610 + }, + { + "epoch": 0.2331, + "grad_norm": 0.03154732659459114, + "learning_rate": 1.8594900292890903e-07, + "loss": 0.0345, + "step": 192620 + }, + { + "epoch": 0.23315, + "grad_norm": 0.026553558185696602, + "learning_rate": 1.8544610911539472e-07, + "loss": 0.0327, + "step": 192630 + }, + { + "epoch": 0.2332, + "grad_norm": 0.02985524944961071, + "learning_rate": 1.8494389372246402e-07, + "loss": 0.0311, + "step": 192640 + }, + { + "epoch": 0.23325, + "grad_norm": 0.025981910526752472, + "learning_rate": 1.844423567638448e-07, + "loss": 0.0333, + "step": 192650 + }, + { + "epoch": 0.2333, + "grad_norm": 0.028925621882081032, + "learning_rate": 1.8394149825324834e-07, + "loss": 0.0319, + "step": 192660 + }, + { + "epoch": 0.23335, + "grad_norm": 0.025977320969104767, + "learning_rate": 1.8344131820436927e-07, + "loss": 0.0327, + "step": 192670 + }, + { + "epoch": 0.2334, + "grad_norm": 0.029717909172177315, + "learning_rate": 1.8294181663087995e-07, + "loss": 0.0343, + "step": 192680 + }, + { + "epoch": 0.23345, + "grad_norm": 0.026923447847366333, + "learning_rate": 1.824429935464389e-07, + "loss": 0.0344, + "step": 192690 + }, + { + "epoch": 0.2335, + "grad_norm": 0.03424394503235817, + "learning_rate": 1.8194484896467967e-07, + "loss": 0.0332, + "step": 192700 + }, + { + "epoch": 0.23355, + "grad_norm": 0.027123430743813515, + "learning_rate": 1.8144738289922747e-07, + "loss": 0.0333, + "step": 192710 + }, + { + "epoch": 0.2336, + "grad_norm": 0.0246063731610775, + "learning_rate": 1.8095059536367974e-07, + "loss": 0.0344, + "step": 192720 + }, + { + "epoch": 0.23365, + "grad_norm": 0.029820656403899193, + "learning_rate": 1.8045448637162011e-07, + "loss": 0.0353, + "step": 192730 + }, + { + "epoch": 0.2337, + "grad_norm": 0.025614451617002487, + "learning_rate": 1.7995905593660712e-07, + "loss": 0.0333, + "step": 192740 + }, + { + "epoch": 0.23375, + "grad_norm": 0.0293387733399868, + "learning_rate": 1.7946430407219105e-07, + "loss": 0.0354, + "step": 192750 + }, + { + "epoch": 0.2338, + "grad_norm": 0.0259340051561594, + "learning_rate": 1.7897023079189722e-07, + "loss": 0.034, + "step": 192760 + }, + { + "epoch": 0.23385, + "grad_norm": 0.026477213948965073, + "learning_rate": 1.784768361092315e-07, + "loss": 0.0341, + "step": 192770 + }, + { + "epoch": 0.2339, + "grad_norm": 0.029923735186457634, + "learning_rate": 1.7798412003768307e-07, + "loss": 0.0338, + "step": 192780 + }, + { + "epoch": 0.23395, + "grad_norm": 0.023119883611798286, + "learning_rate": 1.7749208259072448e-07, + "loss": 0.0335, + "step": 192790 + }, + { + "epoch": 0.234, + "grad_norm": 0.024888034909963608, + "learning_rate": 1.770007237818061e-07, + "loss": 0.0329, + "step": 192800 + }, + { + "epoch": 0.23405, + "grad_norm": 0.029386069625616074, + "learning_rate": 1.765100436243644e-07, + "loss": 0.0329, + "step": 192810 + }, + { + "epoch": 0.2341, + "grad_norm": 0.03265475854277611, + "learning_rate": 1.760200421318109e-07, + "loss": 0.0344, + "step": 192820 + }, + { + "epoch": 0.23415, + "grad_norm": 0.027390794828534126, + "learning_rate": 1.7553071931754317e-07, + "loss": 0.033, + "step": 192830 + }, + { + "epoch": 0.2342, + "grad_norm": 0.025554992258548737, + "learning_rate": 1.750420751949394e-07, + "loss": 0.0337, + "step": 192840 + }, + { + "epoch": 0.23425, + "grad_norm": 0.02988513931632042, + "learning_rate": 1.745541097773612e-07, + "loss": 0.034, + "step": 192850 + }, + { + "epoch": 0.2343, + "grad_norm": 0.028282584622502327, + "learning_rate": 1.7406682307814505e-07, + "loss": 0.0336, + "step": 192860 + }, + { + "epoch": 0.23435, + "grad_norm": 0.028356118127703667, + "learning_rate": 1.735802151106164e-07, + "loss": 0.0335, + "step": 192870 + }, + { + "epoch": 0.2344, + "grad_norm": 0.030579427257180214, + "learning_rate": 1.7309428588807575e-07, + "loss": 0.034, + "step": 192880 + }, + { + "epoch": 0.23445, + "grad_norm": 0.029863102361559868, + "learning_rate": 1.7260903542381246e-07, + "loss": 0.0339, + "step": 192890 + }, + { + "epoch": 0.2345, + "grad_norm": 0.028973223641514778, + "learning_rate": 1.7212446373109092e-07, + "loss": 0.0352, + "step": 192900 + }, + { + "epoch": 0.23455, + "grad_norm": 0.024388255551457405, + "learning_rate": 1.7164057082315887e-07, + "loss": 0.0321, + "step": 192910 + }, + { + "epoch": 0.2346, + "grad_norm": 0.028484197333455086, + "learning_rate": 1.7115735671324462e-07, + "loss": 0.0318, + "step": 192920 + }, + { + "epoch": 0.23465, + "grad_norm": 0.02944735251367092, + "learning_rate": 1.706748214145598e-07, + "loss": 0.0338, + "step": 192930 + }, + { + "epoch": 0.2347, + "grad_norm": 0.02990567497909069, + "learning_rate": 1.7019296494029945e-07, + "loss": 0.0324, + "step": 192940 + }, + { + "epoch": 0.23475, + "grad_norm": 0.026609139516949654, + "learning_rate": 1.6971178730363635e-07, + "loss": 0.0322, + "step": 192950 + }, + { + "epoch": 0.2348, + "grad_norm": 0.026992447674274445, + "learning_rate": 1.6923128851772385e-07, + "loss": 0.0319, + "step": 192960 + }, + { + "epoch": 0.23485, + "grad_norm": 0.03282034024596214, + "learning_rate": 1.687514685956987e-07, + "loss": 0.0316, + "step": 192970 + }, + { + "epoch": 0.2349, + "grad_norm": 0.02752852253615856, + "learning_rate": 1.682723275506809e-07, + "loss": 0.0305, + "step": 192980 + }, + { + "epoch": 0.23495, + "grad_norm": 0.026460938155651093, + "learning_rate": 1.6779386539576835e-07, + "loss": 0.0309, + "step": 192990 + }, + { + "epoch": 0.235, + "grad_norm": 0.02743816375732422, + "learning_rate": 1.6731608214403948e-07, + "loss": 0.0304, + "step": 193000 + }, + { + "epoch": 0.23505, + "grad_norm": 0.030733531340956688, + "learning_rate": 1.668389778085616e-07, + "loss": 0.0311, + "step": 193010 + }, + { + "epoch": 0.2351, + "grad_norm": 0.030693871900439262, + "learning_rate": 1.6636255240237986e-07, + "loss": 0.0306, + "step": 193020 + }, + { + "epoch": 0.23515, + "grad_norm": 0.0279314573854208, + "learning_rate": 1.6588680593851157e-07, + "loss": 0.0328, + "step": 193030 + }, + { + "epoch": 0.2352, + "grad_norm": 0.029680553823709488, + "learning_rate": 1.6541173842997137e-07, + "loss": 0.0303, + "step": 193040 + }, + { + "epoch": 0.23525, + "grad_norm": 0.03066929802298546, + "learning_rate": 1.649373498897433e-07, + "loss": 0.0312, + "step": 193050 + }, + { + "epoch": 0.2353, + "grad_norm": 0.028259824961423874, + "learning_rate": 1.6446364033079752e-07, + "loss": 0.0313, + "step": 193060 + }, + { + "epoch": 0.23535, + "grad_norm": 0.025081414729356766, + "learning_rate": 1.639906097660876e-07, + "loss": 0.0306, + "step": 193070 + }, + { + "epoch": 0.2354, + "grad_norm": 0.026953531429171562, + "learning_rate": 1.6351825820854206e-07, + "loss": 0.0311, + "step": 193080 + }, + { + "epoch": 0.23545, + "grad_norm": 0.02603212371468544, + "learning_rate": 1.6304658567107834e-07, + "loss": 0.0314, + "step": 193090 + }, + { + "epoch": 0.2355, + "grad_norm": 0.02506883069872856, + "learning_rate": 1.625755921665889e-07, + "loss": 0.031, + "step": 193100 + }, + { + "epoch": 0.23555, + "grad_norm": 0.026660045608878136, + "learning_rate": 1.6210527770795237e-07, + "loss": 0.0328, + "step": 193110 + }, + { + "epoch": 0.2356, + "grad_norm": 0.02374088764190674, + "learning_rate": 1.6163564230802507e-07, + "loss": 0.0313, + "step": 193120 + }, + { + "epoch": 0.23565, + "grad_norm": 0.028937511146068573, + "learning_rate": 1.6116668597964956e-07, + "loss": 0.0317, + "step": 193130 + }, + { + "epoch": 0.2357, + "grad_norm": 0.031917814165353775, + "learning_rate": 1.6069840873564335e-07, + "loss": 0.0311, + "step": 193140 + }, + { + "epoch": 0.23575, + "grad_norm": 0.028658544644713402, + "learning_rate": 1.6023081058881284e-07, + "loss": 0.0315, + "step": 193150 + }, + { + "epoch": 0.2358, + "grad_norm": 0.0289542805403471, + "learning_rate": 1.5976389155193948e-07, + "loss": 0.0323, + "step": 193160 + }, + { + "epoch": 0.23585, + "grad_norm": 0.025311751291155815, + "learning_rate": 1.592976516377881e-07, + "loss": 0.032, + "step": 193170 + }, + { + "epoch": 0.2359, + "grad_norm": 0.02736661396920681, + "learning_rate": 1.5883209085910678e-07, + "loss": 0.0315, + "step": 193180 + }, + { + "epoch": 0.23595, + "grad_norm": 0.03126165643334389, + "learning_rate": 1.5836720922862703e-07, + "loss": 0.0333, + "step": 193190 + }, + { + "epoch": 0.236, + "grad_norm": 0.030230486765503883, + "learning_rate": 1.5790300675904979e-07, + "loss": 0.0333, + "step": 193200 + }, + { + "epoch": 0.23605, + "grad_norm": 0.031154369935393333, + "learning_rate": 1.5743948346307603e-07, + "loss": 0.0331, + "step": 193210 + }, + { + "epoch": 0.2361, + "grad_norm": 0.027177628129720688, + "learning_rate": 1.5697663935337336e-07, + "loss": 0.0323, + "step": 193220 + }, + { + "epoch": 0.23615, + "grad_norm": 0.026651490479707718, + "learning_rate": 1.565144744425956e-07, + "loss": 0.0323, + "step": 193230 + }, + { + "epoch": 0.2362, + "grad_norm": 0.030147448182106018, + "learning_rate": 1.5605298874337704e-07, + "loss": 0.032, + "step": 193240 + }, + { + "epoch": 0.23625, + "grad_norm": 0.028682773932814598, + "learning_rate": 1.5559218226834094e-07, + "loss": 0.0322, + "step": 193250 + }, + { + "epoch": 0.2363, + "grad_norm": 0.02755606733262539, + "learning_rate": 1.5513205503007722e-07, + "loss": 0.033, + "step": 193260 + }, + { + "epoch": 0.23635, + "grad_norm": 0.026752468198537827, + "learning_rate": 1.5467260704117025e-07, + "loss": 0.0334, + "step": 193270 + }, + { + "epoch": 0.2364, + "grad_norm": 0.026598379015922546, + "learning_rate": 1.5421383831417945e-07, + "loss": 0.0349, + "step": 193280 + }, + { + "epoch": 0.23645, + "grad_norm": 0.02493317425251007, + "learning_rate": 1.5375574886164755e-07, + "loss": 0.032, + "step": 193290 + }, + { + "epoch": 0.2365, + "grad_norm": 0.029579149559140205, + "learning_rate": 1.5329833869610066e-07, + "loss": 0.0324, + "step": 193300 + }, + { + "epoch": 0.23655, + "grad_norm": 0.02820001170039177, + "learning_rate": 1.5284160783004543e-07, + "loss": 0.0324, + "step": 193310 + }, + { + "epoch": 0.2366, + "grad_norm": 0.02783927135169506, + "learning_rate": 1.5238555627596352e-07, + "loss": 0.0319, + "step": 193320 + }, + { + "epoch": 0.23665, + "grad_norm": 0.028218500316143036, + "learning_rate": 1.5193018404632277e-07, + "loss": 0.032, + "step": 193330 + }, + { + "epoch": 0.2367, + "grad_norm": 0.027087442576885223, + "learning_rate": 1.5147549115357706e-07, + "loss": 0.0328, + "step": 193340 + }, + { + "epoch": 0.23675, + "grad_norm": 0.029415588825941086, + "learning_rate": 1.5102147761015817e-07, + "loss": 0.0328, + "step": 193350 + }, + { + "epoch": 0.2368, + "grad_norm": 0.027064288035035133, + "learning_rate": 1.5056814342847836e-07, + "loss": 0.0319, + "step": 193360 + }, + { + "epoch": 0.23685, + "grad_norm": 0.027783773839473724, + "learning_rate": 1.5011548862092773e-07, + "loss": 0.0324, + "step": 193370 + }, + { + "epoch": 0.2369, + "grad_norm": 0.02960365265607834, + "learning_rate": 1.4966351319988248e-07, + "loss": 0.0347, + "step": 193380 + }, + { + "epoch": 0.23695, + "grad_norm": 0.030076824128627777, + "learning_rate": 1.4921221717770218e-07, + "loss": 0.0341, + "step": 193390 + }, + { + "epoch": 0.237, + "grad_norm": 0.026086056604981422, + "learning_rate": 1.4876160056672417e-07, + "loss": 0.0315, + "step": 193400 + }, + { + "epoch": 0.23705, + "grad_norm": 0.025460442528128624, + "learning_rate": 1.4831166337926917e-07, + "loss": 0.033, + "step": 193410 + }, + { + "epoch": 0.2371, + "grad_norm": 0.024396125227212906, + "learning_rate": 1.4786240562763566e-07, + "loss": 0.0331, + "step": 193420 + }, + { + "epoch": 0.23715, + "grad_norm": 0.03096782974898815, + "learning_rate": 1.4741382732410546e-07, + "loss": 0.0331, + "step": 193430 + }, + { + "epoch": 0.2372, + "grad_norm": 0.026441195979714394, + "learning_rate": 1.4696592848094935e-07, + "loss": 0.0321, + "step": 193440 + }, + { + "epoch": 0.23725, + "grad_norm": 0.026662474498152733, + "learning_rate": 1.4651870911040478e-07, + "loss": 0.033, + "step": 193450 + }, + { + "epoch": 0.2373, + "grad_norm": 0.028206605464220047, + "learning_rate": 1.4607216922470078e-07, + "loss": 0.0328, + "step": 193460 + }, + { + "epoch": 0.23735, + "grad_norm": 0.028729049488902092, + "learning_rate": 1.456263088360471e-07, + "loss": 0.0326, + "step": 193470 + }, + { + "epoch": 0.2374, + "grad_norm": 0.027290131896734238, + "learning_rate": 1.4518112795663397e-07, + "loss": 0.0327, + "step": 193480 + }, + { + "epoch": 0.23745, + "grad_norm": 0.026049882173538208, + "learning_rate": 1.447366265986322e-07, + "loss": 0.0334, + "step": 193490 + }, + { + "epoch": 0.2375, + "grad_norm": 0.025826916098594666, + "learning_rate": 1.4429280477419037e-07, + "loss": 0.0361, + "step": 193500 + }, + { + "epoch": 0.23755, + "grad_norm": 0.030189473181962967, + "learning_rate": 1.4384966249544606e-07, + "loss": 0.0339, + "step": 193510 + }, + { + "epoch": 0.2376, + "grad_norm": 0.02801290899515152, + "learning_rate": 1.4340719977451455e-07, + "loss": 0.0323, + "step": 193520 + }, + { + "epoch": 0.23765, + "grad_norm": 0.0285183098167181, + "learning_rate": 1.4296541662349172e-07, + "loss": 0.032, + "step": 193530 + }, + { + "epoch": 0.2377, + "grad_norm": 0.02660214714705944, + "learning_rate": 1.425243130544568e-07, + "loss": 0.0327, + "step": 193540 + }, + { + "epoch": 0.23775, + "grad_norm": 0.027230629697442055, + "learning_rate": 1.4208388907946957e-07, + "loss": 0.033, + "step": 193550 + }, + { + "epoch": 0.2378, + "grad_norm": 0.027467437088489532, + "learning_rate": 1.4164414471056764e-07, + "loss": 0.0332, + "step": 193560 + }, + { + "epoch": 0.23785, + "grad_norm": 0.02356082573533058, + "learning_rate": 1.412050799597775e-07, + "loss": 0.0313, + "step": 193570 + }, + { + "epoch": 0.2379, + "grad_norm": 0.026973361149430275, + "learning_rate": 1.4076669483910065e-07, + "loss": 0.0325, + "step": 193580 + }, + { + "epoch": 0.23795, + "grad_norm": 0.030826479196548462, + "learning_rate": 1.4032898936052475e-07, + "loss": 0.032, + "step": 193590 + }, + { + "epoch": 0.238, + "grad_norm": 0.02999156527221203, + "learning_rate": 1.398919635360152e-07, + "loss": 0.0328, + "step": 193600 + }, + { + "epoch": 0.23805, + "grad_norm": 0.031644634902477264, + "learning_rate": 1.39455617377518e-07, + "loss": 0.036, + "step": 193610 + }, + { + "epoch": 0.2381, + "grad_norm": 0.03171585127711296, + "learning_rate": 1.390199508969653e-07, + "loss": 0.0342, + "step": 193620 + }, + { + "epoch": 0.23815, + "grad_norm": 0.029693949967622757, + "learning_rate": 1.3858496410626698e-07, + "loss": 0.0324, + "step": 193630 + }, + { + "epoch": 0.2382, + "grad_norm": 0.028009943664073944, + "learning_rate": 1.3815065701731632e-07, + "loss": 0.0326, + "step": 193640 + }, + { + "epoch": 0.23825, + "grad_norm": 0.026626676321029663, + "learning_rate": 1.3771702964198442e-07, + "loss": 0.0321, + "step": 193650 + }, + { + "epoch": 0.2383, + "grad_norm": 0.030735468491911888, + "learning_rate": 1.3728408199213118e-07, + "loss": 0.0336, + "step": 193660 + }, + { + "epoch": 0.23835, + "grad_norm": 0.030684100463986397, + "learning_rate": 1.368518140795888e-07, + "loss": 0.0333, + "step": 193670 + }, + { + "epoch": 0.2384, + "grad_norm": 0.02619747631251812, + "learning_rate": 1.364202259161784e-07, + "loss": 0.0326, + "step": 193680 + }, + { + "epoch": 0.23845, + "grad_norm": 0.028299542143940926, + "learning_rate": 1.3598931751369893e-07, + "loss": 0.0339, + "step": 193690 + }, + { + "epoch": 0.2385, + "grad_norm": 0.027913063764572144, + "learning_rate": 1.3555908888392976e-07, + "loss": 0.0321, + "step": 193700 + }, + { + "epoch": 0.23855, + "grad_norm": 0.0298161543905735, + "learning_rate": 1.3512954003863377e-07, + "loss": 0.0327, + "step": 193710 + }, + { + "epoch": 0.2386, + "grad_norm": 0.027977202087640762, + "learning_rate": 1.347006709895543e-07, + "loss": 0.0323, + "step": 193720 + }, + { + "epoch": 0.23865, + "grad_norm": 0.031968772411346436, + "learning_rate": 1.342724817484181e-07, + "loss": 0.0334, + "step": 193730 + }, + { + "epoch": 0.2387, + "grad_norm": 0.031502120196819305, + "learning_rate": 1.3384497232692973e-07, + "loss": 0.0326, + "step": 193740 + }, + { + "epoch": 0.23875, + "grad_norm": 0.02637472376227379, + "learning_rate": 1.3341814273677977e-07, + "loss": 0.0336, + "step": 193750 + }, + { + "epoch": 0.2388, + "grad_norm": 0.024537673220038414, + "learning_rate": 1.3299199298963116e-07, + "loss": 0.0332, + "step": 193760 + }, + { + "epoch": 0.23885, + "grad_norm": 0.030038703233003616, + "learning_rate": 1.32566523097144e-07, + "loss": 0.0334, + "step": 193770 + }, + { + "epoch": 0.2389, + "grad_norm": 0.02504717744886875, + "learning_rate": 1.3214173307094513e-07, + "loss": 0.0317, + "step": 193780 + }, + { + "epoch": 0.23895, + "grad_norm": 0.02821286953985691, + "learning_rate": 1.3171762292264744e-07, + "loss": 0.0334, + "step": 193790 + }, + { + "epoch": 0.239, + "grad_norm": 0.026244711130857468, + "learning_rate": 1.3129419266385002e-07, + "loss": 0.0319, + "step": 193800 + }, + { + "epoch": 0.23905, + "grad_norm": 0.02635263279080391, + "learning_rate": 1.3087144230612414e-07, + "loss": 0.0327, + "step": 193810 + }, + { + "epoch": 0.2391, + "grad_norm": 0.02682708390057087, + "learning_rate": 1.3044937186103e-07, + "loss": 0.0315, + "step": 193820 + }, + { + "epoch": 0.23915, + "grad_norm": 0.029811793938279152, + "learning_rate": 1.3002798134010841e-07, + "loss": 0.033, + "step": 193830 + }, + { + "epoch": 0.2392, + "grad_norm": 0.02868504635989666, + "learning_rate": 1.2960727075487788e-07, + "loss": 0.0335, + "step": 193840 + }, + { + "epoch": 0.23925, + "grad_norm": 0.027720022946596146, + "learning_rate": 1.2918724011684036e-07, + "loss": 0.0339, + "step": 193850 + }, + { + "epoch": 0.2393, + "grad_norm": 0.027101268991827965, + "learning_rate": 1.2876788943748108e-07, + "loss": 0.0334, + "step": 193860 + }, + { + "epoch": 0.23935, + "grad_norm": 0.03304234892129898, + "learning_rate": 1.2834921872826588e-07, + "loss": 0.0346, + "step": 193870 + }, + { + "epoch": 0.2394, + "grad_norm": 0.03118724562227726, + "learning_rate": 1.279312280006356e-07, + "loss": 0.0349, + "step": 193880 + }, + { + "epoch": 0.23945, + "grad_norm": 0.031831204891204834, + "learning_rate": 1.275139172660228e-07, + "loss": 0.0344, + "step": 193890 + }, + { + "epoch": 0.2395, + "grad_norm": 0.03403082862496376, + "learning_rate": 1.2709728653583775e-07, + "loss": 0.0351, + "step": 193900 + }, + { + "epoch": 0.23955, + "grad_norm": 0.03151148185133934, + "learning_rate": 1.266813358214658e-07, + "loss": 0.0352, + "step": 193910 + }, + { + "epoch": 0.2396, + "grad_norm": 0.033800844103097916, + "learning_rate": 1.262660651342812e-07, + "loss": 0.0347, + "step": 193920 + }, + { + "epoch": 0.23965, + "grad_norm": 0.0265358816832304, + "learning_rate": 1.2585147448563872e-07, + "loss": 0.0345, + "step": 193930 + }, + { + "epoch": 0.2397, + "grad_norm": 0.02647106908261776, + "learning_rate": 1.2543756388687377e-07, + "loss": 0.0374, + "step": 193940 + }, + { + "epoch": 0.23975, + "grad_norm": 0.029090620577335358, + "learning_rate": 1.2502433334929954e-07, + "loss": 0.0339, + "step": 193950 + }, + { + "epoch": 0.2398, + "grad_norm": 0.031015174463391304, + "learning_rate": 1.2461178288421527e-07, + "loss": 0.0336, + "step": 193960 + }, + { + "epoch": 0.23985, + "grad_norm": 0.030042123049497604, + "learning_rate": 1.2419991250289808e-07, + "loss": 0.0351, + "step": 193970 + }, + { + "epoch": 0.2399, + "grad_norm": 0.03131449967622757, + "learning_rate": 1.237887222166112e-07, + "loss": 0.0349, + "step": 193980 + }, + { + "epoch": 0.23995, + "grad_norm": 0.027338171377778053, + "learning_rate": 1.2337821203659562e-07, + "loss": 0.0345, + "step": 193990 + }, + { + "epoch": 0.24, + "grad_norm": 0.029214883223176003, + "learning_rate": 1.229683819740729e-07, + "loss": 0.0345, + "step": 194000 + }, + { + "epoch": 0.24005, + "grad_norm": 0.030200736597180367, + "learning_rate": 1.2255923204025077e-07, + "loss": 0.0313, + "step": 194010 + }, + { + "epoch": 0.2401, + "grad_norm": 0.026402173563838005, + "learning_rate": 1.2215076224631194e-07, + "loss": 0.0318, + "step": 194020 + }, + { + "epoch": 0.24015, + "grad_norm": 0.028501980006694794, + "learning_rate": 1.217429726034225e-07, + "loss": 0.0318, + "step": 194030 + }, + { + "epoch": 0.2402, + "grad_norm": 0.03134189918637276, + "learning_rate": 1.2133586312273738e-07, + "loss": 0.0316, + "step": 194040 + }, + { + "epoch": 0.24025, + "grad_norm": 0.027999697253108025, + "learning_rate": 1.20929433815381e-07, + "loss": 0.0326, + "step": 194050 + }, + { + "epoch": 0.2403, + "grad_norm": 0.02933386340737343, + "learning_rate": 1.2052368469246954e-07, + "loss": 0.0319, + "step": 194060 + }, + { + "epoch": 0.24035, + "grad_norm": 0.02369658090174198, + "learning_rate": 1.201186157650941e-07, + "loss": 0.0315, + "step": 194070 + }, + { + "epoch": 0.2404, + "grad_norm": 0.028840554878115654, + "learning_rate": 1.1971422704432633e-07, + "loss": 0.0328, + "step": 194080 + }, + { + "epoch": 0.24045, + "grad_norm": 0.027982434257864952, + "learning_rate": 1.193105185412241e-07, + "loss": 0.0324, + "step": 194090 + }, + { + "epoch": 0.2405, + "grad_norm": 0.02944488450884819, + "learning_rate": 1.1890749026682857e-07, + "loss": 0.0311, + "step": 194100 + }, + { + "epoch": 0.24055, + "grad_norm": 0.02974824607372284, + "learning_rate": 1.1850514223215315e-07, + "loss": 0.0323, + "step": 194110 + }, + { + "epoch": 0.2406, + "grad_norm": 0.02925095707178116, + "learning_rate": 1.1810347444819735e-07, + "loss": 0.0326, + "step": 194120 + }, + { + "epoch": 0.24065, + "grad_norm": 0.026488423347473145, + "learning_rate": 1.1770248692594687e-07, + "loss": 0.0329, + "step": 194130 + }, + { + "epoch": 0.2407, + "grad_norm": 0.028202392160892487, + "learning_rate": 1.1730217967636236e-07, + "loss": 0.033, + "step": 194140 + }, + { + "epoch": 0.24075, + "grad_norm": 0.027688847854733467, + "learning_rate": 1.1690255271038786e-07, + "loss": 0.0343, + "step": 194150 + }, + { + "epoch": 0.2408, + "grad_norm": 0.028941884636878967, + "learning_rate": 1.1650360603895071e-07, + "loss": 0.0323, + "step": 194160 + }, + { + "epoch": 0.24085, + "grad_norm": 0.02789250761270523, + "learning_rate": 1.1610533967295334e-07, + "loss": 0.0354, + "step": 194170 + }, + { + "epoch": 0.2409, + "grad_norm": 0.02641705423593521, + "learning_rate": 1.1570775362329255e-07, + "loss": 0.0339, + "step": 194180 + }, + { + "epoch": 0.24095, + "grad_norm": 0.026302112266421318, + "learning_rate": 1.1531084790082913e-07, + "loss": 0.0333, + "step": 194190 + }, + { + "epoch": 0.241, + "grad_norm": 0.028520043939352036, + "learning_rate": 1.1491462251642104e-07, + "loss": 0.0353, + "step": 194200 + }, + { + "epoch": 0.24105, + "grad_norm": 0.027348719537258148, + "learning_rate": 1.1451907748089574e-07, + "loss": 0.0339, + "step": 194210 + }, + { + "epoch": 0.2411, + "grad_norm": 0.031844280660152435, + "learning_rate": 1.1412421280506957e-07, + "loss": 0.034, + "step": 194220 + }, + { + "epoch": 0.24115, + "grad_norm": 0.028920553624629974, + "learning_rate": 1.1373002849974223e-07, + "loss": 0.0334, + "step": 194230 + }, + { + "epoch": 0.2412, + "grad_norm": 0.02920878306031227, + "learning_rate": 1.1333652457568288e-07, + "loss": 0.0334, + "step": 194240 + }, + { + "epoch": 0.24125, + "grad_norm": 0.02664770931005478, + "learning_rate": 1.1294370104365515e-07, + "loss": 0.0333, + "step": 194250 + }, + { + "epoch": 0.2413, + "grad_norm": 0.026782682165503502, + "learning_rate": 1.1255155791439764e-07, + "loss": 0.0336, + "step": 194260 + }, + { + "epoch": 0.24135, + "grad_norm": 0.025523407384753227, + "learning_rate": 1.1216009519862958e-07, + "loss": 0.0343, + "step": 194270 + }, + { + "epoch": 0.2414, + "grad_norm": 0.027562156319618225, + "learning_rate": 1.117693129070535e-07, + "loss": 0.0337, + "step": 194280 + }, + { + "epoch": 0.24145, + "grad_norm": 0.03049503266811371, + "learning_rate": 1.1137921105035532e-07, + "loss": 0.0322, + "step": 194290 + }, + { + "epoch": 0.2415, + "grad_norm": 0.02665276825428009, + "learning_rate": 1.1098978963920148e-07, + "loss": 0.0336, + "step": 194300 + }, + { + "epoch": 0.24155, + "grad_norm": 0.027565669268369675, + "learning_rate": 1.106010486842335e-07, + "loss": 0.0323, + "step": 194310 + }, + { + "epoch": 0.2416, + "grad_norm": 0.02628151699900627, + "learning_rate": 1.1021298819608449e-07, + "loss": 0.0332, + "step": 194320 + }, + { + "epoch": 0.24165, + "grad_norm": 0.026651812717318535, + "learning_rate": 1.0982560818535991e-07, + "loss": 0.032, + "step": 194330 + }, + { + "epoch": 0.2417, + "grad_norm": 0.02496226131916046, + "learning_rate": 1.0943890866265405e-07, + "loss": 0.0323, + "step": 194340 + }, + { + "epoch": 0.24175, + "grad_norm": 0.025923805311322212, + "learning_rate": 1.0905288963853344e-07, + "loss": 0.0361, + "step": 194350 + }, + { + "epoch": 0.2418, + "grad_norm": 0.030155358836054802, + "learning_rate": 1.0866755112355908e-07, + "loss": 0.0334, + "step": 194360 + }, + { + "epoch": 0.24185, + "grad_norm": 0.025724314153194427, + "learning_rate": 1.0828289312826423e-07, + "loss": 0.0334, + "step": 194370 + }, + { + "epoch": 0.2419, + "grad_norm": 0.0289236418902874, + "learning_rate": 1.0789891566315991e-07, + "loss": 0.0321, + "step": 194380 + }, + { + "epoch": 0.24195, + "grad_norm": 0.02829277515411377, + "learning_rate": 1.075156187387516e-07, + "loss": 0.0325, + "step": 194390 + }, + { + "epoch": 0.242, + "grad_norm": 0.032805219292640686, + "learning_rate": 1.0713300236551149e-07, + "loss": 0.0337, + "step": 194400 + }, + { + "epoch": 0.24205, + "grad_norm": 0.025196310132741928, + "learning_rate": 1.0675106655390343e-07, + "loss": 0.0334, + "step": 194410 + }, + { + "epoch": 0.2421, + "grad_norm": 0.030526315793395042, + "learning_rate": 1.0636981131437185e-07, + "loss": 0.0344, + "step": 194420 + }, + { + "epoch": 0.24215, + "grad_norm": 0.026543065905570984, + "learning_rate": 1.0598923665733618e-07, + "loss": 0.0339, + "step": 194430 + }, + { + "epoch": 0.2422, + "grad_norm": 0.028449447825551033, + "learning_rate": 1.0560934259320199e-07, + "loss": 0.0335, + "step": 194440 + }, + { + "epoch": 0.24225, + "grad_norm": 0.027246778830885887, + "learning_rate": 1.0523012913235541e-07, + "loss": 0.0322, + "step": 194450 + }, + { + "epoch": 0.2423, + "grad_norm": 0.025745918974280357, + "learning_rate": 1.0485159628516595e-07, + "loss": 0.0336, + "step": 194460 + }, + { + "epoch": 0.24235, + "grad_norm": 0.02711482159793377, + "learning_rate": 1.0447374406198085e-07, + "loss": 0.0331, + "step": 194470 + }, + { + "epoch": 0.2424, + "grad_norm": 0.025958344340324402, + "learning_rate": 1.0409657247313076e-07, + "loss": 0.0328, + "step": 194480 + }, + { + "epoch": 0.24245, + "grad_norm": 0.02521250769495964, + "learning_rate": 1.0372008152892687e-07, + "loss": 0.0324, + "step": 194490 + }, + { + "epoch": 0.2425, + "grad_norm": 0.02667837217450142, + "learning_rate": 1.0334427123966373e-07, + "loss": 0.0334, + "step": 194500 + }, + { + "epoch": 0.24255, + "grad_norm": 0.029127979651093483, + "learning_rate": 1.0296914161561367e-07, + "loss": 0.0321, + "step": 194510 + }, + { + "epoch": 0.2426, + "grad_norm": 0.029505934566259384, + "learning_rate": 1.0259469266703514e-07, + "loss": 0.0324, + "step": 194520 + }, + { + "epoch": 0.24265, + "grad_norm": 0.03228716552257538, + "learning_rate": 1.0222092440416442e-07, + "loss": 0.0327, + "step": 194530 + }, + { + "epoch": 0.2427, + "grad_norm": 0.028279023244976997, + "learning_rate": 1.0184783683721832e-07, + "loss": 0.0327, + "step": 194540 + }, + { + "epoch": 0.24275, + "grad_norm": 0.02680935710668564, + "learning_rate": 1.0147542997639703e-07, + "loss": 0.032, + "step": 194550 + }, + { + "epoch": 0.2428, + "grad_norm": 0.025176655501127243, + "learning_rate": 1.0110370383188683e-07, + "loss": 0.0324, + "step": 194560 + }, + { + "epoch": 0.24285, + "grad_norm": 0.030117202550172806, + "learning_rate": 1.0073265841384627e-07, + "loss": 0.0338, + "step": 194570 + }, + { + "epoch": 0.2429, + "grad_norm": 0.028528856113553047, + "learning_rate": 1.0036229373242279e-07, + "loss": 0.0331, + "step": 194580 + }, + { + "epoch": 0.24295, + "grad_norm": 0.02289486862719059, + "learning_rate": 9.999260979773607e-08, + "loss": 0.0327, + "step": 194590 + }, + { + "epoch": 0.243, + "grad_norm": 0.028572697192430496, + "learning_rate": 9.962360661990022e-08, + "loss": 0.0331, + "step": 194600 + }, + { + "epoch": 0.24305, + "grad_norm": 0.027456842362880707, + "learning_rate": 9.925528420899888e-08, + "loss": 0.0328, + "step": 194610 + }, + { + "epoch": 0.2431, + "grad_norm": 0.02405891753733158, + "learning_rate": 9.888764257510174e-08, + "loss": 0.0327, + "step": 194620 + }, + { + "epoch": 0.24315, + "grad_norm": 0.02784058079123497, + "learning_rate": 9.852068172826468e-08, + "loss": 0.0323, + "step": 194630 + }, + { + "epoch": 0.2432, + "grad_norm": 0.027669742703437805, + "learning_rate": 9.815440167851297e-08, + "loss": 0.0325, + "step": 194640 + }, + { + "epoch": 0.24325, + "grad_norm": 0.03056858293712139, + "learning_rate": 9.778880243586919e-08, + "loss": 0.0332, + "step": 194650 + }, + { + "epoch": 0.2433, + "grad_norm": 0.02696123905479908, + "learning_rate": 9.742388401032254e-08, + "loss": 0.0318, + "step": 194660 + }, + { + "epoch": 0.24335, + "grad_norm": 0.026333358138799667, + "learning_rate": 9.705964641185117e-08, + "loss": 0.0325, + "step": 194670 + }, + { + "epoch": 0.2434, + "grad_norm": 0.0272892527282238, + "learning_rate": 9.669608965041377e-08, + "loss": 0.0333, + "step": 194680 + }, + { + "epoch": 0.24345, + "grad_norm": 0.027968794107437134, + "learning_rate": 9.633321373594962e-08, + "loss": 0.0331, + "step": 194690 + }, + { + "epoch": 0.2435, + "grad_norm": 0.027383040636777878, + "learning_rate": 9.597101867837854e-08, + "loss": 0.0329, + "step": 194700 + }, + { + "epoch": 0.24355, + "grad_norm": 0.02573508210480213, + "learning_rate": 9.560950448760375e-08, + "loss": 0.0321, + "step": 194710 + }, + { + "epoch": 0.2436, + "grad_norm": 0.023699520155787468, + "learning_rate": 9.5248671173509e-08, + "loss": 0.0315, + "step": 194720 + }, + { + "epoch": 0.24365, + "grad_norm": 0.024864180013537407, + "learning_rate": 9.488851874596138e-08, + "loss": 0.0326, + "step": 194730 + }, + { + "epoch": 0.2437, + "grad_norm": 0.027155090123414993, + "learning_rate": 9.452904721480304e-08, + "loss": 0.0323, + "step": 194740 + }, + { + "epoch": 0.24375, + "grad_norm": 0.029680032283067703, + "learning_rate": 9.4170256589865e-08, + "loss": 0.0335, + "step": 194750 + }, + { + "epoch": 0.2438, + "grad_norm": 0.031484510749578476, + "learning_rate": 9.38121468809533e-08, + "loss": 0.0328, + "step": 194760 + }, + { + "epoch": 0.24385, + "grad_norm": 0.024282047525048256, + "learning_rate": 9.345471809786289e-08, + "loss": 0.0325, + "step": 194770 + }, + { + "epoch": 0.2439, + "grad_norm": 0.0297898780554533, + "learning_rate": 9.309797025036371e-08, + "loss": 0.0342, + "step": 194780 + }, + { + "epoch": 0.24395, + "grad_norm": 0.02838975004851818, + "learning_rate": 9.27419033482091e-08, + "loss": 0.0334, + "step": 194790 + }, + { + "epoch": 0.244, + "grad_norm": 0.026697857305407524, + "learning_rate": 9.238651740113291e-08, + "loss": 0.0327, + "step": 194800 + }, + { + "epoch": 0.24405, + "grad_norm": 0.032274551689624786, + "learning_rate": 9.20318124188524e-08, + "loss": 0.0352, + "step": 194810 + }, + { + "epoch": 0.2441, + "grad_norm": 0.029393857344985008, + "learning_rate": 9.167778841106533e-08, + "loss": 0.0324, + "step": 194820 + }, + { + "epoch": 0.24415, + "grad_norm": 0.029212241992354393, + "learning_rate": 9.132444538744733e-08, + "loss": 0.0324, + "step": 194830 + }, + { + "epoch": 0.2442, + "grad_norm": 0.029543107375502586, + "learning_rate": 9.097178335766287e-08, + "loss": 0.0331, + "step": 194840 + }, + { + "epoch": 0.24425, + "grad_norm": 0.026619311422109604, + "learning_rate": 9.061980233135148e-08, + "loss": 0.0332, + "step": 194850 + }, + { + "epoch": 0.2443, + "grad_norm": 0.027571335434913635, + "learning_rate": 9.0268502318136e-08, + "loss": 0.033, + "step": 194860 + }, + { + "epoch": 0.24435, + "grad_norm": 0.028135353699326515, + "learning_rate": 8.991788332762263e-08, + "loss": 0.0329, + "step": 194870 + }, + { + "epoch": 0.2444, + "grad_norm": 0.028476586565375328, + "learning_rate": 8.95679453693954e-08, + "loss": 0.0318, + "step": 194880 + }, + { + "epoch": 0.24445, + "grad_norm": 0.02955649234354496, + "learning_rate": 8.92186884530244e-08, + "loss": 0.0317, + "step": 194890 + }, + { + "epoch": 0.2445, + "grad_norm": 0.027156932279467583, + "learning_rate": 8.887011258805478e-08, + "loss": 0.0346, + "step": 194900 + }, + { + "epoch": 0.24455, + "grad_norm": 0.02584977261722088, + "learning_rate": 8.852221778401781e-08, + "loss": 0.0325, + "step": 194910 + }, + { + "epoch": 0.2446, + "grad_norm": 0.025581881403923035, + "learning_rate": 8.817500405042256e-08, + "loss": 0.0329, + "step": 194920 + }, + { + "epoch": 0.24465, + "grad_norm": 0.02725781686604023, + "learning_rate": 8.78284713967642e-08, + "loss": 0.0337, + "step": 194930 + }, + { + "epoch": 0.2447, + "grad_norm": 0.02975981868803501, + "learning_rate": 8.748261983251849e-08, + "loss": 0.0332, + "step": 194940 + }, + { + "epoch": 0.24475, + "grad_norm": 0.02868971973657608, + "learning_rate": 8.71374493671362e-08, + "loss": 0.0327, + "step": 194950 + }, + { + "epoch": 0.2448, + "grad_norm": 0.028604021295905113, + "learning_rate": 8.679296001005976e-08, + "loss": 0.035, + "step": 194960 + }, + { + "epoch": 0.24485, + "grad_norm": 0.029460575431585312, + "learning_rate": 8.644915177070112e-08, + "loss": 0.0326, + "step": 194970 + }, + { + "epoch": 0.2449, + "grad_norm": 0.027132930234074593, + "learning_rate": 8.610602465846385e-08, + "loss": 0.0333, + "step": 194980 + }, + { + "epoch": 0.24495, + "grad_norm": 0.02888166345655918, + "learning_rate": 8.576357868272933e-08, + "loss": 0.0339, + "step": 194990 + }, + { + "epoch": 0.245, + "grad_norm": 0.0272995438426733, + "learning_rate": 8.542181385285952e-08, + "loss": 0.0339, + "step": 195000 + }, + { + "epoch": 0.24505, + "grad_norm": 0.0266796313226223, + "learning_rate": 8.508073017819418e-08, + "loss": 0.0338, + "step": 195010 + }, + { + "epoch": 0.2451, + "grad_norm": 0.02482810989022255, + "learning_rate": 8.474032766806472e-08, + "loss": 0.0354, + "step": 195020 + }, + { + "epoch": 0.24515, + "grad_norm": 0.027011895552277565, + "learning_rate": 8.440060633177483e-08, + "loss": 0.0333, + "step": 195030 + }, + { + "epoch": 0.2452, + "grad_norm": 0.026108799502253532, + "learning_rate": 8.406156617860872e-08, + "loss": 0.0339, + "step": 195040 + }, + { + "epoch": 0.24525, + "grad_norm": 0.030398061498999596, + "learning_rate": 8.372320721783955e-08, + "loss": 0.0334, + "step": 195050 + }, + { + "epoch": 0.2453, + "grad_norm": 0.02703634649515152, + "learning_rate": 8.338552945871825e-08, + "loss": 0.0328, + "step": 195060 + }, + { + "epoch": 0.24535, + "grad_norm": 0.02893189899623394, + "learning_rate": 8.304853291047631e-08, + "loss": 0.0345, + "step": 195070 + }, + { + "epoch": 0.2454, + "grad_norm": 0.027915995568037033, + "learning_rate": 8.271221758232583e-08, + "loss": 0.0323, + "step": 195080 + }, + { + "epoch": 0.24545, + "grad_norm": 0.02771705575287342, + "learning_rate": 8.237658348346222e-08, + "loss": 0.0342, + "step": 195090 + }, + { + "epoch": 0.2455, + "grad_norm": 0.02994770184159279, + "learning_rate": 8.204163062306425e-08, + "loss": 0.0335, + "step": 195100 + }, + { + "epoch": 0.24555, + "grad_norm": 0.030637120828032494, + "learning_rate": 8.17073590102857e-08, + "loss": 0.0331, + "step": 195110 + }, + { + "epoch": 0.2456, + "grad_norm": 0.025169167667627335, + "learning_rate": 8.137376865426371e-08, + "loss": 0.0351, + "step": 195120 + }, + { + "epoch": 0.24565, + "grad_norm": 0.029002079740166664, + "learning_rate": 8.104085956412432e-08, + "loss": 0.0328, + "step": 195130 + }, + { + "epoch": 0.2457, + "grad_norm": 0.02710791677236557, + "learning_rate": 8.070863174896303e-08, + "loss": 0.0328, + "step": 195140 + }, + { + "epoch": 0.24575, + "grad_norm": 0.024972479790449142, + "learning_rate": 8.03770852178698e-08, + "loss": 0.0341, + "step": 195150 + }, + { + "epoch": 0.2458, + "grad_norm": 0.02914566360414028, + "learning_rate": 8.004621997990403e-08, + "loss": 0.0349, + "step": 195160 + }, + { + "epoch": 0.24585, + "grad_norm": 0.027187839150428772, + "learning_rate": 7.971603604411126e-08, + "loss": 0.0345, + "step": 195170 + }, + { + "epoch": 0.2459, + "grad_norm": 0.02553461864590645, + "learning_rate": 7.93865334195204e-08, + "loss": 0.033, + "step": 195180 + }, + { + "epoch": 0.24595, + "grad_norm": 0.02773602120578289, + "learning_rate": 7.90577121151409e-08, + "loss": 0.0321, + "step": 195190 + }, + { + "epoch": 0.246, + "grad_norm": 0.02327432855963707, + "learning_rate": 7.872957213996002e-08, + "loss": 0.0337, + "step": 195200 + }, + { + "epoch": 0.24605, + "grad_norm": 0.025786172598600388, + "learning_rate": 7.840211350295112e-08, + "loss": 0.0332, + "step": 195210 + }, + { + "epoch": 0.2461, + "grad_norm": 0.029511207714676857, + "learning_rate": 7.80753362130654e-08, + "loss": 0.0329, + "step": 195220 + }, + { + "epoch": 0.24615, + "grad_norm": 0.026267917826771736, + "learning_rate": 7.774924027923736e-08, + "loss": 0.0315, + "step": 195230 + }, + { + "epoch": 0.2462, + "grad_norm": 0.027054287493228912, + "learning_rate": 7.742382571037931e-08, + "loss": 0.0327, + "step": 195240 + }, + { + "epoch": 0.24625, + "grad_norm": 0.02936594747006893, + "learning_rate": 7.709909251539526e-08, + "loss": 0.0322, + "step": 195250 + }, + { + "epoch": 0.2463, + "grad_norm": 0.025670327246189117, + "learning_rate": 7.677504070315589e-08, + "loss": 0.0334, + "step": 195260 + }, + { + "epoch": 0.24635, + "grad_norm": 0.02422959916293621, + "learning_rate": 7.645167028252631e-08, + "loss": 0.0318, + "step": 195270 + }, + { + "epoch": 0.2464, + "grad_norm": 0.030943427234888077, + "learning_rate": 7.612898126234114e-08, + "loss": 0.0347, + "step": 195280 + }, + { + "epoch": 0.24645, + "grad_norm": 0.028920264914631844, + "learning_rate": 7.580697365142941e-08, + "loss": 0.0317, + "step": 195290 + }, + { + "epoch": 0.2465, + "grad_norm": 0.02755330316722393, + "learning_rate": 7.548564745858965e-08, + "loss": 0.0318, + "step": 195300 + }, + { + "epoch": 0.24655, + "grad_norm": 0.031554482877254486, + "learning_rate": 7.516500269260929e-08, + "loss": 0.0324, + "step": 195310 + }, + { + "epoch": 0.2466, + "grad_norm": 0.02594132535159588, + "learning_rate": 7.484503936225629e-08, + "loss": 0.0322, + "step": 195320 + }, + { + "epoch": 0.24665, + "grad_norm": 0.025025133043527603, + "learning_rate": 7.452575747627088e-08, + "loss": 0.0329, + "step": 195330 + }, + { + "epoch": 0.2467, + "grad_norm": 0.029942601919174194, + "learning_rate": 7.420715704339054e-08, + "loss": 0.033, + "step": 195340 + }, + { + "epoch": 0.24675, + "grad_norm": 0.027799487113952637, + "learning_rate": 7.388923807232217e-08, + "loss": 0.0309, + "step": 195350 + }, + { + "epoch": 0.2468, + "grad_norm": 0.027677707374095917, + "learning_rate": 7.357200057175606e-08, + "loss": 0.0327, + "step": 195360 + }, + { + "epoch": 0.24685, + "grad_norm": 0.0252380333840847, + "learning_rate": 7.325544455036859e-08, + "loss": 0.0333, + "step": 195370 + }, + { + "epoch": 0.2469, + "grad_norm": 0.02922498993575573, + "learning_rate": 7.293957001681395e-08, + "loss": 0.0314, + "step": 195380 + }, + { + "epoch": 0.24695, + "grad_norm": 0.025162069126963615, + "learning_rate": 7.262437697972413e-08, + "loss": 0.0307, + "step": 195390 + }, + { + "epoch": 0.247, + "grad_norm": 0.02564503252506256, + "learning_rate": 7.230986544772001e-08, + "loss": 0.0316, + "step": 195400 + }, + { + "epoch": 0.24705, + "grad_norm": 0.028158506378531456, + "learning_rate": 7.199603542940026e-08, + "loss": 0.0318, + "step": 195410 + }, + { + "epoch": 0.2471, + "grad_norm": 0.0253047663718462, + "learning_rate": 7.168288693334135e-08, + "loss": 0.0349, + "step": 195420 + }, + { + "epoch": 0.24715, + "grad_norm": 0.028382329270243645, + "learning_rate": 7.137041996810866e-08, + "loss": 0.0317, + "step": 195430 + }, + { + "epoch": 0.2472, + "grad_norm": 0.02788013219833374, + "learning_rate": 7.105863454224537e-08, + "loss": 0.0325, + "step": 195440 + }, + { + "epoch": 0.24725, + "grad_norm": 0.026838254183530807, + "learning_rate": 7.074753066427242e-08, + "loss": 0.0353, + "step": 195450 + }, + { + "epoch": 0.2473, + "grad_norm": 0.029704350978136063, + "learning_rate": 7.043710834269413e-08, + "loss": 0.0321, + "step": 195460 + }, + { + "epoch": 0.24735, + "grad_norm": 0.02257765829563141, + "learning_rate": 7.012736758600091e-08, + "loss": 0.0325, + "step": 195470 + }, + { + "epoch": 0.2474, + "grad_norm": 0.024150114506483078, + "learning_rate": 6.981830840266102e-08, + "loss": 0.0319, + "step": 195480 + }, + { + "epoch": 0.24745, + "grad_norm": 0.025648394599556923, + "learning_rate": 6.950993080112322e-08, + "loss": 0.0316, + "step": 195490 + }, + { + "epoch": 0.2475, + "grad_norm": 0.027938872575759888, + "learning_rate": 6.92022347898169e-08, + "loss": 0.0314, + "step": 195500 + }, + { + "epoch": 0.24755, + "grad_norm": 0.027293045073747635, + "learning_rate": 6.889522037715479e-08, + "loss": 0.0312, + "step": 195510 + }, + { + "epoch": 0.2476, + "grad_norm": 0.028072131797671318, + "learning_rate": 6.858888757153014e-08, + "loss": 0.0316, + "step": 195520 + }, + { + "epoch": 0.24765, + "grad_norm": 0.0319143608212471, + "learning_rate": 6.828323638132239e-08, + "loss": 0.0321, + "step": 195530 + }, + { + "epoch": 0.2477, + "grad_norm": 0.026307567954063416, + "learning_rate": 6.797826681488318e-08, + "loss": 0.0319, + "step": 195540 + }, + { + "epoch": 0.24775, + "grad_norm": 0.026995103806257248, + "learning_rate": 6.767397888055027e-08, + "loss": 0.0318, + "step": 195550 + }, + { + "epoch": 0.2478, + "grad_norm": 0.028282200917601585, + "learning_rate": 6.73703725866448e-08, + "loss": 0.0338, + "step": 195560 + }, + { + "epoch": 0.24785, + "grad_norm": 0.027422990649938583, + "learning_rate": 6.706744794146846e-08, + "loss": 0.0325, + "step": 195570 + }, + { + "epoch": 0.2479, + "grad_norm": 0.025190260261297226, + "learning_rate": 6.676520495329797e-08, + "loss": 0.0328, + "step": 195580 + }, + { + "epoch": 0.24795, + "grad_norm": 0.026048485189676285, + "learning_rate": 6.64636436304017e-08, + "loss": 0.0333, + "step": 195590 + }, + { + "epoch": 0.248, + "grad_norm": 0.028442546725273132, + "learning_rate": 6.616276398102306e-08, + "loss": 0.0318, + "step": 195600 + }, + { + "epoch": 0.24805, + "grad_norm": 0.028586098924279213, + "learning_rate": 6.586256601338603e-08, + "loss": 0.0322, + "step": 195610 + }, + { + "epoch": 0.2481, + "grad_norm": 0.026411043480038643, + "learning_rate": 6.556304973569794e-08, + "loss": 0.0328, + "step": 195620 + }, + { + "epoch": 0.24815, + "grad_norm": 0.02859685756266117, + "learning_rate": 6.526421515615222e-08, + "loss": 0.0327, + "step": 195630 + }, + { + "epoch": 0.2482, + "grad_norm": 0.029196694493293762, + "learning_rate": 6.49660622829118e-08, + "loss": 0.0314, + "step": 195640 + }, + { + "epoch": 0.24825, + "grad_norm": 0.029072273522615433, + "learning_rate": 6.466859112413404e-08, + "loss": 0.0323, + "step": 195650 + }, + { + "epoch": 0.2483, + "grad_norm": 0.024802634492516518, + "learning_rate": 6.437180168794853e-08, + "loss": 0.0312, + "step": 195660 + }, + { + "epoch": 0.24835, + "grad_norm": 0.029871709644794464, + "learning_rate": 6.407569398246827e-08, + "loss": 0.0322, + "step": 195670 + }, + { + "epoch": 0.2484, + "grad_norm": 0.02558273635804653, + "learning_rate": 6.378026801579229e-08, + "loss": 0.0303, + "step": 195680 + }, + { + "epoch": 0.24845, + "grad_norm": 0.02865140326321125, + "learning_rate": 6.34855237959947e-08, + "loss": 0.0336, + "step": 195690 + }, + { + "epoch": 0.2485, + "grad_norm": 0.0272515956312418, + "learning_rate": 6.319146133113574e-08, + "loss": 0.0301, + "step": 195700 + }, + { + "epoch": 0.24855, + "grad_norm": 0.02938549779355526, + "learning_rate": 6.28980806292534e-08, + "loss": 0.0312, + "step": 195710 + }, + { + "epoch": 0.2486, + "grad_norm": 0.02831849828362465, + "learning_rate": 6.260538169836905e-08, + "loss": 0.0322, + "step": 195720 + }, + { + "epoch": 0.24865, + "grad_norm": 0.028205547481775284, + "learning_rate": 6.231336454648462e-08, + "loss": 0.0306, + "step": 195730 + }, + { + "epoch": 0.2487, + "grad_norm": 0.027268605306744576, + "learning_rate": 6.20220291815854e-08, + "loss": 0.032, + "step": 195740 + }, + { + "epoch": 0.24875, + "grad_norm": 0.029969947412610054, + "learning_rate": 6.173137561163444e-08, + "loss": 0.032, + "step": 195750 + }, + { + "epoch": 0.2488, + "grad_norm": 0.029885603114962578, + "learning_rate": 6.14414038445782e-08, + "loss": 0.0337, + "step": 195760 + }, + { + "epoch": 0.24885, + "grad_norm": 0.026608698070049286, + "learning_rate": 6.115211388834641e-08, + "loss": 0.0327, + "step": 195770 + }, + { + "epoch": 0.2489, + "grad_norm": 0.026822002604603767, + "learning_rate": 6.086350575084665e-08, + "loss": 0.0329, + "step": 195780 + }, + { + "epoch": 0.24895, + "grad_norm": 0.02918635495007038, + "learning_rate": 6.057557943996705e-08, + "loss": 0.0368, + "step": 195790 + }, + { + "epoch": 0.249, + "grad_norm": 0.026154810562729836, + "learning_rate": 6.028833496358465e-08, + "loss": 0.0343, + "step": 195800 + }, + { + "epoch": 0.24905, + "grad_norm": 0.028903482481837273, + "learning_rate": 6.000177232954873e-08, + "loss": 0.0332, + "step": 195810 + }, + { + "epoch": 0.2491, + "grad_norm": 0.02927960641682148, + "learning_rate": 5.971589154569468e-08, + "loss": 0.0354, + "step": 195820 + }, + { + "epoch": 0.24915, + "grad_norm": 0.02714477851986885, + "learning_rate": 5.943069261983847e-08, + "loss": 0.0345, + "step": 195830 + }, + { + "epoch": 0.2492, + "grad_norm": 0.02752465382218361, + "learning_rate": 5.914617555977664e-08, + "loss": 0.0335, + "step": 195840 + }, + { + "epoch": 0.24925, + "grad_norm": 0.028988268226385117, + "learning_rate": 5.8862340373289085e-08, + "loss": 0.034, + "step": 195850 + }, + { + "epoch": 0.2493, + "grad_norm": 0.02949836663901806, + "learning_rate": 5.857918706813625e-08, + "loss": 0.0347, + "step": 195860 + }, + { + "epoch": 0.24935, + "grad_norm": 0.024647751823067665, + "learning_rate": 5.829671565205641e-08, + "loss": 0.0343, + "step": 195870 + }, + { + "epoch": 0.2494, + "grad_norm": 0.02909855730831623, + "learning_rate": 5.8014926132776706e-08, + "loss": 0.0333, + "step": 195880 + }, + { + "epoch": 0.24945, + "grad_norm": 0.023018838837742805, + "learning_rate": 5.773381851799653e-08, + "loss": 0.0336, + "step": 195890 + }, + { + "epoch": 0.2495, + "grad_norm": 0.02698889747262001, + "learning_rate": 5.7453392815404185e-08, + "loss": 0.0339, + "step": 195900 + }, + { + "epoch": 0.24955, + "grad_norm": 0.022461283951997757, + "learning_rate": 5.7173649032665756e-08, + "loss": 0.0332, + "step": 195910 + }, + { + "epoch": 0.2496, + "grad_norm": 0.02490866556763649, + "learning_rate": 5.6894587177427904e-08, + "loss": 0.0336, + "step": 195920 + }, + { + "epoch": 0.24965, + "grad_norm": 0.03168974071741104, + "learning_rate": 5.661620725732342e-08, + "loss": 0.0339, + "step": 195930 + }, + { + "epoch": 0.2497, + "grad_norm": 0.0300124604254961, + "learning_rate": 5.63385092799601e-08, + "loss": 0.0342, + "step": 195940 + }, + { + "epoch": 0.24975, + "grad_norm": 0.028176583349704742, + "learning_rate": 5.606149325293186e-08, + "loss": 0.0348, + "step": 195950 + }, + { + "epoch": 0.2498, + "grad_norm": 0.032841626554727554, + "learning_rate": 5.5785159183810445e-08, + "loss": 0.033, + "step": 195960 + }, + { + "epoch": 0.24985, + "grad_norm": 0.02682030200958252, + "learning_rate": 5.55095070801509e-08, + "loss": 0.0329, + "step": 195970 + }, + { + "epoch": 0.2499, + "grad_norm": 0.031123394146561623, + "learning_rate": 5.5234536949491656e-08, + "loss": 0.0342, + "step": 195980 + }, + { + "epoch": 0.24995, + "grad_norm": 0.024029873311519623, + "learning_rate": 5.496024879934891e-08, + "loss": 0.0327, + "step": 195990 + }, + { + "epoch": 0.25, + "grad_norm": 0.025896171107888222, + "learning_rate": 5.4686642637219455e-08, + "loss": 0.0328, + "step": 196000 + } + ], + "logging_steps": 10, + "max_steps": 200000, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}