diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10534 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.670520231213873, + "eval_steps": 500, + "global_step": 15000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005780346820809248, + "grad_norm": 3.1595253944396973, + "learning_rate": 3.6e-07, + "loss": 0.3371, + "step": 10 + }, + { + "epoch": 0.011560693641618497, + "grad_norm": 3.4880928993225098, + "learning_rate": 7.6e-07, + "loss": 0.3336, + "step": 20 + }, + { + "epoch": 0.017341040462427744, + "grad_norm": 3.2054455280303955, + "learning_rate": 1.16e-06, + "loss": 0.3104, + "step": 30 + }, + { + "epoch": 0.023121387283236993, + "grad_norm": 2.7082252502441406, + "learning_rate": 1.56e-06, + "loss": 0.2878, + "step": 40 + }, + { + "epoch": 0.028901734104046242, + "grad_norm": 1.6240012645721436, + "learning_rate": 1.96e-06, + "loss": 0.1887, + "step": 50 + }, + { + "epoch": 0.03468208092485549, + "grad_norm": 0.4911483824253082, + "learning_rate": 2.36e-06, + "loss": 0.1468, + "step": 60 + }, + { + "epoch": 0.04046242774566474, + "grad_norm": 0.6908175945281982, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.1346, + "step": 70 + }, + { + "epoch": 0.046242774566473986, + "grad_norm": 0.4389197826385498, + "learning_rate": 3.1600000000000007e-06, + "loss": 0.131, + "step": 80 + }, + { + "epoch": 0.05202312138728324, + "grad_norm": 0.5299481153488159, + "learning_rate": 3.5600000000000002e-06, + "loss": 0.0966, + "step": 90 + }, + { + "epoch": 0.057803468208092484, + "grad_norm": 0.33272606134414673, + "learning_rate": 3.96e-06, + "loss": 0.1039, + "step": 100 + }, + { + "epoch": 0.06358381502890173, + "grad_norm": 0.2846597135066986, + "learning_rate": 4.360000000000001e-06, + "loss": 0.0933, + "step": 110 + }, + { + "epoch": 0.06936416184971098, + "grad_norm": 0.21347007155418396, + "learning_rate": 4.76e-06, + "loss": 0.0802, + "step": 120 + }, + { + "epoch": 0.07514450867052024, + "grad_norm": 0.24284492433071136, + "learning_rate": 5.1600000000000006e-06, + "loss": 0.0858, + "step": 130 + }, + { + "epoch": 0.08092485549132948, + "grad_norm": 0.19188109040260315, + "learning_rate": 5.56e-06, + "loss": 0.0793, + "step": 140 + }, + { + "epoch": 0.08670520231213873, + "grad_norm": 0.15339428186416626, + "learning_rate": 5.9600000000000005e-06, + "loss": 0.0744, + "step": 150 + }, + { + "epoch": 0.09248554913294797, + "grad_norm": 0.16470389068126678, + "learning_rate": 6.360000000000001e-06, + "loss": 0.0705, + "step": 160 + }, + { + "epoch": 0.09826589595375723, + "grad_norm": 0.15767133235931396, + "learning_rate": 6.76e-06, + "loss": 0.0665, + "step": 170 + }, + { + "epoch": 0.10404624277456648, + "grad_norm": 0.11162696778774261, + "learning_rate": 7.16e-06, + "loss": 0.0609, + "step": 180 + }, + { + "epoch": 0.10982658959537572, + "grad_norm": 0.08929910510778427, + "learning_rate": 7.5600000000000005e-06, + "loss": 0.0617, + "step": 190 + }, + { + "epoch": 0.11560693641618497, + "grad_norm": 0.11486585438251495, + "learning_rate": 7.96e-06, + "loss": 0.0558, + "step": 200 + }, + { + "epoch": 0.12138728323699421, + "grad_norm": 0.1365112066268921, + "learning_rate": 8.36e-06, + "loss": 0.0529, + "step": 210 + }, + { + "epoch": 0.12716763005780346, + "grad_norm": 0.14509879052639008, + "learning_rate": 8.76e-06, + "loss": 0.0523, + "step": 220 + }, + { + "epoch": 0.1329479768786127, + "grad_norm": 0.11319673806428909, + "learning_rate": 9.16e-06, + "loss": 0.0454, + "step": 230 + }, + { + "epoch": 0.13872832369942195, + "grad_norm": 0.1477111279964447, + "learning_rate": 9.560000000000002e-06, + "loss": 0.0474, + "step": 240 + }, + { + "epoch": 0.14450867052023122, + "grad_norm": 0.10854203253984451, + "learning_rate": 9.96e-06, + "loss": 0.0393, + "step": 250 + }, + { + "epoch": 0.15028901734104047, + "grad_norm": 0.11513552814722061, + "learning_rate": 1.036e-05, + "loss": 0.045, + "step": 260 + }, + { + "epoch": 0.15606936416184972, + "grad_norm": 0.11579402536153793, + "learning_rate": 1.076e-05, + "loss": 0.0355, + "step": 270 + }, + { + "epoch": 0.16184971098265896, + "grad_norm": 0.11395751684904099, + "learning_rate": 1.1160000000000002e-05, + "loss": 0.0422, + "step": 280 + }, + { + "epoch": 0.1676300578034682, + "grad_norm": 0.12264841049909592, + "learning_rate": 1.156e-05, + "loss": 0.0396, + "step": 290 + }, + { + "epoch": 0.17341040462427745, + "grad_norm": 0.1499921679496765, + "learning_rate": 1.196e-05, + "loss": 0.0368, + "step": 300 + }, + { + "epoch": 0.1791907514450867, + "grad_norm": 0.1338682770729065, + "learning_rate": 1.236e-05, + "loss": 0.035, + "step": 310 + }, + { + "epoch": 0.18497109826589594, + "grad_norm": 0.12111975252628326, + "learning_rate": 1.276e-05, + "loss": 0.0327, + "step": 320 + }, + { + "epoch": 0.1907514450867052, + "grad_norm": 0.08808861672878265, + "learning_rate": 1.316e-05, + "loss": 0.0318, + "step": 330 + }, + { + "epoch": 0.19653179190751446, + "grad_norm": 0.14213015139102936, + "learning_rate": 1.356e-05, + "loss": 0.0301, + "step": 340 + }, + { + "epoch": 0.2023121387283237, + "grad_norm": 0.11863279342651367, + "learning_rate": 1.396e-05, + "loss": 0.0271, + "step": 350 + }, + { + "epoch": 0.20809248554913296, + "grad_norm": 0.13171134889125824, + "learning_rate": 1.4360000000000001e-05, + "loss": 0.0356, + "step": 360 + }, + { + "epoch": 0.2138728323699422, + "grad_norm": 0.13477171957492828, + "learning_rate": 1.4760000000000001e-05, + "loss": 0.03, + "step": 370 + }, + { + "epoch": 0.21965317919075145, + "grad_norm": 0.17239578068256378, + "learning_rate": 1.5160000000000002e-05, + "loss": 0.0305, + "step": 380 + }, + { + "epoch": 0.2254335260115607, + "grad_norm": 0.11451636254787445, + "learning_rate": 1.556e-05, + "loss": 0.0288, + "step": 390 + }, + { + "epoch": 0.23121387283236994, + "grad_norm": 0.1459856629371643, + "learning_rate": 1.596e-05, + "loss": 0.0263, + "step": 400 + }, + { + "epoch": 0.23699421965317918, + "grad_norm": 0.11896130442619324, + "learning_rate": 1.636e-05, + "loss": 0.03, + "step": 410 + }, + { + "epoch": 0.24277456647398843, + "grad_norm": 0.09973743557929993, + "learning_rate": 1.6760000000000002e-05, + "loss": 0.0262, + "step": 420 + }, + { + "epoch": 0.24855491329479767, + "grad_norm": 0.13354068994522095, + "learning_rate": 1.7160000000000002e-05, + "loss": 0.024, + "step": 430 + }, + { + "epoch": 0.2543352601156069, + "grad_norm": 0.15291906893253326, + "learning_rate": 1.756e-05, + "loss": 0.0243, + "step": 440 + }, + { + "epoch": 0.26011560693641617, + "grad_norm": 0.16498644649982452, + "learning_rate": 1.796e-05, + "loss": 0.0232, + "step": 450 + }, + { + "epoch": 0.2658959537572254, + "grad_norm": 0.1057974100112915, + "learning_rate": 1.8360000000000004e-05, + "loss": 0.0211, + "step": 460 + }, + { + "epoch": 0.27167630057803466, + "grad_norm": 0.10222145169973373, + "learning_rate": 1.876e-05, + "loss": 0.0218, + "step": 470 + }, + { + "epoch": 0.2774566473988439, + "grad_norm": 0.10169381648302078, + "learning_rate": 1.916e-05, + "loss": 0.0211, + "step": 480 + }, + { + "epoch": 0.2832369942196532, + "grad_norm": 0.15869389474391937, + "learning_rate": 1.956e-05, + "loss": 0.0237, + "step": 490 + }, + { + "epoch": 0.28901734104046245, + "grad_norm": 0.16140298545360565, + "learning_rate": 1.9960000000000002e-05, + "loss": 0.0198, + "step": 500 + }, + { + "epoch": 0.2947976878612717, + "grad_norm": 0.1119980588555336, + "learning_rate": 2.036e-05, + "loss": 0.0203, + "step": 510 + }, + { + "epoch": 0.30057803468208094, + "grad_norm": 0.09472450613975525, + "learning_rate": 2.076e-05, + "loss": 0.0211, + "step": 520 + }, + { + "epoch": 0.3063583815028902, + "grad_norm": 0.1749098151922226, + "learning_rate": 2.116e-05, + "loss": 0.0189, + "step": 530 + }, + { + "epoch": 0.31213872832369943, + "grad_norm": 0.13768576085567474, + "learning_rate": 2.1560000000000004e-05, + "loss": 0.0175, + "step": 540 + }, + { + "epoch": 0.3179190751445087, + "grad_norm": 0.13592314720153809, + "learning_rate": 2.196e-05, + "loss": 0.0258, + "step": 550 + }, + { + "epoch": 0.3236994219653179, + "grad_norm": 0.1005687341094017, + "learning_rate": 2.236e-05, + "loss": 0.0179, + "step": 560 + }, + { + "epoch": 0.32947976878612717, + "grad_norm": 0.14020080864429474, + "learning_rate": 2.2760000000000002e-05, + "loss": 0.02, + "step": 570 + }, + { + "epoch": 0.3352601156069364, + "grad_norm": 0.10146922618150711, + "learning_rate": 2.3160000000000002e-05, + "loss": 0.0176, + "step": 580 + }, + { + "epoch": 0.34104046242774566, + "grad_norm": 0.1250441074371338, + "learning_rate": 2.356e-05, + "loss": 0.0169, + "step": 590 + }, + { + "epoch": 0.3468208092485549, + "grad_norm": 0.13686810433864594, + "learning_rate": 2.396e-05, + "loss": 0.0186, + "step": 600 + }, + { + "epoch": 0.35260115606936415, + "grad_norm": 0.15110881626605988, + "learning_rate": 2.4360000000000004e-05, + "loss": 0.021, + "step": 610 + }, + { + "epoch": 0.3583815028901734, + "grad_norm": 0.14723263680934906, + "learning_rate": 2.476e-05, + "loss": 0.0226, + "step": 620 + }, + { + "epoch": 0.36416184971098264, + "grad_norm": 0.11678220331668854, + "learning_rate": 2.516e-05, + "loss": 0.0151, + "step": 630 + }, + { + "epoch": 0.3699421965317919, + "grad_norm": 0.18138806521892548, + "learning_rate": 2.556e-05, + "loss": 0.0182, + "step": 640 + }, + { + "epoch": 0.37572254335260113, + "grad_norm": 0.14597956836223602, + "learning_rate": 2.5960000000000002e-05, + "loss": 0.0148, + "step": 650 + }, + { + "epoch": 0.3815028901734104, + "grad_norm": 0.16550736129283905, + "learning_rate": 2.6360000000000002e-05, + "loss": 0.0175, + "step": 660 + }, + { + "epoch": 0.3872832369942196, + "grad_norm": 0.12392124533653259, + "learning_rate": 2.676e-05, + "loss": 0.0178, + "step": 670 + }, + { + "epoch": 0.3930635838150289, + "grad_norm": 0.14373187720775604, + "learning_rate": 2.716e-05, + "loss": 0.0151, + "step": 680 + }, + { + "epoch": 0.3988439306358382, + "grad_norm": 0.10784381628036499, + "learning_rate": 2.7560000000000004e-05, + "loss": 0.0153, + "step": 690 + }, + { + "epoch": 0.4046242774566474, + "grad_norm": 0.12487441301345825, + "learning_rate": 2.7960000000000003e-05, + "loss": 0.0146, + "step": 700 + }, + { + "epoch": 0.41040462427745666, + "grad_norm": 0.1104297786951065, + "learning_rate": 2.8360000000000003e-05, + "loss": 0.0174, + "step": 710 + }, + { + "epoch": 0.4161849710982659, + "grad_norm": 0.09142022579908371, + "learning_rate": 2.8760000000000002e-05, + "loss": 0.0142, + "step": 720 + }, + { + "epoch": 0.42196531791907516, + "grad_norm": 0.12280980497598648, + "learning_rate": 2.9160000000000005e-05, + "loss": 0.0147, + "step": 730 + }, + { + "epoch": 0.4277456647398844, + "grad_norm": 0.12440512329339981, + "learning_rate": 2.9559999999999998e-05, + "loss": 0.0131, + "step": 740 + }, + { + "epoch": 0.43352601156069365, + "grad_norm": 0.09911549091339111, + "learning_rate": 2.9959999999999998e-05, + "loss": 0.0141, + "step": 750 + }, + { + "epoch": 0.4393063583815029, + "grad_norm": 0.12997058033943176, + "learning_rate": 3.036e-05, + "loss": 0.0121, + "step": 760 + }, + { + "epoch": 0.44508670520231214, + "grad_norm": 0.14805808663368225, + "learning_rate": 3.076e-05, + "loss": 0.0148, + "step": 770 + }, + { + "epoch": 0.4508670520231214, + "grad_norm": 0.10276526212692261, + "learning_rate": 3.116e-05, + "loss": 0.0134, + "step": 780 + }, + { + "epoch": 0.45664739884393063, + "grad_norm": 0.15157487988471985, + "learning_rate": 3.156e-05, + "loss": 0.0143, + "step": 790 + }, + { + "epoch": 0.4624277456647399, + "grad_norm": 0.10993634164333344, + "learning_rate": 3.196e-05, + "loss": 0.0151, + "step": 800 + }, + { + "epoch": 0.4682080924855491, + "grad_norm": 0.11326078325510025, + "learning_rate": 3.236e-05, + "loss": 0.0162, + "step": 810 + }, + { + "epoch": 0.47398843930635837, + "grad_norm": 0.13100625574588776, + "learning_rate": 3.2760000000000005e-05, + "loss": 0.013, + "step": 820 + }, + { + "epoch": 0.4797687861271676, + "grad_norm": 0.18897277116775513, + "learning_rate": 3.316e-05, + "loss": 0.0139, + "step": 830 + }, + { + "epoch": 0.48554913294797686, + "grad_norm": 0.17949187755584717, + "learning_rate": 3.3560000000000004e-05, + "loss": 0.011, + "step": 840 + }, + { + "epoch": 0.4913294797687861, + "grad_norm": 0.10038192570209503, + "learning_rate": 3.396e-05, + "loss": 0.0119, + "step": 850 + }, + { + "epoch": 0.49710982658959535, + "grad_norm": 0.16344571113586426, + "learning_rate": 3.436e-05, + "loss": 0.0133, + "step": 860 + }, + { + "epoch": 0.5028901734104047, + "grad_norm": 0.23551280796527863, + "learning_rate": 3.4760000000000006e-05, + "loss": 0.0134, + "step": 870 + }, + { + "epoch": 0.5086705202312138, + "grad_norm": 0.19831761717796326, + "learning_rate": 3.516e-05, + "loss": 0.0123, + "step": 880 + }, + { + "epoch": 0.5144508670520231, + "grad_norm": 0.19457103312015533, + "learning_rate": 3.5560000000000005e-05, + "loss": 0.0151, + "step": 890 + }, + { + "epoch": 0.5202312138728323, + "grad_norm": 0.1299736499786377, + "learning_rate": 3.596e-05, + "loss": 0.0138, + "step": 900 + }, + { + "epoch": 0.5260115606936416, + "grad_norm": 0.1533757746219635, + "learning_rate": 3.636e-05, + "loss": 0.0161, + "step": 910 + }, + { + "epoch": 0.5317919075144508, + "grad_norm": 0.14695550501346588, + "learning_rate": 3.676e-05, + "loss": 0.0119, + "step": 920 + }, + { + "epoch": 0.5375722543352601, + "grad_norm": 0.12472260743379593, + "learning_rate": 3.716e-05, + "loss": 0.013, + "step": 930 + }, + { + "epoch": 0.5433526011560693, + "grad_norm": 0.14407047629356384, + "learning_rate": 3.756e-05, + "loss": 0.0165, + "step": 940 + }, + { + "epoch": 0.5491329479768786, + "grad_norm": 0.11574450135231018, + "learning_rate": 3.796e-05, + "loss": 0.0137, + "step": 950 + }, + { + "epoch": 0.5549132947976878, + "grad_norm": 0.17657427489757538, + "learning_rate": 3.836e-05, + "loss": 0.0109, + "step": 960 + }, + { + "epoch": 0.5606936416184971, + "grad_norm": 0.11555900424718857, + "learning_rate": 3.876e-05, + "loss": 0.0148, + "step": 970 + }, + { + "epoch": 0.5664739884393064, + "grad_norm": 0.11354225873947144, + "learning_rate": 3.9160000000000005e-05, + "loss": 0.0093, + "step": 980 + }, + { + "epoch": 0.5722543352601156, + "grad_norm": 0.08584084361791611, + "learning_rate": 3.956e-05, + "loss": 0.0111, + "step": 990 + }, + { + "epoch": 0.5780346820809249, + "grad_norm": 0.06208997219800949, + "learning_rate": 3.9960000000000004e-05, + "loss": 0.0117, + "step": 1000 + }, + { + "epoch": 0.5838150289017341, + "grad_norm": 0.07749241590499878, + "learning_rate": 4.0360000000000007e-05, + "loss": 0.0088, + "step": 1010 + }, + { + "epoch": 0.5895953757225434, + "grad_norm": 0.1077214777469635, + "learning_rate": 4.076e-05, + "loss": 0.0107, + "step": 1020 + }, + { + "epoch": 0.5953757225433526, + "grad_norm": 0.12400258332490921, + "learning_rate": 4.1160000000000006e-05, + "loss": 0.0117, + "step": 1030 + }, + { + "epoch": 0.6011560693641619, + "grad_norm": 0.10394832491874695, + "learning_rate": 4.156e-05, + "loss": 0.0089, + "step": 1040 + }, + { + "epoch": 0.6069364161849711, + "grad_norm": 0.0824170783162117, + "learning_rate": 4.196e-05, + "loss": 0.0099, + "step": 1050 + }, + { + "epoch": 0.6127167630057804, + "grad_norm": 0.0983257070183754, + "learning_rate": 4.236e-05, + "loss": 0.0119, + "step": 1060 + }, + { + "epoch": 0.6184971098265896, + "grad_norm": 0.11756965517997742, + "learning_rate": 4.276e-05, + "loss": 0.0109, + "step": 1070 + }, + { + "epoch": 0.6242774566473989, + "grad_norm": 0.13317210972309113, + "learning_rate": 4.316e-05, + "loss": 0.0129, + "step": 1080 + }, + { + "epoch": 0.630057803468208, + "grad_norm": 0.1497182995080948, + "learning_rate": 4.356e-05, + "loss": 0.014, + "step": 1090 + }, + { + "epoch": 0.6358381502890174, + "grad_norm": 0.09919284284114838, + "learning_rate": 4.396e-05, + "loss": 0.0086, + "step": 1100 + }, + { + "epoch": 0.6416184971098265, + "grad_norm": 0.12873047590255737, + "learning_rate": 4.436e-05, + "loss": 0.0123, + "step": 1110 + }, + { + "epoch": 0.6473988439306358, + "grad_norm": 0.08432596921920776, + "learning_rate": 4.4760000000000005e-05, + "loss": 0.0095, + "step": 1120 + }, + { + "epoch": 0.653179190751445, + "grad_norm": 0.10877019166946411, + "learning_rate": 4.516e-05, + "loss": 0.0136, + "step": 1130 + }, + { + "epoch": 0.6589595375722543, + "grad_norm": 0.09828033298254013, + "learning_rate": 4.5560000000000004e-05, + "loss": 0.0098, + "step": 1140 + }, + { + "epoch": 0.6647398843930635, + "grad_norm": 0.17707689106464386, + "learning_rate": 4.596e-05, + "loss": 0.011, + "step": 1150 + }, + { + "epoch": 0.6705202312138728, + "grad_norm": 0.09169796109199524, + "learning_rate": 4.636e-05, + "loss": 0.0111, + "step": 1160 + }, + { + "epoch": 0.6763005780346821, + "grad_norm": 0.09853609651327133, + "learning_rate": 4.6760000000000006e-05, + "loss": 0.0091, + "step": 1170 + }, + { + "epoch": 0.6820809248554913, + "grad_norm": 0.0778835192322731, + "learning_rate": 4.716e-05, + "loss": 0.0094, + "step": 1180 + }, + { + "epoch": 0.6878612716763006, + "grad_norm": 0.07706254720687866, + "learning_rate": 4.7560000000000005e-05, + "loss": 0.0087, + "step": 1190 + }, + { + "epoch": 0.6936416184971098, + "grad_norm": 0.15445491671562195, + "learning_rate": 4.796e-05, + "loss": 0.012, + "step": 1200 + }, + { + "epoch": 0.6994219653179191, + "grad_norm": 0.10672589391469955, + "learning_rate": 4.836e-05, + "loss": 0.008, + "step": 1210 + }, + { + "epoch": 0.7052023121387283, + "grad_norm": 0.14515936374664307, + "learning_rate": 4.876e-05, + "loss": 0.0087, + "step": 1220 + }, + { + "epoch": 0.7109826589595376, + "grad_norm": 0.11830303817987442, + "learning_rate": 4.9160000000000004e-05, + "loss": 0.0095, + "step": 1230 + }, + { + "epoch": 0.7167630057803468, + "grad_norm": 0.10018444061279297, + "learning_rate": 4.956e-05, + "loss": 0.0087, + "step": 1240 + }, + { + "epoch": 0.7225433526011561, + "grad_norm": 0.09550796449184418, + "learning_rate": 4.996e-05, + "loss": 0.0093, + "step": 1250 + }, + { + "epoch": 0.7283236994219653, + "grad_norm": 0.13438743352890015, + "learning_rate": 5.0360000000000006e-05, + "loss": 0.0091, + "step": 1260 + }, + { + "epoch": 0.7341040462427746, + "grad_norm": 0.13329671323299408, + "learning_rate": 5.076000000000001e-05, + "loss": 0.011, + "step": 1270 + }, + { + "epoch": 0.7398843930635838, + "grad_norm": 0.10754700750112534, + "learning_rate": 5.1160000000000005e-05, + "loss": 0.0077, + "step": 1280 + }, + { + "epoch": 0.7456647398843931, + "grad_norm": 0.13164956867694855, + "learning_rate": 5.1559999999999994e-05, + "loss": 0.0088, + "step": 1290 + }, + { + "epoch": 0.7514450867052023, + "grad_norm": 0.07530736923217773, + "learning_rate": 5.196e-05, + "loss": 0.0086, + "step": 1300 + }, + { + "epoch": 0.7572254335260116, + "grad_norm": 0.08277012407779694, + "learning_rate": 5.236e-05, + "loss": 0.0089, + "step": 1310 + }, + { + "epoch": 0.7630057803468208, + "grad_norm": 0.1286892145872116, + "learning_rate": 5.2759999999999996e-05, + "loss": 0.0128, + "step": 1320 + }, + { + "epoch": 0.7687861271676301, + "grad_norm": 0.1276070475578308, + "learning_rate": 5.316e-05, + "loss": 0.0092, + "step": 1330 + }, + { + "epoch": 0.7745664739884393, + "grad_norm": 0.11473594605922699, + "learning_rate": 5.356e-05, + "loss": 0.0089, + "step": 1340 + }, + { + "epoch": 0.7803468208092486, + "grad_norm": 0.11573047190904617, + "learning_rate": 5.396e-05, + "loss": 0.0083, + "step": 1350 + }, + { + "epoch": 0.7861271676300579, + "grad_norm": 0.12039162963628769, + "learning_rate": 5.436e-05, + "loss": 0.0083, + "step": 1360 + }, + { + "epoch": 0.791907514450867, + "grad_norm": 0.18288345634937286, + "learning_rate": 5.476e-05, + "loss": 0.0084, + "step": 1370 + }, + { + "epoch": 0.7976878612716763, + "grad_norm": 0.1231662929058075, + "learning_rate": 5.516e-05, + "loss": 0.0095, + "step": 1380 + }, + { + "epoch": 0.8034682080924855, + "grad_norm": 0.08810202777385712, + "learning_rate": 5.556e-05, + "loss": 0.009, + "step": 1390 + }, + { + "epoch": 0.8092485549132948, + "grad_norm": 0.08831888437271118, + "learning_rate": 5.596e-05, + "loss": 0.0078, + "step": 1400 + }, + { + "epoch": 0.815028901734104, + "grad_norm": 0.15133686363697052, + "learning_rate": 5.636e-05, + "loss": 0.0114, + "step": 1410 + }, + { + "epoch": 0.8208092485549133, + "grad_norm": 0.11997071653604507, + "learning_rate": 5.6760000000000005e-05, + "loss": 0.0105, + "step": 1420 + }, + { + "epoch": 0.8265895953757225, + "grad_norm": 0.11660143733024597, + "learning_rate": 5.716e-05, + "loss": 0.008, + "step": 1430 + }, + { + "epoch": 0.8323699421965318, + "grad_norm": 0.19836877286434174, + "learning_rate": 5.7560000000000005e-05, + "loss": 0.0107, + "step": 1440 + }, + { + "epoch": 0.838150289017341, + "grad_norm": 0.16743585467338562, + "learning_rate": 5.796e-05, + "loss": 0.0072, + "step": 1450 + }, + { + "epoch": 0.8439306358381503, + "grad_norm": 0.19401532411575317, + "learning_rate": 5.8360000000000004e-05, + "loss": 0.0082, + "step": 1460 + }, + { + "epoch": 0.8497109826589595, + "grad_norm": 0.13777554035186768, + "learning_rate": 5.876000000000001e-05, + "loss": 0.0101, + "step": 1470 + }, + { + "epoch": 0.8554913294797688, + "grad_norm": 0.1695699542760849, + "learning_rate": 5.916e-05, + "loss": 0.0113, + "step": 1480 + }, + { + "epoch": 0.861271676300578, + "grad_norm": 0.14594483375549316, + "learning_rate": 5.9560000000000006e-05, + "loss": 0.01, + "step": 1490 + }, + { + "epoch": 0.8670520231213873, + "grad_norm": 0.1465466171503067, + "learning_rate": 5.996e-05, + "loss": 0.0093, + "step": 1500 + }, + { + "epoch": 0.8728323699421965, + "grad_norm": 0.16754291951656342, + "learning_rate": 6.0360000000000005e-05, + "loss": 0.0131, + "step": 1510 + }, + { + "epoch": 0.8786127167630058, + "grad_norm": 0.17738179862499237, + "learning_rate": 6.076000000000001e-05, + "loss": 0.0103, + "step": 1520 + }, + { + "epoch": 0.884393063583815, + "grad_norm": 0.1402902454137802, + "learning_rate": 6.116e-05, + "loss": 0.0095, + "step": 1530 + }, + { + "epoch": 0.8901734104046243, + "grad_norm": 0.1324438899755478, + "learning_rate": 6.156e-05, + "loss": 0.0081, + "step": 1540 + }, + { + "epoch": 0.8959537572254336, + "grad_norm": 0.08176060765981674, + "learning_rate": 6.196000000000001e-05, + "loss": 0.009, + "step": 1550 + }, + { + "epoch": 0.9017341040462428, + "grad_norm": 0.0868748277425766, + "learning_rate": 6.236e-05, + "loss": 0.0086, + "step": 1560 + }, + { + "epoch": 0.9075144508670521, + "grad_norm": 0.13637259602546692, + "learning_rate": 6.276e-05, + "loss": 0.0091, + "step": 1570 + }, + { + "epoch": 0.9132947976878613, + "grad_norm": 0.10653480142354965, + "learning_rate": 6.316000000000001e-05, + "loss": 0.0083, + "step": 1580 + }, + { + "epoch": 0.9190751445086706, + "grad_norm": 0.11942799389362335, + "learning_rate": 6.356000000000001e-05, + "loss": 0.0079, + "step": 1590 + }, + { + "epoch": 0.9248554913294798, + "grad_norm": 0.14978532493114471, + "learning_rate": 6.396e-05, + "loss": 0.0087, + "step": 1600 + }, + { + "epoch": 0.930635838150289, + "grad_norm": 0.17128850519657135, + "learning_rate": 6.436e-05, + "loss": 0.0087, + "step": 1610 + }, + { + "epoch": 0.9364161849710982, + "grad_norm": 0.10861340165138245, + "learning_rate": 6.476e-05, + "loss": 0.0078, + "step": 1620 + }, + { + "epoch": 0.9421965317919075, + "grad_norm": 0.24768634140491486, + "learning_rate": 6.515999999999999e-05, + "loss": 0.0098, + "step": 1630 + }, + { + "epoch": 0.9479768786127167, + "grad_norm": 0.11871711909770966, + "learning_rate": 6.556e-05, + "loss": 0.0078, + "step": 1640 + }, + { + "epoch": 0.953757225433526, + "grad_norm": 0.12986963987350464, + "learning_rate": 6.596e-05, + "loss": 0.0077, + "step": 1650 + }, + { + "epoch": 0.9595375722543352, + "grad_norm": 0.19239209592342377, + "learning_rate": 6.636e-05, + "loss": 0.0095, + "step": 1660 + }, + { + "epoch": 0.9653179190751445, + "grad_norm": 0.1672155112028122, + "learning_rate": 6.676e-05, + "loss": 0.0109, + "step": 1670 + }, + { + "epoch": 0.9710982658959537, + "grad_norm": 0.10741300880908966, + "learning_rate": 6.716e-05, + "loss": 0.0073, + "step": 1680 + }, + { + "epoch": 0.976878612716763, + "grad_norm": 0.1410427987575531, + "learning_rate": 6.756e-05, + "loss": 0.0086, + "step": 1690 + }, + { + "epoch": 0.9826589595375722, + "grad_norm": 0.14685547351837158, + "learning_rate": 6.796e-05, + "loss": 0.008, + "step": 1700 + }, + { + "epoch": 0.9884393063583815, + "grad_norm": 0.15410996973514557, + "learning_rate": 6.836e-05, + "loss": 0.0115, + "step": 1710 + }, + { + "epoch": 0.9942196531791907, + "grad_norm": 0.13527736067771912, + "learning_rate": 6.876e-05, + "loss": 0.0089, + "step": 1720 + }, + { + "epoch": 1.0, + "grad_norm": 0.11434699594974518, + "learning_rate": 6.916000000000001e-05, + "loss": 0.0095, + "step": 1730 + }, + { + "epoch": 1.0057803468208093, + "grad_norm": 0.12007783353328705, + "learning_rate": 6.956e-05, + "loss": 0.0075, + "step": 1740 + }, + { + "epoch": 1.0115606936416186, + "grad_norm": 0.1504870504140854, + "learning_rate": 6.996e-05, + "loss": 0.0083, + "step": 1750 + }, + { + "epoch": 1.0173410404624277, + "grad_norm": 0.1315043568611145, + "learning_rate": 7.036e-05, + "loss": 0.0079, + "step": 1760 + }, + { + "epoch": 1.023121387283237, + "grad_norm": 0.1160712018609047, + "learning_rate": 7.076000000000001e-05, + "loss": 0.0071, + "step": 1770 + }, + { + "epoch": 1.0289017341040463, + "grad_norm": 0.1722860336303711, + "learning_rate": 7.116e-05, + "loss": 0.0084, + "step": 1780 + }, + { + "epoch": 1.0346820809248556, + "grad_norm": 0.16109566390514374, + "learning_rate": 7.156e-05, + "loss": 0.0066, + "step": 1790 + }, + { + "epoch": 1.0404624277456647, + "grad_norm": 0.12346116453409195, + "learning_rate": 7.196000000000001e-05, + "loss": 0.007, + "step": 1800 + }, + { + "epoch": 1.046242774566474, + "grad_norm": 0.13088279962539673, + "learning_rate": 7.236e-05, + "loss": 0.0064, + "step": 1810 + }, + { + "epoch": 1.0520231213872833, + "grad_norm": 0.13289068639278412, + "learning_rate": 7.276e-05, + "loss": 0.007, + "step": 1820 + }, + { + "epoch": 1.0578034682080926, + "grad_norm": 0.1241140067577362, + "learning_rate": 7.316000000000001e-05, + "loss": 0.0067, + "step": 1830 + }, + { + "epoch": 1.0635838150289016, + "grad_norm": 0.12275862693786621, + "learning_rate": 7.356000000000001e-05, + "loss": 0.007, + "step": 1840 + }, + { + "epoch": 1.069364161849711, + "grad_norm": 0.09806959331035614, + "learning_rate": 7.396e-05, + "loss": 0.0064, + "step": 1850 + }, + { + "epoch": 1.0751445086705202, + "grad_norm": 0.10867589712142944, + "learning_rate": 7.436000000000001e-05, + "loss": 0.0079, + "step": 1860 + }, + { + "epoch": 1.0809248554913296, + "grad_norm": 0.09507458657026291, + "learning_rate": 7.476000000000001e-05, + "loss": 0.007, + "step": 1870 + }, + { + "epoch": 1.0867052023121386, + "grad_norm": 0.0947691947221756, + "learning_rate": 7.516e-05, + "loss": 0.0062, + "step": 1880 + }, + { + "epoch": 1.092485549132948, + "grad_norm": 0.1417185217142105, + "learning_rate": 7.556000000000002e-05, + "loss": 0.0108, + "step": 1890 + }, + { + "epoch": 1.0982658959537572, + "grad_norm": 0.13631682097911835, + "learning_rate": 7.596000000000001e-05, + "loss": 0.0079, + "step": 1900 + }, + { + "epoch": 1.1040462427745665, + "grad_norm": 0.23177769780158997, + "learning_rate": 7.636e-05, + "loss": 0.008, + "step": 1910 + }, + { + "epoch": 1.1098265895953756, + "grad_norm": 0.090873122215271, + "learning_rate": 7.676e-05, + "loss": 0.0079, + "step": 1920 + }, + { + "epoch": 1.115606936416185, + "grad_norm": 0.11183790862560272, + "learning_rate": 7.716e-05, + "loss": 0.0077, + "step": 1930 + }, + { + "epoch": 1.1213872832369942, + "grad_norm": 0.1344011276960373, + "learning_rate": 7.756e-05, + "loss": 0.0091, + "step": 1940 + }, + { + "epoch": 1.1271676300578035, + "grad_norm": 0.11749781668186188, + "learning_rate": 7.796e-05, + "loss": 0.0075, + "step": 1950 + }, + { + "epoch": 1.1329479768786128, + "grad_norm": 0.15016603469848633, + "learning_rate": 7.836e-05, + "loss": 0.01, + "step": 1960 + }, + { + "epoch": 1.138728323699422, + "grad_norm": 0.12128861248493195, + "learning_rate": 7.876e-05, + "loss": 0.0127, + "step": 1970 + }, + { + "epoch": 1.1445086705202312, + "grad_norm": 0.13656798005104065, + "learning_rate": 7.916e-05, + "loss": 0.0075, + "step": 1980 + }, + { + "epoch": 1.1502890173410405, + "grad_norm": 0.12774041295051575, + "learning_rate": 7.956e-05, + "loss": 0.0079, + "step": 1990 + }, + { + "epoch": 1.1560693641618498, + "grad_norm": 0.23355427384376526, + "learning_rate": 7.996e-05, + "loss": 0.0113, + "step": 2000 + }, + { + "epoch": 1.1618497109826589, + "grad_norm": 0.10483523458242416, + "learning_rate": 8.036e-05, + "loss": 0.008, + "step": 2010 + }, + { + "epoch": 1.1676300578034682, + "grad_norm": 0.14650487899780273, + "learning_rate": 8.076e-05, + "loss": 0.0075, + "step": 2020 + }, + { + "epoch": 1.1734104046242775, + "grad_norm": 0.1080266535282135, + "learning_rate": 8.116e-05, + "loss": 0.008, + "step": 2030 + }, + { + "epoch": 1.1791907514450868, + "grad_norm": 0.12676295638084412, + "learning_rate": 8.156e-05, + "loss": 0.0078, + "step": 2040 + }, + { + "epoch": 1.1849710982658959, + "grad_norm": 0.17598728835582733, + "learning_rate": 8.196000000000001e-05, + "loss": 0.009, + "step": 2050 + }, + { + "epoch": 1.1907514450867052, + "grad_norm": 0.16755390167236328, + "learning_rate": 8.236e-05, + "loss": 0.01, + "step": 2060 + }, + { + "epoch": 1.1965317919075145, + "grad_norm": 0.15602730214595795, + "learning_rate": 8.276e-05, + "loss": 0.0085, + "step": 2070 + }, + { + "epoch": 1.2023121387283238, + "grad_norm": 0.11544652283191681, + "learning_rate": 8.316000000000001e-05, + "loss": 0.0074, + "step": 2080 + }, + { + "epoch": 1.208092485549133, + "grad_norm": 0.09134082496166229, + "learning_rate": 8.356e-05, + "loss": 0.0072, + "step": 2090 + }, + { + "epoch": 1.2138728323699421, + "grad_norm": 0.10406164824962616, + "learning_rate": 8.396e-05, + "loss": 0.0088, + "step": 2100 + }, + { + "epoch": 1.2196531791907514, + "grad_norm": 0.0975494384765625, + "learning_rate": 8.436000000000001e-05, + "loss": 0.0059, + "step": 2110 + }, + { + "epoch": 1.2254335260115607, + "grad_norm": 0.08101125061511993, + "learning_rate": 8.476000000000001e-05, + "loss": 0.0092, + "step": 2120 + }, + { + "epoch": 1.2312138728323698, + "grad_norm": 0.0976252555847168, + "learning_rate": 8.516e-05, + "loss": 0.0067, + "step": 2130 + }, + { + "epoch": 1.2369942196531791, + "grad_norm": 0.15048253536224365, + "learning_rate": 8.556e-05, + "loss": 0.0073, + "step": 2140 + }, + { + "epoch": 1.2427745664739884, + "grad_norm": 0.1096828281879425, + "learning_rate": 8.596000000000001e-05, + "loss": 0.0065, + "step": 2150 + }, + { + "epoch": 1.2485549132947977, + "grad_norm": 0.12420912086963654, + "learning_rate": 8.636e-05, + "loss": 0.0063, + "step": 2160 + }, + { + "epoch": 1.254335260115607, + "grad_norm": 0.08858140558004379, + "learning_rate": 8.676e-05, + "loss": 0.0062, + "step": 2170 + }, + { + "epoch": 1.260115606936416, + "grad_norm": 0.10560262948274612, + "learning_rate": 8.716000000000001e-05, + "loss": 0.0073, + "step": 2180 + }, + { + "epoch": 1.2658959537572254, + "grad_norm": 0.13556477427482605, + "learning_rate": 8.756000000000001e-05, + "loss": 0.0074, + "step": 2190 + }, + { + "epoch": 1.2716763005780347, + "grad_norm": 0.10504916310310364, + "learning_rate": 8.796e-05, + "loss": 0.0078, + "step": 2200 + }, + { + "epoch": 1.2774566473988438, + "grad_norm": 0.11343459039926529, + "learning_rate": 8.836000000000001e-05, + "loss": 0.0058, + "step": 2210 + }, + { + "epoch": 1.2832369942196533, + "grad_norm": 0.09249500185251236, + "learning_rate": 8.876e-05, + "loss": 0.0067, + "step": 2220 + }, + { + "epoch": 1.2890173410404624, + "grad_norm": 0.08550640940666199, + "learning_rate": 8.916e-05, + "loss": 0.0072, + "step": 2230 + }, + { + "epoch": 1.2947976878612717, + "grad_norm": 0.10101890563964844, + "learning_rate": 8.956e-05, + "loss": 0.0083, + "step": 2240 + }, + { + "epoch": 1.300578034682081, + "grad_norm": 0.12792877852916718, + "learning_rate": 8.996e-05, + "loss": 0.0086, + "step": 2250 + }, + { + "epoch": 1.30635838150289, + "grad_norm": 0.21365466713905334, + "learning_rate": 9.036e-05, + "loss": 0.0074, + "step": 2260 + }, + { + "epoch": 1.3121387283236994, + "grad_norm": 0.18697352707386017, + "learning_rate": 9.076e-05, + "loss": 0.0068, + "step": 2270 + }, + { + "epoch": 1.3179190751445087, + "grad_norm": 0.16394391655921936, + "learning_rate": 9.116e-05, + "loss": 0.0084, + "step": 2280 + }, + { + "epoch": 1.323699421965318, + "grad_norm": 0.12319466471672058, + "learning_rate": 9.156e-05, + "loss": 0.0078, + "step": 2290 + }, + { + "epoch": 1.3294797687861273, + "grad_norm": 0.11505721509456635, + "learning_rate": 9.196000000000001e-05, + "loss": 0.0084, + "step": 2300 + }, + { + "epoch": 1.3352601156069364, + "grad_norm": 0.0842253565788269, + "learning_rate": 9.236e-05, + "loss": 0.007, + "step": 2310 + }, + { + "epoch": 1.3410404624277457, + "grad_norm": 0.10776695609092712, + "learning_rate": 9.276e-05, + "loss": 0.0054, + "step": 2320 + }, + { + "epoch": 1.346820809248555, + "grad_norm": 0.10675626248121262, + "learning_rate": 9.316000000000001e-05, + "loss": 0.0088, + "step": 2330 + }, + { + "epoch": 1.352601156069364, + "grad_norm": 0.08206217736005783, + "learning_rate": 9.356e-05, + "loss": 0.0079, + "step": 2340 + }, + { + "epoch": 1.3583815028901733, + "grad_norm": 0.12166175991296768, + "learning_rate": 9.396e-05, + "loss": 0.0074, + "step": 2350 + }, + { + "epoch": 1.3641618497109826, + "grad_norm": 0.10013966262340546, + "learning_rate": 9.436e-05, + "loss": 0.0087, + "step": 2360 + }, + { + "epoch": 1.369942196531792, + "grad_norm": 0.1003638356924057, + "learning_rate": 9.476000000000001e-05, + "loss": 0.0071, + "step": 2370 + }, + { + "epoch": 1.3757225433526012, + "grad_norm": 0.10239727795124054, + "learning_rate": 9.516e-05, + "loss": 0.0103, + "step": 2380 + }, + { + "epoch": 1.3815028901734103, + "grad_norm": 0.1256374716758728, + "learning_rate": 9.556e-05, + "loss": 0.0088, + "step": 2390 + }, + { + "epoch": 1.3872832369942196, + "grad_norm": 0.12118260562419891, + "learning_rate": 9.596000000000001e-05, + "loss": 0.0067, + "step": 2400 + }, + { + "epoch": 1.393063583815029, + "grad_norm": 0.10683480650186539, + "learning_rate": 9.636e-05, + "loss": 0.0067, + "step": 2410 + }, + { + "epoch": 1.3988439306358382, + "grad_norm": 0.0618288479745388, + "learning_rate": 9.676e-05, + "loss": 0.0062, + "step": 2420 + }, + { + "epoch": 1.4046242774566475, + "grad_norm": 0.13114090263843536, + "learning_rate": 9.716000000000001e-05, + "loss": 0.0061, + "step": 2430 + }, + { + "epoch": 1.4104046242774566, + "grad_norm": 0.10822831094264984, + "learning_rate": 9.756000000000001e-05, + "loss": 0.0055, + "step": 2440 + }, + { + "epoch": 1.416184971098266, + "grad_norm": 0.11746819317340851, + "learning_rate": 9.796e-05, + "loss": 0.0059, + "step": 2450 + }, + { + "epoch": 1.4219653179190752, + "grad_norm": 0.0757322609424591, + "learning_rate": 9.836000000000001e-05, + "loss": 0.0056, + "step": 2460 + }, + { + "epoch": 1.4277456647398843, + "grad_norm": 0.08555682748556137, + "learning_rate": 9.876000000000001e-05, + "loss": 0.006, + "step": 2470 + }, + { + "epoch": 1.4335260115606936, + "grad_norm": 0.1246783435344696, + "learning_rate": 9.916e-05, + "loss": 0.0071, + "step": 2480 + }, + { + "epoch": 1.439306358381503, + "grad_norm": 0.11538666486740112, + "learning_rate": 9.956e-05, + "loss": 0.0067, + "step": 2490 + }, + { + "epoch": 1.4450867052023122, + "grad_norm": 0.1484181433916092, + "learning_rate": 9.996000000000001e-05, + "loss": 0.0077, + "step": 2500 + }, + { + "epoch": 1.4508670520231215, + "grad_norm": 0.11887402832508087, + "learning_rate": 9.999999114196196e-05, + "loss": 0.0054, + "step": 2510 + }, + { + "epoch": 1.4566473988439306, + "grad_norm": 0.12257851660251617, + "learning_rate": 9.99999605215876e-05, + "loss": 0.0071, + "step": 2520 + }, + { + "epoch": 1.4624277456647399, + "grad_norm": 0.14709283411502838, + "learning_rate": 9.999990802953179e-05, + "loss": 0.0086, + "step": 2530 + }, + { + "epoch": 1.4682080924855492, + "grad_norm": 0.13029494881629944, + "learning_rate": 9.99998336658175e-05, + "loss": 0.0089, + "step": 2540 + }, + { + "epoch": 1.4739884393063583, + "grad_norm": 0.1392313539981842, + "learning_rate": 9.999973743047727e-05, + "loss": 0.0066, + "step": 2550 + }, + { + "epoch": 1.4797687861271676, + "grad_norm": 0.1205642893910408, + "learning_rate": 9.999961932355319e-05, + "loss": 0.0071, + "step": 2560 + }, + { + "epoch": 1.4855491329479769, + "grad_norm": 0.13903295993804932, + "learning_rate": 9.999947934509693e-05, + "loss": 0.0074, + "step": 2570 + }, + { + "epoch": 1.4913294797687862, + "grad_norm": 0.18161019682884216, + "learning_rate": 9.999931749516971e-05, + "loss": 0.0082, + "step": 2580 + }, + { + "epoch": 1.4971098265895955, + "grad_norm": 0.14651469886302948, + "learning_rate": 9.999913377384233e-05, + "loss": 0.0073, + "step": 2590 + }, + { + "epoch": 1.5028901734104045, + "grad_norm": 0.15712544322013855, + "learning_rate": 9.999892818119517e-05, + "loss": 0.0071, + "step": 2600 + }, + { + "epoch": 1.5086705202312138, + "grad_norm": 0.11392804235219955, + "learning_rate": 9.999870071731814e-05, + "loss": 0.0059, + "step": 2610 + }, + { + "epoch": 1.5144508670520231, + "grad_norm": 0.11064669489860535, + "learning_rate": 9.999845138231076e-05, + "loss": 0.0062, + "step": 2620 + }, + { + "epoch": 1.5202312138728322, + "grad_norm": 0.11116683483123779, + "learning_rate": 9.999818017628208e-05, + "loss": 0.0068, + "step": 2630 + }, + { + "epoch": 1.5260115606936417, + "grad_norm": 0.09671295434236526, + "learning_rate": 9.999788709935078e-05, + "loss": 0.01, + "step": 2640 + }, + { + "epoch": 1.5317919075144508, + "grad_norm": 0.11243397742509842, + "learning_rate": 9.9997572151645e-05, + "loss": 0.0065, + "step": 2650 + }, + { + "epoch": 1.5375722543352601, + "grad_norm": 0.1160590872168541, + "learning_rate": 9.999723533330254e-05, + "loss": 0.0062, + "step": 2660 + }, + { + "epoch": 1.5433526011560694, + "grad_norm": 0.08542856574058533, + "learning_rate": 9.999687664447074e-05, + "loss": 0.0053, + "step": 2670 + }, + { + "epoch": 1.5491329479768785, + "grad_norm": 0.08599895238876343, + "learning_rate": 9.99964960853065e-05, + "loss": 0.0054, + "step": 2680 + }, + { + "epoch": 1.5549132947976878, + "grad_norm": 0.13533271849155426, + "learning_rate": 9.999609365597627e-05, + "loss": 0.0058, + "step": 2690 + }, + { + "epoch": 1.560693641618497, + "grad_norm": 0.09653540700674057, + "learning_rate": 9.99956693566561e-05, + "loss": 0.0092, + "step": 2700 + }, + { + "epoch": 1.5664739884393064, + "grad_norm": 0.1159488782286644, + "learning_rate": 9.99952231875316e-05, + "loss": 0.0106, + "step": 2710 + }, + { + "epoch": 1.5722543352601157, + "grad_norm": 0.12663349509239197, + "learning_rate": 9.999475514879795e-05, + "loss": 0.0071, + "step": 2720 + }, + { + "epoch": 1.5780346820809248, + "grad_norm": 0.11458619683980942, + "learning_rate": 9.999426524065984e-05, + "loss": 0.0064, + "step": 2730 + }, + { + "epoch": 1.583815028901734, + "grad_norm": 0.13307511806488037, + "learning_rate": 9.999375346333162e-05, + "loss": 0.0061, + "step": 2740 + }, + { + "epoch": 1.5895953757225434, + "grad_norm": 0.11335356533527374, + "learning_rate": 9.999321981703715e-05, + "loss": 0.0059, + "step": 2750 + }, + { + "epoch": 1.5953757225433525, + "grad_norm": 0.11866944283246994, + "learning_rate": 9.999266430200985e-05, + "loss": 0.0073, + "step": 2760 + }, + { + "epoch": 1.601156069364162, + "grad_norm": 0.10777215659618378, + "learning_rate": 9.999208691849271e-05, + "loss": 0.0055, + "step": 2770 + }, + { + "epoch": 1.606936416184971, + "grad_norm": 0.10360101610422134, + "learning_rate": 9.999148766673832e-05, + "loss": 0.0049, + "step": 2780 + }, + { + "epoch": 1.6127167630057804, + "grad_norm": 0.0994260162115097, + "learning_rate": 9.999086654700881e-05, + "loss": 0.0056, + "step": 2790 + }, + { + "epoch": 1.6184971098265897, + "grad_norm": 0.09570357948541641, + "learning_rate": 9.999022355957588e-05, + "loss": 0.0056, + "step": 2800 + }, + { + "epoch": 1.6242774566473988, + "grad_norm": 0.07606939971446991, + "learning_rate": 9.998955870472079e-05, + "loss": 0.0053, + "step": 2810 + }, + { + "epoch": 1.630057803468208, + "grad_norm": 0.08179699629545212, + "learning_rate": 9.998887198273437e-05, + "loss": 0.0054, + "step": 2820 + }, + { + "epoch": 1.6358381502890174, + "grad_norm": 0.11064790934324265, + "learning_rate": 9.998816339391701e-05, + "loss": 0.0052, + "step": 2830 + }, + { + "epoch": 1.6416184971098264, + "grad_norm": 0.06641016155481339, + "learning_rate": 9.998743293857868e-05, + "loss": 0.005, + "step": 2840 + }, + { + "epoch": 1.647398843930636, + "grad_norm": 0.06775292754173279, + "learning_rate": 9.998668061703891e-05, + "loss": 0.0056, + "step": 2850 + }, + { + "epoch": 1.653179190751445, + "grad_norm": 0.17745137214660645, + "learning_rate": 9.998590642962679e-05, + "loss": 0.0087, + "step": 2860 + }, + { + "epoch": 1.6589595375722543, + "grad_norm": 0.14147275686264038, + "learning_rate": 9.998511037668095e-05, + "loss": 0.0052, + "step": 2870 + }, + { + "epoch": 1.6647398843930636, + "grad_norm": 0.1302187293767929, + "learning_rate": 9.998429245854964e-05, + "loss": 0.0058, + "step": 2880 + }, + { + "epoch": 1.6705202312138727, + "grad_norm": 0.08635162562131882, + "learning_rate": 9.998345267559064e-05, + "loss": 0.0062, + "step": 2890 + }, + { + "epoch": 1.6763005780346822, + "grad_norm": 0.07505862414836884, + "learning_rate": 9.998259102817129e-05, + "loss": 0.0048, + "step": 2900 + }, + { + "epoch": 1.6820809248554913, + "grad_norm": 0.0735427662730217, + "learning_rate": 9.99817075166685e-05, + "loss": 0.0067, + "step": 2910 + }, + { + "epoch": 1.6878612716763006, + "grad_norm": 0.06911532580852509, + "learning_rate": 9.998080214146878e-05, + "loss": 0.0052, + "step": 2920 + }, + { + "epoch": 1.69364161849711, + "grad_norm": 0.10301624238491058, + "learning_rate": 9.997987490296813e-05, + "loss": 0.0073, + "step": 2930 + }, + { + "epoch": 1.699421965317919, + "grad_norm": 0.1210310235619545, + "learning_rate": 9.99789258015722e-05, + "loss": 0.0046, + "step": 2940 + }, + { + "epoch": 1.7052023121387283, + "grad_norm": 0.14030395448207855, + "learning_rate": 9.997795483769611e-05, + "loss": 0.0052, + "step": 2950 + }, + { + "epoch": 1.7109826589595376, + "grad_norm": 0.1258503496646881, + "learning_rate": 9.997696201176462e-05, + "loss": 0.0056, + "step": 2960 + }, + { + "epoch": 1.7167630057803467, + "grad_norm": 0.13269788026809692, + "learning_rate": 9.997594732421203e-05, + "loss": 0.0059, + "step": 2970 + }, + { + "epoch": 1.7225433526011562, + "grad_norm": 0.15080974996089935, + "learning_rate": 9.997491077548217e-05, + "loss": 0.0067, + "step": 2980 + }, + { + "epoch": 1.7283236994219653, + "grad_norm": 0.13522954285144806, + "learning_rate": 9.997385236602851e-05, + "loss": 0.0047, + "step": 2990 + }, + { + "epoch": 1.7341040462427746, + "grad_norm": 0.10210167616605759, + "learning_rate": 9.997277209631399e-05, + "loss": 0.0084, + "step": 3000 + }, + { + "epoch": 1.739884393063584, + "grad_norm": 0.16219407320022583, + "learning_rate": 9.997166996681118e-05, + "loss": 0.0067, + "step": 3010 + }, + { + "epoch": 1.745664739884393, + "grad_norm": 0.09276897460222244, + "learning_rate": 9.997054597800218e-05, + "loss": 0.0077, + "step": 3020 + }, + { + "epoch": 1.7514450867052023, + "grad_norm": 0.1794775277376175, + "learning_rate": 9.996940013037866e-05, + "loss": 0.007, + "step": 3030 + }, + { + "epoch": 1.7572254335260116, + "grad_norm": 0.13645876944065094, + "learning_rate": 9.996823242444186e-05, + "loss": 0.0074, + "step": 3040 + }, + { + "epoch": 1.7630057803468207, + "grad_norm": 0.17062893509864807, + "learning_rate": 9.996704286070258e-05, + "loss": 0.0069, + "step": 3050 + }, + { + "epoch": 1.7687861271676302, + "grad_norm": 0.11452256888151169, + "learning_rate": 9.996583143968115e-05, + "loss": 0.0048, + "step": 3060 + }, + { + "epoch": 1.7745664739884393, + "grad_norm": 0.06533671915531158, + "learning_rate": 9.99645981619075e-05, + "loss": 0.0061, + "step": 3070 + }, + { + "epoch": 1.7803468208092486, + "grad_norm": 0.08524267375469208, + "learning_rate": 9.996334302792114e-05, + "loss": 0.0083, + "step": 3080 + }, + { + "epoch": 1.7861271676300579, + "grad_norm": 0.09816299378871918, + "learning_rate": 9.996206603827105e-05, + "loss": 0.0056, + "step": 3090 + }, + { + "epoch": 1.791907514450867, + "grad_norm": 0.18339581787586212, + "learning_rate": 9.996076719351587e-05, + "loss": 0.01, + "step": 3100 + }, + { + "epoch": 1.7976878612716765, + "grad_norm": 0.12458177655935287, + "learning_rate": 9.995944649422374e-05, + "loss": 0.0088, + "step": 3110 + }, + { + "epoch": 1.8034682080924855, + "grad_norm": 0.12647530436515808, + "learning_rate": 9.995810394097239e-05, + "loss": 0.006, + "step": 3120 + }, + { + "epoch": 1.8092485549132948, + "grad_norm": 0.12574432790279388, + "learning_rate": 9.995673953434909e-05, + "loss": 0.0081, + "step": 3130 + }, + { + "epoch": 1.8150289017341041, + "grad_norm": 0.08346758782863617, + "learning_rate": 9.995535327495068e-05, + "loss": 0.0075, + "step": 3140 + }, + { + "epoch": 1.8208092485549132, + "grad_norm": 0.05680999904870987, + "learning_rate": 9.995394516338355e-05, + "loss": 0.0046, + "step": 3150 + }, + { + "epoch": 1.8265895953757225, + "grad_norm": 0.05832390487194061, + "learning_rate": 9.995251520026367e-05, + "loss": 0.0064, + "step": 3160 + }, + { + "epoch": 1.8323699421965318, + "grad_norm": 0.08656340837478638, + "learning_rate": 9.995106338621656e-05, + "loss": 0.0047, + "step": 3170 + }, + { + "epoch": 1.838150289017341, + "grad_norm": 0.10437380522489548, + "learning_rate": 9.994958972187726e-05, + "loss": 0.0065, + "step": 3180 + }, + { + "epoch": 1.8439306358381504, + "grad_norm": 0.1443714201450348, + "learning_rate": 9.994809420789044e-05, + "loss": 0.0071, + "step": 3190 + }, + { + "epoch": 1.8497109826589595, + "grad_norm": 0.16334494948387146, + "learning_rate": 9.994657684491027e-05, + "loss": 0.0055, + "step": 3200 + }, + { + "epoch": 1.8554913294797688, + "grad_norm": 0.06717883050441742, + "learning_rate": 9.994503763360048e-05, + "loss": 0.0052, + "step": 3210 + }, + { + "epoch": 1.861271676300578, + "grad_norm": 0.13523557782173157, + "learning_rate": 9.99434765746344e-05, + "loss": 0.0053, + "step": 3220 + }, + { + "epoch": 1.8670520231213872, + "grad_norm": 0.0893624797463417, + "learning_rate": 9.994189366869488e-05, + "loss": 0.0054, + "step": 3230 + }, + { + "epoch": 1.8728323699421965, + "grad_norm": 0.11412649601697922, + "learning_rate": 9.994028891647433e-05, + "loss": 0.0047, + "step": 3240 + }, + { + "epoch": 1.8786127167630058, + "grad_norm": 0.12160219252109528, + "learning_rate": 9.993866231867475e-05, + "loss": 0.0059, + "step": 3250 + }, + { + "epoch": 1.8843930635838149, + "grad_norm": 0.09526138007640839, + "learning_rate": 9.993701387600762e-05, + "loss": 0.0052, + "step": 3260 + }, + { + "epoch": 1.8901734104046244, + "grad_norm": 0.09952362626791, + "learning_rate": 9.993534358919408e-05, + "loss": 0.005, + "step": 3270 + }, + { + "epoch": 1.8959537572254335, + "grad_norm": 0.09673795849084854, + "learning_rate": 9.993365145896473e-05, + "loss": 0.0065, + "step": 3280 + }, + { + "epoch": 1.9017341040462428, + "grad_norm": 0.10317232459783554, + "learning_rate": 9.993193748605977e-05, + "loss": 0.0052, + "step": 3290 + }, + { + "epoch": 1.907514450867052, + "grad_norm": 0.12684407830238342, + "learning_rate": 9.993020167122898e-05, + "loss": 0.0051, + "step": 3300 + }, + { + "epoch": 1.9132947976878611, + "grad_norm": 0.10689617693424225, + "learning_rate": 9.992844401523164e-05, + "loss": 0.0055, + "step": 3310 + }, + { + "epoch": 1.9190751445086707, + "grad_norm": 0.11384792625904083, + "learning_rate": 9.992666451883661e-05, + "loss": 0.0043, + "step": 3320 + }, + { + "epoch": 1.9248554913294798, + "grad_norm": 0.0782865509390831, + "learning_rate": 9.99248631828223e-05, + "loss": 0.0063, + "step": 3330 + }, + { + "epoch": 1.930635838150289, + "grad_norm": 0.11839958280324936, + "learning_rate": 9.99230400079767e-05, + "loss": 0.0064, + "step": 3340 + }, + { + "epoch": 1.9364161849710984, + "grad_norm": 0.07051268219947815, + "learning_rate": 9.992119499509728e-05, + "loss": 0.0057, + "step": 3350 + }, + { + "epoch": 1.9421965317919074, + "grad_norm": 0.09824991226196289, + "learning_rate": 9.991932814499114e-05, + "loss": 0.0055, + "step": 3360 + }, + { + "epoch": 1.9479768786127167, + "grad_norm": 0.09343143552541733, + "learning_rate": 9.991743945847493e-05, + "loss": 0.0046, + "step": 3370 + }, + { + "epoch": 1.953757225433526, + "grad_norm": 0.09618880599737167, + "learning_rate": 9.991552893637478e-05, + "loss": 0.005, + "step": 3380 + }, + { + "epoch": 1.9595375722543351, + "grad_norm": 0.14422376453876495, + "learning_rate": 9.991359657952644e-05, + "loss": 0.0058, + "step": 3390 + }, + { + "epoch": 1.9653179190751446, + "grad_norm": 0.11654495447874069, + "learning_rate": 9.991164238877519e-05, + "loss": 0.0072, + "step": 3400 + }, + { + "epoch": 1.9710982658959537, + "grad_norm": 0.16126972436904907, + "learning_rate": 9.990966636497585e-05, + "loss": 0.0056, + "step": 3410 + }, + { + "epoch": 1.976878612716763, + "grad_norm": 0.07894255220890045, + "learning_rate": 9.99076685089928e-05, + "loss": 0.0049, + "step": 3420 + }, + { + "epoch": 1.9826589595375723, + "grad_norm": 0.05879867449402809, + "learning_rate": 9.990564882169998e-05, + "loss": 0.0067, + "step": 3430 + }, + { + "epoch": 1.9884393063583814, + "grad_norm": 0.0764334499835968, + "learning_rate": 9.990360730398088e-05, + "loss": 0.0052, + "step": 3440 + }, + { + "epoch": 1.9942196531791907, + "grad_norm": 0.15867310762405396, + "learning_rate": 9.990154395672849e-05, + "loss": 0.0058, + "step": 3450 + }, + { + "epoch": 2.0, + "grad_norm": 0.11241374164819717, + "learning_rate": 9.989945878084541e-05, + "loss": 0.0052, + "step": 3460 + }, + { + "epoch": 2.005780346820809, + "grad_norm": 0.129130557179451, + "learning_rate": 9.989735177724378e-05, + "loss": 0.0058, + "step": 3470 + }, + { + "epoch": 2.0115606936416186, + "grad_norm": 0.14215660095214844, + "learning_rate": 9.989522294684526e-05, + "loss": 0.0053, + "step": 3480 + }, + { + "epoch": 2.0173410404624277, + "grad_norm": 0.1450652778148651, + "learning_rate": 9.989307229058107e-05, + "loss": 0.0058, + "step": 3490 + }, + { + "epoch": 2.023121387283237, + "grad_norm": 0.08095835149288177, + "learning_rate": 9.989089980939202e-05, + "loss": 0.0042, + "step": 3500 + }, + { + "epoch": 2.0289017341040463, + "grad_norm": 0.1411646455526352, + "learning_rate": 9.988870550422835e-05, + "loss": 0.0056, + "step": 3510 + }, + { + "epoch": 2.0346820809248554, + "grad_norm": 0.10258983075618744, + "learning_rate": 9.988648937604999e-05, + "loss": 0.0042, + "step": 3520 + }, + { + "epoch": 2.040462427745665, + "grad_norm": 0.0893290787935257, + "learning_rate": 9.988425142582632e-05, + "loss": 0.0049, + "step": 3530 + }, + { + "epoch": 2.046242774566474, + "grad_norm": 0.09758899360895157, + "learning_rate": 9.98819916545363e-05, + "loss": 0.0106, + "step": 3540 + }, + { + "epoch": 2.052023121387283, + "grad_norm": 0.11281487345695496, + "learning_rate": 9.987971006316844e-05, + "loss": 0.0061, + "step": 3550 + }, + { + "epoch": 2.0578034682080926, + "grad_norm": 0.06897272169589996, + "learning_rate": 9.987740665272077e-05, + "loss": 0.006, + "step": 3560 + }, + { + "epoch": 2.0635838150289016, + "grad_norm": 0.09770093113183975, + "learning_rate": 9.98750814242009e-05, + "loss": 0.0091, + "step": 3570 + }, + { + "epoch": 2.069364161849711, + "grad_norm": 0.10911057144403458, + "learning_rate": 9.987273437862594e-05, + "loss": 0.0045, + "step": 3580 + }, + { + "epoch": 2.0751445086705202, + "grad_norm": 0.14400342106819153, + "learning_rate": 9.987036551702259e-05, + "loss": 0.0072, + "step": 3590 + }, + { + "epoch": 2.0809248554913293, + "grad_norm": 0.08694253116846085, + "learning_rate": 9.986797484042706e-05, + "loss": 0.0062, + "step": 3600 + }, + { + "epoch": 2.086705202312139, + "grad_norm": 0.1378755420446396, + "learning_rate": 9.986556234988512e-05, + "loss": 0.0057, + "step": 3610 + }, + { + "epoch": 2.092485549132948, + "grad_norm": 0.12104221433401108, + "learning_rate": 9.986312804645205e-05, + "loss": 0.0063, + "step": 3620 + }, + { + "epoch": 2.098265895953757, + "grad_norm": 0.16360381245613098, + "learning_rate": 9.986067193119273e-05, + "loss": 0.0068, + "step": 3630 + }, + { + "epoch": 2.1040462427745665, + "grad_norm": 0.1390710473060608, + "learning_rate": 9.985819400518153e-05, + "loss": 0.0061, + "step": 3640 + }, + { + "epoch": 2.1098265895953756, + "grad_norm": 0.1273057609796524, + "learning_rate": 9.985569426950239e-05, + "loss": 0.007, + "step": 3650 + }, + { + "epoch": 2.115606936416185, + "grad_norm": 0.09377237409353256, + "learning_rate": 9.985317272524876e-05, + "loss": 0.0042, + "step": 3660 + }, + { + "epoch": 2.121387283236994, + "grad_norm": 0.09106730669736862, + "learning_rate": 9.985062937352366e-05, + "loss": 0.0051, + "step": 3670 + }, + { + "epoch": 2.1271676300578033, + "grad_norm": 0.10553478449583054, + "learning_rate": 9.984806421543966e-05, + "loss": 0.0047, + "step": 3680 + }, + { + "epoch": 2.132947976878613, + "grad_norm": 0.07460903376340866, + "learning_rate": 9.984547725211881e-05, + "loss": 0.005, + "step": 3690 + }, + { + "epoch": 2.138728323699422, + "grad_norm": 0.1071394607424736, + "learning_rate": 9.984286848469276e-05, + "loss": 0.0055, + "step": 3700 + }, + { + "epoch": 2.1445086705202314, + "grad_norm": 0.12161537259817123, + "learning_rate": 9.984023791430266e-05, + "loss": 0.0054, + "step": 3710 + }, + { + "epoch": 2.1502890173410405, + "grad_norm": 0.08869685977697372, + "learning_rate": 9.983758554209924e-05, + "loss": 0.004, + "step": 3720 + }, + { + "epoch": 2.1560693641618496, + "grad_norm": 0.1138811782002449, + "learning_rate": 9.983491136924268e-05, + "loss": 0.0046, + "step": 3730 + }, + { + "epoch": 2.161849710982659, + "grad_norm": 0.14750473201274872, + "learning_rate": 9.983221539690282e-05, + "loss": 0.0054, + "step": 3740 + }, + { + "epoch": 2.167630057803468, + "grad_norm": 0.11414989829063416, + "learning_rate": 9.982949762625892e-05, + "loss": 0.0053, + "step": 3750 + }, + { + "epoch": 2.1734104046242773, + "grad_norm": 0.12458052486181259, + "learning_rate": 9.982675805849986e-05, + "loss": 0.0051, + "step": 3760 + }, + { + "epoch": 2.179190751445087, + "grad_norm": 0.10004246979951859, + "learning_rate": 9.982399669482399e-05, + "loss": 0.0047, + "step": 3770 + }, + { + "epoch": 2.184971098265896, + "grad_norm": 0.08482618629932404, + "learning_rate": 9.982121353643924e-05, + "loss": 0.0042, + "step": 3780 + }, + { + "epoch": 2.1907514450867054, + "grad_norm": 0.08300045132637024, + "learning_rate": 9.981840858456306e-05, + "loss": 0.0063, + "step": 3790 + }, + { + "epoch": 2.1965317919075145, + "grad_norm": 0.10703772306442261, + "learning_rate": 9.981558184042243e-05, + "loss": 0.0058, + "step": 3800 + }, + { + "epoch": 2.2023121387283235, + "grad_norm": 0.12374887615442276, + "learning_rate": 9.981273330525387e-05, + "loss": 0.006, + "step": 3810 + }, + { + "epoch": 2.208092485549133, + "grad_norm": 0.14318065345287323, + "learning_rate": 9.980986298030341e-05, + "loss": 0.0062, + "step": 3820 + }, + { + "epoch": 2.213872832369942, + "grad_norm": 0.1355583518743515, + "learning_rate": 9.980697086682662e-05, + "loss": 0.0053, + "step": 3830 + }, + { + "epoch": 2.2196531791907512, + "grad_norm": 0.13883447647094727, + "learning_rate": 9.980405696608866e-05, + "loss": 0.0046, + "step": 3840 + }, + { + "epoch": 2.2254335260115607, + "grad_norm": 0.11007209122180939, + "learning_rate": 9.980112127936409e-05, + "loss": 0.0049, + "step": 3850 + }, + { + "epoch": 2.23121387283237, + "grad_norm": 0.13813987374305725, + "learning_rate": 9.979816380793717e-05, + "loss": 0.005, + "step": 3860 + }, + { + "epoch": 2.2369942196531793, + "grad_norm": 0.12208642065525055, + "learning_rate": 9.979518455310151e-05, + "loss": 0.0057, + "step": 3870 + }, + { + "epoch": 2.2427745664739884, + "grad_norm": 0.10291685163974762, + "learning_rate": 9.97921835161604e-05, + "loss": 0.0074, + "step": 3880 + }, + { + "epoch": 2.2485549132947975, + "grad_norm": 0.10677164793014526, + "learning_rate": 9.978916069842656e-05, + "loss": 0.0048, + "step": 3890 + }, + { + "epoch": 2.254335260115607, + "grad_norm": 0.09314499795436859, + "learning_rate": 9.97861161012223e-05, + "loss": 0.0053, + "step": 3900 + }, + { + "epoch": 2.260115606936416, + "grad_norm": 0.1184055432677269, + "learning_rate": 9.978304972587942e-05, + "loss": 0.0048, + "step": 3910 + }, + { + "epoch": 2.2658959537572256, + "grad_norm": 0.10291741788387299, + "learning_rate": 9.977996157373925e-05, + "loss": 0.0055, + "step": 3920 + }, + { + "epoch": 2.2716763005780347, + "grad_norm": 0.08237621188163757, + "learning_rate": 9.977685164615265e-05, + "loss": 0.0044, + "step": 3930 + }, + { + "epoch": 2.277456647398844, + "grad_norm": 0.1022019162774086, + "learning_rate": 9.977371994448002e-05, + "loss": 0.0055, + "step": 3940 + }, + { + "epoch": 2.2832369942196533, + "grad_norm": 0.09344568848609924, + "learning_rate": 9.977056647009127e-05, + "loss": 0.0057, + "step": 3950 + }, + { + "epoch": 2.2890173410404624, + "grad_norm": 0.0971241444349289, + "learning_rate": 9.976739122436582e-05, + "loss": 0.0065, + "step": 3960 + }, + { + "epoch": 2.294797687861272, + "grad_norm": 0.057168059051036835, + "learning_rate": 9.976419420869265e-05, + "loss": 0.0046, + "step": 3970 + }, + { + "epoch": 2.300578034682081, + "grad_norm": 0.06407888233661652, + "learning_rate": 9.976097542447025e-05, + "loss": 0.0044, + "step": 3980 + }, + { + "epoch": 2.30635838150289, + "grad_norm": 0.0672544315457344, + "learning_rate": 9.97577348731066e-05, + "loss": 0.0038, + "step": 3990 + }, + { + "epoch": 2.3121387283236996, + "grad_norm": 0.07293885201215744, + "learning_rate": 9.975447255601927e-05, + "loss": 0.0058, + "step": 4000 + }, + { + "epoch": 2.3179190751445087, + "grad_norm": 0.05307481437921524, + "learning_rate": 9.975118847463525e-05, + "loss": 0.0045, + "step": 4010 + }, + { + "epoch": 2.3236994219653178, + "grad_norm": 0.1027441918849945, + "learning_rate": 9.974788263039114e-05, + "loss": 0.0055, + "step": 4020 + }, + { + "epoch": 2.3294797687861273, + "grad_norm": 0.14790751039981842, + "learning_rate": 9.974455502473303e-05, + "loss": 0.0059, + "step": 4030 + }, + { + "epoch": 2.3352601156069364, + "grad_norm": 0.14081086218357086, + "learning_rate": 9.974120565911652e-05, + "loss": 0.005, + "step": 4040 + }, + { + "epoch": 2.3410404624277454, + "grad_norm": 0.12889717519283295, + "learning_rate": 9.973783453500674e-05, + "loss": 0.0046, + "step": 4050 + }, + { + "epoch": 2.346820809248555, + "grad_norm": 0.11189638823270798, + "learning_rate": 9.973444165387835e-05, + "loss": 0.0054, + "step": 4060 + }, + { + "epoch": 2.352601156069364, + "grad_norm": 0.11090421676635742, + "learning_rate": 9.973102701721549e-05, + "loss": 0.007, + "step": 4070 + }, + { + "epoch": 2.3583815028901736, + "grad_norm": 0.13979388773441315, + "learning_rate": 9.972759062651184e-05, + "loss": 0.0048, + "step": 4080 + }, + { + "epoch": 2.3641618497109826, + "grad_norm": 0.15473957359790802, + "learning_rate": 9.972413248327059e-05, + "loss": 0.0073, + "step": 4090 + }, + { + "epoch": 2.3699421965317917, + "grad_norm": 0.1623980700969696, + "learning_rate": 9.972065258900447e-05, + "loss": 0.0051, + "step": 4100 + }, + { + "epoch": 2.3757225433526012, + "grad_norm": 0.09650347381830215, + "learning_rate": 9.971715094523569e-05, + "loss": 0.0055, + "step": 4110 + }, + { + "epoch": 2.3815028901734103, + "grad_norm": 0.10511574894189835, + "learning_rate": 9.971362755349598e-05, + "loss": 0.0055, + "step": 4120 + }, + { + "epoch": 2.38728323699422, + "grad_norm": 0.11281032860279083, + "learning_rate": 9.971008241532662e-05, + "loss": 0.0046, + "step": 4130 + }, + { + "epoch": 2.393063583815029, + "grad_norm": 0.10358737409114838, + "learning_rate": 9.970651553227835e-05, + "loss": 0.0053, + "step": 4140 + }, + { + "epoch": 2.398843930635838, + "grad_norm": 0.09289064258337021, + "learning_rate": 9.970292690591143e-05, + "loss": 0.0064, + "step": 4150 + }, + { + "epoch": 2.4046242774566475, + "grad_norm": 0.1077444851398468, + "learning_rate": 9.969931653779569e-05, + "loss": 0.0053, + "step": 4160 + }, + { + "epoch": 2.4104046242774566, + "grad_norm": 0.11067981272935867, + "learning_rate": 9.969568442951038e-05, + "loss": 0.0059, + "step": 4170 + }, + { + "epoch": 2.416184971098266, + "grad_norm": 0.09449609369039536, + "learning_rate": 9.969203058264436e-05, + "loss": 0.0054, + "step": 4180 + }, + { + "epoch": 2.421965317919075, + "grad_norm": 0.12403042614459991, + "learning_rate": 9.96883549987959e-05, + "loss": 0.0046, + "step": 4190 + }, + { + "epoch": 2.4277456647398843, + "grad_norm": 0.06456232070922852, + "learning_rate": 9.968465767957287e-05, + "loss": 0.0051, + "step": 4200 + }, + { + "epoch": 2.433526011560694, + "grad_norm": 0.11479715257883072, + "learning_rate": 9.968093862659256e-05, + "loss": 0.0038, + "step": 4210 + }, + { + "epoch": 2.439306358381503, + "grad_norm": 0.16075153648853302, + "learning_rate": 9.967719784148182e-05, + "loss": 0.0066, + "step": 4220 + }, + { + "epoch": 2.445086705202312, + "grad_norm": 0.142628014087677, + "learning_rate": 9.967343532587702e-05, + "loss": 0.0064, + "step": 4230 + }, + { + "epoch": 2.4508670520231215, + "grad_norm": 0.19917619228363037, + "learning_rate": 9.966965108142399e-05, + "loss": 0.0053, + "step": 4240 + }, + { + "epoch": 2.4566473988439306, + "grad_norm": 0.10416891425848007, + "learning_rate": 9.96658451097781e-05, + "loss": 0.0046, + "step": 4250 + }, + { + "epoch": 2.4624277456647397, + "grad_norm": 0.07751508057117462, + "learning_rate": 9.966201741260419e-05, + "loss": 0.0071, + "step": 4260 + }, + { + "epoch": 2.468208092485549, + "grad_norm": 0.1693708747625351, + "learning_rate": 9.965816799157665e-05, + "loss": 0.0057, + "step": 4270 + }, + { + "epoch": 2.4739884393063583, + "grad_norm": 0.12062357366085052, + "learning_rate": 9.965429684837935e-05, + "loss": 0.0054, + "step": 4280 + }, + { + "epoch": 2.479768786127168, + "grad_norm": 0.11819230765104294, + "learning_rate": 9.965040398470562e-05, + "loss": 0.0047, + "step": 4290 + }, + { + "epoch": 2.485549132947977, + "grad_norm": 0.07853759080171585, + "learning_rate": 9.964648940225838e-05, + "loss": 0.005, + "step": 4300 + }, + { + "epoch": 2.491329479768786, + "grad_norm": 0.09461795538663864, + "learning_rate": 9.964255310274997e-05, + "loss": 0.004, + "step": 4310 + }, + { + "epoch": 2.4971098265895955, + "grad_norm": 0.07458837330341339, + "learning_rate": 9.963859508790228e-05, + "loss": 0.0037, + "step": 4320 + }, + { + "epoch": 2.5028901734104045, + "grad_norm": 0.07160282880067825, + "learning_rate": 9.963461535944664e-05, + "loss": 0.0036, + "step": 4330 + }, + { + "epoch": 2.508670520231214, + "grad_norm": 0.08057725429534912, + "learning_rate": 9.963061391912399e-05, + "loss": 0.007, + "step": 4340 + }, + { + "epoch": 2.514450867052023, + "grad_norm": 0.12584759294986725, + "learning_rate": 9.962659076868463e-05, + "loss": 0.0046, + "step": 4350 + }, + { + "epoch": 2.520231213872832, + "grad_norm": 0.13452067971229553, + "learning_rate": 9.962254590988846e-05, + "loss": 0.0055, + "step": 4360 + }, + { + "epoch": 2.5260115606936417, + "grad_norm": 0.09652341157197952, + "learning_rate": 9.961847934450481e-05, + "loss": 0.0046, + "step": 4370 + }, + { + "epoch": 2.531791907514451, + "grad_norm": 0.13719667494297028, + "learning_rate": 9.961439107431257e-05, + "loss": 0.0052, + "step": 4380 + }, + { + "epoch": 2.5375722543352603, + "grad_norm": 0.12765006721019745, + "learning_rate": 9.961028110110006e-05, + "loss": 0.0044, + "step": 4390 + }, + { + "epoch": 2.5433526011560694, + "grad_norm": 0.10085717588663101, + "learning_rate": 9.960614942666513e-05, + "loss": 0.005, + "step": 4400 + }, + { + "epoch": 2.5491329479768785, + "grad_norm": 0.04975849762558937, + "learning_rate": 9.960199605281511e-05, + "loss": 0.0038, + "step": 4410 + }, + { + "epoch": 2.5549132947976876, + "grad_norm": 0.08230627328157425, + "learning_rate": 9.959782098136683e-05, + "loss": 0.0054, + "step": 4420 + }, + { + "epoch": 2.560693641618497, + "grad_norm": 0.09695667773485184, + "learning_rate": 9.959362421414662e-05, + "loss": 0.0042, + "step": 4430 + }, + { + "epoch": 2.5664739884393066, + "grad_norm": 0.11576048284769058, + "learning_rate": 9.958940575299027e-05, + "loss": 0.005, + "step": 4440 + }, + { + "epoch": 2.5722543352601157, + "grad_norm": 0.07925650477409363, + "learning_rate": 9.95851655997431e-05, + "loss": 0.0044, + "step": 4450 + }, + { + "epoch": 2.578034682080925, + "grad_norm": 0.08335477858781815, + "learning_rate": 9.958090375625986e-05, + "loss": 0.0058, + "step": 4460 + }, + { + "epoch": 2.583815028901734, + "grad_norm": 0.09603440761566162, + "learning_rate": 9.957662022440486e-05, + "loss": 0.0054, + "step": 4470 + }, + { + "epoch": 2.5895953757225434, + "grad_norm": 0.10469746589660645, + "learning_rate": 9.957231500605187e-05, + "loss": 0.0045, + "step": 4480 + }, + { + "epoch": 2.5953757225433525, + "grad_norm": 0.11201290041208267, + "learning_rate": 9.95679881030841e-05, + "loss": 0.0074, + "step": 4490 + }, + { + "epoch": 2.601156069364162, + "grad_norm": 0.08057098835706711, + "learning_rate": 9.95636395173943e-05, + "loss": 0.0053, + "step": 4500 + }, + { + "epoch": 2.606936416184971, + "grad_norm": 0.10965419560670853, + "learning_rate": 9.95592692508847e-05, + "loss": 0.0069, + "step": 4510 + }, + { + "epoch": 2.61271676300578, + "grad_norm": 0.08233583718538284, + "learning_rate": 9.9554877305467e-05, + "loss": 0.0057, + "step": 4520 + }, + { + "epoch": 2.6184971098265897, + "grad_norm": 0.07618734985589981, + "learning_rate": 9.955046368306237e-05, + "loss": 0.0053, + "step": 4530 + }, + { + "epoch": 2.6242774566473988, + "grad_norm": 0.08831852674484253, + "learning_rate": 9.954602838560153e-05, + "loss": 0.0067, + "step": 4540 + }, + { + "epoch": 2.6300578034682083, + "grad_norm": 0.11426381021738052, + "learning_rate": 9.954157141502456e-05, + "loss": 0.0049, + "step": 4550 + }, + { + "epoch": 2.6358381502890174, + "grad_norm": 0.08342878520488739, + "learning_rate": 9.953709277328112e-05, + "loss": 0.005, + "step": 4560 + }, + { + "epoch": 2.6416184971098264, + "grad_norm": 0.09040774405002594, + "learning_rate": 9.953259246233032e-05, + "loss": 0.0048, + "step": 4570 + }, + { + "epoch": 2.647398843930636, + "grad_norm": 0.08589833974838257, + "learning_rate": 9.952807048414077e-05, + "loss": 0.0039, + "step": 4580 + }, + { + "epoch": 2.653179190751445, + "grad_norm": 0.05775552615523338, + "learning_rate": 9.95235268406905e-05, + "loss": 0.0065, + "step": 4590 + }, + { + "epoch": 2.6589595375722546, + "grad_norm": 0.058433182537555695, + "learning_rate": 9.951896153396708e-05, + "loss": 0.0041, + "step": 4600 + }, + { + "epoch": 2.6647398843930636, + "grad_norm": 0.10195489972829819, + "learning_rate": 9.95143745659675e-05, + "loss": 0.0075, + "step": 4610 + }, + { + "epoch": 2.6705202312138727, + "grad_norm": 0.0639519989490509, + "learning_rate": 9.95097659386983e-05, + "loss": 0.0045, + "step": 4620 + }, + { + "epoch": 2.6763005780346822, + "grad_norm": 0.07609432190656662, + "learning_rate": 9.950513565417542e-05, + "loss": 0.0052, + "step": 4630 + }, + { + "epoch": 2.6820809248554913, + "grad_norm": 0.10391134768724442, + "learning_rate": 9.95004837144243e-05, + "loss": 0.0045, + "step": 4640 + }, + { + "epoch": 2.687861271676301, + "grad_norm": 0.11459513753652573, + "learning_rate": 9.949581012147988e-05, + "loss": 0.0052, + "step": 4650 + }, + { + "epoch": 2.69364161849711, + "grad_norm": 0.12960045039653778, + "learning_rate": 9.949111487738653e-05, + "loss": 0.0082, + "step": 4660 + }, + { + "epoch": 2.699421965317919, + "grad_norm": 0.19312746822834015, + "learning_rate": 9.948639798419813e-05, + "loss": 0.0053, + "step": 4670 + }, + { + "epoch": 2.705202312138728, + "grad_norm": 0.15751752257347107, + "learning_rate": 9.948165944397799e-05, + "loss": 0.0065, + "step": 4680 + }, + { + "epoch": 2.7109826589595376, + "grad_norm": 0.10588734596967697, + "learning_rate": 9.94768992587989e-05, + "loss": 0.0055, + "step": 4690 + }, + { + "epoch": 2.7167630057803467, + "grad_norm": 0.0877213329076767, + "learning_rate": 9.947211743074313e-05, + "loss": 0.0054, + "step": 4700 + }, + { + "epoch": 2.722543352601156, + "grad_norm": 0.11511239409446716, + "learning_rate": 9.946731396190246e-05, + "loss": 0.0056, + "step": 4710 + }, + { + "epoch": 2.7283236994219653, + "grad_norm": 0.10262319445610046, + "learning_rate": 9.946248885437803e-05, + "loss": 0.0062, + "step": 4720 + }, + { + "epoch": 2.7341040462427744, + "grad_norm": 0.07959464937448502, + "learning_rate": 9.945764211028053e-05, + "loss": 0.0051, + "step": 4730 + }, + { + "epoch": 2.739884393063584, + "grad_norm": 0.08185133337974548, + "learning_rate": 9.94527737317301e-05, + "loss": 0.0055, + "step": 4740 + }, + { + "epoch": 2.745664739884393, + "grad_norm": 0.09177955985069275, + "learning_rate": 9.944788372085631e-05, + "loss": 0.0049, + "step": 4750 + }, + { + "epoch": 2.7514450867052025, + "grad_norm": 0.08497302234172821, + "learning_rate": 9.944297207979825e-05, + "loss": 0.0048, + "step": 4760 + }, + { + "epoch": 2.7572254335260116, + "grad_norm": 0.07832589000463486, + "learning_rate": 9.943803881070441e-05, + "loss": 0.0044, + "step": 4770 + }, + { + "epoch": 2.7630057803468207, + "grad_norm": 0.08363822847604752, + "learning_rate": 9.943308391573278e-05, + "loss": 0.0079, + "step": 4780 + }, + { + "epoch": 2.76878612716763, + "grad_norm": 0.06930410116910934, + "learning_rate": 9.942810739705079e-05, + "loss": 0.0042, + "step": 4790 + }, + { + "epoch": 2.7745664739884393, + "grad_norm": 0.08685854077339172, + "learning_rate": 9.942310925683538e-05, + "loss": 0.0055, + "step": 4800 + }, + { + "epoch": 2.7803468208092488, + "grad_norm": 0.10434369742870331, + "learning_rate": 9.941808949727285e-05, + "loss": 0.0041, + "step": 4810 + }, + { + "epoch": 2.786127167630058, + "grad_norm": 0.07987218350172043, + "learning_rate": 9.941304812055903e-05, + "loss": 0.0041, + "step": 4820 + }, + { + "epoch": 2.791907514450867, + "grad_norm": 0.0995841845870018, + "learning_rate": 9.940798512889921e-05, + "loss": 0.0048, + "step": 4830 + }, + { + "epoch": 2.7976878612716765, + "grad_norm": 0.1205340102314949, + "learning_rate": 9.94029005245081e-05, + "loss": 0.0054, + "step": 4840 + }, + { + "epoch": 2.8034682080924855, + "grad_norm": 0.1322150081396103, + "learning_rate": 9.939779430960988e-05, + "loss": 0.0061, + "step": 4850 + }, + { + "epoch": 2.809248554913295, + "grad_norm": 0.09959442168474197, + "learning_rate": 9.939266648643817e-05, + "loss": 0.0048, + "step": 4860 + }, + { + "epoch": 2.815028901734104, + "grad_norm": 0.08988264948129654, + "learning_rate": 9.938751705723607e-05, + "loss": 0.0063, + "step": 4870 + }, + { + "epoch": 2.820809248554913, + "grad_norm": 0.08533381670713425, + "learning_rate": 9.938234602425613e-05, + "loss": 0.0041, + "step": 4880 + }, + { + "epoch": 2.8265895953757223, + "grad_norm": 0.10899991542100906, + "learning_rate": 9.93771533897603e-05, + "loss": 0.0051, + "step": 4890 + }, + { + "epoch": 2.832369942196532, + "grad_norm": 0.10073061287403107, + "learning_rate": 9.937193915602004e-05, + "loss": 0.005, + "step": 4900 + }, + { + "epoch": 2.838150289017341, + "grad_norm": 0.10269533097743988, + "learning_rate": 9.936670332531621e-05, + "loss": 0.0058, + "step": 4910 + }, + { + "epoch": 2.8439306358381504, + "grad_norm": 0.08050937205553055, + "learning_rate": 9.936144589993916e-05, + "loss": 0.0054, + "step": 4920 + }, + { + "epoch": 2.8497109826589595, + "grad_norm": 0.09727004170417786, + "learning_rate": 9.935616688218867e-05, + "loss": 0.0042, + "step": 4930 + }, + { + "epoch": 2.8554913294797686, + "grad_norm": 0.07916481792926788, + "learning_rate": 9.935086627437395e-05, + "loss": 0.0048, + "step": 4940 + }, + { + "epoch": 2.861271676300578, + "grad_norm": 0.1361258327960968, + "learning_rate": 9.934554407881366e-05, + "loss": 0.0106, + "step": 4950 + }, + { + "epoch": 2.867052023121387, + "grad_norm": 0.1195870116353035, + "learning_rate": 9.934020029783593e-05, + "loss": 0.0045, + "step": 4960 + }, + { + "epoch": 2.8728323699421967, + "grad_norm": 0.0782453715801239, + "learning_rate": 9.933483493377829e-05, + "loss": 0.0045, + "step": 4970 + }, + { + "epoch": 2.878612716763006, + "grad_norm": 0.064698226749897, + "learning_rate": 9.932944798898774e-05, + "loss": 0.0047, + "step": 4980 + }, + { + "epoch": 2.884393063583815, + "grad_norm": 0.07942969352006912, + "learning_rate": 9.932403946582072e-05, + "loss": 0.008, + "step": 4990 + }, + { + "epoch": 2.8901734104046244, + "grad_norm": 0.09807512909173965, + "learning_rate": 9.93186093666431e-05, + "loss": 0.0058, + "step": 5000 + }, + { + "epoch": 2.8959537572254335, + "grad_norm": 0.10162036120891571, + "learning_rate": 9.931315769383018e-05, + "loss": 0.0042, + "step": 5010 + }, + { + "epoch": 2.901734104046243, + "grad_norm": 0.09182185679674149, + "learning_rate": 9.930768444976672e-05, + "loss": 0.0098, + "step": 5020 + }, + { + "epoch": 2.907514450867052, + "grad_norm": 0.09831009805202484, + "learning_rate": 9.93021896368469e-05, + "loss": 0.0056, + "step": 5030 + }, + { + "epoch": 2.913294797687861, + "grad_norm": 0.09399363398551941, + "learning_rate": 9.929667325747431e-05, + "loss": 0.0045, + "step": 5040 + }, + { + "epoch": 2.9190751445086707, + "grad_norm": 0.07335769385099411, + "learning_rate": 9.929113531406205e-05, + "loss": 0.0044, + "step": 5050 + }, + { + "epoch": 2.9248554913294798, + "grad_norm": 0.056684091687202454, + "learning_rate": 9.928557580903257e-05, + "loss": 0.0068, + "step": 5060 + }, + { + "epoch": 2.9306358381502893, + "grad_norm": 0.07843078672885895, + "learning_rate": 9.927999474481779e-05, + "loss": 0.0052, + "step": 5070 + }, + { + "epoch": 2.9364161849710984, + "grad_norm": 0.08666455745697021, + "learning_rate": 9.927439212385907e-05, + "loss": 0.004, + "step": 5080 + }, + { + "epoch": 2.9421965317919074, + "grad_norm": 0.13427752256393433, + "learning_rate": 9.926876794860718e-05, + "loss": 0.0084, + "step": 5090 + }, + { + "epoch": 2.9479768786127165, + "grad_norm": 0.08947822451591492, + "learning_rate": 9.926312222152235e-05, + "loss": 0.0057, + "step": 5100 + }, + { + "epoch": 2.953757225433526, + "grad_norm": 0.09052816778421402, + "learning_rate": 9.925745494507414e-05, + "loss": 0.0057, + "step": 5110 + }, + { + "epoch": 2.959537572254335, + "grad_norm": 0.09049113839864731, + "learning_rate": 9.925176612174169e-05, + "loss": 0.0052, + "step": 5120 + }, + { + "epoch": 2.9653179190751446, + "grad_norm": 0.09374893456697464, + "learning_rate": 9.924605575401346e-05, + "loss": 0.0049, + "step": 5130 + }, + { + "epoch": 2.9710982658959537, + "grad_norm": 0.10202183574438095, + "learning_rate": 9.924032384438733e-05, + "loss": 0.0047, + "step": 5140 + }, + { + "epoch": 2.976878612716763, + "grad_norm": 0.12373793870210648, + "learning_rate": 9.923457039537066e-05, + "loss": 0.0072, + "step": 5150 + }, + { + "epoch": 2.9826589595375723, + "grad_norm": 0.12671178579330444, + "learning_rate": 9.92287954094802e-05, + "loss": 0.0047, + "step": 5160 + }, + { + "epoch": 2.9884393063583814, + "grad_norm": 0.13292868435382843, + "learning_rate": 9.922299888924212e-05, + "loss": 0.0055, + "step": 5170 + }, + { + "epoch": 2.994219653179191, + "grad_norm": 0.0895073339343071, + "learning_rate": 9.921718083719203e-05, + "loss": 0.0039, + "step": 5180 + }, + { + "epoch": 3.0, + "grad_norm": 0.09845282137393951, + "learning_rate": 9.921134125587491e-05, + "loss": 0.0051, + "step": 5190 + }, + { + "epoch": 3.005780346820809, + "grad_norm": 0.17806030809879303, + "learning_rate": 9.920548014784523e-05, + "loss": 0.0052, + "step": 5200 + }, + { + "epoch": 3.0115606936416186, + "grad_norm": 0.09359262883663177, + "learning_rate": 9.919959751566681e-05, + "loss": 0.0049, + "step": 5210 + }, + { + "epoch": 3.0173410404624277, + "grad_norm": 0.14200544357299805, + "learning_rate": 9.919369336191291e-05, + "loss": 0.005, + "step": 5220 + }, + { + "epoch": 3.023121387283237, + "grad_norm": 0.08623161911964417, + "learning_rate": 9.918776768916625e-05, + "loss": 0.0042, + "step": 5230 + }, + { + "epoch": 3.0289017341040463, + "grad_norm": 0.0641922876238823, + "learning_rate": 9.918182050001888e-05, + "loss": 0.0047, + "step": 5240 + }, + { + "epoch": 3.0346820809248554, + "grad_norm": 0.08116123825311661, + "learning_rate": 9.91758517970723e-05, + "loss": 0.0054, + "step": 5250 + }, + { + "epoch": 3.040462427745665, + "grad_norm": 0.07552453875541687, + "learning_rate": 9.916986158293744e-05, + "loss": 0.0048, + "step": 5260 + }, + { + "epoch": 3.046242774566474, + "grad_norm": 0.09604078531265259, + "learning_rate": 9.916384986023463e-05, + "loss": 0.0055, + "step": 5270 + }, + { + "epoch": 3.052023121387283, + "grad_norm": 0.08662170171737671, + "learning_rate": 9.915781663159359e-05, + "loss": 0.004, + "step": 5280 + }, + { + "epoch": 3.0578034682080926, + "grad_norm": 0.07890532165765762, + "learning_rate": 9.915176189965346e-05, + "loss": 0.0049, + "step": 5290 + }, + { + "epoch": 3.0635838150289016, + "grad_norm": 0.07367055118083954, + "learning_rate": 9.914568566706279e-05, + "loss": 0.0043, + "step": 5300 + }, + { + "epoch": 3.069364161849711, + "grad_norm": 0.05489453300833702, + "learning_rate": 9.913958793647953e-05, + "loss": 0.0047, + "step": 5310 + }, + { + "epoch": 3.0751445086705202, + "grad_norm": 0.04112934693694115, + "learning_rate": 9.913346871057102e-05, + "loss": 0.0037, + "step": 5320 + }, + { + "epoch": 3.0809248554913293, + "grad_norm": 0.06684437394142151, + "learning_rate": 9.912732799201403e-05, + "loss": 0.0051, + "step": 5330 + }, + { + "epoch": 3.086705202312139, + "grad_norm": 0.06671774387359619, + "learning_rate": 9.912116578349474e-05, + "loss": 0.0043, + "step": 5340 + }, + { + "epoch": 3.092485549132948, + "grad_norm": 0.07309421896934509, + "learning_rate": 9.911498208770867e-05, + "loss": 0.0041, + "step": 5350 + }, + { + "epoch": 3.098265895953757, + "grad_norm": 0.08895828574895859, + "learning_rate": 9.910877690736078e-05, + "loss": 0.0038, + "step": 5360 + }, + { + "epoch": 3.1040462427745665, + "grad_norm": 0.05753541737794876, + "learning_rate": 9.910255024516546e-05, + "loss": 0.0046, + "step": 5370 + }, + { + "epoch": 3.1098265895953756, + "grad_norm": 0.1112755537033081, + "learning_rate": 9.909630210384644e-05, + "loss": 0.005, + "step": 5380 + }, + { + "epoch": 3.115606936416185, + "grad_norm": 0.07616247981786728, + "learning_rate": 9.909003248613688e-05, + "loss": 0.0055, + "step": 5390 + }, + { + "epoch": 3.121387283236994, + "grad_norm": 0.09826352447271347, + "learning_rate": 9.908374139477934e-05, + "loss": 0.0048, + "step": 5400 + }, + { + "epoch": 3.1271676300578033, + "grad_norm": 0.10984566062688828, + "learning_rate": 9.907742883252571e-05, + "loss": 0.0059, + "step": 5410 + }, + { + "epoch": 3.132947976878613, + "grad_norm": 0.10086532682180405, + "learning_rate": 9.907109480213736e-05, + "loss": 0.0052, + "step": 5420 + }, + { + "epoch": 3.138728323699422, + "grad_norm": 0.0686434879899025, + "learning_rate": 9.906473930638498e-05, + "loss": 0.0049, + "step": 5430 + }, + { + "epoch": 3.1445086705202314, + "grad_norm": 0.08263476192951202, + "learning_rate": 9.90583623480487e-05, + "loss": 0.004, + "step": 5440 + }, + { + "epoch": 3.1502890173410405, + "grad_norm": 0.08829036355018616, + "learning_rate": 9.905196392991802e-05, + "loss": 0.0047, + "step": 5450 + }, + { + "epoch": 3.1560693641618496, + "grad_norm": 0.07960904389619827, + "learning_rate": 9.904554405479183e-05, + "loss": 0.0052, + "step": 5460 + }, + { + "epoch": 3.161849710982659, + "grad_norm": 0.13208231329917908, + "learning_rate": 9.903910272547838e-05, + "loss": 0.005, + "step": 5470 + }, + { + "epoch": 3.167630057803468, + "grad_norm": 0.0960419774055481, + "learning_rate": 9.903263994479533e-05, + "loss": 0.0044, + "step": 5480 + }, + { + "epoch": 3.1734104046242773, + "grad_norm": 0.10696357488632202, + "learning_rate": 9.902615571556972e-05, + "loss": 0.0044, + "step": 5490 + }, + { + "epoch": 3.179190751445087, + "grad_norm": 0.12074033915996552, + "learning_rate": 9.9019650040638e-05, + "loss": 0.0054, + "step": 5500 + }, + { + "epoch": 3.184971098265896, + "grad_norm": 0.05908733978867531, + "learning_rate": 9.901312292284594e-05, + "loss": 0.005, + "step": 5510 + }, + { + "epoch": 3.1907514450867054, + "grad_norm": 0.08794012665748596, + "learning_rate": 9.900657436504875e-05, + "loss": 0.005, + "step": 5520 + }, + { + "epoch": 3.1965317919075145, + "grad_norm": 0.08090519905090332, + "learning_rate": 9.900000437011097e-05, + "loss": 0.0048, + "step": 5530 + }, + { + "epoch": 3.2023121387283235, + "grad_norm": 0.06776402145624161, + "learning_rate": 9.899341294090653e-05, + "loss": 0.0032, + "step": 5540 + }, + { + "epoch": 3.208092485549133, + "grad_norm": 0.09918577969074249, + "learning_rate": 9.898680008031877e-05, + "loss": 0.0053, + "step": 5550 + }, + { + "epoch": 3.213872832369942, + "grad_norm": 0.11192861199378967, + "learning_rate": 9.898016579124037e-05, + "loss": 0.0069, + "step": 5560 + }, + { + "epoch": 3.2196531791907512, + "grad_norm": 0.0986863300204277, + "learning_rate": 9.89735100765734e-05, + "loss": 0.0043, + "step": 5570 + }, + { + "epoch": 3.2254335260115607, + "grad_norm": 0.10766062885522842, + "learning_rate": 9.89668329392293e-05, + "loss": 0.0083, + "step": 5580 + }, + { + "epoch": 3.23121387283237, + "grad_norm": 0.11381088942289352, + "learning_rate": 9.896013438212885e-05, + "loss": 0.0051, + "step": 5590 + }, + { + "epoch": 3.2369942196531793, + "grad_norm": 0.13150329887866974, + "learning_rate": 9.895341440820225e-05, + "loss": 0.004, + "step": 5600 + }, + { + "epoch": 3.2427745664739884, + "grad_norm": 0.07293879985809326, + "learning_rate": 9.894667302038902e-05, + "loss": 0.004, + "step": 5610 + }, + { + "epoch": 3.2485549132947975, + "grad_norm": 0.09569191187620163, + "learning_rate": 9.893991022163812e-05, + "loss": 0.0049, + "step": 5620 + }, + { + "epoch": 3.254335260115607, + "grad_norm": 0.10423526167869568, + "learning_rate": 9.893312601490779e-05, + "loss": 0.0039, + "step": 5630 + }, + { + "epoch": 3.260115606936416, + "grad_norm": 0.0878419354557991, + "learning_rate": 9.892632040316568e-05, + "loss": 0.0038, + "step": 5640 + }, + { + "epoch": 3.2658959537572256, + "grad_norm": 0.08911574631929398, + "learning_rate": 9.891949338938877e-05, + "loss": 0.0049, + "step": 5650 + }, + { + "epoch": 3.2716763005780347, + "grad_norm": 0.07388553023338318, + "learning_rate": 9.89126449765635e-05, + "loss": 0.004, + "step": 5660 + }, + { + "epoch": 3.277456647398844, + "grad_norm": 0.09998785704374313, + "learning_rate": 9.890577516768551e-05, + "loss": 0.0076, + "step": 5670 + }, + { + "epoch": 3.2832369942196533, + "grad_norm": 0.10120239108800888, + "learning_rate": 9.889888396575995e-05, + "loss": 0.0051, + "step": 5680 + }, + { + "epoch": 3.2890173410404624, + "grad_norm": 0.10796697437763214, + "learning_rate": 9.889197137380125e-05, + "loss": 0.0063, + "step": 5690 + }, + { + "epoch": 3.294797687861272, + "grad_norm": 0.07642047852277756, + "learning_rate": 9.888503739483321e-05, + "loss": 0.0046, + "step": 5700 + }, + { + "epoch": 3.300578034682081, + "grad_norm": 0.1060706302523613, + "learning_rate": 9.887808203188897e-05, + "loss": 0.0046, + "step": 5710 + }, + { + "epoch": 3.30635838150289, + "grad_norm": 0.07729043066501617, + "learning_rate": 9.887110528801108e-05, + "loss": 0.0043, + "step": 5720 + }, + { + "epoch": 3.3121387283236996, + "grad_norm": 0.07654937356710434, + "learning_rate": 9.886410716625137e-05, + "loss": 0.0044, + "step": 5730 + }, + { + "epoch": 3.3179190751445087, + "grad_norm": 0.10756026208400726, + "learning_rate": 9.885708766967108e-05, + "loss": 0.0045, + "step": 5740 + }, + { + "epoch": 3.3236994219653178, + "grad_norm": 0.09746664017438889, + "learning_rate": 9.885004680134076e-05, + "loss": 0.0053, + "step": 5750 + }, + { + "epoch": 3.3294797687861273, + "grad_norm": 0.06226349249482155, + "learning_rate": 9.884298456434033e-05, + "loss": 0.0036, + "step": 5760 + }, + { + "epoch": 3.3352601156069364, + "grad_norm": 0.076237753033638, + "learning_rate": 9.883590096175905e-05, + "loss": 0.0046, + "step": 5770 + }, + { + "epoch": 3.3410404624277454, + "grad_norm": 0.09564107656478882, + "learning_rate": 9.882879599669554e-05, + "loss": 0.0036, + "step": 5780 + }, + { + "epoch": 3.346820809248555, + "grad_norm": 0.11294952034950256, + "learning_rate": 9.882166967225775e-05, + "loss": 0.0044, + "step": 5790 + }, + { + "epoch": 3.352601156069364, + "grad_norm": 0.10337945073843002, + "learning_rate": 9.881452199156296e-05, + "loss": 0.0042, + "step": 5800 + }, + { + "epoch": 3.3583815028901736, + "grad_norm": 0.08041290193796158, + "learning_rate": 9.880735295773782e-05, + "loss": 0.0034, + "step": 5810 + }, + { + "epoch": 3.3641618497109826, + "grad_norm": 0.08811701089143753, + "learning_rate": 9.88001625739183e-05, + "loss": 0.0046, + "step": 5820 + }, + { + "epoch": 3.3699421965317917, + "grad_norm": 0.08728475868701935, + "learning_rate": 9.879295084324976e-05, + "loss": 0.0035, + "step": 5830 + }, + { + "epoch": 3.3757225433526012, + "grad_norm": 0.07673713564872742, + "learning_rate": 9.87857177688868e-05, + "loss": 0.004, + "step": 5840 + }, + { + "epoch": 3.3815028901734103, + "grad_norm": 0.04676206782460213, + "learning_rate": 9.877846335399343e-05, + "loss": 0.0042, + "step": 5850 + }, + { + "epoch": 3.38728323699422, + "grad_norm": 0.1315360814332962, + "learning_rate": 9.8771187601743e-05, + "loss": 0.0073, + "step": 5860 + }, + { + "epoch": 3.393063583815029, + "grad_norm": 0.11324327439069748, + "learning_rate": 9.876389051531813e-05, + "loss": 0.0069, + "step": 5870 + }, + { + "epoch": 3.398843930635838, + "grad_norm": 0.1283894181251526, + "learning_rate": 9.875657209791088e-05, + "loss": 0.0045, + "step": 5880 + }, + { + "epoch": 3.4046242774566475, + "grad_norm": 0.09971684217453003, + "learning_rate": 9.874923235272248e-05, + "loss": 0.006, + "step": 5890 + }, + { + "epoch": 3.4104046242774566, + "grad_norm": 0.08040836453437805, + "learning_rate": 9.874187128296369e-05, + "loss": 0.0056, + "step": 5900 + }, + { + "epoch": 3.416184971098266, + "grad_norm": 0.09063635766506195, + "learning_rate": 9.873448889185439e-05, + "loss": 0.0046, + "step": 5910 + }, + { + "epoch": 3.421965317919075, + "grad_norm": 0.09057789295911789, + "learning_rate": 9.872708518262397e-05, + "loss": 0.0077, + "step": 5920 + }, + { + "epoch": 3.4277456647398843, + "grad_norm": 0.08726388961076736, + "learning_rate": 9.871966015851101e-05, + "loss": 0.0043, + "step": 5930 + }, + { + "epoch": 3.433526011560694, + "grad_norm": 0.06119583174586296, + "learning_rate": 9.87122138227635e-05, + "loss": 0.0045, + "step": 5940 + }, + { + "epoch": 3.439306358381503, + "grad_norm": 0.059672337025403976, + "learning_rate": 9.87047461786387e-05, + "loss": 0.0038, + "step": 5950 + }, + { + "epoch": 3.445086705202312, + "grad_norm": 0.08882300555706024, + "learning_rate": 9.869725722940323e-05, + "loss": 0.0043, + "step": 5960 + }, + { + "epoch": 3.4508670520231215, + "grad_norm": 0.09898592531681061, + "learning_rate": 9.868974697833299e-05, + "loss": 0.0041, + "step": 5970 + }, + { + "epoch": 3.4566473988439306, + "grad_norm": 0.054888781160116196, + "learning_rate": 9.868221542871324e-05, + "loss": 0.0037, + "step": 5980 + }, + { + "epoch": 3.4624277456647397, + "grad_norm": 0.07501913607120514, + "learning_rate": 9.867466258383853e-05, + "loss": 0.0054, + "step": 5990 + }, + { + "epoch": 3.468208092485549, + "grad_norm": 0.10471879690885544, + "learning_rate": 9.866708844701272e-05, + "loss": 0.0053, + "step": 6000 + }, + { + "epoch": 3.4739884393063583, + "grad_norm": 0.14950262010097504, + "learning_rate": 9.865949302154899e-05, + "loss": 0.0061, + "step": 6010 + }, + { + "epoch": 3.479768786127168, + "grad_norm": 0.14313995838165283, + "learning_rate": 9.865187631076987e-05, + "loss": 0.0048, + "step": 6020 + }, + { + "epoch": 3.485549132947977, + "grad_norm": 0.09261684119701385, + "learning_rate": 9.864423831800712e-05, + "loss": 0.0041, + "step": 6030 + }, + { + "epoch": 3.491329479768786, + "grad_norm": 0.10116013884544373, + "learning_rate": 9.863657904660191e-05, + "loss": 0.0049, + "step": 6040 + }, + { + "epoch": 3.4971098265895955, + "grad_norm": 0.10000468790531158, + "learning_rate": 9.862889849990462e-05, + "loss": 0.0042, + "step": 6050 + }, + { + "epoch": 3.5028901734104045, + "grad_norm": 0.08636848628520966, + "learning_rate": 9.8621196681275e-05, + "loss": 0.0039, + "step": 6060 + }, + { + "epoch": 3.508670520231214, + "grad_norm": 0.09822828322649002, + "learning_rate": 9.861347359408211e-05, + "loss": 0.0041, + "step": 6070 + }, + { + "epoch": 3.514450867052023, + "grad_norm": 0.06331049650907516, + "learning_rate": 9.860572924170426e-05, + "loss": 0.0035, + "step": 6080 + }, + { + "epoch": 3.520231213872832, + "grad_norm": 0.09790698438882828, + "learning_rate": 9.85979636275291e-05, + "loss": 0.0048, + "step": 6090 + }, + { + "epoch": 3.5260115606936417, + "grad_norm": 0.09691625833511353, + "learning_rate": 9.859017675495357e-05, + "loss": 0.0037, + "step": 6100 + }, + { + "epoch": 3.531791907514451, + "grad_norm": 0.08491610735654831, + "learning_rate": 9.858236862738392e-05, + "loss": 0.0045, + "step": 6110 + }, + { + "epoch": 3.5375722543352603, + "grad_norm": 0.09983476251363754, + "learning_rate": 9.85745392482357e-05, + "loss": 0.0039, + "step": 6120 + }, + { + "epoch": 3.5433526011560694, + "grad_norm": 0.06305649131536484, + "learning_rate": 9.856668862093372e-05, + "loss": 0.0035, + "step": 6130 + }, + { + "epoch": 3.5491329479768785, + "grad_norm": 0.08348894864320755, + "learning_rate": 9.855881674891214e-05, + "loss": 0.0052, + "step": 6140 + }, + { + "epoch": 3.5549132947976876, + "grad_norm": 0.07188717275857925, + "learning_rate": 9.855092363561437e-05, + "loss": 0.0044, + "step": 6150 + }, + { + "epoch": 3.560693641618497, + "grad_norm": 0.07703947275876999, + "learning_rate": 9.854300928449312e-05, + "loss": 0.0043, + "step": 6160 + }, + { + "epoch": 3.5664739884393066, + "grad_norm": 0.11623793840408325, + "learning_rate": 9.85350736990104e-05, + "loss": 0.0048, + "step": 6170 + }, + { + "epoch": 3.5722543352601157, + "grad_norm": 0.07263840734958649, + "learning_rate": 9.852711688263752e-05, + "loss": 0.0043, + "step": 6180 + }, + { + "epoch": 3.578034682080925, + "grad_norm": 0.08389998972415924, + "learning_rate": 9.851913883885503e-05, + "loss": 0.0053, + "step": 6190 + }, + { + "epoch": 3.583815028901734, + "grad_norm": 0.08913276344537735, + "learning_rate": 9.851113957115283e-05, + "loss": 0.0054, + "step": 6200 + }, + { + "epoch": 3.5895953757225434, + "grad_norm": 0.09740209579467773, + "learning_rate": 9.850311908303005e-05, + "loss": 0.0095, + "step": 6210 + }, + { + "epoch": 3.5953757225433525, + "grad_norm": 0.0637655109167099, + "learning_rate": 9.849507737799514e-05, + "loss": 0.0043, + "step": 6220 + }, + { + "epoch": 3.601156069364162, + "grad_norm": 0.1079898402094841, + "learning_rate": 9.84870144595658e-05, + "loss": 0.0043, + "step": 6230 + }, + { + "epoch": 3.606936416184971, + "grad_norm": 0.06550470739603043, + "learning_rate": 9.847893033126903e-05, + "loss": 0.0036, + "step": 6240 + }, + { + "epoch": 3.61271676300578, + "grad_norm": 0.07686427980661392, + "learning_rate": 9.847082499664111e-05, + "loss": 0.006, + "step": 6250 + }, + { + "epoch": 3.6184971098265897, + "grad_norm": 0.0707835853099823, + "learning_rate": 9.846269845922758e-05, + "loss": 0.008, + "step": 6260 + }, + { + "epoch": 3.6242774566473988, + "grad_norm": 0.051362019032239914, + "learning_rate": 9.845455072258326e-05, + "loss": 0.0043, + "step": 6270 + }, + { + "epoch": 3.6300578034682083, + "grad_norm": 0.06963202357292175, + "learning_rate": 9.844638179027226e-05, + "loss": 0.0042, + "step": 6280 + }, + { + "epoch": 3.6358381502890174, + "grad_norm": 0.08382728695869446, + "learning_rate": 9.843819166586795e-05, + "loss": 0.0047, + "step": 6290 + }, + { + "epoch": 3.6416184971098264, + "grad_norm": 0.06943295150995255, + "learning_rate": 9.842998035295294e-05, + "loss": 0.0059, + "step": 6300 + }, + { + "epoch": 3.647398843930636, + "grad_norm": 0.08568401634693146, + "learning_rate": 9.842174785511918e-05, + "loss": 0.0039, + "step": 6310 + }, + { + "epoch": 3.653179190751445, + "grad_norm": 0.05123983696103096, + "learning_rate": 9.841349417596779e-05, + "loss": 0.0039, + "step": 6320 + }, + { + "epoch": 3.6589595375722546, + "grad_norm": 0.08430792391300201, + "learning_rate": 9.840521931910926e-05, + "loss": 0.0038, + "step": 6330 + }, + { + "epoch": 3.6647398843930636, + "grad_norm": 0.07086501270532608, + "learning_rate": 9.839692328816327e-05, + "loss": 0.0046, + "step": 6340 + }, + { + "epoch": 3.6705202312138727, + "grad_norm": 0.08472570031881332, + "learning_rate": 9.838860608675879e-05, + "loss": 0.0048, + "step": 6350 + }, + { + "epoch": 3.6763005780346822, + "grad_norm": 0.07034272700548172, + "learning_rate": 9.838026771853406e-05, + "loss": 0.0046, + "step": 6360 + }, + { + "epoch": 3.6820809248554913, + "grad_norm": 0.05583292618393898, + "learning_rate": 9.837190818713655e-05, + "loss": 0.0054, + "step": 6370 + }, + { + "epoch": 3.687861271676301, + "grad_norm": 0.05575549602508545, + "learning_rate": 9.836352749622299e-05, + "loss": 0.0048, + "step": 6380 + }, + { + "epoch": 3.69364161849711, + "grad_norm": 0.0586441271007061, + "learning_rate": 9.835512564945941e-05, + "loss": 0.0036, + "step": 6390 + }, + { + "epoch": 3.699421965317919, + "grad_norm": 0.0920369029045105, + "learning_rate": 9.834670265052104e-05, + "loss": 0.0083, + "step": 6400 + }, + { + "epoch": 3.705202312138728, + "grad_norm": 0.11683415621519089, + "learning_rate": 9.833825850309239e-05, + "loss": 0.0042, + "step": 6410 + }, + { + "epoch": 3.7109826589595376, + "grad_norm": 0.06055425852537155, + "learning_rate": 9.832979321086723e-05, + "loss": 0.004, + "step": 6420 + }, + { + "epoch": 3.7167630057803467, + "grad_norm": 0.07325479388237, + "learning_rate": 9.832130677754854e-05, + "loss": 0.0044, + "step": 6430 + }, + { + "epoch": 3.722543352601156, + "grad_norm": 0.07589448988437653, + "learning_rate": 9.831279920684861e-05, + "loss": 0.0048, + "step": 6440 + }, + { + "epoch": 3.7283236994219653, + "grad_norm": 0.10280989110469818, + "learning_rate": 9.830427050248891e-05, + "loss": 0.0127, + "step": 6450 + }, + { + "epoch": 3.7341040462427744, + "grad_norm": 0.08522803336381912, + "learning_rate": 9.829572066820021e-05, + "loss": 0.0067, + "step": 6460 + }, + { + "epoch": 3.739884393063584, + "grad_norm": 0.10551602393388748, + "learning_rate": 9.828714970772247e-05, + "loss": 0.0044, + "step": 6470 + }, + { + "epoch": 3.745664739884393, + "grad_norm": 0.1057896614074707, + "learning_rate": 9.827855762480493e-05, + "loss": 0.0062, + "step": 6480 + }, + { + "epoch": 3.7514450867052025, + "grad_norm": 0.08756575733423233, + "learning_rate": 9.826994442320608e-05, + "loss": 0.0048, + "step": 6490 + }, + { + "epoch": 3.7572254335260116, + "grad_norm": 0.06838827580213547, + "learning_rate": 9.82613101066936e-05, + "loss": 0.0054, + "step": 6500 + }, + { + "epoch": 3.7630057803468207, + "grad_norm": 0.10524417459964752, + "learning_rate": 9.825265467904445e-05, + "loss": 0.0048, + "step": 6510 + }, + { + "epoch": 3.76878612716763, + "grad_norm": 0.16791360080242157, + "learning_rate": 9.82439781440448e-05, + "loss": 0.0048, + "step": 6520 + }, + { + "epoch": 3.7745664739884393, + "grad_norm": 0.10500805824995041, + "learning_rate": 9.823528050549006e-05, + "loss": 0.0044, + "step": 6530 + }, + { + "epoch": 3.7803468208092488, + "grad_norm": 0.10242146253585815, + "learning_rate": 9.822656176718487e-05, + "loss": 0.0047, + "step": 6540 + }, + { + "epoch": 3.786127167630058, + "grad_norm": 0.12535038590431213, + "learning_rate": 9.82178219329431e-05, + "loss": 0.0043, + "step": 6550 + }, + { + "epoch": 3.791907514450867, + "grad_norm": 0.10232984274625778, + "learning_rate": 9.820906100658789e-05, + "loss": 0.0035, + "step": 6560 + }, + { + "epoch": 3.7976878612716765, + "grad_norm": 0.09715797007083893, + "learning_rate": 9.82002789919515e-05, + "loss": 0.0047, + "step": 6570 + }, + { + "epoch": 3.8034682080924855, + "grad_norm": 0.10294798761606216, + "learning_rate": 9.819147589287554e-05, + "loss": 0.0042, + "step": 6580 + }, + { + "epoch": 3.809248554913295, + "grad_norm": 0.06270695477724075, + "learning_rate": 9.818265171321074e-05, + "loss": 0.0042, + "step": 6590 + }, + { + "epoch": 3.815028901734104, + "grad_norm": 0.10261218994855881, + "learning_rate": 9.817380645681714e-05, + "loss": 0.0052, + "step": 6600 + }, + { + "epoch": 3.820809248554913, + "grad_norm": 0.07845862209796906, + "learning_rate": 9.816494012756392e-05, + "loss": 0.0037, + "step": 6610 + }, + { + "epoch": 3.8265895953757223, + "grad_norm": 0.08661410212516785, + "learning_rate": 9.815605272932953e-05, + "loss": 0.0068, + "step": 6620 + }, + { + "epoch": 3.832369942196532, + "grad_norm": 0.07012265175580978, + "learning_rate": 9.814714426600162e-05, + "loss": 0.0049, + "step": 6630 + }, + { + "epoch": 3.838150289017341, + "grad_norm": 0.11753591895103455, + "learning_rate": 9.813821474147704e-05, + "loss": 0.0066, + "step": 6640 + }, + { + "epoch": 3.8439306358381504, + "grad_norm": 0.10362179577350616, + "learning_rate": 9.812926415966189e-05, + "loss": 0.0041, + "step": 6650 + }, + { + "epoch": 3.8497109826589595, + "grad_norm": 0.09316336363554001, + "learning_rate": 9.812029252447143e-05, + "loss": 0.0043, + "step": 6660 + }, + { + "epoch": 3.8554913294797686, + "grad_norm": 0.10363206267356873, + "learning_rate": 9.811129983983018e-05, + "loss": 0.0041, + "step": 6670 + }, + { + "epoch": 3.861271676300578, + "grad_norm": 0.0667562484741211, + "learning_rate": 9.810228610967185e-05, + "loss": 0.0071, + "step": 6680 + }, + { + "epoch": 3.867052023121387, + "grad_norm": 0.12337341159582138, + "learning_rate": 9.809325133793934e-05, + "loss": 0.0053, + "step": 6690 + }, + { + "epoch": 3.8728323699421967, + "grad_norm": 0.09047921746969223, + "learning_rate": 9.808419552858477e-05, + "loss": 0.0048, + "step": 6700 + }, + { + "epoch": 3.878612716763006, + "grad_norm": 0.07758771628141403, + "learning_rate": 9.807511868556944e-05, + "loss": 0.0045, + "step": 6710 + }, + { + "epoch": 3.884393063583815, + "grad_norm": 0.06790055334568024, + "learning_rate": 9.806602081286392e-05, + "loss": 0.0041, + "step": 6720 + }, + { + "epoch": 3.8901734104046244, + "grad_norm": 0.09480784088373184, + "learning_rate": 9.805690191444785e-05, + "loss": 0.0052, + "step": 6730 + }, + { + "epoch": 3.8959537572254335, + "grad_norm": 0.09276645630598068, + "learning_rate": 9.804776199431022e-05, + "loss": 0.0039, + "step": 6740 + }, + { + "epoch": 3.901734104046243, + "grad_norm": 0.11566020548343658, + "learning_rate": 9.80386010564491e-05, + "loss": 0.0047, + "step": 6750 + }, + { + "epoch": 3.907514450867052, + "grad_norm": 0.06422998756170273, + "learning_rate": 9.802941910487183e-05, + "loss": 0.0032, + "step": 6760 + }, + { + "epoch": 3.913294797687861, + "grad_norm": 0.10603903234004974, + "learning_rate": 9.802021614359487e-05, + "loss": 0.0048, + "step": 6770 + }, + { + "epoch": 3.9190751445086707, + "grad_norm": 0.12690307199954987, + "learning_rate": 9.801099217664394e-05, + "loss": 0.0043, + "step": 6780 + }, + { + "epoch": 3.9248554913294798, + "grad_norm": 0.12509547173976898, + "learning_rate": 9.800174720805388e-05, + "loss": 0.0048, + "step": 6790 + }, + { + "epoch": 3.9306358381502893, + "grad_norm": 0.11259645968675613, + "learning_rate": 9.799248124186878e-05, + "loss": 0.0068, + "step": 6800 + }, + { + "epoch": 3.9364161849710984, + "grad_norm": 0.08851031213998795, + "learning_rate": 9.798319428214187e-05, + "loss": 0.0039, + "step": 6810 + }, + { + "epoch": 3.9421965317919074, + "grad_norm": 0.10880535840988159, + "learning_rate": 9.79738863329356e-05, + "loss": 0.0069, + "step": 6820 + }, + { + "epoch": 3.9479768786127165, + "grad_norm": 0.10324430465698242, + "learning_rate": 9.79645573983216e-05, + "loss": 0.0047, + "step": 6830 + }, + { + "epoch": 3.953757225433526, + "grad_norm": 0.09232804924249649, + "learning_rate": 9.795520748238062e-05, + "loss": 0.0039, + "step": 6840 + }, + { + "epoch": 3.959537572254335, + "grad_norm": 0.07403694838285446, + "learning_rate": 9.794583658920264e-05, + "loss": 0.0044, + "step": 6850 + }, + { + "epoch": 3.9653179190751446, + "grad_norm": 0.05542504042387009, + "learning_rate": 9.793644472288683e-05, + "loss": 0.0034, + "step": 6860 + }, + { + "epoch": 3.9710982658959537, + "grad_norm": 0.07085678726434708, + "learning_rate": 9.79270318875415e-05, + "loss": 0.005, + "step": 6870 + }, + { + "epoch": 3.976878612716763, + "grad_norm": 0.0798199251294136, + "learning_rate": 9.791759808728416e-05, + "loss": 0.0038, + "step": 6880 + }, + { + "epoch": 3.9826589595375723, + "grad_norm": 0.07342632114887238, + "learning_rate": 9.790814332624143e-05, + "loss": 0.0029, + "step": 6890 + }, + { + "epoch": 3.9884393063583814, + "grad_norm": 0.07653181254863739, + "learning_rate": 9.789866760854919e-05, + "loss": 0.0041, + "step": 6900 + }, + { + "epoch": 3.994219653179191, + "grad_norm": 0.06251738965511322, + "learning_rate": 9.788917093835242e-05, + "loss": 0.0079, + "step": 6910 + }, + { + "epoch": 4.0, + "grad_norm": 0.07752005755901337, + "learning_rate": 9.787965331980528e-05, + "loss": 0.004, + "step": 6920 + }, + { + "epoch": 4.005780346820809, + "grad_norm": 0.10855576395988464, + "learning_rate": 9.787011475707111e-05, + "loss": 0.0054, + "step": 6930 + }, + { + "epoch": 4.011560693641618, + "grad_norm": 0.11480893194675446, + "learning_rate": 9.786055525432241e-05, + "loss": 0.006, + "step": 6940 + }, + { + "epoch": 4.017341040462428, + "grad_norm": 0.1400570124387741, + "learning_rate": 9.78509748157408e-05, + "loss": 0.0044, + "step": 6950 + }, + { + "epoch": 4.023121387283237, + "grad_norm": 0.10771267861127853, + "learning_rate": 9.784137344551713e-05, + "loss": 0.0052, + "step": 6960 + }, + { + "epoch": 4.028901734104046, + "grad_norm": 0.08657637983560562, + "learning_rate": 9.783175114785134e-05, + "loss": 0.0051, + "step": 6970 + }, + { + "epoch": 4.034682080924855, + "grad_norm": 0.07646024227142334, + "learning_rate": 9.782210792695254e-05, + "loss": 0.0039, + "step": 6980 + }, + { + "epoch": 4.040462427745664, + "grad_norm": 0.08171222358942032, + "learning_rate": 9.781244378703904e-05, + "loss": 0.0047, + "step": 6990 + }, + { + "epoch": 4.046242774566474, + "grad_norm": 0.07491995394229889, + "learning_rate": 9.780275873233824e-05, + "loss": 0.0037, + "step": 7000 + }, + { + "epoch": 4.0520231213872835, + "grad_norm": 0.13858239352703094, + "learning_rate": 9.77930527670867e-05, + "loss": 0.0052, + "step": 7010 + }, + { + "epoch": 4.057803468208093, + "grad_norm": 0.09935212135314941, + "learning_rate": 9.778332589553016e-05, + "loss": 0.0051, + "step": 7020 + }, + { + "epoch": 4.063583815028902, + "grad_norm": 0.07191646099090576, + "learning_rate": 9.777357812192349e-05, + "loss": 0.0044, + "step": 7030 + }, + { + "epoch": 4.069364161849711, + "grad_norm": 0.07832925766706467, + "learning_rate": 9.776380945053066e-05, + "loss": 0.004, + "step": 7040 + }, + { + "epoch": 4.07514450867052, + "grad_norm": 0.07194431871175766, + "learning_rate": 9.775401988562486e-05, + "loss": 0.0046, + "step": 7050 + }, + { + "epoch": 4.08092485549133, + "grad_norm": 0.07636430859565735, + "learning_rate": 9.774420943148835e-05, + "loss": 0.0037, + "step": 7060 + }, + { + "epoch": 4.086705202312139, + "grad_norm": 0.054698288440704346, + "learning_rate": 9.773437809241257e-05, + "loss": 0.0042, + "step": 7070 + }, + { + "epoch": 4.092485549132948, + "grad_norm": 0.06244511529803276, + "learning_rate": 9.772452587269808e-05, + "loss": 0.0037, + "step": 7080 + }, + { + "epoch": 4.098265895953757, + "grad_norm": 0.05341937392950058, + "learning_rate": 9.771465277665457e-05, + "loss": 0.0045, + "step": 7090 + }, + { + "epoch": 4.104046242774566, + "grad_norm": 0.06179845705628395, + "learning_rate": 9.770475880860089e-05, + "loss": 0.0038, + "step": 7100 + }, + { + "epoch": 4.109826589595376, + "grad_norm": 0.08358588814735413, + "learning_rate": 9.769484397286495e-05, + "loss": 0.0044, + "step": 7110 + }, + { + "epoch": 4.115606936416185, + "grad_norm": 0.07372605055570602, + "learning_rate": 9.768490827378388e-05, + "loss": 0.0042, + "step": 7120 + }, + { + "epoch": 4.121387283236994, + "grad_norm": 0.08515487611293793, + "learning_rate": 9.767495171570387e-05, + "loss": 0.0036, + "step": 7130 + }, + { + "epoch": 4.127167630057803, + "grad_norm": 0.06824032217264175, + "learning_rate": 9.766497430298027e-05, + "loss": 0.005, + "step": 7140 + }, + { + "epoch": 4.132947976878612, + "grad_norm": 0.06346060335636139, + "learning_rate": 9.765497603997754e-05, + "loss": 0.0044, + "step": 7150 + }, + { + "epoch": 4.138728323699422, + "grad_norm": 0.13387760519981384, + "learning_rate": 9.764495693106926e-05, + "loss": 0.0045, + "step": 7160 + }, + { + "epoch": 4.144508670520231, + "grad_norm": 0.10241768509149551, + "learning_rate": 9.76349169806381e-05, + "loss": 0.0044, + "step": 7170 + }, + { + "epoch": 4.1502890173410405, + "grad_norm": 0.09653797745704651, + "learning_rate": 9.762485619307591e-05, + "loss": 0.0036, + "step": 7180 + }, + { + "epoch": 4.15606936416185, + "grad_norm": 0.09247767180204391, + "learning_rate": 9.76147745727836e-05, + "loss": 0.0038, + "step": 7190 + }, + { + "epoch": 4.161849710982659, + "grad_norm": 0.08832607418298721, + "learning_rate": 9.760467212417124e-05, + "loss": 0.0044, + "step": 7200 + }, + { + "epoch": 4.167630057803469, + "grad_norm": 0.12045808881521225, + "learning_rate": 9.759454885165796e-05, + "loss": 0.0049, + "step": 7210 + }, + { + "epoch": 4.173410404624278, + "grad_norm": 0.07522665709257126, + "learning_rate": 9.7584404759672e-05, + "loss": 0.0038, + "step": 7220 + }, + { + "epoch": 4.179190751445087, + "grad_norm": 0.0632644072175026, + "learning_rate": 9.75742398526508e-05, + "loss": 0.0034, + "step": 7230 + }, + { + "epoch": 4.184971098265896, + "grad_norm": 0.08073872327804565, + "learning_rate": 9.756405413504077e-05, + "loss": 0.0043, + "step": 7240 + }, + { + "epoch": 4.190751445086705, + "grad_norm": 0.075049988925457, + "learning_rate": 9.755384761129752e-05, + "loss": 0.0033, + "step": 7250 + }, + { + "epoch": 4.196531791907514, + "grad_norm": 0.09683282673358917, + "learning_rate": 9.754362028588572e-05, + "loss": 0.0068, + "step": 7260 + }, + { + "epoch": 4.202312138728324, + "grad_norm": 0.08358323574066162, + "learning_rate": 9.753337216327917e-05, + "loss": 0.0039, + "step": 7270 + }, + { + "epoch": 4.208092485549133, + "grad_norm": 0.09701034426689148, + "learning_rate": 9.752310324796073e-05, + "loss": 0.004, + "step": 7280 + }, + { + "epoch": 4.213872832369942, + "grad_norm": 0.072493776679039, + "learning_rate": 9.751281354442235e-05, + "loss": 0.0048, + "step": 7290 + }, + { + "epoch": 4.219653179190751, + "grad_norm": 0.07446051388978958, + "learning_rate": 9.750250305716513e-05, + "loss": 0.0034, + "step": 7300 + }, + { + "epoch": 4.22543352601156, + "grad_norm": 0.09709165245294571, + "learning_rate": 9.749217179069923e-05, + "loss": 0.0043, + "step": 7310 + }, + { + "epoch": 4.23121387283237, + "grad_norm": 0.09105303138494492, + "learning_rate": 9.748181974954386e-05, + "loss": 0.0036, + "step": 7320 + }, + { + "epoch": 4.236994219653179, + "grad_norm": 0.08110969513654709, + "learning_rate": 9.74714469382274e-05, + "loss": 0.0039, + "step": 7330 + }, + { + "epoch": 4.242774566473988, + "grad_norm": 0.07299605756998062, + "learning_rate": 9.746105336128724e-05, + "loss": 0.0036, + "step": 7340 + }, + { + "epoch": 4.2485549132947975, + "grad_norm": 0.10487416386604309, + "learning_rate": 9.745063902326988e-05, + "loss": 0.0046, + "step": 7350 + }, + { + "epoch": 4.254335260115607, + "grad_norm": 0.04577060043811798, + "learning_rate": 9.744020392873093e-05, + "loss": 0.0036, + "step": 7360 + }, + { + "epoch": 4.2601156069364166, + "grad_norm": 0.10059268027544022, + "learning_rate": 9.742974808223504e-05, + "loss": 0.0078, + "step": 7370 + }, + { + "epoch": 4.265895953757226, + "grad_norm": 0.06948348879814148, + "learning_rate": 9.741927148835596e-05, + "loss": 0.0053, + "step": 7380 + }, + { + "epoch": 4.271676300578035, + "grad_norm": 0.051713019609451294, + "learning_rate": 9.740877415167648e-05, + "loss": 0.0034, + "step": 7390 + }, + { + "epoch": 4.277456647398844, + "grad_norm": 0.057397518306970596, + "learning_rate": 9.739825607678854e-05, + "loss": 0.0033, + "step": 7400 + }, + { + "epoch": 4.283236994219653, + "grad_norm": 0.05364494025707245, + "learning_rate": 9.738771726829308e-05, + "loss": 0.0032, + "step": 7410 + }, + { + "epoch": 4.289017341040463, + "grad_norm": 0.06555718183517456, + "learning_rate": 9.737715773080011e-05, + "loss": 0.0041, + "step": 7420 + }, + { + "epoch": 4.294797687861272, + "grad_norm": 0.07478900998830795, + "learning_rate": 9.736657746892877e-05, + "loss": 0.0037, + "step": 7430 + }, + { + "epoch": 4.300578034682081, + "grad_norm": 0.09410209208726883, + "learning_rate": 9.735597648730718e-05, + "loss": 0.0045, + "step": 7440 + }, + { + "epoch": 4.30635838150289, + "grad_norm": 0.09506339579820633, + "learning_rate": 9.734535479057262e-05, + "loss": 0.0053, + "step": 7450 + }, + { + "epoch": 4.312138728323699, + "grad_norm": 0.09616143256425858, + "learning_rate": 9.733471238337136e-05, + "loss": 0.005, + "step": 7460 + }, + { + "epoch": 4.317919075144509, + "grad_norm": 0.09171663224697113, + "learning_rate": 9.732404927035871e-05, + "loss": 0.0055, + "step": 7470 + }, + { + "epoch": 4.323699421965318, + "grad_norm": 0.06696099787950516, + "learning_rate": 9.731336545619915e-05, + "loss": 0.0037, + "step": 7480 + }, + { + "epoch": 4.329479768786127, + "grad_norm": 0.14176106452941895, + "learning_rate": 9.73026609455661e-05, + "loss": 0.0046, + "step": 7490 + }, + { + "epoch": 4.335260115606936, + "grad_norm": 0.08920951187610626, + "learning_rate": 9.72919357431421e-05, + "loss": 0.0039, + "step": 7500 + }, + { + "epoch": 4.341040462427745, + "grad_norm": 0.061191197484731674, + "learning_rate": 9.72811898536187e-05, + "loss": 0.0047, + "step": 7510 + }, + { + "epoch": 4.3468208092485545, + "grad_norm": 0.08400767296552658, + "learning_rate": 9.72704232816965e-05, + "loss": 0.0061, + "step": 7520 + }, + { + "epoch": 4.3526011560693645, + "grad_norm": 0.05825735628604889, + "learning_rate": 9.725963603208519e-05, + "loss": 0.0031, + "step": 7530 + }, + { + "epoch": 4.358381502890174, + "grad_norm": 0.06253275275230408, + "learning_rate": 9.724882810950348e-05, + "loss": 0.0032, + "step": 7540 + }, + { + "epoch": 4.364161849710983, + "grad_norm": 0.09076927602291107, + "learning_rate": 9.723799951867913e-05, + "loss": 0.0048, + "step": 7550 + }, + { + "epoch": 4.369942196531792, + "grad_norm": 0.14996130764484406, + "learning_rate": 9.722715026434889e-05, + "loss": 0.0036, + "step": 7560 + }, + { + "epoch": 4.375722543352601, + "grad_norm": 0.09381493926048279, + "learning_rate": 9.721628035125866e-05, + "loss": 0.0053, + "step": 7570 + }, + { + "epoch": 4.381502890173411, + "grad_norm": 0.06940823793411255, + "learning_rate": 9.720538978416325e-05, + "loss": 0.0034, + "step": 7580 + }, + { + "epoch": 4.38728323699422, + "grad_norm": 0.1408386528491974, + "learning_rate": 9.71944785678266e-05, + "loss": 0.0051, + "step": 7590 + }, + { + "epoch": 4.393063583815029, + "grad_norm": 0.07540041208267212, + "learning_rate": 9.718354670702161e-05, + "loss": 0.0031, + "step": 7600 + }, + { + "epoch": 4.398843930635838, + "grad_norm": 0.04325016215443611, + "learning_rate": 9.717259420653029e-05, + "loss": 0.0047, + "step": 7610 + }, + { + "epoch": 4.404624277456647, + "grad_norm": 0.15856178104877472, + "learning_rate": 9.716162107114361e-05, + "loss": 0.0073, + "step": 7620 + }, + { + "epoch": 4.410404624277457, + "grad_norm": 0.08500056713819504, + "learning_rate": 9.71506273056616e-05, + "loss": 0.0052, + "step": 7630 + }, + { + "epoch": 4.416184971098266, + "grad_norm": 0.1474866420030594, + "learning_rate": 9.713961291489331e-05, + "loss": 0.0073, + "step": 7640 + }, + { + "epoch": 4.421965317919075, + "grad_norm": 0.14277051389217377, + "learning_rate": 9.71285779036568e-05, + "loss": 0.0056, + "step": 7650 + }, + { + "epoch": 4.427745664739884, + "grad_norm": 0.10464701056480408, + "learning_rate": 9.711752227677916e-05, + "loss": 0.0039, + "step": 7660 + }, + { + "epoch": 4.433526011560693, + "grad_norm": 0.09123163670301437, + "learning_rate": 9.710644603909649e-05, + "loss": 0.004, + "step": 7670 + }, + { + "epoch": 4.4393063583815024, + "grad_norm": 0.06720109283924103, + "learning_rate": 9.709534919545393e-05, + "loss": 0.0039, + "step": 7680 + }, + { + "epoch": 4.445086705202312, + "grad_norm": 0.09755236655473709, + "learning_rate": 9.708423175070563e-05, + "loss": 0.0047, + "step": 7690 + }, + { + "epoch": 4.4508670520231215, + "grad_norm": 0.05199611186981201, + "learning_rate": 9.707309370971468e-05, + "loss": 0.0039, + "step": 7700 + }, + { + "epoch": 4.456647398843931, + "grad_norm": 0.08835835009813309, + "learning_rate": 9.70619350773533e-05, + "loss": 0.0048, + "step": 7710 + }, + { + "epoch": 4.46242774566474, + "grad_norm": 0.09824372082948685, + "learning_rate": 9.70507558585026e-05, + "loss": 0.006, + "step": 7720 + }, + { + "epoch": 4.468208092485549, + "grad_norm": 0.06983206421136856, + "learning_rate": 9.703955605805281e-05, + "loss": 0.0042, + "step": 7730 + }, + { + "epoch": 4.473988439306359, + "grad_norm": 0.05261417105793953, + "learning_rate": 9.702833568090306e-05, + "loss": 0.0033, + "step": 7740 + }, + { + "epoch": 4.479768786127168, + "grad_norm": 0.044182635843753815, + "learning_rate": 9.701709473196154e-05, + "loss": 0.0043, + "step": 7750 + }, + { + "epoch": 4.485549132947977, + "grad_norm": 0.1114625409245491, + "learning_rate": 9.700583321614541e-05, + "loss": 0.0035, + "step": 7760 + }, + { + "epoch": 4.491329479768786, + "grad_norm": 0.08481371402740479, + "learning_rate": 9.699455113838085e-05, + "loss": 0.006, + "step": 7770 + }, + { + "epoch": 4.497109826589595, + "grad_norm": 0.09450369328260422, + "learning_rate": 9.698324850360303e-05, + "loss": 0.0042, + "step": 7780 + }, + { + "epoch": 4.502890173410405, + "grad_norm": 0.09334803372621536, + "learning_rate": 9.69719253167561e-05, + "loss": 0.0047, + "step": 7790 + }, + { + "epoch": 4.508670520231214, + "grad_norm": 0.0968085527420044, + "learning_rate": 9.69605815827932e-05, + "loss": 0.0076, + "step": 7800 + }, + { + "epoch": 4.514450867052023, + "grad_norm": 0.09961452335119247, + "learning_rate": 9.694921730667647e-05, + "loss": 0.0045, + "step": 7810 + }, + { + "epoch": 4.520231213872832, + "grad_norm": 0.0660637617111206, + "learning_rate": 9.693783249337705e-05, + "loss": 0.0042, + "step": 7820 + }, + { + "epoch": 4.526011560693641, + "grad_norm": 0.0900220274925232, + "learning_rate": 9.692642714787501e-05, + "loss": 0.0041, + "step": 7830 + }, + { + "epoch": 4.531791907514451, + "grad_norm": 0.11947119981050491, + "learning_rate": 9.691500127515945e-05, + "loss": 0.0079, + "step": 7840 + }, + { + "epoch": 4.53757225433526, + "grad_norm": 0.09257648140192032, + "learning_rate": 9.690355488022844e-05, + "loss": 0.0044, + "step": 7850 + }, + { + "epoch": 4.543352601156069, + "grad_norm": 0.10505222529172897, + "learning_rate": 9.689208796808902e-05, + "loss": 0.0041, + "step": 7860 + }, + { + "epoch": 4.5491329479768785, + "grad_norm": 0.12087495625019073, + "learning_rate": 9.68806005437572e-05, + "loss": 0.0063, + "step": 7870 + }, + { + "epoch": 4.554913294797688, + "grad_norm": 0.08448950946331024, + "learning_rate": 9.686909261225796e-05, + "loss": 0.0047, + "step": 7880 + }, + { + "epoch": 4.5606936416184976, + "grad_norm": 0.16504046320915222, + "learning_rate": 9.685756417862531e-05, + "loss": 0.0047, + "step": 7890 + }, + { + "epoch": 4.566473988439307, + "grad_norm": 0.10585096478462219, + "learning_rate": 9.684601524790212e-05, + "loss": 0.0054, + "step": 7900 + }, + { + "epoch": 4.572254335260116, + "grad_norm": 0.11554574221372604, + "learning_rate": 9.68344458251403e-05, + "loss": 0.0047, + "step": 7910 + }, + { + "epoch": 4.578034682080925, + "grad_norm": 0.09645393490791321, + "learning_rate": 9.682285591540072e-05, + "loss": 0.0036, + "step": 7920 + }, + { + "epoch": 4.583815028901734, + "grad_norm": 0.0977889746427536, + "learning_rate": 9.681124552375322e-05, + "loss": 0.0035, + "step": 7930 + }, + { + "epoch": 4.589595375722544, + "grad_norm": 0.08475830405950546, + "learning_rate": 9.679961465527654e-05, + "loss": 0.0039, + "step": 7940 + }, + { + "epoch": 4.595375722543353, + "grad_norm": 0.06024939566850662, + "learning_rate": 9.678796331505843e-05, + "loss": 0.0034, + "step": 7950 + }, + { + "epoch": 4.601156069364162, + "grad_norm": 0.05883932113647461, + "learning_rate": 9.677629150819558e-05, + "loss": 0.0034, + "step": 7960 + }, + { + "epoch": 4.606936416184971, + "grad_norm": 0.09573355317115784, + "learning_rate": 9.676459923979366e-05, + "loss": 0.0037, + "step": 7970 + }, + { + "epoch": 4.61271676300578, + "grad_norm": 0.06836744397878647, + "learning_rate": 9.675288651496723e-05, + "loss": 0.0037, + "step": 7980 + }, + { + "epoch": 4.618497109826589, + "grad_norm": 0.07681182771921158, + "learning_rate": 9.674115333883986e-05, + "loss": 0.0053, + "step": 7990 + }, + { + "epoch": 4.624277456647399, + "grad_norm": 0.07006292045116425, + "learning_rate": 9.672939971654402e-05, + "loss": 0.0039, + "step": 8000 + }, + { + "epoch": 4.630057803468208, + "grad_norm": 0.05592655763030052, + "learning_rate": 9.671762565322117e-05, + "loss": 0.0034, + "step": 8010 + }, + { + "epoch": 4.635838150289017, + "grad_norm": 0.1108817607164383, + "learning_rate": 9.670583115402166e-05, + "loss": 0.0043, + "step": 8020 + }, + { + "epoch": 4.641618497109826, + "grad_norm": 0.06929147988557816, + "learning_rate": 9.669401622410482e-05, + "loss": 0.0036, + "step": 8030 + }, + { + "epoch": 4.6473988439306355, + "grad_norm": 0.05909327417612076, + "learning_rate": 9.668218086863887e-05, + "loss": 0.0037, + "step": 8040 + }, + { + "epoch": 4.653179190751445, + "grad_norm": 0.06097660958766937, + "learning_rate": 9.667032509280103e-05, + "loss": 0.0047, + "step": 8050 + }, + { + "epoch": 4.658959537572255, + "grad_norm": 0.07727665454149246, + "learning_rate": 9.665844890177743e-05, + "loss": 0.0033, + "step": 8060 + }, + { + "epoch": 4.664739884393064, + "grad_norm": 0.09187764674425125, + "learning_rate": 9.664655230076309e-05, + "loss": 0.0032, + "step": 8070 + }, + { + "epoch": 4.670520231213873, + "grad_norm": 0.06976690143346786, + "learning_rate": 9.663463529496199e-05, + "loss": 0.0043, + "step": 8080 + }, + { + "epoch": 4.676300578034682, + "grad_norm": 0.08565140515565872, + "learning_rate": 9.662269788958705e-05, + "loss": 0.0033, + "step": 8090 + }, + { + "epoch": 4.682080924855491, + "grad_norm": 0.08254718780517578, + "learning_rate": 9.661074008986008e-05, + "loss": 0.0046, + "step": 8100 + }, + { + "epoch": 4.687861271676301, + "grad_norm": 0.08245521783828735, + "learning_rate": 9.659876190101184e-05, + "loss": 0.0036, + "step": 8110 + }, + { + "epoch": 4.69364161849711, + "grad_norm": 0.06963507831096649, + "learning_rate": 9.6586763328282e-05, + "loss": 0.0042, + "step": 8120 + }, + { + "epoch": 4.699421965317919, + "grad_norm": 0.06510617583990097, + "learning_rate": 9.657474437691913e-05, + "loss": 0.0034, + "step": 8130 + }, + { + "epoch": 4.705202312138728, + "grad_norm": 0.08015453070402145, + "learning_rate": 9.656270505218073e-05, + "loss": 0.0033, + "step": 8140 + }, + { + "epoch": 4.710982658959537, + "grad_norm": 0.08493061363697052, + "learning_rate": 9.655064535933321e-05, + "loss": 0.0039, + "step": 8150 + }, + { + "epoch": 4.716763005780347, + "grad_norm": 0.0690418928861618, + "learning_rate": 9.653856530365189e-05, + "loss": 0.0041, + "step": 8160 + }, + { + "epoch": 4.722543352601156, + "grad_norm": 0.06100199371576309, + "learning_rate": 9.6526464890421e-05, + "loss": 0.0033, + "step": 8170 + }, + { + "epoch": 4.728323699421965, + "grad_norm": 0.12600620090961456, + "learning_rate": 9.651434412493367e-05, + "loss": 0.004, + "step": 8180 + }, + { + "epoch": 4.734104046242774, + "grad_norm": 0.08088401705026627, + "learning_rate": 9.650220301249195e-05, + "loss": 0.0036, + "step": 8190 + }, + { + "epoch": 4.7398843930635834, + "grad_norm": 0.05150901526212692, + "learning_rate": 9.649004155840675e-05, + "loss": 0.0037, + "step": 8200 + }, + { + "epoch": 4.745664739884393, + "grad_norm": 0.08037382364273071, + "learning_rate": 9.647785976799792e-05, + "loss": 0.005, + "step": 8210 + }, + { + "epoch": 4.7514450867052025, + "grad_norm": 0.08356626331806183, + "learning_rate": 9.646565764659417e-05, + "loss": 0.0042, + "step": 8220 + }, + { + "epoch": 4.757225433526012, + "grad_norm": 0.09985048323869705, + "learning_rate": 9.645343519953314e-05, + "loss": 0.0046, + "step": 8230 + }, + { + "epoch": 4.763005780346821, + "grad_norm": 0.06431553512811661, + "learning_rate": 9.644119243216135e-05, + "loss": 0.0042, + "step": 8240 + }, + { + "epoch": 4.76878612716763, + "grad_norm": 0.07443758845329285, + "learning_rate": 9.642892934983417e-05, + "loss": 0.004, + "step": 8250 + }, + { + "epoch": 4.77456647398844, + "grad_norm": 0.16717828810214996, + "learning_rate": 9.641664595791593e-05, + "loss": 0.0045, + "step": 8260 + }, + { + "epoch": 4.780346820809249, + "grad_norm": 0.1435498297214508, + "learning_rate": 9.640434226177977e-05, + "loss": 0.0037, + "step": 8270 + }, + { + "epoch": 4.786127167630058, + "grad_norm": 0.0821315199136734, + "learning_rate": 9.639201826680777e-05, + "loss": 0.0044, + "step": 8280 + }, + { + "epoch": 4.791907514450867, + "grad_norm": 0.09831906110048294, + "learning_rate": 9.637967397839083e-05, + "loss": 0.0044, + "step": 8290 + }, + { + "epoch": 4.797687861271676, + "grad_norm": 0.06787078827619553, + "learning_rate": 9.63673094019288e-05, + "loss": 0.0041, + "step": 8300 + }, + { + "epoch": 4.803468208092486, + "grad_norm": 0.04841814562678337, + "learning_rate": 9.635492454283035e-05, + "loss": 0.0031, + "step": 8310 + }, + { + "epoch": 4.809248554913295, + "grad_norm": 0.06711428612470627, + "learning_rate": 9.634251940651302e-05, + "loss": 0.0043, + "step": 8320 + }, + { + "epoch": 4.815028901734104, + "grad_norm": 0.07030502706766129, + "learning_rate": 9.633009399840327e-05, + "loss": 0.0054, + "step": 8330 + }, + { + "epoch": 4.820809248554913, + "grad_norm": 0.07398594170808792, + "learning_rate": 9.631764832393639e-05, + "loss": 0.0044, + "step": 8340 + }, + { + "epoch": 4.826589595375722, + "grad_norm": 0.085907481610775, + "learning_rate": 9.630518238855651e-05, + "loss": 0.0067, + "step": 8350 + }, + { + "epoch": 4.832369942196532, + "grad_norm": 0.10180293768644333, + "learning_rate": 9.629269619771668e-05, + "loss": 0.0041, + "step": 8360 + }, + { + "epoch": 4.838150289017341, + "grad_norm": 0.09444575756788254, + "learning_rate": 9.628018975687878e-05, + "loss": 0.0043, + "step": 8370 + }, + { + "epoch": 4.84393063583815, + "grad_norm": 0.10237473249435425, + "learning_rate": 9.626766307151355e-05, + "loss": 0.0074, + "step": 8380 + }, + { + "epoch": 4.8497109826589595, + "grad_norm": 0.07820641994476318, + "learning_rate": 9.62551161471006e-05, + "loss": 0.0035, + "step": 8390 + }, + { + "epoch": 4.855491329479769, + "grad_norm": 0.059219833463430405, + "learning_rate": 9.624254898912837e-05, + "loss": 0.004, + "step": 8400 + }, + { + "epoch": 4.861271676300578, + "grad_norm": 0.08172249048948288, + "learning_rate": 9.622996160309414e-05, + "loss": 0.0042, + "step": 8410 + }, + { + "epoch": 4.867052023121388, + "grad_norm": 0.07609304040670395, + "learning_rate": 9.62173539945041e-05, + "loss": 0.0031, + "step": 8420 + }, + { + "epoch": 4.872832369942197, + "grad_norm": 0.07762991636991501, + "learning_rate": 9.620472616887323e-05, + "loss": 0.0037, + "step": 8430 + }, + { + "epoch": 4.878612716763006, + "grad_norm": 0.11632972210645676, + "learning_rate": 9.619207813172536e-05, + "loss": 0.0037, + "step": 8440 + }, + { + "epoch": 4.884393063583815, + "grad_norm": 0.11336377263069153, + "learning_rate": 9.617940988859318e-05, + "loss": 0.0048, + "step": 8450 + }, + { + "epoch": 4.890173410404624, + "grad_norm": 0.08250712603330612, + "learning_rate": 9.616672144501821e-05, + "loss": 0.0039, + "step": 8460 + }, + { + "epoch": 4.895953757225434, + "grad_norm": 0.10133339464664459, + "learning_rate": 9.615401280655081e-05, + "loss": 0.0035, + "step": 8470 + }, + { + "epoch": 4.901734104046243, + "grad_norm": 0.06876753270626068, + "learning_rate": 9.614128397875017e-05, + "loss": 0.0042, + "step": 8480 + }, + { + "epoch": 4.907514450867052, + "grad_norm": 0.07006234675645828, + "learning_rate": 9.612853496718429e-05, + "loss": 0.0052, + "step": 8490 + }, + { + "epoch": 4.913294797687861, + "grad_norm": 0.06721346080303192, + "learning_rate": 9.611576577743004e-05, + "loss": 0.0037, + "step": 8500 + }, + { + "epoch": 4.91907514450867, + "grad_norm": 0.0753275528550148, + "learning_rate": 9.610297641507313e-05, + "loss": 0.004, + "step": 8510 + }, + { + "epoch": 4.924855491329479, + "grad_norm": 0.0652092695236206, + "learning_rate": 9.6090166885708e-05, + "loss": 0.0037, + "step": 8520 + }, + { + "epoch": 4.930635838150289, + "grad_norm": 0.05354798212647438, + "learning_rate": 9.607733719493798e-05, + "loss": 0.0036, + "step": 8530 + }, + { + "epoch": 4.936416184971098, + "grad_norm": 0.053498897701501846, + "learning_rate": 9.606448734837527e-05, + "loss": 0.0039, + "step": 8540 + }, + { + "epoch": 4.942196531791907, + "grad_norm": 0.09556004405021667, + "learning_rate": 9.605161735164079e-05, + "loss": 0.0037, + "step": 8550 + }, + { + "epoch": 4.9479768786127165, + "grad_norm": 0.08123383671045303, + "learning_rate": 9.60387272103643e-05, + "loss": 0.0038, + "step": 8560 + }, + { + "epoch": 4.953757225433526, + "grad_norm": 0.060370512306690216, + "learning_rate": 9.60258169301844e-05, + "loss": 0.0031, + "step": 8570 + }, + { + "epoch": 4.959537572254336, + "grad_norm": 0.06456678360700607, + "learning_rate": 9.601288651674851e-05, + "loss": 0.0033, + "step": 8580 + }, + { + "epoch": 4.965317919075145, + "grad_norm": 0.07259932905435562, + "learning_rate": 9.59999359757128e-05, + "loss": 0.0031, + "step": 8590 + }, + { + "epoch": 4.971098265895954, + "grad_norm": 0.05768544226884842, + "learning_rate": 9.598696531274227e-05, + "loss": 0.0027, + "step": 8600 + }, + { + "epoch": 4.976878612716763, + "grad_norm": 0.04464501142501831, + "learning_rate": 9.597397453351076e-05, + "loss": 0.0029, + "step": 8610 + }, + { + "epoch": 4.982658959537572, + "grad_norm": 0.08422578871250153, + "learning_rate": 9.596096364370087e-05, + "loss": 0.004, + "step": 8620 + }, + { + "epoch": 4.988439306358382, + "grad_norm": 0.10428819060325623, + "learning_rate": 9.594793264900399e-05, + "loss": 0.0051, + "step": 8630 + }, + { + "epoch": 4.994219653179191, + "grad_norm": 0.0938459113240242, + "learning_rate": 9.593488155512032e-05, + "loss": 0.0042, + "step": 8640 + }, + { + "epoch": 5.0, + "grad_norm": 0.0748751163482666, + "learning_rate": 9.592181036775886e-05, + "loss": 0.0037, + "step": 8650 + }, + { + "epoch": 5.005780346820809, + "grad_norm": 0.10755519568920135, + "learning_rate": 9.590871909263741e-05, + "loss": 0.0039, + "step": 8660 + }, + { + "epoch": 5.011560693641618, + "grad_norm": 0.11138191819190979, + "learning_rate": 9.589560773548252e-05, + "loss": 0.0041, + "step": 8670 + }, + { + "epoch": 5.017341040462428, + "grad_norm": 0.13861332833766937, + "learning_rate": 9.588247630202952e-05, + "loss": 0.0042, + "step": 8680 + }, + { + "epoch": 5.023121387283237, + "grad_norm": 0.10551301389932632, + "learning_rate": 9.586932479802258e-05, + "loss": 0.0042, + "step": 8690 + }, + { + "epoch": 5.028901734104046, + "grad_norm": 0.06994461268186569, + "learning_rate": 9.585615322921462e-05, + "loss": 0.0062, + "step": 8700 + }, + { + "epoch": 5.034682080924855, + "grad_norm": 0.09327414631843567, + "learning_rate": 9.58429616013673e-05, + "loss": 0.0043, + "step": 8710 + }, + { + "epoch": 5.040462427745664, + "grad_norm": 0.0711771622300148, + "learning_rate": 9.58297499202511e-05, + "loss": 0.005, + "step": 8720 + }, + { + "epoch": 5.046242774566474, + "grad_norm": 0.10359261929988861, + "learning_rate": 9.581651819164525e-05, + "loss": 0.0042, + "step": 8730 + }, + { + "epoch": 5.0520231213872835, + "grad_norm": 0.13505209982395172, + "learning_rate": 9.580326642133777e-05, + "loss": 0.0045, + "step": 8740 + }, + { + "epoch": 5.057803468208093, + "grad_norm": 0.09726086258888245, + "learning_rate": 9.578999461512544e-05, + "loss": 0.0045, + "step": 8750 + }, + { + "epoch": 5.063583815028902, + "grad_norm": 0.10977838933467865, + "learning_rate": 9.577670277881378e-05, + "loss": 0.0069, + "step": 8760 + }, + { + "epoch": 5.069364161849711, + "grad_norm": 0.13174647092819214, + "learning_rate": 9.57633909182171e-05, + "loss": 0.0073, + "step": 8770 + }, + { + "epoch": 5.07514450867052, + "grad_norm": 0.1174435019493103, + "learning_rate": 9.575005903915847e-05, + "loss": 0.0045, + "step": 8780 + }, + { + "epoch": 5.08092485549133, + "grad_norm": 0.09674505144357681, + "learning_rate": 9.573670714746972e-05, + "loss": 0.0038, + "step": 8790 + }, + { + "epoch": 5.086705202312139, + "grad_norm": 0.10631712526082993, + "learning_rate": 9.572333524899136e-05, + "loss": 0.0045, + "step": 8800 + }, + { + "epoch": 5.092485549132948, + "grad_norm": 0.09972011297941208, + "learning_rate": 9.570994334957278e-05, + "loss": 0.0034, + "step": 8810 + }, + { + "epoch": 5.098265895953757, + "grad_norm": 0.07860255986452103, + "learning_rate": 9.569653145507201e-05, + "loss": 0.0047, + "step": 8820 + }, + { + "epoch": 5.104046242774566, + "grad_norm": 0.09654318541288376, + "learning_rate": 9.568309957135587e-05, + "loss": 0.0035, + "step": 8830 + }, + { + "epoch": 5.109826589595376, + "grad_norm": 0.08574005216360092, + "learning_rate": 9.566964770429995e-05, + "loss": 0.0053, + "step": 8840 + }, + { + "epoch": 5.115606936416185, + "grad_norm": 0.08568380773067474, + "learning_rate": 9.565617585978853e-05, + "loss": 0.0036, + "step": 8850 + }, + { + "epoch": 5.121387283236994, + "grad_norm": 0.06915133446455002, + "learning_rate": 9.564268404371466e-05, + "loss": 0.0032, + "step": 8860 + }, + { + "epoch": 5.127167630057803, + "grad_norm": 0.07265638560056686, + "learning_rate": 9.562917226198013e-05, + "loss": 0.0037, + "step": 8870 + }, + { + "epoch": 5.132947976878612, + "grad_norm": 0.06257953494787216, + "learning_rate": 9.561564052049543e-05, + "loss": 0.0039, + "step": 8880 + }, + { + "epoch": 5.138728323699422, + "grad_norm": 0.10112006962299347, + "learning_rate": 9.560208882517982e-05, + "loss": 0.0037, + "step": 8890 + }, + { + "epoch": 5.144508670520231, + "grad_norm": 0.14546966552734375, + "learning_rate": 9.558851718196127e-05, + "loss": 0.005, + "step": 8900 + }, + { + "epoch": 5.1502890173410405, + "grad_norm": 0.07257446646690369, + "learning_rate": 9.557492559677646e-05, + "loss": 0.0036, + "step": 8910 + }, + { + "epoch": 5.15606936416185, + "grad_norm": 0.10585208982229233, + "learning_rate": 9.556131407557082e-05, + "loss": 0.0043, + "step": 8920 + }, + { + "epoch": 5.161849710982659, + "grad_norm": 0.08118341118097305, + "learning_rate": 9.554768262429853e-05, + "loss": 0.0038, + "step": 8930 + }, + { + "epoch": 5.167630057803469, + "grad_norm": 0.13657018542289734, + "learning_rate": 9.553403124892239e-05, + "loss": 0.0035, + "step": 8940 + }, + { + "epoch": 5.173410404624278, + "grad_norm": 0.07841265201568604, + "learning_rate": 9.552035995541402e-05, + "loss": 0.0042, + "step": 8950 + }, + { + "epoch": 5.179190751445087, + "grad_norm": 0.08602629601955414, + "learning_rate": 9.550666874975368e-05, + "loss": 0.0035, + "step": 8960 + }, + { + "epoch": 5.184971098265896, + "grad_norm": 0.05712622031569481, + "learning_rate": 9.549295763793038e-05, + "loss": 0.0039, + "step": 8970 + }, + { + "epoch": 5.190751445086705, + "grad_norm": 0.09507585316896439, + "learning_rate": 9.547922662594183e-05, + "loss": 0.0054, + "step": 8980 + }, + { + "epoch": 5.196531791907514, + "grad_norm": 0.10527266561985016, + "learning_rate": 9.546547571979443e-05, + "loss": 0.0044, + "step": 8990 + }, + { + "epoch": 5.202312138728324, + "grad_norm": 0.07147108763456345, + "learning_rate": 9.545170492550331e-05, + "loss": 0.0033, + "step": 9000 + }, + { + "epoch": 5.208092485549133, + "grad_norm": 0.06450354307889938, + "learning_rate": 9.543791424909226e-05, + "loss": 0.0029, + "step": 9010 + }, + { + "epoch": 5.213872832369942, + "grad_norm": 0.06563185155391693, + "learning_rate": 9.542410369659382e-05, + "loss": 0.0039, + "step": 9020 + }, + { + "epoch": 5.219653179190751, + "grad_norm": 0.05482746288180351, + "learning_rate": 9.541027327404916e-05, + "loss": 0.0034, + "step": 9030 + }, + { + "epoch": 5.22543352601156, + "grad_norm": 0.06039123609662056, + "learning_rate": 9.539642298750821e-05, + "loss": 0.0033, + "step": 9040 + }, + { + "epoch": 5.23121387283237, + "grad_norm": 0.057607367634773254, + "learning_rate": 9.538255284302954e-05, + "loss": 0.0038, + "step": 9050 + }, + { + "epoch": 5.236994219653179, + "grad_norm": 0.11214864253997803, + "learning_rate": 9.536866284668043e-05, + "loss": 0.0044, + "step": 9060 + }, + { + "epoch": 5.242774566473988, + "grad_norm": 0.10493134707212448, + "learning_rate": 9.53547530045368e-05, + "loss": 0.0046, + "step": 9070 + }, + { + "epoch": 5.2485549132947975, + "grad_norm": 0.08909250795841217, + "learning_rate": 9.534082332268335e-05, + "loss": 0.0032, + "step": 9080 + }, + { + "epoch": 5.254335260115607, + "grad_norm": 0.0613614022731781, + "learning_rate": 9.532687380721334e-05, + "loss": 0.0038, + "step": 9090 + }, + { + "epoch": 5.2601156069364166, + "grad_norm": 0.0754789412021637, + "learning_rate": 9.53129044642288e-05, + "loss": 0.0027, + "step": 9100 + }, + { + "epoch": 5.265895953757226, + "grad_norm": 0.0658283680677414, + "learning_rate": 9.529891529984039e-05, + "loss": 0.003, + "step": 9110 + }, + { + "epoch": 5.271676300578035, + "grad_norm": 0.0447990819811821, + "learning_rate": 9.528490632016743e-05, + "loss": 0.0041, + "step": 9120 + }, + { + "epoch": 5.277456647398844, + "grad_norm": 0.055104684084653854, + "learning_rate": 9.52708775313379e-05, + "loss": 0.003, + "step": 9130 + }, + { + "epoch": 5.283236994219653, + "grad_norm": 0.04724518209695816, + "learning_rate": 9.525682893948853e-05, + "loss": 0.003, + "step": 9140 + }, + { + "epoch": 5.289017341040463, + "grad_norm": 0.04141925647854805, + "learning_rate": 9.524276055076463e-05, + "loss": 0.0041, + "step": 9150 + }, + { + "epoch": 5.294797687861272, + "grad_norm": 0.08228343725204468, + "learning_rate": 9.522867237132016e-05, + "loss": 0.0044, + "step": 9160 + }, + { + "epoch": 5.300578034682081, + "grad_norm": 0.10406464338302612, + "learning_rate": 9.52145644073178e-05, + "loss": 0.0049, + "step": 9170 + }, + { + "epoch": 5.30635838150289, + "grad_norm": 0.07224124670028687, + "learning_rate": 9.520043666492884e-05, + "loss": 0.0044, + "step": 9180 + }, + { + "epoch": 5.312138728323699, + "grad_norm": 0.0873929113149643, + "learning_rate": 9.518628915033323e-05, + "loss": 0.0038, + "step": 9190 + }, + { + "epoch": 5.317919075144509, + "grad_norm": 0.06959396600723267, + "learning_rate": 9.517212186971957e-05, + "loss": 0.0035, + "step": 9200 + }, + { + "epoch": 5.323699421965318, + "grad_norm": 0.04830661416053772, + "learning_rate": 9.515793482928515e-05, + "loss": 0.0037, + "step": 9210 + }, + { + "epoch": 5.329479768786127, + "grad_norm": 0.08181358128786087, + "learning_rate": 9.514372803523582e-05, + "loss": 0.0057, + "step": 9220 + }, + { + "epoch": 5.335260115606936, + "grad_norm": 0.09455884248018265, + "learning_rate": 9.512950149378613e-05, + "loss": 0.004, + "step": 9230 + }, + { + "epoch": 5.341040462427745, + "grad_norm": 0.09299562871456146, + "learning_rate": 9.511525521115925e-05, + "loss": 0.0032, + "step": 9240 + }, + { + "epoch": 5.3468208092485545, + "grad_norm": 0.0856257900595665, + "learning_rate": 9.510098919358698e-05, + "loss": 0.003, + "step": 9250 + }, + { + "epoch": 5.3526011560693645, + "grad_norm": 0.08746090531349182, + "learning_rate": 9.508670344730979e-05, + "loss": 0.0048, + "step": 9260 + }, + { + "epoch": 5.358381502890174, + "grad_norm": 0.09835511445999146, + "learning_rate": 9.507239797857672e-05, + "loss": 0.0044, + "step": 9270 + }, + { + "epoch": 5.364161849710983, + "grad_norm": 0.07579044252634048, + "learning_rate": 9.505807279364548e-05, + "loss": 0.0035, + "step": 9280 + }, + { + "epoch": 5.369942196531792, + "grad_norm": 0.06782922893762589, + "learning_rate": 9.504372789878239e-05, + "loss": 0.0029, + "step": 9290 + }, + { + "epoch": 5.375722543352601, + "grad_norm": 0.052490346133708954, + "learning_rate": 9.502936330026239e-05, + "loss": 0.0067, + "step": 9300 + }, + { + "epoch": 5.381502890173411, + "grad_norm": 0.06339240074157715, + "learning_rate": 9.501497900436906e-05, + "loss": 0.0034, + "step": 9310 + }, + { + "epoch": 5.38728323699422, + "grad_norm": 0.04801017791032791, + "learning_rate": 9.500057501739457e-05, + "loss": 0.0048, + "step": 9320 + }, + { + "epoch": 5.393063583815029, + "grad_norm": 0.04857809841632843, + "learning_rate": 9.49861513456397e-05, + "loss": 0.0027, + "step": 9330 + }, + { + "epoch": 5.398843930635838, + "grad_norm": 0.08158544450998306, + "learning_rate": 9.497170799541388e-05, + "loss": 0.0043, + "step": 9340 + }, + { + "epoch": 5.404624277456647, + "grad_norm": 0.05422453209757805, + "learning_rate": 9.495724497303508e-05, + "loss": 0.0033, + "step": 9350 + }, + { + "epoch": 5.410404624277457, + "grad_norm": 0.05921275168657303, + "learning_rate": 9.494276228482998e-05, + "loss": 0.0029, + "step": 9360 + }, + { + "epoch": 5.416184971098266, + "grad_norm": 0.13661505281925201, + "learning_rate": 9.492825993713374e-05, + "loss": 0.0035, + "step": 9370 + }, + { + "epoch": 5.421965317919075, + "grad_norm": 0.08016598224639893, + "learning_rate": 9.491373793629023e-05, + "loss": 0.0032, + "step": 9380 + }, + { + "epoch": 5.427745664739884, + "grad_norm": 0.10663073509931564, + "learning_rate": 9.489919628865182e-05, + "loss": 0.005, + "step": 9390 + }, + { + "epoch": 5.433526011560693, + "grad_norm": 0.06865248829126358, + "learning_rate": 9.488463500057955e-05, + "loss": 0.0065, + "step": 9400 + }, + { + "epoch": 5.4393063583815024, + "grad_norm": 0.10003788024187088, + "learning_rate": 9.487005407844302e-05, + "loss": 0.0036, + "step": 9410 + }, + { + "epoch": 5.445086705202312, + "grad_norm": 0.08406813442707062, + "learning_rate": 9.485545352862039e-05, + "loss": 0.0043, + "step": 9420 + }, + { + "epoch": 5.4508670520231215, + "grad_norm": 0.07259850949048996, + "learning_rate": 9.48408333574985e-05, + "loss": 0.0034, + "step": 9430 + }, + { + "epoch": 5.456647398843931, + "grad_norm": 0.04301713407039642, + "learning_rate": 9.482619357147264e-05, + "loss": 0.003, + "step": 9440 + }, + { + "epoch": 5.46242774566474, + "grad_norm": 0.052886392921209335, + "learning_rate": 9.481153417694679e-05, + "loss": 0.0033, + "step": 9450 + }, + { + "epoch": 5.468208092485549, + "grad_norm": 0.06675713509321213, + "learning_rate": 9.479685518033347e-05, + "loss": 0.0031, + "step": 9460 + }, + { + "epoch": 5.473988439306359, + "grad_norm": 0.06410433351993561, + "learning_rate": 9.478215658805375e-05, + "loss": 0.0037, + "step": 9470 + }, + { + "epoch": 5.479768786127168, + "grad_norm": 0.07671128213405609, + "learning_rate": 9.47674384065373e-05, + "loss": 0.0038, + "step": 9480 + }, + { + "epoch": 5.485549132947977, + "grad_norm": 0.06870589405298233, + "learning_rate": 9.475270064222237e-05, + "loss": 0.0034, + "step": 9490 + }, + { + "epoch": 5.491329479768786, + "grad_norm": 0.060709405690431595, + "learning_rate": 9.473794330155572e-05, + "loss": 0.0039, + "step": 9500 + }, + { + "epoch": 5.497109826589595, + "grad_norm": 0.08371403068304062, + "learning_rate": 9.472316639099276e-05, + "loss": 0.004, + "step": 9510 + }, + { + "epoch": 5.502890173410405, + "grad_norm": 0.08837935328483582, + "learning_rate": 9.470836991699739e-05, + "loss": 0.0044, + "step": 9520 + }, + { + "epoch": 5.508670520231214, + "grad_norm": 0.06310413032770157, + "learning_rate": 9.469355388604208e-05, + "loss": 0.0053, + "step": 9530 + }, + { + "epoch": 5.514450867052023, + "grad_norm": 0.08625809848308563, + "learning_rate": 9.467871830460787e-05, + "loss": 0.0047, + "step": 9540 + }, + { + "epoch": 5.520231213872832, + "grad_norm": 0.04534368962049484, + "learning_rate": 9.466386317918436e-05, + "loss": 0.003, + "step": 9550 + }, + { + "epoch": 5.526011560693641, + "grad_norm": 0.10432472079992294, + "learning_rate": 9.464898851626969e-05, + "loss": 0.0033, + "step": 9560 + }, + { + "epoch": 5.531791907514451, + "grad_norm": 0.04911819100379944, + "learning_rate": 9.463409432237051e-05, + "loss": 0.0035, + "step": 9570 + }, + { + "epoch": 5.53757225433526, + "grad_norm": 0.0693276897072792, + "learning_rate": 9.461918060400209e-05, + "loss": 0.0049, + "step": 9580 + }, + { + "epoch": 5.543352601156069, + "grad_norm": 0.08006289601325989, + "learning_rate": 9.460424736768816e-05, + "loss": 0.0043, + "step": 9590 + }, + { + "epoch": 5.5491329479768785, + "grad_norm": 0.06762324273586273, + "learning_rate": 9.458929461996105e-05, + "loss": 0.0038, + "step": 9600 + }, + { + "epoch": 5.554913294797688, + "grad_norm": 0.05108136311173439, + "learning_rate": 9.457432236736158e-05, + "loss": 0.003, + "step": 9610 + }, + { + "epoch": 5.5606936416184976, + "grad_norm": 0.0543186254799366, + "learning_rate": 9.455933061643916e-05, + "loss": 0.0035, + "step": 9620 + }, + { + "epoch": 5.566473988439307, + "grad_norm": 0.08099451661109924, + "learning_rate": 9.454431937375164e-05, + "loss": 0.0041, + "step": 9630 + }, + { + "epoch": 5.572254335260116, + "grad_norm": 0.06708111613988876, + "learning_rate": 9.452928864586547e-05, + "loss": 0.0031, + "step": 9640 + }, + { + "epoch": 5.578034682080925, + "grad_norm": 0.06497319042682648, + "learning_rate": 9.451423843935563e-05, + "loss": 0.0035, + "step": 9650 + }, + { + "epoch": 5.583815028901734, + "grad_norm": 0.07020888477563858, + "learning_rate": 9.449916876080553e-05, + "loss": 0.0032, + "step": 9660 + }, + { + "epoch": 5.589595375722544, + "grad_norm": 0.07956274598836899, + "learning_rate": 9.44840796168072e-05, + "loss": 0.003, + "step": 9670 + }, + { + "epoch": 5.595375722543353, + "grad_norm": 0.08269158750772476, + "learning_rate": 9.446897101396115e-05, + "loss": 0.004, + "step": 9680 + }, + { + "epoch": 5.601156069364162, + "grad_norm": 0.11986695975065231, + "learning_rate": 9.445384295887638e-05, + "loss": 0.0036, + "step": 9690 + }, + { + "epoch": 5.606936416184971, + "grad_norm": 0.06522428244352341, + "learning_rate": 9.443869545817043e-05, + "loss": 0.0038, + "step": 9700 + }, + { + "epoch": 5.61271676300578, + "grad_norm": 0.09290037304162979, + "learning_rate": 9.442352851846929e-05, + "loss": 0.0033, + "step": 9710 + }, + { + "epoch": 5.618497109826589, + "grad_norm": 0.07107914239168167, + "learning_rate": 9.440834214640755e-05, + "loss": 0.004, + "step": 9720 + }, + { + "epoch": 5.624277456647399, + "grad_norm": 0.08233386278152466, + "learning_rate": 9.439313634862823e-05, + "loss": 0.0045, + "step": 9730 + }, + { + "epoch": 5.630057803468208, + "grad_norm": 0.09773562103509903, + "learning_rate": 9.437791113178282e-05, + "loss": 0.0048, + "step": 9740 + }, + { + "epoch": 5.635838150289017, + "grad_norm": 0.08826876431703568, + "learning_rate": 9.43626665025314e-05, + "loss": 0.0051, + "step": 9750 + }, + { + "epoch": 5.641618497109826, + "grad_norm": 0.08154237270355225, + "learning_rate": 9.434740246754248e-05, + "loss": 0.0035, + "step": 9760 + }, + { + "epoch": 5.6473988439306355, + "grad_norm": 0.0881495401263237, + "learning_rate": 9.433211903349304e-05, + "loss": 0.0046, + "step": 9770 + }, + { + "epoch": 5.653179190751445, + "grad_norm": 0.09400784969329834, + "learning_rate": 9.431681620706858e-05, + "loss": 0.0034, + "step": 9780 + }, + { + "epoch": 5.658959537572255, + "grad_norm": 0.1164061427116394, + "learning_rate": 9.43014939949631e-05, + "loss": 0.0042, + "step": 9790 + }, + { + "epoch": 5.664739884393064, + "grad_norm": 0.0716787651181221, + "learning_rate": 9.428615240387904e-05, + "loss": 0.0034, + "step": 9800 + }, + { + "epoch": 5.670520231213873, + "grad_norm": 0.06646711379289627, + "learning_rate": 9.427079144052732e-05, + "loss": 0.0034, + "step": 9810 + }, + { + "epoch": 5.676300578034682, + "grad_norm": 0.07756868004798889, + "learning_rate": 9.425541111162739e-05, + "loss": 0.0036, + "step": 9820 + }, + { + "epoch": 5.682080924855491, + "grad_norm": 0.11053356528282166, + "learning_rate": 9.424001142390709e-05, + "loss": 0.0038, + "step": 9830 + }, + { + "epoch": 5.687861271676301, + "grad_norm": 0.09140120446681976, + "learning_rate": 9.422459238410277e-05, + "loss": 0.0039, + "step": 9840 + }, + { + "epoch": 5.69364161849711, + "grad_norm": 0.06871363520622253, + "learning_rate": 9.420915399895926e-05, + "loss": 0.006, + "step": 9850 + }, + { + "epoch": 5.699421965317919, + "grad_norm": 0.07478412240743637, + "learning_rate": 9.419369627522981e-05, + "loss": 0.0037, + "step": 9860 + }, + { + "epoch": 5.705202312138728, + "grad_norm": 0.045829880982637405, + "learning_rate": 9.417821921967618e-05, + "loss": 0.0026, + "step": 9870 + }, + { + "epoch": 5.710982658959537, + "grad_norm": 0.050987813621759415, + "learning_rate": 9.416272283906855e-05, + "loss": 0.0035, + "step": 9880 + }, + { + "epoch": 5.716763005780347, + "grad_norm": 0.0743798017501831, + "learning_rate": 9.414720714018554e-05, + "loss": 0.0043, + "step": 9890 + }, + { + "epoch": 5.722543352601156, + "grad_norm": 0.07256080955266953, + "learning_rate": 9.413167212981427e-05, + "loss": 0.0036, + "step": 9900 + }, + { + "epoch": 5.728323699421965, + "grad_norm": 0.06769303977489471, + "learning_rate": 9.41161178147503e-05, + "loss": 0.0064, + "step": 9910 + }, + { + "epoch": 5.734104046242774, + "grad_norm": 0.08060161024332047, + "learning_rate": 9.410054420179755e-05, + "loss": 0.0041, + "step": 9920 + }, + { + "epoch": 5.7398843930635834, + "grad_norm": 0.10014849156141281, + "learning_rate": 9.408495129776851e-05, + "loss": 0.0036, + "step": 9930 + }, + { + "epoch": 5.745664739884393, + "grad_norm": 0.06562087684869766, + "learning_rate": 9.406933910948403e-05, + "loss": 0.003, + "step": 9940 + }, + { + "epoch": 5.7514450867052025, + "grad_norm": 0.12746410071849823, + "learning_rate": 9.40537076437734e-05, + "loss": 0.0057, + "step": 9950 + }, + { + "epoch": 5.757225433526012, + "grad_norm": 0.1049005389213562, + "learning_rate": 9.403805690747436e-05, + "loss": 0.0037, + "step": 9960 + }, + { + "epoch": 5.763005780346821, + "grad_norm": 0.08599194884300232, + "learning_rate": 9.402238690743308e-05, + "loss": 0.0047, + "step": 9970 + }, + { + "epoch": 5.76878612716763, + "grad_norm": 0.11726978421211243, + "learning_rate": 9.400669765050413e-05, + "loss": 0.0043, + "step": 9980 + }, + { + "epoch": 5.77456647398844, + "grad_norm": 0.09003078192472458, + "learning_rate": 9.399098914355055e-05, + "loss": 0.0036, + "step": 9990 + }, + { + "epoch": 5.780346820809249, + "grad_norm": 0.05620354786515236, + "learning_rate": 9.397526139344378e-05, + "loss": 0.0033, + "step": 10000 + }, + { + "epoch": 5.786127167630058, + "grad_norm": 0.05252154916524887, + "learning_rate": 9.395951440706362e-05, + "loss": 0.003, + "step": 10010 + }, + { + "epoch": 5.791907514450867, + "grad_norm": 0.041756268590688705, + "learning_rate": 9.394374819129839e-05, + "loss": 0.0049, + "step": 10020 + }, + { + "epoch": 5.797687861271676, + "grad_norm": 0.09076819568872452, + "learning_rate": 9.392796275304474e-05, + "loss": 0.0036, + "step": 10030 + }, + { + "epoch": 5.803468208092486, + "grad_norm": 0.0802321806550026, + "learning_rate": 9.391215809920778e-05, + "loss": 0.0045, + "step": 10040 + }, + { + "epoch": 5.809248554913295, + "grad_norm": 0.07671747356653214, + "learning_rate": 9.389633423670099e-05, + "loss": 0.0036, + "step": 10050 + }, + { + "epoch": 5.815028901734104, + "grad_norm": 0.056961119174957275, + "learning_rate": 9.388049117244626e-05, + "loss": 0.003, + "step": 10060 + }, + { + "epoch": 5.820809248554913, + "grad_norm": 0.059280671179294586, + "learning_rate": 9.386462891337389e-05, + "loss": 0.003, + "step": 10070 + }, + { + "epoch": 5.826589595375722, + "grad_norm": 0.0686354786157608, + "learning_rate": 9.384874746642257e-05, + "loss": 0.003, + "step": 10080 + }, + { + "epoch": 5.832369942196532, + "grad_norm": 0.06178855523467064, + "learning_rate": 9.383284683853937e-05, + "loss": 0.0029, + "step": 10090 + }, + { + "epoch": 5.838150289017341, + "grad_norm": 0.14005982875823975, + "learning_rate": 9.381692703667981e-05, + "loss": 0.0057, + "step": 10100 + }, + { + "epoch": 5.84393063583815, + "grad_norm": 0.1773129403591156, + "learning_rate": 9.380098806780771e-05, + "loss": 0.0055, + "step": 10110 + }, + { + "epoch": 5.8497109826589595, + "grad_norm": 0.1098577231168747, + "learning_rate": 9.378502993889533e-05, + "loss": 0.0047, + "step": 10120 + }, + { + "epoch": 5.855491329479769, + "grad_norm": 0.10691174119710922, + "learning_rate": 9.376905265692329e-05, + "loss": 0.0047, + "step": 10130 + }, + { + "epoch": 5.861271676300578, + "grad_norm": 0.0895080491900444, + "learning_rate": 9.37530562288806e-05, + "loss": 0.0033, + "step": 10140 + }, + { + "epoch": 5.867052023121388, + "grad_norm": 0.12388379126787186, + "learning_rate": 9.373704066176465e-05, + "loss": 0.0038, + "step": 10150 + }, + { + "epoch": 5.872832369942197, + "grad_norm": 0.06937913596630096, + "learning_rate": 9.372100596258118e-05, + "loss": 0.0033, + "step": 10160 + }, + { + "epoch": 5.878612716763006, + "grad_norm": 0.07229018956422806, + "learning_rate": 9.370495213834433e-05, + "loss": 0.0036, + "step": 10170 + }, + { + "epoch": 5.884393063583815, + "grad_norm": 0.054935961961746216, + "learning_rate": 9.368887919607657e-05, + "loss": 0.0038, + "step": 10180 + }, + { + "epoch": 5.890173410404624, + "grad_norm": 0.053139057010412216, + "learning_rate": 9.367278714280876e-05, + "loss": 0.0035, + "step": 10190 + }, + { + "epoch": 5.895953757225434, + "grad_norm": 0.04845484718680382, + "learning_rate": 9.36566759855801e-05, + "loss": 0.0034, + "step": 10200 + }, + { + "epoch": 5.901734104046243, + "grad_norm": 0.05633014440536499, + "learning_rate": 9.36405457314382e-05, + "loss": 0.0033, + "step": 10210 + }, + { + "epoch": 5.907514450867052, + "grad_norm": 0.04651571810245514, + "learning_rate": 9.36243963874389e-05, + "loss": 0.0037, + "step": 10220 + }, + { + "epoch": 5.913294797687861, + "grad_norm": 0.08233683556318283, + "learning_rate": 9.360822796064655e-05, + "loss": 0.0029, + "step": 10230 + }, + { + "epoch": 5.91907514450867, + "grad_norm": 0.05699615180492401, + "learning_rate": 9.359204045813372e-05, + "loss": 0.0032, + "step": 10240 + }, + { + "epoch": 5.924855491329479, + "grad_norm": 0.05122595652937889, + "learning_rate": 9.357583388698141e-05, + "loss": 0.0037, + "step": 10250 + }, + { + "epoch": 5.930635838150289, + "grad_norm": 0.04474084824323654, + "learning_rate": 9.35596082542789e-05, + "loss": 0.0029, + "step": 10260 + }, + { + "epoch": 5.936416184971098, + "grad_norm": 0.04808139428496361, + "learning_rate": 9.354336356712383e-05, + "loss": 0.0033, + "step": 10270 + }, + { + "epoch": 5.942196531791907, + "grad_norm": 0.06115487590432167, + "learning_rate": 9.35270998326222e-05, + "loss": 0.0034, + "step": 10280 + }, + { + "epoch": 5.9479768786127165, + "grad_norm": 0.0701942890882492, + "learning_rate": 9.351081705788831e-05, + "loss": 0.0034, + "step": 10290 + }, + { + "epoch": 5.953757225433526, + "grad_norm": 0.06478339433670044, + "learning_rate": 9.349451525004477e-05, + "loss": 0.0037, + "step": 10300 + }, + { + "epoch": 5.959537572254336, + "grad_norm": 0.05231313034892082, + "learning_rate": 9.347819441622261e-05, + "loss": 0.0034, + "step": 10310 + }, + { + "epoch": 5.965317919075145, + "grad_norm": 0.06736232340335846, + "learning_rate": 9.346185456356105e-05, + "loss": 0.0037, + "step": 10320 + }, + { + "epoch": 5.971098265895954, + "grad_norm": 0.09078212827444077, + "learning_rate": 9.344549569920774e-05, + "loss": 0.0031, + "step": 10330 + }, + { + "epoch": 5.976878612716763, + "grad_norm": 0.0714443176984787, + "learning_rate": 9.342911783031858e-05, + "loss": 0.0034, + "step": 10340 + }, + { + "epoch": 5.982658959537572, + "grad_norm": 0.0628506988286972, + "learning_rate": 9.341272096405782e-05, + "loss": 0.0034, + "step": 10350 + }, + { + "epoch": 5.988439306358382, + "grad_norm": 0.05350936949253082, + "learning_rate": 9.3396305107598e-05, + "loss": 0.0027, + "step": 10360 + }, + { + "epoch": 5.994219653179191, + "grad_norm": 0.08104755729436874, + "learning_rate": 9.337987026811998e-05, + "loss": 0.0039, + "step": 10370 + }, + { + "epoch": 6.0, + "grad_norm": 0.06955874711275101, + "learning_rate": 9.33634164528129e-05, + "loss": 0.0035, + "step": 10380 + }, + { + "epoch": 6.005780346820809, + "grad_norm": 0.061120934784412384, + "learning_rate": 9.334694366887424e-05, + "loss": 0.0029, + "step": 10390 + }, + { + "epoch": 6.011560693641618, + "grad_norm": 0.06715985387563705, + "learning_rate": 9.333045192350973e-05, + "loss": 0.0038, + "step": 10400 + }, + { + "epoch": 6.017341040462428, + "grad_norm": 0.051006004214286804, + "learning_rate": 9.331394122393345e-05, + "loss": 0.0036, + "step": 10410 + }, + { + "epoch": 6.023121387283237, + "grad_norm": 0.07410971075296402, + "learning_rate": 9.329741157736771e-05, + "loss": 0.0054, + "step": 10420 + }, + { + "epoch": 6.028901734104046, + "grad_norm": 0.04755874350667, + "learning_rate": 9.328086299104317e-05, + "loss": 0.0038, + "step": 10430 + }, + { + "epoch": 6.034682080924855, + "grad_norm": 0.08780265599489212, + "learning_rate": 9.326429547219872e-05, + "loss": 0.006, + "step": 10440 + }, + { + "epoch": 6.040462427745664, + "grad_norm": 0.05304821580648422, + "learning_rate": 9.324770902808155e-05, + "loss": 0.0032, + "step": 10450 + }, + { + "epoch": 6.046242774566474, + "grad_norm": 0.10874399542808533, + "learning_rate": 9.323110366594717e-05, + "loss": 0.0046, + "step": 10460 + }, + { + "epoch": 6.0520231213872835, + "grad_norm": 0.08250004798173904, + "learning_rate": 9.32144793930593e-05, + "loss": 0.0033, + "step": 10470 + }, + { + "epoch": 6.057803468208093, + "grad_norm": 0.0632195845246315, + "learning_rate": 9.319783621668996e-05, + "loss": 0.0028, + "step": 10480 + }, + { + "epoch": 6.063583815028902, + "grad_norm": 0.07797906547784805, + "learning_rate": 9.318117414411947e-05, + "loss": 0.0034, + "step": 10490 + }, + { + "epoch": 6.069364161849711, + "grad_norm": 0.0703006312251091, + "learning_rate": 9.316449318263635e-05, + "loss": 0.0043, + "step": 10500 + }, + { + "epoch": 6.07514450867052, + "grad_norm": 0.09008142352104187, + "learning_rate": 9.314779333953744e-05, + "loss": 0.0037, + "step": 10510 + }, + { + "epoch": 6.08092485549133, + "grad_norm": 0.06668701767921448, + "learning_rate": 9.313107462212781e-05, + "loss": 0.0029, + "step": 10520 + }, + { + "epoch": 6.086705202312139, + "grad_norm": 0.07460634410381317, + "learning_rate": 9.311433703772082e-05, + "loss": 0.0034, + "step": 10530 + }, + { + "epoch": 6.092485549132948, + "grad_norm": 0.05611598119139671, + "learning_rate": 9.3097580593638e-05, + "loss": 0.0035, + "step": 10540 + }, + { + "epoch": 6.098265895953757, + "grad_norm": 0.03957807272672653, + "learning_rate": 9.308080529720926e-05, + "loss": 0.0036, + "step": 10550 + }, + { + "epoch": 6.104046242774566, + "grad_norm": 0.07462923973798752, + "learning_rate": 9.306401115577264e-05, + "loss": 0.0033, + "step": 10560 + }, + { + "epoch": 6.109826589595376, + "grad_norm": 0.06398440897464752, + "learning_rate": 9.304719817667447e-05, + "loss": 0.0032, + "step": 10570 + }, + { + "epoch": 6.115606936416185, + "grad_norm": 0.07337431609630585, + "learning_rate": 9.303036636726934e-05, + "loss": 0.0039, + "step": 10580 + }, + { + "epoch": 6.121387283236994, + "grad_norm": 0.06975319236516953, + "learning_rate": 9.301351573492003e-05, + "loss": 0.004, + "step": 10590 + }, + { + "epoch": 6.127167630057803, + "grad_norm": 0.09302632510662079, + "learning_rate": 9.299664628699758e-05, + "loss": 0.0079, + "step": 10600 + }, + { + "epoch": 6.132947976878612, + "grad_norm": 0.10387637466192245, + "learning_rate": 9.297975803088129e-05, + "loss": 0.0059, + "step": 10610 + }, + { + "epoch": 6.138728323699422, + "grad_norm": 0.1043407991528511, + "learning_rate": 9.296285097395864e-05, + "loss": 0.0041, + "step": 10620 + }, + { + "epoch": 6.144508670520231, + "grad_norm": 0.1208314448595047, + "learning_rate": 9.294592512362533e-05, + "loss": 0.0075, + "step": 10630 + }, + { + "epoch": 6.1502890173410405, + "grad_norm": 0.10845641791820526, + "learning_rate": 9.292898048728533e-05, + "loss": 0.0057, + "step": 10640 + }, + { + "epoch": 6.15606936416185, + "grad_norm": 0.10029802471399307, + "learning_rate": 9.29120170723508e-05, + "loss": 0.0036, + "step": 10650 + }, + { + "epoch": 6.161849710982659, + "grad_norm": 0.07804198563098907, + "learning_rate": 9.28950348862421e-05, + "loss": 0.0042, + "step": 10660 + }, + { + "epoch": 6.167630057803469, + "grad_norm": 0.13739337027072906, + "learning_rate": 9.287803393638781e-05, + "loss": 0.0049, + "step": 10670 + }, + { + "epoch": 6.173410404624278, + "grad_norm": 0.14106610417366028, + "learning_rate": 9.286101423022474e-05, + "loss": 0.0045, + "step": 10680 + }, + { + "epoch": 6.179190751445087, + "grad_norm": 0.15256516635417938, + "learning_rate": 9.284397577519788e-05, + "loss": 0.0042, + "step": 10690 + }, + { + "epoch": 6.184971098265896, + "grad_norm": 0.09890145808458328, + "learning_rate": 9.282691857876043e-05, + "loss": 0.0042, + "step": 10700 + }, + { + "epoch": 6.190751445086705, + "grad_norm": 0.08473809063434601, + "learning_rate": 9.280984264837377e-05, + "loss": 0.0036, + "step": 10710 + }, + { + "epoch": 6.196531791907514, + "grad_norm": 0.0743294209241867, + "learning_rate": 9.279274799150752e-05, + "loss": 0.0029, + "step": 10720 + }, + { + "epoch": 6.202312138728324, + "grad_norm": 0.05356019362807274, + "learning_rate": 9.277563461563945e-05, + "loss": 0.0027, + "step": 10730 + }, + { + "epoch": 6.208092485549133, + "grad_norm": 0.07146003097295761, + "learning_rate": 9.275850252825555e-05, + "loss": 0.0036, + "step": 10740 + }, + { + "epoch": 6.213872832369942, + "grad_norm": 0.06811804324388504, + "learning_rate": 9.274135173684994e-05, + "loss": 0.0035, + "step": 10750 + }, + { + "epoch": 6.219653179190751, + "grad_norm": 0.06242654472589493, + "learning_rate": 9.272418224892498e-05, + "loss": 0.0026, + "step": 10760 + }, + { + "epoch": 6.22543352601156, + "grad_norm": 0.08808053284883499, + "learning_rate": 9.27069940719912e-05, + "loss": 0.0036, + "step": 10770 + }, + { + "epoch": 6.23121387283237, + "grad_norm": 0.04891262948513031, + "learning_rate": 9.268978721356727e-05, + "loss": 0.0028, + "step": 10780 + }, + { + "epoch": 6.236994219653179, + "grad_norm": 0.042387206107378006, + "learning_rate": 9.267256168118008e-05, + "loss": 0.0028, + "step": 10790 + }, + { + "epoch": 6.242774566473988, + "grad_norm": 0.06473665684461594, + "learning_rate": 9.265531748236463e-05, + "loss": 0.0031, + "step": 10800 + }, + { + "epoch": 6.2485549132947975, + "grad_norm": 0.04370260611176491, + "learning_rate": 9.263805462466416e-05, + "loss": 0.0037, + "step": 10810 + }, + { + "epoch": 6.254335260115607, + "grad_norm": 0.034692343324422836, + "learning_rate": 9.262077311562998e-05, + "loss": 0.0026, + "step": 10820 + }, + { + "epoch": 6.2601156069364166, + "grad_norm": 0.06662782281637192, + "learning_rate": 9.260347296282165e-05, + "loss": 0.0038, + "step": 10830 + }, + { + "epoch": 6.265895953757226, + "grad_norm": 0.05651827156543732, + "learning_rate": 9.258615417380683e-05, + "loss": 0.0028, + "step": 10840 + }, + { + "epoch": 6.271676300578035, + "grad_norm": 0.07076770067214966, + "learning_rate": 9.256881675616133e-05, + "loss": 0.0036, + "step": 10850 + }, + { + "epoch": 6.277456647398844, + "grad_norm": 0.06414354592561722, + "learning_rate": 9.255146071746917e-05, + "loss": 0.0036, + "step": 10860 + }, + { + "epoch": 6.283236994219653, + "grad_norm": 0.06279256939888, + "learning_rate": 9.253408606532241e-05, + "loss": 0.0033, + "step": 10870 + }, + { + "epoch": 6.289017341040463, + "grad_norm": 0.07311423867940903, + "learning_rate": 9.251669280732137e-05, + "loss": 0.0032, + "step": 10880 + }, + { + "epoch": 6.294797687861272, + "grad_norm": 0.07605946063995361, + "learning_rate": 9.249928095107441e-05, + "loss": 0.0034, + "step": 10890 + }, + { + "epoch": 6.300578034682081, + "grad_norm": 0.05828089267015457, + "learning_rate": 9.248185050419811e-05, + "loss": 0.0033, + "step": 10900 + }, + { + "epoch": 6.30635838150289, + "grad_norm": 0.0422709695994854, + "learning_rate": 9.24644014743171e-05, + "loss": 0.004, + "step": 10910 + }, + { + "epoch": 6.312138728323699, + "grad_norm": 0.08373105525970459, + "learning_rate": 9.24469338690642e-05, + "loss": 0.0034, + "step": 10920 + }, + { + "epoch": 6.317919075144509, + "grad_norm": 0.04995943605899811, + "learning_rate": 9.242944769608033e-05, + "loss": 0.0033, + "step": 10930 + }, + { + "epoch": 6.323699421965318, + "grad_norm": 0.06954579800367355, + "learning_rate": 9.241194296301454e-05, + "loss": 0.003, + "step": 10940 + }, + { + "epoch": 6.329479768786127, + "grad_norm": 0.0585656501352787, + "learning_rate": 9.239441967752397e-05, + "loss": 0.0051, + "step": 10950 + }, + { + "epoch": 6.335260115606936, + "grad_norm": 0.05706046521663666, + "learning_rate": 9.237687784727393e-05, + "loss": 0.0033, + "step": 10960 + }, + { + "epoch": 6.341040462427745, + "grad_norm": 0.06931231170892715, + "learning_rate": 9.235931747993781e-05, + "loss": 0.0039, + "step": 10970 + }, + { + "epoch": 6.3468208092485545, + "grad_norm": 0.0789090096950531, + "learning_rate": 9.234173858319707e-05, + "loss": 0.0033, + "step": 10980 + }, + { + "epoch": 6.3526011560693645, + "grad_norm": 0.08950246125459671, + "learning_rate": 9.23241411647414e-05, + "loss": 0.0031, + "step": 10990 + }, + { + "epoch": 6.358381502890174, + "grad_norm": 0.04269541800022125, + "learning_rate": 9.230652523226841e-05, + "loss": 0.0028, + "step": 11000 + }, + { + "epoch": 6.364161849710983, + "grad_norm": 0.08877093344926834, + "learning_rate": 9.2288890793484e-05, + "loss": 0.0036, + "step": 11010 + }, + { + "epoch": 6.369942196531792, + "grad_norm": 0.11143888533115387, + "learning_rate": 9.227123785610199e-05, + "loss": 0.0043, + "step": 11020 + }, + { + "epoch": 6.375722543352601, + "grad_norm": 0.09317649900913239, + "learning_rate": 9.225356642784443e-05, + "loss": 0.0061, + "step": 11030 + }, + { + "epoch": 6.381502890173411, + "grad_norm": 0.09370527416467667, + "learning_rate": 9.223587651644139e-05, + "loss": 0.0039, + "step": 11040 + }, + { + "epoch": 6.38728323699422, + "grad_norm": 0.10189167410135269, + "learning_rate": 9.221816812963104e-05, + "loss": 0.0038, + "step": 11050 + }, + { + "epoch": 6.393063583815029, + "grad_norm": 0.14939944446086884, + "learning_rate": 9.22004412751596e-05, + "loss": 0.0034, + "step": 11060 + }, + { + "epoch": 6.398843930635838, + "grad_norm": 0.09085329622030258, + "learning_rate": 9.218269596078146e-05, + "loss": 0.0035, + "step": 11070 + }, + { + "epoch": 6.404624277456647, + "grad_norm": 0.09412657469511032, + "learning_rate": 9.216493219425895e-05, + "loss": 0.0071, + "step": 11080 + }, + { + "epoch": 6.410404624277457, + "grad_norm": 0.06841382384300232, + "learning_rate": 9.21471499833626e-05, + "loss": 0.004, + "step": 11090 + }, + { + "epoch": 6.416184971098266, + "grad_norm": 0.07971781492233276, + "learning_rate": 9.212934933587094e-05, + "loss": 0.0033, + "step": 11100 + }, + { + "epoch": 6.421965317919075, + "grad_norm": 0.07304677367210388, + "learning_rate": 9.211153025957056e-05, + "loss": 0.0034, + "step": 11110 + }, + { + "epoch": 6.427745664739884, + "grad_norm": 0.08593588322401047, + "learning_rate": 9.209369276225614e-05, + "loss": 0.0033, + "step": 11120 + }, + { + "epoch": 6.433526011560693, + "grad_norm": 0.12527289986610413, + "learning_rate": 9.207583685173042e-05, + "loss": 0.0043, + "step": 11130 + }, + { + "epoch": 6.4393063583815024, + "grad_norm": 0.10674163699150085, + "learning_rate": 9.205796253580417e-05, + "loss": 0.0031, + "step": 11140 + }, + { + "epoch": 6.445086705202312, + "grad_norm": 0.0938422828912735, + "learning_rate": 9.204006982229621e-05, + "loss": 0.0032, + "step": 11150 + }, + { + "epoch": 6.4508670520231215, + "grad_norm": 0.09737176448106766, + "learning_rate": 9.202215871903346e-05, + "loss": 0.003, + "step": 11160 + }, + { + "epoch": 6.456647398843931, + "grad_norm": 0.06158210337162018, + "learning_rate": 9.20042292338508e-05, + "loss": 0.0042, + "step": 11170 + }, + { + "epoch": 6.46242774566474, + "grad_norm": 0.08288124948740005, + "learning_rate": 9.198628137459123e-05, + "loss": 0.0032, + "step": 11180 + }, + { + "epoch": 6.468208092485549, + "grad_norm": 0.06456904113292694, + "learning_rate": 9.196831514910572e-05, + "loss": 0.004, + "step": 11190 + }, + { + "epoch": 6.473988439306359, + "grad_norm": 0.0678388774394989, + "learning_rate": 9.195033056525332e-05, + "loss": 0.0027, + "step": 11200 + }, + { + "epoch": 6.479768786127168, + "grad_norm": 0.059251368045806885, + "learning_rate": 9.193232763090114e-05, + "loss": 0.0032, + "step": 11210 + }, + { + "epoch": 6.485549132947977, + "grad_norm": 0.060957446694374084, + "learning_rate": 9.191430635392422e-05, + "loss": 0.0031, + "step": 11220 + }, + { + "epoch": 6.491329479768786, + "grad_norm": 0.07789164781570435, + "learning_rate": 9.18962667422057e-05, + "loss": 0.0034, + "step": 11230 + }, + { + "epoch": 6.497109826589595, + "grad_norm": 0.0698968842625618, + "learning_rate": 9.187820880363671e-05, + "loss": 0.0033, + "step": 11240 + }, + { + "epoch": 6.502890173410405, + "grad_norm": 0.0620306022465229, + "learning_rate": 9.186013254611643e-05, + "loss": 0.0028, + "step": 11250 + }, + { + "epoch": 6.508670520231214, + "grad_norm": 0.04711582511663437, + "learning_rate": 9.1842037977552e-05, + "loss": 0.0031, + "step": 11260 + }, + { + "epoch": 6.514450867052023, + "grad_norm": 0.045567553490400314, + "learning_rate": 9.182392510585862e-05, + "loss": 0.0028, + "step": 11270 + }, + { + "epoch": 6.520231213872832, + "grad_norm": 0.09069813787937164, + "learning_rate": 9.180579393895946e-05, + "loss": 0.0028, + "step": 11280 + }, + { + "epoch": 6.526011560693641, + "grad_norm": 0.08624447137117386, + "learning_rate": 9.178764448478572e-05, + "loss": 0.0029, + "step": 11290 + }, + { + "epoch": 6.531791907514451, + "grad_norm": 0.07226696610450745, + "learning_rate": 9.176947675127658e-05, + "loss": 0.0031, + "step": 11300 + }, + { + "epoch": 6.53757225433526, + "grad_norm": 0.07378704845905304, + "learning_rate": 9.175129074637924e-05, + "loss": 0.0069, + "step": 11310 + }, + { + "epoch": 6.543352601156069, + "grad_norm": 0.08021886646747589, + "learning_rate": 9.173308647804884e-05, + "loss": 0.0045, + "step": 11320 + }, + { + "epoch": 6.5491329479768785, + "grad_norm": 0.06382913887500763, + "learning_rate": 9.171486395424859e-05, + "loss": 0.0024, + "step": 11330 + }, + { + "epoch": 6.554913294797688, + "grad_norm": 0.08531121909618378, + "learning_rate": 9.16966231829496e-05, + "loss": 0.0036, + "step": 11340 + }, + { + "epoch": 6.5606936416184976, + "grad_norm": 0.0522238165140152, + "learning_rate": 9.167836417213105e-05, + "loss": 0.0033, + "step": 11350 + }, + { + "epoch": 6.566473988439307, + "grad_norm": 0.06926757097244263, + "learning_rate": 9.166008692978001e-05, + "loss": 0.0028, + "step": 11360 + }, + { + "epoch": 6.572254335260116, + "grad_norm": 0.038560591638088226, + "learning_rate": 9.164179146389158e-05, + "loss": 0.0027, + "step": 11370 + }, + { + "epoch": 6.578034682080925, + "grad_norm": 0.05434241518378258, + "learning_rate": 9.162347778246882e-05, + "loss": 0.0024, + "step": 11380 + }, + { + "epoch": 6.583815028901734, + "grad_norm": 0.053169794380664825, + "learning_rate": 9.160514589352276e-05, + "loss": 0.0034, + "step": 11390 + }, + { + "epoch": 6.589595375722544, + "grad_norm": 0.07566075026988983, + "learning_rate": 9.15867958050724e-05, + "loss": 0.003, + "step": 11400 + }, + { + "epoch": 6.595375722543353, + "grad_norm": 0.05034327879548073, + "learning_rate": 9.156842752514466e-05, + "loss": 0.0031, + "step": 11410 + }, + { + "epoch": 6.601156069364162, + "grad_norm": 0.05703079700469971, + "learning_rate": 9.155004106177447e-05, + "loss": 0.003, + "step": 11420 + }, + { + "epoch": 6.606936416184971, + "grad_norm": 0.1105022132396698, + "learning_rate": 9.153163642300471e-05, + "loss": 0.004, + "step": 11430 + }, + { + "epoch": 6.61271676300578, + "grad_norm": 0.10962982475757599, + "learning_rate": 9.151321361688616e-05, + "loss": 0.0033, + "step": 11440 + }, + { + "epoch": 6.618497109826589, + "grad_norm": 0.05263747274875641, + "learning_rate": 9.149477265147762e-05, + "loss": 0.0029, + "step": 11450 + }, + { + "epoch": 6.624277456647399, + "grad_norm": 0.09197230637073517, + "learning_rate": 9.147631353484574e-05, + "loss": 0.0038, + "step": 11460 + }, + { + "epoch": 6.630057803468208, + "grad_norm": 0.0773550271987915, + "learning_rate": 9.145783627506522e-05, + "loss": 0.0035, + "step": 11470 + }, + { + "epoch": 6.635838150289017, + "grad_norm": 0.11397737264633179, + "learning_rate": 9.143934088021861e-05, + "loss": 0.0041, + "step": 11480 + }, + { + "epoch": 6.641618497109826, + "grad_norm": 0.10318799316883087, + "learning_rate": 9.142082735839645e-05, + "loss": 0.0042, + "step": 11490 + }, + { + "epoch": 6.6473988439306355, + "grad_norm": 0.08106772601604462, + "learning_rate": 9.140229571769715e-05, + "loss": 0.0032, + "step": 11500 + }, + { + "epoch": 6.653179190751445, + "grad_norm": 0.08967715501785278, + "learning_rate": 9.138374596622709e-05, + "loss": 0.0033, + "step": 11510 + }, + { + "epoch": 6.658959537572255, + "grad_norm": 0.06483248621225357, + "learning_rate": 9.136517811210059e-05, + "loss": 0.0026, + "step": 11520 + }, + { + "epoch": 6.664739884393064, + "grad_norm": 0.042875584214925766, + "learning_rate": 9.134659216343984e-05, + "loss": 0.0034, + "step": 11530 + }, + { + "epoch": 6.670520231213873, + "grad_norm": 0.0676209107041359, + "learning_rate": 9.132798812837494e-05, + "loss": 0.0039, + "step": 11540 + }, + { + "epoch": 6.676300578034682, + "grad_norm": 0.1124272495508194, + "learning_rate": 9.130936601504396e-05, + "loss": 0.0032, + "step": 11550 + }, + { + "epoch": 6.682080924855491, + "grad_norm": 0.09340932220220566, + "learning_rate": 9.129072583159284e-05, + "loss": 0.0034, + "step": 11560 + }, + { + "epoch": 6.687861271676301, + "grad_norm": 0.10958612710237503, + "learning_rate": 9.127206758617542e-05, + "loss": 0.0034, + "step": 11570 + }, + { + "epoch": 6.69364161849711, + "grad_norm": 0.09023173898458481, + "learning_rate": 9.125339128695346e-05, + "loss": 0.0044, + "step": 11580 + }, + { + "epoch": 6.699421965317919, + "grad_norm": 0.06543544679880142, + "learning_rate": 9.123469694209659e-05, + "loss": 0.003, + "step": 11590 + }, + { + "epoch": 6.705202312138728, + "grad_norm": 0.09502831101417542, + "learning_rate": 9.121598455978239e-05, + "loss": 0.0041, + "step": 11600 + }, + { + "epoch": 6.710982658959537, + "grad_norm": 0.05420244112610817, + "learning_rate": 9.119725414819624e-05, + "loss": 0.0029, + "step": 11610 + }, + { + "epoch": 6.716763005780347, + "grad_norm": 0.09080252051353455, + "learning_rate": 9.117850571553149e-05, + "loss": 0.0029, + "step": 11620 + }, + { + "epoch": 6.722543352601156, + "grad_norm": 0.11371125280857086, + "learning_rate": 9.115973926998935e-05, + "loss": 0.0039, + "step": 11630 + }, + { + "epoch": 6.728323699421965, + "grad_norm": 0.07286228984594345, + "learning_rate": 9.114095481977888e-05, + "loss": 0.0034, + "step": 11640 + }, + { + "epoch": 6.734104046242774, + "grad_norm": 0.06555897742509842, + "learning_rate": 9.112215237311703e-05, + "loss": 0.0032, + "step": 11650 + }, + { + "epoch": 6.7398843930635834, + "grad_norm": 0.07509700953960419, + "learning_rate": 9.110333193822867e-05, + "loss": 0.0041, + "step": 11660 + }, + { + "epoch": 6.745664739884393, + "grad_norm": 0.042179595679044724, + "learning_rate": 9.108449352334645e-05, + "loss": 0.003, + "step": 11670 + }, + { + "epoch": 6.7514450867052025, + "grad_norm": 0.07306253910064697, + "learning_rate": 9.106563713671094e-05, + "loss": 0.0032, + "step": 11680 + }, + { + "epoch": 6.757225433526012, + "grad_norm": 0.04574565216898918, + "learning_rate": 9.104676278657061e-05, + "loss": 0.0028, + "step": 11690 + }, + { + "epoch": 6.763005780346821, + "grad_norm": 0.059894826263189316, + "learning_rate": 9.102787048118169e-05, + "loss": 0.0041, + "step": 11700 + }, + { + "epoch": 6.76878612716763, + "grad_norm": 0.05459150671958923, + "learning_rate": 9.100896022880834e-05, + "loss": 0.0027, + "step": 11710 + }, + { + "epoch": 6.77456647398844, + "grad_norm": 0.10843174159526825, + "learning_rate": 9.099003203772254e-05, + "loss": 0.0052, + "step": 11720 + }, + { + "epoch": 6.780346820809249, + "grad_norm": 0.10990060865879059, + "learning_rate": 9.097108591620413e-05, + "loss": 0.0046, + "step": 11730 + }, + { + "epoch": 6.786127167630058, + "grad_norm": 0.07287583500146866, + "learning_rate": 9.095212187254078e-05, + "loss": 0.0062, + "step": 11740 + }, + { + "epoch": 6.791907514450867, + "grad_norm": 0.061475008726119995, + "learning_rate": 9.093313991502801e-05, + "loss": 0.0038, + "step": 11750 + }, + { + "epoch": 6.797687861271676, + "grad_norm": 0.10422157496213913, + "learning_rate": 9.091414005196917e-05, + "loss": 0.0046, + "step": 11760 + }, + { + "epoch": 6.803468208092486, + "grad_norm": 0.10327991098165512, + "learning_rate": 9.089512229167545e-05, + "loss": 0.0034, + "step": 11770 + }, + { + "epoch": 6.809248554913295, + "grad_norm": 0.06790751963853836, + "learning_rate": 9.087608664246587e-05, + "loss": 0.0032, + "step": 11780 + }, + { + "epoch": 6.815028901734104, + "grad_norm": 0.0866834744811058, + "learning_rate": 9.085703311266727e-05, + "loss": 0.0032, + "step": 11790 + }, + { + "epoch": 6.820809248554913, + "grad_norm": 0.11491991579532623, + "learning_rate": 9.083796171061429e-05, + "loss": 0.0043, + "step": 11800 + }, + { + "epoch": 6.826589595375722, + "grad_norm": 0.09453996270895004, + "learning_rate": 9.081887244464941e-05, + "loss": 0.0043, + "step": 11810 + }, + { + "epoch": 6.832369942196532, + "grad_norm": 0.09952332824468613, + "learning_rate": 9.079976532312297e-05, + "loss": 0.0038, + "step": 11820 + }, + { + "epoch": 6.838150289017341, + "grad_norm": 0.05078454315662384, + "learning_rate": 9.078064035439301e-05, + "loss": 0.0026, + "step": 11830 + }, + { + "epoch": 6.84393063583815, + "grad_norm": 0.04266422986984253, + "learning_rate": 9.07614975468255e-05, + "loss": 0.0039, + "step": 11840 + }, + { + "epoch": 6.8497109826589595, + "grad_norm": 0.058598946779966354, + "learning_rate": 9.074233690879412e-05, + "loss": 0.003, + "step": 11850 + }, + { + "epoch": 6.855491329479769, + "grad_norm": 0.06515590101480484, + "learning_rate": 9.072315844868038e-05, + "loss": 0.0027, + "step": 11860 + }, + { + "epoch": 6.861271676300578, + "grad_norm": 0.059226710349321365, + "learning_rate": 9.07039621748736e-05, + "loss": 0.0046, + "step": 11870 + }, + { + "epoch": 6.867052023121388, + "grad_norm": 0.07862482964992523, + "learning_rate": 9.06847480957709e-05, + "loss": 0.0026, + "step": 11880 + }, + { + "epoch": 6.872832369942197, + "grad_norm": 0.11240318417549133, + "learning_rate": 9.066551621977713e-05, + "loss": 0.0029, + "step": 11890 + }, + { + "epoch": 6.878612716763006, + "grad_norm": 0.09790018200874329, + "learning_rate": 9.064626655530501e-05, + "loss": 0.0029, + "step": 11900 + }, + { + "epoch": 6.884393063583815, + "grad_norm": 0.08414393663406372, + "learning_rate": 9.062699911077497e-05, + "loss": 0.0027, + "step": 11910 + }, + { + "epoch": 6.890173410404624, + "grad_norm": 0.044531289488077164, + "learning_rate": 9.060771389461524e-05, + "loss": 0.003, + "step": 11920 + }, + { + "epoch": 6.895953757225434, + "grad_norm": 0.08173054456710815, + "learning_rate": 9.058841091526187e-05, + "loss": 0.0036, + "step": 11930 + }, + { + "epoch": 6.901734104046243, + "grad_norm": 0.07367779314517975, + "learning_rate": 9.056909018115858e-05, + "loss": 0.0037, + "step": 11940 + }, + { + "epoch": 6.907514450867052, + "grad_norm": 0.06203809008002281, + "learning_rate": 9.054975170075697e-05, + "loss": 0.0035, + "step": 11950 + }, + { + "epoch": 6.913294797687861, + "grad_norm": 0.045646168291568756, + "learning_rate": 9.053039548251631e-05, + "loss": 0.0026, + "step": 11960 + }, + { + "epoch": 6.91907514450867, + "grad_norm": 0.05977427959442139, + "learning_rate": 9.051102153490368e-05, + "loss": 0.0064, + "step": 11970 + }, + { + "epoch": 6.924855491329479, + "grad_norm": 0.09023835510015488, + "learning_rate": 9.04916298663939e-05, + "loss": 0.0037, + "step": 11980 + }, + { + "epoch": 6.930635838150289, + "grad_norm": 0.0789908617734909, + "learning_rate": 9.047222048546955e-05, + "loss": 0.0033, + "step": 11990 + }, + { + "epoch": 6.936416184971098, + "grad_norm": 0.06687037646770477, + "learning_rate": 9.045279340062097e-05, + "loss": 0.0046, + "step": 12000 + }, + { + "epoch": 6.942196531791907, + "grad_norm": 0.0676008015871048, + "learning_rate": 9.043334862034618e-05, + "loss": 0.003, + "step": 12010 + }, + { + "epoch": 6.9479768786127165, + "grad_norm": 0.08746353536844254, + "learning_rate": 9.041388615315102e-05, + "loss": 0.0032, + "step": 12020 + }, + { + "epoch": 6.953757225433526, + "grad_norm": 0.050007738173007965, + "learning_rate": 9.039440600754905e-05, + "loss": 0.0031, + "step": 12030 + }, + { + "epoch": 6.959537572254336, + "grad_norm": 0.05344380810856819, + "learning_rate": 9.037490819206151e-05, + "loss": 0.0032, + "step": 12040 + }, + { + "epoch": 6.965317919075145, + "grad_norm": 0.05119822919368744, + "learning_rate": 9.035539271521744e-05, + "loss": 0.0027, + "step": 12050 + }, + { + "epoch": 6.971098265895954, + "grad_norm": 0.057583969086408615, + "learning_rate": 9.033585958555356e-05, + "loss": 0.0028, + "step": 12060 + }, + { + "epoch": 6.976878612716763, + "grad_norm": 0.05539444088935852, + "learning_rate": 9.031630881161431e-05, + "loss": 0.004, + "step": 12070 + }, + { + "epoch": 6.982658959537572, + "grad_norm": 0.056936319917440414, + "learning_rate": 9.029674040195186e-05, + "loss": 0.0029, + "step": 12080 + }, + { + "epoch": 6.988439306358382, + "grad_norm": 0.057017434388399124, + "learning_rate": 9.027715436512613e-05, + "loss": 0.003, + "step": 12090 + }, + { + "epoch": 6.994219653179191, + "grad_norm": 0.0817522257566452, + "learning_rate": 9.02575507097047e-05, + "loss": 0.0049, + "step": 12100 + }, + { + "epoch": 7.0, + "grad_norm": 0.05931788310408592, + "learning_rate": 9.023792944426286e-05, + "loss": 0.0028, + "step": 12110 + }, + { + "epoch": 7.005780346820809, + "grad_norm": 0.04272283613681793, + "learning_rate": 9.021829057738364e-05, + "loss": 0.0029, + "step": 12120 + }, + { + "epoch": 7.011560693641618, + "grad_norm": 0.06267021596431732, + "learning_rate": 9.019863411765775e-05, + "loss": 0.0034, + "step": 12130 + }, + { + "epoch": 7.017341040462428, + "grad_norm": 0.0873156264424324, + "learning_rate": 9.017896007368357e-05, + "loss": 0.0037, + "step": 12140 + }, + { + "epoch": 7.023121387283237, + "grad_norm": 0.08396241068840027, + "learning_rate": 9.015926845406722e-05, + "loss": 0.0038, + "step": 12150 + }, + { + "epoch": 7.028901734104046, + "grad_norm": 0.10236790031194687, + "learning_rate": 9.013955926742245e-05, + "loss": 0.0031, + "step": 12160 + }, + { + "epoch": 7.034682080924855, + "grad_norm": 0.06778319180011749, + "learning_rate": 9.011983252237077e-05, + "loss": 0.0025, + "step": 12170 + }, + { + "epoch": 7.040462427745664, + "grad_norm": 0.04483620822429657, + "learning_rate": 9.01000882275413e-05, + "loss": 0.0028, + "step": 12180 + }, + { + "epoch": 7.046242774566474, + "grad_norm": 0.06018118932843208, + "learning_rate": 9.008032639157088e-05, + "loss": 0.0029, + "step": 12190 + }, + { + "epoch": 7.0520231213872835, + "grad_norm": 0.05943775549530983, + "learning_rate": 9.006054702310401e-05, + "loss": 0.0044, + "step": 12200 + }, + { + "epoch": 7.057803468208093, + "grad_norm": 0.09683380275964737, + "learning_rate": 9.004075013079283e-05, + "loss": 0.0031, + "step": 12210 + }, + { + "epoch": 7.063583815028902, + "grad_norm": 0.055581022053956985, + "learning_rate": 9.00209357232972e-05, + "loss": 0.0031, + "step": 12220 + }, + { + "epoch": 7.069364161849711, + "grad_norm": 0.08908397704362869, + "learning_rate": 9.000110380928461e-05, + "loss": 0.0046, + "step": 12230 + }, + { + "epoch": 7.07514450867052, + "grad_norm": 0.09782064706087112, + "learning_rate": 8.998125439743021e-05, + "loss": 0.0033, + "step": 12240 + }, + { + "epoch": 7.08092485549133, + "grad_norm": 0.06307903677225113, + "learning_rate": 8.996138749641682e-05, + "loss": 0.0026, + "step": 12250 + }, + { + "epoch": 7.086705202312139, + "grad_norm": 0.10771429538726807, + "learning_rate": 8.994150311493488e-05, + "loss": 0.0041, + "step": 12260 + }, + { + "epoch": 7.092485549132948, + "grad_norm": 0.07503542304039001, + "learning_rate": 8.992160126168247e-05, + "loss": 0.003, + "step": 12270 + }, + { + "epoch": 7.098265895953757, + "grad_norm": 0.06886713206768036, + "learning_rate": 8.99016819453654e-05, + "loss": 0.0031, + "step": 12280 + }, + { + "epoch": 7.104046242774566, + "grad_norm": 0.0587981753051281, + "learning_rate": 8.988174517469702e-05, + "loss": 0.0037, + "step": 12290 + }, + { + "epoch": 7.109826589595376, + "grad_norm": 0.05444381386041641, + "learning_rate": 8.986179095839835e-05, + "loss": 0.0046, + "step": 12300 + }, + { + "epoch": 7.115606936416185, + "grad_norm": 0.07300411909818649, + "learning_rate": 8.984181930519804e-05, + "loss": 0.003, + "step": 12310 + }, + { + "epoch": 7.121387283236994, + "grad_norm": 0.059057146310806274, + "learning_rate": 8.982183022383237e-05, + "loss": 0.003, + "step": 12320 + }, + { + "epoch": 7.127167630057803, + "grad_norm": 0.051803700625896454, + "learning_rate": 8.980182372304525e-05, + "loss": 0.0045, + "step": 12330 + }, + { + "epoch": 7.132947976878612, + "grad_norm": 0.06521975994110107, + "learning_rate": 8.97817998115882e-05, + "loss": 0.0031, + "step": 12340 + }, + { + "epoch": 7.138728323699422, + "grad_norm": 0.06626100838184357, + "learning_rate": 8.976175849822038e-05, + "loss": 0.0034, + "step": 12350 + }, + { + "epoch": 7.144508670520231, + "grad_norm": 0.08046285808086395, + "learning_rate": 8.97416997917085e-05, + "loss": 0.0035, + "step": 12360 + }, + { + "epoch": 7.1502890173410405, + "grad_norm": 0.07497113198041916, + "learning_rate": 8.972162370082695e-05, + "loss": 0.0029, + "step": 12370 + }, + { + "epoch": 7.15606936416185, + "grad_norm": 0.08223730325698853, + "learning_rate": 8.97015302343577e-05, + "loss": 0.0034, + "step": 12380 + }, + { + "epoch": 7.161849710982659, + "grad_norm": 0.054061759263277054, + "learning_rate": 8.968141940109027e-05, + "loss": 0.0028, + "step": 12390 + }, + { + "epoch": 7.167630057803469, + "grad_norm": 0.061602234840393066, + "learning_rate": 8.966129120982188e-05, + "loss": 0.0054, + "step": 12400 + }, + { + "epoch": 7.173410404624278, + "grad_norm": 0.08485258370637894, + "learning_rate": 8.964114566935724e-05, + "loss": 0.0036, + "step": 12410 + }, + { + "epoch": 7.179190751445087, + "grad_norm": 0.07921281456947327, + "learning_rate": 8.962098278850871e-05, + "loss": 0.0035, + "step": 12420 + }, + { + "epoch": 7.184971098265896, + "grad_norm": 0.05399646610021591, + "learning_rate": 8.960080257609622e-05, + "loss": 0.0038, + "step": 12430 + }, + { + "epoch": 7.190751445086705, + "grad_norm": 0.07379326224327087, + "learning_rate": 8.95806050409473e-05, + "loss": 0.0037, + "step": 12440 + }, + { + "epoch": 7.196531791907514, + "grad_norm": 0.09412387758493423, + "learning_rate": 8.9560390191897e-05, + "loss": 0.0032, + "step": 12450 + }, + { + "epoch": 7.202312138728324, + "grad_norm": 0.05449094995856285, + "learning_rate": 8.9540158037788e-05, + "loss": 0.0032, + "step": 12460 + }, + { + "epoch": 7.208092485549133, + "grad_norm": 0.0610949732363224, + "learning_rate": 8.951990858747054e-05, + "loss": 0.0029, + "step": 12470 + }, + { + "epoch": 7.213872832369942, + "grad_norm": 0.05628536641597748, + "learning_rate": 8.94996418498024e-05, + "loss": 0.0056, + "step": 12480 + }, + { + "epoch": 7.219653179190751, + "grad_norm": 0.06268458813428879, + "learning_rate": 8.947935783364896e-05, + "loss": 0.0053, + "step": 12490 + }, + { + "epoch": 7.22543352601156, + "grad_norm": 0.06885628402233124, + "learning_rate": 8.945905654788311e-05, + "loss": 0.0031, + "step": 12500 + }, + { + "epoch": 7.23121387283237, + "grad_norm": 0.10913290828466415, + "learning_rate": 8.943873800138535e-05, + "loss": 0.0043, + "step": 12510 + }, + { + "epoch": 7.236994219653179, + "grad_norm": 0.0913657546043396, + "learning_rate": 8.94184022030437e-05, + "loss": 0.0042, + "step": 12520 + }, + { + "epoch": 7.242774566473988, + "grad_norm": 0.08144153654575348, + "learning_rate": 8.939804916175372e-05, + "loss": 0.0027, + "step": 12530 + }, + { + "epoch": 7.2485549132947975, + "grad_norm": 0.06376820802688599, + "learning_rate": 8.93776788864185e-05, + "loss": 0.0028, + "step": 12540 + }, + { + "epoch": 7.254335260115607, + "grad_norm": 0.07514312118291855, + "learning_rate": 8.935729138594873e-05, + "loss": 0.003, + "step": 12550 + }, + { + "epoch": 7.2601156069364166, + "grad_norm": 0.0901554524898529, + "learning_rate": 8.933688666926258e-05, + "loss": 0.0038, + "step": 12560 + }, + { + "epoch": 7.265895953757226, + "grad_norm": 0.0682053342461586, + "learning_rate": 8.931646474528575e-05, + "loss": 0.0027, + "step": 12570 + }, + { + "epoch": 7.271676300578035, + "grad_norm": 0.07660944759845734, + "learning_rate": 8.929602562295151e-05, + "loss": 0.0029, + "step": 12580 + }, + { + "epoch": 7.277456647398844, + "grad_norm": 0.08285539597272873, + "learning_rate": 8.92755693112006e-05, + "loss": 0.0051, + "step": 12590 + }, + { + "epoch": 7.283236994219653, + "grad_norm": 0.05912092328071594, + "learning_rate": 8.925509581898136e-05, + "loss": 0.0028, + "step": 12600 + }, + { + "epoch": 7.289017341040463, + "grad_norm": 0.07262987643480301, + "learning_rate": 8.923460515524951e-05, + "loss": 0.0038, + "step": 12610 + }, + { + "epoch": 7.294797687861272, + "grad_norm": 0.06825876981019974, + "learning_rate": 8.921409732896842e-05, + "loss": 0.003, + "step": 12620 + }, + { + "epoch": 7.300578034682081, + "grad_norm": 0.0889398455619812, + "learning_rate": 8.919357234910887e-05, + "loss": 0.0033, + "step": 12630 + }, + { + "epoch": 7.30635838150289, + "grad_norm": 0.050018060952425, + "learning_rate": 8.917303022464923e-05, + "loss": 0.003, + "step": 12640 + }, + { + "epoch": 7.312138728323699, + "grad_norm": 0.08514773845672607, + "learning_rate": 8.915247096457531e-05, + "loss": 0.0037, + "step": 12650 + }, + { + "epoch": 7.317919075144509, + "grad_norm": 0.05785389617085457, + "learning_rate": 8.91318945778804e-05, + "loss": 0.0031, + "step": 12660 + }, + { + "epoch": 7.323699421965318, + "grad_norm": 0.06569620966911316, + "learning_rate": 8.911130107356534e-05, + "loss": 0.0068, + "step": 12670 + }, + { + "epoch": 7.329479768786127, + "grad_norm": 0.08869968354701996, + "learning_rate": 8.90906904606384e-05, + "loss": 0.0029, + "step": 12680 + }, + { + "epoch": 7.335260115606936, + "grad_norm": 0.08334045857191086, + "learning_rate": 8.90700627481154e-05, + "loss": 0.0046, + "step": 12690 + }, + { + "epoch": 7.341040462427745, + "grad_norm": 0.09049692749977112, + "learning_rate": 8.904941794501957e-05, + "loss": 0.0041, + "step": 12700 + }, + { + "epoch": 7.3468208092485545, + "grad_norm": 0.07856415212154388, + "learning_rate": 8.902875606038166e-05, + "loss": 0.0045, + "step": 12710 + }, + { + "epoch": 7.3526011560693645, + "grad_norm": 0.11439894139766693, + "learning_rate": 8.900807710323989e-05, + "loss": 0.0042, + "step": 12720 + }, + { + "epoch": 7.358381502890174, + "grad_norm": 0.08839484304189682, + "learning_rate": 8.898738108263993e-05, + "loss": 0.0042, + "step": 12730 + }, + { + "epoch": 7.364161849710983, + "grad_norm": 0.07368623465299606, + "learning_rate": 8.896666800763491e-05, + "loss": 0.003, + "step": 12740 + }, + { + "epoch": 7.369942196531792, + "grad_norm": 0.061431001871824265, + "learning_rate": 8.894593788728546e-05, + "loss": 0.0036, + "step": 12750 + }, + { + "epoch": 7.375722543352601, + "grad_norm": 0.05455136299133301, + "learning_rate": 8.892519073065961e-05, + "loss": 0.0036, + "step": 12760 + }, + { + "epoch": 7.381502890173411, + "grad_norm": 0.06318709254264832, + "learning_rate": 8.89044265468329e-05, + "loss": 0.0034, + "step": 12770 + }, + { + "epoch": 7.38728323699422, + "grad_norm": 0.0825120210647583, + "learning_rate": 8.888364534488827e-05, + "loss": 0.0039, + "step": 12780 + }, + { + "epoch": 7.393063583815029, + "grad_norm": 0.07053670287132263, + "learning_rate": 8.886284713391613e-05, + "loss": 0.0023, + "step": 12790 + }, + { + "epoch": 7.398843930635838, + "grad_norm": 0.17149056494235992, + "learning_rate": 8.884203192301431e-05, + "loss": 0.0044, + "step": 12800 + }, + { + "epoch": 7.404624277456647, + "grad_norm": 0.08933155238628387, + "learning_rate": 8.88211997212881e-05, + "loss": 0.0052, + "step": 12810 + }, + { + "epoch": 7.410404624277457, + "grad_norm": 0.11136192828416824, + "learning_rate": 8.880035053785023e-05, + "loss": 0.0049, + "step": 12820 + }, + { + "epoch": 7.416184971098266, + "grad_norm": 0.08717218786478043, + "learning_rate": 8.877948438182083e-05, + "loss": 0.0054, + "step": 12830 + }, + { + "epoch": 7.421965317919075, + "grad_norm": 0.09373895078897476, + "learning_rate": 8.875860126232745e-05, + "loss": 0.0034, + "step": 12840 + }, + { + "epoch": 7.427745664739884, + "grad_norm": 0.09812264889478683, + "learning_rate": 8.87377011885051e-05, + "loss": 0.0032, + "step": 12850 + }, + { + "epoch": 7.433526011560693, + "grad_norm": 0.07721059024333954, + "learning_rate": 8.871678416949617e-05, + "loss": 0.0034, + "step": 12860 + }, + { + "epoch": 7.4393063583815024, + "grad_norm": 0.06405416131019592, + "learning_rate": 8.869585021445046e-05, + "loss": 0.0038, + "step": 12870 + }, + { + "epoch": 7.445086705202312, + "grad_norm": 0.07349882274866104, + "learning_rate": 8.867489933252521e-05, + "loss": 0.0031, + "step": 12880 + }, + { + "epoch": 7.4508670520231215, + "grad_norm": 0.06702619791030884, + "learning_rate": 8.865393153288504e-05, + "loss": 0.0028, + "step": 12890 + }, + { + "epoch": 7.456647398843931, + "grad_norm": 0.09085283428430557, + "learning_rate": 8.8632946824702e-05, + "loss": 0.0036, + "step": 12900 + }, + { + "epoch": 7.46242774566474, + "grad_norm": 0.07159951329231262, + "learning_rate": 8.86119452171555e-05, + "loss": 0.0035, + "step": 12910 + }, + { + "epoch": 7.468208092485549, + "grad_norm": 0.051984332501888275, + "learning_rate": 8.859092671943234e-05, + "loss": 0.0025, + "step": 12920 + }, + { + "epoch": 7.473988439306359, + "grad_norm": 0.0591067373752594, + "learning_rate": 8.856989134072676e-05, + "loss": 0.0025, + "step": 12930 + }, + { + "epoch": 7.479768786127168, + "grad_norm": 0.06747888028621674, + "learning_rate": 8.85488390902403e-05, + "loss": 0.0029, + "step": 12940 + }, + { + "epoch": 7.485549132947977, + "grad_norm": 0.07091999799013138, + "learning_rate": 8.852776997718199e-05, + "loss": 0.0042, + "step": 12950 + }, + { + "epoch": 7.491329479768786, + "grad_norm": 0.06534814089536667, + "learning_rate": 8.850668401076812e-05, + "loss": 0.003, + "step": 12960 + }, + { + "epoch": 7.497109826589595, + "grad_norm": 0.07369142025709152, + "learning_rate": 8.848558120022246e-05, + "loss": 0.0034, + "step": 12970 + }, + { + "epoch": 7.502890173410405, + "grad_norm": 0.07442428171634674, + "learning_rate": 8.846446155477603e-05, + "loss": 0.0035, + "step": 12980 + }, + { + "epoch": 7.508670520231214, + "grad_norm": 0.08190198242664337, + "learning_rate": 8.844332508366735e-05, + "loss": 0.0024, + "step": 12990 + }, + { + "epoch": 7.514450867052023, + "grad_norm": 0.06545107066631317, + "learning_rate": 8.84221717961422e-05, + "loss": 0.0031, + "step": 13000 + }, + { + "epoch": 7.520231213872832, + "grad_norm": 0.0414169579744339, + "learning_rate": 8.840100170145374e-05, + "loss": 0.0026, + "step": 13010 + }, + { + "epoch": 7.526011560693641, + "grad_norm": 0.05502517521381378, + "learning_rate": 8.837981480886249e-05, + "loss": 0.0029, + "step": 13020 + }, + { + "epoch": 7.531791907514451, + "grad_norm": 0.07878468185663223, + "learning_rate": 8.835861112763633e-05, + "loss": 0.0026, + "step": 13030 + }, + { + "epoch": 7.53757225433526, + "grad_norm": 0.059847693890333176, + "learning_rate": 8.833739066705044e-05, + "loss": 0.003, + "step": 13040 + }, + { + "epoch": 7.543352601156069, + "grad_norm": 0.06614434719085693, + "learning_rate": 8.831615343638742e-05, + "loss": 0.004, + "step": 13050 + }, + { + "epoch": 7.5491329479768785, + "grad_norm": 0.0676717534661293, + "learning_rate": 8.829489944493711e-05, + "loss": 0.0026, + "step": 13060 + }, + { + "epoch": 7.554913294797688, + "grad_norm": 0.04824170470237732, + "learning_rate": 8.827362870199675e-05, + "loss": 0.0031, + "step": 13070 + }, + { + "epoch": 7.5606936416184976, + "grad_norm": 0.0658811405301094, + "learning_rate": 8.825234121687089e-05, + "loss": 0.0036, + "step": 13080 + }, + { + "epoch": 7.566473988439307, + "grad_norm": 0.04927695915102959, + "learning_rate": 8.823103699887139e-05, + "loss": 0.0024, + "step": 13090 + }, + { + "epoch": 7.572254335260116, + "grad_norm": 0.06484200805425644, + "learning_rate": 8.820971605731745e-05, + "loss": 0.0049, + "step": 13100 + }, + { + "epoch": 7.578034682080925, + "grad_norm": 0.0489911250770092, + "learning_rate": 8.818837840153556e-05, + "loss": 0.0029, + "step": 13110 + }, + { + "epoch": 7.583815028901734, + "grad_norm": 0.0724041536450386, + "learning_rate": 8.816702404085952e-05, + "loss": 0.0037, + "step": 13120 + }, + { + "epoch": 7.589595375722544, + "grad_norm": 0.06730324774980545, + "learning_rate": 8.814565298463048e-05, + "loss": 0.0034, + "step": 13130 + }, + { + "epoch": 7.595375722543353, + "grad_norm": 0.040589164942502975, + "learning_rate": 8.812426524219688e-05, + "loss": 0.0066, + "step": 13140 + }, + { + "epoch": 7.601156069364162, + "grad_norm": 0.06166701763868332, + "learning_rate": 8.81028608229144e-05, + "loss": 0.0029, + "step": 13150 + }, + { + "epoch": 7.606936416184971, + "grad_norm": 0.0673922523856163, + "learning_rate": 8.808143973614611e-05, + "loss": 0.003, + "step": 13160 + }, + { + "epoch": 7.61271676300578, + "grad_norm": 0.05457756295800209, + "learning_rate": 8.806000199126228e-05, + "loss": 0.0044, + "step": 13170 + }, + { + "epoch": 7.618497109826589, + "grad_norm": 0.09412674605846405, + "learning_rate": 8.803854759764052e-05, + "loss": 0.003, + "step": 13180 + }, + { + "epoch": 7.624277456647399, + "grad_norm": 0.10493400692939758, + "learning_rate": 8.801707656466572e-05, + "loss": 0.0046, + "step": 13190 + }, + { + "epoch": 7.630057803468208, + "grad_norm": 0.13792482018470764, + "learning_rate": 8.799558890173003e-05, + "loss": 0.0049, + "step": 13200 + }, + { + "epoch": 7.635838150289017, + "grad_norm": 0.0934421643614769, + "learning_rate": 8.79740846182329e-05, + "loss": 0.0036, + "step": 13210 + }, + { + "epoch": 7.641618497109826, + "grad_norm": 0.0708174929022789, + "learning_rate": 8.7952563723581e-05, + "loss": 0.0065, + "step": 13220 + }, + { + "epoch": 7.6473988439306355, + "grad_norm": 0.07073547691106796, + "learning_rate": 8.793102622718834e-05, + "loss": 0.0027, + "step": 13230 + }, + { + "epoch": 7.653179190751445, + "grad_norm": 0.05832689628005028, + "learning_rate": 8.790947213847613e-05, + "loss": 0.0024, + "step": 13240 + }, + { + "epoch": 7.658959537572255, + "grad_norm": 0.07920937985181808, + "learning_rate": 8.788790146687286e-05, + "loss": 0.005, + "step": 13250 + }, + { + "epoch": 7.664739884393064, + "grad_norm": 0.0813925489783287, + "learning_rate": 8.786631422181429e-05, + "loss": 0.0043, + "step": 13260 + }, + { + "epoch": 7.670520231213873, + "grad_norm": 0.13647060096263885, + "learning_rate": 8.78447104127434e-05, + "loss": 0.0042, + "step": 13270 + }, + { + "epoch": 7.676300578034682, + "grad_norm": 0.09591488540172577, + "learning_rate": 8.782309004911042e-05, + "loss": 0.0037, + "step": 13280 + }, + { + "epoch": 7.682080924855491, + "grad_norm": 0.09956178069114685, + "learning_rate": 8.780145314037286e-05, + "loss": 0.0029, + "step": 13290 + }, + { + "epoch": 7.687861271676301, + "grad_norm": 0.05980099365115166, + "learning_rate": 8.777979969599542e-05, + "loss": 0.0032, + "step": 13300 + }, + { + "epoch": 7.69364161849711, + "grad_norm": 0.09983234107494354, + "learning_rate": 8.775812972545006e-05, + "loss": 0.0047, + "step": 13310 + }, + { + "epoch": 7.699421965317919, + "grad_norm": 0.11469981074333191, + "learning_rate": 8.773644323821596e-05, + "loss": 0.0033, + "step": 13320 + }, + { + "epoch": 7.705202312138728, + "grad_norm": 0.07005388289690018, + "learning_rate": 8.771474024377953e-05, + "loss": 0.004, + "step": 13330 + }, + { + "epoch": 7.710982658959537, + "grad_norm": 0.0865354910492897, + "learning_rate": 8.769302075163438e-05, + "loss": 0.0038, + "step": 13340 + }, + { + "epoch": 7.716763005780347, + "grad_norm": 0.07273339480161667, + "learning_rate": 8.767128477128137e-05, + "loss": 0.0039, + "step": 13350 + }, + { + "epoch": 7.722543352601156, + "grad_norm": 0.06498703360557556, + "learning_rate": 8.764953231222854e-05, + "loss": 0.0033, + "step": 13360 + }, + { + "epoch": 7.728323699421965, + "grad_norm": 0.062098268419504166, + "learning_rate": 8.762776338399119e-05, + "loss": 0.0035, + "step": 13370 + }, + { + "epoch": 7.734104046242774, + "grad_norm": 0.08519493788480759, + "learning_rate": 8.760597799609176e-05, + "loss": 0.0039, + "step": 13380 + }, + { + "epoch": 7.7398843930635834, + "grad_norm": 0.05835792422294617, + "learning_rate": 8.758417615805992e-05, + "loss": 0.0037, + "step": 13390 + }, + { + "epoch": 7.745664739884393, + "grad_norm": 0.05884012207388878, + "learning_rate": 8.756235787943254e-05, + "loss": 0.003, + "step": 13400 + }, + { + "epoch": 7.7514450867052025, + "grad_norm": 0.062465496361255646, + "learning_rate": 8.754052316975367e-05, + "loss": 0.0034, + "step": 13410 + }, + { + "epoch": 7.757225433526012, + "grad_norm": 0.12052380293607712, + "learning_rate": 8.751867203857455e-05, + "loss": 0.0037, + "step": 13420 + }, + { + "epoch": 7.763005780346821, + "grad_norm": 0.11391422897577286, + "learning_rate": 8.749680449545363e-05, + "loss": 0.0045, + "step": 13430 + }, + { + "epoch": 7.76878612716763, + "grad_norm": 0.07041367888450623, + "learning_rate": 8.747492054995649e-05, + "loss": 0.0035, + "step": 13440 + }, + { + "epoch": 7.77456647398844, + "grad_norm": 0.0718047097325325, + "learning_rate": 8.745302021165595e-05, + "loss": 0.0031, + "step": 13450 + }, + { + "epoch": 7.780346820809249, + "grad_norm": 0.0681358277797699, + "learning_rate": 8.743110349013192e-05, + "loss": 0.0029, + "step": 13460 + }, + { + "epoch": 7.786127167630058, + "grad_norm": 0.0745391845703125, + "learning_rate": 8.740917039497153e-05, + "loss": 0.0032, + "step": 13470 + }, + { + "epoch": 7.791907514450867, + "grad_norm": 0.07042954862117767, + "learning_rate": 8.738722093576906e-05, + "loss": 0.0034, + "step": 13480 + }, + { + "epoch": 7.797687861271676, + "grad_norm": 0.06926736980676651, + "learning_rate": 8.736525512212597e-05, + "loss": 0.0027, + "step": 13490 + }, + { + "epoch": 7.803468208092486, + "grad_norm": 0.06005188450217247, + "learning_rate": 8.734327296365084e-05, + "loss": 0.0029, + "step": 13500 + }, + { + "epoch": 7.809248554913295, + "grad_norm": 0.04462320730090141, + "learning_rate": 8.732127446995939e-05, + "loss": 0.0029, + "step": 13510 + }, + { + "epoch": 7.815028901734104, + "grad_norm": 0.03693057596683502, + "learning_rate": 8.729925965067454e-05, + "loss": 0.0029, + "step": 13520 + }, + { + "epoch": 7.820809248554913, + "grad_norm": 0.06181991472840309, + "learning_rate": 8.72772285154263e-05, + "loss": 0.0027, + "step": 13530 + }, + { + "epoch": 7.826589595375722, + "grad_norm": 0.06471090763807297, + "learning_rate": 8.725518107385187e-05, + "loss": 0.0027, + "step": 13540 + }, + { + "epoch": 7.832369942196532, + "grad_norm": 0.0629395842552185, + "learning_rate": 8.72331173355955e-05, + "loss": 0.0024, + "step": 13550 + }, + { + "epoch": 7.838150289017341, + "grad_norm": 0.060991790145635605, + "learning_rate": 8.721103731030867e-05, + "loss": 0.0036, + "step": 13560 + }, + { + "epoch": 7.84393063583815, + "grad_norm": 0.1394331008195877, + "learning_rate": 8.718894100764989e-05, + "loss": 0.0031, + "step": 13570 + }, + { + "epoch": 7.8497109826589595, + "grad_norm": 0.0855998545885086, + "learning_rate": 8.716682843728485e-05, + "loss": 0.0034, + "step": 13580 + }, + { + "epoch": 7.855491329479769, + "grad_norm": 0.07635796070098877, + "learning_rate": 8.714469960888634e-05, + "loss": 0.0031, + "step": 13590 + }, + { + "epoch": 7.861271676300578, + "grad_norm": 0.06049419194459915, + "learning_rate": 8.712255453213427e-05, + "loss": 0.0032, + "step": 13600 + }, + { + "epoch": 7.867052023121388, + "grad_norm": 0.08044454455375671, + "learning_rate": 8.710039321671563e-05, + "loss": 0.0036, + "step": 13610 + }, + { + "epoch": 7.872832369942197, + "grad_norm": 0.07877357304096222, + "learning_rate": 8.707821567232456e-05, + "loss": 0.0027, + "step": 13620 + }, + { + "epoch": 7.878612716763006, + "grad_norm": 0.10369919240474701, + "learning_rate": 8.705602190866225e-05, + "loss": 0.004, + "step": 13630 + }, + { + "epoch": 7.884393063583815, + "grad_norm": 0.09005500376224518, + "learning_rate": 8.703381193543701e-05, + "loss": 0.0032, + "step": 13640 + }, + { + "epoch": 7.890173410404624, + "grad_norm": 0.05346602946519852, + "learning_rate": 8.701158576236423e-05, + "loss": 0.0026, + "step": 13650 + }, + { + "epoch": 7.895953757225434, + "grad_norm": 0.05992775782942772, + "learning_rate": 8.69893433991664e-05, + "loss": 0.0027, + "step": 13660 + }, + { + "epoch": 7.901734104046243, + "grad_norm": 0.07658679038286209, + "learning_rate": 8.69670848555731e-05, + "loss": 0.0026, + "step": 13670 + }, + { + "epoch": 7.907514450867052, + "grad_norm": 0.0609201118350029, + "learning_rate": 8.694481014132096e-05, + "loss": 0.004, + "step": 13680 + }, + { + "epoch": 7.913294797687861, + "grad_norm": 0.06367721408605576, + "learning_rate": 8.69225192661537e-05, + "loss": 0.003, + "step": 13690 + }, + { + "epoch": 7.91907514450867, + "grad_norm": 0.06193988025188446, + "learning_rate": 8.690021223982208e-05, + "loss": 0.0034, + "step": 13700 + }, + { + "epoch": 7.924855491329479, + "grad_norm": 0.08065782487392426, + "learning_rate": 8.687788907208398e-05, + "loss": 0.0039, + "step": 13710 + }, + { + "epoch": 7.930635838150289, + "grad_norm": 0.06705836951732635, + "learning_rate": 8.685554977270431e-05, + "loss": 0.0036, + "step": 13720 + }, + { + "epoch": 7.936416184971098, + "grad_norm": 0.08512771129608154, + "learning_rate": 8.683319435145503e-05, + "loss": 0.0032, + "step": 13730 + }, + { + "epoch": 7.942196531791907, + "grad_norm": 0.05253278464078903, + "learning_rate": 8.681082281811517e-05, + "loss": 0.0025, + "step": 13740 + }, + { + "epoch": 7.9479768786127165, + "grad_norm": 0.12167950719594955, + "learning_rate": 8.67884351824708e-05, + "loss": 0.0037, + "step": 13750 + }, + { + "epoch": 7.953757225433526, + "grad_norm": 0.11908365786075592, + "learning_rate": 8.676603145431501e-05, + "loss": 0.0051, + "step": 13760 + }, + { + "epoch": 7.959537572254336, + "grad_norm": 0.08992159366607666, + "learning_rate": 8.674361164344799e-05, + "loss": 0.003, + "step": 13770 + }, + { + "epoch": 7.965317919075145, + "grad_norm": 0.0692647248506546, + "learning_rate": 8.672117575967688e-05, + "loss": 0.0027, + "step": 13780 + }, + { + "epoch": 7.971098265895954, + "grad_norm": 0.0604802742600441, + "learning_rate": 8.669872381281595e-05, + "loss": 0.0032, + "step": 13790 + }, + { + "epoch": 7.976878612716763, + "grad_norm": 0.05162026733160019, + "learning_rate": 8.667625581268639e-05, + "loss": 0.0023, + "step": 13800 + }, + { + "epoch": 7.982658959537572, + "grad_norm": 0.04985089972615242, + "learning_rate": 8.665377176911651e-05, + "loss": 0.0024, + "step": 13810 + }, + { + "epoch": 7.988439306358382, + "grad_norm": 0.04280465096235275, + "learning_rate": 8.663127169194159e-05, + "loss": 0.0025, + "step": 13820 + }, + { + "epoch": 7.994219653179191, + "grad_norm": 0.04978175833821297, + "learning_rate": 8.660875559100389e-05, + "loss": 0.0031, + "step": 13830 + }, + { + "epoch": 8.0, + "grad_norm": 0.06000877544283867, + "learning_rate": 8.658622347615274e-05, + "loss": 0.0029, + "step": 13840 + }, + { + "epoch": 8.00578034682081, + "grad_norm": 0.07454288750886917, + "learning_rate": 8.656367535724448e-05, + "loss": 0.0025, + "step": 13850 + }, + { + "epoch": 8.011560693641618, + "grad_norm": 0.061534419655799866, + "learning_rate": 8.65411112441424e-05, + "loss": 0.0042, + "step": 13860 + }, + { + "epoch": 8.017341040462428, + "grad_norm": 0.07896114140748978, + "learning_rate": 8.651853114671679e-05, + "loss": 0.003, + "step": 13870 + }, + { + "epoch": 8.023121387283236, + "grad_norm": 0.06675609946250916, + "learning_rate": 8.649593507484499e-05, + "loss": 0.0032, + "step": 13880 + }, + { + "epoch": 8.028901734104046, + "grad_norm": 0.06271135807037354, + "learning_rate": 8.647332303841126e-05, + "loss": 0.0033, + "step": 13890 + }, + { + "epoch": 8.034682080924856, + "grad_norm": 0.045987293124198914, + "learning_rate": 8.645069504730689e-05, + "loss": 0.0026, + "step": 13900 + }, + { + "epoch": 8.040462427745664, + "grad_norm": 0.05192944034934044, + "learning_rate": 8.64280511114301e-05, + "loss": 0.0035, + "step": 13910 + }, + { + "epoch": 8.046242774566474, + "grad_norm": 0.056140366941690445, + "learning_rate": 8.640539124068617e-05, + "loss": 0.0025, + "step": 13920 + }, + { + "epoch": 8.052023121387283, + "grad_norm": 0.08405833691358566, + "learning_rate": 8.638271544498727e-05, + "loss": 0.0028, + "step": 13930 + }, + { + "epoch": 8.057803468208093, + "grad_norm": 0.0532781258225441, + "learning_rate": 8.636002373425257e-05, + "loss": 0.0022, + "step": 13940 + }, + { + "epoch": 8.063583815028903, + "grad_norm": 0.06937815248966217, + "learning_rate": 8.633731611840817e-05, + "loss": 0.0028, + "step": 13950 + }, + { + "epoch": 8.06936416184971, + "grad_norm": 0.05528760328888893, + "learning_rate": 8.631459260738717e-05, + "loss": 0.0034, + "step": 13960 + }, + { + "epoch": 8.07514450867052, + "grad_norm": 0.0920768454670906, + "learning_rate": 8.62918532111296e-05, + "loss": 0.0032, + "step": 13970 + }, + { + "epoch": 8.080924855491329, + "grad_norm": 0.07862822711467743, + "learning_rate": 8.626909793958248e-05, + "loss": 0.0026, + "step": 13980 + }, + { + "epoch": 8.086705202312139, + "grad_norm": 0.08769793063402176, + "learning_rate": 8.624632680269969e-05, + "loss": 0.0036, + "step": 13990 + }, + { + "epoch": 8.092485549132949, + "grad_norm": 0.044517453759908676, + "learning_rate": 8.622353981044212e-05, + "loss": 0.0029, + "step": 14000 + }, + { + "epoch": 8.098265895953757, + "grad_norm": 0.05424928292632103, + "learning_rate": 8.620073697277757e-05, + "loss": 0.003, + "step": 14010 + }, + { + "epoch": 8.104046242774567, + "grad_norm": 0.06627245992422104, + "learning_rate": 8.617791829968079e-05, + "loss": 0.006, + "step": 14020 + }, + { + "epoch": 8.109826589595375, + "grad_norm": 0.05628069117665291, + "learning_rate": 8.615508380113344e-05, + "loss": 0.0033, + "step": 14030 + }, + { + "epoch": 8.115606936416185, + "grad_norm": 0.0704016163945198, + "learning_rate": 8.613223348712408e-05, + "loss": 0.0034, + "step": 14040 + }, + { + "epoch": 8.121387283236995, + "grad_norm": 0.07526744157075882, + "learning_rate": 8.610936736764824e-05, + "loss": 0.0029, + "step": 14050 + }, + { + "epoch": 8.127167630057803, + "grad_norm": 0.06005479395389557, + "learning_rate": 8.608648545270833e-05, + "loss": 0.0024, + "step": 14060 + }, + { + "epoch": 8.132947976878613, + "grad_norm": 0.08388201147317886, + "learning_rate": 8.606358775231366e-05, + "loss": 0.0028, + "step": 14070 + }, + { + "epoch": 8.138728323699421, + "grad_norm": 0.06882532685995102, + "learning_rate": 8.60406742764805e-05, + "loss": 0.0028, + "step": 14080 + }, + { + "epoch": 8.144508670520231, + "grad_norm": 0.04375378414988518, + "learning_rate": 8.601774503523195e-05, + "loss": 0.0028, + "step": 14090 + }, + { + "epoch": 8.15028901734104, + "grad_norm": 0.04277557507157326, + "learning_rate": 8.599480003859805e-05, + "loss": 0.0024, + "step": 14100 + }, + { + "epoch": 8.15606936416185, + "grad_norm": 0.07634586840867996, + "learning_rate": 8.597183929661573e-05, + "loss": 0.0023, + "step": 14110 + }, + { + "epoch": 8.16184971098266, + "grad_norm": 0.13055641949176788, + "learning_rate": 8.594886281932879e-05, + "loss": 0.0039, + "step": 14120 + }, + { + "epoch": 8.167630057803468, + "grad_norm": 0.08069360256195068, + "learning_rate": 8.59258706167879e-05, + "loss": 0.0031, + "step": 14130 + }, + { + "epoch": 8.173410404624278, + "grad_norm": 0.07761472463607788, + "learning_rate": 8.590286269905068e-05, + "loss": 0.0026, + "step": 14140 + }, + { + "epoch": 8.179190751445086, + "grad_norm": 0.0518157035112381, + "learning_rate": 8.587983907618154e-05, + "loss": 0.0032, + "step": 14150 + }, + { + "epoch": 8.184971098265896, + "grad_norm": 0.04370218887925148, + "learning_rate": 8.585679975825178e-05, + "loss": 0.0025, + "step": 14160 + }, + { + "epoch": 8.190751445086706, + "grad_norm": 0.05114853009581566, + "learning_rate": 8.583374475533962e-05, + "loss": 0.0033, + "step": 14170 + }, + { + "epoch": 8.196531791907514, + "grad_norm": 0.07591544091701508, + "learning_rate": 8.581067407753009e-05, + "loss": 0.0029, + "step": 14180 + }, + { + "epoch": 8.202312138728324, + "grad_norm": 0.058348849415779114, + "learning_rate": 8.578758773491507e-05, + "loss": 0.0025, + "step": 14190 + }, + { + "epoch": 8.208092485549132, + "grad_norm": 0.07974713295698166, + "learning_rate": 8.576448573759332e-05, + "loss": 0.0028, + "step": 14200 + }, + { + "epoch": 8.213872832369942, + "grad_norm": 0.08002932369709015, + "learning_rate": 8.574136809567044e-05, + "loss": 0.0036, + "step": 14210 + }, + { + "epoch": 8.219653179190752, + "grad_norm": 0.046907052397727966, + "learning_rate": 8.57182348192589e-05, + "loss": 0.003, + "step": 14220 + }, + { + "epoch": 8.22543352601156, + "grad_norm": 0.07600396871566772, + "learning_rate": 8.569508591847792e-05, + "loss": 0.0033, + "step": 14230 + }, + { + "epoch": 8.23121387283237, + "grad_norm": 0.07189874351024628, + "learning_rate": 8.567192140345367e-05, + "loss": 0.0045, + "step": 14240 + }, + { + "epoch": 8.236994219653178, + "grad_norm": 0.051627375185489655, + "learning_rate": 8.564874128431906e-05, + "loss": 0.0025, + "step": 14250 + }, + { + "epoch": 8.242774566473988, + "grad_norm": 0.056345611810684204, + "learning_rate": 8.562554557121389e-05, + "loss": 0.0027, + "step": 14260 + }, + { + "epoch": 8.248554913294798, + "grad_norm": 0.049111682921648026, + "learning_rate": 8.560233427428475e-05, + "loss": 0.0027, + "step": 14270 + }, + { + "epoch": 8.254335260115607, + "grad_norm": 0.0892246887087822, + "learning_rate": 8.557910740368503e-05, + "loss": 0.0049, + "step": 14280 + }, + { + "epoch": 8.260115606936417, + "grad_norm": 0.055140819400548935, + "learning_rate": 8.555586496957495e-05, + "loss": 0.0023, + "step": 14290 + }, + { + "epoch": 8.265895953757225, + "grad_norm": 0.07861759513616562, + "learning_rate": 8.553260698212155e-05, + "loss": 0.0029, + "step": 14300 + }, + { + "epoch": 8.271676300578035, + "grad_norm": 0.059682153165340424, + "learning_rate": 8.550933345149868e-05, + "loss": 0.0031, + "step": 14310 + }, + { + "epoch": 8.277456647398845, + "grad_norm": 0.06093475595116615, + "learning_rate": 8.548604438788696e-05, + "loss": 0.003, + "step": 14320 + }, + { + "epoch": 8.283236994219653, + "grad_norm": 0.087743379175663, + "learning_rate": 8.546273980147383e-05, + "loss": 0.0031, + "step": 14330 + }, + { + "epoch": 8.289017341040463, + "grad_norm": 0.09008658677339554, + "learning_rate": 8.543941970245348e-05, + "loss": 0.0026, + "step": 14340 + }, + { + "epoch": 8.294797687861271, + "grad_norm": 0.0581711046397686, + "learning_rate": 8.541608410102693e-05, + "loss": 0.0031, + "step": 14350 + }, + { + "epoch": 8.300578034682081, + "grad_norm": 0.05531102418899536, + "learning_rate": 8.539273300740195e-05, + "loss": 0.003, + "step": 14360 + }, + { + "epoch": 8.306358381502891, + "grad_norm": 0.049726564437150955, + "learning_rate": 8.536936643179313e-05, + "loss": 0.0028, + "step": 14370 + }, + { + "epoch": 8.3121387283237, + "grad_norm": 0.0582098588347435, + "learning_rate": 8.534598438442179e-05, + "loss": 0.0024, + "step": 14380 + }, + { + "epoch": 8.31791907514451, + "grad_norm": 0.07375791668891907, + "learning_rate": 8.532258687551603e-05, + "loss": 0.005, + "step": 14390 + }, + { + "epoch": 8.323699421965317, + "grad_norm": 0.0793997198343277, + "learning_rate": 8.529917391531071e-05, + "loss": 0.0031, + "step": 14400 + }, + { + "epoch": 8.329479768786127, + "grad_norm": 0.0530426912009716, + "learning_rate": 8.527574551404747e-05, + "loss": 0.0029, + "step": 14410 + }, + { + "epoch": 8.335260115606937, + "grad_norm": 0.09852785617113113, + "learning_rate": 8.525230168197468e-05, + "loss": 0.0038, + "step": 14420 + }, + { + "epoch": 8.341040462427745, + "grad_norm": 0.0513310432434082, + "learning_rate": 8.522884242934745e-05, + "loss": 0.0034, + "step": 14430 + }, + { + "epoch": 8.346820809248555, + "grad_norm": 0.06458441913127899, + "learning_rate": 8.520536776642768e-05, + "loss": 0.003, + "step": 14440 + }, + { + "epoch": 8.352601156069364, + "grad_norm": 0.05780723690986633, + "learning_rate": 8.5181877703484e-05, + "loss": 0.0031, + "step": 14450 + }, + { + "epoch": 8.358381502890174, + "grad_norm": 0.0697089433670044, + "learning_rate": 8.51583722507917e-05, + "loss": 0.0033, + "step": 14460 + }, + { + "epoch": 8.364161849710982, + "grad_norm": 0.049896955490112305, + "learning_rate": 8.513485141863293e-05, + "loss": 0.0022, + "step": 14470 + }, + { + "epoch": 8.369942196531792, + "grad_norm": 0.05260182172060013, + "learning_rate": 8.511131521729647e-05, + "loss": 0.0031, + "step": 14480 + }, + { + "epoch": 8.375722543352602, + "grad_norm": 0.06051109731197357, + "learning_rate": 8.508776365707787e-05, + "loss": 0.003, + "step": 14490 + }, + { + "epoch": 8.38150289017341, + "grad_norm": 0.07035737484693527, + "learning_rate": 8.506419674827934e-05, + "loss": 0.0027, + "step": 14500 + }, + { + "epoch": 8.38728323699422, + "grad_norm": 0.08471290022134781, + "learning_rate": 8.50406145012099e-05, + "loss": 0.0031, + "step": 14510 + }, + { + "epoch": 8.393063583815028, + "grad_norm": 0.08894751965999603, + "learning_rate": 8.501701692618519e-05, + "loss": 0.0027, + "step": 14520 + }, + { + "epoch": 8.398843930635838, + "grad_norm": 0.06659360229969025, + "learning_rate": 8.499340403352761e-05, + "loss": 0.0029, + "step": 14530 + }, + { + "epoch": 8.404624277456648, + "grad_norm": 0.0539698526263237, + "learning_rate": 8.496977583356623e-05, + "loss": 0.0026, + "step": 14540 + }, + { + "epoch": 8.410404624277456, + "grad_norm": 0.06316234171390533, + "learning_rate": 8.494613233663684e-05, + "loss": 0.0025, + "step": 14550 + }, + { + "epoch": 8.416184971098266, + "grad_norm": 0.052839938551187515, + "learning_rate": 8.492247355308189e-05, + "loss": 0.0022, + "step": 14560 + }, + { + "epoch": 8.421965317919074, + "grad_norm": 0.1064499169588089, + "learning_rate": 8.489879949325056e-05, + "loss": 0.0037, + "step": 14570 + }, + { + "epoch": 8.427745664739884, + "grad_norm": 0.07197859138250351, + "learning_rate": 8.487511016749868e-05, + "loss": 0.0025, + "step": 14580 + }, + { + "epoch": 8.433526011560694, + "grad_norm": 0.09276767820119858, + "learning_rate": 8.485140558618874e-05, + "loss": 0.003, + "step": 14590 + }, + { + "epoch": 8.439306358381502, + "grad_norm": 0.06622461974620819, + "learning_rate": 8.482768575968995e-05, + "loss": 0.0027, + "step": 14600 + }, + { + "epoch": 8.445086705202312, + "grad_norm": 0.04703529179096222, + "learning_rate": 8.480395069837818e-05, + "loss": 0.0026, + "step": 14610 + }, + { + "epoch": 8.45086705202312, + "grad_norm": 0.0429091602563858, + "learning_rate": 8.478020041263595e-05, + "loss": 0.0025, + "step": 14620 + }, + { + "epoch": 8.45664739884393, + "grad_norm": 0.04643228277564049, + "learning_rate": 8.475643491285242e-05, + "loss": 0.0025, + "step": 14630 + }, + { + "epoch": 8.46242774566474, + "grad_norm": 0.06544038653373718, + "learning_rate": 8.473265420942345e-05, + "loss": 0.003, + "step": 14640 + }, + { + "epoch": 8.468208092485549, + "grad_norm": 0.06004485487937927, + "learning_rate": 8.470885831275151e-05, + "loss": 0.0035, + "step": 14650 + }, + { + "epoch": 8.473988439306359, + "grad_norm": 0.11065249145030975, + "learning_rate": 8.468504723324574e-05, + "loss": 0.0039, + "step": 14660 + }, + { + "epoch": 8.479768786127167, + "grad_norm": 0.09323501586914062, + "learning_rate": 8.466122098132193e-05, + "loss": 0.0034, + "step": 14670 + }, + { + "epoch": 8.485549132947977, + "grad_norm": 0.08076255023479462, + "learning_rate": 8.463737956740245e-05, + "loss": 0.0027, + "step": 14680 + }, + { + "epoch": 8.491329479768787, + "grad_norm": 0.0990118682384491, + "learning_rate": 8.461352300191639e-05, + "loss": 0.0029, + "step": 14690 + }, + { + "epoch": 8.497109826589595, + "grad_norm": 0.07514192163944244, + "learning_rate": 8.45896512952994e-05, + "loss": 0.0037, + "step": 14700 + }, + { + "epoch": 8.502890173410405, + "grad_norm": 0.08111371845006943, + "learning_rate": 8.456576445799377e-05, + "loss": 0.0027, + "step": 14710 + }, + { + "epoch": 8.508670520231213, + "grad_norm": 0.07220485061407089, + "learning_rate": 8.454186250044844e-05, + "loss": 0.0027, + "step": 14720 + }, + { + "epoch": 8.514450867052023, + "grad_norm": 0.06481906026601791, + "learning_rate": 8.451794543311892e-05, + "loss": 0.0032, + "step": 14730 + }, + { + "epoch": 8.520231213872833, + "grad_norm": 0.06372509896755219, + "learning_rate": 8.449401326646736e-05, + "loss": 0.0025, + "step": 14740 + }, + { + "epoch": 8.526011560693641, + "grad_norm": 0.04399619251489639, + "learning_rate": 8.447006601096248e-05, + "loss": 0.0024, + "step": 14750 + }, + { + "epoch": 8.531791907514451, + "grad_norm": 0.0819878876209259, + "learning_rate": 8.444610367707964e-05, + "loss": 0.0034, + "step": 14760 + }, + { + "epoch": 8.53757225433526, + "grad_norm": 0.05718172341585159, + "learning_rate": 8.442212627530078e-05, + "loss": 0.003, + "step": 14770 + }, + { + "epoch": 8.54335260115607, + "grad_norm": 0.08189176768064499, + "learning_rate": 8.439813381611441e-05, + "loss": 0.0032, + "step": 14780 + }, + { + "epoch": 8.54913294797688, + "grad_norm": 0.049891822040081024, + "learning_rate": 8.437412631001567e-05, + "loss": 0.0023, + "step": 14790 + }, + { + "epoch": 8.554913294797688, + "grad_norm": 0.06955216079950333, + "learning_rate": 8.435010376750626e-05, + "loss": 0.003, + "step": 14800 + }, + { + "epoch": 8.560693641618498, + "grad_norm": 0.08844389766454697, + "learning_rate": 8.432606619909442e-05, + "loss": 0.004, + "step": 14810 + }, + { + "epoch": 8.566473988439306, + "grad_norm": 0.10105694085359573, + "learning_rate": 8.430201361529506e-05, + "loss": 0.0027, + "step": 14820 + }, + { + "epoch": 8.572254335260116, + "grad_norm": 0.11816252022981644, + "learning_rate": 8.427794602662954e-05, + "loss": 0.0028, + "step": 14830 + }, + { + "epoch": 8.578034682080926, + "grad_norm": 0.09827663004398346, + "learning_rate": 8.425386344362586e-05, + "loss": 0.003, + "step": 14840 + }, + { + "epoch": 8.583815028901734, + "grad_norm": 0.06215595826506615, + "learning_rate": 8.422976587681859e-05, + "loss": 0.0049, + "step": 14850 + }, + { + "epoch": 8.589595375722544, + "grad_norm": 0.08483150601387024, + "learning_rate": 8.42056533367488e-05, + "loss": 0.0063, + "step": 14860 + }, + { + "epoch": 8.595375722543352, + "grad_norm": 0.0631008893251419, + "learning_rate": 8.41815258339641e-05, + "loss": 0.0038, + "step": 14870 + }, + { + "epoch": 8.601156069364162, + "grad_norm": 0.11883585900068283, + "learning_rate": 8.415738337901874e-05, + "loss": 0.0049, + "step": 14880 + }, + { + "epoch": 8.606936416184972, + "grad_norm": 0.23546771705150604, + "learning_rate": 8.413322598247342e-05, + "loss": 0.0043, + "step": 14890 + }, + { + "epoch": 8.61271676300578, + "grad_norm": 0.2148403376340866, + "learning_rate": 8.41090536548954e-05, + "loss": 0.0045, + "step": 14900 + }, + { + "epoch": 8.61849710982659, + "grad_norm": 0.10700441896915436, + "learning_rate": 8.408486640685849e-05, + "loss": 0.0033, + "step": 14910 + }, + { + "epoch": 8.624277456647398, + "grad_norm": 0.12443286180496216, + "learning_rate": 8.4060664248943e-05, + "loss": 0.0058, + "step": 14920 + }, + { + "epoch": 8.630057803468208, + "grad_norm": 0.09611442685127258, + "learning_rate": 8.40364471917358e-05, + "loss": 0.0054, + "step": 14930 + }, + { + "epoch": 8.635838150289018, + "grad_norm": 0.08155146986246109, + "learning_rate": 8.401221524583024e-05, + "loss": 0.0045, + "step": 14940 + }, + { + "epoch": 8.641618497109826, + "grad_norm": 0.07205265760421753, + "learning_rate": 8.398796842182619e-05, + "loss": 0.0036, + "step": 14950 + }, + { + "epoch": 8.647398843930636, + "grad_norm": 0.04646911844611168, + "learning_rate": 8.396370673033006e-05, + "loss": 0.0029, + "step": 14960 + }, + { + "epoch": 8.653179190751445, + "grad_norm": 0.06000467762351036, + "learning_rate": 8.39394301819547e-05, + "loss": 0.0027, + "step": 14970 + }, + { + "epoch": 8.658959537572255, + "grad_norm": 0.07406723499298096, + "learning_rate": 8.391513878731949e-05, + "loss": 0.0039, + "step": 14980 + }, + { + "epoch": 8.664739884393063, + "grad_norm": 0.07790686190128326, + "learning_rate": 8.389083255705037e-05, + "loss": 0.0037, + "step": 14990 + }, + { + "epoch": 8.670520231213873, + "grad_norm": 0.0638248473405838, + "learning_rate": 8.386651150177968e-05, + "loss": 0.0036, + "step": 15000 + } + ], + "logging_steps": 10, + "max_steps": 50000, + "num_input_tokens_seen": 0, + "num_train_epochs": 29, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}