{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.670520231213873, "eval_steps": 500, "global_step": 15000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005780346820809248, "grad_norm": 3.1595253944396973, "learning_rate": 3.6e-07, "loss": 0.3371, "step": 10 }, { "epoch": 0.011560693641618497, "grad_norm": 3.4880928993225098, "learning_rate": 7.6e-07, "loss": 0.3336, "step": 20 }, { "epoch": 0.017341040462427744, "grad_norm": 3.2054455280303955, "learning_rate": 1.16e-06, "loss": 0.3104, "step": 30 }, { "epoch": 0.023121387283236993, "grad_norm": 2.7082252502441406, "learning_rate": 1.56e-06, "loss": 0.2878, "step": 40 }, { "epoch": 0.028901734104046242, "grad_norm": 1.6240012645721436, "learning_rate": 1.96e-06, "loss": 0.1887, "step": 50 }, { "epoch": 0.03468208092485549, "grad_norm": 0.4911483824253082, "learning_rate": 2.36e-06, "loss": 0.1468, "step": 60 }, { "epoch": 0.04046242774566474, "grad_norm": 0.6908175945281982, "learning_rate": 2.7600000000000003e-06, "loss": 0.1346, "step": 70 }, { "epoch": 0.046242774566473986, "grad_norm": 0.4389197826385498, "learning_rate": 3.1600000000000007e-06, "loss": 0.131, "step": 80 }, { "epoch": 0.05202312138728324, "grad_norm": 0.5299481153488159, "learning_rate": 3.5600000000000002e-06, "loss": 0.0966, "step": 90 }, { "epoch": 0.057803468208092484, "grad_norm": 0.33272606134414673, "learning_rate": 3.96e-06, "loss": 0.1039, "step": 100 }, { "epoch": 0.06358381502890173, "grad_norm": 0.2846597135066986, "learning_rate": 4.360000000000001e-06, "loss": 0.0933, "step": 110 }, { "epoch": 0.06936416184971098, "grad_norm": 0.21347007155418396, "learning_rate": 4.76e-06, "loss": 0.0802, "step": 120 }, { "epoch": 0.07514450867052024, "grad_norm": 0.24284492433071136, "learning_rate": 5.1600000000000006e-06, "loss": 0.0858, "step": 130 }, { "epoch": 0.08092485549132948, "grad_norm": 0.19188109040260315, "learning_rate": 5.56e-06, "loss": 0.0793, "step": 140 }, { "epoch": 0.08670520231213873, "grad_norm": 0.15339428186416626, "learning_rate": 5.9600000000000005e-06, "loss": 0.0744, "step": 150 }, { "epoch": 0.09248554913294797, "grad_norm": 0.16470389068126678, "learning_rate": 6.360000000000001e-06, "loss": 0.0705, "step": 160 }, { "epoch": 0.09826589595375723, "grad_norm": 0.15767133235931396, "learning_rate": 6.76e-06, "loss": 0.0665, "step": 170 }, { "epoch": 0.10404624277456648, "grad_norm": 0.11162696778774261, "learning_rate": 7.16e-06, "loss": 0.0609, "step": 180 }, { "epoch": 0.10982658959537572, "grad_norm": 0.08929910510778427, "learning_rate": 7.5600000000000005e-06, "loss": 0.0617, "step": 190 }, { "epoch": 0.11560693641618497, "grad_norm": 0.11486585438251495, "learning_rate": 7.96e-06, "loss": 0.0558, "step": 200 }, { "epoch": 0.12138728323699421, "grad_norm": 0.1365112066268921, "learning_rate": 8.36e-06, "loss": 0.0529, "step": 210 }, { "epoch": 0.12716763005780346, "grad_norm": 0.14509879052639008, "learning_rate": 8.76e-06, "loss": 0.0523, "step": 220 }, { "epoch": 0.1329479768786127, "grad_norm": 0.11319673806428909, "learning_rate": 9.16e-06, "loss": 0.0454, "step": 230 }, { "epoch": 0.13872832369942195, "grad_norm": 0.1477111279964447, "learning_rate": 9.560000000000002e-06, "loss": 0.0474, "step": 240 }, { "epoch": 0.14450867052023122, "grad_norm": 0.10854203253984451, "learning_rate": 9.96e-06, "loss": 0.0393, "step": 250 }, { "epoch": 0.15028901734104047, "grad_norm": 0.11513552814722061, "learning_rate": 1.036e-05, "loss": 0.045, "step": 260 }, { "epoch": 0.15606936416184972, "grad_norm": 0.11579402536153793, "learning_rate": 1.076e-05, "loss": 0.0355, "step": 270 }, { "epoch": 0.16184971098265896, "grad_norm": 0.11395751684904099, "learning_rate": 1.1160000000000002e-05, "loss": 0.0422, "step": 280 }, { "epoch": 0.1676300578034682, "grad_norm": 0.12264841049909592, "learning_rate": 1.156e-05, "loss": 0.0396, "step": 290 }, { "epoch": 0.17341040462427745, "grad_norm": 0.1499921679496765, "learning_rate": 1.196e-05, "loss": 0.0368, "step": 300 }, { "epoch": 0.1791907514450867, "grad_norm": 0.1338682770729065, "learning_rate": 1.236e-05, "loss": 0.035, "step": 310 }, { "epoch": 0.18497109826589594, "grad_norm": 0.12111975252628326, "learning_rate": 1.276e-05, "loss": 0.0327, "step": 320 }, { "epoch": 0.1907514450867052, "grad_norm": 0.08808861672878265, "learning_rate": 1.316e-05, "loss": 0.0318, "step": 330 }, { "epoch": 0.19653179190751446, "grad_norm": 0.14213015139102936, "learning_rate": 1.356e-05, "loss": 0.0301, "step": 340 }, { "epoch": 0.2023121387283237, "grad_norm": 0.11863279342651367, "learning_rate": 1.396e-05, "loss": 0.0271, "step": 350 }, { "epoch": 0.20809248554913296, "grad_norm": 0.13171134889125824, "learning_rate": 1.4360000000000001e-05, "loss": 0.0356, "step": 360 }, { "epoch": 0.2138728323699422, "grad_norm": 0.13477171957492828, "learning_rate": 1.4760000000000001e-05, "loss": 0.03, "step": 370 }, { "epoch": 0.21965317919075145, "grad_norm": 0.17239578068256378, "learning_rate": 1.5160000000000002e-05, "loss": 0.0305, "step": 380 }, { "epoch": 0.2254335260115607, "grad_norm": 0.11451636254787445, "learning_rate": 1.556e-05, "loss": 0.0288, "step": 390 }, { "epoch": 0.23121387283236994, "grad_norm": 0.1459856629371643, "learning_rate": 1.596e-05, "loss": 0.0263, "step": 400 }, { "epoch": 0.23699421965317918, "grad_norm": 0.11896130442619324, "learning_rate": 1.636e-05, "loss": 0.03, "step": 410 }, { "epoch": 0.24277456647398843, "grad_norm": 0.09973743557929993, "learning_rate": 1.6760000000000002e-05, "loss": 0.0262, "step": 420 }, { "epoch": 0.24855491329479767, "grad_norm": 0.13354068994522095, "learning_rate": 1.7160000000000002e-05, "loss": 0.024, "step": 430 }, { "epoch": 0.2543352601156069, "grad_norm": 0.15291906893253326, "learning_rate": 1.756e-05, "loss": 0.0243, "step": 440 }, { "epoch": 0.26011560693641617, "grad_norm": 0.16498644649982452, "learning_rate": 1.796e-05, "loss": 0.0232, "step": 450 }, { "epoch": 0.2658959537572254, "grad_norm": 0.1057974100112915, "learning_rate": 1.8360000000000004e-05, "loss": 0.0211, "step": 460 }, { "epoch": 0.27167630057803466, "grad_norm": 0.10222145169973373, "learning_rate": 1.876e-05, "loss": 0.0218, "step": 470 }, { "epoch": 0.2774566473988439, "grad_norm": 0.10169381648302078, "learning_rate": 1.916e-05, "loss": 0.0211, "step": 480 }, { "epoch": 0.2832369942196532, "grad_norm": 0.15869389474391937, "learning_rate": 1.956e-05, "loss": 0.0237, "step": 490 }, { "epoch": 0.28901734104046245, "grad_norm": 0.16140298545360565, "learning_rate": 1.9960000000000002e-05, "loss": 0.0198, "step": 500 }, { "epoch": 0.2947976878612717, "grad_norm": 0.1119980588555336, "learning_rate": 2.036e-05, "loss": 0.0203, "step": 510 }, { "epoch": 0.30057803468208094, "grad_norm": 0.09472450613975525, "learning_rate": 2.076e-05, "loss": 0.0211, "step": 520 }, { "epoch": 0.3063583815028902, "grad_norm": 0.1749098151922226, "learning_rate": 2.116e-05, "loss": 0.0189, "step": 530 }, { "epoch": 0.31213872832369943, "grad_norm": 0.13768576085567474, "learning_rate": 2.1560000000000004e-05, "loss": 0.0175, "step": 540 }, { "epoch": 0.3179190751445087, "grad_norm": 0.13592314720153809, "learning_rate": 2.196e-05, "loss": 0.0258, "step": 550 }, { "epoch": 0.3236994219653179, "grad_norm": 0.1005687341094017, "learning_rate": 2.236e-05, "loss": 0.0179, "step": 560 }, { "epoch": 0.32947976878612717, "grad_norm": 0.14020080864429474, "learning_rate": 2.2760000000000002e-05, "loss": 0.02, "step": 570 }, { "epoch": 0.3352601156069364, "grad_norm": 0.10146922618150711, "learning_rate": 2.3160000000000002e-05, "loss": 0.0176, "step": 580 }, { "epoch": 0.34104046242774566, "grad_norm": 0.1250441074371338, "learning_rate": 2.356e-05, "loss": 0.0169, "step": 590 }, { "epoch": 0.3468208092485549, "grad_norm": 0.13686810433864594, "learning_rate": 2.396e-05, "loss": 0.0186, "step": 600 }, { "epoch": 0.35260115606936415, "grad_norm": 0.15110881626605988, "learning_rate": 2.4360000000000004e-05, "loss": 0.021, "step": 610 }, { "epoch": 0.3583815028901734, "grad_norm": 0.14723263680934906, "learning_rate": 2.476e-05, "loss": 0.0226, "step": 620 }, { "epoch": 0.36416184971098264, "grad_norm": 0.11678220331668854, "learning_rate": 2.516e-05, "loss": 0.0151, "step": 630 }, { "epoch": 0.3699421965317919, "grad_norm": 0.18138806521892548, "learning_rate": 2.556e-05, "loss": 0.0182, "step": 640 }, { "epoch": 0.37572254335260113, "grad_norm": 0.14597956836223602, "learning_rate": 2.5960000000000002e-05, "loss": 0.0148, "step": 650 }, { "epoch": 0.3815028901734104, "grad_norm": 0.16550736129283905, "learning_rate": 2.6360000000000002e-05, "loss": 0.0175, "step": 660 }, { "epoch": 0.3872832369942196, "grad_norm": 0.12392124533653259, "learning_rate": 2.676e-05, "loss": 0.0178, "step": 670 }, { "epoch": 0.3930635838150289, "grad_norm": 0.14373187720775604, "learning_rate": 2.716e-05, "loss": 0.0151, "step": 680 }, { "epoch": 0.3988439306358382, "grad_norm": 0.10784381628036499, "learning_rate": 2.7560000000000004e-05, "loss": 0.0153, "step": 690 }, { "epoch": 0.4046242774566474, "grad_norm": 0.12487441301345825, "learning_rate": 2.7960000000000003e-05, "loss": 0.0146, "step": 700 }, { "epoch": 0.41040462427745666, "grad_norm": 0.1104297786951065, "learning_rate": 2.8360000000000003e-05, "loss": 0.0174, "step": 710 }, { "epoch": 0.4161849710982659, "grad_norm": 0.09142022579908371, "learning_rate": 2.8760000000000002e-05, "loss": 0.0142, "step": 720 }, { "epoch": 0.42196531791907516, "grad_norm": 0.12280980497598648, "learning_rate": 2.9160000000000005e-05, "loss": 0.0147, "step": 730 }, { "epoch": 0.4277456647398844, "grad_norm": 0.12440512329339981, "learning_rate": 2.9559999999999998e-05, "loss": 0.0131, "step": 740 }, { "epoch": 0.43352601156069365, "grad_norm": 0.09911549091339111, "learning_rate": 2.9959999999999998e-05, "loss": 0.0141, "step": 750 }, { "epoch": 0.4393063583815029, "grad_norm": 0.12997058033943176, "learning_rate": 3.036e-05, "loss": 0.0121, "step": 760 }, { "epoch": 0.44508670520231214, "grad_norm": 0.14805808663368225, "learning_rate": 3.076e-05, "loss": 0.0148, "step": 770 }, { "epoch": 0.4508670520231214, "grad_norm": 0.10276526212692261, "learning_rate": 3.116e-05, "loss": 0.0134, "step": 780 }, { "epoch": 0.45664739884393063, "grad_norm": 0.15157487988471985, "learning_rate": 3.156e-05, "loss": 0.0143, "step": 790 }, { "epoch": 0.4624277456647399, "grad_norm": 0.10993634164333344, "learning_rate": 3.196e-05, "loss": 0.0151, "step": 800 }, { "epoch": 0.4682080924855491, "grad_norm": 0.11326078325510025, "learning_rate": 3.236e-05, "loss": 0.0162, "step": 810 }, { "epoch": 0.47398843930635837, "grad_norm": 0.13100625574588776, "learning_rate": 3.2760000000000005e-05, "loss": 0.013, "step": 820 }, { "epoch": 0.4797687861271676, "grad_norm": 0.18897277116775513, "learning_rate": 3.316e-05, "loss": 0.0139, "step": 830 }, { "epoch": 0.48554913294797686, "grad_norm": 0.17949187755584717, "learning_rate": 3.3560000000000004e-05, "loss": 0.011, "step": 840 }, { "epoch": 0.4913294797687861, "grad_norm": 0.10038192570209503, "learning_rate": 3.396e-05, "loss": 0.0119, "step": 850 }, { "epoch": 0.49710982658959535, "grad_norm": 0.16344571113586426, "learning_rate": 3.436e-05, "loss": 0.0133, "step": 860 }, { "epoch": 0.5028901734104047, "grad_norm": 0.23551280796527863, "learning_rate": 3.4760000000000006e-05, "loss": 0.0134, "step": 870 }, { "epoch": 0.5086705202312138, "grad_norm": 0.19831761717796326, "learning_rate": 3.516e-05, "loss": 0.0123, "step": 880 }, { "epoch": 0.5144508670520231, "grad_norm": 0.19457103312015533, "learning_rate": 3.5560000000000005e-05, "loss": 0.0151, "step": 890 }, { "epoch": 0.5202312138728323, "grad_norm": 0.1299736499786377, "learning_rate": 3.596e-05, "loss": 0.0138, "step": 900 }, { "epoch": 0.5260115606936416, "grad_norm": 0.1533757746219635, "learning_rate": 3.636e-05, "loss": 0.0161, "step": 910 }, { "epoch": 0.5317919075144508, "grad_norm": 0.14695550501346588, "learning_rate": 3.676e-05, "loss": 0.0119, "step": 920 }, { "epoch": 0.5375722543352601, "grad_norm": 0.12472260743379593, "learning_rate": 3.716e-05, "loss": 0.013, "step": 930 }, { "epoch": 0.5433526011560693, "grad_norm": 0.14407047629356384, "learning_rate": 3.756e-05, "loss": 0.0165, "step": 940 }, { "epoch": 0.5491329479768786, "grad_norm": 0.11574450135231018, "learning_rate": 3.796e-05, "loss": 0.0137, "step": 950 }, { "epoch": 0.5549132947976878, "grad_norm": 0.17657427489757538, "learning_rate": 3.836e-05, "loss": 0.0109, "step": 960 }, { "epoch": 0.5606936416184971, "grad_norm": 0.11555900424718857, "learning_rate": 3.876e-05, "loss": 0.0148, "step": 970 }, { "epoch": 0.5664739884393064, "grad_norm": 0.11354225873947144, "learning_rate": 3.9160000000000005e-05, "loss": 0.0093, "step": 980 }, { "epoch": 0.5722543352601156, "grad_norm": 0.08584084361791611, "learning_rate": 3.956e-05, "loss": 0.0111, "step": 990 }, { "epoch": 0.5780346820809249, "grad_norm": 0.06208997219800949, "learning_rate": 3.9960000000000004e-05, "loss": 0.0117, "step": 1000 }, { "epoch": 0.5838150289017341, "grad_norm": 0.07749241590499878, "learning_rate": 4.0360000000000007e-05, "loss": 0.0088, "step": 1010 }, { "epoch": 0.5895953757225434, "grad_norm": 0.1077214777469635, "learning_rate": 4.076e-05, "loss": 0.0107, "step": 1020 }, { "epoch": 0.5953757225433526, "grad_norm": 0.12400258332490921, "learning_rate": 4.1160000000000006e-05, "loss": 0.0117, "step": 1030 }, { "epoch": 0.6011560693641619, "grad_norm": 0.10394832491874695, "learning_rate": 4.156e-05, "loss": 0.0089, "step": 1040 }, { "epoch": 0.6069364161849711, "grad_norm": 0.0824170783162117, "learning_rate": 4.196e-05, "loss": 0.0099, "step": 1050 }, { "epoch": 0.6127167630057804, "grad_norm": 0.0983257070183754, "learning_rate": 4.236e-05, "loss": 0.0119, "step": 1060 }, { "epoch": 0.6184971098265896, "grad_norm": 0.11756965517997742, "learning_rate": 4.276e-05, "loss": 0.0109, "step": 1070 }, { "epoch": 0.6242774566473989, "grad_norm": 0.13317210972309113, "learning_rate": 4.316e-05, "loss": 0.0129, "step": 1080 }, { "epoch": 0.630057803468208, "grad_norm": 0.1497182995080948, "learning_rate": 4.356e-05, "loss": 0.014, "step": 1090 }, { "epoch": 0.6358381502890174, "grad_norm": 0.09919284284114838, "learning_rate": 4.396e-05, "loss": 0.0086, "step": 1100 }, { "epoch": 0.6416184971098265, "grad_norm": 0.12873047590255737, "learning_rate": 4.436e-05, "loss": 0.0123, "step": 1110 }, { "epoch": 0.6473988439306358, "grad_norm": 0.08432596921920776, "learning_rate": 4.4760000000000005e-05, "loss": 0.0095, "step": 1120 }, { "epoch": 0.653179190751445, "grad_norm": 0.10877019166946411, "learning_rate": 4.516e-05, "loss": 0.0136, "step": 1130 }, { "epoch": 0.6589595375722543, "grad_norm": 0.09828033298254013, "learning_rate": 4.5560000000000004e-05, "loss": 0.0098, "step": 1140 }, { "epoch": 0.6647398843930635, "grad_norm": 0.17707689106464386, "learning_rate": 4.596e-05, "loss": 0.011, "step": 1150 }, { "epoch": 0.6705202312138728, "grad_norm": 0.09169796109199524, "learning_rate": 4.636e-05, "loss": 0.0111, "step": 1160 }, { "epoch": 0.6763005780346821, "grad_norm": 0.09853609651327133, "learning_rate": 4.6760000000000006e-05, "loss": 0.0091, "step": 1170 }, { "epoch": 0.6820809248554913, "grad_norm": 0.0778835192322731, "learning_rate": 4.716e-05, "loss": 0.0094, "step": 1180 }, { "epoch": 0.6878612716763006, "grad_norm": 0.07706254720687866, "learning_rate": 4.7560000000000005e-05, "loss": 0.0087, "step": 1190 }, { "epoch": 0.6936416184971098, "grad_norm": 0.15445491671562195, "learning_rate": 4.796e-05, "loss": 0.012, "step": 1200 }, { "epoch": 0.6994219653179191, "grad_norm": 0.10672589391469955, "learning_rate": 4.836e-05, "loss": 0.008, "step": 1210 }, { "epoch": 0.7052023121387283, "grad_norm": 0.14515936374664307, "learning_rate": 4.876e-05, "loss": 0.0087, "step": 1220 }, { "epoch": 0.7109826589595376, "grad_norm": 0.11830303817987442, "learning_rate": 4.9160000000000004e-05, "loss": 0.0095, "step": 1230 }, { "epoch": 0.7167630057803468, "grad_norm": 0.10018444061279297, "learning_rate": 4.956e-05, "loss": 0.0087, "step": 1240 }, { "epoch": 0.7225433526011561, "grad_norm": 0.09550796449184418, "learning_rate": 4.996e-05, "loss": 0.0093, "step": 1250 }, { "epoch": 0.7283236994219653, "grad_norm": 0.13438743352890015, "learning_rate": 5.0360000000000006e-05, "loss": 0.0091, "step": 1260 }, { "epoch": 0.7341040462427746, "grad_norm": 0.13329671323299408, "learning_rate": 5.076000000000001e-05, "loss": 0.011, "step": 1270 }, { "epoch": 0.7398843930635838, "grad_norm": 0.10754700750112534, "learning_rate": 5.1160000000000005e-05, "loss": 0.0077, "step": 1280 }, { "epoch": 0.7456647398843931, "grad_norm": 0.13164956867694855, "learning_rate": 5.1559999999999994e-05, "loss": 0.0088, "step": 1290 }, { "epoch": 0.7514450867052023, "grad_norm": 0.07530736923217773, "learning_rate": 5.196e-05, "loss": 0.0086, "step": 1300 }, { "epoch": 0.7572254335260116, "grad_norm": 0.08277012407779694, "learning_rate": 5.236e-05, "loss": 0.0089, "step": 1310 }, { "epoch": 0.7630057803468208, "grad_norm": 0.1286892145872116, "learning_rate": 5.2759999999999996e-05, "loss": 0.0128, "step": 1320 }, { "epoch": 0.7687861271676301, "grad_norm": 0.1276070475578308, "learning_rate": 5.316e-05, "loss": 0.0092, "step": 1330 }, { "epoch": 0.7745664739884393, "grad_norm": 0.11473594605922699, "learning_rate": 5.356e-05, "loss": 0.0089, "step": 1340 }, { "epoch": 0.7803468208092486, "grad_norm": 0.11573047190904617, "learning_rate": 5.396e-05, "loss": 0.0083, "step": 1350 }, { "epoch": 0.7861271676300579, "grad_norm": 0.12039162963628769, "learning_rate": 5.436e-05, "loss": 0.0083, "step": 1360 }, { "epoch": 0.791907514450867, "grad_norm": 0.18288345634937286, "learning_rate": 5.476e-05, "loss": 0.0084, "step": 1370 }, { "epoch": 0.7976878612716763, "grad_norm": 0.1231662929058075, "learning_rate": 5.516e-05, "loss": 0.0095, "step": 1380 }, { "epoch": 0.8034682080924855, "grad_norm": 0.08810202777385712, "learning_rate": 5.556e-05, "loss": 0.009, "step": 1390 }, { "epoch": 0.8092485549132948, "grad_norm": 0.08831888437271118, "learning_rate": 5.596e-05, "loss": 0.0078, "step": 1400 }, { "epoch": 0.815028901734104, "grad_norm": 0.15133686363697052, "learning_rate": 5.636e-05, "loss": 0.0114, "step": 1410 }, { "epoch": 0.8208092485549133, "grad_norm": 0.11997071653604507, "learning_rate": 5.6760000000000005e-05, "loss": 0.0105, "step": 1420 }, { "epoch": 0.8265895953757225, "grad_norm": 0.11660143733024597, "learning_rate": 5.716e-05, "loss": 0.008, "step": 1430 }, { "epoch": 0.8323699421965318, "grad_norm": 0.19836877286434174, "learning_rate": 5.7560000000000005e-05, "loss": 0.0107, "step": 1440 }, { "epoch": 0.838150289017341, "grad_norm": 0.16743585467338562, "learning_rate": 5.796e-05, "loss": 0.0072, "step": 1450 }, { "epoch": 0.8439306358381503, "grad_norm": 0.19401532411575317, "learning_rate": 5.8360000000000004e-05, "loss": 0.0082, "step": 1460 }, { "epoch": 0.8497109826589595, "grad_norm": 0.13777554035186768, "learning_rate": 5.876000000000001e-05, "loss": 0.0101, "step": 1470 }, { "epoch": 0.8554913294797688, "grad_norm": 0.1695699542760849, "learning_rate": 5.916e-05, "loss": 0.0113, "step": 1480 }, { "epoch": 0.861271676300578, "grad_norm": 0.14594483375549316, "learning_rate": 5.9560000000000006e-05, "loss": 0.01, "step": 1490 }, { "epoch": 0.8670520231213873, "grad_norm": 0.1465466171503067, "learning_rate": 5.996e-05, "loss": 0.0093, "step": 1500 }, { "epoch": 0.8728323699421965, "grad_norm": 0.16754291951656342, "learning_rate": 6.0360000000000005e-05, "loss": 0.0131, "step": 1510 }, { "epoch": 0.8786127167630058, "grad_norm": 0.17738179862499237, "learning_rate": 6.076000000000001e-05, "loss": 0.0103, "step": 1520 }, { "epoch": 0.884393063583815, "grad_norm": 0.1402902454137802, "learning_rate": 6.116e-05, "loss": 0.0095, "step": 1530 }, { "epoch": 0.8901734104046243, "grad_norm": 0.1324438899755478, "learning_rate": 6.156e-05, "loss": 0.0081, "step": 1540 }, { "epoch": 0.8959537572254336, "grad_norm": 0.08176060765981674, "learning_rate": 6.196000000000001e-05, "loss": 0.009, "step": 1550 }, { "epoch": 0.9017341040462428, "grad_norm": 0.0868748277425766, "learning_rate": 6.236e-05, "loss": 0.0086, "step": 1560 }, { "epoch": 0.9075144508670521, "grad_norm": 0.13637259602546692, "learning_rate": 6.276e-05, "loss": 0.0091, "step": 1570 }, { "epoch": 0.9132947976878613, "grad_norm": 0.10653480142354965, "learning_rate": 6.316000000000001e-05, "loss": 0.0083, "step": 1580 }, { "epoch": 0.9190751445086706, "grad_norm": 0.11942799389362335, "learning_rate": 6.356000000000001e-05, "loss": 0.0079, "step": 1590 }, { "epoch": 0.9248554913294798, "grad_norm": 0.14978532493114471, "learning_rate": 6.396e-05, "loss": 0.0087, "step": 1600 }, { "epoch": 0.930635838150289, "grad_norm": 0.17128850519657135, "learning_rate": 6.436e-05, "loss": 0.0087, "step": 1610 }, { "epoch": 0.9364161849710982, "grad_norm": 0.10861340165138245, "learning_rate": 6.476e-05, "loss": 0.0078, "step": 1620 }, { "epoch": 0.9421965317919075, "grad_norm": 0.24768634140491486, "learning_rate": 6.515999999999999e-05, "loss": 0.0098, "step": 1630 }, { "epoch": 0.9479768786127167, "grad_norm": 0.11871711909770966, "learning_rate": 6.556e-05, "loss": 0.0078, "step": 1640 }, { "epoch": 0.953757225433526, "grad_norm": 0.12986963987350464, "learning_rate": 6.596e-05, "loss": 0.0077, "step": 1650 }, { "epoch": 0.9595375722543352, "grad_norm": 0.19239209592342377, "learning_rate": 6.636e-05, "loss": 0.0095, "step": 1660 }, { "epoch": 0.9653179190751445, "grad_norm": 0.1672155112028122, "learning_rate": 6.676e-05, "loss": 0.0109, "step": 1670 }, { "epoch": 0.9710982658959537, "grad_norm": 0.10741300880908966, "learning_rate": 6.716e-05, "loss": 0.0073, "step": 1680 }, { "epoch": 0.976878612716763, "grad_norm": 0.1410427987575531, "learning_rate": 6.756e-05, "loss": 0.0086, "step": 1690 }, { "epoch": 0.9826589595375722, "grad_norm": 0.14685547351837158, "learning_rate": 6.796e-05, "loss": 0.008, "step": 1700 }, { "epoch": 0.9884393063583815, "grad_norm": 0.15410996973514557, "learning_rate": 6.836e-05, "loss": 0.0115, "step": 1710 }, { "epoch": 0.9942196531791907, "grad_norm": 0.13527736067771912, "learning_rate": 6.876e-05, "loss": 0.0089, "step": 1720 }, { "epoch": 1.0, "grad_norm": 0.11434699594974518, "learning_rate": 6.916000000000001e-05, "loss": 0.0095, "step": 1730 }, { "epoch": 1.0057803468208093, "grad_norm": 0.12007783353328705, "learning_rate": 6.956e-05, "loss": 0.0075, "step": 1740 }, { "epoch": 1.0115606936416186, "grad_norm": 0.1504870504140854, "learning_rate": 6.996e-05, "loss": 0.0083, "step": 1750 }, { "epoch": 1.0173410404624277, "grad_norm": 0.1315043568611145, "learning_rate": 7.036e-05, "loss": 0.0079, "step": 1760 }, { "epoch": 1.023121387283237, "grad_norm": 0.1160712018609047, "learning_rate": 7.076000000000001e-05, "loss": 0.0071, "step": 1770 }, { "epoch": 1.0289017341040463, "grad_norm": 0.1722860336303711, "learning_rate": 7.116e-05, "loss": 0.0084, "step": 1780 }, { "epoch": 1.0346820809248556, "grad_norm": 0.16109566390514374, "learning_rate": 7.156e-05, "loss": 0.0066, "step": 1790 }, { "epoch": 1.0404624277456647, "grad_norm": 0.12346116453409195, "learning_rate": 7.196000000000001e-05, "loss": 0.007, "step": 1800 }, { "epoch": 1.046242774566474, "grad_norm": 0.13088279962539673, "learning_rate": 7.236e-05, "loss": 0.0064, "step": 1810 }, { "epoch": 1.0520231213872833, "grad_norm": 0.13289068639278412, "learning_rate": 7.276e-05, "loss": 0.007, "step": 1820 }, { "epoch": 1.0578034682080926, "grad_norm": 0.1241140067577362, "learning_rate": 7.316000000000001e-05, "loss": 0.0067, "step": 1830 }, { "epoch": 1.0635838150289016, "grad_norm": 0.12275862693786621, "learning_rate": 7.356000000000001e-05, "loss": 0.007, "step": 1840 }, { "epoch": 1.069364161849711, "grad_norm": 0.09806959331035614, "learning_rate": 7.396e-05, "loss": 0.0064, "step": 1850 }, { "epoch": 1.0751445086705202, "grad_norm": 0.10867589712142944, "learning_rate": 7.436000000000001e-05, "loss": 0.0079, "step": 1860 }, { "epoch": 1.0809248554913296, "grad_norm": 0.09507458657026291, "learning_rate": 7.476000000000001e-05, "loss": 0.007, "step": 1870 }, { "epoch": 1.0867052023121386, "grad_norm": 0.0947691947221756, "learning_rate": 7.516e-05, "loss": 0.0062, "step": 1880 }, { "epoch": 1.092485549132948, "grad_norm": 0.1417185217142105, "learning_rate": 7.556000000000002e-05, "loss": 0.0108, "step": 1890 }, { "epoch": 1.0982658959537572, "grad_norm": 0.13631682097911835, "learning_rate": 7.596000000000001e-05, "loss": 0.0079, "step": 1900 }, { "epoch": 1.1040462427745665, "grad_norm": 0.23177769780158997, "learning_rate": 7.636e-05, "loss": 0.008, "step": 1910 }, { "epoch": 1.1098265895953756, "grad_norm": 0.090873122215271, "learning_rate": 7.676e-05, "loss": 0.0079, "step": 1920 }, { "epoch": 1.115606936416185, "grad_norm": 0.11183790862560272, "learning_rate": 7.716e-05, "loss": 0.0077, "step": 1930 }, { "epoch": 1.1213872832369942, "grad_norm": 0.1344011276960373, "learning_rate": 7.756e-05, "loss": 0.0091, "step": 1940 }, { "epoch": 1.1271676300578035, "grad_norm": 0.11749781668186188, "learning_rate": 7.796e-05, "loss": 0.0075, "step": 1950 }, { "epoch": 1.1329479768786128, "grad_norm": 0.15016603469848633, "learning_rate": 7.836e-05, "loss": 0.01, "step": 1960 }, { "epoch": 1.138728323699422, "grad_norm": 0.12128861248493195, "learning_rate": 7.876e-05, "loss": 0.0127, "step": 1970 }, { "epoch": 1.1445086705202312, "grad_norm": 0.13656798005104065, "learning_rate": 7.916e-05, "loss": 0.0075, "step": 1980 }, { "epoch": 1.1502890173410405, "grad_norm": 0.12774041295051575, "learning_rate": 7.956e-05, "loss": 0.0079, "step": 1990 }, { "epoch": 1.1560693641618498, "grad_norm": 0.23355427384376526, "learning_rate": 7.996e-05, "loss": 0.0113, "step": 2000 }, { "epoch": 1.1618497109826589, "grad_norm": 0.10483523458242416, "learning_rate": 8.036e-05, "loss": 0.008, "step": 2010 }, { "epoch": 1.1676300578034682, "grad_norm": 0.14650487899780273, "learning_rate": 8.076e-05, "loss": 0.0075, "step": 2020 }, { "epoch": 1.1734104046242775, "grad_norm": 0.1080266535282135, "learning_rate": 8.116e-05, "loss": 0.008, "step": 2030 }, { "epoch": 1.1791907514450868, "grad_norm": 0.12676295638084412, "learning_rate": 8.156e-05, "loss": 0.0078, "step": 2040 }, { "epoch": 1.1849710982658959, "grad_norm": 0.17598728835582733, "learning_rate": 8.196000000000001e-05, "loss": 0.009, "step": 2050 }, { "epoch": 1.1907514450867052, "grad_norm": 0.16755390167236328, "learning_rate": 8.236e-05, "loss": 0.01, "step": 2060 }, { "epoch": 1.1965317919075145, "grad_norm": 0.15602730214595795, "learning_rate": 8.276e-05, "loss": 0.0085, "step": 2070 }, { "epoch": 1.2023121387283238, "grad_norm": 0.11544652283191681, "learning_rate": 8.316000000000001e-05, "loss": 0.0074, "step": 2080 }, { "epoch": 1.208092485549133, "grad_norm": 0.09134082496166229, "learning_rate": 8.356e-05, "loss": 0.0072, "step": 2090 }, { "epoch": 1.2138728323699421, "grad_norm": 0.10406164824962616, "learning_rate": 8.396e-05, "loss": 0.0088, "step": 2100 }, { "epoch": 1.2196531791907514, "grad_norm": 0.0975494384765625, "learning_rate": 8.436000000000001e-05, "loss": 0.0059, "step": 2110 }, { "epoch": 1.2254335260115607, "grad_norm": 0.08101125061511993, "learning_rate": 8.476000000000001e-05, "loss": 0.0092, "step": 2120 }, { "epoch": 1.2312138728323698, "grad_norm": 0.0976252555847168, "learning_rate": 8.516e-05, "loss": 0.0067, "step": 2130 }, { "epoch": 1.2369942196531791, "grad_norm": 0.15048253536224365, "learning_rate": 8.556e-05, "loss": 0.0073, "step": 2140 }, { "epoch": 1.2427745664739884, "grad_norm": 0.1096828281879425, "learning_rate": 8.596000000000001e-05, "loss": 0.0065, "step": 2150 }, { "epoch": 1.2485549132947977, "grad_norm": 0.12420912086963654, "learning_rate": 8.636e-05, "loss": 0.0063, "step": 2160 }, { "epoch": 1.254335260115607, "grad_norm": 0.08858140558004379, "learning_rate": 8.676e-05, "loss": 0.0062, "step": 2170 }, { "epoch": 1.260115606936416, "grad_norm": 0.10560262948274612, "learning_rate": 8.716000000000001e-05, "loss": 0.0073, "step": 2180 }, { "epoch": 1.2658959537572254, "grad_norm": 0.13556477427482605, "learning_rate": 8.756000000000001e-05, "loss": 0.0074, "step": 2190 }, { "epoch": 1.2716763005780347, "grad_norm": 0.10504916310310364, "learning_rate": 8.796e-05, "loss": 0.0078, "step": 2200 }, { "epoch": 1.2774566473988438, "grad_norm": 0.11343459039926529, "learning_rate": 8.836000000000001e-05, "loss": 0.0058, "step": 2210 }, { "epoch": 1.2832369942196533, "grad_norm": 0.09249500185251236, "learning_rate": 8.876e-05, "loss": 0.0067, "step": 2220 }, { "epoch": 1.2890173410404624, "grad_norm": 0.08550640940666199, "learning_rate": 8.916e-05, "loss": 0.0072, "step": 2230 }, { "epoch": 1.2947976878612717, "grad_norm": 0.10101890563964844, "learning_rate": 8.956e-05, "loss": 0.0083, "step": 2240 }, { "epoch": 1.300578034682081, "grad_norm": 0.12792877852916718, "learning_rate": 8.996e-05, "loss": 0.0086, "step": 2250 }, { "epoch": 1.30635838150289, "grad_norm": 0.21365466713905334, "learning_rate": 9.036e-05, "loss": 0.0074, "step": 2260 }, { "epoch": 1.3121387283236994, "grad_norm": 0.18697352707386017, "learning_rate": 9.076e-05, "loss": 0.0068, "step": 2270 }, { "epoch": 1.3179190751445087, "grad_norm": 0.16394391655921936, "learning_rate": 9.116e-05, "loss": 0.0084, "step": 2280 }, { "epoch": 1.323699421965318, "grad_norm": 0.12319466471672058, "learning_rate": 9.156e-05, "loss": 0.0078, "step": 2290 }, { "epoch": 1.3294797687861273, "grad_norm": 0.11505721509456635, "learning_rate": 9.196000000000001e-05, "loss": 0.0084, "step": 2300 }, { "epoch": 1.3352601156069364, "grad_norm": 0.0842253565788269, "learning_rate": 9.236e-05, "loss": 0.007, "step": 2310 }, { "epoch": 1.3410404624277457, "grad_norm": 0.10776695609092712, "learning_rate": 9.276e-05, "loss": 0.0054, "step": 2320 }, { "epoch": 1.346820809248555, "grad_norm": 0.10675626248121262, "learning_rate": 9.316000000000001e-05, "loss": 0.0088, "step": 2330 }, { "epoch": 1.352601156069364, "grad_norm": 0.08206217736005783, "learning_rate": 9.356e-05, "loss": 0.0079, "step": 2340 }, { "epoch": 1.3583815028901733, "grad_norm": 0.12166175991296768, "learning_rate": 9.396e-05, "loss": 0.0074, "step": 2350 }, { "epoch": 1.3641618497109826, "grad_norm": 0.10013966262340546, "learning_rate": 9.436e-05, "loss": 0.0087, "step": 2360 }, { "epoch": 1.369942196531792, "grad_norm": 0.1003638356924057, "learning_rate": 9.476000000000001e-05, "loss": 0.0071, "step": 2370 }, { "epoch": 1.3757225433526012, "grad_norm": 0.10239727795124054, "learning_rate": 9.516e-05, "loss": 0.0103, "step": 2380 }, { "epoch": 1.3815028901734103, "grad_norm": 0.1256374716758728, "learning_rate": 9.556e-05, "loss": 0.0088, "step": 2390 }, { "epoch": 1.3872832369942196, "grad_norm": 0.12118260562419891, "learning_rate": 9.596000000000001e-05, "loss": 0.0067, "step": 2400 }, { "epoch": 1.393063583815029, "grad_norm": 0.10683480650186539, "learning_rate": 9.636e-05, "loss": 0.0067, "step": 2410 }, { "epoch": 1.3988439306358382, "grad_norm": 0.0618288479745388, "learning_rate": 9.676e-05, "loss": 0.0062, "step": 2420 }, { "epoch": 1.4046242774566475, "grad_norm": 0.13114090263843536, "learning_rate": 9.716000000000001e-05, "loss": 0.0061, "step": 2430 }, { "epoch": 1.4104046242774566, "grad_norm": 0.10822831094264984, "learning_rate": 9.756000000000001e-05, "loss": 0.0055, "step": 2440 }, { "epoch": 1.416184971098266, "grad_norm": 0.11746819317340851, "learning_rate": 9.796e-05, "loss": 0.0059, "step": 2450 }, { "epoch": 1.4219653179190752, "grad_norm": 0.0757322609424591, "learning_rate": 9.836000000000001e-05, "loss": 0.0056, "step": 2460 }, { "epoch": 1.4277456647398843, "grad_norm": 0.08555682748556137, "learning_rate": 9.876000000000001e-05, "loss": 0.006, "step": 2470 }, { "epoch": 1.4335260115606936, "grad_norm": 0.1246783435344696, "learning_rate": 9.916e-05, "loss": 0.0071, "step": 2480 }, { "epoch": 1.439306358381503, "grad_norm": 0.11538666486740112, "learning_rate": 9.956e-05, "loss": 0.0067, "step": 2490 }, { "epoch": 1.4450867052023122, "grad_norm": 0.1484181433916092, "learning_rate": 9.996000000000001e-05, "loss": 0.0077, "step": 2500 }, { "epoch": 1.4508670520231215, "grad_norm": 0.11887402832508087, "learning_rate": 9.999999114196196e-05, "loss": 0.0054, "step": 2510 }, { "epoch": 1.4566473988439306, "grad_norm": 0.12257851660251617, "learning_rate": 9.99999605215876e-05, "loss": 0.0071, "step": 2520 }, { "epoch": 1.4624277456647399, "grad_norm": 0.14709283411502838, "learning_rate": 9.999990802953179e-05, "loss": 0.0086, "step": 2530 }, { "epoch": 1.4682080924855492, "grad_norm": 0.13029494881629944, "learning_rate": 9.99998336658175e-05, "loss": 0.0089, "step": 2540 }, { "epoch": 1.4739884393063583, "grad_norm": 0.1392313539981842, "learning_rate": 9.999973743047727e-05, "loss": 0.0066, "step": 2550 }, { "epoch": 1.4797687861271676, "grad_norm": 0.1205642893910408, "learning_rate": 9.999961932355319e-05, "loss": 0.0071, "step": 2560 }, { "epoch": 1.4855491329479769, "grad_norm": 0.13903295993804932, "learning_rate": 9.999947934509693e-05, "loss": 0.0074, "step": 2570 }, { "epoch": 1.4913294797687862, "grad_norm": 0.18161019682884216, "learning_rate": 9.999931749516971e-05, "loss": 0.0082, "step": 2580 }, { "epoch": 1.4971098265895955, "grad_norm": 0.14651469886302948, "learning_rate": 9.999913377384233e-05, "loss": 0.0073, "step": 2590 }, { "epoch": 1.5028901734104045, "grad_norm": 0.15712544322013855, "learning_rate": 9.999892818119517e-05, "loss": 0.0071, "step": 2600 }, { "epoch": 1.5086705202312138, "grad_norm": 0.11392804235219955, "learning_rate": 9.999870071731814e-05, "loss": 0.0059, "step": 2610 }, { "epoch": 1.5144508670520231, "grad_norm": 0.11064669489860535, "learning_rate": 9.999845138231076e-05, "loss": 0.0062, "step": 2620 }, { "epoch": 1.5202312138728322, "grad_norm": 0.11116683483123779, "learning_rate": 9.999818017628208e-05, "loss": 0.0068, "step": 2630 }, { "epoch": 1.5260115606936417, "grad_norm": 0.09671295434236526, "learning_rate": 9.999788709935078e-05, "loss": 0.01, "step": 2640 }, { "epoch": 1.5317919075144508, "grad_norm": 0.11243397742509842, "learning_rate": 9.9997572151645e-05, "loss": 0.0065, "step": 2650 }, { "epoch": 1.5375722543352601, "grad_norm": 0.1160590872168541, "learning_rate": 9.999723533330254e-05, "loss": 0.0062, "step": 2660 }, { "epoch": 1.5433526011560694, "grad_norm": 0.08542856574058533, "learning_rate": 9.999687664447074e-05, "loss": 0.0053, "step": 2670 }, { "epoch": 1.5491329479768785, "grad_norm": 0.08599895238876343, "learning_rate": 9.99964960853065e-05, "loss": 0.0054, "step": 2680 }, { "epoch": 1.5549132947976878, "grad_norm": 0.13533271849155426, "learning_rate": 9.999609365597627e-05, "loss": 0.0058, "step": 2690 }, { "epoch": 1.560693641618497, "grad_norm": 0.09653540700674057, "learning_rate": 9.99956693566561e-05, "loss": 0.0092, "step": 2700 }, { "epoch": 1.5664739884393064, "grad_norm": 0.1159488782286644, "learning_rate": 9.99952231875316e-05, "loss": 0.0106, "step": 2710 }, { "epoch": 1.5722543352601157, "grad_norm": 0.12663349509239197, "learning_rate": 9.999475514879795e-05, "loss": 0.0071, "step": 2720 }, { "epoch": 1.5780346820809248, "grad_norm": 0.11458619683980942, "learning_rate": 9.999426524065984e-05, "loss": 0.0064, "step": 2730 }, { "epoch": 1.583815028901734, "grad_norm": 0.13307511806488037, "learning_rate": 9.999375346333162e-05, "loss": 0.0061, "step": 2740 }, { "epoch": 1.5895953757225434, "grad_norm": 0.11335356533527374, "learning_rate": 9.999321981703715e-05, "loss": 0.0059, "step": 2750 }, { "epoch": 1.5953757225433525, "grad_norm": 0.11866944283246994, "learning_rate": 9.999266430200985e-05, "loss": 0.0073, "step": 2760 }, { "epoch": 1.601156069364162, "grad_norm": 0.10777215659618378, "learning_rate": 9.999208691849271e-05, "loss": 0.0055, "step": 2770 }, { "epoch": 1.606936416184971, "grad_norm": 0.10360101610422134, "learning_rate": 9.999148766673832e-05, "loss": 0.0049, "step": 2780 }, { "epoch": 1.6127167630057804, "grad_norm": 0.0994260162115097, "learning_rate": 9.999086654700881e-05, "loss": 0.0056, "step": 2790 }, { "epoch": 1.6184971098265897, "grad_norm": 0.09570357948541641, "learning_rate": 9.999022355957588e-05, "loss": 0.0056, "step": 2800 }, { "epoch": 1.6242774566473988, "grad_norm": 0.07606939971446991, "learning_rate": 9.998955870472079e-05, "loss": 0.0053, "step": 2810 }, { "epoch": 1.630057803468208, "grad_norm": 0.08179699629545212, "learning_rate": 9.998887198273437e-05, "loss": 0.0054, "step": 2820 }, { "epoch": 1.6358381502890174, "grad_norm": 0.11064790934324265, "learning_rate": 9.998816339391701e-05, "loss": 0.0052, "step": 2830 }, { "epoch": 1.6416184971098264, "grad_norm": 0.06641016155481339, "learning_rate": 9.998743293857868e-05, "loss": 0.005, "step": 2840 }, { "epoch": 1.647398843930636, "grad_norm": 0.06775292754173279, "learning_rate": 9.998668061703891e-05, "loss": 0.0056, "step": 2850 }, { "epoch": 1.653179190751445, "grad_norm": 0.17745137214660645, "learning_rate": 9.998590642962679e-05, "loss": 0.0087, "step": 2860 }, { "epoch": 1.6589595375722543, "grad_norm": 0.14147275686264038, "learning_rate": 9.998511037668095e-05, "loss": 0.0052, "step": 2870 }, { "epoch": 1.6647398843930636, "grad_norm": 0.1302187293767929, "learning_rate": 9.998429245854964e-05, "loss": 0.0058, "step": 2880 }, { "epoch": 1.6705202312138727, "grad_norm": 0.08635162562131882, "learning_rate": 9.998345267559064e-05, "loss": 0.0062, "step": 2890 }, { "epoch": 1.6763005780346822, "grad_norm": 0.07505862414836884, "learning_rate": 9.998259102817129e-05, "loss": 0.0048, "step": 2900 }, { "epoch": 1.6820809248554913, "grad_norm": 0.0735427662730217, "learning_rate": 9.99817075166685e-05, "loss": 0.0067, "step": 2910 }, { "epoch": 1.6878612716763006, "grad_norm": 0.06911532580852509, "learning_rate": 9.998080214146878e-05, "loss": 0.0052, "step": 2920 }, { "epoch": 1.69364161849711, "grad_norm": 0.10301624238491058, "learning_rate": 9.997987490296813e-05, "loss": 0.0073, "step": 2930 }, { "epoch": 1.699421965317919, "grad_norm": 0.1210310235619545, "learning_rate": 9.99789258015722e-05, "loss": 0.0046, "step": 2940 }, { "epoch": 1.7052023121387283, "grad_norm": 0.14030395448207855, "learning_rate": 9.997795483769611e-05, "loss": 0.0052, "step": 2950 }, { "epoch": 1.7109826589595376, "grad_norm": 0.1258503496646881, "learning_rate": 9.997696201176462e-05, "loss": 0.0056, "step": 2960 }, { "epoch": 1.7167630057803467, "grad_norm": 0.13269788026809692, "learning_rate": 9.997594732421203e-05, "loss": 0.0059, "step": 2970 }, { "epoch": 1.7225433526011562, "grad_norm": 0.15080974996089935, "learning_rate": 9.997491077548217e-05, "loss": 0.0067, "step": 2980 }, { "epoch": 1.7283236994219653, "grad_norm": 0.13522954285144806, "learning_rate": 9.997385236602851e-05, "loss": 0.0047, "step": 2990 }, { "epoch": 1.7341040462427746, "grad_norm": 0.10210167616605759, "learning_rate": 9.997277209631399e-05, "loss": 0.0084, "step": 3000 }, { "epoch": 1.739884393063584, "grad_norm": 0.16219407320022583, "learning_rate": 9.997166996681118e-05, "loss": 0.0067, "step": 3010 }, { "epoch": 1.745664739884393, "grad_norm": 0.09276897460222244, "learning_rate": 9.997054597800218e-05, "loss": 0.0077, "step": 3020 }, { "epoch": 1.7514450867052023, "grad_norm": 0.1794775277376175, "learning_rate": 9.996940013037866e-05, "loss": 0.007, "step": 3030 }, { "epoch": 1.7572254335260116, "grad_norm": 0.13645876944065094, "learning_rate": 9.996823242444186e-05, "loss": 0.0074, "step": 3040 }, { "epoch": 1.7630057803468207, "grad_norm": 0.17062893509864807, "learning_rate": 9.996704286070258e-05, "loss": 0.0069, "step": 3050 }, { "epoch": 1.7687861271676302, "grad_norm": 0.11452256888151169, "learning_rate": 9.996583143968115e-05, "loss": 0.0048, "step": 3060 }, { "epoch": 1.7745664739884393, "grad_norm": 0.06533671915531158, "learning_rate": 9.99645981619075e-05, "loss": 0.0061, "step": 3070 }, { "epoch": 1.7803468208092486, "grad_norm": 0.08524267375469208, "learning_rate": 9.996334302792114e-05, "loss": 0.0083, "step": 3080 }, { "epoch": 1.7861271676300579, "grad_norm": 0.09816299378871918, "learning_rate": 9.996206603827105e-05, "loss": 0.0056, "step": 3090 }, { "epoch": 1.791907514450867, "grad_norm": 0.18339581787586212, "learning_rate": 9.996076719351587e-05, "loss": 0.01, "step": 3100 }, { "epoch": 1.7976878612716765, "grad_norm": 0.12458177655935287, "learning_rate": 9.995944649422374e-05, "loss": 0.0088, "step": 3110 }, { "epoch": 1.8034682080924855, "grad_norm": 0.12647530436515808, "learning_rate": 9.995810394097239e-05, "loss": 0.006, "step": 3120 }, { "epoch": 1.8092485549132948, "grad_norm": 0.12574432790279388, "learning_rate": 9.995673953434909e-05, "loss": 0.0081, "step": 3130 }, { "epoch": 1.8150289017341041, "grad_norm": 0.08346758782863617, "learning_rate": 9.995535327495068e-05, "loss": 0.0075, "step": 3140 }, { "epoch": 1.8208092485549132, "grad_norm": 0.05680999904870987, "learning_rate": 9.995394516338355e-05, "loss": 0.0046, "step": 3150 }, { "epoch": 1.8265895953757225, "grad_norm": 0.05832390487194061, "learning_rate": 9.995251520026367e-05, "loss": 0.0064, "step": 3160 }, { "epoch": 1.8323699421965318, "grad_norm": 0.08656340837478638, "learning_rate": 9.995106338621656e-05, "loss": 0.0047, "step": 3170 }, { "epoch": 1.838150289017341, "grad_norm": 0.10437380522489548, "learning_rate": 9.994958972187726e-05, "loss": 0.0065, "step": 3180 }, { "epoch": 1.8439306358381504, "grad_norm": 0.1443714201450348, "learning_rate": 9.994809420789044e-05, "loss": 0.0071, "step": 3190 }, { "epoch": 1.8497109826589595, "grad_norm": 0.16334494948387146, "learning_rate": 9.994657684491027e-05, "loss": 0.0055, "step": 3200 }, { "epoch": 1.8554913294797688, "grad_norm": 0.06717883050441742, "learning_rate": 9.994503763360048e-05, "loss": 0.0052, "step": 3210 }, { "epoch": 1.861271676300578, "grad_norm": 0.13523557782173157, "learning_rate": 9.99434765746344e-05, "loss": 0.0053, "step": 3220 }, { "epoch": 1.8670520231213872, "grad_norm": 0.0893624797463417, "learning_rate": 9.994189366869488e-05, "loss": 0.0054, "step": 3230 }, { "epoch": 1.8728323699421965, "grad_norm": 0.11412649601697922, "learning_rate": 9.994028891647433e-05, "loss": 0.0047, "step": 3240 }, { "epoch": 1.8786127167630058, "grad_norm": 0.12160219252109528, "learning_rate": 9.993866231867475e-05, "loss": 0.0059, "step": 3250 }, { "epoch": 1.8843930635838149, "grad_norm": 0.09526138007640839, "learning_rate": 9.993701387600762e-05, "loss": 0.0052, "step": 3260 }, { "epoch": 1.8901734104046244, "grad_norm": 0.09952362626791, "learning_rate": 9.993534358919408e-05, "loss": 0.005, "step": 3270 }, { "epoch": 1.8959537572254335, "grad_norm": 0.09673795849084854, "learning_rate": 9.993365145896473e-05, "loss": 0.0065, "step": 3280 }, { "epoch": 1.9017341040462428, "grad_norm": 0.10317232459783554, "learning_rate": 9.993193748605977e-05, "loss": 0.0052, "step": 3290 }, { "epoch": 1.907514450867052, "grad_norm": 0.12684407830238342, "learning_rate": 9.993020167122898e-05, "loss": 0.0051, "step": 3300 }, { "epoch": 1.9132947976878611, "grad_norm": 0.10689617693424225, "learning_rate": 9.992844401523164e-05, "loss": 0.0055, "step": 3310 }, { "epoch": 1.9190751445086707, "grad_norm": 0.11384792625904083, "learning_rate": 9.992666451883661e-05, "loss": 0.0043, "step": 3320 }, { "epoch": 1.9248554913294798, "grad_norm": 0.0782865509390831, "learning_rate": 9.99248631828223e-05, "loss": 0.0063, "step": 3330 }, { "epoch": 1.930635838150289, "grad_norm": 0.11839958280324936, "learning_rate": 9.99230400079767e-05, "loss": 0.0064, "step": 3340 }, { "epoch": 1.9364161849710984, "grad_norm": 0.07051268219947815, "learning_rate": 9.992119499509728e-05, "loss": 0.0057, "step": 3350 }, { "epoch": 1.9421965317919074, "grad_norm": 0.09824991226196289, "learning_rate": 9.991932814499114e-05, "loss": 0.0055, "step": 3360 }, { "epoch": 1.9479768786127167, "grad_norm": 0.09343143552541733, "learning_rate": 9.991743945847493e-05, "loss": 0.0046, "step": 3370 }, { "epoch": 1.953757225433526, "grad_norm": 0.09618880599737167, "learning_rate": 9.991552893637478e-05, "loss": 0.005, "step": 3380 }, { "epoch": 1.9595375722543351, "grad_norm": 0.14422376453876495, "learning_rate": 9.991359657952644e-05, "loss": 0.0058, "step": 3390 }, { "epoch": 1.9653179190751446, "grad_norm": 0.11654495447874069, "learning_rate": 9.991164238877519e-05, "loss": 0.0072, "step": 3400 }, { "epoch": 1.9710982658959537, "grad_norm": 0.16126972436904907, "learning_rate": 9.990966636497585e-05, "loss": 0.0056, "step": 3410 }, { "epoch": 1.976878612716763, "grad_norm": 0.07894255220890045, "learning_rate": 9.99076685089928e-05, "loss": 0.0049, "step": 3420 }, { "epoch": 1.9826589595375723, "grad_norm": 0.05879867449402809, "learning_rate": 9.990564882169998e-05, "loss": 0.0067, "step": 3430 }, { "epoch": 1.9884393063583814, "grad_norm": 0.0764334499835968, "learning_rate": 9.990360730398088e-05, "loss": 0.0052, "step": 3440 }, { "epoch": 1.9942196531791907, "grad_norm": 0.15867310762405396, "learning_rate": 9.990154395672849e-05, "loss": 0.0058, "step": 3450 }, { "epoch": 2.0, "grad_norm": 0.11241374164819717, "learning_rate": 9.989945878084541e-05, "loss": 0.0052, "step": 3460 }, { "epoch": 2.005780346820809, "grad_norm": 0.129130557179451, "learning_rate": 9.989735177724378e-05, "loss": 0.0058, "step": 3470 }, { "epoch": 2.0115606936416186, "grad_norm": 0.14215660095214844, "learning_rate": 9.989522294684526e-05, "loss": 0.0053, "step": 3480 }, { "epoch": 2.0173410404624277, "grad_norm": 0.1450652778148651, "learning_rate": 9.989307229058107e-05, "loss": 0.0058, "step": 3490 }, { "epoch": 2.023121387283237, "grad_norm": 0.08095835149288177, "learning_rate": 9.989089980939202e-05, "loss": 0.0042, "step": 3500 }, { "epoch": 2.0289017341040463, "grad_norm": 0.1411646455526352, "learning_rate": 9.988870550422835e-05, "loss": 0.0056, "step": 3510 }, { "epoch": 2.0346820809248554, "grad_norm": 0.10258983075618744, "learning_rate": 9.988648937604999e-05, "loss": 0.0042, "step": 3520 }, { "epoch": 2.040462427745665, "grad_norm": 0.0893290787935257, "learning_rate": 9.988425142582632e-05, "loss": 0.0049, "step": 3530 }, { "epoch": 2.046242774566474, "grad_norm": 0.09758899360895157, "learning_rate": 9.98819916545363e-05, "loss": 0.0106, "step": 3540 }, { "epoch": 2.052023121387283, "grad_norm": 0.11281487345695496, "learning_rate": 9.987971006316844e-05, "loss": 0.0061, "step": 3550 }, { "epoch": 2.0578034682080926, "grad_norm": 0.06897272169589996, "learning_rate": 9.987740665272077e-05, "loss": 0.006, "step": 3560 }, { "epoch": 2.0635838150289016, "grad_norm": 0.09770093113183975, "learning_rate": 9.98750814242009e-05, "loss": 0.0091, "step": 3570 }, { "epoch": 2.069364161849711, "grad_norm": 0.10911057144403458, "learning_rate": 9.987273437862594e-05, "loss": 0.0045, "step": 3580 }, { "epoch": 2.0751445086705202, "grad_norm": 0.14400342106819153, "learning_rate": 9.987036551702259e-05, "loss": 0.0072, "step": 3590 }, { "epoch": 2.0809248554913293, "grad_norm": 0.08694253116846085, "learning_rate": 9.986797484042706e-05, "loss": 0.0062, "step": 3600 }, { "epoch": 2.086705202312139, "grad_norm": 0.1378755420446396, "learning_rate": 9.986556234988512e-05, "loss": 0.0057, "step": 3610 }, { "epoch": 2.092485549132948, "grad_norm": 0.12104221433401108, "learning_rate": 9.986312804645205e-05, "loss": 0.0063, "step": 3620 }, { "epoch": 2.098265895953757, "grad_norm": 0.16360381245613098, "learning_rate": 9.986067193119273e-05, "loss": 0.0068, "step": 3630 }, { "epoch": 2.1040462427745665, "grad_norm": 0.1390710473060608, "learning_rate": 9.985819400518153e-05, "loss": 0.0061, "step": 3640 }, { "epoch": 2.1098265895953756, "grad_norm": 0.1273057609796524, "learning_rate": 9.985569426950239e-05, "loss": 0.007, "step": 3650 }, { "epoch": 2.115606936416185, "grad_norm": 0.09377237409353256, "learning_rate": 9.985317272524876e-05, "loss": 0.0042, "step": 3660 }, { "epoch": 2.121387283236994, "grad_norm": 0.09106730669736862, "learning_rate": 9.985062937352366e-05, "loss": 0.0051, "step": 3670 }, { "epoch": 2.1271676300578033, "grad_norm": 0.10553478449583054, "learning_rate": 9.984806421543966e-05, "loss": 0.0047, "step": 3680 }, { "epoch": 2.132947976878613, "grad_norm": 0.07460903376340866, "learning_rate": 9.984547725211881e-05, "loss": 0.005, "step": 3690 }, { "epoch": 2.138728323699422, "grad_norm": 0.1071394607424736, "learning_rate": 9.984286848469276e-05, "loss": 0.0055, "step": 3700 }, { "epoch": 2.1445086705202314, "grad_norm": 0.12161537259817123, "learning_rate": 9.984023791430266e-05, "loss": 0.0054, "step": 3710 }, { "epoch": 2.1502890173410405, "grad_norm": 0.08869685977697372, "learning_rate": 9.983758554209924e-05, "loss": 0.004, "step": 3720 }, { "epoch": 2.1560693641618496, "grad_norm": 0.1138811782002449, "learning_rate": 9.983491136924268e-05, "loss": 0.0046, "step": 3730 }, { "epoch": 2.161849710982659, "grad_norm": 0.14750473201274872, "learning_rate": 9.983221539690282e-05, "loss": 0.0054, "step": 3740 }, { "epoch": 2.167630057803468, "grad_norm": 0.11414989829063416, "learning_rate": 9.982949762625892e-05, "loss": 0.0053, "step": 3750 }, { "epoch": 2.1734104046242773, "grad_norm": 0.12458052486181259, "learning_rate": 9.982675805849986e-05, "loss": 0.0051, "step": 3760 }, { "epoch": 2.179190751445087, "grad_norm": 0.10004246979951859, "learning_rate": 9.982399669482399e-05, "loss": 0.0047, "step": 3770 }, { "epoch": 2.184971098265896, "grad_norm": 0.08482618629932404, "learning_rate": 9.982121353643924e-05, "loss": 0.0042, "step": 3780 }, { "epoch": 2.1907514450867054, "grad_norm": 0.08300045132637024, "learning_rate": 9.981840858456306e-05, "loss": 0.0063, "step": 3790 }, { "epoch": 2.1965317919075145, "grad_norm": 0.10703772306442261, "learning_rate": 9.981558184042243e-05, "loss": 0.0058, "step": 3800 }, { "epoch": 2.2023121387283235, "grad_norm": 0.12374887615442276, "learning_rate": 9.981273330525387e-05, "loss": 0.006, "step": 3810 }, { "epoch": 2.208092485549133, "grad_norm": 0.14318065345287323, "learning_rate": 9.980986298030341e-05, "loss": 0.0062, "step": 3820 }, { "epoch": 2.213872832369942, "grad_norm": 0.1355583518743515, "learning_rate": 9.980697086682662e-05, "loss": 0.0053, "step": 3830 }, { "epoch": 2.2196531791907512, "grad_norm": 0.13883447647094727, "learning_rate": 9.980405696608866e-05, "loss": 0.0046, "step": 3840 }, { "epoch": 2.2254335260115607, "grad_norm": 0.11007209122180939, "learning_rate": 9.980112127936409e-05, "loss": 0.0049, "step": 3850 }, { "epoch": 2.23121387283237, "grad_norm": 0.13813987374305725, "learning_rate": 9.979816380793717e-05, "loss": 0.005, "step": 3860 }, { "epoch": 2.2369942196531793, "grad_norm": 0.12208642065525055, "learning_rate": 9.979518455310151e-05, "loss": 0.0057, "step": 3870 }, { "epoch": 2.2427745664739884, "grad_norm": 0.10291685163974762, "learning_rate": 9.97921835161604e-05, "loss": 0.0074, "step": 3880 }, { "epoch": 2.2485549132947975, "grad_norm": 0.10677164793014526, "learning_rate": 9.978916069842656e-05, "loss": 0.0048, "step": 3890 }, { "epoch": 2.254335260115607, "grad_norm": 0.09314499795436859, "learning_rate": 9.97861161012223e-05, "loss": 0.0053, "step": 3900 }, { "epoch": 2.260115606936416, "grad_norm": 0.1184055432677269, "learning_rate": 9.978304972587942e-05, "loss": 0.0048, "step": 3910 }, { "epoch": 2.2658959537572256, "grad_norm": 0.10291741788387299, "learning_rate": 9.977996157373925e-05, "loss": 0.0055, "step": 3920 }, { "epoch": 2.2716763005780347, "grad_norm": 0.08237621188163757, "learning_rate": 9.977685164615265e-05, "loss": 0.0044, "step": 3930 }, { "epoch": 2.277456647398844, "grad_norm": 0.1022019162774086, "learning_rate": 9.977371994448002e-05, "loss": 0.0055, "step": 3940 }, { "epoch": 2.2832369942196533, "grad_norm": 0.09344568848609924, "learning_rate": 9.977056647009127e-05, "loss": 0.0057, "step": 3950 }, { "epoch": 2.2890173410404624, "grad_norm": 0.0971241444349289, "learning_rate": 9.976739122436582e-05, "loss": 0.0065, "step": 3960 }, { "epoch": 2.294797687861272, "grad_norm": 0.057168059051036835, "learning_rate": 9.976419420869265e-05, "loss": 0.0046, "step": 3970 }, { "epoch": 2.300578034682081, "grad_norm": 0.06407888233661652, "learning_rate": 9.976097542447025e-05, "loss": 0.0044, "step": 3980 }, { "epoch": 2.30635838150289, "grad_norm": 0.0672544315457344, "learning_rate": 9.97577348731066e-05, "loss": 0.0038, "step": 3990 }, { "epoch": 2.3121387283236996, "grad_norm": 0.07293885201215744, "learning_rate": 9.975447255601927e-05, "loss": 0.0058, "step": 4000 }, { "epoch": 2.3179190751445087, "grad_norm": 0.05307481437921524, "learning_rate": 9.975118847463525e-05, "loss": 0.0045, "step": 4010 }, { "epoch": 2.3236994219653178, "grad_norm": 0.1027441918849945, "learning_rate": 9.974788263039114e-05, "loss": 0.0055, "step": 4020 }, { "epoch": 2.3294797687861273, "grad_norm": 0.14790751039981842, "learning_rate": 9.974455502473303e-05, "loss": 0.0059, "step": 4030 }, { "epoch": 2.3352601156069364, "grad_norm": 0.14081086218357086, "learning_rate": 9.974120565911652e-05, "loss": 0.005, "step": 4040 }, { "epoch": 2.3410404624277454, "grad_norm": 0.12889717519283295, "learning_rate": 9.973783453500674e-05, "loss": 0.0046, "step": 4050 }, { "epoch": 2.346820809248555, "grad_norm": 0.11189638823270798, "learning_rate": 9.973444165387835e-05, "loss": 0.0054, "step": 4060 }, { "epoch": 2.352601156069364, "grad_norm": 0.11090421676635742, "learning_rate": 9.973102701721549e-05, "loss": 0.007, "step": 4070 }, { "epoch": 2.3583815028901736, "grad_norm": 0.13979388773441315, "learning_rate": 9.972759062651184e-05, "loss": 0.0048, "step": 4080 }, { "epoch": 2.3641618497109826, "grad_norm": 0.15473957359790802, "learning_rate": 9.972413248327059e-05, "loss": 0.0073, "step": 4090 }, { "epoch": 2.3699421965317917, "grad_norm": 0.1623980700969696, "learning_rate": 9.972065258900447e-05, "loss": 0.0051, "step": 4100 }, { "epoch": 2.3757225433526012, "grad_norm": 0.09650347381830215, "learning_rate": 9.971715094523569e-05, "loss": 0.0055, "step": 4110 }, { "epoch": 2.3815028901734103, "grad_norm": 0.10511574894189835, "learning_rate": 9.971362755349598e-05, "loss": 0.0055, "step": 4120 }, { "epoch": 2.38728323699422, "grad_norm": 0.11281032860279083, "learning_rate": 9.971008241532662e-05, "loss": 0.0046, "step": 4130 }, { "epoch": 2.393063583815029, "grad_norm": 0.10358737409114838, "learning_rate": 9.970651553227835e-05, "loss": 0.0053, "step": 4140 }, { "epoch": 2.398843930635838, "grad_norm": 0.09289064258337021, "learning_rate": 9.970292690591143e-05, "loss": 0.0064, "step": 4150 }, { "epoch": 2.4046242774566475, "grad_norm": 0.1077444851398468, "learning_rate": 9.969931653779569e-05, "loss": 0.0053, "step": 4160 }, { "epoch": 2.4104046242774566, "grad_norm": 0.11067981272935867, "learning_rate": 9.969568442951038e-05, "loss": 0.0059, "step": 4170 }, { "epoch": 2.416184971098266, "grad_norm": 0.09449609369039536, "learning_rate": 9.969203058264436e-05, "loss": 0.0054, "step": 4180 }, { "epoch": 2.421965317919075, "grad_norm": 0.12403042614459991, "learning_rate": 9.96883549987959e-05, "loss": 0.0046, "step": 4190 }, { "epoch": 2.4277456647398843, "grad_norm": 0.06456232070922852, "learning_rate": 9.968465767957287e-05, "loss": 0.0051, "step": 4200 }, { "epoch": 2.433526011560694, "grad_norm": 0.11479715257883072, "learning_rate": 9.968093862659256e-05, "loss": 0.0038, "step": 4210 }, { "epoch": 2.439306358381503, "grad_norm": 0.16075153648853302, "learning_rate": 9.967719784148182e-05, "loss": 0.0066, "step": 4220 }, { "epoch": 2.445086705202312, "grad_norm": 0.142628014087677, "learning_rate": 9.967343532587702e-05, "loss": 0.0064, "step": 4230 }, { "epoch": 2.4508670520231215, "grad_norm": 0.19917619228363037, "learning_rate": 9.966965108142399e-05, "loss": 0.0053, "step": 4240 }, { "epoch": 2.4566473988439306, "grad_norm": 0.10416891425848007, "learning_rate": 9.96658451097781e-05, "loss": 0.0046, "step": 4250 }, { "epoch": 2.4624277456647397, "grad_norm": 0.07751508057117462, "learning_rate": 9.966201741260419e-05, "loss": 0.0071, "step": 4260 }, { "epoch": 2.468208092485549, "grad_norm": 0.1693708747625351, "learning_rate": 9.965816799157665e-05, "loss": 0.0057, "step": 4270 }, { "epoch": 2.4739884393063583, "grad_norm": 0.12062357366085052, "learning_rate": 9.965429684837935e-05, "loss": 0.0054, "step": 4280 }, { "epoch": 2.479768786127168, "grad_norm": 0.11819230765104294, "learning_rate": 9.965040398470562e-05, "loss": 0.0047, "step": 4290 }, { "epoch": 2.485549132947977, "grad_norm": 0.07853759080171585, "learning_rate": 9.964648940225838e-05, "loss": 0.005, "step": 4300 }, { "epoch": 2.491329479768786, "grad_norm": 0.09461795538663864, "learning_rate": 9.964255310274997e-05, "loss": 0.004, "step": 4310 }, { "epoch": 2.4971098265895955, "grad_norm": 0.07458837330341339, "learning_rate": 9.963859508790228e-05, "loss": 0.0037, "step": 4320 }, { "epoch": 2.5028901734104045, "grad_norm": 0.07160282880067825, "learning_rate": 9.963461535944664e-05, "loss": 0.0036, "step": 4330 }, { "epoch": 2.508670520231214, "grad_norm": 0.08057725429534912, "learning_rate": 9.963061391912399e-05, "loss": 0.007, "step": 4340 }, { "epoch": 2.514450867052023, "grad_norm": 0.12584759294986725, "learning_rate": 9.962659076868463e-05, "loss": 0.0046, "step": 4350 }, { "epoch": 2.520231213872832, "grad_norm": 0.13452067971229553, "learning_rate": 9.962254590988846e-05, "loss": 0.0055, "step": 4360 }, { "epoch": 2.5260115606936417, "grad_norm": 0.09652341157197952, "learning_rate": 9.961847934450481e-05, "loss": 0.0046, "step": 4370 }, { "epoch": 2.531791907514451, "grad_norm": 0.13719667494297028, "learning_rate": 9.961439107431257e-05, "loss": 0.0052, "step": 4380 }, { "epoch": 2.5375722543352603, "grad_norm": 0.12765006721019745, "learning_rate": 9.961028110110006e-05, "loss": 0.0044, "step": 4390 }, { "epoch": 2.5433526011560694, "grad_norm": 0.10085717588663101, "learning_rate": 9.960614942666513e-05, "loss": 0.005, "step": 4400 }, { "epoch": 2.5491329479768785, "grad_norm": 0.04975849762558937, "learning_rate": 9.960199605281511e-05, "loss": 0.0038, "step": 4410 }, { "epoch": 2.5549132947976876, "grad_norm": 0.08230627328157425, "learning_rate": 9.959782098136683e-05, "loss": 0.0054, "step": 4420 }, { "epoch": 2.560693641618497, "grad_norm": 0.09695667773485184, "learning_rate": 9.959362421414662e-05, "loss": 0.0042, "step": 4430 }, { "epoch": 2.5664739884393066, "grad_norm": 0.11576048284769058, "learning_rate": 9.958940575299027e-05, "loss": 0.005, "step": 4440 }, { "epoch": 2.5722543352601157, "grad_norm": 0.07925650477409363, "learning_rate": 9.95851655997431e-05, "loss": 0.0044, "step": 4450 }, { "epoch": 2.578034682080925, "grad_norm": 0.08335477858781815, "learning_rate": 9.958090375625986e-05, "loss": 0.0058, "step": 4460 }, { "epoch": 2.583815028901734, "grad_norm": 0.09603440761566162, "learning_rate": 9.957662022440486e-05, "loss": 0.0054, "step": 4470 }, { "epoch": 2.5895953757225434, "grad_norm": 0.10469746589660645, "learning_rate": 9.957231500605187e-05, "loss": 0.0045, "step": 4480 }, { "epoch": 2.5953757225433525, "grad_norm": 0.11201290041208267, "learning_rate": 9.95679881030841e-05, "loss": 0.0074, "step": 4490 }, { "epoch": 2.601156069364162, "grad_norm": 0.08057098835706711, "learning_rate": 9.95636395173943e-05, "loss": 0.0053, "step": 4500 }, { "epoch": 2.606936416184971, "grad_norm": 0.10965419560670853, "learning_rate": 9.95592692508847e-05, "loss": 0.0069, "step": 4510 }, { "epoch": 2.61271676300578, "grad_norm": 0.08233583718538284, "learning_rate": 9.9554877305467e-05, "loss": 0.0057, "step": 4520 }, { "epoch": 2.6184971098265897, "grad_norm": 0.07618734985589981, "learning_rate": 9.955046368306237e-05, "loss": 0.0053, "step": 4530 }, { "epoch": 2.6242774566473988, "grad_norm": 0.08831852674484253, "learning_rate": 9.954602838560153e-05, "loss": 0.0067, "step": 4540 }, { "epoch": 2.6300578034682083, "grad_norm": 0.11426381021738052, "learning_rate": 9.954157141502456e-05, "loss": 0.0049, "step": 4550 }, { "epoch": 2.6358381502890174, "grad_norm": 0.08342878520488739, "learning_rate": 9.953709277328112e-05, "loss": 0.005, "step": 4560 }, { "epoch": 2.6416184971098264, "grad_norm": 0.09040774405002594, "learning_rate": 9.953259246233032e-05, "loss": 0.0048, "step": 4570 }, { "epoch": 2.647398843930636, "grad_norm": 0.08589833974838257, "learning_rate": 9.952807048414077e-05, "loss": 0.0039, "step": 4580 }, { "epoch": 2.653179190751445, "grad_norm": 0.05775552615523338, "learning_rate": 9.95235268406905e-05, "loss": 0.0065, "step": 4590 }, { "epoch": 2.6589595375722546, "grad_norm": 0.058433182537555695, "learning_rate": 9.951896153396708e-05, "loss": 0.0041, "step": 4600 }, { "epoch": 2.6647398843930636, "grad_norm": 0.10195489972829819, "learning_rate": 9.95143745659675e-05, "loss": 0.0075, "step": 4610 }, { "epoch": 2.6705202312138727, "grad_norm": 0.0639519989490509, "learning_rate": 9.95097659386983e-05, "loss": 0.0045, "step": 4620 }, { "epoch": 2.6763005780346822, "grad_norm": 0.07609432190656662, "learning_rate": 9.950513565417542e-05, "loss": 0.0052, "step": 4630 }, { "epoch": 2.6820809248554913, "grad_norm": 0.10391134768724442, "learning_rate": 9.95004837144243e-05, "loss": 0.0045, "step": 4640 }, { "epoch": 2.687861271676301, "grad_norm": 0.11459513753652573, "learning_rate": 9.949581012147988e-05, "loss": 0.0052, "step": 4650 }, { "epoch": 2.69364161849711, "grad_norm": 0.12960045039653778, "learning_rate": 9.949111487738653e-05, "loss": 0.0082, "step": 4660 }, { "epoch": 2.699421965317919, "grad_norm": 0.19312746822834015, "learning_rate": 9.948639798419813e-05, "loss": 0.0053, "step": 4670 }, { "epoch": 2.705202312138728, "grad_norm": 0.15751752257347107, "learning_rate": 9.948165944397799e-05, "loss": 0.0065, "step": 4680 }, { "epoch": 2.7109826589595376, "grad_norm": 0.10588734596967697, "learning_rate": 9.94768992587989e-05, "loss": 0.0055, "step": 4690 }, { "epoch": 2.7167630057803467, "grad_norm": 0.0877213329076767, "learning_rate": 9.947211743074313e-05, "loss": 0.0054, "step": 4700 }, { "epoch": 2.722543352601156, "grad_norm": 0.11511239409446716, "learning_rate": 9.946731396190246e-05, "loss": 0.0056, "step": 4710 }, { "epoch": 2.7283236994219653, "grad_norm": 0.10262319445610046, "learning_rate": 9.946248885437803e-05, "loss": 0.0062, "step": 4720 }, { "epoch": 2.7341040462427744, "grad_norm": 0.07959464937448502, "learning_rate": 9.945764211028053e-05, "loss": 0.0051, "step": 4730 }, { "epoch": 2.739884393063584, "grad_norm": 0.08185133337974548, "learning_rate": 9.94527737317301e-05, "loss": 0.0055, "step": 4740 }, { "epoch": 2.745664739884393, "grad_norm": 0.09177955985069275, "learning_rate": 9.944788372085631e-05, "loss": 0.0049, "step": 4750 }, { "epoch": 2.7514450867052025, "grad_norm": 0.08497302234172821, "learning_rate": 9.944297207979825e-05, "loss": 0.0048, "step": 4760 }, { "epoch": 2.7572254335260116, "grad_norm": 0.07832589000463486, "learning_rate": 9.943803881070441e-05, "loss": 0.0044, "step": 4770 }, { "epoch": 2.7630057803468207, "grad_norm": 0.08363822847604752, "learning_rate": 9.943308391573278e-05, "loss": 0.0079, "step": 4780 }, { "epoch": 2.76878612716763, "grad_norm": 0.06930410116910934, "learning_rate": 9.942810739705079e-05, "loss": 0.0042, "step": 4790 }, { "epoch": 2.7745664739884393, "grad_norm": 0.08685854077339172, "learning_rate": 9.942310925683538e-05, "loss": 0.0055, "step": 4800 }, { "epoch": 2.7803468208092488, "grad_norm": 0.10434369742870331, "learning_rate": 9.941808949727285e-05, "loss": 0.0041, "step": 4810 }, { "epoch": 2.786127167630058, "grad_norm": 0.07987218350172043, "learning_rate": 9.941304812055903e-05, "loss": 0.0041, "step": 4820 }, { "epoch": 2.791907514450867, "grad_norm": 0.0995841845870018, "learning_rate": 9.940798512889921e-05, "loss": 0.0048, "step": 4830 }, { "epoch": 2.7976878612716765, "grad_norm": 0.1205340102314949, "learning_rate": 9.94029005245081e-05, "loss": 0.0054, "step": 4840 }, { "epoch": 2.8034682080924855, "grad_norm": 0.1322150081396103, "learning_rate": 9.939779430960988e-05, "loss": 0.0061, "step": 4850 }, { "epoch": 2.809248554913295, "grad_norm": 0.09959442168474197, "learning_rate": 9.939266648643817e-05, "loss": 0.0048, "step": 4860 }, { "epoch": 2.815028901734104, "grad_norm": 0.08988264948129654, "learning_rate": 9.938751705723607e-05, "loss": 0.0063, "step": 4870 }, { "epoch": 2.820809248554913, "grad_norm": 0.08533381670713425, "learning_rate": 9.938234602425613e-05, "loss": 0.0041, "step": 4880 }, { "epoch": 2.8265895953757223, "grad_norm": 0.10899991542100906, "learning_rate": 9.93771533897603e-05, "loss": 0.0051, "step": 4890 }, { "epoch": 2.832369942196532, "grad_norm": 0.10073061287403107, "learning_rate": 9.937193915602004e-05, "loss": 0.005, "step": 4900 }, { "epoch": 2.838150289017341, "grad_norm": 0.10269533097743988, "learning_rate": 9.936670332531621e-05, "loss": 0.0058, "step": 4910 }, { "epoch": 2.8439306358381504, "grad_norm": 0.08050937205553055, "learning_rate": 9.936144589993916e-05, "loss": 0.0054, "step": 4920 }, { "epoch": 2.8497109826589595, "grad_norm": 0.09727004170417786, "learning_rate": 9.935616688218867e-05, "loss": 0.0042, "step": 4930 }, { "epoch": 2.8554913294797686, "grad_norm": 0.07916481792926788, "learning_rate": 9.935086627437395e-05, "loss": 0.0048, "step": 4940 }, { "epoch": 2.861271676300578, "grad_norm": 0.1361258327960968, "learning_rate": 9.934554407881366e-05, "loss": 0.0106, "step": 4950 }, { "epoch": 2.867052023121387, "grad_norm": 0.1195870116353035, "learning_rate": 9.934020029783593e-05, "loss": 0.0045, "step": 4960 }, { "epoch": 2.8728323699421967, "grad_norm": 0.0782453715801239, "learning_rate": 9.933483493377829e-05, "loss": 0.0045, "step": 4970 }, { "epoch": 2.878612716763006, "grad_norm": 0.064698226749897, "learning_rate": 9.932944798898774e-05, "loss": 0.0047, "step": 4980 }, { "epoch": 2.884393063583815, "grad_norm": 0.07942969352006912, "learning_rate": 9.932403946582072e-05, "loss": 0.008, "step": 4990 }, { "epoch": 2.8901734104046244, "grad_norm": 0.09807512909173965, "learning_rate": 9.93186093666431e-05, "loss": 0.0058, "step": 5000 }, { "epoch": 2.8959537572254335, "grad_norm": 0.10162036120891571, "learning_rate": 9.931315769383018e-05, "loss": 0.0042, "step": 5010 }, { "epoch": 2.901734104046243, "grad_norm": 0.09182185679674149, "learning_rate": 9.930768444976672e-05, "loss": 0.0098, "step": 5020 }, { "epoch": 2.907514450867052, "grad_norm": 0.09831009805202484, "learning_rate": 9.93021896368469e-05, "loss": 0.0056, "step": 5030 }, { "epoch": 2.913294797687861, "grad_norm": 0.09399363398551941, "learning_rate": 9.929667325747431e-05, "loss": 0.0045, "step": 5040 }, { "epoch": 2.9190751445086707, "grad_norm": 0.07335769385099411, "learning_rate": 9.929113531406205e-05, "loss": 0.0044, "step": 5050 }, { "epoch": 2.9248554913294798, "grad_norm": 0.056684091687202454, "learning_rate": 9.928557580903257e-05, "loss": 0.0068, "step": 5060 }, { "epoch": 2.9306358381502893, "grad_norm": 0.07843078672885895, "learning_rate": 9.927999474481779e-05, "loss": 0.0052, "step": 5070 }, { "epoch": 2.9364161849710984, "grad_norm": 0.08666455745697021, "learning_rate": 9.927439212385907e-05, "loss": 0.004, "step": 5080 }, { "epoch": 2.9421965317919074, "grad_norm": 0.13427752256393433, "learning_rate": 9.926876794860718e-05, "loss": 0.0084, "step": 5090 }, { "epoch": 2.9479768786127165, "grad_norm": 0.08947822451591492, "learning_rate": 9.926312222152235e-05, "loss": 0.0057, "step": 5100 }, { "epoch": 2.953757225433526, "grad_norm": 0.09052816778421402, "learning_rate": 9.925745494507414e-05, "loss": 0.0057, "step": 5110 }, { "epoch": 2.959537572254335, "grad_norm": 0.09049113839864731, "learning_rate": 9.925176612174169e-05, "loss": 0.0052, "step": 5120 }, { "epoch": 2.9653179190751446, "grad_norm": 0.09374893456697464, "learning_rate": 9.924605575401346e-05, "loss": 0.0049, "step": 5130 }, { "epoch": 2.9710982658959537, "grad_norm": 0.10202183574438095, "learning_rate": 9.924032384438733e-05, "loss": 0.0047, "step": 5140 }, { "epoch": 2.976878612716763, "grad_norm": 0.12373793870210648, "learning_rate": 9.923457039537066e-05, "loss": 0.0072, "step": 5150 }, { "epoch": 2.9826589595375723, "grad_norm": 0.12671178579330444, "learning_rate": 9.92287954094802e-05, "loss": 0.0047, "step": 5160 }, { "epoch": 2.9884393063583814, "grad_norm": 0.13292868435382843, "learning_rate": 9.922299888924212e-05, "loss": 0.0055, "step": 5170 }, { "epoch": 2.994219653179191, "grad_norm": 0.0895073339343071, "learning_rate": 9.921718083719203e-05, "loss": 0.0039, "step": 5180 }, { "epoch": 3.0, "grad_norm": 0.09845282137393951, "learning_rate": 9.921134125587491e-05, "loss": 0.0051, "step": 5190 }, { "epoch": 3.005780346820809, "grad_norm": 0.17806030809879303, "learning_rate": 9.920548014784523e-05, "loss": 0.0052, "step": 5200 }, { "epoch": 3.0115606936416186, "grad_norm": 0.09359262883663177, "learning_rate": 9.919959751566681e-05, "loss": 0.0049, "step": 5210 }, { "epoch": 3.0173410404624277, "grad_norm": 0.14200544357299805, "learning_rate": 9.919369336191291e-05, "loss": 0.005, "step": 5220 }, { "epoch": 3.023121387283237, "grad_norm": 0.08623161911964417, "learning_rate": 9.918776768916625e-05, "loss": 0.0042, "step": 5230 }, { "epoch": 3.0289017341040463, "grad_norm": 0.0641922876238823, "learning_rate": 9.918182050001888e-05, "loss": 0.0047, "step": 5240 }, { "epoch": 3.0346820809248554, "grad_norm": 0.08116123825311661, "learning_rate": 9.91758517970723e-05, "loss": 0.0054, "step": 5250 }, { "epoch": 3.040462427745665, "grad_norm": 0.07552453875541687, "learning_rate": 9.916986158293744e-05, "loss": 0.0048, "step": 5260 }, { "epoch": 3.046242774566474, "grad_norm": 0.09604078531265259, "learning_rate": 9.916384986023463e-05, "loss": 0.0055, "step": 5270 }, { "epoch": 3.052023121387283, "grad_norm": 0.08662170171737671, "learning_rate": 9.915781663159359e-05, "loss": 0.004, "step": 5280 }, { "epoch": 3.0578034682080926, "grad_norm": 0.07890532165765762, "learning_rate": 9.915176189965346e-05, "loss": 0.0049, "step": 5290 }, { "epoch": 3.0635838150289016, "grad_norm": 0.07367055118083954, "learning_rate": 9.914568566706279e-05, "loss": 0.0043, "step": 5300 }, { "epoch": 3.069364161849711, "grad_norm": 0.05489453300833702, "learning_rate": 9.913958793647953e-05, "loss": 0.0047, "step": 5310 }, { "epoch": 3.0751445086705202, "grad_norm": 0.04112934693694115, "learning_rate": 9.913346871057102e-05, "loss": 0.0037, "step": 5320 }, { "epoch": 3.0809248554913293, "grad_norm": 0.06684437394142151, "learning_rate": 9.912732799201403e-05, "loss": 0.0051, "step": 5330 }, { "epoch": 3.086705202312139, "grad_norm": 0.06671774387359619, "learning_rate": 9.912116578349474e-05, "loss": 0.0043, "step": 5340 }, { "epoch": 3.092485549132948, "grad_norm": 0.07309421896934509, "learning_rate": 9.911498208770867e-05, "loss": 0.0041, "step": 5350 }, { "epoch": 3.098265895953757, "grad_norm": 0.08895828574895859, "learning_rate": 9.910877690736078e-05, "loss": 0.0038, "step": 5360 }, { "epoch": 3.1040462427745665, "grad_norm": 0.05753541737794876, "learning_rate": 9.910255024516546e-05, "loss": 0.0046, "step": 5370 }, { "epoch": 3.1098265895953756, "grad_norm": 0.1112755537033081, "learning_rate": 9.909630210384644e-05, "loss": 0.005, "step": 5380 }, { "epoch": 3.115606936416185, "grad_norm": 0.07616247981786728, "learning_rate": 9.909003248613688e-05, "loss": 0.0055, "step": 5390 }, { "epoch": 3.121387283236994, "grad_norm": 0.09826352447271347, "learning_rate": 9.908374139477934e-05, "loss": 0.0048, "step": 5400 }, { "epoch": 3.1271676300578033, "grad_norm": 0.10984566062688828, "learning_rate": 9.907742883252571e-05, "loss": 0.0059, "step": 5410 }, { "epoch": 3.132947976878613, "grad_norm": 0.10086532682180405, "learning_rate": 9.907109480213736e-05, "loss": 0.0052, "step": 5420 }, { "epoch": 3.138728323699422, "grad_norm": 0.0686434879899025, "learning_rate": 9.906473930638498e-05, "loss": 0.0049, "step": 5430 }, { "epoch": 3.1445086705202314, "grad_norm": 0.08263476192951202, "learning_rate": 9.90583623480487e-05, "loss": 0.004, "step": 5440 }, { "epoch": 3.1502890173410405, "grad_norm": 0.08829036355018616, "learning_rate": 9.905196392991802e-05, "loss": 0.0047, "step": 5450 }, { "epoch": 3.1560693641618496, "grad_norm": 0.07960904389619827, "learning_rate": 9.904554405479183e-05, "loss": 0.0052, "step": 5460 }, { "epoch": 3.161849710982659, "grad_norm": 0.13208231329917908, "learning_rate": 9.903910272547838e-05, "loss": 0.005, "step": 5470 }, { "epoch": 3.167630057803468, "grad_norm": 0.0960419774055481, "learning_rate": 9.903263994479533e-05, "loss": 0.0044, "step": 5480 }, { "epoch": 3.1734104046242773, "grad_norm": 0.10696357488632202, "learning_rate": 9.902615571556972e-05, "loss": 0.0044, "step": 5490 }, { "epoch": 3.179190751445087, "grad_norm": 0.12074033915996552, "learning_rate": 9.9019650040638e-05, "loss": 0.0054, "step": 5500 }, { "epoch": 3.184971098265896, "grad_norm": 0.05908733978867531, "learning_rate": 9.901312292284594e-05, "loss": 0.005, "step": 5510 }, { "epoch": 3.1907514450867054, "grad_norm": 0.08794012665748596, "learning_rate": 9.900657436504875e-05, "loss": 0.005, "step": 5520 }, { "epoch": 3.1965317919075145, "grad_norm": 0.08090519905090332, "learning_rate": 9.900000437011097e-05, "loss": 0.0048, "step": 5530 }, { "epoch": 3.2023121387283235, "grad_norm": 0.06776402145624161, "learning_rate": 9.899341294090653e-05, "loss": 0.0032, "step": 5540 }, { "epoch": 3.208092485549133, "grad_norm": 0.09918577969074249, "learning_rate": 9.898680008031877e-05, "loss": 0.0053, "step": 5550 }, { "epoch": 3.213872832369942, "grad_norm": 0.11192861199378967, "learning_rate": 9.898016579124037e-05, "loss": 0.0069, "step": 5560 }, { "epoch": 3.2196531791907512, "grad_norm": 0.0986863300204277, "learning_rate": 9.89735100765734e-05, "loss": 0.0043, "step": 5570 }, { "epoch": 3.2254335260115607, "grad_norm": 0.10766062885522842, "learning_rate": 9.89668329392293e-05, "loss": 0.0083, "step": 5580 }, { "epoch": 3.23121387283237, "grad_norm": 0.11381088942289352, "learning_rate": 9.896013438212885e-05, "loss": 0.0051, "step": 5590 }, { "epoch": 3.2369942196531793, "grad_norm": 0.13150329887866974, "learning_rate": 9.895341440820225e-05, "loss": 0.004, "step": 5600 }, { "epoch": 3.2427745664739884, "grad_norm": 0.07293879985809326, "learning_rate": 9.894667302038902e-05, "loss": 0.004, "step": 5610 }, { "epoch": 3.2485549132947975, "grad_norm": 0.09569191187620163, "learning_rate": 9.893991022163812e-05, "loss": 0.0049, "step": 5620 }, { "epoch": 3.254335260115607, "grad_norm": 0.10423526167869568, "learning_rate": 9.893312601490779e-05, "loss": 0.0039, "step": 5630 }, { "epoch": 3.260115606936416, "grad_norm": 0.0878419354557991, "learning_rate": 9.892632040316568e-05, "loss": 0.0038, "step": 5640 }, { "epoch": 3.2658959537572256, "grad_norm": 0.08911574631929398, "learning_rate": 9.891949338938877e-05, "loss": 0.0049, "step": 5650 }, { "epoch": 3.2716763005780347, "grad_norm": 0.07388553023338318, "learning_rate": 9.89126449765635e-05, "loss": 0.004, "step": 5660 }, { "epoch": 3.277456647398844, "grad_norm": 0.09998785704374313, "learning_rate": 9.890577516768551e-05, "loss": 0.0076, "step": 5670 }, { "epoch": 3.2832369942196533, "grad_norm": 0.10120239108800888, "learning_rate": 9.889888396575995e-05, "loss": 0.0051, "step": 5680 }, { "epoch": 3.2890173410404624, "grad_norm": 0.10796697437763214, "learning_rate": 9.889197137380125e-05, "loss": 0.0063, "step": 5690 }, { "epoch": 3.294797687861272, "grad_norm": 0.07642047852277756, "learning_rate": 9.888503739483321e-05, "loss": 0.0046, "step": 5700 }, { "epoch": 3.300578034682081, "grad_norm": 0.1060706302523613, "learning_rate": 9.887808203188897e-05, "loss": 0.0046, "step": 5710 }, { "epoch": 3.30635838150289, "grad_norm": 0.07729043066501617, "learning_rate": 9.887110528801108e-05, "loss": 0.0043, "step": 5720 }, { "epoch": 3.3121387283236996, "grad_norm": 0.07654937356710434, "learning_rate": 9.886410716625137e-05, "loss": 0.0044, "step": 5730 }, { "epoch": 3.3179190751445087, "grad_norm": 0.10756026208400726, "learning_rate": 9.885708766967108e-05, "loss": 0.0045, "step": 5740 }, { "epoch": 3.3236994219653178, "grad_norm": 0.09746664017438889, "learning_rate": 9.885004680134076e-05, "loss": 0.0053, "step": 5750 }, { "epoch": 3.3294797687861273, "grad_norm": 0.06226349249482155, "learning_rate": 9.884298456434033e-05, "loss": 0.0036, "step": 5760 }, { "epoch": 3.3352601156069364, "grad_norm": 0.076237753033638, "learning_rate": 9.883590096175905e-05, "loss": 0.0046, "step": 5770 }, { "epoch": 3.3410404624277454, "grad_norm": 0.09564107656478882, "learning_rate": 9.882879599669554e-05, "loss": 0.0036, "step": 5780 }, { "epoch": 3.346820809248555, "grad_norm": 0.11294952034950256, "learning_rate": 9.882166967225775e-05, "loss": 0.0044, "step": 5790 }, { "epoch": 3.352601156069364, "grad_norm": 0.10337945073843002, "learning_rate": 9.881452199156296e-05, "loss": 0.0042, "step": 5800 }, { "epoch": 3.3583815028901736, "grad_norm": 0.08041290193796158, "learning_rate": 9.880735295773782e-05, "loss": 0.0034, "step": 5810 }, { "epoch": 3.3641618497109826, "grad_norm": 0.08811701089143753, "learning_rate": 9.88001625739183e-05, "loss": 0.0046, "step": 5820 }, { "epoch": 3.3699421965317917, "grad_norm": 0.08728475868701935, "learning_rate": 9.879295084324976e-05, "loss": 0.0035, "step": 5830 }, { "epoch": 3.3757225433526012, "grad_norm": 0.07673713564872742, "learning_rate": 9.87857177688868e-05, "loss": 0.004, "step": 5840 }, { "epoch": 3.3815028901734103, "grad_norm": 0.04676206782460213, "learning_rate": 9.877846335399343e-05, "loss": 0.0042, "step": 5850 }, { "epoch": 3.38728323699422, "grad_norm": 0.1315360814332962, "learning_rate": 9.8771187601743e-05, "loss": 0.0073, "step": 5860 }, { "epoch": 3.393063583815029, "grad_norm": 0.11324327439069748, "learning_rate": 9.876389051531813e-05, "loss": 0.0069, "step": 5870 }, { "epoch": 3.398843930635838, "grad_norm": 0.1283894181251526, "learning_rate": 9.875657209791088e-05, "loss": 0.0045, "step": 5880 }, { "epoch": 3.4046242774566475, "grad_norm": 0.09971684217453003, "learning_rate": 9.874923235272248e-05, "loss": 0.006, "step": 5890 }, { "epoch": 3.4104046242774566, "grad_norm": 0.08040836453437805, "learning_rate": 9.874187128296369e-05, "loss": 0.0056, "step": 5900 }, { "epoch": 3.416184971098266, "grad_norm": 0.09063635766506195, "learning_rate": 9.873448889185439e-05, "loss": 0.0046, "step": 5910 }, { "epoch": 3.421965317919075, "grad_norm": 0.09057789295911789, "learning_rate": 9.872708518262397e-05, "loss": 0.0077, "step": 5920 }, { "epoch": 3.4277456647398843, "grad_norm": 0.08726388961076736, "learning_rate": 9.871966015851101e-05, "loss": 0.0043, "step": 5930 }, { "epoch": 3.433526011560694, "grad_norm": 0.06119583174586296, "learning_rate": 9.87122138227635e-05, "loss": 0.0045, "step": 5940 }, { "epoch": 3.439306358381503, "grad_norm": 0.059672337025403976, "learning_rate": 9.87047461786387e-05, "loss": 0.0038, "step": 5950 }, { "epoch": 3.445086705202312, "grad_norm": 0.08882300555706024, "learning_rate": 9.869725722940323e-05, "loss": 0.0043, "step": 5960 }, { "epoch": 3.4508670520231215, "grad_norm": 0.09898592531681061, "learning_rate": 9.868974697833299e-05, "loss": 0.0041, "step": 5970 }, { "epoch": 3.4566473988439306, "grad_norm": 0.054888781160116196, "learning_rate": 9.868221542871324e-05, "loss": 0.0037, "step": 5980 }, { "epoch": 3.4624277456647397, "grad_norm": 0.07501913607120514, "learning_rate": 9.867466258383853e-05, "loss": 0.0054, "step": 5990 }, { "epoch": 3.468208092485549, "grad_norm": 0.10471879690885544, "learning_rate": 9.866708844701272e-05, "loss": 0.0053, "step": 6000 }, { "epoch": 3.4739884393063583, "grad_norm": 0.14950262010097504, "learning_rate": 9.865949302154899e-05, "loss": 0.0061, "step": 6010 }, { "epoch": 3.479768786127168, "grad_norm": 0.14313995838165283, "learning_rate": 9.865187631076987e-05, "loss": 0.0048, "step": 6020 }, { "epoch": 3.485549132947977, "grad_norm": 0.09261684119701385, "learning_rate": 9.864423831800712e-05, "loss": 0.0041, "step": 6030 }, { "epoch": 3.491329479768786, "grad_norm": 0.10116013884544373, "learning_rate": 9.863657904660191e-05, "loss": 0.0049, "step": 6040 }, { "epoch": 3.4971098265895955, "grad_norm": 0.10000468790531158, "learning_rate": 9.862889849990462e-05, "loss": 0.0042, "step": 6050 }, { "epoch": 3.5028901734104045, "grad_norm": 0.08636848628520966, "learning_rate": 9.8621196681275e-05, "loss": 0.0039, "step": 6060 }, { "epoch": 3.508670520231214, "grad_norm": 0.09822828322649002, "learning_rate": 9.861347359408211e-05, "loss": 0.0041, "step": 6070 }, { "epoch": 3.514450867052023, "grad_norm": 0.06331049650907516, "learning_rate": 9.860572924170426e-05, "loss": 0.0035, "step": 6080 }, { "epoch": 3.520231213872832, "grad_norm": 0.09790698438882828, "learning_rate": 9.85979636275291e-05, "loss": 0.0048, "step": 6090 }, { "epoch": 3.5260115606936417, "grad_norm": 0.09691625833511353, "learning_rate": 9.859017675495357e-05, "loss": 0.0037, "step": 6100 }, { "epoch": 3.531791907514451, "grad_norm": 0.08491610735654831, "learning_rate": 9.858236862738392e-05, "loss": 0.0045, "step": 6110 }, { "epoch": 3.5375722543352603, "grad_norm": 0.09983476251363754, "learning_rate": 9.85745392482357e-05, "loss": 0.0039, "step": 6120 }, { "epoch": 3.5433526011560694, "grad_norm": 0.06305649131536484, "learning_rate": 9.856668862093372e-05, "loss": 0.0035, "step": 6130 }, { "epoch": 3.5491329479768785, "grad_norm": 0.08348894864320755, "learning_rate": 9.855881674891214e-05, "loss": 0.0052, "step": 6140 }, { "epoch": 3.5549132947976876, "grad_norm": 0.07188717275857925, "learning_rate": 9.855092363561437e-05, "loss": 0.0044, "step": 6150 }, { "epoch": 3.560693641618497, "grad_norm": 0.07703947275876999, "learning_rate": 9.854300928449312e-05, "loss": 0.0043, "step": 6160 }, { "epoch": 3.5664739884393066, "grad_norm": 0.11623793840408325, "learning_rate": 9.85350736990104e-05, "loss": 0.0048, "step": 6170 }, { "epoch": 3.5722543352601157, "grad_norm": 0.07263840734958649, "learning_rate": 9.852711688263752e-05, "loss": 0.0043, "step": 6180 }, { "epoch": 3.578034682080925, "grad_norm": 0.08389998972415924, "learning_rate": 9.851913883885503e-05, "loss": 0.0053, "step": 6190 }, { "epoch": 3.583815028901734, "grad_norm": 0.08913276344537735, "learning_rate": 9.851113957115283e-05, "loss": 0.0054, "step": 6200 }, { "epoch": 3.5895953757225434, "grad_norm": 0.09740209579467773, "learning_rate": 9.850311908303005e-05, "loss": 0.0095, "step": 6210 }, { "epoch": 3.5953757225433525, "grad_norm": 0.0637655109167099, "learning_rate": 9.849507737799514e-05, "loss": 0.0043, "step": 6220 }, { "epoch": 3.601156069364162, "grad_norm": 0.1079898402094841, "learning_rate": 9.84870144595658e-05, "loss": 0.0043, "step": 6230 }, { "epoch": 3.606936416184971, "grad_norm": 0.06550470739603043, "learning_rate": 9.847893033126903e-05, "loss": 0.0036, "step": 6240 }, { "epoch": 3.61271676300578, "grad_norm": 0.07686427980661392, "learning_rate": 9.847082499664111e-05, "loss": 0.006, "step": 6250 }, { "epoch": 3.6184971098265897, "grad_norm": 0.0707835853099823, "learning_rate": 9.846269845922758e-05, "loss": 0.008, "step": 6260 }, { "epoch": 3.6242774566473988, "grad_norm": 0.051362019032239914, "learning_rate": 9.845455072258326e-05, "loss": 0.0043, "step": 6270 }, { "epoch": 3.6300578034682083, "grad_norm": 0.06963202357292175, "learning_rate": 9.844638179027226e-05, "loss": 0.0042, "step": 6280 }, { "epoch": 3.6358381502890174, "grad_norm": 0.08382728695869446, "learning_rate": 9.843819166586795e-05, "loss": 0.0047, "step": 6290 }, { "epoch": 3.6416184971098264, "grad_norm": 0.06943295150995255, "learning_rate": 9.842998035295294e-05, "loss": 0.0059, "step": 6300 }, { "epoch": 3.647398843930636, "grad_norm": 0.08568401634693146, "learning_rate": 9.842174785511918e-05, "loss": 0.0039, "step": 6310 }, { "epoch": 3.653179190751445, "grad_norm": 0.05123983696103096, "learning_rate": 9.841349417596779e-05, "loss": 0.0039, "step": 6320 }, { "epoch": 3.6589595375722546, "grad_norm": 0.08430792391300201, "learning_rate": 9.840521931910926e-05, "loss": 0.0038, "step": 6330 }, { "epoch": 3.6647398843930636, "grad_norm": 0.07086501270532608, "learning_rate": 9.839692328816327e-05, "loss": 0.0046, "step": 6340 }, { "epoch": 3.6705202312138727, "grad_norm": 0.08472570031881332, "learning_rate": 9.838860608675879e-05, "loss": 0.0048, "step": 6350 }, { "epoch": 3.6763005780346822, "grad_norm": 0.07034272700548172, "learning_rate": 9.838026771853406e-05, "loss": 0.0046, "step": 6360 }, { "epoch": 3.6820809248554913, "grad_norm": 0.05583292618393898, "learning_rate": 9.837190818713655e-05, "loss": 0.0054, "step": 6370 }, { "epoch": 3.687861271676301, "grad_norm": 0.05575549602508545, "learning_rate": 9.836352749622299e-05, "loss": 0.0048, "step": 6380 }, { "epoch": 3.69364161849711, "grad_norm": 0.0586441271007061, "learning_rate": 9.835512564945941e-05, "loss": 0.0036, "step": 6390 }, { "epoch": 3.699421965317919, "grad_norm": 0.0920369029045105, "learning_rate": 9.834670265052104e-05, "loss": 0.0083, "step": 6400 }, { "epoch": 3.705202312138728, "grad_norm": 0.11683415621519089, "learning_rate": 9.833825850309239e-05, "loss": 0.0042, "step": 6410 }, { "epoch": 3.7109826589595376, "grad_norm": 0.06055425852537155, "learning_rate": 9.832979321086723e-05, "loss": 0.004, "step": 6420 }, { "epoch": 3.7167630057803467, "grad_norm": 0.07325479388237, "learning_rate": 9.832130677754854e-05, "loss": 0.0044, "step": 6430 }, { "epoch": 3.722543352601156, "grad_norm": 0.07589448988437653, "learning_rate": 9.831279920684861e-05, "loss": 0.0048, "step": 6440 }, { "epoch": 3.7283236994219653, "grad_norm": 0.10280989110469818, "learning_rate": 9.830427050248891e-05, "loss": 0.0127, "step": 6450 }, { "epoch": 3.7341040462427744, "grad_norm": 0.08522803336381912, "learning_rate": 9.829572066820021e-05, "loss": 0.0067, "step": 6460 }, { "epoch": 3.739884393063584, "grad_norm": 0.10551602393388748, "learning_rate": 9.828714970772247e-05, "loss": 0.0044, "step": 6470 }, { "epoch": 3.745664739884393, "grad_norm": 0.1057896614074707, "learning_rate": 9.827855762480493e-05, "loss": 0.0062, "step": 6480 }, { "epoch": 3.7514450867052025, "grad_norm": 0.08756575733423233, "learning_rate": 9.826994442320608e-05, "loss": 0.0048, "step": 6490 }, { "epoch": 3.7572254335260116, "grad_norm": 0.06838827580213547, "learning_rate": 9.82613101066936e-05, "loss": 0.0054, "step": 6500 }, { "epoch": 3.7630057803468207, "grad_norm": 0.10524417459964752, "learning_rate": 9.825265467904445e-05, "loss": 0.0048, "step": 6510 }, { "epoch": 3.76878612716763, "grad_norm": 0.16791360080242157, "learning_rate": 9.82439781440448e-05, "loss": 0.0048, "step": 6520 }, { "epoch": 3.7745664739884393, "grad_norm": 0.10500805824995041, "learning_rate": 9.823528050549006e-05, "loss": 0.0044, "step": 6530 }, { "epoch": 3.7803468208092488, "grad_norm": 0.10242146253585815, "learning_rate": 9.822656176718487e-05, "loss": 0.0047, "step": 6540 }, { "epoch": 3.786127167630058, "grad_norm": 0.12535038590431213, "learning_rate": 9.82178219329431e-05, "loss": 0.0043, "step": 6550 }, { "epoch": 3.791907514450867, "grad_norm": 0.10232984274625778, "learning_rate": 9.820906100658789e-05, "loss": 0.0035, "step": 6560 }, { "epoch": 3.7976878612716765, "grad_norm": 0.09715797007083893, "learning_rate": 9.82002789919515e-05, "loss": 0.0047, "step": 6570 }, { "epoch": 3.8034682080924855, "grad_norm": 0.10294798761606216, "learning_rate": 9.819147589287554e-05, "loss": 0.0042, "step": 6580 }, { "epoch": 3.809248554913295, "grad_norm": 0.06270695477724075, "learning_rate": 9.818265171321074e-05, "loss": 0.0042, "step": 6590 }, { "epoch": 3.815028901734104, "grad_norm": 0.10261218994855881, "learning_rate": 9.817380645681714e-05, "loss": 0.0052, "step": 6600 }, { "epoch": 3.820809248554913, "grad_norm": 0.07845862209796906, "learning_rate": 9.816494012756392e-05, "loss": 0.0037, "step": 6610 }, { "epoch": 3.8265895953757223, "grad_norm": 0.08661410212516785, "learning_rate": 9.815605272932953e-05, "loss": 0.0068, "step": 6620 }, { "epoch": 3.832369942196532, "grad_norm": 0.07012265175580978, "learning_rate": 9.814714426600162e-05, "loss": 0.0049, "step": 6630 }, { "epoch": 3.838150289017341, "grad_norm": 0.11753591895103455, "learning_rate": 9.813821474147704e-05, "loss": 0.0066, "step": 6640 }, { "epoch": 3.8439306358381504, "grad_norm": 0.10362179577350616, "learning_rate": 9.812926415966189e-05, "loss": 0.0041, "step": 6650 }, { "epoch": 3.8497109826589595, "grad_norm": 0.09316336363554001, "learning_rate": 9.812029252447143e-05, "loss": 0.0043, "step": 6660 }, { "epoch": 3.8554913294797686, "grad_norm": 0.10363206267356873, "learning_rate": 9.811129983983018e-05, "loss": 0.0041, "step": 6670 }, { "epoch": 3.861271676300578, "grad_norm": 0.0667562484741211, "learning_rate": 9.810228610967185e-05, "loss": 0.0071, "step": 6680 }, { "epoch": 3.867052023121387, "grad_norm": 0.12337341159582138, "learning_rate": 9.809325133793934e-05, "loss": 0.0053, "step": 6690 }, { "epoch": 3.8728323699421967, "grad_norm": 0.09047921746969223, "learning_rate": 9.808419552858477e-05, "loss": 0.0048, "step": 6700 }, { "epoch": 3.878612716763006, "grad_norm": 0.07758771628141403, "learning_rate": 9.807511868556944e-05, "loss": 0.0045, "step": 6710 }, { "epoch": 3.884393063583815, "grad_norm": 0.06790055334568024, "learning_rate": 9.806602081286392e-05, "loss": 0.0041, "step": 6720 }, { "epoch": 3.8901734104046244, "grad_norm": 0.09480784088373184, "learning_rate": 9.805690191444785e-05, "loss": 0.0052, "step": 6730 }, { "epoch": 3.8959537572254335, "grad_norm": 0.09276645630598068, "learning_rate": 9.804776199431022e-05, "loss": 0.0039, "step": 6740 }, { "epoch": 3.901734104046243, "grad_norm": 0.11566020548343658, "learning_rate": 9.80386010564491e-05, "loss": 0.0047, "step": 6750 }, { "epoch": 3.907514450867052, "grad_norm": 0.06422998756170273, "learning_rate": 9.802941910487183e-05, "loss": 0.0032, "step": 6760 }, { "epoch": 3.913294797687861, "grad_norm": 0.10603903234004974, "learning_rate": 9.802021614359487e-05, "loss": 0.0048, "step": 6770 }, { "epoch": 3.9190751445086707, "grad_norm": 0.12690307199954987, "learning_rate": 9.801099217664394e-05, "loss": 0.0043, "step": 6780 }, { "epoch": 3.9248554913294798, "grad_norm": 0.12509547173976898, "learning_rate": 9.800174720805388e-05, "loss": 0.0048, "step": 6790 }, { "epoch": 3.9306358381502893, "grad_norm": 0.11259645968675613, "learning_rate": 9.799248124186878e-05, "loss": 0.0068, "step": 6800 }, { "epoch": 3.9364161849710984, "grad_norm": 0.08851031213998795, "learning_rate": 9.798319428214187e-05, "loss": 0.0039, "step": 6810 }, { "epoch": 3.9421965317919074, "grad_norm": 0.10880535840988159, "learning_rate": 9.79738863329356e-05, "loss": 0.0069, "step": 6820 }, { "epoch": 3.9479768786127165, "grad_norm": 0.10324430465698242, "learning_rate": 9.79645573983216e-05, "loss": 0.0047, "step": 6830 }, { "epoch": 3.953757225433526, "grad_norm": 0.09232804924249649, "learning_rate": 9.795520748238062e-05, "loss": 0.0039, "step": 6840 }, { "epoch": 3.959537572254335, "grad_norm": 0.07403694838285446, "learning_rate": 9.794583658920264e-05, "loss": 0.0044, "step": 6850 }, { "epoch": 3.9653179190751446, "grad_norm": 0.05542504042387009, "learning_rate": 9.793644472288683e-05, "loss": 0.0034, "step": 6860 }, { "epoch": 3.9710982658959537, "grad_norm": 0.07085678726434708, "learning_rate": 9.79270318875415e-05, "loss": 0.005, "step": 6870 }, { "epoch": 3.976878612716763, "grad_norm": 0.0798199251294136, "learning_rate": 9.791759808728416e-05, "loss": 0.0038, "step": 6880 }, { "epoch": 3.9826589595375723, "grad_norm": 0.07342632114887238, "learning_rate": 9.790814332624143e-05, "loss": 0.0029, "step": 6890 }, { "epoch": 3.9884393063583814, "grad_norm": 0.07653181254863739, "learning_rate": 9.789866760854919e-05, "loss": 0.0041, "step": 6900 }, { "epoch": 3.994219653179191, "grad_norm": 0.06251738965511322, "learning_rate": 9.788917093835242e-05, "loss": 0.0079, "step": 6910 }, { "epoch": 4.0, "grad_norm": 0.07752005755901337, "learning_rate": 9.787965331980528e-05, "loss": 0.004, "step": 6920 }, { "epoch": 4.005780346820809, "grad_norm": 0.10855576395988464, "learning_rate": 9.787011475707111e-05, "loss": 0.0054, "step": 6930 }, { "epoch": 4.011560693641618, "grad_norm": 0.11480893194675446, "learning_rate": 9.786055525432241e-05, "loss": 0.006, "step": 6940 }, { "epoch": 4.017341040462428, "grad_norm": 0.1400570124387741, "learning_rate": 9.78509748157408e-05, "loss": 0.0044, "step": 6950 }, { "epoch": 4.023121387283237, "grad_norm": 0.10771267861127853, "learning_rate": 9.784137344551713e-05, "loss": 0.0052, "step": 6960 }, { "epoch": 4.028901734104046, "grad_norm": 0.08657637983560562, "learning_rate": 9.783175114785134e-05, "loss": 0.0051, "step": 6970 }, { "epoch": 4.034682080924855, "grad_norm": 0.07646024227142334, "learning_rate": 9.782210792695254e-05, "loss": 0.0039, "step": 6980 }, { "epoch": 4.040462427745664, "grad_norm": 0.08171222358942032, "learning_rate": 9.781244378703904e-05, "loss": 0.0047, "step": 6990 }, { "epoch": 4.046242774566474, "grad_norm": 0.07491995394229889, "learning_rate": 9.780275873233824e-05, "loss": 0.0037, "step": 7000 }, { "epoch": 4.0520231213872835, "grad_norm": 0.13858239352703094, "learning_rate": 9.77930527670867e-05, "loss": 0.0052, "step": 7010 }, { "epoch": 4.057803468208093, "grad_norm": 0.09935212135314941, "learning_rate": 9.778332589553016e-05, "loss": 0.0051, "step": 7020 }, { "epoch": 4.063583815028902, "grad_norm": 0.07191646099090576, "learning_rate": 9.777357812192349e-05, "loss": 0.0044, "step": 7030 }, { "epoch": 4.069364161849711, "grad_norm": 0.07832925766706467, "learning_rate": 9.776380945053066e-05, "loss": 0.004, "step": 7040 }, { "epoch": 4.07514450867052, "grad_norm": 0.07194431871175766, "learning_rate": 9.775401988562486e-05, "loss": 0.0046, "step": 7050 }, { "epoch": 4.08092485549133, "grad_norm": 0.07636430859565735, "learning_rate": 9.774420943148835e-05, "loss": 0.0037, "step": 7060 }, { "epoch": 4.086705202312139, "grad_norm": 0.054698288440704346, "learning_rate": 9.773437809241257e-05, "loss": 0.0042, "step": 7070 }, { "epoch": 4.092485549132948, "grad_norm": 0.06244511529803276, "learning_rate": 9.772452587269808e-05, "loss": 0.0037, "step": 7080 }, { "epoch": 4.098265895953757, "grad_norm": 0.05341937392950058, "learning_rate": 9.771465277665457e-05, "loss": 0.0045, "step": 7090 }, { "epoch": 4.104046242774566, "grad_norm": 0.06179845705628395, "learning_rate": 9.770475880860089e-05, "loss": 0.0038, "step": 7100 }, { "epoch": 4.109826589595376, "grad_norm": 0.08358588814735413, "learning_rate": 9.769484397286495e-05, "loss": 0.0044, "step": 7110 }, { "epoch": 4.115606936416185, "grad_norm": 0.07372605055570602, "learning_rate": 9.768490827378388e-05, "loss": 0.0042, "step": 7120 }, { "epoch": 4.121387283236994, "grad_norm": 0.08515487611293793, "learning_rate": 9.767495171570387e-05, "loss": 0.0036, "step": 7130 }, { "epoch": 4.127167630057803, "grad_norm": 0.06824032217264175, "learning_rate": 9.766497430298027e-05, "loss": 0.005, "step": 7140 }, { "epoch": 4.132947976878612, "grad_norm": 0.06346060335636139, "learning_rate": 9.765497603997754e-05, "loss": 0.0044, "step": 7150 }, { "epoch": 4.138728323699422, "grad_norm": 0.13387760519981384, "learning_rate": 9.764495693106926e-05, "loss": 0.0045, "step": 7160 }, { "epoch": 4.144508670520231, "grad_norm": 0.10241768509149551, "learning_rate": 9.76349169806381e-05, "loss": 0.0044, "step": 7170 }, { "epoch": 4.1502890173410405, "grad_norm": 0.09653797745704651, "learning_rate": 9.762485619307591e-05, "loss": 0.0036, "step": 7180 }, { "epoch": 4.15606936416185, "grad_norm": 0.09247767180204391, "learning_rate": 9.76147745727836e-05, "loss": 0.0038, "step": 7190 }, { "epoch": 4.161849710982659, "grad_norm": 0.08832607418298721, "learning_rate": 9.760467212417124e-05, "loss": 0.0044, "step": 7200 }, { "epoch": 4.167630057803469, "grad_norm": 0.12045808881521225, "learning_rate": 9.759454885165796e-05, "loss": 0.0049, "step": 7210 }, { "epoch": 4.173410404624278, "grad_norm": 0.07522665709257126, "learning_rate": 9.7584404759672e-05, "loss": 0.0038, "step": 7220 }, { "epoch": 4.179190751445087, "grad_norm": 0.0632644072175026, "learning_rate": 9.75742398526508e-05, "loss": 0.0034, "step": 7230 }, { "epoch": 4.184971098265896, "grad_norm": 0.08073872327804565, "learning_rate": 9.756405413504077e-05, "loss": 0.0043, "step": 7240 }, { "epoch": 4.190751445086705, "grad_norm": 0.075049988925457, "learning_rate": 9.755384761129752e-05, "loss": 0.0033, "step": 7250 }, { "epoch": 4.196531791907514, "grad_norm": 0.09683282673358917, "learning_rate": 9.754362028588572e-05, "loss": 0.0068, "step": 7260 }, { "epoch": 4.202312138728324, "grad_norm": 0.08358323574066162, "learning_rate": 9.753337216327917e-05, "loss": 0.0039, "step": 7270 }, { "epoch": 4.208092485549133, "grad_norm": 0.09701034426689148, "learning_rate": 9.752310324796073e-05, "loss": 0.004, "step": 7280 }, { "epoch": 4.213872832369942, "grad_norm": 0.072493776679039, "learning_rate": 9.751281354442235e-05, "loss": 0.0048, "step": 7290 }, { "epoch": 4.219653179190751, "grad_norm": 0.07446051388978958, "learning_rate": 9.750250305716513e-05, "loss": 0.0034, "step": 7300 }, { "epoch": 4.22543352601156, "grad_norm": 0.09709165245294571, "learning_rate": 9.749217179069923e-05, "loss": 0.0043, "step": 7310 }, { "epoch": 4.23121387283237, "grad_norm": 0.09105303138494492, "learning_rate": 9.748181974954386e-05, "loss": 0.0036, "step": 7320 }, { "epoch": 4.236994219653179, "grad_norm": 0.08110969513654709, "learning_rate": 9.74714469382274e-05, "loss": 0.0039, "step": 7330 }, { "epoch": 4.242774566473988, "grad_norm": 0.07299605756998062, "learning_rate": 9.746105336128724e-05, "loss": 0.0036, "step": 7340 }, { "epoch": 4.2485549132947975, "grad_norm": 0.10487416386604309, "learning_rate": 9.745063902326988e-05, "loss": 0.0046, "step": 7350 }, { "epoch": 4.254335260115607, "grad_norm": 0.04577060043811798, "learning_rate": 9.744020392873093e-05, "loss": 0.0036, "step": 7360 }, { "epoch": 4.2601156069364166, "grad_norm": 0.10059268027544022, "learning_rate": 9.742974808223504e-05, "loss": 0.0078, "step": 7370 }, { "epoch": 4.265895953757226, "grad_norm": 0.06948348879814148, "learning_rate": 9.741927148835596e-05, "loss": 0.0053, "step": 7380 }, { "epoch": 4.271676300578035, "grad_norm": 0.051713019609451294, "learning_rate": 9.740877415167648e-05, "loss": 0.0034, "step": 7390 }, { "epoch": 4.277456647398844, "grad_norm": 0.057397518306970596, "learning_rate": 9.739825607678854e-05, "loss": 0.0033, "step": 7400 }, { "epoch": 4.283236994219653, "grad_norm": 0.05364494025707245, "learning_rate": 9.738771726829308e-05, "loss": 0.0032, "step": 7410 }, { "epoch": 4.289017341040463, "grad_norm": 0.06555718183517456, "learning_rate": 9.737715773080011e-05, "loss": 0.0041, "step": 7420 }, { "epoch": 4.294797687861272, "grad_norm": 0.07478900998830795, "learning_rate": 9.736657746892877e-05, "loss": 0.0037, "step": 7430 }, { "epoch": 4.300578034682081, "grad_norm": 0.09410209208726883, "learning_rate": 9.735597648730718e-05, "loss": 0.0045, "step": 7440 }, { "epoch": 4.30635838150289, "grad_norm": 0.09506339579820633, "learning_rate": 9.734535479057262e-05, "loss": 0.0053, "step": 7450 }, { "epoch": 4.312138728323699, "grad_norm": 0.09616143256425858, "learning_rate": 9.733471238337136e-05, "loss": 0.005, "step": 7460 }, { "epoch": 4.317919075144509, "grad_norm": 0.09171663224697113, "learning_rate": 9.732404927035871e-05, "loss": 0.0055, "step": 7470 }, { "epoch": 4.323699421965318, "grad_norm": 0.06696099787950516, "learning_rate": 9.731336545619915e-05, "loss": 0.0037, "step": 7480 }, { "epoch": 4.329479768786127, "grad_norm": 0.14176106452941895, "learning_rate": 9.73026609455661e-05, "loss": 0.0046, "step": 7490 }, { "epoch": 4.335260115606936, "grad_norm": 0.08920951187610626, "learning_rate": 9.72919357431421e-05, "loss": 0.0039, "step": 7500 }, { "epoch": 4.341040462427745, "grad_norm": 0.061191197484731674, "learning_rate": 9.72811898536187e-05, "loss": 0.0047, "step": 7510 }, { "epoch": 4.3468208092485545, "grad_norm": 0.08400767296552658, "learning_rate": 9.72704232816965e-05, "loss": 0.0061, "step": 7520 }, { "epoch": 4.3526011560693645, "grad_norm": 0.05825735628604889, "learning_rate": 9.725963603208519e-05, "loss": 0.0031, "step": 7530 }, { "epoch": 4.358381502890174, "grad_norm": 0.06253275275230408, "learning_rate": 9.724882810950348e-05, "loss": 0.0032, "step": 7540 }, { "epoch": 4.364161849710983, "grad_norm": 0.09076927602291107, "learning_rate": 9.723799951867913e-05, "loss": 0.0048, "step": 7550 }, { "epoch": 4.369942196531792, "grad_norm": 0.14996130764484406, "learning_rate": 9.722715026434889e-05, "loss": 0.0036, "step": 7560 }, { "epoch": 4.375722543352601, "grad_norm": 0.09381493926048279, "learning_rate": 9.721628035125866e-05, "loss": 0.0053, "step": 7570 }, { "epoch": 4.381502890173411, "grad_norm": 0.06940823793411255, "learning_rate": 9.720538978416325e-05, "loss": 0.0034, "step": 7580 }, { "epoch": 4.38728323699422, "grad_norm": 0.1408386528491974, "learning_rate": 9.71944785678266e-05, "loss": 0.0051, "step": 7590 }, { "epoch": 4.393063583815029, "grad_norm": 0.07540041208267212, "learning_rate": 9.718354670702161e-05, "loss": 0.0031, "step": 7600 }, { "epoch": 4.398843930635838, "grad_norm": 0.04325016215443611, "learning_rate": 9.717259420653029e-05, "loss": 0.0047, "step": 7610 }, { "epoch": 4.404624277456647, "grad_norm": 0.15856178104877472, "learning_rate": 9.716162107114361e-05, "loss": 0.0073, "step": 7620 }, { "epoch": 4.410404624277457, "grad_norm": 0.08500056713819504, "learning_rate": 9.71506273056616e-05, "loss": 0.0052, "step": 7630 }, { "epoch": 4.416184971098266, "grad_norm": 0.1474866420030594, "learning_rate": 9.713961291489331e-05, "loss": 0.0073, "step": 7640 }, { "epoch": 4.421965317919075, "grad_norm": 0.14277051389217377, "learning_rate": 9.71285779036568e-05, "loss": 0.0056, "step": 7650 }, { "epoch": 4.427745664739884, "grad_norm": 0.10464701056480408, "learning_rate": 9.711752227677916e-05, "loss": 0.0039, "step": 7660 }, { "epoch": 4.433526011560693, "grad_norm": 0.09123163670301437, "learning_rate": 9.710644603909649e-05, "loss": 0.004, "step": 7670 }, { "epoch": 4.4393063583815024, "grad_norm": 0.06720109283924103, "learning_rate": 9.709534919545393e-05, "loss": 0.0039, "step": 7680 }, { "epoch": 4.445086705202312, "grad_norm": 0.09755236655473709, "learning_rate": 9.708423175070563e-05, "loss": 0.0047, "step": 7690 }, { "epoch": 4.4508670520231215, "grad_norm": 0.05199611186981201, "learning_rate": 9.707309370971468e-05, "loss": 0.0039, "step": 7700 }, { "epoch": 4.456647398843931, "grad_norm": 0.08835835009813309, "learning_rate": 9.70619350773533e-05, "loss": 0.0048, "step": 7710 }, { "epoch": 4.46242774566474, "grad_norm": 0.09824372082948685, "learning_rate": 9.70507558585026e-05, "loss": 0.006, "step": 7720 }, { "epoch": 4.468208092485549, "grad_norm": 0.06983206421136856, "learning_rate": 9.703955605805281e-05, "loss": 0.0042, "step": 7730 }, { "epoch": 4.473988439306359, "grad_norm": 0.05261417105793953, "learning_rate": 9.702833568090306e-05, "loss": 0.0033, "step": 7740 }, { "epoch": 4.479768786127168, "grad_norm": 0.044182635843753815, "learning_rate": 9.701709473196154e-05, "loss": 0.0043, "step": 7750 }, { "epoch": 4.485549132947977, "grad_norm": 0.1114625409245491, "learning_rate": 9.700583321614541e-05, "loss": 0.0035, "step": 7760 }, { "epoch": 4.491329479768786, "grad_norm": 0.08481371402740479, "learning_rate": 9.699455113838085e-05, "loss": 0.006, "step": 7770 }, { "epoch": 4.497109826589595, "grad_norm": 0.09450369328260422, "learning_rate": 9.698324850360303e-05, "loss": 0.0042, "step": 7780 }, { "epoch": 4.502890173410405, "grad_norm": 0.09334803372621536, "learning_rate": 9.69719253167561e-05, "loss": 0.0047, "step": 7790 }, { "epoch": 4.508670520231214, "grad_norm": 0.0968085527420044, "learning_rate": 9.69605815827932e-05, "loss": 0.0076, "step": 7800 }, { "epoch": 4.514450867052023, "grad_norm": 0.09961452335119247, "learning_rate": 9.694921730667647e-05, "loss": 0.0045, "step": 7810 }, { "epoch": 4.520231213872832, "grad_norm": 0.0660637617111206, "learning_rate": 9.693783249337705e-05, "loss": 0.0042, "step": 7820 }, { "epoch": 4.526011560693641, "grad_norm": 0.0900220274925232, "learning_rate": 9.692642714787501e-05, "loss": 0.0041, "step": 7830 }, { "epoch": 4.531791907514451, "grad_norm": 0.11947119981050491, "learning_rate": 9.691500127515945e-05, "loss": 0.0079, "step": 7840 }, { "epoch": 4.53757225433526, "grad_norm": 0.09257648140192032, "learning_rate": 9.690355488022844e-05, "loss": 0.0044, "step": 7850 }, { "epoch": 4.543352601156069, "grad_norm": 0.10505222529172897, "learning_rate": 9.689208796808902e-05, "loss": 0.0041, "step": 7860 }, { "epoch": 4.5491329479768785, "grad_norm": 0.12087495625019073, "learning_rate": 9.68806005437572e-05, "loss": 0.0063, "step": 7870 }, { "epoch": 4.554913294797688, "grad_norm": 0.08448950946331024, "learning_rate": 9.686909261225796e-05, "loss": 0.0047, "step": 7880 }, { "epoch": 4.5606936416184976, "grad_norm": 0.16504046320915222, "learning_rate": 9.685756417862531e-05, "loss": 0.0047, "step": 7890 }, { "epoch": 4.566473988439307, "grad_norm": 0.10585096478462219, "learning_rate": 9.684601524790212e-05, "loss": 0.0054, "step": 7900 }, { "epoch": 4.572254335260116, "grad_norm": 0.11554574221372604, "learning_rate": 9.68344458251403e-05, "loss": 0.0047, "step": 7910 }, { "epoch": 4.578034682080925, "grad_norm": 0.09645393490791321, "learning_rate": 9.682285591540072e-05, "loss": 0.0036, "step": 7920 }, { "epoch": 4.583815028901734, "grad_norm": 0.0977889746427536, "learning_rate": 9.681124552375322e-05, "loss": 0.0035, "step": 7930 }, { "epoch": 4.589595375722544, "grad_norm": 0.08475830405950546, "learning_rate": 9.679961465527654e-05, "loss": 0.0039, "step": 7940 }, { "epoch": 4.595375722543353, "grad_norm": 0.06024939566850662, "learning_rate": 9.678796331505843e-05, "loss": 0.0034, "step": 7950 }, { "epoch": 4.601156069364162, "grad_norm": 0.05883932113647461, "learning_rate": 9.677629150819558e-05, "loss": 0.0034, "step": 7960 }, { "epoch": 4.606936416184971, "grad_norm": 0.09573355317115784, "learning_rate": 9.676459923979366e-05, "loss": 0.0037, "step": 7970 }, { "epoch": 4.61271676300578, "grad_norm": 0.06836744397878647, "learning_rate": 9.675288651496723e-05, "loss": 0.0037, "step": 7980 }, { "epoch": 4.618497109826589, "grad_norm": 0.07681182771921158, "learning_rate": 9.674115333883986e-05, "loss": 0.0053, "step": 7990 }, { "epoch": 4.624277456647399, "grad_norm": 0.07006292045116425, "learning_rate": 9.672939971654402e-05, "loss": 0.0039, "step": 8000 }, { "epoch": 4.630057803468208, "grad_norm": 0.05592655763030052, "learning_rate": 9.671762565322117e-05, "loss": 0.0034, "step": 8010 }, { "epoch": 4.635838150289017, "grad_norm": 0.1108817607164383, "learning_rate": 9.670583115402166e-05, "loss": 0.0043, "step": 8020 }, { "epoch": 4.641618497109826, "grad_norm": 0.06929147988557816, "learning_rate": 9.669401622410482e-05, "loss": 0.0036, "step": 8030 }, { "epoch": 4.6473988439306355, "grad_norm": 0.05909327417612076, "learning_rate": 9.668218086863887e-05, "loss": 0.0037, "step": 8040 }, { "epoch": 4.653179190751445, "grad_norm": 0.06097660958766937, "learning_rate": 9.667032509280103e-05, "loss": 0.0047, "step": 8050 }, { "epoch": 4.658959537572255, "grad_norm": 0.07727665454149246, "learning_rate": 9.665844890177743e-05, "loss": 0.0033, "step": 8060 }, { "epoch": 4.664739884393064, "grad_norm": 0.09187764674425125, "learning_rate": 9.664655230076309e-05, "loss": 0.0032, "step": 8070 }, { "epoch": 4.670520231213873, "grad_norm": 0.06976690143346786, "learning_rate": 9.663463529496199e-05, "loss": 0.0043, "step": 8080 }, { "epoch": 4.676300578034682, "grad_norm": 0.08565140515565872, "learning_rate": 9.662269788958705e-05, "loss": 0.0033, "step": 8090 }, { "epoch": 4.682080924855491, "grad_norm": 0.08254718780517578, "learning_rate": 9.661074008986008e-05, "loss": 0.0046, "step": 8100 }, { "epoch": 4.687861271676301, "grad_norm": 0.08245521783828735, "learning_rate": 9.659876190101184e-05, "loss": 0.0036, "step": 8110 }, { "epoch": 4.69364161849711, "grad_norm": 0.06963507831096649, "learning_rate": 9.6586763328282e-05, "loss": 0.0042, "step": 8120 }, { "epoch": 4.699421965317919, "grad_norm": 0.06510617583990097, "learning_rate": 9.657474437691913e-05, "loss": 0.0034, "step": 8130 }, { "epoch": 4.705202312138728, "grad_norm": 0.08015453070402145, "learning_rate": 9.656270505218073e-05, "loss": 0.0033, "step": 8140 }, { "epoch": 4.710982658959537, "grad_norm": 0.08493061363697052, "learning_rate": 9.655064535933321e-05, "loss": 0.0039, "step": 8150 }, { "epoch": 4.716763005780347, "grad_norm": 0.0690418928861618, "learning_rate": 9.653856530365189e-05, "loss": 0.0041, "step": 8160 }, { "epoch": 4.722543352601156, "grad_norm": 0.06100199371576309, "learning_rate": 9.6526464890421e-05, "loss": 0.0033, "step": 8170 }, { "epoch": 4.728323699421965, "grad_norm": 0.12600620090961456, "learning_rate": 9.651434412493367e-05, "loss": 0.004, "step": 8180 }, { "epoch": 4.734104046242774, "grad_norm": 0.08088401705026627, "learning_rate": 9.650220301249195e-05, "loss": 0.0036, "step": 8190 }, { "epoch": 4.7398843930635834, "grad_norm": 0.05150901526212692, "learning_rate": 9.649004155840675e-05, "loss": 0.0037, "step": 8200 }, { "epoch": 4.745664739884393, "grad_norm": 0.08037382364273071, "learning_rate": 9.647785976799792e-05, "loss": 0.005, "step": 8210 }, { "epoch": 4.7514450867052025, "grad_norm": 0.08356626331806183, "learning_rate": 9.646565764659417e-05, "loss": 0.0042, "step": 8220 }, { "epoch": 4.757225433526012, "grad_norm": 0.09985048323869705, "learning_rate": 9.645343519953314e-05, "loss": 0.0046, "step": 8230 }, { "epoch": 4.763005780346821, "grad_norm": 0.06431553512811661, "learning_rate": 9.644119243216135e-05, "loss": 0.0042, "step": 8240 }, { "epoch": 4.76878612716763, "grad_norm": 0.07443758845329285, "learning_rate": 9.642892934983417e-05, "loss": 0.004, "step": 8250 }, { "epoch": 4.77456647398844, "grad_norm": 0.16717828810214996, "learning_rate": 9.641664595791593e-05, "loss": 0.0045, "step": 8260 }, { "epoch": 4.780346820809249, "grad_norm": 0.1435498297214508, "learning_rate": 9.640434226177977e-05, "loss": 0.0037, "step": 8270 }, { "epoch": 4.786127167630058, "grad_norm": 0.0821315199136734, "learning_rate": 9.639201826680777e-05, "loss": 0.0044, "step": 8280 }, { "epoch": 4.791907514450867, "grad_norm": 0.09831906110048294, "learning_rate": 9.637967397839083e-05, "loss": 0.0044, "step": 8290 }, { "epoch": 4.797687861271676, "grad_norm": 0.06787078827619553, "learning_rate": 9.63673094019288e-05, "loss": 0.0041, "step": 8300 }, { "epoch": 4.803468208092486, "grad_norm": 0.04841814562678337, "learning_rate": 9.635492454283035e-05, "loss": 0.0031, "step": 8310 }, { "epoch": 4.809248554913295, "grad_norm": 0.06711428612470627, "learning_rate": 9.634251940651302e-05, "loss": 0.0043, "step": 8320 }, { "epoch": 4.815028901734104, "grad_norm": 0.07030502706766129, "learning_rate": 9.633009399840327e-05, "loss": 0.0054, "step": 8330 }, { "epoch": 4.820809248554913, "grad_norm": 0.07398594170808792, "learning_rate": 9.631764832393639e-05, "loss": 0.0044, "step": 8340 }, { "epoch": 4.826589595375722, "grad_norm": 0.085907481610775, "learning_rate": 9.630518238855651e-05, "loss": 0.0067, "step": 8350 }, { "epoch": 4.832369942196532, "grad_norm": 0.10180293768644333, "learning_rate": 9.629269619771668e-05, "loss": 0.0041, "step": 8360 }, { "epoch": 4.838150289017341, "grad_norm": 0.09444575756788254, "learning_rate": 9.628018975687878e-05, "loss": 0.0043, "step": 8370 }, { "epoch": 4.84393063583815, "grad_norm": 0.10237473249435425, "learning_rate": 9.626766307151355e-05, "loss": 0.0074, "step": 8380 }, { "epoch": 4.8497109826589595, "grad_norm": 0.07820641994476318, "learning_rate": 9.62551161471006e-05, "loss": 0.0035, "step": 8390 }, { "epoch": 4.855491329479769, "grad_norm": 0.059219833463430405, "learning_rate": 9.624254898912837e-05, "loss": 0.004, "step": 8400 }, { "epoch": 4.861271676300578, "grad_norm": 0.08172249048948288, "learning_rate": 9.622996160309414e-05, "loss": 0.0042, "step": 8410 }, { "epoch": 4.867052023121388, "grad_norm": 0.07609304040670395, "learning_rate": 9.62173539945041e-05, "loss": 0.0031, "step": 8420 }, { "epoch": 4.872832369942197, "grad_norm": 0.07762991636991501, "learning_rate": 9.620472616887323e-05, "loss": 0.0037, "step": 8430 }, { "epoch": 4.878612716763006, "grad_norm": 0.11632972210645676, "learning_rate": 9.619207813172536e-05, "loss": 0.0037, "step": 8440 }, { "epoch": 4.884393063583815, "grad_norm": 0.11336377263069153, "learning_rate": 9.617940988859318e-05, "loss": 0.0048, "step": 8450 }, { "epoch": 4.890173410404624, "grad_norm": 0.08250712603330612, "learning_rate": 9.616672144501821e-05, "loss": 0.0039, "step": 8460 }, { "epoch": 4.895953757225434, "grad_norm": 0.10133339464664459, "learning_rate": 9.615401280655081e-05, "loss": 0.0035, "step": 8470 }, { "epoch": 4.901734104046243, "grad_norm": 0.06876753270626068, "learning_rate": 9.614128397875017e-05, "loss": 0.0042, "step": 8480 }, { "epoch": 4.907514450867052, "grad_norm": 0.07006234675645828, "learning_rate": 9.612853496718429e-05, "loss": 0.0052, "step": 8490 }, { "epoch": 4.913294797687861, "grad_norm": 0.06721346080303192, "learning_rate": 9.611576577743004e-05, "loss": 0.0037, "step": 8500 }, { "epoch": 4.91907514450867, "grad_norm": 0.0753275528550148, "learning_rate": 9.610297641507313e-05, "loss": 0.004, "step": 8510 }, { "epoch": 4.924855491329479, "grad_norm": 0.0652092695236206, "learning_rate": 9.6090166885708e-05, "loss": 0.0037, "step": 8520 }, { "epoch": 4.930635838150289, "grad_norm": 0.05354798212647438, "learning_rate": 9.607733719493798e-05, "loss": 0.0036, "step": 8530 }, { "epoch": 4.936416184971098, "grad_norm": 0.053498897701501846, "learning_rate": 9.606448734837527e-05, "loss": 0.0039, "step": 8540 }, { "epoch": 4.942196531791907, "grad_norm": 0.09556004405021667, "learning_rate": 9.605161735164079e-05, "loss": 0.0037, "step": 8550 }, { "epoch": 4.9479768786127165, "grad_norm": 0.08123383671045303, "learning_rate": 9.60387272103643e-05, "loss": 0.0038, "step": 8560 }, { "epoch": 4.953757225433526, "grad_norm": 0.060370512306690216, "learning_rate": 9.60258169301844e-05, "loss": 0.0031, "step": 8570 }, { "epoch": 4.959537572254336, "grad_norm": 0.06456678360700607, "learning_rate": 9.601288651674851e-05, "loss": 0.0033, "step": 8580 }, { "epoch": 4.965317919075145, "grad_norm": 0.07259932905435562, "learning_rate": 9.59999359757128e-05, "loss": 0.0031, "step": 8590 }, { "epoch": 4.971098265895954, "grad_norm": 0.05768544226884842, "learning_rate": 9.598696531274227e-05, "loss": 0.0027, "step": 8600 }, { "epoch": 4.976878612716763, "grad_norm": 0.04464501142501831, "learning_rate": 9.597397453351076e-05, "loss": 0.0029, "step": 8610 }, { "epoch": 4.982658959537572, "grad_norm": 0.08422578871250153, "learning_rate": 9.596096364370087e-05, "loss": 0.004, "step": 8620 }, { "epoch": 4.988439306358382, "grad_norm": 0.10428819060325623, "learning_rate": 9.594793264900399e-05, "loss": 0.0051, "step": 8630 }, { "epoch": 4.994219653179191, "grad_norm": 0.0938459113240242, "learning_rate": 9.593488155512032e-05, "loss": 0.0042, "step": 8640 }, { "epoch": 5.0, "grad_norm": 0.0748751163482666, "learning_rate": 9.592181036775886e-05, "loss": 0.0037, "step": 8650 }, { "epoch": 5.005780346820809, "grad_norm": 0.10755519568920135, "learning_rate": 9.590871909263741e-05, "loss": 0.0039, "step": 8660 }, { "epoch": 5.011560693641618, "grad_norm": 0.11138191819190979, "learning_rate": 9.589560773548252e-05, "loss": 0.0041, "step": 8670 }, { "epoch": 5.017341040462428, "grad_norm": 0.13861332833766937, "learning_rate": 9.588247630202952e-05, "loss": 0.0042, "step": 8680 }, { "epoch": 5.023121387283237, "grad_norm": 0.10551301389932632, "learning_rate": 9.586932479802258e-05, "loss": 0.0042, "step": 8690 }, { "epoch": 5.028901734104046, "grad_norm": 0.06994461268186569, "learning_rate": 9.585615322921462e-05, "loss": 0.0062, "step": 8700 }, { "epoch": 5.034682080924855, "grad_norm": 0.09327414631843567, "learning_rate": 9.58429616013673e-05, "loss": 0.0043, "step": 8710 }, { "epoch": 5.040462427745664, "grad_norm": 0.0711771622300148, "learning_rate": 9.58297499202511e-05, "loss": 0.005, "step": 8720 }, { "epoch": 5.046242774566474, "grad_norm": 0.10359261929988861, "learning_rate": 9.581651819164525e-05, "loss": 0.0042, "step": 8730 }, { "epoch": 5.0520231213872835, "grad_norm": 0.13505209982395172, "learning_rate": 9.580326642133777e-05, "loss": 0.0045, "step": 8740 }, { "epoch": 5.057803468208093, "grad_norm": 0.09726086258888245, "learning_rate": 9.578999461512544e-05, "loss": 0.0045, "step": 8750 }, { "epoch": 5.063583815028902, "grad_norm": 0.10977838933467865, "learning_rate": 9.577670277881378e-05, "loss": 0.0069, "step": 8760 }, { "epoch": 5.069364161849711, "grad_norm": 0.13174647092819214, "learning_rate": 9.57633909182171e-05, "loss": 0.0073, "step": 8770 }, { "epoch": 5.07514450867052, "grad_norm": 0.1174435019493103, "learning_rate": 9.575005903915847e-05, "loss": 0.0045, "step": 8780 }, { "epoch": 5.08092485549133, "grad_norm": 0.09674505144357681, "learning_rate": 9.573670714746972e-05, "loss": 0.0038, "step": 8790 }, { "epoch": 5.086705202312139, "grad_norm": 0.10631712526082993, "learning_rate": 9.572333524899136e-05, "loss": 0.0045, "step": 8800 }, { "epoch": 5.092485549132948, "grad_norm": 0.09972011297941208, "learning_rate": 9.570994334957278e-05, "loss": 0.0034, "step": 8810 }, { "epoch": 5.098265895953757, "grad_norm": 0.07860255986452103, "learning_rate": 9.569653145507201e-05, "loss": 0.0047, "step": 8820 }, { "epoch": 5.104046242774566, "grad_norm": 0.09654318541288376, "learning_rate": 9.568309957135587e-05, "loss": 0.0035, "step": 8830 }, { "epoch": 5.109826589595376, "grad_norm": 0.08574005216360092, "learning_rate": 9.566964770429995e-05, "loss": 0.0053, "step": 8840 }, { "epoch": 5.115606936416185, "grad_norm": 0.08568380773067474, "learning_rate": 9.565617585978853e-05, "loss": 0.0036, "step": 8850 }, { "epoch": 5.121387283236994, "grad_norm": 0.06915133446455002, "learning_rate": 9.564268404371466e-05, "loss": 0.0032, "step": 8860 }, { "epoch": 5.127167630057803, "grad_norm": 0.07265638560056686, "learning_rate": 9.562917226198013e-05, "loss": 0.0037, "step": 8870 }, { "epoch": 5.132947976878612, "grad_norm": 0.06257953494787216, "learning_rate": 9.561564052049543e-05, "loss": 0.0039, "step": 8880 }, { "epoch": 5.138728323699422, "grad_norm": 0.10112006962299347, "learning_rate": 9.560208882517982e-05, "loss": 0.0037, "step": 8890 }, { "epoch": 5.144508670520231, "grad_norm": 0.14546966552734375, "learning_rate": 9.558851718196127e-05, "loss": 0.005, "step": 8900 }, { "epoch": 5.1502890173410405, "grad_norm": 0.07257446646690369, "learning_rate": 9.557492559677646e-05, "loss": 0.0036, "step": 8910 }, { "epoch": 5.15606936416185, "grad_norm": 0.10585208982229233, "learning_rate": 9.556131407557082e-05, "loss": 0.0043, "step": 8920 }, { "epoch": 5.161849710982659, "grad_norm": 0.08118341118097305, "learning_rate": 9.554768262429853e-05, "loss": 0.0038, "step": 8930 }, { "epoch": 5.167630057803469, "grad_norm": 0.13657018542289734, "learning_rate": 9.553403124892239e-05, "loss": 0.0035, "step": 8940 }, { "epoch": 5.173410404624278, "grad_norm": 0.07841265201568604, "learning_rate": 9.552035995541402e-05, "loss": 0.0042, "step": 8950 }, { "epoch": 5.179190751445087, "grad_norm": 0.08602629601955414, "learning_rate": 9.550666874975368e-05, "loss": 0.0035, "step": 8960 }, { "epoch": 5.184971098265896, "grad_norm": 0.05712622031569481, "learning_rate": 9.549295763793038e-05, "loss": 0.0039, "step": 8970 }, { "epoch": 5.190751445086705, "grad_norm": 0.09507585316896439, "learning_rate": 9.547922662594183e-05, "loss": 0.0054, "step": 8980 }, { "epoch": 5.196531791907514, "grad_norm": 0.10527266561985016, "learning_rate": 9.546547571979443e-05, "loss": 0.0044, "step": 8990 }, { "epoch": 5.202312138728324, "grad_norm": 0.07147108763456345, "learning_rate": 9.545170492550331e-05, "loss": 0.0033, "step": 9000 }, { "epoch": 5.208092485549133, "grad_norm": 0.06450354307889938, "learning_rate": 9.543791424909226e-05, "loss": 0.0029, "step": 9010 }, { "epoch": 5.213872832369942, "grad_norm": 0.06563185155391693, "learning_rate": 9.542410369659382e-05, "loss": 0.0039, "step": 9020 }, { "epoch": 5.219653179190751, "grad_norm": 0.05482746288180351, "learning_rate": 9.541027327404916e-05, "loss": 0.0034, "step": 9030 }, { "epoch": 5.22543352601156, "grad_norm": 0.06039123609662056, "learning_rate": 9.539642298750821e-05, "loss": 0.0033, "step": 9040 }, { "epoch": 5.23121387283237, "grad_norm": 0.057607367634773254, "learning_rate": 9.538255284302954e-05, "loss": 0.0038, "step": 9050 }, { "epoch": 5.236994219653179, "grad_norm": 0.11214864253997803, "learning_rate": 9.536866284668043e-05, "loss": 0.0044, "step": 9060 }, { "epoch": 5.242774566473988, "grad_norm": 0.10493134707212448, "learning_rate": 9.53547530045368e-05, "loss": 0.0046, "step": 9070 }, { "epoch": 5.2485549132947975, "grad_norm": 0.08909250795841217, "learning_rate": 9.534082332268335e-05, "loss": 0.0032, "step": 9080 }, { "epoch": 5.254335260115607, "grad_norm": 0.0613614022731781, "learning_rate": 9.532687380721334e-05, "loss": 0.0038, "step": 9090 }, { "epoch": 5.2601156069364166, "grad_norm": 0.0754789412021637, "learning_rate": 9.53129044642288e-05, "loss": 0.0027, "step": 9100 }, { "epoch": 5.265895953757226, "grad_norm": 0.0658283680677414, "learning_rate": 9.529891529984039e-05, "loss": 0.003, "step": 9110 }, { "epoch": 5.271676300578035, "grad_norm": 0.0447990819811821, "learning_rate": 9.528490632016743e-05, "loss": 0.0041, "step": 9120 }, { "epoch": 5.277456647398844, "grad_norm": 0.055104684084653854, "learning_rate": 9.52708775313379e-05, "loss": 0.003, "step": 9130 }, { "epoch": 5.283236994219653, "grad_norm": 0.04724518209695816, "learning_rate": 9.525682893948853e-05, "loss": 0.003, "step": 9140 }, { "epoch": 5.289017341040463, "grad_norm": 0.04141925647854805, "learning_rate": 9.524276055076463e-05, "loss": 0.0041, "step": 9150 }, { "epoch": 5.294797687861272, "grad_norm": 0.08228343725204468, "learning_rate": 9.522867237132016e-05, "loss": 0.0044, "step": 9160 }, { "epoch": 5.300578034682081, "grad_norm": 0.10406464338302612, "learning_rate": 9.52145644073178e-05, "loss": 0.0049, "step": 9170 }, { "epoch": 5.30635838150289, "grad_norm": 0.07224124670028687, "learning_rate": 9.520043666492884e-05, "loss": 0.0044, "step": 9180 }, { "epoch": 5.312138728323699, "grad_norm": 0.0873929113149643, "learning_rate": 9.518628915033323e-05, "loss": 0.0038, "step": 9190 }, { "epoch": 5.317919075144509, "grad_norm": 0.06959396600723267, "learning_rate": 9.517212186971957e-05, "loss": 0.0035, "step": 9200 }, { "epoch": 5.323699421965318, "grad_norm": 0.04830661416053772, "learning_rate": 9.515793482928515e-05, "loss": 0.0037, "step": 9210 }, { "epoch": 5.329479768786127, "grad_norm": 0.08181358128786087, "learning_rate": 9.514372803523582e-05, "loss": 0.0057, "step": 9220 }, { "epoch": 5.335260115606936, "grad_norm": 0.09455884248018265, "learning_rate": 9.512950149378613e-05, "loss": 0.004, "step": 9230 }, { "epoch": 5.341040462427745, "grad_norm": 0.09299562871456146, "learning_rate": 9.511525521115925e-05, "loss": 0.0032, "step": 9240 }, { "epoch": 5.3468208092485545, "grad_norm": 0.0856257900595665, "learning_rate": 9.510098919358698e-05, "loss": 0.003, "step": 9250 }, { "epoch": 5.3526011560693645, "grad_norm": 0.08746090531349182, "learning_rate": 9.508670344730979e-05, "loss": 0.0048, "step": 9260 }, { "epoch": 5.358381502890174, "grad_norm": 0.09835511445999146, "learning_rate": 9.507239797857672e-05, "loss": 0.0044, "step": 9270 }, { "epoch": 5.364161849710983, "grad_norm": 0.07579044252634048, "learning_rate": 9.505807279364548e-05, "loss": 0.0035, "step": 9280 }, { "epoch": 5.369942196531792, "grad_norm": 0.06782922893762589, "learning_rate": 9.504372789878239e-05, "loss": 0.0029, "step": 9290 }, { "epoch": 5.375722543352601, "grad_norm": 0.052490346133708954, "learning_rate": 9.502936330026239e-05, "loss": 0.0067, "step": 9300 }, { "epoch": 5.381502890173411, "grad_norm": 0.06339240074157715, "learning_rate": 9.501497900436906e-05, "loss": 0.0034, "step": 9310 }, { "epoch": 5.38728323699422, "grad_norm": 0.04801017791032791, "learning_rate": 9.500057501739457e-05, "loss": 0.0048, "step": 9320 }, { "epoch": 5.393063583815029, "grad_norm": 0.04857809841632843, "learning_rate": 9.49861513456397e-05, "loss": 0.0027, "step": 9330 }, { "epoch": 5.398843930635838, "grad_norm": 0.08158544450998306, "learning_rate": 9.497170799541388e-05, "loss": 0.0043, "step": 9340 }, { "epoch": 5.404624277456647, "grad_norm": 0.05422453209757805, "learning_rate": 9.495724497303508e-05, "loss": 0.0033, "step": 9350 }, { "epoch": 5.410404624277457, "grad_norm": 0.05921275168657303, "learning_rate": 9.494276228482998e-05, "loss": 0.0029, "step": 9360 }, { "epoch": 5.416184971098266, "grad_norm": 0.13661505281925201, "learning_rate": 9.492825993713374e-05, "loss": 0.0035, "step": 9370 }, { "epoch": 5.421965317919075, "grad_norm": 0.08016598224639893, "learning_rate": 9.491373793629023e-05, "loss": 0.0032, "step": 9380 }, { "epoch": 5.427745664739884, "grad_norm": 0.10663073509931564, "learning_rate": 9.489919628865182e-05, "loss": 0.005, "step": 9390 }, { "epoch": 5.433526011560693, "grad_norm": 0.06865248829126358, "learning_rate": 9.488463500057955e-05, "loss": 0.0065, "step": 9400 }, { "epoch": 5.4393063583815024, "grad_norm": 0.10003788024187088, "learning_rate": 9.487005407844302e-05, "loss": 0.0036, "step": 9410 }, { "epoch": 5.445086705202312, "grad_norm": 0.08406813442707062, "learning_rate": 9.485545352862039e-05, "loss": 0.0043, "step": 9420 }, { "epoch": 5.4508670520231215, "grad_norm": 0.07259850949048996, "learning_rate": 9.48408333574985e-05, "loss": 0.0034, "step": 9430 }, { "epoch": 5.456647398843931, "grad_norm": 0.04301713407039642, "learning_rate": 9.482619357147264e-05, "loss": 0.003, "step": 9440 }, { "epoch": 5.46242774566474, "grad_norm": 0.052886392921209335, "learning_rate": 9.481153417694679e-05, "loss": 0.0033, "step": 9450 }, { "epoch": 5.468208092485549, "grad_norm": 0.06675713509321213, "learning_rate": 9.479685518033347e-05, "loss": 0.0031, "step": 9460 }, { "epoch": 5.473988439306359, "grad_norm": 0.06410433351993561, "learning_rate": 9.478215658805375e-05, "loss": 0.0037, "step": 9470 }, { "epoch": 5.479768786127168, "grad_norm": 0.07671128213405609, "learning_rate": 9.47674384065373e-05, "loss": 0.0038, "step": 9480 }, { "epoch": 5.485549132947977, "grad_norm": 0.06870589405298233, "learning_rate": 9.475270064222237e-05, "loss": 0.0034, "step": 9490 }, { "epoch": 5.491329479768786, "grad_norm": 0.060709405690431595, "learning_rate": 9.473794330155572e-05, "loss": 0.0039, "step": 9500 }, { "epoch": 5.497109826589595, "grad_norm": 0.08371403068304062, "learning_rate": 9.472316639099276e-05, "loss": 0.004, "step": 9510 }, { "epoch": 5.502890173410405, "grad_norm": 0.08837935328483582, "learning_rate": 9.470836991699739e-05, "loss": 0.0044, "step": 9520 }, { "epoch": 5.508670520231214, "grad_norm": 0.06310413032770157, "learning_rate": 9.469355388604208e-05, "loss": 0.0053, "step": 9530 }, { "epoch": 5.514450867052023, "grad_norm": 0.08625809848308563, "learning_rate": 9.467871830460787e-05, "loss": 0.0047, "step": 9540 }, { "epoch": 5.520231213872832, "grad_norm": 0.04534368962049484, "learning_rate": 9.466386317918436e-05, "loss": 0.003, "step": 9550 }, { "epoch": 5.526011560693641, "grad_norm": 0.10432472079992294, "learning_rate": 9.464898851626969e-05, "loss": 0.0033, "step": 9560 }, { "epoch": 5.531791907514451, "grad_norm": 0.04911819100379944, "learning_rate": 9.463409432237051e-05, "loss": 0.0035, "step": 9570 }, { "epoch": 5.53757225433526, "grad_norm": 0.0693276897072792, "learning_rate": 9.461918060400209e-05, "loss": 0.0049, "step": 9580 }, { "epoch": 5.543352601156069, "grad_norm": 0.08006289601325989, "learning_rate": 9.460424736768816e-05, "loss": 0.0043, "step": 9590 }, { "epoch": 5.5491329479768785, "grad_norm": 0.06762324273586273, "learning_rate": 9.458929461996105e-05, "loss": 0.0038, "step": 9600 }, { "epoch": 5.554913294797688, "grad_norm": 0.05108136311173439, "learning_rate": 9.457432236736158e-05, "loss": 0.003, "step": 9610 }, { "epoch": 5.5606936416184976, "grad_norm": 0.0543186254799366, "learning_rate": 9.455933061643916e-05, "loss": 0.0035, "step": 9620 }, { "epoch": 5.566473988439307, "grad_norm": 0.08099451661109924, "learning_rate": 9.454431937375164e-05, "loss": 0.0041, "step": 9630 }, { "epoch": 5.572254335260116, "grad_norm": 0.06708111613988876, "learning_rate": 9.452928864586547e-05, "loss": 0.0031, "step": 9640 }, { "epoch": 5.578034682080925, "grad_norm": 0.06497319042682648, "learning_rate": 9.451423843935563e-05, "loss": 0.0035, "step": 9650 }, { "epoch": 5.583815028901734, "grad_norm": 0.07020888477563858, "learning_rate": 9.449916876080553e-05, "loss": 0.0032, "step": 9660 }, { "epoch": 5.589595375722544, "grad_norm": 0.07956274598836899, "learning_rate": 9.44840796168072e-05, "loss": 0.003, "step": 9670 }, { "epoch": 5.595375722543353, "grad_norm": 0.08269158750772476, "learning_rate": 9.446897101396115e-05, "loss": 0.004, "step": 9680 }, { "epoch": 5.601156069364162, "grad_norm": 0.11986695975065231, "learning_rate": 9.445384295887638e-05, "loss": 0.0036, "step": 9690 }, { "epoch": 5.606936416184971, "grad_norm": 0.06522428244352341, "learning_rate": 9.443869545817043e-05, "loss": 0.0038, "step": 9700 }, { "epoch": 5.61271676300578, "grad_norm": 0.09290037304162979, "learning_rate": 9.442352851846929e-05, "loss": 0.0033, "step": 9710 }, { "epoch": 5.618497109826589, "grad_norm": 0.07107914239168167, "learning_rate": 9.440834214640755e-05, "loss": 0.004, "step": 9720 }, { "epoch": 5.624277456647399, "grad_norm": 0.08233386278152466, "learning_rate": 9.439313634862823e-05, "loss": 0.0045, "step": 9730 }, { "epoch": 5.630057803468208, "grad_norm": 0.09773562103509903, "learning_rate": 9.437791113178282e-05, "loss": 0.0048, "step": 9740 }, { "epoch": 5.635838150289017, "grad_norm": 0.08826876431703568, "learning_rate": 9.43626665025314e-05, "loss": 0.0051, "step": 9750 }, { "epoch": 5.641618497109826, "grad_norm": 0.08154237270355225, "learning_rate": 9.434740246754248e-05, "loss": 0.0035, "step": 9760 }, { "epoch": 5.6473988439306355, "grad_norm": 0.0881495401263237, "learning_rate": 9.433211903349304e-05, "loss": 0.0046, "step": 9770 }, { "epoch": 5.653179190751445, "grad_norm": 0.09400784969329834, "learning_rate": 9.431681620706858e-05, "loss": 0.0034, "step": 9780 }, { "epoch": 5.658959537572255, "grad_norm": 0.1164061427116394, "learning_rate": 9.43014939949631e-05, "loss": 0.0042, "step": 9790 }, { "epoch": 5.664739884393064, "grad_norm": 0.0716787651181221, "learning_rate": 9.428615240387904e-05, "loss": 0.0034, "step": 9800 }, { "epoch": 5.670520231213873, "grad_norm": 0.06646711379289627, "learning_rate": 9.427079144052732e-05, "loss": 0.0034, "step": 9810 }, { "epoch": 5.676300578034682, "grad_norm": 0.07756868004798889, "learning_rate": 9.425541111162739e-05, "loss": 0.0036, "step": 9820 }, { "epoch": 5.682080924855491, "grad_norm": 0.11053356528282166, "learning_rate": 9.424001142390709e-05, "loss": 0.0038, "step": 9830 }, { "epoch": 5.687861271676301, "grad_norm": 0.09140120446681976, "learning_rate": 9.422459238410277e-05, "loss": 0.0039, "step": 9840 }, { "epoch": 5.69364161849711, "grad_norm": 0.06871363520622253, "learning_rate": 9.420915399895926e-05, "loss": 0.006, "step": 9850 }, { "epoch": 5.699421965317919, "grad_norm": 0.07478412240743637, "learning_rate": 9.419369627522981e-05, "loss": 0.0037, "step": 9860 }, { "epoch": 5.705202312138728, "grad_norm": 0.045829880982637405, "learning_rate": 9.417821921967618e-05, "loss": 0.0026, "step": 9870 }, { "epoch": 5.710982658959537, "grad_norm": 0.050987813621759415, "learning_rate": 9.416272283906855e-05, "loss": 0.0035, "step": 9880 }, { "epoch": 5.716763005780347, "grad_norm": 0.0743798017501831, "learning_rate": 9.414720714018554e-05, "loss": 0.0043, "step": 9890 }, { "epoch": 5.722543352601156, "grad_norm": 0.07256080955266953, "learning_rate": 9.413167212981427e-05, "loss": 0.0036, "step": 9900 }, { "epoch": 5.728323699421965, "grad_norm": 0.06769303977489471, "learning_rate": 9.41161178147503e-05, "loss": 0.0064, "step": 9910 }, { "epoch": 5.734104046242774, "grad_norm": 0.08060161024332047, "learning_rate": 9.410054420179755e-05, "loss": 0.0041, "step": 9920 }, { "epoch": 5.7398843930635834, "grad_norm": 0.10014849156141281, "learning_rate": 9.408495129776851e-05, "loss": 0.0036, "step": 9930 }, { "epoch": 5.745664739884393, "grad_norm": 0.06562087684869766, "learning_rate": 9.406933910948403e-05, "loss": 0.003, "step": 9940 }, { "epoch": 5.7514450867052025, "grad_norm": 0.12746410071849823, "learning_rate": 9.40537076437734e-05, "loss": 0.0057, "step": 9950 }, { "epoch": 5.757225433526012, "grad_norm": 0.1049005389213562, "learning_rate": 9.403805690747436e-05, "loss": 0.0037, "step": 9960 }, { "epoch": 5.763005780346821, "grad_norm": 0.08599194884300232, "learning_rate": 9.402238690743308e-05, "loss": 0.0047, "step": 9970 }, { "epoch": 5.76878612716763, "grad_norm": 0.11726978421211243, "learning_rate": 9.400669765050413e-05, "loss": 0.0043, "step": 9980 }, { "epoch": 5.77456647398844, "grad_norm": 0.09003078192472458, "learning_rate": 9.399098914355055e-05, "loss": 0.0036, "step": 9990 }, { "epoch": 5.780346820809249, "grad_norm": 0.05620354786515236, "learning_rate": 9.397526139344378e-05, "loss": 0.0033, "step": 10000 }, { "epoch": 5.786127167630058, "grad_norm": 0.05252154916524887, "learning_rate": 9.395951440706362e-05, "loss": 0.003, "step": 10010 }, { "epoch": 5.791907514450867, "grad_norm": 0.041756268590688705, "learning_rate": 9.394374819129839e-05, "loss": 0.0049, "step": 10020 }, { "epoch": 5.797687861271676, "grad_norm": 0.09076819568872452, "learning_rate": 9.392796275304474e-05, "loss": 0.0036, "step": 10030 }, { "epoch": 5.803468208092486, "grad_norm": 0.0802321806550026, "learning_rate": 9.391215809920778e-05, "loss": 0.0045, "step": 10040 }, { "epoch": 5.809248554913295, "grad_norm": 0.07671747356653214, "learning_rate": 9.389633423670099e-05, "loss": 0.0036, "step": 10050 }, { "epoch": 5.815028901734104, "grad_norm": 0.056961119174957275, "learning_rate": 9.388049117244626e-05, "loss": 0.003, "step": 10060 }, { "epoch": 5.820809248554913, "grad_norm": 0.059280671179294586, "learning_rate": 9.386462891337389e-05, "loss": 0.003, "step": 10070 }, { "epoch": 5.826589595375722, "grad_norm": 0.0686354786157608, "learning_rate": 9.384874746642257e-05, "loss": 0.003, "step": 10080 }, { "epoch": 5.832369942196532, "grad_norm": 0.06178855523467064, "learning_rate": 9.383284683853937e-05, "loss": 0.0029, "step": 10090 }, { "epoch": 5.838150289017341, "grad_norm": 0.14005982875823975, "learning_rate": 9.381692703667981e-05, "loss": 0.0057, "step": 10100 }, { "epoch": 5.84393063583815, "grad_norm": 0.1773129403591156, "learning_rate": 9.380098806780771e-05, "loss": 0.0055, "step": 10110 }, { "epoch": 5.8497109826589595, "grad_norm": 0.1098577231168747, "learning_rate": 9.378502993889533e-05, "loss": 0.0047, "step": 10120 }, { "epoch": 5.855491329479769, "grad_norm": 0.10691174119710922, "learning_rate": 9.376905265692329e-05, "loss": 0.0047, "step": 10130 }, { "epoch": 5.861271676300578, "grad_norm": 0.0895080491900444, "learning_rate": 9.37530562288806e-05, "loss": 0.0033, "step": 10140 }, { "epoch": 5.867052023121388, "grad_norm": 0.12388379126787186, "learning_rate": 9.373704066176465e-05, "loss": 0.0038, "step": 10150 }, { "epoch": 5.872832369942197, "grad_norm": 0.06937913596630096, "learning_rate": 9.372100596258118e-05, "loss": 0.0033, "step": 10160 }, { "epoch": 5.878612716763006, "grad_norm": 0.07229018956422806, "learning_rate": 9.370495213834433e-05, "loss": 0.0036, "step": 10170 }, { "epoch": 5.884393063583815, "grad_norm": 0.054935961961746216, "learning_rate": 9.368887919607657e-05, "loss": 0.0038, "step": 10180 }, { "epoch": 5.890173410404624, "grad_norm": 0.053139057010412216, "learning_rate": 9.367278714280876e-05, "loss": 0.0035, "step": 10190 }, { "epoch": 5.895953757225434, "grad_norm": 0.04845484718680382, "learning_rate": 9.36566759855801e-05, "loss": 0.0034, "step": 10200 }, { "epoch": 5.901734104046243, "grad_norm": 0.05633014440536499, "learning_rate": 9.36405457314382e-05, "loss": 0.0033, "step": 10210 }, { "epoch": 5.907514450867052, "grad_norm": 0.04651571810245514, "learning_rate": 9.36243963874389e-05, "loss": 0.0037, "step": 10220 }, { "epoch": 5.913294797687861, "grad_norm": 0.08233683556318283, "learning_rate": 9.360822796064655e-05, "loss": 0.0029, "step": 10230 }, { "epoch": 5.91907514450867, "grad_norm": 0.05699615180492401, "learning_rate": 9.359204045813372e-05, "loss": 0.0032, "step": 10240 }, { "epoch": 5.924855491329479, "grad_norm": 0.05122595652937889, "learning_rate": 9.357583388698141e-05, "loss": 0.0037, "step": 10250 }, { "epoch": 5.930635838150289, "grad_norm": 0.04474084824323654, "learning_rate": 9.35596082542789e-05, "loss": 0.0029, "step": 10260 }, { "epoch": 5.936416184971098, "grad_norm": 0.04808139428496361, "learning_rate": 9.354336356712383e-05, "loss": 0.0033, "step": 10270 }, { "epoch": 5.942196531791907, "grad_norm": 0.06115487590432167, "learning_rate": 9.35270998326222e-05, "loss": 0.0034, "step": 10280 }, { "epoch": 5.9479768786127165, "grad_norm": 0.0701942890882492, "learning_rate": 9.351081705788831e-05, "loss": 0.0034, "step": 10290 }, { "epoch": 5.953757225433526, "grad_norm": 0.06478339433670044, "learning_rate": 9.349451525004477e-05, "loss": 0.0037, "step": 10300 }, { "epoch": 5.959537572254336, "grad_norm": 0.05231313034892082, "learning_rate": 9.347819441622261e-05, "loss": 0.0034, "step": 10310 }, { "epoch": 5.965317919075145, "grad_norm": 0.06736232340335846, "learning_rate": 9.346185456356105e-05, "loss": 0.0037, "step": 10320 }, { "epoch": 5.971098265895954, "grad_norm": 0.09078212827444077, "learning_rate": 9.344549569920774e-05, "loss": 0.0031, "step": 10330 }, { "epoch": 5.976878612716763, "grad_norm": 0.0714443176984787, "learning_rate": 9.342911783031858e-05, "loss": 0.0034, "step": 10340 }, { "epoch": 5.982658959537572, "grad_norm": 0.0628506988286972, "learning_rate": 9.341272096405782e-05, "loss": 0.0034, "step": 10350 }, { "epoch": 5.988439306358382, "grad_norm": 0.05350936949253082, "learning_rate": 9.3396305107598e-05, "loss": 0.0027, "step": 10360 }, { "epoch": 5.994219653179191, "grad_norm": 0.08104755729436874, "learning_rate": 9.337987026811998e-05, "loss": 0.0039, "step": 10370 }, { "epoch": 6.0, "grad_norm": 0.06955874711275101, "learning_rate": 9.33634164528129e-05, "loss": 0.0035, "step": 10380 }, { "epoch": 6.005780346820809, "grad_norm": 0.061120934784412384, "learning_rate": 9.334694366887424e-05, "loss": 0.0029, "step": 10390 }, { "epoch": 6.011560693641618, "grad_norm": 0.06715985387563705, "learning_rate": 9.333045192350973e-05, "loss": 0.0038, "step": 10400 }, { "epoch": 6.017341040462428, "grad_norm": 0.051006004214286804, "learning_rate": 9.331394122393345e-05, "loss": 0.0036, "step": 10410 }, { "epoch": 6.023121387283237, "grad_norm": 0.07410971075296402, "learning_rate": 9.329741157736771e-05, "loss": 0.0054, "step": 10420 }, { "epoch": 6.028901734104046, "grad_norm": 0.04755874350667, "learning_rate": 9.328086299104317e-05, "loss": 0.0038, "step": 10430 }, { "epoch": 6.034682080924855, "grad_norm": 0.08780265599489212, "learning_rate": 9.326429547219872e-05, "loss": 0.006, "step": 10440 }, { "epoch": 6.040462427745664, "grad_norm": 0.05304821580648422, "learning_rate": 9.324770902808155e-05, "loss": 0.0032, "step": 10450 }, { "epoch": 6.046242774566474, "grad_norm": 0.10874399542808533, "learning_rate": 9.323110366594717e-05, "loss": 0.0046, "step": 10460 }, { "epoch": 6.0520231213872835, "grad_norm": 0.08250004798173904, "learning_rate": 9.32144793930593e-05, "loss": 0.0033, "step": 10470 }, { "epoch": 6.057803468208093, "grad_norm": 0.0632195845246315, "learning_rate": 9.319783621668996e-05, "loss": 0.0028, "step": 10480 }, { "epoch": 6.063583815028902, "grad_norm": 0.07797906547784805, "learning_rate": 9.318117414411947e-05, "loss": 0.0034, "step": 10490 }, { "epoch": 6.069364161849711, "grad_norm": 0.0703006312251091, "learning_rate": 9.316449318263635e-05, "loss": 0.0043, "step": 10500 }, { "epoch": 6.07514450867052, "grad_norm": 0.09008142352104187, "learning_rate": 9.314779333953744e-05, "loss": 0.0037, "step": 10510 }, { "epoch": 6.08092485549133, "grad_norm": 0.06668701767921448, "learning_rate": 9.313107462212781e-05, "loss": 0.0029, "step": 10520 }, { "epoch": 6.086705202312139, "grad_norm": 0.07460634410381317, "learning_rate": 9.311433703772082e-05, "loss": 0.0034, "step": 10530 }, { "epoch": 6.092485549132948, "grad_norm": 0.05611598119139671, "learning_rate": 9.3097580593638e-05, "loss": 0.0035, "step": 10540 }, { "epoch": 6.098265895953757, "grad_norm": 0.03957807272672653, "learning_rate": 9.308080529720926e-05, "loss": 0.0036, "step": 10550 }, { "epoch": 6.104046242774566, "grad_norm": 0.07462923973798752, "learning_rate": 9.306401115577264e-05, "loss": 0.0033, "step": 10560 }, { "epoch": 6.109826589595376, "grad_norm": 0.06398440897464752, "learning_rate": 9.304719817667447e-05, "loss": 0.0032, "step": 10570 }, { "epoch": 6.115606936416185, "grad_norm": 0.07337431609630585, "learning_rate": 9.303036636726934e-05, "loss": 0.0039, "step": 10580 }, { "epoch": 6.121387283236994, "grad_norm": 0.06975319236516953, "learning_rate": 9.301351573492003e-05, "loss": 0.004, "step": 10590 }, { "epoch": 6.127167630057803, "grad_norm": 0.09302632510662079, "learning_rate": 9.299664628699758e-05, "loss": 0.0079, "step": 10600 }, { "epoch": 6.132947976878612, "grad_norm": 0.10387637466192245, "learning_rate": 9.297975803088129e-05, "loss": 0.0059, "step": 10610 }, { "epoch": 6.138728323699422, "grad_norm": 0.1043407991528511, "learning_rate": 9.296285097395864e-05, "loss": 0.0041, "step": 10620 }, { "epoch": 6.144508670520231, "grad_norm": 0.1208314448595047, "learning_rate": 9.294592512362533e-05, "loss": 0.0075, "step": 10630 }, { "epoch": 6.1502890173410405, "grad_norm": 0.10845641791820526, "learning_rate": 9.292898048728533e-05, "loss": 0.0057, "step": 10640 }, { "epoch": 6.15606936416185, "grad_norm": 0.10029802471399307, "learning_rate": 9.29120170723508e-05, "loss": 0.0036, "step": 10650 }, { "epoch": 6.161849710982659, "grad_norm": 0.07804198563098907, "learning_rate": 9.28950348862421e-05, "loss": 0.0042, "step": 10660 }, { "epoch": 6.167630057803469, "grad_norm": 0.13739337027072906, "learning_rate": 9.287803393638781e-05, "loss": 0.0049, "step": 10670 }, { "epoch": 6.173410404624278, "grad_norm": 0.14106610417366028, "learning_rate": 9.286101423022474e-05, "loss": 0.0045, "step": 10680 }, { "epoch": 6.179190751445087, "grad_norm": 0.15256516635417938, "learning_rate": 9.284397577519788e-05, "loss": 0.0042, "step": 10690 }, { "epoch": 6.184971098265896, "grad_norm": 0.09890145808458328, "learning_rate": 9.282691857876043e-05, "loss": 0.0042, "step": 10700 }, { "epoch": 6.190751445086705, "grad_norm": 0.08473809063434601, "learning_rate": 9.280984264837377e-05, "loss": 0.0036, "step": 10710 }, { "epoch": 6.196531791907514, "grad_norm": 0.0743294209241867, "learning_rate": 9.279274799150752e-05, "loss": 0.0029, "step": 10720 }, { "epoch": 6.202312138728324, "grad_norm": 0.05356019362807274, "learning_rate": 9.277563461563945e-05, "loss": 0.0027, "step": 10730 }, { "epoch": 6.208092485549133, "grad_norm": 0.07146003097295761, "learning_rate": 9.275850252825555e-05, "loss": 0.0036, "step": 10740 }, { "epoch": 6.213872832369942, "grad_norm": 0.06811804324388504, "learning_rate": 9.274135173684994e-05, "loss": 0.0035, "step": 10750 }, { "epoch": 6.219653179190751, "grad_norm": 0.06242654472589493, "learning_rate": 9.272418224892498e-05, "loss": 0.0026, "step": 10760 }, { "epoch": 6.22543352601156, "grad_norm": 0.08808053284883499, "learning_rate": 9.27069940719912e-05, "loss": 0.0036, "step": 10770 }, { "epoch": 6.23121387283237, "grad_norm": 0.04891262948513031, "learning_rate": 9.268978721356727e-05, "loss": 0.0028, "step": 10780 }, { "epoch": 6.236994219653179, "grad_norm": 0.042387206107378006, "learning_rate": 9.267256168118008e-05, "loss": 0.0028, "step": 10790 }, { "epoch": 6.242774566473988, "grad_norm": 0.06473665684461594, "learning_rate": 9.265531748236463e-05, "loss": 0.0031, "step": 10800 }, { "epoch": 6.2485549132947975, "grad_norm": 0.04370260611176491, "learning_rate": 9.263805462466416e-05, "loss": 0.0037, "step": 10810 }, { "epoch": 6.254335260115607, "grad_norm": 0.034692343324422836, "learning_rate": 9.262077311562998e-05, "loss": 0.0026, "step": 10820 }, { "epoch": 6.2601156069364166, "grad_norm": 0.06662782281637192, "learning_rate": 9.260347296282165e-05, "loss": 0.0038, "step": 10830 }, { "epoch": 6.265895953757226, "grad_norm": 0.05651827156543732, "learning_rate": 9.258615417380683e-05, "loss": 0.0028, "step": 10840 }, { "epoch": 6.271676300578035, "grad_norm": 0.07076770067214966, "learning_rate": 9.256881675616133e-05, "loss": 0.0036, "step": 10850 }, { "epoch": 6.277456647398844, "grad_norm": 0.06414354592561722, "learning_rate": 9.255146071746917e-05, "loss": 0.0036, "step": 10860 }, { "epoch": 6.283236994219653, "grad_norm": 0.06279256939888, "learning_rate": 9.253408606532241e-05, "loss": 0.0033, "step": 10870 }, { "epoch": 6.289017341040463, "grad_norm": 0.07311423867940903, "learning_rate": 9.251669280732137e-05, "loss": 0.0032, "step": 10880 }, { "epoch": 6.294797687861272, "grad_norm": 0.07605946063995361, "learning_rate": 9.249928095107441e-05, "loss": 0.0034, "step": 10890 }, { "epoch": 6.300578034682081, "grad_norm": 0.05828089267015457, "learning_rate": 9.248185050419811e-05, "loss": 0.0033, "step": 10900 }, { "epoch": 6.30635838150289, "grad_norm": 0.0422709695994854, "learning_rate": 9.24644014743171e-05, "loss": 0.004, "step": 10910 }, { "epoch": 6.312138728323699, "grad_norm": 0.08373105525970459, "learning_rate": 9.24469338690642e-05, "loss": 0.0034, "step": 10920 }, { "epoch": 6.317919075144509, "grad_norm": 0.04995943605899811, "learning_rate": 9.242944769608033e-05, "loss": 0.0033, "step": 10930 }, { "epoch": 6.323699421965318, "grad_norm": 0.06954579800367355, "learning_rate": 9.241194296301454e-05, "loss": 0.003, "step": 10940 }, { "epoch": 6.329479768786127, "grad_norm": 0.0585656501352787, "learning_rate": 9.239441967752397e-05, "loss": 0.0051, "step": 10950 }, { "epoch": 6.335260115606936, "grad_norm": 0.05706046521663666, "learning_rate": 9.237687784727393e-05, "loss": 0.0033, "step": 10960 }, { "epoch": 6.341040462427745, "grad_norm": 0.06931231170892715, "learning_rate": 9.235931747993781e-05, "loss": 0.0039, "step": 10970 }, { "epoch": 6.3468208092485545, "grad_norm": 0.0789090096950531, "learning_rate": 9.234173858319707e-05, "loss": 0.0033, "step": 10980 }, { "epoch": 6.3526011560693645, "grad_norm": 0.08950246125459671, "learning_rate": 9.23241411647414e-05, "loss": 0.0031, "step": 10990 }, { "epoch": 6.358381502890174, "grad_norm": 0.04269541800022125, "learning_rate": 9.230652523226841e-05, "loss": 0.0028, "step": 11000 }, { "epoch": 6.364161849710983, "grad_norm": 0.08877093344926834, "learning_rate": 9.2288890793484e-05, "loss": 0.0036, "step": 11010 }, { "epoch": 6.369942196531792, "grad_norm": 0.11143888533115387, "learning_rate": 9.227123785610199e-05, "loss": 0.0043, "step": 11020 }, { "epoch": 6.375722543352601, "grad_norm": 0.09317649900913239, "learning_rate": 9.225356642784443e-05, "loss": 0.0061, "step": 11030 }, { "epoch": 6.381502890173411, "grad_norm": 0.09370527416467667, "learning_rate": 9.223587651644139e-05, "loss": 0.0039, "step": 11040 }, { "epoch": 6.38728323699422, "grad_norm": 0.10189167410135269, "learning_rate": 9.221816812963104e-05, "loss": 0.0038, "step": 11050 }, { "epoch": 6.393063583815029, "grad_norm": 0.14939944446086884, "learning_rate": 9.22004412751596e-05, "loss": 0.0034, "step": 11060 }, { "epoch": 6.398843930635838, "grad_norm": 0.09085329622030258, "learning_rate": 9.218269596078146e-05, "loss": 0.0035, "step": 11070 }, { "epoch": 6.404624277456647, "grad_norm": 0.09412657469511032, "learning_rate": 9.216493219425895e-05, "loss": 0.0071, "step": 11080 }, { "epoch": 6.410404624277457, "grad_norm": 0.06841382384300232, "learning_rate": 9.21471499833626e-05, "loss": 0.004, "step": 11090 }, { "epoch": 6.416184971098266, "grad_norm": 0.07971781492233276, "learning_rate": 9.212934933587094e-05, "loss": 0.0033, "step": 11100 }, { "epoch": 6.421965317919075, "grad_norm": 0.07304677367210388, "learning_rate": 9.211153025957056e-05, "loss": 0.0034, "step": 11110 }, { "epoch": 6.427745664739884, "grad_norm": 0.08593588322401047, "learning_rate": 9.209369276225614e-05, "loss": 0.0033, "step": 11120 }, { "epoch": 6.433526011560693, "grad_norm": 0.12527289986610413, "learning_rate": 9.207583685173042e-05, "loss": 0.0043, "step": 11130 }, { "epoch": 6.4393063583815024, "grad_norm": 0.10674163699150085, "learning_rate": 9.205796253580417e-05, "loss": 0.0031, "step": 11140 }, { "epoch": 6.445086705202312, "grad_norm": 0.0938422828912735, "learning_rate": 9.204006982229621e-05, "loss": 0.0032, "step": 11150 }, { "epoch": 6.4508670520231215, "grad_norm": 0.09737176448106766, "learning_rate": 9.202215871903346e-05, "loss": 0.003, "step": 11160 }, { "epoch": 6.456647398843931, "grad_norm": 0.06158210337162018, "learning_rate": 9.20042292338508e-05, "loss": 0.0042, "step": 11170 }, { "epoch": 6.46242774566474, "grad_norm": 0.08288124948740005, "learning_rate": 9.198628137459123e-05, "loss": 0.0032, "step": 11180 }, { "epoch": 6.468208092485549, "grad_norm": 0.06456904113292694, "learning_rate": 9.196831514910572e-05, "loss": 0.004, "step": 11190 }, { "epoch": 6.473988439306359, "grad_norm": 0.0678388774394989, "learning_rate": 9.195033056525332e-05, "loss": 0.0027, "step": 11200 }, { "epoch": 6.479768786127168, "grad_norm": 0.059251368045806885, "learning_rate": 9.193232763090114e-05, "loss": 0.0032, "step": 11210 }, { "epoch": 6.485549132947977, "grad_norm": 0.060957446694374084, "learning_rate": 9.191430635392422e-05, "loss": 0.0031, "step": 11220 }, { "epoch": 6.491329479768786, "grad_norm": 0.07789164781570435, "learning_rate": 9.18962667422057e-05, "loss": 0.0034, "step": 11230 }, { "epoch": 6.497109826589595, "grad_norm": 0.0698968842625618, "learning_rate": 9.187820880363671e-05, "loss": 0.0033, "step": 11240 }, { "epoch": 6.502890173410405, "grad_norm": 0.0620306022465229, "learning_rate": 9.186013254611643e-05, "loss": 0.0028, "step": 11250 }, { "epoch": 6.508670520231214, "grad_norm": 0.04711582511663437, "learning_rate": 9.1842037977552e-05, "loss": 0.0031, "step": 11260 }, { "epoch": 6.514450867052023, "grad_norm": 0.045567553490400314, "learning_rate": 9.182392510585862e-05, "loss": 0.0028, "step": 11270 }, { "epoch": 6.520231213872832, "grad_norm": 0.09069813787937164, "learning_rate": 9.180579393895946e-05, "loss": 0.0028, "step": 11280 }, { "epoch": 6.526011560693641, "grad_norm": 0.08624447137117386, "learning_rate": 9.178764448478572e-05, "loss": 0.0029, "step": 11290 }, { "epoch": 6.531791907514451, "grad_norm": 0.07226696610450745, "learning_rate": 9.176947675127658e-05, "loss": 0.0031, "step": 11300 }, { "epoch": 6.53757225433526, "grad_norm": 0.07378704845905304, "learning_rate": 9.175129074637924e-05, "loss": 0.0069, "step": 11310 }, { "epoch": 6.543352601156069, "grad_norm": 0.08021886646747589, "learning_rate": 9.173308647804884e-05, "loss": 0.0045, "step": 11320 }, { "epoch": 6.5491329479768785, "grad_norm": 0.06382913887500763, "learning_rate": 9.171486395424859e-05, "loss": 0.0024, "step": 11330 }, { "epoch": 6.554913294797688, "grad_norm": 0.08531121909618378, "learning_rate": 9.16966231829496e-05, "loss": 0.0036, "step": 11340 }, { "epoch": 6.5606936416184976, "grad_norm": 0.0522238165140152, "learning_rate": 9.167836417213105e-05, "loss": 0.0033, "step": 11350 }, { "epoch": 6.566473988439307, "grad_norm": 0.06926757097244263, "learning_rate": 9.166008692978001e-05, "loss": 0.0028, "step": 11360 }, { "epoch": 6.572254335260116, "grad_norm": 0.038560591638088226, "learning_rate": 9.164179146389158e-05, "loss": 0.0027, "step": 11370 }, { "epoch": 6.578034682080925, "grad_norm": 0.05434241518378258, "learning_rate": 9.162347778246882e-05, "loss": 0.0024, "step": 11380 }, { "epoch": 6.583815028901734, "grad_norm": 0.053169794380664825, "learning_rate": 9.160514589352276e-05, "loss": 0.0034, "step": 11390 }, { "epoch": 6.589595375722544, "grad_norm": 0.07566075026988983, "learning_rate": 9.15867958050724e-05, "loss": 0.003, "step": 11400 }, { "epoch": 6.595375722543353, "grad_norm": 0.05034327879548073, "learning_rate": 9.156842752514466e-05, "loss": 0.0031, "step": 11410 }, { "epoch": 6.601156069364162, "grad_norm": 0.05703079700469971, "learning_rate": 9.155004106177447e-05, "loss": 0.003, "step": 11420 }, { "epoch": 6.606936416184971, "grad_norm": 0.1105022132396698, "learning_rate": 9.153163642300471e-05, "loss": 0.004, "step": 11430 }, { "epoch": 6.61271676300578, "grad_norm": 0.10962982475757599, "learning_rate": 9.151321361688616e-05, "loss": 0.0033, "step": 11440 }, { "epoch": 6.618497109826589, "grad_norm": 0.05263747274875641, "learning_rate": 9.149477265147762e-05, "loss": 0.0029, "step": 11450 }, { "epoch": 6.624277456647399, "grad_norm": 0.09197230637073517, "learning_rate": 9.147631353484574e-05, "loss": 0.0038, "step": 11460 }, { "epoch": 6.630057803468208, "grad_norm": 0.0773550271987915, "learning_rate": 9.145783627506522e-05, "loss": 0.0035, "step": 11470 }, { "epoch": 6.635838150289017, "grad_norm": 0.11397737264633179, "learning_rate": 9.143934088021861e-05, "loss": 0.0041, "step": 11480 }, { "epoch": 6.641618497109826, "grad_norm": 0.10318799316883087, "learning_rate": 9.142082735839645e-05, "loss": 0.0042, "step": 11490 }, { "epoch": 6.6473988439306355, "grad_norm": 0.08106772601604462, "learning_rate": 9.140229571769715e-05, "loss": 0.0032, "step": 11500 }, { "epoch": 6.653179190751445, "grad_norm": 0.08967715501785278, "learning_rate": 9.138374596622709e-05, "loss": 0.0033, "step": 11510 }, { "epoch": 6.658959537572255, "grad_norm": 0.06483248621225357, "learning_rate": 9.136517811210059e-05, "loss": 0.0026, "step": 11520 }, { "epoch": 6.664739884393064, "grad_norm": 0.042875584214925766, "learning_rate": 9.134659216343984e-05, "loss": 0.0034, "step": 11530 }, { "epoch": 6.670520231213873, "grad_norm": 0.0676209107041359, "learning_rate": 9.132798812837494e-05, "loss": 0.0039, "step": 11540 }, { "epoch": 6.676300578034682, "grad_norm": 0.1124272495508194, "learning_rate": 9.130936601504396e-05, "loss": 0.0032, "step": 11550 }, { "epoch": 6.682080924855491, "grad_norm": 0.09340932220220566, "learning_rate": 9.129072583159284e-05, "loss": 0.0034, "step": 11560 }, { "epoch": 6.687861271676301, "grad_norm": 0.10958612710237503, "learning_rate": 9.127206758617542e-05, "loss": 0.0034, "step": 11570 }, { "epoch": 6.69364161849711, "grad_norm": 0.09023173898458481, "learning_rate": 9.125339128695346e-05, "loss": 0.0044, "step": 11580 }, { "epoch": 6.699421965317919, "grad_norm": 0.06543544679880142, "learning_rate": 9.123469694209659e-05, "loss": 0.003, "step": 11590 }, { "epoch": 6.705202312138728, "grad_norm": 0.09502831101417542, "learning_rate": 9.121598455978239e-05, "loss": 0.0041, "step": 11600 }, { "epoch": 6.710982658959537, "grad_norm": 0.05420244112610817, "learning_rate": 9.119725414819624e-05, "loss": 0.0029, "step": 11610 }, { "epoch": 6.716763005780347, "grad_norm": 0.09080252051353455, "learning_rate": 9.117850571553149e-05, "loss": 0.0029, "step": 11620 }, { "epoch": 6.722543352601156, "grad_norm": 0.11371125280857086, "learning_rate": 9.115973926998935e-05, "loss": 0.0039, "step": 11630 }, { "epoch": 6.728323699421965, "grad_norm": 0.07286228984594345, "learning_rate": 9.114095481977888e-05, "loss": 0.0034, "step": 11640 }, { "epoch": 6.734104046242774, "grad_norm": 0.06555897742509842, "learning_rate": 9.112215237311703e-05, "loss": 0.0032, "step": 11650 }, { "epoch": 6.7398843930635834, "grad_norm": 0.07509700953960419, "learning_rate": 9.110333193822867e-05, "loss": 0.0041, "step": 11660 }, { "epoch": 6.745664739884393, "grad_norm": 0.042179595679044724, "learning_rate": 9.108449352334645e-05, "loss": 0.003, "step": 11670 }, { "epoch": 6.7514450867052025, "grad_norm": 0.07306253910064697, "learning_rate": 9.106563713671094e-05, "loss": 0.0032, "step": 11680 }, { "epoch": 6.757225433526012, "grad_norm": 0.04574565216898918, "learning_rate": 9.104676278657061e-05, "loss": 0.0028, "step": 11690 }, { "epoch": 6.763005780346821, "grad_norm": 0.059894826263189316, "learning_rate": 9.102787048118169e-05, "loss": 0.0041, "step": 11700 }, { "epoch": 6.76878612716763, "grad_norm": 0.05459150671958923, "learning_rate": 9.100896022880834e-05, "loss": 0.0027, "step": 11710 }, { "epoch": 6.77456647398844, "grad_norm": 0.10843174159526825, "learning_rate": 9.099003203772254e-05, "loss": 0.0052, "step": 11720 }, { "epoch": 6.780346820809249, "grad_norm": 0.10990060865879059, "learning_rate": 9.097108591620413e-05, "loss": 0.0046, "step": 11730 }, { "epoch": 6.786127167630058, "grad_norm": 0.07287583500146866, "learning_rate": 9.095212187254078e-05, "loss": 0.0062, "step": 11740 }, { "epoch": 6.791907514450867, "grad_norm": 0.061475008726119995, "learning_rate": 9.093313991502801e-05, "loss": 0.0038, "step": 11750 }, { "epoch": 6.797687861271676, "grad_norm": 0.10422157496213913, "learning_rate": 9.091414005196917e-05, "loss": 0.0046, "step": 11760 }, { "epoch": 6.803468208092486, "grad_norm": 0.10327991098165512, "learning_rate": 9.089512229167545e-05, "loss": 0.0034, "step": 11770 }, { "epoch": 6.809248554913295, "grad_norm": 0.06790751963853836, "learning_rate": 9.087608664246587e-05, "loss": 0.0032, "step": 11780 }, { "epoch": 6.815028901734104, "grad_norm": 0.0866834744811058, "learning_rate": 9.085703311266727e-05, "loss": 0.0032, "step": 11790 }, { "epoch": 6.820809248554913, "grad_norm": 0.11491991579532623, "learning_rate": 9.083796171061429e-05, "loss": 0.0043, "step": 11800 }, { "epoch": 6.826589595375722, "grad_norm": 0.09453996270895004, "learning_rate": 9.081887244464941e-05, "loss": 0.0043, "step": 11810 }, { "epoch": 6.832369942196532, "grad_norm": 0.09952332824468613, "learning_rate": 9.079976532312297e-05, "loss": 0.0038, "step": 11820 }, { "epoch": 6.838150289017341, "grad_norm": 0.05078454315662384, "learning_rate": 9.078064035439301e-05, "loss": 0.0026, "step": 11830 }, { "epoch": 6.84393063583815, "grad_norm": 0.04266422986984253, "learning_rate": 9.07614975468255e-05, "loss": 0.0039, "step": 11840 }, { "epoch": 6.8497109826589595, "grad_norm": 0.058598946779966354, "learning_rate": 9.074233690879412e-05, "loss": 0.003, "step": 11850 }, { "epoch": 6.855491329479769, "grad_norm": 0.06515590101480484, "learning_rate": 9.072315844868038e-05, "loss": 0.0027, "step": 11860 }, { "epoch": 6.861271676300578, "grad_norm": 0.059226710349321365, "learning_rate": 9.07039621748736e-05, "loss": 0.0046, "step": 11870 }, { "epoch": 6.867052023121388, "grad_norm": 0.07862482964992523, "learning_rate": 9.06847480957709e-05, "loss": 0.0026, "step": 11880 }, { "epoch": 6.872832369942197, "grad_norm": 0.11240318417549133, "learning_rate": 9.066551621977713e-05, "loss": 0.0029, "step": 11890 }, { "epoch": 6.878612716763006, "grad_norm": 0.09790018200874329, "learning_rate": 9.064626655530501e-05, "loss": 0.0029, "step": 11900 }, { "epoch": 6.884393063583815, "grad_norm": 0.08414393663406372, "learning_rate": 9.062699911077497e-05, "loss": 0.0027, "step": 11910 }, { "epoch": 6.890173410404624, "grad_norm": 0.044531289488077164, "learning_rate": 9.060771389461524e-05, "loss": 0.003, "step": 11920 }, { "epoch": 6.895953757225434, "grad_norm": 0.08173054456710815, "learning_rate": 9.058841091526187e-05, "loss": 0.0036, "step": 11930 }, { "epoch": 6.901734104046243, "grad_norm": 0.07367779314517975, "learning_rate": 9.056909018115858e-05, "loss": 0.0037, "step": 11940 }, { "epoch": 6.907514450867052, "grad_norm": 0.06203809008002281, "learning_rate": 9.054975170075697e-05, "loss": 0.0035, "step": 11950 }, { "epoch": 6.913294797687861, "grad_norm": 0.045646168291568756, "learning_rate": 9.053039548251631e-05, "loss": 0.0026, "step": 11960 }, { "epoch": 6.91907514450867, "grad_norm": 0.05977427959442139, "learning_rate": 9.051102153490368e-05, "loss": 0.0064, "step": 11970 }, { "epoch": 6.924855491329479, "grad_norm": 0.09023835510015488, "learning_rate": 9.04916298663939e-05, "loss": 0.0037, "step": 11980 }, { "epoch": 6.930635838150289, "grad_norm": 0.0789908617734909, "learning_rate": 9.047222048546955e-05, "loss": 0.0033, "step": 11990 }, { "epoch": 6.936416184971098, "grad_norm": 0.06687037646770477, "learning_rate": 9.045279340062097e-05, "loss": 0.0046, "step": 12000 }, { "epoch": 6.942196531791907, "grad_norm": 0.0676008015871048, "learning_rate": 9.043334862034618e-05, "loss": 0.003, "step": 12010 }, { "epoch": 6.9479768786127165, "grad_norm": 0.08746353536844254, "learning_rate": 9.041388615315102e-05, "loss": 0.0032, "step": 12020 }, { "epoch": 6.953757225433526, "grad_norm": 0.050007738173007965, "learning_rate": 9.039440600754905e-05, "loss": 0.0031, "step": 12030 }, { "epoch": 6.959537572254336, "grad_norm": 0.05344380810856819, "learning_rate": 9.037490819206151e-05, "loss": 0.0032, "step": 12040 }, { "epoch": 6.965317919075145, "grad_norm": 0.05119822919368744, "learning_rate": 9.035539271521744e-05, "loss": 0.0027, "step": 12050 }, { "epoch": 6.971098265895954, "grad_norm": 0.057583969086408615, "learning_rate": 9.033585958555356e-05, "loss": 0.0028, "step": 12060 }, { "epoch": 6.976878612716763, "grad_norm": 0.05539444088935852, "learning_rate": 9.031630881161431e-05, "loss": 0.004, "step": 12070 }, { "epoch": 6.982658959537572, "grad_norm": 0.056936319917440414, "learning_rate": 9.029674040195186e-05, "loss": 0.0029, "step": 12080 }, { "epoch": 6.988439306358382, "grad_norm": 0.057017434388399124, "learning_rate": 9.027715436512613e-05, "loss": 0.003, "step": 12090 }, { "epoch": 6.994219653179191, "grad_norm": 0.0817522257566452, "learning_rate": 9.02575507097047e-05, "loss": 0.0049, "step": 12100 }, { "epoch": 7.0, "grad_norm": 0.05931788310408592, "learning_rate": 9.023792944426286e-05, "loss": 0.0028, "step": 12110 }, { "epoch": 7.005780346820809, "grad_norm": 0.04272283613681793, "learning_rate": 9.021829057738364e-05, "loss": 0.0029, "step": 12120 }, { "epoch": 7.011560693641618, "grad_norm": 0.06267021596431732, "learning_rate": 9.019863411765775e-05, "loss": 0.0034, "step": 12130 }, { "epoch": 7.017341040462428, "grad_norm": 0.0873156264424324, "learning_rate": 9.017896007368357e-05, "loss": 0.0037, "step": 12140 }, { "epoch": 7.023121387283237, "grad_norm": 0.08396241068840027, "learning_rate": 9.015926845406722e-05, "loss": 0.0038, "step": 12150 }, { "epoch": 7.028901734104046, "grad_norm": 0.10236790031194687, "learning_rate": 9.013955926742245e-05, "loss": 0.0031, "step": 12160 }, { "epoch": 7.034682080924855, "grad_norm": 0.06778319180011749, "learning_rate": 9.011983252237077e-05, "loss": 0.0025, "step": 12170 }, { "epoch": 7.040462427745664, "grad_norm": 0.04483620822429657, "learning_rate": 9.01000882275413e-05, "loss": 0.0028, "step": 12180 }, { "epoch": 7.046242774566474, "grad_norm": 0.06018118932843208, "learning_rate": 9.008032639157088e-05, "loss": 0.0029, "step": 12190 }, { "epoch": 7.0520231213872835, "grad_norm": 0.05943775549530983, "learning_rate": 9.006054702310401e-05, "loss": 0.0044, "step": 12200 }, { "epoch": 7.057803468208093, "grad_norm": 0.09683380275964737, "learning_rate": 9.004075013079283e-05, "loss": 0.0031, "step": 12210 }, { "epoch": 7.063583815028902, "grad_norm": 0.055581022053956985, "learning_rate": 9.00209357232972e-05, "loss": 0.0031, "step": 12220 }, { "epoch": 7.069364161849711, "grad_norm": 0.08908397704362869, "learning_rate": 9.000110380928461e-05, "loss": 0.0046, "step": 12230 }, { "epoch": 7.07514450867052, "grad_norm": 0.09782064706087112, "learning_rate": 8.998125439743021e-05, "loss": 0.0033, "step": 12240 }, { "epoch": 7.08092485549133, "grad_norm": 0.06307903677225113, "learning_rate": 8.996138749641682e-05, "loss": 0.0026, "step": 12250 }, { "epoch": 7.086705202312139, "grad_norm": 0.10771429538726807, "learning_rate": 8.994150311493488e-05, "loss": 0.0041, "step": 12260 }, { "epoch": 7.092485549132948, "grad_norm": 0.07503542304039001, "learning_rate": 8.992160126168247e-05, "loss": 0.003, "step": 12270 }, { "epoch": 7.098265895953757, "grad_norm": 0.06886713206768036, "learning_rate": 8.99016819453654e-05, "loss": 0.0031, "step": 12280 }, { "epoch": 7.104046242774566, "grad_norm": 0.0587981753051281, "learning_rate": 8.988174517469702e-05, "loss": 0.0037, "step": 12290 }, { "epoch": 7.109826589595376, "grad_norm": 0.05444381386041641, "learning_rate": 8.986179095839835e-05, "loss": 0.0046, "step": 12300 }, { "epoch": 7.115606936416185, "grad_norm": 0.07300411909818649, "learning_rate": 8.984181930519804e-05, "loss": 0.003, "step": 12310 }, { "epoch": 7.121387283236994, "grad_norm": 0.059057146310806274, "learning_rate": 8.982183022383237e-05, "loss": 0.003, "step": 12320 }, { "epoch": 7.127167630057803, "grad_norm": 0.051803700625896454, "learning_rate": 8.980182372304525e-05, "loss": 0.0045, "step": 12330 }, { "epoch": 7.132947976878612, "grad_norm": 0.06521975994110107, "learning_rate": 8.97817998115882e-05, "loss": 0.0031, "step": 12340 }, { "epoch": 7.138728323699422, "grad_norm": 0.06626100838184357, "learning_rate": 8.976175849822038e-05, "loss": 0.0034, "step": 12350 }, { "epoch": 7.144508670520231, "grad_norm": 0.08046285808086395, "learning_rate": 8.97416997917085e-05, "loss": 0.0035, "step": 12360 }, { "epoch": 7.1502890173410405, "grad_norm": 0.07497113198041916, "learning_rate": 8.972162370082695e-05, "loss": 0.0029, "step": 12370 }, { "epoch": 7.15606936416185, "grad_norm": 0.08223730325698853, "learning_rate": 8.97015302343577e-05, "loss": 0.0034, "step": 12380 }, { "epoch": 7.161849710982659, "grad_norm": 0.054061759263277054, "learning_rate": 8.968141940109027e-05, "loss": 0.0028, "step": 12390 }, { "epoch": 7.167630057803469, "grad_norm": 0.061602234840393066, "learning_rate": 8.966129120982188e-05, "loss": 0.0054, "step": 12400 }, { "epoch": 7.173410404624278, "grad_norm": 0.08485258370637894, "learning_rate": 8.964114566935724e-05, "loss": 0.0036, "step": 12410 }, { "epoch": 7.179190751445087, "grad_norm": 0.07921281456947327, "learning_rate": 8.962098278850871e-05, "loss": 0.0035, "step": 12420 }, { "epoch": 7.184971098265896, "grad_norm": 0.05399646610021591, "learning_rate": 8.960080257609622e-05, "loss": 0.0038, "step": 12430 }, { "epoch": 7.190751445086705, "grad_norm": 0.07379326224327087, "learning_rate": 8.95806050409473e-05, "loss": 0.0037, "step": 12440 }, { "epoch": 7.196531791907514, "grad_norm": 0.09412387758493423, "learning_rate": 8.9560390191897e-05, "loss": 0.0032, "step": 12450 }, { "epoch": 7.202312138728324, "grad_norm": 0.05449094995856285, "learning_rate": 8.9540158037788e-05, "loss": 0.0032, "step": 12460 }, { "epoch": 7.208092485549133, "grad_norm": 0.0610949732363224, "learning_rate": 8.951990858747054e-05, "loss": 0.0029, "step": 12470 }, { "epoch": 7.213872832369942, "grad_norm": 0.05628536641597748, "learning_rate": 8.94996418498024e-05, "loss": 0.0056, "step": 12480 }, { "epoch": 7.219653179190751, "grad_norm": 0.06268458813428879, "learning_rate": 8.947935783364896e-05, "loss": 0.0053, "step": 12490 }, { "epoch": 7.22543352601156, "grad_norm": 0.06885628402233124, "learning_rate": 8.945905654788311e-05, "loss": 0.0031, "step": 12500 }, { "epoch": 7.23121387283237, "grad_norm": 0.10913290828466415, "learning_rate": 8.943873800138535e-05, "loss": 0.0043, "step": 12510 }, { "epoch": 7.236994219653179, "grad_norm": 0.0913657546043396, "learning_rate": 8.94184022030437e-05, "loss": 0.0042, "step": 12520 }, { "epoch": 7.242774566473988, "grad_norm": 0.08144153654575348, "learning_rate": 8.939804916175372e-05, "loss": 0.0027, "step": 12530 }, { "epoch": 7.2485549132947975, "grad_norm": 0.06376820802688599, "learning_rate": 8.93776788864185e-05, "loss": 0.0028, "step": 12540 }, { "epoch": 7.254335260115607, "grad_norm": 0.07514312118291855, "learning_rate": 8.935729138594873e-05, "loss": 0.003, "step": 12550 }, { "epoch": 7.2601156069364166, "grad_norm": 0.0901554524898529, "learning_rate": 8.933688666926258e-05, "loss": 0.0038, "step": 12560 }, { "epoch": 7.265895953757226, "grad_norm": 0.0682053342461586, "learning_rate": 8.931646474528575e-05, "loss": 0.0027, "step": 12570 }, { "epoch": 7.271676300578035, "grad_norm": 0.07660944759845734, "learning_rate": 8.929602562295151e-05, "loss": 0.0029, "step": 12580 }, { "epoch": 7.277456647398844, "grad_norm": 0.08285539597272873, "learning_rate": 8.92755693112006e-05, "loss": 0.0051, "step": 12590 }, { "epoch": 7.283236994219653, "grad_norm": 0.05912092328071594, "learning_rate": 8.925509581898136e-05, "loss": 0.0028, "step": 12600 }, { "epoch": 7.289017341040463, "grad_norm": 0.07262987643480301, "learning_rate": 8.923460515524951e-05, "loss": 0.0038, "step": 12610 }, { "epoch": 7.294797687861272, "grad_norm": 0.06825876981019974, "learning_rate": 8.921409732896842e-05, "loss": 0.003, "step": 12620 }, { "epoch": 7.300578034682081, "grad_norm": 0.0889398455619812, "learning_rate": 8.919357234910887e-05, "loss": 0.0033, "step": 12630 }, { "epoch": 7.30635838150289, "grad_norm": 0.050018060952425, "learning_rate": 8.917303022464923e-05, "loss": 0.003, "step": 12640 }, { "epoch": 7.312138728323699, "grad_norm": 0.08514773845672607, "learning_rate": 8.915247096457531e-05, "loss": 0.0037, "step": 12650 }, { "epoch": 7.317919075144509, "grad_norm": 0.05785389617085457, "learning_rate": 8.91318945778804e-05, "loss": 0.0031, "step": 12660 }, { "epoch": 7.323699421965318, "grad_norm": 0.06569620966911316, "learning_rate": 8.911130107356534e-05, "loss": 0.0068, "step": 12670 }, { "epoch": 7.329479768786127, "grad_norm": 0.08869968354701996, "learning_rate": 8.90906904606384e-05, "loss": 0.0029, "step": 12680 }, { "epoch": 7.335260115606936, "grad_norm": 0.08334045857191086, "learning_rate": 8.90700627481154e-05, "loss": 0.0046, "step": 12690 }, { "epoch": 7.341040462427745, "grad_norm": 0.09049692749977112, "learning_rate": 8.904941794501957e-05, "loss": 0.0041, "step": 12700 }, { "epoch": 7.3468208092485545, "grad_norm": 0.07856415212154388, "learning_rate": 8.902875606038166e-05, "loss": 0.0045, "step": 12710 }, { "epoch": 7.3526011560693645, "grad_norm": 0.11439894139766693, "learning_rate": 8.900807710323989e-05, "loss": 0.0042, "step": 12720 }, { "epoch": 7.358381502890174, "grad_norm": 0.08839484304189682, "learning_rate": 8.898738108263993e-05, "loss": 0.0042, "step": 12730 }, { "epoch": 7.364161849710983, "grad_norm": 0.07368623465299606, "learning_rate": 8.896666800763491e-05, "loss": 0.003, "step": 12740 }, { "epoch": 7.369942196531792, "grad_norm": 0.061431001871824265, "learning_rate": 8.894593788728546e-05, "loss": 0.0036, "step": 12750 }, { "epoch": 7.375722543352601, "grad_norm": 0.05455136299133301, "learning_rate": 8.892519073065961e-05, "loss": 0.0036, "step": 12760 }, { "epoch": 7.381502890173411, "grad_norm": 0.06318709254264832, "learning_rate": 8.89044265468329e-05, "loss": 0.0034, "step": 12770 }, { "epoch": 7.38728323699422, "grad_norm": 0.0825120210647583, "learning_rate": 8.888364534488827e-05, "loss": 0.0039, "step": 12780 }, { "epoch": 7.393063583815029, "grad_norm": 0.07053670287132263, "learning_rate": 8.886284713391613e-05, "loss": 0.0023, "step": 12790 }, { "epoch": 7.398843930635838, "grad_norm": 0.17149056494235992, "learning_rate": 8.884203192301431e-05, "loss": 0.0044, "step": 12800 }, { "epoch": 7.404624277456647, "grad_norm": 0.08933155238628387, "learning_rate": 8.88211997212881e-05, "loss": 0.0052, "step": 12810 }, { "epoch": 7.410404624277457, "grad_norm": 0.11136192828416824, "learning_rate": 8.880035053785023e-05, "loss": 0.0049, "step": 12820 }, { "epoch": 7.416184971098266, "grad_norm": 0.08717218786478043, "learning_rate": 8.877948438182083e-05, "loss": 0.0054, "step": 12830 }, { "epoch": 7.421965317919075, "grad_norm": 0.09373895078897476, "learning_rate": 8.875860126232745e-05, "loss": 0.0034, "step": 12840 }, { "epoch": 7.427745664739884, "grad_norm": 0.09812264889478683, "learning_rate": 8.87377011885051e-05, "loss": 0.0032, "step": 12850 }, { "epoch": 7.433526011560693, "grad_norm": 0.07721059024333954, "learning_rate": 8.871678416949617e-05, "loss": 0.0034, "step": 12860 }, { "epoch": 7.4393063583815024, "grad_norm": 0.06405416131019592, "learning_rate": 8.869585021445046e-05, "loss": 0.0038, "step": 12870 }, { "epoch": 7.445086705202312, "grad_norm": 0.07349882274866104, "learning_rate": 8.867489933252521e-05, "loss": 0.0031, "step": 12880 }, { "epoch": 7.4508670520231215, "grad_norm": 0.06702619791030884, "learning_rate": 8.865393153288504e-05, "loss": 0.0028, "step": 12890 }, { "epoch": 7.456647398843931, "grad_norm": 0.09085283428430557, "learning_rate": 8.8632946824702e-05, "loss": 0.0036, "step": 12900 }, { "epoch": 7.46242774566474, "grad_norm": 0.07159951329231262, "learning_rate": 8.86119452171555e-05, "loss": 0.0035, "step": 12910 }, { "epoch": 7.468208092485549, "grad_norm": 0.051984332501888275, "learning_rate": 8.859092671943234e-05, "loss": 0.0025, "step": 12920 }, { "epoch": 7.473988439306359, "grad_norm": 0.0591067373752594, "learning_rate": 8.856989134072676e-05, "loss": 0.0025, "step": 12930 }, { "epoch": 7.479768786127168, "grad_norm": 0.06747888028621674, "learning_rate": 8.85488390902403e-05, "loss": 0.0029, "step": 12940 }, { "epoch": 7.485549132947977, "grad_norm": 0.07091999799013138, "learning_rate": 8.852776997718199e-05, "loss": 0.0042, "step": 12950 }, { "epoch": 7.491329479768786, "grad_norm": 0.06534814089536667, "learning_rate": 8.850668401076812e-05, "loss": 0.003, "step": 12960 }, { "epoch": 7.497109826589595, "grad_norm": 0.07369142025709152, "learning_rate": 8.848558120022246e-05, "loss": 0.0034, "step": 12970 }, { "epoch": 7.502890173410405, "grad_norm": 0.07442428171634674, "learning_rate": 8.846446155477603e-05, "loss": 0.0035, "step": 12980 }, { "epoch": 7.508670520231214, "grad_norm": 0.08190198242664337, "learning_rate": 8.844332508366735e-05, "loss": 0.0024, "step": 12990 }, { "epoch": 7.514450867052023, "grad_norm": 0.06545107066631317, "learning_rate": 8.84221717961422e-05, "loss": 0.0031, "step": 13000 }, { "epoch": 7.520231213872832, "grad_norm": 0.0414169579744339, "learning_rate": 8.840100170145374e-05, "loss": 0.0026, "step": 13010 }, { "epoch": 7.526011560693641, "grad_norm": 0.05502517521381378, "learning_rate": 8.837981480886249e-05, "loss": 0.0029, "step": 13020 }, { "epoch": 7.531791907514451, "grad_norm": 0.07878468185663223, "learning_rate": 8.835861112763633e-05, "loss": 0.0026, "step": 13030 }, { "epoch": 7.53757225433526, "grad_norm": 0.059847693890333176, "learning_rate": 8.833739066705044e-05, "loss": 0.003, "step": 13040 }, { "epoch": 7.543352601156069, "grad_norm": 0.06614434719085693, "learning_rate": 8.831615343638742e-05, "loss": 0.004, "step": 13050 }, { "epoch": 7.5491329479768785, "grad_norm": 0.0676717534661293, "learning_rate": 8.829489944493711e-05, "loss": 0.0026, "step": 13060 }, { "epoch": 7.554913294797688, "grad_norm": 0.04824170470237732, "learning_rate": 8.827362870199675e-05, "loss": 0.0031, "step": 13070 }, { "epoch": 7.5606936416184976, "grad_norm": 0.0658811405301094, "learning_rate": 8.825234121687089e-05, "loss": 0.0036, "step": 13080 }, { "epoch": 7.566473988439307, "grad_norm": 0.04927695915102959, "learning_rate": 8.823103699887139e-05, "loss": 0.0024, "step": 13090 }, { "epoch": 7.572254335260116, "grad_norm": 0.06484200805425644, "learning_rate": 8.820971605731745e-05, "loss": 0.0049, "step": 13100 }, { "epoch": 7.578034682080925, "grad_norm": 0.0489911250770092, "learning_rate": 8.818837840153556e-05, "loss": 0.0029, "step": 13110 }, { "epoch": 7.583815028901734, "grad_norm": 0.0724041536450386, "learning_rate": 8.816702404085952e-05, "loss": 0.0037, "step": 13120 }, { "epoch": 7.589595375722544, "grad_norm": 0.06730324774980545, "learning_rate": 8.814565298463048e-05, "loss": 0.0034, "step": 13130 }, { "epoch": 7.595375722543353, "grad_norm": 0.040589164942502975, "learning_rate": 8.812426524219688e-05, "loss": 0.0066, "step": 13140 }, { "epoch": 7.601156069364162, "grad_norm": 0.06166701763868332, "learning_rate": 8.81028608229144e-05, "loss": 0.0029, "step": 13150 }, { "epoch": 7.606936416184971, "grad_norm": 0.0673922523856163, "learning_rate": 8.808143973614611e-05, "loss": 0.003, "step": 13160 }, { "epoch": 7.61271676300578, "grad_norm": 0.05457756295800209, "learning_rate": 8.806000199126228e-05, "loss": 0.0044, "step": 13170 }, { "epoch": 7.618497109826589, "grad_norm": 0.09412674605846405, "learning_rate": 8.803854759764052e-05, "loss": 0.003, "step": 13180 }, { "epoch": 7.624277456647399, "grad_norm": 0.10493400692939758, "learning_rate": 8.801707656466572e-05, "loss": 0.0046, "step": 13190 }, { "epoch": 7.630057803468208, "grad_norm": 0.13792482018470764, "learning_rate": 8.799558890173003e-05, "loss": 0.0049, "step": 13200 }, { "epoch": 7.635838150289017, "grad_norm": 0.0934421643614769, "learning_rate": 8.79740846182329e-05, "loss": 0.0036, "step": 13210 }, { "epoch": 7.641618497109826, "grad_norm": 0.0708174929022789, "learning_rate": 8.7952563723581e-05, "loss": 0.0065, "step": 13220 }, { "epoch": 7.6473988439306355, "grad_norm": 0.07073547691106796, "learning_rate": 8.793102622718834e-05, "loss": 0.0027, "step": 13230 }, { "epoch": 7.653179190751445, "grad_norm": 0.05832689628005028, "learning_rate": 8.790947213847613e-05, "loss": 0.0024, "step": 13240 }, { "epoch": 7.658959537572255, "grad_norm": 0.07920937985181808, "learning_rate": 8.788790146687286e-05, "loss": 0.005, "step": 13250 }, { "epoch": 7.664739884393064, "grad_norm": 0.0813925489783287, "learning_rate": 8.786631422181429e-05, "loss": 0.0043, "step": 13260 }, { "epoch": 7.670520231213873, "grad_norm": 0.13647060096263885, "learning_rate": 8.78447104127434e-05, "loss": 0.0042, "step": 13270 }, { "epoch": 7.676300578034682, "grad_norm": 0.09591488540172577, "learning_rate": 8.782309004911042e-05, "loss": 0.0037, "step": 13280 }, { "epoch": 7.682080924855491, "grad_norm": 0.09956178069114685, "learning_rate": 8.780145314037286e-05, "loss": 0.0029, "step": 13290 }, { "epoch": 7.687861271676301, "grad_norm": 0.05980099365115166, "learning_rate": 8.777979969599542e-05, "loss": 0.0032, "step": 13300 }, { "epoch": 7.69364161849711, "grad_norm": 0.09983234107494354, "learning_rate": 8.775812972545006e-05, "loss": 0.0047, "step": 13310 }, { "epoch": 7.699421965317919, "grad_norm": 0.11469981074333191, "learning_rate": 8.773644323821596e-05, "loss": 0.0033, "step": 13320 }, { "epoch": 7.705202312138728, "grad_norm": 0.07005388289690018, "learning_rate": 8.771474024377953e-05, "loss": 0.004, "step": 13330 }, { "epoch": 7.710982658959537, "grad_norm": 0.0865354910492897, "learning_rate": 8.769302075163438e-05, "loss": 0.0038, "step": 13340 }, { "epoch": 7.716763005780347, "grad_norm": 0.07273339480161667, "learning_rate": 8.767128477128137e-05, "loss": 0.0039, "step": 13350 }, { "epoch": 7.722543352601156, "grad_norm": 0.06498703360557556, "learning_rate": 8.764953231222854e-05, "loss": 0.0033, "step": 13360 }, { "epoch": 7.728323699421965, "grad_norm": 0.062098268419504166, "learning_rate": 8.762776338399119e-05, "loss": 0.0035, "step": 13370 }, { "epoch": 7.734104046242774, "grad_norm": 0.08519493788480759, "learning_rate": 8.760597799609176e-05, "loss": 0.0039, "step": 13380 }, { "epoch": 7.7398843930635834, "grad_norm": 0.05835792422294617, "learning_rate": 8.758417615805992e-05, "loss": 0.0037, "step": 13390 }, { "epoch": 7.745664739884393, "grad_norm": 0.05884012207388878, "learning_rate": 8.756235787943254e-05, "loss": 0.003, "step": 13400 }, { "epoch": 7.7514450867052025, "grad_norm": 0.062465496361255646, "learning_rate": 8.754052316975367e-05, "loss": 0.0034, "step": 13410 }, { "epoch": 7.757225433526012, "grad_norm": 0.12052380293607712, "learning_rate": 8.751867203857455e-05, "loss": 0.0037, "step": 13420 }, { "epoch": 7.763005780346821, "grad_norm": 0.11391422897577286, "learning_rate": 8.749680449545363e-05, "loss": 0.0045, "step": 13430 }, { "epoch": 7.76878612716763, "grad_norm": 0.07041367888450623, "learning_rate": 8.747492054995649e-05, "loss": 0.0035, "step": 13440 }, { "epoch": 7.77456647398844, "grad_norm": 0.0718047097325325, "learning_rate": 8.745302021165595e-05, "loss": 0.0031, "step": 13450 }, { "epoch": 7.780346820809249, "grad_norm": 0.0681358277797699, "learning_rate": 8.743110349013192e-05, "loss": 0.0029, "step": 13460 }, { "epoch": 7.786127167630058, "grad_norm": 0.0745391845703125, "learning_rate": 8.740917039497153e-05, "loss": 0.0032, "step": 13470 }, { "epoch": 7.791907514450867, "grad_norm": 0.07042954862117767, "learning_rate": 8.738722093576906e-05, "loss": 0.0034, "step": 13480 }, { "epoch": 7.797687861271676, "grad_norm": 0.06926736980676651, "learning_rate": 8.736525512212597e-05, "loss": 0.0027, "step": 13490 }, { "epoch": 7.803468208092486, "grad_norm": 0.06005188450217247, "learning_rate": 8.734327296365084e-05, "loss": 0.0029, "step": 13500 }, { "epoch": 7.809248554913295, "grad_norm": 0.04462320730090141, "learning_rate": 8.732127446995939e-05, "loss": 0.0029, "step": 13510 }, { "epoch": 7.815028901734104, "grad_norm": 0.03693057596683502, "learning_rate": 8.729925965067454e-05, "loss": 0.0029, "step": 13520 }, { "epoch": 7.820809248554913, "grad_norm": 0.06181991472840309, "learning_rate": 8.72772285154263e-05, "loss": 0.0027, "step": 13530 }, { "epoch": 7.826589595375722, "grad_norm": 0.06471090763807297, "learning_rate": 8.725518107385187e-05, "loss": 0.0027, "step": 13540 }, { "epoch": 7.832369942196532, "grad_norm": 0.0629395842552185, "learning_rate": 8.72331173355955e-05, "loss": 0.0024, "step": 13550 }, { "epoch": 7.838150289017341, "grad_norm": 0.060991790145635605, "learning_rate": 8.721103731030867e-05, "loss": 0.0036, "step": 13560 }, { "epoch": 7.84393063583815, "grad_norm": 0.1394331008195877, "learning_rate": 8.718894100764989e-05, "loss": 0.0031, "step": 13570 }, { "epoch": 7.8497109826589595, "grad_norm": 0.0855998545885086, "learning_rate": 8.716682843728485e-05, "loss": 0.0034, "step": 13580 }, { "epoch": 7.855491329479769, "grad_norm": 0.07635796070098877, "learning_rate": 8.714469960888634e-05, "loss": 0.0031, "step": 13590 }, { "epoch": 7.861271676300578, "grad_norm": 0.06049419194459915, "learning_rate": 8.712255453213427e-05, "loss": 0.0032, "step": 13600 }, { "epoch": 7.867052023121388, "grad_norm": 0.08044454455375671, "learning_rate": 8.710039321671563e-05, "loss": 0.0036, "step": 13610 }, { "epoch": 7.872832369942197, "grad_norm": 0.07877357304096222, "learning_rate": 8.707821567232456e-05, "loss": 0.0027, "step": 13620 }, { "epoch": 7.878612716763006, "grad_norm": 0.10369919240474701, "learning_rate": 8.705602190866225e-05, "loss": 0.004, "step": 13630 }, { "epoch": 7.884393063583815, "grad_norm": 0.09005500376224518, "learning_rate": 8.703381193543701e-05, "loss": 0.0032, "step": 13640 }, { "epoch": 7.890173410404624, "grad_norm": 0.05346602946519852, "learning_rate": 8.701158576236423e-05, "loss": 0.0026, "step": 13650 }, { "epoch": 7.895953757225434, "grad_norm": 0.05992775782942772, "learning_rate": 8.69893433991664e-05, "loss": 0.0027, "step": 13660 }, { "epoch": 7.901734104046243, "grad_norm": 0.07658679038286209, "learning_rate": 8.69670848555731e-05, "loss": 0.0026, "step": 13670 }, { "epoch": 7.907514450867052, "grad_norm": 0.0609201118350029, "learning_rate": 8.694481014132096e-05, "loss": 0.004, "step": 13680 }, { "epoch": 7.913294797687861, "grad_norm": 0.06367721408605576, "learning_rate": 8.69225192661537e-05, "loss": 0.003, "step": 13690 }, { "epoch": 7.91907514450867, "grad_norm": 0.06193988025188446, "learning_rate": 8.690021223982208e-05, "loss": 0.0034, "step": 13700 }, { "epoch": 7.924855491329479, "grad_norm": 0.08065782487392426, "learning_rate": 8.687788907208398e-05, "loss": 0.0039, "step": 13710 }, { "epoch": 7.930635838150289, "grad_norm": 0.06705836951732635, "learning_rate": 8.685554977270431e-05, "loss": 0.0036, "step": 13720 }, { "epoch": 7.936416184971098, "grad_norm": 0.08512771129608154, "learning_rate": 8.683319435145503e-05, "loss": 0.0032, "step": 13730 }, { "epoch": 7.942196531791907, "grad_norm": 0.05253278464078903, "learning_rate": 8.681082281811517e-05, "loss": 0.0025, "step": 13740 }, { "epoch": 7.9479768786127165, "grad_norm": 0.12167950719594955, "learning_rate": 8.67884351824708e-05, "loss": 0.0037, "step": 13750 }, { "epoch": 7.953757225433526, "grad_norm": 0.11908365786075592, "learning_rate": 8.676603145431501e-05, "loss": 0.0051, "step": 13760 }, { "epoch": 7.959537572254336, "grad_norm": 0.08992159366607666, "learning_rate": 8.674361164344799e-05, "loss": 0.003, "step": 13770 }, { "epoch": 7.965317919075145, "grad_norm": 0.0692647248506546, "learning_rate": 8.672117575967688e-05, "loss": 0.0027, "step": 13780 }, { "epoch": 7.971098265895954, "grad_norm": 0.0604802742600441, "learning_rate": 8.669872381281595e-05, "loss": 0.0032, "step": 13790 }, { "epoch": 7.976878612716763, "grad_norm": 0.05162026733160019, "learning_rate": 8.667625581268639e-05, "loss": 0.0023, "step": 13800 }, { "epoch": 7.982658959537572, "grad_norm": 0.04985089972615242, "learning_rate": 8.665377176911651e-05, "loss": 0.0024, "step": 13810 }, { "epoch": 7.988439306358382, "grad_norm": 0.04280465096235275, "learning_rate": 8.663127169194159e-05, "loss": 0.0025, "step": 13820 }, { "epoch": 7.994219653179191, "grad_norm": 0.04978175833821297, "learning_rate": 8.660875559100389e-05, "loss": 0.0031, "step": 13830 }, { "epoch": 8.0, "grad_norm": 0.06000877544283867, "learning_rate": 8.658622347615274e-05, "loss": 0.0029, "step": 13840 }, { "epoch": 8.00578034682081, "grad_norm": 0.07454288750886917, "learning_rate": 8.656367535724448e-05, "loss": 0.0025, "step": 13850 }, { "epoch": 8.011560693641618, "grad_norm": 0.061534419655799866, "learning_rate": 8.65411112441424e-05, "loss": 0.0042, "step": 13860 }, { "epoch": 8.017341040462428, "grad_norm": 0.07896114140748978, "learning_rate": 8.651853114671679e-05, "loss": 0.003, "step": 13870 }, { "epoch": 8.023121387283236, "grad_norm": 0.06675609946250916, "learning_rate": 8.649593507484499e-05, "loss": 0.0032, "step": 13880 }, { "epoch": 8.028901734104046, "grad_norm": 0.06271135807037354, "learning_rate": 8.647332303841126e-05, "loss": 0.0033, "step": 13890 }, { "epoch": 8.034682080924856, "grad_norm": 0.045987293124198914, "learning_rate": 8.645069504730689e-05, "loss": 0.0026, "step": 13900 }, { "epoch": 8.040462427745664, "grad_norm": 0.05192944034934044, "learning_rate": 8.64280511114301e-05, "loss": 0.0035, "step": 13910 }, { "epoch": 8.046242774566474, "grad_norm": 0.056140366941690445, "learning_rate": 8.640539124068617e-05, "loss": 0.0025, "step": 13920 }, { "epoch": 8.052023121387283, "grad_norm": 0.08405833691358566, "learning_rate": 8.638271544498727e-05, "loss": 0.0028, "step": 13930 }, { "epoch": 8.057803468208093, "grad_norm": 0.0532781258225441, "learning_rate": 8.636002373425257e-05, "loss": 0.0022, "step": 13940 }, { "epoch": 8.063583815028903, "grad_norm": 0.06937815248966217, "learning_rate": 8.633731611840817e-05, "loss": 0.0028, "step": 13950 }, { "epoch": 8.06936416184971, "grad_norm": 0.05528760328888893, "learning_rate": 8.631459260738717e-05, "loss": 0.0034, "step": 13960 }, { "epoch": 8.07514450867052, "grad_norm": 0.0920768454670906, "learning_rate": 8.62918532111296e-05, "loss": 0.0032, "step": 13970 }, { "epoch": 8.080924855491329, "grad_norm": 0.07862822711467743, "learning_rate": 8.626909793958248e-05, "loss": 0.0026, "step": 13980 }, { "epoch": 8.086705202312139, "grad_norm": 0.08769793063402176, "learning_rate": 8.624632680269969e-05, "loss": 0.0036, "step": 13990 }, { "epoch": 8.092485549132949, "grad_norm": 0.044517453759908676, "learning_rate": 8.622353981044212e-05, "loss": 0.0029, "step": 14000 }, { "epoch": 8.098265895953757, "grad_norm": 0.05424928292632103, "learning_rate": 8.620073697277757e-05, "loss": 0.003, "step": 14010 }, { "epoch": 8.104046242774567, "grad_norm": 0.06627245992422104, "learning_rate": 8.617791829968079e-05, "loss": 0.006, "step": 14020 }, { "epoch": 8.109826589595375, "grad_norm": 0.05628069117665291, "learning_rate": 8.615508380113344e-05, "loss": 0.0033, "step": 14030 }, { "epoch": 8.115606936416185, "grad_norm": 0.0704016163945198, "learning_rate": 8.613223348712408e-05, "loss": 0.0034, "step": 14040 }, { "epoch": 8.121387283236995, "grad_norm": 0.07526744157075882, "learning_rate": 8.610936736764824e-05, "loss": 0.0029, "step": 14050 }, { "epoch": 8.127167630057803, "grad_norm": 0.06005479395389557, "learning_rate": 8.608648545270833e-05, "loss": 0.0024, "step": 14060 }, { "epoch": 8.132947976878613, "grad_norm": 0.08388201147317886, "learning_rate": 8.606358775231366e-05, "loss": 0.0028, "step": 14070 }, { "epoch": 8.138728323699421, "grad_norm": 0.06882532685995102, "learning_rate": 8.60406742764805e-05, "loss": 0.0028, "step": 14080 }, { "epoch": 8.144508670520231, "grad_norm": 0.04375378414988518, "learning_rate": 8.601774503523195e-05, "loss": 0.0028, "step": 14090 }, { "epoch": 8.15028901734104, "grad_norm": 0.04277557507157326, "learning_rate": 8.599480003859805e-05, "loss": 0.0024, "step": 14100 }, { "epoch": 8.15606936416185, "grad_norm": 0.07634586840867996, "learning_rate": 8.597183929661573e-05, "loss": 0.0023, "step": 14110 }, { "epoch": 8.16184971098266, "grad_norm": 0.13055641949176788, "learning_rate": 8.594886281932879e-05, "loss": 0.0039, "step": 14120 }, { "epoch": 8.167630057803468, "grad_norm": 0.08069360256195068, "learning_rate": 8.59258706167879e-05, "loss": 0.0031, "step": 14130 }, { "epoch": 8.173410404624278, "grad_norm": 0.07761472463607788, "learning_rate": 8.590286269905068e-05, "loss": 0.0026, "step": 14140 }, { "epoch": 8.179190751445086, "grad_norm": 0.0518157035112381, "learning_rate": 8.587983907618154e-05, "loss": 0.0032, "step": 14150 }, { "epoch": 8.184971098265896, "grad_norm": 0.04370218887925148, "learning_rate": 8.585679975825178e-05, "loss": 0.0025, "step": 14160 }, { "epoch": 8.190751445086706, "grad_norm": 0.05114853009581566, "learning_rate": 8.583374475533962e-05, "loss": 0.0033, "step": 14170 }, { "epoch": 8.196531791907514, "grad_norm": 0.07591544091701508, "learning_rate": 8.581067407753009e-05, "loss": 0.0029, "step": 14180 }, { "epoch": 8.202312138728324, "grad_norm": 0.058348849415779114, "learning_rate": 8.578758773491507e-05, "loss": 0.0025, "step": 14190 }, { "epoch": 8.208092485549132, "grad_norm": 0.07974713295698166, "learning_rate": 8.576448573759332e-05, "loss": 0.0028, "step": 14200 }, { "epoch": 8.213872832369942, "grad_norm": 0.08002932369709015, "learning_rate": 8.574136809567044e-05, "loss": 0.0036, "step": 14210 }, { "epoch": 8.219653179190752, "grad_norm": 0.046907052397727966, "learning_rate": 8.57182348192589e-05, "loss": 0.003, "step": 14220 }, { "epoch": 8.22543352601156, "grad_norm": 0.07600396871566772, "learning_rate": 8.569508591847792e-05, "loss": 0.0033, "step": 14230 }, { "epoch": 8.23121387283237, "grad_norm": 0.07189874351024628, "learning_rate": 8.567192140345367e-05, "loss": 0.0045, "step": 14240 }, { "epoch": 8.236994219653178, "grad_norm": 0.051627375185489655, "learning_rate": 8.564874128431906e-05, "loss": 0.0025, "step": 14250 }, { "epoch": 8.242774566473988, "grad_norm": 0.056345611810684204, "learning_rate": 8.562554557121389e-05, "loss": 0.0027, "step": 14260 }, { "epoch": 8.248554913294798, "grad_norm": 0.049111682921648026, "learning_rate": 8.560233427428475e-05, "loss": 0.0027, "step": 14270 }, { "epoch": 8.254335260115607, "grad_norm": 0.0892246887087822, "learning_rate": 8.557910740368503e-05, "loss": 0.0049, "step": 14280 }, { "epoch": 8.260115606936417, "grad_norm": 0.055140819400548935, "learning_rate": 8.555586496957495e-05, "loss": 0.0023, "step": 14290 }, { "epoch": 8.265895953757225, "grad_norm": 0.07861759513616562, "learning_rate": 8.553260698212155e-05, "loss": 0.0029, "step": 14300 }, { "epoch": 8.271676300578035, "grad_norm": 0.059682153165340424, "learning_rate": 8.550933345149868e-05, "loss": 0.0031, "step": 14310 }, { "epoch": 8.277456647398845, "grad_norm": 0.06093475595116615, "learning_rate": 8.548604438788696e-05, "loss": 0.003, "step": 14320 }, { "epoch": 8.283236994219653, "grad_norm": 0.087743379175663, "learning_rate": 8.546273980147383e-05, "loss": 0.0031, "step": 14330 }, { "epoch": 8.289017341040463, "grad_norm": 0.09008658677339554, "learning_rate": 8.543941970245348e-05, "loss": 0.0026, "step": 14340 }, { "epoch": 8.294797687861271, "grad_norm": 0.0581711046397686, "learning_rate": 8.541608410102693e-05, "loss": 0.0031, "step": 14350 }, { "epoch": 8.300578034682081, "grad_norm": 0.05531102418899536, "learning_rate": 8.539273300740195e-05, "loss": 0.003, "step": 14360 }, { "epoch": 8.306358381502891, "grad_norm": 0.049726564437150955, "learning_rate": 8.536936643179313e-05, "loss": 0.0028, "step": 14370 }, { "epoch": 8.3121387283237, "grad_norm": 0.0582098588347435, "learning_rate": 8.534598438442179e-05, "loss": 0.0024, "step": 14380 }, { "epoch": 8.31791907514451, "grad_norm": 0.07375791668891907, "learning_rate": 8.532258687551603e-05, "loss": 0.005, "step": 14390 }, { "epoch": 8.323699421965317, "grad_norm": 0.0793997198343277, "learning_rate": 8.529917391531071e-05, "loss": 0.0031, "step": 14400 }, { "epoch": 8.329479768786127, "grad_norm": 0.0530426912009716, "learning_rate": 8.527574551404747e-05, "loss": 0.0029, "step": 14410 }, { "epoch": 8.335260115606937, "grad_norm": 0.09852785617113113, "learning_rate": 8.525230168197468e-05, "loss": 0.0038, "step": 14420 }, { "epoch": 8.341040462427745, "grad_norm": 0.0513310432434082, "learning_rate": 8.522884242934745e-05, "loss": 0.0034, "step": 14430 }, { "epoch": 8.346820809248555, "grad_norm": 0.06458441913127899, "learning_rate": 8.520536776642768e-05, "loss": 0.003, "step": 14440 }, { "epoch": 8.352601156069364, "grad_norm": 0.05780723690986633, "learning_rate": 8.5181877703484e-05, "loss": 0.0031, "step": 14450 }, { "epoch": 8.358381502890174, "grad_norm": 0.0697089433670044, "learning_rate": 8.51583722507917e-05, "loss": 0.0033, "step": 14460 }, { "epoch": 8.364161849710982, "grad_norm": 0.049896955490112305, "learning_rate": 8.513485141863293e-05, "loss": 0.0022, "step": 14470 }, { "epoch": 8.369942196531792, "grad_norm": 0.05260182172060013, "learning_rate": 8.511131521729647e-05, "loss": 0.0031, "step": 14480 }, { "epoch": 8.375722543352602, "grad_norm": 0.06051109731197357, "learning_rate": 8.508776365707787e-05, "loss": 0.003, "step": 14490 }, { "epoch": 8.38150289017341, "grad_norm": 0.07035737484693527, "learning_rate": 8.506419674827934e-05, "loss": 0.0027, "step": 14500 }, { "epoch": 8.38728323699422, "grad_norm": 0.08471290022134781, "learning_rate": 8.50406145012099e-05, "loss": 0.0031, "step": 14510 }, { "epoch": 8.393063583815028, "grad_norm": 0.08894751965999603, "learning_rate": 8.501701692618519e-05, "loss": 0.0027, "step": 14520 }, { "epoch": 8.398843930635838, "grad_norm": 0.06659360229969025, "learning_rate": 8.499340403352761e-05, "loss": 0.0029, "step": 14530 }, { "epoch": 8.404624277456648, "grad_norm": 0.0539698526263237, "learning_rate": 8.496977583356623e-05, "loss": 0.0026, "step": 14540 }, { "epoch": 8.410404624277456, "grad_norm": 0.06316234171390533, "learning_rate": 8.494613233663684e-05, "loss": 0.0025, "step": 14550 }, { "epoch": 8.416184971098266, "grad_norm": 0.052839938551187515, "learning_rate": 8.492247355308189e-05, "loss": 0.0022, "step": 14560 }, { "epoch": 8.421965317919074, "grad_norm": 0.1064499169588089, "learning_rate": 8.489879949325056e-05, "loss": 0.0037, "step": 14570 }, { "epoch": 8.427745664739884, "grad_norm": 0.07197859138250351, "learning_rate": 8.487511016749868e-05, "loss": 0.0025, "step": 14580 }, { "epoch": 8.433526011560694, "grad_norm": 0.09276767820119858, "learning_rate": 8.485140558618874e-05, "loss": 0.003, "step": 14590 }, { "epoch": 8.439306358381502, "grad_norm": 0.06622461974620819, "learning_rate": 8.482768575968995e-05, "loss": 0.0027, "step": 14600 }, { "epoch": 8.445086705202312, "grad_norm": 0.04703529179096222, "learning_rate": 8.480395069837818e-05, "loss": 0.0026, "step": 14610 }, { "epoch": 8.45086705202312, "grad_norm": 0.0429091602563858, "learning_rate": 8.478020041263595e-05, "loss": 0.0025, "step": 14620 }, { "epoch": 8.45664739884393, "grad_norm": 0.04643228277564049, "learning_rate": 8.475643491285242e-05, "loss": 0.0025, "step": 14630 }, { "epoch": 8.46242774566474, "grad_norm": 0.06544038653373718, "learning_rate": 8.473265420942345e-05, "loss": 0.003, "step": 14640 }, { "epoch": 8.468208092485549, "grad_norm": 0.06004485487937927, "learning_rate": 8.470885831275151e-05, "loss": 0.0035, "step": 14650 }, { "epoch": 8.473988439306359, "grad_norm": 0.11065249145030975, "learning_rate": 8.468504723324574e-05, "loss": 0.0039, "step": 14660 }, { "epoch": 8.479768786127167, "grad_norm": 0.09323501586914062, "learning_rate": 8.466122098132193e-05, "loss": 0.0034, "step": 14670 }, { "epoch": 8.485549132947977, "grad_norm": 0.08076255023479462, "learning_rate": 8.463737956740245e-05, "loss": 0.0027, "step": 14680 }, { "epoch": 8.491329479768787, "grad_norm": 0.0990118682384491, "learning_rate": 8.461352300191639e-05, "loss": 0.0029, "step": 14690 }, { "epoch": 8.497109826589595, "grad_norm": 0.07514192163944244, "learning_rate": 8.45896512952994e-05, "loss": 0.0037, "step": 14700 }, { "epoch": 8.502890173410405, "grad_norm": 0.08111371845006943, "learning_rate": 8.456576445799377e-05, "loss": 0.0027, "step": 14710 }, { "epoch": 8.508670520231213, "grad_norm": 0.07220485061407089, "learning_rate": 8.454186250044844e-05, "loss": 0.0027, "step": 14720 }, { "epoch": 8.514450867052023, "grad_norm": 0.06481906026601791, "learning_rate": 8.451794543311892e-05, "loss": 0.0032, "step": 14730 }, { "epoch": 8.520231213872833, "grad_norm": 0.06372509896755219, "learning_rate": 8.449401326646736e-05, "loss": 0.0025, "step": 14740 }, { "epoch": 8.526011560693641, "grad_norm": 0.04399619251489639, "learning_rate": 8.447006601096248e-05, "loss": 0.0024, "step": 14750 }, { "epoch": 8.531791907514451, "grad_norm": 0.0819878876209259, "learning_rate": 8.444610367707964e-05, "loss": 0.0034, "step": 14760 }, { "epoch": 8.53757225433526, "grad_norm": 0.05718172341585159, "learning_rate": 8.442212627530078e-05, "loss": 0.003, "step": 14770 }, { "epoch": 8.54335260115607, "grad_norm": 0.08189176768064499, "learning_rate": 8.439813381611441e-05, "loss": 0.0032, "step": 14780 }, { "epoch": 8.54913294797688, "grad_norm": 0.049891822040081024, "learning_rate": 8.437412631001567e-05, "loss": 0.0023, "step": 14790 }, { "epoch": 8.554913294797688, "grad_norm": 0.06955216079950333, "learning_rate": 8.435010376750626e-05, "loss": 0.003, "step": 14800 }, { "epoch": 8.560693641618498, "grad_norm": 0.08844389766454697, "learning_rate": 8.432606619909442e-05, "loss": 0.004, "step": 14810 }, { "epoch": 8.566473988439306, "grad_norm": 0.10105694085359573, "learning_rate": 8.430201361529506e-05, "loss": 0.0027, "step": 14820 }, { "epoch": 8.572254335260116, "grad_norm": 0.11816252022981644, "learning_rate": 8.427794602662954e-05, "loss": 0.0028, "step": 14830 }, { "epoch": 8.578034682080926, "grad_norm": 0.09827663004398346, "learning_rate": 8.425386344362586e-05, "loss": 0.003, "step": 14840 }, { "epoch": 8.583815028901734, "grad_norm": 0.06215595826506615, "learning_rate": 8.422976587681859e-05, "loss": 0.0049, "step": 14850 }, { "epoch": 8.589595375722544, "grad_norm": 0.08483150601387024, "learning_rate": 8.42056533367488e-05, "loss": 0.0063, "step": 14860 }, { "epoch": 8.595375722543352, "grad_norm": 0.0631008893251419, "learning_rate": 8.41815258339641e-05, "loss": 0.0038, "step": 14870 }, { "epoch": 8.601156069364162, "grad_norm": 0.11883585900068283, "learning_rate": 8.415738337901874e-05, "loss": 0.0049, "step": 14880 }, { "epoch": 8.606936416184972, "grad_norm": 0.23546771705150604, "learning_rate": 8.413322598247342e-05, "loss": 0.0043, "step": 14890 }, { "epoch": 8.61271676300578, "grad_norm": 0.2148403376340866, "learning_rate": 8.41090536548954e-05, "loss": 0.0045, "step": 14900 }, { "epoch": 8.61849710982659, "grad_norm": 0.10700441896915436, "learning_rate": 8.408486640685849e-05, "loss": 0.0033, "step": 14910 }, { "epoch": 8.624277456647398, "grad_norm": 0.12443286180496216, "learning_rate": 8.4060664248943e-05, "loss": 0.0058, "step": 14920 }, { "epoch": 8.630057803468208, "grad_norm": 0.09611442685127258, "learning_rate": 8.40364471917358e-05, "loss": 0.0054, "step": 14930 }, { "epoch": 8.635838150289018, "grad_norm": 0.08155146986246109, "learning_rate": 8.401221524583024e-05, "loss": 0.0045, "step": 14940 }, { "epoch": 8.641618497109826, "grad_norm": 0.07205265760421753, "learning_rate": 8.398796842182619e-05, "loss": 0.0036, "step": 14950 }, { "epoch": 8.647398843930636, "grad_norm": 0.04646911844611168, "learning_rate": 8.396370673033006e-05, "loss": 0.0029, "step": 14960 }, { "epoch": 8.653179190751445, "grad_norm": 0.06000467762351036, "learning_rate": 8.39394301819547e-05, "loss": 0.0027, "step": 14970 }, { "epoch": 8.658959537572255, "grad_norm": 0.07406723499298096, "learning_rate": 8.391513878731949e-05, "loss": 0.0039, "step": 14980 }, { "epoch": 8.664739884393063, "grad_norm": 0.07790686190128326, "learning_rate": 8.389083255705037e-05, "loss": 0.0037, "step": 14990 }, { "epoch": 8.670520231213873, "grad_norm": 0.0638248473405838, "learning_rate": 8.386651150177968e-05, "loss": 0.0036, "step": 15000 } ], "logging_steps": 10, "max_steps": 50000, "num_input_tokens_seen": 0, "num_train_epochs": 29, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }