diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,237602 @@ +{ + "best_metric": 0.5747780203819275, + "best_model_checkpoint": "LanguageTutor_v1/core/models/models/modernbert_output/checkpoint-339354", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 339354, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 8.840326031224031e-05, + "grad_norm": 12.360391616821289, + "learning_rate": 4.999852661232813e-05, + "loss": 1.8933, + "step": 10 + }, + { + "epoch": 0.00017680652062448062, + "grad_norm": 8.394221305847168, + "learning_rate": 4.999705322465626e-05, + "loss": 1.5802, + "step": 20 + }, + { + "epoch": 0.00026520978093672097, + "grad_norm": 5.720775604248047, + "learning_rate": 4.9995579836984394e-05, + "loss": 1.564, + "step": 30 + }, + { + "epoch": 0.00035361304124896124, + "grad_norm": 4.403193473815918, + "learning_rate": 4.999410644931252e-05, + "loss": 1.5793, + "step": 40 + }, + { + "epoch": 0.00044201630156120157, + "grad_norm": 7.755221843719482, + "learning_rate": 4.999263306164065e-05, + "loss": 1.6448, + "step": 50 + }, + { + "epoch": 0.0005304195618734419, + "grad_norm": 5.395235538482666, + "learning_rate": 4.999115967396878e-05, + "loss": 1.5412, + "step": 60 + }, + { + "epoch": 0.0006188228221856822, + "grad_norm": 6.468478679656982, + "learning_rate": 4.998968628629691e-05, + "loss": 1.5383, + "step": 70 + }, + { + "epoch": 0.0007072260824979225, + "grad_norm": 8.549371719360352, + "learning_rate": 4.9988212898625036e-05, + "loss": 1.5251, + "step": 80 + }, + { + "epoch": 0.0007956293428101628, + "grad_norm": 5.457655429840088, + "learning_rate": 4.998673951095317e-05, + "loss": 1.4951, + "step": 90 + }, + { + "epoch": 0.0008840326031224031, + "grad_norm": 8.569847106933594, + "learning_rate": 4.998526612328129e-05, + "loss": 1.4208, + "step": 100 + }, + { + "epoch": 0.0009724358634346435, + "grad_norm": 4.5735297203063965, + "learning_rate": 4.998379273560943e-05, + "loss": 1.4895, + "step": 110 + }, + { + "epoch": 0.0010608391237468839, + "grad_norm": 4.128511905670166, + "learning_rate": 4.9982319347937556e-05, + "loss": 1.4945, + "step": 120 + }, + { + "epoch": 0.001149242384059124, + "grad_norm": 3.3097379207611084, + "learning_rate": 4.9980845960265685e-05, + "loss": 1.606, + "step": 130 + }, + { + "epoch": 0.0012376456443713643, + "grad_norm": 2.2360458374023438, + "learning_rate": 4.997937257259381e-05, + "loss": 1.5863, + "step": 140 + }, + { + "epoch": 0.0013260489046836047, + "grad_norm": 5.046316623687744, + "learning_rate": 4.997789918492195e-05, + "loss": 1.5055, + "step": 150 + }, + { + "epoch": 0.001414452164995845, + "grad_norm": 4.0020036697387695, + "learning_rate": 4.997642579725007e-05, + "loss": 1.4883, + "step": 160 + }, + { + "epoch": 0.0015028554253080854, + "grad_norm": 2.5163402557373047, + "learning_rate": 4.9974952409578205e-05, + "loss": 1.3965, + "step": 170 + }, + { + "epoch": 0.0015912586856203256, + "grad_norm": 5.472707748413086, + "learning_rate": 4.9973479021906326e-05, + "loss": 1.4061, + "step": 180 + }, + { + "epoch": 0.001679661945932566, + "grad_norm": 5.013034820556641, + "learning_rate": 4.997200563423446e-05, + "loss": 1.518, + "step": 190 + }, + { + "epoch": 0.0017680652062448063, + "grad_norm": 3.9262633323669434, + "learning_rate": 4.997053224656259e-05, + "loss": 1.435, + "step": 200 + }, + { + "epoch": 0.0018564684665570467, + "grad_norm": 4.9150614738464355, + "learning_rate": 4.996905885889072e-05, + "loss": 1.4966, + "step": 210 + }, + { + "epoch": 0.001944871726869287, + "grad_norm": 4.164141654968262, + "learning_rate": 4.9967585471218847e-05, + "loss": 1.4522, + "step": 220 + }, + { + "epoch": 0.0020332749871815273, + "grad_norm": 4.787590503692627, + "learning_rate": 4.996611208354698e-05, + "loss": 1.4752, + "step": 230 + }, + { + "epoch": 0.0021216782474937678, + "grad_norm": 8.909902572631836, + "learning_rate": 4.99646386958751e-05, + "loss": 1.418, + "step": 240 + }, + { + "epoch": 0.0022100815078060078, + "grad_norm": 8.003477096557617, + "learning_rate": 4.996316530820324e-05, + "loss": 1.3951, + "step": 250 + }, + { + "epoch": 0.002298484768118248, + "grad_norm": 4.158975601196289, + "learning_rate": 4.996169192053137e-05, + "loss": 1.4096, + "step": 260 + }, + { + "epoch": 0.0023868880284304886, + "grad_norm": 3.142237663269043, + "learning_rate": 4.9960218532859495e-05, + "loss": 1.3951, + "step": 270 + }, + { + "epoch": 0.0024752912887427286, + "grad_norm": 3.579383611679077, + "learning_rate": 4.9958745145187623e-05, + "loss": 1.3693, + "step": 280 + }, + { + "epoch": 0.002563694549054969, + "grad_norm": 12.148061752319336, + "learning_rate": 4.995727175751576e-05, + "loss": 1.4162, + "step": 290 + }, + { + "epoch": 0.0026520978093672095, + "grad_norm": 2.9150171279907227, + "learning_rate": 4.995579836984388e-05, + "loss": 1.4773, + "step": 300 + }, + { + "epoch": 0.00274050106967945, + "grad_norm": 3.3658225536346436, + "learning_rate": 4.9954324982172015e-05, + "loss": 1.4936, + "step": 310 + }, + { + "epoch": 0.00282890432999169, + "grad_norm": 5.113858699798584, + "learning_rate": 4.995285159450014e-05, + "loss": 1.3727, + "step": 320 + }, + { + "epoch": 0.0029173075903039304, + "grad_norm": 4.562338829040527, + "learning_rate": 4.995137820682827e-05, + "loss": 1.4645, + "step": 330 + }, + { + "epoch": 0.003005710850616171, + "grad_norm": 9.740865707397461, + "learning_rate": 4.99499048191564e-05, + "loss": 1.4879, + "step": 340 + }, + { + "epoch": 0.0030941141109284112, + "grad_norm": 4.165475368499756, + "learning_rate": 4.994843143148453e-05, + "loss": 1.4096, + "step": 350 + }, + { + "epoch": 0.0031825173712406512, + "grad_norm": 7.811041831970215, + "learning_rate": 4.994695804381266e-05, + "loss": 1.3699, + "step": 360 + }, + { + "epoch": 0.0032709206315528917, + "grad_norm": 4.133144855499268, + "learning_rate": 4.994548465614079e-05, + "loss": 1.3714, + "step": 370 + }, + { + "epoch": 0.003359323891865132, + "grad_norm": 2.9689953327178955, + "learning_rate": 4.9944011268468914e-05, + "loss": 1.3125, + "step": 380 + }, + { + "epoch": 0.003447727152177372, + "grad_norm": 3.44071102142334, + "learning_rate": 4.994253788079705e-05, + "loss": 1.4217, + "step": 390 + }, + { + "epoch": 0.0035361304124896125, + "grad_norm": 5.091910362243652, + "learning_rate": 4.994106449312518e-05, + "loss": 1.3862, + "step": 400 + }, + { + "epoch": 0.003624533672801853, + "grad_norm": 5.2655930519104, + "learning_rate": 4.9939591105453306e-05, + "loss": 1.4303, + "step": 410 + }, + { + "epoch": 0.0037129369331140934, + "grad_norm": 5.5515666007995605, + "learning_rate": 4.9938117717781434e-05, + "loss": 1.3853, + "step": 420 + }, + { + "epoch": 0.0038013401934263334, + "grad_norm": 5.711978435516357, + "learning_rate": 4.993664433010956e-05, + "loss": 1.3823, + "step": 430 + }, + { + "epoch": 0.003889743453738574, + "grad_norm": 3.29811692237854, + "learning_rate": 4.993517094243769e-05, + "loss": 1.4029, + "step": 440 + }, + { + "epoch": 0.003978146714050814, + "grad_norm": 4.7858405113220215, + "learning_rate": 4.9933697554765826e-05, + "loss": 1.345, + "step": 450 + }, + { + "epoch": 0.004066549974363055, + "grad_norm": 3.114687919616699, + "learning_rate": 4.993222416709395e-05, + "loss": 1.3509, + "step": 460 + }, + { + "epoch": 0.004154953234675295, + "grad_norm": 4.7597808837890625, + "learning_rate": 4.993075077942208e-05, + "loss": 1.426, + "step": 470 + }, + { + "epoch": 0.0042433564949875355, + "grad_norm": 4.4710469245910645, + "learning_rate": 4.992927739175021e-05, + "loss": 1.3886, + "step": 480 + }, + { + "epoch": 0.004331759755299775, + "grad_norm": 7.869180679321289, + "learning_rate": 4.992780400407834e-05, + "loss": 1.516, + "step": 490 + }, + { + "epoch": 0.0044201630156120155, + "grad_norm": 4.000175476074219, + "learning_rate": 4.992633061640647e-05, + "loss": 1.4539, + "step": 500 + }, + { + "epoch": 0.004508566275924256, + "grad_norm": 3.8426191806793213, + "learning_rate": 4.99248572287346e-05, + "loss": 1.4135, + "step": 510 + }, + { + "epoch": 0.004596969536236496, + "grad_norm": 4.768189430236816, + "learning_rate": 4.9923383841062724e-05, + "loss": 1.304, + "step": 520 + }, + { + "epoch": 0.004685372796548737, + "grad_norm": 6.300947666168213, + "learning_rate": 4.992191045339086e-05, + "loss": 1.3956, + "step": 530 + }, + { + "epoch": 0.004773776056860977, + "grad_norm": 5.551243782043457, + "learning_rate": 4.992043706571898e-05, + "loss": 1.3915, + "step": 540 + }, + { + "epoch": 0.004862179317173218, + "grad_norm": 3.9047024250030518, + "learning_rate": 4.9918963678047116e-05, + "loss": 1.3947, + "step": 550 + }, + { + "epoch": 0.004950582577485457, + "grad_norm": 6.883234977722168, + "learning_rate": 4.9917490290375244e-05, + "loss": 1.3875, + "step": 560 + }, + { + "epoch": 0.005038985837797698, + "grad_norm": 5.5822319984436035, + "learning_rate": 4.991601690270337e-05, + "loss": 1.3769, + "step": 570 + }, + { + "epoch": 0.005127389098109938, + "grad_norm": 4.9963297843933105, + "learning_rate": 4.99145435150315e-05, + "loss": 1.3642, + "step": 580 + }, + { + "epoch": 0.005215792358422179, + "grad_norm": 6.4286112785339355, + "learning_rate": 4.9913070127359636e-05, + "loss": 1.4052, + "step": 590 + }, + { + "epoch": 0.005304195618734419, + "grad_norm": 3.423480272293091, + "learning_rate": 4.991159673968776e-05, + "loss": 1.3347, + "step": 600 + }, + { + "epoch": 0.005392598879046659, + "grad_norm": 4.48936128616333, + "learning_rate": 4.991012335201589e-05, + "loss": 1.3606, + "step": 610 + }, + { + "epoch": 0.0054810021393589, + "grad_norm": 3.273667097091675, + "learning_rate": 4.990864996434402e-05, + "loss": 1.4611, + "step": 620 + }, + { + "epoch": 0.00556940539967114, + "grad_norm": 3.781583547592163, + "learning_rate": 4.990717657667215e-05, + "loss": 1.2665, + "step": 630 + }, + { + "epoch": 0.00565780865998338, + "grad_norm": 4.62367582321167, + "learning_rate": 4.990570318900028e-05, + "loss": 1.3098, + "step": 640 + }, + { + "epoch": 0.00574621192029562, + "grad_norm": 3.1934027671813965, + "learning_rate": 4.9904229801328406e-05, + "loss": 1.328, + "step": 650 + }, + { + "epoch": 0.005834615180607861, + "grad_norm": 5.021986484527588, + "learning_rate": 4.9902756413656535e-05, + "loss": 1.2974, + "step": 660 + }, + { + "epoch": 0.005923018440920101, + "grad_norm": 3.9169671535491943, + "learning_rate": 4.990128302598467e-05, + "loss": 1.3832, + "step": 670 + }, + { + "epoch": 0.006011421701232342, + "grad_norm": 5.65451192855835, + "learning_rate": 4.989980963831279e-05, + "loss": 1.3609, + "step": 680 + }, + { + "epoch": 0.006099824961544582, + "grad_norm": 3.694772243499756, + "learning_rate": 4.9898336250640927e-05, + "loss": 1.366, + "step": 690 + }, + { + "epoch": 0.0061882282218568225, + "grad_norm": 4.979604721069336, + "learning_rate": 4.9896862862969055e-05, + "loss": 1.2916, + "step": 700 + }, + { + "epoch": 0.006276631482169062, + "grad_norm": 3.898308515548706, + "learning_rate": 4.989538947529718e-05, + "loss": 1.342, + "step": 710 + }, + { + "epoch": 0.0063650347424813025, + "grad_norm": 7.851255416870117, + "learning_rate": 4.989391608762531e-05, + "loss": 1.2884, + "step": 720 + }, + { + "epoch": 0.006453438002793543, + "grad_norm": 5.869283199310303, + "learning_rate": 4.989244269995345e-05, + "loss": 1.2327, + "step": 730 + }, + { + "epoch": 0.006541841263105783, + "grad_norm": 3.6053364276885986, + "learning_rate": 4.989096931228157e-05, + "loss": 1.3692, + "step": 740 + }, + { + "epoch": 0.006630244523418024, + "grad_norm": 3.283907413482666, + "learning_rate": 4.9889495924609703e-05, + "loss": 1.2745, + "step": 750 + }, + { + "epoch": 0.006718647783730264, + "grad_norm": 4.512964248657227, + "learning_rate": 4.988802253693783e-05, + "loss": 1.3352, + "step": 760 + }, + { + "epoch": 0.006807051044042505, + "grad_norm": 4.550668716430664, + "learning_rate": 4.988654914926596e-05, + "loss": 1.3048, + "step": 770 + }, + { + "epoch": 0.006895454304354744, + "grad_norm": 3.349033832550049, + "learning_rate": 4.988507576159409e-05, + "loss": 1.2523, + "step": 780 + }, + { + "epoch": 0.006983857564666985, + "grad_norm": 4.971866607666016, + "learning_rate": 4.988360237392222e-05, + "loss": 1.2693, + "step": 790 + }, + { + "epoch": 0.007072260824979225, + "grad_norm": 3.0348961353302, + "learning_rate": 4.9882128986250345e-05, + "loss": 1.3548, + "step": 800 + }, + { + "epoch": 0.0071606640852914655, + "grad_norm": 3.880688428878784, + "learning_rate": 4.988065559857848e-05, + "loss": 1.2948, + "step": 810 + }, + { + "epoch": 0.007249067345603706, + "grad_norm": 3.494100570678711, + "learning_rate": 4.987918221090661e-05, + "loss": 1.2544, + "step": 820 + }, + { + "epoch": 0.007337470605915946, + "grad_norm": 6.909191608428955, + "learning_rate": 4.987770882323474e-05, + "loss": 1.3008, + "step": 830 + }, + { + "epoch": 0.007425873866228187, + "grad_norm": 4.434190273284912, + "learning_rate": 4.9876235435562865e-05, + "loss": 1.2514, + "step": 840 + }, + { + "epoch": 0.007514277126540427, + "grad_norm": 3.0245659351348877, + "learning_rate": 4.9874762047890994e-05, + "loss": 1.4152, + "step": 850 + }, + { + "epoch": 0.007602680386852667, + "grad_norm": 3.9230799674987793, + "learning_rate": 4.987328866021912e-05, + "loss": 1.4328, + "step": 860 + }, + { + "epoch": 0.007691083647164907, + "grad_norm": 3.984497547149658, + "learning_rate": 4.987181527254726e-05, + "loss": 1.257, + "step": 870 + }, + { + "epoch": 0.007779486907477148, + "grad_norm": 7.441206932067871, + "learning_rate": 4.9870341884875386e-05, + "loss": 1.304, + "step": 880 + }, + { + "epoch": 0.007867890167789389, + "grad_norm": 5.112447738647461, + "learning_rate": 4.9868868497203514e-05, + "loss": 1.3257, + "step": 890 + }, + { + "epoch": 0.007956293428101628, + "grad_norm": 2.8872740268707275, + "learning_rate": 4.986739510953164e-05, + "loss": 1.3393, + "step": 900 + }, + { + "epoch": 0.008044696688413868, + "grad_norm": 3.278285264968872, + "learning_rate": 4.986592172185977e-05, + "loss": 1.3374, + "step": 910 + }, + { + "epoch": 0.00813309994872611, + "grad_norm": 6.72818660736084, + "learning_rate": 4.98644483341879e-05, + "loss": 1.3533, + "step": 920 + }, + { + "epoch": 0.008221503209038349, + "grad_norm": 3.7416751384735107, + "learning_rate": 4.986297494651603e-05, + "loss": 1.2718, + "step": 930 + }, + { + "epoch": 0.00830990646935059, + "grad_norm": 5.336352825164795, + "learning_rate": 4.986150155884416e-05, + "loss": 1.276, + "step": 940 + }, + { + "epoch": 0.00839830972966283, + "grad_norm": 4.085439205169678, + "learning_rate": 4.986002817117229e-05, + "loss": 1.2924, + "step": 950 + }, + { + "epoch": 0.008486712989975071, + "grad_norm": 4.730822563171387, + "learning_rate": 4.985855478350042e-05, + "loss": 1.1914, + "step": 960 + }, + { + "epoch": 0.00857511625028731, + "grad_norm": 8.323603630065918, + "learning_rate": 4.985708139582855e-05, + "loss": 1.3709, + "step": 970 + }, + { + "epoch": 0.00866351951059955, + "grad_norm": 5.226520538330078, + "learning_rate": 4.9855608008156676e-05, + "loss": 1.3267, + "step": 980 + }, + { + "epoch": 0.008751922770911792, + "grad_norm": 6.736996650695801, + "learning_rate": 4.9854134620484804e-05, + "loss": 1.2823, + "step": 990 + }, + { + "epoch": 0.008840326031224031, + "grad_norm": 4.369340896606445, + "learning_rate": 4.985266123281294e-05, + "loss": 1.2985, + "step": 1000 + }, + { + "epoch": 0.008928729291536272, + "grad_norm": 4.70871639251709, + "learning_rate": 4.985118784514106e-05, + "loss": 1.3114, + "step": 1010 + }, + { + "epoch": 0.009017132551848512, + "grad_norm": 5.202262878417969, + "learning_rate": 4.9849714457469196e-05, + "loss": 1.3938, + "step": 1020 + }, + { + "epoch": 0.009105535812160753, + "grad_norm": 2.4306414127349854, + "learning_rate": 4.9848241069797324e-05, + "loss": 1.1892, + "step": 1030 + }, + { + "epoch": 0.009193939072472993, + "grad_norm": 8.203335762023926, + "learning_rate": 4.984676768212545e-05, + "loss": 1.1901, + "step": 1040 + }, + { + "epoch": 0.009282342332785232, + "grad_norm": 4.401038646697998, + "learning_rate": 4.984529429445358e-05, + "loss": 1.2973, + "step": 1050 + }, + { + "epoch": 0.009370745593097474, + "grad_norm": 4.717940330505371, + "learning_rate": 4.9843820906781716e-05, + "loss": 1.2784, + "step": 1060 + }, + { + "epoch": 0.009459148853409713, + "grad_norm": 2.6563339233398438, + "learning_rate": 4.984234751910984e-05, + "loss": 1.3232, + "step": 1070 + }, + { + "epoch": 0.009547552113721955, + "grad_norm": 5.412891864776611, + "learning_rate": 4.984087413143797e-05, + "loss": 1.2354, + "step": 1080 + }, + { + "epoch": 0.009635955374034194, + "grad_norm": 3.79567289352417, + "learning_rate": 4.98394007437661e-05, + "loss": 1.2998, + "step": 1090 + }, + { + "epoch": 0.009724358634346435, + "grad_norm": 6.965702533721924, + "learning_rate": 4.983792735609423e-05, + "loss": 1.1795, + "step": 1100 + }, + { + "epoch": 0.009812761894658675, + "grad_norm": 11.925300598144531, + "learning_rate": 4.983645396842236e-05, + "loss": 1.2163, + "step": 1110 + }, + { + "epoch": 0.009901165154970915, + "grad_norm": 4.589380264282227, + "learning_rate": 4.9834980580750486e-05, + "loss": 1.2232, + "step": 1120 + }, + { + "epoch": 0.009989568415283156, + "grad_norm": 4.49373722076416, + "learning_rate": 4.9833507193078615e-05, + "loss": 1.252, + "step": 1130 + }, + { + "epoch": 0.010077971675595395, + "grad_norm": 3.2139599323272705, + "learning_rate": 4.983203380540675e-05, + "loss": 1.2259, + "step": 1140 + }, + { + "epoch": 0.010166374935907637, + "grad_norm": 3.8811285495758057, + "learning_rate": 4.983056041773487e-05, + "loss": 1.2837, + "step": 1150 + }, + { + "epoch": 0.010254778196219876, + "grad_norm": 4.598730087280273, + "learning_rate": 4.982908703006301e-05, + "loss": 1.1905, + "step": 1160 + }, + { + "epoch": 0.010343181456532118, + "grad_norm": 5.53896951675415, + "learning_rate": 4.9827613642391135e-05, + "loss": 1.3271, + "step": 1170 + }, + { + "epoch": 0.010431584716844357, + "grad_norm": 3.8789303302764893, + "learning_rate": 4.982614025471926e-05, + "loss": 1.2986, + "step": 1180 + }, + { + "epoch": 0.010519987977156597, + "grad_norm": 5.903724193572998, + "learning_rate": 4.982466686704739e-05, + "loss": 1.2604, + "step": 1190 + }, + { + "epoch": 0.010608391237468838, + "grad_norm": 3.7968711853027344, + "learning_rate": 4.982319347937553e-05, + "loss": 1.3397, + "step": 1200 + }, + { + "epoch": 0.010696794497781078, + "grad_norm": 3.2264816761016846, + "learning_rate": 4.982172009170365e-05, + "loss": 1.2966, + "step": 1210 + }, + { + "epoch": 0.010785197758093319, + "grad_norm": 4.483380317687988, + "learning_rate": 4.9820246704031784e-05, + "loss": 1.2768, + "step": 1220 + }, + { + "epoch": 0.010873601018405558, + "grad_norm": 3.280275583267212, + "learning_rate": 4.981877331635991e-05, + "loss": 1.2943, + "step": 1230 + }, + { + "epoch": 0.0109620042787178, + "grad_norm": 3.700066566467285, + "learning_rate": 4.981729992868804e-05, + "loss": 1.2548, + "step": 1240 + }, + { + "epoch": 0.01105040753903004, + "grad_norm": 6.6137237548828125, + "learning_rate": 4.981582654101617e-05, + "loss": 1.2147, + "step": 1250 + }, + { + "epoch": 0.01113881079934228, + "grad_norm": 3.8108301162719727, + "learning_rate": 4.98143531533443e-05, + "loss": 1.2061, + "step": 1260 + }, + { + "epoch": 0.01122721405965452, + "grad_norm": 5.0468363761901855, + "learning_rate": 4.9812879765672425e-05, + "loss": 1.2851, + "step": 1270 + }, + { + "epoch": 0.01131561731996676, + "grad_norm": 4.338964939117432, + "learning_rate": 4.981140637800056e-05, + "loss": 1.2716, + "step": 1280 + }, + { + "epoch": 0.011404020580279001, + "grad_norm": 6.1364288330078125, + "learning_rate": 4.980993299032868e-05, + "loss": 1.1857, + "step": 1290 + }, + { + "epoch": 0.01149242384059124, + "grad_norm": 9.080232620239258, + "learning_rate": 4.980845960265682e-05, + "loss": 1.3553, + "step": 1300 + }, + { + "epoch": 0.011580827100903482, + "grad_norm": 3.361820697784424, + "learning_rate": 4.9806986214984946e-05, + "loss": 1.3031, + "step": 1310 + }, + { + "epoch": 0.011669230361215721, + "grad_norm": 4.053516864776611, + "learning_rate": 4.9805512827313074e-05, + "loss": 1.2737, + "step": 1320 + }, + { + "epoch": 0.011757633621527963, + "grad_norm": 4.365649700164795, + "learning_rate": 4.98040394396412e-05, + "loss": 1.2599, + "step": 1330 + }, + { + "epoch": 0.011846036881840202, + "grad_norm": 4.1593475341796875, + "learning_rate": 4.980256605196934e-05, + "loss": 1.3096, + "step": 1340 + }, + { + "epoch": 0.011934440142152442, + "grad_norm": 3.454594850540161, + "learning_rate": 4.980109266429746e-05, + "loss": 1.1883, + "step": 1350 + }, + { + "epoch": 0.012022843402464683, + "grad_norm": 5.860595703125, + "learning_rate": 4.9799619276625594e-05, + "loss": 1.1906, + "step": 1360 + }, + { + "epoch": 0.012111246662776923, + "grad_norm": 4.583675861358643, + "learning_rate": 4.9798145888953716e-05, + "loss": 1.2182, + "step": 1370 + }, + { + "epoch": 0.012199649923089164, + "grad_norm": 4.073757171630859, + "learning_rate": 4.979667250128185e-05, + "loss": 1.3099, + "step": 1380 + }, + { + "epoch": 0.012288053183401404, + "grad_norm": 6.656314373016357, + "learning_rate": 4.979519911360998e-05, + "loss": 1.1719, + "step": 1390 + }, + { + "epoch": 0.012376456443713645, + "grad_norm": 4.785161018371582, + "learning_rate": 4.979372572593811e-05, + "loss": 1.2248, + "step": 1400 + }, + { + "epoch": 0.012464859704025884, + "grad_norm": 4.953686237335205, + "learning_rate": 4.9792252338266236e-05, + "loss": 1.1863, + "step": 1410 + }, + { + "epoch": 0.012553262964338124, + "grad_norm": 6.9218268394470215, + "learning_rate": 4.979077895059437e-05, + "loss": 1.1563, + "step": 1420 + }, + { + "epoch": 0.012641666224650365, + "grad_norm": 6.062386989593506, + "learning_rate": 4.978930556292249e-05, + "loss": 1.2917, + "step": 1430 + }, + { + "epoch": 0.012730069484962605, + "grad_norm": 7.698704719543457, + "learning_rate": 4.978783217525063e-05, + "loss": 1.2457, + "step": 1440 + }, + { + "epoch": 0.012818472745274846, + "grad_norm": 6.8854169845581055, + "learning_rate": 4.9786358787578756e-05, + "loss": 1.255, + "step": 1450 + }, + { + "epoch": 0.012906876005587086, + "grad_norm": 12.989723205566406, + "learning_rate": 4.9784885399906884e-05, + "loss": 1.1496, + "step": 1460 + }, + { + "epoch": 0.012995279265899327, + "grad_norm": 6.384410858154297, + "learning_rate": 4.978341201223501e-05, + "loss": 1.2042, + "step": 1470 + }, + { + "epoch": 0.013083682526211567, + "grad_norm": 5.162613391876221, + "learning_rate": 4.978193862456314e-05, + "loss": 1.1764, + "step": 1480 + }, + { + "epoch": 0.013172085786523806, + "grad_norm": 7.121384620666504, + "learning_rate": 4.978046523689127e-05, + "loss": 1.2491, + "step": 1490 + }, + { + "epoch": 0.013260489046836047, + "grad_norm": 5.269081115722656, + "learning_rate": 4.9778991849219405e-05, + "loss": 1.1909, + "step": 1500 + }, + { + "epoch": 0.013348892307148287, + "grad_norm": 3.8427023887634277, + "learning_rate": 4.9777518461547526e-05, + "loss": 1.2535, + "step": 1510 + }, + { + "epoch": 0.013437295567460528, + "grad_norm": 4.315703392028809, + "learning_rate": 4.977604507387566e-05, + "loss": 1.286, + "step": 1520 + }, + { + "epoch": 0.013525698827772768, + "grad_norm": 6.8337578773498535, + "learning_rate": 4.977457168620379e-05, + "loss": 1.237, + "step": 1530 + }, + { + "epoch": 0.01361410208808501, + "grad_norm": 3.717587471008301, + "learning_rate": 4.977309829853192e-05, + "loss": 1.1511, + "step": 1540 + }, + { + "epoch": 0.013702505348397249, + "grad_norm": 5.48192024230957, + "learning_rate": 4.9771624910860046e-05, + "loss": 1.148, + "step": 1550 + }, + { + "epoch": 0.013790908608709488, + "grad_norm": 5.219634532928467, + "learning_rate": 4.977015152318818e-05, + "loss": 1.234, + "step": 1560 + }, + { + "epoch": 0.01387931186902173, + "grad_norm": 4.309090614318848, + "learning_rate": 4.97686781355163e-05, + "loss": 1.1488, + "step": 1570 + }, + { + "epoch": 0.01396771512933397, + "grad_norm": 6.279152870178223, + "learning_rate": 4.976720474784444e-05, + "loss": 1.1941, + "step": 1580 + }, + { + "epoch": 0.01405611838964621, + "grad_norm": 5.642574310302734, + "learning_rate": 4.9765731360172567e-05, + "loss": 1.3957, + "step": 1590 + }, + { + "epoch": 0.01414452164995845, + "grad_norm": 5.639019012451172, + "learning_rate": 4.9764257972500695e-05, + "loss": 1.251, + "step": 1600 + }, + { + "epoch": 0.014232924910270691, + "grad_norm": 3.8754444122314453, + "learning_rate": 4.976278458482882e-05, + "loss": 1.2151, + "step": 1610 + }, + { + "epoch": 0.014321328170582931, + "grad_norm": 3.1772994995117188, + "learning_rate": 4.976131119715695e-05, + "loss": 1.2177, + "step": 1620 + }, + { + "epoch": 0.014409731430895172, + "grad_norm": 6.411615371704102, + "learning_rate": 4.975983780948508e-05, + "loss": 1.1621, + "step": 1630 + }, + { + "epoch": 0.014498134691207412, + "grad_norm": 3.6244139671325684, + "learning_rate": 4.9758364421813215e-05, + "loss": 1.2433, + "step": 1640 + }, + { + "epoch": 0.014586537951519651, + "grad_norm": 3.861219882965088, + "learning_rate": 4.975689103414134e-05, + "loss": 1.1503, + "step": 1650 + }, + { + "epoch": 0.014674941211831893, + "grad_norm": 6.717658519744873, + "learning_rate": 4.975541764646947e-05, + "loss": 1.1981, + "step": 1660 + }, + { + "epoch": 0.014763344472144132, + "grad_norm": 4.457257270812988, + "learning_rate": 4.97539442587976e-05, + "loss": 1.173, + "step": 1670 + }, + { + "epoch": 0.014851747732456374, + "grad_norm": 4.979446887969971, + "learning_rate": 4.975247087112573e-05, + "loss": 1.1214, + "step": 1680 + }, + { + "epoch": 0.014940150992768613, + "grad_norm": 4.89369010925293, + "learning_rate": 4.975099748345386e-05, + "loss": 1.2174, + "step": 1690 + }, + { + "epoch": 0.015028554253080854, + "grad_norm": 5.477513790130615, + "learning_rate": 4.974952409578199e-05, + "loss": 1.1146, + "step": 1700 + }, + { + "epoch": 0.015116957513393094, + "grad_norm": 4.475773334503174, + "learning_rate": 4.9748050708110114e-05, + "loss": 1.1294, + "step": 1710 + }, + { + "epoch": 0.015205360773705334, + "grad_norm": 4.1478190422058105, + "learning_rate": 4.974657732043825e-05, + "loss": 1.2346, + "step": 1720 + }, + { + "epoch": 0.015293764034017575, + "grad_norm": 4.169029235839844, + "learning_rate": 4.974510393276638e-05, + "loss": 1.184, + "step": 1730 + }, + { + "epoch": 0.015382167294329814, + "grad_norm": 3.632573366165161, + "learning_rate": 4.9743630545094505e-05, + "loss": 1.281, + "step": 1740 + }, + { + "epoch": 0.015470570554642056, + "grad_norm": 3.9423365592956543, + "learning_rate": 4.9742157157422634e-05, + "loss": 1.0858, + "step": 1750 + }, + { + "epoch": 0.015558973814954295, + "grad_norm": 7.415809154510498, + "learning_rate": 4.974068376975076e-05, + "loss": 1.1911, + "step": 1760 + }, + { + "epoch": 0.015647377075266537, + "grad_norm": 6.262916564941406, + "learning_rate": 4.973921038207889e-05, + "loss": 1.1719, + "step": 1770 + }, + { + "epoch": 0.015735780335578778, + "grad_norm": 5.934061527252197, + "learning_rate": 4.9737736994407026e-05, + "loss": 1.2425, + "step": 1780 + }, + { + "epoch": 0.015824183595891016, + "grad_norm": 3.6301071643829346, + "learning_rate": 4.9736263606735154e-05, + "loss": 1.2585, + "step": 1790 + }, + { + "epoch": 0.015912586856203257, + "grad_norm": 4.821804046630859, + "learning_rate": 4.973479021906328e-05, + "loss": 1.2402, + "step": 1800 + }, + { + "epoch": 0.0160009901165155, + "grad_norm": 5.870527267456055, + "learning_rate": 4.973331683139141e-05, + "loss": 1.1582, + "step": 1810 + }, + { + "epoch": 0.016089393376827736, + "grad_norm": 3.978375196456909, + "learning_rate": 4.973184344371954e-05, + "loss": 1.2734, + "step": 1820 + }, + { + "epoch": 0.016177796637139977, + "grad_norm": 3.5663650035858154, + "learning_rate": 4.973037005604767e-05, + "loss": 1.2663, + "step": 1830 + }, + { + "epoch": 0.01626619989745222, + "grad_norm": 4.388373851776123, + "learning_rate": 4.9728896668375796e-05, + "loss": 1.2131, + "step": 1840 + }, + { + "epoch": 0.01635460315776446, + "grad_norm": 7.263591289520264, + "learning_rate": 4.972742328070393e-05, + "loss": 1.1366, + "step": 1850 + }, + { + "epoch": 0.016443006418076698, + "grad_norm": 3.496192455291748, + "learning_rate": 4.972594989303206e-05, + "loss": 1.1126, + "step": 1860 + }, + { + "epoch": 0.01653140967838894, + "grad_norm": 4.2536702156066895, + "learning_rate": 4.972447650536019e-05, + "loss": 1.2666, + "step": 1870 + }, + { + "epoch": 0.01661981293870118, + "grad_norm": 3.563420534133911, + "learning_rate": 4.9723003117688316e-05, + "loss": 1.1801, + "step": 1880 + }, + { + "epoch": 0.01670821619901342, + "grad_norm": 4.134334564208984, + "learning_rate": 4.9721529730016444e-05, + "loss": 1.2148, + "step": 1890 + }, + { + "epoch": 0.01679661945932566, + "grad_norm": 4.57625675201416, + "learning_rate": 4.972005634234457e-05, + "loss": 1.0861, + "step": 1900 + }, + { + "epoch": 0.0168850227196379, + "grad_norm": 6.389843463897705, + "learning_rate": 4.971858295467271e-05, + "loss": 1.1459, + "step": 1910 + }, + { + "epoch": 0.016973425979950142, + "grad_norm": 5.516792297363281, + "learning_rate": 4.9717109567000836e-05, + "loss": 1.2473, + "step": 1920 + }, + { + "epoch": 0.01706182924026238, + "grad_norm": 4.792562961578369, + "learning_rate": 4.9715636179328964e-05, + "loss": 1.253, + "step": 1930 + }, + { + "epoch": 0.01715023250057462, + "grad_norm": 3.1397716999053955, + "learning_rate": 4.971416279165709e-05, + "loss": 1.1486, + "step": 1940 + }, + { + "epoch": 0.017238635760886863, + "grad_norm": 4.664846897125244, + "learning_rate": 4.971268940398522e-05, + "loss": 1.2853, + "step": 1950 + }, + { + "epoch": 0.0173270390211991, + "grad_norm": 3.8001105785369873, + "learning_rate": 4.971121601631335e-05, + "loss": 1.2221, + "step": 1960 + }, + { + "epoch": 0.017415442281511342, + "grad_norm": 4.641026973724365, + "learning_rate": 4.9709742628641485e-05, + "loss": 1.2473, + "step": 1970 + }, + { + "epoch": 0.017503845541823583, + "grad_norm": 2.9220542907714844, + "learning_rate": 4.9708269240969606e-05, + "loss": 1.201, + "step": 1980 + }, + { + "epoch": 0.017592248802135824, + "grad_norm": 4.261163234710693, + "learning_rate": 4.970679585329774e-05, + "loss": 1.2117, + "step": 1990 + }, + { + "epoch": 0.017680652062448062, + "grad_norm": 8.084556579589844, + "learning_rate": 4.970532246562587e-05, + "loss": 1.2751, + "step": 2000 + }, + { + "epoch": 0.017769055322760303, + "grad_norm": 3.9785537719726562, + "learning_rate": 4.9703849077954e-05, + "loss": 1.2035, + "step": 2010 + }, + { + "epoch": 0.017857458583072545, + "grad_norm": 4.966981887817383, + "learning_rate": 4.9702375690282126e-05, + "loss": 1.0638, + "step": 2020 + }, + { + "epoch": 0.017945861843384783, + "grad_norm": 3.9670522212982178, + "learning_rate": 4.970090230261026e-05, + "loss": 1.2402, + "step": 2030 + }, + { + "epoch": 0.018034265103697024, + "grad_norm": 6.027310371398926, + "learning_rate": 4.969942891493838e-05, + "loss": 1.1792, + "step": 2040 + }, + { + "epoch": 0.018122668364009265, + "grad_norm": 8.415605545043945, + "learning_rate": 4.969795552726652e-05, + "loss": 1.1563, + "step": 2050 + }, + { + "epoch": 0.018211071624321507, + "grad_norm": 4.428287506103516, + "learning_rate": 4.9696482139594647e-05, + "loss": 1.2599, + "step": 2060 + }, + { + "epoch": 0.018299474884633744, + "grad_norm": 3.5284461975097656, + "learning_rate": 4.9695008751922775e-05, + "loss": 1.2318, + "step": 2070 + }, + { + "epoch": 0.018387878144945986, + "grad_norm": 8.998053550720215, + "learning_rate": 4.96935353642509e-05, + "loss": 1.1768, + "step": 2080 + }, + { + "epoch": 0.018476281405258227, + "grad_norm": 7.443400859832764, + "learning_rate": 4.969206197657903e-05, + "loss": 1.2308, + "step": 2090 + }, + { + "epoch": 0.018564684665570465, + "grad_norm": 5.766305923461914, + "learning_rate": 4.969058858890716e-05, + "loss": 1.2445, + "step": 2100 + }, + { + "epoch": 0.018653087925882706, + "grad_norm": 4.347663879394531, + "learning_rate": 4.9689115201235295e-05, + "loss": 1.1199, + "step": 2110 + }, + { + "epoch": 0.018741491186194947, + "grad_norm": 7.1854705810546875, + "learning_rate": 4.968764181356342e-05, + "loss": 1.1827, + "step": 2120 + }, + { + "epoch": 0.01882989444650719, + "grad_norm": 8.310243606567383, + "learning_rate": 4.968616842589155e-05, + "loss": 1.1784, + "step": 2130 + }, + { + "epoch": 0.018918297706819426, + "grad_norm": 3.564779043197632, + "learning_rate": 4.968469503821968e-05, + "loss": 1.1829, + "step": 2140 + }, + { + "epoch": 0.019006700967131668, + "grad_norm": 4.239924907684326, + "learning_rate": 4.968322165054781e-05, + "loss": 1.1483, + "step": 2150 + }, + { + "epoch": 0.01909510422744391, + "grad_norm": 3.9210386276245117, + "learning_rate": 4.968174826287594e-05, + "loss": 1.1491, + "step": 2160 + }, + { + "epoch": 0.019183507487756147, + "grad_norm": 3.756441831588745, + "learning_rate": 4.968027487520407e-05, + "loss": 1.1951, + "step": 2170 + }, + { + "epoch": 0.019271910748068388, + "grad_norm": 6.451724529266357, + "learning_rate": 4.9678801487532194e-05, + "loss": 1.1629, + "step": 2180 + }, + { + "epoch": 0.01936031400838063, + "grad_norm": 6.526517391204834, + "learning_rate": 4.967732809986033e-05, + "loss": 1.1858, + "step": 2190 + }, + { + "epoch": 0.01944871726869287, + "grad_norm": 3.928395986557007, + "learning_rate": 4.967585471218845e-05, + "loss": 1.1959, + "step": 2200 + }, + { + "epoch": 0.01953712052900511, + "grad_norm": 5.018512725830078, + "learning_rate": 4.9674381324516585e-05, + "loss": 1.1671, + "step": 2210 + }, + { + "epoch": 0.01962552378931735, + "grad_norm": 5.034059524536133, + "learning_rate": 4.9672907936844714e-05, + "loss": 1.1155, + "step": 2220 + }, + { + "epoch": 0.01971392704962959, + "grad_norm": 7.713180065155029, + "learning_rate": 4.967143454917284e-05, + "loss": 0.9553, + "step": 2230 + }, + { + "epoch": 0.01980233030994183, + "grad_norm": 4.939086437225342, + "learning_rate": 4.966996116150097e-05, + "loss": 1.2005, + "step": 2240 + }, + { + "epoch": 0.01989073357025407, + "grad_norm": 5.7985920906066895, + "learning_rate": 4.9668487773829106e-05, + "loss": 1.2426, + "step": 2250 + }, + { + "epoch": 0.01997913683056631, + "grad_norm": 3.0697240829467773, + "learning_rate": 4.966701438615723e-05, + "loss": 1.2048, + "step": 2260 + }, + { + "epoch": 0.020067540090878553, + "grad_norm": 5.362508773803711, + "learning_rate": 4.966554099848536e-05, + "loss": 1.2438, + "step": 2270 + }, + { + "epoch": 0.02015594335119079, + "grad_norm": 3.468385934829712, + "learning_rate": 4.966406761081349e-05, + "loss": 1.1942, + "step": 2280 + }, + { + "epoch": 0.020244346611503032, + "grad_norm": 5.108860015869141, + "learning_rate": 4.966259422314162e-05, + "loss": 1.1456, + "step": 2290 + }, + { + "epoch": 0.020332749871815273, + "grad_norm": 4.569157600402832, + "learning_rate": 4.966112083546975e-05, + "loss": 1.1289, + "step": 2300 + }, + { + "epoch": 0.02042115313212751, + "grad_norm": 8.107612609863281, + "learning_rate": 4.9659647447797876e-05, + "loss": 1.0621, + "step": 2310 + }, + { + "epoch": 0.020509556392439753, + "grad_norm": 3.4186110496520996, + "learning_rate": 4.9658174060126004e-05, + "loss": 1.0888, + "step": 2320 + }, + { + "epoch": 0.020597959652751994, + "grad_norm": 6.030848979949951, + "learning_rate": 4.965670067245414e-05, + "loss": 1.1178, + "step": 2330 + }, + { + "epoch": 0.020686362913064235, + "grad_norm": 3.222794532775879, + "learning_rate": 4.965522728478226e-05, + "loss": 1.1568, + "step": 2340 + }, + { + "epoch": 0.020774766173376473, + "grad_norm": 4.0737481117248535, + "learning_rate": 4.9653753897110396e-05, + "loss": 1.1832, + "step": 2350 + }, + { + "epoch": 0.020863169433688714, + "grad_norm": 4.104101657867432, + "learning_rate": 4.9652280509438524e-05, + "loss": 1.2447, + "step": 2360 + }, + { + "epoch": 0.020951572694000956, + "grad_norm": 6.571559429168701, + "learning_rate": 4.965080712176665e-05, + "loss": 1.1735, + "step": 2370 + }, + { + "epoch": 0.021039975954313193, + "grad_norm": 3.1230075359344482, + "learning_rate": 4.964933373409478e-05, + "loss": 1.1589, + "step": 2380 + }, + { + "epoch": 0.021128379214625435, + "grad_norm": 4.8723835945129395, + "learning_rate": 4.9647860346422916e-05, + "loss": 1.2285, + "step": 2390 + }, + { + "epoch": 0.021216782474937676, + "grad_norm": 3.4131295680999756, + "learning_rate": 4.964638695875104e-05, + "loss": 1.069, + "step": 2400 + }, + { + "epoch": 0.021305185735249917, + "grad_norm": 13.047353744506836, + "learning_rate": 4.964491357107917e-05, + "loss": 1.0728, + "step": 2410 + }, + { + "epoch": 0.021393588995562155, + "grad_norm": 4.622305393218994, + "learning_rate": 4.9643440183407294e-05, + "loss": 1.0736, + "step": 2420 + }, + { + "epoch": 0.021481992255874396, + "grad_norm": 4.833291053771973, + "learning_rate": 4.964196679573543e-05, + "loss": 1.1708, + "step": 2430 + }, + { + "epoch": 0.021570395516186638, + "grad_norm": 5.150927543640137, + "learning_rate": 4.964049340806356e-05, + "loss": 1.2133, + "step": 2440 + }, + { + "epoch": 0.021658798776498876, + "grad_norm": 5.4327592849731445, + "learning_rate": 4.9639020020391686e-05, + "loss": 1.0678, + "step": 2450 + }, + { + "epoch": 0.021747202036811117, + "grad_norm": 7.642373561859131, + "learning_rate": 4.9637546632719815e-05, + "loss": 1.2008, + "step": 2460 + }, + { + "epoch": 0.021835605297123358, + "grad_norm": 7.505542278289795, + "learning_rate": 4.963607324504795e-05, + "loss": 0.9884, + "step": 2470 + }, + { + "epoch": 0.0219240085574356, + "grad_norm": 5.696202278137207, + "learning_rate": 4.963459985737607e-05, + "loss": 1.0402, + "step": 2480 + }, + { + "epoch": 0.022012411817747837, + "grad_norm": 5.025146007537842, + "learning_rate": 4.9633126469704206e-05, + "loss": 1.16, + "step": 2490 + }, + { + "epoch": 0.02210081507806008, + "grad_norm": 4.16343879699707, + "learning_rate": 4.9631653082032335e-05, + "loss": 1.1264, + "step": 2500 + }, + { + "epoch": 0.02218921833837232, + "grad_norm": 4.130855560302734, + "learning_rate": 4.963017969436046e-05, + "loss": 1.0499, + "step": 2510 + }, + { + "epoch": 0.02227762159868456, + "grad_norm": 6.513915061950684, + "learning_rate": 4.962870630668859e-05, + "loss": 1.1892, + "step": 2520 + }, + { + "epoch": 0.0223660248589968, + "grad_norm": 7.0379486083984375, + "learning_rate": 4.962723291901673e-05, + "loss": 1.058, + "step": 2530 + }, + { + "epoch": 0.02245442811930904, + "grad_norm": 4.891809463500977, + "learning_rate": 4.962575953134485e-05, + "loss": 1.1433, + "step": 2540 + }, + { + "epoch": 0.02254283137962128, + "grad_norm": 2.6610805988311768, + "learning_rate": 4.962428614367298e-05, + "loss": 1.396, + "step": 2550 + }, + { + "epoch": 0.02263123463993352, + "grad_norm": 4.087412357330322, + "learning_rate": 4.9622812756001105e-05, + "loss": 1.1833, + "step": 2560 + }, + { + "epoch": 0.02271963790024576, + "grad_norm": 4.190392017364502, + "learning_rate": 4.962133936832924e-05, + "loss": 1.0581, + "step": 2570 + }, + { + "epoch": 0.022808041160558002, + "grad_norm": 5.221377849578857, + "learning_rate": 4.961986598065737e-05, + "loss": 0.9584, + "step": 2580 + }, + { + "epoch": 0.022896444420870243, + "grad_norm": 6.462989330291748, + "learning_rate": 4.96183925929855e-05, + "loss": 1.1468, + "step": 2590 + }, + { + "epoch": 0.02298484768118248, + "grad_norm": 5.1533308029174805, + "learning_rate": 4.9616919205313625e-05, + "loss": 1.1259, + "step": 2600 + }, + { + "epoch": 0.023073250941494722, + "grad_norm": 4.590929985046387, + "learning_rate": 4.961544581764176e-05, + "loss": 1.1365, + "step": 2610 + }, + { + "epoch": 0.023161654201806964, + "grad_norm": 4.935781478881836, + "learning_rate": 4.961397242996988e-05, + "loss": 1.154, + "step": 2620 + }, + { + "epoch": 0.0232500574621192, + "grad_norm": 6.083306312561035, + "learning_rate": 4.961249904229802e-05, + "loss": 1.008, + "step": 2630 + }, + { + "epoch": 0.023338460722431443, + "grad_norm": 3.8785037994384766, + "learning_rate": 4.9611025654626145e-05, + "loss": 1.1141, + "step": 2640 + }, + { + "epoch": 0.023426863982743684, + "grad_norm": 2.921254873275757, + "learning_rate": 4.9609552266954274e-05, + "loss": 1.1871, + "step": 2650 + }, + { + "epoch": 0.023515267243055926, + "grad_norm": 3.7994141578674316, + "learning_rate": 4.96080788792824e-05, + "loss": 1.2912, + "step": 2660 + }, + { + "epoch": 0.023603670503368163, + "grad_norm": 3.8834662437438965, + "learning_rate": 4.960660549161053e-05, + "loss": 1.206, + "step": 2670 + }, + { + "epoch": 0.023692073763680405, + "grad_norm": 3.5875086784362793, + "learning_rate": 4.960513210393866e-05, + "loss": 1.0901, + "step": 2680 + }, + { + "epoch": 0.023780477023992646, + "grad_norm": 6.173762321472168, + "learning_rate": 4.9603658716266794e-05, + "loss": 1.0821, + "step": 2690 + }, + { + "epoch": 0.023868880284304884, + "grad_norm": 5.421717166900635, + "learning_rate": 4.960218532859492e-05, + "loss": 1.1437, + "step": 2700 + }, + { + "epoch": 0.023957283544617125, + "grad_norm": 3.7461869716644287, + "learning_rate": 4.960071194092305e-05, + "loss": 1.063, + "step": 2710 + }, + { + "epoch": 0.024045686804929366, + "grad_norm": 3.8545279502868652, + "learning_rate": 4.959923855325118e-05, + "loss": 1.1014, + "step": 2720 + }, + { + "epoch": 0.024134090065241608, + "grad_norm": 7.023557186126709, + "learning_rate": 4.959776516557931e-05, + "loss": 1.1094, + "step": 2730 + }, + { + "epoch": 0.024222493325553845, + "grad_norm": 4.638785362243652, + "learning_rate": 4.9596291777907436e-05, + "loss": 1.1082, + "step": 2740 + }, + { + "epoch": 0.024310896585866087, + "grad_norm": 3.8988077640533447, + "learning_rate": 4.959481839023557e-05, + "loss": 1.0179, + "step": 2750 + }, + { + "epoch": 0.024399299846178328, + "grad_norm": 6.727329254150391, + "learning_rate": 4.95933450025637e-05, + "loss": 1.1137, + "step": 2760 + }, + { + "epoch": 0.024487703106490566, + "grad_norm": 5.458581924438477, + "learning_rate": 4.959187161489183e-05, + "loss": 1.0253, + "step": 2770 + }, + { + "epoch": 0.024576106366802807, + "grad_norm": 4.3151469230651855, + "learning_rate": 4.9590398227219956e-05, + "loss": 1.1655, + "step": 2780 + }, + { + "epoch": 0.02466450962711505, + "grad_norm": 3.221970796585083, + "learning_rate": 4.9588924839548084e-05, + "loss": 1.2279, + "step": 2790 + }, + { + "epoch": 0.02475291288742729, + "grad_norm": 5.484412670135498, + "learning_rate": 4.958745145187621e-05, + "loss": 1.1114, + "step": 2800 + }, + { + "epoch": 0.024841316147739528, + "grad_norm": 3.404670476913452, + "learning_rate": 4.958597806420434e-05, + "loss": 1.1584, + "step": 2810 + }, + { + "epoch": 0.02492971940805177, + "grad_norm": 4.5648298263549805, + "learning_rate": 4.9584504676532476e-05, + "loss": 0.9469, + "step": 2820 + }, + { + "epoch": 0.02501812266836401, + "grad_norm": 5.958621501922607, + "learning_rate": 4.9583031288860604e-05, + "loss": 1.1678, + "step": 2830 + }, + { + "epoch": 0.025106525928676248, + "grad_norm": 4.206620693206787, + "learning_rate": 4.958155790118873e-05, + "loss": 1.2014, + "step": 2840 + }, + { + "epoch": 0.02519492918898849, + "grad_norm": 3.300164222717285, + "learning_rate": 4.958008451351686e-05, + "loss": 1.0201, + "step": 2850 + }, + { + "epoch": 0.02528333244930073, + "grad_norm": 5.670661926269531, + "learning_rate": 4.957861112584499e-05, + "loss": 1.1142, + "step": 2860 + }, + { + "epoch": 0.025371735709612972, + "grad_norm": 5.453832626342773, + "learning_rate": 4.957713773817312e-05, + "loss": 1.1028, + "step": 2870 + }, + { + "epoch": 0.02546013896992521, + "grad_norm": 4.52194356918335, + "learning_rate": 4.957566435050125e-05, + "loss": 1.1089, + "step": 2880 + }, + { + "epoch": 0.02554854223023745, + "grad_norm": 5.5644073486328125, + "learning_rate": 4.9574190962829375e-05, + "loss": 1.0958, + "step": 2890 + }, + { + "epoch": 0.025636945490549692, + "grad_norm": 6.353917121887207, + "learning_rate": 4.957271757515751e-05, + "loss": 1.0777, + "step": 2900 + }, + { + "epoch": 0.02572534875086193, + "grad_norm": 5.321937561035156, + "learning_rate": 4.957124418748564e-05, + "loss": 1.0605, + "step": 2910 + }, + { + "epoch": 0.02581375201117417, + "grad_norm": 6.517303466796875, + "learning_rate": 4.9569770799813766e-05, + "loss": 1.0668, + "step": 2920 + }, + { + "epoch": 0.025902155271486413, + "grad_norm": 5.2277374267578125, + "learning_rate": 4.9568297412141895e-05, + "loss": 1.05, + "step": 2930 + }, + { + "epoch": 0.025990558531798654, + "grad_norm": 6.279105186462402, + "learning_rate": 4.956682402447003e-05, + "loss": 1.0839, + "step": 2940 + }, + { + "epoch": 0.026078961792110892, + "grad_norm": 4.64212703704834, + "learning_rate": 4.956535063679815e-05, + "loss": 1.044, + "step": 2950 + }, + { + "epoch": 0.026167365052423133, + "grad_norm": 7.826188564300537, + "learning_rate": 4.9563877249126287e-05, + "loss": 1.1387, + "step": 2960 + }, + { + "epoch": 0.026255768312735375, + "grad_norm": 5.159195899963379, + "learning_rate": 4.9562403861454415e-05, + "loss": 1.0018, + "step": 2970 + }, + { + "epoch": 0.026344171573047612, + "grad_norm": 13.026447296142578, + "learning_rate": 4.956093047378254e-05, + "loss": 1.2087, + "step": 2980 + }, + { + "epoch": 0.026432574833359854, + "grad_norm": 4.535762310028076, + "learning_rate": 4.955945708611067e-05, + "loss": 1.1684, + "step": 2990 + }, + { + "epoch": 0.026520978093672095, + "grad_norm": 3.30999755859375, + "learning_rate": 4.955798369843881e-05, + "loss": 1.0433, + "step": 3000 + }, + { + "epoch": 0.026609381353984336, + "grad_norm": 7.665961742401123, + "learning_rate": 4.955651031076693e-05, + "loss": 1.1903, + "step": 3010 + }, + { + "epoch": 0.026697784614296574, + "grad_norm": 5.118757724761963, + "learning_rate": 4.9555036923095063e-05, + "loss": 1.121, + "step": 3020 + }, + { + "epoch": 0.026786187874608815, + "grad_norm": 6.991201877593994, + "learning_rate": 4.9553563535423185e-05, + "loss": 1.1503, + "step": 3030 + }, + { + "epoch": 0.026874591134921057, + "grad_norm": 5.291536808013916, + "learning_rate": 4.955209014775132e-05, + "loss": 1.0973, + "step": 3040 + }, + { + "epoch": 0.026962994395233295, + "grad_norm": 5.872868537902832, + "learning_rate": 4.955061676007945e-05, + "loss": 1.1029, + "step": 3050 + }, + { + "epoch": 0.027051397655545536, + "grad_norm": 3.444056987762451, + "learning_rate": 4.954914337240758e-05, + "loss": 1.0218, + "step": 3060 + }, + { + "epoch": 0.027139800915857777, + "grad_norm": 6.330752372741699, + "learning_rate": 4.9547669984735705e-05, + "loss": 1.0465, + "step": 3070 + }, + { + "epoch": 0.02722820417617002, + "grad_norm": 7.991491317749023, + "learning_rate": 4.954619659706384e-05, + "loss": 1.0652, + "step": 3080 + }, + { + "epoch": 0.027316607436482256, + "grad_norm": 6.847014904022217, + "learning_rate": 4.954472320939196e-05, + "loss": 1.2172, + "step": 3090 + }, + { + "epoch": 0.027405010696794498, + "grad_norm": 8.696152687072754, + "learning_rate": 4.95432498217201e-05, + "loss": 1.1707, + "step": 3100 + }, + { + "epoch": 0.02749341395710674, + "grad_norm": 4.785496234893799, + "learning_rate": 4.9541776434048225e-05, + "loss": 1.0667, + "step": 3110 + }, + { + "epoch": 0.027581817217418977, + "grad_norm": 8.054608345031738, + "learning_rate": 4.9540303046376354e-05, + "loss": 1.204, + "step": 3120 + }, + { + "epoch": 0.027670220477731218, + "grad_norm": 6.484571933746338, + "learning_rate": 4.953882965870448e-05, + "loss": 1.1561, + "step": 3130 + }, + { + "epoch": 0.02775862373804346, + "grad_norm": 3.432863235473633, + "learning_rate": 4.953735627103261e-05, + "loss": 1.1016, + "step": 3140 + }, + { + "epoch": 0.0278470269983557, + "grad_norm": 4.097947597503662, + "learning_rate": 4.953588288336074e-05, + "loss": 1.0483, + "step": 3150 + }, + { + "epoch": 0.02793543025866794, + "grad_norm": 6.800324440002441, + "learning_rate": 4.9534409495688874e-05, + "loss": 1.0975, + "step": 3160 + }, + { + "epoch": 0.02802383351898018, + "grad_norm": 19.81365203857422, + "learning_rate": 4.9532936108016996e-05, + "loss": 1.1163, + "step": 3170 + }, + { + "epoch": 0.02811223677929242, + "grad_norm": 4.010586738586426, + "learning_rate": 4.953146272034513e-05, + "loss": 1.0635, + "step": 3180 + }, + { + "epoch": 0.028200640039604662, + "grad_norm": 5.544920444488525, + "learning_rate": 4.952998933267326e-05, + "loss": 1.0454, + "step": 3190 + }, + { + "epoch": 0.0282890432999169, + "grad_norm": 10.799997329711914, + "learning_rate": 4.952851594500139e-05, + "loss": 1.0682, + "step": 3200 + }, + { + "epoch": 0.02837744656022914, + "grad_norm": 11.736847877502441, + "learning_rate": 4.9527042557329516e-05, + "loss": 1.1602, + "step": 3210 + }, + { + "epoch": 0.028465849820541383, + "grad_norm": 6.757185459136963, + "learning_rate": 4.952556916965765e-05, + "loss": 1.1859, + "step": 3220 + }, + { + "epoch": 0.02855425308085362, + "grad_norm": 4.051261901855469, + "learning_rate": 4.952409578198577e-05, + "loss": 1.049, + "step": 3230 + }, + { + "epoch": 0.028642656341165862, + "grad_norm": 3.273878335952759, + "learning_rate": 4.952262239431391e-05, + "loss": 1.0997, + "step": 3240 + }, + { + "epoch": 0.028731059601478103, + "grad_norm": 3.5604453086853027, + "learning_rate": 4.952114900664203e-05, + "loss": 1.1143, + "step": 3250 + }, + { + "epoch": 0.028819462861790344, + "grad_norm": 7.519532203674316, + "learning_rate": 4.9519675618970164e-05, + "loss": 0.9892, + "step": 3260 + }, + { + "epoch": 0.028907866122102582, + "grad_norm": 8.42969799041748, + "learning_rate": 4.951820223129829e-05, + "loss": 1.0713, + "step": 3270 + }, + { + "epoch": 0.028996269382414824, + "grad_norm": 6.64420747756958, + "learning_rate": 4.951672884362642e-05, + "loss": 1.0585, + "step": 3280 + }, + { + "epoch": 0.029084672642727065, + "grad_norm": 5.263432025909424, + "learning_rate": 4.951525545595455e-05, + "loss": 1.1235, + "step": 3290 + }, + { + "epoch": 0.029173075903039303, + "grad_norm": 6.5242509841918945, + "learning_rate": 4.9513782068282684e-05, + "loss": 1.0536, + "step": 3300 + }, + { + "epoch": 0.029261479163351544, + "grad_norm": 5.996762275695801, + "learning_rate": 4.9512308680610806e-05, + "loss": 1.1308, + "step": 3310 + }, + { + "epoch": 0.029349882423663785, + "grad_norm": 7.299069404602051, + "learning_rate": 4.951083529293894e-05, + "loss": 1.0546, + "step": 3320 + }, + { + "epoch": 0.029438285683976027, + "grad_norm": 4.803090572357178, + "learning_rate": 4.950936190526707e-05, + "loss": 1.1245, + "step": 3330 + }, + { + "epoch": 0.029526688944288264, + "grad_norm": 5.4984130859375, + "learning_rate": 4.95078885175952e-05, + "loss": 1.0298, + "step": 3340 + }, + { + "epoch": 0.029615092204600506, + "grad_norm": 4.74207878112793, + "learning_rate": 4.9506415129923326e-05, + "loss": 1.0083, + "step": 3350 + }, + { + "epoch": 0.029703495464912747, + "grad_norm": 7.264033794403076, + "learning_rate": 4.9504941742251455e-05, + "loss": 1.0415, + "step": 3360 + }, + { + "epoch": 0.029791898725224985, + "grad_norm": 11.472503662109375, + "learning_rate": 4.950346835457958e-05, + "loss": 1.0234, + "step": 3370 + }, + { + "epoch": 0.029880301985537226, + "grad_norm": 10.286882400512695, + "learning_rate": 4.950199496690772e-05, + "loss": 1.0993, + "step": 3380 + }, + { + "epoch": 0.029968705245849468, + "grad_norm": 8.054348945617676, + "learning_rate": 4.950052157923584e-05, + "loss": 1.0699, + "step": 3390 + }, + { + "epoch": 0.03005710850616171, + "grad_norm": 4.6194610595703125, + "learning_rate": 4.9499048191563975e-05, + "loss": 1.0353, + "step": 3400 + }, + { + "epoch": 0.030145511766473947, + "grad_norm": 5.325869083404541, + "learning_rate": 4.94975748038921e-05, + "loss": 1.1676, + "step": 3410 + }, + { + "epoch": 0.030233915026786188, + "grad_norm": 3.003830671310425, + "learning_rate": 4.949610141622023e-05, + "loss": 0.9355, + "step": 3420 + }, + { + "epoch": 0.03032231828709843, + "grad_norm": 17.496395111083984, + "learning_rate": 4.949462802854836e-05, + "loss": 1.3197, + "step": 3430 + }, + { + "epoch": 0.030410721547410667, + "grad_norm": 5.781140327453613, + "learning_rate": 4.9493154640876495e-05, + "loss": 1.1219, + "step": 3440 + }, + { + "epoch": 0.03049912480772291, + "grad_norm": 4.405752182006836, + "learning_rate": 4.9491681253204617e-05, + "loss": 0.9957, + "step": 3450 + }, + { + "epoch": 0.03058752806803515, + "grad_norm": 3.774116039276123, + "learning_rate": 4.949020786553275e-05, + "loss": 0.9923, + "step": 3460 + }, + { + "epoch": 0.03067593132834739, + "grad_norm": 6.497739315032959, + "learning_rate": 4.948873447786088e-05, + "loss": 1.1951, + "step": 3470 + }, + { + "epoch": 0.03076433458865963, + "grad_norm": 6.66745662689209, + "learning_rate": 4.948726109018901e-05, + "loss": 1.1229, + "step": 3480 + }, + { + "epoch": 0.03085273784897187, + "grad_norm": 5.97498893737793, + "learning_rate": 4.948578770251714e-05, + "loss": 1.0747, + "step": 3490 + }, + { + "epoch": 0.03094114110928411, + "grad_norm": 9.32170581817627, + "learning_rate": 4.9484314314845265e-05, + "loss": 1.1125, + "step": 3500 + }, + { + "epoch": 0.03102954436959635, + "grad_norm": 5.269107818603516, + "learning_rate": 4.9482840927173393e-05, + "loss": 1.0845, + "step": 3510 + }, + { + "epoch": 0.03111794762990859, + "grad_norm": 5.987974643707275, + "learning_rate": 4.948136753950153e-05, + "loss": 1.0369, + "step": 3520 + }, + { + "epoch": 0.031206350890220832, + "grad_norm": 5.234177112579346, + "learning_rate": 4.947989415182965e-05, + "loss": 1.0749, + "step": 3530 + }, + { + "epoch": 0.03129475415053307, + "grad_norm": 3.885451555252075, + "learning_rate": 4.9478420764157785e-05, + "loss": 1.0154, + "step": 3540 + }, + { + "epoch": 0.03138315741084531, + "grad_norm": 5.198904514312744, + "learning_rate": 4.9476947376485914e-05, + "loss": 1.0572, + "step": 3550 + }, + { + "epoch": 0.031471560671157556, + "grad_norm": 5.708184719085693, + "learning_rate": 4.947547398881404e-05, + "loss": 1.1762, + "step": 3560 + }, + { + "epoch": 0.031559963931469794, + "grad_norm": 4.828928470611572, + "learning_rate": 4.947400060114217e-05, + "loss": 1.0067, + "step": 3570 + }, + { + "epoch": 0.03164836719178203, + "grad_norm": 3.9392752647399902, + "learning_rate": 4.9472527213470305e-05, + "loss": 1.0445, + "step": 3580 + }, + { + "epoch": 0.031736770452094276, + "grad_norm": 3.358604669570923, + "learning_rate": 4.947105382579843e-05, + "loss": 1.1345, + "step": 3590 + }, + { + "epoch": 0.031825173712406514, + "grad_norm": 6.108957290649414, + "learning_rate": 4.946958043812656e-05, + "loss": 1.0802, + "step": 3600 + }, + { + "epoch": 0.03191357697271875, + "grad_norm": 8.162700653076172, + "learning_rate": 4.946810705045469e-05, + "loss": 1.0491, + "step": 3610 + }, + { + "epoch": 0.032001980233031, + "grad_norm": 5.8155341148376465, + "learning_rate": 4.946663366278282e-05, + "loss": 1.1524, + "step": 3620 + }, + { + "epoch": 0.032090383493343234, + "grad_norm": 2.848914861679077, + "learning_rate": 4.946516027511095e-05, + "loss": 1.0649, + "step": 3630 + }, + { + "epoch": 0.03217878675365547, + "grad_norm": 4.426916599273682, + "learning_rate": 4.9463686887439076e-05, + "loss": 0.9712, + "step": 3640 + }, + { + "epoch": 0.03226719001396772, + "grad_norm": 3.7467966079711914, + "learning_rate": 4.9462213499767204e-05, + "loss": 1.1929, + "step": 3650 + }, + { + "epoch": 0.032355593274279955, + "grad_norm": 6.543834686279297, + "learning_rate": 4.946074011209534e-05, + "loss": 1.111, + "step": 3660 + }, + { + "epoch": 0.03244399653459219, + "grad_norm": 3.844190835952759, + "learning_rate": 4.945926672442347e-05, + "loss": 1.1195, + "step": 3670 + }, + { + "epoch": 0.03253239979490444, + "grad_norm": 6.082317352294922, + "learning_rate": 4.9457793336751596e-05, + "loss": 1.1465, + "step": 3680 + }, + { + "epoch": 0.032620803055216675, + "grad_norm": 6.333926200866699, + "learning_rate": 4.9456319949079724e-05, + "loss": 1.0256, + "step": 3690 + }, + { + "epoch": 0.03270920631552892, + "grad_norm": 4.987504959106445, + "learning_rate": 4.945484656140785e-05, + "loss": 1.0487, + "step": 3700 + }, + { + "epoch": 0.03279760957584116, + "grad_norm": 4.357222557067871, + "learning_rate": 4.945337317373598e-05, + "loss": 1.0347, + "step": 3710 + }, + { + "epoch": 0.032886012836153396, + "grad_norm": 2.961578369140625, + "learning_rate": 4.945189978606411e-05, + "loss": 0.9719, + "step": 3720 + }, + { + "epoch": 0.03297441609646564, + "grad_norm": 6.053518295288086, + "learning_rate": 4.9450426398392244e-05, + "loss": 1.2505, + "step": 3730 + }, + { + "epoch": 0.03306281935677788, + "grad_norm": 11.385558128356934, + "learning_rate": 4.944895301072037e-05, + "loss": 1.0115, + "step": 3740 + }, + { + "epoch": 0.033151222617090116, + "grad_norm": 7.48608922958374, + "learning_rate": 4.94474796230485e-05, + "loss": 0.9599, + "step": 3750 + }, + { + "epoch": 0.03323962587740236, + "grad_norm": 8.101400375366211, + "learning_rate": 4.944600623537663e-05, + "loss": 1.2365, + "step": 3760 + }, + { + "epoch": 0.0333280291377146, + "grad_norm": 3.3874876499176025, + "learning_rate": 4.944453284770476e-05, + "loss": 1.0796, + "step": 3770 + }, + { + "epoch": 0.03341643239802684, + "grad_norm": 4.571724891662598, + "learning_rate": 4.9443059460032886e-05, + "loss": 1.0822, + "step": 3780 + }, + { + "epoch": 0.03350483565833908, + "grad_norm": 7.061591625213623, + "learning_rate": 4.944158607236102e-05, + "loss": 0.9967, + "step": 3790 + }, + { + "epoch": 0.03359323891865132, + "grad_norm": 5.082997798919678, + "learning_rate": 4.944011268468915e-05, + "loss": 1.0895, + "step": 3800 + }, + { + "epoch": 0.03368164217896356, + "grad_norm": 3.501892566680908, + "learning_rate": 4.943863929701728e-05, + "loss": 1.1071, + "step": 3810 + }, + { + "epoch": 0.0337700454392758, + "grad_norm": 6.779443264007568, + "learning_rate": 4.9437165909345406e-05, + "loss": 0.994, + "step": 3820 + }, + { + "epoch": 0.03385844869958804, + "grad_norm": 5.151303768157959, + "learning_rate": 4.9435692521673535e-05, + "loss": 1.1656, + "step": 3830 + }, + { + "epoch": 0.033946851959900284, + "grad_norm": 2.304898738861084, + "learning_rate": 4.943421913400166e-05, + "loss": 1.1191, + "step": 3840 + }, + { + "epoch": 0.03403525522021252, + "grad_norm": 4.50697135925293, + "learning_rate": 4.94327457463298e-05, + "loss": 1.032, + "step": 3850 + }, + { + "epoch": 0.03412365848052476, + "grad_norm": 5.010416507720947, + "learning_rate": 4.943127235865792e-05, + "loss": 1.0835, + "step": 3860 + }, + { + "epoch": 0.034212061740837005, + "grad_norm": 6.183775901794434, + "learning_rate": 4.9429798970986055e-05, + "loss": 1.0877, + "step": 3870 + }, + { + "epoch": 0.03430046500114924, + "grad_norm": 7.286951065063477, + "learning_rate": 4.942832558331418e-05, + "loss": 1.1113, + "step": 3880 + }, + { + "epoch": 0.03438886826146148, + "grad_norm": 5.37122106552124, + "learning_rate": 4.942685219564231e-05, + "loss": 1.079, + "step": 3890 + }, + { + "epoch": 0.034477271521773725, + "grad_norm": 5.562955856323242, + "learning_rate": 4.942537880797044e-05, + "loss": 1.1608, + "step": 3900 + }, + { + "epoch": 0.03456567478208596, + "grad_norm": 5.252758026123047, + "learning_rate": 4.9423905420298575e-05, + "loss": 1.0415, + "step": 3910 + }, + { + "epoch": 0.0346540780423982, + "grad_norm": 6.825470924377441, + "learning_rate": 4.94224320326267e-05, + "loss": 1.0164, + "step": 3920 + }, + { + "epoch": 0.034742481302710446, + "grad_norm": 5.437935829162598, + "learning_rate": 4.942095864495483e-05, + "loss": 1.0165, + "step": 3930 + }, + { + "epoch": 0.034830884563022683, + "grad_norm": 6.060318946838379, + "learning_rate": 4.941948525728296e-05, + "loss": 1.1156, + "step": 3940 + }, + { + "epoch": 0.03491928782333492, + "grad_norm": 3.731422185897827, + "learning_rate": 4.941801186961109e-05, + "loss": 0.9086, + "step": 3950 + }, + { + "epoch": 0.035007691083647166, + "grad_norm": 7.884082794189453, + "learning_rate": 4.941653848193922e-05, + "loss": 1.1156, + "step": 3960 + }, + { + "epoch": 0.035096094343959404, + "grad_norm": 6.734913349151611, + "learning_rate": 4.9415065094267345e-05, + "loss": 1.0593, + "step": 3970 + }, + { + "epoch": 0.03518449760427165, + "grad_norm": 9.117546081542969, + "learning_rate": 4.9413591706595474e-05, + "loss": 1.0714, + "step": 3980 + }, + { + "epoch": 0.035272900864583887, + "grad_norm": 16.266523361206055, + "learning_rate": 4.941211831892361e-05, + "loss": 1.1006, + "step": 3990 + }, + { + "epoch": 0.035361304124896124, + "grad_norm": 3.810106039047241, + "learning_rate": 4.941064493125173e-05, + "loss": 0.9989, + "step": 4000 + }, + { + "epoch": 0.03544970738520837, + "grad_norm": 4.392092704772949, + "learning_rate": 4.9409171543579865e-05, + "loss": 1.0222, + "step": 4010 + }, + { + "epoch": 0.03553811064552061, + "grad_norm": 4.306662559509277, + "learning_rate": 4.9407698155907994e-05, + "loss": 1.0154, + "step": 4020 + }, + { + "epoch": 0.035626513905832845, + "grad_norm": 3.4554357528686523, + "learning_rate": 4.940622476823612e-05, + "loss": 1.0859, + "step": 4030 + }, + { + "epoch": 0.03571491716614509, + "grad_norm": 4.0056562423706055, + "learning_rate": 4.940475138056425e-05, + "loss": 1.0923, + "step": 4040 + }, + { + "epoch": 0.03580332042645733, + "grad_norm": 3.4190101623535156, + "learning_rate": 4.9403277992892386e-05, + "loss": 0.9547, + "step": 4050 + }, + { + "epoch": 0.035891723686769565, + "grad_norm": 4.973897457122803, + "learning_rate": 4.940180460522051e-05, + "loss": 1.2254, + "step": 4060 + }, + { + "epoch": 0.03598012694708181, + "grad_norm": 4.1999359130859375, + "learning_rate": 4.940033121754864e-05, + "loss": 0.9756, + "step": 4070 + }, + { + "epoch": 0.03606853020739405, + "grad_norm": 12.506514549255371, + "learning_rate": 4.9398857829876764e-05, + "loss": 0.9271, + "step": 4080 + }, + { + "epoch": 0.036156933467706286, + "grad_norm": 9.009496688842773, + "learning_rate": 4.93973844422049e-05, + "loss": 1.0405, + "step": 4090 + }, + { + "epoch": 0.03624533672801853, + "grad_norm": 4.755258560180664, + "learning_rate": 4.939591105453303e-05, + "loss": 0.9885, + "step": 4100 + }, + { + "epoch": 0.03633373998833077, + "grad_norm": 7.77773904800415, + "learning_rate": 4.9394437666861156e-05, + "loss": 1.1745, + "step": 4110 + }, + { + "epoch": 0.03642214324864301, + "grad_norm": 3.6486005783081055, + "learning_rate": 4.9392964279189284e-05, + "loss": 1.0569, + "step": 4120 + }, + { + "epoch": 0.03651054650895525, + "grad_norm": 6.329847812652588, + "learning_rate": 4.939149089151742e-05, + "loss": 1.0719, + "step": 4130 + }, + { + "epoch": 0.03659894976926749, + "grad_norm": 4.576066493988037, + "learning_rate": 4.939001750384554e-05, + "loss": 1.0469, + "step": 4140 + }, + { + "epoch": 0.03668735302957973, + "grad_norm": 5.984560966491699, + "learning_rate": 4.9388544116173676e-05, + "loss": 1.032, + "step": 4150 + }, + { + "epoch": 0.03677575628989197, + "grad_norm": 8.831100463867188, + "learning_rate": 4.9387070728501804e-05, + "loss": 1.0485, + "step": 4160 + }, + { + "epoch": 0.03686415955020421, + "grad_norm": 3.0822088718414307, + "learning_rate": 4.938559734082993e-05, + "loss": 0.9294, + "step": 4170 + }, + { + "epoch": 0.036952562810516454, + "grad_norm": 6.145000457763672, + "learning_rate": 4.938412395315806e-05, + "loss": 1.1835, + "step": 4180 + }, + { + "epoch": 0.03704096607082869, + "grad_norm": 3.957902193069458, + "learning_rate": 4.938265056548619e-05, + "loss": 1.0971, + "step": 4190 + }, + { + "epoch": 0.03712936933114093, + "grad_norm": 6.664913654327393, + "learning_rate": 4.938117717781432e-05, + "loss": 0.9565, + "step": 4200 + }, + { + "epoch": 0.037217772591453174, + "grad_norm": 4.730056285858154, + "learning_rate": 4.937970379014245e-05, + "loss": 1.1818, + "step": 4210 + }, + { + "epoch": 0.03730617585176541, + "grad_norm": 3.4853060245513916, + "learning_rate": 4.9378230402470574e-05, + "loss": 1.108, + "step": 4220 + }, + { + "epoch": 0.03739457911207766, + "grad_norm": 5.281261920928955, + "learning_rate": 4.937675701479871e-05, + "loss": 1.0069, + "step": 4230 + }, + { + "epoch": 0.037482982372389895, + "grad_norm": 3.6927974224090576, + "learning_rate": 4.937528362712684e-05, + "loss": 1.0068, + "step": 4240 + }, + { + "epoch": 0.03757138563270213, + "grad_norm": 5.452990531921387, + "learning_rate": 4.9373810239454966e-05, + "loss": 1.0916, + "step": 4250 + }, + { + "epoch": 0.03765978889301438, + "grad_norm": 6.807621479034424, + "learning_rate": 4.9372336851783095e-05, + "loss": 0.9799, + "step": 4260 + }, + { + "epoch": 0.037748192153326615, + "grad_norm": 6.8397393226623535, + "learning_rate": 4.937086346411123e-05, + "loss": 1.0614, + "step": 4270 + }, + { + "epoch": 0.03783659541363885, + "grad_norm": 3.4389631748199463, + "learning_rate": 4.936939007643935e-05, + "loss": 1.0625, + "step": 4280 + }, + { + "epoch": 0.0379249986739511, + "grad_norm": 6.080694675445557, + "learning_rate": 4.9367916688767486e-05, + "loss": 1.1324, + "step": 4290 + }, + { + "epoch": 0.038013401934263336, + "grad_norm": 3.531956911087036, + "learning_rate": 4.936644330109561e-05, + "loss": 1.0415, + "step": 4300 + }, + { + "epoch": 0.03810180519457557, + "grad_norm": 6.8647894859313965, + "learning_rate": 4.936496991342374e-05, + "loss": 1.0879, + "step": 4310 + }, + { + "epoch": 0.03819020845488782, + "grad_norm": 5.773015975952148, + "learning_rate": 4.936349652575187e-05, + "loss": 1.0298, + "step": 4320 + }, + { + "epoch": 0.038278611715200056, + "grad_norm": 11.786985397338867, + "learning_rate": 4.936202313808e-05, + "loss": 1.0216, + "step": 4330 + }, + { + "epoch": 0.038367014975512294, + "grad_norm": 9.405512809753418, + "learning_rate": 4.936054975040813e-05, + "loss": 0.9559, + "step": 4340 + }, + { + "epoch": 0.03845541823582454, + "grad_norm": 5.85341739654541, + "learning_rate": 4.935907636273626e-05, + "loss": 1.1283, + "step": 4350 + }, + { + "epoch": 0.038543821496136776, + "grad_norm": 3.5939722061157227, + "learning_rate": 4.9357602975064385e-05, + "loss": 1.0697, + "step": 4360 + }, + { + "epoch": 0.03863222475644902, + "grad_norm": 4.775249481201172, + "learning_rate": 4.935612958739252e-05, + "loss": 0.9419, + "step": 4370 + }, + { + "epoch": 0.03872062801676126, + "grad_norm": 3.026939868927002, + "learning_rate": 4.935465619972065e-05, + "loss": 1.0233, + "step": 4380 + }, + { + "epoch": 0.0388090312770735, + "grad_norm": 3.604031562805176, + "learning_rate": 4.935318281204878e-05, + "loss": 1.1227, + "step": 4390 + }, + { + "epoch": 0.03889743453738574, + "grad_norm": 5.833409786224365, + "learning_rate": 4.9351709424376905e-05, + "loss": 1.0837, + "step": 4400 + }, + { + "epoch": 0.03898583779769798, + "grad_norm": 8.4013090133667, + "learning_rate": 4.935023603670504e-05, + "loss": 1.074, + "step": 4410 + }, + { + "epoch": 0.03907424105801022, + "grad_norm": 8.928999900817871, + "learning_rate": 4.934876264903316e-05, + "loss": 1.0495, + "step": 4420 + }, + { + "epoch": 0.03916264431832246, + "grad_norm": 6.564711570739746, + "learning_rate": 4.93472892613613e-05, + "loss": 1.0646, + "step": 4430 + }, + { + "epoch": 0.0392510475786347, + "grad_norm": 4.471921920776367, + "learning_rate": 4.934581587368942e-05, + "loss": 1.1081, + "step": 4440 + }, + { + "epoch": 0.03933945083894694, + "grad_norm": 5.407211780548096, + "learning_rate": 4.9344342486017554e-05, + "loss": 1.1002, + "step": 4450 + }, + { + "epoch": 0.03942785409925918, + "grad_norm": 3.1841092109680176, + "learning_rate": 4.934286909834568e-05, + "loss": 1.0101, + "step": 4460 + }, + { + "epoch": 0.03951625735957142, + "grad_norm": 6.48630952835083, + "learning_rate": 4.934139571067381e-05, + "loss": 0.9795, + "step": 4470 + }, + { + "epoch": 0.03960466061988366, + "grad_norm": 6.4455389976501465, + "learning_rate": 4.933992232300194e-05, + "loss": 0.9063, + "step": 4480 + }, + { + "epoch": 0.0396930638801959, + "grad_norm": 6.892740249633789, + "learning_rate": 4.9338448935330074e-05, + "loss": 1.1026, + "step": 4490 + }, + { + "epoch": 0.03978146714050814, + "grad_norm": 6.074153423309326, + "learning_rate": 4.9336975547658195e-05, + "loss": 1.0885, + "step": 4500 + }, + { + "epoch": 0.039869870400820386, + "grad_norm": 6.199062824249268, + "learning_rate": 4.933550215998633e-05, + "loss": 1.0653, + "step": 4510 + }, + { + "epoch": 0.03995827366113262, + "grad_norm": 4.292714595794678, + "learning_rate": 4.933402877231446e-05, + "loss": 1.0616, + "step": 4520 + }, + { + "epoch": 0.04004667692144486, + "grad_norm": 5.541876792907715, + "learning_rate": 4.933255538464259e-05, + "loss": 1.045, + "step": 4530 + }, + { + "epoch": 0.040135080181757106, + "grad_norm": 5.079677581787109, + "learning_rate": 4.9331081996970716e-05, + "loss": 0.9155, + "step": 4540 + }, + { + "epoch": 0.040223483442069344, + "grad_norm": 4.602534770965576, + "learning_rate": 4.9329608609298844e-05, + "loss": 1.0384, + "step": 4550 + }, + { + "epoch": 0.04031188670238158, + "grad_norm": 5.581985950469971, + "learning_rate": 4.932813522162697e-05, + "loss": 1.0825, + "step": 4560 + }, + { + "epoch": 0.040400289962693826, + "grad_norm": 6.3072896003723145, + "learning_rate": 4.932666183395511e-05, + "loss": 1.0934, + "step": 4570 + }, + { + "epoch": 0.040488693223006064, + "grad_norm": 4.629852294921875, + "learning_rate": 4.9325188446283236e-05, + "loss": 1.0087, + "step": 4580 + }, + { + "epoch": 0.0405770964833183, + "grad_norm": 3.7570137977600098, + "learning_rate": 4.9323715058611364e-05, + "loss": 1.0211, + "step": 4590 + }, + { + "epoch": 0.04066549974363055, + "grad_norm": 9.198297500610352, + "learning_rate": 4.932224167093949e-05, + "loss": 0.9675, + "step": 4600 + }, + { + "epoch": 0.040753903003942785, + "grad_norm": 5.447032928466797, + "learning_rate": 4.932076828326762e-05, + "loss": 1.0059, + "step": 4610 + }, + { + "epoch": 0.04084230626425502, + "grad_norm": 6.04211950302124, + "learning_rate": 4.931929489559575e-05, + "loss": 1.0281, + "step": 4620 + }, + { + "epoch": 0.04093070952456727, + "grad_norm": 8.443472862243652, + "learning_rate": 4.9317821507923884e-05, + "loss": 1.0628, + "step": 4630 + }, + { + "epoch": 0.041019112784879505, + "grad_norm": 4.586818695068359, + "learning_rate": 4.931634812025201e-05, + "loss": 0.9729, + "step": 4640 + }, + { + "epoch": 0.04110751604519175, + "grad_norm": 5.2301344871521, + "learning_rate": 4.931487473258014e-05, + "loss": 1.0448, + "step": 4650 + }, + { + "epoch": 0.04119591930550399, + "grad_norm": 5.564103603363037, + "learning_rate": 4.931340134490827e-05, + "loss": 1.12, + "step": 4660 + }, + { + "epoch": 0.041284322565816226, + "grad_norm": 4.086526393890381, + "learning_rate": 4.93119279572364e-05, + "loss": 1.0465, + "step": 4670 + }, + { + "epoch": 0.04137272582612847, + "grad_norm": 5.185882091522217, + "learning_rate": 4.9310454569564526e-05, + "loss": 0.9242, + "step": 4680 + }, + { + "epoch": 0.04146112908644071, + "grad_norm": 8.439657211303711, + "learning_rate": 4.9308981181892654e-05, + "loss": 0.9688, + "step": 4690 + }, + { + "epoch": 0.041549532346752946, + "grad_norm": 4.6650800704956055, + "learning_rate": 4.930750779422079e-05, + "loss": 1.0498, + "step": 4700 + }, + { + "epoch": 0.04163793560706519, + "grad_norm": 5.943160057067871, + "learning_rate": 4.930603440654892e-05, + "loss": 1.1057, + "step": 4710 + }, + { + "epoch": 0.04172633886737743, + "grad_norm": 4.689329624176025, + "learning_rate": 4.9304561018877046e-05, + "loss": 1.0006, + "step": 4720 + }, + { + "epoch": 0.041814742127689666, + "grad_norm": 8.358399391174316, + "learning_rate": 4.9303087631205175e-05, + "loss": 1.0855, + "step": 4730 + }, + { + "epoch": 0.04190314538800191, + "grad_norm": 4.517096519470215, + "learning_rate": 4.93016142435333e-05, + "loss": 0.9695, + "step": 4740 + }, + { + "epoch": 0.04199154864831415, + "grad_norm": 6.596667289733887, + "learning_rate": 4.930014085586143e-05, + "loss": 1.0451, + "step": 4750 + }, + { + "epoch": 0.04207995190862639, + "grad_norm": 4.656362056732178, + "learning_rate": 4.9298667468189566e-05, + "loss": 0.9147, + "step": 4760 + }, + { + "epoch": 0.04216835516893863, + "grad_norm": 11.312739372253418, + "learning_rate": 4.9297194080517695e-05, + "loss": 0.9862, + "step": 4770 + }, + { + "epoch": 0.04225675842925087, + "grad_norm": 4.505188465118408, + "learning_rate": 4.929572069284582e-05, + "loss": 1.0174, + "step": 4780 + }, + { + "epoch": 0.042345161689563114, + "grad_norm": 4.732143878936768, + "learning_rate": 4.929424730517395e-05, + "loss": 0.9307, + "step": 4790 + }, + { + "epoch": 0.04243356494987535, + "grad_norm": 4.13836669921875, + "learning_rate": 4.929277391750208e-05, + "loss": 0.9735, + "step": 4800 + }, + { + "epoch": 0.04252196821018759, + "grad_norm": 5.821000099182129, + "learning_rate": 4.929130052983021e-05, + "loss": 0.9195, + "step": 4810 + }, + { + "epoch": 0.042610371470499835, + "grad_norm": 3.0030617713928223, + "learning_rate": 4.928982714215834e-05, + "loss": 0.9304, + "step": 4820 + }, + { + "epoch": 0.04269877473081207, + "grad_norm": 3.2358474731445312, + "learning_rate": 4.9288353754486465e-05, + "loss": 1.1401, + "step": 4830 + }, + { + "epoch": 0.04278717799112431, + "grad_norm": 4.475287914276123, + "learning_rate": 4.92868803668146e-05, + "loss": 0.9251, + "step": 4840 + }, + { + "epoch": 0.042875581251436555, + "grad_norm": 5.249177932739258, + "learning_rate": 4.928540697914273e-05, + "loss": 1.0662, + "step": 4850 + }, + { + "epoch": 0.04296398451174879, + "grad_norm": 6.232396602630615, + "learning_rate": 4.928393359147086e-05, + "loss": 1.0454, + "step": 4860 + }, + { + "epoch": 0.04305238777206103, + "grad_norm": 3.666123390197754, + "learning_rate": 4.9282460203798985e-05, + "loss": 1.0652, + "step": 4870 + }, + { + "epoch": 0.043140791032373275, + "grad_norm": 6.440056324005127, + "learning_rate": 4.928098681612712e-05, + "loss": 1.0286, + "step": 4880 + }, + { + "epoch": 0.04322919429268551, + "grad_norm": 4.403159141540527, + "learning_rate": 4.927951342845524e-05, + "loss": 1.0054, + "step": 4890 + }, + { + "epoch": 0.04331759755299775, + "grad_norm": 5.795849800109863, + "learning_rate": 4.927804004078338e-05, + "loss": 1.0152, + "step": 4900 + }, + { + "epoch": 0.043406000813309996, + "grad_norm": 12.21654987335205, + "learning_rate": 4.92765666531115e-05, + "loss": 0.9303, + "step": 4910 + }, + { + "epoch": 0.043494404073622234, + "grad_norm": 4.522624969482422, + "learning_rate": 4.9275093265439634e-05, + "loss": 1.0237, + "step": 4920 + }, + { + "epoch": 0.04358280733393448, + "grad_norm": 7.325983047485352, + "learning_rate": 4.927361987776776e-05, + "loss": 0.9337, + "step": 4930 + }, + { + "epoch": 0.043671210594246716, + "grad_norm": 11.711366653442383, + "learning_rate": 4.927214649009589e-05, + "loss": 1.0331, + "step": 4940 + }, + { + "epoch": 0.043759613854558954, + "grad_norm": 7.563577651977539, + "learning_rate": 4.927067310242402e-05, + "loss": 1.0056, + "step": 4950 + }, + { + "epoch": 0.0438480171148712, + "grad_norm": 3.4880125522613525, + "learning_rate": 4.9269199714752154e-05, + "loss": 0.8889, + "step": 4960 + }, + { + "epoch": 0.04393642037518344, + "grad_norm": 8.676736831665039, + "learning_rate": 4.9267726327080275e-05, + "loss": 1.2163, + "step": 4970 + }, + { + "epoch": 0.044024823635495675, + "grad_norm": 6.389759540557861, + "learning_rate": 4.926625293940841e-05, + "loss": 1.0256, + "step": 4980 + }, + { + "epoch": 0.04411322689580792, + "grad_norm": 5.722421646118164, + "learning_rate": 4.926477955173654e-05, + "loss": 0.9956, + "step": 4990 + }, + { + "epoch": 0.04420163015612016, + "grad_norm": 3.8869667053222656, + "learning_rate": 4.926330616406467e-05, + "loss": 0.924, + "step": 5000 + }, + { + "epoch": 0.044290033416432395, + "grad_norm": 9.306219100952148, + "learning_rate": 4.9261832776392796e-05, + "loss": 1.1055, + "step": 5010 + }, + { + "epoch": 0.04437843667674464, + "grad_norm": 5.7872633934021, + "learning_rate": 4.9260359388720924e-05, + "loss": 0.9434, + "step": 5020 + }, + { + "epoch": 0.04446683993705688, + "grad_norm": 2.841301679611206, + "learning_rate": 4.925888600104905e-05, + "loss": 0.9664, + "step": 5030 + }, + { + "epoch": 0.04455524319736912, + "grad_norm": 5.8848748207092285, + "learning_rate": 4.925741261337719e-05, + "loss": 0.9785, + "step": 5040 + }, + { + "epoch": 0.04464364645768136, + "grad_norm": 5.913470268249512, + "learning_rate": 4.925593922570531e-05, + "loss": 0.9967, + "step": 5050 + }, + { + "epoch": 0.0447320497179936, + "grad_norm": 5.830585479736328, + "learning_rate": 4.9254465838033444e-05, + "loss": 1.0564, + "step": 5060 + }, + { + "epoch": 0.04482045297830584, + "grad_norm": 3.5792245864868164, + "learning_rate": 4.925299245036157e-05, + "loss": 1.2437, + "step": 5070 + }, + { + "epoch": 0.04490885623861808, + "grad_norm": 5.205353736877441, + "learning_rate": 4.92515190626897e-05, + "loss": 1.2016, + "step": 5080 + }, + { + "epoch": 0.04499725949893032, + "grad_norm": 4.309473514556885, + "learning_rate": 4.925004567501783e-05, + "loss": 0.931, + "step": 5090 + }, + { + "epoch": 0.04508566275924256, + "grad_norm": 7.3599982261657715, + "learning_rate": 4.9248572287345964e-05, + "loss": 0.9347, + "step": 5100 + }, + { + "epoch": 0.0451740660195548, + "grad_norm": 4.055037021636963, + "learning_rate": 4.9247098899674086e-05, + "loss": 0.9706, + "step": 5110 + }, + { + "epoch": 0.04526246927986704, + "grad_norm": 7.1950578689575195, + "learning_rate": 4.924562551200222e-05, + "loss": 0.9902, + "step": 5120 + }, + { + "epoch": 0.045350872540179284, + "grad_norm": 3.477550506591797, + "learning_rate": 4.924415212433034e-05, + "loss": 1.1443, + "step": 5130 + }, + { + "epoch": 0.04543927580049152, + "grad_norm": 6.067864418029785, + "learning_rate": 4.924267873665848e-05, + "loss": 0.9565, + "step": 5140 + }, + { + "epoch": 0.04552767906080376, + "grad_norm": 4.849110126495361, + "learning_rate": 4.9241205348986606e-05, + "loss": 0.9515, + "step": 5150 + }, + { + "epoch": 0.045616082321116004, + "grad_norm": 6.101007461547852, + "learning_rate": 4.9239731961314734e-05, + "loss": 0.927, + "step": 5160 + }, + { + "epoch": 0.04570448558142824, + "grad_norm": 9.797233581542969, + "learning_rate": 4.923825857364286e-05, + "loss": 1.0617, + "step": 5170 + }, + { + "epoch": 0.04579288884174049, + "grad_norm": 6.435054302215576, + "learning_rate": 4.9236785185971e-05, + "loss": 0.9908, + "step": 5180 + }, + { + "epoch": 0.045881292102052725, + "grad_norm": 6.170694828033447, + "learning_rate": 4.923531179829912e-05, + "loss": 1.0215, + "step": 5190 + }, + { + "epoch": 0.04596969536236496, + "grad_norm": 5.148831844329834, + "learning_rate": 4.9233838410627255e-05, + "loss": 1.0137, + "step": 5200 + }, + { + "epoch": 0.04605809862267721, + "grad_norm": 5.061634540557861, + "learning_rate": 4.923236502295538e-05, + "loss": 0.9616, + "step": 5210 + }, + { + "epoch": 0.046146501882989445, + "grad_norm": 6.394944190979004, + "learning_rate": 4.923089163528351e-05, + "loss": 1.0306, + "step": 5220 + }, + { + "epoch": 0.04623490514330168, + "grad_norm": 3.3894765377044678, + "learning_rate": 4.922941824761164e-05, + "loss": 1.112, + "step": 5230 + }, + { + "epoch": 0.04632330840361393, + "grad_norm": 2.86142635345459, + "learning_rate": 4.9227944859939775e-05, + "loss": 0.9926, + "step": 5240 + }, + { + "epoch": 0.046411711663926165, + "grad_norm": 5.8516693115234375, + "learning_rate": 4.9226471472267896e-05, + "loss": 0.8999, + "step": 5250 + }, + { + "epoch": 0.0465001149242384, + "grad_norm": 5.580111503601074, + "learning_rate": 4.922499808459603e-05, + "loss": 0.9504, + "step": 5260 + }, + { + "epoch": 0.04658851818455065, + "grad_norm": 8.920647621154785, + "learning_rate": 4.922352469692415e-05, + "loss": 1.0487, + "step": 5270 + }, + { + "epoch": 0.046676921444862886, + "grad_norm": 6.242912769317627, + "learning_rate": 4.922205130925229e-05, + "loss": 1.0432, + "step": 5280 + }, + { + "epoch": 0.046765324705175124, + "grad_norm": 7.302402496337891, + "learning_rate": 4.922057792158042e-05, + "loss": 1.1208, + "step": 5290 + }, + { + "epoch": 0.04685372796548737, + "grad_norm": 9.732843399047852, + "learning_rate": 4.9219104533908545e-05, + "loss": 0.9864, + "step": 5300 + }, + { + "epoch": 0.046942131225799606, + "grad_norm": 4.585763931274414, + "learning_rate": 4.921763114623667e-05, + "loss": 1.0379, + "step": 5310 + }, + { + "epoch": 0.04703053448611185, + "grad_norm": 5.401805400848389, + "learning_rate": 4.921615775856481e-05, + "loss": 0.9634, + "step": 5320 + }, + { + "epoch": 0.04711893774642409, + "grad_norm": 6.35153865814209, + "learning_rate": 4.921468437089293e-05, + "loss": 1.0483, + "step": 5330 + }, + { + "epoch": 0.04720734100673633, + "grad_norm": 5.659788131713867, + "learning_rate": 4.9213210983221065e-05, + "loss": 1.0511, + "step": 5340 + }, + { + "epoch": 0.04729574426704857, + "grad_norm": 4.884911060333252, + "learning_rate": 4.9211737595549194e-05, + "loss": 1.0186, + "step": 5350 + }, + { + "epoch": 0.04738414752736081, + "grad_norm": 4.303860187530518, + "learning_rate": 4.921026420787732e-05, + "loss": 0.9606, + "step": 5360 + }, + { + "epoch": 0.04747255078767305, + "grad_norm": 10.028541564941406, + "learning_rate": 4.920879082020545e-05, + "loss": 1.1099, + "step": 5370 + }, + { + "epoch": 0.04756095404798529, + "grad_norm": 6.026510715484619, + "learning_rate": 4.920731743253358e-05, + "loss": 1.0223, + "step": 5380 + }, + { + "epoch": 0.04764935730829753, + "grad_norm": 3.0914306640625, + "learning_rate": 4.920584404486171e-05, + "loss": 0.9467, + "step": 5390 + }, + { + "epoch": 0.04773776056860977, + "grad_norm": 6.44119119644165, + "learning_rate": 4.920437065718984e-05, + "loss": 0.9499, + "step": 5400 + }, + { + "epoch": 0.04782616382892201, + "grad_norm": 3.9868052005767822, + "learning_rate": 4.9202897269517964e-05, + "loss": 0.987, + "step": 5410 + }, + { + "epoch": 0.04791456708923425, + "grad_norm": 7.742058753967285, + "learning_rate": 4.92014238818461e-05, + "loss": 1.0558, + "step": 5420 + }, + { + "epoch": 0.04800297034954649, + "grad_norm": 7.036317825317383, + "learning_rate": 4.919995049417423e-05, + "loss": 0.9177, + "step": 5430 + }, + { + "epoch": 0.04809137360985873, + "grad_norm": 9.096891403198242, + "learning_rate": 4.9198477106502355e-05, + "loss": 0.9944, + "step": 5440 + }, + { + "epoch": 0.04817977687017097, + "grad_norm": 6.504842758178711, + "learning_rate": 4.9197003718830484e-05, + "loss": 1.0012, + "step": 5450 + }, + { + "epoch": 0.048268180130483215, + "grad_norm": 3.3751108646392822, + "learning_rate": 4.919553033115862e-05, + "loss": 1.0993, + "step": 5460 + }, + { + "epoch": 0.04835658339079545, + "grad_norm": 3.9735724925994873, + "learning_rate": 4.919405694348674e-05, + "loss": 0.9421, + "step": 5470 + }, + { + "epoch": 0.04844498665110769, + "grad_norm": 7.27800178527832, + "learning_rate": 4.9192583555814876e-05, + "loss": 1.0485, + "step": 5480 + }, + { + "epoch": 0.048533389911419936, + "grad_norm": 6.35645055770874, + "learning_rate": 4.9191110168143004e-05, + "loss": 0.977, + "step": 5490 + }, + { + "epoch": 0.048621793171732174, + "grad_norm": 5.093990325927734, + "learning_rate": 4.918963678047113e-05, + "loss": 1.1082, + "step": 5500 + }, + { + "epoch": 0.04871019643204441, + "grad_norm": 7.979743957519531, + "learning_rate": 4.918816339279926e-05, + "loss": 0.9865, + "step": 5510 + }, + { + "epoch": 0.048798599692356656, + "grad_norm": 6.810271739959717, + "learning_rate": 4.918669000512739e-05, + "loss": 1.0031, + "step": 5520 + }, + { + "epoch": 0.048887002952668894, + "grad_norm": 3.874130964279175, + "learning_rate": 4.918521661745552e-05, + "loss": 0.9567, + "step": 5530 + }, + { + "epoch": 0.04897540621298113, + "grad_norm": 4.50253963470459, + "learning_rate": 4.918374322978365e-05, + "loss": 0.9467, + "step": 5540 + }, + { + "epoch": 0.04906380947329338, + "grad_norm": 6.759551048278809, + "learning_rate": 4.918226984211178e-05, + "loss": 0.9026, + "step": 5550 + }, + { + "epoch": 0.049152212733605614, + "grad_norm": 8.089125633239746, + "learning_rate": 4.918079645443991e-05, + "loss": 0.9087, + "step": 5560 + }, + { + "epoch": 0.04924061599391785, + "grad_norm": 5.775022029876709, + "learning_rate": 4.917932306676804e-05, + "loss": 0.9098, + "step": 5570 + }, + { + "epoch": 0.0493290192542301, + "grad_norm": 3.289224624633789, + "learning_rate": 4.9177849679096166e-05, + "loss": 0.9084, + "step": 5580 + }, + { + "epoch": 0.049417422514542335, + "grad_norm": 5.4761962890625, + "learning_rate": 4.9176376291424294e-05, + "loss": 0.9332, + "step": 5590 + }, + { + "epoch": 0.04950582577485458, + "grad_norm": 8.073454856872559, + "learning_rate": 4.917490290375242e-05, + "loss": 1.0176, + "step": 5600 + }, + { + "epoch": 0.04959422903516682, + "grad_norm": 8.242021560668945, + "learning_rate": 4.917342951608056e-05, + "loss": 1.0431, + "step": 5610 + }, + { + "epoch": 0.049682632295479055, + "grad_norm": 5.374790191650391, + "learning_rate": 4.9171956128408686e-05, + "loss": 0.9881, + "step": 5620 + }, + { + "epoch": 0.0497710355557913, + "grad_norm": 5.706557273864746, + "learning_rate": 4.9170482740736815e-05, + "loss": 0.9989, + "step": 5630 + }, + { + "epoch": 0.04985943881610354, + "grad_norm": 6.9351654052734375, + "learning_rate": 4.916900935306494e-05, + "loss": 0.9272, + "step": 5640 + }, + { + "epoch": 0.049947842076415776, + "grad_norm": 3.7204599380493164, + "learning_rate": 4.916753596539307e-05, + "loss": 0.9965, + "step": 5650 + }, + { + "epoch": 0.05003624533672802, + "grad_norm": 2.9318227767944336, + "learning_rate": 4.91660625777212e-05, + "loss": 0.936, + "step": 5660 + }, + { + "epoch": 0.05012464859704026, + "grad_norm": 8.022659301757812, + "learning_rate": 4.9164589190049335e-05, + "loss": 1.0342, + "step": 5670 + }, + { + "epoch": 0.050213051857352496, + "grad_norm": 5.471922397613525, + "learning_rate": 4.916311580237746e-05, + "loss": 1.0371, + "step": 5680 + }, + { + "epoch": 0.05030145511766474, + "grad_norm": 8.396404266357422, + "learning_rate": 4.916164241470559e-05, + "loss": 0.9629, + "step": 5690 + }, + { + "epoch": 0.05038985837797698, + "grad_norm": 10.464508056640625, + "learning_rate": 4.916016902703372e-05, + "loss": 0.9274, + "step": 5700 + }, + { + "epoch": 0.050478261638289224, + "grad_norm": 6.246397972106934, + "learning_rate": 4.915869563936185e-05, + "loss": 1.1343, + "step": 5710 + }, + { + "epoch": 0.05056666489860146, + "grad_norm": 8.593369483947754, + "learning_rate": 4.9157222251689977e-05, + "loss": 0.9722, + "step": 5720 + }, + { + "epoch": 0.0506550681589137, + "grad_norm": 12.5711030960083, + "learning_rate": 4.915574886401811e-05, + "loss": 0.9718, + "step": 5730 + }, + { + "epoch": 0.050743471419225944, + "grad_norm": 4.524014949798584, + "learning_rate": 4.915427547634623e-05, + "loss": 0.9674, + "step": 5740 + }, + { + "epoch": 0.05083187467953818, + "grad_norm": 4.370037078857422, + "learning_rate": 4.915280208867437e-05, + "loss": 1.1288, + "step": 5750 + }, + { + "epoch": 0.05092027793985042, + "grad_norm": 3.817610740661621, + "learning_rate": 4.91513287010025e-05, + "loss": 0.9584, + "step": 5760 + }, + { + "epoch": 0.051008681200162664, + "grad_norm": 5.315976619720459, + "learning_rate": 4.9149855313330625e-05, + "loss": 0.995, + "step": 5770 + }, + { + "epoch": 0.0510970844604749, + "grad_norm": 4.982341289520264, + "learning_rate": 4.9148381925658753e-05, + "loss": 0.9847, + "step": 5780 + }, + { + "epoch": 0.05118548772078714, + "grad_norm": 7.3806586265563965, + "learning_rate": 4.914690853798689e-05, + "loss": 0.8998, + "step": 5790 + }, + { + "epoch": 0.051273890981099385, + "grad_norm": 6.027826309204102, + "learning_rate": 4.914543515031501e-05, + "loss": 1.0207, + "step": 5800 + }, + { + "epoch": 0.05136229424141162, + "grad_norm": 4.423072814941406, + "learning_rate": 4.9143961762643145e-05, + "loss": 0.9825, + "step": 5810 + }, + { + "epoch": 0.05145069750172386, + "grad_norm": 9.734428405761719, + "learning_rate": 4.9142488374971274e-05, + "loss": 1.0604, + "step": 5820 + }, + { + "epoch": 0.051539100762036105, + "grad_norm": 5.924389362335205, + "learning_rate": 4.91410149872994e-05, + "loss": 1.022, + "step": 5830 + }, + { + "epoch": 0.05162750402234834, + "grad_norm": 9.964608192443848, + "learning_rate": 4.913954159962753e-05, + "loss": 1.0702, + "step": 5840 + }, + { + "epoch": 0.05171590728266059, + "grad_norm": 5.59250545501709, + "learning_rate": 4.913806821195566e-05, + "loss": 1.0136, + "step": 5850 + }, + { + "epoch": 0.051804310542972826, + "grad_norm": 5.76235294342041, + "learning_rate": 4.913659482428379e-05, + "loss": 0.9, + "step": 5860 + }, + { + "epoch": 0.051892713803285064, + "grad_norm": 5.628695964813232, + "learning_rate": 4.913512143661192e-05, + "loss": 1.0229, + "step": 5870 + }, + { + "epoch": 0.05198111706359731, + "grad_norm": 5.529850959777832, + "learning_rate": 4.9133648048940044e-05, + "loss": 0.8358, + "step": 5880 + }, + { + "epoch": 0.052069520323909546, + "grad_norm": 6.741031169891357, + "learning_rate": 4.913217466126818e-05, + "loss": 0.8761, + "step": 5890 + }, + { + "epoch": 0.052157923584221784, + "grad_norm": 4.22281551361084, + "learning_rate": 4.913070127359631e-05, + "loss": 1.1577, + "step": 5900 + }, + { + "epoch": 0.05224632684453403, + "grad_norm": 4.882099151611328, + "learning_rate": 4.9129227885924436e-05, + "loss": 0.7943, + "step": 5910 + }, + { + "epoch": 0.052334730104846267, + "grad_norm": 7.837223529815674, + "learning_rate": 4.9127754498252564e-05, + "loss": 0.8536, + "step": 5920 + }, + { + "epoch": 0.052423133365158504, + "grad_norm": 5.219480514526367, + "learning_rate": 4.91262811105807e-05, + "loss": 1.0275, + "step": 5930 + }, + { + "epoch": 0.05251153662547075, + "grad_norm": 4.461834907531738, + "learning_rate": 4.912480772290882e-05, + "loss": 0.8585, + "step": 5940 + }, + { + "epoch": 0.05259993988578299, + "grad_norm": 4.6096649169921875, + "learning_rate": 4.9123334335236956e-05, + "loss": 0.9949, + "step": 5950 + }, + { + "epoch": 0.052688343146095225, + "grad_norm": 6.320931911468506, + "learning_rate": 4.912186094756508e-05, + "loss": 0.888, + "step": 5960 + }, + { + "epoch": 0.05277674640640747, + "grad_norm": 6.780819892883301, + "learning_rate": 4.912038755989321e-05, + "loss": 0.9352, + "step": 5970 + }, + { + "epoch": 0.05286514966671971, + "grad_norm": 5.738193988800049, + "learning_rate": 4.911891417222134e-05, + "loss": 0.8641, + "step": 5980 + }, + { + "epoch": 0.05295355292703195, + "grad_norm": 4.92494010925293, + "learning_rate": 4.911744078454947e-05, + "loss": 0.9729, + "step": 5990 + }, + { + "epoch": 0.05304195618734419, + "grad_norm": 5.132249355316162, + "learning_rate": 4.91159673968776e-05, + "loss": 0.9417, + "step": 6000 + }, + { + "epoch": 0.05313035944765643, + "grad_norm": 6.281084060668945, + "learning_rate": 4.911449400920573e-05, + "loss": 0.9744, + "step": 6010 + }, + { + "epoch": 0.05321876270796867, + "grad_norm": 2.8937885761260986, + "learning_rate": 4.9113020621533854e-05, + "loss": 0.9816, + "step": 6020 + }, + { + "epoch": 0.05330716596828091, + "grad_norm": 5.416077136993408, + "learning_rate": 4.911154723386199e-05, + "loss": 0.9954, + "step": 6030 + }, + { + "epoch": 0.05339556922859315, + "grad_norm": 4.114582538604736, + "learning_rate": 4.911007384619012e-05, + "loss": 0.9122, + "step": 6040 + }, + { + "epoch": 0.05348397248890539, + "grad_norm": 8.810036659240723, + "learning_rate": 4.9108600458518246e-05, + "loss": 1.0605, + "step": 6050 + }, + { + "epoch": 0.05357237574921763, + "grad_norm": 10.188939094543457, + "learning_rate": 4.9107127070846374e-05, + "loss": 0.979, + "step": 6060 + }, + { + "epoch": 0.05366077900952987, + "grad_norm": 7.3219428062438965, + "learning_rate": 4.91056536831745e-05, + "loss": 1.0042, + "step": 6070 + }, + { + "epoch": 0.05374918226984211, + "grad_norm": 4.038445949554443, + "learning_rate": 4.910418029550263e-05, + "loss": 1.1042, + "step": 6080 + }, + { + "epoch": 0.05383758553015435, + "grad_norm": 4.296999454498291, + "learning_rate": 4.9102706907830766e-05, + "loss": 0.869, + "step": 6090 + }, + { + "epoch": 0.05392598879046659, + "grad_norm": 10.159406661987305, + "learning_rate": 4.910123352015889e-05, + "loss": 0.9012, + "step": 6100 + }, + { + "epoch": 0.054014392050778834, + "grad_norm": 5.289685249328613, + "learning_rate": 4.909976013248702e-05, + "loss": 0.8925, + "step": 6110 + }, + { + "epoch": 0.05410279531109107, + "grad_norm": 5.531337261199951, + "learning_rate": 4.909828674481515e-05, + "loss": 1.1365, + "step": 6120 + }, + { + "epoch": 0.054191198571403316, + "grad_norm": 6.189535617828369, + "learning_rate": 4.909681335714328e-05, + "loss": 1.1334, + "step": 6130 + }, + { + "epoch": 0.054279601831715554, + "grad_norm": 4.934144973754883, + "learning_rate": 4.909533996947141e-05, + "loss": 0.9185, + "step": 6140 + }, + { + "epoch": 0.05436800509202779, + "grad_norm": 4.428097248077393, + "learning_rate": 4.909386658179954e-05, + "loss": 0.9079, + "step": 6150 + }, + { + "epoch": 0.05445640835234004, + "grad_norm": 4.912667751312256, + "learning_rate": 4.9092393194127665e-05, + "loss": 1.002, + "step": 6160 + }, + { + "epoch": 0.054544811612652275, + "grad_norm": 7.442349910736084, + "learning_rate": 4.90909198064558e-05, + "loss": 1.0206, + "step": 6170 + }, + { + "epoch": 0.05463321487296451, + "grad_norm": 3.1982181072235107, + "learning_rate": 4.908944641878393e-05, + "loss": 1.0653, + "step": 6180 + }, + { + "epoch": 0.05472161813327676, + "grad_norm": 5.053394794464111, + "learning_rate": 4.9087973031112057e-05, + "loss": 1.021, + "step": 6190 + }, + { + "epoch": 0.054810021393588995, + "grad_norm": 9.006791114807129, + "learning_rate": 4.9086499643440185e-05, + "loss": 0.9457, + "step": 6200 + }, + { + "epoch": 0.05489842465390123, + "grad_norm": 7.753384113311768, + "learning_rate": 4.908502625576831e-05, + "loss": 0.9833, + "step": 6210 + }, + { + "epoch": 0.05498682791421348, + "grad_norm": 5.772186756134033, + "learning_rate": 4.908355286809644e-05, + "loss": 0.9516, + "step": 6220 + }, + { + "epoch": 0.055075231174525716, + "grad_norm": 4.40585470199585, + "learning_rate": 4.908207948042458e-05, + "loss": 0.9132, + "step": 6230 + }, + { + "epoch": 0.05516363443483795, + "grad_norm": 6.505217552185059, + "learning_rate": 4.90806060927527e-05, + "loss": 0.8467, + "step": 6240 + }, + { + "epoch": 0.0552520376951502, + "grad_norm": 4.562171459197998, + "learning_rate": 4.9079132705080833e-05, + "loss": 0.8822, + "step": 6250 + }, + { + "epoch": 0.055340440955462436, + "grad_norm": 7.463866710662842, + "learning_rate": 4.907765931740896e-05, + "loss": 1.0114, + "step": 6260 + }, + { + "epoch": 0.05542884421577468, + "grad_norm": 8.527029037475586, + "learning_rate": 4.907618592973709e-05, + "loss": 0.9943, + "step": 6270 + }, + { + "epoch": 0.05551724747608692, + "grad_norm": 5.194516658782959, + "learning_rate": 4.907471254206522e-05, + "loss": 0.9317, + "step": 6280 + }, + { + "epoch": 0.055605650736399156, + "grad_norm": 3.785703420639038, + "learning_rate": 4.9073239154393354e-05, + "loss": 0.8754, + "step": 6290 + }, + { + "epoch": 0.0556940539967114, + "grad_norm": 5.605743408203125, + "learning_rate": 4.9071765766721475e-05, + "loss": 1.0147, + "step": 6300 + }, + { + "epoch": 0.05578245725702364, + "grad_norm": 6.450657844543457, + "learning_rate": 4.907029237904961e-05, + "loss": 1.0177, + "step": 6310 + }, + { + "epoch": 0.05587086051733588, + "grad_norm": 3.6782920360565186, + "learning_rate": 4.906881899137773e-05, + "loss": 0.9223, + "step": 6320 + }, + { + "epoch": 0.05595926377764812, + "grad_norm": 10.444466590881348, + "learning_rate": 4.906734560370587e-05, + "loss": 0.9946, + "step": 6330 + }, + { + "epoch": 0.05604766703796036, + "grad_norm": 3.6840245723724365, + "learning_rate": 4.9065872216033995e-05, + "loss": 0.9282, + "step": 6340 + }, + { + "epoch": 0.0561360702982726, + "grad_norm": 6.504221439361572, + "learning_rate": 4.9064398828362124e-05, + "loss": 1.0093, + "step": 6350 + }, + { + "epoch": 0.05622447355858484, + "grad_norm": 5.361945629119873, + "learning_rate": 4.906292544069025e-05, + "loss": 0.945, + "step": 6360 + }, + { + "epoch": 0.05631287681889708, + "grad_norm": 3.5966508388519287, + "learning_rate": 4.906145205301839e-05, + "loss": 0.9992, + "step": 6370 + }, + { + "epoch": 0.056401280079209325, + "grad_norm": 5.785979747772217, + "learning_rate": 4.905997866534651e-05, + "loss": 0.9951, + "step": 6380 + }, + { + "epoch": 0.05648968333952156, + "grad_norm": 6.949944972991943, + "learning_rate": 4.9058505277674644e-05, + "loss": 0.8968, + "step": 6390 + }, + { + "epoch": 0.0565780865998338, + "grad_norm": 9.668983459472656, + "learning_rate": 4.905703189000277e-05, + "loss": 0.9029, + "step": 6400 + }, + { + "epoch": 0.056666489860146045, + "grad_norm": 5.306529521942139, + "learning_rate": 4.90555585023309e-05, + "loss": 0.8663, + "step": 6410 + }, + { + "epoch": 0.05675489312045828, + "grad_norm": 4.871407508850098, + "learning_rate": 4.905408511465903e-05, + "loss": 0.9431, + "step": 6420 + }, + { + "epoch": 0.05684329638077052, + "grad_norm": 4.606576442718506, + "learning_rate": 4.905261172698716e-05, + "loss": 1.0039, + "step": 6430 + }, + { + "epoch": 0.056931699641082766, + "grad_norm": 2.390242338180542, + "learning_rate": 4.9051138339315286e-05, + "loss": 0.9147, + "step": 6440 + }, + { + "epoch": 0.057020102901395, + "grad_norm": 4.913139343261719, + "learning_rate": 4.904966495164342e-05, + "loss": 1.0337, + "step": 6450 + }, + { + "epoch": 0.05710850616170724, + "grad_norm": 5.77178955078125, + "learning_rate": 4.904819156397155e-05, + "loss": 0.8926, + "step": 6460 + }, + { + "epoch": 0.057196909422019486, + "grad_norm": 3.674722194671631, + "learning_rate": 4.904671817629968e-05, + "loss": 0.9608, + "step": 6470 + }, + { + "epoch": 0.057285312682331724, + "grad_norm": 13.199363708496094, + "learning_rate": 4.9045244788627806e-05, + "loss": 1.2117, + "step": 6480 + }, + { + "epoch": 0.05737371594264396, + "grad_norm": 3.5203678607940674, + "learning_rate": 4.9043771400955934e-05, + "loss": 1.0523, + "step": 6490 + }, + { + "epoch": 0.057462119202956206, + "grad_norm": 5.5043110847473145, + "learning_rate": 4.904229801328406e-05, + "loss": 0.9041, + "step": 6500 + }, + { + "epoch": 0.057550522463268444, + "grad_norm": 3.238118886947632, + "learning_rate": 4.90408246256122e-05, + "loss": 0.9177, + "step": 6510 + }, + { + "epoch": 0.05763892572358069, + "grad_norm": 3.886664628982544, + "learning_rate": 4.9039351237940326e-05, + "loss": 1.0162, + "step": 6520 + }, + { + "epoch": 0.05772732898389293, + "grad_norm": 6.0458173751831055, + "learning_rate": 4.9037877850268455e-05, + "loss": 0.9505, + "step": 6530 + }, + { + "epoch": 0.057815732244205165, + "grad_norm": 3.132824182510376, + "learning_rate": 4.903640446259658e-05, + "loss": 0.8276, + "step": 6540 + }, + { + "epoch": 0.05790413550451741, + "grad_norm": 6.579557418823242, + "learning_rate": 4.903493107492471e-05, + "loss": 1.0451, + "step": 6550 + }, + { + "epoch": 0.05799253876482965, + "grad_norm": 5.665095329284668, + "learning_rate": 4.903345768725284e-05, + "loss": 1.0314, + "step": 6560 + }, + { + "epoch": 0.058080942025141885, + "grad_norm": 9.765207290649414, + "learning_rate": 4.903198429958097e-05, + "loss": 1.0061, + "step": 6570 + }, + { + "epoch": 0.05816934528545413, + "grad_norm": 6.479765892028809, + "learning_rate": 4.90305109119091e-05, + "loss": 0.9157, + "step": 6580 + }, + { + "epoch": 0.05825774854576637, + "grad_norm": 6.944589614868164, + "learning_rate": 4.902903752423723e-05, + "loss": 1.0337, + "step": 6590 + }, + { + "epoch": 0.058346151806078606, + "grad_norm": 5.715635776519775, + "learning_rate": 4.902756413656536e-05, + "loss": 0.9518, + "step": 6600 + }, + { + "epoch": 0.05843455506639085, + "grad_norm": 3.145275115966797, + "learning_rate": 4.902609074889349e-05, + "loss": 0.908, + "step": 6610 + }, + { + "epoch": 0.05852295832670309, + "grad_norm": 9.593637466430664, + "learning_rate": 4.9024617361221616e-05, + "loss": 0.9803, + "step": 6620 + }, + { + "epoch": 0.058611361587015326, + "grad_norm": 9.412375450134277, + "learning_rate": 4.9023143973549745e-05, + "loss": 1.0062, + "step": 6630 + }, + { + "epoch": 0.05869976484732757, + "grad_norm": 5.172863006591797, + "learning_rate": 4.902167058587788e-05, + "loss": 0.9477, + "step": 6640 + }, + { + "epoch": 0.05878816810763981, + "grad_norm": 5.577682971954346, + "learning_rate": 4.902019719820601e-05, + "loss": 1.0084, + "step": 6650 + }, + { + "epoch": 0.05887657136795205, + "grad_norm": 5.950476169586182, + "learning_rate": 4.901872381053414e-05, + "loss": 1.0198, + "step": 6660 + }, + { + "epoch": 0.05896497462826429, + "grad_norm": 6.371071815490723, + "learning_rate": 4.9017250422862265e-05, + "loss": 1.0459, + "step": 6670 + }, + { + "epoch": 0.05905337788857653, + "grad_norm": 3.299262523651123, + "learning_rate": 4.901577703519039e-05, + "loss": 0.975, + "step": 6680 + }, + { + "epoch": 0.059141781148888774, + "grad_norm": 3.839036226272583, + "learning_rate": 4.901430364751852e-05, + "loss": 0.9142, + "step": 6690 + }, + { + "epoch": 0.05923018440920101, + "grad_norm": 4.645992279052734, + "learning_rate": 4.901283025984666e-05, + "loss": 0.9328, + "step": 6700 + }, + { + "epoch": 0.05931858766951325, + "grad_norm": 7.860993385314941, + "learning_rate": 4.901135687217478e-05, + "loss": 1.0465, + "step": 6710 + }, + { + "epoch": 0.059406990929825494, + "grad_norm": 3.80489444732666, + "learning_rate": 4.9009883484502914e-05, + "loss": 0.9375, + "step": 6720 + }, + { + "epoch": 0.05949539419013773, + "grad_norm": 5.162045955657959, + "learning_rate": 4.900841009683104e-05, + "loss": 0.8921, + "step": 6730 + }, + { + "epoch": 0.05958379745044997, + "grad_norm": 2.4445645809173584, + "learning_rate": 4.900693670915917e-05, + "loss": 1.0937, + "step": 6740 + }, + { + "epoch": 0.059672200710762215, + "grad_norm": 3.2874808311462402, + "learning_rate": 4.90054633214873e-05, + "loss": 0.957, + "step": 6750 + }, + { + "epoch": 0.05976060397107445, + "grad_norm": 4.9104132652282715, + "learning_rate": 4.9003989933815434e-05, + "loss": 0.917, + "step": 6760 + }, + { + "epoch": 0.05984900723138669, + "grad_norm": 5.925168514251709, + "learning_rate": 4.9002516546143555e-05, + "loss": 0.9887, + "step": 6770 + }, + { + "epoch": 0.059937410491698935, + "grad_norm": 6.531099796295166, + "learning_rate": 4.900104315847169e-05, + "loss": 0.9383, + "step": 6780 + }, + { + "epoch": 0.06002581375201117, + "grad_norm": 6.779243469238281, + "learning_rate": 4.899956977079981e-05, + "loss": 1.0173, + "step": 6790 + }, + { + "epoch": 0.06011421701232342, + "grad_norm": 5.83915376663208, + "learning_rate": 4.899809638312795e-05, + "loss": 0.9161, + "step": 6800 + }, + { + "epoch": 0.060202620272635655, + "grad_norm": 4.769713401794434, + "learning_rate": 4.8996622995456076e-05, + "loss": 0.9705, + "step": 6810 + }, + { + "epoch": 0.06029102353294789, + "grad_norm": 4.014643669128418, + "learning_rate": 4.8995149607784204e-05, + "loss": 0.9136, + "step": 6820 + }, + { + "epoch": 0.06037942679326014, + "grad_norm": 9.945269584655762, + "learning_rate": 4.899367622011233e-05, + "loss": 0.9235, + "step": 6830 + }, + { + "epoch": 0.060467830053572376, + "grad_norm": 4.868852138519287, + "learning_rate": 4.899220283244047e-05, + "loss": 0.9456, + "step": 6840 + }, + { + "epoch": 0.060556233313884614, + "grad_norm": 5.041273593902588, + "learning_rate": 4.899072944476859e-05, + "loss": 0.9664, + "step": 6850 + }, + { + "epoch": 0.06064463657419686, + "grad_norm": 5.9616169929504395, + "learning_rate": 4.8989256057096724e-05, + "loss": 0.9784, + "step": 6860 + }, + { + "epoch": 0.060733039834509096, + "grad_norm": 5.6391072273254395, + "learning_rate": 4.898778266942485e-05, + "loss": 1.113, + "step": 6870 + }, + { + "epoch": 0.060821443094821334, + "grad_norm": 3.411214590072632, + "learning_rate": 4.898630928175298e-05, + "loss": 0.9794, + "step": 6880 + }, + { + "epoch": 0.06090984635513358, + "grad_norm": 7.82897424697876, + "learning_rate": 4.898483589408111e-05, + "loss": 0.8892, + "step": 6890 + }, + { + "epoch": 0.06099824961544582, + "grad_norm": 6.4274396896362305, + "learning_rate": 4.898336250640924e-05, + "loss": 0.9396, + "step": 6900 + }, + { + "epoch": 0.061086652875758055, + "grad_norm": 8.304560661315918, + "learning_rate": 4.8981889118737366e-05, + "loss": 1.0109, + "step": 6910 + }, + { + "epoch": 0.0611750561360703, + "grad_norm": 5.554924488067627, + "learning_rate": 4.89804157310655e-05, + "loss": 0.886, + "step": 6920 + }, + { + "epoch": 0.06126345939638254, + "grad_norm": 5.120604515075684, + "learning_rate": 4.897894234339362e-05, + "loss": 1.0324, + "step": 6930 + }, + { + "epoch": 0.06135186265669478, + "grad_norm": 3.963286876678467, + "learning_rate": 4.897746895572176e-05, + "loss": 0.9292, + "step": 6940 + }, + { + "epoch": 0.06144026591700702, + "grad_norm": 3.5170414447784424, + "learning_rate": 4.8975995568049886e-05, + "loss": 0.873, + "step": 6950 + }, + { + "epoch": 0.06152866917731926, + "grad_norm": 6.453105926513672, + "learning_rate": 4.8974522180378014e-05, + "loss": 0.9063, + "step": 6960 + }, + { + "epoch": 0.0616170724376315, + "grad_norm": 8.583768844604492, + "learning_rate": 4.897304879270614e-05, + "loss": 1.0464, + "step": 6970 + }, + { + "epoch": 0.06170547569794374, + "grad_norm": 4.491311550140381, + "learning_rate": 4.897157540503428e-05, + "loss": 0.9401, + "step": 6980 + }, + { + "epoch": 0.06179387895825598, + "grad_norm": 7.161159038543701, + "learning_rate": 4.89701020173624e-05, + "loss": 0.9634, + "step": 6990 + }, + { + "epoch": 0.06188228221856822, + "grad_norm": 6.075889587402344, + "learning_rate": 4.8968628629690535e-05, + "loss": 0.9824, + "step": 7000 + }, + { + "epoch": 0.06197068547888046, + "grad_norm": 6.824237823486328, + "learning_rate": 4.8967155242018656e-05, + "loss": 0.8896, + "step": 7010 + }, + { + "epoch": 0.0620590887391927, + "grad_norm": 4.351720333099365, + "learning_rate": 4.896568185434679e-05, + "loss": 0.8572, + "step": 7020 + }, + { + "epoch": 0.06214749199950494, + "grad_norm": 7.742468357086182, + "learning_rate": 4.896420846667492e-05, + "loss": 0.92, + "step": 7030 + }, + { + "epoch": 0.06223589525981718, + "grad_norm": 9.736001968383789, + "learning_rate": 4.896273507900305e-05, + "loss": 0.9778, + "step": 7040 + }, + { + "epoch": 0.062324298520129426, + "grad_norm": 7.4972920417785645, + "learning_rate": 4.8961261691331176e-05, + "loss": 0.9763, + "step": 7050 + }, + { + "epoch": 0.062412701780441664, + "grad_norm": 4.487346172332764, + "learning_rate": 4.895978830365931e-05, + "loss": 1.0487, + "step": 7060 + }, + { + "epoch": 0.0625011050407539, + "grad_norm": 3.527085304260254, + "learning_rate": 4.895831491598743e-05, + "loss": 0.9339, + "step": 7070 + }, + { + "epoch": 0.06258950830106615, + "grad_norm": 4.565359592437744, + "learning_rate": 4.895684152831557e-05, + "loss": 0.9336, + "step": 7080 + }, + { + "epoch": 0.06267791156137838, + "grad_norm": 7.8365936279296875, + "learning_rate": 4.8955368140643697e-05, + "loss": 0.8912, + "step": 7090 + }, + { + "epoch": 0.06276631482169062, + "grad_norm": 6.922832489013672, + "learning_rate": 4.8953894752971825e-05, + "loss": 0.9872, + "step": 7100 + }, + { + "epoch": 0.06285471808200287, + "grad_norm": 5.70310640335083, + "learning_rate": 4.895242136529995e-05, + "loss": 0.9987, + "step": 7110 + }, + { + "epoch": 0.06294312134231511, + "grad_norm": 4.13558292388916, + "learning_rate": 4.895094797762809e-05, + "loss": 0.9326, + "step": 7120 + }, + { + "epoch": 0.06303152460262734, + "grad_norm": 7.350679874420166, + "learning_rate": 4.894947458995621e-05, + "loss": 0.9005, + "step": 7130 + }, + { + "epoch": 0.06311992786293959, + "grad_norm": 4.997514247894287, + "learning_rate": 4.8948001202284345e-05, + "loss": 0.9472, + "step": 7140 + }, + { + "epoch": 0.06320833112325183, + "grad_norm": 8.707229614257812, + "learning_rate": 4.894652781461247e-05, + "loss": 0.8869, + "step": 7150 + }, + { + "epoch": 0.06329673438356406, + "grad_norm": 5.310278415679932, + "learning_rate": 4.89450544269406e-05, + "loss": 1.0221, + "step": 7160 + }, + { + "epoch": 0.06338513764387631, + "grad_norm": 7.762638568878174, + "learning_rate": 4.894358103926873e-05, + "loss": 1.0534, + "step": 7170 + }, + { + "epoch": 0.06347354090418855, + "grad_norm": 7.497394561767578, + "learning_rate": 4.894210765159686e-05, + "loss": 0.9206, + "step": 7180 + }, + { + "epoch": 0.06356194416450078, + "grad_norm": 3.281755208969116, + "learning_rate": 4.894063426392499e-05, + "loss": 0.8431, + "step": 7190 + }, + { + "epoch": 0.06365034742481303, + "grad_norm": 4.9097185134887695, + "learning_rate": 4.893916087625312e-05, + "loss": 1.0359, + "step": 7200 + }, + { + "epoch": 0.06373875068512527, + "grad_norm": 4.54490852355957, + "learning_rate": 4.8937687488581244e-05, + "loss": 0.9101, + "step": 7210 + }, + { + "epoch": 0.0638271539454375, + "grad_norm": 7.325924873352051, + "learning_rate": 4.893621410090938e-05, + "loss": 0.942, + "step": 7220 + }, + { + "epoch": 0.06391555720574975, + "grad_norm": 4.005319118499756, + "learning_rate": 4.893474071323751e-05, + "loss": 1.0018, + "step": 7230 + }, + { + "epoch": 0.064003960466062, + "grad_norm": 5.555293560028076, + "learning_rate": 4.8933267325565635e-05, + "loss": 0.9417, + "step": 7240 + }, + { + "epoch": 0.06409236372637422, + "grad_norm": 6.122361660003662, + "learning_rate": 4.8931793937893764e-05, + "loss": 0.9577, + "step": 7250 + }, + { + "epoch": 0.06418076698668647, + "grad_norm": 6.8876166343688965, + "learning_rate": 4.893032055022189e-05, + "loss": 0.9515, + "step": 7260 + }, + { + "epoch": 0.06426917024699871, + "grad_norm": 3.8940982818603516, + "learning_rate": 4.892884716255002e-05, + "loss": 0.942, + "step": 7270 + }, + { + "epoch": 0.06435757350731094, + "grad_norm": 3.3218398094177246, + "learning_rate": 4.8927373774878156e-05, + "loss": 1.0395, + "step": 7280 + }, + { + "epoch": 0.06444597676762319, + "grad_norm": 5.618769645690918, + "learning_rate": 4.892590038720628e-05, + "loss": 0.9807, + "step": 7290 + }, + { + "epoch": 0.06453438002793543, + "grad_norm": 3.878244400024414, + "learning_rate": 4.892442699953441e-05, + "loss": 0.9813, + "step": 7300 + }, + { + "epoch": 0.06462278328824766, + "grad_norm": 8.489561080932617, + "learning_rate": 4.892295361186254e-05, + "loss": 0.9528, + "step": 7310 + }, + { + "epoch": 0.06471118654855991, + "grad_norm": 7.555717468261719, + "learning_rate": 4.892148022419067e-05, + "loss": 0.8856, + "step": 7320 + }, + { + "epoch": 0.06479958980887215, + "grad_norm": 5.251434803009033, + "learning_rate": 4.89200068365188e-05, + "loss": 0.9103, + "step": 7330 + }, + { + "epoch": 0.06488799306918439, + "grad_norm": 10.57512378692627, + "learning_rate": 4.891853344884693e-05, + "loss": 0.9906, + "step": 7340 + }, + { + "epoch": 0.06497639632949663, + "grad_norm": 6.348610877990723, + "learning_rate": 4.8917060061175054e-05, + "loss": 0.7835, + "step": 7350 + }, + { + "epoch": 0.06506479958980887, + "grad_norm": 3.2515549659729004, + "learning_rate": 4.891558667350319e-05, + "loss": 0.8747, + "step": 7360 + }, + { + "epoch": 0.0651532028501211, + "grad_norm": 5.83371639251709, + "learning_rate": 4.891411328583132e-05, + "loss": 0.9333, + "step": 7370 + }, + { + "epoch": 0.06524160611043335, + "grad_norm": 12.981037139892578, + "learning_rate": 4.8912639898159446e-05, + "loss": 1.1068, + "step": 7380 + }, + { + "epoch": 0.0653300093707456, + "grad_norm": 5.9547271728515625, + "learning_rate": 4.8911166510487574e-05, + "loss": 1.0265, + "step": 7390 + }, + { + "epoch": 0.06541841263105784, + "grad_norm": 7.136798858642578, + "learning_rate": 4.89096931228157e-05, + "loss": 0.8714, + "step": 7400 + }, + { + "epoch": 0.06550681589137007, + "grad_norm": 7.322783946990967, + "learning_rate": 4.890821973514383e-05, + "loss": 0.9294, + "step": 7410 + }, + { + "epoch": 0.06559521915168232, + "grad_norm": 10.76966667175293, + "learning_rate": 4.8906746347471966e-05, + "loss": 0.9084, + "step": 7420 + }, + { + "epoch": 0.06568362241199456, + "grad_norm": 5.148689270019531, + "learning_rate": 4.8905272959800094e-05, + "loss": 1.021, + "step": 7430 + }, + { + "epoch": 0.06577202567230679, + "grad_norm": 3.7091686725616455, + "learning_rate": 4.890379957212822e-05, + "loss": 0.9651, + "step": 7440 + }, + { + "epoch": 0.06586042893261904, + "grad_norm": 3.9058432579040527, + "learning_rate": 4.890232618445635e-05, + "loss": 0.8596, + "step": 7450 + }, + { + "epoch": 0.06594883219293128, + "grad_norm": 5.251888275146484, + "learning_rate": 4.890085279678448e-05, + "loss": 0.9971, + "step": 7460 + }, + { + "epoch": 0.06603723545324351, + "grad_norm": 3.484361171722412, + "learning_rate": 4.889937940911261e-05, + "loss": 0.9627, + "step": 7470 + }, + { + "epoch": 0.06612563871355576, + "grad_norm": 4.799646854400635, + "learning_rate": 4.889790602144074e-05, + "loss": 1.0495, + "step": 7480 + }, + { + "epoch": 0.066214041973868, + "grad_norm": 3.453781843185425, + "learning_rate": 4.889643263376887e-05, + "loss": 1.0594, + "step": 7490 + }, + { + "epoch": 0.06630244523418023, + "grad_norm": 9.214920043945312, + "learning_rate": 4.8894959246097e-05, + "loss": 0.853, + "step": 7500 + }, + { + "epoch": 0.06639084849449248, + "grad_norm": 10.193352699279785, + "learning_rate": 4.889348585842513e-05, + "loss": 0.7956, + "step": 7510 + }, + { + "epoch": 0.06647925175480472, + "grad_norm": 5.335453510284424, + "learning_rate": 4.8892012470753256e-05, + "loss": 0.9247, + "step": 7520 + }, + { + "epoch": 0.06656765501511695, + "grad_norm": 3.7129592895507812, + "learning_rate": 4.8890539083081385e-05, + "loss": 1.0042, + "step": 7530 + }, + { + "epoch": 0.0666560582754292, + "grad_norm": 3.840801954269409, + "learning_rate": 4.888906569540951e-05, + "loss": 0.9194, + "step": 7540 + }, + { + "epoch": 0.06674446153574144, + "grad_norm": 3.651582717895508, + "learning_rate": 4.888759230773765e-05, + "loss": 0.9767, + "step": 7550 + }, + { + "epoch": 0.06683286479605367, + "grad_norm": 3.655207872390747, + "learning_rate": 4.8886118920065777e-05, + "loss": 0.9537, + "step": 7560 + }, + { + "epoch": 0.06692126805636592, + "grad_norm": 5.665995121002197, + "learning_rate": 4.8884645532393905e-05, + "loss": 0.9313, + "step": 7570 + }, + { + "epoch": 0.06700967131667816, + "grad_norm": 6.286605358123779, + "learning_rate": 4.888317214472203e-05, + "loss": 1.0161, + "step": 7580 + }, + { + "epoch": 0.0670980745769904, + "grad_norm": 8.228954315185547, + "learning_rate": 4.888169875705016e-05, + "loss": 1.0193, + "step": 7590 + }, + { + "epoch": 0.06718647783730264, + "grad_norm": 8.809334754943848, + "learning_rate": 4.888022536937829e-05, + "loss": 1.1218, + "step": 7600 + }, + { + "epoch": 0.06727488109761488, + "grad_norm": 8.462353706359863, + "learning_rate": 4.8878751981706425e-05, + "loss": 0.8927, + "step": 7610 + }, + { + "epoch": 0.06736328435792711, + "grad_norm": 7.429812431335449, + "learning_rate": 4.887727859403455e-05, + "loss": 0.9888, + "step": 7620 + }, + { + "epoch": 0.06745168761823936, + "grad_norm": 3.1937925815582275, + "learning_rate": 4.887580520636268e-05, + "loss": 0.9499, + "step": 7630 + }, + { + "epoch": 0.0675400908785516, + "grad_norm": 3.0585787296295166, + "learning_rate": 4.887433181869081e-05, + "loss": 0.9619, + "step": 7640 + }, + { + "epoch": 0.06762849413886383, + "grad_norm": 4.329367637634277, + "learning_rate": 4.887285843101894e-05, + "loss": 1.0216, + "step": 7650 + }, + { + "epoch": 0.06771689739917608, + "grad_norm": 12.496232986450195, + "learning_rate": 4.887138504334707e-05, + "loss": 1.0454, + "step": 7660 + }, + { + "epoch": 0.06780530065948832, + "grad_norm": 4.210485935211182, + "learning_rate": 4.88699116556752e-05, + "loss": 0.8951, + "step": 7670 + }, + { + "epoch": 0.06789370391980057, + "grad_norm": 3.4743120670318604, + "learning_rate": 4.8868438268003324e-05, + "loss": 0.8772, + "step": 7680 + }, + { + "epoch": 0.0679821071801128, + "grad_norm": 4.501906871795654, + "learning_rate": 4.886696488033146e-05, + "loss": 1.0451, + "step": 7690 + }, + { + "epoch": 0.06807051044042504, + "grad_norm": 3.1872076988220215, + "learning_rate": 4.886549149265959e-05, + "loss": 0.9059, + "step": 7700 + }, + { + "epoch": 0.06815891370073729, + "grad_norm": 5.285301685333252, + "learning_rate": 4.8864018104987715e-05, + "loss": 0.8134, + "step": 7710 + }, + { + "epoch": 0.06824731696104952, + "grad_norm": 5.507226943969727, + "learning_rate": 4.8862544717315844e-05, + "loss": 0.8476, + "step": 7720 + }, + { + "epoch": 0.06833572022136176, + "grad_norm": 7.218921661376953, + "learning_rate": 4.886107132964397e-05, + "loss": 1.0962, + "step": 7730 + }, + { + "epoch": 0.06842412348167401, + "grad_norm": 4.271151542663574, + "learning_rate": 4.88595979419721e-05, + "loss": 1.1046, + "step": 7740 + }, + { + "epoch": 0.06851252674198624, + "grad_norm": 4.690978050231934, + "learning_rate": 4.8858124554300236e-05, + "loss": 0.9518, + "step": 7750 + }, + { + "epoch": 0.06860093000229849, + "grad_norm": 3.4593546390533447, + "learning_rate": 4.885665116662836e-05, + "loss": 0.9038, + "step": 7760 + }, + { + "epoch": 0.06868933326261073, + "grad_norm": 3.535809278488159, + "learning_rate": 4.885517777895649e-05, + "loss": 0.8462, + "step": 7770 + }, + { + "epoch": 0.06877773652292296, + "grad_norm": 6.570487976074219, + "learning_rate": 4.885370439128462e-05, + "loss": 0.8509, + "step": 7780 + }, + { + "epoch": 0.0688661397832352, + "grad_norm": 4.717918395996094, + "learning_rate": 4.885223100361275e-05, + "loss": 0.9433, + "step": 7790 + }, + { + "epoch": 0.06895454304354745, + "grad_norm": 3.3045620918273926, + "learning_rate": 4.885075761594088e-05, + "loss": 1.032, + "step": 7800 + }, + { + "epoch": 0.06904294630385968, + "grad_norm": 3.694939136505127, + "learning_rate": 4.884928422826901e-05, + "loss": 0.8535, + "step": 7810 + }, + { + "epoch": 0.06913134956417193, + "grad_norm": 4.511366844177246, + "learning_rate": 4.8847810840597134e-05, + "loss": 0.8749, + "step": 7820 + }, + { + "epoch": 0.06921975282448417, + "grad_norm": 6.869948863983154, + "learning_rate": 4.884633745292527e-05, + "loss": 0.8407, + "step": 7830 + }, + { + "epoch": 0.0693081560847964, + "grad_norm": 5.6310272216796875, + "learning_rate": 4.884486406525339e-05, + "loss": 0.9186, + "step": 7840 + }, + { + "epoch": 0.06939655934510865, + "grad_norm": 2.3454859256744385, + "learning_rate": 4.8843390677581526e-05, + "loss": 0.9441, + "step": 7850 + }, + { + "epoch": 0.06948496260542089, + "grad_norm": 3.726121664047241, + "learning_rate": 4.8841917289909654e-05, + "loss": 0.9089, + "step": 7860 + }, + { + "epoch": 0.06957336586573312, + "grad_norm": 5.054235935211182, + "learning_rate": 4.884044390223778e-05, + "loss": 0.9131, + "step": 7870 + }, + { + "epoch": 0.06966176912604537, + "grad_norm": 5.308077812194824, + "learning_rate": 4.883897051456591e-05, + "loss": 0.9014, + "step": 7880 + }, + { + "epoch": 0.06975017238635761, + "grad_norm": 3.4830482006073, + "learning_rate": 4.8837497126894046e-05, + "loss": 0.9266, + "step": 7890 + }, + { + "epoch": 0.06983857564666984, + "grad_norm": 2.1684446334838867, + "learning_rate": 4.883602373922217e-05, + "loss": 0.8765, + "step": 7900 + }, + { + "epoch": 0.06992697890698209, + "grad_norm": 7.7253875732421875, + "learning_rate": 4.88345503515503e-05, + "loss": 0.8887, + "step": 7910 + }, + { + "epoch": 0.07001538216729433, + "grad_norm": 6.309045791625977, + "learning_rate": 4.883307696387843e-05, + "loss": 0.9195, + "step": 7920 + }, + { + "epoch": 0.07010378542760658, + "grad_norm": 7.049060344696045, + "learning_rate": 4.883160357620656e-05, + "loss": 0.95, + "step": 7930 + }, + { + "epoch": 0.07019218868791881, + "grad_norm": 5.963956832885742, + "learning_rate": 4.883013018853469e-05, + "loss": 0.9175, + "step": 7940 + }, + { + "epoch": 0.07028059194823105, + "grad_norm": 7.502468109130859, + "learning_rate": 4.882865680086282e-05, + "loss": 0.8839, + "step": 7950 + }, + { + "epoch": 0.0703689952085433, + "grad_norm": 11.2520751953125, + "learning_rate": 4.8827183413190945e-05, + "loss": 0.8784, + "step": 7960 + }, + { + "epoch": 0.07045739846885553, + "grad_norm": 6.546641826629639, + "learning_rate": 4.882571002551908e-05, + "loss": 1.1201, + "step": 7970 + }, + { + "epoch": 0.07054580172916777, + "grad_norm": 3.698744297027588, + "learning_rate": 4.88242366378472e-05, + "loss": 1.0061, + "step": 7980 + }, + { + "epoch": 0.07063420498948002, + "grad_norm": 5.63902473449707, + "learning_rate": 4.8822763250175336e-05, + "loss": 0.9939, + "step": 7990 + }, + { + "epoch": 0.07072260824979225, + "grad_norm": 5.437800884246826, + "learning_rate": 4.8821289862503465e-05, + "loss": 0.9739, + "step": 8000 + }, + { + "epoch": 0.0708110115101045, + "grad_norm": 11.084389686584473, + "learning_rate": 4.881981647483159e-05, + "loss": 0.9, + "step": 8010 + }, + { + "epoch": 0.07089941477041674, + "grad_norm": 8.50456428527832, + "learning_rate": 4.881834308715972e-05, + "loss": 0.9302, + "step": 8020 + }, + { + "epoch": 0.07098781803072897, + "grad_norm": 3.414212465286255, + "learning_rate": 4.881686969948786e-05, + "loss": 0.8557, + "step": 8030 + }, + { + "epoch": 0.07107622129104121, + "grad_norm": 5.6906890869140625, + "learning_rate": 4.881539631181598e-05, + "loss": 1.0758, + "step": 8040 + }, + { + "epoch": 0.07116462455135346, + "grad_norm": 3.445957899093628, + "learning_rate": 4.881392292414411e-05, + "loss": 0.7803, + "step": 8050 + }, + { + "epoch": 0.07125302781166569, + "grad_norm": 5.444856643676758, + "learning_rate": 4.881244953647224e-05, + "loss": 1.0992, + "step": 8060 + }, + { + "epoch": 0.07134143107197793, + "grad_norm": 3.4007680416107178, + "learning_rate": 4.881097614880037e-05, + "loss": 0.8427, + "step": 8070 + }, + { + "epoch": 0.07142983433229018, + "grad_norm": 5.089715957641602, + "learning_rate": 4.88095027611285e-05, + "loss": 0.8963, + "step": 8080 + }, + { + "epoch": 0.07151823759260241, + "grad_norm": 4.008412837982178, + "learning_rate": 4.880802937345663e-05, + "loss": 0.978, + "step": 8090 + }, + { + "epoch": 0.07160664085291465, + "grad_norm": 5.542277812957764, + "learning_rate": 4.8806555985784755e-05, + "loss": 1.0073, + "step": 8100 + }, + { + "epoch": 0.0716950441132269, + "grad_norm": 3.897390127182007, + "learning_rate": 4.880508259811289e-05, + "loss": 0.9959, + "step": 8110 + }, + { + "epoch": 0.07178344737353913, + "grad_norm": 4.909268856048584, + "learning_rate": 4.880360921044101e-05, + "loss": 1.0364, + "step": 8120 + }, + { + "epoch": 0.07187185063385138, + "grad_norm": 4.000115394592285, + "learning_rate": 4.880213582276915e-05, + "loss": 0.8925, + "step": 8130 + }, + { + "epoch": 0.07196025389416362, + "grad_norm": 2.9951322078704834, + "learning_rate": 4.8800662435097275e-05, + "loss": 0.8361, + "step": 8140 + }, + { + "epoch": 0.07204865715447585, + "grad_norm": 3.519526481628418, + "learning_rate": 4.8799189047425404e-05, + "loss": 0.8563, + "step": 8150 + }, + { + "epoch": 0.0721370604147881, + "grad_norm": 7.9687299728393555, + "learning_rate": 4.879771565975353e-05, + "loss": 0.9414, + "step": 8160 + }, + { + "epoch": 0.07222546367510034, + "grad_norm": 6.15947151184082, + "learning_rate": 4.879624227208167e-05, + "loss": 0.9849, + "step": 8170 + }, + { + "epoch": 0.07231386693541257, + "grad_norm": 3.434873342514038, + "learning_rate": 4.879476888440979e-05, + "loss": 0.9088, + "step": 8180 + }, + { + "epoch": 0.07240227019572482, + "grad_norm": 6.331581115722656, + "learning_rate": 4.8793295496737924e-05, + "loss": 0.966, + "step": 8190 + }, + { + "epoch": 0.07249067345603706, + "grad_norm": 2.9577791690826416, + "learning_rate": 4.8791822109066045e-05, + "loss": 0.9171, + "step": 8200 + }, + { + "epoch": 0.0725790767163493, + "grad_norm": 3.063281297683716, + "learning_rate": 4.879034872139418e-05, + "loss": 0.9952, + "step": 8210 + }, + { + "epoch": 0.07266747997666154, + "grad_norm": 6.271945953369141, + "learning_rate": 4.878887533372231e-05, + "loss": 0.8675, + "step": 8220 + }, + { + "epoch": 0.07275588323697378, + "grad_norm": 2.9355950355529785, + "learning_rate": 4.878740194605044e-05, + "loss": 0.8829, + "step": 8230 + }, + { + "epoch": 0.07284428649728603, + "grad_norm": 10.568734169006348, + "learning_rate": 4.8785928558378566e-05, + "loss": 0.9237, + "step": 8240 + }, + { + "epoch": 0.07293268975759826, + "grad_norm": 4.638792037963867, + "learning_rate": 4.87844551707067e-05, + "loss": 0.9894, + "step": 8250 + }, + { + "epoch": 0.0730210930179105, + "grad_norm": 12.097837448120117, + "learning_rate": 4.878298178303482e-05, + "loss": 0.9352, + "step": 8260 + }, + { + "epoch": 0.07310949627822275, + "grad_norm": 6.81761360168457, + "learning_rate": 4.878150839536296e-05, + "loss": 0.9865, + "step": 8270 + }, + { + "epoch": 0.07319789953853498, + "grad_norm": 4.1839823722839355, + "learning_rate": 4.8780035007691086e-05, + "loss": 0.7932, + "step": 8280 + }, + { + "epoch": 0.07328630279884722, + "grad_norm": 3.923272132873535, + "learning_rate": 4.8778561620019214e-05, + "loss": 0.9607, + "step": 8290 + }, + { + "epoch": 0.07337470605915947, + "grad_norm": 10.800895690917969, + "learning_rate": 4.877708823234734e-05, + "loss": 1.0836, + "step": 8300 + }, + { + "epoch": 0.0734631093194717, + "grad_norm": 7.747611045837402, + "learning_rate": 4.877561484467547e-05, + "loss": 1.0398, + "step": 8310 + }, + { + "epoch": 0.07355151257978394, + "grad_norm": 4.6890339851379395, + "learning_rate": 4.87741414570036e-05, + "loss": 0.8863, + "step": 8320 + }, + { + "epoch": 0.07363991584009619, + "grad_norm": 2.836679458618164, + "learning_rate": 4.8772668069331734e-05, + "loss": 0.982, + "step": 8330 + }, + { + "epoch": 0.07372831910040842, + "grad_norm": 7.454015254974365, + "learning_rate": 4.877119468165986e-05, + "loss": 1.0465, + "step": 8340 + }, + { + "epoch": 0.07381672236072066, + "grad_norm": 3.54472279548645, + "learning_rate": 4.876972129398799e-05, + "loss": 1.0009, + "step": 8350 + }, + { + "epoch": 0.07390512562103291, + "grad_norm": 4.462231159210205, + "learning_rate": 4.876824790631612e-05, + "loss": 0.8919, + "step": 8360 + }, + { + "epoch": 0.07399352888134514, + "grad_norm": 6.081721305847168, + "learning_rate": 4.876677451864425e-05, + "loss": 0.8636, + "step": 8370 + }, + { + "epoch": 0.07408193214165738, + "grad_norm": 9.968588829040527, + "learning_rate": 4.8765301130972376e-05, + "loss": 0.9628, + "step": 8380 + }, + { + "epoch": 0.07417033540196963, + "grad_norm": 3.739511489868164, + "learning_rate": 4.876382774330051e-05, + "loss": 0.951, + "step": 8390 + }, + { + "epoch": 0.07425873866228186, + "grad_norm": 4.119143962860107, + "learning_rate": 4.876235435562864e-05, + "loss": 0.8034, + "step": 8400 + }, + { + "epoch": 0.0743471419225941, + "grad_norm": 4.583399772644043, + "learning_rate": 4.876088096795677e-05, + "loss": 0.8868, + "step": 8410 + }, + { + "epoch": 0.07443554518290635, + "grad_norm": 6.812721252441406, + "learning_rate": 4.8759407580284896e-05, + "loss": 0.9057, + "step": 8420 + }, + { + "epoch": 0.07452394844321858, + "grad_norm": 5.281586647033691, + "learning_rate": 4.8757934192613025e-05, + "loss": 0.9477, + "step": 8430 + }, + { + "epoch": 0.07461235170353082, + "grad_norm": 7.319498538970947, + "learning_rate": 4.875646080494115e-05, + "loss": 0.9479, + "step": 8440 + }, + { + "epoch": 0.07470075496384307, + "grad_norm": 4.214430332183838, + "learning_rate": 4.875498741726928e-05, + "loss": 0.8986, + "step": 8450 + }, + { + "epoch": 0.07478915822415531, + "grad_norm": 6.45574951171875, + "learning_rate": 4.8753514029597417e-05, + "loss": 0.944, + "step": 8460 + }, + { + "epoch": 0.07487756148446754, + "grad_norm": 4.877260208129883, + "learning_rate": 4.8752040641925545e-05, + "loss": 0.912, + "step": 8470 + }, + { + "epoch": 0.07496596474477979, + "grad_norm": 14.313943862915039, + "learning_rate": 4.875056725425367e-05, + "loss": 0.9442, + "step": 8480 + }, + { + "epoch": 0.07505436800509203, + "grad_norm": 3.369229555130005, + "learning_rate": 4.87490938665818e-05, + "loss": 1.0468, + "step": 8490 + }, + { + "epoch": 0.07514277126540427, + "grad_norm": 5.822304725646973, + "learning_rate": 4.874762047890993e-05, + "loss": 0.9498, + "step": 8500 + }, + { + "epoch": 0.07523117452571651, + "grad_norm": 5.878641605377197, + "learning_rate": 4.874614709123806e-05, + "loss": 0.8911, + "step": 8510 + }, + { + "epoch": 0.07531957778602875, + "grad_norm": 6.837238788604736, + "learning_rate": 4.8744673703566193e-05, + "loss": 0.9657, + "step": 8520 + }, + { + "epoch": 0.07540798104634099, + "grad_norm": 7.226667881011963, + "learning_rate": 4.874320031589432e-05, + "loss": 1.0134, + "step": 8530 + }, + { + "epoch": 0.07549638430665323, + "grad_norm": 5.0386962890625, + "learning_rate": 4.874172692822245e-05, + "loss": 0.7914, + "step": 8540 + }, + { + "epoch": 0.07558478756696548, + "grad_norm": 3.4485931396484375, + "learning_rate": 4.874025354055058e-05, + "loss": 0.9248, + "step": 8550 + }, + { + "epoch": 0.0756731908272777, + "grad_norm": 11.87322998046875, + "learning_rate": 4.873878015287871e-05, + "loss": 1.0705, + "step": 8560 + }, + { + "epoch": 0.07576159408758995, + "grad_norm": 7.415741920471191, + "learning_rate": 4.8737306765206835e-05, + "loss": 1.0196, + "step": 8570 + }, + { + "epoch": 0.0758499973479022, + "grad_norm": 6.293057441711426, + "learning_rate": 4.873583337753497e-05, + "loss": 0.8976, + "step": 8580 + }, + { + "epoch": 0.07593840060821443, + "grad_norm": 10.351290702819824, + "learning_rate": 4.873435998986309e-05, + "loss": 0.8863, + "step": 8590 + }, + { + "epoch": 0.07602680386852667, + "grad_norm": 6.024335861206055, + "learning_rate": 4.873288660219123e-05, + "loss": 0.8745, + "step": 8600 + }, + { + "epoch": 0.07611520712883892, + "grad_norm": 3.7169315814971924, + "learning_rate": 4.8731413214519355e-05, + "loss": 0.8892, + "step": 8610 + }, + { + "epoch": 0.07620361038915115, + "grad_norm": 12.834798812866211, + "learning_rate": 4.8729939826847484e-05, + "loss": 0.8989, + "step": 8620 + }, + { + "epoch": 0.07629201364946339, + "grad_norm": 4.861924648284912, + "learning_rate": 4.872846643917561e-05, + "loss": 1.0471, + "step": 8630 + }, + { + "epoch": 0.07638041690977564, + "grad_norm": 9.233909606933594, + "learning_rate": 4.872699305150375e-05, + "loss": 0.9387, + "step": 8640 + }, + { + "epoch": 0.07646882017008787, + "grad_norm": 3.8283112049102783, + "learning_rate": 4.872551966383187e-05, + "loss": 0.9259, + "step": 8650 + }, + { + "epoch": 0.07655722343040011, + "grad_norm": 4.9739556312561035, + "learning_rate": 4.8724046276160004e-05, + "loss": 0.8678, + "step": 8660 + }, + { + "epoch": 0.07664562669071236, + "grad_norm": 3.300435781478882, + "learning_rate": 4.8722572888488126e-05, + "loss": 0.9091, + "step": 8670 + }, + { + "epoch": 0.07673402995102459, + "grad_norm": 3.170722723007202, + "learning_rate": 4.872109950081626e-05, + "loss": 0.8505, + "step": 8680 + }, + { + "epoch": 0.07682243321133683, + "grad_norm": 6.202303886413574, + "learning_rate": 4.871962611314439e-05, + "loss": 0.8057, + "step": 8690 + }, + { + "epoch": 0.07691083647164908, + "grad_norm": 4.593632221221924, + "learning_rate": 4.871815272547252e-05, + "loss": 0.8931, + "step": 8700 + }, + { + "epoch": 0.07699923973196131, + "grad_norm": 4.898752212524414, + "learning_rate": 4.8716679337800646e-05, + "loss": 0.9275, + "step": 8710 + }, + { + "epoch": 0.07708764299227355, + "grad_norm": 6.235629081726074, + "learning_rate": 4.871520595012878e-05, + "loss": 1.0612, + "step": 8720 + }, + { + "epoch": 0.0771760462525858, + "grad_norm": 3.890493631362915, + "learning_rate": 4.87137325624569e-05, + "loss": 0.8512, + "step": 8730 + }, + { + "epoch": 0.07726444951289804, + "grad_norm": 4.638036251068115, + "learning_rate": 4.871225917478504e-05, + "loss": 0.9186, + "step": 8740 + }, + { + "epoch": 0.07735285277321027, + "grad_norm": 5.432823657989502, + "learning_rate": 4.8710785787113166e-05, + "loss": 0.8126, + "step": 8750 + }, + { + "epoch": 0.07744125603352252, + "grad_norm": 8.495767593383789, + "learning_rate": 4.8709312399441294e-05, + "loss": 0.9044, + "step": 8760 + }, + { + "epoch": 0.07752965929383476, + "grad_norm": 6.364109039306641, + "learning_rate": 4.870783901176942e-05, + "loss": 0.8803, + "step": 8770 + }, + { + "epoch": 0.077618062554147, + "grad_norm": 5.039573669433594, + "learning_rate": 4.870636562409755e-05, + "loss": 0.8829, + "step": 8780 + }, + { + "epoch": 0.07770646581445924, + "grad_norm": 4.389164924621582, + "learning_rate": 4.870489223642568e-05, + "loss": 0.8927, + "step": 8790 + }, + { + "epoch": 0.07779486907477148, + "grad_norm": 3.298307180404663, + "learning_rate": 4.8703418848753814e-05, + "loss": 0.8852, + "step": 8800 + }, + { + "epoch": 0.07788327233508371, + "grad_norm": 4.961152076721191, + "learning_rate": 4.8701945461081936e-05, + "loss": 0.9537, + "step": 8810 + }, + { + "epoch": 0.07797167559539596, + "grad_norm": 3.1957497596740723, + "learning_rate": 4.870047207341007e-05, + "loss": 0.9297, + "step": 8820 + }, + { + "epoch": 0.0780600788557082, + "grad_norm": 11.411541938781738, + "learning_rate": 4.86989986857382e-05, + "loss": 0.9613, + "step": 8830 + }, + { + "epoch": 0.07814848211602043, + "grad_norm": 6.9205851554870605, + "learning_rate": 4.869752529806633e-05, + "loss": 0.8007, + "step": 8840 + }, + { + "epoch": 0.07823688537633268, + "grad_norm": 3.1366233825683594, + "learning_rate": 4.8696051910394456e-05, + "loss": 0.9913, + "step": 8850 + }, + { + "epoch": 0.07832528863664492, + "grad_norm": 5.935032844543457, + "learning_rate": 4.869457852272259e-05, + "loss": 0.9853, + "step": 8860 + }, + { + "epoch": 0.07841369189695716, + "grad_norm": 2.758721351623535, + "learning_rate": 4.869310513505071e-05, + "loss": 0.9228, + "step": 8870 + }, + { + "epoch": 0.0785020951572694, + "grad_norm": 15.793062210083008, + "learning_rate": 4.869163174737885e-05, + "loss": 0.9235, + "step": 8880 + }, + { + "epoch": 0.07859049841758164, + "grad_norm": 7.283065319061279, + "learning_rate": 4.8690158359706976e-05, + "loss": 0.9523, + "step": 8890 + }, + { + "epoch": 0.07867890167789388, + "grad_norm": 3.8146417140960693, + "learning_rate": 4.8688684972035105e-05, + "loss": 0.9303, + "step": 8900 + }, + { + "epoch": 0.07876730493820612, + "grad_norm": 8.225516319274902, + "learning_rate": 4.868721158436323e-05, + "loss": 0.804, + "step": 8910 + }, + { + "epoch": 0.07885570819851836, + "grad_norm": 6.507173538208008, + "learning_rate": 4.868573819669136e-05, + "loss": 0.7779, + "step": 8920 + }, + { + "epoch": 0.0789441114588306, + "grad_norm": 5.197059154510498, + "learning_rate": 4.868426480901949e-05, + "loss": 0.9114, + "step": 8930 + }, + { + "epoch": 0.07903251471914284, + "grad_norm": 3.2294628620147705, + "learning_rate": 4.8682791421347625e-05, + "loss": 0.7751, + "step": 8940 + }, + { + "epoch": 0.07912091797945509, + "grad_norm": 6.141626358032227, + "learning_rate": 4.8681318033675747e-05, + "loss": 0.959, + "step": 8950 + }, + { + "epoch": 0.07920932123976732, + "grad_norm": 9.837469100952148, + "learning_rate": 4.867984464600388e-05, + "loss": 0.8473, + "step": 8960 + }, + { + "epoch": 0.07929772450007956, + "grad_norm": 3.881667375564575, + "learning_rate": 4.867837125833201e-05, + "loss": 0.9265, + "step": 8970 + }, + { + "epoch": 0.0793861277603918, + "grad_norm": 3.616663932800293, + "learning_rate": 4.867689787066014e-05, + "loss": 0.9199, + "step": 8980 + }, + { + "epoch": 0.07947453102070404, + "grad_norm": 6.833520412445068, + "learning_rate": 4.867542448298827e-05, + "loss": 1.0582, + "step": 8990 + }, + { + "epoch": 0.07956293428101628, + "grad_norm": 6.162457466125488, + "learning_rate": 4.86739510953164e-05, + "loss": 0.8954, + "step": 9000 + }, + { + "epoch": 0.07965133754132853, + "grad_norm": 4.834630012512207, + "learning_rate": 4.8672477707644523e-05, + "loss": 0.897, + "step": 9010 + }, + { + "epoch": 0.07973974080164077, + "grad_norm": 6.014333248138428, + "learning_rate": 4.867100431997266e-05, + "loss": 0.8585, + "step": 9020 + }, + { + "epoch": 0.079828144061953, + "grad_norm": 4.596333980560303, + "learning_rate": 4.866953093230078e-05, + "loss": 1.0444, + "step": 9030 + }, + { + "epoch": 0.07991654732226525, + "grad_norm": 2.975196599960327, + "learning_rate": 4.8668057544628915e-05, + "loss": 0.9478, + "step": 9040 + }, + { + "epoch": 0.08000495058257749, + "grad_norm": 7.315517425537109, + "learning_rate": 4.8666584156957044e-05, + "loss": 0.9546, + "step": 9050 + }, + { + "epoch": 0.08009335384288972, + "grad_norm": 5.213054180145264, + "learning_rate": 4.866511076928517e-05, + "loss": 0.89, + "step": 9060 + }, + { + "epoch": 0.08018175710320197, + "grad_norm": 8.745564460754395, + "learning_rate": 4.86636373816133e-05, + "loss": 1.0033, + "step": 9070 + }, + { + "epoch": 0.08027016036351421, + "grad_norm": 12.515480995178223, + "learning_rate": 4.8662163993941435e-05, + "loss": 0.9331, + "step": 9080 + }, + { + "epoch": 0.08035856362382644, + "grad_norm": 8.361699104309082, + "learning_rate": 4.866069060626956e-05, + "loss": 0.9635, + "step": 9090 + }, + { + "epoch": 0.08044696688413869, + "grad_norm": 8.072646141052246, + "learning_rate": 4.865921721859769e-05, + "loss": 0.8679, + "step": 9100 + }, + { + "epoch": 0.08053537014445093, + "grad_norm": 2.1535627841949463, + "learning_rate": 4.865774383092582e-05, + "loss": 0.949, + "step": 9110 + }, + { + "epoch": 0.08062377340476316, + "grad_norm": 4.744264125823975, + "learning_rate": 4.865627044325395e-05, + "loss": 1.0331, + "step": 9120 + }, + { + "epoch": 0.08071217666507541, + "grad_norm": 8.3015775680542, + "learning_rate": 4.865479705558208e-05, + "loss": 0.9036, + "step": 9130 + }, + { + "epoch": 0.08080057992538765, + "grad_norm": 6.706944465637207, + "learning_rate": 4.8653323667910206e-05, + "loss": 0.9976, + "step": 9140 + }, + { + "epoch": 0.08088898318569988, + "grad_norm": 3.629948139190674, + "learning_rate": 4.8651850280238334e-05, + "loss": 1.0273, + "step": 9150 + }, + { + "epoch": 0.08097738644601213, + "grad_norm": 5.1356306076049805, + "learning_rate": 4.865037689256647e-05, + "loss": 0.8853, + "step": 9160 + }, + { + "epoch": 0.08106578970632437, + "grad_norm": 6.095510005950928, + "learning_rate": 4.864890350489459e-05, + "loss": 0.8794, + "step": 9170 + }, + { + "epoch": 0.0811541929666366, + "grad_norm": 5.80955171585083, + "learning_rate": 4.8647430117222726e-05, + "loss": 1.0623, + "step": 9180 + }, + { + "epoch": 0.08124259622694885, + "grad_norm": 5.339105129241943, + "learning_rate": 4.8645956729550854e-05, + "loss": 0.886, + "step": 9190 + }, + { + "epoch": 0.0813309994872611, + "grad_norm": 6.6511549949646, + "learning_rate": 4.864448334187898e-05, + "loss": 0.9781, + "step": 9200 + }, + { + "epoch": 0.08141940274757332, + "grad_norm": 7.387551784515381, + "learning_rate": 4.864300995420711e-05, + "loss": 0.9809, + "step": 9210 + }, + { + "epoch": 0.08150780600788557, + "grad_norm": 4.8425469398498535, + "learning_rate": 4.8641536566535246e-05, + "loss": 0.9702, + "step": 9220 + }, + { + "epoch": 0.08159620926819781, + "grad_norm": 9.483480453491211, + "learning_rate": 4.864006317886337e-05, + "loss": 0.8845, + "step": 9230 + }, + { + "epoch": 0.08168461252851004, + "grad_norm": 5.919254302978516, + "learning_rate": 4.86385897911915e-05, + "loss": 0.9047, + "step": 9240 + }, + { + "epoch": 0.08177301578882229, + "grad_norm": 5.504541397094727, + "learning_rate": 4.863711640351963e-05, + "loss": 0.9067, + "step": 9250 + }, + { + "epoch": 0.08186141904913453, + "grad_norm": 3.395012617111206, + "learning_rate": 4.863564301584776e-05, + "loss": 0.9285, + "step": 9260 + }, + { + "epoch": 0.08194982230944678, + "grad_norm": 5.435262203216553, + "learning_rate": 4.863416962817589e-05, + "loss": 0.9868, + "step": 9270 + }, + { + "epoch": 0.08203822556975901, + "grad_norm": 4.4511637687683105, + "learning_rate": 4.8632696240504016e-05, + "loss": 0.8484, + "step": 9280 + }, + { + "epoch": 0.08212662883007125, + "grad_norm": 5.902878761291504, + "learning_rate": 4.8631222852832144e-05, + "loss": 0.9654, + "step": 9290 + }, + { + "epoch": 0.0822150320903835, + "grad_norm": 5.942347526550293, + "learning_rate": 4.862974946516028e-05, + "loss": 0.8642, + "step": 9300 + }, + { + "epoch": 0.08230343535069573, + "grad_norm": 4.489561080932617, + "learning_rate": 4.862827607748841e-05, + "loss": 0.906, + "step": 9310 + }, + { + "epoch": 0.08239183861100798, + "grad_norm": 4.833526611328125, + "learning_rate": 4.8626802689816536e-05, + "loss": 0.9049, + "step": 9320 + }, + { + "epoch": 0.08248024187132022, + "grad_norm": 5.06949520111084, + "learning_rate": 4.8625329302144665e-05, + "loss": 0.8651, + "step": 9330 + }, + { + "epoch": 0.08256864513163245, + "grad_norm": 3.7296364307403564, + "learning_rate": 4.862385591447279e-05, + "loss": 0.9039, + "step": 9340 + }, + { + "epoch": 0.0826570483919447, + "grad_norm": 4.1431145668029785, + "learning_rate": 4.862238252680093e-05, + "loss": 0.8389, + "step": 9350 + }, + { + "epoch": 0.08274545165225694, + "grad_norm": 5.955991268157959, + "learning_rate": 4.8620909139129056e-05, + "loss": 1.0676, + "step": 9360 + }, + { + "epoch": 0.08283385491256917, + "grad_norm": 8.427042961120605, + "learning_rate": 4.8619435751457185e-05, + "loss": 0.9391, + "step": 9370 + }, + { + "epoch": 0.08292225817288142, + "grad_norm": 10.462730407714844, + "learning_rate": 4.861796236378531e-05, + "loss": 1.0418, + "step": 9380 + }, + { + "epoch": 0.08301066143319366, + "grad_norm": 6.5250725746154785, + "learning_rate": 4.861648897611344e-05, + "loss": 0.977, + "step": 9390 + }, + { + "epoch": 0.08309906469350589, + "grad_norm": 7.1132283210754395, + "learning_rate": 4.861501558844157e-05, + "loss": 0.9021, + "step": 9400 + }, + { + "epoch": 0.08318746795381814, + "grad_norm": 3.116255044937134, + "learning_rate": 4.8613542200769705e-05, + "loss": 0.9143, + "step": 9410 + }, + { + "epoch": 0.08327587121413038, + "grad_norm": 3.6663601398468018, + "learning_rate": 4.861206881309783e-05, + "loss": 1.0068, + "step": 9420 + }, + { + "epoch": 0.08336427447444261, + "grad_norm": 9.257044792175293, + "learning_rate": 4.861059542542596e-05, + "loss": 0.8149, + "step": 9430 + }, + { + "epoch": 0.08345267773475486, + "grad_norm": 8.620640754699707, + "learning_rate": 4.860912203775409e-05, + "loss": 0.8673, + "step": 9440 + }, + { + "epoch": 0.0835410809950671, + "grad_norm": 5.518143653869629, + "learning_rate": 4.860764865008222e-05, + "loss": 0.867, + "step": 9450 + }, + { + "epoch": 0.08362948425537933, + "grad_norm": 5.901253700256348, + "learning_rate": 4.860617526241035e-05, + "loss": 0.8197, + "step": 9460 + }, + { + "epoch": 0.08371788751569158, + "grad_norm": 5.221851825714111, + "learning_rate": 4.860470187473848e-05, + "loss": 0.8809, + "step": 9470 + }, + { + "epoch": 0.08380629077600382, + "grad_norm": 12.859980583190918, + "learning_rate": 4.8603228487066604e-05, + "loss": 1.0427, + "step": 9480 + }, + { + "epoch": 0.08389469403631605, + "grad_norm": 7.886823654174805, + "learning_rate": 4.860175509939474e-05, + "loss": 0.9257, + "step": 9490 + }, + { + "epoch": 0.0839830972966283, + "grad_norm": 12.58973503112793, + "learning_rate": 4.860028171172286e-05, + "loss": 0.8344, + "step": 9500 + }, + { + "epoch": 0.08407150055694054, + "grad_norm": 3.997868537902832, + "learning_rate": 4.8598808324050995e-05, + "loss": 0.8301, + "step": 9510 + }, + { + "epoch": 0.08415990381725277, + "grad_norm": 10.132091522216797, + "learning_rate": 4.8597334936379124e-05, + "loss": 0.9948, + "step": 9520 + }, + { + "epoch": 0.08424830707756502, + "grad_norm": 10.275101661682129, + "learning_rate": 4.859586154870725e-05, + "loss": 0.9757, + "step": 9530 + }, + { + "epoch": 0.08433671033787726, + "grad_norm": 7.305060386657715, + "learning_rate": 4.859438816103538e-05, + "loss": 0.8798, + "step": 9540 + }, + { + "epoch": 0.08442511359818951, + "grad_norm": 4.845224857330322, + "learning_rate": 4.8592914773363516e-05, + "loss": 0.8895, + "step": 9550 + }, + { + "epoch": 0.08451351685850174, + "grad_norm": 7.1752519607543945, + "learning_rate": 4.859144138569164e-05, + "loss": 0.9172, + "step": 9560 + }, + { + "epoch": 0.08460192011881398, + "grad_norm": 7.0624918937683105, + "learning_rate": 4.858996799801977e-05, + "loss": 0.9387, + "step": 9570 + }, + { + "epoch": 0.08469032337912623, + "grad_norm": 5.013216495513916, + "learning_rate": 4.85884946103479e-05, + "loss": 0.8568, + "step": 9580 + }, + { + "epoch": 0.08477872663943846, + "grad_norm": 7.294334411621094, + "learning_rate": 4.858702122267603e-05, + "loss": 0.8542, + "step": 9590 + }, + { + "epoch": 0.0848671298997507, + "grad_norm": 4.1750054359436035, + "learning_rate": 4.858554783500416e-05, + "loss": 0.9288, + "step": 9600 + }, + { + "epoch": 0.08495553316006295, + "grad_norm": 4.327548027038574, + "learning_rate": 4.8584074447332286e-05, + "loss": 0.8887, + "step": 9610 + }, + { + "epoch": 0.08504393642037518, + "grad_norm": 8.034274101257324, + "learning_rate": 4.8582601059660414e-05, + "loss": 0.9727, + "step": 9620 + }, + { + "epoch": 0.08513233968068742, + "grad_norm": 5.522838592529297, + "learning_rate": 4.858112767198855e-05, + "loss": 0.8279, + "step": 9630 + }, + { + "epoch": 0.08522074294099967, + "grad_norm": 3.7303531169891357, + "learning_rate": 4.857965428431667e-05, + "loss": 0.9085, + "step": 9640 + }, + { + "epoch": 0.0853091462013119, + "grad_norm": 5.9661054611206055, + "learning_rate": 4.8578180896644806e-05, + "loss": 0.7948, + "step": 9650 + }, + { + "epoch": 0.08539754946162414, + "grad_norm": 3.590968608856201, + "learning_rate": 4.8576707508972934e-05, + "loss": 0.8799, + "step": 9660 + }, + { + "epoch": 0.08548595272193639, + "grad_norm": 5.45489501953125, + "learning_rate": 4.857523412130106e-05, + "loss": 0.8394, + "step": 9670 + }, + { + "epoch": 0.08557435598224862, + "grad_norm": 5.977509498596191, + "learning_rate": 4.857376073362919e-05, + "loss": 0.9623, + "step": 9680 + }, + { + "epoch": 0.08566275924256087, + "grad_norm": 7.4215803146362305, + "learning_rate": 4.8572287345957326e-05, + "loss": 0.9526, + "step": 9690 + }, + { + "epoch": 0.08575116250287311, + "grad_norm": 5.85550594329834, + "learning_rate": 4.857081395828545e-05, + "loss": 0.8054, + "step": 9700 + }, + { + "epoch": 0.08583956576318534, + "grad_norm": 6.150406360626221, + "learning_rate": 4.856934057061358e-05, + "loss": 0.9118, + "step": 9710 + }, + { + "epoch": 0.08592796902349759, + "grad_norm": 6.535300254821777, + "learning_rate": 4.8567867182941704e-05, + "loss": 0.8618, + "step": 9720 + }, + { + "epoch": 0.08601637228380983, + "grad_norm": 6.574619293212891, + "learning_rate": 4.856639379526984e-05, + "loss": 0.9562, + "step": 9730 + }, + { + "epoch": 0.08610477554412206, + "grad_norm": 5.1996259689331055, + "learning_rate": 4.856492040759797e-05, + "loss": 0.8255, + "step": 9740 + }, + { + "epoch": 0.0861931788044343, + "grad_norm": 5.212782382965088, + "learning_rate": 4.8563447019926096e-05, + "loss": 0.9455, + "step": 9750 + }, + { + "epoch": 0.08628158206474655, + "grad_norm": 5.106650352478027, + "learning_rate": 4.8561973632254225e-05, + "loss": 0.9627, + "step": 9760 + }, + { + "epoch": 0.08636998532505878, + "grad_norm": 3.951993942260742, + "learning_rate": 4.856050024458236e-05, + "loss": 0.8424, + "step": 9770 + }, + { + "epoch": 0.08645838858537103, + "grad_norm": 3.8278937339782715, + "learning_rate": 4.855902685691048e-05, + "loss": 0.8581, + "step": 9780 + }, + { + "epoch": 0.08654679184568327, + "grad_norm": 7.4229736328125, + "learning_rate": 4.8557553469238616e-05, + "loss": 0.8744, + "step": 9790 + }, + { + "epoch": 0.0866351951059955, + "grad_norm": 2.8029661178588867, + "learning_rate": 4.8556080081566745e-05, + "loss": 0.7749, + "step": 9800 + }, + { + "epoch": 0.08672359836630775, + "grad_norm": 7.567657947540283, + "learning_rate": 4.855460669389487e-05, + "loss": 0.7839, + "step": 9810 + }, + { + "epoch": 0.08681200162661999, + "grad_norm": 5.7860026359558105, + "learning_rate": 4.8553133306223e-05, + "loss": 0.918, + "step": 9820 + }, + { + "epoch": 0.08690040488693224, + "grad_norm": 6.564748764038086, + "learning_rate": 4.8551659918551137e-05, + "loss": 0.787, + "step": 9830 + }, + { + "epoch": 0.08698880814724447, + "grad_norm": 8.89094352722168, + "learning_rate": 4.855018653087926e-05, + "loss": 0.7996, + "step": 9840 + }, + { + "epoch": 0.08707721140755671, + "grad_norm": 4.378356456756592, + "learning_rate": 4.854871314320739e-05, + "loss": 0.7955, + "step": 9850 + }, + { + "epoch": 0.08716561466786896, + "grad_norm": 8.747638702392578, + "learning_rate": 4.8547239755535515e-05, + "loss": 0.8285, + "step": 9860 + }, + { + "epoch": 0.08725401792818119, + "grad_norm": 4.274459362030029, + "learning_rate": 4.854576636786365e-05, + "loss": 1.0261, + "step": 9870 + }, + { + "epoch": 0.08734242118849343, + "grad_norm": 4.295324802398682, + "learning_rate": 4.854429298019178e-05, + "loss": 0.9296, + "step": 9880 + }, + { + "epoch": 0.08743082444880568, + "grad_norm": 6.3039960861206055, + "learning_rate": 4.854281959251991e-05, + "loss": 0.8966, + "step": 9890 + }, + { + "epoch": 0.08751922770911791, + "grad_norm": 8.68468189239502, + "learning_rate": 4.8541346204848035e-05, + "loss": 0.8689, + "step": 9900 + }, + { + "epoch": 0.08760763096943015, + "grad_norm": 4.591320991516113, + "learning_rate": 4.853987281717617e-05, + "loss": 0.8757, + "step": 9910 + }, + { + "epoch": 0.0876960342297424, + "grad_norm": 3.1836514472961426, + "learning_rate": 4.853839942950429e-05, + "loss": 0.829, + "step": 9920 + }, + { + "epoch": 0.08778443749005463, + "grad_norm": 6.4215850830078125, + "learning_rate": 4.853692604183243e-05, + "loss": 0.8945, + "step": 9930 + }, + { + "epoch": 0.08787284075036687, + "grad_norm": 4.03424072265625, + "learning_rate": 4.8535452654160555e-05, + "loss": 0.8697, + "step": 9940 + }, + { + "epoch": 0.08796124401067912, + "grad_norm": 6.259154796600342, + "learning_rate": 4.8533979266488684e-05, + "loss": 0.8026, + "step": 9950 + }, + { + "epoch": 0.08804964727099135, + "grad_norm": 6.984891891479492, + "learning_rate": 4.853250587881681e-05, + "loss": 0.9533, + "step": 9960 + }, + { + "epoch": 0.0881380505313036, + "grad_norm": 5.218731880187988, + "learning_rate": 4.853103249114494e-05, + "loss": 0.9003, + "step": 9970 + }, + { + "epoch": 0.08822645379161584, + "grad_norm": 9.232502937316895, + "learning_rate": 4.852955910347307e-05, + "loss": 0.9712, + "step": 9980 + }, + { + "epoch": 0.08831485705192807, + "grad_norm": 5.730737686157227, + "learning_rate": 4.8528085715801204e-05, + "loss": 0.9322, + "step": 9990 + }, + { + "epoch": 0.08840326031224031, + "grad_norm": 6.135512351989746, + "learning_rate": 4.8526612328129325e-05, + "loss": 0.8462, + "step": 10000 + }, + { + "epoch": 0.08849166357255256, + "grad_norm": 5.257114887237549, + "learning_rate": 4.852513894045746e-05, + "loss": 0.797, + "step": 10010 + }, + { + "epoch": 0.08858006683286479, + "grad_norm": 4.087240219116211, + "learning_rate": 4.852366555278559e-05, + "loss": 0.8649, + "step": 10020 + }, + { + "epoch": 0.08866847009317703, + "grad_norm": 7.3108367919921875, + "learning_rate": 4.852219216511372e-05, + "loss": 0.8297, + "step": 10030 + }, + { + "epoch": 0.08875687335348928, + "grad_norm": 9.954843521118164, + "learning_rate": 4.8520718777441846e-05, + "loss": 0.8958, + "step": 10040 + }, + { + "epoch": 0.08884527661380151, + "grad_norm": 5.5591864585876465, + "learning_rate": 4.851924538976998e-05, + "loss": 0.7817, + "step": 10050 + }, + { + "epoch": 0.08893367987411376, + "grad_norm": 3.648822546005249, + "learning_rate": 4.85177720020981e-05, + "loss": 0.8745, + "step": 10060 + }, + { + "epoch": 0.089022083134426, + "grad_norm": 7.206264495849609, + "learning_rate": 4.851629861442624e-05, + "loss": 0.8934, + "step": 10070 + }, + { + "epoch": 0.08911048639473824, + "grad_norm": 12.365501403808594, + "learning_rate": 4.8514825226754366e-05, + "loss": 0.8247, + "step": 10080 + }, + { + "epoch": 0.08919888965505048, + "grad_norm": 10.052908897399902, + "learning_rate": 4.8513351839082494e-05, + "loss": 0.8579, + "step": 10090 + }, + { + "epoch": 0.08928729291536272, + "grad_norm": 5.000236988067627, + "learning_rate": 4.851187845141062e-05, + "loss": 0.891, + "step": 10100 + }, + { + "epoch": 0.08937569617567497, + "grad_norm": 5.344237804412842, + "learning_rate": 4.851040506373875e-05, + "loss": 1.045, + "step": 10110 + }, + { + "epoch": 0.0894640994359872, + "grad_norm": 7.5027666091918945, + "learning_rate": 4.850893167606688e-05, + "loss": 0.8591, + "step": 10120 + }, + { + "epoch": 0.08955250269629944, + "grad_norm": 5.193624496459961, + "learning_rate": 4.8507458288395014e-05, + "loss": 0.9054, + "step": 10130 + }, + { + "epoch": 0.08964090595661169, + "grad_norm": 4.23881196975708, + "learning_rate": 4.850598490072314e-05, + "loss": 1.033, + "step": 10140 + }, + { + "epoch": 0.08972930921692392, + "grad_norm": 4.646491527557373, + "learning_rate": 4.850451151305127e-05, + "loss": 0.991, + "step": 10150 + }, + { + "epoch": 0.08981771247723616, + "grad_norm": 3.2145233154296875, + "learning_rate": 4.85030381253794e-05, + "loss": 0.9203, + "step": 10160 + }, + { + "epoch": 0.0899061157375484, + "grad_norm": 3.4959211349487305, + "learning_rate": 4.850156473770753e-05, + "loss": 0.9051, + "step": 10170 + }, + { + "epoch": 0.08999451899786064, + "grad_norm": 3.9835028648376465, + "learning_rate": 4.8500091350035656e-05, + "loss": 0.8963, + "step": 10180 + }, + { + "epoch": 0.09008292225817288, + "grad_norm": 5.471762180328369, + "learning_rate": 4.849861796236379e-05, + "loss": 0.9159, + "step": 10190 + }, + { + "epoch": 0.09017132551848513, + "grad_norm": 8.925811767578125, + "learning_rate": 4.849714457469192e-05, + "loss": 0.9832, + "step": 10200 + }, + { + "epoch": 0.09025972877879736, + "grad_norm": 4.184961795806885, + "learning_rate": 4.849567118702005e-05, + "loss": 0.9145, + "step": 10210 + }, + { + "epoch": 0.0903481320391096, + "grad_norm": 5.561929225921631, + "learning_rate": 4.8494197799348176e-05, + "loss": 0.8295, + "step": 10220 + }, + { + "epoch": 0.09043653529942185, + "grad_norm": 4.6912665367126465, + "learning_rate": 4.8492724411676305e-05, + "loss": 0.7778, + "step": 10230 + }, + { + "epoch": 0.09052493855973408, + "grad_norm": 4.672646999359131, + "learning_rate": 4.849125102400443e-05, + "loss": 0.9648, + "step": 10240 + }, + { + "epoch": 0.09061334182004632, + "grad_norm": 3.2846500873565674, + "learning_rate": 4.848977763633256e-05, + "loss": 0.9489, + "step": 10250 + }, + { + "epoch": 0.09070174508035857, + "grad_norm": 5.393837928771973, + "learning_rate": 4.8488304248660696e-05, + "loss": 0.8983, + "step": 10260 + }, + { + "epoch": 0.0907901483406708, + "grad_norm": 2.277843475341797, + "learning_rate": 4.8486830860988825e-05, + "loss": 0.8851, + "step": 10270 + }, + { + "epoch": 0.09087855160098304, + "grad_norm": 2.764491081237793, + "learning_rate": 4.848535747331695e-05, + "loss": 0.9351, + "step": 10280 + }, + { + "epoch": 0.09096695486129529, + "grad_norm": 10.896225929260254, + "learning_rate": 4.848388408564508e-05, + "loss": 0.9205, + "step": 10290 + }, + { + "epoch": 0.09105535812160752, + "grad_norm": 4.834585666656494, + "learning_rate": 4.848241069797321e-05, + "loss": 1.0148, + "step": 10300 + }, + { + "epoch": 0.09114376138191976, + "grad_norm": 4.664348125457764, + "learning_rate": 4.848093731030134e-05, + "loss": 0.8952, + "step": 10310 + }, + { + "epoch": 0.09123216464223201, + "grad_norm": 2.5782110691070557, + "learning_rate": 4.847946392262947e-05, + "loss": 0.8659, + "step": 10320 + }, + { + "epoch": 0.09132056790254424, + "grad_norm": 6.341892719268799, + "learning_rate": 4.8477990534957595e-05, + "loss": 0.955, + "step": 10330 + }, + { + "epoch": 0.09140897116285648, + "grad_norm": 4.601062297821045, + "learning_rate": 4.847651714728573e-05, + "loss": 0.8627, + "step": 10340 + }, + { + "epoch": 0.09149737442316873, + "grad_norm": 3.9025869369506836, + "learning_rate": 4.847504375961386e-05, + "loss": 0.8759, + "step": 10350 + }, + { + "epoch": 0.09158577768348097, + "grad_norm": 5.3651509284973145, + "learning_rate": 4.847357037194199e-05, + "loss": 0.8328, + "step": 10360 + }, + { + "epoch": 0.0916741809437932, + "grad_norm": 3.72887921333313, + "learning_rate": 4.8472096984270115e-05, + "loss": 0.8937, + "step": 10370 + }, + { + "epoch": 0.09176258420410545, + "grad_norm": 2.6255335807800293, + "learning_rate": 4.847062359659825e-05, + "loss": 0.9507, + "step": 10380 + }, + { + "epoch": 0.0918509874644177, + "grad_norm": 8.633094787597656, + "learning_rate": 4.846915020892637e-05, + "loss": 0.9912, + "step": 10390 + }, + { + "epoch": 0.09193939072472992, + "grad_norm": 4.747097015380859, + "learning_rate": 4.846767682125451e-05, + "loss": 0.9495, + "step": 10400 + }, + { + "epoch": 0.09202779398504217, + "grad_norm": 3.7421910762786865, + "learning_rate": 4.8466203433582635e-05, + "loss": 0.8498, + "step": 10410 + }, + { + "epoch": 0.09211619724535441, + "grad_norm": 7.091905117034912, + "learning_rate": 4.8464730045910764e-05, + "loss": 0.8067, + "step": 10420 + }, + { + "epoch": 0.09220460050566665, + "grad_norm": 6.685669898986816, + "learning_rate": 4.846325665823889e-05, + "loss": 0.8737, + "step": 10430 + }, + { + "epoch": 0.09229300376597889, + "grad_norm": 4.483755588531494, + "learning_rate": 4.846178327056702e-05, + "loss": 0.9254, + "step": 10440 + }, + { + "epoch": 0.09238140702629113, + "grad_norm": 6.154871940612793, + "learning_rate": 4.846030988289515e-05, + "loss": 0.9541, + "step": 10450 + }, + { + "epoch": 0.09246981028660337, + "grad_norm": 5.813018321990967, + "learning_rate": 4.8458836495223284e-05, + "loss": 0.8701, + "step": 10460 + }, + { + "epoch": 0.09255821354691561, + "grad_norm": 3.061584711074829, + "learning_rate": 4.8457363107551405e-05, + "loss": 0.9496, + "step": 10470 + }, + { + "epoch": 0.09264661680722786, + "grad_norm": 6.514309883117676, + "learning_rate": 4.845588971987954e-05, + "loss": 0.8014, + "step": 10480 + }, + { + "epoch": 0.09273502006754009, + "grad_norm": 2.5385093688964844, + "learning_rate": 4.845441633220767e-05, + "loss": 0.9017, + "step": 10490 + }, + { + "epoch": 0.09282342332785233, + "grad_norm": 5.3517680168151855, + "learning_rate": 4.84529429445358e-05, + "loss": 0.8886, + "step": 10500 + }, + { + "epoch": 0.09291182658816458, + "grad_norm": 6.019667625427246, + "learning_rate": 4.8451469556863926e-05, + "loss": 0.8635, + "step": 10510 + }, + { + "epoch": 0.0930002298484768, + "grad_norm": 6.268322944641113, + "learning_rate": 4.844999616919206e-05, + "loss": 0.7925, + "step": 10520 + }, + { + "epoch": 0.09308863310878905, + "grad_norm": 4.191029071807861, + "learning_rate": 4.844852278152018e-05, + "loss": 0.8386, + "step": 10530 + }, + { + "epoch": 0.0931770363691013, + "grad_norm": 5.0751519203186035, + "learning_rate": 4.844704939384832e-05, + "loss": 0.9943, + "step": 10540 + }, + { + "epoch": 0.09326543962941353, + "grad_norm": 4.535125732421875, + "learning_rate": 4.844557600617644e-05, + "loss": 0.8693, + "step": 10550 + }, + { + "epoch": 0.09335384288972577, + "grad_norm": 6.545707702636719, + "learning_rate": 4.8444102618504574e-05, + "loss": 0.9216, + "step": 10560 + }, + { + "epoch": 0.09344224615003802, + "grad_norm": 7.115601062774658, + "learning_rate": 4.84426292308327e-05, + "loss": 0.9629, + "step": 10570 + }, + { + "epoch": 0.09353064941035025, + "grad_norm": 4.578786373138428, + "learning_rate": 4.844115584316083e-05, + "loss": 0.9829, + "step": 10580 + }, + { + "epoch": 0.09361905267066249, + "grad_norm": 3.8402419090270996, + "learning_rate": 4.843968245548896e-05, + "loss": 0.9583, + "step": 10590 + }, + { + "epoch": 0.09370745593097474, + "grad_norm": 3.4268300533294678, + "learning_rate": 4.8438209067817094e-05, + "loss": 0.9275, + "step": 10600 + }, + { + "epoch": 0.09379585919128698, + "grad_norm": 4.041760444641113, + "learning_rate": 4.8436735680145216e-05, + "loss": 0.8986, + "step": 10610 + }, + { + "epoch": 0.09388426245159921, + "grad_norm": 4.180763244628906, + "learning_rate": 4.843526229247335e-05, + "loss": 0.9127, + "step": 10620 + }, + { + "epoch": 0.09397266571191146, + "grad_norm": 6.7749409675598145, + "learning_rate": 4.843378890480148e-05, + "loss": 0.8426, + "step": 10630 + }, + { + "epoch": 0.0940610689722237, + "grad_norm": 6.551861763000488, + "learning_rate": 4.843231551712961e-05, + "loss": 0.7469, + "step": 10640 + }, + { + "epoch": 0.09414947223253593, + "grad_norm": 6.459712982177734, + "learning_rate": 4.8430842129457736e-05, + "loss": 0.8874, + "step": 10650 + }, + { + "epoch": 0.09423787549284818, + "grad_norm": 8.966519355773926, + "learning_rate": 4.842936874178587e-05, + "loss": 0.961, + "step": 10660 + }, + { + "epoch": 0.09432627875316042, + "grad_norm": 4.387051582336426, + "learning_rate": 4.842789535411399e-05, + "loss": 0.857, + "step": 10670 + }, + { + "epoch": 0.09441468201347265, + "grad_norm": 5.910558223724365, + "learning_rate": 4.842642196644213e-05, + "loss": 0.8454, + "step": 10680 + }, + { + "epoch": 0.0945030852737849, + "grad_norm": 7.334143161773682, + "learning_rate": 4.842494857877025e-05, + "loss": 0.8719, + "step": 10690 + }, + { + "epoch": 0.09459148853409714, + "grad_norm": 3.0009491443634033, + "learning_rate": 4.8423475191098385e-05, + "loss": 0.9587, + "step": 10700 + }, + { + "epoch": 0.09467989179440937, + "grad_norm": 5.048775672912598, + "learning_rate": 4.842200180342651e-05, + "loss": 0.7913, + "step": 10710 + }, + { + "epoch": 0.09476829505472162, + "grad_norm": 9.675728797912598, + "learning_rate": 4.842052841575464e-05, + "loss": 0.9415, + "step": 10720 + }, + { + "epoch": 0.09485669831503386, + "grad_norm": 6.40328311920166, + "learning_rate": 4.841905502808277e-05, + "loss": 0.883, + "step": 10730 + }, + { + "epoch": 0.0949451015753461, + "grad_norm": 2.5814383029937744, + "learning_rate": 4.8417581640410905e-05, + "loss": 0.8322, + "step": 10740 + }, + { + "epoch": 0.09503350483565834, + "grad_norm": 7.165457248687744, + "learning_rate": 4.8416108252739026e-05, + "loss": 0.8912, + "step": 10750 + }, + { + "epoch": 0.09512190809597058, + "grad_norm": 7.884331703186035, + "learning_rate": 4.841463486506716e-05, + "loss": 0.807, + "step": 10760 + }, + { + "epoch": 0.09521031135628281, + "grad_norm": 3.5740268230438232, + "learning_rate": 4.841316147739529e-05, + "loss": 0.8781, + "step": 10770 + }, + { + "epoch": 0.09529871461659506, + "grad_norm": 5.46319055557251, + "learning_rate": 4.841168808972342e-05, + "loss": 0.8125, + "step": 10780 + }, + { + "epoch": 0.0953871178769073, + "grad_norm": 4.8293585777282715, + "learning_rate": 4.841021470205155e-05, + "loss": 0.7758, + "step": 10790 + }, + { + "epoch": 0.09547552113721954, + "grad_norm": 3.7307755947113037, + "learning_rate": 4.8408741314379675e-05, + "loss": 0.9022, + "step": 10800 + }, + { + "epoch": 0.09556392439753178, + "grad_norm": 6.22001838684082, + "learning_rate": 4.84072679267078e-05, + "loss": 0.8317, + "step": 10810 + }, + { + "epoch": 0.09565232765784402, + "grad_norm": 7.880277633666992, + "learning_rate": 4.840579453903594e-05, + "loss": 0.8278, + "step": 10820 + }, + { + "epoch": 0.09574073091815626, + "grad_norm": 7.64493465423584, + "learning_rate": 4.840432115136406e-05, + "loss": 0.9138, + "step": 10830 + }, + { + "epoch": 0.0958291341784685, + "grad_norm": 3.439091920852661, + "learning_rate": 4.8402847763692195e-05, + "loss": 0.928, + "step": 10840 + }, + { + "epoch": 0.09591753743878075, + "grad_norm": 5.199951648712158, + "learning_rate": 4.8401374376020324e-05, + "loss": 0.7234, + "step": 10850 + }, + { + "epoch": 0.09600594069909298, + "grad_norm": 5.242109298706055, + "learning_rate": 4.839990098834845e-05, + "loss": 0.8521, + "step": 10860 + }, + { + "epoch": 0.09609434395940522, + "grad_norm": 8.574398040771484, + "learning_rate": 4.839842760067658e-05, + "loss": 0.8285, + "step": 10870 + }, + { + "epoch": 0.09618274721971747, + "grad_norm": 7.547273635864258, + "learning_rate": 4.8396954213004715e-05, + "loss": 0.8114, + "step": 10880 + }, + { + "epoch": 0.09627115048002971, + "grad_norm": 5.9737114906311035, + "learning_rate": 4.839548082533284e-05, + "loss": 0.8522, + "step": 10890 + }, + { + "epoch": 0.09635955374034194, + "grad_norm": 6.698936462402344, + "learning_rate": 4.839400743766097e-05, + "loss": 0.9251, + "step": 10900 + }, + { + "epoch": 0.09644795700065419, + "grad_norm": 7.054299831390381, + "learning_rate": 4.8392534049989094e-05, + "loss": 0.9101, + "step": 10910 + }, + { + "epoch": 0.09653636026096643, + "grad_norm": 3.2542550563812256, + "learning_rate": 4.839106066231723e-05, + "loss": 0.9316, + "step": 10920 + }, + { + "epoch": 0.09662476352127866, + "grad_norm": 4.418131351470947, + "learning_rate": 4.838958727464536e-05, + "loss": 0.8635, + "step": 10930 + }, + { + "epoch": 0.0967131667815909, + "grad_norm": 6.187062740325928, + "learning_rate": 4.8388113886973486e-05, + "loss": 0.8683, + "step": 10940 + }, + { + "epoch": 0.09680157004190315, + "grad_norm": 4.491509914398193, + "learning_rate": 4.8386640499301614e-05, + "loss": 0.7484, + "step": 10950 + }, + { + "epoch": 0.09688997330221538, + "grad_norm": 3.0069119930267334, + "learning_rate": 4.838516711162975e-05, + "loss": 0.8678, + "step": 10960 + }, + { + "epoch": 0.09697837656252763, + "grad_norm": 6.826173305511475, + "learning_rate": 4.838369372395787e-05, + "loss": 0.9782, + "step": 10970 + }, + { + "epoch": 0.09706677982283987, + "grad_norm": 6.12066650390625, + "learning_rate": 4.8382220336286006e-05, + "loss": 0.7994, + "step": 10980 + }, + { + "epoch": 0.0971551830831521, + "grad_norm": 5.961526393890381, + "learning_rate": 4.8380746948614134e-05, + "loss": 1.0027, + "step": 10990 + }, + { + "epoch": 0.09724358634346435, + "grad_norm": 8.005057334899902, + "learning_rate": 4.837927356094226e-05, + "loss": 0.9393, + "step": 11000 + }, + { + "epoch": 0.09733198960377659, + "grad_norm": 5.7164764404296875, + "learning_rate": 4.837780017327039e-05, + "loss": 0.9266, + "step": 11010 + }, + { + "epoch": 0.09742039286408882, + "grad_norm": 1.6349455118179321, + "learning_rate": 4.837632678559852e-05, + "loss": 0.7811, + "step": 11020 + }, + { + "epoch": 0.09750879612440107, + "grad_norm": 3.9553287029266357, + "learning_rate": 4.837485339792665e-05, + "loss": 0.9023, + "step": 11030 + }, + { + "epoch": 0.09759719938471331, + "grad_norm": 3.272874355316162, + "learning_rate": 4.837338001025478e-05, + "loss": 0.7611, + "step": 11040 + }, + { + "epoch": 0.09768560264502554, + "grad_norm": 8.173721313476562, + "learning_rate": 4.837190662258291e-05, + "loss": 0.9666, + "step": 11050 + }, + { + "epoch": 0.09777400590533779, + "grad_norm": 5.934161186218262, + "learning_rate": 4.837043323491104e-05, + "loss": 0.8923, + "step": 11060 + }, + { + "epoch": 0.09786240916565003, + "grad_norm": 5.214540004730225, + "learning_rate": 4.836895984723917e-05, + "loss": 0.8765, + "step": 11070 + }, + { + "epoch": 0.09795081242596226, + "grad_norm": 2.524299144744873, + "learning_rate": 4.8367486459567296e-05, + "loss": 0.9086, + "step": 11080 + }, + { + "epoch": 0.09803921568627451, + "grad_norm": 5.146005153656006, + "learning_rate": 4.8366013071895424e-05, + "loss": 0.7716, + "step": 11090 + }, + { + "epoch": 0.09812761894658675, + "grad_norm": 4.304093837738037, + "learning_rate": 4.836453968422356e-05, + "loss": 0.8965, + "step": 11100 + }, + { + "epoch": 0.09821602220689898, + "grad_norm": 4.954878807067871, + "learning_rate": 4.836306629655169e-05, + "loss": 0.962, + "step": 11110 + }, + { + "epoch": 0.09830442546721123, + "grad_norm": 6.129110336303711, + "learning_rate": 4.8361592908879816e-05, + "loss": 0.9413, + "step": 11120 + }, + { + "epoch": 0.09839282872752347, + "grad_norm": 7.776825904846191, + "learning_rate": 4.8360119521207945e-05, + "loss": 0.8564, + "step": 11130 + }, + { + "epoch": 0.0984812319878357, + "grad_norm": 4.105815887451172, + "learning_rate": 4.835864613353607e-05, + "loss": 0.7796, + "step": 11140 + }, + { + "epoch": 0.09856963524814795, + "grad_norm": 4.403508186340332, + "learning_rate": 4.83571727458642e-05, + "loss": 1.0089, + "step": 11150 + }, + { + "epoch": 0.0986580385084602, + "grad_norm": 3.8178341388702393, + "learning_rate": 4.835569935819233e-05, + "loss": 1.0437, + "step": 11160 + }, + { + "epoch": 0.09874644176877244, + "grad_norm": 5.4979753494262695, + "learning_rate": 4.8354225970520465e-05, + "loss": 0.961, + "step": 11170 + }, + { + "epoch": 0.09883484502908467, + "grad_norm": 6.354760646820068, + "learning_rate": 4.835275258284859e-05, + "loss": 0.8086, + "step": 11180 + }, + { + "epoch": 0.09892324828939691, + "grad_norm": 3.0523722171783447, + "learning_rate": 4.835127919517672e-05, + "loss": 0.8666, + "step": 11190 + }, + { + "epoch": 0.09901165154970916, + "grad_norm": 6.463064193725586, + "learning_rate": 4.834980580750485e-05, + "loss": 0.8575, + "step": 11200 + }, + { + "epoch": 0.09910005481002139, + "grad_norm": 3.113537549972534, + "learning_rate": 4.834833241983298e-05, + "loss": 0.7764, + "step": 11210 + }, + { + "epoch": 0.09918845807033363, + "grad_norm": 6.625050067901611, + "learning_rate": 4.8346859032161107e-05, + "loss": 0.9465, + "step": 11220 + }, + { + "epoch": 0.09927686133064588, + "grad_norm": 3.369560480117798, + "learning_rate": 4.834538564448924e-05, + "loss": 0.8409, + "step": 11230 + }, + { + "epoch": 0.09936526459095811, + "grad_norm": 2.803184986114502, + "learning_rate": 4.834391225681737e-05, + "loss": 0.8473, + "step": 11240 + }, + { + "epoch": 0.09945366785127036, + "grad_norm": 2.738548517227173, + "learning_rate": 4.83424388691455e-05, + "loss": 0.8101, + "step": 11250 + }, + { + "epoch": 0.0995420711115826, + "grad_norm": 4.669358730316162, + "learning_rate": 4.834096548147363e-05, + "loss": 0.9251, + "step": 11260 + }, + { + "epoch": 0.09963047437189483, + "grad_norm": 10.148649215698242, + "learning_rate": 4.8339492093801755e-05, + "loss": 0.9198, + "step": 11270 + }, + { + "epoch": 0.09971887763220708, + "grad_norm": 9.202265739440918, + "learning_rate": 4.8338018706129883e-05, + "loss": 0.9573, + "step": 11280 + }, + { + "epoch": 0.09980728089251932, + "grad_norm": 6.037299633026123, + "learning_rate": 4.833654531845802e-05, + "loss": 0.7422, + "step": 11290 + }, + { + "epoch": 0.09989568415283155, + "grad_norm": 8.946399688720703, + "learning_rate": 4.833507193078614e-05, + "loss": 0.8829, + "step": 11300 + }, + { + "epoch": 0.0999840874131438, + "grad_norm": 6.282362937927246, + "learning_rate": 4.8333598543114275e-05, + "loss": 0.9511, + "step": 11310 + }, + { + "epoch": 0.10007249067345604, + "grad_norm": 5.291918754577637, + "learning_rate": 4.8332125155442404e-05, + "loss": 0.9102, + "step": 11320 + }, + { + "epoch": 0.10016089393376827, + "grad_norm": 4.973713397979736, + "learning_rate": 4.833065176777053e-05, + "loss": 0.9344, + "step": 11330 + }, + { + "epoch": 0.10024929719408052, + "grad_norm": 4.601949691772461, + "learning_rate": 4.832917838009866e-05, + "loss": 0.8408, + "step": 11340 + }, + { + "epoch": 0.10033770045439276, + "grad_norm": 7.364986896514893, + "learning_rate": 4.8327704992426795e-05, + "loss": 0.8399, + "step": 11350 + }, + { + "epoch": 0.10042610371470499, + "grad_norm": 5.818500995635986, + "learning_rate": 4.832623160475492e-05, + "loss": 0.7574, + "step": 11360 + }, + { + "epoch": 0.10051450697501724, + "grad_norm": 10.510309219360352, + "learning_rate": 4.832475821708305e-05, + "loss": 0.861, + "step": 11370 + }, + { + "epoch": 0.10060291023532948, + "grad_norm": 2.829066514968872, + "learning_rate": 4.8323284829411174e-05, + "loss": 0.8238, + "step": 11380 + }, + { + "epoch": 0.10069131349564171, + "grad_norm": 12.972811698913574, + "learning_rate": 4.832181144173931e-05, + "loss": 0.8737, + "step": 11390 + }, + { + "epoch": 0.10077971675595396, + "grad_norm": 6.312592506408691, + "learning_rate": 4.832033805406744e-05, + "loss": 0.8533, + "step": 11400 + }, + { + "epoch": 0.1008681200162662, + "grad_norm": 3.509385108947754, + "learning_rate": 4.8318864666395566e-05, + "loss": 0.836, + "step": 11410 + }, + { + "epoch": 0.10095652327657845, + "grad_norm": 4.647274494171143, + "learning_rate": 4.8317391278723694e-05, + "loss": 0.8366, + "step": 11420 + }, + { + "epoch": 0.10104492653689068, + "grad_norm": 12.710670471191406, + "learning_rate": 4.831591789105183e-05, + "loss": 0.8516, + "step": 11430 + }, + { + "epoch": 0.10113332979720292, + "grad_norm": 4.196151256561279, + "learning_rate": 4.831444450337995e-05, + "loss": 0.8707, + "step": 11440 + }, + { + "epoch": 0.10122173305751517, + "grad_norm": 3.9043242931365967, + "learning_rate": 4.8312971115708086e-05, + "loss": 0.9092, + "step": 11450 + }, + { + "epoch": 0.1013101363178274, + "grad_norm": 5.994088649749756, + "learning_rate": 4.8311497728036214e-05, + "loss": 0.9521, + "step": 11460 + }, + { + "epoch": 0.10139853957813964, + "grad_norm": 4.444270610809326, + "learning_rate": 4.831002434036434e-05, + "loss": 0.9715, + "step": 11470 + }, + { + "epoch": 0.10148694283845189, + "grad_norm": 6.034897804260254, + "learning_rate": 4.830855095269247e-05, + "loss": 0.8756, + "step": 11480 + }, + { + "epoch": 0.10157534609876412, + "grad_norm": 6.750916004180908, + "learning_rate": 4.83070775650206e-05, + "loss": 0.8579, + "step": 11490 + }, + { + "epoch": 0.10166374935907636, + "grad_norm": 4.86402702331543, + "learning_rate": 4.830560417734873e-05, + "loss": 0.8653, + "step": 11500 + }, + { + "epoch": 0.10175215261938861, + "grad_norm": 5.484646797180176, + "learning_rate": 4.830413078967686e-05, + "loss": 0.8304, + "step": 11510 + }, + { + "epoch": 0.10184055587970084, + "grad_norm": 10.572221755981445, + "learning_rate": 4.8302657402004984e-05, + "loss": 0.9065, + "step": 11520 + }, + { + "epoch": 0.10192895914001308, + "grad_norm": 4.579751014709473, + "learning_rate": 4.830118401433312e-05, + "loss": 0.9681, + "step": 11530 + }, + { + "epoch": 0.10201736240032533, + "grad_norm": 3.4729080200195312, + "learning_rate": 4.829971062666125e-05, + "loss": 0.8756, + "step": 11540 + }, + { + "epoch": 0.10210576566063756, + "grad_norm": 6.62471342086792, + "learning_rate": 4.8298237238989376e-05, + "loss": 0.8131, + "step": 11550 + }, + { + "epoch": 0.1021941689209498, + "grad_norm": 6.508872032165527, + "learning_rate": 4.8296763851317504e-05, + "loss": 1.0484, + "step": 11560 + }, + { + "epoch": 0.10228257218126205, + "grad_norm": 7.294726848602295, + "learning_rate": 4.829529046364564e-05, + "loss": 0.8732, + "step": 11570 + }, + { + "epoch": 0.10237097544157428, + "grad_norm": 6.782046318054199, + "learning_rate": 4.829381707597376e-05, + "loss": 0.9028, + "step": 11580 + }, + { + "epoch": 0.10245937870188652, + "grad_norm": 5.460529804229736, + "learning_rate": 4.8292343688301896e-05, + "loss": 0.949, + "step": 11590 + }, + { + "epoch": 0.10254778196219877, + "grad_norm": 5.221329689025879, + "learning_rate": 4.8290870300630025e-05, + "loss": 0.811, + "step": 11600 + }, + { + "epoch": 0.102636185222511, + "grad_norm": 5.675347805023193, + "learning_rate": 4.828939691295815e-05, + "loss": 0.9441, + "step": 11610 + }, + { + "epoch": 0.10272458848282325, + "grad_norm": 6.541971206665039, + "learning_rate": 4.828792352528628e-05, + "loss": 0.8346, + "step": 11620 + }, + { + "epoch": 0.10281299174313549, + "grad_norm": 2.437593936920166, + "learning_rate": 4.828645013761441e-05, + "loss": 0.8141, + "step": 11630 + }, + { + "epoch": 0.10290139500344772, + "grad_norm": 4.701081275939941, + "learning_rate": 4.828497674994254e-05, + "loss": 0.7186, + "step": 11640 + }, + { + "epoch": 0.10298979826375997, + "grad_norm": 8.1825532913208, + "learning_rate": 4.828350336227067e-05, + "loss": 0.9811, + "step": 11650 + }, + { + "epoch": 0.10307820152407221, + "grad_norm": 4.1160125732421875, + "learning_rate": 4.8282029974598795e-05, + "loss": 0.939, + "step": 11660 + }, + { + "epoch": 0.10316660478438444, + "grad_norm": 7.095728397369385, + "learning_rate": 4.828055658692693e-05, + "loss": 0.7836, + "step": 11670 + }, + { + "epoch": 0.10325500804469669, + "grad_norm": 10.679615020751953, + "learning_rate": 4.827908319925506e-05, + "loss": 0.8775, + "step": 11680 + }, + { + "epoch": 0.10334341130500893, + "grad_norm": 3.686213970184326, + "learning_rate": 4.8277609811583187e-05, + "loss": 0.8241, + "step": 11690 + }, + { + "epoch": 0.10343181456532118, + "grad_norm": 4.497840404510498, + "learning_rate": 4.8276136423911315e-05, + "loss": 0.795, + "step": 11700 + }, + { + "epoch": 0.1035202178256334, + "grad_norm": 5.162242889404297, + "learning_rate": 4.827466303623945e-05, + "loss": 0.9048, + "step": 11710 + }, + { + "epoch": 0.10360862108594565, + "grad_norm": 8.991860389709473, + "learning_rate": 4.827318964856757e-05, + "loss": 0.9184, + "step": 11720 + }, + { + "epoch": 0.1036970243462579, + "grad_norm": 5.413891315460205, + "learning_rate": 4.827171626089571e-05, + "loss": 0.8428, + "step": 11730 + }, + { + "epoch": 0.10378542760657013, + "grad_norm": 6.8643412590026855, + "learning_rate": 4.827024287322383e-05, + "loss": 0.9267, + "step": 11740 + }, + { + "epoch": 0.10387383086688237, + "grad_norm": 4.406735897064209, + "learning_rate": 4.8268769485551963e-05, + "loss": 0.9461, + "step": 11750 + }, + { + "epoch": 0.10396223412719462, + "grad_norm": 5.080199241638184, + "learning_rate": 4.826729609788009e-05, + "loss": 0.8362, + "step": 11760 + }, + { + "epoch": 0.10405063738750685, + "grad_norm": 4.05060338973999, + "learning_rate": 4.826582271020822e-05, + "loss": 0.7967, + "step": 11770 + }, + { + "epoch": 0.10413904064781909, + "grad_norm": 5.957949638366699, + "learning_rate": 4.826434932253635e-05, + "loss": 0.8772, + "step": 11780 + }, + { + "epoch": 0.10422744390813134, + "grad_norm": 3.757291555404663, + "learning_rate": 4.8262875934864484e-05, + "loss": 0.8104, + "step": 11790 + }, + { + "epoch": 0.10431584716844357, + "grad_norm": 6.859879016876221, + "learning_rate": 4.8261402547192605e-05, + "loss": 0.8481, + "step": 11800 + }, + { + "epoch": 0.10440425042875581, + "grad_norm": 3.6826179027557373, + "learning_rate": 4.825992915952074e-05, + "loss": 0.8707, + "step": 11810 + }, + { + "epoch": 0.10449265368906806, + "grad_norm": 2.7624754905700684, + "learning_rate": 4.825845577184887e-05, + "loss": 0.7285, + "step": 11820 + }, + { + "epoch": 0.10458105694938029, + "grad_norm": 4.933928966522217, + "learning_rate": 4.8256982384177e-05, + "loss": 0.8739, + "step": 11830 + }, + { + "epoch": 0.10466946020969253, + "grad_norm": 6.146503925323486, + "learning_rate": 4.8255508996505125e-05, + "loss": 0.8291, + "step": 11840 + }, + { + "epoch": 0.10475786347000478, + "grad_norm": 5.96083402633667, + "learning_rate": 4.8254035608833254e-05, + "loss": 0.7992, + "step": 11850 + }, + { + "epoch": 0.10484626673031701, + "grad_norm": 3.181730031967163, + "learning_rate": 4.825256222116138e-05, + "loss": 0.8775, + "step": 11860 + }, + { + "epoch": 0.10493466999062925, + "grad_norm": 4.656289100646973, + "learning_rate": 4.825108883348952e-05, + "loss": 0.9216, + "step": 11870 + }, + { + "epoch": 0.1050230732509415, + "grad_norm": 3.093475103378296, + "learning_rate": 4.824961544581764e-05, + "loss": 0.8623, + "step": 11880 + }, + { + "epoch": 0.10511147651125373, + "grad_norm": 10.09500503540039, + "learning_rate": 4.8248142058145774e-05, + "loss": 0.8994, + "step": 11890 + }, + { + "epoch": 0.10519987977156597, + "grad_norm": 8.591645240783691, + "learning_rate": 4.82466686704739e-05, + "loss": 0.7397, + "step": 11900 + }, + { + "epoch": 0.10528828303187822, + "grad_norm": 5.774130344390869, + "learning_rate": 4.824519528280203e-05, + "loss": 0.8121, + "step": 11910 + }, + { + "epoch": 0.10537668629219045, + "grad_norm": 4.944594383239746, + "learning_rate": 4.824372189513016e-05, + "loss": 0.7577, + "step": 11920 + }, + { + "epoch": 0.1054650895525027, + "grad_norm": 10.666511535644531, + "learning_rate": 4.8242248507458294e-05, + "loss": 0.8921, + "step": 11930 + }, + { + "epoch": 0.10555349281281494, + "grad_norm": 4.933461666107178, + "learning_rate": 4.8240775119786416e-05, + "loss": 0.9285, + "step": 11940 + }, + { + "epoch": 0.10564189607312717, + "grad_norm": 5.876432418823242, + "learning_rate": 4.823930173211455e-05, + "loss": 0.9049, + "step": 11950 + }, + { + "epoch": 0.10573029933343941, + "grad_norm": 2.357940673828125, + "learning_rate": 4.823782834444268e-05, + "loss": 0.8446, + "step": 11960 + }, + { + "epoch": 0.10581870259375166, + "grad_norm": 11.82457447052002, + "learning_rate": 4.823635495677081e-05, + "loss": 0.9313, + "step": 11970 + }, + { + "epoch": 0.1059071058540639, + "grad_norm": 7.283470153808594, + "learning_rate": 4.8234881569098936e-05, + "loss": 0.7034, + "step": 11980 + }, + { + "epoch": 0.10599550911437614, + "grad_norm": 5.469303131103516, + "learning_rate": 4.8233408181427064e-05, + "loss": 0.7876, + "step": 11990 + }, + { + "epoch": 0.10608391237468838, + "grad_norm": 6.705424785614014, + "learning_rate": 4.823193479375519e-05, + "loss": 0.81, + "step": 12000 + }, + { + "epoch": 0.10617231563500062, + "grad_norm": 11.108362197875977, + "learning_rate": 4.823046140608333e-05, + "loss": 0.8724, + "step": 12010 + }, + { + "epoch": 0.10626071889531286, + "grad_norm": 3.9691977500915527, + "learning_rate": 4.8228988018411456e-05, + "loss": 0.8841, + "step": 12020 + }, + { + "epoch": 0.1063491221556251, + "grad_norm": 4.791903972625732, + "learning_rate": 4.8227514630739585e-05, + "loss": 0.888, + "step": 12030 + }, + { + "epoch": 0.10643752541593735, + "grad_norm": 3.9145760536193848, + "learning_rate": 4.822604124306771e-05, + "loss": 0.8979, + "step": 12040 + }, + { + "epoch": 0.10652592867624958, + "grad_norm": 7.0544281005859375, + "learning_rate": 4.822456785539584e-05, + "loss": 0.7288, + "step": 12050 + }, + { + "epoch": 0.10661433193656182, + "grad_norm": 5.192449569702148, + "learning_rate": 4.822309446772397e-05, + "loss": 0.9639, + "step": 12060 + }, + { + "epoch": 0.10670273519687407, + "grad_norm": 5.621539115905762, + "learning_rate": 4.8221621080052105e-05, + "loss": 0.8467, + "step": 12070 + }, + { + "epoch": 0.1067911384571863, + "grad_norm": 4.238648891448975, + "learning_rate": 4.822014769238023e-05, + "loss": 0.8116, + "step": 12080 + }, + { + "epoch": 0.10687954171749854, + "grad_norm": 5.105109214782715, + "learning_rate": 4.821867430470836e-05, + "loss": 0.9827, + "step": 12090 + }, + { + "epoch": 0.10696794497781079, + "grad_norm": 5.218087196350098, + "learning_rate": 4.821720091703649e-05, + "loss": 0.9326, + "step": 12100 + }, + { + "epoch": 0.10705634823812302, + "grad_norm": 9.325528144836426, + "learning_rate": 4.821572752936462e-05, + "loss": 0.9618, + "step": 12110 + }, + { + "epoch": 0.10714475149843526, + "grad_norm": 4.683792591094971, + "learning_rate": 4.8214254141692746e-05, + "loss": 0.8442, + "step": 12120 + }, + { + "epoch": 0.1072331547587475, + "grad_norm": 5.67968225479126, + "learning_rate": 4.8212780754020875e-05, + "loss": 0.8684, + "step": 12130 + }, + { + "epoch": 0.10732155801905974, + "grad_norm": 5.964956760406494, + "learning_rate": 4.821130736634901e-05, + "loss": 0.8322, + "step": 12140 + }, + { + "epoch": 0.10740996127937198, + "grad_norm": 4.661000728607178, + "learning_rate": 4.820983397867714e-05, + "loss": 0.8833, + "step": 12150 + }, + { + "epoch": 0.10749836453968423, + "grad_norm": 7.7476420402526855, + "learning_rate": 4.820836059100527e-05, + "loss": 0.9018, + "step": 12160 + }, + { + "epoch": 0.10758676779999646, + "grad_norm": 3.875717878341675, + "learning_rate": 4.8206887203333395e-05, + "loss": 0.8409, + "step": 12170 + }, + { + "epoch": 0.1076751710603087, + "grad_norm": 3.472538948059082, + "learning_rate": 4.820541381566152e-05, + "loss": 0.8306, + "step": 12180 + }, + { + "epoch": 0.10776357432062095, + "grad_norm": 5.734470367431641, + "learning_rate": 4.820394042798965e-05, + "loss": 0.9054, + "step": 12190 + }, + { + "epoch": 0.10785197758093318, + "grad_norm": 3.783080577850342, + "learning_rate": 4.820246704031779e-05, + "loss": 0.833, + "step": 12200 + }, + { + "epoch": 0.10794038084124542, + "grad_norm": 7.443694114685059, + "learning_rate": 4.820099365264591e-05, + "loss": 0.9199, + "step": 12210 + }, + { + "epoch": 0.10802878410155767, + "grad_norm": 7.434304714202881, + "learning_rate": 4.8199520264974044e-05, + "loss": 0.8747, + "step": 12220 + }, + { + "epoch": 0.10811718736186991, + "grad_norm": 6.162380695343018, + "learning_rate": 4.819804687730217e-05, + "loss": 0.9705, + "step": 12230 + }, + { + "epoch": 0.10820559062218214, + "grad_norm": 7.422550201416016, + "learning_rate": 4.81965734896303e-05, + "loss": 0.8789, + "step": 12240 + }, + { + "epoch": 0.10829399388249439, + "grad_norm": 3.3764584064483643, + "learning_rate": 4.819510010195843e-05, + "loss": 0.8193, + "step": 12250 + }, + { + "epoch": 0.10838239714280663, + "grad_norm": 5.908439636230469, + "learning_rate": 4.8193626714286564e-05, + "loss": 0.9523, + "step": 12260 + }, + { + "epoch": 0.10847080040311886, + "grad_norm": 5.220740795135498, + "learning_rate": 4.8192153326614685e-05, + "loss": 0.7079, + "step": 12270 + }, + { + "epoch": 0.10855920366343111, + "grad_norm": 4.891901969909668, + "learning_rate": 4.819067993894282e-05, + "loss": 0.9311, + "step": 12280 + }, + { + "epoch": 0.10864760692374335, + "grad_norm": 2.5737478733062744, + "learning_rate": 4.818920655127095e-05, + "loss": 0.8561, + "step": 12290 + }, + { + "epoch": 0.10873601018405558, + "grad_norm": 3.4949209690093994, + "learning_rate": 4.818773316359908e-05, + "loss": 0.8153, + "step": 12300 + }, + { + "epoch": 0.10882441344436783, + "grad_norm": 4.714328289031982, + "learning_rate": 4.8186259775927206e-05, + "loss": 0.7879, + "step": 12310 + }, + { + "epoch": 0.10891281670468007, + "grad_norm": 5.5439958572387695, + "learning_rate": 4.8184786388255334e-05, + "loss": 0.8124, + "step": 12320 + }, + { + "epoch": 0.1090012199649923, + "grad_norm": 3.950995683670044, + "learning_rate": 4.818331300058346e-05, + "loss": 0.9348, + "step": 12330 + }, + { + "epoch": 0.10908962322530455, + "grad_norm": 3.3742518424987793, + "learning_rate": 4.81818396129116e-05, + "loss": 0.8962, + "step": 12340 + }, + { + "epoch": 0.1091780264856168, + "grad_norm": 3.243555784225464, + "learning_rate": 4.818036622523972e-05, + "loss": 0.868, + "step": 12350 + }, + { + "epoch": 0.10926642974592903, + "grad_norm": 5.580383777618408, + "learning_rate": 4.8178892837567854e-05, + "loss": 0.941, + "step": 12360 + }, + { + "epoch": 0.10935483300624127, + "grad_norm": 3.664823532104492, + "learning_rate": 4.817741944989598e-05, + "loss": 0.7888, + "step": 12370 + }, + { + "epoch": 0.10944323626655351, + "grad_norm": 4.724151134490967, + "learning_rate": 4.817594606222411e-05, + "loss": 0.8237, + "step": 12380 + }, + { + "epoch": 0.10953163952686575, + "grad_norm": 2.60208797454834, + "learning_rate": 4.817447267455224e-05, + "loss": 0.8005, + "step": 12390 + }, + { + "epoch": 0.10962004278717799, + "grad_norm": 7.287562370300293, + "learning_rate": 4.8172999286880374e-05, + "loss": 0.8841, + "step": 12400 + }, + { + "epoch": 0.10970844604749024, + "grad_norm": 5.459963321685791, + "learning_rate": 4.8171525899208496e-05, + "loss": 0.8946, + "step": 12410 + }, + { + "epoch": 0.10979684930780247, + "grad_norm": 5.479110240936279, + "learning_rate": 4.817005251153663e-05, + "loss": 0.8602, + "step": 12420 + }, + { + "epoch": 0.10988525256811471, + "grad_norm": 7.66988468170166, + "learning_rate": 4.816857912386475e-05, + "loss": 0.8133, + "step": 12430 + }, + { + "epoch": 0.10997365582842696, + "grad_norm": 5.17045783996582, + "learning_rate": 4.816710573619289e-05, + "loss": 0.6647, + "step": 12440 + }, + { + "epoch": 0.11006205908873919, + "grad_norm": 7.7116241455078125, + "learning_rate": 4.8165632348521016e-05, + "loss": 0.9312, + "step": 12450 + }, + { + "epoch": 0.11015046234905143, + "grad_norm": 5.106606483459473, + "learning_rate": 4.8164158960849144e-05, + "loss": 0.9675, + "step": 12460 + }, + { + "epoch": 0.11023886560936368, + "grad_norm": 4.0043721199035645, + "learning_rate": 4.816268557317727e-05, + "loss": 0.8789, + "step": 12470 + }, + { + "epoch": 0.1103272688696759, + "grad_norm": 3.060173273086548, + "learning_rate": 4.816121218550541e-05, + "loss": 0.7391, + "step": 12480 + }, + { + "epoch": 0.11041567212998815, + "grad_norm": 6.218465328216553, + "learning_rate": 4.815973879783353e-05, + "loss": 0.7834, + "step": 12490 + }, + { + "epoch": 0.1105040753903004, + "grad_norm": 6.211796283721924, + "learning_rate": 4.8158265410161665e-05, + "loss": 0.8404, + "step": 12500 + }, + { + "epoch": 0.11059247865061264, + "grad_norm": 10.186081886291504, + "learning_rate": 4.815679202248979e-05, + "loss": 0.8758, + "step": 12510 + }, + { + "epoch": 0.11068088191092487, + "grad_norm": 4.9329962730407715, + "learning_rate": 4.815531863481792e-05, + "loss": 0.9294, + "step": 12520 + }, + { + "epoch": 0.11076928517123712, + "grad_norm": 6.29591703414917, + "learning_rate": 4.815384524714605e-05, + "loss": 0.9319, + "step": 12530 + }, + { + "epoch": 0.11085768843154936, + "grad_norm": 4.162048816680908, + "learning_rate": 4.8152371859474185e-05, + "loss": 0.8672, + "step": 12540 + }, + { + "epoch": 0.11094609169186159, + "grad_norm": 5.286823272705078, + "learning_rate": 4.8150898471802306e-05, + "loss": 0.8435, + "step": 12550 + }, + { + "epoch": 0.11103449495217384, + "grad_norm": 13.852834701538086, + "learning_rate": 4.814942508413044e-05, + "loss": 0.922, + "step": 12560 + }, + { + "epoch": 0.11112289821248608, + "grad_norm": 7.8984270095825195, + "learning_rate": 4.814795169645856e-05, + "loss": 0.8533, + "step": 12570 + }, + { + "epoch": 0.11121130147279831, + "grad_norm": 3.034083604812622, + "learning_rate": 4.81464783087867e-05, + "loss": 0.7907, + "step": 12580 + }, + { + "epoch": 0.11129970473311056, + "grad_norm": 14.976812362670898, + "learning_rate": 4.8145004921114827e-05, + "loss": 0.9485, + "step": 12590 + }, + { + "epoch": 0.1113881079934228, + "grad_norm": 6.471790313720703, + "learning_rate": 4.8143531533442955e-05, + "loss": 0.8911, + "step": 12600 + }, + { + "epoch": 0.11147651125373503, + "grad_norm": 5.903090953826904, + "learning_rate": 4.814205814577108e-05, + "loss": 0.8458, + "step": 12610 + }, + { + "epoch": 0.11156491451404728, + "grad_norm": 3.115103244781494, + "learning_rate": 4.814058475809922e-05, + "loss": 0.6789, + "step": 12620 + }, + { + "epoch": 0.11165331777435952, + "grad_norm": 3.257558822631836, + "learning_rate": 4.813911137042734e-05, + "loss": 0.891, + "step": 12630 + }, + { + "epoch": 0.11174172103467175, + "grad_norm": 4.755401134490967, + "learning_rate": 4.8137637982755475e-05, + "loss": 0.8643, + "step": 12640 + }, + { + "epoch": 0.111830124294984, + "grad_norm": 6.717006683349609, + "learning_rate": 4.8136164595083603e-05, + "loss": 0.7305, + "step": 12650 + }, + { + "epoch": 0.11191852755529624, + "grad_norm": 6.6784234046936035, + "learning_rate": 4.813469120741173e-05, + "loss": 0.9982, + "step": 12660 + }, + { + "epoch": 0.11200693081560847, + "grad_norm": 6.589071273803711, + "learning_rate": 4.813321781973986e-05, + "loss": 0.7812, + "step": 12670 + }, + { + "epoch": 0.11209533407592072, + "grad_norm": 5.508271217346191, + "learning_rate": 4.813174443206799e-05, + "loss": 0.8973, + "step": 12680 + }, + { + "epoch": 0.11218373733623296, + "grad_norm": 12.26569938659668, + "learning_rate": 4.813027104439612e-05, + "loss": 0.8188, + "step": 12690 + }, + { + "epoch": 0.1122721405965452, + "grad_norm": 3.524712085723877, + "learning_rate": 4.812879765672425e-05, + "loss": 0.8285, + "step": 12700 + }, + { + "epoch": 0.11236054385685744, + "grad_norm": 12.058805465698242, + "learning_rate": 4.8127324269052374e-05, + "loss": 0.8275, + "step": 12710 + }, + { + "epoch": 0.11244894711716968, + "grad_norm": 9.321803092956543, + "learning_rate": 4.812585088138051e-05, + "loss": 0.9625, + "step": 12720 + }, + { + "epoch": 0.11253735037748192, + "grad_norm": 5.731935977935791, + "learning_rate": 4.812437749370864e-05, + "loss": 0.9214, + "step": 12730 + }, + { + "epoch": 0.11262575363779416, + "grad_norm": 5.440855979919434, + "learning_rate": 4.8122904106036765e-05, + "loss": 0.9237, + "step": 12740 + }, + { + "epoch": 0.1127141568981064, + "grad_norm": 4.87943172454834, + "learning_rate": 4.8121430718364894e-05, + "loss": 0.8803, + "step": 12750 + }, + { + "epoch": 0.11280256015841865, + "grad_norm": 9.990445137023926, + "learning_rate": 4.811995733069303e-05, + "loss": 0.7905, + "step": 12760 + }, + { + "epoch": 0.11289096341873088, + "grad_norm": 7.565195083618164, + "learning_rate": 4.811848394302115e-05, + "loss": 0.9275, + "step": 12770 + }, + { + "epoch": 0.11297936667904313, + "grad_norm": 9.698315620422363, + "learning_rate": 4.8117010555349286e-05, + "loss": 0.8425, + "step": 12780 + }, + { + "epoch": 0.11306776993935537, + "grad_norm": 4.264374732971191, + "learning_rate": 4.811553716767741e-05, + "loss": 0.8511, + "step": 12790 + }, + { + "epoch": 0.1131561731996676, + "grad_norm": 3.337890148162842, + "learning_rate": 4.811406378000554e-05, + "loss": 1.0133, + "step": 12800 + }, + { + "epoch": 0.11324457645997985, + "grad_norm": 3.5003468990325928, + "learning_rate": 4.811259039233367e-05, + "loss": 0.9532, + "step": 12810 + }, + { + "epoch": 0.11333297972029209, + "grad_norm": 3.5763087272644043, + "learning_rate": 4.81111170046618e-05, + "loss": 0.8774, + "step": 12820 + }, + { + "epoch": 0.11342138298060432, + "grad_norm": 5.544234275817871, + "learning_rate": 4.810964361698993e-05, + "loss": 0.8788, + "step": 12830 + }, + { + "epoch": 0.11350978624091657, + "grad_norm": 8.555781364440918, + "learning_rate": 4.810817022931806e-05, + "loss": 0.9446, + "step": 12840 + }, + { + "epoch": 0.11359818950122881, + "grad_norm": 2.7551257610321045, + "learning_rate": 4.8106696841646184e-05, + "loss": 0.9086, + "step": 12850 + }, + { + "epoch": 0.11368659276154104, + "grad_norm": 4.804686546325684, + "learning_rate": 4.810522345397432e-05, + "loss": 0.8292, + "step": 12860 + }, + { + "epoch": 0.11377499602185329, + "grad_norm": 6.941967487335205, + "learning_rate": 4.810375006630245e-05, + "loss": 0.7287, + "step": 12870 + }, + { + "epoch": 0.11386339928216553, + "grad_norm": 5.422636032104492, + "learning_rate": 4.8102276678630576e-05, + "loss": 0.7451, + "step": 12880 + }, + { + "epoch": 0.11395180254247776, + "grad_norm": 5.8049397468566895, + "learning_rate": 4.8100803290958704e-05, + "loss": 0.8179, + "step": 12890 + }, + { + "epoch": 0.11404020580279, + "grad_norm": 4.307275772094727, + "learning_rate": 4.809932990328683e-05, + "loss": 0.7998, + "step": 12900 + }, + { + "epoch": 0.11412860906310225, + "grad_norm": 5.677903175354004, + "learning_rate": 4.809785651561496e-05, + "loss": 0.9389, + "step": 12910 + }, + { + "epoch": 0.11421701232341448, + "grad_norm": 4.883234977722168, + "learning_rate": 4.8096383127943096e-05, + "loss": 1.0766, + "step": 12920 + }, + { + "epoch": 0.11430541558372673, + "grad_norm": 6.127511501312256, + "learning_rate": 4.8094909740271224e-05, + "loss": 0.8464, + "step": 12930 + }, + { + "epoch": 0.11439381884403897, + "grad_norm": 3.314553737640381, + "learning_rate": 4.809343635259935e-05, + "loss": 0.841, + "step": 12940 + }, + { + "epoch": 0.1144822221043512, + "grad_norm": 9.716215133666992, + "learning_rate": 4.809196296492748e-05, + "loss": 0.828, + "step": 12950 + }, + { + "epoch": 0.11457062536466345, + "grad_norm": 3.888396739959717, + "learning_rate": 4.809048957725561e-05, + "loss": 0.7595, + "step": 12960 + }, + { + "epoch": 0.11465902862497569, + "grad_norm": 10.254668235778809, + "learning_rate": 4.808901618958374e-05, + "loss": 0.806, + "step": 12970 + }, + { + "epoch": 0.11474743188528792, + "grad_norm": 5.166515350341797, + "learning_rate": 4.808754280191187e-05, + "loss": 0.8192, + "step": 12980 + }, + { + "epoch": 0.11483583514560017, + "grad_norm": 6.8301920890808105, + "learning_rate": 4.808606941424e-05, + "loss": 0.8468, + "step": 12990 + }, + { + "epoch": 0.11492423840591241, + "grad_norm": 6.361786365509033, + "learning_rate": 4.808459602656813e-05, + "loss": 0.9592, + "step": 13000 + }, + { + "epoch": 0.11501264166622464, + "grad_norm": 4.025946617126465, + "learning_rate": 4.808312263889626e-05, + "loss": 0.9441, + "step": 13010 + }, + { + "epoch": 0.11510104492653689, + "grad_norm": 2.533721685409546, + "learning_rate": 4.8081649251224386e-05, + "loss": 0.7379, + "step": 13020 + }, + { + "epoch": 0.11518944818684913, + "grad_norm": 13.652111053466797, + "learning_rate": 4.8080175863552515e-05, + "loss": 0.8379, + "step": 13030 + }, + { + "epoch": 0.11527785144716138, + "grad_norm": 5.801906585693359, + "learning_rate": 4.807870247588064e-05, + "loss": 0.8973, + "step": 13040 + }, + { + "epoch": 0.11536625470747361, + "grad_norm": 4.005748748779297, + "learning_rate": 4.807722908820878e-05, + "loss": 1.0268, + "step": 13050 + }, + { + "epoch": 0.11545465796778585, + "grad_norm": 19.381954193115234, + "learning_rate": 4.8075755700536907e-05, + "loss": 0.9041, + "step": 13060 + }, + { + "epoch": 0.1155430612280981, + "grad_norm": 4.4473981857299805, + "learning_rate": 4.8074282312865035e-05, + "loss": 0.8134, + "step": 13070 + }, + { + "epoch": 0.11563146448841033, + "grad_norm": 4.011990070343018, + "learning_rate": 4.807280892519316e-05, + "loss": 0.8182, + "step": 13080 + }, + { + "epoch": 0.11571986774872257, + "grad_norm": 6.486347198486328, + "learning_rate": 4.807133553752129e-05, + "loss": 0.8882, + "step": 13090 + }, + { + "epoch": 0.11580827100903482, + "grad_norm": 3.178736686706543, + "learning_rate": 4.806986214984942e-05, + "loss": 0.8041, + "step": 13100 + }, + { + "epoch": 0.11589667426934705, + "grad_norm": 4.373308181762695, + "learning_rate": 4.8068388762177555e-05, + "loss": 0.8849, + "step": 13110 + }, + { + "epoch": 0.1159850775296593, + "grad_norm": 5.344725608825684, + "learning_rate": 4.8066915374505684e-05, + "loss": 0.8081, + "step": 13120 + }, + { + "epoch": 0.11607348078997154, + "grad_norm": 5.3296356201171875, + "learning_rate": 4.806544198683381e-05, + "loss": 0.8636, + "step": 13130 + }, + { + "epoch": 0.11616188405028377, + "grad_norm": 4.61037015914917, + "learning_rate": 4.806396859916194e-05, + "loss": 0.7847, + "step": 13140 + }, + { + "epoch": 0.11625028731059601, + "grad_norm": 9.912908554077148, + "learning_rate": 4.806249521149007e-05, + "loss": 0.8597, + "step": 13150 + }, + { + "epoch": 0.11633869057090826, + "grad_norm": 4.560932159423828, + "learning_rate": 4.80610218238182e-05, + "loss": 0.8411, + "step": 13160 + }, + { + "epoch": 0.11642709383122049, + "grad_norm": 4.295501232147217, + "learning_rate": 4.805954843614633e-05, + "loss": 0.9372, + "step": 13170 + }, + { + "epoch": 0.11651549709153274, + "grad_norm": 6.329944133758545, + "learning_rate": 4.8058075048474454e-05, + "loss": 0.8476, + "step": 13180 + }, + { + "epoch": 0.11660390035184498, + "grad_norm": 2.4239487648010254, + "learning_rate": 4.805660166080259e-05, + "loss": 0.8588, + "step": 13190 + }, + { + "epoch": 0.11669230361215721, + "grad_norm": 3.171091318130493, + "learning_rate": 4.805512827313072e-05, + "loss": 0.8872, + "step": 13200 + }, + { + "epoch": 0.11678070687246946, + "grad_norm": 2.808485984802246, + "learning_rate": 4.8053654885458845e-05, + "loss": 0.7506, + "step": 13210 + }, + { + "epoch": 0.1168691101327817, + "grad_norm": 6.207040309906006, + "learning_rate": 4.8052181497786974e-05, + "loss": 0.9784, + "step": 13220 + }, + { + "epoch": 0.11695751339309393, + "grad_norm": 3.784930467605591, + "learning_rate": 4.805070811011511e-05, + "loss": 0.9089, + "step": 13230 + }, + { + "epoch": 0.11704591665340618, + "grad_norm": 4.8151044845581055, + "learning_rate": 4.804923472244323e-05, + "loss": 0.8305, + "step": 13240 + }, + { + "epoch": 0.11713431991371842, + "grad_norm": 4.724689483642578, + "learning_rate": 4.8047761334771366e-05, + "loss": 0.8698, + "step": 13250 + }, + { + "epoch": 0.11722272317403065, + "grad_norm": 6.355766773223877, + "learning_rate": 4.804628794709949e-05, + "loss": 0.8895, + "step": 13260 + }, + { + "epoch": 0.1173111264343429, + "grad_norm": 3.905327796936035, + "learning_rate": 4.804481455942762e-05, + "loss": 0.9173, + "step": 13270 + }, + { + "epoch": 0.11739952969465514, + "grad_norm": 6.199215412139893, + "learning_rate": 4.804334117175575e-05, + "loss": 0.9852, + "step": 13280 + }, + { + "epoch": 0.11748793295496737, + "grad_norm": 6.366835117340088, + "learning_rate": 4.804186778408388e-05, + "loss": 0.8567, + "step": 13290 + }, + { + "epoch": 0.11757633621527962, + "grad_norm": 4.903016567230225, + "learning_rate": 4.804039439641201e-05, + "loss": 0.8809, + "step": 13300 + }, + { + "epoch": 0.11766473947559186, + "grad_norm": 5.171665191650391, + "learning_rate": 4.803892100874014e-05, + "loss": 0.8251, + "step": 13310 + }, + { + "epoch": 0.1177531427359041, + "grad_norm": 10.476667404174805, + "learning_rate": 4.8037447621068264e-05, + "loss": 0.8059, + "step": 13320 + }, + { + "epoch": 0.11784154599621634, + "grad_norm": 3.8133482933044434, + "learning_rate": 4.80359742333964e-05, + "loss": 0.8106, + "step": 13330 + }, + { + "epoch": 0.11792994925652858, + "grad_norm": 8.48343563079834, + "learning_rate": 4.803450084572453e-05, + "loss": 0.9235, + "step": 13340 + }, + { + "epoch": 0.11801835251684083, + "grad_norm": 6.139083385467529, + "learning_rate": 4.8033027458052656e-05, + "loss": 0.7608, + "step": 13350 + }, + { + "epoch": 0.11810675577715306, + "grad_norm": 14.097145080566406, + "learning_rate": 4.8031554070380784e-05, + "loss": 0.841, + "step": 13360 + }, + { + "epoch": 0.1181951590374653, + "grad_norm": 7.063056468963623, + "learning_rate": 4.803008068270892e-05, + "loss": 0.8513, + "step": 13370 + }, + { + "epoch": 0.11828356229777755, + "grad_norm": 2.2021477222442627, + "learning_rate": 4.802860729503704e-05, + "loss": 0.9442, + "step": 13380 + }, + { + "epoch": 0.11837196555808978, + "grad_norm": 7.366992950439453, + "learning_rate": 4.8027133907365176e-05, + "loss": 0.9797, + "step": 13390 + }, + { + "epoch": 0.11846036881840202, + "grad_norm": 4.778909683227539, + "learning_rate": 4.80256605196933e-05, + "loss": 0.8654, + "step": 13400 + }, + { + "epoch": 0.11854877207871427, + "grad_norm": 6.330195426940918, + "learning_rate": 4.802418713202143e-05, + "loss": 0.912, + "step": 13410 + }, + { + "epoch": 0.1186371753390265, + "grad_norm": 3.0833261013031006, + "learning_rate": 4.802271374434956e-05, + "loss": 0.9457, + "step": 13420 + }, + { + "epoch": 0.11872557859933874, + "grad_norm": 2.9535045623779297, + "learning_rate": 4.802124035667769e-05, + "loss": 0.8395, + "step": 13430 + }, + { + "epoch": 0.11881398185965099, + "grad_norm": 3.640794038772583, + "learning_rate": 4.801976696900582e-05, + "loss": 0.9153, + "step": 13440 + }, + { + "epoch": 0.11890238511996322, + "grad_norm": 4.801519393920898, + "learning_rate": 4.801829358133395e-05, + "loss": 0.7588, + "step": 13450 + }, + { + "epoch": 0.11899078838027546, + "grad_norm": 6.261791706085205, + "learning_rate": 4.8016820193662075e-05, + "loss": 0.8941, + "step": 13460 + }, + { + "epoch": 0.11907919164058771, + "grad_norm": 5.783773422241211, + "learning_rate": 4.801534680599021e-05, + "loss": 0.8079, + "step": 13470 + }, + { + "epoch": 0.11916759490089994, + "grad_norm": 2.9788925647735596, + "learning_rate": 4.801387341831834e-05, + "loss": 0.8982, + "step": 13480 + }, + { + "epoch": 0.11925599816121218, + "grad_norm": 6.95315408706665, + "learning_rate": 4.8012400030646466e-05, + "loss": 0.9352, + "step": 13490 + }, + { + "epoch": 0.11934440142152443, + "grad_norm": 3.809793710708618, + "learning_rate": 4.8010926642974595e-05, + "loss": 0.9175, + "step": 13500 + }, + { + "epoch": 0.11943280468183666, + "grad_norm": 5.846870422363281, + "learning_rate": 4.800945325530272e-05, + "loss": 0.9252, + "step": 13510 + }, + { + "epoch": 0.1195212079421489, + "grad_norm": 2.3290843963623047, + "learning_rate": 4.800797986763085e-05, + "loss": 0.7772, + "step": 13520 + }, + { + "epoch": 0.11960961120246115, + "grad_norm": 3.1424500942230225, + "learning_rate": 4.800650647995899e-05, + "loss": 0.9232, + "step": 13530 + }, + { + "epoch": 0.11969801446277338, + "grad_norm": 5.945415019989014, + "learning_rate": 4.800503309228711e-05, + "loss": 0.8722, + "step": 13540 + }, + { + "epoch": 0.11978641772308563, + "grad_norm": 4.896644592285156, + "learning_rate": 4.800355970461524e-05, + "loss": 0.9078, + "step": 13550 + }, + { + "epoch": 0.11987482098339787, + "grad_norm": 3.6316447257995605, + "learning_rate": 4.800208631694337e-05, + "loss": 0.8587, + "step": 13560 + }, + { + "epoch": 0.11996322424371011, + "grad_norm": 5.4388017654418945, + "learning_rate": 4.80006129292715e-05, + "loss": 0.7887, + "step": 13570 + }, + { + "epoch": 0.12005162750402235, + "grad_norm": 5.254912376403809, + "learning_rate": 4.799913954159963e-05, + "loss": 0.8473, + "step": 13580 + }, + { + "epoch": 0.12014003076433459, + "grad_norm": 10.683406829833984, + "learning_rate": 4.7997666153927764e-05, + "loss": 0.7889, + "step": 13590 + }, + { + "epoch": 0.12022843402464684, + "grad_norm": 9.83441162109375, + "learning_rate": 4.7996192766255885e-05, + "loss": 0.7997, + "step": 13600 + }, + { + "epoch": 0.12031683728495907, + "grad_norm": 3.063049793243408, + "learning_rate": 4.799471937858402e-05, + "loss": 0.7573, + "step": 13610 + }, + { + "epoch": 0.12040524054527131, + "grad_norm": 6.09453010559082, + "learning_rate": 4.799324599091214e-05, + "loss": 0.7714, + "step": 13620 + }, + { + "epoch": 0.12049364380558356, + "grad_norm": 5.107629299163818, + "learning_rate": 4.799177260324028e-05, + "loss": 0.866, + "step": 13630 + }, + { + "epoch": 0.12058204706589579, + "grad_norm": 3.678056001663208, + "learning_rate": 4.7990299215568405e-05, + "loss": 0.8765, + "step": 13640 + }, + { + "epoch": 0.12067045032620803, + "grad_norm": 6.041098117828369, + "learning_rate": 4.7988825827896534e-05, + "loss": 0.8945, + "step": 13650 + }, + { + "epoch": 0.12075885358652028, + "grad_norm": 6.088251113891602, + "learning_rate": 4.798735244022466e-05, + "loss": 0.8592, + "step": 13660 + }, + { + "epoch": 0.12084725684683251, + "grad_norm": 6.8704423904418945, + "learning_rate": 4.79858790525528e-05, + "loss": 0.8, + "step": 13670 + }, + { + "epoch": 0.12093566010714475, + "grad_norm": 4.484743595123291, + "learning_rate": 4.798440566488092e-05, + "loss": 0.7832, + "step": 13680 + }, + { + "epoch": 0.121024063367457, + "grad_norm": 4.457681655883789, + "learning_rate": 4.7982932277209054e-05, + "loss": 0.8724, + "step": 13690 + }, + { + "epoch": 0.12111246662776923, + "grad_norm": 5.599424839019775, + "learning_rate": 4.798145888953718e-05, + "loss": 0.8824, + "step": 13700 + }, + { + "epoch": 0.12120086988808147, + "grad_norm": 3.0846285820007324, + "learning_rate": 4.797998550186531e-05, + "loss": 0.9783, + "step": 13710 + }, + { + "epoch": 0.12128927314839372, + "grad_norm": 3.4690675735473633, + "learning_rate": 4.797851211419344e-05, + "loss": 0.8959, + "step": 13720 + }, + { + "epoch": 0.12137767640870595, + "grad_norm": 4.735897064208984, + "learning_rate": 4.797703872652157e-05, + "loss": 0.8452, + "step": 13730 + }, + { + "epoch": 0.12146607966901819, + "grad_norm": 3.343179225921631, + "learning_rate": 4.7975565338849696e-05, + "loss": 0.8787, + "step": 13740 + }, + { + "epoch": 0.12155448292933044, + "grad_norm": 3.7673146724700928, + "learning_rate": 4.797409195117783e-05, + "loss": 1.0123, + "step": 13750 + }, + { + "epoch": 0.12164288618964267, + "grad_norm": 4.631723880767822, + "learning_rate": 4.797261856350595e-05, + "loss": 0.8697, + "step": 13760 + }, + { + "epoch": 0.12173128944995491, + "grad_norm": 13.146268844604492, + "learning_rate": 4.797114517583409e-05, + "loss": 0.7591, + "step": 13770 + }, + { + "epoch": 0.12181969271026716, + "grad_norm": 5.641947269439697, + "learning_rate": 4.7969671788162216e-05, + "loss": 0.8896, + "step": 13780 + }, + { + "epoch": 0.12190809597057939, + "grad_norm": 3.458003520965576, + "learning_rate": 4.7968198400490344e-05, + "loss": 0.825, + "step": 13790 + }, + { + "epoch": 0.12199649923089163, + "grad_norm": 5.369232654571533, + "learning_rate": 4.796672501281847e-05, + "loss": 0.9013, + "step": 13800 + }, + { + "epoch": 0.12208490249120388, + "grad_norm": 11.477063179016113, + "learning_rate": 4.796525162514661e-05, + "loss": 0.8588, + "step": 13810 + }, + { + "epoch": 0.12217330575151611, + "grad_norm": 3.9801762104034424, + "learning_rate": 4.796377823747473e-05, + "loss": 0.8606, + "step": 13820 + }, + { + "epoch": 0.12226170901182835, + "grad_norm": 6.173799991607666, + "learning_rate": 4.7962304849802864e-05, + "loss": 1.0348, + "step": 13830 + }, + { + "epoch": 0.1223501122721406, + "grad_norm": 3.98711895942688, + "learning_rate": 4.796083146213099e-05, + "loss": 0.858, + "step": 13840 + }, + { + "epoch": 0.12243851553245284, + "grad_norm": 9.050692558288574, + "learning_rate": 4.795935807445912e-05, + "loss": 0.9099, + "step": 13850 + }, + { + "epoch": 0.12252691879276507, + "grad_norm": 2.451476573944092, + "learning_rate": 4.795788468678725e-05, + "loss": 0.7765, + "step": 13860 + }, + { + "epoch": 0.12261532205307732, + "grad_norm": 4.387509346008301, + "learning_rate": 4.795641129911538e-05, + "loss": 0.8677, + "step": 13870 + }, + { + "epoch": 0.12270372531338956, + "grad_norm": 3.3893465995788574, + "learning_rate": 4.7954937911443506e-05, + "loss": 0.8658, + "step": 13880 + }, + { + "epoch": 0.1227921285737018, + "grad_norm": 6.2101898193359375, + "learning_rate": 4.795346452377164e-05, + "loss": 0.811, + "step": 13890 + }, + { + "epoch": 0.12288053183401404, + "grad_norm": 4.000254154205322, + "learning_rate": 4.795199113609977e-05, + "loss": 0.8461, + "step": 13900 + }, + { + "epoch": 0.12296893509432628, + "grad_norm": 4.265214920043945, + "learning_rate": 4.79505177484279e-05, + "loss": 0.9001, + "step": 13910 + }, + { + "epoch": 0.12305733835463852, + "grad_norm": 9.801916122436523, + "learning_rate": 4.7949044360756026e-05, + "loss": 0.9023, + "step": 13920 + }, + { + "epoch": 0.12314574161495076, + "grad_norm": 5.067219257354736, + "learning_rate": 4.7947570973084155e-05, + "loss": 0.7693, + "step": 13930 + }, + { + "epoch": 0.123234144875263, + "grad_norm": 5.618040084838867, + "learning_rate": 4.794609758541228e-05, + "loss": 0.9039, + "step": 13940 + }, + { + "epoch": 0.12332254813557524, + "grad_norm": 2.829528331756592, + "learning_rate": 4.794462419774042e-05, + "loss": 0.8986, + "step": 13950 + }, + { + "epoch": 0.12341095139588748, + "grad_norm": 6.598517894744873, + "learning_rate": 4.7943150810068547e-05, + "loss": 0.8015, + "step": 13960 + }, + { + "epoch": 0.12349935465619973, + "grad_norm": 8.449544906616211, + "learning_rate": 4.7941677422396675e-05, + "loss": 0.8606, + "step": 13970 + }, + { + "epoch": 0.12358775791651196, + "grad_norm": 3.056562662124634, + "learning_rate": 4.79402040347248e-05, + "loss": 0.8804, + "step": 13980 + }, + { + "epoch": 0.1236761611768242, + "grad_norm": 4.60211181640625, + "learning_rate": 4.793873064705293e-05, + "loss": 0.8114, + "step": 13990 + }, + { + "epoch": 0.12376456443713645, + "grad_norm": 11.740740776062012, + "learning_rate": 4.793725725938106e-05, + "loss": 0.835, + "step": 14000 + }, + { + "epoch": 0.12385296769744868, + "grad_norm": 4.6112775802612305, + "learning_rate": 4.793578387170919e-05, + "loss": 0.8879, + "step": 14010 + }, + { + "epoch": 0.12394137095776092, + "grad_norm": 10.848852157592773, + "learning_rate": 4.7934310484037323e-05, + "loss": 0.8923, + "step": 14020 + }, + { + "epoch": 0.12402977421807317, + "grad_norm": 3.880849599838257, + "learning_rate": 4.793283709636545e-05, + "loss": 0.8143, + "step": 14030 + }, + { + "epoch": 0.1241181774783854, + "grad_norm": 3.7946925163269043, + "learning_rate": 4.793136370869358e-05, + "loss": 0.893, + "step": 14040 + }, + { + "epoch": 0.12420658073869764, + "grad_norm": 3.187323808670044, + "learning_rate": 4.792989032102171e-05, + "loss": 0.9894, + "step": 14050 + }, + { + "epoch": 0.12429498399900989, + "grad_norm": 5.719008922576904, + "learning_rate": 4.792841693334984e-05, + "loss": 0.8328, + "step": 14060 + }, + { + "epoch": 0.12438338725932212, + "grad_norm": 1.9846103191375732, + "learning_rate": 4.7926943545677965e-05, + "loss": 0.6889, + "step": 14070 + }, + { + "epoch": 0.12447179051963436, + "grad_norm": 9.411588668823242, + "learning_rate": 4.79254701580061e-05, + "loss": 1.0346, + "step": 14080 + }, + { + "epoch": 0.1245601937799466, + "grad_norm": 4.604274272918701, + "learning_rate": 4.792399677033422e-05, + "loss": 0.8962, + "step": 14090 + }, + { + "epoch": 0.12464859704025885, + "grad_norm": 3.6185877323150635, + "learning_rate": 4.792252338266236e-05, + "loss": 0.9321, + "step": 14100 + }, + { + "epoch": 0.12473700030057108, + "grad_norm": 6.4194865226745605, + "learning_rate": 4.7921049994990485e-05, + "loss": 1.0308, + "step": 14110 + }, + { + "epoch": 0.12482540356088333, + "grad_norm": 7.311243534088135, + "learning_rate": 4.7919576607318614e-05, + "loss": 0.9134, + "step": 14120 + }, + { + "epoch": 0.12491380682119557, + "grad_norm": 2.457221508026123, + "learning_rate": 4.791810321964674e-05, + "loss": 0.8825, + "step": 14130 + }, + { + "epoch": 0.1250022100815078, + "grad_norm": 9.435111999511719, + "learning_rate": 4.791662983197488e-05, + "loss": 0.8359, + "step": 14140 + }, + { + "epoch": 0.12509061334182003, + "grad_norm": 4.99104642868042, + "learning_rate": 4.7915156444303e-05, + "loss": 0.9245, + "step": 14150 + }, + { + "epoch": 0.1251790166021323, + "grad_norm": 7.287257194519043, + "learning_rate": 4.7913683056631134e-05, + "loss": 0.8291, + "step": 14160 + }, + { + "epoch": 0.12526741986244452, + "grad_norm": 8.82583999633789, + "learning_rate": 4.791220966895926e-05, + "loss": 0.9718, + "step": 14170 + }, + { + "epoch": 0.12535582312275675, + "grad_norm": 2.8207483291625977, + "learning_rate": 4.791073628128739e-05, + "loss": 0.9119, + "step": 14180 + }, + { + "epoch": 0.125444226383069, + "grad_norm": 3.5591232776641846, + "learning_rate": 4.790926289361552e-05, + "loss": 0.8609, + "step": 14190 + }, + { + "epoch": 0.12553262964338124, + "grad_norm": 3.5088720321655273, + "learning_rate": 4.790778950594365e-05, + "loss": 0.8482, + "step": 14200 + }, + { + "epoch": 0.12562103290369347, + "grad_norm": 2.4947993755340576, + "learning_rate": 4.7906316118271776e-05, + "loss": 0.7897, + "step": 14210 + }, + { + "epoch": 0.12570943616400573, + "grad_norm": 3.704815149307251, + "learning_rate": 4.790484273059991e-05, + "loss": 0.8861, + "step": 14220 + }, + { + "epoch": 0.12579783942431796, + "grad_norm": 3.8462188243865967, + "learning_rate": 4.790336934292803e-05, + "loss": 0.7991, + "step": 14230 + }, + { + "epoch": 0.12588624268463022, + "grad_norm": 3.9030914306640625, + "learning_rate": 4.790189595525617e-05, + "loss": 0.8342, + "step": 14240 + }, + { + "epoch": 0.12597464594494245, + "grad_norm": 9.146306037902832, + "learning_rate": 4.7900422567584296e-05, + "loss": 1.004, + "step": 14250 + }, + { + "epoch": 0.12606304920525468, + "grad_norm": 4.142906188964844, + "learning_rate": 4.7898949179912424e-05, + "loss": 0.8683, + "step": 14260 + }, + { + "epoch": 0.12615145246556694, + "grad_norm": 4.316192626953125, + "learning_rate": 4.789747579224055e-05, + "loss": 0.794, + "step": 14270 + }, + { + "epoch": 0.12623985572587917, + "grad_norm": 2.6158251762390137, + "learning_rate": 4.789600240456869e-05, + "loss": 0.8485, + "step": 14280 + }, + { + "epoch": 0.1263282589861914, + "grad_norm": 6.301787853240967, + "learning_rate": 4.789452901689681e-05, + "loss": 0.8897, + "step": 14290 + }, + { + "epoch": 0.12641666224650366, + "grad_norm": 3.070772886276245, + "learning_rate": 4.7893055629224944e-05, + "loss": 0.8201, + "step": 14300 + }, + { + "epoch": 0.1265050655068159, + "grad_norm": 4.846888065338135, + "learning_rate": 4.789158224155307e-05, + "loss": 0.9017, + "step": 14310 + }, + { + "epoch": 0.12659346876712813, + "grad_norm": 6.518237590789795, + "learning_rate": 4.78901088538812e-05, + "loss": 0.9906, + "step": 14320 + }, + { + "epoch": 0.12668187202744038, + "grad_norm": 3.3887789249420166, + "learning_rate": 4.788863546620933e-05, + "loss": 0.8593, + "step": 14330 + }, + { + "epoch": 0.12677027528775262, + "grad_norm": 3.343748092651367, + "learning_rate": 4.788716207853746e-05, + "loss": 0.8383, + "step": 14340 + }, + { + "epoch": 0.12685867854806485, + "grad_norm": 2.786721706390381, + "learning_rate": 4.7885688690865586e-05, + "loss": 0.8111, + "step": 14350 + }, + { + "epoch": 0.1269470818083771, + "grad_norm": 3.8152003288269043, + "learning_rate": 4.788421530319372e-05, + "loss": 0.8315, + "step": 14360 + }, + { + "epoch": 0.12703548506868934, + "grad_norm": 3.993624687194824, + "learning_rate": 4.788274191552184e-05, + "loss": 0.88, + "step": 14370 + }, + { + "epoch": 0.12712388832900157, + "grad_norm": 4.20762825012207, + "learning_rate": 4.788126852784998e-05, + "loss": 0.8888, + "step": 14380 + }, + { + "epoch": 0.12721229158931383, + "grad_norm": 7.828494548797607, + "learning_rate": 4.7879795140178106e-05, + "loss": 0.8765, + "step": 14390 + }, + { + "epoch": 0.12730069484962606, + "grad_norm": 2.8013503551483154, + "learning_rate": 4.7878321752506235e-05, + "loss": 0.908, + "step": 14400 + }, + { + "epoch": 0.1273890981099383, + "grad_norm": 6.353537559509277, + "learning_rate": 4.787684836483436e-05, + "loss": 0.8286, + "step": 14410 + }, + { + "epoch": 0.12747750137025055, + "grad_norm": 4.11337423324585, + "learning_rate": 4.78753749771625e-05, + "loss": 0.9107, + "step": 14420 + }, + { + "epoch": 0.12756590463056278, + "grad_norm": 4.779507160186768, + "learning_rate": 4.787390158949062e-05, + "loss": 0.7006, + "step": 14430 + }, + { + "epoch": 0.127654307890875, + "grad_norm": 5.153433322906494, + "learning_rate": 4.7872428201818755e-05, + "loss": 0.8357, + "step": 14440 + }, + { + "epoch": 0.12774271115118727, + "grad_norm": 6.731582164764404, + "learning_rate": 4.7870954814146877e-05, + "loss": 0.867, + "step": 14450 + }, + { + "epoch": 0.1278311144114995, + "grad_norm": 2.1910324096679688, + "learning_rate": 4.786948142647501e-05, + "loss": 0.9936, + "step": 14460 + }, + { + "epoch": 0.12791951767181173, + "grad_norm": 4.246039390563965, + "learning_rate": 4.786800803880314e-05, + "loss": 0.8295, + "step": 14470 + }, + { + "epoch": 0.128007920932124, + "grad_norm": 3.4341440200805664, + "learning_rate": 4.786653465113127e-05, + "loss": 0.7022, + "step": 14480 + }, + { + "epoch": 0.12809632419243622, + "grad_norm": 3.687922716140747, + "learning_rate": 4.78650612634594e-05, + "loss": 0.962, + "step": 14490 + }, + { + "epoch": 0.12818472745274845, + "grad_norm": 5.527141094207764, + "learning_rate": 4.786358787578753e-05, + "loss": 0.8514, + "step": 14500 + }, + { + "epoch": 0.1282731307130607, + "grad_norm": 3.9552197456359863, + "learning_rate": 4.7862114488115653e-05, + "loss": 0.7996, + "step": 14510 + }, + { + "epoch": 0.12836153397337294, + "grad_norm": 4.820691108703613, + "learning_rate": 4.786064110044379e-05, + "loss": 0.86, + "step": 14520 + }, + { + "epoch": 0.12844993723368517, + "grad_norm": 4.065998077392578, + "learning_rate": 4.785916771277192e-05, + "loss": 0.799, + "step": 14530 + }, + { + "epoch": 0.12853834049399743, + "grad_norm": 17.629016876220703, + "learning_rate": 4.7857694325100045e-05, + "loss": 0.9446, + "step": 14540 + }, + { + "epoch": 0.12862674375430966, + "grad_norm": 7.211923599243164, + "learning_rate": 4.7856220937428174e-05, + "loss": 0.8793, + "step": 14550 + }, + { + "epoch": 0.1287151470146219, + "grad_norm": 1.7147059440612793, + "learning_rate": 4.78547475497563e-05, + "loss": 0.7855, + "step": 14560 + }, + { + "epoch": 0.12880355027493415, + "grad_norm": 5.725169658660889, + "learning_rate": 4.785327416208443e-05, + "loss": 0.8838, + "step": 14570 + }, + { + "epoch": 0.12889195353524638, + "grad_norm": 5.495853900909424, + "learning_rate": 4.7851800774412565e-05, + "loss": 0.9136, + "step": 14580 + }, + { + "epoch": 0.1289803567955586, + "grad_norm": 6.933687210083008, + "learning_rate": 4.785032738674069e-05, + "loss": 0.8623, + "step": 14590 + }, + { + "epoch": 0.12906876005587087, + "grad_norm": 6.265336513519287, + "learning_rate": 4.784885399906882e-05, + "loss": 0.7843, + "step": 14600 + }, + { + "epoch": 0.1291571633161831, + "grad_norm": 4.088986396789551, + "learning_rate": 4.784738061139695e-05, + "loss": 0.9676, + "step": 14610 + }, + { + "epoch": 0.12924556657649533, + "grad_norm": 3.6620850563049316, + "learning_rate": 4.784590722372508e-05, + "loss": 0.898, + "step": 14620 + }, + { + "epoch": 0.1293339698368076, + "grad_norm": 4.730831146240234, + "learning_rate": 4.784443383605321e-05, + "loss": 0.9408, + "step": 14630 + }, + { + "epoch": 0.12942237309711982, + "grad_norm": 3.718355417251587, + "learning_rate": 4.784296044838134e-05, + "loss": 0.898, + "step": 14640 + }, + { + "epoch": 0.12951077635743205, + "grad_norm": 5.269217491149902, + "learning_rate": 4.7841487060709464e-05, + "loss": 0.7567, + "step": 14650 + }, + { + "epoch": 0.1295991796177443, + "grad_norm": 5.338615417480469, + "learning_rate": 4.78400136730376e-05, + "loss": 0.8754, + "step": 14660 + }, + { + "epoch": 0.12968758287805654, + "grad_norm": 2.691253900527954, + "learning_rate": 4.783854028536572e-05, + "loss": 0.8197, + "step": 14670 + }, + { + "epoch": 0.12977598613836877, + "grad_norm": 4.899257183074951, + "learning_rate": 4.7837066897693856e-05, + "loss": 1.0636, + "step": 14680 + }, + { + "epoch": 0.12986438939868103, + "grad_norm": 7.4292168617248535, + "learning_rate": 4.7835593510021984e-05, + "loss": 0.8505, + "step": 14690 + }, + { + "epoch": 0.12995279265899326, + "grad_norm": 5.259484767913818, + "learning_rate": 4.783412012235011e-05, + "loss": 0.8914, + "step": 14700 + }, + { + "epoch": 0.1300411959193055, + "grad_norm": 2.593456506729126, + "learning_rate": 4.783264673467824e-05, + "loss": 0.8399, + "step": 14710 + }, + { + "epoch": 0.13012959917961775, + "grad_norm": 4.29045295715332, + "learning_rate": 4.7831173347006376e-05, + "loss": 0.8655, + "step": 14720 + }, + { + "epoch": 0.13021800243992998, + "grad_norm": 5.755084037780762, + "learning_rate": 4.78296999593345e-05, + "loss": 0.827, + "step": 14730 + }, + { + "epoch": 0.1303064057002422, + "grad_norm": 8.260836601257324, + "learning_rate": 4.782822657166263e-05, + "loss": 0.892, + "step": 14740 + }, + { + "epoch": 0.13039480896055447, + "grad_norm": 5.283960819244385, + "learning_rate": 4.782675318399076e-05, + "loss": 0.7857, + "step": 14750 + }, + { + "epoch": 0.1304832122208667, + "grad_norm": 3.982168197631836, + "learning_rate": 4.782527979631889e-05, + "loss": 0.7798, + "step": 14760 + }, + { + "epoch": 0.13057161548117893, + "grad_norm": 10.328381538391113, + "learning_rate": 4.782380640864702e-05, + "loss": 1.0108, + "step": 14770 + }, + { + "epoch": 0.1306600187414912, + "grad_norm": 4.827663898468018, + "learning_rate": 4.782233302097515e-05, + "loss": 0.7952, + "step": 14780 + }, + { + "epoch": 0.13074842200180342, + "grad_norm": 5.2243523597717285, + "learning_rate": 4.7820859633303274e-05, + "loss": 0.878, + "step": 14790 + }, + { + "epoch": 0.13083682526211568, + "grad_norm": 6.853215217590332, + "learning_rate": 4.781938624563141e-05, + "loss": 0.7715, + "step": 14800 + }, + { + "epoch": 0.1309252285224279, + "grad_norm": 8.641997337341309, + "learning_rate": 4.781791285795954e-05, + "loss": 0.9502, + "step": 14810 + }, + { + "epoch": 0.13101363178274014, + "grad_norm": 7.017167091369629, + "learning_rate": 4.7816439470287666e-05, + "loss": 0.7975, + "step": 14820 + }, + { + "epoch": 0.1311020350430524, + "grad_norm": 5.059592247009277, + "learning_rate": 4.7814966082615795e-05, + "loss": 0.8215, + "step": 14830 + }, + { + "epoch": 0.13119043830336463, + "grad_norm": 2.984628200531006, + "learning_rate": 4.781349269494392e-05, + "loss": 0.78, + "step": 14840 + }, + { + "epoch": 0.13127884156367686, + "grad_norm": 3.692122220993042, + "learning_rate": 4.781201930727205e-05, + "loss": 0.6725, + "step": 14850 + }, + { + "epoch": 0.13136724482398912, + "grad_norm": 10.22104263305664, + "learning_rate": 4.7810545919600187e-05, + "loss": 0.8766, + "step": 14860 + }, + { + "epoch": 0.13145564808430135, + "grad_norm": 5.891976833343506, + "learning_rate": 4.7809072531928315e-05, + "loss": 0.7219, + "step": 14870 + }, + { + "epoch": 0.13154405134461358, + "grad_norm": 11.42141056060791, + "learning_rate": 4.780759914425644e-05, + "loss": 0.8912, + "step": 14880 + }, + { + "epoch": 0.13163245460492584, + "grad_norm": 3.748466730117798, + "learning_rate": 4.780612575658457e-05, + "loss": 0.919, + "step": 14890 + }, + { + "epoch": 0.13172085786523807, + "grad_norm": 8.474726676940918, + "learning_rate": 4.78046523689127e-05, + "loss": 0.9573, + "step": 14900 + }, + { + "epoch": 0.1318092611255503, + "grad_norm": 9.216867446899414, + "learning_rate": 4.780317898124083e-05, + "loss": 0.7432, + "step": 14910 + }, + { + "epoch": 0.13189766438586256, + "grad_norm": 1.9129865169525146, + "learning_rate": 4.780170559356896e-05, + "loss": 1.0191, + "step": 14920 + }, + { + "epoch": 0.1319860676461748, + "grad_norm": 4.176100730895996, + "learning_rate": 4.780023220589709e-05, + "loss": 0.8914, + "step": 14930 + }, + { + "epoch": 0.13207447090648702, + "grad_norm": 6.394561290740967, + "learning_rate": 4.779875881822522e-05, + "loss": 0.8583, + "step": 14940 + }, + { + "epoch": 0.13216287416679928, + "grad_norm": 3.249783992767334, + "learning_rate": 4.779728543055335e-05, + "loss": 0.8655, + "step": 14950 + }, + { + "epoch": 0.1322512774271115, + "grad_norm": 9.301237106323242, + "learning_rate": 4.779581204288148e-05, + "loss": 0.815, + "step": 14960 + }, + { + "epoch": 0.13233968068742374, + "grad_norm": 2.9319992065429688, + "learning_rate": 4.7794338655209605e-05, + "loss": 0.8361, + "step": 14970 + }, + { + "epoch": 0.132428083947736, + "grad_norm": 4.976524829864502, + "learning_rate": 4.7792865267537734e-05, + "loss": 0.7031, + "step": 14980 + }, + { + "epoch": 0.13251648720804823, + "grad_norm": 5.306775093078613, + "learning_rate": 4.779139187986587e-05, + "loss": 0.8439, + "step": 14990 + }, + { + "epoch": 0.13260489046836046, + "grad_norm": 2.589348554611206, + "learning_rate": 4.7789918492194e-05, + "loss": 0.8209, + "step": 15000 + }, + { + "epoch": 0.13269329372867272, + "grad_norm": 5.116392135620117, + "learning_rate": 4.7788445104522125e-05, + "loss": 0.9382, + "step": 15010 + }, + { + "epoch": 0.13278169698898495, + "grad_norm": 4.212767601013184, + "learning_rate": 4.7786971716850254e-05, + "loss": 0.7489, + "step": 15020 + }, + { + "epoch": 0.13287010024929718, + "grad_norm": 3.410351514816284, + "learning_rate": 4.778549832917838e-05, + "loss": 0.8789, + "step": 15030 + }, + { + "epoch": 0.13295850350960944, + "grad_norm": 4.694933891296387, + "learning_rate": 4.778402494150651e-05, + "loss": 0.8327, + "step": 15040 + }, + { + "epoch": 0.13304690676992167, + "grad_norm": 1.849266529083252, + "learning_rate": 4.7782551553834646e-05, + "loss": 0.7674, + "step": 15050 + }, + { + "epoch": 0.1331353100302339, + "grad_norm": 4.732289791107178, + "learning_rate": 4.778107816616277e-05, + "loss": 0.761, + "step": 15060 + }, + { + "epoch": 0.13322371329054616, + "grad_norm": 5.243071556091309, + "learning_rate": 4.77796047784909e-05, + "loss": 0.8116, + "step": 15070 + }, + { + "epoch": 0.1333121165508584, + "grad_norm": 3.66876482963562, + "learning_rate": 4.777813139081903e-05, + "loss": 0.8861, + "step": 15080 + }, + { + "epoch": 0.13340051981117063, + "grad_norm": 5.465664386749268, + "learning_rate": 4.777665800314716e-05, + "loss": 0.8703, + "step": 15090 + }, + { + "epoch": 0.13348892307148288, + "grad_norm": 4.7874627113342285, + "learning_rate": 4.777518461547529e-05, + "loss": 0.7911, + "step": 15100 + }, + { + "epoch": 0.13357732633179512, + "grad_norm": 4.305376052856445, + "learning_rate": 4.777371122780342e-05, + "loss": 0.8853, + "step": 15110 + }, + { + "epoch": 0.13366572959210735, + "grad_norm": 4.703049659729004, + "learning_rate": 4.7772237840131544e-05, + "loss": 0.7474, + "step": 15120 + }, + { + "epoch": 0.1337541328524196, + "grad_norm": 4.865607261657715, + "learning_rate": 4.777076445245968e-05, + "loss": 0.8021, + "step": 15130 + }, + { + "epoch": 0.13384253611273184, + "grad_norm": 4.444889068603516, + "learning_rate": 4.77692910647878e-05, + "loss": 0.9501, + "step": 15140 + }, + { + "epoch": 0.13393093937304407, + "grad_norm": 5.616090297698975, + "learning_rate": 4.7767817677115936e-05, + "loss": 0.8098, + "step": 15150 + }, + { + "epoch": 0.13401934263335633, + "grad_norm": 8.412457466125488, + "learning_rate": 4.7766344289444064e-05, + "loss": 0.9875, + "step": 15160 + }, + { + "epoch": 0.13410774589366856, + "grad_norm": 2.9461898803710938, + "learning_rate": 4.776487090177219e-05, + "loss": 0.8954, + "step": 15170 + }, + { + "epoch": 0.1341961491539808, + "grad_norm": 4.904505729675293, + "learning_rate": 4.776339751410032e-05, + "loss": 0.7522, + "step": 15180 + }, + { + "epoch": 0.13428455241429305, + "grad_norm": 8.480589866638184, + "learning_rate": 4.7761924126428456e-05, + "loss": 0.7521, + "step": 15190 + }, + { + "epoch": 0.13437295567460528, + "grad_norm": 4.232245922088623, + "learning_rate": 4.776045073875658e-05, + "loss": 0.7521, + "step": 15200 + }, + { + "epoch": 0.1344613589349175, + "grad_norm": 12.435349464416504, + "learning_rate": 4.775897735108471e-05, + "loss": 0.8171, + "step": 15210 + }, + { + "epoch": 0.13454976219522977, + "grad_norm": 3.5983023643493652, + "learning_rate": 4.775750396341284e-05, + "loss": 0.812, + "step": 15220 + }, + { + "epoch": 0.134638165455542, + "grad_norm": 8.229449272155762, + "learning_rate": 4.775603057574097e-05, + "loss": 0.7887, + "step": 15230 + }, + { + "epoch": 0.13472656871585423, + "grad_norm": 13.410287857055664, + "learning_rate": 4.77545571880691e-05, + "loss": 0.8531, + "step": 15240 + }, + { + "epoch": 0.1348149719761665, + "grad_norm": 13.72982120513916, + "learning_rate": 4.775308380039723e-05, + "loss": 0.9009, + "step": 15250 + }, + { + "epoch": 0.13490337523647872, + "grad_norm": 4.8448028564453125, + "learning_rate": 4.7751610412725355e-05, + "loss": 0.8003, + "step": 15260 + }, + { + "epoch": 0.13499177849679095, + "grad_norm": 4.699975490570068, + "learning_rate": 4.775013702505349e-05, + "loss": 0.8005, + "step": 15270 + }, + { + "epoch": 0.1350801817571032, + "grad_norm": 3.7422728538513184, + "learning_rate": 4.774866363738161e-05, + "loss": 0.9568, + "step": 15280 + }, + { + "epoch": 0.13516858501741544, + "grad_norm": 5.041273593902588, + "learning_rate": 4.7747190249709746e-05, + "loss": 0.8933, + "step": 15290 + }, + { + "epoch": 0.13525698827772767, + "grad_norm": 4.4108428955078125, + "learning_rate": 4.7745716862037875e-05, + "loss": 0.8552, + "step": 15300 + }, + { + "epoch": 0.13534539153803993, + "grad_norm": 8.836749076843262, + "learning_rate": 4.7744243474366e-05, + "loss": 0.8533, + "step": 15310 + }, + { + "epoch": 0.13543379479835216, + "grad_norm": 3.6807878017425537, + "learning_rate": 4.774277008669413e-05, + "loss": 0.908, + "step": 15320 + }, + { + "epoch": 0.13552219805866442, + "grad_norm": 5.754607200622559, + "learning_rate": 4.7741296699022267e-05, + "loss": 0.721, + "step": 15330 + }, + { + "epoch": 0.13561060131897665, + "grad_norm": 5.532271862030029, + "learning_rate": 4.773982331135039e-05, + "loss": 1.0477, + "step": 15340 + }, + { + "epoch": 0.13569900457928888, + "grad_norm": 3.1188676357269287, + "learning_rate": 4.773834992367852e-05, + "loss": 0.8783, + "step": 15350 + }, + { + "epoch": 0.13578740783960114, + "grad_norm": 4.697937488555908, + "learning_rate": 4.773687653600665e-05, + "loss": 0.8323, + "step": 15360 + }, + { + "epoch": 0.13587581109991337, + "grad_norm": 12.706995964050293, + "learning_rate": 4.773540314833478e-05, + "loss": 0.801, + "step": 15370 + }, + { + "epoch": 0.1359642143602256, + "grad_norm": 6.7111735343933105, + "learning_rate": 4.773392976066291e-05, + "loss": 0.7436, + "step": 15380 + }, + { + "epoch": 0.13605261762053786, + "grad_norm": 4.693384170532227, + "learning_rate": 4.773245637299104e-05, + "loss": 0.7366, + "step": 15390 + }, + { + "epoch": 0.1361410208808501, + "grad_norm": 3.8970184326171875, + "learning_rate": 4.7730982985319165e-05, + "loss": 0.6114, + "step": 15400 + }, + { + "epoch": 0.13622942414116232, + "grad_norm": 7.473209857940674, + "learning_rate": 4.77295095976473e-05, + "loss": 0.9589, + "step": 15410 + }, + { + "epoch": 0.13631782740147458, + "grad_norm": 3.8958041667938232, + "learning_rate": 4.772803620997542e-05, + "loss": 0.9313, + "step": 15420 + }, + { + "epoch": 0.1364062306617868, + "grad_norm": 4.38140869140625, + "learning_rate": 4.772656282230356e-05, + "loss": 0.7831, + "step": 15430 + }, + { + "epoch": 0.13649463392209904, + "grad_norm": 3.4247639179229736, + "learning_rate": 4.7725089434631685e-05, + "loss": 0.8002, + "step": 15440 + }, + { + "epoch": 0.1365830371824113, + "grad_norm": 5.48562479019165, + "learning_rate": 4.7723616046959814e-05, + "loss": 0.7916, + "step": 15450 + }, + { + "epoch": 0.13667144044272353, + "grad_norm": 7.411536693572998, + "learning_rate": 4.772214265928794e-05, + "loss": 0.8044, + "step": 15460 + }, + { + "epoch": 0.13675984370303576, + "grad_norm": 2.3300602436065674, + "learning_rate": 4.772066927161608e-05, + "loss": 0.75, + "step": 15470 + }, + { + "epoch": 0.13684824696334802, + "grad_norm": 3.276644468307495, + "learning_rate": 4.77191958839442e-05, + "loss": 0.8477, + "step": 15480 + }, + { + "epoch": 0.13693665022366025, + "grad_norm": 4.852524280548096, + "learning_rate": 4.7717722496272334e-05, + "loss": 0.8006, + "step": 15490 + }, + { + "epoch": 0.13702505348397248, + "grad_norm": 10.70208740234375, + "learning_rate": 4.7716249108600455e-05, + "loss": 0.8151, + "step": 15500 + }, + { + "epoch": 0.13711345674428474, + "grad_norm": 3.9455597400665283, + "learning_rate": 4.771477572092859e-05, + "loss": 0.8346, + "step": 15510 + }, + { + "epoch": 0.13720186000459697, + "grad_norm": 5.669267177581787, + "learning_rate": 4.771330233325672e-05, + "loss": 0.8101, + "step": 15520 + }, + { + "epoch": 0.1372902632649092, + "grad_norm": 5.7352495193481445, + "learning_rate": 4.771182894558485e-05, + "loss": 0.8337, + "step": 15530 + }, + { + "epoch": 0.13737866652522146, + "grad_norm": 5.060227394104004, + "learning_rate": 4.7710355557912976e-05, + "loss": 0.7496, + "step": 15540 + }, + { + "epoch": 0.1374670697855337, + "grad_norm": 2.309218406677246, + "learning_rate": 4.770888217024111e-05, + "loss": 0.8622, + "step": 15550 + }, + { + "epoch": 0.13755547304584592, + "grad_norm": 5.97791051864624, + "learning_rate": 4.770740878256923e-05, + "loss": 0.7933, + "step": 15560 + }, + { + "epoch": 0.13764387630615818, + "grad_norm": 5.149936199188232, + "learning_rate": 4.770593539489737e-05, + "loss": 0.6902, + "step": 15570 + }, + { + "epoch": 0.1377322795664704, + "grad_norm": 4.803082466125488, + "learning_rate": 4.7704462007225496e-05, + "loss": 0.7019, + "step": 15580 + }, + { + "epoch": 0.13782068282678264, + "grad_norm": 8.148839950561523, + "learning_rate": 4.7702988619553624e-05, + "loss": 0.8295, + "step": 15590 + }, + { + "epoch": 0.1379090860870949, + "grad_norm": 4.663215637207031, + "learning_rate": 4.770151523188175e-05, + "loss": 0.9214, + "step": 15600 + }, + { + "epoch": 0.13799748934740713, + "grad_norm": 7.309902667999268, + "learning_rate": 4.770004184420988e-05, + "loss": 0.7368, + "step": 15610 + }, + { + "epoch": 0.13808589260771936, + "grad_norm": 3.3789870738983154, + "learning_rate": 4.769856845653801e-05, + "loss": 0.8317, + "step": 15620 + }, + { + "epoch": 0.13817429586803162, + "grad_norm": 6.748372554779053, + "learning_rate": 4.7697095068866144e-05, + "loss": 0.8868, + "step": 15630 + }, + { + "epoch": 0.13826269912834385, + "grad_norm": 12.395320892333984, + "learning_rate": 4.7695621681194266e-05, + "loss": 0.885, + "step": 15640 + }, + { + "epoch": 0.13835110238865608, + "grad_norm": 5.548238754272461, + "learning_rate": 4.76941482935224e-05, + "loss": 0.7629, + "step": 15650 + }, + { + "epoch": 0.13843950564896834, + "grad_norm": 2.1533687114715576, + "learning_rate": 4.769267490585053e-05, + "loss": 0.7936, + "step": 15660 + }, + { + "epoch": 0.13852790890928057, + "grad_norm": 3.9797375202178955, + "learning_rate": 4.769120151817866e-05, + "loss": 0.8491, + "step": 15670 + }, + { + "epoch": 0.1386163121695928, + "grad_norm": 2.603139638900757, + "learning_rate": 4.7689728130506786e-05, + "loss": 0.8466, + "step": 15680 + }, + { + "epoch": 0.13870471542990506, + "grad_norm": 2.4111125469207764, + "learning_rate": 4.768825474283492e-05, + "loss": 0.8236, + "step": 15690 + }, + { + "epoch": 0.1387931186902173, + "grad_norm": 5.43579626083374, + "learning_rate": 4.768678135516304e-05, + "loss": 0.8628, + "step": 15700 + }, + { + "epoch": 0.13888152195052952, + "grad_norm": 2.9611611366271973, + "learning_rate": 4.768530796749118e-05, + "loss": 0.7865, + "step": 15710 + }, + { + "epoch": 0.13896992521084178, + "grad_norm": 3.3550655841827393, + "learning_rate": 4.7683834579819306e-05, + "loss": 0.8111, + "step": 15720 + }, + { + "epoch": 0.139058328471154, + "grad_norm": 6.024102687835693, + "learning_rate": 4.7682361192147435e-05, + "loss": 0.8792, + "step": 15730 + }, + { + "epoch": 0.13914673173146624, + "grad_norm": 6.341953754425049, + "learning_rate": 4.768088780447556e-05, + "loss": 0.9089, + "step": 15740 + }, + { + "epoch": 0.1392351349917785, + "grad_norm": 2.9542484283447266, + "learning_rate": 4.767941441680369e-05, + "loss": 0.7912, + "step": 15750 + }, + { + "epoch": 0.13932353825209073, + "grad_norm": 12.003358840942383, + "learning_rate": 4.767794102913182e-05, + "loss": 0.8087, + "step": 15760 + }, + { + "epoch": 0.13941194151240296, + "grad_norm": 9.762426376342773, + "learning_rate": 4.7676467641459955e-05, + "loss": 0.7731, + "step": 15770 + }, + { + "epoch": 0.13950034477271522, + "grad_norm": 4.455225944519043, + "learning_rate": 4.767499425378808e-05, + "loss": 0.8065, + "step": 15780 + }, + { + "epoch": 0.13958874803302745, + "grad_norm": 5.925757884979248, + "learning_rate": 4.767352086611621e-05, + "loss": 0.9041, + "step": 15790 + }, + { + "epoch": 0.13967715129333969, + "grad_norm": 4.6304521560668945, + "learning_rate": 4.767204747844434e-05, + "loss": 0.8063, + "step": 15800 + }, + { + "epoch": 0.13976555455365194, + "grad_norm": 9.210540771484375, + "learning_rate": 4.767057409077247e-05, + "loss": 0.8222, + "step": 15810 + }, + { + "epoch": 0.13985395781396417, + "grad_norm": 2.7507073879241943, + "learning_rate": 4.7669100703100597e-05, + "loss": 0.8012, + "step": 15820 + }, + { + "epoch": 0.1399423610742764, + "grad_norm": 11.781067848205566, + "learning_rate": 4.766762731542873e-05, + "loss": 0.7386, + "step": 15830 + }, + { + "epoch": 0.14003076433458866, + "grad_norm": 4.750818729400635, + "learning_rate": 4.766615392775686e-05, + "loss": 0.9513, + "step": 15840 + }, + { + "epoch": 0.1401191675949009, + "grad_norm": 3.239306688308716, + "learning_rate": 4.766468054008499e-05, + "loss": 0.7889, + "step": 15850 + }, + { + "epoch": 0.14020757085521315, + "grad_norm": 5.480432510375977, + "learning_rate": 4.766320715241312e-05, + "loss": 0.7502, + "step": 15860 + }, + { + "epoch": 0.14029597411552538, + "grad_norm": 13.013794898986816, + "learning_rate": 4.7661733764741245e-05, + "loss": 0.9384, + "step": 15870 + }, + { + "epoch": 0.14038437737583762, + "grad_norm": 4.921560287475586, + "learning_rate": 4.7660260377069373e-05, + "loss": 0.8006, + "step": 15880 + }, + { + "epoch": 0.14047278063614987, + "grad_norm": 1.9659104347229004, + "learning_rate": 4.76587869893975e-05, + "loss": 0.8384, + "step": 15890 + }, + { + "epoch": 0.1405611838964621, + "grad_norm": 6.83479118347168, + "learning_rate": 4.765731360172564e-05, + "loss": 0.7581, + "step": 15900 + }, + { + "epoch": 0.14064958715677434, + "grad_norm": 5.402812480926514, + "learning_rate": 4.7655840214053765e-05, + "loss": 0.9052, + "step": 15910 + }, + { + "epoch": 0.1407379904170866, + "grad_norm": 6.77946138381958, + "learning_rate": 4.7654366826381894e-05, + "loss": 0.793, + "step": 15920 + }, + { + "epoch": 0.14082639367739883, + "grad_norm": 4.841235160827637, + "learning_rate": 4.765289343871002e-05, + "loss": 0.6792, + "step": 15930 + }, + { + "epoch": 0.14091479693771106, + "grad_norm": 6.926675796508789, + "learning_rate": 4.765142005103815e-05, + "loss": 0.8645, + "step": 15940 + }, + { + "epoch": 0.14100320019802332, + "grad_norm": 7.380031585693359, + "learning_rate": 4.764994666336628e-05, + "loss": 0.8126, + "step": 15950 + }, + { + "epoch": 0.14109160345833555, + "grad_norm": 7.4095988273620605, + "learning_rate": 4.7648473275694414e-05, + "loss": 0.7796, + "step": 15960 + }, + { + "epoch": 0.14118000671864778, + "grad_norm": 3.8512370586395264, + "learning_rate": 4.7646999888022535e-05, + "loss": 0.7745, + "step": 15970 + }, + { + "epoch": 0.14126840997896004, + "grad_norm": 2.0865118503570557, + "learning_rate": 4.764552650035067e-05, + "loss": 0.8013, + "step": 15980 + }, + { + "epoch": 0.14135681323927227, + "grad_norm": 2.9433555603027344, + "learning_rate": 4.76440531126788e-05, + "loss": 0.7495, + "step": 15990 + }, + { + "epoch": 0.1414452164995845, + "grad_norm": 4.146435260772705, + "learning_rate": 4.764257972500693e-05, + "loss": 0.8801, + "step": 16000 + }, + { + "epoch": 0.14153361975989676, + "grad_norm": 9.377398490905762, + "learning_rate": 4.7641106337335056e-05, + "loss": 0.8938, + "step": 16010 + }, + { + "epoch": 0.141622023020209, + "grad_norm": 7.253297805786133, + "learning_rate": 4.763963294966319e-05, + "loss": 0.8652, + "step": 16020 + }, + { + "epoch": 0.14171042628052122, + "grad_norm": 6.060379505157471, + "learning_rate": 4.763815956199131e-05, + "loss": 0.8512, + "step": 16030 + }, + { + "epoch": 0.14179882954083348, + "grad_norm": 2.855339527130127, + "learning_rate": 4.763668617431945e-05, + "loss": 0.8104, + "step": 16040 + }, + { + "epoch": 0.1418872328011457, + "grad_norm": 9.66022777557373, + "learning_rate": 4.7635212786647576e-05, + "loss": 0.9924, + "step": 16050 + }, + { + "epoch": 0.14197563606145794, + "grad_norm": 3.456629991531372, + "learning_rate": 4.7633739398975704e-05, + "loss": 0.8359, + "step": 16060 + }, + { + "epoch": 0.1420640393217702, + "grad_norm": 5.4765191078186035, + "learning_rate": 4.763226601130383e-05, + "loss": 0.7722, + "step": 16070 + }, + { + "epoch": 0.14215244258208243, + "grad_norm": 8.099782943725586, + "learning_rate": 4.763079262363197e-05, + "loss": 0.7993, + "step": 16080 + }, + { + "epoch": 0.14224084584239466, + "grad_norm": 6.92963171005249, + "learning_rate": 4.762931923596009e-05, + "loss": 0.8937, + "step": 16090 + }, + { + "epoch": 0.14232924910270692, + "grad_norm": 3.91198992729187, + "learning_rate": 4.7627845848288224e-05, + "loss": 0.8908, + "step": 16100 + }, + { + "epoch": 0.14241765236301915, + "grad_norm": 4.417505264282227, + "learning_rate": 4.7626372460616346e-05, + "loss": 0.7337, + "step": 16110 + }, + { + "epoch": 0.14250605562333138, + "grad_norm": 2.9531617164611816, + "learning_rate": 4.762489907294448e-05, + "loss": 0.7656, + "step": 16120 + }, + { + "epoch": 0.14259445888364364, + "grad_norm": 6.2340407371521, + "learning_rate": 4.762342568527261e-05, + "loss": 0.9849, + "step": 16130 + }, + { + "epoch": 0.14268286214395587, + "grad_norm": 2.0591773986816406, + "learning_rate": 4.762195229760074e-05, + "loss": 0.8713, + "step": 16140 + }, + { + "epoch": 0.1427712654042681, + "grad_norm": 3.8224780559539795, + "learning_rate": 4.7620478909928866e-05, + "loss": 0.8033, + "step": 16150 + }, + { + "epoch": 0.14285966866458036, + "grad_norm": 3.3133275508880615, + "learning_rate": 4.7619005522257e-05, + "loss": 0.9767, + "step": 16160 + }, + { + "epoch": 0.1429480719248926, + "grad_norm": 10.423565864562988, + "learning_rate": 4.761753213458512e-05, + "loss": 0.9264, + "step": 16170 + }, + { + "epoch": 0.14303647518520482, + "grad_norm": 6.098910331726074, + "learning_rate": 4.761605874691326e-05, + "loss": 0.7519, + "step": 16180 + }, + { + "epoch": 0.14312487844551708, + "grad_norm": 8.131715774536133, + "learning_rate": 4.7614585359241386e-05, + "loss": 0.9972, + "step": 16190 + }, + { + "epoch": 0.1432132817058293, + "grad_norm": 4.524472236633301, + "learning_rate": 4.7613111971569515e-05, + "loss": 0.8572, + "step": 16200 + }, + { + "epoch": 0.14330168496614154, + "grad_norm": 4.032301425933838, + "learning_rate": 4.761163858389764e-05, + "loss": 0.7664, + "step": 16210 + }, + { + "epoch": 0.1433900882264538, + "grad_norm": 4.198031425476074, + "learning_rate": 4.761016519622577e-05, + "loss": 0.9005, + "step": 16220 + }, + { + "epoch": 0.14347849148676603, + "grad_norm": 5.050949573516846, + "learning_rate": 4.76086918085539e-05, + "loss": 0.7792, + "step": 16230 + }, + { + "epoch": 0.14356689474707826, + "grad_norm": 4.83923864364624, + "learning_rate": 4.7607218420882035e-05, + "loss": 0.8835, + "step": 16240 + }, + { + "epoch": 0.14365529800739052, + "grad_norm": 4.71106481552124, + "learning_rate": 4.7605745033210156e-05, + "loss": 0.8189, + "step": 16250 + }, + { + "epoch": 0.14374370126770275, + "grad_norm": 2.0753655433654785, + "learning_rate": 4.760427164553829e-05, + "loss": 0.8018, + "step": 16260 + }, + { + "epoch": 0.14383210452801498, + "grad_norm": 7.493185520172119, + "learning_rate": 4.760279825786642e-05, + "loss": 0.7613, + "step": 16270 + }, + { + "epoch": 0.14392050778832724, + "grad_norm": 11.220280647277832, + "learning_rate": 4.760132487019455e-05, + "loss": 0.8538, + "step": 16280 + }, + { + "epoch": 0.14400891104863947, + "grad_norm": 4.551141262054443, + "learning_rate": 4.759985148252268e-05, + "loss": 0.8649, + "step": 16290 + }, + { + "epoch": 0.1440973143089517, + "grad_norm": 2.6454129219055176, + "learning_rate": 4.759837809485081e-05, + "loss": 0.7681, + "step": 16300 + }, + { + "epoch": 0.14418571756926396, + "grad_norm": 6.077807903289795, + "learning_rate": 4.759690470717893e-05, + "loss": 0.8123, + "step": 16310 + }, + { + "epoch": 0.1442741208295762, + "grad_norm": 13.279069900512695, + "learning_rate": 4.759543131950707e-05, + "loss": 0.7205, + "step": 16320 + }, + { + "epoch": 0.14436252408988842, + "grad_norm": 5.215303421020508, + "learning_rate": 4.759395793183519e-05, + "loss": 0.9053, + "step": 16330 + }, + { + "epoch": 0.14445092735020068, + "grad_norm": 3.764324188232422, + "learning_rate": 4.7592484544163325e-05, + "loss": 0.7493, + "step": 16340 + }, + { + "epoch": 0.1445393306105129, + "grad_norm": 9.877325057983398, + "learning_rate": 4.7591011156491454e-05, + "loss": 0.7996, + "step": 16350 + }, + { + "epoch": 0.14462773387082514, + "grad_norm": 7.554166316986084, + "learning_rate": 4.758953776881958e-05, + "loss": 0.8533, + "step": 16360 + }, + { + "epoch": 0.1447161371311374, + "grad_norm": 2.535881996154785, + "learning_rate": 4.758806438114771e-05, + "loss": 0.7532, + "step": 16370 + }, + { + "epoch": 0.14480454039144963, + "grad_norm": 3.8387203216552734, + "learning_rate": 4.7586590993475845e-05, + "loss": 0.8488, + "step": 16380 + }, + { + "epoch": 0.1448929436517619, + "grad_norm": 4.006622314453125, + "learning_rate": 4.758511760580397e-05, + "loss": 0.9753, + "step": 16390 + }, + { + "epoch": 0.14498134691207412, + "grad_norm": 3.8722243309020996, + "learning_rate": 4.75836442181321e-05, + "loss": 0.8216, + "step": 16400 + }, + { + "epoch": 0.14506975017238635, + "grad_norm": 4.150247573852539, + "learning_rate": 4.758217083046023e-05, + "loss": 0.9156, + "step": 16410 + }, + { + "epoch": 0.1451581534326986, + "grad_norm": 2.6241047382354736, + "learning_rate": 4.758069744278836e-05, + "loss": 0.8719, + "step": 16420 + }, + { + "epoch": 0.14524655669301084, + "grad_norm": 5.52913761138916, + "learning_rate": 4.757922405511649e-05, + "loss": 0.7967, + "step": 16430 + }, + { + "epoch": 0.14533495995332307, + "grad_norm": 2.647954225540161, + "learning_rate": 4.7577750667444616e-05, + "loss": 0.7305, + "step": 16440 + }, + { + "epoch": 0.14542336321363533, + "grad_norm": 10.302239418029785, + "learning_rate": 4.7576277279772744e-05, + "loss": 0.8786, + "step": 16450 + }, + { + "epoch": 0.14551176647394756, + "grad_norm": 6.946014404296875, + "learning_rate": 4.757480389210088e-05, + "loss": 0.7101, + "step": 16460 + }, + { + "epoch": 0.1456001697342598, + "grad_norm": 7.227524757385254, + "learning_rate": 4.7573330504429e-05, + "loss": 0.7306, + "step": 16470 + }, + { + "epoch": 0.14568857299457205, + "grad_norm": 1.9578170776367188, + "learning_rate": 4.7571857116757136e-05, + "loss": 0.7232, + "step": 16480 + }, + { + "epoch": 0.14577697625488428, + "grad_norm": 4.118729114532471, + "learning_rate": 4.7570383729085264e-05, + "loss": 0.8288, + "step": 16490 + }, + { + "epoch": 0.1458653795151965, + "grad_norm": 9.074803352355957, + "learning_rate": 4.756891034141339e-05, + "loss": 0.9186, + "step": 16500 + }, + { + "epoch": 0.14595378277550877, + "grad_norm": 2.3599421977996826, + "learning_rate": 4.756743695374152e-05, + "loss": 0.7595, + "step": 16510 + }, + { + "epoch": 0.146042186035821, + "grad_norm": 5.108723163604736, + "learning_rate": 4.7565963566069656e-05, + "loss": 0.776, + "step": 16520 + }, + { + "epoch": 0.14613058929613323, + "grad_norm": 17.747661590576172, + "learning_rate": 4.756449017839778e-05, + "loss": 0.7488, + "step": 16530 + }, + { + "epoch": 0.1462189925564455, + "grad_norm": 2.2069766521453857, + "learning_rate": 4.756301679072591e-05, + "loss": 0.7622, + "step": 16540 + }, + { + "epoch": 0.14630739581675772, + "grad_norm": 2.8158762454986572, + "learning_rate": 4.756154340305404e-05, + "loss": 0.8401, + "step": 16550 + }, + { + "epoch": 0.14639579907706995, + "grad_norm": 6.137599945068359, + "learning_rate": 4.756007001538217e-05, + "loss": 0.8381, + "step": 16560 + }, + { + "epoch": 0.1464842023373822, + "grad_norm": 7.609254360198975, + "learning_rate": 4.75585966277103e-05, + "loss": 0.837, + "step": 16570 + }, + { + "epoch": 0.14657260559769444, + "grad_norm": 2.9540882110595703, + "learning_rate": 4.7557123240038426e-05, + "loss": 0.9364, + "step": 16580 + }, + { + "epoch": 0.14666100885800668, + "grad_norm": 2.969904899597168, + "learning_rate": 4.7555649852366554e-05, + "loss": 0.8806, + "step": 16590 + }, + { + "epoch": 0.14674941211831893, + "grad_norm": 3.8954246044158936, + "learning_rate": 4.755417646469469e-05, + "loss": 0.9661, + "step": 16600 + }, + { + "epoch": 0.14683781537863116, + "grad_norm": 3.806460380554199, + "learning_rate": 4.755270307702281e-05, + "loss": 0.8994, + "step": 16610 + }, + { + "epoch": 0.1469262186389434, + "grad_norm": 3.983349323272705, + "learning_rate": 4.7551229689350946e-05, + "loss": 0.9444, + "step": 16620 + }, + { + "epoch": 0.14701462189925565, + "grad_norm": 3.6446897983551025, + "learning_rate": 4.7549756301679075e-05, + "loss": 0.7827, + "step": 16630 + }, + { + "epoch": 0.14710302515956789, + "grad_norm": 4.506073951721191, + "learning_rate": 4.75482829140072e-05, + "loss": 0.7976, + "step": 16640 + }, + { + "epoch": 0.14719142841988012, + "grad_norm": 5.558804512023926, + "learning_rate": 4.754680952633533e-05, + "loss": 0.8037, + "step": 16650 + }, + { + "epoch": 0.14727983168019237, + "grad_norm": 5.593502998352051, + "learning_rate": 4.7545336138663466e-05, + "loss": 0.7896, + "step": 16660 + }, + { + "epoch": 0.1473682349405046, + "grad_norm": 2.9031991958618164, + "learning_rate": 4.754386275099159e-05, + "loss": 0.8126, + "step": 16670 + }, + { + "epoch": 0.14745663820081684, + "grad_norm": 2.8378851413726807, + "learning_rate": 4.754238936331972e-05, + "loss": 0.7547, + "step": 16680 + }, + { + "epoch": 0.1475450414611291, + "grad_norm": 9.635090827941895, + "learning_rate": 4.754091597564785e-05, + "loss": 0.962, + "step": 16690 + }, + { + "epoch": 0.14763344472144133, + "grad_norm": 6.492239475250244, + "learning_rate": 4.753944258797598e-05, + "loss": 0.8047, + "step": 16700 + }, + { + "epoch": 0.14772184798175356, + "grad_norm": 6.705989360809326, + "learning_rate": 4.753796920030411e-05, + "loss": 0.9173, + "step": 16710 + }, + { + "epoch": 0.14781025124206582, + "grad_norm": 6.1324992179870605, + "learning_rate": 4.7536495812632237e-05, + "loss": 0.8281, + "step": 16720 + }, + { + "epoch": 0.14789865450237805, + "grad_norm": 3.7849693298339844, + "learning_rate": 4.7535022424960365e-05, + "loss": 0.7801, + "step": 16730 + }, + { + "epoch": 0.14798705776269028, + "grad_norm": 2.7035951614379883, + "learning_rate": 4.75335490372885e-05, + "loss": 0.8403, + "step": 16740 + }, + { + "epoch": 0.14807546102300254, + "grad_norm": 2.984168529510498, + "learning_rate": 4.753207564961663e-05, + "loss": 0.8745, + "step": 16750 + }, + { + "epoch": 0.14816386428331477, + "grad_norm": 9.56676959991455, + "learning_rate": 4.753060226194476e-05, + "loss": 0.8441, + "step": 16760 + }, + { + "epoch": 0.148252267543627, + "grad_norm": 3.6529154777526855, + "learning_rate": 4.7529128874272885e-05, + "loss": 0.7925, + "step": 16770 + }, + { + "epoch": 0.14834067080393926, + "grad_norm": 4.137853622436523, + "learning_rate": 4.7527655486601013e-05, + "loss": 0.7155, + "step": 16780 + }, + { + "epoch": 0.1484290740642515, + "grad_norm": 3.5439553260803223, + "learning_rate": 4.752618209892914e-05, + "loss": 0.8398, + "step": 16790 + }, + { + "epoch": 0.14851747732456372, + "grad_norm": 7.92951774597168, + "learning_rate": 4.752470871125727e-05, + "loss": 0.94, + "step": 16800 + }, + { + "epoch": 0.14860588058487598, + "grad_norm": 6.424148082733154, + "learning_rate": 4.7523235323585405e-05, + "loss": 0.8397, + "step": 16810 + }, + { + "epoch": 0.1486942838451882, + "grad_norm": 2.9883008003234863, + "learning_rate": 4.7521761935913534e-05, + "loss": 0.7806, + "step": 16820 + }, + { + "epoch": 0.14878268710550044, + "grad_norm": 3.802116870880127, + "learning_rate": 4.752028854824166e-05, + "loss": 0.8431, + "step": 16830 + }, + { + "epoch": 0.1488710903658127, + "grad_norm": 3.7867634296417236, + "learning_rate": 4.751881516056979e-05, + "loss": 0.7787, + "step": 16840 + }, + { + "epoch": 0.14895949362612493, + "grad_norm": 5.5591840744018555, + "learning_rate": 4.751734177289792e-05, + "loss": 0.885, + "step": 16850 + }, + { + "epoch": 0.14904789688643716, + "grad_norm": 6.289994239807129, + "learning_rate": 4.751586838522605e-05, + "loss": 0.7527, + "step": 16860 + }, + { + "epoch": 0.14913630014674942, + "grad_norm": 2.677204132080078, + "learning_rate": 4.751439499755418e-05, + "loss": 0.8225, + "step": 16870 + }, + { + "epoch": 0.14922470340706165, + "grad_norm": 2.8042759895324707, + "learning_rate": 4.751292160988231e-05, + "loss": 0.814, + "step": 16880 + }, + { + "epoch": 0.14931310666737388, + "grad_norm": 3.9682416915893555, + "learning_rate": 4.751144822221044e-05, + "loss": 0.9185, + "step": 16890 + }, + { + "epoch": 0.14940150992768614, + "grad_norm": 6.54267692565918, + "learning_rate": 4.750997483453857e-05, + "loss": 0.8052, + "step": 16900 + }, + { + "epoch": 0.14948991318799837, + "grad_norm": 5.714585304260254, + "learning_rate": 4.7508501446866696e-05, + "loss": 0.7981, + "step": 16910 + }, + { + "epoch": 0.14957831644831063, + "grad_norm": 5.427567958831787, + "learning_rate": 4.7507028059194824e-05, + "loss": 0.8729, + "step": 16920 + }, + { + "epoch": 0.14966671970862286, + "grad_norm": 6.700192928314209, + "learning_rate": 4.750555467152296e-05, + "loss": 0.8984, + "step": 16930 + }, + { + "epoch": 0.1497551229689351, + "grad_norm": 6.20875358581543, + "learning_rate": 4.750408128385108e-05, + "loss": 0.8019, + "step": 16940 + }, + { + "epoch": 0.14984352622924735, + "grad_norm": 6.183334827423096, + "learning_rate": 4.7502607896179216e-05, + "loss": 0.8384, + "step": 16950 + }, + { + "epoch": 0.14993192948955958, + "grad_norm": 6.7605156898498535, + "learning_rate": 4.7501134508507344e-05, + "loss": 0.9076, + "step": 16960 + }, + { + "epoch": 0.1500203327498718, + "grad_norm": 10.34814167022705, + "learning_rate": 4.749966112083547e-05, + "loss": 0.7882, + "step": 16970 + }, + { + "epoch": 0.15010873601018407, + "grad_norm": 4.3602190017700195, + "learning_rate": 4.74981877331636e-05, + "loss": 0.8297, + "step": 16980 + }, + { + "epoch": 0.1501971392704963, + "grad_norm": 12.673126220703125, + "learning_rate": 4.7496714345491736e-05, + "loss": 0.8272, + "step": 16990 + }, + { + "epoch": 0.15028554253080853, + "grad_norm": 2.8232502937316895, + "learning_rate": 4.749524095781986e-05, + "loss": 0.7984, + "step": 17000 + }, + { + "epoch": 0.1503739457911208, + "grad_norm": 4.268876075744629, + "learning_rate": 4.749376757014799e-05, + "loss": 0.8776, + "step": 17010 + }, + { + "epoch": 0.15046234905143302, + "grad_norm": 3.9623379707336426, + "learning_rate": 4.749229418247612e-05, + "loss": 0.8895, + "step": 17020 + }, + { + "epoch": 0.15055075231174525, + "grad_norm": 5.467015743255615, + "learning_rate": 4.749082079480425e-05, + "loss": 0.8376, + "step": 17030 + }, + { + "epoch": 0.1506391555720575, + "grad_norm": 3.534050941467285, + "learning_rate": 4.748934740713238e-05, + "loss": 0.8534, + "step": 17040 + }, + { + "epoch": 0.15072755883236974, + "grad_norm": 11.9103422164917, + "learning_rate": 4.7487874019460506e-05, + "loss": 0.7452, + "step": 17050 + }, + { + "epoch": 0.15081596209268197, + "grad_norm": 7.80486536026001, + "learning_rate": 4.7486400631788634e-05, + "loss": 0.8906, + "step": 17060 + }, + { + "epoch": 0.15090436535299423, + "grad_norm": 6.0464887619018555, + "learning_rate": 4.748492724411677e-05, + "loss": 0.9138, + "step": 17070 + }, + { + "epoch": 0.15099276861330646, + "grad_norm": 4.2649030685424805, + "learning_rate": 4.748345385644489e-05, + "loss": 0.8548, + "step": 17080 + }, + { + "epoch": 0.1510811718736187, + "grad_norm": 3.9609460830688477, + "learning_rate": 4.7481980468773026e-05, + "loss": 0.8733, + "step": 17090 + }, + { + "epoch": 0.15116957513393095, + "grad_norm": 3.602222204208374, + "learning_rate": 4.7480507081101155e-05, + "loss": 0.8242, + "step": 17100 + }, + { + "epoch": 0.15125797839424318, + "grad_norm": 2.77740478515625, + "learning_rate": 4.747903369342928e-05, + "loss": 0.7811, + "step": 17110 + }, + { + "epoch": 0.1513463816545554, + "grad_norm": 10.50583267211914, + "learning_rate": 4.747756030575741e-05, + "loss": 0.8794, + "step": 17120 + }, + { + "epoch": 0.15143478491486767, + "grad_norm": 7.266853332519531, + "learning_rate": 4.7476086918085546e-05, + "loss": 0.8185, + "step": 17130 + }, + { + "epoch": 0.1515231881751799, + "grad_norm": 6.573174953460693, + "learning_rate": 4.747461353041367e-05, + "loss": 0.8115, + "step": 17140 + }, + { + "epoch": 0.15161159143549213, + "grad_norm": 4.139992713928223, + "learning_rate": 4.74731401427418e-05, + "loss": 0.7369, + "step": 17150 + }, + { + "epoch": 0.1516999946958044, + "grad_norm": 6.643852710723877, + "learning_rate": 4.7471666755069925e-05, + "loss": 0.9176, + "step": 17160 + }, + { + "epoch": 0.15178839795611662, + "grad_norm": 3.8509538173675537, + "learning_rate": 4.747019336739806e-05, + "loss": 0.8128, + "step": 17170 + }, + { + "epoch": 0.15187680121642885, + "grad_norm": 4.570491313934326, + "learning_rate": 4.746871997972619e-05, + "loss": 0.873, + "step": 17180 + }, + { + "epoch": 0.1519652044767411, + "grad_norm": 3.9173784255981445, + "learning_rate": 4.7467246592054317e-05, + "loss": 0.8889, + "step": 17190 + }, + { + "epoch": 0.15205360773705334, + "grad_norm": 6.56118631362915, + "learning_rate": 4.7465773204382445e-05, + "loss": 0.7714, + "step": 17200 + }, + { + "epoch": 0.15214201099736557, + "grad_norm": 7.693734169006348, + "learning_rate": 4.746429981671058e-05, + "loss": 0.8005, + "step": 17210 + }, + { + "epoch": 0.15223041425767783, + "grad_norm": 9.865682601928711, + "learning_rate": 4.74628264290387e-05, + "loss": 0.8259, + "step": 17220 + }, + { + "epoch": 0.15231881751799006, + "grad_norm": 4.894525051116943, + "learning_rate": 4.746135304136684e-05, + "loss": 0.9204, + "step": 17230 + }, + { + "epoch": 0.1524072207783023, + "grad_norm": 4.633498191833496, + "learning_rate": 4.7459879653694965e-05, + "loss": 0.8317, + "step": 17240 + }, + { + "epoch": 0.15249562403861455, + "grad_norm": 2.514660120010376, + "learning_rate": 4.7458406266023093e-05, + "loss": 0.8179, + "step": 17250 + }, + { + "epoch": 0.15258402729892678, + "grad_norm": 3.449573040008545, + "learning_rate": 4.745693287835122e-05, + "loss": 0.739, + "step": 17260 + }, + { + "epoch": 0.15267243055923901, + "grad_norm": 7.599994659423828, + "learning_rate": 4.745545949067935e-05, + "loss": 0.7893, + "step": 17270 + }, + { + "epoch": 0.15276083381955127, + "grad_norm": 5.116689205169678, + "learning_rate": 4.745398610300748e-05, + "loss": 0.8111, + "step": 17280 + }, + { + "epoch": 0.1528492370798635, + "grad_norm": 3.194286346435547, + "learning_rate": 4.7452512715335614e-05, + "loss": 0.8104, + "step": 17290 + }, + { + "epoch": 0.15293764034017573, + "grad_norm": 6.917521953582764, + "learning_rate": 4.7451039327663735e-05, + "loss": 0.9206, + "step": 17300 + }, + { + "epoch": 0.153026043600488, + "grad_norm": 4.247321605682373, + "learning_rate": 4.744956593999187e-05, + "loss": 0.7929, + "step": 17310 + }, + { + "epoch": 0.15311444686080022, + "grad_norm": 3.5399179458618164, + "learning_rate": 4.744809255232e-05, + "loss": 0.8369, + "step": 17320 + }, + { + "epoch": 0.15320285012111245, + "grad_norm": 8.189291000366211, + "learning_rate": 4.744661916464813e-05, + "loss": 0.7917, + "step": 17330 + }, + { + "epoch": 0.1532912533814247, + "grad_norm": 4.0611467361450195, + "learning_rate": 4.7445145776976255e-05, + "loss": 0.8189, + "step": 17340 + }, + { + "epoch": 0.15337965664173694, + "grad_norm": 8.815164566040039, + "learning_rate": 4.744367238930439e-05, + "loss": 0.7376, + "step": 17350 + }, + { + "epoch": 0.15346805990204918, + "grad_norm": 6.515460968017578, + "learning_rate": 4.744219900163251e-05, + "loss": 0.7956, + "step": 17360 + }, + { + "epoch": 0.15355646316236143, + "grad_norm": 7.194701194763184, + "learning_rate": 4.744072561396065e-05, + "loss": 0.8334, + "step": 17370 + }, + { + "epoch": 0.15364486642267366, + "grad_norm": 10.56981086730957, + "learning_rate": 4.743925222628877e-05, + "loss": 0.898, + "step": 17380 + }, + { + "epoch": 0.1537332696829859, + "grad_norm": 6.981093406677246, + "learning_rate": 4.7437778838616904e-05, + "loss": 0.8355, + "step": 17390 + }, + { + "epoch": 0.15382167294329815, + "grad_norm": 7.125991344451904, + "learning_rate": 4.743630545094503e-05, + "loss": 0.8454, + "step": 17400 + }, + { + "epoch": 0.15391007620361039, + "grad_norm": 6.938432216644287, + "learning_rate": 4.743483206327316e-05, + "loss": 0.8408, + "step": 17410 + }, + { + "epoch": 0.15399847946392262, + "grad_norm": 8.310367584228516, + "learning_rate": 4.743335867560129e-05, + "loss": 0.7057, + "step": 17420 + }, + { + "epoch": 0.15408688272423487, + "grad_norm": 5.260408878326416, + "learning_rate": 4.7431885287929424e-05, + "loss": 0.7177, + "step": 17430 + }, + { + "epoch": 0.1541752859845471, + "grad_norm": 5.00064754486084, + "learning_rate": 4.7430411900257546e-05, + "loss": 0.8111, + "step": 17440 + }, + { + "epoch": 0.15426368924485934, + "grad_norm": 4.080453395843506, + "learning_rate": 4.742893851258568e-05, + "loss": 0.9186, + "step": 17450 + }, + { + "epoch": 0.1543520925051716, + "grad_norm": 7.736790180206299, + "learning_rate": 4.742746512491381e-05, + "loss": 0.9419, + "step": 17460 + }, + { + "epoch": 0.15444049576548383, + "grad_norm": 5.28859281539917, + "learning_rate": 4.742599173724194e-05, + "loss": 0.876, + "step": 17470 + }, + { + "epoch": 0.15452889902579608, + "grad_norm": 1.9699537754058838, + "learning_rate": 4.7424518349570066e-05, + "loss": 0.7367, + "step": 17480 + }, + { + "epoch": 0.15461730228610832, + "grad_norm": 4.309940338134766, + "learning_rate": 4.74230449618982e-05, + "loss": 0.8329, + "step": 17490 + }, + { + "epoch": 0.15470570554642055, + "grad_norm": 6.036145210266113, + "learning_rate": 4.742157157422632e-05, + "loss": 0.8226, + "step": 17500 + }, + { + "epoch": 0.1547941088067328, + "grad_norm": 4.646291255950928, + "learning_rate": 4.742009818655446e-05, + "loss": 0.7517, + "step": 17510 + }, + { + "epoch": 0.15488251206704504, + "grad_norm": 2.6001710891723633, + "learning_rate": 4.741862479888258e-05, + "loss": 0.8583, + "step": 17520 + }, + { + "epoch": 0.15497091532735727, + "grad_norm": 2.145974636077881, + "learning_rate": 4.7417151411210715e-05, + "loss": 0.6555, + "step": 17530 + }, + { + "epoch": 0.15505931858766953, + "grad_norm": 3.989692449569702, + "learning_rate": 4.741567802353884e-05, + "loss": 0.8159, + "step": 17540 + }, + { + "epoch": 0.15514772184798176, + "grad_norm": 8.53581714630127, + "learning_rate": 4.741420463586697e-05, + "loss": 0.7689, + "step": 17550 + }, + { + "epoch": 0.155236125108294, + "grad_norm": 6.605713844299316, + "learning_rate": 4.74127312481951e-05, + "loss": 0.8069, + "step": 17560 + }, + { + "epoch": 0.15532452836860625, + "grad_norm": 7.3282999992370605, + "learning_rate": 4.7411257860523235e-05, + "loss": 0.8309, + "step": 17570 + }, + { + "epoch": 0.15541293162891848, + "grad_norm": 4.699496269226074, + "learning_rate": 4.7409784472851356e-05, + "loss": 0.8369, + "step": 17580 + }, + { + "epoch": 0.1555013348892307, + "grad_norm": 1.816676378250122, + "learning_rate": 4.740831108517949e-05, + "loss": 0.7336, + "step": 17590 + }, + { + "epoch": 0.15558973814954297, + "grad_norm": 5.680145263671875, + "learning_rate": 4.740683769750762e-05, + "loss": 0.7462, + "step": 17600 + }, + { + "epoch": 0.1556781414098552, + "grad_norm": 2.0201945304870605, + "learning_rate": 4.740536430983575e-05, + "loss": 0.9124, + "step": 17610 + }, + { + "epoch": 0.15576654467016743, + "grad_norm": 4.1975812911987305, + "learning_rate": 4.7403890922163876e-05, + "loss": 0.7377, + "step": 17620 + }, + { + "epoch": 0.1558549479304797, + "grad_norm": 8.152978897094727, + "learning_rate": 4.7402417534492005e-05, + "loss": 0.8281, + "step": 17630 + }, + { + "epoch": 0.15594335119079192, + "grad_norm": 3.530212163925171, + "learning_rate": 4.740094414682013e-05, + "loss": 0.7422, + "step": 17640 + }, + { + "epoch": 0.15603175445110415, + "grad_norm": 6.836564540863037, + "learning_rate": 4.739947075914827e-05, + "loss": 0.8205, + "step": 17650 + }, + { + "epoch": 0.1561201577114164, + "grad_norm": 6.836387634277344, + "learning_rate": 4.73979973714764e-05, + "loss": 0.9054, + "step": 17660 + }, + { + "epoch": 0.15620856097172864, + "grad_norm": 11.761429786682129, + "learning_rate": 4.7396523983804525e-05, + "loss": 0.7469, + "step": 17670 + }, + { + "epoch": 0.15629696423204087, + "grad_norm": 19.648902893066406, + "learning_rate": 4.739505059613265e-05, + "loss": 0.9581, + "step": 17680 + }, + { + "epoch": 0.15638536749235313, + "grad_norm": 5.903117656707764, + "learning_rate": 4.739357720846078e-05, + "loss": 0.7907, + "step": 17690 + }, + { + "epoch": 0.15647377075266536, + "grad_norm": 9.697010040283203, + "learning_rate": 4.739210382078891e-05, + "loss": 0.8265, + "step": 17700 + }, + { + "epoch": 0.1565621740129776, + "grad_norm": 4.563940525054932, + "learning_rate": 4.7390630433117045e-05, + "loss": 0.8707, + "step": 17710 + }, + { + "epoch": 0.15665057727328985, + "grad_norm": 4.969250202178955, + "learning_rate": 4.7389157045445174e-05, + "loss": 0.9309, + "step": 17720 + }, + { + "epoch": 0.15673898053360208, + "grad_norm": 3.407184600830078, + "learning_rate": 4.73876836577733e-05, + "loss": 0.7936, + "step": 17730 + }, + { + "epoch": 0.1568273837939143, + "grad_norm": 7.285817623138428, + "learning_rate": 4.738621027010143e-05, + "loss": 0.7602, + "step": 17740 + }, + { + "epoch": 0.15691578705422657, + "grad_norm": 2.969336748123169, + "learning_rate": 4.738473688242956e-05, + "loss": 0.6712, + "step": 17750 + }, + { + "epoch": 0.1570041903145388, + "grad_norm": 5.654608249664307, + "learning_rate": 4.738326349475769e-05, + "loss": 0.8435, + "step": 17760 + }, + { + "epoch": 0.15709259357485103, + "grad_norm": 6.064638137817383, + "learning_rate": 4.7381790107085815e-05, + "loss": 0.7975, + "step": 17770 + }, + { + "epoch": 0.1571809968351633, + "grad_norm": 8.733338356018066, + "learning_rate": 4.738031671941395e-05, + "loss": 0.7818, + "step": 17780 + }, + { + "epoch": 0.15726940009547552, + "grad_norm": 8.815160751342773, + "learning_rate": 4.737884333174208e-05, + "loss": 0.9034, + "step": 17790 + }, + { + "epoch": 0.15735780335578775, + "grad_norm": 11.449951171875, + "learning_rate": 4.737736994407021e-05, + "loss": 0.8046, + "step": 17800 + }, + { + "epoch": 0.1574462066161, + "grad_norm": 2.8350718021392822, + "learning_rate": 4.7375896556398336e-05, + "loss": 0.7466, + "step": 17810 + }, + { + "epoch": 0.15753460987641224, + "grad_norm": 6.78036642074585, + "learning_rate": 4.7374423168726464e-05, + "loss": 0.8652, + "step": 17820 + }, + { + "epoch": 0.15762301313672447, + "grad_norm": 14.450384140014648, + "learning_rate": 4.737294978105459e-05, + "loss": 0.7937, + "step": 17830 + }, + { + "epoch": 0.15771141639703673, + "grad_norm": 3.3525030612945557, + "learning_rate": 4.737147639338273e-05, + "loss": 0.7093, + "step": 17840 + }, + { + "epoch": 0.15779981965734896, + "grad_norm": 4.203586101531982, + "learning_rate": 4.737000300571085e-05, + "loss": 0.9038, + "step": 17850 + }, + { + "epoch": 0.1578882229176612, + "grad_norm": 5.063088417053223, + "learning_rate": 4.7368529618038984e-05, + "loss": 0.7865, + "step": 17860 + }, + { + "epoch": 0.15797662617797345, + "grad_norm": 3.026818037033081, + "learning_rate": 4.736705623036711e-05, + "loss": 0.9251, + "step": 17870 + }, + { + "epoch": 0.15806502943828568, + "grad_norm": 3.864360809326172, + "learning_rate": 4.736558284269524e-05, + "loss": 0.892, + "step": 17880 + }, + { + "epoch": 0.1581534326985979, + "grad_norm": 6.614564418792725, + "learning_rate": 4.736410945502337e-05, + "loss": 0.747, + "step": 17890 + }, + { + "epoch": 0.15824183595891017, + "grad_norm": 4.892355442047119, + "learning_rate": 4.7362636067351504e-05, + "loss": 0.9152, + "step": 17900 + }, + { + "epoch": 0.1583302392192224, + "grad_norm": 5.17777156829834, + "learning_rate": 4.7361162679679626e-05, + "loss": 0.8945, + "step": 17910 + }, + { + "epoch": 0.15841864247953463, + "grad_norm": 7.7891435623168945, + "learning_rate": 4.735968929200776e-05, + "loss": 1.0744, + "step": 17920 + }, + { + "epoch": 0.1585070457398469, + "grad_norm": 3.0126953125, + "learning_rate": 4.735821590433589e-05, + "loss": 0.8189, + "step": 17930 + }, + { + "epoch": 0.15859544900015912, + "grad_norm": 3.6667325496673584, + "learning_rate": 4.735674251666402e-05, + "loss": 0.8698, + "step": 17940 + }, + { + "epoch": 0.15868385226047135, + "grad_norm": 5.9046549797058105, + "learning_rate": 4.7355269128992146e-05, + "loss": 0.7714, + "step": 17950 + }, + { + "epoch": 0.1587722555207836, + "grad_norm": 6.023751735687256, + "learning_rate": 4.735379574132028e-05, + "loss": 0.7184, + "step": 17960 + }, + { + "epoch": 0.15886065878109584, + "grad_norm": 2.8186686038970947, + "learning_rate": 4.73523223536484e-05, + "loss": 0.7829, + "step": 17970 + }, + { + "epoch": 0.15894906204140807, + "grad_norm": 6.186093330383301, + "learning_rate": 4.735084896597654e-05, + "loss": 0.9277, + "step": 17980 + }, + { + "epoch": 0.15903746530172033, + "grad_norm": 11.59505558013916, + "learning_rate": 4.734937557830466e-05, + "loss": 0.7751, + "step": 17990 + }, + { + "epoch": 0.15912586856203256, + "grad_norm": 5.0415120124816895, + "learning_rate": 4.7347902190632795e-05, + "loss": 0.8904, + "step": 18000 + }, + { + "epoch": 0.15921427182234482, + "grad_norm": 3.4007201194763184, + "learning_rate": 4.734642880296092e-05, + "loss": 0.9191, + "step": 18010 + }, + { + "epoch": 0.15930267508265705, + "grad_norm": 3.271740436553955, + "learning_rate": 4.734495541528905e-05, + "loss": 0.7514, + "step": 18020 + }, + { + "epoch": 0.15939107834296928, + "grad_norm": 4.815491676330566, + "learning_rate": 4.734348202761718e-05, + "loss": 0.8331, + "step": 18030 + }, + { + "epoch": 0.15947948160328154, + "grad_norm": 6.87015962600708, + "learning_rate": 4.7342008639945315e-05, + "loss": 0.8161, + "step": 18040 + }, + { + "epoch": 0.15956788486359377, + "grad_norm": 6.4885573387146, + "learning_rate": 4.7340535252273436e-05, + "loss": 0.8169, + "step": 18050 + }, + { + "epoch": 0.159656288123906, + "grad_norm": 3.9206607341766357, + "learning_rate": 4.733906186460157e-05, + "loss": 0.8445, + "step": 18060 + }, + { + "epoch": 0.15974469138421826, + "grad_norm": 3.4145724773406982, + "learning_rate": 4.73375884769297e-05, + "loss": 0.8926, + "step": 18070 + }, + { + "epoch": 0.1598330946445305, + "grad_norm": 3.1308720111846924, + "learning_rate": 4.733611508925783e-05, + "loss": 0.7923, + "step": 18080 + }, + { + "epoch": 0.15992149790484272, + "grad_norm": 2.564180850982666, + "learning_rate": 4.7334641701585957e-05, + "loss": 0.874, + "step": 18090 + }, + { + "epoch": 0.16000990116515498, + "grad_norm": 2.432877540588379, + "learning_rate": 4.7333168313914085e-05, + "loss": 0.8792, + "step": 18100 + }, + { + "epoch": 0.1600983044254672, + "grad_norm": 5.122593879699707, + "learning_rate": 4.733169492624221e-05, + "loss": 0.7577, + "step": 18110 + }, + { + "epoch": 0.16018670768577944, + "grad_norm": 5.265462875366211, + "learning_rate": 4.733022153857035e-05, + "loss": 0.8311, + "step": 18120 + }, + { + "epoch": 0.1602751109460917, + "grad_norm": 5.594770431518555, + "learning_rate": 4.732874815089847e-05, + "loss": 0.8619, + "step": 18130 + }, + { + "epoch": 0.16036351420640393, + "grad_norm": 9.650065422058105, + "learning_rate": 4.7327274763226605e-05, + "loss": 0.8059, + "step": 18140 + }, + { + "epoch": 0.16045191746671617, + "grad_norm": 7.045647144317627, + "learning_rate": 4.7325801375554733e-05, + "loss": 0.9249, + "step": 18150 + }, + { + "epoch": 0.16054032072702842, + "grad_norm": 5.509271621704102, + "learning_rate": 4.732432798788286e-05, + "loss": 0.7212, + "step": 18160 + }, + { + "epoch": 0.16062872398734065, + "grad_norm": 7.598241806030273, + "learning_rate": 4.732285460021099e-05, + "loss": 0.7302, + "step": 18170 + }, + { + "epoch": 0.16071712724765289, + "grad_norm": 5.693346977233887, + "learning_rate": 4.7321381212539125e-05, + "loss": 0.787, + "step": 18180 + }, + { + "epoch": 0.16080553050796514, + "grad_norm": 7.060309410095215, + "learning_rate": 4.731990782486725e-05, + "loss": 0.7158, + "step": 18190 + }, + { + "epoch": 0.16089393376827738, + "grad_norm": 8.715744972229004, + "learning_rate": 4.731843443719538e-05, + "loss": 0.8629, + "step": 18200 + }, + { + "epoch": 0.1609823370285896, + "grad_norm": 5.086950302124023, + "learning_rate": 4.7316961049523504e-05, + "loss": 0.8574, + "step": 18210 + }, + { + "epoch": 0.16107074028890186, + "grad_norm": 2.8243017196655273, + "learning_rate": 4.731548766185164e-05, + "loss": 0.6888, + "step": 18220 + }, + { + "epoch": 0.1611591435492141, + "grad_norm": 2.8839316368103027, + "learning_rate": 4.731401427417977e-05, + "loss": 0.8373, + "step": 18230 + }, + { + "epoch": 0.16124754680952633, + "grad_norm": 5.097020149230957, + "learning_rate": 4.7312540886507895e-05, + "loss": 0.788, + "step": 18240 + }, + { + "epoch": 0.16133595006983859, + "grad_norm": 5.074483871459961, + "learning_rate": 4.7311067498836024e-05, + "loss": 0.7805, + "step": 18250 + }, + { + "epoch": 0.16142435333015082, + "grad_norm": 4.021812915802002, + "learning_rate": 4.730959411116416e-05, + "loss": 0.7763, + "step": 18260 + }, + { + "epoch": 0.16151275659046305, + "grad_norm": 7.153144836425781, + "learning_rate": 4.730812072349228e-05, + "loss": 0.9315, + "step": 18270 + }, + { + "epoch": 0.1616011598507753, + "grad_norm": 3.6252365112304688, + "learning_rate": 4.7306647335820416e-05, + "loss": 0.8893, + "step": 18280 + }, + { + "epoch": 0.16168956311108754, + "grad_norm": 7.570452690124512, + "learning_rate": 4.7305173948148544e-05, + "loss": 0.7164, + "step": 18290 + }, + { + "epoch": 0.16177796637139977, + "grad_norm": 3.345590591430664, + "learning_rate": 4.730370056047667e-05, + "loss": 0.8292, + "step": 18300 + }, + { + "epoch": 0.16186636963171203, + "grad_norm": 8.756311416625977, + "learning_rate": 4.73022271728048e-05, + "loss": 0.8451, + "step": 18310 + }, + { + "epoch": 0.16195477289202426, + "grad_norm": 7.167171478271484, + "learning_rate": 4.730075378513293e-05, + "loss": 0.8121, + "step": 18320 + }, + { + "epoch": 0.1620431761523365, + "grad_norm": 2.588923454284668, + "learning_rate": 4.729928039746106e-05, + "loss": 0.8351, + "step": 18330 + }, + { + "epoch": 0.16213157941264875, + "grad_norm": 5.5366997718811035, + "learning_rate": 4.729780700978919e-05, + "loss": 0.7912, + "step": 18340 + }, + { + "epoch": 0.16221998267296098, + "grad_norm": 4.38487434387207, + "learning_rate": 4.7296333622117314e-05, + "loss": 0.7493, + "step": 18350 + }, + { + "epoch": 0.1623083859332732, + "grad_norm": 4.288943767547607, + "learning_rate": 4.729486023444545e-05, + "loss": 0.8852, + "step": 18360 + }, + { + "epoch": 0.16239678919358547, + "grad_norm": 4.566148281097412, + "learning_rate": 4.729338684677358e-05, + "loss": 0.849, + "step": 18370 + }, + { + "epoch": 0.1624851924538977, + "grad_norm": 6.831437587738037, + "learning_rate": 4.7291913459101706e-05, + "loss": 0.7716, + "step": 18380 + }, + { + "epoch": 0.16257359571420993, + "grad_norm": 6.257946491241455, + "learning_rate": 4.7290440071429834e-05, + "loss": 0.8591, + "step": 18390 + }, + { + "epoch": 0.1626619989745222, + "grad_norm": 3.754155397415161, + "learning_rate": 4.728896668375797e-05, + "loss": 0.8443, + "step": 18400 + }, + { + "epoch": 0.16275040223483442, + "grad_norm": 6.450690269470215, + "learning_rate": 4.728749329608609e-05, + "loss": 0.8167, + "step": 18410 + }, + { + "epoch": 0.16283880549514665, + "grad_norm": 4.664546489715576, + "learning_rate": 4.7286019908414226e-05, + "loss": 0.7837, + "step": 18420 + }, + { + "epoch": 0.1629272087554589, + "grad_norm": 7.001297473907471, + "learning_rate": 4.7284546520742354e-05, + "loss": 0.7788, + "step": 18430 + }, + { + "epoch": 0.16301561201577114, + "grad_norm": 2.9480931758880615, + "learning_rate": 4.728307313307048e-05, + "loss": 0.8633, + "step": 18440 + }, + { + "epoch": 0.16310401527608337, + "grad_norm": 4.539743423461914, + "learning_rate": 4.728159974539861e-05, + "loss": 0.6939, + "step": 18450 + }, + { + "epoch": 0.16319241853639563, + "grad_norm": 1.7249478101730347, + "learning_rate": 4.728012635772674e-05, + "loss": 0.7591, + "step": 18460 + }, + { + "epoch": 0.16328082179670786, + "grad_norm": 4.965416431427002, + "learning_rate": 4.727865297005487e-05, + "loss": 0.8193, + "step": 18470 + }, + { + "epoch": 0.1633692250570201, + "grad_norm": 3.3861422538757324, + "learning_rate": 4.7277179582383e-05, + "loss": 0.7937, + "step": 18480 + }, + { + "epoch": 0.16345762831733235, + "grad_norm": 4.316111087799072, + "learning_rate": 4.7275706194711125e-05, + "loss": 0.8458, + "step": 18490 + }, + { + "epoch": 0.16354603157764458, + "grad_norm": 3.774136543273926, + "learning_rate": 4.727423280703926e-05, + "loss": 0.7364, + "step": 18500 + }, + { + "epoch": 0.1636344348379568, + "grad_norm": 4.713778972625732, + "learning_rate": 4.727275941936739e-05, + "loss": 0.8164, + "step": 18510 + }, + { + "epoch": 0.16372283809826907, + "grad_norm": 3.9100100994110107, + "learning_rate": 4.7271286031695516e-05, + "loss": 0.8595, + "step": 18520 + }, + { + "epoch": 0.1638112413585813, + "grad_norm": 4.755215167999268, + "learning_rate": 4.7269812644023645e-05, + "loss": 0.7251, + "step": 18530 + }, + { + "epoch": 0.16389964461889356, + "grad_norm": 5.091159820556641, + "learning_rate": 4.726833925635178e-05, + "loss": 0.8085, + "step": 18540 + }, + { + "epoch": 0.1639880478792058, + "grad_norm": 4.025550365447998, + "learning_rate": 4.72668658686799e-05, + "loss": 0.9259, + "step": 18550 + }, + { + "epoch": 0.16407645113951802, + "grad_norm": 4.209859848022461, + "learning_rate": 4.7265392481008037e-05, + "loss": 0.7756, + "step": 18560 + }, + { + "epoch": 0.16416485439983028, + "grad_norm": 10.84984302520752, + "learning_rate": 4.7263919093336165e-05, + "loss": 0.8719, + "step": 18570 + }, + { + "epoch": 0.1642532576601425, + "grad_norm": 2.2355659008026123, + "learning_rate": 4.726244570566429e-05, + "loss": 0.8152, + "step": 18580 + }, + { + "epoch": 0.16434166092045474, + "grad_norm": 8.716652870178223, + "learning_rate": 4.726097231799242e-05, + "loss": 0.97, + "step": 18590 + }, + { + "epoch": 0.164430064180767, + "grad_norm": 2.814476728439331, + "learning_rate": 4.725949893032055e-05, + "loss": 0.7929, + "step": 18600 + }, + { + "epoch": 0.16451846744107923, + "grad_norm": 4.9129414558410645, + "learning_rate": 4.725802554264868e-05, + "loss": 0.8613, + "step": 18610 + }, + { + "epoch": 0.16460687070139146, + "grad_norm": 3.8735432624816895, + "learning_rate": 4.7256552154976814e-05, + "loss": 0.788, + "step": 18620 + }, + { + "epoch": 0.16469527396170372, + "grad_norm": 2.8175761699676514, + "learning_rate": 4.725507876730494e-05, + "loss": 0.7751, + "step": 18630 + }, + { + "epoch": 0.16478367722201595, + "grad_norm": 6.7121500968933105, + "learning_rate": 4.725360537963307e-05, + "loss": 0.8249, + "step": 18640 + }, + { + "epoch": 0.16487208048232818, + "grad_norm": 7.731828212738037, + "learning_rate": 4.72521319919612e-05, + "loss": 0.792, + "step": 18650 + }, + { + "epoch": 0.16496048374264044, + "grad_norm": 4.961273670196533, + "learning_rate": 4.725065860428933e-05, + "loss": 0.7956, + "step": 18660 + }, + { + "epoch": 0.16504888700295267, + "grad_norm": 4.998683929443359, + "learning_rate": 4.724918521661746e-05, + "loss": 0.793, + "step": 18670 + }, + { + "epoch": 0.1651372902632649, + "grad_norm": 6.564632892608643, + "learning_rate": 4.7247711828945584e-05, + "loss": 0.7985, + "step": 18680 + }, + { + "epoch": 0.16522569352357716, + "grad_norm": 4.772511005401611, + "learning_rate": 4.724623844127372e-05, + "loss": 0.853, + "step": 18690 + }, + { + "epoch": 0.1653140967838894, + "grad_norm": 2.7908151149749756, + "learning_rate": 4.724476505360185e-05, + "loss": 0.9016, + "step": 18700 + }, + { + "epoch": 0.16540250004420162, + "grad_norm": 3.49704909324646, + "learning_rate": 4.7243291665929975e-05, + "loss": 0.751, + "step": 18710 + }, + { + "epoch": 0.16549090330451388, + "grad_norm": 8.343111038208008, + "learning_rate": 4.7241818278258104e-05, + "loss": 0.6942, + "step": 18720 + }, + { + "epoch": 0.1655793065648261, + "grad_norm": 10.985702514648438, + "learning_rate": 4.724034489058624e-05, + "loss": 0.8471, + "step": 18730 + }, + { + "epoch": 0.16566770982513834, + "grad_norm": 8.541181564331055, + "learning_rate": 4.723887150291436e-05, + "loss": 0.8589, + "step": 18740 + }, + { + "epoch": 0.1657561130854506, + "grad_norm": 3.302056074142456, + "learning_rate": 4.7237398115242496e-05, + "loss": 0.7596, + "step": 18750 + }, + { + "epoch": 0.16584451634576283, + "grad_norm": 2.5116047859191895, + "learning_rate": 4.7235924727570624e-05, + "loss": 0.8419, + "step": 18760 + }, + { + "epoch": 0.16593291960607506, + "grad_norm": 3.3291523456573486, + "learning_rate": 4.723445133989875e-05, + "loss": 0.7754, + "step": 18770 + }, + { + "epoch": 0.16602132286638732, + "grad_norm": 4.486542701721191, + "learning_rate": 4.723297795222688e-05, + "loss": 0.7598, + "step": 18780 + }, + { + "epoch": 0.16610972612669955, + "grad_norm": 3.321672201156616, + "learning_rate": 4.7231504564555016e-05, + "loss": 0.616, + "step": 18790 + }, + { + "epoch": 0.16619812938701178, + "grad_norm": 3.0971567630767822, + "learning_rate": 4.723003117688314e-05, + "loss": 0.7112, + "step": 18800 + }, + { + "epoch": 0.16628653264732404, + "grad_norm": 3.232220411300659, + "learning_rate": 4.722855778921127e-05, + "loss": 0.815, + "step": 18810 + }, + { + "epoch": 0.16637493590763627, + "grad_norm": 4.4582295417785645, + "learning_rate": 4.7227084401539394e-05, + "loss": 0.7364, + "step": 18820 + }, + { + "epoch": 0.1664633391679485, + "grad_norm": 3.4270033836364746, + "learning_rate": 4.722561101386753e-05, + "loss": 0.6765, + "step": 18830 + }, + { + "epoch": 0.16655174242826076, + "grad_norm": 5.478305816650391, + "learning_rate": 4.722413762619566e-05, + "loss": 0.8738, + "step": 18840 + }, + { + "epoch": 0.166640145688573, + "grad_norm": 4.589667797088623, + "learning_rate": 4.7222664238523786e-05, + "loss": 0.7627, + "step": 18850 + }, + { + "epoch": 0.16672854894888522, + "grad_norm": 3.996263265609741, + "learning_rate": 4.7221190850851914e-05, + "loss": 0.6948, + "step": 18860 + }, + { + "epoch": 0.16681695220919748, + "grad_norm": 3.293722152709961, + "learning_rate": 4.721971746318005e-05, + "loss": 0.8482, + "step": 18870 + }, + { + "epoch": 0.16690535546950971, + "grad_norm": 9.512335777282715, + "learning_rate": 4.721824407550817e-05, + "loss": 0.7064, + "step": 18880 + }, + { + "epoch": 0.16699375872982195, + "grad_norm": 5.593800067901611, + "learning_rate": 4.7216770687836306e-05, + "loss": 0.843, + "step": 18890 + }, + { + "epoch": 0.1670821619901342, + "grad_norm": 3.658989191055298, + "learning_rate": 4.7215297300164435e-05, + "loss": 0.8077, + "step": 18900 + }, + { + "epoch": 0.16717056525044643, + "grad_norm": 7.332565784454346, + "learning_rate": 4.721382391249256e-05, + "loss": 0.7967, + "step": 18910 + }, + { + "epoch": 0.16725896851075867, + "grad_norm": 6.507866382598877, + "learning_rate": 4.721235052482069e-05, + "loss": 0.8039, + "step": 18920 + }, + { + "epoch": 0.16734737177107092, + "grad_norm": 10.709056854248047, + "learning_rate": 4.721087713714882e-05, + "loss": 0.751, + "step": 18930 + }, + { + "epoch": 0.16743577503138315, + "grad_norm": 7.497487545013428, + "learning_rate": 4.720940374947695e-05, + "loss": 0.9073, + "step": 18940 + }, + { + "epoch": 0.16752417829169539, + "grad_norm": 6.352433204650879, + "learning_rate": 4.720793036180508e-05, + "loss": 0.8455, + "step": 18950 + }, + { + "epoch": 0.16761258155200764, + "grad_norm": 3.689246892929077, + "learning_rate": 4.7206456974133205e-05, + "loss": 0.8592, + "step": 18960 + }, + { + "epoch": 0.16770098481231988, + "grad_norm": 3.891657829284668, + "learning_rate": 4.720498358646134e-05, + "loss": 0.8722, + "step": 18970 + }, + { + "epoch": 0.1677893880726321, + "grad_norm": 4.450270652770996, + "learning_rate": 4.720351019878947e-05, + "loss": 0.7952, + "step": 18980 + }, + { + "epoch": 0.16787779133294436, + "grad_norm": 4.3971781730651855, + "learning_rate": 4.7202036811117596e-05, + "loss": 0.7979, + "step": 18990 + }, + { + "epoch": 0.1679661945932566, + "grad_norm": 4.213810920715332, + "learning_rate": 4.7200563423445725e-05, + "loss": 0.7506, + "step": 19000 + }, + { + "epoch": 0.16805459785356883, + "grad_norm": 4.515845775604248, + "learning_rate": 4.719909003577386e-05, + "loss": 0.8104, + "step": 19010 + }, + { + "epoch": 0.16814300111388109, + "grad_norm": 7.110860824584961, + "learning_rate": 4.719761664810198e-05, + "loss": 0.7089, + "step": 19020 + }, + { + "epoch": 0.16823140437419332, + "grad_norm": 4.53788948059082, + "learning_rate": 4.719614326043012e-05, + "loss": 0.8878, + "step": 19030 + }, + { + "epoch": 0.16831980763450555, + "grad_norm": 8.444923400878906, + "learning_rate": 4.719466987275824e-05, + "loss": 0.7488, + "step": 19040 + }, + { + "epoch": 0.1684082108948178, + "grad_norm": 3.3344855308532715, + "learning_rate": 4.719319648508637e-05, + "loss": 0.8422, + "step": 19050 + }, + { + "epoch": 0.16849661415513004, + "grad_norm": 4.313567161560059, + "learning_rate": 4.71917230974145e-05, + "loss": 0.772, + "step": 19060 + }, + { + "epoch": 0.1685850174154423, + "grad_norm": 7.400096416473389, + "learning_rate": 4.719024970974263e-05, + "loss": 0.7867, + "step": 19070 + }, + { + "epoch": 0.16867342067575453, + "grad_norm": 7.661045074462891, + "learning_rate": 4.718877632207076e-05, + "loss": 0.7241, + "step": 19080 + }, + { + "epoch": 0.16876182393606676, + "grad_norm": 4.5807271003723145, + "learning_rate": 4.7187302934398894e-05, + "loss": 0.7532, + "step": 19090 + }, + { + "epoch": 0.16885022719637902, + "grad_norm": 5.509703636169434, + "learning_rate": 4.7185829546727015e-05, + "loss": 0.6222, + "step": 19100 + }, + { + "epoch": 0.16893863045669125, + "grad_norm": 7.484439373016357, + "learning_rate": 4.718435615905515e-05, + "loss": 0.7193, + "step": 19110 + }, + { + "epoch": 0.16902703371700348, + "grad_norm": 7.234420299530029, + "learning_rate": 4.718288277138328e-05, + "loss": 0.9654, + "step": 19120 + }, + { + "epoch": 0.16911543697731574, + "grad_norm": 5.691709041595459, + "learning_rate": 4.718140938371141e-05, + "loss": 0.7726, + "step": 19130 + }, + { + "epoch": 0.16920384023762797, + "grad_norm": 5.517917156219482, + "learning_rate": 4.7179935996039535e-05, + "loss": 0.9405, + "step": 19140 + }, + { + "epoch": 0.1692922434979402, + "grad_norm": 4.151499271392822, + "learning_rate": 4.7178462608367664e-05, + "loss": 0.7854, + "step": 19150 + }, + { + "epoch": 0.16938064675825246, + "grad_norm": 8.64465618133545, + "learning_rate": 4.717698922069579e-05, + "loss": 0.9154, + "step": 19160 + }, + { + "epoch": 0.1694690500185647, + "grad_norm": 4.667221546173096, + "learning_rate": 4.717551583302393e-05, + "loss": 0.8714, + "step": 19170 + }, + { + "epoch": 0.16955745327887692, + "grad_norm": 2.7628471851348877, + "learning_rate": 4.717404244535205e-05, + "loss": 0.8054, + "step": 19180 + }, + { + "epoch": 0.16964585653918918, + "grad_norm": 2.8770718574523926, + "learning_rate": 4.7172569057680184e-05, + "loss": 0.7926, + "step": 19190 + }, + { + "epoch": 0.1697342597995014, + "grad_norm": 11.768014907836914, + "learning_rate": 4.717109567000831e-05, + "loss": 0.8588, + "step": 19200 + }, + { + "epoch": 0.16982266305981364, + "grad_norm": 6.627838611602783, + "learning_rate": 4.716962228233644e-05, + "loss": 0.7746, + "step": 19210 + }, + { + "epoch": 0.1699110663201259, + "grad_norm": 4.2306294441223145, + "learning_rate": 4.716814889466457e-05, + "loss": 0.6987, + "step": 19220 + }, + { + "epoch": 0.16999946958043813, + "grad_norm": 2.051795244216919, + "learning_rate": 4.7166675506992704e-05, + "loss": 0.7172, + "step": 19230 + }, + { + "epoch": 0.17008787284075036, + "grad_norm": 3.473698377609253, + "learning_rate": 4.7165202119320826e-05, + "loss": 0.7591, + "step": 19240 + }, + { + "epoch": 0.17017627610106262, + "grad_norm": 3.525426149368286, + "learning_rate": 4.716372873164896e-05, + "loss": 0.7131, + "step": 19250 + }, + { + "epoch": 0.17026467936137485, + "grad_norm": 1.7267205715179443, + "learning_rate": 4.716225534397709e-05, + "loss": 0.6918, + "step": 19260 + }, + { + "epoch": 0.17035308262168708, + "grad_norm": 11.006839752197266, + "learning_rate": 4.716078195630522e-05, + "loss": 0.8123, + "step": 19270 + }, + { + "epoch": 0.17044148588199934, + "grad_norm": 5.82366943359375, + "learning_rate": 4.7159308568633346e-05, + "loss": 0.749, + "step": 19280 + }, + { + "epoch": 0.17052988914231157, + "grad_norm": 6.12000036239624, + "learning_rate": 4.7157835180961474e-05, + "loss": 0.6957, + "step": 19290 + }, + { + "epoch": 0.1706182924026238, + "grad_norm": 8.563779830932617, + "learning_rate": 4.71563617932896e-05, + "loss": 0.7675, + "step": 19300 + }, + { + "epoch": 0.17070669566293606, + "grad_norm": 4.752929210662842, + "learning_rate": 4.715488840561774e-05, + "loss": 0.7587, + "step": 19310 + }, + { + "epoch": 0.1707950989232483, + "grad_norm": 1.848655343055725, + "learning_rate": 4.715341501794586e-05, + "loss": 0.6361, + "step": 19320 + }, + { + "epoch": 0.17088350218356052, + "grad_norm": 11.473823547363281, + "learning_rate": 4.7151941630273994e-05, + "loss": 0.8394, + "step": 19330 + }, + { + "epoch": 0.17097190544387278, + "grad_norm": 5.776627540588379, + "learning_rate": 4.715046824260212e-05, + "loss": 0.7531, + "step": 19340 + }, + { + "epoch": 0.171060308704185, + "grad_norm": 3.338092088699341, + "learning_rate": 4.714899485493025e-05, + "loss": 0.8278, + "step": 19350 + }, + { + "epoch": 0.17114871196449724, + "grad_norm": 3.929267406463623, + "learning_rate": 4.714752146725838e-05, + "loss": 0.7747, + "step": 19360 + }, + { + "epoch": 0.1712371152248095, + "grad_norm": 11.310409545898438, + "learning_rate": 4.7146048079586515e-05, + "loss": 0.7816, + "step": 19370 + }, + { + "epoch": 0.17132551848512173, + "grad_norm": 3.5103700160980225, + "learning_rate": 4.7144574691914636e-05, + "loss": 0.8185, + "step": 19380 + }, + { + "epoch": 0.17141392174543396, + "grad_norm": 3.7547149658203125, + "learning_rate": 4.714310130424277e-05, + "loss": 0.731, + "step": 19390 + }, + { + "epoch": 0.17150232500574622, + "grad_norm": 4.463881969451904, + "learning_rate": 4.71416279165709e-05, + "loss": 0.7501, + "step": 19400 + }, + { + "epoch": 0.17159072826605845, + "grad_norm": 4.538525104522705, + "learning_rate": 4.714015452889903e-05, + "loss": 0.9555, + "step": 19410 + }, + { + "epoch": 0.17167913152637068, + "grad_norm": 5.343875408172607, + "learning_rate": 4.7138681141227156e-05, + "loss": 0.8399, + "step": 19420 + }, + { + "epoch": 0.17176753478668294, + "grad_norm": 5.001431941986084, + "learning_rate": 4.7137207753555285e-05, + "loss": 0.8188, + "step": 19430 + }, + { + "epoch": 0.17185593804699517, + "grad_norm": 5.407555103302002, + "learning_rate": 4.713573436588341e-05, + "loss": 0.6176, + "step": 19440 + }, + { + "epoch": 0.1719443413073074, + "grad_norm": 3.2855517864227295, + "learning_rate": 4.713426097821155e-05, + "loss": 0.8608, + "step": 19450 + }, + { + "epoch": 0.17203274456761966, + "grad_norm": 10.408510208129883, + "learning_rate": 4.7132787590539677e-05, + "loss": 0.7515, + "step": 19460 + }, + { + "epoch": 0.1721211478279319, + "grad_norm": 5.119137763977051, + "learning_rate": 4.7131314202867805e-05, + "loss": 0.8096, + "step": 19470 + }, + { + "epoch": 0.17220955108824412, + "grad_norm": 4.531383991241455, + "learning_rate": 4.712984081519593e-05, + "loss": 0.8578, + "step": 19480 + }, + { + "epoch": 0.17229795434855638, + "grad_norm": 5.950822830200195, + "learning_rate": 4.712836742752406e-05, + "loss": 0.8478, + "step": 19490 + }, + { + "epoch": 0.1723863576088686, + "grad_norm": 8.27846908569336, + "learning_rate": 4.712689403985219e-05, + "loss": 0.787, + "step": 19500 + }, + { + "epoch": 0.17247476086918084, + "grad_norm": 4.818996429443359, + "learning_rate": 4.712542065218032e-05, + "loss": 0.7874, + "step": 19510 + }, + { + "epoch": 0.1725631641294931, + "grad_norm": 8.866082191467285, + "learning_rate": 4.7123947264508453e-05, + "loss": 0.8239, + "step": 19520 + }, + { + "epoch": 0.17265156738980533, + "grad_norm": 5.025813579559326, + "learning_rate": 4.712247387683658e-05, + "loss": 0.7968, + "step": 19530 + }, + { + "epoch": 0.17273997065011756, + "grad_norm": 3.0801727771759033, + "learning_rate": 4.712100048916471e-05, + "loss": 0.7977, + "step": 19540 + }, + { + "epoch": 0.17282837391042982, + "grad_norm": 2.6314749717712402, + "learning_rate": 4.711952710149284e-05, + "loss": 0.8456, + "step": 19550 + }, + { + "epoch": 0.17291677717074205, + "grad_norm": 7.432890892028809, + "learning_rate": 4.711805371382097e-05, + "loss": 0.8546, + "step": 19560 + }, + { + "epoch": 0.17300518043105428, + "grad_norm": 7.585647106170654, + "learning_rate": 4.7116580326149095e-05, + "loss": 0.8915, + "step": 19570 + }, + { + "epoch": 0.17309358369136654, + "grad_norm": 5.4618988037109375, + "learning_rate": 4.711510693847723e-05, + "loss": 0.7779, + "step": 19580 + }, + { + "epoch": 0.17318198695167877, + "grad_norm": 4.218719959259033, + "learning_rate": 4.711363355080536e-05, + "loss": 0.8346, + "step": 19590 + }, + { + "epoch": 0.173270390211991, + "grad_norm": 5.693262577056885, + "learning_rate": 4.711216016313349e-05, + "loss": 0.8077, + "step": 19600 + }, + { + "epoch": 0.17335879347230326, + "grad_norm": 3.136305332183838, + "learning_rate": 4.7110686775461615e-05, + "loss": 0.7089, + "step": 19610 + }, + { + "epoch": 0.1734471967326155, + "grad_norm": 10.014686584472656, + "learning_rate": 4.7109213387789744e-05, + "loss": 0.8659, + "step": 19620 + }, + { + "epoch": 0.17353559999292775, + "grad_norm": 4.309294700622559, + "learning_rate": 4.710774000011787e-05, + "loss": 0.851, + "step": 19630 + }, + { + "epoch": 0.17362400325323998, + "grad_norm": 3.900099277496338, + "learning_rate": 4.710626661244601e-05, + "loss": 0.9194, + "step": 19640 + }, + { + "epoch": 0.17371240651355221, + "grad_norm": 8.823202133178711, + "learning_rate": 4.710479322477413e-05, + "loss": 0.7955, + "step": 19650 + }, + { + "epoch": 0.17380080977386447, + "grad_norm": 3.3904662132263184, + "learning_rate": 4.7103319837102264e-05, + "loss": 0.8078, + "step": 19660 + }, + { + "epoch": 0.1738892130341767, + "grad_norm": 2.620433807373047, + "learning_rate": 4.710184644943039e-05, + "loss": 0.687, + "step": 19670 + }, + { + "epoch": 0.17397761629448893, + "grad_norm": 6.136517524719238, + "learning_rate": 4.710037306175852e-05, + "loss": 0.7462, + "step": 19680 + }, + { + "epoch": 0.1740660195548012, + "grad_norm": 9.900436401367188, + "learning_rate": 4.709889967408665e-05, + "loss": 0.8872, + "step": 19690 + }, + { + "epoch": 0.17415442281511342, + "grad_norm": 3.0412752628326416, + "learning_rate": 4.7097426286414784e-05, + "loss": 0.7468, + "step": 19700 + }, + { + "epoch": 0.17424282607542566, + "grad_norm": 7.812278747558594, + "learning_rate": 4.7095952898742906e-05, + "loss": 0.7556, + "step": 19710 + }, + { + "epoch": 0.17433122933573791, + "grad_norm": 4.977499485015869, + "learning_rate": 4.709447951107104e-05, + "loss": 0.9045, + "step": 19720 + }, + { + "epoch": 0.17441963259605014, + "grad_norm": 7.382686138153076, + "learning_rate": 4.709300612339917e-05, + "loss": 0.9253, + "step": 19730 + }, + { + "epoch": 0.17450803585636238, + "grad_norm": 2.005552291870117, + "learning_rate": 4.70915327357273e-05, + "loss": 0.7806, + "step": 19740 + }, + { + "epoch": 0.17459643911667463, + "grad_norm": 5.624714374542236, + "learning_rate": 4.7090059348055426e-05, + "loss": 0.7612, + "step": 19750 + }, + { + "epoch": 0.17468484237698687, + "grad_norm": 5.911917209625244, + "learning_rate": 4.7088585960383554e-05, + "loss": 0.8858, + "step": 19760 + }, + { + "epoch": 0.1747732456372991, + "grad_norm": 4.969256401062012, + "learning_rate": 4.708711257271168e-05, + "loss": 0.8302, + "step": 19770 + }, + { + "epoch": 0.17486164889761135, + "grad_norm": 6.530728816986084, + "learning_rate": 4.708563918503982e-05, + "loss": 0.7979, + "step": 19780 + }, + { + "epoch": 0.17495005215792359, + "grad_norm": 2.1959164142608643, + "learning_rate": 4.708416579736794e-05, + "loss": 0.8566, + "step": 19790 + }, + { + "epoch": 0.17503845541823582, + "grad_norm": 5.137004852294922, + "learning_rate": 4.7082692409696074e-05, + "loss": 0.7901, + "step": 19800 + }, + { + "epoch": 0.17512685867854808, + "grad_norm": 2.6731183528900146, + "learning_rate": 4.70812190220242e-05, + "loss": 0.7823, + "step": 19810 + }, + { + "epoch": 0.1752152619388603, + "grad_norm": 4.632927417755127, + "learning_rate": 4.707974563435233e-05, + "loss": 0.765, + "step": 19820 + }, + { + "epoch": 0.17530366519917254, + "grad_norm": 8.491394996643066, + "learning_rate": 4.707827224668046e-05, + "loss": 0.8173, + "step": 19830 + }, + { + "epoch": 0.1753920684594848, + "grad_norm": 2.1966021060943604, + "learning_rate": 4.7076798859008595e-05, + "loss": 0.8613, + "step": 19840 + }, + { + "epoch": 0.17548047171979703, + "grad_norm": 6.492778778076172, + "learning_rate": 4.7075325471336716e-05, + "loss": 0.8378, + "step": 19850 + }, + { + "epoch": 0.17556887498010926, + "grad_norm": 1.9610486030578613, + "learning_rate": 4.707385208366485e-05, + "loss": 0.7533, + "step": 19860 + }, + { + "epoch": 0.17565727824042152, + "grad_norm": 4.732749938964844, + "learning_rate": 4.707237869599297e-05, + "loss": 0.8484, + "step": 19870 + }, + { + "epoch": 0.17574568150073375, + "grad_norm": 4.292774677276611, + "learning_rate": 4.707090530832111e-05, + "loss": 0.8442, + "step": 19880 + }, + { + "epoch": 0.17583408476104598, + "grad_norm": 3.8512213230133057, + "learning_rate": 4.7069431920649236e-05, + "loss": 0.7514, + "step": 19890 + }, + { + "epoch": 0.17592248802135824, + "grad_norm": 5.678794860839844, + "learning_rate": 4.7067958532977365e-05, + "loss": 0.8178, + "step": 19900 + }, + { + "epoch": 0.17601089128167047, + "grad_norm": 6.57100772857666, + "learning_rate": 4.706648514530549e-05, + "loss": 0.7885, + "step": 19910 + }, + { + "epoch": 0.1760992945419827, + "grad_norm": 8.180868148803711, + "learning_rate": 4.706501175763363e-05, + "loss": 0.7123, + "step": 19920 + }, + { + "epoch": 0.17618769780229496, + "grad_norm": 4.43673849105835, + "learning_rate": 4.706353836996175e-05, + "loss": 0.8278, + "step": 19930 + }, + { + "epoch": 0.1762761010626072, + "grad_norm": 5.025807857513428, + "learning_rate": 4.7062064982289885e-05, + "loss": 0.7896, + "step": 19940 + }, + { + "epoch": 0.17636450432291942, + "grad_norm": 6.425665378570557, + "learning_rate": 4.706059159461801e-05, + "loss": 0.8596, + "step": 19950 + }, + { + "epoch": 0.17645290758323168, + "grad_norm": 5.0455241203308105, + "learning_rate": 4.705911820694614e-05, + "loss": 0.7195, + "step": 19960 + }, + { + "epoch": 0.1765413108435439, + "grad_norm": 5.214970111846924, + "learning_rate": 4.705764481927427e-05, + "loss": 0.7317, + "step": 19970 + }, + { + "epoch": 0.17662971410385614, + "grad_norm": 5.591923713684082, + "learning_rate": 4.70561714316024e-05, + "loss": 0.8803, + "step": 19980 + }, + { + "epoch": 0.1767181173641684, + "grad_norm": 6.58145809173584, + "learning_rate": 4.705469804393053e-05, + "loss": 0.8799, + "step": 19990 + }, + { + "epoch": 0.17680652062448063, + "grad_norm": 6.532370567321777, + "learning_rate": 4.705322465625866e-05, + "loss": 0.8206, + "step": 20000 + }, + { + "epoch": 0.17689492388479286, + "grad_norm": 7.6214518547058105, + "learning_rate": 4.7051751268586783e-05, + "loss": 0.7727, + "step": 20010 + }, + { + "epoch": 0.17698332714510512, + "grad_norm": 3.491711139678955, + "learning_rate": 4.705027788091492e-05, + "loss": 0.6936, + "step": 20020 + }, + { + "epoch": 0.17707173040541735, + "grad_norm": 2.3454463481903076, + "learning_rate": 4.704880449324305e-05, + "loss": 0.7526, + "step": 20030 + }, + { + "epoch": 0.17716013366572958, + "grad_norm": 2.3436224460601807, + "learning_rate": 4.7047331105571175e-05, + "loss": 0.7448, + "step": 20040 + }, + { + "epoch": 0.17724853692604184, + "grad_norm": 4.434281349182129, + "learning_rate": 4.7045857717899304e-05, + "loss": 0.7042, + "step": 20050 + }, + { + "epoch": 0.17733694018635407, + "grad_norm": 3.948333501815796, + "learning_rate": 4.704438433022744e-05, + "loss": 0.7103, + "step": 20060 + }, + { + "epoch": 0.1774253434466663, + "grad_norm": 4.56276798248291, + "learning_rate": 4.704291094255556e-05, + "loss": 0.9007, + "step": 20070 + }, + { + "epoch": 0.17751374670697856, + "grad_norm": 3.5347900390625, + "learning_rate": 4.7041437554883695e-05, + "loss": 0.6617, + "step": 20080 + }, + { + "epoch": 0.1776021499672908, + "grad_norm": 3.578092575073242, + "learning_rate": 4.703996416721182e-05, + "loss": 0.7879, + "step": 20090 + }, + { + "epoch": 0.17769055322760302, + "grad_norm": 5.835831165313721, + "learning_rate": 4.703849077953995e-05, + "loss": 0.8649, + "step": 20100 + }, + { + "epoch": 0.17777895648791528, + "grad_norm": 3.1239800453186035, + "learning_rate": 4.703701739186808e-05, + "loss": 0.6506, + "step": 20110 + }, + { + "epoch": 0.1778673597482275, + "grad_norm": 10.112980842590332, + "learning_rate": 4.703554400419621e-05, + "loss": 0.7838, + "step": 20120 + }, + { + "epoch": 0.17795576300853974, + "grad_norm": 4.310737133026123, + "learning_rate": 4.703407061652434e-05, + "loss": 0.8217, + "step": 20130 + }, + { + "epoch": 0.178044166268852, + "grad_norm": 2.8160488605499268, + "learning_rate": 4.703259722885247e-05, + "loss": 0.7248, + "step": 20140 + }, + { + "epoch": 0.17813256952916423, + "grad_norm": 13.706145286560059, + "learning_rate": 4.7031123841180594e-05, + "loss": 1.007, + "step": 20150 + }, + { + "epoch": 0.1782209727894765, + "grad_norm": 8.242176055908203, + "learning_rate": 4.702965045350873e-05, + "loss": 0.7377, + "step": 20160 + }, + { + "epoch": 0.17830937604978872, + "grad_norm": 4.9279046058654785, + "learning_rate": 4.702817706583686e-05, + "loss": 0.8589, + "step": 20170 + }, + { + "epoch": 0.17839777931010095, + "grad_norm": 4.710366725921631, + "learning_rate": 4.7026703678164986e-05, + "loss": 0.7259, + "step": 20180 + }, + { + "epoch": 0.1784861825704132, + "grad_norm": 2.227062225341797, + "learning_rate": 4.7025230290493114e-05, + "loss": 0.7803, + "step": 20190 + }, + { + "epoch": 0.17857458583072544, + "grad_norm": 8.307478904724121, + "learning_rate": 4.702375690282125e-05, + "loss": 0.8592, + "step": 20200 + }, + { + "epoch": 0.17866298909103767, + "grad_norm": 3.467332363128662, + "learning_rate": 4.702228351514937e-05, + "loss": 0.6721, + "step": 20210 + }, + { + "epoch": 0.17875139235134993, + "grad_norm": 4.272871494293213, + "learning_rate": 4.7020810127477506e-05, + "loss": 0.6416, + "step": 20220 + }, + { + "epoch": 0.17883979561166216, + "grad_norm": 6.130920886993408, + "learning_rate": 4.701933673980563e-05, + "loss": 0.7404, + "step": 20230 + }, + { + "epoch": 0.1789281988719744, + "grad_norm": 3.2900898456573486, + "learning_rate": 4.701786335213376e-05, + "loss": 0.7589, + "step": 20240 + }, + { + "epoch": 0.17901660213228665, + "grad_norm": 6.0761542320251465, + "learning_rate": 4.701638996446189e-05, + "loss": 0.7381, + "step": 20250 + }, + { + "epoch": 0.17910500539259888, + "grad_norm": 6.198101997375488, + "learning_rate": 4.701491657679002e-05, + "loss": 0.8172, + "step": 20260 + }, + { + "epoch": 0.1791934086529111, + "grad_norm": 5.321105480194092, + "learning_rate": 4.701344318911815e-05, + "loss": 0.8677, + "step": 20270 + }, + { + "epoch": 0.17928181191322337, + "grad_norm": 7.723829746246338, + "learning_rate": 4.701196980144628e-05, + "loss": 0.8929, + "step": 20280 + }, + { + "epoch": 0.1793702151735356, + "grad_norm": 3.7290022373199463, + "learning_rate": 4.7010496413774404e-05, + "loss": 0.7007, + "step": 20290 + }, + { + "epoch": 0.17945861843384783, + "grad_norm": 9.33781623840332, + "learning_rate": 4.700902302610254e-05, + "loss": 0.8504, + "step": 20300 + }, + { + "epoch": 0.1795470216941601, + "grad_norm": 4.633574485778809, + "learning_rate": 4.700754963843067e-05, + "loss": 0.9296, + "step": 20310 + }, + { + "epoch": 0.17963542495447232, + "grad_norm": 4.936581611633301, + "learning_rate": 4.7006076250758796e-05, + "loss": 0.795, + "step": 20320 + }, + { + "epoch": 0.17972382821478455, + "grad_norm": 7.54608154296875, + "learning_rate": 4.7004602863086925e-05, + "loss": 0.7732, + "step": 20330 + }, + { + "epoch": 0.1798122314750968, + "grad_norm": 4.504227161407471, + "learning_rate": 4.700312947541505e-05, + "loss": 0.8954, + "step": 20340 + }, + { + "epoch": 0.17990063473540904, + "grad_norm": 5.178501605987549, + "learning_rate": 4.700165608774318e-05, + "loss": 0.7758, + "step": 20350 + }, + { + "epoch": 0.17998903799572127, + "grad_norm": 3.549858331680298, + "learning_rate": 4.7000182700071317e-05, + "loss": 0.6726, + "step": 20360 + }, + { + "epoch": 0.18007744125603353, + "grad_norm": 6.557262420654297, + "learning_rate": 4.6998709312399445e-05, + "loss": 0.8728, + "step": 20370 + }, + { + "epoch": 0.18016584451634576, + "grad_norm": 5.670749664306641, + "learning_rate": 4.699723592472757e-05, + "loss": 0.7527, + "step": 20380 + }, + { + "epoch": 0.180254247776658, + "grad_norm": 6.3349833488464355, + "learning_rate": 4.69957625370557e-05, + "loss": 0.8495, + "step": 20390 + }, + { + "epoch": 0.18034265103697025, + "grad_norm": 3.4349277019500732, + "learning_rate": 4.699428914938383e-05, + "loss": 0.7331, + "step": 20400 + }, + { + "epoch": 0.18043105429728248, + "grad_norm": 7.19025182723999, + "learning_rate": 4.699281576171196e-05, + "loss": 0.7024, + "step": 20410 + }, + { + "epoch": 0.18051945755759471, + "grad_norm": 2.892963409423828, + "learning_rate": 4.6991342374040093e-05, + "loss": 0.862, + "step": 20420 + }, + { + "epoch": 0.18060786081790697, + "grad_norm": 6.494869232177734, + "learning_rate": 4.698986898636822e-05, + "loss": 0.7154, + "step": 20430 + }, + { + "epoch": 0.1806962640782192, + "grad_norm": 4.208085536956787, + "learning_rate": 4.698839559869635e-05, + "loss": 0.8957, + "step": 20440 + }, + { + "epoch": 0.18078466733853144, + "grad_norm": 9.377950668334961, + "learning_rate": 4.698692221102448e-05, + "loss": 0.8051, + "step": 20450 + }, + { + "epoch": 0.1808730705988437, + "grad_norm": 3.125303268432617, + "learning_rate": 4.698544882335261e-05, + "loss": 0.8558, + "step": 20460 + }, + { + "epoch": 0.18096147385915592, + "grad_norm": 1.9119974374771118, + "learning_rate": 4.6983975435680735e-05, + "loss": 0.8276, + "step": 20470 + }, + { + "epoch": 0.18104987711946816, + "grad_norm": 6.218015670776367, + "learning_rate": 4.6982502048008864e-05, + "loss": 0.8171, + "step": 20480 + }, + { + "epoch": 0.18113828037978041, + "grad_norm": 6.455350875854492, + "learning_rate": 4.6981028660337e-05, + "loss": 0.833, + "step": 20490 + }, + { + "epoch": 0.18122668364009265, + "grad_norm": 8.490579605102539, + "learning_rate": 4.697955527266513e-05, + "loss": 0.8084, + "step": 20500 + }, + { + "epoch": 0.18131508690040488, + "grad_norm": 9.880264282226562, + "learning_rate": 4.6978081884993255e-05, + "loss": 0.7329, + "step": 20510 + }, + { + "epoch": 0.18140349016071713, + "grad_norm": 4.4080681800842285, + "learning_rate": 4.6976608497321384e-05, + "loss": 0.8693, + "step": 20520 + }, + { + "epoch": 0.18149189342102937, + "grad_norm": 6.001032829284668, + "learning_rate": 4.697513510964951e-05, + "loss": 0.6817, + "step": 20530 + }, + { + "epoch": 0.1815802966813416, + "grad_norm": 10.893025398254395, + "learning_rate": 4.697366172197764e-05, + "loss": 0.7788, + "step": 20540 + }, + { + "epoch": 0.18166869994165386, + "grad_norm": 6.218920707702637, + "learning_rate": 4.6972188334305776e-05, + "loss": 0.7104, + "step": 20550 + }, + { + "epoch": 0.18175710320196609, + "grad_norm": 3.348966598510742, + "learning_rate": 4.69707149466339e-05, + "loss": 0.8124, + "step": 20560 + }, + { + "epoch": 0.18184550646227832, + "grad_norm": 4.686115741729736, + "learning_rate": 4.696924155896203e-05, + "loss": 0.7374, + "step": 20570 + }, + { + "epoch": 0.18193390972259058, + "grad_norm": 5.5031609535217285, + "learning_rate": 4.696776817129016e-05, + "loss": 0.8236, + "step": 20580 + }, + { + "epoch": 0.1820223129829028, + "grad_norm": 7.1041975021362305, + "learning_rate": 4.696629478361829e-05, + "loss": 0.7116, + "step": 20590 + }, + { + "epoch": 0.18211071624321504, + "grad_norm": 3.8152554035186768, + "learning_rate": 4.696482139594642e-05, + "loss": 0.7432, + "step": 20600 + }, + { + "epoch": 0.1821991195035273, + "grad_norm": 2.5621113777160645, + "learning_rate": 4.696334800827455e-05, + "loss": 0.7318, + "step": 20610 + }, + { + "epoch": 0.18228752276383953, + "grad_norm": 11.487798690795898, + "learning_rate": 4.6961874620602674e-05, + "loss": 0.8046, + "step": 20620 + }, + { + "epoch": 0.18237592602415176, + "grad_norm": 6.597385406494141, + "learning_rate": 4.696040123293081e-05, + "loss": 0.7962, + "step": 20630 + }, + { + "epoch": 0.18246432928446402, + "grad_norm": 5.777371883392334, + "learning_rate": 4.695892784525894e-05, + "loss": 0.8109, + "step": 20640 + }, + { + "epoch": 0.18255273254477625, + "grad_norm": 4.814777851104736, + "learning_rate": 4.6957454457587066e-05, + "loss": 0.766, + "step": 20650 + }, + { + "epoch": 0.18264113580508848, + "grad_norm": 9.146038055419922, + "learning_rate": 4.6955981069915194e-05, + "loss": 0.8558, + "step": 20660 + }, + { + "epoch": 0.18272953906540074, + "grad_norm": 11.374016761779785, + "learning_rate": 4.695450768224333e-05, + "loss": 0.7406, + "step": 20670 + }, + { + "epoch": 0.18281794232571297, + "grad_norm": 2.69401478767395, + "learning_rate": 4.695303429457145e-05, + "loss": 0.7204, + "step": 20680 + }, + { + "epoch": 0.18290634558602523, + "grad_norm": 6.359562397003174, + "learning_rate": 4.6951560906899586e-05, + "loss": 0.7661, + "step": 20690 + }, + { + "epoch": 0.18299474884633746, + "grad_norm": 2.5801827907562256, + "learning_rate": 4.695008751922771e-05, + "loss": 0.704, + "step": 20700 + }, + { + "epoch": 0.1830831521066497, + "grad_norm": 2.6450891494750977, + "learning_rate": 4.694861413155584e-05, + "loss": 0.8052, + "step": 20710 + }, + { + "epoch": 0.18317155536696195, + "grad_norm": 3.069169282913208, + "learning_rate": 4.694714074388397e-05, + "loss": 0.8935, + "step": 20720 + }, + { + "epoch": 0.18325995862727418, + "grad_norm": 2.230119228363037, + "learning_rate": 4.69456673562121e-05, + "loss": 0.8753, + "step": 20730 + }, + { + "epoch": 0.1833483618875864, + "grad_norm": 6.34372615814209, + "learning_rate": 4.694419396854023e-05, + "loss": 0.6104, + "step": 20740 + }, + { + "epoch": 0.18343676514789867, + "grad_norm": 11.162723541259766, + "learning_rate": 4.694272058086836e-05, + "loss": 0.856, + "step": 20750 + }, + { + "epoch": 0.1835251684082109, + "grad_norm": 5.337286949157715, + "learning_rate": 4.6941247193196485e-05, + "loss": 0.7558, + "step": 20760 + }, + { + "epoch": 0.18361357166852313, + "grad_norm": 2.351268768310547, + "learning_rate": 4.693977380552462e-05, + "loss": 0.7964, + "step": 20770 + }, + { + "epoch": 0.1837019749288354, + "grad_norm": 11.143387794494629, + "learning_rate": 4.693830041785275e-05, + "loss": 0.7412, + "step": 20780 + }, + { + "epoch": 0.18379037818914762, + "grad_norm": 10.03717041015625, + "learning_rate": 4.6936827030180876e-05, + "loss": 0.7689, + "step": 20790 + }, + { + "epoch": 0.18387878144945985, + "grad_norm": 2.3697686195373535, + "learning_rate": 4.6935353642509005e-05, + "loss": 0.7574, + "step": 20800 + }, + { + "epoch": 0.1839671847097721, + "grad_norm": 4.796818733215332, + "learning_rate": 4.693388025483713e-05, + "loss": 0.7846, + "step": 20810 + }, + { + "epoch": 0.18405558797008434, + "grad_norm": 8.595834732055664, + "learning_rate": 4.693240686716526e-05, + "loss": 0.7348, + "step": 20820 + }, + { + "epoch": 0.18414399123039657, + "grad_norm": 5.621718883514404, + "learning_rate": 4.6930933479493397e-05, + "loss": 0.8067, + "step": 20830 + }, + { + "epoch": 0.18423239449070883, + "grad_norm": 5.930288791656494, + "learning_rate": 4.692946009182152e-05, + "loss": 0.788, + "step": 20840 + }, + { + "epoch": 0.18432079775102106, + "grad_norm": 3.473284959793091, + "learning_rate": 4.692798670414965e-05, + "loss": 0.8705, + "step": 20850 + }, + { + "epoch": 0.1844092010113333, + "grad_norm": 2.478281259536743, + "learning_rate": 4.692651331647778e-05, + "loss": 0.8396, + "step": 20860 + }, + { + "epoch": 0.18449760427164555, + "grad_norm": 8.749351501464844, + "learning_rate": 4.692503992880591e-05, + "loss": 0.9392, + "step": 20870 + }, + { + "epoch": 0.18458600753195778, + "grad_norm": 8.984127044677734, + "learning_rate": 4.692356654113404e-05, + "loss": 0.7349, + "step": 20880 + }, + { + "epoch": 0.18467441079227, + "grad_norm": 5.528654098510742, + "learning_rate": 4.6922093153462173e-05, + "loss": 0.872, + "step": 20890 + }, + { + "epoch": 0.18476281405258227, + "grad_norm": 6.0602898597717285, + "learning_rate": 4.6920619765790295e-05, + "loss": 0.6958, + "step": 20900 + }, + { + "epoch": 0.1848512173128945, + "grad_norm": 3.6754910945892334, + "learning_rate": 4.691914637811843e-05, + "loss": 0.7429, + "step": 20910 + }, + { + "epoch": 0.18493962057320673, + "grad_norm": 2.2601325511932373, + "learning_rate": 4.691767299044655e-05, + "loss": 0.6988, + "step": 20920 + }, + { + "epoch": 0.185028023833519, + "grad_norm": 5.051455020904541, + "learning_rate": 4.691619960277469e-05, + "loss": 0.7465, + "step": 20930 + }, + { + "epoch": 0.18511642709383122, + "grad_norm": 11.01762580871582, + "learning_rate": 4.6914726215102815e-05, + "loss": 0.9008, + "step": 20940 + }, + { + "epoch": 0.18520483035414345, + "grad_norm": 5.050708770751953, + "learning_rate": 4.6913252827430944e-05, + "loss": 0.8705, + "step": 20950 + }, + { + "epoch": 0.1852932336144557, + "grad_norm": 10.381126403808594, + "learning_rate": 4.691177943975907e-05, + "loss": 0.781, + "step": 20960 + }, + { + "epoch": 0.18538163687476794, + "grad_norm": 10.907230377197266, + "learning_rate": 4.691030605208721e-05, + "loss": 0.8314, + "step": 20970 + }, + { + "epoch": 0.18547004013508017, + "grad_norm": 1.5914275646209717, + "learning_rate": 4.690883266441533e-05, + "loss": 0.9625, + "step": 20980 + }, + { + "epoch": 0.18555844339539243, + "grad_norm": 6.222545146942139, + "learning_rate": 4.6907359276743464e-05, + "loss": 0.8171, + "step": 20990 + }, + { + "epoch": 0.18564684665570466, + "grad_norm": 7.4232258796691895, + "learning_rate": 4.690588588907159e-05, + "loss": 0.8049, + "step": 21000 + }, + { + "epoch": 0.1857352499160169, + "grad_norm": 4.576233863830566, + "learning_rate": 4.690441250139972e-05, + "loss": 0.8539, + "step": 21010 + }, + { + "epoch": 0.18582365317632915, + "grad_norm": 2.3963844776153564, + "learning_rate": 4.690293911372785e-05, + "loss": 0.7495, + "step": 21020 + }, + { + "epoch": 0.18591205643664138, + "grad_norm": 6.205860614776611, + "learning_rate": 4.690146572605598e-05, + "loss": 0.7821, + "step": 21030 + }, + { + "epoch": 0.1860004596969536, + "grad_norm": 2.3440845012664795, + "learning_rate": 4.6899992338384106e-05, + "loss": 0.7759, + "step": 21040 + }, + { + "epoch": 0.18608886295726587, + "grad_norm": 2.135939121246338, + "learning_rate": 4.689851895071224e-05, + "loss": 1.0169, + "step": 21050 + }, + { + "epoch": 0.1861772662175781, + "grad_norm": 4.532798767089844, + "learning_rate": 4.689704556304036e-05, + "loss": 0.8234, + "step": 21060 + }, + { + "epoch": 0.18626566947789033, + "grad_norm": 2.2755768299102783, + "learning_rate": 4.68955721753685e-05, + "loss": 0.8013, + "step": 21070 + }, + { + "epoch": 0.1863540727382026, + "grad_norm": 6.704551696777344, + "learning_rate": 4.6894098787696626e-05, + "loss": 0.9893, + "step": 21080 + }, + { + "epoch": 0.18644247599851482, + "grad_norm": 5.669991970062256, + "learning_rate": 4.6892625400024754e-05, + "loss": 0.7986, + "step": 21090 + }, + { + "epoch": 0.18653087925882705, + "grad_norm": 5.91778039932251, + "learning_rate": 4.689115201235288e-05, + "loss": 0.7941, + "step": 21100 + }, + { + "epoch": 0.1866192825191393, + "grad_norm": 2.228177309036255, + "learning_rate": 4.688967862468102e-05, + "loss": 0.841, + "step": 21110 + }, + { + "epoch": 0.18670768577945154, + "grad_norm": 9.556169509887695, + "learning_rate": 4.688820523700914e-05, + "loss": 0.9429, + "step": 21120 + }, + { + "epoch": 0.18679608903976377, + "grad_norm": 3.492906093597412, + "learning_rate": 4.6886731849337274e-05, + "loss": 0.876, + "step": 21130 + }, + { + "epoch": 0.18688449230007603, + "grad_norm": 6.84982442855835, + "learning_rate": 4.68852584616654e-05, + "loss": 0.7982, + "step": 21140 + }, + { + "epoch": 0.18697289556038826, + "grad_norm": 4.788298606872559, + "learning_rate": 4.688378507399353e-05, + "loss": 0.8319, + "step": 21150 + }, + { + "epoch": 0.1870612988207005, + "grad_norm": 7.109243392944336, + "learning_rate": 4.688231168632166e-05, + "loss": 0.8333, + "step": 21160 + }, + { + "epoch": 0.18714970208101275, + "grad_norm": 3.8994028568267822, + "learning_rate": 4.688083829864979e-05, + "loss": 0.8284, + "step": 21170 + }, + { + "epoch": 0.18723810534132498, + "grad_norm": 4.735000133514404, + "learning_rate": 4.6879364910977916e-05, + "loss": 0.7821, + "step": 21180 + }, + { + "epoch": 0.18732650860163721, + "grad_norm": 2.890619993209839, + "learning_rate": 4.687789152330605e-05, + "loss": 0.7143, + "step": 21190 + }, + { + "epoch": 0.18741491186194947, + "grad_norm": 2.096660852432251, + "learning_rate": 4.687641813563417e-05, + "loss": 0.7704, + "step": 21200 + }, + { + "epoch": 0.1875033151222617, + "grad_norm": 5.878719806671143, + "learning_rate": 4.687494474796231e-05, + "loss": 0.8685, + "step": 21210 + }, + { + "epoch": 0.18759171838257396, + "grad_norm": 10.67667293548584, + "learning_rate": 4.6873471360290436e-05, + "loss": 0.772, + "step": 21220 + }, + { + "epoch": 0.1876801216428862, + "grad_norm": 3.8728549480438232, + "learning_rate": 4.6871997972618565e-05, + "loss": 0.6603, + "step": 21230 + }, + { + "epoch": 0.18776852490319842, + "grad_norm": 3.7111921310424805, + "learning_rate": 4.687052458494669e-05, + "loss": 0.8378, + "step": 21240 + }, + { + "epoch": 0.18785692816351068, + "grad_norm": 6.716635704040527, + "learning_rate": 4.686905119727483e-05, + "loss": 0.7352, + "step": 21250 + }, + { + "epoch": 0.18794533142382291, + "grad_norm": 6.060492038726807, + "learning_rate": 4.686757780960295e-05, + "loss": 0.7173, + "step": 21260 + }, + { + "epoch": 0.18803373468413515, + "grad_norm": 4.747287273406982, + "learning_rate": 4.6866104421931085e-05, + "loss": 0.8773, + "step": 21270 + }, + { + "epoch": 0.1881221379444474, + "grad_norm": 6.806166648864746, + "learning_rate": 4.686463103425921e-05, + "loss": 0.7035, + "step": 21280 + }, + { + "epoch": 0.18821054120475963, + "grad_norm": 2.809680938720703, + "learning_rate": 4.686315764658734e-05, + "loss": 0.6529, + "step": 21290 + }, + { + "epoch": 0.18829894446507187, + "grad_norm": 4.832018852233887, + "learning_rate": 4.686168425891547e-05, + "loss": 0.8729, + "step": 21300 + }, + { + "epoch": 0.18838734772538412, + "grad_norm": 8.115225791931152, + "learning_rate": 4.68602108712436e-05, + "loss": 0.8716, + "step": 21310 + }, + { + "epoch": 0.18847575098569636, + "grad_norm": 4.351589679718018, + "learning_rate": 4.6858737483571727e-05, + "loss": 0.8144, + "step": 21320 + }, + { + "epoch": 0.1885641542460086, + "grad_norm": 5.018931865692139, + "learning_rate": 4.685726409589986e-05, + "loss": 0.7823, + "step": 21330 + }, + { + "epoch": 0.18865255750632084, + "grad_norm": 3.043612003326416, + "learning_rate": 4.685579070822799e-05, + "loss": 0.7336, + "step": 21340 + }, + { + "epoch": 0.18874096076663308, + "grad_norm": 7.226938247680664, + "learning_rate": 4.685431732055612e-05, + "loss": 0.8222, + "step": 21350 + }, + { + "epoch": 0.1888293640269453, + "grad_norm": 5.591948509216309, + "learning_rate": 4.685284393288425e-05, + "loss": 0.8815, + "step": 21360 + }, + { + "epoch": 0.18891776728725757, + "grad_norm": 3.7986953258514404, + "learning_rate": 4.6851370545212375e-05, + "loss": 0.8976, + "step": 21370 + }, + { + "epoch": 0.1890061705475698, + "grad_norm": 6.667764186859131, + "learning_rate": 4.6849897157540503e-05, + "loss": 0.7915, + "step": 21380 + }, + { + "epoch": 0.18909457380788203, + "grad_norm": 3.3211944103240967, + "learning_rate": 4.684842376986863e-05, + "loss": 0.7493, + "step": 21390 + }, + { + "epoch": 0.18918297706819429, + "grad_norm": 8.289932250976562, + "learning_rate": 4.684695038219677e-05, + "loss": 0.7112, + "step": 21400 + }, + { + "epoch": 0.18927138032850652, + "grad_norm": 8.218725204467773, + "learning_rate": 4.6845476994524895e-05, + "loss": 0.7393, + "step": 21410 + }, + { + "epoch": 0.18935978358881875, + "grad_norm": 3.564985990524292, + "learning_rate": 4.6844003606853024e-05, + "loss": 0.8505, + "step": 21420 + }, + { + "epoch": 0.189448186849131, + "grad_norm": 3.3935065269470215, + "learning_rate": 4.684253021918115e-05, + "loss": 0.7197, + "step": 21430 + }, + { + "epoch": 0.18953659010944324, + "grad_norm": 7.465757369995117, + "learning_rate": 4.684105683150928e-05, + "loss": 0.6755, + "step": 21440 + }, + { + "epoch": 0.18962499336975547, + "grad_norm": 9.55097770690918, + "learning_rate": 4.683958344383741e-05, + "loss": 0.8269, + "step": 21450 + }, + { + "epoch": 0.18971339663006773, + "grad_norm": 11.654396057128906, + "learning_rate": 4.6838110056165544e-05, + "loss": 0.7776, + "step": 21460 + }, + { + "epoch": 0.18980179989037996, + "grad_norm": 3.557164192199707, + "learning_rate": 4.683663666849367e-05, + "loss": 0.8328, + "step": 21470 + }, + { + "epoch": 0.1898902031506922, + "grad_norm": 2.5593674182891846, + "learning_rate": 4.68351632808218e-05, + "loss": 0.7031, + "step": 21480 + }, + { + "epoch": 0.18997860641100445, + "grad_norm": 4.95280647277832, + "learning_rate": 4.683368989314993e-05, + "loss": 0.8917, + "step": 21490 + }, + { + "epoch": 0.19006700967131668, + "grad_norm": 6.238870620727539, + "learning_rate": 4.683221650547806e-05, + "loss": 0.8169, + "step": 21500 + }, + { + "epoch": 0.1901554129316289, + "grad_norm": 4.238143444061279, + "learning_rate": 4.6830743117806186e-05, + "loss": 0.6997, + "step": 21510 + }, + { + "epoch": 0.19024381619194117, + "grad_norm": 4.553244590759277, + "learning_rate": 4.682926973013432e-05, + "loss": 0.8714, + "step": 21520 + }, + { + "epoch": 0.1903322194522534, + "grad_norm": 4.513711929321289, + "learning_rate": 4.682779634246244e-05, + "loss": 0.6667, + "step": 21530 + }, + { + "epoch": 0.19042062271256563, + "grad_norm": 8.019719123840332, + "learning_rate": 4.682632295479058e-05, + "loss": 0.7857, + "step": 21540 + }, + { + "epoch": 0.1905090259728779, + "grad_norm": 12.806079864501953, + "learning_rate": 4.6824849567118706e-05, + "loss": 0.8822, + "step": 21550 + }, + { + "epoch": 0.19059742923319012, + "grad_norm": 5.0920538902282715, + "learning_rate": 4.6823376179446834e-05, + "loss": 0.7505, + "step": 21560 + }, + { + "epoch": 0.19068583249350235, + "grad_norm": 16.16451644897461, + "learning_rate": 4.682190279177496e-05, + "loss": 0.7657, + "step": 21570 + }, + { + "epoch": 0.1907742357538146, + "grad_norm": 2.1458187103271484, + "learning_rate": 4.68204294041031e-05, + "loss": 0.696, + "step": 21580 + }, + { + "epoch": 0.19086263901412684, + "grad_norm": 3.2431187629699707, + "learning_rate": 4.681895601643122e-05, + "loss": 0.9059, + "step": 21590 + }, + { + "epoch": 0.19095104227443907, + "grad_norm": 6.888151168823242, + "learning_rate": 4.6817482628759354e-05, + "loss": 0.8723, + "step": 21600 + }, + { + "epoch": 0.19103944553475133, + "grad_norm": 4.5483622550964355, + "learning_rate": 4.681600924108748e-05, + "loss": 0.7336, + "step": 21610 + }, + { + "epoch": 0.19112784879506356, + "grad_norm": 8.848033905029297, + "learning_rate": 4.681453585341561e-05, + "loss": 0.8398, + "step": 21620 + }, + { + "epoch": 0.1912162520553758, + "grad_norm": 2.365640640258789, + "learning_rate": 4.681306246574374e-05, + "loss": 0.7033, + "step": 21630 + }, + { + "epoch": 0.19130465531568805, + "grad_norm": 4.853655815124512, + "learning_rate": 4.681158907807187e-05, + "loss": 0.8226, + "step": 21640 + }, + { + "epoch": 0.19139305857600028, + "grad_norm": 7.536863803863525, + "learning_rate": 4.6810115690399996e-05, + "loss": 0.8291, + "step": 21650 + }, + { + "epoch": 0.1914814618363125, + "grad_norm": 5.142647743225098, + "learning_rate": 4.680864230272813e-05, + "loss": 0.7114, + "step": 21660 + }, + { + "epoch": 0.19156986509662477, + "grad_norm": 7.097962856292725, + "learning_rate": 4.680716891505625e-05, + "loss": 0.7708, + "step": 21670 + }, + { + "epoch": 0.191658268356937, + "grad_norm": 10.546960830688477, + "learning_rate": 4.680569552738439e-05, + "loss": 0.7818, + "step": 21680 + }, + { + "epoch": 0.19174667161724923, + "grad_norm": 5.610840797424316, + "learning_rate": 4.6804222139712516e-05, + "loss": 0.8427, + "step": 21690 + }, + { + "epoch": 0.1918350748775615, + "grad_norm": 4.584107398986816, + "learning_rate": 4.6802748752040645e-05, + "loss": 0.7315, + "step": 21700 + }, + { + "epoch": 0.19192347813787372, + "grad_norm": 5.1670660972595215, + "learning_rate": 4.680127536436877e-05, + "loss": 0.7063, + "step": 21710 + }, + { + "epoch": 0.19201188139818595, + "grad_norm": 4.768735885620117, + "learning_rate": 4.679980197669691e-05, + "loss": 0.6673, + "step": 21720 + }, + { + "epoch": 0.1921002846584982, + "grad_norm": 5.147122859954834, + "learning_rate": 4.679832858902503e-05, + "loss": 0.7206, + "step": 21730 + }, + { + "epoch": 0.19218868791881044, + "grad_norm": 4.420533180236816, + "learning_rate": 4.6796855201353165e-05, + "loss": 0.8599, + "step": 21740 + }, + { + "epoch": 0.19227709117912267, + "grad_norm": 4.13287878036499, + "learning_rate": 4.6795381813681286e-05, + "loss": 0.7373, + "step": 21750 + }, + { + "epoch": 0.19236549443943493, + "grad_norm": 9.321707725524902, + "learning_rate": 4.679390842600942e-05, + "loss": 0.8204, + "step": 21760 + }, + { + "epoch": 0.19245389769974716, + "grad_norm": 5.07951545715332, + "learning_rate": 4.679243503833755e-05, + "loss": 0.7072, + "step": 21770 + }, + { + "epoch": 0.19254230096005942, + "grad_norm": 2.755408763885498, + "learning_rate": 4.679096165066568e-05, + "loss": 0.7929, + "step": 21780 + }, + { + "epoch": 0.19263070422037165, + "grad_norm": 3.890803098678589, + "learning_rate": 4.678948826299381e-05, + "loss": 0.8066, + "step": 21790 + }, + { + "epoch": 0.19271910748068388, + "grad_norm": 14.816995620727539, + "learning_rate": 4.678801487532194e-05, + "loss": 0.7702, + "step": 21800 + }, + { + "epoch": 0.19280751074099614, + "grad_norm": 1.9433757066726685, + "learning_rate": 4.678654148765006e-05, + "loss": 0.7438, + "step": 21810 + }, + { + "epoch": 0.19289591400130837, + "grad_norm": 12.897647857666016, + "learning_rate": 4.67850680999782e-05, + "loss": 0.7654, + "step": 21820 + }, + { + "epoch": 0.1929843172616206, + "grad_norm": 5.284687519073486, + "learning_rate": 4.678359471230633e-05, + "loss": 0.8564, + "step": 21830 + }, + { + "epoch": 0.19307272052193286, + "grad_norm": 3.4131147861480713, + "learning_rate": 4.6782121324634455e-05, + "loss": 0.842, + "step": 21840 + }, + { + "epoch": 0.1931611237822451, + "grad_norm": 8.69863224029541, + "learning_rate": 4.6780647936962584e-05, + "loss": 0.9538, + "step": 21850 + }, + { + "epoch": 0.19324952704255732, + "grad_norm": 5.734713077545166, + "learning_rate": 4.677917454929071e-05, + "loss": 0.7811, + "step": 21860 + }, + { + "epoch": 0.19333793030286958, + "grad_norm": 3.2469704151153564, + "learning_rate": 4.677770116161884e-05, + "loss": 0.8141, + "step": 21870 + }, + { + "epoch": 0.1934263335631818, + "grad_norm": 6.072801113128662, + "learning_rate": 4.6776227773946975e-05, + "loss": 0.7981, + "step": 21880 + }, + { + "epoch": 0.19351473682349404, + "grad_norm": 4.3431291580200195, + "learning_rate": 4.67747543862751e-05, + "loss": 0.8232, + "step": 21890 + }, + { + "epoch": 0.1936031400838063, + "grad_norm": 2.9932267665863037, + "learning_rate": 4.677328099860323e-05, + "loss": 0.844, + "step": 21900 + }, + { + "epoch": 0.19369154334411853, + "grad_norm": 7.422665596008301, + "learning_rate": 4.677180761093136e-05, + "loss": 0.8118, + "step": 21910 + }, + { + "epoch": 0.19377994660443076, + "grad_norm": 2.4402995109558105, + "learning_rate": 4.677033422325949e-05, + "loss": 0.8435, + "step": 21920 + }, + { + "epoch": 0.19386834986474302, + "grad_norm": 11.59831714630127, + "learning_rate": 4.676886083558762e-05, + "loss": 0.7656, + "step": 21930 + }, + { + "epoch": 0.19395675312505525, + "grad_norm": 4.198363304138184, + "learning_rate": 4.676738744791575e-05, + "loss": 0.8767, + "step": 21940 + }, + { + "epoch": 0.19404515638536748, + "grad_norm": 3.5214431285858154, + "learning_rate": 4.6765914060243874e-05, + "loss": 0.7284, + "step": 21950 + }, + { + "epoch": 0.19413355964567974, + "grad_norm": 2.6636786460876465, + "learning_rate": 4.676444067257201e-05, + "loss": 0.994, + "step": 21960 + }, + { + "epoch": 0.19422196290599197, + "grad_norm": 3.3864927291870117, + "learning_rate": 4.676296728490014e-05, + "loss": 0.6833, + "step": 21970 + }, + { + "epoch": 0.1943103661663042, + "grad_norm": 2.2272789478302, + "learning_rate": 4.6761493897228266e-05, + "loss": 0.7847, + "step": 21980 + }, + { + "epoch": 0.19439876942661646, + "grad_norm": 2.8747472763061523, + "learning_rate": 4.6760020509556394e-05, + "loss": 0.8957, + "step": 21990 + }, + { + "epoch": 0.1944871726869287, + "grad_norm": 2.239976406097412, + "learning_rate": 4.675854712188452e-05, + "loss": 0.7382, + "step": 22000 + }, + { + "epoch": 0.19457557594724093, + "grad_norm": 4.4402618408203125, + "learning_rate": 4.675707373421265e-05, + "loss": 0.8839, + "step": 22010 + }, + { + "epoch": 0.19466397920755318, + "grad_norm": 4.302387237548828, + "learning_rate": 4.6755600346540786e-05, + "loss": 0.8253, + "step": 22020 + }, + { + "epoch": 0.19475238246786541, + "grad_norm": 3.6364643573760986, + "learning_rate": 4.675412695886891e-05, + "loss": 0.7194, + "step": 22030 + }, + { + "epoch": 0.19484078572817765, + "grad_norm": 2.026207447052002, + "learning_rate": 4.675265357119704e-05, + "loss": 0.7733, + "step": 22040 + }, + { + "epoch": 0.1949291889884899, + "grad_norm": 11.127307891845703, + "learning_rate": 4.675118018352517e-05, + "loss": 0.8729, + "step": 22050 + }, + { + "epoch": 0.19501759224880214, + "grad_norm": 4.305873870849609, + "learning_rate": 4.67497067958533e-05, + "loss": 0.7843, + "step": 22060 + }, + { + "epoch": 0.19510599550911437, + "grad_norm": 2.9550535678863525, + "learning_rate": 4.674823340818143e-05, + "loss": 0.854, + "step": 22070 + }, + { + "epoch": 0.19519439876942662, + "grad_norm": 9.263606071472168, + "learning_rate": 4.674676002050956e-05, + "loss": 0.8685, + "step": 22080 + }, + { + "epoch": 0.19528280202973886, + "grad_norm": 4.128292083740234, + "learning_rate": 4.6745286632837684e-05, + "loss": 0.7077, + "step": 22090 + }, + { + "epoch": 0.1953712052900511, + "grad_norm": 3.258774995803833, + "learning_rate": 4.674381324516582e-05, + "loss": 0.74, + "step": 22100 + }, + { + "epoch": 0.19545960855036335, + "grad_norm": 6.328365325927734, + "learning_rate": 4.674233985749394e-05, + "loss": 0.8824, + "step": 22110 + }, + { + "epoch": 0.19554801181067558, + "grad_norm": 8.921202659606934, + "learning_rate": 4.6740866469822076e-05, + "loss": 0.7058, + "step": 22120 + }, + { + "epoch": 0.1956364150709878, + "grad_norm": 12.509286880493164, + "learning_rate": 4.6739393082150205e-05, + "loss": 0.8928, + "step": 22130 + }, + { + "epoch": 0.19572481833130007, + "grad_norm": 6.750686168670654, + "learning_rate": 4.673791969447833e-05, + "loss": 0.832, + "step": 22140 + }, + { + "epoch": 0.1958132215916123, + "grad_norm": 2.3646960258483887, + "learning_rate": 4.673644630680646e-05, + "loss": 0.8489, + "step": 22150 + }, + { + "epoch": 0.19590162485192453, + "grad_norm": 4.463263511657715, + "learning_rate": 4.6734972919134596e-05, + "loss": 0.7064, + "step": 22160 + }, + { + "epoch": 0.19599002811223679, + "grad_norm": 3.6031270027160645, + "learning_rate": 4.673349953146272e-05, + "loss": 0.7205, + "step": 22170 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 5.079982280731201, + "learning_rate": 4.673202614379085e-05, + "loss": 0.8516, + "step": 22180 + }, + { + "epoch": 0.19616683463286125, + "grad_norm": 2.213585376739502, + "learning_rate": 4.673055275611898e-05, + "loss": 0.8975, + "step": 22190 + }, + { + "epoch": 0.1962552378931735, + "grad_norm": 8.803163528442383, + "learning_rate": 4.672907936844711e-05, + "loss": 0.8135, + "step": 22200 + }, + { + "epoch": 0.19634364115348574, + "grad_norm": 9.55571460723877, + "learning_rate": 4.672760598077524e-05, + "loss": 0.7677, + "step": 22210 + }, + { + "epoch": 0.19643204441379797, + "grad_norm": 2.8771579265594482, + "learning_rate": 4.6726132593103367e-05, + "loss": 0.6235, + "step": 22220 + }, + { + "epoch": 0.19652044767411023, + "grad_norm": 13.60547924041748, + "learning_rate": 4.6724659205431495e-05, + "loss": 0.8516, + "step": 22230 + }, + { + "epoch": 0.19660885093442246, + "grad_norm": 8.28115463256836, + "learning_rate": 4.672318581775963e-05, + "loss": 0.8508, + "step": 22240 + }, + { + "epoch": 0.1966972541947347, + "grad_norm": 4.614414691925049, + "learning_rate": 4.672171243008776e-05, + "loss": 0.9152, + "step": 22250 + }, + { + "epoch": 0.19678565745504695, + "grad_norm": 4.92991304397583, + "learning_rate": 4.672023904241589e-05, + "loss": 0.8689, + "step": 22260 + }, + { + "epoch": 0.19687406071535918, + "grad_norm": 6.292807579040527, + "learning_rate": 4.6718765654744015e-05, + "loss": 0.7708, + "step": 22270 + }, + { + "epoch": 0.1969624639756714, + "grad_norm": 2.8370020389556885, + "learning_rate": 4.6717292267072143e-05, + "loss": 0.7834, + "step": 22280 + }, + { + "epoch": 0.19705086723598367, + "grad_norm": 4.450292587280273, + "learning_rate": 4.671581887940027e-05, + "loss": 0.7851, + "step": 22290 + }, + { + "epoch": 0.1971392704962959, + "grad_norm": 9.224190711975098, + "learning_rate": 4.671434549172841e-05, + "loss": 0.7873, + "step": 22300 + }, + { + "epoch": 0.19722767375660816, + "grad_norm": 8.472943305969238, + "learning_rate": 4.6712872104056535e-05, + "loss": 0.7665, + "step": 22310 + }, + { + "epoch": 0.1973160770169204, + "grad_norm": 5.1440863609313965, + "learning_rate": 4.6711398716384664e-05, + "loss": 0.8139, + "step": 22320 + }, + { + "epoch": 0.19740448027723262, + "grad_norm": 7.636616230010986, + "learning_rate": 4.670992532871279e-05, + "loss": 0.805, + "step": 22330 + }, + { + "epoch": 0.19749288353754488, + "grad_norm": 7.643059730529785, + "learning_rate": 4.670845194104092e-05, + "loss": 0.8944, + "step": 22340 + }, + { + "epoch": 0.1975812867978571, + "grad_norm": 5.8303632736206055, + "learning_rate": 4.670697855336905e-05, + "loss": 0.7028, + "step": 22350 + }, + { + "epoch": 0.19766969005816934, + "grad_norm": 2.1993534564971924, + "learning_rate": 4.670550516569718e-05, + "loss": 0.7117, + "step": 22360 + }, + { + "epoch": 0.1977580933184816, + "grad_norm": 4.56084680557251, + "learning_rate": 4.670403177802531e-05, + "loss": 0.7634, + "step": 22370 + }, + { + "epoch": 0.19784649657879383, + "grad_norm": 9.242644309997559, + "learning_rate": 4.670255839035344e-05, + "loss": 0.8332, + "step": 22380 + }, + { + "epoch": 0.19793489983910606, + "grad_norm": 15.057245254516602, + "learning_rate": 4.670108500268157e-05, + "loss": 0.7764, + "step": 22390 + }, + { + "epoch": 0.19802330309941832, + "grad_norm": 3.096505880355835, + "learning_rate": 4.66996116150097e-05, + "loss": 0.8239, + "step": 22400 + }, + { + "epoch": 0.19811170635973055, + "grad_norm": 4.721726417541504, + "learning_rate": 4.6698138227337826e-05, + "loss": 0.7958, + "step": 22410 + }, + { + "epoch": 0.19820010962004278, + "grad_norm": 3.4196979999542236, + "learning_rate": 4.6696664839665954e-05, + "loss": 0.8045, + "step": 22420 + }, + { + "epoch": 0.19828851288035504, + "grad_norm": 6.442022800445557, + "learning_rate": 4.669519145199409e-05, + "loss": 0.7235, + "step": 22430 + }, + { + "epoch": 0.19837691614066727, + "grad_norm": 7.930601119995117, + "learning_rate": 4.669371806432222e-05, + "loss": 0.8965, + "step": 22440 + }, + { + "epoch": 0.1984653194009795, + "grad_norm": 1.932212233543396, + "learning_rate": 4.6692244676650346e-05, + "loss": 0.8522, + "step": 22450 + }, + { + "epoch": 0.19855372266129176, + "grad_norm": 2.316455841064453, + "learning_rate": 4.6690771288978474e-05, + "loss": 0.7646, + "step": 22460 + }, + { + "epoch": 0.198642125921604, + "grad_norm": 5.144872188568115, + "learning_rate": 4.66892979013066e-05, + "loss": 0.7287, + "step": 22470 + }, + { + "epoch": 0.19873052918191622, + "grad_norm": 5.668541431427002, + "learning_rate": 4.668782451363473e-05, + "loss": 0.6577, + "step": 22480 + }, + { + "epoch": 0.19881893244222848, + "grad_norm": 6.585099697113037, + "learning_rate": 4.6686351125962866e-05, + "loss": 0.7031, + "step": 22490 + }, + { + "epoch": 0.1989073357025407, + "grad_norm": 5.952176094055176, + "learning_rate": 4.668487773829099e-05, + "loss": 0.7519, + "step": 22500 + }, + { + "epoch": 0.19899573896285294, + "grad_norm": 10.434608459472656, + "learning_rate": 4.668340435061912e-05, + "loss": 0.7314, + "step": 22510 + }, + { + "epoch": 0.1990841422231652, + "grad_norm": 6.423018932342529, + "learning_rate": 4.668193096294725e-05, + "loss": 0.7372, + "step": 22520 + }, + { + "epoch": 0.19917254548347743, + "grad_norm": 3.842513084411621, + "learning_rate": 4.668045757527538e-05, + "loss": 0.8626, + "step": 22530 + }, + { + "epoch": 0.19926094874378966, + "grad_norm": 4.58589506149292, + "learning_rate": 4.667898418760351e-05, + "loss": 0.9298, + "step": 22540 + }, + { + "epoch": 0.19934935200410192, + "grad_norm": 3.9880809783935547, + "learning_rate": 4.667751079993164e-05, + "loss": 0.7535, + "step": 22550 + }, + { + "epoch": 0.19943775526441415, + "grad_norm": 3.5416860580444336, + "learning_rate": 4.6676037412259764e-05, + "loss": 0.7448, + "step": 22560 + }, + { + "epoch": 0.19952615852472638, + "grad_norm": 6.561315059661865, + "learning_rate": 4.66745640245879e-05, + "loss": 0.9248, + "step": 22570 + }, + { + "epoch": 0.19961456178503864, + "grad_norm": 8.85240650177002, + "learning_rate": 4.667309063691602e-05, + "loss": 0.6894, + "step": 22580 + }, + { + "epoch": 0.19970296504535087, + "grad_norm": 4.638481140136719, + "learning_rate": 4.6671617249244156e-05, + "loss": 0.6975, + "step": 22590 + }, + { + "epoch": 0.1997913683056631, + "grad_norm": 9.74061107635498, + "learning_rate": 4.6670143861572285e-05, + "loss": 0.9096, + "step": 22600 + }, + { + "epoch": 0.19987977156597536, + "grad_norm": 7.308407783508301, + "learning_rate": 4.666867047390041e-05, + "loss": 0.8025, + "step": 22610 + }, + { + "epoch": 0.1999681748262876, + "grad_norm": 4.789428234100342, + "learning_rate": 4.666719708622854e-05, + "loss": 0.7437, + "step": 22620 + }, + { + "epoch": 0.20005657808659982, + "grad_norm": 4.0193190574646, + "learning_rate": 4.6665723698556676e-05, + "loss": 0.6633, + "step": 22630 + }, + { + "epoch": 0.20014498134691208, + "grad_norm": 11.492403030395508, + "learning_rate": 4.66642503108848e-05, + "loss": 0.8315, + "step": 22640 + }, + { + "epoch": 0.2002333846072243, + "grad_norm": 3.3232409954071045, + "learning_rate": 4.666277692321293e-05, + "loss": 0.7715, + "step": 22650 + }, + { + "epoch": 0.20032178786753654, + "grad_norm": 6.283932209014893, + "learning_rate": 4.666130353554106e-05, + "loss": 0.8237, + "step": 22660 + }, + { + "epoch": 0.2004101911278488, + "grad_norm": 8.97296142578125, + "learning_rate": 4.665983014786919e-05, + "loss": 0.8166, + "step": 22670 + }, + { + "epoch": 0.20049859438816103, + "grad_norm": 6.649289131164551, + "learning_rate": 4.665835676019732e-05, + "loss": 0.8446, + "step": 22680 + }, + { + "epoch": 0.20058699764847326, + "grad_norm": 6.020296573638916, + "learning_rate": 4.6656883372525447e-05, + "loss": 0.8312, + "step": 22690 + }, + { + "epoch": 0.20067540090878552, + "grad_norm": 9.102386474609375, + "learning_rate": 4.6655409984853575e-05, + "loss": 0.7975, + "step": 22700 + }, + { + "epoch": 0.20076380416909775, + "grad_norm": 3.375962972640991, + "learning_rate": 4.665393659718171e-05, + "loss": 0.8166, + "step": 22710 + }, + { + "epoch": 0.20085220742940998, + "grad_norm": 3.7020063400268555, + "learning_rate": 4.665246320950983e-05, + "loss": 0.8367, + "step": 22720 + }, + { + "epoch": 0.20094061068972224, + "grad_norm": 8.71592903137207, + "learning_rate": 4.665098982183797e-05, + "loss": 0.8129, + "step": 22730 + }, + { + "epoch": 0.20102901395003447, + "grad_norm": 3.620912790298462, + "learning_rate": 4.6649516434166095e-05, + "loss": 0.8335, + "step": 22740 + }, + { + "epoch": 0.2011174172103467, + "grad_norm": 2.114528179168701, + "learning_rate": 4.6648043046494223e-05, + "loss": 0.8284, + "step": 22750 + }, + { + "epoch": 0.20120582047065896, + "grad_norm": 7.276772975921631, + "learning_rate": 4.664656965882235e-05, + "loss": 0.7431, + "step": 22760 + }, + { + "epoch": 0.2012942237309712, + "grad_norm": 4.809281826019287, + "learning_rate": 4.664509627115049e-05, + "loss": 0.8252, + "step": 22770 + }, + { + "epoch": 0.20138262699128343, + "grad_norm": 2.5595180988311768, + "learning_rate": 4.664362288347861e-05, + "loss": 0.8273, + "step": 22780 + }, + { + "epoch": 0.20147103025159568, + "grad_norm": 7.62675666809082, + "learning_rate": 4.6642149495806744e-05, + "loss": 0.9131, + "step": 22790 + }, + { + "epoch": 0.20155943351190792, + "grad_norm": 5.095913410186768, + "learning_rate": 4.6640676108134865e-05, + "loss": 0.7483, + "step": 22800 + }, + { + "epoch": 0.20164783677222015, + "grad_norm": 4.876876354217529, + "learning_rate": 4.6639202720463e-05, + "loss": 0.7971, + "step": 22810 + }, + { + "epoch": 0.2017362400325324, + "grad_norm": 3.420225143432617, + "learning_rate": 4.663772933279113e-05, + "loss": 0.7375, + "step": 22820 + }, + { + "epoch": 0.20182464329284464, + "grad_norm": 5.955178260803223, + "learning_rate": 4.663625594511926e-05, + "loss": 0.6665, + "step": 22830 + }, + { + "epoch": 0.2019130465531569, + "grad_norm": 5.702826976776123, + "learning_rate": 4.6634782557447385e-05, + "loss": 0.8483, + "step": 22840 + }, + { + "epoch": 0.20200144981346912, + "grad_norm": 4.032488822937012, + "learning_rate": 4.663330916977552e-05, + "loss": 0.7132, + "step": 22850 + }, + { + "epoch": 0.20208985307378136, + "grad_norm": 3.2358500957489014, + "learning_rate": 4.663183578210364e-05, + "loss": 0.8269, + "step": 22860 + }, + { + "epoch": 0.20217825633409361, + "grad_norm": 8.215649604797363, + "learning_rate": 4.663036239443178e-05, + "loss": 0.8625, + "step": 22870 + }, + { + "epoch": 0.20226665959440585, + "grad_norm": 2.6914680004119873, + "learning_rate": 4.6628889006759906e-05, + "loss": 0.7684, + "step": 22880 + }, + { + "epoch": 0.20235506285471808, + "grad_norm": 5.066773891448975, + "learning_rate": 4.6627415619088034e-05, + "loss": 0.8112, + "step": 22890 + }, + { + "epoch": 0.20244346611503033, + "grad_norm": 9.938232421875, + "learning_rate": 4.662594223141616e-05, + "loss": 0.7483, + "step": 22900 + }, + { + "epoch": 0.20253186937534257, + "grad_norm": 9.867469787597656, + "learning_rate": 4.66244688437443e-05, + "loss": 0.8257, + "step": 22910 + }, + { + "epoch": 0.2026202726356548, + "grad_norm": 2.619184970855713, + "learning_rate": 4.662299545607242e-05, + "loss": 0.74, + "step": 22920 + }, + { + "epoch": 0.20270867589596706, + "grad_norm": 7.602010250091553, + "learning_rate": 4.6621522068400554e-05, + "loss": 0.8422, + "step": 22930 + }, + { + "epoch": 0.2027970791562793, + "grad_norm": 4.414831161499023, + "learning_rate": 4.6620048680728676e-05, + "loss": 0.7838, + "step": 22940 + }, + { + "epoch": 0.20288548241659152, + "grad_norm": 4.029686450958252, + "learning_rate": 4.661857529305681e-05, + "loss": 0.783, + "step": 22950 + }, + { + "epoch": 0.20297388567690378, + "grad_norm": 2.539350986480713, + "learning_rate": 4.661710190538494e-05, + "loss": 0.8021, + "step": 22960 + }, + { + "epoch": 0.203062288937216, + "grad_norm": 4.161380767822266, + "learning_rate": 4.661562851771307e-05, + "loss": 0.8077, + "step": 22970 + }, + { + "epoch": 0.20315069219752824, + "grad_norm": 3.5896804332733154, + "learning_rate": 4.6614155130041196e-05, + "loss": 0.8087, + "step": 22980 + }, + { + "epoch": 0.2032390954578405, + "grad_norm": 2.0759224891662598, + "learning_rate": 4.661268174236933e-05, + "loss": 0.8295, + "step": 22990 + }, + { + "epoch": 0.20332749871815273, + "grad_norm": 3.9206223487854004, + "learning_rate": 4.661120835469745e-05, + "loss": 0.8418, + "step": 23000 + }, + { + "epoch": 0.20341590197846496, + "grad_norm": 5.133076190948486, + "learning_rate": 4.660973496702559e-05, + "loss": 0.8928, + "step": 23010 + }, + { + "epoch": 0.20350430523877722, + "grad_norm": 3.193308115005493, + "learning_rate": 4.6608261579353716e-05, + "loss": 0.8147, + "step": 23020 + }, + { + "epoch": 0.20359270849908945, + "grad_norm": 4.302774906158447, + "learning_rate": 4.6606788191681845e-05, + "loss": 0.7561, + "step": 23030 + }, + { + "epoch": 0.20368111175940168, + "grad_norm": 3.4367735385894775, + "learning_rate": 4.660531480400997e-05, + "loss": 0.7127, + "step": 23040 + }, + { + "epoch": 0.20376951501971394, + "grad_norm": 6.8108015060424805, + "learning_rate": 4.66038414163381e-05, + "loss": 0.7675, + "step": 23050 + }, + { + "epoch": 0.20385791828002617, + "grad_norm": 9.134854316711426, + "learning_rate": 4.660236802866623e-05, + "loss": 0.8798, + "step": 23060 + }, + { + "epoch": 0.2039463215403384, + "grad_norm": 2.6153030395507812, + "learning_rate": 4.6600894640994365e-05, + "loss": 0.8223, + "step": 23070 + }, + { + "epoch": 0.20403472480065066, + "grad_norm": 5.8745903968811035, + "learning_rate": 4.6599421253322486e-05, + "loss": 0.8014, + "step": 23080 + }, + { + "epoch": 0.2041231280609629, + "grad_norm": 4.749171257019043, + "learning_rate": 4.659794786565062e-05, + "loss": 0.7336, + "step": 23090 + }, + { + "epoch": 0.20421153132127512, + "grad_norm": 3.4737186431884766, + "learning_rate": 4.659647447797875e-05, + "loss": 0.7971, + "step": 23100 + }, + { + "epoch": 0.20429993458158738, + "grad_norm": 3.0345163345336914, + "learning_rate": 4.659500109030688e-05, + "loss": 0.7936, + "step": 23110 + }, + { + "epoch": 0.2043883378418996, + "grad_norm": 9.059696197509766, + "learning_rate": 4.6593527702635006e-05, + "loss": 0.7744, + "step": 23120 + }, + { + "epoch": 0.20447674110221184, + "grad_norm": 4.60284423828125, + "learning_rate": 4.659205431496314e-05, + "loss": 0.8543, + "step": 23130 + }, + { + "epoch": 0.2045651443625241, + "grad_norm": 5.638271808624268, + "learning_rate": 4.659058092729126e-05, + "loss": 0.8235, + "step": 23140 + }, + { + "epoch": 0.20465354762283633, + "grad_norm": 9.336036682128906, + "learning_rate": 4.65891075396194e-05, + "loss": 0.8429, + "step": 23150 + }, + { + "epoch": 0.20474195088314856, + "grad_norm": 3.498440980911255, + "learning_rate": 4.658763415194753e-05, + "loss": 0.8518, + "step": 23160 + }, + { + "epoch": 0.20483035414346082, + "grad_norm": 4.826033115386963, + "learning_rate": 4.6586160764275655e-05, + "loss": 0.8763, + "step": 23170 + }, + { + "epoch": 0.20491875740377305, + "grad_norm": 7.477751731872559, + "learning_rate": 4.658468737660378e-05, + "loss": 0.7637, + "step": 23180 + }, + { + "epoch": 0.20500716066408528, + "grad_norm": 8.1306791305542, + "learning_rate": 4.658321398893191e-05, + "loss": 0.6267, + "step": 23190 + }, + { + "epoch": 0.20509556392439754, + "grad_norm": 4.623834609985352, + "learning_rate": 4.658174060126004e-05, + "loss": 0.8497, + "step": 23200 + }, + { + "epoch": 0.20518396718470977, + "grad_norm": 4.3841166496276855, + "learning_rate": 4.6580267213588175e-05, + "loss": 0.6705, + "step": 23210 + }, + { + "epoch": 0.205272370445022, + "grad_norm": 3.9328527450561523, + "learning_rate": 4.6578793825916304e-05, + "loss": 0.7929, + "step": 23220 + }, + { + "epoch": 0.20536077370533426, + "grad_norm": 3.7214295864105225, + "learning_rate": 4.657732043824443e-05, + "loss": 0.7466, + "step": 23230 + }, + { + "epoch": 0.2054491769656465, + "grad_norm": 2.744231939315796, + "learning_rate": 4.657584705057256e-05, + "loss": 0.8482, + "step": 23240 + }, + { + "epoch": 0.20553758022595872, + "grad_norm": 7.006786823272705, + "learning_rate": 4.657437366290069e-05, + "loss": 0.7837, + "step": 23250 + }, + { + "epoch": 0.20562598348627098, + "grad_norm": 5.491894721984863, + "learning_rate": 4.657290027522882e-05, + "loss": 0.746, + "step": 23260 + }, + { + "epoch": 0.2057143867465832, + "grad_norm": 6.551908016204834, + "learning_rate": 4.6571426887556945e-05, + "loss": 0.7904, + "step": 23270 + }, + { + "epoch": 0.20580279000689544, + "grad_norm": 4.034269332885742, + "learning_rate": 4.656995349988508e-05, + "loss": 0.8047, + "step": 23280 + }, + { + "epoch": 0.2058911932672077, + "grad_norm": 2.973371982574463, + "learning_rate": 4.656848011221321e-05, + "loss": 0.8121, + "step": 23290 + }, + { + "epoch": 0.20597959652751993, + "grad_norm": 2.032548666000366, + "learning_rate": 4.656700672454134e-05, + "loss": 0.6712, + "step": 23300 + }, + { + "epoch": 0.20606799978783216, + "grad_norm": 8.16117000579834, + "learning_rate": 4.6565533336869466e-05, + "loss": 0.9361, + "step": 23310 + }, + { + "epoch": 0.20615640304814442, + "grad_norm": 5.261748790740967, + "learning_rate": 4.6564059949197594e-05, + "loss": 0.8432, + "step": 23320 + }, + { + "epoch": 0.20624480630845665, + "grad_norm": 3.1846604347229004, + "learning_rate": 4.656258656152572e-05, + "loss": 0.7282, + "step": 23330 + }, + { + "epoch": 0.20633320956876888, + "grad_norm": 3.816189765930176, + "learning_rate": 4.656111317385386e-05, + "loss": 0.8425, + "step": 23340 + }, + { + "epoch": 0.20642161282908114, + "grad_norm": 5.439279556274414, + "learning_rate": 4.6559639786181986e-05, + "loss": 0.7602, + "step": 23350 + }, + { + "epoch": 0.20651001608939337, + "grad_norm": 4.367234230041504, + "learning_rate": 4.6558166398510114e-05, + "loss": 0.76, + "step": 23360 + }, + { + "epoch": 0.20659841934970563, + "grad_norm": 12.20114517211914, + "learning_rate": 4.655669301083824e-05, + "loss": 0.812, + "step": 23370 + }, + { + "epoch": 0.20668682261001786, + "grad_norm": 8.623457908630371, + "learning_rate": 4.655521962316637e-05, + "loss": 0.834, + "step": 23380 + }, + { + "epoch": 0.2067752258703301, + "grad_norm": 2.464705228805542, + "learning_rate": 4.65537462354945e-05, + "loss": 0.7101, + "step": 23390 + }, + { + "epoch": 0.20686362913064235, + "grad_norm": 4.3877482414245605, + "learning_rate": 4.6552272847822634e-05, + "loss": 0.7512, + "step": 23400 + }, + { + "epoch": 0.20695203239095458, + "grad_norm": 1.7268105745315552, + "learning_rate": 4.6550799460150756e-05, + "loss": 0.7052, + "step": 23410 + }, + { + "epoch": 0.2070404356512668, + "grad_norm": 2.445685863494873, + "learning_rate": 4.654932607247889e-05, + "loss": 0.6972, + "step": 23420 + }, + { + "epoch": 0.20712883891157907, + "grad_norm": 2.92224383354187, + "learning_rate": 4.654785268480702e-05, + "loss": 0.7562, + "step": 23430 + }, + { + "epoch": 0.2072172421718913, + "grad_norm": 10.058605194091797, + "learning_rate": 4.654637929713515e-05, + "loss": 0.7735, + "step": 23440 + }, + { + "epoch": 0.20730564543220353, + "grad_norm": 3.240065097808838, + "learning_rate": 4.6544905909463276e-05, + "loss": 0.9023, + "step": 23450 + }, + { + "epoch": 0.2073940486925158, + "grad_norm": 5.235249996185303, + "learning_rate": 4.654343252179141e-05, + "loss": 0.8871, + "step": 23460 + }, + { + "epoch": 0.20748245195282802, + "grad_norm": 5.795628547668457, + "learning_rate": 4.654195913411953e-05, + "loss": 0.725, + "step": 23470 + }, + { + "epoch": 0.20757085521314025, + "grad_norm": 9.360791206359863, + "learning_rate": 4.654048574644767e-05, + "loss": 0.8301, + "step": 23480 + }, + { + "epoch": 0.2076592584734525, + "grad_norm": 3.732430934906006, + "learning_rate": 4.6539012358775796e-05, + "loss": 0.836, + "step": 23490 + }, + { + "epoch": 0.20774766173376474, + "grad_norm": 18.71736717224121, + "learning_rate": 4.6537538971103925e-05, + "loss": 0.7987, + "step": 23500 + }, + { + "epoch": 0.20783606499407697, + "grad_norm": 5.200915336608887, + "learning_rate": 4.653606558343205e-05, + "loss": 0.8573, + "step": 23510 + }, + { + "epoch": 0.20792446825438923, + "grad_norm": 1.8514925241470337, + "learning_rate": 4.653459219576018e-05, + "loss": 0.8253, + "step": 23520 + }, + { + "epoch": 0.20801287151470146, + "grad_norm": 4.80307149887085, + "learning_rate": 4.653311880808831e-05, + "loss": 0.7087, + "step": 23530 + }, + { + "epoch": 0.2081012747750137, + "grad_norm": 8.400822639465332, + "learning_rate": 4.6531645420416445e-05, + "loss": 0.7526, + "step": 23540 + }, + { + "epoch": 0.20818967803532595, + "grad_norm": 7.24381685256958, + "learning_rate": 4.6530172032744566e-05, + "loss": 0.7866, + "step": 23550 + }, + { + "epoch": 0.20827808129563818, + "grad_norm": 7.329807281494141, + "learning_rate": 4.65286986450727e-05, + "loss": 0.834, + "step": 23560 + }, + { + "epoch": 0.20836648455595042, + "grad_norm": 4.557115077972412, + "learning_rate": 4.652722525740083e-05, + "loss": 0.6873, + "step": 23570 + }, + { + "epoch": 0.20845488781626267, + "grad_norm": 4.804540634155273, + "learning_rate": 4.652575186972896e-05, + "loss": 0.6711, + "step": 23580 + }, + { + "epoch": 0.2085432910765749, + "grad_norm": 3.193962574005127, + "learning_rate": 4.6524278482057087e-05, + "loss": 0.8167, + "step": 23590 + }, + { + "epoch": 0.20863169433688714, + "grad_norm": 5.953191757202148, + "learning_rate": 4.652280509438522e-05, + "loss": 0.9337, + "step": 23600 + }, + { + "epoch": 0.2087200975971994, + "grad_norm": 3.995999574661255, + "learning_rate": 4.652133170671334e-05, + "loss": 0.7633, + "step": 23610 + }, + { + "epoch": 0.20880850085751163, + "grad_norm": 3.6463701725006104, + "learning_rate": 4.651985831904148e-05, + "loss": 0.7166, + "step": 23620 + }, + { + "epoch": 0.20889690411782386, + "grad_norm": 3.6311116218566895, + "learning_rate": 4.65183849313696e-05, + "loss": 0.7353, + "step": 23630 + }, + { + "epoch": 0.20898530737813611, + "grad_norm": 9.220864295959473, + "learning_rate": 4.6516911543697735e-05, + "loss": 0.7907, + "step": 23640 + }, + { + "epoch": 0.20907371063844835, + "grad_norm": 2.683284282684326, + "learning_rate": 4.6515438156025863e-05, + "loss": 0.8561, + "step": 23650 + }, + { + "epoch": 0.20916211389876058, + "grad_norm": 5.696201801300049, + "learning_rate": 4.651396476835399e-05, + "loss": 0.7591, + "step": 23660 + }, + { + "epoch": 0.20925051715907284, + "grad_norm": 3.0992331504821777, + "learning_rate": 4.651249138068212e-05, + "loss": 0.7089, + "step": 23670 + }, + { + "epoch": 0.20933892041938507, + "grad_norm": 3.3280680179595947, + "learning_rate": 4.6511017993010255e-05, + "loss": 0.7249, + "step": 23680 + }, + { + "epoch": 0.2094273236796973, + "grad_norm": 2.1861989498138428, + "learning_rate": 4.650954460533838e-05, + "loss": 0.6126, + "step": 23690 + }, + { + "epoch": 0.20951572694000956, + "grad_norm": 9.512399673461914, + "learning_rate": 4.650807121766651e-05, + "loss": 0.8728, + "step": 23700 + }, + { + "epoch": 0.2096041302003218, + "grad_norm": 4.519936561584473, + "learning_rate": 4.650659782999464e-05, + "loss": 0.7135, + "step": 23710 + }, + { + "epoch": 0.20969253346063402, + "grad_norm": 5.0087738037109375, + "learning_rate": 4.650512444232277e-05, + "loss": 0.7502, + "step": 23720 + }, + { + "epoch": 0.20978093672094628, + "grad_norm": 7.563531875610352, + "learning_rate": 4.65036510546509e-05, + "loss": 0.7536, + "step": 23730 + }, + { + "epoch": 0.2098693399812585, + "grad_norm": 1.7587312459945679, + "learning_rate": 4.6502177666979025e-05, + "loss": 0.7315, + "step": 23740 + }, + { + "epoch": 0.20995774324157074, + "grad_norm": 4.123885154724121, + "learning_rate": 4.6500704279307154e-05, + "loss": 0.7776, + "step": 23750 + }, + { + "epoch": 0.210046146501883, + "grad_norm": 4.363010406494141, + "learning_rate": 4.649923089163529e-05, + "loss": 0.7807, + "step": 23760 + }, + { + "epoch": 0.21013454976219523, + "grad_norm": 5.097952365875244, + "learning_rate": 4.649775750396341e-05, + "loss": 0.8445, + "step": 23770 + }, + { + "epoch": 0.21022295302250746, + "grad_norm": 8.998221397399902, + "learning_rate": 4.6496284116291546e-05, + "loss": 0.7876, + "step": 23780 + }, + { + "epoch": 0.21031135628281972, + "grad_norm": 3.2325947284698486, + "learning_rate": 4.6494810728619674e-05, + "loss": 0.7932, + "step": 23790 + }, + { + "epoch": 0.21039975954313195, + "grad_norm": 6.585747718811035, + "learning_rate": 4.64933373409478e-05, + "loss": 0.7614, + "step": 23800 + }, + { + "epoch": 0.21048816280344418, + "grad_norm": 3.006070137023926, + "learning_rate": 4.649186395327593e-05, + "loss": 0.8762, + "step": 23810 + }, + { + "epoch": 0.21057656606375644, + "grad_norm": 5.100870132446289, + "learning_rate": 4.6490390565604066e-05, + "loss": 0.906, + "step": 23820 + }, + { + "epoch": 0.21066496932406867, + "grad_norm": 6.8270745277404785, + "learning_rate": 4.648891717793219e-05, + "loss": 0.8616, + "step": 23830 + }, + { + "epoch": 0.2107533725843809, + "grad_norm": 5.72274112701416, + "learning_rate": 4.648744379026032e-05, + "loss": 0.8046, + "step": 23840 + }, + { + "epoch": 0.21084177584469316, + "grad_norm": 5.608553886413574, + "learning_rate": 4.648597040258845e-05, + "loss": 0.8117, + "step": 23850 + }, + { + "epoch": 0.2109301791050054, + "grad_norm": 4.716365814208984, + "learning_rate": 4.648449701491658e-05, + "loss": 0.8121, + "step": 23860 + }, + { + "epoch": 0.21101858236531762, + "grad_norm": 8.17506217956543, + "learning_rate": 4.648302362724471e-05, + "loss": 0.7239, + "step": 23870 + }, + { + "epoch": 0.21110698562562988, + "grad_norm": 5.354269981384277, + "learning_rate": 4.6481550239572836e-05, + "loss": 0.8251, + "step": 23880 + }, + { + "epoch": 0.2111953888859421, + "grad_norm": 7.155550479888916, + "learning_rate": 4.6480076851900964e-05, + "loss": 0.855, + "step": 23890 + }, + { + "epoch": 0.21128379214625434, + "grad_norm": 4.716402053833008, + "learning_rate": 4.64786034642291e-05, + "loss": 0.8917, + "step": 23900 + }, + { + "epoch": 0.2113721954065666, + "grad_norm": 6.345791339874268, + "learning_rate": 4.647713007655722e-05, + "loss": 0.7738, + "step": 23910 + }, + { + "epoch": 0.21146059866687883, + "grad_norm": 2.3480265140533447, + "learning_rate": 4.6475656688885356e-05, + "loss": 0.8602, + "step": 23920 + }, + { + "epoch": 0.2115490019271911, + "grad_norm": 3.5254249572753906, + "learning_rate": 4.6474183301213484e-05, + "loss": 0.838, + "step": 23930 + }, + { + "epoch": 0.21163740518750332, + "grad_norm": 3.4268798828125, + "learning_rate": 4.647270991354161e-05, + "loss": 0.6845, + "step": 23940 + }, + { + "epoch": 0.21172580844781555, + "grad_norm": 1.6763031482696533, + "learning_rate": 4.647123652586974e-05, + "loss": 0.6627, + "step": 23950 + }, + { + "epoch": 0.2118142117081278, + "grad_norm": 7.624725818634033, + "learning_rate": 4.6469763138197876e-05, + "loss": 0.7902, + "step": 23960 + }, + { + "epoch": 0.21190261496844004, + "grad_norm": 4.639545440673828, + "learning_rate": 4.6468289750526e-05, + "loss": 0.6541, + "step": 23970 + }, + { + "epoch": 0.21199101822875227, + "grad_norm": 2.399841547012329, + "learning_rate": 4.646681636285413e-05, + "loss": 0.8074, + "step": 23980 + }, + { + "epoch": 0.21207942148906453, + "grad_norm": 1.593639850616455, + "learning_rate": 4.6465342975182255e-05, + "loss": 0.7049, + "step": 23990 + }, + { + "epoch": 0.21216782474937676, + "grad_norm": 3.583134412765503, + "learning_rate": 4.646386958751039e-05, + "loss": 0.9288, + "step": 24000 + }, + { + "epoch": 0.212256228009689, + "grad_norm": 3.410632371902466, + "learning_rate": 4.646239619983852e-05, + "loss": 0.7309, + "step": 24010 + }, + { + "epoch": 0.21234463127000125, + "grad_norm": 1.9905996322631836, + "learning_rate": 4.6460922812166646e-05, + "loss": 0.6831, + "step": 24020 + }, + { + "epoch": 0.21243303453031348, + "grad_norm": 5.136494159698486, + "learning_rate": 4.6459449424494775e-05, + "loss": 0.8327, + "step": 24030 + }, + { + "epoch": 0.2125214377906257, + "grad_norm": 8.443650245666504, + "learning_rate": 4.645797603682291e-05, + "loss": 0.768, + "step": 24040 + }, + { + "epoch": 0.21260984105093797, + "grad_norm": 8.261244773864746, + "learning_rate": 4.645650264915103e-05, + "loss": 0.7784, + "step": 24050 + }, + { + "epoch": 0.2126982443112502, + "grad_norm": 5.827929496765137, + "learning_rate": 4.645502926147917e-05, + "loss": 0.7436, + "step": 24060 + }, + { + "epoch": 0.21278664757156243, + "grad_norm": 7.286040782928467, + "learning_rate": 4.6453555873807295e-05, + "loss": 0.7721, + "step": 24070 + }, + { + "epoch": 0.2128750508318747, + "grad_norm": 5.581329345703125, + "learning_rate": 4.645208248613542e-05, + "loss": 0.8695, + "step": 24080 + }, + { + "epoch": 0.21296345409218692, + "grad_norm": 2.1000869274139404, + "learning_rate": 4.645060909846355e-05, + "loss": 0.7413, + "step": 24090 + }, + { + "epoch": 0.21305185735249915, + "grad_norm": 4.673179626464844, + "learning_rate": 4.644913571079168e-05, + "loss": 0.6993, + "step": 24100 + }, + { + "epoch": 0.2131402606128114, + "grad_norm": 5.403928279876709, + "learning_rate": 4.644766232311981e-05, + "loss": 0.7247, + "step": 24110 + }, + { + "epoch": 0.21322866387312364, + "grad_norm": 2.7159504890441895, + "learning_rate": 4.6446188935447944e-05, + "loss": 0.8364, + "step": 24120 + }, + { + "epoch": 0.21331706713343587, + "grad_norm": 6.9059062004089355, + "learning_rate": 4.644471554777607e-05, + "loss": 0.9024, + "step": 24130 + }, + { + "epoch": 0.21340547039374813, + "grad_norm": 8.990559577941895, + "learning_rate": 4.64432421601042e-05, + "loss": 0.7564, + "step": 24140 + }, + { + "epoch": 0.21349387365406036, + "grad_norm": 8.319303512573242, + "learning_rate": 4.644176877243233e-05, + "loss": 0.7636, + "step": 24150 + }, + { + "epoch": 0.2135822769143726, + "grad_norm": 5.187260627746582, + "learning_rate": 4.644029538476046e-05, + "loss": 0.7803, + "step": 24160 + }, + { + "epoch": 0.21367068017468485, + "grad_norm": 4.388763904571533, + "learning_rate": 4.6438821997088585e-05, + "loss": 0.7169, + "step": 24170 + }, + { + "epoch": 0.21375908343499708, + "grad_norm": 8.37094497680664, + "learning_rate": 4.643734860941672e-05, + "loss": 0.7542, + "step": 24180 + }, + { + "epoch": 0.2138474866953093, + "grad_norm": 2.8514420986175537, + "learning_rate": 4.643587522174485e-05, + "loss": 0.7043, + "step": 24190 + }, + { + "epoch": 0.21393588995562157, + "grad_norm": 11.018105506896973, + "learning_rate": 4.643440183407298e-05, + "loss": 0.8111, + "step": 24200 + }, + { + "epoch": 0.2140242932159338, + "grad_norm": 8.870553016662598, + "learning_rate": 4.6432928446401105e-05, + "loss": 0.7464, + "step": 24210 + }, + { + "epoch": 0.21411269647624603, + "grad_norm": 2.422752857208252, + "learning_rate": 4.6431455058729234e-05, + "loss": 0.7303, + "step": 24220 + }, + { + "epoch": 0.2142010997365583, + "grad_norm": 3.441040515899658, + "learning_rate": 4.642998167105736e-05, + "loss": 0.8015, + "step": 24230 + }, + { + "epoch": 0.21428950299687052, + "grad_norm": 4.044212818145752, + "learning_rate": 4.642850828338549e-05, + "loss": 0.6903, + "step": 24240 + }, + { + "epoch": 0.21437790625718275, + "grad_norm": 4.94411563873291, + "learning_rate": 4.6427034895713626e-05, + "loss": 0.7756, + "step": 24250 + }, + { + "epoch": 0.214466309517495, + "grad_norm": 20.273862838745117, + "learning_rate": 4.6425561508041754e-05, + "loss": 0.7523, + "step": 24260 + }, + { + "epoch": 0.21455471277780724, + "grad_norm": 3.986461639404297, + "learning_rate": 4.642408812036988e-05, + "loss": 0.8173, + "step": 24270 + }, + { + "epoch": 0.21464311603811947, + "grad_norm": 4.408081531524658, + "learning_rate": 4.642261473269801e-05, + "loss": 0.8444, + "step": 24280 + }, + { + "epoch": 0.21473151929843173, + "grad_norm": 4.525137901306152, + "learning_rate": 4.642114134502614e-05, + "loss": 0.9284, + "step": 24290 + }, + { + "epoch": 0.21481992255874396, + "grad_norm": 4.937618732452393, + "learning_rate": 4.641966795735427e-05, + "loss": 0.7936, + "step": 24300 + }, + { + "epoch": 0.2149083258190562, + "grad_norm": 2.9195120334625244, + "learning_rate": 4.64181945696824e-05, + "loss": 0.8288, + "step": 24310 + }, + { + "epoch": 0.21499672907936845, + "grad_norm": 8.352294921875, + "learning_rate": 4.641672118201053e-05, + "loss": 0.8597, + "step": 24320 + }, + { + "epoch": 0.21508513233968068, + "grad_norm": 6.399908065795898, + "learning_rate": 4.641524779433866e-05, + "loss": 0.7747, + "step": 24330 + }, + { + "epoch": 0.21517353559999292, + "grad_norm": 6.880349159240723, + "learning_rate": 4.641377440666679e-05, + "loss": 0.7494, + "step": 24340 + }, + { + "epoch": 0.21526193886030517, + "grad_norm": 4.56484317779541, + "learning_rate": 4.6412301018994916e-05, + "loss": 0.6893, + "step": 24350 + }, + { + "epoch": 0.2153503421206174, + "grad_norm": 4.912026405334473, + "learning_rate": 4.6410827631323044e-05, + "loss": 0.9449, + "step": 24360 + }, + { + "epoch": 0.21543874538092964, + "grad_norm": 3.681528329849243, + "learning_rate": 4.640935424365118e-05, + "loss": 0.7282, + "step": 24370 + }, + { + "epoch": 0.2155271486412419, + "grad_norm": 3.427675724029541, + "learning_rate": 4.64078808559793e-05, + "loss": 0.7578, + "step": 24380 + }, + { + "epoch": 0.21561555190155413, + "grad_norm": 6.079005241394043, + "learning_rate": 4.6406407468307436e-05, + "loss": 0.8165, + "step": 24390 + }, + { + "epoch": 0.21570395516186636, + "grad_norm": 5.2435784339904785, + "learning_rate": 4.6404934080635565e-05, + "loss": 0.8901, + "step": 24400 + }, + { + "epoch": 0.21579235842217862, + "grad_norm": 6.158112525939941, + "learning_rate": 4.640346069296369e-05, + "loss": 0.6585, + "step": 24410 + }, + { + "epoch": 0.21588076168249085, + "grad_norm": 6.721048355102539, + "learning_rate": 4.640198730529182e-05, + "loss": 0.6893, + "step": 24420 + }, + { + "epoch": 0.21596916494280308, + "grad_norm": 3.782916784286499, + "learning_rate": 4.6400513917619956e-05, + "loss": 0.7559, + "step": 24430 + }, + { + "epoch": 0.21605756820311534, + "grad_norm": 4.375424861907959, + "learning_rate": 4.639904052994808e-05, + "loss": 0.7981, + "step": 24440 + }, + { + "epoch": 0.21614597146342757, + "grad_norm": 6.188992023468018, + "learning_rate": 4.639756714227621e-05, + "loss": 0.853, + "step": 24450 + }, + { + "epoch": 0.21623437472373983, + "grad_norm": 3.779165267944336, + "learning_rate": 4.6396093754604335e-05, + "loss": 0.8897, + "step": 24460 + }, + { + "epoch": 0.21632277798405206, + "grad_norm": 4.001415729522705, + "learning_rate": 4.639462036693247e-05, + "loss": 0.7805, + "step": 24470 + }, + { + "epoch": 0.2164111812443643, + "grad_norm": 9.615870475769043, + "learning_rate": 4.63931469792606e-05, + "loss": 0.9249, + "step": 24480 + }, + { + "epoch": 0.21649958450467655, + "grad_norm": 2.7439322471618652, + "learning_rate": 4.6391673591588726e-05, + "loss": 0.7208, + "step": 24490 + }, + { + "epoch": 0.21658798776498878, + "grad_norm": 4.920548439025879, + "learning_rate": 4.6390200203916855e-05, + "loss": 0.8799, + "step": 24500 + }, + { + "epoch": 0.216676391025301, + "grad_norm": 4.9506072998046875, + "learning_rate": 4.638872681624499e-05, + "loss": 0.815, + "step": 24510 + }, + { + "epoch": 0.21676479428561327, + "grad_norm": 4.992445468902588, + "learning_rate": 4.638725342857311e-05, + "loss": 0.7988, + "step": 24520 + }, + { + "epoch": 0.2168531975459255, + "grad_norm": 3.184152841567993, + "learning_rate": 4.638578004090125e-05, + "loss": 0.7298, + "step": 24530 + }, + { + "epoch": 0.21694160080623773, + "grad_norm": 3.891679048538208, + "learning_rate": 4.6384306653229375e-05, + "loss": 0.7793, + "step": 24540 + }, + { + "epoch": 0.21703000406655, + "grad_norm": 2.918549060821533, + "learning_rate": 4.6382833265557503e-05, + "loss": 0.7285, + "step": 24550 + }, + { + "epoch": 0.21711840732686222, + "grad_norm": 2.4985158443450928, + "learning_rate": 4.638135987788563e-05, + "loss": 0.9261, + "step": 24560 + }, + { + "epoch": 0.21720681058717445, + "grad_norm": 4.97471284866333, + "learning_rate": 4.637988649021376e-05, + "loss": 0.8452, + "step": 24570 + }, + { + "epoch": 0.2172952138474867, + "grad_norm": 3.9032785892486572, + "learning_rate": 4.637841310254189e-05, + "loss": 0.7743, + "step": 24580 + }, + { + "epoch": 0.21738361710779894, + "grad_norm": 4.991990566253662, + "learning_rate": 4.6376939714870024e-05, + "loss": 0.7649, + "step": 24590 + }, + { + "epoch": 0.21747202036811117, + "grad_norm": 2.503690004348755, + "learning_rate": 4.6375466327198145e-05, + "loss": 0.7208, + "step": 24600 + }, + { + "epoch": 0.21756042362842343, + "grad_norm": 3.138916254043579, + "learning_rate": 4.637399293952628e-05, + "loss": 0.9338, + "step": 24610 + }, + { + "epoch": 0.21764882688873566, + "grad_norm": 5.933905601501465, + "learning_rate": 4.637251955185441e-05, + "loss": 0.7391, + "step": 24620 + }, + { + "epoch": 0.2177372301490479, + "grad_norm": 4.459654808044434, + "learning_rate": 4.637104616418254e-05, + "loss": 0.9008, + "step": 24630 + }, + { + "epoch": 0.21782563340936015, + "grad_norm": 4.548059940338135, + "learning_rate": 4.6369572776510665e-05, + "loss": 0.8563, + "step": 24640 + }, + { + "epoch": 0.21791403666967238, + "grad_norm": 1.5870684385299683, + "learning_rate": 4.63680993888388e-05, + "loss": 0.7146, + "step": 24650 + }, + { + "epoch": 0.2180024399299846, + "grad_norm": 6.337950229644775, + "learning_rate": 4.636662600116692e-05, + "loss": 0.8823, + "step": 24660 + }, + { + "epoch": 0.21809084319029687, + "grad_norm": 3.137346029281616, + "learning_rate": 4.636515261349506e-05, + "loss": 0.8795, + "step": 24670 + }, + { + "epoch": 0.2181792464506091, + "grad_norm": 4.048831462860107, + "learning_rate": 4.6363679225823186e-05, + "loss": 0.8347, + "step": 24680 + }, + { + "epoch": 0.21826764971092133, + "grad_norm": 5.1624555587768555, + "learning_rate": 4.6362205838151314e-05, + "loss": 0.7449, + "step": 24690 + }, + { + "epoch": 0.2183560529712336, + "grad_norm": 3.5979888439178467, + "learning_rate": 4.636073245047944e-05, + "loss": 0.8668, + "step": 24700 + }, + { + "epoch": 0.21844445623154582, + "grad_norm": 5.237533092498779, + "learning_rate": 4.635925906280757e-05, + "loss": 0.8265, + "step": 24710 + }, + { + "epoch": 0.21853285949185805, + "grad_norm": 4.043185710906982, + "learning_rate": 4.63577856751357e-05, + "loss": 0.8415, + "step": 24720 + }, + { + "epoch": 0.2186212627521703, + "grad_norm": 3.46463942527771, + "learning_rate": 4.6356312287463834e-05, + "loss": 0.8238, + "step": 24730 + }, + { + "epoch": 0.21870966601248254, + "grad_norm": 3.736070156097412, + "learning_rate": 4.6354838899791956e-05, + "loss": 0.7936, + "step": 24740 + }, + { + "epoch": 0.21879806927279477, + "grad_norm": 3.1564292907714844, + "learning_rate": 4.635336551212009e-05, + "loss": 0.7544, + "step": 24750 + }, + { + "epoch": 0.21888647253310703, + "grad_norm": 2.714038848876953, + "learning_rate": 4.635189212444822e-05, + "loss": 0.8748, + "step": 24760 + }, + { + "epoch": 0.21897487579341926, + "grad_norm": 2.51503324508667, + "learning_rate": 4.635041873677635e-05, + "loss": 0.8479, + "step": 24770 + }, + { + "epoch": 0.2190632790537315, + "grad_norm": 3.2833774089813232, + "learning_rate": 4.6348945349104476e-05, + "loss": 0.9123, + "step": 24780 + }, + { + "epoch": 0.21915168231404375, + "grad_norm": 11.11308479309082, + "learning_rate": 4.634747196143261e-05, + "loss": 0.8166, + "step": 24790 + }, + { + "epoch": 0.21924008557435598, + "grad_norm": 6.787374019622803, + "learning_rate": 4.634599857376073e-05, + "loss": 0.8701, + "step": 24800 + }, + { + "epoch": 0.2193284888346682, + "grad_norm": 5.895148754119873, + "learning_rate": 4.634452518608887e-05, + "loss": 0.7403, + "step": 24810 + }, + { + "epoch": 0.21941689209498047, + "grad_norm": 3.7027394771575928, + "learning_rate": 4.634305179841699e-05, + "loss": 0.8693, + "step": 24820 + }, + { + "epoch": 0.2195052953552927, + "grad_norm": 4.522027492523193, + "learning_rate": 4.6341578410745124e-05, + "loss": 0.7761, + "step": 24830 + }, + { + "epoch": 0.21959369861560493, + "grad_norm": 9.361669540405273, + "learning_rate": 4.634010502307325e-05, + "loss": 0.6888, + "step": 24840 + }, + { + "epoch": 0.2196821018759172, + "grad_norm": 4.22054386138916, + "learning_rate": 4.633863163540138e-05, + "loss": 0.7377, + "step": 24850 + }, + { + "epoch": 0.21977050513622942, + "grad_norm": 15.273462295532227, + "learning_rate": 4.633715824772951e-05, + "loss": 0.7282, + "step": 24860 + }, + { + "epoch": 0.21985890839654165, + "grad_norm": 6.5163493156433105, + "learning_rate": 4.6335684860057645e-05, + "loss": 0.8005, + "step": 24870 + }, + { + "epoch": 0.2199473116568539, + "grad_norm": 6.353625774383545, + "learning_rate": 4.6334211472385766e-05, + "loss": 0.7718, + "step": 24880 + }, + { + "epoch": 0.22003571491716614, + "grad_norm": 2.002206563949585, + "learning_rate": 4.63327380847139e-05, + "loss": 0.8258, + "step": 24890 + }, + { + "epoch": 0.22012411817747837, + "grad_norm": 2.044071674346924, + "learning_rate": 4.633126469704203e-05, + "loss": 0.8062, + "step": 24900 + }, + { + "epoch": 0.22021252143779063, + "grad_norm": 7.506868362426758, + "learning_rate": 4.632979130937016e-05, + "loss": 0.7715, + "step": 24910 + }, + { + "epoch": 0.22030092469810286, + "grad_norm": 4.116670608520508, + "learning_rate": 4.6328317921698286e-05, + "loss": 0.7053, + "step": 24920 + }, + { + "epoch": 0.2203893279584151, + "grad_norm": 3.8336546421051025, + "learning_rate": 4.6326844534026415e-05, + "loss": 0.8956, + "step": 24930 + }, + { + "epoch": 0.22047773121872735, + "grad_norm": 2.41890549659729, + "learning_rate": 4.632537114635454e-05, + "loss": 0.7353, + "step": 24940 + }, + { + "epoch": 0.22056613447903958, + "grad_norm": 5.446094512939453, + "learning_rate": 4.632389775868268e-05, + "loss": 0.7688, + "step": 24950 + }, + { + "epoch": 0.2206545377393518, + "grad_norm": 3.2237637042999268, + "learning_rate": 4.63224243710108e-05, + "loss": 0.7948, + "step": 24960 + }, + { + "epoch": 0.22074294099966407, + "grad_norm": 2.7062480449676514, + "learning_rate": 4.6320950983338935e-05, + "loss": 0.7024, + "step": 24970 + }, + { + "epoch": 0.2208313442599763, + "grad_norm": 2.661411762237549, + "learning_rate": 4.631947759566706e-05, + "loss": 0.7522, + "step": 24980 + }, + { + "epoch": 0.22091974752028856, + "grad_norm": 2.45198392868042, + "learning_rate": 4.631800420799519e-05, + "loss": 0.6827, + "step": 24990 + }, + { + "epoch": 0.2210081507806008, + "grad_norm": 15.193877220153809, + "learning_rate": 4.631653082032332e-05, + "loss": 0.8492, + "step": 25000 + }, + { + "epoch": 0.22109655404091302, + "grad_norm": 6.492677688598633, + "learning_rate": 4.6315057432651455e-05, + "loss": 0.8934, + "step": 25010 + }, + { + "epoch": 0.22118495730122528, + "grad_norm": 4.400557518005371, + "learning_rate": 4.631358404497958e-05, + "loss": 0.7585, + "step": 25020 + }, + { + "epoch": 0.2212733605615375, + "grad_norm": 4.58823823928833, + "learning_rate": 4.631211065730771e-05, + "loss": 0.8256, + "step": 25030 + }, + { + "epoch": 0.22136176382184974, + "grad_norm": 2.6293187141418457, + "learning_rate": 4.631063726963584e-05, + "loss": 0.7012, + "step": 25040 + }, + { + "epoch": 0.221450167082162, + "grad_norm": 4.5372395515441895, + "learning_rate": 4.630916388196397e-05, + "loss": 0.8542, + "step": 25050 + }, + { + "epoch": 0.22153857034247423, + "grad_norm": 6.668642997741699, + "learning_rate": 4.63076904942921e-05, + "loss": 0.9586, + "step": 25060 + }, + { + "epoch": 0.22162697360278646, + "grad_norm": 6.260819435119629, + "learning_rate": 4.6306217106620225e-05, + "loss": 0.8067, + "step": 25070 + }, + { + "epoch": 0.22171537686309872, + "grad_norm": 6.783611297607422, + "learning_rate": 4.6304743718948354e-05, + "loss": 0.8143, + "step": 25080 + }, + { + "epoch": 0.22180378012341095, + "grad_norm": 5.217430114746094, + "learning_rate": 4.630327033127649e-05, + "loss": 0.807, + "step": 25090 + }, + { + "epoch": 0.22189218338372318, + "grad_norm": 7.699872016906738, + "learning_rate": 4.630179694360462e-05, + "loss": 0.7318, + "step": 25100 + }, + { + "epoch": 0.22198058664403544, + "grad_norm": 3.798049211502075, + "learning_rate": 4.6300323555932745e-05, + "loss": 0.7622, + "step": 25110 + }, + { + "epoch": 0.22206898990434767, + "grad_norm": 8.742514610290527, + "learning_rate": 4.6298850168260874e-05, + "loss": 0.7571, + "step": 25120 + }, + { + "epoch": 0.2221573931646599, + "grad_norm": 2.609999418258667, + "learning_rate": 4.6297376780589e-05, + "loss": 0.7713, + "step": 25130 + }, + { + "epoch": 0.22224579642497216, + "grad_norm": 3.4092695713043213, + "learning_rate": 4.629590339291713e-05, + "loss": 0.7371, + "step": 25140 + }, + { + "epoch": 0.2223341996852844, + "grad_norm": 3.388076066970825, + "learning_rate": 4.6294430005245266e-05, + "loss": 0.8635, + "step": 25150 + }, + { + "epoch": 0.22242260294559663, + "grad_norm": 4.0364990234375, + "learning_rate": 4.6292956617573394e-05, + "loss": 0.777, + "step": 25160 + }, + { + "epoch": 0.22251100620590888, + "grad_norm": 8.548382759094238, + "learning_rate": 4.629148322990152e-05, + "loss": 0.7714, + "step": 25170 + }, + { + "epoch": 0.22259940946622112, + "grad_norm": 9.222799301147461, + "learning_rate": 4.629000984222965e-05, + "loss": 0.7877, + "step": 25180 + }, + { + "epoch": 0.22268781272653335, + "grad_norm": 4.6049485206604, + "learning_rate": 4.628853645455778e-05, + "loss": 0.7727, + "step": 25190 + }, + { + "epoch": 0.2227762159868456, + "grad_norm": 14.871455192565918, + "learning_rate": 4.628706306688591e-05, + "loss": 0.7477, + "step": 25200 + }, + { + "epoch": 0.22286461924715784, + "grad_norm": 4.842946529388428, + "learning_rate": 4.6285589679214036e-05, + "loss": 0.7824, + "step": 25210 + }, + { + "epoch": 0.22295302250747007, + "grad_norm": 4.010987281799316, + "learning_rate": 4.628411629154217e-05, + "loss": 0.7365, + "step": 25220 + }, + { + "epoch": 0.22304142576778233, + "grad_norm": 5.73057746887207, + "learning_rate": 4.62826429038703e-05, + "loss": 0.7973, + "step": 25230 + }, + { + "epoch": 0.22312982902809456, + "grad_norm": 3.9699220657348633, + "learning_rate": 4.628116951619843e-05, + "loss": 0.8923, + "step": 25240 + }, + { + "epoch": 0.2232182322884068, + "grad_norm": 2.9780991077423096, + "learning_rate": 4.6279696128526556e-05, + "loss": 0.6232, + "step": 25250 + }, + { + "epoch": 0.22330663554871905, + "grad_norm": 5.854918003082275, + "learning_rate": 4.6278222740854684e-05, + "loss": 0.9031, + "step": 25260 + }, + { + "epoch": 0.22339503880903128, + "grad_norm": 7.665713787078857, + "learning_rate": 4.627674935318281e-05, + "loss": 0.8961, + "step": 25270 + }, + { + "epoch": 0.2234834420693435, + "grad_norm": 4.15420389175415, + "learning_rate": 4.627527596551095e-05, + "loss": 0.8783, + "step": 25280 + }, + { + "epoch": 0.22357184532965577, + "grad_norm": 3.887390375137329, + "learning_rate": 4.627380257783907e-05, + "loss": 0.8838, + "step": 25290 + }, + { + "epoch": 0.223660248589968, + "grad_norm": 2.9458789825439453, + "learning_rate": 4.6272329190167204e-05, + "loss": 0.7953, + "step": 25300 + }, + { + "epoch": 0.22374865185028023, + "grad_norm": 7.877856731414795, + "learning_rate": 4.627085580249533e-05, + "loss": 0.8599, + "step": 25310 + }, + { + "epoch": 0.2238370551105925, + "grad_norm": 2.4428250789642334, + "learning_rate": 4.626938241482346e-05, + "loss": 0.7414, + "step": 25320 + }, + { + "epoch": 0.22392545837090472, + "grad_norm": 4.274850368499756, + "learning_rate": 4.626790902715159e-05, + "loss": 0.7594, + "step": 25330 + }, + { + "epoch": 0.22401386163121695, + "grad_norm": 7.199704170227051, + "learning_rate": 4.6266435639479725e-05, + "loss": 0.7728, + "step": 25340 + }, + { + "epoch": 0.2241022648915292, + "grad_norm": 4.567758560180664, + "learning_rate": 4.6264962251807846e-05, + "loss": 0.7248, + "step": 25350 + }, + { + "epoch": 0.22419066815184144, + "grad_norm": 4.301654815673828, + "learning_rate": 4.626348886413598e-05, + "loss": 0.7015, + "step": 25360 + }, + { + "epoch": 0.22427907141215367, + "grad_norm": 2.2330312728881836, + "learning_rate": 4.626201547646411e-05, + "loss": 0.6446, + "step": 25370 + }, + { + "epoch": 0.22436747467246593, + "grad_norm": 6.475268840789795, + "learning_rate": 4.626054208879224e-05, + "loss": 0.7297, + "step": 25380 + }, + { + "epoch": 0.22445587793277816, + "grad_norm": 2.9262239933013916, + "learning_rate": 4.6259068701120366e-05, + "loss": 0.7867, + "step": 25390 + }, + { + "epoch": 0.2245442811930904, + "grad_norm": 6.337355613708496, + "learning_rate": 4.6257595313448495e-05, + "loss": 0.8341, + "step": 25400 + }, + { + "epoch": 0.22463268445340265, + "grad_norm": 2.55621337890625, + "learning_rate": 4.625612192577662e-05, + "loss": 0.8733, + "step": 25410 + }, + { + "epoch": 0.22472108771371488, + "grad_norm": 7.797390460968018, + "learning_rate": 4.625464853810476e-05, + "loss": 0.7046, + "step": 25420 + }, + { + "epoch": 0.2248094909740271, + "grad_norm": 9.955615997314453, + "learning_rate": 4.625317515043288e-05, + "loss": 0.7353, + "step": 25430 + }, + { + "epoch": 0.22489789423433937, + "grad_norm": 2.787454605102539, + "learning_rate": 4.6251701762761015e-05, + "loss": 0.7961, + "step": 25440 + }, + { + "epoch": 0.2249862974946516, + "grad_norm": 5.0545549392700195, + "learning_rate": 4.625022837508914e-05, + "loss": 0.7113, + "step": 25450 + }, + { + "epoch": 0.22507470075496383, + "grad_norm": 2.6684982776641846, + "learning_rate": 4.624875498741727e-05, + "loss": 0.7372, + "step": 25460 + }, + { + "epoch": 0.2251631040152761, + "grad_norm": 2.917710542678833, + "learning_rate": 4.62472815997454e-05, + "loss": 0.7434, + "step": 25470 + }, + { + "epoch": 0.22525150727558832, + "grad_norm": 3.460366725921631, + "learning_rate": 4.6245808212073535e-05, + "loss": 0.7249, + "step": 25480 + }, + { + "epoch": 0.22533991053590055, + "grad_norm": 3.279576539993286, + "learning_rate": 4.624433482440166e-05, + "loss": 0.7065, + "step": 25490 + }, + { + "epoch": 0.2254283137962128, + "grad_norm": 2.606126070022583, + "learning_rate": 4.624286143672979e-05, + "loss": 0.8126, + "step": 25500 + }, + { + "epoch": 0.22551671705652504, + "grad_norm": 2.7988295555114746, + "learning_rate": 4.6241388049057913e-05, + "loss": 0.8257, + "step": 25510 + }, + { + "epoch": 0.2256051203168373, + "grad_norm": 2.0422515869140625, + "learning_rate": 4.623991466138605e-05, + "loss": 0.6915, + "step": 25520 + }, + { + "epoch": 0.22569352357714953, + "grad_norm": 2.5446791648864746, + "learning_rate": 4.623844127371418e-05, + "loss": 0.6996, + "step": 25530 + }, + { + "epoch": 0.22578192683746176, + "grad_norm": 7.3593549728393555, + "learning_rate": 4.6236967886042305e-05, + "loss": 0.8156, + "step": 25540 + }, + { + "epoch": 0.22587033009777402, + "grad_norm": 7.084205150604248, + "learning_rate": 4.6235494498370434e-05, + "loss": 0.7561, + "step": 25550 + }, + { + "epoch": 0.22595873335808625, + "grad_norm": 4.026143550872803, + "learning_rate": 4.623402111069857e-05, + "loss": 0.8163, + "step": 25560 + }, + { + "epoch": 0.22604713661839848, + "grad_norm": 9.809626579284668, + "learning_rate": 4.623254772302669e-05, + "loss": 0.7599, + "step": 25570 + }, + { + "epoch": 0.22613553987871074, + "grad_norm": 5.922295093536377, + "learning_rate": 4.6231074335354825e-05, + "loss": 0.7928, + "step": 25580 + }, + { + "epoch": 0.22622394313902297, + "grad_norm": 8.087105751037598, + "learning_rate": 4.6229600947682954e-05, + "loss": 0.8638, + "step": 25590 + }, + { + "epoch": 0.2263123463993352, + "grad_norm": 1.9569982290267944, + "learning_rate": 4.622812756001108e-05, + "loss": 0.817, + "step": 25600 + }, + { + "epoch": 0.22640074965964746, + "grad_norm": 3.5140202045440674, + "learning_rate": 4.622665417233921e-05, + "loss": 0.7167, + "step": 25610 + }, + { + "epoch": 0.2264891529199597, + "grad_norm": 1.2821376323699951, + "learning_rate": 4.6225180784667346e-05, + "loss": 0.6847, + "step": 25620 + }, + { + "epoch": 0.22657755618027192, + "grad_norm": 4.603971004486084, + "learning_rate": 4.622370739699547e-05, + "loss": 0.6777, + "step": 25630 + }, + { + "epoch": 0.22666595944058418, + "grad_norm": 2.99287748336792, + "learning_rate": 4.62222340093236e-05, + "loss": 0.7352, + "step": 25640 + }, + { + "epoch": 0.2267543627008964, + "grad_norm": 13.168510437011719, + "learning_rate": 4.6220760621651724e-05, + "loss": 0.889, + "step": 25650 + }, + { + "epoch": 0.22684276596120864, + "grad_norm": 3.5099494457244873, + "learning_rate": 4.621928723397986e-05, + "loss": 0.8372, + "step": 25660 + }, + { + "epoch": 0.2269311692215209, + "grad_norm": 1.6717056035995483, + "learning_rate": 4.621781384630799e-05, + "loss": 0.584, + "step": 25670 + }, + { + "epoch": 0.22701957248183313, + "grad_norm": 2.5716476440429688, + "learning_rate": 4.6216340458636116e-05, + "loss": 0.8171, + "step": 25680 + }, + { + "epoch": 0.22710797574214536, + "grad_norm": 7.323851585388184, + "learning_rate": 4.6214867070964244e-05, + "loss": 0.7505, + "step": 25690 + }, + { + "epoch": 0.22719637900245762, + "grad_norm": 9.877863883972168, + "learning_rate": 4.621339368329238e-05, + "loss": 0.7003, + "step": 25700 + }, + { + "epoch": 0.22728478226276985, + "grad_norm": 5.2179646492004395, + "learning_rate": 4.62119202956205e-05, + "loss": 0.903, + "step": 25710 + }, + { + "epoch": 0.22737318552308208, + "grad_norm": 7.904167175292969, + "learning_rate": 4.6210446907948636e-05, + "loss": 0.835, + "step": 25720 + }, + { + "epoch": 0.22746158878339434, + "grad_norm": 4.367486476898193, + "learning_rate": 4.6208973520276764e-05, + "loss": 0.6942, + "step": 25730 + }, + { + "epoch": 0.22754999204370657, + "grad_norm": 8.75403118133545, + "learning_rate": 4.620750013260489e-05, + "loss": 0.7477, + "step": 25740 + }, + { + "epoch": 0.2276383953040188, + "grad_norm": 4.122621536254883, + "learning_rate": 4.620602674493302e-05, + "loss": 0.872, + "step": 25750 + }, + { + "epoch": 0.22772679856433106, + "grad_norm": 2.2306032180786133, + "learning_rate": 4.620455335726115e-05, + "loss": 0.7103, + "step": 25760 + }, + { + "epoch": 0.2278152018246433, + "grad_norm": 13.444779396057129, + "learning_rate": 4.620307996958928e-05, + "loss": 0.8849, + "step": 25770 + }, + { + "epoch": 0.22790360508495552, + "grad_norm": 10.491046905517578, + "learning_rate": 4.620160658191741e-05, + "loss": 0.8103, + "step": 25780 + }, + { + "epoch": 0.22799200834526778, + "grad_norm": 2.5766170024871826, + "learning_rate": 4.6200133194245534e-05, + "loss": 0.8632, + "step": 25790 + }, + { + "epoch": 0.22808041160558, + "grad_norm": 2.156754493713379, + "learning_rate": 4.619865980657367e-05, + "loss": 0.7755, + "step": 25800 + }, + { + "epoch": 0.22816881486589224, + "grad_norm": 4.522340774536133, + "learning_rate": 4.61971864189018e-05, + "loss": 0.6697, + "step": 25810 + }, + { + "epoch": 0.2282572181262045, + "grad_norm": 5.178337574005127, + "learning_rate": 4.6195713031229926e-05, + "loss": 0.7709, + "step": 25820 + }, + { + "epoch": 0.22834562138651673, + "grad_norm": 4.399623870849609, + "learning_rate": 4.6194239643558055e-05, + "loss": 0.753, + "step": 25830 + }, + { + "epoch": 0.22843402464682896, + "grad_norm": 10.465398788452148, + "learning_rate": 4.619276625588619e-05, + "loss": 0.764, + "step": 25840 + }, + { + "epoch": 0.22852242790714122, + "grad_norm": 4.597443103790283, + "learning_rate": 4.619129286821431e-05, + "loss": 0.8505, + "step": 25850 + }, + { + "epoch": 0.22861083116745345, + "grad_norm": 2.543112277984619, + "learning_rate": 4.6189819480542447e-05, + "loss": 0.8435, + "step": 25860 + }, + { + "epoch": 0.22869923442776569, + "grad_norm": 8.69192123413086, + "learning_rate": 4.618834609287057e-05, + "loss": 0.73, + "step": 25870 + }, + { + "epoch": 0.22878763768807794, + "grad_norm": 4.735687255859375, + "learning_rate": 4.61868727051987e-05, + "loss": 0.7655, + "step": 25880 + }, + { + "epoch": 0.22887604094839017, + "grad_norm": 7.194514751434326, + "learning_rate": 4.618539931752683e-05, + "loss": 0.7434, + "step": 25890 + }, + { + "epoch": 0.2289644442087024, + "grad_norm": 3.0040442943573, + "learning_rate": 4.618392592985496e-05, + "loss": 0.7199, + "step": 25900 + }, + { + "epoch": 0.22905284746901466, + "grad_norm": 8.181593894958496, + "learning_rate": 4.618245254218309e-05, + "loss": 0.6733, + "step": 25910 + }, + { + "epoch": 0.2291412507293269, + "grad_norm": 4.748161315917969, + "learning_rate": 4.6180979154511223e-05, + "loss": 0.7654, + "step": 25920 + }, + { + "epoch": 0.22922965398963913, + "grad_norm": 5.968911170959473, + "learning_rate": 4.6179505766839345e-05, + "loss": 0.8292, + "step": 25930 + }, + { + "epoch": 0.22931805724995138, + "grad_norm": 1.8074452877044678, + "learning_rate": 4.617803237916748e-05, + "loss": 0.7501, + "step": 25940 + }, + { + "epoch": 0.22940646051026362, + "grad_norm": 7.659058570861816, + "learning_rate": 4.617655899149561e-05, + "loss": 0.7532, + "step": 25950 + }, + { + "epoch": 0.22949486377057585, + "grad_norm": 6.085169315338135, + "learning_rate": 4.617508560382374e-05, + "loss": 0.7895, + "step": 25960 + }, + { + "epoch": 0.2295832670308881, + "grad_norm": 19.16455841064453, + "learning_rate": 4.6173612216151865e-05, + "loss": 0.7464, + "step": 25970 + }, + { + "epoch": 0.22967167029120034, + "grad_norm": 6.268245697021484, + "learning_rate": 4.6172138828479994e-05, + "loss": 0.7581, + "step": 25980 + }, + { + "epoch": 0.22976007355151257, + "grad_norm": 3.595996141433716, + "learning_rate": 4.617066544080812e-05, + "loss": 0.7438, + "step": 25990 + }, + { + "epoch": 0.22984847681182483, + "grad_norm": 5.319154739379883, + "learning_rate": 4.616919205313626e-05, + "loss": 0.8218, + "step": 26000 + }, + { + "epoch": 0.22993688007213706, + "grad_norm": 5.387119770050049, + "learning_rate": 4.6167718665464385e-05, + "loss": 0.8435, + "step": 26010 + }, + { + "epoch": 0.2300252833324493, + "grad_norm": 2.5516552925109863, + "learning_rate": 4.6166245277792514e-05, + "loss": 0.8539, + "step": 26020 + }, + { + "epoch": 0.23011368659276155, + "grad_norm": 4.0663862228393555, + "learning_rate": 4.616477189012064e-05, + "loss": 0.7074, + "step": 26030 + }, + { + "epoch": 0.23020208985307378, + "grad_norm": 1.879555344581604, + "learning_rate": 4.616329850244877e-05, + "loss": 0.9003, + "step": 26040 + }, + { + "epoch": 0.230290493113386, + "grad_norm": 4.159968376159668, + "learning_rate": 4.61618251147769e-05, + "loss": 0.7554, + "step": 26050 + }, + { + "epoch": 0.23037889637369827, + "grad_norm": 8.559286117553711, + "learning_rate": 4.6160351727105034e-05, + "loss": 0.6224, + "step": 26060 + }, + { + "epoch": 0.2304672996340105, + "grad_norm": 6.309031009674072, + "learning_rate": 4.615887833943316e-05, + "loss": 0.7677, + "step": 26070 + }, + { + "epoch": 0.23055570289432276, + "grad_norm": 8.937350273132324, + "learning_rate": 4.615740495176129e-05, + "loss": 0.7323, + "step": 26080 + }, + { + "epoch": 0.230644106154635, + "grad_norm": 2.526836633682251, + "learning_rate": 4.615593156408942e-05, + "loss": 0.7272, + "step": 26090 + }, + { + "epoch": 0.23073250941494722, + "grad_norm": 2.119723081588745, + "learning_rate": 4.615445817641755e-05, + "loss": 0.7469, + "step": 26100 + }, + { + "epoch": 0.23082091267525948, + "grad_norm": 4.8421502113342285, + "learning_rate": 4.6152984788745676e-05, + "loss": 0.9137, + "step": 26110 + }, + { + "epoch": 0.2309093159355717, + "grad_norm": 4.467935562133789, + "learning_rate": 4.6151511401073804e-05, + "loss": 0.8161, + "step": 26120 + }, + { + "epoch": 0.23099771919588394, + "grad_norm": 4.147600173950195, + "learning_rate": 4.615003801340194e-05, + "loss": 0.9402, + "step": 26130 + }, + { + "epoch": 0.2310861224561962, + "grad_norm": 3.4456043243408203, + "learning_rate": 4.614856462573007e-05, + "loss": 0.7644, + "step": 26140 + }, + { + "epoch": 0.23117452571650843, + "grad_norm": 3.3381314277648926, + "learning_rate": 4.6147091238058196e-05, + "loss": 0.7687, + "step": 26150 + }, + { + "epoch": 0.23126292897682066, + "grad_norm": 5.1820549964904785, + "learning_rate": 4.6145617850386324e-05, + "loss": 0.858, + "step": 26160 + }, + { + "epoch": 0.23135133223713292, + "grad_norm": 2.9732067584991455, + "learning_rate": 4.614414446271445e-05, + "loss": 0.762, + "step": 26170 + }, + { + "epoch": 0.23143973549744515, + "grad_norm": 4.63769006729126, + "learning_rate": 4.614267107504258e-05, + "loss": 0.672, + "step": 26180 + }, + { + "epoch": 0.23152813875775738, + "grad_norm": 5.112890720367432, + "learning_rate": 4.6141197687370716e-05, + "loss": 0.629, + "step": 26190 + }, + { + "epoch": 0.23161654201806964, + "grad_norm": 3.3782641887664795, + "learning_rate": 4.6139724299698844e-05, + "loss": 0.6637, + "step": 26200 + }, + { + "epoch": 0.23170494527838187, + "grad_norm": 2.266312599182129, + "learning_rate": 4.613825091202697e-05, + "loss": 0.9511, + "step": 26210 + }, + { + "epoch": 0.2317933485386941, + "grad_norm": 8.544103622436523, + "learning_rate": 4.61367775243551e-05, + "loss": 0.8023, + "step": 26220 + }, + { + "epoch": 0.23188175179900636, + "grad_norm": 3.6856672763824463, + "learning_rate": 4.613530413668323e-05, + "loss": 0.7015, + "step": 26230 + }, + { + "epoch": 0.2319701550593186, + "grad_norm": 17.594270706176758, + "learning_rate": 4.613383074901136e-05, + "loss": 0.7831, + "step": 26240 + }, + { + "epoch": 0.23205855831963082, + "grad_norm": 4.329131603240967, + "learning_rate": 4.613235736133949e-05, + "loss": 0.787, + "step": 26250 + }, + { + "epoch": 0.23214696157994308, + "grad_norm": 7.807017803192139, + "learning_rate": 4.6130883973667615e-05, + "loss": 0.7351, + "step": 26260 + }, + { + "epoch": 0.2322353648402553, + "grad_norm": 5.564336776733398, + "learning_rate": 4.612941058599575e-05, + "loss": 0.8509, + "step": 26270 + }, + { + "epoch": 0.23232376810056754, + "grad_norm": 4.030319690704346, + "learning_rate": 4.612793719832388e-05, + "loss": 0.7824, + "step": 26280 + }, + { + "epoch": 0.2324121713608798, + "grad_norm": 3.7925074100494385, + "learning_rate": 4.6126463810652006e-05, + "loss": 0.741, + "step": 26290 + }, + { + "epoch": 0.23250057462119203, + "grad_norm": 5.255957126617432, + "learning_rate": 4.6124990422980135e-05, + "loss": 0.8586, + "step": 26300 + }, + { + "epoch": 0.23258897788150426, + "grad_norm": 2.7999472618103027, + "learning_rate": 4.612351703530827e-05, + "loss": 0.7317, + "step": 26310 + }, + { + "epoch": 0.23267738114181652, + "grad_norm": 7.2217254638671875, + "learning_rate": 4.612204364763639e-05, + "loss": 0.8289, + "step": 26320 + }, + { + "epoch": 0.23276578440212875, + "grad_norm": 2.340519666671753, + "learning_rate": 4.6120570259964527e-05, + "loss": 0.7462, + "step": 26330 + }, + { + "epoch": 0.23285418766244098, + "grad_norm": 11.22556209564209, + "learning_rate": 4.611909687229265e-05, + "loss": 0.7649, + "step": 26340 + }, + { + "epoch": 0.23294259092275324, + "grad_norm": 2.4115490913391113, + "learning_rate": 4.611762348462078e-05, + "loss": 0.8036, + "step": 26350 + }, + { + "epoch": 0.23303099418306547, + "grad_norm": 5.332367897033691, + "learning_rate": 4.611615009694891e-05, + "loss": 0.8238, + "step": 26360 + }, + { + "epoch": 0.2331193974433777, + "grad_norm": 3.896395683288574, + "learning_rate": 4.611467670927704e-05, + "loss": 0.6377, + "step": 26370 + }, + { + "epoch": 0.23320780070368996, + "grad_norm": 3.6120071411132812, + "learning_rate": 4.611320332160517e-05, + "loss": 0.6768, + "step": 26380 + }, + { + "epoch": 0.2332962039640022, + "grad_norm": 4.14366340637207, + "learning_rate": 4.6111729933933303e-05, + "loss": 0.8205, + "step": 26390 + }, + { + "epoch": 0.23338460722431442, + "grad_norm": 4.731985569000244, + "learning_rate": 4.6110256546261425e-05, + "loss": 0.7406, + "step": 26400 + }, + { + "epoch": 0.23347301048462668, + "grad_norm": 5.177968502044678, + "learning_rate": 4.610878315858956e-05, + "loss": 0.8204, + "step": 26410 + }, + { + "epoch": 0.2335614137449389, + "grad_norm": 1.977300763130188, + "learning_rate": 4.610730977091769e-05, + "loss": 0.8042, + "step": 26420 + }, + { + "epoch": 0.23364981700525114, + "grad_norm": 5.712028503417969, + "learning_rate": 4.610583638324582e-05, + "loss": 0.8159, + "step": 26430 + }, + { + "epoch": 0.2337382202655634, + "grad_norm": 21.36813735961914, + "learning_rate": 4.6104362995573945e-05, + "loss": 0.8236, + "step": 26440 + }, + { + "epoch": 0.23382662352587563, + "grad_norm": 9.561662673950195, + "learning_rate": 4.6102889607902074e-05, + "loss": 0.6932, + "step": 26450 + }, + { + "epoch": 0.23391502678618786, + "grad_norm": 2.5096518993377686, + "learning_rate": 4.61014162202302e-05, + "loss": 0.7873, + "step": 26460 + }, + { + "epoch": 0.23400343004650012, + "grad_norm": 14.823768615722656, + "learning_rate": 4.609994283255834e-05, + "loss": 0.8304, + "step": 26470 + }, + { + "epoch": 0.23409183330681235, + "grad_norm": 9.787788391113281, + "learning_rate": 4.609846944488646e-05, + "loss": 0.7191, + "step": 26480 + }, + { + "epoch": 0.23418023656712458, + "grad_norm": 7.436131477355957, + "learning_rate": 4.6096996057214594e-05, + "loss": 0.9062, + "step": 26490 + }, + { + "epoch": 0.23426863982743684, + "grad_norm": 5.646495342254639, + "learning_rate": 4.609552266954272e-05, + "loss": 0.9358, + "step": 26500 + }, + { + "epoch": 0.23435704308774907, + "grad_norm": 11.70395565032959, + "learning_rate": 4.609404928187085e-05, + "loss": 0.8903, + "step": 26510 + }, + { + "epoch": 0.2344454463480613, + "grad_norm": 3.845158100128174, + "learning_rate": 4.609257589419898e-05, + "loss": 0.8289, + "step": 26520 + }, + { + "epoch": 0.23453384960837356, + "grad_norm": 6.0214948654174805, + "learning_rate": 4.6091102506527114e-05, + "loss": 0.7012, + "step": 26530 + }, + { + "epoch": 0.2346222528686858, + "grad_norm": 7.993401050567627, + "learning_rate": 4.6089629118855236e-05, + "loss": 0.8474, + "step": 26540 + }, + { + "epoch": 0.23471065612899802, + "grad_norm": 8.60523509979248, + "learning_rate": 4.608815573118337e-05, + "loss": 0.781, + "step": 26550 + }, + { + "epoch": 0.23479905938931028, + "grad_norm": 3.2778806686401367, + "learning_rate": 4.60866823435115e-05, + "loss": 0.762, + "step": 26560 + }, + { + "epoch": 0.2348874626496225, + "grad_norm": 3.393656015396118, + "learning_rate": 4.608520895583963e-05, + "loss": 0.7628, + "step": 26570 + }, + { + "epoch": 0.23497586590993474, + "grad_norm": 2.1216790676116943, + "learning_rate": 4.6083735568167756e-05, + "loss": 0.7846, + "step": 26580 + }, + { + "epoch": 0.235064269170247, + "grad_norm": 4.257018089294434, + "learning_rate": 4.6082262180495884e-05, + "loss": 0.7494, + "step": 26590 + }, + { + "epoch": 0.23515267243055923, + "grad_norm": 4.937124252319336, + "learning_rate": 4.608078879282401e-05, + "loss": 0.7143, + "step": 26600 + }, + { + "epoch": 0.2352410756908715, + "grad_norm": 1.3708324432373047, + "learning_rate": 4.607931540515215e-05, + "loss": 0.7661, + "step": 26610 + }, + { + "epoch": 0.23532947895118372, + "grad_norm": 6.229585647583008, + "learning_rate": 4.607784201748027e-05, + "loss": 0.839, + "step": 26620 + }, + { + "epoch": 0.23541788221149595, + "grad_norm": 4.585300922393799, + "learning_rate": 4.6076368629808404e-05, + "loss": 0.7521, + "step": 26630 + }, + { + "epoch": 0.2355062854718082, + "grad_norm": 7.729150295257568, + "learning_rate": 4.607489524213653e-05, + "loss": 0.638, + "step": 26640 + }, + { + "epoch": 0.23559468873212044, + "grad_norm": 1.7038516998291016, + "learning_rate": 4.607342185446466e-05, + "loss": 0.7783, + "step": 26650 + }, + { + "epoch": 0.23568309199243268, + "grad_norm": 2.533660411834717, + "learning_rate": 4.607194846679279e-05, + "loss": 0.7886, + "step": 26660 + }, + { + "epoch": 0.23577149525274493, + "grad_norm": 5.983635425567627, + "learning_rate": 4.6070475079120924e-05, + "loss": 0.8338, + "step": 26670 + }, + { + "epoch": 0.23585989851305716, + "grad_norm": 1.9739922285079956, + "learning_rate": 4.6069001691449046e-05, + "loss": 0.6989, + "step": 26680 + }, + { + "epoch": 0.2359483017733694, + "grad_norm": 3.406186819076538, + "learning_rate": 4.606752830377718e-05, + "loss": 0.7917, + "step": 26690 + }, + { + "epoch": 0.23603670503368165, + "grad_norm": 5.995182514190674, + "learning_rate": 4.60660549161053e-05, + "loss": 0.7603, + "step": 26700 + }, + { + "epoch": 0.23612510829399389, + "grad_norm": 6.2465009689331055, + "learning_rate": 4.606458152843344e-05, + "loss": 0.7102, + "step": 26710 + }, + { + "epoch": 0.23621351155430612, + "grad_norm": 4.614730358123779, + "learning_rate": 4.6063108140761566e-05, + "loss": 0.8176, + "step": 26720 + }, + { + "epoch": 0.23630191481461837, + "grad_norm": 1.6852394342422485, + "learning_rate": 4.6061634753089695e-05, + "loss": 0.8228, + "step": 26730 + }, + { + "epoch": 0.2363903180749306, + "grad_norm": 2.212242841720581, + "learning_rate": 4.606016136541782e-05, + "loss": 0.6936, + "step": 26740 + }, + { + "epoch": 0.23647872133524284, + "grad_norm": 1.9207688570022583, + "learning_rate": 4.605868797774596e-05, + "loss": 0.7462, + "step": 26750 + }, + { + "epoch": 0.2365671245955551, + "grad_norm": 5.530776023864746, + "learning_rate": 4.605721459007408e-05, + "loss": 0.8081, + "step": 26760 + }, + { + "epoch": 0.23665552785586733, + "grad_norm": 6.019812107086182, + "learning_rate": 4.6055741202402215e-05, + "loss": 0.8332, + "step": 26770 + }, + { + "epoch": 0.23674393111617956, + "grad_norm": 4.056359767913818, + "learning_rate": 4.605426781473034e-05, + "loss": 0.7116, + "step": 26780 + }, + { + "epoch": 0.23683233437649182, + "grad_norm": 17.098737716674805, + "learning_rate": 4.605279442705847e-05, + "loss": 0.8057, + "step": 26790 + }, + { + "epoch": 0.23692073763680405, + "grad_norm": 3.866529703140259, + "learning_rate": 4.60513210393866e-05, + "loss": 0.7172, + "step": 26800 + }, + { + "epoch": 0.23700914089711628, + "grad_norm": 4.999325752258301, + "learning_rate": 4.604984765171473e-05, + "loss": 0.7121, + "step": 26810 + }, + { + "epoch": 0.23709754415742854, + "grad_norm": 4.941673278808594, + "learning_rate": 4.6048374264042857e-05, + "loss": 0.616, + "step": 26820 + }, + { + "epoch": 0.23718594741774077, + "grad_norm": 8.706204414367676, + "learning_rate": 4.604690087637099e-05, + "loss": 0.9083, + "step": 26830 + }, + { + "epoch": 0.237274350678053, + "grad_norm": 6.6517205238342285, + "learning_rate": 4.604542748869911e-05, + "loss": 0.6911, + "step": 26840 + }, + { + "epoch": 0.23736275393836526, + "grad_norm": 5.494711875915527, + "learning_rate": 4.604395410102725e-05, + "loss": 0.7714, + "step": 26850 + }, + { + "epoch": 0.2374511571986775, + "grad_norm": 5.264119625091553, + "learning_rate": 4.604248071335538e-05, + "loss": 0.6888, + "step": 26860 + }, + { + "epoch": 0.23753956045898972, + "grad_norm": 2.502356767654419, + "learning_rate": 4.6041007325683505e-05, + "loss": 0.9187, + "step": 26870 + }, + { + "epoch": 0.23762796371930198, + "grad_norm": 3.20857572555542, + "learning_rate": 4.6039533938011633e-05, + "loss": 0.8616, + "step": 26880 + }, + { + "epoch": 0.2377163669796142, + "grad_norm": 3.5631120204925537, + "learning_rate": 4.603806055033977e-05, + "loss": 0.7893, + "step": 26890 + }, + { + "epoch": 0.23780477023992644, + "grad_norm": 1.9661896228790283, + "learning_rate": 4.603658716266789e-05, + "loss": 0.7289, + "step": 26900 + }, + { + "epoch": 0.2378931735002387, + "grad_norm": 16.530513763427734, + "learning_rate": 4.6035113774996025e-05, + "loss": 0.7672, + "step": 26910 + }, + { + "epoch": 0.23798157676055093, + "grad_norm": 5.208324909210205, + "learning_rate": 4.6033640387324154e-05, + "loss": 0.5701, + "step": 26920 + }, + { + "epoch": 0.23806998002086316, + "grad_norm": 6.618350982666016, + "learning_rate": 4.603216699965228e-05, + "loss": 0.8072, + "step": 26930 + }, + { + "epoch": 0.23815838328117542, + "grad_norm": 5.809628009796143, + "learning_rate": 4.603069361198041e-05, + "loss": 0.6595, + "step": 26940 + }, + { + "epoch": 0.23824678654148765, + "grad_norm": 4.600014686584473, + "learning_rate": 4.602922022430854e-05, + "loss": 0.724, + "step": 26950 + }, + { + "epoch": 0.23833518980179988, + "grad_norm": 7.521449089050293, + "learning_rate": 4.602774683663667e-05, + "loss": 0.965, + "step": 26960 + }, + { + "epoch": 0.23842359306211214, + "grad_norm": 6.910586357116699, + "learning_rate": 4.60262734489648e-05, + "loss": 0.7996, + "step": 26970 + }, + { + "epoch": 0.23851199632242437, + "grad_norm": 2.6404194831848145, + "learning_rate": 4.602480006129293e-05, + "loss": 0.7884, + "step": 26980 + }, + { + "epoch": 0.2386003995827366, + "grad_norm": 3.6584150791168213, + "learning_rate": 4.602332667362106e-05, + "loss": 0.5845, + "step": 26990 + }, + { + "epoch": 0.23868880284304886, + "grad_norm": 3.9669134616851807, + "learning_rate": 4.602185328594919e-05, + "loss": 0.7716, + "step": 27000 + }, + { + "epoch": 0.2387772061033611, + "grad_norm": 3.2543506622314453, + "learning_rate": 4.6020379898277316e-05, + "loss": 0.732, + "step": 27010 + }, + { + "epoch": 0.23886560936367332, + "grad_norm": 6.32755708694458, + "learning_rate": 4.6018906510605444e-05, + "loss": 0.7717, + "step": 27020 + }, + { + "epoch": 0.23895401262398558, + "grad_norm": 3.7160096168518066, + "learning_rate": 4.601743312293358e-05, + "loss": 0.7618, + "step": 27030 + }, + { + "epoch": 0.2390424158842978, + "grad_norm": 4.574923515319824, + "learning_rate": 4.601595973526171e-05, + "loss": 0.7261, + "step": 27040 + }, + { + "epoch": 0.23913081914461004, + "grad_norm": 8.13255500793457, + "learning_rate": 4.6014486347589836e-05, + "loss": 0.7097, + "step": 27050 + }, + { + "epoch": 0.2392192224049223, + "grad_norm": 5.212277412414551, + "learning_rate": 4.6013012959917964e-05, + "loss": 0.7916, + "step": 27060 + }, + { + "epoch": 0.23930762566523453, + "grad_norm": 7.050914764404297, + "learning_rate": 4.601153957224609e-05, + "loss": 0.6378, + "step": 27070 + }, + { + "epoch": 0.23939602892554676, + "grad_norm": 2.6058921813964844, + "learning_rate": 4.601006618457422e-05, + "loss": 0.7586, + "step": 27080 + }, + { + "epoch": 0.23948443218585902, + "grad_norm": 2.754028081893921, + "learning_rate": 4.600859279690235e-05, + "loss": 0.8515, + "step": 27090 + }, + { + "epoch": 0.23957283544617125, + "grad_norm": 4.837860107421875, + "learning_rate": 4.6007119409230484e-05, + "loss": 0.8845, + "step": 27100 + }, + { + "epoch": 0.23966123870648348, + "grad_norm": 3.7284557819366455, + "learning_rate": 4.600564602155861e-05, + "loss": 0.9217, + "step": 27110 + }, + { + "epoch": 0.23974964196679574, + "grad_norm": 3.5191853046417236, + "learning_rate": 4.600417263388674e-05, + "loss": 0.7213, + "step": 27120 + }, + { + "epoch": 0.23983804522710797, + "grad_norm": 4.532087802886963, + "learning_rate": 4.600269924621487e-05, + "loss": 0.8679, + "step": 27130 + }, + { + "epoch": 0.23992644848742023, + "grad_norm": 4.2826337814331055, + "learning_rate": 4.6001225858543e-05, + "loss": 0.6832, + "step": 27140 + }, + { + "epoch": 0.24001485174773246, + "grad_norm": 2.4995365142822266, + "learning_rate": 4.5999752470871126e-05, + "loss": 0.583, + "step": 27150 + }, + { + "epoch": 0.2401032550080447, + "grad_norm": 3.484452247619629, + "learning_rate": 4.599827908319926e-05, + "loss": 0.7244, + "step": 27160 + }, + { + "epoch": 0.24019165826835695, + "grad_norm": 5.703882217407227, + "learning_rate": 4.599680569552738e-05, + "loss": 0.7813, + "step": 27170 + }, + { + "epoch": 0.24028006152866918, + "grad_norm": 8.513440132141113, + "learning_rate": 4.599533230785552e-05, + "loss": 0.8008, + "step": 27180 + }, + { + "epoch": 0.2403684647889814, + "grad_norm": 3.886124610900879, + "learning_rate": 4.5993858920183646e-05, + "loss": 0.8106, + "step": 27190 + }, + { + "epoch": 0.24045686804929367, + "grad_norm": 2.080432415008545, + "learning_rate": 4.5992385532511775e-05, + "loss": 0.775, + "step": 27200 + }, + { + "epoch": 0.2405452713096059, + "grad_norm": 4.064215660095215, + "learning_rate": 4.59909121448399e-05, + "loss": 0.7833, + "step": 27210 + }, + { + "epoch": 0.24063367456991813, + "grad_norm": 1.8495577573776245, + "learning_rate": 4.598943875716804e-05, + "loss": 0.7723, + "step": 27220 + }, + { + "epoch": 0.2407220778302304, + "grad_norm": 1.6558035612106323, + "learning_rate": 4.598796536949616e-05, + "loss": 0.8189, + "step": 27230 + }, + { + "epoch": 0.24081048109054262, + "grad_norm": 4.51279878616333, + "learning_rate": 4.5986491981824295e-05, + "loss": 0.7964, + "step": 27240 + }, + { + "epoch": 0.24089888435085485, + "grad_norm": 9.106587409973145, + "learning_rate": 4.598501859415242e-05, + "loss": 0.8, + "step": 27250 + }, + { + "epoch": 0.2409872876111671, + "grad_norm": 3.420579671859741, + "learning_rate": 4.598354520648055e-05, + "loss": 0.7039, + "step": 27260 + }, + { + "epoch": 0.24107569087147934, + "grad_norm": 9.412304878234863, + "learning_rate": 4.598207181880868e-05, + "loss": 0.8447, + "step": 27270 + }, + { + "epoch": 0.24116409413179157, + "grad_norm": 1.5723434686660767, + "learning_rate": 4.598059843113681e-05, + "loss": 0.7944, + "step": 27280 + }, + { + "epoch": 0.24125249739210383, + "grad_norm": 2.9342610836029053, + "learning_rate": 4.597912504346494e-05, + "loss": 0.8458, + "step": 27290 + }, + { + "epoch": 0.24134090065241606, + "grad_norm": 11.885449409484863, + "learning_rate": 4.597765165579307e-05, + "loss": 0.7906, + "step": 27300 + }, + { + "epoch": 0.2414293039127283, + "grad_norm": 3.185608148574829, + "learning_rate": 4.597617826812119e-05, + "loss": 0.9034, + "step": 27310 + }, + { + "epoch": 0.24151770717304055, + "grad_norm": 5.354928970336914, + "learning_rate": 4.597470488044933e-05, + "loss": 0.8559, + "step": 27320 + }, + { + "epoch": 0.24160611043335278, + "grad_norm": 3.766575336456299, + "learning_rate": 4.597323149277746e-05, + "loss": 0.7863, + "step": 27330 + }, + { + "epoch": 0.24169451369366501, + "grad_norm": 3.2981998920440674, + "learning_rate": 4.5971758105105585e-05, + "loss": 0.8559, + "step": 27340 + }, + { + "epoch": 0.24178291695397727, + "grad_norm": 5.427745819091797, + "learning_rate": 4.5970284717433714e-05, + "loss": 0.9341, + "step": 27350 + }, + { + "epoch": 0.2418713202142895, + "grad_norm": 8.101058959960938, + "learning_rate": 4.596881132976185e-05, + "loss": 0.7719, + "step": 27360 + }, + { + "epoch": 0.24195972347460173, + "grad_norm": 2.6489789485931396, + "learning_rate": 4.596733794208997e-05, + "loss": 0.6542, + "step": 27370 + }, + { + "epoch": 0.242048126734914, + "grad_norm": 5.422354221343994, + "learning_rate": 4.5965864554418105e-05, + "loss": 0.889, + "step": 27380 + }, + { + "epoch": 0.24213652999522622, + "grad_norm": 3.1926510334014893, + "learning_rate": 4.5964391166746234e-05, + "loss": 0.8579, + "step": 27390 + }, + { + "epoch": 0.24222493325553845, + "grad_norm": 7.074790000915527, + "learning_rate": 4.596291777907436e-05, + "loss": 0.7468, + "step": 27400 + }, + { + "epoch": 0.2423133365158507, + "grad_norm": 3.0544276237487793, + "learning_rate": 4.596144439140249e-05, + "loss": 0.6276, + "step": 27410 + }, + { + "epoch": 0.24240173977616294, + "grad_norm": 2.9188649654388428, + "learning_rate": 4.595997100373062e-05, + "loss": 0.8359, + "step": 27420 + }, + { + "epoch": 0.24249014303647518, + "grad_norm": 8.019341468811035, + "learning_rate": 4.595849761605875e-05, + "loss": 0.7151, + "step": 27430 + }, + { + "epoch": 0.24257854629678743, + "grad_norm": 3.7809083461761475, + "learning_rate": 4.595702422838688e-05, + "loss": 0.7658, + "step": 27440 + }, + { + "epoch": 0.24266694955709966, + "grad_norm": 2.2371914386749268, + "learning_rate": 4.5955550840715004e-05, + "loss": 0.6999, + "step": 27450 + }, + { + "epoch": 0.2427553528174119, + "grad_norm": 2.8070926666259766, + "learning_rate": 4.595407745304314e-05, + "loss": 0.7651, + "step": 27460 + }, + { + "epoch": 0.24284375607772415, + "grad_norm": 1.8410590887069702, + "learning_rate": 4.595260406537127e-05, + "loss": 0.6764, + "step": 27470 + }, + { + "epoch": 0.24293215933803639, + "grad_norm": 1.984129786491394, + "learning_rate": 4.5951130677699396e-05, + "loss": 0.6803, + "step": 27480 + }, + { + "epoch": 0.24302056259834862, + "grad_norm": 4.3879313468933105, + "learning_rate": 4.5949657290027524e-05, + "loss": 0.8164, + "step": 27490 + }, + { + "epoch": 0.24310896585866087, + "grad_norm": 2.5979106426239014, + "learning_rate": 4.594818390235566e-05, + "loss": 0.7351, + "step": 27500 + }, + { + "epoch": 0.2431973691189731, + "grad_norm": 8.960977554321289, + "learning_rate": 4.594671051468378e-05, + "loss": 0.8408, + "step": 27510 + }, + { + "epoch": 0.24328577237928534, + "grad_norm": 5.437119960784912, + "learning_rate": 4.5945237127011916e-05, + "loss": 0.7397, + "step": 27520 + }, + { + "epoch": 0.2433741756395976, + "grad_norm": 4.247231960296631, + "learning_rate": 4.594376373934004e-05, + "loss": 0.7607, + "step": 27530 + }, + { + "epoch": 0.24346257889990983, + "grad_norm": 7.173487663269043, + "learning_rate": 4.594229035166817e-05, + "loss": 0.7898, + "step": 27540 + }, + { + "epoch": 0.24355098216022206, + "grad_norm": 4.7880682945251465, + "learning_rate": 4.59408169639963e-05, + "loss": 0.7287, + "step": 27550 + }, + { + "epoch": 0.24363938542053432, + "grad_norm": 2.163510322570801, + "learning_rate": 4.593934357632443e-05, + "loss": 0.8242, + "step": 27560 + }, + { + "epoch": 0.24372778868084655, + "grad_norm": 2.545175313949585, + "learning_rate": 4.593787018865256e-05, + "loss": 0.7271, + "step": 27570 + }, + { + "epoch": 0.24381619194115878, + "grad_norm": 7.023959636688232, + "learning_rate": 4.593639680098069e-05, + "loss": 0.755, + "step": 27580 + }, + { + "epoch": 0.24390459520147104, + "grad_norm": 5.333517551422119, + "learning_rate": 4.5934923413308814e-05, + "loss": 0.8837, + "step": 27590 + }, + { + "epoch": 0.24399299846178327, + "grad_norm": 4.334492206573486, + "learning_rate": 4.593345002563695e-05, + "loss": 0.8622, + "step": 27600 + }, + { + "epoch": 0.2440814017220955, + "grad_norm": 5.533698558807373, + "learning_rate": 4.593197663796508e-05, + "loss": 0.8461, + "step": 27610 + }, + { + "epoch": 0.24416980498240776, + "grad_norm": 12.364745140075684, + "learning_rate": 4.5930503250293206e-05, + "loss": 0.7126, + "step": 27620 + }, + { + "epoch": 0.24425820824272, + "grad_norm": 7.118648052215576, + "learning_rate": 4.5929029862621335e-05, + "loss": 0.7628, + "step": 27630 + }, + { + "epoch": 0.24434661150303222, + "grad_norm": 3.5467753410339355, + "learning_rate": 4.592755647494946e-05, + "loss": 0.7172, + "step": 27640 + }, + { + "epoch": 0.24443501476334448, + "grad_norm": 3.2943429946899414, + "learning_rate": 4.592608308727759e-05, + "loss": 0.6839, + "step": 27650 + }, + { + "epoch": 0.2445234180236567, + "grad_norm": 5.245155334472656, + "learning_rate": 4.5924609699605726e-05, + "loss": 0.7213, + "step": 27660 + }, + { + "epoch": 0.24461182128396897, + "grad_norm": 4.831164360046387, + "learning_rate": 4.592313631193385e-05, + "loss": 0.8247, + "step": 27670 + }, + { + "epoch": 0.2447002245442812, + "grad_norm": 4.629475116729736, + "learning_rate": 4.592166292426198e-05, + "loss": 0.7381, + "step": 27680 + }, + { + "epoch": 0.24478862780459343, + "grad_norm": 6.993200302124023, + "learning_rate": 4.592018953659011e-05, + "loss": 0.6655, + "step": 27690 + }, + { + "epoch": 0.2448770310649057, + "grad_norm": 3.4463162422180176, + "learning_rate": 4.591871614891824e-05, + "loss": 0.8809, + "step": 27700 + }, + { + "epoch": 0.24496543432521792, + "grad_norm": 3.353153705596924, + "learning_rate": 4.591724276124637e-05, + "loss": 0.7817, + "step": 27710 + }, + { + "epoch": 0.24505383758553015, + "grad_norm": 8.690110206604004, + "learning_rate": 4.59157693735745e-05, + "loss": 0.8445, + "step": 27720 + }, + { + "epoch": 0.2451422408458424, + "grad_norm": 5.797652721405029, + "learning_rate": 4.5914295985902625e-05, + "loss": 0.6521, + "step": 27730 + }, + { + "epoch": 0.24523064410615464, + "grad_norm": 7.199275016784668, + "learning_rate": 4.591282259823076e-05, + "loss": 0.7692, + "step": 27740 + }, + { + "epoch": 0.24531904736646687, + "grad_norm": 7.3925886154174805, + "learning_rate": 4.591134921055888e-05, + "loss": 0.8059, + "step": 27750 + }, + { + "epoch": 0.24540745062677913, + "grad_norm": 4.901026725769043, + "learning_rate": 4.590987582288702e-05, + "loss": 0.82, + "step": 27760 + }, + { + "epoch": 0.24549585388709136, + "grad_norm": 5.541647434234619, + "learning_rate": 4.5908402435215145e-05, + "loss": 0.7396, + "step": 27770 + }, + { + "epoch": 0.2455842571474036, + "grad_norm": 3.0398240089416504, + "learning_rate": 4.5906929047543273e-05, + "loss": 0.715, + "step": 27780 + }, + { + "epoch": 0.24567266040771585, + "grad_norm": 6.12975549697876, + "learning_rate": 4.59054556598714e-05, + "loss": 0.7317, + "step": 27790 + }, + { + "epoch": 0.24576106366802808, + "grad_norm": 5.894838809967041, + "learning_rate": 4.590398227219954e-05, + "loss": 0.9235, + "step": 27800 + }, + { + "epoch": 0.2458494669283403, + "grad_norm": 4.738114833831787, + "learning_rate": 4.590250888452766e-05, + "loss": 0.8374, + "step": 27810 + }, + { + "epoch": 0.24593787018865257, + "grad_norm": 3.513248920440674, + "learning_rate": 4.5901035496855794e-05, + "loss": 0.7429, + "step": 27820 + }, + { + "epoch": 0.2460262734489648, + "grad_norm": 7.718358039855957, + "learning_rate": 4.589956210918392e-05, + "loss": 0.7613, + "step": 27830 + }, + { + "epoch": 0.24611467670927703, + "grad_norm": 2.4155099391937256, + "learning_rate": 4.589808872151205e-05, + "loss": 0.8053, + "step": 27840 + }, + { + "epoch": 0.2462030799695893, + "grad_norm": 2.726713180541992, + "learning_rate": 4.589661533384018e-05, + "loss": 0.7494, + "step": 27850 + }, + { + "epoch": 0.24629148322990152, + "grad_norm": 4.137597560882568, + "learning_rate": 4.5895141946168314e-05, + "loss": 0.7612, + "step": 27860 + }, + { + "epoch": 0.24637988649021375, + "grad_norm": 4.160385608673096, + "learning_rate": 4.5893668558496435e-05, + "loss": 0.7191, + "step": 27870 + }, + { + "epoch": 0.246468289750526, + "grad_norm": 4.072481155395508, + "learning_rate": 4.589219517082457e-05, + "loss": 0.8007, + "step": 27880 + }, + { + "epoch": 0.24655669301083824, + "grad_norm": 7.5308637619018555, + "learning_rate": 4.58907217831527e-05, + "loss": 0.8299, + "step": 27890 + }, + { + "epoch": 0.24664509627115047, + "grad_norm": 6.215305328369141, + "learning_rate": 4.588924839548083e-05, + "loss": 0.7622, + "step": 27900 + }, + { + "epoch": 0.24673349953146273, + "grad_norm": 3.666205406188965, + "learning_rate": 4.5887775007808956e-05, + "loss": 0.6522, + "step": 27910 + }, + { + "epoch": 0.24682190279177496, + "grad_norm": 4.303045749664307, + "learning_rate": 4.5886301620137084e-05, + "loss": 0.8075, + "step": 27920 + }, + { + "epoch": 0.2469103060520872, + "grad_norm": 3.4121458530426025, + "learning_rate": 4.588482823246521e-05, + "loss": 0.7274, + "step": 27930 + }, + { + "epoch": 0.24699870931239945, + "grad_norm": 7.0728936195373535, + "learning_rate": 4.588335484479335e-05, + "loss": 0.9065, + "step": 27940 + }, + { + "epoch": 0.24708711257271168, + "grad_norm": 2.6692259311676025, + "learning_rate": 4.5881881457121476e-05, + "loss": 0.7414, + "step": 27950 + }, + { + "epoch": 0.2471755158330239, + "grad_norm": 4.277823448181152, + "learning_rate": 4.5880408069449604e-05, + "loss": 0.7551, + "step": 27960 + }, + { + "epoch": 0.24726391909333617, + "grad_norm": 6.4987921714782715, + "learning_rate": 4.587893468177773e-05, + "loss": 0.8438, + "step": 27970 + }, + { + "epoch": 0.2473523223536484, + "grad_norm": 3.7967724800109863, + "learning_rate": 4.587746129410586e-05, + "loss": 0.8001, + "step": 27980 + }, + { + "epoch": 0.24744072561396063, + "grad_norm": 8.63526725769043, + "learning_rate": 4.5875987906433996e-05, + "loss": 0.8722, + "step": 27990 + }, + { + "epoch": 0.2475291288742729, + "grad_norm": 4.242366313934326, + "learning_rate": 4.587451451876212e-05, + "loss": 0.7051, + "step": 28000 + }, + { + "epoch": 0.24761753213458512, + "grad_norm": 10.421416282653809, + "learning_rate": 4.587304113109025e-05, + "loss": 0.7388, + "step": 28010 + }, + { + "epoch": 0.24770593539489735, + "grad_norm": 2.73309588432312, + "learning_rate": 4.587156774341838e-05, + "loss": 0.6632, + "step": 28020 + }, + { + "epoch": 0.2477943386552096, + "grad_norm": 6.762021064758301, + "learning_rate": 4.587009435574651e-05, + "loss": 0.7726, + "step": 28030 + }, + { + "epoch": 0.24788274191552184, + "grad_norm": 2.386453866958618, + "learning_rate": 4.586862096807464e-05, + "loss": 0.8197, + "step": 28040 + }, + { + "epoch": 0.24797114517583407, + "grad_norm": 2.4636244773864746, + "learning_rate": 4.586714758040277e-05, + "loss": 0.8321, + "step": 28050 + }, + { + "epoch": 0.24805954843614633, + "grad_norm": 5.344876766204834, + "learning_rate": 4.5865674192730894e-05, + "loss": 0.9034, + "step": 28060 + }, + { + "epoch": 0.24814795169645856, + "grad_norm": 2.711806297302246, + "learning_rate": 4.586420080505903e-05, + "loss": 0.7574, + "step": 28070 + }, + { + "epoch": 0.2482363549567708, + "grad_norm": 3.473045825958252, + "learning_rate": 4.586272741738716e-05, + "loss": 0.7393, + "step": 28080 + }, + { + "epoch": 0.24832475821708305, + "grad_norm": 9.402650833129883, + "learning_rate": 4.5861254029715286e-05, + "loss": 0.7757, + "step": 28090 + }, + { + "epoch": 0.24841316147739528, + "grad_norm": 2.2732865810394287, + "learning_rate": 4.5859780642043415e-05, + "loss": 0.7385, + "step": 28100 + }, + { + "epoch": 0.24850156473770751, + "grad_norm": 2.804537534713745, + "learning_rate": 4.585830725437154e-05, + "loss": 0.7192, + "step": 28110 + }, + { + "epoch": 0.24858996799801977, + "grad_norm": 12.078184127807617, + "learning_rate": 4.585683386669967e-05, + "loss": 0.7096, + "step": 28120 + }, + { + "epoch": 0.248678371258332, + "grad_norm": 5.753732681274414, + "learning_rate": 4.5855360479027806e-05, + "loss": 0.7176, + "step": 28130 + }, + { + "epoch": 0.24876677451864423, + "grad_norm": 10.479565620422363, + "learning_rate": 4.585388709135593e-05, + "loss": 0.7143, + "step": 28140 + }, + { + "epoch": 0.2488551777789565, + "grad_norm": 3.086230993270874, + "learning_rate": 4.585241370368406e-05, + "loss": 0.9096, + "step": 28150 + }, + { + "epoch": 0.24894358103926872, + "grad_norm": 4.537883281707764, + "learning_rate": 4.585094031601219e-05, + "loss": 0.8254, + "step": 28160 + }, + { + "epoch": 0.24903198429958096, + "grad_norm": 5.582494258880615, + "learning_rate": 4.584946692834032e-05, + "loss": 0.8814, + "step": 28170 + }, + { + "epoch": 0.2491203875598932, + "grad_norm": 4.154595851898193, + "learning_rate": 4.584799354066845e-05, + "loss": 0.7157, + "step": 28180 + }, + { + "epoch": 0.24920879082020544, + "grad_norm": 5.340351581573486, + "learning_rate": 4.584652015299658e-05, + "loss": 0.7943, + "step": 28190 + }, + { + "epoch": 0.2492971940805177, + "grad_norm": 14.03711986541748, + "learning_rate": 4.5845046765324705e-05, + "loss": 0.8136, + "step": 28200 + }, + { + "epoch": 0.24938559734082993, + "grad_norm": 3.375035285949707, + "learning_rate": 4.584357337765284e-05, + "loss": 0.7918, + "step": 28210 + }, + { + "epoch": 0.24947400060114217, + "grad_norm": 5.254472255706787, + "learning_rate": 4.584209998998096e-05, + "loss": 0.6279, + "step": 28220 + }, + { + "epoch": 0.24956240386145442, + "grad_norm": 7.013589382171631, + "learning_rate": 4.58406266023091e-05, + "loss": 1.0012, + "step": 28230 + }, + { + "epoch": 0.24965080712176665, + "grad_norm": 15.84322452545166, + "learning_rate": 4.5839153214637225e-05, + "loss": 0.8033, + "step": 28240 + }, + { + "epoch": 0.24973921038207889, + "grad_norm": 4.328507423400879, + "learning_rate": 4.5837679826965353e-05, + "loss": 0.6547, + "step": 28250 + }, + { + "epoch": 0.24982761364239114, + "grad_norm": 3.8125171661376953, + "learning_rate": 4.583620643929348e-05, + "loss": 0.8319, + "step": 28260 + }, + { + "epoch": 0.24991601690270338, + "grad_norm": 1.9388508796691895, + "learning_rate": 4.583473305162162e-05, + "loss": 0.9568, + "step": 28270 + }, + { + "epoch": 0.2500044201630156, + "grad_norm": 6.131495475769043, + "learning_rate": 4.583325966394974e-05, + "loss": 0.8844, + "step": 28280 + }, + { + "epoch": 0.25009282342332784, + "grad_norm": 1.8883302211761475, + "learning_rate": 4.5831786276277874e-05, + "loss": 0.771, + "step": 28290 + }, + { + "epoch": 0.25018122668364007, + "grad_norm": 3.7101242542266846, + "learning_rate": 4.5830312888606e-05, + "loss": 0.6934, + "step": 28300 + }, + { + "epoch": 0.25026962994395235, + "grad_norm": 7.083991527557373, + "learning_rate": 4.582883950093413e-05, + "loss": 0.7457, + "step": 28310 + }, + { + "epoch": 0.2503580332042646, + "grad_norm": 6.361082077026367, + "learning_rate": 4.582736611326226e-05, + "loss": 0.6809, + "step": 28320 + }, + { + "epoch": 0.2504464364645768, + "grad_norm": 4.989359378814697, + "learning_rate": 4.5825892725590394e-05, + "loss": 0.8322, + "step": 28330 + }, + { + "epoch": 0.25053483972488905, + "grad_norm": 10.848586082458496, + "learning_rate": 4.5824419337918515e-05, + "loss": 0.7794, + "step": 28340 + }, + { + "epoch": 0.2506232429852013, + "grad_norm": 2.3265511989593506, + "learning_rate": 4.582294595024665e-05, + "loss": 0.8029, + "step": 28350 + }, + { + "epoch": 0.2507116462455135, + "grad_norm": 5.207944869995117, + "learning_rate": 4.582147256257477e-05, + "loss": 0.7481, + "step": 28360 + }, + { + "epoch": 0.2508000495058258, + "grad_norm": 11.006331443786621, + "learning_rate": 4.581999917490291e-05, + "loss": 0.866, + "step": 28370 + }, + { + "epoch": 0.250888452766138, + "grad_norm": 5.190815448760986, + "learning_rate": 4.5818525787231036e-05, + "loss": 0.7849, + "step": 28380 + }, + { + "epoch": 0.25097685602645026, + "grad_norm": 3.069892168045044, + "learning_rate": 4.5817052399559164e-05, + "loss": 0.7629, + "step": 28390 + }, + { + "epoch": 0.2510652592867625, + "grad_norm": 3.2880120277404785, + "learning_rate": 4.581557901188729e-05, + "loss": 0.7874, + "step": 28400 + }, + { + "epoch": 0.2511536625470747, + "grad_norm": 6.7434401512146, + "learning_rate": 4.581410562421543e-05, + "loss": 0.7109, + "step": 28410 + }, + { + "epoch": 0.25124206580738695, + "grad_norm": 4.0119524002075195, + "learning_rate": 4.581263223654355e-05, + "loss": 0.7303, + "step": 28420 + }, + { + "epoch": 0.25133046906769924, + "grad_norm": 8.777091979980469, + "learning_rate": 4.5811158848871684e-05, + "loss": 0.7814, + "step": 28430 + }, + { + "epoch": 0.25141887232801147, + "grad_norm": 6.044012069702148, + "learning_rate": 4.580968546119981e-05, + "loss": 0.8355, + "step": 28440 + }, + { + "epoch": 0.2515072755883237, + "grad_norm": 4.43526554107666, + "learning_rate": 4.580821207352794e-05, + "loss": 0.8782, + "step": 28450 + }, + { + "epoch": 0.25159567884863593, + "grad_norm": 6.353603363037109, + "learning_rate": 4.580673868585607e-05, + "loss": 0.822, + "step": 28460 + }, + { + "epoch": 0.25168408210894816, + "grad_norm": 7.672333717346191, + "learning_rate": 4.58052652981842e-05, + "loss": 0.7529, + "step": 28470 + }, + { + "epoch": 0.25177248536926045, + "grad_norm": 1.5266413688659668, + "learning_rate": 4.5803791910512326e-05, + "loss": 0.9385, + "step": 28480 + }, + { + "epoch": 0.2518608886295727, + "grad_norm": 2.4057815074920654, + "learning_rate": 4.580231852284046e-05, + "loss": 0.836, + "step": 28490 + }, + { + "epoch": 0.2519492918898849, + "grad_norm": 5.290966510772705, + "learning_rate": 4.580084513516858e-05, + "loss": 0.7352, + "step": 28500 + }, + { + "epoch": 0.25203769515019714, + "grad_norm": 5.318004131317139, + "learning_rate": 4.579937174749672e-05, + "loss": 0.7593, + "step": 28510 + }, + { + "epoch": 0.25212609841050937, + "grad_norm": 10.14842414855957, + "learning_rate": 4.5797898359824846e-05, + "loss": 0.7628, + "step": 28520 + }, + { + "epoch": 0.2522145016708216, + "grad_norm": 8.42928409576416, + "learning_rate": 4.5796424972152975e-05, + "loss": 0.6934, + "step": 28530 + }, + { + "epoch": 0.2523029049311339, + "grad_norm": 5.067338943481445, + "learning_rate": 4.57949515844811e-05, + "loss": 0.72, + "step": 28540 + }, + { + "epoch": 0.2523913081914461, + "grad_norm": 4.782569408416748, + "learning_rate": 4.579347819680924e-05, + "loss": 0.8456, + "step": 28550 + }, + { + "epoch": 0.25247971145175835, + "grad_norm": 2.653374195098877, + "learning_rate": 4.579200480913736e-05, + "loss": 0.8257, + "step": 28560 + }, + { + "epoch": 0.2525681147120706, + "grad_norm": 4.4330854415893555, + "learning_rate": 4.5790531421465495e-05, + "loss": 0.778, + "step": 28570 + }, + { + "epoch": 0.2526565179723828, + "grad_norm": 11.04740047454834, + "learning_rate": 4.5789058033793616e-05, + "loss": 0.7596, + "step": 28580 + }, + { + "epoch": 0.25274492123269504, + "grad_norm": 3.77433705329895, + "learning_rate": 4.578758464612175e-05, + "loss": 0.9691, + "step": 28590 + }, + { + "epoch": 0.2528333244930073, + "grad_norm": 6.705318927764893, + "learning_rate": 4.578611125844988e-05, + "loss": 0.8182, + "step": 28600 + }, + { + "epoch": 0.25292172775331956, + "grad_norm": 5.084736347198486, + "learning_rate": 4.578463787077801e-05, + "loss": 0.8034, + "step": 28610 + }, + { + "epoch": 0.2530101310136318, + "grad_norm": 5.857778549194336, + "learning_rate": 4.5783164483106136e-05, + "loss": 0.5936, + "step": 28620 + }, + { + "epoch": 0.253098534273944, + "grad_norm": 4.9835124015808105, + "learning_rate": 4.578169109543427e-05, + "loss": 0.7461, + "step": 28630 + }, + { + "epoch": 0.25318693753425625, + "grad_norm": 5.676827907562256, + "learning_rate": 4.578021770776239e-05, + "loss": 0.8185, + "step": 28640 + }, + { + "epoch": 0.2532753407945685, + "grad_norm": 4.298700332641602, + "learning_rate": 4.577874432009053e-05, + "loss": 0.8346, + "step": 28650 + }, + { + "epoch": 0.25336374405488077, + "grad_norm": 3.056025981903076, + "learning_rate": 4.577727093241866e-05, + "loss": 0.7172, + "step": 28660 + }, + { + "epoch": 0.253452147315193, + "grad_norm": 4.879296779632568, + "learning_rate": 4.5775797544746785e-05, + "loss": 0.7053, + "step": 28670 + }, + { + "epoch": 0.25354055057550523, + "grad_norm": 6.258749008178711, + "learning_rate": 4.577432415707491e-05, + "loss": 0.8798, + "step": 28680 + }, + { + "epoch": 0.25362895383581746, + "grad_norm": 3.5010361671447754, + "learning_rate": 4.577285076940304e-05, + "loss": 0.8394, + "step": 28690 + }, + { + "epoch": 0.2537173570961297, + "grad_norm": 5.559300899505615, + "learning_rate": 4.577137738173117e-05, + "loss": 0.7309, + "step": 28700 + }, + { + "epoch": 0.2538057603564419, + "grad_norm": 2.909944534301758, + "learning_rate": 4.5769903994059305e-05, + "loss": 0.6919, + "step": 28710 + }, + { + "epoch": 0.2538941636167542, + "grad_norm": 8.954793930053711, + "learning_rate": 4.5768430606387434e-05, + "loss": 0.8466, + "step": 28720 + }, + { + "epoch": 0.25398256687706644, + "grad_norm": 3.158184289932251, + "learning_rate": 4.576695721871556e-05, + "loss": 0.7055, + "step": 28730 + }, + { + "epoch": 0.25407097013737867, + "grad_norm": 1.5547363758087158, + "learning_rate": 4.576548383104369e-05, + "loss": 0.8503, + "step": 28740 + }, + { + "epoch": 0.2541593733976909, + "grad_norm": 11.266592025756836, + "learning_rate": 4.576401044337182e-05, + "loss": 0.6418, + "step": 28750 + }, + { + "epoch": 0.25424777665800313, + "grad_norm": 4.114709377288818, + "learning_rate": 4.576253705569995e-05, + "loss": 0.8064, + "step": 28760 + }, + { + "epoch": 0.25433617991831536, + "grad_norm": 1.7465564012527466, + "learning_rate": 4.576106366802808e-05, + "loss": 0.8276, + "step": 28770 + }, + { + "epoch": 0.25442458317862765, + "grad_norm": 5.222322463989258, + "learning_rate": 4.575959028035621e-05, + "loss": 0.7708, + "step": 28780 + }, + { + "epoch": 0.2545129864389399, + "grad_norm": 2.0481183528900146, + "learning_rate": 4.575811689268434e-05, + "loss": 0.7236, + "step": 28790 + }, + { + "epoch": 0.2546013896992521, + "grad_norm": 1.1635178327560425, + "learning_rate": 4.575664350501247e-05, + "loss": 0.6725, + "step": 28800 + }, + { + "epoch": 0.25468979295956434, + "grad_norm": 2.7443790435791016, + "learning_rate": 4.5755170117340596e-05, + "loss": 0.7547, + "step": 28810 + }, + { + "epoch": 0.2547781962198766, + "grad_norm": 5.969183444976807, + "learning_rate": 4.5753696729668724e-05, + "loss": 0.8911, + "step": 28820 + }, + { + "epoch": 0.2548665994801888, + "grad_norm": 5.39157772064209, + "learning_rate": 4.575222334199685e-05, + "loss": 0.8314, + "step": 28830 + }, + { + "epoch": 0.2549550027405011, + "grad_norm": 5.395505428314209, + "learning_rate": 4.575074995432499e-05, + "loss": 0.8861, + "step": 28840 + }, + { + "epoch": 0.2550434060008133, + "grad_norm": 12.572649002075195, + "learning_rate": 4.5749276566653116e-05, + "loss": 0.8176, + "step": 28850 + }, + { + "epoch": 0.25513180926112555, + "grad_norm": 3.854978561401367, + "learning_rate": 4.5747803178981244e-05, + "loss": 0.6201, + "step": 28860 + }, + { + "epoch": 0.2552202125214378, + "grad_norm": 3.417965888977051, + "learning_rate": 4.574632979130937e-05, + "loss": 0.6962, + "step": 28870 + }, + { + "epoch": 0.25530861578175, + "grad_norm": 4.227417945861816, + "learning_rate": 4.57448564036375e-05, + "loss": 0.7025, + "step": 28880 + }, + { + "epoch": 0.25539701904206225, + "grad_norm": 6.945284366607666, + "learning_rate": 4.574338301596563e-05, + "loss": 0.6971, + "step": 28890 + }, + { + "epoch": 0.25548542230237453, + "grad_norm": 6.564403533935547, + "learning_rate": 4.5741909628293764e-05, + "loss": 0.836, + "step": 28900 + }, + { + "epoch": 0.25557382556268676, + "grad_norm": 4.663499355316162, + "learning_rate": 4.574043624062189e-05, + "loss": 0.8684, + "step": 28910 + }, + { + "epoch": 0.255662228822999, + "grad_norm": 4.455374240875244, + "learning_rate": 4.573896285295002e-05, + "loss": 0.8096, + "step": 28920 + }, + { + "epoch": 0.2557506320833112, + "grad_norm": 2.70426869392395, + "learning_rate": 4.573748946527815e-05, + "loss": 0.6601, + "step": 28930 + }, + { + "epoch": 0.25583903534362346, + "grad_norm": 2.442452907562256, + "learning_rate": 4.573601607760628e-05, + "loss": 0.7376, + "step": 28940 + }, + { + "epoch": 0.2559274386039357, + "grad_norm": 3.1411333084106445, + "learning_rate": 4.5734542689934406e-05, + "loss": 0.6878, + "step": 28950 + }, + { + "epoch": 0.256015841864248, + "grad_norm": 4.382972240447998, + "learning_rate": 4.573306930226254e-05, + "loss": 0.878, + "step": 28960 + }, + { + "epoch": 0.2561042451245602, + "grad_norm": 3.5169060230255127, + "learning_rate": 4.573159591459066e-05, + "loss": 0.7822, + "step": 28970 + }, + { + "epoch": 0.25619264838487243, + "grad_norm": 9.13555908203125, + "learning_rate": 4.57301225269188e-05, + "loss": 0.7743, + "step": 28980 + }, + { + "epoch": 0.25628105164518467, + "grad_norm": 5.3839898109436035, + "learning_rate": 4.5728649139246926e-05, + "loss": 0.7427, + "step": 28990 + }, + { + "epoch": 0.2563694549054969, + "grad_norm": 3.092209577560425, + "learning_rate": 4.5727175751575055e-05, + "loss": 0.7135, + "step": 29000 + }, + { + "epoch": 0.2564578581658092, + "grad_norm": 7.3641486167907715, + "learning_rate": 4.572570236390318e-05, + "loss": 0.8581, + "step": 29010 + }, + { + "epoch": 0.2565462614261214, + "grad_norm": 4.167380332946777, + "learning_rate": 4.572422897623132e-05, + "loss": 0.7557, + "step": 29020 + }, + { + "epoch": 0.25663466468643364, + "grad_norm": 2.1778273582458496, + "learning_rate": 4.572275558855944e-05, + "loss": 0.846, + "step": 29030 + }, + { + "epoch": 0.2567230679467459, + "grad_norm": 8.991447448730469, + "learning_rate": 4.5721282200887575e-05, + "loss": 0.6931, + "step": 29040 + }, + { + "epoch": 0.2568114712070581, + "grad_norm": 1.7608896493911743, + "learning_rate": 4.5719808813215696e-05, + "loss": 0.8228, + "step": 29050 + }, + { + "epoch": 0.25689987446737034, + "grad_norm": 4.679460525512695, + "learning_rate": 4.571833542554383e-05, + "loss": 0.6683, + "step": 29060 + }, + { + "epoch": 0.2569882777276826, + "grad_norm": 11.739288330078125, + "learning_rate": 4.571686203787196e-05, + "loss": 0.7965, + "step": 29070 + }, + { + "epoch": 0.25707668098799485, + "grad_norm": 7.693981170654297, + "learning_rate": 4.571538865020009e-05, + "loss": 0.6544, + "step": 29080 + }, + { + "epoch": 0.2571650842483071, + "grad_norm": 4.8238301277160645, + "learning_rate": 4.5713915262528217e-05, + "loss": 0.7453, + "step": 29090 + }, + { + "epoch": 0.2572534875086193, + "grad_norm": 4.031973361968994, + "learning_rate": 4.571244187485635e-05, + "loss": 0.7022, + "step": 29100 + }, + { + "epoch": 0.25734189076893155, + "grad_norm": 13.379624366760254, + "learning_rate": 4.571096848718447e-05, + "loss": 0.7846, + "step": 29110 + }, + { + "epoch": 0.2574302940292438, + "grad_norm": 11.782503128051758, + "learning_rate": 4.570949509951261e-05, + "loss": 0.8556, + "step": 29120 + }, + { + "epoch": 0.25751869728955606, + "grad_norm": 13.237866401672363, + "learning_rate": 4.570802171184074e-05, + "loss": 0.8431, + "step": 29130 + }, + { + "epoch": 0.2576071005498683, + "grad_norm": 3.6365902423858643, + "learning_rate": 4.5706548324168865e-05, + "loss": 0.7829, + "step": 29140 + }, + { + "epoch": 0.2576955038101805, + "grad_norm": 3.789350748062134, + "learning_rate": 4.5705074936496993e-05, + "loss": 0.8309, + "step": 29150 + }, + { + "epoch": 0.25778390707049276, + "grad_norm": 3.392885208129883, + "learning_rate": 4.570360154882512e-05, + "loss": 0.7413, + "step": 29160 + }, + { + "epoch": 0.257872310330805, + "grad_norm": 4.8836750984191895, + "learning_rate": 4.570212816115325e-05, + "loss": 0.7744, + "step": 29170 + }, + { + "epoch": 0.2579607135911172, + "grad_norm": 2.781611204147339, + "learning_rate": 4.5700654773481385e-05, + "loss": 0.7637, + "step": 29180 + }, + { + "epoch": 0.2580491168514295, + "grad_norm": 6.791811943054199, + "learning_rate": 4.569918138580951e-05, + "loss": 0.6883, + "step": 29190 + }, + { + "epoch": 0.25813752011174174, + "grad_norm": 4.718374729156494, + "learning_rate": 4.569770799813764e-05, + "loss": 0.7378, + "step": 29200 + }, + { + "epoch": 0.25822592337205397, + "grad_norm": 3.223762273788452, + "learning_rate": 4.569623461046577e-05, + "loss": 0.8206, + "step": 29210 + }, + { + "epoch": 0.2583143266323662, + "grad_norm": 4.6873650550842285, + "learning_rate": 4.56947612227939e-05, + "loss": 0.8102, + "step": 29220 + }, + { + "epoch": 0.25840272989267843, + "grad_norm": 7.738824367523193, + "learning_rate": 4.569328783512203e-05, + "loss": 0.8461, + "step": 29230 + }, + { + "epoch": 0.25849113315299066, + "grad_norm": 7.177900314331055, + "learning_rate": 4.569181444745016e-05, + "loss": 0.6179, + "step": 29240 + }, + { + "epoch": 0.25857953641330295, + "grad_norm": 4.860195159912109, + "learning_rate": 4.5690341059778284e-05, + "loss": 0.7505, + "step": 29250 + }, + { + "epoch": 0.2586679396736152, + "grad_norm": 9.726078033447266, + "learning_rate": 4.568886767210642e-05, + "loss": 0.6895, + "step": 29260 + }, + { + "epoch": 0.2587563429339274, + "grad_norm": 9.46513557434082, + "learning_rate": 4.568739428443455e-05, + "loss": 0.8684, + "step": 29270 + }, + { + "epoch": 0.25884474619423964, + "grad_norm": 1.5469259023666382, + "learning_rate": 4.5685920896762676e-05, + "loss": 0.7257, + "step": 29280 + }, + { + "epoch": 0.25893314945455187, + "grad_norm": 5.646541595458984, + "learning_rate": 4.5684447509090804e-05, + "loss": 0.7053, + "step": 29290 + }, + { + "epoch": 0.2590215527148641, + "grad_norm": 4.826288223266602, + "learning_rate": 4.568297412141893e-05, + "loss": 0.7533, + "step": 29300 + }, + { + "epoch": 0.2591099559751764, + "grad_norm": 5.6624040603637695, + "learning_rate": 4.568150073374706e-05, + "loss": 0.6851, + "step": 29310 + }, + { + "epoch": 0.2591983592354886, + "grad_norm": 5.440062046051025, + "learning_rate": 4.5680027346075196e-05, + "loss": 0.8107, + "step": 29320 + }, + { + "epoch": 0.25928676249580085, + "grad_norm": 2.6743078231811523, + "learning_rate": 4.567855395840332e-05, + "loss": 0.6605, + "step": 29330 + }, + { + "epoch": 0.2593751657561131, + "grad_norm": 2.3574798107147217, + "learning_rate": 4.567708057073145e-05, + "loss": 0.721, + "step": 29340 + }, + { + "epoch": 0.2594635690164253, + "grad_norm": 3.5360846519470215, + "learning_rate": 4.567560718305958e-05, + "loss": 0.7052, + "step": 29350 + }, + { + "epoch": 0.25955197227673754, + "grad_norm": 2.8996875286102295, + "learning_rate": 4.567413379538771e-05, + "loss": 0.6752, + "step": 29360 + }, + { + "epoch": 0.25964037553704983, + "grad_norm": 8.65214729309082, + "learning_rate": 4.567266040771584e-05, + "loss": 0.7918, + "step": 29370 + }, + { + "epoch": 0.25972877879736206, + "grad_norm": 8.966941833496094, + "learning_rate": 4.567118702004397e-05, + "loss": 0.7335, + "step": 29380 + }, + { + "epoch": 0.2598171820576743, + "grad_norm": 2.327826976776123, + "learning_rate": 4.5669713632372094e-05, + "loss": 0.7333, + "step": 29390 + }, + { + "epoch": 0.2599055853179865, + "grad_norm": 8.276576042175293, + "learning_rate": 4.566824024470023e-05, + "loss": 0.8879, + "step": 29400 + }, + { + "epoch": 0.25999398857829875, + "grad_norm": 2.417679786682129, + "learning_rate": 4.566676685702835e-05, + "loss": 0.7675, + "step": 29410 + }, + { + "epoch": 0.260082391838611, + "grad_norm": 6.383553981781006, + "learning_rate": 4.5665293469356486e-05, + "loss": 0.7119, + "step": 29420 + }, + { + "epoch": 0.26017079509892327, + "grad_norm": 5.931835651397705, + "learning_rate": 4.5663820081684614e-05, + "loss": 0.8431, + "step": 29430 + }, + { + "epoch": 0.2602591983592355, + "grad_norm": 11.034318923950195, + "learning_rate": 4.566234669401274e-05, + "loss": 0.7607, + "step": 29440 + }, + { + "epoch": 0.26034760161954773, + "grad_norm": 4.3858513832092285, + "learning_rate": 4.566087330634087e-05, + "loss": 0.8781, + "step": 29450 + }, + { + "epoch": 0.26043600487985996, + "grad_norm": 3.791039228439331, + "learning_rate": 4.5659399918669006e-05, + "loss": 0.7861, + "step": 29460 + }, + { + "epoch": 0.2605244081401722, + "grad_norm": 1.4404274225234985, + "learning_rate": 4.565792653099713e-05, + "loss": 0.8871, + "step": 29470 + }, + { + "epoch": 0.2606128114004844, + "grad_norm": 3.2746682167053223, + "learning_rate": 4.565645314332526e-05, + "loss": 0.7398, + "step": 29480 + }, + { + "epoch": 0.2607012146607967, + "grad_norm": 3.9915900230407715, + "learning_rate": 4.565497975565339e-05, + "loss": 0.7003, + "step": 29490 + }, + { + "epoch": 0.26078961792110894, + "grad_norm": 6.544107913970947, + "learning_rate": 4.565350636798152e-05, + "loss": 0.8379, + "step": 29500 + }, + { + "epoch": 0.26087802118142117, + "grad_norm": 8.069681167602539, + "learning_rate": 4.565203298030965e-05, + "loss": 0.7272, + "step": 29510 + }, + { + "epoch": 0.2609664244417334, + "grad_norm": 5.080019474029541, + "learning_rate": 4.5650559592637776e-05, + "loss": 0.6892, + "step": 29520 + }, + { + "epoch": 0.26105482770204563, + "grad_norm": 4.58833122253418, + "learning_rate": 4.5649086204965905e-05, + "loss": 0.6827, + "step": 29530 + }, + { + "epoch": 0.26114323096235786, + "grad_norm": 2.668739080429077, + "learning_rate": 4.564761281729404e-05, + "loss": 0.7702, + "step": 29540 + }, + { + "epoch": 0.26123163422267015, + "grad_norm": 3.2595272064208984, + "learning_rate": 4.564613942962216e-05, + "loss": 0.7984, + "step": 29550 + }, + { + "epoch": 0.2613200374829824, + "grad_norm": 4.894524574279785, + "learning_rate": 4.56446660419503e-05, + "loss": 0.7828, + "step": 29560 + }, + { + "epoch": 0.2614084407432946, + "grad_norm": 7.50266170501709, + "learning_rate": 4.5643192654278425e-05, + "loss": 0.7309, + "step": 29570 + }, + { + "epoch": 0.26149684400360684, + "grad_norm": 10.72905158996582, + "learning_rate": 4.564171926660655e-05, + "loss": 0.8975, + "step": 29580 + }, + { + "epoch": 0.2615852472639191, + "grad_norm": 2.84784197807312, + "learning_rate": 4.564024587893468e-05, + "loss": 0.7273, + "step": 29590 + }, + { + "epoch": 0.26167365052423136, + "grad_norm": 4.118752479553223, + "learning_rate": 4.563877249126282e-05, + "loss": 0.7445, + "step": 29600 + }, + { + "epoch": 0.2617620537845436, + "grad_norm": 4.063949108123779, + "learning_rate": 4.563729910359094e-05, + "loss": 0.767, + "step": 29610 + }, + { + "epoch": 0.2618504570448558, + "grad_norm": 3.4891610145568848, + "learning_rate": 4.5635825715919074e-05, + "loss": 0.8095, + "step": 29620 + }, + { + "epoch": 0.26193886030516805, + "grad_norm": 3.2074899673461914, + "learning_rate": 4.56343523282472e-05, + "loss": 0.8034, + "step": 29630 + }, + { + "epoch": 0.2620272635654803, + "grad_norm": 3.521451950073242, + "learning_rate": 4.563287894057533e-05, + "loss": 0.7601, + "step": 29640 + }, + { + "epoch": 0.2621156668257925, + "grad_norm": 3.6601784229278564, + "learning_rate": 4.563140555290346e-05, + "loss": 0.8181, + "step": 29650 + }, + { + "epoch": 0.2622040700861048, + "grad_norm": 5.0407843589782715, + "learning_rate": 4.562993216523159e-05, + "loss": 0.6479, + "step": 29660 + }, + { + "epoch": 0.26229247334641703, + "grad_norm": 3.6762752532958984, + "learning_rate": 4.5628458777559715e-05, + "loss": 0.6781, + "step": 29670 + }, + { + "epoch": 0.26238087660672926, + "grad_norm": 4.478204727172852, + "learning_rate": 4.562698538988785e-05, + "loss": 0.7519, + "step": 29680 + }, + { + "epoch": 0.2624692798670415, + "grad_norm": 6.099710464477539, + "learning_rate": 4.562551200221598e-05, + "loss": 0.804, + "step": 29690 + }, + { + "epoch": 0.2625576831273537, + "grad_norm": 3.5862505435943604, + "learning_rate": 4.562403861454411e-05, + "loss": 0.8954, + "step": 29700 + }, + { + "epoch": 0.26264608638766596, + "grad_norm": 3.853513717651367, + "learning_rate": 4.5622565226872235e-05, + "loss": 0.7243, + "step": 29710 + }, + { + "epoch": 0.26273448964797824, + "grad_norm": 4.7881317138671875, + "learning_rate": 4.5621091839200364e-05, + "loss": 0.868, + "step": 29720 + }, + { + "epoch": 0.2628228929082905, + "grad_norm": 9.800519943237305, + "learning_rate": 4.561961845152849e-05, + "loss": 0.7831, + "step": 29730 + }, + { + "epoch": 0.2629112961686027, + "grad_norm": 2.545400857925415, + "learning_rate": 4.561814506385663e-05, + "loss": 0.7556, + "step": 29740 + }, + { + "epoch": 0.26299969942891493, + "grad_norm": 3.167473316192627, + "learning_rate": 4.5616671676184756e-05, + "loss": 0.7714, + "step": 29750 + }, + { + "epoch": 0.26308810268922717, + "grad_norm": 5.078524589538574, + "learning_rate": 4.5615198288512884e-05, + "loss": 0.8415, + "step": 29760 + }, + { + "epoch": 0.2631765059495394, + "grad_norm": 1.960549235343933, + "learning_rate": 4.561372490084101e-05, + "loss": 0.8475, + "step": 29770 + }, + { + "epoch": 0.2632649092098517, + "grad_norm": 5.510914325714111, + "learning_rate": 4.561225151316914e-05, + "loss": 0.8689, + "step": 29780 + }, + { + "epoch": 0.2633533124701639, + "grad_norm": 9.019115447998047, + "learning_rate": 4.561077812549727e-05, + "loss": 0.6989, + "step": 29790 + }, + { + "epoch": 0.26344171573047614, + "grad_norm": 1.9665924310684204, + "learning_rate": 4.56093047378254e-05, + "loss": 0.8381, + "step": 29800 + }, + { + "epoch": 0.2635301189907884, + "grad_norm": 4.336777687072754, + "learning_rate": 4.560783135015353e-05, + "loss": 0.7906, + "step": 29810 + }, + { + "epoch": 0.2636185222511006, + "grad_norm": 4.672365665435791, + "learning_rate": 4.560635796248166e-05, + "loss": 0.7862, + "step": 29820 + }, + { + "epoch": 0.26370692551141284, + "grad_norm": 5.014092445373535, + "learning_rate": 4.560488457480979e-05, + "loss": 0.8526, + "step": 29830 + }, + { + "epoch": 0.2637953287717251, + "grad_norm": 4.661694049835205, + "learning_rate": 4.560341118713792e-05, + "loss": 0.7181, + "step": 29840 + }, + { + "epoch": 0.26388373203203735, + "grad_norm": 1.9793035984039307, + "learning_rate": 4.5601937799466046e-05, + "loss": 0.6739, + "step": 29850 + }, + { + "epoch": 0.2639721352923496, + "grad_norm": 7.034848213195801, + "learning_rate": 4.5600464411794174e-05, + "loss": 0.8302, + "step": 29860 + }, + { + "epoch": 0.2640605385526618, + "grad_norm": 7.229719161987305, + "learning_rate": 4.559899102412231e-05, + "loss": 0.6392, + "step": 29870 + }, + { + "epoch": 0.26414894181297405, + "grad_norm": 12.791730880737305, + "learning_rate": 4.559751763645043e-05, + "loss": 0.6703, + "step": 29880 + }, + { + "epoch": 0.2642373450732863, + "grad_norm": 4.993589878082275, + "learning_rate": 4.5596044248778566e-05, + "loss": 0.8488, + "step": 29890 + }, + { + "epoch": 0.26432574833359856, + "grad_norm": 2.2029237747192383, + "learning_rate": 4.5594570861106695e-05, + "loss": 0.7983, + "step": 29900 + }, + { + "epoch": 0.2644141515939108, + "grad_norm": 1.6999568939208984, + "learning_rate": 4.559309747343482e-05, + "loss": 0.7184, + "step": 29910 + }, + { + "epoch": 0.264502554854223, + "grad_norm": 6.056368350982666, + "learning_rate": 4.559162408576295e-05, + "loss": 0.6905, + "step": 29920 + }, + { + "epoch": 0.26459095811453526, + "grad_norm": 4.264249801635742, + "learning_rate": 4.5590150698091086e-05, + "loss": 0.7324, + "step": 29930 + }, + { + "epoch": 0.2646793613748475, + "grad_norm": 3.543482780456543, + "learning_rate": 4.558867731041921e-05, + "loss": 0.8234, + "step": 29940 + }, + { + "epoch": 0.2647677646351597, + "grad_norm": 3.5544373989105225, + "learning_rate": 4.558720392274734e-05, + "loss": 0.5627, + "step": 29950 + }, + { + "epoch": 0.264856167895472, + "grad_norm": 5.946412086486816, + "learning_rate": 4.558573053507547e-05, + "loss": 0.7565, + "step": 29960 + }, + { + "epoch": 0.26494457115578424, + "grad_norm": 6.860233783721924, + "learning_rate": 4.55842571474036e-05, + "loss": 0.7988, + "step": 29970 + }, + { + "epoch": 0.26503297441609647, + "grad_norm": 5.926501274108887, + "learning_rate": 4.558278375973173e-05, + "loss": 0.8694, + "step": 29980 + }, + { + "epoch": 0.2651213776764087, + "grad_norm": 2.451258659362793, + "learning_rate": 4.5581310372059856e-05, + "loss": 0.7034, + "step": 29990 + }, + { + "epoch": 0.26520978093672093, + "grad_norm": 4.44228458404541, + "learning_rate": 4.5579836984387985e-05, + "loss": 0.7657, + "step": 30000 + }, + { + "epoch": 0.26529818419703316, + "grad_norm": 6.117458343505859, + "learning_rate": 4.557836359671612e-05, + "loss": 0.7869, + "step": 30010 + }, + { + "epoch": 0.26538658745734545, + "grad_norm": 6.146592140197754, + "learning_rate": 4.557689020904424e-05, + "loss": 0.6836, + "step": 30020 + }, + { + "epoch": 0.2654749907176577, + "grad_norm": 2.639390468597412, + "learning_rate": 4.557541682137238e-05, + "loss": 0.7957, + "step": 30030 + }, + { + "epoch": 0.2655633939779699, + "grad_norm": 2.850609302520752, + "learning_rate": 4.5573943433700505e-05, + "loss": 0.7955, + "step": 30040 + }, + { + "epoch": 0.26565179723828214, + "grad_norm": 2.463507890701294, + "learning_rate": 4.5572470046028633e-05, + "loss": 0.6879, + "step": 30050 + }, + { + "epoch": 0.26574020049859437, + "grad_norm": 2.183257579803467, + "learning_rate": 4.557099665835676e-05, + "loss": 0.6821, + "step": 30060 + }, + { + "epoch": 0.2658286037589066, + "grad_norm": 6.851439476013184, + "learning_rate": 4.55695232706849e-05, + "loss": 0.8008, + "step": 30070 + }, + { + "epoch": 0.2659170070192189, + "grad_norm": 0.9116033911705017, + "learning_rate": 4.556804988301302e-05, + "loss": 0.672, + "step": 30080 + }, + { + "epoch": 0.2660054102795311, + "grad_norm": 2.7948808670043945, + "learning_rate": 4.5566576495341154e-05, + "loss": 0.6747, + "step": 30090 + }, + { + "epoch": 0.26609381353984335, + "grad_norm": 5.142726898193359, + "learning_rate": 4.5565103107669275e-05, + "loss": 0.7999, + "step": 30100 + }, + { + "epoch": 0.2661822168001556, + "grad_norm": 2.4580373764038086, + "learning_rate": 4.556362971999741e-05, + "loss": 0.8119, + "step": 30110 + }, + { + "epoch": 0.2662706200604678, + "grad_norm": 8.661834716796875, + "learning_rate": 4.556215633232554e-05, + "loss": 0.8391, + "step": 30120 + }, + { + "epoch": 0.2663590233207801, + "grad_norm": 4.809054851531982, + "learning_rate": 4.556068294465367e-05, + "loss": 0.6991, + "step": 30130 + }, + { + "epoch": 0.26644742658109233, + "grad_norm": 5.437647342681885, + "learning_rate": 4.5559209556981795e-05, + "loss": 0.7262, + "step": 30140 + }, + { + "epoch": 0.26653582984140456, + "grad_norm": 2.0850820541381836, + "learning_rate": 4.555773616930993e-05, + "loss": 0.7492, + "step": 30150 + }, + { + "epoch": 0.2666242331017168, + "grad_norm": 4.553308010101318, + "learning_rate": 4.555626278163805e-05, + "loss": 0.6981, + "step": 30160 + }, + { + "epoch": 0.266712636362029, + "grad_norm": 2.7255380153656006, + "learning_rate": 4.555478939396619e-05, + "loss": 0.7656, + "step": 30170 + }, + { + "epoch": 0.26680103962234125, + "grad_norm": 18.0553035736084, + "learning_rate": 4.5553316006294316e-05, + "loss": 0.8432, + "step": 30180 + }, + { + "epoch": 0.26688944288265354, + "grad_norm": 3.5528764724731445, + "learning_rate": 4.5551842618622444e-05, + "loss": 0.9037, + "step": 30190 + }, + { + "epoch": 0.26697784614296577, + "grad_norm": 3.4594099521636963, + "learning_rate": 4.555036923095057e-05, + "loss": 0.6194, + "step": 30200 + }, + { + "epoch": 0.267066249403278, + "grad_norm": 4.578429698944092, + "learning_rate": 4.554889584327871e-05, + "loss": 0.7139, + "step": 30210 + }, + { + "epoch": 0.26715465266359023, + "grad_norm": 3.477224111557007, + "learning_rate": 4.554742245560683e-05, + "loss": 0.7869, + "step": 30220 + }, + { + "epoch": 0.26724305592390246, + "grad_norm": 2.6681270599365234, + "learning_rate": 4.5545949067934964e-05, + "loss": 0.8341, + "step": 30230 + }, + { + "epoch": 0.2673314591842147, + "grad_norm": 2.92987322807312, + "learning_rate": 4.5544475680263086e-05, + "loss": 0.8675, + "step": 30240 + }, + { + "epoch": 0.267419862444527, + "grad_norm": 4.547966957092285, + "learning_rate": 4.554300229259122e-05, + "loss": 0.6654, + "step": 30250 + }, + { + "epoch": 0.2675082657048392, + "grad_norm": 5.176558971405029, + "learning_rate": 4.554152890491935e-05, + "loss": 0.8412, + "step": 30260 + }, + { + "epoch": 0.26759666896515144, + "grad_norm": 8.517532348632812, + "learning_rate": 4.554005551724748e-05, + "loss": 0.7317, + "step": 30270 + }, + { + "epoch": 0.26768507222546367, + "grad_norm": 4.544113636016846, + "learning_rate": 4.5538582129575606e-05, + "loss": 0.6362, + "step": 30280 + }, + { + "epoch": 0.2677734754857759, + "grad_norm": 6.943235874176025, + "learning_rate": 4.553710874190374e-05, + "loss": 0.7622, + "step": 30290 + }, + { + "epoch": 0.26786187874608813, + "grad_norm": 5.016669273376465, + "learning_rate": 4.553563535423186e-05, + "loss": 0.6729, + "step": 30300 + }, + { + "epoch": 0.2679502820064004, + "grad_norm": 6.666787147521973, + "learning_rate": 4.553416196656e-05, + "loss": 0.7929, + "step": 30310 + }, + { + "epoch": 0.26803868526671265, + "grad_norm": 7.704743385314941, + "learning_rate": 4.5532688578888126e-05, + "loss": 0.7398, + "step": 30320 + }, + { + "epoch": 0.2681270885270249, + "grad_norm": 2.8809094429016113, + "learning_rate": 4.5531215191216254e-05, + "loss": 0.6199, + "step": 30330 + }, + { + "epoch": 0.2682154917873371, + "grad_norm": 5.0369367599487305, + "learning_rate": 4.552974180354438e-05, + "loss": 0.8144, + "step": 30340 + }, + { + "epoch": 0.26830389504764934, + "grad_norm": 5.744924068450928, + "learning_rate": 4.552826841587251e-05, + "loss": 0.8257, + "step": 30350 + }, + { + "epoch": 0.2683922983079616, + "grad_norm": 6.447247505187988, + "learning_rate": 4.552679502820064e-05, + "loss": 0.737, + "step": 30360 + }, + { + "epoch": 0.26848070156827386, + "grad_norm": 5.7486467361450195, + "learning_rate": 4.5525321640528775e-05, + "loss": 0.7288, + "step": 30370 + }, + { + "epoch": 0.2685691048285861, + "grad_norm": 2.520045518875122, + "learning_rate": 4.5523848252856896e-05, + "loss": 0.8139, + "step": 30380 + }, + { + "epoch": 0.2686575080888983, + "grad_norm": 2.754589796066284, + "learning_rate": 4.552237486518503e-05, + "loss": 0.7589, + "step": 30390 + }, + { + "epoch": 0.26874591134921055, + "grad_norm": 3.138747453689575, + "learning_rate": 4.552090147751316e-05, + "loss": 0.7891, + "step": 30400 + }, + { + "epoch": 0.2688343146095228, + "grad_norm": 2.6308279037475586, + "learning_rate": 4.551942808984129e-05, + "loss": 0.8124, + "step": 30410 + }, + { + "epoch": 0.268922717869835, + "grad_norm": 8.300191879272461, + "learning_rate": 4.5517954702169416e-05, + "loss": 0.7729, + "step": 30420 + }, + { + "epoch": 0.2690111211301473, + "grad_norm": 3.9517529010772705, + "learning_rate": 4.551648131449755e-05, + "loss": 0.802, + "step": 30430 + }, + { + "epoch": 0.26909952439045953, + "grad_norm": 4.681349754333496, + "learning_rate": 4.551500792682567e-05, + "loss": 0.6749, + "step": 30440 + }, + { + "epoch": 0.26918792765077176, + "grad_norm": 6.55589485168457, + "learning_rate": 4.551353453915381e-05, + "loss": 0.7256, + "step": 30450 + }, + { + "epoch": 0.269276330911084, + "grad_norm": 7.0311360359191895, + "learning_rate": 4.551206115148193e-05, + "loss": 0.7756, + "step": 30460 + }, + { + "epoch": 0.2693647341713962, + "grad_norm": 2.4952967166900635, + "learning_rate": 4.5510587763810065e-05, + "loss": 0.6642, + "step": 30470 + }, + { + "epoch": 0.26945313743170846, + "grad_norm": 7.936524391174316, + "learning_rate": 4.550911437613819e-05, + "loss": 0.7063, + "step": 30480 + }, + { + "epoch": 0.26954154069202074, + "grad_norm": 4.251359462738037, + "learning_rate": 4.550764098846632e-05, + "loss": 0.8426, + "step": 30490 + }, + { + "epoch": 0.269629943952333, + "grad_norm": 3.33892822265625, + "learning_rate": 4.550616760079445e-05, + "loss": 0.7088, + "step": 30500 + }, + { + "epoch": 0.2697183472126452, + "grad_norm": 7.408570766448975, + "learning_rate": 4.5504694213122585e-05, + "loss": 0.6597, + "step": 30510 + }, + { + "epoch": 0.26980675047295744, + "grad_norm": 5.651181221008301, + "learning_rate": 4.550322082545071e-05, + "loss": 0.8075, + "step": 30520 + }, + { + "epoch": 0.26989515373326967, + "grad_norm": 7.12681770324707, + "learning_rate": 4.550174743777884e-05, + "loss": 0.6747, + "step": 30530 + }, + { + "epoch": 0.2699835569935819, + "grad_norm": 6.387990474700928, + "learning_rate": 4.550027405010697e-05, + "loss": 0.7736, + "step": 30540 + }, + { + "epoch": 0.2700719602538942, + "grad_norm": 3.5151522159576416, + "learning_rate": 4.54988006624351e-05, + "loss": 0.9457, + "step": 30550 + }, + { + "epoch": 0.2701603635142064, + "grad_norm": 2.4693853855133057, + "learning_rate": 4.549732727476323e-05, + "loss": 0.6732, + "step": 30560 + }, + { + "epoch": 0.27024876677451865, + "grad_norm": 6.303086280822754, + "learning_rate": 4.549585388709136e-05, + "loss": 0.6541, + "step": 30570 + }, + { + "epoch": 0.2703371700348309, + "grad_norm": 4.603034973144531, + "learning_rate": 4.5494380499419484e-05, + "loss": 0.7209, + "step": 30580 + }, + { + "epoch": 0.2704255732951431, + "grad_norm": 3.458364486694336, + "learning_rate": 4.549290711174762e-05, + "loss": 0.7822, + "step": 30590 + }, + { + "epoch": 0.27051397655545534, + "grad_norm": 1.8052581548690796, + "learning_rate": 4.549143372407575e-05, + "loss": 0.7178, + "step": 30600 + }, + { + "epoch": 0.2706023798157676, + "grad_norm": 4.1473541259765625, + "learning_rate": 4.5489960336403875e-05, + "loss": 0.7764, + "step": 30610 + }, + { + "epoch": 0.27069078307607986, + "grad_norm": 7.287553310394287, + "learning_rate": 4.5488486948732004e-05, + "loss": 0.7904, + "step": 30620 + }, + { + "epoch": 0.2707791863363921, + "grad_norm": 10.020027160644531, + "learning_rate": 4.548701356106013e-05, + "loss": 0.8004, + "step": 30630 + }, + { + "epoch": 0.2708675895967043, + "grad_norm": 3.4412293434143066, + "learning_rate": 4.548554017338826e-05, + "loss": 0.6155, + "step": 30640 + }, + { + "epoch": 0.27095599285701655, + "grad_norm": 13.466927528381348, + "learning_rate": 4.5484066785716396e-05, + "loss": 0.7676, + "step": 30650 + }, + { + "epoch": 0.27104439611732883, + "grad_norm": 3.792027473449707, + "learning_rate": 4.5482593398044524e-05, + "loss": 0.7674, + "step": 30660 + }, + { + "epoch": 0.27113279937764106, + "grad_norm": 2.025473117828369, + "learning_rate": 4.548112001037265e-05, + "loss": 0.6348, + "step": 30670 + }, + { + "epoch": 0.2712212026379533, + "grad_norm": 3.9698288440704346, + "learning_rate": 4.547964662270078e-05, + "loss": 0.742, + "step": 30680 + }, + { + "epoch": 0.2713096058982655, + "grad_norm": 2.8065898418426514, + "learning_rate": 4.547817323502891e-05, + "loss": 0.7711, + "step": 30690 + }, + { + "epoch": 0.27139800915857776, + "grad_norm": 1.3832138776779175, + "learning_rate": 4.547669984735704e-05, + "loss": 0.6708, + "step": 30700 + }, + { + "epoch": 0.27148641241889, + "grad_norm": 4.950160980224609, + "learning_rate": 4.5475226459685166e-05, + "loss": 0.8564, + "step": 30710 + }, + { + "epoch": 0.2715748156792023, + "grad_norm": 5.886739730834961, + "learning_rate": 4.54737530720133e-05, + "loss": 0.7204, + "step": 30720 + }, + { + "epoch": 0.2716632189395145, + "grad_norm": 5.6793317794799805, + "learning_rate": 4.547227968434143e-05, + "loss": 0.6691, + "step": 30730 + }, + { + "epoch": 0.27175162219982674, + "grad_norm": 5.759725570678711, + "learning_rate": 4.547080629666956e-05, + "loss": 0.7842, + "step": 30740 + }, + { + "epoch": 0.27184002546013897, + "grad_norm": 6.173468112945557, + "learning_rate": 4.5469332908997686e-05, + "loss": 0.8394, + "step": 30750 + }, + { + "epoch": 0.2719284287204512, + "grad_norm": 4.296173572540283, + "learning_rate": 4.5467859521325814e-05, + "loss": 0.755, + "step": 30760 + }, + { + "epoch": 0.27201683198076343, + "grad_norm": 2.925757646560669, + "learning_rate": 4.546638613365394e-05, + "loss": 0.7721, + "step": 30770 + }, + { + "epoch": 0.2721052352410757, + "grad_norm": 3.2852859497070312, + "learning_rate": 4.546491274598208e-05, + "loss": 0.9166, + "step": 30780 + }, + { + "epoch": 0.27219363850138795, + "grad_norm": 4.882367134094238, + "learning_rate": 4.5463439358310206e-05, + "loss": 0.8579, + "step": 30790 + }, + { + "epoch": 0.2722820417617002, + "grad_norm": 3.473365306854248, + "learning_rate": 4.5461965970638334e-05, + "loss": 0.8085, + "step": 30800 + }, + { + "epoch": 0.2723704450220124, + "grad_norm": 4.500138282775879, + "learning_rate": 4.546049258296646e-05, + "loss": 0.7087, + "step": 30810 + }, + { + "epoch": 0.27245884828232464, + "grad_norm": 3.8217194080352783, + "learning_rate": 4.545901919529459e-05, + "loss": 0.6978, + "step": 30820 + }, + { + "epoch": 0.27254725154263687, + "grad_norm": 2.913564443588257, + "learning_rate": 4.545754580762272e-05, + "loss": 0.7134, + "step": 30830 + }, + { + "epoch": 0.27263565480294916, + "grad_norm": 5.970975399017334, + "learning_rate": 4.5456072419950855e-05, + "loss": 0.7294, + "step": 30840 + }, + { + "epoch": 0.2727240580632614, + "grad_norm": 4.244891166687012, + "learning_rate": 4.5454599032278976e-05, + "loss": 0.8509, + "step": 30850 + }, + { + "epoch": 0.2728124613235736, + "grad_norm": 3.983084201812744, + "learning_rate": 4.545312564460711e-05, + "loss": 0.8641, + "step": 30860 + }, + { + "epoch": 0.27290086458388585, + "grad_norm": 3.3314168453216553, + "learning_rate": 4.545165225693524e-05, + "loss": 0.7002, + "step": 30870 + }, + { + "epoch": 0.2729892678441981, + "grad_norm": 5.25112771987915, + "learning_rate": 4.545017886926337e-05, + "loss": 0.7936, + "step": 30880 + }, + { + "epoch": 0.2730776711045103, + "grad_norm": 1.5507129430770874, + "learning_rate": 4.5448705481591496e-05, + "loss": 0.6824, + "step": 30890 + }, + { + "epoch": 0.2731660743648226, + "grad_norm": 2.241774320602417, + "learning_rate": 4.544723209391963e-05, + "loss": 0.6614, + "step": 30900 + }, + { + "epoch": 0.27325447762513483, + "grad_norm": 9.836780548095703, + "learning_rate": 4.544575870624775e-05, + "loss": 0.8151, + "step": 30910 + }, + { + "epoch": 0.27334288088544706, + "grad_norm": 1.664473295211792, + "learning_rate": 4.544428531857589e-05, + "loss": 0.7083, + "step": 30920 + }, + { + "epoch": 0.2734312841457593, + "grad_norm": 7.396553993225098, + "learning_rate": 4.544281193090401e-05, + "loss": 0.7315, + "step": 30930 + }, + { + "epoch": 0.2735196874060715, + "grad_norm": 4.085721492767334, + "learning_rate": 4.5441338543232145e-05, + "loss": 0.7369, + "step": 30940 + }, + { + "epoch": 0.27360809066638375, + "grad_norm": 2.1590044498443604, + "learning_rate": 4.543986515556027e-05, + "loss": 0.7037, + "step": 30950 + }, + { + "epoch": 0.27369649392669604, + "grad_norm": 6.069953918457031, + "learning_rate": 4.54383917678884e-05, + "loss": 0.7966, + "step": 30960 + }, + { + "epoch": 0.27378489718700827, + "grad_norm": 7.211731433868408, + "learning_rate": 4.543691838021653e-05, + "loss": 0.7953, + "step": 30970 + }, + { + "epoch": 0.2738733004473205, + "grad_norm": 2.097848415374756, + "learning_rate": 4.5435444992544665e-05, + "loss": 0.6958, + "step": 30980 + }, + { + "epoch": 0.27396170370763273, + "grad_norm": 4.619161605834961, + "learning_rate": 4.543397160487279e-05, + "loss": 0.7856, + "step": 30990 + }, + { + "epoch": 0.27405010696794496, + "grad_norm": 6.556807994842529, + "learning_rate": 4.543249821720092e-05, + "loss": 0.6143, + "step": 31000 + }, + { + "epoch": 0.2741385102282572, + "grad_norm": 3.8834311962127686, + "learning_rate": 4.543102482952905e-05, + "loss": 0.7609, + "step": 31010 + }, + { + "epoch": 0.2742269134885695, + "grad_norm": 6.754446983337402, + "learning_rate": 4.542955144185718e-05, + "loss": 0.8063, + "step": 31020 + }, + { + "epoch": 0.2743153167488817, + "grad_norm": 6.257603645324707, + "learning_rate": 4.542807805418531e-05, + "loss": 0.7922, + "step": 31030 + }, + { + "epoch": 0.27440372000919394, + "grad_norm": 2.162466526031494, + "learning_rate": 4.542660466651344e-05, + "loss": 0.8371, + "step": 31040 + }, + { + "epoch": 0.27449212326950617, + "grad_norm": 3.549175977706909, + "learning_rate": 4.5425131278841564e-05, + "loss": 0.7575, + "step": 31050 + }, + { + "epoch": 0.2745805265298184, + "grad_norm": 6.57182502746582, + "learning_rate": 4.54236578911697e-05, + "loss": 0.7086, + "step": 31060 + }, + { + "epoch": 0.27466892979013063, + "grad_norm": 7.509003162384033, + "learning_rate": 4.542218450349782e-05, + "loss": 0.7143, + "step": 31070 + }, + { + "epoch": 0.2747573330504429, + "grad_norm": 4.139196395874023, + "learning_rate": 4.5420711115825955e-05, + "loss": 0.775, + "step": 31080 + }, + { + "epoch": 0.27484573631075515, + "grad_norm": 4.169070243835449, + "learning_rate": 4.5419237728154084e-05, + "loss": 0.7722, + "step": 31090 + }, + { + "epoch": 0.2749341395710674, + "grad_norm": 5.725850582122803, + "learning_rate": 4.541776434048221e-05, + "loss": 0.7125, + "step": 31100 + }, + { + "epoch": 0.2750225428313796, + "grad_norm": 7.626198768615723, + "learning_rate": 4.541629095281034e-05, + "loss": 0.8489, + "step": 31110 + }, + { + "epoch": 0.27511094609169184, + "grad_norm": 7.418731212615967, + "learning_rate": 4.5414817565138476e-05, + "loss": 0.8092, + "step": 31120 + }, + { + "epoch": 0.2751993493520041, + "grad_norm": 1.7932617664337158, + "learning_rate": 4.54133441774666e-05, + "loss": 0.8192, + "step": 31130 + }, + { + "epoch": 0.27528775261231636, + "grad_norm": 4.300790309906006, + "learning_rate": 4.541187078979473e-05, + "loss": 0.7661, + "step": 31140 + }, + { + "epoch": 0.2753761558726286, + "grad_norm": 5.326014518737793, + "learning_rate": 4.541039740212286e-05, + "loss": 0.6129, + "step": 31150 + }, + { + "epoch": 0.2754645591329408, + "grad_norm": 8.741189956665039, + "learning_rate": 4.540892401445099e-05, + "loss": 0.8287, + "step": 31160 + }, + { + "epoch": 0.27555296239325305, + "grad_norm": 9.096046447753906, + "learning_rate": 4.540745062677912e-05, + "loss": 0.8181, + "step": 31170 + }, + { + "epoch": 0.2756413656535653, + "grad_norm": 5.3577399253845215, + "learning_rate": 4.5405977239107246e-05, + "loss": 0.6864, + "step": 31180 + }, + { + "epoch": 0.27572976891387757, + "grad_norm": 2.724480628967285, + "learning_rate": 4.5404503851435374e-05, + "loss": 0.7777, + "step": 31190 + }, + { + "epoch": 0.2758181721741898, + "grad_norm": 3.459695816040039, + "learning_rate": 4.540303046376351e-05, + "loss": 0.7123, + "step": 31200 + }, + { + "epoch": 0.27590657543450203, + "grad_norm": 6.0657501220703125, + "learning_rate": 4.540155707609163e-05, + "loss": 0.6474, + "step": 31210 + }, + { + "epoch": 0.27599497869481426, + "grad_norm": 3.518301010131836, + "learning_rate": 4.5400083688419766e-05, + "loss": 0.6475, + "step": 31220 + }, + { + "epoch": 0.2760833819551265, + "grad_norm": 6.364334583282471, + "learning_rate": 4.5398610300747894e-05, + "loss": 0.6488, + "step": 31230 + }, + { + "epoch": 0.2761717852154387, + "grad_norm": 7.290544509887695, + "learning_rate": 4.539713691307602e-05, + "loss": 0.8546, + "step": 31240 + }, + { + "epoch": 0.276260188475751, + "grad_norm": 5.463634967803955, + "learning_rate": 4.539566352540415e-05, + "loss": 0.8186, + "step": 31250 + }, + { + "epoch": 0.27634859173606324, + "grad_norm": 7.422460079193115, + "learning_rate": 4.5394190137732286e-05, + "loss": 0.7488, + "step": 31260 + }, + { + "epoch": 0.2764369949963755, + "grad_norm": 4.348067760467529, + "learning_rate": 4.539271675006041e-05, + "loss": 0.7998, + "step": 31270 + }, + { + "epoch": 0.2765253982566877, + "grad_norm": 7.973194122314453, + "learning_rate": 4.539124336238854e-05, + "loss": 0.756, + "step": 31280 + }, + { + "epoch": 0.27661380151699994, + "grad_norm": 10.215353012084961, + "learning_rate": 4.5389769974716664e-05, + "loss": 0.6645, + "step": 31290 + }, + { + "epoch": 0.27670220477731217, + "grad_norm": 4.779819965362549, + "learning_rate": 4.53882965870448e-05, + "loss": 0.748, + "step": 31300 + }, + { + "epoch": 0.27679060803762445, + "grad_norm": 10.566243171691895, + "learning_rate": 4.538682319937293e-05, + "loss": 0.721, + "step": 31310 + }, + { + "epoch": 0.2768790112979367, + "grad_norm": 3.0014986991882324, + "learning_rate": 4.5385349811701056e-05, + "loss": 0.6974, + "step": 31320 + }, + { + "epoch": 0.2769674145582489, + "grad_norm": 3.077083110809326, + "learning_rate": 4.5383876424029185e-05, + "loss": 0.85, + "step": 31330 + }, + { + "epoch": 0.27705581781856115, + "grad_norm": 3.404559373855591, + "learning_rate": 4.538240303635732e-05, + "loss": 0.8457, + "step": 31340 + }, + { + "epoch": 0.2771442210788734, + "grad_norm": 5.274980068206787, + "learning_rate": 4.538092964868544e-05, + "loss": 0.7926, + "step": 31350 + }, + { + "epoch": 0.2772326243391856, + "grad_norm": 18.80033302307129, + "learning_rate": 4.5379456261013577e-05, + "loss": 0.7141, + "step": 31360 + }, + { + "epoch": 0.2773210275994979, + "grad_norm": 2.937209129333496, + "learning_rate": 4.5377982873341705e-05, + "loss": 0.7576, + "step": 31370 + }, + { + "epoch": 0.2774094308598101, + "grad_norm": 10.378564834594727, + "learning_rate": 4.537650948566983e-05, + "loss": 0.7622, + "step": 31380 + }, + { + "epoch": 0.27749783412012236, + "grad_norm": 6.353061676025391, + "learning_rate": 4.537503609799796e-05, + "loss": 0.826, + "step": 31390 + }, + { + "epoch": 0.2775862373804346, + "grad_norm": 8.064691543579102, + "learning_rate": 4.537356271032609e-05, + "loss": 0.7963, + "step": 31400 + }, + { + "epoch": 0.2776746406407468, + "grad_norm": 23.419443130493164, + "learning_rate": 4.537208932265422e-05, + "loss": 0.7, + "step": 31410 + }, + { + "epoch": 0.27776304390105905, + "grad_norm": 4.450402736663818, + "learning_rate": 4.5370615934982353e-05, + "loss": 0.6737, + "step": 31420 + }, + { + "epoch": 0.27785144716137133, + "grad_norm": 3.4850399494171143, + "learning_rate": 4.5369142547310475e-05, + "loss": 0.7534, + "step": 31430 + }, + { + "epoch": 0.27793985042168357, + "grad_norm": 2.655029535293579, + "learning_rate": 4.536766915963861e-05, + "loss": 0.7924, + "step": 31440 + }, + { + "epoch": 0.2780282536819958, + "grad_norm": 2.5935635566711426, + "learning_rate": 4.536619577196674e-05, + "loss": 0.7457, + "step": 31450 + }, + { + "epoch": 0.278116656942308, + "grad_norm": 2.8094661235809326, + "learning_rate": 4.536472238429487e-05, + "loss": 0.8115, + "step": 31460 + }, + { + "epoch": 0.27820506020262026, + "grad_norm": 5.6358256340026855, + "learning_rate": 4.5363248996622995e-05, + "loss": 0.7454, + "step": 31470 + }, + { + "epoch": 0.2782934634629325, + "grad_norm": 1.1668996810913086, + "learning_rate": 4.536177560895113e-05, + "loss": 0.6985, + "step": 31480 + }, + { + "epoch": 0.2783818667232448, + "grad_norm": 5.340427398681641, + "learning_rate": 4.536030222127925e-05, + "loss": 0.8203, + "step": 31490 + }, + { + "epoch": 0.278470269983557, + "grad_norm": 7.364120960235596, + "learning_rate": 4.535882883360739e-05, + "loss": 0.5594, + "step": 31500 + }, + { + "epoch": 0.27855867324386924, + "grad_norm": 2.81132435798645, + "learning_rate": 4.5357355445935515e-05, + "loss": 0.7554, + "step": 31510 + }, + { + "epoch": 0.27864707650418147, + "grad_norm": 7.576087951660156, + "learning_rate": 4.5355882058263644e-05, + "loss": 0.8755, + "step": 31520 + }, + { + "epoch": 0.2787354797644937, + "grad_norm": 6.171428203582764, + "learning_rate": 4.535440867059177e-05, + "loss": 0.8165, + "step": 31530 + }, + { + "epoch": 0.27882388302480593, + "grad_norm": 4.261667728424072, + "learning_rate": 4.53529352829199e-05, + "loss": 0.8848, + "step": 31540 + }, + { + "epoch": 0.2789122862851182, + "grad_norm": 1.7031608819961548, + "learning_rate": 4.535146189524803e-05, + "loss": 0.7545, + "step": 31550 + }, + { + "epoch": 0.27900068954543045, + "grad_norm": 3.160386562347412, + "learning_rate": 4.5349988507576164e-05, + "loss": 0.7302, + "step": 31560 + }, + { + "epoch": 0.2790890928057427, + "grad_norm": 3.2563092708587646, + "learning_rate": 4.534851511990429e-05, + "loss": 0.8251, + "step": 31570 + }, + { + "epoch": 0.2791774960660549, + "grad_norm": 2.8622827529907227, + "learning_rate": 4.534704173223242e-05, + "loss": 0.7475, + "step": 31580 + }, + { + "epoch": 0.27926589932636714, + "grad_norm": 2.567610740661621, + "learning_rate": 4.534556834456055e-05, + "loss": 0.6366, + "step": 31590 + }, + { + "epoch": 0.27935430258667937, + "grad_norm": 1.403955340385437, + "learning_rate": 4.534409495688868e-05, + "loss": 0.6678, + "step": 31600 + }, + { + "epoch": 0.27944270584699166, + "grad_norm": 3.9427952766418457, + "learning_rate": 4.5342621569216806e-05, + "loss": 0.7393, + "step": 31610 + }, + { + "epoch": 0.2795311091073039, + "grad_norm": 14.722577095031738, + "learning_rate": 4.534114818154494e-05, + "loss": 0.8377, + "step": 31620 + }, + { + "epoch": 0.2796195123676161, + "grad_norm": 5.493706703186035, + "learning_rate": 4.533967479387307e-05, + "loss": 0.8231, + "step": 31630 + }, + { + "epoch": 0.27970791562792835, + "grad_norm": 15.475202560424805, + "learning_rate": 4.53382014062012e-05, + "loss": 0.8847, + "step": 31640 + }, + { + "epoch": 0.2797963188882406, + "grad_norm": 3.6678528785705566, + "learning_rate": 4.5336728018529326e-05, + "loss": 0.7225, + "step": 31650 + }, + { + "epoch": 0.2798847221485528, + "grad_norm": 4.4838738441467285, + "learning_rate": 4.5335254630857454e-05, + "loss": 0.7591, + "step": 31660 + }, + { + "epoch": 0.2799731254088651, + "grad_norm": 2.967912435531616, + "learning_rate": 4.533378124318558e-05, + "loss": 0.8195, + "step": 31670 + }, + { + "epoch": 0.28006152866917733, + "grad_norm": 2.3313729763031006, + "learning_rate": 4.533230785551371e-05, + "loss": 0.8615, + "step": 31680 + }, + { + "epoch": 0.28014993192948956, + "grad_norm": 3.8316330909729004, + "learning_rate": 4.5330834467841846e-05, + "loss": 0.7331, + "step": 31690 + }, + { + "epoch": 0.2802383351898018, + "grad_norm": 7.374897480010986, + "learning_rate": 4.5329361080169974e-05, + "loss": 0.8383, + "step": 31700 + }, + { + "epoch": 0.280326738450114, + "grad_norm": 2.9984328746795654, + "learning_rate": 4.53278876924981e-05, + "loss": 0.8201, + "step": 31710 + }, + { + "epoch": 0.2804151417104263, + "grad_norm": 7.2484211921691895, + "learning_rate": 4.532641430482623e-05, + "loss": 0.8188, + "step": 31720 + }, + { + "epoch": 0.28050354497073854, + "grad_norm": 5.417243480682373, + "learning_rate": 4.532494091715436e-05, + "loss": 0.7514, + "step": 31730 + }, + { + "epoch": 0.28059194823105077, + "grad_norm": 2.9138054847717285, + "learning_rate": 4.532346752948249e-05, + "loss": 0.7529, + "step": 31740 + }, + { + "epoch": 0.280680351491363, + "grad_norm": 1.2528184652328491, + "learning_rate": 4.532199414181062e-05, + "loss": 0.7594, + "step": 31750 + }, + { + "epoch": 0.28076875475167523, + "grad_norm": 5.503701210021973, + "learning_rate": 4.5320520754138745e-05, + "loss": 0.8043, + "step": 31760 + }, + { + "epoch": 0.28085715801198746, + "grad_norm": 1.4960774183273315, + "learning_rate": 4.531904736646688e-05, + "loss": 0.6899, + "step": 31770 + }, + { + "epoch": 0.28094556127229975, + "grad_norm": 2.262402057647705, + "learning_rate": 4.531757397879501e-05, + "loss": 0.7645, + "step": 31780 + }, + { + "epoch": 0.281033964532612, + "grad_norm": 4.280818939208984, + "learning_rate": 4.5316100591123136e-05, + "loss": 0.6701, + "step": 31790 + }, + { + "epoch": 0.2811223677929242, + "grad_norm": 5.302253246307373, + "learning_rate": 4.5314627203451265e-05, + "loss": 0.8618, + "step": 31800 + }, + { + "epoch": 0.28121077105323644, + "grad_norm": 4.153197288513184, + "learning_rate": 4.53131538157794e-05, + "loss": 0.8364, + "step": 31810 + }, + { + "epoch": 0.28129917431354867, + "grad_norm": 4.214580535888672, + "learning_rate": 4.531168042810752e-05, + "loss": 0.8173, + "step": 31820 + }, + { + "epoch": 0.2813875775738609, + "grad_norm": 9.246479988098145, + "learning_rate": 4.5310207040435657e-05, + "loss": 0.7248, + "step": 31830 + }, + { + "epoch": 0.2814759808341732, + "grad_norm": 3.1720852851867676, + "learning_rate": 4.5308733652763785e-05, + "loss": 0.7114, + "step": 31840 + }, + { + "epoch": 0.2815643840944854, + "grad_norm": 2.7999463081359863, + "learning_rate": 4.530726026509191e-05, + "loss": 0.6936, + "step": 31850 + }, + { + "epoch": 0.28165278735479765, + "grad_norm": 1.7840044498443604, + "learning_rate": 4.530578687742004e-05, + "loss": 0.6884, + "step": 31860 + }, + { + "epoch": 0.2817411906151099, + "grad_norm": 2.3521809577941895, + "learning_rate": 4.530431348974817e-05, + "loss": 0.8267, + "step": 31870 + }, + { + "epoch": 0.2818295938754221, + "grad_norm": 2.475029230117798, + "learning_rate": 4.53028401020763e-05, + "loss": 0.7436, + "step": 31880 + }, + { + "epoch": 0.28191799713573434, + "grad_norm": 5.852056503295898, + "learning_rate": 4.5301366714404433e-05, + "loss": 0.7246, + "step": 31890 + }, + { + "epoch": 0.28200640039604663, + "grad_norm": 5.086641311645508, + "learning_rate": 4.5299893326732555e-05, + "loss": 0.7414, + "step": 31900 + }, + { + "epoch": 0.28209480365635886, + "grad_norm": 3.8775100708007812, + "learning_rate": 4.529841993906069e-05, + "loss": 0.7888, + "step": 31910 + }, + { + "epoch": 0.2821832069166711, + "grad_norm": 2.451167583465576, + "learning_rate": 4.529694655138882e-05, + "loss": 0.6424, + "step": 31920 + }, + { + "epoch": 0.2822716101769833, + "grad_norm": 7.1754913330078125, + "learning_rate": 4.529547316371695e-05, + "loss": 0.6673, + "step": 31930 + }, + { + "epoch": 0.28236001343729555, + "grad_norm": 1.0810681581497192, + "learning_rate": 4.5293999776045075e-05, + "loss": 0.7369, + "step": 31940 + }, + { + "epoch": 0.2824484166976078, + "grad_norm": 3.676067352294922, + "learning_rate": 4.529252638837321e-05, + "loss": 0.9427, + "step": 31950 + }, + { + "epoch": 0.28253681995792007, + "grad_norm": 11.599294662475586, + "learning_rate": 4.529105300070133e-05, + "loss": 0.8924, + "step": 31960 + }, + { + "epoch": 0.2826252232182323, + "grad_norm": 3.139760971069336, + "learning_rate": 4.528957961302947e-05, + "loss": 0.7779, + "step": 31970 + }, + { + "epoch": 0.28271362647854453, + "grad_norm": 5.7416486740112305, + "learning_rate": 4.5288106225357595e-05, + "loss": 0.7921, + "step": 31980 + }, + { + "epoch": 0.28280202973885676, + "grad_norm": 2.841639757156372, + "learning_rate": 4.5286632837685724e-05, + "loss": 0.6645, + "step": 31990 + }, + { + "epoch": 0.282890432999169, + "grad_norm": 2.6094791889190674, + "learning_rate": 4.528515945001385e-05, + "loss": 0.7191, + "step": 32000 + }, + { + "epoch": 0.2829788362594812, + "grad_norm": 9.99990177154541, + "learning_rate": 4.528368606234198e-05, + "loss": 0.7702, + "step": 32010 + }, + { + "epoch": 0.2830672395197935, + "grad_norm": 6.792905807495117, + "learning_rate": 4.528221267467011e-05, + "loss": 0.8994, + "step": 32020 + }, + { + "epoch": 0.28315564278010574, + "grad_norm": 4.174898624420166, + "learning_rate": 4.5280739286998244e-05, + "loss": 0.7427, + "step": 32030 + }, + { + "epoch": 0.283244046040418, + "grad_norm": 2.525766611099243, + "learning_rate": 4.5279265899326366e-05, + "loss": 0.7161, + "step": 32040 + }, + { + "epoch": 0.2833324493007302, + "grad_norm": 2.4678916931152344, + "learning_rate": 4.52777925116545e-05, + "loss": 0.7507, + "step": 32050 + }, + { + "epoch": 0.28342085256104244, + "grad_norm": 2.9384021759033203, + "learning_rate": 4.527631912398263e-05, + "loss": 0.8175, + "step": 32060 + }, + { + "epoch": 0.28350925582135467, + "grad_norm": 2.0960898399353027, + "learning_rate": 4.527484573631076e-05, + "loss": 0.814, + "step": 32070 + }, + { + "epoch": 0.28359765908166695, + "grad_norm": 1.9491114616394043, + "learning_rate": 4.5273372348638886e-05, + "loss": 0.7507, + "step": 32080 + }, + { + "epoch": 0.2836860623419792, + "grad_norm": 2.4328441619873047, + "learning_rate": 4.527189896096702e-05, + "loss": 0.7115, + "step": 32090 + }, + { + "epoch": 0.2837744656022914, + "grad_norm": 5.653671741485596, + "learning_rate": 4.527042557329514e-05, + "loss": 0.7711, + "step": 32100 + }, + { + "epoch": 0.28386286886260365, + "grad_norm": 5.98483943939209, + "learning_rate": 4.526895218562328e-05, + "loss": 0.7459, + "step": 32110 + }, + { + "epoch": 0.2839512721229159, + "grad_norm": 3.305734634399414, + "learning_rate": 4.52674787979514e-05, + "loss": 0.5808, + "step": 32120 + }, + { + "epoch": 0.2840396753832281, + "grad_norm": 3.777399778366089, + "learning_rate": 4.5266005410279534e-05, + "loss": 0.7134, + "step": 32130 + }, + { + "epoch": 0.2841280786435404, + "grad_norm": 4.952066421508789, + "learning_rate": 4.526453202260766e-05, + "loss": 0.7596, + "step": 32140 + }, + { + "epoch": 0.2842164819038526, + "grad_norm": 7.45338773727417, + "learning_rate": 4.526305863493579e-05, + "loss": 0.7016, + "step": 32150 + }, + { + "epoch": 0.28430488516416486, + "grad_norm": 4.668613910675049, + "learning_rate": 4.526158524726392e-05, + "loss": 0.7366, + "step": 32160 + }, + { + "epoch": 0.2843932884244771, + "grad_norm": 3.1431198120117188, + "learning_rate": 4.5260111859592054e-05, + "loss": 0.6675, + "step": 32170 + }, + { + "epoch": 0.2844816916847893, + "grad_norm": 5.466425895690918, + "learning_rate": 4.5258638471920176e-05, + "loss": 0.8678, + "step": 32180 + }, + { + "epoch": 0.28457009494510155, + "grad_norm": 8.157071113586426, + "learning_rate": 4.525716508424831e-05, + "loss": 0.8261, + "step": 32190 + }, + { + "epoch": 0.28465849820541383, + "grad_norm": 2.376061201095581, + "learning_rate": 4.525569169657644e-05, + "loss": 0.7547, + "step": 32200 + }, + { + "epoch": 0.28474690146572607, + "grad_norm": 3.0361568927764893, + "learning_rate": 4.525421830890457e-05, + "loss": 0.8647, + "step": 32210 + }, + { + "epoch": 0.2848353047260383, + "grad_norm": 4.107792377471924, + "learning_rate": 4.5252744921232696e-05, + "loss": 0.77, + "step": 32220 + }, + { + "epoch": 0.2849237079863505, + "grad_norm": 4.00471830368042, + "learning_rate": 4.5251271533560825e-05, + "loss": 0.7364, + "step": 32230 + }, + { + "epoch": 0.28501211124666276, + "grad_norm": 2.015693426132202, + "learning_rate": 4.524979814588895e-05, + "loss": 0.6751, + "step": 32240 + }, + { + "epoch": 0.28510051450697504, + "grad_norm": 3.4149277210235596, + "learning_rate": 4.524832475821709e-05, + "loss": 0.6419, + "step": 32250 + }, + { + "epoch": 0.2851889177672873, + "grad_norm": 2.5606470108032227, + "learning_rate": 4.524685137054521e-05, + "loss": 0.6333, + "step": 32260 + }, + { + "epoch": 0.2852773210275995, + "grad_norm": 6.284425258636475, + "learning_rate": 4.5245377982873345e-05, + "loss": 0.7133, + "step": 32270 + }, + { + "epoch": 0.28536572428791174, + "grad_norm": 3.353984832763672, + "learning_rate": 4.524390459520147e-05, + "loss": 0.7698, + "step": 32280 + }, + { + "epoch": 0.28545412754822397, + "grad_norm": 5.407648086547852, + "learning_rate": 4.52424312075296e-05, + "loss": 0.8678, + "step": 32290 + }, + { + "epoch": 0.2855425308085362, + "grad_norm": 6.983589172363281, + "learning_rate": 4.524095781985773e-05, + "loss": 0.7221, + "step": 32300 + }, + { + "epoch": 0.2856309340688485, + "grad_norm": 2.404683828353882, + "learning_rate": 4.5239484432185865e-05, + "loss": 0.8152, + "step": 32310 + }, + { + "epoch": 0.2857193373291607, + "grad_norm": 4.022778511047363, + "learning_rate": 4.5238011044513987e-05, + "loss": 0.7457, + "step": 32320 + }, + { + "epoch": 0.28580774058947295, + "grad_norm": 3.6083014011383057, + "learning_rate": 4.523653765684212e-05, + "loss": 0.6555, + "step": 32330 + }, + { + "epoch": 0.2858961438497852, + "grad_norm": 3.905674695968628, + "learning_rate": 4.523506426917024e-05, + "loss": 0.7194, + "step": 32340 + }, + { + "epoch": 0.2859845471100974, + "grad_norm": 3.084463596343994, + "learning_rate": 4.523359088149838e-05, + "loss": 0.9105, + "step": 32350 + }, + { + "epoch": 0.28607295037040964, + "grad_norm": 4.024795055389404, + "learning_rate": 4.523211749382651e-05, + "loss": 0.66, + "step": 32360 + }, + { + "epoch": 0.2861613536307219, + "grad_norm": 3.0946974754333496, + "learning_rate": 4.5230644106154635e-05, + "loss": 0.6783, + "step": 32370 + }, + { + "epoch": 0.28624975689103416, + "grad_norm": 9.112564086914062, + "learning_rate": 4.5229170718482763e-05, + "loss": 0.7838, + "step": 32380 + }, + { + "epoch": 0.2863381601513464, + "grad_norm": 8.240164756774902, + "learning_rate": 4.52276973308109e-05, + "loss": 0.7674, + "step": 32390 + }, + { + "epoch": 0.2864265634116586, + "grad_norm": 5.458837985992432, + "learning_rate": 4.522622394313902e-05, + "loss": 0.7803, + "step": 32400 + }, + { + "epoch": 0.28651496667197085, + "grad_norm": 2.699753999710083, + "learning_rate": 4.5224750555467155e-05, + "loss": 0.775, + "step": 32410 + }, + { + "epoch": 0.2866033699322831, + "grad_norm": 3.461235523223877, + "learning_rate": 4.5223277167795284e-05, + "loss": 0.6948, + "step": 32420 + }, + { + "epoch": 0.28669177319259537, + "grad_norm": 3.9574456214904785, + "learning_rate": 4.522180378012341e-05, + "loss": 0.7613, + "step": 32430 + }, + { + "epoch": 0.2867801764529076, + "grad_norm": 1.863532304763794, + "learning_rate": 4.522033039245154e-05, + "loss": 0.6545, + "step": 32440 + }, + { + "epoch": 0.28686857971321983, + "grad_norm": 2.3592708110809326, + "learning_rate": 4.5218857004779676e-05, + "loss": 0.802, + "step": 32450 + }, + { + "epoch": 0.28695698297353206, + "grad_norm": 5.686901092529297, + "learning_rate": 4.52173836171078e-05, + "loss": 0.747, + "step": 32460 + }, + { + "epoch": 0.2870453862338443, + "grad_norm": 6.838834762573242, + "learning_rate": 4.521591022943593e-05, + "loss": 0.6799, + "step": 32470 + }, + { + "epoch": 0.2871337894941565, + "grad_norm": 6.67786169052124, + "learning_rate": 4.521443684176406e-05, + "loss": 0.6916, + "step": 32480 + }, + { + "epoch": 0.2872221927544688, + "grad_norm": 3.9122962951660156, + "learning_rate": 4.521296345409219e-05, + "loss": 0.7445, + "step": 32490 + }, + { + "epoch": 0.28731059601478104, + "grad_norm": 2.963627576828003, + "learning_rate": 4.521149006642032e-05, + "loss": 0.747, + "step": 32500 + }, + { + "epoch": 0.28739899927509327, + "grad_norm": 5.150475978851318, + "learning_rate": 4.5210016678748446e-05, + "loss": 0.8416, + "step": 32510 + }, + { + "epoch": 0.2874874025354055, + "grad_norm": 4.771765232086182, + "learning_rate": 4.5208543291076574e-05, + "loss": 0.6932, + "step": 32520 + }, + { + "epoch": 0.28757580579571773, + "grad_norm": 6.81682825088501, + "learning_rate": 4.520706990340471e-05, + "loss": 0.7802, + "step": 32530 + }, + { + "epoch": 0.28766420905602996, + "grad_norm": 4.43913459777832, + "learning_rate": 4.520559651573284e-05, + "loss": 0.8444, + "step": 32540 + }, + { + "epoch": 0.28775261231634225, + "grad_norm": 4.907598972320557, + "learning_rate": 4.5204123128060966e-05, + "loss": 0.8626, + "step": 32550 + }, + { + "epoch": 0.2878410155766545, + "grad_norm": 2.7618095874786377, + "learning_rate": 4.5202649740389094e-05, + "loss": 0.6698, + "step": 32560 + }, + { + "epoch": 0.2879294188369667, + "grad_norm": 1.5626672506332397, + "learning_rate": 4.520117635271722e-05, + "loss": 0.6031, + "step": 32570 + }, + { + "epoch": 0.28801782209727894, + "grad_norm": 4.612063407897949, + "learning_rate": 4.519970296504535e-05, + "loss": 0.6114, + "step": 32580 + }, + { + "epoch": 0.2881062253575912, + "grad_norm": 2.4098236560821533, + "learning_rate": 4.519822957737348e-05, + "loss": 0.7503, + "step": 32590 + }, + { + "epoch": 0.2881946286179034, + "grad_norm": 4.4840850830078125, + "learning_rate": 4.5196756189701614e-05, + "loss": 0.753, + "step": 32600 + }, + { + "epoch": 0.2882830318782157, + "grad_norm": 4.803959369659424, + "learning_rate": 4.519528280202974e-05, + "loss": 0.6325, + "step": 32610 + }, + { + "epoch": 0.2883714351385279, + "grad_norm": 2.624789237976074, + "learning_rate": 4.519380941435787e-05, + "loss": 0.6635, + "step": 32620 + }, + { + "epoch": 0.28845983839884015, + "grad_norm": 4.030854225158691, + "learning_rate": 4.5192336026686e-05, + "loss": 0.7889, + "step": 32630 + }, + { + "epoch": 0.2885482416591524, + "grad_norm": 6.312045097351074, + "learning_rate": 4.519086263901413e-05, + "loss": 0.866, + "step": 32640 + }, + { + "epoch": 0.2886366449194646, + "grad_norm": 4.309917449951172, + "learning_rate": 4.5189389251342256e-05, + "loss": 0.8799, + "step": 32650 + }, + { + "epoch": 0.28872504817977684, + "grad_norm": 3.6183743476867676, + "learning_rate": 4.518791586367039e-05, + "loss": 0.8994, + "step": 32660 + }, + { + "epoch": 0.28881345144008913, + "grad_norm": 5.355915069580078, + "learning_rate": 4.518644247599852e-05, + "loss": 0.7293, + "step": 32670 + }, + { + "epoch": 0.28890185470040136, + "grad_norm": 7.802880764007568, + "learning_rate": 4.518496908832665e-05, + "loss": 0.7388, + "step": 32680 + }, + { + "epoch": 0.2889902579607136, + "grad_norm": 4.839122772216797, + "learning_rate": 4.5183495700654776e-05, + "loss": 0.7913, + "step": 32690 + }, + { + "epoch": 0.2890786612210258, + "grad_norm": 7.241281509399414, + "learning_rate": 4.5182022312982905e-05, + "loss": 0.8092, + "step": 32700 + }, + { + "epoch": 0.28916706448133805, + "grad_norm": 6.2426838874816895, + "learning_rate": 4.518054892531103e-05, + "loss": 0.8079, + "step": 32710 + }, + { + "epoch": 0.2892554677416503, + "grad_norm": 7.339729309082031, + "learning_rate": 4.517907553763917e-05, + "loss": 0.8304, + "step": 32720 + }, + { + "epoch": 0.28934387100196257, + "grad_norm": 3.2782113552093506, + "learning_rate": 4.517760214996729e-05, + "loss": 0.7398, + "step": 32730 + }, + { + "epoch": 0.2894322742622748, + "grad_norm": 6.81158447265625, + "learning_rate": 4.5176128762295425e-05, + "loss": 0.7231, + "step": 32740 + }, + { + "epoch": 0.28952067752258703, + "grad_norm": 4.297384262084961, + "learning_rate": 4.517465537462355e-05, + "loss": 0.8208, + "step": 32750 + }, + { + "epoch": 0.28960908078289926, + "grad_norm": 1.8786605596542358, + "learning_rate": 4.517318198695168e-05, + "loss": 0.7891, + "step": 32760 + }, + { + "epoch": 0.2896974840432115, + "grad_norm": 4.043199062347412, + "learning_rate": 4.517170859927981e-05, + "loss": 0.6814, + "step": 32770 + }, + { + "epoch": 0.2897858873035238, + "grad_norm": 2.5615527629852295, + "learning_rate": 4.5170235211607945e-05, + "loss": 0.7769, + "step": 32780 + }, + { + "epoch": 0.289874290563836, + "grad_norm": 14.03801441192627, + "learning_rate": 4.516876182393607e-05, + "loss": 0.7883, + "step": 32790 + }, + { + "epoch": 0.28996269382414824, + "grad_norm": 4.790248394012451, + "learning_rate": 4.51672884362642e-05, + "loss": 0.6845, + "step": 32800 + }, + { + "epoch": 0.2900510970844605, + "grad_norm": 3.695822238922119, + "learning_rate": 4.516581504859232e-05, + "loss": 0.6317, + "step": 32810 + }, + { + "epoch": 0.2901395003447727, + "grad_norm": 3.9681992530822754, + "learning_rate": 4.516434166092046e-05, + "loss": 0.835, + "step": 32820 + }, + { + "epoch": 0.29022790360508494, + "grad_norm": 5.208821773529053, + "learning_rate": 4.516286827324859e-05, + "loss": 0.7193, + "step": 32830 + }, + { + "epoch": 0.2903163068653972, + "grad_norm": 2.9277303218841553, + "learning_rate": 4.5161394885576715e-05, + "loss": 0.6765, + "step": 32840 + }, + { + "epoch": 0.29040471012570945, + "grad_norm": 9.771438598632812, + "learning_rate": 4.5159921497904844e-05, + "loss": 0.7952, + "step": 32850 + }, + { + "epoch": 0.2904931133860217, + "grad_norm": 3.2962722778320312, + "learning_rate": 4.515844811023298e-05, + "loss": 0.7132, + "step": 32860 + }, + { + "epoch": 0.2905815166463339, + "grad_norm": 4.077047824859619, + "learning_rate": 4.51569747225611e-05, + "loss": 0.7213, + "step": 32870 + }, + { + "epoch": 0.29066991990664615, + "grad_norm": 5.25209379196167, + "learning_rate": 4.5155501334889235e-05, + "loss": 0.8393, + "step": 32880 + }, + { + "epoch": 0.2907583231669584, + "grad_norm": 3.1044015884399414, + "learning_rate": 4.5154027947217364e-05, + "loss": 0.8898, + "step": 32890 + }, + { + "epoch": 0.29084672642727066, + "grad_norm": 4.176365852355957, + "learning_rate": 4.515255455954549e-05, + "loss": 0.6721, + "step": 32900 + }, + { + "epoch": 0.2909351296875829, + "grad_norm": 5.453918933868408, + "learning_rate": 4.515108117187362e-05, + "loss": 0.8482, + "step": 32910 + }, + { + "epoch": 0.2910235329478951, + "grad_norm": 4.604106903076172, + "learning_rate": 4.5149607784201756e-05, + "loss": 0.7786, + "step": 32920 + }, + { + "epoch": 0.29111193620820736, + "grad_norm": 6.950430393218994, + "learning_rate": 4.514813439652988e-05, + "loss": 0.7997, + "step": 32930 + }, + { + "epoch": 0.2912003394685196, + "grad_norm": 2.0689749717712402, + "learning_rate": 4.514666100885801e-05, + "loss": 0.756, + "step": 32940 + }, + { + "epoch": 0.2912887427288318, + "grad_norm": 3.6594607830047607, + "learning_rate": 4.5145187621186134e-05, + "loss": 0.7576, + "step": 32950 + }, + { + "epoch": 0.2913771459891441, + "grad_norm": 3.749756097793579, + "learning_rate": 4.514371423351427e-05, + "loss": 0.667, + "step": 32960 + }, + { + "epoch": 0.29146554924945633, + "grad_norm": 2.4697751998901367, + "learning_rate": 4.51422408458424e-05, + "loss": 0.945, + "step": 32970 + }, + { + "epoch": 0.29155395250976857, + "grad_norm": 9.962291717529297, + "learning_rate": 4.5140767458170526e-05, + "loss": 0.7015, + "step": 32980 + }, + { + "epoch": 0.2916423557700808, + "grad_norm": 10.507822036743164, + "learning_rate": 4.5139294070498654e-05, + "loss": 0.6948, + "step": 32990 + }, + { + "epoch": 0.291730759030393, + "grad_norm": 5.705935001373291, + "learning_rate": 4.513782068282679e-05, + "loss": 0.6677, + "step": 33000 + }, + { + "epoch": 0.29181916229070526, + "grad_norm": 2.1423537731170654, + "learning_rate": 4.513634729515491e-05, + "loss": 0.6951, + "step": 33010 + }, + { + "epoch": 0.29190756555101754, + "grad_norm": 5.549307823181152, + "learning_rate": 4.5134873907483046e-05, + "loss": 0.7208, + "step": 33020 + }, + { + "epoch": 0.2919959688113298, + "grad_norm": 3.8369317054748535, + "learning_rate": 4.5133400519811174e-05, + "loss": 0.7848, + "step": 33030 + }, + { + "epoch": 0.292084372071642, + "grad_norm": 5.902238368988037, + "learning_rate": 4.51319271321393e-05, + "loss": 0.8579, + "step": 33040 + }, + { + "epoch": 0.29217277533195424, + "grad_norm": 2.6832330226898193, + "learning_rate": 4.513045374446743e-05, + "loss": 0.738, + "step": 33050 + }, + { + "epoch": 0.29226117859226647, + "grad_norm": 5.537685394287109, + "learning_rate": 4.512898035679556e-05, + "loss": 0.7045, + "step": 33060 + }, + { + "epoch": 0.2923495818525787, + "grad_norm": 4.328697681427002, + "learning_rate": 4.512750696912369e-05, + "loss": 0.8025, + "step": 33070 + }, + { + "epoch": 0.292437985112891, + "grad_norm": 1.3331048488616943, + "learning_rate": 4.512603358145182e-05, + "loss": 0.7723, + "step": 33080 + }, + { + "epoch": 0.2925263883732032, + "grad_norm": 5.987185001373291, + "learning_rate": 4.5124560193779944e-05, + "loss": 0.7358, + "step": 33090 + }, + { + "epoch": 0.29261479163351545, + "grad_norm": 3.3136699199676514, + "learning_rate": 4.512308680610808e-05, + "loss": 0.7887, + "step": 33100 + }, + { + "epoch": 0.2927031948938277, + "grad_norm": 2.6293253898620605, + "learning_rate": 4.512161341843621e-05, + "loss": 0.7684, + "step": 33110 + }, + { + "epoch": 0.2927915981541399, + "grad_norm": 3.0009572505950928, + "learning_rate": 4.5120140030764336e-05, + "loss": 0.7661, + "step": 33120 + }, + { + "epoch": 0.29288000141445214, + "grad_norm": 3.628920555114746, + "learning_rate": 4.5118666643092465e-05, + "loss": 0.6792, + "step": 33130 + }, + { + "epoch": 0.2929684046747644, + "grad_norm": 5.839208602905273, + "learning_rate": 4.51171932554206e-05, + "loss": 0.6378, + "step": 33140 + }, + { + "epoch": 0.29305680793507666, + "grad_norm": 8.483647346496582, + "learning_rate": 4.511571986774872e-05, + "loss": 0.6737, + "step": 33150 + }, + { + "epoch": 0.2931452111953889, + "grad_norm": 6.439718723297119, + "learning_rate": 4.5114246480076856e-05, + "loss": 0.7123, + "step": 33160 + }, + { + "epoch": 0.2932336144557011, + "grad_norm": 3.1471869945526123, + "learning_rate": 4.511277309240498e-05, + "loss": 0.8745, + "step": 33170 + }, + { + "epoch": 0.29332201771601335, + "grad_norm": 2.6022372245788574, + "learning_rate": 4.511129970473311e-05, + "loss": 0.7852, + "step": 33180 + }, + { + "epoch": 0.2934104209763256, + "grad_norm": 7.2737555503845215, + "learning_rate": 4.510982631706124e-05, + "loss": 0.7135, + "step": 33190 + }, + { + "epoch": 0.29349882423663787, + "grad_norm": 2.462229013442993, + "learning_rate": 4.510835292938937e-05, + "loss": 0.6438, + "step": 33200 + }, + { + "epoch": 0.2935872274969501, + "grad_norm": 7.040503025054932, + "learning_rate": 4.51068795417175e-05, + "loss": 0.6436, + "step": 33210 + }, + { + "epoch": 0.29367563075726233, + "grad_norm": 4.6756110191345215, + "learning_rate": 4.510540615404563e-05, + "loss": 0.7018, + "step": 33220 + }, + { + "epoch": 0.29376403401757456, + "grad_norm": 18.167781829833984, + "learning_rate": 4.5103932766373755e-05, + "loss": 0.7519, + "step": 33230 + }, + { + "epoch": 0.2938524372778868, + "grad_norm": 4.194391250610352, + "learning_rate": 4.510245937870189e-05, + "loss": 0.7561, + "step": 33240 + }, + { + "epoch": 0.293940840538199, + "grad_norm": 16.583396911621094, + "learning_rate": 4.510098599103002e-05, + "loss": 0.6939, + "step": 33250 + }, + { + "epoch": 0.2940292437985113, + "grad_norm": 3.5381343364715576, + "learning_rate": 4.509951260335815e-05, + "loss": 0.8336, + "step": 33260 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 2.9034624099731445, + "learning_rate": 4.5098039215686275e-05, + "loss": 0.6719, + "step": 33270 + }, + { + "epoch": 0.29420605031913577, + "grad_norm": 7.28995418548584, + "learning_rate": 4.509656582801441e-05, + "loss": 0.7327, + "step": 33280 + }, + { + "epoch": 0.294294453579448, + "grad_norm": 7.901029586791992, + "learning_rate": 4.509509244034253e-05, + "loss": 0.9311, + "step": 33290 + }, + { + "epoch": 0.29438285683976023, + "grad_norm": 2.0050694942474365, + "learning_rate": 4.509361905267067e-05, + "loss": 0.6719, + "step": 33300 + }, + { + "epoch": 0.2944712601000725, + "grad_norm": 1.942972183227539, + "learning_rate": 4.509214566499879e-05, + "loss": 0.7469, + "step": 33310 + }, + { + "epoch": 0.29455966336038475, + "grad_norm": 10.133966445922852, + "learning_rate": 4.5090672277326924e-05, + "loss": 0.7659, + "step": 33320 + }, + { + "epoch": 0.294648066620697, + "grad_norm": 8.564141273498535, + "learning_rate": 4.508919888965505e-05, + "loss": 0.6516, + "step": 33330 + }, + { + "epoch": 0.2947364698810092, + "grad_norm": 7.872227668762207, + "learning_rate": 4.508772550198318e-05, + "loss": 0.7498, + "step": 33340 + }, + { + "epoch": 0.29482487314132144, + "grad_norm": 3.1297037601470947, + "learning_rate": 4.508625211431131e-05, + "loss": 0.6493, + "step": 33350 + }, + { + "epoch": 0.2949132764016337, + "grad_norm": 2.6965694427490234, + "learning_rate": 4.5084778726639444e-05, + "loss": 0.8029, + "step": 33360 + }, + { + "epoch": 0.29500167966194596, + "grad_norm": 1.995500087738037, + "learning_rate": 4.5083305338967565e-05, + "loss": 0.7052, + "step": 33370 + }, + { + "epoch": 0.2950900829222582, + "grad_norm": 3.699645757675171, + "learning_rate": 4.50818319512957e-05, + "loss": 0.8108, + "step": 33380 + }, + { + "epoch": 0.2951784861825704, + "grad_norm": 3.0953612327575684, + "learning_rate": 4.508035856362383e-05, + "loss": 0.8239, + "step": 33390 + }, + { + "epoch": 0.29526688944288265, + "grad_norm": 2.6699650287628174, + "learning_rate": 4.507888517595196e-05, + "loss": 0.7155, + "step": 33400 + }, + { + "epoch": 0.2953552927031949, + "grad_norm": 3.6951022148132324, + "learning_rate": 4.5077411788280086e-05, + "loss": 0.8089, + "step": 33410 + }, + { + "epoch": 0.2954436959635071, + "grad_norm": 13.246560096740723, + "learning_rate": 4.5075938400608214e-05, + "loss": 0.8607, + "step": 33420 + }, + { + "epoch": 0.2955320992238194, + "grad_norm": 2.9321255683898926, + "learning_rate": 4.507446501293634e-05, + "loss": 0.6976, + "step": 33430 + }, + { + "epoch": 0.29562050248413163, + "grad_norm": 4.297573566436768, + "learning_rate": 4.507299162526448e-05, + "loss": 0.7081, + "step": 33440 + }, + { + "epoch": 0.29570890574444386, + "grad_norm": 1.6129165887832642, + "learning_rate": 4.5071518237592606e-05, + "loss": 0.7055, + "step": 33450 + }, + { + "epoch": 0.2957973090047561, + "grad_norm": 5.101630210876465, + "learning_rate": 4.5070044849920734e-05, + "loss": 0.7261, + "step": 33460 + }, + { + "epoch": 0.2958857122650683, + "grad_norm": 2.3036487102508545, + "learning_rate": 4.506857146224886e-05, + "loss": 0.8177, + "step": 33470 + }, + { + "epoch": 0.29597411552538055, + "grad_norm": 10.941174507141113, + "learning_rate": 4.506709807457699e-05, + "loss": 0.7958, + "step": 33480 + }, + { + "epoch": 0.29606251878569284, + "grad_norm": 5.281665802001953, + "learning_rate": 4.506562468690512e-05, + "loss": 0.6946, + "step": 33490 + }, + { + "epoch": 0.29615092204600507, + "grad_norm": 6.0411152839660645, + "learning_rate": 4.5064151299233254e-05, + "loss": 0.7124, + "step": 33500 + }, + { + "epoch": 0.2962393253063173, + "grad_norm": 2.2550160884857178, + "learning_rate": 4.506267791156138e-05, + "loss": 0.6775, + "step": 33510 + }, + { + "epoch": 0.29632772856662953, + "grad_norm": 1.734143614768982, + "learning_rate": 4.506120452388951e-05, + "loss": 0.6546, + "step": 33520 + }, + { + "epoch": 0.29641613182694176, + "grad_norm": 4.533917427062988, + "learning_rate": 4.505973113621764e-05, + "loss": 0.789, + "step": 33530 + }, + { + "epoch": 0.296504535087254, + "grad_norm": 2.692505359649658, + "learning_rate": 4.505825774854577e-05, + "loss": 0.84, + "step": 33540 + }, + { + "epoch": 0.2965929383475663, + "grad_norm": 4.516151428222656, + "learning_rate": 4.5056784360873896e-05, + "loss": 0.7122, + "step": 33550 + }, + { + "epoch": 0.2966813416078785, + "grad_norm": 3.1818645000457764, + "learning_rate": 4.5055310973202024e-05, + "loss": 0.821, + "step": 33560 + }, + { + "epoch": 0.29676974486819074, + "grad_norm": 2.779891014099121, + "learning_rate": 4.505383758553016e-05, + "loss": 0.7608, + "step": 33570 + }, + { + "epoch": 0.296858148128503, + "grad_norm": 6.305975437164307, + "learning_rate": 4.505236419785829e-05, + "loss": 0.8242, + "step": 33580 + }, + { + "epoch": 0.2969465513888152, + "grad_norm": 2.454766273498535, + "learning_rate": 4.5050890810186416e-05, + "loss": 0.7936, + "step": 33590 + }, + { + "epoch": 0.29703495464912744, + "grad_norm": 3.8064215183258057, + "learning_rate": 4.5049417422514545e-05, + "loss": 0.7833, + "step": 33600 + }, + { + "epoch": 0.2971233579094397, + "grad_norm": 7.597683429718018, + "learning_rate": 4.504794403484267e-05, + "loss": 0.7053, + "step": 33610 + }, + { + "epoch": 0.29721176116975195, + "grad_norm": 9.404048919677734, + "learning_rate": 4.50464706471708e-05, + "loss": 0.7911, + "step": 33620 + }, + { + "epoch": 0.2973001644300642, + "grad_norm": 5.486915111541748, + "learning_rate": 4.5044997259498936e-05, + "loss": 0.7071, + "step": 33630 + }, + { + "epoch": 0.2973885676903764, + "grad_norm": 6.558414459228516, + "learning_rate": 4.504352387182706e-05, + "loss": 0.7104, + "step": 33640 + }, + { + "epoch": 0.29747697095068865, + "grad_norm": 7.517357349395752, + "learning_rate": 4.504205048415519e-05, + "loss": 0.6982, + "step": 33650 + }, + { + "epoch": 0.2975653742110009, + "grad_norm": 4.337510585784912, + "learning_rate": 4.504057709648332e-05, + "loss": 0.7708, + "step": 33660 + }, + { + "epoch": 0.29765377747131316, + "grad_norm": 3.676593065261841, + "learning_rate": 4.503910370881145e-05, + "loss": 0.7596, + "step": 33670 + }, + { + "epoch": 0.2977421807316254, + "grad_norm": 5.71250581741333, + "learning_rate": 4.503763032113958e-05, + "loss": 0.7676, + "step": 33680 + }, + { + "epoch": 0.2978305839919376, + "grad_norm": 2.9847748279571533, + "learning_rate": 4.503615693346771e-05, + "loss": 0.7273, + "step": 33690 + }, + { + "epoch": 0.29791898725224986, + "grad_norm": 2.5993552207946777, + "learning_rate": 4.5034683545795835e-05, + "loss": 0.8274, + "step": 33700 + }, + { + "epoch": 0.2980073905125621, + "grad_norm": 2.1354825496673584, + "learning_rate": 4.503321015812397e-05, + "loss": 0.8048, + "step": 33710 + }, + { + "epoch": 0.2980957937728743, + "grad_norm": 6.078588962554932, + "learning_rate": 4.50317367704521e-05, + "loss": 0.7835, + "step": 33720 + }, + { + "epoch": 0.2981841970331866, + "grad_norm": 5.610902309417725, + "learning_rate": 4.503026338278023e-05, + "loss": 0.7088, + "step": 33730 + }, + { + "epoch": 0.29827260029349884, + "grad_norm": 1.3966268301010132, + "learning_rate": 4.5028789995108355e-05, + "loss": 0.6612, + "step": 33740 + }, + { + "epoch": 0.29836100355381107, + "grad_norm": 5.88723087310791, + "learning_rate": 4.502731660743649e-05, + "loss": 0.7465, + "step": 33750 + }, + { + "epoch": 0.2984494068141233, + "grad_norm": 2.5628538131713867, + "learning_rate": 4.502584321976461e-05, + "loss": 0.6704, + "step": 33760 + }, + { + "epoch": 0.29853781007443553, + "grad_norm": 5.098702907562256, + "learning_rate": 4.502436983209275e-05, + "loss": 0.7279, + "step": 33770 + }, + { + "epoch": 0.29862621333474776, + "grad_norm": 2.44887113571167, + "learning_rate": 4.502289644442087e-05, + "loss": 0.7042, + "step": 33780 + }, + { + "epoch": 0.29871461659506005, + "grad_norm": 4.624105930328369, + "learning_rate": 4.5021423056749004e-05, + "loss": 0.7581, + "step": 33790 + }, + { + "epoch": 0.2988030198553723, + "grad_norm": 5.336048126220703, + "learning_rate": 4.501994966907713e-05, + "loss": 0.6621, + "step": 33800 + }, + { + "epoch": 0.2988914231156845, + "grad_norm": 29.99491310119629, + "learning_rate": 4.501847628140526e-05, + "loss": 0.7816, + "step": 33810 + }, + { + "epoch": 0.29897982637599674, + "grad_norm": 1.6243245601654053, + "learning_rate": 4.501700289373339e-05, + "loss": 0.6673, + "step": 33820 + }, + { + "epoch": 0.29906822963630897, + "grad_norm": 5.088352680206299, + "learning_rate": 4.5015529506061524e-05, + "loss": 0.7942, + "step": 33830 + }, + { + "epoch": 0.29915663289662126, + "grad_norm": 7.78596830368042, + "learning_rate": 4.5014056118389645e-05, + "loss": 0.7443, + "step": 33840 + }, + { + "epoch": 0.2992450361569335, + "grad_norm": 3.9854860305786133, + "learning_rate": 4.501258273071778e-05, + "loss": 0.7123, + "step": 33850 + }, + { + "epoch": 0.2993334394172457, + "grad_norm": 7.866239547729492, + "learning_rate": 4.501110934304591e-05, + "loss": 0.8152, + "step": 33860 + }, + { + "epoch": 0.29942184267755795, + "grad_norm": 5.544895172119141, + "learning_rate": 4.500963595537404e-05, + "loss": 0.7945, + "step": 33870 + }, + { + "epoch": 0.2995102459378702, + "grad_norm": 2.7663767337799072, + "learning_rate": 4.5008162567702166e-05, + "loss": 0.785, + "step": 33880 + }, + { + "epoch": 0.2995986491981824, + "grad_norm": 7.807943344116211, + "learning_rate": 4.5006689180030294e-05, + "loss": 0.7619, + "step": 33890 + }, + { + "epoch": 0.2996870524584947, + "grad_norm": 5.797680854797363, + "learning_rate": 4.500521579235842e-05, + "loss": 0.7638, + "step": 33900 + }, + { + "epoch": 0.2997754557188069, + "grad_norm": 2.5137417316436768, + "learning_rate": 4.500374240468656e-05, + "loss": 0.7784, + "step": 33910 + }, + { + "epoch": 0.29986385897911916, + "grad_norm": 3.4669203758239746, + "learning_rate": 4.500226901701468e-05, + "loss": 0.7297, + "step": 33920 + }, + { + "epoch": 0.2999522622394314, + "grad_norm": 2.0675132274627686, + "learning_rate": 4.5000795629342814e-05, + "loss": 0.6618, + "step": 33930 + }, + { + "epoch": 0.3000406654997436, + "grad_norm": 5.007786750793457, + "learning_rate": 4.499932224167094e-05, + "loss": 0.6963, + "step": 33940 + }, + { + "epoch": 0.30012906876005585, + "grad_norm": 3.6443653106689453, + "learning_rate": 4.499784885399907e-05, + "loss": 0.6755, + "step": 33950 + }, + { + "epoch": 0.30021747202036814, + "grad_norm": 6.972654819488525, + "learning_rate": 4.49963754663272e-05, + "loss": 0.786, + "step": 33960 + }, + { + "epoch": 0.30030587528068037, + "grad_norm": 14.206008911132812, + "learning_rate": 4.4994902078655334e-05, + "loss": 0.7552, + "step": 33970 + }, + { + "epoch": 0.3003942785409926, + "grad_norm": 4.875217437744141, + "learning_rate": 4.4993428690983456e-05, + "loss": 0.7843, + "step": 33980 + }, + { + "epoch": 0.30048268180130483, + "grad_norm": 2.0247178077697754, + "learning_rate": 4.499195530331159e-05, + "loss": 0.77, + "step": 33990 + }, + { + "epoch": 0.30057108506161706, + "grad_norm": 5.592731952667236, + "learning_rate": 4.499048191563971e-05, + "loss": 0.7991, + "step": 34000 + }, + { + "epoch": 0.3006594883219293, + "grad_norm": 4.4301605224609375, + "learning_rate": 4.498900852796785e-05, + "loss": 0.6283, + "step": 34010 + }, + { + "epoch": 0.3007478915822416, + "grad_norm": 10.852659225463867, + "learning_rate": 4.4987535140295976e-05, + "loss": 0.8067, + "step": 34020 + }, + { + "epoch": 0.3008362948425538, + "grad_norm": 4.8646016120910645, + "learning_rate": 4.4986061752624105e-05, + "loss": 0.6956, + "step": 34030 + }, + { + "epoch": 0.30092469810286604, + "grad_norm": 2.4376742839813232, + "learning_rate": 4.498458836495223e-05, + "loss": 0.6968, + "step": 34040 + }, + { + "epoch": 0.30101310136317827, + "grad_norm": 5.311902046203613, + "learning_rate": 4.498311497728037e-05, + "loss": 0.6929, + "step": 34050 + }, + { + "epoch": 0.3011015046234905, + "grad_norm": 2.5958940982818604, + "learning_rate": 4.498164158960849e-05, + "loss": 0.8462, + "step": 34060 + }, + { + "epoch": 0.30118990788380273, + "grad_norm": 3.115326404571533, + "learning_rate": 4.4980168201936625e-05, + "loss": 0.7645, + "step": 34070 + }, + { + "epoch": 0.301278311144115, + "grad_norm": 2.6564133167266846, + "learning_rate": 4.497869481426475e-05, + "loss": 0.7752, + "step": 34080 + }, + { + "epoch": 0.30136671440442725, + "grad_norm": 2.4497995376586914, + "learning_rate": 4.497722142659288e-05, + "loss": 0.6383, + "step": 34090 + }, + { + "epoch": 0.3014551176647395, + "grad_norm": 1.471512794494629, + "learning_rate": 4.497574803892101e-05, + "loss": 0.7211, + "step": 34100 + }, + { + "epoch": 0.3015435209250517, + "grad_norm": 6.132297992706299, + "learning_rate": 4.497427465124914e-05, + "loss": 0.8168, + "step": 34110 + }, + { + "epoch": 0.30163192418536394, + "grad_norm": 6.400779724121094, + "learning_rate": 4.4972801263577266e-05, + "loss": 0.5699, + "step": 34120 + }, + { + "epoch": 0.3017203274456762, + "grad_norm": 3.533576488494873, + "learning_rate": 4.49713278759054e-05, + "loss": 0.729, + "step": 34130 + }, + { + "epoch": 0.30180873070598846, + "grad_norm": 5.203378677368164, + "learning_rate": 4.496985448823352e-05, + "loss": 0.7799, + "step": 34140 + }, + { + "epoch": 0.3018971339663007, + "grad_norm": 5.792718410491943, + "learning_rate": 4.496838110056166e-05, + "loss": 0.7584, + "step": 34150 + }, + { + "epoch": 0.3019855372266129, + "grad_norm": 9.333540916442871, + "learning_rate": 4.496690771288979e-05, + "loss": 0.5008, + "step": 34160 + }, + { + "epoch": 0.30207394048692515, + "grad_norm": 2.0677549839019775, + "learning_rate": 4.4965434325217915e-05, + "loss": 0.8411, + "step": 34170 + }, + { + "epoch": 0.3021623437472374, + "grad_norm": 3.623750925064087, + "learning_rate": 4.496396093754604e-05, + "loss": 0.7233, + "step": 34180 + }, + { + "epoch": 0.3022507470075496, + "grad_norm": 3.4018473625183105, + "learning_rate": 4.496248754987418e-05, + "loss": 0.6843, + "step": 34190 + }, + { + "epoch": 0.3023391502678619, + "grad_norm": 7.940188407897949, + "learning_rate": 4.49610141622023e-05, + "loss": 0.8278, + "step": 34200 + }, + { + "epoch": 0.30242755352817413, + "grad_norm": 4.774045944213867, + "learning_rate": 4.4959540774530435e-05, + "loss": 0.6959, + "step": 34210 + }, + { + "epoch": 0.30251595678848636, + "grad_norm": 1.9966486692428589, + "learning_rate": 4.4958067386858564e-05, + "loss": 0.6778, + "step": 34220 + }, + { + "epoch": 0.3026043600487986, + "grad_norm": 3.796740770339966, + "learning_rate": 4.495659399918669e-05, + "loss": 0.8041, + "step": 34230 + }, + { + "epoch": 0.3026927633091108, + "grad_norm": 9.700430870056152, + "learning_rate": 4.495512061151482e-05, + "loss": 0.8319, + "step": 34240 + }, + { + "epoch": 0.30278116656942305, + "grad_norm": 2.1992483139038086, + "learning_rate": 4.495364722384295e-05, + "loss": 0.8348, + "step": 34250 + }, + { + "epoch": 0.30286956982973534, + "grad_norm": 2.6925811767578125, + "learning_rate": 4.495217383617108e-05, + "loss": 0.7807, + "step": 34260 + }, + { + "epoch": 0.30295797309004757, + "grad_norm": 4.500965595245361, + "learning_rate": 4.495070044849921e-05, + "loss": 0.7108, + "step": 34270 + }, + { + "epoch": 0.3030463763503598, + "grad_norm": 5.057172775268555, + "learning_rate": 4.4949227060827334e-05, + "loss": 0.8005, + "step": 34280 + }, + { + "epoch": 0.30313477961067203, + "grad_norm": 5.226869583129883, + "learning_rate": 4.494775367315547e-05, + "loss": 0.5941, + "step": 34290 + }, + { + "epoch": 0.30322318287098426, + "grad_norm": 2.1849160194396973, + "learning_rate": 4.49462802854836e-05, + "loss": 0.7457, + "step": 34300 + }, + { + "epoch": 0.3033115861312965, + "grad_norm": 3.0871520042419434, + "learning_rate": 4.4944806897811726e-05, + "loss": 0.7671, + "step": 34310 + }, + { + "epoch": 0.3033999893916088, + "grad_norm": 4.777992248535156, + "learning_rate": 4.4943333510139854e-05, + "loss": 0.7081, + "step": 34320 + }, + { + "epoch": 0.303488392651921, + "grad_norm": 1.8558039665222168, + "learning_rate": 4.494186012246799e-05, + "loss": 0.6043, + "step": 34330 + }, + { + "epoch": 0.30357679591223324, + "grad_norm": 11.023305892944336, + "learning_rate": 4.494038673479611e-05, + "loss": 0.8404, + "step": 34340 + }, + { + "epoch": 0.3036651991725455, + "grad_norm": 4.605042457580566, + "learning_rate": 4.4938913347124246e-05, + "loss": 0.7625, + "step": 34350 + }, + { + "epoch": 0.3037536024328577, + "grad_norm": 2.363144636154175, + "learning_rate": 4.4937439959452374e-05, + "loss": 0.6667, + "step": 34360 + }, + { + "epoch": 0.30384200569316994, + "grad_norm": 26.961488723754883, + "learning_rate": 4.49359665717805e-05, + "loss": 0.8653, + "step": 34370 + }, + { + "epoch": 0.3039304089534822, + "grad_norm": 2.402686595916748, + "learning_rate": 4.493449318410863e-05, + "loss": 0.6584, + "step": 34380 + }, + { + "epoch": 0.30401881221379445, + "grad_norm": 4.610472679138184, + "learning_rate": 4.493301979643676e-05, + "loss": 0.6445, + "step": 34390 + }, + { + "epoch": 0.3041072154741067, + "grad_norm": 1.5059280395507812, + "learning_rate": 4.493154640876489e-05, + "loss": 0.7671, + "step": 34400 + }, + { + "epoch": 0.3041956187344189, + "grad_norm": 3.2649242877960205, + "learning_rate": 4.493007302109302e-05, + "loss": 0.656, + "step": 34410 + }, + { + "epoch": 0.30428402199473115, + "grad_norm": 6.003420352935791, + "learning_rate": 4.492859963342115e-05, + "loss": 0.7645, + "step": 34420 + }, + { + "epoch": 0.30437242525504343, + "grad_norm": 6.091169834136963, + "learning_rate": 4.492712624574928e-05, + "loss": 0.7411, + "step": 34430 + }, + { + "epoch": 0.30446082851535566, + "grad_norm": 3.928722620010376, + "learning_rate": 4.492565285807741e-05, + "loss": 0.8999, + "step": 34440 + }, + { + "epoch": 0.3045492317756679, + "grad_norm": 4.167619228363037, + "learning_rate": 4.4924179470405536e-05, + "loss": 0.8574, + "step": 34450 + }, + { + "epoch": 0.3046376350359801, + "grad_norm": 2.832871675491333, + "learning_rate": 4.4922706082733664e-05, + "loss": 0.7686, + "step": 34460 + }, + { + "epoch": 0.30472603829629236, + "grad_norm": 2.916842222213745, + "learning_rate": 4.492123269506179e-05, + "loss": 0.6476, + "step": 34470 + }, + { + "epoch": 0.3048144415566046, + "grad_norm": 2.728239059448242, + "learning_rate": 4.491975930738993e-05, + "loss": 0.7421, + "step": 34480 + }, + { + "epoch": 0.3049028448169169, + "grad_norm": 7.834898471832275, + "learning_rate": 4.4918285919718056e-05, + "loss": 0.8405, + "step": 34490 + }, + { + "epoch": 0.3049912480772291, + "grad_norm": 3.9580485820770264, + "learning_rate": 4.4916812532046185e-05, + "loss": 0.7717, + "step": 34500 + }, + { + "epoch": 0.30507965133754134, + "grad_norm": 7.409971714019775, + "learning_rate": 4.491533914437431e-05, + "loss": 0.6351, + "step": 34510 + }, + { + "epoch": 0.30516805459785357, + "grad_norm": 9.813450813293457, + "learning_rate": 4.491386575670244e-05, + "loss": 0.625, + "step": 34520 + }, + { + "epoch": 0.3052564578581658, + "grad_norm": 4.363447666168213, + "learning_rate": 4.491239236903057e-05, + "loss": 0.8406, + "step": 34530 + }, + { + "epoch": 0.30534486111847803, + "grad_norm": 13.420183181762695, + "learning_rate": 4.4910918981358705e-05, + "loss": 0.6346, + "step": 34540 + }, + { + "epoch": 0.3054332643787903, + "grad_norm": 2.382375717163086, + "learning_rate": 4.490944559368683e-05, + "loss": 0.6244, + "step": 34550 + }, + { + "epoch": 0.30552166763910255, + "grad_norm": 3.6090288162231445, + "learning_rate": 4.490797220601496e-05, + "loss": 0.7077, + "step": 34560 + }, + { + "epoch": 0.3056100708994148, + "grad_norm": 5.673571586608887, + "learning_rate": 4.490649881834309e-05, + "loss": 0.6857, + "step": 34570 + }, + { + "epoch": 0.305698474159727, + "grad_norm": 18.463489532470703, + "learning_rate": 4.490502543067122e-05, + "loss": 0.6403, + "step": 34580 + }, + { + "epoch": 0.30578687742003924, + "grad_norm": 3.3048393726348877, + "learning_rate": 4.4903552042999347e-05, + "loss": 0.7948, + "step": 34590 + }, + { + "epoch": 0.30587528068035147, + "grad_norm": 4.157071113586426, + "learning_rate": 4.490207865532748e-05, + "loss": 0.7781, + "step": 34600 + }, + { + "epoch": 0.30596368394066376, + "grad_norm": 3.253995180130005, + "learning_rate": 4.49006052676556e-05, + "loss": 0.7411, + "step": 34610 + }, + { + "epoch": 0.306052087200976, + "grad_norm": 7.761780261993408, + "learning_rate": 4.489913187998374e-05, + "loss": 0.7864, + "step": 34620 + }, + { + "epoch": 0.3061404904612882, + "grad_norm": 5.318267822265625, + "learning_rate": 4.489765849231187e-05, + "loss": 0.8214, + "step": 34630 + }, + { + "epoch": 0.30622889372160045, + "grad_norm": 1.8405423164367676, + "learning_rate": 4.4896185104639995e-05, + "loss": 0.6918, + "step": 34640 + }, + { + "epoch": 0.3063172969819127, + "grad_norm": 3.742891788482666, + "learning_rate": 4.4894711716968123e-05, + "loss": 0.7683, + "step": 34650 + }, + { + "epoch": 0.3064057002422249, + "grad_norm": 7.632383346557617, + "learning_rate": 4.489323832929626e-05, + "loss": 0.8693, + "step": 34660 + }, + { + "epoch": 0.3064941035025372, + "grad_norm": 4.455667972564697, + "learning_rate": 4.489176494162438e-05, + "loss": 0.7577, + "step": 34670 + }, + { + "epoch": 0.3065825067628494, + "grad_norm": 7.343852996826172, + "learning_rate": 4.4890291553952515e-05, + "loss": 0.6562, + "step": 34680 + }, + { + "epoch": 0.30667091002316166, + "grad_norm": 3.7397708892822266, + "learning_rate": 4.4888818166280644e-05, + "loss": 0.7118, + "step": 34690 + }, + { + "epoch": 0.3067593132834739, + "grad_norm": 2.8678243160247803, + "learning_rate": 4.488734477860877e-05, + "loss": 0.6978, + "step": 34700 + }, + { + "epoch": 0.3068477165437861, + "grad_norm": 5.855990409851074, + "learning_rate": 4.48858713909369e-05, + "loss": 0.6802, + "step": 34710 + }, + { + "epoch": 0.30693611980409835, + "grad_norm": 4.8320112228393555, + "learning_rate": 4.488439800326503e-05, + "loss": 0.6886, + "step": 34720 + }, + { + "epoch": 0.30702452306441064, + "grad_norm": 4.854421138763428, + "learning_rate": 4.488292461559316e-05, + "loss": 0.8148, + "step": 34730 + }, + { + "epoch": 0.30711292632472287, + "grad_norm": 4.3127055168151855, + "learning_rate": 4.488145122792129e-05, + "loss": 0.8299, + "step": 34740 + }, + { + "epoch": 0.3072013295850351, + "grad_norm": 3.706496477127075, + "learning_rate": 4.4879977840249414e-05, + "loss": 0.6845, + "step": 34750 + }, + { + "epoch": 0.30728973284534733, + "grad_norm": 3.475883960723877, + "learning_rate": 4.487850445257755e-05, + "loss": 0.7979, + "step": 34760 + }, + { + "epoch": 0.30737813610565956, + "grad_norm": 4.517330169677734, + "learning_rate": 4.487703106490568e-05, + "loss": 0.651, + "step": 34770 + }, + { + "epoch": 0.3074665393659718, + "grad_norm": 9.706879615783691, + "learning_rate": 4.4875557677233806e-05, + "loss": 0.8229, + "step": 34780 + }, + { + "epoch": 0.3075549426262841, + "grad_norm": 2.5399911403656006, + "learning_rate": 4.4874084289561934e-05, + "loss": 0.7164, + "step": 34790 + }, + { + "epoch": 0.3076433458865963, + "grad_norm": 8.67696762084961, + "learning_rate": 4.487261090189007e-05, + "loss": 0.7514, + "step": 34800 + }, + { + "epoch": 0.30773174914690854, + "grad_norm": 3.7912492752075195, + "learning_rate": 4.487113751421819e-05, + "loss": 0.8312, + "step": 34810 + }, + { + "epoch": 0.30782015240722077, + "grad_norm": 5.7773356437683105, + "learning_rate": 4.4869664126546326e-05, + "loss": 0.8123, + "step": 34820 + }, + { + "epoch": 0.307908555667533, + "grad_norm": 4.9688639640808105, + "learning_rate": 4.486819073887445e-05, + "loss": 0.7679, + "step": 34830 + }, + { + "epoch": 0.30799695892784523, + "grad_norm": 2.416210889816284, + "learning_rate": 4.486671735120258e-05, + "loss": 0.8568, + "step": 34840 + }, + { + "epoch": 0.3080853621881575, + "grad_norm": 5.57459831237793, + "learning_rate": 4.486524396353071e-05, + "loss": 0.621, + "step": 34850 + }, + { + "epoch": 0.30817376544846975, + "grad_norm": 8.48627758026123, + "learning_rate": 4.486377057585884e-05, + "loss": 0.7955, + "step": 34860 + }, + { + "epoch": 0.308262168708782, + "grad_norm": 3.1546690464019775, + "learning_rate": 4.486229718818697e-05, + "loss": 0.8805, + "step": 34870 + }, + { + "epoch": 0.3083505719690942, + "grad_norm": 5.674517631530762, + "learning_rate": 4.48608238005151e-05, + "loss": 0.7785, + "step": 34880 + }, + { + "epoch": 0.30843897522940644, + "grad_norm": 4.3803815841674805, + "learning_rate": 4.4859350412843224e-05, + "loss": 0.7112, + "step": 34890 + }, + { + "epoch": 0.3085273784897187, + "grad_norm": 3.8419272899627686, + "learning_rate": 4.485787702517136e-05, + "loss": 0.787, + "step": 34900 + }, + { + "epoch": 0.30861578175003096, + "grad_norm": 2.115967273712158, + "learning_rate": 4.485640363749949e-05, + "loss": 0.7202, + "step": 34910 + }, + { + "epoch": 0.3087041850103432, + "grad_norm": 2.223862409591675, + "learning_rate": 4.4854930249827616e-05, + "loss": 0.6737, + "step": 34920 + }, + { + "epoch": 0.3087925882706554, + "grad_norm": 3.2410547733306885, + "learning_rate": 4.4853456862155744e-05, + "loss": 0.8091, + "step": 34930 + }, + { + "epoch": 0.30888099153096765, + "grad_norm": 5.367846488952637, + "learning_rate": 4.485198347448387e-05, + "loss": 0.7883, + "step": 34940 + }, + { + "epoch": 0.3089693947912799, + "grad_norm": 4.547714710235596, + "learning_rate": 4.4850510086812e-05, + "loss": 0.7055, + "step": 34950 + }, + { + "epoch": 0.30905779805159217, + "grad_norm": 5.68997049331665, + "learning_rate": 4.4849036699140136e-05, + "loss": 0.6594, + "step": 34960 + }, + { + "epoch": 0.3091462013119044, + "grad_norm": 2.666247844696045, + "learning_rate": 4.484756331146826e-05, + "loss": 0.7091, + "step": 34970 + }, + { + "epoch": 0.30923460457221663, + "grad_norm": 5.0559983253479, + "learning_rate": 4.484608992379639e-05, + "loss": 0.7026, + "step": 34980 + }, + { + "epoch": 0.30932300783252886, + "grad_norm": 1.8135817050933838, + "learning_rate": 4.484461653612452e-05, + "loss": 0.6423, + "step": 34990 + }, + { + "epoch": 0.3094114110928411, + "grad_norm": 7.472788333892822, + "learning_rate": 4.484314314845265e-05, + "loss": 0.6638, + "step": 35000 + }, + { + "epoch": 0.3094998143531533, + "grad_norm": 3.6537375450134277, + "learning_rate": 4.484166976078078e-05, + "loss": 0.7728, + "step": 35010 + }, + { + "epoch": 0.3095882176134656, + "grad_norm": 4.108083724975586, + "learning_rate": 4.484019637310891e-05, + "loss": 0.7638, + "step": 35020 + }, + { + "epoch": 0.30967662087377784, + "grad_norm": 3.3913795948028564, + "learning_rate": 4.4838722985437035e-05, + "loss": 0.6888, + "step": 35030 + }, + { + "epoch": 0.3097650241340901, + "grad_norm": 6.567787170410156, + "learning_rate": 4.483724959776517e-05, + "loss": 0.6654, + "step": 35040 + }, + { + "epoch": 0.3098534273944023, + "grad_norm": 5.884479522705078, + "learning_rate": 4.483577621009329e-05, + "loss": 0.7992, + "step": 35050 + }, + { + "epoch": 0.30994183065471453, + "grad_norm": 2.6067841053009033, + "learning_rate": 4.483430282242143e-05, + "loss": 0.8537, + "step": 35060 + }, + { + "epoch": 0.31003023391502676, + "grad_norm": 1.8995705842971802, + "learning_rate": 4.4832829434749555e-05, + "loss": 0.6798, + "step": 35070 + }, + { + "epoch": 0.31011863717533905, + "grad_norm": 6.674327850341797, + "learning_rate": 4.483135604707768e-05, + "loss": 0.6286, + "step": 35080 + }, + { + "epoch": 0.3102070404356513, + "grad_norm": 2.3585987091064453, + "learning_rate": 4.482988265940581e-05, + "loss": 0.8528, + "step": 35090 + }, + { + "epoch": 0.3102954436959635, + "grad_norm": 2.9615836143493652, + "learning_rate": 4.482840927173395e-05, + "loss": 0.7392, + "step": 35100 + }, + { + "epoch": 0.31038384695627574, + "grad_norm": 10.153748512268066, + "learning_rate": 4.482693588406207e-05, + "loss": 0.7166, + "step": 35110 + }, + { + "epoch": 0.310472250216588, + "grad_norm": 2.615586042404175, + "learning_rate": 4.4825462496390204e-05, + "loss": 0.7242, + "step": 35120 + }, + { + "epoch": 0.3105606534769002, + "grad_norm": 9.03085708618164, + "learning_rate": 4.482398910871833e-05, + "loss": 0.6931, + "step": 35130 + }, + { + "epoch": 0.3106490567372125, + "grad_norm": 2.5908732414245605, + "learning_rate": 4.482251572104646e-05, + "loss": 0.8251, + "step": 35140 + }, + { + "epoch": 0.3107374599975247, + "grad_norm": 2.6006767749786377, + "learning_rate": 4.482104233337459e-05, + "loss": 0.7147, + "step": 35150 + }, + { + "epoch": 0.31082586325783695, + "grad_norm": 3.929838180541992, + "learning_rate": 4.4819568945702724e-05, + "loss": 0.7676, + "step": 35160 + }, + { + "epoch": 0.3109142665181492, + "grad_norm": 4.024363040924072, + "learning_rate": 4.4818095558030845e-05, + "loss": 0.8565, + "step": 35170 + }, + { + "epoch": 0.3110026697784614, + "grad_norm": 3.61643385887146, + "learning_rate": 4.481662217035898e-05, + "loss": 0.7725, + "step": 35180 + }, + { + "epoch": 0.31109107303877365, + "grad_norm": 11.784936904907227, + "learning_rate": 4.48151487826871e-05, + "loss": 0.76, + "step": 35190 + }, + { + "epoch": 0.31117947629908593, + "grad_norm": 3.1878347396850586, + "learning_rate": 4.481367539501524e-05, + "loss": 0.8009, + "step": 35200 + }, + { + "epoch": 0.31126787955939816, + "grad_norm": 1.882789969444275, + "learning_rate": 4.4812202007343365e-05, + "loss": 0.7287, + "step": 35210 + }, + { + "epoch": 0.3113562828197104, + "grad_norm": 3.1435670852661133, + "learning_rate": 4.4810728619671494e-05, + "loss": 0.6436, + "step": 35220 + }, + { + "epoch": 0.3114446860800226, + "grad_norm": 7.801337718963623, + "learning_rate": 4.480925523199962e-05, + "loss": 0.7324, + "step": 35230 + }, + { + "epoch": 0.31153308934033486, + "grad_norm": 3.7233738899230957, + "learning_rate": 4.480778184432776e-05, + "loss": 0.7875, + "step": 35240 + }, + { + "epoch": 0.3116214926006471, + "grad_norm": 7.069220066070557, + "learning_rate": 4.480630845665588e-05, + "loss": 0.7657, + "step": 35250 + }, + { + "epoch": 0.3117098958609594, + "grad_norm": 1.9100210666656494, + "learning_rate": 4.4804835068984014e-05, + "loss": 0.7953, + "step": 35260 + }, + { + "epoch": 0.3117982991212716, + "grad_norm": 3.015167713165283, + "learning_rate": 4.480336168131214e-05, + "loss": 0.8179, + "step": 35270 + }, + { + "epoch": 0.31188670238158384, + "grad_norm": 3.5229954719543457, + "learning_rate": 4.480188829364027e-05, + "loss": 0.8366, + "step": 35280 + }, + { + "epoch": 0.31197510564189607, + "grad_norm": 2.8934969902038574, + "learning_rate": 4.48004149059684e-05, + "loss": 0.6987, + "step": 35290 + }, + { + "epoch": 0.3120635089022083, + "grad_norm": 2.1959846019744873, + "learning_rate": 4.479894151829653e-05, + "loss": 0.7815, + "step": 35300 + }, + { + "epoch": 0.31215191216252053, + "grad_norm": 2.49710750579834, + "learning_rate": 4.4797468130624656e-05, + "loss": 0.6647, + "step": 35310 + }, + { + "epoch": 0.3122403154228328, + "grad_norm": 2.9548497200012207, + "learning_rate": 4.479599474295279e-05, + "loss": 0.7203, + "step": 35320 + }, + { + "epoch": 0.31232871868314505, + "grad_norm": 3.4954943656921387, + "learning_rate": 4.479452135528092e-05, + "loss": 0.7129, + "step": 35330 + }, + { + "epoch": 0.3124171219434573, + "grad_norm": 7.048424243927002, + "learning_rate": 4.479304796760905e-05, + "loss": 0.6283, + "step": 35340 + }, + { + "epoch": 0.3125055252037695, + "grad_norm": 1.6082574129104614, + "learning_rate": 4.4791574579937176e-05, + "loss": 0.77, + "step": 35350 + }, + { + "epoch": 0.31259392846408174, + "grad_norm": 9.34705924987793, + "learning_rate": 4.4790101192265304e-05, + "loss": 0.7601, + "step": 35360 + }, + { + "epoch": 0.31268233172439397, + "grad_norm": 10.420363426208496, + "learning_rate": 4.478862780459343e-05, + "loss": 0.7167, + "step": 35370 + }, + { + "epoch": 0.31277073498470626, + "grad_norm": 4.788998603820801, + "learning_rate": 4.478715441692157e-05, + "loss": 0.6051, + "step": 35380 + }, + { + "epoch": 0.3128591382450185, + "grad_norm": 3.5290868282318115, + "learning_rate": 4.4785681029249696e-05, + "loss": 0.76, + "step": 35390 + }, + { + "epoch": 0.3129475415053307, + "grad_norm": 7.129870414733887, + "learning_rate": 4.4784207641577825e-05, + "loss": 0.6829, + "step": 35400 + }, + { + "epoch": 0.31303594476564295, + "grad_norm": 2.5844061374664307, + "learning_rate": 4.478273425390595e-05, + "loss": 0.7663, + "step": 35410 + }, + { + "epoch": 0.3131243480259552, + "grad_norm": 2.622905969619751, + "learning_rate": 4.478126086623408e-05, + "loss": 0.7099, + "step": 35420 + }, + { + "epoch": 0.3132127512862674, + "grad_norm": 2.8615331649780273, + "learning_rate": 4.477978747856221e-05, + "loss": 0.7074, + "step": 35430 + }, + { + "epoch": 0.3133011545465797, + "grad_norm": 2.793071985244751, + "learning_rate": 4.477831409089034e-05, + "loss": 0.6503, + "step": 35440 + }, + { + "epoch": 0.3133895578068919, + "grad_norm": 3.404242992401123, + "learning_rate": 4.477684070321847e-05, + "loss": 0.6747, + "step": 35450 + }, + { + "epoch": 0.31347796106720416, + "grad_norm": 3.489241600036621, + "learning_rate": 4.47753673155466e-05, + "loss": 0.6033, + "step": 35460 + }, + { + "epoch": 0.3135663643275164, + "grad_norm": 10.060763359069824, + "learning_rate": 4.477389392787473e-05, + "loss": 0.729, + "step": 35470 + }, + { + "epoch": 0.3136547675878286, + "grad_norm": 6.695730209350586, + "learning_rate": 4.477242054020286e-05, + "loss": 0.6457, + "step": 35480 + }, + { + "epoch": 0.3137431708481409, + "grad_norm": 7.306407451629639, + "learning_rate": 4.4770947152530986e-05, + "loss": 0.7439, + "step": 35490 + }, + { + "epoch": 0.31383157410845314, + "grad_norm": 10.030495643615723, + "learning_rate": 4.4769473764859115e-05, + "loss": 0.6102, + "step": 35500 + }, + { + "epoch": 0.31391997736876537, + "grad_norm": 1.6690642833709717, + "learning_rate": 4.476800037718725e-05, + "loss": 0.6725, + "step": 35510 + }, + { + "epoch": 0.3140083806290776, + "grad_norm": 4.747469425201416, + "learning_rate": 4.476652698951537e-05, + "loss": 0.7041, + "step": 35520 + }, + { + "epoch": 0.31409678388938983, + "grad_norm": 2.086782455444336, + "learning_rate": 4.476505360184351e-05, + "loss": 0.7764, + "step": 35530 + }, + { + "epoch": 0.31418518714970206, + "grad_norm": 2.241271495819092, + "learning_rate": 4.4763580214171635e-05, + "loss": 0.6617, + "step": 35540 + }, + { + "epoch": 0.31427359041001435, + "grad_norm": 4.012430191040039, + "learning_rate": 4.4762106826499763e-05, + "loss": 0.7873, + "step": 35550 + }, + { + "epoch": 0.3143619936703266, + "grad_norm": 12.640499114990234, + "learning_rate": 4.476063343882789e-05, + "loss": 0.7215, + "step": 35560 + }, + { + "epoch": 0.3144503969306388, + "grad_norm": 5.47008752822876, + "learning_rate": 4.475916005115603e-05, + "loss": 0.7184, + "step": 35570 + }, + { + "epoch": 0.31453880019095104, + "grad_norm": 4.258671283721924, + "learning_rate": 4.475768666348415e-05, + "loss": 0.7131, + "step": 35580 + }, + { + "epoch": 0.31462720345126327, + "grad_norm": 10.900259971618652, + "learning_rate": 4.4756213275812284e-05, + "loss": 0.6627, + "step": 35590 + }, + { + "epoch": 0.3147156067115755, + "grad_norm": 7.8930983543396, + "learning_rate": 4.475473988814041e-05, + "loss": 0.7873, + "step": 35600 + }, + { + "epoch": 0.3148040099718878, + "grad_norm": 1.966615915298462, + "learning_rate": 4.475326650046854e-05, + "loss": 0.7197, + "step": 35610 + }, + { + "epoch": 0.3148924132322, + "grad_norm": 3.239475965499878, + "learning_rate": 4.475179311279667e-05, + "loss": 0.826, + "step": 35620 + }, + { + "epoch": 0.31498081649251225, + "grad_norm": 8.250494956970215, + "learning_rate": 4.4750319725124804e-05, + "loss": 0.7778, + "step": 35630 + }, + { + "epoch": 0.3150692197528245, + "grad_norm": 2.4526751041412354, + "learning_rate": 4.4748846337452925e-05, + "loss": 0.6566, + "step": 35640 + }, + { + "epoch": 0.3151576230131367, + "grad_norm": 7.8469109535217285, + "learning_rate": 4.474737294978106e-05, + "loss": 0.6996, + "step": 35650 + }, + { + "epoch": 0.31524602627344894, + "grad_norm": 2.9463260173797607, + "learning_rate": 4.474589956210918e-05, + "loss": 0.6945, + "step": 35660 + }, + { + "epoch": 0.31533442953376123, + "grad_norm": 4.764516353607178, + "learning_rate": 4.474442617443732e-05, + "loss": 0.7386, + "step": 35670 + }, + { + "epoch": 0.31542283279407346, + "grad_norm": 8.156478881835938, + "learning_rate": 4.4742952786765446e-05, + "loss": 0.6639, + "step": 35680 + }, + { + "epoch": 0.3155112360543857, + "grad_norm": 6.431391716003418, + "learning_rate": 4.4741479399093574e-05, + "loss": 0.8414, + "step": 35690 + }, + { + "epoch": 0.3155996393146979, + "grad_norm": 2.5973117351531982, + "learning_rate": 4.47400060114217e-05, + "loss": 0.7268, + "step": 35700 + }, + { + "epoch": 0.31568804257501015, + "grad_norm": 2.5390853881835938, + "learning_rate": 4.473853262374984e-05, + "loss": 0.9046, + "step": 35710 + }, + { + "epoch": 0.3157764458353224, + "grad_norm": 2.5553884506225586, + "learning_rate": 4.473705923607796e-05, + "loss": 0.7369, + "step": 35720 + }, + { + "epoch": 0.31586484909563467, + "grad_norm": 5.37067985534668, + "learning_rate": 4.4735585848406094e-05, + "loss": 0.7933, + "step": 35730 + }, + { + "epoch": 0.3159532523559469, + "grad_norm": 2.2314293384552, + "learning_rate": 4.473411246073422e-05, + "loss": 0.7073, + "step": 35740 + }, + { + "epoch": 0.31604165561625913, + "grad_norm": 4.727914810180664, + "learning_rate": 4.473263907306235e-05, + "loss": 0.6766, + "step": 35750 + }, + { + "epoch": 0.31613005887657136, + "grad_norm": 3.5170843601226807, + "learning_rate": 4.473116568539048e-05, + "loss": 0.9076, + "step": 35760 + }, + { + "epoch": 0.3162184621368836, + "grad_norm": 3.3607208728790283, + "learning_rate": 4.472969229771861e-05, + "loss": 0.6969, + "step": 35770 + }, + { + "epoch": 0.3163068653971958, + "grad_norm": 3.736297607421875, + "learning_rate": 4.4728218910046736e-05, + "loss": 0.6165, + "step": 35780 + }, + { + "epoch": 0.3163952686575081, + "grad_norm": 2.551983594894409, + "learning_rate": 4.472674552237487e-05, + "loss": 0.6292, + "step": 35790 + }, + { + "epoch": 0.31648367191782034, + "grad_norm": 7.082693099975586, + "learning_rate": 4.472527213470299e-05, + "loss": 0.8233, + "step": 35800 + }, + { + "epoch": 0.3165720751781326, + "grad_norm": 1.9834364652633667, + "learning_rate": 4.472379874703113e-05, + "loss": 0.6516, + "step": 35810 + }, + { + "epoch": 0.3166604784384448, + "grad_norm": 3.8084793090820312, + "learning_rate": 4.4722325359359256e-05, + "loss": 0.7678, + "step": 35820 + }, + { + "epoch": 0.31674888169875703, + "grad_norm": 5.054144859313965, + "learning_rate": 4.4720851971687384e-05, + "loss": 0.7355, + "step": 35830 + }, + { + "epoch": 0.31683728495906927, + "grad_norm": 3.4051175117492676, + "learning_rate": 4.471937858401551e-05, + "loss": 0.6345, + "step": 35840 + }, + { + "epoch": 0.31692568821938155, + "grad_norm": 5.139364242553711, + "learning_rate": 4.471790519634365e-05, + "loss": 0.7938, + "step": 35850 + }, + { + "epoch": 0.3170140914796938, + "grad_norm": 4.080883502960205, + "learning_rate": 4.471643180867177e-05, + "loss": 0.7778, + "step": 35860 + }, + { + "epoch": 0.317102494740006, + "grad_norm": 3.567744016647339, + "learning_rate": 4.4714958420999905e-05, + "loss": 0.672, + "step": 35870 + }, + { + "epoch": 0.31719089800031824, + "grad_norm": 6.6573286056518555, + "learning_rate": 4.4713485033328026e-05, + "loss": 0.7922, + "step": 35880 + }, + { + "epoch": 0.3172793012606305, + "grad_norm": 3.859511375427246, + "learning_rate": 4.471201164565616e-05, + "loss": 0.8442, + "step": 35890 + }, + { + "epoch": 0.3173677045209427, + "grad_norm": 1.7332161664962769, + "learning_rate": 4.471053825798429e-05, + "loss": 0.6553, + "step": 35900 + }, + { + "epoch": 0.317456107781255, + "grad_norm": 1.9279416799545288, + "learning_rate": 4.470906487031242e-05, + "loss": 0.772, + "step": 35910 + }, + { + "epoch": 0.3175445110415672, + "grad_norm": 1.5226647853851318, + "learning_rate": 4.4707591482640546e-05, + "loss": 0.7767, + "step": 35920 + }, + { + "epoch": 0.31763291430187945, + "grad_norm": 3.45474910736084, + "learning_rate": 4.470611809496868e-05, + "loss": 0.728, + "step": 35930 + }, + { + "epoch": 0.3177213175621917, + "grad_norm": 5.9742913246154785, + "learning_rate": 4.47046447072968e-05, + "loss": 0.8092, + "step": 35940 + }, + { + "epoch": 0.3178097208225039, + "grad_norm": 1.7873629331588745, + "learning_rate": 4.470317131962494e-05, + "loss": 0.6293, + "step": 35950 + }, + { + "epoch": 0.31789812408281615, + "grad_norm": 2.174562454223633, + "learning_rate": 4.4701697931953067e-05, + "loss": 0.7105, + "step": 35960 + }, + { + "epoch": 0.31798652734312843, + "grad_norm": 4.899603843688965, + "learning_rate": 4.4700224544281195e-05, + "loss": 0.7141, + "step": 35970 + }, + { + "epoch": 0.31807493060344066, + "grad_norm": 10.002727508544922, + "learning_rate": 4.469875115660932e-05, + "loss": 0.8434, + "step": 35980 + }, + { + "epoch": 0.3181633338637529, + "grad_norm": 4.238052845001221, + "learning_rate": 4.469727776893745e-05, + "loss": 0.7131, + "step": 35990 + }, + { + "epoch": 0.3182517371240651, + "grad_norm": 3.539936065673828, + "learning_rate": 4.469580438126558e-05, + "loss": 0.8386, + "step": 36000 + }, + { + "epoch": 0.31834014038437736, + "grad_norm": 12.063549041748047, + "learning_rate": 4.4694330993593715e-05, + "loss": 0.6592, + "step": 36010 + }, + { + "epoch": 0.31842854364468964, + "grad_norm": 2.9906394481658936, + "learning_rate": 4.469285760592184e-05, + "loss": 0.7182, + "step": 36020 + }, + { + "epoch": 0.3185169469050019, + "grad_norm": 1.3524519205093384, + "learning_rate": 4.469138421824997e-05, + "loss": 0.7373, + "step": 36030 + }, + { + "epoch": 0.3186053501653141, + "grad_norm": 1.7572444677352905, + "learning_rate": 4.46899108305781e-05, + "loss": 0.7276, + "step": 36040 + }, + { + "epoch": 0.31869375342562634, + "grad_norm": 5.897258281707764, + "learning_rate": 4.468843744290623e-05, + "loss": 0.7438, + "step": 36050 + }, + { + "epoch": 0.31878215668593857, + "grad_norm": 4.364428520202637, + "learning_rate": 4.468696405523436e-05, + "loss": 0.686, + "step": 36060 + }, + { + "epoch": 0.3188705599462508, + "grad_norm": 4.301984786987305, + "learning_rate": 4.468549066756249e-05, + "loss": 0.6266, + "step": 36070 + }, + { + "epoch": 0.3189589632065631, + "grad_norm": 2.856872081756592, + "learning_rate": 4.4684017279890614e-05, + "loss": 0.8214, + "step": 36080 + }, + { + "epoch": 0.3190473664668753, + "grad_norm": 2.6192708015441895, + "learning_rate": 4.468254389221875e-05, + "loss": 0.6855, + "step": 36090 + }, + { + "epoch": 0.31913576972718755, + "grad_norm": 2.8822391033172607, + "learning_rate": 4.468107050454688e-05, + "loss": 0.7082, + "step": 36100 + }, + { + "epoch": 0.3192241729874998, + "grad_norm": 19.16712760925293, + "learning_rate": 4.4679597116875005e-05, + "loss": 0.7894, + "step": 36110 + }, + { + "epoch": 0.319312576247812, + "grad_norm": 7.504942893981934, + "learning_rate": 4.4678123729203134e-05, + "loss": 0.7222, + "step": 36120 + }, + { + "epoch": 0.31940097950812424, + "grad_norm": 4.075170993804932, + "learning_rate": 4.467665034153126e-05, + "loss": 0.8002, + "step": 36130 + }, + { + "epoch": 0.3194893827684365, + "grad_norm": 2.7567331790924072, + "learning_rate": 4.467517695385939e-05, + "loss": 0.7948, + "step": 36140 + }, + { + "epoch": 0.31957778602874876, + "grad_norm": 1.9092066287994385, + "learning_rate": 4.4673703566187526e-05, + "loss": 0.6888, + "step": 36150 + }, + { + "epoch": 0.319666189289061, + "grad_norm": 7.898533821105957, + "learning_rate": 4.467223017851565e-05, + "loss": 0.7943, + "step": 36160 + }, + { + "epoch": 0.3197545925493732, + "grad_norm": 6.137923717498779, + "learning_rate": 4.467075679084378e-05, + "loss": 0.8553, + "step": 36170 + }, + { + "epoch": 0.31984299580968545, + "grad_norm": 2.9837148189544678, + "learning_rate": 4.466928340317191e-05, + "loss": 0.8759, + "step": 36180 + }, + { + "epoch": 0.3199313990699977, + "grad_norm": 3.2219789028167725, + "learning_rate": 4.466781001550004e-05, + "loss": 0.7754, + "step": 36190 + }, + { + "epoch": 0.32001980233030997, + "grad_norm": 6.509387493133545, + "learning_rate": 4.466633662782817e-05, + "loss": 0.6833, + "step": 36200 + }, + { + "epoch": 0.3201082055906222, + "grad_norm": 9.746037483215332, + "learning_rate": 4.46648632401563e-05, + "loss": 0.8367, + "step": 36210 + }, + { + "epoch": 0.3201966088509344, + "grad_norm": 4.755408763885498, + "learning_rate": 4.4663389852484424e-05, + "loss": 0.6208, + "step": 36220 + }, + { + "epoch": 0.32028501211124666, + "grad_norm": 3.9872827529907227, + "learning_rate": 4.466191646481256e-05, + "loss": 0.7711, + "step": 36230 + }, + { + "epoch": 0.3203734153715589, + "grad_norm": 3.089550256729126, + "learning_rate": 4.466044307714069e-05, + "loss": 0.7777, + "step": 36240 + }, + { + "epoch": 0.3204618186318711, + "grad_norm": 5.104980945587158, + "learning_rate": 4.4658969689468816e-05, + "loss": 0.6708, + "step": 36250 + }, + { + "epoch": 0.3205502218921834, + "grad_norm": 5.793055534362793, + "learning_rate": 4.4657496301796944e-05, + "loss": 0.778, + "step": 36260 + }, + { + "epoch": 0.32063862515249564, + "grad_norm": 7.498042583465576, + "learning_rate": 4.465602291412507e-05, + "loss": 0.64, + "step": 36270 + }, + { + "epoch": 0.32072702841280787, + "grad_norm": 1.810036540031433, + "learning_rate": 4.46545495264532e-05, + "loss": 0.6931, + "step": 36280 + }, + { + "epoch": 0.3208154316731201, + "grad_norm": 4.0473222732543945, + "learning_rate": 4.4653076138781336e-05, + "loss": 0.6462, + "step": 36290 + }, + { + "epoch": 0.32090383493343233, + "grad_norm": 3.327176570892334, + "learning_rate": 4.4651602751109464e-05, + "loss": 0.669, + "step": 36300 + }, + { + "epoch": 0.32099223819374456, + "grad_norm": 12.564557075500488, + "learning_rate": 4.465012936343759e-05, + "loss": 0.6947, + "step": 36310 + }, + { + "epoch": 0.32108064145405685, + "grad_norm": 3.4070346355438232, + "learning_rate": 4.464865597576572e-05, + "loss": 0.834, + "step": 36320 + }, + { + "epoch": 0.3211690447143691, + "grad_norm": 8.077347755432129, + "learning_rate": 4.464718258809385e-05, + "loss": 0.65, + "step": 36330 + }, + { + "epoch": 0.3212574479746813, + "grad_norm": 6.8531036376953125, + "learning_rate": 4.464570920042198e-05, + "loss": 0.7236, + "step": 36340 + }, + { + "epoch": 0.32134585123499354, + "grad_norm": 4.59869909286499, + "learning_rate": 4.4644235812750106e-05, + "loss": 0.6737, + "step": 36350 + }, + { + "epoch": 0.32143425449530577, + "grad_norm": 2.6326897144317627, + "learning_rate": 4.464276242507824e-05, + "loss": 0.7285, + "step": 36360 + }, + { + "epoch": 0.321522657755618, + "grad_norm": 7.368484973907471, + "learning_rate": 4.464128903740637e-05, + "loss": 0.9055, + "step": 36370 + }, + { + "epoch": 0.3216110610159303, + "grad_norm": 4.5146484375, + "learning_rate": 4.46398156497345e-05, + "loss": 0.7162, + "step": 36380 + }, + { + "epoch": 0.3216994642762425, + "grad_norm": 5.891968250274658, + "learning_rate": 4.4638342262062626e-05, + "loss": 0.7745, + "step": 36390 + }, + { + "epoch": 0.32178786753655475, + "grad_norm": 7.0796990394592285, + "learning_rate": 4.4636868874390755e-05, + "loss": 0.7461, + "step": 36400 + }, + { + "epoch": 0.321876270796867, + "grad_norm": 2.960014581680298, + "learning_rate": 4.463539548671888e-05, + "loss": 0.6261, + "step": 36410 + }, + { + "epoch": 0.3219646740571792, + "grad_norm": 3.8946077823638916, + "learning_rate": 4.463392209904702e-05, + "loss": 0.7359, + "step": 36420 + }, + { + "epoch": 0.32205307731749144, + "grad_norm": 3.6543290615081787, + "learning_rate": 4.463244871137515e-05, + "loss": 0.695, + "step": 36430 + }, + { + "epoch": 0.32214148057780373, + "grad_norm": 4.2082438468933105, + "learning_rate": 4.4630975323703275e-05, + "loss": 0.6462, + "step": 36440 + }, + { + "epoch": 0.32222988383811596, + "grad_norm": 7.0076751708984375, + "learning_rate": 4.46295019360314e-05, + "loss": 0.7556, + "step": 36450 + }, + { + "epoch": 0.3223182870984282, + "grad_norm": 5.183751106262207, + "learning_rate": 4.462802854835953e-05, + "loss": 0.6722, + "step": 36460 + }, + { + "epoch": 0.3224066903587404, + "grad_norm": 10.116753578186035, + "learning_rate": 4.462655516068766e-05, + "loss": 0.8298, + "step": 36470 + }, + { + "epoch": 0.32249509361905265, + "grad_norm": 5.947454929351807, + "learning_rate": 4.4625081773015795e-05, + "loss": 0.7069, + "step": 36480 + }, + { + "epoch": 0.3225834968793649, + "grad_norm": 5.148024082183838, + "learning_rate": 4.462360838534392e-05, + "loss": 0.67, + "step": 36490 + }, + { + "epoch": 0.32267190013967717, + "grad_norm": 2.5367794036865234, + "learning_rate": 4.462213499767205e-05, + "loss": 0.7776, + "step": 36500 + }, + { + "epoch": 0.3227603033999894, + "grad_norm": 3.567840099334717, + "learning_rate": 4.462066161000018e-05, + "loss": 0.8079, + "step": 36510 + }, + { + "epoch": 0.32284870666030163, + "grad_norm": 3.3447000980377197, + "learning_rate": 4.461918822232831e-05, + "loss": 0.8521, + "step": 36520 + }, + { + "epoch": 0.32293710992061386, + "grad_norm": 1.703671932220459, + "learning_rate": 4.461771483465644e-05, + "loss": 0.7592, + "step": 36530 + }, + { + "epoch": 0.3230255131809261, + "grad_norm": 5.826778888702393, + "learning_rate": 4.461624144698457e-05, + "loss": 0.6254, + "step": 36540 + }, + { + "epoch": 0.3231139164412384, + "grad_norm": 5.9275288581848145, + "learning_rate": 4.4614768059312694e-05, + "loss": 0.7534, + "step": 36550 + }, + { + "epoch": 0.3232023197015506, + "grad_norm": 2.486701250076294, + "learning_rate": 4.461329467164083e-05, + "loss": 0.8027, + "step": 36560 + }, + { + "epoch": 0.32329072296186284, + "grad_norm": 8.054226875305176, + "learning_rate": 4.461182128396896e-05, + "loss": 0.8708, + "step": 36570 + }, + { + "epoch": 0.3233791262221751, + "grad_norm": 2.529788017272949, + "learning_rate": 4.4610347896297085e-05, + "loss": 0.7099, + "step": 36580 + }, + { + "epoch": 0.3234675294824873, + "grad_norm": 2.2886850833892822, + "learning_rate": 4.4608874508625214e-05, + "loss": 0.7396, + "step": 36590 + }, + { + "epoch": 0.32355593274279953, + "grad_norm": 6.225261211395264, + "learning_rate": 4.460740112095334e-05, + "loss": 0.7366, + "step": 36600 + }, + { + "epoch": 0.3236443360031118, + "grad_norm": 7.799415111541748, + "learning_rate": 4.460592773328147e-05, + "loss": 0.6858, + "step": 36610 + }, + { + "epoch": 0.32373273926342405, + "grad_norm": 2.9057488441467285, + "learning_rate": 4.4604454345609606e-05, + "loss": 0.6897, + "step": 36620 + }, + { + "epoch": 0.3238211425237363, + "grad_norm": 3.1471447944641113, + "learning_rate": 4.460298095793773e-05, + "loss": 0.757, + "step": 36630 + }, + { + "epoch": 0.3239095457840485, + "grad_norm": 8.476110458374023, + "learning_rate": 4.460150757026586e-05, + "loss": 0.7871, + "step": 36640 + }, + { + "epoch": 0.32399794904436074, + "grad_norm": 3.5183098316192627, + "learning_rate": 4.460003418259399e-05, + "loss": 0.5878, + "step": 36650 + }, + { + "epoch": 0.324086352304673, + "grad_norm": 5.67738676071167, + "learning_rate": 4.459856079492212e-05, + "loss": 0.5429, + "step": 36660 + }, + { + "epoch": 0.32417475556498526, + "grad_norm": 5.265235424041748, + "learning_rate": 4.459708740725025e-05, + "loss": 0.7492, + "step": 36670 + }, + { + "epoch": 0.3242631588252975, + "grad_norm": 14.093735694885254, + "learning_rate": 4.459561401957838e-05, + "loss": 0.5584, + "step": 36680 + }, + { + "epoch": 0.3243515620856097, + "grad_norm": 5.544258117675781, + "learning_rate": 4.4594140631906504e-05, + "loss": 0.7839, + "step": 36690 + }, + { + "epoch": 0.32443996534592195, + "grad_norm": 4.524329662322998, + "learning_rate": 4.459266724423464e-05, + "loss": 0.6969, + "step": 36700 + }, + { + "epoch": 0.3245283686062342, + "grad_norm": 6.473782539367676, + "learning_rate": 4.459119385656276e-05, + "loss": 0.7758, + "step": 36710 + }, + { + "epoch": 0.3246167718665464, + "grad_norm": 3.0859789848327637, + "learning_rate": 4.4589720468890896e-05, + "loss": 0.6721, + "step": 36720 + }, + { + "epoch": 0.3247051751268587, + "grad_norm": 5.294314861297607, + "learning_rate": 4.4588247081219024e-05, + "loss": 0.8849, + "step": 36730 + }, + { + "epoch": 0.32479357838717093, + "grad_norm": 10.948575973510742, + "learning_rate": 4.458677369354715e-05, + "loss": 0.7714, + "step": 36740 + }, + { + "epoch": 0.32488198164748316, + "grad_norm": 4.2016754150390625, + "learning_rate": 4.458530030587528e-05, + "loss": 0.8192, + "step": 36750 + }, + { + "epoch": 0.3249703849077954, + "grad_norm": 4.120140552520752, + "learning_rate": 4.4583826918203416e-05, + "loss": 0.8306, + "step": 36760 + }, + { + "epoch": 0.3250587881681076, + "grad_norm": 5.85779333114624, + "learning_rate": 4.458235353053154e-05, + "loss": 0.928, + "step": 36770 + }, + { + "epoch": 0.32514719142841986, + "grad_norm": 4.28076696395874, + "learning_rate": 4.458088014285967e-05, + "loss": 0.9034, + "step": 36780 + }, + { + "epoch": 0.32523559468873214, + "grad_norm": 3.8169968128204346, + "learning_rate": 4.45794067551878e-05, + "loss": 0.7378, + "step": 36790 + }, + { + "epoch": 0.3253239979490444, + "grad_norm": 3.805772304534912, + "learning_rate": 4.457793336751593e-05, + "loss": 0.6729, + "step": 36800 + }, + { + "epoch": 0.3254124012093566, + "grad_norm": 2.670017957687378, + "learning_rate": 4.457645997984406e-05, + "loss": 0.696, + "step": 36810 + }, + { + "epoch": 0.32550080446966884, + "grad_norm": 9.231040954589844, + "learning_rate": 4.4574986592172186e-05, + "loss": 0.6923, + "step": 36820 + }, + { + "epoch": 0.32558920772998107, + "grad_norm": 3.8730318546295166, + "learning_rate": 4.4573513204500315e-05, + "loss": 0.7452, + "step": 36830 + }, + { + "epoch": 0.3256776109902933, + "grad_norm": 1.7324206829071045, + "learning_rate": 4.457203981682845e-05, + "loss": 0.7284, + "step": 36840 + }, + { + "epoch": 0.3257660142506056, + "grad_norm": 7.449343681335449, + "learning_rate": 4.457056642915657e-05, + "loss": 0.7128, + "step": 36850 + }, + { + "epoch": 0.3258544175109178, + "grad_norm": 2.6910338401794434, + "learning_rate": 4.4569093041484707e-05, + "loss": 0.6969, + "step": 36860 + }, + { + "epoch": 0.32594282077123005, + "grad_norm": 7.11206579208374, + "learning_rate": 4.4567619653812835e-05, + "loss": 0.7615, + "step": 36870 + }, + { + "epoch": 0.3260312240315423, + "grad_norm": 4.754086017608643, + "learning_rate": 4.456614626614096e-05, + "loss": 0.7397, + "step": 36880 + }, + { + "epoch": 0.3261196272918545, + "grad_norm": 3.5037434101104736, + "learning_rate": 4.456467287846909e-05, + "loss": 0.649, + "step": 36890 + }, + { + "epoch": 0.32620803055216674, + "grad_norm": 3.6783218383789062, + "learning_rate": 4.456319949079723e-05, + "loss": 0.8453, + "step": 36900 + }, + { + "epoch": 0.326296433812479, + "grad_norm": 3.4167351722717285, + "learning_rate": 4.456172610312535e-05, + "loss": 0.7621, + "step": 36910 + }, + { + "epoch": 0.32638483707279126, + "grad_norm": 2.969130039215088, + "learning_rate": 4.4560252715453483e-05, + "loss": 0.675, + "step": 36920 + }, + { + "epoch": 0.3264732403331035, + "grad_norm": 8.845534324645996, + "learning_rate": 4.455877932778161e-05, + "loss": 0.8149, + "step": 36930 + }, + { + "epoch": 0.3265616435934157, + "grad_norm": 2.8086657524108887, + "learning_rate": 4.455730594010974e-05, + "loss": 0.7067, + "step": 36940 + }, + { + "epoch": 0.32665004685372795, + "grad_norm": 8.169519424438477, + "learning_rate": 4.455583255243787e-05, + "loss": 0.7916, + "step": 36950 + }, + { + "epoch": 0.3267384501140402, + "grad_norm": 3.0278542041778564, + "learning_rate": 4.4554359164766e-05, + "loss": 0.6999, + "step": 36960 + }, + { + "epoch": 0.32682685337435247, + "grad_norm": 1.4574494361877441, + "learning_rate": 4.4552885777094125e-05, + "loss": 0.7723, + "step": 36970 + }, + { + "epoch": 0.3269152566346647, + "grad_norm": 5.4884114265441895, + "learning_rate": 4.455141238942226e-05, + "loss": 0.6939, + "step": 36980 + }, + { + "epoch": 0.32700365989497693, + "grad_norm": 6.8113532066345215, + "learning_rate": 4.454993900175038e-05, + "loss": 0.7398, + "step": 36990 + }, + { + "epoch": 0.32709206315528916, + "grad_norm": 4.3907928466796875, + "learning_rate": 4.454846561407852e-05, + "loss": 0.677, + "step": 37000 + }, + { + "epoch": 0.3271804664156014, + "grad_norm": 4.854316711425781, + "learning_rate": 4.4546992226406645e-05, + "loss": 0.7735, + "step": 37010 + }, + { + "epoch": 0.3272688696759136, + "grad_norm": 3.0051686763763428, + "learning_rate": 4.4545518838734774e-05, + "loss": 0.7206, + "step": 37020 + }, + { + "epoch": 0.3273572729362259, + "grad_norm": 4.15510892868042, + "learning_rate": 4.45440454510629e-05, + "loss": 0.7273, + "step": 37030 + }, + { + "epoch": 0.32744567619653814, + "grad_norm": 7.260164260864258, + "learning_rate": 4.454257206339104e-05, + "loss": 0.8417, + "step": 37040 + }, + { + "epoch": 0.32753407945685037, + "grad_norm": 4.777120590209961, + "learning_rate": 4.454109867571916e-05, + "loss": 0.6485, + "step": 37050 + }, + { + "epoch": 0.3276224827171626, + "grad_norm": 5.564696311950684, + "learning_rate": 4.4539625288047294e-05, + "loss": 0.8041, + "step": 37060 + }, + { + "epoch": 0.32771088597747483, + "grad_norm": 7.659509658813477, + "learning_rate": 4.4538151900375415e-05, + "loss": 0.8141, + "step": 37070 + }, + { + "epoch": 0.3277992892377871, + "grad_norm": 4.162317752838135, + "learning_rate": 4.453667851270355e-05, + "loss": 0.747, + "step": 37080 + }, + { + "epoch": 0.32788769249809935, + "grad_norm": 2.4380946159362793, + "learning_rate": 4.453520512503168e-05, + "loss": 0.7385, + "step": 37090 + }, + { + "epoch": 0.3279760957584116, + "grad_norm": 5.835511207580566, + "learning_rate": 4.453373173735981e-05, + "loss": 0.7104, + "step": 37100 + }, + { + "epoch": 0.3280644990187238, + "grad_norm": 3.2107927799224854, + "learning_rate": 4.4532258349687936e-05, + "loss": 0.8393, + "step": 37110 + }, + { + "epoch": 0.32815290227903604, + "grad_norm": 2.5637314319610596, + "learning_rate": 4.453078496201607e-05, + "loss": 0.7245, + "step": 37120 + }, + { + "epoch": 0.32824130553934827, + "grad_norm": 4.150752067565918, + "learning_rate": 4.452931157434419e-05, + "loss": 0.7199, + "step": 37130 + }, + { + "epoch": 0.32832970879966056, + "grad_norm": 7.983151912689209, + "learning_rate": 4.452783818667233e-05, + "loss": 0.6668, + "step": 37140 + }, + { + "epoch": 0.3284181120599728, + "grad_norm": 5.745832443237305, + "learning_rate": 4.4526364799000456e-05, + "loss": 0.71, + "step": 37150 + }, + { + "epoch": 0.328506515320285, + "grad_norm": 2.8052961826324463, + "learning_rate": 4.4524891411328584e-05, + "loss": 0.5905, + "step": 37160 + }, + { + "epoch": 0.32859491858059725, + "grad_norm": 7.838033676147461, + "learning_rate": 4.452341802365671e-05, + "loss": 0.6963, + "step": 37170 + }, + { + "epoch": 0.3286833218409095, + "grad_norm": 4.461402893066406, + "learning_rate": 4.452194463598484e-05, + "loss": 0.87, + "step": 37180 + }, + { + "epoch": 0.3287717251012217, + "grad_norm": 11.672638893127441, + "learning_rate": 4.452047124831297e-05, + "loss": 0.8347, + "step": 37190 + }, + { + "epoch": 0.328860128361534, + "grad_norm": 5.240647792816162, + "learning_rate": 4.4518997860641104e-05, + "loss": 0.8136, + "step": 37200 + }, + { + "epoch": 0.32894853162184623, + "grad_norm": 5.189323902130127, + "learning_rate": 4.451752447296923e-05, + "loss": 0.8399, + "step": 37210 + }, + { + "epoch": 0.32903693488215846, + "grad_norm": 6.071349620819092, + "learning_rate": 4.451605108529736e-05, + "loss": 0.7196, + "step": 37220 + }, + { + "epoch": 0.3291253381424707, + "grad_norm": 3.4593567848205566, + "learning_rate": 4.451457769762549e-05, + "loss": 0.7408, + "step": 37230 + }, + { + "epoch": 0.3292137414027829, + "grad_norm": 3.869748115539551, + "learning_rate": 4.451310430995362e-05, + "loss": 0.7996, + "step": 37240 + }, + { + "epoch": 0.32930214466309515, + "grad_norm": 2.687638998031616, + "learning_rate": 4.4511630922281746e-05, + "loss": 0.7685, + "step": 37250 + }, + { + "epoch": 0.32939054792340744, + "grad_norm": 3.2222094535827637, + "learning_rate": 4.451015753460988e-05, + "loss": 0.6959, + "step": 37260 + }, + { + "epoch": 0.32947895118371967, + "grad_norm": 5.09929895401001, + "learning_rate": 4.450868414693801e-05, + "loss": 0.6967, + "step": 37270 + }, + { + "epoch": 0.3295673544440319, + "grad_norm": 6.807804107666016, + "learning_rate": 4.450721075926614e-05, + "loss": 0.7079, + "step": 37280 + }, + { + "epoch": 0.32965575770434413, + "grad_norm": 6.794445037841797, + "learning_rate": 4.4505737371594266e-05, + "loss": 0.7696, + "step": 37290 + }, + { + "epoch": 0.32974416096465636, + "grad_norm": 2.0575544834136963, + "learning_rate": 4.4504263983922395e-05, + "loss": 0.7636, + "step": 37300 + }, + { + "epoch": 0.3298325642249686, + "grad_norm": 4.3312764167785645, + "learning_rate": 4.450279059625053e-05, + "loss": 0.8836, + "step": 37310 + }, + { + "epoch": 0.3299209674852809, + "grad_norm": 9.852190017700195, + "learning_rate": 4.450131720857865e-05, + "loss": 0.6994, + "step": 37320 + }, + { + "epoch": 0.3300093707455931, + "grad_norm": 3.0304930210113525, + "learning_rate": 4.4499843820906787e-05, + "loss": 0.7432, + "step": 37330 + }, + { + "epoch": 0.33009777400590534, + "grad_norm": 4.672139644622803, + "learning_rate": 4.4498370433234915e-05, + "loss": 0.6682, + "step": 37340 + }, + { + "epoch": 0.3301861772662176, + "grad_norm": 4.245362758636475, + "learning_rate": 4.449689704556304e-05, + "loss": 0.7603, + "step": 37350 + }, + { + "epoch": 0.3302745805265298, + "grad_norm": 2.399181365966797, + "learning_rate": 4.449542365789117e-05, + "loss": 0.7421, + "step": 37360 + }, + { + "epoch": 0.33036298378684203, + "grad_norm": 6.010427474975586, + "learning_rate": 4.449395027021931e-05, + "loss": 0.7896, + "step": 37370 + }, + { + "epoch": 0.3304513870471543, + "grad_norm": 4.387106895446777, + "learning_rate": 4.449247688254743e-05, + "loss": 0.7033, + "step": 37380 + }, + { + "epoch": 0.33053979030746655, + "grad_norm": 2.2987771034240723, + "learning_rate": 4.4491003494875563e-05, + "loss": 0.8088, + "step": 37390 + }, + { + "epoch": 0.3306281935677788, + "grad_norm": 5.155879497528076, + "learning_rate": 4.448953010720369e-05, + "loss": 0.7601, + "step": 37400 + }, + { + "epoch": 0.330716596828091, + "grad_norm": 3.248589515686035, + "learning_rate": 4.448805671953182e-05, + "loss": 0.7168, + "step": 37410 + }, + { + "epoch": 0.33080500008840324, + "grad_norm": 3.9455742835998535, + "learning_rate": 4.448658333185995e-05, + "loss": 0.7608, + "step": 37420 + }, + { + "epoch": 0.3308934033487155, + "grad_norm": 4.91671085357666, + "learning_rate": 4.448510994418808e-05, + "loss": 0.6567, + "step": 37430 + }, + { + "epoch": 0.33098180660902776, + "grad_norm": 2.7089884281158447, + "learning_rate": 4.4483636556516205e-05, + "loss": 0.6045, + "step": 37440 + }, + { + "epoch": 0.33107020986934, + "grad_norm": 4.3610687255859375, + "learning_rate": 4.448216316884434e-05, + "loss": 0.6921, + "step": 37450 + }, + { + "epoch": 0.3311586131296522, + "grad_norm": 5.27649450302124, + "learning_rate": 4.448068978117246e-05, + "loss": 0.6848, + "step": 37460 + }, + { + "epoch": 0.33124701638996445, + "grad_norm": 8.016512870788574, + "learning_rate": 4.44792163935006e-05, + "loss": 0.6426, + "step": 37470 + }, + { + "epoch": 0.3313354196502767, + "grad_norm": 2.9452693462371826, + "learning_rate": 4.4477743005828725e-05, + "loss": 0.7456, + "step": 37480 + }, + { + "epoch": 0.3314238229105889, + "grad_norm": 7.632255554199219, + "learning_rate": 4.4476269618156854e-05, + "loss": 0.6582, + "step": 37490 + }, + { + "epoch": 0.3315122261709012, + "grad_norm": 4.6061530113220215, + "learning_rate": 4.447479623048498e-05, + "loss": 0.7213, + "step": 37500 + }, + { + "epoch": 0.33160062943121343, + "grad_norm": 8.105106353759766, + "learning_rate": 4.447332284281312e-05, + "loss": 0.7712, + "step": 37510 + }, + { + "epoch": 0.33168903269152566, + "grad_norm": 13.633162498474121, + "learning_rate": 4.447184945514124e-05, + "loss": 0.7063, + "step": 37520 + }, + { + "epoch": 0.3317774359518379, + "grad_norm": 1.565040111541748, + "learning_rate": 4.4470376067469374e-05, + "loss": 0.6888, + "step": 37530 + }, + { + "epoch": 0.3318658392121501, + "grad_norm": 1.4071500301361084, + "learning_rate": 4.4468902679797496e-05, + "loss": 0.6259, + "step": 37540 + }, + { + "epoch": 0.33195424247246236, + "grad_norm": 3.1355643272399902, + "learning_rate": 4.446742929212563e-05, + "loss": 0.8225, + "step": 37550 + }, + { + "epoch": 0.33204264573277464, + "grad_norm": 5.063669681549072, + "learning_rate": 4.446595590445376e-05, + "loss": 0.6292, + "step": 37560 + }, + { + "epoch": 0.3321310489930869, + "grad_norm": 3.942572593688965, + "learning_rate": 4.446448251678189e-05, + "loss": 0.7594, + "step": 37570 + }, + { + "epoch": 0.3322194522533991, + "grad_norm": 2.7459118366241455, + "learning_rate": 4.4463009129110016e-05, + "loss": 0.7612, + "step": 37580 + }, + { + "epoch": 0.33230785551371134, + "grad_norm": 3.110966682434082, + "learning_rate": 4.446153574143815e-05, + "loss": 0.7944, + "step": 37590 + }, + { + "epoch": 0.33239625877402357, + "grad_norm": 2.620520830154419, + "learning_rate": 4.446006235376627e-05, + "loss": 0.7875, + "step": 37600 + }, + { + "epoch": 0.33248466203433585, + "grad_norm": 3.686331033706665, + "learning_rate": 4.445858896609441e-05, + "loss": 0.7606, + "step": 37610 + }, + { + "epoch": 0.3325730652946481, + "grad_norm": 9.094108581542969, + "learning_rate": 4.4457115578422536e-05, + "loss": 0.6452, + "step": 37620 + }, + { + "epoch": 0.3326614685549603, + "grad_norm": 3.4581174850463867, + "learning_rate": 4.4455642190750664e-05, + "loss": 0.7565, + "step": 37630 + }, + { + "epoch": 0.33274987181527255, + "grad_norm": 7.295807838439941, + "learning_rate": 4.445416880307879e-05, + "loss": 0.6143, + "step": 37640 + }, + { + "epoch": 0.3328382750755848, + "grad_norm": 7.385070323944092, + "learning_rate": 4.445269541540692e-05, + "loss": 0.8879, + "step": 37650 + }, + { + "epoch": 0.332926678335897, + "grad_norm": 4.298337459564209, + "learning_rate": 4.445122202773505e-05, + "loss": 0.6443, + "step": 37660 + }, + { + "epoch": 0.3330150815962093, + "grad_norm": 4.23216438293457, + "learning_rate": 4.4449748640063185e-05, + "loss": 0.6731, + "step": 37670 + }, + { + "epoch": 0.3331034848565215, + "grad_norm": 3.5965120792388916, + "learning_rate": 4.4448275252391306e-05, + "loss": 0.7848, + "step": 37680 + }, + { + "epoch": 0.33319188811683376, + "grad_norm": 6.041168212890625, + "learning_rate": 4.444680186471944e-05, + "loss": 0.8365, + "step": 37690 + }, + { + "epoch": 0.333280291377146, + "grad_norm": 2.818695306777954, + "learning_rate": 4.444532847704757e-05, + "loss": 0.7816, + "step": 37700 + }, + { + "epoch": 0.3333686946374582, + "grad_norm": 1.4142529964447021, + "learning_rate": 4.44438550893757e-05, + "loss": 0.7574, + "step": 37710 + }, + { + "epoch": 0.33345709789777045, + "grad_norm": 5.777283668518066, + "learning_rate": 4.4442381701703826e-05, + "loss": 0.7448, + "step": 37720 + }, + { + "epoch": 0.33354550115808274, + "grad_norm": 6.268942832946777, + "learning_rate": 4.444090831403196e-05, + "loss": 0.7011, + "step": 37730 + }, + { + "epoch": 0.33363390441839497, + "grad_norm": 3.510544776916504, + "learning_rate": 4.443943492636008e-05, + "loss": 0.7341, + "step": 37740 + }, + { + "epoch": 0.3337223076787072, + "grad_norm": 7.378816604614258, + "learning_rate": 4.443796153868822e-05, + "loss": 0.8453, + "step": 37750 + }, + { + "epoch": 0.33381071093901943, + "grad_norm": 3.546304941177368, + "learning_rate": 4.443648815101634e-05, + "loss": 0.8961, + "step": 37760 + }, + { + "epoch": 0.33389911419933166, + "grad_norm": 5.9796905517578125, + "learning_rate": 4.4435014763344475e-05, + "loss": 0.8286, + "step": 37770 + }, + { + "epoch": 0.3339875174596439, + "grad_norm": 3.9999821186065674, + "learning_rate": 4.44335413756726e-05, + "loss": 0.7225, + "step": 37780 + }, + { + "epoch": 0.3340759207199562, + "grad_norm": 5.240865230560303, + "learning_rate": 4.443206798800073e-05, + "loss": 0.7175, + "step": 37790 + }, + { + "epoch": 0.3341643239802684, + "grad_norm": 6.773401737213135, + "learning_rate": 4.443059460032886e-05, + "loss": 0.6985, + "step": 37800 + }, + { + "epoch": 0.33425272724058064, + "grad_norm": 1.9193450212478638, + "learning_rate": 4.4429121212656995e-05, + "loss": 0.7263, + "step": 37810 + }, + { + "epoch": 0.33434113050089287, + "grad_norm": 5.275112628936768, + "learning_rate": 4.4427647824985117e-05, + "loss": 0.7313, + "step": 37820 + }, + { + "epoch": 0.3344295337612051, + "grad_norm": 2.2873361110687256, + "learning_rate": 4.442617443731325e-05, + "loss": 0.7307, + "step": 37830 + }, + { + "epoch": 0.33451793702151733, + "grad_norm": 3.0789976119995117, + "learning_rate": 4.442470104964138e-05, + "loss": 0.7869, + "step": 37840 + }, + { + "epoch": 0.3346063402818296, + "grad_norm": 5.9778361320495605, + "learning_rate": 4.442322766196951e-05, + "loss": 0.7762, + "step": 37850 + }, + { + "epoch": 0.33469474354214185, + "grad_norm": 8.424156188964844, + "learning_rate": 4.442175427429764e-05, + "loss": 0.7244, + "step": 37860 + }, + { + "epoch": 0.3347831468024541, + "grad_norm": 3.348323106765747, + "learning_rate": 4.442028088662577e-05, + "loss": 0.7951, + "step": 37870 + }, + { + "epoch": 0.3348715500627663, + "grad_norm": 6.860462188720703, + "learning_rate": 4.4418807498953893e-05, + "loss": 0.6985, + "step": 37880 + }, + { + "epoch": 0.33495995332307854, + "grad_norm": 20.336254119873047, + "learning_rate": 4.441733411128203e-05, + "loss": 0.8301, + "step": 37890 + }, + { + "epoch": 0.33504835658339077, + "grad_norm": 2.144000768661499, + "learning_rate": 4.441586072361015e-05, + "loss": 0.6725, + "step": 37900 + }, + { + "epoch": 0.33513675984370306, + "grad_norm": 1.9997081756591797, + "learning_rate": 4.4414387335938285e-05, + "loss": 0.7408, + "step": 37910 + }, + { + "epoch": 0.3352251631040153, + "grad_norm": 7.838317394256592, + "learning_rate": 4.4412913948266414e-05, + "loss": 0.8178, + "step": 37920 + }, + { + "epoch": 0.3353135663643275, + "grad_norm": 2.0753121376037598, + "learning_rate": 4.441144056059454e-05, + "loss": 0.7305, + "step": 37930 + }, + { + "epoch": 0.33540196962463975, + "grad_norm": 2.404961585998535, + "learning_rate": 4.440996717292267e-05, + "loss": 0.7607, + "step": 37940 + }, + { + "epoch": 0.335490372884952, + "grad_norm": 2.8562088012695312, + "learning_rate": 4.4408493785250806e-05, + "loss": 0.6851, + "step": 37950 + }, + { + "epoch": 0.3355787761452642, + "grad_norm": 2.9436092376708984, + "learning_rate": 4.440702039757893e-05, + "loss": 0.7258, + "step": 37960 + }, + { + "epoch": 0.3356671794055765, + "grad_norm": 2.2134668827056885, + "learning_rate": 4.440554700990706e-05, + "loss": 0.7526, + "step": 37970 + }, + { + "epoch": 0.33575558266588873, + "grad_norm": 37.28148651123047, + "learning_rate": 4.440407362223519e-05, + "loss": 0.7394, + "step": 37980 + }, + { + "epoch": 0.33584398592620096, + "grad_norm": 3.944171190261841, + "learning_rate": 4.440260023456332e-05, + "loss": 0.7511, + "step": 37990 + }, + { + "epoch": 0.3359323891865132, + "grad_norm": 1.5751720666885376, + "learning_rate": 4.440112684689145e-05, + "loss": 0.83, + "step": 38000 + }, + { + "epoch": 0.3360207924468254, + "grad_norm": 3.7713801860809326, + "learning_rate": 4.4399653459219576e-05, + "loss": 0.8186, + "step": 38010 + }, + { + "epoch": 0.33610919570713765, + "grad_norm": 3.6327595710754395, + "learning_rate": 4.4398180071547704e-05, + "loss": 0.6205, + "step": 38020 + }, + { + "epoch": 0.33619759896744994, + "grad_norm": 8.001874923706055, + "learning_rate": 4.439670668387584e-05, + "loss": 0.695, + "step": 38030 + }, + { + "epoch": 0.33628600222776217, + "grad_norm": 2.667207956314087, + "learning_rate": 4.439523329620397e-05, + "loss": 0.8388, + "step": 38040 + }, + { + "epoch": 0.3363744054880744, + "grad_norm": 2.9962949752807617, + "learning_rate": 4.4393759908532096e-05, + "loss": 0.6691, + "step": 38050 + }, + { + "epoch": 0.33646280874838663, + "grad_norm": 4.5162529945373535, + "learning_rate": 4.4392286520860224e-05, + "loss": 0.7725, + "step": 38060 + }, + { + "epoch": 0.33655121200869886, + "grad_norm": 11.078516960144043, + "learning_rate": 4.439081313318835e-05, + "loss": 0.7412, + "step": 38070 + }, + { + "epoch": 0.3366396152690111, + "grad_norm": 11.169146537780762, + "learning_rate": 4.438933974551648e-05, + "loss": 0.7767, + "step": 38080 + }, + { + "epoch": 0.3367280185293234, + "grad_norm": 3.0054001808166504, + "learning_rate": 4.4387866357844616e-05, + "loss": 0.9013, + "step": 38090 + }, + { + "epoch": 0.3368164217896356, + "grad_norm": 2.3745791912078857, + "learning_rate": 4.4386392970172744e-05, + "loss": 0.8172, + "step": 38100 + }, + { + "epoch": 0.33690482504994784, + "grad_norm": 4.295470714569092, + "learning_rate": 4.438491958250087e-05, + "loss": 0.7548, + "step": 38110 + }, + { + "epoch": 0.3369932283102601, + "grad_norm": 5.31907320022583, + "learning_rate": 4.4383446194829e-05, + "loss": 0.7774, + "step": 38120 + }, + { + "epoch": 0.3370816315705723, + "grad_norm": 2.9692611694335938, + "learning_rate": 4.438197280715713e-05, + "loss": 0.6463, + "step": 38130 + }, + { + "epoch": 0.3371700348308846, + "grad_norm": 2.7916946411132812, + "learning_rate": 4.438049941948526e-05, + "loss": 0.7102, + "step": 38140 + }, + { + "epoch": 0.3372584380911968, + "grad_norm": 2.026956081390381, + "learning_rate": 4.4379026031813386e-05, + "loss": 0.6695, + "step": 38150 + }, + { + "epoch": 0.33734684135150905, + "grad_norm": 4.379522323608398, + "learning_rate": 4.437755264414152e-05, + "loss": 0.8069, + "step": 38160 + }, + { + "epoch": 0.3374352446118213, + "grad_norm": 2.1830618381500244, + "learning_rate": 4.437607925646965e-05, + "loss": 0.6677, + "step": 38170 + }, + { + "epoch": 0.3375236478721335, + "grad_norm": 3.632488965988159, + "learning_rate": 4.437460586879778e-05, + "loss": 0.8248, + "step": 38180 + }, + { + "epoch": 0.33761205113244575, + "grad_norm": 2.902557849884033, + "learning_rate": 4.4373132481125906e-05, + "loss": 0.7919, + "step": 38190 + }, + { + "epoch": 0.33770045439275803, + "grad_norm": 6.855132102966309, + "learning_rate": 4.4371659093454035e-05, + "loss": 0.608, + "step": 38200 + }, + { + "epoch": 0.33778885765307026, + "grad_norm": 20.101909637451172, + "learning_rate": 4.437018570578216e-05, + "loss": 0.806, + "step": 38210 + }, + { + "epoch": 0.3378772609133825, + "grad_norm": 4.567832946777344, + "learning_rate": 4.43687123181103e-05, + "loss": 0.7061, + "step": 38220 + }, + { + "epoch": 0.3379656641736947, + "grad_norm": 2.52120304107666, + "learning_rate": 4.436723893043842e-05, + "loss": 0.6626, + "step": 38230 + }, + { + "epoch": 0.33805406743400696, + "grad_norm": 7.894671440124512, + "learning_rate": 4.4365765542766555e-05, + "loss": 0.7207, + "step": 38240 + }, + { + "epoch": 0.3381424706943192, + "grad_norm": 2.661726951599121, + "learning_rate": 4.436429215509468e-05, + "loss": 0.6896, + "step": 38250 + }, + { + "epoch": 0.3382308739546315, + "grad_norm": 8.05888557434082, + "learning_rate": 4.436281876742281e-05, + "loss": 0.8958, + "step": 38260 + }, + { + "epoch": 0.3383192772149437, + "grad_norm": 5.401699542999268, + "learning_rate": 4.436134537975094e-05, + "loss": 0.8154, + "step": 38270 + }, + { + "epoch": 0.33840768047525593, + "grad_norm": 4.924627780914307, + "learning_rate": 4.4359871992079075e-05, + "loss": 0.8478, + "step": 38280 + }, + { + "epoch": 0.33849608373556817, + "grad_norm": 2.4446215629577637, + "learning_rate": 4.43583986044072e-05, + "loss": 0.7358, + "step": 38290 + }, + { + "epoch": 0.3385844869958804, + "grad_norm": 4.387701988220215, + "learning_rate": 4.435692521673533e-05, + "loss": 0.7378, + "step": 38300 + }, + { + "epoch": 0.3386728902561926, + "grad_norm": 4.186991214752197, + "learning_rate": 4.435545182906346e-05, + "loss": 0.7074, + "step": 38310 + }, + { + "epoch": 0.3387612935165049, + "grad_norm": 2.201347589492798, + "learning_rate": 4.435397844139159e-05, + "loss": 0.8112, + "step": 38320 + }, + { + "epoch": 0.33884969677681714, + "grad_norm": 8.414388656616211, + "learning_rate": 4.435250505371972e-05, + "loss": 0.6043, + "step": 38330 + }, + { + "epoch": 0.3389381000371294, + "grad_norm": 2.9845588207244873, + "learning_rate": 4.435103166604785e-05, + "loss": 0.5832, + "step": 38340 + }, + { + "epoch": 0.3390265032974416, + "grad_norm": 3.291576623916626, + "learning_rate": 4.4349558278375974e-05, + "loss": 0.7391, + "step": 38350 + }, + { + "epoch": 0.33911490655775384, + "grad_norm": 7.75879430770874, + "learning_rate": 4.434808489070411e-05, + "loss": 0.6709, + "step": 38360 + }, + { + "epoch": 0.33920330981806607, + "grad_norm": 3.6497490406036377, + "learning_rate": 4.434661150303223e-05, + "loss": 0.8984, + "step": 38370 + }, + { + "epoch": 0.33929171307837835, + "grad_norm": 3.4737775325775146, + "learning_rate": 4.4345138115360365e-05, + "loss": 0.6528, + "step": 38380 + }, + { + "epoch": 0.3393801163386906, + "grad_norm": 8.572341918945312, + "learning_rate": 4.4343664727688494e-05, + "loss": 0.6855, + "step": 38390 + }, + { + "epoch": 0.3394685195990028, + "grad_norm": 7.881353855133057, + "learning_rate": 4.434219134001662e-05, + "loss": 0.7787, + "step": 38400 + }, + { + "epoch": 0.33955692285931505, + "grad_norm": 1.3303899765014648, + "learning_rate": 4.434071795234475e-05, + "loss": 0.5963, + "step": 38410 + }, + { + "epoch": 0.3396453261196273, + "grad_norm": 4.56694221496582, + "learning_rate": 4.4339244564672886e-05, + "loss": 0.9442, + "step": 38420 + }, + { + "epoch": 0.3397337293799395, + "grad_norm": 3.620880365371704, + "learning_rate": 4.433777117700101e-05, + "loss": 0.7801, + "step": 38430 + }, + { + "epoch": 0.3398221326402518, + "grad_norm": 2.286504030227661, + "learning_rate": 4.433629778932914e-05, + "loss": 0.909, + "step": 38440 + }, + { + "epoch": 0.339910535900564, + "grad_norm": 3.468268871307373, + "learning_rate": 4.433482440165727e-05, + "loss": 0.6692, + "step": 38450 + }, + { + "epoch": 0.33999893916087626, + "grad_norm": 7.405500888824463, + "learning_rate": 4.43333510139854e-05, + "loss": 0.6987, + "step": 38460 + }, + { + "epoch": 0.3400873424211885, + "grad_norm": 5.755014419555664, + "learning_rate": 4.433187762631353e-05, + "loss": 0.6701, + "step": 38470 + }, + { + "epoch": 0.3401757456815007, + "grad_norm": 3.834282875061035, + "learning_rate": 4.4330404238641656e-05, + "loss": 0.8425, + "step": 38480 + }, + { + "epoch": 0.34026414894181295, + "grad_norm": 3.051474094390869, + "learning_rate": 4.4328930850969784e-05, + "loss": 0.6757, + "step": 38490 + }, + { + "epoch": 0.34035255220212524, + "grad_norm": 2.1180267333984375, + "learning_rate": 4.432745746329792e-05, + "loss": 0.7447, + "step": 38500 + }, + { + "epoch": 0.34044095546243747, + "grad_norm": 11.79562759399414, + "learning_rate": 4.432598407562604e-05, + "loss": 0.7153, + "step": 38510 + }, + { + "epoch": 0.3405293587227497, + "grad_norm": 4.963253974914551, + "learning_rate": 4.4324510687954176e-05, + "loss": 0.6423, + "step": 38520 + }, + { + "epoch": 0.34061776198306193, + "grad_norm": 3.5599465370178223, + "learning_rate": 4.4323037300282304e-05, + "loss": 0.6866, + "step": 38530 + }, + { + "epoch": 0.34070616524337416, + "grad_norm": 4.488028049468994, + "learning_rate": 4.432156391261043e-05, + "loss": 0.7868, + "step": 38540 + }, + { + "epoch": 0.3407945685036864, + "grad_norm": 2.4642605781555176, + "learning_rate": 4.432009052493856e-05, + "loss": 0.8101, + "step": 38550 + }, + { + "epoch": 0.3408829717639987, + "grad_norm": 4.765092849731445, + "learning_rate": 4.4318617137266696e-05, + "loss": 0.8099, + "step": 38560 + }, + { + "epoch": 0.3409713750243109, + "grad_norm": 2.6710798740386963, + "learning_rate": 4.431714374959482e-05, + "loss": 0.8209, + "step": 38570 + }, + { + "epoch": 0.34105977828462314, + "grad_norm": 2.7096149921417236, + "learning_rate": 4.431567036192295e-05, + "loss": 0.6875, + "step": 38580 + }, + { + "epoch": 0.34114818154493537, + "grad_norm": 1.7125182151794434, + "learning_rate": 4.4314196974251074e-05, + "loss": 0.7881, + "step": 38590 + }, + { + "epoch": 0.3412365848052476, + "grad_norm": 2.053903818130493, + "learning_rate": 4.431272358657921e-05, + "loss": 0.7455, + "step": 38600 + }, + { + "epoch": 0.34132498806555983, + "grad_norm": 5.243853569030762, + "learning_rate": 4.431125019890734e-05, + "loss": 0.6705, + "step": 38610 + }, + { + "epoch": 0.3414133913258721, + "grad_norm": 4.675059795379639, + "learning_rate": 4.4309776811235466e-05, + "loss": 0.7284, + "step": 38620 + }, + { + "epoch": 0.34150179458618435, + "grad_norm": 0.9795480370521545, + "learning_rate": 4.4308303423563595e-05, + "loss": 0.7653, + "step": 38630 + }, + { + "epoch": 0.3415901978464966, + "grad_norm": 2.595299005508423, + "learning_rate": 4.430683003589173e-05, + "loss": 0.7195, + "step": 38640 + }, + { + "epoch": 0.3416786011068088, + "grad_norm": 8.659255981445312, + "learning_rate": 4.430535664821985e-05, + "loss": 0.7188, + "step": 38650 + }, + { + "epoch": 0.34176700436712104, + "grad_norm": 1.6105016469955444, + "learning_rate": 4.4303883260547986e-05, + "loss": 0.6767, + "step": 38660 + }, + { + "epoch": 0.34185540762743327, + "grad_norm": 3.3377585411071777, + "learning_rate": 4.4302409872876115e-05, + "loss": 0.7457, + "step": 38670 + }, + { + "epoch": 0.34194381088774556, + "grad_norm": 3.788832426071167, + "learning_rate": 4.430093648520424e-05, + "loss": 0.7619, + "step": 38680 + }, + { + "epoch": 0.3420322141480578, + "grad_norm": 6.489564418792725, + "learning_rate": 4.429946309753237e-05, + "loss": 0.7225, + "step": 38690 + }, + { + "epoch": 0.34212061740837, + "grad_norm": 1.7470455169677734, + "learning_rate": 4.42979897098605e-05, + "loss": 0.6937, + "step": 38700 + }, + { + "epoch": 0.34220902066868225, + "grad_norm": 6.949313163757324, + "learning_rate": 4.429651632218863e-05, + "loss": 0.6912, + "step": 38710 + }, + { + "epoch": 0.3422974239289945, + "grad_norm": 3.092290163040161, + "learning_rate": 4.429504293451676e-05, + "loss": 0.7623, + "step": 38720 + }, + { + "epoch": 0.34238582718930677, + "grad_norm": 8.560245513916016, + "learning_rate": 4.4293569546844885e-05, + "loss": 0.6472, + "step": 38730 + }, + { + "epoch": 0.342474230449619, + "grad_norm": 4.158474922180176, + "learning_rate": 4.429209615917302e-05, + "loss": 0.6863, + "step": 38740 + }, + { + "epoch": 0.34256263370993123, + "grad_norm": 5.6758503913879395, + "learning_rate": 4.429062277150115e-05, + "loss": 0.702, + "step": 38750 + }, + { + "epoch": 0.34265103697024346, + "grad_norm": 7.173279762268066, + "learning_rate": 4.428914938382928e-05, + "loss": 0.7719, + "step": 38760 + }, + { + "epoch": 0.3427394402305557, + "grad_norm": 3.5123791694641113, + "learning_rate": 4.4287675996157405e-05, + "loss": 0.7444, + "step": 38770 + }, + { + "epoch": 0.3428278434908679, + "grad_norm": 10.433554649353027, + "learning_rate": 4.428620260848554e-05, + "loss": 0.6987, + "step": 38780 + }, + { + "epoch": 0.3429162467511802, + "grad_norm": 3.3632681369781494, + "learning_rate": 4.428472922081366e-05, + "loss": 0.7488, + "step": 38790 + }, + { + "epoch": 0.34300465001149244, + "grad_norm": 2.0097849369049072, + "learning_rate": 4.42832558331418e-05, + "loss": 0.7543, + "step": 38800 + }, + { + "epoch": 0.34309305327180467, + "grad_norm": 6.305533409118652, + "learning_rate": 4.4281782445469925e-05, + "loss": 0.7586, + "step": 38810 + }, + { + "epoch": 0.3431814565321169, + "grad_norm": 4.433217525482178, + "learning_rate": 4.4280309057798054e-05, + "loss": 0.8186, + "step": 38820 + }, + { + "epoch": 0.34326985979242913, + "grad_norm": 1.886404037475586, + "learning_rate": 4.427883567012618e-05, + "loss": 0.7404, + "step": 38830 + }, + { + "epoch": 0.34335826305274136, + "grad_norm": 7.266593933105469, + "learning_rate": 4.427736228245431e-05, + "loss": 0.6237, + "step": 38840 + }, + { + "epoch": 0.34344666631305365, + "grad_norm": 2.7771661281585693, + "learning_rate": 4.427588889478244e-05, + "loss": 0.6898, + "step": 38850 + }, + { + "epoch": 0.3435350695733659, + "grad_norm": 2.3502469062805176, + "learning_rate": 4.4274415507110574e-05, + "loss": 0.7738, + "step": 38860 + }, + { + "epoch": 0.3436234728336781, + "grad_norm": 4.1199822425842285, + "learning_rate": 4.4272942119438695e-05, + "loss": 0.849, + "step": 38870 + }, + { + "epoch": 0.34371187609399034, + "grad_norm": 3.7873480319976807, + "learning_rate": 4.427146873176683e-05, + "loss": 0.6122, + "step": 38880 + }, + { + "epoch": 0.3438002793543026, + "grad_norm": 9.393013000488281, + "learning_rate": 4.426999534409496e-05, + "loss": 0.7628, + "step": 38890 + }, + { + "epoch": 0.3438886826146148, + "grad_norm": 3.412522077560425, + "learning_rate": 4.426852195642309e-05, + "loss": 0.6627, + "step": 38900 + }, + { + "epoch": 0.3439770858749271, + "grad_norm": 4.772474765777588, + "learning_rate": 4.4267048568751216e-05, + "loss": 0.8214, + "step": 38910 + }, + { + "epoch": 0.3440654891352393, + "grad_norm": 4.12615442276001, + "learning_rate": 4.426557518107935e-05, + "loss": 0.6827, + "step": 38920 + }, + { + "epoch": 0.34415389239555155, + "grad_norm": 3.1334691047668457, + "learning_rate": 4.426410179340747e-05, + "loss": 0.6754, + "step": 38930 + }, + { + "epoch": 0.3442422956558638, + "grad_norm": 2.491276979446411, + "learning_rate": 4.426262840573561e-05, + "loss": 0.7243, + "step": 38940 + }, + { + "epoch": 0.344330698916176, + "grad_norm": 1.7073016166687012, + "learning_rate": 4.4261155018063736e-05, + "loss": 0.7204, + "step": 38950 + }, + { + "epoch": 0.34441910217648825, + "grad_norm": 6.672577381134033, + "learning_rate": 4.4259681630391864e-05, + "loss": 0.6357, + "step": 38960 + }, + { + "epoch": 0.34450750543680053, + "grad_norm": 3.4583609104156494, + "learning_rate": 4.425820824271999e-05, + "loss": 0.7868, + "step": 38970 + }, + { + "epoch": 0.34459590869711276, + "grad_norm": 2.7108612060546875, + "learning_rate": 4.425673485504812e-05, + "loss": 0.7452, + "step": 38980 + }, + { + "epoch": 0.344684311957425, + "grad_norm": 3.729666233062744, + "learning_rate": 4.425526146737625e-05, + "loss": 0.796, + "step": 38990 + }, + { + "epoch": 0.3447727152177372, + "grad_norm": 2.58547306060791, + "learning_rate": 4.4253788079704384e-05, + "loss": 0.7642, + "step": 39000 + }, + { + "epoch": 0.34486111847804946, + "grad_norm": 1.6560652256011963, + "learning_rate": 4.425231469203251e-05, + "loss": 0.8992, + "step": 39010 + }, + { + "epoch": 0.3449495217383617, + "grad_norm": 2.2690773010253906, + "learning_rate": 4.425084130436064e-05, + "loss": 0.662, + "step": 39020 + }, + { + "epoch": 0.345037924998674, + "grad_norm": 8.749836921691895, + "learning_rate": 4.424936791668877e-05, + "loss": 0.7323, + "step": 39030 + }, + { + "epoch": 0.3451263282589862, + "grad_norm": 1.8457791805267334, + "learning_rate": 4.42478945290169e-05, + "loss": 0.7426, + "step": 39040 + }, + { + "epoch": 0.34521473151929843, + "grad_norm": 2.8448383808135986, + "learning_rate": 4.4246421141345026e-05, + "loss": 0.7657, + "step": 39050 + }, + { + "epoch": 0.34530313477961067, + "grad_norm": 2.9351272583007812, + "learning_rate": 4.4244947753673154e-05, + "loss": 0.7119, + "step": 39060 + }, + { + "epoch": 0.3453915380399229, + "grad_norm": 6.2362961769104, + "learning_rate": 4.424347436600129e-05, + "loss": 0.8469, + "step": 39070 + }, + { + "epoch": 0.3454799413002351, + "grad_norm": 6.559598922729492, + "learning_rate": 4.424200097832942e-05, + "loss": 0.6604, + "step": 39080 + }, + { + "epoch": 0.3455683445605474, + "grad_norm": 6.368211269378662, + "learning_rate": 4.4240527590657546e-05, + "loss": 0.5775, + "step": 39090 + }, + { + "epoch": 0.34565674782085964, + "grad_norm": 1.8094475269317627, + "learning_rate": 4.4239054202985675e-05, + "loss": 0.7922, + "step": 39100 + }, + { + "epoch": 0.3457451510811719, + "grad_norm": 9.018067359924316, + "learning_rate": 4.42375808153138e-05, + "loss": 0.8196, + "step": 39110 + }, + { + "epoch": 0.3458335543414841, + "grad_norm": 4.900691032409668, + "learning_rate": 4.423610742764193e-05, + "loss": 0.6535, + "step": 39120 + }, + { + "epoch": 0.34592195760179634, + "grad_norm": 8.475072860717773, + "learning_rate": 4.4234634039970066e-05, + "loss": 0.7438, + "step": 39130 + }, + { + "epoch": 0.34601036086210857, + "grad_norm": 2.53385853767395, + "learning_rate": 4.4233160652298195e-05, + "loss": 0.626, + "step": 39140 + }, + { + "epoch": 0.34609876412242085, + "grad_norm": 7.454282760620117, + "learning_rate": 4.423168726462632e-05, + "loss": 0.6581, + "step": 39150 + }, + { + "epoch": 0.3461871673827331, + "grad_norm": 2.232750415802002, + "learning_rate": 4.423021387695445e-05, + "loss": 0.6313, + "step": 39160 + }, + { + "epoch": 0.3462755706430453, + "grad_norm": 10.551502227783203, + "learning_rate": 4.422874048928258e-05, + "loss": 0.666, + "step": 39170 + }, + { + "epoch": 0.34636397390335755, + "grad_norm": 2.270404577255249, + "learning_rate": 4.422726710161071e-05, + "loss": 0.6959, + "step": 39180 + }, + { + "epoch": 0.3464523771636698, + "grad_norm": 6.183990955352783, + "learning_rate": 4.422579371393884e-05, + "loss": 0.6748, + "step": 39190 + }, + { + "epoch": 0.346540780423982, + "grad_norm": 2.9584808349609375, + "learning_rate": 4.4224320326266965e-05, + "loss": 0.6835, + "step": 39200 + }, + { + "epoch": 0.3466291836842943, + "grad_norm": 7.449177265167236, + "learning_rate": 4.42228469385951e-05, + "loss": 0.7968, + "step": 39210 + }, + { + "epoch": 0.3467175869446065, + "grad_norm": 4.392550945281982, + "learning_rate": 4.422137355092323e-05, + "loss": 0.7211, + "step": 39220 + }, + { + "epoch": 0.34680599020491876, + "grad_norm": 7.9824066162109375, + "learning_rate": 4.421990016325136e-05, + "loss": 0.6396, + "step": 39230 + }, + { + "epoch": 0.346894393465231, + "grad_norm": 8.24870777130127, + "learning_rate": 4.4218426775579485e-05, + "loss": 0.5851, + "step": 39240 + }, + { + "epoch": 0.3469827967255432, + "grad_norm": 5.892064571380615, + "learning_rate": 4.421695338790762e-05, + "loss": 0.637, + "step": 39250 + }, + { + "epoch": 0.3470711999858555, + "grad_norm": 1.8518664836883545, + "learning_rate": 4.421548000023574e-05, + "loss": 0.6943, + "step": 39260 + }, + { + "epoch": 0.34715960324616774, + "grad_norm": 2.4457929134368896, + "learning_rate": 4.421400661256388e-05, + "loss": 0.7044, + "step": 39270 + }, + { + "epoch": 0.34724800650647997, + "grad_norm": 6.163054943084717, + "learning_rate": 4.4212533224892005e-05, + "loss": 0.7894, + "step": 39280 + }, + { + "epoch": 0.3473364097667922, + "grad_norm": 3.261793613433838, + "learning_rate": 4.4211059837220134e-05, + "loss": 0.6195, + "step": 39290 + }, + { + "epoch": 0.34742481302710443, + "grad_norm": 3.6927614212036133, + "learning_rate": 4.420958644954826e-05, + "loss": 0.7456, + "step": 39300 + }, + { + "epoch": 0.34751321628741666, + "grad_norm": 3.9024741649627686, + "learning_rate": 4.420811306187639e-05, + "loss": 0.8354, + "step": 39310 + }, + { + "epoch": 0.34760161954772895, + "grad_norm": 6.899050235748291, + "learning_rate": 4.420663967420452e-05, + "loss": 0.6942, + "step": 39320 + }, + { + "epoch": 0.3476900228080412, + "grad_norm": 2.2269694805145264, + "learning_rate": 4.4205166286532654e-05, + "loss": 0.7314, + "step": 39330 + }, + { + "epoch": 0.3477784260683534, + "grad_norm": 4.20051908493042, + "learning_rate": 4.4203692898860775e-05, + "loss": 0.7062, + "step": 39340 + }, + { + "epoch": 0.34786682932866564, + "grad_norm": 4.613553047180176, + "learning_rate": 4.420221951118891e-05, + "loss": 0.8253, + "step": 39350 + }, + { + "epoch": 0.34795523258897787, + "grad_norm": 13.918585777282715, + "learning_rate": 4.420074612351704e-05, + "loss": 0.7943, + "step": 39360 + }, + { + "epoch": 0.3480436358492901, + "grad_norm": 3.904947280883789, + "learning_rate": 4.419927273584517e-05, + "loss": 0.8599, + "step": 39370 + }, + { + "epoch": 0.3481320391096024, + "grad_norm": 7.048567771911621, + "learning_rate": 4.4197799348173296e-05, + "loss": 0.6769, + "step": 39380 + }, + { + "epoch": 0.3482204423699146, + "grad_norm": 2.5431551933288574, + "learning_rate": 4.419632596050143e-05, + "loss": 0.6828, + "step": 39390 + }, + { + "epoch": 0.34830884563022685, + "grad_norm": 1.659877061843872, + "learning_rate": 4.419485257282955e-05, + "loss": 0.7342, + "step": 39400 + }, + { + "epoch": 0.3483972488905391, + "grad_norm": 2.5533907413482666, + "learning_rate": 4.419337918515769e-05, + "loss": 0.7747, + "step": 39410 + }, + { + "epoch": 0.3484856521508513, + "grad_norm": 4.082030296325684, + "learning_rate": 4.419190579748581e-05, + "loss": 0.8179, + "step": 39420 + }, + { + "epoch": 0.34857405541116354, + "grad_norm": 6.7454118728637695, + "learning_rate": 4.4190432409813944e-05, + "loss": 0.7295, + "step": 39430 + }, + { + "epoch": 0.34866245867147583, + "grad_norm": 11.954776763916016, + "learning_rate": 4.418895902214207e-05, + "loss": 0.7646, + "step": 39440 + }, + { + "epoch": 0.34875086193178806, + "grad_norm": 14.232224464416504, + "learning_rate": 4.41874856344702e-05, + "loss": 0.6452, + "step": 39450 + }, + { + "epoch": 0.3488392651921003, + "grad_norm": 2.053473711013794, + "learning_rate": 4.418601224679833e-05, + "loss": 0.7143, + "step": 39460 + }, + { + "epoch": 0.3489276684524125, + "grad_norm": 1.4256279468536377, + "learning_rate": 4.4184538859126464e-05, + "loss": 0.7698, + "step": 39470 + }, + { + "epoch": 0.34901607171272475, + "grad_norm": 19.723196029663086, + "learning_rate": 4.4183065471454586e-05, + "loss": 0.7785, + "step": 39480 + }, + { + "epoch": 0.349104474973037, + "grad_norm": 2.6459712982177734, + "learning_rate": 4.418159208378272e-05, + "loss": 0.6502, + "step": 39490 + }, + { + "epoch": 0.34919287823334927, + "grad_norm": 2.494441032409668, + "learning_rate": 4.418011869611085e-05, + "loss": 0.7649, + "step": 39500 + }, + { + "epoch": 0.3492812814936615, + "grad_norm": 3.158606767654419, + "learning_rate": 4.417864530843898e-05, + "loss": 0.7775, + "step": 39510 + }, + { + "epoch": 0.34936968475397373, + "grad_norm": 12.12488079071045, + "learning_rate": 4.4177171920767106e-05, + "loss": 0.7198, + "step": 39520 + }, + { + "epoch": 0.34945808801428596, + "grad_norm": 2.545377254486084, + "learning_rate": 4.4175698533095235e-05, + "loss": 0.7555, + "step": 39530 + }, + { + "epoch": 0.3495464912745982, + "grad_norm": 2.610600233078003, + "learning_rate": 4.417422514542336e-05, + "loss": 0.6876, + "step": 39540 + }, + { + "epoch": 0.3496348945349104, + "grad_norm": 4.401505470275879, + "learning_rate": 4.41727517577515e-05, + "loss": 0.6802, + "step": 39550 + }, + { + "epoch": 0.3497232977952227, + "grad_norm": 13.02286434173584, + "learning_rate": 4.417127837007962e-05, + "loss": 0.7054, + "step": 39560 + }, + { + "epoch": 0.34981170105553494, + "grad_norm": 5.426609039306641, + "learning_rate": 4.4169804982407755e-05, + "loss": 0.7053, + "step": 39570 + }, + { + "epoch": 0.34990010431584717, + "grad_norm": 3.6680009365081787, + "learning_rate": 4.416833159473588e-05, + "loss": 0.6769, + "step": 39580 + }, + { + "epoch": 0.3499885075761594, + "grad_norm": 6.889299392700195, + "learning_rate": 4.416685820706401e-05, + "loss": 0.8272, + "step": 39590 + }, + { + "epoch": 0.35007691083647163, + "grad_norm": 2.7408905029296875, + "learning_rate": 4.416538481939214e-05, + "loss": 0.7558, + "step": 39600 + }, + { + "epoch": 0.35016531409678386, + "grad_norm": 6.113536357879639, + "learning_rate": 4.4163911431720275e-05, + "loss": 0.7436, + "step": 39610 + }, + { + "epoch": 0.35025371735709615, + "grad_norm": 9.523975372314453, + "learning_rate": 4.4162438044048396e-05, + "loss": 0.648, + "step": 39620 + }, + { + "epoch": 0.3503421206174084, + "grad_norm": 2.6401007175445557, + "learning_rate": 4.416096465637653e-05, + "loss": 0.8088, + "step": 39630 + }, + { + "epoch": 0.3504305238777206, + "grad_norm": 5.548010349273682, + "learning_rate": 4.415949126870466e-05, + "loss": 0.6727, + "step": 39640 + }, + { + "epoch": 0.35051892713803284, + "grad_norm": 5.084409713745117, + "learning_rate": 4.415801788103279e-05, + "loss": 0.7815, + "step": 39650 + }, + { + "epoch": 0.3506073303983451, + "grad_norm": 6.08608865737915, + "learning_rate": 4.415654449336092e-05, + "loss": 0.7262, + "step": 39660 + }, + { + "epoch": 0.3506957336586573, + "grad_norm": 4.022087097167969, + "learning_rate": 4.4155071105689045e-05, + "loss": 0.7984, + "step": 39670 + }, + { + "epoch": 0.3507841369189696, + "grad_norm": 5.5862531661987305, + "learning_rate": 4.415359771801717e-05, + "loss": 0.9135, + "step": 39680 + }, + { + "epoch": 0.3508725401792818, + "grad_norm": 5.848991870880127, + "learning_rate": 4.415212433034531e-05, + "loss": 0.6956, + "step": 39690 + }, + { + "epoch": 0.35096094343959405, + "grad_norm": 6.420487880706787, + "learning_rate": 4.415065094267343e-05, + "loss": 0.761, + "step": 39700 + }, + { + "epoch": 0.3510493466999063, + "grad_norm": 6.251640319824219, + "learning_rate": 4.4149177555001565e-05, + "loss": 0.6705, + "step": 39710 + }, + { + "epoch": 0.3511377499602185, + "grad_norm": 9.221297264099121, + "learning_rate": 4.4147704167329694e-05, + "loss": 0.7528, + "step": 39720 + }, + { + "epoch": 0.35122615322053075, + "grad_norm": 1.613277554512024, + "learning_rate": 4.414623077965782e-05, + "loss": 0.7531, + "step": 39730 + }, + { + "epoch": 0.35131455648084303, + "grad_norm": 1.8966875076293945, + "learning_rate": 4.414475739198595e-05, + "loss": 0.7017, + "step": 39740 + }, + { + "epoch": 0.35140295974115526, + "grad_norm": 1.356676697731018, + "learning_rate": 4.4143284004314085e-05, + "loss": 0.9041, + "step": 39750 + }, + { + "epoch": 0.3514913630014675, + "grad_norm": 2.4049720764160156, + "learning_rate": 4.414181061664221e-05, + "loss": 0.8855, + "step": 39760 + }, + { + "epoch": 0.3515797662617797, + "grad_norm": 2.64721417427063, + "learning_rate": 4.414033722897034e-05, + "loss": 0.7483, + "step": 39770 + }, + { + "epoch": 0.35166816952209196, + "grad_norm": 2.6219327449798584, + "learning_rate": 4.4138863841298464e-05, + "loss": 0.7394, + "step": 39780 + }, + { + "epoch": 0.35175657278240424, + "grad_norm": 2.7238476276397705, + "learning_rate": 4.41373904536266e-05, + "loss": 0.6265, + "step": 39790 + }, + { + "epoch": 0.3518449760427165, + "grad_norm": 2.0804193019866943, + "learning_rate": 4.413591706595473e-05, + "loss": 0.5891, + "step": 39800 + }, + { + "epoch": 0.3519333793030287, + "grad_norm": 2.7444496154785156, + "learning_rate": 4.4134443678282856e-05, + "loss": 0.6599, + "step": 39810 + }, + { + "epoch": 0.35202178256334093, + "grad_norm": 5.589512348175049, + "learning_rate": 4.4132970290610984e-05, + "loss": 0.6934, + "step": 39820 + }, + { + "epoch": 0.35211018582365317, + "grad_norm": 5.298748970031738, + "learning_rate": 4.413149690293912e-05, + "loss": 0.9626, + "step": 39830 + }, + { + "epoch": 0.3521985890839654, + "grad_norm": 1.9821408987045288, + "learning_rate": 4.413002351526724e-05, + "loss": 0.7993, + "step": 39840 + }, + { + "epoch": 0.3522869923442777, + "grad_norm": 2.8862950801849365, + "learning_rate": 4.4128550127595376e-05, + "loss": 0.7875, + "step": 39850 + }, + { + "epoch": 0.3523753956045899, + "grad_norm": 5.353822231292725, + "learning_rate": 4.4127076739923504e-05, + "loss": 0.8324, + "step": 39860 + }, + { + "epoch": 0.35246379886490214, + "grad_norm": 6.620767593383789, + "learning_rate": 4.412560335225163e-05, + "loss": 0.72, + "step": 39870 + }, + { + "epoch": 0.3525522021252144, + "grad_norm": 10.123428344726562, + "learning_rate": 4.412412996457976e-05, + "loss": 0.6088, + "step": 39880 + }, + { + "epoch": 0.3526406053855266, + "grad_norm": 2.5653140544891357, + "learning_rate": 4.412265657690789e-05, + "loss": 0.7364, + "step": 39890 + }, + { + "epoch": 0.35272900864583884, + "grad_norm": 3.4880590438842773, + "learning_rate": 4.412118318923602e-05, + "loss": 0.612, + "step": 39900 + }, + { + "epoch": 0.3528174119061511, + "grad_norm": 4.04166841506958, + "learning_rate": 4.411970980156415e-05, + "loss": 0.6852, + "step": 39910 + }, + { + "epoch": 0.35290581516646335, + "grad_norm": 12.76233959197998, + "learning_rate": 4.411823641389228e-05, + "loss": 0.7562, + "step": 39920 + }, + { + "epoch": 0.3529942184267756, + "grad_norm": 2.44209361076355, + "learning_rate": 4.411676302622041e-05, + "loss": 0.6899, + "step": 39930 + }, + { + "epoch": 0.3530826216870878, + "grad_norm": 2.385620355606079, + "learning_rate": 4.411528963854854e-05, + "loss": 0.7664, + "step": 39940 + }, + { + "epoch": 0.35317102494740005, + "grad_norm": 6.197722911834717, + "learning_rate": 4.4113816250876666e-05, + "loss": 0.7016, + "step": 39950 + }, + { + "epoch": 0.3532594282077123, + "grad_norm": 2.3152050971984863, + "learning_rate": 4.4112342863204794e-05, + "loss": 0.7747, + "step": 39960 + }, + { + "epoch": 0.35334783146802456, + "grad_norm": 4.877121448516846, + "learning_rate": 4.411086947553293e-05, + "loss": 0.7875, + "step": 39970 + }, + { + "epoch": 0.3534362347283368, + "grad_norm": 2.164944648742676, + "learning_rate": 4.410939608786106e-05, + "loss": 0.6888, + "step": 39980 + }, + { + "epoch": 0.353524637988649, + "grad_norm": 1.504726767539978, + "learning_rate": 4.4107922700189186e-05, + "loss": 0.7891, + "step": 39990 + }, + { + "epoch": 0.35361304124896126, + "grad_norm": 4.400787353515625, + "learning_rate": 4.4106449312517315e-05, + "loss": 0.7454, + "step": 40000 + }, + { + "epoch": 0.3537014445092735, + "grad_norm": 2.773509979248047, + "learning_rate": 4.410497592484544e-05, + "loss": 0.6843, + "step": 40010 + }, + { + "epoch": 0.3537898477695857, + "grad_norm": 3.336489677429199, + "learning_rate": 4.410350253717357e-05, + "loss": 0.7951, + "step": 40020 + }, + { + "epoch": 0.353878251029898, + "grad_norm": 1.953701138496399, + "learning_rate": 4.41020291495017e-05, + "loss": 0.7838, + "step": 40030 + }, + { + "epoch": 0.35396665429021024, + "grad_norm": 7.4070143699646, + "learning_rate": 4.4100555761829835e-05, + "loss": 0.729, + "step": 40040 + }, + { + "epoch": 0.35405505755052247, + "grad_norm": 5.879505634307861, + "learning_rate": 4.409908237415796e-05, + "loss": 0.5443, + "step": 40050 + }, + { + "epoch": 0.3541434608108347, + "grad_norm": 1.577394962310791, + "learning_rate": 4.409760898648609e-05, + "loss": 0.7443, + "step": 40060 + }, + { + "epoch": 0.35423186407114693, + "grad_norm": 12.447354316711426, + "learning_rate": 4.409613559881422e-05, + "loss": 0.8277, + "step": 40070 + }, + { + "epoch": 0.35432026733145916, + "grad_norm": 1.5400261878967285, + "learning_rate": 4.409466221114235e-05, + "loss": 0.6151, + "step": 40080 + }, + { + "epoch": 0.35440867059177145, + "grad_norm": 7.868757247924805, + "learning_rate": 4.4093188823470477e-05, + "loss": 0.7102, + "step": 40090 + }, + { + "epoch": 0.3544970738520837, + "grad_norm": 2.3401095867156982, + "learning_rate": 4.409171543579861e-05, + "loss": 0.814, + "step": 40100 + }, + { + "epoch": 0.3545854771123959, + "grad_norm": 5.074752330780029, + "learning_rate": 4.409024204812674e-05, + "loss": 0.7593, + "step": 40110 + }, + { + "epoch": 0.35467388037270814, + "grad_norm": 12.343538284301758, + "learning_rate": 4.408876866045487e-05, + "loss": 0.6912, + "step": 40120 + }, + { + "epoch": 0.35476228363302037, + "grad_norm": 7.01021146774292, + "learning_rate": 4.4087295272783e-05, + "loss": 0.6184, + "step": 40130 + }, + { + "epoch": 0.3548506868933326, + "grad_norm": 5.7294721603393555, + "learning_rate": 4.4085821885111125e-05, + "loss": 0.6806, + "step": 40140 + }, + { + "epoch": 0.3549390901536449, + "grad_norm": 5.026396751403809, + "learning_rate": 4.4084348497439253e-05, + "loss": 0.614, + "step": 40150 + }, + { + "epoch": 0.3550274934139571, + "grad_norm": 1.325736403465271, + "learning_rate": 4.408287510976739e-05, + "loss": 0.6932, + "step": 40160 + }, + { + "epoch": 0.35511589667426935, + "grad_norm": 3.47292423248291, + "learning_rate": 4.408140172209551e-05, + "loss": 0.8155, + "step": 40170 + }, + { + "epoch": 0.3552042999345816, + "grad_norm": 5.075028896331787, + "learning_rate": 4.4079928334423645e-05, + "loss": 0.7405, + "step": 40180 + }, + { + "epoch": 0.3552927031948938, + "grad_norm": 6.216986656188965, + "learning_rate": 4.4078454946751774e-05, + "loss": 0.7883, + "step": 40190 + }, + { + "epoch": 0.35538110645520604, + "grad_norm": 3.336992025375366, + "learning_rate": 4.40769815590799e-05, + "loss": 0.744, + "step": 40200 + }, + { + "epoch": 0.35546950971551833, + "grad_norm": 1.6610621213912964, + "learning_rate": 4.407550817140803e-05, + "loss": 0.6141, + "step": 40210 + }, + { + "epoch": 0.35555791297583056, + "grad_norm": 4.76738166809082, + "learning_rate": 4.4074034783736165e-05, + "loss": 0.6706, + "step": 40220 + }, + { + "epoch": 0.3556463162361428, + "grad_norm": 9.403095245361328, + "learning_rate": 4.407256139606429e-05, + "loss": 0.6813, + "step": 40230 + }, + { + "epoch": 0.355734719496455, + "grad_norm": 7.407747268676758, + "learning_rate": 4.407108800839242e-05, + "loss": 0.6746, + "step": 40240 + }, + { + "epoch": 0.35582312275676725, + "grad_norm": 17.36669158935547, + "learning_rate": 4.4069614620720544e-05, + "loss": 0.6397, + "step": 40250 + }, + { + "epoch": 0.3559115260170795, + "grad_norm": 6.369690895080566, + "learning_rate": 4.406814123304868e-05, + "loss": 0.768, + "step": 40260 + }, + { + "epoch": 0.35599992927739177, + "grad_norm": 3.3993122577667236, + "learning_rate": 4.406666784537681e-05, + "loss": 0.7498, + "step": 40270 + }, + { + "epoch": 0.356088332537704, + "grad_norm": 2.4197874069213867, + "learning_rate": 4.4065194457704936e-05, + "loss": 0.8067, + "step": 40280 + }, + { + "epoch": 0.35617673579801623, + "grad_norm": 1.7032784223556519, + "learning_rate": 4.4063721070033064e-05, + "loss": 0.589, + "step": 40290 + }, + { + "epoch": 0.35626513905832846, + "grad_norm": 5.97502326965332, + "learning_rate": 4.40622476823612e-05, + "loss": 0.7253, + "step": 40300 + }, + { + "epoch": 0.3563535423186407, + "grad_norm": 3.3060402870178223, + "learning_rate": 4.406077429468932e-05, + "loss": 0.6148, + "step": 40310 + }, + { + "epoch": 0.356441945578953, + "grad_norm": 4.172302722930908, + "learning_rate": 4.4059300907017456e-05, + "loss": 0.6672, + "step": 40320 + }, + { + "epoch": 0.3565303488392652, + "grad_norm": 3.88161301612854, + "learning_rate": 4.4057827519345584e-05, + "loss": 0.6523, + "step": 40330 + }, + { + "epoch": 0.35661875209957744, + "grad_norm": 3.1442339420318604, + "learning_rate": 4.405635413167371e-05, + "loss": 0.6888, + "step": 40340 + }, + { + "epoch": 0.35670715535988967, + "grad_norm": 3.7767527103424072, + "learning_rate": 4.405488074400184e-05, + "loss": 0.7837, + "step": 40350 + }, + { + "epoch": 0.3567955586202019, + "grad_norm": 2.4356746673583984, + "learning_rate": 4.405340735632997e-05, + "loss": 0.7589, + "step": 40360 + }, + { + "epoch": 0.35688396188051413, + "grad_norm": 1.566416621208191, + "learning_rate": 4.40519339686581e-05, + "loss": 0.7081, + "step": 40370 + }, + { + "epoch": 0.3569723651408264, + "grad_norm": 5.95517110824585, + "learning_rate": 4.405046058098623e-05, + "loss": 0.8114, + "step": 40380 + }, + { + "epoch": 0.35706076840113865, + "grad_norm": 2.393517255783081, + "learning_rate": 4.4048987193314354e-05, + "loss": 0.7004, + "step": 40390 + }, + { + "epoch": 0.3571491716614509, + "grad_norm": 3.452021598815918, + "learning_rate": 4.404751380564249e-05, + "loss": 0.565, + "step": 40400 + }, + { + "epoch": 0.3572375749217631, + "grad_norm": 2.5372750759124756, + "learning_rate": 4.404604041797062e-05, + "loss": 0.7225, + "step": 40410 + }, + { + "epoch": 0.35732597818207534, + "grad_norm": 5.6464338302612305, + "learning_rate": 4.4044567030298746e-05, + "loss": 0.7225, + "step": 40420 + }, + { + "epoch": 0.3574143814423876, + "grad_norm": 4.8920392990112305, + "learning_rate": 4.4043093642626874e-05, + "loss": 0.8245, + "step": 40430 + }, + { + "epoch": 0.35750278470269986, + "grad_norm": 2.4476685523986816, + "learning_rate": 4.404162025495501e-05, + "loss": 0.6622, + "step": 40440 + }, + { + "epoch": 0.3575911879630121, + "grad_norm": 9.860517501831055, + "learning_rate": 4.404014686728313e-05, + "loss": 0.5973, + "step": 40450 + }, + { + "epoch": 0.3576795912233243, + "grad_norm": 2.5402731895446777, + "learning_rate": 4.4038673479611266e-05, + "loss": 0.6907, + "step": 40460 + }, + { + "epoch": 0.35776799448363655, + "grad_norm": 6.171511650085449, + "learning_rate": 4.403720009193939e-05, + "loss": 0.6051, + "step": 40470 + }, + { + "epoch": 0.3578563977439488, + "grad_norm": 1.7023460865020752, + "learning_rate": 4.403572670426752e-05, + "loss": 0.7183, + "step": 40480 + }, + { + "epoch": 0.357944801004261, + "grad_norm": 3.548947811126709, + "learning_rate": 4.403425331659565e-05, + "loss": 0.6988, + "step": 40490 + }, + { + "epoch": 0.3580332042645733, + "grad_norm": 2.4574687480926514, + "learning_rate": 4.403277992892378e-05, + "loss": 0.7046, + "step": 40500 + }, + { + "epoch": 0.35812160752488553, + "grad_norm": 7.232769012451172, + "learning_rate": 4.403130654125191e-05, + "loss": 0.6473, + "step": 40510 + }, + { + "epoch": 0.35821001078519776, + "grad_norm": 10.139755249023438, + "learning_rate": 4.402983315358004e-05, + "loss": 0.6927, + "step": 40520 + }, + { + "epoch": 0.35829841404551, + "grad_norm": 4.592871189117432, + "learning_rate": 4.4028359765908165e-05, + "loss": 0.7979, + "step": 40530 + }, + { + "epoch": 0.3583868173058222, + "grad_norm": 18.107608795166016, + "learning_rate": 4.40268863782363e-05, + "loss": 0.7414, + "step": 40540 + }, + { + "epoch": 0.35847522056613446, + "grad_norm": 2.707333564758301, + "learning_rate": 4.402541299056443e-05, + "loss": 0.6631, + "step": 40550 + }, + { + "epoch": 0.35856362382644674, + "grad_norm": 5.194340229034424, + "learning_rate": 4.402393960289256e-05, + "loss": 0.6022, + "step": 40560 + }, + { + "epoch": 0.358652027086759, + "grad_norm": 1.5801736116409302, + "learning_rate": 4.4022466215220685e-05, + "loss": 0.7492, + "step": 40570 + }, + { + "epoch": 0.3587404303470712, + "grad_norm": 11.096145629882812, + "learning_rate": 4.402099282754882e-05, + "loss": 0.7574, + "step": 40580 + }, + { + "epoch": 0.35882883360738344, + "grad_norm": 3.0222949981689453, + "learning_rate": 4.401951943987694e-05, + "loss": 0.6925, + "step": 40590 + }, + { + "epoch": 0.35891723686769567, + "grad_norm": 3.1919920444488525, + "learning_rate": 4.401804605220508e-05, + "loss": 0.9316, + "step": 40600 + }, + { + "epoch": 0.3590056401280079, + "grad_norm": 1.5551713705062866, + "learning_rate": 4.40165726645332e-05, + "loss": 0.8169, + "step": 40610 + }, + { + "epoch": 0.3590940433883202, + "grad_norm": 6.985397815704346, + "learning_rate": 4.4015099276861334e-05, + "loss": 0.6582, + "step": 40620 + }, + { + "epoch": 0.3591824466486324, + "grad_norm": 1.5592365264892578, + "learning_rate": 4.401362588918946e-05, + "loss": 0.7288, + "step": 40630 + }, + { + "epoch": 0.35927084990894465, + "grad_norm": 6.607183933258057, + "learning_rate": 4.401215250151759e-05, + "loss": 0.7035, + "step": 40640 + }, + { + "epoch": 0.3593592531692569, + "grad_norm": 5.1960368156433105, + "learning_rate": 4.401067911384572e-05, + "loss": 0.8285, + "step": 40650 + }, + { + "epoch": 0.3594476564295691, + "grad_norm": 3.718777656555176, + "learning_rate": 4.4009205726173854e-05, + "loss": 0.6918, + "step": 40660 + }, + { + "epoch": 0.35953605968988134, + "grad_norm": 4.332536220550537, + "learning_rate": 4.4007732338501975e-05, + "loss": 0.8502, + "step": 40670 + }, + { + "epoch": 0.3596244629501936, + "grad_norm": 3.90313720703125, + "learning_rate": 4.400625895083011e-05, + "loss": 0.7566, + "step": 40680 + }, + { + "epoch": 0.35971286621050585, + "grad_norm": 3.1796460151672363, + "learning_rate": 4.400478556315824e-05, + "loss": 0.6631, + "step": 40690 + }, + { + "epoch": 0.3598012694708181, + "grad_norm": 2.104417085647583, + "learning_rate": 4.400331217548637e-05, + "loss": 0.8594, + "step": 40700 + }, + { + "epoch": 0.3598896727311303, + "grad_norm": 8.83961296081543, + "learning_rate": 4.4001838787814495e-05, + "loss": 0.7134, + "step": 40710 + }, + { + "epoch": 0.35997807599144255, + "grad_norm": 9.377950668334961, + "learning_rate": 4.4000365400142624e-05, + "loss": 0.6579, + "step": 40720 + }, + { + "epoch": 0.3600664792517548, + "grad_norm": 5.622836112976074, + "learning_rate": 4.399889201247075e-05, + "loss": 0.6832, + "step": 40730 + }, + { + "epoch": 0.36015488251206706, + "grad_norm": 2.3973677158355713, + "learning_rate": 4.399741862479889e-05, + "loss": 0.6981, + "step": 40740 + }, + { + "epoch": 0.3602432857723793, + "grad_norm": 3.3558311462402344, + "learning_rate": 4.399594523712701e-05, + "loss": 0.7008, + "step": 40750 + }, + { + "epoch": 0.3603316890326915, + "grad_norm": 3.977409601211548, + "learning_rate": 4.3994471849455144e-05, + "loss": 0.794, + "step": 40760 + }, + { + "epoch": 0.36042009229300376, + "grad_norm": 0.9260974526405334, + "learning_rate": 4.399299846178327e-05, + "loss": 0.6739, + "step": 40770 + }, + { + "epoch": 0.360508495553316, + "grad_norm": 2.4397428035736084, + "learning_rate": 4.39915250741114e-05, + "loss": 0.7254, + "step": 40780 + }, + { + "epoch": 0.3605968988136282, + "grad_norm": 3.673962354660034, + "learning_rate": 4.399005168643953e-05, + "loss": 0.7404, + "step": 40790 + }, + { + "epoch": 0.3606853020739405, + "grad_norm": 15.185490608215332, + "learning_rate": 4.3988578298767664e-05, + "loss": 0.7712, + "step": 40800 + }, + { + "epoch": 0.36077370533425274, + "grad_norm": 2.7475123405456543, + "learning_rate": 4.3987104911095786e-05, + "loss": 0.7129, + "step": 40810 + }, + { + "epoch": 0.36086210859456497, + "grad_norm": 4.323202610015869, + "learning_rate": 4.398563152342392e-05, + "loss": 0.6802, + "step": 40820 + }, + { + "epoch": 0.3609505118548772, + "grad_norm": 1.4806162118911743, + "learning_rate": 4.398415813575205e-05, + "loss": 0.7264, + "step": 40830 + }, + { + "epoch": 0.36103891511518943, + "grad_norm": 3.3781514167785645, + "learning_rate": 4.398268474808018e-05, + "loss": 0.6933, + "step": 40840 + }, + { + "epoch": 0.3611273183755017, + "grad_norm": 2.4755170345306396, + "learning_rate": 4.3981211360408306e-05, + "loss": 0.8133, + "step": 40850 + }, + { + "epoch": 0.36121572163581395, + "grad_norm": 6.750313758850098, + "learning_rate": 4.3979737972736434e-05, + "loss": 0.7433, + "step": 40860 + }, + { + "epoch": 0.3613041248961262, + "grad_norm": 10.428040504455566, + "learning_rate": 4.397826458506456e-05, + "loss": 0.6626, + "step": 40870 + }, + { + "epoch": 0.3613925281564384, + "grad_norm": 8.72851848602295, + "learning_rate": 4.39767911973927e-05, + "loss": 0.6704, + "step": 40880 + }, + { + "epoch": 0.36148093141675064, + "grad_norm": 15.175640106201172, + "learning_rate": 4.3975317809720826e-05, + "loss": 0.7522, + "step": 40890 + }, + { + "epoch": 0.36156933467706287, + "grad_norm": 6.841087818145752, + "learning_rate": 4.3973844422048955e-05, + "loss": 0.6681, + "step": 40900 + }, + { + "epoch": 0.36165773793737516, + "grad_norm": 7.627110958099365, + "learning_rate": 4.397237103437708e-05, + "loss": 0.7088, + "step": 40910 + }, + { + "epoch": 0.3617461411976874, + "grad_norm": 1.9037063121795654, + "learning_rate": 4.397089764670521e-05, + "loss": 0.6926, + "step": 40920 + }, + { + "epoch": 0.3618345444579996, + "grad_norm": 4.059687614440918, + "learning_rate": 4.396942425903334e-05, + "loss": 0.67, + "step": 40930 + }, + { + "epoch": 0.36192294771831185, + "grad_norm": 2.2511236667633057, + "learning_rate": 4.396795087136147e-05, + "loss": 0.738, + "step": 40940 + }, + { + "epoch": 0.3620113509786241, + "grad_norm": 2.366767406463623, + "learning_rate": 4.39664774836896e-05, + "loss": 0.6747, + "step": 40950 + }, + { + "epoch": 0.3620997542389363, + "grad_norm": 3.485605239868164, + "learning_rate": 4.396500409601773e-05, + "loss": 0.6978, + "step": 40960 + }, + { + "epoch": 0.3621881574992486, + "grad_norm": 1.3318157196044922, + "learning_rate": 4.396353070834586e-05, + "loss": 0.7764, + "step": 40970 + }, + { + "epoch": 0.36227656075956083, + "grad_norm": 2.444885492324829, + "learning_rate": 4.396205732067399e-05, + "loss": 0.7502, + "step": 40980 + }, + { + "epoch": 0.36236496401987306, + "grad_norm": 11.757702827453613, + "learning_rate": 4.3960583933002116e-05, + "loss": 0.8271, + "step": 40990 + }, + { + "epoch": 0.3624533672801853, + "grad_norm": 3.8691651821136475, + "learning_rate": 4.3959110545330245e-05, + "loss": 0.7425, + "step": 41000 + }, + { + "epoch": 0.3625417705404975, + "grad_norm": 6.782963752746582, + "learning_rate": 4.395763715765838e-05, + "loss": 0.7009, + "step": 41010 + }, + { + "epoch": 0.36263017380080975, + "grad_norm": 8.647643089294434, + "learning_rate": 4.395616376998651e-05, + "loss": 0.689, + "step": 41020 + }, + { + "epoch": 0.36271857706112204, + "grad_norm": 6.964686393737793, + "learning_rate": 4.395469038231464e-05, + "loss": 0.576, + "step": 41030 + }, + { + "epoch": 0.36280698032143427, + "grad_norm": 10.234725952148438, + "learning_rate": 4.3953216994642765e-05, + "loss": 0.6068, + "step": 41040 + }, + { + "epoch": 0.3628953835817465, + "grad_norm": 5.168300151824951, + "learning_rate": 4.3951743606970893e-05, + "loss": 0.8187, + "step": 41050 + }, + { + "epoch": 0.36298378684205873, + "grad_norm": 2.50061297416687, + "learning_rate": 4.395027021929902e-05, + "loss": 0.5756, + "step": 41060 + }, + { + "epoch": 0.36307219010237096, + "grad_norm": 5.854412078857422, + "learning_rate": 4.394879683162716e-05, + "loss": 0.796, + "step": 41070 + }, + { + "epoch": 0.3631605933626832, + "grad_norm": 4.857027053833008, + "learning_rate": 4.394732344395528e-05, + "loss": 0.7482, + "step": 41080 + }, + { + "epoch": 0.3632489966229955, + "grad_norm": 5.145595550537109, + "learning_rate": 4.3945850056283414e-05, + "loss": 0.727, + "step": 41090 + }, + { + "epoch": 0.3633373998833077, + "grad_norm": 4.18808650970459, + "learning_rate": 4.394437666861154e-05, + "loss": 0.7778, + "step": 41100 + }, + { + "epoch": 0.36342580314361994, + "grad_norm": 4.522815227508545, + "learning_rate": 4.394290328093967e-05, + "loss": 0.7866, + "step": 41110 + }, + { + "epoch": 0.36351420640393217, + "grad_norm": 2.089757204055786, + "learning_rate": 4.39414298932678e-05, + "loss": 0.7501, + "step": 41120 + }, + { + "epoch": 0.3636026096642444, + "grad_norm": 14.277029991149902, + "learning_rate": 4.3939956505595934e-05, + "loss": 0.7619, + "step": 41130 + }, + { + "epoch": 0.36369101292455663, + "grad_norm": 2.5681841373443604, + "learning_rate": 4.3938483117924055e-05, + "loss": 0.7376, + "step": 41140 + }, + { + "epoch": 0.3637794161848689, + "grad_norm": 5.73472785949707, + "learning_rate": 4.393700973025219e-05, + "loss": 0.7434, + "step": 41150 + }, + { + "epoch": 0.36386781944518115, + "grad_norm": 2.0774855613708496, + "learning_rate": 4.393553634258032e-05, + "loss": 0.6711, + "step": 41160 + }, + { + "epoch": 0.3639562227054934, + "grad_norm": 1.8315597772598267, + "learning_rate": 4.393406295490845e-05, + "loss": 0.7965, + "step": 41170 + }, + { + "epoch": 0.3640446259658056, + "grad_norm": 1.7054177522659302, + "learning_rate": 4.3932589567236576e-05, + "loss": 0.6805, + "step": 41180 + }, + { + "epoch": 0.36413302922611784, + "grad_norm": 3.2949397563934326, + "learning_rate": 4.3931116179564704e-05, + "loss": 0.7417, + "step": 41190 + }, + { + "epoch": 0.3642214324864301, + "grad_norm": 1.2452337741851807, + "learning_rate": 4.392964279189283e-05, + "loss": 0.5739, + "step": 41200 + }, + { + "epoch": 0.36430983574674236, + "grad_norm": 1.940799355506897, + "learning_rate": 4.392816940422097e-05, + "loss": 0.6658, + "step": 41210 + }, + { + "epoch": 0.3643982390070546, + "grad_norm": 20.6791934967041, + "learning_rate": 4.392669601654909e-05, + "loss": 0.8094, + "step": 41220 + }, + { + "epoch": 0.3644866422673668, + "grad_norm": 3.6161069869995117, + "learning_rate": 4.3925222628877224e-05, + "loss": 0.8835, + "step": 41230 + }, + { + "epoch": 0.36457504552767905, + "grad_norm": 9.129081726074219, + "learning_rate": 4.392374924120535e-05, + "loss": 0.9506, + "step": 41240 + }, + { + "epoch": 0.3646634487879913, + "grad_norm": 2.6817915439605713, + "learning_rate": 4.392227585353348e-05, + "loss": 0.8211, + "step": 41250 + }, + { + "epoch": 0.3647518520483035, + "grad_norm": 3.572265386581421, + "learning_rate": 4.392080246586161e-05, + "loss": 0.7337, + "step": 41260 + }, + { + "epoch": 0.3648402553086158, + "grad_norm": 1.5965874195098877, + "learning_rate": 4.3919329078189744e-05, + "loss": 0.7683, + "step": 41270 + }, + { + "epoch": 0.36492865856892803, + "grad_norm": 6.551940441131592, + "learning_rate": 4.3917855690517866e-05, + "loss": 0.8188, + "step": 41280 + }, + { + "epoch": 0.36501706182924026, + "grad_norm": 5.230619430541992, + "learning_rate": 4.3916382302846e-05, + "loss": 0.6382, + "step": 41290 + }, + { + "epoch": 0.3651054650895525, + "grad_norm": 7.282522678375244, + "learning_rate": 4.391490891517412e-05, + "loss": 0.7625, + "step": 41300 + }, + { + "epoch": 0.3651938683498647, + "grad_norm": 2.7996153831481934, + "learning_rate": 4.391343552750226e-05, + "loss": 0.713, + "step": 41310 + }, + { + "epoch": 0.36528227161017696, + "grad_norm": 3.4378294944763184, + "learning_rate": 4.3911962139830386e-05, + "loss": 0.67, + "step": 41320 + }, + { + "epoch": 0.36537067487048924, + "grad_norm": 6.836731433868408, + "learning_rate": 4.3910488752158514e-05, + "loss": 0.6547, + "step": 41330 + }, + { + "epoch": 0.3654590781308015, + "grad_norm": 6.535715579986572, + "learning_rate": 4.390901536448664e-05, + "loss": 0.7291, + "step": 41340 + }, + { + "epoch": 0.3655474813911137, + "grad_norm": 4.123851299285889, + "learning_rate": 4.390754197681478e-05, + "loss": 0.6429, + "step": 41350 + }, + { + "epoch": 0.36563588465142594, + "grad_norm": 15.508459091186523, + "learning_rate": 4.39060685891429e-05, + "loss": 0.673, + "step": 41360 + }, + { + "epoch": 0.36572428791173817, + "grad_norm": 7.07671594619751, + "learning_rate": 4.3904595201471035e-05, + "loss": 0.6897, + "step": 41370 + }, + { + "epoch": 0.36581269117205045, + "grad_norm": 7.29509162902832, + "learning_rate": 4.390312181379916e-05, + "loss": 0.752, + "step": 41380 + }, + { + "epoch": 0.3659010944323627, + "grad_norm": 3.431602954864502, + "learning_rate": 4.390164842612729e-05, + "loss": 0.6463, + "step": 41390 + }, + { + "epoch": 0.3659894976926749, + "grad_norm": 9.897151947021484, + "learning_rate": 4.390017503845542e-05, + "loss": 0.665, + "step": 41400 + }, + { + "epoch": 0.36607790095298715, + "grad_norm": 6.333930969238281, + "learning_rate": 4.389870165078355e-05, + "loss": 0.8146, + "step": 41410 + }, + { + "epoch": 0.3661663042132994, + "grad_norm": 3.1365952491760254, + "learning_rate": 4.3897228263111676e-05, + "loss": 0.809, + "step": 41420 + }, + { + "epoch": 0.3662547074736116, + "grad_norm": 1.5993609428405762, + "learning_rate": 4.389575487543981e-05, + "loss": 0.6031, + "step": 41430 + }, + { + "epoch": 0.3663431107339239, + "grad_norm": 6.380438327789307, + "learning_rate": 4.389428148776793e-05, + "loss": 0.778, + "step": 41440 + }, + { + "epoch": 0.3664315139942361, + "grad_norm": 14.338891983032227, + "learning_rate": 4.389280810009607e-05, + "loss": 0.6802, + "step": 41450 + }, + { + "epoch": 0.36651991725454836, + "grad_norm": 2.1834185123443604, + "learning_rate": 4.3891334712424197e-05, + "loss": 0.7517, + "step": 41460 + }, + { + "epoch": 0.3666083205148606, + "grad_norm": 6.721607208251953, + "learning_rate": 4.3889861324752325e-05, + "loss": 0.8268, + "step": 41470 + }, + { + "epoch": 0.3666967237751728, + "grad_norm": 7.450668811798096, + "learning_rate": 4.388838793708045e-05, + "loss": 0.9185, + "step": 41480 + }, + { + "epoch": 0.36678512703548505, + "grad_norm": 8.780129432678223, + "learning_rate": 4.388691454940859e-05, + "loss": 0.7457, + "step": 41490 + }, + { + "epoch": 0.36687353029579733, + "grad_norm": 1.755095362663269, + "learning_rate": 4.388544116173671e-05, + "loss": 0.6655, + "step": 41500 + }, + { + "epoch": 0.36696193355610957, + "grad_norm": 1.4471567869186401, + "learning_rate": 4.3883967774064845e-05, + "loss": 0.7387, + "step": 41510 + }, + { + "epoch": 0.3670503368164218, + "grad_norm": 3.916208267211914, + "learning_rate": 4.3882494386392973e-05, + "loss": 0.8674, + "step": 41520 + }, + { + "epoch": 0.367138740076734, + "grad_norm": 3.033754348754883, + "learning_rate": 4.38810209987211e-05, + "loss": 0.8175, + "step": 41530 + }, + { + "epoch": 0.36722714333704626, + "grad_norm": 2.02097487449646, + "learning_rate": 4.387954761104923e-05, + "loss": 0.8189, + "step": 41540 + }, + { + "epoch": 0.3673155465973585, + "grad_norm": 5.735281467437744, + "learning_rate": 4.387807422337736e-05, + "loss": 0.6936, + "step": 41550 + }, + { + "epoch": 0.3674039498576708, + "grad_norm": 4.488494396209717, + "learning_rate": 4.387660083570549e-05, + "loss": 0.7135, + "step": 41560 + }, + { + "epoch": 0.367492353117983, + "grad_norm": 1.9440511465072632, + "learning_rate": 4.387512744803362e-05, + "loss": 0.6788, + "step": 41570 + }, + { + "epoch": 0.36758075637829524, + "grad_norm": 2.082098960876465, + "learning_rate": 4.3873654060361744e-05, + "loss": 0.7994, + "step": 41580 + }, + { + "epoch": 0.36766915963860747, + "grad_norm": 5.62264347076416, + "learning_rate": 4.387218067268988e-05, + "loss": 0.6692, + "step": 41590 + }, + { + "epoch": 0.3677575628989197, + "grad_norm": 2.6430423259735107, + "learning_rate": 4.387070728501801e-05, + "loss": 0.8584, + "step": 41600 + }, + { + "epoch": 0.36784596615923193, + "grad_norm": 13.46376895904541, + "learning_rate": 4.3869233897346135e-05, + "loss": 0.7638, + "step": 41610 + }, + { + "epoch": 0.3679343694195442, + "grad_norm": 5.512721061706543, + "learning_rate": 4.3867760509674264e-05, + "loss": 0.6937, + "step": 41620 + }, + { + "epoch": 0.36802277267985645, + "grad_norm": 2.6101555824279785, + "learning_rate": 4.38662871220024e-05, + "loss": 0.7419, + "step": 41630 + }, + { + "epoch": 0.3681111759401687, + "grad_norm": 9.809114456176758, + "learning_rate": 4.386481373433052e-05, + "loss": 0.7475, + "step": 41640 + }, + { + "epoch": 0.3681995792004809, + "grad_norm": 2.038137435913086, + "learning_rate": 4.3863340346658656e-05, + "loss": 0.7122, + "step": 41650 + }, + { + "epoch": 0.36828798246079314, + "grad_norm": 17.743118286132812, + "learning_rate": 4.386186695898678e-05, + "loss": 0.7984, + "step": 41660 + }, + { + "epoch": 0.36837638572110537, + "grad_norm": 11.9349365234375, + "learning_rate": 4.386039357131491e-05, + "loss": 0.8233, + "step": 41670 + }, + { + "epoch": 0.36846478898141766, + "grad_norm": 3.9197232723236084, + "learning_rate": 4.385892018364304e-05, + "loss": 0.6308, + "step": 41680 + }, + { + "epoch": 0.3685531922417299, + "grad_norm": 8.652718544006348, + "learning_rate": 4.385744679597117e-05, + "loss": 0.7744, + "step": 41690 + }, + { + "epoch": 0.3686415955020421, + "grad_norm": 2.7132115364074707, + "learning_rate": 4.38559734082993e-05, + "loss": 0.8058, + "step": 41700 + }, + { + "epoch": 0.36872999876235435, + "grad_norm": 16.54650115966797, + "learning_rate": 4.385450002062743e-05, + "loss": 0.7016, + "step": 41710 + }, + { + "epoch": 0.3688184020226666, + "grad_norm": 10.114154815673828, + "learning_rate": 4.3853026632955554e-05, + "loss": 0.6669, + "step": 41720 + }, + { + "epoch": 0.3689068052829788, + "grad_norm": 4.78609037399292, + "learning_rate": 4.385155324528369e-05, + "loss": 0.7174, + "step": 41730 + }, + { + "epoch": 0.3689952085432911, + "grad_norm": 5.867161273956299, + "learning_rate": 4.385007985761182e-05, + "loss": 0.7428, + "step": 41740 + }, + { + "epoch": 0.36908361180360333, + "grad_norm": 3.208101511001587, + "learning_rate": 4.3848606469939946e-05, + "loss": 0.7772, + "step": 41750 + }, + { + "epoch": 0.36917201506391556, + "grad_norm": 1.9551610946655273, + "learning_rate": 4.3847133082268074e-05, + "loss": 0.7183, + "step": 41760 + }, + { + "epoch": 0.3692604183242278, + "grad_norm": 1.4658784866333008, + "learning_rate": 4.38456596945962e-05, + "loss": 0.7232, + "step": 41770 + }, + { + "epoch": 0.36934882158454, + "grad_norm": 3.3785741329193115, + "learning_rate": 4.384418630692433e-05, + "loss": 0.7762, + "step": 41780 + }, + { + "epoch": 0.36943722484485225, + "grad_norm": 4.0606465339660645, + "learning_rate": 4.3842712919252466e-05, + "loss": 0.6346, + "step": 41790 + }, + { + "epoch": 0.36952562810516454, + "grad_norm": 10.871868133544922, + "learning_rate": 4.3841239531580594e-05, + "loss": 0.6976, + "step": 41800 + }, + { + "epoch": 0.36961403136547677, + "grad_norm": 7.123082160949707, + "learning_rate": 4.383976614390872e-05, + "loss": 0.7326, + "step": 41810 + }, + { + "epoch": 0.369702434625789, + "grad_norm": 7.928825378417969, + "learning_rate": 4.383829275623685e-05, + "loss": 0.6402, + "step": 41820 + }, + { + "epoch": 0.36979083788610123, + "grad_norm": 1.1369304656982422, + "learning_rate": 4.383681936856498e-05, + "loss": 0.5528, + "step": 41830 + }, + { + "epoch": 0.36987924114641346, + "grad_norm": 1.3517223596572876, + "learning_rate": 4.383534598089311e-05, + "loss": 0.7826, + "step": 41840 + }, + { + "epoch": 0.3699676444067257, + "grad_norm": 18.013916015625, + "learning_rate": 4.383387259322124e-05, + "loss": 0.6546, + "step": 41850 + }, + { + "epoch": 0.370056047667038, + "grad_norm": 2.3677890300750732, + "learning_rate": 4.383239920554937e-05, + "loss": 0.6548, + "step": 41860 + }, + { + "epoch": 0.3701444509273502, + "grad_norm": 5.211153507232666, + "learning_rate": 4.38309258178775e-05, + "loss": 0.8121, + "step": 41870 + }, + { + "epoch": 0.37023285418766244, + "grad_norm": 2.7310221195220947, + "learning_rate": 4.382945243020563e-05, + "loss": 0.6616, + "step": 41880 + }, + { + "epoch": 0.37032125744797467, + "grad_norm": 4.248507976531982, + "learning_rate": 4.3827979042533756e-05, + "loss": 0.742, + "step": 41890 + }, + { + "epoch": 0.3704096607082869, + "grad_norm": 3.743333578109741, + "learning_rate": 4.3826505654861885e-05, + "loss": 0.7965, + "step": 41900 + }, + { + "epoch": 0.3704980639685992, + "grad_norm": 6.355827808380127, + "learning_rate": 4.382503226719001e-05, + "loss": 0.7357, + "step": 41910 + }, + { + "epoch": 0.3705864672289114, + "grad_norm": 3.2447757720947266, + "learning_rate": 4.382355887951815e-05, + "loss": 0.6423, + "step": 41920 + }, + { + "epoch": 0.37067487048922365, + "grad_norm": 6.498961925506592, + "learning_rate": 4.382208549184628e-05, + "loss": 0.6671, + "step": 41930 + }, + { + "epoch": 0.3707632737495359, + "grad_norm": 3.632401466369629, + "learning_rate": 4.3820612104174405e-05, + "loss": 0.7282, + "step": 41940 + }, + { + "epoch": 0.3708516770098481, + "grad_norm": 4.138214111328125, + "learning_rate": 4.381913871650253e-05, + "loss": 0.7385, + "step": 41950 + }, + { + "epoch": 0.37094008027016034, + "grad_norm": 10.882044792175293, + "learning_rate": 4.381766532883066e-05, + "loss": 0.7503, + "step": 41960 + }, + { + "epoch": 0.37102848353047263, + "grad_norm": 2.232042074203491, + "learning_rate": 4.381619194115879e-05, + "loss": 0.5926, + "step": 41970 + }, + { + "epoch": 0.37111688679078486, + "grad_norm": 9.09072494506836, + "learning_rate": 4.3814718553486925e-05, + "loss": 0.7787, + "step": 41980 + }, + { + "epoch": 0.3712052900510971, + "grad_norm": 2.8556175231933594, + "learning_rate": 4.3813245165815054e-05, + "loss": 0.8473, + "step": 41990 + }, + { + "epoch": 0.3712936933114093, + "grad_norm": 4.36794900894165, + "learning_rate": 4.381177177814318e-05, + "loss": 0.7505, + "step": 42000 + }, + { + "epoch": 0.37138209657172155, + "grad_norm": 3.274106740951538, + "learning_rate": 4.381029839047131e-05, + "loss": 0.6209, + "step": 42010 + }, + { + "epoch": 0.3714704998320338, + "grad_norm": 7.524702072143555, + "learning_rate": 4.380882500279944e-05, + "loss": 0.8172, + "step": 42020 + }, + { + "epoch": 0.37155890309234607, + "grad_norm": 3.5206210613250732, + "learning_rate": 4.380735161512757e-05, + "loss": 0.6482, + "step": 42030 + }, + { + "epoch": 0.3716473063526583, + "grad_norm": 1.3513615131378174, + "learning_rate": 4.38058782274557e-05, + "loss": 0.8467, + "step": 42040 + }, + { + "epoch": 0.37173570961297053, + "grad_norm": 2.57810640335083, + "learning_rate": 4.3804404839783824e-05, + "loss": 0.7536, + "step": 42050 + }, + { + "epoch": 0.37182411287328276, + "grad_norm": 10.607290267944336, + "learning_rate": 4.380293145211196e-05, + "loss": 0.775, + "step": 42060 + }, + { + "epoch": 0.371912516133595, + "grad_norm": 8.072117805480957, + "learning_rate": 4.380145806444009e-05, + "loss": 0.7045, + "step": 42070 + }, + { + "epoch": 0.3720009193939072, + "grad_norm": 4.225197792053223, + "learning_rate": 4.3799984676768216e-05, + "loss": 0.6823, + "step": 42080 + }, + { + "epoch": 0.3720893226542195, + "grad_norm": 5.244751930236816, + "learning_rate": 4.3798511289096344e-05, + "loss": 0.7275, + "step": 42090 + }, + { + "epoch": 0.37217772591453174, + "grad_norm": 1.7465792894363403, + "learning_rate": 4.379703790142448e-05, + "loss": 0.5684, + "step": 42100 + }, + { + "epoch": 0.372266129174844, + "grad_norm": 6.863415241241455, + "learning_rate": 4.37955645137526e-05, + "loss": 0.6789, + "step": 42110 + }, + { + "epoch": 0.3723545324351562, + "grad_norm": 1.4748578071594238, + "learning_rate": 4.3794091126080736e-05, + "loss": 0.8776, + "step": 42120 + }, + { + "epoch": 0.37244293569546844, + "grad_norm": 8.206558227539062, + "learning_rate": 4.379261773840886e-05, + "loss": 0.6086, + "step": 42130 + }, + { + "epoch": 0.37253133895578067, + "grad_norm": 4.347940921783447, + "learning_rate": 4.379114435073699e-05, + "loss": 0.8139, + "step": 42140 + }, + { + "epoch": 0.37261974221609295, + "grad_norm": 2.5444750785827637, + "learning_rate": 4.378967096306512e-05, + "loss": 0.6933, + "step": 42150 + }, + { + "epoch": 0.3727081454764052, + "grad_norm": 3.0841615200042725, + "learning_rate": 4.378819757539325e-05, + "loss": 0.6849, + "step": 42160 + }, + { + "epoch": 0.3727965487367174, + "grad_norm": 11.621058464050293, + "learning_rate": 4.378672418772138e-05, + "loss": 0.7994, + "step": 42170 + }, + { + "epoch": 0.37288495199702965, + "grad_norm": 7.347725868225098, + "learning_rate": 4.378525080004951e-05, + "loss": 0.6785, + "step": 42180 + }, + { + "epoch": 0.3729733552573419, + "grad_norm": 4.26618766784668, + "learning_rate": 4.3783777412377634e-05, + "loss": 0.7254, + "step": 42190 + }, + { + "epoch": 0.3730617585176541, + "grad_norm": 5.139187812805176, + "learning_rate": 4.378230402470577e-05, + "loss": 0.6932, + "step": 42200 + }, + { + "epoch": 0.3731501617779664, + "grad_norm": 2.561830759048462, + "learning_rate": 4.37808306370339e-05, + "loss": 0.7827, + "step": 42210 + }, + { + "epoch": 0.3732385650382786, + "grad_norm": 8.85420036315918, + "learning_rate": 4.3779357249362026e-05, + "loss": 0.7687, + "step": 42220 + }, + { + "epoch": 0.37332696829859086, + "grad_norm": 3.529747486114502, + "learning_rate": 4.3777883861690154e-05, + "loss": 0.6549, + "step": 42230 + }, + { + "epoch": 0.3734153715589031, + "grad_norm": 3.643594741821289, + "learning_rate": 4.377641047401828e-05, + "loss": 0.8189, + "step": 42240 + }, + { + "epoch": 0.3735037748192153, + "grad_norm": 8.124227523803711, + "learning_rate": 4.377493708634641e-05, + "loss": 0.7284, + "step": 42250 + }, + { + "epoch": 0.37359217807952755, + "grad_norm": 6.099574565887451, + "learning_rate": 4.3773463698674546e-05, + "loss": 0.8366, + "step": 42260 + }, + { + "epoch": 0.37368058133983983, + "grad_norm": 1.4467730522155762, + "learning_rate": 4.377199031100267e-05, + "loss": 0.6122, + "step": 42270 + }, + { + "epoch": 0.37376898460015207, + "grad_norm": 3.4916584491729736, + "learning_rate": 4.37705169233308e-05, + "loss": 0.6087, + "step": 42280 + }, + { + "epoch": 0.3738573878604643, + "grad_norm": 8.590494155883789, + "learning_rate": 4.376904353565893e-05, + "loss": 0.7862, + "step": 42290 + }, + { + "epoch": 0.3739457911207765, + "grad_norm": 1.360739827156067, + "learning_rate": 4.376757014798706e-05, + "loss": 0.6658, + "step": 42300 + }, + { + "epoch": 0.37403419438108876, + "grad_norm": 8.499696731567383, + "learning_rate": 4.376609676031519e-05, + "loss": 0.6978, + "step": 42310 + }, + { + "epoch": 0.374122597641401, + "grad_norm": 4.420283794403076, + "learning_rate": 4.376462337264332e-05, + "loss": 0.7048, + "step": 42320 + }, + { + "epoch": 0.3742110009017133, + "grad_norm": 7.824456214904785, + "learning_rate": 4.3763149984971445e-05, + "loss": 0.6504, + "step": 42330 + }, + { + "epoch": 0.3742994041620255, + "grad_norm": 8.62056827545166, + "learning_rate": 4.376167659729958e-05, + "loss": 0.9202, + "step": 42340 + }, + { + "epoch": 0.37438780742233774, + "grad_norm": 1.988431453704834, + "learning_rate": 4.376020320962771e-05, + "loss": 0.6392, + "step": 42350 + }, + { + "epoch": 0.37447621068264997, + "grad_norm": 2.344813346862793, + "learning_rate": 4.3758729821955837e-05, + "loss": 0.6795, + "step": 42360 + }, + { + "epoch": 0.3745646139429622, + "grad_norm": 3.136532783508301, + "learning_rate": 4.3757256434283965e-05, + "loss": 0.796, + "step": 42370 + }, + { + "epoch": 0.37465301720327443, + "grad_norm": 2.549125909805298, + "learning_rate": 4.375578304661209e-05, + "loss": 0.6904, + "step": 42380 + }, + { + "epoch": 0.3747414204635867, + "grad_norm": 6.721037864685059, + "learning_rate": 4.375430965894022e-05, + "loss": 0.6246, + "step": 42390 + }, + { + "epoch": 0.37482982372389895, + "grad_norm": 13.33173942565918, + "learning_rate": 4.375283627126836e-05, + "loss": 0.6006, + "step": 42400 + }, + { + "epoch": 0.3749182269842112, + "grad_norm": 2.099210739135742, + "learning_rate": 4.375136288359648e-05, + "loss": 0.7107, + "step": 42410 + }, + { + "epoch": 0.3750066302445234, + "grad_norm": 7.2005743980407715, + "learning_rate": 4.3749889495924613e-05, + "loss": 0.8144, + "step": 42420 + }, + { + "epoch": 0.37509503350483564, + "grad_norm": 1.8392319679260254, + "learning_rate": 4.374841610825274e-05, + "loss": 0.7128, + "step": 42430 + }, + { + "epoch": 0.3751834367651479, + "grad_norm": 2.1908111572265625, + "learning_rate": 4.374694272058087e-05, + "loss": 0.7567, + "step": 42440 + }, + { + "epoch": 0.37527184002546016, + "grad_norm": 1.7751134634017944, + "learning_rate": 4.3745469332909e-05, + "loss": 0.5544, + "step": 42450 + }, + { + "epoch": 0.3753602432857724, + "grad_norm": 8.485166549682617, + "learning_rate": 4.3743995945237134e-05, + "loss": 0.7592, + "step": 42460 + }, + { + "epoch": 0.3754486465460846, + "grad_norm": 6.904178142547607, + "learning_rate": 4.3742522557565255e-05, + "loss": 0.7997, + "step": 42470 + }, + { + "epoch": 0.37553704980639685, + "grad_norm": 8.167841911315918, + "learning_rate": 4.374104916989339e-05, + "loss": 0.8538, + "step": 42480 + }, + { + "epoch": 0.3756254530667091, + "grad_norm": 4.086103439331055, + "learning_rate": 4.373957578222151e-05, + "loss": 0.7311, + "step": 42490 + }, + { + "epoch": 0.37571385632702137, + "grad_norm": 4.62849760055542, + "learning_rate": 4.373810239454965e-05, + "loss": 0.5907, + "step": 42500 + }, + { + "epoch": 0.3758022595873336, + "grad_norm": 2.8434784412384033, + "learning_rate": 4.3736629006877775e-05, + "loss": 0.6638, + "step": 42510 + }, + { + "epoch": 0.37589066284764583, + "grad_norm": 1.8806405067443848, + "learning_rate": 4.3735155619205904e-05, + "loss": 0.772, + "step": 42520 + }, + { + "epoch": 0.37597906610795806, + "grad_norm": 4.3698506355285645, + "learning_rate": 4.373368223153403e-05, + "loss": 0.8148, + "step": 42530 + }, + { + "epoch": 0.3760674693682703, + "grad_norm": 7.582542896270752, + "learning_rate": 4.373220884386217e-05, + "loss": 0.7863, + "step": 42540 + }, + { + "epoch": 0.3761558726285825, + "grad_norm": 1.7512024641036987, + "learning_rate": 4.373073545619029e-05, + "loss": 0.7152, + "step": 42550 + }, + { + "epoch": 0.3762442758888948, + "grad_norm": 4.376026153564453, + "learning_rate": 4.3729262068518424e-05, + "loss": 0.7138, + "step": 42560 + }, + { + "epoch": 0.37633267914920704, + "grad_norm": 3.759315252304077, + "learning_rate": 4.372778868084655e-05, + "loss": 0.7308, + "step": 42570 + }, + { + "epoch": 0.37642108240951927, + "grad_norm": 6.476296424865723, + "learning_rate": 4.372631529317468e-05, + "loss": 0.624, + "step": 42580 + }, + { + "epoch": 0.3765094856698315, + "grad_norm": 6.2172532081604, + "learning_rate": 4.372484190550281e-05, + "loss": 0.7558, + "step": 42590 + }, + { + "epoch": 0.37659788893014373, + "grad_norm": 4.6294074058532715, + "learning_rate": 4.372336851783094e-05, + "loss": 0.8166, + "step": 42600 + }, + { + "epoch": 0.37668629219045596, + "grad_norm": 5.101017951965332, + "learning_rate": 4.3721895130159066e-05, + "loss": 0.6992, + "step": 42610 + }, + { + "epoch": 0.37677469545076825, + "grad_norm": 3.605889320373535, + "learning_rate": 4.37204217424872e-05, + "loss": 0.7652, + "step": 42620 + }, + { + "epoch": 0.3768630987110805, + "grad_norm": 17.48322868347168, + "learning_rate": 4.371894835481532e-05, + "loss": 0.7787, + "step": 42630 + }, + { + "epoch": 0.3769515019713927, + "grad_norm": 2.6086907386779785, + "learning_rate": 4.371747496714346e-05, + "loss": 0.7104, + "step": 42640 + }, + { + "epoch": 0.37703990523170494, + "grad_norm": 5.691768169403076, + "learning_rate": 4.3716001579471586e-05, + "loss": 0.8355, + "step": 42650 + }, + { + "epoch": 0.3771283084920172, + "grad_norm": 3.5036377906799316, + "learning_rate": 4.3714528191799714e-05, + "loss": 0.7362, + "step": 42660 + }, + { + "epoch": 0.3772167117523294, + "grad_norm": 7.328939914703369, + "learning_rate": 4.371305480412784e-05, + "loss": 0.7096, + "step": 42670 + }, + { + "epoch": 0.3773051150126417, + "grad_norm": 1.5260515213012695, + "learning_rate": 4.371158141645598e-05, + "loss": 0.6667, + "step": 42680 + }, + { + "epoch": 0.3773935182729539, + "grad_norm": 9.549139976501465, + "learning_rate": 4.37101080287841e-05, + "loss": 0.7302, + "step": 42690 + }, + { + "epoch": 0.37748192153326615, + "grad_norm": 6.921221733093262, + "learning_rate": 4.3708634641112234e-05, + "loss": 0.6683, + "step": 42700 + }, + { + "epoch": 0.3775703247935784, + "grad_norm": 1.72034752368927, + "learning_rate": 4.370716125344036e-05, + "loss": 0.6454, + "step": 42710 + }, + { + "epoch": 0.3776587280538906, + "grad_norm": 2.3333041667938232, + "learning_rate": 4.370568786576849e-05, + "loss": 0.627, + "step": 42720 + }, + { + "epoch": 0.37774713131420284, + "grad_norm": 2.242698907852173, + "learning_rate": 4.370421447809662e-05, + "loss": 0.811, + "step": 42730 + }, + { + "epoch": 0.37783553457451513, + "grad_norm": 2.7384488582611084, + "learning_rate": 4.370274109042475e-05, + "loss": 0.787, + "step": 42740 + }, + { + "epoch": 0.37792393783482736, + "grad_norm": 2.396437644958496, + "learning_rate": 4.3701267702752876e-05, + "loss": 0.7265, + "step": 42750 + }, + { + "epoch": 0.3780123410951396, + "grad_norm": 2.599555730819702, + "learning_rate": 4.369979431508101e-05, + "loss": 0.6787, + "step": 42760 + }, + { + "epoch": 0.3781007443554518, + "grad_norm": 9.896321296691895, + "learning_rate": 4.369832092740914e-05, + "loss": 0.74, + "step": 42770 + }, + { + "epoch": 0.37818914761576405, + "grad_norm": 4.279694080352783, + "learning_rate": 4.369684753973727e-05, + "loss": 0.896, + "step": 42780 + }, + { + "epoch": 0.3782775508760763, + "grad_norm": 4.9580078125, + "learning_rate": 4.3695374152065396e-05, + "loss": 0.7434, + "step": 42790 + }, + { + "epoch": 0.37836595413638857, + "grad_norm": 5.293226718902588, + "learning_rate": 4.3693900764393525e-05, + "loss": 0.6834, + "step": 42800 + }, + { + "epoch": 0.3784543573967008, + "grad_norm": 1.704933762550354, + "learning_rate": 4.369242737672165e-05, + "loss": 0.7799, + "step": 42810 + }, + { + "epoch": 0.37854276065701303, + "grad_norm": 10.47739315032959, + "learning_rate": 4.369095398904979e-05, + "loss": 0.6782, + "step": 42820 + }, + { + "epoch": 0.37863116391732526, + "grad_norm": 5.8227996826171875, + "learning_rate": 4.3689480601377917e-05, + "loss": 0.5866, + "step": 42830 + }, + { + "epoch": 0.3787195671776375, + "grad_norm": 3.2358522415161133, + "learning_rate": 4.3688007213706045e-05, + "loss": 0.7726, + "step": 42840 + }, + { + "epoch": 0.3788079704379497, + "grad_norm": 2.994148015975952, + "learning_rate": 4.368653382603417e-05, + "loss": 0.6556, + "step": 42850 + }, + { + "epoch": 0.378896373698262, + "grad_norm": 4.2836785316467285, + "learning_rate": 4.36850604383623e-05, + "loss": 0.7697, + "step": 42860 + }, + { + "epoch": 0.37898477695857424, + "grad_norm": 5.644418239593506, + "learning_rate": 4.368358705069043e-05, + "loss": 0.6793, + "step": 42870 + }, + { + "epoch": 0.3790731802188865, + "grad_norm": 2.221799612045288, + "learning_rate": 4.368211366301856e-05, + "loss": 0.6692, + "step": 42880 + }, + { + "epoch": 0.3791615834791987, + "grad_norm": 14.580803871154785, + "learning_rate": 4.3680640275346693e-05, + "loss": 0.8032, + "step": 42890 + }, + { + "epoch": 0.37924998673951094, + "grad_norm": 5.326828956604004, + "learning_rate": 4.367916688767482e-05, + "loss": 0.7324, + "step": 42900 + }, + { + "epoch": 0.37933838999982317, + "grad_norm": 1.2381949424743652, + "learning_rate": 4.367769350000295e-05, + "loss": 0.616, + "step": 42910 + }, + { + "epoch": 0.37942679326013545, + "grad_norm": 3.5227928161621094, + "learning_rate": 4.367622011233108e-05, + "loss": 0.7771, + "step": 42920 + }, + { + "epoch": 0.3795151965204477, + "grad_norm": 6.06992769241333, + "learning_rate": 4.367474672465921e-05, + "loss": 0.6412, + "step": 42930 + }, + { + "epoch": 0.3796035997807599, + "grad_norm": 2.6467907428741455, + "learning_rate": 4.3673273336987335e-05, + "loss": 0.7874, + "step": 42940 + }, + { + "epoch": 0.37969200304107215, + "grad_norm": 9.846500396728516, + "learning_rate": 4.367179994931547e-05, + "loss": 0.7751, + "step": 42950 + }, + { + "epoch": 0.3797804063013844, + "grad_norm": 5.769461154937744, + "learning_rate": 4.367032656164359e-05, + "loss": 0.8085, + "step": 42960 + }, + { + "epoch": 0.37986880956169666, + "grad_norm": 2.9866957664489746, + "learning_rate": 4.366885317397173e-05, + "loss": 0.7827, + "step": 42970 + }, + { + "epoch": 0.3799572128220089, + "grad_norm": 2.3471596240997314, + "learning_rate": 4.3667379786299855e-05, + "loss": 0.6464, + "step": 42980 + }, + { + "epoch": 0.3800456160823211, + "grad_norm": 3.230699300765991, + "learning_rate": 4.3665906398627984e-05, + "loss": 0.798, + "step": 42990 + }, + { + "epoch": 0.38013401934263336, + "grad_norm": 4.558084011077881, + "learning_rate": 4.366443301095611e-05, + "loss": 0.8389, + "step": 43000 + }, + { + "epoch": 0.3802224226029456, + "grad_norm": 3.927190065383911, + "learning_rate": 4.366295962328425e-05, + "loss": 0.7066, + "step": 43010 + }, + { + "epoch": 0.3803108258632578, + "grad_norm": 6.640660762786865, + "learning_rate": 4.366148623561237e-05, + "loss": 0.8037, + "step": 43020 + }, + { + "epoch": 0.3803992291235701, + "grad_norm": 1.7032018899917603, + "learning_rate": 4.3660012847940504e-05, + "loss": 0.8016, + "step": 43030 + }, + { + "epoch": 0.38048763238388233, + "grad_norm": 3.529633045196533, + "learning_rate": 4.365853946026863e-05, + "loss": 0.8302, + "step": 43040 + }, + { + "epoch": 0.38057603564419457, + "grad_norm": 8.706425666809082, + "learning_rate": 4.365706607259676e-05, + "loss": 0.7552, + "step": 43050 + }, + { + "epoch": 0.3806644389045068, + "grad_norm": 6.448954105377197, + "learning_rate": 4.365559268492489e-05, + "loss": 0.7492, + "step": 43060 + }, + { + "epoch": 0.380752842164819, + "grad_norm": 2.459713935852051, + "learning_rate": 4.365411929725302e-05, + "loss": 0.7489, + "step": 43070 + }, + { + "epoch": 0.38084124542513126, + "grad_norm": 2.4048988819122314, + "learning_rate": 4.3652645909581146e-05, + "loss": 0.7565, + "step": 43080 + }, + { + "epoch": 0.38092964868544354, + "grad_norm": 8.155169486999512, + "learning_rate": 4.365117252190928e-05, + "loss": 0.7606, + "step": 43090 + }, + { + "epoch": 0.3810180519457558, + "grad_norm": 3.0469725131988525, + "learning_rate": 4.36496991342374e-05, + "loss": 0.821, + "step": 43100 + }, + { + "epoch": 0.381106455206068, + "grad_norm": 4.180539608001709, + "learning_rate": 4.364822574656554e-05, + "loss": 0.6277, + "step": 43110 + }, + { + "epoch": 0.38119485846638024, + "grad_norm": 3.0767343044281006, + "learning_rate": 4.3646752358893666e-05, + "loss": 0.8115, + "step": 43120 + }, + { + "epoch": 0.38128326172669247, + "grad_norm": 3.867610216140747, + "learning_rate": 4.3645278971221794e-05, + "loss": 0.5614, + "step": 43130 + }, + { + "epoch": 0.3813716649870047, + "grad_norm": 3.3509323596954346, + "learning_rate": 4.364380558354992e-05, + "loss": 0.7551, + "step": 43140 + }, + { + "epoch": 0.381460068247317, + "grad_norm": 2.767132043838501, + "learning_rate": 4.364233219587806e-05, + "loss": 0.7499, + "step": 43150 + }, + { + "epoch": 0.3815484715076292, + "grad_norm": 4.620316505432129, + "learning_rate": 4.364085880820618e-05, + "loss": 0.6625, + "step": 43160 + }, + { + "epoch": 0.38163687476794145, + "grad_norm": 5.942996501922607, + "learning_rate": 4.3639385420534315e-05, + "loss": 0.6448, + "step": 43170 + }, + { + "epoch": 0.3817252780282537, + "grad_norm": 7.085754871368408, + "learning_rate": 4.3637912032862436e-05, + "loss": 0.7338, + "step": 43180 + }, + { + "epoch": 0.3818136812885659, + "grad_norm": 1.9270126819610596, + "learning_rate": 4.363643864519057e-05, + "loss": 0.7296, + "step": 43190 + }, + { + "epoch": 0.38190208454887814, + "grad_norm": 4.797279357910156, + "learning_rate": 4.36349652575187e-05, + "loss": 0.7614, + "step": 43200 + }, + { + "epoch": 0.3819904878091904, + "grad_norm": 4.248345851898193, + "learning_rate": 4.363349186984683e-05, + "loss": 0.7257, + "step": 43210 + }, + { + "epoch": 0.38207889106950266, + "grad_norm": 4.137569427490234, + "learning_rate": 4.3632018482174956e-05, + "loss": 0.6503, + "step": 43220 + }, + { + "epoch": 0.3821672943298149, + "grad_norm": 2.1250267028808594, + "learning_rate": 4.363054509450309e-05, + "loss": 0.7874, + "step": 43230 + }, + { + "epoch": 0.3822556975901271, + "grad_norm": 2.3361072540283203, + "learning_rate": 4.362907170683121e-05, + "loss": 0.7408, + "step": 43240 + }, + { + "epoch": 0.38234410085043935, + "grad_norm": 10.690939903259277, + "learning_rate": 4.362759831915935e-05, + "loss": 0.8218, + "step": 43250 + }, + { + "epoch": 0.3824325041107516, + "grad_norm": 5.73743200302124, + "learning_rate": 4.3626124931487476e-05, + "loss": 0.8849, + "step": 43260 + }, + { + "epoch": 0.38252090737106387, + "grad_norm": 8.772205352783203, + "learning_rate": 4.3624651543815605e-05, + "loss": 0.7937, + "step": 43270 + }, + { + "epoch": 0.3826093106313761, + "grad_norm": 5.444756031036377, + "learning_rate": 4.362317815614373e-05, + "loss": 0.7493, + "step": 43280 + }, + { + "epoch": 0.38269771389168833, + "grad_norm": 2.365267038345337, + "learning_rate": 4.362170476847187e-05, + "loss": 0.8066, + "step": 43290 + }, + { + "epoch": 0.38278611715200056, + "grad_norm": 5.630627632141113, + "learning_rate": 4.362023138079999e-05, + "loss": 0.7531, + "step": 43300 + }, + { + "epoch": 0.3828745204123128, + "grad_norm": 3.3397905826568604, + "learning_rate": 4.3618757993128125e-05, + "loss": 0.7223, + "step": 43310 + }, + { + "epoch": 0.382962923672625, + "grad_norm": 7.111069202423096, + "learning_rate": 4.3617284605456247e-05, + "loss": 0.6533, + "step": 43320 + }, + { + "epoch": 0.3830513269329373, + "grad_norm": 2.439441680908203, + "learning_rate": 4.361581121778438e-05, + "loss": 0.7964, + "step": 43330 + }, + { + "epoch": 0.38313973019324954, + "grad_norm": 5.555276870727539, + "learning_rate": 4.361433783011251e-05, + "loss": 0.6576, + "step": 43340 + }, + { + "epoch": 0.38322813345356177, + "grad_norm": 8.322966575622559, + "learning_rate": 4.361286444244064e-05, + "loss": 0.7967, + "step": 43350 + }, + { + "epoch": 0.383316536713874, + "grad_norm": 15.328668594360352, + "learning_rate": 4.361139105476877e-05, + "loss": 0.6977, + "step": 43360 + }, + { + "epoch": 0.38340493997418623, + "grad_norm": 9.367509841918945, + "learning_rate": 4.36099176670969e-05, + "loss": 0.7085, + "step": 43370 + }, + { + "epoch": 0.38349334323449846, + "grad_norm": 8.669416427612305, + "learning_rate": 4.3608444279425023e-05, + "loss": 0.7451, + "step": 43380 + }, + { + "epoch": 0.38358174649481075, + "grad_norm": 3.248068332672119, + "learning_rate": 4.360697089175316e-05, + "loss": 0.6189, + "step": 43390 + }, + { + "epoch": 0.383670149755123, + "grad_norm": 9.352893829345703, + "learning_rate": 4.360549750408129e-05, + "loss": 0.7816, + "step": 43400 + }, + { + "epoch": 0.3837585530154352, + "grad_norm": 3.005835771560669, + "learning_rate": 4.3604024116409415e-05, + "loss": 0.8267, + "step": 43410 + }, + { + "epoch": 0.38384695627574744, + "grad_norm": 2.432826042175293, + "learning_rate": 4.3602550728737544e-05, + "loss": 0.7032, + "step": 43420 + }, + { + "epoch": 0.3839353595360597, + "grad_norm": 2.6174306869506836, + "learning_rate": 4.360107734106567e-05, + "loss": 0.7607, + "step": 43430 + }, + { + "epoch": 0.3840237627963719, + "grad_norm": 2.715211868286133, + "learning_rate": 4.35996039533938e-05, + "loss": 0.7467, + "step": 43440 + }, + { + "epoch": 0.3841121660566842, + "grad_norm": 10.05677318572998, + "learning_rate": 4.3598130565721936e-05, + "loss": 0.7146, + "step": 43450 + }, + { + "epoch": 0.3842005693169964, + "grad_norm": 2.7749428749084473, + "learning_rate": 4.359665717805006e-05, + "loss": 0.7512, + "step": 43460 + }, + { + "epoch": 0.38428897257730865, + "grad_norm": 4.629761219024658, + "learning_rate": 4.359518379037819e-05, + "loss": 0.7493, + "step": 43470 + }, + { + "epoch": 0.3843773758376209, + "grad_norm": 3.2584049701690674, + "learning_rate": 4.359371040270632e-05, + "loss": 0.6809, + "step": 43480 + }, + { + "epoch": 0.3844657790979331, + "grad_norm": 4.336820125579834, + "learning_rate": 4.359223701503445e-05, + "loss": 0.731, + "step": 43490 + }, + { + "epoch": 0.38455418235824534, + "grad_norm": 5.319371700286865, + "learning_rate": 4.359076362736258e-05, + "loss": 0.675, + "step": 43500 + }, + { + "epoch": 0.38464258561855763, + "grad_norm": 3.8213205337524414, + "learning_rate": 4.358929023969071e-05, + "loss": 0.7017, + "step": 43510 + }, + { + "epoch": 0.38473098887886986, + "grad_norm": 3.5950369834899902, + "learning_rate": 4.3587816852018834e-05, + "loss": 0.7241, + "step": 43520 + }, + { + "epoch": 0.3848193921391821, + "grad_norm": 4.100440502166748, + "learning_rate": 4.358634346434697e-05, + "loss": 0.6119, + "step": 43530 + }, + { + "epoch": 0.3849077953994943, + "grad_norm": 4.950203895568848, + "learning_rate": 4.358487007667509e-05, + "loss": 0.6241, + "step": 43540 + }, + { + "epoch": 0.38499619865980655, + "grad_norm": 2.266775369644165, + "learning_rate": 4.3583396689003226e-05, + "loss": 0.6648, + "step": 43550 + }, + { + "epoch": 0.38508460192011884, + "grad_norm": 3.357187032699585, + "learning_rate": 4.3581923301331354e-05, + "loss": 0.6899, + "step": 43560 + }, + { + "epoch": 0.38517300518043107, + "grad_norm": 3.2378625869750977, + "learning_rate": 4.358044991365948e-05, + "loss": 0.7301, + "step": 43570 + }, + { + "epoch": 0.3852614084407433, + "grad_norm": 3.4043920040130615, + "learning_rate": 4.357897652598761e-05, + "loss": 0.6919, + "step": 43580 + }, + { + "epoch": 0.38534981170105553, + "grad_norm": 9.793386459350586, + "learning_rate": 4.3577503138315746e-05, + "loss": 0.7642, + "step": 43590 + }, + { + "epoch": 0.38543821496136776, + "grad_norm": 4.7564311027526855, + "learning_rate": 4.357602975064387e-05, + "loss": 0.7277, + "step": 43600 + }, + { + "epoch": 0.38552661822168, + "grad_norm": 10.041168212890625, + "learning_rate": 4.3574556362972e-05, + "loss": 0.6197, + "step": 43610 + }, + { + "epoch": 0.3856150214819923, + "grad_norm": 2.1328935623168945, + "learning_rate": 4.357308297530013e-05, + "loss": 0.723, + "step": 43620 + }, + { + "epoch": 0.3857034247423045, + "grad_norm": 4.792178630828857, + "learning_rate": 4.357160958762826e-05, + "loss": 0.7592, + "step": 43630 + }, + { + "epoch": 0.38579182800261674, + "grad_norm": 7.255916595458984, + "learning_rate": 4.357013619995639e-05, + "loss": 0.7721, + "step": 43640 + }, + { + "epoch": 0.385880231262929, + "grad_norm": 2.069598436355591, + "learning_rate": 4.3568662812284516e-05, + "loss": 0.7243, + "step": 43650 + }, + { + "epoch": 0.3859686345232412, + "grad_norm": 4.392913818359375, + "learning_rate": 4.3567189424612645e-05, + "loss": 0.7054, + "step": 43660 + }, + { + "epoch": 0.38605703778355344, + "grad_norm": 14.540793418884277, + "learning_rate": 4.356571603694078e-05, + "loss": 0.6862, + "step": 43670 + }, + { + "epoch": 0.3861454410438657, + "grad_norm": 8.115978240966797, + "learning_rate": 4.356424264926891e-05, + "loss": 0.7095, + "step": 43680 + }, + { + "epoch": 0.38623384430417795, + "grad_norm": 5.372908592224121, + "learning_rate": 4.3562769261597036e-05, + "loss": 0.6752, + "step": 43690 + }, + { + "epoch": 0.3863222475644902, + "grad_norm": 2.51652455329895, + "learning_rate": 4.3561295873925165e-05, + "loss": 0.7102, + "step": 43700 + }, + { + "epoch": 0.3864106508248024, + "grad_norm": 3.7599122524261475, + "learning_rate": 4.355982248625329e-05, + "loss": 0.6836, + "step": 43710 + }, + { + "epoch": 0.38649905408511465, + "grad_norm": 6.177261829376221, + "learning_rate": 4.355834909858142e-05, + "loss": 0.792, + "step": 43720 + }, + { + "epoch": 0.3865874573454269, + "grad_norm": 4.799642086029053, + "learning_rate": 4.3556875710909557e-05, + "loss": 0.6688, + "step": 43730 + }, + { + "epoch": 0.38667586060573916, + "grad_norm": 2.5385894775390625, + "learning_rate": 4.3555402323237685e-05, + "loss": 0.6201, + "step": 43740 + }, + { + "epoch": 0.3867642638660514, + "grad_norm": 1.4727774858474731, + "learning_rate": 4.355392893556581e-05, + "loss": 0.6661, + "step": 43750 + }, + { + "epoch": 0.3868526671263636, + "grad_norm": 4.139338970184326, + "learning_rate": 4.355245554789394e-05, + "loss": 0.6879, + "step": 43760 + }, + { + "epoch": 0.38694107038667586, + "grad_norm": 3.651340961456299, + "learning_rate": 4.355098216022207e-05, + "loss": 0.74, + "step": 43770 + }, + { + "epoch": 0.3870294736469881, + "grad_norm": 4.973133563995361, + "learning_rate": 4.35495087725502e-05, + "loss": 0.6589, + "step": 43780 + }, + { + "epoch": 0.3871178769073003, + "grad_norm": 9.292289733886719, + "learning_rate": 4.354803538487833e-05, + "loss": 0.7245, + "step": 43790 + }, + { + "epoch": 0.3872062801676126, + "grad_norm": 2.549406051635742, + "learning_rate": 4.354656199720646e-05, + "loss": 0.7695, + "step": 43800 + }, + { + "epoch": 0.38729468342792484, + "grad_norm": 7.283100128173828, + "learning_rate": 4.354508860953459e-05, + "loss": 0.6357, + "step": 43810 + }, + { + "epoch": 0.38738308668823707, + "grad_norm": 7.145496845245361, + "learning_rate": 4.354361522186272e-05, + "loss": 0.6583, + "step": 43820 + }, + { + "epoch": 0.3874714899485493, + "grad_norm": 3.2963976860046387, + "learning_rate": 4.354214183419085e-05, + "loss": 0.712, + "step": 43830 + }, + { + "epoch": 0.38755989320886153, + "grad_norm": 9.30205249786377, + "learning_rate": 4.3540668446518975e-05, + "loss": 0.749, + "step": 43840 + }, + { + "epoch": 0.38764829646917376, + "grad_norm": 4.466895580291748, + "learning_rate": 4.3539195058847104e-05, + "loss": 0.7446, + "step": 43850 + }, + { + "epoch": 0.38773669972948605, + "grad_norm": 4.649153232574463, + "learning_rate": 4.353772167117524e-05, + "loss": 0.7192, + "step": 43860 + }, + { + "epoch": 0.3878251029897983, + "grad_norm": 3.718433380126953, + "learning_rate": 4.353624828350337e-05, + "loss": 0.7486, + "step": 43870 + }, + { + "epoch": 0.3879135062501105, + "grad_norm": 3.3186070919036865, + "learning_rate": 4.3534774895831495e-05, + "loss": 0.7655, + "step": 43880 + }, + { + "epoch": 0.38800190951042274, + "grad_norm": 6.101158142089844, + "learning_rate": 4.3533301508159624e-05, + "loss": 0.6282, + "step": 43890 + }, + { + "epoch": 0.38809031277073497, + "grad_norm": 7.869748115539551, + "learning_rate": 4.353182812048775e-05, + "loss": 0.7269, + "step": 43900 + }, + { + "epoch": 0.3881787160310472, + "grad_norm": 4.291072368621826, + "learning_rate": 4.353035473281588e-05, + "loss": 0.7121, + "step": 43910 + }, + { + "epoch": 0.3882671192913595, + "grad_norm": 2.806745767593384, + "learning_rate": 4.3528881345144016e-05, + "loss": 0.7336, + "step": 43920 + }, + { + "epoch": 0.3883555225516717, + "grad_norm": 1.7958487272262573, + "learning_rate": 4.352740795747214e-05, + "loss": 0.7401, + "step": 43930 + }, + { + "epoch": 0.38844392581198395, + "grad_norm": 2.4116592407226562, + "learning_rate": 4.352593456980027e-05, + "loss": 0.8089, + "step": 43940 + }, + { + "epoch": 0.3885323290722962, + "grad_norm": 2.7241897583007812, + "learning_rate": 4.35244611821284e-05, + "loss": 0.6696, + "step": 43950 + }, + { + "epoch": 0.3886207323326084, + "grad_norm": 3.5028350353240967, + "learning_rate": 4.352298779445653e-05, + "loss": 0.6476, + "step": 43960 + }, + { + "epoch": 0.38870913559292064, + "grad_norm": 8.729159355163574, + "learning_rate": 4.352151440678466e-05, + "loss": 0.6532, + "step": 43970 + }, + { + "epoch": 0.3887975388532329, + "grad_norm": 7.070501804351807, + "learning_rate": 4.352004101911279e-05, + "loss": 0.4986, + "step": 43980 + }, + { + "epoch": 0.38888594211354516, + "grad_norm": 3.0011374950408936, + "learning_rate": 4.3518567631440914e-05, + "loss": 0.7366, + "step": 43990 + }, + { + "epoch": 0.3889743453738574, + "grad_norm": 10.321503639221191, + "learning_rate": 4.351709424376905e-05, + "loss": 0.7788, + "step": 44000 + }, + { + "epoch": 0.3890627486341696, + "grad_norm": 5.201939582824707, + "learning_rate": 4.351562085609717e-05, + "loss": 0.757, + "step": 44010 + }, + { + "epoch": 0.38915115189448185, + "grad_norm": 1.737633466720581, + "learning_rate": 4.3514147468425306e-05, + "loss": 0.6608, + "step": 44020 + }, + { + "epoch": 0.3892395551547941, + "grad_norm": 2.7050435543060303, + "learning_rate": 4.3512674080753434e-05, + "loss": 0.8404, + "step": 44030 + }, + { + "epoch": 0.38932795841510637, + "grad_norm": 2.749333381652832, + "learning_rate": 4.351120069308156e-05, + "loss": 0.7292, + "step": 44040 + }, + { + "epoch": 0.3894163616754186, + "grad_norm": 1.5947707891464233, + "learning_rate": 4.350972730540969e-05, + "loss": 0.6128, + "step": 44050 + }, + { + "epoch": 0.38950476493573083, + "grad_norm": 4.752467155456543, + "learning_rate": 4.3508253917737826e-05, + "loss": 0.6352, + "step": 44060 + }, + { + "epoch": 0.38959316819604306, + "grad_norm": 3.5451290607452393, + "learning_rate": 4.350678053006595e-05, + "loss": 0.7254, + "step": 44070 + }, + { + "epoch": 0.3896815714563553, + "grad_norm": 3.0532515048980713, + "learning_rate": 4.350530714239408e-05, + "loss": 0.6065, + "step": 44080 + }, + { + "epoch": 0.3897699747166676, + "grad_norm": 4.595867156982422, + "learning_rate": 4.350383375472221e-05, + "loss": 0.721, + "step": 44090 + }, + { + "epoch": 0.3898583779769798, + "grad_norm": 2.1476407051086426, + "learning_rate": 4.350236036705034e-05, + "loss": 0.789, + "step": 44100 + }, + { + "epoch": 0.38994678123729204, + "grad_norm": 1.92362642288208, + "learning_rate": 4.350088697937847e-05, + "loss": 0.6836, + "step": 44110 + }, + { + "epoch": 0.39003518449760427, + "grad_norm": 1.0036829710006714, + "learning_rate": 4.3499413591706596e-05, + "loss": 0.6622, + "step": 44120 + }, + { + "epoch": 0.3901235877579165, + "grad_norm": 5.729055881500244, + "learning_rate": 4.3497940204034725e-05, + "loss": 0.632, + "step": 44130 + }, + { + "epoch": 0.39021199101822873, + "grad_norm": 2.0600526332855225, + "learning_rate": 4.349646681636286e-05, + "loss": 0.6647, + "step": 44140 + }, + { + "epoch": 0.390300394278541, + "grad_norm": 2.524857759475708, + "learning_rate": 4.349499342869098e-05, + "loss": 0.6827, + "step": 44150 + }, + { + "epoch": 0.39038879753885325, + "grad_norm": 5.2838544845581055, + "learning_rate": 4.3493520041019116e-05, + "loss": 0.7239, + "step": 44160 + }, + { + "epoch": 0.3904772007991655, + "grad_norm": 5.721658706665039, + "learning_rate": 4.3492046653347245e-05, + "loss": 0.6816, + "step": 44170 + }, + { + "epoch": 0.3905656040594777, + "grad_norm": 7.646422386169434, + "learning_rate": 4.349057326567537e-05, + "loss": 0.7988, + "step": 44180 + }, + { + "epoch": 0.39065400731978994, + "grad_norm": 3.5777852535247803, + "learning_rate": 4.34890998780035e-05, + "loss": 0.7122, + "step": 44190 + }, + { + "epoch": 0.3907424105801022, + "grad_norm": 3.9402763843536377, + "learning_rate": 4.3487626490331637e-05, + "loss": 0.7637, + "step": 44200 + }, + { + "epoch": 0.39083081384041446, + "grad_norm": 6.075642108917236, + "learning_rate": 4.348615310265976e-05, + "loss": 0.8593, + "step": 44210 + }, + { + "epoch": 0.3909192171007267, + "grad_norm": 1.5269951820373535, + "learning_rate": 4.348467971498789e-05, + "loss": 0.6379, + "step": 44220 + }, + { + "epoch": 0.3910076203610389, + "grad_norm": 5.038540840148926, + "learning_rate": 4.348320632731602e-05, + "loss": 0.7545, + "step": 44230 + }, + { + "epoch": 0.39109602362135115, + "grad_norm": 4.454661846160889, + "learning_rate": 4.348173293964415e-05, + "loss": 0.6046, + "step": 44240 + }, + { + "epoch": 0.3911844268816634, + "grad_norm": 3.3446669578552246, + "learning_rate": 4.348025955197228e-05, + "loss": 0.7091, + "step": 44250 + }, + { + "epoch": 0.3912728301419756, + "grad_norm": 4.3299031257629395, + "learning_rate": 4.347878616430041e-05, + "loss": 0.7568, + "step": 44260 + }, + { + "epoch": 0.3913612334022879, + "grad_norm": 2.940662145614624, + "learning_rate": 4.3477312776628535e-05, + "loss": 0.8204, + "step": 44270 + }, + { + "epoch": 0.39144963666260013, + "grad_norm": 2.40651535987854, + "learning_rate": 4.347583938895667e-05, + "loss": 0.7732, + "step": 44280 + }, + { + "epoch": 0.39153803992291236, + "grad_norm": 3.4223484992980957, + "learning_rate": 4.347436600128479e-05, + "loss": 0.6626, + "step": 44290 + }, + { + "epoch": 0.3916264431832246, + "grad_norm": 2.267432928085327, + "learning_rate": 4.347289261361293e-05, + "loss": 0.6695, + "step": 44300 + }, + { + "epoch": 0.3917148464435368, + "grad_norm": 5.355408191680908, + "learning_rate": 4.3471419225941055e-05, + "loss": 0.8069, + "step": 44310 + }, + { + "epoch": 0.39180324970384905, + "grad_norm": 3.348864793777466, + "learning_rate": 4.3469945838269184e-05, + "loss": 0.8162, + "step": 44320 + }, + { + "epoch": 0.39189165296416134, + "grad_norm": 2.9551172256469727, + "learning_rate": 4.346847245059731e-05, + "loss": 0.651, + "step": 44330 + }, + { + "epoch": 0.39198005622447357, + "grad_norm": 6.741861343383789, + "learning_rate": 4.346699906292545e-05, + "loss": 0.6796, + "step": 44340 + }, + { + "epoch": 0.3920684594847858, + "grad_norm": 7.677488803863525, + "learning_rate": 4.346552567525357e-05, + "loss": 0.8011, + "step": 44350 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.3081274032592773, + "learning_rate": 4.3464052287581704e-05, + "loss": 0.6788, + "step": 44360 + }, + { + "epoch": 0.39224526600541026, + "grad_norm": 2.5753400325775146, + "learning_rate": 4.3462578899909825e-05, + "loss": 0.8173, + "step": 44370 + }, + { + "epoch": 0.3923336692657225, + "grad_norm": 3.5336172580718994, + "learning_rate": 4.346110551223796e-05, + "loss": 0.6856, + "step": 44380 + }, + { + "epoch": 0.3924220725260348, + "grad_norm": 4.361479759216309, + "learning_rate": 4.345963212456609e-05, + "loss": 0.685, + "step": 44390 + }, + { + "epoch": 0.392510475786347, + "grad_norm": 1.9852592945098877, + "learning_rate": 4.345815873689422e-05, + "loss": 0.6808, + "step": 44400 + }, + { + "epoch": 0.39259887904665924, + "grad_norm": 1.8553427457809448, + "learning_rate": 4.3456685349222346e-05, + "loss": 0.7773, + "step": 44410 + }, + { + "epoch": 0.3926872823069715, + "grad_norm": 1.07711923122406, + "learning_rate": 4.345521196155048e-05, + "loss": 0.617, + "step": 44420 + }, + { + "epoch": 0.3927756855672837, + "grad_norm": 7.83463191986084, + "learning_rate": 4.34537385738786e-05, + "loss": 0.7796, + "step": 44430 + }, + { + "epoch": 0.39286408882759594, + "grad_norm": 2.078061819076538, + "learning_rate": 4.345226518620674e-05, + "loss": 0.6024, + "step": 44440 + }, + { + "epoch": 0.3929524920879082, + "grad_norm": 13.942878723144531, + "learning_rate": 4.3450791798534866e-05, + "loss": 0.7386, + "step": 44450 + }, + { + "epoch": 0.39304089534822045, + "grad_norm": 2.6606757640838623, + "learning_rate": 4.3449318410862994e-05, + "loss": 0.7623, + "step": 44460 + }, + { + "epoch": 0.3931292986085327, + "grad_norm": 5.481786727905273, + "learning_rate": 4.344784502319112e-05, + "loss": 0.6791, + "step": 44470 + }, + { + "epoch": 0.3932177018688449, + "grad_norm": 4.303702354431152, + "learning_rate": 4.344637163551925e-05, + "loss": 0.5947, + "step": 44480 + }, + { + "epoch": 0.39330610512915715, + "grad_norm": 4.576141834259033, + "learning_rate": 4.344489824784738e-05, + "loss": 0.802, + "step": 44490 + }, + { + "epoch": 0.3933945083894694, + "grad_norm": 4.796335220336914, + "learning_rate": 4.3443424860175514e-05, + "loss": 0.7512, + "step": 44500 + }, + { + "epoch": 0.39348291164978166, + "grad_norm": 3.4886820316314697, + "learning_rate": 4.3441951472503636e-05, + "loss": 0.7189, + "step": 44510 + }, + { + "epoch": 0.3935713149100939, + "grad_norm": 3.4879636764526367, + "learning_rate": 4.344047808483177e-05, + "loss": 0.6584, + "step": 44520 + }, + { + "epoch": 0.3936597181704061, + "grad_norm": 7.3791303634643555, + "learning_rate": 4.34390046971599e-05, + "loss": 0.716, + "step": 44530 + }, + { + "epoch": 0.39374812143071836, + "grad_norm": 1.2782517671585083, + "learning_rate": 4.343753130948803e-05, + "loss": 0.6979, + "step": 44540 + }, + { + "epoch": 0.3938365246910306, + "grad_norm": 3.2520925998687744, + "learning_rate": 4.3436057921816156e-05, + "loss": 0.6334, + "step": 44550 + }, + { + "epoch": 0.3939249279513428, + "grad_norm": 3.488539218902588, + "learning_rate": 4.343458453414429e-05, + "loss": 0.7581, + "step": 44560 + }, + { + "epoch": 0.3940133312116551, + "grad_norm": 2.7524282932281494, + "learning_rate": 4.343311114647241e-05, + "loss": 0.7155, + "step": 44570 + }, + { + "epoch": 0.39410173447196734, + "grad_norm": 5.6065144538879395, + "learning_rate": 4.343163775880055e-05, + "loss": 0.8001, + "step": 44580 + }, + { + "epoch": 0.39419013773227957, + "grad_norm": 1.8697189092636108, + "learning_rate": 4.3430164371128676e-05, + "loss": 0.699, + "step": 44590 + }, + { + "epoch": 0.3942785409925918, + "grad_norm": 5.810731887817383, + "learning_rate": 4.3428690983456805e-05, + "loss": 0.73, + "step": 44600 + }, + { + "epoch": 0.39436694425290403, + "grad_norm": 9.312521934509277, + "learning_rate": 4.342721759578493e-05, + "loss": 0.688, + "step": 44610 + }, + { + "epoch": 0.3944553475132163, + "grad_norm": 2.037626266479492, + "learning_rate": 4.342574420811306e-05, + "loss": 0.823, + "step": 44620 + }, + { + "epoch": 0.39454375077352855, + "grad_norm": 1.7342125177383423, + "learning_rate": 4.342427082044119e-05, + "loss": 0.8827, + "step": 44630 + }, + { + "epoch": 0.3946321540338408, + "grad_norm": 1.786419153213501, + "learning_rate": 4.3422797432769325e-05, + "loss": 0.6922, + "step": 44640 + }, + { + "epoch": 0.394720557294153, + "grad_norm": 2.1118435859680176, + "learning_rate": 4.342132404509745e-05, + "loss": 0.6553, + "step": 44650 + }, + { + "epoch": 0.39480896055446524, + "grad_norm": 3.9189305305480957, + "learning_rate": 4.341985065742558e-05, + "loss": 0.7709, + "step": 44660 + }, + { + "epoch": 0.39489736381477747, + "grad_norm": 5.795950412750244, + "learning_rate": 4.341837726975371e-05, + "loss": 0.7495, + "step": 44670 + }, + { + "epoch": 0.39498576707508976, + "grad_norm": 4.017739295959473, + "learning_rate": 4.341690388208184e-05, + "loss": 0.8069, + "step": 44680 + }, + { + "epoch": 0.395074170335402, + "grad_norm": 4.883296012878418, + "learning_rate": 4.3415430494409967e-05, + "loss": 0.7928, + "step": 44690 + }, + { + "epoch": 0.3951625735957142, + "grad_norm": 6.279229640960693, + "learning_rate": 4.34139571067381e-05, + "loss": 0.7267, + "step": 44700 + }, + { + "epoch": 0.39525097685602645, + "grad_norm": 5.529674530029297, + "learning_rate": 4.341248371906623e-05, + "loss": 0.7277, + "step": 44710 + }, + { + "epoch": 0.3953393801163387, + "grad_norm": 8.532251358032227, + "learning_rate": 4.341101033139436e-05, + "loss": 0.8273, + "step": 44720 + }, + { + "epoch": 0.3954277833766509, + "grad_norm": 2.29500150680542, + "learning_rate": 4.340953694372249e-05, + "loss": 0.7567, + "step": 44730 + }, + { + "epoch": 0.3955161866369632, + "grad_norm": 3.6768202781677246, + "learning_rate": 4.3408063556050615e-05, + "loss": 0.8448, + "step": 44740 + }, + { + "epoch": 0.3956045898972754, + "grad_norm": 3.7816247940063477, + "learning_rate": 4.3406590168378744e-05, + "loss": 0.7814, + "step": 44750 + }, + { + "epoch": 0.39569299315758766, + "grad_norm": 4.607700824737549, + "learning_rate": 4.340511678070687e-05, + "loss": 0.8145, + "step": 44760 + }, + { + "epoch": 0.3957813964178999, + "grad_norm": 3.395230770111084, + "learning_rate": 4.340364339303501e-05, + "loss": 0.8027, + "step": 44770 + }, + { + "epoch": 0.3958697996782121, + "grad_norm": 4.820852756500244, + "learning_rate": 4.3402170005363135e-05, + "loss": 0.7184, + "step": 44780 + }, + { + "epoch": 0.39595820293852435, + "grad_norm": 8.004124641418457, + "learning_rate": 4.3400696617691264e-05, + "loss": 0.7167, + "step": 44790 + }, + { + "epoch": 0.39604660619883664, + "grad_norm": 3.0855867862701416, + "learning_rate": 4.339922323001939e-05, + "loss": 0.7763, + "step": 44800 + }, + { + "epoch": 0.39613500945914887, + "grad_norm": 3.385728120803833, + "learning_rate": 4.339774984234752e-05, + "loss": 0.8062, + "step": 44810 + }, + { + "epoch": 0.3962234127194611, + "grad_norm": 6.4531426429748535, + "learning_rate": 4.339627645467565e-05, + "loss": 0.8392, + "step": 44820 + }, + { + "epoch": 0.39631181597977333, + "grad_norm": 2.572552442550659, + "learning_rate": 4.3394803067003784e-05, + "loss": 0.7051, + "step": 44830 + }, + { + "epoch": 0.39640021924008556, + "grad_norm": 3.2080845832824707, + "learning_rate": 4.3393329679331905e-05, + "loss": 0.7502, + "step": 44840 + }, + { + "epoch": 0.3964886225003978, + "grad_norm": 1.4954874515533447, + "learning_rate": 4.339185629166004e-05, + "loss": 0.7671, + "step": 44850 + }, + { + "epoch": 0.3965770257607101, + "grad_norm": 2.5237584114074707, + "learning_rate": 4.339038290398817e-05, + "loss": 0.6499, + "step": 44860 + }, + { + "epoch": 0.3966654290210223, + "grad_norm": 3.3950321674346924, + "learning_rate": 4.33889095163163e-05, + "loss": 0.7518, + "step": 44870 + }, + { + "epoch": 0.39675383228133454, + "grad_norm": 6.780747413635254, + "learning_rate": 4.3387436128644426e-05, + "loss": 0.767, + "step": 44880 + }, + { + "epoch": 0.39684223554164677, + "grad_norm": 4.4082722663879395, + "learning_rate": 4.338596274097256e-05, + "loss": 0.7036, + "step": 44890 + }, + { + "epoch": 0.396930638801959, + "grad_norm": 1.5104519128799438, + "learning_rate": 4.338448935330068e-05, + "loss": 0.7855, + "step": 44900 + }, + { + "epoch": 0.39701904206227123, + "grad_norm": 3.7163853645324707, + "learning_rate": 4.338301596562882e-05, + "loss": 0.6486, + "step": 44910 + }, + { + "epoch": 0.3971074453225835, + "grad_norm": 6.42086124420166, + "learning_rate": 4.3381542577956946e-05, + "loss": 0.7757, + "step": 44920 + }, + { + "epoch": 0.39719584858289575, + "grad_norm": 3.1415579319000244, + "learning_rate": 4.3380069190285074e-05, + "loss": 0.7589, + "step": 44930 + }, + { + "epoch": 0.397284251843208, + "grad_norm": 7.206704139709473, + "learning_rate": 4.33785958026132e-05, + "loss": 0.5722, + "step": 44940 + }, + { + "epoch": 0.3973726551035202, + "grad_norm": 9.921045303344727, + "learning_rate": 4.337712241494133e-05, + "loss": 0.6701, + "step": 44950 + }, + { + "epoch": 0.39746105836383244, + "grad_norm": 3.760974168777466, + "learning_rate": 4.337564902726946e-05, + "loss": 0.6208, + "step": 44960 + }, + { + "epoch": 0.3975494616241447, + "grad_norm": 2.382204294204712, + "learning_rate": 4.3374175639597594e-05, + "loss": 0.7419, + "step": 44970 + }, + { + "epoch": 0.39763786488445696, + "grad_norm": 1.4210044145584106, + "learning_rate": 4.3372702251925716e-05, + "loss": 0.7163, + "step": 44980 + }, + { + "epoch": 0.3977262681447692, + "grad_norm": 4.760910511016846, + "learning_rate": 4.337122886425385e-05, + "loss": 0.7925, + "step": 44990 + }, + { + "epoch": 0.3978146714050814, + "grad_norm": 2.803283214569092, + "learning_rate": 4.336975547658198e-05, + "loss": 0.7196, + "step": 45000 + }, + { + "epoch": 0.39790307466539365, + "grad_norm": 2.0723612308502197, + "learning_rate": 4.336828208891011e-05, + "loss": 0.6277, + "step": 45010 + }, + { + "epoch": 0.3979914779257059, + "grad_norm": 4.410577297210693, + "learning_rate": 4.3366808701238236e-05, + "loss": 0.7849, + "step": 45020 + }, + { + "epoch": 0.3980798811860181, + "grad_norm": 3.897620916366577, + "learning_rate": 4.336533531356637e-05, + "loss": 0.7683, + "step": 45030 + }, + { + "epoch": 0.3981682844463304, + "grad_norm": 1.4067083597183228, + "learning_rate": 4.336386192589449e-05, + "loss": 0.8945, + "step": 45040 + }, + { + "epoch": 0.39825668770664263, + "grad_norm": 2.771552801132202, + "learning_rate": 4.336238853822263e-05, + "loss": 0.6967, + "step": 45050 + }, + { + "epoch": 0.39834509096695486, + "grad_norm": 5.584054470062256, + "learning_rate": 4.3360915150550756e-05, + "loss": 0.628, + "step": 45060 + }, + { + "epoch": 0.3984334942272671, + "grad_norm": 3.6713294982910156, + "learning_rate": 4.3359441762878885e-05, + "loss": 0.6827, + "step": 45070 + }, + { + "epoch": 0.3985218974875793, + "grad_norm": 2.9452860355377197, + "learning_rate": 4.335796837520701e-05, + "loss": 0.6416, + "step": 45080 + }, + { + "epoch": 0.39861030074789155, + "grad_norm": 1.9640501737594604, + "learning_rate": 4.335649498753514e-05, + "loss": 0.7386, + "step": 45090 + }, + { + "epoch": 0.39869870400820384, + "grad_norm": 3.809224843978882, + "learning_rate": 4.335502159986327e-05, + "loss": 0.7708, + "step": 45100 + }, + { + "epoch": 0.3987871072685161, + "grad_norm": 2.6763768196105957, + "learning_rate": 4.3353548212191405e-05, + "loss": 0.7598, + "step": 45110 + }, + { + "epoch": 0.3988755105288283, + "grad_norm": 2.7607669830322266, + "learning_rate": 4.3352074824519526e-05, + "loss": 0.7248, + "step": 45120 + }, + { + "epoch": 0.39896391378914053, + "grad_norm": 1.9777405261993408, + "learning_rate": 4.335060143684766e-05, + "loss": 0.7871, + "step": 45130 + }, + { + "epoch": 0.39905231704945276, + "grad_norm": 5.079465389251709, + "learning_rate": 4.334912804917579e-05, + "loss": 0.7279, + "step": 45140 + }, + { + "epoch": 0.39914072030976505, + "grad_norm": 7.746766567230225, + "learning_rate": 4.334765466150392e-05, + "loss": 0.7292, + "step": 45150 + }, + { + "epoch": 0.3992291235700773, + "grad_norm": 5.309417724609375, + "learning_rate": 4.334618127383205e-05, + "loss": 0.9228, + "step": 45160 + }, + { + "epoch": 0.3993175268303895, + "grad_norm": 3.4967904090881348, + "learning_rate": 4.334470788616018e-05, + "loss": 0.7025, + "step": 45170 + }, + { + "epoch": 0.39940593009070174, + "grad_norm": 8.043968200683594, + "learning_rate": 4.33432344984883e-05, + "loss": 0.6382, + "step": 45180 + }, + { + "epoch": 0.399494333351014, + "grad_norm": 4.333873271942139, + "learning_rate": 4.334176111081644e-05, + "loss": 0.7109, + "step": 45190 + }, + { + "epoch": 0.3995827366113262, + "grad_norm": 3.7081408500671387, + "learning_rate": 4.334028772314456e-05, + "loss": 0.7677, + "step": 45200 + }, + { + "epoch": 0.3996711398716385, + "grad_norm": 6.2343668937683105, + "learning_rate": 4.3338814335472695e-05, + "loss": 0.7573, + "step": 45210 + }, + { + "epoch": 0.3997595431319507, + "grad_norm": 3.258674383163452, + "learning_rate": 4.3337340947800824e-05, + "loss": 0.7545, + "step": 45220 + }, + { + "epoch": 0.39984794639226295, + "grad_norm": 2.7142248153686523, + "learning_rate": 4.333586756012895e-05, + "loss": 0.7248, + "step": 45230 + }, + { + "epoch": 0.3999363496525752, + "grad_norm": 4.933760643005371, + "learning_rate": 4.333439417245708e-05, + "loss": 0.6933, + "step": 45240 + }, + { + "epoch": 0.4000247529128874, + "grad_norm": 4.056682109832764, + "learning_rate": 4.3332920784785215e-05, + "loss": 0.661, + "step": 45250 + }, + { + "epoch": 0.40011315617319965, + "grad_norm": 2.1073946952819824, + "learning_rate": 4.333144739711334e-05, + "loss": 0.8888, + "step": 45260 + }, + { + "epoch": 0.40020155943351193, + "grad_norm": 2.877234935760498, + "learning_rate": 4.332997400944147e-05, + "loss": 0.8523, + "step": 45270 + }, + { + "epoch": 0.40028996269382416, + "grad_norm": 3.1276321411132812, + "learning_rate": 4.33285006217696e-05, + "loss": 0.8019, + "step": 45280 + }, + { + "epoch": 0.4003783659541364, + "grad_norm": 6.948557376861572, + "learning_rate": 4.332702723409773e-05, + "loss": 0.623, + "step": 45290 + }, + { + "epoch": 0.4004667692144486, + "grad_norm": 4.216034889221191, + "learning_rate": 4.332555384642586e-05, + "loss": 0.7965, + "step": 45300 + }, + { + "epoch": 0.40055517247476086, + "grad_norm": 4.658138751983643, + "learning_rate": 4.3324080458753986e-05, + "loss": 0.7569, + "step": 45310 + }, + { + "epoch": 0.4006435757350731, + "grad_norm": 2.5574252605438232, + "learning_rate": 4.3322607071082114e-05, + "loss": 0.7729, + "step": 45320 + }, + { + "epoch": 0.4007319789953854, + "grad_norm": 3.4924285411834717, + "learning_rate": 4.332113368341025e-05, + "loss": 0.6704, + "step": 45330 + }, + { + "epoch": 0.4008203822556976, + "grad_norm": 5.622508525848389, + "learning_rate": 4.331966029573837e-05, + "loss": 0.6332, + "step": 45340 + }, + { + "epoch": 0.40090878551600984, + "grad_norm": 12.408244132995605, + "learning_rate": 4.3318186908066506e-05, + "loss": 0.7196, + "step": 45350 + }, + { + "epoch": 0.40099718877632207, + "grad_norm": 3.5734784603118896, + "learning_rate": 4.3316713520394634e-05, + "loss": 0.8647, + "step": 45360 + }, + { + "epoch": 0.4010855920366343, + "grad_norm": 8.04663372039795, + "learning_rate": 4.331524013272276e-05, + "loss": 0.7479, + "step": 45370 + }, + { + "epoch": 0.40117399529694653, + "grad_norm": 4.209608554840088, + "learning_rate": 4.331376674505089e-05, + "loss": 0.8009, + "step": 45380 + }, + { + "epoch": 0.4012623985572588, + "grad_norm": 2.2174675464630127, + "learning_rate": 4.3312293357379026e-05, + "loss": 0.7681, + "step": 45390 + }, + { + "epoch": 0.40135080181757105, + "grad_norm": 3.37431001663208, + "learning_rate": 4.331081996970715e-05, + "loss": 0.772, + "step": 45400 + }, + { + "epoch": 0.4014392050778833, + "grad_norm": 3.809494733810425, + "learning_rate": 4.330934658203528e-05, + "loss": 0.6842, + "step": 45410 + }, + { + "epoch": 0.4015276083381955, + "grad_norm": 4.1557087898254395, + "learning_rate": 4.3307873194363404e-05, + "loss": 0.8625, + "step": 45420 + }, + { + "epoch": 0.40161601159850774, + "grad_norm": 5.699997425079346, + "learning_rate": 4.330639980669154e-05, + "loss": 0.7146, + "step": 45430 + }, + { + "epoch": 0.40170441485881997, + "grad_norm": 3.7727091312408447, + "learning_rate": 4.330492641901967e-05, + "loss": 0.7158, + "step": 45440 + }, + { + "epoch": 0.40179281811913226, + "grad_norm": 13.562322616577148, + "learning_rate": 4.3303453031347796e-05, + "loss": 0.728, + "step": 45450 + }, + { + "epoch": 0.4018812213794445, + "grad_norm": 8.282513618469238, + "learning_rate": 4.3301979643675924e-05, + "loss": 0.6925, + "step": 45460 + }, + { + "epoch": 0.4019696246397567, + "grad_norm": 3.493535041809082, + "learning_rate": 4.330050625600406e-05, + "loss": 0.7821, + "step": 45470 + }, + { + "epoch": 0.40205802790006895, + "grad_norm": 2.2107646465301514, + "learning_rate": 4.329903286833218e-05, + "loss": 0.7237, + "step": 45480 + }, + { + "epoch": 0.4021464311603812, + "grad_norm": 3.566540241241455, + "learning_rate": 4.3297559480660316e-05, + "loss": 0.7904, + "step": 45490 + }, + { + "epoch": 0.4022348344206934, + "grad_norm": 4.750785827636719, + "learning_rate": 4.3296086092988445e-05, + "loss": 0.7075, + "step": 45500 + }, + { + "epoch": 0.4023232376810057, + "grad_norm": 4.600589752197266, + "learning_rate": 4.329461270531657e-05, + "loss": 0.6793, + "step": 45510 + }, + { + "epoch": 0.4024116409413179, + "grad_norm": 2.5680198669433594, + "learning_rate": 4.32931393176447e-05, + "loss": 0.7164, + "step": 45520 + }, + { + "epoch": 0.40250004420163016, + "grad_norm": 1.9664194583892822, + "learning_rate": 4.3291665929972836e-05, + "loss": 0.7052, + "step": 45530 + }, + { + "epoch": 0.4025884474619424, + "grad_norm": 3.885457754135132, + "learning_rate": 4.329019254230096e-05, + "loss": 0.6824, + "step": 45540 + }, + { + "epoch": 0.4026768507222546, + "grad_norm": 1.6770765781402588, + "learning_rate": 4.328871915462909e-05, + "loss": 0.6608, + "step": 45550 + }, + { + "epoch": 0.40276525398256685, + "grad_norm": 2.748335838317871, + "learning_rate": 4.328724576695722e-05, + "loss": 0.6492, + "step": 45560 + }, + { + "epoch": 0.40285365724287914, + "grad_norm": 2.647151470184326, + "learning_rate": 4.328577237928535e-05, + "loss": 0.6563, + "step": 45570 + }, + { + "epoch": 0.40294206050319137, + "grad_norm": 5.991791725158691, + "learning_rate": 4.328429899161348e-05, + "loss": 0.6322, + "step": 45580 + }, + { + "epoch": 0.4030304637635036, + "grad_norm": 8.063817024230957, + "learning_rate": 4.3282825603941607e-05, + "loss": 0.8245, + "step": 45590 + }, + { + "epoch": 0.40311886702381583, + "grad_norm": 3.5337412357330322, + "learning_rate": 4.3281352216269735e-05, + "loss": 0.8835, + "step": 45600 + }, + { + "epoch": 0.40320727028412806, + "grad_norm": 7.444693088531494, + "learning_rate": 4.327987882859787e-05, + "loss": 0.6971, + "step": 45610 + }, + { + "epoch": 0.4032956735444403, + "grad_norm": 3.863515853881836, + "learning_rate": 4.3278405440926e-05, + "loss": 0.7218, + "step": 45620 + }, + { + "epoch": 0.4033840768047526, + "grad_norm": 7.443853378295898, + "learning_rate": 4.327693205325413e-05, + "loss": 0.74, + "step": 45630 + }, + { + "epoch": 0.4034724800650648, + "grad_norm": 3.8616251945495605, + "learning_rate": 4.3275458665582255e-05, + "loss": 0.7282, + "step": 45640 + }, + { + "epoch": 0.40356088332537704, + "grad_norm": 2.6907570362091064, + "learning_rate": 4.3273985277910383e-05, + "loss": 0.6497, + "step": 45650 + }, + { + "epoch": 0.40364928658568927, + "grad_norm": 2.6877543926239014, + "learning_rate": 4.327251189023851e-05, + "loss": 0.6957, + "step": 45660 + }, + { + "epoch": 0.4037376898460015, + "grad_norm": 2.5589005947113037, + "learning_rate": 4.327103850256664e-05, + "loss": 0.6228, + "step": 45670 + }, + { + "epoch": 0.4038260931063138, + "grad_norm": 3.4897139072418213, + "learning_rate": 4.3269565114894775e-05, + "loss": 0.6918, + "step": 45680 + }, + { + "epoch": 0.403914496366626, + "grad_norm": 7.576402187347412, + "learning_rate": 4.3268091727222904e-05, + "loss": 0.6894, + "step": 45690 + }, + { + "epoch": 0.40400289962693825, + "grad_norm": 3.203425407409668, + "learning_rate": 4.326661833955103e-05, + "loss": 0.8509, + "step": 45700 + }, + { + "epoch": 0.4040913028872505, + "grad_norm": 6.4992876052856445, + "learning_rate": 4.326514495187916e-05, + "loss": 0.5883, + "step": 45710 + }, + { + "epoch": 0.4041797061475627, + "grad_norm": 4.828888893127441, + "learning_rate": 4.326367156420729e-05, + "loss": 0.731, + "step": 45720 + }, + { + "epoch": 0.40426810940787494, + "grad_norm": 7.019105911254883, + "learning_rate": 4.326219817653542e-05, + "loss": 0.7222, + "step": 45730 + }, + { + "epoch": 0.40435651266818723, + "grad_norm": 4.875137805938721, + "learning_rate": 4.326072478886355e-05, + "loss": 0.6626, + "step": 45740 + }, + { + "epoch": 0.40444491592849946, + "grad_norm": 6.4282941818237305, + "learning_rate": 4.325925140119168e-05, + "loss": 0.6805, + "step": 45750 + }, + { + "epoch": 0.4045333191888117, + "grad_norm": 5.351105213165283, + "learning_rate": 4.325777801351981e-05, + "loss": 0.6759, + "step": 45760 + }, + { + "epoch": 0.4046217224491239, + "grad_norm": 5.707307815551758, + "learning_rate": 4.325630462584794e-05, + "loss": 0.6961, + "step": 45770 + }, + { + "epoch": 0.40471012570943615, + "grad_norm": 13.36841869354248, + "learning_rate": 4.3254831238176066e-05, + "loss": 0.6734, + "step": 45780 + }, + { + "epoch": 0.4047985289697484, + "grad_norm": 2.540553569793701, + "learning_rate": 4.3253357850504194e-05, + "loss": 0.6704, + "step": 45790 + }, + { + "epoch": 0.40488693223006067, + "grad_norm": 1.2700477838516235, + "learning_rate": 4.325188446283233e-05, + "loss": 0.6393, + "step": 45800 + }, + { + "epoch": 0.4049753354903729, + "grad_norm": 7.109341621398926, + "learning_rate": 4.325041107516045e-05, + "loss": 0.6969, + "step": 45810 + }, + { + "epoch": 0.40506373875068513, + "grad_norm": 3.18410325050354, + "learning_rate": 4.3248937687488586e-05, + "loss": 0.603, + "step": 45820 + }, + { + "epoch": 0.40515214201099736, + "grad_norm": 3.89701771736145, + "learning_rate": 4.3247464299816714e-05, + "loss": 0.758, + "step": 45830 + }, + { + "epoch": 0.4052405452713096, + "grad_norm": 6.603641986846924, + "learning_rate": 4.324599091214484e-05, + "loss": 0.7111, + "step": 45840 + }, + { + "epoch": 0.4053289485316218, + "grad_norm": 10.412839889526367, + "learning_rate": 4.324451752447297e-05, + "loss": 0.7406, + "step": 45850 + }, + { + "epoch": 0.4054173517919341, + "grad_norm": 3.6471614837646484, + "learning_rate": 4.3243044136801106e-05, + "loss": 0.7184, + "step": 45860 + }, + { + "epoch": 0.40550575505224634, + "grad_norm": 12.33071517944336, + "learning_rate": 4.324157074912923e-05, + "loss": 0.7196, + "step": 45870 + }, + { + "epoch": 0.4055941583125586, + "grad_norm": 4.292669296264648, + "learning_rate": 4.324009736145736e-05, + "loss": 0.7329, + "step": 45880 + }, + { + "epoch": 0.4056825615728708, + "grad_norm": 3.919067859649658, + "learning_rate": 4.3238623973785484e-05, + "loss": 0.7455, + "step": 45890 + }, + { + "epoch": 0.40577096483318303, + "grad_norm": 3.294001817703247, + "learning_rate": 4.323715058611362e-05, + "loss": 0.8079, + "step": 45900 + }, + { + "epoch": 0.40585936809349527, + "grad_norm": 1.6107417345046997, + "learning_rate": 4.323567719844175e-05, + "loss": 0.6853, + "step": 45910 + }, + { + "epoch": 0.40594777135380755, + "grad_norm": 12.0560884475708, + "learning_rate": 4.3234203810769876e-05, + "loss": 0.6912, + "step": 45920 + }, + { + "epoch": 0.4060361746141198, + "grad_norm": 1.6195980310440063, + "learning_rate": 4.3232730423098004e-05, + "loss": 0.6615, + "step": 45930 + }, + { + "epoch": 0.406124577874432, + "grad_norm": 5.623993873596191, + "learning_rate": 4.323125703542614e-05, + "loss": 0.7895, + "step": 45940 + }, + { + "epoch": 0.40621298113474424, + "grad_norm": 4.299506187438965, + "learning_rate": 4.322978364775426e-05, + "loss": 0.6948, + "step": 45950 + }, + { + "epoch": 0.4063013843950565, + "grad_norm": 2.0287930965423584, + "learning_rate": 4.3228310260082396e-05, + "loss": 0.7466, + "step": 45960 + }, + { + "epoch": 0.4063897876553687, + "grad_norm": 2.0097010135650635, + "learning_rate": 4.3226836872410525e-05, + "loss": 0.7427, + "step": 45970 + }, + { + "epoch": 0.406478190915681, + "grad_norm": 2.9460787773132324, + "learning_rate": 4.322536348473865e-05, + "loss": 0.6041, + "step": 45980 + }, + { + "epoch": 0.4065665941759932, + "grad_norm": 6.002807140350342, + "learning_rate": 4.322389009706678e-05, + "loss": 0.6147, + "step": 45990 + }, + { + "epoch": 0.40665499743630545, + "grad_norm": 1.2274689674377441, + "learning_rate": 4.3222416709394917e-05, + "loss": 0.6377, + "step": 46000 + }, + { + "epoch": 0.4067434006966177, + "grad_norm": 3.9840760231018066, + "learning_rate": 4.322094332172304e-05, + "loss": 0.7048, + "step": 46010 + }, + { + "epoch": 0.4068318039569299, + "grad_norm": 2.5341808795928955, + "learning_rate": 4.321946993405117e-05, + "loss": 0.703, + "step": 46020 + }, + { + "epoch": 0.40692020721724215, + "grad_norm": 2.6240248680114746, + "learning_rate": 4.3217996546379295e-05, + "loss": 0.592, + "step": 46030 + }, + { + "epoch": 0.40700861047755443, + "grad_norm": 4.737199783325195, + "learning_rate": 4.321652315870743e-05, + "loss": 0.7852, + "step": 46040 + }, + { + "epoch": 0.40709701373786666, + "grad_norm": 6.359683036804199, + "learning_rate": 4.321504977103556e-05, + "loss": 0.6721, + "step": 46050 + }, + { + "epoch": 0.4071854169981789, + "grad_norm": 2.270880699157715, + "learning_rate": 4.321357638336369e-05, + "loss": 0.6013, + "step": 46060 + }, + { + "epoch": 0.4072738202584911, + "grad_norm": 6.392951488494873, + "learning_rate": 4.3212102995691815e-05, + "loss": 0.8781, + "step": 46070 + }, + { + "epoch": 0.40736222351880336, + "grad_norm": 2.4666483402252197, + "learning_rate": 4.321062960801995e-05, + "loss": 0.6326, + "step": 46080 + }, + { + "epoch": 0.4074506267791156, + "grad_norm": 6.785998344421387, + "learning_rate": 4.320915622034807e-05, + "loss": 0.7256, + "step": 46090 + }, + { + "epoch": 0.4075390300394279, + "grad_norm": 4.3129472732543945, + "learning_rate": 4.320768283267621e-05, + "loss": 0.7233, + "step": 46100 + }, + { + "epoch": 0.4076274332997401, + "grad_norm": 2.7423253059387207, + "learning_rate": 4.3206209445004335e-05, + "loss": 0.5911, + "step": 46110 + }, + { + "epoch": 0.40771583656005234, + "grad_norm": 4.845990180969238, + "learning_rate": 4.3204736057332464e-05, + "loss": 0.7796, + "step": 46120 + }, + { + "epoch": 0.40780423982036457, + "grad_norm": 6.406247138977051, + "learning_rate": 4.320326266966059e-05, + "loss": 0.7896, + "step": 46130 + }, + { + "epoch": 0.4078926430806768, + "grad_norm": 5.142701625823975, + "learning_rate": 4.320178928198872e-05, + "loss": 0.7059, + "step": 46140 + }, + { + "epoch": 0.40798104634098903, + "grad_norm": 6.629715442657471, + "learning_rate": 4.320031589431685e-05, + "loss": 0.7531, + "step": 46150 + }, + { + "epoch": 0.4080694496013013, + "grad_norm": 2.1500115394592285, + "learning_rate": 4.3198842506644984e-05, + "loss": 0.7188, + "step": 46160 + }, + { + "epoch": 0.40815785286161355, + "grad_norm": 3.7265231609344482, + "learning_rate": 4.3197369118973105e-05, + "loss": 0.6814, + "step": 46170 + }, + { + "epoch": 0.4082462561219258, + "grad_norm": 5.071183681488037, + "learning_rate": 4.319589573130124e-05, + "loss": 0.7336, + "step": 46180 + }, + { + "epoch": 0.408334659382238, + "grad_norm": 4.056783199310303, + "learning_rate": 4.319442234362937e-05, + "loss": 0.6939, + "step": 46190 + }, + { + "epoch": 0.40842306264255024, + "grad_norm": 4.755985260009766, + "learning_rate": 4.31929489559575e-05, + "loss": 0.6514, + "step": 46200 + }, + { + "epoch": 0.4085114659028625, + "grad_norm": 1.7640128135681152, + "learning_rate": 4.3191475568285625e-05, + "loss": 0.7348, + "step": 46210 + }, + { + "epoch": 0.40859986916317476, + "grad_norm": 8.810541152954102, + "learning_rate": 4.319000218061376e-05, + "loss": 0.6172, + "step": 46220 + }, + { + "epoch": 0.408688272423487, + "grad_norm": 10.951849937438965, + "learning_rate": 4.318852879294188e-05, + "loss": 0.8489, + "step": 46230 + }, + { + "epoch": 0.4087766756837992, + "grad_norm": 5.246570110321045, + "learning_rate": 4.318705540527002e-05, + "loss": 0.7214, + "step": 46240 + }, + { + "epoch": 0.40886507894411145, + "grad_norm": 2.8555872440338135, + "learning_rate": 4.318558201759814e-05, + "loss": 0.6974, + "step": 46250 + }, + { + "epoch": 0.4089534822044237, + "grad_norm": 6.7091264724731445, + "learning_rate": 4.3184108629926274e-05, + "loss": 0.6762, + "step": 46260 + }, + { + "epoch": 0.40904188546473597, + "grad_norm": 4.609272480010986, + "learning_rate": 4.31826352422544e-05, + "loss": 0.5934, + "step": 46270 + }, + { + "epoch": 0.4091302887250482, + "grad_norm": 8.799849510192871, + "learning_rate": 4.318116185458253e-05, + "loss": 0.7227, + "step": 46280 + }, + { + "epoch": 0.4092186919853604, + "grad_norm": 7.23222017288208, + "learning_rate": 4.317968846691066e-05, + "loss": 0.7064, + "step": 46290 + }, + { + "epoch": 0.40930709524567266, + "grad_norm": 3.8420917987823486, + "learning_rate": 4.3178215079238794e-05, + "loss": 0.6214, + "step": 46300 + }, + { + "epoch": 0.4093954985059849, + "grad_norm": 2.3287384510040283, + "learning_rate": 4.3176741691566916e-05, + "loss": 0.7229, + "step": 46310 + }, + { + "epoch": 0.4094839017662971, + "grad_norm": 6.242074012756348, + "learning_rate": 4.317526830389505e-05, + "loss": 0.7189, + "step": 46320 + }, + { + "epoch": 0.4095723050266094, + "grad_norm": 8.26706600189209, + "learning_rate": 4.317379491622318e-05, + "loss": 0.826, + "step": 46330 + }, + { + "epoch": 0.40966070828692164, + "grad_norm": 8.299762725830078, + "learning_rate": 4.317232152855131e-05, + "loss": 0.7992, + "step": 46340 + }, + { + "epoch": 0.40974911154723387, + "grad_norm": 1.838484764099121, + "learning_rate": 4.3170848140879436e-05, + "loss": 0.7939, + "step": 46350 + }, + { + "epoch": 0.4098375148075461, + "grad_norm": 2.187119483947754, + "learning_rate": 4.3169374753207564e-05, + "loss": 0.7924, + "step": 46360 + }, + { + "epoch": 0.40992591806785833, + "grad_norm": 4.523279666900635, + "learning_rate": 4.316790136553569e-05, + "loss": 0.6176, + "step": 46370 + }, + { + "epoch": 0.41001432132817056, + "grad_norm": 10.365835189819336, + "learning_rate": 4.316642797786383e-05, + "loss": 0.7321, + "step": 46380 + }, + { + "epoch": 0.41010272458848285, + "grad_norm": 1.1902995109558105, + "learning_rate": 4.316495459019195e-05, + "loss": 0.6697, + "step": 46390 + }, + { + "epoch": 0.4101911278487951, + "grad_norm": 5.285069942474365, + "learning_rate": 4.3163481202520085e-05, + "loss": 0.7626, + "step": 46400 + }, + { + "epoch": 0.4102795311091073, + "grad_norm": 6.092878341674805, + "learning_rate": 4.316200781484821e-05, + "loss": 0.6946, + "step": 46410 + }, + { + "epoch": 0.41036793436941954, + "grad_norm": 3.1772255897521973, + "learning_rate": 4.316053442717634e-05, + "loss": 0.6973, + "step": 46420 + }, + { + "epoch": 0.41045633762973177, + "grad_norm": 5.581402778625488, + "learning_rate": 4.315906103950447e-05, + "loss": 0.6844, + "step": 46430 + }, + { + "epoch": 0.410544740890044, + "grad_norm": 2.424489974975586, + "learning_rate": 4.3157587651832605e-05, + "loss": 0.7254, + "step": 46440 + }, + { + "epoch": 0.4106331441503563, + "grad_norm": 12.22135066986084, + "learning_rate": 4.3156114264160726e-05, + "loss": 0.641, + "step": 46450 + }, + { + "epoch": 0.4107215474106685, + "grad_norm": 4.146501064300537, + "learning_rate": 4.315464087648886e-05, + "loss": 0.7661, + "step": 46460 + }, + { + "epoch": 0.41080995067098075, + "grad_norm": 2.738677978515625, + "learning_rate": 4.315316748881699e-05, + "loss": 0.6031, + "step": 46470 + }, + { + "epoch": 0.410898353931293, + "grad_norm": 6.335766315460205, + "learning_rate": 4.315169410114512e-05, + "loss": 0.6432, + "step": 46480 + }, + { + "epoch": 0.4109867571916052, + "grad_norm": 7.835579872131348, + "learning_rate": 4.3150220713473247e-05, + "loss": 0.7454, + "step": 46490 + }, + { + "epoch": 0.41107516045191744, + "grad_norm": 6.93509578704834, + "learning_rate": 4.3148747325801375e-05, + "loss": 0.7536, + "step": 46500 + }, + { + "epoch": 0.41116356371222973, + "grad_norm": 3.279168128967285, + "learning_rate": 4.31472739381295e-05, + "loss": 0.8043, + "step": 46510 + }, + { + "epoch": 0.41125196697254196, + "grad_norm": 1.5511794090270996, + "learning_rate": 4.314580055045764e-05, + "loss": 0.5827, + "step": 46520 + }, + { + "epoch": 0.4113403702328542, + "grad_norm": 3.3686983585357666, + "learning_rate": 4.314432716278577e-05, + "loss": 0.8558, + "step": 46530 + }, + { + "epoch": 0.4114287734931664, + "grad_norm": 4.269227504730225, + "learning_rate": 4.3142853775113895e-05, + "loss": 0.7983, + "step": 46540 + }, + { + "epoch": 0.41151717675347865, + "grad_norm": 3.802067756652832, + "learning_rate": 4.3141380387442023e-05, + "loss": 0.8038, + "step": 46550 + }, + { + "epoch": 0.4116055800137909, + "grad_norm": 3.33076548576355, + "learning_rate": 4.313990699977015e-05, + "loss": 0.6559, + "step": 46560 + }, + { + "epoch": 0.41169398327410317, + "grad_norm": 3.8315722942352295, + "learning_rate": 4.313843361209828e-05, + "loss": 0.7363, + "step": 46570 + }, + { + "epoch": 0.4117823865344154, + "grad_norm": 4.44300651550293, + "learning_rate": 4.3136960224426415e-05, + "loss": 0.8118, + "step": 46580 + }, + { + "epoch": 0.41187078979472763, + "grad_norm": 1.329114556312561, + "learning_rate": 4.3135486836754544e-05, + "loss": 0.5832, + "step": 46590 + }, + { + "epoch": 0.41195919305503986, + "grad_norm": 1.7089744806289673, + "learning_rate": 4.313401344908267e-05, + "loss": 0.6526, + "step": 46600 + }, + { + "epoch": 0.4120475963153521, + "grad_norm": 6.235609531402588, + "learning_rate": 4.31325400614108e-05, + "loss": 0.6764, + "step": 46610 + }, + { + "epoch": 0.4121359995756643, + "grad_norm": 1.7314995527267456, + "learning_rate": 4.313106667373893e-05, + "loss": 0.5767, + "step": 46620 + }, + { + "epoch": 0.4122244028359766, + "grad_norm": 4.788182735443115, + "learning_rate": 4.3129593286067064e-05, + "loss": 0.7667, + "step": 46630 + }, + { + "epoch": 0.41231280609628884, + "grad_norm": 2.273735284805298, + "learning_rate": 4.3128119898395185e-05, + "loss": 0.5933, + "step": 46640 + }, + { + "epoch": 0.4124012093566011, + "grad_norm": 9.519006729125977, + "learning_rate": 4.312664651072332e-05, + "loss": 0.6746, + "step": 46650 + }, + { + "epoch": 0.4124896126169133, + "grad_norm": 10.015620231628418, + "learning_rate": 4.312517312305145e-05, + "loss": 0.6826, + "step": 46660 + }, + { + "epoch": 0.41257801587722553, + "grad_norm": 4.5188798904418945, + "learning_rate": 4.312369973537958e-05, + "loss": 0.7965, + "step": 46670 + }, + { + "epoch": 0.41266641913753777, + "grad_norm": 11.5077486038208, + "learning_rate": 4.3122226347707706e-05, + "loss": 0.7107, + "step": 46680 + }, + { + "epoch": 0.41275482239785005, + "grad_norm": 4.420197486877441, + "learning_rate": 4.312075296003584e-05, + "loss": 0.8013, + "step": 46690 + }, + { + "epoch": 0.4128432256581623, + "grad_norm": 3.409897565841675, + "learning_rate": 4.311927957236396e-05, + "loss": 0.5752, + "step": 46700 + }, + { + "epoch": 0.4129316289184745, + "grad_norm": 5.816707134246826, + "learning_rate": 4.31178061846921e-05, + "loss": 0.8147, + "step": 46710 + }, + { + "epoch": 0.41302003217878674, + "grad_norm": 2.3938848972320557, + "learning_rate": 4.311633279702022e-05, + "loss": 0.6432, + "step": 46720 + }, + { + "epoch": 0.413108435439099, + "grad_norm": 6.591314315795898, + "learning_rate": 4.3114859409348354e-05, + "loss": 0.5776, + "step": 46730 + }, + { + "epoch": 0.41319683869941126, + "grad_norm": 3.783867835998535, + "learning_rate": 4.311338602167648e-05, + "loss": 0.8421, + "step": 46740 + }, + { + "epoch": 0.4132852419597235, + "grad_norm": 3.204338788986206, + "learning_rate": 4.311191263400461e-05, + "loss": 0.8047, + "step": 46750 + }, + { + "epoch": 0.4133736452200357, + "grad_norm": 4.927525997161865, + "learning_rate": 4.311043924633274e-05, + "loss": 0.761, + "step": 46760 + }, + { + "epoch": 0.41346204848034795, + "grad_norm": 3.0009915828704834, + "learning_rate": 4.3108965858660874e-05, + "loss": 0.6035, + "step": 46770 + }, + { + "epoch": 0.4135504517406602, + "grad_norm": 3.8898062705993652, + "learning_rate": 4.3107492470988996e-05, + "loss": 0.7727, + "step": 46780 + }, + { + "epoch": 0.4136388550009724, + "grad_norm": 9.556239128112793, + "learning_rate": 4.310601908331713e-05, + "loss": 0.5699, + "step": 46790 + }, + { + "epoch": 0.4137272582612847, + "grad_norm": 4.684115409851074, + "learning_rate": 4.310454569564526e-05, + "loss": 0.7099, + "step": 46800 + }, + { + "epoch": 0.41381566152159693, + "grad_norm": 10.142693519592285, + "learning_rate": 4.310307230797339e-05, + "loss": 0.7854, + "step": 46810 + }, + { + "epoch": 0.41390406478190916, + "grad_norm": 8.784010887145996, + "learning_rate": 4.3101598920301516e-05, + "loss": 0.7731, + "step": 46820 + }, + { + "epoch": 0.4139924680422214, + "grad_norm": 3.1128392219543457, + "learning_rate": 4.3100125532629644e-05, + "loss": 0.6951, + "step": 46830 + }, + { + "epoch": 0.4140808713025336, + "grad_norm": 4.86821174621582, + "learning_rate": 4.309865214495777e-05, + "loss": 0.7843, + "step": 46840 + }, + { + "epoch": 0.41416927456284586, + "grad_norm": 5.835916996002197, + "learning_rate": 4.309717875728591e-05, + "loss": 0.715, + "step": 46850 + }, + { + "epoch": 0.41425767782315814, + "grad_norm": 5.9440131187438965, + "learning_rate": 4.309570536961403e-05, + "loss": 0.9036, + "step": 46860 + }, + { + "epoch": 0.4143460810834704, + "grad_norm": 3.788001775741577, + "learning_rate": 4.3094231981942165e-05, + "loss": 0.7966, + "step": 46870 + }, + { + "epoch": 0.4144344843437826, + "grad_norm": 3.809252977371216, + "learning_rate": 4.309275859427029e-05, + "loss": 0.7644, + "step": 46880 + }, + { + "epoch": 0.41452288760409484, + "grad_norm": 2.9296512603759766, + "learning_rate": 4.309128520659842e-05, + "loss": 0.6768, + "step": 46890 + }, + { + "epoch": 0.41461129086440707, + "grad_norm": 3.632952928543091, + "learning_rate": 4.308981181892655e-05, + "loss": 0.792, + "step": 46900 + }, + { + "epoch": 0.4146996941247193, + "grad_norm": 2.2417125701904297, + "learning_rate": 4.3088338431254685e-05, + "loss": 0.557, + "step": 46910 + }, + { + "epoch": 0.4147880973850316, + "grad_norm": 4.355308532714844, + "learning_rate": 4.3086865043582806e-05, + "loss": 0.7944, + "step": 46920 + }, + { + "epoch": 0.4148765006453438, + "grad_norm": 7.980536937713623, + "learning_rate": 4.308539165591094e-05, + "loss": 0.7624, + "step": 46930 + }, + { + "epoch": 0.41496490390565605, + "grad_norm": 9.85515308380127, + "learning_rate": 4.308391826823907e-05, + "loss": 0.7009, + "step": 46940 + }, + { + "epoch": 0.4150533071659683, + "grad_norm": 7.113284111022949, + "learning_rate": 4.30824448805672e-05, + "loss": 0.7405, + "step": 46950 + }, + { + "epoch": 0.4151417104262805, + "grad_norm": 5.405037879943848, + "learning_rate": 4.3080971492895327e-05, + "loss": 0.7009, + "step": 46960 + }, + { + "epoch": 0.41523011368659274, + "grad_norm": 3.425945520401001, + "learning_rate": 4.3079498105223455e-05, + "loss": 0.7684, + "step": 46970 + }, + { + "epoch": 0.415318516946905, + "grad_norm": 2.992239236831665, + "learning_rate": 4.307802471755158e-05, + "loss": 0.6517, + "step": 46980 + }, + { + "epoch": 0.41540692020721726, + "grad_norm": 2.573235511779785, + "learning_rate": 4.307655132987972e-05, + "loss": 0.6795, + "step": 46990 + }, + { + "epoch": 0.4154953234675295, + "grad_norm": 2.5305981636047363, + "learning_rate": 4.307507794220784e-05, + "loss": 0.6851, + "step": 47000 + }, + { + "epoch": 0.4155837267278417, + "grad_norm": 2.1519672870635986, + "learning_rate": 4.3073604554535975e-05, + "loss": 0.619, + "step": 47010 + }, + { + "epoch": 0.41567212998815395, + "grad_norm": 5.283392906188965, + "learning_rate": 4.3072131166864103e-05, + "loss": 0.8358, + "step": 47020 + }, + { + "epoch": 0.4157605332484662, + "grad_norm": 3.2216572761535645, + "learning_rate": 4.307065777919223e-05, + "loss": 0.7383, + "step": 47030 + }, + { + "epoch": 0.41584893650877847, + "grad_norm": 2.4524753093719482, + "learning_rate": 4.306918439152036e-05, + "loss": 0.7274, + "step": 47040 + }, + { + "epoch": 0.4159373397690907, + "grad_norm": 3.2131965160369873, + "learning_rate": 4.3067711003848495e-05, + "loss": 0.6277, + "step": 47050 + }, + { + "epoch": 0.41602574302940293, + "grad_norm": 1.1415796279907227, + "learning_rate": 4.306623761617662e-05, + "loss": 0.624, + "step": 47060 + }, + { + "epoch": 0.41611414628971516, + "grad_norm": 3.8114659786224365, + "learning_rate": 4.306476422850475e-05, + "loss": 0.6871, + "step": 47070 + }, + { + "epoch": 0.4162025495500274, + "grad_norm": 3.4970264434814453, + "learning_rate": 4.3063290840832874e-05, + "loss": 0.6781, + "step": 47080 + }, + { + "epoch": 0.4162909528103396, + "grad_norm": 3.1682686805725098, + "learning_rate": 4.306181745316101e-05, + "loss": 0.6686, + "step": 47090 + }, + { + "epoch": 0.4163793560706519, + "grad_norm": 2.582796573638916, + "learning_rate": 4.306034406548914e-05, + "loss": 0.7516, + "step": 47100 + }, + { + "epoch": 0.41646775933096414, + "grad_norm": 1.6551960706710815, + "learning_rate": 4.3058870677817265e-05, + "loss": 0.6789, + "step": 47110 + }, + { + "epoch": 0.41655616259127637, + "grad_norm": 2.9809072017669678, + "learning_rate": 4.3057397290145394e-05, + "loss": 0.619, + "step": 47120 + }, + { + "epoch": 0.4166445658515886, + "grad_norm": 4.427015781402588, + "learning_rate": 4.305592390247353e-05, + "loss": 0.6953, + "step": 47130 + }, + { + "epoch": 0.41673296911190083, + "grad_norm": 2.082231283187866, + "learning_rate": 4.305445051480165e-05, + "loss": 0.7311, + "step": 47140 + }, + { + "epoch": 0.41682137237221306, + "grad_norm": 2.5461673736572266, + "learning_rate": 4.3052977127129786e-05, + "loss": 0.6575, + "step": 47150 + }, + { + "epoch": 0.41690977563252535, + "grad_norm": 2.1919140815734863, + "learning_rate": 4.3051503739457914e-05, + "loss": 0.7268, + "step": 47160 + }, + { + "epoch": 0.4169981788928376, + "grad_norm": 2.6959102153778076, + "learning_rate": 4.305003035178604e-05, + "loss": 0.6995, + "step": 47170 + }, + { + "epoch": 0.4170865821531498, + "grad_norm": 4.8872456550598145, + "learning_rate": 4.304855696411417e-05, + "loss": 0.7612, + "step": 47180 + }, + { + "epoch": 0.41717498541346204, + "grad_norm": 1.840260624885559, + "learning_rate": 4.30470835764423e-05, + "loss": 0.5063, + "step": 47190 + }, + { + "epoch": 0.41726338867377427, + "grad_norm": 1.9699889421463013, + "learning_rate": 4.304561018877043e-05, + "loss": 0.7761, + "step": 47200 + }, + { + "epoch": 0.4173517919340865, + "grad_norm": 6.508878231048584, + "learning_rate": 4.304413680109856e-05, + "loss": 0.618, + "step": 47210 + }, + { + "epoch": 0.4174401951943988, + "grad_norm": 4.580071449279785, + "learning_rate": 4.3042663413426684e-05, + "loss": 0.6685, + "step": 47220 + }, + { + "epoch": 0.417528598454711, + "grad_norm": 1.9174926280975342, + "learning_rate": 4.304119002575482e-05, + "loss": 0.8065, + "step": 47230 + }, + { + "epoch": 0.41761700171502325, + "grad_norm": 4.042043209075928, + "learning_rate": 4.303971663808295e-05, + "loss": 0.747, + "step": 47240 + }, + { + "epoch": 0.4177054049753355, + "grad_norm": 5.927581310272217, + "learning_rate": 4.3038243250411076e-05, + "loss": 0.7249, + "step": 47250 + }, + { + "epoch": 0.4177938082356477, + "grad_norm": 7.885900974273682, + "learning_rate": 4.3036769862739204e-05, + "loss": 0.6813, + "step": 47260 + }, + { + "epoch": 0.41788221149596, + "grad_norm": 3.979492425918579, + "learning_rate": 4.303529647506734e-05, + "loss": 0.6934, + "step": 47270 + }, + { + "epoch": 0.41797061475627223, + "grad_norm": 13.328450202941895, + "learning_rate": 4.303382308739546e-05, + "loss": 0.7064, + "step": 47280 + }, + { + "epoch": 0.41805901801658446, + "grad_norm": 4.928228378295898, + "learning_rate": 4.3032349699723596e-05, + "loss": 0.7181, + "step": 47290 + }, + { + "epoch": 0.4181474212768967, + "grad_norm": 9.209067344665527, + "learning_rate": 4.303087631205172e-05, + "loss": 0.7984, + "step": 47300 + }, + { + "epoch": 0.4182358245372089, + "grad_norm": 1.4937260150909424, + "learning_rate": 4.302940292437985e-05, + "loss": 0.7779, + "step": 47310 + }, + { + "epoch": 0.41832422779752115, + "grad_norm": 5.342955589294434, + "learning_rate": 4.302792953670798e-05, + "loss": 0.7885, + "step": 47320 + }, + { + "epoch": 0.41841263105783344, + "grad_norm": 2.092026710510254, + "learning_rate": 4.302645614903611e-05, + "loss": 0.6582, + "step": 47330 + }, + { + "epoch": 0.41850103431814567, + "grad_norm": 7.898820400238037, + "learning_rate": 4.302498276136424e-05, + "loss": 0.7518, + "step": 47340 + }, + { + "epoch": 0.4185894375784579, + "grad_norm": 2.6180171966552734, + "learning_rate": 4.302350937369237e-05, + "loss": 0.6744, + "step": 47350 + }, + { + "epoch": 0.41867784083877013, + "grad_norm": 2.7855191230773926, + "learning_rate": 4.30220359860205e-05, + "loss": 0.6565, + "step": 47360 + }, + { + "epoch": 0.41876624409908236, + "grad_norm": 5.365550518035889, + "learning_rate": 4.302056259834863e-05, + "loss": 0.7078, + "step": 47370 + }, + { + "epoch": 0.4188546473593946, + "grad_norm": 5.605372905731201, + "learning_rate": 4.301908921067676e-05, + "loss": 0.703, + "step": 47380 + }, + { + "epoch": 0.4189430506197069, + "grad_norm": 9.14799690246582, + "learning_rate": 4.3017615823004886e-05, + "loss": 0.5925, + "step": 47390 + }, + { + "epoch": 0.4190314538800191, + "grad_norm": 4.325501441955566, + "learning_rate": 4.3016142435333015e-05, + "loss": 0.7511, + "step": 47400 + }, + { + "epoch": 0.41911985714033134, + "grad_norm": 2.7949655055999756, + "learning_rate": 4.301466904766115e-05, + "loss": 0.7299, + "step": 47410 + }, + { + "epoch": 0.4192082604006436, + "grad_norm": 5.639239311218262, + "learning_rate": 4.301319565998928e-05, + "loss": 0.804, + "step": 47420 + }, + { + "epoch": 0.4192966636609558, + "grad_norm": 1.233499526977539, + "learning_rate": 4.301172227231741e-05, + "loss": 0.7503, + "step": 47430 + }, + { + "epoch": 0.41938506692126803, + "grad_norm": 4.049893379211426, + "learning_rate": 4.3010248884645535e-05, + "loss": 0.762, + "step": 47440 + }, + { + "epoch": 0.4194734701815803, + "grad_norm": 2.9958951473236084, + "learning_rate": 4.300877549697366e-05, + "loss": 0.7079, + "step": 47450 + }, + { + "epoch": 0.41956187344189255, + "grad_norm": 2.1400082111358643, + "learning_rate": 4.300730210930179e-05, + "loss": 0.7352, + "step": 47460 + }, + { + "epoch": 0.4196502767022048, + "grad_norm": 3.1742537021636963, + "learning_rate": 4.300582872162992e-05, + "loss": 0.7544, + "step": 47470 + }, + { + "epoch": 0.419738679962517, + "grad_norm": 3.2784366607666016, + "learning_rate": 4.3004355333958055e-05, + "loss": 0.8853, + "step": 47480 + }, + { + "epoch": 0.41982708322282924, + "grad_norm": 5.5541815757751465, + "learning_rate": 4.3002881946286184e-05, + "loss": 0.7872, + "step": 47490 + }, + { + "epoch": 0.4199154864831415, + "grad_norm": 1.592294692993164, + "learning_rate": 4.300140855861431e-05, + "loss": 0.6647, + "step": 47500 + }, + { + "epoch": 0.42000388974345376, + "grad_norm": 2.8090524673461914, + "learning_rate": 4.299993517094244e-05, + "loss": 0.708, + "step": 47510 + }, + { + "epoch": 0.420092293003766, + "grad_norm": 2.53983736038208, + "learning_rate": 4.299846178327057e-05, + "loss": 0.7153, + "step": 47520 + }, + { + "epoch": 0.4201806962640782, + "grad_norm": 3.2868728637695312, + "learning_rate": 4.29969883955987e-05, + "loss": 0.6632, + "step": 47530 + }, + { + "epoch": 0.42026909952439045, + "grad_norm": 3.456585168838501, + "learning_rate": 4.299551500792683e-05, + "loss": 0.745, + "step": 47540 + }, + { + "epoch": 0.4203575027847027, + "grad_norm": 4.485711574554443, + "learning_rate": 4.2994041620254954e-05, + "loss": 0.7192, + "step": 47550 + }, + { + "epoch": 0.4204459060450149, + "grad_norm": 4.93747615814209, + "learning_rate": 4.299256823258309e-05, + "loss": 0.76, + "step": 47560 + }, + { + "epoch": 0.4205343093053272, + "grad_norm": 1.0447183847427368, + "learning_rate": 4.299109484491122e-05, + "loss": 0.6799, + "step": 47570 + }, + { + "epoch": 0.42062271256563943, + "grad_norm": 1.838577389717102, + "learning_rate": 4.2989621457239346e-05, + "loss": 0.62, + "step": 47580 + }, + { + "epoch": 0.42071111582595166, + "grad_norm": 5.7548017501831055, + "learning_rate": 4.2988148069567474e-05, + "loss": 0.6714, + "step": 47590 + }, + { + "epoch": 0.4207995190862639, + "grad_norm": 5.040812015533447, + "learning_rate": 4.298667468189561e-05, + "loss": 0.5932, + "step": 47600 + }, + { + "epoch": 0.4208879223465761, + "grad_norm": 2.7236719131469727, + "learning_rate": 4.298520129422373e-05, + "loss": 0.7645, + "step": 47610 + }, + { + "epoch": 0.42097632560688836, + "grad_norm": 8.309236526489258, + "learning_rate": 4.2983727906551866e-05, + "loss": 0.6699, + "step": 47620 + }, + { + "epoch": 0.42106472886720064, + "grad_norm": 9.064888000488281, + "learning_rate": 4.2982254518879994e-05, + "loss": 0.7382, + "step": 47630 + }, + { + "epoch": 0.4211531321275129, + "grad_norm": 9.858525276184082, + "learning_rate": 4.298078113120812e-05, + "loss": 0.7338, + "step": 47640 + }, + { + "epoch": 0.4212415353878251, + "grad_norm": 6.124359607696533, + "learning_rate": 4.297930774353625e-05, + "loss": 0.6258, + "step": 47650 + }, + { + "epoch": 0.42132993864813734, + "grad_norm": 2.7527408599853516, + "learning_rate": 4.297783435586438e-05, + "loss": 0.8399, + "step": 47660 + }, + { + "epoch": 0.42141834190844957, + "grad_norm": 3.949218988418579, + "learning_rate": 4.297636096819251e-05, + "loss": 0.6427, + "step": 47670 + }, + { + "epoch": 0.4215067451687618, + "grad_norm": 3.412393569946289, + "learning_rate": 4.297488758052064e-05, + "loss": 0.6616, + "step": 47680 + }, + { + "epoch": 0.4215951484290741, + "grad_norm": 5.252253532409668, + "learning_rate": 4.2973414192848764e-05, + "loss": 0.6492, + "step": 47690 + }, + { + "epoch": 0.4216835516893863, + "grad_norm": 8.8892240524292, + "learning_rate": 4.29719408051769e-05, + "loss": 0.8107, + "step": 47700 + }, + { + "epoch": 0.42177195494969855, + "grad_norm": 1.4994401931762695, + "learning_rate": 4.297046741750503e-05, + "loss": 0.6338, + "step": 47710 + }, + { + "epoch": 0.4218603582100108, + "grad_norm": 2.5555267333984375, + "learning_rate": 4.2968994029833156e-05, + "loss": 0.7034, + "step": 47720 + }, + { + "epoch": 0.421948761470323, + "grad_norm": 7.5677714347839355, + "learning_rate": 4.2967520642161284e-05, + "loss": 0.7261, + "step": 47730 + }, + { + "epoch": 0.42203716473063524, + "grad_norm": 4.397103309631348, + "learning_rate": 4.296604725448942e-05, + "loss": 0.766, + "step": 47740 + }, + { + "epoch": 0.4221255679909475, + "grad_norm": 1.9925589561462402, + "learning_rate": 4.296457386681754e-05, + "loss": 0.6313, + "step": 47750 + }, + { + "epoch": 0.42221397125125976, + "grad_norm": 6.518177032470703, + "learning_rate": 4.2963100479145676e-05, + "loss": 0.8554, + "step": 47760 + }, + { + "epoch": 0.422302374511572, + "grad_norm": 3.5906131267547607, + "learning_rate": 4.2961627091473805e-05, + "loss": 0.7362, + "step": 47770 + }, + { + "epoch": 0.4223907777718842, + "grad_norm": 3.3934285640716553, + "learning_rate": 4.296015370380193e-05, + "loss": 0.7243, + "step": 47780 + }, + { + "epoch": 0.42247918103219645, + "grad_norm": 5.146400451660156, + "learning_rate": 4.295868031613006e-05, + "loss": 0.7424, + "step": 47790 + }, + { + "epoch": 0.4225675842925087, + "grad_norm": 14.254595756530762, + "learning_rate": 4.295720692845819e-05, + "loss": 0.7237, + "step": 47800 + }, + { + "epoch": 0.42265598755282097, + "grad_norm": 4.460474967956543, + "learning_rate": 4.295573354078632e-05, + "loss": 0.7145, + "step": 47810 + }, + { + "epoch": 0.4227443908131332, + "grad_norm": 2.277794599533081, + "learning_rate": 4.295426015311445e-05, + "loss": 0.632, + "step": 47820 + }, + { + "epoch": 0.42283279407344543, + "grad_norm": 3.8273043632507324, + "learning_rate": 4.2952786765442575e-05, + "loss": 0.5973, + "step": 47830 + }, + { + "epoch": 0.42292119733375766, + "grad_norm": 2.9968783855438232, + "learning_rate": 4.295131337777071e-05, + "loss": 0.7608, + "step": 47840 + }, + { + "epoch": 0.4230096005940699, + "grad_norm": 5.234260082244873, + "learning_rate": 4.294983999009884e-05, + "loss": 0.6281, + "step": 47850 + }, + { + "epoch": 0.4230980038543822, + "grad_norm": 13.368101119995117, + "learning_rate": 4.2948366602426967e-05, + "loss": 0.7162, + "step": 47860 + }, + { + "epoch": 0.4231864071146944, + "grad_norm": 1.5968952178955078, + "learning_rate": 4.2946893214755095e-05, + "loss": 0.6554, + "step": 47870 + }, + { + "epoch": 0.42327481037500664, + "grad_norm": 6.714796543121338, + "learning_rate": 4.294541982708323e-05, + "loss": 0.7419, + "step": 47880 + }, + { + "epoch": 0.42336321363531887, + "grad_norm": 6.458313941955566, + "learning_rate": 4.294394643941135e-05, + "loss": 0.6574, + "step": 47890 + }, + { + "epoch": 0.4234516168956311, + "grad_norm": 6.0132951736450195, + "learning_rate": 4.294247305173949e-05, + "loss": 0.8116, + "step": 47900 + }, + { + "epoch": 0.42354002015594333, + "grad_norm": 2.5821726322174072, + "learning_rate": 4.294099966406761e-05, + "loss": 0.6987, + "step": 47910 + }, + { + "epoch": 0.4236284234162556, + "grad_norm": 1.2777179479599, + "learning_rate": 4.2939526276395743e-05, + "loss": 0.6982, + "step": 47920 + }, + { + "epoch": 0.42371682667656785, + "grad_norm": 6.209251403808594, + "learning_rate": 4.293805288872387e-05, + "loss": 0.7179, + "step": 47930 + }, + { + "epoch": 0.4238052299368801, + "grad_norm": 2.6391115188598633, + "learning_rate": 4.2936579501052e-05, + "loss": 0.6967, + "step": 47940 + }, + { + "epoch": 0.4238936331971923, + "grad_norm": 5.176142692565918, + "learning_rate": 4.293510611338013e-05, + "loss": 0.7014, + "step": 47950 + }, + { + "epoch": 0.42398203645750454, + "grad_norm": 3.1526174545288086, + "learning_rate": 4.2933632725708264e-05, + "loss": 0.7503, + "step": 47960 + }, + { + "epoch": 0.42407043971781677, + "grad_norm": 3.8150854110717773, + "learning_rate": 4.2932159338036385e-05, + "loss": 0.6631, + "step": 47970 + }, + { + "epoch": 0.42415884297812906, + "grad_norm": 8.35955810546875, + "learning_rate": 4.293068595036452e-05, + "loss": 0.6546, + "step": 47980 + }, + { + "epoch": 0.4242472462384413, + "grad_norm": 3.098579168319702, + "learning_rate": 4.292921256269265e-05, + "loss": 0.7479, + "step": 47990 + }, + { + "epoch": 0.4243356494987535, + "grad_norm": 1.8033745288848877, + "learning_rate": 4.292773917502078e-05, + "loss": 0.6314, + "step": 48000 + }, + { + "epoch": 0.42442405275906575, + "grad_norm": 3.5809051990509033, + "learning_rate": 4.2926265787348905e-05, + "loss": 0.9129, + "step": 48010 + }, + { + "epoch": 0.424512456019378, + "grad_norm": 1.3962634801864624, + "learning_rate": 4.2924792399677034e-05, + "loss": 0.7015, + "step": 48020 + }, + { + "epoch": 0.4246008592796902, + "grad_norm": 1.5739784240722656, + "learning_rate": 4.292331901200516e-05, + "loss": 0.6883, + "step": 48030 + }, + { + "epoch": 0.4246892625400025, + "grad_norm": 5.076678276062012, + "learning_rate": 4.29218456243333e-05, + "loss": 0.6829, + "step": 48040 + }, + { + "epoch": 0.42477766580031473, + "grad_norm": 2.7974698543548584, + "learning_rate": 4.292037223666142e-05, + "loss": 0.8116, + "step": 48050 + }, + { + "epoch": 0.42486606906062696, + "grad_norm": 2.0378050804138184, + "learning_rate": 4.2918898848989554e-05, + "loss": 0.6363, + "step": 48060 + }, + { + "epoch": 0.4249544723209392, + "grad_norm": 9.311989784240723, + "learning_rate": 4.291742546131768e-05, + "loss": 0.799, + "step": 48070 + }, + { + "epoch": 0.4250428755812514, + "grad_norm": 3.0429904460906982, + "learning_rate": 4.291595207364581e-05, + "loss": 0.7563, + "step": 48080 + }, + { + "epoch": 0.42513127884156365, + "grad_norm": 1.8256853818893433, + "learning_rate": 4.291447868597394e-05, + "loss": 0.7063, + "step": 48090 + }, + { + "epoch": 0.42521968210187594, + "grad_norm": 3.6194562911987305, + "learning_rate": 4.2913005298302074e-05, + "loss": 0.6743, + "step": 48100 + }, + { + "epoch": 0.42530808536218817, + "grad_norm": 1.738796353340149, + "learning_rate": 4.2911531910630196e-05, + "loss": 0.665, + "step": 48110 + }, + { + "epoch": 0.4253964886225004, + "grad_norm": 3.6646478176116943, + "learning_rate": 4.291005852295833e-05, + "loss": 0.6546, + "step": 48120 + }, + { + "epoch": 0.42548489188281263, + "grad_norm": 2.307434320449829, + "learning_rate": 4.290858513528645e-05, + "loss": 0.6931, + "step": 48130 + }, + { + "epoch": 0.42557329514312486, + "grad_norm": 2.7822699546813965, + "learning_rate": 4.290711174761459e-05, + "loss": 0.6119, + "step": 48140 + }, + { + "epoch": 0.4256616984034371, + "grad_norm": 6.795778751373291, + "learning_rate": 4.2905638359942716e-05, + "loss": 0.8332, + "step": 48150 + }, + { + "epoch": 0.4257501016637494, + "grad_norm": 2.296041488647461, + "learning_rate": 4.2904164972270844e-05, + "loss": 0.7512, + "step": 48160 + }, + { + "epoch": 0.4258385049240616, + "grad_norm": 2.6243736743927, + "learning_rate": 4.290269158459897e-05, + "loss": 0.7573, + "step": 48170 + }, + { + "epoch": 0.42592690818437384, + "grad_norm": 2.2302157878875732, + "learning_rate": 4.290121819692711e-05, + "loss": 0.7542, + "step": 48180 + }, + { + "epoch": 0.4260153114446861, + "grad_norm": 3.0792062282562256, + "learning_rate": 4.289974480925523e-05, + "loss": 0.722, + "step": 48190 + }, + { + "epoch": 0.4261037147049983, + "grad_norm": 3.389698028564453, + "learning_rate": 4.2898271421583364e-05, + "loss": 0.746, + "step": 48200 + }, + { + "epoch": 0.42619211796531054, + "grad_norm": 1.883493185043335, + "learning_rate": 4.289679803391149e-05, + "loss": 0.6322, + "step": 48210 + }, + { + "epoch": 0.4262805212256228, + "grad_norm": 7.4424943923950195, + "learning_rate": 4.289532464623962e-05, + "loss": 0.7688, + "step": 48220 + }, + { + "epoch": 0.42636892448593505, + "grad_norm": 5.436248779296875, + "learning_rate": 4.289385125856775e-05, + "loss": 0.7501, + "step": 48230 + }, + { + "epoch": 0.4264573277462473, + "grad_norm": 3.845896005630493, + "learning_rate": 4.2892377870895885e-05, + "loss": 0.5885, + "step": 48240 + }, + { + "epoch": 0.4265457310065595, + "grad_norm": 3.046862840652466, + "learning_rate": 4.2890904483224006e-05, + "loss": 0.782, + "step": 48250 + }, + { + "epoch": 0.42663413426687175, + "grad_norm": 2.5822560787200928, + "learning_rate": 4.288943109555214e-05, + "loss": 0.646, + "step": 48260 + }, + { + "epoch": 0.426722537527184, + "grad_norm": 11.160552024841309, + "learning_rate": 4.288795770788027e-05, + "loss": 0.707, + "step": 48270 + }, + { + "epoch": 0.42681094078749626, + "grad_norm": 2.8353912830352783, + "learning_rate": 4.28864843202084e-05, + "loss": 0.6083, + "step": 48280 + }, + { + "epoch": 0.4268993440478085, + "grad_norm": 10.617039680480957, + "learning_rate": 4.2885010932536526e-05, + "loss": 0.7785, + "step": 48290 + }, + { + "epoch": 0.4269877473081207, + "grad_norm": 5.462192058563232, + "learning_rate": 4.2883537544864655e-05, + "loss": 0.8402, + "step": 48300 + }, + { + "epoch": 0.42707615056843296, + "grad_norm": 2.9955265522003174, + "learning_rate": 4.288206415719278e-05, + "loss": 0.728, + "step": 48310 + }, + { + "epoch": 0.4271645538287452, + "grad_norm": 2.647054672241211, + "learning_rate": 4.288059076952092e-05, + "loss": 0.7728, + "step": 48320 + }, + { + "epoch": 0.4272529570890574, + "grad_norm": 5.712986946105957, + "learning_rate": 4.2879117381849047e-05, + "loss": 0.7458, + "step": 48330 + }, + { + "epoch": 0.4273413603493697, + "grad_norm": 3.097459316253662, + "learning_rate": 4.2877643994177175e-05, + "loss": 0.6847, + "step": 48340 + }, + { + "epoch": 0.42742976360968193, + "grad_norm": 4.371756076812744, + "learning_rate": 4.28761706065053e-05, + "loss": 0.6855, + "step": 48350 + }, + { + "epoch": 0.42751816686999417, + "grad_norm": 1.550958514213562, + "learning_rate": 4.287469721883343e-05, + "loss": 0.5489, + "step": 48360 + }, + { + "epoch": 0.4276065701303064, + "grad_norm": 1.1235427856445312, + "learning_rate": 4.287322383116156e-05, + "loss": 0.7475, + "step": 48370 + }, + { + "epoch": 0.4276949733906186, + "grad_norm": 4.641275882720947, + "learning_rate": 4.287175044348969e-05, + "loss": 0.7154, + "step": 48380 + }, + { + "epoch": 0.4277833766509309, + "grad_norm": 3.2773590087890625, + "learning_rate": 4.2870277055817823e-05, + "loss": 0.7551, + "step": 48390 + }, + { + "epoch": 0.42787177991124314, + "grad_norm": 1.6866613626480103, + "learning_rate": 4.286880366814595e-05, + "loss": 0.771, + "step": 48400 + }, + { + "epoch": 0.4279601831715554, + "grad_norm": 10.191542625427246, + "learning_rate": 4.286733028047408e-05, + "loss": 0.7619, + "step": 48410 + }, + { + "epoch": 0.4280485864318676, + "grad_norm": 3.14107084274292, + "learning_rate": 4.286585689280221e-05, + "loss": 0.6854, + "step": 48420 + }, + { + "epoch": 0.42813698969217984, + "grad_norm": 2.8374431133270264, + "learning_rate": 4.286438350513034e-05, + "loss": 0.6736, + "step": 48430 + }, + { + "epoch": 0.42822539295249207, + "grad_norm": 11.783573150634766, + "learning_rate": 4.2862910117458465e-05, + "loss": 0.7313, + "step": 48440 + }, + { + "epoch": 0.42831379621280435, + "grad_norm": 2.8745875358581543, + "learning_rate": 4.28614367297866e-05, + "loss": 0.7187, + "step": 48450 + }, + { + "epoch": 0.4284021994731166, + "grad_norm": 10.306466102600098, + "learning_rate": 4.285996334211473e-05, + "loss": 0.7406, + "step": 48460 + }, + { + "epoch": 0.4284906027334288, + "grad_norm": 4.1658735275268555, + "learning_rate": 4.285848995444286e-05, + "loss": 0.7009, + "step": 48470 + }, + { + "epoch": 0.42857900599374105, + "grad_norm": 4.120835781097412, + "learning_rate": 4.2857016566770985e-05, + "loss": 0.7448, + "step": 48480 + }, + { + "epoch": 0.4286674092540533, + "grad_norm": 1.7996022701263428, + "learning_rate": 4.2855543179099114e-05, + "loss": 0.6167, + "step": 48490 + }, + { + "epoch": 0.4287558125143655, + "grad_norm": 14.082418441772461, + "learning_rate": 4.285406979142724e-05, + "loss": 0.7737, + "step": 48500 + }, + { + "epoch": 0.4288442157746778, + "grad_norm": 3.8586723804473877, + "learning_rate": 4.285259640375538e-05, + "loss": 0.6981, + "step": 48510 + }, + { + "epoch": 0.42893261903499, + "grad_norm": 2.0205917358398438, + "learning_rate": 4.28511230160835e-05, + "loss": 0.6336, + "step": 48520 + }, + { + "epoch": 0.42902102229530226, + "grad_norm": 2.4731831550598145, + "learning_rate": 4.2849649628411634e-05, + "loss": 0.7425, + "step": 48530 + }, + { + "epoch": 0.4291094255556145, + "grad_norm": 7.64461088180542, + "learning_rate": 4.284817624073976e-05, + "loss": 0.8316, + "step": 48540 + }, + { + "epoch": 0.4291978288159267, + "grad_norm": 6.518263816833496, + "learning_rate": 4.284670285306789e-05, + "loss": 0.7612, + "step": 48550 + }, + { + "epoch": 0.42928623207623895, + "grad_norm": 3.602928638458252, + "learning_rate": 4.284522946539602e-05, + "loss": 0.7646, + "step": 48560 + }, + { + "epoch": 0.42937463533655124, + "grad_norm": 1.4430686235427856, + "learning_rate": 4.2843756077724154e-05, + "loss": 0.6019, + "step": 48570 + }, + { + "epoch": 0.42946303859686347, + "grad_norm": 2.056321144104004, + "learning_rate": 4.2842282690052276e-05, + "loss": 0.734, + "step": 48580 + }, + { + "epoch": 0.4295514418571757, + "grad_norm": 8.859973907470703, + "learning_rate": 4.284080930238041e-05, + "loss": 0.5719, + "step": 48590 + }, + { + "epoch": 0.42963984511748793, + "grad_norm": 1.0789034366607666, + "learning_rate": 4.283933591470853e-05, + "loss": 0.8037, + "step": 48600 + }, + { + "epoch": 0.42972824837780016, + "grad_norm": 9.667101860046387, + "learning_rate": 4.283786252703667e-05, + "loss": 0.6936, + "step": 48610 + }, + { + "epoch": 0.4298166516381124, + "grad_norm": 4.853716850280762, + "learning_rate": 4.2836389139364796e-05, + "loss": 0.8355, + "step": 48620 + }, + { + "epoch": 0.4299050548984247, + "grad_norm": 2.299448251724243, + "learning_rate": 4.2834915751692924e-05, + "loss": 0.7775, + "step": 48630 + }, + { + "epoch": 0.4299934581587369, + "grad_norm": 5.701653003692627, + "learning_rate": 4.283344236402105e-05, + "loss": 0.6251, + "step": 48640 + }, + { + "epoch": 0.43008186141904914, + "grad_norm": 4.0857834815979, + "learning_rate": 4.283196897634919e-05, + "loss": 0.6859, + "step": 48650 + }, + { + "epoch": 0.43017026467936137, + "grad_norm": 2.6716785430908203, + "learning_rate": 4.283049558867731e-05, + "loss": 0.7045, + "step": 48660 + }, + { + "epoch": 0.4302586679396736, + "grad_norm": 3.9186277389526367, + "learning_rate": 4.2829022201005445e-05, + "loss": 0.6712, + "step": 48670 + }, + { + "epoch": 0.43034707119998583, + "grad_norm": 1.6243484020233154, + "learning_rate": 4.282754881333357e-05, + "loss": 0.7508, + "step": 48680 + }, + { + "epoch": 0.4304354744602981, + "grad_norm": 2.530704975128174, + "learning_rate": 4.28260754256617e-05, + "loss": 0.7329, + "step": 48690 + }, + { + "epoch": 0.43052387772061035, + "grad_norm": 4.32342529296875, + "learning_rate": 4.282460203798983e-05, + "loss": 0.8491, + "step": 48700 + }, + { + "epoch": 0.4306122809809226, + "grad_norm": 1.8311306238174438, + "learning_rate": 4.2823128650317965e-05, + "loss": 0.7729, + "step": 48710 + }, + { + "epoch": 0.4307006842412348, + "grad_norm": 2.167034149169922, + "learning_rate": 4.2821655262646086e-05, + "loss": 0.764, + "step": 48720 + }, + { + "epoch": 0.43078908750154704, + "grad_norm": 4.249022006988525, + "learning_rate": 4.282018187497422e-05, + "loss": 0.7243, + "step": 48730 + }, + { + "epoch": 0.43087749076185927, + "grad_norm": 2.4285356998443604, + "learning_rate": 4.281870848730234e-05, + "loss": 0.8625, + "step": 48740 + }, + { + "epoch": 0.43096589402217156, + "grad_norm": 2.9705419540405273, + "learning_rate": 4.281723509963048e-05, + "loss": 0.6557, + "step": 48750 + }, + { + "epoch": 0.4310542972824838, + "grad_norm": 3.2237141132354736, + "learning_rate": 4.2815761711958606e-05, + "loss": 0.855, + "step": 48760 + }, + { + "epoch": 0.431142700542796, + "grad_norm": 8.296978950500488, + "learning_rate": 4.2814288324286735e-05, + "loss": 0.7049, + "step": 48770 + }, + { + "epoch": 0.43123110380310825, + "grad_norm": 10.160294532775879, + "learning_rate": 4.281281493661486e-05, + "loss": 0.8685, + "step": 48780 + }, + { + "epoch": 0.4313195070634205, + "grad_norm": 2.1390461921691895, + "learning_rate": 4.2811341548943e-05, + "loss": 0.6697, + "step": 48790 + }, + { + "epoch": 0.4314079103237327, + "grad_norm": 5.534269332885742, + "learning_rate": 4.280986816127112e-05, + "loss": 0.6721, + "step": 48800 + }, + { + "epoch": 0.431496313584045, + "grad_norm": 11.164871215820312, + "learning_rate": 4.2808394773599255e-05, + "loss": 0.7006, + "step": 48810 + }, + { + "epoch": 0.43158471684435723, + "grad_norm": 6.385516166687012, + "learning_rate": 4.280692138592738e-05, + "loss": 0.7559, + "step": 48820 + }, + { + "epoch": 0.43167312010466946, + "grad_norm": 7.103679180145264, + "learning_rate": 4.280544799825551e-05, + "loss": 0.5955, + "step": 48830 + }, + { + "epoch": 0.4317615233649817, + "grad_norm": 1.2492018938064575, + "learning_rate": 4.280397461058364e-05, + "loss": 0.4904, + "step": 48840 + }, + { + "epoch": 0.4318499266252939, + "grad_norm": 1.7535746097564697, + "learning_rate": 4.280250122291177e-05, + "loss": 0.7749, + "step": 48850 + }, + { + "epoch": 0.43193832988560615, + "grad_norm": 8.345301628112793, + "learning_rate": 4.28010278352399e-05, + "loss": 0.6524, + "step": 48860 + }, + { + "epoch": 0.43202673314591844, + "grad_norm": 5.098632335662842, + "learning_rate": 4.279955444756803e-05, + "loss": 0.6604, + "step": 48870 + }, + { + "epoch": 0.43211513640623067, + "grad_norm": 11.396991729736328, + "learning_rate": 4.2798081059896153e-05, + "loss": 0.7243, + "step": 48880 + }, + { + "epoch": 0.4322035396665429, + "grad_norm": 3.8040575981140137, + "learning_rate": 4.279660767222429e-05, + "loss": 0.8036, + "step": 48890 + }, + { + "epoch": 0.43229194292685513, + "grad_norm": 7.881913661956787, + "learning_rate": 4.279513428455242e-05, + "loss": 0.6083, + "step": 48900 + }, + { + "epoch": 0.43238034618716736, + "grad_norm": 4.152080059051514, + "learning_rate": 4.2793660896880545e-05, + "loss": 0.8232, + "step": 48910 + }, + { + "epoch": 0.43246874944747965, + "grad_norm": 4.705545902252197, + "learning_rate": 4.2792187509208674e-05, + "loss": 0.8514, + "step": 48920 + }, + { + "epoch": 0.4325571527077919, + "grad_norm": 5.913174152374268, + "learning_rate": 4.279071412153681e-05, + "loss": 0.665, + "step": 48930 + }, + { + "epoch": 0.4326455559681041, + "grad_norm": 4.060830593109131, + "learning_rate": 4.278924073386493e-05, + "loss": 0.7206, + "step": 48940 + }, + { + "epoch": 0.43273395922841634, + "grad_norm": 2.5899369716644287, + "learning_rate": 4.2787767346193066e-05, + "loss": 0.6992, + "step": 48950 + }, + { + "epoch": 0.4328223624887286, + "grad_norm": 5.40007209777832, + "learning_rate": 4.278629395852119e-05, + "loss": 0.6133, + "step": 48960 + }, + { + "epoch": 0.4329107657490408, + "grad_norm": 2.1534299850463867, + "learning_rate": 4.278482057084932e-05, + "loss": 0.7526, + "step": 48970 + }, + { + "epoch": 0.4329991690093531, + "grad_norm": 7.855589389801025, + "learning_rate": 4.278334718317745e-05, + "loss": 0.7828, + "step": 48980 + }, + { + "epoch": 0.4330875722696653, + "grad_norm": 4.2604570388793945, + "learning_rate": 4.278187379550558e-05, + "loss": 0.7934, + "step": 48990 + }, + { + "epoch": 0.43317597552997755, + "grad_norm": 1.6459400653839111, + "learning_rate": 4.278040040783371e-05, + "loss": 0.6258, + "step": 49000 + }, + { + "epoch": 0.4332643787902898, + "grad_norm": 5.904418468475342, + "learning_rate": 4.277892702016184e-05, + "loss": 0.7951, + "step": 49010 + }, + { + "epoch": 0.433352782050602, + "grad_norm": 4.660916328430176, + "learning_rate": 4.2777453632489964e-05, + "loss": 0.7777, + "step": 49020 + }, + { + "epoch": 0.43344118531091425, + "grad_norm": 1.808325171470642, + "learning_rate": 4.27759802448181e-05, + "loss": 0.7035, + "step": 49030 + }, + { + "epoch": 0.43352958857122653, + "grad_norm": 7.488389015197754, + "learning_rate": 4.277450685714623e-05, + "loss": 0.7222, + "step": 49040 + }, + { + "epoch": 0.43361799183153876, + "grad_norm": 2.403414249420166, + "learning_rate": 4.2773033469474356e-05, + "loss": 0.7446, + "step": 49050 + }, + { + "epoch": 0.433706395091851, + "grad_norm": 4.051290512084961, + "learning_rate": 4.2771560081802484e-05, + "loss": 0.6622, + "step": 49060 + }, + { + "epoch": 0.4337947983521632, + "grad_norm": 2.8323144912719727, + "learning_rate": 4.277008669413061e-05, + "loss": 0.6099, + "step": 49070 + }, + { + "epoch": 0.43388320161247546, + "grad_norm": 1.4692400693893433, + "learning_rate": 4.276861330645874e-05, + "loss": 0.6458, + "step": 49080 + }, + { + "epoch": 0.4339716048727877, + "grad_norm": 7.437715530395508, + "learning_rate": 4.2767139918786876e-05, + "loss": 0.7714, + "step": 49090 + }, + { + "epoch": 0.4340600081331, + "grad_norm": 1.789042592048645, + "learning_rate": 4.2765666531115e-05, + "loss": 0.7629, + "step": 49100 + }, + { + "epoch": 0.4341484113934122, + "grad_norm": 9.832332611083984, + "learning_rate": 4.276419314344313e-05, + "loss": 0.8524, + "step": 49110 + }, + { + "epoch": 0.43423681465372443, + "grad_norm": 4.003662109375, + "learning_rate": 4.276271975577126e-05, + "loss": 0.713, + "step": 49120 + }, + { + "epoch": 0.43432521791403667, + "grad_norm": 4.436911106109619, + "learning_rate": 4.276124636809939e-05, + "loss": 0.7281, + "step": 49130 + }, + { + "epoch": 0.4344136211743489, + "grad_norm": 3.5087971687316895, + "learning_rate": 4.275977298042752e-05, + "loss": 0.7506, + "step": 49140 + }, + { + "epoch": 0.4345020244346611, + "grad_norm": 3.611255407333374, + "learning_rate": 4.275829959275565e-05, + "loss": 0.6588, + "step": 49150 + }, + { + "epoch": 0.4345904276949734, + "grad_norm": 8.180458068847656, + "learning_rate": 4.2756826205083775e-05, + "loss": 0.7707, + "step": 49160 + }, + { + "epoch": 0.43467883095528564, + "grad_norm": 6.969361782073975, + "learning_rate": 4.275535281741191e-05, + "loss": 0.7826, + "step": 49170 + }, + { + "epoch": 0.4347672342155979, + "grad_norm": 10.904851913452148, + "learning_rate": 4.275387942974004e-05, + "loss": 0.7563, + "step": 49180 + }, + { + "epoch": 0.4348556374759101, + "grad_norm": 6.551028728485107, + "learning_rate": 4.2752406042068166e-05, + "loss": 0.6823, + "step": 49190 + }, + { + "epoch": 0.43494404073622234, + "grad_norm": 2.8073883056640625, + "learning_rate": 4.2750932654396295e-05, + "loss": 0.7997, + "step": 49200 + }, + { + "epoch": 0.43503244399653457, + "grad_norm": 1.836535930633545, + "learning_rate": 4.274945926672442e-05, + "loss": 0.6028, + "step": 49210 + }, + { + "epoch": 0.43512084725684685, + "grad_norm": 2.6261870861053467, + "learning_rate": 4.274798587905255e-05, + "loss": 0.6354, + "step": 49220 + }, + { + "epoch": 0.4352092505171591, + "grad_norm": 1.8312478065490723, + "learning_rate": 4.2746512491380687e-05, + "loss": 0.7721, + "step": 49230 + }, + { + "epoch": 0.4352976537774713, + "grad_norm": 3.289795160293579, + "learning_rate": 4.2745039103708815e-05, + "loss": 0.7262, + "step": 49240 + }, + { + "epoch": 0.43538605703778355, + "grad_norm": 2.8596582412719727, + "learning_rate": 4.274356571603694e-05, + "loss": 0.6417, + "step": 49250 + }, + { + "epoch": 0.4354744602980958, + "grad_norm": 4.209091663360596, + "learning_rate": 4.274209232836507e-05, + "loss": 0.5637, + "step": 49260 + }, + { + "epoch": 0.435562863558408, + "grad_norm": 3.2064123153686523, + "learning_rate": 4.27406189406932e-05, + "loss": 0.7819, + "step": 49270 + }, + { + "epoch": 0.4356512668187203, + "grad_norm": 3.832395553588867, + "learning_rate": 4.273914555302133e-05, + "loss": 0.8224, + "step": 49280 + }, + { + "epoch": 0.4357396700790325, + "grad_norm": 14.278970718383789, + "learning_rate": 4.2737672165349463e-05, + "loss": 0.8477, + "step": 49290 + }, + { + "epoch": 0.43582807333934476, + "grad_norm": 7.994842052459717, + "learning_rate": 4.273619877767759e-05, + "loss": 0.6582, + "step": 49300 + }, + { + "epoch": 0.435916476599657, + "grad_norm": 3.741403102874756, + "learning_rate": 4.273472539000572e-05, + "loss": 0.6977, + "step": 49310 + }, + { + "epoch": 0.4360048798599692, + "grad_norm": 2.5595083236694336, + "learning_rate": 4.273325200233385e-05, + "loss": 0.7516, + "step": 49320 + }, + { + "epoch": 0.43609328312028145, + "grad_norm": 7.676196575164795, + "learning_rate": 4.273177861466198e-05, + "loss": 0.8734, + "step": 49330 + }, + { + "epoch": 0.43618168638059374, + "grad_norm": 3.580632448196411, + "learning_rate": 4.2730305226990105e-05, + "loss": 0.7638, + "step": 49340 + }, + { + "epoch": 0.43627008964090597, + "grad_norm": 6.204030990600586, + "learning_rate": 4.2728831839318234e-05, + "loss": 0.705, + "step": 49350 + }, + { + "epoch": 0.4363584929012182, + "grad_norm": 2.959484815597534, + "learning_rate": 4.272735845164637e-05, + "loss": 0.7366, + "step": 49360 + }, + { + "epoch": 0.43644689616153043, + "grad_norm": 3.1605892181396484, + "learning_rate": 4.27258850639745e-05, + "loss": 0.7496, + "step": 49370 + }, + { + "epoch": 0.43653529942184266, + "grad_norm": 6.258642196655273, + "learning_rate": 4.2724411676302625e-05, + "loss": 0.6651, + "step": 49380 + }, + { + "epoch": 0.4366237026821549, + "grad_norm": 5.046906471252441, + "learning_rate": 4.2722938288630754e-05, + "loss": 0.8945, + "step": 49390 + }, + { + "epoch": 0.4367121059424672, + "grad_norm": 3.16194224357605, + "learning_rate": 4.272146490095888e-05, + "loss": 0.682, + "step": 49400 + }, + { + "epoch": 0.4368005092027794, + "grad_norm": 3.2964229583740234, + "learning_rate": 4.271999151328701e-05, + "loss": 0.7375, + "step": 49410 + }, + { + "epoch": 0.43688891246309164, + "grad_norm": 2.3401637077331543, + "learning_rate": 4.2718518125615146e-05, + "loss": 0.6115, + "step": 49420 + }, + { + "epoch": 0.43697731572340387, + "grad_norm": 2.230109453201294, + "learning_rate": 4.271704473794327e-05, + "loss": 0.8274, + "step": 49430 + }, + { + "epoch": 0.4370657189837161, + "grad_norm": 6.246984481811523, + "learning_rate": 4.27155713502714e-05, + "loss": 0.6711, + "step": 49440 + }, + { + "epoch": 0.4371541222440284, + "grad_norm": 2.381495475769043, + "learning_rate": 4.271409796259953e-05, + "loss": 0.6404, + "step": 49450 + }, + { + "epoch": 0.4372425255043406, + "grad_norm": 4.55625057220459, + "learning_rate": 4.271262457492766e-05, + "loss": 0.8701, + "step": 49460 + }, + { + "epoch": 0.43733092876465285, + "grad_norm": 1.407307744026184, + "learning_rate": 4.271115118725579e-05, + "loss": 0.78, + "step": 49470 + }, + { + "epoch": 0.4374193320249651, + "grad_norm": 1.2274616956710815, + "learning_rate": 4.270967779958392e-05, + "loss": 0.7828, + "step": 49480 + }, + { + "epoch": 0.4375077352852773, + "grad_norm": 4.4979987144470215, + "learning_rate": 4.2708204411912044e-05, + "loss": 0.7073, + "step": 49490 + }, + { + "epoch": 0.43759613854558954, + "grad_norm": 2.6861391067504883, + "learning_rate": 4.270673102424018e-05, + "loss": 0.7902, + "step": 49500 + }, + { + "epoch": 0.43768454180590183, + "grad_norm": 4.484062194824219, + "learning_rate": 4.270525763656831e-05, + "loss": 0.6514, + "step": 49510 + }, + { + "epoch": 0.43777294506621406, + "grad_norm": 8.268613815307617, + "learning_rate": 4.2703784248896436e-05, + "loss": 0.6924, + "step": 49520 + }, + { + "epoch": 0.4378613483265263, + "grad_norm": 4.025659561157227, + "learning_rate": 4.2702310861224564e-05, + "loss": 0.7731, + "step": 49530 + }, + { + "epoch": 0.4379497515868385, + "grad_norm": 4.77163553237915, + "learning_rate": 4.270083747355269e-05, + "loss": 0.7548, + "step": 49540 + }, + { + "epoch": 0.43803815484715075, + "grad_norm": 4.086894989013672, + "learning_rate": 4.269936408588082e-05, + "loss": 0.7109, + "step": 49550 + }, + { + "epoch": 0.438126558107463, + "grad_norm": 3.181554079055786, + "learning_rate": 4.2697890698208956e-05, + "loss": 0.6558, + "step": 49560 + }, + { + "epoch": 0.43821496136777527, + "grad_norm": 2.358431100845337, + "learning_rate": 4.269641731053708e-05, + "loss": 0.6314, + "step": 49570 + }, + { + "epoch": 0.4383033646280875, + "grad_norm": 2.7088799476623535, + "learning_rate": 4.269494392286521e-05, + "loss": 0.624, + "step": 49580 + }, + { + "epoch": 0.43839176788839973, + "grad_norm": 6.590661525726318, + "learning_rate": 4.269347053519334e-05, + "loss": 0.7938, + "step": 49590 + }, + { + "epoch": 0.43848017114871196, + "grad_norm": 1.3354746103286743, + "learning_rate": 4.269199714752147e-05, + "loss": 0.6071, + "step": 49600 + }, + { + "epoch": 0.4385685744090242, + "grad_norm": 3.4821600914001465, + "learning_rate": 4.26905237598496e-05, + "loss": 0.746, + "step": 49610 + }, + { + "epoch": 0.4386569776693364, + "grad_norm": 2.755751371383667, + "learning_rate": 4.268905037217773e-05, + "loss": 0.6226, + "step": 49620 + }, + { + "epoch": 0.4387453809296487, + "grad_norm": 5.020684242248535, + "learning_rate": 4.2687576984505855e-05, + "loss": 0.8043, + "step": 49630 + }, + { + "epoch": 0.43883378418996094, + "grad_norm": 6.5135674476623535, + "learning_rate": 4.268610359683399e-05, + "loss": 0.7847, + "step": 49640 + }, + { + "epoch": 0.43892218745027317, + "grad_norm": 5.412643909454346, + "learning_rate": 4.268463020916212e-05, + "loss": 0.7045, + "step": 49650 + }, + { + "epoch": 0.4390105907105854, + "grad_norm": 3.9150893688201904, + "learning_rate": 4.2683156821490246e-05, + "loss": 0.7433, + "step": 49660 + }, + { + "epoch": 0.43909899397089763, + "grad_norm": 7.079890251159668, + "learning_rate": 4.2681683433818375e-05, + "loss": 0.6557, + "step": 49670 + }, + { + "epoch": 0.43918739723120986, + "grad_norm": 2.9877278804779053, + "learning_rate": 4.26802100461465e-05, + "loss": 0.7075, + "step": 49680 + }, + { + "epoch": 0.43927580049152215, + "grad_norm": 1.8254339694976807, + "learning_rate": 4.267873665847463e-05, + "loss": 0.609, + "step": 49690 + }, + { + "epoch": 0.4393642037518344, + "grad_norm": 8.803850173950195, + "learning_rate": 4.2677263270802767e-05, + "loss": 0.8693, + "step": 49700 + }, + { + "epoch": 0.4394526070121466, + "grad_norm": 5.286747455596924, + "learning_rate": 4.267578988313089e-05, + "loss": 0.6831, + "step": 49710 + }, + { + "epoch": 0.43954101027245884, + "grad_norm": 4.2131147384643555, + "learning_rate": 4.267431649545902e-05, + "loss": 0.589, + "step": 49720 + }, + { + "epoch": 0.4396294135327711, + "grad_norm": 2.2544007301330566, + "learning_rate": 4.267284310778715e-05, + "loss": 0.6905, + "step": 49730 + }, + { + "epoch": 0.4397178167930833, + "grad_norm": 2.0811386108398438, + "learning_rate": 4.267136972011528e-05, + "loss": 0.722, + "step": 49740 + }, + { + "epoch": 0.4398062200533956, + "grad_norm": 1.8482768535614014, + "learning_rate": 4.266989633244341e-05, + "loss": 0.8377, + "step": 49750 + }, + { + "epoch": 0.4398946233137078, + "grad_norm": 3.3543198108673096, + "learning_rate": 4.2668422944771544e-05, + "loss": 0.7036, + "step": 49760 + }, + { + "epoch": 0.43998302657402005, + "grad_norm": 4.920956134796143, + "learning_rate": 4.2666949557099665e-05, + "loss": 0.7653, + "step": 49770 + }, + { + "epoch": 0.4400714298343323, + "grad_norm": 15.025528907775879, + "learning_rate": 4.26654761694278e-05, + "loss": 0.906, + "step": 49780 + }, + { + "epoch": 0.4401598330946445, + "grad_norm": 6.583856105804443, + "learning_rate": 4.266400278175592e-05, + "loss": 0.6997, + "step": 49790 + }, + { + "epoch": 0.44024823635495675, + "grad_norm": 6.270252227783203, + "learning_rate": 4.266252939408406e-05, + "loss": 0.7197, + "step": 49800 + }, + { + "epoch": 0.44033663961526903, + "grad_norm": 1.6152703762054443, + "learning_rate": 4.2661056006412185e-05, + "loss": 0.6435, + "step": 49810 + }, + { + "epoch": 0.44042504287558126, + "grad_norm": 1.472029209136963, + "learning_rate": 4.2659582618740314e-05, + "loss": 0.6587, + "step": 49820 + }, + { + "epoch": 0.4405134461358935, + "grad_norm": 6.564108371734619, + "learning_rate": 4.265810923106844e-05, + "loss": 0.7238, + "step": 49830 + }, + { + "epoch": 0.4406018493962057, + "grad_norm": 3.803205966949463, + "learning_rate": 4.265663584339658e-05, + "loss": 0.6642, + "step": 49840 + }, + { + "epoch": 0.44069025265651796, + "grad_norm": 2.6269888877868652, + "learning_rate": 4.26551624557247e-05, + "loss": 0.7132, + "step": 49850 + }, + { + "epoch": 0.4407786559168302, + "grad_norm": 4.599565029144287, + "learning_rate": 4.2653689068052834e-05, + "loss": 0.8978, + "step": 49860 + }, + { + "epoch": 0.4408670591771425, + "grad_norm": 8.206945419311523, + "learning_rate": 4.265221568038096e-05, + "loss": 0.7625, + "step": 49870 + }, + { + "epoch": 0.4409554624374547, + "grad_norm": 2.6349050998687744, + "learning_rate": 4.265074229270909e-05, + "loss": 0.7431, + "step": 49880 + }, + { + "epoch": 0.44104386569776693, + "grad_norm": 2.3383517265319824, + "learning_rate": 4.264926890503722e-05, + "loss": 0.6219, + "step": 49890 + }, + { + "epoch": 0.44113226895807917, + "grad_norm": 1.913715124130249, + "learning_rate": 4.264779551736535e-05, + "loss": 0.7326, + "step": 49900 + }, + { + "epoch": 0.4412206722183914, + "grad_norm": 11.836853981018066, + "learning_rate": 4.2646322129693476e-05, + "loss": 0.6723, + "step": 49910 + }, + { + "epoch": 0.4413090754787036, + "grad_norm": 6.696552753448486, + "learning_rate": 4.264484874202161e-05, + "loss": 0.7211, + "step": 49920 + }, + { + "epoch": 0.4413974787390159, + "grad_norm": 2.5942704677581787, + "learning_rate": 4.264337535434973e-05, + "loss": 0.7202, + "step": 49930 + }, + { + "epoch": 0.44148588199932814, + "grad_norm": 5.038695812225342, + "learning_rate": 4.264190196667787e-05, + "loss": 0.7669, + "step": 49940 + }, + { + "epoch": 0.4415742852596404, + "grad_norm": 4.409718036651611, + "learning_rate": 4.2640428579005996e-05, + "loss": 0.7907, + "step": 49950 + }, + { + "epoch": 0.4416626885199526, + "grad_norm": 2.8787426948547363, + "learning_rate": 4.2638955191334124e-05, + "loss": 0.6615, + "step": 49960 + }, + { + "epoch": 0.44175109178026484, + "grad_norm": 2.493668556213379, + "learning_rate": 4.263748180366225e-05, + "loss": 0.7624, + "step": 49970 + }, + { + "epoch": 0.4418394950405771, + "grad_norm": 1.4627968072891235, + "learning_rate": 4.263600841599039e-05, + "loss": 0.6539, + "step": 49980 + }, + { + "epoch": 0.44192789830088935, + "grad_norm": 4.599356174468994, + "learning_rate": 4.263453502831851e-05, + "loss": 0.7396, + "step": 49990 + }, + { + "epoch": 0.4420163015612016, + "grad_norm": 4.267996788024902, + "learning_rate": 4.2633061640646644e-05, + "loss": 0.7359, + "step": 50000 + }, + { + "epoch": 0.4421047048215138, + "grad_norm": 5.884395599365234, + "learning_rate": 4.2631588252974766e-05, + "loss": 0.7275, + "step": 50010 + }, + { + "epoch": 0.44219310808182605, + "grad_norm": 2.460977792739868, + "learning_rate": 4.26301148653029e-05, + "loss": 0.5478, + "step": 50020 + }, + { + "epoch": 0.4422815113421383, + "grad_norm": 5.67397928237915, + "learning_rate": 4.262864147763103e-05, + "loss": 0.6254, + "step": 50030 + }, + { + "epoch": 0.44236991460245056, + "grad_norm": 3.4750771522521973, + "learning_rate": 4.262716808995916e-05, + "loss": 0.7862, + "step": 50040 + }, + { + "epoch": 0.4424583178627628, + "grad_norm": 1.5983625650405884, + "learning_rate": 4.2625694702287286e-05, + "loss": 0.6876, + "step": 50050 + }, + { + "epoch": 0.442546721123075, + "grad_norm": 3.5901355743408203, + "learning_rate": 4.262422131461542e-05, + "loss": 0.7766, + "step": 50060 + }, + { + "epoch": 0.44263512438338726, + "grad_norm": 5.239259719848633, + "learning_rate": 4.262274792694354e-05, + "loss": 0.7626, + "step": 50070 + }, + { + "epoch": 0.4427235276436995, + "grad_norm": 1.8367315530776978, + "learning_rate": 4.262127453927168e-05, + "loss": 0.7592, + "step": 50080 + }, + { + "epoch": 0.4428119309040117, + "grad_norm": 4.611416339874268, + "learning_rate": 4.2619801151599806e-05, + "loss": 0.6571, + "step": 50090 + }, + { + "epoch": 0.442900334164324, + "grad_norm": 5.311549186706543, + "learning_rate": 4.2618327763927935e-05, + "loss": 0.7994, + "step": 50100 + }, + { + "epoch": 0.44298873742463624, + "grad_norm": 7.112986087799072, + "learning_rate": 4.261685437625606e-05, + "loss": 0.6723, + "step": 50110 + }, + { + "epoch": 0.44307714068494847, + "grad_norm": 2.7187156677246094, + "learning_rate": 4.26153809885842e-05, + "loss": 0.7939, + "step": 50120 + }, + { + "epoch": 0.4431655439452607, + "grad_norm": 5.719393730163574, + "learning_rate": 4.261390760091232e-05, + "loss": 0.7802, + "step": 50130 + }, + { + "epoch": 0.44325394720557293, + "grad_norm": 3.5198893547058105, + "learning_rate": 4.2612434213240455e-05, + "loss": 0.636, + "step": 50140 + }, + { + "epoch": 0.44334235046588516, + "grad_norm": 6.148463726043701, + "learning_rate": 4.261096082556858e-05, + "loss": 0.5975, + "step": 50150 + }, + { + "epoch": 0.44343075372619745, + "grad_norm": 1.562666654586792, + "learning_rate": 4.260948743789671e-05, + "loss": 0.7161, + "step": 50160 + }, + { + "epoch": 0.4435191569865097, + "grad_norm": 5.703946113586426, + "learning_rate": 4.260801405022484e-05, + "loss": 0.6523, + "step": 50170 + }, + { + "epoch": 0.4436075602468219, + "grad_norm": 2.8563754558563232, + "learning_rate": 4.260654066255297e-05, + "loss": 0.7661, + "step": 50180 + }, + { + "epoch": 0.44369596350713414, + "grad_norm": 2.9135634899139404, + "learning_rate": 4.2605067274881097e-05, + "loss": 0.8834, + "step": 50190 + }, + { + "epoch": 0.44378436676744637, + "grad_norm": 5.729390621185303, + "learning_rate": 4.260359388720923e-05, + "loss": 0.7242, + "step": 50200 + }, + { + "epoch": 0.4438727700277586, + "grad_norm": 4.321021556854248, + "learning_rate": 4.260212049953736e-05, + "loss": 0.6896, + "step": 50210 + }, + { + "epoch": 0.4439611732880709, + "grad_norm": 4.484278202056885, + "learning_rate": 4.260064711186549e-05, + "loss": 0.6525, + "step": 50220 + }, + { + "epoch": 0.4440495765483831, + "grad_norm": 19.692768096923828, + "learning_rate": 4.259917372419362e-05, + "loss": 0.7327, + "step": 50230 + }, + { + "epoch": 0.44413797980869535, + "grad_norm": 9.12367057800293, + "learning_rate": 4.2597700336521745e-05, + "loss": 0.7455, + "step": 50240 + }, + { + "epoch": 0.4442263830690076, + "grad_norm": 4.884012222290039, + "learning_rate": 4.2596226948849874e-05, + "loss": 0.6688, + "step": 50250 + }, + { + "epoch": 0.4443147863293198, + "grad_norm": 1.6866464614868164, + "learning_rate": 4.2594753561178e-05, + "loss": 0.6921, + "step": 50260 + }, + { + "epoch": 0.44440318958963204, + "grad_norm": 4.648808479309082, + "learning_rate": 4.259328017350614e-05, + "loss": 0.6725, + "step": 50270 + }, + { + "epoch": 0.44449159284994433, + "grad_norm": 6.333119869232178, + "learning_rate": 4.2591806785834265e-05, + "loss": 0.793, + "step": 50280 + }, + { + "epoch": 0.44457999611025656, + "grad_norm": 2.2390098571777344, + "learning_rate": 4.2590333398162394e-05, + "loss": 0.7509, + "step": 50290 + }, + { + "epoch": 0.4446683993705688, + "grad_norm": 8.741256713867188, + "learning_rate": 4.258886001049052e-05, + "loss": 0.7244, + "step": 50300 + }, + { + "epoch": 0.444756802630881, + "grad_norm": 2.243675947189331, + "learning_rate": 4.258738662281865e-05, + "loss": 0.7969, + "step": 50310 + }, + { + "epoch": 0.44484520589119325, + "grad_norm": 3.3647618293762207, + "learning_rate": 4.258591323514678e-05, + "loss": 0.6842, + "step": 50320 + }, + { + "epoch": 0.4449336091515055, + "grad_norm": 4.149233341217041, + "learning_rate": 4.2584439847474914e-05, + "loss": 0.736, + "step": 50330 + }, + { + "epoch": 0.44502201241181777, + "grad_norm": 1.2286020517349243, + "learning_rate": 4.258296645980304e-05, + "loss": 0.7811, + "step": 50340 + }, + { + "epoch": 0.44511041567213, + "grad_norm": 4.393327236175537, + "learning_rate": 4.258149307213117e-05, + "loss": 0.7214, + "step": 50350 + }, + { + "epoch": 0.44519881893244223, + "grad_norm": 7.338047027587891, + "learning_rate": 4.25800196844593e-05, + "loss": 0.7202, + "step": 50360 + }, + { + "epoch": 0.44528722219275446, + "grad_norm": 2.7075812816619873, + "learning_rate": 4.257854629678743e-05, + "loss": 0.6667, + "step": 50370 + }, + { + "epoch": 0.4453756254530667, + "grad_norm": 4.2327656745910645, + "learning_rate": 4.2577072909115556e-05, + "loss": 0.7128, + "step": 50380 + }, + { + "epoch": 0.4454640287133789, + "grad_norm": 2.7099413871765137, + "learning_rate": 4.257559952144369e-05, + "loss": 0.5684, + "step": 50390 + }, + { + "epoch": 0.4455524319736912, + "grad_norm": 3.604851245880127, + "learning_rate": 4.257412613377181e-05, + "loss": 0.6998, + "step": 50400 + }, + { + "epoch": 0.44564083523400344, + "grad_norm": 3.393521785736084, + "learning_rate": 4.257265274609995e-05, + "loss": 0.679, + "step": 50410 + }, + { + "epoch": 0.44572923849431567, + "grad_norm": 3.353628635406494, + "learning_rate": 4.2571179358428076e-05, + "loss": 0.586, + "step": 50420 + }, + { + "epoch": 0.4458176417546279, + "grad_norm": 3.7172305583953857, + "learning_rate": 4.2569705970756204e-05, + "loss": 0.8801, + "step": 50430 + }, + { + "epoch": 0.44590604501494013, + "grad_norm": 1.9572820663452148, + "learning_rate": 4.256823258308433e-05, + "loss": 0.7879, + "step": 50440 + }, + { + "epoch": 0.44599444827525236, + "grad_norm": 3.830937385559082, + "learning_rate": 4.256675919541247e-05, + "loss": 0.6682, + "step": 50450 + }, + { + "epoch": 0.44608285153556465, + "grad_norm": 6.2671074867248535, + "learning_rate": 4.256528580774059e-05, + "loss": 0.6641, + "step": 50460 + }, + { + "epoch": 0.4461712547958769, + "grad_norm": 3.6998579502105713, + "learning_rate": 4.2563812420068724e-05, + "loss": 0.6657, + "step": 50470 + }, + { + "epoch": 0.4462596580561891, + "grad_norm": 2.041537284851074, + "learning_rate": 4.256233903239685e-05, + "loss": 0.629, + "step": 50480 + }, + { + "epoch": 0.44634806131650134, + "grad_norm": 14.65803050994873, + "learning_rate": 4.256086564472498e-05, + "loss": 0.6656, + "step": 50490 + }, + { + "epoch": 0.4464364645768136, + "grad_norm": 7.739551067352295, + "learning_rate": 4.255939225705311e-05, + "loss": 0.8985, + "step": 50500 + }, + { + "epoch": 0.44652486783712586, + "grad_norm": 3.0493173599243164, + "learning_rate": 4.255791886938124e-05, + "loss": 0.7058, + "step": 50510 + }, + { + "epoch": 0.4466132710974381, + "grad_norm": 7.795433044433594, + "learning_rate": 4.2556445481709366e-05, + "loss": 0.7278, + "step": 50520 + }, + { + "epoch": 0.4467016743577503, + "grad_norm": 2.150282859802246, + "learning_rate": 4.25549720940375e-05, + "loss": 0.7359, + "step": 50530 + }, + { + "epoch": 0.44679007761806255, + "grad_norm": 3.0631790161132812, + "learning_rate": 4.255349870636562e-05, + "loss": 0.7116, + "step": 50540 + }, + { + "epoch": 0.4468784808783748, + "grad_norm": 3.001638174057007, + "learning_rate": 4.255202531869376e-05, + "loss": 0.7833, + "step": 50550 + }, + { + "epoch": 0.446966884138687, + "grad_norm": 3.490363121032715, + "learning_rate": 4.2550551931021886e-05, + "loss": 0.7498, + "step": 50560 + }, + { + "epoch": 0.4470552873989993, + "grad_norm": 1.8131797313690186, + "learning_rate": 4.2549078543350015e-05, + "loss": 0.6677, + "step": 50570 + }, + { + "epoch": 0.44714369065931153, + "grad_norm": 2.176320791244507, + "learning_rate": 4.254760515567814e-05, + "loss": 0.7848, + "step": 50580 + }, + { + "epoch": 0.44723209391962376, + "grad_norm": 2.8201351165771484, + "learning_rate": 4.254613176800628e-05, + "loss": 0.6118, + "step": 50590 + }, + { + "epoch": 0.447320497179936, + "grad_norm": 7.577088832855225, + "learning_rate": 4.25446583803344e-05, + "loss": 0.6237, + "step": 50600 + }, + { + "epoch": 0.4474089004402482, + "grad_norm": 4.768428325653076, + "learning_rate": 4.2543184992662535e-05, + "loss": 0.7648, + "step": 50610 + }, + { + "epoch": 0.44749730370056046, + "grad_norm": 3.6399502754211426, + "learning_rate": 4.2541711604990656e-05, + "loss": 0.659, + "step": 50620 + }, + { + "epoch": 0.44758570696087274, + "grad_norm": 4.284818172454834, + "learning_rate": 4.254023821731879e-05, + "loss": 0.8383, + "step": 50630 + }, + { + "epoch": 0.447674110221185, + "grad_norm": 4.566434860229492, + "learning_rate": 4.253876482964692e-05, + "loss": 0.6057, + "step": 50640 + }, + { + "epoch": 0.4477625134814972, + "grad_norm": 5.5485615730285645, + "learning_rate": 4.253729144197505e-05, + "loss": 0.7449, + "step": 50650 + }, + { + "epoch": 0.44785091674180943, + "grad_norm": 1.481044888496399, + "learning_rate": 4.253581805430318e-05, + "loss": 0.7224, + "step": 50660 + }, + { + "epoch": 0.44793932000212167, + "grad_norm": 1.7246774435043335, + "learning_rate": 4.253434466663131e-05, + "loss": 0.608, + "step": 50670 + }, + { + "epoch": 0.4480277232624339, + "grad_norm": 5.615640640258789, + "learning_rate": 4.253287127895943e-05, + "loss": 0.7658, + "step": 50680 + }, + { + "epoch": 0.4481161265227462, + "grad_norm": 5.9075188636779785, + "learning_rate": 4.253139789128757e-05, + "loss": 0.8186, + "step": 50690 + }, + { + "epoch": 0.4482045297830584, + "grad_norm": 4.552731037139893, + "learning_rate": 4.25299245036157e-05, + "loss": 0.773, + "step": 50700 + }, + { + "epoch": 0.44829293304337064, + "grad_norm": 2.067314624786377, + "learning_rate": 4.2528451115943825e-05, + "loss": 0.6254, + "step": 50710 + }, + { + "epoch": 0.4483813363036829, + "grad_norm": 5.75351619720459, + "learning_rate": 4.2526977728271954e-05, + "loss": 0.6519, + "step": 50720 + }, + { + "epoch": 0.4484697395639951, + "grad_norm": 1.9102205038070679, + "learning_rate": 4.252550434060008e-05, + "loss": 0.7194, + "step": 50730 + }, + { + "epoch": 0.44855814282430734, + "grad_norm": 2.0806591510772705, + "learning_rate": 4.252403095292821e-05, + "loss": 0.729, + "step": 50740 + }, + { + "epoch": 0.4486465460846196, + "grad_norm": 2.3463573455810547, + "learning_rate": 4.2522557565256345e-05, + "loss": 0.7261, + "step": 50750 + }, + { + "epoch": 0.44873494934493185, + "grad_norm": 1.6279937028884888, + "learning_rate": 4.252108417758447e-05, + "loss": 0.8222, + "step": 50760 + }, + { + "epoch": 0.4488233526052441, + "grad_norm": 5.521528720855713, + "learning_rate": 4.25196107899126e-05, + "loss": 0.8362, + "step": 50770 + }, + { + "epoch": 0.4489117558655563, + "grad_norm": 3.009153127670288, + "learning_rate": 4.251813740224073e-05, + "loss": 0.7793, + "step": 50780 + }, + { + "epoch": 0.44900015912586855, + "grad_norm": 4.9164652824401855, + "learning_rate": 4.251666401456886e-05, + "loss": 0.7881, + "step": 50790 + }, + { + "epoch": 0.4490885623861808, + "grad_norm": 5.745452880859375, + "learning_rate": 4.251519062689699e-05, + "loss": 0.5978, + "step": 50800 + }, + { + "epoch": 0.44917696564649306, + "grad_norm": 2.97101092338562, + "learning_rate": 4.251371723922512e-05, + "loss": 0.7987, + "step": 50810 + }, + { + "epoch": 0.4492653689068053, + "grad_norm": 3.5567069053649902, + "learning_rate": 4.2512243851553244e-05, + "loss": 0.7784, + "step": 50820 + }, + { + "epoch": 0.4493537721671175, + "grad_norm": 7.108294486999512, + "learning_rate": 4.251077046388138e-05, + "loss": 0.7194, + "step": 50830 + }, + { + "epoch": 0.44944217542742976, + "grad_norm": 6.408024311065674, + "learning_rate": 4.25092970762095e-05, + "loss": 0.7198, + "step": 50840 + }, + { + "epoch": 0.449530578687742, + "grad_norm": 1.4201781749725342, + "learning_rate": 4.2507823688537636e-05, + "loss": 0.7156, + "step": 50850 + }, + { + "epoch": 0.4496189819480542, + "grad_norm": 5.604401111602783, + "learning_rate": 4.2506350300865764e-05, + "loss": 0.713, + "step": 50860 + }, + { + "epoch": 0.4497073852083665, + "grad_norm": 5.523240566253662, + "learning_rate": 4.250487691319389e-05, + "loss": 0.705, + "step": 50870 + }, + { + "epoch": 0.44979578846867874, + "grad_norm": 3.3463923931121826, + "learning_rate": 4.250340352552202e-05, + "loss": 0.7345, + "step": 50880 + }, + { + "epoch": 0.44988419172899097, + "grad_norm": 2.3162336349487305, + "learning_rate": 4.2501930137850156e-05, + "loss": 0.7248, + "step": 50890 + }, + { + "epoch": 0.4499725949893032, + "grad_norm": 1.6102399826049805, + "learning_rate": 4.250045675017828e-05, + "loss": 0.627, + "step": 50900 + }, + { + "epoch": 0.45006099824961543, + "grad_norm": 3.2316250801086426, + "learning_rate": 4.249898336250641e-05, + "loss": 0.8308, + "step": 50910 + }, + { + "epoch": 0.45014940150992766, + "grad_norm": 2.5728564262390137, + "learning_rate": 4.249750997483454e-05, + "loss": 0.8771, + "step": 50920 + }, + { + "epoch": 0.45023780477023995, + "grad_norm": 1.5083261728286743, + "learning_rate": 4.249603658716267e-05, + "loss": 0.71, + "step": 50930 + }, + { + "epoch": 0.4503262080305522, + "grad_norm": 11.411937713623047, + "learning_rate": 4.24945631994908e-05, + "loss": 0.7299, + "step": 50940 + }, + { + "epoch": 0.4504146112908644, + "grad_norm": 4.420965671539307, + "learning_rate": 4.249308981181893e-05, + "loss": 0.7322, + "step": 50950 + }, + { + "epoch": 0.45050301455117664, + "grad_norm": 7.932520389556885, + "learning_rate": 4.2491616424147054e-05, + "loss": 0.7061, + "step": 50960 + }, + { + "epoch": 0.45059141781148887, + "grad_norm": 4.018369197845459, + "learning_rate": 4.249014303647519e-05, + "loss": 0.717, + "step": 50970 + }, + { + "epoch": 0.4506798210718011, + "grad_norm": 4.696685314178467, + "learning_rate": 4.248866964880331e-05, + "loss": 0.7478, + "step": 50980 + }, + { + "epoch": 0.4507682243321134, + "grad_norm": 6.151251792907715, + "learning_rate": 4.2487196261131446e-05, + "loss": 0.7025, + "step": 50990 + }, + { + "epoch": 0.4508566275924256, + "grad_norm": 2.805976152420044, + "learning_rate": 4.2485722873459575e-05, + "loss": 0.6407, + "step": 51000 + }, + { + "epoch": 0.45094503085273785, + "grad_norm": 11.724607467651367, + "learning_rate": 4.24842494857877e-05, + "loss": 0.766, + "step": 51010 + }, + { + "epoch": 0.4510334341130501, + "grad_norm": 3.093700885772705, + "learning_rate": 4.248277609811583e-05, + "loss": 0.6129, + "step": 51020 + }, + { + "epoch": 0.4511218373733623, + "grad_norm": 6.869553565979004, + "learning_rate": 4.2481302710443966e-05, + "loss": 0.7046, + "step": 51030 + }, + { + "epoch": 0.4512102406336746, + "grad_norm": 7.355849266052246, + "learning_rate": 4.247982932277209e-05, + "loss": 0.7978, + "step": 51040 + }, + { + "epoch": 0.45129864389398683, + "grad_norm": 3.561415195465088, + "learning_rate": 4.247835593510022e-05, + "loss": 0.6356, + "step": 51050 + }, + { + "epoch": 0.45138704715429906, + "grad_norm": 4.891472816467285, + "learning_rate": 4.247688254742835e-05, + "loss": 0.6888, + "step": 51060 + }, + { + "epoch": 0.4514754504146113, + "grad_norm": 4.600146770477295, + "learning_rate": 4.247540915975648e-05, + "loss": 0.6876, + "step": 51070 + }, + { + "epoch": 0.4515638536749235, + "grad_norm": 3.3943684101104736, + "learning_rate": 4.247393577208461e-05, + "loss": 0.6932, + "step": 51080 + }, + { + "epoch": 0.45165225693523575, + "grad_norm": 6.3659586906433105, + "learning_rate": 4.2472462384412737e-05, + "loss": 0.8293, + "step": 51090 + }, + { + "epoch": 0.45174066019554804, + "grad_norm": 2.781177043914795, + "learning_rate": 4.2470988996740865e-05, + "loss": 0.8964, + "step": 51100 + }, + { + "epoch": 0.45182906345586027, + "grad_norm": 2.336111068725586, + "learning_rate": 4.2469515609069e-05, + "loss": 0.8381, + "step": 51110 + }, + { + "epoch": 0.4519174667161725, + "grad_norm": 4.049881458282471, + "learning_rate": 4.246804222139713e-05, + "loss": 0.7464, + "step": 51120 + }, + { + "epoch": 0.45200586997648473, + "grad_norm": 3.050784111022949, + "learning_rate": 4.246656883372526e-05, + "loss": 0.7435, + "step": 51130 + }, + { + "epoch": 0.45209427323679696, + "grad_norm": 1.7821450233459473, + "learning_rate": 4.2465095446053385e-05, + "loss": 0.6286, + "step": 51140 + }, + { + "epoch": 0.4521826764971092, + "grad_norm": 1.96043860912323, + "learning_rate": 4.2463622058381513e-05, + "loss": 0.832, + "step": 51150 + }, + { + "epoch": 0.4522710797574215, + "grad_norm": 4.979556083679199, + "learning_rate": 4.246214867070964e-05, + "loss": 0.5888, + "step": 51160 + }, + { + "epoch": 0.4523594830177337, + "grad_norm": 4.7774858474731445, + "learning_rate": 4.246067528303778e-05, + "loss": 0.7634, + "step": 51170 + }, + { + "epoch": 0.45244788627804594, + "grad_norm": 7.052032470703125, + "learning_rate": 4.2459201895365905e-05, + "loss": 0.7601, + "step": 51180 + }, + { + "epoch": 0.45253628953835817, + "grad_norm": 2.726879119873047, + "learning_rate": 4.2457728507694034e-05, + "loss": 0.826, + "step": 51190 + }, + { + "epoch": 0.4526246927986704, + "grad_norm": 5.2360758781433105, + "learning_rate": 4.245625512002216e-05, + "loss": 0.5117, + "step": 51200 + }, + { + "epoch": 0.45271309605898263, + "grad_norm": 2.565112829208374, + "learning_rate": 4.245478173235029e-05, + "loss": 0.7631, + "step": 51210 + }, + { + "epoch": 0.4528014993192949, + "grad_norm": 1.7049453258514404, + "learning_rate": 4.245330834467842e-05, + "loss": 0.6309, + "step": 51220 + }, + { + "epoch": 0.45288990257960715, + "grad_norm": 4.346341609954834, + "learning_rate": 4.245183495700655e-05, + "loss": 0.8687, + "step": 51230 + }, + { + "epoch": 0.4529783058399194, + "grad_norm": 7.231688022613525, + "learning_rate": 4.245036156933468e-05, + "loss": 0.6993, + "step": 51240 + }, + { + "epoch": 0.4530667091002316, + "grad_norm": 5.763049602508545, + "learning_rate": 4.244888818166281e-05, + "loss": 0.7153, + "step": 51250 + }, + { + "epoch": 0.45315511236054384, + "grad_norm": 2.694974899291992, + "learning_rate": 4.244741479399094e-05, + "loss": 0.6416, + "step": 51260 + }, + { + "epoch": 0.4532435156208561, + "grad_norm": 11.770453453063965, + "learning_rate": 4.244594140631907e-05, + "loss": 0.7204, + "step": 51270 + }, + { + "epoch": 0.45333191888116836, + "grad_norm": 1.7721461057662964, + "learning_rate": 4.2444468018647196e-05, + "loss": 0.6159, + "step": 51280 + }, + { + "epoch": 0.4534203221414806, + "grad_norm": 1.4127252101898193, + "learning_rate": 4.2442994630975324e-05, + "loss": 0.6948, + "step": 51290 + }, + { + "epoch": 0.4535087254017928, + "grad_norm": 3.8951640129089355, + "learning_rate": 4.244152124330346e-05, + "loss": 0.711, + "step": 51300 + }, + { + "epoch": 0.45359712866210505, + "grad_norm": 4.719071388244629, + "learning_rate": 4.244004785563158e-05, + "loss": 0.6654, + "step": 51310 + }, + { + "epoch": 0.4536855319224173, + "grad_norm": 1.3236104249954224, + "learning_rate": 4.2438574467959716e-05, + "loss": 0.7087, + "step": 51320 + }, + { + "epoch": 0.4537739351827295, + "grad_norm": 3.448472499847412, + "learning_rate": 4.2437101080287844e-05, + "loss": 0.6766, + "step": 51330 + }, + { + "epoch": 0.4538623384430418, + "grad_norm": 1.7239394187927246, + "learning_rate": 4.243562769261597e-05, + "loss": 0.572, + "step": 51340 + }, + { + "epoch": 0.45395074170335403, + "grad_norm": 9.879199028015137, + "learning_rate": 4.24341543049441e-05, + "loss": 0.7762, + "step": 51350 + }, + { + "epoch": 0.45403914496366626, + "grad_norm": 9.45833683013916, + "learning_rate": 4.2432680917272236e-05, + "loss": 0.7796, + "step": 51360 + }, + { + "epoch": 0.4541275482239785, + "grad_norm": 1.6749019622802734, + "learning_rate": 4.243120752960036e-05, + "loss": 0.7972, + "step": 51370 + }, + { + "epoch": 0.4542159514842907, + "grad_norm": 1.6194690465927124, + "learning_rate": 4.242973414192849e-05, + "loss": 0.8425, + "step": 51380 + }, + { + "epoch": 0.45430435474460296, + "grad_norm": 3.211587905883789, + "learning_rate": 4.242826075425662e-05, + "loss": 0.6692, + "step": 51390 + }, + { + "epoch": 0.45439275800491524, + "grad_norm": 4.525156497955322, + "learning_rate": 4.242678736658475e-05, + "loss": 0.7135, + "step": 51400 + }, + { + "epoch": 0.4544811612652275, + "grad_norm": 1.4770921468734741, + "learning_rate": 4.242531397891288e-05, + "loss": 0.6965, + "step": 51410 + }, + { + "epoch": 0.4545695645255397, + "grad_norm": 6.5885009765625, + "learning_rate": 4.242384059124101e-05, + "loss": 0.7756, + "step": 51420 + }, + { + "epoch": 0.45465796778585194, + "grad_norm": 8.741536140441895, + "learning_rate": 4.2422367203569134e-05, + "loss": 0.7028, + "step": 51430 + }, + { + "epoch": 0.45474637104616417, + "grad_norm": 2.4928131103515625, + "learning_rate": 4.242089381589727e-05, + "loss": 0.7567, + "step": 51440 + }, + { + "epoch": 0.4548347743064764, + "grad_norm": 4.189772605895996, + "learning_rate": 4.241942042822539e-05, + "loss": 0.5896, + "step": 51450 + }, + { + "epoch": 0.4549231775667887, + "grad_norm": 5.977318286895752, + "learning_rate": 4.2417947040553526e-05, + "loss": 0.6195, + "step": 51460 + }, + { + "epoch": 0.4550115808271009, + "grad_norm": 8.101325035095215, + "learning_rate": 4.2416473652881655e-05, + "loss": 0.7467, + "step": 51470 + }, + { + "epoch": 0.45509998408741315, + "grad_norm": 5.809628486633301, + "learning_rate": 4.241500026520978e-05, + "loss": 0.6599, + "step": 51480 + }, + { + "epoch": 0.4551883873477254, + "grad_norm": 6.189295768737793, + "learning_rate": 4.241352687753791e-05, + "loss": 0.5845, + "step": 51490 + }, + { + "epoch": 0.4552767906080376, + "grad_norm": 1.7385181188583374, + "learning_rate": 4.2412053489866047e-05, + "loss": 0.5603, + "step": 51500 + }, + { + "epoch": 0.45536519386834984, + "grad_norm": 3.193965196609497, + "learning_rate": 4.241058010219417e-05, + "loss": 0.6597, + "step": 51510 + }, + { + "epoch": 0.4554535971286621, + "grad_norm": 4.2179107666015625, + "learning_rate": 4.24091067145223e-05, + "loss": 0.7087, + "step": 51520 + }, + { + "epoch": 0.45554200038897436, + "grad_norm": 2.7894127368927, + "learning_rate": 4.240763332685043e-05, + "loss": 0.6678, + "step": 51530 + }, + { + "epoch": 0.4556304036492866, + "grad_norm": 5.392573356628418, + "learning_rate": 4.240615993917856e-05, + "loss": 0.5749, + "step": 51540 + }, + { + "epoch": 0.4557188069095988, + "grad_norm": 2.8715810775756836, + "learning_rate": 4.240468655150669e-05, + "loss": 0.7528, + "step": 51550 + }, + { + "epoch": 0.45580721016991105, + "grad_norm": 7.369666576385498, + "learning_rate": 4.240321316383482e-05, + "loss": 0.6643, + "step": 51560 + }, + { + "epoch": 0.45589561343022333, + "grad_norm": 2.1460392475128174, + "learning_rate": 4.2401739776162945e-05, + "loss": 0.6124, + "step": 51570 + }, + { + "epoch": 0.45598401669053557, + "grad_norm": 2.804109811782837, + "learning_rate": 4.240026638849108e-05, + "loss": 0.7637, + "step": 51580 + }, + { + "epoch": 0.4560724199508478, + "grad_norm": 5.18781042098999, + "learning_rate": 4.23987930008192e-05, + "loss": 0.7274, + "step": 51590 + }, + { + "epoch": 0.45616082321116, + "grad_norm": 7.195793151855469, + "learning_rate": 4.239731961314734e-05, + "loss": 0.7772, + "step": 51600 + }, + { + "epoch": 0.45624922647147226, + "grad_norm": 10.898725509643555, + "learning_rate": 4.2395846225475465e-05, + "loss": 0.699, + "step": 51610 + }, + { + "epoch": 0.4563376297317845, + "grad_norm": 3.8482308387756348, + "learning_rate": 4.2394372837803594e-05, + "loss": 0.6364, + "step": 51620 + }, + { + "epoch": 0.4564260329920968, + "grad_norm": 4.435975074768066, + "learning_rate": 4.239289945013172e-05, + "loss": 0.7792, + "step": 51630 + }, + { + "epoch": 0.456514436252409, + "grad_norm": 10.318117141723633, + "learning_rate": 4.239142606245986e-05, + "loss": 0.7525, + "step": 51640 + }, + { + "epoch": 0.45660283951272124, + "grad_norm": 1.6074044704437256, + "learning_rate": 4.238995267478798e-05, + "loss": 0.7726, + "step": 51650 + }, + { + "epoch": 0.45669124277303347, + "grad_norm": 4.462329864501953, + "learning_rate": 4.2388479287116114e-05, + "loss": 0.489, + "step": 51660 + }, + { + "epoch": 0.4567796460333457, + "grad_norm": 3.9491384029388428, + "learning_rate": 4.2387005899444235e-05, + "loss": 0.6887, + "step": 51670 + }, + { + "epoch": 0.45686804929365793, + "grad_norm": 5.480148792266846, + "learning_rate": 4.238553251177237e-05, + "loss": 0.7327, + "step": 51680 + }, + { + "epoch": 0.4569564525539702, + "grad_norm": 9.441926002502441, + "learning_rate": 4.23840591241005e-05, + "loss": 0.7712, + "step": 51690 + }, + { + "epoch": 0.45704485581428245, + "grad_norm": 7.183199882507324, + "learning_rate": 4.238258573642863e-05, + "loss": 0.7392, + "step": 51700 + }, + { + "epoch": 0.4571332590745947, + "grad_norm": 7.0118560791015625, + "learning_rate": 4.2381112348756755e-05, + "loss": 0.7721, + "step": 51710 + }, + { + "epoch": 0.4572216623349069, + "grad_norm": 6.411294460296631, + "learning_rate": 4.237963896108489e-05, + "loss": 0.6004, + "step": 51720 + }, + { + "epoch": 0.45731006559521914, + "grad_norm": 11.087834358215332, + "learning_rate": 4.237816557341301e-05, + "loss": 0.7065, + "step": 51730 + }, + { + "epoch": 0.45739846885553137, + "grad_norm": 3.378361701965332, + "learning_rate": 4.237669218574115e-05, + "loss": 0.8234, + "step": 51740 + }, + { + "epoch": 0.45748687211584366, + "grad_norm": 6.5096893310546875, + "learning_rate": 4.2375218798069276e-05, + "loss": 0.5817, + "step": 51750 + }, + { + "epoch": 0.4575752753761559, + "grad_norm": 2.6013710498809814, + "learning_rate": 4.2373745410397404e-05, + "loss": 0.7469, + "step": 51760 + }, + { + "epoch": 0.4576636786364681, + "grad_norm": 6.895487308502197, + "learning_rate": 4.237227202272553e-05, + "loss": 0.7929, + "step": 51770 + }, + { + "epoch": 0.45775208189678035, + "grad_norm": 7.680325031280518, + "learning_rate": 4.237079863505366e-05, + "loss": 0.7848, + "step": 51780 + }, + { + "epoch": 0.4578404851570926, + "grad_norm": 2.104198455810547, + "learning_rate": 4.236932524738179e-05, + "loss": 0.7355, + "step": 51790 + }, + { + "epoch": 0.4579288884174048, + "grad_norm": 2.9252047538757324, + "learning_rate": 4.2367851859709924e-05, + "loss": 0.7322, + "step": 51800 + }, + { + "epoch": 0.4580172916777171, + "grad_norm": 3.3038887977600098, + "learning_rate": 4.2366378472038046e-05, + "loss": 0.6402, + "step": 51810 + }, + { + "epoch": 0.45810569493802933, + "grad_norm": 3.781679153442383, + "learning_rate": 4.236490508436618e-05, + "loss": 0.6917, + "step": 51820 + }, + { + "epoch": 0.45819409819834156, + "grad_norm": 4.439880847930908, + "learning_rate": 4.236343169669431e-05, + "loss": 0.562, + "step": 51830 + }, + { + "epoch": 0.4582825014586538, + "grad_norm": 3.3872768878936768, + "learning_rate": 4.236195830902244e-05, + "loss": 0.8094, + "step": 51840 + }, + { + "epoch": 0.458370904718966, + "grad_norm": 5.398439884185791, + "learning_rate": 4.2360484921350566e-05, + "loss": 0.8208, + "step": 51850 + }, + { + "epoch": 0.45845930797927825, + "grad_norm": 4.339428901672363, + "learning_rate": 4.23590115336787e-05, + "loss": 0.6078, + "step": 51860 + }, + { + "epoch": 0.45854771123959054, + "grad_norm": 1.3413759469985962, + "learning_rate": 4.235753814600682e-05, + "loss": 0.789, + "step": 51870 + }, + { + "epoch": 0.45863611449990277, + "grad_norm": 4.0492353439331055, + "learning_rate": 4.235606475833496e-05, + "loss": 0.6432, + "step": 51880 + }, + { + "epoch": 0.458724517760215, + "grad_norm": 1.8701279163360596, + "learning_rate": 4.2354591370663086e-05, + "loss": 0.6345, + "step": 51890 + }, + { + "epoch": 0.45881292102052723, + "grad_norm": 4.309274196624756, + "learning_rate": 4.2353117982991215e-05, + "loss": 0.5966, + "step": 51900 + }, + { + "epoch": 0.45890132428083946, + "grad_norm": 2.160752058029175, + "learning_rate": 4.235164459531934e-05, + "loss": 0.6225, + "step": 51910 + }, + { + "epoch": 0.4589897275411517, + "grad_norm": 2.036487340927124, + "learning_rate": 4.235017120764747e-05, + "loss": 0.6836, + "step": 51920 + }, + { + "epoch": 0.459078130801464, + "grad_norm": 13.578177452087402, + "learning_rate": 4.23486978199756e-05, + "loss": 0.6748, + "step": 51930 + }, + { + "epoch": 0.4591665340617762, + "grad_norm": 3.629070281982422, + "learning_rate": 4.2347224432303735e-05, + "loss": 0.8608, + "step": 51940 + }, + { + "epoch": 0.45925493732208844, + "grad_norm": 4.116369247436523, + "learning_rate": 4.2345751044631856e-05, + "loss": 0.7195, + "step": 51950 + }, + { + "epoch": 0.45934334058240067, + "grad_norm": 6.127593994140625, + "learning_rate": 4.234427765695999e-05, + "loss": 0.7316, + "step": 51960 + }, + { + "epoch": 0.4594317438427129, + "grad_norm": 2.790940284729004, + "learning_rate": 4.234280426928812e-05, + "loss": 0.7933, + "step": 51970 + }, + { + "epoch": 0.45952014710302513, + "grad_norm": 1.463698387145996, + "learning_rate": 4.234133088161625e-05, + "loss": 0.5768, + "step": 51980 + }, + { + "epoch": 0.4596085503633374, + "grad_norm": 3.5501725673675537, + "learning_rate": 4.2339857493944377e-05, + "loss": 0.743, + "step": 51990 + }, + { + "epoch": 0.45969695362364965, + "grad_norm": 1.4623552560806274, + "learning_rate": 4.233838410627251e-05, + "loss": 0.6944, + "step": 52000 + }, + { + "epoch": 0.4597853568839619, + "grad_norm": 5.345694541931152, + "learning_rate": 4.233691071860063e-05, + "loss": 0.7163, + "step": 52010 + }, + { + "epoch": 0.4598737601442741, + "grad_norm": 1.3032469749450684, + "learning_rate": 4.233543733092877e-05, + "loss": 0.6934, + "step": 52020 + }, + { + "epoch": 0.45996216340458634, + "grad_norm": 2.792205810546875, + "learning_rate": 4.23339639432569e-05, + "loss": 0.7318, + "step": 52030 + }, + { + "epoch": 0.4600505666648986, + "grad_norm": 5.106879711151123, + "learning_rate": 4.2332490555585025e-05, + "loss": 0.7482, + "step": 52040 + }, + { + "epoch": 0.46013896992521086, + "grad_norm": 3.861652374267578, + "learning_rate": 4.2331017167913153e-05, + "loss": 0.7403, + "step": 52050 + }, + { + "epoch": 0.4602273731855231, + "grad_norm": 2.3954854011535645, + "learning_rate": 4.232954378024128e-05, + "loss": 0.6757, + "step": 52060 + }, + { + "epoch": 0.4603157764458353, + "grad_norm": 7.476187229156494, + "learning_rate": 4.232807039256941e-05, + "loss": 0.8295, + "step": 52070 + }, + { + "epoch": 0.46040417970614755, + "grad_norm": 3.2276484966278076, + "learning_rate": 4.2326597004897545e-05, + "loss": 0.7112, + "step": 52080 + }, + { + "epoch": 0.4604925829664598, + "grad_norm": 5.918450832366943, + "learning_rate": 4.2325123617225674e-05, + "loss": 0.6645, + "step": 52090 + }, + { + "epoch": 0.460580986226772, + "grad_norm": 1.5007085800170898, + "learning_rate": 4.23236502295538e-05, + "loss": 0.7499, + "step": 52100 + }, + { + "epoch": 0.4606693894870843, + "grad_norm": 2.5571868419647217, + "learning_rate": 4.232217684188193e-05, + "loss": 0.7393, + "step": 52110 + }, + { + "epoch": 0.46075779274739653, + "grad_norm": 4.453342437744141, + "learning_rate": 4.232070345421006e-05, + "loss": 0.7919, + "step": 52120 + }, + { + "epoch": 0.46084619600770876, + "grad_norm": 8.468208312988281, + "learning_rate": 4.231923006653819e-05, + "loss": 0.6137, + "step": 52130 + }, + { + "epoch": 0.460934599268021, + "grad_norm": 5.198253631591797, + "learning_rate": 4.2317756678866315e-05, + "loss": 0.7684, + "step": 52140 + }, + { + "epoch": 0.4610230025283332, + "grad_norm": 2.5067381858825684, + "learning_rate": 4.231628329119445e-05, + "loss": 0.7395, + "step": 52150 + }, + { + "epoch": 0.4611114057886455, + "grad_norm": 5.629721164703369, + "learning_rate": 4.231480990352258e-05, + "loss": 0.7291, + "step": 52160 + }, + { + "epoch": 0.46119980904895774, + "grad_norm": 4.729606628417969, + "learning_rate": 4.231333651585071e-05, + "loss": 0.8573, + "step": 52170 + }, + { + "epoch": 0.46128821230927, + "grad_norm": 7.049135208129883, + "learning_rate": 4.2311863128178836e-05, + "loss": 0.6238, + "step": 52180 + }, + { + "epoch": 0.4613766155695822, + "grad_norm": 5.8184919357299805, + "learning_rate": 4.2310389740506964e-05, + "loss": 0.7103, + "step": 52190 + }, + { + "epoch": 0.46146501882989444, + "grad_norm": 5.012911796569824, + "learning_rate": 4.230891635283509e-05, + "loss": 0.7557, + "step": 52200 + }, + { + "epoch": 0.46155342209020667, + "grad_norm": 4.654698371887207, + "learning_rate": 4.230744296516323e-05, + "loss": 0.6179, + "step": 52210 + }, + { + "epoch": 0.46164182535051895, + "grad_norm": 4.618826866149902, + "learning_rate": 4.2305969577491356e-05, + "loss": 0.6809, + "step": 52220 + }, + { + "epoch": 0.4617302286108312, + "grad_norm": 3.6892995834350586, + "learning_rate": 4.2304496189819484e-05, + "loss": 0.8625, + "step": 52230 + }, + { + "epoch": 0.4618186318711434, + "grad_norm": 2.1181583404541016, + "learning_rate": 4.230302280214761e-05, + "loss": 0.5665, + "step": 52240 + }, + { + "epoch": 0.46190703513145565, + "grad_norm": 2.672295331954956, + "learning_rate": 4.230154941447574e-05, + "loss": 0.6711, + "step": 52250 + }, + { + "epoch": 0.4619954383917679, + "grad_norm": 7.382059097290039, + "learning_rate": 4.230007602680387e-05, + "loss": 0.5662, + "step": 52260 + }, + { + "epoch": 0.4620838416520801, + "grad_norm": 9.90565013885498, + "learning_rate": 4.2298602639132004e-05, + "loss": 0.777, + "step": 52270 + }, + { + "epoch": 0.4621722449123924, + "grad_norm": 3.0468902587890625, + "learning_rate": 4.2297129251460126e-05, + "loss": 0.7485, + "step": 52280 + }, + { + "epoch": 0.4622606481727046, + "grad_norm": 3.749941349029541, + "learning_rate": 4.229565586378826e-05, + "loss": 0.6977, + "step": 52290 + }, + { + "epoch": 0.46234905143301686, + "grad_norm": 4.086340427398682, + "learning_rate": 4.229418247611639e-05, + "loss": 0.6526, + "step": 52300 + }, + { + "epoch": 0.4624374546933291, + "grad_norm": 10.430846214294434, + "learning_rate": 4.229270908844452e-05, + "loss": 0.6436, + "step": 52310 + }, + { + "epoch": 0.4625258579536413, + "grad_norm": 4.599187850952148, + "learning_rate": 4.2291235700772646e-05, + "loss": 0.6184, + "step": 52320 + }, + { + "epoch": 0.46261426121395355, + "grad_norm": 2.2521350383758545, + "learning_rate": 4.228976231310078e-05, + "loss": 0.6449, + "step": 52330 + }, + { + "epoch": 0.46270266447426583, + "grad_norm": 1.5472252368927002, + "learning_rate": 4.22882889254289e-05, + "loss": 0.7082, + "step": 52340 + }, + { + "epoch": 0.46279106773457807, + "grad_norm": 3.4148683547973633, + "learning_rate": 4.228681553775704e-05, + "loss": 0.5674, + "step": 52350 + }, + { + "epoch": 0.4628794709948903, + "grad_norm": 4.581430435180664, + "learning_rate": 4.2285342150085166e-05, + "loss": 0.8915, + "step": 52360 + }, + { + "epoch": 0.4629678742552025, + "grad_norm": 5.565468788146973, + "learning_rate": 4.2283868762413295e-05, + "loss": 0.7555, + "step": 52370 + }, + { + "epoch": 0.46305627751551476, + "grad_norm": 8.734930992126465, + "learning_rate": 4.228239537474142e-05, + "loss": 0.6926, + "step": 52380 + }, + { + "epoch": 0.463144680775827, + "grad_norm": 2.026643753051758, + "learning_rate": 4.228092198706955e-05, + "loss": 0.6251, + "step": 52390 + }, + { + "epoch": 0.4632330840361393, + "grad_norm": 1.6456331014633179, + "learning_rate": 4.227944859939768e-05, + "loss": 0.7649, + "step": 52400 + }, + { + "epoch": 0.4633214872964515, + "grad_norm": 1.775565266609192, + "learning_rate": 4.2277975211725815e-05, + "loss": 0.6088, + "step": 52410 + }, + { + "epoch": 0.46340989055676374, + "grad_norm": 4.751766681671143, + "learning_rate": 4.2276501824053936e-05, + "loss": 0.693, + "step": 52420 + }, + { + "epoch": 0.46349829381707597, + "grad_norm": 5.910129070281982, + "learning_rate": 4.227502843638207e-05, + "loss": 0.7025, + "step": 52430 + }, + { + "epoch": 0.4635866970773882, + "grad_norm": 3.6207194328308105, + "learning_rate": 4.22735550487102e-05, + "loss": 0.7307, + "step": 52440 + }, + { + "epoch": 0.46367510033770043, + "grad_norm": 6.6016693115234375, + "learning_rate": 4.227208166103833e-05, + "loss": 0.8024, + "step": 52450 + }, + { + "epoch": 0.4637635035980127, + "grad_norm": 2.4947381019592285, + "learning_rate": 4.2270608273366457e-05, + "loss": 0.6709, + "step": 52460 + }, + { + "epoch": 0.46385190685832495, + "grad_norm": 2.1537387371063232, + "learning_rate": 4.226913488569459e-05, + "loss": 0.6732, + "step": 52470 + }, + { + "epoch": 0.4639403101186372, + "grad_norm": 2.8036246299743652, + "learning_rate": 4.226766149802271e-05, + "loss": 0.7085, + "step": 52480 + }, + { + "epoch": 0.4640287133789494, + "grad_norm": 2.016580820083618, + "learning_rate": 4.226618811035085e-05, + "loss": 0.7783, + "step": 52490 + }, + { + "epoch": 0.46411711663926164, + "grad_norm": 3.0787274837493896, + "learning_rate": 4.226471472267897e-05, + "loss": 0.5591, + "step": 52500 + }, + { + "epoch": 0.46420551989957387, + "grad_norm": 7.491862773895264, + "learning_rate": 4.2263241335007105e-05, + "loss": 0.7915, + "step": 52510 + }, + { + "epoch": 0.46429392315988616, + "grad_norm": 5.583034038543701, + "learning_rate": 4.2261767947335233e-05, + "loss": 0.6445, + "step": 52520 + }, + { + "epoch": 0.4643823264201984, + "grad_norm": 3.946359157562256, + "learning_rate": 4.226029455966336e-05, + "loss": 0.7772, + "step": 52530 + }, + { + "epoch": 0.4644707296805106, + "grad_norm": 2.98714280128479, + "learning_rate": 4.225882117199149e-05, + "loss": 0.7471, + "step": 52540 + }, + { + "epoch": 0.46455913294082285, + "grad_norm": 3.2688980102539062, + "learning_rate": 4.2257347784319625e-05, + "loss": 0.6658, + "step": 52550 + }, + { + "epoch": 0.4646475362011351, + "grad_norm": 3.5141897201538086, + "learning_rate": 4.225587439664775e-05, + "loss": 0.6256, + "step": 52560 + }, + { + "epoch": 0.4647359394614473, + "grad_norm": 5.148055076599121, + "learning_rate": 4.225440100897588e-05, + "loss": 0.7596, + "step": 52570 + }, + { + "epoch": 0.4648243427217596, + "grad_norm": 9.163445472717285, + "learning_rate": 4.225292762130401e-05, + "loss": 0.774, + "step": 52580 + }, + { + "epoch": 0.46491274598207183, + "grad_norm": 2.8391289710998535, + "learning_rate": 4.225145423363214e-05, + "loss": 0.7007, + "step": 52590 + }, + { + "epoch": 0.46500114924238406, + "grad_norm": 6.29801607131958, + "learning_rate": 4.224998084596027e-05, + "loss": 0.8114, + "step": 52600 + }, + { + "epoch": 0.4650895525026963, + "grad_norm": 4.505367279052734, + "learning_rate": 4.2248507458288395e-05, + "loss": 0.7236, + "step": 52610 + }, + { + "epoch": 0.4651779557630085, + "grad_norm": 11.121551513671875, + "learning_rate": 4.2247034070616524e-05, + "loss": 0.7362, + "step": 52620 + }, + { + "epoch": 0.46526635902332075, + "grad_norm": 8.888157844543457, + "learning_rate": 4.224556068294466e-05, + "loss": 0.7404, + "step": 52630 + }, + { + "epoch": 0.46535476228363304, + "grad_norm": 18.693090438842773, + "learning_rate": 4.224408729527278e-05, + "loss": 0.5955, + "step": 52640 + }, + { + "epoch": 0.46544316554394527, + "grad_norm": 5.19010066986084, + "learning_rate": 4.2242613907600916e-05, + "loss": 0.762, + "step": 52650 + }, + { + "epoch": 0.4655315688042575, + "grad_norm": 5.069072246551514, + "learning_rate": 4.2241140519929044e-05, + "loss": 0.6206, + "step": 52660 + }, + { + "epoch": 0.46561997206456973, + "grad_norm": 9.630207061767578, + "learning_rate": 4.223966713225717e-05, + "loss": 0.796, + "step": 52670 + }, + { + "epoch": 0.46570837532488196, + "grad_norm": 3.386852741241455, + "learning_rate": 4.22381937445853e-05, + "loss": 0.6616, + "step": 52680 + }, + { + "epoch": 0.46579677858519425, + "grad_norm": 0.8200322389602661, + "learning_rate": 4.2236720356913436e-05, + "loss": 0.6339, + "step": 52690 + }, + { + "epoch": 0.4658851818455065, + "grad_norm": 1.5718613862991333, + "learning_rate": 4.223524696924156e-05, + "loss": 0.7416, + "step": 52700 + }, + { + "epoch": 0.4659735851058187, + "grad_norm": 3.913846731185913, + "learning_rate": 4.223377358156969e-05, + "loss": 0.8131, + "step": 52710 + }, + { + "epoch": 0.46606198836613094, + "grad_norm": 10.412713050842285, + "learning_rate": 4.2232300193897814e-05, + "loss": 0.5501, + "step": 52720 + }, + { + "epoch": 0.4661503916264432, + "grad_norm": 1.6883413791656494, + "learning_rate": 4.223082680622595e-05, + "loss": 0.7556, + "step": 52730 + }, + { + "epoch": 0.4662387948867554, + "grad_norm": 1.4152474403381348, + "learning_rate": 4.222935341855408e-05, + "loss": 0.7298, + "step": 52740 + }, + { + "epoch": 0.4663271981470677, + "grad_norm": 5.408758640289307, + "learning_rate": 4.2227880030882206e-05, + "loss": 0.5726, + "step": 52750 + }, + { + "epoch": 0.4664156014073799, + "grad_norm": 1.2836005687713623, + "learning_rate": 4.2226406643210334e-05, + "loss": 0.6989, + "step": 52760 + }, + { + "epoch": 0.46650400466769215, + "grad_norm": 11.31423282623291, + "learning_rate": 4.222493325553847e-05, + "loss": 0.6883, + "step": 52770 + }, + { + "epoch": 0.4665924079280044, + "grad_norm": 14.369865417480469, + "learning_rate": 4.222345986786659e-05, + "loss": 0.6813, + "step": 52780 + }, + { + "epoch": 0.4666808111883166, + "grad_norm": 6.9002790451049805, + "learning_rate": 4.2221986480194726e-05, + "loss": 0.73, + "step": 52790 + }, + { + "epoch": 0.46676921444862884, + "grad_norm": 2.44985294342041, + "learning_rate": 4.2220513092522854e-05, + "loss": 0.698, + "step": 52800 + }, + { + "epoch": 0.46685761770894113, + "grad_norm": 6.439610004425049, + "learning_rate": 4.221903970485098e-05, + "loss": 0.6888, + "step": 52810 + }, + { + "epoch": 0.46694602096925336, + "grad_norm": 7.309405326843262, + "learning_rate": 4.221756631717911e-05, + "loss": 0.7419, + "step": 52820 + }, + { + "epoch": 0.4670344242295656, + "grad_norm": 4.0311455726623535, + "learning_rate": 4.2216092929507246e-05, + "loss": 0.6956, + "step": 52830 + }, + { + "epoch": 0.4671228274898778, + "grad_norm": 3.848418951034546, + "learning_rate": 4.221461954183537e-05, + "loss": 0.5721, + "step": 52840 + }, + { + "epoch": 0.46721123075019005, + "grad_norm": 2.4307098388671875, + "learning_rate": 4.22131461541635e-05, + "loss": 0.7171, + "step": 52850 + }, + { + "epoch": 0.4672996340105023, + "grad_norm": 2.257612466812134, + "learning_rate": 4.2211672766491625e-05, + "loss": 0.7377, + "step": 52860 + }, + { + "epoch": 0.46738803727081457, + "grad_norm": 8.017738342285156, + "learning_rate": 4.221019937881976e-05, + "loss": 0.7199, + "step": 52870 + }, + { + "epoch": 0.4674764405311268, + "grad_norm": 4.491119861602783, + "learning_rate": 4.220872599114789e-05, + "loss": 0.677, + "step": 52880 + }, + { + "epoch": 0.46756484379143903, + "grad_norm": 7.327272891998291, + "learning_rate": 4.2207252603476016e-05, + "loss": 0.7773, + "step": 52890 + }, + { + "epoch": 0.46765324705175126, + "grad_norm": 6.760767936706543, + "learning_rate": 4.2205779215804145e-05, + "loss": 0.7271, + "step": 52900 + }, + { + "epoch": 0.4677416503120635, + "grad_norm": 5.4441704750061035, + "learning_rate": 4.220430582813228e-05, + "loss": 0.7207, + "step": 52910 + }, + { + "epoch": 0.4678300535723757, + "grad_norm": 4.132839679718018, + "learning_rate": 4.22028324404604e-05, + "loss": 0.7278, + "step": 52920 + }, + { + "epoch": 0.467918456832688, + "grad_norm": 3.256896495819092, + "learning_rate": 4.220135905278854e-05, + "loss": 0.7062, + "step": 52930 + }, + { + "epoch": 0.46800686009300024, + "grad_norm": 3.098531484603882, + "learning_rate": 4.2199885665116665e-05, + "loss": 0.806, + "step": 52940 + }, + { + "epoch": 0.4680952633533125, + "grad_norm": 3.618494749069214, + "learning_rate": 4.219841227744479e-05, + "loss": 0.5924, + "step": 52950 + }, + { + "epoch": 0.4681836666136247, + "grad_norm": 3.786844253540039, + "learning_rate": 4.219693888977292e-05, + "loss": 0.7546, + "step": 52960 + }, + { + "epoch": 0.46827206987393694, + "grad_norm": 4.665619373321533, + "learning_rate": 4.219546550210105e-05, + "loss": 0.8267, + "step": 52970 + }, + { + "epoch": 0.46836047313424917, + "grad_norm": 12.414656639099121, + "learning_rate": 4.219399211442918e-05, + "loss": 0.7019, + "step": 52980 + }, + { + "epoch": 0.46844887639456145, + "grad_norm": 1.855115532875061, + "learning_rate": 4.2192518726757314e-05, + "loss": 0.6623, + "step": 52990 + }, + { + "epoch": 0.4685372796548737, + "grad_norm": 2.5500354766845703, + "learning_rate": 4.219104533908544e-05, + "loss": 0.808, + "step": 53000 + }, + { + "epoch": 0.4686256829151859, + "grad_norm": 3.0885989665985107, + "learning_rate": 4.218957195141357e-05, + "loss": 0.5627, + "step": 53010 + }, + { + "epoch": 0.46871408617549815, + "grad_norm": 4.889555931091309, + "learning_rate": 4.21880985637417e-05, + "loss": 0.6399, + "step": 53020 + }, + { + "epoch": 0.4688024894358104, + "grad_norm": 2.5776076316833496, + "learning_rate": 4.218662517606983e-05, + "loss": 0.5943, + "step": 53030 + }, + { + "epoch": 0.4688908926961226, + "grad_norm": 3.828183174133301, + "learning_rate": 4.2185151788397955e-05, + "loss": 0.7725, + "step": 53040 + }, + { + "epoch": 0.4689792959564349, + "grad_norm": 4.145967960357666, + "learning_rate": 4.218367840072609e-05, + "loss": 0.6897, + "step": 53050 + }, + { + "epoch": 0.4690676992167471, + "grad_norm": 5.40083122253418, + "learning_rate": 4.218220501305422e-05, + "loss": 0.7789, + "step": 53060 + }, + { + "epoch": 0.46915610247705936, + "grad_norm": 8.548382759094238, + "learning_rate": 4.218073162538235e-05, + "loss": 0.6406, + "step": 53070 + }, + { + "epoch": 0.4692445057373716, + "grad_norm": 1.5795224905014038, + "learning_rate": 4.2179258237710476e-05, + "loss": 0.7382, + "step": 53080 + }, + { + "epoch": 0.4693329089976838, + "grad_norm": 4.020104885101318, + "learning_rate": 4.2177784850038604e-05, + "loss": 0.532, + "step": 53090 + }, + { + "epoch": 0.46942131225799605, + "grad_norm": 2.954549551010132, + "learning_rate": 4.217631146236673e-05, + "loss": 0.7156, + "step": 53100 + }, + { + "epoch": 0.46950971551830833, + "grad_norm": 7.192239761352539, + "learning_rate": 4.217483807469486e-05, + "loss": 0.7216, + "step": 53110 + }, + { + "epoch": 0.46959811877862057, + "grad_norm": 5.926204681396484, + "learning_rate": 4.2173364687022996e-05, + "loss": 0.6224, + "step": 53120 + }, + { + "epoch": 0.4696865220389328, + "grad_norm": 5.667525768280029, + "learning_rate": 4.2171891299351124e-05, + "loss": 0.6615, + "step": 53130 + }, + { + "epoch": 0.469774925299245, + "grad_norm": 4.380832672119141, + "learning_rate": 4.217041791167925e-05, + "loss": 0.6975, + "step": 53140 + }, + { + "epoch": 0.46986332855955726, + "grad_norm": 2.1343722343444824, + "learning_rate": 4.216894452400738e-05, + "loss": 0.7124, + "step": 53150 + }, + { + "epoch": 0.4699517318198695, + "grad_norm": 2.3305907249450684, + "learning_rate": 4.216747113633551e-05, + "loss": 0.8327, + "step": 53160 + }, + { + "epoch": 0.4700401350801818, + "grad_norm": 2.136439561843872, + "learning_rate": 4.216599774866364e-05, + "loss": 0.7655, + "step": 53170 + }, + { + "epoch": 0.470128538340494, + "grad_norm": 12.449284553527832, + "learning_rate": 4.216452436099177e-05, + "loss": 0.7124, + "step": 53180 + }, + { + "epoch": 0.47021694160080624, + "grad_norm": 3.001804828643799, + "learning_rate": 4.2163050973319894e-05, + "loss": 0.6815, + "step": 53190 + }, + { + "epoch": 0.47030534486111847, + "grad_norm": 2.120994806289673, + "learning_rate": 4.216157758564803e-05, + "loss": 0.7672, + "step": 53200 + }, + { + "epoch": 0.4703937481214307, + "grad_norm": 3.306929349899292, + "learning_rate": 4.216010419797616e-05, + "loss": 0.7134, + "step": 53210 + }, + { + "epoch": 0.470482151381743, + "grad_norm": 5.747926235198975, + "learning_rate": 4.2158630810304286e-05, + "loss": 0.7064, + "step": 53220 + }, + { + "epoch": 0.4705705546420552, + "grad_norm": 2.683472156524658, + "learning_rate": 4.2157157422632414e-05, + "loss": 0.756, + "step": 53230 + }, + { + "epoch": 0.47065895790236745, + "grad_norm": 2.397841215133667, + "learning_rate": 4.215568403496055e-05, + "loss": 0.7754, + "step": 53240 + }, + { + "epoch": 0.4707473611626797, + "grad_norm": 1.5261938571929932, + "learning_rate": 4.215421064728867e-05, + "loss": 0.6572, + "step": 53250 + }, + { + "epoch": 0.4708357644229919, + "grad_norm": 7.471903324127197, + "learning_rate": 4.2152737259616806e-05, + "loss": 0.8332, + "step": 53260 + }, + { + "epoch": 0.47092416768330414, + "grad_norm": 5.161137104034424, + "learning_rate": 4.2151263871944935e-05, + "loss": 0.6651, + "step": 53270 + }, + { + "epoch": 0.4710125709436164, + "grad_norm": 2.039276599884033, + "learning_rate": 4.214979048427306e-05, + "loss": 0.6569, + "step": 53280 + }, + { + "epoch": 0.47110097420392866, + "grad_norm": 3.896923303604126, + "learning_rate": 4.214831709660119e-05, + "loss": 0.7154, + "step": 53290 + }, + { + "epoch": 0.4711893774642409, + "grad_norm": 6.703306198120117, + "learning_rate": 4.2146843708929326e-05, + "loss": 0.6368, + "step": 53300 + }, + { + "epoch": 0.4712777807245531, + "grad_norm": 1.7361499071121216, + "learning_rate": 4.214537032125745e-05, + "loss": 0.7386, + "step": 53310 + }, + { + "epoch": 0.47136618398486535, + "grad_norm": 4.387570381164551, + "learning_rate": 4.214389693358558e-05, + "loss": 0.7631, + "step": 53320 + }, + { + "epoch": 0.4714545872451776, + "grad_norm": 4.573794364929199, + "learning_rate": 4.2142423545913705e-05, + "loss": 0.7806, + "step": 53330 + }, + { + "epoch": 0.47154299050548987, + "grad_norm": 5.908381462097168, + "learning_rate": 4.214095015824184e-05, + "loss": 0.6827, + "step": 53340 + }, + { + "epoch": 0.4716313937658021, + "grad_norm": 4.799923419952393, + "learning_rate": 4.213947677056997e-05, + "loss": 0.6317, + "step": 53350 + }, + { + "epoch": 0.47171979702611433, + "grad_norm": 4.03240442276001, + "learning_rate": 4.2138003382898097e-05, + "loss": 0.8315, + "step": 53360 + }, + { + "epoch": 0.47180820028642656, + "grad_norm": 1.7955182790756226, + "learning_rate": 4.2136529995226225e-05, + "loss": 0.6812, + "step": 53370 + }, + { + "epoch": 0.4718966035467388, + "grad_norm": 3.1082003116607666, + "learning_rate": 4.213505660755436e-05, + "loss": 0.7699, + "step": 53380 + }, + { + "epoch": 0.471985006807051, + "grad_norm": 7.408374786376953, + "learning_rate": 4.213358321988248e-05, + "loss": 0.709, + "step": 53390 + }, + { + "epoch": 0.4720734100673633, + "grad_norm": 1.9412243366241455, + "learning_rate": 4.213210983221062e-05, + "loss": 0.7306, + "step": 53400 + }, + { + "epoch": 0.47216181332767554, + "grad_norm": 5.350911617279053, + "learning_rate": 4.2130636444538745e-05, + "loss": 0.7458, + "step": 53410 + }, + { + "epoch": 0.47225021658798777, + "grad_norm": 5.449341297149658, + "learning_rate": 4.2129163056866873e-05, + "loss": 0.6542, + "step": 53420 + }, + { + "epoch": 0.4723386198483, + "grad_norm": 8.928043365478516, + "learning_rate": 4.2127689669195e-05, + "loss": 0.72, + "step": 53430 + }, + { + "epoch": 0.47242702310861223, + "grad_norm": 8.926413536071777, + "learning_rate": 4.212621628152313e-05, + "loss": 0.6064, + "step": 53440 + }, + { + "epoch": 0.47251542636892446, + "grad_norm": 4.622246265411377, + "learning_rate": 4.212474289385126e-05, + "loss": 0.6907, + "step": 53450 + }, + { + "epoch": 0.47260382962923675, + "grad_norm": 3.0518133640289307, + "learning_rate": 4.2123269506179394e-05, + "loss": 0.7999, + "step": 53460 + }, + { + "epoch": 0.472692232889549, + "grad_norm": 2.4066314697265625, + "learning_rate": 4.2121796118507515e-05, + "loss": 0.6944, + "step": 53470 + }, + { + "epoch": 0.4727806361498612, + "grad_norm": 2.3513970375061035, + "learning_rate": 4.212032273083565e-05, + "loss": 0.7082, + "step": 53480 + }, + { + "epoch": 0.47286903941017344, + "grad_norm": 2.125847816467285, + "learning_rate": 4.211884934316378e-05, + "loss": 0.6513, + "step": 53490 + }, + { + "epoch": 0.4729574426704857, + "grad_norm": 2.1552071571350098, + "learning_rate": 4.211737595549191e-05, + "loss": 0.5535, + "step": 53500 + }, + { + "epoch": 0.4730458459307979, + "grad_norm": 6.239655017852783, + "learning_rate": 4.2115902567820035e-05, + "loss": 0.8503, + "step": 53510 + }, + { + "epoch": 0.4731342491911102, + "grad_norm": 7.939951419830322, + "learning_rate": 4.211442918014817e-05, + "loss": 0.6466, + "step": 53520 + }, + { + "epoch": 0.4732226524514224, + "grad_norm": 2.3682501316070557, + "learning_rate": 4.211295579247629e-05, + "loss": 0.777, + "step": 53530 + }, + { + "epoch": 0.47331105571173465, + "grad_norm": 7.721220970153809, + "learning_rate": 4.211148240480443e-05, + "loss": 0.718, + "step": 53540 + }, + { + "epoch": 0.4733994589720469, + "grad_norm": 2.304795742034912, + "learning_rate": 4.211000901713255e-05, + "loss": 0.7109, + "step": 53550 + }, + { + "epoch": 0.4734878622323591, + "grad_norm": 5.924191951751709, + "learning_rate": 4.2108535629460684e-05, + "loss": 0.6815, + "step": 53560 + }, + { + "epoch": 0.47357626549267134, + "grad_norm": 4.856740474700928, + "learning_rate": 4.210706224178881e-05, + "loss": 0.7755, + "step": 53570 + }, + { + "epoch": 0.47366466875298363, + "grad_norm": 3.9173684120178223, + "learning_rate": 4.210558885411694e-05, + "loss": 0.7756, + "step": 53580 + }, + { + "epoch": 0.47375307201329586, + "grad_norm": 5.201943397521973, + "learning_rate": 4.210411546644507e-05, + "loss": 0.679, + "step": 53590 + }, + { + "epoch": 0.4738414752736081, + "grad_norm": 3.4333572387695312, + "learning_rate": 4.2102642078773204e-05, + "loss": 0.7432, + "step": 53600 + }, + { + "epoch": 0.4739298785339203, + "grad_norm": 9.132095336914062, + "learning_rate": 4.2101168691101326e-05, + "loss": 0.8044, + "step": 53610 + }, + { + "epoch": 0.47401828179423255, + "grad_norm": 4.284627437591553, + "learning_rate": 4.209969530342946e-05, + "loss": 0.6837, + "step": 53620 + }, + { + "epoch": 0.4741066850545448, + "grad_norm": 2.861417531967163, + "learning_rate": 4.209822191575759e-05, + "loss": 0.7408, + "step": 53630 + }, + { + "epoch": 0.47419508831485707, + "grad_norm": 5.994079113006592, + "learning_rate": 4.209674852808572e-05, + "loss": 0.7608, + "step": 53640 + }, + { + "epoch": 0.4742834915751693, + "grad_norm": 5.555888652801514, + "learning_rate": 4.2095275140413846e-05, + "loss": 0.6456, + "step": 53650 + }, + { + "epoch": 0.47437189483548153, + "grad_norm": 2.0312082767486572, + "learning_rate": 4.209380175274198e-05, + "loss": 0.6923, + "step": 53660 + }, + { + "epoch": 0.47446029809579376, + "grad_norm": 3.7952773571014404, + "learning_rate": 4.20923283650701e-05, + "loss": 0.7644, + "step": 53670 + }, + { + "epoch": 0.474548701356106, + "grad_norm": 4.994013786315918, + "learning_rate": 4.209085497739824e-05, + "loss": 0.7715, + "step": 53680 + }, + { + "epoch": 0.4746371046164182, + "grad_norm": 8.997997283935547, + "learning_rate": 4.208938158972636e-05, + "loss": 0.7227, + "step": 53690 + }, + { + "epoch": 0.4747255078767305, + "grad_norm": 2.1706600189208984, + "learning_rate": 4.2087908202054494e-05, + "loss": 0.6464, + "step": 53700 + }, + { + "epoch": 0.47481391113704274, + "grad_norm": 3.9491937160491943, + "learning_rate": 4.208643481438262e-05, + "loss": 0.7095, + "step": 53710 + }, + { + "epoch": 0.474902314397355, + "grad_norm": 3.4568047523498535, + "learning_rate": 4.208496142671075e-05, + "loss": 0.7505, + "step": 53720 + }, + { + "epoch": 0.4749907176576672, + "grad_norm": 6.685573577880859, + "learning_rate": 4.208348803903888e-05, + "loss": 0.7671, + "step": 53730 + }, + { + "epoch": 0.47507912091797944, + "grad_norm": 5.928294658660889, + "learning_rate": 4.2082014651367015e-05, + "loss": 0.7748, + "step": 53740 + }, + { + "epoch": 0.4751675241782917, + "grad_norm": 2.370215654373169, + "learning_rate": 4.2080541263695136e-05, + "loss": 0.6908, + "step": 53750 + }, + { + "epoch": 0.47525592743860395, + "grad_norm": 4.826587200164795, + "learning_rate": 4.207906787602327e-05, + "loss": 0.6833, + "step": 53760 + }, + { + "epoch": 0.4753443306989162, + "grad_norm": 8.44394302368164, + "learning_rate": 4.20775944883514e-05, + "loss": 0.8003, + "step": 53770 + }, + { + "epoch": 0.4754327339592284, + "grad_norm": 2.7617194652557373, + "learning_rate": 4.207612110067953e-05, + "loss": 0.598, + "step": 53780 + }, + { + "epoch": 0.47552113721954065, + "grad_norm": 1.5828273296356201, + "learning_rate": 4.2074647713007656e-05, + "loss": 0.7267, + "step": 53790 + }, + { + "epoch": 0.4756095404798529, + "grad_norm": 7.319702625274658, + "learning_rate": 4.2073174325335785e-05, + "loss": 0.6068, + "step": 53800 + }, + { + "epoch": 0.47569794374016516, + "grad_norm": 2.0254814624786377, + "learning_rate": 4.207170093766391e-05, + "loss": 0.7966, + "step": 53810 + }, + { + "epoch": 0.4757863470004774, + "grad_norm": 1.9124932289123535, + "learning_rate": 4.207022754999205e-05, + "loss": 0.8703, + "step": 53820 + }, + { + "epoch": 0.4758747502607896, + "grad_norm": 4.167019367218018, + "learning_rate": 4.206875416232017e-05, + "loss": 0.7182, + "step": 53830 + }, + { + "epoch": 0.47596315352110186, + "grad_norm": 0.9499847292900085, + "learning_rate": 4.2067280774648305e-05, + "loss": 0.6645, + "step": 53840 + }, + { + "epoch": 0.4760515567814141, + "grad_norm": 5.8929314613342285, + "learning_rate": 4.206580738697643e-05, + "loss": 0.6525, + "step": 53850 + }, + { + "epoch": 0.4761399600417263, + "grad_norm": 10.652838706970215, + "learning_rate": 4.206433399930456e-05, + "loss": 0.814, + "step": 53860 + }, + { + "epoch": 0.4762283633020386, + "grad_norm": 2.5791962146759033, + "learning_rate": 4.206286061163269e-05, + "loss": 0.6601, + "step": 53870 + }, + { + "epoch": 0.47631676656235084, + "grad_norm": 3.0517194271087646, + "learning_rate": 4.2061387223960825e-05, + "loss": 0.7004, + "step": 53880 + }, + { + "epoch": 0.47640516982266307, + "grad_norm": 2.1429057121276855, + "learning_rate": 4.205991383628895e-05, + "loss": 0.5849, + "step": 53890 + }, + { + "epoch": 0.4764935730829753, + "grad_norm": 1.8528274297714233, + "learning_rate": 4.205844044861708e-05, + "loss": 0.6686, + "step": 53900 + }, + { + "epoch": 0.47658197634328753, + "grad_norm": 7.8360276222229, + "learning_rate": 4.205696706094521e-05, + "loss": 0.6581, + "step": 53910 + }, + { + "epoch": 0.47667037960359976, + "grad_norm": 2.7732481956481934, + "learning_rate": 4.205549367327334e-05, + "loss": 0.6492, + "step": 53920 + }, + { + "epoch": 0.47675878286391205, + "grad_norm": 4.227574825286865, + "learning_rate": 4.205402028560147e-05, + "loss": 0.7264, + "step": 53930 + }, + { + "epoch": 0.4768471861242243, + "grad_norm": 3.157350778579712, + "learning_rate": 4.2052546897929595e-05, + "loss": 0.7311, + "step": 53940 + }, + { + "epoch": 0.4769355893845365, + "grad_norm": 4.121584892272949, + "learning_rate": 4.2051073510257724e-05, + "loss": 0.7377, + "step": 53950 + }, + { + "epoch": 0.47702399264484874, + "grad_norm": 6.046733856201172, + "learning_rate": 4.204960012258586e-05, + "loss": 0.577, + "step": 53960 + }, + { + "epoch": 0.47711239590516097, + "grad_norm": 2.53169322013855, + "learning_rate": 4.204812673491399e-05, + "loss": 0.62, + "step": 53970 + }, + { + "epoch": 0.4772007991654732, + "grad_norm": 13.655868530273438, + "learning_rate": 4.2046653347242115e-05, + "loss": 0.6317, + "step": 53980 + }, + { + "epoch": 0.4772892024257855, + "grad_norm": 1.9250119924545288, + "learning_rate": 4.2045179959570244e-05, + "loss": 0.6965, + "step": 53990 + }, + { + "epoch": 0.4773776056860977, + "grad_norm": 16.838058471679688, + "learning_rate": 4.204370657189837e-05, + "loss": 0.7622, + "step": 54000 + }, + { + "epoch": 0.47746600894640995, + "grad_norm": 2.1734912395477295, + "learning_rate": 4.20422331842265e-05, + "loss": 0.8387, + "step": 54010 + }, + { + "epoch": 0.4775544122067222, + "grad_norm": 8.551667213439941, + "learning_rate": 4.204075979655463e-05, + "loss": 0.6874, + "step": 54020 + }, + { + "epoch": 0.4776428154670344, + "grad_norm": 2.9934494495391846, + "learning_rate": 4.2039286408882764e-05, + "loss": 0.6875, + "step": 54030 + }, + { + "epoch": 0.47773121872734664, + "grad_norm": 4.535610198974609, + "learning_rate": 4.203781302121089e-05, + "loss": 0.6902, + "step": 54040 + }, + { + "epoch": 0.4778196219876589, + "grad_norm": 4.351871490478516, + "learning_rate": 4.203633963353902e-05, + "loss": 0.7331, + "step": 54050 + }, + { + "epoch": 0.47790802524797116, + "grad_norm": 2.035493850708008, + "learning_rate": 4.203486624586715e-05, + "loss": 0.6733, + "step": 54060 + }, + { + "epoch": 0.4779964285082834, + "grad_norm": 4.51971435546875, + "learning_rate": 4.203339285819528e-05, + "loss": 0.6584, + "step": 54070 + }, + { + "epoch": 0.4780848317685956, + "grad_norm": 14.118454933166504, + "learning_rate": 4.2031919470523406e-05, + "loss": 0.8002, + "step": 54080 + }, + { + "epoch": 0.47817323502890785, + "grad_norm": 5.4625091552734375, + "learning_rate": 4.203044608285154e-05, + "loss": 0.7582, + "step": 54090 + }, + { + "epoch": 0.4782616382892201, + "grad_norm": 2.040560245513916, + "learning_rate": 4.202897269517967e-05, + "loss": 0.6083, + "step": 54100 + }, + { + "epoch": 0.47835004154953237, + "grad_norm": 3.6075687408447266, + "learning_rate": 4.20274993075078e-05, + "loss": 0.6612, + "step": 54110 + }, + { + "epoch": 0.4784384448098446, + "grad_norm": 10.534326553344727, + "learning_rate": 4.2026025919835926e-05, + "loss": 0.7524, + "step": 54120 + }, + { + "epoch": 0.47852684807015683, + "grad_norm": 1.964032530784607, + "learning_rate": 4.2024552532164054e-05, + "loss": 0.7115, + "step": 54130 + }, + { + "epoch": 0.47861525133046906, + "grad_norm": 9.679758071899414, + "learning_rate": 4.202307914449218e-05, + "loss": 0.7912, + "step": 54140 + }, + { + "epoch": 0.4787036545907813, + "grad_norm": 3.8983168601989746, + "learning_rate": 4.202160575682032e-05, + "loss": 0.8151, + "step": 54150 + }, + { + "epoch": 0.4787920578510935, + "grad_norm": 1.4353677034378052, + "learning_rate": 4.202013236914844e-05, + "loss": 0.679, + "step": 54160 + }, + { + "epoch": 0.4788804611114058, + "grad_norm": 13.79578971862793, + "learning_rate": 4.2018658981476575e-05, + "loss": 0.6974, + "step": 54170 + }, + { + "epoch": 0.47896886437171804, + "grad_norm": 3.1516776084899902, + "learning_rate": 4.20171855938047e-05, + "loss": 0.6885, + "step": 54180 + }, + { + "epoch": 0.47905726763203027, + "grad_norm": 1.3675308227539062, + "learning_rate": 4.201571220613283e-05, + "loss": 0.7082, + "step": 54190 + }, + { + "epoch": 0.4791456708923425, + "grad_norm": 3.3600339889526367, + "learning_rate": 4.201423881846096e-05, + "loss": 0.8791, + "step": 54200 + }, + { + "epoch": 0.47923407415265473, + "grad_norm": 5.11338472366333, + "learning_rate": 4.2012765430789095e-05, + "loss": 0.8247, + "step": 54210 + }, + { + "epoch": 0.47932247741296696, + "grad_norm": 1.0892534255981445, + "learning_rate": 4.2011292043117216e-05, + "loss": 0.722, + "step": 54220 + }, + { + "epoch": 0.47941088067327925, + "grad_norm": 1.8006415367126465, + "learning_rate": 4.200981865544535e-05, + "loss": 0.6324, + "step": 54230 + }, + { + "epoch": 0.4794992839335915, + "grad_norm": 13.464760780334473, + "learning_rate": 4.200834526777348e-05, + "loss": 0.7503, + "step": 54240 + }, + { + "epoch": 0.4795876871939037, + "grad_norm": 3.999490976333618, + "learning_rate": 4.200687188010161e-05, + "loss": 0.5964, + "step": 54250 + }, + { + "epoch": 0.47967609045421594, + "grad_norm": 1.042232871055603, + "learning_rate": 4.2005398492429736e-05, + "loss": 0.5991, + "step": 54260 + }, + { + "epoch": 0.4797644937145282, + "grad_norm": 12.239282608032227, + "learning_rate": 4.2003925104757865e-05, + "loss": 0.7737, + "step": 54270 + }, + { + "epoch": 0.47985289697484046, + "grad_norm": 5.707892417907715, + "learning_rate": 4.200245171708599e-05, + "loss": 0.5718, + "step": 54280 + }, + { + "epoch": 0.4799413002351527, + "grad_norm": 10.65614128112793, + "learning_rate": 4.200097832941413e-05, + "loss": 0.7817, + "step": 54290 + }, + { + "epoch": 0.4800297034954649, + "grad_norm": 7.1056671142578125, + "learning_rate": 4.199950494174225e-05, + "loss": 0.6784, + "step": 54300 + }, + { + "epoch": 0.48011810675577715, + "grad_norm": 3.8355488777160645, + "learning_rate": 4.1998031554070385e-05, + "loss": 0.7311, + "step": 54310 + }, + { + "epoch": 0.4802065100160894, + "grad_norm": 1.3522825241088867, + "learning_rate": 4.199655816639851e-05, + "loss": 0.7119, + "step": 54320 + }, + { + "epoch": 0.4802949132764016, + "grad_norm": 1.1591590642929077, + "learning_rate": 4.199508477872664e-05, + "loss": 0.594, + "step": 54330 + }, + { + "epoch": 0.4803833165367139, + "grad_norm": 5.093685150146484, + "learning_rate": 4.199361139105477e-05, + "loss": 0.6828, + "step": 54340 + }, + { + "epoch": 0.48047171979702613, + "grad_norm": 3.0881261825561523, + "learning_rate": 4.1992138003382905e-05, + "loss": 0.7552, + "step": 54350 + }, + { + "epoch": 0.48056012305733836, + "grad_norm": 2.330265998840332, + "learning_rate": 4.199066461571103e-05, + "loss": 0.8338, + "step": 54360 + }, + { + "epoch": 0.4806485263176506, + "grad_norm": 2.2567107677459717, + "learning_rate": 4.198919122803916e-05, + "loss": 0.6752, + "step": 54370 + }, + { + "epoch": 0.4807369295779628, + "grad_norm": 4.572037220001221, + "learning_rate": 4.1987717840367283e-05, + "loss": 0.6666, + "step": 54380 + }, + { + "epoch": 0.48082533283827505, + "grad_norm": 8.050597190856934, + "learning_rate": 4.198624445269542e-05, + "loss": 0.7116, + "step": 54390 + }, + { + "epoch": 0.48091373609858734, + "grad_norm": 1.7825093269348145, + "learning_rate": 4.198477106502355e-05, + "loss": 0.6326, + "step": 54400 + }, + { + "epoch": 0.48100213935889957, + "grad_norm": 2.3205370903015137, + "learning_rate": 4.1983297677351675e-05, + "loss": 0.6829, + "step": 54410 + }, + { + "epoch": 0.4810905426192118, + "grad_norm": 3.3724260330200195, + "learning_rate": 4.1981824289679804e-05, + "loss": 0.5336, + "step": 54420 + }, + { + "epoch": 0.48117894587952403, + "grad_norm": 2.3022725582122803, + "learning_rate": 4.198035090200794e-05, + "loss": 0.6742, + "step": 54430 + }, + { + "epoch": 0.48126734913983626, + "grad_norm": 3.31050443649292, + "learning_rate": 4.197887751433606e-05, + "loss": 0.7448, + "step": 54440 + }, + { + "epoch": 0.4813557524001485, + "grad_norm": 6.843266487121582, + "learning_rate": 4.1977404126664196e-05, + "loss": 0.7601, + "step": 54450 + }, + { + "epoch": 0.4814441556604608, + "grad_norm": 4.714592933654785, + "learning_rate": 4.1975930738992324e-05, + "loss": 0.6474, + "step": 54460 + }, + { + "epoch": 0.481532558920773, + "grad_norm": 2.2546401023864746, + "learning_rate": 4.197445735132045e-05, + "loss": 0.7151, + "step": 54470 + }, + { + "epoch": 0.48162096218108524, + "grad_norm": 3.8935787677764893, + "learning_rate": 4.197298396364858e-05, + "loss": 0.6055, + "step": 54480 + }, + { + "epoch": 0.4817093654413975, + "grad_norm": 4.3352437019348145, + "learning_rate": 4.197151057597671e-05, + "loss": 0.6747, + "step": 54490 + }, + { + "epoch": 0.4817977687017097, + "grad_norm": 10.236634254455566, + "learning_rate": 4.197003718830484e-05, + "loss": 0.7769, + "step": 54500 + }, + { + "epoch": 0.48188617196202194, + "grad_norm": 2.106126070022583, + "learning_rate": 4.196856380063297e-05, + "loss": 0.7155, + "step": 54510 + }, + { + "epoch": 0.4819745752223342, + "grad_norm": 3.186664581298828, + "learning_rate": 4.1967090412961094e-05, + "loss": 0.5455, + "step": 54520 + }, + { + "epoch": 0.48206297848264645, + "grad_norm": 2.3363170623779297, + "learning_rate": 4.196561702528923e-05, + "loss": 0.6786, + "step": 54530 + }, + { + "epoch": 0.4821513817429587, + "grad_norm": 5.277554035186768, + "learning_rate": 4.196414363761736e-05, + "loss": 0.5956, + "step": 54540 + }, + { + "epoch": 0.4822397850032709, + "grad_norm": 14.230496406555176, + "learning_rate": 4.1962670249945486e-05, + "loss": 0.7322, + "step": 54550 + }, + { + "epoch": 0.48232818826358315, + "grad_norm": 4.711739540100098, + "learning_rate": 4.1961196862273614e-05, + "loss": 0.8439, + "step": 54560 + }, + { + "epoch": 0.4824165915238954, + "grad_norm": 1.5514178276062012, + "learning_rate": 4.195972347460175e-05, + "loss": 0.6972, + "step": 54570 + }, + { + "epoch": 0.48250499478420766, + "grad_norm": 3.1461331844329834, + "learning_rate": 4.195825008692987e-05, + "loss": 0.7505, + "step": 54580 + }, + { + "epoch": 0.4825933980445199, + "grad_norm": 2.6853485107421875, + "learning_rate": 4.1956776699258006e-05, + "loss": 0.6901, + "step": 54590 + }, + { + "epoch": 0.4826818013048321, + "grad_norm": 9.775922775268555, + "learning_rate": 4.1955303311586134e-05, + "loss": 0.6844, + "step": 54600 + }, + { + "epoch": 0.48277020456514436, + "grad_norm": 3.3453292846679688, + "learning_rate": 4.195382992391426e-05, + "loss": 0.6983, + "step": 54610 + }, + { + "epoch": 0.4828586078254566, + "grad_norm": 5.067368984222412, + "learning_rate": 4.195235653624239e-05, + "loss": 0.8372, + "step": 54620 + }, + { + "epoch": 0.4829470110857688, + "grad_norm": 7.5894575119018555, + "learning_rate": 4.195088314857052e-05, + "loss": 0.8135, + "step": 54630 + }, + { + "epoch": 0.4830354143460811, + "grad_norm": 3.6968748569488525, + "learning_rate": 4.194940976089865e-05, + "loss": 0.7562, + "step": 54640 + }, + { + "epoch": 0.48312381760639334, + "grad_norm": 2.677279472351074, + "learning_rate": 4.194793637322678e-05, + "loss": 0.6042, + "step": 54650 + }, + { + "epoch": 0.48321222086670557, + "grad_norm": 8.589305877685547, + "learning_rate": 4.1946462985554905e-05, + "loss": 0.5726, + "step": 54660 + }, + { + "epoch": 0.4833006241270178, + "grad_norm": 5.660636901855469, + "learning_rate": 4.194498959788304e-05, + "loss": 0.7243, + "step": 54670 + }, + { + "epoch": 0.48338902738733003, + "grad_norm": 1.8863121271133423, + "learning_rate": 4.194351621021117e-05, + "loss": 0.7849, + "step": 54680 + }, + { + "epoch": 0.48347743064764226, + "grad_norm": 6.596573352813721, + "learning_rate": 4.1942042822539296e-05, + "loss": 0.6581, + "step": 54690 + }, + { + "epoch": 0.48356583390795455, + "grad_norm": 1.5892181396484375, + "learning_rate": 4.1940569434867425e-05, + "loss": 0.6964, + "step": 54700 + }, + { + "epoch": 0.4836542371682668, + "grad_norm": 1.8390005826950073, + "learning_rate": 4.193909604719556e-05, + "loss": 0.6897, + "step": 54710 + }, + { + "epoch": 0.483742640428579, + "grad_norm": 11.260882377624512, + "learning_rate": 4.193762265952368e-05, + "loss": 0.6603, + "step": 54720 + }, + { + "epoch": 0.48383104368889124, + "grad_norm": 2.672907829284668, + "learning_rate": 4.1936149271851817e-05, + "loss": 0.7485, + "step": 54730 + }, + { + "epoch": 0.48391944694920347, + "grad_norm": 7.139774322509766, + "learning_rate": 4.193467588417994e-05, + "loss": 0.801, + "step": 54740 + }, + { + "epoch": 0.4840078502095157, + "grad_norm": 2.941298007965088, + "learning_rate": 4.193320249650807e-05, + "loss": 0.8014, + "step": 54750 + }, + { + "epoch": 0.484096253469828, + "grad_norm": 4.749353885650635, + "learning_rate": 4.19317291088362e-05, + "loss": 0.6926, + "step": 54760 + }, + { + "epoch": 0.4841846567301402, + "grad_norm": 11.97219467163086, + "learning_rate": 4.193025572116433e-05, + "loss": 0.6381, + "step": 54770 + }, + { + "epoch": 0.48427305999045245, + "grad_norm": 5.572201728820801, + "learning_rate": 4.192878233349246e-05, + "loss": 0.8293, + "step": 54780 + }, + { + "epoch": 0.4843614632507647, + "grad_norm": 2.2339625358581543, + "learning_rate": 4.1927308945820593e-05, + "loss": 0.6532, + "step": 54790 + }, + { + "epoch": 0.4844498665110769, + "grad_norm": 5.473230838775635, + "learning_rate": 4.1925835558148715e-05, + "loss": 0.7743, + "step": 54800 + }, + { + "epoch": 0.4845382697713892, + "grad_norm": 3.816479206085205, + "learning_rate": 4.192436217047685e-05, + "loss": 0.7157, + "step": 54810 + }, + { + "epoch": 0.4846266730317014, + "grad_norm": 11.660649299621582, + "learning_rate": 4.192288878280498e-05, + "loss": 0.6943, + "step": 54820 + }, + { + "epoch": 0.48471507629201366, + "grad_norm": 3.910614490509033, + "learning_rate": 4.192141539513311e-05, + "loss": 0.7379, + "step": 54830 + }, + { + "epoch": 0.4848034795523259, + "grad_norm": 4.9388108253479, + "learning_rate": 4.1919942007461235e-05, + "loss": 0.7465, + "step": 54840 + }, + { + "epoch": 0.4848918828126381, + "grad_norm": 10.465206146240234, + "learning_rate": 4.1918468619789364e-05, + "loss": 0.8192, + "step": 54850 + }, + { + "epoch": 0.48498028607295035, + "grad_norm": 2.099726438522339, + "learning_rate": 4.191699523211749e-05, + "loss": 0.7743, + "step": 54860 + }, + { + "epoch": 0.48506868933326264, + "grad_norm": 2.6266822814941406, + "learning_rate": 4.191552184444563e-05, + "loss": 0.6546, + "step": 54870 + }, + { + "epoch": 0.48515709259357487, + "grad_norm": 4.461912631988525, + "learning_rate": 4.1914048456773755e-05, + "loss": 0.7234, + "step": 54880 + }, + { + "epoch": 0.4852454958538871, + "grad_norm": 10.542143821716309, + "learning_rate": 4.1912575069101884e-05, + "loss": 0.6395, + "step": 54890 + }, + { + "epoch": 0.48533389911419933, + "grad_norm": 2.121877431869507, + "learning_rate": 4.191110168143001e-05, + "loss": 0.7799, + "step": 54900 + }, + { + "epoch": 0.48542230237451156, + "grad_norm": 5.219315052032471, + "learning_rate": 4.190962829375814e-05, + "loss": 0.6919, + "step": 54910 + }, + { + "epoch": 0.4855107056348238, + "grad_norm": 1.6760114431381226, + "learning_rate": 4.190815490608627e-05, + "loss": 0.632, + "step": 54920 + }, + { + "epoch": 0.4855991088951361, + "grad_norm": 6.953105449676514, + "learning_rate": 4.1906681518414404e-05, + "loss": 0.6959, + "step": 54930 + }, + { + "epoch": 0.4856875121554483, + "grad_norm": 2.8986353874206543, + "learning_rate": 4.190520813074253e-05, + "loss": 0.6063, + "step": 54940 + }, + { + "epoch": 0.48577591541576054, + "grad_norm": 8.168563842773438, + "learning_rate": 4.190373474307066e-05, + "loss": 0.721, + "step": 54950 + }, + { + "epoch": 0.48586431867607277, + "grad_norm": 2.3021225929260254, + "learning_rate": 4.190226135539879e-05, + "loss": 0.6756, + "step": 54960 + }, + { + "epoch": 0.485952721936385, + "grad_norm": 1.4999538660049438, + "learning_rate": 4.190078796772692e-05, + "loss": 0.6641, + "step": 54970 + }, + { + "epoch": 0.48604112519669723, + "grad_norm": 7.899734020233154, + "learning_rate": 4.1899314580055046e-05, + "loss": 0.6603, + "step": 54980 + }, + { + "epoch": 0.4861295284570095, + "grad_norm": 4.906933784484863, + "learning_rate": 4.1897841192383174e-05, + "loss": 0.6396, + "step": 54990 + }, + { + "epoch": 0.48621793171732175, + "grad_norm": 3.425278902053833, + "learning_rate": 4.189636780471131e-05, + "loss": 0.6654, + "step": 55000 + }, + { + "epoch": 0.486306334977634, + "grad_norm": 3.697119951248169, + "learning_rate": 4.189489441703944e-05, + "loss": 0.7726, + "step": 55010 + }, + { + "epoch": 0.4863947382379462, + "grad_norm": 9.480390548706055, + "learning_rate": 4.1893421029367566e-05, + "loss": 0.7102, + "step": 55020 + }, + { + "epoch": 0.48648314149825844, + "grad_norm": 6.128053665161133, + "learning_rate": 4.1891947641695694e-05, + "loss": 0.6335, + "step": 55030 + }, + { + "epoch": 0.4865715447585707, + "grad_norm": 3.5200870037078857, + "learning_rate": 4.189047425402382e-05, + "loss": 0.5863, + "step": 55040 + }, + { + "epoch": 0.48665994801888296, + "grad_norm": 2.784259557723999, + "learning_rate": 4.188900086635195e-05, + "loss": 0.6824, + "step": 55050 + }, + { + "epoch": 0.4867483512791952, + "grad_norm": 6.356380939483643, + "learning_rate": 4.1887527478680086e-05, + "loss": 0.6934, + "step": 55060 + }, + { + "epoch": 0.4868367545395074, + "grad_norm": 2.7658591270446777, + "learning_rate": 4.1886054091008214e-05, + "loss": 0.6902, + "step": 55070 + }, + { + "epoch": 0.48692515779981965, + "grad_norm": 4.423125267028809, + "learning_rate": 4.188458070333634e-05, + "loss": 0.6749, + "step": 55080 + }, + { + "epoch": 0.4870135610601319, + "grad_norm": 4.625323295593262, + "learning_rate": 4.188310731566447e-05, + "loss": 0.7453, + "step": 55090 + }, + { + "epoch": 0.4871019643204441, + "grad_norm": 4.361440658569336, + "learning_rate": 4.18816339279926e-05, + "loss": 0.705, + "step": 55100 + }, + { + "epoch": 0.4871903675807564, + "grad_norm": 4.610135078430176, + "learning_rate": 4.188016054032073e-05, + "loss": 0.7953, + "step": 55110 + }, + { + "epoch": 0.48727877084106863, + "grad_norm": 2.346588373184204, + "learning_rate": 4.187868715264886e-05, + "loss": 0.867, + "step": 55120 + }, + { + "epoch": 0.48736717410138086, + "grad_norm": 5.302258491516113, + "learning_rate": 4.1877213764976985e-05, + "loss": 0.7313, + "step": 55130 + }, + { + "epoch": 0.4874555773616931, + "grad_norm": 5.039346218109131, + "learning_rate": 4.187574037730512e-05, + "loss": 0.6289, + "step": 55140 + }, + { + "epoch": 0.4875439806220053, + "grad_norm": 7.583291053771973, + "learning_rate": 4.187426698963325e-05, + "loss": 0.6106, + "step": 55150 + }, + { + "epoch": 0.48763238388231755, + "grad_norm": 2.6910438537597656, + "learning_rate": 4.1872793601961376e-05, + "loss": 0.7304, + "step": 55160 + }, + { + "epoch": 0.48772078714262984, + "grad_norm": 3.1498353481292725, + "learning_rate": 4.1871320214289505e-05, + "loss": 0.6721, + "step": 55170 + }, + { + "epoch": 0.48780919040294207, + "grad_norm": 4.106415748596191, + "learning_rate": 4.186984682661764e-05, + "loss": 0.7291, + "step": 55180 + }, + { + "epoch": 0.4878975936632543, + "grad_norm": 11.023696899414062, + "learning_rate": 4.186837343894576e-05, + "loss": 0.5497, + "step": 55190 + }, + { + "epoch": 0.48798599692356653, + "grad_norm": 2.3651418685913086, + "learning_rate": 4.1866900051273897e-05, + "loss": 0.6666, + "step": 55200 + }, + { + "epoch": 0.48807440018387876, + "grad_norm": 5.506601333618164, + "learning_rate": 4.186542666360202e-05, + "loss": 0.7184, + "step": 55210 + }, + { + "epoch": 0.488162803444191, + "grad_norm": 8.60173225402832, + "learning_rate": 4.186395327593015e-05, + "loss": 0.6957, + "step": 55220 + }, + { + "epoch": 0.4882512067045033, + "grad_norm": 2.281357765197754, + "learning_rate": 4.186247988825828e-05, + "loss": 0.6491, + "step": 55230 + }, + { + "epoch": 0.4883396099648155, + "grad_norm": 5.497363567352295, + "learning_rate": 4.186100650058641e-05, + "loss": 0.7456, + "step": 55240 + }, + { + "epoch": 0.48842801322512774, + "grad_norm": 6.958611965179443, + "learning_rate": 4.185953311291454e-05, + "loss": 0.6327, + "step": 55250 + }, + { + "epoch": 0.48851641648544, + "grad_norm": 6.949708938598633, + "learning_rate": 4.1858059725242674e-05, + "loss": 0.5946, + "step": 55260 + }, + { + "epoch": 0.4886048197457522, + "grad_norm": 2.0884857177734375, + "learning_rate": 4.1856586337570795e-05, + "loss": 0.8607, + "step": 55270 + }, + { + "epoch": 0.48869322300606444, + "grad_norm": 5.647246360778809, + "learning_rate": 4.185511294989893e-05, + "loss": 0.7551, + "step": 55280 + }, + { + "epoch": 0.4887816262663767, + "grad_norm": 1.4908417463302612, + "learning_rate": 4.185363956222706e-05, + "loss": 0.7701, + "step": 55290 + }, + { + "epoch": 0.48887002952668895, + "grad_norm": 1.7229188680648804, + "learning_rate": 4.185216617455519e-05, + "loss": 0.6892, + "step": 55300 + }, + { + "epoch": 0.4889584327870012, + "grad_norm": 3.431084632873535, + "learning_rate": 4.1850692786883315e-05, + "loss": 0.7167, + "step": 55310 + }, + { + "epoch": 0.4890468360473134, + "grad_norm": 2.7224199771881104, + "learning_rate": 4.1849219399211444e-05, + "loss": 0.6454, + "step": 55320 + }, + { + "epoch": 0.48913523930762565, + "grad_norm": 7.204508304595947, + "learning_rate": 4.184774601153957e-05, + "loss": 0.7286, + "step": 55330 + }, + { + "epoch": 0.48922364256793793, + "grad_norm": 8.568854331970215, + "learning_rate": 4.184627262386771e-05, + "loss": 0.6871, + "step": 55340 + }, + { + "epoch": 0.48931204582825016, + "grad_norm": 4.631820201873779, + "learning_rate": 4.184479923619583e-05, + "loss": 0.6819, + "step": 55350 + }, + { + "epoch": 0.4894004490885624, + "grad_norm": 7.1135640144348145, + "learning_rate": 4.1843325848523964e-05, + "loss": 0.8508, + "step": 55360 + }, + { + "epoch": 0.4894888523488746, + "grad_norm": 4.763911247253418, + "learning_rate": 4.184185246085209e-05, + "loss": 0.7262, + "step": 55370 + }, + { + "epoch": 0.48957725560918686, + "grad_norm": 1.350186824798584, + "learning_rate": 4.184037907318022e-05, + "loss": 0.6756, + "step": 55380 + }, + { + "epoch": 0.4896656588694991, + "grad_norm": 7.568343162536621, + "learning_rate": 4.183890568550835e-05, + "loss": 0.7142, + "step": 55390 + }, + { + "epoch": 0.4897540621298114, + "grad_norm": 7.722400188446045, + "learning_rate": 4.1837432297836484e-05, + "loss": 0.67, + "step": 55400 + }, + { + "epoch": 0.4898424653901236, + "grad_norm": 5.146885395050049, + "learning_rate": 4.1835958910164606e-05, + "loss": 0.6448, + "step": 55410 + }, + { + "epoch": 0.48993086865043584, + "grad_norm": 1.5060570240020752, + "learning_rate": 4.183448552249274e-05, + "loss": 0.7488, + "step": 55420 + }, + { + "epoch": 0.49001927191074807, + "grad_norm": 4.813604831695557, + "learning_rate": 4.183301213482086e-05, + "loss": 0.7058, + "step": 55430 + }, + { + "epoch": 0.4901076751710603, + "grad_norm": 3.104266405105591, + "learning_rate": 4.1831538747149e-05, + "loss": 0.7286, + "step": 55440 + }, + { + "epoch": 0.49019607843137253, + "grad_norm": 1.2457255125045776, + "learning_rate": 4.1830065359477126e-05, + "loss": 0.6328, + "step": 55450 + }, + { + "epoch": 0.4902844816916848, + "grad_norm": 2.3449554443359375, + "learning_rate": 4.1828591971805254e-05, + "loss": 0.6452, + "step": 55460 + }, + { + "epoch": 0.49037288495199705, + "grad_norm": 2.7399024963378906, + "learning_rate": 4.182711858413338e-05, + "loss": 0.8105, + "step": 55470 + }, + { + "epoch": 0.4904612882123093, + "grad_norm": 11.01954174041748, + "learning_rate": 4.182564519646152e-05, + "loss": 0.6609, + "step": 55480 + }, + { + "epoch": 0.4905496914726215, + "grad_norm": 7.126019477844238, + "learning_rate": 4.182417180878964e-05, + "loss": 0.775, + "step": 55490 + }, + { + "epoch": 0.49063809473293374, + "grad_norm": 2.2502033710479736, + "learning_rate": 4.1822698421117774e-05, + "loss": 0.5554, + "step": 55500 + }, + { + "epoch": 0.49072649799324597, + "grad_norm": 10.011443138122559, + "learning_rate": 4.18212250334459e-05, + "loss": 0.7572, + "step": 55510 + }, + { + "epoch": 0.49081490125355826, + "grad_norm": 2.1530823707580566, + "learning_rate": 4.181975164577403e-05, + "loss": 0.6739, + "step": 55520 + }, + { + "epoch": 0.4909033045138705, + "grad_norm": 13.02297306060791, + "learning_rate": 4.181827825810216e-05, + "loss": 0.7904, + "step": 55530 + }, + { + "epoch": 0.4909917077741827, + "grad_norm": 6.199241638183594, + "learning_rate": 4.1816804870430295e-05, + "loss": 0.6325, + "step": 55540 + }, + { + "epoch": 0.49108011103449495, + "grad_norm": 8.223472595214844, + "learning_rate": 4.1815331482758416e-05, + "loss": 0.6923, + "step": 55550 + }, + { + "epoch": 0.4911685142948072, + "grad_norm": 2.7837607860565186, + "learning_rate": 4.181385809508655e-05, + "loss": 0.7552, + "step": 55560 + }, + { + "epoch": 0.4912569175551194, + "grad_norm": 9.880025863647461, + "learning_rate": 4.181238470741467e-05, + "loss": 0.5899, + "step": 55570 + }, + { + "epoch": 0.4913453208154317, + "grad_norm": 8.495399475097656, + "learning_rate": 4.181091131974281e-05, + "loss": 0.6938, + "step": 55580 + }, + { + "epoch": 0.4914337240757439, + "grad_norm": 5.334437370300293, + "learning_rate": 4.1809437932070936e-05, + "loss": 0.7148, + "step": 55590 + }, + { + "epoch": 0.49152212733605616, + "grad_norm": 2.6492724418640137, + "learning_rate": 4.1807964544399065e-05, + "loss": 0.716, + "step": 55600 + }, + { + "epoch": 0.4916105305963684, + "grad_norm": 3.826448678970337, + "learning_rate": 4.180649115672719e-05, + "loss": 0.8247, + "step": 55610 + }, + { + "epoch": 0.4916989338566806, + "grad_norm": 2.4406564235687256, + "learning_rate": 4.180501776905533e-05, + "loss": 0.6665, + "step": 55620 + }, + { + "epoch": 0.49178733711699285, + "grad_norm": 1.7770804166793823, + "learning_rate": 4.180354438138345e-05, + "loss": 0.7198, + "step": 55630 + }, + { + "epoch": 0.49187574037730514, + "grad_norm": 2.9972198009490967, + "learning_rate": 4.1802070993711585e-05, + "loss": 0.6669, + "step": 55640 + }, + { + "epoch": 0.49196414363761737, + "grad_norm": 8.456132888793945, + "learning_rate": 4.180059760603971e-05, + "loss": 0.6704, + "step": 55650 + }, + { + "epoch": 0.4920525468979296, + "grad_norm": 10.448269844055176, + "learning_rate": 4.179912421836784e-05, + "loss": 0.646, + "step": 55660 + }, + { + "epoch": 0.49214095015824183, + "grad_norm": 5.333916664123535, + "learning_rate": 4.179765083069597e-05, + "loss": 0.5548, + "step": 55670 + }, + { + "epoch": 0.49222935341855406, + "grad_norm": 6.754070281982422, + "learning_rate": 4.17961774430241e-05, + "loss": 0.7663, + "step": 55680 + }, + { + "epoch": 0.4923177566788663, + "grad_norm": 9.926854133605957, + "learning_rate": 4.179470405535223e-05, + "loss": 0.6625, + "step": 55690 + }, + { + "epoch": 0.4924061599391786, + "grad_norm": 3.9071664810180664, + "learning_rate": 4.179323066768036e-05, + "loss": 0.7274, + "step": 55700 + }, + { + "epoch": 0.4924945631994908, + "grad_norm": 1.381919503211975, + "learning_rate": 4.179175728000848e-05, + "loss": 0.6246, + "step": 55710 + }, + { + "epoch": 0.49258296645980304, + "grad_norm": 7.704475402832031, + "learning_rate": 4.179028389233662e-05, + "loss": 0.7636, + "step": 55720 + }, + { + "epoch": 0.49267136972011527, + "grad_norm": 6.495126724243164, + "learning_rate": 4.178881050466475e-05, + "loss": 0.696, + "step": 55730 + }, + { + "epoch": 0.4927597729804275, + "grad_norm": 1.3690553903579712, + "learning_rate": 4.1787337116992875e-05, + "loss": 0.7047, + "step": 55740 + }, + { + "epoch": 0.49284817624073973, + "grad_norm": 2.840855598449707, + "learning_rate": 4.1785863729321004e-05, + "loss": 0.6929, + "step": 55750 + }, + { + "epoch": 0.492936579501052, + "grad_norm": 4.008294105529785, + "learning_rate": 4.178439034164914e-05, + "loss": 0.7485, + "step": 55760 + }, + { + "epoch": 0.49302498276136425, + "grad_norm": 0.9874984622001648, + "learning_rate": 4.178291695397726e-05, + "loss": 0.8122, + "step": 55770 + }, + { + "epoch": 0.4931133860216765, + "grad_norm": 3.377452850341797, + "learning_rate": 4.1781443566305395e-05, + "loss": 0.6806, + "step": 55780 + }, + { + "epoch": 0.4932017892819887, + "grad_norm": 1.7771506309509277, + "learning_rate": 4.1779970178633524e-05, + "loss": 0.6399, + "step": 55790 + }, + { + "epoch": 0.49329019254230094, + "grad_norm": 2.521798610687256, + "learning_rate": 4.177849679096165e-05, + "loss": 0.8092, + "step": 55800 + }, + { + "epoch": 0.4933785958026132, + "grad_norm": 3.044299840927124, + "learning_rate": 4.177702340328978e-05, + "loss": 0.6771, + "step": 55810 + }, + { + "epoch": 0.49346699906292546, + "grad_norm": 1.857763648033142, + "learning_rate": 4.177555001561791e-05, + "loss": 0.7844, + "step": 55820 + }, + { + "epoch": 0.4935554023232377, + "grad_norm": 4.613230228424072, + "learning_rate": 4.177407662794604e-05, + "loss": 0.7614, + "step": 55830 + }, + { + "epoch": 0.4936438055835499, + "grad_norm": 21.550128936767578, + "learning_rate": 4.177260324027417e-05, + "loss": 0.6442, + "step": 55840 + }, + { + "epoch": 0.49373220884386215, + "grad_norm": 1.7921669483184814, + "learning_rate": 4.17711298526023e-05, + "loss": 0.8092, + "step": 55850 + }, + { + "epoch": 0.4938206121041744, + "grad_norm": 1.795630931854248, + "learning_rate": 4.176965646493043e-05, + "loss": 0.672, + "step": 55860 + }, + { + "epoch": 0.49390901536448667, + "grad_norm": 4.374889850616455, + "learning_rate": 4.176818307725856e-05, + "loss": 0.6347, + "step": 55870 + }, + { + "epoch": 0.4939974186247989, + "grad_norm": 3.6233792304992676, + "learning_rate": 4.1766709689586686e-05, + "loss": 0.6676, + "step": 55880 + }, + { + "epoch": 0.49408582188511113, + "grad_norm": 14.791594505310059, + "learning_rate": 4.1765236301914814e-05, + "loss": 0.7522, + "step": 55890 + }, + { + "epoch": 0.49417422514542336, + "grad_norm": 9.890098571777344, + "learning_rate": 4.176376291424294e-05, + "loss": 0.7474, + "step": 55900 + }, + { + "epoch": 0.4942626284057356, + "grad_norm": 1.5646966695785522, + "learning_rate": 4.176228952657108e-05, + "loss": 0.6713, + "step": 55910 + }, + { + "epoch": 0.4943510316660478, + "grad_norm": 3.6969211101531982, + "learning_rate": 4.1760816138899206e-05, + "loss": 0.7218, + "step": 55920 + }, + { + "epoch": 0.4944394349263601, + "grad_norm": 1.3526560068130493, + "learning_rate": 4.1759342751227334e-05, + "loss": 0.7743, + "step": 55930 + }, + { + "epoch": 0.49452783818667234, + "grad_norm": 5.316917419433594, + "learning_rate": 4.175786936355546e-05, + "loss": 0.6966, + "step": 55940 + }, + { + "epoch": 0.4946162414469846, + "grad_norm": 4.94674825668335, + "learning_rate": 4.17563959758836e-05, + "loss": 0.7137, + "step": 55950 + }, + { + "epoch": 0.4947046447072968, + "grad_norm": 6.7031731605529785, + "learning_rate": 4.175492258821172e-05, + "loss": 0.8149, + "step": 55960 + }, + { + "epoch": 0.49479304796760903, + "grad_norm": 3.462838649749756, + "learning_rate": 4.1753449200539854e-05, + "loss": 0.7457, + "step": 55970 + }, + { + "epoch": 0.49488145122792127, + "grad_norm": 2.193026304244995, + "learning_rate": 4.175197581286798e-05, + "loss": 0.78, + "step": 55980 + }, + { + "epoch": 0.49496985448823355, + "grad_norm": 2.6484906673431396, + "learning_rate": 4.175050242519611e-05, + "loss": 0.5925, + "step": 55990 + }, + { + "epoch": 0.4950582577485458, + "grad_norm": 5.234287738800049, + "learning_rate": 4.174902903752424e-05, + "loss": 0.7181, + "step": 56000 + }, + { + "epoch": 0.495146661008858, + "grad_norm": 2.6760685443878174, + "learning_rate": 4.1747555649852375e-05, + "loss": 0.7013, + "step": 56010 + }, + { + "epoch": 0.49523506426917024, + "grad_norm": 2.5146236419677734, + "learning_rate": 4.1746082262180496e-05, + "loss": 0.7546, + "step": 56020 + }, + { + "epoch": 0.4953234675294825, + "grad_norm": 1.9755581617355347, + "learning_rate": 4.174460887450863e-05, + "loss": 0.7427, + "step": 56030 + }, + { + "epoch": 0.4954118707897947, + "grad_norm": 2.856619358062744, + "learning_rate": 4.174313548683675e-05, + "loss": 0.7434, + "step": 56040 + }, + { + "epoch": 0.495500274050107, + "grad_norm": 2.2519772052764893, + "learning_rate": 4.174166209916489e-05, + "loss": 0.6547, + "step": 56050 + }, + { + "epoch": 0.4955886773104192, + "grad_norm": 2.288954019546509, + "learning_rate": 4.1740188711493016e-05, + "loss": 0.7181, + "step": 56060 + }, + { + "epoch": 0.49567708057073145, + "grad_norm": 1.9550548791885376, + "learning_rate": 4.1738715323821145e-05, + "loss": 0.6374, + "step": 56070 + }, + { + "epoch": 0.4957654838310437, + "grad_norm": 3.3265535831451416, + "learning_rate": 4.173724193614927e-05, + "loss": 0.6593, + "step": 56080 + }, + { + "epoch": 0.4958538870913559, + "grad_norm": 1.2758525609970093, + "learning_rate": 4.173576854847741e-05, + "loss": 0.7149, + "step": 56090 + }, + { + "epoch": 0.49594229035166815, + "grad_norm": 3.466811180114746, + "learning_rate": 4.173429516080553e-05, + "loss": 0.6414, + "step": 56100 + }, + { + "epoch": 0.49603069361198043, + "grad_norm": 1.6507645845413208, + "learning_rate": 4.1732821773133665e-05, + "loss": 0.6309, + "step": 56110 + }, + { + "epoch": 0.49611909687229266, + "grad_norm": 1.9765758514404297, + "learning_rate": 4.173134838546179e-05, + "loss": 0.63, + "step": 56120 + }, + { + "epoch": 0.4962075001326049, + "grad_norm": 14.110745429992676, + "learning_rate": 4.172987499778992e-05, + "loss": 0.8376, + "step": 56130 + }, + { + "epoch": 0.4962959033929171, + "grad_norm": 2.5902881622314453, + "learning_rate": 4.172840161011805e-05, + "loss": 0.6001, + "step": 56140 + }, + { + "epoch": 0.49638430665322936, + "grad_norm": 1.9173308610916138, + "learning_rate": 4.172692822244618e-05, + "loss": 0.6811, + "step": 56150 + }, + { + "epoch": 0.4964727099135416, + "grad_norm": 1.4938806295394897, + "learning_rate": 4.172545483477431e-05, + "loss": 0.6975, + "step": 56160 + }, + { + "epoch": 0.4965611131738539, + "grad_norm": 3.743889331817627, + "learning_rate": 4.172398144710244e-05, + "loss": 0.678, + "step": 56170 + }, + { + "epoch": 0.4966495164341661, + "grad_norm": 3.569836378097534, + "learning_rate": 4.1722508059430563e-05, + "loss": 0.6603, + "step": 56180 + }, + { + "epoch": 0.49673791969447834, + "grad_norm": 5.14637565612793, + "learning_rate": 4.17210346717587e-05, + "loss": 0.7726, + "step": 56190 + }, + { + "epoch": 0.49682632295479057, + "grad_norm": 1.631245732307434, + "learning_rate": 4.171956128408683e-05, + "loss": 0.6322, + "step": 56200 + }, + { + "epoch": 0.4969147262151028, + "grad_norm": 1.28780996799469, + "learning_rate": 4.1718087896414955e-05, + "loss": 0.7523, + "step": 56210 + }, + { + "epoch": 0.49700312947541503, + "grad_norm": 3.152726411819458, + "learning_rate": 4.1716614508743084e-05, + "loss": 0.5857, + "step": 56220 + }, + { + "epoch": 0.4970915327357273, + "grad_norm": 8.108668327331543, + "learning_rate": 4.171514112107122e-05, + "loss": 0.7438, + "step": 56230 + }, + { + "epoch": 0.49717993599603955, + "grad_norm": 3.006171703338623, + "learning_rate": 4.171366773339934e-05, + "loss": 0.6027, + "step": 56240 + }, + { + "epoch": 0.4972683392563518, + "grad_norm": 8.034201622009277, + "learning_rate": 4.1712194345727475e-05, + "loss": 0.6328, + "step": 56250 + }, + { + "epoch": 0.497356742516664, + "grad_norm": 7.66131591796875, + "learning_rate": 4.17107209580556e-05, + "loss": 0.6001, + "step": 56260 + }, + { + "epoch": 0.49744514577697624, + "grad_norm": 1.8868650197982788, + "learning_rate": 4.170924757038373e-05, + "loss": 0.5932, + "step": 56270 + }, + { + "epoch": 0.49753354903728847, + "grad_norm": 4.654658317565918, + "learning_rate": 4.170777418271186e-05, + "loss": 0.6393, + "step": 56280 + }, + { + "epoch": 0.49762195229760076, + "grad_norm": 3.2896487712860107, + "learning_rate": 4.170630079503999e-05, + "loss": 0.7229, + "step": 56290 + }, + { + "epoch": 0.497710355557913, + "grad_norm": 3.0114638805389404, + "learning_rate": 4.170482740736812e-05, + "loss": 0.661, + "step": 56300 + }, + { + "epoch": 0.4977987588182252, + "grad_norm": 1.7431437969207764, + "learning_rate": 4.170335401969625e-05, + "loss": 0.6516, + "step": 56310 + }, + { + "epoch": 0.49788716207853745, + "grad_norm": 1.5742127895355225, + "learning_rate": 4.1701880632024374e-05, + "loss": 0.5967, + "step": 56320 + }, + { + "epoch": 0.4979755653388497, + "grad_norm": 3.6714813709259033, + "learning_rate": 4.170040724435251e-05, + "loss": 0.6864, + "step": 56330 + }, + { + "epoch": 0.4980639685991619, + "grad_norm": 3.167954683303833, + "learning_rate": 4.169893385668064e-05, + "loss": 0.7768, + "step": 56340 + }, + { + "epoch": 0.4981523718594742, + "grad_norm": 3.3005383014678955, + "learning_rate": 4.1697460469008766e-05, + "loss": 0.8772, + "step": 56350 + }, + { + "epoch": 0.4982407751197864, + "grad_norm": 3.6041250228881836, + "learning_rate": 4.1695987081336894e-05, + "loss": 0.9309, + "step": 56360 + }, + { + "epoch": 0.49832917838009866, + "grad_norm": 1.3319722414016724, + "learning_rate": 4.169451369366503e-05, + "loss": 0.6713, + "step": 56370 + }, + { + "epoch": 0.4984175816404109, + "grad_norm": 6.420892715454102, + "learning_rate": 4.169304030599315e-05, + "loss": 0.6064, + "step": 56380 + }, + { + "epoch": 0.4985059849007231, + "grad_norm": 12.575167655944824, + "learning_rate": 4.1691566918321286e-05, + "loss": 0.7888, + "step": 56390 + }, + { + "epoch": 0.4985943881610354, + "grad_norm": 3.8580946922302246, + "learning_rate": 4.169009353064941e-05, + "loss": 0.7808, + "step": 56400 + }, + { + "epoch": 0.49868279142134764, + "grad_norm": 2.1893560886383057, + "learning_rate": 4.168862014297754e-05, + "loss": 0.7169, + "step": 56410 + }, + { + "epoch": 0.49877119468165987, + "grad_norm": 3.957118511199951, + "learning_rate": 4.168714675530567e-05, + "loss": 0.7767, + "step": 56420 + }, + { + "epoch": 0.4988595979419721, + "grad_norm": 2.7286252975463867, + "learning_rate": 4.16856733676338e-05, + "loss": 0.7691, + "step": 56430 + }, + { + "epoch": 0.49894800120228433, + "grad_norm": 1.7598273754119873, + "learning_rate": 4.168419997996193e-05, + "loss": 0.6991, + "step": 56440 + }, + { + "epoch": 0.49903640446259656, + "grad_norm": 2.537155866622925, + "learning_rate": 4.168272659229006e-05, + "loss": 0.7169, + "step": 56450 + }, + { + "epoch": 0.49912480772290885, + "grad_norm": 10.953792572021484, + "learning_rate": 4.1681253204618184e-05, + "loss": 0.8031, + "step": 56460 + }, + { + "epoch": 0.4992132109832211, + "grad_norm": 4.062319755554199, + "learning_rate": 4.167977981694632e-05, + "loss": 0.5463, + "step": 56470 + }, + { + "epoch": 0.4993016142435333, + "grad_norm": 7.464442253112793, + "learning_rate": 4.167830642927445e-05, + "loss": 0.7858, + "step": 56480 + }, + { + "epoch": 0.49939001750384554, + "grad_norm": 7.360393047332764, + "learning_rate": 4.1676833041602576e-05, + "loss": 0.6219, + "step": 56490 + }, + { + "epoch": 0.49947842076415777, + "grad_norm": 3.858891248703003, + "learning_rate": 4.1675359653930705e-05, + "loss": 0.6408, + "step": 56500 + }, + { + "epoch": 0.49956682402447, + "grad_norm": 4.827006816864014, + "learning_rate": 4.167388626625883e-05, + "loss": 0.7617, + "step": 56510 + }, + { + "epoch": 0.4996552272847823, + "grad_norm": 1.9536699056625366, + "learning_rate": 4.167241287858696e-05, + "loss": 0.7106, + "step": 56520 + }, + { + "epoch": 0.4997436305450945, + "grad_norm": 6.272826671600342, + "learning_rate": 4.1670939490915096e-05, + "loss": 0.7382, + "step": 56530 + }, + { + "epoch": 0.49983203380540675, + "grad_norm": 3.2847611904144287, + "learning_rate": 4.166946610324322e-05, + "loss": 0.6765, + "step": 56540 + }, + { + "epoch": 0.499920437065719, + "grad_norm": 4.158660888671875, + "learning_rate": 4.166799271557135e-05, + "loss": 0.7309, + "step": 56550 + }, + { + "epoch": 0.5000088403260312, + "grad_norm": 3.1422619819641113, + "learning_rate": 4.166651932789948e-05, + "loss": 0.6606, + "step": 56560 + }, + { + "epoch": 0.5000972435863434, + "grad_norm": 5.493398189544678, + "learning_rate": 4.166504594022761e-05, + "loss": 0.7619, + "step": 56570 + }, + { + "epoch": 0.5001856468466557, + "grad_norm": 3.35305118560791, + "learning_rate": 4.166357255255574e-05, + "loss": 0.7336, + "step": 56580 + }, + { + "epoch": 0.5002740501069679, + "grad_norm": 1.8183269500732422, + "learning_rate": 4.166209916488387e-05, + "loss": 0.6442, + "step": 56590 + }, + { + "epoch": 0.5003624533672801, + "grad_norm": 8.141581535339355, + "learning_rate": 4.1660625777211995e-05, + "loss": 0.6402, + "step": 56600 + }, + { + "epoch": 0.5004508566275925, + "grad_norm": 3.1588070392608643, + "learning_rate": 4.165915238954013e-05, + "loss": 0.713, + "step": 56610 + }, + { + "epoch": 0.5005392598879047, + "grad_norm": 2.835822820663452, + "learning_rate": 4.165767900186825e-05, + "loss": 0.7602, + "step": 56620 + }, + { + "epoch": 0.5006276631482169, + "grad_norm": 8.367727279663086, + "learning_rate": 4.165620561419639e-05, + "loss": 0.7216, + "step": 56630 + }, + { + "epoch": 0.5007160664085292, + "grad_norm": 2.2879111766815186, + "learning_rate": 4.1654732226524515e-05, + "loss": 0.5942, + "step": 56640 + }, + { + "epoch": 0.5008044696688414, + "grad_norm": 2.3880019187927246, + "learning_rate": 4.1653258838852643e-05, + "loss": 0.712, + "step": 56650 + }, + { + "epoch": 0.5008928729291536, + "grad_norm": 2.2284128665924072, + "learning_rate": 4.165178545118077e-05, + "loss": 0.6762, + "step": 56660 + }, + { + "epoch": 0.5009812761894659, + "grad_norm": 19.109281539916992, + "learning_rate": 4.165031206350891e-05, + "loss": 0.5511, + "step": 56670 + }, + { + "epoch": 0.5010696794497781, + "grad_norm": 3.4032065868377686, + "learning_rate": 4.1648838675837035e-05, + "loss": 0.5864, + "step": 56680 + }, + { + "epoch": 0.5011580827100903, + "grad_norm": 4.226108551025391, + "learning_rate": 4.1647365288165164e-05, + "loss": 0.7623, + "step": 56690 + }, + { + "epoch": 0.5012464859704026, + "grad_norm": 6.362026214599609, + "learning_rate": 4.164589190049329e-05, + "loss": 0.7535, + "step": 56700 + }, + { + "epoch": 0.5013348892307148, + "grad_norm": 5.453469276428223, + "learning_rate": 4.164441851282142e-05, + "loss": 0.619, + "step": 56710 + }, + { + "epoch": 0.501423292491027, + "grad_norm": 8.311491966247559, + "learning_rate": 4.164294512514955e-05, + "loss": 0.8121, + "step": 56720 + }, + { + "epoch": 0.5015116957513394, + "grad_norm": 3.5187137126922607, + "learning_rate": 4.164147173747768e-05, + "loss": 0.6545, + "step": 56730 + }, + { + "epoch": 0.5016000990116516, + "grad_norm": 3.2361950874328613, + "learning_rate": 4.163999834980581e-05, + "loss": 0.6789, + "step": 56740 + }, + { + "epoch": 0.5016885022719638, + "grad_norm": 2.4146649837493896, + "learning_rate": 4.163852496213394e-05, + "loss": 0.6575, + "step": 56750 + }, + { + "epoch": 0.501776905532276, + "grad_norm": 4.072633266448975, + "learning_rate": 4.163705157446207e-05, + "loss": 0.7072, + "step": 56760 + }, + { + "epoch": 0.5018653087925883, + "grad_norm": 2.62351393699646, + "learning_rate": 4.16355781867902e-05, + "loss": 0.7861, + "step": 56770 + }, + { + "epoch": 0.5019537120529005, + "grad_norm": 1.3204588890075684, + "learning_rate": 4.1634104799118326e-05, + "loss": 0.6465, + "step": 56780 + }, + { + "epoch": 0.5020421153132127, + "grad_norm": 2.0702619552612305, + "learning_rate": 4.1632631411446454e-05, + "loss": 0.5643, + "step": 56790 + }, + { + "epoch": 0.502130518573525, + "grad_norm": 11.134904861450195, + "learning_rate": 4.163115802377459e-05, + "loss": 0.6589, + "step": 56800 + }, + { + "epoch": 0.5022189218338372, + "grad_norm": 12.244222640991211, + "learning_rate": 4.162968463610272e-05, + "loss": 0.8243, + "step": 56810 + }, + { + "epoch": 0.5023073250941494, + "grad_norm": 1.670577049255371, + "learning_rate": 4.1628211248430846e-05, + "loss": 0.6299, + "step": 56820 + }, + { + "epoch": 0.5023957283544617, + "grad_norm": 5.465409755706787, + "learning_rate": 4.1626737860758974e-05, + "loss": 0.8664, + "step": 56830 + }, + { + "epoch": 0.5024841316147739, + "grad_norm": 3.7542972564697266, + "learning_rate": 4.16252644730871e-05, + "loss": 0.7647, + "step": 56840 + }, + { + "epoch": 0.5025725348750862, + "grad_norm": 5.315617084503174, + "learning_rate": 4.162379108541523e-05, + "loss": 0.6223, + "step": 56850 + }, + { + "epoch": 0.5026609381353985, + "grad_norm": 9.695874214172363, + "learning_rate": 4.1622317697743366e-05, + "loss": 0.6954, + "step": 56860 + }, + { + "epoch": 0.5027493413957107, + "grad_norm": 3.826537847518921, + "learning_rate": 4.162084431007149e-05, + "loss": 0.6427, + "step": 56870 + }, + { + "epoch": 0.5028377446560229, + "grad_norm": 2.182504415512085, + "learning_rate": 4.161937092239962e-05, + "loss": 0.8364, + "step": 56880 + }, + { + "epoch": 0.5029261479163352, + "grad_norm": 3.3862853050231934, + "learning_rate": 4.161789753472775e-05, + "loss": 0.6931, + "step": 56890 + }, + { + "epoch": 0.5030145511766474, + "grad_norm": 3.5248701572418213, + "learning_rate": 4.161642414705588e-05, + "loss": 0.6316, + "step": 56900 + }, + { + "epoch": 0.5031029544369596, + "grad_norm": 8.183309555053711, + "learning_rate": 4.161495075938401e-05, + "loss": 0.724, + "step": 56910 + }, + { + "epoch": 0.5031913576972719, + "grad_norm": 3.988558530807495, + "learning_rate": 4.161347737171214e-05, + "loss": 0.7791, + "step": 56920 + }, + { + "epoch": 0.5032797609575841, + "grad_norm": 4.229586124420166, + "learning_rate": 4.1612003984040264e-05, + "loss": 0.5944, + "step": 56930 + }, + { + "epoch": 0.5033681642178963, + "grad_norm": 1.4051169157028198, + "learning_rate": 4.16105305963684e-05, + "loss": 0.6589, + "step": 56940 + }, + { + "epoch": 0.5034565674782085, + "grad_norm": 1.8541871309280396, + "learning_rate": 4.160905720869653e-05, + "loss": 0.61, + "step": 56950 + }, + { + "epoch": 0.5035449707385209, + "grad_norm": 2.2848703861236572, + "learning_rate": 4.1607583821024656e-05, + "loss": 0.6376, + "step": 56960 + }, + { + "epoch": 0.5036333739988331, + "grad_norm": 12.388575553894043, + "learning_rate": 4.1606110433352785e-05, + "loss": 0.6717, + "step": 56970 + }, + { + "epoch": 0.5037217772591454, + "grad_norm": 1.0625922679901123, + "learning_rate": 4.160463704568091e-05, + "loss": 0.5748, + "step": 56980 + }, + { + "epoch": 0.5038101805194576, + "grad_norm": 4.856298446655273, + "learning_rate": 4.160316365800904e-05, + "loss": 0.793, + "step": 56990 + }, + { + "epoch": 0.5038985837797698, + "grad_norm": 7.612451076507568, + "learning_rate": 4.1601690270337177e-05, + "loss": 0.7467, + "step": 57000 + }, + { + "epoch": 0.503986987040082, + "grad_norm": 6.634158611297607, + "learning_rate": 4.16002168826653e-05, + "loss": 0.8421, + "step": 57010 + }, + { + "epoch": 0.5040753903003943, + "grad_norm": 2.936037302017212, + "learning_rate": 4.159874349499343e-05, + "loss": 0.5945, + "step": 57020 + }, + { + "epoch": 0.5041637935607065, + "grad_norm": 5.316231727600098, + "learning_rate": 4.159727010732156e-05, + "loss": 0.7688, + "step": 57030 + }, + { + "epoch": 0.5042521968210187, + "grad_norm": 8.598160743713379, + "learning_rate": 4.159579671964969e-05, + "loss": 0.8109, + "step": 57040 + }, + { + "epoch": 0.504340600081331, + "grad_norm": 3.372987747192383, + "learning_rate": 4.159432333197782e-05, + "loss": 0.7786, + "step": 57050 + }, + { + "epoch": 0.5044290033416432, + "grad_norm": 3.2942261695861816, + "learning_rate": 4.1592849944305953e-05, + "loss": 0.6363, + "step": 57060 + }, + { + "epoch": 0.5045174066019554, + "grad_norm": 4.1155781745910645, + "learning_rate": 4.1591376556634075e-05, + "loss": 0.631, + "step": 57070 + }, + { + "epoch": 0.5046058098622678, + "grad_norm": 8.191411018371582, + "learning_rate": 4.158990316896221e-05, + "loss": 0.6696, + "step": 57080 + }, + { + "epoch": 0.50469421312258, + "grad_norm": 1.8772951364517212, + "learning_rate": 4.158842978129033e-05, + "loss": 0.6356, + "step": 57090 + }, + { + "epoch": 0.5047826163828922, + "grad_norm": 2.4823923110961914, + "learning_rate": 4.158695639361847e-05, + "loss": 0.7862, + "step": 57100 + }, + { + "epoch": 0.5048710196432045, + "grad_norm": 5.776914119720459, + "learning_rate": 4.1585483005946595e-05, + "loss": 0.7658, + "step": 57110 + }, + { + "epoch": 0.5049594229035167, + "grad_norm": 5.254019737243652, + "learning_rate": 4.1584009618274724e-05, + "loss": 0.6319, + "step": 57120 + }, + { + "epoch": 0.5050478261638289, + "grad_norm": 3.2729878425598145, + "learning_rate": 4.158253623060285e-05, + "loss": 0.7331, + "step": 57130 + }, + { + "epoch": 0.5051362294241412, + "grad_norm": 2.2706246376037598, + "learning_rate": 4.158106284293099e-05, + "loss": 0.7267, + "step": 57140 + }, + { + "epoch": 0.5052246326844534, + "grad_norm": 4.088400363922119, + "learning_rate": 4.157958945525911e-05, + "loss": 0.7926, + "step": 57150 + }, + { + "epoch": 0.5053130359447656, + "grad_norm": 3.899329423904419, + "learning_rate": 4.1578116067587244e-05, + "loss": 0.6409, + "step": 57160 + }, + { + "epoch": 0.5054014392050779, + "grad_norm": 2.1385231018066406, + "learning_rate": 4.157664267991537e-05, + "loss": 0.6898, + "step": 57170 + }, + { + "epoch": 0.5054898424653901, + "grad_norm": 7.217191696166992, + "learning_rate": 4.15751692922435e-05, + "loss": 0.6133, + "step": 57180 + }, + { + "epoch": 0.5055782457257023, + "grad_norm": 2.897397994995117, + "learning_rate": 4.157369590457163e-05, + "loss": 0.6874, + "step": 57190 + }, + { + "epoch": 0.5056666489860147, + "grad_norm": 3.4928135871887207, + "learning_rate": 4.157222251689976e-05, + "loss": 0.6597, + "step": 57200 + }, + { + "epoch": 0.5057550522463269, + "grad_norm": 4.433698654174805, + "learning_rate": 4.1570749129227885e-05, + "loss": 0.7171, + "step": 57210 + }, + { + "epoch": 0.5058434555066391, + "grad_norm": 5.090980529785156, + "learning_rate": 4.156927574155602e-05, + "loss": 0.6595, + "step": 57220 + }, + { + "epoch": 0.5059318587669513, + "grad_norm": 3.493039131164551, + "learning_rate": 4.156780235388414e-05, + "loss": 0.6937, + "step": 57230 + }, + { + "epoch": 0.5060202620272636, + "grad_norm": 3.3215017318725586, + "learning_rate": 4.156632896621228e-05, + "loss": 0.698, + "step": 57240 + }, + { + "epoch": 0.5061086652875758, + "grad_norm": 3.170927047729492, + "learning_rate": 4.1564855578540406e-05, + "loss": 0.7577, + "step": 57250 + }, + { + "epoch": 0.506197068547888, + "grad_norm": 2.4635794162750244, + "learning_rate": 4.1563382190868534e-05, + "loss": 0.6094, + "step": 57260 + }, + { + "epoch": 0.5062854718082003, + "grad_norm": 4.042263031005859, + "learning_rate": 4.156190880319666e-05, + "loss": 0.6449, + "step": 57270 + }, + { + "epoch": 0.5063738750685125, + "grad_norm": 1.4860950708389282, + "learning_rate": 4.15604354155248e-05, + "loss": 0.6807, + "step": 57280 + }, + { + "epoch": 0.5064622783288247, + "grad_norm": 2.3431918621063232, + "learning_rate": 4.155896202785292e-05, + "loss": 0.7994, + "step": 57290 + }, + { + "epoch": 0.506550681589137, + "grad_norm": 2.3435821533203125, + "learning_rate": 4.1557488640181054e-05, + "loss": 0.5329, + "step": 57300 + }, + { + "epoch": 0.5066390848494492, + "grad_norm": 6.458644866943359, + "learning_rate": 4.155601525250918e-05, + "loss": 0.7369, + "step": 57310 + }, + { + "epoch": 0.5067274881097615, + "grad_norm": 1.1299951076507568, + "learning_rate": 4.155454186483731e-05, + "loss": 0.7584, + "step": 57320 + }, + { + "epoch": 0.5068158913700738, + "grad_norm": 1.9832658767700195, + "learning_rate": 4.155306847716544e-05, + "loss": 0.6646, + "step": 57330 + }, + { + "epoch": 0.506904294630386, + "grad_norm": 4.427059173583984, + "learning_rate": 4.155159508949357e-05, + "loss": 0.6112, + "step": 57340 + }, + { + "epoch": 0.5069926978906982, + "grad_norm": 6.744326591491699, + "learning_rate": 4.1550121701821696e-05, + "loss": 0.7796, + "step": 57350 + }, + { + "epoch": 0.5070811011510105, + "grad_norm": 2.2644999027252197, + "learning_rate": 4.154864831414983e-05, + "loss": 0.6926, + "step": 57360 + }, + { + "epoch": 0.5071695044113227, + "grad_norm": 2.1050004959106445, + "learning_rate": 4.154717492647795e-05, + "loss": 0.7545, + "step": 57370 + }, + { + "epoch": 0.5072579076716349, + "grad_norm": 4.931227207183838, + "learning_rate": 4.154570153880609e-05, + "loss": 0.6872, + "step": 57380 + }, + { + "epoch": 0.5073463109319472, + "grad_norm": 6.39907693862915, + "learning_rate": 4.1544228151134216e-05, + "loss": 0.638, + "step": 57390 + }, + { + "epoch": 0.5074347141922594, + "grad_norm": 7.43491792678833, + "learning_rate": 4.1542754763462345e-05, + "loss": 0.7172, + "step": 57400 + }, + { + "epoch": 0.5075231174525716, + "grad_norm": 3.913090944290161, + "learning_rate": 4.154128137579047e-05, + "loss": 0.6168, + "step": 57410 + }, + { + "epoch": 0.5076115207128838, + "grad_norm": 15.152998924255371, + "learning_rate": 4.153980798811861e-05, + "loss": 0.7492, + "step": 57420 + }, + { + "epoch": 0.5076999239731961, + "grad_norm": 10.227533340454102, + "learning_rate": 4.153833460044673e-05, + "loss": 0.7735, + "step": 57430 + }, + { + "epoch": 0.5077883272335084, + "grad_norm": 2.000770330429077, + "learning_rate": 4.1536861212774865e-05, + "loss": 0.5962, + "step": 57440 + }, + { + "epoch": 0.5078767304938206, + "grad_norm": 1.3782382011413574, + "learning_rate": 4.1535387825102986e-05, + "loss": 0.7818, + "step": 57450 + }, + { + "epoch": 0.5079651337541329, + "grad_norm": 2.868053436279297, + "learning_rate": 4.153391443743112e-05, + "loss": 0.7098, + "step": 57460 + }, + { + "epoch": 0.5080535370144451, + "grad_norm": 3.6339824199676514, + "learning_rate": 4.153244104975925e-05, + "loss": 0.6535, + "step": 57470 + }, + { + "epoch": 0.5081419402747573, + "grad_norm": 3.4667751789093018, + "learning_rate": 4.153096766208738e-05, + "loss": 0.6274, + "step": 57480 + }, + { + "epoch": 0.5082303435350696, + "grad_norm": 2.1609416007995605, + "learning_rate": 4.1529494274415507e-05, + "loss": 0.7642, + "step": 57490 + }, + { + "epoch": 0.5083187467953818, + "grad_norm": 1.7735949754714966, + "learning_rate": 4.152802088674364e-05, + "loss": 0.6719, + "step": 57500 + }, + { + "epoch": 0.508407150055694, + "grad_norm": 3.4717016220092773, + "learning_rate": 4.152654749907176e-05, + "loss": 0.6177, + "step": 57510 + }, + { + "epoch": 0.5084955533160063, + "grad_norm": 1.4515308141708374, + "learning_rate": 4.15250741113999e-05, + "loss": 0.6885, + "step": 57520 + }, + { + "epoch": 0.5085839565763185, + "grad_norm": 2.896169662475586, + "learning_rate": 4.152360072372803e-05, + "loss": 0.6789, + "step": 57530 + }, + { + "epoch": 0.5086723598366307, + "grad_norm": 6.473743915557861, + "learning_rate": 4.1522127336056155e-05, + "loss": 0.657, + "step": 57540 + }, + { + "epoch": 0.5087607630969431, + "grad_norm": 2.919818878173828, + "learning_rate": 4.1520653948384283e-05, + "loss": 0.6598, + "step": 57550 + }, + { + "epoch": 0.5088491663572553, + "grad_norm": 4.229133605957031, + "learning_rate": 4.151918056071241e-05, + "loss": 0.806, + "step": 57560 + }, + { + "epoch": 0.5089375696175675, + "grad_norm": 10.783143997192383, + "learning_rate": 4.151770717304054e-05, + "loss": 0.6289, + "step": 57570 + }, + { + "epoch": 0.5090259728778798, + "grad_norm": 1.7085390090942383, + "learning_rate": 4.1516233785368675e-05, + "loss": 0.6389, + "step": 57580 + }, + { + "epoch": 0.509114376138192, + "grad_norm": 5.904126167297363, + "learning_rate": 4.1514760397696804e-05, + "loss": 0.81, + "step": 57590 + }, + { + "epoch": 0.5092027793985042, + "grad_norm": 3.2074472904205322, + "learning_rate": 4.151328701002493e-05, + "loss": 0.6909, + "step": 57600 + }, + { + "epoch": 0.5092911826588165, + "grad_norm": 6.975406646728516, + "learning_rate": 4.151181362235306e-05, + "loss": 0.7315, + "step": 57610 + }, + { + "epoch": 0.5093795859191287, + "grad_norm": 1.1402912139892578, + "learning_rate": 4.151034023468119e-05, + "loss": 0.8062, + "step": 57620 + }, + { + "epoch": 0.5094679891794409, + "grad_norm": 4.790241241455078, + "learning_rate": 4.150886684700932e-05, + "loss": 0.6636, + "step": 57630 + }, + { + "epoch": 0.5095563924397531, + "grad_norm": 5.190394401550293, + "learning_rate": 4.150739345933745e-05, + "loss": 0.5493, + "step": 57640 + }, + { + "epoch": 0.5096447957000654, + "grad_norm": 1.768310785293579, + "learning_rate": 4.150592007166558e-05, + "loss": 0.7117, + "step": 57650 + }, + { + "epoch": 0.5097331989603776, + "grad_norm": 4.656040668487549, + "learning_rate": 4.150444668399371e-05, + "loss": 0.7457, + "step": 57660 + }, + { + "epoch": 0.50982160222069, + "grad_norm": 19.833181381225586, + "learning_rate": 4.150297329632184e-05, + "loss": 0.6536, + "step": 57670 + }, + { + "epoch": 0.5099100054810022, + "grad_norm": 3.820453405380249, + "learning_rate": 4.1501499908649966e-05, + "loss": 0.7856, + "step": 57680 + }, + { + "epoch": 0.5099984087413144, + "grad_norm": 5.237705230712891, + "learning_rate": 4.1500026520978094e-05, + "loss": 0.6445, + "step": 57690 + }, + { + "epoch": 0.5100868120016266, + "grad_norm": 3.4213385581970215, + "learning_rate": 4.149855313330622e-05, + "loss": 0.7726, + "step": 57700 + }, + { + "epoch": 0.5101752152619389, + "grad_norm": 6.883492946624756, + "learning_rate": 4.149707974563436e-05, + "loss": 0.6229, + "step": 57710 + }, + { + "epoch": 0.5102636185222511, + "grad_norm": 2.100466251373291, + "learning_rate": 4.1495606357962486e-05, + "loss": 0.6373, + "step": 57720 + }, + { + "epoch": 0.5103520217825633, + "grad_norm": 1.392956018447876, + "learning_rate": 4.1494132970290614e-05, + "loss": 0.548, + "step": 57730 + }, + { + "epoch": 0.5104404250428756, + "grad_norm": 4.357324600219727, + "learning_rate": 4.149265958261874e-05, + "loss": 0.7606, + "step": 57740 + }, + { + "epoch": 0.5105288283031878, + "grad_norm": 11.297916412353516, + "learning_rate": 4.149118619494687e-05, + "loss": 0.7159, + "step": 57750 + }, + { + "epoch": 0.5106172315635, + "grad_norm": 2.839669704437256, + "learning_rate": 4.1489712807275e-05, + "loss": 0.742, + "step": 57760 + }, + { + "epoch": 0.5107056348238123, + "grad_norm": 1.8932141065597534, + "learning_rate": 4.1488239419603134e-05, + "loss": 0.7786, + "step": 57770 + }, + { + "epoch": 0.5107940380841245, + "grad_norm": 4.7063374519348145, + "learning_rate": 4.148676603193126e-05, + "loss": 0.8112, + "step": 57780 + }, + { + "epoch": 0.5108824413444368, + "grad_norm": 1.873314619064331, + "learning_rate": 4.148529264425939e-05, + "loss": 0.6394, + "step": 57790 + }, + { + "epoch": 0.5109708446047491, + "grad_norm": 5.169839382171631, + "learning_rate": 4.148381925658752e-05, + "loss": 0.7142, + "step": 57800 + }, + { + "epoch": 0.5110592478650613, + "grad_norm": 4.607201099395752, + "learning_rate": 4.148234586891565e-05, + "loss": 0.683, + "step": 57810 + }, + { + "epoch": 0.5111476511253735, + "grad_norm": 9.959545135498047, + "learning_rate": 4.1480872481243776e-05, + "loss": 0.7761, + "step": 57820 + }, + { + "epoch": 0.5112360543856858, + "grad_norm": 1.661966323852539, + "learning_rate": 4.147939909357191e-05, + "loss": 0.7066, + "step": 57830 + }, + { + "epoch": 0.511324457645998, + "grad_norm": 4.903796195983887, + "learning_rate": 4.147792570590003e-05, + "loss": 0.6152, + "step": 57840 + }, + { + "epoch": 0.5114128609063102, + "grad_norm": 14.816862106323242, + "learning_rate": 4.147645231822817e-05, + "loss": 0.7046, + "step": 57850 + }, + { + "epoch": 0.5115012641666224, + "grad_norm": 2.8706767559051514, + "learning_rate": 4.1474978930556296e-05, + "loss": 0.7634, + "step": 57860 + }, + { + "epoch": 0.5115896674269347, + "grad_norm": 2.605053424835205, + "learning_rate": 4.1473505542884425e-05, + "loss": 0.6965, + "step": 57870 + }, + { + "epoch": 0.5116780706872469, + "grad_norm": 2.3393845558166504, + "learning_rate": 4.147203215521255e-05, + "loss": 0.6555, + "step": 57880 + }, + { + "epoch": 0.5117664739475591, + "grad_norm": 4.712070941925049, + "learning_rate": 4.147055876754069e-05, + "loss": 0.6605, + "step": 57890 + }, + { + "epoch": 0.5118548772078714, + "grad_norm": 3.3680405616760254, + "learning_rate": 4.146908537986881e-05, + "loss": 0.8241, + "step": 57900 + }, + { + "epoch": 0.5119432804681837, + "grad_norm": 8.827312469482422, + "learning_rate": 4.1467611992196945e-05, + "loss": 0.6417, + "step": 57910 + }, + { + "epoch": 0.512031683728496, + "grad_norm": 11.913137435913086, + "learning_rate": 4.1466138604525066e-05, + "loss": 0.6392, + "step": 57920 + }, + { + "epoch": 0.5121200869888082, + "grad_norm": 2.735583543777466, + "learning_rate": 4.14646652168532e-05, + "loss": 0.74, + "step": 57930 + }, + { + "epoch": 0.5122084902491204, + "grad_norm": 6.679357051849365, + "learning_rate": 4.146319182918133e-05, + "loss": 0.6854, + "step": 57940 + }, + { + "epoch": 0.5122968935094326, + "grad_norm": 5.886544227600098, + "learning_rate": 4.146171844150946e-05, + "loss": 0.7648, + "step": 57950 + }, + { + "epoch": 0.5123852967697449, + "grad_norm": 7.434231758117676, + "learning_rate": 4.1460245053837587e-05, + "loss": 0.6851, + "step": 57960 + }, + { + "epoch": 0.5124737000300571, + "grad_norm": 5.318103790283203, + "learning_rate": 4.145877166616572e-05, + "loss": 0.6382, + "step": 57970 + }, + { + "epoch": 0.5125621032903693, + "grad_norm": 3.052842140197754, + "learning_rate": 4.145729827849384e-05, + "loss": 0.8278, + "step": 57980 + }, + { + "epoch": 0.5126505065506816, + "grad_norm": 4.2541632652282715, + "learning_rate": 4.145582489082198e-05, + "loss": 0.6927, + "step": 57990 + }, + { + "epoch": 0.5127389098109938, + "grad_norm": 3.4220988750457764, + "learning_rate": 4.145435150315011e-05, + "loss": 0.6728, + "step": 58000 + }, + { + "epoch": 0.512827313071306, + "grad_norm": 1.8903169631958008, + "learning_rate": 4.1452878115478235e-05, + "loss": 0.7378, + "step": 58010 + }, + { + "epoch": 0.5129157163316184, + "grad_norm": 1.5866371393203735, + "learning_rate": 4.1451404727806363e-05, + "loss": 0.8031, + "step": 58020 + }, + { + "epoch": 0.5130041195919306, + "grad_norm": 6.577564239501953, + "learning_rate": 4.144993134013449e-05, + "loss": 0.7508, + "step": 58030 + }, + { + "epoch": 0.5130925228522428, + "grad_norm": 4.205037593841553, + "learning_rate": 4.144845795246262e-05, + "loss": 0.7304, + "step": 58040 + }, + { + "epoch": 0.5131809261125551, + "grad_norm": 6.128932476043701, + "learning_rate": 4.1446984564790755e-05, + "loss": 0.6685, + "step": 58050 + }, + { + "epoch": 0.5132693293728673, + "grad_norm": 7.909671306610107, + "learning_rate": 4.144551117711888e-05, + "loss": 0.6849, + "step": 58060 + }, + { + "epoch": 0.5133577326331795, + "grad_norm": 2.2326576709747314, + "learning_rate": 4.144403778944701e-05, + "loss": 0.6745, + "step": 58070 + }, + { + "epoch": 0.5134461358934918, + "grad_norm": 2.6070303916931152, + "learning_rate": 4.144256440177514e-05, + "loss": 0.5746, + "step": 58080 + }, + { + "epoch": 0.513534539153804, + "grad_norm": 4.542726993560791, + "learning_rate": 4.144109101410327e-05, + "loss": 0.683, + "step": 58090 + }, + { + "epoch": 0.5136229424141162, + "grad_norm": 1.4204988479614258, + "learning_rate": 4.14396176264314e-05, + "loss": 0.6697, + "step": 58100 + }, + { + "epoch": 0.5137113456744284, + "grad_norm": 2.0310754776000977, + "learning_rate": 4.143814423875953e-05, + "loss": 0.734, + "step": 58110 + }, + { + "epoch": 0.5137997489347407, + "grad_norm": 3.9325571060180664, + "learning_rate": 4.1436670851087654e-05, + "loss": 0.7326, + "step": 58120 + }, + { + "epoch": 0.5138881521950529, + "grad_norm": 4.3330206871032715, + "learning_rate": 4.143519746341579e-05, + "loss": 0.8025, + "step": 58130 + }, + { + "epoch": 0.5139765554553652, + "grad_norm": 4.2222514152526855, + "learning_rate": 4.143372407574391e-05, + "loss": 0.7605, + "step": 58140 + }, + { + "epoch": 0.5140649587156775, + "grad_norm": 0.9864398837089539, + "learning_rate": 4.1432250688072046e-05, + "loss": 0.6808, + "step": 58150 + }, + { + "epoch": 0.5141533619759897, + "grad_norm": 3.626385450363159, + "learning_rate": 4.1430777300400174e-05, + "loss": 0.6332, + "step": 58160 + }, + { + "epoch": 0.5142417652363019, + "grad_norm": 1.2500076293945312, + "learning_rate": 4.14293039127283e-05, + "loss": 0.6963, + "step": 58170 + }, + { + "epoch": 0.5143301684966142, + "grad_norm": 9.736292839050293, + "learning_rate": 4.142783052505643e-05, + "loss": 0.6112, + "step": 58180 + }, + { + "epoch": 0.5144185717569264, + "grad_norm": 3.2459876537323, + "learning_rate": 4.1426357137384566e-05, + "loss": 0.7272, + "step": 58190 + }, + { + "epoch": 0.5145069750172386, + "grad_norm": 1.8464261293411255, + "learning_rate": 4.142488374971269e-05, + "loss": 0.6929, + "step": 58200 + }, + { + "epoch": 0.5145953782775509, + "grad_norm": 2.659166097640991, + "learning_rate": 4.142341036204082e-05, + "loss": 0.6395, + "step": 58210 + }, + { + "epoch": 0.5146837815378631, + "grad_norm": 3.726670503616333, + "learning_rate": 4.142193697436895e-05, + "loss": 0.7603, + "step": 58220 + }, + { + "epoch": 0.5147721847981753, + "grad_norm": 7.228145122528076, + "learning_rate": 4.142046358669708e-05, + "loss": 0.7084, + "step": 58230 + }, + { + "epoch": 0.5148605880584876, + "grad_norm": 2.7289206981658936, + "learning_rate": 4.141899019902521e-05, + "loss": 0.8021, + "step": 58240 + }, + { + "epoch": 0.5149489913187998, + "grad_norm": 4.535951614379883, + "learning_rate": 4.141751681135334e-05, + "loss": 0.7113, + "step": 58250 + }, + { + "epoch": 0.5150373945791121, + "grad_norm": 7.421276569366455, + "learning_rate": 4.1416043423681464e-05, + "loss": 0.7187, + "step": 58260 + }, + { + "epoch": 0.5151257978394244, + "grad_norm": 11.996021270751953, + "learning_rate": 4.14145700360096e-05, + "loss": 0.7444, + "step": 58270 + }, + { + "epoch": 0.5152142010997366, + "grad_norm": 4.288655757904053, + "learning_rate": 4.141309664833772e-05, + "loss": 0.5822, + "step": 58280 + }, + { + "epoch": 0.5153026043600488, + "grad_norm": 6.721730709075928, + "learning_rate": 4.1411623260665856e-05, + "loss": 0.7321, + "step": 58290 + }, + { + "epoch": 0.515391007620361, + "grad_norm": 8.89136028289795, + "learning_rate": 4.1410149872993984e-05, + "loss": 0.6965, + "step": 58300 + }, + { + "epoch": 0.5154794108806733, + "grad_norm": 9.430233001708984, + "learning_rate": 4.140867648532211e-05, + "loss": 0.6737, + "step": 58310 + }, + { + "epoch": 0.5155678141409855, + "grad_norm": 4.176496505737305, + "learning_rate": 4.140720309765024e-05, + "loss": 0.6452, + "step": 58320 + }, + { + "epoch": 0.5156562174012977, + "grad_norm": 4.162666320800781, + "learning_rate": 4.1405729709978376e-05, + "loss": 0.6797, + "step": 58330 + }, + { + "epoch": 0.51574462066161, + "grad_norm": 7.322152137756348, + "learning_rate": 4.14042563223065e-05, + "loss": 0.7263, + "step": 58340 + }, + { + "epoch": 0.5158330239219222, + "grad_norm": 3.2498905658721924, + "learning_rate": 4.140278293463463e-05, + "loss": 0.7062, + "step": 58350 + }, + { + "epoch": 0.5159214271822344, + "grad_norm": 1.6769343614578247, + "learning_rate": 4.140130954696276e-05, + "loss": 0.556, + "step": 58360 + }, + { + "epoch": 0.5160098304425467, + "grad_norm": 4.269484996795654, + "learning_rate": 4.139983615929089e-05, + "loss": 0.653, + "step": 58370 + }, + { + "epoch": 0.516098233702859, + "grad_norm": 4.248597145080566, + "learning_rate": 4.139836277161902e-05, + "loss": 0.6451, + "step": 58380 + }, + { + "epoch": 0.5161866369631712, + "grad_norm": 3.693969249725342, + "learning_rate": 4.1396889383947146e-05, + "loss": 0.7437, + "step": 58390 + }, + { + "epoch": 0.5162750402234835, + "grad_norm": 2.039949893951416, + "learning_rate": 4.1395415996275275e-05, + "loss": 0.5287, + "step": 58400 + }, + { + "epoch": 0.5163634434837957, + "grad_norm": 5.918385028839111, + "learning_rate": 4.139394260860341e-05, + "loss": 0.8401, + "step": 58410 + }, + { + "epoch": 0.5164518467441079, + "grad_norm": 0.8016469478607178, + "learning_rate": 4.139246922093153e-05, + "loss": 0.7515, + "step": 58420 + }, + { + "epoch": 0.5165402500044202, + "grad_norm": 3.3638181686401367, + "learning_rate": 4.139099583325967e-05, + "loss": 0.6435, + "step": 58430 + }, + { + "epoch": 0.5166286532647324, + "grad_norm": 2.5616002082824707, + "learning_rate": 4.1389522445587795e-05, + "loss": 0.8031, + "step": 58440 + }, + { + "epoch": 0.5167170565250446, + "grad_norm": 6.8796491622924805, + "learning_rate": 4.138804905791592e-05, + "loss": 0.672, + "step": 58450 + }, + { + "epoch": 0.5168054597853569, + "grad_norm": 4.082709789276123, + "learning_rate": 4.138657567024405e-05, + "loss": 0.7497, + "step": 58460 + }, + { + "epoch": 0.5168938630456691, + "grad_norm": 7.032834529876709, + "learning_rate": 4.138510228257219e-05, + "loss": 0.6995, + "step": 58470 + }, + { + "epoch": 0.5169822663059813, + "grad_norm": 3.5431320667266846, + "learning_rate": 4.138362889490031e-05, + "loss": 0.8405, + "step": 58480 + }, + { + "epoch": 0.5170706695662936, + "grad_norm": 3.9321630001068115, + "learning_rate": 4.1382155507228444e-05, + "loss": 0.7096, + "step": 58490 + }, + { + "epoch": 0.5171590728266059, + "grad_norm": 3.8108572959899902, + "learning_rate": 4.138068211955657e-05, + "loss": 0.7797, + "step": 58500 + }, + { + "epoch": 0.5172474760869181, + "grad_norm": 3.7932891845703125, + "learning_rate": 4.13792087318847e-05, + "loss": 0.7674, + "step": 58510 + }, + { + "epoch": 0.5173358793472304, + "grad_norm": 5.418814659118652, + "learning_rate": 4.137773534421283e-05, + "loss": 0.6954, + "step": 58520 + }, + { + "epoch": 0.5174242826075426, + "grad_norm": 2.8099708557128906, + "learning_rate": 4.137626195654096e-05, + "loss": 0.6776, + "step": 58530 + }, + { + "epoch": 0.5175126858678548, + "grad_norm": 2.9042539596557617, + "learning_rate": 4.1374788568869085e-05, + "loss": 0.6521, + "step": 58540 + }, + { + "epoch": 0.517601089128167, + "grad_norm": 8.134185791015625, + "learning_rate": 4.137331518119722e-05, + "loss": 0.7712, + "step": 58550 + }, + { + "epoch": 0.5176894923884793, + "grad_norm": 4.098917484283447, + "learning_rate": 4.137184179352535e-05, + "loss": 0.6245, + "step": 58560 + }, + { + "epoch": 0.5177778956487915, + "grad_norm": 6.311615943908691, + "learning_rate": 4.137036840585348e-05, + "loss": 0.888, + "step": 58570 + }, + { + "epoch": 0.5178662989091037, + "grad_norm": 0.7927438020706177, + "learning_rate": 4.1368895018181606e-05, + "loss": 0.662, + "step": 58580 + }, + { + "epoch": 0.517954702169416, + "grad_norm": 5.305534839630127, + "learning_rate": 4.1367421630509734e-05, + "loss": 0.6315, + "step": 58590 + }, + { + "epoch": 0.5180431054297282, + "grad_norm": 2.211588144302368, + "learning_rate": 4.136594824283786e-05, + "loss": 0.6306, + "step": 58600 + }, + { + "epoch": 0.5181315086900405, + "grad_norm": 3.837446928024292, + "learning_rate": 4.136447485516599e-05, + "loss": 0.6584, + "step": 58610 + }, + { + "epoch": 0.5182199119503528, + "grad_norm": 3.016932487487793, + "learning_rate": 4.1363001467494126e-05, + "loss": 0.7476, + "step": 58620 + }, + { + "epoch": 0.518308315210665, + "grad_norm": 2.5306055545806885, + "learning_rate": 4.1361528079822254e-05, + "loss": 0.774, + "step": 58630 + }, + { + "epoch": 0.5183967184709772, + "grad_norm": 12.529138565063477, + "learning_rate": 4.136005469215038e-05, + "loss": 0.6646, + "step": 58640 + }, + { + "epoch": 0.5184851217312895, + "grad_norm": 2.591942071914673, + "learning_rate": 4.135858130447851e-05, + "loss": 0.7123, + "step": 58650 + }, + { + "epoch": 0.5185735249916017, + "grad_norm": 9.732890129089355, + "learning_rate": 4.135710791680664e-05, + "loss": 0.7201, + "step": 58660 + }, + { + "epoch": 0.5186619282519139, + "grad_norm": 43.34449768066406, + "learning_rate": 4.135563452913477e-05, + "loss": 0.6637, + "step": 58670 + }, + { + "epoch": 0.5187503315122262, + "grad_norm": 2.275779962539673, + "learning_rate": 4.13541611414629e-05, + "loss": 0.6233, + "step": 58680 + }, + { + "epoch": 0.5188387347725384, + "grad_norm": 6.794007301330566, + "learning_rate": 4.135268775379103e-05, + "loss": 0.7058, + "step": 58690 + }, + { + "epoch": 0.5189271380328506, + "grad_norm": 6.4937052726745605, + "learning_rate": 4.135121436611916e-05, + "loss": 0.7063, + "step": 58700 + }, + { + "epoch": 0.5190155412931629, + "grad_norm": 4.3119072914123535, + "learning_rate": 4.134974097844729e-05, + "loss": 0.7617, + "step": 58710 + }, + { + "epoch": 0.5191039445534751, + "grad_norm": 2.4573779106140137, + "learning_rate": 4.1348267590775416e-05, + "loss": 0.7211, + "step": 58720 + }, + { + "epoch": 0.5191923478137874, + "grad_norm": 1.5036373138427734, + "learning_rate": 4.1346794203103544e-05, + "loss": 0.6407, + "step": 58730 + }, + { + "epoch": 0.5192807510740997, + "grad_norm": 5.157151222229004, + "learning_rate": 4.134532081543168e-05, + "loss": 0.7112, + "step": 58740 + }, + { + "epoch": 0.5193691543344119, + "grad_norm": 4.024260997772217, + "learning_rate": 4.13438474277598e-05, + "loss": 0.5759, + "step": 58750 + }, + { + "epoch": 0.5194575575947241, + "grad_norm": 6.83284330368042, + "learning_rate": 4.1342374040087936e-05, + "loss": 0.598, + "step": 58760 + }, + { + "epoch": 0.5195459608550363, + "grad_norm": 2.6124684810638428, + "learning_rate": 4.1340900652416065e-05, + "loss": 0.7998, + "step": 58770 + }, + { + "epoch": 0.5196343641153486, + "grad_norm": 11.75877571105957, + "learning_rate": 4.133942726474419e-05, + "loss": 0.7857, + "step": 58780 + }, + { + "epoch": 0.5197227673756608, + "grad_norm": 5.469943046569824, + "learning_rate": 4.133795387707232e-05, + "loss": 0.6067, + "step": 58790 + }, + { + "epoch": 0.519811170635973, + "grad_norm": 8.962861061096191, + "learning_rate": 4.1336480489400456e-05, + "loss": 0.5748, + "step": 58800 + }, + { + "epoch": 0.5198995738962853, + "grad_norm": 5.187961578369141, + "learning_rate": 4.133500710172858e-05, + "loss": 0.7629, + "step": 58810 + }, + { + "epoch": 0.5199879771565975, + "grad_norm": 10.618610382080078, + "learning_rate": 4.133353371405671e-05, + "loss": 0.8602, + "step": 58820 + }, + { + "epoch": 0.5200763804169097, + "grad_norm": 4.766876220703125, + "learning_rate": 4.133206032638484e-05, + "loss": 0.6604, + "step": 58830 + }, + { + "epoch": 0.520164783677222, + "grad_norm": 2.7952160835266113, + "learning_rate": 4.133058693871297e-05, + "loss": 0.6609, + "step": 58840 + }, + { + "epoch": 0.5202531869375343, + "grad_norm": 1.3467514514923096, + "learning_rate": 4.13291135510411e-05, + "loss": 0.6487, + "step": 58850 + }, + { + "epoch": 0.5203415901978465, + "grad_norm": 12.943243980407715, + "learning_rate": 4.1327640163369227e-05, + "loss": 0.5737, + "step": 58860 + }, + { + "epoch": 0.5204299934581588, + "grad_norm": 10.962312698364258, + "learning_rate": 4.1326166775697355e-05, + "loss": 0.7685, + "step": 58870 + }, + { + "epoch": 0.520518396718471, + "grad_norm": 2.4418134689331055, + "learning_rate": 4.132469338802549e-05, + "loss": 0.698, + "step": 58880 + }, + { + "epoch": 0.5206067999787832, + "grad_norm": 4.1538238525390625, + "learning_rate": 4.132322000035361e-05, + "loss": 0.7381, + "step": 58890 + }, + { + "epoch": 0.5206952032390955, + "grad_norm": 5.691303253173828, + "learning_rate": 4.132174661268175e-05, + "loss": 0.6875, + "step": 58900 + }, + { + "epoch": 0.5207836064994077, + "grad_norm": 3.1926708221435547, + "learning_rate": 4.1320273225009875e-05, + "loss": 0.6794, + "step": 58910 + }, + { + "epoch": 0.5208720097597199, + "grad_norm": 8.986908912658691, + "learning_rate": 4.1318799837338003e-05, + "loss": 0.6556, + "step": 58920 + }, + { + "epoch": 0.5209604130200322, + "grad_norm": 7.45997953414917, + "learning_rate": 4.131732644966613e-05, + "loss": 0.6706, + "step": 58930 + }, + { + "epoch": 0.5210488162803444, + "grad_norm": 3.960982322692871, + "learning_rate": 4.131585306199427e-05, + "loss": 0.7887, + "step": 58940 + }, + { + "epoch": 0.5211372195406566, + "grad_norm": 2.496504068374634, + "learning_rate": 4.131437967432239e-05, + "loss": 0.7178, + "step": 58950 + }, + { + "epoch": 0.5212256228009688, + "grad_norm": 1.9526597261428833, + "learning_rate": 4.1312906286650524e-05, + "loss": 0.5851, + "step": 58960 + }, + { + "epoch": 0.5213140260612812, + "grad_norm": 9.463976860046387, + "learning_rate": 4.1311432898978645e-05, + "loss": 0.7451, + "step": 58970 + }, + { + "epoch": 0.5214024293215934, + "grad_norm": 7.764998912811279, + "learning_rate": 4.130995951130678e-05, + "loss": 0.8033, + "step": 58980 + }, + { + "epoch": 0.5214908325819057, + "grad_norm": 1.8823398351669312, + "learning_rate": 4.130848612363491e-05, + "loss": 0.5242, + "step": 58990 + }, + { + "epoch": 0.5215792358422179, + "grad_norm": 2.699099063873291, + "learning_rate": 4.130701273596304e-05, + "loss": 0.7504, + "step": 59000 + }, + { + "epoch": 0.5216676391025301, + "grad_norm": 1.5988144874572754, + "learning_rate": 4.1305539348291165e-05, + "loss": 0.6925, + "step": 59010 + }, + { + "epoch": 0.5217560423628423, + "grad_norm": 2.170100688934326, + "learning_rate": 4.13040659606193e-05, + "loss": 0.7847, + "step": 59020 + }, + { + "epoch": 0.5218444456231546, + "grad_norm": 4.162135601043701, + "learning_rate": 4.130259257294742e-05, + "loss": 0.6309, + "step": 59030 + }, + { + "epoch": 0.5219328488834668, + "grad_norm": 7.490842819213867, + "learning_rate": 4.130111918527556e-05, + "loss": 0.6931, + "step": 59040 + }, + { + "epoch": 0.522021252143779, + "grad_norm": 1.2390353679656982, + "learning_rate": 4.1299645797603686e-05, + "loss": 0.7108, + "step": 59050 + }, + { + "epoch": 0.5221096554040913, + "grad_norm": 1.089184045791626, + "learning_rate": 4.1298172409931814e-05, + "loss": 0.6427, + "step": 59060 + }, + { + "epoch": 0.5221980586644035, + "grad_norm": 4.4747819900512695, + "learning_rate": 4.129669902225994e-05, + "loss": 0.8834, + "step": 59070 + }, + { + "epoch": 0.5222864619247157, + "grad_norm": 1.8252582550048828, + "learning_rate": 4.129522563458808e-05, + "loss": 0.7283, + "step": 59080 + }, + { + "epoch": 0.5223748651850281, + "grad_norm": 3.516965389251709, + "learning_rate": 4.12937522469162e-05, + "loss": 0.7267, + "step": 59090 + }, + { + "epoch": 0.5224632684453403, + "grad_norm": 8.140121459960938, + "learning_rate": 4.1292278859244334e-05, + "loss": 0.7086, + "step": 59100 + }, + { + "epoch": 0.5225516717056525, + "grad_norm": 1.565888524055481, + "learning_rate": 4.1290805471572456e-05, + "loss": 0.6313, + "step": 59110 + }, + { + "epoch": 0.5226400749659648, + "grad_norm": 7.233582973480225, + "learning_rate": 4.128933208390059e-05, + "loss": 0.7085, + "step": 59120 + }, + { + "epoch": 0.522728478226277, + "grad_norm": 2.30755877494812, + "learning_rate": 4.128785869622872e-05, + "loss": 0.6304, + "step": 59130 + }, + { + "epoch": 0.5228168814865892, + "grad_norm": 1.746673345565796, + "learning_rate": 4.128638530855685e-05, + "loss": 0.711, + "step": 59140 + }, + { + "epoch": 0.5229052847469015, + "grad_norm": 6.198861598968506, + "learning_rate": 4.1284911920884976e-05, + "loss": 0.7649, + "step": 59150 + }, + { + "epoch": 0.5229936880072137, + "grad_norm": 5.0859150886535645, + "learning_rate": 4.128343853321311e-05, + "loss": 0.8103, + "step": 59160 + }, + { + "epoch": 0.5230820912675259, + "grad_norm": 7.284816265106201, + "learning_rate": 4.128196514554123e-05, + "loss": 0.6774, + "step": 59170 + }, + { + "epoch": 0.5231704945278381, + "grad_norm": 10.868987083435059, + "learning_rate": 4.128049175786937e-05, + "loss": 0.6578, + "step": 59180 + }, + { + "epoch": 0.5232588977881504, + "grad_norm": 4.1673431396484375, + "learning_rate": 4.1279018370197496e-05, + "loss": 0.7689, + "step": 59190 + }, + { + "epoch": 0.5233473010484627, + "grad_norm": 7.5632147789001465, + "learning_rate": 4.1277544982525624e-05, + "loss": 0.6873, + "step": 59200 + }, + { + "epoch": 0.523435704308775, + "grad_norm": 5.527274131774902, + "learning_rate": 4.127607159485375e-05, + "loss": 0.629, + "step": 59210 + }, + { + "epoch": 0.5235241075690872, + "grad_norm": 2.827582359313965, + "learning_rate": 4.127459820718188e-05, + "loss": 0.7302, + "step": 59220 + }, + { + "epoch": 0.5236125108293994, + "grad_norm": 0.5756442546844482, + "learning_rate": 4.127312481951001e-05, + "loss": 0.6671, + "step": 59230 + }, + { + "epoch": 0.5237009140897116, + "grad_norm": 11.035965919494629, + "learning_rate": 4.1271651431838145e-05, + "loss": 0.611, + "step": 59240 + }, + { + "epoch": 0.5237893173500239, + "grad_norm": 9.307296752929688, + "learning_rate": 4.1270178044166266e-05, + "loss": 0.6444, + "step": 59250 + }, + { + "epoch": 0.5238777206103361, + "grad_norm": 2.356525182723999, + "learning_rate": 4.12687046564944e-05, + "loss": 0.6942, + "step": 59260 + }, + { + "epoch": 0.5239661238706483, + "grad_norm": 3.160883665084839, + "learning_rate": 4.126723126882253e-05, + "loss": 0.6771, + "step": 59270 + }, + { + "epoch": 0.5240545271309606, + "grad_norm": 6.4535112380981445, + "learning_rate": 4.126575788115066e-05, + "loss": 0.7218, + "step": 59280 + }, + { + "epoch": 0.5241429303912728, + "grad_norm": 9.797268867492676, + "learning_rate": 4.1264284493478786e-05, + "loss": 0.8082, + "step": 59290 + }, + { + "epoch": 0.524231333651585, + "grad_norm": 3.2987751960754395, + "learning_rate": 4.126281110580692e-05, + "loss": 0.6652, + "step": 59300 + }, + { + "epoch": 0.5243197369118973, + "grad_norm": 1.5817397832870483, + "learning_rate": 4.126133771813504e-05, + "loss": 0.7169, + "step": 59310 + }, + { + "epoch": 0.5244081401722096, + "grad_norm": 4.6767754554748535, + "learning_rate": 4.125986433046318e-05, + "loss": 0.8169, + "step": 59320 + }, + { + "epoch": 0.5244965434325218, + "grad_norm": 8.094579696655273, + "learning_rate": 4.12583909427913e-05, + "loss": 0.6755, + "step": 59330 + }, + { + "epoch": 0.5245849466928341, + "grad_norm": 4.5218682289123535, + "learning_rate": 4.1256917555119435e-05, + "loss": 0.6427, + "step": 59340 + }, + { + "epoch": 0.5246733499531463, + "grad_norm": 3.401343822479248, + "learning_rate": 4.125544416744756e-05, + "loss": 0.6054, + "step": 59350 + }, + { + "epoch": 0.5247617532134585, + "grad_norm": 3.321723222732544, + "learning_rate": 4.125397077977569e-05, + "loss": 0.8062, + "step": 59360 + }, + { + "epoch": 0.5248501564737708, + "grad_norm": 10.014087677001953, + "learning_rate": 4.125249739210382e-05, + "loss": 0.6035, + "step": 59370 + }, + { + "epoch": 0.524938559734083, + "grad_norm": 1.1051753759384155, + "learning_rate": 4.1251024004431955e-05, + "loss": 0.6847, + "step": 59380 + }, + { + "epoch": 0.5250269629943952, + "grad_norm": 6.683419704437256, + "learning_rate": 4.124955061676008e-05, + "loss": 0.6571, + "step": 59390 + }, + { + "epoch": 0.5251153662547074, + "grad_norm": 4.3637261390686035, + "learning_rate": 4.124807722908821e-05, + "loss": 0.872, + "step": 59400 + }, + { + "epoch": 0.5252037695150197, + "grad_norm": 5.4338579177856445, + "learning_rate": 4.124660384141634e-05, + "loss": 0.693, + "step": 59410 + }, + { + "epoch": 0.5252921727753319, + "grad_norm": 4.866839408874512, + "learning_rate": 4.124513045374447e-05, + "loss": 0.7539, + "step": 59420 + }, + { + "epoch": 0.5253805760356441, + "grad_norm": 2.122809410095215, + "learning_rate": 4.12436570660726e-05, + "loss": 0.752, + "step": 59430 + }, + { + "epoch": 0.5254689792959565, + "grad_norm": 6.219788551330566, + "learning_rate": 4.1242183678400725e-05, + "loss": 0.6567, + "step": 59440 + }, + { + "epoch": 0.5255573825562687, + "grad_norm": 3.0866973400115967, + "learning_rate": 4.1240710290728854e-05, + "loss": 0.6744, + "step": 59450 + }, + { + "epoch": 0.525645785816581, + "grad_norm": 1.6338080167770386, + "learning_rate": 4.123923690305699e-05, + "loss": 0.6373, + "step": 59460 + }, + { + "epoch": 0.5257341890768932, + "grad_norm": 13.409435272216797, + "learning_rate": 4.123776351538512e-05, + "loss": 0.6503, + "step": 59470 + }, + { + "epoch": 0.5258225923372054, + "grad_norm": 10.768632888793945, + "learning_rate": 4.1236290127713245e-05, + "loss": 0.7392, + "step": 59480 + }, + { + "epoch": 0.5259109955975176, + "grad_norm": 2.5887343883514404, + "learning_rate": 4.1234816740041374e-05, + "loss": 0.7239, + "step": 59490 + }, + { + "epoch": 0.5259993988578299, + "grad_norm": 7.033324241638184, + "learning_rate": 4.12333433523695e-05, + "loss": 0.7645, + "step": 59500 + }, + { + "epoch": 0.5260878021181421, + "grad_norm": 16.299779891967773, + "learning_rate": 4.123186996469763e-05, + "loss": 0.6919, + "step": 59510 + }, + { + "epoch": 0.5261762053784543, + "grad_norm": 5.943434238433838, + "learning_rate": 4.1230396577025766e-05, + "loss": 0.5952, + "step": 59520 + }, + { + "epoch": 0.5262646086387666, + "grad_norm": 4.512763023376465, + "learning_rate": 4.1228923189353894e-05, + "loss": 0.6243, + "step": 59530 + }, + { + "epoch": 0.5263530118990788, + "grad_norm": 4.6850433349609375, + "learning_rate": 4.122744980168202e-05, + "loss": 0.7236, + "step": 59540 + }, + { + "epoch": 0.526441415159391, + "grad_norm": 1.1690351963043213, + "learning_rate": 4.122597641401015e-05, + "loss": 0.7844, + "step": 59550 + }, + { + "epoch": 0.5265298184197034, + "grad_norm": 3.369765281677246, + "learning_rate": 4.122450302633828e-05, + "loss": 0.6246, + "step": 59560 + }, + { + "epoch": 0.5266182216800156, + "grad_norm": 3.848281145095825, + "learning_rate": 4.122302963866641e-05, + "loss": 0.658, + "step": 59570 + }, + { + "epoch": 0.5267066249403278, + "grad_norm": 4.382859706878662, + "learning_rate": 4.1221556250994536e-05, + "loss": 0.6938, + "step": 59580 + }, + { + "epoch": 0.5267950282006401, + "grad_norm": 5.016404628753662, + "learning_rate": 4.122008286332267e-05, + "loss": 0.75, + "step": 59590 + }, + { + "epoch": 0.5268834314609523, + "grad_norm": 4.435308933258057, + "learning_rate": 4.12186094756508e-05, + "loss": 0.7533, + "step": 59600 + }, + { + "epoch": 0.5269718347212645, + "grad_norm": 1.7211618423461914, + "learning_rate": 4.121713608797893e-05, + "loss": 0.7225, + "step": 59610 + }, + { + "epoch": 0.5270602379815768, + "grad_norm": 3.0837252140045166, + "learning_rate": 4.1215662700307056e-05, + "loss": 0.8015, + "step": 59620 + }, + { + "epoch": 0.527148641241889, + "grad_norm": 1.7684383392333984, + "learning_rate": 4.1214189312635184e-05, + "loss": 0.6607, + "step": 59630 + }, + { + "epoch": 0.5272370445022012, + "grad_norm": 1.327669620513916, + "learning_rate": 4.121271592496331e-05, + "loss": 0.7018, + "step": 59640 + }, + { + "epoch": 0.5273254477625134, + "grad_norm": 2.8634445667266846, + "learning_rate": 4.121124253729145e-05, + "loss": 0.6262, + "step": 59650 + }, + { + "epoch": 0.5274138510228257, + "grad_norm": 5.957954406738281, + "learning_rate": 4.1209769149619576e-05, + "loss": 0.527, + "step": 59660 + }, + { + "epoch": 0.527502254283138, + "grad_norm": 5.212845802307129, + "learning_rate": 4.1208295761947705e-05, + "loss": 0.7562, + "step": 59670 + }, + { + "epoch": 0.5275906575434502, + "grad_norm": 2.8976666927337646, + "learning_rate": 4.120682237427583e-05, + "loss": 0.5941, + "step": 59680 + }, + { + "epoch": 0.5276790608037625, + "grad_norm": 1.9253665208816528, + "learning_rate": 4.120534898660396e-05, + "loss": 0.5756, + "step": 59690 + }, + { + "epoch": 0.5277674640640747, + "grad_norm": 3.331376552581787, + "learning_rate": 4.120387559893209e-05, + "loss": 0.5925, + "step": 59700 + }, + { + "epoch": 0.5278558673243869, + "grad_norm": 7.005678176879883, + "learning_rate": 4.1202402211260225e-05, + "loss": 0.6466, + "step": 59710 + }, + { + "epoch": 0.5279442705846992, + "grad_norm": 5.357455730438232, + "learning_rate": 4.1200928823588346e-05, + "loss": 0.5732, + "step": 59720 + }, + { + "epoch": 0.5280326738450114, + "grad_norm": 1.7362685203552246, + "learning_rate": 4.119945543591648e-05, + "loss": 0.6216, + "step": 59730 + }, + { + "epoch": 0.5281210771053236, + "grad_norm": 2.231977939605713, + "learning_rate": 4.119798204824461e-05, + "loss": 0.6929, + "step": 59740 + }, + { + "epoch": 0.5282094803656359, + "grad_norm": 1.708698034286499, + "learning_rate": 4.119650866057274e-05, + "loss": 0.7208, + "step": 59750 + }, + { + "epoch": 0.5282978836259481, + "grad_norm": 5.46640157699585, + "learning_rate": 4.1195035272900866e-05, + "loss": 0.7098, + "step": 59760 + }, + { + "epoch": 0.5283862868862603, + "grad_norm": 7.511179447174072, + "learning_rate": 4.1193561885229e-05, + "loss": 0.6752, + "step": 59770 + }, + { + "epoch": 0.5284746901465726, + "grad_norm": 4.109575271606445, + "learning_rate": 4.119208849755712e-05, + "loss": 0.6387, + "step": 59780 + }, + { + "epoch": 0.5285630934068849, + "grad_norm": 14.311249732971191, + "learning_rate": 4.119061510988526e-05, + "loss": 0.7875, + "step": 59790 + }, + { + "epoch": 0.5286514966671971, + "grad_norm": 1.998884677886963, + "learning_rate": 4.118914172221338e-05, + "loss": 0.6728, + "step": 59800 + }, + { + "epoch": 0.5287398999275094, + "grad_norm": 3.9691755771636963, + "learning_rate": 4.1187668334541515e-05, + "loss": 0.6487, + "step": 59810 + }, + { + "epoch": 0.5288283031878216, + "grad_norm": 14.432732582092285, + "learning_rate": 4.118619494686964e-05, + "loss": 0.7015, + "step": 59820 + }, + { + "epoch": 0.5289167064481338, + "grad_norm": 8.749857902526855, + "learning_rate": 4.118472155919777e-05, + "loss": 0.756, + "step": 59830 + }, + { + "epoch": 0.529005109708446, + "grad_norm": 3.3115224838256836, + "learning_rate": 4.11832481715259e-05, + "loss": 0.7646, + "step": 59840 + }, + { + "epoch": 0.5290935129687583, + "grad_norm": 5.961042881011963, + "learning_rate": 4.1181774783854035e-05, + "loss": 0.6872, + "step": 59850 + }, + { + "epoch": 0.5291819162290705, + "grad_norm": 3.7259011268615723, + "learning_rate": 4.118030139618216e-05, + "loss": 0.6913, + "step": 59860 + }, + { + "epoch": 0.5292703194893827, + "grad_norm": 1.4316565990447998, + "learning_rate": 4.117882800851029e-05, + "loss": 0.8236, + "step": 59870 + }, + { + "epoch": 0.529358722749695, + "grad_norm": 3.7824482917785645, + "learning_rate": 4.117735462083842e-05, + "loss": 0.612, + "step": 59880 + }, + { + "epoch": 0.5294471260100072, + "grad_norm": 6.120148181915283, + "learning_rate": 4.117588123316655e-05, + "loss": 0.7283, + "step": 59890 + }, + { + "epoch": 0.5295355292703194, + "grad_norm": 3.144371271133423, + "learning_rate": 4.117440784549468e-05, + "loss": 0.6866, + "step": 59900 + }, + { + "epoch": 0.5296239325306318, + "grad_norm": 10.815184593200684, + "learning_rate": 4.1172934457822805e-05, + "loss": 0.6136, + "step": 59910 + }, + { + "epoch": 0.529712335790944, + "grad_norm": 5.573663711547852, + "learning_rate": 4.1171461070150934e-05, + "loss": 0.643, + "step": 59920 + }, + { + "epoch": 0.5298007390512562, + "grad_norm": 1.9701824188232422, + "learning_rate": 4.116998768247907e-05, + "loss": 0.7374, + "step": 59930 + }, + { + "epoch": 0.5298891423115685, + "grad_norm": 2.470747232437134, + "learning_rate": 4.116851429480719e-05, + "loss": 0.627, + "step": 59940 + }, + { + "epoch": 0.5299775455718807, + "grad_norm": 2.7850711345672607, + "learning_rate": 4.1167040907135326e-05, + "loss": 0.6567, + "step": 59950 + }, + { + "epoch": 0.5300659488321929, + "grad_norm": 1.6354985237121582, + "learning_rate": 4.1165567519463454e-05, + "loss": 0.6099, + "step": 59960 + }, + { + "epoch": 0.5301543520925052, + "grad_norm": 10.398540496826172, + "learning_rate": 4.116409413179158e-05, + "loss": 0.677, + "step": 59970 + }, + { + "epoch": 0.5302427553528174, + "grad_norm": 3.451448440551758, + "learning_rate": 4.116262074411971e-05, + "loss": 0.6194, + "step": 59980 + }, + { + "epoch": 0.5303311586131296, + "grad_norm": 19.99856948852539, + "learning_rate": 4.1161147356447846e-05, + "loss": 0.7397, + "step": 59990 + }, + { + "epoch": 0.5304195618734419, + "grad_norm": 1.5104225873947144, + "learning_rate": 4.115967396877597e-05, + "loss": 0.6559, + "step": 60000 + }, + { + "epoch": 0.5305079651337541, + "grad_norm": 4.500480651855469, + "learning_rate": 4.11582005811041e-05, + "loss": 0.7364, + "step": 60010 + }, + { + "epoch": 0.5305963683940663, + "grad_norm": 4.798305034637451, + "learning_rate": 4.115672719343223e-05, + "loss": 0.767, + "step": 60020 + }, + { + "epoch": 0.5306847716543787, + "grad_norm": 8.019207954406738, + "learning_rate": 4.115525380576036e-05, + "loss": 0.7647, + "step": 60030 + }, + { + "epoch": 0.5307731749146909, + "grad_norm": 4.901878356933594, + "learning_rate": 4.115378041808849e-05, + "loss": 0.7183, + "step": 60040 + }, + { + "epoch": 0.5308615781750031, + "grad_norm": 3.3756978511810303, + "learning_rate": 4.1152307030416616e-05, + "loss": 0.5972, + "step": 60050 + }, + { + "epoch": 0.5309499814353154, + "grad_norm": 1.3737549781799316, + "learning_rate": 4.1150833642744744e-05, + "loss": 0.6125, + "step": 60060 + }, + { + "epoch": 0.5310383846956276, + "grad_norm": 6.525880813598633, + "learning_rate": 4.114936025507288e-05, + "loss": 0.7338, + "step": 60070 + }, + { + "epoch": 0.5311267879559398, + "grad_norm": 7.533184051513672, + "learning_rate": 4.1147886867401e-05, + "loss": 0.6426, + "step": 60080 + }, + { + "epoch": 0.531215191216252, + "grad_norm": 2.9823904037475586, + "learning_rate": 4.1146413479729136e-05, + "loss": 0.5598, + "step": 60090 + }, + { + "epoch": 0.5313035944765643, + "grad_norm": 1.5166293382644653, + "learning_rate": 4.1144940092057264e-05, + "loss": 0.6468, + "step": 60100 + }, + { + "epoch": 0.5313919977368765, + "grad_norm": 3.1905624866485596, + "learning_rate": 4.114346670438539e-05, + "loss": 0.602, + "step": 60110 + }, + { + "epoch": 0.5314804009971887, + "grad_norm": 1.7269514799118042, + "learning_rate": 4.114199331671352e-05, + "loss": 0.7132, + "step": 60120 + }, + { + "epoch": 0.531568804257501, + "grad_norm": 11.500814437866211, + "learning_rate": 4.1140519929041656e-05, + "loss": 0.6368, + "step": 60130 + }, + { + "epoch": 0.5316572075178132, + "grad_norm": 2.3837459087371826, + "learning_rate": 4.113904654136978e-05, + "loss": 0.6723, + "step": 60140 + }, + { + "epoch": 0.5317456107781255, + "grad_norm": 5.128081321716309, + "learning_rate": 4.113757315369791e-05, + "loss": 0.7357, + "step": 60150 + }, + { + "epoch": 0.5318340140384378, + "grad_norm": 7.816521167755127, + "learning_rate": 4.1136099766026035e-05, + "loss": 0.7343, + "step": 60160 + }, + { + "epoch": 0.53192241729875, + "grad_norm": 6.74944543838501, + "learning_rate": 4.113462637835417e-05, + "loss": 0.7601, + "step": 60170 + }, + { + "epoch": 0.5320108205590622, + "grad_norm": 6.753887176513672, + "learning_rate": 4.11331529906823e-05, + "loss": 0.6861, + "step": 60180 + }, + { + "epoch": 0.5320992238193745, + "grad_norm": 2.1089651584625244, + "learning_rate": 4.1131679603010426e-05, + "loss": 0.5807, + "step": 60190 + }, + { + "epoch": 0.5321876270796867, + "grad_norm": 5.0412445068359375, + "learning_rate": 4.1130206215338555e-05, + "loss": 0.7874, + "step": 60200 + }, + { + "epoch": 0.5322760303399989, + "grad_norm": 3.698671579360962, + "learning_rate": 4.112873282766669e-05, + "loss": 0.5968, + "step": 60210 + }, + { + "epoch": 0.5323644336003112, + "grad_norm": 6.474447727203369, + "learning_rate": 4.112725943999481e-05, + "loss": 0.6826, + "step": 60220 + }, + { + "epoch": 0.5324528368606234, + "grad_norm": 4.3067755699157715, + "learning_rate": 4.1125786052322947e-05, + "loss": 0.6573, + "step": 60230 + }, + { + "epoch": 0.5325412401209356, + "grad_norm": 2.8778164386749268, + "learning_rate": 4.1124312664651075e-05, + "loss": 0.67, + "step": 60240 + }, + { + "epoch": 0.5326296433812479, + "grad_norm": 4.32625150680542, + "learning_rate": 4.11228392769792e-05, + "loss": 0.7479, + "step": 60250 + }, + { + "epoch": 0.5327180466415602, + "grad_norm": 0.9179672598838806, + "learning_rate": 4.112136588930733e-05, + "loss": 0.5865, + "step": 60260 + }, + { + "epoch": 0.5328064499018724, + "grad_norm": 2.5670909881591797, + "learning_rate": 4.111989250163546e-05, + "loss": 0.613, + "step": 60270 + }, + { + "epoch": 0.5328948531621847, + "grad_norm": 5.7348713874816895, + "learning_rate": 4.111841911396359e-05, + "loss": 0.5972, + "step": 60280 + }, + { + "epoch": 0.5329832564224969, + "grad_norm": 1.8548667430877686, + "learning_rate": 4.1116945726291723e-05, + "loss": 0.7069, + "step": 60290 + }, + { + "epoch": 0.5330716596828091, + "grad_norm": 9.18394947052002, + "learning_rate": 4.1115472338619845e-05, + "loss": 0.758, + "step": 60300 + }, + { + "epoch": 0.5331600629431213, + "grad_norm": 8.072030067443848, + "learning_rate": 4.111399895094798e-05, + "loss": 0.6602, + "step": 60310 + }, + { + "epoch": 0.5332484662034336, + "grad_norm": 2.1545324325561523, + "learning_rate": 4.111252556327611e-05, + "loss": 0.6938, + "step": 60320 + }, + { + "epoch": 0.5333368694637458, + "grad_norm": 14.209163665771484, + "learning_rate": 4.111105217560424e-05, + "loss": 0.594, + "step": 60330 + }, + { + "epoch": 0.533425272724058, + "grad_norm": 3.6010241508483887, + "learning_rate": 4.1109578787932365e-05, + "loss": 0.7207, + "step": 60340 + }, + { + "epoch": 0.5335136759843703, + "grad_norm": 4.17183780670166, + "learning_rate": 4.11081054002605e-05, + "loss": 0.7749, + "step": 60350 + }, + { + "epoch": 0.5336020792446825, + "grad_norm": 4.2827887535095215, + "learning_rate": 4.110663201258862e-05, + "loss": 0.5999, + "step": 60360 + }, + { + "epoch": 0.5336904825049947, + "grad_norm": 2.8354761600494385, + "learning_rate": 4.110515862491676e-05, + "loss": 0.7673, + "step": 60370 + }, + { + "epoch": 0.5337788857653071, + "grad_norm": 1.863929033279419, + "learning_rate": 4.1103685237244885e-05, + "loss": 0.6725, + "step": 60380 + }, + { + "epoch": 0.5338672890256193, + "grad_norm": 2.815585136413574, + "learning_rate": 4.1102211849573014e-05, + "loss": 0.6931, + "step": 60390 + }, + { + "epoch": 0.5339556922859315, + "grad_norm": 8.654183387756348, + "learning_rate": 4.110073846190114e-05, + "loss": 0.6496, + "step": 60400 + }, + { + "epoch": 0.5340440955462438, + "grad_norm": 3.421689748764038, + "learning_rate": 4.109926507422927e-05, + "loss": 0.6436, + "step": 60410 + }, + { + "epoch": 0.534132498806556, + "grad_norm": 5.369647026062012, + "learning_rate": 4.10977916865574e-05, + "loss": 0.686, + "step": 60420 + }, + { + "epoch": 0.5342209020668682, + "grad_norm": 2.883557081222534, + "learning_rate": 4.1096318298885534e-05, + "loss": 0.5704, + "step": 60430 + }, + { + "epoch": 0.5343093053271805, + "grad_norm": 4.321925640106201, + "learning_rate": 4.109484491121366e-05, + "loss": 0.776, + "step": 60440 + }, + { + "epoch": 0.5343977085874927, + "grad_norm": 9.737076759338379, + "learning_rate": 4.109337152354179e-05, + "loss": 0.6094, + "step": 60450 + }, + { + "epoch": 0.5344861118478049, + "grad_norm": 2.928046464920044, + "learning_rate": 4.109189813586992e-05, + "loss": 0.6826, + "step": 60460 + }, + { + "epoch": 0.5345745151081172, + "grad_norm": 9.470121383666992, + "learning_rate": 4.109042474819805e-05, + "loss": 0.856, + "step": 60470 + }, + { + "epoch": 0.5346629183684294, + "grad_norm": 4.74775505065918, + "learning_rate": 4.1088951360526176e-05, + "loss": 0.5865, + "step": 60480 + }, + { + "epoch": 0.5347513216287416, + "grad_norm": 4.381826400756836, + "learning_rate": 4.108747797285431e-05, + "loss": 0.7575, + "step": 60490 + }, + { + "epoch": 0.534839724889054, + "grad_norm": 6.804551601409912, + "learning_rate": 4.108600458518244e-05, + "loss": 0.7795, + "step": 60500 + }, + { + "epoch": 0.5349281281493662, + "grad_norm": 1.8940178155899048, + "learning_rate": 4.108453119751057e-05, + "loss": 0.6678, + "step": 60510 + }, + { + "epoch": 0.5350165314096784, + "grad_norm": 2.1449060440063477, + "learning_rate": 4.1083057809838696e-05, + "loss": 0.729, + "step": 60520 + }, + { + "epoch": 0.5351049346699907, + "grad_norm": 3.6761341094970703, + "learning_rate": 4.1081584422166824e-05, + "loss": 0.7945, + "step": 60530 + }, + { + "epoch": 0.5351933379303029, + "grad_norm": 2.263317108154297, + "learning_rate": 4.108011103449495e-05, + "loss": 0.7009, + "step": 60540 + }, + { + "epoch": 0.5352817411906151, + "grad_norm": 3.6665337085723877, + "learning_rate": 4.107863764682308e-05, + "loss": 0.6259, + "step": 60550 + }, + { + "epoch": 0.5353701444509273, + "grad_norm": 1.876232624053955, + "learning_rate": 4.1077164259151216e-05, + "loss": 0.7658, + "step": 60560 + }, + { + "epoch": 0.5354585477112396, + "grad_norm": 3.949385166168213, + "learning_rate": 4.1075690871479344e-05, + "loss": 0.6565, + "step": 60570 + }, + { + "epoch": 0.5355469509715518, + "grad_norm": 2.0023906230926514, + "learning_rate": 4.107421748380747e-05, + "loss": 0.5897, + "step": 60580 + }, + { + "epoch": 0.535635354231864, + "grad_norm": 2.9671292304992676, + "learning_rate": 4.10727440961356e-05, + "loss": 0.6803, + "step": 60590 + }, + { + "epoch": 0.5357237574921763, + "grad_norm": 6.2092604637146, + "learning_rate": 4.107127070846373e-05, + "loss": 0.7583, + "step": 60600 + }, + { + "epoch": 0.5358121607524885, + "grad_norm": 1.2505688667297363, + "learning_rate": 4.106979732079186e-05, + "loss": 0.6547, + "step": 60610 + }, + { + "epoch": 0.5359005640128008, + "grad_norm": 7.783191204071045, + "learning_rate": 4.106832393311999e-05, + "loss": 0.7015, + "step": 60620 + }, + { + "epoch": 0.5359889672731131, + "grad_norm": 9.489867210388184, + "learning_rate": 4.1066850545448115e-05, + "loss": 0.9866, + "step": 60630 + }, + { + "epoch": 0.5360773705334253, + "grad_norm": 2.7540061473846436, + "learning_rate": 4.106537715777625e-05, + "loss": 0.6282, + "step": 60640 + }, + { + "epoch": 0.5361657737937375, + "grad_norm": 5.599301815032959, + "learning_rate": 4.106390377010438e-05, + "loss": 0.6783, + "step": 60650 + }, + { + "epoch": 0.5362541770540498, + "grad_norm": 2.267117738723755, + "learning_rate": 4.1062430382432506e-05, + "loss": 0.8601, + "step": 60660 + }, + { + "epoch": 0.536342580314362, + "grad_norm": 1.5320429801940918, + "learning_rate": 4.1060956994760635e-05, + "loss": 0.5809, + "step": 60670 + }, + { + "epoch": 0.5364309835746742, + "grad_norm": 2.189704656600952, + "learning_rate": 4.105948360708877e-05, + "loss": 0.6703, + "step": 60680 + }, + { + "epoch": 0.5365193868349865, + "grad_norm": 3.8731002807617188, + "learning_rate": 4.105801021941689e-05, + "loss": 0.6857, + "step": 60690 + }, + { + "epoch": 0.5366077900952987, + "grad_norm": 4.347623825073242, + "learning_rate": 4.105653683174503e-05, + "loss": 0.7235, + "step": 60700 + }, + { + "epoch": 0.5366961933556109, + "grad_norm": 4.423962593078613, + "learning_rate": 4.1055063444073155e-05, + "loss": 0.679, + "step": 60710 + }, + { + "epoch": 0.5367845966159231, + "grad_norm": 7.153933525085449, + "learning_rate": 4.105359005640128e-05, + "loss": 0.5527, + "step": 60720 + }, + { + "epoch": 0.5368729998762355, + "grad_norm": 7.4368672370910645, + "learning_rate": 4.105211666872941e-05, + "loss": 0.775, + "step": 60730 + }, + { + "epoch": 0.5369614031365477, + "grad_norm": 4.453962326049805, + "learning_rate": 4.105064328105754e-05, + "loss": 0.7614, + "step": 60740 + }, + { + "epoch": 0.53704980639686, + "grad_norm": 4.159331798553467, + "learning_rate": 4.104916989338567e-05, + "loss": 0.5699, + "step": 60750 + }, + { + "epoch": 0.5371382096571722, + "grad_norm": 4.9161248207092285, + "learning_rate": 4.1047696505713804e-05, + "loss": 0.7272, + "step": 60760 + }, + { + "epoch": 0.5372266129174844, + "grad_norm": 4.475187301635742, + "learning_rate": 4.1046223118041925e-05, + "loss": 0.7538, + "step": 60770 + }, + { + "epoch": 0.5373150161777966, + "grad_norm": 4.353623390197754, + "learning_rate": 4.104474973037006e-05, + "loss": 0.7292, + "step": 60780 + }, + { + "epoch": 0.5374034194381089, + "grad_norm": 9.009878158569336, + "learning_rate": 4.104327634269819e-05, + "loss": 0.6242, + "step": 60790 + }, + { + "epoch": 0.5374918226984211, + "grad_norm": 1.3772538900375366, + "learning_rate": 4.104180295502632e-05, + "loss": 0.6964, + "step": 60800 + }, + { + "epoch": 0.5375802259587333, + "grad_norm": 2.8624658584594727, + "learning_rate": 4.1040329567354445e-05, + "loss": 0.7204, + "step": 60810 + }, + { + "epoch": 0.5376686292190456, + "grad_norm": 3.146087408065796, + "learning_rate": 4.103885617968258e-05, + "loss": 0.6147, + "step": 60820 + }, + { + "epoch": 0.5377570324793578, + "grad_norm": 2.5250422954559326, + "learning_rate": 4.10373827920107e-05, + "loss": 0.7273, + "step": 60830 + }, + { + "epoch": 0.53784543573967, + "grad_norm": 4.4286699295043945, + "learning_rate": 4.103590940433884e-05, + "loss": 0.7399, + "step": 60840 + }, + { + "epoch": 0.5379338389999824, + "grad_norm": 3.8323612213134766, + "learning_rate": 4.103443601666696e-05, + "loss": 0.6672, + "step": 60850 + }, + { + "epoch": 0.5380222422602946, + "grad_norm": 1.406678557395935, + "learning_rate": 4.1032962628995094e-05, + "loss": 0.6859, + "step": 60860 + }, + { + "epoch": 0.5381106455206068, + "grad_norm": 3.1148641109466553, + "learning_rate": 4.103148924132322e-05, + "loss": 0.7467, + "step": 60870 + }, + { + "epoch": 0.5381990487809191, + "grad_norm": 5.790852069854736, + "learning_rate": 4.103001585365135e-05, + "loss": 0.6747, + "step": 60880 + }, + { + "epoch": 0.5382874520412313, + "grad_norm": 3.980956554412842, + "learning_rate": 4.102854246597948e-05, + "loss": 0.7802, + "step": 60890 + }, + { + "epoch": 0.5383758553015435, + "grad_norm": 5.957650184631348, + "learning_rate": 4.1027069078307614e-05, + "loss": 0.8262, + "step": 60900 + }, + { + "epoch": 0.5384642585618558, + "grad_norm": 2.129995822906494, + "learning_rate": 4.1025595690635736e-05, + "loss": 0.5558, + "step": 60910 + }, + { + "epoch": 0.538552661822168, + "grad_norm": 3.5420641899108887, + "learning_rate": 4.102412230296387e-05, + "loss": 0.8433, + "step": 60920 + }, + { + "epoch": 0.5386410650824802, + "grad_norm": 2.2441043853759766, + "learning_rate": 4.1022648915292e-05, + "loss": 0.8058, + "step": 60930 + }, + { + "epoch": 0.5387294683427925, + "grad_norm": 1.1636404991149902, + "learning_rate": 4.102117552762013e-05, + "loss": 0.6055, + "step": 60940 + }, + { + "epoch": 0.5388178716031047, + "grad_norm": 1.2521240711212158, + "learning_rate": 4.1019702139948256e-05, + "loss": 0.7209, + "step": 60950 + }, + { + "epoch": 0.5389062748634169, + "grad_norm": 5.205761432647705, + "learning_rate": 4.101822875227639e-05, + "loss": 0.8199, + "step": 60960 + }, + { + "epoch": 0.5389946781237293, + "grad_norm": 10.133781433105469, + "learning_rate": 4.101675536460451e-05, + "loss": 0.6464, + "step": 60970 + }, + { + "epoch": 0.5390830813840415, + "grad_norm": 2.7149558067321777, + "learning_rate": 4.101528197693265e-05, + "loss": 0.6848, + "step": 60980 + }, + { + "epoch": 0.5391714846443537, + "grad_norm": 2.772820472717285, + "learning_rate": 4.101380858926077e-05, + "loss": 0.6317, + "step": 60990 + }, + { + "epoch": 0.539259887904666, + "grad_norm": 7.021060943603516, + "learning_rate": 4.1012335201588904e-05, + "loss": 0.6979, + "step": 61000 + }, + { + "epoch": 0.5393482911649782, + "grad_norm": 2.6324551105499268, + "learning_rate": 4.101086181391703e-05, + "loss": 0.6622, + "step": 61010 + }, + { + "epoch": 0.5394366944252904, + "grad_norm": 9.457308769226074, + "learning_rate": 4.100938842624516e-05, + "loss": 0.6594, + "step": 61020 + }, + { + "epoch": 0.5395250976856026, + "grad_norm": 5.52971076965332, + "learning_rate": 4.100791503857329e-05, + "loss": 0.7412, + "step": 61030 + }, + { + "epoch": 0.5396135009459149, + "grad_norm": 5.493051052093506, + "learning_rate": 4.1006441650901425e-05, + "loss": 0.7294, + "step": 61040 + }, + { + "epoch": 0.5397019042062271, + "grad_norm": 1.9145418405532837, + "learning_rate": 4.1004968263229546e-05, + "loss": 0.6207, + "step": 61050 + }, + { + "epoch": 0.5397903074665393, + "grad_norm": 1.7710217237472534, + "learning_rate": 4.100349487555768e-05, + "loss": 0.5974, + "step": 61060 + }, + { + "epoch": 0.5398787107268516, + "grad_norm": 7.692457675933838, + "learning_rate": 4.100202148788581e-05, + "loss": 0.5967, + "step": 61070 + }, + { + "epoch": 0.5399671139871638, + "grad_norm": 3.235154151916504, + "learning_rate": 4.100054810021394e-05, + "loss": 0.6482, + "step": 61080 + }, + { + "epoch": 0.5400555172474761, + "grad_norm": 2.170754909515381, + "learning_rate": 4.0999074712542066e-05, + "loss": 0.6941, + "step": 61090 + }, + { + "epoch": 0.5401439205077884, + "grad_norm": 1.1746199131011963, + "learning_rate": 4.0997601324870195e-05, + "loss": 0.6075, + "step": 61100 + }, + { + "epoch": 0.5402323237681006, + "grad_norm": 2.0137100219726562, + "learning_rate": 4.099612793719832e-05, + "loss": 0.6752, + "step": 61110 + }, + { + "epoch": 0.5403207270284128, + "grad_norm": 1.190150260925293, + "learning_rate": 4.099465454952646e-05, + "loss": 0.7344, + "step": 61120 + }, + { + "epoch": 0.5404091302887251, + "grad_norm": 2.0723729133605957, + "learning_rate": 4.099318116185458e-05, + "loss": 0.7581, + "step": 61130 + }, + { + "epoch": 0.5404975335490373, + "grad_norm": 2.7568132877349854, + "learning_rate": 4.0991707774182715e-05, + "loss": 0.6639, + "step": 61140 + }, + { + "epoch": 0.5405859368093495, + "grad_norm": 2.5267369747161865, + "learning_rate": 4.099023438651084e-05, + "loss": 0.7648, + "step": 61150 + }, + { + "epoch": 0.5406743400696618, + "grad_norm": 9.138727188110352, + "learning_rate": 4.098876099883897e-05, + "loss": 0.6851, + "step": 61160 + }, + { + "epoch": 0.540762743329974, + "grad_norm": 2.625595808029175, + "learning_rate": 4.09872876111671e-05, + "loss": 0.6179, + "step": 61170 + }, + { + "epoch": 0.5408511465902862, + "grad_norm": 4.42297887802124, + "learning_rate": 4.0985814223495235e-05, + "loss": 0.7064, + "step": 61180 + }, + { + "epoch": 0.5409395498505984, + "grad_norm": 9.926868438720703, + "learning_rate": 4.098434083582336e-05, + "loss": 0.6189, + "step": 61190 + }, + { + "epoch": 0.5410279531109107, + "grad_norm": 8.057879447937012, + "learning_rate": 4.098286744815149e-05, + "loss": 0.7297, + "step": 61200 + }, + { + "epoch": 0.541116356371223, + "grad_norm": 2.313573122024536, + "learning_rate": 4.098139406047961e-05, + "loss": 0.7502, + "step": 61210 + }, + { + "epoch": 0.5412047596315352, + "grad_norm": 2.662954330444336, + "learning_rate": 4.097992067280775e-05, + "loss": 0.6181, + "step": 61220 + }, + { + "epoch": 0.5412931628918475, + "grad_norm": 13.367076873779297, + "learning_rate": 4.097844728513588e-05, + "loss": 0.7337, + "step": 61230 + }, + { + "epoch": 0.5413815661521597, + "grad_norm": 7.762792110443115, + "learning_rate": 4.0976973897464005e-05, + "loss": 0.628, + "step": 61240 + }, + { + "epoch": 0.5414699694124719, + "grad_norm": 10.466089248657227, + "learning_rate": 4.0975500509792134e-05, + "loss": 0.7639, + "step": 61250 + }, + { + "epoch": 0.5415583726727842, + "grad_norm": 4.6191020011901855, + "learning_rate": 4.097402712212027e-05, + "loss": 0.7662, + "step": 61260 + }, + { + "epoch": 0.5416467759330964, + "grad_norm": 3.765852212905884, + "learning_rate": 4.097255373444839e-05, + "loss": 0.8055, + "step": 61270 + }, + { + "epoch": 0.5417351791934086, + "grad_norm": 6.5301055908203125, + "learning_rate": 4.0971080346776525e-05, + "loss": 0.6931, + "step": 61280 + }, + { + "epoch": 0.5418235824537209, + "grad_norm": 4.521273136138916, + "learning_rate": 4.0969606959104654e-05, + "loss": 0.7309, + "step": 61290 + }, + { + "epoch": 0.5419119857140331, + "grad_norm": 3.3177034854888916, + "learning_rate": 4.096813357143278e-05, + "loss": 0.7317, + "step": 61300 + }, + { + "epoch": 0.5420003889743453, + "grad_norm": 12.594902992248535, + "learning_rate": 4.096666018376091e-05, + "loss": 0.6187, + "step": 61310 + }, + { + "epoch": 0.5420887922346577, + "grad_norm": 4.101683139801025, + "learning_rate": 4.096518679608904e-05, + "loss": 0.6698, + "step": 61320 + }, + { + "epoch": 0.5421771954949699, + "grad_norm": 2.899308919906616, + "learning_rate": 4.096371340841717e-05, + "loss": 0.6732, + "step": 61330 + }, + { + "epoch": 0.5422655987552821, + "grad_norm": 14.442461013793945, + "learning_rate": 4.09622400207453e-05, + "loss": 0.7821, + "step": 61340 + }, + { + "epoch": 0.5423540020155944, + "grad_norm": 2.837376832962036, + "learning_rate": 4.096076663307343e-05, + "loss": 0.6085, + "step": 61350 + }, + { + "epoch": 0.5424424052759066, + "grad_norm": 1.6499812602996826, + "learning_rate": 4.095929324540156e-05, + "loss": 0.7831, + "step": 61360 + }, + { + "epoch": 0.5425308085362188, + "grad_norm": 3.734788656234741, + "learning_rate": 4.095781985772969e-05, + "loss": 0.7417, + "step": 61370 + }, + { + "epoch": 0.542619211796531, + "grad_norm": 5.909337043762207, + "learning_rate": 4.0956346470057816e-05, + "loss": 0.655, + "step": 61380 + }, + { + "epoch": 0.5427076150568433, + "grad_norm": 8.06618595123291, + "learning_rate": 4.0954873082385944e-05, + "loss": 0.7344, + "step": 61390 + }, + { + "epoch": 0.5427960183171555, + "grad_norm": 5.110528945922852, + "learning_rate": 4.095339969471408e-05, + "loss": 0.7057, + "step": 61400 + }, + { + "epoch": 0.5428844215774677, + "grad_norm": 5.048502445220947, + "learning_rate": 4.095192630704221e-05, + "loss": 0.6689, + "step": 61410 + }, + { + "epoch": 0.54297282483778, + "grad_norm": 2.925981044769287, + "learning_rate": 4.0950452919370336e-05, + "loss": 0.6583, + "step": 61420 + }, + { + "epoch": 0.5430612280980922, + "grad_norm": 7.1030449867248535, + "learning_rate": 4.0948979531698464e-05, + "loss": 0.4908, + "step": 61430 + }, + { + "epoch": 0.5431496313584045, + "grad_norm": 2.4644997119903564, + "learning_rate": 4.094750614402659e-05, + "loss": 0.6573, + "step": 61440 + }, + { + "epoch": 0.5432380346187168, + "grad_norm": 4.937765121459961, + "learning_rate": 4.094603275635472e-05, + "loss": 0.7287, + "step": 61450 + }, + { + "epoch": 0.543326437879029, + "grad_norm": 3.202455997467041, + "learning_rate": 4.094455936868285e-05, + "loss": 0.6487, + "step": 61460 + }, + { + "epoch": 0.5434148411393412, + "grad_norm": 2.912321090698242, + "learning_rate": 4.0943085981010984e-05, + "loss": 0.6049, + "step": 61470 + }, + { + "epoch": 0.5435032443996535, + "grad_norm": 8.38735294342041, + "learning_rate": 4.094161259333911e-05, + "loss": 0.6741, + "step": 61480 + }, + { + "epoch": 0.5435916476599657, + "grad_norm": 5.502791404724121, + "learning_rate": 4.094013920566724e-05, + "loss": 0.7838, + "step": 61490 + }, + { + "epoch": 0.5436800509202779, + "grad_norm": 4.26815938949585, + "learning_rate": 4.093866581799537e-05, + "loss": 0.7441, + "step": 61500 + }, + { + "epoch": 0.5437684541805902, + "grad_norm": 0.7829049825668335, + "learning_rate": 4.09371924303235e-05, + "loss": 0.6335, + "step": 61510 + }, + { + "epoch": 0.5438568574409024, + "grad_norm": 4.094738483428955, + "learning_rate": 4.0935719042651626e-05, + "loss": 0.6195, + "step": 61520 + }, + { + "epoch": 0.5439452607012146, + "grad_norm": 7.83526611328125, + "learning_rate": 4.093424565497976e-05, + "loss": 0.6159, + "step": 61530 + }, + { + "epoch": 0.5440336639615269, + "grad_norm": 4.799098014831543, + "learning_rate": 4.093277226730789e-05, + "loss": 0.6435, + "step": 61540 + }, + { + "epoch": 0.5441220672218391, + "grad_norm": 2.3341596126556396, + "learning_rate": 4.093129887963602e-05, + "loss": 0.656, + "step": 61550 + }, + { + "epoch": 0.5442104704821514, + "grad_norm": 7.418471336364746, + "learning_rate": 4.0929825491964146e-05, + "loss": 0.6934, + "step": 61560 + }, + { + "epoch": 0.5442988737424637, + "grad_norm": 8.626866340637207, + "learning_rate": 4.0928352104292275e-05, + "loss": 0.6294, + "step": 61570 + }, + { + "epoch": 0.5443872770027759, + "grad_norm": 4.980144500732422, + "learning_rate": 4.09268787166204e-05, + "loss": 0.7899, + "step": 61580 + }, + { + "epoch": 0.5444756802630881, + "grad_norm": 2.441241979598999, + "learning_rate": 4.092540532894854e-05, + "loss": 0.7266, + "step": 61590 + }, + { + "epoch": 0.5445640835234004, + "grad_norm": 4.767135143280029, + "learning_rate": 4.092393194127666e-05, + "loss": 0.6007, + "step": 61600 + }, + { + "epoch": 0.5446524867837126, + "grad_norm": 6.329469203948975, + "learning_rate": 4.0922458553604795e-05, + "loss": 0.5614, + "step": 61610 + }, + { + "epoch": 0.5447408900440248, + "grad_norm": 4.289069175720215, + "learning_rate": 4.092098516593292e-05, + "loss": 0.7063, + "step": 61620 + }, + { + "epoch": 0.544829293304337, + "grad_norm": 4.172220706939697, + "learning_rate": 4.091951177826105e-05, + "loss": 0.6329, + "step": 61630 + }, + { + "epoch": 0.5449176965646493, + "grad_norm": 1.5058437585830688, + "learning_rate": 4.091803839058918e-05, + "loss": 0.5715, + "step": 61640 + }, + { + "epoch": 0.5450060998249615, + "grad_norm": 1.9413853883743286, + "learning_rate": 4.0916565002917315e-05, + "loss": 0.643, + "step": 61650 + }, + { + "epoch": 0.5450945030852737, + "grad_norm": 2.87971568107605, + "learning_rate": 4.091509161524544e-05, + "loss": 0.6928, + "step": 61660 + }, + { + "epoch": 0.545182906345586, + "grad_norm": 8.396600723266602, + "learning_rate": 4.091361822757357e-05, + "loss": 0.7239, + "step": 61670 + }, + { + "epoch": 0.5452713096058983, + "grad_norm": 9.455760955810547, + "learning_rate": 4.0912144839901693e-05, + "loss": 0.7265, + "step": 61680 + }, + { + "epoch": 0.5453597128662105, + "grad_norm": 2.3186399936676025, + "learning_rate": 4.091067145222983e-05, + "loss": 0.6015, + "step": 61690 + }, + { + "epoch": 0.5454481161265228, + "grad_norm": 5.35010290145874, + "learning_rate": 4.090919806455796e-05, + "loss": 0.6971, + "step": 61700 + }, + { + "epoch": 0.545536519386835, + "grad_norm": 2.6566903591156006, + "learning_rate": 4.0907724676886085e-05, + "loss": 0.7582, + "step": 61710 + }, + { + "epoch": 0.5456249226471472, + "grad_norm": 3.0584723949432373, + "learning_rate": 4.0906251289214214e-05, + "loss": 0.7132, + "step": 61720 + }, + { + "epoch": 0.5457133259074595, + "grad_norm": 6.962405681610107, + "learning_rate": 4.090477790154235e-05, + "loss": 0.6459, + "step": 61730 + }, + { + "epoch": 0.5458017291677717, + "grad_norm": 7.146580219268799, + "learning_rate": 4.090330451387047e-05, + "loss": 0.696, + "step": 61740 + }, + { + "epoch": 0.5458901324280839, + "grad_norm": 1.7345503568649292, + "learning_rate": 4.0901831126198605e-05, + "loss": 0.5787, + "step": 61750 + }, + { + "epoch": 0.5459785356883962, + "grad_norm": 2.4765796661376953, + "learning_rate": 4.0900357738526734e-05, + "loss": 0.6041, + "step": 61760 + }, + { + "epoch": 0.5460669389487084, + "grad_norm": 7.141934394836426, + "learning_rate": 4.089888435085486e-05, + "loss": 0.6972, + "step": 61770 + }, + { + "epoch": 0.5461553422090206, + "grad_norm": 6.15211820602417, + "learning_rate": 4.089741096318299e-05, + "loss": 0.7501, + "step": 61780 + }, + { + "epoch": 0.546243745469333, + "grad_norm": 9.471246719360352, + "learning_rate": 4.089593757551112e-05, + "loss": 0.6583, + "step": 61790 + }, + { + "epoch": 0.5463321487296452, + "grad_norm": 2.117295265197754, + "learning_rate": 4.089446418783925e-05, + "loss": 0.7657, + "step": 61800 + }, + { + "epoch": 0.5464205519899574, + "grad_norm": 2.312269926071167, + "learning_rate": 4.089299080016738e-05, + "loss": 0.7321, + "step": 61810 + }, + { + "epoch": 0.5465089552502697, + "grad_norm": 5.327282905578613, + "learning_rate": 4.0891517412495504e-05, + "loss": 0.713, + "step": 61820 + }, + { + "epoch": 0.5465973585105819, + "grad_norm": 4.53051233291626, + "learning_rate": 4.089004402482364e-05, + "loss": 0.6507, + "step": 61830 + }, + { + "epoch": 0.5466857617708941, + "grad_norm": 4.317416191101074, + "learning_rate": 4.088857063715177e-05, + "loss": 0.6268, + "step": 61840 + }, + { + "epoch": 0.5467741650312063, + "grad_norm": 4.29363489151001, + "learning_rate": 4.0887097249479896e-05, + "loss": 0.7671, + "step": 61850 + }, + { + "epoch": 0.5468625682915186, + "grad_norm": 2.106245517730713, + "learning_rate": 4.0885623861808024e-05, + "loss": 0.6954, + "step": 61860 + }, + { + "epoch": 0.5469509715518308, + "grad_norm": 4.963006496429443, + "learning_rate": 4.088415047413616e-05, + "loss": 0.5751, + "step": 61870 + }, + { + "epoch": 0.547039374812143, + "grad_norm": 3.012784957885742, + "learning_rate": 4.088267708646428e-05, + "loss": 0.5432, + "step": 61880 + }, + { + "epoch": 0.5471277780724553, + "grad_norm": 2.363640069961548, + "learning_rate": 4.0881203698792416e-05, + "loss": 0.6364, + "step": 61890 + }, + { + "epoch": 0.5472161813327675, + "grad_norm": 4.916860103607178, + "learning_rate": 4.0879730311120544e-05, + "loss": 0.7315, + "step": 61900 + }, + { + "epoch": 0.5473045845930798, + "grad_norm": 13.727585792541504, + "learning_rate": 4.087825692344867e-05, + "loss": 0.837, + "step": 61910 + }, + { + "epoch": 0.5473929878533921, + "grad_norm": 3.6278133392333984, + "learning_rate": 4.08767835357768e-05, + "loss": 0.745, + "step": 61920 + }, + { + "epoch": 0.5474813911137043, + "grad_norm": 4.7713775634765625, + "learning_rate": 4.087531014810493e-05, + "loss": 0.6926, + "step": 61930 + }, + { + "epoch": 0.5475697943740165, + "grad_norm": 4.205535411834717, + "learning_rate": 4.087383676043306e-05, + "loss": 0.59, + "step": 61940 + }, + { + "epoch": 0.5476581976343288, + "grad_norm": 4.933831214904785, + "learning_rate": 4.087236337276119e-05, + "loss": 0.7224, + "step": 61950 + }, + { + "epoch": 0.547746600894641, + "grad_norm": 5.320747375488281, + "learning_rate": 4.0870889985089314e-05, + "loss": 0.7419, + "step": 61960 + }, + { + "epoch": 0.5478350041549532, + "grad_norm": 2.2930240631103516, + "learning_rate": 4.086941659741745e-05, + "loss": 0.6866, + "step": 61970 + }, + { + "epoch": 0.5479234074152655, + "grad_norm": 2.417546033859253, + "learning_rate": 4.086794320974558e-05, + "loss": 0.6766, + "step": 61980 + }, + { + "epoch": 0.5480118106755777, + "grad_norm": 4.099992275238037, + "learning_rate": 4.0866469822073706e-05, + "loss": 0.7691, + "step": 61990 + }, + { + "epoch": 0.5481002139358899, + "grad_norm": 7.419407844543457, + "learning_rate": 4.0864996434401835e-05, + "loss": 0.5728, + "step": 62000 + }, + { + "epoch": 0.5481886171962022, + "grad_norm": 3.2243587970733643, + "learning_rate": 4.086352304672997e-05, + "loss": 0.7734, + "step": 62010 + }, + { + "epoch": 0.5482770204565144, + "grad_norm": 3.938995599746704, + "learning_rate": 4.086204965905809e-05, + "loss": 0.6605, + "step": 62020 + }, + { + "epoch": 0.5483654237168267, + "grad_norm": 4.894810676574707, + "learning_rate": 4.0860576271386226e-05, + "loss": 0.8069, + "step": 62030 + }, + { + "epoch": 0.548453826977139, + "grad_norm": 1.3692402839660645, + "learning_rate": 4.085910288371435e-05, + "loss": 0.6191, + "step": 62040 + }, + { + "epoch": 0.5485422302374512, + "grad_norm": 7.836918354034424, + "learning_rate": 4.085762949604248e-05, + "loss": 0.7416, + "step": 62050 + }, + { + "epoch": 0.5486306334977634, + "grad_norm": 3.5874412059783936, + "learning_rate": 4.085615610837061e-05, + "loss": 0.8542, + "step": 62060 + }, + { + "epoch": 0.5487190367580757, + "grad_norm": 1.5806734561920166, + "learning_rate": 4.085468272069874e-05, + "loss": 0.6764, + "step": 62070 + }, + { + "epoch": 0.5488074400183879, + "grad_norm": 1.5705277919769287, + "learning_rate": 4.085320933302687e-05, + "loss": 0.5837, + "step": 62080 + }, + { + "epoch": 0.5488958432787001, + "grad_norm": 2.244008779525757, + "learning_rate": 4.0851735945355e-05, + "loss": 0.662, + "step": 62090 + }, + { + "epoch": 0.5489842465390123, + "grad_norm": 1.9295954704284668, + "learning_rate": 4.0850262557683125e-05, + "loss": 0.6677, + "step": 62100 + }, + { + "epoch": 0.5490726497993246, + "grad_norm": 4.382631778717041, + "learning_rate": 4.084878917001126e-05, + "loss": 0.5893, + "step": 62110 + }, + { + "epoch": 0.5491610530596368, + "grad_norm": 4.301537036895752, + "learning_rate": 4.084731578233939e-05, + "loss": 0.6719, + "step": 62120 + }, + { + "epoch": 0.549249456319949, + "grad_norm": 4.617231369018555, + "learning_rate": 4.084584239466752e-05, + "loss": 0.8648, + "step": 62130 + }, + { + "epoch": 0.5493378595802613, + "grad_norm": 4.951000690460205, + "learning_rate": 4.0844369006995645e-05, + "loss": 0.7484, + "step": 62140 + }, + { + "epoch": 0.5494262628405736, + "grad_norm": 3.792289972305298, + "learning_rate": 4.0842895619323773e-05, + "loss": 0.6072, + "step": 62150 + }, + { + "epoch": 0.5495146661008858, + "grad_norm": 5.221554279327393, + "learning_rate": 4.08414222316519e-05, + "loss": 0.8724, + "step": 62160 + }, + { + "epoch": 0.5496030693611981, + "grad_norm": 4.404388904571533, + "learning_rate": 4.083994884398004e-05, + "loss": 0.7609, + "step": 62170 + }, + { + "epoch": 0.5496914726215103, + "grad_norm": 3.834625482559204, + "learning_rate": 4.083847545630816e-05, + "loss": 0.6846, + "step": 62180 + }, + { + "epoch": 0.5497798758818225, + "grad_norm": 1.522938847541809, + "learning_rate": 4.0837002068636294e-05, + "loss": 0.7164, + "step": 62190 + }, + { + "epoch": 0.5498682791421348, + "grad_norm": 1.1847554445266724, + "learning_rate": 4.083552868096442e-05, + "loss": 0.7044, + "step": 62200 + }, + { + "epoch": 0.549956682402447, + "grad_norm": 2.3974757194519043, + "learning_rate": 4.083405529329255e-05, + "loss": 0.6138, + "step": 62210 + }, + { + "epoch": 0.5500450856627592, + "grad_norm": 1.6963533163070679, + "learning_rate": 4.083258190562068e-05, + "loss": 0.6669, + "step": 62220 + }, + { + "epoch": 0.5501334889230715, + "grad_norm": 4.234209060668945, + "learning_rate": 4.0831108517948814e-05, + "loss": 0.6213, + "step": 62230 + }, + { + "epoch": 0.5502218921833837, + "grad_norm": 4.611056327819824, + "learning_rate": 4.0829635130276935e-05, + "loss": 0.6935, + "step": 62240 + }, + { + "epoch": 0.5503102954436959, + "grad_norm": 3.103215456008911, + "learning_rate": 4.082816174260507e-05, + "loss": 0.7327, + "step": 62250 + }, + { + "epoch": 0.5503986987040081, + "grad_norm": 3.6425230503082275, + "learning_rate": 4.08266883549332e-05, + "loss": 0.6821, + "step": 62260 + }, + { + "epoch": 0.5504871019643205, + "grad_norm": 14.899616241455078, + "learning_rate": 4.082521496726133e-05, + "loss": 0.7051, + "step": 62270 + }, + { + "epoch": 0.5505755052246327, + "grad_norm": 1.558464765548706, + "learning_rate": 4.0823741579589456e-05, + "loss": 0.7049, + "step": 62280 + }, + { + "epoch": 0.550663908484945, + "grad_norm": 1.8028136491775513, + "learning_rate": 4.0822268191917584e-05, + "loss": 0.7035, + "step": 62290 + }, + { + "epoch": 0.5507523117452572, + "grad_norm": 1.6013673543930054, + "learning_rate": 4.082079480424571e-05, + "loss": 0.8175, + "step": 62300 + }, + { + "epoch": 0.5508407150055694, + "grad_norm": 2.7481155395507812, + "learning_rate": 4.081932141657385e-05, + "loss": 0.6141, + "step": 62310 + }, + { + "epoch": 0.5509291182658816, + "grad_norm": 1.3825974464416504, + "learning_rate": 4.0817848028901976e-05, + "loss": 0.6164, + "step": 62320 + }, + { + "epoch": 0.5510175215261939, + "grad_norm": 3.1612820625305176, + "learning_rate": 4.0816374641230104e-05, + "loss": 0.671, + "step": 62330 + }, + { + "epoch": 0.5511059247865061, + "grad_norm": 7.41900110244751, + "learning_rate": 4.081490125355823e-05, + "loss": 0.8493, + "step": 62340 + }, + { + "epoch": 0.5511943280468183, + "grad_norm": 5.661056041717529, + "learning_rate": 4.081342786588636e-05, + "loss": 0.8018, + "step": 62350 + }, + { + "epoch": 0.5512827313071306, + "grad_norm": 3.1594958305358887, + "learning_rate": 4.081195447821449e-05, + "loss": 0.5826, + "step": 62360 + }, + { + "epoch": 0.5513711345674428, + "grad_norm": 3.1781249046325684, + "learning_rate": 4.0810481090542624e-05, + "loss": 0.6851, + "step": 62370 + }, + { + "epoch": 0.5514595378277551, + "grad_norm": 10.142784118652344, + "learning_rate": 4.080900770287075e-05, + "loss": 0.7852, + "step": 62380 + }, + { + "epoch": 0.5515479410880674, + "grad_norm": 6.739329814910889, + "learning_rate": 4.080753431519888e-05, + "loss": 0.6094, + "step": 62390 + }, + { + "epoch": 0.5516363443483796, + "grad_norm": 29.661962509155273, + "learning_rate": 4.080606092752701e-05, + "loss": 0.7518, + "step": 62400 + }, + { + "epoch": 0.5517247476086918, + "grad_norm": 7.815852642059326, + "learning_rate": 4.080458753985514e-05, + "loss": 0.6028, + "step": 62410 + }, + { + "epoch": 0.5518131508690041, + "grad_norm": 6.031912326812744, + "learning_rate": 4.0803114152183266e-05, + "loss": 0.6138, + "step": 62420 + }, + { + "epoch": 0.5519015541293163, + "grad_norm": 3.4759647846221924, + "learning_rate": 4.0801640764511394e-05, + "loss": 0.7457, + "step": 62430 + }, + { + "epoch": 0.5519899573896285, + "grad_norm": 2.1413931846618652, + "learning_rate": 4.080016737683953e-05, + "loss": 0.6812, + "step": 62440 + }, + { + "epoch": 0.5520783606499408, + "grad_norm": 14.791672706604004, + "learning_rate": 4.079869398916766e-05, + "loss": 0.7169, + "step": 62450 + }, + { + "epoch": 0.552166763910253, + "grad_norm": 3.942746162414551, + "learning_rate": 4.0797220601495786e-05, + "loss": 0.5147, + "step": 62460 + }, + { + "epoch": 0.5522551671705652, + "grad_norm": 3.358020067214966, + "learning_rate": 4.0795747213823915e-05, + "loss": 0.7153, + "step": 62470 + }, + { + "epoch": 0.5523435704308775, + "grad_norm": 1.729537844657898, + "learning_rate": 4.079427382615204e-05, + "loss": 0.7613, + "step": 62480 + }, + { + "epoch": 0.5524319736911897, + "grad_norm": 2.4452719688415527, + "learning_rate": 4.079280043848017e-05, + "loss": 0.5938, + "step": 62490 + }, + { + "epoch": 0.552520376951502, + "grad_norm": 4.644379138946533, + "learning_rate": 4.0791327050808307e-05, + "loss": 0.7941, + "step": 62500 + }, + { + "epoch": 0.5526087802118143, + "grad_norm": 5.547428607940674, + "learning_rate": 4.078985366313643e-05, + "loss": 0.7711, + "step": 62510 + }, + { + "epoch": 0.5526971834721265, + "grad_norm": 1.1845346689224243, + "learning_rate": 4.078838027546456e-05, + "loss": 0.6581, + "step": 62520 + }, + { + "epoch": 0.5527855867324387, + "grad_norm": 4.600733757019043, + "learning_rate": 4.078690688779269e-05, + "loss": 0.7008, + "step": 62530 + }, + { + "epoch": 0.552873989992751, + "grad_norm": 1.6078307628631592, + "learning_rate": 4.078543350012082e-05, + "loss": 0.678, + "step": 62540 + }, + { + "epoch": 0.5529623932530632, + "grad_norm": 2.8887579441070557, + "learning_rate": 4.078396011244895e-05, + "loss": 0.7163, + "step": 62550 + }, + { + "epoch": 0.5530507965133754, + "grad_norm": 13.506646156311035, + "learning_rate": 4.0782486724777083e-05, + "loss": 0.7312, + "step": 62560 + }, + { + "epoch": 0.5531391997736876, + "grad_norm": 3.7088615894317627, + "learning_rate": 4.0781013337105205e-05, + "loss": 0.6049, + "step": 62570 + }, + { + "epoch": 0.5532276030339999, + "grad_norm": 4.431879043579102, + "learning_rate": 4.077953994943334e-05, + "loss": 0.8565, + "step": 62580 + }, + { + "epoch": 0.5533160062943121, + "grad_norm": 4.177177429199219, + "learning_rate": 4.077806656176147e-05, + "loss": 0.7676, + "step": 62590 + }, + { + "epoch": 0.5534044095546243, + "grad_norm": 3.7162039279937744, + "learning_rate": 4.07765931740896e-05, + "loss": 0.6364, + "step": 62600 + }, + { + "epoch": 0.5534928128149366, + "grad_norm": 2.4209091663360596, + "learning_rate": 4.0775119786417725e-05, + "loss": 0.737, + "step": 62610 + }, + { + "epoch": 0.5535812160752489, + "grad_norm": 2.490230083465576, + "learning_rate": 4.0773646398745854e-05, + "loss": 0.6825, + "step": 62620 + }, + { + "epoch": 0.5536696193355611, + "grad_norm": 5.82399320602417, + "learning_rate": 4.077217301107398e-05, + "loss": 0.7084, + "step": 62630 + }, + { + "epoch": 0.5537580225958734, + "grad_norm": 3.0212762355804443, + "learning_rate": 4.077069962340212e-05, + "loss": 0.6651, + "step": 62640 + }, + { + "epoch": 0.5538464258561856, + "grad_norm": 3.4942400455474854, + "learning_rate": 4.076922623573024e-05, + "loss": 0.7842, + "step": 62650 + }, + { + "epoch": 0.5539348291164978, + "grad_norm": 2.226706027984619, + "learning_rate": 4.0767752848058374e-05, + "loss": 0.591, + "step": 62660 + }, + { + "epoch": 0.5540232323768101, + "grad_norm": 3.081984281539917, + "learning_rate": 4.07662794603865e-05, + "loss": 0.6871, + "step": 62670 + }, + { + "epoch": 0.5541116356371223, + "grad_norm": 4.682066917419434, + "learning_rate": 4.076480607271463e-05, + "loss": 0.615, + "step": 62680 + }, + { + "epoch": 0.5542000388974345, + "grad_norm": 4.302504539489746, + "learning_rate": 4.076333268504276e-05, + "loss": 0.8338, + "step": 62690 + }, + { + "epoch": 0.5542884421577468, + "grad_norm": 3.471498727798462, + "learning_rate": 4.0761859297370894e-05, + "loss": 0.8216, + "step": 62700 + }, + { + "epoch": 0.554376845418059, + "grad_norm": 5.331011772155762, + "learning_rate": 4.0760385909699015e-05, + "loss": 0.6846, + "step": 62710 + }, + { + "epoch": 0.5544652486783712, + "grad_norm": 2.4722280502319336, + "learning_rate": 4.075891252202715e-05, + "loss": 0.6023, + "step": 62720 + }, + { + "epoch": 0.5545536519386834, + "grad_norm": 2.3275868892669678, + "learning_rate": 4.075743913435528e-05, + "loss": 0.6967, + "step": 62730 + }, + { + "epoch": 0.5546420551989958, + "grad_norm": 3.251068592071533, + "learning_rate": 4.075596574668341e-05, + "loss": 0.6167, + "step": 62740 + }, + { + "epoch": 0.554730458459308, + "grad_norm": 3.044917345046997, + "learning_rate": 4.0754492359011536e-05, + "loss": 0.7082, + "step": 62750 + }, + { + "epoch": 0.5548188617196202, + "grad_norm": 5.043300151824951, + "learning_rate": 4.0753018971339664e-05, + "loss": 0.7656, + "step": 62760 + }, + { + "epoch": 0.5549072649799325, + "grad_norm": 2.0907504558563232, + "learning_rate": 4.075154558366779e-05, + "loss": 0.7462, + "step": 62770 + }, + { + "epoch": 0.5549956682402447, + "grad_norm": 3.9822773933410645, + "learning_rate": 4.075007219599593e-05, + "loss": 0.6726, + "step": 62780 + }, + { + "epoch": 0.5550840715005569, + "grad_norm": 6.933872222900391, + "learning_rate": 4.074859880832405e-05, + "loss": 0.6487, + "step": 62790 + }, + { + "epoch": 0.5551724747608692, + "grad_norm": 3.5690855979919434, + "learning_rate": 4.0747125420652184e-05, + "loss": 0.6857, + "step": 62800 + }, + { + "epoch": 0.5552608780211814, + "grad_norm": 3.5433759689331055, + "learning_rate": 4.074565203298031e-05, + "loss": 0.6341, + "step": 62810 + }, + { + "epoch": 0.5553492812814936, + "grad_norm": 3.214184284210205, + "learning_rate": 4.074417864530844e-05, + "loss": 0.6289, + "step": 62820 + }, + { + "epoch": 0.5554376845418059, + "grad_norm": 4.199161529541016, + "learning_rate": 4.074270525763657e-05, + "loss": 0.7053, + "step": 62830 + }, + { + "epoch": 0.5555260878021181, + "grad_norm": 2.815532684326172, + "learning_rate": 4.0741231869964704e-05, + "loss": 0.7036, + "step": 62840 + }, + { + "epoch": 0.5556144910624304, + "grad_norm": 4.300614833831787, + "learning_rate": 4.0739758482292826e-05, + "loss": 0.7452, + "step": 62850 + }, + { + "epoch": 0.5557028943227427, + "grad_norm": 1.6825121641159058, + "learning_rate": 4.073828509462096e-05, + "loss": 0.6978, + "step": 62860 + }, + { + "epoch": 0.5557912975830549, + "grad_norm": 7.21452522277832, + "learning_rate": 4.073681170694908e-05, + "loss": 0.618, + "step": 62870 + }, + { + "epoch": 0.5558797008433671, + "grad_norm": 3.448690176010132, + "learning_rate": 4.073533831927722e-05, + "loss": 0.5747, + "step": 62880 + }, + { + "epoch": 0.5559681041036794, + "grad_norm": 3.577071189880371, + "learning_rate": 4.0733864931605346e-05, + "loss": 0.6362, + "step": 62890 + }, + { + "epoch": 0.5560565073639916, + "grad_norm": 2.1710007190704346, + "learning_rate": 4.0732391543933475e-05, + "loss": 0.622, + "step": 62900 + }, + { + "epoch": 0.5561449106243038, + "grad_norm": 2.937624931335449, + "learning_rate": 4.07309181562616e-05, + "loss": 0.6407, + "step": 62910 + }, + { + "epoch": 0.556233313884616, + "grad_norm": 3.720982313156128, + "learning_rate": 4.072944476858974e-05, + "loss": 0.6316, + "step": 62920 + }, + { + "epoch": 0.5563217171449283, + "grad_norm": 1.773363709449768, + "learning_rate": 4.072797138091786e-05, + "loss": 0.5862, + "step": 62930 + }, + { + "epoch": 0.5564101204052405, + "grad_norm": 2.2955081462860107, + "learning_rate": 4.0726497993245995e-05, + "loss": 0.7583, + "step": 62940 + }, + { + "epoch": 0.5564985236655527, + "grad_norm": 7.422618389129639, + "learning_rate": 4.072502460557412e-05, + "loss": 0.7207, + "step": 62950 + }, + { + "epoch": 0.556586926925865, + "grad_norm": 2.2737765312194824, + "learning_rate": 4.072355121790225e-05, + "loss": 0.6888, + "step": 62960 + }, + { + "epoch": 0.5566753301861773, + "grad_norm": 1.7188626527786255, + "learning_rate": 4.072207783023038e-05, + "loss": 0.7744, + "step": 62970 + }, + { + "epoch": 0.5567637334464896, + "grad_norm": 7.6729044914245605, + "learning_rate": 4.072060444255851e-05, + "loss": 0.6764, + "step": 62980 + }, + { + "epoch": 0.5568521367068018, + "grad_norm": 2.308917284011841, + "learning_rate": 4.0719131054886637e-05, + "loss": 0.6571, + "step": 62990 + }, + { + "epoch": 0.556940539967114, + "grad_norm": 2.4506571292877197, + "learning_rate": 4.071765766721477e-05, + "loss": 0.5771, + "step": 63000 + }, + { + "epoch": 0.5570289432274262, + "grad_norm": 3.898029088973999, + "learning_rate": 4.071618427954289e-05, + "loss": 0.6696, + "step": 63010 + }, + { + "epoch": 0.5571173464877385, + "grad_norm": 7.187760353088379, + "learning_rate": 4.071471089187103e-05, + "loss": 0.6918, + "step": 63020 + }, + { + "epoch": 0.5572057497480507, + "grad_norm": 6.468751430511475, + "learning_rate": 4.071323750419916e-05, + "loss": 0.6641, + "step": 63030 + }, + { + "epoch": 0.5572941530083629, + "grad_norm": 3.098630666732788, + "learning_rate": 4.0711764116527285e-05, + "loss": 0.683, + "step": 63040 + }, + { + "epoch": 0.5573825562686752, + "grad_norm": 5.286598205566406, + "learning_rate": 4.0710290728855413e-05, + "loss": 0.7044, + "step": 63050 + }, + { + "epoch": 0.5574709595289874, + "grad_norm": 2.019216775894165, + "learning_rate": 4.070881734118355e-05, + "loss": 0.7049, + "step": 63060 + }, + { + "epoch": 0.5575593627892996, + "grad_norm": 4.356855392456055, + "learning_rate": 4.070734395351167e-05, + "loss": 0.8097, + "step": 63070 + }, + { + "epoch": 0.5576477660496119, + "grad_norm": 5.114986419677734, + "learning_rate": 4.0705870565839805e-05, + "loss": 0.8909, + "step": 63080 + }, + { + "epoch": 0.5577361693099242, + "grad_norm": 1.8002680540084839, + "learning_rate": 4.070439717816793e-05, + "loss": 0.6946, + "step": 63090 + }, + { + "epoch": 0.5578245725702364, + "grad_norm": 4.779249668121338, + "learning_rate": 4.070292379049606e-05, + "loss": 0.7003, + "step": 63100 + }, + { + "epoch": 0.5579129758305487, + "grad_norm": 17.739381790161133, + "learning_rate": 4.070145040282419e-05, + "loss": 0.6996, + "step": 63110 + }, + { + "epoch": 0.5580013790908609, + "grad_norm": 4.2556071281433105, + "learning_rate": 4.069997701515232e-05, + "loss": 0.7975, + "step": 63120 + }, + { + "epoch": 0.5580897823511731, + "grad_norm": 7.880222320556641, + "learning_rate": 4.069850362748045e-05, + "loss": 0.6855, + "step": 63130 + }, + { + "epoch": 0.5581781856114854, + "grad_norm": 3.777540683746338, + "learning_rate": 4.069703023980858e-05, + "loss": 0.7618, + "step": 63140 + }, + { + "epoch": 0.5582665888717976, + "grad_norm": 2.8185460567474365, + "learning_rate": 4.0695556852136704e-05, + "loss": 0.7183, + "step": 63150 + }, + { + "epoch": 0.5583549921321098, + "grad_norm": 2.2345547676086426, + "learning_rate": 4.069408346446484e-05, + "loss": 0.7175, + "step": 63160 + }, + { + "epoch": 0.558443395392422, + "grad_norm": 2.3325679302215576, + "learning_rate": 4.069261007679297e-05, + "loss": 0.5852, + "step": 63170 + }, + { + "epoch": 0.5585317986527343, + "grad_norm": 2.6563258171081543, + "learning_rate": 4.0691136689121096e-05, + "loss": 0.723, + "step": 63180 + }, + { + "epoch": 0.5586202019130465, + "grad_norm": 3.589921712875366, + "learning_rate": 4.0689663301449224e-05, + "loss": 0.7361, + "step": 63190 + }, + { + "epoch": 0.5587086051733587, + "grad_norm": 3.861968994140625, + "learning_rate": 4.068818991377736e-05, + "loss": 0.724, + "step": 63200 + }, + { + "epoch": 0.5587970084336711, + "grad_norm": 8.236564636230469, + "learning_rate": 4.068671652610548e-05, + "loss": 0.675, + "step": 63210 + }, + { + "epoch": 0.5588854116939833, + "grad_norm": 1.6007232666015625, + "learning_rate": 4.0685243138433616e-05, + "loss": 0.7619, + "step": 63220 + }, + { + "epoch": 0.5589738149542955, + "grad_norm": 8.26694393157959, + "learning_rate": 4.0683769750761744e-05, + "loss": 0.7679, + "step": 63230 + }, + { + "epoch": 0.5590622182146078, + "grad_norm": 1.9998350143432617, + "learning_rate": 4.068229636308987e-05, + "loss": 0.7287, + "step": 63240 + }, + { + "epoch": 0.55915062147492, + "grad_norm": 4.671010494232178, + "learning_rate": 4.0680822975418e-05, + "loss": 0.6734, + "step": 63250 + }, + { + "epoch": 0.5592390247352322, + "grad_norm": 1.9065940380096436, + "learning_rate": 4.067934958774613e-05, + "loss": 0.7706, + "step": 63260 + }, + { + "epoch": 0.5593274279955445, + "grad_norm": 8.396730422973633, + "learning_rate": 4.067787620007426e-05, + "loss": 0.7225, + "step": 63270 + }, + { + "epoch": 0.5594158312558567, + "grad_norm": 3.9346113204956055, + "learning_rate": 4.067640281240239e-05, + "loss": 0.7234, + "step": 63280 + }, + { + "epoch": 0.5595042345161689, + "grad_norm": 6.4728803634643555, + "learning_rate": 4.067492942473052e-05, + "loss": 0.859, + "step": 63290 + }, + { + "epoch": 0.5595926377764812, + "grad_norm": 6.107116222381592, + "learning_rate": 4.067345603705865e-05, + "loss": 0.7255, + "step": 63300 + }, + { + "epoch": 0.5596810410367934, + "grad_norm": 2.462620973587036, + "learning_rate": 4.067198264938678e-05, + "loss": 0.7193, + "step": 63310 + }, + { + "epoch": 0.5597694442971056, + "grad_norm": 1.311640977859497, + "learning_rate": 4.0670509261714906e-05, + "loss": 0.679, + "step": 63320 + }, + { + "epoch": 0.559857847557418, + "grad_norm": 3.0472991466522217, + "learning_rate": 4.0669035874043034e-05, + "loss": 0.6367, + "step": 63330 + }, + { + "epoch": 0.5599462508177302, + "grad_norm": 3.4147191047668457, + "learning_rate": 4.066756248637116e-05, + "loss": 0.5648, + "step": 63340 + }, + { + "epoch": 0.5600346540780424, + "grad_norm": 5.423051357269287, + "learning_rate": 4.06660890986993e-05, + "loss": 0.5828, + "step": 63350 + }, + { + "epoch": 0.5601230573383547, + "grad_norm": 0.9590580463409424, + "learning_rate": 4.0664615711027426e-05, + "loss": 0.6916, + "step": 63360 + }, + { + "epoch": 0.5602114605986669, + "grad_norm": 1.4947302341461182, + "learning_rate": 4.0663142323355555e-05, + "loss": 0.6561, + "step": 63370 + }, + { + "epoch": 0.5602998638589791, + "grad_norm": 4.569432258605957, + "learning_rate": 4.066166893568368e-05, + "loss": 0.7948, + "step": 63380 + }, + { + "epoch": 0.5603882671192914, + "grad_norm": 1.5504459142684937, + "learning_rate": 4.066019554801181e-05, + "loss": 0.6398, + "step": 63390 + }, + { + "epoch": 0.5604766703796036, + "grad_norm": 10.235306739807129, + "learning_rate": 4.065872216033994e-05, + "loss": 0.7731, + "step": 63400 + }, + { + "epoch": 0.5605650736399158, + "grad_norm": 1.8039636611938477, + "learning_rate": 4.0657248772668075e-05, + "loss": 0.636, + "step": 63410 + }, + { + "epoch": 0.560653476900228, + "grad_norm": 5.058978080749512, + "learning_rate": 4.06557753849962e-05, + "loss": 0.83, + "step": 63420 + }, + { + "epoch": 0.5607418801605403, + "grad_norm": 7.112691402435303, + "learning_rate": 4.065430199732433e-05, + "loss": 0.6408, + "step": 63430 + }, + { + "epoch": 0.5608302834208526, + "grad_norm": 6.672459125518799, + "learning_rate": 4.065282860965246e-05, + "loss": 0.7014, + "step": 63440 + }, + { + "epoch": 0.5609186866811648, + "grad_norm": 6.996494293212891, + "learning_rate": 4.065135522198059e-05, + "loss": 0.6675, + "step": 63450 + }, + { + "epoch": 0.5610070899414771, + "grad_norm": 16.02535629272461, + "learning_rate": 4.0649881834308717e-05, + "loss": 0.621, + "step": 63460 + }, + { + "epoch": 0.5610954932017893, + "grad_norm": 3.3546600341796875, + "learning_rate": 4.064840844663685e-05, + "loss": 0.6007, + "step": 63470 + }, + { + "epoch": 0.5611838964621015, + "grad_norm": 7.53946590423584, + "learning_rate": 4.064693505896497e-05, + "loss": 0.7236, + "step": 63480 + }, + { + "epoch": 0.5612722997224138, + "grad_norm": 6.919145584106445, + "learning_rate": 4.064546167129311e-05, + "loss": 0.773, + "step": 63490 + }, + { + "epoch": 0.561360702982726, + "grad_norm": 12.32744026184082, + "learning_rate": 4.064398828362124e-05, + "loss": 0.7469, + "step": 63500 + }, + { + "epoch": 0.5614491062430382, + "grad_norm": 3.5610527992248535, + "learning_rate": 4.0642514895949365e-05, + "loss": 0.7002, + "step": 63510 + }, + { + "epoch": 0.5615375095033505, + "grad_norm": 2.0038833618164062, + "learning_rate": 4.0641041508277493e-05, + "loss": 0.6595, + "step": 63520 + }, + { + "epoch": 0.5616259127636627, + "grad_norm": 2.8348968029022217, + "learning_rate": 4.063956812060563e-05, + "loss": 0.6184, + "step": 63530 + }, + { + "epoch": 0.5617143160239749, + "grad_norm": 12.131918907165527, + "learning_rate": 4.063809473293375e-05, + "loss": 0.7021, + "step": 63540 + }, + { + "epoch": 0.5618027192842872, + "grad_norm": 8.311182022094727, + "learning_rate": 4.0636621345261885e-05, + "loss": 0.7012, + "step": 63550 + }, + { + "epoch": 0.5618911225445995, + "grad_norm": 5.661182403564453, + "learning_rate": 4.063514795759001e-05, + "loss": 0.7304, + "step": 63560 + }, + { + "epoch": 0.5619795258049117, + "grad_norm": 3.6150553226470947, + "learning_rate": 4.063367456991814e-05, + "loss": 0.6578, + "step": 63570 + }, + { + "epoch": 0.562067929065224, + "grad_norm": 9.561511993408203, + "learning_rate": 4.063220118224627e-05, + "loss": 0.6741, + "step": 63580 + }, + { + "epoch": 0.5621563323255362, + "grad_norm": 7.904484272003174, + "learning_rate": 4.06307277945744e-05, + "loss": 0.7737, + "step": 63590 + }, + { + "epoch": 0.5622447355858484, + "grad_norm": 6.017001628875732, + "learning_rate": 4.062925440690253e-05, + "loss": 0.7488, + "step": 63600 + }, + { + "epoch": 0.5623331388461607, + "grad_norm": 4.022983074188232, + "learning_rate": 4.062778101923066e-05, + "loss": 0.6404, + "step": 63610 + }, + { + "epoch": 0.5624215421064729, + "grad_norm": 2.01179838180542, + "learning_rate": 4.0626307631558784e-05, + "loss": 0.6649, + "step": 63620 + }, + { + "epoch": 0.5625099453667851, + "grad_norm": 1.6342637538909912, + "learning_rate": 4.062483424388692e-05, + "loss": 0.73, + "step": 63630 + }, + { + "epoch": 0.5625983486270973, + "grad_norm": 3.375537395477295, + "learning_rate": 4.062336085621505e-05, + "loss": 0.7265, + "step": 63640 + }, + { + "epoch": 0.5626867518874096, + "grad_norm": 1.0897823572158813, + "learning_rate": 4.0621887468543176e-05, + "loss": 0.6606, + "step": 63650 + }, + { + "epoch": 0.5627751551477218, + "grad_norm": 8.687250137329102, + "learning_rate": 4.0620414080871304e-05, + "loss": 0.72, + "step": 63660 + }, + { + "epoch": 0.562863558408034, + "grad_norm": 7.519293308258057, + "learning_rate": 4.061894069319944e-05, + "loss": 0.7433, + "step": 63670 + }, + { + "epoch": 0.5629519616683464, + "grad_norm": 1.4763296842575073, + "learning_rate": 4.061746730552756e-05, + "loss": 0.5541, + "step": 63680 + }, + { + "epoch": 0.5630403649286586, + "grad_norm": 11.709550857543945, + "learning_rate": 4.0615993917855696e-05, + "loss": 0.7339, + "step": 63690 + }, + { + "epoch": 0.5631287681889708, + "grad_norm": 1.6048792600631714, + "learning_rate": 4.061452053018382e-05, + "loss": 0.5828, + "step": 63700 + }, + { + "epoch": 0.5632171714492831, + "grad_norm": 3.0552163124084473, + "learning_rate": 4.061304714251195e-05, + "loss": 0.7597, + "step": 63710 + }, + { + "epoch": 0.5633055747095953, + "grad_norm": 13.991518020629883, + "learning_rate": 4.061157375484008e-05, + "loss": 0.7509, + "step": 63720 + }, + { + "epoch": 0.5633939779699075, + "grad_norm": 2.3903353214263916, + "learning_rate": 4.061010036716821e-05, + "loss": 0.8877, + "step": 63730 + }, + { + "epoch": 0.5634823812302198, + "grad_norm": 6.726386547088623, + "learning_rate": 4.060862697949634e-05, + "loss": 0.5926, + "step": 63740 + }, + { + "epoch": 0.563570784490532, + "grad_norm": 4.98592472076416, + "learning_rate": 4.060715359182447e-05, + "loss": 0.7932, + "step": 63750 + }, + { + "epoch": 0.5636591877508442, + "grad_norm": 4.511202812194824, + "learning_rate": 4.0605680204152594e-05, + "loss": 0.5619, + "step": 63760 + }, + { + "epoch": 0.5637475910111565, + "grad_norm": 7.471907138824463, + "learning_rate": 4.060420681648073e-05, + "loss": 0.6135, + "step": 63770 + }, + { + "epoch": 0.5638359942714687, + "grad_norm": 15.382930755615234, + "learning_rate": 4.060273342880886e-05, + "loss": 0.7252, + "step": 63780 + }, + { + "epoch": 0.5639243975317809, + "grad_norm": 1.4106415510177612, + "learning_rate": 4.0601260041136986e-05, + "loss": 0.6339, + "step": 63790 + }, + { + "epoch": 0.5640128007920933, + "grad_norm": 3.6871800422668457, + "learning_rate": 4.0599786653465114e-05, + "loss": 0.6325, + "step": 63800 + }, + { + "epoch": 0.5641012040524055, + "grad_norm": 3.1861133575439453, + "learning_rate": 4.059831326579324e-05, + "loss": 0.7985, + "step": 63810 + }, + { + "epoch": 0.5641896073127177, + "grad_norm": 6.78101110458374, + "learning_rate": 4.059683987812137e-05, + "loss": 0.7035, + "step": 63820 + }, + { + "epoch": 0.56427801057303, + "grad_norm": 2.288201093673706, + "learning_rate": 4.0595366490449506e-05, + "loss": 0.6991, + "step": 63830 + }, + { + "epoch": 0.5643664138333422, + "grad_norm": 2.836599826812744, + "learning_rate": 4.059389310277763e-05, + "loss": 0.6553, + "step": 63840 + }, + { + "epoch": 0.5644548170936544, + "grad_norm": 3.3340418338775635, + "learning_rate": 4.059241971510576e-05, + "loss": 0.6236, + "step": 63850 + }, + { + "epoch": 0.5645432203539666, + "grad_norm": 2.1750998497009277, + "learning_rate": 4.059094632743389e-05, + "loss": 0.6451, + "step": 63860 + }, + { + "epoch": 0.5646316236142789, + "grad_norm": 2.46244740486145, + "learning_rate": 4.058947293976202e-05, + "loss": 0.7091, + "step": 63870 + }, + { + "epoch": 0.5647200268745911, + "grad_norm": 3.372239828109741, + "learning_rate": 4.058799955209015e-05, + "loss": 0.7002, + "step": 63880 + }, + { + "epoch": 0.5648084301349033, + "grad_norm": 4.3613362312316895, + "learning_rate": 4.058652616441828e-05, + "loss": 0.5936, + "step": 63890 + }, + { + "epoch": 0.5648968333952156, + "grad_norm": 1.1230272054672241, + "learning_rate": 4.0585052776746405e-05, + "loss": 0.6175, + "step": 63900 + }, + { + "epoch": 0.5649852366555278, + "grad_norm": 1.0587000846862793, + "learning_rate": 4.058357938907454e-05, + "loss": 0.6762, + "step": 63910 + }, + { + "epoch": 0.5650736399158401, + "grad_norm": 2.200409412384033, + "learning_rate": 4.058210600140266e-05, + "loss": 0.8633, + "step": 63920 + }, + { + "epoch": 0.5651620431761524, + "grad_norm": 3.6095399856567383, + "learning_rate": 4.05806326137308e-05, + "loss": 0.7222, + "step": 63930 + }, + { + "epoch": 0.5652504464364646, + "grad_norm": 10.71081829071045, + "learning_rate": 4.0579159226058925e-05, + "loss": 0.7224, + "step": 63940 + }, + { + "epoch": 0.5653388496967768, + "grad_norm": 4.717750072479248, + "learning_rate": 4.057768583838705e-05, + "loss": 0.7204, + "step": 63950 + }, + { + "epoch": 0.5654272529570891, + "grad_norm": 1.6694716215133667, + "learning_rate": 4.057621245071518e-05, + "loss": 0.5246, + "step": 63960 + }, + { + "epoch": 0.5655156562174013, + "grad_norm": 1.9727901220321655, + "learning_rate": 4.057473906304332e-05, + "loss": 0.667, + "step": 63970 + }, + { + "epoch": 0.5656040594777135, + "grad_norm": 2.3130252361297607, + "learning_rate": 4.057326567537144e-05, + "loss": 0.5482, + "step": 63980 + }, + { + "epoch": 0.5656924627380258, + "grad_norm": 5.267376899719238, + "learning_rate": 4.0571792287699574e-05, + "loss": 0.6814, + "step": 63990 + }, + { + "epoch": 0.565780865998338, + "grad_norm": 1.3883095979690552, + "learning_rate": 4.05703189000277e-05, + "loss": 0.5598, + "step": 64000 + }, + { + "epoch": 0.5658692692586502, + "grad_norm": 7.479804039001465, + "learning_rate": 4.056884551235583e-05, + "loss": 0.8481, + "step": 64010 + }, + { + "epoch": 0.5659576725189625, + "grad_norm": 5.832056999206543, + "learning_rate": 4.056737212468396e-05, + "loss": 0.6619, + "step": 64020 + }, + { + "epoch": 0.5660460757792748, + "grad_norm": 4.774280071258545, + "learning_rate": 4.056589873701209e-05, + "loss": 0.6966, + "step": 64030 + }, + { + "epoch": 0.566134479039587, + "grad_norm": 2.387117624282837, + "learning_rate": 4.0564425349340215e-05, + "loss": 0.6996, + "step": 64040 + }, + { + "epoch": 0.5662228822998993, + "grad_norm": 2.4382340908050537, + "learning_rate": 4.056295196166835e-05, + "loss": 0.6519, + "step": 64050 + }, + { + "epoch": 0.5663112855602115, + "grad_norm": 6.238379955291748, + "learning_rate": 4.056147857399647e-05, + "loss": 0.7063, + "step": 64060 + }, + { + "epoch": 0.5663996888205237, + "grad_norm": 3.4802324771881104, + "learning_rate": 4.056000518632461e-05, + "loss": 0.6635, + "step": 64070 + }, + { + "epoch": 0.566488092080836, + "grad_norm": 1.2869523763656616, + "learning_rate": 4.0558531798652736e-05, + "loss": 0.5068, + "step": 64080 + }, + { + "epoch": 0.5665764953411482, + "grad_norm": 2.2430055141448975, + "learning_rate": 4.0557058410980864e-05, + "loss": 0.7223, + "step": 64090 + }, + { + "epoch": 0.5666648986014604, + "grad_norm": 2.5135419368743896, + "learning_rate": 4.055558502330899e-05, + "loss": 0.655, + "step": 64100 + }, + { + "epoch": 0.5667533018617726, + "grad_norm": 6.266421794891357, + "learning_rate": 4.055411163563713e-05, + "loss": 0.7837, + "step": 64110 + }, + { + "epoch": 0.5668417051220849, + "grad_norm": 1.3949779272079468, + "learning_rate": 4.055263824796525e-05, + "loss": 0.6651, + "step": 64120 + }, + { + "epoch": 0.5669301083823971, + "grad_norm": 13.763872146606445, + "learning_rate": 4.0551164860293384e-05, + "loss": 0.6233, + "step": 64130 + }, + { + "epoch": 0.5670185116427093, + "grad_norm": 3.128615379333496, + "learning_rate": 4.054969147262151e-05, + "loss": 0.6717, + "step": 64140 + }, + { + "epoch": 0.5671069149030217, + "grad_norm": 1.4400246143341064, + "learning_rate": 4.054821808494964e-05, + "loss": 0.7256, + "step": 64150 + }, + { + "epoch": 0.5671953181633339, + "grad_norm": 1.9768950939178467, + "learning_rate": 4.054674469727777e-05, + "loss": 0.6783, + "step": 64160 + }, + { + "epoch": 0.5672837214236461, + "grad_norm": 2.8429722785949707, + "learning_rate": 4.05452713096059e-05, + "loss": 0.6701, + "step": 64170 + }, + { + "epoch": 0.5673721246839584, + "grad_norm": 4.377533912658691, + "learning_rate": 4.0543797921934026e-05, + "loss": 0.6689, + "step": 64180 + }, + { + "epoch": 0.5674605279442706, + "grad_norm": 4.206196308135986, + "learning_rate": 4.054232453426216e-05, + "loss": 0.8279, + "step": 64190 + }, + { + "epoch": 0.5675489312045828, + "grad_norm": 5.869049072265625, + "learning_rate": 4.054085114659029e-05, + "loss": 0.7438, + "step": 64200 + }, + { + "epoch": 0.5676373344648951, + "grad_norm": 6.651582717895508, + "learning_rate": 4.053937775891842e-05, + "loss": 0.6973, + "step": 64210 + }, + { + "epoch": 0.5677257377252073, + "grad_norm": 3.322523355484009, + "learning_rate": 4.0537904371246546e-05, + "loss": 0.5148, + "step": 64220 + }, + { + "epoch": 0.5678141409855195, + "grad_norm": 3.9739694595336914, + "learning_rate": 4.0536430983574674e-05, + "loss": 0.7276, + "step": 64230 + }, + { + "epoch": 0.5679025442458318, + "grad_norm": 3.5674448013305664, + "learning_rate": 4.05349575959028e-05, + "loss": 0.7397, + "step": 64240 + }, + { + "epoch": 0.567990947506144, + "grad_norm": 4.198413848876953, + "learning_rate": 4.053348420823094e-05, + "loss": 0.6122, + "step": 64250 + }, + { + "epoch": 0.5680793507664562, + "grad_norm": 1.4144407510757446, + "learning_rate": 4.0532010820559066e-05, + "loss": 0.7536, + "step": 64260 + }, + { + "epoch": 0.5681677540267686, + "grad_norm": 3.42972731590271, + "learning_rate": 4.0530537432887195e-05, + "loss": 0.6837, + "step": 64270 + }, + { + "epoch": 0.5682561572870808, + "grad_norm": 2.9982354640960693, + "learning_rate": 4.052906404521532e-05, + "loss": 0.5659, + "step": 64280 + }, + { + "epoch": 0.568344560547393, + "grad_norm": 6.105149269104004, + "learning_rate": 4.052759065754345e-05, + "loss": 0.8467, + "step": 64290 + }, + { + "epoch": 0.5684329638077052, + "grad_norm": 3.902435779571533, + "learning_rate": 4.052611726987158e-05, + "loss": 0.7616, + "step": 64300 + }, + { + "epoch": 0.5685213670680175, + "grad_norm": 3.6051948070526123, + "learning_rate": 4.052464388219971e-05, + "loss": 0.7347, + "step": 64310 + }, + { + "epoch": 0.5686097703283297, + "grad_norm": 2.8388123512268066, + "learning_rate": 4.052317049452784e-05, + "loss": 0.7009, + "step": 64320 + }, + { + "epoch": 0.5686981735886419, + "grad_norm": 1.8754425048828125, + "learning_rate": 4.052169710685597e-05, + "loss": 0.7271, + "step": 64330 + }, + { + "epoch": 0.5687865768489542, + "grad_norm": 1.294590950012207, + "learning_rate": 4.05202237191841e-05, + "loss": 0.7239, + "step": 64340 + }, + { + "epoch": 0.5688749801092664, + "grad_norm": 6.263574600219727, + "learning_rate": 4.051875033151223e-05, + "loss": 0.6336, + "step": 64350 + }, + { + "epoch": 0.5689633833695786, + "grad_norm": 2.241262674331665, + "learning_rate": 4.0517276943840357e-05, + "loss": 0.6028, + "step": 64360 + }, + { + "epoch": 0.5690517866298909, + "grad_norm": 8.491609573364258, + "learning_rate": 4.0515803556168485e-05, + "loss": 0.7002, + "step": 64370 + }, + { + "epoch": 0.5691401898902031, + "grad_norm": 16.753889083862305, + "learning_rate": 4.051433016849662e-05, + "loss": 0.6694, + "step": 64380 + }, + { + "epoch": 0.5692285931505154, + "grad_norm": 5.131870269775391, + "learning_rate": 4.051285678082474e-05, + "loss": 0.6292, + "step": 64390 + }, + { + "epoch": 0.5693169964108277, + "grad_norm": 1.8675251007080078, + "learning_rate": 4.051138339315288e-05, + "loss": 0.5255, + "step": 64400 + }, + { + "epoch": 0.5694053996711399, + "grad_norm": 5.0697431564331055, + "learning_rate": 4.0509910005481005e-05, + "loss": 0.7475, + "step": 64410 + }, + { + "epoch": 0.5694938029314521, + "grad_norm": 6.377092361450195, + "learning_rate": 4.0508436617809133e-05, + "loss": 0.7615, + "step": 64420 + }, + { + "epoch": 0.5695822061917644, + "grad_norm": 1.8682746887207031, + "learning_rate": 4.050696323013726e-05, + "loss": 0.6611, + "step": 64430 + }, + { + "epoch": 0.5696706094520766, + "grad_norm": 3.793256998062134, + "learning_rate": 4.05054898424654e-05, + "loss": 0.6795, + "step": 64440 + }, + { + "epoch": 0.5697590127123888, + "grad_norm": 8.376814842224121, + "learning_rate": 4.050401645479352e-05, + "loss": 0.675, + "step": 64450 + }, + { + "epoch": 0.569847415972701, + "grad_norm": 4.916607856750488, + "learning_rate": 4.0502543067121654e-05, + "loss": 0.7669, + "step": 64460 + }, + { + "epoch": 0.5699358192330133, + "grad_norm": 1.955283761024475, + "learning_rate": 4.050106967944978e-05, + "loss": 0.6457, + "step": 64470 + }, + { + "epoch": 0.5700242224933255, + "grad_norm": 1.6909441947937012, + "learning_rate": 4.049959629177791e-05, + "loss": 0.5638, + "step": 64480 + }, + { + "epoch": 0.5701126257536377, + "grad_norm": 5.194309234619141, + "learning_rate": 4.049812290410604e-05, + "loss": 0.709, + "step": 64490 + }, + { + "epoch": 0.5702010290139501, + "grad_norm": 1.4350976943969727, + "learning_rate": 4.049664951643417e-05, + "loss": 0.7526, + "step": 64500 + }, + { + "epoch": 0.5702894322742623, + "grad_norm": 1.5561854839324951, + "learning_rate": 4.0495176128762295e-05, + "loss": 0.5437, + "step": 64510 + }, + { + "epoch": 0.5703778355345746, + "grad_norm": 6.050344944000244, + "learning_rate": 4.049370274109043e-05, + "loss": 0.6692, + "step": 64520 + }, + { + "epoch": 0.5704662387948868, + "grad_norm": 2.152381658554077, + "learning_rate": 4.049222935341855e-05, + "loss": 0.4805, + "step": 64530 + }, + { + "epoch": 0.570554642055199, + "grad_norm": 9.078564643859863, + "learning_rate": 4.049075596574669e-05, + "loss": 0.7142, + "step": 64540 + }, + { + "epoch": 0.5706430453155112, + "grad_norm": 3.8883821964263916, + "learning_rate": 4.0489282578074816e-05, + "loss": 0.7341, + "step": 64550 + }, + { + "epoch": 0.5707314485758235, + "grad_norm": 4.619812965393066, + "learning_rate": 4.0487809190402944e-05, + "loss": 0.6385, + "step": 64560 + }, + { + "epoch": 0.5708198518361357, + "grad_norm": 3.550454616546631, + "learning_rate": 4.048633580273107e-05, + "loss": 0.7299, + "step": 64570 + }, + { + "epoch": 0.5709082550964479, + "grad_norm": 10.33322525024414, + "learning_rate": 4.048486241505921e-05, + "loss": 0.7653, + "step": 64580 + }, + { + "epoch": 0.5709966583567602, + "grad_norm": 11.441985130310059, + "learning_rate": 4.048338902738733e-05, + "loss": 0.7106, + "step": 64590 + }, + { + "epoch": 0.5710850616170724, + "grad_norm": 6.242215156555176, + "learning_rate": 4.0481915639715464e-05, + "loss": 0.7193, + "step": 64600 + }, + { + "epoch": 0.5711734648773846, + "grad_norm": 15.218245506286621, + "learning_rate": 4.048044225204359e-05, + "loss": 0.7969, + "step": 64610 + }, + { + "epoch": 0.571261868137697, + "grad_norm": 2.025117874145508, + "learning_rate": 4.047896886437172e-05, + "loss": 0.6353, + "step": 64620 + }, + { + "epoch": 0.5713502713980092, + "grad_norm": 10.602964401245117, + "learning_rate": 4.047749547669985e-05, + "loss": 0.9693, + "step": 64630 + }, + { + "epoch": 0.5714386746583214, + "grad_norm": 2.2587060928344727, + "learning_rate": 4.047602208902798e-05, + "loss": 0.726, + "step": 64640 + }, + { + "epoch": 0.5715270779186337, + "grad_norm": 3.2599239349365234, + "learning_rate": 4.0474548701356106e-05, + "loss": 0.8149, + "step": 64650 + }, + { + "epoch": 0.5716154811789459, + "grad_norm": 4.679561138153076, + "learning_rate": 4.047307531368424e-05, + "loss": 0.7313, + "step": 64660 + }, + { + "epoch": 0.5717038844392581, + "grad_norm": 8.353294372558594, + "learning_rate": 4.047160192601236e-05, + "loss": 0.6951, + "step": 64670 + }, + { + "epoch": 0.5717922876995704, + "grad_norm": 2.8976051807403564, + "learning_rate": 4.04701285383405e-05, + "loss": 0.6465, + "step": 64680 + }, + { + "epoch": 0.5718806909598826, + "grad_norm": 3.4874932765960693, + "learning_rate": 4.0468655150668626e-05, + "loss": 0.6176, + "step": 64690 + }, + { + "epoch": 0.5719690942201948, + "grad_norm": 4.699406623840332, + "learning_rate": 4.0467181762996754e-05, + "loss": 0.66, + "step": 64700 + }, + { + "epoch": 0.572057497480507, + "grad_norm": 2.328927755355835, + "learning_rate": 4.046570837532488e-05, + "loss": 0.7214, + "step": 64710 + }, + { + "epoch": 0.5721459007408193, + "grad_norm": 1.8024733066558838, + "learning_rate": 4.046423498765302e-05, + "loss": 0.5946, + "step": 64720 + }, + { + "epoch": 0.5722343040011315, + "grad_norm": 5.493916034698486, + "learning_rate": 4.046276159998114e-05, + "loss": 0.6911, + "step": 64730 + }, + { + "epoch": 0.5723227072614439, + "grad_norm": 3.101094961166382, + "learning_rate": 4.0461288212309275e-05, + "loss": 0.7828, + "step": 64740 + }, + { + "epoch": 0.5724111105217561, + "grad_norm": 13.01898193359375, + "learning_rate": 4.0459814824637396e-05, + "loss": 0.7767, + "step": 64750 + }, + { + "epoch": 0.5724995137820683, + "grad_norm": 4.371884346008301, + "learning_rate": 4.045834143696553e-05, + "loss": 0.6418, + "step": 64760 + }, + { + "epoch": 0.5725879170423805, + "grad_norm": 1.7412854433059692, + "learning_rate": 4.045686804929366e-05, + "loss": 0.6645, + "step": 64770 + }, + { + "epoch": 0.5726763203026928, + "grad_norm": 3.0693604946136475, + "learning_rate": 4.045539466162179e-05, + "loss": 0.5786, + "step": 64780 + }, + { + "epoch": 0.572764723563005, + "grad_norm": 1.506795883178711, + "learning_rate": 4.0453921273949916e-05, + "loss": 0.6994, + "step": 64790 + }, + { + "epoch": 0.5728531268233172, + "grad_norm": 4.32845401763916, + "learning_rate": 4.045244788627805e-05, + "loss": 0.6652, + "step": 64800 + }, + { + "epoch": 0.5729415300836295, + "grad_norm": 5.896068572998047, + "learning_rate": 4.045097449860617e-05, + "loss": 0.6759, + "step": 64810 + }, + { + "epoch": 0.5730299333439417, + "grad_norm": 8.06838607788086, + "learning_rate": 4.044950111093431e-05, + "loss": 0.7163, + "step": 64820 + }, + { + "epoch": 0.5731183366042539, + "grad_norm": 3.362428665161133, + "learning_rate": 4.0448027723262437e-05, + "loss": 0.8562, + "step": 64830 + }, + { + "epoch": 0.5732067398645662, + "grad_norm": 3.129589319229126, + "learning_rate": 4.0446554335590565e-05, + "loss": 0.4917, + "step": 64840 + }, + { + "epoch": 0.5732951431248784, + "grad_norm": 8.218469619750977, + "learning_rate": 4.044508094791869e-05, + "loss": 0.6536, + "step": 64850 + }, + { + "epoch": 0.5733835463851907, + "grad_norm": 2.535635232925415, + "learning_rate": 4.044360756024682e-05, + "loss": 0.7227, + "step": 64860 + }, + { + "epoch": 0.573471949645503, + "grad_norm": 7.734989643096924, + "learning_rate": 4.044213417257495e-05, + "loss": 0.7018, + "step": 64870 + }, + { + "epoch": 0.5735603529058152, + "grad_norm": 2.6127724647521973, + "learning_rate": 4.0440660784903085e-05, + "loss": 0.7118, + "step": 64880 + }, + { + "epoch": 0.5736487561661274, + "grad_norm": 3.806565761566162, + "learning_rate": 4.043918739723121e-05, + "loss": 0.649, + "step": 64890 + }, + { + "epoch": 0.5737371594264397, + "grad_norm": 4.902356147766113, + "learning_rate": 4.043771400955934e-05, + "loss": 0.6632, + "step": 64900 + }, + { + "epoch": 0.5738255626867519, + "grad_norm": 1.861557960510254, + "learning_rate": 4.043624062188747e-05, + "loss": 0.6707, + "step": 64910 + }, + { + "epoch": 0.5739139659470641, + "grad_norm": 11.613804817199707, + "learning_rate": 4.04347672342156e-05, + "loss": 0.6406, + "step": 64920 + }, + { + "epoch": 0.5740023692073764, + "grad_norm": 1.9614821672439575, + "learning_rate": 4.043329384654373e-05, + "loss": 0.7516, + "step": 64930 + }, + { + "epoch": 0.5740907724676886, + "grad_norm": 1.5052820444107056, + "learning_rate": 4.043182045887186e-05, + "loss": 0.6, + "step": 64940 + }, + { + "epoch": 0.5741791757280008, + "grad_norm": 3.503469467163086, + "learning_rate": 4.0430347071199984e-05, + "loss": 0.7295, + "step": 64950 + }, + { + "epoch": 0.574267578988313, + "grad_norm": 1.8578046560287476, + "learning_rate": 4.042887368352812e-05, + "loss": 0.6088, + "step": 64960 + }, + { + "epoch": 0.5743559822486253, + "grad_norm": 7.115762233734131, + "learning_rate": 4.042740029585625e-05, + "loss": 0.6956, + "step": 64970 + }, + { + "epoch": 0.5744443855089376, + "grad_norm": 3.296905994415283, + "learning_rate": 4.0425926908184375e-05, + "loss": 0.5472, + "step": 64980 + }, + { + "epoch": 0.5745327887692498, + "grad_norm": 4.446994781494141, + "learning_rate": 4.0424453520512504e-05, + "loss": 0.6906, + "step": 64990 + }, + { + "epoch": 0.5746211920295621, + "grad_norm": 2.829014539718628, + "learning_rate": 4.042298013284063e-05, + "loss": 0.7285, + "step": 65000 + }, + { + "epoch": 0.5747095952898743, + "grad_norm": 13.407563209533691, + "learning_rate": 4.042150674516876e-05, + "loss": 0.6594, + "step": 65010 + }, + { + "epoch": 0.5747979985501865, + "grad_norm": 5.668613910675049, + "learning_rate": 4.0420033357496896e-05, + "loss": 0.7137, + "step": 65020 + }, + { + "epoch": 0.5748864018104988, + "grad_norm": 2.314248561859131, + "learning_rate": 4.041855996982502e-05, + "loss": 0.5666, + "step": 65030 + }, + { + "epoch": 0.574974805070811, + "grad_norm": 2.1190857887268066, + "learning_rate": 4.041708658215315e-05, + "loss": 0.7266, + "step": 65040 + }, + { + "epoch": 0.5750632083311232, + "grad_norm": 5.008220672607422, + "learning_rate": 4.041561319448128e-05, + "loss": 0.7779, + "step": 65050 + }, + { + "epoch": 0.5751516115914355, + "grad_norm": 2.3553032875061035, + "learning_rate": 4.041413980680941e-05, + "loss": 0.734, + "step": 65060 + }, + { + "epoch": 0.5752400148517477, + "grad_norm": 2.6983978748321533, + "learning_rate": 4.041266641913754e-05, + "loss": 0.7602, + "step": 65070 + }, + { + "epoch": 0.5753284181120599, + "grad_norm": 5.724380016326904, + "learning_rate": 4.041119303146567e-05, + "loss": 0.6551, + "step": 65080 + }, + { + "epoch": 0.5754168213723723, + "grad_norm": 1.12800931930542, + "learning_rate": 4.0409719643793794e-05, + "loss": 0.663, + "step": 65090 + }, + { + "epoch": 0.5755052246326845, + "grad_norm": 12.098381996154785, + "learning_rate": 4.040824625612193e-05, + "loss": 0.6348, + "step": 65100 + }, + { + "epoch": 0.5755936278929967, + "grad_norm": 1.838953971862793, + "learning_rate": 4.040677286845006e-05, + "loss": 0.8018, + "step": 65110 + }, + { + "epoch": 0.575682031153309, + "grad_norm": 8.700167655944824, + "learning_rate": 4.0405299480778186e-05, + "loss": 0.6953, + "step": 65120 + }, + { + "epoch": 0.5757704344136212, + "grad_norm": 2.6799418926239014, + "learning_rate": 4.0403826093106314e-05, + "loss": 0.6421, + "step": 65130 + }, + { + "epoch": 0.5758588376739334, + "grad_norm": 2.222330331802368, + "learning_rate": 4.040235270543444e-05, + "loss": 0.731, + "step": 65140 + }, + { + "epoch": 0.5759472409342457, + "grad_norm": 6.967453479766846, + "learning_rate": 4.040087931776257e-05, + "loss": 0.607, + "step": 65150 + }, + { + "epoch": 0.5760356441945579, + "grad_norm": 3.4749906063079834, + "learning_rate": 4.0399405930090706e-05, + "loss": 0.4454, + "step": 65160 + }, + { + "epoch": 0.5761240474548701, + "grad_norm": 1.130460262298584, + "learning_rate": 4.0397932542418835e-05, + "loss": 0.6654, + "step": 65170 + }, + { + "epoch": 0.5762124507151823, + "grad_norm": 7.090402126312256, + "learning_rate": 4.039645915474696e-05, + "loss": 0.6064, + "step": 65180 + }, + { + "epoch": 0.5763008539754946, + "grad_norm": 8.348467826843262, + "learning_rate": 4.039498576707509e-05, + "loss": 0.7333, + "step": 65190 + }, + { + "epoch": 0.5763892572358068, + "grad_norm": 2.160935163497925, + "learning_rate": 4.039351237940322e-05, + "loss": 0.5871, + "step": 65200 + }, + { + "epoch": 0.5764776604961191, + "grad_norm": 2.075876474380493, + "learning_rate": 4.039203899173135e-05, + "loss": 0.6993, + "step": 65210 + }, + { + "epoch": 0.5765660637564314, + "grad_norm": 3.337186098098755, + "learning_rate": 4.0390565604059476e-05, + "loss": 0.6739, + "step": 65220 + }, + { + "epoch": 0.5766544670167436, + "grad_norm": 3.3875410556793213, + "learning_rate": 4.038909221638761e-05, + "loss": 0.6239, + "step": 65230 + }, + { + "epoch": 0.5767428702770558, + "grad_norm": 2.0924062728881836, + "learning_rate": 4.038761882871574e-05, + "loss": 0.5542, + "step": 65240 + }, + { + "epoch": 0.5768312735373681, + "grad_norm": 1.5167820453643799, + "learning_rate": 4.038614544104387e-05, + "loss": 0.7349, + "step": 65250 + }, + { + "epoch": 0.5769196767976803, + "grad_norm": 3.1909453868865967, + "learning_rate": 4.0384672053371996e-05, + "loss": 0.745, + "step": 65260 + }, + { + "epoch": 0.5770080800579925, + "grad_norm": 3.2624502182006836, + "learning_rate": 4.0383198665700125e-05, + "loss": 0.6994, + "step": 65270 + }, + { + "epoch": 0.5770964833183048, + "grad_norm": 3.371648073196411, + "learning_rate": 4.038172527802825e-05, + "loss": 0.6332, + "step": 65280 + }, + { + "epoch": 0.577184886578617, + "grad_norm": 5.004847049713135, + "learning_rate": 4.038025189035639e-05, + "loss": 0.6358, + "step": 65290 + }, + { + "epoch": 0.5772732898389292, + "grad_norm": 8.004755973815918, + "learning_rate": 4.037877850268452e-05, + "loss": 0.6212, + "step": 65300 + }, + { + "epoch": 0.5773616930992415, + "grad_norm": 4.12190580368042, + "learning_rate": 4.0377305115012645e-05, + "loss": 0.7918, + "step": 65310 + }, + { + "epoch": 0.5774500963595537, + "grad_norm": 4.382535457611084, + "learning_rate": 4.037583172734077e-05, + "loss": 0.6414, + "step": 65320 + }, + { + "epoch": 0.577538499619866, + "grad_norm": 2.492431640625, + "learning_rate": 4.03743583396689e-05, + "loss": 0.6578, + "step": 65330 + }, + { + "epoch": 0.5776269028801783, + "grad_norm": 10.768872261047363, + "learning_rate": 4.037288495199703e-05, + "loss": 0.6652, + "step": 65340 + }, + { + "epoch": 0.5777153061404905, + "grad_norm": 2.8188974857330322, + "learning_rate": 4.0371411564325165e-05, + "loss": 0.5491, + "step": 65350 + }, + { + "epoch": 0.5778037094008027, + "grad_norm": 31.166112899780273, + "learning_rate": 4.036993817665329e-05, + "loss": 0.7656, + "step": 65360 + }, + { + "epoch": 0.577892112661115, + "grad_norm": 3.75449538230896, + "learning_rate": 4.036846478898142e-05, + "loss": 0.575, + "step": 65370 + }, + { + "epoch": 0.5779805159214272, + "grad_norm": 4.671249866485596, + "learning_rate": 4.036699140130955e-05, + "loss": 0.6896, + "step": 65380 + }, + { + "epoch": 0.5780689191817394, + "grad_norm": 12.375015258789062, + "learning_rate": 4.036551801363768e-05, + "loss": 0.6865, + "step": 65390 + }, + { + "epoch": 0.5781573224420516, + "grad_norm": 3.042642831802368, + "learning_rate": 4.036404462596581e-05, + "loss": 0.5746, + "step": 65400 + }, + { + "epoch": 0.5782457257023639, + "grad_norm": 4.171643257141113, + "learning_rate": 4.036257123829394e-05, + "loss": 0.6592, + "step": 65410 + }, + { + "epoch": 0.5783341289626761, + "grad_norm": 2.997007131576538, + "learning_rate": 4.0361097850622064e-05, + "loss": 0.7502, + "step": 65420 + }, + { + "epoch": 0.5784225322229883, + "grad_norm": 5.75356388092041, + "learning_rate": 4.03596244629502e-05, + "loss": 0.6929, + "step": 65430 + }, + { + "epoch": 0.5785109354833006, + "grad_norm": 2.511322021484375, + "learning_rate": 4.035815107527833e-05, + "loss": 0.6957, + "step": 65440 + }, + { + "epoch": 0.5785993387436129, + "grad_norm": 1.8392339944839478, + "learning_rate": 4.0356677687606456e-05, + "loss": 0.6734, + "step": 65450 + }, + { + "epoch": 0.5786877420039251, + "grad_norm": 5.3239665031433105, + "learning_rate": 4.0355204299934584e-05, + "loss": 0.7657, + "step": 65460 + }, + { + "epoch": 0.5787761452642374, + "grad_norm": 4.524316787719727, + "learning_rate": 4.035373091226271e-05, + "loss": 0.6809, + "step": 65470 + }, + { + "epoch": 0.5788645485245496, + "grad_norm": 1.9566243886947632, + "learning_rate": 4.035225752459084e-05, + "loss": 0.6773, + "step": 65480 + }, + { + "epoch": 0.5789529517848618, + "grad_norm": 7.495596885681152, + "learning_rate": 4.0350784136918976e-05, + "loss": 0.7635, + "step": 65490 + }, + { + "epoch": 0.5790413550451741, + "grad_norm": 1.264648675918579, + "learning_rate": 4.03493107492471e-05, + "loss": 0.689, + "step": 65500 + }, + { + "epoch": 0.5791297583054863, + "grad_norm": 2.051647424697876, + "learning_rate": 4.034783736157523e-05, + "loss": 0.5404, + "step": 65510 + }, + { + "epoch": 0.5792181615657985, + "grad_norm": 3.8799514770507812, + "learning_rate": 4.034636397390336e-05, + "loss": 0.6113, + "step": 65520 + }, + { + "epoch": 0.5793065648261108, + "grad_norm": 2.673724412918091, + "learning_rate": 4.034489058623149e-05, + "loss": 0.6334, + "step": 65530 + }, + { + "epoch": 0.579394968086423, + "grad_norm": 2.2784740924835205, + "learning_rate": 4.034341719855962e-05, + "loss": 0.6283, + "step": 65540 + }, + { + "epoch": 0.5794833713467352, + "grad_norm": 4.802986145019531, + "learning_rate": 4.034194381088775e-05, + "loss": 0.7282, + "step": 65550 + }, + { + "epoch": 0.5795717746070476, + "grad_norm": 16.573030471801758, + "learning_rate": 4.0340470423215874e-05, + "loss": 0.6995, + "step": 65560 + }, + { + "epoch": 0.5796601778673598, + "grad_norm": 5.87691593170166, + "learning_rate": 4.033899703554401e-05, + "loss": 0.778, + "step": 65570 + }, + { + "epoch": 0.579748581127672, + "grad_norm": 3.7554335594177246, + "learning_rate": 4.033752364787213e-05, + "loss": 0.6854, + "step": 65580 + }, + { + "epoch": 0.5798369843879843, + "grad_norm": 6.366271018981934, + "learning_rate": 4.0336050260200266e-05, + "loss": 0.8265, + "step": 65590 + }, + { + "epoch": 0.5799253876482965, + "grad_norm": 8.081609725952148, + "learning_rate": 4.0334576872528394e-05, + "loss": 0.7417, + "step": 65600 + }, + { + "epoch": 0.5800137909086087, + "grad_norm": 1.2006568908691406, + "learning_rate": 4.033310348485652e-05, + "loss": 0.6332, + "step": 65610 + }, + { + "epoch": 0.580102194168921, + "grad_norm": 15.604028701782227, + "learning_rate": 4.033163009718465e-05, + "loss": 0.7658, + "step": 65620 + }, + { + "epoch": 0.5801905974292332, + "grad_norm": 4.313818454742432, + "learning_rate": 4.0330156709512786e-05, + "loss": 0.6091, + "step": 65630 + }, + { + "epoch": 0.5802790006895454, + "grad_norm": 5.838403701782227, + "learning_rate": 4.032868332184091e-05, + "loss": 0.5807, + "step": 65640 + }, + { + "epoch": 0.5803674039498576, + "grad_norm": 7.906291484832764, + "learning_rate": 4.032720993416904e-05, + "loss": 0.7414, + "step": 65650 + }, + { + "epoch": 0.5804558072101699, + "grad_norm": 3.6370160579681396, + "learning_rate": 4.032573654649717e-05, + "loss": 0.673, + "step": 65660 + }, + { + "epoch": 0.5805442104704821, + "grad_norm": 1.5348896980285645, + "learning_rate": 4.03242631588253e-05, + "loss": 0.6197, + "step": 65670 + }, + { + "epoch": 0.5806326137307944, + "grad_norm": 3.536381959915161, + "learning_rate": 4.032278977115343e-05, + "loss": 0.6769, + "step": 65680 + }, + { + "epoch": 0.5807210169911067, + "grad_norm": 5.558623313903809, + "learning_rate": 4.0321316383481556e-05, + "loss": 0.64, + "step": 65690 + }, + { + "epoch": 0.5808094202514189, + "grad_norm": 1.8969066143035889, + "learning_rate": 4.0319842995809685e-05, + "loss": 0.554, + "step": 65700 + }, + { + "epoch": 0.5808978235117311, + "grad_norm": 2.612123727798462, + "learning_rate": 4.031836960813782e-05, + "loss": 0.6549, + "step": 65710 + }, + { + "epoch": 0.5809862267720434, + "grad_norm": 4.810793876647949, + "learning_rate": 4.031689622046594e-05, + "loss": 0.8441, + "step": 65720 + }, + { + "epoch": 0.5810746300323556, + "grad_norm": 2.560311794281006, + "learning_rate": 4.0315422832794077e-05, + "loss": 0.593, + "step": 65730 + }, + { + "epoch": 0.5811630332926678, + "grad_norm": 2.641464948654175, + "learning_rate": 4.0313949445122205e-05, + "loss": 0.6973, + "step": 65740 + }, + { + "epoch": 0.5812514365529801, + "grad_norm": 1.3652746677398682, + "learning_rate": 4.031247605745033e-05, + "loss": 0.6517, + "step": 65750 + }, + { + "epoch": 0.5813398398132923, + "grad_norm": 4.5376811027526855, + "learning_rate": 4.031100266977846e-05, + "loss": 0.5679, + "step": 65760 + }, + { + "epoch": 0.5814282430736045, + "grad_norm": 1.8268612623214722, + "learning_rate": 4.03095292821066e-05, + "loss": 0.7254, + "step": 65770 + }, + { + "epoch": 0.5815166463339168, + "grad_norm": 6.924813747406006, + "learning_rate": 4.030805589443472e-05, + "loss": 0.6783, + "step": 65780 + }, + { + "epoch": 0.581605049594229, + "grad_norm": 10.423727989196777, + "learning_rate": 4.0306582506762853e-05, + "loss": 0.75, + "step": 65790 + }, + { + "epoch": 0.5816934528545413, + "grad_norm": 4.0514631271362305, + "learning_rate": 4.0305109119090975e-05, + "loss": 0.6846, + "step": 65800 + }, + { + "epoch": 0.5817818561148536, + "grad_norm": 17.880250930786133, + "learning_rate": 4.030363573141911e-05, + "loss": 0.7167, + "step": 65810 + }, + { + "epoch": 0.5818702593751658, + "grad_norm": 3.270742177963257, + "learning_rate": 4.030216234374724e-05, + "loss": 0.6176, + "step": 65820 + }, + { + "epoch": 0.581958662635478, + "grad_norm": 4.598825931549072, + "learning_rate": 4.030068895607537e-05, + "loss": 0.7393, + "step": 65830 + }, + { + "epoch": 0.5820470658957902, + "grad_norm": 3.6737895011901855, + "learning_rate": 4.0299215568403495e-05, + "loss": 0.6977, + "step": 65840 + }, + { + "epoch": 0.5821354691561025, + "grad_norm": 7.075663089752197, + "learning_rate": 4.029774218073163e-05, + "loss": 0.6143, + "step": 65850 + }, + { + "epoch": 0.5822238724164147, + "grad_norm": 3.0455093383789062, + "learning_rate": 4.029626879305975e-05, + "loss": 0.7357, + "step": 65860 + }, + { + "epoch": 0.5823122756767269, + "grad_norm": 2.9687275886535645, + "learning_rate": 4.029479540538789e-05, + "loss": 0.6443, + "step": 65870 + }, + { + "epoch": 0.5824006789370392, + "grad_norm": 6.235419273376465, + "learning_rate": 4.0293322017716015e-05, + "loss": 0.767, + "step": 65880 + }, + { + "epoch": 0.5824890821973514, + "grad_norm": 8.538867950439453, + "learning_rate": 4.0291848630044144e-05, + "loss": 0.6702, + "step": 65890 + }, + { + "epoch": 0.5825774854576636, + "grad_norm": 3.7236874103546143, + "learning_rate": 4.029037524237227e-05, + "loss": 0.8373, + "step": 65900 + }, + { + "epoch": 0.5826658887179759, + "grad_norm": 3.416780948638916, + "learning_rate": 4.028890185470041e-05, + "loss": 0.6486, + "step": 65910 + }, + { + "epoch": 0.5827542919782882, + "grad_norm": 2.2044196128845215, + "learning_rate": 4.028742846702853e-05, + "loss": 0.5961, + "step": 65920 + }, + { + "epoch": 0.5828426952386004, + "grad_norm": 1.9446840286254883, + "learning_rate": 4.0285955079356664e-05, + "loss": 0.6601, + "step": 65930 + }, + { + "epoch": 0.5829310984989127, + "grad_norm": 1.5652238130569458, + "learning_rate": 4.0284481691684786e-05, + "loss": 0.6993, + "step": 65940 + }, + { + "epoch": 0.5830195017592249, + "grad_norm": 1.8059438467025757, + "learning_rate": 4.028300830401292e-05, + "loss": 0.7818, + "step": 65950 + }, + { + "epoch": 0.5831079050195371, + "grad_norm": 2.366891860961914, + "learning_rate": 4.028153491634105e-05, + "loss": 0.6826, + "step": 65960 + }, + { + "epoch": 0.5831963082798494, + "grad_norm": 5.274660110473633, + "learning_rate": 4.028006152866918e-05, + "loss": 0.7629, + "step": 65970 + }, + { + "epoch": 0.5832847115401616, + "grad_norm": 6.935770511627197, + "learning_rate": 4.0278588140997306e-05, + "loss": 0.7826, + "step": 65980 + }, + { + "epoch": 0.5833731148004738, + "grad_norm": 3.2195160388946533, + "learning_rate": 4.027711475332544e-05, + "loss": 0.7378, + "step": 65990 + }, + { + "epoch": 0.583461518060786, + "grad_norm": 3.6363794803619385, + "learning_rate": 4.027564136565357e-05, + "loss": 0.6644, + "step": 66000 + }, + { + "epoch": 0.5835499213210983, + "grad_norm": 4.602499961853027, + "learning_rate": 4.02741679779817e-05, + "loss": 0.6812, + "step": 66010 + }, + { + "epoch": 0.5836383245814105, + "grad_norm": 12.960406303405762, + "learning_rate": 4.0272694590309826e-05, + "loss": 0.5673, + "step": 66020 + }, + { + "epoch": 0.5837267278417227, + "grad_norm": 3.3600995540618896, + "learning_rate": 4.0271221202637954e-05, + "loss": 0.6716, + "step": 66030 + }, + { + "epoch": 0.5838151311020351, + "grad_norm": 1.669277310371399, + "learning_rate": 4.026974781496608e-05, + "loss": 0.5967, + "step": 66040 + }, + { + "epoch": 0.5839035343623473, + "grad_norm": 1.8807207345962524, + "learning_rate": 4.026827442729421e-05, + "loss": 0.6004, + "step": 66050 + }, + { + "epoch": 0.5839919376226596, + "grad_norm": 4.403038024902344, + "learning_rate": 4.0266801039622346e-05, + "loss": 0.6126, + "step": 66060 + }, + { + "epoch": 0.5840803408829718, + "grad_norm": 1.1700714826583862, + "learning_rate": 4.0265327651950474e-05, + "loss": 0.7059, + "step": 66070 + }, + { + "epoch": 0.584168744143284, + "grad_norm": 4.6222662925720215, + "learning_rate": 4.02638542642786e-05, + "loss": 0.6469, + "step": 66080 + }, + { + "epoch": 0.5842571474035962, + "grad_norm": 1.3058925867080688, + "learning_rate": 4.026238087660673e-05, + "loss": 0.8034, + "step": 66090 + }, + { + "epoch": 0.5843455506639085, + "grad_norm": 4.285163879394531, + "learning_rate": 4.026090748893486e-05, + "loss": 0.6897, + "step": 66100 + }, + { + "epoch": 0.5844339539242207, + "grad_norm": 1.457245111465454, + "learning_rate": 4.025943410126299e-05, + "loss": 0.7481, + "step": 66110 + }, + { + "epoch": 0.5845223571845329, + "grad_norm": 11.897272109985352, + "learning_rate": 4.025796071359112e-05, + "loss": 0.754, + "step": 66120 + }, + { + "epoch": 0.5846107604448452, + "grad_norm": 4.249215126037598, + "learning_rate": 4.025648732591925e-05, + "loss": 0.7145, + "step": 66130 + }, + { + "epoch": 0.5846991637051574, + "grad_norm": 7.60205078125, + "learning_rate": 4.025501393824738e-05, + "loss": 0.7374, + "step": 66140 + }, + { + "epoch": 0.5847875669654697, + "grad_norm": 3.8163368701934814, + "learning_rate": 4.025354055057551e-05, + "loss": 0.8132, + "step": 66150 + }, + { + "epoch": 0.584875970225782, + "grad_norm": 3.2086496353149414, + "learning_rate": 4.0252067162903636e-05, + "loss": 0.7485, + "step": 66160 + }, + { + "epoch": 0.5849643734860942, + "grad_norm": 2.357443332672119, + "learning_rate": 4.0250593775231765e-05, + "loss": 0.5496, + "step": 66170 + }, + { + "epoch": 0.5850527767464064, + "grad_norm": 9.534817695617676, + "learning_rate": 4.02491203875599e-05, + "loss": 0.6263, + "step": 66180 + }, + { + "epoch": 0.5851411800067187, + "grad_norm": 6.535423755645752, + "learning_rate": 4.024764699988802e-05, + "loss": 0.7151, + "step": 66190 + }, + { + "epoch": 0.5852295832670309, + "grad_norm": 4.886234760284424, + "learning_rate": 4.024617361221616e-05, + "loss": 0.6801, + "step": 66200 + }, + { + "epoch": 0.5853179865273431, + "grad_norm": 1.1482465267181396, + "learning_rate": 4.0244700224544285e-05, + "loss": 0.6077, + "step": 66210 + }, + { + "epoch": 0.5854063897876554, + "grad_norm": 7.958798408508301, + "learning_rate": 4.024322683687241e-05, + "loss": 0.6062, + "step": 66220 + }, + { + "epoch": 0.5854947930479676, + "grad_norm": 5.445904731750488, + "learning_rate": 4.024175344920054e-05, + "loss": 0.7115, + "step": 66230 + }, + { + "epoch": 0.5855831963082798, + "grad_norm": 5.273890972137451, + "learning_rate": 4.024028006152868e-05, + "loss": 0.5686, + "step": 66240 + }, + { + "epoch": 0.585671599568592, + "grad_norm": 1.4359538555145264, + "learning_rate": 4.02388066738568e-05, + "loss": 0.7021, + "step": 66250 + }, + { + "epoch": 0.5857600028289043, + "grad_norm": 3.0926342010498047, + "learning_rate": 4.0237333286184934e-05, + "loss": 0.6771, + "step": 66260 + }, + { + "epoch": 0.5858484060892166, + "grad_norm": 1.8125255107879639, + "learning_rate": 4.0235859898513055e-05, + "loss": 0.7517, + "step": 66270 + }, + { + "epoch": 0.5859368093495289, + "grad_norm": 7.297733783721924, + "learning_rate": 4.023438651084119e-05, + "loss": 0.7357, + "step": 66280 + }, + { + "epoch": 0.5860252126098411, + "grad_norm": 2.663259267807007, + "learning_rate": 4.023291312316932e-05, + "loss": 0.5976, + "step": 66290 + }, + { + "epoch": 0.5861136158701533, + "grad_norm": 7.248925685882568, + "learning_rate": 4.023143973549745e-05, + "loss": 0.6108, + "step": 66300 + }, + { + "epoch": 0.5862020191304655, + "grad_norm": 5.889019966125488, + "learning_rate": 4.0229966347825575e-05, + "loss": 0.671, + "step": 66310 + }, + { + "epoch": 0.5862904223907778, + "grad_norm": 6.2864990234375, + "learning_rate": 4.022849296015371e-05, + "loss": 0.7183, + "step": 66320 + }, + { + "epoch": 0.58637882565109, + "grad_norm": 5.465065002441406, + "learning_rate": 4.022701957248183e-05, + "loss": 0.7012, + "step": 66330 + }, + { + "epoch": 0.5864672289114022, + "grad_norm": 4.232016563415527, + "learning_rate": 4.022554618480997e-05, + "loss": 0.6973, + "step": 66340 + }, + { + "epoch": 0.5865556321717145, + "grad_norm": 4.214752674102783, + "learning_rate": 4.0224072797138095e-05, + "loss": 0.8033, + "step": 66350 + }, + { + "epoch": 0.5866440354320267, + "grad_norm": 6.341785430908203, + "learning_rate": 4.0222599409466224e-05, + "loss": 0.6417, + "step": 66360 + }, + { + "epoch": 0.5867324386923389, + "grad_norm": 1.4448235034942627, + "learning_rate": 4.022112602179435e-05, + "loss": 0.6446, + "step": 66370 + }, + { + "epoch": 0.5868208419526512, + "grad_norm": 4.314273834228516, + "learning_rate": 4.021965263412249e-05, + "loss": 0.7746, + "step": 66380 + }, + { + "epoch": 0.5869092452129635, + "grad_norm": 2.4736480712890625, + "learning_rate": 4.021817924645061e-05, + "loss": 0.5755, + "step": 66390 + }, + { + "epoch": 0.5869976484732757, + "grad_norm": 3.5737013816833496, + "learning_rate": 4.0216705858778744e-05, + "loss": 0.7399, + "step": 66400 + }, + { + "epoch": 0.587086051733588, + "grad_norm": 7.623898029327393, + "learning_rate": 4.0215232471106866e-05, + "loss": 0.7194, + "step": 66410 + }, + { + "epoch": 0.5871744549939002, + "grad_norm": 2.8788838386535645, + "learning_rate": 4.0213759083435e-05, + "loss": 0.6291, + "step": 66420 + }, + { + "epoch": 0.5872628582542124, + "grad_norm": 4.522538185119629, + "learning_rate": 4.021228569576313e-05, + "loss": 0.6819, + "step": 66430 + }, + { + "epoch": 0.5873512615145247, + "grad_norm": 2.913949966430664, + "learning_rate": 4.021081230809126e-05, + "loss": 0.784, + "step": 66440 + }, + { + "epoch": 0.5874396647748369, + "grad_norm": 11.622467994689941, + "learning_rate": 4.0209338920419386e-05, + "loss": 0.663, + "step": 66450 + }, + { + "epoch": 0.5875280680351491, + "grad_norm": 1.5301315784454346, + "learning_rate": 4.020786553274752e-05, + "loss": 0.5763, + "step": 66460 + }, + { + "epoch": 0.5876164712954614, + "grad_norm": 1.5925779342651367, + "learning_rate": 4.020639214507564e-05, + "loss": 0.7092, + "step": 66470 + }, + { + "epoch": 0.5877048745557736, + "grad_norm": 2.24004864692688, + "learning_rate": 4.020491875740378e-05, + "loss": 0.7717, + "step": 66480 + }, + { + "epoch": 0.5877932778160858, + "grad_norm": 9.111332893371582, + "learning_rate": 4.0203445369731906e-05, + "loss": 0.657, + "step": 66490 + }, + { + "epoch": 0.587881681076398, + "grad_norm": 4.095296382904053, + "learning_rate": 4.0201971982060034e-05, + "loss": 0.6275, + "step": 66500 + }, + { + "epoch": 0.5879700843367104, + "grad_norm": 1.959965705871582, + "learning_rate": 4.020049859438816e-05, + "loss": 0.6269, + "step": 66510 + }, + { + "epoch": 0.5880584875970226, + "grad_norm": 5.743034362792969, + "learning_rate": 4.019902520671629e-05, + "loss": 0.6759, + "step": 66520 + }, + { + "epoch": 0.5881468908573348, + "grad_norm": 2.343559503555298, + "learning_rate": 4.019755181904442e-05, + "loss": 0.7763, + "step": 66530 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 4.0077362060546875, + "learning_rate": 4.0196078431372555e-05, + "loss": 0.7834, + "step": 66540 + }, + { + "epoch": 0.5883236973779593, + "grad_norm": 4.8376946449279785, + "learning_rate": 4.0194605043700676e-05, + "loss": 0.6889, + "step": 66550 + }, + { + "epoch": 0.5884121006382715, + "grad_norm": 4.436862468719482, + "learning_rate": 4.019313165602881e-05, + "loss": 0.5604, + "step": 66560 + }, + { + "epoch": 0.5885005038985838, + "grad_norm": 18.047977447509766, + "learning_rate": 4.019165826835694e-05, + "loss": 0.7222, + "step": 66570 + }, + { + "epoch": 0.588588907158896, + "grad_norm": 0.8846916556358337, + "learning_rate": 4.019018488068507e-05, + "loss": 0.6758, + "step": 66580 + }, + { + "epoch": 0.5886773104192082, + "grad_norm": 2.162344217300415, + "learning_rate": 4.0188711493013196e-05, + "loss": 0.6119, + "step": 66590 + }, + { + "epoch": 0.5887657136795205, + "grad_norm": 1.688251256942749, + "learning_rate": 4.018723810534133e-05, + "loss": 0.8133, + "step": 66600 + }, + { + "epoch": 0.5888541169398327, + "grad_norm": 4.278174877166748, + "learning_rate": 4.018576471766945e-05, + "loss": 0.8368, + "step": 66610 + }, + { + "epoch": 0.588942520200145, + "grad_norm": 6.618953227996826, + "learning_rate": 4.018429132999759e-05, + "loss": 0.7329, + "step": 66620 + }, + { + "epoch": 0.5890309234604573, + "grad_norm": 8.79420280456543, + "learning_rate": 4.018281794232571e-05, + "loss": 0.6397, + "step": 66630 + }, + { + "epoch": 0.5891193267207695, + "grad_norm": 4.479436874389648, + "learning_rate": 4.0181344554653845e-05, + "loss": 0.7685, + "step": 66640 + }, + { + "epoch": 0.5892077299810817, + "grad_norm": 0.97049880027771, + "learning_rate": 4.017987116698197e-05, + "loss": 0.6728, + "step": 66650 + }, + { + "epoch": 0.589296133241394, + "grad_norm": 2.2249491214752197, + "learning_rate": 4.01783977793101e-05, + "loss": 0.5655, + "step": 66660 + }, + { + "epoch": 0.5893845365017062, + "grad_norm": 6.43360710144043, + "learning_rate": 4.017692439163823e-05, + "loss": 0.6488, + "step": 66670 + }, + { + "epoch": 0.5894729397620184, + "grad_norm": 2.6443846225738525, + "learning_rate": 4.0175451003966365e-05, + "loss": 0.7102, + "step": 66680 + }, + { + "epoch": 0.5895613430223307, + "grad_norm": 5.7051286697387695, + "learning_rate": 4.017397761629449e-05, + "loss": 0.8128, + "step": 66690 + }, + { + "epoch": 0.5896497462826429, + "grad_norm": 1.7258425951004028, + "learning_rate": 4.017250422862262e-05, + "loss": 0.7701, + "step": 66700 + }, + { + "epoch": 0.5897381495429551, + "grad_norm": 3.6739070415496826, + "learning_rate": 4.017103084095075e-05, + "loss": 0.5741, + "step": 66710 + }, + { + "epoch": 0.5898265528032673, + "grad_norm": 1.321820855140686, + "learning_rate": 4.016955745327888e-05, + "loss": 0.622, + "step": 66720 + }, + { + "epoch": 0.5899149560635796, + "grad_norm": 2.5632264614105225, + "learning_rate": 4.016808406560701e-05, + "loss": 0.5827, + "step": 66730 + }, + { + "epoch": 0.5900033593238919, + "grad_norm": 2.0769879817962646, + "learning_rate": 4.0166610677935135e-05, + "loss": 0.7996, + "step": 66740 + }, + { + "epoch": 0.5900917625842041, + "grad_norm": 6.099169731140137, + "learning_rate": 4.0165137290263264e-05, + "loss": 0.7308, + "step": 66750 + }, + { + "epoch": 0.5901801658445164, + "grad_norm": 17.474607467651367, + "learning_rate": 4.01636639025914e-05, + "loss": 0.6336, + "step": 66760 + }, + { + "epoch": 0.5902685691048286, + "grad_norm": 4.201455116271973, + "learning_rate": 4.016219051491952e-05, + "loss": 0.7295, + "step": 66770 + }, + { + "epoch": 0.5903569723651408, + "grad_norm": 3.24267315864563, + "learning_rate": 4.0160717127247655e-05, + "loss": 0.6257, + "step": 66780 + }, + { + "epoch": 0.5904453756254531, + "grad_norm": 7.121216773986816, + "learning_rate": 4.0159243739575784e-05, + "loss": 0.6715, + "step": 66790 + }, + { + "epoch": 0.5905337788857653, + "grad_norm": 1.9864522218704224, + "learning_rate": 4.015777035190391e-05, + "loss": 0.7091, + "step": 66800 + }, + { + "epoch": 0.5906221821460775, + "grad_norm": 1.4397283792495728, + "learning_rate": 4.015629696423204e-05, + "loss": 0.7288, + "step": 66810 + }, + { + "epoch": 0.5907105854063898, + "grad_norm": 10.089944839477539, + "learning_rate": 4.0154823576560176e-05, + "loss": 0.5199, + "step": 66820 + }, + { + "epoch": 0.590798988666702, + "grad_norm": 2.347733974456787, + "learning_rate": 4.01533501888883e-05, + "loss": 0.7035, + "step": 66830 + }, + { + "epoch": 0.5908873919270142, + "grad_norm": 6.436496257781982, + "learning_rate": 4.015187680121643e-05, + "loss": 0.7507, + "step": 66840 + }, + { + "epoch": 0.5909757951873265, + "grad_norm": 2.4167637825012207, + "learning_rate": 4.015040341354456e-05, + "loss": 0.6771, + "step": 66850 + }, + { + "epoch": 0.5910641984476388, + "grad_norm": 18.951627731323242, + "learning_rate": 4.014893002587269e-05, + "loss": 0.7464, + "step": 66860 + }, + { + "epoch": 0.591152601707951, + "grad_norm": 3.278839588165283, + "learning_rate": 4.014745663820082e-05, + "loss": 0.6733, + "step": 66870 + }, + { + "epoch": 0.5912410049682633, + "grad_norm": 1.3182076215744019, + "learning_rate": 4.0145983250528946e-05, + "loss": 0.5269, + "step": 66880 + }, + { + "epoch": 0.5913294082285755, + "grad_norm": 15.219719886779785, + "learning_rate": 4.0144509862857074e-05, + "loss": 0.6556, + "step": 66890 + }, + { + "epoch": 0.5914178114888877, + "grad_norm": 2.8584930896759033, + "learning_rate": 4.014303647518521e-05, + "loss": 0.6389, + "step": 66900 + }, + { + "epoch": 0.5915062147492, + "grad_norm": 2.181023597717285, + "learning_rate": 4.014156308751334e-05, + "loss": 0.7625, + "step": 66910 + }, + { + "epoch": 0.5915946180095122, + "grad_norm": 11.575806617736816, + "learning_rate": 4.0140089699841466e-05, + "loss": 0.6453, + "step": 66920 + }, + { + "epoch": 0.5916830212698244, + "grad_norm": 4.198673248291016, + "learning_rate": 4.0138616312169594e-05, + "loss": 0.7204, + "step": 66930 + }, + { + "epoch": 0.5917714245301366, + "grad_norm": 6.410655975341797, + "learning_rate": 4.013714292449772e-05, + "loss": 0.7021, + "step": 66940 + }, + { + "epoch": 0.5918598277904489, + "grad_norm": 5.004095077514648, + "learning_rate": 4.013566953682585e-05, + "loss": 0.8387, + "step": 66950 + }, + { + "epoch": 0.5919482310507611, + "grad_norm": 4.536837577819824, + "learning_rate": 4.0134196149153986e-05, + "loss": 0.621, + "step": 66960 + }, + { + "epoch": 0.5920366343110733, + "grad_norm": 2.161701202392578, + "learning_rate": 4.0132722761482114e-05, + "loss": 0.603, + "step": 66970 + }, + { + "epoch": 0.5921250375713857, + "grad_norm": 2.361048698425293, + "learning_rate": 4.013124937381024e-05, + "loss": 0.7566, + "step": 66980 + }, + { + "epoch": 0.5922134408316979, + "grad_norm": 13.218186378479004, + "learning_rate": 4.012977598613837e-05, + "loss": 0.6942, + "step": 66990 + }, + { + "epoch": 0.5923018440920101, + "grad_norm": 2.0734739303588867, + "learning_rate": 4.01283025984665e-05, + "loss": 0.6897, + "step": 67000 + }, + { + "epoch": 0.5923902473523224, + "grad_norm": 3.272113561630249, + "learning_rate": 4.012682921079463e-05, + "loss": 0.572, + "step": 67010 + }, + { + "epoch": 0.5924786506126346, + "grad_norm": 2.3308887481689453, + "learning_rate": 4.0125355823122756e-05, + "loss": 0.5531, + "step": 67020 + }, + { + "epoch": 0.5925670538729468, + "grad_norm": 4.321115493774414, + "learning_rate": 4.012388243545089e-05, + "loss": 0.5591, + "step": 67030 + }, + { + "epoch": 0.5926554571332591, + "grad_norm": 9.157179832458496, + "learning_rate": 4.012240904777902e-05, + "loss": 0.6473, + "step": 67040 + }, + { + "epoch": 0.5927438603935713, + "grad_norm": 3.5586681365966797, + "learning_rate": 4.012093566010715e-05, + "loss": 0.7633, + "step": 67050 + }, + { + "epoch": 0.5928322636538835, + "grad_norm": 3.897705078125, + "learning_rate": 4.0119462272435276e-05, + "loss": 0.7359, + "step": 67060 + }, + { + "epoch": 0.5929206669141958, + "grad_norm": 4.001584053039551, + "learning_rate": 4.0117988884763405e-05, + "loss": 0.7523, + "step": 67070 + }, + { + "epoch": 0.593009070174508, + "grad_norm": 1.5126879215240479, + "learning_rate": 4.011651549709153e-05, + "loss": 0.6526, + "step": 67080 + }, + { + "epoch": 0.5930974734348202, + "grad_norm": 1.7397924661636353, + "learning_rate": 4.011504210941967e-05, + "loss": 0.6974, + "step": 67090 + }, + { + "epoch": 0.5931858766951326, + "grad_norm": 6.736157417297363, + "learning_rate": 4.011356872174779e-05, + "loss": 0.7651, + "step": 67100 + }, + { + "epoch": 0.5932742799554448, + "grad_norm": 4.01091194152832, + "learning_rate": 4.0112095334075925e-05, + "loss": 0.6184, + "step": 67110 + }, + { + "epoch": 0.593362683215757, + "grad_norm": 3.457923173904419, + "learning_rate": 4.011062194640405e-05, + "loss": 0.6837, + "step": 67120 + }, + { + "epoch": 0.5934510864760693, + "grad_norm": 2.8803300857543945, + "learning_rate": 4.010914855873218e-05, + "loss": 0.512, + "step": 67130 + }, + { + "epoch": 0.5935394897363815, + "grad_norm": 3.0528881549835205, + "learning_rate": 4.010767517106031e-05, + "loss": 0.7376, + "step": 67140 + }, + { + "epoch": 0.5936278929966937, + "grad_norm": 7.523730278015137, + "learning_rate": 4.0106201783388445e-05, + "loss": 0.7138, + "step": 67150 + }, + { + "epoch": 0.593716296257006, + "grad_norm": 1.6315839290618896, + "learning_rate": 4.010472839571657e-05, + "loss": 0.679, + "step": 67160 + }, + { + "epoch": 0.5938046995173182, + "grad_norm": 1.6547417640686035, + "learning_rate": 4.01032550080447e-05, + "loss": 0.7017, + "step": 67170 + }, + { + "epoch": 0.5938931027776304, + "grad_norm": 5.377217769622803, + "learning_rate": 4.010178162037283e-05, + "loss": 0.6217, + "step": 67180 + }, + { + "epoch": 0.5939815060379426, + "grad_norm": 2.67793869972229, + "learning_rate": 4.010030823270096e-05, + "loss": 0.6974, + "step": 67190 + }, + { + "epoch": 0.5940699092982549, + "grad_norm": 1.984525442123413, + "learning_rate": 4.009883484502909e-05, + "loss": 0.7356, + "step": 67200 + }, + { + "epoch": 0.5941583125585672, + "grad_norm": 4.21031379699707, + "learning_rate": 4.0097361457357215e-05, + "loss": 0.6745, + "step": 67210 + }, + { + "epoch": 0.5942467158188794, + "grad_norm": 3.056952476501465, + "learning_rate": 4.0095888069685344e-05, + "loss": 0.7835, + "step": 67220 + }, + { + "epoch": 0.5943351190791917, + "grad_norm": 5.154651165008545, + "learning_rate": 4.009441468201348e-05, + "loss": 0.8265, + "step": 67230 + }, + { + "epoch": 0.5944235223395039, + "grad_norm": 8.108997344970703, + "learning_rate": 4.00929412943416e-05, + "loss": 0.7139, + "step": 67240 + }, + { + "epoch": 0.5945119255998161, + "grad_norm": 2.9724066257476807, + "learning_rate": 4.0091467906669735e-05, + "loss": 0.7208, + "step": 67250 + }, + { + "epoch": 0.5946003288601284, + "grad_norm": 4.48063850402832, + "learning_rate": 4.0089994518997864e-05, + "loss": 0.6936, + "step": 67260 + }, + { + "epoch": 0.5946887321204406, + "grad_norm": 4.644166469573975, + "learning_rate": 4.008852113132599e-05, + "loss": 0.6472, + "step": 67270 + }, + { + "epoch": 0.5947771353807528, + "grad_norm": 5.567645072937012, + "learning_rate": 4.008704774365412e-05, + "loss": 0.6021, + "step": 67280 + }, + { + "epoch": 0.5948655386410651, + "grad_norm": 6.615933895111084, + "learning_rate": 4.0085574355982256e-05, + "loss": 0.6155, + "step": 67290 + }, + { + "epoch": 0.5949539419013773, + "grad_norm": 2.456272840499878, + "learning_rate": 4.008410096831038e-05, + "loss": 0.7682, + "step": 67300 + }, + { + "epoch": 0.5950423451616895, + "grad_norm": 1.886036992073059, + "learning_rate": 4.008262758063851e-05, + "loss": 0.5989, + "step": 67310 + }, + { + "epoch": 0.5951307484220018, + "grad_norm": 3.0485422611236572, + "learning_rate": 4.008115419296664e-05, + "loss": 0.7711, + "step": 67320 + }, + { + "epoch": 0.5952191516823141, + "grad_norm": 1.8456193208694458, + "learning_rate": 4.007968080529477e-05, + "loss": 0.6806, + "step": 67330 + }, + { + "epoch": 0.5953075549426263, + "grad_norm": 8.89923095703125, + "learning_rate": 4.00782074176229e-05, + "loss": 0.6393, + "step": 67340 + }, + { + "epoch": 0.5953959582029386, + "grad_norm": 5.349854469299316, + "learning_rate": 4.0076734029951026e-05, + "loss": 0.7292, + "step": 67350 + }, + { + "epoch": 0.5954843614632508, + "grad_norm": 1.8010096549987793, + "learning_rate": 4.0075260642279154e-05, + "loss": 0.8465, + "step": 67360 + }, + { + "epoch": 0.595572764723563, + "grad_norm": 2.808081865310669, + "learning_rate": 4.007378725460729e-05, + "loss": 0.7319, + "step": 67370 + }, + { + "epoch": 0.5956611679838753, + "grad_norm": 2.711019515991211, + "learning_rate": 4.007231386693541e-05, + "loss": 0.7204, + "step": 67380 + }, + { + "epoch": 0.5957495712441875, + "grad_norm": 1.0074832439422607, + "learning_rate": 4.0070840479263546e-05, + "loss": 0.6799, + "step": 67390 + }, + { + "epoch": 0.5958379745044997, + "grad_norm": 2.0530591011047363, + "learning_rate": 4.0069367091591674e-05, + "loss": 0.702, + "step": 67400 + }, + { + "epoch": 0.5959263777648119, + "grad_norm": 1.8472235202789307, + "learning_rate": 4.00678937039198e-05, + "loss": 0.8623, + "step": 67410 + }, + { + "epoch": 0.5960147810251242, + "grad_norm": 3.2525367736816406, + "learning_rate": 4.006642031624793e-05, + "loss": 0.6399, + "step": 67420 + }, + { + "epoch": 0.5961031842854364, + "grad_norm": 4.3894171714782715, + "learning_rate": 4.0064946928576066e-05, + "loss": 0.7461, + "step": 67430 + }, + { + "epoch": 0.5961915875457486, + "grad_norm": 1.209202766418457, + "learning_rate": 4.006347354090419e-05, + "loss": 0.658, + "step": 67440 + }, + { + "epoch": 0.596279990806061, + "grad_norm": 5.785106658935547, + "learning_rate": 4.006200015323232e-05, + "loss": 0.6539, + "step": 67450 + }, + { + "epoch": 0.5963683940663732, + "grad_norm": 1.5607560873031616, + "learning_rate": 4.0060526765560444e-05, + "loss": 0.7356, + "step": 67460 + }, + { + "epoch": 0.5964567973266854, + "grad_norm": 1.5569267272949219, + "learning_rate": 4.005905337788858e-05, + "loss": 0.6567, + "step": 67470 + }, + { + "epoch": 0.5965452005869977, + "grad_norm": 1.541078805923462, + "learning_rate": 4.005757999021671e-05, + "loss": 0.6104, + "step": 67480 + }, + { + "epoch": 0.5966336038473099, + "grad_norm": 10.697413444519043, + "learning_rate": 4.0056106602544836e-05, + "loss": 0.6446, + "step": 67490 + }, + { + "epoch": 0.5967220071076221, + "grad_norm": 8.067002296447754, + "learning_rate": 4.0054633214872965e-05, + "loss": 0.5597, + "step": 67500 + }, + { + "epoch": 0.5968104103679344, + "grad_norm": 2.9999191761016846, + "learning_rate": 4.00531598272011e-05, + "loss": 0.743, + "step": 67510 + }, + { + "epoch": 0.5968988136282466, + "grad_norm": 3.6837759017944336, + "learning_rate": 4.005168643952922e-05, + "loss": 0.6818, + "step": 67520 + }, + { + "epoch": 0.5969872168885588, + "grad_norm": 1.8458577394485474, + "learning_rate": 4.0050213051857356e-05, + "loss": 0.8241, + "step": 67530 + }, + { + "epoch": 0.5970756201488711, + "grad_norm": 9.878037452697754, + "learning_rate": 4.0048739664185485e-05, + "loss": 0.5651, + "step": 67540 + }, + { + "epoch": 0.5971640234091833, + "grad_norm": 5.5965166091918945, + "learning_rate": 4.004726627651361e-05, + "loss": 0.6289, + "step": 67550 + }, + { + "epoch": 0.5972524266694955, + "grad_norm": 11.003169059753418, + "learning_rate": 4.004579288884174e-05, + "loss": 0.7754, + "step": 67560 + }, + { + "epoch": 0.5973408299298079, + "grad_norm": 3.590615749359131, + "learning_rate": 4.004431950116987e-05, + "loss": 0.5968, + "step": 67570 + }, + { + "epoch": 0.5974292331901201, + "grad_norm": 9.921289443969727, + "learning_rate": 4.0042846113498e-05, + "loss": 0.6193, + "step": 67580 + }, + { + "epoch": 0.5975176364504323, + "grad_norm": 6.370003700256348, + "learning_rate": 4.004137272582613e-05, + "loss": 0.6825, + "step": 67590 + }, + { + "epoch": 0.5976060397107446, + "grad_norm": 5.396376609802246, + "learning_rate": 4.0039899338154255e-05, + "loss": 0.6942, + "step": 67600 + }, + { + "epoch": 0.5976944429710568, + "grad_norm": 4.06182336807251, + "learning_rate": 4.003842595048239e-05, + "loss": 0.6383, + "step": 67610 + }, + { + "epoch": 0.597782846231369, + "grad_norm": 2.475921630859375, + "learning_rate": 4.003695256281052e-05, + "loss": 0.718, + "step": 67620 + }, + { + "epoch": 0.5978712494916812, + "grad_norm": 11.372515678405762, + "learning_rate": 4.003547917513865e-05, + "loss": 0.7232, + "step": 67630 + }, + { + "epoch": 0.5979596527519935, + "grad_norm": 2.693408250808716, + "learning_rate": 4.0034005787466775e-05, + "loss": 0.7385, + "step": 67640 + }, + { + "epoch": 0.5980480560123057, + "grad_norm": 6.491113662719727, + "learning_rate": 4.003253239979491e-05, + "loss": 0.7132, + "step": 67650 + }, + { + "epoch": 0.5981364592726179, + "grad_norm": 1.7727231979370117, + "learning_rate": 4.003105901212303e-05, + "loss": 0.6441, + "step": 67660 + }, + { + "epoch": 0.5982248625329302, + "grad_norm": 4.259739398956299, + "learning_rate": 4.002958562445117e-05, + "loss": 0.8155, + "step": 67670 + }, + { + "epoch": 0.5983132657932425, + "grad_norm": 0.9205679893493652, + "learning_rate": 4.002811223677929e-05, + "loss": 0.6162, + "step": 67680 + }, + { + "epoch": 0.5984016690535547, + "grad_norm": 1.4040359258651733, + "learning_rate": 4.0026638849107424e-05, + "loss": 0.6264, + "step": 67690 + }, + { + "epoch": 0.598490072313867, + "grad_norm": 3.6029984951019287, + "learning_rate": 4.002516546143555e-05, + "loss": 0.8111, + "step": 67700 + }, + { + "epoch": 0.5985784755741792, + "grad_norm": 4.135036945343018, + "learning_rate": 4.002369207376368e-05, + "loss": 0.7476, + "step": 67710 + }, + { + "epoch": 0.5986668788344914, + "grad_norm": 2.148099899291992, + "learning_rate": 4.002221868609181e-05, + "loss": 0.7191, + "step": 67720 + }, + { + "epoch": 0.5987552820948037, + "grad_norm": 2.440849781036377, + "learning_rate": 4.0020745298419944e-05, + "loss": 0.7065, + "step": 67730 + }, + { + "epoch": 0.5988436853551159, + "grad_norm": 7.0985589027404785, + "learning_rate": 4.0019271910748065e-05, + "loss": 0.5483, + "step": 67740 + }, + { + "epoch": 0.5989320886154281, + "grad_norm": 6.353954315185547, + "learning_rate": 4.00177985230762e-05, + "loss": 0.7893, + "step": 67750 + }, + { + "epoch": 0.5990204918757404, + "grad_norm": 5.931369781494141, + "learning_rate": 4.001632513540433e-05, + "loss": 0.7944, + "step": 67760 + }, + { + "epoch": 0.5991088951360526, + "grad_norm": 1.0348714590072632, + "learning_rate": 4.001485174773246e-05, + "loss": 0.6199, + "step": 67770 + }, + { + "epoch": 0.5991972983963648, + "grad_norm": 3.4274654388427734, + "learning_rate": 4.0013378360060586e-05, + "loss": 0.7245, + "step": 67780 + }, + { + "epoch": 0.599285701656677, + "grad_norm": 2.3413896560668945, + "learning_rate": 4.001190497238872e-05, + "loss": 0.6347, + "step": 67790 + }, + { + "epoch": 0.5993741049169894, + "grad_norm": 1.481231451034546, + "learning_rate": 4.001043158471684e-05, + "loss": 0.6589, + "step": 67800 + }, + { + "epoch": 0.5994625081773016, + "grad_norm": 3.3175745010375977, + "learning_rate": 4.000895819704498e-05, + "loss": 0.5997, + "step": 67810 + }, + { + "epoch": 0.5995509114376139, + "grad_norm": 3.3511624336242676, + "learning_rate": 4.0007484809373106e-05, + "loss": 0.6646, + "step": 67820 + }, + { + "epoch": 0.5996393146979261, + "grad_norm": 6.857593536376953, + "learning_rate": 4.0006011421701234e-05, + "loss": 0.5203, + "step": 67830 + }, + { + "epoch": 0.5997277179582383, + "grad_norm": 2.252788543701172, + "learning_rate": 4.000453803402936e-05, + "loss": 0.7173, + "step": 67840 + }, + { + "epoch": 0.5998161212185505, + "grad_norm": 2.993269205093384, + "learning_rate": 4.000306464635749e-05, + "loss": 0.7273, + "step": 67850 + }, + { + "epoch": 0.5999045244788628, + "grad_norm": 7.120776653289795, + "learning_rate": 4.000159125868562e-05, + "loss": 0.7521, + "step": 67860 + }, + { + "epoch": 0.599992927739175, + "grad_norm": 1.7486554384231567, + "learning_rate": 4.0000117871013754e-05, + "loss": 0.6977, + "step": 67870 + }, + { + "epoch": 0.6000813309994872, + "grad_norm": 1.0409510135650635, + "learning_rate": 3.999864448334188e-05, + "loss": 0.739, + "step": 67880 + }, + { + "epoch": 0.6001697342597995, + "grad_norm": 12.113892555236816, + "learning_rate": 3.999717109567001e-05, + "loss": 0.81, + "step": 67890 + }, + { + "epoch": 0.6002581375201117, + "grad_norm": 5.20490837097168, + "learning_rate": 3.999569770799814e-05, + "loss": 0.7397, + "step": 67900 + }, + { + "epoch": 0.6003465407804239, + "grad_norm": 3.258568286895752, + "learning_rate": 3.999422432032627e-05, + "loss": 0.7813, + "step": 67910 + }, + { + "epoch": 0.6004349440407363, + "grad_norm": 9.120695114135742, + "learning_rate": 3.9992750932654396e-05, + "loss": 0.6967, + "step": 67920 + }, + { + "epoch": 0.6005233473010485, + "grad_norm": 2.6310369968414307, + "learning_rate": 3.9991277544982524e-05, + "loss": 0.6519, + "step": 67930 + }, + { + "epoch": 0.6006117505613607, + "grad_norm": 2.5960001945495605, + "learning_rate": 3.998980415731066e-05, + "loss": 0.8263, + "step": 67940 + }, + { + "epoch": 0.600700153821673, + "grad_norm": 4.554094314575195, + "learning_rate": 3.998833076963879e-05, + "loss": 0.8485, + "step": 67950 + }, + { + "epoch": 0.6007885570819852, + "grad_norm": 1.8861922025680542, + "learning_rate": 3.9986857381966916e-05, + "loss": 0.6734, + "step": 67960 + }, + { + "epoch": 0.6008769603422974, + "grad_norm": 7.652975559234619, + "learning_rate": 3.9985383994295045e-05, + "loss": 0.678, + "step": 67970 + }, + { + "epoch": 0.6009653636026097, + "grad_norm": 3.88148832321167, + "learning_rate": 3.998391060662317e-05, + "loss": 0.5662, + "step": 67980 + }, + { + "epoch": 0.6010537668629219, + "grad_norm": 2.120248556137085, + "learning_rate": 3.99824372189513e-05, + "loss": 0.7374, + "step": 67990 + }, + { + "epoch": 0.6011421701232341, + "grad_norm": 4.1534199714660645, + "learning_rate": 3.9980963831279437e-05, + "loss": 0.5647, + "step": 68000 + }, + { + "epoch": 0.6012305733835464, + "grad_norm": 1.922553300857544, + "learning_rate": 3.9979490443607565e-05, + "loss": 0.7215, + "step": 68010 + }, + { + "epoch": 0.6013189766438586, + "grad_norm": 13.003496170043945, + "learning_rate": 3.997801705593569e-05, + "loss": 0.5849, + "step": 68020 + }, + { + "epoch": 0.6014073799041708, + "grad_norm": 2.862215995788574, + "learning_rate": 3.997654366826382e-05, + "loss": 0.6946, + "step": 68030 + }, + { + "epoch": 0.6014957831644832, + "grad_norm": 3.0106401443481445, + "learning_rate": 3.997507028059195e-05, + "loss": 0.6255, + "step": 68040 + }, + { + "epoch": 0.6015841864247954, + "grad_norm": 1.2612980604171753, + "learning_rate": 3.997359689292008e-05, + "loss": 0.7444, + "step": 68050 + }, + { + "epoch": 0.6016725896851076, + "grad_norm": 1.9860903024673462, + "learning_rate": 3.9972123505248213e-05, + "loss": 0.7292, + "step": 68060 + }, + { + "epoch": 0.6017609929454198, + "grad_norm": 3.843406915664673, + "learning_rate": 3.9970650117576335e-05, + "loss": 0.7389, + "step": 68070 + }, + { + "epoch": 0.6018493962057321, + "grad_norm": 7.409058570861816, + "learning_rate": 3.996917672990447e-05, + "loss": 0.6841, + "step": 68080 + }, + { + "epoch": 0.6019377994660443, + "grad_norm": 3.4304847717285156, + "learning_rate": 3.99677033422326e-05, + "loss": 0.6102, + "step": 68090 + }, + { + "epoch": 0.6020262027263565, + "grad_norm": 5.2471537590026855, + "learning_rate": 3.996622995456073e-05, + "loss": 0.7516, + "step": 68100 + }, + { + "epoch": 0.6021146059866688, + "grad_norm": 3.076887607574463, + "learning_rate": 3.9964756566888855e-05, + "loss": 0.5522, + "step": 68110 + }, + { + "epoch": 0.602203009246981, + "grad_norm": 1.0112590789794922, + "learning_rate": 3.996328317921699e-05, + "loss": 0.5902, + "step": 68120 + }, + { + "epoch": 0.6022914125072932, + "grad_norm": 4.697880268096924, + "learning_rate": 3.996180979154511e-05, + "loss": 0.6159, + "step": 68130 + }, + { + "epoch": 0.6023798157676055, + "grad_norm": 1.8803813457489014, + "learning_rate": 3.996033640387325e-05, + "loss": 0.6603, + "step": 68140 + }, + { + "epoch": 0.6024682190279177, + "grad_norm": 1.8947783708572388, + "learning_rate": 3.9958863016201375e-05, + "loss": 0.4859, + "step": 68150 + }, + { + "epoch": 0.60255662228823, + "grad_norm": 2.829491376876831, + "learning_rate": 3.9957389628529504e-05, + "loss": 0.5707, + "step": 68160 + }, + { + "epoch": 0.6026450255485423, + "grad_norm": 5.179346561431885, + "learning_rate": 3.995591624085763e-05, + "loss": 0.6871, + "step": 68170 + }, + { + "epoch": 0.6027334288088545, + "grad_norm": 1.9947444200515747, + "learning_rate": 3.995444285318576e-05, + "loss": 0.6344, + "step": 68180 + }, + { + "epoch": 0.6028218320691667, + "grad_norm": 3.2611663341522217, + "learning_rate": 3.995296946551389e-05, + "loss": 0.7923, + "step": 68190 + }, + { + "epoch": 0.602910235329479, + "grad_norm": 10.408723831176758, + "learning_rate": 3.9951496077842024e-05, + "loss": 0.5766, + "step": 68200 + }, + { + "epoch": 0.6029986385897912, + "grad_norm": 2.6321170330047607, + "learning_rate": 3.9950022690170145e-05, + "loss": 0.8093, + "step": 68210 + }, + { + "epoch": 0.6030870418501034, + "grad_norm": 1.8001887798309326, + "learning_rate": 3.994854930249828e-05, + "loss": 0.7165, + "step": 68220 + }, + { + "epoch": 0.6031754451104157, + "grad_norm": 4.607061862945557, + "learning_rate": 3.994707591482641e-05, + "loss": 0.6837, + "step": 68230 + }, + { + "epoch": 0.6032638483707279, + "grad_norm": 3.9257895946502686, + "learning_rate": 3.994560252715454e-05, + "loss": 0.6992, + "step": 68240 + }, + { + "epoch": 0.6033522516310401, + "grad_norm": 1.7141555547714233, + "learning_rate": 3.9944129139482666e-05, + "loss": 0.613, + "step": 68250 + }, + { + "epoch": 0.6034406548913523, + "grad_norm": 3.1083195209503174, + "learning_rate": 3.99426557518108e-05, + "loss": 0.6735, + "step": 68260 + }, + { + "epoch": 0.6035290581516647, + "grad_norm": 1.1760464906692505, + "learning_rate": 3.994118236413892e-05, + "loss": 0.6472, + "step": 68270 + }, + { + "epoch": 0.6036174614119769, + "grad_norm": 4.06918478012085, + "learning_rate": 3.993970897646706e-05, + "loss": 0.6039, + "step": 68280 + }, + { + "epoch": 0.6037058646722891, + "grad_norm": 3.1744275093078613, + "learning_rate": 3.993823558879518e-05, + "loss": 0.6446, + "step": 68290 + }, + { + "epoch": 0.6037942679326014, + "grad_norm": 0.9572000503540039, + "learning_rate": 3.9936762201123314e-05, + "loss": 0.606, + "step": 68300 + }, + { + "epoch": 0.6038826711929136, + "grad_norm": 3.2773358821868896, + "learning_rate": 3.993528881345144e-05, + "loss": 0.5472, + "step": 68310 + }, + { + "epoch": 0.6039710744532258, + "grad_norm": 5.722139835357666, + "learning_rate": 3.993381542577957e-05, + "loss": 0.5986, + "step": 68320 + }, + { + "epoch": 0.6040594777135381, + "grad_norm": 4.527989387512207, + "learning_rate": 3.99323420381077e-05, + "loss": 0.6804, + "step": 68330 + }, + { + "epoch": 0.6041478809738503, + "grad_norm": 3.540149211883545, + "learning_rate": 3.9930868650435834e-05, + "loss": 0.7221, + "step": 68340 + }, + { + "epoch": 0.6042362842341625, + "grad_norm": 4.579570293426514, + "learning_rate": 3.9929395262763956e-05, + "loss": 0.7591, + "step": 68350 + }, + { + "epoch": 0.6043246874944748, + "grad_norm": 7.313347339630127, + "learning_rate": 3.992792187509209e-05, + "loss": 0.7487, + "step": 68360 + }, + { + "epoch": 0.604413090754787, + "grad_norm": 2.8706626892089844, + "learning_rate": 3.992644848742022e-05, + "loss": 0.7745, + "step": 68370 + }, + { + "epoch": 0.6045014940150992, + "grad_norm": 2.8275105953216553, + "learning_rate": 3.992497509974835e-05, + "loss": 0.5839, + "step": 68380 + }, + { + "epoch": 0.6045898972754116, + "grad_norm": 3.384598731994629, + "learning_rate": 3.9923501712076476e-05, + "loss": 0.5708, + "step": 68390 + }, + { + "epoch": 0.6046783005357238, + "grad_norm": 2.156449794769287, + "learning_rate": 3.9922028324404605e-05, + "loss": 0.8107, + "step": 68400 + }, + { + "epoch": 0.604766703796036, + "grad_norm": 4.001865386962891, + "learning_rate": 3.992055493673273e-05, + "loss": 0.631, + "step": 68410 + }, + { + "epoch": 0.6048551070563483, + "grad_norm": 7.301394939422607, + "learning_rate": 3.991908154906087e-05, + "loss": 0.6151, + "step": 68420 + }, + { + "epoch": 0.6049435103166605, + "grad_norm": 4.076961517333984, + "learning_rate": 3.991760816138899e-05, + "loss": 0.6588, + "step": 68430 + }, + { + "epoch": 0.6050319135769727, + "grad_norm": 2.770707845687866, + "learning_rate": 3.9916134773717125e-05, + "loss": 0.5893, + "step": 68440 + }, + { + "epoch": 0.605120316837285, + "grad_norm": 6.907416820526123, + "learning_rate": 3.991466138604525e-05, + "loss": 0.7231, + "step": 68450 + }, + { + "epoch": 0.6052087200975972, + "grad_norm": 7.347880840301514, + "learning_rate": 3.991318799837338e-05, + "loss": 0.5238, + "step": 68460 + }, + { + "epoch": 0.6052971233579094, + "grad_norm": 3.90139102935791, + "learning_rate": 3.991171461070151e-05, + "loss": 0.783, + "step": 68470 + }, + { + "epoch": 0.6053855266182216, + "grad_norm": 4.252216339111328, + "learning_rate": 3.9910241223029645e-05, + "loss": 0.8325, + "step": 68480 + }, + { + "epoch": 0.6054739298785339, + "grad_norm": 5.732337474822998, + "learning_rate": 3.9908767835357767e-05, + "loss": 0.7703, + "step": 68490 + }, + { + "epoch": 0.6055623331388461, + "grad_norm": 4.272355079650879, + "learning_rate": 3.99072944476859e-05, + "loss": 0.6956, + "step": 68500 + }, + { + "epoch": 0.6056507363991585, + "grad_norm": 3.164580821990967, + "learning_rate": 3.990582106001402e-05, + "loss": 0.644, + "step": 68510 + }, + { + "epoch": 0.6057391396594707, + "grad_norm": 2.5002005100250244, + "learning_rate": 3.990434767234216e-05, + "loss": 0.6915, + "step": 68520 + }, + { + "epoch": 0.6058275429197829, + "grad_norm": 3.9687719345092773, + "learning_rate": 3.990287428467029e-05, + "loss": 0.6782, + "step": 68530 + }, + { + "epoch": 0.6059159461800951, + "grad_norm": 2.8334271907806396, + "learning_rate": 3.9901400896998415e-05, + "loss": 0.5957, + "step": 68540 + }, + { + "epoch": 0.6060043494404074, + "grad_norm": 5.97862434387207, + "learning_rate": 3.9899927509326543e-05, + "loss": 0.7612, + "step": 68550 + }, + { + "epoch": 0.6060927527007196, + "grad_norm": 2.0084502696990967, + "learning_rate": 3.989845412165468e-05, + "loss": 0.6127, + "step": 68560 + }, + { + "epoch": 0.6061811559610318, + "grad_norm": 5.48071813583374, + "learning_rate": 3.98969807339828e-05, + "loss": 0.5699, + "step": 68570 + }, + { + "epoch": 0.6062695592213441, + "grad_norm": 3.4372706413269043, + "learning_rate": 3.9895507346310935e-05, + "loss": 0.5595, + "step": 68580 + }, + { + "epoch": 0.6063579624816563, + "grad_norm": 0.9249988794326782, + "learning_rate": 3.9894033958639064e-05, + "loss": 0.6997, + "step": 68590 + }, + { + "epoch": 0.6064463657419685, + "grad_norm": 3.710207462310791, + "learning_rate": 3.989256057096719e-05, + "loss": 0.7397, + "step": 68600 + }, + { + "epoch": 0.6065347690022808, + "grad_norm": 1.0048185586929321, + "learning_rate": 3.989108718329532e-05, + "loss": 0.7025, + "step": 68610 + }, + { + "epoch": 0.606623172262593, + "grad_norm": 4.552618980407715, + "learning_rate": 3.9889613795623455e-05, + "loss": 0.7587, + "step": 68620 + }, + { + "epoch": 0.6067115755229053, + "grad_norm": 1.4993724822998047, + "learning_rate": 3.988814040795158e-05, + "loss": 0.5566, + "step": 68630 + }, + { + "epoch": 0.6067999787832176, + "grad_norm": 1.413038730621338, + "learning_rate": 3.988666702027971e-05, + "loss": 0.6957, + "step": 68640 + }, + { + "epoch": 0.6068883820435298, + "grad_norm": 3.3125061988830566, + "learning_rate": 3.9885193632607834e-05, + "loss": 0.66, + "step": 68650 + }, + { + "epoch": 0.606976785303842, + "grad_norm": 1.9941564798355103, + "learning_rate": 3.988372024493597e-05, + "loss": 0.6737, + "step": 68660 + }, + { + "epoch": 0.6070651885641543, + "grad_norm": 1.3719918727874756, + "learning_rate": 3.98822468572641e-05, + "loss": 0.6748, + "step": 68670 + }, + { + "epoch": 0.6071535918244665, + "grad_norm": 3.223843812942505, + "learning_rate": 3.9880773469592226e-05, + "loss": 0.6671, + "step": 68680 + }, + { + "epoch": 0.6072419950847787, + "grad_norm": 5.648901462554932, + "learning_rate": 3.9879300081920354e-05, + "loss": 0.6562, + "step": 68690 + }, + { + "epoch": 0.607330398345091, + "grad_norm": 8.837112426757812, + "learning_rate": 3.987782669424849e-05, + "loss": 0.6545, + "step": 68700 + }, + { + "epoch": 0.6074188016054032, + "grad_norm": 3.6554744243621826, + "learning_rate": 3.987635330657661e-05, + "loss": 0.5833, + "step": 68710 + }, + { + "epoch": 0.6075072048657154, + "grad_norm": 3.366485595703125, + "learning_rate": 3.9874879918904746e-05, + "loss": 0.743, + "step": 68720 + }, + { + "epoch": 0.6075956081260276, + "grad_norm": 6.173121452331543, + "learning_rate": 3.9873406531232874e-05, + "loss": 0.734, + "step": 68730 + }, + { + "epoch": 0.6076840113863399, + "grad_norm": 9.327568054199219, + "learning_rate": 3.9871933143561e-05, + "loss": 0.6856, + "step": 68740 + }, + { + "epoch": 0.6077724146466522, + "grad_norm": 2.837390184402466, + "learning_rate": 3.987045975588913e-05, + "loss": 0.5528, + "step": 68750 + }, + { + "epoch": 0.6078608179069644, + "grad_norm": 4.72812032699585, + "learning_rate": 3.986898636821726e-05, + "loss": 0.6144, + "step": 68760 + }, + { + "epoch": 0.6079492211672767, + "grad_norm": 4.415213108062744, + "learning_rate": 3.986751298054539e-05, + "loss": 0.6628, + "step": 68770 + }, + { + "epoch": 0.6080376244275889, + "grad_norm": 2.4548535346984863, + "learning_rate": 3.986603959287352e-05, + "loss": 0.5479, + "step": 68780 + }, + { + "epoch": 0.6081260276879011, + "grad_norm": 2.5510547161102295, + "learning_rate": 3.986456620520165e-05, + "loss": 0.6633, + "step": 68790 + }, + { + "epoch": 0.6082144309482134, + "grad_norm": 4.9136576652526855, + "learning_rate": 3.986309281752978e-05, + "loss": 0.7326, + "step": 68800 + }, + { + "epoch": 0.6083028342085256, + "grad_norm": 4.370539665222168, + "learning_rate": 3.986161942985791e-05, + "loss": 0.7098, + "step": 68810 + }, + { + "epoch": 0.6083912374688378, + "grad_norm": 2.7245564460754395, + "learning_rate": 3.9860146042186036e-05, + "loss": 0.7583, + "step": 68820 + }, + { + "epoch": 0.6084796407291501, + "grad_norm": 4.710555553436279, + "learning_rate": 3.9858672654514164e-05, + "loss": 0.6862, + "step": 68830 + }, + { + "epoch": 0.6085680439894623, + "grad_norm": 1.6268647909164429, + "learning_rate": 3.98571992668423e-05, + "loss": 0.5837, + "step": 68840 + }, + { + "epoch": 0.6086564472497745, + "grad_norm": 3.0540761947631836, + "learning_rate": 3.985572587917043e-05, + "loss": 0.684, + "step": 68850 + }, + { + "epoch": 0.6087448505100869, + "grad_norm": 2.8425979614257812, + "learning_rate": 3.9854252491498556e-05, + "loss": 0.7028, + "step": 68860 + }, + { + "epoch": 0.6088332537703991, + "grad_norm": 9.741423606872559, + "learning_rate": 3.9852779103826685e-05, + "loss": 0.6337, + "step": 68870 + }, + { + "epoch": 0.6089216570307113, + "grad_norm": 2.310828685760498, + "learning_rate": 3.985130571615481e-05, + "loss": 0.6625, + "step": 68880 + }, + { + "epoch": 0.6090100602910236, + "grad_norm": 3.778407096862793, + "learning_rate": 3.984983232848294e-05, + "loss": 0.8521, + "step": 68890 + }, + { + "epoch": 0.6090984635513358, + "grad_norm": 4.01237154006958, + "learning_rate": 3.984835894081107e-05, + "loss": 0.7701, + "step": 68900 + }, + { + "epoch": 0.609186866811648, + "grad_norm": 11.06180477142334, + "learning_rate": 3.9846885553139205e-05, + "loss": 0.6215, + "step": 68910 + }, + { + "epoch": 0.6092752700719603, + "grad_norm": 5.82162618637085, + "learning_rate": 3.984541216546733e-05, + "loss": 0.5944, + "step": 68920 + }, + { + "epoch": 0.6093636733322725, + "grad_norm": 2.3689417839050293, + "learning_rate": 3.984393877779546e-05, + "loss": 0.6031, + "step": 68930 + }, + { + "epoch": 0.6094520765925847, + "grad_norm": 5.308497905731201, + "learning_rate": 3.984246539012359e-05, + "loss": 0.7454, + "step": 68940 + }, + { + "epoch": 0.6095404798528969, + "grad_norm": 4.571074962615967, + "learning_rate": 3.984099200245172e-05, + "loss": 0.5948, + "step": 68950 + }, + { + "epoch": 0.6096288831132092, + "grad_norm": 4.445878028869629, + "learning_rate": 3.9839518614779847e-05, + "loss": 0.7984, + "step": 68960 + }, + { + "epoch": 0.6097172863735214, + "grad_norm": 3.9879441261291504, + "learning_rate": 3.983804522710798e-05, + "loss": 0.6897, + "step": 68970 + }, + { + "epoch": 0.6098056896338337, + "grad_norm": 3.2836380004882812, + "learning_rate": 3.98365718394361e-05, + "loss": 0.7139, + "step": 68980 + }, + { + "epoch": 0.609894092894146, + "grad_norm": 6.017600059509277, + "learning_rate": 3.983509845176424e-05, + "loss": 0.6724, + "step": 68990 + }, + { + "epoch": 0.6099824961544582, + "grad_norm": 3.9386825561523438, + "learning_rate": 3.983362506409237e-05, + "loss": 0.7285, + "step": 69000 + }, + { + "epoch": 0.6100708994147704, + "grad_norm": 3.628932476043701, + "learning_rate": 3.9832151676420495e-05, + "loss": 0.8888, + "step": 69010 + }, + { + "epoch": 0.6101593026750827, + "grad_norm": 1.531785011291504, + "learning_rate": 3.9830678288748623e-05, + "loss": 0.6118, + "step": 69020 + }, + { + "epoch": 0.6102477059353949, + "grad_norm": 2.324725389480591, + "learning_rate": 3.982920490107676e-05, + "loss": 0.8042, + "step": 69030 + }, + { + "epoch": 0.6103361091957071, + "grad_norm": 5.797264099121094, + "learning_rate": 3.982773151340488e-05, + "loss": 0.5666, + "step": 69040 + }, + { + "epoch": 0.6104245124560194, + "grad_norm": 10.481884002685547, + "learning_rate": 3.9826258125733015e-05, + "loss": 0.7382, + "step": 69050 + }, + { + "epoch": 0.6105129157163316, + "grad_norm": 6.266570091247559, + "learning_rate": 3.9824784738061144e-05, + "loss": 0.7971, + "step": 69060 + }, + { + "epoch": 0.6106013189766438, + "grad_norm": 2.1786258220672607, + "learning_rate": 3.982331135038927e-05, + "loss": 0.5833, + "step": 69070 + }, + { + "epoch": 0.6106897222369561, + "grad_norm": 4.0261054039001465, + "learning_rate": 3.98218379627174e-05, + "loss": 0.7022, + "step": 69080 + }, + { + "epoch": 0.6107781254972683, + "grad_norm": 1.2456425428390503, + "learning_rate": 3.9820364575045536e-05, + "loss": 0.7358, + "step": 69090 + }, + { + "epoch": 0.6108665287575806, + "grad_norm": 2.3084208965301514, + "learning_rate": 3.981889118737366e-05, + "loss": 0.7043, + "step": 69100 + }, + { + "epoch": 0.6109549320178929, + "grad_norm": 3.831892728805542, + "learning_rate": 3.981741779970179e-05, + "loss": 0.7041, + "step": 69110 + }, + { + "epoch": 0.6110433352782051, + "grad_norm": 1.9702353477478027, + "learning_rate": 3.9815944412029914e-05, + "loss": 0.6562, + "step": 69120 + }, + { + "epoch": 0.6111317385385173, + "grad_norm": 4.149004936218262, + "learning_rate": 3.981447102435805e-05, + "loss": 0.6047, + "step": 69130 + }, + { + "epoch": 0.6112201417988296, + "grad_norm": 2.6869633197784424, + "learning_rate": 3.981299763668618e-05, + "loss": 0.6753, + "step": 69140 + }, + { + "epoch": 0.6113085450591418, + "grad_norm": 9.032734870910645, + "learning_rate": 3.9811524249014306e-05, + "loss": 0.7659, + "step": 69150 + }, + { + "epoch": 0.611396948319454, + "grad_norm": 9.428448677062988, + "learning_rate": 3.9810050861342434e-05, + "loss": 0.6832, + "step": 69160 + }, + { + "epoch": 0.6114853515797662, + "grad_norm": 3.1898558139801025, + "learning_rate": 3.980857747367057e-05, + "loss": 0.5865, + "step": 69170 + }, + { + "epoch": 0.6115737548400785, + "grad_norm": 2.5670242309570312, + "learning_rate": 3.980710408599869e-05, + "loss": 0.7318, + "step": 69180 + }, + { + "epoch": 0.6116621581003907, + "grad_norm": 12.594137191772461, + "learning_rate": 3.9805630698326826e-05, + "loss": 0.6647, + "step": 69190 + }, + { + "epoch": 0.6117505613607029, + "grad_norm": 4.982483863830566, + "learning_rate": 3.9804157310654954e-05, + "loss": 0.6688, + "step": 69200 + }, + { + "epoch": 0.6118389646210152, + "grad_norm": 4.143118858337402, + "learning_rate": 3.980268392298308e-05, + "loss": 0.7305, + "step": 69210 + }, + { + "epoch": 0.6119273678813275, + "grad_norm": 3.982994794845581, + "learning_rate": 3.980121053531121e-05, + "loss": 0.5619, + "step": 69220 + }, + { + "epoch": 0.6120157711416397, + "grad_norm": 6.126901149749756, + "learning_rate": 3.979973714763934e-05, + "loss": 0.5906, + "step": 69230 + }, + { + "epoch": 0.612104174401952, + "grad_norm": 2.6012651920318604, + "learning_rate": 3.979826375996747e-05, + "loss": 0.7294, + "step": 69240 + }, + { + "epoch": 0.6121925776622642, + "grad_norm": 6.575870037078857, + "learning_rate": 3.97967903722956e-05, + "loss": 0.7193, + "step": 69250 + }, + { + "epoch": 0.6122809809225764, + "grad_norm": 1.7410662174224854, + "learning_rate": 3.9795316984623724e-05, + "loss": 0.763, + "step": 69260 + }, + { + "epoch": 0.6123693841828887, + "grad_norm": 4.49363899230957, + "learning_rate": 3.979384359695186e-05, + "loss": 0.8104, + "step": 69270 + }, + { + "epoch": 0.6124577874432009, + "grad_norm": 2.2362005710601807, + "learning_rate": 3.979237020927999e-05, + "loss": 0.6791, + "step": 69280 + }, + { + "epoch": 0.6125461907035131, + "grad_norm": 6.156050682067871, + "learning_rate": 3.9790896821608116e-05, + "loss": 0.7252, + "step": 69290 + }, + { + "epoch": 0.6126345939638254, + "grad_norm": 2.2494657039642334, + "learning_rate": 3.9789423433936245e-05, + "loss": 0.6665, + "step": 69300 + }, + { + "epoch": 0.6127229972241376, + "grad_norm": 2.3521780967712402, + "learning_rate": 3.978795004626438e-05, + "loss": 0.6314, + "step": 69310 + }, + { + "epoch": 0.6128114004844498, + "grad_norm": 2.6286988258361816, + "learning_rate": 3.97864766585925e-05, + "loss": 0.7451, + "step": 69320 + }, + { + "epoch": 0.6128998037447622, + "grad_norm": 1.6540248394012451, + "learning_rate": 3.9785003270920636e-05, + "loss": 0.5681, + "step": 69330 + }, + { + "epoch": 0.6129882070050744, + "grad_norm": 4.927855491638184, + "learning_rate": 3.978352988324876e-05, + "loss": 0.7682, + "step": 69340 + }, + { + "epoch": 0.6130766102653866, + "grad_norm": 2.7282636165618896, + "learning_rate": 3.978205649557689e-05, + "loss": 0.6492, + "step": 69350 + }, + { + "epoch": 0.6131650135256989, + "grad_norm": 1.715871810913086, + "learning_rate": 3.978058310790502e-05, + "loss": 0.6803, + "step": 69360 + }, + { + "epoch": 0.6132534167860111, + "grad_norm": 4.740761756896973, + "learning_rate": 3.977910972023315e-05, + "loss": 0.782, + "step": 69370 + }, + { + "epoch": 0.6133418200463233, + "grad_norm": 5.819037914276123, + "learning_rate": 3.977763633256128e-05, + "loss": 0.748, + "step": 69380 + }, + { + "epoch": 0.6134302233066355, + "grad_norm": 2.7251076698303223, + "learning_rate": 3.977616294488941e-05, + "loss": 0.7762, + "step": 69390 + }, + { + "epoch": 0.6135186265669478, + "grad_norm": 1.2761238813400269, + "learning_rate": 3.9774689557217535e-05, + "loss": 0.6961, + "step": 69400 + }, + { + "epoch": 0.61360702982726, + "grad_norm": 2.4533190727233887, + "learning_rate": 3.977321616954567e-05, + "loss": 0.7167, + "step": 69410 + }, + { + "epoch": 0.6136954330875722, + "grad_norm": 6.268654823303223, + "learning_rate": 3.97717427818738e-05, + "loss": 0.7098, + "step": 69420 + }, + { + "epoch": 0.6137838363478845, + "grad_norm": 2.5260400772094727, + "learning_rate": 3.977026939420193e-05, + "loss": 0.8052, + "step": 69430 + }, + { + "epoch": 0.6138722396081967, + "grad_norm": 2.6990771293640137, + "learning_rate": 3.9768796006530055e-05, + "loss": 0.8255, + "step": 69440 + }, + { + "epoch": 0.613960642868509, + "grad_norm": 2.0879828929901123, + "learning_rate": 3.976732261885818e-05, + "loss": 0.6054, + "step": 69450 + }, + { + "epoch": 0.6140490461288213, + "grad_norm": 2.028175115585327, + "learning_rate": 3.976584923118631e-05, + "loss": 0.6453, + "step": 69460 + }, + { + "epoch": 0.6141374493891335, + "grad_norm": 7.0612359046936035, + "learning_rate": 3.976437584351445e-05, + "loss": 0.5713, + "step": 69470 + }, + { + "epoch": 0.6142258526494457, + "grad_norm": 1.8073804378509521, + "learning_rate": 3.976290245584257e-05, + "loss": 0.6989, + "step": 69480 + }, + { + "epoch": 0.614314255909758, + "grad_norm": 7.351374626159668, + "learning_rate": 3.9761429068170704e-05, + "loss": 0.8372, + "step": 69490 + }, + { + "epoch": 0.6144026591700702, + "grad_norm": 1.6930521726608276, + "learning_rate": 3.975995568049883e-05, + "loss": 0.7104, + "step": 69500 + }, + { + "epoch": 0.6144910624303824, + "grad_norm": 2.022446632385254, + "learning_rate": 3.975848229282696e-05, + "loss": 0.6713, + "step": 69510 + }, + { + "epoch": 0.6145794656906947, + "grad_norm": 3.321983814239502, + "learning_rate": 3.975700890515509e-05, + "loss": 0.6751, + "step": 69520 + }, + { + "epoch": 0.6146678689510069, + "grad_norm": 6.4746012687683105, + "learning_rate": 3.9755535517483224e-05, + "loss": 0.6448, + "step": 69530 + }, + { + "epoch": 0.6147562722113191, + "grad_norm": 2.5130515098571777, + "learning_rate": 3.9754062129811345e-05, + "loss": 0.6415, + "step": 69540 + }, + { + "epoch": 0.6148446754716314, + "grad_norm": 5.569534778594971, + "learning_rate": 3.975258874213948e-05, + "loss": 0.7955, + "step": 69550 + }, + { + "epoch": 0.6149330787319436, + "grad_norm": 1.5254712104797363, + "learning_rate": 3.975111535446761e-05, + "loss": 0.6602, + "step": 69560 + }, + { + "epoch": 0.6150214819922559, + "grad_norm": 3.2865262031555176, + "learning_rate": 3.974964196679574e-05, + "loss": 0.7502, + "step": 69570 + }, + { + "epoch": 0.6151098852525682, + "grad_norm": 3.7028069496154785, + "learning_rate": 3.9748168579123866e-05, + "loss": 0.7265, + "step": 69580 + }, + { + "epoch": 0.6151982885128804, + "grad_norm": 2.4027321338653564, + "learning_rate": 3.9746695191451994e-05, + "loss": 0.6391, + "step": 69590 + }, + { + "epoch": 0.6152866917731926, + "grad_norm": 7.726175308227539, + "learning_rate": 3.974522180378012e-05, + "loss": 0.6589, + "step": 69600 + }, + { + "epoch": 0.6153750950335048, + "grad_norm": 1.7029047012329102, + "learning_rate": 3.974374841610826e-05, + "loss": 0.6806, + "step": 69610 + }, + { + "epoch": 0.6154634982938171, + "grad_norm": 1.4652581214904785, + "learning_rate": 3.974227502843638e-05, + "loss": 0.6115, + "step": 69620 + }, + { + "epoch": 0.6155519015541293, + "grad_norm": 1.5396239757537842, + "learning_rate": 3.9740801640764514e-05, + "loss": 0.8202, + "step": 69630 + }, + { + "epoch": 0.6156403048144415, + "grad_norm": 3.345118999481201, + "learning_rate": 3.973932825309264e-05, + "loss": 0.5949, + "step": 69640 + }, + { + "epoch": 0.6157287080747538, + "grad_norm": 4.660312175750732, + "learning_rate": 3.973785486542077e-05, + "loss": 0.777, + "step": 69650 + }, + { + "epoch": 0.615817111335066, + "grad_norm": 2.9668712615966797, + "learning_rate": 3.97363814777489e-05, + "loss": 0.6064, + "step": 69660 + }, + { + "epoch": 0.6159055145953782, + "grad_norm": 1.708382248878479, + "learning_rate": 3.9734908090077034e-05, + "loss": 0.604, + "step": 69670 + }, + { + "epoch": 0.6159939178556905, + "grad_norm": 6.197994232177734, + "learning_rate": 3.9733434702405156e-05, + "loss": 0.6515, + "step": 69680 + }, + { + "epoch": 0.6160823211160028, + "grad_norm": 1.832005262374878, + "learning_rate": 3.973196131473329e-05, + "loss": 0.6481, + "step": 69690 + }, + { + "epoch": 0.616170724376315, + "grad_norm": 6.75554895401001, + "learning_rate": 3.973048792706142e-05, + "loss": 0.7163, + "step": 69700 + }, + { + "epoch": 0.6162591276366273, + "grad_norm": 4.5858235359191895, + "learning_rate": 3.972901453938955e-05, + "loss": 0.7289, + "step": 69710 + }, + { + "epoch": 0.6163475308969395, + "grad_norm": 1.3326013088226318, + "learning_rate": 3.9727541151717676e-05, + "loss": 0.7728, + "step": 69720 + }, + { + "epoch": 0.6164359341572517, + "grad_norm": 9.440103530883789, + "learning_rate": 3.9726067764045804e-05, + "loss": 0.605, + "step": 69730 + }, + { + "epoch": 0.616524337417564, + "grad_norm": 2.7616615295410156, + "learning_rate": 3.972459437637393e-05, + "loss": 0.6349, + "step": 69740 + }, + { + "epoch": 0.6166127406778762, + "grad_norm": 4.382421016693115, + "learning_rate": 3.972312098870207e-05, + "loss": 0.6094, + "step": 69750 + }, + { + "epoch": 0.6167011439381884, + "grad_norm": 3.0365304946899414, + "learning_rate": 3.9721647601030196e-05, + "loss": 0.674, + "step": 69760 + }, + { + "epoch": 0.6167895471985007, + "grad_norm": 2.782837390899658, + "learning_rate": 3.9720174213358325e-05, + "loss": 0.7295, + "step": 69770 + }, + { + "epoch": 0.6168779504588129, + "grad_norm": 2.722450017929077, + "learning_rate": 3.971870082568645e-05, + "loss": 0.5833, + "step": 69780 + }, + { + "epoch": 0.6169663537191251, + "grad_norm": 6.575571537017822, + "learning_rate": 3.971722743801458e-05, + "loss": 0.7132, + "step": 69790 + }, + { + "epoch": 0.6170547569794373, + "grad_norm": 1.6678440570831299, + "learning_rate": 3.971575405034271e-05, + "loss": 0.7755, + "step": 69800 + }, + { + "epoch": 0.6171431602397497, + "grad_norm": 6.708428382873535, + "learning_rate": 3.971428066267084e-05, + "loss": 0.7186, + "step": 69810 + }, + { + "epoch": 0.6172315635000619, + "grad_norm": 3.0498299598693848, + "learning_rate": 3.971280727499897e-05, + "loss": 0.6556, + "step": 69820 + }, + { + "epoch": 0.6173199667603742, + "grad_norm": 2.082214832305908, + "learning_rate": 3.97113338873271e-05, + "loss": 0.703, + "step": 69830 + }, + { + "epoch": 0.6174083700206864, + "grad_norm": 1.7338827848434448, + "learning_rate": 3.970986049965523e-05, + "loss": 0.7961, + "step": 69840 + }, + { + "epoch": 0.6174967732809986, + "grad_norm": 2.6805312633514404, + "learning_rate": 3.970838711198336e-05, + "loss": 0.6546, + "step": 69850 + }, + { + "epoch": 0.6175851765413108, + "grad_norm": 4.324188709259033, + "learning_rate": 3.9706913724311487e-05, + "loss": 0.5934, + "step": 69860 + }, + { + "epoch": 0.6176735798016231, + "grad_norm": 4.739712238311768, + "learning_rate": 3.9705440336639615e-05, + "loss": 0.5926, + "step": 69870 + }, + { + "epoch": 0.6177619830619353, + "grad_norm": 5.206289291381836, + "learning_rate": 3.970396694896775e-05, + "loss": 0.7831, + "step": 69880 + }, + { + "epoch": 0.6178503863222475, + "grad_norm": 5.760246276855469, + "learning_rate": 3.970249356129588e-05, + "loss": 0.7158, + "step": 69890 + }, + { + "epoch": 0.6179387895825598, + "grad_norm": 3.2113797664642334, + "learning_rate": 3.970102017362401e-05, + "loss": 0.7189, + "step": 69900 + }, + { + "epoch": 0.618027192842872, + "grad_norm": 17.097204208374023, + "learning_rate": 3.9699546785952135e-05, + "loss": 0.6945, + "step": 69910 + }, + { + "epoch": 0.6181155961031843, + "grad_norm": 1.966998815536499, + "learning_rate": 3.9698073398280263e-05, + "loss": 0.7506, + "step": 69920 + }, + { + "epoch": 0.6182039993634966, + "grad_norm": 5.558865070343018, + "learning_rate": 3.969660001060839e-05, + "loss": 0.7423, + "step": 69930 + }, + { + "epoch": 0.6182924026238088, + "grad_norm": 3.5467731952667236, + "learning_rate": 3.969512662293653e-05, + "loss": 0.6454, + "step": 69940 + }, + { + "epoch": 0.618380805884121, + "grad_norm": 2.814469337463379, + "learning_rate": 3.969365323526465e-05, + "loss": 0.6733, + "step": 69950 + }, + { + "epoch": 0.6184692091444333, + "grad_norm": 5.449344635009766, + "learning_rate": 3.9692179847592784e-05, + "loss": 0.7544, + "step": 69960 + }, + { + "epoch": 0.6185576124047455, + "grad_norm": 2.5994772911071777, + "learning_rate": 3.969070645992091e-05, + "loss": 0.6585, + "step": 69970 + }, + { + "epoch": 0.6186460156650577, + "grad_norm": 4.481049537658691, + "learning_rate": 3.968923307224904e-05, + "loss": 0.662, + "step": 69980 + }, + { + "epoch": 0.61873441892537, + "grad_norm": 3.5467121601104736, + "learning_rate": 3.968775968457717e-05, + "loss": 0.753, + "step": 69990 + }, + { + "epoch": 0.6188228221856822, + "grad_norm": 9.668065071105957, + "learning_rate": 3.9686286296905304e-05, + "loss": 0.5426, + "step": 70000 + }, + { + "epoch": 0.6189112254459944, + "grad_norm": 5.743401527404785, + "learning_rate": 3.9684812909233425e-05, + "loss": 0.6715, + "step": 70010 + }, + { + "epoch": 0.6189996287063066, + "grad_norm": 3.2080140113830566, + "learning_rate": 3.968333952156156e-05, + "loss": 0.6367, + "step": 70020 + }, + { + "epoch": 0.6190880319666189, + "grad_norm": 6.793011665344238, + "learning_rate": 3.968186613388969e-05, + "loss": 0.725, + "step": 70030 + }, + { + "epoch": 0.6191764352269312, + "grad_norm": 2.914573907852173, + "learning_rate": 3.968039274621782e-05, + "loss": 0.5949, + "step": 70040 + }, + { + "epoch": 0.6192648384872435, + "grad_norm": 7.469156742095947, + "learning_rate": 3.9678919358545946e-05, + "loss": 0.6438, + "step": 70050 + }, + { + "epoch": 0.6193532417475557, + "grad_norm": 2.9054205417633057, + "learning_rate": 3.9677445970874074e-05, + "loss": 0.6425, + "step": 70060 + }, + { + "epoch": 0.6194416450078679, + "grad_norm": 3.513422727584839, + "learning_rate": 3.96759725832022e-05, + "loss": 0.7361, + "step": 70070 + }, + { + "epoch": 0.6195300482681801, + "grad_norm": 2.3616559505462646, + "learning_rate": 3.967449919553034e-05, + "loss": 0.6299, + "step": 70080 + }, + { + "epoch": 0.6196184515284924, + "grad_norm": 3.658355712890625, + "learning_rate": 3.967302580785846e-05, + "loss": 0.6401, + "step": 70090 + }, + { + "epoch": 0.6197068547888046, + "grad_norm": 4.6410322189331055, + "learning_rate": 3.9671552420186594e-05, + "loss": 0.7179, + "step": 70100 + }, + { + "epoch": 0.6197952580491168, + "grad_norm": 5.003321170806885, + "learning_rate": 3.967007903251472e-05, + "loss": 0.7072, + "step": 70110 + }, + { + "epoch": 0.6198836613094291, + "grad_norm": 7.007421970367432, + "learning_rate": 3.966860564484285e-05, + "loss": 0.684, + "step": 70120 + }, + { + "epoch": 0.6199720645697413, + "grad_norm": 1.5463006496429443, + "learning_rate": 3.966713225717098e-05, + "loss": 0.5084, + "step": 70130 + }, + { + "epoch": 0.6200604678300535, + "grad_norm": 3.663630723953247, + "learning_rate": 3.9665658869499114e-05, + "loss": 0.5766, + "step": 70140 + }, + { + "epoch": 0.6201488710903658, + "grad_norm": 7.871727466583252, + "learning_rate": 3.9664185481827236e-05, + "loss": 0.7734, + "step": 70150 + }, + { + "epoch": 0.6202372743506781, + "grad_norm": 3.426593542098999, + "learning_rate": 3.966271209415537e-05, + "loss": 0.6635, + "step": 70160 + }, + { + "epoch": 0.6203256776109903, + "grad_norm": 3.6056861877441406, + "learning_rate": 3.966123870648349e-05, + "loss": 0.737, + "step": 70170 + }, + { + "epoch": 0.6204140808713026, + "grad_norm": 8.423047065734863, + "learning_rate": 3.965976531881163e-05, + "loss": 0.767, + "step": 70180 + }, + { + "epoch": 0.6205024841316148, + "grad_norm": 3.6703591346740723, + "learning_rate": 3.9658291931139756e-05, + "loss": 0.8508, + "step": 70190 + }, + { + "epoch": 0.620590887391927, + "grad_norm": 3.2734997272491455, + "learning_rate": 3.9656818543467884e-05, + "loss": 0.6617, + "step": 70200 + }, + { + "epoch": 0.6206792906522393, + "grad_norm": 1.8895032405853271, + "learning_rate": 3.965534515579601e-05, + "loss": 0.7108, + "step": 70210 + }, + { + "epoch": 0.6207676939125515, + "grad_norm": 2.559908390045166, + "learning_rate": 3.965387176812415e-05, + "loss": 0.6649, + "step": 70220 + }, + { + "epoch": 0.6208560971728637, + "grad_norm": 2.5558080673217773, + "learning_rate": 3.965239838045227e-05, + "loss": 0.6888, + "step": 70230 + }, + { + "epoch": 0.620944500433176, + "grad_norm": 2.5347774028778076, + "learning_rate": 3.9650924992780405e-05, + "loss": 0.57, + "step": 70240 + }, + { + "epoch": 0.6210329036934882, + "grad_norm": 2.93037748336792, + "learning_rate": 3.964945160510853e-05, + "loss": 0.7308, + "step": 70250 + }, + { + "epoch": 0.6211213069538004, + "grad_norm": 1.7718629837036133, + "learning_rate": 3.964797821743666e-05, + "loss": 0.6658, + "step": 70260 + }, + { + "epoch": 0.6212097102141126, + "grad_norm": 3.6500391960144043, + "learning_rate": 3.964650482976479e-05, + "loss": 0.6599, + "step": 70270 + }, + { + "epoch": 0.621298113474425, + "grad_norm": 8.794549942016602, + "learning_rate": 3.964503144209292e-05, + "loss": 0.6497, + "step": 70280 + }, + { + "epoch": 0.6213865167347372, + "grad_norm": 1.5417425632476807, + "learning_rate": 3.9643558054421046e-05, + "loss": 0.6085, + "step": 70290 + }, + { + "epoch": 0.6214749199950494, + "grad_norm": 3.684896230697632, + "learning_rate": 3.964208466674918e-05, + "loss": 0.6309, + "step": 70300 + }, + { + "epoch": 0.6215633232553617, + "grad_norm": 9.374857902526855, + "learning_rate": 3.96406112790773e-05, + "loss": 0.7555, + "step": 70310 + }, + { + "epoch": 0.6216517265156739, + "grad_norm": 4.701685428619385, + "learning_rate": 3.963913789140544e-05, + "loss": 0.6665, + "step": 70320 + }, + { + "epoch": 0.6217401297759861, + "grad_norm": 2.6833534240722656, + "learning_rate": 3.9637664503733567e-05, + "loss": 0.7492, + "step": 70330 + }, + { + "epoch": 0.6218285330362984, + "grad_norm": 2.8093228340148926, + "learning_rate": 3.9636191116061695e-05, + "loss": 0.7487, + "step": 70340 + }, + { + "epoch": 0.6219169362966106, + "grad_norm": 2.0366687774658203, + "learning_rate": 3.963471772838982e-05, + "loss": 0.6758, + "step": 70350 + }, + { + "epoch": 0.6220053395569228, + "grad_norm": 6.7705230712890625, + "learning_rate": 3.963324434071796e-05, + "loss": 0.6338, + "step": 70360 + }, + { + "epoch": 0.6220937428172351, + "grad_norm": 5.320030212402344, + "learning_rate": 3.963177095304608e-05, + "loss": 0.6145, + "step": 70370 + }, + { + "epoch": 0.6221821460775473, + "grad_norm": 1.349388837814331, + "learning_rate": 3.9630297565374215e-05, + "loss": 0.5986, + "step": 70380 + }, + { + "epoch": 0.6222705493378596, + "grad_norm": 3.4945578575134277, + "learning_rate": 3.962882417770234e-05, + "loss": 0.6608, + "step": 70390 + }, + { + "epoch": 0.6223589525981719, + "grad_norm": 7.115966320037842, + "learning_rate": 3.962735079003047e-05, + "loss": 0.7617, + "step": 70400 + }, + { + "epoch": 0.6224473558584841, + "grad_norm": 3.625720500946045, + "learning_rate": 3.96258774023586e-05, + "loss": 0.696, + "step": 70410 + }, + { + "epoch": 0.6225357591187963, + "grad_norm": 4.838071346282959, + "learning_rate": 3.962440401468673e-05, + "loss": 0.742, + "step": 70420 + }, + { + "epoch": 0.6226241623791086, + "grad_norm": 1.1251450777053833, + "learning_rate": 3.962293062701486e-05, + "loss": 0.6648, + "step": 70430 + }, + { + "epoch": 0.6227125656394208, + "grad_norm": 5.156866073608398, + "learning_rate": 3.962145723934299e-05, + "loss": 0.6456, + "step": 70440 + }, + { + "epoch": 0.622800968899733, + "grad_norm": 2.8390755653381348, + "learning_rate": 3.9619983851671114e-05, + "loss": 0.704, + "step": 70450 + }, + { + "epoch": 0.6228893721600453, + "grad_norm": 2.190711498260498, + "learning_rate": 3.961851046399925e-05, + "loss": 0.7537, + "step": 70460 + }, + { + "epoch": 0.6229777754203575, + "grad_norm": 11.280312538146973, + "learning_rate": 3.961703707632738e-05, + "loss": 0.691, + "step": 70470 + }, + { + "epoch": 0.6230661786806697, + "grad_norm": 1.746973991394043, + "learning_rate": 3.9615563688655505e-05, + "loss": 0.7353, + "step": 70480 + }, + { + "epoch": 0.6231545819409819, + "grad_norm": 3.1988108158111572, + "learning_rate": 3.9614090300983634e-05, + "loss": 0.613, + "step": 70490 + }, + { + "epoch": 0.6232429852012942, + "grad_norm": 2.930650234222412, + "learning_rate": 3.961261691331177e-05, + "loss": 0.7425, + "step": 70500 + }, + { + "epoch": 0.6233313884616065, + "grad_norm": 3.3266892433166504, + "learning_rate": 3.961114352563989e-05, + "loss": 0.6717, + "step": 70510 + }, + { + "epoch": 0.6234197917219187, + "grad_norm": 1.8725578784942627, + "learning_rate": 3.9609670137968026e-05, + "loss": 0.6432, + "step": 70520 + }, + { + "epoch": 0.623508194982231, + "grad_norm": 5.384922027587891, + "learning_rate": 3.960819675029615e-05, + "loss": 0.9552, + "step": 70530 + }, + { + "epoch": 0.6235965982425432, + "grad_norm": 2.1145195960998535, + "learning_rate": 3.960672336262428e-05, + "loss": 0.6509, + "step": 70540 + }, + { + "epoch": 0.6236850015028554, + "grad_norm": 2.2755064964294434, + "learning_rate": 3.960524997495241e-05, + "loss": 0.7932, + "step": 70550 + }, + { + "epoch": 0.6237734047631677, + "grad_norm": 1.5214645862579346, + "learning_rate": 3.960377658728054e-05, + "loss": 0.7171, + "step": 70560 + }, + { + "epoch": 0.6238618080234799, + "grad_norm": 2.2720513343811035, + "learning_rate": 3.960230319960867e-05, + "loss": 0.7061, + "step": 70570 + }, + { + "epoch": 0.6239502112837921, + "grad_norm": 1.4581434726715088, + "learning_rate": 3.96008298119368e-05, + "loss": 0.6505, + "step": 70580 + }, + { + "epoch": 0.6240386145441044, + "grad_norm": 5.223835468292236, + "learning_rate": 3.9599356424264924e-05, + "loss": 0.6278, + "step": 70590 + }, + { + "epoch": 0.6241270178044166, + "grad_norm": 6.953359127044678, + "learning_rate": 3.959788303659306e-05, + "loss": 0.7659, + "step": 70600 + }, + { + "epoch": 0.6242154210647288, + "grad_norm": 4.40071439743042, + "learning_rate": 3.959640964892119e-05, + "loss": 0.5564, + "step": 70610 + }, + { + "epoch": 0.6243038243250411, + "grad_norm": 4.113469123840332, + "learning_rate": 3.9594936261249316e-05, + "loss": 0.7422, + "step": 70620 + }, + { + "epoch": 0.6243922275853534, + "grad_norm": 10.757296562194824, + "learning_rate": 3.9593462873577444e-05, + "loss": 0.664, + "step": 70630 + }, + { + "epoch": 0.6244806308456656, + "grad_norm": 1.3857872486114502, + "learning_rate": 3.959198948590557e-05, + "loss": 0.5617, + "step": 70640 + }, + { + "epoch": 0.6245690341059779, + "grad_norm": 12.130651473999023, + "learning_rate": 3.95905160982337e-05, + "loss": 0.5965, + "step": 70650 + }, + { + "epoch": 0.6246574373662901, + "grad_norm": 1.5708321332931519, + "learning_rate": 3.9589042710561836e-05, + "loss": 0.4673, + "step": 70660 + }, + { + "epoch": 0.6247458406266023, + "grad_norm": 1.8471735715866089, + "learning_rate": 3.9587569322889965e-05, + "loss": 0.6611, + "step": 70670 + }, + { + "epoch": 0.6248342438869146, + "grad_norm": 5.520918846130371, + "learning_rate": 3.958609593521809e-05, + "loss": 0.6115, + "step": 70680 + }, + { + "epoch": 0.6249226471472268, + "grad_norm": 5.764188766479492, + "learning_rate": 3.958462254754622e-05, + "loss": 0.7145, + "step": 70690 + }, + { + "epoch": 0.625011050407539, + "grad_norm": 3.201566219329834, + "learning_rate": 3.958314915987435e-05, + "loss": 0.6039, + "step": 70700 + }, + { + "epoch": 0.6250994536678512, + "grad_norm": 1.7584728002548218, + "learning_rate": 3.958167577220248e-05, + "loss": 0.7577, + "step": 70710 + }, + { + "epoch": 0.6251878569281635, + "grad_norm": 2.0485239028930664, + "learning_rate": 3.958020238453061e-05, + "loss": 0.8059, + "step": 70720 + }, + { + "epoch": 0.6252762601884757, + "grad_norm": 1.824622392654419, + "learning_rate": 3.957872899685874e-05, + "loss": 0.6871, + "step": 70730 + }, + { + "epoch": 0.6253646634487879, + "grad_norm": 7.051321506500244, + "learning_rate": 3.957725560918687e-05, + "loss": 0.6122, + "step": 70740 + }, + { + "epoch": 0.6254530667091003, + "grad_norm": 3.967787027359009, + "learning_rate": 3.9575782221515e-05, + "loss": 0.6527, + "step": 70750 + }, + { + "epoch": 0.6255414699694125, + "grad_norm": 1.4101231098175049, + "learning_rate": 3.9574308833843126e-05, + "loss": 0.6156, + "step": 70760 + }, + { + "epoch": 0.6256298732297247, + "grad_norm": 4.428545951843262, + "learning_rate": 3.9572835446171255e-05, + "loss": 0.6416, + "step": 70770 + }, + { + "epoch": 0.625718276490037, + "grad_norm": 5.919891834259033, + "learning_rate": 3.957136205849938e-05, + "loss": 0.7208, + "step": 70780 + }, + { + "epoch": 0.6258066797503492, + "grad_norm": 14.246026039123535, + "learning_rate": 3.956988867082752e-05, + "loss": 0.7137, + "step": 70790 + }, + { + "epoch": 0.6258950830106614, + "grad_norm": 2.3636012077331543, + "learning_rate": 3.956841528315565e-05, + "loss": 0.6627, + "step": 70800 + }, + { + "epoch": 0.6259834862709737, + "grad_norm": 5.053730487823486, + "learning_rate": 3.9566941895483775e-05, + "loss": 0.6867, + "step": 70810 + }, + { + "epoch": 0.6260718895312859, + "grad_norm": 4.1448540687561035, + "learning_rate": 3.95654685078119e-05, + "loss": 0.769, + "step": 70820 + }, + { + "epoch": 0.6261602927915981, + "grad_norm": 1.0152547359466553, + "learning_rate": 3.956399512014003e-05, + "loss": 0.5611, + "step": 70830 + }, + { + "epoch": 0.6262486960519104, + "grad_norm": 3.6613223552703857, + "learning_rate": 3.956252173246816e-05, + "loss": 0.6594, + "step": 70840 + }, + { + "epoch": 0.6263370993122226, + "grad_norm": 1.9071577787399292, + "learning_rate": 3.9561048344796295e-05, + "loss": 0.6842, + "step": 70850 + }, + { + "epoch": 0.6264255025725348, + "grad_norm": 11.972447395324707, + "learning_rate": 3.9559574957124424e-05, + "loss": 0.5839, + "step": 70860 + }, + { + "epoch": 0.6265139058328472, + "grad_norm": 3.9560227394104004, + "learning_rate": 3.955810156945255e-05, + "loss": 0.6597, + "step": 70870 + }, + { + "epoch": 0.6266023090931594, + "grad_norm": 3.1316449642181396, + "learning_rate": 3.955662818178068e-05, + "loss": 0.7728, + "step": 70880 + }, + { + "epoch": 0.6266907123534716, + "grad_norm": 7.84269380569458, + "learning_rate": 3.955515479410881e-05, + "loss": 0.6914, + "step": 70890 + }, + { + "epoch": 0.6267791156137839, + "grad_norm": 3.6713805198669434, + "learning_rate": 3.955368140643694e-05, + "loss": 0.6, + "step": 70900 + }, + { + "epoch": 0.6268675188740961, + "grad_norm": 7.718493938446045, + "learning_rate": 3.955220801876507e-05, + "loss": 0.7166, + "step": 70910 + }, + { + "epoch": 0.6269559221344083, + "grad_norm": 6.260168552398682, + "learning_rate": 3.9550734631093194e-05, + "loss": 0.5618, + "step": 70920 + }, + { + "epoch": 0.6270443253947205, + "grad_norm": 2.636608123779297, + "learning_rate": 3.954926124342133e-05, + "loss": 0.6655, + "step": 70930 + }, + { + "epoch": 0.6271327286550328, + "grad_norm": 4.675978660583496, + "learning_rate": 3.954778785574946e-05, + "loss": 0.6802, + "step": 70940 + }, + { + "epoch": 0.627221131915345, + "grad_norm": 1.7388070821762085, + "learning_rate": 3.9546314468077586e-05, + "loss": 0.7387, + "step": 70950 + }, + { + "epoch": 0.6273095351756572, + "grad_norm": 2.636258125305176, + "learning_rate": 3.9544841080405714e-05, + "loss": 0.6117, + "step": 70960 + }, + { + "epoch": 0.6273979384359695, + "grad_norm": 3.331146240234375, + "learning_rate": 3.954336769273385e-05, + "loss": 0.78, + "step": 70970 + }, + { + "epoch": 0.6274863416962818, + "grad_norm": 3.7574076652526855, + "learning_rate": 3.954189430506197e-05, + "loss": 0.7342, + "step": 70980 + }, + { + "epoch": 0.627574744956594, + "grad_norm": 1.7998695373535156, + "learning_rate": 3.9540420917390106e-05, + "loss": 0.6975, + "step": 70990 + }, + { + "epoch": 0.6276631482169063, + "grad_norm": 3.9677886962890625, + "learning_rate": 3.953894752971823e-05, + "loss": 0.8364, + "step": 71000 + }, + { + "epoch": 0.6277515514772185, + "grad_norm": 2.983008861541748, + "learning_rate": 3.953747414204636e-05, + "loss": 0.6461, + "step": 71010 + }, + { + "epoch": 0.6278399547375307, + "grad_norm": 1.4710510969161987, + "learning_rate": 3.953600075437449e-05, + "loss": 0.6559, + "step": 71020 + }, + { + "epoch": 0.627928357997843, + "grad_norm": 1.6924302577972412, + "learning_rate": 3.953452736670262e-05, + "loss": 0.6297, + "step": 71030 + }, + { + "epoch": 0.6280167612581552, + "grad_norm": 1.9899982213974, + "learning_rate": 3.953305397903075e-05, + "loss": 0.6832, + "step": 71040 + }, + { + "epoch": 0.6281051645184674, + "grad_norm": 1.574859619140625, + "learning_rate": 3.953158059135888e-05, + "loss": 0.5918, + "step": 71050 + }, + { + "epoch": 0.6281935677787797, + "grad_norm": 6.533076286315918, + "learning_rate": 3.9530107203687004e-05, + "loss": 0.7408, + "step": 71060 + }, + { + "epoch": 0.6282819710390919, + "grad_norm": 7.978692531585693, + "learning_rate": 3.952863381601514e-05, + "loss": 0.5992, + "step": 71070 + }, + { + "epoch": 0.6283703742994041, + "grad_norm": 8.623467445373535, + "learning_rate": 3.952716042834327e-05, + "loss": 0.5835, + "step": 71080 + }, + { + "epoch": 0.6284587775597164, + "grad_norm": 7.112682819366455, + "learning_rate": 3.9525687040671396e-05, + "loss": 0.6871, + "step": 71090 + }, + { + "epoch": 0.6285471808200287, + "grad_norm": 8.423845291137695, + "learning_rate": 3.9524213652999524e-05, + "loss": 0.6075, + "step": 71100 + }, + { + "epoch": 0.6286355840803409, + "grad_norm": 1.8750859498977661, + "learning_rate": 3.952274026532765e-05, + "loss": 0.821, + "step": 71110 + }, + { + "epoch": 0.6287239873406532, + "grad_norm": 7.829469203948975, + "learning_rate": 3.952126687765578e-05, + "loss": 0.6275, + "step": 71120 + }, + { + "epoch": 0.6288123906009654, + "grad_norm": 2.229738473892212, + "learning_rate": 3.9519793489983916e-05, + "loss": 0.7512, + "step": 71130 + }, + { + "epoch": 0.6289007938612776, + "grad_norm": 3.0698466300964355, + "learning_rate": 3.951832010231204e-05, + "loss": 0.665, + "step": 71140 + }, + { + "epoch": 0.6289891971215898, + "grad_norm": 9.702445030212402, + "learning_rate": 3.951684671464017e-05, + "loss": 0.7262, + "step": 71150 + }, + { + "epoch": 0.6290776003819021, + "grad_norm": 5.001696586608887, + "learning_rate": 3.95153733269683e-05, + "loss": 0.7692, + "step": 71160 + }, + { + "epoch": 0.6291660036422143, + "grad_norm": 5.729146480560303, + "learning_rate": 3.951389993929643e-05, + "loss": 0.7092, + "step": 71170 + }, + { + "epoch": 0.6292544069025265, + "grad_norm": 2.534942150115967, + "learning_rate": 3.951242655162456e-05, + "loss": 0.5682, + "step": 71180 + }, + { + "epoch": 0.6293428101628388, + "grad_norm": 2.6672048568725586, + "learning_rate": 3.951095316395269e-05, + "loss": 0.6661, + "step": 71190 + }, + { + "epoch": 0.629431213423151, + "grad_norm": 5.888321876525879, + "learning_rate": 3.9509479776280815e-05, + "loss": 0.6581, + "step": 71200 + }, + { + "epoch": 0.6295196166834632, + "grad_norm": 4.407350540161133, + "learning_rate": 3.950800638860895e-05, + "loss": 0.6108, + "step": 71210 + }, + { + "epoch": 0.6296080199437756, + "grad_norm": 3.452178478240967, + "learning_rate": 3.950653300093707e-05, + "loss": 0.6308, + "step": 71220 + }, + { + "epoch": 0.6296964232040878, + "grad_norm": 1.931299090385437, + "learning_rate": 3.9505059613265207e-05, + "loss": 0.6578, + "step": 71230 + }, + { + "epoch": 0.6297848264644, + "grad_norm": 2.254317045211792, + "learning_rate": 3.9503586225593335e-05, + "loss": 0.7416, + "step": 71240 + }, + { + "epoch": 0.6298732297247123, + "grad_norm": 1.2741414308547974, + "learning_rate": 3.950211283792146e-05, + "loss": 0.7595, + "step": 71250 + }, + { + "epoch": 0.6299616329850245, + "grad_norm": 4.544749736785889, + "learning_rate": 3.950063945024959e-05, + "loss": 0.5759, + "step": 71260 + }, + { + "epoch": 0.6300500362453367, + "grad_norm": 2.699962854385376, + "learning_rate": 3.949916606257773e-05, + "loss": 0.7419, + "step": 71270 + }, + { + "epoch": 0.630138439505649, + "grad_norm": 5.524005889892578, + "learning_rate": 3.949769267490585e-05, + "loss": 0.6595, + "step": 71280 + }, + { + "epoch": 0.6302268427659612, + "grad_norm": 2.926417589187622, + "learning_rate": 3.9496219287233983e-05, + "loss": 0.6159, + "step": 71290 + }, + { + "epoch": 0.6303152460262734, + "grad_norm": 0.8104326725006104, + "learning_rate": 3.949474589956211e-05, + "loss": 0.5544, + "step": 71300 + }, + { + "epoch": 0.6304036492865857, + "grad_norm": 21.157617568969727, + "learning_rate": 3.949327251189024e-05, + "loss": 0.6606, + "step": 71310 + }, + { + "epoch": 0.6304920525468979, + "grad_norm": 4.392067909240723, + "learning_rate": 3.949179912421837e-05, + "loss": 0.731, + "step": 71320 + }, + { + "epoch": 0.6305804558072101, + "grad_norm": 11.362103462219238, + "learning_rate": 3.9490325736546504e-05, + "loss": 0.7257, + "step": 71330 + }, + { + "epoch": 0.6306688590675225, + "grad_norm": 6.68247127532959, + "learning_rate": 3.9488852348874625e-05, + "loss": 0.592, + "step": 71340 + }, + { + "epoch": 0.6307572623278347, + "grad_norm": 13.463199615478516, + "learning_rate": 3.948737896120276e-05, + "loss": 0.6858, + "step": 71350 + }, + { + "epoch": 0.6308456655881469, + "grad_norm": 2.395317792892456, + "learning_rate": 3.948590557353088e-05, + "loss": 0.6241, + "step": 71360 + }, + { + "epoch": 0.6309340688484592, + "grad_norm": 5.0932512283325195, + "learning_rate": 3.948443218585902e-05, + "loss": 0.6482, + "step": 71370 + }, + { + "epoch": 0.6310224721087714, + "grad_norm": 2.647167682647705, + "learning_rate": 3.9482958798187145e-05, + "loss": 0.6506, + "step": 71380 + }, + { + "epoch": 0.6311108753690836, + "grad_norm": 6.654621601104736, + "learning_rate": 3.9481485410515274e-05, + "loss": 0.6528, + "step": 71390 + }, + { + "epoch": 0.6311992786293958, + "grad_norm": 8.494424819946289, + "learning_rate": 3.94800120228434e-05, + "loss": 0.6563, + "step": 71400 + }, + { + "epoch": 0.6312876818897081, + "grad_norm": 3.0831310749053955, + "learning_rate": 3.947853863517154e-05, + "loss": 0.7338, + "step": 71410 + }, + { + "epoch": 0.6313760851500203, + "grad_norm": 6.151607513427734, + "learning_rate": 3.947706524749966e-05, + "loss": 0.6282, + "step": 71420 + }, + { + "epoch": 0.6314644884103325, + "grad_norm": 4.34309720993042, + "learning_rate": 3.9475591859827794e-05, + "loss": 0.7178, + "step": 71430 + }, + { + "epoch": 0.6315528916706448, + "grad_norm": 1.1920831203460693, + "learning_rate": 3.947411847215592e-05, + "loss": 0.7459, + "step": 71440 + }, + { + "epoch": 0.6316412949309571, + "grad_norm": 1.8974387645721436, + "learning_rate": 3.947264508448405e-05, + "loss": 0.7442, + "step": 71450 + }, + { + "epoch": 0.6317296981912693, + "grad_norm": 4.991611480712891, + "learning_rate": 3.947117169681218e-05, + "loss": 0.7063, + "step": 71460 + }, + { + "epoch": 0.6318181014515816, + "grad_norm": 12.901622772216797, + "learning_rate": 3.946969830914031e-05, + "loss": 0.7322, + "step": 71470 + }, + { + "epoch": 0.6319065047118938, + "grad_norm": 6.449521064758301, + "learning_rate": 3.9468224921468436e-05, + "loss": 0.5937, + "step": 71480 + }, + { + "epoch": 0.631994907972206, + "grad_norm": 2.001889705657959, + "learning_rate": 3.946675153379657e-05, + "loss": 0.5844, + "step": 71490 + }, + { + "epoch": 0.6320833112325183, + "grad_norm": 9.098803520202637, + "learning_rate": 3.946527814612469e-05, + "loss": 0.6801, + "step": 71500 + }, + { + "epoch": 0.6321717144928305, + "grad_norm": 8.762860298156738, + "learning_rate": 3.946380475845283e-05, + "loss": 0.8125, + "step": 71510 + }, + { + "epoch": 0.6322601177531427, + "grad_norm": 1.3403794765472412, + "learning_rate": 3.9462331370780956e-05, + "loss": 0.6922, + "step": 71520 + }, + { + "epoch": 0.632348521013455, + "grad_norm": 1.946393370628357, + "learning_rate": 3.9460857983109084e-05, + "loss": 0.7752, + "step": 71530 + }, + { + "epoch": 0.6324369242737672, + "grad_norm": 11.131938934326172, + "learning_rate": 3.945938459543721e-05, + "loss": 0.6543, + "step": 71540 + }, + { + "epoch": 0.6325253275340794, + "grad_norm": 8.733319282531738, + "learning_rate": 3.945791120776535e-05, + "loss": 0.5897, + "step": 71550 + }, + { + "epoch": 0.6326137307943916, + "grad_norm": 3.5438811779022217, + "learning_rate": 3.945643782009347e-05, + "loss": 0.6385, + "step": 71560 + }, + { + "epoch": 0.632702134054704, + "grad_norm": 8.746137619018555, + "learning_rate": 3.9454964432421604e-05, + "loss": 0.5731, + "step": 71570 + }, + { + "epoch": 0.6327905373150162, + "grad_norm": 3.85282039642334, + "learning_rate": 3.945349104474973e-05, + "loss": 0.6728, + "step": 71580 + }, + { + "epoch": 0.6328789405753285, + "grad_norm": 1.4692754745483398, + "learning_rate": 3.945201765707786e-05, + "loss": 0.6366, + "step": 71590 + }, + { + "epoch": 0.6329673438356407, + "grad_norm": 2.382704257965088, + "learning_rate": 3.945054426940599e-05, + "loss": 0.6038, + "step": 71600 + }, + { + "epoch": 0.6330557470959529, + "grad_norm": 3.2350873947143555, + "learning_rate": 3.944907088173412e-05, + "loss": 0.7405, + "step": 71610 + }, + { + "epoch": 0.6331441503562651, + "grad_norm": 1.6920549869537354, + "learning_rate": 3.9447597494062246e-05, + "loss": 0.6516, + "step": 71620 + }, + { + "epoch": 0.6332325536165774, + "grad_norm": 3.844879150390625, + "learning_rate": 3.944612410639038e-05, + "loss": 0.6012, + "step": 71630 + }, + { + "epoch": 0.6333209568768896, + "grad_norm": 3.9348480701446533, + "learning_rate": 3.944465071871851e-05, + "loss": 0.7567, + "step": 71640 + }, + { + "epoch": 0.6334093601372018, + "grad_norm": 3.7274162769317627, + "learning_rate": 3.944317733104664e-05, + "loss": 0.6956, + "step": 71650 + }, + { + "epoch": 0.6334977633975141, + "grad_norm": 7.328160762786865, + "learning_rate": 3.9441703943374766e-05, + "loss": 0.6459, + "step": 71660 + }, + { + "epoch": 0.6335861666578263, + "grad_norm": 3.333332061767578, + "learning_rate": 3.9440230555702895e-05, + "loss": 0.5662, + "step": 71670 + }, + { + "epoch": 0.6336745699181385, + "grad_norm": 7.647139072418213, + "learning_rate": 3.943875716803102e-05, + "loss": 0.6997, + "step": 71680 + }, + { + "epoch": 0.6337629731784509, + "grad_norm": 1.4852776527404785, + "learning_rate": 3.943728378035915e-05, + "loss": 0.7116, + "step": 71690 + }, + { + "epoch": 0.6338513764387631, + "grad_norm": 3.6578001976013184, + "learning_rate": 3.943581039268729e-05, + "loss": 0.6353, + "step": 71700 + }, + { + "epoch": 0.6339397796990753, + "grad_norm": 4.410030364990234, + "learning_rate": 3.9434337005015415e-05, + "loss": 0.5709, + "step": 71710 + }, + { + "epoch": 0.6340281829593876, + "grad_norm": 2.205988883972168, + "learning_rate": 3.943286361734354e-05, + "loss": 0.7324, + "step": 71720 + }, + { + "epoch": 0.6341165862196998, + "grad_norm": 3.9057741165161133, + "learning_rate": 3.943139022967167e-05, + "loss": 0.7229, + "step": 71730 + }, + { + "epoch": 0.634204989480012, + "grad_norm": 2.1983866691589355, + "learning_rate": 3.94299168419998e-05, + "loss": 0.6926, + "step": 71740 + }, + { + "epoch": 0.6342933927403243, + "grad_norm": 3.0037119388580322, + "learning_rate": 3.942844345432793e-05, + "loss": 0.63, + "step": 71750 + }, + { + "epoch": 0.6343817960006365, + "grad_norm": 22.46878433227539, + "learning_rate": 3.9426970066656064e-05, + "loss": 0.7218, + "step": 71760 + }, + { + "epoch": 0.6344701992609487, + "grad_norm": 3.7411534786224365, + "learning_rate": 3.942549667898419e-05, + "loss": 0.6539, + "step": 71770 + }, + { + "epoch": 0.634558602521261, + "grad_norm": 2.6895134449005127, + "learning_rate": 3.942402329131232e-05, + "loss": 0.6616, + "step": 71780 + }, + { + "epoch": 0.6346470057815732, + "grad_norm": 1.4705055952072144, + "learning_rate": 3.942254990364045e-05, + "loss": 0.7168, + "step": 71790 + }, + { + "epoch": 0.6347354090418854, + "grad_norm": 2.9868435859680176, + "learning_rate": 3.942107651596858e-05, + "loss": 0.714, + "step": 71800 + }, + { + "epoch": 0.6348238123021978, + "grad_norm": 2.1962103843688965, + "learning_rate": 3.9419603128296705e-05, + "loss": 0.7727, + "step": 71810 + }, + { + "epoch": 0.63491221556251, + "grad_norm": 2.4985291957855225, + "learning_rate": 3.941812974062484e-05, + "loss": 0.7119, + "step": 71820 + }, + { + "epoch": 0.6350006188228222, + "grad_norm": 2.670518398284912, + "learning_rate": 3.941665635295296e-05, + "loss": 0.8273, + "step": 71830 + }, + { + "epoch": 0.6350890220831344, + "grad_norm": 1.5731847286224365, + "learning_rate": 3.94151829652811e-05, + "loss": 0.6485, + "step": 71840 + }, + { + "epoch": 0.6351774253434467, + "grad_norm": 3.0562798976898193, + "learning_rate": 3.9413709577609225e-05, + "loss": 0.618, + "step": 71850 + }, + { + "epoch": 0.6352658286037589, + "grad_norm": 2.1619131565093994, + "learning_rate": 3.9412236189937354e-05, + "loss": 0.7004, + "step": 71860 + }, + { + "epoch": 0.6353542318640711, + "grad_norm": 3.9373903274536133, + "learning_rate": 3.941076280226548e-05, + "loss": 0.4843, + "step": 71870 + }, + { + "epoch": 0.6354426351243834, + "grad_norm": 9.931157112121582, + "learning_rate": 3.940928941459362e-05, + "loss": 0.7468, + "step": 71880 + }, + { + "epoch": 0.6355310383846956, + "grad_norm": 4.420377731323242, + "learning_rate": 3.940781602692174e-05, + "loss": 0.5889, + "step": 71890 + }, + { + "epoch": 0.6356194416450078, + "grad_norm": 2.833367109298706, + "learning_rate": 3.9406342639249874e-05, + "loss": 0.6528, + "step": 71900 + }, + { + "epoch": 0.6357078449053201, + "grad_norm": 33.024906158447266, + "learning_rate": 3.9404869251578e-05, + "loss": 0.6006, + "step": 71910 + }, + { + "epoch": 0.6357962481656323, + "grad_norm": 3.4277443885803223, + "learning_rate": 3.940339586390613e-05, + "loss": 0.7393, + "step": 71920 + }, + { + "epoch": 0.6358846514259446, + "grad_norm": 3.256603717803955, + "learning_rate": 3.940192247623426e-05, + "loss": 0.7508, + "step": 71930 + }, + { + "epoch": 0.6359730546862569, + "grad_norm": 2.869668960571289, + "learning_rate": 3.940044908856239e-05, + "loss": 0.7701, + "step": 71940 + }, + { + "epoch": 0.6360614579465691, + "grad_norm": 1.2516562938690186, + "learning_rate": 3.9398975700890516e-05, + "loss": 0.6374, + "step": 71950 + }, + { + "epoch": 0.6361498612068813, + "grad_norm": 7.036595821380615, + "learning_rate": 3.939750231321865e-05, + "loss": 0.7108, + "step": 71960 + }, + { + "epoch": 0.6362382644671936, + "grad_norm": 0.8625949025154114, + "learning_rate": 3.939602892554677e-05, + "loss": 0.599, + "step": 71970 + }, + { + "epoch": 0.6363266677275058, + "grad_norm": 12.085440635681152, + "learning_rate": 3.939455553787491e-05, + "loss": 0.6241, + "step": 71980 + }, + { + "epoch": 0.636415070987818, + "grad_norm": 2.5847713947296143, + "learning_rate": 3.9393082150203036e-05, + "loss": 0.6443, + "step": 71990 + }, + { + "epoch": 0.6365034742481303, + "grad_norm": 16.710302352905273, + "learning_rate": 3.9391608762531164e-05, + "loss": 0.7294, + "step": 72000 + }, + { + "epoch": 0.6365918775084425, + "grad_norm": 2.764096736907959, + "learning_rate": 3.939013537485929e-05, + "loss": 0.6141, + "step": 72010 + }, + { + "epoch": 0.6366802807687547, + "grad_norm": 12.803329467773438, + "learning_rate": 3.938866198718743e-05, + "loss": 0.6715, + "step": 72020 + }, + { + "epoch": 0.636768684029067, + "grad_norm": 2.818300485610962, + "learning_rate": 3.938718859951555e-05, + "loss": 0.6954, + "step": 72030 + }, + { + "epoch": 0.6368570872893793, + "grad_norm": 11.248153686523438, + "learning_rate": 3.9385715211843685e-05, + "loss": 0.857, + "step": 72040 + }, + { + "epoch": 0.6369454905496915, + "grad_norm": 4.570161819458008, + "learning_rate": 3.9384241824171806e-05, + "loss": 0.669, + "step": 72050 + }, + { + "epoch": 0.6370338938100037, + "grad_norm": 4.431693077087402, + "learning_rate": 3.938276843649994e-05, + "loss": 0.6775, + "step": 72060 + }, + { + "epoch": 0.637122297070316, + "grad_norm": 3.9414236545562744, + "learning_rate": 3.938129504882807e-05, + "loss": 0.733, + "step": 72070 + }, + { + "epoch": 0.6372107003306282, + "grad_norm": 4.171746253967285, + "learning_rate": 3.93798216611562e-05, + "loss": 0.7131, + "step": 72080 + }, + { + "epoch": 0.6372991035909404, + "grad_norm": 2.7596962451934814, + "learning_rate": 3.9378348273484326e-05, + "loss": 0.7805, + "step": 72090 + }, + { + "epoch": 0.6373875068512527, + "grad_norm": 3.1972498893737793, + "learning_rate": 3.937687488581246e-05, + "loss": 0.604, + "step": 72100 + }, + { + "epoch": 0.6374759101115649, + "grad_norm": 1.2993907928466797, + "learning_rate": 3.937540149814058e-05, + "loss": 0.7639, + "step": 72110 + }, + { + "epoch": 0.6375643133718771, + "grad_norm": 3.5008013248443604, + "learning_rate": 3.937392811046872e-05, + "loss": 0.7438, + "step": 72120 + }, + { + "epoch": 0.6376527166321894, + "grad_norm": 2.820345878601074, + "learning_rate": 3.9372454722796846e-05, + "loss": 0.7525, + "step": 72130 + }, + { + "epoch": 0.6377411198925016, + "grad_norm": 3.930330991744995, + "learning_rate": 3.9370981335124975e-05, + "loss": 0.6784, + "step": 72140 + }, + { + "epoch": 0.6378295231528138, + "grad_norm": 2.45150089263916, + "learning_rate": 3.93695079474531e-05, + "loss": 0.7035, + "step": 72150 + }, + { + "epoch": 0.6379179264131262, + "grad_norm": 4.0196852684021, + "learning_rate": 3.936803455978123e-05, + "loss": 0.6605, + "step": 72160 + }, + { + "epoch": 0.6380063296734384, + "grad_norm": 0.7428300380706787, + "learning_rate": 3.936656117210936e-05, + "loss": 0.6295, + "step": 72170 + }, + { + "epoch": 0.6380947329337506, + "grad_norm": 2.0799736976623535, + "learning_rate": 3.9365087784437495e-05, + "loss": 0.7059, + "step": 72180 + }, + { + "epoch": 0.6381831361940629, + "grad_norm": 2.508800745010376, + "learning_rate": 3.936361439676562e-05, + "loss": 0.5304, + "step": 72190 + }, + { + "epoch": 0.6382715394543751, + "grad_norm": 9.046399116516113, + "learning_rate": 3.936214100909375e-05, + "loss": 0.5919, + "step": 72200 + }, + { + "epoch": 0.6383599427146873, + "grad_norm": 5.714117050170898, + "learning_rate": 3.936066762142188e-05, + "loss": 0.7344, + "step": 72210 + }, + { + "epoch": 0.6384483459749996, + "grad_norm": 2.476715326309204, + "learning_rate": 3.935919423375001e-05, + "loss": 0.9094, + "step": 72220 + }, + { + "epoch": 0.6385367492353118, + "grad_norm": 2.1051700115203857, + "learning_rate": 3.935772084607814e-05, + "loss": 0.6232, + "step": 72230 + }, + { + "epoch": 0.638625152495624, + "grad_norm": 5.702798366546631, + "learning_rate": 3.935624745840627e-05, + "loss": 0.5859, + "step": 72240 + }, + { + "epoch": 0.6387135557559362, + "grad_norm": 3.238276243209839, + "learning_rate": 3.9354774070734394e-05, + "loss": 0.7372, + "step": 72250 + }, + { + "epoch": 0.6388019590162485, + "grad_norm": 1.8214185237884521, + "learning_rate": 3.935330068306253e-05, + "loss": 0.5912, + "step": 72260 + }, + { + "epoch": 0.6388903622765607, + "grad_norm": 2.8009698390960693, + "learning_rate": 3.935182729539066e-05, + "loss": 0.7354, + "step": 72270 + }, + { + "epoch": 0.638978765536873, + "grad_norm": 4.564553737640381, + "learning_rate": 3.9350353907718785e-05, + "loss": 0.6423, + "step": 72280 + }, + { + "epoch": 0.6390671687971853, + "grad_norm": 3.0105106830596924, + "learning_rate": 3.9348880520046914e-05, + "loss": 0.8495, + "step": 72290 + }, + { + "epoch": 0.6391555720574975, + "grad_norm": 13.156732559204102, + "learning_rate": 3.934740713237504e-05, + "loss": 0.7583, + "step": 72300 + }, + { + "epoch": 0.6392439753178097, + "grad_norm": 4.095754623413086, + "learning_rate": 3.934593374470317e-05, + "loss": 0.5831, + "step": 72310 + }, + { + "epoch": 0.639332378578122, + "grad_norm": 6.197402477264404, + "learning_rate": 3.9344460357031306e-05, + "loss": 0.6996, + "step": 72320 + }, + { + "epoch": 0.6394207818384342, + "grad_norm": 5.47165584564209, + "learning_rate": 3.934298696935943e-05, + "loss": 0.8331, + "step": 72330 + }, + { + "epoch": 0.6395091850987464, + "grad_norm": 9.038413047790527, + "learning_rate": 3.934151358168756e-05, + "loss": 0.7595, + "step": 72340 + }, + { + "epoch": 0.6395975883590587, + "grad_norm": 1.7033380270004272, + "learning_rate": 3.934004019401569e-05, + "loss": 0.6425, + "step": 72350 + }, + { + "epoch": 0.6396859916193709, + "grad_norm": 2.089405059814453, + "learning_rate": 3.933856680634382e-05, + "loss": 0.539, + "step": 72360 + }, + { + "epoch": 0.6397743948796831, + "grad_norm": 10.20825481414795, + "learning_rate": 3.933709341867195e-05, + "loss": 0.577, + "step": 72370 + }, + { + "epoch": 0.6398627981399954, + "grad_norm": 2.216937303543091, + "learning_rate": 3.933562003100008e-05, + "loss": 0.8453, + "step": 72380 + }, + { + "epoch": 0.6399512014003076, + "grad_norm": 4.528128147125244, + "learning_rate": 3.9334146643328204e-05, + "loss": 0.6589, + "step": 72390 + }, + { + "epoch": 0.6400396046606199, + "grad_norm": 3.941056728363037, + "learning_rate": 3.933267325565634e-05, + "loss": 0.6726, + "step": 72400 + }, + { + "epoch": 0.6401280079209322, + "grad_norm": 2.947174310684204, + "learning_rate": 3.933119986798446e-05, + "loss": 0.5928, + "step": 72410 + }, + { + "epoch": 0.6402164111812444, + "grad_norm": 6.511383533477783, + "learning_rate": 3.9329726480312596e-05, + "loss": 0.6863, + "step": 72420 + }, + { + "epoch": 0.6403048144415566, + "grad_norm": 6.1176066398620605, + "learning_rate": 3.9328253092640724e-05, + "loss": 0.7101, + "step": 72430 + }, + { + "epoch": 0.6403932177018689, + "grad_norm": 1.474511742591858, + "learning_rate": 3.932677970496885e-05, + "loss": 0.6886, + "step": 72440 + }, + { + "epoch": 0.6404816209621811, + "grad_norm": 3.9815256595611572, + "learning_rate": 3.932530631729698e-05, + "loss": 0.6555, + "step": 72450 + }, + { + "epoch": 0.6405700242224933, + "grad_norm": 2.5388100147247314, + "learning_rate": 3.9323832929625116e-05, + "loss": 0.6699, + "step": 72460 + }, + { + "epoch": 0.6406584274828055, + "grad_norm": 2.893710136413574, + "learning_rate": 3.932235954195324e-05, + "loss": 0.7019, + "step": 72470 + }, + { + "epoch": 0.6407468307431178, + "grad_norm": 6.327343940734863, + "learning_rate": 3.932088615428137e-05, + "loss": 0.7938, + "step": 72480 + }, + { + "epoch": 0.64083523400343, + "grad_norm": 3.7004594802856445, + "learning_rate": 3.93194127666095e-05, + "loss": 0.7295, + "step": 72490 + }, + { + "epoch": 0.6409236372637422, + "grad_norm": 4.380626201629639, + "learning_rate": 3.931793937893763e-05, + "loss": 0.6347, + "step": 72500 + }, + { + "epoch": 0.6410120405240545, + "grad_norm": 3.8228535652160645, + "learning_rate": 3.931646599126576e-05, + "loss": 0.6666, + "step": 72510 + }, + { + "epoch": 0.6411004437843668, + "grad_norm": 4.612709999084473, + "learning_rate": 3.9314992603593886e-05, + "loss": 0.7021, + "step": 72520 + }, + { + "epoch": 0.641188847044679, + "grad_norm": 10.086278915405273, + "learning_rate": 3.9313519215922015e-05, + "loss": 0.6832, + "step": 72530 + }, + { + "epoch": 0.6412772503049913, + "grad_norm": 4.685321807861328, + "learning_rate": 3.931204582825015e-05, + "loss": 0.7379, + "step": 72540 + }, + { + "epoch": 0.6413656535653035, + "grad_norm": 2.3907065391540527, + "learning_rate": 3.931057244057828e-05, + "loss": 0.6257, + "step": 72550 + }, + { + "epoch": 0.6414540568256157, + "grad_norm": 2.8433804512023926, + "learning_rate": 3.9309099052906406e-05, + "loss": 0.6606, + "step": 72560 + }, + { + "epoch": 0.641542460085928, + "grad_norm": 2.0151984691619873, + "learning_rate": 3.9307625665234535e-05, + "loss": 0.5543, + "step": 72570 + }, + { + "epoch": 0.6416308633462402, + "grad_norm": 2.5229897499084473, + "learning_rate": 3.930615227756266e-05, + "loss": 0.7262, + "step": 72580 + }, + { + "epoch": 0.6417192666065524, + "grad_norm": 3.7997162342071533, + "learning_rate": 3.930467888989079e-05, + "loss": 0.6412, + "step": 72590 + }, + { + "epoch": 0.6418076698668647, + "grad_norm": 6.855039119720459, + "learning_rate": 3.9303205502218927e-05, + "loss": 0.603, + "step": 72600 + }, + { + "epoch": 0.6418960731271769, + "grad_norm": 3.914764165878296, + "learning_rate": 3.9301732114547055e-05, + "loss": 0.7359, + "step": 72610 + }, + { + "epoch": 0.6419844763874891, + "grad_norm": 1.7383325099945068, + "learning_rate": 3.930025872687518e-05, + "loss": 0.6613, + "step": 72620 + }, + { + "epoch": 0.6420728796478015, + "grad_norm": 4.043971061706543, + "learning_rate": 3.929878533920331e-05, + "loss": 0.6749, + "step": 72630 + }, + { + "epoch": 0.6421612829081137, + "grad_norm": 1.9985517263412476, + "learning_rate": 3.929731195153144e-05, + "loss": 0.5844, + "step": 72640 + }, + { + "epoch": 0.6422496861684259, + "grad_norm": 3.319026470184326, + "learning_rate": 3.929583856385957e-05, + "loss": 0.6248, + "step": 72650 + }, + { + "epoch": 0.6423380894287382, + "grad_norm": 4.223296165466309, + "learning_rate": 3.92943651761877e-05, + "loss": 0.6825, + "step": 72660 + }, + { + "epoch": 0.6424264926890504, + "grad_norm": 1.7072672843933105, + "learning_rate": 3.929289178851583e-05, + "loss": 0.6081, + "step": 72670 + }, + { + "epoch": 0.6425148959493626, + "grad_norm": 4.584011077880859, + "learning_rate": 3.929141840084396e-05, + "loss": 0.6605, + "step": 72680 + }, + { + "epoch": 0.6426032992096748, + "grad_norm": 3.3608193397521973, + "learning_rate": 3.928994501317209e-05, + "loss": 0.6693, + "step": 72690 + }, + { + "epoch": 0.6426917024699871, + "grad_norm": 2.45196270942688, + "learning_rate": 3.928847162550022e-05, + "loss": 0.7753, + "step": 72700 + }, + { + "epoch": 0.6427801057302993, + "grad_norm": 14.06520938873291, + "learning_rate": 3.9286998237828345e-05, + "loss": 0.8139, + "step": 72710 + }, + { + "epoch": 0.6428685089906115, + "grad_norm": 3.0075533390045166, + "learning_rate": 3.9285524850156474e-05, + "loss": 0.5903, + "step": 72720 + }, + { + "epoch": 0.6429569122509238, + "grad_norm": 3.4011175632476807, + "learning_rate": 3.928405146248461e-05, + "loss": 0.47, + "step": 72730 + }, + { + "epoch": 0.643045315511236, + "grad_norm": 7.843263149261475, + "learning_rate": 3.928257807481274e-05, + "loss": 0.7056, + "step": 72740 + }, + { + "epoch": 0.6431337187715483, + "grad_norm": 7.2828850746154785, + "learning_rate": 3.9281104687140865e-05, + "loss": 0.789, + "step": 72750 + }, + { + "epoch": 0.6432221220318606, + "grad_norm": 5.0123772621154785, + "learning_rate": 3.9279631299468994e-05, + "loss": 0.6655, + "step": 72760 + }, + { + "epoch": 0.6433105252921728, + "grad_norm": 7.071907043457031, + "learning_rate": 3.927815791179712e-05, + "loss": 0.6601, + "step": 72770 + }, + { + "epoch": 0.643398928552485, + "grad_norm": 2.356963634490967, + "learning_rate": 3.927668452412525e-05, + "loss": 0.7259, + "step": 72780 + }, + { + "epoch": 0.6434873318127973, + "grad_norm": 6.7405266761779785, + "learning_rate": 3.9275211136453386e-05, + "loss": 0.7829, + "step": 72790 + }, + { + "epoch": 0.6435757350731095, + "grad_norm": 1.91306734085083, + "learning_rate": 3.927373774878151e-05, + "loss": 0.7477, + "step": 72800 + }, + { + "epoch": 0.6436641383334217, + "grad_norm": 5.131834030151367, + "learning_rate": 3.927226436110964e-05, + "loss": 0.6657, + "step": 72810 + }, + { + "epoch": 0.643752541593734, + "grad_norm": 13.162694931030273, + "learning_rate": 3.927079097343777e-05, + "loss": 0.5233, + "step": 72820 + }, + { + "epoch": 0.6438409448540462, + "grad_norm": 3.790311098098755, + "learning_rate": 3.92693175857659e-05, + "loss": 0.6352, + "step": 72830 + }, + { + "epoch": 0.6439293481143584, + "grad_norm": 5.361180782318115, + "learning_rate": 3.926784419809403e-05, + "loss": 0.7202, + "step": 72840 + }, + { + "epoch": 0.6440177513746707, + "grad_norm": 2.460918426513672, + "learning_rate": 3.926637081042216e-05, + "loss": 0.612, + "step": 72850 + }, + { + "epoch": 0.6441061546349829, + "grad_norm": 7.951444149017334, + "learning_rate": 3.9264897422750284e-05, + "loss": 0.6913, + "step": 72860 + }, + { + "epoch": 0.6441945578952952, + "grad_norm": 1.660313367843628, + "learning_rate": 3.926342403507842e-05, + "loss": 0.8218, + "step": 72870 + }, + { + "epoch": 0.6442829611556075, + "grad_norm": 1.7001129388809204, + "learning_rate": 3.926195064740654e-05, + "loss": 0.7178, + "step": 72880 + }, + { + "epoch": 0.6443713644159197, + "grad_norm": 3.2389354705810547, + "learning_rate": 3.9260477259734676e-05, + "loss": 0.7323, + "step": 72890 + }, + { + "epoch": 0.6444597676762319, + "grad_norm": 2.750436544418335, + "learning_rate": 3.9259003872062804e-05, + "loss": 0.7078, + "step": 72900 + }, + { + "epoch": 0.6445481709365442, + "grad_norm": 3.6820144653320312, + "learning_rate": 3.925753048439093e-05, + "loss": 0.6893, + "step": 72910 + }, + { + "epoch": 0.6446365741968564, + "grad_norm": 4.892619609832764, + "learning_rate": 3.925605709671906e-05, + "loss": 0.606, + "step": 72920 + }, + { + "epoch": 0.6447249774571686, + "grad_norm": 1.9708278179168701, + "learning_rate": 3.9254583709047196e-05, + "loss": 0.5998, + "step": 72930 + }, + { + "epoch": 0.6448133807174808, + "grad_norm": 1.9074394702911377, + "learning_rate": 3.925311032137532e-05, + "loss": 0.5747, + "step": 72940 + }, + { + "epoch": 0.6449017839777931, + "grad_norm": 1.587152361869812, + "learning_rate": 3.925163693370345e-05, + "loss": 0.6174, + "step": 72950 + }, + { + "epoch": 0.6449901872381053, + "grad_norm": 3.8429887294769287, + "learning_rate": 3.925016354603158e-05, + "loss": 0.7796, + "step": 72960 + }, + { + "epoch": 0.6450785904984175, + "grad_norm": 1.057215690612793, + "learning_rate": 3.924869015835971e-05, + "loss": 0.6168, + "step": 72970 + }, + { + "epoch": 0.6451669937587298, + "grad_norm": 1.4331223964691162, + "learning_rate": 3.924721677068784e-05, + "loss": 0.5519, + "step": 72980 + }, + { + "epoch": 0.6452553970190421, + "grad_norm": 5.272140026092529, + "learning_rate": 3.9245743383015966e-05, + "loss": 0.8789, + "step": 72990 + }, + { + "epoch": 0.6453438002793543, + "grad_norm": 25.634410858154297, + "learning_rate": 3.9244269995344095e-05, + "loss": 0.8032, + "step": 73000 + }, + { + "epoch": 0.6454322035396666, + "grad_norm": 1.9800435304641724, + "learning_rate": 3.924279660767223e-05, + "loss": 0.697, + "step": 73010 + }, + { + "epoch": 0.6455206067999788, + "grad_norm": 7.465404987335205, + "learning_rate": 3.924132322000035e-05, + "loss": 0.53, + "step": 73020 + }, + { + "epoch": 0.645609010060291, + "grad_norm": 6.441591262817383, + "learning_rate": 3.9239849832328486e-05, + "loss": 0.5684, + "step": 73030 + }, + { + "epoch": 0.6456974133206033, + "grad_norm": 1.0479837656021118, + "learning_rate": 3.9238376444656615e-05, + "loss": 0.7137, + "step": 73040 + }, + { + "epoch": 0.6457858165809155, + "grad_norm": 1.1749058961868286, + "learning_rate": 3.923690305698474e-05, + "loss": 0.6255, + "step": 73050 + }, + { + "epoch": 0.6458742198412277, + "grad_norm": 6.112790584564209, + "learning_rate": 3.923542966931287e-05, + "loss": 0.6626, + "step": 73060 + }, + { + "epoch": 0.64596262310154, + "grad_norm": 3.223339319229126, + "learning_rate": 3.923395628164101e-05, + "loss": 0.722, + "step": 73070 + }, + { + "epoch": 0.6460510263618522, + "grad_norm": 3.2157599925994873, + "learning_rate": 3.923248289396913e-05, + "loss": 0.7325, + "step": 73080 + }, + { + "epoch": 0.6461394296221644, + "grad_norm": 3.0029873847961426, + "learning_rate": 3.923100950629726e-05, + "loss": 0.5914, + "step": 73090 + }, + { + "epoch": 0.6462278328824768, + "grad_norm": 10.399054527282715, + "learning_rate": 3.9229536118625385e-05, + "loss": 0.748, + "step": 73100 + }, + { + "epoch": 0.646316236142789, + "grad_norm": 2.6047916412353516, + "learning_rate": 3.922806273095352e-05, + "loss": 0.6442, + "step": 73110 + }, + { + "epoch": 0.6464046394031012, + "grad_norm": 1.9666624069213867, + "learning_rate": 3.922658934328165e-05, + "loss": 0.6171, + "step": 73120 + }, + { + "epoch": 0.6464930426634135, + "grad_norm": 2.737152576446533, + "learning_rate": 3.922511595560978e-05, + "loss": 0.6358, + "step": 73130 + }, + { + "epoch": 0.6465814459237257, + "grad_norm": 6.9829936027526855, + "learning_rate": 3.9223642567937905e-05, + "loss": 0.6111, + "step": 73140 + }, + { + "epoch": 0.6466698491840379, + "grad_norm": 1.3174173831939697, + "learning_rate": 3.922216918026604e-05, + "loss": 0.5371, + "step": 73150 + }, + { + "epoch": 0.6467582524443501, + "grad_norm": 3.9846386909484863, + "learning_rate": 3.922069579259416e-05, + "loss": 0.7204, + "step": 73160 + }, + { + "epoch": 0.6468466557046624, + "grad_norm": 7.698777198791504, + "learning_rate": 3.92192224049223e-05, + "loss": 0.6365, + "step": 73170 + }, + { + "epoch": 0.6469350589649746, + "grad_norm": 6.6580328941345215, + "learning_rate": 3.9217749017250425e-05, + "loss": 0.6359, + "step": 73180 + }, + { + "epoch": 0.6470234622252868, + "grad_norm": 5.755401611328125, + "learning_rate": 3.9216275629578554e-05, + "loss": 0.7798, + "step": 73190 + }, + { + "epoch": 0.6471118654855991, + "grad_norm": 1.0191410779953003, + "learning_rate": 3.921480224190668e-05, + "loss": 0.5513, + "step": 73200 + }, + { + "epoch": 0.6472002687459113, + "grad_norm": 7.7609429359436035, + "learning_rate": 3.921332885423482e-05, + "loss": 0.7038, + "step": 73210 + }, + { + "epoch": 0.6472886720062236, + "grad_norm": 1.4561318159103394, + "learning_rate": 3.921185546656294e-05, + "loss": 0.6017, + "step": 73220 + }, + { + "epoch": 0.6473770752665359, + "grad_norm": 2.4102132320404053, + "learning_rate": 3.9210382078891074e-05, + "loss": 0.6685, + "step": 73230 + }, + { + "epoch": 0.6474654785268481, + "grad_norm": 2.9067699909210205, + "learning_rate": 3.9208908691219195e-05, + "loss": 0.7334, + "step": 73240 + }, + { + "epoch": 0.6475538817871603, + "grad_norm": 3.205502510070801, + "learning_rate": 3.920743530354733e-05, + "loss": 0.6654, + "step": 73250 + }, + { + "epoch": 0.6476422850474726, + "grad_norm": 4.8111982345581055, + "learning_rate": 3.920596191587546e-05, + "loss": 0.643, + "step": 73260 + }, + { + "epoch": 0.6477306883077848, + "grad_norm": 5.713198661804199, + "learning_rate": 3.920448852820359e-05, + "loss": 0.7921, + "step": 73270 + }, + { + "epoch": 0.647819091568097, + "grad_norm": 4.404626369476318, + "learning_rate": 3.9203015140531716e-05, + "loss": 0.6053, + "step": 73280 + }, + { + "epoch": 0.6479074948284093, + "grad_norm": 3.414381504058838, + "learning_rate": 3.920154175285985e-05, + "loss": 0.5812, + "step": 73290 + }, + { + "epoch": 0.6479958980887215, + "grad_norm": 1.3864072561264038, + "learning_rate": 3.920006836518797e-05, + "loss": 0.6563, + "step": 73300 + }, + { + "epoch": 0.6480843013490337, + "grad_norm": 7.526893615722656, + "learning_rate": 3.919859497751611e-05, + "loss": 0.6581, + "step": 73310 + }, + { + "epoch": 0.648172704609346, + "grad_norm": 2.8173017501831055, + "learning_rate": 3.9197121589844236e-05, + "loss": 0.6835, + "step": 73320 + }, + { + "epoch": 0.6482611078696582, + "grad_norm": 7.339843273162842, + "learning_rate": 3.9195648202172364e-05, + "loss": 0.7126, + "step": 73330 + }, + { + "epoch": 0.6483495111299705, + "grad_norm": 2.4590420722961426, + "learning_rate": 3.919417481450049e-05, + "loss": 0.6361, + "step": 73340 + }, + { + "epoch": 0.6484379143902828, + "grad_norm": 9.176513671875, + "learning_rate": 3.919270142682862e-05, + "loss": 0.7751, + "step": 73350 + }, + { + "epoch": 0.648526317650595, + "grad_norm": 2.7113230228424072, + "learning_rate": 3.919122803915675e-05, + "loss": 0.7401, + "step": 73360 + }, + { + "epoch": 0.6486147209109072, + "grad_norm": 1.8295503854751587, + "learning_rate": 3.9189754651484884e-05, + "loss": 0.6543, + "step": 73370 + }, + { + "epoch": 0.6487031241712194, + "grad_norm": 2.899273633956909, + "learning_rate": 3.9188281263813006e-05, + "loss": 0.7755, + "step": 73380 + }, + { + "epoch": 0.6487915274315317, + "grad_norm": 2.4900076389312744, + "learning_rate": 3.918680787614114e-05, + "loss": 0.636, + "step": 73390 + }, + { + "epoch": 0.6488799306918439, + "grad_norm": 17.328548431396484, + "learning_rate": 3.918533448846927e-05, + "loss": 0.6923, + "step": 73400 + }, + { + "epoch": 0.6489683339521561, + "grad_norm": 7.702594757080078, + "learning_rate": 3.91838611007974e-05, + "loss": 0.7161, + "step": 73410 + }, + { + "epoch": 0.6490567372124684, + "grad_norm": 1.5657483339309692, + "learning_rate": 3.9182387713125526e-05, + "loss": 0.7198, + "step": 73420 + }, + { + "epoch": 0.6491451404727806, + "grad_norm": 4.969159126281738, + "learning_rate": 3.918091432545366e-05, + "loss": 0.5189, + "step": 73430 + }, + { + "epoch": 0.6492335437330928, + "grad_norm": 7.732271671295166, + "learning_rate": 3.917944093778178e-05, + "loss": 0.6831, + "step": 73440 + }, + { + "epoch": 0.6493219469934051, + "grad_norm": 0.968481719493866, + "learning_rate": 3.917796755010992e-05, + "loss": 0.5961, + "step": 73450 + }, + { + "epoch": 0.6494103502537174, + "grad_norm": 2.046670913696289, + "learning_rate": 3.9176494162438046e-05, + "loss": 0.6725, + "step": 73460 + }, + { + "epoch": 0.6494987535140296, + "grad_norm": 1.2786284685134888, + "learning_rate": 3.9175020774766175e-05, + "loss": 0.7509, + "step": 73470 + }, + { + "epoch": 0.6495871567743419, + "grad_norm": 2.8690474033355713, + "learning_rate": 3.91735473870943e-05, + "loss": 0.6889, + "step": 73480 + }, + { + "epoch": 0.6496755600346541, + "grad_norm": 1.6470656394958496, + "learning_rate": 3.917207399942243e-05, + "loss": 0.6097, + "step": 73490 + }, + { + "epoch": 0.6497639632949663, + "grad_norm": 4.127285480499268, + "learning_rate": 3.917060061175056e-05, + "loss": 0.6348, + "step": 73500 + }, + { + "epoch": 0.6498523665552786, + "grad_norm": 3.5229005813598633, + "learning_rate": 3.9169127224078695e-05, + "loss": 0.6321, + "step": 73510 + }, + { + "epoch": 0.6499407698155908, + "grad_norm": 2.7361483573913574, + "learning_rate": 3.916765383640682e-05, + "loss": 0.6596, + "step": 73520 + }, + { + "epoch": 0.650029173075903, + "grad_norm": 3.1426522731781006, + "learning_rate": 3.916618044873495e-05, + "loss": 0.6338, + "step": 73530 + }, + { + "epoch": 0.6501175763362153, + "grad_norm": 2.4380404949188232, + "learning_rate": 3.916470706106308e-05, + "loss": 0.8314, + "step": 73540 + }, + { + "epoch": 0.6502059795965275, + "grad_norm": 3.5036780834198, + "learning_rate": 3.916323367339121e-05, + "loss": 0.5892, + "step": 73550 + }, + { + "epoch": 0.6502943828568397, + "grad_norm": 1.5960756540298462, + "learning_rate": 3.916176028571934e-05, + "loss": 0.4945, + "step": 73560 + }, + { + "epoch": 0.650382786117152, + "grad_norm": 7.403130054473877, + "learning_rate": 3.916028689804747e-05, + "loss": 0.7256, + "step": 73570 + }, + { + "epoch": 0.6504711893774643, + "grad_norm": 7.227440357208252, + "learning_rate": 3.91588135103756e-05, + "loss": 0.5593, + "step": 73580 + }, + { + "epoch": 0.6505595926377765, + "grad_norm": 3.7225825786590576, + "learning_rate": 3.915734012270373e-05, + "loss": 0.6105, + "step": 73590 + }, + { + "epoch": 0.6506479958980887, + "grad_norm": 2.2574832439422607, + "learning_rate": 3.915586673503186e-05, + "loss": 0.6534, + "step": 73600 + }, + { + "epoch": 0.650736399158401, + "grad_norm": 4.280375003814697, + "learning_rate": 3.9154393347359985e-05, + "loss": 0.5881, + "step": 73610 + }, + { + "epoch": 0.6508248024187132, + "grad_norm": 3.515443801879883, + "learning_rate": 3.9152919959688114e-05, + "loss": 0.7258, + "step": 73620 + }, + { + "epoch": 0.6509132056790254, + "grad_norm": 7.235195636749268, + "learning_rate": 3.915144657201624e-05, + "loss": 0.725, + "step": 73630 + }, + { + "epoch": 0.6510016089393377, + "grad_norm": 2.778562068939209, + "learning_rate": 3.914997318434438e-05, + "loss": 0.7343, + "step": 73640 + }, + { + "epoch": 0.6510900121996499, + "grad_norm": 4.067414283752441, + "learning_rate": 3.9148499796672505e-05, + "loss": 0.6327, + "step": 73650 + }, + { + "epoch": 0.6511784154599621, + "grad_norm": 1.4434269666671753, + "learning_rate": 3.9147026409000634e-05, + "loss": 0.7595, + "step": 73660 + }, + { + "epoch": 0.6512668187202744, + "grad_norm": 2.4540281295776367, + "learning_rate": 3.914555302132876e-05, + "loss": 0.5795, + "step": 73670 + }, + { + "epoch": 0.6513552219805866, + "grad_norm": 4.55037784576416, + "learning_rate": 3.914407963365689e-05, + "loss": 0.5787, + "step": 73680 + }, + { + "epoch": 0.6514436252408989, + "grad_norm": 7.68404483795166, + "learning_rate": 3.914260624598502e-05, + "loss": 0.6487, + "step": 73690 + }, + { + "epoch": 0.6515320285012112, + "grad_norm": 6.309410572052002, + "learning_rate": 3.9141132858313154e-05, + "loss": 0.676, + "step": 73700 + }, + { + "epoch": 0.6516204317615234, + "grad_norm": 4.53164529800415, + "learning_rate": 3.9139659470641276e-05, + "loss": 0.6698, + "step": 73710 + }, + { + "epoch": 0.6517088350218356, + "grad_norm": 4.580211639404297, + "learning_rate": 3.913818608296941e-05, + "loss": 0.7516, + "step": 73720 + }, + { + "epoch": 0.6517972382821479, + "grad_norm": 3.3935134410858154, + "learning_rate": 3.913671269529754e-05, + "loss": 0.6457, + "step": 73730 + }, + { + "epoch": 0.6518856415424601, + "grad_norm": 1.6694892644882202, + "learning_rate": 3.913523930762567e-05, + "loss": 0.5469, + "step": 73740 + }, + { + "epoch": 0.6519740448027723, + "grad_norm": 1.1333941221237183, + "learning_rate": 3.9133765919953796e-05, + "loss": 0.5975, + "step": 73750 + }, + { + "epoch": 0.6520624480630846, + "grad_norm": 0.8881174325942993, + "learning_rate": 3.913229253228193e-05, + "loss": 0.5939, + "step": 73760 + }, + { + "epoch": 0.6521508513233968, + "grad_norm": 3.590424060821533, + "learning_rate": 3.913081914461005e-05, + "loss": 0.6154, + "step": 73770 + }, + { + "epoch": 0.652239254583709, + "grad_norm": 3.809953451156616, + "learning_rate": 3.912934575693819e-05, + "loss": 0.7261, + "step": 73780 + }, + { + "epoch": 0.6523276578440212, + "grad_norm": 8.598770141601562, + "learning_rate": 3.9127872369266316e-05, + "loss": 0.6614, + "step": 73790 + }, + { + "epoch": 0.6524160611043335, + "grad_norm": 15.509154319763184, + "learning_rate": 3.9126398981594444e-05, + "loss": 0.6871, + "step": 73800 + }, + { + "epoch": 0.6525044643646458, + "grad_norm": 9.159502029418945, + "learning_rate": 3.912492559392257e-05, + "loss": 0.8046, + "step": 73810 + }, + { + "epoch": 0.652592867624958, + "grad_norm": 5.595328330993652, + "learning_rate": 3.91234522062507e-05, + "loss": 0.6996, + "step": 73820 + }, + { + "epoch": 0.6526812708852703, + "grad_norm": 2.1299898624420166, + "learning_rate": 3.912197881857883e-05, + "loss": 0.7598, + "step": 73830 + }, + { + "epoch": 0.6527696741455825, + "grad_norm": 2.160871744155884, + "learning_rate": 3.9120505430906964e-05, + "loss": 0.7018, + "step": 73840 + }, + { + "epoch": 0.6528580774058947, + "grad_norm": 4.677453994750977, + "learning_rate": 3.9119032043235086e-05, + "loss": 0.8033, + "step": 73850 + }, + { + "epoch": 0.652946480666207, + "grad_norm": 3.1859958171844482, + "learning_rate": 3.911755865556322e-05, + "loss": 0.7191, + "step": 73860 + }, + { + "epoch": 0.6530348839265192, + "grad_norm": 7.368778228759766, + "learning_rate": 3.911608526789135e-05, + "loss": 0.6873, + "step": 73870 + }, + { + "epoch": 0.6531232871868314, + "grad_norm": 1.461064338684082, + "learning_rate": 3.911461188021948e-05, + "loss": 0.4827, + "step": 73880 + }, + { + "epoch": 0.6532116904471437, + "grad_norm": 1.586313247680664, + "learning_rate": 3.9113138492547606e-05, + "loss": 0.6818, + "step": 73890 + }, + { + "epoch": 0.6533000937074559, + "grad_norm": 5.235528469085693, + "learning_rate": 3.911166510487574e-05, + "loss": 0.6646, + "step": 73900 + }, + { + "epoch": 0.6533884969677681, + "grad_norm": 4.75295352935791, + "learning_rate": 3.911019171720386e-05, + "loss": 0.7481, + "step": 73910 + }, + { + "epoch": 0.6534769002280804, + "grad_norm": 8.453245162963867, + "learning_rate": 3.9108718329532e-05, + "loss": 0.5429, + "step": 73920 + }, + { + "epoch": 0.6535653034883927, + "grad_norm": 6.221120834350586, + "learning_rate": 3.910724494186012e-05, + "loss": 0.6538, + "step": 73930 + }, + { + "epoch": 0.6536537067487049, + "grad_norm": 3.9913480281829834, + "learning_rate": 3.9105771554188255e-05, + "loss": 0.6814, + "step": 73940 + }, + { + "epoch": 0.6537421100090172, + "grad_norm": 2.242767095565796, + "learning_rate": 3.910429816651638e-05, + "loss": 0.6905, + "step": 73950 + }, + { + "epoch": 0.6538305132693294, + "grad_norm": 4.462658882141113, + "learning_rate": 3.910282477884451e-05, + "loss": 0.6583, + "step": 73960 + }, + { + "epoch": 0.6539189165296416, + "grad_norm": 4.621025085449219, + "learning_rate": 3.910135139117264e-05, + "loss": 0.5677, + "step": 73970 + }, + { + "epoch": 0.6540073197899539, + "grad_norm": 1.43351411819458, + "learning_rate": 3.9099878003500775e-05, + "loss": 0.714, + "step": 73980 + }, + { + "epoch": 0.6540957230502661, + "grad_norm": 6.703693866729736, + "learning_rate": 3.9098404615828897e-05, + "loss": 0.6055, + "step": 73990 + }, + { + "epoch": 0.6541841263105783, + "grad_norm": 2.37622332572937, + "learning_rate": 3.909693122815703e-05, + "loss": 0.5538, + "step": 74000 + }, + { + "epoch": 0.6542725295708905, + "grad_norm": 5.703482627868652, + "learning_rate": 3.909545784048516e-05, + "loss": 0.6441, + "step": 74010 + }, + { + "epoch": 0.6543609328312028, + "grad_norm": 2.2979772090911865, + "learning_rate": 3.909398445281329e-05, + "loss": 0.6144, + "step": 74020 + }, + { + "epoch": 0.654449336091515, + "grad_norm": 1.548211693763733, + "learning_rate": 3.909251106514142e-05, + "loss": 0.7007, + "step": 74030 + }, + { + "epoch": 0.6545377393518272, + "grad_norm": 1.5616092681884766, + "learning_rate": 3.909103767746955e-05, + "loss": 0.7027, + "step": 74040 + }, + { + "epoch": 0.6546261426121396, + "grad_norm": 5.873095989227295, + "learning_rate": 3.9089564289797673e-05, + "loss": 0.5587, + "step": 74050 + }, + { + "epoch": 0.6547145458724518, + "grad_norm": 6.141514301300049, + "learning_rate": 3.908809090212581e-05, + "loss": 0.6174, + "step": 74060 + }, + { + "epoch": 0.654802949132764, + "grad_norm": 3.8194026947021484, + "learning_rate": 3.908661751445393e-05, + "loss": 0.6711, + "step": 74070 + }, + { + "epoch": 0.6548913523930763, + "grad_norm": 4.030489444732666, + "learning_rate": 3.9085144126782065e-05, + "loss": 0.5805, + "step": 74080 + }, + { + "epoch": 0.6549797556533885, + "grad_norm": 5.708727836608887, + "learning_rate": 3.9083670739110194e-05, + "loss": 0.7565, + "step": 74090 + }, + { + "epoch": 0.6550681589137007, + "grad_norm": 3.284543037414551, + "learning_rate": 3.908219735143832e-05, + "loss": 0.637, + "step": 74100 + }, + { + "epoch": 0.655156562174013, + "grad_norm": 13.578351974487305, + "learning_rate": 3.908072396376645e-05, + "loss": 0.7715, + "step": 74110 + }, + { + "epoch": 0.6552449654343252, + "grad_norm": 3.3261005878448486, + "learning_rate": 3.9079250576094585e-05, + "loss": 0.7305, + "step": 74120 + }, + { + "epoch": 0.6553333686946374, + "grad_norm": 5.2780022621154785, + "learning_rate": 3.907777718842271e-05, + "loss": 0.7668, + "step": 74130 + }, + { + "epoch": 0.6554217719549497, + "grad_norm": 1.6371046304702759, + "learning_rate": 3.907630380075084e-05, + "loss": 0.6188, + "step": 74140 + }, + { + "epoch": 0.6555101752152619, + "grad_norm": 7.224486351013184, + "learning_rate": 3.907483041307897e-05, + "loss": 0.5678, + "step": 74150 + }, + { + "epoch": 0.6555985784755742, + "grad_norm": 2.072559356689453, + "learning_rate": 3.90733570254071e-05, + "loss": 0.6709, + "step": 74160 + }, + { + "epoch": 0.6556869817358865, + "grad_norm": 7.273366928100586, + "learning_rate": 3.907188363773523e-05, + "loss": 0.5146, + "step": 74170 + }, + { + "epoch": 0.6557753849961987, + "grad_norm": 3.7417361736297607, + "learning_rate": 3.9070410250063356e-05, + "loss": 0.5478, + "step": 74180 + }, + { + "epoch": 0.6558637882565109, + "grad_norm": 3.2216029167175293, + "learning_rate": 3.9068936862391484e-05, + "loss": 0.5563, + "step": 74190 + }, + { + "epoch": 0.6559521915168232, + "grad_norm": 6.048964977264404, + "learning_rate": 3.906746347471962e-05, + "loss": 0.6505, + "step": 74200 + }, + { + "epoch": 0.6560405947771354, + "grad_norm": 1.0378650426864624, + "learning_rate": 3.906599008704774e-05, + "loss": 0.6915, + "step": 74210 + }, + { + "epoch": 0.6561289980374476, + "grad_norm": 2.546952247619629, + "learning_rate": 3.9064516699375876e-05, + "loss": 0.6816, + "step": 74220 + }, + { + "epoch": 0.6562174012977599, + "grad_norm": 1.9041805267333984, + "learning_rate": 3.9063043311704004e-05, + "loss": 0.708, + "step": 74230 + }, + { + "epoch": 0.6563058045580721, + "grad_norm": 3.479835033416748, + "learning_rate": 3.906156992403213e-05, + "loss": 0.6829, + "step": 74240 + }, + { + "epoch": 0.6563942078183843, + "grad_norm": 3.2493185997009277, + "learning_rate": 3.906009653636026e-05, + "loss": 0.7421, + "step": 74250 + }, + { + "epoch": 0.6564826110786965, + "grad_norm": 5.673702716827393, + "learning_rate": 3.9058623148688396e-05, + "loss": 0.5329, + "step": 74260 + }, + { + "epoch": 0.6565710143390088, + "grad_norm": 2.299133539199829, + "learning_rate": 3.905714976101652e-05, + "loss": 0.648, + "step": 74270 + }, + { + "epoch": 0.6566594175993211, + "grad_norm": 4.86217737197876, + "learning_rate": 3.905567637334465e-05, + "loss": 0.8056, + "step": 74280 + }, + { + "epoch": 0.6567478208596333, + "grad_norm": 1.1120375394821167, + "learning_rate": 3.9054202985672774e-05, + "loss": 0.7625, + "step": 74290 + }, + { + "epoch": 0.6568362241199456, + "grad_norm": 5.428472518920898, + "learning_rate": 3.905272959800091e-05, + "loss": 0.5938, + "step": 74300 + }, + { + "epoch": 0.6569246273802578, + "grad_norm": 3.106133222579956, + "learning_rate": 3.905125621032904e-05, + "loss": 0.5564, + "step": 74310 + }, + { + "epoch": 0.65701303064057, + "grad_norm": 3.6658782958984375, + "learning_rate": 3.9049782822657166e-05, + "loss": 0.781, + "step": 74320 + }, + { + "epoch": 0.6571014339008823, + "grad_norm": 4.764915943145752, + "learning_rate": 3.9048309434985294e-05, + "loss": 0.5713, + "step": 74330 + }, + { + "epoch": 0.6571898371611945, + "grad_norm": 1.1686493158340454, + "learning_rate": 3.904683604731343e-05, + "loss": 0.6478, + "step": 74340 + }, + { + "epoch": 0.6572782404215067, + "grad_norm": 8.3425931930542, + "learning_rate": 3.904536265964155e-05, + "loss": 0.8412, + "step": 74350 + }, + { + "epoch": 0.657366643681819, + "grad_norm": 6.874490261077881, + "learning_rate": 3.9043889271969686e-05, + "loss": 0.6412, + "step": 74360 + }, + { + "epoch": 0.6574550469421312, + "grad_norm": 5.611931324005127, + "learning_rate": 3.9042415884297815e-05, + "loss": 0.7604, + "step": 74370 + }, + { + "epoch": 0.6575434502024434, + "grad_norm": 2.853137969970703, + "learning_rate": 3.904094249662594e-05, + "loss": 0.5872, + "step": 74380 + }, + { + "epoch": 0.6576318534627557, + "grad_norm": 3.651102304458618, + "learning_rate": 3.903946910895407e-05, + "loss": 0.6484, + "step": 74390 + }, + { + "epoch": 0.657720256723068, + "grad_norm": 2.713616132736206, + "learning_rate": 3.90379957212822e-05, + "loss": 0.5957, + "step": 74400 + }, + { + "epoch": 0.6578086599833802, + "grad_norm": 2.728940963745117, + "learning_rate": 3.903652233361033e-05, + "loss": 0.7185, + "step": 74410 + }, + { + "epoch": 0.6578970632436925, + "grad_norm": 1.8137192726135254, + "learning_rate": 3.903504894593846e-05, + "loss": 0.6176, + "step": 74420 + }, + { + "epoch": 0.6579854665040047, + "grad_norm": 5.03383731842041, + "learning_rate": 3.903357555826659e-05, + "loss": 0.6772, + "step": 74430 + }, + { + "epoch": 0.6580738697643169, + "grad_norm": 4.3429999351501465, + "learning_rate": 3.903210217059472e-05, + "loss": 0.6096, + "step": 74440 + }, + { + "epoch": 0.6581622730246292, + "grad_norm": 2.540360927581787, + "learning_rate": 3.903062878292285e-05, + "loss": 0.7183, + "step": 74450 + }, + { + "epoch": 0.6582506762849414, + "grad_norm": 16.79432487487793, + "learning_rate": 3.9029155395250977e-05, + "loss": 0.6725, + "step": 74460 + }, + { + "epoch": 0.6583390795452536, + "grad_norm": 1.5936205387115479, + "learning_rate": 3.9027682007579105e-05, + "loss": 0.7244, + "step": 74470 + }, + { + "epoch": 0.6584274828055658, + "grad_norm": 3.5982398986816406, + "learning_rate": 3.902620861990724e-05, + "loss": 0.6549, + "step": 74480 + }, + { + "epoch": 0.6585158860658781, + "grad_norm": 2.0425920486450195, + "learning_rate": 3.902473523223537e-05, + "loss": 0.6661, + "step": 74490 + }, + { + "epoch": 0.6586042893261903, + "grad_norm": 3.9640567302703857, + "learning_rate": 3.90232618445635e-05, + "loss": 0.6676, + "step": 74500 + }, + { + "epoch": 0.6586926925865025, + "grad_norm": 3.5814859867095947, + "learning_rate": 3.9021788456891625e-05, + "loss": 0.5836, + "step": 74510 + }, + { + "epoch": 0.6587810958468149, + "grad_norm": 5.417182922363281, + "learning_rate": 3.9020315069219753e-05, + "loss": 0.5923, + "step": 74520 + }, + { + "epoch": 0.6588694991071271, + "grad_norm": 5.457121849060059, + "learning_rate": 3.901884168154788e-05, + "loss": 0.6721, + "step": 74530 + }, + { + "epoch": 0.6589579023674393, + "grad_norm": 3.9403765201568604, + "learning_rate": 3.901736829387601e-05, + "loss": 0.7197, + "step": 74540 + }, + { + "epoch": 0.6590463056277516, + "grad_norm": 3.137930154800415, + "learning_rate": 3.9015894906204145e-05, + "loss": 0.7591, + "step": 74550 + }, + { + "epoch": 0.6591347088880638, + "grad_norm": 4.644708156585693, + "learning_rate": 3.9014421518532274e-05, + "loss": 0.6787, + "step": 74560 + }, + { + "epoch": 0.659223112148376, + "grad_norm": 1.615928292274475, + "learning_rate": 3.90129481308604e-05, + "loss": 0.6022, + "step": 74570 + }, + { + "epoch": 0.6593115154086883, + "grad_norm": 3.498143434524536, + "learning_rate": 3.901147474318853e-05, + "loss": 0.7551, + "step": 74580 + }, + { + "epoch": 0.6593999186690005, + "grad_norm": 1.796372652053833, + "learning_rate": 3.901000135551666e-05, + "loss": 0.7387, + "step": 74590 + }, + { + "epoch": 0.6594883219293127, + "grad_norm": 6.777951240539551, + "learning_rate": 3.900852796784479e-05, + "loss": 0.7005, + "step": 74600 + }, + { + "epoch": 0.659576725189625, + "grad_norm": 8.958002090454102, + "learning_rate": 3.900705458017292e-05, + "loss": 0.7769, + "step": 74610 + }, + { + "epoch": 0.6596651284499372, + "grad_norm": 1.4068703651428223, + "learning_rate": 3.900558119250105e-05, + "loss": 0.7339, + "step": 74620 + }, + { + "epoch": 0.6597535317102494, + "grad_norm": 7.773104190826416, + "learning_rate": 3.900410780482918e-05, + "loss": 0.6199, + "step": 74630 + }, + { + "epoch": 0.6598419349705618, + "grad_norm": 2.2941694259643555, + "learning_rate": 3.900263441715731e-05, + "loss": 0.7006, + "step": 74640 + }, + { + "epoch": 0.659930338230874, + "grad_norm": 1.6014699935913086, + "learning_rate": 3.9001161029485436e-05, + "loss": 0.6841, + "step": 74650 + }, + { + "epoch": 0.6600187414911862, + "grad_norm": 2.185286521911621, + "learning_rate": 3.8999687641813564e-05, + "loss": 0.6551, + "step": 74660 + }, + { + "epoch": 0.6601071447514985, + "grad_norm": 7.959512710571289, + "learning_rate": 3.89982142541417e-05, + "loss": 0.6432, + "step": 74670 + }, + { + "epoch": 0.6601955480118107, + "grad_norm": 4.725872993469238, + "learning_rate": 3.899674086646982e-05, + "loss": 0.6286, + "step": 74680 + }, + { + "epoch": 0.6602839512721229, + "grad_norm": 7.8982415199279785, + "learning_rate": 3.8995267478797956e-05, + "loss": 0.6208, + "step": 74690 + }, + { + "epoch": 0.6603723545324351, + "grad_norm": 2.8995907306671143, + "learning_rate": 3.8993794091126084e-05, + "loss": 0.6577, + "step": 74700 + }, + { + "epoch": 0.6604607577927474, + "grad_norm": 3.0823237895965576, + "learning_rate": 3.899232070345421e-05, + "loss": 0.6799, + "step": 74710 + }, + { + "epoch": 0.6605491610530596, + "grad_norm": 7.6601176261901855, + "learning_rate": 3.899084731578234e-05, + "loss": 0.7882, + "step": 74720 + }, + { + "epoch": 0.6606375643133718, + "grad_norm": 4.167365074157715, + "learning_rate": 3.8989373928110476e-05, + "loss": 0.6947, + "step": 74730 + }, + { + "epoch": 0.6607259675736841, + "grad_norm": 3.089801073074341, + "learning_rate": 3.89879005404386e-05, + "loss": 0.6315, + "step": 74740 + }, + { + "epoch": 0.6608143708339964, + "grad_norm": 4.085617542266846, + "learning_rate": 3.898642715276673e-05, + "loss": 0.673, + "step": 74750 + }, + { + "epoch": 0.6609027740943086, + "grad_norm": 3.7830417156219482, + "learning_rate": 3.8984953765094854e-05, + "loss": 0.6371, + "step": 74760 + }, + { + "epoch": 0.6609911773546209, + "grad_norm": 2.098037004470825, + "learning_rate": 3.898348037742299e-05, + "loss": 0.7509, + "step": 74770 + }, + { + "epoch": 0.6610795806149331, + "grad_norm": 4.651118755340576, + "learning_rate": 3.898200698975112e-05, + "loss": 0.7642, + "step": 74780 + }, + { + "epoch": 0.6611679838752453, + "grad_norm": 3.37223482131958, + "learning_rate": 3.8980533602079246e-05, + "loss": 0.6491, + "step": 74790 + }, + { + "epoch": 0.6612563871355576, + "grad_norm": 9.367709159851074, + "learning_rate": 3.8979060214407375e-05, + "loss": 0.7687, + "step": 74800 + }, + { + "epoch": 0.6613447903958698, + "grad_norm": 8.562145233154297, + "learning_rate": 3.897758682673551e-05, + "loss": 0.7634, + "step": 74810 + }, + { + "epoch": 0.661433193656182, + "grad_norm": 3.6404943466186523, + "learning_rate": 3.897611343906363e-05, + "loss": 0.7594, + "step": 74820 + }, + { + "epoch": 0.6615215969164943, + "grad_norm": 1.6759766340255737, + "learning_rate": 3.8974640051391766e-05, + "loss": 0.6646, + "step": 74830 + }, + { + "epoch": 0.6616100001768065, + "grad_norm": 7.769847393035889, + "learning_rate": 3.8973166663719895e-05, + "loss": 0.6609, + "step": 74840 + }, + { + "epoch": 0.6616984034371187, + "grad_norm": 4.607982635498047, + "learning_rate": 3.897169327604802e-05, + "loss": 0.7208, + "step": 74850 + }, + { + "epoch": 0.661786806697431, + "grad_norm": 5.989676475524902, + "learning_rate": 3.897021988837615e-05, + "loss": 0.6603, + "step": 74860 + }, + { + "epoch": 0.6618752099577433, + "grad_norm": 7.331932067871094, + "learning_rate": 3.896874650070428e-05, + "loss": 0.8526, + "step": 74870 + }, + { + "epoch": 0.6619636132180555, + "grad_norm": 3.6298041343688965, + "learning_rate": 3.896727311303241e-05, + "loss": 0.7313, + "step": 74880 + }, + { + "epoch": 0.6620520164783678, + "grad_norm": 5.4664692878723145, + "learning_rate": 3.896579972536054e-05, + "loss": 0.6411, + "step": 74890 + }, + { + "epoch": 0.66214041973868, + "grad_norm": 5.398388385772705, + "learning_rate": 3.8964326337688665e-05, + "loss": 0.6359, + "step": 74900 + }, + { + "epoch": 0.6622288229989922, + "grad_norm": 5.649899482727051, + "learning_rate": 3.89628529500168e-05, + "loss": 0.6118, + "step": 74910 + }, + { + "epoch": 0.6623172262593044, + "grad_norm": 4.014989852905273, + "learning_rate": 3.896137956234493e-05, + "loss": 0.6763, + "step": 74920 + }, + { + "epoch": 0.6624056295196167, + "grad_norm": 2.751934766769409, + "learning_rate": 3.895990617467306e-05, + "loss": 0.6632, + "step": 74930 + }, + { + "epoch": 0.6624940327799289, + "grad_norm": 3.3606157302856445, + "learning_rate": 3.8958432787001185e-05, + "loss": 0.7213, + "step": 74940 + }, + { + "epoch": 0.6625824360402411, + "grad_norm": 4.998013973236084, + "learning_rate": 3.895695939932932e-05, + "loss": 0.863, + "step": 74950 + }, + { + "epoch": 0.6626708393005534, + "grad_norm": 3.9802145957946777, + "learning_rate": 3.895548601165744e-05, + "loss": 0.6845, + "step": 74960 + }, + { + "epoch": 0.6627592425608656, + "grad_norm": 5.25894832611084, + "learning_rate": 3.895401262398558e-05, + "loss": 0.7301, + "step": 74970 + }, + { + "epoch": 0.6628476458211778, + "grad_norm": 1.4403324127197266, + "learning_rate": 3.8952539236313705e-05, + "loss": 0.7469, + "step": 74980 + }, + { + "epoch": 0.6629360490814902, + "grad_norm": 2.2025909423828125, + "learning_rate": 3.8951065848641834e-05, + "loss": 0.595, + "step": 74990 + }, + { + "epoch": 0.6630244523418024, + "grad_norm": 1.9643757343292236, + "learning_rate": 3.894959246096996e-05, + "loss": 0.639, + "step": 75000 + }, + { + "epoch": 0.6631128556021146, + "grad_norm": 6.522768020629883, + "learning_rate": 3.894811907329809e-05, + "loss": 0.6624, + "step": 75010 + }, + { + "epoch": 0.6632012588624269, + "grad_norm": 1.6482665538787842, + "learning_rate": 3.894664568562622e-05, + "loss": 0.6221, + "step": 75020 + }, + { + "epoch": 0.6632896621227391, + "grad_norm": 6.372900009155273, + "learning_rate": 3.8945172297954354e-05, + "loss": 0.6881, + "step": 75030 + }, + { + "epoch": 0.6633780653830513, + "grad_norm": 3.927379608154297, + "learning_rate": 3.8943698910282475e-05, + "loss": 0.6885, + "step": 75040 + }, + { + "epoch": 0.6634664686433636, + "grad_norm": 17.94007682800293, + "learning_rate": 3.894222552261061e-05, + "loss": 0.6843, + "step": 75050 + }, + { + "epoch": 0.6635548719036758, + "grad_norm": 3.019455909729004, + "learning_rate": 3.894075213493874e-05, + "loss": 0.8003, + "step": 75060 + }, + { + "epoch": 0.663643275163988, + "grad_norm": 6.074705600738525, + "learning_rate": 3.893927874726687e-05, + "loss": 0.6351, + "step": 75070 + }, + { + "epoch": 0.6637316784243003, + "grad_norm": 2.8690683841705322, + "learning_rate": 3.8937805359594996e-05, + "loss": 0.5528, + "step": 75080 + }, + { + "epoch": 0.6638200816846125, + "grad_norm": 3.4551234245300293, + "learning_rate": 3.893633197192313e-05, + "loss": 0.7532, + "step": 75090 + }, + { + "epoch": 0.6639084849449247, + "grad_norm": 3.0580084323883057, + "learning_rate": 3.893485858425125e-05, + "loss": 0.6613, + "step": 75100 + }, + { + "epoch": 0.6639968882052371, + "grad_norm": 6.894099712371826, + "learning_rate": 3.893338519657939e-05, + "loss": 0.6883, + "step": 75110 + }, + { + "epoch": 0.6640852914655493, + "grad_norm": 4.7544732093811035, + "learning_rate": 3.893191180890751e-05, + "loss": 0.6124, + "step": 75120 + }, + { + "epoch": 0.6641736947258615, + "grad_norm": 1.909325122833252, + "learning_rate": 3.8930438421235644e-05, + "loss": 0.8038, + "step": 75130 + }, + { + "epoch": 0.6642620979861737, + "grad_norm": 5.189207077026367, + "learning_rate": 3.892896503356377e-05, + "loss": 0.5643, + "step": 75140 + }, + { + "epoch": 0.664350501246486, + "grad_norm": 4.618427753448486, + "learning_rate": 3.89274916458919e-05, + "loss": 0.7983, + "step": 75150 + }, + { + "epoch": 0.6644389045067982, + "grad_norm": 1.8668888807296753, + "learning_rate": 3.892601825822003e-05, + "loss": 0.6527, + "step": 75160 + }, + { + "epoch": 0.6645273077671104, + "grad_norm": 2.950101613998413, + "learning_rate": 3.8924544870548164e-05, + "loss": 0.6145, + "step": 75170 + }, + { + "epoch": 0.6646157110274227, + "grad_norm": 9.534425735473633, + "learning_rate": 3.8923071482876286e-05, + "loss": 0.7393, + "step": 75180 + }, + { + "epoch": 0.6647041142877349, + "grad_norm": 1.30713951587677, + "learning_rate": 3.892159809520442e-05, + "loss": 0.627, + "step": 75190 + }, + { + "epoch": 0.6647925175480471, + "grad_norm": 2.4795477390289307, + "learning_rate": 3.892012470753255e-05, + "loss": 0.592, + "step": 75200 + }, + { + "epoch": 0.6648809208083594, + "grad_norm": 3.4508309364318848, + "learning_rate": 3.891865131986068e-05, + "loss": 0.7174, + "step": 75210 + }, + { + "epoch": 0.6649693240686717, + "grad_norm": 9.213872909545898, + "learning_rate": 3.8917177932188806e-05, + "loss": 0.6935, + "step": 75220 + }, + { + "epoch": 0.6650577273289839, + "grad_norm": 4.244976043701172, + "learning_rate": 3.8915704544516934e-05, + "loss": 0.8166, + "step": 75230 + }, + { + "epoch": 0.6651461305892962, + "grad_norm": 6.809774875640869, + "learning_rate": 3.891423115684506e-05, + "loss": 0.7052, + "step": 75240 + }, + { + "epoch": 0.6652345338496084, + "grad_norm": 1.0831025838851929, + "learning_rate": 3.89127577691732e-05, + "loss": 0.6503, + "step": 75250 + }, + { + "epoch": 0.6653229371099206, + "grad_norm": 5.4457902908325195, + "learning_rate": 3.891128438150132e-05, + "loss": 0.8292, + "step": 75260 + }, + { + "epoch": 0.6654113403702329, + "grad_norm": 14.615921974182129, + "learning_rate": 3.8909810993829455e-05, + "loss": 0.5816, + "step": 75270 + }, + { + "epoch": 0.6654997436305451, + "grad_norm": 6.300139904022217, + "learning_rate": 3.890833760615758e-05, + "loss": 0.7312, + "step": 75280 + }, + { + "epoch": 0.6655881468908573, + "grad_norm": 1.539911150932312, + "learning_rate": 3.890686421848571e-05, + "loss": 0.6257, + "step": 75290 + }, + { + "epoch": 0.6656765501511696, + "grad_norm": 2.3931667804718018, + "learning_rate": 3.890539083081384e-05, + "loss": 0.6081, + "step": 75300 + }, + { + "epoch": 0.6657649534114818, + "grad_norm": 1.514052391052246, + "learning_rate": 3.8903917443141975e-05, + "loss": 0.6673, + "step": 75310 + }, + { + "epoch": 0.665853356671794, + "grad_norm": 1.3385413885116577, + "learning_rate": 3.89024440554701e-05, + "loss": 0.5597, + "step": 75320 + }, + { + "epoch": 0.6659417599321062, + "grad_norm": 2.084204912185669, + "learning_rate": 3.890097066779823e-05, + "loss": 0.5735, + "step": 75330 + }, + { + "epoch": 0.6660301631924186, + "grad_norm": 2.842465877532959, + "learning_rate": 3.889949728012636e-05, + "loss": 0.636, + "step": 75340 + }, + { + "epoch": 0.6661185664527308, + "grad_norm": 7.057918071746826, + "learning_rate": 3.889802389245449e-05, + "loss": 0.6449, + "step": 75350 + }, + { + "epoch": 0.666206969713043, + "grad_norm": 0.6593702435493469, + "learning_rate": 3.8896550504782617e-05, + "loss": 0.7469, + "step": 75360 + }, + { + "epoch": 0.6662953729733553, + "grad_norm": 5.400606632232666, + "learning_rate": 3.8895077117110745e-05, + "loss": 0.6287, + "step": 75370 + }, + { + "epoch": 0.6663837762336675, + "grad_norm": 4.266366004943848, + "learning_rate": 3.889360372943888e-05, + "loss": 0.7159, + "step": 75380 + }, + { + "epoch": 0.6664721794939797, + "grad_norm": 3.478417158126831, + "learning_rate": 3.889213034176701e-05, + "loss": 0.8176, + "step": 75390 + }, + { + "epoch": 0.666560582754292, + "grad_norm": 4.912135601043701, + "learning_rate": 3.889065695409514e-05, + "loss": 0.7509, + "step": 75400 + }, + { + "epoch": 0.6666489860146042, + "grad_norm": 6.7844438552856445, + "learning_rate": 3.8889183566423265e-05, + "loss": 0.7294, + "step": 75410 + }, + { + "epoch": 0.6667373892749164, + "grad_norm": 2.0250132083892822, + "learning_rate": 3.8887710178751393e-05, + "loss": 0.6866, + "step": 75420 + }, + { + "epoch": 0.6668257925352287, + "grad_norm": 3.1584935188293457, + "learning_rate": 3.888623679107952e-05, + "loss": 0.5881, + "step": 75430 + }, + { + "epoch": 0.6669141957955409, + "grad_norm": 1.4282203912734985, + "learning_rate": 3.888476340340766e-05, + "loss": 0.5731, + "step": 75440 + }, + { + "epoch": 0.6670025990558531, + "grad_norm": 7.167487621307373, + "learning_rate": 3.8883290015735785e-05, + "loss": 0.7159, + "step": 75450 + }, + { + "epoch": 0.6670910023161655, + "grad_norm": 4.460699558258057, + "learning_rate": 3.8881816628063914e-05, + "loss": 0.625, + "step": 75460 + }, + { + "epoch": 0.6671794055764777, + "grad_norm": 1.3844077587127686, + "learning_rate": 3.888034324039204e-05, + "loss": 0.6857, + "step": 75470 + }, + { + "epoch": 0.6672678088367899, + "grad_norm": 8.716390609741211, + "learning_rate": 3.887886985272017e-05, + "loss": 0.6292, + "step": 75480 + }, + { + "epoch": 0.6673562120971022, + "grad_norm": 4.229059219360352, + "learning_rate": 3.88773964650483e-05, + "loss": 0.6051, + "step": 75490 + }, + { + "epoch": 0.6674446153574144, + "grad_norm": 4.041706085205078, + "learning_rate": 3.8875923077376434e-05, + "loss": 0.731, + "step": 75500 + }, + { + "epoch": 0.6675330186177266, + "grad_norm": 6.528076648712158, + "learning_rate": 3.8874449689704555e-05, + "loss": 0.6065, + "step": 75510 + }, + { + "epoch": 0.6676214218780389, + "grad_norm": 1.0240939855575562, + "learning_rate": 3.887297630203269e-05, + "loss": 0.6556, + "step": 75520 + }, + { + "epoch": 0.6677098251383511, + "grad_norm": 1.8203481435775757, + "learning_rate": 3.887150291436082e-05, + "loss": 0.7946, + "step": 75530 + }, + { + "epoch": 0.6677982283986633, + "grad_norm": 21.99590492248535, + "learning_rate": 3.887002952668895e-05, + "loss": 0.5838, + "step": 75540 + }, + { + "epoch": 0.6678866316589755, + "grad_norm": 2.1583988666534424, + "learning_rate": 3.8868556139017076e-05, + "loss": 0.689, + "step": 75550 + }, + { + "epoch": 0.6679750349192878, + "grad_norm": 2.108670234680176, + "learning_rate": 3.886708275134521e-05, + "loss": 0.5782, + "step": 75560 + }, + { + "epoch": 0.6680634381796, + "grad_norm": 6.648342609405518, + "learning_rate": 3.886560936367333e-05, + "loss": 0.6422, + "step": 75570 + }, + { + "epoch": 0.6681518414399124, + "grad_norm": 4.114011287689209, + "learning_rate": 3.886413597600147e-05, + "loss": 0.7424, + "step": 75580 + }, + { + "epoch": 0.6682402447002246, + "grad_norm": 2.9584524631500244, + "learning_rate": 3.886266258832959e-05, + "loss": 0.6906, + "step": 75590 + }, + { + "epoch": 0.6683286479605368, + "grad_norm": 1.5455824136734009, + "learning_rate": 3.8861189200657724e-05, + "loss": 0.6834, + "step": 75600 + }, + { + "epoch": 0.668417051220849, + "grad_norm": 3.160207986831665, + "learning_rate": 3.885971581298585e-05, + "loss": 0.6911, + "step": 75610 + }, + { + "epoch": 0.6685054544811613, + "grad_norm": 5.8109822273254395, + "learning_rate": 3.885824242531398e-05, + "loss": 0.6734, + "step": 75620 + }, + { + "epoch": 0.6685938577414735, + "grad_norm": 3.8840224742889404, + "learning_rate": 3.885676903764211e-05, + "loss": 0.7482, + "step": 75630 + }, + { + "epoch": 0.6686822610017857, + "grad_norm": 4.629284381866455, + "learning_rate": 3.8855295649970244e-05, + "loss": 0.6803, + "step": 75640 + }, + { + "epoch": 0.668770664262098, + "grad_norm": 6.9976396560668945, + "learning_rate": 3.8853822262298366e-05, + "loss": 0.717, + "step": 75650 + }, + { + "epoch": 0.6688590675224102, + "grad_norm": 3.005126476287842, + "learning_rate": 3.88523488746265e-05, + "loss": 0.6389, + "step": 75660 + }, + { + "epoch": 0.6689474707827224, + "grad_norm": 2.950073003768921, + "learning_rate": 3.885087548695463e-05, + "loss": 0.7223, + "step": 75670 + }, + { + "epoch": 0.6690358740430347, + "grad_norm": 7.066815376281738, + "learning_rate": 3.884940209928276e-05, + "loss": 0.6523, + "step": 75680 + }, + { + "epoch": 0.6691242773033469, + "grad_norm": 1.7473678588867188, + "learning_rate": 3.8847928711610886e-05, + "loss": 0.6794, + "step": 75690 + }, + { + "epoch": 0.6692126805636592, + "grad_norm": 3.887746810913086, + "learning_rate": 3.8846455323939014e-05, + "loss": 0.6326, + "step": 75700 + }, + { + "epoch": 0.6693010838239715, + "grad_norm": 3.1705639362335205, + "learning_rate": 3.884498193626714e-05, + "loss": 0.6652, + "step": 75710 + }, + { + "epoch": 0.6693894870842837, + "grad_norm": 8.686491012573242, + "learning_rate": 3.884350854859528e-05, + "loss": 0.7387, + "step": 75720 + }, + { + "epoch": 0.6694778903445959, + "grad_norm": 0.8560713529586792, + "learning_rate": 3.88420351609234e-05, + "loss": 0.5385, + "step": 75730 + }, + { + "epoch": 0.6695662936049082, + "grad_norm": 0.9194939732551575, + "learning_rate": 3.8840561773251535e-05, + "loss": 0.6358, + "step": 75740 + }, + { + "epoch": 0.6696546968652204, + "grad_norm": 2.2076425552368164, + "learning_rate": 3.883908838557966e-05, + "loss": 0.6917, + "step": 75750 + }, + { + "epoch": 0.6697431001255326, + "grad_norm": 2.779151201248169, + "learning_rate": 3.883761499790779e-05, + "loss": 0.6635, + "step": 75760 + }, + { + "epoch": 0.6698315033858449, + "grad_norm": 3.504073143005371, + "learning_rate": 3.883614161023592e-05, + "loss": 0.5528, + "step": 75770 + }, + { + "epoch": 0.6699199066461571, + "grad_norm": 2.029125452041626, + "learning_rate": 3.8834668222564055e-05, + "loss": 0.639, + "step": 75780 + }, + { + "epoch": 0.6700083099064693, + "grad_norm": 6.529359817504883, + "learning_rate": 3.8833194834892176e-05, + "loss": 0.6525, + "step": 75790 + }, + { + "epoch": 0.6700967131667815, + "grad_norm": 0.9420561790466309, + "learning_rate": 3.883172144722031e-05, + "loss": 0.5379, + "step": 75800 + }, + { + "epoch": 0.6701851164270939, + "grad_norm": 5.292688846588135, + "learning_rate": 3.883024805954843e-05, + "loss": 0.6137, + "step": 75810 + }, + { + "epoch": 0.6702735196874061, + "grad_norm": 3.3520727157592773, + "learning_rate": 3.882877467187657e-05, + "loss": 0.6021, + "step": 75820 + }, + { + "epoch": 0.6703619229477183, + "grad_norm": 1.5326029062271118, + "learning_rate": 3.8827301284204697e-05, + "loss": 0.5959, + "step": 75830 + }, + { + "epoch": 0.6704503262080306, + "grad_norm": 3.817713975906372, + "learning_rate": 3.8825827896532825e-05, + "loss": 0.6685, + "step": 75840 + }, + { + "epoch": 0.6705387294683428, + "grad_norm": 2.552400827407837, + "learning_rate": 3.882435450886095e-05, + "loss": 0.7599, + "step": 75850 + }, + { + "epoch": 0.670627132728655, + "grad_norm": 5.682801246643066, + "learning_rate": 3.882288112118909e-05, + "loss": 0.704, + "step": 75860 + }, + { + "epoch": 0.6707155359889673, + "grad_norm": 2.2050187587738037, + "learning_rate": 3.882140773351721e-05, + "loss": 0.7109, + "step": 75870 + }, + { + "epoch": 0.6708039392492795, + "grad_norm": 3.0417778491973877, + "learning_rate": 3.8819934345845345e-05, + "loss": 0.7757, + "step": 75880 + }, + { + "epoch": 0.6708923425095917, + "grad_norm": 2.154115676879883, + "learning_rate": 3.8818460958173474e-05, + "loss": 0.6801, + "step": 75890 + }, + { + "epoch": 0.670980745769904, + "grad_norm": 4.734131336212158, + "learning_rate": 3.88169875705016e-05, + "loss": 0.6568, + "step": 75900 + }, + { + "epoch": 0.6710691490302162, + "grad_norm": 1.9785854816436768, + "learning_rate": 3.881551418282973e-05, + "loss": 0.6523, + "step": 75910 + }, + { + "epoch": 0.6711575522905284, + "grad_norm": 8.007513999938965, + "learning_rate": 3.8814040795157865e-05, + "loss": 0.8287, + "step": 75920 + }, + { + "epoch": 0.6712459555508408, + "grad_norm": 2.4354846477508545, + "learning_rate": 3.881256740748599e-05, + "loss": 0.6888, + "step": 75930 + }, + { + "epoch": 0.671334358811153, + "grad_norm": 5.224363327026367, + "learning_rate": 3.881109401981412e-05, + "loss": 0.7643, + "step": 75940 + }, + { + "epoch": 0.6714227620714652, + "grad_norm": 1.7246884107589722, + "learning_rate": 3.8809620632142244e-05, + "loss": 0.645, + "step": 75950 + }, + { + "epoch": 0.6715111653317775, + "grad_norm": 2.9604434967041016, + "learning_rate": 3.880814724447038e-05, + "loss": 0.7274, + "step": 75960 + }, + { + "epoch": 0.6715995685920897, + "grad_norm": 2.0970165729522705, + "learning_rate": 3.880667385679851e-05, + "loss": 0.5855, + "step": 75970 + }, + { + "epoch": 0.6716879718524019, + "grad_norm": 3.185828924179077, + "learning_rate": 3.8805200469126635e-05, + "loss": 0.6886, + "step": 75980 + }, + { + "epoch": 0.6717763751127142, + "grad_norm": 4.420489311218262, + "learning_rate": 3.8803727081454764e-05, + "loss": 0.7147, + "step": 75990 + }, + { + "epoch": 0.6718647783730264, + "grad_norm": 6.536433219909668, + "learning_rate": 3.88022536937829e-05, + "loss": 0.66, + "step": 76000 + }, + { + "epoch": 0.6719531816333386, + "grad_norm": 2.0639824867248535, + "learning_rate": 3.880078030611102e-05, + "loss": 0.6225, + "step": 76010 + }, + { + "epoch": 0.6720415848936508, + "grad_norm": 4.866724014282227, + "learning_rate": 3.8799306918439156e-05, + "loss": 0.6275, + "step": 76020 + }, + { + "epoch": 0.6721299881539631, + "grad_norm": 15.086483001708984, + "learning_rate": 3.8797833530767284e-05, + "loss": 0.5561, + "step": 76030 + }, + { + "epoch": 0.6722183914142753, + "grad_norm": 20.65117645263672, + "learning_rate": 3.879636014309541e-05, + "loss": 0.5579, + "step": 76040 + }, + { + "epoch": 0.6723067946745876, + "grad_norm": 1.189677357673645, + "learning_rate": 3.879488675542354e-05, + "loss": 0.7201, + "step": 76050 + }, + { + "epoch": 0.6723951979348999, + "grad_norm": 2.507047414779663, + "learning_rate": 3.879341336775167e-05, + "loss": 0.7802, + "step": 76060 + }, + { + "epoch": 0.6724836011952121, + "grad_norm": 5.809626579284668, + "learning_rate": 3.87919399800798e-05, + "loss": 0.6804, + "step": 76070 + }, + { + "epoch": 0.6725720044555243, + "grad_norm": 4.820792198181152, + "learning_rate": 3.879046659240793e-05, + "loss": 0.5915, + "step": 76080 + }, + { + "epoch": 0.6726604077158366, + "grad_norm": 1.4262391328811646, + "learning_rate": 3.8788993204736054e-05, + "loss": 0.78, + "step": 76090 + }, + { + "epoch": 0.6727488109761488, + "grad_norm": 5.81444787979126, + "learning_rate": 3.878751981706419e-05, + "loss": 0.616, + "step": 76100 + }, + { + "epoch": 0.672837214236461, + "grad_norm": 10.396108627319336, + "learning_rate": 3.878604642939232e-05, + "loss": 0.743, + "step": 76110 + }, + { + "epoch": 0.6729256174967733, + "grad_norm": 3.6961781978607178, + "learning_rate": 3.8784573041720446e-05, + "loss": 0.5824, + "step": 76120 + }, + { + "epoch": 0.6730140207570855, + "grad_norm": 5.650369167327881, + "learning_rate": 3.8783099654048574e-05, + "loss": 0.8155, + "step": 76130 + }, + { + "epoch": 0.6731024240173977, + "grad_norm": 8.80875015258789, + "learning_rate": 3.878162626637671e-05, + "loss": 0.7756, + "step": 76140 + }, + { + "epoch": 0.67319082727771, + "grad_norm": 5.947559833526611, + "learning_rate": 3.878015287870483e-05, + "loss": 0.737, + "step": 76150 + }, + { + "epoch": 0.6732792305380222, + "grad_norm": 13.889673233032227, + "learning_rate": 3.8778679491032966e-05, + "loss": 0.6277, + "step": 76160 + }, + { + "epoch": 0.6733676337983345, + "grad_norm": 4.561303615570068, + "learning_rate": 3.8777206103361095e-05, + "loss": 0.5821, + "step": 76170 + }, + { + "epoch": 0.6734560370586468, + "grad_norm": 1.242016315460205, + "learning_rate": 3.877573271568922e-05, + "loss": 0.5489, + "step": 76180 + }, + { + "epoch": 0.673544440318959, + "grad_norm": 4.189915180206299, + "learning_rate": 3.877425932801735e-05, + "loss": 0.7306, + "step": 76190 + }, + { + "epoch": 0.6736328435792712, + "grad_norm": 6.442260265350342, + "learning_rate": 3.877278594034548e-05, + "loss": 0.5927, + "step": 76200 + }, + { + "epoch": 0.6737212468395835, + "grad_norm": 6.389708518981934, + "learning_rate": 3.877131255267361e-05, + "loss": 0.6212, + "step": 76210 + }, + { + "epoch": 0.6738096500998957, + "grad_norm": 8.660594940185547, + "learning_rate": 3.876983916500174e-05, + "loss": 0.6111, + "step": 76220 + }, + { + "epoch": 0.6738980533602079, + "grad_norm": 2.4078876972198486, + "learning_rate": 3.876836577732987e-05, + "loss": 0.7935, + "step": 76230 + }, + { + "epoch": 0.6739864566205201, + "grad_norm": 1.8436461687088013, + "learning_rate": 3.8766892389658e-05, + "loss": 0.7178, + "step": 76240 + }, + { + "epoch": 0.6740748598808324, + "grad_norm": 1.7624011039733887, + "learning_rate": 3.876541900198613e-05, + "loss": 0.6913, + "step": 76250 + }, + { + "epoch": 0.6741632631411446, + "grad_norm": 3.4612741470336914, + "learning_rate": 3.8763945614314256e-05, + "loss": 0.7263, + "step": 76260 + }, + { + "epoch": 0.6742516664014568, + "grad_norm": 6.166719913482666, + "learning_rate": 3.8762472226642385e-05, + "loss": 0.7612, + "step": 76270 + }, + { + "epoch": 0.6743400696617692, + "grad_norm": 3.175039529800415, + "learning_rate": 3.876099883897051e-05, + "loss": 0.7465, + "step": 76280 + }, + { + "epoch": 0.6744284729220814, + "grad_norm": 2.4754281044006348, + "learning_rate": 3.875952545129865e-05, + "loss": 0.6401, + "step": 76290 + }, + { + "epoch": 0.6745168761823936, + "grad_norm": 4.258701801300049, + "learning_rate": 3.875805206362678e-05, + "loss": 0.7304, + "step": 76300 + }, + { + "epoch": 0.6746052794427059, + "grad_norm": 5.60132360458374, + "learning_rate": 3.8756578675954905e-05, + "loss": 0.65, + "step": 76310 + }, + { + "epoch": 0.6746936827030181, + "grad_norm": 3.512701988220215, + "learning_rate": 3.875510528828303e-05, + "loss": 0.6317, + "step": 76320 + }, + { + "epoch": 0.6747820859633303, + "grad_norm": 2.3437705039978027, + "learning_rate": 3.875363190061116e-05, + "loss": 0.6583, + "step": 76330 + }, + { + "epoch": 0.6748704892236426, + "grad_norm": 1.687526822090149, + "learning_rate": 3.875215851293929e-05, + "loss": 0.7704, + "step": 76340 + }, + { + "epoch": 0.6749588924839548, + "grad_norm": 11.453084945678711, + "learning_rate": 3.8750685125267425e-05, + "loss": 0.6987, + "step": 76350 + }, + { + "epoch": 0.675047295744267, + "grad_norm": 1.4400664567947388, + "learning_rate": 3.8749211737595554e-05, + "loss": 0.7892, + "step": 76360 + }, + { + "epoch": 0.6751356990045793, + "grad_norm": 2.6637187004089355, + "learning_rate": 3.874773834992368e-05, + "loss": 0.6595, + "step": 76370 + }, + { + "epoch": 0.6752241022648915, + "grad_norm": 3.611447334289551, + "learning_rate": 3.874626496225181e-05, + "loss": 0.7128, + "step": 76380 + }, + { + "epoch": 0.6753125055252037, + "grad_norm": 1.8053510189056396, + "learning_rate": 3.874479157457994e-05, + "loss": 0.558, + "step": 76390 + }, + { + "epoch": 0.6754009087855161, + "grad_norm": 8.580409049987793, + "learning_rate": 3.874331818690807e-05, + "loss": 0.646, + "step": 76400 + }, + { + "epoch": 0.6754893120458283, + "grad_norm": 2.5508487224578857, + "learning_rate": 3.87418447992362e-05, + "loss": 0.6681, + "step": 76410 + }, + { + "epoch": 0.6755777153061405, + "grad_norm": 4.451391220092773, + "learning_rate": 3.8740371411564324e-05, + "loss": 0.6011, + "step": 76420 + }, + { + "epoch": 0.6756661185664528, + "grad_norm": 1.0979185104370117, + "learning_rate": 3.873889802389246e-05, + "loss": 0.8827, + "step": 76430 + }, + { + "epoch": 0.675754521826765, + "grad_norm": 3.342114210128784, + "learning_rate": 3.873742463622059e-05, + "loss": 0.657, + "step": 76440 + }, + { + "epoch": 0.6758429250870772, + "grad_norm": 4.40913200378418, + "learning_rate": 3.8735951248548716e-05, + "loss": 0.6716, + "step": 76450 + }, + { + "epoch": 0.6759313283473894, + "grad_norm": 2.9905641078948975, + "learning_rate": 3.8734477860876844e-05, + "loss": 0.6634, + "step": 76460 + }, + { + "epoch": 0.6760197316077017, + "grad_norm": 3.614109754562378, + "learning_rate": 3.873300447320498e-05, + "loss": 0.6161, + "step": 76470 + }, + { + "epoch": 0.6761081348680139, + "grad_norm": 13.181926727294922, + "learning_rate": 3.87315310855331e-05, + "loss": 0.6864, + "step": 76480 + }, + { + "epoch": 0.6761965381283261, + "grad_norm": 2.6683850288391113, + "learning_rate": 3.8730057697861236e-05, + "loss": 0.6879, + "step": 76490 + }, + { + "epoch": 0.6762849413886384, + "grad_norm": 1.5223404169082642, + "learning_rate": 3.8728584310189364e-05, + "loss": 0.6355, + "step": 76500 + }, + { + "epoch": 0.6763733446489506, + "grad_norm": 3.8239762783050537, + "learning_rate": 3.872711092251749e-05, + "loss": 0.6753, + "step": 76510 + }, + { + "epoch": 0.676461747909263, + "grad_norm": 5.135126113891602, + "learning_rate": 3.872563753484562e-05, + "loss": 0.7254, + "step": 76520 + }, + { + "epoch": 0.6765501511695752, + "grad_norm": 2.5574920177459717, + "learning_rate": 3.872416414717375e-05, + "loss": 0.5601, + "step": 76530 + }, + { + "epoch": 0.6766385544298874, + "grad_norm": 4.815797328948975, + "learning_rate": 3.872269075950188e-05, + "loss": 0.5536, + "step": 76540 + }, + { + "epoch": 0.6767269576901996, + "grad_norm": 5.155397891998291, + "learning_rate": 3.872121737183001e-05, + "loss": 0.7275, + "step": 76550 + }, + { + "epoch": 0.6768153609505119, + "grad_norm": 2.6984620094299316, + "learning_rate": 3.8719743984158134e-05, + "loss": 0.6707, + "step": 76560 + }, + { + "epoch": 0.6769037642108241, + "grad_norm": 3.107577085494995, + "learning_rate": 3.871827059648627e-05, + "loss": 0.746, + "step": 76570 + }, + { + "epoch": 0.6769921674711363, + "grad_norm": 2.2539663314819336, + "learning_rate": 3.87167972088144e-05, + "loss": 0.8251, + "step": 76580 + }, + { + "epoch": 0.6770805707314486, + "grad_norm": 2.646031141281128, + "learning_rate": 3.8715323821142526e-05, + "loss": 0.7333, + "step": 76590 + }, + { + "epoch": 0.6771689739917608, + "grad_norm": 1.3493362665176392, + "learning_rate": 3.8713850433470654e-05, + "loss": 0.6736, + "step": 76600 + }, + { + "epoch": 0.677257377252073, + "grad_norm": 2.064878225326538, + "learning_rate": 3.871237704579879e-05, + "loss": 0.5603, + "step": 76610 + }, + { + "epoch": 0.6773457805123853, + "grad_norm": 1.6407034397125244, + "learning_rate": 3.871090365812691e-05, + "loss": 0.6618, + "step": 76620 + }, + { + "epoch": 0.6774341837726975, + "grad_norm": 7.08697509765625, + "learning_rate": 3.8709430270455046e-05, + "loss": 0.6694, + "step": 76630 + }, + { + "epoch": 0.6775225870330098, + "grad_norm": 1.1913646459579468, + "learning_rate": 3.870795688278317e-05, + "loss": 0.6065, + "step": 76640 + }, + { + "epoch": 0.6776109902933221, + "grad_norm": 4.2500386238098145, + "learning_rate": 3.87064834951113e-05, + "loss": 0.706, + "step": 76650 + }, + { + "epoch": 0.6776993935536343, + "grad_norm": 4.372733116149902, + "learning_rate": 3.870501010743943e-05, + "loss": 0.8389, + "step": 76660 + }, + { + "epoch": 0.6777877968139465, + "grad_norm": 2.3600168228149414, + "learning_rate": 3.870353671976756e-05, + "loss": 0.6175, + "step": 76670 + }, + { + "epoch": 0.6778762000742588, + "grad_norm": 7.481557846069336, + "learning_rate": 3.870206333209569e-05, + "loss": 0.6869, + "step": 76680 + }, + { + "epoch": 0.677964603334571, + "grad_norm": 1.741132378578186, + "learning_rate": 3.870058994442382e-05, + "loss": 0.6602, + "step": 76690 + }, + { + "epoch": 0.6780530065948832, + "grad_norm": 1.6569914817810059, + "learning_rate": 3.8699116556751945e-05, + "loss": 0.754, + "step": 76700 + }, + { + "epoch": 0.6781414098551954, + "grad_norm": 3.491014003753662, + "learning_rate": 3.869764316908008e-05, + "loss": 0.6877, + "step": 76710 + }, + { + "epoch": 0.6782298131155077, + "grad_norm": 1.5371977090835571, + "learning_rate": 3.869616978140821e-05, + "loss": 0.5571, + "step": 76720 + }, + { + "epoch": 0.6783182163758199, + "grad_norm": 6.5803399085998535, + "learning_rate": 3.8694696393736337e-05, + "loss": 0.698, + "step": 76730 + }, + { + "epoch": 0.6784066196361321, + "grad_norm": 2.120131492614746, + "learning_rate": 3.8693223006064465e-05, + "loss": 0.714, + "step": 76740 + }, + { + "epoch": 0.6784950228964444, + "grad_norm": 8.216854095458984, + "learning_rate": 3.86917496183926e-05, + "loss": 0.7117, + "step": 76750 + }, + { + "epoch": 0.6785834261567567, + "grad_norm": 3.822622299194336, + "learning_rate": 3.869027623072072e-05, + "loss": 0.7071, + "step": 76760 + }, + { + "epoch": 0.6786718294170689, + "grad_norm": 3.3949902057647705, + "learning_rate": 3.868880284304886e-05, + "loss": 0.5266, + "step": 76770 + }, + { + "epoch": 0.6787602326773812, + "grad_norm": 3.3918614387512207, + "learning_rate": 3.868732945537698e-05, + "loss": 0.5577, + "step": 76780 + }, + { + "epoch": 0.6788486359376934, + "grad_norm": 7.679214000701904, + "learning_rate": 3.8685856067705113e-05, + "loss": 0.8213, + "step": 76790 + }, + { + "epoch": 0.6789370391980056, + "grad_norm": 3.390754461288452, + "learning_rate": 3.868438268003324e-05, + "loss": 0.6292, + "step": 76800 + }, + { + "epoch": 0.6790254424583179, + "grad_norm": 5.8655686378479, + "learning_rate": 3.868290929236137e-05, + "loss": 0.6511, + "step": 76810 + }, + { + "epoch": 0.6791138457186301, + "grad_norm": 13.011842727661133, + "learning_rate": 3.86814359046895e-05, + "loss": 0.734, + "step": 76820 + }, + { + "epoch": 0.6792022489789423, + "grad_norm": 1.3025916814804077, + "learning_rate": 3.8679962517017634e-05, + "loss": 0.6747, + "step": 76830 + }, + { + "epoch": 0.6792906522392546, + "grad_norm": 14.641339302062988, + "learning_rate": 3.8678489129345755e-05, + "loss": 0.6175, + "step": 76840 + }, + { + "epoch": 0.6793790554995668, + "grad_norm": 1.6169750690460205, + "learning_rate": 3.867701574167389e-05, + "loss": 0.6741, + "step": 76850 + }, + { + "epoch": 0.679467458759879, + "grad_norm": 3.9084529876708984, + "learning_rate": 3.867554235400202e-05, + "loss": 0.6247, + "step": 76860 + }, + { + "epoch": 0.6795558620201914, + "grad_norm": 1.8821941614151, + "learning_rate": 3.867406896633015e-05, + "loss": 0.6401, + "step": 76870 + }, + { + "epoch": 0.6796442652805036, + "grad_norm": 3.509068250656128, + "learning_rate": 3.8672595578658275e-05, + "loss": 0.5333, + "step": 76880 + }, + { + "epoch": 0.6797326685408158, + "grad_norm": 13.12830638885498, + "learning_rate": 3.8671122190986404e-05, + "loss": 0.686, + "step": 76890 + }, + { + "epoch": 0.679821071801128, + "grad_norm": 2.8814942836761475, + "learning_rate": 3.866964880331453e-05, + "loss": 0.6663, + "step": 76900 + }, + { + "epoch": 0.6799094750614403, + "grad_norm": 6.208309173583984, + "learning_rate": 3.866817541564267e-05, + "loss": 0.6331, + "step": 76910 + }, + { + "epoch": 0.6799978783217525, + "grad_norm": 1.1584899425506592, + "learning_rate": 3.866670202797079e-05, + "loss": 0.6199, + "step": 76920 + }, + { + "epoch": 0.6800862815820647, + "grad_norm": 1.685923457145691, + "learning_rate": 3.8665228640298924e-05, + "loss": 0.6051, + "step": 76930 + }, + { + "epoch": 0.680174684842377, + "grad_norm": 3.3316476345062256, + "learning_rate": 3.866375525262705e-05, + "loss": 0.7501, + "step": 76940 + }, + { + "epoch": 0.6802630881026892, + "grad_norm": 1.666646122932434, + "learning_rate": 3.866228186495518e-05, + "loss": 0.5034, + "step": 76950 + }, + { + "epoch": 0.6803514913630014, + "grad_norm": 3.861109495162964, + "learning_rate": 3.866080847728331e-05, + "loss": 0.6992, + "step": 76960 + }, + { + "epoch": 0.6804398946233137, + "grad_norm": 4.303036689758301, + "learning_rate": 3.8659335089611444e-05, + "loss": 0.6443, + "step": 76970 + }, + { + "epoch": 0.6805282978836259, + "grad_norm": 7.868462562561035, + "learning_rate": 3.8657861701939566e-05, + "loss": 0.6491, + "step": 76980 + }, + { + "epoch": 0.6806167011439382, + "grad_norm": 4.3306450843811035, + "learning_rate": 3.86563883142677e-05, + "loss": 0.663, + "step": 76990 + }, + { + "epoch": 0.6807051044042505, + "grad_norm": 2.773069143295288, + "learning_rate": 3.865491492659582e-05, + "loss": 0.7513, + "step": 77000 + }, + { + "epoch": 0.6807935076645627, + "grad_norm": 2.352480173110962, + "learning_rate": 3.865344153892396e-05, + "loss": 0.6102, + "step": 77010 + }, + { + "epoch": 0.6808819109248749, + "grad_norm": 2.846865177154541, + "learning_rate": 3.8651968151252086e-05, + "loss": 0.7337, + "step": 77020 + }, + { + "epoch": 0.6809703141851872, + "grad_norm": 2.99969744682312, + "learning_rate": 3.8650494763580214e-05, + "loss": 0.7258, + "step": 77030 + }, + { + "epoch": 0.6810587174454994, + "grad_norm": 3.2447099685668945, + "learning_rate": 3.864902137590834e-05, + "loss": 0.7632, + "step": 77040 + }, + { + "epoch": 0.6811471207058116, + "grad_norm": 1.6464154720306396, + "learning_rate": 3.864754798823648e-05, + "loss": 0.5715, + "step": 77050 + }, + { + "epoch": 0.6812355239661239, + "grad_norm": 6.3425188064575195, + "learning_rate": 3.86460746005646e-05, + "loss": 0.786, + "step": 77060 + }, + { + "epoch": 0.6813239272264361, + "grad_norm": 9.601325988769531, + "learning_rate": 3.8644601212892734e-05, + "loss": 0.5505, + "step": 77070 + }, + { + "epoch": 0.6814123304867483, + "grad_norm": 4.141589641571045, + "learning_rate": 3.864312782522086e-05, + "loss": 0.7429, + "step": 77080 + }, + { + "epoch": 0.6815007337470605, + "grad_norm": 7.141766548156738, + "learning_rate": 3.864165443754899e-05, + "loss": 0.6463, + "step": 77090 + }, + { + "epoch": 0.6815891370073728, + "grad_norm": 3.5620219707489014, + "learning_rate": 3.864018104987712e-05, + "loss": 0.8364, + "step": 77100 + }, + { + "epoch": 0.6816775402676851, + "grad_norm": 2.0552573204040527, + "learning_rate": 3.863870766220525e-05, + "loss": 0.7667, + "step": 77110 + }, + { + "epoch": 0.6817659435279974, + "grad_norm": 2.395127773284912, + "learning_rate": 3.8637234274533376e-05, + "loss": 0.5508, + "step": 77120 + }, + { + "epoch": 0.6818543467883096, + "grad_norm": 5.132692813873291, + "learning_rate": 3.863576088686151e-05, + "loss": 0.7267, + "step": 77130 + }, + { + "epoch": 0.6819427500486218, + "grad_norm": 9.271647453308105, + "learning_rate": 3.863428749918964e-05, + "loss": 0.6817, + "step": 77140 + }, + { + "epoch": 0.682031153308934, + "grad_norm": 4.211577892303467, + "learning_rate": 3.863281411151777e-05, + "loss": 0.6936, + "step": 77150 + }, + { + "epoch": 0.6821195565692463, + "grad_norm": 5.926328659057617, + "learning_rate": 3.8631340723845896e-05, + "loss": 0.6266, + "step": 77160 + }, + { + "epoch": 0.6822079598295585, + "grad_norm": 9.586467742919922, + "learning_rate": 3.8629867336174025e-05, + "loss": 0.5502, + "step": 77170 + }, + { + "epoch": 0.6822963630898707, + "grad_norm": 3.0919978618621826, + "learning_rate": 3.862839394850215e-05, + "loss": 0.6783, + "step": 77180 + }, + { + "epoch": 0.682384766350183, + "grad_norm": 11.795863151550293, + "learning_rate": 3.862692056083029e-05, + "loss": 0.6815, + "step": 77190 + }, + { + "epoch": 0.6824731696104952, + "grad_norm": 2.4316375255584717, + "learning_rate": 3.862544717315842e-05, + "loss": 0.763, + "step": 77200 + }, + { + "epoch": 0.6825615728708074, + "grad_norm": 6.4402923583984375, + "learning_rate": 3.8623973785486545e-05, + "loss": 0.746, + "step": 77210 + }, + { + "epoch": 0.6826499761311197, + "grad_norm": 3.1197142601013184, + "learning_rate": 3.862250039781467e-05, + "loss": 0.6944, + "step": 77220 + }, + { + "epoch": 0.682738379391432, + "grad_norm": 2.88101863861084, + "learning_rate": 3.86210270101428e-05, + "loss": 0.6582, + "step": 77230 + }, + { + "epoch": 0.6828267826517442, + "grad_norm": 2.6732773780822754, + "learning_rate": 3.861955362247093e-05, + "loss": 0.6823, + "step": 77240 + }, + { + "epoch": 0.6829151859120565, + "grad_norm": 2.0981602668762207, + "learning_rate": 3.861808023479906e-05, + "loss": 0.6391, + "step": 77250 + }, + { + "epoch": 0.6830035891723687, + "grad_norm": 1.3534191846847534, + "learning_rate": 3.8616606847127194e-05, + "loss": 0.6871, + "step": 77260 + }, + { + "epoch": 0.6830919924326809, + "grad_norm": 6.169220447540283, + "learning_rate": 3.861513345945532e-05, + "loss": 0.7267, + "step": 77270 + }, + { + "epoch": 0.6831803956929932, + "grad_norm": 5.3853678703308105, + "learning_rate": 3.861366007178345e-05, + "loss": 0.6419, + "step": 77280 + }, + { + "epoch": 0.6832687989533054, + "grad_norm": 2.234717845916748, + "learning_rate": 3.861218668411158e-05, + "loss": 0.6934, + "step": 77290 + }, + { + "epoch": 0.6833572022136176, + "grad_norm": 3.117443323135376, + "learning_rate": 3.861071329643971e-05, + "loss": 0.7977, + "step": 77300 + }, + { + "epoch": 0.6834456054739299, + "grad_norm": 1.4964114427566528, + "learning_rate": 3.8609239908767835e-05, + "loss": 0.6945, + "step": 77310 + }, + { + "epoch": 0.6835340087342421, + "grad_norm": 2.7582192420959473, + "learning_rate": 3.860776652109597e-05, + "loss": 0.6983, + "step": 77320 + }, + { + "epoch": 0.6836224119945543, + "grad_norm": 2.0405874252319336, + "learning_rate": 3.86062931334241e-05, + "loss": 0.6828, + "step": 77330 + }, + { + "epoch": 0.6837108152548665, + "grad_norm": 1.4996368885040283, + "learning_rate": 3.860481974575223e-05, + "loss": 0.7164, + "step": 77340 + }, + { + "epoch": 0.6837992185151789, + "grad_norm": 2.466998338699341, + "learning_rate": 3.8603346358080355e-05, + "loss": 0.8637, + "step": 77350 + }, + { + "epoch": 0.6838876217754911, + "grad_norm": 7.78523588180542, + "learning_rate": 3.8601872970408484e-05, + "loss": 0.6814, + "step": 77360 + }, + { + "epoch": 0.6839760250358033, + "grad_norm": 1.4570581912994385, + "learning_rate": 3.860039958273661e-05, + "loss": 0.6079, + "step": 77370 + }, + { + "epoch": 0.6840644282961156, + "grad_norm": 8.560462951660156, + "learning_rate": 3.859892619506475e-05, + "loss": 0.557, + "step": 77380 + }, + { + "epoch": 0.6841528315564278, + "grad_norm": 7.949138164520264, + "learning_rate": 3.859745280739287e-05, + "loss": 0.7307, + "step": 77390 + }, + { + "epoch": 0.68424123481674, + "grad_norm": 1.9432971477508545, + "learning_rate": 3.8595979419721004e-05, + "loss": 0.6246, + "step": 77400 + }, + { + "epoch": 0.6843296380770523, + "grad_norm": 2.5344889163970947, + "learning_rate": 3.859450603204913e-05, + "loss": 0.7123, + "step": 77410 + }, + { + "epoch": 0.6844180413373645, + "grad_norm": 5.848361492156982, + "learning_rate": 3.859303264437726e-05, + "loss": 0.6575, + "step": 77420 + }, + { + "epoch": 0.6845064445976767, + "grad_norm": 2.0107686519622803, + "learning_rate": 3.859155925670539e-05, + "loss": 0.5882, + "step": 77430 + }, + { + "epoch": 0.684594847857989, + "grad_norm": 2.0502915382385254, + "learning_rate": 3.8590085869033524e-05, + "loss": 0.7121, + "step": 77440 + }, + { + "epoch": 0.6846832511183012, + "grad_norm": 3.6624529361724854, + "learning_rate": 3.8588612481361646e-05, + "loss": 0.5824, + "step": 77450 + }, + { + "epoch": 0.6847716543786135, + "grad_norm": 6.3490891456604, + "learning_rate": 3.858713909368978e-05, + "loss": 0.5814, + "step": 77460 + }, + { + "epoch": 0.6848600576389258, + "grad_norm": 2.2851083278656006, + "learning_rate": 3.85856657060179e-05, + "loss": 0.6769, + "step": 77470 + }, + { + "epoch": 0.684948460899238, + "grad_norm": 1.1894772052764893, + "learning_rate": 3.858419231834604e-05, + "loss": 0.5305, + "step": 77480 + }, + { + "epoch": 0.6850368641595502, + "grad_norm": 1.4727857112884521, + "learning_rate": 3.8582718930674166e-05, + "loss": 0.6348, + "step": 77490 + }, + { + "epoch": 0.6851252674198625, + "grad_norm": 5.069452285766602, + "learning_rate": 3.8581245543002294e-05, + "loss": 0.6586, + "step": 77500 + }, + { + "epoch": 0.6852136706801747, + "grad_norm": 1.811840295791626, + "learning_rate": 3.857977215533042e-05, + "loss": 0.6417, + "step": 77510 + }, + { + "epoch": 0.6853020739404869, + "grad_norm": 1.2616230249404907, + "learning_rate": 3.857829876765856e-05, + "loss": 0.5941, + "step": 77520 + }, + { + "epoch": 0.6853904772007992, + "grad_norm": 1.8622196912765503, + "learning_rate": 3.857682537998668e-05, + "loss": 0.7069, + "step": 77530 + }, + { + "epoch": 0.6854788804611114, + "grad_norm": 3.8690836429595947, + "learning_rate": 3.8575351992314815e-05, + "loss": 0.6264, + "step": 77540 + }, + { + "epoch": 0.6855672837214236, + "grad_norm": 2.098522424697876, + "learning_rate": 3.857387860464294e-05, + "loss": 0.6378, + "step": 77550 + }, + { + "epoch": 0.6856556869817358, + "grad_norm": 1.5568214654922485, + "learning_rate": 3.857240521697107e-05, + "loss": 0.723, + "step": 77560 + }, + { + "epoch": 0.6857440902420481, + "grad_norm": 3.2480857372283936, + "learning_rate": 3.85709318292992e-05, + "loss": 0.6516, + "step": 77570 + }, + { + "epoch": 0.6858324935023604, + "grad_norm": 1.9615591764450073, + "learning_rate": 3.856945844162733e-05, + "loss": 0.7321, + "step": 77580 + }, + { + "epoch": 0.6859208967626726, + "grad_norm": 3.882704734802246, + "learning_rate": 3.8567985053955456e-05, + "loss": 0.6662, + "step": 77590 + }, + { + "epoch": 0.6860093000229849, + "grad_norm": 3.7366836071014404, + "learning_rate": 3.856651166628359e-05, + "loss": 0.7001, + "step": 77600 + }, + { + "epoch": 0.6860977032832971, + "grad_norm": 5.924266338348389, + "learning_rate": 3.856503827861171e-05, + "loss": 0.7357, + "step": 77610 + }, + { + "epoch": 0.6861861065436093, + "grad_norm": 2.591278076171875, + "learning_rate": 3.856356489093985e-05, + "loss": 0.5735, + "step": 77620 + }, + { + "epoch": 0.6862745098039216, + "grad_norm": 4.934967041015625, + "learning_rate": 3.8562091503267977e-05, + "loss": 0.6293, + "step": 77630 + }, + { + "epoch": 0.6863629130642338, + "grad_norm": 6.791815757751465, + "learning_rate": 3.8560618115596105e-05, + "loss": 0.6967, + "step": 77640 + }, + { + "epoch": 0.686451316324546, + "grad_norm": 3.148416042327881, + "learning_rate": 3.855914472792423e-05, + "loss": 0.655, + "step": 77650 + }, + { + "epoch": 0.6865397195848583, + "grad_norm": 3.7165634632110596, + "learning_rate": 3.855767134025237e-05, + "loss": 0.8598, + "step": 77660 + }, + { + "epoch": 0.6866281228451705, + "grad_norm": 8.821511268615723, + "learning_rate": 3.855619795258049e-05, + "loss": 0.6967, + "step": 77670 + }, + { + "epoch": 0.6867165261054827, + "grad_norm": 6.567383766174316, + "learning_rate": 3.8554724564908625e-05, + "loss": 0.7457, + "step": 77680 + }, + { + "epoch": 0.686804929365795, + "grad_norm": 6.200684547424316, + "learning_rate": 3.8553251177236753e-05, + "loss": 0.6479, + "step": 77690 + }, + { + "epoch": 0.6868933326261073, + "grad_norm": 4.559550762176514, + "learning_rate": 3.855177778956488e-05, + "loss": 0.7718, + "step": 77700 + }, + { + "epoch": 0.6869817358864195, + "grad_norm": 2.284128427505493, + "learning_rate": 3.855030440189301e-05, + "loss": 0.7383, + "step": 77710 + }, + { + "epoch": 0.6870701391467318, + "grad_norm": 6.826602458953857, + "learning_rate": 3.854883101422114e-05, + "loss": 0.7224, + "step": 77720 + }, + { + "epoch": 0.687158542407044, + "grad_norm": 3.140498638153076, + "learning_rate": 3.854735762654927e-05, + "loss": 0.6355, + "step": 77730 + }, + { + "epoch": 0.6872469456673562, + "grad_norm": 4.787046909332275, + "learning_rate": 3.85458842388774e-05, + "loss": 0.5156, + "step": 77740 + }, + { + "epoch": 0.6873353489276685, + "grad_norm": 0.96241295337677, + "learning_rate": 3.8544410851205524e-05, + "loss": 0.6055, + "step": 77750 + }, + { + "epoch": 0.6874237521879807, + "grad_norm": 1.3984150886535645, + "learning_rate": 3.854293746353366e-05, + "loss": 0.5456, + "step": 77760 + }, + { + "epoch": 0.6875121554482929, + "grad_norm": 2.450329065322876, + "learning_rate": 3.854146407586179e-05, + "loss": 0.7398, + "step": 77770 + }, + { + "epoch": 0.6876005587086051, + "grad_norm": 7.138993263244629, + "learning_rate": 3.8539990688189915e-05, + "loss": 0.6696, + "step": 77780 + }, + { + "epoch": 0.6876889619689174, + "grad_norm": 2.7971534729003906, + "learning_rate": 3.8538517300518044e-05, + "loss": 0.604, + "step": 77790 + }, + { + "epoch": 0.6877773652292296, + "grad_norm": 2.738062858581543, + "learning_rate": 3.853704391284618e-05, + "loss": 0.6613, + "step": 77800 + }, + { + "epoch": 0.6878657684895418, + "grad_norm": 6.439538478851318, + "learning_rate": 3.85355705251743e-05, + "loss": 0.6827, + "step": 77810 + }, + { + "epoch": 0.6879541717498542, + "grad_norm": 2.194342613220215, + "learning_rate": 3.8534097137502436e-05, + "loss": 0.6879, + "step": 77820 + }, + { + "epoch": 0.6880425750101664, + "grad_norm": 4.100100994110107, + "learning_rate": 3.853262374983056e-05, + "loss": 0.4775, + "step": 77830 + }, + { + "epoch": 0.6881309782704786, + "grad_norm": 1.8053207397460938, + "learning_rate": 3.853115036215869e-05, + "loss": 0.847, + "step": 77840 + }, + { + "epoch": 0.6882193815307909, + "grad_norm": 7.048489570617676, + "learning_rate": 3.852967697448682e-05, + "loss": 0.6523, + "step": 77850 + }, + { + "epoch": 0.6883077847911031, + "grad_norm": 14.310152053833008, + "learning_rate": 3.852820358681495e-05, + "loss": 0.7075, + "step": 77860 + }, + { + "epoch": 0.6883961880514153, + "grad_norm": 6.7928009033203125, + "learning_rate": 3.852673019914308e-05, + "loss": 0.6874, + "step": 77870 + }, + { + "epoch": 0.6884845913117276, + "grad_norm": 3.396361827850342, + "learning_rate": 3.852525681147121e-05, + "loss": 0.6903, + "step": 77880 + }, + { + "epoch": 0.6885729945720398, + "grad_norm": 2.094087839126587, + "learning_rate": 3.8523783423799334e-05, + "loss": 0.606, + "step": 77890 + }, + { + "epoch": 0.688661397832352, + "grad_norm": 2.478360176086426, + "learning_rate": 3.852231003612747e-05, + "loss": 0.5834, + "step": 77900 + }, + { + "epoch": 0.6887498010926643, + "grad_norm": 4.949512958526611, + "learning_rate": 3.85208366484556e-05, + "loss": 0.6903, + "step": 77910 + }, + { + "epoch": 0.6888382043529765, + "grad_norm": 9.530298233032227, + "learning_rate": 3.8519363260783726e-05, + "loss": 0.7446, + "step": 77920 + }, + { + "epoch": 0.6889266076132888, + "grad_norm": 4.61517858505249, + "learning_rate": 3.8517889873111854e-05, + "loss": 0.6037, + "step": 77930 + }, + { + "epoch": 0.6890150108736011, + "grad_norm": 3.588691234588623, + "learning_rate": 3.851641648543998e-05, + "loss": 0.7105, + "step": 77940 + }, + { + "epoch": 0.6891034141339133, + "grad_norm": 2.0319581031799316, + "learning_rate": 3.851494309776811e-05, + "loss": 0.6975, + "step": 77950 + }, + { + "epoch": 0.6891918173942255, + "grad_norm": 4.805972576141357, + "learning_rate": 3.8513469710096246e-05, + "loss": 0.6088, + "step": 77960 + }, + { + "epoch": 0.6892802206545378, + "grad_norm": 5.29019832611084, + "learning_rate": 3.851199632242437e-05, + "loss": 0.696, + "step": 77970 + }, + { + "epoch": 0.68936862391485, + "grad_norm": 3.0730087757110596, + "learning_rate": 3.85105229347525e-05, + "loss": 0.6287, + "step": 77980 + }, + { + "epoch": 0.6894570271751622, + "grad_norm": 3.155669689178467, + "learning_rate": 3.850904954708063e-05, + "loss": 0.567, + "step": 77990 + }, + { + "epoch": 0.6895454304354744, + "grad_norm": 1.2867512702941895, + "learning_rate": 3.850757615940876e-05, + "loss": 0.6755, + "step": 78000 + }, + { + "epoch": 0.6896338336957867, + "grad_norm": 3.9351489543914795, + "learning_rate": 3.850610277173689e-05, + "loss": 0.8449, + "step": 78010 + }, + { + "epoch": 0.6897222369560989, + "grad_norm": 7.071952819824219, + "learning_rate": 3.850462938406502e-05, + "loss": 0.6631, + "step": 78020 + }, + { + "epoch": 0.6898106402164111, + "grad_norm": 1.500244140625, + "learning_rate": 3.8503155996393145e-05, + "loss": 0.7634, + "step": 78030 + }, + { + "epoch": 0.6898990434767234, + "grad_norm": 3.5885379314422607, + "learning_rate": 3.850168260872128e-05, + "loss": 0.7424, + "step": 78040 + }, + { + "epoch": 0.6899874467370357, + "grad_norm": 3.835240602493286, + "learning_rate": 3.850020922104941e-05, + "loss": 0.835, + "step": 78050 + }, + { + "epoch": 0.690075849997348, + "grad_norm": 4.1147565841674805, + "learning_rate": 3.8498735833377536e-05, + "loss": 0.6442, + "step": 78060 + }, + { + "epoch": 0.6901642532576602, + "grad_norm": 1.1382116079330444, + "learning_rate": 3.8497262445705665e-05, + "loss": 0.6578, + "step": 78070 + }, + { + "epoch": 0.6902526565179724, + "grad_norm": 1.4677952527999878, + "learning_rate": 3.849578905803379e-05, + "loss": 0.6354, + "step": 78080 + }, + { + "epoch": 0.6903410597782846, + "grad_norm": 2.0075416564941406, + "learning_rate": 3.849431567036192e-05, + "loss": 0.6277, + "step": 78090 + }, + { + "epoch": 0.6904294630385969, + "grad_norm": 2.518988609313965, + "learning_rate": 3.8492842282690057e-05, + "loss": 0.8074, + "step": 78100 + }, + { + "epoch": 0.6905178662989091, + "grad_norm": 1.1049258708953857, + "learning_rate": 3.8491368895018185e-05, + "loss": 0.6466, + "step": 78110 + }, + { + "epoch": 0.6906062695592213, + "grad_norm": 1.5145949125289917, + "learning_rate": 3.848989550734631e-05, + "loss": 0.6499, + "step": 78120 + }, + { + "epoch": 0.6906946728195336, + "grad_norm": 3.385376214981079, + "learning_rate": 3.848842211967444e-05, + "loss": 0.6591, + "step": 78130 + }, + { + "epoch": 0.6907830760798458, + "grad_norm": 12.704715728759766, + "learning_rate": 3.848694873200257e-05, + "loss": 0.6504, + "step": 78140 + }, + { + "epoch": 0.690871479340158, + "grad_norm": 1.2976224422454834, + "learning_rate": 3.84854753443307e-05, + "loss": 0.6289, + "step": 78150 + }, + { + "epoch": 0.6909598826004703, + "grad_norm": 1.834031581878662, + "learning_rate": 3.8484001956658833e-05, + "loss": 0.7837, + "step": 78160 + }, + { + "epoch": 0.6910482858607826, + "grad_norm": 3.1878788471221924, + "learning_rate": 3.848252856898696e-05, + "loss": 0.6847, + "step": 78170 + }, + { + "epoch": 0.6911366891210948, + "grad_norm": 4.563849449157715, + "learning_rate": 3.848105518131509e-05, + "loss": 0.7368, + "step": 78180 + }, + { + "epoch": 0.6912250923814071, + "grad_norm": 5.8437676429748535, + "learning_rate": 3.847958179364322e-05, + "loss": 0.6023, + "step": 78190 + }, + { + "epoch": 0.6913134956417193, + "grad_norm": 1.4736589193344116, + "learning_rate": 3.847810840597135e-05, + "loss": 0.7275, + "step": 78200 + }, + { + "epoch": 0.6914018989020315, + "grad_norm": 1.7320752143859863, + "learning_rate": 3.8476635018299475e-05, + "loss": 0.7797, + "step": 78210 + }, + { + "epoch": 0.6914903021623438, + "grad_norm": 2.283383369445801, + "learning_rate": 3.8475161630627604e-05, + "loss": 0.74, + "step": 78220 + }, + { + "epoch": 0.691578705422656, + "grad_norm": 1.9529945850372314, + "learning_rate": 3.847368824295574e-05, + "loss": 0.6701, + "step": 78230 + }, + { + "epoch": 0.6916671086829682, + "grad_norm": 1.152864694595337, + "learning_rate": 3.847221485528387e-05, + "loss": 0.7087, + "step": 78240 + }, + { + "epoch": 0.6917555119432804, + "grad_norm": 6.683051586151123, + "learning_rate": 3.8470741467611995e-05, + "loss": 0.5918, + "step": 78250 + }, + { + "epoch": 0.6918439152035927, + "grad_norm": 2.7650420665740967, + "learning_rate": 3.8469268079940124e-05, + "loss": 0.6306, + "step": 78260 + }, + { + "epoch": 0.6919323184639049, + "grad_norm": 3.6181538105010986, + "learning_rate": 3.846779469226825e-05, + "loss": 0.7452, + "step": 78270 + }, + { + "epoch": 0.6920207217242171, + "grad_norm": 8.790289878845215, + "learning_rate": 3.846632130459638e-05, + "loss": 0.702, + "step": 78280 + }, + { + "epoch": 0.6921091249845295, + "grad_norm": 7.053476810455322, + "learning_rate": 3.8464847916924516e-05, + "loss": 0.7591, + "step": 78290 + }, + { + "epoch": 0.6921975282448417, + "grad_norm": 4.7699785232543945, + "learning_rate": 3.846337452925264e-05, + "loss": 0.7871, + "step": 78300 + }, + { + "epoch": 0.6922859315051539, + "grad_norm": 2.2361674308776855, + "learning_rate": 3.846190114158077e-05, + "loss": 0.7456, + "step": 78310 + }, + { + "epoch": 0.6923743347654662, + "grad_norm": 4.297597408294678, + "learning_rate": 3.84604277539089e-05, + "loss": 0.6889, + "step": 78320 + }, + { + "epoch": 0.6924627380257784, + "grad_norm": 4.675218105316162, + "learning_rate": 3.845895436623703e-05, + "loss": 0.6225, + "step": 78330 + }, + { + "epoch": 0.6925511412860906, + "grad_norm": 1.6113498210906982, + "learning_rate": 3.845748097856516e-05, + "loss": 0.5882, + "step": 78340 + }, + { + "epoch": 0.6926395445464029, + "grad_norm": 5.541153430938721, + "learning_rate": 3.845600759089329e-05, + "loss": 0.565, + "step": 78350 + }, + { + "epoch": 0.6927279478067151, + "grad_norm": 1.596197247505188, + "learning_rate": 3.8454534203221414e-05, + "loss": 0.6401, + "step": 78360 + }, + { + "epoch": 0.6928163510670273, + "grad_norm": 4.908422946929932, + "learning_rate": 3.845306081554955e-05, + "loss": 0.7363, + "step": 78370 + }, + { + "epoch": 0.6929047543273396, + "grad_norm": 5.7742486000061035, + "learning_rate": 3.845158742787768e-05, + "loss": 0.5931, + "step": 78380 + }, + { + "epoch": 0.6929931575876518, + "grad_norm": 3.7057316303253174, + "learning_rate": 3.8450114040205806e-05, + "loss": 0.6672, + "step": 78390 + }, + { + "epoch": 0.693081560847964, + "grad_norm": 2.952052593231201, + "learning_rate": 3.8448640652533934e-05, + "loss": 0.6726, + "step": 78400 + }, + { + "epoch": 0.6931699641082764, + "grad_norm": 5.1991400718688965, + "learning_rate": 3.844716726486206e-05, + "loss": 0.5413, + "step": 78410 + }, + { + "epoch": 0.6932583673685886, + "grad_norm": 3.4608805179595947, + "learning_rate": 3.844569387719019e-05, + "loss": 0.6158, + "step": 78420 + }, + { + "epoch": 0.6933467706289008, + "grad_norm": 3.589613437652588, + "learning_rate": 3.8444220489518326e-05, + "loss": 0.7783, + "step": 78430 + }, + { + "epoch": 0.693435173889213, + "grad_norm": 11.545126914978027, + "learning_rate": 3.844274710184645e-05, + "loss": 0.6587, + "step": 78440 + }, + { + "epoch": 0.6935235771495253, + "grad_norm": 9.156929016113281, + "learning_rate": 3.844127371417458e-05, + "loss": 0.7596, + "step": 78450 + }, + { + "epoch": 0.6936119804098375, + "grad_norm": 3.0845587253570557, + "learning_rate": 3.843980032650271e-05, + "loss": 0.6724, + "step": 78460 + }, + { + "epoch": 0.6937003836701497, + "grad_norm": 1.9026954174041748, + "learning_rate": 3.843832693883084e-05, + "loss": 0.5653, + "step": 78470 + }, + { + "epoch": 0.693788786930462, + "grad_norm": 6.188068866729736, + "learning_rate": 3.843685355115897e-05, + "loss": 0.7205, + "step": 78480 + }, + { + "epoch": 0.6938771901907742, + "grad_norm": 1.2324304580688477, + "learning_rate": 3.84353801634871e-05, + "loss": 0.7364, + "step": 78490 + }, + { + "epoch": 0.6939655934510864, + "grad_norm": 1.2918884754180908, + "learning_rate": 3.8433906775815225e-05, + "loss": 0.7628, + "step": 78500 + }, + { + "epoch": 0.6940539967113987, + "grad_norm": 3.9561684131622314, + "learning_rate": 3.843243338814336e-05, + "loss": 0.6541, + "step": 78510 + }, + { + "epoch": 0.694142399971711, + "grad_norm": 5.111685276031494, + "learning_rate": 3.843096000047148e-05, + "loss": 0.6263, + "step": 78520 + }, + { + "epoch": 0.6942308032320232, + "grad_norm": 2.7367007732391357, + "learning_rate": 3.8429486612799616e-05, + "loss": 0.5999, + "step": 78530 + }, + { + "epoch": 0.6943192064923355, + "grad_norm": 11.13370418548584, + "learning_rate": 3.8428013225127745e-05, + "loss": 0.7875, + "step": 78540 + }, + { + "epoch": 0.6944076097526477, + "grad_norm": 2.065875768661499, + "learning_rate": 3.842653983745587e-05, + "loss": 0.743, + "step": 78550 + }, + { + "epoch": 0.6944960130129599, + "grad_norm": 1.517561912536621, + "learning_rate": 3.8425066449784e-05, + "loss": 0.5215, + "step": 78560 + }, + { + "epoch": 0.6945844162732722, + "grad_norm": 9.550228118896484, + "learning_rate": 3.842359306211214e-05, + "loss": 0.6063, + "step": 78570 + }, + { + "epoch": 0.6946728195335844, + "grad_norm": 3.929105758666992, + "learning_rate": 3.842211967444026e-05, + "loss": 0.608, + "step": 78580 + }, + { + "epoch": 0.6947612227938966, + "grad_norm": 4.663963794708252, + "learning_rate": 3.842064628676839e-05, + "loss": 0.6964, + "step": 78590 + }, + { + "epoch": 0.6948496260542089, + "grad_norm": 23.641374588012695, + "learning_rate": 3.841917289909652e-05, + "loss": 0.6725, + "step": 78600 + }, + { + "epoch": 0.6949380293145211, + "grad_norm": 5.547507286071777, + "learning_rate": 3.841769951142465e-05, + "loss": 0.6349, + "step": 78610 + }, + { + "epoch": 0.6950264325748333, + "grad_norm": 5.105334281921387, + "learning_rate": 3.841622612375278e-05, + "loss": 0.5902, + "step": 78620 + }, + { + "epoch": 0.6951148358351456, + "grad_norm": 3.687915563583374, + "learning_rate": 3.8414752736080914e-05, + "loss": 0.8202, + "step": 78630 + }, + { + "epoch": 0.6952032390954579, + "grad_norm": 4.429482460021973, + "learning_rate": 3.8413279348409035e-05, + "loss": 0.6844, + "step": 78640 + }, + { + "epoch": 0.6952916423557701, + "grad_norm": 5.250709056854248, + "learning_rate": 3.841180596073717e-05, + "loss": 0.7061, + "step": 78650 + }, + { + "epoch": 0.6953800456160824, + "grad_norm": 2.9156718254089355, + "learning_rate": 3.841033257306529e-05, + "loss": 0.7087, + "step": 78660 + }, + { + "epoch": 0.6954684488763946, + "grad_norm": 3.9343905448913574, + "learning_rate": 3.840885918539343e-05, + "loss": 0.6798, + "step": 78670 + }, + { + "epoch": 0.6955568521367068, + "grad_norm": 2.015077829360962, + "learning_rate": 3.8407385797721555e-05, + "loss": 0.6106, + "step": 78680 + }, + { + "epoch": 0.695645255397019, + "grad_norm": 6.2138566970825195, + "learning_rate": 3.8405912410049684e-05, + "loss": 0.6293, + "step": 78690 + }, + { + "epoch": 0.6957336586573313, + "grad_norm": 17.735597610473633, + "learning_rate": 3.840443902237781e-05, + "loss": 0.7272, + "step": 78700 + }, + { + "epoch": 0.6958220619176435, + "grad_norm": 3.783289909362793, + "learning_rate": 3.840296563470595e-05, + "loss": 0.7488, + "step": 78710 + }, + { + "epoch": 0.6959104651779557, + "grad_norm": 4.88430118560791, + "learning_rate": 3.840149224703407e-05, + "loss": 0.7339, + "step": 78720 + }, + { + "epoch": 0.695998868438268, + "grad_norm": 15.371222496032715, + "learning_rate": 3.8400018859362204e-05, + "loss": 0.6698, + "step": 78730 + }, + { + "epoch": 0.6960872716985802, + "grad_norm": 2.1152405738830566, + "learning_rate": 3.839854547169033e-05, + "loss": 0.6102, + "step": 78740 + }, + { + "epoch": 0.6961756749588924, + "grad_norm": 2.173186779022217, + "learning_rate": 3.839707208401846e-05, + "loss": 0.6526, + "step": 78750 + }, + { + "epoch": 0.6962640782192048, + "grad_norm": 2.6727705001831055, + "learning_rate": 3.839559869634659e-05, + "loss": 0.7233, + "step": 78760 + }, + { + "epoch": 0.696352481479517, + "grad_norm": 7.480678081512451, + "learning_rate": 3.839412530867472e-05, + "loss": 0.6809, + "step": 78770 + }, + { + "epoch": 0.6964408847398292, + "grad_norm": 1.2236015796661377, + "learning_rate": 3.8392651921002846e-05, + "loss": 0.6014, + "step": 78780 + }, + { + "epoch": 0.6965292880001415, + "grad_norm": 1.2494572401046753, + "learning_rate": 3.839117853333098e-05, + "loss": 0.6668, + "step": 78790 + }, + { + "epoch": 0.6966176912604537, + "grad_norm": 8.251554489135742, + "learning_rate": 3.83897051456591e-05, + "loss": 0.6042, + "step": 78800 + }, + { + "epoch": 0.6967060945207659, + "grad_norm": 2.8776397705078125, + "learning_rate": 3.838823175798724e-05, + "loss": 0.752, + "step": 78810 + }, + { + "epoch": 0.6967944977810782, + "grad_norm": 1.4898128509521484, + "learning_rate": 3.8386758370315366e-05, + "loss": 0.5884, + "step": 78820 + }, + { + "epoch": 0.6968829010413904, + "grad_norm": 2.1654345989227295, + "learning_rate": 3.8385284982643494e-05, + "loss": 0.6226, + "step": 78830 + }, + { + "epoch": 0.6969713043017026, + "grad_norm": 3.406527519226074, + "learning_rate": 3.838381159497162e-05, + "loss": 0.6076, + "step": 78840 + }, + { + "epoch": 0.6970597075620149, + "grad_norm": 9.295869827270508, + "learning_rate": 3.838233820729976e-05, + "loss": 0.5279, + "step": 78850 + }, + { + "epoch": 0.6971481108223271, + "grad_norm": 3.8971216678619385, + "learning_rate": 3.838086481962788e-05, + "loss": 0.5792, + "step": 78860 + }, + { + "epoch": 0.6972365140826393, + "grad_norm": 3.4754998683929443, + "learning_rate": 3.8379391431956014e-05, + "loss": 0.7319, + "step": 78870 + }, + { + "epoch": 0.6973249173429517, + "grad_norm": 3.010288953781128, + "learning_rate": 3.8377918044284136e-05, + "loss": 0.6882, + "step": 78880 + }, + { + "epoch": 0.6974133206032639, + "grad_norm": 2.7883942127227783, + "learning_rate": 3.837644465661227e-05, + "loss": 0.6509, + "step": 78890 + }, + { + "epoch": 0.6975017238635761, + "grad_norm": 2.550469160079956, + "learning_rate": 3.83749712689404e-05, + "loss": 0.6652, + "step": 78900 + }, + { + "epoch": 0.6975901271238883, + "grad_norm": 1.3190661668777466, + "learning_rate": 3.837349788126853e-05, + "loss": 0.5776, + "step": 78910 + }, + { + "epoch": 0.6976785303842006, + "grad_norm": 2.149895668029785, + "learning_rate": 3.8372024493596656e-05, + "loss": 0.6269, + "step": 78920 + }, + { + "epoch": 0.6977669336445128, + "grad_norm": 2.7502822875976562, + "learning_rate": 3.837055110592479e-05, + "loss": 0.6426, + "step": 78930 + }, + { + "epoch": 0.697855336904825, + "grad_norm": 3.8479859828948975, + "learning_rate": 3.836907771825291e-05, + "loss": 0.7274, + "step": 78940 + }, + { + "epoch": 0.6979437401651373, + "grad_norm": 1.2449777126312256, + "learning_rate": 3.836760433058105e-05, + "loss": 0.7502, + "step": 78950 + }, + { + "epoch": 0.6980321434254495, + "grad_norm": 2.5838563442230225, + "learning_rate": 3.8366130942909176e-05, + "loss": 0.6146, + "step": 78960 + }, + { + "epoch": 0.6981205466857617, + "grad_norm": 1.2737388610839844, + "learning_rate": 3.8364657555237305e-05, + "loss": 0.7528, + "step": 78970 + }, + { + "epoch": 0.698208949946074, + "grad_norm": 2.6073715686798096, + "learning_rate": 3.836318416756543e-05, + "loss": 0.7847, + "step": 78980 + }, + { + "epoch": 0.6982973532063863, + "grad_norm": 3.2915992736816406, + "learning_rate": 3.836171077989356e-05, + "loss": 0.691, + "step": 78990 + }, + { + "epoch": 0.6983857564666985, + "grad_norm": 6.68314266204834, + "learning_rate": 3.836023739222169e-05, + "loss": 0.6805, + "step": 79000 + }, + { + "epoch": 0.6984741597270108, + "grad_norm": 2.470914125442505, + "learning_rate": 3.8358764004549825e-05, + "loss": 0.5996, + "step": 79010 + }, + { + "epoch": 0.698562562987323, + "grad_norm": 2.4620614051818848, + "learning_rate": 3.835729061687795e-05, + "loss": 0.6096, + "step": 79020 + }, + { + "epoch": 0.6986509662476352, + "grad_norm": 2.979200601577759, + "learning_rate": 3.835581722920608e-05, + "loss": 0.6128, + "step": 79030 + }, + { + "epoch": 0.6987393695079475, + "grad_norm": 1.9747117757797241, + "learning_rate": 3.835434384153421e-05, + "loss": 0.7464, + "step": 79040 + }, + { + "epoch": 0.6988277727682597, + "grad_norm": 1.1767827272415161, + "learning_rate": 3.835287045386234e-05, + "loss": 0.5912, + "step": 79050 + }, + { + "epoch": 0.6989161760285719, + "grad_norm": 1.072789192199707, + "learning_rate": 3.835139706619047e-05, + "loss": 0.6726, + "step": 79060 + }, + { + "epoch": 0.6990045792888842, + "grad_norm": 3.0794551372528076, + "learning_rate": 3.83499236785186e-05, + "loss": 0.6359, + "step": 79070 + }, + { + "epoch": 0.6990929825491964, + "grad_norm": 1.395095705986023, + "learning_rate": 3.834845029084673e-05, + "loss": 0.752, + "step": 79080 + }, + { + "epoch": 0.6991813858095086, + "grad_norm": 4.126559734344482, + "learning_rate": 3.834697690317486e-05, + "loss": 0.7603, + "step": 79090 + }, + { + "epoch": 0.6992697890698208, + "grad_norm": 5.317914962768555, + "learning_rate": 3.834550351550299e-05, + "loss": 0.5813, + "step": 79100 + }, + { + "epoch": 0.6993581923301332, + "grad_norm": 1.5926259756088257, + "learning_rate": 3.8344030127831115e-05, + "loss": 0.6125, + "step": 79110 + }, + { + "epoch": 0.6994465955904454, + "grad_norm": 1.7232309579849243, + "learning_rate": 3.8342556740159244e-05, + "loss": 0.5315, + "step": 79120 + }, + { + "epoch": 0.6995349988507576, + "grad_norm": 14.704235076904297, + "learning_rate": 3.834108335248737e-05, + "loss": 0.7286, + "step": 79130 + }, + { + "epoch": 0.6996234021110699, + "grad_norm": 1.7574563026428223, + "learning_rate": 3.833960996481551e-05, + "loss": 0.5799, + "step": 79140 + }, + { + "epoch": 0.6997118053713821, + "grad_norm": 1.215949296951294, + "learning_rate": 3.8338136577143635e-05, + "loss": 0.7365, + "step": 79150 + }, + { + "epoch": 0.6998002086316943, + "grad_norm": 5.2433881759643555, + "learning_rate": 3.8336663189471764e-05, + "loss": 0.6206, + "step": 79160 + }, + { + "epoch": 0.6998886118920066, + "grad_norm": 3.9595348834991455, + "learning_rate": 3.833518980179989e-05, + "loss": 0.6593, + "step": 79170 + }, + { + "epoch": 0.6999770151523188, + "grad_norm": 4.542568683624268, + "learning_rate": 3.833371641412802e-05, + "loss": 0.6442, + "step": 79180 + }, + { + "epoch": 0.700065418412631, + "grad_norm": 8.986078262329102, + "learning_rate": 3.833224302645615e-05, + "loss": 0.7199, + "step": 79190 + }, + { + "epoch": 0.7001538216729433, + "grad_norm": 4.744601249694824, + "learning_rate": 3.8330769638784284e-05, + "loss": 0.6518, + "step": 79200 + }, + { + "epoch": 0.7002422249332555, + "grad_norm": 1.6164606809616089, + "learning_rate": 3.832929625111241e-05, + "loss": 0.731, + "step": 79210 + }, + { + "epoch": 0.7003306281935677, + "grad_norm": 2.7258386611938477, + "learning_rate": 3.832782286344054e-05, + "loss": 0.6669, + "step": 79220 + }, + { + "epoch": 0.7004190314538801, + "grad_norm": 1.7344862222671509, + "learning_rate": 3.832634947576867e-05, + "loss": 0.5681, + "step": 79230 + }, + { + "epoch": 0.7005074347141923, + "grad_norm": 4.408559799194336, + "learning_rate": 3.83248760880968e-05, + "loss": 0.6261, + "step": 79240 + }, + { + "epoch": 0.7005958379745045, + "grad_norm": 1.9056577682495117, + "learning_rate": 3.8323402700424926e-05, + "loss": 0.6321, + "step": 79250 + }, + { + "epoch": 0.7006842412348168, + "grad_norm": 2.941056966781616, + "learning_rate": 3.832192931275306e-05, + "loss": 0.5622, + "step": 79260 + }, + { + "epoch": 0.700772644495129, + "grad_norm": 2.356008291244507, + "learning_rate": 3.832045592508118e-05, + "loss": 0.6902, + "step": 79270 + }, + { + "epoch": 0.7008610477554412, + "grad_norm": 4.120590686798096, + "learning_rate": 3.831898253740932e-05, + "loss": 0.7524, + "step": 79280 + }, + { + "epoch": 0.7009494510157535, + "grad_norm": 3.3727569580078125, + "learning_rate": 3.8317509149737446e-05, + "loss": 0.6958, + "step": 79290 + }, + { + "epoch": 0.7010378542760657, + "grad_norm": 7.907055854797363, + "learning_rate": 3.8316035762065574e-05, + "loss": 0.7343, + "step": 79300 + }, + { + "epoch": 0.7011262575363779, + "grad_norm": 1.464552640914917, + "learning_rate": 3.83145623743937e-05, + "loss": 0.6277, + "step": 79310 + }, + { + "epoch": 0.7012146607966901, + "grad_norm": 1.5897775888442993, + "learning_rate": 3.831308898672184e-05, + "loss": 0.6797, + "step": 79320 + }, + { + "epoch": 0.7013030640570024, + "grad_norm": 3.080288887023926, + "learning_rate": 3.831161559904996e-05, + "loss": 0.667, + "step": 79330 + }, + { + "epoch": 0.7013914673173146, + "grad_norm": 3.283644676208496, + "learning_rate": 3.8310142211378094e-05, + "loss": 0.6307, + "step": 79340 + }, + { + "epoch": 0.701479870577627, + "grad_norm": 1.221320390701294, + "learning_rate": 3.8308668823706216e-05, + "loss": 0.6109, + "step": 79350 + }, + { + "epoch": 0.7015682738379392, + "grad_norm": 3.7016940116882324, + "learning_rate": 3.830719543603435e-05, + "loss": 0.6676, + "step": 79360 + }, + { + "epoch": 0.7016566770982514, + "grad_norm": 4.4188432693481445, + "learning_rate": 3.830572204836248e-05, + "loss": 0.646, + "step": 79370 + }, + { + "epoch": 0.7017450803585636, + "grad_norm": 2.582963228225708, + "learning_rate": 3.830424866069061e-05, + "loss": 0.7603, + "step": 79380 + }, + { + "epoch": 0.7018334836188759, + "grad_norm": 38.37507629394531, + "learning_rate": 3.8302775273018736e-05, + "loss": 0.6606, + "step": 79390 + }, + { + "epoch": 0.7019218868791881, + "grad_norm": 1.8954352140426636, + "learning_rate": 3.830130188534687e-05, + "loss": 0.7488, + "step": 79400 + }, + { + "epoch": 0.7020102901395003, + "grad_norm": 4.248343467712402, + "learning_rate": 3.829982849767499e-05, + "loss": 0.6453, + "step": 79410 + }, + { + "epoch": 0.7020986933998126, + "grad_norm": 10.113481521606445, + "learning_rate": 3.829835511000313e-05, + "loss": 0.6714, + "step": 79420 + }, + { + "epoch": 0.7021870966601248, + "grad_norm": 3.6906447410583496, + "learning_rate": 3.8296881722331256e-05, + "loss": 0.7333, + "step": 79430 + }, + { + "epoch": 0.702275499920437, + "grad_norm": 7.084136962890625, + "learning_rate": 3.8295408334659385e-05, + "loss": 0.7472, + "step": 79440 + }, + { + "epoch": 0.7023639031807493, + "grad_norm": 7.299503326416016, + "learning_rate": 3.829393494698751e-05, + "loss": 0.7032, + "step": 79450 + }, + { + "epoch": 0.7024523064410615, + "grad_norm": 1.9515361785888672, + "learning_rate": 3.829246155931565e-05, + "loss": 0.7094, + "step": 79460 + }, + { + "epoch": 0.7025407097013738, + "grad_norm": 2.1417977809906006, + "learning_rate": 3.829098817164377e-05, + "loss": 0.5656, + "step": 79470 + }, + { + "epoch": 0.7026291129616861, + "grad_norm": 3.552403211593628, + "learning_rate": 3.8289514783971905e-05, + "loss": 0.6753, + "step": 79480 + }, + { + "epoch": 0.7027175162219983, + "grad_norm": 10.532079696655273, + "learning_rate": 3.8288041396300027e-05, + "loss": 0.7718, + "step": 79490 + }, + { + "epoch": 0.7028059194823105, + "grad_norm": 1.7979345321655273, + "learning_rate": 3.828656800862816e-05, + "loss": 0.6526, + "step": 79500 + }, + { + "epoch": 0.7028943227426228, + "grad_norm": 10.65814208984375, + "learning_rate": 3.828509462095629e-05, + "loss": 0.7484, + "step": 79510 + }, + { + "epoch": 0.702982726002935, + "grad_norm": 2.0553746223449707, + "learning_rate": 3.828362123328442e-05, + "loss": 0.6374, + "step": 79520 + }, + { + "epoch": 0.7030711292632472, + "grad_norm": 1.2106977701187134, + "learning_rate": 3.828214784561255e-05, + "loss": 0.688, + "step": 79530 + }, + { + "epoch": 0.7031595325235594, + "grad_norm": 2.429676055908203, + "learning_rate": 3.828067445794068e-05, + "loss": 0.7162, + "step": 79540 + }, + { + "epoch": 0.7032479357838717, + "grad_norm": 5.542989253997803, + "learning_rate": 3.8279201070268803e-05, + "loss": 0.709, + "step": 79550 + }, + { + "epoch": 0.7033363390441839, + "grad_norm": 4.563215732574463, + "learning_rate": 3.827772768259694e-05, + "loss": 0.64, + "step": 79560 + }, + { + "epoch": 0.7034247423044961, + "grad_norm": 3.350586414337158, + "learning_rate": 3.827625429492507e-05, + "loss": 0.7658, + "step": 79570 + }, + { + "epoch": 0.7035131455648085, + "grad_norm": 3.2068681716918945, + "learning_rate": 3.8274780907253195e-05, + "loss": 0.8937, + "step": 79580 + }, + { + "epoch": 0.7036015488251207, + "grad_norm": 2.844890832901001, + "learning_rate": 3.8273307519581324e-05, + "loss": 0.5628, + "step": 79590 + }, + { + "epoch": 0.703689952085433, + "grad_norm": 22.12971305847168, + "learning_rate": 3.827183413190945e-05, + "loss": 0.6472, + "step": 79600 + }, + { + "epoch": 0.7037783553457452, + "grad_norm": 3.8450522422790527, + "learning_rate": 3.827036074423758e-05, + "loss": 0.5785, + "step": 79610 + }, + { + "epoch": 0.7038667586060574, + "grad_norm": 1.7838504314422607, + "learning_rate": 3.8268887356565715e-05, + "loss": 0.688, + "step": 79620 + }, + { + "epoch": 0.7039551618663696, + "grad_norm": 6.339825630187988, + "learning_rate": 3.826741396889384e-05, + "loss": 0.7111, + "step": 79630 + }, + { + "epoch": 0.7040435651266819, + "grad_norm": 3.405885696411133, + "learning_rate": 3.826594058122197e-05, + "loss": 0.756, + "step": 79640 + }, + { + "epoch": 0.7041319683869941, + "grad_norm": 1.351367473602295, + "learning_rate": 3.82644671935501e-05, + "loss": 0.7055, + "step": 79650 + }, + { + "epoch": 0.7042203716473063, + "grad_norm": 2.725022554397583, + "learning_rate": 3.826299380587823e-05, + "loss": 0.7049, + "step": 79660 + }, + { + "epoch": 0.7043087749076186, + "grad_norm": 18.686199188232422, + "learning_rate": 3.826152041820636e-05, + "loss": 0.6502, + "step": 79670 + }, + { + "epoch": 0.7043971781679308, + "grad_norm": 2.0062692165374756, + "learning_rate": 3.826004703053449e-05, + "loss": 0.6214, + "step": 79680 + }, + { + "epoch": 0.704485581428243, + "grad_norm": 1.0387845039367676, + "learning_rate": 3.8258573642862614e-05, + "loss": 0.6143, + "step": 79690 + }, + { + "epoch": 0.7045739846885554, + "grad_norm": 10.7262601852417, + "learning_rate": 3.825710025519075e-05, + "loss": 0.6674, + "step": 79700 + }, + { + "epoch": 0.7046623879488676, + "grad_norm": 2.371840238571167, + "learning_rate": 3.825562686751887e-05, + "loss": 0.8116, + "step": 79710 + }, + { + "epoch": 0.7047507912091798, + "grad_norm": 5.077537536621094, + "learning_rate": 3.8254153479847006e-05, + "loss": 0.651, + "step": 79720 + }, + { + "epoch": 0.7048391944694921, + "grad_norm": 4.237266540527344, + "learning_rate": 3.8252680092175134e-05, + "loss": 0.6247, + "step": 79730 + }, + { + "epoch": 0.7049275977298043, + "grad_norm": 10.335536003112793, + "learning_rate": 3.825120670450326e-05, + "loss": 0.6308, + "step": 79740 + }, + { + "epoch": 0.7050160009901165, + "grad_norm": 4.275367259979248, + "learning_rate": 3.824973331683139e-05, + "loss": 0.7129, + "step": 79750 + }, + { + "epoch": 0.7051044042504288, + "grad_norm": 2.2424392700195312, + "learning_rate": 3.8248259929159526e-05, + "loss": 0.5872, + "step": 79760 + }, + { + "epoch": 0.705192807510741, + "grad_norm": 7.280856132507324, + "learning_rate": 3.824678654148765e-05, + "loss": 0.5647, + "step": 79770 + }, + { + "epoch": 0.7052812107710532, + "grad_norm": 17.15814971923828, + "learning_rate": 3.824531315381578e-05, + "loss": 0.7838, + "step": 79780 + }, + { + "epoch": 0.7053696140313654, + "grad_norm": 1.6910101175308228, + "learning_rate": 3.824383976614391e-05, + "loss": 0.5949, + "step": 79790 + }, + { + "epoch": 0.7054580172916777, + "grad_norm": 5.263885021209717, + "learning_rate": 3.824236637847204e-05, + "loss": 0.6636, + "step": 79800 + }, + { + "epoch": 0.7055464205519899, + "grad_norm": 6.243316650390625, + "learning_rate": 3.824089299080017e-05, + "loss": 0.6419, + "step": 79810 + }, + { + "epoch": 0.7056348238123022, + "grad_norm": 3.959357976913452, + "learning_rate": 3.8239419603128296e-05, + "loss": 0.6747, + "step": 79820 + }, + { + "epoch": 0.7057232270726145, + "grad_norm": 5.906799793243408, + "learning_rate": 3.8237946215456424e-05, + "loss": 0.647, + "step": 79830 + }, + { + "epoch": 0.7058116303329267, + "grad_norm": 7.207847595214844, + "learning_rate": 3.823647282778456e-05, + "loss": 0.7708, + "step": 79840 + }, + { + "epoch": 0.7059000335932389, + "grad_norm": 4.120061874389648, + "learning_rate": 3.823499944011268e-05, + "loss": 0.6943, + "step": 79850 + }, + { + "epoch": 0.7059884368535512, + "grad_norm": 3.261446237564087, + "learning_rate": 3.8233526052440816e-05, + "loss": 0.6548, + "step": 79860 + }, + { + "epoch": 0.7060768401138634, + "grad_norm": 5.274998664855957, + "learning_rate": 3.8232052664768945e-05, + "loss": 0.555, + "step": 79870 + }, + { + "epoch": 0.7061652433741756, + "grad_norm": 13.627988815307617, + "learning_rate": 3.823057927709707e-05, + "loss": 0.5523, + "step": 79880 + }, + { + "epoch": 0.7062536466344879, + "grad_norm": 5.227761745452881, + "learning_rate": 3.82291058894252e-05, + "loss": 0.7526, + "step": 79890 + }, + { + "epoch": 0.7063420498948001, + "grad_norm": 7.97208309173584, + "learning_rate": 3.8227632501753336e-05, + "loss": 0.688, + "step": 79900 + }, + { + "epoch": 0.7064304531551123, + "grad_norm": 2.218047618865967, + "learning_rate": 3.822615911408146e-05, + "loss": 0.6018, + "step": 79910 + }, + { + "epoch": 0.7065188564154246, + "grad_norm": 4.144509792327881, + "learning_rate": 3.822468572640959e-05, + "loss": 0.8467, + "step": 79920 + }, + { + "epoch": 0.7066072596757368, + "grad_norm": 2.490694284439087, + "learning_rate": 3.822321233873772e-05, + "loss": 0.5365, + "step": 79930 + }, + { + "epoch": 0.7066956629360491, + "grad_norm": 2.9271297454833984, + "learning_rate": 3.822173895106585e-05, + "loss": 0.6453, + "step": 79940 + }, + { + "epoch": 0.7067840661963614, + "grad_norm": 6.936543941497803, + "learning_rate": 3.822026556339398e-05, + "loss": 0.6204, + "step": 79950 + }, + { + "epoch": 0.7068724694566736, + "grad_norm": 1.7527416944503784, + "learning_rate": 3.8218792175722107e-05, + "loss": 0.5657, + "step": 79960 + }, + { + "epoch": 0.7069608727169858, + "grad_norm": 5.678459167480469, + "learning_rate": 3.8217318788050235e-05, + "loss": 0.6058, + "step": 79970 + }, + { + "epoch": 0.707049275977298, + "grad_norm": 1.9363157749176025, + "learning_rate": 3.821584540037837e-05, + "loss": 0.6623, + "step": 79980 + }, + { + "epoch": 0.7071376792376103, + "grad_norm": 9.029206275939941, + "learning_rate": 3.82143720127065e-05, + "loss": 0.6865, + "step": 79990 + }, + { + "epoch": 0.7072260824979225, + "grad_norm": 1.8255505561828613, + "learning_rate": 3.821289862503463e-05, + "loss": 0.6503, + "step": 80000 + }, + { + "epoch": 0.7073144857582347, + "grad_norm": 4.427699565887451, + "learning_rate": 3.8211425237362755e-05, + "loss": 0.7285, + "step": 80010 + }, + { + "epoch": 0.707402889018547, + "grad_norm": 3.238001585006714, + "learning_rate": 3.8209951849690883e-05, + "loss": 0.7121, + "step": 80020 + }, + { + "epoch": 0.7074912922788592, + "grad_norm": 2.349195718765259, + "learning_rate": 3.820847846201901e-05, + "loss": 0.8094, + "step": 80030 + }, + { + "epoch": 0.7075796955391714, + "grad_norm": 8.461559295654297, + "learning_rate": 3.820700507434715e-05, + "loss": 0.8172, + "step": 80040 + }, + { + "epoch": 0.7076680987994838, + "grad_norm": 2.5984296798706055, + "learning_rate": 3.8205531686675275e-05, + "loss": 0.6721, + "step": 80050 + }, + { + "epoch": 0.707756502059796, + "grad_norm": 3.524854898452759, + "learning_rate": 3.8204058299003404e-05, + "loss": 0.5735, + "step": 80060 + }, + { + "epoch": 0.7078449053201082, + "grad_norm": 1.5843122005462646, + "learning_rate": 3.820258491133153e-05, + "loss": 0.7668, + "step": 80070 + }, + { + "epoch": 0.7079333085804205, + "grad_norm": 1.318343162536621, + "learning_rate": 3.820111152365966e-05, + "loss": 0.6858, + "step": 80080 + }, + { + "epoch": 0.7080217118407327, + "grad_norm": 1.6608740091323853, + "learning_rate": 3.819963813598779e-05, + "loss": 0.516, + "step": 80090 + }, + { + "epoch": 0.7081101151010449, + "grad_norm": 3.6183922290802, + "learning_rate": 3.819816474831592e-05, + "loss": 0.6425, + "step": 80100 + }, + { + "epoch": 0.7081985183613572, + "grad_norm": 8.370306015014648, + "learning_rate": 3.819669136064405e-05, + "loss": 0.8753, + "step": 80110 + }, + { + "epoch": 0.7082869216216694, + "grad_norm": 7.38960075378418, + "learning_rate": 3.819521797297218e-05, + "loss": 0.589, + "step": 80120 + }, + { + "epoch": 0.7083753248819816, + "grad_norm": 1.351130723953247, + "learning_rate": 3.819374458530031e-05, + "loss": 0.6062, + "step": 80130 + }, + { + "epoch": 0.7084637281422939, + "grad_norm": 4.630609512329102, + "learning_rate": 3.819227119762844e-05, + "loss": 0.6723, + "step": 80140 + }, + { + "epoch": 0.7085521314026061, + "grad_norm": 4.845788955688477, + "learning_rate": 3.8190797809956566e-05, + "loss": 0.6207, + "step": 80150 + }, + { + "epoch": 0.7086405346629183, + "grad_norm": 4.605804920196533, + "learning_rate": 3.8189324422284694e-05, + "loss": 0.7058, + "step": 80160 + }, + { + "epoch": 0.7087289379232307, + "grad_norm": 1.0693213939666748, + "learning_rate": 3.818785103461283e-05, + "loss": 0.6121, + "step": 80170 + }, + { + "epoch": 0.7088173411835429, + "grad_norm": 3.252079725265503, + "learning_rate": 3.818637764694095e-05, + "loss": 0.7348, + "step": 80180 + }, + { + "epoch": 0.7089057444438551, + "grad_norm": 3.078933000564575, + "learning_rate": 3.8184904259269086e-05, + "loss": 0.7634, + "step": 80190 + }, + { + "epoch": 0.7089941477041674, + "grad_norm": 2.09808349609375, + "learning_rate": 3.8183430871597214e-05, + "loss": 0.6455, + "step": 80200 + }, + { + "epoch": 0.7090825509644796, + "grad_norm": 8.699748992919922, + "learning_rate": 3.818195748392534e-05, + "loss": 0.7876, + "step": 80210 + }, + { + "epoch": 0.7091709542247918, + "grad_norm": 6.0032734870910645, + "learning_rate": 3.818048409625347e-05, + "loss": 0.6613, + "step": 80220 + }, + { + "epoch": 0.709259357485104, + "grad_norm": 7.0532002449035645, + "learning_rate": 3.8179010708581606e-05, + "loss": 0.625, + "step": 80230 + }, + { + "epoch": 0.7093477607454163, + "grad_norm": 3.107225179672241, + "learning_rate": 3.817753732090973e-05, + "loss": 0.7048, + "step": 80240 + }, + { + "epoch": 0.7094361640057285, + "grad_norm": 4.6212873458862305, + "learning_rate": 3.817606393323786e-05, + "loss": 0.7531, + "step": 80250 + }, + { + "epoch": 0.7095245672660407, + "grad_norm": 2.4628825187683105, + "learning_rate": 3.817459054556599e-05, + "loss": 0.7099, + "step": 80260 + }, + { + "epoch": 0.709612970526353, + "grad_norm": 14.950610160827637, + "learning_rate": 3.817311715789412e-05, + "loss": 0.7147, + "step": 80270 + }, + { + "epoch": 0.7097013737866652, + "grad_norm": 1.1862995624542236, + "learning_rate": 3.817164377022225e-05, + "loss": 0.5203, + "step": 80280 + }, + { + "epoch": 0.7097897770469775, + "grad_norm": 1.3501532077789307, + "learning_rate": 3.8170170382550376e-05, + "loss": 0.7251, + "step": 80290 + }, + { + "epoch": 0.7098781803072898, + "grad_norm": 3.677821159362793, + "learning_rate": 3.8168696994878505e-05, + "loss": 0.4929, + "step": 80300 + }, + { + "epoch": 0.709966583567602, + "grad_norm": 1.737152099609375, + "learning_rate": 3.816722360720664e-05, + "loss": 0.7541, + "step": 80310 + }, + { + "epoch": 0.7100549868279142, + "grad_norm": 1.7136890888214111, + "learning_rate": 3.816575021953476e-05, + "loss": 0.6292, + "step": 80320 + }, + { + "epoch": 0.7101433900882265, + "grad_norm": 6.275796890258789, + "learning_rate": 3.8164276831862896e-05, + "loss": 0.5734, + "step": 80330 + }, + { + "epoch": 0.7102317933485387, + "grad_norm": 5.169858455657959, + "learning_rate": 3.8162803444191025e-05, + "loss": 0.7539, + "step": 80340 + }, + { + "epoch": 0.7103201966088509, + "grad_norm": 3.0072147846221924, + "learning_rate": 3.816133005651915e-05, + "loss": 0.6728, + "step": 80350 + }, + { + "epoch": 0.7104085998691632, + "grad_norm": 2.848865270614624, + "learning_rate": 3.815985666884728e-05, + "loss": 0.6994, + "step": 80360 + }, + { + "epoch": 0.7104970031294754, + "grad_norm": 1.6073395013809204, + "learning_rate": 3.8158383281175417e-05, + "loss": 0.5958, + "step": 80370 + }, + { + "epoch": 0.7105854063897876, + "grad_norm": 5.001084804534912, + "learning_rate": 3.815690989350354e-05, + "loss": 0.6145, + "step": 80380 + }, + { + "epoch": 0.7106738096500999, + "grad_norm": 3.050320863723755, + "learning_rate": 3.815543650583167e-05, + "loss": 0.627, + "step": 80390 + }, + { + "epoch": 0.7107622129104121, + "grad_norm": 6.285628318786621, + "learning_rate": 3.81539631181598e-05, + "loss": 0.711, + "step": 80400 + }, + { + "epoch": 0.7108506161707244, + "grad_norm": 5.953503131866455, + "learning_rate": 3.815248973048793e-05, + "loss": 0.6785, + "step": 80410 + }, + { + "epoch": 0.7109390194310367, + "grad_norm": 7.071252346038818, + "learning_rate": 3.815101634281606e-05, + "loss": 0.7448, + "step": 80420 + }, + { + "epoch": 0.7110274226913489, + "grad_norm": 5.907627582550049, + "learning_rate": 3.814954295514419e-05, + "loss": 0.639, + "step": 80430 + }, + { + "epoch": 0.7111158259516611, + "grad_norm": 2.3476157188415527, + "learning_rate": 3.8148069567472315e-05, + "loss": 0.7414, + "step": 80440 + }, + { + "epoch": 0.7112042292119733, + "grad_norm": 6.545152187347412, + "learning_rate": 3.814659617980045e-05, + "loss": 0.7091, + "step": 80450 + }, + { + "epoch": 0.7112926324722856, + "grad_norm": 3.210423469543457, + "learning_rate": 3.814512279212857e-05, + "loss": 0.794, + "step": 80460 + }, + { + "epoch": 0.7113810357325978, + "grad_norm": 3.8754019737243652, + "learning_rate": 3.814364940445671e-05, + "loss": 0.5328, + "step": 80470 + }, + { + "epoch": 0.71146943899291, + "grad_norm": 3.9626824855804443, + "learning_rate": 3.8142176016784835e-05, + "loss": 0.5803, + "step": 80480 + }, + { + "epoch": 0.7115578422532223, + "grad_norm": 5.242998123168945, + "learning_rate": 3.8140702629112964e-05, + "loss": 0.6155, + "step": 80490 + }, + { + "epoch": 0.7116462455135345, + "grad_norm": 5.1946120262146, + "learning_rate": 3.813922924144109e-05, + "loss": 0.7128, + "step": 80500 + }, + { + "epoch": 0.7117346487738467, + "grad_norm": 5.955926895141602, + "learning_rate": 3.813775585376923e-05, + "loss": 0.6745, + "step": 80510 + }, + { + "epoch": 0.711823052034159, + "grad_norm": 3.4583559036254883, + "learning_rate": 3.813628246609735e-05, + "loss": 0.8521, + "step": 80520 + }, + { + "epoch": 0.7119114552944713, + "grad_norm": 3.571674346923828, + "learning_rate": 3.8134809078425484e-05, + "loss": 0.6185, + "step": 80530 + }, + { + "epoch": 0.7119998585547835, + "grad_norm": 1.334622859954834, + "learning_rate": 3.8133335690753605e-05, + "loss": 0.5735, + "step": 80540 + }, + { + "epoch": 0.7120882618150958, + "grad_norm": 0.7861053943634033, + "learning_rate": 3.813186230308174e-05, + "loss": 0.4735, + "step": 80550 + }, + { + "epoch": 0.712176665075408, + "grad_norm": 3.4673657417297363, + "learning_rate": 3.813038891540987e-05, + "loss": 0.7406, + "step": 80560 + }, + { + "epoch": 0.7122650683357202, + "grad_norm": 4.715671539306641, + "learning_rate": 3.8128915527738e-05, + "loss": 0.6631, + "step": 80570 + }, + { + "epoch": 0.7123534715960325, + "grad_norm": 5.260922908782959, + "learning_rate": 3.8127442140066126e-05, + "loss": 0.5585, + "step": 80580 + }, + { + "epoch": 0.7124418748563447, + "grad_norm": 8.380212783813477, + "learning_rate": 3.812596875239426e-05, + "loss": 0.7258, + "step": 80590 + }, + { + "epoch": 0.7125302781166569, + "grad_norm": 3.363799810409546, + "learning_rate": 3.812449536472238e-05, + "loss": 0.6766, + "step": 80600 + }, + { + "epoch": 0.7126186813769692, + "grad_norm": 2.8767929077148438, + "learning_rate": 3.812302197705052e-05, + "loss": 0.7858, + "step": 80610 + }, + { + "epoch": 0.7127070846372814, + "grad_norm": 2.8520190715789795, + "learning_rate": 3.8121548589378646e-05, + "loss": 0.6691, + "step": 80620 + }, + { + "epoch": 0.7127954878975936, + "grad_norm": 3.7284460067749023, + "learning_rate": 3.8120075201706774e-05, + "loss": 0.5819, + "step": 80630 + }, + { + "epoch": 0.712883891157906, + "grad_norm": 4.8409528732299805, + "learning_rate": 3.81186018140349e-05, + "loss": 0.6469, + "step": 80640 + }, + { + "epoch": 0.7129722944182182, + "grad_norm": 7.845481872558594, + "learning_rate": 3.811712842636303e-05, + "loss": 0.7873, + "step": 80650 + }, + { + "epoch": 0.7130606976785304, + "grad_norm": 2.3964896202087402, + "learning_rate": 3.811565503869116e-05, + "loss": 0.8076, + "step": 80660 + }, + { + "epoch": 0.7131491009388427, + "grad_norm": 4.14758825302124, + "learning_rate": 3.8114181651019294e-05, + "loss": 0.6822, + "step": 80670 + }, + { + "epoch": 0.7132375041991549, + "grad_norm": 1.3765736818313599, + "learning_rate": 3.8112708263347416e-05, + "loss": 0.5771, + "step": 80680 + }, + { + "epoch": 0.7133259074594671, + "grad_norm": 3.637725830078125, + "learning_rate": 3.811123487567555e-05, + "loss": 0.6483, + "step": 80690 + }, + { + "epoch": 0.7134143107197793, + "grad_norm": 1.5444157123565674, + "learning_rate": 3.810976148800368e-05, + "loss": 0.7043, + "step": 80700 + }, + { + "epoch": 0.7135027139800916, + "grad_norm": 1.9112391471862793, + "learning_rate": 3.810828810033181e-05, + "loss": 0.6185, + "step": 80710 + }, + { + "epoch": 0.7135911172404038, + "grad_norm": 3.378286361694336, + "learning_rate": 3.8106814712659936e-05, + "loss": 0.6345, + "step": 80720 + }, + { + "epoch": 0.713679520500716, + "grad_norm": 7.995019912719727, + "learning_rate": 3.810534132498807e-05, + "loss": 0.5848, + "step": 80730 + }, + { + "epoch": 0.7137679237610283, + "grad_norm": 3.080310583114624, + "learning_rate": 3.810386793731619e-05, + "loss": 0.6683, + "step": 80740 + }, + { + "epoch": 0.7138563270213405, + "grad_norm": 2.683173418045044, + "learning_rate": 3.810239454964433e-05, + "loss": 0.6971, + "step": 80750 + }, + { + "epoch": 0.7139447302816528, + "grad_norm": 5.536104679107666, + "learning_rate": 3.810092116197245e-05, + "loss": 0.6663, + "step": 80760 + }, + { + "epoch": 0.7140331335419651, + "grad_norm": 4.201199054718018, + "learning_rate": 3.8099447774300585e-05, + "loss": 0.6005, + "step": 80770 + }, + { + "epoch": 0.7141215368022773, + "grad_norm": 10.316539764404297, + "learning_rate": 3.809797438662871e-05, + "loss": 0.7354, + "step": 80780 + }, + { + "epoch": 0.7142099400625895, + "grad_norm": 3.480752468109131, + "learning_rate": 3.809650099895684e-05, + "loss": 0.758, + "step": 80790 + }, + { + "epoch": 0.7142983433229018, + "grad_norm": 6.061373233795166, + "learning_rate": 3.809502761128497e-05, + "loss": 0.6605, + "step": 80800 + }, + { + "epoch": 0.714386746583214, + "grad_norm": 2.2674710750579834, + "learning_rate": 3.8093554223613105e-05, + "loss": 0.6302, + "step": 80810 + }, + { + "epoch": 0.7144751498435262, + "grad_norm": 4.011229515075684, + "learning_rate": 3.8092080835941226e-05, + "loss": 0.6428, + "step": 80820 + }, + { + "epoch": 0.7145635531038385, + "grad_norm": 2.183293104171753, + "learning_rate": 3.809060744826936e-05, + "loss": 0.6736, + "step": 80830 + }, + { + "epoch": 0.7146519563641507, + "grad_norm": 4.528976917266846, + "learning_rate": 3.808913406059749e-05, + "loss": 0.645, + "step": 80840 + }, + { + "epoch": 0.7147403596244629, + "grad_norm": 6.955418109893799, + "learning_rate": 3.808766067292562e-05, + "loss": 0.6069, + "step": 80850 + }, + { + "epoch": 0.7148287628847751, + "grad_norm": 3.8275115489959717, + "learning_rate": 3.8086187285253747e-05, + "loss": 0.7253, + "step": 80860 + }, + { + "epoch": 0.7149171661450874, + "grad_norm": 5.179789066314697, + "learning_rate": 3.808471389758188e-05, + "loss": 0.6367, + "step": 80870 + }, + { + "epoch": 0.7150055694053997, + "grad_norm": 2.03383469581604, + "learning_rate": 3.808324050991e-05, + "loss": 0.755, + "step": 80880 + }, + { + "epoch": 0.715093972665712, + "grad_norm": 2.186598777770996, + "learning_rate": 3.808176712223814e-05, + "loss": 0.7652, + "step": 80890 + }, + { + "epoch": 0.7151823759260242, + "grad_norm": 4.560229778289795, + "learning_rate": 3.808029373456627e-05, + "loss": 0.6176, + "step": 80900 + }, + { + "epoch": 0.7152707791863364, + "grad_norm": 1.5504982471466064, + "learning_rate": 3.8078820346894395e-05, + "loss": 0.7417, + "step": 80910 + }, + { + "epoch": 0.7153591824466486, + "grad_norm": 1.2862977981567383, + "learning_rate": 3.8077346959222523e-05, + "loss": 0.636, + "step": 80920 + }, + { + "epoch": 0.7154475857069609, + "grad_norm": 2.8133223056793213, + "learning_rate": 3.807587357155065e-05, + "loss": 0.7344, + "step": 80930 + }, + { + "epoch": 0.7155359889672731, + "grad_norm": 3.200807809829712, + "learning_rate": 3.807440018387878e-05, + "loss": 0.7446, + "step": 80940 + }, + { + "epoch": 0.7156243922275853, + "grad_norm": 11.442713737487793, + "learning_rate": 3.8072926796206915e-05, + "loss": 0.77, + "step": 80950 + }, + { + "epoch": 0.7157127954878976, + "grad_norm": 1.8898372650146484, + "learning_rate": 3.8071453408535044e-05, + "loss": 0.6343, + "step": 80960 + }, + { + "epoch": 0.7158011987482098, + "grad_norm": 4.321465969085693, + "learning_rate": 3.806998002086317e-05, + "loss": 0.7081, + "step": 80970 + }, + { + "epoch": 0.715889602008522, + "grad_norm": 1.2546701431274414, + "learning_rate": 3.80685066331913e-05, + "loss": 0.6125, + "step": 80980 + }, + { + "epoch": 0.7159780052688343, + "grad_norm": 2.626376152038574, + "learning_rate": 3.806703324551943e-05, + "loss": 0.6884, + "step": 80990 + }, + { + "epoch": 0.7160664085291466, + "grad_norm": 1.4480594396591187, + "learning_rate": 3.806555985784756e-05, + "loss": 0.6814, + "step": 81000 + }, + { + "epoch": 0.7161548117894588, + "grad_norm": 3.1579535007476807, + "learning_rate": 3.8064086470175685e-05, + "loss": 0.6906, + "step": 81010 + }, + { + "epoch": 0.7162432150497711, + "grad_norm": 1.7976185083389282, + "learning_rate": 3.806261308250382e-05, + "loss": 0.6705, + "step": 81020 + }, + { + "epoch": 0.7163316183100833, + "grad_norm": 1.3817342519760132, + "learning_rate": 3.806113969483195e-05, + "loss": 0.7549, + "step": 81030 + }, + { + "epoch": 0.7164200215703955, + "grad_norm": 1.3976129293441772, + "learning_rate": 3.805966630716008e-05, + "loss": 0.6479, + "step": 81040 + }, + { + "epoch": 0.7165084248307078, + "grad_norm": 3.1552183628082275, + "learning_rate": 3.8058192919488206e-05, + "loss": 0.6905, + "step": 81050 + }, + { + "epoch": 0.71659682809102, + "grad_norm": 6.283140659332275, + "learning_rate": 3.8056719531816334e-05, + "loss": 0.5697, + "step": 81060 + }, + { + "epoch": 0.7166852313513322, + "grad_norm": 3.000025510787964, + "learning_rate": 3.805524614414446e-05, + "loss": 0.7089, + "step": 81070 + }, + { + "epoch": 0.7167736346116445, + "grad_norm": 3.9269211292266846, + "learning_rate": 3.80537727564726e-05, + "loss": 0.659, + "step": 81080 + }, + { + "epoch": 0.7168620378719567, + "grad_norm": 1.69406259059906, + "learning_rate": 3.8052299368800726e-05, + "loss": 0.7369, + "step": 81090 + }, + { + "epoch": 0.7169504411322689, + "grad_norm": 2.697906017303467, + "learning_rate": 3.8050825981128854e-05, + "loss": 0.6946, + "step": 81100 + }, + { + "epoch": 0.7170388443925813, + "grad_norm": 3.463474750518799, + "learning_rate": 3.804935259345698e-05, + "loss": 0.6458, + "step": 81110 + }, + { + "epoch": 0.7171272476528935, + "grad_norm": 1.4377788305282593, + "learning_rate": 3.804787920578511e-05, + "loss": 0.7313, + "step": 81120 + }, + { + "epoch": 0.7172156509132057, + "grad_norm": 4.773625373840332, + "learning_rate": 3.804640581811324e-05, + "loss": 0.598, + "step": 81130 + }, + { + "epoch": 0.717304054173518, + "grad_norm": 4.413767337799072, + "learning_rate": 3.8044932430441374e-05, + "loss": 0.6287, + "step": 81140 + }, + { + "epoch": 0.7173924574338302, + "grad_norm": 2.012118101119995, + "learning_rate": 3.8043459042769496e-05, + "loss": 0.664, + "step": 81150 + }, + { + "epoch": 0.7174808606941424, + "grad_norm": 3.5045692920684814, + "learning_rate": 3.804198565509763e-05, + "loss": 0.6162, + "step": 81160 + }, + { + "epoch": 0.7175692639544546, + "grad_norm": 5.1168999671936035, + "learning_rate": 3.804051226742576e-05, + "loss": 0.6448, + "step": 81170 + }, + { + "epoch": 0.7176576672147669, + "grad_norm": 8.003167152404785, + "learning_rate": 3.803903887975389e-05, + "loss": 0.7621, + "step": 81180 + }, + { + "epoch": 0.7177460704750791, + "grad_norm": 1.8028844594955444, + "learning_rate": 3.8037565492082016e-05, + "loss": 0.6371, + "step": 81190 + }, + { + "epoch": 0.7178344737353913, + "grad_norm": 3.978248357772827, + "learning_rate": 3.803609210441015e-05, + "loss": 0.6651, + "step": 81200 + }, + { + "epoch": 0.7179228769957036, + "grad_norm": 1.696570634841919, + "learning_rate": 3.803461871673827e-05, + "loss": 0.6472, + "step": 81210 + }, + { + "epoch": 0.7180112802560158, + "grad_norm": 1.6071555614471436, + "learning_rate": 3.803314532906641e-05, + "loss": 0.589, + "step": 81220 + }, + { + "epoch": 0.7180996835163281, + "grad_norm": 1.5361149311065674, + "learning_rate": 3.803167194139453e-05, + "loss": 0.5857, + "step": 81230 + }, + { + "epoch": 0.7181880867766404, + "grad_norm": 1.8883726596832275, + "learning_rate": 3.8030198553722665e-05, + "loss": 0.6103, + "step": 81240 + }, + { + "epoch": 0.7182764900369526, + "grad_norm": 1.2080509662628174, + "learning_rate": 3.802872516605079e-05, + "loss": 0.5576, + "step": 81250 + }, + { + "epoch": 0.7183648932972648, + "grad_norm": 2.866602897644043, + "learning_rate": 3.802725177837892e-05, + "loss": 0.6364, + "step": 81260 + }, + { + "epoch": 0.7184532965575771, + "grad_norm": 2.0462229251861572, + "learning_rate": 3.802577839070705e-05, + "loss": 0.7218, + "step": 81270 + }, + { + "epoch": 0.7185416998178893, + "grad_norm": 1.7809785604476929, + "learning_rate": 3.8024305003035185e-05, + "loss": 0.6613, + "step": 81280 + }, + { + "epoch": 0.7186301030782015, + "grad_norm": 1.510802149772644, + "learning_rate": 3.8022831615363306e-05, + "loss": 0.7402, + "step": 81290 + }, + { + "epoch": 0.7187185063385138, + "grad_norm": 6.516298294067383, + "learning_rate": 3.802135822769144e-05, + "loss": 0.8887, + "step": 81300 + }, + { + "epoch": 0.718806909598826, + "grad_norm": 3.8273253440856934, + "learning_rate": 3.801988484001957e-05, + "loss": 0.6008, + "step": 81310 + }, + { + "epoch": 0.7188953128591382, + "grad_norm": 1.5741015672683716, + "learning_rate": 3.80184114523477e-05, + "loss": 0.6555, + "step": 81320 + }, + { + "epoch": 0.7189837161194504, + "grad_norm": 2.636172294616699, + "learning_rate": 3.8016938064675827e-05, + "loss": 0.7384, + "step": 81330 + }, + { + "epoch": 0.7190721193797627, + "grad_norm": 4.42274284362793, + "learning_rate": 3.801546467700396e-05, + "loss": 0.6152, + "step": 81340 + }, + { + "epoch": 0.719160522640075, + "grad_norm": 3.990666627883911, + "learning_rate": 3.801399128933208e-05, + "loss": 0.6806, + "step": 81350 + }, + { + "epoch": 0.7192489259003872, + "grad_norm": 4.11974573135376, + "learning_rate": 3.801251790166022e-05, + "loss": 0.6603, + "step": 81360 + }, + { + "epoch": 0.7193373291606995, + "grad_norm": 1.5518118143081665, + "learning_rate": 3.801104451398834e-05, + "loss": 0.656, + "step": 81370 + }, + { + "epoch": 0.7194257324210117, + "grad_norm": 1.8162482976913452, + "learning_rate": 3.8009571126316475e-05, + "loss": 0.6368, + "step": 81380 + }, + { + "epoch": 0.7195141356813239, + "grad_norm": 13.060083389282227, + "learning_rate": 3.8008097738644604e-05, + "loss": 0.6244, + "step": 81390 + }, + { + "epoch": 0.7196025389416362, + "grad_norm": 1.791451096534729, + "learning_rate": 3.800662435097273e-05, + "loss": 0.6481, + "step": 81400 + }, + { + "epoch": 0.7196909422019484, + "grad_norm": 2.118098735809326, + "learning_rate": 3.800515096330086e-05, + "loss": 0.7088, + "step": 81410 + }, + { + "epoch": 0.7197793454622606, + "grad_norm": 1.72208833694458, + "learning_rate": 3.8003677575628995e-05, + "loss": 0.6061, + "step": 81420 + }, + { + "epoch": 0.7198677487225729, + "grad_norm": 5.208735466003418, + "learning_rate": 3.800220418795712e-05, + "loss": 0.6558, + "step": 81430 + }, + { + "epoch": 0.7199561519828851, + "grad_norm": 16.971261978149414, + "learning_rate": 3.800073080028525e-05, + "loss": 0.7517, + "step": 81440 + }, + { + "epoch": 0.7200445552431973, + "grad_norm": 1.423710584640503, + "learning_rate": 3.799925741261338e-05, + "loss": 0.5638, + "step": 81450 + }, + { + "epoch": 0.7201329585035096, + "grad_norm": 10.31123161315918, + "learning_rate": 3.799778402494151e-05, + "loss": 0.6973, + "step": 81460 + }, + { + "epoch": 0.7202213617638219, + "grad_norm": 2.5266685485839844, + "learning_rate": 3.799631063726964e-05, + "loss": 0.5689, + "step": 81470 + }, + { + "epoch": 0.7203097650241341, + "grad_norm": 1.584389567375183, + "learning_rate": 3.7994837249597765e-05, + "loss": 0.6623, + "step": 81480 + }, + { + "epoch": 0.7203981682844464, + "grad_norm": 5.045012474060059, + "learning_rate": 3.7993363861925894e-05, + "loss": 0.5619, + "step": 81490 + }, + { + "epoch": 0.7204865715447586, + "grad_norm": 1.9340811967849731, + "learning_rate": 3.799189047425403e-05, + "loss": 0.6351, + "step": 81500 + }, + { + "epoch": 0.7205749748050708, + "grad_norm": 2.4334867000579834, + "learning_rate": 3.799041708658215e-05, + "loss": 0.6287, + "step": 81510 + }, + { + "epoch": 0.720663378065383, + "grad_norm": 3.7819254398345947, + "learning_rate": 3.7988943698910286e-05, + "loss": 0.6718, + "step": 81520 + }, + { + "epoch": 0.7207517813256953, + "grad_norm": 2.841998815536499, + "learning_rate": 3.7987470311238414e-05, + "loss": 0.6195, + "step": 81530 + }, + { + "epoch": 0.7208401845860075, + "grad_norm": 1.7949743270874023, + "learning_rate": 3.798599692356654e-05, + "loss": 0.6644, + "step": 81540 + }, + { + "epoch": 0.7209285878463197, + "grad_norm": 6.6643853187561035, + "learning_rate": 3.798452353589467e-05, + "loss": 0.6357, + "step": 81550 + }, + { + "epoch": 0.721016991106632, + "grad_norm": 1.5048494338989258, + "learning_rate": 3.7983050148222806e-05, + "loss": 0.6431, + "step": 81560 + }, + { + "epoch": 0.7211053943669442, + "grad_norm": 2.609926700592041, + "learning_rate": 3.798157676055093e-05, + "loss": 0.7046, + "step": 81570 + }, + { + "epoch": 0.7211937976272564, + "grad_norm": 5.734683990478516, + "learning_rate": 3.798010337287906e-05, + "loss": 0.6196, + "step": 81580 + }, + { + "epoch": 0.7212822008875688, + "grad_norm": 3.050089120864868, + "learning_rate": 3.7978629985207184e-05, + "loss": 0.7234, + "step": 81590 + }, + { + "epoch": 0.721370604147881, + "grad_norm": 2.5078630447387695, + "learning_rate": 3.797715659753532e-05, + "loss": 0.7657, + "step": 81600 + }, + { + "epoch": 0.7214590074081932, + "grad_norm": 2.299607276916504, + "learning_rate": 3.797568320986345e-05, + "loss": 0.7014, + "step": 81610 + }, + { + "epoch": 0.7215474106685055, + "grad_norm": 4.529541492462158, + "learning_rate": 3.7974209822191576e-05, + "loss": 0.6828, + "step": 81620 + }, + { + "epoch": 0.7216358139288177, + "grad_norm": 5.3901448249816895, + "learning_rate": 3.7972736434519704e-05, + "loss": 0.6141, + "step": 81630 + }, + { + "epoch": 0.7217242171891299, + "grad_norm": 2.1895015239715576, + "learning_rate": 3.797126304684784e-05, + "loss": 0.6778, + "step": 81640 + }, + { + "epoch": 0.7218126204494422, + "grad_norm": 5.135331153869629, + "learning_rate": 3.796978965917596e-05, + "loss": 0.8217, + "step": 81650 + }, + { + "epoch": 0.7219010237097544, + "grad_norm": 1.9905357360839844, + "learning_rate": 3.7968316271504096e-05, + "loss": 0.6438, + "step": 81660 + }, + { + "epoch": 0.7219894269700666, + "grad_norm": 13.713616371154785, + "learning_rate": 3.7966842883832225e-05, + "loss": 0.7791, + "step": 81670 + }, + { + "epoch": 0.7220778302303789, + "grad_norm": 3.2609429359436035, + "learning_rate": 3.796536949616035e-05, + "loss": 0.6227, + "step": 81680 + }, + { + "epoch": 0.7221662334906911, + "grad_norm": 2.5888688564300537, + "learning_rate": 3.796389610848848e-05, + "loss": 0.5572, + "step": 81690 + }, + { + "epoch": 0.7222546367510034, + "grad_norm": 1.1970055103302002, + "learning_rate": 3.796242272081661e-05, + "loss": 0.5683, + "step": 81700 + }, + { + "epoch": 0.7223430400113157, + "grad_norm": 7.1145453453063965, + "learning_rate": 3.796094933314474e-05, + "loss": 0.5811, + "step": 81710 + }, + { + "epoch": 0.7224314432716279, + "grad_norm": 4.881281852722168, + "learning_rate": 3.795947594547287e-05, + "loss": 0.7372, + "step": 81720 + }, + { + "epoch": 0.7225198465319401, + "grad_norm": 4.914188861846924, + "learning_rate": 3.7958002557800995e-05, + "loss": 0.4989, + "step": 81730 + }, + { + "epoch": 0.7226082497922524, + "grad_norm": 4.660118579864502, + "learning_rate": 3.795652917012913e-05, + "loss": 0.6057, + "step": 81740 + }, + { + "epoch": 0.7226966530525646, + "grad_norm": 1.2871594429016113, + "learning_rate": 3.795505578245726e-05, + "loss": 0.5653, + "step": 81750 + }, + { + "epoch": 0.7227850563128768, + "grad_norm": 2.805406332015991, + "learning_rate": 3.7953582394785386e-05, + "loss": 0.7284, + "step": 81760 + }, + { + "epoch": 0.722873459573189, + "grad_norm": 6.828197956085205, + "learning_rate": 3.7952109007113515e-05, + "loss": 0.7238, + "step": 81770 + }, + { + "epoch": 0.7229618628335013, + "grad_norm": 5.44737434387207, + "learning_rate": 3.795063561944165e-05, + "loss": 0.6699, + "step": 81780 + }, + { + "epoch": 0.7230502660938135, + "grad_norm": 1.4210898876190186, + "learning_rate": 3.794916223176977e-05, + "loss": 0.602, + "step": 81790 + }, + { + "epoch": 0.7231386693541257, + "grad_norm": 3.5432026386260986, + "learning_rate": 3.794768884409791e-05, + "loss": 0.7718, + "step": 81800 + }, + { + "epoch": 0.723227072614438, + "grad_norm": 3.968459367752075, + "learning_rate": 3.7946215456426035e-05, + "loss": 0.6815, + "step": 81810 + }, + { + "epoch": 0.7233154758747503, + "grad_norm": 3.1968178749084473, + "learning_rate": 3.794474206875416e-05, + "loss": 0.7348, + "step": 81820 + }, + { + "epoch": 0.7234038791350625, + "grad_norm": 2.341172218322754, + "learning_rate": 3.794326868108229e-05, + "loss": 0.5412, + "step": 81830 + }, + { + "epoch": 0.7234922823953748, + "grad_norm": 2.320580005645752, + "learning_rate": 3.794179529341042e-05, + "loss": 0.7299, + "step": 81840 + }, + { + "epoch": 0.723580685655687, + "grad_norm": 3.793396234512329, + "learning_rate": 3.794032190573855e-05, + "loss": 0.7211, + "step": 81850 + }, + { + "epoch": 0.7236690889159992, + "grad_norm": 7.644009113311768, + "learning_rate": 3.7938848518066684e-05, + "loss": 0.6952, + "step": 81860 + }, + { + "epoch": 0.7237574921763115, + "grad_norm": 5.067923069000244, + "learning_rate": 3.793737513039481e-05, + "loss": 0.735, + "step": 81870 + }, + { + "epoch": 0.7238458954366237, + "grad_norm": 7.491464138031006, + "learning_rate": 3.793590174272294e-05, + "loss": 0.6496, + "step": 81880 + }, + { + "epoch": 0.7239342986969359, + "grad_norm": 3.3565855026245117, + "learning_rate": 3.793442835505107e-05, + "loss": 0.6818, + "step": 81890 + }, + { + "epoch": 0.7240227019572482, + "grad_norm": 2.9698851108551025, + "learning_rate": 3.79329549673792e-05, + "loss": 0.6887, + "step": 81900 + }, + { + "epoch": 0.7241111052175604, + "grad_norm": 3.4320688247680664, + "learning_rate": 3.7931481579707325e-05, + "loss": 0.7373, + "step": 81910 + }, + { + "epoch": 0.7241995084778726, + "grad_norm": 4.822113513946533, + "learning_rate": 3.793000819203546e-05, + "loss": 0.6452, + "step": 81920 + }, + { + "epoch": 0.7242879117381849, + "grad_norm": 2.3925328254699707, + "learning_rate": 3.792853480436359e-05, + "loss": 0.6385, + "step": 81930 + }, + { + "epoch": 0.7243763149984972, + "grad_norm": 7.681417942047119, + "learning_rate": 3.792706141669172e-05, + "loss": 0.6281, + "step": 81940 + }, + { + "epoch": 0.7244647182588094, + "grad_norm": 6.202743053436279, + "learning_rate": 3.7925588029019846e-05, + "loss": 0.689, + "step": 81950 + }, + { + "epoch": 0.7245531215191217, + "grad_norm": 4.275755882263184, + "learning_rate": 3.7924114641347974e-05, + "loss": 0.6615, + "step": 81960 + }, + { + "epoch": 0.7246415247794339, + "grad_norm": 1.701641321182251, + "learning_rate": 3.79226412536761e-05, + "loss": 0.6777, + "step": 81970 + }, + { + "epoch": 0.7247299280397461, + "grad_norm": 3.189011812210083, + "learning_rate": 3.792116786600423e-05, + "loss": 0.7767, + "step": 81980 + }, + { + "epoch": 0.7248183313000583, + "grad_norm": 4.4323272705078125, + "learning_rate": 3.7919694478332366e-05, + "loss": 0.6551, + "step": 81990 + }, + { + "epoch": 0.7249067345603706, + "grad_norm": 3.3805644512176514, + "learning_rate": 3.7918221090660494e-05, + "loss": 0.6323, + "step": 82000 + }, + { + "epoch": 0.7249951378206828, + "grad_norm": 1.8189047574996948, + "learning_rate": 3.791674770298862e-05, + "loss": 0.6673, + "step": 82010 + }, + { + "epoch": 0.725083541080995, + "grad_norm": 3.39742112159729, + "learning_rate": 3.791527431531675e-05, + "loss": 0.7428, + "step": 82020 + }, + { + "epoch": 0.7251719443413073, + "grad_norm": 8.843940734863281, + "learning_rate": 3.791380092764488e-05, + "loss": 0.641, + "step": 82030 + }, + { + "epoch": 0.7252603476016195, + "grad_norm": 2.9795138835906982, + "learning_rate": 3.791232753997301e-05, + "loss": 0.6441, + "step": 82040 + }, + { + "epoch": 0.7253487508619317, + "grad_norm": 1.0929025411605835, + "learning_rate": 3.791085415230114e-05, + "loss": 0.6991, + "step": 82050 + }, + { + "epoch": 0.7254371541222441, + "grad_norm": 0.9400179386138916, + "learning_rate": 3.7909380764629264e-05, + "loss": 0.6448, + "step": 82060 + }, + { + "epoch": 0.7255255573825563, + "grad_norm": 5.6500773429870605, + "learning_rate": 3.79079073769574e-05, + "loss": 0.7272, + "step": 82070 + }, + { + "epoch": 0.7256139606428685, + "grad_norm": 12.625765800476074, + "learning_rate": 3.790643398928553e-05, + "loss": 0.6164, + "step": 82080 + }, + { + "epoch": 0.7257023639031808, + "grad_norm": 2.4194607734680176, + "learning_rate": 3.7904960601613656e-05, + "loss": 0.6672, + "step": 82090 + }, + { + "epoch": 0.725790767163493, + "grad_norm": 4.919355869293213, + "learning_rate": 3.7903487213941784e-05, + "loss": 0.6211, + "step": 82100 + }, + { + "epoch": 0.7258791704238052, + "grad_norm": 1.40232515335083, + "learning_rate": 3.790201382626992e-05, + "loss": 0.5534, + "step": 82110 + }, + { + "epoch": 0.7259675736841175, + "grad_norm": 0.8732109069824219, + "learning_rate": 3.790054043859804e-05, + "loss": 0.6815, + "step": 82120 + }, + { + "epoch": 0.7260559769444297, + "grad_norm": 1.5525437593460083, + "learning_rate": 3.7899067050926176e-05, + "loss": 0.8336, + "step": 82130 + }, + { + "epoch": 0.7261443802047419, + "grad_norm": 2.76408314704895, + "learning_rate": 3.7897593663254305e-05, + "loss": 0.7518, + "step": 82140 + }, + { + "epoch": 0.7262327834650542, + "grad_norm": 2.739830255508423, + "learning_rate": 3.789612027558243e-05, + "loss": 0.5977, + "step": 82150 + }, + { + "epoch": 0.7263211867253664, + "grad_norm": 1.6819652318954468, + "learning_rate": 3.789464688791056e-05, + "loss": 0.6824, + "step": 82160 + }, + { + "epoch": 0.7264095899856786, + "grad_norm": 3.837034225463867, + "learning_rate": 3.7893173500238696e-05, + "loss": 0.614, + "step": 82170 + }, + { + "epoch": 0.726497993245991, + "grad_norm": 2.2636003494262695, + "learning_rate": 3.789170011256682e-05, + "loss": 0.8269, + "step": 82180 + }, + { + "epoch": 0.7265863965063032, + "grad_norm": 2.8763487339019775, + "learning_rate": 3.789022672489495e-05, + "loss": 0.6978, + "step": 82190 + }, + { + "epoch": 0.7266747997666154, + "grad_norm": 1.058937668800354, + "learning_rate": 3.7888753337223075e-05, + "loss": 0.714, + "step": 82200 + }, + { + "epoch": 0.7267632030269277, + "grad_norm": 2.0732810497283936, + "learning_rate": 3.788727994955121e-05, + "loss": 0.6954, + "step": 82210 + }, + { + "epoch": 0.7268516062872399, + "grad_norm": 11.375685691833496, + "learning_rate": 3.788580656187934e-05, + "loss": 0.6545, + "step": 82220 + }, + { + "epoch": 0.7269400095475521, + "grad_norm": 1.0989155769348145, + "learning_rate": 3.7884333174207467e-05, + "loss": 0.6257, + "step": 82230 + }, + { + "epoch": 0.7270284128078643, + "grad_norm": 1.6817965507507324, + "learning_rate": 3.7882859786535595e-05, + "loss": 0.4865, + "step": 82240 + }, + { + "epoch": 0.7271168160681766, + "grad_norm": 6.18862247467041, + "learning_rate": 3.788138639886373e-05, + "loss": 0.7283, + "step": 82250 + }, + { + "epoch": 0.7272052193284888, + "grad_norm": 16.718080520629883, + "learning_rate": 3.787991301119185e-05, + "loss": 0.7228, + "step": 82260 + }, + { + "epoch": 0.727293622588801, + "grad_norm": 3.0961620807647705, + "learning_rate": 3.787843962351999e-05, + "loss": 0.7523, + "step": 82270 + }, + { + "epoch": 0.7273820258491133, + "grad_norm": 16.639848709106445, + "learning_rate": 3.7876966235848115e-05, + "loss": 0.6769, + "step": 82280 + }, + { + "epoch": 0.7274704291094256, + "grad_norm": 9.027290344238281, + "learning_rate": 3.7875492848176243e-05, + "loss": 0.6618, + "step": 82290 + }, + { + "epoch": 0.7275588323697378, + "grad_norm": 4.046119213104248, + "learning_rate": 3.787401946050437e-05, + "loss": 0.768, + "step": 82300 + }, + { + "epoch": 0.7276472356300501, + "grad_norm": 2.05656361579895, + "learning_rate": 3.78725460728325e-05, + "loss": 0.7273, + "step": 82310 + }, + { + "epoch": 0.7277356388903623, + "grad_norm": 2.0593655109405518, + "learning_rate": 3.787107268516063e-05, + "loss": 0.7483, + "step": 82320 + }, + { + "epoch": 0.7278240421506745, + "grad_norm": 3.8659236431121826, + "learning_rate": 3.7869599297488764e-05, + "loss": 0.796, + "step": 82330 + }, + { + "epoch": 0.7279124454109868, + "grad_norm": 2.786943197250366, + "learning_rate": 3.7868125909816885e-05, + "loss": 0.678, + "step": 82340 + }, + { + "epoch": 0.728000848671299, + "grad_norm": 4.119709491729736, + "learning_rate": 3.786665252214502e-05, + "loss": 0.6518, + "step": 82350 + }, + { + "epoch": 0.7280892519316112, + "grad_norm": 2.0500283241271973, + "learning_rate": 3.786517913447315e-05, + "loss": 0.5561, + "step": 82360 + }, + { + "epoch": 0.7281776551919235, + "grad_norm": 3.5243289470672607, + "learning_rate": 3.786370574680128e-05, + "loss": 0.7494, + "step": 82370 + }, + { + "epoch": 0.7282660584522357, + "grad_norm": 5.052689075469971, + "learning_rate": 3.7862232359129405e-05, + "loss": 0.713, + "step": 82380 + }, + { + "epoch": 0.7283544617125479, + "grad_norm": 1.687080979347229, + "learning_rate": 3.786075897145754e-05, + "loss": 0.6209, + "step": 82390 + }, + { + "epoch": 0.7284428649728601, + "grad_norm": 1.6656676530838013, + "learning_rate": 3.785928558378566e-05, + "loss": 0.6535, + "step": 82400 + }, + { + "epoch": 0.7285312682331725, + "grad_norm": 2.815579652786255, + "learning_rate": 3.78578121961138e-05, + "loss": 0.555, + "step": 82410 + }, + { + "epoch": 0.7286196714934847, + "grad_norm": 9.232556343078613, + "learning_rate": 3.785633880844192e-05, + "loss": 0.6742, + "step": 82420 + }, + { + "epoch": 0.728708074753797, + "grad_norm": 5.655826568603516, + "learning_rate": 3.7854865420770054e-05, + "loss": 0.7752, + "step": 82430 + }, + { + "epoch": 0.7287964780141092, + "grad_norm": 5.988077640533447, + "learning_rate": 3.785339203309818e-05, + "loss": 0.6712, + "step": 82440 + }, + { + "epoch": 0.7288848812744214, + "grad_norm": 0.9591336846351624, + "learning_rate": 3.785191864542631e-05, + "loss": 0.5992, + "step": 82450 + }, + { + "epoch": 0.7289732845347336, + "grad_norm": 1.510759711265564, + "learning_rate": 3.785044525775444e-05, + "loss": 0.6131, + "step": 82460 + }, + { + "epoch": 0.7290616877950459, + "grad_norm": 13.347774505615234, + "learning_rate": 3.7848971870082574e-05, + "loss": 0.5697, + "step": 82470 + }, + { + "epoch": 0.7291500910553581, + "grad_norm": 2.2034530639648438, + "learning_rate": 3.7847498482410696e-05, + "loss": 0.8204, + "step": 82480 + }, + { + "epoch": 0.7292384943156703, + "grad_norm": 2.9784910678863525, + "learning_rate": 3.784602509473883e-05, + "loss": 0.775, + "step": 82490 + }, + { + "epoch": 0.7293268975759826, + "grad_norm": 4.6114397048950195, + "learning_rate": 3.784455170706696e-05, + "loss": 0.7688, + "step": 82500 + }, + { + "epoch": 0.7294153008362948, + "grad_norm": 7.151790142059326, + "learning_rate": 3.784307831939509e-05, + "loss": 0.6117, + "step": 82510 + }, + { + "epoch": 0.729503704096607, + "grad_norm": 4.189168453216553, + "learning_rate": 3.7841604931723216e-05, + "loss": 0.6043, + "step": 82520 + }, + { + "epoch": 0.7295921073569194, + "grad_norm": 1.0064668655395508, + "learning_rate": 3.7840131544051344e-05, + "loss": 0.5606, + "step": 82530 + }, + { + "epoch": 0.7296805106172316, + "grad_norm": 3.5928053855895996, + "learning_rate": 3.783865815637947e-05, + "loss": 0.6251, + "step": 82540 + }, + { + "epoch": 0.7297689138775438, + "grad_norm": 1.864996075630188, + "learning_rate": 3.783718476870761e-05, + "loss": 0.5181, + "step": 82550 + }, + { + "epoch": 0.7298573171378561, + "grad_norm": 1.9529139995574951, + "learning_rate": 3.783571138103573e-05, + "loss": 0.5528, + "step": 82560 + }, + { + "epoch": 0.7299457203981683, + "grad_norm": 7.490105152130127, + "learning_rate": 3.7834237993363864e-05, + "loss": 0.7557, + "step": 82570 + }, + { + "epoch": 0.7300341236584805, + "grad_norm": 1.795874834060669, + "learning_rate": 3.783276460569199e-05, + "loss": 0.7436, + "step": 82580 + }, + { + "epoch": 0.7301225269187928, + "grad_norm": 3.031679630279541, + "learning_rate": 3.783129121802012e-05, + "loss": 0.5611, + "step": 82590 + }, + { + "epoch": 0.730210930179105, + "grad_norm": 2.3319554328918457, + "learning_rate": 3.782981783034825e-05, + "loss": 0.7378, + "step": 82600 + }, + { + "epoch": 0.7302993334394172, + "grad_norm": 1.281991958618164, + "learning_rate": 3.7828344442676385e-05, + "loss": 0.4882, + "step": 82610 + }, + { + "epoch": 0.7303877366997295, + "grad_norm": 1.561753273010254, + "learning_rate": 3.7826871055004506e-05, + "loss": 0.75, + "step": 82620 + }, + { + "epoch": 0.7304761399600417, + "grad_norm": 3.4038097858428955, + "learning_rate": 3.782539766733264e-05, + "loss": 0.7109, + "step": 82630 + }, + { + "epoch": 0.7305645432203539, + "grad_norm": 1.3466259241104126, + "learning_rate": 3.782392427966077e-05, + "loss": 0.604, + "step": 82640 + }, + { + "epoch": 0.7306529464806663, + "grad_norm": 3.2525291442871094, + "learning_rate": 3.78224508919889e-05, + "loss": 0.6011, + "step": 82650 + }, + { + "epoch": 0.7307413497409785, + "grad_norm": 1.857112169265747, + "learning_rate": 3.7820977504317026e-05, + "loss": 0.6465, + "step": 82660 + }, + { + "epoch": 0.7308297530012907, + "grad_norm": 2.114192247390747, + "learning_rate": 3.7819504116645155e-05, + "loss": 0.6263, + "step": 82670 + }, + { + "epoch": 0.730918156261603, + "grad_norm": 2.1825144290924072, + "learning_rate": 3.781803072897328e-05, + "loss": 0.7194, + "step": 82680 + }, + { + "epoch": 0.7310065595219152, + "grad_norm": 1.5394127368927002, + "learning_rate": 3.781655734130142e-05, + "loss": 0.7264, + "step": 82690 + }, + { + "epoch": 0.7310949627822274, + "grad_norm": 2.3071682453155518, + "learning_rate": 3.781508395362954e-05, + "loss": 0.816, + "step": 82700 + }, + { + "epoch": 0.7311833660425396, + "grad_norm": 2.225712776184082, + "learning_rate": 3.7813610565957675e-05, + "loss": 0.7076, + "step": 82710 + }, + { + "epoch": 0.7312717693028519, + "grad_norm": 1.7534329891204834, + "learning_rate": 3.78121371782858e-05, + "loss": 0.6413, + "step": 82720 + }, + { + "epoch": 0.7313601725631641, + "grad_norm": 2.1758055686950684, + "learning_rate": 3.781066379061393e-05, + "loss": 0.644, + "step": 82730 + }, + { + "epoch": 0.7314485758234763, + "grad_norm": 6.965531349182129, + "learning_rate": 3.780919040294206e-05, + "loss": 0.7453, + "step": 82740 + }, + { + "epoch": 0.7315369790837886, + "grad_norm": 7.385221004486084, + "learning_rate": 3.7807717015270195e-05, + "loss": 0.7283, + "step": 82750 + }, + { + "epoch": 0.7316253823441009, + "grad_norm": 1.3409184217453003, + "learning_rate": 3.780624362759832e-05, + "loss": 0.6674, + "step": 82760 + }, + { + "epoch": 0.7317137856044131, + "grad_norm": 2.5681004524230957, + "learning_rate": 3.780477023992645e-05, + "loss": 0.6004, + "step": 82770 + }, + { + "epoch": 0.7318021888647254, + "grad_norm": 2.5477404594421387, + "learning_rate": 3.780329685225458e-05, + "loss": 0.6222, + "step": 82780 + }, + { + "epoch": 0.7318905921250376, + "grad_norm": 4.637944221496582, + "learning_rate": 3.780182346458271e-05, + "loss": 0.6573, + "step": 82790 + }, + { + "epoch": 0.7319789953853498, + "grad_norm": 1.322871208190918, + "learning_rate": 3.780035007691084e-05, + "loss": 0.646, + "step": 82800 + }, + { + "epoch": 0.7320673986456621, + "grad_norm": 1.9769188165664673, + "learning_rate": 3.7798876689238965e-05, + "loss": 0.6736, + "step": 82810 + }, + { + "epoch": 0.7321558019059743, + "grad_norm": 2.6647613048553467, + "learning_rate": 3.7797403301567094e-05, + "loss": 0.7257, + "step": 82820 + }, + { + "epoch": 0.7322442051662865, + "grad_norm": 9.220582008361816, + "learning_rate": 3.779592991389523e-05, + "loss": 0.7633, + "step": 82830 + }, + { + "epoch": 0.7323326084265988, + "grad_norm": 1.625688076019287, + "learning_rate": 3.779445652622336e-05, + "loss": 0.7025, + "step": 82840 + }, + { + "epoch": 0.732421011686911, + "grad_norm": 4.385787487030029, + "learning_rate": 3.7792983138551485e-05, + "loss": 0.8389, + "step": 82850 + }, + { + "epoch": 0.7325094149472232, + "grad_norm": 1.5261093378067017, + "learning_rate": 3.7791509750879614e-05, + "loss": 0.6195, + "step": 82860 + }, + { + "epoch": 0.7325978182075354, + "grad_norm": 2.3391811847686768, + "learning_rate": 3.779003636320774e-05, + "loss": 0.5418, + "step": 82870 + }, + { + "epoch": 0.7326862214678478, + "grad_norm": 3.247581958770752, + "learning_rate": 3.778856297553587e-05, + "loss": 0.6039, + "step": 82880 + }, + { + "epoch": 0.73277462472816, + "grad_norm": 1.761211633682251, + "learning_rate": 3.7787089587864e-05, + "loss": 0.6608, + "step": 82890 + }, + { + "epoch": 0.7328630279884722, + "grad_norm": 6.203659534454346, + "learning_rate": 3.7785616200192134e-05, + "loss": 0.7611, + "step": 82900 + }, + { + "epoch": 0.7329514312487845, + "grad_norm": 2.116152763366699, + "learning_rate": 3.778414281252026e-05, + "loss": 0.6829, + "step": 82910 + }, + { + "epoch": 0.7330398345090967, + "grad_norm": 7.488345623016357, + "learning_rate": 3.778266942484839e-05, + "loss": 0.6787, + "step": 82920 + }, + { + "epoch": 0.7331282377694089, + "grad_norm": 3.8275585174560547, + "learning_rate": 3.778119603717652e-05, + "loss": 0.6351, + "step": 82930 + }, + { + "epoch": 0.7332166410297212, + "grad_norm": 3.363858938217163, + "learning_rate": 3.777972264950465e-05, + "loss": 0.6049, + "step": 82940 + }, + { + "epoch": 0.7333050442900334, + "grad_norm": 1.35905921459198, + "learning_rate": 3.7778249261832776e-05, + "loss": 0.585, + "step": 82950 + }, + { + "epoch": 0.7333934475503456, + "grad_norm": 6.123122692108154, + "learning_rate": 3.777677587416091e-05, + "loss": 0.6661, + "step": 82960 + }, + { + "epoch": 0.7334818508106579, + "grad_norm": 3.0487587451934814, + "learning_rate": 3.777530248648904e-05, + "loss": 0.665, + "step": 82970 + }, + { + "epoch": 0.7335702540709701, + "grad_norm": 7.3421549797058105, + "learning_rate": 3.777382909881717e-05, + "loss": 0.6879, + "step": 82980 + }, + { + "epoch": 0.7336586573312823, + "grad_norm": 1.6624705791473389, + "learning_rate": 3.7772355711145296e-05, + "loss": 0.6313, + "step": 82990 + }, + { + "epoch": 0.7337470605915947, + "grad_norm": 4.164744853973389, + "learning_rate": 3.7770882323473424e-05, + "loss": 0.6981, + "step": 83000 + }, + { + "epoch": 0.7338354638519069, + "grad_norm": 7.852123260498047, + "learning_rate": 3.776940893580155e-05, + "loss": 0.7305, + "step": 83010 + }, + { + "epoch": 0.7339238671122191, + "grad_norm": 7.19644832611084, + "learning_rate": 3.776793554812969e-05, + "loss": 0.6751, + "step": 83020 + }, + { + "epoch": 0.7340122703725314, + "grad_norm": 5.935701847076416, + "learning_rate": 3.776646216045781e-05, + "loss": 0.5763, + "step": 83030 + }, + { + "epoch": 0.7341006736328436, + "grad_norm": 8.278578758239746, + "learning_rate": 3.7764988772785945e-05, + "loss": 0.7584, + "step": 83040 + }, + { + "epoch": 0.7341890768931558, + "grad_norm": 1.1739051342010498, + "learning_rate": 3.776351538511407e-05, + "loss": 0.6151, + "step": 83050 + }, + { + "epoch": 0.734277480153468, + "grad_norm": 1.8704761266708374, + "learning_rate": 3.77620419974422e-05, + "loss": 0.6379, + "step": 83060 + }, + { + "epoch": 0.7343658834137803, + "grad_norm": 4.368412017822266, + "learning_rate": 3.776056860977033e-05, + "loss": 0.6465, + "step": 83070 + }, + { + "epoch": 0.7344542866740925, + "grad_norm": 4.229257106781006, + "learning_rate": 3.7759095222098465e-05, + "loss": 0.6963, + "step": 83080 + }, + { + "epoch": 0.7345426899344047, + "grad_norm": 9.565871238708496, + "learning_rate": 3.7757621834426586e-05, + "loss": 0.7075, + "step": 83090 + }, + { + "epoch": 0.734631093194717, + "grad_norm": 1.0253034830093384, + "learning_rate": 3.775614844675472e-05, + "loss": 0.6859, + "step": 83100 + }, + { + "epoch": 0.7347194964550292, + "grad_norm": 3.467026948928833, + "learning_rate": 3.775467505908285e-05, + "loss": 0.6699, + "step": 83110 + }, + { + "epoch": 0.7348078997153416, + "grad_norm": 1.897066354751587, + "learning_rate": 3.775320167141098e-05, + "loss": 0.7024, + "step": 83120 + }, + { + "epoch": 0.7348963029756538, + "grad_norm": 8.542876243591309, + "learning_rate": 3.7751728283739107e-05, + "loss": 0.6004, + "step": 83130 + }, + { + "epoch": 0.734984706235966, + "grad_norm": 2.2165987491607666, + "learning_rate": 3.7750254896067235e-05, + "loss": 0.6797, + "step": 83140 + }, + { + "epoch": 0.7350731094962782, + "grad_norm": 5.751006126403809, + "learning_rate": 3.774878150839536e-05, + "loss": 0.6325, + "step": 83150 + }, + { + "epoch": 0.7351615127565905, + "grad_norm": 1.4556961059570312, + "learning_rate": 3.77473081207235e-05, + "loss": 0.6399, + "step": 83160 + }, + { + "epoch": 0.7352499160169027, + "grad_norm": 15.751131057739258, + "learning_rate": 3.774583473305162e-05, + "loss": 0.7067, + "step": 83170 + }, + { + "epoch": 0.7353383192772149, + "grad_norm": 2.450960397720337, + "learning_rate": 3.7744361345379755e-05, + "loss": 0.5672, + "step": 83180 + }, + { + "epoch": 0.7354267225375272, + "grad_norm": 2.299738645553589, + "learning_rate": 3.7742887957707883e-05, + "loss": 0.6697, + "step": 83190 + }, + { + "epoch": 0.7355151257978394, + "grad_norm": 3.7470109462738037, + "learning_rate": 3.774141457003601e-05, + "loss": 0.589, + "step": 83200 + }, + { + "epoch": 0.7356035290581516, + "grad_norm": 2.5312650203704834, + "learning_rate": 3.773994118236414e-05, + "loss": 0.6406, + "step": 83210 + }, + { + "epoch": 0.7356919323184639, + "grad_norm": 2.1556906700134277, + "learning_rate": 3.7738467794692275e-05, + "loss": 0.6681, + "step": 83220 + }, + { + "epoch": 0.7357803355787761, + "grad_norm": 2.699314594268799, + "learning_rate": 3.77369944070204e-05, + "loss": 0.6691, + "step": 83230 + }, + { + "epoch": 0.7358687388390884, + "grad_norm": 2.0234696865081787, + "learning_rate": 3.773552101934853e-05, + "loss": 0.5997, + "step": 83240 + }, + { + "epoch": 0.7359571420994007, + "grad_norm": 2.5400748252868652, + "learning_rate": 3.7734047631676654e-05, + "loss": 0.685, + "step": 83250 + }, + { + "epoch": 0.7360455453597129, + "grad_norm": 5.217487335205078, + "learning_rate": 3.773257424400479e-05, + "loss": 0.6832, + "step": 83260 + }, + { + "epoch": 0.7361339486200251, + "grad_norm": 0.9310530424118042, + "learning_rate": 3.773110085633292e-05, + "loss": 0.5563, + "step": 83270 + }, + { + "epoch": 0.7362223518803374, + "grad_norm": 2.381751537322998, + "learning_rate": 3.7729627468661045e-05, + "loss": 0.7837, + "step": 83280 + }, + { + "epoch": 0.7363107551406496, + "grad_norm": 2.303239583969116, + "learning_rate": 3.7728154080989174e-05, + "loss": 0.6513, + "step": 83290 + }, + { + "epoch": 0.7363991584009618, + "grad_norm": 7.5340094566345215, + "learning_rate": 3.772668069331731e-05, + "loss": 0.8153, + "step": 83300 + }, + { + "epoch": 0.736487561661274, + "grad_norm": 3.0403056144714355, + "learning_rate": 3.772520730564543e-05, + "loss": 0.6001, + "step": 83310 + }, + { + "epoch": 0.7365759649215863, + "grad_norm": 8.362848281860352, + "learning_rate": 3.7723733917973566e-05, + "loss": 0.8238, + "step": 83320 + }, + { + "epoch": 0.7366643681818985, + "grad_norm": 1.840634822845459, + "learning_rate": 3.7722260530301694e-05, + "loss": 0.6153, + "step": 83330 + }, + { + "epoch": 0.7367527714422107, + "grad_norm": 1.6935734748840332, + "learning_rate": 3.772078714262982e-05, + "loss": 0.709, + "step": 83340 + }, + { + "epoch": 0.7368411747025231, + "grad_norm": 5.5450825691223145, + "learning_rate": 3.771931375495795e-05, + "loss": 0.6664, + "step": 83350 + }, + { + "epoch": 0.7369295779628353, + "grad_norm": 4.621242046356201, + "learning_rate": 3.771784036728608e-05, + "loss": 0.6342, + "step": 83360 + }, + { + "epoch": 0.7370179812231475, + "grad_norm": 4.499111652374268, + "learning_rate": 3.771636697961421e-05, + "loss": 0.6453, + "step": 83370 + }, + { + "epoch": 0.7371063844834598, + "grad_norm": 3.9753365516662598, + "learning_rate": 3.771489359194234e-05, + "loss": 0.537, + "step": 83380 + }, + { + "epoch": 0.737194787743772, + "grad_norm": 1.1098240613937378, + "learning_rate": 3.7713420204270464e-05, + "loss": 0.5643, + "step": 83390 + }, + { + "epoch": 0.7372831910040842, + "grad_norm": 5.1562724113464355, + "learning_rate": 3.77119468165986e-05, + "loss": 0.7421, + "step": 83400 + }, + { + "epoch": 0.7373715942643965, + "grad_norm": 1.4846030473709106, + "learning_rate": 3.771047342892673e-05, + "loss": 0.7462, + "step": 83410 + }, + { + "epoch": 0.7374599975247087, + "grad_norm": 5.749865531921387, + "learning_rate": 3.7709000041254856e-05, + "loss": 0.5743, + "step": 83420 + }, + { + "epoch": 0.7375484007850209, + "grad_norm": 5.566555023193359, + "learning_rate": 3.7707526653582984e-05, + "loss": 0.622, + "step": 83430 + }, + { + "epoch": 0.7376368040453332, + "grad_norm": 2.281686544418335, + "learning_rate": 3.770605326591112e-05, + "loss": 0.6668, + "step": 83440 + }, + { + "epoch": 0.7377252073056454, + "grad_norm": 1.5201878547668457, + "learning_rate": 3.770457987823924e-05, + "loss": 0.8028, + "step": 83450 + }, + { + "epoch": 0.7378136105659576, + "grad_norm": 1.441184639930725, + "learning_rate": 3.7703106490567376e-05, + "loss": 0.7622, + "step": 83460 + }, + { + "epoch": 0.73790201382627, + "grad_norm": 1.495642900466919, + "learning_rate": 3.77016331028955e-05, + "loss": 0.6142, + "step": 83470 + }, + { + "epoch": 0.7379904170865822, + "grad_norm": 3.465707778930664, + "learning_rate": 3.770015971522363e-05, + "loss": 0.5996, + "step": 83480 + }, + { + "epoch": 0.7380788203468944, + "grad_norm": 5.066530227661133, + "learning_rate": 3.769868632755176e-05, + "loss": 0.7761, + "step": 83490 + }, + { + "epoch": 0.7381672236072067, + "grad_norm": 4.373834609985352, + "learning_rate": 3.769721293987989e-05, + "loss": 0.6554, + "step": 83500 + }, + { + "epoch": 0.7382556268675189, + "grad_norm": 1.6781747341156006, + "learning_rate": 3.769573955220802e-05, + "loss": 0.719, + "step": 83510 + }, + { + "epoch": 0.7383440301278311, + "grad_norm": 4.214022636413574, + "learning_rate": 3.769426616453615e-05, + "loss": 0.6745, + "step": 83520 + }, + { + "epoch": 0.7384324333881433, + "grad_norm": 5.292420387268066, + "learning_rate": 3.7692792776864275e-05, + "loss": 0.728, + "step": 83530 + }, + { + "epoch": 0.7385208366484556, + "grad_norm": 1.5029011964797974, + "learning_rate": 3.769131938919241e-05, + "loss": 0.8833, + "step": 83540 + }, + { + "epoch": 0.7386092399087678, + "grad_norm": 2.422543525695801, + "learning_rate": 3.768984600152054e-05, + "loss": 0.6862, + "step": 83550 + }, + { + "epoch": 0.73869764316908, + "grad_norm": 1.2955564260482788, + "learning_rate": 3.7688372613848666e-05, + "loss": 0.6666, + "step": 83560 + }, + { + "epoch": 0.7387860464293923, + "grad_norm": 1.1184724569320679, + "learning_rate": 3.7686899226176795e-05, + "loss": 0.5686, + "step": 83570 + }, + { + "epoch": 0.7388744496897045, + "grad_norm": 4.629207134246826, + "learning_rate": 3.768542583850493e-05, + "loss": 0.6956, + "step": 83580 + }, + { + "epoch": 0.7389628529500168, + "grad_norm": 7.006459712982178, + "learning_rate": 3.768395245083305e-05, + "loss": 0.6794, + "step": 83590 + }, + { + "epoch": 0.7390512562103291, + "grad_norm": 3.655062198638916, + "learning_rate": 3.7682479063161187e-05, + "loss": 0.7095, + "step": 83600 + }, + { + "epoch": 0.7391396594706413, + "grad_norm": 4.1207594871521, + "learning_rate": 3.768100567548931e-05, + "loss": 0.6169, + "step": 83610 + }, + { + "epoch": 0.7392280627309535, + "grad_norm": 1.8817453384399414, + "learning_rate": 3.767953228781744e-05, + "loss": 0.6754, + "step": 83620 + }, + { + "epoch": 0.7393164659912658, + "grad_norm": 4.477394104003906, + "learning_rate": 3.767805890014557e-05, + "loss": 0.7391, + "step": 83630 + }, + { + "epoch": 0.739404869251578, + "grad_norm": 1.1810672283172607, + "learning_rate": 3.76765855124737e-05, + "loss": 0.6219, + "step": 83640 + }, + { + "epoch": 0.7394932725118902, + "grad_norm": 2.279214859008789, + "learning_rate": 3.767511212480183e-05, + "loss": 0.7055, + "step": 83650 + }, + { + "epoch": 0.7395816757722025, + "grad_norm": 1.1570273637771606, + "learning_rate": 3.7673638737129963e-05, + "loss": 0.6998, + "step": 83660 + }, + { + "epoch": 0.7396700790325147, + "grad_norm": 4.0594801902771, + "learning_rate": 3.7672165349458085e-05, + "loss": 0.5268, + "step": 83670 + }, + { + "epoch": 0.7397584822928269, + "grad_norm": 2.7007806301116943, + "learning_rate": 3.767069196178622e-05, + "loss": 0.6917, + "step": 83680 + }, + { + "epoch": 0.7398468855531392, + "grad_norm": 5.185434818267822, + "learning_rate": 3.766921857411435e-05, + "loss": 0.5995, + "step": 83690 + }, + { + "epoch": 0.7399352888134514, + "grad_norm": 3.2521445751190186, + "learning_rate": 3.766774518644248e-05, + "loss": 0.6451, + "step": 83700 + }, + { + "epoch": 0.7400236920737637, + "grad_norm": 15.640610694885254, + "learning_rate": 3.7666271798770605e-05, + "loss": 0.6014, + "step": 83710 + }, + { + "epoch": 0.740112095334076, + "grad_norm": 2.6641318798065186, + "learning_rate": 3.7664798411098734e-05, + "loss": 0.7148, + "step": 83720 + }, + { + "epoch": 0.7402004985943882, + "grad_norm": 4.184748649597168, + "learning_rate": 3.766332502342686e-05, + "loss": 0.718, + "step": 83730 + }, + { + "epoch": 0.7402889018547004, + "grad_norm": 3.5810635089874268, + "learning_rate": 3.7661851635755e-05, + "loss": 0.6951, + "step": 83740 + }, + { + "epoch": 0.7403773051150127, + "grad_norm": 2.6039035320281982, + "learning_rate": 3.7660378248083125e-05, + "loss": 0.6446, + "step": 83750 + }, + { + "epoch": 0.7404657083753249, + "grad_norm": 1.8136022090911865, + "learning_rate": 3.7658904860411254e-05, + "loss": 0.7126, + "step": 83760 + }, + { + "epoch": 0.7405541116356371, + "grad_norm": 2.059373140335083, + "learning_rate": 3.765743147273938e-05, + "loss": 0.4997, + "step": 83770 + }, + { + "epoch": 0.7406425148959493, + "grad_norm": 2.658827066421509, + "learning_rate": 3.765595808506751e-05, + "loss": 0.6084, + "step": 83780 + }, + { + "epoch": 0.7407309181562616, + "grad_norm": 2.070920467376709, + "learning_rate": 3.765448469739564e-05, + "loss": 0.6939, + "step": 83790 + }, + { + "epoch": 0.7408193214165738, + "grad_norm": 6.838007926940918, + "learning_rate": 3.7653011309723774e-05, + "loss": 0.6527, + "step": 83800 + }, + { + "epoch": 0.740907724676886, + "grad_norm": 3.6335978507995605, + "learning_rate": 3.76515379220519e-05, + "loss": 0.6913, + "step": 83810 + }, + { + "epoch": 0.7409961279371984, + "grad_norm": 2.9283552169799805, + "learning_rate": 3.765006453438003e-05, + "loss": 0.6935, + "step": 83820 + }, + { + "epoch": 0.7410845311975106, + "grad_norm": 2.8865389823913574, + "learning_rate": 3.764859114670816e-05, + "loss": 0.8142, + "step": 83830 + }, + { + "epoch": 0.7411729344578228, + "grad_norm": 4.269875526428223, + "learning_rate": 3.764711775903629e-05, + "loss": 0.6683, + "step": 83840 + }, + { + "epoch": 0.7412613377181351, + "grad_norm": 2.1077256202697754, + "learning_rate": 3.7645644371364416e-05, + "loss": 0.7241, + "step": 83850 + }, + { + "epoch": 0.7413497409784473, + "grad_norm": 1.362311601638794, + "learning_rate": 3.7644170983692544e-05, + "loss": 0.6739, + "step": 83860 + }, + { + "epoch": 0.7414381442387595, + "grad_norm": 2.4714369773864746, + "learning_rate": 3.764269759602068e-05, + "loss": 0.5756, + "step": 83870 + }, + { + "epoch": 0.7415265474990718, + "grad_norm": 8.367737770080566, + "learning_rate": 3.764122420834881e-05, + "loss": 0.7419, + "step": 83880 + }, + { + "epoch": 0.741614950759384, + "grad_norm": 9.864267349243164, + "learning_rate": 3.7639750820676936e-05, + "loss": 0.6326, + "step": 83890 + }, + { + "epoch": 0.7417033540196962, + "grad_norm": 3.680027484893799, + "learning_rate": 3.7638277433005064e-05, + "loss": 0.7048, + "step": 83900 + }, + { + "epoch": 0.7417917572800085, + "grad_norm": 2.4032979011535645, + "learning_rate": 3.763680404533319e-05, + "loss": 0.7104, + "step": 83910 + }, + { + "epoch": 0.7418801605403207, + "grad_norm": 5.893245697021484, + "learning_rate": 3.763533065766132e-05, + "loss": 0.6404, + "step": 83920 + }, + { + "epoch": 0.7419685638006329, + "grad_norm": 2.3711273670196533, + "learning_rate": 3.7633857269989456e-05, + "loss": 0.6871, + "step": 83930 + }, + { + "epoch": 0.7420569670609453, + "grad_norm": 1.8997288942337036, + "learning_rate": 3.763238388231758e-05, + "loss": 0.6988, + "step": 83940 + }, + { + "epoch": 0.7421453703212575, + "grad_norm": 8.349346160888672, + "learning_rate": 3.763091049464571e-05, + "loss": 0.7116, + "step": 83950 + }, + { + "epoch": 0.7422337735815697, + "grad_norm": 5.230564594268799, + "learning_rate": 3.762943710697384e-05, + "loss": 0.6246, + "step": 83960 + }, + { + "epoch": 0.742322176841882, + "grad_norm": 2.8075129985809326, + "learning_rate": 3.762796371930197e-05, + "loss": 0.6132, + "step": 83970 + }, + { + "epoch": 0.7424105801021942, + "grad_norm": 3.2089080810546875, + "learning_rate": 3.76264903316301e-05, + "loss": 0.6156, + "step": 83980 + }, + { + "epoch": 0.7424989833625064, + "grad_norm": 3.1539509296417236, + "learning_rate": 3.762501694395823e-05, + "loss": 0.6312, + "step": 83990 + }, + { + "epoch": 0.7425873866228186, + "grad_norm": 2.6977450847625732, + "learning_rate": 3.7623543556286355e-05, + "loss": 0.6348, + "step": 84000 + }, + { + "epoch": 0.7426757898831309, + "grad_norm": 3.076150894165039, + "learning_rate": 3.762207016861449e-05, + "loss": 0.5621, + "step": 84010 + }, + { + "epoch": 0.7427641931434431, + "grad_norm": 3.1662416458129883, + "learning_rate": 3.762059678094262e-05, + "loss": 0.7502, + "step": 84020 + }, + { + "epoch": 0.7428525964037553, + "grad_norm": 3.0767221450805664, + "learning_rate": 3.7619123393270746e-05, + "loss": 0.79, + "step": 84030 + }, + { + "epoch": 0.7429409996640676, + "grad_norm": 6.01400899887085, + "learning_rate": 3.7617650005598875e-05, + "loss": 0.7268, + "step": 84040 + }, + { + "epoch": 0.7430294029243798, + "grad_norm": 3.0729949474334717, + "learning_rate": 3.761617661792701e-05, + "loss": 0.6725, + "step": 84050 + }, + { + "epoch": 0.7431178061846921, + "grad_norm": 3.168278455734253, + "learning_rate": 3.761470323025513e-05, + "loss": 0.6452, + "step": 84060 + }, + { + "epoch": 0.7432062094450044, + "grad_norm": 2.7801151275634766, + "learning_rate": 3.761322984258327e-05, + "loss": 0.6718, + "step": 84070 + }, + { + "epoch": 0.7432946127053166, + "grad_norm": 6.022086143493652, + "learning_rate": 3.761175645491139e-05, + "loss": 0.5809, + "step": 84080 + }, + { + "epoch": 0.7433830159656288, + "grad_norm": 2.4401657581329346, + "learning_rate": 3.761028306723952e-05, + "loss": 0.7353, + "step": 84090 + }, + { + "epoch": 0.7434714192259411, + "grad_norm": 4.503438472747803, + "learning_rate": 3.760880967956765e-05, + "loss": 0.666, + "step": 84100 + }, + { + "epoch": 0.7435598224862533, + "grad_norm": 2.0846970081329346, + "learning_rate": 3.760733629189578e-05, + "loss": 0.7087, + "step": 84110 + }, + { + "epoch": 0.7436482257465655, + "grad_norm": 3.1882002353668213, + "learning_rate": 3.760586290422391e-05, + "loss": 0.6863, + "step": 84120 + }, + { + "epoch": 0.7437366290068778, + "grad_norm": 5.416436672210693, + "learning_rate": 3.7604389516552044e-05, + "loss": 0.6927, + "step": 84130 + }, + { + "epoch": 0.74382503226719, + "grad_norm": 1.339212417602539, + "learning_rate": 3.7602916128880165e-05, + "loss": 0.6296, + "step": 84140 + }, + { + "epoch": 0.7439134355275022, + "grad_norm": 6.03978157043457, + "learning_rate": 3.76014427412083e-05, + "loss": 0.6019, + "step": 84150 + }, + { + "epoch": 0.7440018387878145, + "grad_norm": 7.817158222198486, + "learning_rate": 3.759996935353643e-05, + "loss": 0.7386, + "step": 84160 + }, + { + "epoch": 0.7440902420481267, + "grad_norm": 2.0685369968414307, + "learning_rate": 3.759849596586456e-05, + "loss": 0.6602, + "step": 84170 + }, + { + "epoch": 0.744178645308439, + "grad_norm": 1.6310970783233643, + "learning_rate": 3.7597022578192685e-05, + "loss": 0.5695, + "step": 84180 + }, + { + "epoch": 0.7442670485687513, + "grad_norm": 1.040309190750122, + "learning_rate": 3.7595549190520814e-05, + "loss": 0.7032, + "step": 84190 + }, + { + "epoch": 0.7443554518290635, + "grad_norm": 2.361133098602295, + "learning_rate": 3.759407580284894e-05, + "loss": 0.7757, + "step": 84200 + }, + { + "epoch": 0.7444438550893757, + "grad_norm": 4.239267349243164, + "learning_rate": 3.759260241517708e-05, + "loss": 0.6926, + "step": 84210 + }, + { + "epoch": 0.744532258349688, + "grad_norm": 2.7009713649749756, + "learning_rate": 3.75911290275052e-05, + "loss": 0.7644, + "step": 84220 + }, + { + "epoch": 0.7446206616100002, + "grad_norm": 3.142449140548706, + "learning_rate": 3.7589655639833334e-05, + "loss": 0.6601, + "step": 84230 + }, + { + "epoch": 0.7447090648703124, + "grad_norm": 2.860416889190674, + "learning_rate": 3.758818225216146e-05, + "loss": 0.5282, + "step": 84240 + }, + { + "epoch": 0.7447974681306246, + "grad_norm": 1.2303674221038818, + "learning_rate": 3.758670886448959e-05, + "loss": 0.6776, + "step": 84250 + }, + { + "epoch": 0.7448858713909369, + "grad_norm": 5.118793487548828, + "learning_rate": 3.758523547681772e-05, + "loss": 0.7907, + "step": 84260 + }, + { + "epoch": 0.7449742746512491, + "grad_norm": 2.992825746536255, + "learning_rate": 3.7583762089145854e-05, + "loss": 0.8036, + "step": 84270 + }, + { + "epoch": 0.7450626779115613, + "grad_norm": 2.319847583770752, + "learning_rate": 3.7582288701473976e-05, + "loss": 0.6061, + "step": 84280 + }, + { + "epoch": 0.7451510811718736, + "grad_norm": 15.495037078857422, + "learning_rate": 3.758081531380211e-05, + "loss": 0.6218, + "step": 84290 + }, + { + "epoch": 0.7452394844321859, + "grad_norm": 3.1386735439300537, + "learning_rate": 3.757934192613023e-05, + "loss": 0.6729, + "step": 84300 + }, + { + "epoch": 0.7453278876924981, + "grad_norm": 1.256740689277649, + "learning_rate": 3.757786853845837e-05, + "loss": 0.5731, + "step": 84310 + }, + { + "epoch": 0.7454162909528104, + "grad_norm": 1.222428798675537, + "learning_rate": 3.7576395150786496e-05, + "loss": 0.4752, + "step": 84320 + }, + { + "epoch": 0.7455046942131226, + "grad_norm": 8.0045804977417, + "learning_rate": 3.7574921763114624e-05, + "loss": 0.6779, + "step": 84330 + }, + { + "epoch": 0.7455930974734348, + "grad_norm": 5.968054294586182, + "learning_rate": 3.757344837544275e-05, + "loss": 0.678, + "step": 84340 + }, + { + "epoch": 0.7456815007337471, + "grad_norm": 4.230433940887451, + "learning_rate": 3.757197498777089e-05, + "loss": 0.7098, + "step": 84350 + }, + { + "epoch": 0.7457699039940593, + "grad_norm": 2.3833985328674316, + "learning_rate": 3.757050160009901e-05, + "loss": 0.7, + "step": 84360 + }, + { + "epoch": 0.7458583072543715, + "grad_norm": 2.216752052307129, + "learning_rate": 3.7569028212427144e-05, + "loss": 0.5883, + "step": 84370 + }, + { + "epoch": 0.7459467105146838, + "grad_norm": 2.3393094539642334, + "learning_rate": 3.756755482475527e-05, + "loss": 0.7256, + "step": 84380 + }, + { + "epoch": 0.746035113774996, + "grad_norm": 1.5323232412338257, + "learning_rate": 3.75660814370834e-05, + "loss": 0.685, + "step": 84390 + }, + { + "epoch": 0.7461235170353082, + "grad_norm": 10.415721893310547, + "learning_rate": 3.756460804941153e-05, + "loss": 0.631, + "step": 84400 + }, + { + "epoch": 0.7462119202956206, + "grad_norm": 2.981957197189331, + "learning_rate": 3.756313466173966e-05, + "loss": 0.6316, + "step": 84410 + }, + { + "epoch": 0.7463003235559328, + "grad_norm": 5.538896083831787, + "learning_rate": 3.7561661274067786e-05, + "loss": 0.578, + "step": 84420 + }, + { + "epoch": 0.746388726816245, + "grad_norm": 7.855510234832764, + "learning_rate": 3.756018788639592e-05, + "loss": 0.7256, + "step": 84430 + }, + { + "epoch": 0.7464771300765572, + "grad_norm": 6.738065719604492, + "learning_rate": 3.755871449872404e-05, + "loss": 0.4829, + "step": 84440 + }, + { + "epoch": 0.7465655333368695, + "grad_norm": 6.32013463973999, + "learning_rate": 3.755724111105218e-05, + "loss": 0.6204, + "step": 84450 + }, + { + "epoch": 0.7466539365971817, + "grad_norm": 3.561615228652954, + "learning_rate": 3.7555767723380306e-05, + "loss": 0.6177, + "step": 84460 + }, + { + "epoch": 0.7467423398574939, + "grad_norm": 2.4124794006347656, + "learning_rate": 3.7554294335708435e-05, + "loss": 0.5233, + "step": 84470 + }, + { + "epoch": 0.7468307431178062, + "grad_norm": 1.7479076385498047, + "learning_rate": 3.755282094803656e-05, + "loss": 0.7237, + "step": 84480 + }, + { + "epoch": 0.7469191463781184, + "grad_norm": 3.5975189208984375, + "learning_rate": 3.75513475603647e-05, + "loss": 0.6279, + "step": 84490 + }, + { + "epoch": 0.7470075496384306, + "grad_norm": 5.031615734100342, + "learning_rate": 3.754987417269282e-05, + "loss": 0.7757, + "step": 84500 + }, + { + "epoch": 0.7470959528987429, + "grad_norm": 3.85677170753479, + "learning_rate": 3.7548400785020955e-05, + "loss": 0.5713, + "step": 84510 + }, + { + "epoch": 0.7471843561590551, + "grad_norm": 8.631434440612793, + "learning_rate": 3.754692739734908e-05, + "loss": 0.6599, + "step": 84520 + }, + { + "epoch": 0.7472727594193674, + "grad_norm": 1.915344476699829, + "learning_rate": 3.754545400967721e-05, + "loss": 0.5745, + "step": 84530 + }, + { + "epoch": 0.7473611626796797, + "grad_norm": 7.097794055938721, + "learning_rate": 3.754398062200534e-05, + "loss": 0.5727, + "step": 84540 + }, + { + "epoch": 0.7474495659399919, + "grad_norm": 1.8751381635665894, + "learning_rate": 3.754250723433347e-05, + "loss": 0.584, + "step": 84550 + }, + { + "epoch": 0.7475379692003041, + "grad_norm": 1.7229143381118774, + "learning_rate": 3.75410338466616e-05, + "loss": 0.5988, + "step": 84560 + }, + { + "epoch": 0.7476263724606164, + "grad_norm": 2.0988352298736572, + "learning_rate": 3.753956045898973e-05, + "loss": 0.6226, + "step": 84570 + }, + { + "epoch": 0.7477147757209286, + "grad_norm": 7.097356796264648, + "learning_rate": 3.753808707131785e-05, + "loss": 0.5733, + "step": 84580 + }, + { + "epoch": 0.7478031789812408, + "grad_norm": 1.6344846487045288, + "learning_rate": 3.753661368364599e-05, + "loss": 0.6081, + "step": 84590 + }, + { + "epoch": 0.747891582241553, + "grad_norm": 3.0487451553344727, + "learning_rate": 3.753514029597412e-05, + "loss": 0.5739, + "step": 84600 + }, + { + "epoch": 0.7479799855018653, + "grad_norm": 1.8894730806350708, + "learning_rate": 3.7533666908302245e-05, + "loss": 0.6757, + "step": 84610 + }, + { + "epoch": 0.7480683887621775, + "grad_norm": 2.5171899795532227, + "learning_rate": 3.7532193520630374e-05, + "loss": 0.8411, + "step": 84620 + }, + { + "epoch": 0.7481567920224897, + "grad_norm": 4.379422664642334, + "learning_rate": 3.753072013295851e-05, + "loss": 0.6768, + "step": 84630 + }, + { + "epoch": 0.748245195282802, + "grad_norm": 8.756941795349121, + "learning_rate": 3.752924674528664e-05, + "loss": 0.8038, + "step": 84640 + }, + { + "epoch": 0.7483335985431143, + "grad_norm": 11.941573143005371, + "learning_rate": 3.7527773357614765e-05, + "loss": 0.6889, + "step": 84650 + }, + { + "epoch": 0.7484220018034266, + "grad_norm": 2.0575923919677734, + "learning_rate": 3.7526299969942894e-05, + "loss": 0.6922, + "step": 84660 + }, + { + "epoch": 0.7485104050637388, + "grad_norm": 3.003082513809204, + "learning_rate": 3.752482658227102e-05, + "loss": 0.5843, + "step": 84670 + }, + { + "epoch": 0.748598808324051, + "grad_norm": 2.6774916648864746, + "learning_rate": 3.752335319459915e-05, + "loss": 0.7793, + "step": 84680 + }, + { + "epoch": 0.7486872115843632, + "grad_norm": 4.0084099769592285, + "learning_rate": 3.752187980692728e-05, + "loss": 0.6427, + "step": 84690 + }, + { + "epoch": 0.7487756148446755, + "grad_norm": 3.386981725692749, + "learning_rate": 3.7520406419255414e-05, + "loss": 0.5562, + "step": 84700 + }, + { + "epoch": 0.7488640181049877, + "grad_norm": 1.8329048156738281, + "learning_rate": 3.751893303158354e-05, + "loss": 0.6764, + "step": 84710 + }, + { + "epoch": 0.7489524213652999, + "grad_norm": 1.757285237312317, + "learning_rate": 3.751745964391167e-05, + "loss": 0.6155, + "step": 84720 + }, + { + "epoch": 0.7490408246256122, + "grad_norm": 1.4587023258209229, + "learning_rate": 3.75159862562398e-05, + "loss": 0.6608, + "step": 84730 + }, + { + "epoch": 0.7491292278859244, + "grad_norm": 8.046749114990234, + "learning_rate": 3.751451286856793e-05, + "loss": 0.5967, + "step": 84740 + }, + { + "epoch": 0.7492176311462366, + "grad_norm": 4.4256062507629395, + "learning_rate": 3.7513039480896056e-05, + "loss": 0.6597, + "step": 84750 + }, + { + "epoch": 0.7493060344065489, + "grad_norm": 5.429772853851318, + "learning_rate": 3.751156609322419e-05, + "loss": 0.6772, + "step": 84760 + }, + { + "epoch": 0.7493944376668612, + "grad_norm": 1.6209697723388672, + "learning_rate": 3.751009270555231e-05, + "loss": 0.5931, + "step": 84770 + }, + { + "epoch": 0.7494828409271734, + "grad_norm": 3.219609022140503, + "learning_rate": 3.750861931788045e-05, + "loss": 0.6778, + "step": 84780 + }, + { + "epoch": 0.7495712441874857, + "grad_norm": 3.7768449783325195, + "learning_rate": 3.7507145930208576e-05, + "loss": 0.5966, + "step": 84790 + }, + { + "epoch": 0.7496596474477979, + "grad_norm": 1.5925281047821045, + "learning_rate": 3.7505672542536704e-05, + "loss": 0.6377, + "step": 84800 + }, + { + "epoch": 0.7497480507081101, + "grad_norm": 1.883607029914856, + "learning_rate": 3.750419915486483e-05, + "loss": 0.8659, + "step": 84810 + }, + { + "epoch": 0.7498364539684224, + "grad_norm": 5.472825050354004, + "learning_rate": 3.750272576719297e-05, + "loss": 0.684, + "step": 84820 + }, + { + "epoch": 0.7499248572287346, + "grad_norm": 2.8847320079803467, + "learning_rate": 3.750125237952109e-05, + "loss": 0.6232, + "step": 84830 + }, + { + "epoch": 0.7500132604890468, + "grad_norm": 5.925868034362793, + "learning_rate": 3.7499778991849224e-05, + "loss": 0.657, + "step": 84840 + }, + { + "epoch": 0.750101663749359, + "grad_norm": 6.294375419616699, + "learning_rate": 3.749830560417735e-05, + "loss": 0.6016, + "step": 84850 + }, + { + "epoch": 0.7501900670096713, + "grad_norm": 3.546903610229492, + "learning_rate": 3.749683221650548e-05, + "loss": 0.6389, + "step": 84860 + }, + { + "epoch": 0.7502784702699835, + "grad_norm": 6.391054153442383, + "learning_rate": 3.749535882883361e-05, + "loss": 0.6071, + "step": 84870 + }, + { + "epoch": 0.7503668735302959, + "grad_norm": 4.3632636070251465, + "learning_rate": 3.749388544116174e-05, + "loss": 0.7108, + "step": 84880 + }, + { + "epoch": 0.7504552767906081, + "grad_norm": 3.853543996810913, + "learning_rate": 3.7492412053489866e-05, + "loss": 0.7068, + "step": 84890 + }, + { + "epoch": 0.7505436800509203, + "grad_norm": 1.2865716218948364, + "learning_rate": 3.7490938665818e-05, + "loss": 0.6535, + "step": 84900 + }, + { + "epoch": 0.7506320833112325, + "grad_norm": 3.5363283157348633, + "learning_rate": 3.748946527814612e-05, + "loss": 0.783, + "step": 84910 + }, + { + "epoch": 0.7507204865715448, + "grad_norm": 4.21813440322876, + "learning_rate": 3.748799189047426e-05, + "loss": 0.5929, + "step": 84920 + }, + { + "epoch": 0.750808889831857, + "grad_norm": 3.862945079803467, + "learning_rate": 3.7486518502802386e-05, + "loss": 0.7237, + "step": 84930 + }, + { + "epoch": 0.7508972930921692, + "grad_norm": 1.0606372356414795, + "learning_rate": 3.7485045115130515e-05, + "loss": 0.6162, + "step": 84940 + }, + { + "epoch": 0.7509856963524815, + "grad_norm": 1.2253813743591309, + "learning_rate": 3.748357172745864e-05, + "loss": 0.5597, + "step": 84950 + }, + { + "epoch": 0.7510740996127937, + "grad_norm": 4.704916954040527, + "learning_rate": 3.748209833978678e-05, + "loss": 0.588, + "step": 84960 + }, + { + "epoch": 0.7511625028731059, + "grad_norm": 17.943359375, + "learning_rate": 3.74806249521149e-05, + "loss": 0.785, + "step": 84970 + }, + { + "epoch": 0.7512509061334182, + "grad_norm": 6.664664268493652, + "learning_rate": 3.7479151564443035e-05, + "loss": 0.669, + "step": 84980 + }, + { + "epoch": 0.7513393093937304, + "grad_norm": 2.415360689163208, + "learning_rate": 3.747767817677116e-05, + "loss": 0.6002, + "step": 84990 + }, + { + "epoch": 0.7514277126540427, + "grad_norm": 6.830345153808594, + "learning_rate": 3.747620478909929e-05, + "loss": 0.6042, + "step": 85000 + }, + { + "epoch": 0.751516115914355, + "grad_norm": 1.9554429054260254, + "learning_rate": 3.747473140142742e-05, + "loss": 0.6581, + "step": 85010 + }, + { + "epoch": 0.7516045191746672, + "grad_norm": 3.2804136276245117, + "learning_rate": 3.747325801375555e-05, + "loss": 0.6722, + "step": 85020 + }, + { + "epoch": 0.7516929224349794, + "grad_norm": 3.311786651611328, + "learning_rate": 3.747178462608368e-05, + "loss": 0.7336, + "step": 85030 + }, + { + "epoch": 0.7517813256952917, + "grad_norm": 5.547338008880615, + "learning_rate": 3.747031123841181e-05, + "loss": 0.7348, + "step": 85040 + }, + { + "epoch": 0.7518697289556039, + "grad_norm": 2.4761946201324463, + "learning_rate": 3.7468837850739933e-05, + "loss": 0.7087, + "step": 85050 + }, + { + "epoch": 0.7519581322159161, + "grad_norm": 6.19984769821167, + "learning_rate": 3.746736446306807e-05, + "loss": 0.6201, + "step": 85060 + }, + { + "epoch": 0.7520465354762284, + "grad_norm": 2.218047857284546, + "learning_rate": 3.74658910753962e-05, + "loss": 0.5292, + "step": 85070 + }, + { + "epoch": 0.7521349387365406, + "grad_norm": 3.5358660221099854, + "learning_rate": 3.7464417687724325e-05, + "loss": 0.7117, + "step": 85080 + }, + { + "epoch": 0.7522233419968528, + "grad_norm": 1.6040410995483398, + "learning_rate": 3.7462944300052454e-05, + "loss": 0.6922, + "step": 85090 + }, + { + "epoch": 0.752311745257165, + "grad_norm": 1.895133376121521, + "learning_rate": 3.746147091238059e-05, + "loss": 0.6707, + "step": 85100 + }, + { + "epoch": 0.7524001485174773, + "grad_norm": 5.289239883422852, + "learning_rate": 3.745999752470871e-05, + "loss": 0.6576, + "step": 85110 + }, + { + "epoch": 0.7524885517777896, + "grad_norm": 2.582613468170166, + "learning_rate": 3.7458524137036845e-05, + "loss": 0.6006, + "step": 85120 + }, + { + "epoch": 0.7525769550381018, + "grad_norm": 15.104894638061523, + "learning_rate": 3.745705074936497e-05, + "loss": 0.6407, + "step": 85130 + }, + { + "epoch": 0.7526653582984141, + "grad_norm": 10.050370216369629, + "learning_rate": 3.74555773616931e-05, + "loss": 0.5614, + "step": 85140 + }, + { + "epoch": 0.7527537615587263, + "grad_norm": 3.571335554122925, + "learning_rate": 3.745410397402123e-05, + "loss": 0.526, + "step": 85150 + }, + { + "epoch": 0.7528421648190385, + "grad_norm": 3.719113826751709, + "learning_rate": 3.745263058634936e-05, + "loss": 0.7387, + "step": 85160 + }, + { + "epoch": 0.7529305680793508, + "grad_norm": 3.958775520324707, + "learning_rate": 3.745115719867749e-05, + "loss": 0.5996, + "step": 85170 + }, + { + "epoch": 0.753018971339663, + "grad_norm": 2.792614221572876, + "learning_rate": 3.744968381100562e-05, + "loss": 0.5216, + "step": 85180 + }, + { + "epoch": 0.7531073745999752, + "grad_norm": 9.038586616516113, + "learning_rate": 3.7448210423333744e-05, + "loss": 0.6776, + "step": 85190 + }, + { + "epoch": 0.7531957778602875, + "grad_norm": 10.211748123168945, + "learning_rate": 3.744673703566188e-05, + "loss": 0.6809, + "step": 85200 + }, + { + "epoch": 0.7532841811205997, + "grad_norm": 13.097444534301758, + "learning_rate": 3.744526364799001e-05, + "loss": 0.5742, + "step": 85210 + }, + { + "epoch": 0.7533725843809119, + "grad_norm": 4.21989107131958, + "learning_rate": 3.7443790260318136e-05, + "loss": 0.6275, + "step": 85220 + }, + { + "epoch": 0.7534609876412242, + "grad_norm": 1.35722815990448, + "learning_rate": 3.7442316872646264e-05, + "loss": 0.7118, + "step": 85230 + }, + { + "epoch": 0.7535493909015365, + "grad_norm": 1.5782872438430786, + "learning_rate": 3.744084348497439e-05, + "loss": 0.641, + "step": 85240 + }, + { + "epoch": 0.7536377941618487, + "grad_norm": 1.3326246738433838, + "learning_rate": 3.743937009730252e-05, + "loss": 0.6758, + "step": 85250 + }, + { + "epoch": 0.753726197422161, + "grad_norm": 8.397066116333008, + "learning_rate": 3.7437896709630656e-05, + "loss": 0.7465, + "step": 85260 + }, + { + "epoch": 0.7538146006824732, + "grad_norm": 5.0338969230651855, + "learning_rate": 3.743642332195878e-05, + "loss": 0.7145, + "step": 85270 + }, + { + "epoch": 0.7539030039427854, + "grad_norm": 4.390265941619873, + "learning_rate": 3.743494993428691e-05, + "loss": 0.6464, + "step": 85280 + }, + { + "epoch": 0.7539914072030977, + "grad_norm": 2.3420324325561523, + "learning_rate": 3.743347654661504e-05, + "loss": 0.7682, + "step": 85290 + }, + { + "epoch": 0.7540798104634099, + "grad_norm": 3.8715598583221436, + "learning_rate": 3.743200315894317e-05, + "loss": 0.6269, + "step": 85300 + }, + { + "epoch": 0.7541682137237221, + "grad_norm": 1.51862633228302, + "learning_rate": 3.74305297712713e-05, + "loss": 0.6234, + "step": 85310 + }, + { + "epoch": 0.7542566169840343, + "grad_norm": 7.025180816650391, + "learning_rate": 3.742905638359943e-05, + "loss": 0.5331, + "step": 85320 + }, + { + "epoch": 0.7543450202443466, + "grad_norm": 6.612129211425781, + "learning_rate": 3.7427582995927554e-05, + "loss": 0.6728, + "step": 85330 + }, + { + "epoch": 0.7544334235046588, + "grad_norm": 4.373544692993164, + "learning_rate": 3.742610960825569e-05, + "loss": 0.8271, + "step": 85340 + }, + { + "epoch": 0.754521826764971, + "grad_norm": 1.9424684047698975, + "learning_rate": 3.742463622058382e-05, + "loss": 0.901, + "step": 85350 + }, + { + "epoch": 0.7546102300252834, + "grad_norm": 2.035431146621704, + "learning_rate": 3.7423162832911946e-05, + "loss": 0.5488, + "step": 85360 + }, + { + "epoch": 0.7546986332855956, + "grad_norm": 2.8291876316070557, + "learning_rate": 3.7421689445240075e-05, + "loss": 0.7324, + "step": 85370 + }, + { + "epoch": 0.7547870365459078, + "grad_norm": 3.273371458053589, + "learning_rate": 3.74202160575682e-05, + "loss": 0.6691, + "step": 85380 + }, + { + "epoch": 0.7548754398062201, + "grad_norm": 3.8757667541503906, + "learning_rate": 3.741874266989633e-05, + "loss": 0.5976, + "step": 85390 + }, + { + "epoch": 0.7549638430665323, + "grad_norm": 2.590461015701294, + "learning_rate": 3.7417269282224466e-05, + "loss": 0.6462, + "step": 85400 + }, + { + "epoch": 0.7550522463268445, + "grad_norm": 2.8991386890411377, + "learning_rate": 3.741579589455259e-05, + "loss": 0.7577, + "step": 85410 + }, + { + "epoch": 0.7551406495871568, + "grad_norm": 6.056429862976074, + "learning_rate": 3.741432250688072e-05, + "loss": 0.7301, + "step": 85420 + }, + { + "epoch": 0.755229052847469, + "grad_norm": 1.267293095588684, + "learning_rate": 3.741284911920885e-05, + "loss": 0.6723, + "step": 85430 + }, + { + "epoch": 0.7553174561077812, + "grad_norm": 1.804534673690796, + "learning_rate": 3.741137573153698e-05, + "loss": 0.5866, + "step": 85440 + }, + { + "epoch": 0.7554058593680935, + "grad_norm": 5.3751726150512695, + "learning_rate": 3.740990234386511e-05, + "loss": 0.6297, + "step": 85450 + }, + { + "epoch": 0.7554942626284057, + "grad_norm": 1.5884987115859985, + "learning_rate": 3.740842895619324e-05, + "loss": 0.6622, + "step": 85460 + }, + { + "epoch": 0.755582665888718, + "grad_norm": 1.7796268463134766, + "learning_rate": 3.7406955568521365e-05, + "loss": 0.6744, + "step": 85470 + }, + { + "epoch": 0.7556710691490303, + "grad_norm": 2.3204240798950195, + "learning_rate": 3.74054821808495e-05, + "loss": 0.6614, + "step": 85480 + }, + { + "epoch": 0.7557594724093425, + "grad_norm": 11.465657234191895, + "learning_rate": 3.740400879317763e-05, + "loss": 0.69, + "step": 85490 + }, + { + "epoch": 0.7558478756696547, + "grad_norm": 5.745561599731445, + "learning_rate": 3.740253540550576e-05, + "loss": 0.7496, + "step": 85500 + }, + { + "epoch": 0.755936278929967, + "grad_norm": 4.5895867347717285, + "learning_rate": 3.7401062017833885e-05, + "loss": 0.5997, + "step": 85510 + }, + { + "epoch": 0.7560246821902792, + "grad_norm": 8.398486137390137, + "learning_rate": 3.7399588630162013e-05, + "loss": 0.7039, + "step": 85520 + }, + { + "epoch": 0.7561130854505914, + "grad_norm": 2.9003798961639404, + "learning_rate": 3.739811524249014e-05, + "loss": 0.5834, + "step": 85530 + }, + { + "epoch": 0.7562014887109036, + "grad_norm": 4.476251602172852, + "learning_rate": 3.739664185481828e-05, + "loss": 0.5525, + "step": 85540 + }, + { + "epoch": 0.7562898919712159, + "grad_norm": 2.4741172790527344, + "learning_rate": 3.7395168467146405e-05, + "loss": 0.6641, + "step": 85550 + }, + { + "epoch": 0.7563782952315281, + "grad_norm": 3.6845388412475586, + "learning_rate": 3.7393695079474534e-05, + "loss": 0.654, + "step": 85560 + }, + { + "epoch": 0.7564666984918403, + "grad_norm": 4.938345909118652, + "learning_rate": 3.739222169180266e-05, + "loss": 0.5988, + "step": 85570 + }, + { + "epoch": 0.7565551017521526, + "grad_norm": 4.465521812438965, + "learning_rate": 3.739074830413079e-05, + "loss": 0.7493, + "step": 85580 + }, + { + "epoch": 0.7566435050124649, + "grad_norm": 2.490525722503662, + "learning_rate": 3.738927491645892e-05, + "loss": 0.6856, + "step": 85590 + }, + { + "epoch": 0.7567319082727771, + "grad_norm": 10.04533863067627, + "learning_rate": 3.738780152878705e-05, + "loss": 0.6438, + "step": 85600 + }, + { + "epoch": 0.7568203115330894, + "grad_norm": 2.158148765563965, + "learning_rate": 3.738632814111518e-05, + "loss": 0.7951, + "step": 85610 + }, + { + "epoch": 0.7569087147934016, + "grad_norm": 4.396143913269043, + "learning_rate": 3.738485475344331e-05, + "loss": 0.5549, + "step": 85620 + }, + { + "epoch": 0.7569971180537138, + "grad_norm": 2.749972343444824, + "learning_rate": 3.738338136577144e-05, + "loss": 0.6336, + "step": 85630 + }, + { + "epoch": 0.7570855213140261, + "grad_norm": 1.941791296005249, + "learning_rate": 3.738190797809957e-05, + "loss": 0.7297, + "step": 85640 + }, + { + "epoch": 0.7571739245743383, + "grad_norm": 2.1761698722839355, + "learning_rate": 3.7380434590427696e-05, + "loss": 0.782, + "step": 85650 + }, + { + "epoch": 0.7572623278346505, + "grad_norm": 2.907989740371704, + "learning_rate": 3.7378961202755824e-05, + "loss": 0.8016, + "step": 85660 + }, + { + "epoch": 0.7573507310949628, + "grad_norm": 13.42611026763916, + "learning_rate": 3.737748781508396e-05, + "loss": 0.6224, + "step": 85670 + }, + { + "epoch": 0.757439134355275, + "grad_norm": 2.420255661010742, + "learning_rate": 3.737601442741209e-05, + "loss": 0.6448, + "step": 85680 + }, + { + "epoch": 0.7575275376155872, + "grad_norm": 1.5082685947418213, + "learning_rate": 3.7374541039740216e-05, + "loss": 0.6928, + "step": 85690 + }, + { + "epoch": 0.7576159408758995, + "grad_norm": 7.950104236602783, + "learning_rate": 3.7373067652068344e-05, + "loss": 0.5604, + "step": 85700 + }, + { + "epoch": 0.7577043441362118, + "grad_norm": 3.9872817993164062, + "learning_rate": 3.737159426439647e-05, + "loss": 0.6927, + "step": 85710 + }, + { + "epoch": 0.757792747396524, + "grad_norm": 3.4432690143585205, + "learning_rate": 3.73701208767246e-05, + "loss": 0.6204, + "step": 85720 + }, + { + "epoch": 0.7578811506568363, + "grad_norm": 5.041614055633545, + "learning_rate": 3.7368647489052736e-05, + "loss": 0.7157, + "step": 85730 + }, + { + "epoch": 0.7579695539171485, + "grad_norm": 5.1985321044921875, + "learning_rate": 3.736717410138086e-05, + "loss": 0.7084, + "step": 85740 + }, + { + "epoch": 0.7580579571774607, + "grad_norm": 9.321937561035156, + "learning_rate": 3.736570071370899e-05, + "loss": 0.7561, + "step": 85750 + }, + { + "epoch": 0.758146360437773, + "grad_norm": 1.8656742572784424, + "learning_rate": 3.736422732603712e-05, + "loss": 0.6805, + "step": 85760 + }, + { + "epoch": 0.7582347636980852, + "grad_norm": 2.554158926010132, + "learning_rate": 3.736275393836525e-05, + "loss": 0.6077, + "step": 85770 + }, + { + "epoch": 0.7583231669583974, + "grad_norm": 2.7132935523986816, + "learning_rate": 3.736128055069338e-05, + "loss": 0.7703, + "step": 85780 + }, + { + "epoch": 0.7584115702187096, + "grad_norm": 7.1925740242004395, + "learning_rate": 3.735980716302151e-05, + "loss": 0.5611, + "step": 85790 + }, + { + "epoch": 0.7584999734790219, + "grad_norm": 1.8949414491653442, + "learning_rate": 3.7358333775349635e-05, + "loss": 0.7039, + "step": 85800 + }, + { + "epoch": 0.7585883767393341, + "grad_norm": 0.8182306885719299, + "learning_rate": 3.735686038767777e-05, + "loss": 0.5058, + "step": 85810 + }, + { + "epoch": 0.7586767799996463, + "grad_norm": 4.480530261993408, + "learning_rate": 3.73553870000059e-05, + "loss": 0.5975, + "step": 85820 + }, + { + "epoch": 0.7587651832599587, + "grad_norm": 2.9453999996185303, + "learning_rate": 3.7353913612334026e-05, + "loss": 0.8073, + "step": 85830 + }, + { + "epoch": 0.7588535865202709, + "grad_norm": 8.90038013458252, + "learning_rate": 3.7352440224662155e-05, + "loss": 0.6849, + "step": 85840 + }, + { + "epoch": 0.7589419897805831, + "grad_norm": 2.3736259937286377, + "learning_rate": 3.735096683699028e-05, + "loss": 0.6218, + "step": 85850 + }, + { + "epoch": 0.7590303930408954, + "grad_norm": 2.652662515640259, + "learning_rate": 3.734949344931841e-05, + "loss": 0.6121, + "step": 85860 + }, + { + "epoch": 0.7591187963012076, + "grad_norm": 1.1810376644134521, + "learning_rate": 3.7348020061646547e-05, + "loss": 0.8128, + "step": 85870 + }, + { + "epoch": 0.7592071995615198, + "grad_norm": 1.275516152381897, + "learning_rate": 3.734654667397467e-05, + "loss": 0.6507, + "step": 85880 + }, + { + "epoch": 0.7592956028218321, + "grad_norm": 1.1071529388427734, + "learning_rate": 3.73450732863028e-05, + "loss": 0.5529, + "step": 85890 + }, + { + "epoch": 0.7593840060821443, + "grad_norm": 2.1914048194885254, + "learning_rate": 3.734359989863093e-05, + "loss": 0.661, + "step": 85900 + }, + { + "epoch": 0.7594724093424565, + "grad_norm": 2.3267629146575928, + "learning_rate": 3.734212651095906e-05, + "loss": 0.8045, + "step": 85910 + }, + { + "epoch": 0.7595608126027688, + "grad_norm": 2.9179482460021973, + "learning_rate": 3.734065312328719e-05, + "loss": 0.6407, + "step": 85920 + }, + { + "epoch": 0.759649215863081, + "grad_norm": 6.203660488128662, + "learning_rate": 3.7339179735615323e-05, + "loss": 0.7239, + "step": 85930 + }, + { + "epoch": 0.7597376191233933, + "grad_norm": 1.300817608833313, + "learning_rate": 3.7337706347943445e-05, + "loss": 0.6637, + "step": 85940 + }, + { + "epoch": 0.7598260223837056, + "grad_norm": 11.422124862670898, + "learning_rate": 3.733623296027158e-05, + "loss": 0.53, + "step": 85950 + }, + { + "epoch": 0.7599144256440178, + "grad_norm": 1.13628351688385, + "learning_rate": 3.73347595725997e-05, + "loss": 0.6933, + "step": 85960 + }, + { + "epoch": 0.76000282890433, + "grad_norm": 9.199313163757324, + "learning_rate": 3.733328618492784e-05, + "loss": 0.5772, + "step": 85970 + }, + { + "epoch": 0.7600912321646422, + "grad_norm": 1.7413986921310425, + "learning_rate": 3.7331812797255965e-05, + "loss": 0.5428, + "step": 85980 + }, + { + "epoch": 0.7601796354249545, + "grad_norm": 1.0740153789520264, + "learning_rate": 3.7330339409584094e-05, + "loss": 0.6409, + "step": 85990 + }, + { + "epoch": 0.7602680386852667, + "grad_norm": 1.8425790071487427, + "learning_rate": 3.732886602191222e-05, + "loss": 0.6859, + "step": 86000 + }, + { + "epoch": 0.7603564419455789, + "grad_norm": 7.294825553894043, + "learning_rate": 3.732739263424036e-05, + "loss": 0.706, + "step": 86010 + }, + { + "epoch": 0.7604448452058912, + "grad_norm": 5.672529697418213, + "learning_rate": 3.732591924656848e-05, + "loss": 0.6576, + "step": 86020 + }, + { + "epoch": 0.7605332484662034, + "grad_norm": 2.1200952529907227, + "learning_rate": 3.7324445858896614e-05, + "loss": 0.6754, + "step": 86030 + }, + { + "epoch": 0.7606216517265156, + "grad_norm": 1.5939382314682007, + "learning_rate": 3.732297247122474e-05, + "loss": 0.588, + "step": 86040 + }, + { + "epoch": 0.7607100549868279, + "grad_norm": 3.9748353958129883, + "learning_rate": 3.732149908355287e-05, + "loss": 0.7642, + "step": 86050 + }, + { + "epoch": 0.7607984582471402, + "grad_norm": 1.7719999551773071, + "learning_rate": 3.7320025695881e-05, + "loss": 0.7279, + "step": 86060 + }, + { + "epoch": 0.7608868615074524, + "grad_norm": 2.2005772590637207, + "learning_rate": 3.731855230820913e-05, + "loss": 0.5966, + "step": 86070 + }, + { + "epoch": 0.7609752647677647, + "grad_norm": 13.298222541809082, + "learning_rate": 3.7317078920537256e-05, + "loss": 0.6331, + "step": 86080 + }, + { + "epoch": 0.7610636680280769, + "grad_norm": 9.629752159118652, + "learning_rate": 3.731560553286539e-05, + "loss": 0.7111, + "step": 86090 + }, + { + "epoch": 0.7611520712883891, + "grad_norm": 10.915865898132324, + "learning_rate": 3.731413214519351e-05, + "loss": 0.6354, + "step": 86100 + }, + { + "epoch": 0.7612404745487014, + "grad_norm": 9.142168045043945, + "learning_rate": 3.731265875752165e-05, + "loss": 0.6389, + "step": 86110 + }, + { + "epoch": 0.7613288778090136, + "grad_norm": 2.2176826000213623, + "learning_rate": 3.7311185369849776e-05, + "loss": 0.7594, + "step": 86120 + }, + { + "epoch": 0.7614172810693258, + "grad_norm": 3.262695789337158, + "learning_rate": 3.7309711982177904e-05, + "loss": 0.6558, + "step": 86130 + }, + { + "epoch": 0.761505684329638, + "grad_norm": 11.032332420349121, + "learning_rate": 3.730823859450603e-05, + "loss": 0.5876, + "step": 86140 + }, + { + "epoch": 0.7615940875899503, + "grad_norm": 1.451194167137146, + "learning_rate": 3.730676520683417e-05, + "loss": 0.6445, + "step": 86150 + }, + { + "epoch": 0.7616824908502625, + "grad_norm": 1.378336787223816, + "learning_rate": 3.730529181916229e-05, + "loss": 0.5879, + "step": 86160 + }, + { + "epoch": 0.7617708941105747, + "grad_norm": 7.298768520355225, + "learning_rate": 3.7303818431490424e-05, + "loss": 0.6221, + "step": 86170 + }, + { + "epoch": 0.7618592973708871, + "grad_norm": 2.8144688606262207, + "learning_rate": 3.7302345043818546e-05, + "loss": 0.7999, + "step": 86180 + }, + { + "epoch": 0.7619477006311993, + "grad_norm": 2.1327362060546875, + "learning_rate": 3.730087165614668e-05, + "loss": 0.7028, + "step": 86190 + }, + { + "epoch": 0.7620361038915116, + "grad_norm": 9.01235294342041, + "learning_rate": 3.729939826847481e-05, + "loss": 0.6731, + "step": 86200 + }, + { + "epoch": 0.7621245071518238, + "grad_norm": 0.987602174282074, + "learning_rate": 3.729792488080294e-05, + "loss": 0.6702, + "step": 86210 + }, + { + "epoch": 0.762212910412136, + "grad_norm": 1.9932714700698853, + "learning_rate": 3.7296451493131066e-05, + "loss": 0.5742, + "step": 86220 + }, + { + "epoch": 0.7623013136724482, + "grad_norm": 2.0963876247406006, + "learning_rate": 3.72949781054592e-05, + "loss": 0.6345, + "step": 86230 + }, + { + "epoch": 0.7623897169327605, + "grad_norm": 3.216031312942505, + "learning_rate": 3.729350471778732e-05, + "loss": 0.7798, + "step": 86240 + }, + { + "epoch": 0.7624781201930727, + "grad_norm": 2.305607795715332, + "learning_rate": 3.729203133011546e-05, + "loss": 0.6234, + "step": 86250 + }, + { + "epoch": 0.7625665234533849, + "grad_norm": 1.9500572681427002, + "learning_rate": 3.7290557942443586e-05, + "loss": 0.597, + "step": 86260 + }, + { + "epoch": 0.7626549267136972, + "grad_norm": 0.8851110935211182, + "learning_rate": 3.7289084554771715e-05, + "loss": 0.5644, + "step": 86270 + }, + { + "epoch": 0.7627433299740094, + "grad_norm": 3.3796939849853516, + "learning_rate": 3.728761116709984e-05, + "loss": 0.7794, + "step": 86280 + }, + { + "epoch": 0.7628317332343216, + "grad_norm": 2.7242753505706787, + "learning_rate": 3.728613777942798e-05, + "loss": 0.6771, + "step": 86290 + }, + { + "epoch": 0.762920136494634, + "grad_norm": 1.906731128692627, + "learning_rate": 3.72846643917561e-05, + "loss": 0.5904, + "step": 86300 + }, + { + "epoch": 0.7630085397549462, + "grad_norm": 2.6325597763061523, + "learning_rate": 3.7283191004084235e-05, + "loss": 0.7392, + "step": 86310 + }, + { + "epoch": 0.7630969430152584, + "grad_norm": 3.9864730834960938, + "learning_rate": 3.7281717616412356e-05, + "loss": 0.5748, + "step": 86320 + }, + { + "epoch": 0.7631853462755707, + "grad_norm": 1.0226085186004639, + "learning_rate": 3.728024422874049e-05, + "loss": 0.6245, + "step": 86330 + }, + { + "epoch": 0.7632737495358829, + "grad_norm": 2.6433935165405273, + "learning_rate": 3.727877084106862e-05, + "loss": 0.601, + "step": 86340 + }, + { + "epoch": 0.7633621527961951, + "grad_norm": 7.856193542480469, + "learning_rate": 3.727729745339675e-05, + "loss": 0.8347, + "step": 86350 + }, + { + "epoch": 0.7634505560565074, + "grad_norm": 2.9784162044525146, + "learning_rate": 3.7275824065724877e-05, + "loss": 0.5236, + "step": 86360 + }, + { + "epoch": 0.7635389593168196, + "grad_norm": 3.378016710281372, + "learning_rate": 3.727435067805301e-05, + "loss": 0.6383, + "step": 86370 + }, + { + "epoch": 0.7636273625771318, + "grad_norm": 1.6524105072021484, + "learning_rate": 3.727287729038113e-05, + "loss": 0.6702, + "step": 86380 + }, + { + "epoch": 0.763715765837444, + "grad_norm": 6.8003435134887695, + "learning_rate": 3.727140390270927e-05, + "loss": 0.697, + "step": 86390 + }, + { + "epoch": 0.7638041690977563, + "grad_norm": 1.075705647468567, + "learning_rate": 3.72699305150374e-05, + "loss": 0.6677, + "step": 86400 + }, + { + "epoch": 0.7638925723580685, + "grad_norm": 4.592240810394287, + "learning_rate": 3.7268457127365525e-05, + "loss": 0.6444, + "step": 86410 + }, + { + "epoch": 0.7639809756183809, + "grad_norm": 1.935365080833435, + "learning_rate": 3.7266983739693653e-05, + "loss": 0.677, + "step": 86420 + }, + { + "epoch": 0.7640693788786931, + "grad_norm": 1.6860672235488892, + "learning_rate": 3.726551035202178e-05, + "loss": 0.5884, + "step": 86430 + }, + { + "epoch": 0.7641577821390053, + "grad_norm": 1.514349102973938, + "learning_rate": 3.726403696434991e-05, + "loss": 0.7351, + "step": 86440 + }, + { + "epoch": 0.7642461853993175, + "grad_norm": 1.893250584602356, + "learning_rate": 3.7262563576678045e-05, + "loss": 0.7859, + "step": 86450 + }, + { + "epoch": 0.7643345886596298, + "grad_norm": 6.0112385749816895, + "learning_rate": 3.7261090189006174e-05, + "loss": 0.5757, + "step": 86460 + }, + { + "epoch": 0.764422991919942, + "grad_norm": 7.275670528411865, + "learning_rate": 3.72596168013343e-05, + "loss": 0.6174, + "step": 86470 + }, + { + "epoch": 0.7645113951802542, + "grad_norm": 3.070352077484131, + "learning_rate": 3.725814341366243e-05, + "loss": 0.6087, + "step": 86480 + }, + { + "epoch": 0.7645997984405665, + "grad_norm": 3.509242296218872, + "learning_rate": 3.725667002599056e-05, + "loss": 0.5485, + "step": 86490 + }, + { + "epoch": 0.7646882017008787, + "grad_norm": 9.614588737487793, + "learning_rate": 3.725519663831869e-05, + "loss": 0.6283, + "step": 86500 + }, + { + "epoch": 0.7647766049611909, + "grad_norm": 3.4603703022003174, + "learning_rate": 3.725372325064682e-05, + "loss": 0.6028, + "step": 86510 + }, + { + "epoch": 0.7648650082215032, + "grad_norm": 4.604862689971924, + "learning_rate": 3.725224986297495e-05, + "loss": 0.6593, + "step": 86520 + }, + { + "epoch": 0.7649534114818155, + "grad_norm": 2.411860466003418, + "learning_rate": 3.725077647530308e-05, + "loss": 0.6644, + "step": 86530 + }, + { + "epoch": 0.7650418147421277, + "grad_norm": 6.114974021911621, + "learning_rate": 3.724930308763121e-05, + "loss": 0.665, + "step": 86540 + }, + { + "epoch": 0.76513021800244, + "grad_norm": 1.5121123790740967, + "learning_rate": 3.7247829699959336e-05, + "loss": 0.7448, + "step": 86550 + }, + { + "epoch": 0.7652186212627522, + "grad_norm": 13.443902969360352, + "learning_rate": 3.7246356312287464e-05, + "loss": 0.6595, + "step": 86560 + }, + { + "epoch": 0.7653070245230644, + "grad_norm": 10.518452644348145, + "learning_rate": 3.724488292461559e-05, + "loss": 0.6494, + "step": 86570 + }, + { + "epoch": 0.7653954277833767, + "grad_norm": 3.0383522510528564, + "learning_rate": 3.724340953694373e-05, + "loss": 0.7022, + "step": 86580 + }, + { + "epoch": 0.7654838310436889, + "grad_norm": 0.9191209673881531, + "learning_rate": 3.7241936149271856e-05, + "loss": 0.5543, + "step": 86590 + }, + { + "epoch": 0.7655722343040011, + "grad_norm": 2.0296833515167236, + "learning_rate": 3.7240462761599984e-05, + "loss": 0.6574, + "step": 86600 + }, + { + "epoch": 0.7656606375643134, + "grad_norm": 4.197606086730957, + "learning_rate": 3.723898937392811e-05, + "loss": 0.6363, + "step": 86610 + }, + { + "epoch": 0.7657490408246256, + "grad_norm": 2.9814839363098145, + "learning_rate": 3.723751598625624e-05, + "loss": 0.7775, + "step": 86620 + }, + { + "epoch": 0.7658374440849378, + "grad_norm": 1.5551029443740845, + "learning_rate": 3.723604259858437e-05, + "loss": 0.6589, + "step": 86630 + }, + { + "epoch": 0.76592584734525, + "grad_norm": 2.176035165786743, + "learning_rate": 3.7234569210912504e-05, + "loss": 0.6282, + "step": 86640 + }, + { + "epoch": 0.7660142506055624, + "grad_norm": 7.538548469543457, + "learning_rate": 3.7233095823240626e-05, + "loss": 0.7455, + "step": 86650 + }, + { + "epoch": 0.7661026538658746, + "grad_norm": 2.308427572250366, + "learning_rate": 3.723162243556876e-05, + "loss": 0.5778, + "step": 86660 + }, + { + "epoch": 0.7661910571261868, + "grad_norm": 3.6587073802948, + "learning_rate": 3.723014904789689e-05, + "loss": 0.5609, + "step": 86670 + }, + { + "epoch": 0.7662794603864991, + "grad_norm": 6.131343364715576, + "learning_rate": 3.722867566022502e-05, + "loss": 0.7635, + "step": 86680 + }, + { + "epoch": 0.7663678636468113, + "grad_norm": 2.838127374649048, + "learning_rate": 3.7227202272553146e-05, + "loss": 0.6712, + "step": 86690 + }, + { + "epoch": 0.7664562669071235, + "grad_norm": 4.57146692276001, + "learning_rate": 3.722572888488128e-05, + "loss": 0.5437, + "step": 86700 + }, + { + "epoch": 0.7665446701674358, + "grad_norm": 1.599280595779419, + "learning_rate": 3.72242554972094e-05, + "loss": 0.7483, + "step": 86710 + }, + { + "epoch": 0.766633073427748, + "grad_norm": 3.5061333179473877, + "learning_rate": 3.722278210953754e-05, + "loss": 0.6974, + "step": 86720 + }, + { + "epoch": 0.7667214766880602, + "grad_norm": 1.9312424659729004, + "learning_rate": 3.7221308721865666e-05, + "loss": 0.7671, + "step": 86730 + }, + { + "epoch": 0.7668098799483725, + "grad_norm": 11.15473461151123, + "learning_rate": 3.7219835334193795e-05, + "loss": 0.5768, + "step": 86740 + }, + { + "epoch": 0.7668982832086847, + "grad_norm": 1.9385086297988892, + "learning_rate": 3.721836194652192e-05, + "loss": 0.5882, + "step": 86750 + }, + { + "epoch": 0.7669866864689969, + "grad_norm": 2.1111316680908203, + "learning_rate": 3.721688855885006e-05, + "loss": 0.6912, + "step": 86760 + }, + { + "epoch": 0.7670750897293093, + "grad_norm": 6.412662506103516, + "learning_rate": 3.721541517117818e-05, + "loss": 0.6348, + "step": 86770 + }, + { + "epoch": 0.7671634929896215, + "grad_norm": 2.2824347019195557, + "learning_rate": 3.7213941783506315e-05, + "loss": 0.6453, + "step": 86780 + }, + { + "epoch": 0.7672518962499337, + "grad_norm": 4.151925086975098, + "learning_rate": 3.7212468395834436e-05, + "loss": 0.688, + "step": 86790 + }, + { + "epoch": 0.767340299510246, + "grad_norm": 1.4501419067382812, + "learning_rate": 3.721099500816257e-05, + "loss": 0.7009, + "step": 86800 + }, + { + "epoch": 0.7674287027705582, + "grad_norm": 5.784538269042969, + "learning_rate": 3.72095216204907e-05, + "loss": 0.6477, + "step": 86810 + }, + { + "epoch": 0.7675171060308704, + "grad_norm": 3.613633632659912, + "learning_rate": 3.720804823281883e-05, + "loss": 0.5474, + "step": 86820 + }, + { + "epoch": 0.7676055092911827, + "grad_norm": 1.507540225982666, + "learning_rate": 3.7206574845146957e-05, + "loss": 0.5756, + "step": 86830 + }, + { + "epoch": 0.7676939125514949, + "grad_norm": 2.6359832286834717, + "learning_rate": 3.720510145747509e-05, + "loss": 0.6383, + "step": 86840 + }, + { + "epoch": 0.7677823158118071, + "grad_norm": 5.860042095184326, + "learning_rate": 3.720362806980321e-05, + "loss": 0.6709, + "step": 86850 + }, + { + "epoch": 0.7678707190721193, + "grad_norm": 4.933174133300781, + "learning_rate": 3.720215468213135e-05, + "loss": 0.7577, + "step": 86860 + }, + { + "epoch": 0.7679591223324316, + "grad_norm": 4.768065929412842, + "learning_rate": 3.720068129445948e-05, + "loss": 0.5741, + "step": 86870 + }, + { + "epoch": 0.7680475255927438, + "grad_norm": 3.4530088901519775, + "learning_rate": 3.7199207906787605e-05, + "loss": 0.692, + "step": 86880 + }, + { + "epoch": 0.7681359288530561, + "grad_norm": 2.5492825508117676, + "learning_rate": 3.7197734519115734e-05, + "loss": 0.6823, + "step": 86890 + }, + { + "epoch": 0.7682243321133684, + "grad_norm": 4.962037563323975, + "learning_rate": 3.719626113144386e-05, + "loss": 0.6342, + "step": 86900 + }, + { + "epoch": 0.7683127353736806, + "grad_norm": 4.782608509063721, + "learning_rate": 3.719478774377199e-05, + "loss": 0.5894, + "step": 86910 + }, + { + "epoch": 0.7684011386339928, + "grad_norm": 12.531340599060059, + "learning_rate": 3.7193314356100125e-05, + "loss": 0.729, + "step": 86920 + }, + { + "epoch": 0.7684895418943051, + "grad_norm": 3.969730854034424, + "learning_rate": 3.719184096842825e-05, + "loss": 0.6603, + "step": 86930 + }, + { + "epoch": 0.7685779451546173, + "grad_norm": 6.431152820587158, + "learning_rate": 3.719036758075638e-05, + "loss": 0.6721, + "step": 86940 + }, + { + "epoch": 0.7686663484149295, + "grad_norm": 1.359704852104187, + "learning_rate": 3.718889419308451e-05, + "loss": 0.6226, + "step": 86950 + }, + { + "epoch": 0.7687547516752418, + "grad_norm": 1.1664873361587524, + "learning_rate": 3.718742080541264e-05, + "loss": 0.5415, + "step": 86960 + }, + { + "epoch": 0.768843154935554, + "grad_norm": 5.02387809753418, + "learning_rate": 3.718594741774077e-05, + "loss": 0.5928, + "step": 86970 + }, + { + "epoch": 0.7689315581958662, + "grad_norm": 1.8267531394958496, + "learning_rate": 3.71844740300689e-05, + "loss": 0.535, + "step": 86980 + }, + { + "epoch": 0.7690199614561785, + "grad_norm": 1.6625699996948242, + "learning_rate": 3.7183000642397024e-05, + "loss": 0.721, + "step": 86990 + }, + { + "epoch": 0.7691083647164907, + "grad_norm": 0.972973108291626, + "learning_rate": 3.718152725472516e-05, + "loss": 0.6983, + "step": 87000 + }, + { + "epoch": 0.769196767976803, + "grad_norm": 8.324231147766113, + "learning_rate": 3.718005386705328e-05, + "loss": 0.6716, + "step": 87010 + }, + { + "epoch": 0.7692851712371153, + "grad_norm": 2.3789448738098145, + "learning_rate": 3.7178580479381416e-05, + "loss": 0.5998, + "step": 87020 + }, + { + "epoch": 0.7693735744974275, + "grad_norm": 2.331791877746582, + "learning_rate": 3.7177107091709544e-05, + "loss": 0.716, + "step": 87030 + }, + { + "epoch": 0.7694619777577397, + "grad_norm": 7.855481147766113, + "learning_rate": 3.717563370403767e-05, + "loss": 0.7022, + "step": 87040 + }, + { + "epoch": 0.769550381018052, + "grad_norm": 4.423086166381836, + "learning_rate": 3.71741603163658e-05, + "loss": 0.6298, + "step": 87050 + }, + { + "epoch": 0.7696387842783642, + "grad_norm": 2.639465808868408, + "learning_rate": 3.7172686928693936e-05, + "loss": 0.7328, + "step": 87060 + }, + { + "epoch": 0.7697271875386764, + "grad_norm": 6.835954189300537, + "learning_rate": 3.717121354102206e-05, + "loss": 0.6145, + "step": 87070 + }, + { + "epoch": 0.7698155907989886, + "grad_norm": 3.1799473762512207, + "learning_rate": 3.716974015335019e-05, + "loss": 0.6323, + "step": 87080 + }, + { + "epoch": 0.7699039940593009, + "grad_norm": 1.1374777555465698, + "learning_rate": 3.716826676567832e-05, + "loss": 0.6963, + "step": 87090 + }, + { + "epoch": 0.7699923973196131, + "grad_norm": 9.492671012878418, + "learning_rate": 3.716679337800645e-05, + "loss": 0.643, + "step": 87100 + }, + { + "epoch": 0.7700808005799253, + "grad_norm": 2.9431264400482178, + "learning_rate": 3.716531999033458e-05, + "loss": 0.6186, + "step": 87110 + }, + { + "epoch": 0.7701692038402377, + "grad_norm": 8.265769004821777, + "learning_rate": 3.7163846602662706e-05, + "loss": 0.7168, + "step": 87120 + }, + { + "epoch": 0.7702576071005499, + "grad_norm": 5.6462788581848145, + "learning_rate": 3.7162373214990834e-05, + "loss": 0.6878, + "step": 87130 + }, + { + "epoch": 0.7703460103608621, + "grad_norm": 1.2095264196395874, + "learning_rate": 3.716089982731897e-05, + "loss": 0.6237, + "step": 87140 + }, + { + "epoch": 0.7704344136211744, + "grad_norm": 1.564755916595459, + "learning_rate": 3.715942643964709e-05, + "loss": 0.6208, + "step": 87150 + }, + { + "epoch": 0.7705228168814866, + "grad_norm": 8.673503875732422, + "learning_rate": 3.7157953051975226e-05, + "loss": 0.6998, + "step": 87160 + }, + { + "epoch": 0.7706112201417988, + "grad_norm": 4.018807888031006, + "learning_rate": 3.7156479664303355e-05, + "loss": 0.8663, + "step": 87170 + }, + { + "epoch": 0.7706996234021111, + "grad_norm": 3.4271037578582764, + "learning_rate": 3.715500627663148e-05, + "loss": 0.5504, + "step": 87180 + }, + { + "epoch": 0.7707880266624233, + "grad_norm": 2.202488899230957, + "learning_rate": 3.715353288895961e-05, + "loss": 0.6589, + "step": 87190 + }, + { + "epoch": 0.7708764299227355, + "grad_norm": 0.8874525427818298, + "learning_rate": 3.7152059501287746e-05, + "loss": 0.8064, + "step": 87200 + }, + { + "epoch": 0.7709648331830478, + "grad_norm": 5.62076473236084, + "learning_rate": 3.715058611361587e-05, + "loss": 0.7078, + "step": 87210 + }, + { + "epoch": 0.77105323644336, + "grad_norm": 3.08376407623291, + "learning_rate": 3.7149112725944e-05, + "loss": 0.6404, + "step": 87220 + }, + { + "epoch": 0.7711416397036722, + "grad_norm": 2.968723773956299, + "learning_rate": 3.714763933827213e-05, + "loss": 0.741, + "step": 87230 + }, + { + "epoch": 0.7712300429639846, + "grad_norm": 3.1863155364990234, + "learning_rate": 3.714616595060026e-05, + "loss": 0.6871, + "step": 87240 + }, + { + "epoch": 0.7713184462242968, + "grad_norm": 2.1257872581481934, + "learning_rate": 3.714469256292839e-05, + "loss": 0.6134, + "step": 87250 + }, + { + "epoch": 0.771406849484609, + "grad_norm": 3.2810986042022705, + "learning_rate": 3.7143219175256516e-05, + "loss": 0.5887, + "step": 87260 + }, + { + "epoch": 0.7714952527449213, + "grad_norm": 3.897430419921875, + "learning_rate": 3.7141745787584645e-05, + "loss": 0.5798, + "step": 87270 + }, + { + "epoch": 0.7715836560052335, + "grad_norm": 1.7576478719711304, + "learning_rate": 3.714027239991278e-05, + "loss": 0.6754, + "step": 87280 + }, + { + "epoch": 0.7716720592655457, + "grad_norm": 1.6312427520751953, + "learning_rate": 3.71387990122409e-05, + "loss": 0.5702, + "step": 87290 + }, + { + "epoch": 0.771760462525858, + "grad_norm": 6.856921672821045, + "learning_rate": 3.713732562456904e-05, + "loss": 0.6354, + "step": 87300 + }, + { + "epoch": 0.7718488657861702, + "grad_norm": 2.8430116176605225, + "learning_rate": 3.7135852236897165e-05, + "loss": 0.6825, + "step": 87310 + }, + { + "epoch": 0.7719372690464824, + "grad_norm": 1.7134850025177002, + "learning_rate": 3.7134378849225293e-05, + "loss": 0.6339, + "step": 87320 + }, + { + "epoch": 0.7720256723067946, + "grad_norm": 2.7168121337890625, + "learning_rate": 3.713290546155342e-05, + "loss": 0.648, + "step": 87330 + }, + { + "epoch": 0.7721140755671069, + "grad_norm": 3.56308650970459, + "learning_rate": 3.713143207388156e-05, + "loss": 0.7647, + "step": 87340 + }, + { + "epoch": 0.7722024788274191, + "grad_norm": 2.641613721847534, + "learning_rate": 3.712995868620968e-05, + "loss": 0.6659, + "step": 87350 + }, + { + "epoch": 0.7722908820877314, + "grad_norm": 4.132381916046143, + "learning_rate": 3.7128485298537814e-05, + "loss": 0.785, + "step": 87360 + }, + { + "epoch": 0.7723792853480437, + "grad_norm": 4.496201992034912, + "learning_rate": 3.712701191086594e-05, + "loss": 0.7201, + "step": 87370 + }, + { + "epoch": 0.7724676886083559, + "grad_norm": 2.0849084854125977, + "learning_rate": 3.712553852319407e-05, + "loss": 0.6724, + "step": 87380 + }, + { + "epoch": 0.7725560918686681, + "grad_norm": 2.197014331817627, + "learning_rate": 3.71240651355222e-05, + "loss": 0.6605, + "step": 87390 + }, + { + "epoch": 0.7726444951289804, + "grad_norm": 3.115480422973633, + "learning_rate": 3.712259174785033e-05, + "loss": 0.6514, + "step": 87400 + }, + { + "epoch": 0.7727328983892926, + "grad_norm": 3.5521392822265625, + "learning_rate": 3.7121118360178455e-05, + "loss": 0.5559, + "step": 87410 + }, + { + "epoch": 0.7728213016496048, + "grad_norm": 3.6806702613830566, + "learning_rate": 3.711964497250659e-05, + "loss": 0.6868, + "step": 87420 + }, + { + "epoch": 0.7729097049099171, + "grad_norm": 2.708115339279175, + "learning_rate": 3.711817158483472e-05, + "loss": 0.6085, + "step": 87430 + }, + { + "epoch": 0.7729981081702293, + "grad_norm": 3.2188241481781006, + "learning_rate": 3.711669819716285e-05, + "loss": 0.5919, + "step": 87440 + }, + { + "epoch": 0.7730865114305415, + "grad_norm": 9.90634536743164, + "learning_rate": 3.7115224809490976e-05, + "loss": 0.6776, + "step": 87450 + }, + { + "epoch": 0.7731749146908538, + "grad_norm": 8.803030967712402, + "learning_rate": 3.7113751421819104e-05, + "loss": 0.6571, + "step": 87460 + }, + { + "epoch": 0.773263317951166, + "grad_norm": 5.876282691955566, + "learning_rate": 3.711227803414723e-05, + "loss": 0.7332, + "step": 87470 + }, + { + "epoch": 0.7733517212114783, + "grad_norm": 1.35496187210083, + "learning_rate": 3.711080464647536e-05, + "loss": 0.6929, + "step": 87480 + }, + { + "epoch": 0.7734401244717906, + "grad_norm": 2.2722551822662354, + "learning_rate": 3.7109331258803496e-05, + "loss": 0.6115, + "step": 87490 + }, + { + "epoch": 0.7735285277321028, + "grad_norm": 2.5263278484344482, + "learning_rate": 3.7107857871131624e-05, + "loss": 0.703, + "step": 87500 + }, + { + "epoch": 0.773616930992415, + "grad_norm": 2.4253995418548584, + "learning_rate": 3.710638448345975e-05, + "loss": 0.6902, + "step": 87510 + }, + { + "epoch": 0.7737053342527273, + "grad_norm": 4.172402858734131, + "learning_rate": 3.710491109578788e-05, + "loss": 0.7145, + "step": 87520 + }, + { + "epoch": 0.7737937375130395, + "grad_norm": 1.2479969263076782, + "learning_rate": 3.710343770811601e-05, + "loss": 0.5595, + "step": 87530 + }, + { + "epoch": 0.7738821407733517, + "grad_norm": 1.5717514753341675, + "learning_rate": 3.710196432044414e-05, + "loss": 0.6057, + "step": 87540 + }, + { + "epoch": 0.7739705440336639, + "grad_norm": 3.5647497177124023, + "learning_rate": 3.710049093277227e-05, + "loss": 0.6811, + "step": 87550 + }, + { + "epoch": 0.7740589472939762, + "grad_norm": 2.8100178241729736, + "learning_rate": 3.70990175451004e-05, + "loss": 0.6874, + "step": 87560 + }, + { + "epoch": 0.7741473505542884, + "grad_norm": 1.2704204320907593, + "learning_rate": 3.709754415742853e-05, + "loss": 0.5757, + "step": 87570 + }, + { + "epoch": 0.7742357538146006, + "grad_norm": 12.480536460876465, + "learning_rate": 3.709607076975666e-05, + "loss": 0.6589, + "step": 87580 + }, + { + "epoch": 0.774324157074913, + "grad_norm": 1.3453384637832642, + "learning_rate": 3.7094597382084786e-05, + "loss": 0.6775, + "step": 87590 + }, + { + "epoch": 0.7744125603352252, + "grad_norm": 5.988736152648926, + "learning_rate": 3.7093123994412914e-05, + "loss": 0.7738, + "step": 87600 + }, + { + "epoch": 0.7745009635955374, + "grad_norm": 3.0035417079925537, + "learning_rate": 3.709165060674105e-05, + "loss": 0.6694, + "step": 87610 + }, + { + "epoch": 0.7745893668558497, + "grad_norm": 4.275324821472168, + "learning_rate": 3.709017721906917e-05, + "loss": 0.6089, + "step": 87620 + }, + { + "epoch": 0.7746777701161619, + "grad_norm": 2.202371120452881, + "learning_rate": 3.7088703831397306e-05, + "loss": 0.7254, + "step": 87630 + }, + { + "epoch": 0.7747661733764741, + "grad_norm": 6.4135212898254395, + "learning_rate": 3.7087230443725435e-05, + "loss": 0.6861, + "step": 87640 + }, + { + "epoch": 0.7748545766367864, + "grad_norm": 1.285910964012146, + "learning_rate": 3.708575705605356e-05, + "loss": 0.6832, + "step": 87650 + }, + { + "epoch": 0.7749429798970986, + "grad_norm": 4.480310916900635, + "learning_rate": 3.708428366838169e-05, + "loss": 0.6222, + "step": 87660 + }, + { + "epoch": 0.7750313831574108, + "grad_norm": 5.771284580230713, + "learning_rate": 3.7082810280709826e-05, + "loss": 0.5067, + "step": 87670 + }, + { + "epoch": 0.7751197864177231, + "grad_norm": 9.911836624145508, + "learning_rate": 3.708133689303795e-05, + "loss": 0.6686, + "step": 87680 + }, + { + "epoch": 0.7752081896780353, + "grad_norm": 1.3472927808761597, + "learning_rate": 3.707986350536608e-05, + "loss": 0.6423, + "step": 87690 + }, + { + "epoch": 0.7752965929383475, + "grad_norm": 1.7603410482406616, + "learning_rate": 3.707839011769421e-05, + "loss": 0.7479, + "step": 87700 + }, + { + "epoch": 0.7753849961986599, + "grad_norm": 5.905195713043213, + "learning_rate": 3.707691673002234e-05, + "loss": 0.6705, + "step": 87710 + }, + { + "epoch": 0.7754733994589721, + "grad_norm": 4.155801296234131, + "learning_rate": 3.707544334235047e-05, + "loss": 0.7596, + "step": 87720 + }, + { + "epoch": 0.7755618027192843, + "grad_norm": 3.1507489681243896, + "learning_rate": 3.7073969954678597e-05, + "loss": 0.6666, + "step": 87730 + }, + { + "epoch": 0.7756502059795966, + "grad_norm": 1.6494165658950806, + "learning_rate": 3.7072496567006725e-05, + "loss": 0.6231, + "step": 87740 + }, + { + "epoch": 0.7757386092399088, + "grad_norm": 2.3226284980773926, + "learning_rate": 3.707102317933486e-05, + "loss": 0.765, + "step": 87750 + }, + { + "epoch": 0.775827012500221, + "grad_norm": 6.424279689788818, + "learning_rate": 3.706954979166298e-05, + "loss": 0.6071, + "step": 87760 + }, + { + "epoch": 0.7759154157605332, + "grad_norm": 3.302466869354248, + "learning_rate": 3.706807640399112e-05, + "loss": 0.736, + "step": 87770 + }, + { + "epoch": 0.7760038190208455, + "grad_norm": 1.4127602577209473, + "learning_rate": 3.7066603016319245e-05, + "loss": 0.6367, + "step": 87780 + }, + { + "epoch": 0.7760922222811577, + "grad_norm": 4.607992649078369, + "learning_rate": 3.7065129628647373e-05, + "loss": 0.5764, + "step": 87790 + }, + { + "epoch": 0.7761806255414699, + "grad_norm": 2.244208574295044, + "learning_rate": 3.70636562409755e-05, + "loss": 0.6067, + "step": 87800 + }, + { + "epoch": 0.7762690288017822, + "grad_norm": 2.9210565090179443, + "learning_rate": 3.706218285330364e-05, + "loss": 0.6236, + "step": 87810 + }, + { + "epoch": 0.7763574320620944, + "grad_norm": 0.6639277935028076, + "learning_rate": 3.706070946563176e-05, + "loss": 0.7391, + "step": 87820 + }, + { + "epoch": 0.7764458353224067, + "grad_norm": 4.9028520584106445, + "learning_rate": 3.7059236077959894e-05, + "loss": 0.6178, + "step": 87830 + }, + { + "epoch": 0.776534238582719, + "grad_norm": 6.615703105926514, + "learning_rate": 3.7057762690288015e-05, + "loss": 0.7192, + "step": 87840 + }, + { + "epoch": 0.7766226418430312, + "grad_norm": 3.8918676376342773, + "learning_rate": 3.705628930261615e-05, + "loss": 0.6397, + "step": 87850 + }, + { + "epoch": 0.7767110451033434, + "grad_norm": 2.080568313598633, + "learning_rate": 3.705481591494428e-05, + "loss": 0.7087, + "step": 87860 + }, + { + "epoch": 0.7767994483636557, + "grad_norm": 3.0624096393585205, + "learning_rate": 3.705334252727241e-05, + "loss": 0.6014, + "step": 87870 + }, + { + "epoch": 0.7768878516239679, + "grad_norm": 4.928555011749268, + "learning_rate": 3.7051869139600535e-05, + "loss": 0.5826, + "step": 87880 + }, + { + "epoch": 0.7769762548842801, + "grad_norm": 2.02699613571167, + "learning_rate": 3.705039575192867e-05, + "loss": 0.6875, + "step": 87890 + }, + { + "epoch": 0.7770646581445924, + "grad_norm": 4.903561115264893, + "learning_rate": 3.704892236425679e-05, + "loss": 0.619, + "step": 87900 + }, + { + "epoch": 0.7771530614049046, + "grad_norm": 1.723884105682373, + "learning_rate": 3.704744897658493e-05, + "loss": 0.5838, + "step": 87910 + }, + { + "epoch": 0.7772414646652168, + "grad_norm": 6.695972442626953, + "learning_rate": 3.7045975588913056e-05, + "loss": 0.7628, + "step": 87920 + }, + { + "epoch": 0.777329867925529, + "grad_norm": 2.024143934249878, + "learning_rate": 3.7044502201241184e-05, + "loss": 0.6228, + "step": 87930 + }, + { + "epoch": 0.7774182711858413, + "grad_norm": 2.132025718688965, + "learning_rate": 3.704302881356931e-05, + "loss": 0.731, + "step": 87940 + }, + { + "epoch": 0.7775066744461536, + "grad_norm": 7.058281898498535, + "learning_rate": 3.704155542589744e-05, + "loss": 0.6744, + "step": 87950 + }, + { + "epoch": 0.7775950777064659, + "grad_norm": 1.7809114456176758, + "learning_rate": 3.704008203822557e-05, + "loss": 0.6232, + "step": 87960 + }, + { + "epoch": 0.7776834809667781, + "grad_norm": 3.7689127922058105, + "learning_rate": 3.7038608650553704e-05, + "loss": 0.6386, + "step": 87970 + }, + { + "epoch": 0.7777718842270903, + "grad_norm": 5.322371006011963, + "learning_rate": 3.7037135262881826e-05, + "loss": 0.7953, + "step": 87980 + }, + { + "epoch": 0.7778602874874025, + "grad_norm": 1.9489524364471436, + "learning_rate": 3.703566187520996e-05, + "loss": 0.6845, + "step": 87990 + }, + { + "epoch": 0.7779486907477148, + "grad_norm": 4.435844421386719, + "learning_rate": 3.703418848753809e-05, + "loss": 0.7268, + "step": 88000 + }, + { + "epoch": 0.778037094008027, + "grad_norm": 5.996851921081543, + "learning_rate": 3.703271509986622e-05, + "loss": 0.5489, + "step": 88010 + }, + { + "epoch": 0.7781254972683392, + "grad_norm": 3.0234906673431396, + "learning_rate": 3.7031241712194346e-05, + "loss": 0.5764, + "step": 88020 + }, + { + "epoch": 0.7782139005286515, + "grad_norm": 3.921184778213501, + "learning_rate": 3.702976832452248e-05, + "loss": 0.5819, + "step": 88030 + }, + { + "epoch": 0.7783023037889637, + "grad_norm": 14.208062171936035, + "learning_rate": 3.70282949368506e-05, + "loss": 0.5787, + "step": 88040 + }, + { + "epoch": 0.7783907070492759, + "grad_norm": 2.463609218597412, + "learning_rate": 3.702682154917874e-05, + "loss": 0.7069, + "step": 88050 + }, + { + "epoch": 0.7784791103095882, + "grad_norm": 1.9219471216201782, + "learning_rate": 3.7025348161506866e-05, + "loss": 0.6457, + "step": 88060 + }, + { + "epoch": 0.7785675135699005, + "grad_norm": 3.2396671772003174, + "learning_rate": 3.7023874773834994e-05, + "loss": 0.6789, + "step": 88070 + }, + { + "epoch": 0.7786559168302127, + "grad_norm": 3.904404640197754, + "learning_rate": 3.702240138616312e-05, + "loss": 0.7621, + "step": 88080 + }, + { + "epoch": 0.778744320090525, + "grad_norm": 4.983608245849609, + "learning_rate": 3.702092799849125e-05, + "loss": 0.6982, + "step": 88090 + }, + { + "epoch": 0.7788327233508372, + "grad_norm": 3.2792069911956787, + "learning_rate": 3.701945461081938e-05, + "loss": 0.6748, + "step": 88100 + }, + { + "epoch": 0.7789211266111494, + "grad_norm": 2.070518970489502, + "learning_rate": 3.7017981223147515e-05, + "loss": 0.6089, + "step": 88110 + }, + { + "epoch": 0.7790095298714617, + "grad_norm": 2.015944719314575, + "learning_rate": 3.7016507835475636e-05, + "loss": 0.6293, + "step": 88120 + }, + { + "epoch": 0.7790979331317739, + "grad_norm": 3.026904821395874, + "learning_rate": 3.701503444780377e-05, + "loss": 0.7415, + "step": 88130 + }, + { + "epoch": 0.7791863363920861, + "grad_norm": 1.4083503484725952, + "learning_rate": 3.70135610601319e-05, + "loss": 0.6926, + "step": 88140 + }, + { + "epoch": 0.7792747396523984, + "grad_norm": 4.103747844696045, + "learning_rate": 3.701208767246003e-05, + "loss": 0.7771, + "step": 88150 + }, + { + "epoch": 0.7793631429127106, + "grad_norm": 4.715278625488281, + "learning_rate": 3.7010614284788156e-05, + "loss": 0.6279, + "step": 88160 + }, + { + "epoch": 0.7794515461730228, + "grad_norm": 8.26240062713623, + "learning_rate": 3.700914089711629e-05, + "loss": 0.5934, + "step": 88170 + }, + { + "epoch": 0.7795399494333352, + "grad_norm": 4.509095191955566, + "learning_rate": 3.700766750944441e-05, + "loss": 0.6107, + "step": 88180 + }, + { + "epoch": 0.7796283526936474, + "grad_norm": 2.47825288772583, + "learning_rate": 3.700619412177255e-05, + "loss": 0.6927, + "step": 88190 + }, + { + "epoch": 0.7797167559539596, + "grad_norm": 1.8736214637756348, + "learning_rate": 3.700472073410067e-05, + "loss": 0.583, + "step": 88200 + }, + { + "epoch": 0.7798051592142718, + "grad_norm": 2.698552131652832, + "learning_rate": 3.7003247346428805e-05, + "loss": 0.6544, + "step": 88210 + }, + { + "epoch": 0.7798935624745841, + "grad_norm": 2.763906955718994, + "learning_rate": 3.700177395875693e-05, + "loss": 0.6034, + "step": 88220 + }, + { + "epoch": 0.7799819657348963, + "grad_norm": 10.334351539611816, + "learning_rate": 3.700030057108506e-05, + "loss": 0.7124, + "step": 88230 + }, + { + "epoch": 0.7800703689952085, + "grad_norm": 2.847381114959717, + "learning_rate": 3.699882718341319e-05, + "loss": 0.7507, + "step": 88240 + }, + { + "epoch": 0.7801587722555208, + "grad_norm": 5.7208380699157715, + "learning_rate": 3.6997353795741325e-05, + "loss": 0.5929, + "step": 88250 + }, + { + "epoch": 0.780247175515833, + "grad_norm": 1.9132936000823975, + "learning_rate": 3.699588040806945e-05, + "loss": 0.6645, + "step": 88260 + }, + { + "epoch": 0.7803355787761452, + "grad_norm": 6.9606828689575195, + "learning_rate": 3.699440702039758e-05, + "loss": 0.6603, + "step": 88270 + }, + { + "epoch": 0.7804239820364575, + "grad_norm": 1.421900987625122, + "learning_rate": 3.699293363272571e-05, + "loss": 0.6757, + "step": 88280 + }, + { + "epoch": 0.7805123852967697, + "grad_norm": 8.137231826782227, + "learning_rate": 3.699146024505384e-05, + "loss": 0.6226, + "step": 88290 + }, + { + "epoch": 0.780600788557082, + "grad_norm": 4.930499076843262, + "learning_rate": 3.698998685738197e-05, + "loss": 0.8131, + "step": 88300 + }, + { + "epoch": 0.7806891918173943, + "grad_norm": 4.467109680175781, + "learning_rate": 3.6988513469710095e-05, + "loss": 0.6777, + "step": 88310 + }, + { + "epoch": 0.7807775950777065, + "grad_norm": 9.587776184082031, + "learning_rate": 3.6987040082038224e-05, + "loss": 0.6822, + "step": 88320 + }, + { + "epoch": 0.7808659983380187, + "grad_norm": 1.8515719175338745, + "learning_rate": 3.698556669436636e-05, + "loss": 0.7263, + "step": 88330 + }, + { + "epoch": 0.780954401598331, + "grad_norm": 6.098905086517334, + "learning_rate": 3.698409330669449e-05, + "loss": 0.5291, + "step": 88340 + }, + { + "epoch": 0.7810428048586432, + "grad_norm": 3.5075201988220215, + "learning_rate": 3.6982619919022615e-05, + "loss": 0.6244, + "step": 88350 + }, + { + "epoch": 0.7811312081189554, + "grad_norm": 2.036731719970703, + "learning_rate": 3.6981146531350744e-05, + "loss": 0.6066, + "step": 88360 + }, + { + "epoch": 0.7812196113792677, + "grad_norm": 2.9030609130859375, + "learning_rate": 3.697967314367887e-05, + "loss": 0.7719, + "step": 88370 + }, + { + "epoch": 0.7813080146395799, + "grad_norm": 1.686673641204834, + "learning_rate": 3.6978199756007e-05, + "loss": 0.6309, + "step": 88380 + }, + { + "epoch": 0.7813964178998921, + "grad_norm": 1.8342885971069336, + "learning_rate": 3.6976726368335136e-05, + "loss": 0.6344, + "step": 88390 + }, + { + "epoch": 0.7814848211602043, + "grad_norm": 2.6140074729919434, + "learning_rate": 3.6975252980663264e-05, + "loss": 0.8475, + "step": 88400 + }, + { + "epoch": 0.7815732244205166, + "grad_norm": 3.955953598022461, + "learning_rate": 3.697377959299139e-05, + "loss": 0.7072, + "step": 88410 + }, + { + "epoch": 0.7816616276808289, + "grad_norm": 2.344635248184204, + "learning_rate": 3.697230620531952e-05, + "loss": 0.6929, + "step": 88420 + }, + { + "epoch": 0.7817500309411411, + "grad_norm": 1.209633231163025, + "learning_rate": 3.697083281764765e-05, + "loss": 0.6304, + "step": 88430 + }, + { + "epoch": 0.7818384342014534, + "grad_norm": 5.430700302124023, + "learning_rate": 3.696935942997578e-05, + "loss": 0.6963, + "step": 88440 + }, + { + "epoch": 0.7819268374617656, + "grad_norm": 3.00968337059021, + "learning_rate": 3.6967886042303906e-05, + "loss": 0.6072, + "step": 88450 + }, + { + "epoch": 0.7820152407220778, + "grad_norm": 1.635097622871399, + "learning_rate": 3.696641265463204e-05, + "loss": 0.7084, + "step": 88460 + }, + { + "epoch": 0.7821036439823901, + "grad_norm": 1.3624483346939087, + "learning_rate": 3.696493926696017e-05, + "loss": 0.6762, + "step": 88470 + }, + { + "epoch": 0.7821920472427023, + "grad_norm": 31.46500587463379, + "learning_rate": 3.69634658792883e-05, + "loss": 0.6381, + "step": 88480 + }, + { + "epoch": 0.7822804505030145, + "grad_norm": 1.579077959060669, + "learning_rate": 3.6961992491616426e-05, + "loss": 0.6122, + "step": 88490 + }, + { + "epoch": 0.7823688537633268, + "grad_norm": 2.750282049179077, + "learning_rate": 3.6960519103944554e-05, + "loss": 0.5084, + "step": 88500 + }, + { + "epoch": 0.782457257023639, + "grad_norm": 10.721227645874023, + "learning_rate": 3.695904571627268e-05, + "loss": 0.7496, + "step": 88510 + }, + { + "epoch": 0.7825456602839512, + "grad_norm": 4.143172264099121, + "learning_rate": 3.695757232860082e-05, + "loss": 0.6698, + "step": 88520 + }, + { + "epoch": 0.7826340635442635, + "grad_norm": 2.3151283264160156, + "learning_rate": 3.6956098940928946e-05, + "loss": 0.7075, + "step": 88530 + }, + { + "epoch": 0.7827224668045758, + "grad_norm": 2.9645211696624756, + "learning_rate": 3.6954625553257075e-05, + "loss": 0.8017, + "step": 88540 + }, + { + "epoch": 0.782810870064888, + "grad_norm": 1.0456886291503906, + "learning_rate": 3.69531521655852e-05, + "loss": 0.6558, + "step": 88550 + }, + { + "epoch": 0.7828992733252003, + "grad_norm": 1.234302043914795, + "learning_rate": 3.695167877791333e-05, + "loss": 0.5816, + "step": 88560 + }, + { + "epoch": 0.7829876765855125, + "grad_norm": 1.1398594379425049, + "learning_rate": 3.695020539024146e-05, + "loss": 0.6601, + "step": 88570 + }, + { + "epoch": 0.7830760798458247, + "grad_norm": 4.491460800170898, + "learning_rate": 3.6948732002569595e-05, + "loss": 0.7649, + "step": 88580 + }, + { + "epoch": 0.783164483106137, + "grad_norm": 4.4944167137146, + "learning_rate": 3.6947258614897716e-05, + "loss": 0.5904, + "step": 88590 + }, + { + "epoch": 0.7832528863664492, + "grad_norm": 0.9227166771888733, + "learning_rate": 3.694578522722585e-05, + "loss": 0.6771, + "step": 88600 + }, + { + "epoch": 0.7833412896267614, + "grad_norm": 3.6446499824523926, + "learning_rate": 3.694431183955398e-05, + "loss": 0.6977, + "step": 88610 + }, + { + "epoch": 0.7834296928870736, + "grad_norm": 1.4419152736663818, + "learning_rate": 3.694283845188211e-05, + "loss": 0.6702, + "step": 88620 + }, + { + "epoch": 0.7835180961473859, + "grad_norm": 1.8561259508132935, + "learning_rate": 3.6941365064210237e-05, + "loss": 0.7333, + "step": 88630 + }, + { + "epoch": 0.7836064994076981, + "grad_norm": 3.7696709632873535, + "learning_rate": 3.693989167653837e-05, + "loss": 0.6795, + "step": 88640 + }, + { + "epoch": 0.7836949026680105, + "grad_norm": 3.3229572772979736, + "learning_rate": 3.693841828886649e-05, + "loss": 0.6587, + "step": 88650 + }, + { + "epoch": 0.7837833059283227, + "grad_norm": 4.2172346115112305, + "learning_rate": 3.693694490119463e-05, + "loss": 0.7052, + "step": 88660 + }, + { + "epoch": 0.7838717091886349, + "grad_norm": 1.510422706604004, + "learning_rate": 3.693547151352275e-05, + "loss": 0.5378, + "step": 88670 + }, + { + "epoch": 0.7839601124489471, + "grad_norm": 7.965168476104736, + "learning_rate": 3.6933998125850885e-05, + "loss": 0.7124, + "step": 88680 + }, + { + "epoch": 0.7840485157092594, + "grad_norm": 1.8420188426971436, + "learning_rate": 3.6932524738179013e-05, + "loss": 0.608, + "step": 88690 + }, + { + "epoch": 0.7841369189695716, + "grad_norm": 1.1057806015014648, + "learning_rate": 3.693105135050714e-05, + "loss": 0.6727, + "step": 88700 + }, + { + "epoch": 0.7842253222298838, + "grad_norm": 2.295884609222412, + "learning_rate": 3.692957796283527e-05, + "loss": 0.5431, + "step": 88710 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 2.478905200958252, + "learning_rate": 3.6928104575163405e-05, + "loss": 0.7279, + "step": 88720 + }, + { + "epoch": 0.7844021287505083, + "grad_norm": 2.1281604766845703, + "learning_rate": 3.692663118749153e-05, + "loss": 0.5643, + "step": 88730 + }, + { + "epoch": 0.7844905320108205, + "grad_norm": 2.78515625, + "learning_rate": 3.692515779981966e-05, + "loss": 0.5311, + "step": 88740 + }, + { + "epoch": 0.7845789352711328, + "grad_norm": 13.516206741333008, + "learning_rate": 3.692368441214779e-05, + "loss": 0.6049, + "step": 88750 + }, + { + "epoch": 0.784667338531445, + "grad_norm": 2.7559354305267334, + "learning_rate": 3.692221102447592e-05, + "loss": 0.6337, + "step": 88760 + }, + { + "epoch": 0.7847557417917573, + "grad_norm": 2.1681416034698486, + "learning_rate": 3.692073763680405e-05, + "loss": 0.8782, + "step": 88770 + }, + { + "epoch": 0.7848441450520696, + "grad_norm": 4.1809916496276855, + "learning_rate": 3.6919264249132175e-05, + "loss": 0.6429, + "step": 88780 + }, + { + "epoch": 0.7849325483123818, + "grad_norm": 2.3515563011169434, + "learning_rate": 3.6917790861460304e-05, + "loss": 0.6666, + "step": 88790 + }, + { + "epoch": 0.785020951572694, + "grad_norm": 1.1446459293365479, + "learning_rate": 3.691631747378844e-05, + "loss": 0.685, + "step": 88800 + }, + { + "epoch": 0.7851093548330063, + "grad_norm": 0.7936797738075256, + "learning_rate": 3.691484408611656e-05, + "loss": 0.6952, + "step": 88810 + }, + { + "epoch": 0.7851977580933185, + "grad_norm": 5.4908013343811035, + "learning_rate": 3.6913370698444696e-05, + "loss": 0.6882, + "step": 88820 + }, + { + "epoch": 0.7852861613536307, + "grad_norm": 7.077978134155273, + "learning_rate": 3.6911897310772824e-05, + "loss": 0.7397, + "step": 88830 + }, + { + "epoch": 0.785374564613943, + "grad_norm": 3.20505952835083, + "learning_rate": 3.691042392310095e-05, + "loss": 0.7053, + "step": 88840 + }, + { + "epoch": 0.7854629678742552, + "grad_norm": 7.291604042053223, + "learning_rate": 3.690895053542908e-05, + "loss": 0.8036, + "step": 88850 + }, + { + "epoch": 0.7855513711345674, + "grad_norm": 4.67714786529541, + "learning_rate": 3.6907477147757216e-05, + "loss": 0.7113, + "step": 88860 + }, + { + "epoch": 0.7856397743948796, + "grad_norm": 2.403384208679199, + "learning_rate": 3.690600376008534e-05, + "loss": 0.633, + "step": 88870 + }, + { + "epoch": 0.7857281776551919, + "grad_norm": 3.3825719356536865, + "learning_rate": 3.690453037241347e-05, + "loss": 0.7011, + "step": 88880 + }, + { + "epoch": 0.7858165809155042, + "grad_norm": 3.1293861865997314, + "learning_rate": 3.6903056984741594e-05, + "loss": 0.6551, + "step": 88890 + }, + { + "epoch": 0.7859049841758164, + "grad_norm": 7.326724529266357, + "learning_rate": 3.690158359706973e-05, + "loss": 0.5545, + "step": 88900 + }, + { + "epoch": 0.7859933874361287, + "grad_norm": 2.852421760559082, + "learning_rate": 3.690011020939786e-05, + "loss": 0.719, + "step": 88910 + }, + { + "epoch": 0.7860817906964409, + "grad_norm": 7.667893409729004, + "learning_rate": 3.6898636821725986e-05, + "loss": 0.6199, + "step": 88920 + }, + { + "epoch": 0.7861701939567531, + "grad_norm": 2.5921435356140137, + "learning_rate": 3.6897163434054114e-05, + "loss": 0.6916, + "step": 88930 + }, + { + "epoch": 0.7862585972170654, + "grad_norm": 1.0341322422027588, + "learning_rate": 3.689569004638225e-05, + "loss": 0.6539, + "step": 88940 + }, + { + "epoch": 0.7863470004773776, + "grad_norm": 7.7420573234558105, + "learning_rate": 3.689421665871037e-05, + "loss": 0.7001, + "step": 88950 + }, + { + "epoch": 0.7864354037376898, + "grad_norm": 5.026370525360107, + "learning_rate": 3.6892743271038506e-05, + "loss": 0.7464, + "step": 88960 + }, + { + "epoch": 0.7865238069980021, + "grad_norm": 2.243091583251953, + "learning_rate": 3.6891269883366634e-05, + "loss": 0.6455, + "step": 88970 + }, + { + "epoch": 0.7866122102583143, + "grad_norm": 4.696250915527344, + "learning_rate": 3.688979649569476e-05, + "loss": 0.577, + "step": 88980 + }, + { + "epoch": 0.7867006135186265, + "grad_norm": 11.603858947753906, + "learning_rate": 3.688832310802289e-05, + "loss": 0.5918, + "step": 88990 + }, + { + "epoch": 0.7867890167789388, + "grad_norm": 4.928067207336426, + "learning_rate": 3.6886849720351026e-05, + "loss": 0.7424, + "step": 89000 + }, + { + "epoch": 0.7868774200392511, + "grad_norm": 2.0284087657928467, + "learning_rate": 3.688537633267915e-05, + "loss": 0.5957, + "step": 89010 + }, + { + "epoch": 0.7869658232995633, + "grad_norm": 3.126385450363159, + "learning_rate": 3.688390294500728e-05, + "loss": 0.6453, + "step": 89020 + }, + { + "epoch": 0.7870542265598756, + "grad_norm": 2.6042163372039795, + "learning_rate": 3.6882429557335405e-05, + "loss": 0.5499, + "step": 89030 + }, + { + "epoch": 0.7871426298201878, + "grad_norm": 4.190433502197266, + "learning_rate": 3.688095616966354e-05, + "loss": 0.6625, + "step": 89040 + }, + { + "epoch": 0.7872310330805, + "grad_norm": 2.73695707321167, + "learning_rate": 3.687948278199167e-05, + "loss": 0.6045, + "step": 89050 + }, + { + "epoch": 0.7873194363408123, + "grad_norm": 3.515976905822754, + "learning_rate": 3.6878009394319796e-05, + "loss": 0.6021, + "step": 89060 + }, + { + "epoch": 0.7874078396011245, + "grad_norm": 2.2931578159332275, + "learning_rate": 3.6876536006647925e-05, + "loss": 0.5635, + "step": 89070 + }, + { + "epoch": 0.7874962428614367, + "grad_norm": 3.415226459503174, + "learning_rate": 3.687506261897606e-05, + "loss": 0.6612, + "step": 89080 + }, + { + "epoch": 0.7875846461217489, + "grad_norm": 6.145545482635498, + "learning_rate": 3.687358923130418e-05, + "loss": 0.6797, + "step": 89090 + }, + { + "epoch": 0.7876730493820612, + "grad_norm": 6.924167633056641, + "learning_rate": 3.6872115843632317e-05, + "loss": 0.8244, + "step": 89100 + }, + { + "epoch": 0.7877614526423734, + "grad_norm": 3.182163953781128, + "learning_rate": 3.6870642455960445e-05, + "loss": 0.6204, + "step": 89110 + }, + { + "epoch": 0.7878498559026856, + "grad_norm": 12.460630416870117, + "learning_rate": 3.686916906828857e-05, + "loss": 0.5892, + "step": 89120 + }, + { + "epoch": 0.787938259162998, + "grad_norm": 8.036012649536133, + "learning_rate": 3.68676956806167e-05, + "loss": 0.6687, + "step": 89130 + }, + { + "epoch": 0.7880266624233102, + "grad_norm": 3.1527159214019775, + "learning_rate": 3.686622229294483e-05, + "loss": 0.73, + "step": 89140 + }, + { + "epoch": 0.7881150656836224, + "grad_norm": 2.6475212574005127, + "learning_rate": 3.686474890527296e-05, + "loss": 0.5661, + "step": 89150 + }, + { + "epoch": 0.7882034689439347, + "grad_norm": 1.0636621713638306, + "learning_rate": 3.6863275517601093e-05, + "loss": 0.5466, + "step": 89160 + }, + { + "epoch": 0.7882918722042469, + "grad_norm": 5.516800880432129, + "learning_rate": 3.6861802129929215e-05, + "loss": 0.6586, + "step": 89170 + }, + { + "epoch": 0.7883802754645591, + "grad_norm": 2.99794340133667, + "learning_rate": 3.686032874225735e-05, + "loss": 0.735, + "step": 89180 + }, + { + "epoch": 0.7884686787248714, + "grad_norm": 5.132623195648193, + "learning_rate": 3.685885535458548e-05, + "loss": 0.626, + "step": 89190 + }, + { + "epoch": 0.7885570819851836, + "grad_norm": 7.003590106964111, + "learning_rate": 3.685738196691361e-05, + "loss": 0.7468, + "step": 89200 + }, + { + "epoch": 0.7886454852454958, + "grad_norm": 3.0408949851989746, + "learning_rate": 3.6855908579241735e-05, + "loss": 0.5835, + "step": 89210 + }, + { + "epoch": 0.7887338885058081, + "grad_norm": 4.65286922454834, + "learning_rate": 3.685443519156987e-05, + "loss": 0.7545, + "step": 89220 + }, + { + "epoch": 0.7888222917661203, + "grad_norm": 7.724262237548828, + "learning_rate": 3.685296180389799e-05, + "loss": 0.7402, + "step": 89230 + }, + { + "epoch": 0.7889106950264326, + "grad_norm": 5.497486114501953, + "learning_rate": 3.685148841622613e-05, + "loss": 0.66, + "step": 89240 + }, + { + "epoch": 0.7889990982867449, + "grad_norm": 1.7779713869094849, + "learning_rate": 3.6850015028554255e-05, + "loss": 0.616, + "step": 89250 + }, + { + "epoch": 0.7890875015470571, + "grad_norm": 10.849710464477539, + "learning_rate": 3.6848541640882384e-05, + "loss": 0.5861, + "step": 89260 + }, + { + "epoch": 0.7891759048073693, + "grad_norm": 16.75411605834961, + "learning_rate": 3.684706825321051e-05, + "loss": 0.7508, + "step": 89270 + }, + { + "epoch": 0.7892643080676816, + "grad_norm": 8.728506088256836, + "learning_rate": 3.684559486553864e-05, + "loss": 0.6679, + "step": 89280 + }, + { + "epoch": 0.7893527113279938, + "grad_norm": 3.0433952808380127, + "learning_rate": 3.684412147786677e-05, + "loss": 0.7144, + "step": 89290 + }, + { + "epoch": 0.789441114588306, + "grad_norm": 8.602217674255371, + "learning_rate": 3.6842648090194904e-05, + "loss": 0.6023, + "step": 89300 + }, + { + "epoch": 0.7895295178486182, + "grad_norm": 1.4041599035263062, + "learning_rate": 3.684117470252303e-05, + "loss": 0.7798, + "step": 89310 + }, + { + "epoch": 0.7896179211089305, + "grad_norm": 0.9814836978912354, + "learning_rate": 3.683970131485116e-05, + "loss": 0.7293, + "step": 89320 + }, + { + "epoch": 0.7897063243692427, + "grad_norm": 2.3014960289001465, + "learning_rate": 3.683822792717929e-05, + "loss": 0.7899, + "step": 89330 + }, + { + "epoch": 0.7897947276295549, + "grad_norm": 3.170083999633789, + "learning_rate": 3.683675453950742e-05, + "loss": 0.6036, + "step": 89340 + }, + { + "epoch": 0.7898831308898672, + "grad_norm": 3.452230453491211, + "learning_rate": 3.6835281151835546e-05, + "loss": 0.5871, + "step": 89350 + }, + { + "epoch": 0.7899715341501795, + "grad_norm": 7.357548713684082, + "learning_rate": 3.6833807764163674e-05, + "loss": 0.6071, + "step": 89360 + }, + { + "epoch": 0.7900599374104917, + "grad_norm": 1.5751733779907227, + "learning_rate": 3.683233437649181e-05, + "loss": 0.626, + "step": 89370 + }, + { + "epoch": 0.790148340670804, + "grad_norm": 2.1227712631225586, + "learning_rate": 3.683086098881994e-05, + "loss": 0.7323, + "step": 89380 + }, + { + "epoch": 0.7902367439311162, + "grad_norm": 10.857606887817383, + "learning_rate": 3.6829387601148066e-05, + "loss": 0.7407, + "step": 89390 + }, + { + "epoch": 0.7903251471914284, + "grad_norm": 2.5266811847686768, + "learning_rate": 3.6827914213476194e-05, + "loss": 0.5984, + "step": 89400 + }, + { + "epoch": 0.7904135504517407, + "grad_norm": 2.156224250793457, + "learning_rate": 3.682644082580432e-05, + "loss": 0.6688, + "step": 89410 + }, + { + "epoch": 0.7905019537120529, + "grad_norm": 1.744645357131958, + "learning_rate": 3.682496743813245e-05, + "loss": 0.7207, + "step": 89420 + }, + { + "epoch": 0.7905903569723651, + "grad_norm": 1.5828803777694702, + "learning_rate": 3.6823494050460586e-05, + "loss": 0.6678, + "step": 89430 + }, + { + "epoch": 0.7906787602326774, + "grad_norm": 5.530209064483643, + "learning_rate": 3.6822020662788714e-05, + "loss": 0.6763, + "step": 89440 + }, + { + "epoch": 0.7907671634929896, + "grad_norm": 0.8541598320007324, + "learning_rate": 3.682054727511684e-05, + "loss": 0.6077, + "step": 89450 + }, + { + "epoch": 0.7908555667533018, + "grad_norm": 2.934976816177368, + "learning_rate": 3.681907388744497e-05, + "loss": 0.6337, + "step": 89460 + }, + { + "epoch": 0.790943970013614, + "grad_norm": 1.8297994136810303, + "learning_rate": 3.68176004997731e-05, + "loss": 0.5861, + "step": 89470 + }, + { + "epoch": 0.7910323732739264, + "grad_norm": 1.726386547088623, + "learning_rate": 3.681612711210123e-05, + "loss": 0.62, + "step": 89480 + }, + { + "epoch": 0.7911207765342386, + "grad_norm": 1.0238670110702515, + "learning_rate": 3.681465372442936e-05, + "loss": 0.7597, + "step": 89490 + }, + { + "epoch": 0.7912091797945509, + "grad_norm": 4.956290245056152, + "learning_rate": 3.6813180336757485e-05, + "loss": 0.6482, + "step": 89500 + }, + { + "epoch": 0.7912975830548631, + "grad_norm": 6.393110275268555, + "learning_rate": 3.681170694908562e-05, + "loss": 0.6254, + "step": 89510 + }, + { + "epoch": 0.7913859863151753, + "grad_norm": 2.4836957454681396, + "learning_rate": 3.681023356141375e-05, + "loss": 0.6302, + "step": 89520 + }, + { + "epoch": 0.7914743895754875, + "grad_norm": 13.077306747436523, + "learning_rate": 3.6808760173741876e-05, + "loss": 0.6459, + "step": 89530 + }, + { + "epoch": 0.7915627928357998, + "grad_norm": 2.5070433616638184, + "learning_rate": 3.6807286786070005e-05, + "loss": 0.6984, + "step": 89540 + }, + { + "epoch": 0.791651196096112, + "grad_norm": 7.889594554901123, + "learning_rate": 3.680581339839814e-05, + "loss": 0.6318, + "step": 89550 + }, + { + "epoch": 0.7917395993564242, + "grad_norm": 3.2554988861083984, + "learning_rate": 3.680434001072626e-05, + "loss": 0.6755, + "step": 89560 + }, + { + "epoch": 0.7918280026167365, + "grad_norm": 1.702609658241272, + "learning_rate": 3.68028666230544e-05, + "loss": 0.6503, + "step": 89570 + }, + { + "epoch": 0.7919164058770487, + "grad_norm": 1.5688401460647583, + "learning_rate": 3.6801393235382525e-05, + "loss": 0.5643, + "step": 89580 + }, + { + "epoch": 0.7920048091373609, + "grad_norm": 2.11912202835083, + "learning_rate": 3.679991984771065e-05, + "loss": 0.6708, + "step": 89590 + }, + { + "epoch": 0.7920932123976733, + "grad_norm": 2.4789113998413086, + "learning_rate": 3.679844646003878e-05, + "loss": 0.7685, + "step": 89600 + }, + { + "epoch": 0.7921816156579855, + "grad_norm": 1.4530085325241089, + "learning_rate": 3.679697307236691e-05, + "loss": 0.591, + "step": 89610 + }, + { + "epoch": 0.7922700189182977, + "grad_norm": 2.6055314540863037, + "learning_rate": 3.679549968469504e-05, + "loss": 0.6398, + "step": 89620 + }, + { + "epoch": 0.79235842217861, + "grad_norm": 2.4989185333251953, + "learning_rate": 3.6794026297023174e-05, + "loss": 0.5998, + "step": 89630 + }, + { + "epoch": 0.7924468254389222, + "grad_norm": 1.8193473815917969, + "learning_rate": 3.6792552909351295e-05, + "loss": 0.6934, + "step": 89640 + }, + { + "epoch": 0.7925352286992344, + "grad_norm": 5.6462931632995605, + "learning_rate": 3.679107952167943e-05, + "loss": 0.6782, + "step": 89650 + }, + { + "epoch": 0.7926236319595467, + "grad_norm": 7.5111870765686035, + "learning_rate": 3.678960613400756e-05, + "loss": 0.7243, + "step": 89660 + }, + { + "epoch": 0.7927120352198589, + "grad_norm": 1.2621877193450928, + "learning_rate": 3.678813274633569e-05, + "loss": 0.684, + "step": 89670 + }, + { + "epoch": 0.7928004384801711, + "grad_norm": 2.9011409282684326, + "learning_rate": 3.6786659358663815e-05, + "loss": 0.7346, + "step": 89680 + }, + { + "epoch": 0.7928888417404834, + "grad_norm": 6.006600379943848, + "learning_rate": 3.678518597099195e-05, + "loss": 0.7034, + "step": 89690 + }, + { + "epoch": 0.7929772450007956, + "grad_norm": 1.6063816547393799, + "learning_rate": 3.678371258332007e-05, + "loss": 0.6455, + "step": 89700 + }, + { + "epoch": 0.7930656482611079, + "grad_norm": 3.9927990436553955, + "learning_rate": 3.678223919564821e-05, + "loss": 0.7919, + "step": 89710 + }, + { + "epoch": 0.7931540515214202, + "grad_norm": 5.710406303405762, + "learning_rate": 3.678076580797633e-05, + "loss": 0.7795, + "step": 89720 + }, + { + "epoch": 0.7932424547817324, + "grad_norm": 1.9698961973190308, + "learning_rate": 3.6779292420304464e-05, + "loss": 0.6593, + "step": 89730 + }, + { + "epoch": 0.7933308580420446, + "grad_norm": 1.3413444757461548, + "learning_rate": 3.677781903263259e-05, + "loss": 0.7236, + "step": 89740 + }, + { + "epoch": 0.7934192613023568, + "grad_norm": 3.25018048286438, + "learning_rate": 3.677634564496072e-05, + "loss": 0.6375, + "step": 89750 + }, + { + "epoch": 0.7935076645626691, + "grad_norm": 1.7060281038284302, + "learning_rate": 3.677487225728885e-05, + "loss": 0.7026, + "step": 89760 + }, + { + "epoch": 0.7935960678229813, + "grad_norm": 2.2737340927124023, + "learning_rate": 3.6773398869616984e-05, + "loss": 0.5609, + "step": 89770 + }, + { + "epoch": 0.7936844710832935, + "grad_norm": 2.828493595123291, + "learning_rate": 3.6771925481945106e-05, + "loss": 0.6469, + "step": 89780 + }, + { + "epoch": 0.7937728743436058, + "grad_norm": 1.5337222814559937, + "learning_rate": 3.677045209427324e-05, + "loss": 0.7227, + "step": 89790 + }, + { + "epoch": 0.793861277603918, + "grad_norm": 2.079472303390503, + "learning_rate": 3.676897870660137e-05, + "loss": 0.7591, + "step": 89800 + }, + { + "epoch": 0.7939496808642302, + "grad_norm": 1.264919638633728, + "learning_rate": 3.67675053189295e-05, + "loss": 0.6199, + "step": 89810 + }, + { + "epoch": 0.7940380841245425, + "grad_norm": 3.484257221221924, + "learning_rate": 3.6766031931257626e-05, + "loss": 0.7112, + "step": 89820 + }, + { + "epoch": 0.7941264873848548, + "grad_norm": 6.181394577026367, + "learning_rate": 3.6764558543585754e-05, + "loss": 0.7115, + "step": 89830 + }, + { + "epoch": 0.794214890645167, + "grad_norm": 2.282637119293213, + "learning_rate": 3.676308515591388e-05, + "loss": 0.6525, + "step": 89840 + }, + { + "epoch": 0.7943032939054793, + "grad_norm": 5.451003551483154, + "learning_rate": 3.676161176824202e-05, + "loss": 0.5581, + "step": 89850 + }, + { + "epoch": 0.7943916971657915, + "grad_norm": 5.316173076629639, + "learning_rate": 3.676013838057014e-05, + "loss": 0.7072, + "step": 89860 + }, + { + "epoch": 0.7944801004261037, + "grad_norm": 2.9746615886688232, + "learning_rate": 3.6758664992898274e-05, + "loss": 0.807, + "step": 89870 + }, + { + "epoch": 0.794568503686416, + "grad_norm": 2.413289785385132, + "learning_rate": 3.67571916052264e-05, + "loss": 0.6598, + "step": 89880 + }, + { + "epoch": 0.7946569069467282, + "grad_norm": 7.571915626525879, + "learning_rate": 3.675571821755453e-05, + "loss": 0.6777, + "step": 89890 + }, + { + "epoch": 0.7947453102070404, + "grad_norm": 1.52367103099823, + "learning_rate": 3.675424482988266e-05, + "loss": 0.6388, + "step": 89900 + }, + { + "epoch": 0.7948337134673527, + "grad_norm": 11.09347915649414, + "learning_rate": 3.6752771442210795e-05, + "loss": 0.5975, + "step": 89910 + }, + { + "epoch": 0.7949221167276649, + "grad_norm": 3.8028767108917236, + "learning_rate": 3.6751298054538916e-05, + "loss": 0.6116, + "step": 89920 + }, + { + "epoch": 0.7950105199879771, + "grad_norm": 3.759174108505249, + "learning_rate": 3.674982466686705e-05, + "loss": 0.5797, + "step": 89930 + }, + { + "epoch": 0.7950989232482893, + "grad_norm": 3.9057390689849854, + "learning_rate": 3.674835127919518e-05, + "loss": 0.8096, + "step": 89940 + }, + { + "epoch": 0.7951873265086017, + "grad_norm": 2.45003342628479, + "learning_rate": 3.674687789152331e-05, + "loss": 0.6402, + "step": 89950 + }, + { + "epoch": 0.7952757297689139, + "grad_norm": 1.45388925075531, + "learning_rate": 3.6745404503851436e-05, + "loss": 0.6921, + "step": 89960 + }, + { + "epoch": 0.7953641330292262, + "grad_norm": 2.3963100910186768, + "learning_rate": 3.6743931116179565e-05, + "loss": 0.5433, + "step": 89970 + }, + { + "epoch": 0.7954525362895384, + "grad_norm": 2.329308032989502, + "learning_rate": 3.674245772850769e-05, + "loss": 0.6011, + "step": 89980 + }, + { + "epoch": 0.7955409395498506, + "grad_norm": 3.735215663909912, + "learning_rate": 3.674098434083583e-05, + "loss": 0.727, + "step": 89990 + }, + { + "epoch": 0.7956293428101628, + "grad_norm": 5.208836078643799, + "learning_rate": 3.673951095316395e-05, + "loss": 0.6039, + "step": 90000 + }, + { + "epoch": 0.7957177460704751, + "grad_norm": 3.0822675228118896, + "learning_rate": 3.6738037565492085e-05, + "loss": 0.6221, + "step": 90010 + }, + { + "epoch": 0.7958061493307873, + "grad_norm": 1.4857887029647827, + "learning_rate": 3.673656417782021e-05, + "loss": 0.7058, + "step": 90020 + }, + { + "epoch": 0.7958945525910995, + "grad_norm": 12.714051246643066, + "learning_rate": 3.673509079014834e-05, + "loss": 0.603, + "step": 90030 + }, + { + "epoch": 0.7959829558514118, + "grad_norm": 6.570509910583496, + "learning_rate": 3.673361740247647e-05, + "loss": 0.6961, + "step": 90040 + }, + { + "epoch": 0.796071359111724, + "grad_norm": 1.5676804780960083, + "learning_rate": 3.6732144014804605e-05, + "loss": 0.73, + "step": 90050 + }, + { + "epoch": 0.7961597623720362, + "grad_norm": 1.061217188835144, + "learning_rate": 3.673067062713273e-05, + "loss": 0.7439, + "step": 90060 + }, + { + "epoch": 0.7962481656323486, + "grad_norm": 6.577081680297852, + "learning_rate": 3.672919723946086e-05, + "loss": 0.8206, + "step": 90070 + }, + { + "epoch": 0.7963365688926608, + "grad_norm": 1.5707778930664062, + "learning_rate": 3.672772385178898e-05, + "loss": 0.5718, + "step": 90080 + }, + { + "epoch": 0.796424972152973, + "grad_norm": 6.731858253479004, + "learning_rate": 3.672625046411712e-05, + "loss": 0.6494, + "step": 90090 + }, + { + "epoch": 0.7965133754132853, + "grad_norm": 3.61200213432312, + "learning_rate": 3.672477707644525e-05, + "loss": 0.7392, + "step": 90100 + }, + { + "epoch": 0.7966017786735975, + "grad_norm": 1.8012754917144775, + "learning_rate": 3.6723303688773375e-05, + "loss": 0.6165, + "step": 90110 + }, + { + "epoch": 0.7966901819339097, + "grad_norm": 6.126452445983887, + "learning_rate": 3.6721830301101504e-05, + "loss": 0.7815, + "step": 90120 + }, + { + "epoch": 0.796778585194222, + "grad_norm": 10.822237014770508, + "learning_rate": 3.672035691342964e-05, + "loss": 0.7079, + "step": 90130 + }, + { + "epoch": 0.7968669884545342, + "grad_norm": 9.062956809997559, + "learning_rate": 3.671888352575776e-05, + "loss": 0.6992, + "step": 90140 + }, + { + "epoch": 0.7969553917148464, + "grad_norm": 8.386719703674316, + "learning_rate": 3.6717410138085895e-05, + "loss": 0.5561, + "step": 90150 + }, + { + "epoch": 0.7970437949751586, + "grad_norm": 11.70284652709961, + "learning_rate": 3.6715936750414024e-05, + "loss": 0.6831, + "step": 90160 + }, + { + "epoch": 0.7971321982354709, + "grad_norm": 0.9447895884513855, + "learning_rate": 3.671446336274215e-05, + "loss": 0.6646, + "step": 90170 + }, + { + "epoch": 0.7972206014957831, + "grad_norm": 5.494142532348633, + "learning_rate": 3.671298997507028e-05, + "loss": 0.5407, + "step": 90180 + }, + { + "epoch": 0.7973090047560955, + "grad_norm": 8.045598030090332, + "learning_rate": 3.671151658739841e-05, + "loss": 0.8086, + "step": 90190 + }, + { + "epoch": 0.7973974080164077, + "grad_norm": 3.0696487426757812, + "learning_rate": 3.671004319972654e-05, + "loss": 0.6195, + "step": 90200 + }, + { + "epoch": 0.7974858112767199, + "grad_norm": 1.5057857036590576, + "learning_rate": 3.670856981205467e-05, + "loss": 0.6232, + "step": 90210 + }, + { + "epoch": 0.7975742145370321, + "grad_norm": 9.587413787841797, + "learning_rate": 3.67070964243828e-05, + "loss": 0.6251, + "step": 90220 + }, + { + "epoch": 0.7976626177973444, + "grad_norm": 3.105311870574951, + "learning_rate": 3.670562303671093e-05, + "loss": 0.7872, + "step": 90230 + }, + { + "epoch": 0.7977510210576566, + "grad_norm": 2.6679515838623047, + "learning_rate": 3.670414964903906e-05, + "loss": 0.7538, + "step": 90240 + }, + { + "epoch": 0.7978394243179688, + "grad_norm": 9.960758209228516, + "learning_rate": 3.6702676261367186e-05, + "loss": 0.7526, + "step": 90250 + }, + { + "epoch": 0.7979278275782811, + "grad_norm": 1.8704094886779785, + "learning_rate": 3.6701202873695314e-05, + "loss": 0.5858, + "step": 90260 + }, + { + "epoch": 0.7980162308385933, + "grad_norm": 5.982682228088379, + "learning_rate": 3.669972948602345e-05, + "loss": 0.5648, + "step": 90270 + }, + { + "epoch": 0.7981046340989055, + "grad_norm": 5.4588303565979, + "learning_rate": 3.669825609835158e-05, + "loss": 0.6909, + "step": 90280 + }, + { + "epoch": 0.7981930373592178, + "grad_norm": 2.4680209159851074, + "learning_rate": 3.6696782710679706e-05, + "loss": 0.5043, + "step": 90290 + }, + { + "epoch": 0.7982814406195301, + "grad_norm": 0.5643435716629028, + "learning_rate": 3.6695309323007834e-05, + "loss": 0.7279, + "step": 90300 + }, + { + "epoch": 0.7983698438798423, + "grad_norm": 1.5978022813796997, + "learning_rate": 3.669383593533596e-05, + "loss": 0.6806, + "step": 90310 + }, + { + "epoch": 0.7984582471401546, + "grad_norm": 2.233640432357788, + "learning_rate": 3.669236254766409e-05, + "loss": 0.5723, + "step": 90320 + }, + { + "epoch": 0.7985466504004668, + "grad_norm": 3.4113526344299316, + "learning_rate": 3.669088915999222e-05, + "loss": 0.5339, + "step": 90330 + }, + { + "epoch": 0.798635053660779, + "grad_norm": 2.4243462085723877, + "learning_rate": 3.6689415772320354e-05, + "loss": 0.5339, + "step": 90340 + }, + { + "epoch": 0.7987234569210913, + "grad_norm": 2.112652540206909, + "learning_rate": 3.668794238464848e-05, + "loss": 0.6625, + "step": 90350 + }, + { + "epoch": 0.7988118601814035, + "grad_norm": 1.8202285766601562, + "learning_rate": 3.668646899697661e-05, + "loss": 0.5857, + "step": 90360 + }, + { + "epoch": 0.7989002634417157, + "grad_norm": 3.372126817703247, + "learning_rate": 3.668499560930474e-05, + "loss": 0.6101, + "step": 90370 + }, + { + "epoch": 0.798988666702028, + "grad_norm": 1.6648164987564087, + "learning_rate": 3.668352222163287e-05, + "loss": 0.5972, + "step": 90380 + }, + { + "epoch": 0.7990770699623402, + "grad_norm": 3.1647562980651855, + "learning_rate": 3.6682048833960996e-05, + "loss": 0.6722, + "step": 90390 + }, + { + "epoch": 0.7991654732226524, + "grad_norm": 3.588517904281616, + "learning_rate": 3.668057544628913e-05, + "loss": 0.6789, + "step": 90400 + }, + { + "epoch": 0.7992538764829646, + "grad_norm": 3.194951295852661, + "learning_rate": 3.667910205861726e-05, + "loss": 0.6922, + "step": 90410 + }, + { + "epoch": 0.799342279743277, + "grad_norm": 1.9487104415893555, + "learning_rate": 3.667762867094539e-05, + "loss": 0.6304, + "step": 90420 + }, + { + "epoch": 0.7994306830035892, + "grad_norm": 18.30216407775879, + "learning_rate": 3.6676155283273516e-05, + "loss": 0.7401, + "step": 90430 + }, + { + "epoch": 0.7995190862639014, + "grad_norm": 21.167621612548828, + "learning_rate": 3.6674681895601645e-05, + "loss": 0.6146, + "step": 90440 + }, + { + "epoch": 0.7996074895242137, + "grad_norm": 6.47659969329834, + "learning_rate": 3.667320850792977e-05, + "loss": 0.6854, + "step": 90450 + }, + { + "epoch": 0.7996958927845259, + "grad_norm": 4.811198711395264, + "learning_rate": 3.667173512025791e-05, + "loss": 0.6042, + "step": 90460 + }, + { + "epoch": 0.7997842960448381, + "grad_norm": 6.630556106567383, + "learning_rate": 3.667026173258603e-05, + "loss": 0.7453, + "step": 90470 + }, + { + "epoch": 0.7998726993051504, + "grad_norm": 1.1571115255355835, + "learning_rate": 3.6668788344914165e-05, + "loss": 0.5701, + "step": 90480 + }, + { + "epoch": 0.7999611025654626, + "grad_norm": 3.7846672534942627, + "learning_rate": 3.666731495724229e-05, + "loss": 0.6055, + "step": 90490 + }, + { + "epoch": 0.8000495058257748, + "grad_norm": 1.8373130559921265, + "learning_rate": 3.666584156957042e-05, + "loss": 0.5539, + "step": 90500 + }, + { + "epoch": 0.8001379090860871, + "grad_norm": 2.4086570739746094, + "learning_rate": 3.666436818189855e-05, + "loss": 0.6487, + "step": 90510 + }, + { + "epoch": 0.8002263123463993, + "grad_norm": 8.181065559387207, + "learning_rate": 3.6662894794226685e-05, + "loss": 0.7604, + "step": 90520 + }, + { + "epoch": 0.8003147156067115, + "grad_norm": 2.988612413406372, + "learning_rate": 3.666142140655481e-05, + "loss": 0.5591, + "step": 90530 + }, + { + "epoch": 0.8004031188670239, + "grad_norm": 5.187861919403076, + "learning_rate": 3.665994801888294e-05, + "loss": 0.5942, + "step": 90540 + }, + { + "epoch": 0.8004915221273361, + "grad_norm": 6.124841690063477, + "learning_rate": 3.6658474631211063e-05, + "loss": 0.7474, + "step": 90550 + }, + { + "epoch": 0.8005799253876483, + "grad_norm": 5.242320537567139, + "learning_rate": 3.66570012435392e-05, + "loss": 0.6616, + "step": 90560 + }, + { + "epoch": 0.8006683286479606, + "grad_norm": 3.7027320861816406, + "learning_rate": 3.665552785586733e-05, + "loss": 0.5237, + "step": 90570 + }, + { + "epoch": 0.8007567319082728, + "grad_norm": 3.004584789276123, + "learning_rate": 3.6654054468195455e-05, + "loss": 0.7868, + "step": 90580 + }, + { + "epoch": 0.800845135168585, + "grad_norm": 9.394257545471191, + "learning_rate": 3.6652581080523584e-05, + "loss": 0.76, + "step": 90590 + }, + { + "epoch": 0.8009335384288973, + "grad_norm": 9.192709922790527, + "learning_rate": 3.665110769285172e-05, + "loss": 0.6656, + "step": 90600 + }, + { + "epoch": 0.8010219416892095, + "grad_norm": 5.200221061706543, + "learning_rate": 3.664963430517984e-05, + "loss": 0.6364, + "step": 90610 + }, + { + "epoch": 0.8011103449495217, + "grad_norm": 6.382055759429932, + "learning_rate": 3.6648160917507975e-05, + "loss": 0.7844, + "step": 90620 + }, + { + "epoch": 0.8011987482098339, + "grad_norm": 5.1465535163879395, + "learning_rate": 3.6646687529836104e-05, + "loss": 0.6745, + "step": 90630 + }, + { + "epoch": 0.8012871514701462, + "grad_norm": 3.387704610824585, + "learning_rate": 3.664521414216423e-05, + "loss": 0.6639, + "step": 90640 + }, + { + "epoch": 0.8013755547304584, + "grad_norm": 2.0622878074645996, + "learning_rate": 3.664374075449236e-05, + "loss": 0.7891, + "step": 90650 + }, + { + "epoch": 0.8014639579907707, + "grad_norm": 4.627909183502197, + "learning_rate": 3.664226736682049e-05, + "loss": 0.799, + "step": 90660 + }, + { + "epoch": 0.801552361251083, + "grad_norm": 2.908628463745117, + "learning_rate": 3.664079397914862e-05, + "loss": 0.7835, + "step": 90670 + }, + { + "epoch": 0.8016407645113952, + "grad_norm": 4.224820137023926, + "learning_rate": 3.663932059147675e-05, + "loss": 0.6483, + "step": 90680 + }, + { + "epoch": 0.8017291677717074, + "grad_norm": 3.7172868251800537, + "learning_rate": 3.6637847203804874e-05, + "loss": 0.6258, + "step": 90690 + }, + { + "epoch": 0.8018175710320197, + "grad_norm": 2.3695993423461914, + "learning_rate": 3.663637381613301e-05, + "loss": 0.6825, + "step": 90700 + }, + { + "epoch": 0.8019059742923319, + "grad_norm": 2.419285774230957, + "learning_rate": 3.663490042846114e-05, + "loss": 0.6306, + "step": 90710 + }, + { + "epoch": 0.8019943775526441, + "grad_norm": 2.4181325435638428, + "learning_rate": 3.6633427040789266e-05, + "loss": 0.6776, + "step": 90720 + }, + { + "epoch": 0.8020827808129564, + "grad_norm": 3.598076343536377, + "learning_rate": 3.6631953653117394e-05, + "loss": 0.6597, + "step": 90730 + }, + { + "epoch": 0.8021711840732686, + "grad_norm": 4.75786018371582, + "learning_rate": 3.663048026544553e-05, + "loss": 0.7046, + "step": 90740 + }, + { + "epoch": 0.8022595873335808, + "grad_norm": 2.5627851486206055, + "learning_rate": 3.662900687777365e-05, + "loss": 0.5065, + "step": 90750 + }, + { + "epoch": 0.8023479905938931, + "grad_norm": 1.4296832084655762, + "learning_rate": 3.6627533490101786e-05, + "loss": 0.6487, + "step": 90760 + }, + { + "epoch": 0.8024363938542053, + "grad_norm": 2.2443766593933105, + "learning_rate": 3.6626060102429914e-05, + "loss": 0.8873, + "step": 90770 + }, + { + "epoch": 0.8025247971145176, + "grad_norm": 2.9139606952667236, + "learning_rate": 3.662458671475804e-05, + "loss": 0.6156, + "step": 90780 + }, + { + "epoch": 0.8026132003748299, + "grad_norm": 4.232553482055664, + "learning_rate": 3.662311332708617e-05, + "loss": 0.79, + "step": 90790 + }, + { + "epoch": 0.8027016036351421, + "grad_norm": 6.342467308044434, + "learning_rate": 3.66216399394143e-05, + "loss": 0.7305, + "step": 90800 + }, + { + "epoch": 0.8027900068954543, + "grad_norm": 1.851207971572876, + "learning_rate": 3.662016655174243e-05, + "loss": 0.8652, + "step": 90810 + }, + { + "epoch": 0.8028784101557666, + "grad_norm": 3.6889631748199463, + "learning_rate": 3.661869316407056e-05, + "loss": 0.6235, + "step": 90820 + }, + { + "epoch": 0.8029668134160788, + "grad_norm": 7.3890533447265625, + "learning_rate": 3.6617219776398684e-05, + "loss": 0.5818, + "step": 90830 + }, + { + "epoch": 0.803055216676391, + "grad_norm": 7.880035400390625, + "learning_rate": 3.661574638872682e-05, + "loss": 0.5788, + "step": 90840 + }, + { + "epoch": 0.8031436199367032, + "grad_norm": 4.1777825355529785, + "learning_rate": 3.661427300105495e-05, + "loss": 0.5818, + "step": 90850 + }, + { + "epoch": 0.8032320231970155, + "grad_norm": 3.1537744998931885, + "learning_rate": 3.6612799613383076e-05, + "loss": 0.7929, + "step": 90860 + }, + { + "epoch": 0.8033204264573277, + "grad_norm": 5.2961955070495605, + "learning_rate": 3.6611326225711205e-05, + "loss": 0.7781, + "step": 90870 + }, + { + "epoch": 0.8034088297176399, + "grad_norm": 2.942534923553467, + "learning_rate": 3.660985283803934e-05, + "loss": 0.7238, + "step": 90880 + }, + { + "epoch": 0.8034972329779523, + "grad_norm": 3.3210320472717285, + "learning_rate": 3.660837945036746e-05, + "loss": 0.6948, + "step": 90890 + }, + { + "epoch": 0.8035856362382645, + "grad_norm": 7.409834861755371, + "learning_rate": 3.6606906062695596e-05, + "loss": 0.6393, + "step": 90900 + }, + { + "epoch": 0.8036740394985767, + "grad_norm": 2.8655855655670166, + "learning_rate": 3.660543267502372e-05, + "loss": 0.7015, + "step": 90910 + }, + { + "epoch": 0.803762442758889, + "grad_norm": 4.531734943389893, + "learning_rate": 3.660395928735185e-05, + "loss": 0.7349, + "step": 90920 + }, + { + "epoch": 0.8038508460192012, + "grad_norm": 5.419953346252441, + "learning_rate": 3.660248589967998e-05, + "loss": 0.7656, + "step": 90930 + }, + { + "epoch": 0.8039392492795134, + "grad_norm": 1.0050759315490723, + "learning_rate": 3.660101251200811e-05, + "loss": 0.643, + "step": 90940 + }, + { + "epoch": 0.8040276525398257, + "grad_norm": 1.530034065246582, + "learning_rate": 3.659953912433624e-05, + "loss": 0.6302, + "step": 90950 + }, + { + "epoch": 0.8041160558001379, + "grad_norm": 1.8367685079574585, + "learning_rate": 3.659806573666437e-05, + "loss": 0.6528, + "step": 90960 + }, + { + "epoch": 0.8042044590604501, + "grad_norm": 2.1076557636260986, + "learning_rate": 3.6596592348992495e-05, + "loss": 0.7145, + "step": 90970 + }, + { + "epoch": 0.8042928623207624, + "grad_norm": 2.1007304191589355, + "learning_rate": 3.659511896132063e-05, + "loss": 0.6833, + "step": 90980 + }, + { + "epoch": 0.8043812655810746, + "grad_norm": 2.545356035232544, + "learning_rate": 3.659364557364876e-05, + "loss": 0.5637, + "step": 90990 + }, + { + "epoch": 0.8044696688413868, + "grad_norm": 1.8784129619598389, + "learning_rate": 3.659217218597689e-05, + "loss": 0.5453, + "step": 91000 + }, + { + "epoch": 0.8045580721016992, + "grad_norm": 6.988739013671875, + "learning_rate": 3.6590698798305015e-05, + "loss": 0.7347, + "step": 91010 + }, + { + "epoch": 0.8046464753620114, + "grad_norm": 5.475579261779785, + "learning_rate": 3.6589225410633143e-05, + "loss": 0.6541, + "step": 91020 + }, + { + "epoch": 0.8047348786223236, + "grad_norm": 4.5187907218933105, + "learning_rate": 3.658775202296127e-05, + "loss": 0.6761, + "step": 91030 + }, + { + "epoch": 0.8048232818826359, + "grad_norm": 13.180841445922852, + "learning_rate": 3.658627863528941e-05, + "loss": 0.6719, + "step": 91040 + }, + { + "epoch": 0.8049116851429481, + "grad_norm": 32.88784408569336, + "learning_rate": 3.658480524761753e-05, + "loss": 0.557, + "step": 91050 + }, + { + "epoch": 0.8050000884032603, + "grad_norm": 7.399740695953369, + "learning_rate": 3.6583331859945664e-05, + "loss": 0.6873, + "step": 91060 + }, + { + "epoch": 0.8050884916635725, + "grad_norm": 3.388075590133667, + "learning_rate": 3.658185847227379e-05, + "loss": 0.6513, + "step": 91070 + }, + { + "epoch": 0.8051768949238848, + "grad_norm": 1.3924405574798584, + "learning_rate": 3.658038508460192e-05, + "loss": 0.6953, + "step": 91080 + }, + { + "epoch": 0.805265298184197, + "grad_norm": 1.3593623638153076, + "learning_rate": 3.657891169693005e-05, + "loss": 0.6209, + "step": 91090 + }, + { + "epoch": 0.8053537014445092, + "grad_norm": 5.281068801879883, + "learning_rate": 3.6577438309258184e-05, + "loss": 0.5972, + "step": 91100 + }, + { + "epoch": 0.8054421047048215, + "grad_norm": 4.000575065612793, + "learning_rate": 3.6575964921586305e-05, + "loss": 0.5616, + "step": 91110 + }, + { + "epoch": 0.8055305079651337, + "grad_norm": 2.1831018924713135, + "learning_rate": 3.657449153391444e-05, + "loss": 0.6358, + "step": 91120 + }, + { + "epoch": 0.805618911225446, + "grad_norm": 5.4715094566345215, + "learning_rate": 3.657301814624257e-05, + "loss": 0.6813, + "step": 91130 + }, + { + "epoch": 0.8057073144857583, + "grad_norm": 1.7945505380630493, + "learning_rate": 3.65715447585707e-05, + "loss": 0.6476, + "step": 91140 + }, + { + "epoch": 0.8057957177460705, + "grad_norm": 1.485714077949524, + "learning_rate": 3.6570071370898826e-05, + "loss": 0.6773, + "step": 91150 + }, + { + "epoch": 0.8058841210063827, + "grad_norm": 2.7322723865509033, + "learning_rate": 3.6568597983226954e-05, + "loss": 0.5902, + "step": 91160 + }, + { + "epoch": 0.805972524266695, + "grad_norm": 1.9922597408294678, + "learning_rate": 3.656712459555508e-05, + "loss": 0.4839, + "step": 91170 + }, + { + "epoch": 0.8060609275270072, + "grad_norm": 4.767215251922607, + "learning_rate": 3.656565120788322e-05, + "loss": 0.6663, + "step": 91180 + }, + { + "epoch": 0.8061493307873194, + "grad_norm": 2.5317060947418213, + "learning_rate": 3.6564177820211346e-05, + "loss": 0.6364, + "step": 91190 + }, + { + "epoch": 0.8062377340476317, + "grad_norm": 1.9275096654891968, + "learning_rate": 3.6562704432539474e-05, + "loss": 0.6303, + "step": 91200 + }, + { + "epoch": 0.8063261373079439, + "grad_norm": 10.892936706542969, + "learning_rate": 3.65612310448676e-05, + "loss": 0.747, + "step": 91210 + }, + { + "epoch": 0.8064145405682561, + "grad_norm": 7.613400936126709, + "learning_rate": 3.655975765719573e-05, + "loss": 0.6131, + "step": 91220 + }, + { + "epoch": 0.8065029438285684, + "grad_norm": 1.0172770023345947, + "learning_rate": 3.655828426952386e-05, + "loss": 0.648, + "step": 91230 + }, + { + "epoch": 0.8065913470888806, + "grad_norm": 2.1202609539031982, + "learning_rate": 3.6556810881851994e-05, + "loss": 0.5708, + "step": 91240 + }, + { + "epoch": 0.8066797503491929, + "grad_norm": 0.8410158157348633, + "learning_rate": 3.655533749418012e-05, + "loss": 0.6327, + "step": 91250 + }, + { + "epoch": 0.8067681536095052, + "grad_norm": 2.0556564331054688, + "learning_rate": 3.655386410650825e-05, + "loss": 0.6523, + "step": 91260 + }, + { + "epoch": 0.8068565568698174, + "grad_norm": 5.771847248077393, + "learning_rate": 3.655239071883638e-05, + "loss": 0.6363, + "step": 91270 + }, + { + "epoch": 0.8069449601301296, + "grad_norm": 1.1063709259033203, + "learning_rate": 3.655091733116451e-05, + "loss": 0.7078, + "step": 91280 + }, + { + "epoch": 0.8070333633904418, + "grad_norm": 7.785793781280518, + "learning_rate": 3.6549443943492636e-05, + "loss": 0.7513, + "step": 91290 + }, + { + "epoch": 0.8071217666507541, + "grad_norm": 3.255139112472534, + "learning_rate": 3.6547970555820765e-05, + "loss": 0.66, + "step": 91300 + }, + { + "epoch": 0.8072101699110663, + "grad_norm": 3.1840829849243164, + "learning_rate": 3.65464971681489e-05, + "loss": 0.7693, + "step": 91310 + }, + { + "epoch": 0.8072985731713785, + "grad_norm": 7.684268951416016, + "learning_rate": 3.654502378047703e-05, + "loss": 0.5535, + "step": 91320 + }, + { + "epoch": 0.8073869764316908, + "grad_norm": 6.404549598693848, + "learning_rate": 3.6543550392805156e-05, + "loss": 0.6261, + "step": 91330 + }, + { + "epoch": 0.807475379692003, + "grad_norm": 4.003452777862549, + "learning_rate": 3.6542077005133285e-05, + "loss": 0.6514, + "step": 91340 + }, + { + "epoch": 0.8075637829523152, + "grad_norm": 4.139762878417969, + "learning_rate": 3.654060361746141e-05, + "loss": 0.6895, + "step": 91350 + }, + { + "epoch": 0.8076521862126276, + "grad_norm": 2.405508279800415, + "learning_rate": 3.653913022978954e-05, + "loss": 0.6905, + "step": 91360 + }, + { + "epoch": 0.8077405894729398, + "grad_norm": 4.928290367126465, + "learning_rate": 3.6537656842117677e-05, + "loss": 0.4836, + "step": 91370 + }, + { + "epoch": 0.807828992733252, + "grad_norm": 6.773512363433838, + "learning_rate": 3.65361834544458e-05, + "loss": 0.6862, + "step": 91380 + }, + { + "epoch": 0.8079173959935643, + "grad_norm": 5.532743453979492, + "learning_rate": 3.653471006677393e-05, + "loss": 0.5549, + "step": 91390 + }, + { + "epoch": 0.8080057992538765, + "grad_norm": 4.4739861488342285, + "learning_rate": 3.653323667910206e-05, + "loss": 0.6652, + "step": 91400 + }, + { + "epoch": 0.8080942025141887, + "grad_norm": 2.3816475868225098, + "learning_rate": 3.653176329143019e-05, + "loss": 0.6128, + "step": 91410 + }, + { + "epoch": 0.808182605774501, + "grad_norm": 4.370458126068115, + "learning_rate": 3.653028990375832e-05, + "loss": 0.7605, + "step": 91420 + }, + { + "epoch": 0.8082710090348132, + "grad_norm": 3.0469794273376465, + "learning_rate": 3.6528816516086453e-05, + "loss": 0.6975, + "step": 91430 + }, + { + "epoch": 0.8083594122951254, + "grad_norm": 4.971312999725342, + "learning_rate": 3.6527343128414575e-05, + "loss": 0.7178, + "step": 91440 + }, + { + "epoch": 0.8084478155554377, + "grad_norm": 4.512945652008057, + "learning_rate": 3.652586974074271e-05, + "loss": 0.6244, + "step": 91450 + }, + { + "epoch": 0.8085362188157499, + "grad_norm": 8.823596954345703, + "learning_rate": 3.652439635307084e-05, + "loss": 0.599, + "step": 91460 + }, + { + "epoch": 0.8086246220760621, + "grad_norm": 2.64381742477417, + "learning_rate": 3.652292296539897e-05, + "loss": 0.6012, + "step": 91470 + }, + { + "epoch": 0.8087130253363745, + "grad_norm": 1.6936637163162231, + "learning_rate": 3.6521449577727095e-05, + "loss": 0.7722, + "step": 91480 + }, + { + "epoch": 0.8088014285966867, + "grad_norm": 1.9723860025405884, + "learning_rate": 3.6519976190055224e-05, + "loss": 0.727, + "step": 91490 + }, + { + "epoch": 0.8088898318569989, + "grad_norm": 6.06514835357666, + "learning_rate": 3.651850280238335e-05, + "loss": 0.7831, + "step": 91500 + }, + { + "epoch": 0.8089782351173112, + "grad_norm": 9.091752052307129, + "learning_rate": 3.651702941471149e-05, + "loss": 0.624, + "step": 91510 + }, + { + "epoch": 0.8090666383776234, + "grad_norm": 4.952655792236328, + "learning_rate": 3.651555602703961e-05, + "loss": 0.617, + "step": 91520 + }, + { + "epoch": 0.8091550416379356, + "grad_norm": 3.2162885665893555, + "learning_rate": 3.6514082639367744e-05, + "loss": 0.6377, + "step": 91530 + }, + { + "epoch": 0.8092434448982478, + "grad_norm": 2.793762445449829, + "learning_rate": 3.651260925169587e-05, + "loss": 0.6733, + "step": 91540 + }, + { + "epoch": 0.8093318481585601, + "grad_norm": 1.3875963687896729, + "learning_rate": 3.6511135864024e-05, + "loss": 0.6158, + "step": 91550 + }, + { + "epoch": 0.8094202514188723, + "grad_norm": 1.0732028484344482, + "learning_rate": 3.650966247635213e-05, + "loss": 0.6045, + "step": 91560 + }, + { + "epoch": 0.8095086546791845, + "grad_norm": 2.06482195854187, + "learning_rate": 3.6508189088680264e-05, + "loss": 0.6835, + "step": 91570 + }, + { + "epoch": 0.8095970579394968, + "grad_norm": 8.042503356933594, + "learning_rate": 3.6506715701008386e-05, + "loss": 0.6592, + "step": 91580 + }, + { + "epoch": 0.809685461199809, + "grad_norm": 6.1628241539001465, + "learning_rate": 3.650524231333652e-05, + "loss": 0.5696, + "step": 91590 + }, + { + "epoch": 0.8097738644601213, + "grad_norm": 1.5379279851913452, + "learning_rate": 3.650376892566464e-05, + "loss": 0.6397, + "step": 91600 + }, + { + "epoch": 0.8098622677204336, + "grad_norm": 7.55742073059082, + "learning_rate": 3.650229553799278e-05, + "loss": 0.698, + "step": 91610 + }, + { + "epoch": 0.8099506709807458, + "grad_norm": 5.294787883758545, + "learning_rate": 3.6500822150320906e-05, + "loss": 0.5999, + "step": 91620 + }, + { + "epoch": 0.810039074241058, + "grad_norm": 2.8587660789489746, + "learning_rate": 3.6499348762649034e-05, + "loss": 0.6621, + "step": 91630 + }, + { + "epoch": 0.8101274775013703, + "grad_norm": 5.2149858474731445, + "learning_rate": 3.649787537497716e-05, + "loss": 0.7445, + "step": 91640 + }, + { + "epoch": 0.8102158807616825, + "grad_norm": 5.11593770980835, + "learning_rate": 3.64964019873053e-05, + "loss": 0.7587, + "step": 91650 + }, + { + "epoch": 0.8103042840219947, + "grad_norm": 9.673462867736816, + "learning_rate": 3.649492859963342e-05, + "loss": 0.7378, + "step": 91660 + }, + { + "epoch": 0.810392687282307, + "grad_norm": 3.2622766494750977, + "learning_rate": 3.6493455211961554e-05, + "loss": 0.6514, + "step": 91670 + }, + { + "epoch": 0.8104810905426192, + "grad_norm": 7.931560039520264, + "learning_rate": 3.649198182428968e-05, + "loss": 0.701, + "step": 91680 + }, + { + "epoch": 0.8105694938029314, + "grad_norm": 4.800394058227539, + "learning_rate": 3.649050843661781e-05, + "loss": 0.6736, + "step": 91690 + }, + { + "epoch": 0.8106578970632436, + "grad_norm": 1.9453575611114502, + "learning_rate": 3.648903504894594e-05, + "loss": 0.7023, + "step": 91700 + }, + { + "epoch": 0.8107463003235559, + "grad_norm": 3.5230698585510254, + "learning_rate": 3.6487561661274074e-05, + "loss": 0.6172, + "step": 91710 + }, + { + "epoch": 0.8108347035838682, + "grad_norm": 1.1256121397018433, + "learning_rate": 3.6486088273602196e-05, + "loss": 0.5613, + "step": 91720 + }, + { + "epoch": 0.8109231068441805, + "grad_norm": 1.6945984363555908, + "learning_rate": 3.648461488593033e-05, + "loss": 0.5496, + "step": 91730 + }, + { + "epoch": 0.8110115101044927, + "grad_norm": 4.023597240447998, + "learning_rate": 3.648314149825845e-05, + "loss": 0.7627, + "step": 91740 + }, + { + "epoch": 0.8110999133648049, + "grad_norm": 2.5677945613861084, + "learning_rate": 3.648166811058659e-05, + "loss": 0.7206, + "step": 91750 + }, + { + "epoch": 0.8111883166251171, + "grad_norm": 2.9123995304107666, + "learning_rate": 3.6480194722914716e-05, + "loss": 0.6682, + "step": 91760 + }, + { + "epoch": 0.8112767198854294, + "grad_norm": 2.7454440593719482, + "learning_rate": 3.6478721335242845e-05, + "loss": 0.5953, + "step": 91770 + }, + { + "epoch": 0.8113651231457416, + "grad_norm": 3.35835337638855, + "learning_rate": 3.647724794757097e-05, + "loss": 0.5792, + "step": 91780 + }, + { + "epoch": 0.8114535264060538, + "grad_norm": 8.722201347351074, + "learning_rate": 3.647577455989911e-05, + "loss": 0.6281, + "step": 91790 + }, + { + "epoch": 0.8115419296663661, + "grad_norm": 8.841720581054688, + "learning_rate": 3.647430117222723e-05, + "loss": 0.6054, + "step": 91800 + }, + { + "epoch": 0.8116303329266783, + "grad_norm": 2.318906545639038, + "learning_rate": 3.6472827784555365e-05, + "loss": 0.6452, + "step": 91810 + }, + { + "epoch": 0.8117187361869905, + "grad_norm": 6.147796154022217, + "learning_rate": 3.647135439688349e-05, + "loss": 0.6364, + "step": 91820 + }, + { + "epoch": 0.8118071394473028, + "grad_norm": 4.061427116394043, + "learning_rate": 3.646988100921162e-05, + "loss": 0.716, + "step": 91830 + }, + { + "epoch": 0.8118955427076151, + "grad_norm": 9.489065170288086, + "learning_rate": 3.646840762153975e-05, + "loss": 0.4981, + "step": 91840 + }, + { + "epoch": 0.8119839459679273, + "grad_norm": 2.0221989154815674, + "learning_rate": 3.646693423386788e-05, + "loss": 0.6535, + "step": 91850 + }, + { + "epoch": 0.8120723492282396, + "grad_norm": 6.83666467666626, + "learning_rate": 3.6465460846196007e-05, + "loss": 0.6213, + "step": 91860 + }, + { + "epoch": 0.8121607524885518, + "grad_norm": 2.6959662437438965, + "learning_rate": 3.646398745852414e-05, + "loss": 0.7453, + "step": 91870 + }, + { + "epoch": 0.812249155748864, + "grad_norm": 7.575078964233398, + "learning_rate": 3.646251407085226e-05, + "loss": 0.7979, + "step": 91880 + }, + { + "epoch": 0.8123375590091763, + "grad_norm": 2.185880661010742, + "learning_rate": 3.64610406831804e-05, + "loss": 0.6016, + "step": 91890 + }, + { + "epoch": 0.8124259622694885, + "grad_norm": 2.460078477859497, + "learning_rate": 3.645956729550853e-05, + "loss": 0.6122, + "step": 91900 + }, + { + "epoch": 0.8125143655298007, + "grad_norm": 4.931056976318359, + "learning_rate": 3.6458093907836655e-05, + "loss": 0.7672, + "step": 91910 + }, + { + "epoch": 0.812602768790113, + "grad_norm": 3.246084451675415, + "learning_rate": 3.6456620520164783e-05, + "loss": 0.6281, + "step": 91920 + }, + { + "epoch": 0.8126911720504252, + "grad_norm": 4.333129405975342, + "learning_rate": 3.645514713249292e-05, + "loss": 0.6461, + "step": 91930 + }, + { + "epoch": 0.8127795753107374, + "grad_norm": 1.3898875713348389, + "learning_rate": 3.645367374482104e-05, + "loss": 0.5682, + "step": 91940 + }, + { + "epoch": 0.8128679785710498, + "grad_norm": 1.76920485496521, + "learning_rate": 3.6452200357149175e-05, + "loss": 0.764, + "step": 91950 + }, + { + "epoch": 0.812956381831362, + "grad_norm": 3.05548357963562, + "learning_rate": 3.64507269694773e-05, + "loss": 0.6244, + "step": 91960 + }, + { + "epoch": 0.8130447850916742, + "grad_norm": 2.593017816543579, + "learning_rate": 3.644925358180543e-05, + "loss": 0.6237, + "step": 91970 + }, + { + "epoch": 0.8131331883519864, + "grad_norm": 5.923634052276611, + "learning_rate": 3.644778019413356e-05, + "loss": 0.5903, + "step": 91980 + }, + { + "epoch": 0.8132215916122987, + "grad_norm": 2.416029453277588, + "learning_rate": 3.644630680646169e-05, + "loss": 0.6887, + "step": 91990 + }, + { + "epoch": 0.8133099948726109, + "grad_norm": 5.269219398498535, + "learning_rate": 3.644483341878982e-05, + "loss": 0.5629, + "step": 92000 + }, + { + "epoch": 0.8133983981329231, + "grad_norm": 1.3841913938522339, + "learning_rate": 3.644336003111795e-05, + "loss": 0.7364, + "step": 92010 + }, + { + "epoch": 0.8134868013932354, + "grad_norm": 7.034547328948975, + "learning_rate": 3.6441886643446074e-05, + "loss": 0.661, + "step": 92020 + }, + { + "epoch": 0.8135752046535476, + "grad_norm": 2.8226442337036133, + "learning_rate": 3.644041325577421e-05, + "loss": 0.7605, + "step": 92030 + }, + { + "epoch": 0.8136636079138598, + "grad_norm": 3.047433376312256, + "learning_rate": 3.643893986810234e-05, + "loss": 0.6732, + "step": 92040 + }, + { + "epoch": 0.8137520111741721, + "grad_norm": 5.352085113525391, + "learning_rate": 3.6437466480430466e-05, + "loss": 0.666, + "step": 92050 + }, + { + "epoch": 0.8138404144344843, + "grad_norm": 7.8496246337890625, + "learning_rate": 3.6435993092758594e-05, + "loss": 0.6799, + "step": 92060 + }, + { + "epoch": 0.8139288176947966, + "grad_norm": 3.2211687564849854, + "learning_rate": 3.643451970508672e-05, + "loss": 0.6864, + "step": 92070 + }, + { + "epoch": 0.8140172209551089, + "grad_norm": 1.2963106632232666, + "learning_rate": 3.643304631741485e-05, + "loss": 0.5978, + "step": 92080 + }, + { + "epoch": 0.8141056242154211, + "grad_norm": 7.160528182983398, + "learning_rate": 3.6431572929742986e-05, + "loss": 0.6484, + "step": 92090 + }, + { + "epoch": 0.8141940274757333, + "grad_norm": 3.806544542312622, + "learning_rate": 3.6430099542071114e-05, + "loss": 0.6917, + "step": 92100 + }, + { + "epoch": 0.8142824307360456, + "grad_norm": 2.3122305870056152, + "learning_rate": 3.642862615439924e-05, + "loss": 0.8155, + "step": 92110 + }, + { + "epoch": 0.8143708339963578, + "grad_norm": 2.539961338043213, + "learning_rate": 3.642715276672737e-05, + "loss": 0.5733, + "step": 92120 + }, + { + "epoch": 0.81445923725667, + "grad_norm": 2.348661184310913, + "learning_rate": 3.64256793790555e-05, + "loss": 0.6232, + "step": 92130 + }, + { + "epoch": 0.8145476405169823, + "grad_norm": 8.354660987854004, + "learning_rate": 3.642420599138363e-05, + "loss": 0.6642, + "step": 92140 + }, + { + "epoch": 0.8146360437772945, + "grad_norm": 1.988718867301941, + "learning_rate": 3.642273260371176e-05, + "loss": 0.7333, + "step": 92150 + }, + { + "epoch": 0.8147244470376067, + "grad_norm": 1.8847707509994507, + "learning_rate": 3.642125921603989e-05, + "loss": 0.6676, + "step": 92160 + }, + { + "epoch": 0.814812850297919, + "grad_norm": 1.855980396270752, + "learning_rate": 3.641978582836802e-05, + "loss": 0.7347, + "step": 92170 + }, + { + "epoch": 0.8149012535582312, + "grad_norm": 3.3950014114379883, + "learning_rate": 3.641831244069615e-05, + "loss": 0.5434, + "step": 92180 + }, + { + "epoch": 0.8149896568185435, + "grad_norm": 1.589786410331726, + "learning_rate": 3.6416839053024276e-05, + "loss": 0.6124, + "step": 92190 + }, + { + "epoch": 0.8150780600788557, + "grad_norm": 2.3193206787109375, + "learning_rate": 3.6415365665352404e-05, + "loss": 0.7166, + "step": 92200 + }, + { + "epoch": 0.815166463339168, + "grad_norm": 2.0396432876586914, + "learning_rate": 3.641389227768053e-05, + "loss": 0.5533, + "step": 92210 + }, + { + "epoch": 0.8152548665994802, + "grad_norm": 2.424165964126587, + "learning_rate": 3.641241889000867e-05, + "loss": 0.6541, + "step": 92220 + }, + { + "epoch": 0.8153432698597924, + "grad_norm": 4.729544162750244, + "learning_rate": 3.6410945502336796e-05, + "loss": 0.6918, + "step": 92230 + }, + { + "epoch": 0.8154316731201047, + "grad_norm": 4.937289237976074, + "learning_rate": 3.6409472114664925e-05, + "loss": 0.6124, + "step": 92240 + }, + { + "epoch": 0.8155200763804169, + "grad_norm": 1.8067333698272705, + "learning_rate": 3.640799872699305e-05, + "loss": 0.595, + "step": 92250 + }, + { + "epoch": 0.8156084796407291, + "grad_norm": 8.4617919921875, + "learning_rate": 3.640652533932118e-05, + "loss": 0.6465, + "step": 92260 + }, + { + "epoch": 0.8156968829010414, + "grad_norm": 2.161268949508667, + "learning_rate": 3.640505195164931e-05, + "loss": 0.5909, + "step": 92270 + }, + { + "epoch": 0.8157852861613536, + "grad_norm": 12.53947925567627, + "learning_rate": 3.6403578563977445e-05, + "loss": 0.7782, + "step": 92280 + }, + { + "epoch": 0.8158736894216658, + "grad_norm": 4.181900501251221, + "learning_rate": 3.640210517630557e-05, + "loss": 0.6112, + "step": 92290 + }, + { + "epoch": 0.8159620926819781, + "grad_norm": 1.1209492683410645, + "learning_rate": 3.64006317886337e-05, + "loss": 0.673, + "step": 92300 + }, + { + "epoch": 0.8160504959422904, + "grad_norm": 3.288938045501709, + "learning_rate": 3.639915840096183e-05, + "loss": 0.6374, + "step": 92310 + }, + { + "epoch": 0.8161388992026026, + "grad_norm": 1.3260165452957153, + "learning_rate": 3.639768501328996e-05, + "loss": 0.5456, + "step": 92320 + }, + { + "epoch": 0.8162273024629149, + "grad_norm": 3.2640914916992188, + "learning_rate": 3.639621162561809e-05, + "loss": 0.6227, + "step": 92330 + }, + { + "epoch": 0.8163157057232271, + "grad_norm": 2.262810468673706, + "learning_rate": 3.639473823794622e-05, + "loss": 0.4703, + "step": 92340 + }, + { + "epoch": 0.8164041089835393, + "grad_norm": 2.8205833435058594, + "learning_rate": 3.639326485027434e-05, + "loss": 0.7569, + "step": 92350 + }, + { + "epoch": 0.8164925122438516, + "grad_norm": 11.378689765930176, + "learning_rate": 3.639179146260248e-05, + "loss": 0.693, + "step": 92360 + }, + { + "epoch": 0.8165809155041638, + "grad_norm": 2.483201503753662, + "learning_rate": 3.639031807493061e-05, + "loss": 0.7184, + "step": 92370 + }, + { + "epoch": 0.816669318764476, + "grad_norm": 5.942753314971924, + "learning_rate": 3.6388844687258735e-05, + "loss": 0.5987, + "step": 92380 + }, + { + "epoch": 0.8167577220247882, + "grad_norm": 1.5592632293701172, + "learning_rate": 3.6387371299586864e-05, + "loss": 0.7121, + "step": 92390 + }, + { + "epoch": 0.8168461252851005, + "grad_norm": 2.551483392715454, + "learning_rate": 3.6385897911915e-05, + "loss": 0.6107, + "step": 92400 + }, + { + "epoch": 0.8169345285454127, + "grad_norm": 6.301365852355957, + "learning_rate": 3.638442452424312e-05, + "loss": 0.5015, + "step": 92410 + }, + { + "epoch": 0.817022931805725, + "grad_norm": 2.5561041831970215, + "learning_rate": 3.6382951136571255e-05, + "loss": 0.7222, + "step": 92420 + }, + { + "epoch": 0.8171113350660373, + "grad_norm": 2.8549132347106934, + "learning_rate": 3.638147774889938e-05, + "loss": 0.712, + "step": 92430 + }, + { + "epoch": 0.8171997383263495, + "grad_norm": 3.7408576011657715, + "learning_rate": 3.638000436122751e-05, + "loss": 0.7478, + "step": 92440 + }, + { + "epoch": 0.8172881415866617, + "grad_norm": 6.571569442749023, + "learning_rate": 3.637853097355564e-05, + "loss": 0.6062, + "step": 92450 + }, + { + "epoch": 0.817376544846974, + "grad_norm": 2.056588649749756, + "learning_rate": 3.637705758588377e-05, + "loss": 0.679, + "step": 92460 + }, + { + "epoch": 0.8174649481072862, + "grad_norm": 2.586590528488159, + "learning_rate": 3.63755841982119e-05, + "loss": 0.5518, + "step": 92470 + }, + { + "epoch": 0.8175533513675984, + "grad_norm": 2.3542826175689697, + "learning_rate": 3.637411081054003e-05, + "loss": 0.6996, + "step": 92480 + }, + { + "epoch": 0.8176417546279107, + "grad_norm": 2.6961874961853027, + "learning_rate": 3.6372637422868154e-05, + "loss": 0.5685, + "step": 92490 + }, + { + "epoch": 0.8177301578882229, + "grad_norm": 2.5827276706695557, + "learning_rate": 3.637116403519629e-05, + "loss": 0.5815, + "step": 92500 + }, + { + "epoch": 0.8178185611485351, + "grad_norm": 4.169956207275391, + "learning_rate": 3.636969064752442e-05, + "loss": 0.6305, + "step": 92510 + }, + { + "epoch": 0.8179069644088474, + "grad_norm": 2.4388086795806885, + "learning_rate": 3.6368217259852546e-05, + "loss": 0.6421, + "step": 92520 + }, + { + "epoch": 0.8179953676691596, + "grad_norm": 12.235270500183105, + "learning_rate": 3.6366743872180674e-05, + "loss": 0.7019, + "step": 92530 + }, + { + "epoch": 0.8180837709294719, + "grad_norm": 1.524057388305664, + "learning_rate": 3.63652704845088e-05, + "loss": 0.5144, + "step": 92540 + }, + { + "epoch": 0.8181721741897842, + "grad_norm": 2.38299822807312, + "learning_rate": 3.636379709683693e-05, + "loss": 0.7373, + "step": 92550 + }, + { + "epoch": 0.8182605774500964, + "grad_norm": 2.7471745014190674, + "learning_rate": 3.6362323709165066e-05, + "loss": 0.6435, + "step": 92560 + }, + { + "epoch": 0.8183489807104086, + "grad_norm": 2.5418381690979004, + "learning_rate": 3.636085032149319e-05, + "loss": 0.7688, + "step": 92570 + }, + { + "epoch": 0.8184373839707209, + "grad_norm": 1.9382070302963257, + "learning_rate": 3.635937693382132e-05, + "loss": 0.7713, + "step": 92580 + }, + { + "epoch": 0.8185257872310331, + "grad_norm": 4.588695526123047, + "learning_rate": 3.635790354614945e-05, + "loss": 0.6895, + "step": 92590 + }, + { + "epoch": 0.8186141904913453, + "grad_norm": 1.5332796573638916, + "learning_rate": 3.635643015847758e-05, + "loss": 0.665, + "step": 92600 + }, + { + "epoch": 0.8187025937516575, + "grad_norm": 1.8502647876739502, + "learning_rate": 3.635495677080571e-05, + "loss": 0.6994, + "step": 92610 + }, + { + "epoch": 0.8187909970119698, + "grad_norm": 16.294696807861328, + "learning_rate": 3.635348338313384e-05, + "loss": 0.7789, + "step": 92620 + }, + { + "epoch": 0.818879400272282, + "grad_norm": 6.920746326446533, + "learning_rate": 3.6352009995461964e-05, + "loss": 0.6314, + "step": 92630 + }, + { + "epoch": 0.8189678035325942, + "grad_norm": 4.778506278991699, + "learning_rate": 3.63505366077901e-05, + "loss": 0.7118, + "step": 92640 + }, + { + "epoch": 0.8190562067929065, + "grad_norm": 2.9670848846435547, + "learning_rate": 3.634906322011823e-05, + "loss": 0.6154, + "step": 92650 + }, + { + "epoch": 0.8191446100532188, + "grad_norm": 4.475405216217041, + "learning_rate": 3.6347589832446356e-05, + "loss": 0.6261, + "step": 92660 + }, + { + "epoch": 0.819233013313531, + "grad_norm": 2.0727615356445312, + "learning_rate": 3.6346116444774485e-05, + "loss": 0.791, + "step": 92670 + }, + { + "epoch": 0.8193214165738433, + "grad_norm": 3.1234781742095947, + "learning_rate": 3.634464305710261e-05, + "loss": 0.6408, + "step": 92680 + }, + { + "epoch": 0.8194098198341555, + "grad_norm": 3.2888875007629395, + "learning_rate": 3.634316966943074e-05, + "loss": 0.6352, + "step": 92690 + }, + { + "epoch": 0.8194982230944677, + "grad_norm": 9.846280097961426, + "learning_rate": 3.6341696281758876e-05, + "loss": 0.6515, + "step": 92700 + }, + { + "epoch": 0.81958662635478, + "grad_norm": 2.0807485580444336, + "learning_rate": 3.6340222894087e-05, + "loss": 0.8486, + "step": 92710 + }, + { + "epoch": 0.8196750296150922, + "grad_norm": 2.234642744064331, + "learning_rate": 3.633874950641513e-05, + "loss": 0.5592, + "step": 92720 + }, + { + "epoch": 0.8197634328754044, + "grad_norm": 2.986532211303711, + "learning_rate": 3.633727611874326e-05, + "loss": 0.7216, + "step": 92730 + }, + { + "epoch": 0.8198518361357167, + "grad_norm": 4.456048011779785, + "learning_rate": 3.633580273107139e-05, + "loss": 0.749, + "step": 92740 + }, + { + "epoch": 0.8199402393960289, + "grad_norm": 1.583105206489563, + "learning_rate": 3.633432934339952e-05, + "loss": 0.479, + "step": 92750 + }, + { + "epoch": 0.8200286426563411, + "grad_norm": 7.858306884765625, + "learning_rate": 3.633285595572765e-05, + "loss": 0.6366, + "step": 92760 + }, + { + "epoch": 0.8201170459166534, + "grad_norm": 2.4444801807403564, + "learning_rate": 3.6331382568055775e-05, + "loss": 0.6335, + "step": 92770 + }, + { + "epoch": 0.8202054491769657, + "grad_norm": 3.6671173572540283, + "learning_rate": 3.632990918038391e-05, + "loss": 0.6705, + "step": 92780 + }, + { + "epoch": 0.8202938524372779, + "grad_norm": 7.266944408416748, + "learning_rate": 3.632843579271203e-05, + "loss": 0.5835, + "step": 92790 + }, + { + "epoch": 0.8203822556975902, + "grad_norm": 6.77391242980957, + "learning_rate": 3.632696240504017e-05, + "loss": 0.6936, + "step": 92800 + }, + { + "epoch": 0.8204706589579024, + "grad_norm": 1.7394851446151733, + "learning_rate": 3.6325489017368295e-05, + "loss": 0.6249, + "step": 92810 + }, + { + "epoch": 0.8205590622182146, + "grad_norm": 12.7716646194458, + "learning_rate": 3.6324015629696423e-05, + "loss": 0.7139, + "step": 92820 + }, + { + "epoch": 0.8206474654785268, + "grad_norm": 2.141550064086914, + "learning_rate": 3.632254224202455e-05, + "loss": 0.62, + "step": 92830 + }, + { + "epoch": 0.8207358687388391, + "grad_norm": 5.2641119956970215, + "learning_rate": 3.632106885435269e-05, + "loss": 0.6269, + "step": 92840 + }, + { + "epoch": 0.8208242719991513, + "grad_norm": 10.728096961975098, + "learning_rate": 3.631959546668081e-05, + "loss": 0.5302, + "step": 92850 + }, + { + "epoch": 0.8209126752594635, + "grad_norm": 2.6497015953063965, + "learning_rate": 3.6318122079008944e-05, + "loss": 0.6716, + "step": 92860 + }, + { + "epoch": 0.8210010785197758, + "grad_norm": 1.8583358526229858, + "learning_rate": 3.631664869133707e-05, + "loss": 0.6074, + "step": 92870 + }, + { + "epoch": 0.821089481780088, + "grad_norm": 6.227238178253174, + "learning_rate": 3.63151753036652e-05, + "loss": 0.5987, + "step": 92880 + }, + { + "epoch": 0.8211778850404002, + "grad_norm": 2.291771411895752, + "learning_rate": 3.631370191599333e-05, + "loss": 0.5297, + "step": 92890 + }, + { + "epoch": 0.8212662883007126, + "grad_norm": 7.058529376983643, + "learning_rate": 3.631222852832146e-05, + "loss": 0.7912, + "step": 92900 + }, + { + "epoch": 0.8213546915610248, + "grad_norm": 2.214385509490967, + "learning_rate": 3.6310755140649585e-05, + "loss": 0.7743, + "step": 92910 + }, + { + "epoch": 0.821443094821337, + "grad_norm": 2.1141932010650635, + "learning_rate": 3.630928175297772e-05, + "loss": 0.5796, + "step": 92920 + }, + { + "epoch": 0.8215314980816493, + "grad_norm": 1.197513461112976, + "learning_rate": 3.630780836530584e-05, + "loss": 0.6577, + "step": 92930 + }, + { + "epoch": 0.8216199013419615, + "grad_norm": 4.269301414489746, + "learning_rate": 3.630633497763398e-05, + "loss": 0.6044, + "step": 92940 + }, + { + "epoch": 0.8217083046022737, + "grad_norm": 8.293034553527832, + "learning_rate": 3.6304861589962106e-05, + "loss": 0.6815, + "step": 92950 + }, + { + "epoch": 0.821796707862586, + "grad_norm": 3.6531825065612793, + "learning_rate": 3.6303388202290234e-05, + "loss": 0.6008, + "step": 92960 + }, + { + "epoch": 0.8218851111228982, + "grad_norm": 2.503737211227417, + "learning_rate": 3.630191481461836e-05, + "loss": 0.8088, + "step": 92970 + }, + { + "epoch": 0.8219735143832104, + "grad_norm": 3.013113021850586, + "learning_rate": 3.63004414269465e-05, + "loss": 0.6954, + "step": 92980 + }, + { + "epoch": 0.8220619176435227, + "grad_norm": 4.123807907104492, + "learning_rate": 3.629896803927462e-05, + "loss": 0.5944, + "step": 92990 + }, + { + "epoch": 0.8221503209038349, + "grad_norm": 2.6282057762145996, + "learning_rate": 3.6297494651602754e-05, + "loss": 0.6253, + "step": 93000 + }, + { + "epoch": 0.8222387241641472, + "grad_norm": 1.1501773595809937, + "learning_rate": 3.629602126393088e-05, + "loss": 0.6613, + "step": 93010 + }, + { + "epoch": 0.8223271274244595, + "grad_norm": 1.26383376121521, + "learning_rate": 3.629454787625901e-05, + "loss": 0.6102, + "step": 93020 + }, + { + "epoch": 0.8224155306847717, + "grad_norm": 1.831240177154541, + "learning_rate": 3.629307448858714e-05, + "loss": 0.6655, + "step": 93030 + }, + { + "epoch": 0.8225039339450839, + "grad_norm": 12.614300727844238, + "learning_rate": 3.629160110091527e-05, + "loss": 0.6272, + "step": 93040 + }, + { + "epoch": 0.8225923372053962, + "grad_norm": 4.264084339141846, + "learning_rate": 3.6290127713243396e-05, + "loss": 0.7118, + "step": 93050 + }, + { + "epoch": 0.8226807404657084, + "grad_norm": 3.637566566467285, + "learning_rate": 3.628865432557153e-05, + "loss": 0.7523, + "step": 93060 + }, + { + "epoch": 0.8227691437260206, + "grad_norm": 1.5302402973175049, + "learning_rate": 3.628718093789966e-05, + "loss": 0.6187, + "step": 93070 + }, + { + "epoch": 0.8228575469863328, + "grad_norm": 2.6546175479888916, + "learning_rate": 3.628570755022779e-05, + "loss": 0.6413, + "step": 93080 + }, + { + "epoch": 0.8229459502466451, + "grad_norm": 6.245647430419922, + "learning_rate": 3.6284234162555916e-05, + "loss": 0.72, + "step": 93090 + }, + { + "epoch": 0.8230343535069573, + "grad_norm": 5.670719146728516, + "learning_rate": 3.6282760774884044e-05, + "loss": 0.6134, + "step": 93100 + }, + { + "epoch": 0.8231227567672695, + "grad_norm": 2.2253665924072266, + "learning_rate": 3.628128738721217e-05, + "loss": 0.6956, + "step": 93110 + }, + { + "epoch": 0.8232111600275818, + "grad_norm": 1.2625876665115356, + "learning_rate": 3.627981399954031e-05, + "loss": 0.7247, + "step": 93120 + }, + { + "epoch": 0.8232995632878941, + "grad_norm": 2.1546518802642822, + "learning_rate": 3.6278340611868436e-05, + "loss": 0.6002, + "step": 93130 + }, + { + "epoch": 0.8233879665482063, + "grad_norm": 3.581998348236084, + "learning_rate": 3.6276867224196565e-05, + "loss": 0.7919, + "step": 93140 + }, + { + "epoch": 0.8234763698085186, + "grad_norm": 2.450770378112793, + "learning_rate": 3.627539383652469e-05, + "loss": 0.6428, + "step": 93150 + }, + { + "epoch": 0.8235647730688308, + "grad_norm": 0.8883151412010193, + "learning_rate": 3.627392044885282e-05, + "loss": 0.6831, + "step": 93160 + }, + { + "epoch": 0.823653176329143, + "grad_norm": 2.1562340259552, + "learning_rate": 3.627244706118095e-05, + "loss": 0.7156, + "step": 93170 + }, + { + "epoch": 0.8237415795894553, + "grad_norm": 2.547234058380127, + "learning_rate": 3.627097367350908e-05, + "loss": 0.6353, + "step": 93180 + }, + { + "epoch": 0.8238299828497675, + "grad_norm": 3.1326427459716797, + "learning_rate": 3.626950028583721e-05, + "loss": 0.6836, + "step": 93190 + }, + { + "epoch": 0.8239183861100797, + "grad_norm": 1.0381940603256226, + "learning_rate": 3.626802689816534e-05, + "loss": 0.6069, + "step": 93200 + }, + { + "epoch": 0.824006789370392, + "grad_norm": 3.0886294841766357, + "learning_rate": 3.626655351049347e-05, + "loss": 0.6478, + "step": 93210 + }, + { + "epoch": 0.8240951926307042, + "grad_norm": 3.674008369445801, + "learning_rate": 3.62650801228216e-05, + "loss": 0.6818, + "step": 93220 + }, + { + "epoch": 0.8241835958910164, + "grad_norm": 4.91383171081543, + "learning_rate": 3.6263606735149727e-05, + "loss": 0.7287, + "step": 93230 + }, + { + "epoch": 0.8242719991513286, + "grad_norm": 4.42464542388916, + "learning_rate": 3.6262133347477855e-05, + "loss": 0.6606, + "step": 93240 + }, + { + "epoch": 0.824360402411641, + "grad_norm": 1.6349812746047974, + "learning_rate": 3.626065995980599e-05, + "loss": 0.5162, + "step": 93250 + }, + { + "epoch": 0.8244488056719532, + "grad_norm": 4.765417098999023, + "learning_rate": 3.625918657213411e-05, + "loss": 0.72, + "step": 93260 + }, + { + "epoch": 0.8245372089322655, + "grad_norm": 2.7557642459869385, + "learning_rate": 3.625771318446225e-05, + "loss": 0.6219, + "step": 93270 + }, + { + "epoch": 0.8246256121925777, + "grad_norm": 4.133311748504639, + "learning_rate": 3.6256239796790375e-05, + "loss": 0.6556, + "step": 93280 + }, + { + "epoch": 0.8247140154528899, + "grad_norm": 5.949282646179199, + "learning_rate": 3.6254766409118503e-05, + "loss": 0.5986, + "step": 93290 + }, + { + "epoch": 0.8248024187132021, + "grad_norm": 2.9737560749053955, + "learning_rate": 3.625329302144663e-05, + "loss": 0.7077, + "step": 93300 + }, + { + "epoch": 0.8248908219735144, + "grad_norm": 3.029956102371216, + "learning_rate": 3.625181963377477e-05, + "loss": 0.6805, + "step": 93310 + }, + { + "epoch": 0.8249792252338266, + "grad_norm": 3.5298495292663574, + "learning_rate": 3.625034624610289e-05, + "loss": 0.6401, + "step": 93320 + }, + { + "epoch": 0.8250676284941388, + "grad_norm": 11.33055305480957, + "learning_rate": 3.6248872858431024e-05, + "loss": 0.6757, + "step": 93330 + }, + { + "epoch": 0.8251560317544511, + "grad_norm": 2.6825695037841797, + "learning_rate": 3.624739947075915e-05, + "loss": 0.6602, + "step": 93340 + }, + { + "epoch": 0.8252444350147633, + "grad_norm": 5.537924289703369, + "learning_rate": 3.624592608308728e-05, + "loss": 0.6691, + "step": 93350 + }, + { + "epoch": 0.8253328382750755, + "grad_norm": 1.3969522714614868, + "learning_rate": 3.624445269541541e-05, + "loss": 0.6283, + "step": 93360 + }, + { + "epoch": 0.8254212415353879, + "grad_norm": 3.790619134902954, + "learning_rate": 3.624297930774354e-05, + "loss": 0.6556, + "step": 93370 + }, + { + "epoch": 0.8255096447957001, + "grad_norm": 1.347337245941162, + "learning_rate": 3.6241505920071665e-05, + "loss": 0.5757, + "step": 93380 + }, + { + "epoch": 0.8255980480560123, + "grad_norm": 1.6619949340820312, + "learning_rate": 3.62400325323998e-05, + "loss": 0.5592, + "step": 93390 + }, + { + "epoch": 0.8256864513163246, + "grad_norm": 9.179929733276367, + "learning_rate": 3.623855914472792e-05, + "loss": 0.6658, + "step": 93400 + }, + { + "epoch": 0.8257748545766368, + "grad_norm": 3.4941885471343994, + "learning_rate": 3.623708575705606e-05, + "loss": 0.6295, + "step": 93410 + }, + { + "epoch": 0.825863257836949, + "grad_norm": 7.0606207847595215, + "learning_rate": 3.6235612369384186e-05, + "loss": 0.5741, + "step": 93420 + }, + { + "epoch": 0.8259516610972613, + "grad_norm": 6.441812992095947, + "learning_rate": 3.6234138981712314e-05, + "loss": 0.7791, + "step": 93430 + }, + { + "epoch": 0.8260400643575735, + "grad_norm": 4.038876533508301, + "learning_rate": 3.623266559404044e-05, + "loss": 0.6294, + "step": 93440 + }, + { + "epoch": 0.8261284676178857, + "grad_norm": 1.269675850868225, + "learning_rate": 3.623119220636858e-05, + "loss": 0.6209, + "step": 93450 + }, + { + "epoch": 0.826216870878198, + "grad_norm": 6.175108432769775, + "learning_rate": 3.62297188186967e-05, + "loss": 0.6997, + "step": 93460 + }, + { + "epoch": 0.8263052741385102, + "grad_norm": 6.478979110717773, + "learning_rate": 3.6228245431024834e-05, + "loss": 0.7994, + "step": 93470 + }, + { + "epoch": 0.8263936773988225, + "grad_norm": 7.0543012619018555, + "learning_rate": 3.6226772043352956e-05, + "loss": 0.7055, + "step": 93480 + }, + { + "epoch": 0.8264820806591348, + "grad_norm": 2.9575681686401367, + "learning_rate": 3.622529865568109e-05, + "loss": 0.6662, + "step": 93490 + }, + { + "epoch": 0.826570483919447, + "grad_norm": 1.9254590272903442, + "learning_rate": 3.622382526800922e-05, + "loss": 0.561, + "step": 93500 + }, + { + "epoch": 0.8266588871797592, + "grad_norm": 1.9862453937530518, + "learning_rate": 3.622235188033735e-05, + "loss": 0.7086, + "step": 93510 + }, + { + "epoch": 0.8267472904400714, + "grad_norm": 2.7529067993164062, + "learning_rate": 3.6220878492665476e-05, + "loss": 0.5647, + "step": 93520 + }, + { + "epoch": 0.8268356937003837, + "grad_norm": 1.2451297044754028, + "learning_rate": 3.621940510499361e-05, + "loss": 0.6209, + "step": 93530 + }, + { + "epoch": 0.8269240969606959, + "grad_norm": 5.997985363006592, + "learning_rate": 3.621793171732173e-05, + "loss": 0.6259, + "step": 93540 + }, + { + "epoch": 0.8270125002210081, + "grad_norm": 0.9683815836906433, + "learning_rate": 3.621645832964987e-05, + "loss": 0.5411, + "step": 93550 + }, + { + "epoch": 0.8271009034813204, + "grad_norm": 2.685631275177002, + "learning_rate": 3.6214984941977996e-05, + "loss": 0.6966, + "step": 93560 + }, + { + "epoch": 0.8271893067416326, + "grad_norm": 3.4126617908477783, + "learning_rate": 3.6213511554306124e-05, + "loss": 0.775, + "step": 93570 + }, + { + "epoch": 0.8272777100019448, + "grad_norm": 1.7319691181182861, + "learning_rate": 3.621203816663425e-05, + "loss": 0.6634, + "step": 93580 + }, + { + "epoch": 0.8273661132622571, + "grad_norm": 1.2252484560012817, + "learning_rate": 3.621056477896239e-05, + "loss": 0.6667, + "step": 93590 + }, + { + "epoch": 0.8274545165225694, + "grad_norm": 6.5128679275512695, + "learning_rate": 3.620909139129051e-05, + "loss": 0.6113, + "step": 93600 + }, + { + "epoch": 0.8275429197828816, + "grad_norm": 8.293207168579102, + "learning_rate": 3.6207618003618645e-05, + "loss": 0.6084, + "step": 93610 + }, + { + "epoch": 0.8276313230431939, + "grad_norm": 11.024662017822266, + "learning_rate": 3.6206144615946766e-05, + "loss": 0.7372, + "step": 93620 + }, + { + "epoch": 0.8277197263035061, + "grad_norm": 2.8585751056671143, + "learning_rate": 3.62046712282749e-05, + "loss": 0.5327, + "step": 93630 + }, + { + "epoch": 0.8278081295638183, + "grad_norm": 9.185791969299316, + "learning_rate": 3.620319784060303e-05, + "loss": 0.6877, + "step": 93640 + }, + { + "epoch": 0.8278965328241306, + "grad_norm": 1.2710436582565308, + "learning_rate": 3.620172445293116e-05, + "loss": 0.6548, + "step": 93650 + }, + { + "epoch": 0.8279849360844428, + "grad_norm": 2.4045794010162354, + "learning_rate": 3.6200251065259286e-05, + "loss": 0.7098, + "step": 93660 + }, + { + "epoch": 0.828073339344755, + "grad_norm": 6.192056655883789, + "learning_rate": 3.619877767758742e-05, + "loss": 0.7721, + "step": 93670 + }, + { + "epoch": 0.8281617426050673, + "grad_norm": 4.213747024536133, + "learning_rate": 3.619730428991554e-05, + "loss": 0.6803, + "step": 93680 + }, + { + "epoch": 0.8282501458653795, + "grad_norm": 3.9271175861358643, + "learning_rate": 3.619583090224368e-05, + "loss": 0.5977, + "step": 93690 + }, + { + "epoch": 0.8283385491256917, + "grad_norm": 3.1815481185913086, + "learning_rate": 3.619435751457181e-05, + "loss": 0.642, + "step": 93700 + }, + { + "epoch": 0.828426952386004, + "grad_norm": 8.14946174621582, + "learning_rate": 3.6192884126899935e-05, + "loss": 0.7146, + "step": 93710 + }, + { + "epoch": 0.8285153556463163, + "grad_norm": 3.190246343612671, + "learning_rate": 3.619141073922806e-05, + "loss": 0.5952, + "step": 93720 + }, + { + "epoch": 0.8286037589066285, + "grad_norm": 4.229310989379883, + "learning_rate": 3.618993735155619e-05, + "loss": 0.6867, + "step": 93730 + }, + { + "epoch": 0.8286921621669407, + "grad_norm": 1.8603646755218506, + "learning_rate": 3.618846396388432e-05, + "loss": 0.7172, + "step": 93740 + }, + { + "epoch": 0.828780565427253, + "grad_norm": 2.589078187942505, + "learning_rate": 3.6186990576212455e-05, + "loss": 0.6561, + "step": 93750 + }, + { + "epoch": 0.8288689686875652, + "grad_norm": 6.150266647338867, + "learning_rate": 3.618551718854058e-05, + "loss": 0.7016, + "step": 93760 + }, + { + "epoch": 0.8289573719478774, + "grad_norm": 1.3360360860824585, + "learning_rate": 3.618404380086871e-05, + "loss": 0.6646, + "step": 93770 + }, + { + "epoch": 0.8290457752081897, + "grad_norm": 4.242101192474365, + "learning_rate": 3.618257041319684e-05, + "loss": 0.615, + "step": 93780 + }, + { + "epoch": 0.8291341784685019, + "grad_norm": 11.25772476196289, + "learning_rate": 3.618109702552497e-05, + "loss": 0.6885, + "step": 93790 + }, + { + "epoch": 0.8292225817288141, + "grad_norm": 2.015272378921509, + "learning_rate": 3.61796236378531e-05, + "loss": 0.5991, + "step": 93800 + }, + { + "epoch": 0.8293109849891264, + "grad_norm": 1.615455985069275, + "learning_rate": 3.617815025018123e-05, + "loss": 0.5903, + "step": 93810 + }, + { + "epoch": 0.8293993882494386, + "grad_norm": 1.867724061012268, + "learning_rate": 3.6176676862509354e-05, + "loss": 0.5391, + "step": 93820 + }, + { + "epoch": 0.8294877915097508, + "grad_norm": 1.86784029006958, + "learning_rate": 3.617520347483749e-05, + "loss": 0.6771, + "step": 93830 + }, + { + "epoch": 0.8295761947700632, + "grad_norm": 5.769179821014404, + "learning_rate": 3.617373008716561e-05, + "loss": 0.6704, + "step": 93840 + }, + { + "epoch": 0.8296645980303754, + "grad_norm": 3.508192300796509, + "learning_rate": 3.6172256699493745e-05, + "loss": 0.7017, + "step": 93850 + }, + { + "epoch": 0.8297530012906876, + "grad_norm": 0.9482429623603821, + "learning_rate": 3.6170783311821874e-05, + "loss": 0.6837, + "step": 93860 + }, + { + "epoch": 0.8298414045509999, + "grad_norm": 3.013530731201172, + "learning_rate": 3.616930992415e-05, + "loss": 0.6996, + "step": 93870 + }, + { + "epoch": 0.8299298078113121, + "grad_norm": 1.6802130937576294, + "learning_rate": 3.616783653647813e-05, + "loss": 0.6276, + "step": 93880 + }, + { + "epoch": 0.8300182110716243, + "grad_norm": 2.8071370124816895, + "learning_rate": 3.6166363148806266e-05, + "loss": 0.7872, + "step": 93890 + }, + { + "epoch": 0.8301066143319366, + "grad_norm": 2.110656976699829, + "learning_rate": 3.616488976113439e-05, + "loss": 0.6538, + "step": 93900 + }, + { + "epoch": 0.8301950175922488, + "grad_norm": 3.661648750305176, + "learning_rate": 3.616341637346252e-05, + "loss": 0.6225, + "step": 93910 + }, + { + "epoch": 0.830283420852561, + "grad_norm": 5.264840126037598, + "learning_rate": 3.616194298579065e-05, + "loss": 0.7919, + "step": 93920 + }, + { + "epoch": 0.8303718241128732, + "grad_norm": 2.264892101287842, + "learning_rate": 3.616046959811878e-05, + "loss": 0.5866, + "step": 93930 + }, + { + "epoch": 0.8304602273731855, + "grad_norm": 4.5659708976745605, + "learning_rate": 3.615899621044691e-05, + "loss": 0.5614, + "step": 93940 + }, + { + "epoch": 0.8305486306334977, + "grad_norm": 1.5349199771881104, + "learning_rate": 3.615752282277504e-05, + "loss": 0.5593, + "step": 93950 + }, + { + "epoch": 0.83063703389381, + "grad_norm": 6.145536422729492, + "learning_rate": 3.615604943510317e-05, + "loss": 0.8235, + "step": 93960 + }, + { + "epoch": 0.8307254371541223, + "grad_norm": 4.073675155639648, + "learning_rate": 3.61545760474313e-05, + "loss": 0.5412, + "step": 93970 + }, + { + "epoch": 0.8308138404144345, + "grad_norm": 2.0879557132720947, + "learning_rate": 3.615310265975943e-05, + "loss": 0.5803, + "step": 93980 + }, + { + "epoch": 0.8309022436747467, + "grad_norm": 1.188655972480774, + "learning_rate": 3.6151629272087556e-05, + "loss": 0.6371, + "step": 93990 + }, + { + "epoch": 0.830990646935059, + "grad_norm": 4.851926326751709, + "learning_rate": 3.6150155884415684e-05, + "loss": 0.4916, + "step": 94000 + }, + { + "epoch": 0.8310790501953712, + "grad_norm": 1.4034401178359985, + "learning_rate": 3.614868249674381e-05, + "loss": 0.6945, + "step": 94010 + }, + { + "epoch": 0.8311674534556834, + "grad_norm": 3.4772355556488037, + "learning_rate": 3.614720910907195e-05, + "loss": 0.6497, + "step": 94020 + }, + { + "epoch": 0.8312558567159957, + "grad_norm": 4.7165374755859375, + "learning_rate": 3.6145735721400076e-05, + "loss": 0.7531, + "step": 94030 + }, + { + "epoch": 0.8313442599763079, + "grad_norm": 3.0375566482543945, + "learning_rate": 3.6144262333728205e-05, + "loss": 0.6571, + "step": 94040 + }, + { + "epoch": 0.8314326632366201, + "grad_norm": 1.5466070175170898, + "learning_rate": 3.614278894605633e-05, + "loss": 0.6361, + "step": 94050 + }, + { + "epoch": 0.8315210664969324, + "grad_norm": 5.721677780151367, + "learning_rate": 3.614131555838446e-05, + "loss": 0.6532, + "step": 94060 + }, + { + "epoch": 0.8316094697572447, + "grad_norm": 1.3382488489151, + "learning_rate": 3.613984217071259e-05, + "loss": 0.6434, + "step": 94070 + }, + { + "epoch": 0.8316978730175569, + "grad_norm": 3.216163158416748, + "learning_rate": 3.6138368783040725e-05, + "loss": 0.8104, + "step": 94080 + }, + { + "epoch": 0.8317862762778692, + "grad_norm": 4.24934720993042, + "learning_rate": 3.6136895395368846e-05, + "loss": 0.6685, + "step": 94090 + }, + { + "epoch": 0.8318746795381814, + "grad_norm": 4.726006507873535, + "learning_rate": 3.613542200769698e-05, + "loss": 0.7063, + "step": 94100 + }, + { + "epoch": 0.8319630827984936, + "grad_norm": 5.045265197753906, + "learning_rate": 3.613394862002511e-05, + "loss": 0.6767, + "step": 94110 + }, + { + "epoch": 0.8320514860588059, + "grad_norm": 3.512802839279175, + "learning_rate": 3.613247523235324e-05, + "loss": 0.7515, + "step": 94120 + }, + { + "epoch": 0.8321398893191181, + "grad_norm": 3.4347739219665527, + "learning_rate": 3.6131001844681367e-05, + "loss": 0.5649, + "step": 94130 + }, + { + "epoch": 0.8322282925794303, + "grad_norm": 8.086050033569336, + "learning_rate": 3.61295284570095e-05, + "loss": 0.5289, + "step": 94140 + }, + { + "epoch": 0.8323166958397425, + "grad_norm": 3.934406042098999, + "learning_rate": 3.612805506933762e-05, + "loss": 0.6853, + "step": 94150 + }, + { + "epoch": 0.8324050991000548, + "grad_norm": 1.3040629625320435, + "learning_rate": 3.612658168166576e-05, + "loss": 0.7017, + "step": 94160 + }, + { + "epoch": 0.832493502360367, + "grad_norm": 3.054164409637451, + "learning_rate": 3.612510829399389e-05, + "loss": 0.6139, + "step": 94170 + }, + { + "epoch": 0.8325819056206792, + "grad_norm": 6.750848293304443, + "learning_rate": 3.6123634906322015e-05, + "loss": 0.6179, + "step": 94180 + }, + { + "epoch": 0.8326703088809916, + "grad_norm": 5.214075088500977, + "learning_rate": 3.6122161518650143e-05, + "loss": 0.6383, + "step": 94190 + }, + { + "epoch": 0.8327587121413038, + "grad_norm": 5.887363910675049, + "learning_rate": 3.612068813097827e-05, + "loss": 0.5707, + "step": 94200 + }, + { + "epoch": 0.832847115401616, + "grad_norm": 2.3657591342926025, + "learning_rate": 3.61192147433064e-05, + "loss": 0.5935, + "step": 94210 + }, + { + "epoch": 0.8329355186619283, + "grad_norm": 4.025388717651367, + "learning_rate": 3.6117741355634535e-05, + "loss": 0.6411, + "step": 94220 + }, + { + "epoch": 0.8330239219222405, + "grad_norm": 5.579286098480225, + "learning_rate": 3.611626796796266e-05, + "loss": 0.7095, + "step": 94230 + }, + { + "epoch": 0.8331123251825527, + "grad_norm": 2.4436745643615723, + "learning_rate": 3.611479458029079e-05, + "loss": 0.6464, + "step": 94240 + }, + { + "epoch": 0.833200728442865, + "grad_norm": 3.3473572731018066, + "learning_rate": 3.611332119261892e-05, + "loss": 0.6948, + "step": 94250 + }, + { + "epoch": 0.8332891317031772, + "grad_norm": 4.559767246246338, + "learning_rate": 3.611184780494705e-05, + "loss": 0.5037, + "step": 94260 + }, + { + "epoch": 0.8333775349634894, + "grad_norm": 2.4051787853240967, + "learning_rate": 3.611037441727518e-05, + "loss": 0.6604, + "step": 94270 + }, + { + "epoch": 0.8334659382238017, + "grad_norm": 1.8673170804977417, + "learning_rate": 3.610890102960331e-05, + "loss": 0.6683, + "step": 94280 + }, + { + "epoch": 0.8335543414841139, + "grad_norm": 2.71720290184021, + "learning_rate": 3.6107427641931434e-05, + "loss": 0.7054, + "step": 94290 + }, + { + "epoch": 0.8336427447444261, + "grad_norm": 4.1643571853637695, + "learning_rate": 3.610595425425957e-05, + "loss": 0.6801, + "step": 94300 + }, + { + "epoch": 0.8337311480047385, + "grad_norm": 2.6858880519866943, + "learning_rate": 3.610448086658769e-05, + "loss": 0.6288, + "step": 94310 + }, + { + "epoch": 0.8338195512650507, + "grad_norm": 2.0185506343841553, + "learning_rate": 3.6103007478915826e-05, + "loss": 0.6989, + "step": 94320 + }, + { + "epoch": 0.8339079545253629, + "grad_norm": 3.8516831398010254, + "learning_rate": 3.6101534091243954e-05, + "loss": 0.5838, + "step": 94330 + }, + { + "epoch": 0.8339963577856752, + "grad_norm": 8.46336841583252, + "learning_rate": 3.610006070357208e-05, + "loss": 0.8762, + "step": 94340 + }, + { + "epoch": 0.8340847610459874, + "grad_norm": 3.932737350463867, + "learning_rate": 3.609858731590021e-05, + "loss": 0.6922, + "step": 94350 + }, + { + "epoch": 0.8341731643062996, + "grad_norm": 13.593315124511719, + "learning_rate": 3.6097113928228346e-05, + "loss": 0.6391, + "step": 94360 + }, + { + "epoch": 0.8342615675666119, + "grad_norm": 2.8367323875427246, + "learning_rate": 3.609564054055647e-05, + "loss": 0.5589, + "step": 94370 + }, + { + "epoch": 0.8343499708269241, + "grad_norm": 2.1782689094543457, + "learning_rate": 3.60941671528846e-05, + "loss": 0.558, + "step": 94380 + }, + { + "epoch": 0.8344383740872363, + "grad_norm": 2.604386568069458, + "learning_rate": 3.609269376521273e-05, + "loss": 0.6621, + "step": 94390 + }, + { + "epoch": 0.8345267773475485, + "grad_norm": 2.1232829093933105, + "learning_rate": 3.609122037754086e-05, + "loss": 0.6319, + "step": 94400 + }, + { + "epoch": 0.8346151806078608, + "grad_norm": 4.2524919509887695, + "learning_rate": 3.608974698986899e-05, + "loss": 0.7322, + "step": 94410 + }, + { + "epoch": 0.834703583868173, + "grad_norm": 2.8115828037261963, + "learning_rate": 3.608827360219712e-05, + "loss": 0.7946, + "step": 94420 + }, + { + "epoch": 0.8347919871284853, + "grad_norm": 6.732259750366211, + "learning_rate": 3.6086800214525244e-05, + "loss": 0.6632, + "step": 94430 + }, + { + "epoch": 0.8348803903887976, + "grad_norm": 2.397207736968994, + "learning_rate": 3.608532682685338e-05, + "loss": 0.5989, + "step": 94440 + }, + { + "epoch": 0.8349687936491098, + "grad_norm": 4.116695880889893, + "learning_rate": 3.60838534391815e-05, + "loss": 0.6338, + "step": 94450 + }, + { + "epoch": 0.835057196909422, + "grad_norm": 1.6942788362503052, + "learning_rate": 3.6082380051509636e-05, + "loss": 0.7084, + "step": 94460 + }, + { + "epoch": 0.8351456001697343, + "grad_norm": 7.596015930175781, + "learning_rate": 3.6080906663837764e-05, + "loss": 0.6815, + "step": 94470 + }, + { + "epoch": 0.8352340034300465, + "grad_norm": 4.143629550933838, + "learning_rate": 3.607943327616589e-05, + "loss": 0.672, + "step": 94480 + }, + { + "epoch": 0.8353224066903587, + "grad_norm": 4.248392105102539, + "learning_rate": 3.607795988849402e-05, + "loss": 0.6851, + "step": 94490 + }, + { + "epoch": 0.835410809950671, + "grad_norm": 1.733974814414978, + "learning_rate": 3.6076486500822156e-05, + "loss": 0.6572, + "step": 94500 + }, + { + "epoch": 0.8354992132109832, + "grad_norm": 4.106632232666016, + "learning_rate": 3.607501311315028e-05, + "loss": 0.5702, + "step": 94510 + }, + { + "epoch": 0.8355876164712954, + "grad_norm": 4.915042400360107, + "learning_rate": 3.607353972547841e-05, + "loss": 0.6478, + "step": 94520 + }, + { + "epoch": 0.8356760197316077, + "grad_norm": 6.578536510467529, + "learning_rate": 3.607206633780654e-05, + "loss": 0.7171, + "step": 94530 + }, + { + "epoch": 0.83576442299192, + "grad_norm": 1.7583603858947754, + "learning_rate": 3.607059295013467e-05, + "loss": 0.6362, + "step": 94540 + }, + { + "epoch": 0.8358528262522322, + "grad_norm": 3.7528419494628906, + "learning_rate": 3.60691195624628e-05, + "loss": 0.6052, + "step": 94550 + }, + { + "epoch": 0.8359412295125445, + "grad_norm": 12.52851676940918, + "learning_rate": 3.6067646174790926e-05, + "loss": 0.4435, + "step": 94560 + }, + { + "epoch": 0.8360296327728567, + "grad_norm": 2.9721691608428955, + "learning_rate": 3.6066172787119055e-05, + "loss": 0.652, + "step": 94570 + }, + { + "epoch": 0.8361180360331689, + "grad_norm": 8.937162399291992, + "learning_rate": 3.606469939944719e-05, + "loss": 0.5816, + "step": 94580 + }, + { + "epoch": 0.8362064392934812, + "grad_norm": 7.869368553161621, + "learning_rate": 3.606322601177531e-05, + "loss": 0.6281, + "step": 94590 + }, + { + "epoch": 0.8362948425537934, + "grad_norm": 14.269586563110352, + "learning_rate": 3.6061752624103447e-05, + "loss": 0.7967, + "step": 94600 + }, + { + "epoch": 0.8363832458141056, + "grad_norm": 6.290854454040527, + "learning_rate": 3.6060279236431575e-05, + "loss": 0.7268, + "step": 94610 + }, + { + "epoch": 0.8364716490744178, + "grad_norm": 1.0632624626159668, + "learning_rate": 3.60588058487597e-05, + "loss": 0.5505, + "step": 94620 + }, + { + "epoch": 0.8365600523347301, + "grad_norm": 1.3970060348510742, + "learning_rate": 3.605733246108783e-05, + "loss": 0.5961, + "step": 94630 + }, + { + "epoch": 0.8366484555950423, + "grad_norm": 5.421457290649414, + "learning_rate": 3.605585907341597e-05, + "loss": 0.6474, + "step": 94640 + }, + { + "epoch": 0.8367368588553545, + "grad_norm": 4.138095378875732, + "learning_rate": 3.605438568574409e-05, + "loss": 0.6886, + "step": 94650 + }, + { + "epoch": 0.8368252621156669, + "grad_norm": 3.2270023822784424, + "learning_rate": 3.6052912298072223e-05, + "loss": 0.7935, + "step": 94660 + }, + { + "epoch": 0.8369136653759791, + "grad_norm": 3.0651822090148926, + "learning_rate": 3.6051438910400345e-05, + "loss": 0.6458, + "step": 94670 + }, + { + "epoch": 0.8370020686362913, + "grad_norm": 1.2351793050765991, + "learning_rate": 3.604996552272848e-05, + "loss": 0.5934, + "step": 94680 + }, + { + "epoch": 0.8370904718966036, + "grad_norm": 2.908625602722168, + "learning_rate": 3.604849213505661e-05, + "loss": 0.6298, + "step": 94690 + }, + { + "epoch": 0.8371788751569158, + "grad_norm": 2.9542267322540283, + "learning_rate": 3.604701874738474e-05, + "loss": 0.6277, + "step": 94700 + }, + { + "epoch": 0.837267278417228, + "grad_norm": 0.9569472670555115, + "learning_rate": 3.6045545359712865e-05, + "loss": 0.6418, + "step": 94710 + }, + { + "epoch": 0.8373556816775403, + "grad_norm": 2.43017578125, + "learning_rate": 3.6044071972041e-05, + "loss": 0.7019, + "step": 94720 + }, + { + "epoch": 0.8374440849378525, + "grad_norm": 3.396594524383545, + "learning_rate": 3.604259858436912e-05, + "loss": 0.5352, + "step": 94730 + }, + { + "epoch": 0.8375324881981647, + "grad_norm": 5.04083251953125, + "learning_rate": 3.604112519669726e-05, + "loss": 0.618, + "step": 94740 + }, + { + "epoch": 0.837620891458477, + "grad_norm": 8.53906536102295, + "learning_rate": 3.6039651809025385e-05, + "loss": 0.7045, + "step": 94750 + }, + { + "epoch": 0.8377092947187892, + "grad_norm": 1.6326218843460083, + "learning_rate": 3.6038178421353514e-05, + "loss": 0.5973, + "step": 94760 + }, + { + "epoch": 0.8377976979791014, + "grad_norm": 3.714289665222168, + "learning_rate": 3.603670503368164e-05, + "loss": 0.697, + "step": 94770 + }, + { + "epoch": 0.8378861012394138, + "grad_norm": 8.500152587890625, + "learning_rate": 3.603523164600977e-05, + "loss": 0.628, + "step": 94780 + }, + { + "epoch": 0.837974504499726, + "grad_norm": 4.840029239654541, + "learning_rate": 3.60337582583379e-05, + "loss": 0.7251, + "step": 94790 + }, + { + "epoch": 0.8380629077600382, + "grad_norm": 2.8530380725860596, + "learning_rate": 3.6032284870666034e-05, + "loss": 0.7012, + "step": 94800 + }, + { + "epoch": 0.8381513110203505, + "grad_norm": 23.596668243408203, + "learning_rate": 3.603081148299416e-05, + "loss": 0.6092, + "step": 94810 + }, + { + "epoch": 0.8382397142806627, + "grad_norm": 15.336099624633789, + "learning_rate": 3.602933809532229e-05, + "loss": 0.8211, + "step": 94820 + }, + { + "epoch": 0.8383281175409749, + "grad_norm": 2.4705581665039062, + "learning_rate": 3.602786470765042e-05, + "loss": 0.6362, + "step": 94830 + }, + { + "epoch": 0.8384165208012871, + "grad_norm": 8.284422874450684, + "learning_rate": 3.602639131997855e-05, + "loss": 0.636, + "step": 94840 + }, + { + "epoch": 0.8385049240615994, + "grad_norm": 1.1233346462249756, + "learning_rate": 3.6024917932306676e-05, + "loss": 0.7047, + "step": 94850 + }, + { + "epoch": 0.8385933273219116, + "grad_norm": 2.7768990993499756, + "learning_rate": 3.602344454463481e-05, + "loss": 0.5937, + "step": 94860 + }, + { + "epoch": 0.8386817305822238, + "grad_norm": 4.231262683868408, + "learning_rate": 3.602197115696294e-05, + "loss": 0.7304, + "step": 94870 + }, + { + "epoch": 0.8387701338425361, + "grad_norm": 3.647117853164673, + "learning_rate": 3.602049776929107e-05, + "loss": 0.7706, + "step": 94880 + }, + { + "epoch": 0.8388585371028483, + "grad_norm": 3.821812152862549, + "learning_rate": 3.6019024381619196e-05, + "loss": 0.61, + "step": 94890 + }, + { + "epoch": 0.8389469403631606, + "grad_norm": 3.63480806350708, + "learning_rate": 3.6017550993947324e-05, + "loss": 0.7079, + "step": 94900 + }, + { + "epoch": 0.8390353436234729, + "grad_norm": 3.7317652702331543, + "learning_rate": 3.601607760627545e-05, + "loss": 0.7602, + "step": 94910 + }, + { + "epoch": 0.8391237468837851, + "grad_norm": 7.364920139312744, + "learning_rate": 3.601460421860358e-05, + "loss": 0.7488, + "step": 94920 + }, + { + "epoch": 0.8392121501440973, + "grad_norm": 23.182640075683594, + "learning_rate": 3.6013130830931716e-05, + "loss": 0.6611, + "step": 94930 + }, + { + "epoch": 0.8393005534044096, + "grad_norm": 4.278759479522705, + "learning_rate": 3.6011657443259844e-05, + "loss": 0.6502, + "step": 94940 + }, + { + "epoch": 0.8393889566647218, + "grad_norm": 1.0418457984924316, + "learning_rate": 3.601018405558797e-05, + "loss": 0.6761, + "step": 94950 + }, + { + "epoch": 0.839477359925034, + "grad_norm": 2.0343780517578125, + "learning_rate": 3.60087106679161e-05, + "loss": 0.5939, + "step": 94960 + }, + { + "epoch": 0.8395657631853463, + "grad_norm": 1.6717872619628906, + "learning_rate": 3.600723728024423e-05, + "loss": 0.6276, + "step": 94970 + }, + { + "epoch": 0.8396541664456585, + "grad_norm": 9.336103439331055, + "learning_rate": 3.600576389257236e-05, + "loss": 0.6517, + "step": 94980 + }, + { + "epoch": 0.8397425697059707, + "grad_norm": 2.040410041809082, + "learning_rate": 3.600429050490049e-05, + "loss": 0.5953, + "step": 94990 + }, + { + "epoch": 0.839830972966283, + "grad_norm": 3.3423497676849365, + "learning_rate": 3.600281711722862e-05, + "loss": 0.683, + "step": 95000 + }, + { + "epoch": 0.8399193762265952, + "grad_norm": 1.4555820226669312, + "learning_rate": 3.600134372955675e-05, + "loss": 0.5977, + "step": 95010 + }, + { + "epoch": 0.8400077794869075, + "grad_norm": 5.59188175201416, + "learning_rate": 3.599987034188488e-05, + "loss": 0.7248, + "step": 95020 + }, + { + "epoch": 0.8400961827472198, + "grad_norm": 3.424687147140503, + "learning_rate": 3.5998396954213006e-05, + "loss": 0.5415, + "step": 95030 + }, + { + "epoch": 0.840184586007532, + "grad_norm": 11.723047256469727, + "learning_rate": 3.5996923566541135e-05, + "loss": 0.5556, + "step": 95040 + }, + { + "epoch": 0.8402729892678442, + "grad_norm": 1.008385419845581, + "learning_rate": 3.599545017886927e-05, + "loss": 0.546, + "step": 95050 + }, + { + "epoch": 0.8403613925281564, + "grad_norm": 10.497899055480957, + "learning_rate": 3.599397679119739e-05, + "loss": 0.8233, + "step": 95060 + }, + { + "epoch": 0.8404497957884687, + "grad_norm": 5.900903224945068, + "learning_rate": 3.599250340352553e-05, + "loss": 0.6416, + "step": 95070 + }, + { + "epoch": 0.8405381990487809, + "grad_norm": 1.6267361640930176, + "learning_rate": 3.5991030015853655e-05, + "loss": 0.5124, + "step": 95080 + }, + { + "epoch": 0.8406266023090931, + "grad_norm": 1.2869590520858765, + "learning_rate": 3.598955662818178e-05, + "loss": 0.5858, + "step": 95090 + }, + { + "epoch": 0.8407150055694054, + "grad_norm": 1.0540703535079956, + "learning_rate": 3.598808324050991e-05, + "loss": 0.6063, + "step": 95100 + }, + { + "epoch": 0.8408034088297176, + "grad_norm": 8.46433162689209, + "learning_rate": 3.598660985283805e-05, + "loss": 0.6762, + "step": 95110 + }, + { + "epoch": 0.8408918120900298, + "grad_norm": 2.2610857486724854, + "learning_rate": 3.598513646516617e-05, + "loss": 0.6093, + "step": 95120 + }, + { + "epoch": 0.8409802153503422, + "grad_norm": 4.000816822052002, + "learning_rate": 3.5983663077494304e-05, + "loss": 0.5154, + "step": 95130 + }, + { + "epoch": 0.8410686186106544, + "grad_norm": 3.9335789680480957, + "learning_rate": 3.5982189689822425e-05, + "loss": 0.6145, + "step": 95140 + }, + { + "epoch": 0.8411570218709666, + "grad_norm": 2.7194511890411377, + "learning_rate": 3.598071630215056e-05, + "loss": 0.5529, + "step": 95150 + }, + { + "epoch": 0.8412454251312789, + "grad_norm": 13.901379585266113, + "learning_rate": 3.597924291447869e-05, + "loss": 0.6166, + "step": 95160 + }, + { + "epoch": 0.8413338283915911, + "grad_norm": 6.461185932159424, + "learning_rate": 3.597776952680682e-05, + "loss": 0.678, + "step": 95170 + }, + { + "epoch": 0.8414222316519033, + "grad_norm": 5.392058372497559, + "learning_rate": 3.5976296139134945e-05, + "loss": 0.66, + "step": 95180 + }, + { + "epoch": 0.8415106349122156, + "grad_norm": 4.977040767669678, + "learning_rate": 3.597482275146308e-05, + "loss": 0.62, + "step": 95190 + }, + { + "epoch": 0.8415990381725278, + "grad_norm": 3.355560302734375, + "learning_rate": 3.59733493637912e-05, + "loss": 0.6605, + "step": 95200 + }, + { + "epoch": 0.84168744143284, + "grad_norm": 2.369241714477539, + "learning_rate": 3.597187597611934e-05, + "loss": 0.7076, + "step": 95210 + }, + { + "epoch": 0.8417758446931523, + "grad_norm": 10.198139190673828, + "learning_rate": 3.5970402588447466e-05, + "loss": 0.6616, + "step": 95220 + }, + { + "epoch": 0.8418642479534645, + "grad_norm": 5.111310005187988, + "learning_rate": 3.5968929200775594e-05, + "loss": 0.6097, + "step": 95230 + }, + { + "epoch": 0.8419526512137767, + "grad_norm": 3.880850076675415, + "learning_rate": 3.596745581310372e-05, + "loss": 0.6163, + "step": 95240 + }, + { + "epoch": 0.8420410544740891, + "grad_norm": 2.7748265266418457, + "learning_rate": 3.596598242543185e-05, + "loss": 0.7044, + "step": 95250 + }, + { + "epoch": 0.8421294577344013, + "grad_norm": 1.8967853784561157, + "learning_rate": 3.596450903775998e-05, + "loss": 0.6364, + "step": 95260 + }, + { + "epoch": 0.8422178609947135, + "grad_norm": 2.3596765995025635, + "learning_rate": 3.5963035650088114e-05, + "loss": 0.6644, + "step": 95270 + }, + { + "epoch": 0.8423062642550257, + "grad_norm": 1.3654931783676147, + "learning_rate": 3.5961562262416236e-05, + "loss": 0.5585, + "step": 95280 + }, + { + "epoch": 0.842394667515338, + "grad_norm": 3.444798231124878, + "learning_rate": 3.596008887474437e-05, + "loss": 0.639, + "step": 95290 + }, + { + "epoch": 0.8424830707756502, + "grad_norm": 2.277906656265259, + "learning_rate": 3.59586154870725e-05, + "loss": 0.609, + "step": 95300 + }, + { + "epoch": 0.8425714740359624, + "grad_norm": 7.962225437164307, + "learning_rate": 3.595714209940063e-05, + "loss": 0.5751, + "step": 95310 + }, + { + "epoch": 0.8426598772962747, + "grad_norm": 9.873428344726562, + "learning_rate": 3.5955668711728756e-05, + "loss": 0.7628, + "step": 95320 + }, + { + "epoch": 0.8427482805565869, + "grad_norm": 6.623358249664307, + "learning_rate": 3.595419532405689e-05, + "loss": 0.6902, + "step": 95330 + }, + { + "epoch": 0.8428366838168991, + "grad_norm": 0.8233844637870789, + "learning_rate": 3.595272193638501e-05, + "loss": 0.5426, + "step": 95340 + }, + { + "epoch": 0.8429250870772114, + "grad_norm": 5.654267311096191, + "learning_rate": 3.595124854871315e-05, + "loss": 0.795, + "step": 95350 + }, + { + "epoch": 0.8430134903375236, + "grad_norm": 4.271440029144287, + "learning_rate": 3.5949775161041276e-05, + "loss": 0.6115, + "step": 95360 + }, + { + "epoch": 0.8431018935978359, + "grad_norm": 3.573516607284546, + "learning_rate": 3.5948301773369404e-05, + "loss": 0.6523, + "step": 95370 + }, + { + "epoch": 0.8431902968581482, + "grad_norm": 2.437706470489502, + "learning_rate": 3.594682838569753e-05, + "loss": 0.7475, + "step": 95380 + }, + { + "epoch": 0.8432787001184604, + "grad_norm": 2.68989634513855, + "learning_rate": 3.594535499802566e-05, + "loss": 0.6564, + "step": 95390 + }, + { + "epoch": 0.8433671033787726, + "grad_norm": 3.355306386947632, + "learning_rate": 3.594388161035379e-05, + "loss": 0.7606, + "step": 95400 + }, + { + "epoch": 0.8434555066390849, + "grad_norm": 2.803866386413574, + "learning_rate": 3.5942408222681925e-05, + "loss": 0.5561, + "step": 95410 + }, + { + "epoch": 0.8435439098993971, + "grad_norm": 11.35826301574707, + "learning_rate": 3.5940934835010046e-05, + "loss": 0.6679, + "step": 95420 + }, + { + "epoch": 0.8436323131597093, + "grad_norm": 2.249281167984009, + "learning_rate": 3.593946144733818e-05, + "loss": 0.5811, + "step": 95430 + }, + { + "epoch": 0.8437207164200216, + "grad_norm": 3.558587074279785, + "learning_rate": 3.593798805966631e-05, + "loss": 0.6888, + "step": 95440 + }, + { + "epoch": 0.8438091196803338, + "grad_norm": 5.050012111663818, + "learning_rate": 3.593651467199444e-05, + "loss": 0.6763, + "step": 95450 + }, + { + "epoch": 0.843897522940646, + "grad_norm": 2.9509243965148926, + "learning_rate": 3.5935041284322566e-05, + "loss": 0.5862, + "step": 95460 + }, + { + "epoch": 0.8439859262009582, + "grad_norm": 8.164398193359375, + "learning_rate": 3.59335678966507e-05, + "loss": 0.5383, + "step": 95470 + }, + { + "epoch": 0.8440743294612705, + "grad_norm": 4.784492492675781, + "learning_rate": 3.593209450897882e-05, + "loss": 0.6068, + "step": 95480 + }, + { + "epoch": 0.8441627327215828, + "grad_norm": 1.5843349695205688, + "learning_rate": 3.593062112130696e-05, + "loss": 0.6065, + "step": 95490 + }, + { + "epoch": 0.844251135981895, + "grad_norm": 1.4645557403564453, + "learning_rate": 3.592914773363508e-05, + "loss": 0.5672, + "step": 95500 + }, + { + "epoch": 0.8443395392422073, + "grad_norm": 2.9640069007873535, + "learning_rate": 3.5927674345963215e-05, + "loss": 0.6791, + "step": 95510 + }, + { + "epoch": 0.8444279425025195, + "grad_norm": 5.711122989654541, + "learning_rate": 3.592620095829134e-05, + "loss": 0.5923, + "step": 95520 + }, + { + "epoch": 0.8445163457628317, + "grad_norm": 1.127500295639038, + "learning_rate": 3.592472757061947e-05, + "loss": 0.6712, + "step": 95530 + }, + { + "epoch": 0.844604749023144, + "grad_norm": 8.827896118164062, + "learning_rate": 3.59232541829476e-05, + "loss": 0.6406, + "step": 95540 + }, + { + "epoch": 0.8446931522834562, + "grad_norm": 4.571467399597168, + "learning_rate": 3.5921780795275735e-05, + "loss": 0.7155, + "step": 95550 + }, + { + "epoch": 0.8447815555437684, + "grad_norm": 2.5723965167999268, + "learning_rate": 3.592030740760386e-05, + "loss": 0.6263, + "step": 95560 + }, + { + "epoch": 0.8448699588040807, + "grad_norm": 5.320863246917725, + "learning_rate": 3.591883401993199e-05, + "loss": 0.6627, + "step": 95570 + }, + { + "epoch": 0.8449583620643929, + "grad_norm": 3.796985149383545, + "learning_rate": 3.591736063226012e-05, + "loss": 0.6255, + "step": 95580 + }, + { + "epoch": 0.8450467653247051, + "grad_norm": 6.235360622406006, + "learning_rate": 3.591588724458825e-05, + "loss": 0.7415, + "step": 95590 + }, + { + "epoch": 0.8451351685850174, + "grad_norm": 4.304016590118408, + "learning_rate": 3.591441385691638e-05, + "loss": 0.6035, + "step": 95600 + }, + { + "epoch": 0.8452235718453297, + "grad_norm": 1.7260944843292236, + "learning_rate": 3.5912940469244505e-05, + "loss": 0.7029, + "step": 95610 + }, + { + "epoch": 0.8453119751056419, + "grad_norm": 10.873573303222656, + "learning_rate": 3.5911467081572634e-05, + "loss": 0.5789, + "step": 95620 + }, + { + "epoch": 0.8454003783659542, + "grad_norm": 3.0364582538604736, + "learning_rate": 3.590999369390077e-05, + "loss": 0.6372, + "step": 95630 + }, + { + "epoch": 0.8454887816262664, + "grad_norm": 3.8201138973236084, + "learning_rate": 3.590852030622889e-05, + "loss": 0.6013, + "step": 95640 + }, + { + "epoch": 0.8455771848865786, + "grad_norm": 6.827417850494385, + "learning_rate": 3.5907046918557025e-05, + "loss": 0.6801, + "step": 95650 + }, + { + "epoch": 0.8456655881468909, + "grad_norm": 2.6591992378234863, + "learning_rate": 3.5905573530885154e-05, + "loss": 0.6349, + "step": 95660 + }, + { + "epoch": 0.8457539914072031, + "grad_norm": 7.244960308074951, + "learning_rate": 3.590410014321328e-05, + "loss": 0.6183, + "step": 95670 + }, + { + "epoch": 0.8458423946675153, + "grad_norm": 3.011967420578003, + "learning_rate": 3.590262675554141e-05, + "loss": 0.5659, + "step": 95680 + }, + { + "epoch": 0.8459307979278275, + "grad_norm": 2.5808794498443604, + "learning_rate": 3.5901153367869546e-05, + "loss": 0.7281, + "step": 95690 + }, + { + "epoch": 0.8460192011881398, + "grad_norm": 2.2106897830963135, + "learning_rate": 3.589967998019767e-05, + "loss": 0.6309, + "step": 95700 + }, + { + "epoch": 0.846107604448452, + "grad_norm": 13.006103515625, + "learning_rate": 3.58982065925258e-05, + "loss": 0.799, + "step": 95710 + }, + { + "epoch": 0.8461960077087644, + "grad_norm": 2.7790842056274414, + "learning_rate": 3.589673320485393e-05, + "loss": 0.6459, + "step": 95720 + }, + { + "epoch": 0.8462844109690766, + "grad_norm": 8.42682933807373, + "learning_rate": 3.589525981718206e-05, + "loss": 0.6586, + "step": 95730 + }, + { + "epoch": 0.8463728142293888, + "grad_norm": 3.0759105682373047, + "learning_rate": 3.589378642951019e-05, + "loss": 0.7381, + "step": 95740 + }, + { + "epoch": 0.846461217489701, + "grad_norm": 7.410990238189697, + "learning_rate": 3.5892313041838316e-05, + "loss": 0.7721, + "step": 95750 + }, + { + "epoch": 0.8465496207500133, + "grad_norm": 2.3688104152679443, + "learning_rate": 3.5890839654166444e-05, + "loss": 0.6294, + "step": 95760 + }, + { + "epoch": 0.8466380240103255, + "grad_norm": 1.406822919845581, + "learning_rate": 3.588936626649458e-05, + "loss": 0.6041, + "step": 95770 + }, + { + "epoch": 0.8467264272706377, + "grad_norm": 5.143759727478027, + "learning_rate": 3.588789287882271e-05, + "loss": 0.5975, + "step": 95780 + }, + { + "epoch": 0.84681483053095, + "grad_norm": 3.095275640487671, + "learning_rate": 3.5886419491150836e-05, + "loss": 0.5834, + "step": 95790 + }, + { + "epoch": 0.8469032337912622, + "grad_norm": 2.8376457691192627, + "learning_rate": 3.5884946103478964e-05, + "loss": 0.771, + "step": 95800 + }, + { + "epoch": 0.8469916370515744, + "grad_norm": 0.9358116388320923, + "learning_rate": 3.588347271580709e-05, + "loss": 0.5959, + "step": 95810 + }, + { + "epoch": 0.8470800403118867, + "grad_norm": 1.056120753288269, + "learning_rate": 3.588199932813522e-05, + "loss": 0.5906, + "step": 95820 + }, + { + "epoch": 0.8471684435721989, + "grad_norm": 6.234168529510498, + "learning_rate": 3.5880525940463356e-05, + "loss": 0.6878, + "step": 95830 + }, + { + "epoch": 0.8472568468325112, + "grad_norm": 4.215854167938232, + "learning_rate": 3.5879052552791484e-05, + "loss": 0.6, + "step": 95840 + }, + { + "epoch": 0.8473452500928235, + "grad_norm": 3.6773717403411865, + "learning_rate": 3.587757916511961e-05, + "loss": 0.5118, + "step": 95850 + }, + { + "epoch": 0.8474336533531357, + "grad_norm": 2.5729904174804688, + "learning_rate": 3.587610577744774e-05, + "loss": 0.6244, + "step": 95860 + }, + { + "epoch": 0.8475220566134479, + "grad_norm": 6.450924396514893, + "learning_rate": 3.587463238977587e-05, + "loss": 0.7929, + "step": 95870 + }, + { + "epoch": 0.8476104598737602, + "grad_norm": 3.9789950847625732, + "learning_rate": 3.5873159002104e-05, + "loss": 0.8084, + "step": 95880 + }, + { + "epoch": 0.8476988631340724, + "grad_norm": 1.5721575021743774, + "learning_rate": 3.5871685614432126e-05, + "loss": 0.5535, + "step": 95890 + }, + { + "epoch": 0.8477872663943846, + "grad_norm": 1.7342270612716675, + "learning_rate": 3.587021222676026e-05, + "loss": 0.5874, + "step": 95900 + }, + { + "epoch": 0.8478756696546969, + "grad_norm": 1.1704521179199219, + "learning_rate": 3.586873883908839e-05, + "loss": 0.6115, + "step": 95910 + }, + { + "epoch": 0.8479640729150091, + "grad_norm": 3.590111255645752, + "learning_rate": 3.586726545141652e-05, + "loss": 0.6238, + "step": 95920 + }, + { + "epoch": 0.8480524761753213, + "grad_norm": 1.3525285720825195, + "learning_rate": 3.5865792063744646e-05, + "loss": 0.6814, + "step": 95930 + }, + { + "epoch": 0.8481408794356335, + "grad_norm": 5.288671016693115, + "learning_rate": 3.5864318676072775e-05, + "loss": 0.6686, + "step": 95940 + }, + { + "epoch": 0.8482292826959458, + "grad_norm": 6.754096508026123, + "learning_rate": 3.58628452884009e-05, + "loss": 0.6281, + "step": 95950 + }, + { + "epoch": 0.8483176859562581, + "grad_norm": 3.5103559494018555, + "learning_rate": 3.586137190072904e-05, + "loss": 0.6366, + "step": 95960 + }, + { + "epoch": 0.8484060892165703, + "grad_norm": 8.585774421691895, + "learning_rate": 3.585989851305716e-05, + "loss": 0.684, + "step": 95970 + }, + { + "epoch": 0.8484944924768826, + "grad_norm": 7.428455829620361, + "learning_rate": 3.5858425125385295e-05, + "loss": 0.5793, + "step": 95980 + }, + { + "epoch": 0.8485828957371948, + "grad_norm": 7.0970916748046875, + "learning_rate": 3.585695173771342e-05, + "loss": 0.5722, + "step": 95990 + }, + { + "epoch": 0.848671298997507, + "grad_norm": 6.8665337562561035, + "learning_rate": 3.585547835004155e-05, + "loss": 0.527, + "step": 96000 + }, + { + "epoch": 0.8487597022578193, + "grad_norm": 1.715965986251831, + "learning_rate": 3.585400496236968e-05, + "loss": 0.6227, + "step": 96010 + }, + { + "epoch": 0.8488481055181315, + "grad_norm": 2.6507320404052734, + "learning_rate": 3.5852531574697815e-05, + "loss": 0.7047, + "step": 96020 + }, + { + "epoch": 0.8489365087784437, + "grad_norm": 7.443882942199707, + "learning_rate": 3.585105818702594e-05, + "loss": 0.5848, + "step": 96030 + }, + { + "epoch": 0.849024912038756, + "grad_norm": 20.649703979492188, + "learning_rate": 3.584958479935407e-05, + "loss": 0.7186, + "step": 96040 + }, + { + "epoch": 0.8491133152990682, + "grad_norm": 4.864152908325195, + "learning_rate": 3.58481114116822e-05, + "loss": 0.6077, + "step": 96050 + }, + { + "epoch": 0.8492017185593804, + "grad_norm": 4.269893646240234, + "learning_rate": 3.584663802401033e-05, + "loss": 0.6816, + "step": 96060 + }, + { + "epoch": 0.8492901218196927, + "grad_norm": 2.7612497806549072, + "learning_rate": 3.584516463633846e-05, + "loss": 0.6619, + "step": 96070 + }, + { + "epoch": 0.849378525080005, + "grad_norm": 3.4402577877044678, + "learning_rate": 3.5843691248666585e-05, + "loss": 0.6352, + "step": 96080 + }, + { + "epoch": 0.8494669283403172, + "grad_norm": 2.4338760375976562, + "learning_rate": 3.5842217860994714e-05, + "loss": 0.7769, + "step": 96090 + }, + { + "epoch": 0.8495553316006295, + "grad_norm": 2.6479580402374268, + "learning_rate": 3.584074447332285e-05, + "loss": 0.6805, + "step": 96100 + }, + { + "epoch": 0.8496437348609417, + "grad_norm": 1.8810317516326904, + "learning_rate": 3.583927108565097e-05, + "loss": 0.5493, + "step": 96110 + }, + { + "epoch": 0.8497321381212539, + "grad_norm": 2.2300667762756348, + "learning_rate": 3.5837797697979105e-05, + "loss": 0.6758, + "step": 96120 + }, + { + "epoch": 0.8498205413815662, + "grad_norm": 1.3616703748703003, + "learning_rate": 3.5836324310307234e-05, + "loss": 0.7226, + "step": 96130 + }, + { + "epoch": 0.8499089446418784, + "grad_norm": 2.228010892868042, + "learning_rate": 3.583485092263536e-05, + "loss": 0.6333, + "step": 96140 + }, + { + "epoch": 0.8499973479021906, + "grad_norm": 1.386564016342163, + "learning_rate": 3.583337753496349e-05, + "loss": 0.6214, + "step": 96150 + }, + { + "epoch": 0.8500857511625028, + "grad_norm": 3.9845974445343018, + "learning_rate": 3.5831904147291626e-05, + "loss": 0.7496, + "step": 96160 + }, + { + "epoch": 0.8501741544228151, + "grad_norm": 2.2014803886413574, + "learning_rate": 3.583043075961975e-05, + "loss": 0.7283, + "step": 96170 + }, + { + "epoch": 0.8502625576831273, + "grad_norm": 1.7963542938232422, + "learning_rate": 3.582895737194788e-05, + "loss": 0.5877, + "step": 96180 + }, + { + "epoch": 0.8503509609434396, + "grad_norm": 1.5569453239440918, + "learning_rate": 3.5827483984276004e-05, + "loss": 0.5518, + "step": 96190 + }, + { + "epoch": 0.8504393642037519, + "grad_norm": 2.659372329711914, + "learning_rate": 3.582601059660414e-05, + "loss": 0.6012, + "step": 96200 + }, + { + "epoch": 0.8505277674640641, + "grad_norm": 1.3436371088027954, + "learning_rate": 3.582453720893227e-05, + "loss": 0.6314, + "step": 96210 + }, + { + "epoch": 0.8506161707243763, + "grad_norm": 1.939951777458191, + "learning_rate": 3.5823063821260396e-05, + "loss": 0.7213, + "step": 96220 + }, + { + "epoch": 0.8507045739846886, + "grad_norm": 1.5260087251663208, + "learning_rate": 3.5821590433588524e-05, + "loss": 0.7606, + "step": 96230 + }, + { + "epoch": 0.8507929772450008, + "grad_norm": 3.363067865371704, + "learning_rate": 3.582011704591666e-05, + "loss": 0.6936, + "step": 96240 + }, + { + "epoch": 0.850881380505313, + "grad_norm": 2.567561149597168, + "learning_rate": 3.581864365824478e-05, + "loss": 0.7035, + "step": 96250 + }, + { + "epoch": 0.8509697837656253, + "grad_norm": 2.520526170730591, + "learning_rate": 3.5817170270572916e-05, + "loss": 0.6729, + "step": 96260 + }, + { + "epoch": 0.8510581870259375, + "grad_norm": 16.309490203857422, + "learning_rate": 3.5815696882901044e-05, + "loss": 0.6256, + "step": 96270 + }, + { + "epoch": 0.8511465902862497, + "grad_norm": 2.2490227222442627, + "learning_rate": 3.581422349522917e-05, + "loss": 0.4601, + "step": 96280 + }, + { + "epoch": 0.851234993546562, + "grad_norm": 2.0835492610931396, + "learning_rate": 3.58127501075573e-05, + "loss": 0.5824, + "step": 96290 + }, + { + "epoch": 0.8513233968068742, + "grad_norm": 1.6873316764831543, + "learning_rate": 3.5811276719885436e-05, + "loss": 0.6356, + "step": 96300 + }, + { + "epoch": 0.8514118000671865, + "grad_norm": 2.128718614578247, + "learning_rate": 3.580980333221356e-05, + "loss": 0.601, + "step": 96310 + }, + { + "epoch": 0.8515002033274988, + "grad_norm": 2.577293872833252, + "learning_rate": 3.580832994454169e-05, + "loss": 0.5732, + "step": 96320 + }, + { + "epoch": 0.851588606587811, + "grad_norm": 2.8871302604675293, + "learning_rate": 3.5806856556869814e-05, + "loss": 0.6314, + "step": 96330 + }, + { + "epoch": 0.8516770098481232, + "grad_norm": 1.9625697135925293, + "learning_rate": 3.580538316919795e-05, + "loss": 0.6238, + "step": 96340 + }, + { + "epoch": 0.8517654131084355, + "grad_norm": 3.150153636932373, + "learning_rate": 3.580390978152608e-05, + "loss": 0.5864, + "step": 96350 + }, + { + "epoch": 0.8518538163687477, + "grad_norm": 3.8638978004455566, + "learning_rate": 3.5802436393854206e-05, + "loss": 0.7103, + "step": 96360 + }, + { + "epoch": 0.8519422196290599, + "grad_norm": 3.1687910556793213, + "learning_rate": 3.5800963006182335e-05, + "loss": 0.713, + "step": 96370 + }, + { + "epoch": 0.8520306228893721, + "grad_norm": 2.8374810218811035, + "learning_rate": 3.579948961851047e-05, + "loss": 0.5735, + "step": 96380 + }, + { + "epoch": 0.8521190261496844, + "grad_norm": 1.7191070318222046, + "learning_rate": 3.579801623083859e-05, + "loss": 0.7229, + "step": 96390 + }, + { + "epoch": 0.8522074294099966, + "grad_norm": 3.8795509338378906, + "learning_rate": 3.5796542843166726e-05, + "loss": 0.7042, + "step": 96400 + }, + { + "epoch": 0.8522958326703088, + "grad_norm": 4.817535877227783, + "learning_rate": 3.5795069455494855e-05, + "loss": 0.67, + "step": 96410 + }, + { + "epoch": 0.8523842359306211, + "grad_norm": 4.528602123260498, + "learning_rate": 3.579359606782298e-05, + "loss": 0.681, + "step": 96420 + }, + { + "epoch": 0.8524726391909334, + "grad_norm": 10.816837310791016, + "learning_rate": 3.579212268015111e-05, + "loss": 0.676, + "step": 96430 + }, + { + "epoch": 0.8525610424512456, + "grad_norm": 1.0233919620513916, + "learning_rate": 3.579064929247924e-05, + "loss": 0.5836, + "step": 96440 + }, + { + "epoch": 0.8526494457115579, + "grad_norm": 4.109862327575684, + "learning_rate": 3.578917590480737e-05, + "loss": 0.6727, + "step": 96450 + }, + { + "epoch": 0.8527378489718701, + "grad_norm": 3.049077033996582, + "learning_rate": 3.57877025171355e-05, + "loss": 0.5985, + "step": 96460 + }, + { + "epoch": 0.8528262522321823, + "grad_norm": 2.6511070728302, + "learning_rate": 3.5786229129463625e-05, + "loss": 0.6227, + "step": 96470 + }, + { + "epoch": 0.8529146554924946, + "grad_norm": 2.297149658203125, + "learning_rate": 3.578475574179176e-05, + "loss": 0.6207, + "step": 96480 + }, + { + "epoch": 0.8530030587528068, + "grad_norm": 7.095157623291016, + "learning_rate": 3.578328235411989e-05, + "loss": 0.6466, + "step": 96490 + }, + { + "epoch": 0.853091462013119, + "grad_norm": 16.811241149902344, + "learning_rate": 3.578180896644802e-05, + "loss": 0.6613, + "step": 96500 + }, + { + "epoch": 0.8531798652734313, + "grad_norm": 1.6061618328094482, + "learning_rate": 3.5780335578776145e-05, + "loss": 0.637, + "step": 96510 + }, + { + "epoch": 0.8532682685337435, + "grad_norm": 1.3366936445236206, + "learning_rate": 3.577886219110428e-05, + "loss": 0.7036, + "step": 96520 + }, + { + "epoch": 0.8533566717940557, + "grad_norm": 1.4037444591522217, + "learning_rate": 3.57773888034324e-05, + "loss": 0.6635, + "step": 96530 + }, + { + "epoch": 0.853445075054368, + "grad_norm": 3.01778507232666, + "learning_rate": 3.577591541576054e-05, + "loss": 0.747, + "step": 96540 + }, + { + "epoch": 0.8535334783146803, + "grad_norm": 4.420552730560303, + "learning_rate": 3.577444202808866e-05, + "loss": 0.7034, + "step": 96550 + }, + { + "epoch": 0.8536218815749925, + "grad_norm": 7.216592788696289, + "learning_rate": 3.5772968640416794e-05, + "loss": 0.5377, + "step": 96560 + }, + { + "epoch": 0.8537102848353048, + "grad_norm": 1.2200016975402832, + "learning_rate": 3.577149525274492e-05, + "loss": 0.7169, + "step": 96570 + }, + { + "epoch": 0.853798688095617, + "grad_norm": 2.5889317989349365, + "learning_rate": 3.577002186507305e-05, + "loss": 0.726, + "step": 96580 + }, + { + "epoch": 0.8538870913559292, + "grad_norm": 3.7994658946990967, + "learning_rate": 3.576854847740118e-05, + "loss": 0.6724, + "step": 96590 + }, + { + "epoch": 0.8539754946162414, + "grad_norm": 3.08121919631958, + "learning_rate": 3.5767075089729314e-05, + "loss": 0.5426, + "step": 96600 + }, + { + "epoch": 0.8540638978765537, + "grad_norm": 4.825959205627441, + "learning_rate": 3.5765601702057435e-05, + "loss": 0.6906, + "step": 96610 + }, + { + "epoch": 0.8541523011368659, + "grad_norm": 3.4976179599761963, + "learning_rate": 3.576412831438557e-05, + "loss": 0.6838, + "step": 96620 + }, + { + "epoch": 0.8542407043971781, + "grad_norm": 9.864885330200195, + "learning_rate": 3.57626549267137e-05, + "loss": 0.6538, + "step": 96630 + }, + { + "epoch": 0.8543291076574904, + "grad_norm": 2.7881863117218018, + "learning_rate": 3.576118153904183e-05, + "loss": 0.6598, + "step": 96640 + }, + { + "epoch": 0.8544175109178026, + "grad_norm": 2.6809630393981934, + "learning_rate": 3.5759708151369956e-05, + "loss": 0.6609, + "step": 96650 + }, + { + "epoch": 0.8545059141781148, + "grad_norm": 6.943284511566162, + "learning_rate": 3.575823476369809e-05, + "loss": 0.6044, + "step": 96660 + }, + { + "epoch": 0.8545943174384272, + "grad_norm": 2.844498872756958, + "learning_rate": 3.575676137602621e-05, + "loss": 0.65, + "step": 96670 + }, + { + "epoch": 0.8546827206987394, + "grad_norm": 1.6513187885284424, + "learning_rate": 3.575528798835435e-05, + "loss": 0.6576, + "step": 96680 + }, + { + "epoch": 0.8547711239590516, + "grad_norm": 1.2408981323242188, + "learning_rate": 3.5753814600682476e-05, + "loss": 0.6423, + "step": 96690 + }, + { + "epoch": 0.8548595272193639, + "grad_norm": 3.5790164470672607, + "learning_rate": 3.5752341213010604e-05, + "loss": 0.6824, + "step": 96700 + }, + { + "epoch": 0.8549479304796761, + "grad_norm": 1.7619749307632446, + "learning_rate": 3.575086782533873e-05, + "loss": 0.6511, + "step": 96710 + }, + { + "epoch": 0.8550363337399883, + "grad_norm": 5.983351707458496, + "learning_rate": 3.574939443766686e-05, + "loss": 0.7705, + "step": 96720 + }, + { + "epoch": 0.8551247370003006, + "grad_norm": 2.7571675777435303, + "learning_rate": 3.574792104999499e-05, + "loss": 0.6756, + "step": 96730 + }, + { + "epoch": 0.8552131402606128, + "grad_norm": 2.4864935874938965, + "learning_rate": 3.5746447662323124e-05, + "loss": 0.5314, + "step": 96740 + }, + { + "epoch": 0.855301543520925, + "grad_norm": 5.254898548126221, + "learning_rate": 3.574497427465125e-05, + "loss": 0.7719, + "step": 96750 + }, + { + "epoch": 0.8553899467812373, + "grad_norm": 2.200340747833252, + "learning_rate": 3.574350088697938e-05, + "loss": 0.7374, + "step": 96760 + }, + { + "epoch": 0.8554783500415495, + "grad_norm": 7.612358093261719, + "learning_rate": 3.574202749930751e-05, + "loss": 0.8307, + "step": 96770 + }, + { + "epoch": 0.8555667533018618, + "grad_norm": 3.9470272064208984, + "learning_rate": 3.574055411163564e-05, + "loss": 0.5449, + "step": 96780 + }, + { + "epoch": 0.8556551565621741, + "grad_norm": 4.081993103027344, + "learning_rate": 3.5739080723963766e-05, + "loss": 0.6508, + "step": 96790 + }, + { + "epoch": 0.8557435598224863, + "grad_norm": 2.588196039199829, + "learning_rate": 3.5737607336291895e-05, + "loss": 0.6271, + "step": 96800 + }, + { + "epoch": 0.8558319630827985, + "grad_norm": 3.0945520401000977, + "learning_rate": 3.573613394862003e-05, + "loss": 0.5452, + "step": 96810 + }, + { + "epoch": 0.8559203663431108, + "grad_norm": 7.589072227478027, + "learning_rate": 3.573466056094816e-05, + "loss": 0.7708, + "step": 96820 + }, + { + "epoch": 0.856008769603423, + "grad_norm": 1.6648616790771484, + "learning_rate": 3.5733187173276286e-05, + "loss": 0.5345, + "step": 96830 + }, + { + "epoch": 0.8560971728637352, + "grad_norm": 3.797358989715576, + "learning_rate": 3.5731713785604415e-05, + "loss": 0.6019, + "step": 96840 + }, + { + "epoch": 0.8561855761240474, + "grad_norm": 6.206610679626465, + "learning_rate": 3.573024039793254e-05, + "loss": 0.6814, + "step": 96850 + }, + { + "epoch": 0.8562739793843597, + "grad_norm": 1.350623369216919, + "learning_rate": 3.572876701026067e-05, + "loss": 0.6797, + "step": 96860 + }, + { + "epoch": 0.8563623826446719, + "grad_norm": 4.248816013336182, + "learning_rate": 3.5727293622588807e-05, + "loss": 0.7387, + "step": 96870 + }, + { + "epoch": 0.8564507859049841, + "grad_norm": 1.847179651260376, + "learning_rate": 3.5725820234916935e-05, + "loss": 0.6964, + "step": 96880 + }, + { + "epoch": 0.8565391891652964, + "grad_norm": 2.9017820358276367, + "learning_rate": 3.572434684724506e-05, + "loss": 0.631, + "step": 96890 + }, + { + "epoch": 0.8566275924256087, + "grad_norm": 10.833443641662598, + "learning_rate": 3.572287345957319e-05, + "loss": 0.6257, + "step": 96900 + }, + { + "epoch": 0.8567159956859209, + "grad_norm": 3.561417818069458, + "learning_rate": 3.572140007190132e-05, + "loss": 0.6574, + "step": 96910 + }, + { + "epoch": 0.8568043989462332, + "grad_norm": 0.7631369829177856, + "learning_rate": 3.571992668422945e-05, + "loss": 0.6225, + "step": 96920 + }, + { + "epoch": 0.8568928022065454, + "grad_norm": 6.963668346405029, + "learning_rate": 3.5718453296557583e-05, + "loss": 0.6315, + "step": 96930 + }, + { + "epoch": 0.8569812054668576, + "grad_norm": 1.9811309576034546, + "learning_rate": 3.5716979908885705e-05, + "loss": 0.7407, + "step": 96940 + }, + { + "epoch": 0.8570696087271699, + "grad_norm": 1.2657369375228882, + "learning_rate": 3.571550652121384e-05, + "loss": 0.5438, + "step": 96950 + }, + { + "epoch": 0.8571580119874821, + "grad_norm": 3.8242335319519043, + "learning_rate": 3.571403313354197e-05, + "loss": 0.7064, + "step": 96960 + }, + { + "epoch": 0.8572464152477943, + "grad_norm": 10.111712455749512, + "learning_rate": 3.57125597458701e-05, + "loss": 0.6155, + "step": 96970 + }, + { + "epoch": 0.8573348185081066, + "grad_norm": 1.8088099956512451, + "learning_rate": 3.5711086358198225e-05, + "loss": 0.7048, + "step": 96980 + }, + { + "epoch": 0.8574232217684188, + "grad_norm": 6.075275897979736, + "learning_rate": 3.570961297052636e-05, + "loss": 0.5599, + "step": 96990 + }, + { + "epoch": 0.857511625028731, + "grad_norm": 1.9787936210632324, + "learning_rate": 3.570813958285448e-05, + "loss": 0.7863, + "step": 97000 + }, + { + "epoch": 0.8576000282890432, + "grad_norm": 6.562952518463135, + "learning_rate": 3.570666619518262e-05, + "loss": 0.8221, + "step": 97010 + }, + { + "epoch": 0.8576884315493556, + "grad_norm": 2.168668031692505, + "learning_rate": 3.570519280751074e-05, + "loss": 0.6861, + "step": 97020 + }, + { + "epoch": 0.8577768348096678, + "grad_norm": 6.50564432144165, + "learning_rate": 3.5703719419838874e-05, + "loss": 0.6213, + "step": 97030 + }, + { + "epoch": 0.85786523806998, + "grad_norm": 1.0779777765274048, + "learning_rate": 3.5702246032167e-05, + "loss": 0.5123, + "step": 97040 + }, + { + "epoch": 0.8579536413302923, + "grad_norm": 6.688148498535156, + "learning_rate": 3.570077264449513e-05, + "loss": 0.6077, + "step": 97050 + }, + { + "epoch": 0.8580420445906045, + "grad_norm": 1.3460952043533325, + "learning_rate": 3.569929925682326e-05, + "loss": 0.6679, + "step": 97060 + }, + { + "epoch": 0.8581304478509167, + "grad_norm": 5.407824993133545, + "learning_rate": 3.5697825869151394e-05, + "loss": 0.6355, + "step": 97070 + }, + { + "epoch": 0.858218851111229, + "grad_norm": 8.066926002502441, + "learning_rate": 3.5696352481479516e-05, + "loss": 0.6606, + "step": 97080 + }, + { + "epoch": 0.8583072543715412, + "grad_norm": 5.549952983856201, + "learning_rate": 3.569487909380765e-05, + "loss": 0.6487, + "step": 97090 + }, + { + "epoch": 0.8583956576318534, + "grad_norm": 3.47602915763855, + "learning_rate": 3.569340570613578e-05, + "loss": 0.6682, + "step": 97100 + }, + { + "epoch": 0.8584840608921657, + "grad_norm": 2.4410276412963867, + "learning_rate": 3.569193231846391e-05, + "loss": 0.6071, + "step": 97110 + }, + { + "epoch": 0.8585724641524779, + "grad_norm": 1.9685924053192139, + "learning_rate": 3.5690458930792036e-05, + "loss": 0.7168, + "step": 97120 + }, + { + "epoch": 0.8586608674127901, + "grad_norm": 5.951869964599609, + "learning_rate": 3.568898554312017e-05, + "loss": 0.8174, + "step": 97130 + }, + { + "epoch": 0.8587492706731025, + "grad_norm": 2.647230863571167, + "learning_rate": 3.568751215544829e-05, + "loss": 0.5943, + "step": 97140 + }, + { + "epoch": 0.8588376739334147, + "grad_norm": 4.039038181304932, + "learning_rate": 3.568603876777643e-05, + "loss": 0.7293, + "step": 97150 + }, + { + "epoch": 0.8589260771937269, + "grad_norm": 3.268256664276123, + "learning_rate": 3.568456538010455e-05, + "loss": 0.7301, + "step": 97160 + }, + { + "epoch": 0.8590144804540392, + "grad_norm": 2.228330135345459, + "learning_rate": 3.5683091992432684e-05, + "loss": 0.5719, + "step": 97170 + }, + { + "epoch": 0.8591028837143514, + "grad_norm": 2.9342756271362305, + "learning_rate": 3.568161860476081e-05, + "loss": 0.5967, + "step": 97180 + }, + { + "epoch": 0.8591912869746636, + "grad_norm": 1.1431633234024048, + "learning_rate": 3.568014521708894e-05, + "loss": 0.6203, + "step": 97190 + }, + { + "epoch": 0.8592796902349759, + "grad_norm": 3.2620229721069336, + "learning_rate": 3.567867182941707e-05, + "loss": 0.6046, + "step": 97200 + }, + { + "epoch": 0.8593680934952881, + "grad_norm": 3.122626304626465, + "learning_rate": 3.5677198441745204e-05, + "loss": 0.6107, + "step": 97210 + }, + { + "epoch": 0.8594564967556003, + "grad_norm": 13.108017921447754, + "learning_rate": 3.5675725054073326e-05, + "loss": 0.5778, + "step": 97220 + }, + { + "epoch": 0.8595449000159125, + "grad_norm": 4.802103519439697, + "learning_rate": 3.567425166640146e-05, + "loss": 0.4346, + "step": 97230 + }, + { + "epoch": 0.8596333032762248, + "grad_norm": 12.014464378356934, + "learning_rate": 3.567277827872959e-05, + "loss": 0.6535, + "step": 97240 + }, + { + "epoch": 0.8597217065365371, + "grad_norm": 0.8196065425872803, + "learning_rate": 3.567130489105772e-05, + "loss": 0.7249, + "step": 97250 + }, + { + "epoch": 0.8598101097968494, + "grad_norm": 2.5248751640319824, + "learning_rate": 3.5669831503385846e-05, + "loss": 0.6495, + "step": 97260 + }, + { + "epoch": 0.8598985130571616, + "grad_norm": 2.2289059162139893, + "learning_rate": 3.5668358115713975e-05, + "loss": 0.6034, + "step": 97270 + }, + { + "epoch": 0.8599869163174738, + "grad_norm": 2.478698968887329, + "learning_rate": 3.56668847280421e-05, + "loss": 0.796, + "step": 97280 + }, + { + "epoch": 0.860075319577786, + "grad_norm": 2.4700891971588135, + "learning_rate": 3.566541134037024e-05, + "loss": 0.7364, + "step": 97290 + }, + { + "epoch": 0.8601637228380983, + "grad_norm": 1.2270628213882446, + "learning_rate": 3.566393795269836e-05, + "loss": 0.6514, + "step": 97300 + }, + { + "epoch": 0.8602521260984105, + "grad_norm": 3.766535758972168, + "learning_rate": 3.5662464565026495e-05, + "loss": 0.5957, + "step": 97310 + }, + { + "epoch": 0.8603405293587227, + "grad_norm": 9.726471900939941, + "learning_rate": 3.566099117735462e-05, + "loss": 0.7541, + "step": 97320 + }, + { + "epoch": 0.860428932619035, + "grad_norm": 3.229013681411743, + "learning_rate": 3.565951778968275e-05, + "loss": 0.4931, + "step": 97330 + }, + { + "epoch": 0.8605173358793472, + "grad_norm": 4.683764457702637, + "learning_rate": 3.565804440201088e-05, + "loss": 0.6422, + "step": 97340 + }, + { + "epoch": 0.8606057391396594, + "grad_norm": 2.1976706981658936, + "learning_rate": 3.5656571014339015e-05, + "loss": 0.6622, + "step": 97350 + }, + { + "epoch": 0.8606941423999717, + "grad_norm": 4.245981216430664, + "learning_rate": 3.5655097626667137e-05, + "loss": 0.6518, + "step": 97360 + }, + { + "epoch": 0.860782545660284, + "grad_norm": 1.7333186864852905, + "learning_rate": 3.565362423899527e-05, + "loss": 0.6501, + "step": 97370 + }, + { + "epoch": 0.8608709489205962, + "grad_norm": 1.8159234523773193, + "learning_rate": 3.565215085132339e-05, + "loss": 0.5702, + "step": 97380 + }, + { + "epoch": 0.8609593521809085, + "grad_norm": 10.311392784118652, + "learning_rate": 3.565067746365153e-05, + "loss": 0.7206, + "step": 97390 + }, + { + "epoch": 0.8610477554412207, + "grad_norm": 2.2085134983062744, + "learning_rate": 3.564920407597966e-05, + "loss": 0.7121, + "step": 97400 + }, + { + "epoch": 0.8611361587015329, + "grad_norm": 2.321945905685425, + "learning_rate": 3.5647730688307785e-05, + "loss": 0.6404, + "step": 97410 + }, + { + "epoch": 0.8612245619618452, + "grad_norm": 2.306286573410034, + "learning_rate": 3.5646257300635913e-05, + "loss": 0.6428, + "step": 97420 + }, + { + "epoch": 0.8613129652221574, + "grad_norm": 3.535773992538452, + "learning_rate": 3.564478391296405e-05, + "loss": 0.6817, + "step": 97430 + }, + { + "epoch": 0.8614013684824696, + "grad_norm": 6.402032852172852, + "learning_rate": 3.564331052529217e-05, + "loss": 0.6937, + "step": 97440 + }, + { + "epoch": 0.8614897717427819, + "grad_norm": 2.2330784797668457, + "learning_rate": 3.5641837137620305e-05, + "loss": 0.5796, + "step": 97450 + }, + { + "epoch": 0.8615781750030941, + "grad_norm": 4.886663436889648, + "learning_rate": 3.5640363749948434e-05, + "loss": 0.686, + "step": 97460 + }, + { + "epoch": 0.8616665782634063, + "grad_norm": 2.6288633346557617, + "learning_rate": 3.563889036227656e-05, + "loss": 0.6134, + "step": 97470 + }, + { + "epoch": 0.8617549815237185, + "grad_norm": 12.250592231750488, + "learning_rate": 3.563741697460469e-05, + "loss": 0.708, + "step": 97480 + }, + { + "epoch": 0.8618433847840309, + "grad_norm": 3.854862689971924, + "learning_rate": 3.563594358693282e-05, + "loss": 0.7334, + "step": 97490 + }, + { + "epoch": 0.8619317880443431, + "grad_norm": 4.415807723999023, + "learning_rate": 3.563447019926095e-05, + "loss": 0.6553, + "step": 97500 + }, + { + "epoch": 0.8620201913046553, + "grad_norm": 2.9235692024230957, + "learning_rate": 3.563299681158908e-05, + "loss": 0.6903, + "step": 97510 + }, + { + "epoch": 0.8621085945649676, + "grad_norm": 3.8028318881988525, + "learning_rate": 3.5631523423917204e-05, + "loss": 0.6343, + "step": 97520 + }, + { + "epoch": 0.8621969978252798, + "grad_norm": 6.743467330932617, + "learning_rate": 3.563005003624534e-05, + "loss": 0.635, + "step": 97530 + }, + { + "epoch": 0.862285401085592, + "grad_norm": 1.6084133386611938, + "learning_rate": 3.562857664857347e-05, + "loss": 0.7388, + "step": 97540 + }, + { + "epoch": 0.8623738043459043, + "grad_norm": 10.368175506591797, + "learning_rate": 3.5627103260901596e-05, + "loss": 0.6807, + "step": 97550 + }, + { + "epoch": 0.8624622076062165, + "grad_norm": 12.379048347473145, + "learning_rate": 3.5625629873229724e-05, + "loss": 0.5697, + "step": 97560 + }, + { + "epoch": 0.8625506108665287, + "grad_norm": 3.011819362640381, + "learning_rate": 3.562415648555786e-05, + "loss": 0.6628, + "step": 97570 + }, + { + "epoch": 0.862639014126841, + "grad_norm": 2.7928409576416016, + "learning_rate": 3.562268309788598e-05, + "loss": 0.6936, + "step": 97580 + }, + { + "epoch": 0.8627274173871532, + "grad_norm": 2.027700901031494, + "learning_rate": 3.5621209710214116e-05, + "loss": 0.8069, + "step": 97590 + }, + { + "epoch": 0.8628158206474654, + "grad_norm": 2.958068370819092, + "learning_rate": 3.5619736322542244e-05, + "loss": 0.5896, + "step": 97600 + }, + { + "epoch": 0.8629042239077778, + "grad_norm": 2.9642834663391113, + "learning_rate": 3.561826293487037e-05, + "loss": 0.6321, + "step": 97610 + }, + { + "epoch": 0.86299262716809, + "grad_norm": 1.8911761045455933, + "learning_rate": 3.56167895471985e-05, + "loss": 0.7027, + "step": 97620 + }, + { + "epoch": 0.8630810304284022, + "grad_norm": 2.3212645053863525, + "learning_rate": 3.561531615952663e-05, + "loss": 0.6298, + "step": 97630 + }, + { + "epoch": 0.8631694336887145, + "grad_norm": 5.461788654327393, + "learning_rate": 3.561384277185476e-05, + "loss": 0.5616, + "step": 97640 + }, + { + "epoch": 0.8632578369490267, + "grad_norm": 0.9621568322181702, + "learning_rate": 3.561236938418289e-05, + "loss": 0.6358, + "step": 97650 + }, + { + "epoch": 0.8633462402093389, + "grad_norm": 1.8285093307495117, + "learning_rate": 3.561089599651102e-05, + "loss": 0.6572, + "step": 97660 + }, + { + "epoch": 0.8634346434696512, + "grad_norm": 3.9671151638031006, + "learning_rate": 3.560942260883915e-05, + "loss": 0.595, + "step": 97670 + }, + { + "epoch": 0.8635230467299634, + "grad_norm": 4.02812385559082, + "learning_rate": 3.560794922116728e-05, + "loss": 0.7218, + "step": 97680 + }, + { + "epoch": 0.8636114499902756, + "grad_norm": 3.439793348312378, + "learning_rate": 3.5606475833495406e-05, + "loss": 0.5587, + "step": 97690 + }, + { + "epoch": 0.8636998532505878, + "grad_norm": 6.592374324798584, + "learning_rate": 3.5605002445823534e-05, + "loss": 0.7331, + "step": 97700 + }, + { + "epoch": 0.8637882565109001, + "grad_norm": 1.928765892982483, + "learning_rate": 3.560352905815167e-05, + "loss": 0.689, + "step": 97710 + }, + { + "epoch": 0.8638766597712123, + "grad_norm": 2.3686225414276123, + "learning_rate": 3.56020556704798e-05, + "loss": 0.823, + "step": 97720 + }, + { + "epoch": 0.8639650630315246, + "grad_norm": 2.042149066925049, + "learning_rate": 3.5600582282807926e-05, + "loss": 0.5875, + "step": 97730 + }, + { + "epoch": 0.8640534662918369, + "grad_norm": 3.339426040649414, + "learning_rate": 3.5599108895136055e-05, + "loss": 0.6725, + "step": 97740 + }, + { + "epoch": 0.8641418695521491, + "grad_norm": 1.7333118915557861, + "learning_rate": 3.559763550746418e-05, + "loss": 0.6748, + "step": 97750 + }, + { + "epoch": 0.8642302728124613, + "grad_norm": 4.163698673248291, + "learning_rate": 3.559616211979231e-05, + "loss": 0.6952, + "step": 97760 + }, + { + "epoch": 0.8643186760727736, + "grad_norm": 1.8396464586257935, + "learning_rate": 3.559468873212044e-05, + "loss": 0.6029, + "step": 97770 + }, + { + "epoch": 0.8644070793330858, + "grad_norm": 4.963450908660889, + "learning_rate": 3.5593215344448575e-05, + "loss": 0.7147, + "step": 97780 + }, + { + "epoch": 0.864495482593398, + "grad_norm": 6.767226696014404, + "learning_rate": 3.55917419567767e-05, + "loss": 0.6666, + "step": 97790 + }, + { + "epoch": 0.8645838858537103, + "grad_norm": 7.092291831970215, + "learning_rate": 3.559026856910483e-05, + "loss": 0.5999, + "step": 97800 + }, + { + "epoch": 0.8646722891140225, + "grad_norm": 0.9363610744476318, + "learning_rate": 3.558879518143296e-05, + "loss": 0.5209, + "step": 97810 + }, + { + "epoch": 0.8647606923743347, + "grad_norm": 1.8100072145462036, + "learning_rate": 3.558732179376109e-05, + "loss": 0.6536, + "step": 97820 + }, + { + "epoch": 0.864849095634647, + "grad_norm": 2.556466579437256, + "learning_rate": 3.558584840608922e-05, + "loss": 0.6147, + "step": 97830 + }, + { + "epoch": 0.8649374988949593, + "grad_norm": 2.9554662704467773, + "learning_rate": 3.558437501841735e-05, + "loss": 0.6924, + "step": 97840 + }, + { + "epoch": 0.8650259021552715, + "grad_norm": 2.7062246799468994, + "learning_rate": 3.558290163074547e-05, + "loss": 0.6442, + "step": 97850 + }, + { + "epoch": 0.8651143054155838, + "grad_norm": 1.5560581684112549, + "learning_rate": 3.558142824307361e-05, + "loss": 0.6401, + "step": 97860 + }, + { + "epoch": 0.865202708675896, + "grad_norm": 9.17304515838623, + "learning_rate": 3.557995485540174e-05, + "loss": 0.7126, + "step": 97870 + }, + { + "epoch": 0.8652911119362082, + "grad_norm": 3.902885913848877, + "learning_rate": 3.5578481467729865e-05, + "loss": 0.6815, + "step": 97880 + }, + { + "epoch": 0.8653795151965205, + "grad_norm": 7.74009895324707, + "learning_rate": 3.5577008080057994e-05, + "loss": 0.6422, + "step": 97890 + }, + { + "epoch": 0.8654679184568327, + "grad_norm": 5.859641075134277, + "learning_rate": 3.557553469238613e-05, + "loss": 0.6396, + "step": 97900 + }, + { + "epoch": 0.8655563217171449, + "grad_norm": 4.236804962158203, + "learning_rate": 3.557406130471425e-05, + "loss": 0.5992, + "step": 97910 + }, + { + "epoch": 0.8656447249774571, + "grad_norm": 6.393752574920654, + "learning_rate": 3.5572587917042385e-05, + "loss": 0.5366, + "step": 97920 + }, + { + "epoch": 0.8657331282377694, + "grad_norm": 3.365983247756958, + "learning_rate": 3.5571114529370514e-05, + "loss": 0.7184, + "step": 97930 + }, + { + "epoch": 0.8658215314980816, + "grad_norm": 3.524007797241211, + "learning_rate": 3.556964114169864e-05, + "loss": 0.8267, + "step": 97940 + }, + { + "epoch": 0.8659099347583938, + "grad_norm": 1.3623628616333008, + "learning_rate": 3.556816775402677e-05, + "loss": 0.5764, + "step": 97950 + }, + { + "epoch": 0.8659983380187062, + "grad_norm": 1.6860737800598145, + "learning_rate": 3.55666943663549e-05, + "loss": 0.6355, + "step": 97960 + }, + { + "epoch": 0.8660867412790184, + "grad_norm": 10.27686882019043, + "learning_rate": 3.556522097868303e-05, + "loss": 0.6504, + "step": 97970 + }, + { + "epoch": 0.8661751445393306, + "grad_norm": 2.0946044921875, + "learning_rate": 3.556374759101116e-05, + "loss": 0.6157, + "step": 97980 + }, + { + "epoch": 0.8662635477996429, + "grad_norm": 3.56308913230896, + "learning_rate": 3.5562274203339284e-05, + "loss": 0.5911, + "step": 97990 + }, + { + "epoch": 0.8663519510599551, + "grad_norm": 6.122189044952393, + "learning_rate": 3.556080081566742e-05, + "loss": 0.6597, + "step": 98000 + }, + { + "epoch": 0.8664403543202673, + "grad_norm": 2.2405266761779785, + "learning_rate": 3.555932742799555e-05, + "loss": 0.6425, + "step": 98010 + }, + { + "epoch": 0.8665287575805796, + "grad_norm": 4.223904132843018, + "learning_rate": 3.5557854040323676e-05, + "loss": 0.6877, + "step": 98020 + }, + { + "epoch": 0.8666171608408918, + "grad_norm": 5.597757339477539, + "learning_rate": 3.5556380652651804e-05, + "loss": 0.6597, + "step": 98030 + }, + { + "epoch": 0.866705564101204, + "grad_norm": 6.285776138305664, + "learning_rate": 3.555490726497994e-05, + "loss": 0.7034, + "step": 98040 + }, + { + "epoch": 0.8667939673615163, + "grad_norm": 3.9177229404449463, + "learning_rate": 3.555343387730806e-05, + "loss": 0.6602, + "step": 98050 + }, + { + "epoch": 0.8668823706218285, + "grad_norm": 1.1600656509399414, + "learning_rate": 3.5551960489636196e-05, + "loss": 0.8058, + "step": 98060 + }, + { + "epoch": 0.8669707738821407, + "grad_norm": 3.8851804733276367, + "learning_rate": 3.5550487101964324e-05, + "loss": 0.6659, + "step": 98070 + }, + { + "epoch": 0.8670591771424531, + "grad_norm": 8.677483558654785, + "learning_rate": 3.554901371429245e-05, + "loss": 0.6912, + "step": 98080 + }, + { + "epoch": 0.8671475804027653, + "grad_norm": 2.5365614891052246, + "learning_rate": 3.554754032662058e-05, + "loss": 0.669, + "step": 98090 + }, + { + "epoch": 0.8672359836630775, + "grad_norm": 2.035731077194214, + "learning_rate": 3.554606693894871e-05, + "loss": 0.6094, + "step": 98100 + }, + { + "epoch": 0.8673243869233898, + "grad_norm": 2.49967098236084, + "learning_rate": 3.554459355127684e-05, + "loss": 0.6024, + "step": 98110 + }, + { + "epoch": 0.867412790183702, + "grad_norm": 4.827687740325928, + "learning_rate": 3.554312016360497e-05, + "loss": 0.7174, + "step": 98120 + }, + { + "epoch": 0.8675011934440142, + "grad_norm": 1.857065200805664, + "learning_rate": 3.5541646775933094e-05, + "loss": 0.6971, + "step": 98130 + }, + { + "epoch": 0.8675895967043264, + "grad_norm": 2.1943271160125732, + "learning_rate": 3.554017338826123e-05, + "loss": 0.7467, + "step": 98140 + }, + { + "epoch": 0.8676779999646387, + "grad_norm": 7.98257303237915, + "learning_rate": 3.553870000058936e-05, + "loss": 0.6916, + "step": 98150 + }, + { + "epoch": 0.8677664032249509, + "grad_norm": 4.576429843902588, + "learning_rate": 3.5537226612917486e-05, + "loss": 0.5785, + "step": 98160 + }, + { + "epoch": 0.8678548064852631, + "grad_norm": 3.215961217880249, + "learning_rate": 3.5535753225245615e-05, + "loss": 0.6113, + "step": 98170 + }, + { + "epoch": 0.8679432097455754, + "grad_norm": 5.609352111816406, + "learning_rate": 3.553427983757375e-05, + "loss": 0.628, + "step": 98180 + }, + { + "epoch": 0.8680316130058876, + "grad_norm": 2.508380174636841, + "learning_rate": 3.553280644990187e-05, + "loss": 0.7088, + "step": 98190 + }, + { + "epoch": 0.8681200162662, + "grad_norm": 1.774983525276184, + "learning_rate": 3.5531333062230006e-05, + "loss": 0.6589, + "step": 98200 + }, + { + "epoch": 0.8682084195265122, + "grad_norm": 3.499372720718384, + "learning_rate": 3.552985967455813e-05, + "loss": 0.6775, + "step": 98210 + }, + { + "epoch": 0.8682968227868244, + "grad_norm": 6.5042724609375, + "learning_rate": 3.552838628688626e-05, + "loss": 0.7656, + "step": 98220 + }, + { + "epoch": 0.8683852260471366, + "grad_norm": 3.096292495727539, + "learning_rate": 3.552691289921439e-05, + "loss": 0.7369, + "step": 98230 + }, + { + "epoch": 0.8684736293074489, + "grad_norm": 3.5823757648468018, + "learning_rate": 3.552543951154252e-05, + "loss": 0.7006, + "step": 98240 + }, + { + "epoch": 0.8685620325677611, + "grad_norm": 1.5179250240325928, + "learning_rate": 3.552396612387065e-05, + "loss": 0.7063, + "step": 98250 + }, + { + "epoch": 0.8686504358280733, + "grad_norm": 3.8673157691955566, + "learning_rate": 3.552249273619878e-05, + "loss": 0.6496, + "step": 98260 + }, + { + "epoch": 0.8687388390883856, + "grad_norm": 2.3928096294403076, + "learning_rate": 3.5521019348526905e-05, + "loss": 0.7631, + "step": 98270 + }, + { + "epoch": 0.8688272423486978, + "grad_norm": 3.0162246227264404, + "learning_rate": 3.551954596085504e-05, + "loss": 0.5955, + "step": 98280 + }, + { + "epoch": 0.86891564560901, + "grad_norm": 11.222826957702637, + "learning_rate": 3.551807257318317e-05, + "loss": 0.6608, + "step": 98290 + }, + { + "epoch": 0.8690040488693223, + "grad_norm": 3.427701234817505, + "learning_rate": 3.55165991855113e-05, + "loss": 0.5962, + "step": 98300 + }, + { + "epoch": 0.8690924521296346, + "grad_norm": 4.837122917175293, + "learning_rate": 3.5515125797839425e-05, + "loss": 0.7147, + "step": 98310 + }, + { + "epoch": 0.8691808553899468, + "grad_norm": 20.0013370513916, + "learning_rate": 3.5513652410167553e-05, + "loss": 0.5389, + "step": 98320 + }, + { + "epoch": 0.8692692586502591, + "grad_norm": 1.3774549961090088, + "learning_rate": 3.551217902249568e-05, + "loss": 0.5439, + "step": 98330 + }, + { + "epoch": 0.8693576619105713, + "grad_norm": 4.33914852142334, + "learning_rate": 3.551070563482382e-05, + "loss": 0.6164, + "step": 98340 + }, + { + "epoch": 0.8694460651708835, + "grad_norm": 2.696580648422241, + "learning_rate": 3.550923224715194e-05, + "loss": 0.6114, + "step": 98350 + }, + { + "epoch": 0.8695344684311958, + "grad_norm": 1.8292914628982544, + "learning_rate": 3.5507758859480074e-05, + "loss": 0.7744, + "step": 98360 + }, + { + "epoch": 0.869622871691508, + "grad_norm": 7.250606536865234, + "learning_rate": 3.55062854718082e-05, + "loss": 0.5779, + "step": 98370 + }, + { + "epoch": 0.8697112749518202, + "grad_norm": 6.369245529174805, + "learning_rate": 3.550481208413633e-05, + "loss": 0.639, + "step": 98380 + }, + { + "epoch": 0.8697996782121324, + "grad_norm": 19.52463722229004, + "learning_rate": 3.550333869646446e-05, + "loss": 0.5768, + "step": 98390 + }, + { + "epoch": 0.8698880814724447, + "grad_norm": 5.448720455169678, + "learning_rate": 3.5501865308792594e-05, + "loss": 0.5667, + "step": 98400 + }, + { + "epoch": 0.8699764847327569, + "grad_norm": 5.967934608459473, + "learning_rate": 3.5500391921120715e-05, + "loss": 0.5591, + "step": 98410 + }, + { + "epoch": 0.8700648879930691, + "grad_norm": 3.537064790725708, + "learning_rate": 3.549891853344885e-05, + "loss": 0.6367, + "step": 98420 + }, + { + "epoch": 0.8701532912533815, + "grad_norm": 2.089097738265991, + "learning_rate": 3.549744514577697e-05, + "loss": 0.8827, + "step": 98430 + }, + { + "epoch": 0.8702416945136937, + "grad_norm": 3.187114953994751, + "learning_rate": 3.549597175810511e-05, + "loss": 0.6592, + "step": 98440 + }, + { + "epoch": 0.8703300977740059, + "grad_norm": 1.6178146600723267, + "learning_rate": 3.5494498370433236e-05, + "loss": 0.631, + "step": 98450 + }, + { + "epoch": 0.8704185010343182, + "grad_norm": 4.6605448722839355, + "learning_rate": 3.5493024982761364e-05, + "loss": 0.7185, + "step": 98460 + }, + { + "epoch": 0.8705069042946304, + "grad_norm": 1.473974347114563, + "learning_rate": 3.549155159508949e-05, + "loss": 0.6521, + "step": 98470 + }, + { + "epoch": 0.8705953075549426, + "grad_norm": 2.387791633605957, + "learning_rate": 3.549007820741763e-05, + "loss": 0.6081, + "step": 98480 + }, + { + "epoch": 0.8706837108152549, + "grad_norm": 7.768845081329346, + "learning_rate": 3.548860481974575e-05, + "loss": 0.6605, + "step": 98490 + }, + { + "epoch": 0.8707721140755671, + "grad_norm": 32.79899978637695, + "learning_rate": 3.5487131432073884e-05, + "loss": 0.5928, + "step": 98500 + }, + { + "epoch": 0.8708605173358793, + "grad_norm": 4.767167091369629, + "learning_rate": 3.548565804440201e-05, + "loss": 0.7422, + "step": 98510 + }, + { + "epoch": 0.8709489205961916, + "grad_norm": 2.4973831176757812, + "learning_rate": 3.548418465673014e-05, + "loss": 0.6895, + "step": 98520 + }, + { + "epoch": 0.8710373238565038, + "grad_norm": 3.401088237762451, + "learning_rate": 3.548271126905827e-05, + "loss": 0.6381, + "step": 98530 + }, + { + "epoch": 0.871125727116816, + "grad_norm": 0.8923147916793823, + "learning_rate": 3.5481237881386404e-05, + "loss": 0.6686, + "step": 98540 + }, + { + "epoch": 0.8712141303771284, + "grad_norm": 1.5152621269226074, + "learning_rate": 3.5479764493714526e-05, + "loss": 0.5468, + "step": 98550 + }, + { + "epoch": 0.8713025336374406, + "grad_norm": 4.025765419006348, + "learning_rate": 3.547829110604266e-05, + "loss": 0.6428, + "step": 98560 + }, + { + "epoch": 0.8713909368977528, + "grad_norm": 5.059818267822266, + "learning_rate": 3.547681771837079e-05, + "loss": 0.5825, + "step": 98570 + }, + { + "epoch": 0.871479340158065, + "grad_norm": 6.144792079925537, + "learning_rate": 3.547534433069892e-05, + "loss": 0.8703, + "step": 98580 + }, + { + "epoch": 0.8715677434183773, + "grad_norm": 2.613663673400879, + "learning_rate": 3.5473870943027046e-05, + "loss": 0.7718, + "step": 98590 + }, + { + "epoch": 0.8716561466786895, + "grad_norm": 2.0589616298675537, + "learning_rate": 3.5472397555355174e-05, + "loss": 0.6489, + "step": 98600 + }, + { + "epoch": 0.8717445499390017, + "grad_norm": 31.750431060791016, + "learning_rate": 3.54709241676833e-05, + "loss": 0.6047, + "step": 98610 + }, + { + "epoch": 0.871832953199314, + "grad_norm": 1.3656587600708008, + "learning_rate": 3.546945078001144e-05, + "loss": 0.6384, + "step": 98620 + }, + { + "epoch": 0.8719213564596262, + "grad_norm": 3.833303451538086, + "learning_rate": 3.5467977392339566e-05, + "loss": 0.508, + "step": 98630 + }, + { + "epoch": 0.8720097597199384, + "grad_norm": 0.9208725690841675, + "learning_rate": 3.5466504004667695e-05, + "loss": 0.6131, + "step": 98640 + }, + { + "epoch": 0.8720981629802507, + "grad_norm": 1.3009897470474243, + "learning_rate": 3.546503061699582e-05, + "loss": 0.6773, + "step": 98650 + }, + { + "epoch": 0.8721865662405629, + "grad_norm": 7.807600975036621, + "learning_rate": 3.546355722932395e-05, + "loss": 0.6227, + "step": 98660 + }, + { + "epoch": 0.8722749695008752, + "grad_norm": 1.3163272142410278, + "learning_rate": 3.546208384165208e-05, + "loss": 0.6966, + "step": 98670 + }, + { + "epoch": 0.8723633727611875, + "grad_norm": 3.8006227016448975, + "learning_rate": 3.546061045398021e-05, + "loss": 0.6701, + "step": 98680 + }, + { + "epoch": 0.8724517760214997, + "grad_norm": 1.7885210514068604, + "learning_rate": 3.545913706630834e-05, + "loss": 0.5752, + "step": 98690 + }, + { + "epoch": 0.8725401792818119, + "grad_norm": 1.1173492670059204, + "learning_rate": 3.545766367863647e-05, + "loss": 0.703, + "step": 98700 + }, + { + "epoch": 0.8726285825421242, + "grad_norm": 2.2168967723846436, + "learning_rate": 3.54561902909646e-05, + "loss": 0.6392, + "step": 98710 + }, + { + "epoch": 0.8727169858024364, + "grad_norm": 4.935361385345459, + "learning_rate": 3.545471690329273e-05, + "loss": 0.6871, + "step": 98720 + }, + { + "epoch": 0.8728053890627486, + "grad_norm": 12.358054161071777, + "learning_rate": 3.5453243515620857e-05, + "loss": 0.5707, + "step": 98730 + }, + { + "epoch": 0.8728937923230609, + "grad_norm": 1.3461123704910278, + "learning_rate": 3.5451770127948985e-05, + "loss": 0.5944, + "step": 98740 + }, + { + "epoch": 0.8729821955833731, + "grad_norm": 2.271646738052368, + "learning_rate": 3.545029674027712e-05, + "loss": 0.5627, + "step": 98750 + }, + { + "epoch": 0.8730705988436853, + "grad_norm": 2.8260445594787598, + "learning_rate": 3.544882335260525e-05, + "loss": 0.6882, + "step": 98760 + }, + { + "epoch": 0.8731590021039976, + "grad_norm": 5.59370756149292, + "learning_rate": 3.544734996493338e-05, + "loss": 0.7472, + "step": 98770 + }, + { + "epoch": 0.8732474053643098, + "grad_norm": 4.746335983276367, + "learning_rate": 3.5445876577261505e-05, + "loss": 0.6872, + "step": 98780 + }, + { + "epoch": 0.8733358086246221, + "grad_norm": 5.156048774719238, + "learning_rate": 3.5444403189589633e-05, + "loss": 0.6404, + "step": 98790 + }, + { + "epoch": 0.8734242118849344, + "grad_norm": 2.1542906761169434, + "learning_rate": 3.544292980191776e-05, + "loss": 0.5713, + "step": 98800 + }, + { + "epoch": 0.8735126151452466, + "grad_norm": 3.502044439315796, + "learning_rate": 3.54414564142459e-05, + "loss": 0.6576, + "step": 98810 + }, + { + "epoch": 0.8736010184055588, + "grad_norm": 5.644341468811035, + "learning_rate": 3.543998302657402e-05, + "loss": 0.6506, + "step": 98820 + }, + { + "epoch": 0.873689421665871, + "grad_norm": 8.779607772827148, + "learning_rate": 3.5438509638902154e-05, + "loss": 0.6885, + "step": 98830 + }, + { + "epoch": 0.8737778249261833, + "grad_norm": 3.6800928115844727, + "learning_rate": 3.543703625123028e-05, + "loss": 0.6447, + "step": 98840 + }, + { + "epoch": 0.8738662281864955, + "grad_norm": 1.630843997001648, + "learning_rate": 3.543556286355841e-05, + "loss": 0.5899, + "step": 98850 + }, + { + "epoch": 0.8739546314468077, + "grad_norm": 1.7254748344421387, + "learning_rate": 3.543408947588654e-05, + "loss": 0.6351, + "step": 98860 + }, + { + "epoch": 0.87404303470712, + "grad_norm": 2.0491297245025635, + "learning_rate": 3.5432616088214674e-05, + "loss": 0.628, + "step": 98870 + }, + { + "epoch": 0.8741314379674322, + "grad_norm": 5.019067287445068, + "learning_rate": 3.5431142700542795e-05, + "loss": 0.6937, + "step": 98880 + }, + { + "epoch": 0.8742198412277444, + "grad_norm": 9.280817985534668, + "learning_rate": 3.542966931287093e-05, + "loss": 0.595, + "step": 98890 + }, + { + "epoch": 0.8743082444880568, + "grad_norm": 3.0402538776397705, + "learning_rate": 3.542819592519905e-05, + "loss": 0.6988, + "step": 98900 + }, + { + "epoch": 0.874396647748369, + "grad_norm": 4.510800361633301, + "learning_rate": 3.542672253752719e-05, + "loss": 0.6386, + "step": 98910 + }, + { + "epoch": 0.8744850510086812, + "grad_norm": 5.1762847900390625, + "learning_rate": 3.5425249149855316e-05, + "loss": 0.6576, + "step": 98920 + }, + { + "epoch": 0.8745734542689935, + "grad_norm": 2.7324318885803223, + "learning_rate": 3.5423775762183444e-05, + "loss": 0.7281, + "step": 98930 + }, + { + "epoch": 0.8746618575293057, + "grad_norm": 2.619001865386963, + "learning_rate": 3.542230237451157e-05, + "loss": 0.7271, + "step": 98940 + }, + { + "epoch": 0.8747502607896179, + "grad_norm": 2.3867146968841553, + "learning_rate": 3.542082898683971e-05, + "loss": 0.6574, + "step": 98950 + }, + { + "epoch": 0.8748386640499302, + "grad_norm": 4.602837562561035, + "learning_rate": 3.541935559916783e-05, + "loss": 0.5304, + "step": 98960 + }, + { + "epoch": 0.8749270673102424, + "grad_norm": 3.5285396575927734, + "learning_rate": 3.5417882211495964e-05, + "loss": 0.7435, + "step": 98970 + }, + { + "epoch": 0.8750154705705546, + "grad_norm": 2.4819743633270264, + "learning_rate": 3.541640882382409e-05, + "loss": 0.6965, + "step": 98980 + }, + { + "epoch": 0.8751038738308669, + "grad_norm": 2.2841880321502686, + "learning_rate": 3.541493543615222e-05, + "loss": 0.6337, + "step": 98990 + }, + { + "epoch": 0.8751922770911791, + "grad_norm": 6.790390968322754, + "learning_rate": 3.541346204848035e-05, + "loss": 0.5454, + "step": 99000 + }, + { + "epoch": 0.8752806803514913, + "grad_norm": 1.1564199924468994, + "learning_rate": 3.5411988660808484e-05, + "loss": 0.634, + "step": 99010 + }, + { + "epoch": 0.8753690836118037, + "grad_norm": 10.727293014526367, + "learning_rate": 3.5410515273136606e-05, + "loss": 0.6872, + "step": 99020 + }, + { + "epoch": 0.8754574868721159, + "grad_norm": 6.787434101104736, + "learning_rate": 3.540904188546474e-05, + "loss": 0.6958, + "step": 99030 + }, + { + "epoch": 0.8755458901324281, + "grad_norm": 8.093029022216797, + "learning_rate": 3.540756849779286e-05, + "loss": 0.6648, + "step": 99040 + }, + { + "epoch": 0.8756342933927403, + "grad_norm": 2.7870447635650635, + "learning_rate": 3.5406095110121e-05, + "loss": 0.5873, + "step": 99050 + }, + { + "epoch": 0.8757226966530526, + "grad_norm": 2.5037841796875, + "learning_rate": 3.5404621722449126e-05, + "loss": 0.6493, + "step": 99060 + }, + { + "epoch": 0.8758110999133648, + "grad_norm": 1.922541618347168, + "learning_rate": 3.5403148334777254e-05, + "loss": 0.5845, + "step": 99070 + }, + { + "epoch": 0.875899503173677, + "grad_norm": 3.1529664993286133, + "learning_rate": 3.540167494710538e-05, + "loss": 0.4853, + "step": 99080 + }, + { + "epoch": 0.8759879064339893, + "grad_norm": 1.3723474740982056, + "learning_rate": 3.540020155943352e-05, + "loss": 0.6435, + "step": 99090 + }, + { + "epoch": 0.8760763096943015, + "grad_norm": 4.011801719665527, + "learning_rate": 3.539872817176164e-05, + "loss": 0.6664, + "step": 99100 + }, + { + "epoch": 0.8761647129546137, + "grad_norm": 3.2729763984680176, + "learning_rate": 3.5397254784089775e-05, + "loss": 0.7014, + "step": 99110 + }, + { + "epoch": 0.876253116214926, + "grad_norm": 4.137759208679199, + "learning_rate": 3.53957813964179e-05, + "loss": 0.6752, + "step": 99120 + }, + { + "epoch": 0.8763415194752382, + "grad_norm": 3.8527872562408447, + "learning_rate": 3.539430800874603e-05, + "loss": 0.7016, + "step": 99130 + }, + { + "epoch": 0.8764299227355505, + "grad_norm": 3.8789100646972656, + "learning_rate": 3.539283462107416e-05, + "loss": 0.6849, + "step": 99140 + }, + { + "epoch": 0.8765183259958628, + "grad_norm": 5.719064235687256, + "learning_rate": 3.539136123340229e-05, + "loss": 0.7762, + "step": 99150 + }, + { + "epoch": 0.876606729256175, + "grad_norm": 1.9379489421844482, + "learning_rate": 3.5389887845730416e-05, + "loss": 0.7892, + "step": 99160 + }, + { + "epoch": 0.8766951325164872, + "grad_norm": 2.9722301959991455, + "learning_rate": 3.538841445805855e-05, + "loss": 0.7086, + "step": 99170 + }, + { + "epoch": 0.8767835357767995, + "grad_norm": 16.889270782470703, + "learning_rate": 3.538694107038667e-05, + "loss": 0.6332, + "step": 99180 + }, + { + "epoch": 0.8768719390371117, + "grad_norm": 2.966301202774048, + "learning_rate": 3.538546768271481e-05, + "loss": 0.6073, + "step": 99190 + }, + { + "epoch": 0.8769603422974239, + "grad_norm": 3.1727070808410645, + "learning_rate": 3.538399429504294e-05, + "loss": 0.6174, + "step": 99200 + }, + { + "epoch": 0.8770487455577362, + "grad_norm": 5.121644496917725, + "learning_rate": 3.5382520907371065e-05, + "loss": 0.6423, + "step": 99210 + }, + { + "epoch": 0.8771371488180484, + "grad_norm": 9.327658653259277, + "learning_rate": 3.538104751969919e-05, + "loss": 0.6135, + "step": 99220 + }, + { + "epoch": 0.8772255520783606, + "grad_norm": 1.8207581043243408, + "learning_rate": 3.537957413202733e-05, + "loss": 0.7655, + "step": 99230 + }, + { + "epoch": 0.8773139553386728, + "grad_norm": 5.972422122955322, + "learning_rate": 3.537810074435545e-05, + "loss": 0.7373, + "step": 99240 + }, + { + "epoch": 0.8774023585989851, + "grad_norm": 3.4334843158721924, + "learning_rate": 3.5376627356683585e-05, + "loss": 0.7113, + "step": 99250 + }, + { + "epoch": 0.8774907618592974, + "grad_norm": 21.4027099609375, + "learning_rate": 3.537515396901171e-05, + "loss": 0.6069, + "step": 99260 + }, + { + "epoch": 0.8775791651196096, + "grad_norm": 1.0649467706680298, + "learning_rate": 3.537368058133984e-05, + "loss": 0.6434, + "step": 99270 + }, + { + "epoch": 0.8776675683799219, + "grad_norm": 1.082999348640442, + "learning_rate": 3.537220719366797e-05, + "loss": 0.5167, + "step": 99280 + }, + { + "epoch": 0.8777559716402341, + "grad_norm": 2.8941519260406494, + "learning_rate": 3.53707338059961e-05, + "loss": 0.6574, + "step": 99290 + }, + { + "epoch": 0.8778443749005463, + "grad_norm": 12.018267631530762, + "learning_rate": 3.536926041832423e-05, + "loss": 0.7463, + "step": 99300 + }, + { + "epoch": 0.8779327781608586, + "grad_norm": 1.1627510786056519, + "learning_rate": 3.536778703065236e-05, + "loss": 0.4938, + "step": 99310 + }, + { + "epoch": 0.8780211814211708, + "grad_norm": 1.0448715686798096, + "learning_rate": 3.5366313642980484e-05, + "loss": 0.6823, + "step": 99320 + }, + { + "epoch": 0.878109584681483, + "grad_norm": 0.8962799310684204, + "learning_rate": 3.536484025530862e-05, + "loss": 0.5619, + "step": 99330 + }, + { + "epoch": 0.8781979879417953, + "grad_norm": 5.345099925994873, + "learning_rate": 3.536336686763675e-05, + "loss": 0.6446, + "step": 99340 + }, + { + "epoch": 0.8782863912021075, + "grad_norm": 1.104266881942749, + "learning_rate": 3.5361893479964875e-05, + "loss": 0.6198, + "step": 99350 + }, + { + "epoch": 0.8783747944624197, + "grad_norm": 7.238385200500488, + "learning_rate": 3.5360420092293004e-05, + "loss": 0.7927, + "step": 99360 + }, + { + "epoch": 0.8784631977227321, + "grad_norm": 2.3845322132110596, + "learning_rate": 3.535894670462113e-05, + "loss": 0.5904, + "step": 99370 + }, + { + "epoch": 0.8785516009830443, + "grad_norm": 5.77187442779541, + "learning_rate": 3.535747331694926e-05, + "loss": 0.6724, + "step": 99380 + }, + { + "epoch": 0.8786400042433565, + "grad_norm": 6.896483898162842, + "learning_rate": 3.5355999929277396e-05, + "loss": 0.6764, + "step": 99390 + }, + { + "epoch": 0.8787284075036688, + "grad_norm": 3.7183151245117188, + "learning_rate": 3.535452654160552e-05, + "loss": 0.6929, + "step": 99400 + }, + { + "epoch": 0.878816810763981, + "grad_norm": 1.867093563079834, + "learning_rate": 3.535305315393365e-05, + "loss": 0.6756, + "step": 99410 + }, + { + "epoch": 0.8789052140242932, + "grad_norm": 11.129083633422852, + "learning_rate": 3.535157976626178e-05, + "loss": 0.6851, + "step": 99420 + }, + { + "epoch": 0.8789936172846055, + "grad_norm": 2.918914556503296, + "learning_rate": 3.535010637858991e-05, + "loss": 0.7333, + "step": 99430 + }, + { + "epoch": 0.8790820205449177, + "grad_norm": 2.4519002437591553, + "learning_rate": 3.534863299091804e-05, + "loss": 0.5644, + "step": 99440 + }, + { + "epoch": 0.8791704238052299, + "grad_norm": 1.9421651363372803, + "learning_rate": 3.534715960324617e-05, + "loss": 0.7968, + "step": 99450 + }, + { + "epoch": 0.8792588270655421, + "grad_norm": 2.8746142387390137, + "learning_rate": 3.5345686215574294e-05, + "loss": 0.5139, + "step": 99460 + }, + { + "epoch": 0.8793472303258544, + "grad_norm": 3.651703119277954, + "learning_rate": 3.534421282790243e-05, + "loss": 0.6729, + "step": 99470 + }, + { + "epoch": 0.8794356335861666, + "grad_norm": 7.4472503662109375, + "learning_rate": 3.534273944023056e-05, + "loss": 0.5204, + "step": 99480 + }, + { + "epoch": 0.879524036846479, + "grad_norm": 3.8146281242370605, + "learning_rate": 3.5341266052558686e-05, + "loss": 0.706, + "step": 99490 + }, + { + "epoch": 0.8796124401067912, + "grad_norm": 6.142029285430908, + "learning_rate": 3.5339792664886814e-05, + "loss": 0.6076, + "step": 99500 + }, + { + "epoch": 0.8797008433671034, + "grad_norm": 2.7834155559539795, + "learning_rate": 3.533831927721494e-05, + "loss": 0.5998, + "step": 99510 + }, + { + "epoch": 0.8797892466274156, + "grad_norm": 4.0486836433410645, + "learning_rate": 3.533684588954307e-05, + "loss": 0.6007, + "step": 99520 + }, + { + "epoch": 0.8798776498877279, + "grad_norm": 2.0358059406280518, + "learning_rate": 3.5335372501871206e-05, + "loss": 0.5999, + "step": 99530 + }, + { + "epoch": 0.8799660531480401, + "grad_norm": 8.948349952697754, + "learning_rate": 3.5333899114199335e-05, + "loss": 0.8091, + "step": 99540 + }, + { + "epoch": 0.8800544564083523, + "grad_norm": 5.502082824707031, + "learning_rate": 3.533242572652746e-05, + "loss": 0.6152, + "step": 99550 + }, + { + "epoch": 0.8801428596686646, + "grad_norm": 1.0880204439163208, + "learning_rate": 3.533095233885559e-05, + "loss": 0.6501, + "step": 99560 + }, + { + "epoch": 0.8802312629289768, + "grad_norm": 7.57011604309082, + "learning_rate": 3.532947895118372e-05, + "loss": 0.7725, + "step": 99570 + }, + { + "epoch": 0.880319666189289, + "grad_norm": 2.140597105026245, + "learning_rate": 3.532800556351185e-05, + "loss": 0.7059, + "step": 99580 + }, + { + "epoch": 0.8804080694496013, + "grad_norm": 1.4484633207321167, + "learning_rate": 3.532653217583998e-05, + "loss": 0.6203, + "step": 99590 + }, + { + "epoch": 0.8804964727099135, + "grad_norm": 1.4211913347244263, + "learning_rate": 3.532505878816811e-05, + "loss": 0.5407, + "step": 99600 + }, + { + "epoch": 0.8805848759702258, + "grad_norm": 1.9758319854736328, + "learning_rate": 3.532358540049624e-05, + "loss": 0.6465, + "step": 99610 + }, + { + "epoch": 0.8806732792305381, + "grad_norm": 3.216864824295044, + "learning_rate": 3.532211201282437e-05, + "loss": 0.6105, + "step": 99620 + }, + { + "epoch": 0.8807616824908503, + "grad_norm": 34.49270248413086, + "learning_rate": 3.5320638625152497e-05, + "loss": 0.6752, + "step": 99630 + }, + { + "epoch": 0.8808500857511625, + "grad_norm": 1.2794069051742554, + "learning_rate": 3.5319165237480625e-05, + "loss": 0.6764, + "step": 99640 + }, + { + "epoch": 0.8809384890114748, + "grad_norm": 1.846698522567749, + "learning_rate": 3.531769184980875e-05, + "loss": 0.5829, + "step": 99650 + }, + { + "epoch": 0.881026892271787, + "grad_norm": 2.458574056625366, + "learning_rate": 3.531621846213689e-05, + "loss": 0.5706, + "step": 99660 + }, + { + "epoch": 0.8811152955320992, + "grad_norm": 3.632127046585083, + "learning_rate": 3.531474507446502e-05, + "loss": 0.8337, + "step": 99670 + }, + { + "epoch": 0.8812036987924114, + "grad_norm": 2.750758647918701, + "learning_rate": 3.5313271686793145e-05, + "loss": 0.553, + "step": 99680 + }, + { + "epoch": 0.8812921020527237, + "grad_norm": 3.2621114253997803, + "learning_rate": 3.5311798299121273e-05, + "loss": 0.6514, + "step": 99690 + }, + { + "epoch": 0.8813805053130359, + "grad_norm": 1.0674934387207031, + "learning_rate": 3.53103249114494e-05, + "loss": 0.5775, + "step": 99700 + }, + { + "epoch": 0.8814689085733481, + "grad_norm": 2.0522475242614746, + "learning_rate": 3.530885152377753e-05, + "loss": 0.603, + "step": 99710 + }, + { + "epoch": 0.8815573118336604, + "grad_norm": 1.4563353061676025, + "learning_rate": 3.5307378136105665e-05, + "loss": 0.7596, + "step": 99720 + }, + { + "epoch": 0.8816457150939727, + "grad_norm": 8.15146255493164, + "learning_rate": 3.530590474843379e-05, + "loss": 0.6854, + "step": 99730 + }, + { + "epoch": 0.881734118354285, + "grad_norm": 5.4734392166137695, + "learning_rate": 3.530443136076192e-05, + "loss": 0.5172, + "step": 99740 + }, + { + "epoch": 0.8818225216145972, + "grad_norm": 2.399756669998169, + "learning_rate": 3.530295797309005e-05, + "loss": 0.7575, + "step": 99750 + }, + { + "epoch": 0.8819109248749094, + "grad_norm": 2.3738224506378174, + "learning_rate": 3.530148458541818e-05, + "loss": 0.7188, + "step": 99760 + }, + { + "epoch": 0.8819993281352216, + "grad_norm": 2.2894015312194824, + "learning_rate": 3.530001119774631e-05, + "loss": 0.8088, + "step": 99770 + }, + { + "epoch": 0.8820877313955339, + "grad_norm": 1.5450955629348755, + "learning_rate": 3.529853781007444e-05, + "loss": 0.5708, + "step": 99780 + }, + { + "epoch": 0.8821761346558461, + "grad_norm": 1.1488293409347534, + "learning_rate": 3.5297064422402564e-05, + "loss": 0.6266, + "step": 99790 + }, + { + "epoch": 0.8822645379161583, + "grad_norm": 2.267016649246216, + "learning_rate": 3.52955910347307e-05, + "loss": 0.6417, + "step": 99800 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 2.547762155532837, + "learning_rate": 3.529411764705883e-05, + "loss": 0.7029, + "step": 99810 + }, + { + "epoch": 0.8824413444367828, + "grad_norm": 2.7036497592926025, + "learning_rate": 3.5292644259386956e-05, + "loss": 0.788, + "step": 99820 + }, + { + "epoch": 0.882529747697095, + "grad_norm": 2.031101942062378, + "learning_rate": 3.5291170871715084e-05, + "loss": 0.6671, + "step": 99830 + }, + { + "epoch": 0.8826181509574073, + "grad_norm": 2.2533857822418213, + "learning_rate": 3.528969748404322e-05, + "loss": 0.6051, + "step": 99840 + }, + { + "epoch": 0.8827065542177196, + "grad_norm": 5.356338977813721, + "learning_rate": 3.528822409637134e-05, + "loss": 0.7115, + "step": 99850 + }, + { + "epoch": 0.8827949574780318, + "grad_norm": 2.7008416652679443, + "learning_rate": 3.5286750708699476e-05, + "loss": 0.5552, + "step": 99860 + }, + { + "epoch": 0.8828833607383441, + "grad_norm": 2.2722997665405273, + "learning_rate": 3.52852773210276e-05, + "loss": 0.551, + "step": 99870 + }, + { + "epoch": 0.8829717639986563, + "grad_norm": 7.175367832183838, + "learning_rate": 3.528380393335573e-05, + "loss": 0.8399, + "step": 99880 + }, + { + "epoch": 0.8830601672589685, + "grad_norm": 3.9263012409210205, + "learning_rate": 3.528233054568386e-05, + "loss": 0.6357, + "step": 99890 + }, + { + "epoch": 0.8831485705192808, + "grad_norm": 0.8555113077163696, + "learning_rate": 3.528085715801199e-05, + "loss": 0.5515, + "step": 99900 + }, + { + "epoch": 0.883236973779593, + "grad_norm": 1.8814798593521118, + "learning_rate": 3.527938377034012e-05, + "loss": 0.5942, + "step": 99910 + }, + { + "epoch": 0.8833253770399052, + "grad_norm": 2.6422667503356934, + "learning_rate": 3.527791038266825e-05, + "loss": 0.6715, + "step": 99920 + }, + { + "epoch": 0.8834137803002174, + "grad_norm": 4.119536399841309, + "learning_rate": 3.5276436994996374e-05, + "loss": 0.7084, + "step": 99930 + }, + { + "epoch": 0.8835021835605297, + "grad_norm": 6.071832656860352, + "learning_rate": 3.527496360732451e-05, + "loss": 0.7154, + "step": 99940 + }, + { + "epoch": 0.8835905868208419, + "grad_norm": 2.373159408569336, + "learning_rate": 3.527349021965264e-05, + "loss": 0.7028, + "step": 99950 + }, + { + "epoch": 0.8836789900811542, + "grad_norm": 8.160861015319824, + "learning_rate": 3.5272016831980766e-05, + "loss": 0.7688, + "step": 99960 + }, + { + "epoch": 0.8837673933414665, + "grad_norm": 1.8556309938430786, + "learning_rate": 3.5270543444308894e-05, + "loss": 0.5306, + "step": 99970 + }, + { + "epoch": 0.8838557966017787, + "grad_norm": 1.4118374586105347, + "learning_rate": 3.526907005663702e-05, + "loss": 0.5776, + "step": 99980 + }, + { + "epoch": 0.8839441998620909, + "grad_norm": 4.978692531585693, + "learning_rate": 3.526759666896515e-05, + "loss": 0.671, + "step": 99990 + }, + { + "epoch": 0.8840326031224032, + "grad_norm": 2.377770185470581, + "learning_rate": 3.5266123281293286e-05, + "loss": 0.729, + "step": 100000 + }, + { + "epoch": 0.8841210063827154, + "grad_norm": 1.3143846988677979, + "learning_rate": 3.526464989362141e-05, + "loss": 0.6601, + "step": 100010 + }, + { + "epoch": 0.8842094096430276, + "grad_norm": 2.8539278507232666, + "learning_rate": 3.526317650594954e-05, + "loss": 0.5819, + "step": 100020 + }, + { + "epoch": 0.8842978129033399, + "grad_norm": 1.5120114088058472, + "learning_rate": 3.526170311827767e-05, + "loss": 0.6371, + "step": 100030 + }, + { + "epoch": 0.8843862161636521, + "grad_norm": 3.649439573287964, + "learning_rate": 3.52602297306058e-05, + "loss": 0.7258, + "step": 100040 + }, + { + "epoch": 0.8844746194239643, + "grad_norm": 3.1046438217163086, + "learning_rate": 3.525875634293393e-05, + "loss": 0.7519, + "step": 100050 + }, + { + "epoch": 0.8845630226842766, + "grad_norm": 3.6604366302490234, + "learning_rate": 3.525728295526206e-05, + "loss": 0.6225, + "step": 100060 + }, + { + "epoch": 0.8846514259445888, + "grad_norm": 3.448822021484375, + "learning_rate": 3.5255809567590185e-05, + "loss": 0.7538, + "step": 100070 + }, + { + "epoch": 0.8847398292049011, + "grad_norm": 2.1168973445892334, + "learning_rate": 3.525433617991832e-05, + "loss": 0.6728, + "step": 100080 + }, + { + "epoch": 0.8848282324652134, + "grad_norm": 10.837515830993652, + "learning_rate": 3.525286279224644e-05, + "loss": 0.579, + "step": 100090 + }, + { + "epoch": 0.8849166357255256, + "grad_norm": 5.5818586349487305, + "learning_rate": 3.5251389404574577e-05, + "loss": 0.5947, + "step": 100100 + }, + { + "epoch": 0.8850050389858378, + "grad_norm": 1.1673955917358398, + "learning_rate": 3.5249916016902705e-05, + "loss": 0.6881, + "step": 100110 + }, + { + "epoch": 0.88509344224615, + "grad_norm": 1.301098346710205, + "learning_rate": 3.524844262923083e-05, + "loss": 0.6105, + "step": 100120 + }, + { + "epoch": 0.8851818455064623, + "grad_norm": 4.43489933013916, + "learning_rate": 3.524696924155896e-05, + "loss": 0.6806, + "step": 100130 + }, + { + "epoch": 0.8852702487667745, + "grad_norm": 1.8990715742111206, + "learning_rate": 3.52454958538871e-05, + "loss": 0.6743, + "step": 100140 + }, + { + "epoch": 0.8853586520270867, + "grad_norm": 3.0658020973205566, + "learning_rate": 3.524402246621522e-05, + "loss": 0.6715, + "step": 100150 + }, + { + "epoch": 0.885447055287399, + "grad_norm": 1.5086337327957153, + "learning_rate": 3.5242549078543353e-05, + "loss": 0.7008, + "step": 100160 + }, + { + "epoch": 0.8855354585477112, + "grad_norm": 1.8033921718597412, + "learning_rate": 3.524107569087148e-05, + "loss": 0.7926, + "step": 100170 + }, + { + "epoch": 0.8856238618080234, + "grad_norm": 3.2836856842041016, + "learning_rate": 3.523960230319961e-05, + "loss": 0.7363, + "step": 100180 + }, + { + "epoch": 0.8857122650683357, + "grad_norm": 1.9718445539474487, + "learning_rate": 3.523812891552774e-05, + "loss": 0.6143, + "step": 100190 + }, + { + "epoch": 0.885800668328648, + "grad_norm": 4.54406213760376, + "learning_rate": 3.523665552785587e-05, + "loss": 0.6392, + "step": 100200 + }, + { + "epoch": 0.8858890715889602, + "grad_norm": 5.802214622497559, + "learning_rate": 3.5235182140183995e-05, + "loss": 0.6594, + "step": 100210 + }, + { + "epoch": 0.8859774748492725, + "grad_norm": 1.9690452814102173, + "learning_rate": 3.523370875251213e-05, + "loss": 0.5638, + "step": 100220 + }, + { + "epoch": 0.8860658781095847, + "grad_norm": 2.6623852252960205, + "learning_rate": 3.523223536484025e-05, + "loss": 0.7114, + "step": 100230 + }, + { + "epoch": 0.8861542813698969, + "grad_norm": 4.33412504196167, + "learning_rate": 3.523076197716839e-05, + "loss": 0.6726, + "step": 100240 + }, + { + "epoch": 0.8862426846302092, + "grad_norm": 3.843928098678589, + "learning_rate": 3.5229288589496515e-05, + "loss": 0.637, + "step": 100250 + }, + { + "epoch": 0.8863310878905214, + "grad_norm": 4.868276596069336, + "learning_rate": 3.5227815201824644e-05, + "loss": 0.7108, + "step": 100260 + }, + { + "epoch": 0.8864194911508336, + "grad_norm": 1.452610731124878, + "learning_rate": 3.522634181415277e-05, + "loss": 0.6581, + "step": 100270 + }, + { + "epoch": 0.8865078944111459, + "grad_norm": 1.7720494270324707, + "learning_rate": 3.522486842648091e-05, + "loss": 0.6808, + "step": 100280 + }, + { + "epoch": 0.8865962976714581, + "grad_norm": 9.473158836364746, + "learning_rate": 3.522339503880903e-05, + "loss": 0.7835, + "step": 100290 + }, + { + "epoch": 0.8866847009317703, + "grad_norm": 1.4104455709457397, + "learning_rate": 3.5221921651137164e-05, + "loss": 0.6491, + "step": 100300 + }, + { + "epoch": 0.8867731041920826, + "grad_norm": 8.778430938720703, + "learning_rate": 3.522044826346529e-05, + "loss": 0.6525, + "step": 100310 + }, + { + "epoch": 0.8868615074523949, + "grad_norm": 1.6890722513198853, + "learning_rate": 3.521897487579342e-05, + "loss": 0.6736, + "step": 100320 + }, + { + "epoch": 0.8869499107127071, + "grad_norm": 11.171053886413574, + "learning_rate": 3.521750148812155e-05, + "loss": 0.5095, + "step": 100330 + }, + { + "epoch": 0.8870383139730194, + "grad_norm": 2.19525146484375, + "learning_rate": 3.521602810044968e-05, + "loss": 0.7259, + "step": 100340 + }, + { + "epoch": 0.8871267172333316, + "grad_norm": 3.0850815773010254, + "learning_rate": 3.5214554712777806e-05, + "loss": 0.6318, + "step": 100350 + }, + { + "epoch": 0.8872151204936438, + "grad_norm": 7.177029609680176, + "learning_rate": 3.521308132510594e-05, + "loss": 0.8074, + "step": 100360 + }, + { + "epoch": 0.887303523753956, + "grad_norm": 4.595524311065674, + "learning_rate": 3.521160793743406e-05, + "loss": 0.6359, + "step": 100370 + }, + { + "epoch": 0.8873919270142683, + "grad_norm": 7.67150354385376, + "learning_rate": 3.52101345497622e-05, + "loss": 0.64, + "step": 100380 + }, + { + "epoch": 0.8874803302745805, + "grad_norm": 1.5217820405960083, + "learning_rate": 3.5208661162090326e-05, + "loss": 0.6981, + "step": 100390 + }, + { + "epoch": 0.8875687335348927, + "grad_norm": 3.5454249382019043, + "learning_rate": 3.5207187774418454e-05, + "loss": 0.6135, + "step": 100400 + }, + { + "epoch": 0.887657136795205, + "grad_norm": 5.299508571624756, + "learning_rate": 3.520571438674658e-05, + "loss": 0.7602, + "step": 100410 + }, + { + "epoch": 0.8877455400555172, + "grad_norm": 3.249697685241699, + "learning_rate": 3.520424099907472e-05, + "loss": 0.7282, + "step": 100420 + }, + { + "epoch": 0.8878339433158294, + "grad_norm": 5.020130634307861, + "learning_rate": 3.520276761140284e-05, + "loss": 0.6608, + "step": 100430 + }, + { + "epoch": 0.8879223465761418, + "grad_norm": 2.3925464153289795, + "learning_rate": 3.5201294223730974e-05, + "loss": 0.5725, + "step": 100440 + }, + { + "epoch": 0.888010749836454, + "grad_norm": 15.526290893554688, + "learning_rate": 3.51998208360591e-05, + "loss": 0.6876, + "step": 100450 + }, + { + "epoch": 0.8880991530967662, + "grad_norm": 4.03241491317749, + "learning_rate": 3.519834744838723e-05, + "loss": 0.6501, + "step": 100460 + }, + { + "epoch": 0.8881875563570785, + "grad_norm": 1.3994730710983276, + "learning_rate": 3.519687406071536e-05, + "loss": 0.7434, + "step": 100470 + }, + { + "epoch": 0.8882759596173907, + "grad_norm": 1.7547762393951416, + "learning_rate": 3.519540067304349e-05, + "loss": 0.6682, + "step": 100480 + }, + { + "epoch": 0.8883643628777029, + "grad_norm": 1.701341152191162, + "learning_rate": 3.5193927285371616e-05, + "loss": 0.6908, + "step": 100490 + }, + { + "epoch": 0.8884527661380152, + "grad_norm": 3.0670440196990967, + "learning_rate": 3.519245389769975e-05, + "loss": 0.705, + "step": 100500 + }, + { + "epoch": 0.8885411693983274, + "grad_norm": 0.8181890845298767, + "learning_rate": 3.519098051002788e-05, + "loss": 0.6324, + "step": 100510 + }, + { + "epoch": 0.8886295726586396, + "grad_norm": 3.015061616897583, + "learning_rate": 3.518950712235601e-05, + "loss": 0.6946, + "step": 100520 + }, + { + "epoch": 0.8887179759189519, + "grad_norm": 6.234442710876465, + "learning_rate": 3.5188033734684136e-05, + "loss": 0.5297, + "step": 100530 + }, + { + "epoch": 0.8888063791792641, + "grad_norm": 1.1581306457519531, + "learning_rate": 3.5186560347012265e-05, + "loss": 0.6742, + "step": 100540 + }, + { + "epoch": 0.8888947824395764, + "grad_norm": 1.9346543550491333, + "learning_rate": 3.518508695934039e-05, + "loss": 0.7385, + "step": 100550 + }, + { + "epoch": 0.8889831856998887, + "grad_norm": 4.378337383270264, + "learning_rate": 3.518361357166852e-05, + "loss": 0.6182, + "step": 100560 + }, + { + "epoch": 0.8890715889602009, + "grad_norm": 3.4411778450012207, + "learning_rate": 3.518214018399666e-05, + "loss": 0.659, + "step": 100570 + }, + { + "epoch": 0.8891599922205131, + "grad_norm": 2.7635605335235596, + "learning_rate": 3.5180666796324785e-05, + "loss": 0.7243, + "step": 100580 + }, + { + "epoch": 0.8892483954808253, + "grad_norm": 5.1399383544921875, + "learning_rate": 3.517919340865291e-05, + "loss": 0.6066, + "step": 100590 + }, + { + "epoch": 0.8893367987411376, + "grad_norm": 1.4909334182739258, + "learning_rate": 3.517772002098104e-05, + "loss": 0.6279, + "step": 100600 + }, + { + "epoch": 0.8894252020014498, + "grad_norm": 4.729198455810547, + "learning_rate": 3.517624663330917e-05, + "loss": 0.722, + "step": 100610 + }, + { + "epoch": 0.889513605261762, + "grad_norm": 5.377867698669434, + "learning_rate": 3.51747732456373e-05, + "loss": 0.5887, + "step": 100620 + }, + { + "epoch": 0.8896020085220743, + "grad_norm": 1.6192539930343628, + "learning_rate": 3.5173299857965434e-05, + "loss": 0.6734, + "step": 100630 + }, + { + "epoch": 0.8896904117823865, + "grad_norm": 3.3911421298980713, + "learning_rate": 3.517182647029356e-05, + "loss": 0.707, + "step": 100640 + }, + { + "epoch": 0.8897788150426987, + "grad_norm": 1.5079931020736694, + "learning_rate": 3.517035308262169e-05, + "loss": 0.5818, + "step": 100650 + }, + { + "epoch": 0.889867218303011, + "grad_norm": 4.550739765167236, + "learning_rate": 3.516887969494982e-05, + "loss": 0.5759, + "step": 100660 + }, + { + "epoch": 0.8899556215633233, + "grad_norm": 4.818974494934082, + "learning_rate": 3.516740630727795e-05, + "loss": 0.6299, + "step": 100670 + }, + { + "epoch": 0.8900440248236355, + "grad_norm": 8.771803855895996, + "learning_rate": 3.5165932919606075e-05, + "loss": 0.6815, + "step": 100680 + }, + { + "epoch": 0.8901324280839478, + "grad_norm": 1.6297937631607056, + "learning_rate": 3.516445953193421e-05, + "loss": 0.6405, + "step": 100690 + }, + { + "epoch": 0.89022083134426, + "grad_norm": 1.503909707069397, + "learning_rate": 3.516298614426233e-05, + "loss": 0.5347, + "step": 100700 + }, + { + "epoch": 0.8903092346045722, + "grad_norm": 4.525701999664307, + "learning_rate": 3.516151275659047e-05, + "loss": 0.6617, + "step": 100710 + }, + { + "epoch": 0.8903976378648845, + "grad_norm": 1.8797188997268677, + "learning_rate": 3.5160039368918596e-05, + "loss": 0.597, + "step": 100720 + }, + { + "epoch": 0.8904860411251967, + "grad_norm": 3.4912447929382324, + "learning_rate": 3.5158565981246724e-05, + "loss": 0.6898, + "step": 100730 + }, + { + "epoch": 0.8905744443855089, + "grad_norm": 6.8011956214904785, + "learning_rate": 3.515709259357485e-05, + "loss": 0.6806, + "step": 100740 + }, + { + "epoch": 0.8906628476458212, + "grad_norm": 10.962492942810059, + "learning_rate": 3.515561920590299e-05, + "loss": 0.6588, + "step": 100750 + }, + { + "epoch": 0.8907512509061334, + "grad_norm": 2.307803153991699, + "learning_rate": 3.515414581823111e-05, + "loss": 0.6848, + "step": 100760 + }, + { + "epoch": 0.8908396541664456, + "grad_norm": 1.471994400024414, + "learning_rate": 3.5152672430559244e-05, + "loss": 0.6145, + "step": 100770 + }, + { + "epoch": 0.8909280574267578, + "grad_norm": 1.0318900346755981, + "learning_rate": 3.515119904288737e-05, + "loss": 0.5517, + "step": 100780 + }, + { + "epoch": 0.8910164606870702, + "grad_norm": 3.9565703868865967, + "learning_rate": 3.51497256552155e-05, + "loss": 0.7354, + "step": 100790 + }, + { + "epoch": 0.8911048639473824, + "grad_norm": 3.331211805343628, + "learning_rate": 3.514825226754363e-05, + "loss": 0.5486, + "step": 100800 + }, + { + "epoch": 0.8911932672076947, + "grad_norm": 3.1534969806671143, + "learning_rate": 3.514677887987176e-05, + "loss": 0.6251, + "step": 100810 + }, + { + "epoch": 0.8912816704680069, + "grad_norm": 0.961853563785553, + "learning_rate": 3.5145305492199886e-05, + "loss": 0.5399, + "step": 100820 + }, + { + "epoch": 0.8913700737283191, + "grad_norm": 0.6532862782478333, + "learning_rate": 3.514383210452802e-05, + "loss": 0.5547, + "step": 100830 + }, + { + "epoch": 0.8914584769886313, + "grad_norm": 1.2637436389923096, + "learning_rate": 3.514235871685614e-05, + "loss": 0.7035, + "step": 100840 + }, + { + "epoch": 0.8915468802489436, + "grad_norm": 2.0397205352783203, + "learning_rate": 3.514088532918428e-05, + "loss": 0.5815, + "step": 100850 + }, + { + "epoch": 0.8916352835092558, + "grad_norm": 4.124575614929199, + "learning_rate": 3.5139411941512406e-05, + "loss": 0.6324, + "step": 100860 + }, + { + "epoch": 0.891723686769568, + "grad_norm": 7.4020094871521, + "learning_rate": 3.5137938553840534e-05, + "loss": 0.6288, + "step": 100870 + }, + { + "epoch": 0.8918120900298803, + "grad_norm": 5.552517414093018, + "learning_rate": 3.513646516616866e-05, + "loss": 0.6562, + "step": 100880 + }, + { + "epoch": 0.8919004932901925, + "grad_norm": 1.8676708936691284, + "learning_rate": 3.51349917784968e-05, + "loss": 0.9967, + "step": 100890 + }, + { + "epoch": 0.8919888965505047, + "grad_norm": 3.641599416732788, + "learning_rate": 3.513351839082492e-05, + "loss": 0.5815, + "step": 100900 + }, + { + "epoch": 0.8920772998108171, + "grad_norm": 2.785343647003174, + "learning_rate": 3.5132045003153055e-05, + "loss": 0.707, + "step": 100910 + }, + { + "epoch": 0.8921657030711293, + "grad_norm": 4.495552062988281, + "learning_rate": 3.5130571615481176e-05, + "loss": 0.6783, + "step": 100920 + }, + { + "epoch": 0.8922541063314415, + "grad_norm": 6.22516393661499, + "learning_rate": 3.512909822780931e-05, + "loss": 0.5645, + "step": 100930 + }, + { + "epoch": 0.8923425095917538, + "grad_norm": 2.9545347690582275, + "learning_rate": 3.512762484013744e-05, + "loss": 0.5692, + "step": 100940 + }, + { + "epoch": 0.892430912852066, + "grad_norm": 8.589646339416504, + "learning_rate": 3.512615145246557e-05, + "loss": 0.6504, + "step": 100950 + }, + { + "epoch": 0.8925193161123782, + "grad_norm": 7.866554260253906, + "learning_rate": 3.5124678064793696e-05, + "loss": 0.6609, + "step": 100960 + }, + { + "epoch": 0.8926077193726905, + "grad_norm": 1.5530755519866943, + "learning_rate": 3.512320467712183e-05, + "loss": 0.722, + "step": 100970 + }, + { + "epoch": 0.8926961226330027, + "grad_norm": 1.5333573818206787, + "learning_rate": 3.512173128944995e-05, + "loss": 0.6719, + "step": 100980 + }, + { + "epoch": 0.8927845258933149, + "grad_norm": 1.0072578191757202, + "learning_rate": 3.512025790177809e-05, + "loss": 0.5213, + "step": 100990 + }, + { + "epoch": 0.8928729291536271, + "grad_norm": 10.758806228637695, + "learning_rate": 3.5118784514106217e-05, + "loss": 0.6005, + "step": 101000 + }, + { + "epoch": 0.8929613324139394, + "grad_norm": 3.3677430152893066, + "learning_rate": 3.5117311126434345e-05, + "loss": 0.5697, + "step": 101010 + }, + { + "epoch": 0.8930497356742517, + "grad_norm": 1.2367812395095825, + "learning_rate": 3.511583773876247e-05, + "loss": 0.5482, + "step": 101020 + }, + { + "epoch": 0.893138138934564, + "grad_norm": 1.904903769493103, + "learning_rate": 3.51143643510906e-05, + "loss": 0.6816, + "step": 101030 + }, + { + "epoch": 0.8932265421948762, + "grad_norm": 3.000394105911255, + "learning_rate": 3.511289096341873e-05, + "loss": 0.7403, + "step": 101040 + }, + { + "epoch": 0.8933149454551884, + "grad_norm": 1.9283185005187988, + "learning_rate": 3.5111417575746865e-05, + "loss": 0.6826, + "step": 101050 + }, + { + "epoch": 0.8934033487155006, + "grad_norm": 3.345632791519165, + "learning_rate": 3.510994418807499e-05, + "loss": 0.6579, + "step": 101060 + }, + { + "epoch": 0.8934917519758129, + "grad_norm": 2.5902278423309326, + "learning_rate": 3.510847080040312e-05, + "loss": 0.6711, + "step": 101070 + }, + { + "epoch": 0.8935801552361251, + "grad_norm": 2.029496669769287, + "learning_rate": 3.510699741273125e-05, + "loss": 0.6894, + "step": 101080 + }, + { + "epoch": 0.8936685584964373, + "grad_norm": 1.384575366973877, + "learning_rate": 3.510552402505938e-05, + "loss": 0.6912, + "step": 101090 + }, + { + "epoch": 0.8937569617567496, + "grad_norm": 3.0527894496917725, + "learning_rate": 3.510405063738751e-05, + "loss": 0.7272, + "step": 101100 + }, + { + "epoch": 0.8938453650170618, + "grad_norm": 11.757392883300781, + "learning_rate": 3.510257724971564e-05, + "loss": 0.5911, + "step": 101110 + }, + { + "epoch": 0.893933768277374, + "grad_norm": 1.9908053874969482, + "learning_rate": 3.5101103862043764e-05, + "loss": 0.7402, + "step": 101120 + }, + { + "epoch": 0.8940221715376863, + "grad_norm": 1.9703317880630493, + "learning_rate": 3.50996304743719e-05, + "loss": 0.687, + "step": 101130 + }, + { + "epoch": 0.8941105747979986, + "grad_norm": 3.663726329803467, + "learning_rate": 3.509815708670002e-05, + "loss": 0.6811, + "step": 101140 + }, + { + "epoch": 0.8941989780583108, + "grad_norm": 4.681532859802246, + "learning_rate": 3.5096683699028155e-05, + "loss": 0.6492, + "step": 101150 + }, + { + "epoch": 0.8942873813186231, + "grad_norm": 3.539275884628296, + "learning_rate": 3.5095210311356284e-05, + "loss": 0.6097, + "step": 101160 + }, + { + "epoch": 0.8943757845789353, + "grad_norm": 5.129672527313232, + "learning_rate": 3.509373692368441e-05, + "loss": 0.6826, + "step": 101170 + }, + { + "epoch": 0.8944641878392475, + "grad_norm": 2.5532431602478027, + "learning_rate": 3.509226353601254e-05, + "loss": 0.6332, + "step": 101180 + }, + { + "epoch": 0.8945525910995598, + "grad_norm": 16.827884674072266, + "learning_rate": 3.5090790148340676e-05, + "loss": 0.6399, + "step": 101190 + }, + { + "epoch": 0.894640994359872, + "grad_norm": 8.727770805358887, + "learning_rate": 3.50893167606688e-05, + "loss": 0.5987, + "step": 101200 + }, + { + "epoch": 0.8947293976201842, + "grad_norm": 0.9801639914512634, + "learning_rate": 3.508784337299693e-05, + "loss": 0.6697, + "step": 101210 + }, + { + "epoch": 0.8948178008804965, + "grad_norm": 1.7317551374435425, + "learning_rate": 3.508636998532506e-05, + "loss": 0.7046, + "step": 101220 + }, + { + "epoch": 0.8949062041408087, + "grad_norm": 1.1599980592727661, + "learning_rate": 3.508489659765319e-05, + "loss": 0.6924, + "step": 101230 + }, + { + "epoch": 0.8949946074011209, + "grad_norm": 6.136709213256836, + "learning_rate": 3.508342320998132e-05, + "loss": 0.6045, + "step": 101240 + }, + { + "epoch": 0.8950830106614331, + "grad_norm": 2.312178373336792, + "learning_rate": 3.508194982230945e-05, + "loss": 0.5173, + "step": 101250 + }, + { + "epoch": 0.8951714139217455, + "grad_norm": 5.154534816741943, + "learning_rate": 3.5080476434637574e-05, + "loss": 0.574, + "step": 101260 + }, + { + "epoch": 0.8952598171820577, + "grad_norm": 7.279311180114746, + "learning_rate": 3.507900304696571e-05, + "loss": 0.7546, + "step": 101270 + }, + { + "epoch": 0.89534822044237, + "grad_norm": 1.5704989433288574, + "learning_rate": 3.507752965929383e-05, + "loss": 0.5996, + "step": 101280 + }, + { + "epoch": 0.8954366237026822, + "grad_norm": 1.6563899517059326, + "learning_rate": 3.5076056271621966e-05, + "loss": 0.6485, + "step": 101290 + }, + { + "epoch": 0.8955250269629944, + "grad_norm": 2.398397207260132, + "learning_rate": 3.5074582883950094e-05, + "loss": 0.6093, + "step": 101300 + }, + { + "epoch": 0.8956134302233066, + "grad_norm": 2.6400833129882812, + "learning_rate": 3.507310949627822e-05, + "loss": 0.6433, + "step": 101310 + }, + { + "epoch": 0.8957018334836189, + "grad_norm": 1.1053946018218994, + "learning_rate": 3.507163610860635e-05, + "loss": 0.6295, + "step": 101320 + }, + { + "epoch": 0.8957902367439311, + "grad_norm": 2.3099589347839355, + "learning_rate": 3.5070162720934486e-05, + "loss": 0.6705, + "step": 101330 + }, + { + "epoch": 0.8958786400042433, + "grad_norm": 4.429075717926025, + "learning_rate": 3.506868933326261e-05, + "loss": 0.6963, + "step": 101340 + }, + { + "epoch": 0.8959670432645556, + "grad_norm": 2.8399767875671387, + "learning_rate": 3.506721594559074e-05, + "loss": 0.6676, + "step": 101350 + }, + { + "epoch": 0.8960554465248678, + "grad_norm": 2.4658477306365967, + "learning_rate": 3.506574255791887e-05, + "loss": 0.7333, + "step": 101360 + }, + { + "epoch": 0.89614384978518, + "grad_norm": 15.264657974243164, + "learning_rate": 3.5064269170247e-05, + "loss": 0.6424, + "step": 101370 + }, + { + "epoch": 0.8962322530454924, + "grad_norm": 8.383511543273926, + "learning_rate": 3.506279578257513e-05, + "loss": 0.6973, + "step": 101380 + }, + { + "epoch": 0.8963206563058046, + "grad_norm": 2.5382955074310303, + "learning_rate": 3.5061322394903256e-05, + "loss": 0.8195, + "step": 101390 + }, + { + "epoch": 0.8964090595661168, + "grad_norm": 9.689706802368164, + "learning_rate": 3.5059849007231385e-05, + "loss": 0.67, + "step": 101400 + }, + { + "epoch": 0.8964974628264291, + "grad_norm": 2.7874510288238525, + "learning_rate": 3.505837561955952e-05, + "loss": 0.7002, + "step": 101410 + }, + { + "epoch": 0.8965858660867413, + "grad_norm": 4.352518558502197, + "learning_rate": 3.505690223188765e-05, + "loss": 0.7201, + "step": 101420 + }, + { + "epoch": 0.8966742693470535, + "grad_norm": 3.670970916748047, + "learning_rate": 3.5055428844215776e-05, + "loss": 0.6365, + "step": 101430 + }, + { + "epoch": 0.8967626726073658, + "grad_norm": 2.897301197052002, + "learning_rate": 3.5053955456543905e-05, + "loss": 0.5498, + "step": 101440 + }, + { + "epoch": 0.896851075867678, + "grad_norm": 2.6691713333129883, + "learning_rate": 3.505248206887203e-05, + "loss": 0.5476, + "step": 101450 + }, + { + "epoch": 0.8969394791279902, + "grad_norm": 1.6963719129562378, + "learning_rate": 3.505100868120016e-05, + "loss": 0.7333, + "step": 101460 + }, + { + "epoch": 0.8970278823883024, + "grad_norm": 1.7317208051681519, + "learning_rate": 3.5049535293528297e-05, + "loss": 0.6961, + "step": 101470 + }, + { + "epoch": 0.8971162856486147, + "grad_norm": 2.3766930103302, + "learning_rate": 3.5048061905856425e-05, + "loss": 0.7015, + "step": 101480 + }, + { + "epoch": 0.8972046889089269, + "grad_norm": 0.9640777707099915, + "learning_rate": 3.504658851818455e-05, + "loss": 0.6253, + "step": 101490 + }, + { + "epoch": 0.8972930921692392, + "grad_norm": 5.681628704071045, + "learning_rate": 3.504511513051268e-05, + "loss": 0.636, + "step": 101500 + }, + { + "epoch": 0.8973814954295515, + "grad_norm": 3.1678149700164795, + "learning_rate": 3.504364174284081e-05, + "loss": 0.589, + "step": 101510 + }, + { + "epoch": 0.8974698986898637, + "grad_norm": 3.6972336769104004, + "learning_rate": 3.504216835516894e-05, + "loss": 0.5261, + "step": 101520 + }, + { + "epoch": 0.8975583019501759, + "grad_norm": 1.8316400051116943, + "learning_rate": 3.504069496749707e-05, + "loss": 0.6979, + "step": 101530 + }, + { + "epoch": 0.8976467052104882, + "grad_norm": 8.349693298339844, + "learning_rate": 3.50392215798252e-05, + "loss": 0.6079, + "step": 101540 + }, + { + "epoch": 0.8977351084708004, + "grad_norm": 4.87383508682251, + "learning_rate": 3.503774819215333e-05, + "loss": 0.6064, + "step": 101550 + }, + { + "epoch": 0.8978235117311126, + "grad_norm": 10.393636703491211, + "learning_rate": 3.503627480448146e-05, + "loss": 0.7332, + "step": 101560 + }, + { + "epoch": 0.8979119149914249, + "grad_norm": 6.1662492752075195, + "learning_rate": 3.503480141680959e-05, + "loss": 0.6072, + "step": 101570 + }, + { + "epoch": 0.8980003182517371, + "grad_norm": 2.561657428741455, + "learning_rate": 3.5033328029137715e-05, + "loss": 0.7546, + "step": 101580 + }, + { + "epoch": 0.8980887215120493, + "grad_norm": 2.8024821281433105, + "learning_rate": 3.5031854641465844e-05, + "loss": 0.6235, + "step": 101590 + }, + { + "epoch": 0.8981771247723616, + "grad_norm": 2.377392053604126, + "learning_rate": 3.503038125379398e-05, + "loss": 0.5911, + "step": 101600 + }, + { + "epoch": 0.8982655280326739, + "grad_norm": 3.295403242111206, + "learning_rate": 3.50289078661221e-05, + "loss": 0.6485, + "step": 101610 + }, + { + "epoch": 0.8983539312929861, + "grad_norm": 0.9697879552841187, + "learning_rate": 3.5027434478450235e-05, + "loss": 0.684, + "step": 101620 + }, + { + "epoch": 0.8984423345532984, + "grad_norm": 5.126312732696533, + "learning_rate": 3.5025961090778364e-05, + "loss": 0.5952, + "step": 101630 + }, + { + "epoch": 0.8985307378136106, + "grad_norm": 10.524635314941406, + "learning_rate": 3.502448770310649e-05, + "loss": 0.8219, + "step": 101640 + }, + { + "epoch": 0.8986191410739228, + "grad_norm": 6.454860210418701, + "learning_rate": 3.502301431543462e-05, + "loss": 0.5776, + "step": 101650 + }, + { + "epoch": 0.898707544334235, + "grad_norm": 5.484509468078613, + "learning_rate": 3.5021540927762756e-05, + "loss": 0.6469, + "step": 101660 + }, + { + "epoch": 0.8987959475945473, + "grad_norm": 9.468827247619629, + "learning_rate": 3.502006754009088e-05, + "loss": 0.6623, + "step": 101670 + }, + { + "epoch": 0.8988843508548595, + "grad_norm": 3.075105667114258, + "learning_rate": 3.501859415241901e-05, + "loss": 0.7001, + "step": 101680 + }, + { + "epoch": 0.8989727541151717, + "grad_norm": 1.0968987941741943, + "learning_rate": 3.501712076474714e-05, + "loss": 0.6179, + "step": 101690 + }, + { + "epoch": 0.899061157375484, + "grad_norm": 3.3807880878448486, + "learning_rate": 3.501564737707527e-05, + "loss": 0.6345, + "step": 101700 + }, + { + "epoch": 0.8991495606357962, + "grad_norm": 7.7511067390441895, + "learning_rate": 3.50141739894034e-05, + "loss": 0.7584, + "step": 101710 + }, + { + "epoch": 0.8992379638961084, + "grad_norm": 3.7797470092773438, + "learning_rate": 3.501270060173153e-05, + "loss": 0.6759, + "step": 101720 + }, + { + "epoch": 0.8993263671564208, + "grad_norm": 2.490063190460205, + "learning_rate": 3.5011227214059654e-05, + "loss": 0.6778, + "step": 101730 + }, + { + "epoch": 0.899414770416733, + "grad_norm": 3.2917134761810303, + "learning_rate": 3.500975382638779e-05, + "loss": 0.7253, + "step": 101740 + }, + { + "epoch": 0.8995031736770452, + "grad_norm": 2.1594207286834717, + "learning_rate": 3.500828043871591e-05, + "loss": 0.6469, + "step": 101750 + }, + { + "epoch": 0.8995915769373575, + "grad_norm": 2.15397047996521, + "learning_rate": 3.5006807051044046e-05, + "loss": 0.6228, + "step": 101760 + }, + { + "epoch": 0.8996799801976697, + "grad_norm": 3.32759165763855, + "learning_rate": 3.5005333663372174e-05, + "loss": 0.6176, + "step": 101770 + }, + { + "epoch": 0.8997683834579819, + "grad_norm": 1.747042179107666, + "learning_rate": 3.50038602757003e-05, + "loss": 0.7409, + "step": 101780 + }, + { + "epoch": 0.8998567867182942, + "grad_norm": 3.225634813308716, + "learning_rate": 3.500238688802843e-05, + "loss": 0.7049, + "step": 101790 + }, + { + "epoch": 0.8999451899786064, + "grad_norm": 4.23330020904541, + "learning_rate": 3.5000913500356566e-05, + "loss": 0.6701, + "step": 101800 + }, + { + "epoch": 0.9000335932389186, + "grad_norm": 1.5757709741592407, + "learning_rate": 3.499944011268469e-05, + "loss": 0.5276, + "step": 101810 + }, + { + "epoch": 0.9001219964992309, + "grad_norm": 2.1911356449127197, + "learning_rate": 3.499796672501282e-05, + "loss": 0.6656, + "step": 101820 + }, + { + "epoch": 0.9002103997595431, + "grad_norm": 1.4769911766052246, + "learning_rate": 3.499649333734095e-05, + "loss": 0.5302, + "step": 101830 + }, + { + "epoch": 0.9002988030198553, + "grad_norm": 2.9458811283111572, + "learning_rate": 3.499501994966908e-05, + "loss": 0.5677, + "step": 101840 + }, + { + "epoch": 0.9003872062801677, + "grad_norm": 4.243816375732422, + "learning_rate": 3.499354656199721e-05, + "loss": 0.6722, + "step": 101850 + }, + { + "epoch": 0.9004756095404799, + "grad_norm": 2.830928087234497, + "learning_rate": 3.4992073174325336e-05, + "loss": 0.7675, + "step": 101860 + }, + { + "epoch": 0.9005640128007921, + "grad_norm": 7.463563919067383, + "learning_rate": 3.4990599786653465e-05, + "loss": 0.6429, + "step": 101870 + }, + { + "epoch": 0.9006524160611044, + "grad_norm": 3.8281073570251465, + "learning_rate": 3.49891263989816e-05, + "loss": 0.5814, + "step": 101880 + }, + { + "epoch": 0.9007408193214166, + "grad_norm": 2.409959554672241, + "learning_rate": 3.498765301130972e-05, + "loss": 0.7404, + "step": 101890 + }, + { + "epoch": 0.9008292225817288, + "grad_norm": 8.971677780151367, + "learning_rate": 3.4986179623637856e-05, + "loss": 0.6807, + "step": 101900 + }, + { + "epoch": 0.900917625842041, + "grad_norm": 5.7753472328186035, + "learning_rate": 3.4984706235965985e-05, + "loss": 0.6223, + "step": 101910 + }, + { + "epoch": 0.9010060291023533, + "grad_norm": 4.067429542541504, + "learning_rate": 3.498323284829411e-05, + "loss": 0.5396, + "step": 101920 + }, + { + "epoch": 0.9010944323626655, + "grad_norm": 1.4998195171356201, + "learning_rate": 3.498175946062224e-05, + "loss": 0.6694, + "step": 101930 + }, + { + "epoch": 0.9011828356229777, + "grad_norm": 3.7504825592041016, + "learning_rate": 3.498028607295038e-05, + "loss": 0.7412, + "step": 101940 + }, + { + "epoch": 0.90127123888329, + "grad_norm": 6.002006530761719, + "learning_rate": 3.49788126852785e-05, + "loss": 0.742, + "step": 101950 + }, + { + "epoch": 0.9013596421436022, + "grad_norm": 3.166353940963745, + "learning_rate": 3.497733929760663e-05, + "loss": 0.6574, + "step": 101960 + }, + { + "epoch": 0.9014480454039145, + "grad_norm": 8.615494728088379, + "learning_rate": 3.4975865909934755e-05, + "loss": 0.6139, + "step": 101970 + }, + { + "epoch": 0.9015364486642268, + "grad_norm": 13.972883224487305, + "learning_rate": 3.497439252226289e-05, + "loss": 0.6576, + "step": 101980 + }, + { + "epoch": 0.901624851924539, + "grad_norm": 2.6312336921691895, + "learning_rate": 3.497291913459102e-05, + "loss": 0.6305, + "step": 101990 + }, + { + "epoch": 0.9017132551848512, + "grad_norm": 12.213671684265137, + "learning_rate": 3.497144574691915e-05, + "loss": 0.7336, + "step": 102000 + }, + { + "epoch": 0.9018016584451635, + "grad_norm": 1.814104676246643, + "learning_rate": 3.4969972359247275e-05, + "loss": 0.505, + "step": 102010 + }, + { + "epoch": 0.9018900617054757, + "grad_norm": 1.0694621801376343, + "learning_rate": 3.496849897157541e-05, + "loss": 0.5508, + "step": 102020 + }, + { + "epoch": 0.9019784649657879, + "grad_norm": 1.7550774812698364, + "learning_rate": 3.496702558390353e-05, + "loss": 0.7223, + "step": 102030 + }, + { + "epoch": 0.9020668682261002, + "grad_norm": 4.128680229187012, + "learning_rate": 3.496555219623167e-05, + "loss": 0.586, + "step": 102040 + }, + { + "epoch": 0.9021552714864124, + "grad_norm": 4.750894546508789, + "learning_rate": 3.4964078808559795e-05, + "loss": 0.6127, + "step": 102050 + }, + { + "epoch": 0.9022436747467246, + "grad_norm": 4.454137802124023, + "learning_rate": 3.4962605420887924e-05, + "loss": 0.7838, + "step": 102060 + }, + { + "epoch": 0.9023320780070369, + "grad_norm": 2.62306547164917, + "learning_rate": 3.496113203321605e-05, + "loss": 0.721, + "step": 102070 + }, + { + "epoch": 0.9024204812673492, + "grad_norm": 2.284649133682251, + "learning_rate": 3.495965864554418e-05, + "loss": 0.6146, + "step": 102080 + }, + { + "epoch": 0.9025088845276614, + "grad_norm": 2.9690709114074707, + "learning_rate": 3.495818525787231e-05, + "loss": 0.5991, + "step": 102090 + }, + { + "epoch": 0.9025972877879737, + "grad_norm": 2.8630611896514893, + "learning_rate": 3.4956711870200444e-05, + "loss": 0.5894, + "step": 102100 + }, + { + "epoch": 0.9026856910482859, + "grad_norm": 5.967788219451904, + "learning_rate": 3.4955238482528565e-05, + "loss": 0.6866, + "step": 102110 + }, + { + "epoch": 0.9027740943085981, + "grad_norm": 1.9666553735733032, + "learning_rate": 3.49537650948567e-05, + "loss": 0.6933, + "step": 102120 + }, + { + "epoch": 0.9028624975689103, + "grad_norm": 1.8393679857254028, + "learning_rate": 3.495229170718483e-05, + "loss": 0.6108, + "step": 102130 + }, + { + "epoch": 0.9029509008292226, + "grad_norm": 2.8642807006835938, + "learning_rate": 3.495081831951296e-05, + "loss": 0.6309, + "step": 102140 + }, + { + "epoch": 0.9030393040895348, + "grad_norm": 1.292883276939392, + "learning_rate": 3.4949344931841086e-05, + "loss": 0.4453, + "step": 102150 + }, + { + "epoch": 0.903127707349847, + "grad_norm": 8.02357292175293, + "learning_rate": 3.494787154416922e-05, + "loss": 0.6735, + "step": 102160 + }, + { + "epoch": 0.9032161106101593, + "grad_norm": 9.84030818939209, + "learning_rate": 3.494639815649734e-05, + "loss": 0.5224, + "step": 102170 + }, + { + "epoch": 0.9033045138704715, + "grad_norm": 9.755715370178223, + "learning_rate": 3.494492476882548e-05, + "loss": 0.5777, + "step": 102180 + }, + { + "epoch": 0.9033929171307837, + "grad_norm": 2.8910040855407715, + "learning_rate": 3.4943451381153606e-05, + "loss": 0.8668, + "step": 102190 + }, + { + "epoch": 0.9034813203910961, + "grad_norm": 2.0276455879211426, + "learning_rate": 3.4941977993481734e-05, + "loss": 0.6185, + "step": 102200 + }, + { + "epoch": 0.9035697236514083, + "grad_norm": 5.360898017883301, + "learning_rate": 3.494050460580986e-05, + "loss": 0.6512, + "step": 102210 + }, + { + "epoch": 0.9036581269117205, + "grad_norm": 1.9048824310302734, + "learning_rate": 3.493903121813799e-05, + "loss": 0.5484, + "step": 102220 + }, + { + "epoch": 0.9037465301720328, + "grad_norm": 15.215572357177734, + "learning_rate": 3.493755783046612e-05, + "loss": 0.6705, + "step": 102230 + }, + { + "epoch": 0.903834933432345, + "grad_norm": 2.206956386566162, + "learning_rate": 3.4936084442794254e-05, + "loss": 0.6876, + "step": 102240 + }, + { + "epoch": 0.9039233366926572, + "grad_norm": 2.6515204906463623, + "learning_rate": 3.4934611055122376e-05, + "loss": 0.5921, + "step": 102250 + }, + { + "epoch": 0.9040117399529695, + "grad_norm": 4.575997829437256, + "learning_rate": 3.493313766745051e-05, + "loss": 0.5741, + "step": 102260 + }, + { + "epoch": 0.9041001432132817, + "grad_norm": 8.7185697555542, + "learning_rate": 3.493166427977864e-05, + "loss": 0.5363, + "step": 102270 + }, + { + "epoch": 0.9041885464735939, + "grad_norm": 1.7065107822418213, + "learning_rate": 3.493019089210677e-05, + "loss": 0.5292, + "step": 102280 + }, + { + "epoch": 0.9042769497339062, + "grad_norm": 3.5478756427764893, + "learning_rate": 3.4928717504434896e-05, + "loss": 0.5829, + "step": 102290 + }, + { + "epoch": 0.9043653529942184, + "grad_norm": 2.1791701316833496, + "learning_rate": 3.492724411676303e-05, + "loss": 0.5794, + "step": 102300 + }, + { + "epoch": 0.9044537562545306, + "grad_norm": 4.497156143188477, + "learning_rate": 3.492577072909115e-05, + "loss": 0.5913, + "step": 102310 + }, + { + "epoch": 0.904542159514843, + "grad_norm": 2.201439142227173, + "learning_rate": 3.492429734141929e-05, + "loss": 0.7369, + "step": 102320 + }, + { + "epoch": 0.9046305627751552, + "grad_norm": 15.413252830505371, + "learning_rate": 3.4922823953747416e-05, + "loss": 0.5469, + "step": 102330 + }, + { + "epoch": 0.9047189660354674, + "grad_norm": 9.893781661987305, + "learning_rate": 3.4921350566075545e-05, + "loss": 0.5994, + "step": 102340 + }, + { + "epoch": 0.9048073692957797, + "grad_norm": 6.480618953704834, + "learning_rate": 3.491987717840367e-05, + "loss": 0.5544, + "step": 102350 + }, + { + "epoch": 0.9048957725560919, + "grad_norm": 7.362419605255127, + "learning_rate": 3.49184037907318e-05, + "loss": 0.7763, + "step": 102360 + }, + { + "epoch": 0.9049841758164041, + "grad_norm": 1.6436314582824707, + "learning_rate": 3.491693040305993e-05, + "loss": 0.6397, + "step": 102370 + }, + { + "epoch": 0.9050725790767163, + "grad_norm": 0.9055808782577515, + "learning_rate": 3.4915457015388065e-05, + "loss": 0.5555, + "step": 102380 + }, + { + "epoch": 0.9051609823370286, + "grad_norm": 1.5474635362625122, + "learning_rate": 3.491398362771619e-05, + "loss": 0.648, + "step": 102390 + }, + { + "epoch": 0.9052493855973408, + "grad_norm": 1.456683874130249, + "learning_rate": 3.491251024004432e-05, + "loss": 0.7182, + "step": 102400 + }, + { + "epoch": 0.905337788857653, + "grad_norm": 6.114360809326172, + "learning_rate": 3.491103685237245e-05, + "loss": 0.5114, + "step": 102410 + }, + { + "epoch": 0.9054261921179653, + "grad_norm": 1.343212366104126, + "learning_rate": 3.490956346470058e-05, + "loss": 0.5927, + "step": 102420 + }, + { + "epoch": 0.9055145953782775, + "grad_norm": 2.227585792541504, + "learning_rate": 3.490809007702871e-05, + "loss": 0.5756, + "step": 102430 + }, + { + "epoch": 0.9056029986385898, + "grad_norm": 2.8722617626190186, + "learning_rate": 3.4906616689356835e-05, + "loss": 0.7209, + "step": 102440 + }, + { + "epoch": 0.9056914018989021, + "grad_norm": 4.4949212074279785, + "learning_rate": 3.490514330168497e-05, + "loss": 0.6218, + "step": 102450 + }, + { + "epoch": 0.9057798051592143, + "grad_norm": 8.657486915588379, + "learning_rate": 3.49036699140131e-05, + "loss": 0.6459, + "step": 102460 + }, + { + "epoch": 0.9058682084195265, + "grad_norm": 2.9187140464782715, + "learning_rate": 3.490219652634123e-05, + "loss": 0.557, + "step": 102470 + }, + { + "epoch": 0.9059566116798388, + "grad_norm": 8.384116172790527, + "learning_rate": 3.4900723138669355e-05, + "loss": 0.7111, + "step": 102480 + }, + { + "epoch": 0.906045014940151, + "grad_norm": 3.899387836456299, + "learning_rate": 3.4899249750997484e-05, + "loss": 0.5597, + "step": 102490 + }, + { + "epoch": 0.9061334182004632, + "grad_norm": 5.011263847351074, + "learning_rate": 3.489777636332561e-05, + "loss": 0.6535, + "step": 102500 + }, + { + "epoch": 0.9062218214607755, + "grad_norm": 1.0646131038665771, + "learning_rate": 3.489630297565375e-05, + "loss": 0.6902, + "step": 102510 + }, + { + "epoch": 0.9063102247210877, + "grad_norm": 2.9479987621307373, + "learning_rate": 3.4894829587981875e-05, + "loss": 0.6589, + "step": 102520 + }, + { + "epoch": 0.9063986279813999, + "grad_norm": 3.5728771686553955, + "learning_rate": 3.4893356200310004e-05, + "loss": 0.529, + "step": 102530 + }, + { + "epoch": 0.9064870312417121, + "grad_norm": 1.4704993963241577, + "learning_rate": 3.489188281263813e-05, + "loss": 0.7254, + "step": 102540 + }, + { + "epoch": 0.9065754345020244, + "grad_norm": 4.313514709472656, + "learning_rate": 3.489040942496626e-05, + "loss": 0.5275, + "step": 102550 + }, + { + "epoch": 0.9066638377623367, + "grad_norm": 4.278264045715332, + "learning_rate": 3.488893603729439e-05, + "loss": 0.7614, + "step": 102560 + }, + { + "epoch": 0.906752241022649, + "grad_norm": 3.022684335708618, + "learning_rate": 3.4887462649622524e-05, + "loss": 0.584, + "step": 102570 + }, + { + "epoch": 0.9068406442829612, + "grad_norm": 2.240894317626953, + "learning_rate": 3.4885989261950646e-05, + "loss": 0.7421, + "step": 102580 + }, + { + "epoch": 0.9069290475432734, + "grad_norm": 2.047661781311035, + "learning_rate": 3.488451587427878e-05, + "loss": 0.6114, + "step": 102590 + }, + { + "epoch": 0.9070174508035856, + "grad_norm": 5.01424503326416, + "learning_rate": 3.488304248660691e-05, + "loss": 0.6426, + "step": 102600 + }, + { + "epoch": 0.9071058540638979, + "grad_norm": 1.331560730934143, + "learning_rate": 3.488156909893504e-05, + "loss": 0.7314, + "step": 102610 + }, + { + "epoch": 0.9071942573242101, + "grad_norm": 1.643265724182129, + "learning_rate": 3.4880095711263166e-05, + "loss": 0.7225, + "step": 102620 + }, + { + "epoch": 0.9072826605845223, + "grad_norm": 5.257772445678711, + "learning_rate": 3.48786223235913e-05, + "loss": 0.6232, + "step": 102630 + }, + { + "epoch": 0.9073710638448346, + "grad_norm": 4.28574800491333, + "learning_rate": 3.487714893591942e-05, + "loss": 0.6348, + "step": 102640 + }, + { + "epoch": 0.9074594671051468, + "grad_norm": 1.86776864528656, + "learning_rate": 3.487567554824756e-05, + "loss": 0.6368, + "step": 102650 + }, + { + "epoch": 0.907547870365459, + "grad_norm": 5.235930919647217, + "learning_rate": 3.4874202160575686e-05, + "loss": 0.6792, + "step": 102660 + }, + { + "epoch": 0.9076362736257714, + "grad_norm": 3.324723243713379, + "learning_rate": 3.4872728772903814e-05, + "loss": 0.6396, + "step": 102670 + }, + { + "epoch": 0.9077246768860836, + "grad_norm": 3.4838247299194336, + "learning_rate": 3.487125538523194e-05, + "loss": 0.791, + "step": 102680 + }, + { + "epoch": 0.9078130801463958, + "grad_norm": 3.4456939697265625, + "learning_rate": 3.486978199756007e-05, + "loss": 0.6986, + "step": 102690 + }, + { + "epoch": 0.9079014834067081, + "grad_norm": 3.2096827030181885, + "learning_rate": 3.48683086098882e-05, + "loss": 0.6852, + "step": 102700 + }, + { + "epoch": 0.9079898866670203, + "grad_norm": 8.984938621520996, + "learning_rate": 3.4866835222216334e-05, + "loss": 0.7243, + "step": 102710 + }, + { + "epoch": 0.9080782899273325, + "grad_norm": 2.8154518604278564, + "learning_rate": 3.4865361834544456e-05, + "loss": 0.6874, + "step": 102720 + }, + { + "epoch": 0.9081666931876448, + "grad_norm": 1.7001768350601196, + "learning_rate": 3.486388844687259e-05, + "loss": 0.5491, + "step": 102730 + }, + { + "epoch": 0.908255096447957, + "grad_norm": 3.0020084381103516, + "learning_rate": 3.486241505920072e-05, + "loss": 0.6214, + "step": 102740 + }, + { + "epoch": 0.9083434997082692, + "grad_norm": 1.2392884492874146, + "learning_rate": 3.486094167152885e-05, + "loss": 0.6687, + "step": 102750 + }, + { + "epoch": 0.9084319029685815, + "grad_norm": 2.5042834281921387, + "learning_rate": 3.4859468283856976e-05, + "loss": 0.7043, + "step": 102760 + }, + { + "epoch": 0.9085203062288937, + "grad_norm": 2.7333014011383057, + "learning_rate": 3.485799489618511e-05, + "loss": 0.7307, + "step": 102770 + }, + { + "epoch": 0.9086087094892059, + "grad_norm": 2.7315826416015625, + "learning_rate": 3.485652150851323e-05, + "loss": 0.6583, + "step": 102780 + }, + { + "epoch": 0.9086971127495183, + "grad_norm": 6.964535713195801, + "learning_rate": 3.485504812084137e-05, + "loss": 0.5731, + "step": 102790 + }, + { + "epoch": 0.9087855160098305, + "grad_norm": 2.9444174766540527, + "learning_rate": 3.485357473316949e-05, + "loss": 0.7838, + "step": 102800 + }, + { + "epoch": 0.9088739192701427, + "grad_norm": 3.931525707244873, + "learning_rate": 3.4852101345497625e-05, + "loss": 0.639, + "step": 102810 + }, + { + "epoch": 0.908962322530455, + "grad_norm": 2.748145818710327, + "learning_rate": 3.485062795782575e-05, + "loss": 0.6461, + "step": 102820 + }, + { + "epoch": 0.9090507257907672, + "grad_norm": 8.574252128601074, + "learning_rate": 3.484915457015388e-05, + "loss": 0.7263, + "step": 102830 + }, + { + "epoch": 0.9091391290510794, + "grad_norm": 9.556109428405762, + "learning_rate": 3.484768118248201e-05, + "loss": 0.5527, + "step": 102840 + }, + { + "epoch": 0.9092275323113916, + "grad_norm": 0.9728810787200928, + "learning_rate": 3.4846207794810145e-05, + "loss": 0.5305, + "step": 102850 + }, + { + "epoch": 0.9093159355717039, + "grad_norm": 6.62507438659668, + "learning_rate": 3.4844734407138267e-05, + "loss": 0.5505, + "step": 102860 + }, + { + "epoch": 0.9094043388320161, + "grad_norm": 4.307148456573486, + "learning_rate": 3.48432610194664e-05, + "loss": 0.7932, + "step": 102870 + }, + { + "epoch": 0.9094927420923283, + "grad_norm": 1.1886510848999023, + "learning_rate": 3.484178763179453e-05, + "loss": 0.5498, + "step": 102880 + }, + { + "epoch": 0.9095811453526406, + "grad_norm": 4.167265892028809, + "learning_rate": 3.484031424412266e-05, + "loss": 0.6188, + "step": 102890 + }, + { + "epoch": 0.9096695486129528, + "grad_norm": 13.898412704467773, + "learning_rate": 3.483884085645079e-05, + "loss": 0.5631, + "step": 102900 + }, + { + "epoch": 0.9097579518732651, + "grad_norm": 1.6460766792297363, + "learning_rate": 3.4837367468778915e-05, + "loss": 0.6439, + "step": 102910 + }, + { + "epoch": 0.9098463551335774, + "grad_norm": 1.5120488405227661, + "learning_rate": 3.4835894081107043e-05, + "loss": 0.7586, + "step": 102920 + }, + { + "epoch": 0.9099347583938896, + "grad_norm": 1.4673289060592651, + "learning_rate": 3.483442069343518e-05, + "loss": 0.7212, + "step": 102930 + }, + { + "epoch": 0.9100231616542018, + "grad_norm": 9.395055770874023, + "learning_rate": 3.48329473057633e-05, + "loss": 0.7843, + "step": 102940 + }, + { + "epoch": 0.9101115649145141, + "grad_norm": 1.9193661212921143, + "learning_rate": 3.4831473918091435e-05, + "loss": 0.6455, + "step": 102950 + }, + { + "epoch": 0.9101999681748263, + "grad_norm": 7.308329105377197, + "learning_rate": 3.4830000530419564e-05, + "loss": 0.7136, + "step": 102960 + }, + { + "epoch": 0.9102883714351385, + "grad_norm": 5.894074440002441, + "learning_rate": 3.482852714274769e-05, + "loss": 0.5992, + "step": 102970 + }, + { + "epoch": 0.9103767746954508, + "grad_norm": 5.067314147949219, + "learning_rate": 3.482705375507582e-05, + "loss": 0.6705, + "step": 102980 + }, + { + "epoch": 0.910465177955763, + "grad_norm": 2.852985143661499, + "learning_rate": 3.4825580367403955e-05, + "loss": 0.7311, + "step": 102990 + }, + { + "epoch": 0.9105535812160752, + "grad_norm": 1.2534308433532715, + "learning_rate": 3.482410697973208e-05, + "loss": 0.6228, + "step": 103000 + }, + { + "epoch": 0.9106419844763874, + "grad_norm": 5.065445899963379, + "learning_rate": 3.482263359206021e-05, + "loss": 0.5424, + "step": 103010 + }, + { + "epoch": 0.9107303877366997, + "grad_norm": 1.254294991493225, + "learning_rate": 3.482116020438834e-05, + "loss": 0.6666, + "step": 103020 + }, + { + "epoch": 0.910818790997012, + "grad_norm": 3.0075409412384033, + "learning_rate": 3.481968681671647e-05, + "loss": 0.6205, + "step": 103030 + }, + { + "epoch": 0.9109071942573242, + "grad_norm": 1.6168415546417236, + "learning_rate": 3.48182134290446e-05, + "loss": 0.6119, + "step": 103040 + }, + { + "epoch": 0.9109955975176365, + "grad_norm": 2.2849974632263184, + "learning_rate": 3.4816740041372726e-05, + "loss": 0.548, + "step": 103050 + }, + { + "epoch": 0.9110840007779487, + "grad_norm": 1.2676775455474854, + "learning_rate": 3.4815266653700854e-05, + "loss": 0.7602, + "step": 103060 + }, + { + "epoch": 0.9111724040382609, + "grad_norm": 7.243632793426514, + "learning_rate": 3.481379326602899e-05, + "loss": 0.5948, + "step": 103070 + }, + { + "epoch": 0.9112608072985732, + "grad_norm": 1.9729849100112915, + "learning_rate": 3.481231987835711e-05, + "loss": 0.5694, + "step": 103080 + }, + { + "epoch": 0.9113492105588854, + "grad_norm": 2.3196303844451904, + "learning_rate": 3.4810846490685246e-05, + "loss": 0.6395, + "step": 103090 + }, + { + "epoch": 0.9114376138191976, + "grad_norm": 8.909586906433105, + "learning_rate": 3.4809373103013374e-05, + "loss": 0.6553, + "step": 103100 + }, + { + "epoch": 0.9115260170795099, + "grad_norm": 2.7305219173431396, + "learning_rate": 3.48078997153415e-05, + "loss": 0.5914, + "step": 103110 + }, + { + "epoch": 0.9116144203398221, + "grad_norm": 12.29466438293457, + "learning_rate": 3.480642632766963e-05, + "loss": 0.7054, + "step": 103120 + }, + { + "epoch": 0.9117028236001343, + "grad_norm": 1.8778612613677979, + "learning_rate": 3.4804952939997766e-05, + "loss": 0.7085, + "step": 103130 + }, + { + "epoch": 0.9117912268604467, + "grad_norm": 23.512847900390625, + "learning_rate": 3.480347955232589e-05, + "loss": 0.6218, + "step": 103140 + }, + { + "epoch": 0.9118796301207589, + "grad_norm": 6.219461441040039, + "learning_rate": 3.480200616465402e-05, + "loss": 0.795, + "step": 103150 + }, + { + "epoch": 0.9119680333810711, + "grad_norm": 10.840205192565918, + "learning_rate": 3.4800532776982144e-05, + "loss": 0.675, + "step": 103160 + }, + { + "epoch": 0.9120564366413834, + "grad_norm": 3.672513723373413, + "learning_rate": 3.479905938931028e-05, + "loss": 0.7038, + "step": 103170 + }, + { + "epoch": 0.9121448399016956, + "grad_norm": 4.021480560302734, + "learning_rate": 3.479758600163841e-05, + "loss": 0.6334, + "step": 103180 + }, + { + "epoch": 0.9122332431620078, + "grad_norm": 1.8859593868255615, + "learning_rate": 3.4796112613966536e-05, + "loss": 0.6976, + "step": 103190 + }, + { + "epoch": 0.91232164642232, + "grad_norm": 2.477224826812744, + "learning_rate": 3.4794639226294664e-05, + "loss": 0.7565, + "step": 103200 + }, + { + "epoch": 0.9124100496826323, + "grad_norm": 3.812579870223999, + "learning_rate": 3.47931658386228e-05, + "loss": 0.6198, + "step": 103210 + }, + { + "epoch": 0.9124984529429445, + "grad_norm": 7.9026970863342285, + "learning_rate": 3.479169245095092e-05, + "loss": 0.582, + "step": 103220 + }, + { + "epoch": 0.9125868562032567, + "grad_norm": 2.329782009124756, + "learning_rate": 3.4790219063279056e-05, + "loss": 0.6175, + "step": 103230 + }, + { + "epoch": 0.912675259463569, + "grad_norm": 2.1884875297546387, + "learning_rate": 3.4788745675607185e-05, + "loss": 0.5448, + "step": 103240 + }, + { + "epoch": 0.9127636627238812, + "grad_norm": 3.888169288635254, + "learning_rate": 3.478727228793531e-05, + "loss": 0.7435, + "step": 103250 + }, + { + "epoch": 0.9128520659841936, + "grad_norm": 6.73412561416626, + "learning_rate": 3.478579890026344e-05, + "loss": 0.6516, + "step": 103260 + }, + { + "epoch": 0.9129404692445058, + "grad_norm": 5.661271572113037, + "learning_rate": 3.478432551259157e-05, + "loss": 0.5616, + "step": 103270 + }, + { + "epoch": 0.913028872504818, + "grad_norm": 5.905777931213379, + "learning_rate": 3.4782852124919705e-05, + "loss": 0.5896, + "step": 103280 + }, + { + "epoch": 0.9131172757651302, + "grad_norm": 1.270121693611145, + "learning_rate": 3.478137873724783e-05, + "loss": 0.5912, + "step": 103290 + }, + { + "epoch": 0.9132056790254425, + "grad_norm": 3.8103644847869873, + "learning_rate": 3.477990534957596e-05, + "loss": 0.6009, + "step": 103300 + }, + { + "epoch": 0.9132940822857547, + "grad_norm": 21.119020462036133, + "learning_rate": 3.477843196190409e-05, + "loss": 0.636, + "step": 103310 + }, + { + "epoch": 0.9133824855460669, + "grad_norm": 4.260202407836914, + "learning_rate": 3.477695857423222e-05, + "loss": 0.6174, + "step": 103320 + }, + { + "epoch": 0.9134708888063792, + "grad_norm": 1.796023964881897, + "learning_rate": 3.477548518656035e-05, + "loss": 0.6095, + "step": 103330 + }, + { + "epoch": 0.9135592920666914, + "grad_norm": 1.3518072366714478, + "learning_rate": 3.477401179888848e-05, + "loss": 0.6585, + "step": 103340 + }, + { + "epoch": 0.9136476953270036, + "grad_norm": 6.710455417633057, + "learning_rate": 3.477253841121661e-05, + "loss": 0.6919, + "step": 103350 + }, + { + "epoch": 0.9137360985873159, + "grad_norm": 1.6725014448165894, + "learning_rate": 3.477106502354474e-05, + "loss": 0.6039, + "step": 103360 + }, + { + "epoch": 0.9138245018476281, + "grad_norm": 5.248466968536377, + "learning_rate": 3.476959163587287e-05, + "loss": 0.6507, + "step": 103370 + }, + { + "epoch": 0.9139129051079404, + "grad_norm": 3.784817695617676, + "learning_rate": 3.4768118248200995e-05, + "loss": 0.6993, + "step": 103380 + }, + { + "epoch": 0.9140013083682527, + "grad_norm": 6.656286716461182, + "learning_rate": 3.4766644860529124e-05, + "loss": 0.6885, + "step": 103390 + }, + { + "epoch": 0.9140897116285649, + "grad_norm": 6.79155969619751, + "learning_rate": 3.476517147285726e-05, + "loss": 0.6063, + "step": 103400 + }, + { + "epoch": 0.9141781148888771, + "grad_norm": 3.4230575561523438, + "learning_rate": 3.476369808518538e-05, + "loss": 0.754, + "step": 103410 + }, + { + "epoch": 0.9142665181491894, + "grad_norm": 1.6727087497711182, + "learning_rate": 3.4762224697513515e-05, + "loss": 0.5201, + "step": 103420 + }, + { + "epoch": 0.9143549214095016, + "grad_norm": 2.6733951568603516, + "learning_rate": 3.4760751309841644e-05, + "loss": 0.5719, + "step": 103430 + }, + { + "epoch": 0.9144433246698138, + "grad_norm": 2.1525189876556396, + "learning_rate": 3.475927792216977e-05, + "loss": 0.7658, + "step": 103440 + }, + { + "epoch": 0.914531727930126, + "grad_norm": 6.747961521148682, + "learning_rate": 3.47578045344979e-05, + "loss": 0.6899, + "step": 103450 + }, + { + "epoch": 0.9146201311904383, + "grad_norm": 10.989623069763184, + "learning_rate": 3.4756331146826036e-05, + "loss": 0.5678, + "step": 103460 + }, + { + "epoch": 0.9147085344507505, + "grad_norm": 17.048847198486328, + "learning_rate": 3.475485775915416e-05, + "loss": 0.5074, + "step": 103470 + }, + { + "epoch": 0.9147969377110627, + "grad_norm": 1.4350841045379639, + "learning_rate": 3.475338437148229e-05, + "loss": 0.6783, + "step": 103480 + }, + { + "epoch": 0.914885340971375, + "grad_norm": 2.285968542098999, + "learning_rate": 3.475191098381042e-05, + "loss": 0.6917, + "step": 103490 + }, + { + "epoch": 0.9149737442316873, + "grad_norm": 5.684361934661865, + "learning_rate": 3.475043759613855e-05, + "loss": 0.6848, + "step": 103500 + }, + { + "epoch": 0.9150621474919995, + "grad_norm": 5.974975109100342, + "learning_rate": 3.474896420846668e-05, + "loss": 0.7555, + "step": 103510 + }, + { + "epoch": 0.9151505507523118, + "grad_norm": 8.389676094055176, + "learning_rate": 3.4747490820794806e-05, + "loss": 0.6622, + "step": 103520 + }, + { + "epoch": 0.915238954012624, + "grad_norm": 7.66300630569458, + "learning_rate": 3.4746017433122934e-05, + "loss": 0.6401, + "step": 103530 + }, + { + "epoch": 0.9153273572729362, + "grad_norm": 1.6489628553390503, + "learning_rate": 3.474454404545107e-05, + "loss": 0.7141, + "step": 103540 + }, + { + "epoch": 0.9154157605332485, + "grad_norm": 5.561105251312256, + "learning_rate": 3.474307065777919e-05, + "loss": 0.5689, + "step": 103550 + }, + { + "epoch": 0.9155041637935607, + "grad_norm": 3.733860731124878, + "learning_rate": 3.4741597270107326e-05, + "loss": 0.7455, + "step": 103560 + }, + { + "epoch": 0.9155925670538729, + "grad_norm": 10.259614944458008, + "learning_rate": 3.4740123882435454e-05, + "loss": 0.6487, + "step": 103570 + }, + { + "epoch": 0.9156809703141852, + "grad_norm": 4.322904109954834, + "learning_rate": 3.473865049476358e-05, + "loss": 0.7696, + "step": 103580 + }, + { + "epoch": 0.9157693735744974, + "grad_norm": 2.1287317276000977, + "learning_rate": 3.473717710709171e-05, + "loss": 0.7048, + "step": 103590 + }, + { + "epoch": 0.9158577768348096, + "grad_norm": 3.1804282665252686, + "learning_rate": 3.4735703719419846e-05, + "loss": 0.5895, + "step": 103600 + }, + { + "epoch": 0.9159461800951219, + "grad_norm": 2.5047860145568848, + "learning_rate": 3.473423033174797e-05, + "loss": 0.5911, + "step": 103610 + }, + { + "epoch": 0.9160345833554342, + "grad_norm": 3.1896352767944336, + "learning_rate": 3.47327569440761e-05, + "loss": 0.5795, + "step": 103620 + }, + { + "epoch": 0.9161229866157464, + "grad_norm": 0.9608718752861023, + "learning_rate": 3.4731283556404224e-05, + "loss": 0.5384, + "step": 103630 + }, + { + "epoch": 0.9162113898760587, + "grad_norm": 1.7372419834136963, + "learning_rate": 3.472981016873236e-05, + "loss": 0.6734, + "step": 103640 + }, + { + "epoch": 0.9162997931363709, + "grad_norm": 1.8831795454025269, + "learning_rate": 3.472833678106049e-05, + "loss": 0.5582, + "step": 103650 + }, + { + "epoch": 0.9163881963966831, + "grad_norm": 3.647759199142456, + "learning_rate": 3.4726863393388616e-05, + "loss": 0.5827, + "step": 103660 + }, + { + "epoch": 0.9164765996569953, + "grad_norm": 3.0250210762023926, + "learning_rate": 3.4725390005716745e-05, + "loss": 0.6757, + "step": 103670 + }, + { + "epoch": 0.9165650029173076, + "grad_norm": 17.237476348876953, + "learning_rate": 3.472391661804488e-05, + "loss": 0.7058, + "step": 103680 + }, + { + "epoch": 0.9166534061776198, + "grad_norm": 3.7470288276672363, + "learning_rate": 3.4722443230373e-05, + "loss": 0.7271, + "step": 103690 + }, + { + "epoch": 0.916741809437932, + "grad_norm": 1.799844741821289, + "learning_rate": 3.4720969842701136e-05, + "loss": 0.806, + "step": 103700 + }, + { + "epoch": 0.9168302126982443, + "grad_norm": 2.8860955238342285, + "learning_rate": 3.4719496455029265e-05, + "loss": 0.6154, + "step": 103710 + }, + { + "epoch": 0.9169186159585565, + "grad_norm": 5.958944797515869, + "learning_rate": 3.471802306735739e-05, + "loss": 0.7982, + "step": 103720 + }, + { + "epoch": 0.9170070192188688, + "grad_norm": 7.22694730758667, + "learning_rate": 3.471654967968552e-05, + "loss": 0.7488, + "step": 103730 + }, + { + "epoch": 0.9170954224791811, + "grad_norm": 8.265280723571777, + "learning_rate": 3.471507629201365e-05, + "loss": 0.7001, + "step": 103740 + }, + { + "epoch": 0.9171838257394933, + "grad_norm": 6.297915935516357, + "learning_rate": 3.471360290434178e-05, + "loss": 0.6241, + "step": 103750 + }, + { + "epoch": 0.9172722289998055, + "grad_norm": 1.8345996141433716, + "learning_rate": 3.471212951666991e-05, + "loss": 0.6995, + "step": 103760 + }, + { + "epoch": 0.9173606322601178, + "grad_norm": 1.3702058792114258, + "learning_rate": 3.4710656128998035e-05, + "loss": 0.6059, + "step": 103770 + }, + { + "epoch": 0.91744903552043, + "grad_norm": 1.0202032327651978, + "learning_rate": 3.470918274132617e-05, + "loss": 0.6367, + "step": 103780 + }, + { + "epoch": 0.9175374387807422, + "grad_norm": 3.577991485595703, + "learning_rate": 3.47077093536543e-05, + "loss": 0.5804, + "step": 103790 + }, + { + "epoch": 0.9176258420410545, + "grad_norm": 1.8480812311172485, + "learning_rate": 3.470623596598243e-05, + "loss": 0.5703, + "step": 103800 + }, + { + "epoch": 0.9177142453013667, + "grad_norm": 2.2506215572357178, + "learning_rate": 3.4704762578310555e-05, + "loss": 0.6877, + "step": 103810 + }, + { + "epoch": 0.9178026485616789, + "grad_norm": 3.1972687244415283, + "learning_rate": 3.470328919063869e-05, + "loss": 0.6196, + "step": 103820 + }, + { + "epoch": 0.9178910518219912, + "grad_norm": 3.169938325881958, + "learning_rate": 3.470181580296681e-05, + "loss": 0.7236, + "step": 103830 + }, + { + "epoch": 0.9179794550823034, + "grad_norm": 2.4742043018341064, + "learning_rate": 3.470034241529495e-05, + "loss": 0.6334, + "step": 103840 + }, + { + "epoch": 0.9180678583426157, + "grad_norm": 2.081242084503174, + "learning_rate": 3.469886902762307e-05, + "loss": 0.5269, + "step": 103850 + }, + { + "epoch": 0.918156261602928, + "grad_norm": 1.9559173583984375, + "learning_rate": 3.4697395639951204e-05, + "loss": 0.7655, + "step": 103860 + }, + { + "epoch": 0.9182446648632402, + "grad_norm": 3.1912856101989746, + "learning_rate": 3.469592225227933e-05, + "loss": 0.6924, + "step": 103870 + }, + { + "epoch": 0.9183330681235524, + "grad_norm": 1.9441215991973877, + "learning_rate": 3.469444886460746e-05, + "loss": 0.7904, + "step": 103880 + }, + { + "epoch": 0.9184214713838647, + "grad_norm": 0.9783027172088623, + "learning_rate": 3.469297547693559e-05, + "loss": 0.7327, + "step": 103890 + }, + { + "epoch": 0.9185098746441769, + "grad_norm": 1.1540257930755615, + "learning_rate": 3.4691502089263724e-05, + "loss": 0.6338, + "step": 103900 + }, + { + "epoch": 0.9185982779044891, + "grad_norm": 3.16699481010437, + "learning_rate": 3.4690028701591845e-05, + "loss": 0.529, + "step": 103910 + }, + { + "epoch": 0.9186866811648013, + "grad_norm": 1.4021426439285278, + "learning_rate": 3.468855531391998e-05, + "loss": 0.5532, + "step": 103920 + }, + { + "epoch": 0.9187750844251136, + "grad_norm": 3.0290215015411377, + "learning_rate": 3.468708192624811e-05, + "loss": 0.481, + "step": 103930 + }, + { + "epoch": 0.9188634876854258, + "grad_norm": 1.3256397247314453, + "learning_rate": 3.468560853857624e-05, + "loss": 0.6867, + "step": 103940 + }, + { + "epoch": 0.918951890945738, + "grad_norm": 1.932599425315857, + "learning_rate": 3.4684135150904366e-05, + "loss": 0.6652, + "step": 103950 + }, + { + "epoch": 0.9190402942060503, + "grad_norm": 2.8910319805145264, + "learning_rate": 3.46826617632325e-05, + "loss": 0.7193, + "step": 103960 + }, + { + "epoch": 0.9191286974663626, + "grad_norm": 1.3356348276138306, + "learning_rate": 3.468118837556062e-05, + "loss": 0.6138, + "step": 103970 + }, + { + "epoch": 0.9192171007266748, + "grad_norm": 1.0583598613739014, + "learning_rate": 3.467971498788876e-05, + "loss": 0.5298, + "step": 103980 + }, + { + "epoch": 0.9193055039869871, + "grad_norm": 3.0338141918182373, + "learning_rate": 3.467824160021688e-05, + "loss": 0.6652, + "step": 103990 + }, + { + "epoch": 0.9193939072472993, + "grad_norm": 0.8351437449455261, + "learning_rate": 3.4676768212545014e-05, + "loss": 0.5462, + "step": 104000 + }, + { + "epoch": 0.9194823105076115, + "grad_norm": 4.577482223510742, + "learning_rate": 3.467529482487314e-05, + "loss": 0.7125, + "step": 104010 + }, + { + "epoch": 0.9195707137679238, + "grad_norm": 7.595093727111816, + "learning_rate": 3.467382143720127e-05, + "loss": 0.8144, + "step": 104020 + }, + { + "epoch": 0.919659117028236, + "grad_norm": 2.464247703552246, + "learning_rate": 3.46723480495294e-05, + "loss": 0.761, + "step": 104030 + }, + { + "epoch": 0.9197475202885482, + "grad_norm": 3.4661977291107178, + "learning_rate": 3.4670874661857534e-05, + "loss": 0.7069, + "step": 104040 + }, + { + "epoch": 0.9198359235488605, + "grad_norm": 3.049800157546997, + "learning_rate": 3.4669401274185656e-05, + "loss": 0.7475, + "step": 104050 + }, + { + "epoch": 0.9199243268091727, + "grad_norm": 4.5359416007995605, + "learning_rate": 3.466792788651379e-05, + "loss": 0.6148, + "step": 104060 + }, + { + "epoch": 0.9200127300694849, + "grad_norm": 5.870856761932373, + "learning_rate": 3.466645449884192e-05, + "loss": 0.5147, + "step": 104070 + }, + { + "epoch": 0.9201011333297971, + "grad_norm": 5.160520553588867, + "learning_rate": 3.466498111117005e-05, + "loss": 0.663, + "step": 104080 + }, + { + "epoch": 0.9201895365901095, + "grad_norm": 3.9021685123443604, + "learning_rate": 3.4663507723498176e-05, + "loss": 0.5629, + "step": 104090 + }, + { + "epoch": 0.9202779398504217, + "grad_norm": 0.8227945566177368, + "learning_rate": 3.4662034335826304e-05, + "loss": 0.5951, + "step": 104100 + }, + { + "epoch": 0.920366343110734, + "grad_norm": 5.3351335525512695, + "learning_rate": 3.466056094815443e-05, + "loss": 0.5594, + "step": 104110 + }, + { + "epoch": 0.9204547463710462, + "grad_norm": 2.647221326828003, + "learning_rate": 3.465908756048257e-05, + "loss": 0.6732, + "step": 104120 + }, + { + "epoch": 0.9205431496313584, + "grad_norm": 3.9328064918518066, + "learning_rate": 3.4657614172810696e-05, + "loss": 0.717, + "step": 104130 + }, + { + "epoch": 0.9206315528916706, + "grad_norm": 3.648874044418335, + "learning_rate": 3.4656140785138825e-05, + "loss": 0.6789, + "step": 104140 + }, + { + "epoch": 0.9207199561519829, + "grad_norm": 4.600849628448486, + "learning_rate": 3.465466739746695e-05, + "loss": 0.6565, + "step": 104150 + }, + { + "epoch": 0.9208083594122951, + "grad_norm": 0.8460045456886292, + "learning_rate": 3.465319400979508e-05, + "loss": 0.6734, + "step": 104160 + }, + { + "epoch": 0.9208967626726073, + "grad_norm": 3.087242841720581, + "learning_rate": 3.465172062212321e-05, + "loss": 0.7499, + "step": 104170 + }, + { + "epoch": 0.9209851659329196, + "grad_norm": 6.493686676025391, + "learning_rate": 3.4650247234451345e-05, + "loss": 0.6004, + "step": 104180 + }, + { + "epoch": 0.9210735691932318, + "grad_norm": 3.5971786975860596, + "learning_rate": 3.464877384677947e-05, + "loss": 0.7482, + "step": 104190 + }, + { + "epoch": 0.921161972453544, + "grad_norm": 5.517910957336426, + "learning_rate": 3.46473004591076e-05, + "loss": 0.6536, + "step": 104200 + }, + { + "epoch": 0.9212503757138564, + "grad_norm": 2.8508706092834473, + "learning_rate": 3.464582707143573e-05, + "loss": 0.5906, + "step": 104210 + }, + { + "epoch": 0.9213387789741686, + "grad_norm": 1.2400574684143066, + "learning_rate": 3.464435368376386e-05, + "loss": 0.5976, + "step": 104220 + }, + { + "epoch": 0.9214271822344808, + "grad_norm": 3.411325216293335, + "learning_rate": 3.4642880296091987e-05, + "loss": 0.6162, + "step": 104230 + }, + { + "epoch": 0.9215155854947931, + "grad_norm": 2.050952911376953, + "learning_rate": 3.4641406908420115e-05, + "loss": 0.6717, + "step": 104240 + }, + { + "epoch": 0.9216039887551053, + "grad_norm": 1.7029166221618652, + "learning_rate": 3.463993352074825e-05, + "loss": 0.7187, + "step": 104250 + }, + { + "epoch": 0.9216923920154175, + "grad_norm": 4.325075626373291, + "learning_rate": 3.463846013307638e-05, + "loss": 0.7069, + "step": 104260 + }, + { + "epoch": 0.9217807952757298, + "grad_norm": 2.1368467807769775, + "learning_rate": 3.463698674540451e-05, + "loss": 0.6641, + "step": 104270 + }, + { + "epoch": 0.921869198536042, + "grad_norm": 2.4817216396331787, + "learning_rate": 3.4635513357732635e-05, + "loss": 0.5852, + "step": 104280 + }, + { + "epoch": 0.9219576017963542, + "grad_norm": 14.611276626586914, + "learning_rate": 3.4634039970060763e-05, + "loss": 0.633, + "step": 104290 + }, + { + "epoch": 0.9220460050566665, + "grad_norm": 2.314023733139038, + "learning_rate": 3.463256658238889e-05, + "loss": 0.6533, + "step": 104300 + }, + { + "epoch": 0.9221344083169787, + "grad_norm": 4.543471336364746, + "learning_rate": 3.463109319471703e-05, + "loss": 0.6756, + "step": 104310 + }, + { + "epoch": 0.922222811577291, + "grad_norm": 2.733240842819214, + "learning_rate": 3.462961980704515e-05, + "loss": 0.7268, + "step": 104320 + }, + { + "epoch": 0.9223112148376033, + "grad_norm": 3.4160687923431396, + "learning_rate": 3.4628146419373284e-05, + "loss": 0.6346, + "step": 104330 + }, + { + "epoch": 0.9223996180979155, + "grad_norm": 4.000524997711182, + "learning_rate": 3.462667303170141e-05, + "loss": 0.5833, + "step": 104340 + }, + { + "epoch": 0.9224880213582277, + "grad_norm": 3.380833387374878, + "learning_rate": 3.462519964402954e-05, + "loss": 0.6004, + "step": 104350 + }, + { + "epoch": 0.92257642461854, + "grad_norm": 7.3308820724487305, + "learning_rate": 3.462372625635767e-05, + "loss": 0.6014, + "step": 104360 + }, + { + "epoch": 0.9226648278788522, + "grad_norm": 2.7283005714416504, + "learning_rate": 3.4622252868685804e-05, + "loss": 0.5989, + "step": 104370 + }, + { + "epoch": 0.9227532311391644, + "grad_norm": 2.2377896308898926, + "learning_rate": 3.4620779481013925e-05, + "loss": 0.6806, + "step": 104380 + }, + { + "epoch": 0.9228416343994766, + "grad_norm": 2.443495988845825, + "learning_rate": 3.461930609334206e-05, + "loss": 0.664, + "step": 104390 + }, + { + "epoch": 0.9229300376597889, + "grad_norm": 2.7558369636535645, + "learning_rate": 3.461783270567019e-05, + "loss": 0.7429, + "step": 104400 + }, + { + "epoch": 0.9230184409201011, + "grad_norm": 1.415323257446289, + "learning_rate": 3.461635931799832e-05, + "loss": 0.7924, + "step": 104410 + }, + { + "epoch": 0.9231068441804133, + "grad_norm": 3.840822696685791, + "learning_rate": 3.4614885930326446e-05, + "loss": 0.6435, + "step": 104420 + }, + { + "epoch": 0.9231952474407256, + "grad_norm": 2.3900983333587646, + "learning_rate": 3.461341254265458e-05, + "loss": 0.7044, + "step": 104430 + }, + { + "epoch": 0.9232836507010379, + "grad_norm": 2.2705886363983154, + "learning_rate": 3.46119391549827e-05, + "loss": 0.6203, + "step": 104440 + }, + { + "epoch": 0.9233720539613501, + "grad_norm": 1.9621963500976562, + "learning_rate": 3.461046576731084e-05, + "loss": 0.8105, + "step": 104450 + }, + { + "epoch": 0.9234604572216624, + "grad_norm": 1.57456374168396, + "learning_rate": 3.460899237963896e-05, + "loss": 0.6263, + "step": 104460 + }, + { + "epoch": 0.9235488604819746, + "grad_norm": 1.8862709999084473, + "learning_rate": 3.4607518991967094e-05, + "loss": 0.6521, + "step": 104470 + }, + { + "epoch": 0.9236372637422868, + "grad_norm": 2.4278106689453125, + "learning_rate": 3.460604560429522e-05, + "loss": 0.7634, + "step": 104480 + }, + { + "epoch": 0.9237256670025991, + "grad_norm": 1.6628830432891846, + "learning_rate": 3.460457221662335e-05, + "loss": 0.6415, + "step": 104490 + }, + { + "epoch": 0.9238140702629113, + "grad_norm": 3.358079671859741, + "learning_rate": 3.460309882895148e-05, + "loss": 0.6356, + "step": 104500 + }, + { + "epoch": 0.9239024735232235, + "grad_norm": 1.7938374280929565, + "learning_rate": 3.4601625441279614e-05, + "loss": 0.7198, + "step": 104510 + }, + { + "epoch": 0.9239908767835358, + "grad_norm": 4.427228927612305, + "learning_rate": 3.4600152053607736e-05, + "loss": 0.6924, + "step": 104520 + }, + { + "epoch": 0.924079280043848, + "grad_norm": 8.362548828125, + "learning_rate": 3.459867866593587e-05, + "loss": 0.6429, + "step": 104530 + }, + { + "epoch": 0.9241676833041602, + "grad_norm": 1.2016392946243286, + "learning_rate": 3.4597205278264e-05, + "loss": 0.6864, + "step": 104540 + }, + { + "epoch": 0.9242560865644724, + "grad_norm": 6.469966888427734, + "learning_rate": 3.459573189059213e-05, + "loss": 0.5586, + "step": 104550 + }, + { + "epoch": 0.9243444898247848, + "grad_norm": 6.511034965515137, + "learning_rate": 3.4594258502920256e-05, + "loss": 0.6893, + "step": 104560 + }, + { + "epoch": 0.924432893085097, + "grad_norm": 1.8520325422286987, + "learning_rate": 3.4592785115248384e-05, + "loss": 0.6149, + "step": 104570 + }, + { + "epoch": 0.9245212963454092, + "grad_norm": 2.806547164916992, + "learning_rate": 3.459131172757651e-05, + "loss": 0.7918, + "step": 104580 + }, + { + "epoch": 0.9246096996057215, + "grad_norm": 1.0989857912063599, + "learning_rate": 3.458983833990465e-05, + "loss": 0.7532, + "step": 104590 + }, + { + "epoch": 0.9246981028660337, + "grad_norm": 4.717621326446533, + "learning_rate": 3.458836495223277e-05, + "loss": 0.7444, + "step": 104600 + }, + { + "epoch": 0.9247865061263459, + "grad_norm": 1.115357518196106, + "learning_rate": 3.4586891564560905e-05, + "loss": 0.6074, + "step": 104610 + }, + { + "epoch": 0.9248749093866582, + "grad_norm": 0.9762543439865112, + "learning_rate": 3.458541817688903e-05, + "loss": 0.6751, + "step": 104620 + }, + { + "epoch": 0.9249633126469704, + "grad_norm": 3.491162061691284, + "learning_rate": 3.458394478921716e-05, + "loss": 0.7316, + "step": 104630 + }, + { + "epoch": 0.9250517159072826, + "grad_norm": 3.6603012084960938, + "learning_rate": 3.458247140154529e-05, + "loss": 0.8362, + "step": 104640 + }, + { + "epoch": 0.9251401191675949, + "grad_norm": 1.6165461540222168, + "learning_rate": 3.4580998013873425e-05, + "loss": 0.6194, + "step": 104650 + }, + { + "epoch": 0.9252285224279071, + "grad_norm": 2.929722547531128, + "learning_rate": 3.4579524626201546e-05, + "loss": 0.6733, + "step": 104660 + }, + { + "epoch": 0.9253169256882193, + "grad_norm": 5.379761695861816, + "learning_rate": 3.457805123852968e-05, + "loss": 0.7932, + "step": 104670 + }, + { + "epoch": 0.9254053289485317, + "grad_norm": 1.9697751998901367, + "learning_rate": 3.45765778508578e-05, + "loss": 0.6643, + "step": 104680 + }, + { + "epoch": 0.9254937322088439, + "grad_norm": 3.5992343425750732, + "learning_rate": 3.457510446318594e-05, + "loss": 0.7967, + "step": 104690 + }, + { + "epoch": 0.9255821354691561, + "grad_norm": 2.030086040496826, + "learning_rate": 3.457363107551407e-05, + "loss": 0.7189, + "step": 104700 + }, + { + "epoch": 0.9256705387294684, + "grad_norm": 28.58447265625, + "learning_rate": 3.4572157687842195e-05, + "loss": 0.6581, + "step": 104710 + }, + { + "epoch": 0.9257589419897806, + "grad_norm": 2.21335768699646, + "learning_rate": 3.457068430017032e-05, + "loss": 0.641, + "step": 104720 + }, + { + "epoch": 0.9258473452500928, + "grad_norm": 1.9996507167816162, + "learning_rate": 3.456921091249846e-05, + "loss": 0.6598, + "step": 104730 + }, + { + "epoch": 0.925935748510405, + "grad_norm": 1.8208813667297363, + "learning_rate": 3.456773752482658e-05, + "loss": 0.6008, + "step": 104740 + }, + { + "epoch": 0.9260241517707173, + "grad_norm": 1.8049731254577637, + "learning_rate": 3.4566264137154715e-05, + "loss": 0.6254, + "step": 104750 + }, + { + "epoch": 0.9261125550310295, + "grad_norm": 3.426121473312378, + "learning_rate": 3.4564790749482844e-05, + "loss": 0.6932, + "step": 104760 + }, + { + "epoch": 0.9262009582913417, + "grad_norm": 1.122986078262329, + "learning_rate": 3.456331736181097e-05, + "loss": 0.5211, + "step": 104770 + }, + { + "epoch": 0.926289361551654, + "grad_norm": 7.839140892028809, + "learning_rate": 3.45618439741391e-05, + "loss": 0.6795, + "step": 104780 + }, + { + "epoch": 0.9263777648119663, + "grad_norm": 2.6903767585754395, + "learning_rate": 3.456037058646723e-05, + "loss": 0.5912, + "step": 104790 + }, + { + "epoch": 0.9264661680722786, + "grad_norm": 2.958411693572998, + "learning_rate": 3.455889719879536e-05, + "loss": 0.7027, + "step": 104800 + }, + { + "epoch": 0.9265545713325908, + "grad_norm": 3.4592723846435547, + "learning_rate": 3.455742381112349e-05, + "loss": 0.6906, + "step": 104810 + }, + { + "epoch": 0.926642974592903, + "grad_norm": 5.114811897277832, + "learning_rate": 3.4555950423451614e-05, + "loss": 0.6953, + "step": 104820 + }, + { + "epoch": 0.9267313778532152, + "grad_norm": 2.9148213863372803, + "learning_rate": 3.455447703577975e-05, + "loss": 0.5997, + "step": 104830 + }, + { + "epoch": 0.9268197811135275, + "grad_norm": 2.3010993003845215, + "learning_rate": 3.455300364810788e-05, + "loss": 0.6917, + "step": 104840 + }, + { + "epoch": 0.9269081843738397, + "grad_norm": 2.3580894470214844, + "learning_rate": 3.4551530260436006e-05, + "loss": 0.6295, + "step": 104850 + }, + { + "epoch": 0.9269965876341519, + "grad_norm": 2.740551471710205, + "learning_rate": 3.4550056872764134e-05, + "loss": 0.6211, + "step": 104860 + }, + { + "epoch": 0.9270849908944642, + "grad_norm": 3.4254167079925537, + "learning_rate": 3.454858348509227e-05, + "loss": 0.5244, + "step": 104870 + }, + { + "epoch": 0.9271733941547764, + "grad_norm": 1.171040654182434, + "learning_rate": 3.454711009742039e-05, + "loss": 0.7817, + "step": 104880 + }, + { + "epoch": 0.9272617974150886, + "grad_norm": 5.224419593811035, + "learning_rate": 3.4545636709748526e-05, + "loss": 0.6389, + "step": 104890 + }, + { + "epoch": 0.9273502006754009, + "grad_norm": 4.985189437866211, + "learning_rate": 3.4544163322076654e-05, + "loss": 0.6284, + "step": 104900 + }, + { + "epoch": 0.9274386039357132, + "grad_norm": 4.505568504333496, + "learning_rate": 3.454268993440478e-05, + "loss": 0.6461, + "step": 104910 + }, + { + "epoch": 0.9275270071960254, + "grad_norm": 2.198920488357544, + "learning_rate": 3.454121654673291e-05, + "loss": 0.7069, + "step": 104920 + }, + { + "epoch": 0.9276154104563377, + "grad_norm": 2.271940231323242, + "learning_rate": 3.453974315906104e-05, + "loss": 0.593, + "step": 104930 + }, + { + "epoch": 0.9277038137166499, + "grad_norm": 0.9303901791572571, + "learning_rate": 3.453826977138917e-05, + "loss": 0.6362, + "step": 104940 + }, + { + "epoch": 0.9277922169769621, + "grad_norm": 3.752889394760132, + "learning_rate": 3.45367963837173e-05, + "loss": 0.6494, + "step": 104950 + }, + { + "epoch": 0.9278806202372744, + "grad_norm": 9.621313095092773, + "learning_rate": 3.4535322996045424e-05, + "loss": 0.6087, + "step": 104960 + }, + { + "epoch": 0.9279690234975866, + "grad_norm": 10.74526309967041, + "learning_rate": 3.453384960837356e-05, + "loss": 0.5853, + "step": 104970 + }, + { + "epoch": 0.9280574267578988, + "grad_norm": 8.03128433227539, + "learning_rate": 3.453237622070169e-05, + "loss": 0.6375, + "step": 104980 + }, + { + "epoch": 0.928145830018211, + "grad_norm": 2.8632171154022217, + "learning_rate": 3.4530902833029816e-05, + "loss": 0.5752, + "step": 104990 + }, + { + "epoch": 0.9282342332785233, + "grad_norm": 1.5016505718231201, + "learning_rate": 3.4529429445357944e-05, + "loss": 0.6637, + "step": 105000 + }, + { + "epoch": 0.9283226365388355, + "grad_norm": 3.3937366008758545, + "learning_rate": 3.452795605768608e-05, + "loss": 0.6431, + "step": 105010 + }, + { + "epoch": 0.9284110397991477, + "grad_norm": 8.214386940002441, + "learning_rate": 3.45264826700142e-05, + "loss": 0.7078, + "step": 105020 + }, + { + "epoch": 0.9284994430594601, + "grad_norm": 4.080697536468506, + "learning_rate": 3.4525009282342336e-05, + "loss": 0.5912, + "step": 105030 + }, + { + "epoch": 0.9285878463197723, + "grad_norm": 7.324512958526611, + "learning_rate": 3.4523535894670465e-05, + "loss": 0.7108, + "step": 105040 + }, + { + "epoch": 0.9286762495800845, + "grad_norm": 1.087760090827942, + "learning_rate": 3.452206250699859e-05, + "loss": 0.5203, + "step": 105050 + }, + { + "epoch": 0.9287646528403968, + "grad_norm": 3.406646490097046, + "learning_rate": 3.452058911932672e-05, + "loss": 0.6115, + "step": 105060 + }, + { + "epoch": 0.928853056100709, + "grad_norm": 10.423240661621094, + "learning_rate": 3.451911573165485e-05, + "loss": 0.5134, + "step": 105070 + }, + { + "epoch": 0.9289414593610212, + "grad_norm": 2.9946515560150146, + "learning_rate": 3.451764234398298e-05, + "loss": 0.5499, + "step": 105080 + }, + { + "epoch": 0.9290298626213335, + "grad_norm": 4.508096694946289, + "learning_rate": 3.451616895631111e-05, + "loss": 0.609, + "step": 105090 + }, + { + "epoch": 0.9291182658816457, + "grad_norm": 7.4385271072387695, + "learning_rate": 3.451469556863924e-05, + "loss": 0.8378, + "step": 105100 + }, + { + "epoch": 0.9292066691419579, + "grad_norm": 2.4653961658477783, + "learning_rate": 3.451322218096737e-05, + "loss": 0.6597, + "step": 105110 + }, + { + "epoch": 0.9292950724022702, + "grad_norm": 1.977466106414795, + "learning_rate": 3.45117487932955e-05, + "loss": 0.7119, + "step": 105120 + }, + { + "epoch": 0.9293834756625824, + "grad_norm": 2.236581325531006, + "learning_rate": 3.4510275405623627e-05, + "loss": 0.6094, + "step": 105130 + }, + { + "epoch": 0.9294718789228946, + "grad_norm": 1.236483097076416, + "learning_rate": 3.4508802017951755e-05, + "loss": 0.6374, + "step": 105140 + }, + { + "epoch": 0.929560282183207, + "grad_norm": 4.176371097564697, + "learning_rate": 3.450732863027988e-05, + "loss": 0.5527, + "step": 105150 + }, + { + "epoch": 0.9296486854435192, + "grad_norm": 1.9775093793869019, + "learning_rate": 3.450585524260802e-05, + "loss": 0.6298, + "step": 105160 + }, + { + "epoch": 0.9297370887038314, + "grad_norm": 4.56240177154541, + "learning_rate": 3.450438185493615e-05, + "loss": 0.6215, + "step": 105170 + }, + { + "epoch": 0.9298254919641437, + "grad_norm": 3.758308172225952, + "learning_rate": 3.4502908467264275e-05, + "loss": 0.7157, + "step": 105180 + }, + { + "epoch": 0.9299138952244559, + "grad_norm": 5.263377666473389, + "learning_rate": 3.4501435079592403e-05, + "loss": 0.8083, + "step": 105190 + }, + { + "epoch": 0.9300022984847681, + "grad_norm": 4.500478267669678, + "learning_rate": 3.449996169192053e-05, + "loss": 0.6416, + "step": 105200 + }, + { + "epoch": 0.9300907017450804, + "grad_norm": 1.5850861072540283, + "learning_rate": 3.449848830424866e-05, + "loss": 0.5494, + "step": 105210 + }, + { + "epoch": 0.9301791050053926, + "grad_norm": 2.821803331375122, + "learning_rate": 3.4497014916576795e-05, + "loss": 0.5608, + "step": 105220 + }, + { + "epoch": 0.9302675082657048, + "grad_norm": 1.5779587030410767, + "learning_rate": 3.4495541528904924e-05, + "loss": 0.6611, + "step": 105230 + }, + { + "epoch": 0.930355911526017, + "grad_norm": 5.891751766204834, + "learning_rate": 3.449406814123305e-05, + "loss": 0.6683, + "step": 105240 + }, + { + "epoch": 0.9304443147863293, + "grad_norm": 2.789013624191284, + "learning_rate": 3.449259475356118e-05, + "loss": 0.5944, + "step": 105250 + }, + { + "epoch": 0.9305327180466415, + "grad_norm": 3.7947304248809814, + "learning_rate": 3.449112136588931e-05, + "loss": 0.5688, + "step": 105260 + }, + { + "epoch": 0.9306211213069538, + "grad_norm": 4.604398727416992, + "learning_rate": 3.448964797821744e-05, + "loss": 0.6049, + "step": 105270 + }, + { + "epoch": 0.9307095245672661, + "grad_norm": 1.2592705488204956, + "learning_rate": 3.448817459054557e-05, + "loss": 0.6481, + "step": 105280 + }, + { + "epoch": 0.9307979278275783, + "grad_norm": 1.717741847038269, + "learning_rate": 3.4486701202873694e-05, + "loss": 0.6046, + "step": 105290 + }, + { + "epoch": 0.9308863310878905, + "grad_norm": 0.8107909560203552, + "learning_rate": 3.448522781520183e-05, + "loss": 0.6765, + "step": 105300 + }, + { + "epoch": 0.9309747343482028, + "grad_norm": 9.534072875976562, + "learning_rate": 3.448375442752996e-05, + "loss": 0.6768, + "step": 105310 + }, + { + "epoch": 0.931063137608515, + "grad_norm": 2.568861961364746, + "learning_rate": 3.4482281039858086e-05, + "loss": 0.628, + "step": 105320 + }, + { + "epoch": 0.9311515408688272, + "grad_norm": 11.862177848815918, + "learning_rate": 3.4480807652186214e-05, + "loss": 0.6139, + "step": 105330 + }, + { + "epoch": 0.9312399441291395, + "grad_norm": 1.8706004619598389, + "learning_rate": 3.447933426451435e-05, + "loss": 0.5692, + "step": 105340 + }, + { + "epoch": 0.9313283473894517, + "grad_norm": 2.5620651245117188, + "learning_rate": 3.447786087684247e-05, + "loss": 0.5955, + "step": 105350 + }, + { + "epoch": 0.9314167506497639, + "grad_norm": 1.167807936668396, + "learning_rate": 3.4476387489170606e-05, + "loss": 0.6981, + "step": 105360 + }, + { + "epoch": 0.9315051539100762, + "grad_norm": 2.324995517730713, + "learning_rate": 3.4474914101498734e-05, + "loss": 0.6546, + "step": 105370 + }, + { + "epoch": 0.9315935571703885, + "grad_norm": 2.697660207748413, + "learning_rate": 3.447344071382686e-05, + "loss": 0.5719, + "step": 105380 + }, + { + "epoch": 0.9316819604307007, + "grad_norm": 9.180237770080566, + "learning_rate": 3.447196732615499e-05, + "loss": 0.7306, + "step": 105390 + }, + { + "epoch": 0.931770363691013, + "grad_norm": 2.1576290130615234, + "learning_rate": 3.447049393848312e-05, + "loss": 0.7309, + "step": 105400 + }, + { + "epoch": 0.9318587669513252, + "grad_norm": 2.53000545501709, + "learning_rate": 3.446902055081125e-05, + "loss": 0.5871, + "step": 105410 + }, + { + "epoch": 0.9319471702116374, + "grad_norm": 2.2341485023498535, + "learning_rate": 3.446754716313938e-05, + "loss": 0.8134, + "step": 105420 + }, + { + "epoch": 0.9320355734719497, + "grad_norm": 12.517487525939941, + "learning_rate": 3.4466073775467504e-05, + "loss": 0.6647, + "step": 105430 + }, + { + "epoch": 0.9321239767322619, + "grad_norm": 3.3855173587799072, + "learning_rate": 3.446460038779564e-05, + "loss": 0.6319, + "step": 105440 + }, + { + "epoch": 0.9322123799925741, + "grad_norm": 2.9722278118133545, + "learning_rate": 3.446312700012377e-05, + "loss": 0.6604, + "step": 105450 + }, + { + "epoch": 0.9323007832528863, + "grad_norm": 2.611067533493042, + "learning_rate": 3.4461653612451896e-05, + "loss": 0.7337, + "step": 105460 + }, + { + "epoch": 0.9323891865131986, + "grad_norm": 1.7677834033966064, + "learning_rate": 3.4460180224780024e-05, + "loss": 0.6748, + "step": 105470 + }, + { + "epoch": 0.9324775897735108, + "grad_norm": 2.7088329792022705, + "learning_rate": 3.445870683710816e-05, + "loss": 0.4969, + "step": 105480 + }, + { + "epoch": 0.932565993033823, + "grad_norm": 1.6849554777145386, + "learning_rate": 3.445723344943628e-05, + "loss": 0.5471, + "step": 105490 + }, + { + "epoch": 0.9326543962941354, + "grad_norm": 3.012347936630249, + "learning_rate": 3.4455760061764416e-05, + "loss": 0.6974, + "step": 105500 + }, + { + "epoch": 0.9327427995544476, + "grad_norm": 1.863147497177124, + "learning_rate": 3.445428667409254e-05, + "loss": 0.686, + "step": 105510 + }, + { + "epoch": 0.9328312028147598, + "grad_norm": 2.776118755340576, + "learning_rate": 3.445281328642067e-05, + "loss": 0.6664, + "step": 105520 + }, + { + "epoch": 0.9329196060750721, + "grad_norm": 9.31080436706543, + "learning_rate": 3.44513398987488e-05, + "loss": 0.5442, + "step": 105530 + }, + { + "epoch": 0.9330080093353843, + "grad_norm": 1.9648231267929077, + "learning_rate": 3.444986651107693e-05, + "loss": 0.7315, + "step": 105540 + }, + { + "epoch": 0.9330964125956965, + "grad_norm": 6.062094688415527, + "learning_rate": 3.444839312340506e-05, + "loss": 0.6707, + "step": 105550 + }, + { + "epoch": 0.9331848158560088, + "grad_norm": 2.9668476581573486, + "learning_rate": 3.444691973573319e-05, + "loss": 0.5714, + "step": 105560 + }, + { + "epoch": 0.933273219116321, + "grad_norm": 5.547146320343018, + "learning_rate": 3.4445446348061315e-05, + "loss": 0.6385, + "step": 105570 + }, + { + "epoch": 0.9333616223766332, + "grad_norm": 4.097261428833008, + "learning_rate": 3.444397296038945e-05, + "loss": 0.7184, + "step": 105580 + }, + { + "epoch": 0.9334500256369455, + "grad_norm": 4.441718101501465, + "learning_rate": 3.444249957271758e-05, + "loss": 0.6514, + "step": 105590 + }, + { + "epoch": 0.9335384288972577, + "grad_norm": 1.865618348121643, + "learning_rate": 3.4441026185045707e-05, + "loss": 0.6407, + "step": 105600 + }, + { + "epoch": 0.9336268321575699, + "grad_norm": 3.6917524337768555, + "learning_rate": 3.4439552797373835e-05, + "loss": 0.68, + "step": 105610 + }, + { + "epoch": 0.9337152354178823, + "grad_norm": 2.2410688400268555, + "learning_rate": 3.443807940970196e-05, + "loss": 0.6834, + "step": 105620 + }, + { + "epoch": 0.9338036386781945, + "grad_norm": 3.7255070209503174, + "learning_rate": 3.443660602203009e-05, + "loss": 0.6979, + "step": 105630 + }, + { + "epoch": 0.9338920419385067, + "grad_norm": 2.3326375484466553, + "learning_rate": 3.443513263435823e-05, + "loss": 0.5148, + "step": 105640 + }, + { + "epoch": 0.933980445198819, + "grad_norm": 12.944406509399414, + "learning_rate": 3.443365924668635e-05, + "loss": 0.6967, + "step": 105650 + }, + { + "epoch": 0.9340688484591312, + "grad_norm": 1.932525634765625, + "learning_rate": 3.4432185859014483e-05, + "loss": 0.5949, + "step": 105660 + }, + { + "epoch": 0.9341572517194434, + "grad_norm": 1.6108492612838745, + "learning_rate": 3.443071247134261e-05, + "loss": 0.6319, + "step": 105670 + }, + { + "epoch": 0.9342456549797556, + "grad_norm": 2.996105909347534, + "learning_rate": 3.442923908367074e-05, + "loss": 0.6132, + "step": 105680 + }, + { + "epoch": 0.9343340582400679, + "grad_norm": 3.7411937713623047, + "learning_rate": 3.442776569599887e-05, + "loss": 0.5928, + "step": 105690 + }, + { + "epoch": 0.9344224615003801, + "grad_norm": 3.852276563644409, + "learning_rate": 3.4426292308327004e-05, + "loss": 0.6142, + "step": 105700 + }, + { + "epoch": 0.9345108647606923, + "grad_norm": 2.47209095954895, + "learning_rate": 3.4424818920655125e-05, + "loss": 0.7048, + "step": 105710 + }, + { + "epoch": 0.9345992680210046, + "grad_norm": 6.194769859313965, + "learning_rate": 3.442334553298326e-05, + "loss": 0.5949, + "step": 105720 + }, + { + "epoch": 0.9346876712813168, + "grad_norm": 2.719789981842041, + "learning_rate": 3.442187214531139e-05, + "loss": 0.6168, + "step": 105730 + }, + { + "epoch": 0.9347760745416291, + "grad_norm": 5.138671398162842, + "learning_rate": 3.442039875763952e-05, + "loss": 0.6338, + "step": 105740 + }, + { + "epoch": 0.9348644778019414, + "grad_norm": 1.496875286102295, + "learning_rate": 3.4418925369967645e-05, + "loss": 0.668, + "step": 105750 + }, + { + "epoch": 0.9349528810622536, + "grad_norm": 1.9950506687164307, + "learning_rate": 3.4417451982295774e-05, + "loss": 0.7026, + "step": 105760 + }, + { + "epoch": 0.9350412843225658, + "grad_norm": 1.2310317754745483, + "learning_rate": 3.44159785946239e-05, + "loss": 0.6841, + "step": 105770 + }, + { + "epoch": 0.9351296875828781, + "grad_norm": 14.722799301147461, + "learning_rate": 3.441450520695204e-05, + "loss": 0.7115, + "step": 105780 + }, + { + "epoch": 0.9352180908431903, + "grad_norm": 2.9561312198638916, + "learning_rate": 3.441303181928016e-05, + "loss": 0.7107, + "step": 105790 + }, + { + "epoch": 0.9353064941035025, + "grad_norm": 5.269919395446777, + "learning_rate": 3.4411558431608294e-05, + "loss": 0.556, + "step": 105800 + }, + { + "epoch": 0.9353948973638148, + "grad_norm": 9.491625785827637, + "learning_rate": 3.441008504393642e-05, + "loss": 0.7069, + "step": 105810 + }, + { + "epoch": 0.935483300624127, + "grad_norm": 2.8752474784851074, + "learning_rate": 3.440861165626455e-05, + "loss": 0.605, + "step": 105820 + }, + { + "epoch": 0.9355717038844392, + "grad_norm": 6.255155086517334, + "learning_rate": 3.440713826859268e-05, + "loss": 0.7178, + "step": 105830 + }, + { + "epoch": 0.9356601071447515, + "grad_norm": 2.972198724746704, + "learning_rate": 3.4405664880920814e-05, + "loss": 0.6154, + "step": 105840 + }, + { + "epoch": 0.9357485104050638, + "grad_norm": 1.6700375080108643, + "learning_rate": 3.4404191493248936e-05, + "loss": 0.5742, + "step": 105850 + }, + { + "epoch": 0.935836913665376, + "grad_norm": 1.5107364654541016, + "learning_rate": 3.440271810557707e-05, + "loss": 0.5957, + "step": 105860 + }, + { + "epoch": 0.9359253169256883, + "grad_norm": 1.121809482574463, + "learning_rate": 3.440124471790519e-05, + "loss": 0.6361, + "step": 105870 + }, + { + "epoch": 0.9360137201860005, + "grad_norm": 1.8884894847869873, + "learning_rate": 3.439977133023333e-05, + "loss": 0.6003, + "step": 105880 + }, + { + "epoch": 0.9361021234463127, + "grad_norm": 1.7910462617874146, + "learning_rate": 3.4398297942561456e-05, + "loss": 0.5779, + "step": 105890 + }, + { + "epoch": 0.936190526706625, + "grad_norm": 7.440021514892578, + "learning_rate": 3.4396824554889584e-05, + "loss": 0.6207, + "step": 105900 + }, + { + "epoch": 0.9362789299669372, + "grad_norm": 11.046977996826172, + "learning_rate": 3.439535116721771e-05, + "loss": 0.6097, + "step": 105910 + }, + { + "epoch": 0.9363673332272494, + "grad_norm": 1.3227531909942627, + "learning_rate": 3.439387777954585e-05, + "loss": 0.608, + "step": 105920 + }, + { + "epoch": 0.9364557364875616, + "grad_norm": 1.1968457698822021, + "learning_rate": 3.439240439187397e-05, + "loss": 0.5547, + "step": 105930 + }, + { + "epoch": 0.9365441397478739, + "grad_norm": 2.193240165710449, + "learning_rate": 3.4390931004202105e-05, + "loss": 0.5987, + "step": 105940 + }, + { + "epoch": 0.9366325430081861, + "grad_norm": 3.277909517288208, + "learning_rate": 3.438945761653023e-05, + "loss": 0.6261, + "step": 105950 + }, + { + "epoch": 0.9367209462684983, + "grad_norm": 1.5054796934127808, + "learning_rate": 3.438798422885836e-05, + "loss": 0.6798, + "step": 105960 + }, + { + "epoch": 0.9368093495288107, + "grad_norm": 1.54038667678833, + "learning_rate": 3.438651084118649e-05, + "loss": 0.6678, + "step": 105970 + }, + { + "epoch": 0.9368977527891229, + "grad_norm": 10.033136367797852, + "learning_rate": 3.438503745351462e-05, + "loss": 0.5729, + "step": 105980 + }, + { + "epoch": 0.9369861560494351, + "grad_norm": 1.7395009994506836, + "learning_rate": 3.4383564065842746e-05, + "loss": 0.5782, + "step": 105990 + }, + { + "epoch": 0.9370745593097474, + "grad_norm": 8.441571235656738, + "learning_rate": 3.438209067817088e-05, + "loss": 0.7053, + "step": 106000 + }, + { + "epoch": 0.9371629625700596, + "grad_norm": 4.995687007904053, + "learning_rate": 3.438061729049901e-05, + "loss": 0.658, + "step": 106010 + }, + { + "epoch": 0.9372513658303718, + "grad_norm": 3.8771445751190186, + "learning_rate": 3.437914390282714e-05, + "loss": 0.7503, + "step": 106020 + }, + { + "epoch": 0.9373397690906841, + "grad_norm": 3.065551280975342, + "learning_rate": 3.4377670515155266e-05, + "loss": 0.6886, + "step": 106030 + }, + { + "epoch": 0.9374281723509963, + "grad_norm": 3.868722915649414, + "learning_rate": 3.4376197127483395e-05, + "loss": 0.5327, + "step": 106040 + }, + { + "epoch": 0.9375165756113085, + "grad_norm": 1.6570860147476196, + "learning_rate": 3.437472373981152e-05, + "loss": 0.5965, + "step": 106050 + }, + { + "epoch": 0.9376049788716208, + "grad_norm": 7.585522651672363, + "learning_rate": 3.437325035213966e-05, + "loss": 0.6274, + "step": 106060 + }, + { + "epoch": 0.937693382131933, + "grad_norm": 5.415796279907227, + "learning_rate": 3.437177696446779e-05, + "loss": 0.7436, + "step": 106070 + }, + { + "epoch": 0.9377817853922452, + "grad_norm": 4.855235576629639, + "learning_rate": 3.4370303576795915e-05, + "loss": 0.7077, + "step": 106080 + }, + { + "epoch": 0.9378701886525576, + "grad_norm": 1.1730399131774902, + "learning_rate": 3.436883018912404e-05, + "loss": 0.5663, + "step": 106090 + }, + { + "epoch": 0.9379585919128698, + "grad_norm": 13.284073829650879, + "learning_rate": 3.436735680145217e-05, + "loss": 0.599, + "step": 106100 + }, + { + "epoch": 0.938046995173182, + "grad_norm": 3.193488359451294, + "learning_rate": 3.43658834137803e-05, + "loss": 0.642, + "step": 106110 + }, + { + "epoch": 0.9381353984334942, + "grad_norm": 12.354021072387695, + "learning_rate": 3.436441002610843e-05, + "loss": 0.588, + "step": 106120 + }, + { + "epoch": 0.9382238016938065, + "grad_norm": 2.084394931793213, + "learning_rate": 3.4362936638436564e-05, + "loss": 0.6375, + "step": 106130 + }, + { + "epoch": 0.9383122049541187, + "grad_norm": 7.386837005615234, + "learning_rate": 3.436146325076469e-05, + "loss": 0.7165, + "step": 106140 + }, + { + "epoch": 0.9384006082144309, + "grad_norm": 12.498074531555176, + "learning_rate": 3.435998986309282e-05, + "loss": 0.6787, + "step": 106150 + }, + { + "epoch": 0.9384890114747432, + "grad_norm": 2.8941307067871094, + "learning_rate": 3.435851647542095e-05, + "loss": 0.7384, + "step": 106160 + }, + { + "epoch": 0.9385774147350554, + "grad_norm": 6.089250087738037, + "learning_rate": 3.435704308774908e-05, + "loss": 0.6384, + "step": 106170 + }, + { + "epoch": 0.9386658179953676, + "grad_norm": 3.2717485427856445, + "learning_rate": 3.4355569700077205e-05, + "loss": 0.6729, + "step": 106180 + }, + { + "epoch": 0.9387542212556799, + "grad_norm": 2.67104172706604, + "learning_rate": 3.435409631240534e-05, + "loss": 0.6155, + "step": 106190 + }, + { + "epoch": 0.9388426245159921, + "grad_norm": 3.3363802433013916, + "learning_rate": 3.435262292473347e-05, + "loss": 0.5862, + "step": 106200 + }, + { + "epoch": 0.9389310277763044, + "grad_norm": 2.501697301864624, + "learning_rate": 3.43511495370616e-05, + "loss": 0.7169, + "step": 106210 + }, + { + "epoch": 0.9390194310366167, + "grad_norm": 3.346224069595337, + "learning_rate": 3.4349676149389726e-05, + "loss": 0.6054, + "step": 106220 + }, + { + "epoch": 0.9391078342969289, + "grad_norm": 2.4818291664123535, + "learning_rate": 3.4348202761717854e-05, + "loss": 0.7401, + "step": 106230 + }, + { + "epoch": 0.9391962375572411, + "grad_norm": 3.6289844512939453, + "learning_rate": 3.434672937404598e-05, + "loss": 0.7041, + "step": 106240 + }, + { + "epoch": 0.9392846408175534, + "grad_norm": 1.087025761604309, + "learning_rate": 3.434525598637412e-05, + "loss": 0.7006, + "step": 106250 + }, + { + "epoch": 0.9393730440778656, + "grad_norm": 8.719230651855469, + "learning_rate": 3.434378259870224e-05, + "loss": 0.6008, + "step": 106260 + }, + { + "epoch": 0.9394614473381778, + "grad_norm": 4.29959774017334, + "learning_rate": 3.4342309211030374e-05, + "loss": 0.6814, + "step": 106270 + }, + { + "epoch": 0.93954985059849, + "grad_norm": 1.403823733329773, + "learning_rate": 3.43408358233585e-05, + "loss": 0.6274, + "step": 106280 + }, + { + "epoch": 0.9396382538588023, + "grad_norm": 4.81955623626709, + "learning_rate": 3.433936243568663e-05, + "loss": 0.6377, + "step": 106290 + }, + { + "epoch": 0.9397266571191145, + "grad_norm": 7.307697296142578, + "learning_rate": 3.433788904801476e-05, + "loss": 0.5989, + "step": 106300 + }, + { + "epoch": 0.9398150603794267, + "grad_norm": 8.821192741394043, + "learning_rate": 3.4336415660342894e-05, + "loss": 0.5647, + "step": 106310 + }, + { + "epoch": 0.939903463639739, + "grad_norm": 3.9636425971984863, + "learning_rate": 3.4334942272671016e-05, + "loss": 0.6552, + "step": 106320 + }, + { + "epoch": 0.9399918669000513, + "grad_norm": 2.2295689582824707, + "learning_rate": 3.433346888499915e-05, + "loss": 0.6519, + "step": 106330 + }, + { + "epoch": 0.9400802701603636, + "grad_norm": 2.579984426498413, + "learning_rate": 3.433199549732727e-05, + "loss": 0.6199, + "step": 106340 + }, + { + "epoch": 0.9401686734206758, + "grad_norm": 10.387164115905762, + "learning_rate": 3.433052210965541e-05, + "loss": 0.5651, + "step": 106350 + }, + { + "epoch": 0.940257076680988, + "grad_norm": 3.2856268882751465, + "learning_rate": 3.4329048721983536e-05, + "loss": 0.6841, + "step": 106360 + }, + { + "epoch": 0.9403454799413002, + "grad_norm": 1.4172332286834717, + "learning_rate": 3.4327575334311664e-05, + "loss": 0.6082, + "step": 106370 + }, + { + "epoch": 0.9404338832016125, + "grad_norm": 5.022871017456055, + "learning_rate": 3.432610194663979e-05, + "loss": 0.6535, + "step": 106380 + }, + { + "epoch": 0.9405222864619247, + "grad_norm": 11.879000663757324, + "learning_rate": 3.432462855896793e-05, + "loss": 0.6264, + "step": 106390 + }, + { + "epoch": 0.9406106897222369, + "grad_norm": 2.9734277725219727, + "learning_rate": 3.432315517129605e-05, + "loss": 0.637, + "step": 106400 + }, + { + "epoch": 0.9406990929825492, + "grad_norm": 5.496205806732178, + "learning_rate": 3.4321681783624185e-05, + "loss": 0.6466, + "step": 106410 + }, + { + "epoch": 0.9407874962428614, + "grad_norm": 2.9897515773773193, + "learning_rate": 3.432020839595231e-05, + "loss": 0.7122, + "step": 106420 + }, + { + "epoch": 0.9408758995031736, + "grad_norm": 1.718722939491272, + "learning_rate": 3.431873500828044e-05, + "loss": 0.5625, + "step": 106430 + }, + { + "epoch": 0.940964302763486, + "grad_norm": 1.3780550956726074, + "learning_rate": 3.431726162060857e-05, + "loss": 0.5758, + "step": 106440 + }, + { + "epoch": 0.9410527060237982, + "grad_norm": 4.742579460144043, + "learning_rate": 3.43157882329367e-05, + "loss": 0.6305, + "step": 106450 + }, + { + "epoch": 0.9411411092841104, + "grad_norm": 5.992853164672852, + "learning_rate": 3.4314314845264826e-05, + "loss": 0.6176, + "step": 106460 + }, + { + "epoch": 0.9412295125444227, + "grad_norm": 4.425865173339844, + "learning_rate": 3.431284145759296e-05, + "loss": 0.609, + "step": 106470 + }, + { + "epoch": 0.9413179158047349, + "grad_norm": 1.596043348312378, + "learning_rate": 3.431136806992108e-05, + "loss": 0.6324, + "step": 106480 + }, + { + "epoch": 0.9414063190650471, + "grad_norm": 8.001104354858398, + "learning_rate": 3.430989468224922e-05, + "loss": 0.6937, + "step": 106490 + }, + { + "epoch": 0.9414947223253594, + "grad_norm": 5.724034786224365, + "learning_rate": 3.4308421294577347e-05, + "loss": 0.5984, + "step": 106500 + }, + { + "epoch": 0.9415831255856716, + "grad_norm": 10.905638694763184, + "learning_rate": 3.4306947906905475e-05, + "loss": 0.6456, + "step": 106510 + }, + { + "epoch": 0.9416715288459838, + "grad_norm": 2.4138989448547363, + "learning_rate": 3.43054745192336e-05, + "loss": 0.6583, + "step": 106520 + }, + { + "epoch": 0.941759932106296, + "grad_norm": 5.834841728210449, + "learning_rate": 3.430400113156174e-05, + "loss": 0.7593, + "step": 106530 + }, + { + "epoch": 0.9418483353666083, + "grad_norm": 4.196831226348877, + "learning_rate": 3.430252774388986e-05, + "loss": 0.5705, + "step": 106540 + }, + { + "epoch": 0.9419367386269205, + "grad_norm": 2.742556095123291, + "learning_rate": 3.4301054356217995e-05, + "loss": 0.5533, + "step": 106550 + }, + { + "epoch": 0.9420251418872329, + "grad_norm": 11.296940803527832, + "learning_rate": 3.429958096854612e-05, + "loss": 0.5298, + "step": 106560 + }, + { + "epoch": 0.9421135451475451, + "grad_norm": 3.208120107650757, + "learning_rate": 3.429810758087425e-05, + "loss": 0.6616, + "step": 106570 + }, + { + "epoch": 0.9422019484078573, + "grad_norm": 3.070038318634033, + "learning_rate": 3.429663419320238e-05, + "loss": 0.5938, + "step": 106580 + }, + { + "epoch": 0.9422903516681695, + "grad_norm": 8.327720642089844, + "learning_rate": 3.429516080553051e-05, + "loss": 0.7682, + "step": 106590 + }, + { + "epoch": 0.9423787549284818, + "grad_norm": 4.176601409912109, + "learning_rate": 3.429368741785864e-05, + "loss": 0.602, + "step": 106600 + }, + { + "epoch": 0.942467158188794, + "grad_norm": 4.963527679443359, + "learning_rate": 3.429221403018677e-05, + "loss": 0.6014, + "step": 106610 + }, + { + "epoch": 0.9425555614491062, + "grad_norm": 1.4309555292129517, + "learning_rate": 3.4290740642514894e-05, + "loss": 0.571, + "step": 106620 + }, + { + "epoch": 0.9426439647094185, + "grad_norm": 0.8592166304588318, + "learning_rate": 3.428926725484303e-05, + "loss": 0.7284, + "step": 106630 + }, + { + "epoch": 0.9427323679697307, + "grad_norm": 10.35663890838623, + "learning_rate": 3.428779386717116e-05, + "loss": 0.638, + "step": 106640 + }, + { + "epoch": 0.9428207712300429, + "grad_norm": 2.0613021850585938, + "learning_rate": 3.4286320479499285e-05, + "loss": 0.5058, + "step": 106650 + }, + { + "epoch": 0.9429091744903552, + "grad_norm": 1.8221769332885742, + "learning_rate": 3.4284847091827414e-05, + "loss": 0.7491, + "step": 106660 + }, + { + "epoch": 0.9429975777506674, + "grad_norm": 3.9394474029541016, + "learning_rate": 3.428337370415555e-05, + "loss": 0.5745, + "step": 106670 + }, + { + "epoch": 0.9430859810109797, + "grad_norm": 1.4672960042953491, + "learning_rate": 3.428190031648367e-05, + "loss": 0.5828, + "step": 106680 + }, + { + "epoch": 0.943174384271292, + "grad_norm": 1.6499518156051636, + "learning_rate": 3.4280426928811806e-05, + "loss": 0.5971, + "step": 106690 + }, + { + "epoch": 0.9432627875316042, + "grad_norm": 5.15764856338501, + "learning_rate": 3.427895354113993e-05, + "loss": 0.7189, + "step": 106700 + }, + { + "epoch": 0.9433511907919164, + "grad_norm": 4.637803554534912, + "learning_rate": 3.427748015346806e-05, + "loss": 0.587, + "step": 106710 + }, + { + "epoch": 0.9434395940522287, + "grad_norm": 2.552391767501831, + "learning_rate": 3.427600676579619e-05, + "loss": 0.6569, + "step": 106720 + }, + { + "epoch": 0.9435279973125409, + "grad_norm": 4.291319370269775, + "learning_rate": 3.427453337812432e-05, + "loss": 0.7166, + "step": 106730 + }, + { + "epoch": 0.9436164005728531, + "grad_norm": 3.2860984802246094, + "learning_rate": 3.427305999045245e-05, + "loss": 0.6575, + "step": 106740 + }, + { + "epoch": 0.9437048038331654, + "grad_norm": 3.7059011459350586, + "learning_rate": 3.427158660278058e-05, + "loss": 0.5796, + "step": 106750 + }, + { + "epoch": 0.9437932070934776, + "grad_norm": 3.594114303588867, + "learning_rate": 3.4270113215108704e-05, + "loss": 0.6662, + "step": 106760 + }, + { + "epoch": 0.9438816103537898, + "grad_norm": 2.9780309200286865, + "learning_rate": 3.426863982743684e-05, + "loss": 0.5473, + "step": 106770 + }, + { + "epoch": 0.943970013614102, + "grad_norm": 11.914055824279785, + "learning_rate": 3.426716643976497e-05, + "loss": 0.6997, + "step": 106780 + }, + { + "epoch": 0.9440584168744143, + "grad_norm": 1.9557054042816162, + "learning_rate": 3.4265693052093096e-05, + "loss": 0.6747, + "step": 106790 + }, + { + "epoch": 0.9441468201347266, + "grad_norm": 6.067456245422363, + "learning_rate": 3.4264219664421224e-05, + "loss": 0.6688, + "step": 106800 + }, + { + "epoch": 0.9442352233950388, + "grad_norm": 1.1976491212844849, + "learning_rate": 3.426274627674935e-05, + "loss": 0.664, + "step": 106810 + }, + { + "epoch": 0.9443236266553511, + "grad_norm": 4.096565246582031, + "learning_rate": 3.426127288907748e-05, + "loss": 0.6322, + "step": 106820 + }, + { + "epoch": 0.9444120299156633, + "grad_norm": 17.041379928588867, + "learning_rate": 3.4259799501405616e-05, + "loss": 0.6444, + "step": 106830 + }, + { + "epoch": 0.9445004331759755, + "grad_norm": 1.333940029144287, + "learning_rate": 3.425832611373374e-05, + "loss": 0.5801, + "step": 106840 + }, + { + "epoch": 0.9445888364362878, + "grad_norm": 2.1787521839141846, + "learning_rate": 3.425685272606187e-05, + "loss": 0.5083, + "step": 106850 + }, + { + "epoch": 0.9446772396966, + "grad_norm": 1.6780534982681274, + "learning_rate": 3.425537933839e-05, + "loss": 0.6349, + "step": 106860 + }, + { + "epoch": 0.9447656429569122, + "grad_norm": 5.518852710723877, + "learning_rate": 3.425390595071813e-05, + "loss": 0.6507, + "step": 106870 + }, + { + "epoch": 0.9448540462172245, + "grad_norm": 7.130365371704102, + "learning_rate": 3.425243256304626e-05, + "loss": 0.5616, + "step": 106880 + }, + { + "epoch": 0.9449424494775367, + "grad_norm": 3.413080930709839, + "learning_rate": 3.425095917537439e-05, + "loss": 0.588, + "step": 106890 + }, + { + "epoch": 0.9450308527378489, + "grad_norm": 0.8026190400123596, + "learning_rate": 3.4249485787702515e-05, + "loss": 0.6704, + "step": 106900 + }, + { + "epoch": 0.9451192559981613, + "grad_norm": 1.9579423666000366, + "learning_rate": 3.424801240003065e-05, + "loss": 0.6763, + "step": 106910 + }, + { + "epoch": 0.9452076592584735, + "grad_norm": 5.837519645690918, + "learning_rate": 3.424653901235878e-05, + "loss": 0.7122, + "step": 106920 + }, + { + "epoch": 0.9452960625187857, + "grad_norm": 5.711021423339844, + "learning_rate": 3.4245065624686906e-05, + "loss": 0.6986, + "step": 106930 + }, + { + "epoch": 0.945384465779098, + "grad_norm": 1.944549560546875, + "learning_rate": 3.4243592237015035e-05, + "loss": 0.6563, + "step": 106940 + }, + { + "epoch": 0.9454728690394102, + "grad_norm": 5.686873435974121, + "learning_rate": 3.424211884934316e-05, + "loss": 0.5243, + "step": 106950 + }, + { + "epoch": 0.9455612722997224, + "grad_norm": 4.338107585906982, + "learning_rate": 3.424064546167129e-05, + "loss": 0.6427, + "step": 106960 + }, + { + "epoch": 0.9456496755600347, + "grad_norm": 5.108939170837402, + "learning_rate": 3.4239172073999427e-05, + "loss": 0.657, + "step": 106970 + }, + { + "epoch": 0.9457380788203469, + "grad_norm": 9.524923324584961, + "learning_rate": 3.4237698686327555e-05, + "loss": 0.6514, + "step": 106980 + }, + { + "epoch": 0.9458264820806591, + "grad_norm": 11.318289756774902, + "learning_rate": 3.423622529865568e-05, + "loss": 0.6222, + "step": 106990 + }, + { + "epoch": 0.9459148853409713, + "grad_norm": 1.8790006637573242, + "learning_rate": 3.423475191098381e-05, + "loss": 0.5543, + "step": 107000 + }, + { + "epoch": 0.9460032886012836, + "grad_norm": 5.158295154571533, + "learning_rate": 3.423327852331194e-05, + "loss": 0.5889, + "step": 107010 + }, + { + "epoch": 0.9460916918615958, + "grad_norm": 2.221982717514038, + "learning_rate": 3.423180513564007e-05, + "loss": 0.6793, + "step": 107020 + }, + { + "epoch": 0.9461800951219081, + "grad_norm": 5.159732341766357, + "learning_rate": 3.42303317479682e-05, + "loss": 0.5866, + "step": 107030 + }, + { + "epoch": 0.9462684983822204, + "grad_norm": 1.9355392456054688, + "learning_rate": 3.422885836029633e-05, + "loss": 0.7064, + "step": 107040 + }, + { + "epoch": 0.9463569016425326, + "grad_norm": 13.628268241882324, + "learning_rate": 3.422738497262446e-05, + "loss": 0.774, + "step": 107050 + }, + { + "epoch": 0.9464453049028448, + "grad_norm": 2.6042873859405518, + "learning_rate": 3.422591158495259e-05, + "loss": 0.6066, + "step": 107060 + }, + { + "epoch": 0.9465337081631571, + "grad_norm": 7.6928815841674805, + "learning_rate": 3.422443819728072e-05, + "loss": 0.6226, + "step": 107070 + }, + { + "epoch": 0.9466221114234693, + "grad_norm": 1.4809387922286987, + "learning_rate": 3.4222964809608845e-05, + "loss": 0.567, + "step": 107080 + }, + { + "epoch": 0.9467105146837815, + "grad_norm": 2.3960022926330566, + "learning_rate": 3.4221491421936974e-05, + "loss": 0.6553, + "step": 107090 + }, + { + "epoch": 0.9467989179440938, + "grad_norm": 2.232804298400879, + "learning_rate": 3.422001803426511e-05, + "loss": 0.6052, + "step": 107100 + }, + { + "epoch": 0.946887321204406, + "grad_norm": 1.6408066749572754, + "learning_rate": 3.421854464659324e-05, + "loss": 0.6134, + "step": 107110 + }, + { + "epoch": 0.9469757244647182, + "grad_norm": 1.4858996868133545, + "learning_rate": 3.4217071258921365e-05, + "loss": 0.6503, + "step": 107120 + }, + { + "epoch": 0.9470641277250305, + "grad_norm": 2.2252790927886963, + "learning_rate": 3.4215597871249494e-05, + "loss": 0.7173, + "step": 107130 + }, + { + "epoch": 0.9471525309853427, + "grad_norm": 4.65252161026001, + "learning_rate": 3.421412448357762e-05, + "loss": 0.6642, + "step": 107140 + }, + { + "epoch": 0.947240934245655, + "grad_norm": 4.499752044677734, + "learning_rate": 3.421265109590575e-05, + "loss": 0.6111, + "step": 107150 + }, + { + "epoch": 0.9473293375059673, + "grad_norm": 1.6032524108886719, + "learning_rate": 3.4211177708233886e-05, + "loss": 0.6872, + "step": 107160 + }, + { + "epoch": 0.9474177407662795, + "grad_norm": 5.613959789276123, + "learning_rate": 3.420970432056201e-05, + "loss": 0.6062, + "step": 107170 + }, + { + "epoch": 0.9475061440265917, + "grad_norm": 1.392574429512024, + "learning_rate": 3.420823093289014e-05, + "loss": 0.6558, + "step": 107180 + }, + { + "epoch": 0.947594547286904, + "grad_norm": 1.549219012260437, + "learning_rate": 3.420675754521827e-05, + "loss": 0.6022, + "step": 107190 + }, + { + "epoch": 0.9476829505472162, + "grad_norm": 6.8509650230407715, + "learning_rate": 3.42052841575464e-05, + "loss": 0.6933, + "step": 107200 + }, + { + "epoch": 0.9477713538075284, + "grad_norm": 9.156034469604492, + "learning_rate": 3.420381076987453e-05, + "loss": 0.6242, + "step": 107210 + }, + { + "epoch": 0.9478597570678406, + "grad_norm": 6.534549236297607, + "learning_rate": 3.420233738220266e-05, + "loss": 0.6654, + "step": 107220 + }, + { + "epoch": 0.9479481603281529, + "grad_norm": 2.9824163913726807, + "learning_rate": 3.4200863994530784e-05, + "loss": 0.5776, + "step": 107230 + }, + { + "epoch": 0.9480365635884651, + "grad_norm": 7.644856929779053, + "learning_rate": 3.419939060685892e-05, + "loss": 0.7317, + "step": 107240 + }, + { + "epoch": 0.9481249668487773, + "grad_norm": 1.7401103973388672, + "learning_rate": 3.419791721918705e-05, + "loss": 0.5825, + "step": 107250 + }, + { + "epoch": 0.9482133701090896, + "grad_norm": 3.100944757461548, + "learning_rate": 3.4196443831515176e-05, + "loss": 0.6495, + "step": 107260 + }, + { + "epoch": 0.9483017733694019, + "grad_norm": 0.9705969095230103, + "learning_rate": 3.4194970443843304e-05, + "loss": 0.5088, + "step": 107270 + }, + { + "epoch": 0.9483901766297141, + "grad_norm": 5.885824203491211, + "learning_rate": 3.419349705617143e-05, + "loss": 0.6608, + "step": 107280 + }, + { + "epoch": 0.9484785798900264, + "grad_norm": 1.637331247329712, + "learning_rate": 3.419202366849956e-05, + "loss": 0.4585, + "step": 107290 + }, + { + "epoch": 0.9485669831503386, + "grad_norm": 7.709097862243652, + "learning_rate": 3.4190550280827696e-05, + "loss": 0.5675, + "step": 107300 + }, + { + "epoch": 0.9486553864106508, + "grad_norm": 13.357598304748535, + "learning_rate": 3.418907689315582e-05, + "loss": 0.7206, + "step": 107310 + }, + { + "epoch": 0.9487437896709631, + "grad_norm": 3.9393203258514404, + "learning_rate": 3.418760350548395e-05, + "loss": 0.653, + "step": 107320 + }, + { + "epoch": 0.9488321929312753, + "grad_norm": 5.50761079788208, + "learning_rate": 3.418613011781208e-05, + "loss": 0.6621, + "step": 107330 + }, + { + "epoch": 0.9489205961915875, + "grad_norm": 2.243089199066162, + "learning_rate": 3.418465673014021e-05, + "loss": 0.7987, + "step": 107340 + }, + { + "epoch": 0.9490089994518998, + "grad_norm": 1.6311918497085571, + "learning_rate": 3.418318334246834e-05, + "loss": 0.6362, + "step": 107350 + }, + { + "epoch": 0.949097402712212, + "grad_norm": 4.786495208740234, + "learning_rate": 3.418170995479647e-05, + "loss": 0.6874, + "step": 107360 + }, + { + "epoch": 0.9491858059725242, + "grad_norm": 2.9045217037200928, + "learning_rate": 3.4180236567124595e-05, + "loss": 0.6305, + "step": 107370 + }, + { + "epoch": 0.9492742092328365, + "grad_norm": 1.2427177429199219, + "learning_rate": 3.417876317945273e-05, + "loss": 0.6014, + "step": 107380 + }, + { + "epoch": 0.9493626124931488, + "grad_norm": 3.103736162185669, + "learning_rate": 3.417728979178085e-05, + "loss": 0.6472, + "step": 107390 + }, + { + "epoch": 0.949451015753461, + "grad_norm": 2.277155637741089, + "learning_rate": 3.4175816404108986e-05, + "loss": 0.6705, + "step": 107400 + }, + { + "epoch": 0.9495394190137733, + "grad_norm": 4.957957744598389, + "learning_rate": 3.4174343016437115e-05, + "loss": 0.7479, + "step": 107410 + }, + { + "epoch": 0.9496278222740855, + "grad_norm": 9.334573745727539, + "learning_rate": 3.417286962876524e-05, + "loss": 0.7009, + "step": 107420 + }, + { + "epoch": 0.9497162255343977, + "grad_norm": 4.930044174194336, + "learning_rate": 3.417139624109337e-05, + "loss": 0.6366, + "step": 107430 + }, + { + "epoch": 0.94980462879471, + "grad_norm": 2.5144917964935303, + "learning_rate": 3.416992285342151e-05, + "loss": 0.5848, + "step": 107440 + }, + { + "epoch": 0.9498930320550222, + "grad_norm": 2.3998749256134033, + "learning_rate": 3.416844946574963e-05, + "loss": 0.489, + "step": 107450 + }, + { + "epoch": 0.9499814353153344, + "grad_norm": 3.059237003326416, + "learning_rate": 3.416697607807776e-05, + "loss": 0.6635, + "step": 107460 + }, + { + "epoch": 0.9500698385756466, + "grad_norm": 1.5679110288619995, + "learning_rate": 3.416550269040589e-05, + "loss": 0.7331, + "step": 107470 + }, + { + "epoch": 0.9501582418359589, + "grad_norm": 7.488781929016113, + "learning_rate": 3.416402930273402e-05, + "loss": 0.5985, + "step": 107480 + }, + { + "epoch": 0.9502466450962711, + "grad_norm": 2.71647572517395, + "learning_rate": 3.416255591506215e-05, + "loss": 0.5026, + "step": 107490 + }, + { + "epoch": 0.9503350483565834, + "grad_norm": 1.9849530458450317, + "learning_rate": 3.416108252739028e-05, + "loss": 0.5606, + "step": 107500 + }, + { + "epoch": 0.9504234516168957, + "grad_norm": 10.521023750305176, + "learning_rate": 3.4159609139718405e-05, + "loss": 0.5648, + "step": 107510 + }, + { + "epoch": 0.9505118548772079, + "grad_norm": 2.396305561065674, + "learning_rate": 3.415813575204654e-05, + "loss": 0.6881, + "step": 107520 + }, + { + "epoch": 0.9506002581375201, + "grad_norm": 3.7700295448303223, + "learning_rate": 3.415666236437466e-05, + "loss": 0.5457, + "step": 107530 + }, + { + "epoch": 0.9506886613978324, + "grad_norm": 11.179518699645996, + "learning_rate": 3.41551889767028e-05, + "loss": 0.7302, + "step": 107540 + }, + { + "epoch": 0.9507770646581446, + "grad_norm": 1.3473808765411377, + "learning_rate": 3.4153715589030925e-05, + "loss": 0.6218, + "step": 107550 + }, + { + "epoch": 0.9508654679184568, + "grad_norm": 1.59479820728302, + "learning_rate": 3.4152242201359054e-05, + "loss": 0.6327, + "step": 107560 + }, + { + "epoch": 0.9509538711787691, + "grad_norm": 8.574318885803223, + "learning_rate": 3.415076881368718e-05, + "loss": 0.5492, + "step": 107570 + }, + { + "epoch": 0.9510422744390813, + "grad_norm": 1.1620367765426636, + "learning_rate": 3.414929542601532e-05, + "loss": 0.579, + "step": 107580 + }, + { + "epoch": 0.9511306776993935, + "grad_norm": 3.6802213191986084, + "learning_rate": 3.414782203834344e-05, + "loss": 0.6158, + "step": 107590 + }, + { + "epoch": 0.9512190809597058, + "grad_norm": 9.622323989868164, + "learning_rate": 3.4146348650671574e-05, + "loss": 0.7037, + "step": 107600 + }, + { + "epoch": 0.951307484220018, + "grad_norm": 6.923878192901611, + "learning_rate": 3.41448752629997e-05, + "loss": 0.7099, + "step": 107610 + }, + { + "epoch": 0.9513958874803303, + "grad_norm": 3.974891185760498, + "learning_rate": 3.414340187532783e-05, + "loss": 0.6255, + "step": 107620 + }, + { + "epoch": 0.9514842907406426, + "grad_norm": 12.173301696777344, + "learning_rate": 3.414192848765596e-05, + "loss": 0.6575, + "step": 107630 + }, + { + "epoch": 0.9515726940009548, + "grad_norm": 4.852212905883789, + "learning_rate": 3.414045509998409e-05, + "loss": 0.5951, + "step": 107640 + }, + { + "epoch": 0.951661097261267, + "grad_norm": 5.011682033538818, + "learning_rate": 3.4138981712312216e-05, + "loss": 0.6038, + "step": 107650 + }, + { + "epoch": 0.9517495005215793, + "grad_norm": 6.1992878913879395, + "learning_rate": 3.413750832464035e-05, + "loss": 0.5561, + "step": 107660 + }, + { + "epoch": 0.9518379037818915, + "grad_norm": 20.937747955322266, + "learning_rate": 3.413603493696847e-05, + "loss": 0.5559, + "step": 107670 + }, + { + "epoch": 0.9519263070422037, + "grad_norm": 11.46400260925293, + "learning_rate": 3.413456154929661e-05, + "loss": 0.7077, + "step": 107680 + }, + { + "epoch": 0.9520147103025159, + "grad_norm": 2.0332183837890625, + "learning_rate": 3.4133088161624736e-05, + "loss": 0.7002, + "step": 107690 + }, + { + "epoch": 0.9521031135628282, + "grad_norm": 1.3499661684036255, + "learning_rate": 3.4131614773952864e-05, + "loss": 0.6974, + "step": 107700 + }, + { + "epoch": 0.9521915168231404, + "grad_norm": 3.8344931602478027, + "learning_rate": 3.413014138628099e-05, + "loss": 0.6379, + "step": 107710 + }, + { + "epoch": 0.9522799200834526, + "grad_norm": 6.694114685058594, + "learning_rate": 3.412866799860913e-05, + "loss": 0.4893, + "step": 107720 + }, + { + "epoch": 0.9523683233437649, + "grad_norm": 5.147834300994873, + "learning_rate": 3.412719461093725e-05, + "loss": 0.6722, + "step": 107730 + }, + { + "epoch": 0.9524567266040772, + "grad_norm": 8.246710777282715, + "learning_rate": 3.4125721223265384e-05, + "loss": 0.7812, + "step": 107740 + }, + { + "epoch": 0.9525451298643894, + "grad_norm": 10.205009460449219, + "learning_rate": 3.4124247835593506e-05, + "loss": 0.6962, + "step": 107750 + }, + { + "epoch": 0.9526335331247017, + "grad_norm": 5.004541873931885, + "learning_rate": 3.412277444792164e-05, + "loss": 0.6664, + "step": 107760 + }, + { + "epoch": 0.9527219363850139, + "grad_norm": 2.0937769412994385, + "learning_rate": 3.412130106024977e-05, + "loss": 0.6484, + "step": 107770 + }, + { + "epoch": 0.9528103396453261, + "grad_norm": 2.0680203437805176, + "learning_rate": 3.41198276725779e-05, + "loss": 0.6528, + "step": 107780 + }, + { + "epoch": 0.9528987429056384, + "grad_norm": 1.4519532918930054, + "learning_rate": 3.4118354284906026e-05, + "loss": 0.7094, + "step": 107790 + }, + { + "epoch": 0.9529871461659506, + "grad_norm": 1.2675204277038574, + "learning_rate": 3.411688089723416e-05, + "loss": 0.551, + "step": 107800 + }, + { + "epoch": 0.9530755494262628, + "grad_norm": 1.4857728481292725, + "learning_rate": 3.411540750956228e-05, + "loss": 0.6719, + "step": 107810 + }, + { + "epoch": 0.9531639526865751, + "grad_norm": 3.813077211380005, + "learning_rate": 3.411393412189042e-05, + "loss": 0.5846, + "step": 107820 + }, + { + "epoch": 0.9532523559468873, + "grad_norm": 5.566223621368408, + "learning_rate": 3.4112460734218546e-05, + "loss": 0.5991, + "step": 107830 + }, + { + "epoch": 0.9533407592071995, + "grad_norm": 3.6968796253204346, + "learning_rate": 3.4110987346546675e-05, + "loss": 0.6139, + "step": 107840 + }, + { + "epoch": 0.9534291624675117, + "grad_norm": 0.9310871958732605, + "learning_rate": 3.41095139588748e-05, + "loss": 0.6337, + "step": 107850 + }, + { + "epoch": 0.9535175657278241, + "grad_norm": 2.691727876663208, + "learning_rate": 3.410804057120293e-05, + "loss": 0.6461, + "step": 107860 + }, + { + "epoch": 0.9536059689881363, + "grad_norm": 1.6179885864257812, + "learning_rate": 3.410656718353106e-05, + "loss": 0.6521, + "step": 107870 + }, + { + "epoch": 0.9536943722484486, + "grad_norm": 1.3111066818237305, + "learning_rate": 3.4105093795859195e-05, + "loss": 0.6831, + "step": 107880 + }, + { + "epoch": 0.9537827755087608, + "grad_norm": 1.5552518367767334, + "learning_rate": 3.410362040818732e-05, + "loss": 0.7639, + "step": 107890 + }, + { + "epoch": 0.953871178769073, + "grad_norm": 9.723103523254395, + "learning_rate": 3.410214702051545e-05, + "loss": 0.5838, + "step": 107900 + }, + { + "epoch": 0.9539595820293852, + "grad_norm": 1.4131728410720825, + "learning_rate": 3.410067363284358e-05, + "loss": 0.7367, + "step": 107910 + }, + { + "epoch": 0.9540479852896975, + "grad_norm": 2.871356248855591, + "learning_rate": 3.409920024517171e-05, + "loss": 0.6817, + "step": 107920 + }, + { + "epoch": 0.9541363885500097, + "grad_norm": 4.903386116027832, + "learning_rate": 3.409772685749984e-05, + "loss": 0.5768, + "step": 107930 + }, + { + "epoch": 0.9542247918103219, + "grad_norm": 1.263875961303711, + "learning_rate": 3.409625346982797e-05, + "loss": 0.6812, + "step": 107940 + }, + { + "epoch": 0.9543131950706342, + "grad_norm": 20.965744018554688, + "learning_rate": 3.40947800821561e-05, + "loss": 0.7873, + "step": 107950 + }, + { + "epoch": 0.9544015983309464, + "grad_norm": 1.4810017347335815, + "learning_rate": 3.409330669448423e-05, + "loss": 0.5582, + "step": 107960 + }, + { + "epoch": 0.9544900015912587, + "grad_norm": 1.8919059038162231, + "learning_rate": 3.409183330681236e-05, + "loss": 0.6934, + "step": 107970 + }, + { + "epoch": 0.954578404851571, + "grad_norm": 3.1167871952056885, + "learning_rate": 3.4090359919140485e-05, + "loss": 0.7143, + "step": 107980 + }, + { + "epoch": 0.9546668081118832, + "grad_norm": 2.9333460330963135, + "learning_rate": 3.4088886531468614e-05, + "loss": 0.6604, + "step": 107990 + }, + { + "epoch": 0.9547552113721954, + "grad_norm": 13.716352462768555, + "learning_rate": 3.408741314379674e-05, + "loss": 0.737, + "step": 108000 + }, + { + "epoch": 0.9548436146325077, + "grad_norm": 2.1939032077789307, + "learning_rate": 3.408593975612488e-05, + "loss": 0.7201, + "step": 108010 + }, + { + "epoch": 0.9549320178928199, + "grad_norm": 3.350382089614868, + "learning_rate": 3.4084466368453005e-05, + "loss": 0.7804, + "step": 108020 + }, + { + "epoch": 0.9550204211531321, + "grad_norm": 2.7359728813171387, + "learning_rate": 3.4082992980781134e-05, + "loss": 0.6383, + "step": 108030 + }, + { + "epoch": 0.9551088244134444, + "grad_norm": 2.0213334560394287, + "learning_rate": 3.408151959310926e-05, + "loss": 0.7167, + "step": 108040 + }, + { + "epoch": 0.9551972276737566, + "grad_norm": 1.583143711090088, + "learning_rate": 3.408004620543739e-05, + "loss": 0.7339, + "step": 108050 + }, + { + "epoch": 0.9552856309340688, + "grad_norm": 2.750746011734009, + "learning_rate": 3.407857281776552e-05, + "loss": 0.7033, + "step": 108060 + }, + { + "epoch": 0.955374034194381, + "grad_norm": 2.987856388092041, + "learning_rate": 3.4077099430093654e-05, + "loss": 0.5844, + "step": 108070 + }, + { + "epoch": 0.9554624374546933, + "grad_norm": 2.7189905643463135, + "learning_rate": 3.407562604242178e-05, + "loss": 0.8291, + "step": 108080 + }, + { + "epoch": 0.9555508407150056, + "grad_norm": 1.5544456243515015, + "learning_rate": 3.407415265474991e-05, + "loss": 0.6861, + "step": 108090 + }, + { + "epoch": 0.9556392439753179, + "grad_norm": 3.2795040607452393, + "learning_rate": 3.407267926707804e-05, + "loss": 0.707, + "step": 108100 + }, + { + "epoch": 0.9557276472356301, + "grad_norm": 1.6470195055007935, + "learning_rate": 3.407120587940617e-05, + "loss": 0.7609, + "step": 108110 + }, + { + "epoch": 0.9558160504959423, + "grad_norm": 7.1803507804870605, + "learning_rate": 3.4069732491734296e-05, + "loss": 0.7388, + "step": 108120 + }, + { + "epoch": 0.9559044537562545, + "grad_norm": 3.234081506729126, + "learning_rate": 3.406825910406243e-05, + "loss": 0.6063, + "step": 108130 + }, + { + "epoch": 0.9559928570165668, + "grad_norm": 0.897541344165802, + "learning_rate": 3.406678571639055e-05, + "loss": 0.5949, + "step": 108140 + }, + { + "epoch": 0.956081260276879, + "grad_norm": 2.718233585357666, + "learning_rate": 3.406531232871869e-05, + "loss": 0.584, + "step": 108150 + }, + { + "epoch": 0.9561696635371912, + "grad_norm": 3.2134108543395996, + "learning_rate": 3.4063838941046816e-05, + "loss": 0.6865, + "step": 108160 + }, + { + "epoch": 0.9562580667975035, + "grad_norm": 12.512566566467285, + "learning_rate": 3.4062365553374944e-05, + "loss": 0.5692, + "step": 108170 + }, + { + "epoch": 0.9563464700578157, + "grad_norm": 3.26753830909729, + "learning_rate": 3.406089216570307e-05, + "loss": 0.7239, + "step": 108180 + }, + { + "epoch": 0.9564348733181279, + "grad_norm": 4.158071041107178, + "learning_rate": 3.405941877803121e-05, + "loss": 0.5502, + "step": 108190 + }, + { + "epoch": 0.9565232765784402, + "grad_norm": 1.5002751350402832, + "learning_rate": 3.405794539035933e-05, + "loss": 0.7161, + "step": 108200 + }, + { + "epoch": 0.9566116798387525, + "grad_norm": 1.444059133529663, + "learning_rate": 3.4056472002687464e-05, + "loss": 0.6801, + "step": 108210 + }, + { + "epoch": 0.9567000830990647, + "grad_norm": 2.915778636932373, + "learning_rate": 3.4054998615015586e-05, + "loss": 0.5348, + "step": 108220 + }, + { + "epoch": 0.956788486359377, + "grad_norm": 1.5096100568771362, + "learning_rate": 3.405352522734372e-05, + "loss": 0.5295, + "step": 108230 + }, + { + "epoch": 0.9568768896196892, + "grad_norm": 4.211172103881836, + "learning_rate": 3.405205183967185e-05, + "loss": 0.5911, + "step": 108240 + }, + { + "epoch": 0.9569652928800014, + "grad_norm": 2.9278645515441895, + "learning_rate": 3.405057845199998e-05, + "loss": 0.7571, + "step": 108250 + }, + { + "epoch": 0.9570536961403137, + "grad_norm": 1.5982617139816284, + "learning_rate": 3.4049105064328106e-05, + "loss": 0.617, + "step": 108260 + }, + { + "epoch": 0.9571420994006259, + "grad_norm": 3.509449005126953, + "learning_rate": 3.404763167665624e-05, + "loss": 0.7241, + "step": 108270 + }, + { + "epoch": 0.9572305026609381, + "grad_norm": 5.647476673126221, + "learning_rate": 3.404615828898436e-05, + "loss": 0.6549, + "step": 108280 + }, + { + "epoch": 0.9573189059212504, + "grad_norm": 2.6125192642211914, + "learning_rate": 3.40446849013125e-05, + "loss": 0.7135, + "step": 108290 + }, + { + "epoch": 0.9574073091815626, + "grad_norm": 3.9579532146453857, + "learning_rate": 3.4043211513640626e-05, + "loss": 0.6965, + "step": 108300 + }, + { + "epoch": 0.9574957124418748, + "grad_norm": 4.575448989868164, + "learning_rate": 3.4041738125968755e-05, + "loss": 0.6583, + "step": 108310 + }, + { + "epoch": 0.957584115702187, + "grad_norm": 1.6923058032989502, + "learning_rate": 3.404026473829688e-05, + "loss": 0.6257, + "step": 108320 + }, + { + "epoch": 0.9576725189624994, + "grad_norm": 3.6821818351745605, + "learning_rate": 3.403879135062501e-05, + "loss": 0.6621, + "step": 108330 + }, + { + "epoch": 0.9577609222228116, + "grad_norm": 3.80184006690979, + "learning_rate": 3.403731796295314e-05, + "loss": 0.7091, + "step": 108340 + }, + { + "epoch": 0.9578493254831238, + "grad_norm": 2.4954137802124023, + "learning_rate": 3.4035844575281275e-05, + "loss": 0.6857, + "step": 108350 + }, + { + "epoch": 0.9579377287434361, + "grad_norm": 14.376485824584961, + "learning_rate": 3.4034371187609397e-05, + "loss": 0.6561, + "step": 108360 + }, + { + "epoch": 0.9580261320037483, + "grad_norm": 1.7742670774459839, + "learning_rate": 3.403289779993753e-05, + "loss": 0.6778, + "step": 108370 + }, + { + "epoch": 0.9581145352640605, + "grad_norm": 1.6553324460983276, + "learning_rate": 3.403142441226566e-05, + "loss": 0.694, + "step": 108380 + }, + { + "epoch": 0.9582029385243728, + "grad_norm": 2.247838020324707, + "learning_rate": 3.402995102459379e-05, + "loss": 0.6593, + "step": 108390 + }, + { + "epoch": 0.958291341784685, + "grad_norm": 9.521095275878906, + "learning_rate": 3.402847763692192e-05, + "loss": 0.6383, + "step": 108400 + }, + { + "epoch": 0.9583797450449972, + "grad_norm": 1.7256803512573242, + "learning_rate": 3.402700424925005e-05, + "loss": 0.7085, + "step": 108410 + }, + { + "epoch": 0.9584681483053095, + "grad_norm": 12.944317817687988, + "learning_rate": 3.4025530861578173e-05, + "loss": 0.7791, + "step": 108420 + }, + { + "epoch": 0.9585565515656217, + "grad_norm": 2.0162668228149414, + "learning_rate": 3.402405747390631e-05, + "loss": 0.6895, + "step": 108430 + }, + { + "epoch": 0.9586449548259339, + "grad_norm": 1.2397549152374268, + "learning_rate": 3.402258408623444e-05, + "loss": 0.5211, + "step": 108440 + }, + { + "epoch": 0.9587333580862463, + "grad_norm": 5.9702067375183105, + "learning_rate": 3.4021110698562565e-05, + "loss": 0.5716, + "step": 108450 + }, + { + "epoch": 0.9588217613465585, + "grad_norm": 3.423736095428467, + "learning_rate": 3.4019637310890694e-05, + "loss": 0.611, + "step": 108460 + }, + { + "epoch": 0.9589101646068707, + "grad_norm": 3.6585564613342285, + "learning_rate": 3.401816392321882e-05, + "loss": 0.6566, + "step": 108470 + }, + { + "epoch": 0.958998567867183, + "grad_norm": 8.351479530334473, + "learning_rate": 3.401669053554695e-05, + "loss": 0.6843, + "step": 108480 + }, + { + "epoch": 0.9590869711274952, + "grad_norm": 1.998821496963501, + "learning_rate": 3.4015217147875085e-05, + "loss": 0.7432, + "step": 108490 + }, + { + "epoch": 0.9591753743878074, + "grad_norm": 5.348984718322754, + "learning_rate": 3.401374376020321e-05, + "loss": 0.6328, + "step": 108500 + }, + { + "epoch": 0.9592637776481197, + "grad_norm": 8.680070877075195, + "learning_rate": 3.401227037253134e-05, + "loss": 0.6618, + "step": 108510 + }, + { + "epoch": 0.9593521809084319, + "grad_norm": 2.1394660472869873, + "learning_rate": 3.401079698485947e-05, + "loss": 0.556, + "step": 108520 + }, + { + "epoch": 0.9594405841687441, + "grad_norm": 5.013734340667725, + "learning_rate": 3.40093235971876e-05, + "loss": 0.8058, + "step": 108530 + }, + { + "epoch": 0.9595289874290563, + "grad_norm": 2.7199923992156982, + "learning_rate": 3.400785020951573e-05, + "loss": 0.7137, + "step": 108540 + }, + { + "epoch": 0.9596173906893686, + "grad_norm": 1.7995045185089111, + "learning_rate": 3.400637682184386e-05, + "loss": 0.5576, + "step": 108550 + }, + { + "epoch": 0.9597057939496809, + "grad_norm": 3.206547498703003, + "learning_rate": 3.4004903434171984e-05, + "loss": 0.5018, + "step": 108560 + }, + { + "epoch": 0.9597941972099931, + "grad_norm": 6.213366508483887, + "learning_rate": 3.400343004650012e-05, + "loss": 0.6283, + "step": 108570 + }, + { + "epoch": 0.9598826004703054, + "grad_norm": 8.289933204650879, + "learning_rate": 3.400195665882824e-05, + "loss": 0.6832, + "step": 108580 + }, + { + "epoch": 0.9599710037306176, + "grad_norm": 7.2982964515686035, + "learning_rate": 3.4000483271156376e-05, + "loss": 0.6606, + "step": 108590 + }, + { + "epoch": 0.9600594069909298, + "grad_norm": 5.151221752166748, + "learning_rate": 3.3999009883484504e-05, + "loss": 0.6167, + "step": 108600 + }, + { + "epoch": 0.9601478102512421, + "grad_norm": 2.101573944091797, + "learning_rate": 3.399753649581263e-05, + "loss": 0.6367, + "step": 108610 + }, + { + "epoch": 0.9602362135115543, + "grad_norm": 1.991464614868164, + "learning_rate": 3.399606310814076e-05, + "loss": 0.5753, + "step": 108620 + }, + { + "epoch": 0.9603246167718665, + "grad_norm": 2.156512498855591, + "learning_rate": 3.3994589720468896e-05, + "loss": 0.5939, + "step": 108630 + }, + { + "epoch": 0.9604130200321788, + "grad_norm": 1.273897409439087, + "learning_rate": 3.399311633279702e-05, + "loss": 0.6029, + "step": 108640 + }, + { + "epoch": 0.960501423292491, + "grad_norm": 5.283244609832764, + "learning_rate": 3.399164294512515e-05, + "loss": 0.6629, + "step": 108650 + }, + { + "epoch": 0.9605898265528032, + "grad_norm": 5.072693824768066, + "learning_rate": 3.399016955745328e-05, + "loss": 0.7361, + "step": 108660 + }, + { + "epoch": 0.9606782298131155, + "grad_norm": 5.455718517303467, + "learning_rate": 3.398869616978141e-05, + "loss": 0.7345, + "step": 108670 + }, + { + "epoch": 0.9607666330734278, + "grad_norm": 6.620666027069092, + "learning_rate": 3.398722278210954e-05, + "loss": 0.6886, + "step": 108680 + }, + { + "epoch": 0.96085503633374, + "grad_norm": 8.618492126464844, + "learning_rate": 3.3985749394437666e-05, + "loss": 0.7215, + "step": 108690 + }, + { + "epoch": 0.9609434395940523, + "grad_norm": 6.06873083114624, + "learning_rate": 3.3984276006765794e-05, + "loss": 0.6624, + "step": 108700 + }, + { + "epoch": 0.9610318428543645, + "grad_norm": 5.372738838195801, + "learning_rate": 3.398280261909393e-05, + "loss": 0.6533, + "step": 108710 + }, + { + "epoch": 0.9611202461146767, + "grad_norm": 1.5381653308868408, + "learning_rate": 3.398132923142205e-05, + "loss": 0.6658, + "step": 108720 + }, + { + "epoch": 0.961208649374989, + "grad_norm": 10.10377311706543, + "learning_rate": 3.3979855843750186e-05, + "loss": 0.8451, + "step": 108730 + }, + { + "epoch": 0.9612970526353012, + "grad_norm": 7.399298667907715, + "learning_rate": 3.3978382456078315e-05, + "loss": 0.6932, + "step": 108740 + }, + { + "epoch": 0.9613854558956134, + "grad_norm": 2.0910897254943848, + "learning_rate": 3.397690906840644e-05, + "loss": 0.5614, + "step": 108750 + }, + { + "epoch": 0.9614738591559256, + "grad_norm": 2.026648759841919, + "learning_rate": 3.397543568073457e-05, + "loss": 0.6436, + "step": 108760 + }, + { + "epoch": 0.9615622624162379, + "grad_norm": 2.5890772342681885, + "learning_rate": 3.3973962293062707e-05, + "loss": 0.6001, + "step": 108770 + }, + { + "epoch": 0.9616506656765501, + "grad_norm": 5.992710590362549, + "learning_rate": 3.397248890539083e-05, + "loss": 0.7304, + "step": 108780 + }, + { + "epoch": 0.9617390689368623, + "grad_norm": 3.517192840576172, + "learning_rate": 3.397101551771896e-05, + "loss": 0.7675, + "step": 108790 + }, + { + "epoch": 0.9618274721971747, + "grad_norm": 1.4823883771896362, + "learning_rate": 3.396954213004709e-05, + "loss": 0.5131, + "step": 108800 + }, + { + "epoch": 0.9619158754574869, + "grad_norm": 2.427386522293091, + "learning_rate": 3.396806874237522e-05, + "loss": 0.5861, + "step": 108810 + }, + { + "epoch": 0.9620042787177991, + "grad_norm": 1.6353212594985962, + "learning_rate": 3.396659535470335e-05, + "loss": 0.7368, + "step": 108820 + }, + { + "epoch": 0.9620926819781114, + "grad_norm": 4.7394280433654785, + "learning_rate": 3.396512196703148e-05, + "loss": 0.6409, + "step": 108830 + }, + { + "epoch": 0.9621810852384236, + "grad_norm": 1.347387433052063, + "learning_rate": 3.3963648579359605e-05, + "loss": 0.6165, + "step": 108840 + }, + { + "epoch": 0.9622694884987358, + "grad_norm": 5.459799289703369, + "learning_rate": 3.396217519168774e-05, + "loss": 0.6089, + "step": 108850 + }, + { + "epoch": 0.9623578917590481, + "grad_norm": 1.796870231628418, + "learning_rate": 3.396070180401587e-05, + "loss": 0.5329, + "step": 108860 + }, + { + "epoch": 0.9624462950193603, + "grad_norm": 7.6122846603393555, + "learning_rate": 3.3959228416344e-05, + "loss": 0.6, + "step": 108870 + }, + { + "epoch": 0.9625346982796725, + "grad_norm": 5.274385452270508, + "learning_rate": 3.3957755028672125e-05, + "loss": 0.6126, + "step": 108880 + }, + { + "epoch": 0.9626231015399848, + "grad_norm": 3.3590378761291504, + "learning_rate": 3.3956281641000254e-05, + "loss": 0.6181, + "step": 108890 + }, + { + "epoch": 0.962711504800297, + "grad_norm": 3.373491048812866, + "learning_rate": 3.395480825332838e-05, + "loss": 0.7215, + "step": 108900 + }, + { + "epoch": 0.9627999080606092, + "grad_norm": 4.337435245513916, + "learning_rate": 3.395333486565652e-05, + "loss": 0.6599, + "step": 108910 + }, + { + "epoch": 0.9628883113209216, + "grad_norm": 7.997978687286377, + "learning_rate": 3.3951861477984645e-05, + "loss": 0.6328, + "step": 108920 + }, + { + "epoch": 0.9629767145812338, + "grad_norm": 9.2746000289917, + "learning_rate": 3.3950388090312774e-05, + "loss": 0.7003, + "step": 108930 + }, + { + "epoch": 0.963065117841546, + "grad_norm": 3.2821590900421143, + "learning_rate": 3.39489147026409e-05, + "loss": 0.6253, + "step": 108940 + }, + { + "epoch": 0.9631535211018583, + "grad_norm": 1.6224504709243774, + "learning_rate": 3.394744131496903e-05, + "loss": 0.6933, + "step": 108950 + }, + { + "epoch": 0.9632419243621705, + "grad_norm": 2.4602506160736084, + "learning_rate": 3.394596792729716e-05, + "loss": 0.6175, + "step": 108960 + }, + { + "epoch": 0.9633303276224827, + "grad_norm": 3.0170340538024902, + "learning_rate": 3.394449453962529e-05, + "loss": 0.6648, + "step": 108970 + }, + { + "epoch": 0.963418730882795, + "grad_norm": 4.795486927032471, + "learning_rate": 3.394302115195342e-05, + "loss": 0.6548, + "step": 108980 + }, + { + "epoch": 0.9635071341431072, + "grad_norm": 0.5149299502372742, + "learning_rate": 3.394154776428155e-05, + "loss": 0.5897, + "step": 108990 + }, + { + "epoch": 0.9635955374034194, + "grad_norm": 2.1548361778259277, + "learning_rate": 3.394007437660968e-05, + "loss": 0.5996, + "step": 109000 + }, + { + "epoch": 0.9636839406637316, + "grad_norm": 1.3532441854476929, + "learning_rate": 3.393860098893781e-05, + "loss": 0.5124, + "step": 109010 + }, + { + "epoch": 0.9637723439240439, + "grad_norm": 4.723729133605957, + "learning_rate": 3.3937127601265936e-05, + "loss": 0.5618, + "step": 109020 + }, + { + "epoch": 0.9638607471843561, + "grad_norm": 2.310853958129883, + "learning_rate": 3.3935654213594064e-05, + "loss": 0.7577, + "step": 109030 + }, + { + "epoch": 0.9639491504446684, + "grad_norm": 2.4075546264648438, + "learning_rate": 3.39341808259222e-05, + "loss": 0.6662, + "step": 109040 + }, + { + "epoch": 0.9640375537049807, + "grad_norm": 2.7014997005462646, + "learning_rate": 3.393270743825032e-05, + "loss": 0.6357, + "step": 109050 + }, + { + "epoch": 0.9641259569652929, + "grad_norm": 4.416240692138672, + "learning_rate": 3.3931234050578456e-05, + "loss": 0.6237, + "step": 109060 + }, + { + "epoch": 0.9642143602256051, + "grad_norm": 9.041378021240234, + "learning_rate": 3.3929760662906584e-05, + "loss": 0.6293, + "step": 109070 + }, + { + "epoch": 0.9643027634859174, + "grad_norm": 3.652994155883789, + "learning_rate": 3.392828727523471e-05, + "loss": 0.647, + "step": 109080 + }, + { + "epoch": 0.9643911667462296, + "grad_norm": 4.848643779754639, + "learning_rate": 3.392681388756284e-05, + "loss": 0.648, + "step": 109090 + }, + { + "epoch": 0.9644795700065418, + "grad_norm": 1.7461626529693604, + "learning_rate": 3.3925340499890976e-05, + "loss": 0.6352, + "step": 109100 + }, + { + "epoch": 0.9645679732668541, + "grad_norm": 2.765460968017578, + "learning_rate": 3.39238671122191e-05, + "loss": 0.6671, + "step": 109110 + }, + { + "epoch": 0.9646563765271663, + "grad_norm": 2.8923518657684326, + "learning_rate": 3.392239372454723e-05, + "loss": 0.6095, + "step": 109120 + }, + { + "epoch": 0.9647447797874785, + "grad_norm": 2.7751667499542236, + "learning_rate": 3.392092033687536e-05, + "loss": 0.6744, + "step": 109130 + }, + { + "epoch": 0.9648331830477908, + "grad_norm": 3.464087963104248, + "learning_rate": 3.391944694920349e-05, + "loss": 0.7128, + "step": 109140 + }, + { + "epoch": 0.9649215863081031, + "grad_norm": 2.119079828262329, + "learning_rate": 3.391797356153162e-05, + "loss": 0.5623, + "step": 109150 + }, + { + "epoch": 0.9650099895684153, + "grad_norm": 1.3652396202087402, + "learning_rate": 3.3916500173859746e-05, + "loss": 0.7196, + "step": 109160 + }, + { + "epoch": 0.9650983928287276, + "grad_norm": 2.512056350708008, + "learning_rate": 3.3915026786187875e-05, + "loss": 0.7035, + "step": 109170 + }, + { + "epoch": 0.9651867960890398, + "grad_norm": 19.401247024536133, + "learning_rate": 3.391355339851601e-05, + "loss": 0.7211, + "step": 109180 + }, + { + "epoch": 0.965275199349352, + "grad_norm": 3.0264909267425537, + "learning_rate": 3.391208001084413e-05, + "loss": 0.5732, + "step": 109190 + }, + { + "epoch": 0.9653636026096643, + "grad_norm": 1.6094777584075928, + "learning_rate": 3.3910606623172266e-05, + "loss": 0.599, + "step": 109200 + }, + { + "epoch": 0.9654520058699765, + "grad_norm": 2.056447982788086, + "learning_rate": 3.3909133235500395e-05, + "loss": 0.6274, + "step": 109210 + }, + { + "epoch": 0.9655404091302887, + "grad_norm": 1.327606439590454, + "learning_rate": 3.390765984782852e-05, + "loss": 0.6552, + "step": 109220 + }, + { + "epoch": 0.9656288123906009, + "grad_norm": 2.1870946884155273, + "learning_rate": 3.390618646015665e-05, + "loss": 0.9189, + "step": 109230 + }, + { + "epoch": 0.9657172156509132, + "grad_norm": 4.863929748535156, + "learning_rate": 3.3904713072484787e-05, + "loss": 0.696, + "step": 109240 + }, + { + "epoch": 0.9658056189112254, + "grad_norm": 5.8540449142456055, + "learning_rate": 3.390323968481291e-05, + "loss": 0.6276, + "step": 109250 + }, + { + "epoch": 0.9658940221715376, + "grad_norm": 2.619868755340576, + "learning_rate": 3.390176629714104e-05, + "loss": 0.6666, + "step": 109260 + }, + { + "epoch": 0.96598242543185, + "grad_norm": 5.3761444091796875, + "learning_rate": 3.3900292909469165e-05, + "loss": 0.6263, + "step": 109270 + }, + { + "epoch": 0.9660708286921622, + "grad_norm": 3.354771852493286, + "learning_rate": 3.38988195217973e-05, + "loss": 0.6204, + "step": 109280 + }, + { + "epoch": 0.9661592319524744, + "grad_norm": 2.9355287551879883, + "learning_rate": 3.389734613412543e-05, + "loss": 0.5347, + "step": 109290 + }, + { + "epoch": 0.9662476352127867, + "grad_norm": 1.8563660383224487, + "learning_rate": 3.389587274645356e-05, + "loss": 0.6203, + "step": 109300 + }, + { + "epoch": 0.9663360384730989, + "grad_norm": 1.3764989376068115, + "learning_rate": 3.3894399358781685e-05, + "loss": 0.6076, + "step": 109310 + }, + { + "epoch": 0.9664244417334111, + "grad_norm": 1.4530961513519287, + "learning_rate": 3.389292597110982e-05, + "loss": 0.5924, + "step": 109320 + }, + { + "epoch": 0.9665128449937234, + "grad_norm": 7.1609578132629395, + "learning_rate": 3.389145258343794e-05, + "loss": 0.4917, + "step": 109330 + }, + { + "epoch": 0.9666012482540356, + "grad_norm": 2.37711238861084, + "learning_rate": 3.388997919576608e-05, + "loss": 0.6494, + "step": 109340 + }, + { + "epoch": 0.9666896515143478, + "grad_norm": 3.3385396003723145, + "learning_rate": 3.3888505808094205e-05, + "loss": 0.6843, + "step": 109350 + }, + { + "epoch": 0.9667780547746601, + "grad_norm": 4.974733829498291, + "learning_rate": 3.3887032420422334e-05, + "loss": 0.5208, + "step": 109360 + }, + { + "epoch": 0.9668664580349723, + "grad_norm": 3.60251522064209, + "learning_rate": 3.388555903275046e-05, + "loss": 0.6596, + "step": 109370 + }, + { + "epoch": 0.9669548612952845, + "grad_norm": 7.9344024658203125, + "learning_rate": 3.38840856450786e-05, + "loss": 0.655, + "step": 109380 + }, + { + "epoch": 0.9670432645555969, + "grad_norm": 0.9193863272666931, + "learning_rate": 3.388261225740672e-05, + "loss": 0.6953, + "step": 109390 + }, + { + "epoch": 0.9671316678159091, + "grad_norm": 1.1478573083877563, + "learning_rate": 3.3881138869734854e-05, + "loss": 0.6202, + "step": 109400 + }, + { + "epoch": 0.9672200710762213, + "grad_norm": 6.346615791320801, + "learning_rate": 3.3879665482062975e-05, + "loss": 0.7277, + "step": 109410 + }, + { + "epoch": 0.9673084743365336, + "grad_norm": 4.443350315093994, + "learning_rate": 3.387819209439111e-05, + "loss": 0.5476, + "step": 109420 + }, + { + "epoch": 0.9673968775968458, + "grad_norm": 2.1637489795684814, + "learning_rate": 3.387671870671924e-05, + "loss": 0.664, + "step": 109430 + }, + { + "epoch": 0.967485280857158, + "grad_norm": 1.067169189453125, + "learning_rate": 3.387524531904737e-05, + "loss": 0.6302, + "step": 109440 + }, + { + "epoch": 0.9675736841174702, + "grad_norm": 2.1617817878723145, + "learning_rate": 3.3873771931375496e-05, + "loss": 0.5546, + "step": 109450 + }, + { + "epoch": 0.9676620873777825, + "grad_norm": 3.400785446166992, + "learning_rate": 3.387229854370363e-05, + "loss": 0.6219, + "step": 109460 + }, + { + "epoch": 0.9677504906380947, + "grad_norm": 0.7347487211227417, + "learning_rate": 3.387082515603175e-05, + "loss": 0.5365, + "step": 109470 + }, + { + "epoch": 0.9678388938984069, + "grad_norm": 1.5539226531982422, + "learning_rate": 3.386935176835989e-05, + "loss": 0.5995, + "step": 109480 + }, + { + "epoch": 0.9679272971587192, + "grad_norm": 2.5931179523468018, + "learning_rate": 3.3867878380688016e-05, + "loss": 0.6951, + "step": 109490 + }, + { + "epoch": 0.9680157004190314, + "grad_norm": 2.577695608139038, + "learning_rate": 3.3866404993016144e-05, + "loss": 0.711, + "step": 109500 + }, + { + "epoch": 0.9681041036793437, + "grad_norm": 3.2594804763793945, + "learning_rate": 3.386493160534427e-05, + "loss": 0.7247, + "step": 109510 + }, + { + "epoch": 0.968192506939656, + "grad_norm": 3.4535417556762695, + "learning_rate": 3.38634582176724e-05, + "loss": 0.6625, + "step": 109520 + }, + { + "epoch": 0.9682809101999682, + "grad_norm": 2.1193764209747314, + "learning_rate": 3.386198483000053e-05, + "loss": 0.6772, + "step": 109530 + }, + { + "epoch": 0.9683693134602804, + "grad_norm": 1.722532033920288, + "learning_rate": 3.3860511442328664e-05, + "loss": 0.728, + "step": 109540 + }, + { + "epoch": 0.9684577167205927, + "grad_norm": 3.240015983581543, + "learning_rate": 3.3859038054656786e-05, + "loss": 0.5037, + "step": 109550 + }, + { + "epoch": 0.9685461199809049, + "grad_norm": 4.359414100646973, + "learning_rate": 3.385756466698492e-05, + "loss": 0.7401, + "step": 109560 + }, + { + "epoch": 0.9686345232412171, + "grad_norm": 1.9799870252609253, + "learning_rate": 3.385609127931305e-05, + "loss": 0.7385, + "step": 109570 + }, + { + "epoch": 0.9687229265015294, + "grad_norm": 3.4243040084838867, + "learning_rate": 3.385461789164118e-05, + "loss": 0.6147, + "step": 109580 + }, + { + "epoch": 0.9688113297618416, + "grad_norm": 1.2312356233596802, + "learning_rate": 3.3853144503969306e-05, + "loss": 0.71, + "step": 109590 + }, + { + "epoch": 0.9688997330221538, + "grad_norm": 1.8239474296569824, + "learning_rate": 3.385167111629744e-05, + "loss": 0.7736, + "step": 109600 + }, + { + "epoch": 0.968988136282466, + "grad_norm": 5.502866744995117, + "learning_rate": 3.385019772862556e-05, + "loss": 0.6725, + "step": 109610 + }, + { + "epoch": 0.9690765395427784, + "grad_norm": 1.8239455223083496, + "learning_rate": 3.38487243409537e-05, + "loss": 0.5721, + "step": 109620 + }, + { + "epoch": 0.9691649428030906, + "grad_norm": 6.233642101287842, + "learning_rate": 3.384725095328182e-05, + "loss": 0.5787, + "step": 109630 + }, + { + "epoch": 0.9692533460634029, + "grad_norm": 2.3092336654663086, + "learning_rate": 3.3845777565609955e-05, + "loss": 0.666, + "step": 109640 + }, + { + "epoch": 0.9693417493237151, + "grad_norm": 2.7663841247558594, + "learning_rate": 3.384430417793808e-05, + "loss": 0.7158, + "step": 109650 + }, + { + "epoch": 0.9694301525840273, + "grad_norm": 0.8950415253639221, + "learning_rate": 3.384283079026621e-05, + "loss": 0.6888, + "step": 109660 + }, + { + "epoch": 0.9695185558443395, + "grad_norm": 2.2749557495117188, + "learning_rate": 3.384135740259434e-05, + "loss": 0.6463, + "step": 109670 + }, + { + "epoch": 0.9696069591046518, + "grad_norm": 3.1506237983703613, + "learning_rate": 3.3839884014922475e-05, + "loss": 0.7202, + "step": 109680 + }, + { + "epoch": 0.969695362364964, + "grad_norm": 1.6731704473495483, + "learning_rate": 3.3838410627250596e-05, + "loss": 0.5419, + "step": 109690 + }, + { + "epoch": 0.9697837656252762, + "grad_norm": 7.018250942230225, + "learning_rate": 3.383693723957873e-05, + "loss": 0.706, + "step": 109700 + }, + { + "epoch": 0.9698721688855885, + "grad_norm": 4.473487854003906, + "learning_rate": 3.383546385190686e-05, + "loss": 0.6328, + "step": 109710 + }, + { + "epoch": 0.9699605721459007, + "grad_norm": 2.5180888175964355, + "learning_rate": 3.383399046423499e-05, + "loss": 0.7621, + "step": 109720 + }, + { + "epoch": 0.9700489754062129, + "grad_norm": 2.5558810234069824, + "learning_rate": 3.3832517076563117e-05, + "loss": 0.7036, + "step": 109730 + }, + { + "epoch": 0.9701373786665253, + "grad_norm": 3.7552618980407715, + "learning_rate": 3.3831043688891245e-05, + "loss": 0.6776, + "step": 109740 + }, + { + "epoch": 0.9702257819268375, + "grad_norm": 3.698937177658081, + "learning_rate": 3.382957030121937e-05, + "loss": 0.6538, + "step": 109750 + }, + { + "epoch": 0.9703141851871497, + "grad_norm": 3.056321859359741, + "learning_rate": 3.382809691354751e-05, + "loss": 0.6508, + "step": 109760 + }, + { + "epoch": 0.970402588447462, + "grad_norm": 4.742011070251465, + "learning_rate": 3.382662352587564e-05, + "loss": 0.5984, + "step": 109770 + }, + { + "epoch": 0.9704909917077742, + "grad_norm": 2.4124088287353516, + "learning_rate": 3.3825150138203765e-05, + "loss": 0.5993, + "step": 109780 + }, + { + "epoch": 0.9705793949680864, + "grad_norm": 2.9382693767547607, + "learning_rate": 3.3823676750531893e-05, + "loss": 0.5611, + "step": 109790 + }, + { + "epoch": 0.9706677982283987, + "grad_norm": 1.912672758102417, + "learning_rate": 3.382220336286002e-05, + "loss": 0.5495, + "step": 109800 + }, + { + "epoch": 0.9707562014887109, + "grad_norm": 1.6104891300201416, + "learning_rate": 3.382072997518815e-05, + "loss": 0.5484, + "step": 109810 + }, + { + "epoch": 0.9708446047490231, + "grad_norm": 13.708107948303223, + "learning_rate": 3.3819256587516285e-05, + "loss": 0.6513, + "step": 109820 + }, + { + "epoch": 0.9709330080093354, + "grad_norm": 2.5372703075408936, + "learning_rate": 3.3817783199844414e-05, + "loss": 0.6601, + "step": 109830 + }, + { + "epoch": 0.9710214112696476, + "grad_norm": 2.0031867027282715, + "learning_rate": 3.381630981217254e-05, + "loss": 0.6349, + "step": 109840 + }, + { + "epoch": 0.9711098145299598, + "grad_norm": 28.69565200805664, + "learning_rate": 3.381483642450067e-05, + "loss": 0.6368, + "step": 109850 + }, + { + "epoch": 0.9711982177902722, + "grad_norm": 2.5293679237365723, + "learning_rate": 3.38133630368288e-05, + "loss": 0.7287, + "step": 109860 + }, + { + "epoch": 0.9712866210505844, + "grad_norm": 5.0612945556640625, + "learning_rate": 3.381188964915693e-05, + "loss": 0.6839, + "step": 109870 + }, + { + "epoch": 0.9713750243108966, + "grad_norm": 10.466364860534668, + "learning_rate": 3.3810416261485055e-05, + "loss": 0.6798, + "step": 109880 + }, + { + "epoch": 0.9714634275712088, + "grad_norm": 1.5394682884216309, + "learning_rate": 3.380894287381319e-05, + "loss": 0.6237, + "step": 109890 + }, + { + "epoch": 0.9715518308315211, + "grad_norm": 1.3443679809570312, + "learning_rate": 3.380746948614132e-05, + "loss": 0.5754, + "step": 109900 + }, + { + "epoch": 0.9716402340918333, + "grad_norm": 12.633038520812988, + "learning_rate": 3.380599609846945e-05, + "loss": 0.7017, + "step": 109910 + }, + { + "epoch": 0.9717286373521455, + "grad_norm": 2.1645145416259766, + "learning_rate": 3.3804522710797576e-05, + "loss": 0.4924, + "step": 109920 + }, + { + "epoch": 0.9718170406124578, + "grad_norm": 5.534739971160889, + "learning_rate": 3.3803049323125704e-05, + "loss": 0.6194, + "step": 109930 + }, + { + "epoch": 0.97190544387277, + "grad_norm": 8.998230934143066, + "learning_rate": 3.380157593545383e-05, + "loss": 0.5662, + "step": 109940 + }, + { + "epoch": 0.9719938471330822, + "grad_norm": 3.581232786178589, + "learning_rate": 3.380010254778197e-05, + "loss": 0.9005, + "step": 109950 + }, + { + "epoch": 0.9720822503933945, + "grad_norm": 6.763643741607666, + "learning_rate": 3.3798629160110096e-05, + "loss": 0.685, + "step": 109960 + }, + { + "epoch": 0.9721706536537067, + "grad_norm": 6.694692134857178, + "learning_rate": 3.3797155772438224e-05, + "loss": 0.5331, + "step": 109970 + }, + { + "epoch": 0.972259056914019, + "grad_norm": 1.338887333869934, + "learning_rate": 3.379568238476635e-05, + "loss": 0.7075, + "step": 109980 + }, + { + "epoch": 0.9723474601743313, + "grad_norm": 1.2556545734405518, + "learning_rate": 3.379420899709448e-05, + "loss": 0.679, + "step": 109990 + }, + { + "epoch": 0.9724358634346435, + "grad_norm": 3.0473825931549072, + "learning_rate": 3.379273560942261e-05, + "loss": 0.6411, + "step": 110000 + }, + { + "epoch": 0.9725242666949557, + "grad_norm": 0.9802045822143555, + "learning_rate": 3.3791262221750744e-05, + "loss": 0.6186, + "step": 110010 + }, + { + "epoch": 0.972612669955268, + "grad_norm": 3.4868831634521484, + "learning_rate": 3.3789788834078866e-05, + "loss": 0.7297, + "step": 110020 + }, + { + "epoch": 0.9727010732155802, + "grad_norm": 5.8326826095581055, + "learning_rate": 3.3788315446407e-05, + "loss": 0.6502, + "step": 110030 + }, + { + "epoch": 0.9727894764758924, + "grad_norm": 1.454392910003662, + "learning_rate": 3.378684205873513e-05, + "loss": 0.5798, + "step": 110040 + }, + { + "epoch": 0.9728778797362047, + "grad_norm": 1.8410056829452515, + "learning_rate": 3.378536867106326e-05, + "loss": 0.7029, + "step": 110050 + }, + { + "epoch": 0.9729662829965169, + "grad_norm": 2.8436992168426514, + "learning_rate": 3.3783895283391386e-05, + "loss": 0.7533, + "step": 110060 + }, + { + "epoch": 0.9730546862568291, + "grad_norm": 1.9255040884017944, + "learning_rate": 3.378242189571952e-05, + "loss": 0.7105, + "step": 110070 + }, + { + "epoch": 0.9731430895171413, + "grad_norm": 3.409846544265747, + "learning_rate": 3.378094850804764e-05, + "loss": 0.5517, + "step": 110080 + }, + { + "epoch": 0.9732314927774536, + "grad_norm": 2.6061630249023438, + "learning_rate": 3.377947512037578e-05, + "loss": 0.5854, + "step": 110090 + }, + { + "epoch": 0.9733198960377659, + "grad_norm": 2.2798073291778564, + "learning_rate": 3.37780017327039e-05, + "loss": 0.6064, + "step": 110100 + }, + { + "epoch": 0.9734082992980782, + "grad_norm": 5.488577365875244, + "learning_rate": 3.3776528345032035e-05, + "loss": 0.5987, + "step": 110110 + }, + { + "epoch": 0.9734967025583904, + "grad_norm": 4.310491561889648, + "learning_rate": 3.377505495736016e-05, + "loss": 0.6977, + "step": 110120 + }, + { + "epoch": 0.9735851058187026, + "grad_norm": 1.363355040550232, + "learning_rate": 3.377358156968829e-05, + "loss": 0.6896, + "step": 110130 + }, + { + "epoch": 0.9736735090790148, + "grad_norm": 7.883826732635498, + "learning_rate": 3.377210818201642e-05, + "loss": 0.6362, + "step": 110140 + }, + { + "epoch": 0.9737619123393271, + "grad_norm": 2.7764811515808105, + "learning_rate": 3.3770634794344555e-05, + "loss": 0.6291, + "step": 110150 + }, + { + "epoch": 0.9738503155996393, + "grad_norm": 1.728932499885559, + "learning_rate": 3.3769161406672676e-05, + "loss": 0.6376, + "step": 110160 + }, + { + "epoch": 0.9739387188599515, + "grad_norm": 2.01326847076416, + "learning_rate": 3.376768801900081e-05, + "loss": 0.6448, + "step": 110170 + }, + { + "epoch": 0.9740271221202638, + "grad_norm": 2.6898386478424072, + "learning_rate": 3.376621463132894e-05, + "loss": 0.6115, + "step": 110180 + }, + { + "epoch": 0.974115525380576, + "grad_norm": 3.0468032360076904, + "learning_rate": 3.376474124365707e-05, + "loss": 0.6463, + "step": 110190 + }, + { + "epoch": 0.9742039286408882, + "grad_norm": 1.594526767730713, + "learning_rate": 3.37632678559852e-05, + "loss": 0.6194, + "step": 110200 + }, + { + "epoch": 0.9742923319012006, + "grad_norm": 3.7182111740112305, + "learning_rate": 3.3761794468313325e-05, + "loss": 0.6727, + "step": 110210 + }, + { + "epoch": 0.9743807351615128, + "grad_norm": 1.7522857189178467, + "learning_rate": 3.376032108064145e-05, + "loss": 0.786, + "step": 110220 + }, + { + "epoch": 0.974469138421825, + "grad_norm": 4.859210014343262, + "learning_rate": 3.375884769296959e-05, + "loss": 0.6115, + "step": 110230 + }, + { + "epoch": 0.9745575416821373, + "grad_norm": 1.91049325466156, + "learning_rate": 3.375737430529771e-05, + "loss": 0.5978, + "step": 110240 + }, + { + "epoch": 0.9746459449424495, + "grad_norm": 2.0873186588287354, + "learning_rate": 3.3755900917625845e-05, + "loss": 0.6399, + "step": 110250 + }, + { + "epoch": 0.9747343482027617, + "grad_norm": 2.0097036361694336, + "learning_rate": 3.3754427529953974e-05, + "loss": 0.702, + "step": 110260 + }, + { + "epoch": 0.974822751463074, + "grad_norm": 2.9090471267700195, + "learning_rate": 3.37529541422821e-05, + "loss": 0.5447, + "step": 110270 + }, + { + "epoch": 0.9749111547233862, + "grad_norm": 2.7814157009124756, + "learning_rate": 3.375148075461023e-05, + "loss": 0.6631, + "step": 110280 + }, + { + "epoch": 0.9749995579836984, + "grad_norm": 3.743830680847168, + "learning_rate": 3.3750007366938365e-05, + "loss": 0.6979, + "step": 110290 + }, + { + "epoch": 0.9750879612440106, + "grad_norm": 1.973961353302002, + "learning_rate": 3.374853397926649e-05, + "loss": 0.6843, + "step": 110300 + }, + { + "epoch": 0.9751763645043229, + "grad_norm": 0.7969472408294678, + "learning_rate": 3.374706059159462e-05, + "loss": 0.515, + "step": 110310 + }, + { + "epoch": 0.9752647677646351, + "grad_norm": 0.8885216116905212, + "learning_rate": 3.374558720392275e-05, + "loss": 0.5584, + "step": 110320 + }, + { + "epoch": 0.9753531710249475, + "grad_norm": 3.993535041809082, + "learning_rate": 3.374411381625088e-05, + "loss": 0.6791, + "step": 110330 + }, + { + "epoch": 0.9754415742852597, + "grad_norm": 2.560508966445923, + "learning_rate": 3.374264042857901e-05, + "loss": 0.5503, + "step": 110340 + }, + { + "epoch": 0.9755299775455719, + "grad_norm": 0.5963724255561829, + "learning_rate": 3.3741167040907136e-05, + "loss": 0.6366, + "step": 110350 + }, + { + "epoch": 0.9756183808058841, + "grad_norm": 1.0845046043395996, + "learning_rate": 3.3739693653235264e-05, + "loss": 0.6715, + "step": 110360 + }, + { + "epoch": 0.9757067840661964, + "grad_norm": 3.978794574737549, + "learning_rate": 3.37382202655634e-05, + "loss": 0.6431, + "step": 110370 + }, + { + "epoch": 0.9757951873265086, + "grad_norm": 6.08585786819458, + "learning_rate": 3.373674687789152e-05, + "loss": 0.6128, + "step": 110380 + }, + { + "epoch": 0.9758835905868208, + "grad_norm": 1.7601128816604614, + "learning_rate": 3.3735273490219656e-05, + "loss": 0.5619, + "step": 110390 + }, + { + "epoch": 0.9759719938471331, + "grad_norm": 2.1101343631744385, + "learning_rate": 3.3733800102547784e-05, + "loss": 0.761, + "step": 110400 + }, + { + "epoch": 0.9760603971074453, + "grad_norm": 2.4930419921875, + "learning_rate": 3.373232671487591e-05, + "loss": 0.6937, + "step": 110410 + }, + { + "epoch": 0.9761488003677575, + "grad_norm": 6.182460784912109, + "learning_rate": 3.373085332720404e-05, + "loss": 0.7751, + "step": 110420 + }, + { + "epoch": 0.9762372036280698, + "grad_norm": 3.218567371368408, + "learning_rate": 3.3729379939532176e-05, + "loss": 0.6546, + "step": 110430 + }, + { + "epoch": 0.976325606888382, + "grad_norm": 1.4082136154174805, + "learning_rate": 3.37279065518603e-05, + "loss": 0.5433, + "step": 110440 + }, + { + "epoch": 0.9764140101486943, + "grad_norm": 1.3678123950958252, + "learning_rate": 3.372643316418843e-05, + "loss": 0.6713, + "step": 110450 + }, + { + "epoch": 0.9765024134090066, + "grad_norm": 2.181298017501831, + "learning_rate": 3.3724959776516554e-05, + "loss": 0.6716, + "step": 110460 + }, + { + "epoch": 0.9765908166693188, + "grad_norm": 1.5102198123931885, + "learning_rate": 3.372348638884469e-05, + "loss": 0.7715, + "step": 110470 + }, + { + "epoch": 0.976679219929631, + "grad_norm": 4.327232360839844, + "learning_rate": 3.372201300117282e-05, + "loss": 0.6835, + "step": 110480 + }, + { + "epoch": 0.9767676231899433, + "grad_norm": 2.2418100833892822, + "learning_rate": 3.3720539613500946e-05, + "loss": 0.4445, + "step": 110490 + }, + { + "epoch": 0.9768560264502555, + "grad_norm": 1.5772608518600464, + "learning_rate": 3.3719066225829074e-05, + "loss": 0.5464, + "step": 110500 + }, + { + "epoch": 0.9769444297105677, + "grad_norm": 3.4416377544403076, + "learning_rate": 3.371759283815721e-05, + "loss": 0.5939, + "step": 110510 + }, + { + "epoch": 0.97703283297088, + "grad_norm": 4.3905768394470215, + "learning_rate": 3.371611945048533e-05, + "loss": 0.8269, + "step": 110520 + }, + { + "epoch": 0.9771212362311922, + "grad_norm": 11.387849807739258, + "learning_rate": 3.3714646062813466e-05, + "loss": 0.6816, + "step": 110530 + }, + { + "epoch": 0.9772096394915044, + "grad_norm": 1.5221426486968994, + "learning_rate": 3.3713172675141595e-05, + "loss": 0.6146, + "step": 110540 + }, + { + "epoch": 0.9772980427518166, + "grad_norm": 5.3909101486206055, + "learning_rate": 3.371169928746972e-05, + "loss": 0.6093, + "step": 110550 + }, + { + "epoch": 0.9773864460121289, + "grad_norm": 2.585848808288574, + "learning_rate": 3.371022589979785e-05, + "loss": 0.6838, + "step": 110560 + }, + { + "epoch": 0.9774748492724412, + "grad_norm": 2.381023406982422, + "learning_rate": 3.370875251212598e-05, + "loss": 0.8033, + "step": 110570 + }, + { + "epoch": 0.9775632525327534, + "grad_norm": 4.694600582122803, + "learning_rate": 3.370727912445411e-05, + "loss": 0.704, + "step": 110580 + }, + { + "epoch": 0.9776516557930657, + "grad_norm": 4.067041873931885, + "learning_rate": 3.370580573678224e-05, + "loss": 0.8415, + "step": 110590 + }, + { + "epoch": 0.9777400590533779, + "grad_norm": 4.064582824707031, + "learning_rate": 3.3704332349110365e-05, + "loss": 0.6408, + "step": 110600 + }, + { + "epoch": 0.9778284623136901, + "grad_norm": 1.6221612691879272, + "learning_rate": 3.37028589614385e-05, + "loss": 0.7663, + "step": 110610 + }, + { + "epoch": 0.9779168655740024, + "grad_norm": 5.154999256134033, + "learning_rate": 3.370138557376663e-05, + "loss": 0.7006, + "step": 110620 + }, + { + "epoch": 0.9780052688343146, + "grad_norm": 3.455639123916626, + "learning_rate": 3.3699912186094757e-05, + "loss": 0.5376, + "step": 110630 + }, + { + "epoch": 0.9780936720946268, + "grad_norm": 1.3004995584487915, + "learning_rate": 3.3698438798422885e-05, + "loss": 0.4797, + "step": 110640 + }, + { + "epoch": 0.9781820753549391, + "grad_norm": 6.261152744293213, + "learning_rate": 3.369696541075102e-05, + "loss": 0.6105, + "step": 110650 + }, + { + "epoch": 0.9782704786152513, + "grad_norm": 12.539459228515625, + "learning_rate": 3.369549202307914e-05, + "loss": 0.6637, + "step": 110660 + }, + { + "epoch": 0.9783588818755635, + "grad_norm": 7.47205114364624, + "learning_rate": 3.369401863540728e-05, + "loss": 0.6604, + "step": 110670 + }, + { + "epoch": 0.9784472851358759, + "grad_norm": 1.115787386894226, + "learning_rate": 3.3692545247735405e-05, + "loss": 0.6671, + "step": 110680 + }, + { + "epoch": 0.9785356883961881, + "grad_norm": 1.0838844776153564, + "learning_rate": 3.3691071860063533e-05, + "loss": 0.5285, + "step": 110690 + }, + { + "epoch": 0.9786240916565003, + "grad_norm": 4.756259441375732, + "learning_rate": 3.368959847239166e-05, + "loss": 0.6813, + "step": 110700 + }, + { + "epoch": 0.9787124949168126, + "grad_norm": 8.691108703613281, + "learning_rate": 3.368812508471979e-05, + "loss": 0.6325, + "step": 110710 + }, + { + "epoch": 0.9788008981771248, + "grad_norm": 2.5743041038513184, + "learning_rate": 3.368665169704792e-05, + "loss": 0.6698, + "step": 110720 + }, + { + "epoch": 0.978889301437437, + "grad_norm": 3.716695547103882, + "learning_rate": 3.3685178309376054e-05, + "loss": 0.6944, + "step": 110730 + }, + { + "epoch": 0.9789777046977493, + "grad_norm": 4.441263675689697, + "learning_rate": 3.368370492170418e-05, + "loss": 0.5897, + "step": 110740 + }, + { + "epoch": 0.9790661079580615, + "grad_norm": 3.7443573474884033, + "learning_rate": 3.368223153403231e-05, + "loss": 0.6148, + "step": 110750 + }, + { + "epoch": 0.9791545112183737, + "grad_norm": 7.481271266937256, + "learning_rate": 3.368075814636044e-05, + "loss": 0.5692, + "step": 110760 + }, + { + "epoch": 0.9792429144786859, + "grad_norm": 1.6937706470489502, + "learning_rate": 3.367928475868857e-05, + "loss": 0.6752, + "step": 110770 + }, + { + "epoch": 0.9793313177389982, + "grad_norm": 2.336862087249756, + "learning_rate": 3.3677811371016695e-05, + "loss": 0.5876, + "step": 110780 + }, + { + "epoch": 0.9794197209993104, + "grad_norm": 2.6145122051239014, + "learning_rate": 3.367633798334483e-05, + "loss": 0.7259, + "step": 110790 + }, + { + "epoch": 0.9795081242596227, + "grad_norm": 4.576858043670654, + "learning_rate": 3.367486459567296e-05, + "loss": 0.6567, + "step": 110800 + }, + { + "epoch": 0.979596527519935, + "grad_norm": 2.2098803520202637, + "learning_rate": 3.367339120800109e-05, + "loss": 0.6244, + "step": 110810 + }, + { + "epoch": 0.9796849307802472, + "grad_norm": 2.3720593452453613, + "learning_rate": 3.3671917820329216e-05, + "loss": 0.7029, + "step": 110820 + }, + { + "epoch": 0.9797733340405594, + "grad_norm": 1.2826426029205322, + "learning_rate": 3.3670444432657344e-05, + "loss": 0.6412, + "step": 110830 + }, + { + "epoch": 0.9798617373008717, + "grad_norm": 8.829693794250488, + "learning_rate": 3.366897104498547e-05, + "loss": 0.6462, + "step": 110840 + }, + { + "epoch": 0.9799501405611839, + "grad_norm": 4.648908615112305, + "learning_rate": 3.36674976573136e-05, + "loss": 0.5905, + "step": 110850 + }, + { + "epoch": 0.9800385438214961, + "grad_norm": 0.8421372175216675, + "learning_rate": 3.3666024269641736e-05, + "loss": 0.566, + "step": 110860 + }, + { + "epoch": 0.9801269470818084, + "grad_norm": 6.786984443664551, + "learning_rate": 3.3664550881969864e-05, + "loss": 0.6242, + "step": 110870 + }, + { + "epoch": 0.9802153503421206, + "grad_norm": 1.9286582469940186, + "learning_rate": 3.366307749429799e-05, + "loss": 0.6462, + "step": 110880 + }, + { + "epoch": 0.9803037536024328, + "grad_norm": 2.2931947708129883, + "learning_rate": 3.366160410662612e-05, + "loss": 0.584, + "step": 110890 + }, + { + "epoch": 0.9803921568627451, + "grad_norm": 1.8378829956054688, + "learning_rate": 3.366013071895425e-05, + "loss": 0.5361, + "step": 110900 + }, + { + "epoch": 0.9804805601230573, + "grad_norm": 4.251286029815674, + "learning_rate": 3.365865733128238e-05, + "loss": 0.574, + "step": 110910 + }, + { + "epoch": 0.9805689633833696, + "grad_norm": 2.6717238426208496, + "learning_rate": 3.365718394361051e-05, + "loss": 0.6065, + "step": 110920 + }, + { + "epoch": 0.9806573666436819, + "grad_norm": 1.7513400316238403, + "learning_rate": 3.3655710555938634e-05, + "loss": 0.697, + "step": 110930 + }, + { + "epoch": 0.9807457699039941, + "grad_norm": 1.9559534788131714, + "learning_rate": 3.365423716826677e-05, + "loss": 0.6521, + "step": 110940 + }, + { + "epoch": 0.9808341731643063, + "grad_norm": 5.948936939239502, + "learning_rate": 3.36527637805949e-05, + "loss": 0.6283, + "step": 110950 + }, + { + "epoch": 0.9809225764246186, + "grad_norm": 1.768458604812622, + "learning_rate": 3.3651290392923026e-05, + "loss": 0.6137, + "step": 110960 + }, + { + "epoch": 0.9810109796849308, + "grad_norm": 2.430455207824707, + "learning_rate": 3.3649817005251154e-05, + "loss": 0.5641, + "step": 110970 + }, + { + "epoch": 0.981099382945243, + "grad_norm": 7.473715305328369, + "learning_rate": 3.364834361757929e-05, + "loss": 0.6087, + "step": 110980 + }, + { + "epoch": 0.9811877862055552, + "grad_norm": 7.1489338874816895, + "learning_rate": 3.364687022990741e-05, + "loss": 0.592, + "step": 110990 + }, + { + "epoch": 0.9812761894658675, + "grad_norm": 3.24849271774292, + "learning_rate": 3.3645396842235546e-05, + "loss": 0.7402, + "step": 111000 + }, + { + "epoch": 0.9813645927261797, + "grad_norm": 2.569237470626831, + "learning_rate": 3.3643923454563675e-05, + "loss": 0.6987, + "step": 111010 + }, + { + "epoch": 0.9814529959864919, + "grad_norm": 2.116116523742676, + "learning_rate": 3.36424500668918e-05, + "loss": 0.767, + "step": 111020 + }, + { + "epoch": 0.9815413992468042, + "grad_norm": 2.550372362136841, + "learning_rate": 3.364097667921993e-05, + "loss": 0.5893, + "step": 111030 + }, + { + "epoch": 0.9816298025071165, + "grad_norm": 1.1438673734664917, + "learning_rate": 3.363950329154806e-05, + "loss": 0.6422, + "step": 111040 + }, + { + "epoch": 0.9817182057674287, + "grad_norm": 3.763845920562744, + "learning_rate": 3.363802990387619e-05, + "loss": 0.6291, + "step": 111050 + }, + { + "epoch": 0.981806609027741, + "grad_norm": 3.5108656883239746, + "learning_rate": 3.363655651620432e-05, + "loss": 0.7266, + "step": 111060 + }, + { + "epoch": 0.9818950122880532, + "grad_norm": 4.089592933654785, + "learning_rate": 3.3635083128532445e-05, + "loss": 0.6194, + "step": 111070 + }, + { + "epoch": 0.9819834155483654, + "grad_norm": 10.053905487060547, + "learning_rate": 3.363360974086058e-05, + "loss": 0.6434, + "step": 111080 + }, + { + "epoch": 0.9820718188086777, + "grad_norm": 2.0097062587738037, + "learning_rate": 3.363213635318871e-05, + "loss": 0.6876, + "step": 111090 + }, + { + "epoch": 0.9821602220689899, + "grad_norm": 2.302529811859131, + "learning_rate": 3.3630662965516837e-05, + "loss": 0.7691, + "step": 111100 + }, + { + "epoch": 0.9822486253293021, + "grad_norm": 15.456282615661621, + "learning_rate": 3.3629189577844965e-05, + "loss": 0.477, + "step": 111110 + }, + { + "epoch": 0.9823370285896144, + "grad_norm": 3.199341058731079, + "learning_rate": 3.36277161901731e-05, + "loss": 0.6802, + "step": 111120 + }, + { + "epoch": 0.9824254318499266, + "grad_norm": 2.484121799468994, + "learning_rate": 3.362624280250122e-05, + "loss": 0.6968, + "step": 111130 + }, + { + "epoch": 0.9825138351102388, + "grad_norm": 3.0543062686920166, + "learning_rate": 3.362476941482936e-05, + "loss": 0.599, + "step": 111140 + }, + { + "epoch": 0.982602238370551, + "grad_norm": 6.8025360107421875, + "learning_rate": 3.3623296027157485e-05, + "loss": 0.6477, + "step": 111150 + }, + { + "epoch": 0.9826906416308634, + "grad_norm": 4.920258045196533, + "learning_rate": 3.3621822639485613e-05, + "loss": 0.7036, + "step": 111160 + }, + { + "epoch": 0.9827790448911756, + "grad_norm": 6.934729099273682, + "learning_rate": 3.362034925181374e-05, + "loss": 0.6157, + "step": 111170 + }, + { + "epoch": 0.9828674481514879, + "grad_norm": 1.4260332584381104, + "learning_rate": 3.361887586414187e-05, + "loss": 0.5892, + "step": 111180 + }, + { + "epoch": 0.9829558514118001, + "grad_norm": 7.394562721252441, + "learning_rate": 3.361740247647e-05, + "loss": 0.6313, + "step": 111190 + }, + { + "epoch": 0.9830442546721123, + "grad_norm": 2.8978395462036133, + "learning_rate": 3.3615929088798134e-05, + "loss": 0.6461, + "step": 111200 + }, + { + "epoch": 0.9831326579324245, + "grad_norm": 5.9667277336120605, + "learning_rate": 3.3614455701126255e-05, + "loss": 0.6584, + "step": 111210 + }, + { + "epoch": 0.9832210611927368, + "grad_norm": 3.695119619369507, + "learning_rate": 3.361298231345439e-05, + "loss": 0.6886, + "step": 111220 + }, + { + "epoch": 0.983309464453049, + "grad_norm": 2.3961410522460938, + "learning_rate": 3.361150892578252e-05, + "loss": 0.6933, + "step": 111230 + }, + { + "epoch": 0.9833978677133612, + "grad_norm": 2.9586892127990723, + "learning_rate": 3.361003553811065e-05, + "loss": 0.678, + "step": 111240 + }, + { + "epoch": 0.9834862709736735, + "grad_norm": 3.3400754928588867, + "learning_rate": 3.3608562150438775e-05, + "loss": 0.7299, + "step": 111250 + }, + { + "epoch": 0.9835746742339857, + "grad_norm": 2.155742883682251, + "learning_rate": 3.360708876276691e-05, + "loss": 0.6606, + "step": 111260 + }, + { + "epoch": 0.983663077494298, + "grad_norm": 1.2182674407958984, + "learning_rate": 3.360561537509503e-05, + "loss": 0.5869, + "step": 111270 + }, + { + "epoch": 0.9837514807546103, + "grad_norm": 1.6962968111038208, + "learning_rate": 3.360414198742317e-05, + "loss": 0.5899, + "step": 111280 + }, + { + "epoch": 0.9838398840149225, + "grad_norm": 5.159481048583984, + "learning_rate": 3.360266859975129e-05, + "loss": 0.5996, + "step": 111290 + }, + { + "epoch": 0.9839282872752347, + "grad_norm": 4.989597797393799, + "learning_rate": 3.3601195212079424e-05, + "loss": 0.657, + "step": 111300 + }, + { + "epoch": 0.984016690535547, + "grad_norm": 1.6381126642227173, + "learning_rate": 3.359972182440755e-05, + "loss": 0.5976, + "step": 111310 + }, + { + "epoch": 0.9841050937958592, + "grad_norm": 4.167975902557373, + "learning_rate": 3.359824843673568e-05, + "loss": 0.6252, + "step": 111320 + }, + { + "epoch": 0.9841934970561714, + "grad_norm": 7.51429557800293, + "learning_rate": 3.359677504906381e-05, + "loss": 0.6074, + "step": 111330 + }, + { + "epoch": 0.9842819003164837, + "grad_norm": 1.646257758140564, + "learning_rate": 3.3595301661391944e-05, + "loss": 0.6902, + "step": 111340 + }, + { + "epoch": 0.9843703035767959, + "grad_norm": 3.257969617843628, + "learning_rate": 3.3593828273720066e-05, + "loss": 0.6878, + "step": 111350 + }, + { + "epoch": 0.9844587068371081, + "grad_norm": 4.001593589782715, + "learning_rate": 3.35923548860482e-05, + "loss": 0.6537, + "step": 111360 + }, + { + "epoch": 0.9845471100974204, + "grad_norm": 3.3365602493286133, + "learning_rate": 3.359088149837633e-05, + "loss": 0.6407, + "step": 111370 + }, + { + "epoch": 0.9846355133577326, + "grad_norm": 3.926586389541626, + "learning_rate": 3.358940811070446e-05, + "loss": 0.6118, + "step": 111380 + }, + { + "epoch": 0.9847239166180449, + "grad_norm": 2.5958337783813477, + "learning_rate": 3.3587934723032586e-05, + "loss": 0.6081, + "step": 111390 + }, + { + "epoch": 0.9848123198783572, + "grad_norm": 4.597136974334717, + "learning_rate": 3.3586461335360714e-05, + "loss": 0.6398, + "step": 111400 + }, + { + "epoch": 0.9849007231386694, + "grad_norm": 2.380460262298584, + "learning_rate": 3.358498794768884e-05, + "loss": 0.5416, + "step": 111410 + }, + { + "epoch": 0.9849891263989816, + "grad_norm": 6.699055194854736, + "learning_rate": 3.358351456001698e-05, + "loss": 0.8373, + "step": 111420 + }, + { + "epoch": 0.9850775296592938, + "grad_norm": 2.34159779548645, + "learning_rate": 3.35820411723451e-05, + "loss": 0.5761, + "step": 111430 + }, + { + "epoch": 0.9851659329196061, + "grad_norm": 2.4140844345092773, + "learning_rate": 3.3580567784673235e-05, + "loss": 0.6197, + "step": 111440 + }, + { + "epoch": 0.9852543361799183, + "grad_norm": 1.828371286392212, + "learning_rate": 3.357909439700136e-05, + "loss": 0.5182, + "step": 111450 + }, + { + "epoch": 0.9853427394402305, + "grad_norm": 7.179087162017822, + "learning_rate": 3.357762100932949e-05, + "loss": 0.7365, + "step": 111460 + }, + { + "epoch": 0.9854311427005428, + "grad_norm": 5.928529262542725, + "learning_rate": 3.357614762165762e-05, + "loss": 0.7532, + "step": 111470 + }, + { + "epoch": 0.985519545960855, + "grad_norm": 8.555516242980957, + "learning_rate": 3.3574674233985755e-05, + "loss": 0.7208, + "step": 111480 + }, + { + "epoch": 0.9856079492211672, + "grad_norm": 1.555471420288086, + "learning_rate": 3.3573200846313876e-05, + "loss": 0.7237, + "step": 111490 + }, + { + "epoch": 0.9856963524814795, + "grad_norm": 2.138625144958496, + "learning_rate": 3.357172745864201e-05, + "loss": 0.6548, + "step": 111500 + }, + { + "epoch": 0.9857847557417918, + "grad_norm": 1.7726247310638428, + "learning_rate": 3.357025407097013e-05, + "loss": 0.5777, + "step": 111510 + }, + { + "epoch": 0.985873159002104, + "grad_norm": 2.086958885192871, + "learning_rate": 3.356878068329827e-05, + "loss": 0.6493, + "step": 111520 + }, + { + "epoch": 0.9859615622624163, + "grad_norm": 11.418235778808594, + "learning_rate": 3.3567307295626396e-05, + "loss": 0.6205, + "step": 111530 + }, + { + "epoch": 0.9860499655227285, + "grad_norm": 1.4123930931091309, + "learning_rate": 3.3565833907954525e-05, + "loss": 0.63, + "step": 111540 + }, + { + "epoch": 0.9861383687830407, + "grad_norm": 2.378354787826538, + "learning_rate": 3.356436052028265e-05, + "loss": 0.5865, + "step": 111550 + }, + { + "epoch": 0.986226772043353, + "grad_norm": 3.8460135459899902, + "learning_rate": 3.356288713261079e-05, + "loss": 0.5692, + "step": 111560 + }, + { + "epoch": 0.9863151753036652, + "grad_norm": 4.886835098266602, + "learning_rate": 3.356141374493891e-05, + "loss": 0.5166, + "step": 111570 + }, + { + "epoch": 0.9864035785639774, + "grad_norm": 3.513688564300537, + "learning_rate": 3.3559940357267045e-05, + "loss": 0.6542, + "step": 111580 + }, + { + "epoch": 0.9864919818242897, + "grad_norm": 1.3782950639724731, + "learning_rate": 3.355846696959517e-05, + "loss": 0.6315, + "step": 111590 + }, + { + "epoch": 0.9865803850846019, + "grad_norm": 3.303790807723999, + "learning_rate": 3.35569935819233e-05, + "loss": 0.593, + "step": 111600 + }, + { + "epoch": 0.9866687883449141, + "grad_norm": 4.353058338165283, + "learning_rate": 3.355552019425143e-05, + "loss": 0.7096, + "step": 111610 + }, + { + "epoch": 0.9867571916052263, + "grad_norm": 1.3499033451080322, + "learning_rate": 3.3554046806579565e-05, + "loss": 0.7414, + "step": 111620 + }, + { + "epoch": 0.9868455948655387, + "grad_norm": 1.2435297966003418, + "learning_rate": 3.355257341890769e-05, + "loss": 0.5722, + "step": 111630 + }, + { + "epoch": 0.9869339981258509, + "grad_norm": 9.246455192565918, + "learning_rate": 3.355110003123582e-05, + "loss": 0.6394, + "step": 111640 + }, + { + "epoch": 0.9870224013861632, + "grad_norm": 1.8854089975357056, + "learning_rate": 3.354962664356395e-05, + "loss": 0.5993, + "step": 111650 + }, + { + "epoch": 0.9871108046464754, + "grad_norm": 1.8141562938690186, + "learning_rate": 3.354815325589208e-05, + "loss": 0.7156, + "step": 111660 + }, + { + "epoch": 0.9871992079067876, + "grad_norm": 3.7588021755218506, + "learning_rate": 3.354667986822021e-05, + "loss": 0.5912, + "step": 111670 + }, + { + "epoch": 0.9872876111670998, + "grad_norm": 2.8974814414978027, + "learning_rate": 3.3545206480548335e-05, + "loss": 0.6617, + "step": 111680 + }, + { + "epoch": 0.9873760144274121, + "grad_norm": 4.168397903442383, + "learning_rate": 3.3543733092876464e-05, + "loss": 0.615, + "step": 111690 + }, + { + "epoch": 0.9874644176877243, + "grad_norm": 2.7471868991851807, + "learning_rate": 3.35422597052046e-05, + "loss": 0.6021, + "step": 111700 + }, + { + "epoch": 0.9875528209480365, + "grad_norm": 1.2697445154190063, + "learning_rate": 3.354078631753273e-05, + "loss": 0.7222, + "step": 111710 + }, + { + "epoch": 0.9876412242083488, + "grad_norm": 0.9256862998008728, + "learning_rate": 3.3539312929860856e-05, + "loss": 0.6113, + "step": 111720 + }, + { + "epoch": 0.987729627468661, + "grad_norm": 3.108173131942749, + "learning_rate": 3.3537839542188984e-05, + "loss": 0.5417, + "step": 111730 + }, + { + "epoch": 0.9878180307289733, + "grad_norm": 2.847672939300537, + "learning_rate": 3.353636615451711e-05, + "loss": 0.7285, + "step": 111740 + }, + { + "epoch": 0.9879064339892856, + "grad_norm": 2.249431848526001, + "learning_rate": 3.353489276684524e-05, + "loss": 0.6328, + "step": 111750 + }, + { + "epoch": 0.9879948372495978, + "grad_norm": 2.3861143589019775, + "learning_rate": 3.353341937917337e-05, + "loss": 0.7842, + "step": 111760 + }, + { + "epoch": 0.98808324050991, + "grad_norm": 1.4964494705200195, + "learning_rate": 3.3531945991501504e-05, + "loss": 0.6366, + "step": 111770 + }, + { + "epoch": 0.9881716437702223, + "grad_norm": 2.032001256942749, + "learning_rate": 3.353047260382963e-05, + "loss": 0.5816, + "step": 111780 + }, + { + "epoch": 0.9882600470305345, + "grad_norm": 1.4544965028762817, + "learning_rate": 3.352899921615776e-05, + "loss": 0.6942, + "step": 111790 + }, + { + "epoch": 0.9883484502908467, + "grad_norm": 2.392289638519287, + "learning_rate": 3.352752582848589e-05, + "loss": 0.6744, + "step": 111800 + }, + { + "epoch": 0.988436853551159, + "grad_norm": 2.157137155532837, + "learning_rate": 3.352605244081402e-05, + "loss": 0.5898, + "step": 111810 + }, + { + "epoch": 0.9885252568114712, + "grad_norm": 4.992175102233887, + "learning_rate": 3.3524579053142146e-05, + "loss": 0.5477, + "step": 111820 + }, + { + "epoch": 0.9886136600717834, + "grad_norm": 7.483372211456299, + "learning_rate": 3.352310566547028e-05, + "loss": 0.5999, + "step": 111830 + }, + { + "epoch": 0.9887020633320956, + "grad_norm": 4.890076637268066, + "learning_rate": 3.352163227779841e-05, + "loss": 0.6696, + "step": 111840 + }, + { + "epoch": 0.9887904665924079, + "grad_norm": 13.550922393798828, + "learning_rate": 3.352015889012654e-05, + "loss": 0.6865, + "step": 111850 + }, + { + "epoch": 0.9888788698527202, + "grad_norm": 1.2540837526321411, + "learning_rate": 3.3518685502454666e-05, + "loss": 0.6883, + "step": 111860 + }, + { + "epoch": 0.9889672731130325, + "grad_norm": 2.1965646743774414, + "learning_rate": 3.3517212114782794e-05, + "loss": 0.7527, + "step": 111870 + }, + { + "epoch": 0.9890556763733447, + "grad_norm": 1.8193001747131348, + "learning_rate": 3.351573872711092e-05, + "loss": 0.6358, + "step": 111880 + }, + { + "epoch": 0.9891440796336569, + "grad_norm": 1.7222371101379395, + "learning_rate": 3.351426533943906e-05, + "loss": 0.6972, + "step": 111890 + }, + { + "epoch": 0.9892324828939691, + "grad_norm": 3.522794723510742, + "learning_rate": 3.351279195176718e-05, + "loss": 0.5687, + "step": 111900 + }, + { + "epoch": 0.9893208861542814, + "grad_norm": 3.7774064540863037, + "learning_rate": 3.3511318564095315e-05, + "loss": 0.615, + "step": 111910 + }, + { + "epoch": 0.9894092894145936, + "grad_norm": 0.9122946262359619, + "learning_rate": 3.350984517642344e-05, + "loss": 0.5144, + "step": 111920 + }, + { + "epoch": 0.9894976926749058, + "grad_norm": 2.9657299518585205, + "learning_rate": 3.350837178875157e-05, + "loss": 0.6468, + "step": 111930 + }, + { + "epoch": 0.9895860959352181, + "grad_norm": 4.893675804138184, + "learning_rate": 3.35068984010797e-05, + "loss": 0.5725, + "step": 111940 + }, + { + "epoch": 0.9896744991955303, + "grad_norm": 1.4844152927398682, + "learning_rate": 3.3505425013407835e-05, + "loss": 0.732, + "step": 111950 + }, + { + "epoch": 0.9897629024558425, + "grad_norm": 5.409655570983887, + "learning_rate": 3.3503951625735956e-05, + "loss": 0.6168, + "step": 111960 + }, + { + "epoch": 0.9898513057161548, + "grad_norm": 1.2928420305252075, + "learning_rate": 3.350247823806409e-05, + "loss": 0.6159, + "step": 111970 + }, + { + "epoch": 0.9899397089764671, + "grad_norm": 2.484102725982666, + "learning_rate": 3.350100485039221e-05, + "loss": 0.7712, + "step": 111980 + }, + { + "epoch": 0.9900281122367793, + "grad_norm": 2.1543467044830322, + "learning_rate": 3.349953146272035e-05, + "loss": 0.5778, + "step": 111990 + }, + { + "epoch": 0.9901165154970916, + "grad_norm": 1.3941656351089478, + "learning_rate": 3.3498058075048477e-05, + "loss": 0.7212, + "step": 112000 + }, + { + "epoch": 0.9902049187574038, + "grad_norm": 10.229924201965332, + "learning_rate": 3.3496584687376605e-05, + "loss": 0.7138, + "step": 112010 + }, + { + "epoch": 0.990293322017716, + "grad_norm": 14.385763168334961, + "learning_rate": 3.349511129970473e-05, + "loss": 0.5272, + "step": 112020 + }, + { + "epoch": 0.9903817252780283, + "grad_norm": 3.4601848125457764, + "learning_rate": 3.349363791203287e-05, + "loss": 0.6389, + "step": 112030 + }, + { + "epoch": 0.9904701285383405, + "grad_norm": 13.993854522705078, + "learning_rate": 3.349216452436099e-05, + "loss": 0.5967, + "step": 112040 + }, + { + "epoch": 0.9905585317986527, + "grad_norm": 4.1742119789123535, + "learning_rate": 3.3490691136689125e-05, + "loss": 0.589, + "step": 112050 + }, + { + "epoch": 0.990646935058965, + "grad_norm": 2.8001863956451416, + "learning_rate": 3.3489217749017253e-05, + "loss": 0.6774, + "step": 112060 + }, + { + "epoch": 0.9907353383192772, + "grad_norm": 4.187798023223877, + "learning_rate": 3.348774436134538e-05, + "loss": 0.6014, + "step": 112070 + }, + { + "epoch": 0.9908237415795894, + "grad_norm": 9.613956451416016, + "learning_rate": 3.348627097367351e-05, + "loss": 0.7411, + "step": 112080 + }, + { + "epoch": 0.9909121448399016, + "grad_norm": 2.667699098587036, + "learning_rate": 3.3484797586001645e-05, + "loss": 0.565, + "step": 112090 + }, + { + "epoch": 0.991000548100214, + "grad_norm": 11.713051795959473, + "learning_rate": 3.348332419832977e-05, + "loss": 0.6704, + "step": 112100 + }, + { + "epoch": 0.9910889513605262, + "grad_norm": 4.876836776733398, + "learning_rate": 3.34818508106579e-05, + "loss": 0.658, + "step": 112110 + }, + { + "epoch": 0.9911773546208384, + "grad_norm": 2.1874516010284424, + "learning_rate": 3.3480377422986024e-05, + "loss": 0.509, + "step": 112120 + }, + { + "epoch": 0.9912657578811507, + "grad_norm": 2.001737117767334, + "learning_rate": 3.347890403531416e-05, + "loss": 0.6167, + "step": 112130 + }, + { + "epoch": 0.9913541611414629, + "grad_norm": 2.1845858097076416, + "learning_rate": 3.347743064764229e-05, + "loss": 0.5533, + "step": 112140 + }, + { + "epoch": 0.9914425644017751, + "grad_norm": 1.0823453664779663, + "learning_rate": 3.3475957259970415e-05, + "loss": 0.5117, + "step": 112150 + }, + { + "epoch": 0.9915309676620874, + "grad_norm": 4.162007808685303, + "learning_rate": 3.3474483872298544e-05, + "loss": 0.7083, + "step": 112160 + }, + { + "epoch": 0.9916193709223996, + "grad_norm": 1.4400945901870728, + "learning_rate": 3.347301048462668e-05, + "loss": 0.7027, + "step": 112170 + }, + { + "epoch": 0.9917077741827118, + "grad_norm": 0.8610682487487793, + "learning_rate": 3.34715370969548e-05, + "loss": 0.6419, + "step": 112180 + }, + { + "epoch": 0.9917961774430241, + "grad_norm": 2.548790454864502, + "learning_rate": 3.3470063709282936e-05, + "loss": 0.6447, + "step": 112190 + }, + { + "epoch": 0.9918845807033363, + "grad_norm": 1.4812498092651367, + "learning_rate": 3.3468590321611064e-05, + "loss": 0.6844, + "step": 112200 + }, + { + "epoch": 0.9919729839636485, + "grad_norm": 3.9896907806396484, + "learning_rate": 3.346711693393919e-05, + "loss": 0.5442, + "step": 112210 + }, + { + "epoch": 0.9920613872239609, + "grad_norm": 3.034356117248535, + "learning_rate": 3.346564354626732e-05, + "loss": 0.5341, + "step": 112220 + }, + { + "epoch": 0.9921497904842731, + "grad_norm": 1.6086528301239014, + "learning_rate": 3.346417015859545e-05, + "loss": 0.5476, + "step": 112230 + }, + { + "epoch": 0.9922381937445853, + "grad_norm": 2.423977851867676, + "learning_rate": 3.346269677092358e-05, + "loss": 0.663, + "step": 112240 + }, + { + "epoch": 0.9923265970048976, + "grad_norm": 1.9531772136688232, + "learning_rate": 3.346122338325171e-05, + "loss": 0.6186, + "step": 112250 + }, + { + "epoch": 0.9924150002652098, + "grad_norm": 1.5535861253738403, + "learning_rate": 3.3459749995579834e-05, + "loss": 0.6499, + "step": 112260 + }, + { + "epoch": 0.992503403525522, + "grad_norm": 5.8906378746032715, + "learning_rate": 3.345827660790797e-05, + "loss": 0.7592, + "step": 112270 + }, + { + "epoch": 0.9925918067858343, + "grad_norm": 3.8838255405426025, + "learning_rate": 3.34568032202361e-05, + "loss": 0.6756, + "step": 112280 + }, + { + "epoch": 0.9926802100461465, + "grad_norm": 3.5611259937286377, + "learning_rate": 3.3455329832564226e-05, + "loss": 0.5665, + "step": 112290 + }, + { + "epoch": 0.9927686133064587, + "grad_norm": 4.166460990905762, + "learning_rate": 3.3453856444892354e-05, + "loss": 0.6722, + "step": 112300 + }, + { + "epoch": 0.992857016566771, + "grad_norm": 2.573184013366699, + "learning_rate": 3.345238305722049e-05, + "loss": 0.6226, + "step": 112310 + }, + { + "epoch": 0.9929454198270832, + "grad_norm": 3.364985704421997, + "learning_rate": 3.345090966954861e-05, + "loss": 0.6554, + "step": 112320 + }, + { + "epoch": 0.9930338230873955, + "grad_norm": 1.81927490234375, + "learning_rate": 3.3449436281876746e-05, + "loss": 0.5596, + "step": 112330 + }, + { + "epoch": 0.9931222263477077, + "grad_norm": 3.421112060546875, + "learning_rate": 3.344796289420487e-05, + "loss": 0.5602, + "step": 112340 + }, + { + "epoch": 0.99321062960802, + "grad_norm": 2.461779832839966, + "learning_rate": 3.3446489506533e-05, + "loss": 0.6946, + "step": 112350 + }, + { + "epoch": 0.9932990328683322, + "grad_norm": 2.894778251647949, + "learning_rate": 3.344501611886113e-05, + "loss": 0.7578, + "step": 112360 + }, + { + "epoch": 0.9933874361286444, + "grad_norm": 5.531325817108154, + "learning_rate": 3.344354273118926e-05, + "loss": 0.7036, + "step": 112370 + }, + { + "epoch": 0.9934758393889567, + "grad_norm": 8.579312324523926, + "learning_rate": 3.344206934351739e-05, + "loss": 0.6394, + "step": 112380 + }, + { + "epoch": 0.9935642426492689, + "grad_norm": 5.952613830566406, + "learning_rate": 3.344059595584552e-05, + "loss": 0.6792, + "step": 112390 + }, + { + "epoch": 0.9936526459095811, + "grad_norm": 1.9628828763961792, + "learning_rate": 3.3439122568173645e-05, + "loss": 0.5413, + "step": 112400 + }, + { + "epoch": 0.9937410491698934, + "grad_norm": 2.6676266193389893, + "learning_rate": 3.343764918050178e-05, + "loss": 0.6368, + "step": 112410 + }, + { + "epoch": 0.9938294524302056, + "grad_norm": 0.967044472694397, + "learning_rate": 3.343617579282991e-05, + "loss": 0.6824, + "step": 112420 + }, + { + "epoch": 0.9939178556905178, + "grad_norm": 5.3108439445495605, + "learning_rate": 3.3434702405158036e-05, + "loss": 0.7213, + "step": 112430 + }, + { + "epoch": 0.9940062589508301, + "grad_norm": 2.7890877723693848, + "learning_rate": 3.3433229017486165e-05, + "loss": 0.6702, + "step": 112440 + }, + { + "epoch": 0.9940946622111424, + "grad_norm": 1.9422330856323242, + "learning_rate": 3.343175562981429e-05, + "loss": 0.6479, + "step": 112450 + }, + { + "epoch": 0.9941830654714546, + "grad_norm": 3.3416390419006348, + "learning_rate": 3.343028224214242e-05, + "loss": 0.5992, + "step": 112460 + }, + { + "epoch": 0.9942714687317669, + "grad_norm": 1.1402921676635742, + "learning_rate": 3.3428808854470557e-05, + "loss": 0.6883, + "step": 112470 + }, + { + "epoch": 0.9943598719920791, + "grad_norm": 2.2184958457946777, + "learning_rate": 3.342733546679868e-05, + "loss": 0.6027, + "step": 112480 + }, + { + "epoch": 0.9944482752523913, + "grad_norm": 4.096175670623779, + "learning_rate": 3.342586207912681e-05, + "loss": 0.6861, + "step": 112490 + }, + { + "epoch": 0.9945366785127036, + "grad_norm": 3.490675687789917, + "learning_rate": 3.342438869145494e-05, + "loss": 0.6124, + "step": 112500 + }, + { + "epoch": 0.9946250817730158, + "grad_norm": 1.02232825756073, + "learning_rate": 3.342291530378307e-05, + "loss": 0.6064, + "step": 112510 + }, + { + "epoch": 0.994713485033328, + "grad_norm": 3.6563689708709717, + "learning_rate": 3.34214419161112e-05, + "loss": 0.6643, + "step": 112520 + }, + { + "epoch": 0.9948018882936402, + "grad_norm": 10.541913986206055, + "learning_rate": 3.3419968528439334e-05, + "loss": 0.5836, + "step": 112530 + }, + { + "epoch": 0.9948902915539525, + "grad_norm": 4.165454864501953, + "learning_rate": 3.3418495140767455e-05, + "loss": 0.7104, + "step": 112540 + }, + { + "epoch": 0.9949786948142647, + "grad_norm": 5.17685604095459, + "learning_rate": 3.341702175309559e-05, + "loss": 0.5705, + "step": 112550 + }, + { + "epoch": 0.9950670980745769, + "grad_norm": 6.6226630210876465, + "learning_rate": 3.341554836542372e-05, + "loss": 0.6129, + "step": 112560 + }, + { + "epoch": 0.9951555013348893, + "grad_norm": 2.9814226627349854, + "learning_rate": 3.341407497775185e-05, + "loss": 0.7265, + "step": 112570 + }, + { + "epoch": 0.9952439045952015, + "grad_norm": 8.501177787780762, + "learning_rate": 3.3412601590079975e-05, + "loss": 0.6709, + "step": 112580 + }, + { + "epoch": 0.9953323078555137, + "grad_norm": 2.1733036041259766, + "learning_rate": 3.3411128202408104e-05, + "loss": 0.78, + "step": 112590 + }, + { + "epoch": 0.995420711115826, + "grad_norm": 1.7375086545944214, + "learning_rate": 3.340965481473624e-05, + "loss": 0.7385, + "step": 112600 + }, + { + "epoch": 0.9955091143761382, + "grad_norm": 2.2866108417510986, + "learning_rate": 3.340818142706437e-05, + "loss": 0.7098, + "step": 112610 + }, + { + "epoch": 0.9955975176364504, + "grad_norm": 5.398899078369141, + "learning_rate": 3.3406708039392495e-05, + "loss": 0.6375, + "step": 112620 + }, + { + "epoch": 0.9956859208967627, + "grad_norm": 1.354555368423462, + "learning_rate": 3.3405234651720624e-05, + "loss": 0.6631, + "step": 112630 + }, + { + "epoch": 0.9957743241570749, + "grad_norm": 2.862786293029785, + "learning_rate": 3.340376126404875e-05, + "loss": 0.6972, + "step": 112640 + }, + { + "epoch": 0.9958627274173871, + "grad_norm": 2.064380645751953, + "learning_rate": 3.340228787637688e-05, + "loss": 0.6434, + "step": 112650 + }, + { + "epoch": 0.9959511306776994, + "grad_norm": 3.987992286682129, + "learning_rate": 3.3400814488705016e-05, + "loss": 0.6607, + "step": 112660 + }, + { + "epoch": 0.9960395339380116, + "grad_norm": 1.7794808149337769, + "learning_rate": 3.3399341101033144e-05, + "loss": 0.6053, + "step": 112670 + }, + { + "epoch": 0.9961279371983238, + "grad_norm": 2.814638376235962, + "learning_rate": 3.339786771336127e-05, + "loss": 0.608, + "step": 112680 + }, + { + "epoch": 0.9962163404586362, + "grad_norm": 2.2738704681396484, + "learning_rate": 3.33963943256894e-05, + "loss": 0.6412, + "step": 112690 + }, + { + "epoch": 0.9963047437189484, + "grad_norm": 3.6364216804504395, + "learning_rate": 3.339492093801753e-05, + "loss": 0.6692, + "step": 112700 + }, + { + "epoch": 0.9963931469792606, + "grad_norm": 1.6770386695861816, + "learning_rate": 3.339344755034566e-05, + "loss": 0.6817, + "step": 112710 + }, + { + "epoch": 0.9964815502395729, + "grad_norm": 5.360958576202393, + "learning_rate": 3.339197416267379e-05, + "loss": 0.6297, + "step": 112720 + }, + { + "epoch": 0.9965699534998851, + "grad_norm": 2.039898157119751, + "learning_rate": 3.3390500775001914e-05, + "loss": 0.6651, + "step": 112730 + }, + { + "epoch": 0.9966583567601973, + "grad_norm": 4.612518310546875, + "learning_rate": 3.338902738733005e-05, + "loss": 0.6607, + "step": 112740 + }, + { + "epoch": 0.9967467600205095, + "grad_norm": 7.095003604888916, + "learning_rate": 3.338755399965818e-05, + "loss": 0.6472, + "step": 112750 + }, + { + "epoch": 0.9968351632808218, + "grad_norm": 2.640900135040283, + "learning_rate": 3.3386080611986306e-05, + "loss": 0.572, + "step": 112760 + }, + { + "epoch": 0.996923566541134, + "grad_norm": 11.376489639282227, + "learning_rate": 3.3384607224314434e-05, + "loss": 0.7194, + "step": 112770 + }, + { + "epoch": 0.9970119698014462, + "grad_norm": 1.747674822807312, + "learning_rate": 3.338313383664257e-05, + "loss": 0.739, + "step": 112780 + }, + { + "epoch": 0.9971003730617585, + "grad_norm": 9.786775588989258, + "learning_rate": 3.338166044897069e-05, + "loss": 0.7105, + "step": 112790 + }, + { + "epoch": 0.9971887763220708, + "grad_norm": 3.084177255630493, + "learning_rate": 3.3380187061298826e-05, + "loss": 0.6088, + "step": 112800 + }, + { + "epoch": 0.997277179582383, + "grad_norm": 4.496860980987549, + "learning_rate": 3.337871367362695e-05, + "loss": 0.5767, + "step": 112810 + }, + { + "epoch": 0.9973655828426953, + "grad_norm": 0.9731305837631226, + "learning_rate": 3.337724028595508e-05, + "loss": 0.5315, + "step": 112820 + }, + { + "epoch": 0.9974539861030075, + "grad_norm": 17.036603927612305, + "learning_rate": 3.337576689828321e-05, + "loss": 0.6176, + "step": 112830 + }, + { + "epoch": 0.9975423893633197, + "grad_norm": 1.8506420850753784, + "learning_rate": 3.337429351061134e-05, + "loss": 0.675, + "step": 112840 + }, + { + "epoch": 0.997630792623632, + "grad_norm": 1.9069759845733643, + "learning_rate": 3.337282012293947e-05, + "loss": 0.6838, + "step": 112850 + }, + { + "epoch": 0.9977191958839442, + "grad_norm": 2.550217628479004, + "learning_rate": 3.33713467352676e-05, + "loss": 0.6823, + "step": 112860 + }, + { + "epoch": 0.9978075991442564, + "grad_norm": 2.61788010597229, + "learning_rate": 3.3369873347595725e-05, + "loss": 0.7089, + "step": 112870 + }, + { + "epoch": 0.9978960024045687, + "grad_norm": 1.6682244539260864, + "learning_rate": 3.336839995992386e-05, + "loss": 0.7126, + "step": 112880 + }, + { + "epoch": 0.9979844056648809, + "grad_norm": 2.2637345790863037, + "learning_rate": 3.336692657225199e-05, + "loss": 0.7083, + "step": 112890 + }, + { + "epoch": 0.9980728089251931, + "grad_norm": 3.195830821990967, + "learning_rate": 3.3365453184580116e-05, + "loss": 0.649, + "step": 112900 + }, + { + "epoch": 0.9981612121855054, + "grad_norm": 2.306546688079834, + "learning_rate": 3.3363979796908245e-05, + "loss": 0.7072, + "step": 112910 + }, + { + "epoch": 0.9982496154458177, + "grad_norm": 3.3835599422454834, + "learning_rate": 3.336250640923637e-05, + "loss": 0.7273, + "step": 112920 + }, + { + "epoch": 0.9983380187061299, + "grad_norm": 2.138302803039551, + "learning_rate": 3.33610330215645e-05, + "loss": 0.7381, + "step": 112930 + }, + { + "epoch": 0.9984264219664422, + "grad_norm": 1.5605179071426392, + "learning_rate": 3.335955963389264e-05, + "loss": 0.5386, + "step": 112940 + }, + { + "epoch": 0.9985148252267544, + "grad_norm": 2.307772159576416, + "learning_rate": 3.335808624622076e-05, + "loss": 0.6734, + "step": 112950 + }, + { + "epoch": 0.9986032284870666, + "grad_norm": 1.731562614440918, + "learning_rate": 3.335661285854889e-05, + "loss": 0.5946, + "step": 112960 + }, + { + "epoch": 0.9986916317473788, + "grad_norm": 1.4824844598770142, + "learning_rate": 3.335513947087702e-05, + "loss": 0.6387, + "step": 112970 + }, + { + "epoch": 0.9987800350076911, + "grad_norm": 11.846600532531738, + "learning_rate": 3.335366608320515e-05, + "loss": 0.6748, + "step": 112980 + }, + { + "epoch": 0.9988684382680033, + "grad_norm": 1.443597435951233, + "learning_rate": 3.335219269553328e-05, + "loss": 0.695, + "step": 112990 + }, + { + "epoch": 0.9989568415283155, + "grad_norm": 2.229417085647583, + "learning_rate": 3.3350719307861414e-05, + "loss": 0.6362, + "step": 113000 + }, + { + "epoch": 0.9990452447886278, + "grad_norm": 2.9889605045318604, + "learning_rate": 3.3349245920189535e-05, + "loss": 0.7178, + "step": 113010 + }, + { + "epoch": 0.99913364804894, + "grad_norm": 3.6316070556640625, + "learning_rate": 3.334777253251767e-05, + "loss": 0.717, + "step": 113020 + }, + { + "epoch": 0.9992220513092522, + "grad_norm": 3.6598217487335205, + "learning_rate": 3.33462991448458e-05, + "loss": 0.7047, + "step": 113030 + }, + { + "epoch": 0.9993104545695646, + "grad_norm": 1.576230525970459, + "learning_rate": 3.334482575717393e-05, + "loss": 0.525, + "step": 113040 + }, + { + "epoch": 0.9993988578298768, + "grad_norm": 1.6156716346740723, + "learning_rate": 3.3343352369502055e-05, + "loss": 0.6069, + "step": 113050 + }, + { + "epoch": 0.999487261090189, + "grad_norm": 1.4244310855865479, + "learning_rate": 3.3341878981830184e-05, + "loss": 0.7527, + "step": 113060 + }, + { + "epoch": 0.9995756643505013, + "grad_norm": 3.3648128509521484, + "learning_rate": 3.334040559415831e-05, + "loss": 0.8143, + "step": 113070 + }, + { + "epoch": 0.9996640676108135, + "grad_norm": 2.311471700668335, + "learning_rate": 3.333893220648645e-05, + "loss": 0.5865, + "step": 113080 + }, + { + "epoch": 0.9997524708711257, + "grad_norm": 4.397999286651611, + "learning_rate": 3.333745881881457e-05, + "loss": 0.6696, + "step": 113090 + }, + { + "epoch": 0.999840874131438, + "grad_norm": 6.6669020652771, + "learning_rate": 3.3335985431142704e-05, + "loss": 0.5819, + "step": 113100 + }, + { + "epoch": 0.9999292773917502, + "grad_norm": 2.493673086166382, + "learning_rate": 3.333451204347083e-05, + "loss": 0.6061, + "step": 113110 + }, + { + "epoch": 1.0, + "eval_loss": 0.6402608752250671, + "eval_runtime": 1558.2793, + "eval_samples_per_second": 290.365, + "eval_steps_per_second": 18.148, + "step": 113118 + }, + { + "epoch": 1.0000176806520624, + "grad_norm": 5.0341668128967285, + "learning_rate": 3.333303865579896e-05, + "loss": 0.5431, + "step": 113120 + }, + { + "epoch": 1.0001060839123748, + "grad_norm": 1.3547182083129883, + "learning_rate": 3.333156526812709e-05, + "loss": 0.5498, + "step": 113130 + }, + { + "epoch": 1.0001944871726869, + "grad_norm": 1.4502989053726196, + "learning_rate": 3.3330091880455224e-05, + "loss": 0.6172, + "step": 113140 + }, + { + "epoch": 1.0002828904329992, + "grad_norm": 5.147324562072754, + "learning_rate": 3.3328618492783346e-05, + "loss": 0.5801, + "step": 113150 + }, + { + "epoch": 1.0003712936933113, + "grad_norm": 0.6961853504180908, + "learning_rate": 3.332714510511148e-05, + "loss": 0.5647, + "step": 113160 + }, + { + "epoch": 1.0004596969536237, + "grad_norm": 2.148359537124634, + "learning_rate": 3.33256717174396e-05, + "loss": 0.709, + "step": 113170 + }, + { + "epoch": 1.0005481002139358, + "grad_norm": 5.336066722869873, + "learning_rate": 3.332419832976774e-05, + "loss": 0.5048, + "step": 113180 + }, + { + "epoch": 1.0006365034742482, + "grad_norm": 1.5481164455413818, + "learning_rate": 3.3322724942095866e-05, + "loss": 0.6045, + "step": 113190 + }, + { + "epoch": 1.0007249067345603, + "grad_norm": 3.8603017330169678, + "learning_rate": 3.3321251554423994e-05, + "loss": 0.6148, + "step": 113200 + }, + { + "epoch": 1.0008133099948726, + "grad_norm": 2.894463300704956, + "learning_rate": 3.331977816675212e-05, + "loss": 0.5941, + "step": 113210 + }, + { + "epoch": 1.000901713255185, + "grad_norm": 7.036715984344482, + "learning_rate": 3.331830477908026e-05, + "loss": 0.5017, + "step": 113220 + }, + { + "epoch": 1.000990116515497, + "grad_norm": 5.875628471374512, + "learning_rate": 3.331683139140838e-05, + "loss": 0.5387, + "step": 113230 + }, + { + "epoch": 1.0010785197758094, + "grad_norm": 5.956996917724609, + "learning_rate": 3.3315358003736514e-05, + "loss": 0.6005, + "step": 113240 + }, + { + "epoch": 1.0011669230361215, + "grad_norm": 11.41282844543457, + "learning_rate": 3.331388461606464e-05, + "loss": 0.5063, + "step": 113250 + }, + { + "epoch": 1.0012553262964339, + "grad_norm": 2.202025890350342, + "learning_rate": 3.331241122839277e-05, + "loss": 0.5133, + "step": 113260 + }, + { + "epoch": 1.001343729556746, + "grad_norm": 2.3404276371002197, + "learning_rate": 3.33109378407209e-05, + "loss": 0.499, + "step": 113270 + }, + { + "epoch": 1.0014321328170583, + "grad_norm": 1.808912754058838, + "learning_rate": 3.330946445304903e-05, + "loss": 0.71, + "step": 113280 + }, + { + "epoch": 1.0015205360773705, + "grad_norm": 3.837283134460449, + "learning_rate": 3.3307991065377156e-05, + "loss": 0.7154, + "step": 113290 + }, + { + "epoch": 1.0016089393376828, + "grad_norm": 1.2322243452072144, + "learning_rate": 3.330651767770529e-05, + "loss": 0.793, + "step": 113300 + }, + { + "epoch": 1.001697342597995, + "grad_norm": 7.320115089416504, + "learning_rate": 3.330504429003341e-05, + "loss": 0.6948, + "step": 113310 + }, + { + "epoch": 1.0017857458583073, + "grad_norm": 4.228982925415039, + "learning_rate": 3.330357090236155e-05, + "loss": 0.6774, + "step": 113320 + }, + { + "epoch": 1.0018741491186196, + "grad_norm": 10.58315658569336, + "learning_rate": 3.3302097514689676e-05, + "loss": 0.5723, + "step": 113330 + }, + { + "epoch": 1.0019625523789317, + "grad_norm": 0.9104031324386597, + "learning_rate": 3.3300624127017805e-05, + "loss": 0.5914, + "step": 113340 + }, + { + "epoch": 1.002050955639244, + "grad_norm": 4.153336524963379, + "learning_rate": 3.329915073934593e-05, + "loss": 0.6114, + "step": 113350 + }, + { + "epoch": 1.0021393588995562, + "grad_norm": 1.6540496349334717, + "learning_rate": 3.329767735167407e-05, + "loss": 0.6583, + "step": 113360 + }, + { + "epoch": 1.0022277621598685, + "grad_norm": 1.6964622735977173, + "learning_rate": 3.329620396400219e-05, + "loss": 0.6022, + "step": 113370 + }, + { + "epoch": 1.0023161654201806, + "grad_norm": 1.8616136312484741, + "learning_rate": 3.3294730576330325e-05, + "loss": 0.6551, + "step": 113380 + }, + { + "epoch": 1.002404568680493, + "grad_norm": 3.145517587661743, + "learning_rate": 3.329325718865845e-05, + "loss": 0.5643, + "step": 113390 + }, + { + "epoch": 1.002492971940805, + "grad_norm": 3.3268396854400635, + "learning_rate": 3.329178380098658e-05, + "loss": 0.4888, + "step": 113400 + }, + { + "epoch": 1.0025813752011175, + "grad_norm": 1.4747227430343628, + "learning_rate": 3.329031041331471e-05, + "loss": 0.465, + "step": 113410 + }, + { + "epoch": 1.0026697784614296, + "grad_norm": 2.9858028888702393, + "learning_rate": 3.328883702564284e-05, + "loss": 0.5592, + "step": 113420 + }, + { + "epoch": 1.002758181721742, + "grad_norm": 5.396865367889404, + "learning_rate": 3.328736363797097e-05, + "loss": 0.5083, + "step": 113430 + }, + { + "epoch": 1.002846584982054, + "grad_norm": 2.077924966812134, + "learning_rate": 3.32858902502991e-05, + "loss": 0.5788, + "step": 113440 + }, + { + "epoch": 1.0029349882423664, + "grad_norm": 2.6789615154266357, + "learning_rate": 3.328441686262723e-05, + "loss": 0.7885, + "step": 113450 + }, + { + "epoch": 1.0030233915026787, + "grad_norm": 8.627890586853027, + "learning_rate": 3.328294347495536e-05, + "loss": 0.6237, + "step": 113460 + }, + { + "epoch": 1.0031117947629908, + "grad_norm": 1.346265435218811, + "learning_rate": 3.328147008728349e-05, + "loss": 0.5363, + "step": 113470 + }, + { + "epoch": 1.0032001980233032, + "grad_norm": 1.7499152421951294, + "learning_rate": 3.3279996699611615e-05, + "loss": 0.4775, + "step": 113480 + }, + { + "epoch": 1.0032886012836153, + "grad_norm": 2.949223518371582, + "learning_rate": 3.3278523311939744e-05, + "loss": 0.7244, + "step": 113490 + }, + { + "epoch": 1.0033770045439276, + "grad_norm": 1.6534029245376587, + "learning_rate": 3.327704992426788e-05, + "loss": 0.6141, + "step": 113500 + }, + { + "epoch": 1.0034654078042398, + "grad_norm": 4.85213565826416, + "learning_rate": 3.327557653659601e-05, + "loss": 0.5037, + "step": 113510 + }, + { + "epoch": 1.003553811064552, + "grad_norm": 2.4136953353881836, + "learning_rate": 3.3274103148924135e-05, + "loss": 0.4695, + "step": 113520 + }, + { + "epoch": 1.0036422143248642, + "grad_norm": 2.642446279525757, + "learning_rate": 3.3272629761252264e-05, + "loss": 0.6884, + "step": 113530 + }, + { + "epoch": 1.0037306175851766, + "grad_norm": 1.989149570465088, + "learning_rate": 3.327115637358039e-05, + "loss": 0.6696, + "step": 113540 + }, + { + "epoch": 1.0038190208454887, + "grad_norm": 1.039262294769287, + "learning_rate": 3.326968298590852e-05, + "loss": 0.538, + "step": 113550 + }, + { + "epoch": 1.003907424105801, + "grad_norm": 2.7582459449768066, + "learning_rate": 3.326820959823665e-05, + "loss": 0.5875, + "step": 113560 + }, + { + "epoch": 1.0039958273661134, + "grad_norm": 1.412739872932434, + "learning_rate": 3.3266736210564784e-05, + "loss": 0.7538, + "step": 113570 + }, + { + "epoch": 1.0040842306264255, + "grad_norm": 1.9850144386291504, + "learning_rate": 3.326526282289291e-05, + "loss": 0.6752, + "step": 113580 + }, + { + "epoch": 1.0041726338867378, + "grad_norm": 10.695941925048828, + "learning_rate": 3.326378943522104e-05, + "loss": 0.5454, + "step": 113590 + }, + { + "epoch": 1.00426103714705, + "grad_norm": 5.131735801696777, + "learning_rate": 3.326231604754917e-05, + "loss": 0.8016, + "step": 113600 + }, + { + "epoch": 1.0043494404073623, + "grad_norm": 1.8428049087524414, + "learning_rate": 3.32608426598773e-05, + "loss": 0.6201, + "step": 113610 + }, + { + "epoch": 1.0044378436676744, + "grad_norm": 1.785803198814392, + "learning_rate": 3.3259369272205426e-05, + "loss": 0.5449, + "step": 113620 + }, + { + "epoch": 1.0045262469279868, + "grad_norm": 3.745408058166504, + "learning_rate": 3.325789588453356e-05, + "loss": 0.6351, + "step": 113630 + }, + { + "epoch": 1.0046146501882989, + "grad_norm": 1.622895359992981, + "learning_rate": 3.325642249686168e-05, + "loss": 0.5655, + "step": 113640 + }, + { + "epoch": 1.0047030534486112, + "grad_norm": 1.594959020614624, + "learning_rate": 3.325494910918982e-05, + "loss": 0.696, + "step": 113650 + }, + { + "epoch": 1.0047914567089233, + "grad_norm": 1.0371103286743164, + "learning_rate": 3.3253475721517946e-05, + "loss": 0.4814, + "step": 113660 + }, + { + "epoch": 1.0048798599692357, + "grad_norm": 1.7003803253173828, + "learning_rate": 3.3252002333846074e-05, + "loss": 0.5683, + "step": 113670 + }, + { + "epoch": 1.0049682632295478, + "grad_norm": 3.5081698894500732, + "learning_rate": 3.32505289461742e-05, + "loss": 0.5515, + "step": 113680 + }, + { + "epoch": 1.0050566664898601, + "grad_norm": 2.4893651008605957, + "learning_rate": 3.324905555850234e-05, + "loss": 0.6085, + "step": 113690 + }, + { + "epoch": 1.0051450697501725, + "grad_norm": 2.544677734375, + "learning_rate": 3.324758217083046e-05, + "loss": 0.501, + "step": 113700 + }, + { + "epoch": 1.0052334730104846, + "grad_norm": 8.061636924743652, + "learning_rate": 3.3246108783158594e-05, + "loss": 0.5854, + "step": 113710 + }, + { + "epoch": 1.005321876270797, + "grad_norm": 16.36623191833496, + "learning_rate": 3.324463539548672e-05, + "loss": 0.6096, + "step": 113720 + }, + { + "epoch": 1.005410279531109, + "grad_norm": 10.859704971313477, + "learning_rate": 3.324316200781485e-05, + "loss": 0.6627, + "step": 113730 + }, + { + "epoch": 1.0054986827914214, + "grad_norm": 10.567687034606934, + "learning_rate": 3.324168862014298e-05, + "loss": 0.6371, + "step": 113740 + }, + { + "epoch": 1.0055870860517335, + "grad_norm": 7.632933139801025, + "learning_rate": 3.324021523247111e-05, + "loss": 0.7607, + "step": 113750 + }, + { + "epoch": 1.0056754893120459, + "grad_norm": 2.8745486736297607, + "learning_rate": 3.3238741844799236e-05, + "loss": 0.7439, + "step": 113760 + }, + { + "epoch": 1.005763892572358, + "grad_norm": 1.6206594705581665, + "learning_rate": 3.323726845712737e-05, + "loss": 0.4246, + "step": 113770 + }, + { + "epoch": 1.0058522958326703, + "grad_norm": 2.075242757797241, + "learning_rate": 3.323579506945549e-05, + "loss": 0.6937, + "step": 113780 + }, + { + "epoch": 1.0059406990929824, + "grad_norm": 1.1037721633911133, + "learning_rate": 3.323432168178363e-05, + "loss": 0.5536, + "step": 113790 + }, + { + "epoch": 1.0060291023532948, + "grad_norm": 1.3882333040237427, + "learning_rate": 3.3232848294111756e-05, + "loss": 0.5429, + "step": 113800 + }, + { + "epoch": 1.0061175056136071, + "grad_norm": 2.6770808696746826, + "learning_rate": 3.3231374906439885e-05, + "loss": 0.6684, + "step": 113810 + }, + { + "epoch": 1.0062059088739193, + "grad_norm": 6.101493835449219, + "learning_rate": 3.322990151876801e-05, + "loss": 0.6159, + "step": 113820 + }, + { + "epoch": 1.0062943121342316, + "grad_norm": 6.200798988342285, + "learning_rate": 3.322842813109615e-05, + "loss": 0.586, + "step": 113830 + }, + { + "epoch": 1.0063827153945437, + "grad_norm": 4.1431732177734375, + "learning_rate": 3.322695474342427e-05, + "loss": 0.6897, + "step": 113840 + }, + { + "epoch": 1.006471118654856, + "grad_norm": 3.066347122192383, + "learning_rate": 3.3225481355752405e-05, + "loss": 0.5235, + "step": 113850 + }, + { + "epoch": 1.0065595219151682, + "grad_norm": 14.762816429138184, + "learning_rate": 3.322400796808053e-05, + "loss": 0.6861, + "step": 113860 + }, + { + "epoch": 1.0066479251754805, + "grad_norm": 3.053056478500366, + "learning_rate": 3.322253458040866e-05, + "loss": 0.5974, + "step": 113870 + }, + { + "epoch": 1.0067363284357926, + "grad_norm": 3.4242119789123535, + "learning_rate": 3.322106119273679e-05, + "loss": 0.6279, + "step": 113880 + }, + { + "epoch": 1.006824731696105, + "grad_norm": 2.1032440662384033, + "learning_rate": 3.321958780506492e-05, + "loss": 0.5928, + "step": 113890 + }, + { + "epoch": 1.006913134956417, + "grad_norm": 0.756652295589447, + "learning_rate": 3.321811441739305e-05, + "loss": 0.553, + "step": 113900 + }, + { + "epoch": 1.0070015382167294, + "grad_norm": 7.051733493804932, + "learning_rate": 3.321664102972118e-05, + "loss": 0.5903, + "step": 113910 + }, + { + "epoch": 1.0070899414770418, + "grad_norm": 1.7435734272003174, + "learning_rate": 3.3215167642049303e-05, + "loss": 0.5776, + "step": 113920 + }, + { + "epoch": 1.007178344737354, + "grad_norm": 1.471184492111206, + "learning_rate": 3.321369425437744e-05, + "loss": 0.6572, + "step": 113930 + }, + { + "epoch": 1.0072667479976662, + "grad_norm": 1.5176074504852295, + "learning_rate": 3.321222086670557e-05, + "loss": 0.6424, + "step": 113940 + }, + { + "epoch": 1.0073551512579784, + "grad_norm": 5.296817779541016, + "learning_rate": 3.3210747479033695e-05, + "loss": 0.7113, + "step": 113950 + }, + { + "epoch": 1.0074435545182907, + "grad_norm": 6.168008804321289, + "learning_rate": 3.3209274091361824e-05, + "loss": 0.6011, + "step": 113960 + }, + { + "epoch": 1.0075319577786028, + "grad_norm": 2.4365758895874023, + "learning_rate": 3.320780070368996e-05, + "loss": 0.6166, + "step": 113970 + }, + { + "epoch": 1.0076203610389152, + "grad_norm": 1.9212533235549927, + "learning_rate": 3.320632731601808e-05, + "loss": 0.7925, + "step": 113980 + }, + { + "epoch": 1.0077087642992273, + "grad_norm": 1.874809980392456, + "learning_rate": 3.3204853928346215e-05, + "loss": 0.6869, + "step": 113990 + }, + { + "epoch": 1.0077971675595396, + "grad_norm": 6.095668315887451, + "learning_rate": 3.320338054067434e-05, + "loss": 0.6558, + "step": 114000 + }, + { + "epoch": 1.0078855708198518, + "grad_norm": 1.9459079504013062, + "learning_rate": 3.320190715300247e-05, + "loss": 0.5956, + "step": 114010 + }, + { + "epoch": 1.007973974080164, + "grad_norm": 3.421865940093994, + "learning_rate": 3.32004337653306e-05, + "loss": 0.5842, + "step": 114020 + }, + { + "epoch": 1.0080623773404762, + "grad_norm": 4.6370038986206055, + "learning_rate": 3.319896037765873e-05, + "loss": 0.6503, + "step": 114030 + }, + { + "epoch": 1.0081507806007886, + "grad_norm": 2.3934757709503174, + "learning_rate": 3.319748698998686e-05, + "loss": 0.721, + "step": 114040 + }, + { + "epoch": 1.008239183861101, + "grad_norm": 2.066995859146118, + "learning_rate": 3.319601360231499e-05, + "loss": 0.659, + "step": 114050 + }, + { + "epoch": 1.008327587121413, + "grad_norm": 0.802885115146637, + "learning_rate": 3.3194540214643114e-05, + "loss": 0.6241, + "step": 114060 + }, + { + "epoch": 1.0084159903817254, + "grad_norm": 2.192640781402588, + "learning_rate": 3.319306682697125e-05, + "loss": 0.6469, + "step": 114070 + }, + { + "epoch": 1.0085043936420375, + "grad_norm": 2.280278205871582, + "learning_rate": 3.319159343929938e-05, + "loss": 0.6334, + "step": 114080 + }, + { + "epoch": 1.0085927969023498, + "grad_norm": 9.668909072875977, + "learning_rate": 3.3190120051627506e-05, + "loss": 0.6301, + "step": 114090 + }, + { + "epoch": 1.008681200162662, + "grad_norm": 2.2471654415130615, + "learning_rate": 3.3188646663955634e-05, + "loss": 0.6271, + "step": 114100 + }, + { + "epoch": 1.0087696034229743, + "grad_norm": 4.89241886138916, + "learning_rate": 3.318717327628376e-05, + "loss": 0.6508, + "step": 114110 + }, + { + "epoch": 1.0088580066832864, + "grad_norm": 1.3187072277069092, + "learning_rate": 3.318569988861189e-05, + "loss": 0.5368, + "step": 114120 + }, + { + "epoch": 1.0089464099435987, + "grad_norm": 1.534955382347107, + "learning_rate": 3.3184226500940026e-05, + "loss": 0.5425, + "step": 114130 + }, + { + "epoch": 1.0090348132039109, + "grad_norm": 2.378653049468994, + "learning_rate": 3.318275311326815e-05, + "loss": 0.6615, + "step": 114140 + }, + { + "epoch": 1.0091232164642232, + "grad_norm": 3.9965426921844482, + "learning_rate": 3.318127972559628e-05, + "loss": 0.7364, + "step": 114150 + }, + { + "epoch": 1.0092116197245355, + "grad_norm": 2.593505382537842, + "learning_rate": 3.317980633792441e-05, + "loss": 0.618, + "step": 114160 + }, + { + "epoch": 1.0093000229848477, + "grad_norm": 17.067840576171875, + "learning_rate": 3.317833295025254e-05, + "loss": 0.599, + "step": 114170 + }, + { + "epoch": 1.00938842624516, + "grad_norm": 2.4784748554229736, + "learning_rate": 3.317685956258067e-05, + "loss": 0.6696, + "step": 114180 + }, + { + "epoch": 1.0094768295054721, + "grad_norm": 2.6997790336608887, + "learning_rate": 3.31753861749088e-05, + "loss": 0.7048, + "step": 114190 + }, + { + "epoch": 1.0095652327657845, + "grad_norm": 1.444516897201538, + "learning_rate": 3.3173912787236924e-05, + "loss": 0.5986, + "step": 114200 + }, + { + "epoch": 1.0096536360260966, + "grad_norm": 3.4414377212524414, + "learning_rate": 3.317243939956506e-05, + "loss": 0.5343, + "step": 114210 + }, + { + "epoch": 1.009742039286409, + "grad_norm": 1.959061861038208, + "learning_rate": 3.317096601189318e-05, + "loss": 0.6388, + "step": 114220 + }, + { + "epoch": 1.009830442546721, + "grad_norm": 4.316252708435059, + "learning_rate": 3.3169492624221316e-05, + "loss": 0.653, + "step": 114230 + }, + { + "epoch": 1.0099188458070334, + "grad_norm": 1.8691909313201904, + "learning_rate": 3.3168019236549445e-05, + "loss": 0.6185, + "step": 114240 + }, + { + "epoch": 1.0100072490673455, + "grad_norm": 1.523187279701233, + "learning_rate": 3.316654584887757e-05, + "loss": 0.5836, + "step": 114250 + }, + { + "epoch": 1.0100956523276579, + "grad_norm": 2.1660549640655518, + "learning_rate": 3.31650724612057e-05, + "loss": 0.6075, + "step": 114260 + }, + { + "epoch": 1.01018405558797, + "grad_norm": 1.1963714361190796, + "learning_rate": 3.3163599073533837e-05, + "loss": 0.6646, + "step": 114270 + }, + { + "epoch": 1.0102724588482823, + "grad_norm": 1.5786654949188232, + "learning_rate": 3.316212568586196e-05, + "loss": 0.6503, + "step": 114280 + }, + { + "epoch": 1.0103608621085947, + "grad_norm": 2.0850670337677, + "learning_rate": 3.316065229819009e-05, + "loss": 0.6873, + "step": 114290 + }, + { + "epoch": 1.0104492653689068, + "grad_norm": 1.2632302045822144, + "learning_rate": 3.315917891051822e-05, + "loss": 0.5467, + "step": 114300 + }, + { + "epoch": 1.0105376686292191, + "grad_norm": 3.6498982906341553, + "learning_rate": 3.315770552284635e-05, + "loss": 0.5677, + "step": 114310 + }, + { + "epoch": 1.0106260718895312, + "grad_norm": 1.6779340505599976, + "learning_rate": 3.315623213517448e-05, + "loss": 0.5289, + "step": 114320 + }, + { + "epoch": 1.0107144751498436, + "grad_norm": 9.336527824401855, + "learning_rate": 3.3154758747502613e-05, + "loss": 0.6628, + "step": 114330 + }, + { + "epoch": 1.0108028784101557, + "grad_norm": 2.1366055011749268, + "learning_rate": 3.3153285359830735e-05, + "loss": 0.7948, + "step": 114340 + }, + { + "epoch": 1.010891281670468, + "grad_norm": 1.3133643865585327, + "learning_rate": 3.315181197215887e-05, + "loss": 0.5915, + "step": 114350 + }, + { + "epoch": 1.0109796849307802, + "grad_norm": 2.3709449768066406, + "learning_rate": 3.3150338584487e-05, + "loss": 0.7534, + "step": 114360 + }, + { + "epoch": 1.0110680881910925, + "grad_norm": 2.742271661758423, + "learning_rate": 3.314886519681513e-05, + "loss": 0.6598, + "step": 114370 + }, + { + "epoch": 1.0111564914514046, + "grad_norm": 1.0594581365585327, + "learning_rate": 3.3147391809143255e-05, + "loss": 0.6599, + "step": 114380 + }, + { + "epoch": 1.011244894711717, + "grad_norm": 2.854827880859375, + "learning_rate": 3.3145918421471384e-05, + "loss": 0.5625, + "step": 114390 + }, + { + "epoch": 1.0113332979720293, + "grad_norm": 1.4914394617080688, + "learning_rate": 3.314444503379951e-05, + "loss": 0.5157, + "step": 114400 + }, + { + "epoch": 1.0114217012323414, + "grad_norm": 2.1870899200439453, + "learning_rate": 3.314297164612765e-05, + "loss": 0.5608, + "step": 114410 + }, + { + "epoch": 1.0115101044926538, + "grad_norm": 3.653965950012207, + "learning_rate": 3.3141498258455775e-05, + "loss": 0.7114, + "step": 114420 + }, + { + "epoch": 1.011598507752966, + "grad_norm": 4.081316947937012, + "learning_rate": 3.3140024870783904e-05, + "loss": 0.6132, + "step": 114430 + }, + { + "epoch": 1.0116869110132782, + "grad_norm": 0.9579772353172302, + "learning_rate": 3.313855148311203e-05, + "loss": 0.6651, + "step": 114440 + }, + { + "epoch": 1.0117753142735904, + "grad_norm": 7.856067180633545, + "learning_rate": 3.313707809544016e-05, + "loss": 0.5882, + "step": 114450 + }, + { + "epoch": 1.0118637175339027, + "grad_norm": 2.0313918590545654, + "learning_rate": 3.313560470776829e-05, + "loss": 0.7006, + "step": 114460 + }, + { + "epoch": 1.0119521207942148, + "grad_norm": 1.9130921363830566, + "learning_rate": 3.313413132009642e-05, + "loss": 0.7166, + "step": 114470 + }, + { + "epoch": 1.0120405240545272, + "grad_norm": 0.9928189516067505, + "learning_rate": 3.313265793242455e-05, + "loss": 0.6781, + "step": 114480 + }, + { + "epoch": 1.0121289273148393, + "grad_norm": 0.9419524669647217, + "learning_rate": 3.313118454475268e-05, + "loss": 0.5059, + "step": 114490 + }, + { + "epoch": 1.0122173305751516, + "grad_norm": 1.8238518238067627, + "learning_rate": 3.312971115708081e-05, + "loss": 0.6664, + "step": 114500 + }, + { + "epoch": 1.012305733835464, + "grad_norm": 3.2623229026794434, + "learning_rate": 3.312823776940894e-05, + "loss": 0.7242, + "step": 114510 + }, + { + "epoch": 1.012394137095776, + "grad_norm": 3.2176554203033447, + "learning_rate": 3.3126764381737066e-05, + "loss": 0.6324, + "step": 114520 + }, + { + "epoch": 1.0124825403560884, + "grad_norm": 5.634082317352295, + "learning_rate": 3.3125290994065194e-05, + "loss": 0.7172, + "step": 114530 + }, + { + "epoch": 1.0125709436164005, + "grad_norm": 1.57271146774292, + "learning_rate": 3.312381760639333e-05, + "loss": 0.5841, + "step": 114540 + }, + { + "epoch": 1.0126593468767129, + "grad_norm": 5.282871723175049, + "learning_rate": 3.312234421872146e-05, + "loss": 0.5814, + "step": 114550 + }, + { + "epoch": 1.012747750137025, + "grad_norm": 3.7397422790527344, + "learning_rate": 3.3120870831049586e-05, + "loss": 0.6079, + "step": 114560 + }, + { + "epoch": 1.0128361533973373, + "grad_norm": 4.592635154724121, + "learning_rate": 3.3119397443377714e-05, + "loss": 0.715, + "step": 114570 + }, + { + "epoch": 1.0129245566576495, + "grad_norm": 3.379467725753784, + "learning_rate": 3.311792405570584e-05, + "loss": 0.5434, + "step": 114580 + }, + { + "epoch": 1.0130129599179618, + "grad_norm": 2.7413527965545654, + "learning_rate": 3.311645066803397e-05, + "loss": 0.6117, + "step": 114590 + }, + { + "epoch": 1.013101363178274, + "grad_norm": 2.3742785453796387, + "learning_rate": 3.3114977280362106e-05, + "loss": 0.576, + "step": 114600 + }, + { + "epoch": 1.0131897664385863, + "grad_norm": 0.8790023326873779, + "learning_rate": 3.311350389269023e-05, + "loss": 0.5333, + "step": 114610 + }, + { + "epoch": 1.0132781696988984, + "grad_norm": 7.006760120391846, + "learning_rate": 3.311203050501836e-05, + "loss": 0.5783, + "step": 114620 + }, + { + "epoch": 1.0133665729592107, + "grad_norm": 6.131092548370361, + "learning_rate": 3.311055711734649e-05, + "loss": 0.6316, + "step": 114630 + }, + { + "epoch": 1.013454976219523, + "grad_norm": 1.641683578491211, + "learning_rate": 3.310908372967462e-05, + "loss": 0.627, + "step": 114640 + }, + { + "epoch": 1.0135433794798352, + "grad_norm": 2.0098884105682373, + "learning_rate": 3.310761034200275e-05, + "loss": 0.6766, + "step": 114650 + }, + { + "epoch": 1.0136317827401475, + "grad_norm": 2.0325984954833984, + "learning_rate": 3.310613695433088e-05, + "loss": 0.7113, + "step": 114660 + }, + { + "epoch": 1.0137201860004597, + "grad_norm": 2.463782787322998, + "learning_rate": 3.3104663566659005e-05, + "loss": 0.7137, + "step": 114670 + }, + { + "epoch": 1.013808589260772, + "grad_norm": 3.714784622192383, + "learning_rate": 3.310319017898714e-05, + "loss": 0.5961, + "step": 114680 + }, + { + "epoch": 1.0138969925210841, + "grad_norm": 1.7615363597869873, + "learning_rate": 3.310171679131526e-05, + "loss": 0.7349, + "step": 114690 + }, + { + "epoch": 1.0139853957813965, + "grad_norm": 3.256420135498047, + "learning_rate": 3.3100243403643396e-05, + "loss": 0.6422, + "step": 114700 + }, + { + "epoch": 1.0140737990417086, + "grad_norm": 1.1652439832687378, + "learning_rate": 3.3098770015971525e-05, + "loss": 0.4512, + "step": 114710 + }, + { + "epoch": 1.014162202302021, + "grad_norm": 3.902275562286377, + "learning_rate": 3.309729662829965e-05, + "loss": 0.6091, + "step": 114720 + }, + { + "epoch": 1.014250605562333, + "grad_norm": 2.9751980304718018, + "learning_rate": 3.309582324062778e-05, + "loss": 0.7228, + "step": 114730 + }, + { + "epoch": 1.0143390088226454, + "grad_norm": 2.747573137283325, + "learning_rate": 3.3094349852955917e-05, + "loss": 0.6112, + "step": 114740 + }, + { + "epoch": 1.0144274120829577, + "grad_norm": 1.6531888246536255, + "learning_rate": 3.309287646528404e-05, + "loss": 0.5841, + "step": 114750 + }, + { + "epoch": 1.0145158153432698, + "grad_norm": 3.116415023803711, + "learning_rate": 3.309140307761217e-05, + "loss": 0.6197, + "step": 114760 + }, + { + "epoch": 1.0146042186035822, + "grad_norm": 6.16981840133667, + "learning_rate": 3.30899296899403e-05, + "loss": 0.584, + "step": 114770 + }, + { + "epoch": 1.0146926218638943, + "grad_norm": 2.065519332885742, + "learning_rate": 3.308845630226843e-05, + "loss": 0.5533, + "step": 114780 + }, + { + "epoch": 1.0147810251242066, + "grad_norm": 7.4151411056518555, + "learning_rate": 3.308698291459656e-05, + "loss": 0.623, + "step": 114790 + }, + { + "epoch": 1.0148694283845188, + "grad_norm": 1.7224183082580566, + "learning_rate": 3.3085509526924693e-05, + "loss": 0.6343, + "step": 114800 + }, + { + "epoch": 1.014957831644831, + "grad_norm": 4.719710826873779, + "learning_rate": 3.3084036139252815e-05, + "loss": 0.4838, + "step": 114810 + }, + { + "epoch": 1.0150462349051432, + "grad_norm": 2.6137685775756836, + "learning_rate": 3.308256275158095e-05, + "loss": 0.6428, + "step": 114820 + }, + { + "epoch": 1.0151346381654556, + "grad_norm": 1.3227020502090454, + "learning_rate": 3.308108936390907e-05, + "loss": 0.5614, + "step": 114830 + }, + { + "epoch": 1.0152230414257677, + "grad_norm": 9.94175910949707, + "learning_rate": 3.307961597623721e-05, + "loss": 0.8337, + "step": 114840 + }, + { + "epoch": 1.01531144468608, + "grad_norm": 13.124799728393555, + "learning_rate": 3.3078142588565335e-05, + "loss": 0.6158, + "step": 114850 + }, + { + "epoch": 1.0153998479463922, + "grad_norm": 2.885446786880493, + "learning_rate": 3.3076669200893464e-05, + "loss": 0.7418, + "step": 114860 + }, + { + "epoch": 1.0154882512067045, + "grad_norm": 5.281364917755127, + "learning_rate": 3.307519581322159e-05, + "loss": 0.5997, + "step": 114870 + }, + { + "epoch": 1.0155766544670168, + "grad_norm": 1.1485543251037598, + "learning_rate": 3.307372242554973e-05, + "loss": 0.6214, + "step": 114880 + }, + { + "epoch": 1.015665057727329, + "grad_norm": 1.0571945905685425, + "learning_rate": 3.307224903787785e-05, + "loss": 0.4867, + "step": 114890 + }, + { + "epoch": 1.0157534609876413, + "grad_norm": 9.413779258728027, + "learning_rate": 3.3070775650205984e-05, + "loss": 0.5908, + "step": 114900 + }, + { + "epoch": 1.0158418642479534, + "grad_norm": 1.4142274856567383, + "learning_rate": 3.306930226253411e-05, + "loss": 0.5757, + "step": 114910 + }, + { + "epoch": 1.0159302675082658, + "grad_norm": 6.0000457763671875, + "learning_rate": 3.306782887486224e-05, + "loss": 0.5736, + "step": 114920 + }, + { + "epoch": 1.0160186707685779, + "grad_norm": 4.925318241119385, + "learning_rate": 3.306635548719037e-05, + "loss": 0.5809, + "step": 114930 + }, + { + "epoch": 1.0161070740288902, + "grad_norm": 2.033148765563965, + "learning_rate": 3.30648820995185e-05, + "loss": 0.6067, + "step": 114940 + }, + { + "epoch": 1.0161954772892023, + "grad_norm": 3.5339834690093994, + "learning_rate": 3.3063408711846626e-05, + "loss": 0.5897, + "step": 114950 + }, + { + "epoch": 1.0162838805495147, + "grad_norm": 1.1234498023986816, + "learning_rate": 3.306193532417476e-05, + "loss": 0.5069, + "step": 114960 + }, + { + "epoch": 1.0163722838098268, + "grad_norm": 7.30183219909668, + "learning_rate": 3.306046193650288e-05, + "loss": 0.6961, + "step": 114970 + }, + { + "epoch": 1.0164606870701391, + "grad_norm": 4.576351642608643, + "learning_rate": 3.305898854883102e-05, + "loss": 0.7681, + "step": 114980 + }, + { + "epoch": 1.0165490903304515, + "grad_norm": 4.762503147125244, + "learning_rate": 3.3057515161159146e-05, + "loss": 0.6715, + "step": 114990 + }, + { + "epoch": 1.0166374935907636, + "grad_norm": 2.4711084365844727, + "learning_rate": 3.3056041773487274e-05, + "loss": 0.5581, + "step": 115000 + }, + { + "epoch": 1.016725896851076, + "grad_norm": 1.7345740795135498, + "learning_rate": 3.30545683858154e-05, + "loss": 0.6539, + "step": 115010 + }, + { + "epoch": 1.016814300111388, + "grad_norm": 2.8621976375579834, + "learning_rate": 3.305309499814354e-05, + "loss": 0.6463, + "step": 115020 + }, + { + "epoch": 1.0169027033717004, + "grad_norm": 2.12165904045105, + "learning_rate": 3.305162161047166e-05, + "loss": 0.5, + "step": 115030 + }, + { + "epoch": 1.0169911066320125, + "grad_norm": 0.9638975262641907, + "learning_rate": 3.3050148222799794e-05, + "loss": 0.5905, + "step": 115040 + }, + { + "epoch": 1.0170795098923249, + "grad_norm": 2.3649346828460693, + "learning_rate": 3.3048674835127916e-05, + "loss": 0.642, + "step": 115050 + }, + { + "epoch": 1.017167913152637, + "grad_norm": 1.0278958082199097, + "learning_rate": 3.304720144745605e-05, + "loss": 0.5289, + "step": 115060 + }, + { + "epoch": 1.0172563164129493, + "grad_norm": 2.411811351776123, + "learning_rate": 3.304572805978418e-05, + "loss": 0.5647, + "step": 115070 + }, + { + "epoch": 1.0173447196732615, + "grad_norm": 3.973703384399414, + "learning_rate": 3.304425467211231e-05, + "loss": 0.5368, + "step": 115080 + }, + { + "epoch": 1.0174331229335738, + "grad_norm": 2.5362212657928467, + "learning_rate": 3.3042781284440436e-05, + "loss": 0.7078, + "step": 115090 + }, + { + "epoch": 1.0175215261938861, + "grad_norm": 2.3636996746063232, + "learning_rate": 3.304130789676857e-05, + "loss": 0.7232, + "step": 115100 + }, + { + "epoch": 1.0176099294541983, + "grad_norm": 2.8480608463287354, + "learning_rate": 3.303983450909669e-05, + "loss": 0.6377, + "step": 115110 + }, + { + "epoch": 1.0176983327145106, + "grad_norm": 1.7566604614257812, + "learning_rate": 3.303836112142483e-05, + "loss": 0.6525, + "step": 115120 + }, + { + "epoch": 1.0177867359748227, + "grad_norm": 1.5804463624954224, + "learning_rate": 3.3036887733752956e-05, + "loss": 0.5934, + "step": 115130 + }, + { + "epoch": 1.017875139235135, + "grad_norm": 8.386014938354492, + "learning_rate": 3.3035414346081085e-05, + "loss": 0.595, + "step": 115140 + }, + { + "epoch": 1.0179635424954472, + "grad_norm": 2.0764265060424805, + "learning_rate": 3.303394095840921e-05, + "loss": 0.6004, + "step": 115150 + }, + { + "epoch": 1.0180519457557595, + "grad_norm": 2.2943568229675293, + "learning_rate": 3.303246757073734e-05, + "loss": 0.7777, + "step": 115160 + }, + { + "epoch": 1.0181403490160716, + "grad_norm": 0.6454117298126221, + "learning_rate": 3.303099418306547e-05, + "loss": 0.5751, + "step": 115170 + }, + { + "epoch": 1.018228752276384, + "grad_norm": 2.7356128692626953, + "learning_rate": 3.3029520795393605e-05, + "loss": 0.6796, + "step": 115180 + }, + { + "epoch": 1.018317155536696, + "grad_norm": 1.1771854162216187, + "learning_rate": 3.3028047407721726e-05, + "loss": 0.6093, + "step": 115190 + }, + { + "epoch": 1.0184055587970084, + "grad_norm": 2.6923511028289795, + "learning_rate": 3.302657402004986e-05, + "loss": 0.603, + "step": 115200 + }, + { + "epoch": 1.0184939620573206, + "grad_norm": 2.4413962364196777, + "learning_rate": 3.302510063237799e-05, + "loss": 0.6456, + "step": 115210 + }, + { + "epoch": 1.018582365317633, + "grad_norm": 1.724786639213562, + "learning_rate": 3.302362724470612e-05, + "loss": 0.5813, + "step": 115220 + }, + { + "epoch": 1.0186707685779453, + "grad_norm": 1.0470874309539795, + "learning_rate": 3.3022153857034247e-05, + "loss": 0.5502, + "step": 115230 + }, + { + "epoch": 1.0187591718382574, + "grad_norm": 2.4003427028656006, + "learning_rate": 3.302068046936238e-05, + "loss": 0.6398, + "step": 115240 + }, + { + "epoch": 1.0188475750985697, + "grad_norm": 9.031026840209961, + "learning_rate": 3.30192070816905e-05, + "loss": 0.644, + "step": 115250 + }, + { + "epoch": 1.0189359783588818, + "grad_norm": 1.9744430780410767, + "learning_rate": 3.301773369401864e-05, + "loss": 0.5874, + "step": 115260 + }, + { + "epoch": 1.0190243816191942, + "grad_norm": 3.2505557537078857, + "learning_rate": 3.301626030634677e-05, + "loss": 0.6277, + "step": 115270 + }, + { + "epoch": 1.0191127848795063, + "grad_norm": 3.8930466175079346, + "learning_rate": 3.3014786918674895e-05, + "loss": 0.6859, + "step": 115280 + }, + { + "epoch": 1.0192011881398186, + "grad_norm": 6.320791721343994, + "learning_rate": 3.3013313531003023e-05, + "loss": 0.5608, + "step": 115290 + }, + { + "epoch": 1.0192895914001308, + "grad_norm": 3.1180779933929443, + "learning_rate": 3.301184014333115e-05, + "loss": 0.6638, + "step": 115300 + }, + { + "epoch": 1.019377994660443, + "grad_norm": 2.680518627166748, + "learning_rate": 3.301036675565928e-05, + "loss": 0.6795, + "step": 115310 + }, + { + "epoch": 1.0194663979207552, + "grad_norm": 2.081961154937744, + "learning_rate": 3.3008893367987415e-05, + "loss": 0.6785, + "step": 115320 + }, + { + "epoch": 1.0195548011810676, + "grad_norm": 7.333011150360107, + "learning_rate": 3.3007419980315544e-05, + "loss": 0.6141, + "step": 115330 + }, + { + "epoch": 1.01964320444138, + "grad_norm": 1.7128896713256836, + "learning_rate": 3.300594659264367e-05, + "loss": 0.6972, + "step": 115340 + }, + { + "epoch": 1.019731607701692, + "grad_norm": 8.286114692687988, + "learning_rate": 3.30044732049718e-05, + "loss": 0.5715, + "step": 115350 + }, + { + "epoch": 1.0198200109620044, + "grad_norm": 0.9567068815231323, + "learning_rate": 3.300299981729993e-05, + "loss": 0.5846, + "step": 115360 + }, + { + "epoch": 1.0199084142223165, + "grad_norm": 0.8313459753990173, + "learning_rate": 3.300152642962806e-05, + "loss": 0.6588, + "step": 115370 + }, + { + "epoch": 1.0199968174826288, + "grad_norm": 1.2403515577316284, + "learning_rate": 3.300005304195619e-05, + "loss": 0.5241, + "step": 115380 + }, + { + "epoch": 1.020085220742941, + "grad_norm": 1.7476023435592651, + "learning_rate": 3.299857965428432e-05, + "loss": 0.5517, + "step": 115390 + }, + { + "epoch": 1.0201736240032533, + "grad_norm": 3.253467321395874, + "learning_rate": 3.299710626661245e-05, + "loss": 0.5874, + "step": 115400 + }, + { + "epoch": 1.0202620272635654, + "grad_norm": 3.416421413421631, + "learning_rate": 3.299563287894058e-05, + "loss": 0.6567, + "step": 115410 + }, + { + "epoch": 1.0203504305238777, + "grad_norm": 12.988905906677246, + "learning_rate": 3.2994159491268706e-05, + "loss": 0.577, + "step": 115420 + }, + { + "epoch": 1.0204388337841899, + "grad_norm": 3.3279874324798584, + "learning_rate": 3.2992686103596834e-05, + "loss": 0.472, + "step": 115430 + }, + { + "epoch": 1.0205272370445022, + "grad_norm": 8.408232688903809, + "learning_rate": 3.299121271592496e-05, + "loss": 0.6969, + "step": 115440 + }, + { + "epoch": 1.0206156403048143, + "grad_norm": 1.940561056137085, + "learning_rate": 3.29897393282531e-05, + "loss": 0.5783, + "step": 115450 + }, + { + "epoch": 1.0207040435651267, + "grad_norm": 3.3989226818084717, + "learning_rate": 3.2988265940581226e-05, + "loss": 0.6387, + "step": 115460 + }, + { + "epoch": 1.020792446825439, + "grad_norm": 2.806227684020996, + "learning_rate": 3.2986792552909354e-05, + "loss": 0.593, + "step": 115470 + }, + { + "epoch": 1.0208808500857511, + "grad_norm": 2.114802360534668, + "learning_rate": 3.298531916523748e-05, + "loss": 0.5895, + "step": 115480 + }, + { + "epoch": 1.0209692533460635, + "grad_norm": 1.941298246383667, + "learning_rate": 3.298384577756561e-05, + "loss": 0.5382, + "step": 115490 + }, + { + "epoch": 1.0210576566063756, + "grad_norm": 2.512268304824829, + "learning_rate": 3.298237238989374e-05, + "loss": 0.7258, + "step": 115500 + }, + { + "epoch": 1.021146059866688, + "grad_norm": 2.1646790504455566, + "learning_rate": 3.2980899002221874e-05, + "loss": 0.6303, + "step": 115510 + }, + { + "epoch": 1.021234463127, + "grad_norm": 3.8876559734344482, + "learning_rate": 3.2979425614549996e-05, + "loss": 0.5748, + "step": 115520 + }, + { + "epoch": 1.0213228663873124, + "grad_norm": 3.440377950668335, + "learning_rate": 3.297795222687813e-05, + "loss": 0.6175, + "step": 115530 + }, + { + "epoch": 1.0214112696476245, + "grad_norm": 5.650941371917725, + "learning_rate": 3.297647883920626e-05, + "loss": 0.6, + "step": 115540 + }, + { + "epoch": 1.0214996729079369, + "grad_norm": 1.6975969076156616, + "learning_rate": 3.297500545153439e-05, + "loss": 0.602, + "step": 115550 + }, + { + "epoch": 1.021588076168249, + "grad_norm": 2.118499279022217, + "learning_rate": 3.2973532063862516e-05, + "loss": 0.6747, + "step": 115560 + }, + { + "epoch": 1.0216764794285613, + "grad_norm": 2.2729036808013916, + "learning_rate": 3.297205867619065e-05, + "loss": 0.5729, + "step": 115570 + }, + { + "epoch": 1.0217648826888737, + "grad_norm": 5.56952428817749, + "learning_rate": 3.297058528851877e-05, + "loss": 0.575, + "step": 115580 + }, + { + "epoch": 1.0218532859491858, + "grad_norm": 2.339576244354248, + "learning_rate": 3.296911190084691e-05, + "loss": 0.6363, + "step": 115590 + }, + { + "epoch": 1.0219416892094981, + "grad_norm": 5.936741352081299, + "learning_rate": 3.2967638513175036e-05, + "loss": 0.6227, + "step": 115600 + }, + { + "epoch": 1.0220300924698102, + "grad_norm": 6.292945384979248, + "learning_rate": 3.2966165125503165e-05, + "loss": 0.6724, + "step": 115610 + }, + { + "epoch": 1.0221184957301226, + "grad_norm": 0.6593518853187561, + "learning_rate": 3.296469173783129e-05, + "loss": 0.5803, + "step": 115620 + }, + { + "epoch": 1.0222068989904347, + "grad_norm": 4.837062358856201, + "learning_rate": 3.296321835015942e-05, + "loss": 0.4487, + "step": 115630 + }, + { + "epoch": 1.022295302250747, + "grad_norm": 2.0359268188476562, + "learning_rate": 3.296174496248755e-05, + "loss": 0.5896, + "step": 115640 + }, + { + "epoch": 1.0223837055110592, + "grad_norm": 0.9177247285842896, + "learning_rate": 3.2960271574815685e-05, + "loss": 0.6012, + "step": 115650 + }, + { + "epoch": 1.0224721087713715, + "grad_norm": 2.142111301422119, + "learning_rate": 3.2958798187143806e-05, + "loss": 0.7178, + "step": 115660 + }, + { + "epoch": 1.0225605120316836, + "grad_norm": 1.4394023418426514, + "learning_rate": 3.295732479947194e-05, + "loss": 0.6407, + "step": 115670 + }, + { + "epoch": 1.022648915291996, + "grad_norm": 2.4677865505218506, + "learning_rate": 3.295585141180007e-05, + "loss": 0.5853, + "step": 115680 + }, + { + "epoch": 1.0227373185523083, + "grad_norm": 2.486605405807495, + "learning_rate": 3.29543780241282e-05, + "loss": 0.6149, + "step": 115690 + }, + { + "epoch": 1.0228257218126204, + "grad_norm": 2.982140064239502, + "learning_rate": 3.295290463645633e-05, + "loss": 0.5306, + "step": 115700 + }, + { + "epoch": 1.0229141250729328, + "grad_norm": 10.542850494384766, + "learning_rate": 3.295143124878446e-05, + "loss": 0.6284, + "step": 115710 + }, + { + "epoch": 1.023002528333245, + "grad_norm": 2.162801742553711, + "learning_rate": 3.294995786111258e-05, + "loss": 0.6244, + "step": 115720 + }, + { + "epoch": 1.0230909315935572, + "grad_norm": 2.2199089527130127, + "learning_rate": 3.294848447344072e-05, + "loss": 0.7403, + "step": 115730 + }, + { + "epoch": 1.0231793348538694, + "grad_norm": 1.796887755393982, + "learning_rate": 3.294701108576885e-05, + "loss": 0.8027, + "step": 115740 + }, + { + "epoch": 1.0232677381141817, + "grad_norm": 2.835404872894287, + "learning_rate": 3.2945537698096975e-05, + "loss": 0.6643, + "step": 115750 + }, + { + "epoch": 1.0233561413744938, + "grad_norm": 1.3645983934402466, + "learning_rate": 3.2944064310425104e-05, + "loss": 0.6417, + "step": 115760 + }, + { + "epoch": 1.0234445446348062, + "grad_norm": 3.714608669281006, + "learning_rate": 3.294259092275323e-05, + "loss": 0.6954, + "step": 115770 + }, + { + "epoch": 1.0235329478951183, + "grad_norm": 14.418932914733887, + "learning_rate": 3.294111753508136e-05, + "loss": 0.5623, + "step": 115780 + }, + { + "epoch": 1.0236213511554306, + "grad_norm": 4.085190773010254, + "learning_rate": 3.2939644147409495e-05, + "loss": 0.6271, + "step": 115790 + }, + { + "epoch": 1.0237097544157427, + "grad_norm": 1.3367069959640503, + "learning_rate": 3.293817075973762e-05, + "loss": 0.4774, + "step": 115800 + }, + { + "epoch": 1.023798157676055, + "grad_norm": 5.851768970489502, + "learning_rate": 3.293669737206575e-05, + "loss": 0.6517, + "step": 115810 + }, + { + "epoch": 1.0238865609363674, + "grad_norm": 1.6313979625701904, + "learning_rate": 3.293522398439388e-05, + "loss": 0.5888, + "step": 115820 + }, + { + "epoch": 1.0239749641966795, + "grad_norm": 1.676857352256775, + "learning_rate": 3.293375059672201e-05, + "loss": 0.6169, + "step": 115830 + }, + { + "epoch": 1.024063367456992, + "grad_norm": 9.19443416595459, + "learning_rate": 3.293227720905014e-05, + "loss": 0.7504, + "step": 115840 + }, + { + "epoch": 1.024151770717304, + "grad_norm": 4.627795219421387, + "learning_rate": 3.293080382137827e-05, + "loss": 0.7622, + "step": 115850 + }, + { + "epoch": 1.0242401739776164, + "grad_norm": 2.1353626251220703, + "learning_rate": 3.2929330433706394e-05, + "loss": 0.6792, + "step": 115860 + }, + { + "epoch": 1.0243285772379285, + "grad_norm": 1.4297456741333008, + "learning_rate": 3.292785704603453e-05, + "loss": 0.6547, + "step": 115870 + }, + { + "epoch": 1.0244169804982408, + "grad_norm": 4.945678234100342, + "learning_rate": 3.292638365836265e-05, + "loss": 0.5383, + "step": 115880 + }, + { + "epoch": 1.024505383758553, + "grad_norm": 1.5775309801101685, + "learning_rate": 3.2924910270690786e-05, + "loss": 0.5684, + "step": 115890 + }, + { + "epoch": 1.0245937870188653, + "grad_norm": 11.50912094116211, + "learning_rate": 3.2923436883018914e-05, + "loss": 0.6661, + "step": 115900 + }, + { + "epoch": 1.0246821902791774, + "grad_norm": 3.94026517868042, + "learning_rate": 3.292196349534704e-05, + "loss": 0.5846, + "step": 115910 + }, + { + "epoch": 1.0247705935394897, + "grad_norm": 1.2065709829330444, + "learning_rate": 3.292049010767517e-05, + "loss": 0.5183, + "step": 115920 + }, + { + "epoch": 1.024858996799802, + "grad_norm": 2.126706123352051, + "learning_rate": 3.2919016720003306e-05, + "loss": 0.6142, + "step": 115930 + }, + { + "epoch": 1.0249474000601142, + "grad_norm": 9.068276405334473, + "learning_rate": 3.291754333233143e-05, + "loss": 0.7343, + "step": 115940 + }, + { + "epoch": 1.0250358033204265, + "grad_norm": 1.9008930921554565, + "learning_rate": 3.291606994465956e-05, + "loss": 0.6975, + "step": 115950 + }, + { + "epoch": 1.0251242065807387, + "grad_norm": 2.7840113639831543, + "learning_rate": 3.291459655698769e-05, + "loss": 0.5029, + "step": 115960 + }, + { + "epoch": 1.025212609841051, + "grad_norm": 2.1668808460235596, + "learning_rate": 3.291312316931582e-05, + "loss": 0.5861, + "step": 115970 + }, + { + "epoch": 1.0253010131013631, + "grad_norm": 15.260984420776367, + "learning_rate": 3.291164978164395e-05, + "loss": 0.5989, + "step": 115980 + }, + { + "epoch": 1.0253894163616755, + "grad_norm": 3.3249385356903076, + "learning_rate": 3.2910176393972076e-05, + "loss": 0.6341, + "step": 115990 + }, + { + "epoch": 1.0254778196219876, + "grad_norm": 4.381968021392822, + "learning_rate": 3.2908703006300204e-05, + "loss": 0.5492, + "step": 116000 + }, + { + "epoch": 1.0255662228823, + "grad_norm": 2.3023040294647217, + "learning_rate": 3.290722961862834e-05, + "loss": 0.6113, + "step": 116010 + }, + { + "epoch": 1.025654626142612, + "grad_norm": 1.339086890220642, + "learning_rate": 3.290575623095646e-05, + "loss": 0.42, + "step": 116020 + }, + { + "epoch": 1.0257430294029244, + "grad_norm": 5.605476379394531, + "learning_rate": 3.2904282843284596e-05, + "loss": 0.6983, + "step": 116030 + }, + { + "epoch": 1.0258314326632365, + "grad_norm": 5.244485378265381, + "learning_rate": 3.2902809455612725e-05, + "loss": 0.6016, + "step": 116040 + }, + { + "epoch": 1.0259198359235489, + "grad_norm": 4.918577194213867, + "learning_rate": 3.290133606794085e-05, + "loss": 0.7356, + "step": 116050 + }, + { + "epoch": 1.0260082391838612, + "grad_norm": 9.049163818359375, + "learning_rate": 3.289986268026898e-05, + "loss": 0.6026, + "step": 116060 + }, + { + "epoch": 1.0260966424441733, + "grad_norm": 2.0767617225646973, + "learning_rate": 3.2898389292597116e-05, + "loss": 0.5932, + "step": 116070 + }, + { + "epoch": 1.0261850457044857, + "grad_norm": 2.3189079761505127, + "learning_rate": 3.289691590492524e-05, + "loss": 0.6042, + "step": 116080 + }, + { + "epoch": 1.0262734489647978, + "grad_norm": 1.4300647974014282, + "learning_rate": 3.289544251725337e-05, + "loss": 0.7043, + "step": 116090 + }, + { + "epoch": 1.0263618522251101, + "grad_norm": 2.61091947555542, + "learning_rate": 3.2893969129581495e-05, + "loss": 0.5071, + "step": 116100 + }, + { + "epoch": 1.0264502554854222, + "grad_norm": 3.349518060684204, + "learning_rate": 3.289249574190963e-05, + "loss": 0.5487, + "step": 116110 + }, + { + "epoch": 1.0265386587457346, + "grad_norm": 2.2534759044647217, + "learning_rate": 3.289102235423776e-05, + "loss": 0.6715, + "step": 116120 + }, + { + "epoch": 1.0266270620060467, + "grad_norm": 6.956655025482178, + "learning_rate": 3.2889548966565887e-05, + "loss": 0.5354, + "step": 116130 + }, + { + "epoch": 1.026715465266359, + "grad_norm": 1.267991304397583, + "learning_rate": 3.2888075578894015e-05, + "loss": 0.5453, + "step": 116140 + }, + { + "epoch": 1.0268038685266712, + "grad_norm": 4.1753010749816895, + "learning_rate": 3.288660219122215e-05, + "loss": 0.6389, + "step": 116150 + }, + { + "epoch": 1.0268922717869835, + "grad_norm": 2.0763416290283203, + "learning_rate": 3.288512880355027e-05, + "loss": 0.5779, + "step": 116160 + }, + { + "epoch": 1.0269806750472958, + "grad_norm": 1.9550834894180298, + "learning_rate": 3.288365541587841e-05, + "loss": 0.6679, + "step": 116170 + }, + { + "epoch": 1.027069078307608, + "grad_norm": 2.5839273929595947, + "learning_rate": 3.2882182028206535e-05, + "loss": 0.65, + "step": 116180 + }, + { + "epoch": 1.0271574815679203, + "grad_norm": 49.472232818603516, + "learning_rate": 3.2880708640534663e-05, + "loss": 0.7485, + "step": 116190 + }, + { + "epoch": 1.0272458848282324, + "grad_norm": 1.6635946035385132, + "learning_rate": 3.287923525286279e-05, + "loss": 0.5659, + "step": 116200 + }, + { + "epoch": 1.0273342880885448, + "grad_norm": 1.6751060485839844, + "learning_rate": 3.287776186519093e-05, + "loss": 0.5833, + "step": 116210 + }, + { + "epoch": 1.0274226913488569, + "grad_norm": 5.592175006866455, + "learning_rate": 3.287628847751905e-05, + "loss": 0.5921, + "step": 116220 + }, + { + "epoch": 1.0275110946091692, + "grad_norm": 2.4472837448120117, + "learning_rate": 3.2874815089847184e-05, + "loss": 0.7164, + "step": 116230 + }, + { + "epoch": 1.0275994978694813, + "grad_norm": 3.849940776824951, + "learning_rate": 3.287334170217531e-05, + "loss": 0.604, + "step": 116240 + }, + { + "epoch": 1.0276879011297937, + "grad_norm": 6.839824676513672, + "learning_rate": 3.287186831450344e-05, + "loss": 0.592, + "step": 116250 + }, + { + "epoch": 1.0277763043901058, + "grad_norm": 12.123258590698242, + "learning_rate": 3.287039492683157e-05, + "loss": 0.802, + "step": 116260 + }, + { + "epoch": 1.0278647076504182, + "grad_norm": 3.9698305130004883, + "learning_rate": 3.28689215391597e-05, + "loss": 0.6043, + "step": 116270 + }, + { + "epoch": 1.0279531109107305, + "grad_norm": 4.2516632080078125, + "learning_rate": 3.2867448151487825e-05, + "loss": 0.6398, + "step": 116280 + }, + { + "epoch": 1.0280415141710426, + "grad_norm": 5.811633110046387, + "learning_rate": 3.286597476381596e-05, + "loss": 0.635, + "step": 116290 + }, + { + "epoch": 1.028129917431355, + "grad_norm": 3.5544073581695557, + "learning_rate": 3.286450137614409e-05, + "loss": 0.6762, + "step": 116300 + }, + { + "epoch": 1.028218320691667, + "grad_norm": 8.247230529785156, + "learning_rate": 3.286302798847222e-05, + "loss": 0.7066, + "step": 116310 + }, + { + "epoch": 1.0283067239519794, + "grad_norm": 2.4690587520599365, + "learning_rate": 3.2861554600800346e-05, + "loss": 0.5071, + "step": 116320 + }, + { + "epoch": 1.0283951272122915, + "grad_norm": 1.889485239982605, + "learning_rate": 3.2860081213128474e-05, + "loss": 0.6758, + "step": 116330 + }, + { + "epoch": 1.0284835304726039, + "grad_norm": 3.0156872272491455, + "learning_rate": 3.28586078254566e-05, + "loss": 0.6799, + "step": 116340 + }, + { + "epoch": 1.028571933732916, + "grad_norm": 3.3365800380706787, + "learning_rate": 3.285713443778473e-05, + "loss": 0.6223, + "step": 116350 + }, + { + "epoch": 1.0286603369932283, + "grad_norm": 25.31796646118164, + "learning_rate": 3.2855661050112866e-05, + "loss": 0.6178, + "step": 116360 + }, + { + "epoch": 1.0287487402535405, + "grad_norm": 4.635453701019287, + "learning_rate": 3.2854187662440994e-05, + "loss": 0.6073, + "step": 116370 + }, + { + "epoch": 1.0288371435138528, + "grad_norm": 1.779373288154602, + "learning_rate": 3.285271427476912e-05, + "loss": 0.6033, + "step": 116380 + }, + { + "epoch": 1.028925546774165, + "grad_norm": 3.0340542793273926, + "learning_rate": 3.285124088709725e-05, + "loss": 0.6538, + "step": 116390 + }, + { + "epoch": 1.0290139500344773, + "grad_norm": 1.6494413614273071, + "learning_rate": 3.284976749942538e-05, + "loss": 0.649, + "step": 116400 + }, + { + "epoch": 1.0291023532947896, + "grad_norm": 9.002840995788574, + "learning_rate": 3.284829411175351e-05, + "loss": 0.6207, + "step": 116410 + }, + { + "epoch": 1.0291907565551017, + "grad_norm": 1.5056276321411133, + "learning_rate": 3.284682072408164e-05, + "loss": 0.68, + "step": 116420 + }, + { + "epoch": 1.029279159815414, + "grad_norm": 4.635558605194092, + "learning_rate": 3.284534733640977e-05, + "loss": 0.6925, + "step": 116430 + }, + { + "epoch": 1.0293675630757262, + "grad_norm": 11.118827819824219, + "learning_rate": 3.28438739487379e-05, + "loss": 0.4654, + "step": 116440 + }, + { + "epoch": 1.0294559663360385, + "grad_norm": 13.458061218261719, + "learning_rate": 3.284240056106603e-05, + "loss": 0.6874, + "step": 116450 + }, + { + "epoch": 1.0295443695963507, + "grad_norm": 12.226763725280762, + "learning_rate": 3.2840927173394156e-05, + "loss": 0.6744, + "step": 116460 + }, + { + "epoch": 1.029632772856663, + "grad_norm": 9.540175437927246, + "learning_rate": 3.2839453785722284e-05, + "loss": 0.5955, + "step": 116470 + }, + { + "epoch": 1.0297211761169751, + "grad_norm": 9.096881866455078, + "learning_rate": 3.283798039805042e-05, + "loss": 0.4888, + "step": 116480 + }, + { + "epoch": 1.0298095793772875, + "grad_norm": 4.424280166625977, + "learning_rate": 3.283650701037854e-05, + "loss": 0.4796, + "step": 116490 + }, + { + "epoch": 1.0298979826375996, + "grad_norm": 2.7132251262664795, + "learning_rate": 3.2835033622706676e-05, + "loss": 0.5185, + "step": 116500 + }, + { + "epoch": 1.029986385897912, + "grad_norm": 1.077351689338684, + "learning_rate": 3.2833560235034805e-05, + "loss": 0.5455, + "step": 116510 + }, + { + "epoch": 1.0300747891582243, + "grad_norm": 2.441951036453247, + "learning_rate": 3.283208684736293e-05, + "loss": 0.624, + "step": 116520 + }, + { + "epoch": 1.0301631924185364, + "grad_norm": 3.245570421218872, + "learning_rate": 3.283061345969106e-05, + "loss": 0.6914, + "step": 116530 + }, + { + "epoch": 1.0302515956788487, + "grad_norm": 2.5124316215515137, + "learning_rate": 3.2829140072019196e-05, + "loss": 0.6638, + "step": 116540 + }, + { + "epoch": 1.0303399989391608, + "grad_norm": 2.594841241836548, + "learning_rate": 3.282766668434732e-05, + "loss": 0.6965, + "step": 116550 + }, + { + "epoch": 1.0304284021994732, + "grad_norm": 1.13369882106781, + "learning_rate": 3.282619329667545e-05, + "loss": 0.557, + "step": 116560 + }, + { + "epoch": 1.0305168054597853, + "grad_norm": 2.8408596515655518, + "learning_rate": 3.2824719909003575e-05, + "loss": 0.6678, + "step": 116570 + }, + { + "epoch": 1.0306052087200976, + "grad_norm": 0.7319878339767456, + "learning_rate": 3.282324652133171e-05, + "loss": 0.5983, + "step": 116580 + }, + { + "epoch": 1.0306936119804098, + "grad_norm": 4.826745510101318, + "learning_rate": 3.282177313365984e-05, + "loss": 0.5915, + "step": 116590 + }, + { + "epoch": 1.030782015240722, + "grad_norm": 3.338949680328369, + "learning_rate": 3.2820299745987967e-05, + "loss": 0.666, + "step": 116600 + }, + { + "epoch": 1.0308704185010342, + "grad_norm": 12.315072059631348, + "learning_rate": 3.2818826358316095e-05, + "loss": 0.6548, + "step": 116610 + }, + { + "epoch": 1.0309588217613466, + "grad_norm": 1.5388741493225098, + "learning_rate": 3.281735297064423e-05, + "loss": 0.5622, + "step": 116620 + }, + { + "epoch": 1.0310472250216587, + "grad_norm": 1.8703491687774658, + "learning_rate": 3.281587958297235e-05, + "loss": 0.5399, + "step": 116630 + }, + { + "epoch": 1.031135628281971, + "grad_norm": 1.3267319202423096, + "learning_rate": 3.281440619530049e-05, + "loss": 0.6587, + "step": 116640 + }, + { + "epoch": 1.0312240315422834, + "grad_norm": 5.569469451904297, + "learning_rate": 3.2812932807628615e-05, + "loss": 0.5705, + "step": 116650 + }, + { + "epoch": 1.0313124348025955, + "grad_norm": 3.2137794494628906, + "learning_rate": 3.2811459419956743e-05, + "loss": 0.4815, + "step": 116660 + }, + { + "epoch": 1.0314008380629078, + "grad_norm": 2.425694227218628, + "learning_rate": 3.280998603228487e-05, + "loss": 0.5265, + "step": 116670 + }, + { + "epoch": 1.03148924132322, + "grad_norm": 3.39157772064209, + "learning_rate": 3.280851264461301e-05, + "loss": 0.7208, + "step": 116680 + }, + { + "epoch": 1.0315776445835323, + "grad_norm": 2.22771954536438, + "learning_rate": 3.280703925694113e-05, + "loss": 0.7884, + "step": 116690 + }, + { + "epoch": 1.0316660478438444, + "grad_norm": 3.204160451889038, + "learning_rate": 3.2805565869269264e-05, + "loss": 0.6452, + "step": 116700 + }, + { + "epoch": 1.0317544511041568, + "grad_norm": 4.634530544281006, + "learning_rate": 3.2804092481597385e-05, + "loss": 0.559, + "step": 116710 + }, + { + "epoch": 1.0318428543644689, + "grad_norm": 2.596842050552368, + "learning_rate": 3.280261909392552e-05, + "loss": 0.6561, + "step": 116720 + }, + { + "epoch": 1.0319312576247812, + "grad_norm": 1.9425549507141113, + "learning_rate": 3.280114570625365e-05, + "loss": 0.6915, + "step": 116730 + }, + { + "epoch": 1.0320196608850933, + "grad_norm": 2.7140884399414062, + "learning_rate": 3.279967231858178e-05, + "loss": 0.6119, + "step": 116740 + }, + { + "epoch": 1.0321080641454057, + "grad_norm": 1.9476137161254883, + "learning_rate": 3.2798198930909905e-05, + "loss": 0.7827, + "step": 116750 + }, + { + "epoch": 1.032196467405718, + "grad_norm": 11.106419563293457, + "learning_rate": 3.279672554323804e-05, + "loss": 0.5986, + "step": 116760 + }, + { + "epoch": 1.0322848706660301, + "grad_norm": 2.127530336380005, + "learning_rate": 3.279525215556616e-05, + "loss": 0.5492, + "step": 116770 + }, + { + "epoch": 1.0323732739263425, + "grad_norm": 4.33651065826416, + "learning_rate": 3.27937787678943e-05, + "loss": 0.7341, + "step": 116780 + }, + { + "epoch": 1.0324616771866546, + "grad_norm": 2.457453966140747, + "learning_rate": 3.2792305380222426e-05, + "loss": 0.4417, + "step": 116790 + }, + { + "epoch": 1.032550080446967, + "grad_norm": 3.453371524810791, + "learning_rate": 3.2790831992550554e-05, + "loss": 0.7319, + "step": 116800 + }, + { + "epoch": 1.032638483707279, + "grad_norm": 4.092473030090332, + "learning_rate": 3.278935860487868e-05, + "loss": 0.5708, + "step": 116810 + }, + { + "epoch": 1.0327268869675914, + "grad_norm": 1.7105004787445068, + "learning_rate": 3.278788521720681e-05, + "loss": 0.5989, + "step": 116820 + }, + { + "epoch": 1.0328152902279035, + "grad_norm": 1.922426462173462, + "learning_rate": 3.278641182953494e-05, + "loss": 0.6356, + "step": 116830 + }, + { + "epoch": 1.0329036934882159, + "grad_norm": 2.488189935684204, + "learning_rate": 3.2784938441863074e-05, + "loss": 0.6216, + "step": 116840 + }, + { + "epoch": 1.032992096748528, + "grad_norm": 1.9522022008895874, + "learning_rate": 3.2783465054191196e-05, + "loss": 0.6034, + "step": 116850 + }, + { + "epoch": 1.0330805000088403, + "grad_norm": 2.601702928543091, + "learning_rate": 3.278199166651933e-05, + "loss": 0.5684, + "step": 116860 + }, + { + "epoch": 1.0331689032691527, + "grad_norm": 6.249575614929199, + "learning_rate": 3.278051827884746e-05, + "loss": 0.6326, + "step": 116870 + }, + { + "epoch": 1.0332573065294648, + "grad_norm": 20.06865692138672, + "learning_rate": 3.277904489117559e-05, + "loss": 0.7556, + "step": 116880 + }, + { + "epoch": 1.0333457097897771, + "grad_norm": 12.318852424621582, + "learning_rate": 3.2777571503503716e-05, + "loss": 0.5879, + "step": 116890 + }, + { + "epoch": 1.0334341130500893, + "grad_norm": 1.2416291236877441, + "learning_rate": 3.277609811583185e-05, + "loss": 0.6813, + "step": 116900 + }, + { + "epoch": 1.0335225163104016, + "grad_norm": 1.846293330192566, + "learning_rate": 3.277462472815997e-05, + "loss": 0.6278, + "step": 116910 + }, + { + "epoch": 1.0336109195707137, + "grad_norm": 1.836117148399353, + "learning_rate": 3.277315134048811e-05, + "loss": 0.5509, + "step": 116920 + }, + { + "epoch": 1.033699322831026, + "grad_norm": 4.629387855529785, + "learning_rate": 3.277167795281623e-05, + "loss": 0.5082, + "step": 116930 + }, + { + "epoch": 1.0337877260913382, + "grad_norm": 3.5430023670196533, + "learning_rate": 3.2770204565144365e-05, + "loss": 0.5197, + "step": 116940 + }, + { + "epoch": 1.0338761293516505, + "grad_norm": 1.8582254648208618, + "learning_rate": 3.276873117747249e-05, + "loss": 0.671, + "step": 116950 + }, + { + "epoch": 1.0339645326119626, + "grad_norm": 1.6037861108779907, + "learning_rate": 3.276725778980062e-05, + "loss": 0.6472, + "step": 116960 + }, + { + "epoch": 1.034052935872275, + "grad_norm": 2.811312198638916, + "learning_rate": 3.276578440212875e-05, + "loss": 0.5694, + "step": 116970 + }, + { + "epoch": 1.0341413391325873, + "grad_norm": 3.180572271347046, + "learning_rate": 3.2764311014456885e-05, + "loss": 0.6234, + "step": 116980 + }, + { + "epoch": 1.0342297423928994, + "grad_norm": 1.1608598232269287, + "learning_rate": 3.2762837626785006e-05, + "loss": 0.6262, + "step": 116990 + }, + { + "epoch": 1.0343181456532118, + "grad_norm": 1.9698179960250854, + "learning_rate": 3.276136423911314e-05, + "loss": 0.5939, + "step": 117000 + }, + { + "epoch": 1.034406548913524, + "grad_norm": 6.817419052124023, + "learning_rate": 3.275989085144127e-05, + "loss": 0.6763, + "step": 117010 + }, + { + "epoch": 1.0344949521738362, + "grad_norm": 1.9949711561203003, + "learning_rate": 3.27584174637694e-05, + "loss": 0.5865, + "step": 117020 + }, + { + "epoch": 1.0345833554341484, + "grad_norm": 2.875236988067627, + "learning_rate": 3.2756944076097526e-05, + "loss": 0.6031, + "step": 117030 + }, + { + "epoch": 1.0346717586944607, + "grad_norm": 0.9283199310302734, + "learning_rate": 3.275547068842566e-05, + "loss": 0.6082, + "step": 117040 + }, + { + "epoch": 1.0347601619547728, + "grad_norm": 4.239526748657227, + "learning_rate": 3.275399730075378e-05, + "loss": 0.6346, + "step": 117050 + }, + { + "epoch": 1.0348485652150852, + "grad_norm": 0.9630407094955444, + "learning_rate": 3.275252391308192e-05, + "loss": 0.5952, + "step": 117060 + }, + { + "epoch": 1.0349369684753973, + "grad_norm": 1.790809988975525, + "learning_rate": 3.275105052541004e-05, + "loss": 0.6504, + "step": 117070 + }, + { + "epoch": 1.0350253717357096, + "grad_norm": 1.949729084968567, + "learning_rate": 3.2749577137738175e-05, + "loss": 0.6687, + "step": 117080 + }, + { + "epoch": 1.0351137749960218, + "grad_norm": 5.460433006286621, + "learning_rate": 3.27481037500663e-05, + "loss": 0.6907, + "step": 117090 + }, + { + "epoch": 1.035202178256334, + "grad_norm": 2.627575635910034, + "learning_rate": 3.274663036239443e-05, + "loss": 0.6784, + "step": 117100 + }, + { + "epoch": 1.0352905815166464, + "grad_norm": 11.56816577911377, + "learning_rate": 3.274515697472256e-05, + "loss": 0.7326, + "step": 117110 + }, + { + "epoch": 1.0353789847769586, + "grad_norm": 1.409606695175171, + "learning_rate": 3.2743683587050695e-05, + "loss": 0.5791, + "step": 117120 + }, + { + "epoch": 1.035467388037271, + "grad_norm": 1.8347712755203247, + "learning_rate": 3.274221019937882e-05, + "loss": 0.7242, + "step": 117130 + }, + { + "epoch": 1.035555791297583, + "grad_norm": 4.0566277503967285, + "learning_rate": 3.274073681170695e-05, + "loss": 0.7123, + "step": 117140 + }, + { + "epoch": 1.0356441945578954, + "grad_norm": 2.90889048576355, + "learning_rate": 3.273926342403508e-05, + "loss": 0.5579, + "step": 117150 + }, + { + "epoch": 1.0357325978182075, + "grad_norm": 1.5285141468048096, + "learning_rate": 3.273779003636321e-05, + "loss": 0.5843, + "step": 117160 + }, + { + "epoch": 1.0358210010785198, + "grad_norm": 2.2421305179595947, + "learning_rate": 3.273631664869134e-05, + "loss": 0.7018, + "step": 117170 + }, + { + "epoch": 1.035909404338832, + "grad_norm": 3.8225080966949463, + "learning_rate": 3.2734843261019465e-05, + "loss": 0.6318, + "step": 117180 + }, + { + "epoch": 1.0359978075991443, + "grad_norm": 8.156444549560547, + "learning_rate": 3.2733369873347594e-05, + "loss": 0.6397, + "step": 117190 + }, + { + "epoch": 1.0360862108594564, + "grad_norm": 7.463613510131836, + "learning_rate": 3.273189648567573e-05, + "loss": 0.6656, + "step": 117200 + }, + { + "epoch": 1.0361746141197687, + "grad_norm": 1.971327543258667, + "learning_rate": 3.273042309800386e-05, + "loss": 0.551, + "step": 117210 + }, + { + "epoch": 1.0362630173800809, + "grad_norm": 1.6811615228652954, + "learning_rate": 3.2728949710331986e-05, + "loss": 0.642, + "step": 117220 + }, + { + "epoch": 1.0363514206403932, + "grad_norm": 2.060220241546631, + "learning_rate": 3.2727476322660114e-05, + "loss": 0.6732, + "step": 117230 + }, + { + "epoch": 1.0364398239007055, + "grad_norm": 1.6338359117507935, + "learning_rate": 3.272600293498824e-05, + "loss": 0.6826, + "step": 117240 + }, + { + "epoch": 1.0365282271610177, + "grad_norm": 2.3002490997314453, + "learning_rate": 3.272452954731637e-05, + "loss": 0.576, + "step": 117250 + }, + { + "epoch": 1.03661663042133, + "grad_norm": 1.0325394868850708, + "learning_rate": 3.2723056159644506e-05, + "loss": 0.6372, + "step": 117260 + }, + { + "epoch": 1.0367050336816421, + "grad_norm": 1.868433952331543, + "learning_rate": 3.2721582771972634e-05, + "loss": 0.6853, + "step": 117270 + }, + { + "epoch": 1.0367934369419545, + "grad_norm": 1.6474049091339111, + "learning_rate": 3.272010938430076e-05, + "loss": 0.5646, + "step": 117280 + }, + { + "epoch": 1.0368818402022666, + "grad_norm": 2.120286703109741, + "learning_rate": 3.271863599662889e-05, + "loss": 0.6415, + "step": 117290 + }, + { + "epoch": 1.036970243462579, + "grad_norm": 3.433964729309082, + "learning_rate": 3.271716260895702e-05, + "loss": 0.6083, + "step": 117300 + }, + { + "epoch": 1.037058646722891, + "grad_norm": 3.581944704055786, + "learning_rate": 3.271568922128515e-05, + "loss": 0.5799, + "step": 117310 + }, + { + "epoch": 1.0371470499832034, + "grad_norm": 3.5881195068359375, + "learning_rate": 3.2714215833613276e-05, + "loss": 0.518, + "step": 117320 + }, + { + "epoch": 1.0372354532435155, + "grad_norm": 5.122342586517334, + "learning_rate": 3.271274244594141e-05, + "loss": 0.6346, + "step": 117330 + }, + { + "epoch": 1.0373238565038279, + "grad_norm": 5.392604351043701, + "learning_rate": 3.271126905826954e-05, + "loss": 0.6113, + "step": 117340 + }, + { + "epoch": 1.0374122597641402, + "grad_norm": 4.673681259155273, + "learning_rate": 3.270979567059767e-05, + "loss": 0.5359, + "step": 117350 + }, + { + "epoch": 1.0375006630244523, + "grad_norm": 3.5873610973358154, + "learning_rate": 3.2708322282925796e-05, + "loss": 0.6046, + "step": 117360 + }, + { + "epoch": 1.0375890662847647, + "grad_norm": 16.325695037841797, + "learning_rate": 3.2706848895253924e-05, + "loss": 0.6831, + "step": 117370 + }, + { + "epoch": 1.0376774695450768, + "grad_norm": 2.7966771125793457, + "learning_rate": 3.270537550758205e-05, + "loss": 0.667, + "step": 117380 + }, + { + "epoch": 1.0377658728053891, + "grad_norm": 1.5073250532150269, + "learning_rate": 3.270390211991019e-05, + "loss": 0.589, + "step": 117390 + }, + { + "epoch": 1.0378542760657012, + "grad_norm": 8.531837463378906, + "learning_rate": 3.270242873223831e-05, + "loss": 0.5831, + "step": 117400 + }, + { + "epoch": 1.0379426793260136, + "grad_norm": 7.171672821044922, + "learning_rate": 3.2700955344566445e-05, + "loss": 0.6154, + "step": 117410 + }, + { + "epoch": 1.0380310825863257, + "grad_norm": 10.264817237854004, + "learning_rate": 3.269948195689457e-05, + "loss": 0.7075, + "step": 117420 + }, + { + "epoch": 1.038119485846638, + "grad_norm": 4.6835713386535645, + "learning_rate": 3.26980085692227e-05, + "loss": 0.582, + "step": 117430 + }, + { + "epoch": 1.0382078891069502, + "grad_norm": 1.2939256429672241, + "learning_rate": 3.269653518155083e-05, + "loss": 0.6365, + "step": 117440 + }, + { + "epoch": 1.0382962923672625, + "grad_norm": 3.343648672103882, + "learning_rate": 3.2695061793878965e-05, + "loss": 0.7446, + "step": 117450 + }, + { + "epoch": 1.0383846956275748, + "grad_norm": 3.8489580154418945, + "learning_rate": 3.2693588406207086e-05, + "loss": 0.6648, + "step": 117460 + }, + { + "epoch": 1.038473098887887, + "grad_norm": 4.065408229827881, + "learning_rate": 3.269211501853522e-05, + "loss": 0.4903, + "step": 117470 + }, + { + "epoch": 1.0385615021481993, + "grad_norm": 11.971077919006348, + "learning_rate": 3.269064163086335e-05, + "loss": 0.7453, + "step": 117480 + }, + { + "epoch": 1.0386499054085114, + "grad_norm": 2.102881669998169, + "learning_rate": 3.268916824319148e-05, + "loss": 0.5602, + "step": 117490 + }, + { + "epoch": 1.0387383086688238, + "grad_norm": 2.289872646331787, + "learning_rate": 3.2687694855519607e-05, + "loss": 0.6237, + "step": 117500 + }, + { + "epoch": 1.038826711929136, + "grad_norm": 1.8100292682647705, + "learning_rate": 3.268622146784774e-05, + "loss": 0.6389, + "step": 117510 + }, + { + "epoch": 1.0389151151894482, + "grad_norm": 2.585407018661499, + "learning_rate": 3.268474808017586e-05, + "loss": 0.5156, + "step": 117520 + }, + { + "epoch": 1.0390035184497604, + "grad_norm": 23.623857498168945, + "learning_rate": 3.2683274692504e-05, + "loss": 0.5978, + "step": 117530 + }, + { + "epoch": 1.0390919217100727, + "grad_norm": 1.6409337520599365, + "learning_rate": 3.268180130483212e-05, + "loss": 0.6086, + "step": 117540 + }, + { + "epoch": 1.0391803249703848, + "grad_norm": 1.6514098644256592, + "learning_rate": 3.2680327917160255e-05, + "loss": 0.6996, + "step": 117550 + }, + { + "epoch": 1.0392687282306972, + "grad_norm": 4.418907642364502, + "learning_rate": 3.2678854529488383e-05, + "loss": 0.5801, + "step": 117560 + }, + { + "epoch": 1.0393571314910095, + "grad_norm": 2.738635540008545, + "learning_rate": 3.267738114181651e-05, + "loss": 0.4338, + "step": 117570 + }, + { + "epoch": 1.0394455347513216, + "grad_norm": 1.4246604442596436, + "learning_rate": 3.267590775414464e-05, + "loss": 0.6458, + "step": 117580 + }, + { + "epoch": 1.039533938011634, + "grad_norm": 1.2119765281677246, + "learning_rate": 3.2674434366472775e-05, + "loss": 0.6196, + "step": 117590 + }, + { + "epoch": 1.039622341271946, + "grad_norm": 1.0284359455108643, + "learning_rate": 3.26729609788009e-05, + "loss": 0.5413, + "step": 117600 + }, + { + "epoch": 1.0397107445322584, + "grad_norm": 4.031009197235107, + "learning_rate": 3.267148759112903e-05, + "loss": 0.7052, + "step": 117610 + }, + { + "epoch": 1.0397991477925705, + "grad_norm": 1.655850887298584, + "learning_rate": 3.267001420345716e-05, + "loss": 0.5698, + "step": 117620 + }, + { + "epoch": 1.0398875510528829, + "grad_norm": 33.166297912597656, + "learning_rate": 3.266854081578529e-05, + "loss": 0.7216, + "step": 117630 + }, + { + "epoch": 1.039975954313195, + "grad_norm": 1.8915823698043823, + "learning_rate": 3.266706742811342e-05, + "loss": 0.7542, + "step": 117640 + }, + { + "epoch": 1.0400643575735073, + "grad_norm": 1.5253355503082275, + "learning_rate": 3.2665594040441545e-05, + "loss": 0.6712, + "step": 117650 + }, + { + "epoch": 1.0401527608338195, + "grad_norm": 3.1630632877349854, + "learning_rate": 3.2664120652769674e-05, + "loss": 0.656, + "step": 117660 + }, + { + "epoch": 1.0402411640941318, + "grad_norm": 4.646976470947266, + "learning_rate": 3.266264726509781e-05, + "loss": 0.6421, + "step": 117670 + }, + { + "epoch": 1.040329567354444, + "grad_norm": 1.5237339735031128, + "learning_rate": 3.266117387742593e-05, + "loss": 0.6431, + "step": 117680 + }, + { + "epoch": 1.0404179706147563, + "grad_norm": 1.745900273323059, + "learning_rate": 3.2659700489754066e-05, + "loss": 0.4945, + "step": 117690 + }, + { + "epoch": 1.0405063738750686, + "grad_norm": 4.8196892738342285, + "learning_rate": 3.2658227102082194e-05, + "loss": 0.5877, + "step": 117700 + }, + { + "epoch": 1.0405947771353807, + "grad_norm": 3.6238794326782227, + "learning_rate": 3.265675371441032e-05, + "loss": 0.7393, + "step": 117710 + }, + { + "epoch": 1.040683180395693, + "grad_norm": 6.0012664794921875, + "learning_rate": 3.265528032673845e-05, + "loss": 0.7391, + "step": 117720 + }, + { + "epoch": 1.0407715836560052, + "grad_norm": 4.358944892883301, + "learning_rate": 3.2653806939066586e-05, + "loss": 0.6603, + "step": 117730 + }, + { + "epoch": 1.0408599869163175, + "grad_norm": 7.1416826248168945, + "learning_rate": 3.265233355139471e-05, + "loss": 0.6724, + "step": 117740 + }, + { + "epoch": 1.0409483901766297, + "grad_norm": 8.007926940917969, + "learning_rate": 3.265086016372284e-05, + "loss": 0.6326, + "step": 117750 + }, + { + "epoch": 1.041036793436942, + "grad_norm": 2.640085458755493, + "learning_rate": 3.2649386776050964e-05, + "loss": 0.529, + "step": 117760 + }, + { + "epoch": 1.0411251966972541, + "grad_norm": 2.1877005100250244, + "learning_rate": 3.26479133883791e-05, + "loss": 0.596, + "step": 117770 + }, + { + "epoch": 1.0412135999575665, + "grad_norm": 1.1778029203414917, + "learning_rate": 3.264644000070723e-05, + "loss": 0.6291, + "step": 117780 + }, + { + "epoch": 1.0413020032178786, + "grad_norm": 4.867035388946533, + "learning_rate": 3.2644966613035356e-05, + "loss": 0.6043, + "step": 117790 + }, + { + "epoch": 1.041390406478191, + "grad_norm": 3.264575719833374, + "learning_rate": 3.2643493225363484e-05, + "loss": 0.5568, + "step": 117800 + }, + { + "epoch": 1.041478809738503, + "grad_norm": 1.502008318901062, + "learning_rate": 3.264201983769162e-05, + "loss": 0.581, + "step": 117810 + }, + { + "epoch": 1.0415672129988154, + "grad_norm": 9.617237091064453, + "learning_rate": 3.264054645001974e-05, + "loss": 0.7682, + "step": 117820 + }, + { + "epoch": 1.0416556162591277, + "grad_norm": 1.3119839429855347, + "learning_rate": 3.2639073062347876e-05, + "loss": 0.6426, + "step": 117830 + }, + { + "epoch": 1.0417440195194398, + "grad_norm": 3.6332430839538574, + "learning_rate": 3.2637599674676004e-05, + "loss": 0.6073, + "step": 117840 + }, + { + "epoch": 1.0418324227797522, + "grad_norm": 8.436199188232422, + "learning_rate": 3.263612628700413e-05, + "loss": 0.5925, + "step": 117850 + }, + { + "epoch": 1.0419208260400643, + "grad_norm": 2.1173672676086426, + "learning_rate": 3.263465289933226e-05, + "loss": 0.5255, + "step": 117860 + }, + { + "epoch": 1.0420092293003766, + "grad_norm": 1.4945423603057861, + "learning_rate": 3.263317951166039e-05, + "loss": 0.6486, + "step": 117870 + }, + { + "epoch": 1.0420976325606888, + "grad_norm": 2.0368943214416504, + "learning_rate": 3.263170612398852e-05, + "loss": 0.6807, + "step": 117880 + }, + { + "epoch": 1.0421860358210011, + "grad_norm": 1.9873511791229248, + "learning_rate": 3.263023273631665e-05, + "loss": 0.5655, + "step": 117890 + }, + { + "epoch": 1.0422744390813132, + "grad_norm": 1.3288109302520752, + "learning_rate": 3.2628759348644775e-05, + "loss": 0.5924, + "step": 117900 + }, + { + "epoch": 1.0423628423416256, + "grad_norm": 1.703797459602356, + "learning_rate": 3.262728596097291e-05, + "loss": 0.6244, + "step": 117910 + }, + { + "epoch": 1.0424512456019377, + "grad_norm": 2.420076847076416, + "learning_rate": 3.262581257330104e-05, + "loss": 0.6413, + "step": 117920 + }, + { + "epoch": 1.04253964886225, + "grad_norm": 2.431633472442627, + "learning_rate": 3.2624339185629166e-05, + "loss": 0.6053, + "step": 117930 + }, + { + "epoch": 1.0426280521225624, + "grad_norm": 2.1759305000305176, + "learning_rate": 3.2622865797957295e-05, + "loss": 0.5221, + "step": 117940 + }, + { + "epoch": 1.0427164553828745, + "grad_norm": 1.887358546257019, + "learning_rate": 3.262139241028543e-05, + "loss": 0.6447, + "step": 117950 + }, + { + "epoch": 1.0428048586431868, + "grad_norm": 8.109214782714844, + "learning_rate": 3.261991902261355e-05, + "loss": 0.6483, + "step": 117960 + }, + { + "epoch": 1.042893261903499, + "grad_norm": 5.8749213218688965, + "learning_rate": 3.2618445634941687e-05, + "loss": 0.5661, + "step": 117970 + }, + { + "epoch": 1.0429816651638113, + "grad_norm": 2.1259536743164062, + "learning_rate": 3.2616972247269815e-05, + "loss": 0.5928, + "step": 117980 + }, + { + "epoch": 1.0430700684241234, + "grad_norm": 4.279666900634766, + "learning_rate": 3.261549885959794e-05, + "loss": 0.4649, + "step": 117990 + }, + { + "epoch": 1.0431584716844358, + "grad_norm": 7.866892337799072, + "learning_rate": 3.261402547192607e-05, + "loss": 0.6927, + "step": 118000 + }, + { + "epoch": 1.0432468749447479, + "grad_norm": 2.551658868789673, + "learning_rate": 3.26125520842542e-05, + "loss": 0.6887, + "step": 118010 + }, + { + "epoch": 1.0433352782050602, + "grad_norm": 1.9606891870498657, + "learning_rate": 3.261107869658233e-05, + "loss": 0.7013, + "step": 118020 + }, + { + "epoch": 1.0434236814653723, + "grad_norm": 32.96314239501953, + "learning_rate": 3.2609605308910464e-05, + "loss": 0.5399, + "step": 118030 + }, + { + "epoch": 1.0435120847256847, + "grad_norm": 5.241535186767578, + "learning_rate": 3.2608131921238585e-05, + "loss": 0.6744, + "step": 118040 + }, + { + "epoch": 1.043600487985997, + "grad_norm": 3.9287118911743164, + "learning_rate": 3.260665853356672e-05, + "loss": 0.6243, + "step": 118050 + }, + { + "epoch": 1.0436888912463091, + "grad_norm": 1.7416187524795532, + "learning_rate": 3.260518514589485e-05, + "loss": 0.4971, + "step": 118060 + }, + { + "epoch": 1.0437772945066215, + "grad_norm": 13.711721420288086, + "learning_rate": 3.260371175822298e-05, + "loss": 0.6676, + "step": 118070 + }, + { + "epoch": 1.0438656977669336, + "grad_norm": 1.176175594329834, + "learning_rate": 3.2602238370551105e-05, + "loss": 0.5755, + "step": 118080 + }, + { + "epoch": 1.043954101027246, + "grad_norm": 34.83610916137695, + "learning_rate": 3.260076498287924e-05, + "loss": 0.6186, + "step": 118090 + }, + { + "epoch": 1.044042504287558, + "grad_norm": 0.8576605319976807, + "learning_rate": 3.259929159520736e-05, + "loss": 0.5831, + "step": 118100 + }, + { + "epoch": 1.0441309075478704, + "grad_norm": 2.002018451690674, + "learning_rate": 3.25978182075355e-05, + "loss": 0.5897, + "step": 118110 + }, + { + "epoch": 1.0442193108081825, + "grad_norm": 6.9745097160339355, + "learning_rate": 3.2596344819863625e-05, + "loss": 0.5911, + "step": 118120 + }, + { + "epoch": 1.0443077140684949, + "grad_norm": 1.8782075643539429, + "learning_rate": 3.2594871432191754e-05, + "loss": 0.7535, + "step": 118130 + }, + { + "epoch": 1.044396117328807, + "grad_norm": 1.2439783811569214, + "learning_rate": 3.259339804451988e-05, + "loss": 0.6057, + "step": 118140 + }, + { + "epoch": 1.0444845205891193, + "grad_norm": 1.0344895124435425, + "learning_rate": 3.259192465684801e-05, + "loss": 0.7656, + "step": 118150 + }, + { + "epoch": 1.0445729238494317, + "grad_norm": 1.9495149850845337, + "learning_rate": 3.259045126917614e-05, + "loss": 0.5793, + "step": 118160 + }, + { + "epoch": 1.0446613271097438, + "grad_norm": 1.5701905488967896, + "learning_rate": 3.2588977881504274e-05, + "loss": 0.7641, + "step": 118170 + }, + { + "epoch": 1.0447497303700561, + "grad_norm": 1.7587635517120361, + "learning_rate": 3.25875044938324e-05, + "loss": 0.5941, + "step": 118180 + }, + { + "epoch": 1.0448381336303683, + "grad_norm": 5.947789192199707, + "learning_rate": 3.258603110616053e-05, + "loss": 0.6691, + "step": 118190 + }, + { + "epoch": 1.0449265368906806, + "grad_norm": 0.7765215635299683, + "learning_rate": 3.258455771848866e-05, + "loss": 0.6448, + "step": 118200 + }, + { + "epoch": 1.0450149401509927, + "grad_norm": 8.86922836303711, + "learning_rate": 3.258308433081679e-05, + "loss": 0.7149, + "step": 118210 + }, + { + "epoch": 1.045103343411305, + "grad_norm": 1.8575059175491333, + "learning_rate": 3.2581610943144916e-05, + "loss": 0.5772, + "step": 118220 + }, + { + "epoch": 1.0451917466716172, + "grad_norm": 2.029527425765991, + "learning_rate": 3.2580137555473044e-05, + "loss": 0.6826, + "step": 118230 + }, + { + "epoch": 1.0452801499319295, + "grad_norm": 7.09782075881958, + "learning_rate": 3.257866416780118e-05, + "loss": 0.6824, + "step": 118240 + }, + { + "epoch": 1.0453685531922416, + "grad_norm": 2.7990503311157227, + "learning_rate": 3.257719078012931e-05, + "loss": 0.6052, + "step": 118250 + }, + { + "epoch": 1.045456956452554, + "grad_norm": 1.5899012088775635, + "learning_rate": 3.2575717392457436e-05, + "loss": 0.5407, + "step": 118260 + }, + { + "epoch": 1.045545359712866, + "grad_norm": 4.548243999481201, + "learning_rate": 3.2574244004785564e-05, + "loss": 0.697, + "step": 118270 + }, + { + "epoch": 1.0456337629731784, + "grad_norm": 1.0858268737792969, + "learning_rate": 3.257277061711369e-05, + "loss": 0.578, + "step": 118280 + }, + { + "epoch": 1.0457221662334908, + "grad_norm": 1.5233114957809448, + "learning_rate": 3.257129722944182e-05, + "loss": 0.6481, + "step": 118290 + }, + { + "epoch": 1.045810569493803, + "grad_norm": 1.6378499269485474, + "learning_rate": 3.2569823841769956e-05, + "loss": 0.6644, + "step": 118300 + }, + { + "epoch": 1.0458989727541153, + "grad_norm": 1.4661781787872314, + "learning_rate": 3.2568350454098085e-05, + "loss": 0.8157, + "step": 118310 + }, + { + "epoch": 1.0459873760144274, + "grad_norm": 4.555196285247803, + "learning_rate": 3.256687706642621e-05, + "loss": 0.5554, + "step": 118320 + }, + { + "epoch": 1.0460757792747397, + "grad_norm": 1.831338882446289, + "learning_rate": 3.256540367875434e-05, + "loss": 0.5105, + "step": 118330 + }, + { + "epoch": 1.0461641825350518, + "grad_norm": 7.606119632720947, + "learning_rate": 3.256393029108247e-05, + "loss": 0.6715, + "step": 118340 + }, + { + "epoch": 1.0462525857953642, + "grad_norm": 3.658600091934204, + "learning_rate": 3.25624569034106e-05, + "loss": 0.5396, + "step": 118350 + }, + { + "epoch": 1.0463409890556763, + "grad_norm": 5.823617458343506, + "learning_rate": 3.256098351573873e-05, + "loss": 0.5467, + "step": 118360 + }, + { + "epoch": 1.0464293923159886, + "grad_norm": 1.1644865274429321, + "learning_rate": 3.2559510128066855e-05, + "loss": 0.6025, + "step": 118370 + }, + { + "epoch": 1.0465177955763008, + "grad_norm": 2.5406291484832764, + "learning_rate": 3.255803674039499e-05, + "loss": 0.5164, + "step": 118380 + }, + { + "epoch": 1.046606198836613, + "grad_norm": 1.1089091300964355, + "learning_rate": 3.255656335272312e-05, + "loss": 0.5679, + "step": 118390 + }, + { + "epoch": 1.0466946020969254, + "grad_norm": 1.1352068185806274, + "learning_rate": 3.2555089965051246e-05, + "loss": 0.6858, + "step": 118400 + }, + { + "epoch": 1.0467830053572376, + "grad_norm": 1.3964446783065796, + "learning_rate": 3.2553616577379375e-05, + "loss": 0.6755, + "step": 118410 + }, + { + "epoch": 1.04687140861755, + "grad_norm": 3.4785985946655273, + "learning_rate": 3.255214318970751e-05, + "loss": 0.6976, + "step": 118420 + }, + { + "epoch": 1.046959811877862, + "grad_norm": 2.8392281532287598, + "learning_rate": 3.255066980203563e-05, + "loss": 0.5557, + "step": 118430 + }, + { + "epoch": 1.0470482151381744, + "grad_norm": 2.3943850994110107, + "learning_rate": 3.254919641436377e-05, + "loss": 0.6604, + "step": 118440 + }, + { + "epoch": 1.0471366183984865, + "grad_norm": 0.850866973400116, + "learning_rate": 3.2547723026691895e-05, + "loss": 0.5476, + "step": 118450 + }, + { + "epoch": 1.0472250216587988, + "grad_norm": 4.029863357543945, + "learning_rate": 3.2546249639020023e-05, + "loss": 0.6471, + "step": 118460 + }, + { + "epoch": 1.047313424919111, + "grad_norm": 4.031404495239258, + "learning_rate": 3.254477625134815e-05, + "loss": 0.7169, + "step": 118470 + }, + { + "epoch": 1.0474018281794233, + "grad_norm": 26.501449584960938, + "learning_rate": 3.254330286367628e-05, + "loss": 0.5073, + "step": 118480 + }, + { + "epoch": 1.0474902314397354, + "grad_norm": 1.9022653102874756, + "learning_rate": 3.254182947600441e-05, + "loss": 0.5951, + "step": 118490 + }, + { + "epoch": 1.0475786347000478, + "grad_norm": 7.83197546005249, + "learning_rate": 3.2540356088332544e-05, + "loss": 0.6585, + "step": 118500 + }, + { + "epoch": 1.0476670379603599, + "grad_norm": 6.007931709289551, + "learning_rate": 3.2538882700660665e-05, + "loss": 0.5451, + "step": 118510 + }, + { + "epoch": 1.0477554412206722, + "grad_norm": 1.2372618913650513, + "learning_rate": 3.25374093129888e-05, + "loss": 0.603, + "step": 118520 + }, + { + "epoch": 1.0478438444809846, + "grad_norm": 5.3750529289245605, + "learning_rate": 3.253593592531693e-05, + "loss": 0.6387, + "step": 118530 + }, + { + "epoch": 1.0479322477412967, + "grad_norm": 5.821578502655029, + "learning_rate": 3.253446253764506e-05, + "loss": 0.6934, + "step": 118540 + }, + { + "epoch": 1.048020651001609, + "grad_norm": 2.2839174270629883, + "learning_rate": 3.2532989149973185e-05, + "loss": 0.6178, + "step": 118550 + }, + { + "epoch": 1.0481090542619211, + "grad_norm": 1.5846675634384155, + "learning_rate": 3.253151576230132e-05, + "loss": 0.5161, + "step": 118560 + }, + { + "epoch": 1.0481974575222335, + "grad_norm": 1.9797673225402832, + "learning_rate": 3.253004237462944e-05, + "loss": 0.646, + "step": 118570 + }, + { + "epoch": 1.0482858607825456, + "grad_norm": 2.532197952270508, + "learning_rate": 3.252856898695758e-05, + "loss": 0.6122, + "step": 118580 + }, + { + "epoch": 1.048374264042858, + "grad_norm": 2.0440354347229004, + "learning_rate": 3.25270955992857e-05, + "loss": 0.5084, + "step": 118590 + }, + { + "epoch": 1.04846266730317, + "grad_norm": 8.088627815246582, + "learning_rate": 3.2525622211613834e-05, + "loss": 0.7105, + "step": 118600 + }, + { + "epoch": 1.0485510705634824, + "grad_norm": 2.5125439167022705, + "learning_rate": 3.252414882394196e-05, + "loss": 0.6422, + "step": 118610 + }, + { + "epoch": 1.0486394738237945, + "grad_norm": 1.638609766960144, + "learning_rate": 3.252267543627009e-05, + "loss": 0.495, + "step": 118620 + }, + { + "epoch": 1.0487278770841069, + "grad_norm": 3.144679307937622, + "learning_rate": 3.252120204859822e-05, + "loss": 0.6835, + "step": 118630 + }, + { + "epoch": 1.0488162803444192, + "grad_norm": 1.8833526372909546, + "learning_rate": 3.2519728660926354e-05, + "loss": 0.6067, + "step": 118640 + }, + { + "epoch": 1.0489046836047313, + "grad_norm": 2.845440626144409, + "learning_rate": 3.2518255273254476e-05, + "loss": 0.6727, + "step": 118650 + }, + { + "epoch": 1.0489930868650437, + "grad_norm": 2.4671027660369873, + "learning_rate": 3.251678188558261e-05, + "loss": 0.6061, + "step": 118660 + }, + { + "epoch": 1.0490814901253558, + "grad_norm": 10.824947357177734, + "learning_rate": 3.251530849791074e-05, + "loss": 0.6677, + "step": 118670 + }, + { + "epoch": 1.0491698933856681, + "grad_norm": 2.396451473236084, + "learning_rate": 3.251383511023887e-05, + "loss": 0.6101, + "step": 118680 + }, + { + "epoch": 1.0492582966459802, + "grad_norm": 2.3407211303710938, + "learning_rate": 3.2512361722566996e-05, + "loss": 0.747, + "step": 118690 + }, + { + "epoch": 1.0493466999062926, + "grad_norm": 3.300262928009033, + "learning_rate": 3.2510888334895124e-05, + "loss": 0.6422, + "step": 118700 + }, + { + "epoch": 1.0494351031666047, + "grad_norm": 4.343479633331299, + "learning_rate": 3.250941494722325e-05, + "loss": 0.7417, + "step": 118710 + }, + { + "epoch": 1.049523506426917, + "grad_norm": 4.603010654449463, + "learning_rate": 3.250794155955139e-05, + "loss": 0.5915, + "step": 118720 + }, + { + "epoch": 1.0496119096872292, + "grad_norm": 6.133446216583252, + "learning_rate": 3.250646817187951e-05, + "loss": 0.6562, + "step": 118730 + }, + { + "epoch": 1.0497003129475415, + "grad_norm": 2.617239236831665, + "learning_rate": 3.2504994784207644e-05, + "loss": 0.6916, + "step": 118740 + }, + { + "epoch": 1.0497887162078539, + "grad_norm": 2.3791656494140625, + "learning_rate": 3.250352139653577e-05, + "loss": 0.4972, + "step": 118750 + }, + { + "epoch": 1.049877119468166, + "grad_norm": 1.3188098669052124, + "learning_rate": 3.25020480088639e-05, + "loss": 0.6578, + "step": 118760 + }, + { + "epoch": 1.0499655227284783, + "grad_norm": 0.976134181022644, + "learning_rate": 3.250057462119203e-05, + "loss": 0.7267, + "step": 118770 + }, + { + "epoch": 1.0500539259887904, + "grad_norm": 1.7477500438690186, + "learning_rate": 3.2499101233520165e-05, + "loss": 0.68, + "step": 118780 + }, + { + "epoch": 1.0501423292491028, + "grad_norm": 6.36392879486084, + "learning_rate": 3.2497627845848286e-05, + "loss": 0.6517, + "step": 118790 + }, + { + "epoch": 1.050230732509415, + "grad_norm": 1.692762017250061, + "learning_rate": 3.249615445817642e-05, + "loss": 0.6802, + "step": 118800 + }, + { + "epoch": 1.0503191357697272, + "grad_norm": 10.650903701782227, + "learning_rate": 3.249468107050454e-05, + "loss": 0.5884, + "step": 118810 + }, + { + "epoch": 1.0504075390300394, + "grad_norm": 1.98170006275177, + "learning_rate": 3.249320768283268e-05, + "loss": 0.6197, + "step": 118820 + }, + { + "epoch": 1.0504959422903517, + "grad_norm": 2.243741273880005, + "learning_rate": 3.2491734295160806e-05, + "loss": 0.4907, + "step": 118830 + }, + { + "epoch": 1.0505843455506638, + "grad_norm": 1.4422988891601562, + "learning_rate": 3.2490260907488935e-05, + "loss": 0.657, + "step": 118840 + }, + { + "epoch": 1.0506727488109762, + "grad_norm": 5.877725601196289, + "learning_rate": 3.248878751981706e-05, + "loss": 0.7138, + "step": 118850 + }, + { + "epoch": 1.0507611520712883, + "grad_norm": 2.1584677696228027, + "learning_rate": 3.24873141321452e-05, + "loss": 0.6772, + "step": 118860 + }, + { + "epoch": 1.0508495553316006, + "grad_norm": 3.422776699066162, + "learning_rate": 3.248584074447332e-05, + "loss": 0.6891, + "step": 118870 + }, + { + "epoch": 1.050937958591913, + "grad_norm": 1.1917446851730347, + "learning_rate": 3.2484367356801455e-05, + "loss": 0.708, + "step": 118880 + }, + { + "epoch": 1.051026361852225, + "grad_norm": 1.870484709739685, + "learning_rate": 3.248289396912958e-05, + "loss": 0.7486, + "step": 118890 + }, + { + "epoch": 1.0511147651125374, + "grad_norm": 2.3740108013153076, + "learning_rate": 3.248142058145771e-05, + "loss": 0.7176, + "step": 118900 + }, + { + "epoch": 1.0512031683728496, + "grad_norm": 1.7380127906799316, + "learning_rate": 3.247994719378584e-05, + "loss": 0.6192, + "step": 118910 + }, + { + "epoch": 1.051291571633162, + "grad_norm": 1.7060693502426147, + "learning_rate": 3.2478473806113975e-05, + "loss": 0.5793, + "step": 118920 + }, + { + "epoch": 1.051379974893474, + "grad_norm": 6.669757843017578, + "learning_rate": 3.24770004184421e-05, + "loss": 0.6941, + "step": 118930 + }, + { + "epoch": 1.0514683781537864, + "grad_norm": 1.8193747997283936, + "learning_rate": 3.247552703077023e-05, + "loss": 0.7079, + "step": 118940 + }, + { + "epoch": 1.0515567814140985, + "grad_norm": 2.433159112930298, + "learning_rate": 3.2474053643098353e-05, + "loss": 0.6772, + "step": 118950 + }, + { + "epoch": 1.0516451846744108, + "grad_norm": 2.7907907962799072, + "learning_rate": 3.247258025542649e-05, + "loss": 0.7304, + "step": 118960 + }, + { + "epoch": 1.051733587934723, + "grad_norm": 4.268134117126465, + "learning_rate": 3.247110686775462e-05, + "loss": 0.7282, + "step": 118970 + }, + { + "epoch": 1.0518219911950353, + "grad_norm": 1.9655283689498901, + "learning_rate": 3.2469633480082745e-05, + "loss": 0.5369, + "step": 118980 + }, + { + "epoch": 1.0519103944553476, + "grad_norm": 11.410774230957031, + "learning_rate": 3.2468160092410874e-05, + "loss": 0.5548, + "step": 118990 + }, + { + "epoch": 1.0519987977156597, + "grad_norm": 2.279615879058838, + "learning_rate": 3.246668670473901e-05, + "loss": 0.6723, + "step": 119000 + }, + { + "epoch": 1.052087200975972, + "grad_norm": 1.2958675622940063, + "learning_rate": 3.246521331706713e-05, + "loss": 0.5788, + "step": 119010 + }, + { + "epoch": 1.0521756042362842, + "grad_norm": 2.697662591934204, + "learning_rate": 3.2463739929395265e-05, + "loss": 0.7265, + "step": 119020 + }, + { + "epoch": 1.0522640074965965, + "grad_norm": 1.0697963237762451, + "learning_rate": 3.2462266541723394e-05, + "loss": 0.5311, + "step": 119030 + }, + { + "epoch": 1.0523524107569087, + "grad_norm": 5.604464054107666, + "learning_rate": 3.246079315405152e-05, + "loss": 0.5922, + "step": 119040 + }, + { + "epoch": 1.052440814017221, + "grad_norm": 1.7310692071914673, + "learning_rate": 3.245931976637965e-05, + "loss": 0.5441, + "step": 119050 + }, + { + "epoch": 1.0525292172775331, + "grad_norm": 6.643568992614746, + "learning_rate": 3.245784637870778e-05, + "loss": 0.5552, + "step": 119060 + }, + { + "epoch": 1.0526176205378455, + "grad_norm": 1.0637174844741821, + "learning_rate": 3.245637299103591e-05, + "loss": 0.6838, + "step": 119070 + }, + { + "epoch": 1.0527060237981576, + "grad_norm": 13.687317848205566, + "learning_rate": 3.245489960336404e-05, + "loss": 0.5748, + "step": 119080 + }, + { + "epoch": 1.05279442705847, + "grad_norm": 3.949272871017456, + "learning_rate": 3.245342621569217e-05, + "loss": 0.6729, + "step": 119090 + }, + { + "epoch": 1.052882830318782, + "grad_norm": 3.789527654647827, + "learning_rate": 3.24519528280203e-05, + "loss": 0.6706, + "step": 119100 + }, + { + "epoch": 1.0529712335790944, + "grad_norm": 11.215285301208496, + "learning_rate": 3.245047944034843e-05, + "loss": 0.7216, + "step": 119110 + }, + { + "epoch": 1.0530596368394067, + "grad_norm": 3.875169038772583, + "learning_rate": 3.2449006052676556e-05, + "loss": 0.7692, + "step": 119120 + }, + { + "epoch": 1.0531480400997189, + "grad_norm": 1.3836971521377563, + "learning_rate": 3.2447532665004684e-05, + "loss": 0.5809, + "step": 119130 + }, + { + "epoch": 1.0532364433600312, + "grad_norm": 6.706811428070068, + "learning_rate": 3.244605927733282e-05, + "loss": 0.6049, + "step": 119140 + }, + { + "epoch": 1.0533248466203433, + "grad_norm": 13.888518333435059, + "learning_rate": 3.244458588966095e-05, + "loss": 0.7284, + "step": 119150 + }, + { + "epoch": 1.0534132498806557, + "grad_norm": 2.408536911010742, + "learning_rate": 3.2443112501989076e-05, + "loss": 0.5536, + "step": 119160 + }, + { + "epoch": 1.0535016531409678, + "grad_norm": 1.611156940460205, + "learning_rate": 3.2441639114317204e-05, + "loss": 0.6307, + "step": 119170 + }, + { + "epoch": 1.0535900564012801, + "grad_norm": 4.852395057678223, + "learning_rate": 3.244016572664533e-05, + "loss": 0.612, + "step": 119180 + }, + { + "epoch": 1.0536784596615922, + "grad_norm": 2.170468807220459, + "learning_rate": 3.243869233897346e-05, + "loss": 0.6796, + "step": 119190 + }, + { + "epoch": 1.0537668629219046, + "grad_norm": 1.043799877166748, + "learning_rate": 3.243721895130159e-05, + "loss": 0.6356, + "step": 119200 + }, + { + "epoch": 1.0538552661822167, + "grad_norm": 2.0495386123657227, + "learning_rate": 3.2435745563629724e-05, + "loss": 0.5904, + "step": 119210 + }, + { + "epoch": 1.053943669442529, + "grad_norm": 1.8927379846572876, + "learning_rate": 3.243427217595785e-05, + "loss": 0.666, + "step": 119220 + }, + { + "epoch": 1.0540320727028414, + "grad_norm": 0.8466701507568359, + "learning_rate": 3.243279878828598e-05, + "loss": 0.6227, + "step": 119230 + }, + { + "epoch": 1.0541204759631535, + "grad_norm": 2.2930867671966553, + "learning_rate": 3.243132540061411e-05, + "loss": 0.5204, + "step": 119240 + }, + { + "epoch": 1.0542088792234658, + "grad_norm": 1.7285113334655762, + "learning_rate": 3.242985201294224e-05, + "loss": 0.5859, + "step": 119250 + }, + { + "epoch": 1.054297282483778, + "grad_norm": 2.9076006412506104, + "learning_rate": 3.2428378625270366e-05, + "loss": 0.5672, + "step": 119260 + }, + { + "epoch": 1.0543856857440903, + "grad_norm": 0.6604291200637817, + "learning_rate": 3.24269052375985e-05, + "loss": 0.6291, + "step": 119270 + }, + { + "epoch": 1.0544740890044024, + "grad_norm": 1.632657766342163, + "learning_rate": 3.242543184992662e-05, + "loss": 0.7196, + "step": 119280 + }, + { + "epoch": 1.0545624922647148, + "grad_norm": 1.1319701671600342, + "learning_rate": 3.242395846225476e-05, + "loss": 0.5293, + "step": 119290 + }, + { + "epoch": 1.0546508955250269, + "grad_norm": 15.328736305236816, + "learning_rate": 3.2422485074582886e-05, + "loss": 0.6006, + "step": 119300 + }, + { + "epoch": 1.0547392987853392, + "grad_norm": 3.335042953491211, + "learning_rate": 3.2421011686911015e-05, + "loss": 0.5151, + "step": 119310 + }, + { + "epoch": 1.0548277020456513, + "grad_norm": 7.990843772888184, + "learning_rate": 3.241953829923914e-05, + "loss": 0.6766, + "step": 119320 + }, + { + "epoch": 1.0549161053059637, + "grad_norm": 1.868437647819519, + "learning_rate": 3.241806491156728e-05, + "loss": 0.6583, + "step": 119330 + }, + { + "epoch": 1.055004508566276, + "grad_norm": 2.881113052368164, + "learning_rate": 3.24165915238954e-05, + "loss": 0.6838, + "step": 119340 + }, + { + "epoch": 1.0550929118265882, + "grad_norm": 1.1695398092269897, + "learning_rate": 3.2415118136223535e-05, + "loss": 0.6705, + "step": 119350 + }, + { + "epoch": 1.0551813150869005, + "grad_norm": 5.177979469299316, + "learning_rate": 3.241364474855166e-05, + "loss": 0.6029, + "step": 119360 + }, + { + "epoch": 1.0552697183472126, + "grad_norm": 1.2371217012405396, + "learning_rate": 3.241217136087979e-05, + "loss": 0.5284, + "step": 119370 + }, + { + "epoch": 1.055358121607525, + "grad_norm": 1.984904408454895, + "learning_rate": 3.241069797320792e-05, + "loss": 0.5742, + "step": 119380 + }, + { + "epoch": 1.055446524867837, + "grad_norm": 3.9826338291168213, + "learning_rate": 3.2409224585536055e-05, + "loss": 0.5881, + "step": 119390 + }, + { + "epoch": 1.0555349281281494, + "grad_norm": 3.7008163928985596, + "learning_rate": 3.240775119786418e-05, + "loss": 0.5789, + "step": 119400 + }, + { + "epoch": 1.0556233313884615, + "grad_norm": 2.044239044189453, + "learning_rate": 3.240627781019231e-05, + "loss": 0.7048, + "step": 119410 + }, + { + "epoch": 1.0557117346487739, + "grad_norm": 1.688833475112915, + "learning_rate": 3.2404804422520433e-05, + "loss": 0.5084, + "step": 119420 + }, + { + "epoch": 1.055800137909086, + "grad_norm": 3.701371192932129, + "learning_rate": 3.240333103484857e-05, + "loss": 0.728, + "step": 119430 + }, + { + "epoch": 1.0558885411693983, + "grad_norm": 3.2155213356018066, + "learning_rate": 3.24018576471767e-05, + "loss": 0.5807, + "step": 119440 + }, + { + "epoch": 1.0559769444297105, + "grad_norm": 1.2120938301086426, + "learning_rate": 3.2400384259504825e-05, + "loss": 0.6785, + "step": 119450 + }, + { + "epoch": 1.0560653476900228, + "grad_norm": 1.965158224105835, + "learning_rate": 3.2398910871832954e-05, + "loss": 0.6014, + "step": 119460 + }, + { + "epoch": 1.0561537509503351, + "grad_norm": 2.8948214054107666, + "learning_rate": 3.239743748416109e-05, + "loss": 0.7163, + "step": 119470 + }, + { + "epoch": 1.0562421542106473, + "grad_norm": 5.8671464920043945, + "learning_rate": 3.239596409648921e-05, + "loss": 0.5125, + "step": 119480 + }, + { + "epoch": 1.0563305574709596, + "grad_norm": 0.9219271540641785, + "learning_rate": 3.2394490708817345e-05, + "loss": 0.5515, + "step": 119490 + }, + { + "epoch": 1.0564189607312717, + "grad_norm": 3.2974612712860107, + "learning_rate": 3.2393017321145474e-05, + "loss": 0.6556, + "step": 119500 + }, + { + "epoch": 1.056507363991584, + "grad_norm": 11.104557991027832, + "learning_rate": 3.23915439334736e-05, + "loss": 0.5809, + "step": 119510 + }, + { + "epoch": 1.0565957672518962, + "grad_norm": 2.6774652004241943, + "learning_rate": 3.239007054580173e-05, + "loss": 0.548, + "step": 119520 + }, + { + "epoch": 1.0566841705122085, + "grad_norm": 2.163102149963379, + "learning_rate": 3.238859715812986e-05, + "loss": 0.5961, + "step": 119530 + }, + { + "epoch": 1.0567725737725207, + "grad_norm": 1.8730876445770264, + "learning_rate": 3.238712377045799e-05, + "loss": 0.5904, + "step": 119540 + }, + { + "epoch": 1.056860977032833, + "grad_norm": 6.378261089324951, + "learning_rate": 3.238565038278612e-05, + "loss": 0.6158, + "step": 119550 + }, + { + "epoch": 1.0569493802931451, + "grad_norm": 5.50390100479126, + "learning_rate": 3.2384176995114244e-05, + "loss": 0.5649, + "step": 119560 + }, + { + "epoch": 1.0570377835534575, + "grad_norm": 9.187122344970703, + "learning_rate": 3.238270360744238e-05, + "loss": 0.7454, + "step": 119570 + }, + { + "epoch": 1.0571261868137698, + "grad_norm": 1.2036949396133423, + "learning_rate": 3.238123021977051e-05, + "loss": 0.6886, + "step": 119580 + }, + { + "epoch": 1.057214590074082, + "grad_norm": 3.5661401748657227, + "learning_rate": 3.2379756832098636e-05, + "loss": 0.6194, + "step": 119590 + }, + { + "epoch": 1.0573029933343943, + "grad_norm": 3.2739648818969727, + "learning_rate": 3.2378283444426764e-05, + "loss": 0.6981, + "step": 119600 + }, + { + "epoch": 1.0573913965947064, + "grad_norm": 1.8871715068817139, + "learning_rate": 3.23768100567549e-05, + "loss": 0.6083, + "step": 119610 + }, + { + "epoch": 1.0574797998550187, + "grad_norm": 2.985586643218994, + "learning_rate": 3.237533666908302e-05, + "loss": 0.5318, + "step": 119620 + }, + { + "epoch": 1.0575682031153308, + "grad_norm": 0.9269682765007019, + "learning_rate": 3.2373863281411156e-05, + "loss": 0.6014, + "step": 119630 + }, + { + "epoch": 1.0576566063756432, + "grad_norm": 5.7977614402771, + "learning_rate": 3.237238989373928e-05, + "loss": 0.4181, + "step": 119640 + }, + { + "epoch": 1.0577450096359553, + "grad_norm": 1.8269349336624146, + "learning_rate": 3.237091650606741e-05, + "loss": 0.5756, + "step": 119650 + }, + { + "epoch": 1.0578334128962676, + "grad_norm": 1.9745653867721558, + "learning_rate": 3.236944311839554e-05, + "loss": 0.4607, + "step": 119660 + }, + { + "epoch": 1.0579218161565798, + "grad_norm": 2.573300838470459, + "learning_rate": 3.236796973072367e-05, + "loss": 0.5911, + "step": 119670 + }, + { + "epoch": 1.058010219416892, + "grad_norm": 2.9967710971832275, + "learning_rate": 3.23664963430518e-05, + "loss": 0.5213, + "step": 119680 + }, + { + "epoch": 1.0580986226772042, + "grad_norm": 4.2053351402282715, + "learning_rate": 3.236502295537993e-05, + "loss": 0.7029, + "step": 119690 + }, + { + "epoch": 1.0581870259375166, + "grad_norm": 3.492554187774658, + "learning_rate": 3.2363549567708054e-05, + "loss": 0.4927, + "step": 119700 + }, + { + "epoch": 1.058275429197829, + "grad_norm": 1.8730839490890503, + "learning_rate": 3.236207618003619e-05, + "loss": 0.6672, + "step": 119710 + }, + { + "epoch": 1.058363832458141, + "grad_norm": 3.348025321960449, + "learning_rate": 3.236060279236432e-05, + "loss": 0.6587, + "step": 119720 + }, + { + "epoch": 1.0584522357184534, + "grad_norm": 4.149188995361328, + "learning_rate": 3.2359129404692446e-05, + "loss": 0.6755, + "step": 119730 + }, + { + "epoch": 1.0585406389787655, + "grad_norm": 2.0982747077941895, + "learning_rate": 3.2357656017020575e-05, + "loss": 0.6226, + "step": 119740 + }, + { + "epoch": 1.0586290422390778, + "grad_norm": 4.101964473724365, + "learning_rate": 3.235618262934871e-05, + "loss": 0.5546, + "step": 119750 + }, + { + "epoch": 1.05871744549939, + "grad_norm": 8.209115982055664, + "learning_rate": 3.235470924167683e-05, + "loss": 0.5523, + "step": 119760 + }, + { + "epoch": 1.0588058487597023, + "grad_norm": 2.536583185195923, + "learning_rate": 3.2353235854004967e-05, + "loss": 0.7637, + "step": 119770 + }, + { + "epoch": 1.0588942520200144, + "grad_norm": 7.836760997772217, + "learning_rate": 3.235176246633309e-05, + "loss": 0.5865, + "step": 119780 + }, + { + "epoch": 1.0589826552803268, + "grad_norm": 3.317887783050537, + "learning_rate": 3.235028907866122e-05, + "loss": 0.6094, + "step": 119790 + }, + { + "epoch": 1.0590710585406389, + "grad_norm": 4.125409126281738, + "learning_rate": 3.234881569098935e-05, + "loss": 0.7318, + "step": 119800 + }, + { + "epoch": 1.0591594618009512, + "grad_norm": 2.881061553955078, + "learning_rate": 3.234734230331748e-05, + "loss": 0.6077, + "step": 119810 + }, + { + "epoch": 1.0592478650612636, + "grad_norm": 4.462584018707275, + "learning_rate": 3.234586891564561e-05, + "loss": 0.6128, + "step": 119820 + }, + { + "epoch": 1.0593362683215757, + "grad_norm": 8.47276782989502, + "learning_rate": 3.2344395527973743e-05, + "loss": 0.5648, + "step": 119830 + }, + { + "epoch": 1.059424671581888, + "grad_norm": 9.020562171936035, + "learning_rate": 3.2342922140301865e-05, + "loss": 0.711, + "step": 119840 + }, + { + "epoch": 1.0595130748422001, + "grad_norm": 1.9493681192398071, + "learning_rate": 3.234144875263e-05, + "loss": 0.6099, + "step": 119850 + }, + { + "epoch": 1.0596014781025125, + "grad_norm": 4.458160877227783, + "learning_rate": 3.233997536495813e-05, + "loss": 0.6741, + "step": 119860 + }, + { + "epoch": 1.0596898813628246, + "grad_norm": 1.1083298921585083, + "learning_rate": 3.233850197728626e-05, + "loss": 0.5078, + "step": 119870 + }, + { + "epoch": 1.059778284623137, + "grad_norm": 6.4721198081970215, + "learning_rate": 3.2337028589614385e-05, + "loss": 0.7002, + "step": 119880 + }, + { + "epoch": 1.059866687883449, + "grad_norm": 1.5790786743164062, + "learning_rate": 3.2335555201942514e-05, + "loss": 0.6224, + "step": 119890 + }, + { + "epoch": 1.0599550911437614, + "grad_norm": 6.935812950134277, + "learning_rate": 3.233408181427064e-05, + "loss": 0.7024, + "step": 119900 + }, + { + "epoch": 1.0600434944040735, + "grad_norm": 4.4745774269104, + "learning_rate": 3.233260842659878e-05, + "loss": 0.6105, + "step": 119910 + }, + { + "epoch": 1.0601318976643859, + "grad_norm": 3.367515802383423, + "learning_rate": 3.23311350389269e-05, + "loss": 0.6949, + "step": 119920 + }, + { + "epoch": 1.0602203009246982, + "grad_norm": 4.029800891876221, + "learning_rate": 3.2329661651255034e-05, + "loss": 0.7179, + "step": 119930 + }, + { + "epoch": 1.0603087041850103, + "grad_norm": 0.9912646412849426, + "learning_rate": 3.232818826358316e-05, + "loss": 0.5529, + "step": 119940 + }, + { + "epoch": 1.0603971074453227, + "grad_norm": 1.3798308372497559, + "learning_rate": 3.232671487591129e-05, + "loss": 0.5124, + "step": 119950 + }, + { + "epoch": 1.0604855107056348, + "grad_norm": 4.9029107093811035, + "learning_rate": 3.232524148823942e-05, + "loss": 0.6106, + "step": 119960 + }, + { + "epoch": 1.0605739139659471, + "grad_norm": 4.14245080947876, + "learning_rate": 3.2323768100567554e-05, + "loss": 0.7063, + "step": 119970 + }, + { + "epoch": 1.0606623172262593, + "grad_norm": 5.5595383644104, + "learning_rate": 3.2322294712895675e-05, + "loss": 0.5531, + "step": 119980 + }, + { + "epoch": 1.0607507204865716, + "grad_norm": 20.607101440429688, + "learning_rate": 3.232082132522381e-05, + "loss": 0.5401, + "step": 119990 + }, + { + "epoch": 1.0608391237468837, + "grad_norm": 2.56923770904541, + "learning_rate": 3.231934793755194e-05, + "loss": 0.6642, + "step": 120000 + }, + { + "epoch": 1.060927527007196, + "grad_norm": 5.361979961395264, + "learning_rate": 3.231787454988007e-05, + "loss": 0.681, + "step": 120010 + }, + { + "epoch": 1.0610159302675082, + "grad_norm": 1.8949682712554932, + "learning_rate": 3.2316401162208196e-05, + "loss": 0.6631, + "step": 120020 + }, + { + "epoch": 1.0611043335278205, + "grad_norm": 3.042494297027588, + "learning_rate": 3.2314927774536324e-05, + "loss": 0.6607, + "step": 120030 + }, + { + "epoch": 1.0611927367881326, + "grad_norm": 4.636878967285156, + "learning_rate": 3.231345438686445e-05, + "loss": 0.5422, + "step": 120040 + }, + { + "epoch": 1.061281140048445, + "grad_norm": 1.388635516166687, + "learning_rate": 3.231198099919259e-05, + "loss": 0.6404, + "step": 120050 + }, + { + "epoch": 1.0613695433087573, + "grad_norm": 1.4575765132904053, + "learning_rate": 3.2310507611520716e-05, + "loss": 0.6218, + "step": 120060 + }, + { + "epoch": 1.0614579465690694, + "grad_norm": 1.6528077125549316, + "learning_rate": 3.2309034223848844e-05, + "loss": 0.6597, + "step": 120070 + }, + { + "epoch": 1.0615463498293818, + "grad_norm": 3.4966204166412354, + "learning_rate": 3.230756083617697e-05, + "loss": 0.6115, + "step": 120080 + }, + { + "epoch": 1.061634753089694, + "grad_norm": 5.340075492858887, + "learning_rate": 3.23060874485051e-05, + "loss": 0.6453, + "step": 120090 + }, + { + "epoch": 1.0617231563500062, + "grad_norm": 2.160996198654175, + "learning_rate": 3.230461406083323e-05, + "loss": 0.5929, + "step": 120100 + }, + { + "epoch": 1.0618115596103184, + "grad_norm": 5.821319103240967, + "learning_rate": 3.230314067316136e-05, + "loss": 0.5311, + "step": 120110 + }, + { + "epoch": 1.0618999628706307, + "grad_norm": 1.443834662437439, + "learning_rate": 3.230166728548949e-05, + "loss": 0.6031, + "step": 120120 + }, + { + "epoch": 1.0619883661309428, + "grad_norm": 7.692910194396973, + "learning_rate": 3.230019389781762e-05, + "loss": 0.6992, + "step": 120130 + }, + { + "epoch": 1.0620767693912552, + "grad_norm": 1.5878018140792847, + "learning_rate": 3.229872051014575e-05, + "loss": 0.6119, + "step": 120140 + }, + { + "epoch": 1.0621651726515673, + "grad_norm": 23.90019416809082, + "learning_rate": 3.229724712247388e-05, + "loss": 0.5547, + "step": 120150 + }, + { + "epoch": 1.0622535759118796, + "grad_norm": 1.5449013710021973, + "learning_rate": 3.2295773734802006e-05, + "loss": 0.6846, + "step": 120160 + }, + { + "epoch": 1.062341979172192, + "grad_norm": 3.1172685623168945, + "learning_rate": 3.2294300347130135e-05, + "loss": 0.7436, + "step": 120170 + }, + { + "epoch": 1.062430382432504, + "grad_norm": 2.0229194164276123, + "learning_rate": 3.229282695945827e-05, + "loss": 0.5896, + "step": 120180 + }, + { + "epoch": 1.0625187856928164, + "grad_norm": 4.894861221313477, + "learning_rate": 3.22913535717864e-05, + "loss": 0.7207, + "step": 120190 + }, + { + "epoch": 1.0626071889531286, + "grad_norm": 2.4821605682373047, + "learning_rate": 3.2289880184114526e-05, + "loss": 0.6091, + "step": 120200 + }, + { + "epoch": 1.062695592213441, + "grad_norm": 3.251113176345825, + "learning_rate": 3.2288406796442655e-05, + "loss": 0.5585, + "step": 120210 + }, + { + "epoch": 1.062783995473753, + "grad_norm": 32.71751403808594, + "learning_rate": 3.228693340877078e-05, + "loss": 0.5548, + "step": 120220 + }, + { + "epoch": 1.0628723987340654, + "grad_norm": 1.7661948204040527, + "learning_rate": 3.228546002109891e-05, + "loss": 0.6298, + "step": 120230 + }, + { + "epoch": 1.0629608019943775, + "grad_norm": 4.406103610992432, + "learning_rate": 3.2283986633427047e-05, + "loss": 0.7948, + "step": 120240 + }, + { + "epoch": 1.0630492052546898, + "grad_norm": 5.395301818847656, + "learning_rate": 3.228251324575517e-05, + "loss": 0.6325, + "step": 120250 + }, + { + "epoch": 1.063137608515002, + "grad_norm": 5.040433883666992, + "learning_rate": 3.22810398580833e-05, + "loss": 0.6932, + "step": 120260 + }, + { + "epoch": 1.0632260117753143, + "grad_norm": 6.978147029876709, + "learning_rate": 3.227956647041143e-05, + "loss": 0.5559, + "step": 120270 + }, + { + "epoch": 1.0633144150356264, + "grad_norm": 2.692667245864868, + "learning_rate": 3.227809308273956e-05, + "loss": 0.6051, + "step": 120280 + }, + { + "epoch": 1.0634028182959387, + "grad_norm": 2.461068630218506, + "learning_rate": 3.227661969506769e-05, + "loss": 0.6555, + "step": 120290 + }, + { + "epoch": 1.063491221556251, + "grad_norm": 1.7914044857025146, + "learning_rate": 3.2275146307395823e-05, + "loss": 0.5954, + "step": 120300 + }, + { + "epoch": 1.0635796248165632, + "grad_norm": 5.35980749130249, + "learning_rate": 3.2273672919723945e-05, + "loss": 0.5973, + "step": 120310 + }, + { + "epoch": 1.0636680280768755, + "grad_norm": 0.904769241809845, + "learning_rate": 3.227219953205208e-05, + "loss": 0.6075, + "step": 120320 + }, + { + "epoch": 1.0637564313371877, + "grad_norm": 3.791430950164795, + "learning_rate": 3.227072614438021e-05, + "loss": 0.43, + "step": 120330 + }, + { + "epoch": 1.0638448345975, + "grad_norm": 2.046313762664795, + "learning_rate": 3.226925275670834e-05, + "loss": 0.6327, + "step": 120340 + }, + { + "epoch": 1.0639332378578121, + "grad_norm": 1.3397361040115356, + "learning_rate": 3.2267779369036465e-05, + "loss": 0.7551, + "step": 120350 + }, + { + "epoch": 1.0640216411181245, + "grad_norm": 2.307652711868286, + "learning_rate": 3.2266305981364594e-05, + "loss": 0.6605, + "step": 120360 + }, + { + "epoch": 1.0641100443784366, + "grad_norm": 1.9020657539367676, + "learning_rate": 3.226483259369272e-05, + "loss": 0.6767, + "step": 120370 + }, + { + "epoch": 1.064198447638749, + "grad_norm": 4.0863776206970215, + "learning_rate": 3.226335920602086e-05, + "loss": 0.6498, + "step": 120380 + }, + { + "epoch": 1.064286850899061, + "grad_norm": 2.576895236968994, + "learning_rate": 3.226188581834898e-05, + "loss": 0.5399, + "step": 120390 + }, + { + "epoch": 1.0643752541593734, + "grad_norm": 5.870359897613525, + "learning_rate": 3.2260412430677114e-05, + "loss": 0.667, + "step": 120400 + }, + { + "epoch": 1.0644636574196857, + "grad_norm": 2.762570381164551, + "learning_rate": 3.225893904300524e-05, + "loss": 0.6036, + "step": 120410 + }, + { + "epoch": 1.0645520606799979, + "grad_norm": 4.400736331939697, + "learning_rate": 3.225746565533337e-05, + "loss": 0.7545, + "step": 120420 + }, + { + "epoch": 1.0646404639403102, + "grad_norm": 1.3446694612503052, + "learning_rate": 3.22559922676615e-05, + "loss": 0.6572, + "step": 120430 + }, + { + "epoch": 1.0647288672006223, + "grad_norm": 1.4495806694030762, + "learning_rate": 3.2254518879989634e-05, + "loss": 0.6116, + "step": 120440 + }, + { + "epoch": 1.0648172704609347, + "grad_norm": 5.972079753875732, + "learning_rate": 3.2253045492317756e-05, + "loss": 0.565, + "step": 120450 + }, + { + "epoch": 1.0649056737212468, + "grad_norm": 2.1598987579345703, + "learning_rate": 3.225157210464589e-05, + "loss": 0.766, + "step": 120460 + }, + { + "epoch": 1.0649940769815591, + "grad_norm": 1.5302361249923706, + "learning_rate": 3.225009871697401e-05, + "loss": 0.5278, + "step": 120470 + }, + { + "epoch": 1.0650824802418712, + "grad_norm": 3.2372963428497314, + "learning_rate": 3.224862532930215e-05, + "loss": 0.5398, + "step": 120480 + }, + { + "epoch": 1.0651708835021836, + "grad_norm": 7.0759100914001465, + "learning_rate": 3.2247151941630276e-05, + "loss": 0.5889, + "step": 120490 + }, + { + "epoch": 1.0652592867624957, + "grad_norm": 1.8381094932556152, + "learning_rate": 3.2245678553958404e-05, + "loss": 0.5886, + "step": 120500 + }, + { + "epoch": 1.065347690022808, + "grad_norm": 1.341376543045044, + "learning_rate": 3.224420516628653e-05, + "loss": 0.6705, + "step": 120510 + }, + { + "epoch": 1.0654360932831204, + "grad_norm": 12.450121879577637, + "learning_rate": 3.224273177861467e-05, + "loss": 0.708, + "step": 120520 + }, + { + "epoch": 1.0655244965434325, + "grad_norm": 4.1199212074279785, + "learning_rate": 3.224125839094279e-05, + "loss": 0.6721, + "step": 120530 + }, + { + "epoch": 1.0656128998037449, + "grad_norm": 4.740642070770264, + "learning_rate": 3.2239785003270924e-05, + "loss": 0.5963, + "step": 120540 + }, + { + "epoch": 1.065701303064057, + "grad_norm": 5.144943714141846, + "learning_rate": 3.223831161559905e-05, + "loss": 0.662, + "step": 120550 + }, + { + "epoch": 1.0657897063243693, + "grad_norm": 1.3962523937225342, + "learning_rate": 3.223683822792718e-05, + "loss": 0.6482, + "step": 120560 + }, + { + "epoch": 1.0658781095846814, + "grad_norm": 2.8582942485809326, + "learning_rate": 3.223536484025531e-05, + "loss": 0.4729, + "step": 120570 + }, + { + "epoch": 1.0659665128449938, + "grad_norm": 4.132655620574951, + "learning_rate": 3.223389145258344e-05, + "loss": 0.5207, + "step": 120580 + }, + { + "epoch": 1.066054916105306, + "grad_norm": 1.2919360399246216, + "learning_rate": 3.2232418064911566e-05, + "loss": 0.5604, + "step": 120590 + }, + { + "epoch": 1.0661433193656182, + "grad_norm": 2.8105602264404297, + "learning_rate": 3.22309446772397e-05, + "loss": 0.4947, + "step": 120600 + }, + { + "epoch": 1.0662317226259304, + "grad_norm": 7.603043079376221, + "learning_rate": 3.222947128956782e-05, + "loss": 0.6662, + "step": 120610 + }, + { + "epoch": 1.0663201258862427, + "grad_norm": 10.433573722839355, + "learning_rate": 3.222799790189596e-05, + "loss": 0.5635, + "step": 120620 + }, + { + "epoch": 1.066408529146555, + "grad_norm": 2.4541211128234863, + "learning_rate": 3.2226524514224086e-05, + "loss": 0.7041, + "step": 120630 + }, + { + "epoch": 1.0664969324068672, + "grad_norm": 1.8980493545532227, + "learning_rate": 3.2225051126552215e-05, + "loss": 0.6814, + "step": 120640 + }, + { + "epoch": 1.0665853356671795, + "grad_norm": 1.6585372686386108, + "learning_rate": 3.222357773888034e-05, + "loss": 0.6179, + "step": 120650 + }, + { + "epoch": 1.0666737389274916, + "grad_norm": 17.13709831237793, + "learning_rate": 3.222210435120848e-05, + "loss": 0.5893, + "step": 120660 + }, + { + "epoch": 1.066762142187804, + "grad_norm": 2.4513256549835205, + "learning_rate": 3.22206309635366e-05, + "loss": 0.5877, + "step": 120670 + }, + { + "epoch": 1.066850545448116, + "grad_norm": 9.410117149353027, + "learning_rate": 3.2219157575864735e-05, + "loss": 0.6185, + "step": 120680 + }, + { + "epoch": 1.0669389487084284, + "grad_norm": 2.3887970447540283, + "learning_rate": 3.221768418819286e-05, + "loss": 0.5749, + "step": 120690 + }, + { + "epoch": 1.0670273519687405, + "grad_norm": 5.723911762237549, + "learning_rate": 3.221621080052099e-05, + "loss": 0.6326, + "step": 120700 + }, + { + "epoch": 1.0671157552290529, + "grad_norm": 1.5306992530822754, + "learning_rate": 3.221473741284912e-05, + "loss": 0.6017, + "step": 120710 + }, + { + "epoch": 1.067204158489365, + "grad_norm": 2.6558194160461426, + "learning_rate": 3.221326402517725e-05, + "loss": 0.6607, + "step": 120720 + }, + { + "epoch": 1.0672925617496773, + "grad_norm": 2.473825693130493, + "learning_rate": 3.2211790637505377e-05, + "loss": 0.7037, + "step": 120730 + }, + { + "epoch": 1.0673809650099895, + "grad_norm": 12.26204776763916, + "learning_rate": 3.221031724983351e-05, + "loss": 0.6037, + "step": 120740 + }, + { + "epoch": 1.0674693682703018, + "grad_norm": 5.587930679321289, + "learning_rate": 3.220884386216163e-05, + "loss": 0.5959, + "step": 120750 + }, + { + "epoch": 1.0675577715306142, + "grad_norm": 3.8034064769744873, + "learning_rate": 3.220737047448977e-05, + "loss": 0.6826, + "step": 120760 + }, + { + "epoch": 1.0676461747909263, + "grad_norm": 3.043522596359253, + "learning_rate": 3.22058970868179e-05, + "loss": 0.5302, + "step": 120770 + }, + { + "epoch": 1.0677345780512386, + "grad_norm": 6.027188777923584, + "learning_rate": 3.2204423699146025e-05, + "loss": 0.6712, + "step": 120780 + }, + { + "epoch": 1.0678229813115507, + "grad_norm": 1.6273082494735718, + "learning_rate": 3.2202950311474153e-05, + "loss": 0.704, + "step": 120790 + }, + { + "epoch": 1.067911384571863, + "grad_norm": 2.3920822143554688, + "learning_rate": 3.220147692380229e-05, + "loss": 0.622, + "step": 120800 + }, + { + "epoch": 1.0679997878321752, + "grad_norm": 0.9818435311317444, + "learning_rate": 3.220000353613041e-05, + "loss": 0.6715, + "step": 120810 + }, + { + "epoch": 1.0680881910924875, + "grad_norm": 1.0270146131515503, + "learning_rate": 3.2198530148458545e-05, + "loss": 0.6102, + "step": 120820 + }, + { + "epoch": 1.0681765943527997, + "grad_norm": 2.4802002906799316, + "learning_rate": 3.219705676078667e-05, + "loss": 0.6699, + "step": 120830 + }, + { + "epoch": 1.068264997613112, + "grad_norm": 3.1002514362335205, + "learning_rate": 3.21955833731148e-05, + "loss": 0.5078, + "step": 120840 + }, + { + "epoch": 1.0683534008734241, + "grad_norm": 3.872596502304077, + "learning_rate": 3.219410998544293e-05, + "loss": 0.6268, + "step": 120850 + }, + { + "epoch": 1.0684418041337365, + "grad_norm": 3.8102264404296875, + "learning_rate": 3.219263659777106e-05, + "loss": 0.4891, + "step": 120860 + }, + { + "epoch": 1.0685302073940486, + "grad_norm": 6.440011501312256, + "learning_rate": 3.219116321009919e-05, + "loss": 0.5027, + "step": 120870 + }, + { + "epoch": 1.068618610654361, + "grad_norm": 5.2556986808776855, + "learning_rate": 3.218968982242732e-05, + "loss": 0.6085, + "step": 120880 + }, + { + "epoch": 1.0687070139146733, + "grad_norm": 7.90283203125, + "learning_rate": 3.2188216434755444e-05, + "loss": 0.6949, + "step": 120890 + }, + { + "epoch": 1.0687954171749854, + "grad_norm": 3.3118958473205566, + "learning_rate": 3.218674304708358e-05, + "loss": 0.5001, + "step": 120900 + }, + { + "epoch": 1.0688838204352977, + "grad_norm": 5.0742597579956055, + "learning_rate": 3.218526965941171e-05, + "loss": 0.7215, + "step": 120910 + }, + { + "epoch": 1.0689722236956098, + "grad_norm": 1.7520378828048706, + "learning_rate": 3.2183796271739836e-05, + "loss": 0.7059, + "step": 120920 + }, + { + "epoch": 1.0690606269559222, + "grad_norm": 3.5206902027130127, + "learning_rate": 3.2182322884067964e-05, + "loss": 0.5361, + "step": 120930 + }, + { + "epoch": 1.0691490302162343, + "grad_norm": 2.8200526237487793, + "learning_rate": 3.218084949639609e-05, + "loss": 0.5159, + "step": 120940 + }, + { + "epoch": 1.0692374334765467, + "grad_norm": 1.680497169494629, + "learning_rate": 3.217937610872422e-05, + "loss": 0.5674, + "step": 120950 + }, + { + "epoch": 1.0693258367368588, + "grad_norm": 5.961465358734131, + "learning_rate": 3.2177902721052356e-05, + "loss": 0.6492, + "step": 120960 + }, + { + "epoch": 1.0694142399971711, + "grad_norm": 7.21779203414917, + "learning_rate": 3.2176429333380484e-05, + "loss": 0.6107, + "step": 120970 + }, + { + "epoch": 1.0695026432574832, + "grad_norm": 2.5119082927703857, + "learning_rate": 3.217495594570861e-05, + "loss": 0.5784, + "step": 120980 + }, + { + "epoch": 1.0695910465177956, + "grad_norm": 3.025531053543091, + "learning_rate": 3.217348255803674e-05, + "loss": 0.7293, + "step": 120990 + }, + { + "epoch": 1.069679449778108, + "grad_norm": 2.1070396900177, + "learning_rate": 3.217200917036487e-05, + "loss": 0.4889, + "step": 121000 + }, + { + "epoch": 1.06976785303842, + "grad_norm": 2.1777422428131104, + "learning_rate": 3.2170535782693e-05, + "loss": 0.6259, + "step": 121010 + }, + { + "epoch": 1.0698562562987324, + "grad_norm": 1.2348896265029907, + "learning_rate": 3.216906239502113e-05, + "loss": 0.6916, + "step": 121020 + }, + { + "epoch": 1.0699446595590445, + "grad_norm": 3.8984742164611816, + "learning_rate": 3.216758900734926e-05, + "loss": 0.7222, + "step": 121030 + }, + { + "epoch": 1.0700330628193568, + "grad_norm": 2.697364091873169, + "learning_rate": 3.216611561967739e-05, + "loss": 0.6438, + "step": 121040 + }, + { + "epoch": 1.070121466079669, + "grad_norm": 0.9914762377738953, + "learning_rate": 3.216464223200552e-05, + "loss": 0.5099, + "step": 121050 + }, + { + "epoch": 1.0702098693399813, + "grad_norm": 16.497310638427734, + "learning_rate": 3.2163168844333646e-05, + "loss": 0.5632, + "step": 121060 + }, + { + "epoch": 1.0702982726002934, + "grad_norm": 1.8060559034347534, + "learning_rate": 3.2161695456661774e-05, + "loss": 0.7179, + "step": 121070 + }, + { + "epoch": 1.0703866758606058, + "grad_norm": 20.95323944091797, + "learning_rate": 3.21602220689899e-05, + "loss": 0.6801, + "step": 121080 + }, + { + "epoch": 1.0704750791209179, + "grad_norm": 2.695711851119995, + "learning_rate": 3.215874868131804e-05, + "loss": 0.5828, + "step": 121090 + }, + { + "epoch": 1.0705634823812302, + "grad_norm": 3.5927395820617676, + "learning_rate": 3.2157275293646166e-05, + "loss": 0.5339, + "step": 121100 + }, + { + "epoch": 1.0706518856415426, + "grad_norm": 6.041247844696045, + "learning_rate": 3.2155801905974295e-05, + "loss": 0.764, + "step": 121110 + }, + { + "epoch": 1.0707402889018547, + "grad_norm": 1.805579662322998, + "learning_rate": 3.215432851830242e-05, + "loss": 0.5147, + "step": 121120 + }, + { + "epoch": 1.070828692162167, + "grad_norm": 1.332554817199707, + "learning_rate": 3.215285513063055e-05, + "loss": 0.6448, + "step": 121130 + }, + { + "epoch": 1.0709170954224791, + "grad_norm": 2.044205665588379, + "learning_rate": 3.215138174295868e-05, + "loss": 0.5545, + "step": 121140 + }, + { + "epoch": 1.0710054986827915, + "grad_norm": 21.489187240600586, + "learning_rate": 3.2149908355286815e-05, + "loss": 0.6201, + "step": 121150 + }, + { + "epoch": 1.0710939019431036, + "grad_norm": 11.531546592712402, + "learning_rate": 3.214843496761494e-05, + "loss": 0.6901, + "step": 121160 + }, + { + "epoch": 1.071182305203416, + "grad_norm": 3.9804844856262207, + "learning_rate": 3.214696157994307e-05, + "loss": 0.7121, + "step": 121170 + }, + { + "epoch": 1.071270708463728, + "grad_norm": 2.379244565963745, + "learning_rate": 3.21454881922712e-05, + "loss": 0.5671, + "step": 121180 + }, + { + "epoch": 1.0713591117240404, + "grad_norm": 4.033192157745361, + "learning_rate": 3.214401480459933e-05, + "loss": 0.5707, + "step": 121190 + }, + { + "epoch": 1.0714475149843525, + "grad_norm": 2.3262979984283447, + "learning_rate": 3.214254141692746e-05, + "loss": 0.5273, + "step": 121200 + }, + { + "epoch": 1.0715359182446649, + "grad_norm": 1.0184017419815063, + "learning_rate": 3.214106802925559e-05, + "loss": 0.6661, + "step": 121210 + }, + { + "epoch": 1.0716243215049772, + "grad_norm": 1.3592736721038818, + "learning_rate": 3.213959464158371e-05, + "loss": 0.5844, + "step": 121220 + }, + { + "epoch": 1.0717127247652893, + "grad_norm": 5.208528518676758, + "learning_rate": 3.213812125391185e-05, + "loss": 0.6802, + "step": 121230 + }, + { + "epoch": 1.0718011280256017, + "grad_norm": 1.4978290796279907, + "learning_rate": 3.213664786623998e-05, + "loss": 0.5807, + "step": 121240 + }, + { + "epoch": 1.0718895312859138, + "grad_norm": 1.2481811046600342, + "learning_rate": 3.2135174478568105e-05, + "loss": 0.4375, + "step": 121250 + }, + { + "epoch": 1.0719779345462261, + "grad_norm": 2.0746304988861084, + "learning_rate": 3.2133701090896234e-05, + "loss": 0.595, + "step": 121260 + }, + { + "epoch": 1.0720663378065383, + "grad_norm": 1.6982258558273315, + "learning_rate": 3.213222770322437e-05, + "loss": 0.7717, + "step": 121270 + }, + { + "epoch": 1.0721547410668506, + "grad_norm": 5.13646936416626, + "learning_rate": 3.213075431555249e-05, + "loss": 0.6305, + "step": 121280 + }, + { + "epoch": 1.0722431443271627, + "grad_norm": 10.087930679321289, + "learning_rate": 3.2129280927880625e-05, + "loss": 0.5816, + "step": 121290 + }, + { + "epoch": 1.072331547587475, + "grad_norm": 5.895126819610596, + "learning_rate": 3.212780754020875e-05, + "loss": 0.5815, + "step": 121300 + }, + { + "epoch": 1.0724199508477872, + "grad_norm": 4.897123336791992, + "learning_rate": 3.212633415253688e-05, + "loss": 0.7048, + "step": 121310 + }, + { + "epoch": 1.0725083541080995, + "grad_norm": 1.7410991191864014, + "learning_rate": 3.212486076486501e-05, + "loss": 0.5557, + "step": 121320 + }, + { + "epoch": 1.0725967573684116, + "grad_norm": 13.142781257629395, + "learning_rate": 3.212338737719314e-05, + "loss": 0.3966, + "step": 121330 + }, + { + "epoch": 1.072685160628724, + "grad_norm": 9.509994506835938, + "learning_rate": 3.212191398952127e-05, + "loss": 0.5738, + "step": 121340 + }, + { + "epoch": 1.0727735638890363, + "grad_norm": 1.4094538688659668, + "learning_rate": 3.21204406018494e-05, + "loss": 0.5842, + "step": 121350 + }, + { + "epoch": 1.0728619671493484, + "grad_norm": 1.9394299983978271, + "learning_rate": 3.2118967214177524e-05, + "loss": 0.4725, + "step": 121360 + }, + { + "epoch": 1.0729503704096608, + "grad_norm": 4.592030048370361, + "learning_rate": 3.211749382650566e-05, + "loss": 0.5502, + "step": 121370 + }, + { + "epoch": 1.073038773669973, + "grad_norm": 3.1156632900238037, + "learning_rate": 3.211602043883379e-05, + "loss": 0.6149, + "step": 121380 + }, + { + "epoch": 1.0731271769302853, + "grad_norm": 5.135519981384277, + "learning_rate": 3.2114547051161916e-05, + "loss": 0.756, + "step": 121390 + }, + { + "epoch": 1.0732155801905974, + "grad_norm": 2.4875688552856445, + "learning_rate": 3.2113073663490044e-05, + "loss": 0.6142, + "step": 121400 + }, + { + "epoch": 1.0733039834509097, + "grad_norm": 5.848193645477295, + "learning_rate": 3.211160027581817e-05, + "loss": 0.6173, + "step": 121410 + }, + { + "epoch": 1.0733923867112218, + "grad_norm": 4.377786636352539, + "learning_rate": 3.21101268881463e-05, + "loss": 0.6957, + "step": 121420 + }, + { + "epoch": 1.0734807899715342, + "grad_norm": 3.9862008094787598, + "learning_rate": 3.2108653500474436e-05, + "loss": 0.6682, + "step": 121430 + }, + { + "epoch": 1.0735691932318463, + "grad_norm": 3.1569674015045166, + "learning_rate": 3.210718011280256e-05, + "loss": 0.4815, + "step": 121440 + }, + { + "epoch": 1.0736575964921586, + "grad_norm": 2.537647008895874, + "learning_rate": 3.210570672513069e-05, + "loss": 0.6763, + "step": 121450 + }, + { + "epoch": 1.0737459997524708, + "grad_norm": 11.374505043029785, + "learning_rate": 3.210423333745882e-05, + "loss": 0.7222, + "step": 121460 + }, + { + "epoch": 1.073834403012783, + "grad_norm": 5.498156547546387, + "learning_rate": 3.210275994978695e-05, + "loss": 0.6443, + "step": 121470 + }, + { + "epoch": 1.0739228062730954, + "grad_norm": 2.8872296810150146, + "learning_rate": 3.210128656211508e-05, + "loss": 0.6018, + "step": 121480 + }, + { + "epoch": 1.0740112095334076, + "grad_norm": 4.210776329040527, + "learning_rate": 3.209981317444321e-05, + "loss": 0.6167, + "step": 121490 + }, + { + "epoch": 1.07409961279372, + "grad_norm": 7.615468502044678, + "learning_rate": 3.2098339786771334e-05, + "loss": 0.5823, + "step": 121500 + }, + { + "epoch": 1.074188016054032, + "grad_norm": 2.6012418270111084, + "learning_rate": 3.209686639909947e-05, + "loss": 0.6672, + "step": 121510 + }, + { + "epoch": 1.0742764193143444, + "grad_norm": 5.401417255401611, + "learning_rate": 3.209539301142759e-05, + "loss": 0.564, + "step": 121520 + }, + { + "epoch": 1.0743648225746565, + "grad_norm": 1.6750456094741821, + "learning_rate": 3.2093919623755726e-05, + "loss": 0.5829, + "step": 121530 + }, + { + "epoch": 1.0744532258349688, + "grad_norm": 4.977668762207031, + "learning_rate": 3.2092446236083855e-05, + "loss": 0.7314, + "step": 121540 + }, + { + "epoch": 1.074541629095281, + "grad_norm": 2.7716176509857178, + "learning_rate": 3.209097284841198e-05, + "loss": 0.5461, + "step": 121550 + }, + { + "epoch": 1.0746300323555933, + "grad_norm": 1.9521986246109009, + "learning_rate": 3.208949946074011e-05, + "loss": 0.5778, + "step": 121560 + }, + { + "epoch": 1.0747184356159054, + "grad_norm": 3.8622937202453613, + "learning_rate": 3.2088026073068246e-05, + "loss": 0.6507, + "step": 121570 + }, + { + "epoch": 1.0748068388762178, + "grad_norm": 5.4770050048828125, + "learning_rate": 3.208655268539637e-05, + "loss": 0.806, + "step": 121580 + }, + { + "epoch": 1.07489524213653, + "grad_norm": 3.598459482192993, + "learning_rate": 3.20850792977245e-05, + "loss": 0.6434, + "step": 121590 + }, + { + "epoch": 1.0749836453968422, + "grad_norm": 1.5761405229568481, + "learning_rate": 3.208360591005263e-05, + "loss": 0.6724, + "step": 121600 + }, + { + "epoch": 1.0750720486571546, + "grad_norm": 1.326784372329712, + "learning_rate": 3.208213252238076e-05, + "loss": 0.5272, + "step": 121610 + }, + { + "epoch": 1.0751604519174667, + "grad_norm": 2.9855453968048096, + "learning_rate": 3.208065913470889e-05, + "loss": 0.6267, + "step": 121620 + }, + { + "epoch": 1.075248855177779, + "grad_norm": 3.084684133529663, + "learning_rate": 3.207918574703702e-05, + "loss": 0.5896, + "step": 121630 + }, + { + "epoch": 1.0753372584380911, + "grad_norm": 1.775133728981018, + "learning_rate": 3.2077712359365145e-05, + "loss": 0.6063, + "step": 121640 + }, + { + "epoch": 1.0754256616984035, + "grad_norm": 1.709446668624878, + "learning_rate": 3.207623897169328e-05, + "loss": 0.6196, + "step": 121650 + }, + { + "epoch": 1.0755140649587156, + "grad_norm": 4.275025367736816, + "learning_rate": 3.20747655840214e-05, + "loss": 0.7465, + "step": 121660 + }, + { + "epoch": 1.075602468219028, + "grad_norm": 11.366464614868164, + "learning_rate": 3.207329219634954e-05, + "loss": 0.5256, + "step": 121670 + }, + { + "epoch": 1.07569087147934, + "grad_norm": 3.131870985031128, + "learning_rate": 3.2071818808677665e-05, + "loss": 0.5486, + "step": 121680 + }, + { + "epoch": 1.0757792747396524, + "grad_norm": 1.2570112943649292, + "learning_rate": 3.2070345421005793e-05, + "loss": 0.6025, + "step": 121690 + }, + { + "epoch": 1.0758676779999647, + "grad_norm": 4.750698089599609, + "learning_rate": 3.206887203333392e-05, + "loss": 0.6556, + "step": 121700 + }, + { + "epoch": 1.0759560812602769, + "grad_norm": 8.063404083251953, + "learning_rate": 3.206739864566206e-05, + "loss": 0.726, + "step": 121710 + }, + { + "epoch": 1.0760444845205892, + "grad_norm": 1.368151307106018, + "learning_rate": 3.206592525799018e-05, + "loss": 0.7945, + "step": 121720 + }, + { + "epoch": 1.0761328877809013, + "grad_norm": 7.499267101287842, + "learning_rate": 3.2064451870318314e-05, + "loss": 0.5429, + "step": 121730 + }, + { + "epoch": 1.0762212910412137, + "grad_norm": 1.5975914001464844, + "learning_rate": 3.206297848264644e-05, + "loss": 0.5326, + "step": 121740 + }, + { + "epoch": 1.0763096943015258, + "grad_norm": 13.910560607910156, + "learning_rate": 3.206150509497457e-05, + "loss": 0.737, + "step": 121750 + }, + { + "epoch": 1.0763980975618381, + "grad_norm": 6.845784664154053, + "learning_rate": 3.20600317073027e-05, + "loss": 0.6227, + "step": 121760 + }, + { + "epoch": 1.0764865008221502, + "grad_norm": 1.2261799573898315, + "learning_rate": 3.205855831963083e-05, + "loss": 0.7177, + "step": 121770 + }, + { + "epoch": 1.0765749040824626, + "grad_norm": 2.2650344371795654, + "learning_rate": 3.2057084931958955e-05, + "loss": 0.5767, + "step": 121780 + }, + { + "epoch": 1.0766633073427747, + "grad_norm": 3.7625882625579834, + "learning_rate": 3.205561154428709e-05, + "loss": 0.6843, + "step": 121790 + }, + { + "epoch": 1.076751710603087, + "grad_norm": 11.173534393310547, + "learning_rate": 3.205413815661521e-05, + "loss": 0.5714, + "step": 121800 + }, + { + "epoch": 1.0768401138633994, + "grad_norm": 0.8792701363563538, + "learning_rate": 3.205266476894335e-05, + "loss": 0.5123, + "step": 121810 + }, + { + "epoch": 1.0769285171237115, + "grad_norm": 5.793383598327637, + "learning_rate": 3.2051191381271476e-05, + "loss": 0.7082, + "step": 121820 + }, + { + "epoch": 1.0770169203840239, + "grad_norm": 1.818489909172058, + "learning_rate": 3.2049717993599604e-05, + "loss": 0.7048, + "step": 121830 + }, + { + "epoch": 1.077105323644336, + "grad_norm": 6.321500301361084, + "learning_rate": 3.204824460592773e-05, + "loss": 0.7024, + "step": 121840 + }, + { + "epoch": 1.0771937269046483, + "grad_norm": 1.1618351936340332, + "learning_rate": 3.204677121825587e-05, + "loss": 0.5693, + "step": 121850 + }, + { + "epoch": 1.0772821301649604, + "grad_norm": 6.2675628662109375, + "learning_rate": 3.204529783058399e-05, + "loss": 0.5162, + "step": 121860 + }, + { + "epoch": 1.0773705334252728, + "grad_norm": 16.915861129760742, + "learning_rate": 3.2043824442912124e-05, + "loss": 0.7185, + "step": 121870 + }, + { + "epoch": 1.077458936685585, + "grad_norm": 2.535393714904785, + "learning_rate": 3.204235105524025e-05, + "loss": 0.5321, + "step": 121880 + }, + { + "epoch": 1.0775473399458972, + "grad_norm": 3.225973129272461, + "learning_rate": 3.204087766756838e-05, + "loss": 0.7294, + "step": 121890 + }, + { + "epoch": 1.0776357432062094, + "grad_norm": 10.800483703613281, + "learning_rate": 3.203940427989651e-05, + "loss": 0.511, + "step": 121900 + }, + { + "epoch": 1.0777241464665217, + "grad_norm": 4.964085102081299, + "learning_rate": 3.203793089222464e-05, + "loss": 0.6674, + "step": 121910 + }, + { + "epoch": 1.0778125497268338, + "grad_norm": 5.181891918182373, + "learning_rate": 3.203645750455277e-05, + "loss": 0.6324, + "step": 121920 + }, + { + "epoch": 1.0779009529871462, + "grad_norm": 4.374396324157715, + "learning_rate": 3.20349841168809e-05, + "loss": 0.576, + "step": 121930 + }, + { + "epoch": 1.0779893562474585, + "grad_norm": 3.8809781074523926, + "learning_rate": 3.203351072920903e-05, + "loss": 0.7083, + "step": 121940 + }, + { + "epoch": 1.0780777595077706, + "grad_norm": 2.152778148651123, + "learning_rate": 3.203203734153716e-05, + "loss": 0.6069, + "step": 121950 + }, + { + "epoch": 1.078166162768083, + "grad_norm": 2.129201889038086, + "learning_rate": 3.2030563953865286e-05, + "loss": 0.6123, + "step": 121960 + }, + { + "epoch": 1.078254566028395, + "grad_norm": 2.8653624057769775, + "learning_rate": 3.2029090566193414e-05, + "loss": 0.724, + "step": 121970 + }, + { + "epoch": 1.0783429692887074, + "grad_norm": 11.684548377990723, + "learning_rate": 3.202761717852155e-05, + "loss": 0.7077, + "step": 121980 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 1.9912139177322388, + "learning_rate": 3.202614379084967e-05, + "loss": 0.625, + "step": 121990 + }, + { + "epoch": 1.078519775809332, + "grad_norm": 2.9863288402557373, + "learning_rate": 3.2024670403177806e-05, + "loss": 0.6493, + "step": 122000 + }, + { + "epoch": 1.078608179069644, + "grad_norm": 2.803328514099121, + "learning_rate": 3.2023197015505935e-05, + "loss": 0.6274, + "step": 122010 + }, + { + "epoch": 1.0786965823299564, + "grad_norm": 1.2812105417251587, + "learning_rate": 3.202172362783406e-05, + "loss": 0.6034, + "step": 122020 + }, + { + "epoch": 1.0787849855902685, + "grad_norm": 7.073108673095703, + "learning_rate": 3.202025024016219e-05, + "loss": 0.6396, + "step": 122030 + }, + { + "epoch": 1.0788733888505808, + "grad_norm": 16.532384872436523, + "learning_rate": 3.2018776852490326e-05, + "loss": 0.7696, + "step": 122040 + }, + { + "epoch": 1.078961792110893, + "grad_norm": 3.5699141025543213, + "learning_rate": 3.201730346481845e-05, + "loss": 0.5237, + "step": 122050 + }, + { + "epoch": 1.0790501953712053, + "grad_norm": 1.8562746047973633, + "learning_rate": 3.201583007714658e-05, + "loss": 0.5993, + "step": 122060 + }, + { + "epoch": 1.0791385986315176, + "grad_norm": 4.66828727722168, + "learning_rate": 3.201435668947471e-05, + "loss": 0.7343, + "step": 122070 + }, + { + "epoch": 1.0792270018918297, + "grad_norm": 1.4367125034332275, + "learning_rate": 3.201288330180284e-05, + "loss": 0.5483, + "step": 122080 + }, + { + "epoch": 1.079315405152142, + "grad_norm": 0.9155851006507874, + "learning_rate": 3.201140991413097e-05, + "loss": 0.6584, + "step": 122090 + }, + { + "epoch": 1.0794038084124542, + "grad_norm": 2.3527088165283203, + "learning_rate": 3.20099365264591e-05, + "loss": 0.5648, + "step": 122100 + }, + { + "epoch": 1.0794922116727665, + "grad_norm": 2.074800968170166, + "learning_rate": 3.2008463138787225e-05, + "loss": 0.588, + "step": 122110 + }, + { + "epoch": 1.0795806149330787, + "grad_norm": 1.8037925958633423, + "learning_rate": 3.200698975111536e-05, + "loss": 0.7356, + "step": 122120 + }, + { + "epoch": 1.079669018193391, + "grad_norm": 2.0469555854797363, + "learning_rate": 3.200551636344348e-05, + "loss": 0.5375, + "step": 122130 + }, + { + "epoch": 1.0797574214537031, + "grad_norm": 6.390101432800293, + "learning_rate": 3.200404297577162e-05, + "loss": 0.6544, + "step": 122140 + }, + { + "epoch": 1.0798458247140155, + "grad_norm": 1.8118375539779663, + "learning_rate": 3.2002569588099745e-05, + "loss": 0.5123, + "step": 122150 + }, + { + "epoch": 1.0799342279743276, + "grad_norm": 8.044949531555176, + "learning_rate": 3.2001096200427873e-05, + "loss": 0.6148, + "step": 122160 + }, + { + "epoch": 1.08002263123464, + "grad_norm": 1.4586153030395508, + "learning_rate": 3.1999622812756e-05, + "loss": 0.6151, + "step": 122170 + }, + { + "epoch": 1.0801110344949523, + "grad_norm": 1.2313237190246582, + "learning_rate": 3.199814942508414e-05, + "loss": 0.4891, + "step": 122180 + }, + { + "epoch": 1.0801994377552644, + "grad_norm": 1.6114076375961304, + "learning_rate": 3.199667603741226e-05, + "loss": 0.4885, + "step": 122190 + }, + { + "epoch": 1.0802878410155767, + "grad_norm": 2.5828280448913574, + "learning_rate": 3.1995202649740394e-05, + "loss": 0.7361, + "step": 122200 + }, + { + "epoch": 1.0803762442758889, + "grad_norm": 1.6665910482406616, + "learning_rate": 3.199372926206852e-05, + "loss": 0.5084, + "step": 122210 + }, + { + "epoch": 1.0804646475362012, + "grad_norm": 1.9073121547698975, + "learning_rate": 3.199225587439665e-05, + "loss": 0.5759, + "step": 122220 + }, + { + "epoch": 1.0805530507965133, + "grad_norm": 2.49973201751709, + "learning_rate": 3.199078248672478e-05, + "loss": 0.6122, + "step": 122230 + }, + { + "epoch": 1.0806414540568257, + "grad_norm": 3.6795437335968018, + "learning_rate": 3.198930909905291e-05, + "loss": 0.6624, + "step": 122240 + }, + { + "epoch": 1.0807298573171378, + "grad_norm": 2.7021732330322266, + "learning_rate": 3.1987835711381035e-05, + "loss": 0.5861, + "step": 122250 + }, + { + "epoch": 1.0808182605774501, + "grad_norm": 2.5378150939941406, + "learning_rate": 3.198636232370917e-05, + "loss": 0.6666, + "step": 122260 + }, + { + "epoch": 1.0809066638377622, + "grad_norm": 3.60380482673645, + "learning_rate": 3.198488893603729e-05, + "loss": 0.6491, + "step": 122270 + }, + { + "epoch": 1.0809950670980746, + "grad_norm": 2.004743814468384, + "learning_rate": 3.198341554836543e-05, + "loss": 0.6239, + "step": 122280 + }, + { + "epoch": 1.081083470358387, + "grad_norm": 1.2397043704986572, + "learning_rate": 3.1981942160693556e-05, + "loss": 0.5916, + "step": 122290 + }, + { + "epoch": 1.081171873618699, + "grad_norm": 0.6310939788818359, + "learning_rate": 3.1980468773021684e-05, + "loss": 0.5, + "step": 122300 + }, + { + "epoch": 1.0812602768790114, + "grad_norm": 2.923029661178589, + "learning_rate": 3.197899538534981e-05, + "loss": 0.5786, + "step": 122310 + }, + { + "epoch": 1.0813486801393235, + "grad_norm": 2.1095168590545654, + "learning_rate": 3.197752199767795e-05, + "loss": 0.6065, + "step": 122320 + }, + { + "epoch": 1.0814370833996358, + "grad_norm": 2.7881975173950195, + "learning_rate": 3.197604861000607e-05, + "loss": 0.6355, + "step": 122330 + }, + { + "epoch": 1.081525486659948, + "grad_norm": 0.8852866888046265, + "learning_rate": 3.1974575222334204e-05, + "loss": 0.6103, + "step": 122340 + }, + { + "epoch": 1.0816138899202603, + "grad_norm": 8.771408081054688, + "learning_rate": 3.1973101834662326e-05, + "loss": 0.7937, + "step": 122350 + }, + { + "epoch": 1.0817022931805724, + "grad_norm": 6.1663618087768555, + "learning_rate": 3.197162844699046e-05, + "loss": 0.5049, + "step": 122360 + }, + { + "epoch": 1.0817906964408848, + "grad_norm": 8.269588470458984, + "learning_rate": 3.197015505931859e-05, + "loss": 0.6602, + "step": 122370 + }, + { + "epoch": 1.081879099701197, + "grad_norm": 1.1021735668182373, + "learning_rate": 3.196868167164672e-05, + "loss": 0.553, + "step": 122380 + }, + { + "epoch": 1.0819675029615092, + "grad_norm": 1.3458079099655151, + "learning_rate": 3.1967208283974846e-05, + "loss": 0.4639, + "step": 122390 + }, + { + "epoch": 1.0820559062218216, + "grad_norm": 18.411046981811523, + "learning_rate": 3.196573489630298e-05, + "loss": 0.6744, + "step": 122400 + }, + { + "epoch": 1.0821443094821337, + "grad_norm": 3.0645389556884766, + "learning_rate": 3.19642615086311e-05, + "loss": 0.5915, + "step": 122410 + }, + { + "epoch": 1.082232712742446, + "grad_norm": 3.1344542503356934, + "learning_rate": 3.196278812095924e-05, + "loss": 0.7577, + "step": 122420 + }, + { + "epoch": 1.0823211160027582, + "grad_norm": 1.5618627071380615, + "learning_rate": 3.1961314733287366e-05, + "loss": 0.6805, + "step": 122430 + }, + { + "epoch": 1.0824095192630705, + "grad_norm": 2.5196473598480225, + "learning_rate": 3.1959841345615495e-05, + "loss": 0.6196, + "step": 122440 + }, + { + "epoch": 1.0824979225233826, + "grad_norm": 2.6964004039764404, + "learning_rate": 3.195836795794362e-05, + "loss": 0.6674, + "step": 122450 + }, + { + "epoch": 1.082586325783695, + "grad_norm": 3.5802650451660156, + "learning_rate": 3.195689457027176e-05, + "loss": 0.6871, + "step": 122460 + }, + { + "epoch": 1.082674729044007, + "grad_norm": 2.1361007690429688, + "learning_rate": 3.195542118259988e-05, + "loss": 0.5687, + "step": 122470 + }, + { + "epoch": 1.0827631323043194, + "grad_norm": 2.30440354347229, + "learning_rate": 3.1953947794928015e-05, + "loss": 0.5267, + "step": 122480 + }, + { + "epoch": 1.0828515355646315, + "grad_norm": 1.0843998193740845, + "learning_rate": 3.1952474407256136e-05, + "loss": 0.5372, + "step": 122490 + }, + { + "epoch": 1.0829399388249439, + "grad_norm": 1.6311516761779785, + "learning_rate": 3.195100101958427e-05, + "loss": 0.6458, + "step": 122500 + }, + { + "epoch": 1.083028342085256, + "grad_norm": 5.391809940338135, + "learning_rate": 3.19495276319124e-05, + "loss": 0.6708, + "step": 122510 + }, + { + "epoch": 1.0831167453455683, + "grad_norm": 2.112456798553467, + "learning_rate": 3.194805424424053e-05, + "loss": 0.5991, + "step": 122520 + }, + { + "epoch": 1.0832051486058807, + "grad_norm": 2.9221577644348145, + "learning_rate": 3.1946580856568656e-05, + "loss": 0.7105, + "step": 122530 + }, + { + "epoch": 1.0832935518661928, + "grad_norm": 2.363945722579956, + "learning_rate": 3.194510746889679e-05, + "loss": 0.5106, + "step": 122540 + }, + { + "epoch": 1.0833819551265051, + "grad_norm": 57.95821762084961, + "learning_rate": 3.194363408122491e-05, + "loss": 0.6824, + "step": 122550 + }, + { + "epoch": 1.0834703583868173, + "grad_norm": 2.919809579849243, + "learning_rate": 3.194216069355305e-05, + "loss": 0.614, + "step": 122560 + }, + { + "epoch": 1.0835587616471296, + "grad_norm": 4.594418048858643, + "learning_rate": 3.194068730588118e-05, + "loss": 0.6027, + "step": 122570 + }, + { + "epoch": 1.0836471649074417, + "grad_norm": 6.151988983154297, + "learning_rate": 3.1939213918209305e-05, + "loss": 0.611, + "step": 122580 + }, + { + "epoch": 1.083735568167754, + "grad_norm": 1.7607015371322632, + "learning_rate": 3.193774053053743e-05, + "loss": 0.6062, + "step": 122590 + }, + { + "epoch": 1.0838239714280662, + "grad_norm": 1.4993444681167603, + "learning_rate": 3.193626714286556e-05, + "loss": 0.551, + "step": 122600 + }, + { + "epoch": 1.0839123746883785, + "grad_norm": 1.3588351011276245, + "learning_rate": 3.193479375519369e-05, + "loss": 0.6519, + "step": 122610 + }, + { + "epoch": 1.0840007779486907, + "grad_norm": 1.543945550918579, + "learning_rate": 3.1933320367521825e-05, + "loss": 0.621, + "step": 122620 + }, + { + "epoch": 1.084089181209003, + "grad_norm": 2.5363752841949463, + "learning_rate": 3.193184697984995e-05, + "loss": 0.6891, + "step": 122630 + }, + { + "epoch": 1.0841775844693151, + "grad_norm": 3.4957563877105713, + "learning_rate": 3.193037359217808e-05, + "loss": 0.4635, + "step": 122640 + }, + { + "epoch": 1.0842659877296275, + "grad_norm": 1.6638953685760498, + "learning_rate": 3.192890020450621e-05, + "loss": 0.467, + "step": 122650 + }, + { + "epoch": 1.0843543909899398, + "grad_norm": 3.1594300270080566, + "learning_rate": 3.192742681683434e-05, + "loss": 0.6271, + "step": 122660 + }, + { + "epoch": 1.084442794250252, + "grad_norm": 4.568197727203369, + "learning_rate": 3.192595342916247e-05, + "loss": 0.5805, + "step": 122670 + }, + { + "epoch": 1.0845311975105643, + "grad_norm": 3.5797080993652344, + "learning_rate": 3.19244800414906e-05, + "loss": 0.5598, + "step": 122680 + }, + { + "epoch": 1.0846196007708764, + "grad_norm": 3.7258315086364746, + "learning_rate": 3.1923006653818724e-05, + "loss": 0.5602, + "step": 122690 + }, + { + "epoch": 1.0847080040311887, + "grad_norm": 5.744478225708008, + "learning_rate": 3.192153326614686e-05, + "loss": 0.6259, + "step": 122700 + }, + { + "epoch": 1.0847964072915008, + "grad_norm": 1.917878270149231, + "learning_rate": 3.192005987847499e-05, + "loss": 0.6345, + "step": 122710 + }, + { + "epoch": 1.0848848105518132, + "grad_norm": 7.340816497802734, + "learning_rate": 3.1918586490803116e-05, + "loss": 0.7264, + "step": 122720 + }, + { + "epoch": 1.0849732138121253, + "grad_norm": 1.8755451440811157, + "learning_rate": 3.1917113103131244e-05, + "loss": 0.6663, + "step": 122730 + }, + { + "epoch": 1.0850616170724376, + "grad_norm": 7.353437900543213, + "learning_rate": 3.191563971545937e-05, + "loss": 0.5963, + "step": 122740 + }, + { + "epoch": 1.0851500203327498, + "grad_norm": 1.9176220893859863, + "learning_rate": 3.19141663277875e-05, + "loss": 0.5429, + "step": 122750 + }, + { + "epoch": 1.085238423593062, + "grad_norm": 1.859707236289978, + "learning_rate": 3.1912692940115636e-05, + "loss": 0.67, + "step": 122760 + }, + { + "epoch": 1.0853268268533744, + "grad_norm": 1.7249350547790527, + "learning_rate": 3.1911219552443764e-05, + "loss": 0.5487, + "step": 122770 + }, + { + "epoch": 1.0854152301136866, + "grad_norm": 6.912788391113281, + "learning_rate": 3.190974616477189e-05, + "loss": 0.7496, + "step": 122780 + }, + { + "epoch": 1.085503633373999, + "grad_norm": 1.5615384578704834, + "learning_rate": 3.190827277710002e-05, + "loss": 0.5046, + "step": 122790 + }, + { + "epoch": 1.085592036634311, + "grad_norm": 2.0725605487823486, + "learning_rate": 3.190679938942815e-05, + "loss": 0.5555, + "step": 122800 + }, + { + "epoch": 1.0856804398946234, + "grad_norm": 4.006580352783203, + "learning_rate": 3.190532600175628e-05, + "loss": 0.6636, + "step": 122810 + }, + { + "epoch": 1.0857688431549355, + "grad_norm": 6.023318290710449, + "learning_rate": 3.1903852614084406e-05, + "loss": 0.696, + "step": 122820 + }, + { + "epoch": 1.0858572464152478, + "grad_norm": 4.691527843475342, + "learning_rate": 3.190237922641254e-05, + "loss": 0.6718, + "step": 122830 + }, + { + "epoch": 1.08594564967556, + "grad_norm": 2.462649345397949, + "learning_rate": 3.190090583874067e-05, + "loss": 0.5945, + "step": 122840 + }, + { + "epoch": 1.0860340529358723, + "grad_norm": 3.252864122390747, + "learning_rate": 3.18994324510688e-05, + "loss": 0.8285, + "step": 122850 + }, + { + "epoch": 1.0861224561961844, + "grad_norm": 1.8932931423187256, + "learning_rate": 3.1897959063396926e-05, + "loss": 0.6723, + "step": 122860 + }, + { + "epoch": 1.0862108594564968, + "grad_norm": 4.223151683807373, + "learning_rate": 3.1896485675725054e-05, + "loss": 0.6034, + "step": 122870 + }, + { + "epoch": 1.086299262716809, + "grad_norm": 5.50346565246582, + "learning_rate": 3.189501228805318e-05, + "loss": 0.7297, + "step": 122880 + }, + { + "epoch": 1.0863876659771212, + "grad_norm": 4.345876216888428, + "learning_rate": 3.189353890038132e-05, + "loss": 0.6009, + "step": 122890 + }, + { + "epoch": 1.0864760692374336, + "grad_norm": 4.005703926086426, + "learning_rate": 3.1892065512709446e-05, + "loss": 0.7488, + "step": 122900 + }, + { + "epoch": 1.0865644724977457, + "grad_norm": 9.135722160339355, + "learning_rate": 3.1890592125037575e-05, + "loss": 0.6311, + "step": 122910 + }, + { + "epoch": 1.086652875758058, + "grad_norm": 9.949433326721191, + "learning_rate": 3.18891187373657e-05, + "loss": 0.6459, + "step": 122920 + }, + { + "epoch": 1.0867412790183701, + "grad_norm": 2.5519168376922607, + "learning_rate": 3.188764534969383e-05, + "loss": 0.5789, + "step": 122930 + }, + { + "epoch": 1.0868296822786825, + "grad_norm": 1.693414330482483, + "learning_rate": 3.188617196202196e-05, + "loss": 0.6737, + "step": 122940 + }, + { + "epoch": 1.0869180855389946, + "grad_norm": 3.470093011856079, + "learning_rate": 3.1884698574350095e-05, + "loss": 0.5935, + "step": 122950 + }, + { + "epoch": 1.087006488799307, + "grad_norm": 2.216045379638672, + "learning_rate": 3.1883225186678216e-05, + "loss": 0.7899, + "step": 122960 + }, + { + "epoch": 1.087094892059619, + "grad_norm": 1.5903863906860352, + "learning_rate": 3.188175179900635e-05, + "loss": 0.5255, + "step": 122970 + }, + { + "epoch": 1.0871832953199314, + "grad_norm": 4.212559223175049, + "learning_rate": 3.188027841133448e-05, + "loss": 0.6146, + "step": 122980 + }, + { + "epoch": 1.0872716985802438, + "grad_norm": 20.852306365966797, + "learning_rate": 3.187880502366261e-05, + "loss": 0.6694, + "step": 122990 + }, + { + "epoch": 1.0873601018405559, + "grad_norm": 5.608232498168945, + "learning_rate": 3.1877331635990737e-05, + "loss": 0.6996, + "step": 123000 + }, + { + "epoch": 1.0874485051008682, + "grad_norm": 3.7173542976379395, + "learning_rate": 3.187585824831887e-05, + "loss": 0.6763, + "step": 123010 + }, + { + "epoch": 1.0875369083611803, + "grad_norm": 2.7234785556793213, + "learning_rate": 3.187438486064699e-05, + "loss": 0.632, + "step": 123020 + }, + { + "epoch": 1.0876253116214927, + "grad_norm": 4.820131778717041, + "learning_rate": 3.187291147297513e-05, + "loss": 0.5988, + "step": 123030 + }, + { + "epoch": 1.0877137148818048, + "grad_norm": 1.576489806175232, + "learning_rate": 3.187143808530326e-05, + "loss": 0.5666, + "step": 123040 + }, + { + "epoch": 1.0878021181421171, + "grad_norm": 9.27625560760498, + "learning_rate": 3.1869964697631385e-05, + "loss": 0.6994, + "step": 123050 + }, + { + "epoch": 1.0878905214024293, + "grad_norm": 1.7611716985702515, + "learning_rate": 3.1868491309959513e-05, + "loss": 0.5743, + "step": 123060 + }, + { + "epoch": 1.0879789246627416, + "grad_norm": 1.4599748849868774, + "learning_rate": 3.186701792228764e-05, + "loss": 0.6491, + "step": 123070 + }, + { + "epoch": 1.0880673279230537, + "grad_norm": 6.349857807159424, + "learning_rate": 3.186554453461577e-05, + "loss": 0.6364, + "step": 123080 + }, + { + "epoch": 1.088155731183366, + "grad_norm": 1.1519315242767334, + "learning_rate": 3.1864071146943905e-05, + "loss": 0.4621, + "step": 123090 + }, + { + "epoch": 1.0882441344436782, + "grad_norm": 1.7422294616699219, + "learning_rate": 3.186259775927203e-05, + "loss": 0.6687, + "step": 123100 + }, + { + "epoch": 1.0883325377039905, + "grad_norm": 2.3867409229278564, + "learning_rate": 3.186112437160016e-05, + "loss": 0.6195, + "step": 123110 + }, + { + "epoch": 1.0884209409643029, + "grad_norm": 3.232933282852173, + "learning_rate": 3.185965098392829e-05, + "loss": 0.5636, + "step": 123120 + }, + { + "epoch": 1.088509344224615, + "grad_norm": 3.027165651321411, + "learning_rate": 3.185817759625642e-05, + "loss": 0.5069, + "step": 123130 + }, + { + "epoch": 1.0885977474849273, + "grad_norm": 3.1634950637817383, + "learning_rate": 3.185670420858455e-05, + "loss": 0.5984, + "step": 123140 + }, + { + "epoch": 1.0886861507452394, + "grad_norm": 5.46713924407959, + "learning_rate": 3.185523082091268e-05, + "loss": 0.6881, + "step": 123150 + }, + { + "epoch": 1.0887745540055518, + "grad_norm": 1.8680810928344727, + "learning_rate": 3.1853757433240804e-05, + "loss": 0.6447, + "step": 123160 + }, + { + "epoch": 1.088862957265864, + "grad_norm": 1.7547731399536133, + "learning_rate": 3.185228404556894e-05, + "loss": 0.7225, + "step": 123170 + }, + { + "epoch": 1.0889513605261762, + "grad_norm": 2.694031238555908, + "learning_rate": 3.185081065789706e-05, + "loss": 0.6521, + "step": 123180 + }, + { + "epoch": 1.0890397637864884, + "grad_norm": 3.2728137969970703, + "learning_rate": 3.1849337270225196e-05, + "loss": 0.681, + "step": 123190 + }, + { + "epoch": 1.0891281670468007, + "grad_norm": 11.53569507598877, + "learning_rate": 3.1847863882553324e-05, + "loss": 0.6359, + "step": 123200 + }, + { + "epoch": 1.0892165703071128, + "grad_norm": 2.5578250885009766, + "learning_rate": 3.184639049488145e-05, + "loss": 0.7334, + "step": 123210 + }, + { + "epoch": 1.0893049735674252, + "grad_norm": 2.361954927444458, + "learning_rate": 3.184491710720958e-05, + "loss": 0.5542, + "step": 123220 + }, + { + "epoch": 1.0893933768277373, + "grad_norm": 31.11703109741211, + "learning_rate": 3.1843443719537716e-05, + "loss": 0.7469, + "step": 123230 + }, + { + "epoch": 1.0894817800880496, + "grad_norm": 6.2968430519104, + "learning_rate": 3.184197033186584e-05, + "loss": 0.6461, + "step": 123240 + }, + { + "epoch": 1.089570183348362, + "grad_norm": 2.392772912979126, + "learning_rate": 3.184049694419397e-05, + "loss": 0.56, + "step": 123250 + }, + { + "epoch": 1.089658586608674, + "grad_norm": 1.6381855010986328, + "learning_rate": 3.18390235565221e-05, + "loss": 0.6057, + "step": 123260 + }, + { + "epoch": 1.0897469898689864, + "grad_norm": 4.16072940826416, + "learning_rate": 3.183755016885023e-05, + "loss": 0.5933, + "step": 123270 + }, + { + "epoch": 1.0898353931292986, + "grad_norm": 1.7321690320968628, + "learning_rate": 3.183607678117836e-05, + "loss": 0.5009, + "step": 123280 + }, + { + "epoch": 1.089923796389611, + "grad_norm": 3.3836262226104736, + "learning_rate": 3.1834603393506486e-05, + "loss": 0.5166, + "step": 123290 + }, + { + "epoch": 1.090012199649923, + "grad_norm": 1.843558430671692, + "learning_rate": 3.1833130005834614e-05, + "loss": 0.7366, + "step": 123300 + }, + { + "epoch": 1.0901006029102354, + "grad_norm": 0.9097705483436584, + "learning_rate": 3.183165661816275e-05, + "loss": 0.5847, + "step": 123310 + }, + { + "epoch": 1.0901890061705475, + "grad_norm": 2.4408435821533203, + "learning_rate": 3.183018323049087e-05, + "loss": 0.5333, + "step": 123320 + }, + { + "epoch": 1.0902774094308598, + "grad_norm": 4.341062068939209, + "learning_rate": 3.1828709842819006e-05, + "loss": 0.5979, + "step": 123330 + }, + { + "epoch": 1.090365812691172, + "grad_norm": 2.545137643814087, + "learning_rate": 3.1827236455147134e-05, + "loss": 0.6414, + "step": 123340 + }, + { + "epoch": 1.0904542159514843, + "grad_norm": 4.5739264488220215, + "learning_rate": 3.182576306747526e-05, + "loss": 0.5754, + "step": 123350 + }, + { + "epoch": 1.0905426192117966, + "grad_norm": 1.841970682144165, + "learning_rate": 3.182428967980339e-05, + "loss": 0.6466, + "step": 123360 + }, + { + "epoch": 1.0906310224721087, + "grad_norm": 1.1298997402191162, + "learning_rate": 3.1822816292131526e-05, + "loss": 0.6019, + "step": 123370 + }, + { + "epoch": 1.090719425732421, + "grad_norm": 2.7480437755584717, + "learning_rate": 3.182134290445965e-05, + "loss": 0.6754, + "step": 123380 + }, + { + "epoch": 1.0908078289927332, + "grad_norm": 2.2227084636688232, + "learning_rate": 3.181986951678778e-05, + "loss": 0.5505, + "step": 123390 + }, + { + "epoch": 1.0908962322530456, + "grad_norm": 3.8440029621124268, + "learning_rate": 3.181839612911591e-05, + "loss": 0.5024, + "step": 123400 + }, + { + "epoch": 1.0909846355133577, + "grad_norm": 4.927218914031982, + "learning_rate": 3.181692274144404e-05, + "loss": 0.6554, + "step": 123410 + }, + { + "epoch": 1.09107303877367, + "grad_norm": 4.422795295715332, + "learning_rate": 3.181544935377217e-05, + "loss": 0.6696, + "step": 123420 + }, + { + "epoch": 1.0911614420339821, + "grad_norm": 2.428706407546997, + "learning_rate": 3.1813975966100296e-05, + "loss": 0.5874, + "step": 123430 + }, + { + "epoch": 1.0912498452942945, + "grad_norm": 4.93586540222168, + "learning_rate": 3.1812502578428425e-05, + "loss": 0.5851, + "step": 123440 + }, + { + "epoch": 1.0913382485546066, + "grad_norm": 2.1009678840637207, + "learning_rate": 3.181102919075656e-05, + "loss": 0.5244, + "step": 123450 + }, + { + "epoch": 1.091426651814919, + "grad_norm": 20.023983001708984, + "learning_rate": 3.180955580308468e-05, + "loss": 0.5989, + "step": 123460 + }, + { + "epoch": 1.0915150550752313, + "grad_norm": 1.8608654737472534, + "learning_rate": 3.180808241541282e-05, + "loss": 0.6433, + "step": 123470 + }, + { + "epoch": 1.0916034583355434, + "grad_norm": 5.683498382568359, + "learning_rate": 3.1806609027740945e-05, + "loss": 0.551, + "step": 123480 + }, + { + "epoch": 1.0916918615958557, + "grad_norm": 2.197681427001953, + "learning_rate": 3.180513564006907e-05, + "loss": 0.6296, + "step": 123490 + }, + { + "epoch": 1.0917802648561679, + "grad_norm": 2.280125379562378, + "learning_rate": 3.18036622523972e-05, + "loss": 0.5279, + "step": 123500 + }, + { + "epoch": 1.0918686681164802, + "grad_norm": 1.1667300462722778, + "learning_rate": 3.180218886472534e-05, + "loss": 0.5624, + "step": 123510 + }, + { + "epoch": 1.0919570713767923, + "grad_norm": 2.5631821155548096, + "learning_rate": 3.180071547705346e-05, + "loss": 0.4854, + "step": 123520 + }, + { + "epoch": 1.0920454746371047, + "grad_norm": 1.6595871448516846, + "learning_rate": 3.1799242089381594e-05, + "loss": 0.667, + "step": 123530 + }, + { + "epoch": 1.0921338778974168, + "grad_norm": 3.094825029373169, + "learning_rate": 3.1797768701709715e-05, + "loss": 0.689, + "step": 123540 + }, + { + "epoch": 1.0922222811577291, + "grad_norm": 1.3138394355773926, + "learning_rate": 3.179629531403785e-05, + "loss": 0.6058, + "step": 123550 + }, + { + "epoch": 1.0923106844180412, + "grad_norm": 2.099130392074585, + "learning_rate": 3.179482192636598e-05, + "loss": 0.643, + "step": 123560 + }, + { + "epoch": 1.0923990876783536, + "grad_norm": 2.628974676132202, + "learning_rate": 3.179334853869411e-05, + "loss": 0.6483, + "step": 123570 + }, + { + "epoch": 1.092487490938666, + "grad_norm": 2.490139961242676, + "learning_rate": 3.1791875151022235e-05, + "loss": 0.7011, + "step": 123580 + }, + { + "epoch": 1.092575894198978, + "grad_norm": 1.8102847337722778, + "learning_rate": 3.179040176335037e-05, + "loss": 0.5049, + "step": 123590 + }, + { + "epoch": 1.0926642974592904, + "grad_norm": 6.975493907928467, + "learning_rate": 3.178892837567849e-05, + "loss": 0.6521, + "step": 123600 + }, + { + "epoch": 1.0927527007196025, + "grad_norm": 2.1386125087738037, + "learning_rate": 3.178745498800663e-05, + "loss": 0.6642, + "step": 123610 + }, + { + "epoch": 1.0928411039799149, + "grad_norm": 8.358141899108887, + "learning_rate": 3.1785981600334755e-05, + "loss": 0.6421, + "step": 123620 + }, + { + "epoch": 1.092929507240227, + "grad_norm": 1.8538811206817627, + "learning_rate": 3.1784508212662884e-05, + "loss": 0.6678, + "step": 123630 + }, + { + "epoch": 1.0930179105005393, + "grad_norm": 2.4046263694763184, + "learning_rate": 3.178303482499101e-05, + "loss": 0.6618, + "step": 123640 + }, + { + "epoch": 1.0931063137608514, + "grad_norm": 3.560228109359741, + "learning_rate": 3.178156143731914e-05, + "loss": 0.6322, + "step": 123650 + }, + { + "epoch": 1.0931947170211638, + "grad_norm": 6.775623798370361, + "learning_rate": 3.178008804964727e-05, + "loss": 0.5787, + "step": 123660 + }, + { + "epoch": 1.093283120281476, + "grad_norm": 3.2997021675109863, + "learning_rate": 3.1778614661975404e-05, + "loss": 0.6072, + "step": 123670 + }, + { + "epoch": 1.0933715235417882, + "grad_norm": 1.1060670614242554, + "learning_rate": 3.177714127430353e-05, + "loss": 0.6452, + "step": 123680 + }, + { + "epoch": 1.0934599268021006, + "grad_norm": 1.32697594165802, + "learning_rate": 3.177566788663166e-05, + "loss": 0.6269, + "step": 123690 + }, + { + "epoch": 1.0935483300624127, + "grad_norm": 2.1969361305236816, + "learning_rate": 3.177419449895979e-05, + "loss": 0.5655, + "step": 123700 + }, + { + "epoch": 1.093636733322725, + "grad_norm": 1.4848862886428833, + "learning_rate": 3.177272111128792e-05, + "loss": 0.6188, + "step": 123710 + }, + { + "epoch": 1.0937251365830372, + "grad_norm": 4.382133483886719, + "learning_rate": 3.1771247723616046e-05, + "loss": 0.7114, + "step": 123720 + }, + { + "epoch": 1.0938135398433495, + "grad_norm": 1.4304269552230835, + "learning_rate": 3.176977433594418e-05, + "loss": 0.5503, + "step": 123730 + }, + { + "epoch": 1.0939019431036616, + "grad_norm": 0.9899899363517761, + "learning_rate": 3.176830094827231e-05, + "loss": 0.7028, + "step": 123740 + }, + { + "epoch": 1.093990346363974, + "grad_norm": 1.4632052183151245, + "learning_rate": 3.176682756060044e-05, + "loss": 0.6318, + "step": 123750 + }, + { + "epoch": 1.094078749624286, + "grad_norm": 1.8045881986618042, + "learning_rate": 3.1765354172928566e-05, + "loss": 0.6597, + "step": 123760 + }, + { + "epoch": 1.0941671528845984, + "grad_norm": 3.3151791095733643, + "learning_rate": 3.1763880785256694e-05, + "loss": 0.484, + "step": 123770 + }, + { + "epoch": 1.0942555561449105, + "grad_norm": 3.7142739295959473, + "learning_rate": 3.176240739758482e-05, + "loss": 0.5963, + "step": 123780 + }, + { + "epoch": 1.0943439594052229, + "grad_norm": 13.858745574951172, + "learning_rate": 3.176093400991295e-05, + "loss": 0.5214, + "step": 123790 + }, + { + "epoch": 1.094432362665535, + "grad_norm": 1.7841321229934692, + "learning_rate": 3.1759460622241086e-05, + "loss": 0.501, + "step": 123800 + }, + { + "epoch": 1.0945207659258473, + "grad_norm": 1.2429823875427246, + "learning_rate": 3.1757987234569215e-05, + "loss": 0.6267, + "step": 123810 + }, + { + "epoch": 1.0946091691861595, + "grad_norm": 1.8305622339248657, + "learning_rate": 3.175651384689734e-05, + "loss": 0.8363, + "step": 123820 + }, + { + "epoch": 1.0946975724464718, + "grad_norm": 2.0226359367370605, + "learning_rate": 3.175504045922547e-05, + "loss": 0.6559, + "step": 123830 + }, + { + "epoch": 1.0947859757067842, + "grad_norm": 3.287172555923462, + "learning_rate": 3.17535670715536e-05, + "loss": 0.6691, + "step": 123840 + }, + { + "epoch": 1.0948743789670963, + "grad_norm": 2.5246734619140625, + "learning_rate": 3.175209368388173e-05, + "loss": 0.6421, + "step": 123850 + }, + { + "epoch": 1.0949627822274086, + "grad_norm": 5.257667064666748, + "learning_rate": 3.175062029620986e-05, + "loss": 0.6591, + "step": 123860 + }, + { + "epoch": 1.0950511854877207, + "grad_norm": 3.2301576137542725, + "learning_rate": 3.174914690853799e-05, + "loss": 0.4681, + "step": 123870 + }, + { + "epoch": 1.095139588748033, + "grad_norm": 4.241169452667236, + "learning_rate": 3.174767352086612e-05, + "loss": 0.6829, + "step": 123880 + }, + { + "epoch": 1.0952279920083452, + "grad_norm": 1.628957748413086, + "learning_rate": 3.174620013319425e-05, + "loss": 0.6162, + "step": 123890 + }, + { + "epoch": 1.0953163952686575, + "grad_norm": 3.2245240211486816, + "learning_rate": 3.1744726745522376e-05, + "loss": 0.622, + "step": 123900 + }, + { + "epoch": 1.0954047985289697, + "grad_norm": 1.9790922403335571, + "learning_rate": 3.1743253357850505e-05, + "loss": 0.6179, + "step": 123910 + }, + { + "epoch": 1.095493201789282, + "grad_norm": 2.3553946018218994, + "learning_rate": 3.174177997017864e-05, + "loss": 0.709, + "step": 123920 + }, + { + "epoch": 1.0955816050495941, + "grad_norm": 2.2758944034576416, + "learning_rate": 3.174030658250676e-05, + "loss": 0.5286, + "step": 123930 + }, + { + "epoch": 1.0956700083099065, + "grad_norm": 2.614903211593628, + "learning_rate": 3.17388331948349e-05, + "loss": 0.5457, + "step": 123940 + }, + { + "epoch": 1.0957584115702188, + "grad_norm": 2.2489240169525146, + "learning_rate": 3.1737359807163025e-05, + "loss": 0.5752, + "step": 123950 + }, + { + "epoch": 1.095846814830531, + "grad_norm": 3.6554696559906006, + "learning_rate": 3.1735886419491153e-05, + "loss": 0.6931, + "step": 123960 + }, + { + "epoch": 1.0959352180908433, + "grad_norm": 2.23058819770813, + "learning_rate": 3.173441303181928e-05, + "loss": 0.6354, + "step": 123970 + }, + { + "epoch": 1.0960236213511554, + "grad_norm": 2.3187522888183594, + "learning_rate": 3.173293964414742e-05, + "loss": 0.5745, + "step": 123980 + }, + { + "epoch": 1.0961120246114677, + "grad_norm": 2.40840744972229, + "learning_rate": 3.173146625647554e-05, + "loss": 0.592, + "step": 123990 + }, + { + "epoch": 1.0962004278717798, + "grad_norm": 2.2621009349823, + "learning_rate": 3.1729992868803674e-05, + "loss": 0.6942, + "step": 124000 + }, + { + "epoch": 1.0962888311320922, + "grad_norm": 3.0730385780334473, + "learning_rate": 3.1728519481131795e-05, + "loss": 0.61, + "step": 124010 + }, + { + "epoch": 1.0963772343924043, + "grad_norm": 5.8164777755737305, + "learning_rate": 3.172704609345993e-05, + "loss": 0.5204, + "step": 124020 + }, + { + "epoch": 1.0964656376527167, + "grad_norm": 2.794093370437622, + "learning_rate": 3.172557270578806e-05, + "loss": 0.5585, + "step": 124030 + }, + { + "epoch": 1.0965540409130288, + "grad_norm": 1.3437511920928955, + "learning_rate": 3.172409931811619e-05, + "loss": 0.4742, + "step": 124040 + }, + { + "epoch": 1.0966424441733411, + "grad_norm": 3.45401668548584, + "learning_rate": 3.1722625930444315e-05, + "loss": 0.5125, + "step": 124050 + }, + { + "epoch": 1.0967308474336535, + "grad_norm": 6.227214336395264, + "learning_rate": 3.172115254277245e-05, + "loss": 0.6378, + "step": 124060 + }, + { + "epoch": 1.0968192506939656, + "grad_norm": 4.764873027801514, + "learning_rate": 3.171967915510057e-05, + "loss": 0.6519, + "step": 124070 + }, + { + "epoch": 1.096907653954278, + "grad_norm": 1.779847502708435, + "learning_rate": 3.171820576742871e-05, + "loss": 0.528, + "step": 124080 + }, + { + "epoch": 1.09699605721459, + "grad_norm": 6.919624328613281, + "learning_rate": 3.1716732379756836e-05, + "loss": 0.5687, + "step": 124090 + }, + { + "epoch": 1.0970844604749024, + "grad_norm": 8.186634063720703, + "learning_rate": 3.1715258992084964e-05, + "loss": 0.7371, + "step": 124100 + }, + { + "epoch": 1.0971728637352145, + "grad_norm": 5.432801246643066, + "learning_rate": 3.171378560441309e-05, + "loss": 0.727, + "step": 124110 + }, + { + "epoch": 1.0972612669955268, + "grad_norm": 4.025297164916992, + "learning_rate": 3.171231221674122e-05, + "loss": 0.6859, + "step": 124120 + }, + { + "epoch": 1.097349670255839, + "grad_norm": 10.788854598999023, + "learning_rate": 3.171083882906935e-05, + "loss": 0.5939, + "step": 124130 + }, + { + "epoch": 1.0974380735161513, + "grad_norm": 8.757100105285645, + "learning_rate": 3.1709365441397484e-05, + "loss": 0.5978, + "step": 124140 + }, + { + "epoch": 1.0975264767764634, + "grad_norm": 1.8570926189422607, + "learning_rate": 3.1707892053725606e-05, + "loss": 0.5262, + "step": 124150 + }, + { + "epoch": 1.0976148800367758, + "grad_norm": 1.9620702266693115, + "learning_rate": 3.170641866605374e-05, + "loss": 0.6535, + "step": 124160 + }, + { + "epoch": 1.097703283297088, + "grad_norm": 2.4935414791107178, + "learning_rate": 3.170494527838187e-05, + "loss": 0.6419, + "step": 124170 + }, + { + "epoch": 1.0977916865574002, + "grad_norm": 2.49300217628479, + "learning_rate": 3.170347189071e-05, + "loss": 0.5589, + "step": 124180 + }, + { + "epoch": 1.0978800898177126, + "grad_norm": 8.12836742401123, + "learning_rate": 3.1701998503038126e-05, + "loss": 0.6959, + "step": 124190 + }, + { + "epoch": 1.0979684930780247, + "grad_norm": 3.762848377227783, + "learning_rate": 3.170052511536626e-05, + "loss": 0.6687, + "step": 124200 + }, + { + "epoch": 1.098056896338337, + "grad_norm": 2.8207998275756836, + "learning_rate": 3.169905172769438e-05, + "loss": 0.5983, + "step": 124210 + }, + { + "epoch": 1.0981452995986491, + "grad_norm": 2.1519622802734375, + "learning_rate": 3.169757834002252e-05, + "loss": 0.5559, + "step": 124220 + }, + { + "epoch": 1.0982337028589615, + "grad_norm": 3.45025897026062, + "learning_rate": 3.169610495235064e-05, + "loss": 0.4949, + "step": 124230 + }, + { + "epoch": 1.0983221061192736, + "grad_norm": 2.2874369621276855, + "learning_rate": 3.1694631564678774e-05, + "loss": 0.5615, + "step": 124240 + }, + { + "epoch": 1.098410509379586, + "grad_norm": 0.9942915439605713, + "learning_rate": 3.16931581770069e-05, + "loss": 0.5601, + "step": 124250 + }, + { + "epoch": 1.098498912639898, + "grad_norm": 1.8301376104354858, + "learning_rate": 3.169168478933503e-05, + "loss": 0.6039, + "step": 124260 + }, + { + "epoch": 1.0985873159002104, + "grad_norm": 1.2993732690811157, + "learning_rate": 3.169021140166316e-05, + "loss": 0.6644, + "step": 124270 + }, + { + "epoch": 1.0986757191605228, + "grad_norm": 3.9826393127441406, + "learning_rate": 3.1688738013991295e-05, + "loss": 0.6299, + "step": 124280 + }, + { + "epoch": 1.0987641224208349, + "grad_norm": 5.105567932128906, + "learning_rate": 3.1687264626319416e-05, + "loss": 0.7223, + "step": 124290 + }, + { + "epoch": 1.0988525256811472, + "grad_norm": 3.4585683345794678, + "learning_rate": 3.168579123864755e-05, + "loss": 0.5159, + "step": 124300 + }, + { + "epoch": 1.0989409289414593, + "grad_norm": 2.7868947982788086, + "learning_rate": 3.168431785097568e-05, + "loss": 0.61, + "step": 124310 + }, + { + "epoch": 1.0990293322017717, + "grad_norm": 2.120448589324951, + "learning_rate": 3.168284446330381e-05, + "loss": 0.579, + "step": 124320 + }, + { + "epoch": 1.0991177354620838, + "grad_norm": 6.469594478607178, + "learning_rate": 3.1681371075631936e-05, + "loss": 0.5175, + "step": 124330 + }, + { + "epoch": 1.0992061387223961, + "grad_norm": 4.462221145629883, + "learning_rate": 3.167989768796007e-05, + "loss": 0.6313, + "step": 124340 + }, + { + "epoch": 1.0992945419827083, + "grad_norm": 2.2095582485198975, + "learning_rate": 3.167842430028819e-05, + "loss": 0.6948, + "step": 124350 + }, + { + "epoch": 1.0993829452430206, + "grad_norm": 1.5518501996994019, + "learning_rate": 3.167695091261633e-05, + "loss": 0.6219, + "step": 124360 + }, + { + "epoch": 1.0994713485033327, + "grad_norm": 2.8392245769500732, + "learning_rate": 3.167547752494445e-05, + "loss": 0.4992, + "step": 124370 + }, + { + "epoch": 1.099559751763645, + "grad_norm": 2.7968928813934326, + "learning_rate": 3.1674004137272585e-05, + "loss": 0.5455, + "step": 124380 + }, + { + "epoch": 1.0996481550239572, + "grad_norm": 2.4937191009521484, + "learning_rate": 3.167253074960071e-05, + "loss": 0.5717, + "step": 124390 + }, + { + "epoch": 1.0997365582842695, + "grad_norm": 1.6564760208129883, + "learning_rate": 3.167105736192884e-05, + "loss": 0.7106, + "step": 124400 + }, + { + "epoch": 1.0998249615445816, + "grad_norm": 1.3187408447265625, + "learning_rate": 3.166958397425697e-05, + "loss": 0.5031, + "step": 124410 + }, + { + "epoch": 1.099913364804894, + "grad_norm": 0.8784456253051758, + "learning_rate": 3.1668110586585105e-05, + "loss": 0.5915, + "step": 124420 + }, + { + "epoch": 1.1000017680652063, + "grad_norm": 1.3571583032608032, + "learning_rate": 3.166663719891323e-05, + "loss": 0.5861, + "step": 124430 + }, + { + "epoch": 1.1000901713255185, + "grad_norm": 1.1783332824707031, + "learning_rate": 3.166516381124136e-05, + "loss": 0.6347, + "step": 124440 + }, + { + "epoch": 1.1001785745858308, + "grad_norm": 1.5184040069580078, + "learning_rate": 3.166369042356949e-05, + "loss": 0.6243, + "step": 124450 + }, + { + "epoch": 1.100266977846143, + "grad_norm": 0.8016341924667358, + "learning_rate": 3.166221703589762e-05, + "loss": 0.4972, + "step": 124460 + }, + { + "epoch": 1.1003553811064553, + "grad_norm": 2.7434635162353516, + "learning_rate": 3.166074364822575e-05, + "loss": 0.6019, + "step": 124470 + }, + { + "epoch": 1.1004437843667674, + "grad_norm": 15.655183792114258, + "learning_rate": 3.1659270260553875e-05, + "loss": 0.7312, + "step": 124480 + }, + { + "epoch": 1.1005321876270797, + "grad_norm": 5.6018147468566895, + "learning_rate": 3.1657796872882004e-05, + "loss": 0.5543, + "step": 124490 + }, + { + "epoch": 1.1006205908873918, + "grad_norm": 18.7354679107666, + "learning_rate": 3.165632348521014e-05, + "loss": 0.7445, + "step": 124500 + }, + { + "epoch": 1.1007089941477042, + "grad_norm": 3.9416720867156982, + "learning_rate": 3.165485009753826e-05, + "loss": 0.746, + "step": 124510 + }, + { + "epoch": 1.1007973974080163, + "grad_norm": 2.296971321105957, + "learning_rate": 3.1653376709866395e-05, + "loss": 0.5416, + "step": 124520 + }, + { + "epoch": 1.1008858006683286, + "grad_norm": 2.4656710624694824, + "learning_rate": 3.1651903322194524e-05, + "loss": 0.674, + "step": 124530 + }, + { + "epoch": 1.100974203928641, + "grad_norm": 5.1056647300720215, + "learning_rate": 3.165042993452265e-05, + "loss": 0.5288, + "step": 124540 + }, + { + "epoch": 1.101062607188953, + "grad_norm": 2.2869575023651123, + "learning_rate": 3.164895654685078e-05, + "loss": 0.4813, + "step": 124550 + }, + { + "epoch": 1.1011510104492654, + "grad_norm": 1.507654070854187, + "learning_rate": 3.1647483159178916e-05, + "loss": 0.6334, + "step": 124560 + }, + { + "epoch": 1.1012394137095776, + "grad_norm": 7.412845134735107, + "learning_rate": 3.164600977150704e-05, + "loss": 0.5261, + "step": 124570 + }, + { + "epoch": 1.10132781696989, + "grad_norm": 21.285717010498047, + "learning_rate": 3.164453638383517e-05, + "loss": 0.5806, + "step": 124580 + }, + { + "epoch": 1.101416220230202, + "grad_norm": 0.9926066994667053, + "learning_rate": 3.16430629961633e-05, + "loss": 0.5848, + "step": 124590 + }, + { + "epoch": 1.1015046234905144, + "grad_norm": 2.340979814529419, + "learning_rate": 3.164158960849143e-05, + "loss": 0.5419, + "step": 124600 + }, + { + "epoch": 1.1015930267508265, + "grad_norm": 18.688566207885742, + "learning_rate": 3.164011622081956e-05, + "loss": 0.4664, + "step": 124610 + }, + { + "epoch": 1.1016814300111388, + "grad_norm": 1.94839346408844, + "learning_rate": 3.1638642833147686e-05, + "loss": 0.593, + "step": 124620 + }, + { + "epoch": 1.101769833271451, + "grad_norm": 1.2230571508407593, + "learning_rate": 3.1637169445475814e-05, + "loss": 0.5186, + "step": 124630 + }, + { + "epoch": 1.1018582365317633, + "grad_norm": 5.989471912384033, + "learning_rate": 3.163569605780395e-05, + "loss": 0.693, + "step": 124640 + }, + { + "epoch": 1.1019466397920756, + "grad_norm": 4.454293727874756, + "learning_rate": 3.163422267013208e-05, + "loss": 0.6537, + "step": 124650 + }, + { + "epoch": 1.1020350430523878, + "grad_norm": 3.478476047515869, + "learning_rate": 3.1632749282460206e-05, + "loss": 0.6131, + "step": 124660 + }, + { + "epoch": 1.1021234463127, + "grad_norm": 3.6605007648468018, + "learning_rate": 3.1631275894788334e-05, + "loss": 0.5098, + "step": 124670 + }, + { + "epoch": 1.1022118495730122, + "grad_norm": 1.8386131525039673, + "learning_rate": 3.162980250711646e-05, + "loss": 0.5756, + "step": 124680 + }, + { + "epoch": 1.1023002528333246, + "grad_norm": 9.639357566833496, + "learning_rate": 3.162832911944459e-05, + "loss": 0.6886, + "step": 124690 + }, + { + "epoch": 1.1023886560936367, + "grad_norm": 3.2937099933624268, + "learning_rate": 3.162685573177272e-05, + "loss": 0.6433, + "step": 124700 + }, + { + "epoch": 1.102477059353949, + "grad_norm": 1.8968168497085571, + "learning_rate": 3.1625382344100854e-05, + "loss": 0.5272, + "step": 124710 + }, + { + "epoch": 1.1025654626142611, + "grad_norm": 2.356100559234619, + "learning_rate": 3.162390895642898e-05, + "loss": 0.6988, + "step": 124720 + }, + { + "epoch": 1.1026538658745735, + "grad_norm": 2.712435722351074, + "learning_rate": 3.162243556875711e-05, + "loss": 0.6932, + "step": 124730 + }, + { + "epoch": 1.1027422691348856, + "grad_norm": 5.530733108520508, + "learning_rate": 3.162096218108524e-05, + "loss": 0.5679, + "step": 124740 + }, + { + "epoch": 1.102830672395198, + "grad_norm": 1.4024028778076172, + "learning_rate": 3.161948879341337e-05, + "loss": 0.5861, + "step": 124750 + }, + { + "epoch": 1.1029190756555103, + "grad_norm": 12.713836669921875, + "learning_rate": 3.1618015405741496e-05, + "loss": 0.7116, + "step": 124760 + }, + { + "epoch": 1.1030074789158224, + "grad_norm": 3.951727867126465, + "learning_rate": 3.161654201806963e-05, + "loss": 0.5621, + "step": 124770 + }, + { + "epoch": 1.1030958821761347, + "grad_norm": 4.476200103759766, + "learning_rate": 3.161506863039776e-05, + "loss": 0.5257, + "step": 124780 + }, + { + "epoch": 1.1031842854364469, + "grad_norm": 4.1308979988098145, + "learning_rate": 3.161359524272589e-05, + "loss": 0.5562, + "step": 124790 + }, + { + "epoch": 1.1032726886967592, + "grad_norm": 7.531303882598877, + "learning_rate": 3.1612121855054016e-05, + "loss": 0.6974, + "step": 124800 + }, + { + "epoch": 1.1033610919570713, + "grad_norm": 2.629441499710083, + "learning_rate": 3.1610648467382145e-05, + "loss": 0.8278, + "step": 124810 + }, + { + "epoch": 1.1034494952173837, + "grad_norm": 1.5622435808181763, + "learning_rate": 3.160917507971027e-05, + "loss": 0.6904, + "step": 124820 + }, + { + "epoch": 1.1035378984776958, + "grad_norm": 2.1056606769561768, + "learning_rate": 3.160770169203841e-05, + "loss": 0.5995, + "step": 124830 + }, + { + "epoch": 1.1036263017380081, + "grad_norm": 5.661498546600342, + "learning_rate": 3.160622830436653e-05, + "loss": 0.6521, + "step": 124840 + }, + { + "epoch": 1.1037147049983203, + "grad_norm": 5.302452564239502, + "learning_rate": 3.1604754916694665e-05, + "loss": 0.575, + "step": 124850 + }, + { + "epoch": 1.1038031082586326, + "grad_norm": 3.152299165725708, + "learning_rate": 3.160328152902279e-05, + "loss": 0.6587, + "step": 124860 + }, + { + "epoch": 1.103891511518945, + "grad_norm": 5.2222490310668945, + "learning_rate": 3.160180814135092e-05, + "loss": 0.626, + "step": 124870 + }, + { + "epoch": 1.103979914779257, + "grad_norm": 4.738489151000977, + "learning_rate": 3.160033475367905e-05, + "loss": 0.6006, + "step": 124880 + }, + { + "epoch": 1.1040683180395694, + "grad_norm": 6.9758734703063965, + "learning_rate": 3.1598861366007185e-05, + "loss": 0.6588, + "step": 124890 + }, + { + "epoch": 1.1041567212998815, + "grad_norm": 4.444128036499023, + "learning_rate": 3.159738797833531e-05, + "loss": 0.51, + "step": 124900 + }, + { + "epoch": 1.1042451245601939, + "grad_norm": 2.0373830795288086, + "learning_rate": 3.159591459066344e-05, + "loss": 0.5792, + "step": 124910 + }, + { + "epoch": 1.104333527820506, + "grad_norm": 3.0136606693267822, + "learning_rate": 3.159444120299157e-05, + "loss": 0.4443, + "step": 124920 + }, + { + "epoch": 1.1044219310808183, + "grad_norm": 4.708420276641846, + "learning_rate": 3.15929678153197e-05, + "loss": 0.6843, + "step": 124930 + }, + { + "epoch": 1.1045103343411304, + "grad_norm": 6.26500129699707, + "learning_rate": 3.159149442764783e-05, + "loss": 0.5731, + "step": 124940 + }, + { + "epoch": 1.1045987376014428, + "grad_norm": 6.592031955718994, + "learning_rate": 3.1590021039975955e-05, + "loss": 0.7107, + "step": 124950 + }, + { + "epoch": 1.104687140861755, + "grad_norm": 3.4746623039245605, + "learning_rate": 3.1588547652304084e-05, + "loss": 0.6429, + "step": 124960 + }, + { + "epoch": 1.1047755441220672, + "grad_norm": 4.6872782707214355, + "learning_rate": 3.158707426463222e-05, + "loss": 0.735, + "step": 124970 + }, + { + "epoch": 1.1048639473823794, + "grad_norm": 1.2400972843170166, + "learning_rate": 3.158560087696034e-05, + "loss": 0.5657, + "step": 124980 + }, + { + "epoch": 1.1049523506426917, + "grad_norm": 7.700628757476807, + "learning_rate": 3.1584127489288475e-05, + "loss": 0.6607, + "step": 124990 + }, + { + "epoch": 1.1050407539030038, + "grad_norm": 6.96332311630249, + "learning_rate": 3.1582654101616604e-05, + "loss": 0.733, + "step": 125000 + }, + { + "epoch": 1.1051291571633162, + "grad_norm": 2.716226100921631, + "learning_rate": 3.158118071394473e-05, + "loss": 0.6108, + "step": 125010 + }, + { + "epoch": 1.1052175604236285, + "grad_norm": 1.9164835214614868, + "learning_rate": 3.157970732627286e-05, + "loss": 0.4986, + "step": 125020 + }, + { + "epoch": 1.1053059636839406, + "grad_norm": 5.984572410583496, + "learning_rate": 3.1578233938600996e-05, + "loss": 0.5988, + "step": 125030 + }, + { + "epoch": 1.105394366944253, + "grad_norm": 9.276653289794922, + "learning_rate": 3.157676055092912e-05, + "loss": 0.7256, + "step": 125040 + }, + { + "epoch": 1.105482770204565, + "grad_norm": 4.372081756591797, + "learning_rate": 3.157528716325725e-05, + "loss": 0.6523, + "step": 125050 + }, + { + "epoch": 1.1055711734648774, + "grad_norm": 4.581477165222168, + "learning_rate": 3.1573813775585374e-05, + "loss": 0.6441, + "step": 125060 + }, + { + "epoch": 1.1056595767251896, + "grad_norm": 1.8464874029159546, + "learning_rate": 3.157234038791351e-05, + "loss": 0.5518, + "step": 125070 + }, + { + "epoch": 1.105747979985502, + "grad_norm": 1.789445161819458, + "learning_rate": 3.157086700024164e-05, + "loss": 0.6388, + "step": 125080 + }, + { + "epoch": 1.105836383245814, + "grad_norm": 2.8823423385620117, + "learning_rate": 3.1569393612569766e-05, + "loss": 0.6962, + "step": 125090 + }, + { + "epoch": 1.1059247865061264, + "grad_norm": 2.493934392929077, + "learning_rate": 3.1567920224897894e-05, + "loss": 0.5916, + "step": 125100 + }, + { + "epoch": 1.1060131897664385, + "grad_norm": 2.5145621299743652, + "learning_rate": 3.156644683722603e-05, + "loss": 0.5304, + "step": 125110 + }, + { + "epoch": 1.1061015930267508, + "grad_norm": 2.5019047260284424, + "learning_rate": 3.156497344955415e-05, + "loss": 0.5764, + "step": 125120 + }, + { + "epoch": 1.1061899962870632, + "grad_norm": 7.4171319007873535, + "learning_rate": 3.1563500061882286e-05, + "loss": 0.5856, + "step": 125130 + }, + { + "epoch": 1.1062783995473753, + "grad_norm": 3.224748373031616, + "learning_rate": 3.1562026674210414e-05, + "loss": 0.5969, + "step": 125140 + }, + { + "epoch": 1.1063668028076876, + "grad_norm": 1.4299430847167969, + "learning_rate": 3.156055328653854e-05, + "loss": 0.618, + "step": 125150 + }, + { + "epoch": 1.1064552060679997, + "grad_norm": 23.758073806762695, + "learning_rate": 3.155907989886667e-05, + "loss": 0.6106, + "step": 125160 + }, + { + "epoch": 1.106543609328312, + "grad_norm": 3.710561513900757, + "learning_rate": 3.15576065111948e-05, + "loss": 0.635, + "step": 125170 + }, + { + "epoch": 1.1066320125886242, + "grad_norm": 2.1364290714263916, + "learning_rate": 3.155613312352293e-05, + "loss": 0.5749, + "step": 125180 + }, + { + "epoch": 1.1067204158489365, + "grad_norm": 3.6300723552703857, + "learning_rate": 3.155465973585106e-05, + "loss": 0.5514, + "step": 125190 + }, + { + "epoch": 1.1068088191092487, + "grad_norm": 1.935792088508606, + "learning_rate": 3.1553186348179184e-05, + "loss": 0.4761, + "step": 125200 + }, + { + "epoch": 1.106897222369561, + "grad_norm": 2.8274099826812744, + "learning_rate": 3.155171296050732e-05, + "loss": 0.6517, + "step": 125210 + }, + { + "epoch": 1.1069856256298731, + "grad_norm": 1.3117761611938477, + "learning_rate": 3.155023957283545e-05, + "loss": 0.5541, + "step": 125220 + }, + { + "epoch": 1.1070740288901855, + "grad_norm": 1.588179111480713, + "learning_rate": 3.1548766185163576e-05, + "loss": 0.6889, + "step": 125230 + }, + { + "epoch": 1.1071624321504978, + "grad_norm": 1.6698071956634521, + "learning_rate": 3.1547292797491705e-05, + "loss": 0.5234, + "step": 125240 + }, + { + "epoch": 1.10725083541081, + "grad_norm": 1.8297425508499146, + "learning_rate": 3.154581940981984e-05, + "loss": 0.5914, + "step": 125250 + }, + { + "epoch": 1.1073392386711223, + "grad_norm": 3.630329132080078, + "learning_rate": 3.154434602214796e-05, + "loss": 0.7428, + "step": 125260 + }, + { + "epoch": 1.1074276419314344, + "grad_norm": 2.250415086746216, + "learning_rate": 3.1542872634476097e-05, + "loss": 0.5794, + "step": 125270 + }, + { + "epoch": 1.1075160451917467, + "grad_norm": 1.8166521787643433, + "learning_rate": 3.1541399246804225e-05, + "loss": 0.5565, + "step": 125280 + }, + { + "epoch": 1.1076044484520589, + "grad_norm": 2.053131341934204, + "learning_rate": 3.153992585913235e-05, + "loss": 0.5952, + "step": 125290 + }, + { + "epoch": 1.1076928517123712, + "grad_norm": 3.1434459686279297, + "learning_rate": 3.153845247146048e-05, + "loss": 0.6141, + "step": 125300 + }, + { + "epoch": 1.1077812549726833, + "grad_norm": 1.8760188817977905, + "learning_rate": 3.153697908378861e-05, + "loss": 0.6268, + "step": 125310 + }, + { + "epoch": 1.1078696582329957, + "grad_norm": 2.238780975341797, + "learning_rate": 3.153550569611674e-05, + "loss": 0.6903, + "step": 125320 + }, + { + "epoch": 1.1079580614933078, + "grad_norm": 0.7598658204078674, + "learning_rate": 3.1534032308444873e-05, + "loss": 0.6129, + "step": 125330 + }, + { + "epoch": 1.1080464647536201, + "grad_norm": 6.6462273597717285, + "learning_rate": 3.1532558920772995e-05, + "loss": 0.5791, + "step": 125340 + }, + { + "epoch": 1.1081348680139325, + "grad_norm": 5.928302764892578, + "learning_rate": 3.153108553310113e-05, + "loss": 0.5833, + "step": 125350 + }, + { + "epoch": 1.1082232712742446, + "grad_norm": 8.749913215637207, + "learning_rate": 3.152961214542926e-05, + "loss": 0.789, + "step": 125360 + }, + { + "epoch": 1.108311674534557, + "grad_norm": 3.7648115158081055, + "learning_rate": 3.152813875775739e-05, + "loss": 0.6107, + "step": 125370 + }, + { + "epoch": 1.108400077794869, + "grad_norm": 4.305647850036621, + "learning_rate": 3.1526665370085515e-05, + "loss": 0.6218, + "step": 125380 + }, + { + "epoch": 1.1084884810551814, + "grad_norm": 1.4181492328643799, + "learning_rate": 3.152519198241365e-05, + "loss": 0.6031, + "step": 125390 + }, + { + "epoch": 1.1085768843154935, + "grad_norm": 1.7940001487731934, + "learning_rate": 3.152371859474177e-05, + "loss": 0.5557, + "step": 125400 + }, + { + "epoch": 1.1086652875758058, + "grad_norm": 1.1032383441925049, + "learning_rate": 3.152224520706991e-05, + "loss": 0.629, + "step": 125410 + }, + { + "epoch": 1.108753690836118, + "grad_norm": 2.5060174465179443, + "learning_rate": 3.152077181939803e-05, + "loss": 0.5312, + "step": 125420 + }, + { + "epoch": 1.1088420940964303, + "grad_norm": 1.2104655504226685, + "learning_rate": 3.1519298431726164e-05, + "loss": 0.5527, + "step": 125430 + }, + { + "epoch": 1.1089304973567424, + "grad_norm": 5.633395195007324, + "learning_rate": 3.151782504405429e-05, + "loss": 0.5279, + "step": 125440 + }, + { + "epoch": 1.1090189006170548, + "grad_norm": 3.9315385818481445, + "learning_rate": 3.151635165638242e-05, + "loss": 0.546, + "step": 125450 + }, + { + "epoch": 1.1091073038773671, + "grad_norm": 1.255246877670288, + "learning_rate": 3.151487826871055e-05, + "loss": 0.5867, + "step": 125460 + }, + { + "epoch": 1.1091957071376792, + "grad_norm": 1.3848497867584229, + "learning_rate": 3.1513404881038684e-05, + "loss": 0.5623, + "step": 125470 + }, + { + "epoch": 1.1092841103979916, + "grad_norm": 3.3597869873046875, + "learning_rate": 3.1511931493366805e-05, + "loss": 0.6361, + "step": 125480 + }, + { + "epoch": 1.1093725136583037, + "grad_norm": 3.975618362426758, + "learning_rate": 3.151045810569494e-05, + "loss": 0.5909, + "step": 125490 + }, + { + "epoch": 1.109460916918616, + "grad_norm": 12.802530288696289, + "learning_rate": 3.150898471802307e-05, + "loss": 0.5648, + "step": 125500 + }, + { + "epoch": 1.1095493201789282, + "grad_norm": 2.30096435546875, + "learning_rate": 3.15075113303512e-05, + "loss": 0.5542, + "step": 125510 + }, + { + "epoch": 1.1096377234392405, + "grad_norm": 8.658194541931152, + "learning_rate": 3.1506037942679326e-05, + "loss": 0.7332, + "step": 125520 + }, + { + "epoch": 1.1097261266995526, + "grad_norm": 13.195569038391113, + "learning_rate": 3.1504564555007454e-05, + "loss": 0.5939, + "step": 125530 + }, + { + "epoch": 1.109814529959865, + "grad_norm": 3.1111979484558105, + "learning_rate": 3.150309116733558e-05, + "loss": 0.6388, + "step": 125540 + }, + { + "epoch": 1.109902933220177, + "grad_norm": 4.646438121795654, + "learning_rate": 3.150161777966372e-05, + "loss": 0.5267, + "step": 125550 + }, + { + "epoch": 1.1099913364804894, + "grad_norm": 16.754776000976562, + "learning_rate": 3.1500144391991846e-05, + "loss": 0.604, + "step": 125560 + }, + { + "epoch": 1.1100797397408015, + "grad_norm": 2.3517396450042725, + "learning_rate": 3.1498671004319974e-05, + "loss": 0.5682, + "step": 125570 + }, + { + "epoch": 1.1101681430011139, + "grad_norm": 1.9671168327331543, + "learning_rate": 3.14971976166481e-05, + "loss": 0.6677, + "step": 125580 + }, + { + "epoch": 1.1102565462614262, + "grad_norm": 3.8956871032714844, + "learning_rate": 3.149572422897623e-05, + "loss": 0.5748, + "step": 125590 + }, + { + "epoch": 1.1103449495217383, + "grad_norm": 2.6497509479522705, + "learning_rate": 3.149425084130436e-05, + "loss": 0.5924, + "step": 125600 + }, + { + "epoch": 1.1104333527820507, + "grad_norm": 3.2031991481781006, + "learning_rate": 3.1492777453632494e-05, + "loss": 0.6475, + "step": 125610 + }, + { + "epoch": 1.1105217560423628, + "grad_norm": 4.9300336837768555, + "learning_rate": 3.149130406596062e-05, + "loss": 0.6979, + "step": 125620 + }, + { + "epoch": 1.1106101593026751, + "grad_norm": 10.219849586486816, + "learning_rate": 3.148983067828875e-05, + "loss": 0.6738, + "step": 125630 + }, + { + "epoch": 1.1106985625629873, + "grad_norm": 4.082695960998535, + "learning_rate": 3.148835729061688e-05, + "loss": 0.5604, + "step": 125640 + }, + { + "epoch": 1.1107869658232996, + "grad_norm": 1.997944712638855, + "learning_rate": 3.148688390294501e-05, + "loss": 0.6651, + "step": 125650 + }, + { + "epoch": 1.1108753690836117, + "grad_norm": 6.413634777069092, + "learning_rate": 3.1485410515273136e-05, + "loss": 0.4856, + "step": 125660 + }, + { + "epoch": 1.110963772343924, + "grad_norm": 1.8265559673309326, + "learning_rate": 3.1483937127601265e-05, + "loss": 0.5785, + "step": 125670 + }, + { + "epoch": 1.1110521756042362, + "grad_norm": 1.9175828695297241, + "learning_rate": 3.14824637399294e-05, + "loss": 0.5205, + "step": 125680 + }, + { + "epoch": 1.1111405788645485, + "grad_norm": 16.259056091308594, + "learning_rate": 3.148099035225753e-05, + "loss": 0.5791, + "step": 125690 + }, + { + "epoch": 1.1112289821248607, + "grad_norm": 7.223111152648926, + "learning_rate": 3.1479516964585656e-05, + "loss": 0.5815, + "step": 125700 + }, + { + "epoch": 1.111317385385173, + "grad_norm": 1.3366649150848389, + "learning_rate": 3.1478043576913785e-05, + "loss": 0.6147, + "step": 125710 + }, + { + "epoch": 1.1114057886454853, + "grad_norm": 3.212587356567383, + "learning_rate": 3.147657018924191e-05, + "loss": 0.7404, + "step": 125720 + }, + { + "epoch": 1.1114941919057975, + "grad_norm": 2.8107106685638428, + "learning_rate": 3.147509680157004e-05, + "loss": 0.6225, + "step": 125730 + }, + { + "epoch": 1.1115825951661098, + "grad_norm": 1.9735573530197144, + "learning_rate": 3.1473623413898177e-05, + "loss": 0.69, + "step": 125740 + }, + { + "epoch": 1.111670998426422, + "grad_norm": 1.473301887512207, + "learning_rate": 3.1472150026226305e-05, + "loss": 0.5122, + "step": 125750 + }, + { + "epoch": 1.1117594016867343, + "grad_norm": 3.5634231567382812, + "learning_rate": 3.147067663855443e-05, + "loss": 0.6575, + "step": 125760 + }, + { + "epoch": 1.1118478049470464, + "grad_norm": 2.6562602519989014, + "learning_rate": 3.146920325088256e-05, + "loss": 0.5978, + "step": 125770 + }, + { + "epoch": 1.1119362082073587, + "grad_norm": 6.751959800720215, + "learning_rate": 3.146772986321069e-05, + "loss": 0.6665, + "step": 125780 + }, + { + "epoch": 1.1120246114676708, + "grad_norm": 1.9707025289535522, + "learning_rate": 3.146625647553882e-05, + "loss": 0.6024, + "step": 125790 + }, + { + "epoch": 1.1121130147279832, + "grad_norm": 9.072181701660156, + "learning_rate": 3.1464783087866953e-05, + "loss": 0.7038, + "step": 125800 + }, + { + "epoch": 1.1122014179882953, + "grad_norm": 2.1578590869903564, + "learning_rate": 3.1463309700195075e-05, + "loss": 0.584, + "step": 125810 + }, + { + "epoch": 1.1122898212486076, + "grad_norm": 2.9197070598602295, + "learning_rate": 3.146183631252321e-05, + "loss": 0.4763, + "step": 125820 + }, + { + "epoch": 1.11237822450892, + "grad_norm": 3.019852638244629, + "learning_rate": 3.146036292485134e-05, + "loss": 0.7298, + "step": 125830 + }, + { + "epoch": 1.112466627769232, + "grad_norm": 1.4384427070617676, + "learning_rate": 3.145888953717947e-05, + "loss": 0.7194, + "step": 125840 + }, + { + "epoch": 1.1125550310295445, + "grad_norm": 3.314427614212036, + "learning_rate": 3.1457416149507595e-05, + "loss": 0.5566, + "step": 125850 + }, + { + "epoch": 1.1126434342898566, + "grad_norm": 1.7390302419662476, + "learning_rate": 3.145594276183573e-05, + "loss": 0.6769, + "step": 125860 + }, + { + "epoch": 1.112731837550169, + "grad_norm": 3.1712758541107178, + "learning_rate": 3.145446937416385e-05, + "loss": 0.5637, + "step": 125870 + }, + { + "epoch": 1.112820240810481, + "grad_norm": 3.294419527053833, + "learning_rate": 3.145299598649199e-05, + "loss": 0.633, + "step": 125880 + }, + { + "epoch": 1.1129086440707934, + "grad_norm": 3.6876602172851562, + "learning_rate": 3.145152259882011e-05, + "loss": 0.6303, + "step": 125890 + }, + { + "epoch": 1.1129970473311055, + "grad_norm": 1.4310834407806396, + "learning_rate": 3.1450049211148244e-05, + "loss": 0.7808, + "step": 125900 + }, + { + "epoch": 1.1130854505914178, + "grad_norm": 1.7381114959716797, + "learning_rate": 3.144857582347637e-05, + "loss": 0.711, + "step": 125910 + }, + { + "epoch": 1.11317385385173, + "grad_norm": 7.087277412414551, + "learning_rate": 3.14471024358045e-05, + "loss": 0.5593, + "step": 125920 + }, + { + "epoch": 1.1132622571120423, + "grad_norm": 9.185232162475586, + "learning_rate": 3.144562904813263e-05, + "loss": 0.6842, + "step": 125930 + }, + { + "epoch": 1.1133506603723546, + "grad_norm": 6.922568321228027, + "learning_rate": 3.1444155660460764e-05, + "loss": 0.732, + "step": 125940 + }, + { + "epoch": 1.1134390636326668, + "grad_norm": 2.6085684299468994, + "learning_rate": 3.1442682272788886e-05, + "loss": 0.5931, + "step": 125950 + }, + { + "epoch": 1.113527466892979, + "grad_norm": 13.312128067016602, + "learning_rate": 3.144120888511702e-05, + "loss": 0.5794, + "step": 125960 + }, + { + "epoch": 1.1136158701532912, + "grad_norm": 0.8146817088127136, + "learning_rate": 3.143973549744515e-05, + "loss": 0.6911, + "step": 125970 + }, + { + "epoch": 1.1137042734136036, + "grad_norm": 2.7889797687530518, + "learning_rate": 3.143826210977328e-05, + "loss": 0.5913, + "step": 125980 + }, + { + "epoch": 1.1137926766739157, + "grad_norm": 2.0145294666290283, + "learning_rate": 3.1436788722101406e-05, + "loss": 0.6094, + "step": 125990 + }, + { + "epoch": 1.113881079934228, + "grad_norm": 8.28034782409668, + "learning_rate": 3.1435315334429534e-05, + "loss": 0.4998, + "step": 126000 + }, + { + "epoch": 1.1139694831945401, + "grad_norm": 14.785907745361328, + "learning_rate": 3.143384194675766e-05, + "loss": 0.6574, + "step": 126010 + }, + { + "epoch": 1.1140578864548525, + "grad_norm": 2.148918628692627, + "learning_rate": 3.14323685590858e-05, + "loss": 0.508, + "step": 126020 + }, + { + "epoch": 1.1141462897151646, + "grad_norm": 2.3937630653381348, + "learning_rate": 3.143089517141392e-05, + "loss": 0.608, + "step": 126030 + }, + { + "epoch": 1.114234692975477, + "grad_norm": 5.709342956542969, + "learning_rate": 3.1429421783742054e-05, + "loss": 0.6393, + "step": 126040 + }, + { + "epoch": 1.1143230962357893, + "grad_norm": 5.312056541442871, + "learning_rate": 3.142794839607018e-05, + "loss": 0.6536, + "step": 126050 + }, + { + "epoch": 1.1144114994961014, + "grad_norm": 2.5917561054229736, + "learning_rate": 3.142647500839831e-05, + "loss": 0.5768, + "step": 126060 + }, + { + "epoch": 1.1144999027564138, + "grad_norm": 2.8331007957458496, + "learning_rate": 3.142500162072644e-05, + "loss": 0.6252, + "step": 126070 + }, + { + "epoch": 1.1145883060167259, + "grad_norm": 1.2347944974899292, + "learning_rate": 3.1423528233054574e-05, + "loss": 0.7996, + "step": 126080 + }, + { + "epoch": 1.1146767092770382, + "grad_norm": 8.965507507324219, + "learning_rate": 3.1422054845382696e-05, + "loss": 0.57, + "step": 126090 + }, + { + "epoch": 1.1147651125373503, + "grad_norm": 8.337733268737793, + "learning_rate": 3.142058145771083e-05, + "loss": 0.6087, + "step": 126100 + }, + { + "epoch": 1.1148535157976627, + "grad_norm": 1.861345887184143, + "learning_rate": 3.141910807003896e-05, + "loss": 0.5793, + "step": 126110 + }, + { + "epoch": 1.1149419190579748, + "grad_norm": 12.161194801330566, + "learning_rate": 3.141763468236709e-05, + "loss": 0.5958, + "step": 126120 + }, + { + "epoch": 1.1150303223182871, + "grad_norm": 1.917739748954773, + "learning_rate": 3.1416161294695216e-05, + "loss": 0.5545, + "step": 126130 + }, + { + "epoch": 1.1151187255785993, + "grad_norm": 2.5260403156280518, + "learning_rate": 3.1414687907023345e-05, + "loss": 0.5566, + "step": 126140 + }, + { + "epoch": 1.1152071288389116, + "grad_norm": 4.833311557769775, + "learning_rate": 3.141321451935147e-05, + "loss": 0.7167, + "step": 126150 + }, + { + "epoch": 1.1152955320992237, + "grad_norm": 2.2427449226379395, + "learning_rate": 3.141174113167961e-05, + "loss": 0.5197, + "step": 126160 + }, + { + "epoch": 1.115383935359536, + "grad_norm": 0.6076095700263977, + "learning_rate": 3.141026774400773e-05, + "loss": 0.5656, + "step": 126170 + }, + { + "epoch": 1.1154723386198484, + "grad_norm": 0.5763596296310425, + "learning_rate": 3.1408794356335865e-05, + "loss": 0.6521, + "step": 126180 + }, + { + "epoch": 1.1155607418801605, + "grad_norm": 2.0882694721221924, + "learning_rate": 3.140732096866399e-05, + "loss": 0.5895, + "step": 126190 + }, + { + "epoch": 1.1156491451404729, + "grad_norm": 2.8082191944122314, + "learning_rate": 3.140584758099212e-05, + "loss": 0.7213, + "step": 126200 + }, + { + "epoch": 1.115737548400785, + "grad_norm": 2.7710788249969482, + "learning_rate": 3.140437419332025e-05, + "loss": 0.6231, + "step": 126210 + }, + { + "epoch": 1.1158259516610973, + "grad_norm": 7.6033477783203125, + "learning_rate": 3.1402900805648385e-05, + "loss": 0.6203, + "step": 126220 + }, + { + "epoch": 1.1159143549214094, + "grad_norm": 2.884514093399048, + "learning_rate": 3.1401427417976507e-05, + "loss": 0.7096, + "step": 126230 + }, + { + "epoch": 1.1160027581817218, + "grad_norm": 1.7271358966827393, + "learning_rate": 3.139995403030464e-05, + "loss": 0.5324, + "step": 126240 + }, + { + "epoch": 1.116091161442034, + "grad_norm": 1.718198537826538, + "learning_rate": 3.139848064263276e-05, + "loss": 0.6442, + "step": 126250 + }, + { + "epoch": 1.1161795647023462, + "grad_norm": 1.9957566261291504, + "learning_rate": 3.13970072549609e-05, + "loss": 0.5736, + "step": 126260 + }, + { + "epoch": 1.1162679679626584, + "grad_norm": 5.794721603393555, + "learning_rate": 3.139553386728903e-05, + "loss": 0.7412, + "step": 126270 + }, + { + "epoch": 1.1163563712229707, + "grad_norm": 3.4659881591796875, + "learning_rate": 3.1394060479617155e-05, + "loss": 0.5739, + "step": 126280 + }, + { + "epoch": 1.1164447744832828, + "grad_norm": 5.785071849822998, + "learning_rate": 3.1392587091945283e-05, + "loss": 0.5969, + "step": 126290 + }, + { + "epoch": 1.1165331777435952, + "grad_norm": 3.6061861515045166, + "learning_rate": 3.139111370427342e-05, + "loss": 0.62, + "step": 126300 + }, + { + "epoch": 1.1166215810039075, + "grad_norm": 1.406785011291504, + "learning_rate": 3.138964031660154e-05, + "loss": 0.7136, + "step": 126310 + }, + { + "epoch": 1.1167099842642196, + "grad_norm": 2.6116490364074707, + "learning_rate": 3.1388166928929675e-05, + "loss": 0.5932, + "step": 126320 + }, + { + "epoch": 1.116798387524532, + "grad_norm": 1.9786701202392578, + "learning_rate": 3.1386693541257804e-05, + "loss": 0.734, + "step": 126330 + }, + { + "epoch": 1.116886790784844, + "grad_norm": 1.6138758659362793, + "learning_rate": 3.138522015358593e-05, + "loss": 0.5513, + "step": 126340 + }, + { + "epoch": 1.1169751940451564, + "grad_norm": 1.5725356340408325, + "learning_rate": 3.138374676591406e-05, + "loss": 0.5476, + "step": 126350 + }, + { + "epoch": 1.1170635973054686, + "grad_norm": 8.241376876831055, + "learning_rate": 3.138227337824219e-05, + "loss": 0.4881, + "step": 126360 + }, + { + "epoch": 1.117152000565781, + "grad_norm": 1.2967334985733032, + "learning_rate": 3.138079999057032e-05, + "loss": 0.5908, + "step": 126370 + }, + { + "epoch": 1.117240403826093, + "grad_norm": 2.218273401260376, + "learning_rate": 3.137932660289845e-05, + "loss": 0.6395, + "step": 126380 + }, + { + "epoch": 1.1173288070864054, + "grad_norm": 2.249335289001465, + "learning_rate": 3.1377853215226574e-05, + "loss": 0.5609, + "step": 126390 + }, + { + "epoch": 1.1174172103467175, + "grad_norm": 1.0351797342300415, + "learning_rate": 3.137637982755471e-05, + "loss": 0.5399, + "step": 126400 + }, + { + "epoch": 1.1175056136070298, + "grad_norm": 5.871886253356934, + "learning_rate": 3.137490643988284e-05, + "loss": 0.6555, + "step": 126410 + }, + { + "epoch": 1.1175940168673422, + "grad_norm": 2.120114326477051, + "learning_rate": 3.1373433052210966e-05, + "loss": 0.4715, + "step": 126420 + }, + { + "epoch": 1.1176824201276543, + "grad_norm": 2.8516831398010254, + "learning_rate": 3.1371959664539094e-05, + "loss": 0.568, + "step": 126430 + }, + { + "epoch": 1.1177708233879666, + "grad_norm": 4.082274436950684, + "learning_rate": 3.137048627686723e-05, + "loss": 0.6649, + "step": 126440 + }, + { + "epoch": 1.1178592266482787, + "grad_norm": 2.897552251815796, + "learning_rate": 3.136901288919535e-05, + "loss": 0.6458, + "step": 126450 + }, + { + "epoch": 1.117947629908591, + "grad_norm": 5.313582420349121, + "learning_rate": 3.1367539501523486e-05, + "loss": 0.626, + "step": 126460 + }, + { + "epoch": 1.1180360331689032, + "grad_norm": 1.5733404159545898, + "learning_rate": 3.1366066113851614e-05, + "loss": 0.5666, + "step": 126470 + }, + { + "epoch": 1.1181244364292156, + "grad_norm": 1.1127409934997559, + "learning_rate": 3.136459272617974e-05, + "loss": 0.5631, + "step": 126480 + }, + { + "epoch": 1.1182128396895277, + "grad_norm": 0.7725486159324646, + "learning_rate": 3.136311933850787e-05, + "loss": 0.5232, + "step": 126490 + }, + { + "epoch": 1.11830124294984, + "grad_norm": 9.888276100158691, + "learning_rate": 3.1361645950836e-05, + "loss": 0.6731, + "step": 126500 + }, + { + "epoch": 1.1183896462101521, + "grad_norm": 2.0208230018615723, + "learning_rate": 3.136017256316413e-05, + "loss": 0.6906, + "step": 126510 + }, + { + "epoch": 1.1184780494704645, + "grad_norm": 4.282047748565674, + "learning_rate": 3.135869917549226e-05, + "loss": 0.6936, + "step": 126520 + }, + { + "epoch": 1.1185664527307768, + "grad_norm": 7.722494602203369, + "learning_rate": 3.135722578782039e-05, + "loss": 0.6546, + "step": 126530 + }, + { + "epoch": 1.118654855991089, + "grad_norm": 1.8932750225067139, + "learning_rate": 3.135575240014852e-05, + "loss": 0.679, + "step": 126540 + }, + { + "epoch": 1.1187432592514013, + "grad_norm": 3.4825494289398193, + "learning_rate": 3.135427901247665e-05, + "loss": 0.5792, + "step": 126550 + }, + { + "epoch": 1.1188316625117134, + "grad_norm": 3.8179244995117188, + "learning_rate": 3.1352805624804776e-05, + "loss": 0.7298, + "step": 126560 + }, + { + "epoch": 1.1189200657720257, + "grad_norm": 11.020949363708496, + "learning_rate": 3.1351332237132904e-05, + "loss": 0.7908, + "step": 126570 + }, + { + "epoch": 1.1190084690323379, + "grad_norm": 2.8973608016967773, + "learning_rate": 3.134985884946104e-05, + "loss": 0.6241, + "step": 126580 + }, + { + "epoch": 1.1190968722926502, + "grad_norm": 15.375143051147461, + "learning_rate": 3.134838546178917e-05, + "loss": 0.6852, + "step": 126590 + }, + { + "epoch": 1.1191852755529623, + "grad_norm": 2.3051064014434814, + "learning_rate": 3.1346912074117296e-05, + "loss": 0.5439, + "step": 126600 + }, + { + "epoch": 1.1192736788132747, + "grad_norm": 1.1505138874053955, + "learning_rate": 3.1345438686445425e-05, + "loss": 0.7202, + "step": 126610 + }, + { + "epoch": 1.1193620820735868, + "grad_norm": 3.004833698272705, + "learning_rate": 3.134396529877355e-05, + "loss": 0.6213, + "step": 126620 + }, + { + "epoch": 1.1194504853338991, + "grad_norm": 5.324898719787598, + "learning_rate": 3.134249191110168e-05, + "loss": 0.512, + "step": 126630 + }, + { + "epoch": 1.1195388885942115, + "grad_norm": 1.4163658618927002, + "learning_rate": 3.134101852342981e-05, + "loss": 0.6827, + "step": 126640 + }, + { + "epoch": 1.1196272918545236, + "grad_norm": 1.3405177593231201, + "learning_rate": 3.1339545135757945e-05, + "loss": 0.69, + "step": 126650 + }, + { + "epoch": 1.119715695114836, + "grad_norm": 3.2913944721221924, + "learning_rate": 3.133807174808607e-05, + "loss": 0.7064, + "step": 126660 + }, + { + "epoch": 1.119804098375148, + "grad_norm": 0.9808671474456787, + "learning_rate": 3.13365983604142e-05, + "loss": 0.5889, + "step": 126670 + }, + { + "epoch": 1.1198925016354604, + "grad_norm": 2.1487343311309814, + "learning_rate": 3.133512497274233e-05, + "loss": 0.6175, + "step": 126680 + }, + { + "epoch": 1.1199809048957725, + "grad_norm": 2.7274739742279053, + "learning_rate": 3.133365158507046e-05, + "loss": 0.6296, + "step": 126690 + }, + { + "epoch": 1.1200693081560849, + "grad_norm": 0.8556886315345764, + "learning_rate": 3.133217819739859e-05, + "loss": 0.6017, + "step": 126700 + }, + { + "epoch": 1.120157711416397, + "grad_norm": 1.6036845445632935, + "learning_rate": 3.133070480972672e-05, + "loss": 0.6004, + "step": 126710 + }, + { + "epoch": 1.1202461146767093, + "grad_norm": 7.731258869171143, + "learning_rate": 3.132923142205484e-05, + "loss": 0.6642, + "step": 126720 + }, + { + "epoch": 1.1203345179370214, + "grad_norm": 1.877601146697998, + "learning_rate": 3.132775803438298e-05, + "loss": 0.7328, + "step": 126730 + }, + { + "epoch": 1.1204229211973338, + "grad_norm": 3.8008337020874023, + "learning_rate": 3.132628464671111e-05, + "loss": 0.6508, + "step": 126740 + }, + { + "epoch": 1.120511324457646, + "grad_norm": 2.1105246543884277, + "learning_rate": 3.1324811259039235e-05, + "loss": 0.6142, + "step": 126750 + }, + { + "epoch": 1.1205997277179582, + "grad_norm": 3.3778750896453857, + "learning_rate": 3.1323337871367364e-05, + "loss": 0.6446, + "step": 126760 + }, + { + "epoch": 1.1206881309782706, + "grad_norm": 3.2779958248138428, + "learning_rate": 3.13218644836955e-05, + "loss": 0.6537, + "step": 126770 + }, + { + "epoch": 1.1207765342385827, + "grad_norm": 6.633204936981201, + "learning_rate": 3.132039109602362e-05, + "loss": 0.5846, + "step": 126780 + }, + { + "epoch": 1.120864937498895, + "grad_norm": 10.813836097717285, + "learning_rate": 3.1318917708351755e-05, + "loss": 0.6043, + "step": 126790 + }, + { + "epoch": 1.1209533407592072, + "grad_norm": 4.237395286560059, + "learning_rate": 3.1317444320679884e-05, + "loss": 0.7342, + "step": 126800 + }, + { + "epoch": 1.1210417440195195, + "grad_norm": 4.222539901733398, + "learning_rate": 3.131597093300801e-05, + "loss": 0.5849, + "step": 126810 + }, + { + "epoch": 1.1211301472798316, + "grad_norm": 0.974381685256958, + "learning_rate": 3.131449754533614e-05, + "loss": 0.5039, + "step": 126820 + }, + { + "epoch": 1.121218550540144, + "grad_norm": 7.4011616706848145, + "learning_rate": 3.131302415766427e-05, + "loss": 0.5862, + "step": 126830 + }, + { + "epoch": 1.121306953800456, + "grad_norm": 3.8008170127868652, + "learning_rate": 3.13115507699924e-05, + "loss": 0.5368, + "step": 126840 + }, + { + "epoch": 1.1213953570607684, + "grad_norm": 4.988938808441162, + "learning_rate": 3.131007738232053e-05, + "loss": 0.5801, + "step": 126850 + }, + { + "epoch": 1.1214837603210805, + "grad_norm": 1.0936565399169922, + "learning_rate": 3.1308603994648654e-05, + "loss": 0.5127, + "step": 126860 + }, + { + "epoch": 1.121572163581393, + "grad_norm": 2.333862781524658, + "learning_rate": 3.130713060697679e-05, + "loss": 0.6937, + "step": 126870 + }, + { + "epoch": 1.121660566841705, + "grad_norm": 0.904512882232666, + "learning_rate": 3.130565721930492e-05, + "loss": 0.5786, + "step": 126880 + }, + { + "epoch": 1.1217489701020174, + "grad_norm": 2.9923555850982666, + "learning_rate": 3.1304183831633046e-05, + "loss": 0.522, + "step": 126890 + }, + { + "epoch": 1.1218373733623297, + "grad_norm": 11.812182426452637, + "learning_rate": 3.1302710443961174e-05, + "loss": 0.7455, + "step": 126900 + }, + { + "epoch": 1.1219257766226418, + "grad_norm": 2.903211832046509, + "learning_rate": 3.130123705628931e-05, + "loss": 0.5789, + "step": 126910 + }, + { + "epoch": 1.1220141798829542, + "grad_norm": 1.6600630283355713, + "learning_rate": 3.129976366861743e-05, + "loss": 0.5981, + "step": 126920 + }, + { + "epoch": 1.1221025831432663, + "grad_norm": 1.3097416162490845, + "learning_rate": 3.1298290280945566e-05, + "loss": 0.508, + "step": 126930 + }, + { + "epoch": 1.1221909864035786, + "grad_norm": 15.6178560256958, + "learning_rate": 3.129681689327369e-05, + "loss": 0.5887, + "step": 126940 + }, + { + "epoch": 1.1222793896638907, + "grad_norm": 1.5398316383361816, + "learning_rate": 3.129534350560182e-05, + "loss": 0.4815, + "step": 126950 + }, + { + "epoch": 1.122367792924203, + "grad_norm": 0.8039826154708862, + "learning_rate": 3.129387011792995e-05, + "loss": 0.5016, + "step": 126960 + }, + { + "epoch": 1.1224561961845152, + "grad_norm": 2.7974190711975098, + "learning_rate": 3.129239673025808e-05, + "loss": 0.6706, + "step": 126970 + }, + { + "epoch": 1.1225445994448275, + "grad_norm": 2.6173548698425293, + "learning_rate": 3.129092334258621e-05, + "loss": 0.8201, + "step": 126980 + }, + { + "epoch": 1.1226330027051397, + "grad_norm": 3.0918893814086914, + "learning_rate": 3.128944995491434e-05, + "loss": 0.6088, + "step": 126990 + }, + { + "epoch": 1.122721405965452, + "grad_norm": 2.8781216144561768, + "learning_rate": 3.1287976567242464e-05, + "loss": 0.5678, + "step": 127000 + }, + { + "epoch": 1.1228098092257643, + "grad_norm": 1.210296630859375, + "learning_rate": 3.12865031795706e-05, + "loss": 0.6697, + "step": 127010 + }, + { + "epoch": 1.1228982124860765, + "grad_norm": 2.7205042839050293, + "learning_rate": 3.128502979189873e-05, + "loss": 0.5873, + "step": 127020 + }, + { + "epoch": 1.1229866157463888, + "grad_norm": 1.9096770286560059, + "learning_rate": 3.1283556404226856e-05, + "loss": 0.6304, + "step": 127030 + }, + { + "epoch": 1.123075019006701, + "grad_norm": 1.5841436386108398, + "learning_rate": 3.1282083016554985e-05, + "loss": 0.6046, + "step": 127040 + }, + { + "epoch": 1.1231634222670133, + "grad_norm": 4.161255836486816, + "learning_rate": 3.128060962888312e-05, + "loss": 0.6643, + "step": 127050 + }, + { + "epoch": 1.1232518255273254, + "grad_norm": 5.838593482971191, + "learning_rate": 3.127913624121124e-05, + "loss": 0.6965, + "step": 127060 + }, + { + "epoch": 1.1233402287876377, + "grad_norm": 5.136905670166016, + "learning_rate": 3.1277662853539376e-05, + "loss": 0.6932, + "step": 127070 + }, + { + "epoch": 1.1234286320479498, + "grad_norm": 13.633052825927734, + "learning_rate": 3.12761894658675e-05, + "loss": 0.4809, + "step": 127080 + }, + { + "epoch": 1.1235170353082622, + "grad_norm": 2.246755361557007, + "learning_rate": 3.127471607819563e-05, + "loss": 0.7049, + "step": 127090 + }, + { + "epoch": 1.1236054385685743, + "grad_norm": 1.7412654161453247, + "learning_rate": 3.127324269052376e-05, + "loss": 0.6733, + "step": 127100 + }, + { + "epoch": 1.1236938418288867, + "grad_norm": 6.824997901916504, + "learning_rate": 3.127176930285189e-05, + "loss": 0.6276, + "step": 127110 + }, + { + "epoch": 1.123782245089199, + "grad_norm": 4.419140815734863, + "learning_rate": 3.127029591518002e-05, + "loss": 0.6596, + "step": 127120 + }, + { + "epoch": 1.1238706483495111, + "grad_norm": 13.504050254821777, + "learning_rate": 3.126882252750815e-05, + "loss": 0.6391, + "step": 127130 + }, + { + "epoch": 1.1239590516098235, + "grad_norm": 3.6946518421173096, + "learning_rate": 3.1267349139836275e-05, + "loss": 0.5094, + "step": 127140 + }, + { + "epoch": 1.1240474548701356, + "grad_norm": 2.123535633087158, + "learning_rate": 3.126587575216441e-05, + "loss": 0.7253, + "step": 127150 + }, + { + "epoch": 1.124135858130448, + "grad_norm": 7.622867584228516, + "learning_rate": 3.126440236449254e-05, + "loss": 0.6652, + "step": 127160 + }, + { + "epoch": 1.12422426139076, + "grad_norm": 2.0470130443573, + "learning_rate": 3.126292897682067e-05, + "loss": 0.5564, + "step": 127170 + }, + { + "epoch": 1.1243126646510724, + "grad_norm": 1.351466417312622, + "learning_rate": 3.1261455589148795e-05, + "loss": 0.5942, + "step": 127180 + }, + { + "epoch": 1.1244010679113845, + "grad_norm": 1.3055543899536133, + "learning_rate": 3.1259982201476923e-05, + "loss": 0.662, + "step": 127190 + }, + { + "epoch": 1.1244894711716968, + "grad_norm": 2.757878541946411, + "learning_rate": 3.125850881380505e-05, + "loss": 0.4989, + "step": 127200 + }, + { + "epoch": 1.124577874432009, + "grad_norm": 2.3031773567199707, + "learning_rate": 3.125703542613319e-05, + "loss": 0.6053, + "step": 127210 + }, + { + "epoch": 1.1246662776923213, + "grad_norm": 5.308856964111328, + "learning_rate": 3.125556203846131e-05, + "loss": 0.5899, + "step": 127220 + }, + { + "epoch": 1.1247546809526336, + "grad_norm": 1.7436944246292114, + "learning_rate": 3.1254088650789444e-05, + "loss": 0.6173, + "step": 127230 + }, + { + "epoch": 1.1248430842129458, + "grad_norm": 3.0537967681884766, + "learning_rate": 3.125261526311757e-05, + "loss": 0.6495, + "step": 127240 + }, + { + "epoch": 1.124931487473258, + "grad_norm": 7.111103057861328, + "learning_rate": 3.12511418754457e-05, + "loss": 0.5467, + "step": 127250 + }, + { + "epoch": 1.1250198907335702, + "grad_norm": 1.9732167720794678, + "learning_rate": 3.124966848777383e-05, + "loss": 0.6263, + "step": 127260 + }, + { + "epoch": 1.1251082939938826, + "grad_norm": 1.3944525718688965, + "learning_rate": 3.1248195100101964e-05, + "loss": 0.4412, + "step": 127270 + }, + { + "epoch": 1.1251966972541947, + "grad_norm": 1.6753472089767456, + "learning_rate": 3.1246721712430085e-05, + "loss": 0.5593, + "step": 127280 + }, + { + "epoch": 1.125285100514507, + "grad_norm": 3.7476272583007812, + "learning_rate": 3.124524832475822e-05, + "loss": 0.524, + "step": 127290 + }, + { + "epoch": 1.1253735037748192, + "grad_norm": 9.736187934875488, + "learning_rate": 3.124377493708634e-05, + "loss": 0.537, + "step": 127300 + }, + { + "epoch": 1.1254619070351315, + "grad_norm": 20.751432418823242, + "learning_rate": 3.124230154941448e-05, + "loss": 0.6067, + "step": 127310 + }, + { + "epoch": 1.1255503102954436, + "grad_norm": 3.3839869499206543, + "learning_rate": 3.1240828161742606e-05, + "loss": 0.6929, + "step": 127320 + }, + { + "epoch": 1.125638713555756, + "grad_norm": 3.6687798500061035, + "learning_rate": 3.1239354774070734e-05, + "loss": 0.5337, + "step": 127330 + }, + { + "epoch": 1.1257271168160683, + "grad_norm": 2.122966766357422, + "learning_rate": 3.123788138639886e-05, + "loss": 0.7601, + "step": 127340 + }, + { + "epoch": 1.1258155200763804, + "grad_norm": 2.464848279953003, + "learning_rate": 3.1236407998727e-05, + "loss": 0.4645, + "step": 127350 + }, + { + "epoch": 1.1259039233366925, + "grad_norm": 1.720076322555542, + "learning_rate": 3.123493461105512e-05, + "loss": 0.6076, + "step": 127360 + }, + { + "epoch": 1.1259923265970049, + "grad_norm": 1.5469862222671509, + "learning_rate": 3.1233461223383254e-05, + "loss": 0.6892, + "step": 127370 + }, + { + "epoch": 1.1260807298573172, + "grad_norm": 5.656920909881592, + "learning_rate": 3.123198783571138e-05, + "loss": 0.8294, + "step": 127380 + }, + { + "epoch": 1.1261691331176293, + "grad_norm": 2.652963638305664, + "learning_rate": 3.123051444803951e-05, + "loss": 0.6324, + "step": 127390 + }, + { + "epoch": 1.1262575363779417, + "grad_norm": 3.476199150085449, + "learning_rate": 3.122904106036764e-05, + "loss": 0.6096, + "step": 127400 + }, + { + "epoch": 1.1263459396382538, + "grad_norm": 2.9368913173675537, + "learning_rate": 3.122756767269577e-05, + "loss": 0.7013, + "step": 127410 + }, + { + "epoch": 1.1264343428985661, + "grad_norm": 1.9870730638504028, + "learning_rate": 3.1226094285023896e-05, + "loss": 0.6709, + "step": 127420 + }, + { + "epoch": 1.1265227461588783, + "grad_norm": 5.3724589347839355, + "learning_rate": 3.122462089735203e-05, + "loss": 0.7113, + "step": 127430 + }, + { + "epoch": 1.1266111494191906, + "grad_norm": 3.1432862281799316, + "learning_rate": 3.122314750968016e-05, + "loss": 0.6544, + "step": 127440 + }, + { + "epoch": 1.1266995526795027, + "grad_norm": 2.4781832695007324, + "learning_rate": 3.122167412200829e-05, + "loss": 0.6568, + "step": 127450 + }, + { + "epoch": 1.126787955939815, + "grad_norm": 3.0052313804626465, + "learning_rate": 3.1220200734336416e-05, + "loss": 0.5346, + "step": 127460 + }, + { + "epoch": 1.1268763592001272, + "grad_norm": 4.139371871948242, + "learning_rate": 3.1218727346664544e-05, + "loss": 0.7233, + "step": 127470 + }, + { + "epoch": 1.1269647624604395, + "grad_norm": 5.191629409790039, + "learning_rate": 3.121725395899267e-05, + "loss": 0.8899, + "step": 127480 + }, + { + "epoch": 1.1270531657207519, + "grad_norm": 1.8640124797821045, + "learning_rate": 3.121578057132081e-05, + "loss": 0.6398, + "step": 127490 + }, + { + "epoch": 1.127141568981064, + "grad_norm": 11.424607276916504, + "learning_rate": 3.1214307183648936e-05, + "loss": 0.6252, + "step": 127500 + }, + { + "epoch": 1.1272299722413763, + "grad_norm": 2.3524322509765625, + "learning_rate": 3.1212833795977065e-05, + "loss": 0.6719, + "step": 127510 + }, + { + "epoch": 1.1273183755016885, + "grad_norm": 5.033801555633545, + "learning_rate": 3.121136040830519e-05, + "loss": 0.6408, + "step": 127520 + }, + { + "epoch": 1.1274067787620008, + "grad_norm": 5.450212001800537, + "learning_rate": 3.120988702063332e-05, + "loss": 0.6145, + "step": 127530 + }, + { + "epoch": 1.127495182022313, + "grad_norm": 1.8799296617507935, + "learning_rate": 3.120841363296145e-05, + "loss": 0.6284, + "step": 127540 + }, + { + "epoch": 1.1275835852826253, + "grad_norm": 1.1824297904968262, + "learning_rate": 3.120694024528958e-05, + "loss": 0.5214, + "step": 127550 + }, + { + "epoch": 1.1276719885429374, + "grad_norm": 3.4856674671173096, + "learning_rate": 3.120546685761771e-05, + "loss": 0.6247, + "step": 127560 + }, + { + "epoch": 1.1277603918032497, + "grad_norm": 14.795614242553711, + "learning_rate": 3.120399346994584e-05, + "loss": 0.6804, + "step": 127570 + }, + { + "epoch": 1.1278487950635618, + "grad_norm": 2.6707940101623535, + "learning_rate": 3.120252008227397e-05, + "loss": 0.6733, + "step": 127580 + }, + { + "epoch": 1.1279371983238742, + "grad_norm": 2.520338535308838, + "learning_rate": 3.12010466946021e-05, + "loss": 0.6386, + "step": 127590 + }, + { + "epoch": 1.1280256015841865, + "grad_norm": 1.6713100671768188, + "learning_rate": 3.1199573306930227e-05, + "loss": 0.5881, + "step": 127600 + }, + { + "epoch": 1.1281140048444986, + "grad_norm": 3.222883462905884, + "learning_rate": 3.1198099919258355e-05, + "loss": 0.7449, + "step": 127610 + }, + { + "epoch": 1.128202408104811, + "grad_norm": 1.9076906442642212, + "learning_rate": 3.119662653158649e-05, + "loss": 0.582, + "step": 127620 + }, + { + "epoch": 1.128290811365123, + "grad_norm": 3.9604201316833496, + "learning_rate": 3.119515314391462e-05, + "loss": 0.6379, + "step": 127630 + }, + { + "epoch": 1.1283792146254354, + "grad_norm": 1.8760172128677368, + "learning_rate": 3.119367975624275e-05, + "loss": 0.6786, + "step": 127640 + }, + { + "epoch": 1.1284676178857476, + "grad_norm": 2.554670810699463, + "learning_rate": 3.1192206368570875e-05, + "loss": 0.5619, + "step": 127650 + }, + { + "epoch": 1.12855602114606, + "grad_norm": 2.782709836959839, + "learning_rate": 3.1190732980899003e-05, + "loss": 0.6666, + "step": 127660 + }, + { + "epoch": 1.128644424406372, + "grad_norm": 1.3999807834625244, + "learning_rate": 3.118925959322713e-05, + "loss": 0.6457, + "step": 127670 + }, + { + "epoch": 1.1287328276666844, + "grad_norm": 1.673561692237854, + "learning_rate": 3.118778620555527e-05, + "loss": 0.5272, + "step": 127680 + }, + { + "epoch": 1.1288212309269965, + "grad_norm": 12.835931777954102, + "learning_rate": 3.118631281788339e-05, + "loss": 0.7, + "step": 127690 + }, + { + "epoch": 1.1289096341873088, + "grad_norm": 3.948513984680176, + "learning_rate": 3.1184839430211524e-05, + "loss": 0.7694, + "step": 127700 + }, + { + "epoch": 1.1289980374476212, + "grad_norm": 1.4934343099594116, + "learning_rate": 3.118336604253965e-05, + "loss": 0.5405, + "step": 127710 + }, + { + "epoch": 1.1290864407079333, + "grad_norm": 3.815197467803955, + "learning_rate": 3.118189265486778e-05, + "loss": 0.5864, + "step": 127720 + }, + { + "epoch": 1.1291748439682456, + "grad_norm": 2.613705635070801, + "learning_rate": 3.118041926719591e-05, + "loss": 0.5741, + "step": 127730 + }, + { + "epoch": 1.1292632472285578, + "grad_norm": 18.398590087890625, + "learning_rate": 3.1178945879524044e-05, + "loss": 0.5663, + "step": 127740 + }, + { + "epoch": 1.12935165048887, + "grad_norm": 1.7807130813598633, + "learning_rate": 3.1177472491852165e-05, + "loss": 0.4899, + "step": 127750 + }, + { + "epoch": 1.1294400537491822, + "grad_norm": 7.4020233154296875, + "learning_rate": 3.11759991041803e-05, + "loss": 0.6467, + "step": 127760 + }, + { + "epoch": 1.1295284570094946, + "grad_norm": 9.795546531677246, + "learning_rate": 3.117452571650842e-05, + "loss": 0.5992, + "step": 127770 + }, + { + "epoch": 1.1296168602698067, + "grad_norm": 2.897505283355713, + "learning_rate": 3.117305232883656e-05, + "loss": 0.6233, + "step": 127780 + }, + { + "epoch": 1.129705263530119, + "grad_norm": 1.5370945930480957, + "learning_rate": 3.1171578941164686e-05, + "loss": 0.5322, + "step": 127790 + }, + { + "epoch": 1.1297936667904311, + "grad_norm": 3.2173004150390625, + "learning_rate": 3.1170105553492814e-05, + "loss": 0.6435, + "step": 127800 + }, + { + "epoch": 1.1298820700507435, + "grad_norm": 2.5702590942382812, + "learning_rate": 3.116863216582094e-05, + "loss": 0.5733, + "step": 127810 + }, + { + "epoch": 1.1299704733110558, + "grad_norm": 2.9192237854003906, + "learning_rate": 3.116715877814908e-05, + "loss": 0.5339, + "step": 127820 + }, + { + "epoch": 1.130058876571368, + "grad_norm": 5.5746846199035645, + "learning_rate": 3.11656853904772e-05, + "loss": 0.7213, + "step": 127830 + }, + { + "epoch": 1.1301472798316803, + "grad_norm": 2.588975667953491, + "learning_rate": 3.1164212002805334e-05, + "loss": 0.653, + "step": 127840 + }, + { + "epoch": 1.1302356830919924, + "grad_norm": 5.150849342346191, + "learning_rate": 3.116273861513346e-05, + "loss": 0.6385, + "step": 127850 + }, + { + "epoch": 1.1303240863523047, + "grad_norm": 5.099704742431641, + "learning_rate": 3.116126522746159e-05, + "loss": 0.5256, + "step": 127860 + }, + { + "epoch": 1.1304124896126169, + "grad_norm": 1.229253888130188, + "learning_rate": 3.115979183978972e-05, + "loss": 0.5803, + "step": 127870 + }, + { + "epoch": 1.1305008928729292, + "grad_norm": 5.609333515167236, + "learning_rate": 3.115831845211785e-05, + "loss": 0.6201, + "step": 127880 + }, + { + "epoch": 1.1305892961332413, + "grad_norm": 2.342172145843506, + "learning_rate": 3.1156845064445976e-05, + "loss": 0.6521, + "step": 127890 + }, + { + "epoch": 1.1306776993935537, + "grad_norm": 3.7136483192443848, + "learning_rate": 3.115537167677411e-05, + "loss": 0.6616, + "step": 127900 + }, + { + "epoch": 1.1307661026538658, + "grad_norm": 2.246790647506714, + "learning_rate": 3.115389828910223e-05, + "loss": 0.7396, + "step": 127910 + }, + { + "epoch": 1.1308545059141781, + "grad_norm": 11.692253112792969, + "learning_rate": 3.115242490143037e-05, + "loss": 0.5945, + "step": 127920 + }, + { + "epoch": 1.1309429091744905, + "grad_norm": 8.861778259277344, + "learning_rate": 3.1150951513758496e-05, + "loss": 0.6087, + "step": 127930 + }, + { + "epoch": 1.1310313124348026, + "grad_norm": 0.8729851841926575, + "learning_rate": 3.1149478126086625e-05, + "loss": 0.5115, + "step": 127940 + }, + { + "epoch": 1.1311197156951147, + "grad_norm": 3.291654109954834, + "learning_rate": 3.114800473841475e-05, + "loss": 0.5895, + "step": 127950 + }, + { + "epoch": 1.131208118955427, + "grad_norm": 5.798988342285156, + "learning_rate": 3.114653135074289e-05, + "loss": 0.612, + "step": 127960 + }, + { + "epoch": 1.1312965222157394, + "grad_norm": 1.1860740184783936, + "learning_rate": 3.114505796307101e-05, + "loss": 0.7366, + "step": 127970 + }, + { + "epoch": 1.1313849254760515, + "grad_norm": 1.4610323905944824, + "learning_rate": 3.1143584575399145e-05, + "loss": 0.6076, + "step": 127980 + }, + { + "epoch": 1.1314733287363639, + "grad_norm": 3.5780997276306152, + "learning_rate": 3.114211118772727e-05, + "loss": 0.6438, + "step": 127990 + }, + { + "epoch": 1.131561731996676, + "grad_norm": 7.238277912139893, + "learning_rate": 3.11406378000554e-05, + "loss": 0.5945, + "step": 128000 + }, + { + "epoch": 1.1316501352569883, + "grad_norm": 6.38936710357666, + "learning_rate": 3.113916441238353e-05, + "loss": 0.6412, + "step": 128010 + }, + { + "epoch": 1.1317385385173004, + "grad_norm": 2.3094916343688965, + "learning_rate": 3.113769102471166e-05, + "loss": 0.6005, + "step": 128020 + }, + { + "epoch": 1.1318269417776128, + "grad_norm": 5.772735595703125, + "learning_rate": 3.1136217637039786e-05, + "loss": 0.6199, + "step": 128030 + }, + { + "epoch": 1.131915345037925, + "grad_norm": 2.0689494609832764, + "learning_rate": 3.113474424936792e-05, + "loss": 0.5729, + "step": 128040 + }, + { + "epoch": 1.1320037482982372, + "grad_norm": 3.396052837371826, + "learning_rate": 3.113327086169604e-05, + "loss": 0.6487, + "step": 128050 + }, + { + "epoch": 1.1320921515585494, + "grad_norm": 1.5205445289611816, + "learning_rate": 3.113179747402418e-05, + "loss": 0.608, + "step": 128060 + }, + { + "epoch": 1.1321805548188617, + "grad_norm": 0.7985290288925171, + "learning_rate": 3.113032408635231e-05, + "loss": 0.4496, + "step": 128070 + }, + { + "epoch": 1.132268958079174, + "grad_norm": 2.381638288497925, + "learning_rate": 3.1128850698680435e-05, + "loss": 0.6879, + "step": 128080 + }, + { + "epoch": 1.1323573613394862, + "grad_norm": 3.3611629009246826, + "learning_rate": 3.112737731100856e-05, + "loss": 0.5327, + "step": 128090 + }, + { + "epoch": 1.1324457645997985, + "grad_norm": 1.7304800748825073, + "learning_rate": 3.11259039233367e-05, + "loss": 0.6686, + "step": 128100 + }, + { + "epoch": 1.1325341678601106, + "grad_norm": 8.254027366638184, + "learning_rate": 3.112443053566482e-05, + "loss": 0.5926, + "step": 128110 + }, + { + "epoch": 1.132622571120423, + "grad_norm": 4.4736857414245605, + "learning_rate": 3.1122957147992955e-05, + "loss": 0.5408, + "step": 128120 + }, + { + "epoch": 1.132710974380735, + "grad_norm": 1.0886543989181519, + "learning_rate": 3.112148376032108e-05, + "loss": 0.5942, + "step": 128130 + }, + { + "epoch": 1.1327993776410474, + "grad_norm": 1.0274347066879272, + "learning_rate": 3.112001037264921e-05, + "loss": 0.5788, + "step": 128140 + }, + { + "epoch": 1.1328877809013596, + "grad_norm": 1.0781298875808716, + "learning_rate": 3.111853698497734e-05, + "loss": 0.4986, + "step": 128150 + }, + { + "epoch": 1.132976184161672, + "grad_norm": 4.785043239593506, + "learning_rate": 3.111706359730547e-05, + "loss": 0.7426, + "step": 128160 + }, + { + "epoch": 1.133064587421984, + "grad_norm": 6.955599784851074, + "learning_rate": 3.11155902096336e-05, + "loss": 0.698, + "step": 128170 + }, + { + "epoch": 1.1331529906822964, + "grad_norm": 2.1085779666900635, + "learning_rate": 3.111411682196173e-05, + "loss": 0.5651, + "step": 128180 + }, + { + "epoch": 1.1332413939426087, + "grad_norm": 7.190761089324951, + "learning_rate": 3.1112643434289854e-05, + "loss": 0.5391, + "step": 128190 + }, + { + "epoch": 1.1333297972029208, + "grad_norm": 3.2585065364837646, + "learning_rate": 3.111117004661799e-05, + "loss": 0.5222, + "step": 128200 + }, + { + "epoch": 1.1334182004632332, + "grad_norm": 3.023618459701538, + "learning_rate": 3.110969665894612e-05, + "loss": 0.5605, + "step": 128210 + }, + { + "epoch": 1.1335066037235453, + "grad_norm": 1.328873634338379, + "learning_rate": 3.1108223271274246e-05, + "loss": 0.5163, + "step": 128220 + }, + { + "epoch": 1.1335950069838576, + "grad_norm": 1.7010133266448975, + "learning_rate": 3.1106749883602374e-05, + "loss": 0.5721, + "step": 128230 + }, + { + "epoch": 1.1336834102441697, + "grad_norm": 1.680161714553833, + "learning_rate": 3.11052764959305e-05, + "loss": 0.6808, + "step": 128240 + }, + { + "epoch": 1.133771813504482, + "grad_norm": 3.0349910259246826, + "learning_rate": 3.110380310825863e-05, + "loss": 0.7132, + "step": 128250 + }, + { + "epoch": 1.1338602167647942, + "grad_norm": 4.480554103851318, + "learning_rate": 3.1102329720586766e-05, + "loss": 0.6455, + "step": 128260 + }, + { + "epoch": 1.1339486200251065, + "grad_norm": 2.1163761615753174, + "learning_rate": 3.110085633291489e-05, + "loss": 0.5604, + "step": 128270 + }, + { + "epoch": 1.1340370232854187, + "grad_norm": 4.307466983795166, + "learning_rate": 3.109938294524302e-05, + "loss": 0.7485, + "step": 128280 + }, + { + "epoch": 1.134125426545731, + "grad_norm": 2.174879789352417, + "learning_rate": 3.109790955757115e-05, + "loss": 0.6301, + "step": 128290 + }, + { + "epoch": 1.1342138298060433, + "grad_norm": 2.4866321086883545, + "learning_rate": 3.109643616989928e-05, + "loss": 0.6528, + "step": 128300 + }, + { + "epoch": 1.1343022330663555, + "grad_norm": 2.4251863956451416, + "learning_rate": 3.109496278222741e-05, + "loss": 0.7295, + "step": 128310 + }, + { + "epoch": 1.1343906363266678, + "grad_norm": 13.800593376159668, + "learning_rate": 3.109348939455554e-05, + "loss": 0.6949, + "step": 128320 + }, + { + "epoch": 1.13447903958698, + "grad_norm": 10.20455265045166, + "learning_rate": 3.1092016006883664e-05, + "loss": 0.7361, + "step": 128330 + }, + { + "epoch": 1.1345674428472923, + "grad_norm": 4.577689170837402, + "learning_rate": 3.10905426192118e-05, + "loss": 0.6818, + "step": 128340 + }, + { + "epoch": 1.1346558461076044, + "grad_norm": 2.310227155685425, + "learning_rate": 3.108906923153993e-05, + "loss": 0.6335, + "step": 128350 + }, + { + "epoch": 1.1347442493679167, + "grad_norm": 2.1640915870666504, + "learning_rate": 3.1087595843868056e-05, + "loss": 0.5092, + "step": 128360 + }, + { + "epoch": 1.1348326526282289, + "grad_norm": 4.713624477386475, + "learning_rate": 3.1086122456196184e-05, + "loss": 0.5343, + "step": 128370 + }, + { + "epoch": 1.1349210558885412, + "grad_norm": 8.862934112548828, + "learning_rate": 3.108464906852431e-05, + "loss": 0.6609, + "step": 128380 + }, + { + "epoch": 1.1350094591488533, + "grad_norm": 5.741730213165283, + "learning_rate": 3.108317568085244e-05, + "loss": 0.4951, + "step": 128390 + }, + { + "epoch": 1.1350978624091657, + "grad_norm": 4.6546735763549805, + "learning_rate": 3.1081702293180576e-05, + "loss": 0.6988, + "step": 128400 + }, + { + "epoch": 1.135186265669478, + "grad_norm": 2.6714022159576416, + "learning_rate": 3.1080228905508705e-05, + "loss": 0.5887, + "step": 128410 + }, + { + "epoch": 1.1352746689297901, + "grad_norm": 1.811808466911316, + "learning_rate": 3.107875551783683e-05, + "loss": 0.4598, + "step": 128420 + }, + { + "epoch": 1.1353630721901025, + "grad_norm": 10.22468376159668, + "learning_rate": 3.107728213016496e-05, + "loss": 0.6492, + "step": 128430 + }, + { + "epoch": 1.1354514754504146, + "grad_norm": 8.852533340454102, + "learning_rate": 3.107580874249309e-05, + "loss": 0.5588, + "step": 128440 + }, + { + "epoch": 1.135539878710727, + "grad_norm": 2.12284255027771, + "learning_rate": 3.107433535482122e-05, + "loss": 0.7217, + "step": 128450 + }, + { + "epoch": 1.135628281971039, + "grad_norm": 1.8057159185409546, + "learning_rate": 3.107286196714935e-05, + "loss": 0.6407, + "step": 128460 + }, + { + "epoch": 1.1357166852313514, + "grad_norm": 2.4709341526031494, + "learning_rate": 3.107138857947748e-05, + "loss": 0.6377, + "step": 128470 + }, + { + "epoch": 1.1358050884916635, + "grad_norm": 9.284843444824219, + "learning_rate": 3.106991519180561e-05, + "loss": 0.6803, + "step": 128480 + }, + { + "epoch": 1.1358934917519758, + "grad_norm": 1.947967767715454, + "learning_rate": 3.106844180413374e-05, + "loss": 0.6463, + "step": 128490 + }, + { + "epoch": 1.135981895012288, + "grad_norm": 1.7554666996002197, + "learning_rate": 3.1066968416461867e-05, + "loss": 0.4648, + "step": 128500 + }, + { + "epoch": 1.1360702982726003, + "grad_norm": 0.9914371967315674, + "learning_rate": 3.1065495028789995e-05, + "loss": 0.4564, + "step": 128510 + }, + { + "epoch": 1.1361587015329127, + "grad_norm": 18.34630012512207, + "learning_rate": 3.106402164111812e-05, + "loss": 0.6756, + "step": 128520 + }, + { + "epoch": 1.1362471047932248, + "grad_norm": 4.659605026245117, + "learning_rate": 3.106254825344626e-05, + "loss": 0.5377, + "step": 128530 + }, + { + "epoch": 1.136335508053537, + "grad_norm": 1.9386022090911865, + "learning_rate": 3.106107486577439e-05, + "loss": 0.6784, + "step": 128540 + }, + { + "epoch": 1.1364239113138492, + "grad_norm": 1.648189902305603, + "learning_rate": 3.1059601478102515e-05, + "loss": 0.6302, + "step": 128550 + }, + { + "epoch": 1.1365123145741616, + "grad_norm": 2.0266330242156982, + "learning_rate": 3.1058128090430643e-05, + "loss": 0.6043, + "step": 128560 + }, + { + "epoch": 1.1366007178344737, + "grad_norm": 4.102417469024658, + "learning_rate": 3.105665470275877e-05, + "loss": 0.6162, + "step": 128570 + }, + { + "epoch": 1.136689121094786, + "grad_norm": 3.8699839115142822, + "learning_rate": 3.10551813150869e-05, + "loss": 0.6195, + "step": 128580 + }, + { + "epoch": 1.1367775243550982, + "grad_norm": 2.4275920391082764, + "learning_rate": 3.1053707927415035e-05, + "loss": 0.6614, + "step": 128590 + }, + { + "epoch": 1.1368659276154105, + "grad_norm": 4.222693920135498, + "learning_rate": 3.105223453974316e-05, + "loss": 0.5581, + "step": 128600 + }, + { + "epoch": 1.1369543308757226, + "grad_norm": 2.011685848236084, + "learning_rate": 3.105076115207129e-05, + "loss": 0.5598, + "step": 128610 + }, + { + "epoch": 1.137042734136035, + "grad_norm": 2.6063008308410645, + "learning_rate": 3.104928776439942e-05, + "loss": 0.6228, + "step": 128620 + }, + { + "epoch": 1.137131137396347, + "grad_norm": 4.440762996673584, + "learning_rate": 3.104781437672755e-05, + "loss": 0.665, + "step": 128630 + }, + { + "epoch": 1.1372195406566594, + "grad_norm": 3.1974143981933594, + "learning_rate": 3.104634098905568e-05, + "loss": 0.7242, + "step": 128640 + }, + { + "epoch": 1.1373079439169715, + "grad_norm": 1.4081751108169556, + "learning_rate": 3.104486760138381e-05, + "loss": 0.523, + "step": 128650 + }, + { + "epoch": 1.1373963471772839, + "grad_norm": 2.1216461658477783, + "learning_rate": 3.1043394213711934e-05, + "loss": 0.7082, + "step": 128660 + }, + { + "epoch": 1.1374847504375962, + "grad_norm": 1.7226834297180176, + "learning_rate": 3.104192082604007e-05, + "loss": 0.6275, + "step": 128670 + }, + { + "epoch": 1.1375731536979083, + "grad_norm": 2.572516441345215, + "learning_rate": 3.10404474383682e-05, + "loss": 0.6339, + "step": 128680 + }, + { + "epoch": 1.1376615569582207, + "grad_norm": 1.8610177040100098, + "learning_rate": 3.1038974050696326e-05, + "loss": 0.5018, + "step": 128690 + }, + { + "epoch": 1.1377499602185328, + "grad_norm": 1.7649964094161987, + "learning_rate": 3.1037500663024454e-05, + "loss": 0.7139, + "step": 128700 + }, + { + "epoch": 1.1378383634788451, + "grad_norm": 0.8393646478652954, + "learning_rate": 3.103602727535258e-05, + "loss": 0.6238, + "step": 128710 + }, + { + "epoch": 1.1379267667391573, + "grad_norm": 3.8245954513549805, + "learning_rate": 3.103455388768071e-05, + "loss": 0.5752, + "step": 128720 + }, + { + "epoch": 1.1380151699994696, + "grad_norm": 9.85477352142334, + "learning_rate": 3.1033080500008846e-05, + "loss": 0.7119, + "step": 128730 + }, + { + "epoch": 1.1381035732597817, + "grad_norm": 4.801961421966553, + "learning_rate": 3.103160711233697e-05, + "loss": 0.6302, + "step": 128740 + }, + { + "epoch": 1.138191976520094, + "grad_norm": 2.0827114582061768, + "learning_rate": 3.10301337246651e-05, + "loss": 0.5871, + "step": 128750 + }, + { + "epoch": 1.1382803797804062, + "grad_norm": 1.2859481573104858, + "learning_rate": 3.102866033699323e-05, + "loss": 0.5953, + "step": 128760 + }, + { + "epoch": 1.1383687830407185, + "grad_norm": 5.2473931312561035, + "learning_rate": 3.102718694932136e-05, + "loss": 0.7347, + "step": 128770 + }, + { + "epoch": 1.1384571863010309, + "grad_norm": 5.208019733428955, + "learning_rate": 3.102571356164949e-05, + "loss": 0.6063, + "step": 128780 + }, + { + "epoch": 1.138545589561343, + "grad_norm": 1.5457022190093994, + "learning_rate": 3.102424017397762e-05, + "loss": 0.7643, + "step": 128790 + }, + { + "epoch": 1.1386339928216553, + "grad_norm": 2.7987060546875, + "learning_rate": 3.1022766786305744e-05, + "loss": 0.5569, + "step": 128800 + }, + { + "epoch": 1.1387223960819675, + "grad_norm": 4.447319030761719, + "learning_rate": 3.102129339863388e-05, + "loss": 0.5638, + "step": 128810 + }, + { + "epoch": 1.1388107993422798, + "grad_norm": 0.7671382427215576, + "learning_rate": 3.101982001096201e-05, + "loss": 0.5687, + "step": 128820 + }, + { + "epoch": 1.138899202602592, + "grad_norm": 1.4144881963729858, + "learning_rate": 3.1018346623290136e-05, + "loss": 0.7059, + "step": 128830 + }, + { + "epoch": 1.1389876058629043, + "grad_norm": 1.6497530937194824, + "learning_rate": 3.1016873235618264e-05, + "loss": 0.608, + "step": 128840 + }, + { + "epoch": 1.1390760091232164, + "grad_norm": 2.7800369262695312, + "learning_rate": 3.101539984794639e-05, + "loss": 0.6789, + "step": 128850 + }, + { + "epoch": 1.1391644123835287, + "grad_norm": 3.221393346786499, + "learning_rate": 3.101392646027452e-05, + "loss": 0.6814, + "step": 128860 + }, + { + "epoch": 1.1392528156438408, + "grad_norm": 1.2543436288833618, + "learning_rate": 3.1012453072602656e-05, + "loss": 0.6242, + "step": 128870 + }, + { + "epoch": 1.1393412189041532, + "grad_norm": 2.0788819789886475, + "learning_rate": 3.101097968493078e-05, + "loss": 0.6054, + "step": 128880 + }, + { + "epoch": 1.1394296221644655, + "grad_norm": 2.0430173873901367, + "learning_rate": 3.100950629725891e-05, + "loss": 0.6817, + "step": 128890 + }, + { + "epoch": 1.1395180254247776, + "grad_norm": 1.8807059526443481, + "learning_rate": 3.100803290958704e-05, + "loss": 0.5536, + "step": 128900 + }, + { + "epoch": 1.13960642868509, + "grad_norm": 5.936976432800293, + "learning_rate": 3.100655952191517e-05, + "loss": 0.5814, + "step": 128910 + }, + { + "epoch": 1.139694831945402, + "grad_norm": 11.144386291503906, + "learning_rate": 3.10050861342433e-05, + "loss": 0.5457, + "step": 128920 + }, + { + "epoch": 1.1397832352057145, + "grad_norm": 1.185876488685608, + "learning_rate": 3.100361274657143e-05, + "loss": 0.6976, + "step": 128930 + }, + { + "epoch": 1.1398716384660266, + "grad_norm": 1.4058983325958252, + "learning_rate": 3.1002139358899555e-05, + "loss": 0.7023, + "step": 128940 + }, + { + "epoch": 1.139960041726339, + "grad_norm": 3.231215000152588, + "learning_rate": 3.100066597122769e-05, + "loss": 0.5315, + "step": 128950 + }, + { + "epoch": 1.140048444986651, + "grad_norm": 5.5960164070129395, + "learning_rate": 3.099919258355581e-05, + "loss": 0.6117, + "step": 128960 + }, + { + "epoch": 1.1401368482469634, + "grad_norm": 5.824815273284912, + "learning_rate": 3.099771919588395e-05, + "loss": 0.7051, + "step": 128970 + }, + { + "epoch": 1.1402252515072755, + "grad_norm": 4.238917827606201, + "learning_rate": 3.0996245808212075e-05, + "loss": 0.5019, + "step": 128980 + }, + { + "epoch": 1.1403136547675878, + "grad_norm": 1.0785647630691528, + "learning_rate": 3.09947724205402e-05, + "loss": 0.6906, + "step": 128990 + }, + { + "epoch": 1.1404020580279002, + "grad_norm": 2.795828104019165, + "learning_rate": 3.099329903286833e-05, + "loss": 0.627, + "step": 129000 + }, + { + "epoch": 1.1404904612882123, + "grad_norm": 5.802361488342285, + "learning_rate": 3.099182564519647e-05, + "loss": 0.4534, + "step": 129010 + }, + { + "epoch": 1.1405788645485246, + "grad_norm": 6.869480133056641, + "learning_rate": 3.099035225752459e-05, + "loss": 0.5706, + "step": 129020 + }, + { + "epoch": 1.1406672678088368, + "grad_norm": 0.9857071042060852, + "learning_rate": 3.0988878869852724e-05, + "loss": 0.5821, + "step": 129030 + }, + { + "epoch": 1.140755671069149, + "grad_norm": 2.4715664386749268, + "learning_rate": 3.098740548218085e-05, + "loss": 0.6485, + "step": 129040 + }, + { + "epoch": 1.1408440743294612, + "grad_norm": 9.887063026428223, + "learning_rate": 3.098593209450898e-05, + "loss": 0.581, + "step": 129050 + }, + { + "epoch": 1.1409324775897736, + "grad_norm": 4.94040060043335, + "learning_rate": 3.098445870683711e-05, + "loss": 0.6784, + "step": 129060 + }, + { + "epoch": 1.1410208808500857, + "grad_norm": 13.735101699829102, + "learning_rate": 3.098298531916524e-05, + "loss": 0.6245, + "step": 129070 + }, + { + "epoch": 1.141109284110398, + "grad_norm": 1.6162124872207642, + "learning_rate": 3.0981511931493365e-05, + "loss": 0.7017, + "step": 129080 + }, + { + "epoch": 1.1411976873707101, + "grad_norm": 5.409655570983887, + "learning_rate": 3.09800385438215e-05, + "loss": 0.5664, + "step": 129090 + }, + { + "epoch": 1.1412860906310225, + "grad_norm": 1.40962815284729, + "learning_rate": 3.097856515614962e-05, + "loss": 0.5777, + "step": 129100 + }, + { + "epoch": 1.1413744938913348, + "grad_norm": 10.75178050994873, + "learning_rate": 3.097709176847776e-05, + "loss": 0.7643, + "step": 129110 + }, + { + "epoch": 1.141462897151647, + "grad_norm": 2.588787794113159, + "learning_rate": 3.0975618380805885e-05, + "loss": 0.5151, + "step": 129120 + }, + { + "epoch": 1.141551300411959, + "grad_norm": 1.3616329431533813, + "learning_rate": 3.0974144993134014e-05, + "loss": 0.6456, + "step": 129130 + }, + { + "epoch": 1.1416397036722714, + "grad_norm": 9.743754386901855, + "learning_rate": 3.097267160546214e-05, + "loss": 0.7101, + "step": 129140 + }, + { + "epoch": 1.1417281069325838, + "grad_norm": 6.239378929138184, + "learning_rate": 3.097119821779028e-05, + "loss": 0.6694, + "step": 129150 + }, + { + "epoch": 1.1418165101928959, + "grad_norm": 3.233586549758911, + "learning_rate": 3.09697248301184e-05, + "loss": 0.6558, + "step": 129160 + }, + { + "epoch": 1.1419049134532082, + "grad_norm": 3.038461923599243, + "learning_rate": 3.0968251442446534e-05, + "loss": 0.6131, + "step": 129170 + }, + { + "epoch": 1.1419933167135203, + "grad_norm": 2.079636335372925, + "learning_rate": 3.0966778054774656e-05, + "loss": 0.5607, + "step": 129180 + }, + { + "epoch": 1.1420817199738327, + "grad_norm": 5.6674370765686035, + "learning_rate": 3.096530466710279e-05, + "loss": 0.698, + "step": 129190 + }, + { + "epoch": 1.1421701232341448, + "grad_norm": 1.4439177513122559, + "learning_rate": 3.096383127943092e-05, + "loss": 0.6141, + "step": 129200 + }, + { + "epoch": 1.1422585264944571, + "grad_norm": 1.557323694229126, + "learning_rate": 3.096235789175905e-05, + "loss": 0.6327, + "step": 129210 + }, + { + "epoch": 1.1423469297547695, + "grad_norm": 1.6232562065124512, + "learning_rate": 3.0960884504087176e-05, + "loss": 0.7038, + "step": 129220 + }, + { + "epoch": 1.1424353330150816, + "grad_norm": 1.7543247938156128, + "learning_rate": 3.095941111641531e-05, + "loss": 0.5627, + "step": 129230 + }, + { + "epoch": 1.1425237362753937, + "grad_norm": 1.365300178527832, + "learning_rate": 3.095793772874343e-05, + "loss": 0.643, + "step": 129240 + }, + { + "epoch": 1.142612139535706, + "grad_norm": 1.8272337913513184, + "learning_rate": 3.095646434107157e-05, + "loss": 0.6104, + "step": 129250 + }, + { + "epoch": 1.1427005427960184, + "grad_norm": 2.6397464275360107, + "learning_rate": 3.0954990953399696e-05, + "loss": 0.6511, + "step": 129260 + }, + { + "epoch": 1.1427889460563305, + "grad_norm": 4.559430122375488, + "learning_rate": 3.0953517565727824e-05, + "loss": 0.6548, + "step": 129270 + }, + { + "epoch": 1.1428773493166429, + "grad_norm": 2.175283670425415, + "learning_rate": 3.095204417805595e-05, + "loss": 0.6311, + "step": 129280 + }, + { + "epoch": 1.142965752576955, + "grad_norm": 1.8018583059310913, + "learning_rate": 3.095057079038409e-05, + "loss": 0.4994, + "step": 129290 + }, + { + "epoch": 1.1430541558372673, + "grad_norm": 1.4794366359710693, + "learning_rate": 3.094909740271221e-05, + "loss": 0.6195, + "step": 129300 + }, + { + "epoch": 1.1431425590975794, + "grad_norm": 1.8551850318908691, + "learning_rate": 3.0947624015040345e-05, + "loss": 0.6823, + "step": 129310 + }, + { + "epoch": 1.1432309623578918, + "grad_norm": 2.381648540496826, + "learning_rate": 3.094615062736847e-05, + "loss": 0.5987, + "step": 129320 + }, + { + "epoch": 1.143319365618204, + "grad_norm": 1.3825405836105347, + "learning_rate": 3.09446772396966e-05, + "loss": 0.7014, + "step": 129330 + }, + { + "epoch": 1.1434077688785163, + "grad_norm": 4.019542694091797, + "learning_rate": 3.094320385202473e-05, + "loss": 0.7687, + "step": 129340 + }, + { + "epoch": 1.1434961721388284, + "grad_norm": 7.227363586425781, + "learning_rate": 3.094173046435286e-05, + "loss": 0.5195, + "step": 129350 + }, + { + "epoch": 1.1435845753991407, + "grad_norm": 2.040146589279175, + "learning_rate": 3.0940257076680986e-05, + "loss": 0.6398, + "step": 129360 + }, + { + "epoch": 1.143672978659453, + "grad_norm": 1.8154550790786743, + "learning_rate": 3.093878368900912e-05, + "loss": 0.6957, + "step": 129370 + }, + { + "epoch": 1.1437613819197652, + "grad_norm": 5.321227073669434, + "learning_rate": 3.093731030133725e-05, + "loss": 0.5996, + "step": 129380 + }, + { + "epoch": 1.1438497851800775, + "grad_norm": 3.097691535949707, + "learning_rate": 3.093583691366538e-05, + "loss": 0.534, + "step": 129390 + }, + { + "epoch": 1.1439381884403896, + "grad_norm": 0.9084679484367371, + "learning_rate": 3.0934363525993506e-05, + "loss": 0.5166, + "step": 129400 + }, + { + "epoch": 1.144026591700702, + "grad_norm": 4.590011119842529, + "learning_rate": 3.0932890138321635e-05, + "loss": 0.6488, + "step": 129410 + }, + { + "epoch": 1.144114994961014, + "grad_norm": 2.5000808238983154, + "learning_rate": 3.093141675064976e-05, + "loss": 0.5468, + "step": 129420 + }, + { + "epoch": 1.1442033982213264, + "grad_norm": 4.600218296051025, + "learning_rate": 3.092994336297789e-05, + "loss": 0.7062, + "step": 129430 + }, + { + "epoch": 1.1442918014816386, + "grad_norm": 6.936985969543457, + "learning_rate": 3.092846997530603e-05, + "loss": 0.5943, + "step": 129440 + }, + { + "epoch": 1.144380204741951, + "grad_norm": 3.0413055419921875, + "learning_rate": 3.0926996587634155e-05, + "loss": 0.7356, + "step": 129450 + }, + { + "epoch": 1.144468608002263, + "grad_norm": 2.9137046337127686, + "learning_rate": 3.0925523199962283e-05, + "loss": 0.6307, + "step": 129460 + }, + { + "epoch": 1.1445570112625754, + "grad_norm": 3.0789620876312256, + "learning_rate": 3.092404981229041e-05, + "loss": 0.6423, + "step": 129470 + }, + { + "epoch": 1.1446454145228877, + "grad_norm": 1.3038885593414307, + "learning_rate": 3.092257642461854e-05, + "loss": 0.5895, + "step": 129480 + }, + { + "epoch": 1.1447338177831998, + "grad_norm": 11.934042930603027, + "learning_rate": 3.092110303694667e-05, + "loss": 0.5973, + "step": 129490 + }, + { + "epoch": 1.1448222210435122, + "grad_norm": 1.8368771076202393, + "learning_rate": 3.0919629649274804e-05, + "loss": 0.6139, + "step": 129500 + }, + { + "epoch": 1.1449106243038243, + "grad_norm": 1.5556972026824951, + "learning_rate": 3.091815626160293e-05, + "loss": 0.5033, + "step": 129510 + }, + { + "epoch": 1.1449990275641366, + "grad_norm": 1.673009991645813, + "learning_rate": 3.091668287393106e-05, + "loss": 0.7569, + "step": 129520 + }, + { + "epoch": 1.1450874308244487, + "grad_norm": 2.1208839416503906, + "learning_rate": 3.091520948625919e-05, + "loss": 0.4855, + "step": 129530 + }, + { + "epoch": 1.145175834084761, + "grad_norm": 1.4348490238189697, + "learning_rate": 3.091373609858732e-05, + "loss": 0.7457, + "step": 129540 + }, + { + "epoch": 1.1452642373450732, + "grad_norm": 2.5995562076568604, + "learning_rate": 3.0912262710915445e-05, + "loss": 0.7044, + "step": 129550 + }, + { + "epoch": 1.1453526406053856, + "grad_norm": 4.249809741973877, + "learning_rate": 3.091078932324358e-05, + "loss": 0.5688, + "step": 129560 + }, + { + "epoch": 1.1454410438656977, + "grad_norm": 2.1283082962036133, + "learning_rate": 3.09093159355717e-05, + "loss": 0.6918, + "step": 129570 + }, + { + "epoch": 1.14552944712601, + "grad_norm": 3.371137857437134, + "learning_rate": 3.090784254789984e-05, + "loss": 0.6069, + "step": 129580 + }, + { + "epoch": 1.1456178503863224, + "grad_norm": 2.4240896701812744, + "learning_rate": 3.0906369160227966e-05, + "loss": 0.536, + "step": 129590 + }, + { + "epoch": 1.1457062536466345, + "grad_norm": 2.010716676712036, + "learning_rate": 3.0904895772556094e-05, + "loss": 0.5669, + "step": 129600 + }, + { + "epoch": 1.1457946569069468, + "grad_norm": 3.1237964630126953, + "learning_rate": 3.090342238488422e-05, + "loss": 0.5819, + "step": 129610 + }, + { + "epoch": 1.145883060167259, + "grad_norm": 3.414632558822632, + "learning_rate": 3.090194899721236e-05, + "loss": 0.6437, + "step": 129620 + }, + { + "epoch": 1.1459714634275713, + "grad_norm": 13.128074645996094, + "learning_rate": 3.090047560954048e-05, + "loss": 0.6882, + "step": 129630 + }, + { + "epoch": 1.1460598666878834, + "grad_norm": 9.342219352722168, + "learning_rate": 3.0899002221868614e-05, + "loss": 0.6222, + "step": 129640 + }, + { + "epoch": 1.1461482699481957, + "grad_norm": 2.1535446643829346, + "learning_rate": 3.0897528834196736e-05, + "loss": 0.5809, + "step": 129650 + }, + { + "epoch": 1.1462366732085079, + "grad_norm": 9.27763843536377, + "learning_rate": 3.089605544652487e-05, + "loss": 0.5984, + "step": 129660 + }, + { + "epoch": 1.1463250764688202, + "grad_norm": 1.8992501497268677, + "learning_rate": 3.0894582058853e-05, + "loss": 0.5369, + "step": 129670 + }, + { + "epoch": 1.1464134797291323, + "grad_norm": 2.566413402557373, + "learning_rate": 3.089310867118113e-05, + "loss": 0.6067, + "step": 129680 + }, + { + "epoch": 1.1465018829894447, + "grad_norm": 1.1106303930282593, + "learning_rate": 3.0891635283509256e-05, + "loss": 0.7396, + "step": 129690 + }, + { + "epoch": 1.146590286249757, + "grad_norm": 1.3200767040252686, + "learning_rate": 3.089016189583739e-05, + "loss": 0.5962, + "step": 129700 + }, + { + "epoch": 1.1466786895100691, + "grad_norm": 1.7844996452331543, + "learning_rate": 3.088868850816551e-05, + "loss": 0.5788, + "step": 129710 + }, + { + "epoch": 1.1467670927703815, + "grad_norm": 2.092609167098999, + "learning_rate": 3.088721512049365e-05, + "loss": 0.5229, + "step": 129720 + }, + { + "epoch": 1.1468554960306936, + "grad_norm": 3.077134609222412, + "learning_rate": 3.0885741732821776e-05, + "loss": 0.6544, + "step": 129730 + }, + { + "epoch": 1.146943899291006, + "grad_norm": 14.96422004699707, + "learning_rate": 3.0884268345149904e-05, + "loss": 0.5527, + "step": 129740 + }, + { + "epoch": 1.147032302551318, + "grad_norm": 5.837390422821045, + "learning_rate": 3.088279495747803e-05, + "loss": 0.6023, + "step": 129750 + }, + { + "epoch": 1.1471207058116304, + "grad_norm": 2.97983455657959, + "learning_rate": 3.088132156980617e-05, + "loss": 0.6348, + "step": 129760 + }, + { + "epoch": 1.1472091090719425, + "grad_norm": 2.1689441204071045, + "learning_rate": 3.087984818213429e-05, + "loss": 0.6095, + "step": 129770 + }, + { + "epoch": 1.1472975123322549, + "grad_norm": 3.8452553749084473, + "learning_rate": 3.0878374794462425e-05, + "loss": 0.6871, + "step": 129780 + }, + { + "epoch": 1.147385915592567, + "grad_norm": 6.251856803894043, + "learning_rate": 3.0876901406790546e-05, + "loss": 0.6768, + "step": 129790 + }, + { + "epoch": 1.1474743188528793, + "grad_norm": 4.925457954406738, + "learning_rate": 3.087542801911868e-05, + "loss": 0.7806, + "step": 129800 + }, + { + "epoch": 1.1475627221131917, + "grad_norm": 4.261886119842529, + "learning_rate": 3.087395463144681e-05, + "loss": 0.6236, + "step": 129810 + }, + { + "epoch": 1.1476511253735038, + "grad_norm": 4.9891862869262695, + "learning_rate": 3.087248124377494e-05, + "loss": 0.5637, + "step": 129820 + }, + { + "epoch": 1.147739528633816, + "grad_norm": 4.105252265930176, + "learning_rate": 3.0871007856103066e-05, + "loss": 0.6118, + "step": 129830 + }, + { + "epoch": 1.1478279318941282, + "grad_norm": 2.1113710403442383, + "learning_rate": 3.08695344684312e-05, + "loss": 0.4702, + "step": 129840 + }, + { + "epoch": 1.1479163351544406, + "grad_norm": 9.12517261505127, + "learning_rate": 3.086806108075932e-05, + "loss": 0.4841, + "step": 129850 + }, + { + "epoch": 1.1480047384147527, + "grad_norm": 1.4305143356323242, + "learning_rate": 3.086658769308746e-05, + "loss": 0.4818, + "step": 129860 + }, + { + "epoch": 1.148093141675065, + "grad_norm": 1.1121559143066406, + "learning_rate": 3.0865114305415587e-05, + "loss": 0.6366, + "step": 129870 + }, + { + "epoch": 1.1481815449353772, + "grad_norm": 2.573345899581909, + "learning_rate": 3.0863640917743715e-05, + "loss": 0.7032, + "step": 129880 + }, + { + "epoch": 1.1482699481956895, + "grad_norm": 1.0719095468521118, + "learning_rate": 3.086216753007184e-05, + "loss": 0.4951, + "step": 129890 + }, + { + "epoch": 1.1483583514560016, + "grad_norm": 4.711009979248047, + "learning_rate": 3.086069414239997e-05, + "loss": 0.5405, + "step": 129900 + }, + { + "epoch": 1.148446754716314, + "grad_norm": 2.915990114212036, + "learning_rate": 3.08592207547281e-05, + "loss": 0.6023, + "step": 129910 + }, + { + "epoch": 1.148535157976626, + "grad_norm": 3.4980132579803467, + "learning_rate": 3.0857747367056235e-05, + "loss": 0.6093, + "step": 129920 + }, + { + "epoch": 1.1486235612369384, + "grad_norm": 3.3230602741241455, + "learning_rate": 3.085627397938436e-05, + "loss": 0.646, + "step": 129930 + }, + { + "epoch": 1.1487119644972505, + "grad_norm": 2.237851858139038, + "learning_rate": 3.085480059171249e-05, + "loss": 0.5086, + "step": 129940 + }, + { + "epoch": 1.148800367757563, + "grad_norm": 1.7973872423171997, + "learning_rate": 3.085332720404062e-05, + "loss": 0.5327, + "step": 129950 + }, + { + "epoch": 1.1488887710178752, + "grad_norm": 6.047544956207275, + "learning_rate": 3.085185381636875e-05, + "loss": 0.5857, + "step": 129960 + }, + { + "epoch": 1.1489771742781874, + "grad_norm": 1.8672459125518799, + "learning_rate": 3.085038042869688e-05, + "loss": 0.7111, + "step": 129970 + }, + { + "epoch": 1.1490655775384997, + "grad_norm": 1.6612030267715454, + "learning_rate": 3.084890704102501e-05, + "loss": 0.5196, + "step": 129980 + }, + { + "epoch": 1.1491539807988118, + "grad_norm": 4.564487457275391, + "learning_rate": 3.0847433653353134e-05, + "loss": 0.5518, + "step": 129990 + }, + { + "epoch": 1.1492423840591242, + "grad_norm": 1.430924892425537, + "learning_rate": 3.084596026568127e-05, + "loss": 0.5857, + "step": 130000 + }, + { + "epoch": 1.1493307873194363, + "grad_norm": 1.7929118871688843, + "learning_rate": 3.084448687800939e-05, + "loss": 0.5652, + "step": 130010 + }, + { + "epoch": 1.1494191905797486, + "grad_norm": 1.4924002885818481, + "learning_rate": 3.0843013490337525e-05, + "loss": 0.6504, + "step": 130020 + }, + { + "epoch": 1.1495075938400607, + "grad_norm": 5.29504919052124, + "learning_rate": 3.0841540102665654e-05, + "loss": 0.6984, + "step": 130030 + }, + { + "epoch": 1.149595997100373, + "grad_norm": 1.9065515995025635, + "learning_rate": 3.084006671499378e-05, + "loss": 0.6376, + "step": 130040 + }, + { + "epoch": 1.1496844003606852, + "grad_norm": 4.07283878326416, + "learning_rate": 3.083859332732191e-05, + "loss": 0.7325, + "step": 130050 + }, + { + "epoch": 1.1497728036209975, + "grad_norm": 1.8085373640060425, + "learning_rate": 3.0837119939650046e-05, + "loss": 0.5452, + "step": 130060 + }, + { + "epoch": 1.1498612068813099, + "grad_norm": 3.3446431159973145, + "learning_rate": 3.083564655197817e-05, + "loss": 0.5984, + "step": 130070 + }, + { + "epoch": 1.149949610141622, + "grad_norm": 1.0745528936386108, + "learning_rate": 3.08341731643063e-05, + "loss": 0.6564, + "step": 130080 + }, + { + "epoch": 1.1500380134019343, + "grad_norm": 3.416633129119873, + "learning_rate": 3.083269977663443e-05, + "loss": 0.6429, + "step": 130090 + }, + { + "epoch": 1.1501264166622465, + "grad_norm": 1.4294919967651367, + "learning_rate": 3.083122638896256e-05, + "loss": 0.5988, + "step": 130100 + }, + { + "epoch": 1.1502148199225588, + "grad_norm": 3.212697744369507, + "learning_rate": 3.082975300129069e-05, + "loss": 0.6303, + "step": 130110 + }, + { + "epoch": 1.150303223182871, + "grad_norm": 2.286608934402466, + "learning_rate": 3.0828279613618816e-05, + "loss": 0.6174, + "step": 130120 + }, + { + "epoch": 1.1503916264431833, + "grad_norm": 1.804093837738037, + "learning_rate": 3.0826806225946944e-05, + "loss": 0.7295, + "step": 130130 + }, + { + "epoch": 1.1504800297034954, + "grad_norm": 1.5259981155395508, + "learning_rate": 3.082533283827508e-05, + "loss": 0.6263, + "step": 130140 + }, + { + "epoch": 1.1505684329638077, + "grad_norm": 3.9004428386688232, + "learning_rate": 3.08238594506032e-05, + "loss": 0.7751, + "step": 130150 + }, + { + "epoch": 1.1506568362241198, + "grad_norm": 3.0532617568969727, + "learning_rate": 3.0822386062931336e-05, + "loss": 0.5985, + "step": 130160 + }, + { + "epoch": 1.1507452394844322, + "grad_norm": 1.6736780405044556, + "learning_rate": 3.0820912675259464e-05, + "loss": 0.6115, + "step": 130170 + }, + { + "epoch": 1.1508336427447445, + "grad_norm": 4.148414134979248, + "learning_rate": 3.081943928758759e-05, + "loss": 0.6236, + "step": 130180 + }, + { + "epoch": 1.1509220460050567, + "grad_norm": 5.934894561767578, + "learning_rate": 3.081796589991572e-05, + "loss": 0.4944, + "step": 130190 + }, + { + "epoch": 1.151010449265369, + "grad_norm": 2.424107313156128, + "learning_rate": 3.0816492512243856e-05, + "loss": 0.6583, + "step": 130200 + }, + { + "epoch": 1.1510988525256811, + "grad_norm": 3.375579833984375, + "learning_rate": 3.081501912457198e-05, + "loss": 0.6433, + "step": 130210 + }, + { + "epoch": 1.1511872557859935, + "grad_norm": 12.654208183288574, + "learning_rate": 3.081354573690011e-05, + "loss": 0.6515, + "step": 130220 + }, + { + "epoch": 1.1512756590463056, + "grad_norm": 7.625489711761475, + "learning_rate": 3.081207234922824e-05, + "loss": 0.562, + "step": 130230 + }, + { + "epoch": 1.151364062306618, + "grad_norm": 2.1072824001312256, + "learning_rate": 3.081059896155637e-05, + "loss": 0.5538, + "step": 130240 + }, + { + "epoch": 1.15145246556693, + "grad_norm": 8.0526123046875, + "learning_rate": 3.08091255738845e-05, + "loss": 0.7138, + "step": 130250 + }, + { + "epoch": 1.1515408688272424, + "grad_norm": 1.326537847518921, + "learning_rate": 3.0807652186212626e-05, + "loss": 0.5371, + "step": 130260 + }, + { + "epoch": 1.1516292720875545, + "grad_norm": 3.9905776977539062, + "learning_rate": 3.0806178798540755e-05, + "loss": 0.6641, + "step": 130270 + }, + { + "epoch": 1.1517176753478668, + "grad_norm": 8.161019325256348, + "learning_rate": 3.080470541086889e-05, + "loss": 0.6056, + "step": 130280 + }, + { + "epoch": 1.1518060786081792, + "grad_norm": 8.429973602294922, + "learning_rate": 3.080323202319702e-05, + "loss": 0.6007, + "step": 130290 + }, + { + "epoch": 1.1518944818684913, + "grad_norm": 2.276729106903076, + "learning_rate": 3.0801758635525146e-05, + "loss": 0.6495, + "step": 130300 + }, + { + "epoch": 1.1519828851288036, + "grad_norm": 2.459988594055176, + "learning_rate": 3.0800285247853275e-05, + "loss": 0.5556, + "step": 130310 + }, + { + "epoch": 1.1520712883891158, + "grad_norm": 1.8681766986846924, + "learning_rate": 3.07988118601814e-05, + "loss": 0.7208, + "step": 130320 + }, + { + "epoch": 1.152159691649428, + "grad_norm": 1.3446437120437622, + "learning_rate": 3.079733847250953e-05, + "loss": 0.4902, + "step": 130330 + }, + { + "epoch": 1.1522480949097402, + "grad_norm": 8.097925186157227, + "learning_rate": 3.079586508483767e-05, + "loss": 0.6125, + "step": 130340 + }, + { + "epoch": 1.1523364981700526, + "grad_norm": 3.3305063247680664, + "learning_rate": 3.0794391697165795e-05, + "loss": 0.6731, + "step": 130350 + }, + { + "epoch": 1.1524249014303647, + "grad_norm": 12.64484977722168, + "learning_rate": 3.079291830949392e-05, + "loss": 0.5832, + "step": 130360 + }, + { + "epoch": 1.152513304690677, + "grad_norm": 3.750962972640991, + "learning_rate": 3.079144492182205e-05, + "loss": 0.6567, + "step": 130370 + }, + { + "epoch": 1.1526017079509892, + "grad_norm": 3.4089438915252686, + "learning_rate": 3.078997153415018e-05, + "loss": 0.5302, + "step": 130380 + }, + { + "epoch": 1.1526901112113015, + "grad_norm": 3.4216701984405518, + "learning_rate": 3.078849814647831e-05, + "loss": 0.6514, + "step": 130390 + }, + { + "epoch": 1.1527785144716138, + "grad_norm": 1.9582709074020386, + "learning_rate": 3.078702475880644e-05, + "loss": 0.7527, + "step": 130400 + }, + { + "epoch": 1.152866917731926, + "grad_norm": 2.0037267208099365, + "learning_rate": 3.078555137113457e-05, + "loss": 0.5566, + "step": 130410 + }, + { + "epoch": 1.152955320992238, + "grad_norm": 2.7218093872070312, + "learning_rate": 3.07840779834627e-05, + "loss": 0.6755, + "step": 130420 + }, + { + "epoch": 1.1530437242525504, + "grad_norm": 3.4097325801849365, + "learning_rate": 3.078260459579083e-05, + "loss": 0.666, + "step": 130430 + }, + { + "epoch": 1.1531321275128628, + "grad_norm": 2.1065242290496826, + "learning_rate": 3.078113120811896e-05, + "loss": 0.6265, + "step": 130440 + }, + { + "epoch": 1.1532205307731749, + "grad_norm": 5.590689182281494, + "learning_rate": 3.0779657820447085e-05, + "loss": 0.635, + "step": 130450 + }, + { + "epoch": 1.1533089340334872, + "grad_norm": 1.4026074409484863, + "learning_rate": 3.0778184432775214e-05, + "loss": 0.6955, + "step": 130460 + }, + { + "epoch": 1.1533973372937993, + "grad_norm": 1.5446447134017944, + "learning_rate": 3.077671104510335e-05, + "loss": 0.5637, + "step": 130470 + }, + { + "epoch": 1.1534857405541117, + "grad_norm": 2.288461685180664, + "learning_rate": 3.077523765743147e-05, + "loss": 0.5959, + "step": 130480 + }, + { + "epoch": 1.1535741438144238, + "grad_norm": 2.1392714977264404, + "learning_rate": 3.0773764269759605e-05, + "loss": 0.5121, + "step": 130490 + }, + { + "epoch": 1.1536625470747361, + "grad_norm": 2.8267087936401367, + "learning_rate": 3.0772290882087734e-05, + "loss": 0.5936, + "step": 130500 + }, + { + "epoch": 1.1537509503350483, + "grad_norm": 0.7872088551521301, + "learning_rate": 3.077081749441586e-05, + "loss": 0.6218, + "step": 130510 + }, + { + "epoch": 1.1538393535953606, + "grad_norm": 4.260744571685791, + "learning_rate": 3.076934410674399e-05, + "loss": 0.6814, + "step": 130520 + }, + { + "epoch": 1.1539277568556727, + "grad_norm": 5.207024097442627, + "learning_rate": 3.0767870719072126e-05, + "loss": 0.6387, + "step": 130530 + }, + { + "epoch": 1.154016160115985, + "grad_norm": 2.6019461154937744, + "learning_rate": 3.076639733140025e-05, + "loss": 0.6577, + "step": 130540 + }, + { + "epoch": 1.1541045633762974, + "grad_norm": 5.212896823883057, + "learning_rate": 3.076492394372838e-05, + "loss": 0.6266, + "step": 130550 + }, + { + "epoch": 1.1541929666366095, + "grad_norm": 4.378382205963135, + "learning_rate": 3.076345055605651e-05, + "loss": 0.7377, + "step": 130560 + }, + { + "epoch": 1.1542813698969219, + "grad_norm": 1.3188276290893555, + "learning_rate": 3.076197716838464e-05, + "loss": 0.6517, + "step": 130570 + }, + { + "epoch": 1.154369773157234, + "grad_norm": 1.741511583328247, + "learning_rate": 3.076050378071277e-05, + "loss": 0.5072, + "step": 130580 + }, + { + "epoch": 1.1544581764175463, + "grad_norm": 2.215941905975342, + "learning_rate": 3.0759030393040896e-05, + "loss": 0.6724, + "step": 130590 + }, + { + "epoch": 1.1545465796778585, + "grad_norm": 1.5650781393051147, + "learning_rate": 3.0757557005369024e-05, + "loss": 0.6223, + "step": 130600 + }, + { + "epoch": 1.1546349829381708, + "grad_norm": 2.669823169708252, + "learning_rate": 3.075608361769716e-05, + "loss": 0.5961, + "step": 130610 + }, + { + "epoch": 1.154723386198483, + "grad_norm": 3.119622230529785, + "learning_rate": 3.075461023002528e-05, + "loss": 0.6584, + "step": 130620 + }, + { + "epoch": 1.1548117894587953, + "grad_norm": 3.438450574874878, + "learning_rate": 3.0753136842353416e-05, + "loss": 0.506, + "step": 130630 + }, + { + "epoch": 1.1549001927191074, + "grad_norm": 2.9536571502685547, + "learning_rate": 3.0751663454681544e-05, + "loss": 0.6153, + "step": 130640 + }, + { + "epoch": 1.1549885959794197, + "grad_norm": 13.768678665161133, + "learning_rate": 3.075019006700967e-05, + "loss": 0.6068, + "step": 130650 + }, + { + "epoch": 1.155076999239732, + "grad_norm": 5.026328086853027, + "learning_rate": 3.07487166793378e-05, + "loss": 0.7131, + "step": 130660 + }, + { + "epoch": 1.1551654025000442, + "grad_norm": 5.939401626586914, + "learning_rate": 3.0747243291665936e-05, + "loss": 0.6545, + "step": 130670 + }, + { + "epoch": 1.1552538057603565, + "grad_norm": 0.8401268720626831, + "learning_rate": 3.074576990399406e-05, + "loss": 0.4732, + "step": 130680 + }, + { + "epoch": 1.1553422090206686, + "grad_norm": 3.457369327545166, + "learning_rate": 3.074429651632219e-05, + "loss": 0.6863, + "step": 130690 + }, + { + "epoch": 1.155430612280981, + "grad_norm": 9.696551322937012, + "learning_rate": 3.074282312865032e-05, + "loss": 0.7699, + "step": 130700 + }, + { + "epoch": 1.155519015541293, + "grad_norm": 23.009403228759766, + "learning_rate": 3.074134974097845e-05, + "loss": 0.6011, + "step": 130710 + }, + { + "epoch": 1.1556074188016054, + "grad_norm": 8.823689460754395, + "learning_rate": 3.073987635330658e-05, + "loss": 0.5357, + "step": 130720 + }, + { + "epoch": 1.1556958220619176, + "grad_norm": 2.050417900085449, + "learning_rate": 3.0738402965634706e-05, + "loss": 0.6402, + "step": 130730 + }, + { + "epoch": 1.15578422532223, + "grad_norm": 5.882505416870117, + "learning_rate": 3.0736929577962835e-05, + "loss": 0.761, + "step": 130740 + }, + { + "epoch": 1.155872628582542, + "grad_norm": 2.0210752487182617, + "learning_rate": 3.073545619029097e-05, + "loss": 0.5858, + "step": 130750 + }, + { + "epoch": 1.1559610318428544, + "grad_norm": 2.07631254196167, + "learning_rate": 3.073398280261909e-05, + "loss": 0.679, + "step": 130760 + }, + { + "epoch": 1.1560494351031667, + "grad_norm": 1.1941617727279663, + "learning_rate": 3.0732509414947227e-05, + "loss": 0.6533, + "step": 130770 + }, + { + "epoch": 1.1561378383634788, + "grad_norm": 1.3758389949798584, + "learning_rate": 3.0731036027275355e-05, + "loss": 0.624, + "step": 130780 + }, + { + "epoch": 1.1562262416237912, + "grad_norm": 5.725162506103516, + "learning_rate": 3.072956263960348e-05, + "loss": 0.6447, + "step": 130790 + }, + { + "epoch": 1.1563146448841033, + "grad_norm": 3.19097638130188, + "learning_rate": 3.072808925193161e-05, + "loss": 0.6202, + "step": 130800 + }, + { + "epoch": 1.1564030481444156, + "grad_norm": 1.3096617460250854, + "learning_rate": 3.072661586425975e-05, + "loss": 0.6101, + "step": 130810 + }, + { + "epoch": 1.1564914514047278, + "grad_norm": 10.097740173339844, + "learning_rate": 3.072514247658787e-05, + "loss": 0.4957, + "step": 130820 + }, + { + "epoch": 1.15657985466504, + "grad_norm": 0.9915809035301208, + "learning_rate": 3.0723669088916003e-05, + "loss": 0.6326, + "step": 130830 + }, + { + "epoch": 1.1566682579253522, + "grad_norm": 7.710951805114746, + "learning_rate": 3.0722195701244125e-05, + "loss": 0.6739, + "step": 130840 + }, + { + "epoch": 1.1567566611856646, + "grad_norm": 2.461761713027954, + "learning_rate": 3.072072231357226e-05, + "loss": 0.5882, + "step": 130850 + }, + { + "epoch": 1.1568450644459767, + "grad_norm": 21.788585662841797, + "learning_rate": 3.071924892590039e-05, + "loss": 0.5491, + "step": 130860 + }, + { + "epoch": 1.156933467706289, + "grad_norm": 2.5861144065856934, + "learning_rate": 3.071777553822852e-05, + "loss": 0.6463, + "step": 130870 + }, + { + "epoch": 1.1570218709666014, + "grad_norm": 1.5807311534881592, + "learning_rate": 3.0716302150556645e-05, + "loss": 0.6314, + "step": 130880 + }, + { + "epoch": 1.1571102742269135, + "grad_norm": 4.687560081481934, + "learning_rate": 3.071482876288478e-05, + "loss": 0.6703, + "step": 130890 + }, + { + "epoch": 1.1571986774872258, + "grad_norm": 2.799945592880249, + "learning_rate": 3.07133553752129e-05, + "loss": 0.6551, + "step": 130900 + }, + { + "epoch": 1.157287080747538, + "grad_norm": 5.429865837097168, + "learning_rate": 3.071188198754104e-05, + "loss": 0.6678, + "step": 130910 + }, + { + "epoch": 1.1573754840078503, + "grad_norm": 2.5880908966064453, + "learning_rate": 3.0710408599869165e-05, + "loss": 0.4931, + "step": 130920 + }, + { + "epoch": 1.1574638872681624, + "grad_norm": 7.995789051055908, + "learning_rate": 3.0708935212197294e-05, + "loss": 0.6173, + "step": 130930 + }, + { + "epoch": 1.1575522905284747, + "grad_norm": 5.808766841888428, + "learning_rate": 3.070746182452542e-05, + "loss": 0.6576, + "step": 130940 + }, + { + "epoch": 1.1576406937887869, + "grad_norm": 5.3204851150512695, + "learning_rate": 3.070598843685355e-05, + "loss": 0.6186, + "step": 130950 + }, + { + "epoch": 1.1577290970490992, + "grad_norm": 8.692299842834473, + "learning_rate": 3.070451504918168e-05, + "loss": 0.6071, + "step": 130960 + }, + { + "epoch": 1.1578175003094113, + "grad_norm": 2.939260721206665, + "learning_rate": 3.0703041661509814e-05, + "loss": 0.7696, + "step": 130970 + }, + { + "epoch": 1.1579059035697237, + "grad_norm": 2.478036642074585, + "learning_rate": 3.0701568273837935e-05, + "loss": 0.5836, + "step": 130980 + }, + { + "epoch": 1.157994306830036, + "grad_norm": 2.0853664875030518, + "learning_rate": 3.070009488616607e-05, + "loss": 0.7075, + "step": 130990 + }, + { + "epoch": 1.1580827100903481, + "grad_norm": 1.2412909269332886, + "learning_rate": 3.06986214984942e-05, + "loss": 0.6712, + "step": 131000 + }, + { + "epoch": 1.1581711133506603, + "grad_norm": 1.3032063245773315, + "learning_rate": 3.069714811082233e-05, + "loss": 0.5988, + "step": 131010 + }, + { + "epoch": 1.1582595166109726, + "grad_norm": 2.303032875061035, + "learning_rate": 3.0695674723150456e-05, + "loss": 0.5472, + "step": 131020 + }, + { + "epoch": 1.158347919871285, + "grad_norm": 5.118271350860596, + "learning_rate": 3.069420133547859e-05, + "loss": 0.5835, + "step": 131030 + }, + { + "epoch": 1.158436323131597, + "grad_norm": 2.0873095989227295, + "learning_rate": 3.069272794780671e-05, + "loss": 0.5617, + "step": 131040 + }, + { + "epoch": 1.1585247263919094, + "grad_norm": 1.1297153234481812, + "learning_rate": 3.069125456013485e-05, + "loss": 0.4775, + "step": 131050 + }, + { + "epoch": 1.1586131296522215, + "grad_norm": 2.575432777404785, + "learning_rate": 3.068978117246297e-05, + "loss": 0.6477, + "step": 131060 + }, + { + "epoch": 1.1587015329125339, + "grad_norm": 2.042219877243042, + "learning_rate": 3.0688307784791104e-05, + "loss": 0.6684, + "step": 131070 + }, + { + "epoch": 1.158789936172846, + "grad_norm": 2.1652231216430664, + "learning_rate": 3.068683439711923e-05, + "loss": 0.6125, + "step": 131080 + }, + { + "epoch": 1.1588783394331583, + "grad_norm": 1.0913035869598389, + "learning_rate": 3.068536100944736e-05, + "loss": 0.5844, + "step": 131090 + }, + { + "epoch": 1.1589667426934704, + "grad_norm": 2.7634623050689697, + "learning_rate": 3.068388762177549e-05, + "loss": 0.5514, + "step": 131100 + }, + { + "epoch": 1.1590551459537828, + "grad_norm": 1.49564790725708, + "learning_rate": 3.0682414234103624e-05, + "loss": 0.5283, + "step": 131110 + }, + { + "epoch": 1.159143549214095, + "grad_norm": 3.3530147075653076, + "learning_rate": 3.0680940846431746e-05, + "loss": 0.6858, + "step": 131120 + }, + { + "epoch": 1.1592319524744072, + "grad_norm": 2.3691318035125732, + "learning_rate": 3.067946745875988e-05, + "loss": 0.6402, + "step": 131130 + }, + { + "epoch": 1.1593203557347196, + "grad_norm": 4.105404376983643, + "learning_rate": 3.067799407108801e-05, + "loss": 0.6941, + "step": 131140 + }, + { + "epoch": 1.1594087589950317, + "grad_norm": 2.8768675327301025, + "learning_rate": 3.067652068341614e-05, + "loss": 0.7107, + "step": 131150 + }, + { + "epoch": 1.159497162255344, + "grad_norm": 5.293215751647949, + "learning_rate": 3.0675047295744266e-05, + "loss": 0.6305, + "step": 131160 + }, + { + "epoch": 1.1595855655156562, + "grad_norm": 2.359454393386841, + "learning_rate": 3.06735739080724e-05, + "loss": 0.6586, + "step": 131170 + }, + { + "epoch": 1.1596739687759685, + "grad_norm": 2.1791038513183594, + "learning_rate": 3.067210052040052e-05, + "loss": 0.6774, + "step": 131180 + }, + { + "epoch": 1.1597623720362806, + "grad_norm": 1.4351047277450562, + "learning_rate": 3.067062713272866e-05, + "loss": 0.615, + "step": 131190 + }, + { + "epoch": 1.159850775296593, + "grad_norm": 7.623877048492432, + "learning_rate": 3.0669153745056786e-05, + "loss": 0.549, + "step": 131200 + }, + { + "epoch": 1.159939178556905, + "grad_norm": 2.908419609069824, + "learning_rate": 3.0667680357384915e-05, + "loss": 0.4899, + "step": 131210 + }, + { + "epoch": 1.1600275818172174, + "grad_norm": 2.050945520401001, + "learning_rate": 3.066620696971304e-05, + "loss": 0.6592, + "step": 131220 + }, + { + "epoch": 1.1601159850775296, + "grad_norm": 1.3170599937438965, + "learning_rate": 3.066473358204117e-05, + "loss": 0.559, + "step": 131230 + }, + { + "epoch": 1.160204388337842, + "grad_norm": 6.787662029266357, + "learning_rate": 3.0663260194369307e-05, + "loss": 0.5855, + "step": 131240 + }, + { + "epoch": 1.1602927915981542, + "grad_norm": 1.2695868015289307, + "learning_rate": 3.0661786806697435e-05, + "loss": 0.5806, + "step": 131250 + }, + { + "epoch": 1.1603811948584664, + "grad_norm": 1.4428120851516724, + "learning_rate": 3.066031341902556e-05, + "loss": 0.5773, + "step": 131260 + }, + { + "epoch": 1.1604695981187787, + "grad_norm": 4.806600570678711, + "learning_rate": 3.065884003135369e-05, + "loss": 0.5664, + "step": 131270 + }, + { + "epoch": 1.1605580013790908, + "grad_norm": 12.411433219909668, + "learning_rate": 3.065736664368182e-05, + "loss": 0.6255, + "step": 131280 + }, + { + "epoch": 1.1606464046394032, + "grad_norm": 1.616111159324646, + "learning_rate": 3.065589325600995e-05, + "loss": 0.6148, + "step": 131290 + }, + { + "epoch": 1.1607348078997153, + "grad_norm": 1.4758148193359375, + "learning_rate": 3.0654419868338083e-05, + "loss": 0.5684, + "step": 131300 + }, + { + "epoch": 1.1608232111600276, + "grad_norm": 1.1272330284118652, + "learning_rate": 3.0652946480666205e-05, + "loss": 0.6464, + "step": 131310 + }, + { + "epoch": 1.1609116144203397, + "grad_norm": 4.339254379272461, + "learning_rate": 3.065147309299434e-05, + "loss": 0.6755, + "step": 131320 + }, + { + "epoch": 1.161000017680652, + "grad_norm": 3.4238412380218506, + "learning_rate": 3.064999970532247e-05, + "loss": 0.4863, + "step": 131330 + }, + { + "epoch": 1.1610884209409642, + "grad_norm": 2.6474673748016357, + "learning_rate": 3.06485263176506e-05, + "loss": 0.5196, + "step": 131340 + }, + { + "epoch": 1.1611768242012765, + "grad_norm": 1.2571967840194702, + "learning_rate": 3.0647052929978725e-05, + "loss": 0.6095, + "step": 131350 + }, + { + "epoch": 1.161265227461589, + "grad_norm": 8.376547813415527, + "learning_rate": 3.064557954230686e-05, + "loss": 0.576, + "step": 131360 + }, + { + "epoch": 1.161353630721901, + "grad_norm": 4.1423773765563965, + "learning_rate": 3.064410615463498e-05, + "loss": 0.6226, + "step": 131370 + }, + { + "epoch": 1.1614420339822134, + "grad_norm": 2.3484625816345215, + "learning_rate": 3.064263276696312e-05, + "loss": 0.6793, + "step": 131380 + }, + { + "epoch": 1.1615304372425255, + "grad_norm": 1.7184317111968994, + "learning_rate": 3.0641159379291245e-05, + "loss": 0.5015, + "step": 131390 + }, + { + "epoch": 1.1616188405028378, + "grad_norm": 1.766471028327942, + "learning_rate": 3.0639685991619374e-05, + "loss": 0.6157, + "step": 131400 + }, + { + "epoch": 1.16170724376315, + "grad_norm": 2.537006139755249, + "learning_rate": 3.06382126039475e-05, + "loss": 0.7882, + "step": 131410 + }, + { + "epoch": 1.1617956470234623, + "grad_norm": 3.338129997253418, + "learning_rate": 3.063673921627563e-05, + "loss": 0.6511, + "step": 131420 + }, + { + "epoch": 1.1618840502837744, + "grad_norm": 1.5497372150421143, + "learning_rate": 3.063526582860376e-05, + "loss": 0.5561, + "step": 131430 + }, + { + "epoch": 1.1619724535440867, + "grad_norm": 4.003422737121582, + "learning_rate": 3.0633792440931894e-05, + "loss": 0.601, + "step": 131440 + }, + { + "epoch": 1.1620608568043989, + "grad_norm": 1.6146740913391113, + "learning_rate": 3.0632319053260016e-05, + "loss": 0.6574, + "step": 131450 + }, + { + "epoch": 1.1621492600647112, + "grad_norm": 1.4565777778625488, + "learning_rate": 3.063084566558815e-05, + "loss": 0.6761, + "step": 131460 + }, + { + "epoch": 1.1622376633250235, + "grad_norm": 1.5680956840515137, + "learning_rate": 3.062937227791628e-05, + "loss": 0.5231, + "step": 131470 + }, + { + "epoch": 1.1623260665853357, + "grad_norm": 1.0105998516082764, + "learning_rate": 3.062789889024441e-05, + "loss": 0.6894, + "step": 131480 + }, + { + "epoch": 1.162414469845648, + "grad_norm": 1.1560819149017334, + "learning_rate": 3.0626425502572536e-05, + "loss": 0.7437, + "step": 131490 + }, + { + "epoch": 1.1625028731059601, + "grad_norm": 2.643648147583008, + "learning_rate": 3.062495211490067e-05, + "loss": 0.6468, + "step": 131500 + }, + { + "epoch": 1.1625912763662725, + "grad_norm": 3.2028660774230957, + "learning_rate": 3.062347872722879e-05, + "loss": 0.6121, + "step": 131510 + }, + { + "epoch": 1.1626796796265846, + "grad_norm": 2.450869083404541, + "learning_rate": 3.062200533955693e-05, + "loss": 0.7223, + "step": 131520 + }, + { + "epoch": 1.162768082886897, + "grad_norm": 0.9343979954719543, + "learning_rate": 3.0620531951885056e-05, + "loss": 0.6489, + "step": 131530 + }, + { + "epoch": 1.162856486147209, + "grad_norm": 0.8441886305809021, + "learning_rate": 3.0619058564213184e-05, + "loss": 0.5035, + "step": 131540 + }, + { + "epoch": 1.1629448894075214, + "grad_norm": 6.100572109222412, + "learning_rate": 3.061758517654131e-05, + "loss": 0.5692, + "step": 131550 + }, + { + "epoch": 1.1630332926678335, + "grad_norm": 5.454118728637695, + "learning_rate": 3.061611178886944e-05, + "loss": 0.5874, + "step": 131560 + }, + { + "epoch": 1.1631216959281458, + "grad_norm": 1.5155504941940308, + "learning_rate": 3.061463840119757e-05, + "loss": 0.6349, + "step": 131570 + }, + { + "epoch": 1.1632100991884582, + "grad_norm": 7.628772258758545, + "learning_rate": 3.0613165013525704e-05, + "loss": 0.5355, + "step": 131580 + }, + { + "epoch": 1.1632985024487703, + "grad_norm": 0.8436858654022217, + "learning_rate": 3.0611691625853826e-05, + "loss": 0.5978, + "step": 131590 + }, + { + "epoch": 1.1633869057090824, + "grad_norm": 1.6208841800689697, + "learning_rate": 3.061021823818196e-05, + "loss": 0.598, + "step": 131600 + }, + { + "epoch": 1.1634753089693948, + "grad_norm": 1.880710482597351, + "learning_rate": 3.060874485051009e-05, + "loss": 0.6345, + "step": 131610 + }, + { + "epoch": 1.1635637122297071, + "grad_norm": 2.420870780944824, + "learning_rate": 3.060727146283822e-05, + "loss": 0.5964, + "step": 131620 + }, + { + "epoch": 1.1636521154900192, + "grad_norm": 5.38777494430542, + "learning_rate": 3.0605798075166346e-05, + "loss": 0.6127, + "step": 131630 + }, + { + "epoch": 1.1637405187503316, + "grad_norm": 1.035424828529358, + "learning_rate": 3.060432468749448e-05, + "loss": 0.6733, + "step": 131640 + }, + { + "epoch": 1.1638289220106437, + "grad_norm": 2.5046041011810303, + "learning_rate": 3.06028512998226e-05, + "loss": 0.5019, + "step": 131650 + }, + { + "epoch": 1.163917325270956, + "grad_norm": 1.4642952680587769, + "learning_rate": 3.060137791215074e-05, + "loss": 0.7098, + "step": 131660 + }, + { + "epoch": 1.1640057285312682, + "grad_norm": 3.817128896713257, + "learning_rate": 3.059990452447886e-05, + "loss": 0.6361, + "step": 131670 + }, + { + "epoch": 1.1640941317915805, + "grad_norm": 3.3395297527313232, + "learning_rate": 3.0598431136806995e-05, + "loss": 0.7037, + "step": 131680 + }, + { + "epoch": 1.1641825350518926, + "grad_norm": 0.9705575108528137, + "learning_rate": 3.059695774913512e-05, + "loss": 0.7348, + "step": 131690 + }, + { + "epoch": 1.164270938312205, + "grad_norm": 1.3147315979003906, + "learning_rate": 3.059548436146325e-05, + "loss": 0.6297, + "step": 131700 + }, + { + "epoch": 1.164359341572517, + "grad_norm": 3.56866192817688, + "learning_rate": 3.059401097379138e-05, + "loss": 0.5257, + "step": 131710 + }, + { + "epoch": 1.1644477448328294, + "grad_norm": 1.7186813354492188, + "learning_rate": 3.0592537586119515e-05, + "loss": 0.6567, + "step": 131720 + }, + { + "epoch": 1.1645361480931418, + "grad_norm": 3.382847547531128, + "learning_rate": 3.0591064198447637e-05, + "loss": 0.6167, + "step": 131730 + }, + { + "epoch": 1.1646245513534539, + "grad_norm": 5.220097064971924, + "learning_rate": 3.058959081077577e-05, + "loss": 0.4954, + "step": 131740 + }, + { + "epoch": 1.1647129546137662, + "grad_norm": 8.415491104125977, + "learning_rate": 3.05881174231039e-05, + "loss": 0.6588, + "step": 131750 + }, + { + "epoch": 1.1648013578740783, + "grad_norm": 2.3298912048339844, + "learning_rate": 3.058664403543203e-05, + "loss": 0.556, + "step": 131760 + }, + { + "epoch": 1.1648897611343907, + "grad_norm": 1.4740550518035889, + "learning_rate": 3.058517064776016e-05, + "loss": 0.6669, + "step": 131770 + }, + { + "epoch": 1.1649781643947028, + "grad_norm": 3.5775668621063232, + "learning_rate": 3.0583697260088285e-05, + "loss": 0.6344, + "step": 131780 + }, + { + "epoch": 1.1650665676550152, + "grad_norm": 2.814379930496216, + "learning_rate": 3.0582223872416413e-05, + "loss": 0.6525, + "step": 131790 + }, + { + "epoch": 1.1651549709153273, + "grad_norm": 2.768153429031372, + "learning_rate": 3.058075048474455e-05, + "loss": 0.648, + "step": 131800 + }, + { + "epoch": 1.1652433741756396, + "grad_norm": 8.114901542663574, + "learning_rate": 3.057927709707267e-05, + "loss": 0.5869, + "step": 131810 + }, + { + "epoch": 1.1653317774359517, + "grad_norm": 2.019869804382324, + "learning_rate": 3.0577803709400805e-05, + "loss": 0.6177, + "step": 131820 + }, + { + "epoch": 1.165420180696264, + "grad_norm": 0.9405084252357483, + "learning_rate": 3.0576330321728934e-05, + "loss": 0.5722, + "step": 131830 + }, + { + "epoch": 1.1655085839565764, + "grad_norm": 2.8537769317626953, + "learning_rate": 3.057485693405706e-05, + "loss": 0.5964, + "step": 131840 + }, + { + "epoch": 1.1655969872168885, + "grad_norm": 3.6890366077423096, + "learning_rate": 3.057338354638519e-05, + "loss": 0.7653, + "step": 131850 + }, + { + "epoch": 1.1656853904772009, + "grad_norm": 1.1687262058258057, + "learning_rate": 3.0571910158713326e-05, + "loss": 0.7162, + "step": 131860 + }, + { + "epoch": 1.165773793737513, + "grad_norm": 2.0385937690734863, + "learning_rate": 3.057043677104145e-05, + "loss": 0.6421, + "step": 131870 + }, + { + "epoch": 1.1658621969978253, + "grad_norm": 5.374686241149902, + "learning_rate": 3.056896338336958e-05, + "loss": 0.5068, + "step": 131880 + }, + { + "epoch": 1.1659506002581375, + "grad_norm": 1.3698152303695679, + "learning_rate": 3.0567489995697704e-05, + "loss": 0.6813, + "step": 131890 + }, + { + "epoch": 1.1660390035184498, + "grad_norm": 7.347153663635254, + "learning_rate": 3.056601660802584e-05, + "loss": 0.6302, + "step": 131900 + }, + { + "epoch": 1.166127406778762, + "grad_norm": 26.705734252929688, + "learning_rate": 3.056454322035397e-05, + "loss": 0.6034, + "step": 131910 + }, + { + "epoch": 1.1662158100390743, + "grad_norm": 1.0941476821899414, + "learning_rate": 3.0563069832682096e-05, + "loss": 0.5443, + "step": 131920 + }, + { + "epoch": 1.1663042132993864, + "grad_norm": 14.015328407287598, + "learning_rate": 3.0561596445010224e-05, + "loss": 0.5796, + "step": 131930 + }, + { + "epoch": 1.1663926165596987, + "grad_norm": 2.918259859085083, + "learning_rate": 3.056012305733836e-05, + "loss": 0.6019, + "step": 131940 + }, + { + "epoch": 1.166481019820011, + "grad_norm": 2.0471794605255127, + "learning_rate": 3.055864966966648e-05, + "loss": 0.7929, + "step": 131950 + }, + { + "epoch": 1.1665694230803232, + "grad_norm": 6.047561168670654, + "learning_rate": 3.0557176281994616e-05, + "loss": 0.5976, + "step": 131960 + }, + { + "epoch": 1.1666578263406355, + "grad_norm": 1.3194369077682495, + "learning_rate": 3.0555702894322744e-05, + "loss": 0.4825, + "step": 131970 + }, + { + "epoch": 1.1667462296009476, + "grad_norm": 2.630715847015381, + "learning_rate": 3.055422950665087e-05, + "loss": 0.5792, + "step": 131980 + }, + { + "epoch": 1.16683463286126, + "grad_norm": 1.671630620956421, + "learning_rate": 3.0552756118979e-05, + "loss": 0.6355, + "step": 131990 + }, + { + "epoch": 1.166923036121572, + "grad_norm": 1.4064486026763916, + "learning_rate": 3.0551282731307136e-05, + "loss": 0.554, + "step": 132000 + }, + { + "epoch": 1.1670114393818845, + "grad_norm": 1.7668626308441162, + "learning_rate": 3.054980934363526e-05, + "loss": 0.6176, + "step": 132010 + }, + { + "epoch": 1.1670998426421966, + "grad_norm": 6.646592140197754, + "learning_rate": 3.054833595596339e-05, + "loss": 0.6725, + "step": 132020 + }, + { + "epoch": 1.167188245902509, + "grad_norm": 15.09215259552002, + "learning_rate": 3.054686256829152e-05, + "loss": 0.5482, + "step": 132030 + }, + { + "epoch": 1.167276649162821, + "grad_norm": 3.1803817749023438, + "learning_rate": 3.054538918061965e-05, + "loss": 0.7204, + "step": 132040 + }, + { + "epoch": 1.1673650524231334, + "grad_norm": 5.396078586578369, + "learning_rate": 3.054391579294778e-05, + "loss": 0.5977, + "step": 132050 + }, + { + "epoch": 1.1674534556834457, + "grad_norm": 4.771073818206787, + "learning_rate": 3.0542442405275906e-05, + "loss": 0.7139, + "step": 132060 + }, + { + "epoch": 1.1675418589437578, + "grad_norm": 13.232796669006348, + "learning_rate": 3.0540969017604035e-05, + "loss": 0.6498, + "step": 132070 + }, + { + "epoch": 1.1676302622040702, + "grad_norm": 1.5892701148986816, + "learning_rate": 3.053949562993217e-05, + "loss": 0.5872, + "step": 132080 + }, + { + "epoch": 1.1677186654643823, + "grad_norm": 2.342539072036743, + "learning_rate": 3.05380222422603e-05, + "loss": 0.6666, + "step": 132090 + }, + { + "epoch": 1.1678070687246946, + "grad_norm": 2.24224853515625, + "learning_rate": 3.0536548854588426e-05, + "loss": 0.6869, + "step": 132100 + }, + { + "epoch": 1.1678954719850068, + "grad_norm": 3.398423194885254, + "learning_rate": 3.0535075466916555e-05, + "loss": 0.6473, + "step": 132110 + }, + { + "epoch": 1.167983875245319, + "grad_norm": 1.142639398574829, + "learning_rate": 3.053360207924468e-05, + "loss": 0.5882, + "step": 132120 + }, + { + "epoch": 1.1680722785056312, + "grad_norm": 4.320949554443359, + "learning_rate": 3.053212869157281e-05, + "loss": 0.5157, + "step": 132130 + }, + { + "epoch": 1.1681606817659436, + "grad_norm": 2.6523942947387695, + "learning_rate": 3.053065530390094e-05, + "loss": 0.728, + "step": 132140 + }, + { + "epoch": 1.1682490850262557, + "grad_norm": 9.76197624206543, + "learning_rate": 3.0529181916229075e-05, + "loss": 0.5348, + "step": 132150 + }, + { + "epoch": 1.168337488286568, + "grad_norm": 2.3899006843566895, + "learning_rate": 3.05277085285572e-05, + "loss": 0.6289, + "step": 132160 + }, + { + "epoch": 1.1684258915468804, + "grad_norm": 4.8326263427734375, + "learning_rate": 3.052623514088533e-05, + "loss": 0.5918, + "step": 132170 + }, + { + "epoch": 1.1685142948071925, + "grad_norm": 1.922325611114502, + "learning_rate": 3.052476175321346e-05, + "loss": 0.56, + "step": 132180 + }, + { + "epoch": 1.1686026980675046, + "grad_norm": 1.5457454919815063, + "learning_rate": 3.052328836554159e-05, + "loss": 0.5192, + "step": 132190 + }, + { + "epoch": 1.168691101327817, + "grad_norm": 2.1275956630706787, + "learning_rate": 3.052181497786972e-05, + "loss": 0.7522, + "step": 132200 + }, + { + "epoch": 1.1687795045881293, + "grad_norm": 0.6837577223777771, + "learning_rate": 3.052034159019785e-05, + "loss": 0.6392, + "step": 132210 + }, + { + "epoch": 1.1688679078484414, + "grad_norm": 3.106041669845581, + "learning_rate": 3.051886820252598e-05, + "loss": 0.6022, + "step": 132220 + }, + { + "epoch": 1.1689563111087538, + "grad_norm": 1.675716757774353, + "learning_rate": 3.0517394814854105e-05, + "loss": 0.5052, + "step": 132230 + }, + { + "epoch": 1.1690447143690659, + "grad_norm": 1.3169463872909546, + "learning_rate": 3.0515921427182237e-05, + "loss": 0.6393, + "step": 132240 + }, + { + "epoch": 1.1691331176293782, + "grad_norm": 3.624569892883301, + "learning_rate": 3.0514448039510362e-05, + "loss": 0.7519, + "step": 132250 + }, + { + "epoch": 1.1692215208896903, + "grad_norm": 1.2276462316513062, + "learning_rate": 3.0512974651838494e-05, + "loss": 0.4475, + "step": 132260 + }, + { + "epoch": 1.1693099241500027, + "grad_norm": 4.70722770690918, + "learning_rate": 3.0511501264166625e-05, + "loss": 0.5525, + "step": 132270 + }, + { + "epoch": 1.1693983274103148, + "grad_norm": 2.2845730781555176, + "learning_rate": 3.051002787649475e-05, + "loss": 0.6448, + "step": 132280 + }, + { + "epoch": 1.1694867306706271, + "grad_norm": 1.9513633251190186, + "learning_rate": 3.0508554488822882e-05, + "loss": 0.6228, + "step": 132290 + }, + { + "epoch": 1.1695751339309393, + "grad_norm": 5.279168128967285, + "learning_rate": 3.0507081101151014e-05, + "loss": 0.6417, + "step": 132300 + }, + { + "epoch": 1.1696635371912516, + "grad_norm": 2.700528860092163, + "learning_rate": 3.050560771347914e-05, + "loss": 0.6448, + "step": 132310 + }, + { + "epoch": 1.169751940451564, + "grad_norm": 6.037847518920898, + "learning_rate": 3.050413432580727e-05, + "loss": 0.659, + "step": 132320 + }, + { + "epoch": 1.169840343711876, + "grad_norm": 2.064674139022827, + "learning_rate": 3.0502660938135402e-05, + "loss": 0.6646, + "step": 132330 + }, + { + "epoch": 1.1699287469721884, + "grad_norm": 1.5199215412139893, + "learning_rate": 3.0501187550463527e-05, + "loss": 0.6334, + "step": 132340 + }, + { + "epoch": 1.1700171502325005, + "grad_norm": 1.3490065336227417, + "learning_rate": 3.049971416279166e-05, + "loss": 0.5442, + "step": 132350 + }, + { + "epoch": 1.1701055534928129, + "grad_norm": 3.5852301120758057, + "learning_rate": 3.0498240775119784e-05, + "loss": 0.6561, + "step": 132360 + }, + { + "epoch": 1.170193956753125, + "grad_norm": 2.237496852874756, + "learning_rate": 3.0496767387447916e-05, + "loss": 0.5177, + "step": 132370 + }, + { + "epoch": 1.1702823600134373, + "grad_norm": 7.085289001464844, + "learning_rate": 3.0495293999776047e-05, + "loss": 0.4908, + "step": 132380 + }, + { + "epoch": 1.1703707632737494, + "grad_norm": 3.303255081176758, + "learning_rate": 3.0493820612104172e-05, + "loss": 0.5905, + "step": 132390 + }, + { + "epoch": 1.1704591665340618, + "grad_norm": 3.2829606533050537, + "learning_rate": 3.0492347224432304e-05, + "loss": 0.6725, + "step": 132400 + }, + { + "epoch": 1.170547569794374, + "grad_norm": 2.7317938804626465, + "learning_rate": 3.0490873836760436e-05, + "loss": 0.6341, + "step": 132410 + }, + { + "epoch": 1.1706359730546863, + "grad_norm": 1.486572504043579, + "learning_rate": 3.048940044908856e-05, + "loss": 0.4655, + "step": 132420 + }, + { + "epoch": 1.1707243763149986, + "grad_norm": 2.321272373199463, + "learning_rate": 3.0487927061416693e-05, + "loss": 0.737, + "step": 132430 + }, + { + "epoch": 1.1708127795753107, + "grad_norm": 2.659299373626709, + "learning_rate": 3.0486453673744824e-05, + "loss": 0.638, + "step": 132440 + }, + { + "epoch": 1.170901182835623, + "grad_norm": 6.0426106452941895, + "learning_rate": 3.048498028607295e-05, + "loss": 0.601, + "step": 132450 + }, + { + "epoch": 1.1709895860959352, + "grad_norm": 1.1585391759872437, + "learning_rate": 3.048350689840108e-05, + "loss": 0.5628, + "step": 132460 + }, + { + "epoch": 1.1710779893562475, + "grad_norm": 1.6140022277832031, + "learning_rate": 3.0482033510729213e-05, + "loss": 0.5131, + "step": 132470 + }, + { + "epoch": 1.1711663926165596, + "grad_norm": 1.9316903352737427, + "learning_rate": 3.0480560123057338e-05, + "loss": 0.6078, + "step": 132480 + }, + { + "epoch": 1.171254795876872, + "grad_norm": 5.598964691162109, + "learning_rate": 3.047908673538547e-05, + "loss": 0.7338, + "step": 132490 + }, + { + "epoch": 1.171343199137184, + "grad_norm": 2.369739055633545, + "learning_rate": 3.0477613347713598e-05, + "loss": 0.6196, + "step": 132500 + }, + { + "epoch": 1.1714316023974964, + "grad_norm": 14.49877643585205, + "learning_rate": 3.0476139960041726e-05, + "loss": 0.7379, + "step": 132510 + }, + { + "epoch": 1.1715200056578086, + "grad_norm": 1.8578732013702393, + "learning_rate": 3.0474666572369858e-05, + "loss": 0.5704, + "step": 132520 + }, + { + "epoch": 1.171608408918121, + "grad_norm": 2.9169743061065674, + "learning_rate": 3.0473193184697986e-05, + "loss": 0.6851, + "step": 132530 + }, + { + "epoch": 1.1716968121784332, + "grad_norm": 1.8212313652038574, + "learning_rate": 3.0471719797026115e-05, + "loss": 0.5748, + "step": 132540 + }, + { + "epoch": 1.1717852154387454, + "grad_norm": 6.714269161224365, + "learning_rate": 3.0470246409354246e-05, + "loss": 0.5968, + "step": 132550 + }, + { + "epoch": 1.1718736186990577, + "grad_norm": 0.8977352380752563, + "learning_rate": 3.0468773021682375e-05, + "loss": 0.5727, + "step": 132560 + }, + { + "epoch": 1.1719620219593698, + "grad_norm": 4.852736949920654, + "learning_rate": 3.0467299634010503e-05, + "loss": 0.5725, + "step": 132570 + }, + { + "epoch": 1.1720504252196822, + "grad_norm": 1.6195685863494873, + "learning_rate": 3.0465826246338635e-05, + "loss": 0.5958, + "step": 132580 + }, + { + "epoch": 1.1721388284799943, + "grad_norm": 2.600242853164673, + "learning_rate": 3.0464352858666763e-05, + "loss": 0.6953, + "step": 132590 + }, + { + "epoch": 1.1722272317403066, + "grad_norm": 1.6285310983657837, + "learning_rate": 3.046287947099489e-05, + "loss": 0.6017, + "step": 132600 + }, + { + "epoch": 1.1723156350006187, + "grad_norm": 16.744966506958008, + "learning_rate": 3.046140608332302e-05, + "loss": 0.7311, + "step": 132610 + }, + { + "epoch": 1.172404038260931, + "grad_norm": 2.718080997467041, + "learning_rate": 3.045993269565115e-05, + "loss": 0.7, + "step": 132620 + }, + { + "epoch": 1.1724924415212432, + "grad_norm": 5.0117998123168945, + "learning_rate": 3.045845930797928e-05, + "loss": 0.5889, + "step": 132630 + }, + { + "epoch": 1.1725808447815556, + "grad_norm": 3.1515860557556152, + "learning_rate": 3.0456985920307408e-05, + "loss": 0.5895, + "step": 132640 + }, + { + "epoch": 1.172669248041868, + "grad_norm": 2.0903701782226562, + "learning_rate": 3.045551253263554e-05, + "loss": 0.5522, + "step": 132650 + }, + { + "epoch": 1.17275765130218, + "grad_norm": 7.477038383483887, + "learning_rate": 3.045403914496367e-05, + "loss": 0.715, + "step": 132660 + }, + { + "epoch": 1.1728460545624924, + "grad_norm": 1.654893159866333, + "learning_rate": 3.0452565757291797e-05, + "loss": 0.5086, + "step": 132670 + }, + { + "epoch": 1.1729344578228045, + "grad_norm": 2.200843095779419, + "learning_rate": 3.045109236961993e-05, + "loss": 0.5562, + "step": 132680 + }, + { + "epoch": 1.1730228610831168, + "grad_norm": 4.3543009757995605, + "learning_rate": 3.0449618981948057e-05, + "loss": 0.6624, + "step": 132690 + }, + { + "epoch": 1.173111264343429, + "grad_norm": 5.937431812286377, + "learning_rate": 3.0448145594276185e-05, + "loss": 0.7204, + "step": 132700 + }, + { + "epoch": 1.1731996676037413, + "grad_norm": 1.456529140472412, + "learning_rate": 3.0446672206604317e-05, + "loss": 0.5367, + "step": 132710 + }, + { + "epoch": 1.1732880708640534, + "grad_norm": 6.336716175079346, + "learning_rate": 3.0445198818932442e-05, + "loss": 0.5996, + "step": 132720 + }, + { + "epoch": 1.1733764741243657, + "grad_norm": 3.082418918609619, + "learning_rate": 3.0443725431260574e-05, + "loss": 0.6777, + "step": 132730 + }, + { + "epoch": 1.1734648773846779, + "grad_norm": 4.456279277801514, + "learning_rate": 3.0442252043588705e-05, + "loss": 0.6322, + "step": 132740 + }, + { + "epoch": 1.1735532806449902, + "grad_norm": 1.7743403911590576, + "learning_rate": 3.044077865591683e-05, + "loss": 0.5483, + "step": 132750 + }, + { + "epoch": 1.1736416839053025, + "grad_norm": 7.261760234832764, + "learning_rate": 3.0439305268244962e-05, + "loss": 0.614, + "step": 132760 + }, + { + "epoch": 1.1737300871656147, + "grad_norm": 2.1416265964508057, + "learning_rate": 3.0437831880573094e-05, + "loss": 0.4852, + "step": 132770 + }, + { + "epoch": 1.1738184904259268, + "grad_norm": 5.894510746002197, + "learning_rate": 3.043635849290122e-05, + "loss": 0.6076, + "step": 132780 + }, + { + "epoch": 1.1739068936862391, + "grad_norm": 1.7651535272598267, + "learning_rate": 3.043488510522935e-05, + "loss": 0.5231, + "step": 132790 + }, + { + "epoch": 1.1739952969465515, + "grad_norm": 3.848479747772217, + "learning_rate": 3.0433411717557482e-05, + "loss": 0.6342, + "step": 132800 + }, + { + "epoch": 1.1740837002068636, + "grad_norm": 1.8462088108062744, + "learning_rate": 3.0431938329885607e-05, + "loss": 0.5902, + "step": 132810 + }, + { + "epoch": 1.174172103467176, + "grad_norm": 2.1441943645477295, + "learning_rate": 3.043046494221374e-05, + "loss": 0.5651, + "step": 132820 + }, + { + "epoch": 1.174260506727488, + "grad_norm": 1.283730149269104, + "learning_rate": 3.0428991554541864e-05, + "loss": 0.5917, + "step": 132830 + }, + { + "epoch": 1.1743489099878004, + "grad_norm": 3.028813362121582, + "learning_rate": 3.0427518166869996e-05, + "loss": 0.6284, + "step": 132840 + }, + { + "epoch": 1.1744373132481125, + "grad_norm": 10.345865249633789, + "learning_rate": 3.0426044779198127e-05, + "loss": 0.5326, + "step": 132850 + }, + { + "epoch": 1.1745257165084249, + "grad_norm": 16.154863357543945, + "learning_rate": 3.0424571391526252e-05, + "loss": 0.6167, + "step": 132860 + }, + { + "epoch": 1.174614119768737, + "grad_norm": 5.269153594970703, + "learning_rate": 3.0423098003854384e-05, + "loss": 0.5542, + "step": 132870 + }, + { + "epoch": 1.1747025230290493, + "grad_norm": 2.8429760932922363, + "learning_rate": 3.0421624616182516e-05, + "loss": 0.5917, + "step": 132880 + }, + { + "epoch": 1.1747909262893614, + "grad_norm": 3.219909429550171, + "learning_rate": 3.042015122851064e-05, + "loss": 0.7126, + "step": 132890 + }, + { + "epoch": 1.1748793295496738, + "grad_norm": 3.921576499938965, + "learning_rate": 3.0418677840838773e-05, + "loss": 0.6151, + "step": 132900 + }, + { + "epoch": 1.1749677328099861, + "grad_norm": 2.368994951248169, + "learning_rate": 3.0417204453166904e-05, + "loss": 0.6771, + "step": 132910 + }, + { + "epoch": 1.1750561360702982, + "grad_norm": 10.023112297058105, + "learning_rate": 3.041573106549503e-05, + "loss": 0.602, + "step": 132920 + }, + { + "epoch": 1.1751445393306106, + "grad_norm": 7.554394721984863, + "learning_rate": 3.041425767782316e-05, + "loss": 0.5795, + "step": 132930 + }, + { + "epoch": 1.1752329425909227, + "grad_norm": 2.894916296005249, + "learning_rate": 3.0412784290151293e-05, + "loss": 0.6606, + "step": 132940 + }, + { + "epoch": 1.175321345851235, + "grad_norm": 4.400060653686523, + "learning_rate": 3.0411310902479418e-05, + "loss": 0.5947, + "step": 132950 + }, + { + "epoch": 1.1754097491115472, + "grad_norm": 1.7976292371749878, + "learning_rate": 3.040983751480755e-05, + "loss": 0.6628, + "step": 132960 + }, + { + "epoch": 1.1754981523718595, + "grad_norm": 3.646946668624878, + "learning_rate": 3.0408364127135674e-05, + "loss": 0.6524, + "step": 132970 + }, + { + "epoch": 1.1755865556321716, + "grad_norm": 1.9021406173706055, + "learning_rate": 3.0406890739463806e-05, + "loss": 0.4782, + "step": 132980 + }, + { + "epoch": 1.175674958892484, + "grad_norm": 2.6747212409973145, + "learning_rate": 3.0405417351791938e-05, + "loss": 0.7948, + "step": 132990 + }, + { + "epoch": 1.175763362152796, + "grad_norm": 4.680131912231445, + "learning_rate": 3.0403943964120063e-05, + "loss": 0.6384, + "step": 133000 + }, + { + "epoch": 1.1758517654131084, + "grad_norm": 3.6572086811065674, + "learning_rate": 3.0402470576448195e-05, + "loss": 0.6263, + "step": 133010 + }, + { + "epoch": 1.1759401686734208, + "grad_norm": 2.589233160018921, + "learning_rate": 3.0400997188776326e-05, + "loss": 0.5465, + "step": 133020 + }, + { + "epoch": 1.176028571933733, + "grad_norm": 1.347601056098938, + "learning_rate": 3.039952380110445e-05, + "loss": 0.6568, + "step": 133030 + }, + { + "epoch": 1.1761169751940452, + "grad_norm": 2.90478515625, + "learning_rate": 3.0398050413432583e-05, + "loss": 0.6734, + "step": 133040 + }, + { + "epoch": 1.1762053784543574, + "grad_norm": 3.321408748626709, + "learning_rate": 3.0396577025760715e-05, + "loss": 0.7163, + "step": 133050 + }, + { + "epoch": 1.1762937817146697, + "grad_norm": 8.990157127380371, + "learning_rate": 3.039510363808884e-05, + "loss": 0.6033, + "step": 133060 + }, + { + "epoch": 1.1763821849749818, + "grad_norm": 6.883105278015137, + "learning_rate": 3.039363025041697e-05, + "loss": 0.6572, + "step": 133070 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 6.7309041023254395, + "learning_rate": 3.0392156862745097e-05, + "loss": 0.5922, + "step": 133080 + }, + { + "epoch": 1.1765589914956063, + "grad_norm": 1.2131184339523315, + "learning_rate": 3.0390683475073228e-05, + "loss": 0.6746, + "step": 133090 + }, + { + "epoch": 1.1766473947559186, + "grad_norm": 3.128512382507324, + "learning_rate": 3.038921008740136e-05, + "loss": 0.6037, + "step": 133100 + }, + { + "epoch": 1.1767357980162307, + "grad_norm": 2.463557481765747, + "learning_rate": 3.0387736699729485e-05, + "loss": 0.5792, + "step": 133110 + }, + { + "epoch": 1.176824201276543, + "grad_norm": 1.2875701189041138, + "learning_rate": 3.0386263312057617e-05, + "loss": 0.4718, + "step": 133120 + }, + { + "epoch": 1.1769126045368554, + "grad_norm": 0.9055109024047852, + "learning_rate": 3.038478992438575e-05, + "loss": 0.6206, + "step": 133130 + }, + { + "epoch": 1.1770010077971675, + "grad_norm": 6.1861162185668945, + "learning_rate": 3.0383316536713873e-05, + "loss": 0.6528, + "step": 133140 + }, + { + "epoch": 1.1770894110574799, + "grad_norm": 6.8475775718688965, + "learning_rate": 3.0381843149042005e-05, + "loss": 0.6514, + "step": 133150 + }, + { + "epoch": 1.177177814317792, + "grad_norm": 11.177821159362793, + "learning_rate": 3.0380369761370137e-05, + "loss": 0.6521, + "step": 133160 + }, + { + "epoch": 1.1772662175781043, + "grad_norm": 1.2997747659683228, + "learning_rate": 3.0378896373698262e-05, + "loss": 0.4423, + "step": 133170 + }, + { + "epoch": 1.1773546208384165, + "grad_norm": 2.687894105911255, + "learning_rate": 3.0377422986026394e-05, + "loss": 0.4874, + "step": 133180 + }, + { + "epoch": 1.1774430240987288, + "grad_norm": 3.052844762802124, + "learning_rate": 3.037594959835452e-05, + "loss": 0.6034, + "step": 133190 + }, + { + "epoch": 1.177531427359041, + "grad_norm": 4.097358226776123, + "learning_rate": 3.037447621068265e-05, + "loss": 0.7755, + "step": 133200 + }, + { + "epoch": 1.1776198306193533, + "grad_norm": 7.128112316131592, + "learning_rate": 3.0373002823010782e-05, + "loss": 0.6449, + "step": 133210 + }, + { + "epoch": 1.1777082338796654, + "grad_norm": 0.8843667507171631, + "learning_rate": 3.0371529435338907e-05, + "loss": 0.5929, + "step": 133220 + }, + { + "epoch": 1.1777966371399777, + "grad_norm": 1.8608607053756714, + "learning_rate": 3.037005604766704e-05, + "loss": 0.7112, + "step": 133230 + }, + { + "epoch": 1.17788504040029, + "grad_norm": 5.936575889587402, + "learning_rate": 3.036858265999517e-05, + "loss": 0.6388, + "step": 133240 + }, + { + "epoch": 1.1779734436606022, + "grad_norm": 3.8557424545288086, + "learning_rate": 3.0367109272323295e-05, + "loss": 0.6037, + "step": 133250 + }, + { + "epoch": 1.1780618469209145, + "grad_norm": 4.412086009979248, + "learning_rate": 3.0365635884651427e-05, + "loss": 0.5401, + "step": 133260 + }, + { + "epoch": 1.1781502501812267, + "grad_norm": 1.3330632448196411, + "learning_rate": 3.036416249697956e-05, + "loss": 0.5513, + "step": 133270 + }, + { + "epoch": 1.178238653441539, + "grad_norm": 3.633444309234619, + "learning_rate": 3.0362689109307684e-05, + "loss": 0.5331, + "step": 133280 + }, + { + "epoch": 1.1783270567018511, + "grad_norm": 6.16020393371582, + "learning_rate": 3.0361215721635816e-05, + "loss": 0.7169, + "step": 133290 + }, + { + "epoch": 1.1784154599621635, + "grad_norm": 3.3290483951568604, + "learning_rate": 3.035974233396394e-05, + "loss": 0.5634, + "step": 133300 + }, + { + "epoch": 1.1785038632224756, + "grad_norm": 10.795690536499023, + "learning_rate": 3.0358268946292072e-05, + "loss": 0.5967, + "step": 133310 + }, + { + "epoch": 1.178592266482788, + "grad_norm": 3.8134284019470215, + "learning_rate": 3.0356795558620204e-05, + "loss": 0.6132, + "step": 133320 + }, + { + "epoch": 1.1786806697431, + "grad_norm": 1.540705919265747, + "learning_rate": 3.035532217094833e-05, + "loss": 0.5453, + "step": 133330 + }, + { + "epoch": 1.1787690730034124, + "grad_norm": 2.7744524478912354, + "learning_rate": 3.035384878327646e-05, + "loss": 0.5002, + "step": 133340 + }, + { + "epoch": 1.1788574762637247, + "grad_norm": 6.923981189727783, + "learning_rate": 3.0352375395604593e-05, + "loss": 0.6694, + "step": 133350 + }, + { + "epoch": 1.1789458795240368, + "grad_norm": 2.4157350063323975, + "learning_rate": 3.0350902007932718e-05, + "loss": 0.6004, + "step": 133360 + }, + { + "epoch": 1.179034282784349, + "grad_norm": 6.4708781242370605, + "learning_rate": 3.034942862026085e-05, + "loss": 0.5346, + "step": 133370 + }, + { + "epoch": 1.1791226860446613, + "grad_norm": 1.2918226718902588, + "learning_rate": 3.034795523258898e-05, + "loss": 0.5839, + "step": 133380 + }, + { + "epoch": 1.1792110893049736, + "grad_norm": 2.116534948348999, + "learning_rate": 3.0346481844917106e-05, + "loss": 0.5119, + "step": 133390 + }, + { + "epoch": 1.1792994925652858, + "grad_norm": 3.1565561294555664, + "learning_rate": 3.0345008457245238e-05, + "loss": 0.482, + "step": 133400 + }, + { + "epoch": 1.179387895825598, + "grad_norm": 23.30229377746582, + "learning_rate": 3.034353506957337e-05, + "loss": 0.561, + "step": 133410 + }, + { + "epoch": 1.1794762990859102, + "grad_norm": 2.0253965854644775, + "learning_rate": 3.0342061681901494e-05, + "loss": 0.6565, + "step": 133420 + }, + { + "epoch": 1.1795647023462226, + "grad_norm": 7.668882369995117, + "learning_rate": 3.0340588294229626e-05, + "loss": 0.5438, + "step": 133430 + }, + { + "epoch": 1.1796531056065347, + "grad_norm": 3.12192440032959, + "learning_rate": 3.0339114906557755e-05, + "loss": 0.5389, + "step": 133440 + }, + { + "epoch": 1.179741508866847, + "grad_norm": 1.7643693685531616, + "learning_rate": 3.0337641518885883e-05, + "loss": 0.5749, + "step": 133450 + }, + { + "epoch": 1.1798299121271592, + "grad_norm": 4.117247104644775, + "learning_rate": 3.0336168131214015e-05, + "loss": 0.7087, + "step": 133460 + }, + { + "epoch": 1.1799183153874715, + "grad_norm": 7.315949440002441, + "learning_rate": 3.0334694743542143e-05, + "loss": 0.5827, + "step": 133470 + }, + { + "epoch": 1.1800067186477836, + "grad_norm": 2.433905601501465, + "learning_rate": 3.033322135587027e-05, + "loss": 0.6983, + "step": 133480 + }, + { + "epoch": 1.180095121908096, + "grad_norm": 12.099250793457031, + "learning_rate": 3.0331747968198403e-05, + "loss": 0.6841, + "step": 133490 + }, + { + "epoch": 1.1801835251684083, + "grad_norm": 4.206975936889648, + "learning_rate": 3.033027458052653e-05, + "loss": 0.6199, + "step": 133500 + }, + { + "epoch": 1.1802719284287204, + "grad_norm": 1.3971086740493774, + "learning_rate": 3.032880119285466e-05, + "loss": 0.6861, + "step": 133510 + }, + { + "epoch": 1.1803603316890328, + "grad_norm": 9.218562126159668, + "learning_rate": 3.032732780518279e-05, + "loss": 0.6203, + "step": 133520 + }, + { + "epoch": 1.1804487349493449, + "grad_norm": 2.294931650161743, + "learning_rate": 3.032585441751092e-05, + "loss": 0.6884, + "step": 133530 + }, + { + "epoch": 1.1805371382096572, + "grad_norm": 10.050101280212402, + "learning_rate": 3.0324381029839048e-05, + "loss": 0.6901, + "step": 133540 + }, + { + "epoch": 1.1806255414699693, + "grad_norm": 3.3422930240631104, + "learning_rate": 3.0322907642167177e-05, + "loss": 0.668, + "step": 133550 + }, + { + "epoch": 1.1807139447302817, + "grad_norm": 6.363961219787598, + "learning_rate": 3.032143425449531e-05, + "loss": 0.6081, + "step": 133560 + }, + { + "epoch": 1.1808023479905938, + "grad_norm": 2.307931661605835, + "learning_rate": 3.0319960866823437e-05, + "loss": 0.6154, + "step": 133570 + }, + { + "epoch": 1.1808907512509061, + "grad_norm": 1.586371660232544, + "learning_rate": 3.0318487479151565e-05, + "loss": 0.7045, + "step": 133580 + }, + { + "epoch": 1.1809791545112183, + "grad_norm": 2.354738712310791, + "learning_rate": 3.0317014091479697e-05, + "loss": 0.526, + "step": 133590 + }, + { + "epoch": 1.1810675577715306, + "grad_norm": 2.54079270362854, + "learning_rate": 3.0315540703807825e-05, + "loss": 0.7171, + "step": 133600 + }, + { + "epoch": 1.181155961031843, + "grad_norm": 3.5299763679504395, + "learning_rate": 3.0314067316135953e-05, + "loss": 0.6507, + "step": 133610 + }, + { + "epoch": 1.181244364292155, + "grad_norm": 1.049715518951416, + "learning_rate": 3.0312593928464085e-05, + "loss": 0.5837, + "step": 133620 + }, + { + "epoch": 1.1813327675524674, + "grad_norm": 3.565322160720825, + "learning_rate": 3.0311120540792214e-05, + "loss": 0.6637, + "step": 133630 + }, + { + "epoch": 1.1814211708127795, + "grad_norm": 1.863945722579956, + "learning_rate": 3.0309647153120342e-05, + "loss": 0.6379, + "step": 133640 + }, + { + "epoch": 1.1815095740730919, + "grad_norm": 3.682539701461792, + "learning_rate": 3.0308173765448474e-05, + "loss": 0.5311, + "step": 133650 + }, + { + "epoch": 1.181597977333404, + "grad_norm": 2.9334843158721924, + "learning_rate": 3.03067003777766e-05, + "loss": 0.6481, + "step": 133660 + }, + { + "epoch": 1.1816863805937163, + "grad_norm": 4.022030830383301, + "learning_rate": 3.030522699010473e-05, + "loss": 0.7427, + "step": 133670 + }, + { + "epoch": 1.1817747838540285, + "grad_norm": 1.6121671199798584, + "learning_rate": 3.0303753602432862e-05, + "loss": 0.6308, + "step": 133680 + }, + { + "epoch": 1.1818631871143408, + "grad_norm": 3.0046753883361816, + "learning_rate": 3.0302280214760987e-05, + "loss": 0.586, + "step": 133690 + }, + { + "epoch": 1.181951590374653, + "grad_norm": 2.20027232170105, + "learning_rate": 3.030080682708912e-05, + "loss": 0.6444, + "step": 133700 + }, + { + "epoch": 1.1820399936349653, + "grad_norm": 6.925246715545654, + "learning_rate": 3.029933343941725e-05, + "loss": 0.6971, + "step": 133710 + }, + { + "epoch": 1.1821283968952776, + "grad_norm": 3.790677785873413, + "learning_rate": 3.0297860051745376e-05, + "loss": 0.713, + "step": 133720 + }, + { + "epoch": 1.1822168001555897, + "grad_norm": 2.0312232971191406, + "learning_rate": 3.0296386664073507e-05, + "loss": 0.5332, + "step": 133730 + }, + { + "epoch": 1.182305203415902, + "grad_norm": 7.247586727142334, + "learning_rate": 3.029491327640164e-05, + "loss": 0.663, + "step": 133740 + }, + { + "epoch": 1.1823936066762142, + "grad_norm": 3.275068998336792, + "learning_rate": 3.0293439888729764e-05, + "loss": 0.6624, + "step": 133750 + }, + { + "epoch": 1.1824820099365265, + "grad_norm": 1.7012073993682861, + "learning_rate": 3.0291966501057896e-05, + "loss": 0.5602, + "step": 133760 + }, + { + "epoch": 1.1825704131968386, + "grad_norm": 2.670976161956787, + "learning_rate": 3.029049311338602e-05, + "loss": 0.5404, + "step": 133770 + }, + { + "epoch": 1.182658816457151, + "grad_norm": 0.9218131303787231, + "learning_rate": 3.0289019725714152e-05, + "loss": 0.6014, + "step": 133780 + }, + { + "epoch": 1.182747219717463, + "grad_norm": 8.487752914428711, + "learning_rate": 3.0287546338042284e-05, + "loss": 0.7067, + "step": 133790 + }, + { + "epoch": 1.1828356229777754, + "grad_norm": 1.5247690677642822, + "learning_rate": 3.028607295037041e-05, + "loss": 0.7459, + "step": 133800 + }, + { + "epoch": 1.1829240262380876, + "grad_norm": 1.91983163356781, + "learning_rate": 3.028459956269854e-05, + "loss": 0.7229, + "step": 133810 + }, + { + "epoch": 1.1830124294984, + "grad_norm": 6.579474925994873, + "learning_rate": 3.0283126175026673e-05, + "loss": 0.5639, + "step": 133820 + }, + { + "epoch": 1.1831008327587123, + "grad_norm": 1.699512243270874, + "learning_rate": 3.0281652787354798e-05, + "loss": 0.5564, + "step": 133830 + }, + { + "epoch": 1.1831892360190244, + "grad_norm": 1.832939863204956, + "learning_rate": 3.028017939968293e-05, + "loss": 0.7191, + "step": 133840 + }, + { + "epoch": 1.1832776392793367, + "grad_norm": 0.9650859236717224, + "learning_rate": 3.027870601201106e-05, + "loss": 0.7212, + "step": 133850 + }, + { + "epoch": 1.1833660425396488, + "grad_norm": 9.520347595214844, + "learning_rate": 3.0277232624339186e-05, + "loss": 0.6559, + "step": 133860 + }, + { + "epoch": 1.1834544457999612, + "grad_norm": 2.349287986755371, + "learning_rate": 3.0275759236667318e-05, + "loss": 0.6233, + "step": 133870 + }, + { + "epoch": 1.1835428490602733, + "grad_norm": 1.295008897781372, + "learning_rate": 3.027428584899545e-05, + "loss": 0.7253, + "step": 133880 + }, + { + "epoch": 1.1836312523205856, + "grad_norm": 1.1947423219680786, + "learning_rate": 3.0272812461323574e-05, + "loss": 0.6286, + "step": 133890 + }, + { + "epoch": 1.1837196555808978, + "grad_norm": 2.7769126892089844, + "learning_rate": 3.0271339073651706e-05, + "loss": 0.6528, + "step": 133900 + }, + { + "epoch": 1.18380805884121, + "grad_norm": 1.6827560663223267, + "learning_rate": 3.026986568597983e-05, + "loss": 0.5617, + "step": 133910 + }, + { + "epoch": 1.1838964621015222, + "grad_norm": 8.138789176940918, + "learning_rate": 3.0268392298307963e-05, + "loss": 0.7237, + "step": 133920 + }, + { + "epoch": 1.1839848653618346, + "grad_norm": 10.887385368347168, + "learning_rate": 3.0266918910636095e-05, + "loss": 0.5822, + "step": 133930 + }, + { + "epoch": 1.184073268622147, + "grad_norm": 8.09276008605957, + "learning_rate": 3.026544552296422e-05, + "loss": 0.4782, + "step": 133940 + }, + { + "epoch": 1.184161671882459, + "grad_norm": 2.6022586822509766, + "learning_rate": 3.026397213529235e-05, + "loss": 0.6282, + "step": 133950 + }, + { + "epoch": 1.1842500751427711, + "grad_norm": 2.1033880710601807, + "learning_rate": 3.0262498747620483e-05, + "loss": 0.6039, + "step": 133960 + }, + { + "epoch": 1.1843384784030835, + "grad_norm": 6.447736740112305, + "learning_rate": 3.0261025359948608e-05, + "loss": 0.5947, + "step": 133970 + }, + { + "epoch": 1.1844268816633958, + "grad_norm": 1.5387040376663208, + "learning_rate": 3.025955197227674e-05, + "loss": 0.6481, + "step": 133980 + }, + { + "epoch": 1.184515284923708, + "grad_norm": 2.642251968383789, + "learning_rate": 3.025807858460487e-05, + "loss": 0.6364, + "step": 133990 + }, + { + "epoch": 1.1846036881840203, + "grad_norm": 1.2769719362258911, + "learning_rate": 3.0256605196932997e-05, + "loss": 0.5002, + "step": 134000 + }, + { + "epoch": 1.1846920914443324, + "grad_norm": 1.3712046146392822, + "learning_rate": 3.0255131809261128e-05, + "loss": 0.6655, + "step": 134010 + }, + { + "epoch": 1.1847804947046447, + "grad_norm": 3.0285303592681885, + "learning_rate": 3.0253658421589253e-05, + "loss": 0.714, + "step": 134020 + }, + { + "epoch": 1.1848688979649569, + "grad_norm": 10.945008277893066, + "learning_rate": 3.0252185033917385e-05, + "loss": 0.6471, + "step": 134030 + }, + { + "epoch": 1.1849573012252692, + "grad_norm": 4.571321487426758, + "learning_rate": 3.0250711646245517e-05, + "loss": 0.6315, + "step": 134040 + }, + { + "epoch": 1.1850457044855816, + "grad_norm": 7.85854434967041, + "learning_rate": 3.0249238258573642e-05, + "loss": 0.5895, + "step": 134050 + }, + { + "epoch": 1.1851341077458937, + "grad_norm": 1.8974639177322388, + "learning_rate": 3.0247764870901773e-05, + "loss": 0.6176, + "step": 134060 + }, + { + "epoch": 1.1852225110062058, + "grad_norm": 2.0651192665100098, + "learning_rate": 3.0246291483229905e-05, + "loss": 0.5908, + "step": 134070 + }, + { + "epoch": 1.1853109142665181, + "grad_norm": 0.9960033297538757, + "learning_rate": 3.024481809555803e-05, + "loss": 0.5894, + "step": 134080 + }, + { + "epoch": 1.1853993175268305, + "grad_norm": 1.3338834047317505, + "learning_rate": 3.0243344707886162e-05, + "loss": 0.6926, + "step": 134090 + }, + { + "epoch": 1.1854877207871426, + "grad_norm": 1.4474478960037231, + "learning_rate": 3.0241871320214294e-05, + "loss": 0.69, + "step": 134100 + }, + { + "epoch": 1.185576124047455, + "grad_norm": 4.29463529586792, + "learning_rate": 3.024039793254242e-05, + "loss": 0.7127, + "step": 134110 + }, + { + "epoch": 1.185664527307767, + "grad_norm": 4.743776798248291, + "learning_rate": 3.023892454487055e-05, + "loss": 0.6648, + "step": 134120 + }, + { + "epoch": 1.1857529305680794, + "grad_norm": 1.6514040231704712, + "learning_rate": 3.0237451157198675e-05, + "loss": 0.663, + "step": 134130 + }, + { + "epoch": 1.1858413338283915, + "grad_norm": 1.0738427639007568, + "learning_rate": 3.0235977769526807e-05, + "loss": 0.5619, + "step": 134140 + }, + { + "epoch": 1.1859297370887039, + "grad_norm": 4.679333686828613, + "learning_rate": 3.023450438185494e-05, + "loss": 0.7378, + "step": 134150 + }, + { + "epoch": 1.186018140349016, + "grad_norm": 1.6422126293182373, + "learning_rate": 3.0233030994183064e-05, + "loss": 0.6213, + "step": 134160 + }, + { + "epoch": 1.1861065436093283, + "grad_norm": 5.034276485443115, + "learning_rate": 3.0231557606511196e-05, + "loss": 0.6473, + "step": 134170 + }, + { + "epoch": 1.1861949468696404, + "grad_norm": 1.9419156312942505, + "learning_rate": 3.0230084218839327e-05, + "loss": 0.6596, + "step": 134180 + }, + { + "epoch": 1.1862833501299528, + "grad_norm": 5.711865425109863, + "learning_rate": 3.0228610831167452e-05, + "loss": 0.5762, + "step": 134190 + }, + { + "epoch": 1.1863717533902651, + "grad_norm": 3.984689235687256, + "learning_rate": 3.0227137443495584e-05, + "loss": 0.5392, + "step": 134200 + }, + { + "epoch": 1.1864601566505772, + "grad_norm": 1.3124592304229736, + "learning_rate": 3.0225664055823716e-05, + "loss": 0.7484, + "step": 134210 + }, + { + "epoch": 1.1865485599108896, + "grad_norm": 2.726818561553955, + "learning_rate": 3.022419066815184e-05, + "loss": 0.6481, + "step": 134220 + }, + { + "epoch": 1.1866369631712017, + "grad_norm": 2.6501593589782715, + "learning_rate": 3.0222717280479972e-05, + "loss": 0.5935, + "step": 134230 + }, + { + "epoch": 1.186725366431514, + "grad_norm": 4.129079341888428, + "learning_rate": 3.0221243892808104e-05, + "loss": 0.6558, + "step": 134240 + }, + { + "epoch": 1.1868137696918262, + "grad_norm": 3.0883612632751465, + "learning_rate": 3.021977050513623e-05, + "loss": 0.6343, + "step": 134250 + }, + { + "epoch": 1.1869021729521385, + "grad_norm": 13.528619766235352, + "learning_rate": 3.021829711746436e-05, + "loss": 0.5872, + "step": 134260 + }, + { + "epoch": 1.1869905762124506, + "grad_norm": 1.262600064277649, + "learning_rate": 3.0216823729792486e-05, + "loss": 0.5916, + "step": 134270 + }, + { + "epoch": 1.187078979472763, + "grad_norm": 2.543734073638916, + "learning_rate": 3.0215350342120618e-05, + "loss": 0.6238, + "step": 134280 + }, + { + "epoch": 1.187167382733075, + "grad_norm": 3.074098825454712, + "learning_rate": 3.021387695444875e-05, + "loss": 0.7055, + "step": 134290 + }, + { + "epoch": 1.1872557859933874, + "grad_norm": 1.6259393692016602, + "learning_rate": 3.0212403566776874e-05, + "loss": 0.6377, + "step": 134300 + }, + { + "epoch": 1.1873441892536998, + "grad_norm": 2.3745906352996826, + "learning_rate": 3.0210930179105006e-05, + "loss": 0.5058, + "step": 134310 + }, + { + "epoch": 1.187432592514012, + "grad_norm": 3.7903308868408203, + "learning_rate": 3.0209456791433138e-05, + "loss": 0.6222, + "step": 134320 + }, + { + "epoch": 1.1875209957743242, + "grad_norm": 4.346645832061768, + "learning_rate": 3.0207983403761263e-05, + "loss": 0.596, + "step": 134330 + }, + { + "epoch": 1.1876093990346364, + "grad_norm": 3.0062313079833984, + "learning_rate": 3.0206510016089394e-05, + "loss": 0.5854, + "step": 134340 + }, + { + "epoch": 1.1876978022949487, + "grad_norm": 4.951059818267822, + "learning_rate": 3.0205036628417526e-05, + "loss": 0.5962, + "step": 134350 + }, + { + "epoch": 1.1877862055552608, + "grad_norm": 5.383270263671875, + "learning_rate": 3.020356324074565e-05, + "loss": 0.6678, + "step": 134360 + }, + { + "epoch": 1.1878746088155732, + "grad_norm": 2.7078888416290283, + "learning_rate": 3.0202089853073783e-05, + "loss": 0.7557, + "step": 134370 + }, + { + "epoch": 1.1879630120758853, + "grad_norm": 4.165093421936035, + "learning_rate": 3.020061646540191e-05, + "loss": 0.5552, + "step": 134380 + }, + { + "epoch": 1.1880514153361976, + "grad_norm": 2.7868340015411377, + "learning_rate": 3.019914307773004e-05, + "loss": 0.5464, + "step": 134390 + }, + { + "epoch": 1.1881398185965097, + "grad_norm": 1.2256553173065186, + "learning_rate": 3.019766969005817e-05, + "loss": 0.7446, + "step": 134400 + }, + { + "epoch": 1.188228221856822, + "grad_norm": 1.8391183614730835, + "learning_rate": 3.01961963023863e-05, + "loss": 0.6549, + "step": 134410 + }, + { + "epoch": 1.1883166251171344, + "grad_norm": 2.2443768978118896, + "learning_rate": 3.0194722914714428e-05, + "loss": 0.6133, + "step": 134420 + }, + { + "epoch": 1.1884050283774465, + "grad_norm": 3.4900825023651123, + "learning_rate": 3.019324952704256e-05, + "loss": 0.5998, + "step": 134430 + }, + { + "epoch": 1.188493431637759, + "grad_norm": 4.5909929275512695, + "learning_rate": 3.0191776139370688e-05, + "loss": 0.4663, + "step": 134440 + }, + { + "epoch": 1.188581834898071, + "grad_norm": 2.962786912918091, + "learning_rate": 3.0190302751698817e-05, + "loss": 0.5979, + "step": 134450 + }, + { + "epoch": 1.1886702381583834, + "grad_norm": 1.3459868431091309, + "learning_rate": 3.0188829364026948e-05, + "loss": 0.5572, + "step": 134460 + }, + { + "epoch": 1.1887586414186955, + "grad_norm": 0.845302402973175, + "learning_rate": 3.0187355976355077e-05, + "loss": 0.5532, + "step": 134470 + }, + { + "epoch": 1.1888470446790078, + "grad_norm": 2.38145112991333, + "learning_rate": 3.0185882588683205e-05, + "loss": 0.6991, + "step": 134480 + }, + { + "epoch": 1.18893544793932, + "grad_norm": 10.354094505310059, + "learning_rate": 3.0184409201011333e-05, + "loss": 0.6276, + "step": 134490 + }, + { + "epoch": 1.1890238511996323, + "grad_norm": 1.6070795059204102, + "learning_rate": 3.0182935813339465e-05, + "loss": 0.5707, + "step": 134500 + }, + { + "epoch": 1.1891122544599444, + "grad_norm": 2.7522847652435303, + "learning_rate": 3.0181462425667593e-05, + "loss": 0.7623, + "step": 134510 + }, + { + "epoch": 1.1892006577202567, + "grad_norm": 1.0323193073272705, + "learning_rate": 3.0179989037995722e-05, + "loss": 0.6218, + "step": 134520 + }, + { + "epoch": 1.189289060980569, + "grad_norm": 3.0552427768707275, + "learning_rate": 3.0178515650323854e-05, + "loss": 0.6467, + "step": 134530 + }, + { + "epoch": 1.1893774642408812, + "grad_norm": 3.1509528160095215, + "learning_rate": 3.0177042262651982e-05, + "loss": 0.6141, + "step": 134540 + }, + { + "epoch": 1.1894658675011935, + "grad_norm": 11.069650650024414, + "learning_rate": 3.017556887498011e-05, + "loss": 0.6985, + "step": 134550 + }, + { + "epoch": 1.1895542707615057, + "grad_norm": 6.334070682525635, + "learning_rate": 3.0174095487308242e-05, + "loss": 0.7615, + "step": 134560 + }, + { + "epoch": 1.189642674021818, + "grad_norm": 1.852825403213501, + "learning_rate": 3.017262209963637e-05, + "loss": 0.5602, + "step": 134570 + }, + { + "epoch": 1.1897310772821301, + "grad_norm": 2.627521276473999, + "learning_rate": 3.01711487119645e-05, + "loss": 0.5864, + "step": 134580 + }, + { + "epoch": 1.1898194805424425, + "grad_norm": 4.4445390701293945, + "learning_rate": 3.016967532429263e-05, + "loss": 0.6078, + "step": 134590 + }, + { + "epoch": 1.1899078838027546, + "grad_norm": 2.243546724319458, + "learning_rate": 3.0168201936620755e-05, + "loss": 0.593, + "step": 134600 + }, + { + "epoch": 1.189996287063067, + "grad_norm": 7.853315830230713, + "learning_rate": 3.0166728548948887e-05, + "loss": 0.6843, + "step": 134610 + }, + { + "epoch": 1.190084690323379, + "grad_norm": 1.8134901523590088, + "learning_rate": 3.016525516127702e-05, + "loss": 0.6254, + "step": 134620 + }, + { + "epoch": 1.1901730935836914, + "grad_norm": 2.3110218048095703, + "learning_rate": 3.0163781773605144e-05, + "loss": 0.6844, + "step": 134630 + }, + { + "epoch": 1.1902614968440037, + "grad_norm": 6.4041547775268555, + "learning_rate": 3.0162308385933276e-05, + "loss": 0.6786, + "step": 134640 + }, + { + "epoch": 1.1903499001043159, + "grad_norm": 4.018733978271484, + "learning_rate": 3.0160834998261407e-05, + "loss": 0.6663, + "step": 134650 + }, + { + "epoch": 1.190438303364628, + "grad_norm": 2.0734989643096924, + "learning_rate": 3.0159361610589532e-05, + "loss": 0.5676, + "step": 134660 + }, + { + "epoch": 1.1905267066249403, + "grad_norm": 5.42205810546875, + "learning_rate": 3.0157888222917664e-05, + "loss": 0.7832, + "step": 134670 + }, + { + "epoch": 1.1906151098852527, + "grad_norm": 8.497903823852539, + "learning_rate": 3.0156414835245796e-05, + "loss": 0.5889, + "step": 134680 + }, + { + "epoch": 1.1907035131455648, + "grad_norm": 1.8236331939697266, + "learning_rate": 3.015494144757392e-05, + "loss": 0.7016, + "step": 134690 + }, + { + "epoch": 1.1907919164058771, + "grad_norm": 5.72875452041626, + "learning_rate": 3.0153468059902052e-05, + "loss": 0.5426, + "step": 134700 + }, + { + "epoch": 1.1908803196661892, + "grad_norm": 1.451060175895691, + "learning_rate": 3.0151994672230184e-05, + "loss": 0.6335, + "step": 134710 + }, + { + "epoch": 1.1909687229265016, + "grad_norm": 2.3085947036743164, + "learning_rate": 3.015052128455831e-05, + "loss": 0.6423, + "step": 134720 + }, + { + "epoch": 1.1910571261868137, + "grad_norm": 1.1448899507522583, + "learning_rate": 3.014904789688644e-05, + "loss": 0.4428, + "step": 134730 + }, + { + "epoch": 1.191145529447126, + "grad_norm": 3.119044303894043, + "learning_rate": 3.0147574509214566e-05, + "loss": 0.584, + "step": 134740 + }, + { + "epoch": 1.1912339327074382, + "grad_norm": 3.132662296295166, + "learning_rate": 3.0146101121542698e-05, + "loss": 0.5607, + "step": 134750 + }, + { + "epoch": 1.1913223359677505, + "grad_norm": 4.623294830322266, + "learning_rate": 3.014462773387083e-05, + "loss": 0.6425, + "step": 134760 + }, + { + "epoch": 1.1914107392280626, + "grad_norm": 2.9660074710845947, + "learning_rate": 3.0143154346198954e-05, + "loss": 0.5568, + "step": 134770 + }, + { + "epoch": 1.191499142488375, + "grad_norm": 6.1668877601623535, + "learning_rate": 3.0141680958527086e-05, + "loss": 0.594, + "step": 134780 + }, + { + "epoch": 1.1915875457486873, + "grad_norm": 4.208791732788086, + "learning_rate": 3.0140207570855218e-05, + "loss": 0.6041, + "step": 134790 + }, + { + "epoch": 1.1916759490089994, + "grad_norm": 2.7201950550079346, + "learning_rate": 3.0138734183183343e-05, + "loss": 0.551, + "step": 134800 + }, + { + "epoch": 1.1917643522693118, + "grad_norm": 2.2502012252807617, + "learning_rate": 3.0137260795511475e-05, + "loss": 0.6462, + "step": 134810 + }, + { + "epoch": 1.1918527555296239, + "grad_norm": 2.161388635635376, + "learning_rate": 3.0135787407839606e-05, + "loss": 0.6353, + "step": 134820 + }, + { + "epoch": 1.1919411587899362, + "grad_norm": 1.8835651874542236, + "learning_rate": 3.013431402016773e-05, + "loss": 0.5842, + "step": 134830 + }, + { + "epoch": 1.1920295620502483, + "grad_norm": 1.7087674140930176, + "learning_rate": 3.0132840632495863e-05, + "loss": 0.5753, + "step": 134840 + }, + { + "epoch": 1.1921179653105607, + "grad_norm": 4.540475845336914, + "learning_rate": 3.0131367244823988e-05, + "loss": 0.6149, + "step": 134850 + }, + { + "epoch": 1.1922063685708728, + "grad_norm": 5.799891471862793, + "learning_rate": 3.012989385715212e-05, + "loss": 0.6232, + "step": 134860 + }, + { + "epoch": 1.1922947718311852, + "grad_norm": 2.382371187210083, + "learning_rate": 3.012842046948025e-05, + "loss": 0.5341, + "step": 134870 + }, + { + "epoch": 1.1923831750914973, + "grad_norm": 4.911892414093018, + "learning_rate": 3.0126947081808376e-05, + "loss": 0.6944, + "step": 134880 + }, + { + "epoch": 1.1924715783518096, + "grad_norm": 2.5033485889434814, + "learning_rate": 3.0125473694136508e-05, + "loss": 0.6661, + "step": 134890 + }, + { + "epoch": 1.192559981612122, + "grad_norm": 1.0402098894119263, + "learning_rate": 3.012400030646464e-05, + "loss": 0.5771, + "step": 134900 + }, + { + "epoch": 1.192648384872434, + "grad_norm": 1.4594581127166748, + "learning_rate": 3.0122526918792765e-05, + "loss": 0.6465, + "step": 134910 + }, + { + "epoch": 1.1927367881327464, + "grad_norm": 23.358400344848633, + "learning_rate": 3.0121053531120897e-05, + "loss": 0.6033, + "step": 134920 + }, + { + "epoch": 1.1928251913930585, + "grad_norm": 1.0218687057495117, + "learning_rate": 3.011958014344903e-05, + "loss": 0.6087, + "step": 134930 + }, + { + "epoch": 1.1929135946533709, + "grad_norm": 1.9828563928604126, + "learning_rate": 3.0118106755777153e-05, + "loss": 0.6202, + "step": 134940 + }, + { + "epoch": 1.193001997913683, + "grad_norm": 0.881187379360199, + "learning_rate": 3.0116633368105285e-05, + "loss": 0.7083, + "step": 134950 + }, + { + "epoch": 1.1930904011739953, + "grad_norm": 8.478384971618652, + "learning_rate": 3.011515998043341e-05, + "loss": 0.6248, + "step": 134960 + }, + { + "epoch": 1.1931788044343075, + "grad_norm": 2.483832836151123, + "learning_rate": 3.0113686592761542e-05, + "loss": 0.7632, + "step": 134970 + }, + { + "epoch": 1.1932672076946198, + "grad_norm": 6.684719085693359, + "learning_rate": 3.0112213205089673e-05, + "loss": 0.5586, + "step": 134980 + }, + { + "epoch": 1.193355610954932, + "grad_norm": 1.3750325441360474, + "learning_rate": 3.01107398174178e-05, + "loss": 0.6378, + "step": 134990 + }, + { + "epoch": 1.1934440142152443, + "grad_norm": 3.7526187896728516, + "learning_rate": 3.010926642974593e-05, + "loss": 0.5898, + "step": 135000 + }, + { + "epoch": 1.1935324174755566, + "grad_norm": 1.3710887432098389, + "learning_rate": 3.0107793042074062e-05, + "loss": 0.6396, + "step": 135010 + }, + { + "epoch": 1.1936208207358687, + "grad_norm": 2.045327663421631, + "learning_rate": 3.0106319654402187e-05, + "loss": 0.7543, + "step": 135020 + }, + { + "epoch": 1.193709223996181, + "grad_norm": 1.9241224527359009, + "learning_rate": 3.010484626673032e-05, + "loss": 0.6438, + "step": 135030 + }, + { + "epoch": 1.1937976272564932, + "grad_norm": 1.0467989444732666, + "learning_rate": 3.010337287905845e-05, + "loss": 0.563, + "step": 135040 + }, + { + "epoch": 1.1938860305168055, + "grad_norm": 1.5242828130722046, + "learning_rate": 3.0101899491386575e-05, + "loss": 0.6317, + "step": 135050 + }, + { + "epoch": 1.1939744337771176, + "grad_norm": 4.247598171234131, + "learning_rate": 3.0100426103714707e-05, + "loss": 0.6454, + "step": 135060 + }, + { + "epoch": 1.19406283703743, + "grad_norm": 3.4380462169647217, + "learning_rate": 3.0098952716042832e-05, + "loss": 0.5925, + "step": 135070 + }, + { + "epoch": 1.1941512402977421, + "grad_norm": 3.403031587600708, + "learning_rate": 3.0097479328370964e-05, + "loss": 0.6129, + "step": 135080 + }, + { + "epoch": 1.1942396435580545, + "grad_norm": 2.1044533252716064, + "learning_rate": 3.0096005940699096e-05, + "loss": 0.639, + "step": 135090 + }, + { + "epoch": 1.1943280468183666, + "grad_norm": 4.494055271148682, + "learning_rate": 3.009453255302722e-05, + "loss": 0.6652, + "step": 135100 + }, + { + "epoch": 1.194416450078679, + "grad_norm": 3.6114072799682617, + "learning_rate": 3.0093059165355352e-05, + "loss": 0.6758, + "step": 135110 + }, + { + "epoch": 1.1945048533389913, + "grad_norm": 2.921370506286621, + "learning_rate": 3.0091585777683484e-05, + "loss": 0.4851, + "step": 135120 + }, + { + "epoch": 1.1945932565993034, + "grad_norm": 1.47972571849823, + "learning_rate": 3.009011239001161e-05, + "loss": 0.5908, + "step": 135130 + }, + { + "epoch": 1.1946816598596157, + "grad_norm": 2.664775848388672, + "learning_rate": 3.008863900233974e-05, + "loss": 0.5439, + "step": 135140 + }, + { + "epoch": 1.1947700631199278, + "grad_norm": 2.8714442253112793, + "learning_rate": 3.0087165614667872e-05, + "loss": 0.7995, + "step": 135150 + }, + { + "epoch": 1.1948584663802402, + "grad_norm": 2.735581159591675, + "learning_rate": 3.0085692226995997e-05, + "loss": 0.54, + "step": 135160 + }, + { + "epoch": 1.1949468696405523, + "grad_norm": 1.93233060836792, + "learning_rate": 3.008421883932413e-05, + "loss": 0.7647, + "step": 135170 + }, + { + "epoch": 1.1950352729008646, + "grad_norm": 1.9108073711395264, + "learning_rate": 3.008274545165226e-05, + "loss": 0.591, + "step": 135180 + }, + { + "epoch": 1.1951236761611768, + "grad_norm": 1.6401818990707397, + "learning_rate": 3.0081272063980386e-05, + "loss": 0.5852, + "step": 135190 + }, + { + "epoch": 1.195212079421489, + "grad_norm": 5.186831474304199, + "learning_rate": 3.0079798676308518e-05, + "loss": 0.5745, + "step": 135200 + }, + { + "epoch": 1.1953004826818012, + "grad_norm": 2.3921542167663574, + "learning_rate": 3.0078325288636643e-05, + "loss": 0.5655, + "step": 135210 + }, + { + "epoch": 1.1953888859421136, + "grad_norm": 1.8918118476867676, + "learning_rate": 3.0076851900964774e-05, + "loss": 0.6225, + "step": 135220 + }, + { + "epoch": 1.195477289202426, + "grad_norm": 4.41324520111084, + "learning_rate": 3.0075378513292906e-05, + "loss": 0.5583, + "step": 135230 + }, + { + "epoch": 1.195565692462738, + "grad_norm": 2.0892438888549805, + "learning_rate": 3.007390512562103e-05, + "loss": 0.6693, + "step": 135240 + }, + { + "epoch": 1.1956540957230501, + "grad_norm": 1.8570963144302368, + "learning_rate": 3.0072431737949163e-05, + "loss": 0.6538, + "step": 135250 + }, + { + "epoch": 1.1957424989833625, + "grad_norm": 1.8314871788024902, + "learning_rate": 3.0070958350277295e-05, + "loss": 0.5374, + "step": 135260 + }, + { + "epoch": 1.1958309022436748, + "grad_norm": 7.7529802322387695, + "learning_rate": 3.006948496260542e-05, + "loss": 0.5959, + "step": 135270 + }, + { + "epoch": 1.195919305503987, + "grad_norm": 8.081664085388184, + "learning_rate": 3.006801157493355e-05, + "loss": 0.5608, + "step": 135280 + }, + { + "epoch": 1.1960077087642993, + "grad_norm": 11.194672584533691, + "learning_rate": 3.0066538187261683e-05, + "loss": 0.589, + "step": 135290 + }, + { + "epoch": 1.1960961120246114, + "grad_norm": 1.2424057722091675, + "learning_rate": 3.0065064799589808e-05, + "loss": 0.5983, + "step": 135300 + }, + { + "epoch": 1.1961845152849238, + "grad_norm": 3.588252544403076, + "learning_rate": 3.006359141191794e-05, + "loss": 0.5889, + "step": 135310 + }, + { + "epoch": 1.1962729185452359, + "grad_norm": 4.2321953773498535, + "learning_rate": 3.0062118024246068e-05, + "loss": 0.6925, + "step": 135320 + }, + { + "epoch": 1.1963613218055482, + "grad_norm": 3.803393602371216, + "learning_rate": 3.0060644636574196e-05, + "loss": 0.6396, + "step": 135330 + }, + { + "epoch": 1.1964497250658603, + "grad_norm": 2.1365442276000977, + "learning_rate": 3.0059171248902328e-05, + "loss": 0.5557, + "step": 135340 + }, + { + "epoch": 1.1965381283261727, + "grad_norm": 5.730929851531982, + "learning_rate": 3.0057697861230456e-05, + "loss": 0.5322, + "step": 135350 + }, + { + "epoch": 1.1966265315864848, + "grad_norm": 2.514432191848755, + "learning_rate": 3.0056224473558585e-05, + "loss": 0.488, + "step": 135360 + }, + { + "epoch": 1.1967149348467971, + "grad_norm": 2.984315872192383, + "learning_rate": 3.0054751085886717e-05, + "loss": 0.6205, + "step": 135370 + }, + { + "epoch": 1.1968033381071095, + "grad_norm": 0.8397032022476196, + "learning_rate": 3.0053277698214845e-05, + "loss": 0.5801, + "step": 135380 + }, + { + "epoch": 1.1968917413674216, + "grad_norm": 3.1440913677215576, + "learning_rate": 3.0051804310542973e-05, + "loss": 0.8333, + "step": 135390 + }, + { + "epoch": 1.196980144627734, + "grad_norm": 2.5497305393218994, + "learning_rate": 3.0050330922871105e-05, + "loss": 0.6406, + "step": 135400 + }, + { + "epoch": 1.197068547888046, + "grad_norm": 2.3228108882904053, + "learning_rate": 3.0048857535199233e-05, + "loss": 0.5936, + "step": 135410 + }, + { + "epoch": 1.1971569511483584, + "grad_norm": 3.424015998840332, + "learning_rate": 3.0047384147527362e-05, + "loss": 0.7542, + "step": 135420 + }, + { + "epoch": 1.1972453544086705, + "grad_norm": 4.755439281463623, + "learning_rate": 3.004591075985549e-05, + "loss": 0.6213, + "step": 135430 + }, + { + "epoch": 1.1973337576689829, + "grad_norm": 1.2206058502197266, + "learning_rate": 3.0044437372183622e-05, + "loss": 0.5459, + "step": 135440 + }, + { + "epoch": 1.197422160929295, + "grad_norm": 4.128241062164307, + "learning_rate": 3.004296398451175e-05, + "loss": 0.5197, + "step": 135450 + }, + { + "epoch": 1.1975105641896073, + "grad_norm": 1.3135501146316528, + "learning_rate": 3.004149059683988e-05, + "loss": 0.5041, + "step": 135460 + }, + { + "epoch": 1.1975989674499194, + "grad_norm": 1.1277292966842651, + "learning_rate": 3.004001720916801e-05, + "loss": 0.5991, + "step": 135470 + }, + { + "epoch": 1.1976873707102318, + "grad_norm": 5.00355339050293, + "learning_rate": 3.003854382149614e-05, + "loss": 0.6647, + "step": 135480 + }, + { + "epoch": 1.1977757739705441, + "grad_norm": 2.198127031326294, + "learning_rate": 3.0037070433824267e-05, + "loss": 0.5137, + "step": 135490 + }, + { + "epoch": 1.1978641772308563, + "grad_norm": 3.6041066646575928, + "learning_rate": 3.00355970461524e-05, + "loss": 0.5561, + "step": 135500 + }, + { + "epoch": 1.1979525804911686, + "grad_norm": 3.7992098331451416, + "learning_rate": 3.0034123658480527e-05, + "loss": 0.6922, + "step": 135510 + }, + { + "epoch": 1.1980409837514807, + "grad_norm": 1.5862653255462646, + "learning_rate": 3.0032650270808655e-05, + "loss": 0.5576, + "step": 135520 + }, + { + "epoch": 1.198129387011793, + "grad_norm": 3.370110034942627, + "learning_rate": 3.0031176883136787e-05, + "loss": 0.5795, + "step": 135530 + }, + { + "epoch": 1.1982177902721052, + "grad_norm": 7.045070648193359, + "learning_rate": 3.0029703495464912e-05, + "loss": 0.7336, + "step": 135540 + }, + { + "epoch": 1.1983061935324175, + "grad_norm": 6.976004600524902, + "learning_rate": 3.0028230107793044e-05, + "loss": 0.6323, + "step": 135550 + }, + { + "epoch": 1.1983945967927296, + "grad_norm": 15.105899810791016, + "learning_rate": 3.0026756720121176e-05, + "loss": 0.5799, + "step": 135560 + }, + { + "epoch": 1.198483000053042, + "grad_norm": 1.8034095764160156, + "learning_rate": 3.00252833324493e-05, + "loss": 0.5372, + "step": 135570 + }, + { + "epoch": 1.198571403313354, + "grad_norm": 4.731830596923828, + "learning_rate": 3.0023809944777432e-05, + "loss": 0.5295, + "step": 135580 + }, + { + "epoch": 1.1986598065736664, + "grad_norm": 3.444960832595825, + "learning_rate": 3.0022336557105564e-05, + "loss": 0.3763, + "step": 135590 + }, + { + "epoch": 1.1987482098339788, + "grad_norm": 12.027643203735352, + "learning_rate": 3.002086316943369e-05, + "loss": 0.6959, + "step": 135600 + }, + { + "epoch": 1.198836613094291, + "grad_norm": 3.482025384902954, + "learning_rate": 3.001938978176182e-05, + "loss": 0.6573, + "step": 135610 + }, + { + "epoch": 1.1989250163546032, + "grad_norm": 1.7336550951004028, + "learning_rate": 3.0017916394089953e-05, + "loss": 0.5449, + "step": 135620 + }, + { + "epoch": 1.1990134196149154, + "grad_norm": 2.5900959968566895, + "learning_rate": 3.0016443006418077e-05, + "loss": 0.6094, + "step": 135630 + }, + { + "epoch": 1.1991018228752277, + "grad_norm": 1.0362385511398315, + "learning_rate": 3.001496961874621e-05, + "loss": 0.5825, + "step": 135640 + }, + { + "epoch": 1.1991902261355398, + "grad_norm": 1.4746947288513184, + "learning_rate": 3.001349623107434e-05, + "loss": 0.559, + "step": 135650 + }, + { + "epoch": 1.1992786293958522, + "grad_norm": 3.1542747020721436, + "learning_rate": 3.0012022843402466e-05, + "loss": 0.6825, + "step": 135660 + }, + { + "epoch": 1.1993670326561643, + "grad_norm": 1.3476935625076294, + "learning_rate": 3.0010549455730598e-05, + "loss": 0.6431, + "step": 135670 + }, + { + "epoch": 1.1994554359164766, + "grad_norm": 2.6179378032684326, + "learning_rate": 3.0009076068058723e-05, + "loss": 0.6364, + "step": 135680 + }, + { + "epoch": 1.1995438391767888, + "grad_norm": 7.111267566680908, + "learning_rate": 3.0007602680386854e-05, + "loss": 0.6319, + "step": 135690 + }, + { + "epoch": 1.199632242437101, + "grad_norm": 1.1153324842453003, + "learning_rate": 3.0006129292714986e-05, + "loss": 0.6491, + "step": 135700 + }, + { + "epoch": 1.1997206456974134, + "grad_norm": 2.242643356323242, + "learning_rate": 3.000465590504311e-05, + "loss": 0.637, + "step": 135710 + }, + { + "epoch": 1.1998090489577256, + "grad_norm": 1.3586938381195068, + "learning_rate": 3.0003182517371243e-05, + "loss": 0.6326, + "step": 135720 + }, + { + "epoch": 1.199897452218038, + "grad_norm": 1.611083745956421, + "learning_rate": 3.0001709129699375e-05, + "loss": 0.6013, + "step": 135730 + }, + { + "epoch": 1.19998585547835, + "grad_norm": 1.6657063961029053, + "learning_rate": 3.00002357420275e-05, + "loss": 0.6407, + "step": 135740 + }, + { + "epoch": 1.2000742587386624, + "grad_norm": 1.2029578685760498, + "learning_rate": 2.999876235435563e-05, + "loss": 0.6533, + "step": 135750 + }, + { + "epoch": 1.2001626619989745, + "grad_norm": 1.8247076272964478, + "learning_rate": 2.9997288966683763e-05, + "loss": 0.5325, + "step": 135760 + }, + { + "epoch": 1.2002510652592868, + "grad_norm": 2.0752828121185303, + "learning_rate": 2.9995815579011888e-05, + "loss": 0.6111, + "step": 135770 + }, + { + "epoch": 1.200339468519599, + "grad_norm": 1.8151401281356812, + "learning_rate": 2.999434219134002e-05, + "loss": 0.592, + "step": 135780 + }, + { + "epoch": 1.2004278717799113, + "grad_norm": 3.682612657546997, + "learning_rate": 2.9992868803668145e-05, + "loss": 0.6635, + "step": 135790 + }, + { + "epoch": 1.2005162750402234, + "grad_norm": 4.259321212768555, + "learning_rate": 2.9991395415996276e-05, + "loss": 0.7588, + "step": 135800 + }, + { + "epoch": 1.2006046783005357, + "grad_norm": 4.106159210205078, + "learning_rate": 2.9989922028324408e-05, + "loss": 0.6797, + "step": 135810 + }, + { + "epoch": 1.200693081560848, + "grad_norm": 4.49326229095459, + "learning_rate": 2.9988448640652533e-05, + "loss": 0.5601, + "step": 135820 + }, + { + "epoch": 1.2007814848211602, + "grad_norm": 2.7030065059661865, + "learning_rate": 2.9986975252980665e-05, + "loss": 0.6338, + "step": 135830 + }, + { + "epoch": 1.2008698880814723, + "grad_norm": 2.13750958442688, + "learning_rate": 2.9985501865308797e-05, + "loss": 0.738, + "step": 135840 + }, + { + "epoch": 1.2009582913417847, + "grad_norm": 8.595383644104004, + "learning_rate": 2.998402847763692e-05, + "loss": 0.5787, + "step": 135850 + }, + { + "epoch": 1.201046694602097, + "grad_norm": 1.8548564910888672, + "learning_rate": 2.9982555089965053e-05, + "loss": 0.6345, + "step": 135860 + }, + { + "epoch": 1.2011350978624091, + "grad_norm": 1.257297158241272, + "learning_rate": 2.9981081702293185e-05, + "loss": 0.6923, + "step": 135870 + }, + { + "epoch": 1.2012235011227215, + "grad_norm": 7.5779547691345215, + "learning_rate": 2.997960831462131e-05, + "loss": 0.5577, + "step": 135880 + }, + { + "epoch": 1.2013119043830336, + "grad_norm": 2.7143030166625977, + "learning_rate": 2.9978134926949442e-05, + "loss": 0.6172, + "step": 135890 + }, + { + "epoch": 1.201400307643346, + "grad_norm": 1.1999244689941406, + "learning_rate": 2.9976661539277567e-05, + "loss": 0.6509, + "step": 135900 + }, + { + "epoch": 1.201488710903658, + "grad_norm": 1.4042283296585083, + "learning_rate": 2.99751881516057e-05, + "loss": 0.4573, + "step": 135910 + }, + { + "epoch": 1.2015771141639704, + "grad_norm": 2.753455877304077, + "learning_rate": 2.997371476393383e-05, + "loss": 0.5865, + "step": 135920 + }, + { + "epoch": 1.2016655174242825, + "grad_norm": 7.5508270263671875, + "learning_rate": 2.9972241376261955e-05, + "loss": 0.6316, + "step": 135930 + }, + { + "epoch": 1.2017539206845949, + "grad_norm": 4.950708389282227, + "learning_rate": 2.9970767988590087e-05, + "loss": 0.8057, + "step": 135940 + }, + { + "epoch": 1.201842323944907, + "grad_norm": 2.173250436782837, + "learning_rate": 2.996929460091822e-05, + "loss": 0.7843, + "step": 135950 + }, + { + "epoch": 1.2019307272052193, + "grad_norm": 2.9072792530059814, + "learning_rate": 2.9967821213246344e-05, + "loss": 0.5655, + "step": 135960 + }, + { + "epoch": 1.2020191304655317, + "grad_norm": 1.9952467679977417, + "learning_rate": 2.9966347825574475e-05, + "loss": 0.5429, + "step": 135970 + }, + { + "epoch": 1.2021075337258438, + "grad_norm": 1.2081823348999023, + "learning_rate": 2.9964874437902607e-05, + "loss": 0.6011, + "step": 135980 + }, + { + "epoch": 1.2021959369861561, + "grad_norm": 3.3362925052642822, + "learning_rate": 2.9963401050230732e-05, + "loss": 0.5857, + "step": 135990 + }, + { + "epoch": 1.2022843402464682, + "grad_norm": 2.562812328338623, + "learning_rate": 2.9961927662558864e-05, + "loss": 0.6045, + "step": 136000 + }, + { + "epoch": 1.2023727435067806, + "grad_norm": 7.28032922744751, + "learning_rate": 2.996045427488699e-05, + "loss": 0.6275, + "step": 136010 + }, + { + "epoch": 1.2024611467670927, + "grad_norm": 2.5389790534973145, + "learning_rate": 2.995898088721512e-05, + "loss": 0.7235, + "step": 136020 + }, + { + "epoch": 1.202549550027405, + "grad_norm": 4.076797008514404, + "learning_rate": 2.9957507499543252e-05, + "loss": 0.5264, + "step": 136030 + }, + { + "epoch": 1.2026379532877172, + "grad_norm": 3.2233691215515137, + "learning_rate": 2.9956034111871377e-05, + "loss": 0.6682, + "step": 136040 + }, + { + "epoch": 1.2027263565480295, + "grad_norm": 1.448582649230957, + "learning_rate": 2.995456072419951e-05, + "loss": 0.576, + "step": 136050 + }, + { + "epoch": 1.2028147598083416, + "grad_norm": 5.4315032958984375, + "learning_rate": 2.995308733652764e-05, + "loss": 0.531, + "step": 136060 + }, + { + "epoch": 1.202903163068654, + "grad_norm": 4.017689228057861, + "learning_rate": 2.9951613948855766e-05, + "loss": 0.6048, + "step": 136070 + }, + { + "epoch": 1.2029915663289663, + "grad_norm": 5.030168056488037, + "learning_rate": 2.9950140561183897e-05, + "loss": 0.5581, + "step": 136080 + }, + { + "epoch": 1.2030799695892784, + "grad_norm": 4.050236701965332, + "learning_rate": 2.994866717351203e-05, + "loss": 0.6695, + "step": 136090 + }, + { + "epoch": 1.2031683728495908, + "grad_norm": 1.908950924873352, + "learning_rate": 2.9947193785840154e-05, + "loss": 0.7135, + "step": 136100 + }, + { + "epoch": 1.203256776109903, + "grad_norm": 1.759932518005371, + "learning_rate": 2.9945720398168286e-05, + "loss": 0.6404, + "step": 136110 + }, + { + "epoch": 1.2033451793702152, + "grad_norm": 1.37447190284729, + "learning_rate": 2.9944247010496418e-05, + "loss": 0.5635, + "step": 136120 + }, + { + "epoch": 1.2034335826305274, + "grad_norm": 12.4634370803833, + "learning_rate": 2.9942773622824543e-05, + "loss": 0.6053, + "step": 136130 + }, + { + "epoch": 1.2035219858908397, + "grad_norm": 2.978100538253784, + "learning_rate": 2.9941300235152674e-05, + "loss": 0.7483, + "step": 136140 + }, + { + "epoch": 1.2036103891511518, + "grad_norm": 1.9565192461013794, + "learning_rate": 2.99398268474808e-05, + "loss": 0.5541, + "step": 136150 + }, + { + "epoch": 1.2036987924114642, + "grad_norm": 7.531537055969238, + "learning_rate": 2.993835345980893e-05, + "loss": 0.4802, + "step": 136160 + }, + { + "epoch": 1.2037871956717763, + "grad_norm": 3.354962110519409, + "learning_rate": 2.9936880072137063e-05, + "loss": 0.6952, + "step": 136170 + }, + { + "epoch": 1.2038755989320886, + "grad_norm": 6.775915145874023, + "learning_rate": 2.9935406684465188e-05, + "loss": 0.6826, + "step": 136180 + }, + { + "epoch": 1.203964002192401, + "grad_norm": 2.227693796157837, + "learning_rate": 2.993393329679332e-05, + "loss": 0.5377, + "step": 136190 + }, + { + "epoch": 1.204052405452713, + "grad_norm": 2.2532155513763428, + "learning_rate": 2.993245990912145e-05, + "loss": 0.566, + "step": 136200 + }, + { + "epoch": 1.2041408087130254, + "grad_norm": 3.50679874420166, + "learning_rate": 2.9930986521449576e-05, + "loss": 0.5649, + "step": 136210 + }, + { + "epoch": 1.2042292119733375, + "grad_norm": 11.085139274597168, + "learning_rate": 2.9929513133777708e-05, + "loss": 0.5629, + "step": 136220 + }, + { + "epoch": 1.2043176152336499, + "grad_norm": 2.4664134979248047, + "learning_rate": 2.992803974610584e-05, + "loss": 0.5645, + "step": 136230 + }, + { + "epoch": 1.204406018493962, + "grad_norm": 2.66837215423584, + "learning_rate": 2.9926566358433965e-05, + "loss": 0.5899, + "step": 136240 + }, + { + "epoch": 1.2044944217542743, + "grad_norm": 2.3268489837646484, + "learning_rate": 2.9925092970762096e-05, + "loss": 0.6027, + "step": 136250 + }, + { + "epoch": 1.2045828250145865, + "grad_norm": 4.256599426269531, + "learning_rate": 2.9923619583090225e-05, + "loss": 0.5748, + "step": 136260 + }, + { + "epoch": 1.2046712282748988, + "grad_norm": 16.19921112060547, + "learning_rate": 2.9922146195418353e-05, + "loss": 0.6162, + "step": 136270 + }, + { + "epoch": 1.204759631535211, + "grad_norm": 1.312179446220398, + "learning_rate": 2.9920672807746485e-05, + "loss": 0.5321, + "step": 136280 + }, + { + "epoch": 1.2048480347955233, + "grad_norm": 2.1660702228546143, + "learning_rate": 2.9919199420074613e-05, + "loss": 0.6213, + "step": 136290 + }, + { + "epoch": 1.2049364380558356, + "grad_norm": 1.8339970111846924, + "learning_rate": 2.991772603240274e-05, + "loss": 0.5352, + "step": 136300 + }, + { + "epoch": 1.2050248413161477, + "grad_norm": 1.6349108219146729, + "learning_rate": 2.9916252644730873e-05, + "loss": 0.6056, + "step": 136310 + }, + { + "epoch": 1.20511324457646, + "grad_norm": 4.372035980224609, + "learning_rate": 2.9914779257059e-05, + "loss": 0.545, + "step": 136320 + }, + { + "epoch": 1.2052016478367722, + "grad_norm": 2.2924644947052, + "learning_rate": 2.991330586938713e-05, + "loss": 0.6779, + "step": 136330 + }, + { + "epoch": 1.2052900510970845, + "grad_norm": 1.4742251634597778, + "learning_rate": 2.9911832481715262e-05, + "loss": 0.658, + "step": 136340 + }, + { + "epoch": 1.2053784543573967, + "grad_norm": 5.213178634643555, + "learning_rate": 2.991035909404339e-05, + "loss": 0.6454, + "step": 136350 + }, + { + "epoch": 1.205466857617709, + "grad_norm": 8.827898979187012, + "learning_rate": 2.990888570637152e-05, + "loss": 0.7178, + "step": 136360 + }, + { + "epoch": 1.2055552608780211, + "grad_norm": 1.477850317955017, + "learning_rate": 2.9907412318699647e-05, + "loss": 0.5543, + "step": 136370 + }, + { + "epoch": 1.2056436641383335, + "grad_norm": 2.4808008670806885, + "learning_rate": 2.990593893102778e-05, + "loss": 0.5529, + "step": 136380 + }, + { + "epoch": 1.2057320673986456, + "grad_norm": 11.636964797973633, + "learning_rate": 2.9904465543355907e-05, + "loss": 0.5901, + "step": 136390 + }, + { + "epoch": 1.205820470658958, + "grad_norm": 1.3622137308120728, + "learning_rate": 2.9902992155684035e-05, + "loss": 0.6249, + "step": 136400 + }, + { + "epoch": 1.2059088739192703, + "grad_norm": 2.1333587169647217, + "learning_rate": 2.9901518768012167e-05, + "loss": 0.6967, + "step": 136410 + }, + { + "epoch": 1.2059972771795824, + "grad_norm": 6.147896766662598, + "learning_rate": 2.9900045380340295e-05, + "loss": 0.6283, + "step": 136420 + }, + { + "epoch": 1.2060856804398945, + "grad_norm": 2.3316831588745117, + "learning_rate": 2.9898571992668424e-05, + "loss": 0.6949, + "step": 136430 + }, + { + "epoch": 1.2061740837002068, + "grad_norm": 1.2832236289978027, + "learning_rate": 2.9897098604996555e-05, + "loss": 0.5576, + "step": 136440 + }, + { + "epoch": 1.2062624869605192, + "grad_norm": 1.4624751806259155, + "learning_rate": 2.9895625217324684e-05, + "loss": 0.6055, + "step": 136450 + }, + { + "epoch": 1.2063508902208313, + "grad_norm": 1.3734575510025024, + "learning_rate": 2.9894151829652812e-05, + "loss": 0.4974, + "step": 136460 + }, + { + "epoch": 1.2064392934811436, + "grad_norm": 1.2721678018569946, + "learning_rate": 2.9892678441980944e-05, + "loss": 0.5278, + "step": 136470 + }, + { + "epoch": 1.2065276967414558, + "grad_norm": 0.8982300758361816, + "learning_rate": 2.989120505430907e-05, + "loss": 0.7269, + "step": 136480 + }, + { + "epoch": 1.206616100001768, + "grad_norm": 0.6635897159576416, + "learning_rate": 2.98897316666372e-05, + "loss": 0.6442, + "step": 136490 + }, + { + "epoch": 1.2067045032620802, + "grad_norm": 1.790482759475708, + "learning_rate": 2.9888258278965332e-05, + "loss": 0.5081, + "step": 136500 + }, + { + "epoch": 1.2067929065223926, + "grad_norm": 1.2175158262252808, + "learning_rate": 2.9886784891293457e-05, + "loss": 0.5805, + "step": 136510 + }, + { + "epoch": 1.2068813097827047, + "grad_norm": 0.6000199913978577, + "learning_rate": 2.988531150362159e-05, + "loss": 0.5517, + "step": 136520 + }, + { + "epoch": 1.206969713043017, + "grad_norm": 2.8189423084259033, + "learning_rate": 2.988383811594972e-05, + "loss": 0.6113, + "step": 136530 + }, + { + "epoch": 1.2070581163033292, + "grad_norm": 4.3407111167907715, + "learning_rate": 2.9882364728277846e-05, + "loss": 0.5647, + "step": 136540 + }, + { + "epoch": 1.2071465195636415, + "grad_norm": 8.9413480758667, + "learning_rate": 2.9880891340605978e-05, + "loss": 0.6584, + "step": 136550 + }, + { + "epoch": 1.2072349228239538, + "grad_norm": 10.368038177490234, + "learning_rate": 2.987941795293411e-05, + "loss": 0.6513, + "step": 136560 + }, + { + "epoch": 1.207323326084266, + "grad_norm": 1.3307462930679321, + "learning_rate": 2.9877944565262234e-05, + "loss": 0.5898, + "step": 136570 + }, + { + "epoch": 1.2074117293445783, + "grad_norm": 1.3528331518173218, + "learning_rate": 2.9876471177590366e-05, + "loss": 0.6154, + "step": 136580 + }, + { + "epoch": 1.2075001326048904, + "grad_norm": 2.554675579071045, + "learning_rate": 2.9874997789918498e-05, + "loss": 0.6681, + "step": 136590 + }, + { + "epoch": 1.2075885358652028, + "grad_norm": 2.9656012058258057, + "learning_rate": 2.9873524402246623e-05, + "loss": 0.7641, + "step": 136600 + }, + { + "epoch": 1.2076769391255149, + "grad_norm": 6.637085914611816, + "learning_rate": 2.9872051014574754e-05, + "loss": 0.6516, + "step": 136610 + }, + { + "epoch": 1.2077653423858272, + "grad_norm": 1.9870898723602295, + "learning_rate": 2.987057762690288e-05, + "loss": 0.6032, + "step": 136620 + }, + { + "epoch": 1.2078537456461393, + "grad_norm": 23.89141082763672, + "learning_rate": 2.986910423923101e-05, + "loss": 0.5617, + "step": 136630 + }, + { + "epoch": 1.2079421489064517, + "grad_norm": 5.746498107910156, + "learning_rate": 2.9867630851559143e-05, + "loss": 0.4003, + "step": 136640 + }, + { + "epoch": 1.2080305521667638, + "grad_norm": 3.930852174758911, + "learning_rate": 2.9866157463887268e-05, + "loss": 0.7007, + "step": 136650 + }, + { + "epoch": 1.2081189554270761, + "grad_norm": 6.953625202178955, + "learning_rate": 2.98646840762154e-05, + "loss": 0.678, + "step": 136660 + }, + { + "epoch": 1.2082073586873885, + "grad_norm": 4.143732070922852, + "learning_rate": 2.986321068854353e-05, + "loss": 0.6667, + "step": 136670 + }, + { + "epoch": 1.2082957619477006, + "grad_norm": 0.9013015627861023, + "learning_rate": 2.9861737300871656e-05, + "loss": 0.4879, + "step": 136680 + }, + { + "epoch": 1.208384165208013, + "grad_norm": 5.37971305847168, + "learning_rate": 2.9860263913199788e-05, + "loss": 0.6711, + "step": 136690 + }, + { + "epoch": 1.208472568468325, + "grad_norm": 3.4336211681365967, + "learning_rate": 2.985879052552792e-05, + "loss": 0.6039, + "step": 136700 + }, + { + "epoch": 1.2085609717286374, + "grad_norm": 2.0258169174194336, + "learning_rate": 2.9857317137856045e-05, + "loss": 0.5788, + "step": 136710 + }, + { + "epoch": 1.2086493749889495, + "grad_norm": 3.508629083633423, + "learning_rate": 2.9855843750184176e-05, + "loss": 0.483, + "step": 136720 + }, + { + "epoch": 1.2087377782492619, + "grad_norm": 1.486860752105713, + "learning_rate": 2.98543703625123e-05, + "loss": 0.7178, + "step": 136730 + }, + { + "epoch": 1.208826181509574, + "grad_norm": 6.615573883056641, + "learning_rate": 2.9852896974840433e-05, + "loss": 0.6733, + "step": 136740 + }, + { + "epoch": 1.2089145847698863, + "grad_norm": 2.1742522716522217, + "learning_rate": 2.9851423587168565e-05, + "loss": 0.6439, + "step": 136750 + }, + { + "epoch": 1.2090029880301985, + "grad_norm": 1.98505437374115, + "learning_rate": 2.984995019949669e-05, + "loss": 0.7232, + "step": 136760 + }, + { + "epoch": 1.2090913912905108, + "grad_norm": 2.0811924934387207, + "learning_rate": 2.984847681182482e-05, + "loss": 0.7184, + "step": 136770 + }, + { + "epoch": 1.2091797945508231, + "grad_norm": 1.3345839977264404, + "learning_rate": 2.9847003424152953e-05, + "loss": 0.643, + "step": 136780 + }, + { + "epoch": 1.2092681978111353, + "grad_norm": 12.411947250366211, + "learning_rate": 2.984553003648108e-05, + "loss": 0.5643, + "step": 136790 + }, + { + "epoch": 1.2093566010714476, + "grad_norm": 1.6442501544952393, + "learning_rate": 2.984405664880921e-05, + "loss": 0.6276, + "step": 136800 + }, + { + "epoch": 1.2094450043317597, + "grad_norm": 0.918235719203949, + "learning_rate": 2.9842583261137342e-05, + "loss": 0.5083, + "step": 136810 + }, + { + "epoch": 1.209533407592072, + "grad_norm": 3.278961181640625, + "learning_rate": 2.9841109873465467e-05, + "loss": 0.5528, + "step": 136820 + }, + { + "epoch": 1.2096218108523842, + "grad_norm": 3.771461009979248, + "learning_rate": 2.98396364857936e-05, + "loss": 0.6625, + "step": 136830 + }, + { + "epoch": 1.2097102141126965, + "grad_norm": 1.090736746788025, + "learning_rate": 2.9838163098121724e-05, + "loss": 0.5648, + "step": 136840 + }, + { + "epoch": 1.2097986173730086, + "grad_norm": 0.7083085179328918, + "learning_rate": 2.9836689710449855e-05, + "loss": 0.6447, + "step": 136850 + }, + { + "epoch": 1.209887020633321, + "grad_norm": 12.285191535949707, + "learning_rate": 2.9835216322777987e-05, + "loss": 0.8053, + "step": 136860 + }, + { + "epoch": 1.209975423893633, + "grad_norm": 2.7982661724090576, + "learning_rate": 2.9833742935106112e-05, + "loss": 0.4441, + "step": 136870 + }, + { + "epoch": 1.2100638271539454, + "grad_norm": 2.4491517543792725, + "learning_rate": 2.9832269547434244e-05, + "loss": 0.6215, + "step": 136880 + }, + { + "epoch": 1.2101522304142578, + "grad_norm": 6.341678142547607, + "learning_rate": 2.9830796159762375e-05, + "loss": 0.6386, + "step": 136890 + }, + { + "epoch": 1.21024063367457, + "grad_norm": 1.4119036197662354, + "learning_rate": 2.98293227720905e-05, + "loss": 0.4223, + "step": 136900 + }, + { + "epoch": 1.2103290369348823, + "grad_norm": 2.488222122192383, + "learning_rate": 2.9827849384418632e-05, + "loss": 0.5413, + "step": 136910 + }, + { + "epoch": 1.2104174401951944, + "grad_norm": 3.0203399658203125, + "learning_rate": 2.9826375996746764e-05, + "loss": 0.7236, + "step": 136920 + }, + { + "epoch": 1.2105058434555067, + "grad_norm": 3.1244728565216064, + "learning_rate": 2.982490260907489e-05, + "loss": 0.5899, + "step": 136930 + }, + { + "epoch": 1.2105942467158188, + "grad_norm": 2.269578456878662, + "learning_rate": 2.982342922140302e-05, + "loss": 0.5994, + "step": 136940 + }, + { + "epoch": 1.2106826499761312, + "grad_norm": 7.8710808753967285, + "learning_rate": 2.9821955833731152e-05, + "loss": 0.5414, + "step": 136950 + }, + { + "epoch": 1.2107710532364433, + "grad_norm": 2.330002784729004, + "learning_rate": 2.9820482446059277e-05, + "loss": 0.5062, + "step": 136960 + }, + { + "epoch": 1.2108594564967556, + "grad_norm": 1.1440849304199219, + "learning_rate": 2.981900905838741e-05, + "loss": 0.5044, + "step": 136970 + }, + { + "epoch": 1.2109478597570678, + "grad_norm": 2.307657480239868, + "learning_rate": 2.9817535670715534e-05, + "loss": 0.6238, + "step": 136980 + }, + { + "epoch": 1.21103626301738, + "grad_norm": 2.5043787956237793, + "learning_rate": 2.9816062283043666e-05, + "loss": 0.623, + "step": 136990 + }, + { + "epoch": 1.2111246662776924, + "grad_norm": 4.4899091720581055, + "learning_rate": 2.9814588895371798e-05, + "loss": 0.5782, + "step": 137000 + }, + { + "epoch": 1.2112130695380046, + "grad_norm": 1.0341296195983887, + "learning_rate": 2.9813115507699922e-05, + "loss": 0.5798, + "step": 137010 + }, + { + "epoch": 1.2113014727983167, + "grad_norm": 1.7302199602127075, + "learning_rate": 2.9811642120028054e-05, + "loss": 0.6064, + "step": 137020 + }, + { + "epoch": 1.211389876058629, + "grad_norm": 17.51358413696289, + "learning_rate": 2.9810168732356186e-05, + "loss": 0.7097, + "step": 137030 + }, + { + "epoch": 1.2114782793189414, + "grad_norm": 3.5841894149780273, + "learning_rate": 2.980869534468431e-05, + "loss": 0.7135, + "step": 137040 + }, + { + "epoch": 1.2115666825792535, + "grad_norm": 4.1583685874938965, + "learning_rate": 2.9807221957012443e-05, + "loss": 0.6791, + "step": 137050 + }, + { + "epoch": 1.2116550858395658, + "grad_norm": 3.6986026763916016, + "learning_rate": 2.9805748569340574e-05, + "loss": 0.6759, + "step": 137060 + }, + { + "epoch": 1.211743489099878, + "grad_norm": 3.9213991165161133, + "learning_rate": 2.98042751816687e-05, + "loss": 0.6375, + "step": 137070 + }, + { + "epoch": 1.2118318923601903, + "grad_norm": 5.8226423263549805, + "learning_rate": 2.980280179399683e-05, + "loss": 0.6665, + "step": 137080 + }, + { + "epoch": 1.2119202956205024, + "grad_norm": 3.0791375637054443, + "learning_rate": 2.9801328406324956e-05, + "loss": 0.6518, + "step": 137090 + }, + { + "epoch": 1.2120086988808147, + "grad_norm": 4.601994514465332, + "learning_rate": 2.9799855018653088e-05, + "loss": 0.6553, + "step": 137100 + }, + { + "epoch": 1.2120971021411269, + "grad_norm": 1.0395901203155518, + "learning_rate": 2.979838163098122e-05, + "loss": 0.6115, + "step": 137110 + }, + { + "epoch": 1.2121855054014392, + "grad_norm": 1.40775465965271, + "learning_rate": 2.9796908243309345e-05, + "loss": 0.6279, + "step": 137120 + }, + { + "epoch": 1.2122739086617513, + "grad_norm": 4.496260166168213, + "learning_rate": 2.9795434855637476e-05, + "loss": 0.5396, + "step": 137130 + }, + { + "epoch": 1.2123623119220637, + "grad_norm": 2.5563015937805176, + "learning_rate": 2.9793961467965608e-05, + "loss": 0.4649, + "step": 137140 + }, + { + "epoch": 1.212450715182376, + "grad_norm": 23.356992721557617, + "learning_rate": 2.9792488080293733e-05, + "loss": 0.6542, + "step": 137150 + }, + { + "epoch": 1.2125391184426881, + "grad_norm": 4.126574993133545, + "learning_rate": 2.9791014692621865e-05, + "loss": 0.4877, + "step": 137160 + }, + { + "epoch": 1.2126275217030005, + "grad_norm": 2.8210806846618652, + "learning_rate": 2.9789541304949996e-05, + "loss": 0.6324, + "step": 137170 + }, + { + "epoch": 1.2127159249633126, + "grad_norm": 3.3037822246551514, + "learning_rate": 2.978806791727812e-05, + "loss": 0.543, + "step": 137180 + }, + { + "epoch": 1.212804328223625, + "grad_norm": 2.2153584957122803, + "learning_rate": 2.9786594529606253e-05, + "loss": 0.7084, + "step": 137190 + }, + { + "epoch": 1.212892731483937, + "grad_norm": 3.0296430587768555, + "learning_rate": 2.978512114193438e-05, + "loss": 0.5181, + "step": 137200 + }, + { + "epoch": 1.2129811347442494, + "grad_norm": 4.3093438148498535, + "learning_rate": 2.978364775426251e-05, + "loss": 0.5614, + "step": 137210 + }, + { + "epoch": 1.2130695380045615, + "grad_norm": 2.935051679611206, + "learning_rate": 2.978217436659064e-05, + "loss": 0.6652, + "step": 137220 + }, + { + "epoch": 1.2131579412648739, + "grad_norm": 3.412858724594116, + "learning_rate": 2.978070097891877e-05, + "loss": 0.556, + "step": 137230 + }, + { + "epoch": 1.213246344525186, + "grad_norm": 2.090912342071533, + "learning_rate": 2.97792275912469e-05, + "loss": 0.5652, + "step": 137240 + }, + { + "epoch": 1.2133347477854983, + "grad_norm": 1.0503562688827515, + "learning_rate": 2.977775420357503e-05, + "loss": 0.6239, + "step": 137250 + }, + { + "epoch": 1.2134231510458107, + "grad_norm": 2.9367406368255615, + "learning_rate": 2.977628081590316e-05, + "loss": 0.786, + "step": 137260 + }, + { + "epoch": 1.2135115543061228, + "grad_norm": 8.727594375610352, + "learning_rate": 2.9774807428231287e-05, + "loss": 0.4766, + "step": 137270 + }, + { + "epoch": 1.2135999575664351, + "grad_norm": 2.7663381099700928, + "learning_rate": 2.977333404055942e-05, + "loss": 0.608, + "step": 137280 + }, + { + "epoch": 1.2136883608267472, + "grad_norm": 1.3357828855514526, + "learning_rate": 2.9771860652887547e-05, + "loss": 0.6031, + "step": 137290 + }, + { + "epoch": 1.2137767640870596, + "grad_norm": 2.4202351570129395, + "learning_rate": 2.9770387265215675e-05, + "loss": 0.6815, + "step": 137300 + }, + { + "epoch": 1.2138651673473717, + "grad_norm": 2.0652809143066406, + "learning_rate": 2.9768913877543804e-05, + "loss": 0.7682, + "step": 137310 + }, + { + "epoch": 1.213953570607684, + "grad_norm": 1.4614520072937012, + "learning_rate": 2.9767440489871935e-05, + "loss": 0.6199, + "step": 137320 + }, + { + "epoch": 1.2140419738679962, + "grad_norm": 2.4469339847564697, + "learning_rate": 2.9765967102200064e-05, + "loss": 0.4863, + "step": 137330 + }, + { + "epoch": 1.2141303771283085, + "grad_norm": 2.1750988960266113, + "learning_rate": 2.9764493714528192e-05, + "loss": 0.4876, + "step": 137340 + }, + { + "epoch": 1.2142187803886206, + "grad_norm": 2.6206347942352295, + "learning_rate": 2.9763020326856324e-05, + "loss": 0.5562, + "step": 137350 + }, + { + "epoch": 1.214307183648933, + "grad_norm": 2.2426376342773438, + "learning_rate": 2.9761546939184452e-05, + "loss": 0.6917, + "step": 137360 + }, + { + "epoch": 1.2143955869092453, + "grad_norm": 6.355005264282227, + "learning_rate": 2.976007355151258e-05, + "loss": 0.6412, + "step": 137370 + }, + { + "epoch": 1.2144839901695574, + "grad_norm": 1.5263639688491821, + "learning_rate": 2.9758600163840712e-05, + "loss": 0.5678, + "step": 137380 + }, + { + "epoch": 1.2145723934298698, + "grad_norm": 1.8938119411468506, + "learning_rate": 2.975712677616884e-05, + "loss": 0.6178, + "step": 137390 + }, + { + "epoch": 1.214660796690182, + "grad_norm": 0.7140837907791138, + "learning_rate": 2.975565338849697e-05, + "loss": 0.6161, + "step": 137400 + }, + { + "epoch": 1.2147491999504942, + "grad_norm": 1.3030954599380493, + "learning_rate": 2.97541800008251e-05, + "loss": 0.6784, + "step": 137410 + }, + { + "epoch": 1.2148376032108064, + "grad_norm": 1.3001399040222168, + "learning_rate": 2.975270661315323e-05, + "loss": 0.6489, + "step": 137420 + }, + { + "epoch": 1.2149260064711187, + "grad_norm": 2.305727005004883, + "learning_rate": 2.9751233225481357e-05, + "loss": 0.6255, + "step": 137430 + }, + { + "epoch": 1.2150144097314308, + "grad_norm": 3.130544662475586, + "learning_rate": 2.974975983780949e-05, + "loss": 0.6187, + "step": 137440 + }, + { + "epoch": 1.2151028129917432, + "grad_norm": 2.104393482208252, + "learning_rate": 2.9748286450137614e-05, + "loss": 0.6462, + "step": 137450 + }, + { + "epoch": 1.2151912162520553, + "grad_norm": 17.182109832763672, + "learning_rate": 2.9746813062465746e-05, + "loss": 0.5703, + "step": 137460 + }, + { + "epoch": 1.2152796195123676, + "grad_norm": 1.2784488201141357, + "learning_rate": 2.9745339674793878e-05, + "loss": 0.5694, + "step": 137470 + }, + { + "epoch": 1.21536802277268, + "grad_norm": 4.601258754730225, + "learning_rate": 2.9743866287122003e-05, + "loss": 0.635, + "step": 137480 + }, + { + "epoch": 1.215456426032992, + "grad_norm": 5.744540214538574, + "learning_rate": 2.9742392899450134e-05, + "loss": 0.5518, + "step": 137490 + }, + { + "epoch": 1.2155448292933044, + "grad_norm": 3.579768419265747, + "learning_rate": 2.9740919511778266e-05, + "loss": 0.6247, + "step": 137500 + }, + { + "epoch": 1.2156332325536165, + "grad_norm": 1.313004732131958, + "learning_rate": 2.973944612410639e-05, + "loss": 0.5962, + "step": 137510 + }, + { + "epoch": 1.215721635813929, + "grad_norm": 2.258887529373169, + "learning_rate": 2.9737972736434523e-05, + "loss": 0.5043, + "step": 137520 + }, + { + "epoch": 1.215810039074241, + "grad_norm": 0.7860084772109985, + "learning_rate": 2.9736499348762654e-05, + "loss": 0.7221, + "step": 137530 + }, + { + "epoch": 1.2158984423345534, + "grad_norm": 1.6615575551986694, + "learning_rate": 2.973502596109078e-05, + "loss": 0.5724, + "step": 137540 + }, + { + "epoch": 1.2159868455948655, + "grad_norm": 7.128425598144531, + "learning_rate": 2.973355257341891e-05, + "loss": 0.7745, + "step": 137550 + }, + { + "epoch": 1.2160752488551778, + "grad_norm": 6.046515464782715, + "learning_rate": 2.9732079185747036e-05, + "loss": 0.6238, + "step": 137560 + }, + { + "epoch": 1.21616365211549, + "grad_norm": 5.1349005699157715, + "learning_rate": 2.9730605798075168e-05, + "loss": 0.6999, + "step": 137570 + }, + { + "epoch": 1.2162520553758023, + "grad_norm": 8.523122787475586, + "learning_rate": 2.97291324104033e-05, + "loss": 0.6226, + "step": 137580 + }, + { + "epoch": 1.2163404586361146, + "grad_norm": 2.7262232303619385, + "learning_rate": 2.9727659022731425e-05, + "loss": 0.7337, + "step": 137590 + }, + { + "epoch": 1.2164288618964267, + "grad_norm": 2.348341703414917, + "learning_rate": 2.9726185635059556e-05, + "loss": 0.6576, + "step": 137600 + }, + { + "epoch": 1.2165172651567389, + "grad_norm": 4.144571304321289, + "learning_rate": 2.9724712247387688e-05, + "loss": 0.6071, + "step": 137610 + }, + { + "epoch": 1.2166056684170512, + "grad_norm": 5.32082462310791, + "learning_rate": 2.9723238859715813e-05, + "loss": 0.6986, + "step": 137620 + }, + { + "epoch": 1.2166940716773635, + "grad_norm": 1.6616417169570923, + "learning_rate": 2.9721765472043945e-05, + "loss": 0.5176, + "step": 137630 + }, + { + "epoch": 1.2167824749376757, + "grad_norm": 8.661393165588379, + "learning_rate": 2.9720292084372077e-05, + "loss": 0.6621, + "step": 137640 + }, + { + "epoch": 1.216870878197988, + "grad_norm": 12.566831588745117, + "learning_rate": 2.97188186967002e-05, + "loss": 0.5925, + "step": 137650 + }, + { + "epoch": 1.2169592814583001, + "grad_norm": 11.358185768127441, + "learning_rate": 2.9717345309028333e-05, + "loss": 0.8209, + "step": 137660 + }, + { + "epoch": 1.2170476847186125, + "grad_norm": 2.285712957382202, + "learning_rate": 2.9715871921356458e-05, + "loss": 0.5525, + "step": 137670 + }, + { + "epoch": 1.2171360879789246, + "grad_norm": 1.9643148183822632, + "learning_rate": 2.971439853368459e-05, + "loss": 0.6341, + "step": 137680 + }, + { + "epoch": 1.217224491239237, + "grad_norm": 6.272572040557861, + "learning_rate": 2.971292514601272e-05, + "loss": 0.6458, + "step": 137690 + }, + { + "epoch": 1.217312894499549, + "grad_norm": 2.1927270889282227, + "learning_rate": 2.9711451758340847e-05, + "loss": 0.6917, + "step": 137700 + }, + { + "epoch": 1.2174012977598614, + "grad_norm": 7.6609697341918945, + "learning_rate": 2.970997837066898e-05, + "loss": 0.6144, + "step": 137710 + }, + { + "epoch": 1.2174897010201735, + "grad_norm": 1.4886795282363892, + "learning_rate": 2.970850498299711e-05, + "loss": 0.6124, + "step": 137720 + }, + { + "epoch": 1.2175781042804859, + "grad_norm": 3.1784870624542236, + "learning_rate": 2.9707031595325235e-05, + "loss": 0.6208, + "step": 137730 + }, + { + "epoch": 1.2176665075407982, + "grad_norm": 4.474926948547363, + "learning_rate": 2.9705558207653367e-05, + "loss": 0.6468, + "step": 137740 + }, + { + "epoch": 1.2177549108011103, + "grad_norm": 2.798476457595825, + "learning_rate": 2.97040848199815e-05, + "loss": 0.5288, + "step": 137750 + }, + { + "epoch": 1.2178433140614227, + "grad_norm": 1.4441986083984375, + "learning_rate": 2.9702611432309624e-05, + "loss": 0.4914, + "step": 137760 + }, + { + "epoch": 1.2179317173217348, + "grad_norm": 3.248546838760376, + "learning_rate": 2.9701138044637755e-05, + "loss": 0.4981, + "step": 137770 + }, + { + "epoch": 1.2180201205820471, + "grad_norm": 3.7279415130615234, + "learning_rate": 2.969966465696588e-05, + "loss": 0.6593, + "step": 137780 + }, + { + "epoch": 1.2181085238423592, + "grad_norm": 2.674619197845459, + "learning_rate": 2.9698191269294012e-05, + "loss": 0.5844, + "step": 137790 + }, + { + "epoch": 1.2181969271026716, + "grad_norm": 8.349813461303711, + "learning_rate": 2.9696717881622144e-05, + "loss": 0.5755, + "step": 137800 + }, + { + "epoch": 1.2182853303629837, + "grad_norm": 4.912051677703857, + "learning_rate": 2.969524449395027e-05, + "loss": 0.5173, + "step": 137810 + }, + { + "epoch": 1.218373733623296, + "grad_norm": 8.534285545349121, + "learning_rate": 2.96937711062784e-05, + "loss": 0.5858, + "step": 137820 + }, + { + "epoch": 1.2184621368836082, + "grad_norm": 1.9246318340301514, + "learning_rate": 2.9692297718606532e-05, + "loss": 0.5856, + "step": 137830 + }, + { + "epoch": 1.2185505401439205, + "grad_norm": 1.6515977382659912, + "learning_rate": 2.9690824330934657e-05, + "loss": 0.6729, + "step": 137840 + }, + { + "epoch": 1.2186389434042328, + "grad_norm": 2.6866743564605713, + "learning_rate": 2.968935094326279e-05, + "loss": 0.6847, + "step": 137850 + }, + { + "epoch": 1.218727346664545, + "grad_norm": 2.5193541049957275, + "learning_rate": 2.968787755559092e-05, + "loss": 0.706, + "step": 137860 + }, + { + "epoch": 1.2188157499248573, + "grad_norm": 2.6215736865997314, + "learning_rate": 2.9686404167919046e-05, + "loss": 0.6244, + "step": 137870 + }, + { + "epoch": 1.2189041531851694, + "grad_norm": 4.919952869415283, + "learning_rate": 2.9684930780247177e-05, + "loss": 0.6091, + "step": 137880 + }, + { + "epoch": 1.2189925564454818, + "grad_norm": 1.4555758237838745, + "learning_rate": 2.968345739257531e-05, + "loss": 0.6375, + "step": 137890 + }, + { + "epoch": 1.2190809597057939, + "grad_norm": 8.432286262512207, + "learning_rate": 2.9681984004903434e-05, + "loss": 0.7329, + "step": 137900 + }, + { + "epoch": 1.2191693629661062, + "grad_norm": 3.4205806255340576, + "learning_rate": 2.9680510617231566e-05, + "loss": 0.5508, + "step": 137910 + }, + { + "epoch": 1.2192577662264183, + "grad_norm": 9.265718460083008, + "learning_rate": 2.967903722955969e-05, + "loss": 0.63, + "step": 137920 + }, + { + "epoch": 1.2193461694867307, + "grad_norm": 1.8168007135391235, + "learning_rate": 2.9677563841887823e-05, + "loss": 0.6797, + "step": 137930 + }, + { + "epoch": 1.2194345727470428, + "grad_norm": 4.4171671867370605, + "learning_rate": 2.9676090454215954e-05, + "loss": 0.5785, + "step": 137940 + }, + { + "epoch": 1.2195229760073552, + "grad_norm": 2.2304272651672363, + "learning_rate": 2.967461706654408e-05, + "loss": 0.6669, + "step": 137950 + }, + { + "epoch": 1.2196113792676675, + "grad_norm": 1.7471081018447876, + "learning_rate": 2.967314367887221e-05, + "loss": 0.6249, + "step": 137960 + }, + { + "epoch": 1.2196997825279796, + "grad_norm": 2.7947728633880615, + "learning_rate": 2.9671670291200343e-05, + "loss": 0.6053, + "step": 137970 + }, + { + "epoch": 1.219788185788292, + "grad_norm": 2.0127172470092773, + "learning_rate": 2.9670196903528468e-05, + "loss": 0.6746, + "step": 137980 + }, + { + "epoch": 1.219876589048604, + "grad_norm": 1.0718377828598022, + "learning_rate": 2.96687235158566e-05, + "loss": 0.5889, + "step": 137990 + }, + { + "epoch": 1.2199649923089164, + "grad_norm": 1.4731075763702393, + "learning_rate": 2.966725012818473e-05, + "loss": 0.6613, + "step": 138000 + }, + { + "epoch": 1.2200533955692285, + "grad_norm": 1.4960943460464478, + "learning_rate": 2.9665776740512856e-05, + "loss": 0.497, + "step": 138010 + }, + { + "epoch": 1.2201417988295409, + "grad_norm": 3.8215792179107666, + "learning_rate": 2.9664303352840988e-05, + "loss": 0.6802, + "step": 138020 + }, + { + "epoch": 1.220230202089853, + "grad_norm": 4.7353010177612305, + "learning_rate": 2.9662829965169113e-05, + "loss": 0.592, + "step": 138030 + }, + { + "epoch": 1.2203186053501653, + "grad_norm": 3.953007698059082, + "learning_rate": 2.9661356577497245e-05, + "loss": 0.5935, + "step": 138040 + }, + { + "epoch": 1.2204070086104775, + "grad_norm": 2.577096700668335, + "learning_rate": 2.9659883189825376e-05, + "loss": 0.5629, + "step": 138050 + }, + { + "epoch": 1.2204954118707898, + "grad_norm": 2.1419148445129395, + "learning_rate": 2.96584098021535e-05, + "loss": 0.75, + "step": 138060 + }, + { + "epoch": 1.2205838151311021, + "grad_norm": 3.8945703506469727, + "learning_rate": 2.9656936414481633e-05, + "loss": 0.4608, + "step": 138070 + }, + { + "epoch": 1.2206722183914143, + "grad_norm": 5.334362030029297, + "learning_rate": 2.9655463026809765e-05, + "loss": 0.5213, + "step": 138080 + }, + { + "epoch": 1.2207606216517266, + "grad_norm": 1.595914363861084, + "learning_rate": 2.965398963913789e-05, + "loss": 0.6218, + "step": 138090 + }, + { + "epoch": 1.2208490249120387, + "grad_norm": 2.302316904067993, + "learning_rate": 2.965251625146602e-05, + "loss": 0.6194, + "step": 138100 + }, + { + "epoch": 1.220937428172351, + "grad_norm": 4.442800521850586, + "learning_rate": 2.9651042863794153e-05, + "loss": 0.5927, + "step": 138110 + }, + { + "epoch": 1.2210258314326632, + "grad_norm": 14.791041374206543, + "learning_rate": 2.9649569476122278e-05, + "loss": 0.6413, + "step": 138120 + }, + { + "epoch": 1.2211142346929755, + "grad_norm": 1.07089102268219, + "learning_rate": 2.964809608845041e-05, + "loss": 0.6038, + "step": 138130 + }, + { + "epoch": 1.2212026379532877, + "grad_norm": 2.700444221496582, + "learning_rate": 2.9646622700778538e-05, + "loss": 0.6453, + "step": 138140 + }, + { + "epoch": 1.2212910412136, + "grad_norm": 2.884981870651245, + "learning_rate": 2.9645149313106667e-05, + "loss": 0.7445, + "step": 138150 + }, + { + "epoch": 1.2213794444739121, + "grad_norm": 10.680171012878418, + "learning_rate": 2.96436759254348e-05, + "loss": 0.5982, + "step": 138160 + }, + { + "epoch": 1.2214678477342245, + "grad_norm": 4.481019973754883, + "learning_rate": 2.9642202537762927e-05, + "loss": 0.6119, + "step": 138170 + }, + { + "epoch": 1.2215562509945368, + "grad_norm": 4.571042537689209, + "learning_rate": 2.9640729150091055e-05, + "loss": 0.5926, + "step": 138180 + }, + { + "epoch": 1.221644654254849, + "grad_norm": 1.2971141338348389, + "learning_rate": 2.9639255762419187e-05, + "loss": 0.6591, + "step": 138190 + }, + { + "epoch": 1.221733057515161, + "grad_norm": 9.165392875671387, + "learning_rate": 2.9637782374747315e-05, + "loss": 0.5971, + "step": 138200 + }, + { + "epoch": 1.2218214607754734, + "grad_norm": 1.4384552240371704, + "learning_rate": 2.9636308987075444e-05, + "loss": 0.5661, + "step": 138210 + }, + { + "epoch": 1.2219098640357857, + "grad_norm": 4.815966606140137, + "learning_rate": 2.9634835599403575e-05, + "loss": 0.6542, + "step": 138220 + }, + { + "epoch": 1.2219982672960978, + "grad_norm": 2.283719539642334, + "learning_rate": 2.9633362211731704e-05, + "loss": 0.5627, + "step": 138230 + }, + { + "epoch": 1.2220866705564102, + "grad_norm": 3.065930128097534, + "learning_rate": 2.9631888824059832e-05, + "loss": 0.7348, + "step": 138240 + }, + { + "epoch": 1.2221750738167223, + "grad_norm": 0.9909847378730774, + "learning_rate": 2.963041543638796e-05, + "loss": 0.6386, + "step": 138250 + }, + { + "epoch": 1.2222634770770346, + "grad_norm": 3.1771318912506104, + "learning_rate": 2.9628942048716092e-05, + "loss": 0.6293, + "step": 138260 + }, + { + "epoch": 1.2223518803373468, + "grad_norm": 1.604655146598816, + "learning_rate": 2.962746866104422e-05, + "loss": 0.5759, + "step": 138270 + }, + { + "epoch": 1.222440283597659, + "grad_norm": 13.580036163330078, + "learning_rate": 2.962599527337235e-05, + "loss": 0.4535, + "step": 138280 + }, + { + "epoch": 1.2225286868579712, + "grad_norm": 3.7436904907226562, + "learning_rate": 2.962452188570048e-05, + "loss": 0.5533, + "step": 138290 + }, + { + "epoch": 1.2226170901182836, + "grad_norm": 5.706722259521484, + "learning_rate": 2.962304849802861e-05, + "loss": 0.5865, + "step": 138300 + }, + { + "epoch": 1.2227054933785957, + "grad_norm": 1.2666332721710205, + "learning_rate": 2.9621575110356737e-05, + "loss": 0.6452, + "step": 138310 + }, + { + "epoch": 1.222793896638908, + "grad_norm": 4.669419288635254, + "learning_rate": 2.962010172268487e-05, + "loss": 0.6412, + "step": 138320 + }, + { + "epoch": 1.2228822998992204, + "grad_norm": 3.056744337081909, + "learning_rate": 2.9618628335012997e-05, + "loss": 0.5803, + "step": 138330 + }, + { + "epoch": 1.2229707031595325, + "grad_norm": 10.539825439453125, + "learning_rate": 2.9617154947341126e-05, + "loss": 0.6298, + "step": 138340 + }, + { + "epoch": 1.2230591064198448, + "grad_norm": 22.080547332763672, + "learning_rate": 2.9615681559669257e-05, + "loss": 0.5054, + "step": 138350 + }, + { + "epoch": 1.223147509680157, + "grad_norm": 1.4531960487365723, + "learning_rate": 2.9614208171997386e-05, + "loss": 0.5574, + "step": 138360 + }, + { + "epoch": 1.2232359129404693, + "grad_norm": 2.239962339401245, + "learning_rate": 2.9612734784325514e-05, + "loss": 0.5634, + "step": 138370 + }, + { + "epoch": 1.2233243162007814, + "grad_norm": 2.4439761638641357, + "learning_rate": 2.9611261396653646e-05, + "loss": 0.5967, + "step": 138380 + }, + { + "epoch": 1.2234127194610938, + "grad_norm": 10.456849098205566, + "learning_rate": 2.960978800898177e-05, + "loss": 0.8345, + "step": 138390 + }, + { + "epoch": 1.2235011227214059, + "grad_norm": 1.9543341398239136, + "learning_rate": 2.9608314621309903e-05, + "loss": 0.5144, + "step": 138400 + }, + { + "epoch": 1.2235895259817182, + "grad_norm": 1.7761907577514648, + "learning_rate": 2.9606841233638034e-05, + "loss": 0.5498, + "step": 138410 + }, + { + "epoch": 1.2236779292420303, + "grad_norm": 4.272205829620361, + "learning_rate": 2.960536784596616e-05, + "loss": 0.6445, + "step": 138420 + }, + { + "epoch": 1.2237663325023427, + "grad_norm": 4.287578582763672, + "learning_rate": 2.960389445829429e-05, + "loss": 0.7997, + "step": 138430 + }, + { + "epoch": 1.223854735762655, + "grad_norm": 2.6554386615753174, + "learning_rate": 2.9602421070622423e-05, + "loss": 0.5588, + "step": 138440 + }, + { + "epoch": 1.2239431390229671, + "grad_norm": 2.591968536376953, + "learning_rate": 2.9600947682950548e-05, + "loss": 0.6851, + "step": 138450 + }, + { + "epoch": 1.2240315422832795, + "grad_norm": 13.519428253173828, + "learning_rate": 2.959947429527868e-05, + "loss": 0.6773, + "step": 138460 + }, + { + "epoch": 1.2241199455435916, + "grad_norm": 5.097795486450195, + "learning_rate": 2.959800090760681e-05, + "loss": 0.573, + "step": 138470 + }, + { + "epoch": 1.224208348803904, + "grad_norm": 6.76710844039917, + "learning_rate": 2.9596527519934936e-05, + "loss": 0.6682, + "step": 138480 + }, + { + "epoch": 1.224296752064216, + "grad_norm": 1.6406009197235107, + "learning_rate": 2.9595054132263068e-05, + "loss": 0.5799, + "step": 138490 + }, + { + "epoch": 1.2243851553245284, + "grad_norm": 1.3960139751434326, + "learning_rate": 2.9593580744591193e-05, + "loss": 0.5657, + "step": 138500 + }, + { + "epoch": 1.2244735585848405, + "grad_norm": 2.7008554935455322, + "learning_rate": 2.9592107356919325e-05, + "loss": 0.7137, + "step": 138510 + }, + { + "epoch": 1.2245619618451529, + "grad_norm": 6.212095737457275, + "learning_rate": 2.9590633969247456e-05, + "loss": 0.7117, + "step": 138520 + }, + { + "epoch": 1.224650365105465, + "grad_norm": 1.4511773586273193, + "learning_rate": 2.958916058157558e-05, + "loss": 0.584, + "step": 138530 + }, + { + "epoch": 1.2247387683657773, + "grad_norm": 8.34814739227295, + "learning_rate": 2.9587687193903713e-05, + "loss": 0.6637, + "step": 138540 + }, + { + "epoch": 1.2248271716260897, + "grad_norm": 0.6713892817497253, + "learning_rate": 2.9586213806231845e-05, + "loss": 0.5589, + "step": 138550 + }, + { + "epoch": 1.2249155748864018, + "grad_norm": 14.103983879089355, + "learning_rate": 2.958474041855997e-05, + "loss": 0.5878, + "step": 138560 + }, + { + "epoch": 1.2250039781467141, + "grad_norm": 3.680828809738159, + "learning_rate": 2.95832670308881e-05, + "loss": 0.6141, + "step": 138570 + }, + { + "epoch": 1.2250923814070263, + "grad_norm": 10.030557632446289, + "learning_rate": 2.9581793643216233e-05, + "loss": 0.6395, + "step": 138580 + }, + { + "epoch": 1.2251807846673386, + "grad_norm": 2.402918815612793, + "learning_rate": 2.9580320255544358e-05, + "loss": 0.6361, + "step": 138590 + }, + { + "epoch": 1.2252691879276507, + "grad_norm": 3.20762038230896, + "learning_rate": 2.957884686787249e-05, + "loss": 0.6201, + "step": 138600 + }, + { + "epoch": 1.225357591187963, + "grad_norm": 5.7481584548950195, + "learning_rate": 2.9577373480200615e-05, + "loss": 0.559, + "step": 138610 + }, + { + "epoch": 1.2254459944482752, + "grad_norm": 2.062344789505005, + "learning_rate": 2.9575900092528747e-05, + "loss": 0.6437, + "step": 138620 + }, + { + "epoch": 1.2255343977085875, + "grad_norm": 6.626830577850342, + "learning_rate": 2.957442670485688e-05, + "loss": 0.6104, + "step": 138630 + }, + { + "epoch": 1.2256228009688996, + "grad_norm": 2.9041008949279785, + "learning_rate": 2.9572953317185003e-05, + "loss": 0.6814, + "step": 138640 + }, + { + "epoch": 1.225711204229212, + "grad_norm": 2.7060437202453613, + "learning_rate": 2.9571479929513135e-05, + "loss": 0.7021, + "step": 138650 + }, + { + "epoch": 1.2257996074895243, + "grad_norm": 1.874562382698059, + "learning_rate": 2.9570006541841267e-05, + "loss": 0.4089, + "step": 138660 + }, + { + "epoch": 1.2258880107498364, + "grad_norm": 4.5058698654174805, + "learning_rate": 2.9568533154169392e-05, + "loss": 0.7192, + "step": 138670 + }, + { + "epoch": 1.2259764140101488, + "grad_norm": 2.774868965148926, + "learning_rate": 2.9567059766497524e-05, + "loss": 0.6546, + "step": 138680 + }, + { + "epoch": 1.226064817270461, + "grad_norm": 2.191645622253418, + "learning_rate": 2.9565586378825655e-05, + "loss": 0.6557, + "step": 138690 + }, + { + "epoch": 1.2261532205307732, + "grad_norm": 9.972789764404297, + "learning_rate": 2.956411299115378e-05, + "loss": 0.5647, + "step": 138700 + }, + { + "epoch": 1.2262416237910854, + "grad_norm": 5.8902692794799805, + "learning_rate": 2.9562639603481912e-05, + "loss": 0.7517, + "step": 138710 + }, + { + "epoch": 1.2263300270513977, + "grad_norm": 2.5864126682281494, + "learning_rate": 2.9561166215810037e-05, + "loss": 0.6825, + "step": 138720 + }, + { + "epoch": 1.2264184303117098, + "grad_norm": 7.310678005218506, + "learning_rate": 2.955969282813817e-05, + "loss": 0.5666, + "step": 138730 + }, + { + "epoch": 1.2265068335720222, + "grad_norm": 3.093445301055908, + "learning_rate": 2.95582194404663e-05, + "loss": 0.6412, + "step": 138740 + }, + { + "epoch": 1.2265952368323343, + "grad_norm": 1.314583420753479, + "learning_rate": 2.9556746052794425e-05, + "loss": 0.7598, + "step": 138750 + }, + { + "epoch": 1.2266836400926466, + "grad_norm": 2.556443691253662, + "learning_rate": 2.9555272665122557e-05, + "loss": 0.6774, + "step": 138760 + }, + { + "epoch": 1.226772043352959, + "grad_norm": 2.419553518295288, + "learning_rate": 2.955379927745069e-05, + "loss": 0.6102, + "step": 138770 + }, + { + "epoch": 1.226860446613271, + "grad_norm": 3.532956838607788, + "learning_rate": 2.9552325889778814e-05, + "loss": 0.5526, + "step": 138780 + }, + { + "epoch": 1.2269488498735832, + "grad_norm": 1.0614246129989624, + "learning_rate": 2.9550852502106946e-05, + "loss": 0.5958, + "step": 138790 + }, + { + "epoch": 1.2270372531338956, + "grad_norm": 1.4571219682693481, + "learning_rate": 2.9549379114435077e-05, + "loss": 0.6172, + "step": 138800 + }, + { + "epoch": 1.227125656394208, + "grad_norm": 3.0234410762786865, + "learning_rate": 2.9547905726763202e-05, + "loss": 0.7691, + "step": 138810 + }, + { + "epoch": 1.22721405965452, + "grad_norm": 1.7629417181015015, + "learning_rate": 2.9546432339091334e-05, + "loss": 0.5493, + "step": 138820 + }, + { + "epoch": 1.2273024629148324, + "grad_norm": 4.805184364318848, + "learning_rate": 2.9544958951419466e-05, + "loss": 0.5679, + "step": 138830 + }, + { + "epoch": 1.2273908661751445, + "grad_norm": 5.6291728019714355, + "learning_rate": 2.954348556374759e-05, + "loss": 0.6767, + "step": 138840 + }, + { + "epoch": 1.2274792694354568, + "grad_norm": 3.7806589603424072, + "learning_rate": 2.9542012176075723e-05, + "loss": 0.6058, + "step": 138850 + }, + { + "epoch": 1.227567672695769, + "grad_norm": 2.1739916801452637, + "learning_rate": 2.9540538788403848e-05, + "loss": 0.6627, + "step": 138860 + }, + { + "epoch": 1.2276560759560813, + "grad_norm": 1.126368522644043, + "learning_rate": 2.953906540073198e-05, + "loss": 0.5701, + "step": 138870 + }, + { + "epoch": 1.2277444792163936, + "grad_norm": 3.0415472984313965, + "learning_rate": 2.953759201306011e-05, + "loss": 0.6517, + "step": 138880 + }, + { + "epoch": 1.2278328824767057, + "grad_norm": 5.938354969024658, + "learning_rate": 2.9536118625388236e-05, + "loss": 0.6688, + "step": 138890 + }, + { + "epoch": 1.2279212857370179, + "grad_norm": 6.8731207847595215, + "learning_rate": 2.9534645237716368e-05, + "loss": 0.793, + "step": 138900 + }, + { + "epoch": 1.2280096889973302, + "grad_norm": 3.185910701751709, + "learning_rate": 2.95331718500445e-05, + "loss": 0.5834, + "step": 138910 + }, + { + "epoch": 1.2280980922576425, + "grad_norm": 10.668088912963867, + "learning_rate": 2.9531698462372624e-05, + "loss": 0.5753, + "step": 138920 + }, + { + "epoch": 1.2281864955179547, + "grad_norm": 1.5388685464859009, + "learning_rate": 2.9530225074700756e-05, + "loss": 0.517, + "step": 138930 + }, + { + "epoch": 1.228274898778267, + "grad_norm": 1.0322678089141846, + "learning_rate": 2.9528751687028888e-05, + "loss": 0.5561, + "step": 138940 + }, + { + "epoch": 1.2283633020385791, + "grad_norm": 1.669554591178894, + "learning_rate": 2.9527278299357013e-05, + "loss": 0.6144, + "step": 138950 + }, + { + "epoch": 1.2284517052988915, + "grad_norm": 2.211992025375366, + "learning_rate": 2.9525804911685145e-05, + "loss": 0.648, + "step": 138960 + }, + { + "epoch": 1.2285401085592036, + "grad_norm": 5.565649509429932, + "learning_rate": 2.952433152401327e-05, + "loss": 0.7602, + "step": 138970 + }, + { + "epoch": 1.228628511819516, + "grad_norm": 3.330230712890625, + "learning_rate": 2.95228581363414e-05, + "loss": 0.7364, + "step": 138980 + }, + { + "epoch": 1.228716915079828, + "grad_norm": 1.3671505451202393, + "learning_rate": 2.9521384748669533e-05, + "loss": 0.6803, + "step": 138990 + }, + { + "epoch": 1.2288053183401404, + "grad_norm": 3.701233148574829, + "learning_rate": 2.9519911360997658e-05, + "loss": 0.6709, + "step": 139000 + }, + { + "epoch": 1.2288937216004525, + "grad_norm": 4.393486499786377, + "learning_rate": 2.951843797332579e-05, + "loss": 0.6979, + "step": 139010 + }, + { + "epoch": 1.2289821248607649, + "grad_norm": 3.8513600826263428, + "learning_rate": 2.951696458565392e-05, + "loss": 0.4942, + "step": 139020 + }, + { + "epoch": 1.2290705281210772, + "grad_norm": 2.520718574523926, + "learning_rate": 2.9515491197982046e-05, + "loss": 0.613, + "step": 139030 + }, + { + "epoch": 1.2291589313813893, + "grad_norm": 5.0588202476501465, + "learning_rate": 2.9514017810310178e-05, + "loss": 0.6457, + "step": 139040 + }, + { + "epoch": 1.2292473346417017, + "grad_norm": 3.025023937225342, + "learning_rate": 2.951254442263831e-05, + "loss": 0.6269, + "step": 139050 + }, + { + "epoch": 1.2293357379020138, + "grad_norm": 2.1494200229644775, + "learning_rate": 2.9511071034966435e-05, + "loss": 0.7161, + "step": 139060 + }, + { + "epoch": 1.2294241411623261, + "grad_norm": 1.9760066270828247, + "learning_rate": 2.9509597647294567e-05, + "loss": 0.6012, + "step": 139070 + }, + { + "epoch": 1.2295125444226382, + "grad_norm": 11.698833465576172, + "learning_rate": 2.9508124259622695e-05, + "loss": 0.617, + "step": 139080 + }, + { + "epoch": 1.2296009476829506, + "grad_norm": 1.958322286605835, + "learning_rate": 2.9506650871950823e-05, + "loss": 0.5638, + "step": 139090 + }, + { + "epoch": 1.2296893509432627, + "grad_norm": 1.2511893510818481, + "learning_rate": 2.9505177484278955e-05, + "loss": 0.5512, + "step": 139100 + }, + { + "epoch": 1.229777754203575, + "grad_norm": 2.959009885787964, + "learning_rate": 2.9503704096607083e-05, + "loss": 0.5491, + "step": 139110 + }, + { + "epoch": 1.2298661574638872, + "grad_norm": 1.961250901222229, + "learning_rate": 2.9502230708935212e-05, + "loss": 0.5892, + "step": 139120 + }, + { + "epoch": 1.2299545607241995, + "grad_norm": 1.7659155130386353, + "learning_rate": 2.9500757321263344e-05, + "loss": 0.6596, + "step": 139130 + }, + { + "epoch": 1.2300429639845119, + "grad_norm": 1.3340916633605957, + "learning_rate": 2.9499283933591472e-05, + "loss": 0.6452, + "step": 139140 + }, + { + "epoch": 1.230131367244824, + "grad_norm": 5.906961917877197, + "learning_rate": 2.94978105459196e-05, + "loss": 0.6852, + "step": 139150 + }, + { + "epoch": 1.2302197705051363, + "grad_norm": 5.5011701583862305, + "learning_rate": 2.9496337158247732e-05, + "loss": 0.5948, + "step": 139160 + }, + { + "epoch": 1.2303081737654484, + "grad_norm": 2.077199697494507, + "learning_rate": 2.949486377057586e-05, + "loss": 0.6935, + "step": 139170 + }, + { + "epoch": 1.2303965770257608, + "grad_norm": 10.900552749633789, + "learning_rate": 2.949339038290399e-05, + "loss": 0.6032, + "step": 139180 + }, + { + "epoch": 1.230484980286073, + "grad_norm": 1.7784994840621948, + "learning_rate": 2.9491916995232117e-05, + "loss": 0.6936, + "step": 139190 + }, + { + "epoch": 1.2305733835463852, + "grad_norm": 1.6221376657485962, + "learning_rate": 2.949044360756025e-05, + "loss": 0.6826, + "step": 139200 + }, + { + "epoch": 1.2306617868066974, + "grad_norm": 3.658998727798462, + "learning_rate": 2.9488970219888377e-05, + "loss": 0.7465, + "step": 139210 + }, + { + "epoch": 1.2307501900670097, + "grad_norm": 5.2513580322265625, + "learning_rate": 2.9487496832216506e-05, + "loss": 0.6939, + "step": 139220 + }, + { + "epoch": 1.2308385933273218, + "grad_norm": 2.5928003787994385, + "learning_rate": 2.9486023444544637e-05, + "loss": 0.6589, + "step": 139230 + }, + { + "epoch": 1.2309269965876342, + "grad_norm": 1.4034242630004883, + "learning_rate": 2.9484550056872766e-05, + "loss": 0.5372, + "step": 139240 + }, + { + "epoch": 1.2310153998479465, + "grad_norm": 5.431221008300781, + "learning_rate": 2.9483076669200894e-05, + "loss": 0.5149, + "step": 139250 + }, + { + "epoch": 1.2311038031082586, + "grad_norm": 2.4931249618530273, + "learning_rate": 2.9481603281529026e-05, + "loss": 0.525, + "step": 139260 + }, + { + "epoch": 1.231192206368571, + "grad_norm": 6.9441046714782715, + "learning_rate": 2.9480129893857154e-05, + "loss": 0.6273, + "step": 139270 + }, + { + "epoch": 1.231280609628883, + "grad_norm": 2.858452558517456, + "learning_rate": 2.9478656506185282e-05, + "loss": 0.5048, + "step": 139280 + }, + { + "epoch": 1.2313690128891954, + "grad_norm": 4.454078197479248, + "learning_rate": 2.9477183118513414e-05, + "loss": 0.4997, + "step": 139290 + }, + { + "epoch": 1.2314574161495075, + "grad_norm": 3.5746941566467285, + "learning_rate": 2.9475709730841543e-05, + "loss": 0.5961, + "step": 139300 + }, + { + "epoch": 1.2315458194098199, + "grad_norm": 2.514056444168091, + "learning_rate": 2.947423634316967e-05, + "loss": 0.5532, + "step": 139310 + }, + { + "epoch": 1.231634222670132, + "grad_norm": 1.0345075130462646, + "learning_rate": 2.9472762955497803e-05, + "loss": 0.4782, + "step": 139320 + }, + { + "epoch": 1.2317226259304443, + "grad_norm": 3.135854959487915, + "learning_rate": 2.9471289567825928e-05, + "loss": 0.5563, + "step": 139330 + }, + { + "epoch": 1.2318110291907565, + "grad_norm": 1.2373956441879272, + "learning_rate": 2.946981618015406e-05, + "loss": 0.5486, + "step": 139340 + }, + { + "epoch": 1.2318994324510688, + "grad_norm": 0.9658572673797607, + "learning_rate": 2.946834279248219e-05, + "loss": 0.6118, + "step": 139350 + }, + { + "epoch": 1.2319878357113812, + "grad_norm": 1.2828421592712402, + "learning_rate": 2.9466869404810316e-05, + "loss": 0.6404, + "step": 139360 + }, + { + "epoch": 1.2320762389716933, + "grad_norm": 4.5428266525268555, + "learning_rate": 2.9465396017138448e-05, + "loss": 0.5508, + "step": 139370 + }, + { + "epoch": 1.2321646422320054, + "grad_norm": 15.900464057922363, + "learning_rate": 2.946392262946658e-05, + "loss": 0.526, + "step": 139380 + }, + { + "epoch": 1.2322530454923177, + "grad_norm": 1.7019041776657104, + "learning_rate": 2.9462449241794704e-05, + "loss": 0.5456, + "step": 139390 + }, + { + "epoch": 1.23234144875263, + "grad_norm": 7.868950843811035, + "learning_rate": 2.9460975854122836e-05, + "loss": 0.5342, + "step": 139400 + }, + { + "epoch": 1.2324298520129422, + "grad_norm": 1.0215487480163574, + "learning_rate": 2.9459502466450968e-05, + "loss": 0.43, + "step": 139410 + }, + { + "epoch": 1.2325182552732545, + "grad_norm": 1.0092544555664062, + "learning_rate": 2.9458029078779093e-05, + "loss": 0.6623, + "step": 139420 + }, + { + "epoch": 1.2326066585335667, + "grad_norm": 4.069736957550049, + "learning_rate": 2.9456555691107225e-05, + "loss": 0.6316, + "step": 139430 + }, + { + "epoch": 1.232695061793879, + "grad_norm": 1.8851509094238281, + "learning_rate": 2.945508230343535e-05, + "loss": 0.4998, + "step": 139440 + }, + { + "epoch": 1.2327834650541911, + "grad_norm": 1.1257460117340088, + "learning_rate": 2.945360891576348e-05, + "loss": 0.5904, + "step": 139450 + }, + { + "epoch": 1.2328718683145035, + "grad_norm": 6.347303867340088, + "learning_rate": 2.9452135528091613e-05, + "loss": 0.7657, + "step": 139460 + }, + { + "epoch": 1.2329602715748158, + "grad_norm": 4.237667083740234, + "learning_rate": 2.9450662140419738e-05, + "loss": 0.7489, + "step": 139470 + }, + { + "epoch": 1.233048674835128, + "grad_norm": 1.6039080619812012, + "learning_rate": 2.944918875274787e-05, + "loss": 0.5705, + "step": 139480 + }, + { + "epoch": 1.23313707809544, + "grad_norm": 2.965522289276123, + "learning_rate": 2.9447715365076e-05, + "loss": 0.6045, + "step": 139490 + }, + { + "epoch": 1.2332254813557524, + "grad_norm": 1.2478375434875488, + "learning_rate": 2.9446241977404127e-05, + "loss": 0.5403, + "step": 139500 + }, + { + "epoch": 1.2333138846160647, + "grad_norm": 1.5774801969528198, + "learning_rate": 2.9444768589732258e-05, + "loss": 0.6005, + "step": 139510 + }, + { + "epoch": 1.2334022878763768, + "grad_norm": 4.610653877258301, + "learning_rate": 2.944329520206039e-05, + "loss": 0.6143, + "step": 139520 + }, + { + "epoch": 1.2334906911366892, + "grad_norm": 10.987772941589355, + "learning_rate": 2.9441821814388515e-05, + "loss": 0.8183, + "step": 139530 + }, + { + "epoch": 1.2335790943970013, + "grad_norm": 6.430619716644287, + "learning_rate": 2.9440348426716647e-05, + "loss": 0.7105, + "step": 139540 + }, + { + "epoch": 1.2336674976573136, + "grad_norm": 1.183045506477356, + "learning_rate": 2.9438875039044772e-05, + "loss": 0.5627, + "step": 139550 + }, + { + "epoch": 1.2337559009176258, + "grad_norm": 1.4134809970855713, + "learning_rate": 2.9437401651372903e-05, + "loss": 0.6304, + "step": 139560 + }, + { + "epoch": 1.2338443041779381, + "grad_norm": 4.921472549438477, + "learning_rate": 2.9435928263701035e-05, + "loss": 0.6883, + "step": 139570 + }, + { + "epoch": 1.2339327074382502, + "grad_norm": 3.9728519916534424, + "learning_rate": 2.943445487602916e-05, + "loss": 0.5863, + "step": 139580 + }, + { + "epoch": 1.2340211106985626, + "grad_norm": 2.939859628677368, + "learning_rate": 2.9432981488357292e-05, + "loss": 0.6287, + "step": 139590 + }, + { + "epoch": 1.2341095139588747, + "grad_norm": 1.3547147512435913, + "learning_rate": 2.9431508100685424e-05, + "loss": 0.6122, + "step": 139600 + }, + { + "epoch": 1.234197917219187, + "grad_norm": 2.903419256210327, + "learning_rate": 2.943003471301355e-05, + "loss": 0.6081, + "step": 139610 + }, + { + "epoch": 1.2342863204794994, + "grad_norm": 4.38590145111084, + "learning_rate": 2.942856132534168e-05, + "loss": 0.4389, + "step": 139620 + }, + { + "epoch": 1.2343747237398115, + "grad_norm": 7.4531450271606445, + "learning_rate": 2.9427087937669812e-05, + "loss": 0.5544, + "step": 139630 + }, + { + "epoch": 1.2344631270001238, + "grad_norm": 2.2751047611236572, + "learning_rate": 2.9425614549997937e-05, + "loss": 0.6129, + "step": 139640 + }, + { + "epoch": 1.234551530260436, + "grad_norm": 1.5383727550506592, + "learning_rate": 2.942414116232607e-05, + "loss": 0.5807, + "step": 139650 + }, + { + "epoch": 1.2346399335207483, + "grad_norm": 1.790018916130066, + "learning_rate": 2.9422667774654194e-05, + "loss": 0.5889, + "step": 139660 + }, + { + "epoch": 1.2347283367810604, + "grad_norm": 1.6726810932159424, + "learning_rate": 2.9421194386982326e-05, + "loss": 0.5862, + "step": 139670 + }, + { + "epoch": 1.2348167400413728, + "grad_norm": 2.0210378170013428, + "learning_rate": 2.9419720999310457e-05, + "loss": 0.6687, + "step": 139680 + }, + { + "epoch": 1.2349051433016849, + "grad_norm": 4.886565685272217, + "learning_rate": 2.9418247611638582e-05, + "loss": 0.5988, + "step": 139690 + }, + { + "epoch": 1.2349935465619972, + "grad_norm": 5.94719123840332, + "learning_rate": 2.9416774223966714e-05, + "loss": 0.6724, + "step": 139700 + }, + { + "epoch": 1.2350819498223093, + "grad_norm": 5.646327018737793, + "learning_rate": 2.9415300836294846e-05, + "loss": 0.5967, + "step": 139710 + }, + { + "epoch": 1.2351703530826217, + "grad_norm": 0.8480719327926636, + "learning_rate": 2.941382744862297e-05, + "loss": 0.6368, + "step": 139720 + }, + { + "epoch": 1.235258756342934, + "grad_norm": 7.830117702484131, + "learning_rate": 2.9412354060951102e-05, + "loss": 0.6109, + "step": 139730 + }, + { + "epoch": 1.2353471596032461, + "grad_norm": 4.0005784034729, + "learning_rate": 2.9410880673279234e-05, + "loss": 0.6374, + "step": 139740 + }, + { + "epoch": 1.2354355628635585, + "grad_norm": 4.897264003753662, + "learning_rate": 2.940940728560736e-05, + "loss": 0.4978, + "step": 139750 + }, + { + "epoch": 1.2355239661238706, + "grad_norm": 7.273678302764893, + "learning_rate": 2.940793389793549e-05, + "loss": 0.7556, + "step": 139760 + }, + { + "epoch": 1.235612369384183, + "grad_norm": 1.4680782556533813, + "learning_rate": 2.9406460510263623e-05, + "loss": 0.5497, + "step": 139770 + }, + { + "epoch": 1.235700772644495, + "grad_norm": 3.8397841453552246, + "learning_rate": 2.9404987122591748e-05, + "loss": 0.7207, + "step": 139780 + }, + { + "epoch": 1.2357891759048074, + "grad_norm": 4.973461627960205, + "learning_rate": 2.940351373491988e-05, + "loss": 0.6682, + "step": 139790 + }, + { + "epoch": 1.2358775791651195, + "grad_norm": 3.151339292526245, + "learning_rate": 2.9402040347248004e-05, + "loss": 0.6395, + "step": 139800 + }, + { + "epoch": 1.2359659824254319, + "grad_norm": 2.884575128555298, + "learning_rate": 2.9400566959576136e-05, + "loss": 0.6677, + "step": 139810 + }, + { + "epoch": 1.236054385685744, + "grad_norm": 6.017087936401367, + "learning_rate": 2.9399093571904268e-05, + "loss": 0.5366, + "step": 139820 + }, + { + "epoch": 1.2361427889460563, + "grad_norm": 4.820949077606201, + "learning_rate": 2.9397620184232393e-05, + "loss": 0.583, + "step": 139830 + }, + { + "epoch": 1.2362311922063687, + "grad_norm": 8.258111000061035, + "learning_rate": 2.9396146796560524e-05, + "loss": 0.6777, + "step": 139840 + }, + { + "epoch": 1.2363195954666808, + "grad_norm": 1.1991479396820068, + "learning_rate": 2.9394673408888656e-05, + "loss": 0.6433, + "step": 139850 + }, + { + "epoch": 1.2364079987269931, + "grad_norm": 2.9836766719818115, + "learning_rate": 2.939320002121678e-05, + "loss": 0.4588, + "step": 139860 + }, + { + "epoch": 1.2364964019873053, + "grad_norm": 3.943164587020874, + "learning_rate": 2.9391726633544913e-05, + "loss": 0.619, + "step": 139870 + }, + { + "epoch": 1.2365848052476176, + "grad_norm": 2.0488691329956055, + "learning_rate": 2.9390253245873045e-05, + "loss": 0.6212, + "step": 139880 + }, + { + "epoch": 1.2366732085079297, + "grad_norm": 2.4676930904388428, + "learning_rate": 2.938877985820117e-05, + "loss": 0.561, + "step": 139890 + }, + { + "epoch": 1.236761611768242, + "grad_norm": 1.7249491214752197, + "learning_rate": 2.93873064705293e-05, + "loss": 0.5581, + "step": 139900 + }, + { + "epoch": 1.2368500150285542, + "grad_norm": 1.536492109298706, + "learning_rate": 2.9385833082857426e-05, + "loss": 0.66, + "step": 139910 + }, + { + "epoch": 1.2369384182888665, + "grad_norm": 3.7124695777893066, + "learning_rate": 2.9384359695185558e-05, + "loss": 0.6974, + "step": 139920 + }, + { + "epoch": 1.2370268215491786, + "grad_norm": 2.430027484893799, + "learning_rate": 2.938288630751369e-05, + "loss": 0.6646, + "step": 139930 + }, + { + "epoch": 1.237115224809491, + "grad_norm": 5.068747043609619, + "learning_rate": 2.9381412919841815e-05, + "loss": 0.5946, + "step": 139940 + }, + { + "epoch": 1.2372036280698033, + "grad_norm": 5.979935169219971, + "learning_rate": 2.9379939532169947e-05, + "loss": 0.6908, + "step": 139950 + }, + { + "epoch": 1.2372920313301154, + "grad_norm": 6.684631824493408, + "learning_rate": 2.9378466144498078e-05, + "loss": 0.5682, + "step": 139960 + }, + { + "epoch": 1.2373804345904278, + "grad_norm": 3.413923740386963, + "learning_rate": 2.9376992756826203e-05, + "loss": 0.6468, + "step": 139970 + }, + { + "epoch": 1.23746883785074, + "grad_norm": 1.644879698753357, + "learning_rate": 2.9375519369154335e-05, + "loss": 0.6515, + "step": 139980 + }, + { + "epoch": 1.2375572411110523, + "grad_norm": 4.588129997253418, + "learning_rate": 2.9374045981482467e-05, + "loss": 0.563, + "step": 139990 + }, + { + "epoch": 1.2376456443713644, + "grad_norm": 3.4406423568725586, + "learning_rate": 2.937257259381059e-05, + "loss": 0.5147, + "step": 140000 + }, + { + "epoch": 1.2377340476316767, + "grad_norm": 5.868357181549072, + "learning_rate": 2.9371099206138723e-05, + "loss": 0.6511, + "step": 140010 + }, + { + "epoch": 1.2378224508919888, + "grad_norm": 1.119891881942749, + "learning_rate": 2.9369625818466852e-05, + "loss": 0.6221, + "step": 140020 + }, + { + "epoch": 1.2379108541523012, + "grad_norm": 4.095285415649414, + "learning_rate": 2.936815243079498e-05, + "loss": 0.5063, + "step": 140030 + }, + { + "epoch": 1.2379992574126133, + "grad_norm": 2.5871200561523438, + "learning_rate": 2.9366679043123112e-05, + "loss": 0.7259, + "step": 140040 + }, + { + "epoch": 1.2380876606729256, + "grad_norm": 1.1331309080123901, + "learning_rate": 2.936520565545124e-05, + "loss": 0.5898, + "step": 140050 + }, + { + "epoch": 1.238176063933238, + "grad_norm": 4.594869136810303, + "learning_rate": 2.936373226777937e-05, + "loss": 0.6123, + "step": 140060 + }, + { + "epoch": 1.23826446719355, + "grad_norm": 1.2060575485229492, + "learning_rate": 2.93622588801075e-05, + "loss": 0.5587, + "step": 140070 + }, + { + "epoch": 1.2383528704538622, + "grad_norm": 1.2319847345352173, + "learning_rate": 2.936078549243563e-05, + "loss": 0.6498, + "step": 140080 + }, + { + "epoch": 1.2384412737141746, + "grad_norm": 11.477744102478027, + "learning_rate": 2.9359312104763757e-05, + "loss": 0.5192, + "step": 140090 + }, + { + "epoch": 1.238529676974487, + "grad_norm": 7.122554779052734, + "learning_rate": 2.935783871709189e-05, + "loss": 0.6864, + "step": 140100 + }, + { + "epoch": 1.238618080234799, + "grad_norm": 4.3073835372924805, + "learning_rate": 2.9356365329420017e-05, + "loss": 0.5233, + "step": 140110 + }, + { + "epoch": 1.2387064834951114, + "grad_norm": 5.499136447906494, + "learning_rate": 2.9354891941748145e-05, + "loss": 0.5919, + "step": 140120 + }, + { + "epoch": 1.2387948867554235, + "grad_norm": 5.400546073913574, + "learning_rate": 2.9353418554076277e-05, + "loss": 0.56, + "step": 140130 + }, + { + "epoch": 1.2388832900157358, + "grad_norm": 1.5307656526565552, + "learning_rate": 2.9351945166404406e-05, + "loss": 0.6586, + "step": 140140 + }, + { + "epoch": 1.238971693276048, + "grad_norm": 3.108717918395996, + "learning_rate": 2.9350471778732534e-05, + "loss": 0.6399, + "step": 140150 + }, + { + "epoch": 1.2390600965363603, + "grad_norm": 2.895707130432129, + "learning_rate": 2.9348998391060662e-05, + "loss": 0.7124, + "step": 140160 + }, + { + "epoch": 1.2391484997966724, + "grad_norm": 1.0909459590911865, + "learning_rate": 2.9347525003388794e-05, + "loss": 0.6455, + "step": 140170 + }, + { + "epoch": 1.2392369030569848, + "grad_norm": 4.810318946838379, + "learning_rate": 2.9346051615716922e-05, + "loss": 0.6425, + "step": 140180 + }, + { + "epoch": 1.2393253063172969, + "grad_norm": 5.3630266189575195, + "learning_rate": 2.934457822804505e-05, + "loss": 0.6832, + "step": 140190 + }, + { + "epoch": 1.2394137095776092, + "grad_norm": 7.8332343101501465, + "learning_rate": 2.9343104840373182e-05, + "loss": 0.6905, + "step": 140200 + }, + { + "epoch": 1.2395021128379216, + "grad_norm": 2.4580328464508057, + "learning_rate": 2.934163145270131e-05, + "loss": 0.6283, + "step": 140210 + }, + { + "epoch": 1.2395905160982337, + "grad_norm": 3.7288308143615723, + "learning_rate": 2.934015806502944e-05, + "loss": 0.659, + "step": 140220 + }, + { + "epoch": 1.239678919358546, + "grad_norm": 5.179182052612305, + "learning_rate": 2.933868467735757e-05, + "loss": 0.5555, + "step": 140230 + }, + { + "epoch": 1.2397673226188581, + "grad_norm": 2.32548189163208, + "learning_rate": 2.9337211289685703e-05, + "loss": 0.6479, + "step": 140240 + }, + { + "epoch": 1.2398557258791705, + "grad_norm": 1.174964189529419, + "learning_rate": 2.9335737902013828e-05, + "loss": 0.6909, + "step": 140250 + }, + { + "epoch": 1.2399441291394826, + "grad_norm": 1.8876432180404663, + "learning_rate": 2.933426451434196e-05, + "loss": 0.6437, + "step": 140260 + }, + { + "epoch": 1.240032532399795, + "grad_norm": 9.983796119689941, + "learning_rate": 2.9332791126670084e-05, + "loss": 0.7188, + "step": 140270 + }, + { + "epoch": 1.240120935660107, + "grad_norm": 1.982649326324463, + "learning_rate": 2.9331317738998216e-05, + "loss": 0.5284, + "step": 140280 + }, + { + "epoch": 1.2402093389204194, + "grad_norm": 7.236152172088623, + "learning_rate": 2.9329844351326348e-05, + "loss": 0.6579, + "step": 140290 + }, + { + "epoch": 1.2402977421807315, + "grad_norm": 2.171541452407837, + "learning_rate": 2.9328370963654473e-05, + "loss": 0.6806, + "step": 140300 + }, + { + "epoch": 1.2403861454410439, + "grad_norm": 2.270296573638916, + "learning_rate": 2.9326897575982605e-05, + "loss": 0.6705, + "step": 140310 + }, + { + "epoch": 1.2404745487013562, + "grad_norm": 1.9734338521957397, + "learning_rate": 2.9325424188310736e-05, + "loss": 0.6387, + "step": 140320 + }, + { + "epoch": 1.2405629519616683, + "grad_norm": 4.599209785461426, + "learning_rate": 2.932395080063886e-05, + "loss": 0.5914, + "step": 140330 + }, + { + "epoch": 1.2406513552219807, + "grad_norm": 1.9192358255386353, + "learning_rate": 2.9322477412966993e-05, + "loss": 0.5906, + "step": 140340 + }, + { + "epoch": 1.2407397584822928, + "grad_norm": 4.300664901733398, + "learning_rate": 2.9321004025295125e-05, + "loss": 0.6245, + "step": 140350 + }, + { + "epoch": 1.2408281617426051, + "grad_norm": 5.689311504364014, + "learning_rate": 2.931953063762325e-05, + "loss": 0.5603, + "step": 140360 + }, + { + "epoch": 1.2409165650029172, + "grad_norm": 3.6298885345458984, + "learning_rate": 2.931805724995138e-05, + "loss": 0.6237, + "step": 140370 + }, + { + "epoch": 1.2410049682632296, + "grad_norm": 11.114352226257324, + "learning_rate": 2.9316583862279506e-05, + "loss": 0.6418, + "step": 140380 + }, + { + "epoch": 1.2410933715235417, + "grad_norm": 3.2634389400482178, + "learning_rate": 2.9315110474607638e-05, + "loss": 0.5905, + "step": 140390 + }, + { + "epoch": 1.241181774783854, + "grad_norm": 3.1169321537017822, + "learning_rate": 2.931363708693577e-05, + "loss": 0.6444, + "step": 140400 + }, + { + "epoch": 1.2412701780441662, + "grad_norm": 3.5170397758483887, + "learning_rate": 2.9312163699263895e-05, + "loss": 0.7171, + "step": 140410 + }, + { + "epoch": 1.2413585813044785, + "grad_norm": 5.625921726226807, + "learning_rate": 2.9310690311592027e-05, + "loss": 0.7638, + "step": 140420 + }, + { + "epoch": 1.2414469845647909, + "grad_norm": 17.05948829650879, + "learning_rate": 2.930921692392016e-05, + "loss": 0.6317, + "step": 140430 + }, + { + "epoch": 1.241535387825103, + "grad_norm": 2.29343843460083, + "learning_rate": 2.9307743536248283e-05, + "loss": 0.5793, + "step": 140440 + }, + { + "epoch": 1.2416237910854153, + "grad_norm": 1.8508127927780151, + "learning_rate": 2.9306270148576415e-05, + "loss": 0.7046, + "step": 140450 + }, + { + "epoch": 1.2417121943457274, + "grad_norm": 3.533745288848877, + "learning_rate": 2.9304796760904547e-05, + "loss": 0.6784, + "step": 140460 + }, + { + "epoch": 1.2418005976060398, + "grad_norm": 1.58856201171875, + "learning_rate": 2.9303323373232672e-05, + "loss": 0.6206, + "step": 140470 + }, + { + "epoch": 1.241889000866352, + "grad_norm": 3.449031352996826, + "learning_rate": 2.9301849985560803e-05, + "loss": 0.6174, + "step": 140480 + }, + { + "epoch": 1.2419774041266642, + "grad_norm": 1.543929100036621, + "learning_rate": 2.930037659788893e-05, + "loss": 0.6951, + "step": 140490 + }, + { + "epoch": 1.2420658073869764, + "grad_norm": 9.21243953704834, + "learning_rate": 2.929890321021706e-05, + "loss": 0.6359, + "step": 140500 + }, + { + "epoch": 1.2421542106472887, + "grad_norm": 9.06112003326416, + "learning_rate": 2.9297429822545192e-05, + "loss": 0.7242, + "step": 140510 + }, + { + "epoch": 1.2422426139076008, + "grad_norm": 3.261469841003418, + "learning_rate": 2.9295956434873317e-05, + "loss": 0.6628, + "step": 140520 + }, + { + "epoch": 1.2423310171679132, + "grad_norm": 1.5714235305786133, + "learning_rate": 2.929448304720145e-05, + "loss": 0.596, + "step": 140530 + }, + { + "epoch": 1.2424194204282255, + "grad_norm": 1.2306922674179077, + "learning_rate": 2.929300965952958e-05, + "loss": 0.5427, + "step": 140540 + }, + { + "epoch": 1.2425078236885376, + "grad_norm": 4.4726762771606445, + "learning_rate": 2.9291536271857705e-05, + "loss": 0.6596, + "step": 140550 + }, + { + "epoch": 1.24259622694885, + "grad_norm": 2.372318744659424, + "learning_rate": 2.9290062884185837e-05, + "loss": 0.5286, + "step": 140560 + }, + { + "epoch": 1.242684630209162, + "grad_norm": 1.2543840408325195, + "learning_rate": 2.928858949651397e-05, + "loss": 0.5122, + "step": 140570 + }, + { + "epoch": 1.2427730334694744, + "grad_norm": 4.93451452255249, + "learning_rate": 2.9287116108842094e-05, + "loss": 0.5446, + "step": 140580 + }, + { + "epoch": 1.2428614367297866, + "grad_norm": 1.7844778299331665, + "learning_rate": 2.9285642721170226e-05, + "loss": 0.6212, + "step": 140590 + }, + { + "epoch": 1.242949839990099, + "grad_norm": 9.284774780273438, + "learning_rate": 2.9284169333498357e-05, + "loss": 0.7217, + "step": 140600 + }, + { + "epoch": 1.243038243250411, + "grad_norm": 4.928479194641113, + "learning_rate": 2.9282695945826482e-05, + "loss": 0.5224, + "step": 140610 + }, + { + "epoch": 1.2431266465107234, + "grad_norm": 14.97335147857666, + "learning_rate": 2.9281222558154614e-05, + "loss": 0.5793, + "step": 140620 + }, + { + "epoch": 1.2432150497710355, + "grad_norm": 1.0068814754486084, + "learning_rate": 2.927974917048274e-05, + "loss": 0.5943, + "step": 140630 + }, + { + "epoch": 1.2433034530313478, + "grad_norm": 2.9845073223114014, + "learning_rate": 2.927827578281087e-05, + "loss": 0.5788, + "step": 140640 + }, + { + "epoch": 1.2433918562916602, + "grad_norm": 2.9338865280151367, + "learning_rate": 2.9276802395139002e-05, + "loss": 0.6163, + "step": 140650 + }, + { + "epoch": 1.2434802595519723, + "grad_norm": 9.645593643188477, + "learning_rate": 2.9275329007467127e-05, + "loss": 0.6994, + "step": 140660 + }, + { + "epoch": 1.2435686628122844, + "grad_norm": 2.7298977375030518, + "learning_rate": 2.927385561979526e-05, + "loss": 0.6851, + "step": 140670 + }, + { + "epoch": 1.2436570660725967, + "grad_norm": 3.164957046508789, + "learning_rate": 2.927238223212339e-05, + "loss": 0.5353, + "step": 140680 + }, + { + "epoch": 1.243745469332909, + "grad_norm": 3.9340643882751465, + "learning_rate": 2.9270908844451516e-05, + "loss": 0.5433, + "step": 140690 + }, + { + "epoch": 1.2438338725932212, + "grad_norm": 5.256089210510254, + "learning_rate": 2.9269435456779648e-05, + "loss": 0.5532, + "step": 140700 + }, + { + "epoch": 1.2439222758535335, + "grad_norm": 2.4819583892822266, + "learning_rate": 2.926796206910778e-05, + "loss": 0.5825, + "step": 140710 + }, + { + "epoch": 1.2440106791138457, + "grad_norm": 1.4068666696548462, + "learning_rate": 2.9266488681435904e-05, + "loss": 0.5838, + "step": 140720 + }, + { + "epoch": 1.244099082374158, + "grad_norm": 7.233332633972168, + "learning_rate": 2.9265015293764036e-05, + "loss": 0.6525, + "step": 140730 + }, + { + "epoch": 1.2441874856344701, + "grad_norm": 5.608917713165283, + "learning_rate": 2.926354190609216e-05, + "loss": 0.6391, + "step": 140740 + }, + { + "epoch": 1.2442758888947825, + "grad_norm": 6.583284378051758, + "learning_rate": 2.9262068518420293e-05, + "loss": 0.8408, + "step": 140750 + }, + { + "epoch": 1.2443642921550946, + "grad_norm": 1.630718469619751, + "learning_rate": 2.9260595130748425e-05, + "loss": 0.5601, + "step": 140760 + }, + { + "epoch": 1.244452695415407, + "grad_norm": 5.152973175048828, + "learning_rate": 2.925912174307655e-05, + "loss": 0.6414, + "step": 140770 + }, + { + "epoch": 1.244541098675719, + "grad_norm": 2.3453240394592285, + "learning_rate": 2.925764835540468e-05, + "loss": 0.6453, + "step": 140780 + }, + { + "epoch": 1.2446295019360314, + "grad_norm": 1.7918018102645874, + "learning_rate": 2.9256174967732813e-05, + "loss": 0.5182, + "step": 140790 + }, + { + "epoch": 1.2447179051963437, + "grad_norm": 1.9017199277877808, + "learning_rate": 2.9254701580060938e-05, + "loss": 0.5866, + "step": 140800 + }, + { + "epoch": 1.2448063084566559, + "grad_norm": 2.5929338932037354, + "learning_rate": 2.925322819238907e-05, + "loss": 0.6306, + "step": 140810 + }, + { + "epoch": 1.2448947117169682, + "grad_norm": 1.0340843200683594, + "learning_rate": 2.92517548047172e-05, + "loss": 0.6139, + "step": 140820 + }, + { + "epoch": 1.2449831149772803, + "grad_norm": 1.3462214469909668, + "learning_rate": 2.9250281417045326e-05, + "loss": 0.5743, + "step": 140830 + }, + { + "epoch": 1.2450715182375927, + "grad_norm": 2.510620355606079, + "learning_rate": 2.9248808029373458e-05, + "loss": 0.5134, + "step": 140840 + }, + { + "epoch": 1.2451599214979048, + "grad_norm": 5.207735061645508, + "learning_rate": 2.9247334641701583e-05, + "loss": 0.6167, + "step": 140850 + }, + { + "epoch": 1.2452483247582171, + "grad_norm": 4.856535911560059, + "learning_rate": 2.9245861254029715e-05, + "loss": 0.5485, + "step": 140860 + }, + { + "epoch": 1.2453367280185292, + "grad_norm": 6.803157806396484, + "learning_rate": 2.9244387866357847e-05, + "loss": 0.5408, + "step": 140870 + }, + { + "epoch": 1.2454251312788416, + "grad_norm": 3.1833581924438477, + "learning_rate": 2.924291447868597e-05, + "loss": 0.4767, + "step": 140880 + }, + { + "epoch": 1.2455135345391537, + "grad_norm": 2.0532584190368652, + "learning_rate": 2.9241441091014103e-05, + "loss": 0.6454, + "step": 140890 + }, + { + "epoch": 1.245601937799466, + "grad_norm": 2.4384188652038574, + "learning_rate": 2.9239967703342235e-05, + "loss": 0.6185, + "step": 140900 + }, + { + "epoch": 1.2456903410597784, + "grad_norm": 2.536409854888916, + "learning_rate": 2.923849431567036e-05, + "loss": 0.6473, + "step": 140910 + }, + { + "epoch": 1.2457787443200905, + "grad_norm": 1.9948673248291016, + "learning_rate": 2.9237020927998492e-05, + "loss": 0.6627, + "step": 140920 + }, + { + "epoch": 1.2458671475804028, + "grad_norm": 6.776634216308594, + "learning_rate": 2.9235547540326623e-05, + "loss": 0.7353, + "step": 140930 + }, + { + "epoch": 1.245955550840715, + "grad_norm": 2.3835225105285645, + "learning_rate": 2.9234074152654752e-05, + "loss": 0.6793, + "step": 140940 + }, + { + "epoch": 1.2460439541010273, + "grad_norm": 1.0225166082382202, + "learning_rate": 2.923260076498288e-05, + "loss": 0.4932, + "step": 140950 + }, + { + "epoch": 1.2461323573613394, + "grad_norm": 3.9867641925811768, + "learning_rate": 2.923112737731101e-05, + "loss": 0.6659, + "step": 140960 + }, + { + "epoch": 1.2462207606216518, + "grad_norm": 1.974141240119934, + "learning_rate": 2.922965398963914e-05, + "loss": 0.7315, + "step": 140970 + }, + { + "epoch": 1.2463091638819639, + "grad_norm": 1.6168317794799805, + "learning_rate": 2.922818060196727e-05, + "loss": 0.5722, + "step": 140980 + }, + { + "epoch": 1.2463975671422762, + "grad_norm": 4.553557872772217, + "learning_rate": 2.9226707214295397e-05, + "loss": 0.5784, + "step": 140990 + }, + { + "epoch": 1.2464859704025884, + "grad_norm": 1.275503158569336, + "learning_rate": 2.922523382662353e-05, + "loss": 0.4953, + "step": 141000 + }, + { + "epoch": 1.2465743736629007, + "grad_norm": 1.0491254329681396, + "learning_rate": 2.9223760438951657e-05, + "loss": 0.5979, + "step": 141010 + }, + { + "epoch": 1.246662776923213, + "grad_norm": 1.8186928033828735, + "learning_rate": 2.9222287051279785e-05, + "loss": 0.6265, + "step": 141020 + }, + { + "epoch": 1.2467511801835252, + "grad_norm": 2.4338583946228027, + "learning_rate": 2.9220813663607917e-05, + "loss": 0.6928, + "step": 141030 + }, + { + "epoch": 1.2468395834438375, + "grad_norm": 4.873175144195557, + "learning_rate": 2.9219340275936046e-05, + "loss": 0.6879, + "step": 141040 + }, + { + "epoch": 1.2469279867041496, + "grad_norm": 5.91003942489624, + "learning_rate": 2.9217866888264174e-05, + "loss": 0.6794, + "step": 141050 + }, + { + "epoch": 1.247016389964462, + "grad_norm": 1.8183811902999878, + "learning_rate": 2.9216393500592306e-05, + "loss": 0.5291, + "step": 141060 + }, + { + "epoch": 1.247104793224774, + "grad_norm": 4.114963531494141, + "learning_rate": 2.9214920112920434e-05, + "loss": 0.6011, + "step": 141070 + }, + { + "epoch": 1.2471931964850864, + "grad_norm": 3.7473528385162354, + "learning_rate": 2.9213446725248562e-05, + "loss": 0.6559, + "step": 141080 + }, + { + "epoch": 1.2472815997453985, + "grad_norm": 7.758160591125488, + "learning_rate": 2.9211973337576694e-05, + "loss": 0.6818, + "step": 141090 + }, + { + "epoch": 1.2473700030057109, + "grad_norm": 2.686952829360962, + "learning_rate": 2.921049994990482e-05, + "loss": 0.5258, + "step": 141100 + }, + { + "epoch": 1.247458406266023, + "grad_norm": 4.461240291595459, + "learning_rate": 2.920902656223295e-05, + "loss": 0.6901, + "step": 141110 + }, + { + "epoch": 1.2475468095263353, + "grad_norm": 5.235640525817871, + "learning_rate": 2.9207553174561083e-05, + "loss": 0.5542, + "step": 141120 + }, + { + "epoch": 1.2476352127866477, + "grad_norm": 2.567991256713867, + "learning_rate": 2.9206079786889207e-05, + "loss": 0.5565, + "step": 141130 + }, + { + "epoch": 1.2477236160469598, + "grad_norm": 2.196699857711792, + "learning_rate": 2.920460639921734e-05, + "loss": 0.5685, + "step": 141140 + }, + { + "epoch": 1.2478120193072721, + "grad_norm": 1.8159294128417969, + "learning_rate": 2.920313301154547e-05, + "loss": 0.7082, + "step": 141150 + }, + { + "epoch": 1.2479004225675843, + "grad_norm": 2.4379069805145264, + "learning_rate": 2.9201659623873596e-05, + "loss": 0.6524, + "step": 141160 + }, + { + "epoch": 1.2479888258278966, + "grad_norm": 6.3143181800842285, + "learning_rate": 2.9200186236201728e-05, + "loss": 0.581, + "step": 141170 + }, + { + "epoch": 1.2480772290882087, + "grad_norm": 1.6999142169952393, + "learning_rate": 2.919871284852986e-05, + "loss": 0.576, + "step": 141180 + }, + { + "epoch": 1.248165632348521, + "grad_norm": 4.524407863616943, + "learning_rate": 2.9197239460857984e-05, + "loss": 0.5335, + "step": 141190 + }, + { + "epoch": 1.2482540356088332, + "grad_norm": 1.083709716796875, + "learning_rate": 2.9195766073186116e-05, + "loss": 0.7418, + "step": 141200 + }, + { + "epoch": 1.2483424388691455, + "grad_norm": 11.910554885864258, + "learning_rate": 2.919429268551424e-05, + "loss": 0.5651, + "step": 141210 + }, + { + "epoch": 1.2484308421294577, + "grad_norm": 5.971522808074951, + "learning_rate": 2.9192819297842373e-05, + "loss": 0.6524, + "step": 141220 + }, + { + "epoch": 1.24851924538977, + "grad_norm": 8.171072006225586, + "learning_rate": 2.9191345910170505e-05, + "loss": 0.4948, + "step": 141230 + }, + { + "epoch": 1.2486076486500823, + "grad_norm": 6.670187473297119, + "learning_rate": 2.918987252249863e-05, + "loss": 0.6167, + "step": 141240 + }, + { + "epoch": 1.2486960519103945, + "grad_norm": 3.2850587368011475, + "learning_rate": 2.918839913482676e-05, + "loss": 0.7249, + "step": 141250 + }, + { + "epoch": 1.2487844551707066, + "grad_norm": 19.041240692138672, + "learning_rate": 2.9186925747154893e-05, + "loss": 0.6956, + "step": 141260 + }, + { + "epoch": 1.248872858431019, + "grad_norm": 2.8739800453186035, + "learning_rate": 2.9185452359483018e-05, + "loss": 0.6685, + "step": 141270 + }, + { + "epoch": 1.2489612616913313, + "grad_norm": 3.2773549556732178, + "learning_rate": 2.918397897181115e-05, + "loss": 0.6015, + "step": 141280 + }, + { + "epoch": 1.2490496649516434, + "grad_norm": 2.605332136154175, + "learning_rate": 2.918250558413928e-05, + "loss": 0.6365, + "step": 141290 + }, + { + "epoch": 1.2491380682119557, + "grad_norm": 10.375014305114746, + "learning_rate": 2.9181032196467406e-05, + "loss": 0.4976, + "step": 141300 + }, + { + "epoch": 1.2492264714722678, + "grad_norm": 2.152941942214966, + "learning_rate": 2.9179558808795538e-05, + "loss": 0.5866, + "step": 141310 + }, + { + "epoch": 1.2493148747325802, + "grad_norm": 4.143291473388672, + "learning_rate": 2.9178085421123663e-05, + "loss": 0.6137, + "step": 141320 + }, + { + "epoch": 1.2494032779928923, + "grad_norm": 2.2320897579193115, + "learning_rate": 2.9176612033451795e-05, + "loss": 0.5502, + "step": 141330 + }, + { + "epoch": 1.2494916812532046, + "grad_norm": 2.5040457248687744, + "learning_rate": 2.9175138645779927e-05, + "loss": 0.695, + "step": 141340 + }, + { + "epoch": 1.2495800845135168, + "grad_norm": 2.3700103759765625, + "learning_rate": 2.917366525810805e-05, + "loss": 0.7287, + "step": 141350 + }, + { + "epoch": 1.249668487773829, + "grad_norm": 3.8010458946228027, + "learning_rate": 2.9172191870436183e-05, + "loss": 0.5645, + "step": 141360 + }, + { + "epoch": 1.2497568910341412, + "grad_norm": 2.8290529251098633, + "learning_rate": 2.9170718482764315e-05, + "loss": 0.5873, + "step": 141370 + }, + { + "epoch": 1.2498452942944536, + "grad_norm": 5.750149726867676, + "learning_rate": 2.916924509509244e-05, + "loss": 0.5435, + "step": 141380 + }, + { + "epoch": 1.249933697554766, + "grad_norm": 1.9185670614242554, + "learning_rate": 2.9167771707420572e-05, + "loss": 0.5686, + "step": 141390 + }, + { + "epoch": 1.250022100815078, + "grad_norm": 2.6748223304748535, + "learning_rate": 2.9166298319748704e-05, + "loss": 0.5652, + "step": 141400 + }, + { + "epoch": 1.2501105040753904, + "grad_norm": 2.3826236724853516, + "learning_rate": 2.916482493207683e-05, + "loss": 0.7153, + "step": 141410 + }, + { + "epoch": 1.2501989073357025, + "grad_norm": 1.8094279766082764, + "learning_rate": 2.916335154440496e-05, + "loss": 0.5242, + "step": 141420 + }, + { + "epoch": 1.2502873105960148, + "grad_norm": 1.37126624584198, + "learning_rate": 2.9161878156733085e-05, + "loss": 0.6647, + "step": 141430 + }, + { + "epoch": 1.250375713856327, + "grad_norm": 2.8650171756744385, + "learning_rate": 2.9160404769061217e-05, + "loss": 0.6869, + "step": 141440 + }, + { + "epoch": 1.2504641171166393, + "grad_norm": 1.3624227046966553, + "learning_rate": 2.915893138138935e-05, + "loss": 0.498, + "step": 141450 + }, + { + "epoch": 1.2505525203769516, + "grad_norm": 1.9418193101882935, + "learning_rate": 2.9157457993717474e-05, + "loss": 0.6451, + "step": 141460 + }, + { + "epoch": 1.2506409236372638, + "grad_norm": 1.5357246398925781, + "learning_rate": 2.9155984606045605e-05, + "loss": 0.5752, + "step": 141470 + }, + { + "epoch": 1.2507293268975759, + "grad_norm": 5.975726127624512, + "learning_rate": 2.9154511218373737e-05, + "loss": 0.4895, + "step": 141480 + }, + { + "epoch": 1.2508177301578882, + "grad_norm": 7.569196701049805, + "learning_rate": 2.9153037830701862e-05, + "loss": 0.6037, + "step": 141490 + }, + { + "epoch": 1.2509061334182006, + "grad_norm": 5.183529853820801, + "learning_rate": 2.9151564443029994e-05, + "loss": 0.618, + "step": 141500 + }, + { + "epoch": 1.2509945366785127, + "grad_norm": 2.297191619873047, + "learning_rate": 2.9150091055358126e-05, + "loss": 0.5857, + "step": 141510 + }, + { + "epoch": 1.251082939938825, + "grad_norm": 3.5654499530792236, + "learning_rate": 2.914861766768625e-05, + "loss": 0.6124, + "step": 141520 + }, + { + "epoch": 1.2511713431991371, + "grad_norm": 1.1095050573349, + "learning_rate": 2.9147144280014382e-05, + "loss": 0.5371, + "step": 141530 + }, + { + "epoch": 1.2512597464594495, + "grad_norm": 4.851041316986084, + "learning_rate": 2.9145670892342514e-05, + "loss": 0.5768, + "step": 141540 + }, + { + "epoch": 1.2513481497197616, + "grad_norm": 1.8972887992858887, + "learning_rate": 2.914419750467064e-05, + "loss": 0.6216, + "step": 141550 + }, + { + "epoch": 1.251436552980074, + "grad_norm": 3.0215260982513428, + "learning_rate": 2.914272411699877e-05, + "loss": 0.7099, + "step": 141560 + }, + { + "epoch": 1.251524956240386, + "grad_norm": 3.6639623641967773, + "learning_rate": 2.9141250729326896e-05, + "loss": 0.6488, + "step": 141570 + }, + { + "epoch": 1.2516133595006984, + "grad_norm": 9.585492134094238, + "learning_rate": 2.9139777341655027e-05, + "loss": 0.4569, + "step": 141580 + }, + { + "epoch": 1.2517017627610105, + "grad_norm": 2.5919620990753174, + "learning_rate": 2.913830395398316e-05, + "loss": 0.5874, + "step": 141590 + }, + { + "epoch": 1.2517901660213229, + "grad_norm": 5.106435775756836, + "learning_rate": 2.9136830566311284e-05, + "loss": 0.6638, + "step": 141600 + }, + { + "epoch": 1.2518785692816352, + "grad_norm": 8.585777282714844, + "learning_rate": 2.9135357178639416e-05, + "loss": 0.6941, + "step": 141610 + }, + { + "epoch": 1.2519669725419473, + "grad_norm": 3.21549129486084, + "learning_rate": 2.9133883790967548e-05, + "loss": 0.5557, + "step": 141620 + }, + { + "epoch": 1.2520553758022595, + "grad_norm": 19.117788314819336, + "learning_rate": 2.9132410403295673e-05, + "loss": 0.6647, + "step": 141630 + }, + { + "epoch": 1.2521437790625718, + "grad_norm": 2.573091983795166, + "learning_rate": 2.9130937015623804e-05, + "loss": 0.684, + "step": 141640 + }, + { + "epoch": 1.2522321823228841, + "grad_norm": 4.657829284667969, + "learning_rate": 2.9129463627951936e-05, + "loss": 0.5427, + "step": 141650 + }, + { + "epoch": 1.2523205855831963, + "grad_norm": 2.9532501697540283, + "learning_rate": 2.912799024028006e-05, + "loss": 0.5121, + "step": 141660 + }, + { + "epoch": 1.2524089888435086, + "grad_norm": 1.4169063568115234, + "learning_rate": 2.9126516852608193e-05, + "loss": 0.5873, + "step": 141670 + }, + { + "epoch": 1.2524973921038207, + "grad_norm": 21.120031356811523, + "learning_rate": 2.9125043464936318e-05, + "loss": 0.6757, + "step": 141680 + }, + { + "epoch": 1.252585795364133, + "grad_norm": 1.2934825420379639, + "learning_rate": 2.912357007726445e-05, + "loss": 0.5375, + "step": 141690 + }, + { + "epoch": 1.2526741986244452, + "grad_norm": 11.707215309143066, + "learning_rate": 2.912209668959258e-05, + "loss": 0.6225, + "step": 141700 + }, + { + "epoch": 1.2527626018847575, + "grad_norm": 3.3010013103485107, + "learning_rate": 2.9120623301920706e-05, + "loss": 0.4932, + "step": 141710 + }, + { + "epoch": 1.2528510051450699, + "grad_norm": 7.640185356140137, + "learning_rate": 2.9119149914248838e-05, + "loss": 0.6269, + "step": 141720 + }, + { + "epoch": 1.252939408405382, + "grad_norm": 2.6844773292541504, + "learning_rate": 2.911767652657697e-05, + "loss": 0.7, + "step": 141730 + }, + { + "epoch": 1.253027811665694, + "grad_norm": 4.77592658996582, + "learning_rate": 2.9116203138905095e-05, + "loss": 0.613, + "step": 141740 + }, + { + "epoch": 1.2531162149260064, + "grad_norm": 2.815992593765259, + "learning_rate": 2.9114729751233226e-05, + "loss": 0.6259, + "step": 141750 + }, + { + "epoch": 1.2532046181863188, + "grad_norm": 3.3699681758880615, + "learning_rate": 2.9113256363561358e-05, + "loss": 0.6528, + "step": 141760 + }, + { + "epoch": 1.253293021446631, + "grad_norm": 1.2847920656204224, + "learning_rate": 2.9111782975889483e-05, + "loss": 0.6925, + "step": 141770 + }, + { + "epoch": 1.2533814247069432, + "grad_norm": 9.219353675842285, + "learning_rate": 2.9110309588217615e-05, + "loss": 0.593, + "step": 141780 + }, + { + "epoch": 1.2534698279672554, + "grad_norm": 17.176612854003906, + "learning_rate": 2.9108836200545743e-05, + "loss": 0.6063, + "step": 141790 + }, + { + "epoch": 1.2535582312275677, + "grad_norm": 4.702795505523682, + "learning_rate": 2.910736281287387e-05, + "loss": 0.5646, + "step": 141800 + }, + { + "epoch": 1.2536466344878798, + "grad_norm": 9.503539085388184, + "learning_rate": 2.9105889425202003e-05, + "loss": 0.759, + "step": 141810 + }, + { + "epoch": 1.2537350377481922, + "grad_norm": 1.0335949659347534, + "learning_rate": 2.910441603753013e-05, + "loss": 0.5143, + "step": 141820 + }, + { + "epoch": 1.2538234410085045, + "grad_norm": 11.666600227355957, + "learning_rate": 2.910294264985826e-05, + "loss": 0.6094, + "step": 141830 + }, + { + "epoch": 1.2539118442688166, + "grad_norm": 5.775475025177002, + "learning_rate": 2.9101469262186392e-05, + "loss": 0.6613, + "step": 141840 + }, + { + "epoch": 1.2540002475291288, + "grad_norm": 2.63454270362854, + "learning_rate": 2.909999587451452e-05, + "loss": 0.536, + "step": 141850 + }, + { + "epoch": 1.254088650789441, + "grad_norm": 2.5589218139648438, + "learning_rate": 2.909852248684265e-05, + "loss": 0.5766, + "step": 141860 + }, + { + "epoch": 1.2541770540497534, + "grad_norm": 3.598965883255005, + "learning_rate": 2.909704909917078e-05, + "loss": 0.6469, + "step": 141870 + }, + { + "epoch": 1.2542654573100656, + "grad_norm": 5.1218180656433105, + "learning_rate": 2.909557571149891e-05, + "loss": 0.49, + "step": 141880 + }, + { + "epoch": 1.254353860570378, + "grad_norm": 6.656684398651123, + "learning_rate": 2.9094102323827037e-05, + "loss": 0.6965, + "step": 141890 + }, + { + "epoch": 1.25444226383069, + "grad_norm": 4.330774784088135, + "learning_rate": 2.9092628936155165e-05, + "loss": 0.6882, + "step": 141900 + }, + { + "epoch": 1.2545306670910024, + "grad_norm": 1.1962385177612305, + "learning_rate": 2.9091155548483297e-05, + "loss": 0.6487, + "step": 141910 + }, + { + "epoch": 1.2546190703513145, + "grad_norm": 5.509335517883301, + "learning_rate": 2.9089682160811425e-05, + "loss": 0.6785, + "step": 141920 + }, + { + "epoch": 1.2547074736116268, + "grad_norm": 6.97117805480957, + "learning_rate": 2.9088208773139554e-05, + "loss": 0.4744, + "step": 141930 + }, + { + "epoch": 1.2547958768719392, + "grad_norm": 1.8010069131851196, + "learning_rate": 2.9086735385467685e-05, + "loss": 0.6192, + "step": 141940 + }, + { + "epoch": 1.2548842801322513, + "grad_norm": 2.195843458175659, + "learning_rate": 2.9085261997795814e-05, + "loss": 0.6447, + "step": 141950 + }, + { + "epoch": 1.2549726833925634, + "grad_norm": 1.7158185243606567, + "learning_rate": 2.9083788610123942e-05, + "loss": 0.6275, + "step": 141960 + }, + { + "epoch": 1.2550610866528757, + "grad_norm": 4.613834381103516, + "learning_rate": 2.9082315222452074e-05, + "loss": 0.6075, + "step": 141970 + }, + { + "epoch": 1.255149489913188, + "grad_norm": 5.494673728942871, + "learning_rate": 2.9080841834780202e-05, + "loss": 0.5105, + "step": 141980 + }, + { + "epoch": 1.2552378931735002, + "grad_norm": 2.8348162174224854, + "learning_rate": 2.907936844710833e-05, + "loss": 0.6561, + "step": 141990 + }, + { + "epoch": 1.2553262964338125, + "grad_norm": 1.4355186223983765, + "learning_rate": 2.9077895059436462e-05, + "loss": 0.6661, + "step": 142000 + }, + { + "epoch": 1.2554146996941247, + "grad_norm": 2.367457389831543, + "learning_rate": 2.907642167176459e-05, + "loss": 0.6743, + "step": 142010 + }, + { + "epoch": 1.255503102954437, + "grad_norm": 2.9533989429473877, + "learning_rate": 2.907494828409272e-05, + "loss": 0.4789, + "step": 142020 + }, + { + "epoch": 1.2555915062147491, + "grad_norm": 2.8348333835601807, + "learning_rate": 2.907347489642085e-05, + "loss": 0.6829, + "step": 142030 + }, + { + "epoch": 1.2556799094750615, + "grad_norm": 1.6097936630249023, + "learning_rate": 2.9072001508748976e-05, + "loss": 0.4533, + "step": 142040 + }, + { + "epoch": 1.2557683127353738, + "grad_norm": 9.160196304321289, + "learning_rate": 2.9070528121077108e-05, + "loss": 0.5649, + "step": 142050 + }, + { + "epoch": 1.255856715995686, + "grad_norm": 2.1227221488952637, + "learning_rate": 2.906905473340524e-05, + "loss": 0.679, + "step": 142060 + }, + { + "epoch": 1.255945119255998, + "grad_norm": 4.315895080566406, + "learning_rate": 2.9067581345733364e-05, + "loss": 0.652, + "step": 142070 + }, + { + "epoch": 1.2560335225163104, + "grad_norm": 5.385100364685059, + "learning_rate": 2.9066107958061496e-05, + "loss": 0.4854, + "step": 142080 + }, + { + "epoch": 1.2561219257766227, + "grad_norm": 1.7082422971725464, + "learning_rate": 2.9064634570389628e-05, + "loss": 0.734, + "step": 142090 + }, + { + "epoch": 1.2562103290369349, + "grad_norm": 3.230713367462158, + "learning_rate": 2.9063161182717753e-05, + "loss": 0.7296, + "step": 142100 + }, + { + "epoch": 1.2562987322972472, + "grad_norm": 1.7252577543258667, + "learning_rate": 2.9061687795045884e-05, + "loss": 0.6799, + "step": 142110 + }, + { + "epoch": 1.2563871355575593, + "grad_norm": 2.9703972339630127, + "learning_rate": 2.9060214407374016e-05, + "loss": 0.6039, + "step": 142120 + }, + { + "epoch": 1.2564755388178717, + "grad_norm": 2.5418121814727783, + "learning_rate": 2.905874101970214e-05, + "loss": 0.4859, + "step": 142130 + }, + { + "epoch": 1.2565639420781838, + "grad_norm": 4.017408847808838, + "learning_rate": 2.9057267632030273e-05, + "loss": 0.6434, + "step": 142140 + }, + { + "epoch": 1.2566523453384961, + "grad_norm": 3.0217642784118652, + "learning_rate": 2.9055794244358398e-05, + "loss": 0.6172, + "step": 142150 + }, + { + "epoch": 1.2567407485988082, + "grad_norm": 2.3579957485198975, + "learning_rate": 2.905432085668653e-05, + "loss": 0.6518, + "step": 142160 + }, + { + "epoch": 1.2568291518591206, + "grad_norm": 6.132658004760742, + "learning_rate": 2.905284746901466e-05, + "loss": 0.5344, + "step": 142170 + }, + { + "epoch": 1.2569175551194327, + "grad_norm": 2.83100962638855, + "learning_rate": 2.9051374081342786e-05, + "loss": 0.7732, + "step": 142180 + }, + { + "epoch": 1.257005958379745, + "grad_norm": 1.8778926134109497, + "learning_rate": 2.9049900693670918e-05, + "loss": 0.6001, + "step": 142190 + }, + { + "epoch": 1.2570943616400574, + "grad_norm": 2.402885913848877, + "learning_rate": 2.904842730599905e-05, + "loss": 0.5186, + "step": 142200 + }, + { + "epoch": 1.2571827649003695, + "grad_norm": 1.9606581926345825, + "learning_rate": 2.9046953918327175e-05, + "loss": 0.5075, + "step": 142210 + }, + { + "epoch": 1.2572711681606816, + "grad_norm": 6.555304050445557, + "learning_rate": 2.9045480530655306e-05, + "loss": 0.5144, + "step": 142220 + }, + { + "epoch": 1.257359571420994, + "grad_norm": 3.139082908630371, + "learning_rate": 2.9044007142983438e-05, + "loss": 0.6225, + "step": 142230 + }, + { + "epoch": 1.2574479746813063, + "grad_norm": 2.987534761428833, + "learning_rate": 2.9042533755311563e-05, + "loss": 0.4837, + "step": 142240 + }, + { + "epoch": 1.2575363779416184, + "grad_norm": 1.3827580213546753, + "learning_rate": 2.9041060367639695e-05, + "loss": 0.6732, + "step": 142250 + }, + { + "epoch": 1.2576247812019308, + "grad_norm": 4.690704822540283, + "learning_rate": 2.903958697996782e-05, + "loss": 0.6898, + "step": 142260 + }, + { + "epoch": 1.257713184462243, + "grad_norm": 3.160564661026001, + "learning_rate": 2.903811359229595e-05, + "loss": 0.7037, + "step": 142270 + }, + { + "epoch": 1.2578015877225552, + "grad_norm": 4.473592758178711, + "learning_rate": 2.9036640204624083e-05, + "loss": 0.6937, + "step": 142280 + }, + { + "epoch": 1.2578899909828674, + "grad_norm": 4.894199848175049, + "learning_rate": 2.903516681695221e-05, + "loss": 0.6031, + "step": 142290 + }, + { + "epoch": 1.2579783942431797, + "grad_norm": 1.4990565776824951, + "learning_rate": 2.903369342928034e-05, + "loss": 0.5938, + "step": 142300 + }, + { + "epoch": 1.258066797503492, + "grad_norm": 1.4152413606643677, + "learning_rate": 2.9032220041608472e-05, + "loss": 0.5592, + "step": 142310 + }, + { + "epoch": 1.2581552007638042, + "grad_norm": 0.9226208329200745, + "learning_rate": 2.9030746653936597e-05, + "loss": 0.6542, + "step": 142320 + }, + { + "epoch": 1.2582436040241163, + "grad_norm": 3.8320677280426025, + "learning_rate": 2.902927326626473e-05, + "loss": 0.6514, + "step": 142330 + }, + { + "epoch": 1.2583320072844286, + "grad_norm": 2.6828713417053223, + "learning_rate": 2.902779987859286e-05, + "loss": 0.6364, + "step": 142340 + }, + { + "epoch": 1.258420410544741, + "grad_norm": 8.254096031188965, + "learning_rate": 2.9026326490920985e-05, + "loss": 0.5871, + "step": 142350 + }, + { + "epoch": 1.258508813805053, + "grad_norm": 2.9420909881591797, + "learning_rate": 2.9024853103249117e-05, + "loss": 0.6058, + "step": 142360 + }, + { + "epoch": 1.2585972170653654, + "grad_norm": 7.3002800941467285, + "learning_rate": 2.9023379715577242e-05, + "loss": 0.6542, + "step": 142370 + }, + { + "epoch": 1.2586856203256775, + "grad_norm": 4.738195419311523, + "learning_rate": 2.9021906327905374e-05, + "loss": 0.6273, + "step": 142380 + }, + { + "epoch": 1.2587740235859899, + "grad_norm": 5.579693794250488, + "learning_rate": 2.9020432940233505e-05, + "loss": 0.5985, + "step": 142390 + }, + { + "epoch": 1.258862426846302, + "grad_norm": 3.8817455768585205, + "learning_rate": 2.901895955256163e-05, + "loss": 0.5406, + "step": 142400 + }, + { + "epoch": 1.2589508301066143, + "grad_norm": 11.041511535644531, + "learning_rate": 2.9017486164889762e-05, + "loss": 0.5037, + "step": 142410 + }, + { + "epoch": 1.2590392333669267, + "grad_norm": 8.328207015991211, + "learning_rate": 2.9016012777217894e-05, + "loss": 0.5755, + "step": 142420 + }, + { + "epoch": 1.2591276366272388, + "grad_norm": 2.8297994136810303, + "learning_rate": 2.901453938954602e-05, + "loss": 0.678, + "step": 142430 + }, + { + "epoch": 1.259216039887551, + "grad_norm": 2.941640615463257, + "learning_rate": 2.901306600187415e-05, + "loss": 0.7165, + "step": 142440 + }, + { + "epoch": 1.2593044431478633, + "grad_norm": 6.018679618835449, + "learning_rate": 2.9011592614202282e-05, + "loss": 0.5531, + "step": 142450 + }, + { + "epoch": 1.2593928464081756, + "grad_norm": 0.9858852624893188, + "learning_rate": 2.9010119226530407e-05, + "loss": 0.5502, + "step": 142460 + }, + { + "epoch": 1.2594812496684877, + "grad_norm": 3.667024612426758, + "learning_rate": 2.900864583885854e-05, + "loss": 0.5949, + "step": 142470 + }, + { + "epoch": 1.2595696529288, + "grad_norm": 1.5842547416687012, + "learning_rate": 2.900717245118667e-05, + "loss": 0.6418, + "step": 142480 + }, + { + "epoch": 1.2596580561891122, + "grad_norm": 2.1449875831604004, + "learning_rate": 2.9005699063514796e-05, + "loss": 0.5484, + "step": 142490 + }, + { + "epoch": 1.2597464594494245, + "grad_norm": 1.6487674713134766, + "learning_rate": 2.9004225675842928e-05, + "loss": 0.6449, + "step": 142500 + }, + { + "epoch": 1.2598348627097367, + "grad_norm": 1.6574065685272217, + "learning_rate": 2.9002752288171052e-05, + "loss": 0.5596, + "step": 142510 + }, + { + "epoch": 1.259923265970049, + "grad_norm": 2.4455056190490723, + "learning_rate": 2.9001278900499184e-05, + "loss": 0.6108, + "step": 142520 + }, + { + "epoch": 1.2600116692303613, + "grad_norm": 9.824710845947266, + "learning_rate": 2.8999805512827316e-05, + "loss": 0.6233, + "step": 142530 + }, + { + "epoch": 1.2601000724906735, + "grad_norm": 5.204301834106445, + "learning_rate": 2.899833212515544e-05, + "loss": 0.6731, + "step": 142540 + }, + { + "epoch": 1.2601884757509856, + "grad_norm": 3.536147356033325, + "learning_rate": 2.8996858737483573e-05, + "loss": 0.6073, + "step": 142550 + }, + { + "epoch": 1.260276879011298, + "grad_norm": 3.7649152278900146, + "learning_rate": 2.8995385349811704e-05, + "loss": 0.6917, + "step": 142560 + }, + { + "epoch": 1.2603652822716103, + "grad_norm": 3.116579532623291, + "learning_rate": 2.899391196213983e-05, + "loss": 0.6835, + "step": 142570 + }, + { + "epoch": 1.2604536855319224, + "grad_norm": 13.381340026855469, + "learning_rate": 2.899243857446796e-05, + "loss": 0.5467, + "step": 142580 + }, + { + "epoch": 1.2605420887922347, + "grad_norm": 2.4020488262176514, + "learning_rate": 2.8990965186796093e-05, + "loss": 0.6804, + "step": 142590 + }, + { + "epoch": 1.2606304920525468, + "grad_norm": 1.091081976890564, + "learning_rate": 2.8989491799124218e-05, + "loss": 0.6525, + "step": 142600 + }, + { + "epoch": 1.2607188953128592, + "grad_norm": 2.0964810848236084, + "learning_rate": 2.898801841145235e-05, + "loss": 0.6748, + "step": 142610 + }, + { + "epoch": 1.2608072985731713, + "grad_norm": 1.5391497611999512, + "learning_rate": 2.8986545023780475e-05, + "loss": 0.6673, + "step": 142620 + }, + { + "epoch": 1.2608957018334837, + "grad_norm": 1.1725083589553833, + "learning_rate": 2.8985071636108606e-05, + "loss": 0.489, + "step": 142630 + }, + { + "epoch": 1.260984105093796, + "grad_norm": 8.688754081726074, + "learning_rate": 2.8983598248436738e-05, + "loss": 0.5855, + "step": 142640 + }, + { + "epoch": 1.2610725083541081, + "grad_norm": 2.642036199569702, + "learning_rate": 2.8982124860764863e-05, + "loss": 0.6322, + "step": 142650 + }, + { + "epoch": 1.2611609116144202, + "grad_norm": 2.2940332889556885, + "learning_rate": 2.8980651473092995e-05, + "loss": 0.5994, + "step": 142660 + }, + { + "epoch": 1.2612493148747326, + "grad_norm": 2.187880277633667, + "learning_rate": 2.8979178085421126e-05, + "loss": 0.642, + "step": 142670 + }, + { + "epoch": 1.261337718135045, + "grad_norm": 2.172506332397461, + "learning_rate": 2.897770469774925e-05, + "loss": 0.5701, + "step": 142680 + }, + { + "epoch": 1.261426121395357, + "grad_norm": 1.471436858177185, + "learning_rate": 2.8976231310077383e-05, + "loss": 0.6288, + "step": 142690 + }, + { + "epoch": 1.2615145246556694, + "grad_norm": 1.1443840265274048, + "learning_rate": 2.8974757922405515e-05, + "loss": 0.6768, + "step": 142700 + }, + { + "epoch": 1.2616029279159815, + "grad_norm": 2.6344046592712402, + "learning_rate": 2.897328453473364e-05, + "loss": 0.6899, + "step": 142710 + }, + { + "epoch": 1.2616913311762938, + "grad_norm": 9.553025245666504, + "learning_rate": 2.897181114706177e-05, + "loss": 0.7157, + "step": 142720 + }, + { + "epoch": 1.261779734436606, + "grad_norm": 2.090665340423584, + "learning_rate": 2.89703377593899e-05, + "loss": 0.5796, + "step": 142730 + }, + { + "epoch": 1.2618681376969183, + "grad_norm": 8.538515090942383, + "learning_rate": 2.896886437171803e-05, + "loss": 0.5682, + "step": 142740 + }, + { + "epoch": 1.2619565409572304, + "grad_norm": 7.142655372619629, + "learning_rate": 2.896739098404616e-05, + "loss": 0.6292, + "step": 142750 + }, + { + "epoch": 1.2620449442175428, + "grad_norm": 1.1068390607833862, + "learning_rate": 2.896591759637429e-05, + "loss": 0.5977, + "step": 142760 + }, + { + "epoch": 1.2621333474778549, + "grad_norm": 4.276402473449707, + "learning_rate": 2.8964444208702417e-05, + "loss": 0.6011, + "step": 142770 + }, + { + "epoch": 1.2622217507381672, + "grad_norm": 5.4022417068481445, + "learning_rate": 2.896297082103055e-05, + "loss": 0.6588, + "step": 142780 + }, + { + "epoch": 1.2623101539984796, + "grad_norm": 5.604693412780762, + "learning_rate": 2.8961497433358677e-05, + "loss": 0.6636, + "step": 142790 + }, + { + "epoch": 1.2623985572587917, + "grad_norm": 1.653070092201233, + "learning_rate": 2.8960024045686805e-05, + "loss": 0.5639, + "step": 142800 + }, + { + "epoch": 1.2624869605191038, + "grad_norm": 3.6620776653289795, + "learning_rate": 2.8958550658014937e-05, + "loss": 0.7108, + "step": 142810 + }, + { + "epoch": 1.2625753637794161, + "grad_norm": 6.846787929534912, + "learning_rate": 2.8957077270343065e-05, + "loss": 0.6007, + "step": 142820 + }, + { + "epoch": 1.2626637670397285, + "grad_norm": 2.0712146759033203, + "learning_rate": 2.8955603882671194e-05, + "loss": 0.6024, + "step": 142830 + }, + { + "epoch": 1.2627521703000406, + "grad_norm": 7.8736419677734375, + "learning_rate": 2.8954130494999325e-05, + "loss": 0.5729, + "step": 142840 + }, + { + "epoch": 1.262840573560353, + "grad_norm": 4.709074020385742, + "learning_rate": 2.8952657107327454e-05, + "loss": 0.5712, + "step": 142850 + }, + { + "epoch": 1.262928976820665, + "grad_norm": 1.5062496662139893, + "learning_rate": 2.8951183719655582e-05, + "loss": 0.6204, + "step": 142860 + }, + { + "epoch": 1.2630173800809774, + "grad_norm": 2.3152246475219727, + "learning_rate": 2.894971033198371e-05, + "loss": 0.6111, + "step": 142870 + }, + { + "epoch": 1.2631057833412895, + "grad_norm": 2.668691396713257, + "learning_rate": 2.8948236944311842e-05, + "loss": 0.683, + "step": 142880 + }, + { + "epoch": 1.2631941866016019, + "grad_norm": 4.096856117248535, + "learning_rate": 2.894676355663997e-05, + "loss": 0.6052, + "step": 142890 + }, + { + "epoch": 1.2632825898619142, + "grad_norm": 4.328186511993408, + "learning_rate": 2.89452901689681e-05, + "loss": 0.6021, + "step": 142900 + }, + { + "epoch": 1.2633709931222263, + "grad_norm": 3.0060596466064453, + "learning_rate": 2.894381678129623e-05, + "loss": 0.5609, + "step": 142910 + }, + { + "epoch": 1.2634593963825385, + "grad_norm": 2.2473089694976807, + "learning_rate": 2.894234339362436e-05, + "loss": 0.5002, + "step": 142920 + }, + { + "epoch": 1.2635477996428508, + "grad_norm": 13.181225776672363, + "learning_rate": 2.8940870005952487e-05, + "loss": 0.6276, + "step": 142930 + }, + { + "epoch": 1.2636362029031631, + "grad_norm": 1.4006301164627075, + "learning_rate": 2.893939661828062e-05, + "loss": 0.6133, + "step": 142940 + }, + { + "epoch": 1.2637246061634753, + "grad_norm": 1.669376254081726, + "learning_rate": 2.8937923230608747e-05, + "loss": 0.5652, + "step": 142950 + }, + { + "epoch": 1.2638130094237876, + "grad_norm": 3.2969346046447754, + "learning_rate": 2.8936449842936876e-05, + "loss": 0.5649, + "step": 142960 + }, + { + "epoch": 1.2639014126840997, + "grad_norm": 1.5077687501907349, + "learning_rate": 2.8934976455265008e-05, + "loss": 0.6012, + "step": 142970 + }, + { + "epoch": 1.263989815944412, + "grad_norm": 3.3668384552001953, + "learning_rate": 2.8933503067593133e-05, + "loss": 0.7186, + "step": 142980 + }, + { + "epoch": 1.2640782192047242, + "grad_norm": 5.55092716217041, + "learning_rate": 2.8932029679921264e-05, + "loss": 0.6273, + "step": 142990 + }, + { + "epoch": 1.2641666224650365, + "grad_norm": 5.23398494720459, + "learning_rate": 2.8930556292249396e-05, + "loss": 0.6269, + "step": 143000 + }, + { + "epoch": 1.2642550257253489, + "grad_norm": 1.7478502988815308, + "learning_rate": 2.892908290457752e-05, + "loss": 0.597, + "step": 143010 + }, + { + "epoch": 1.264343428985661, + "grad_norm": 2.3077640533447266, + "learning_rate": 2.8927609516905653e-05, + "loss": 0.6253, + "step": 143020 + }, + { + "epoch": 1.264431832245973, + "grad_norm": 4.431246757507324, + "learning_rate": 2.8926136129233784e-05, + "loss": 0.7342, + "step": 143030 + }, + { + "epoch": 1.2645202355062855, + "grad_norm": 1.7367810010910034, + "learning_rate": 2.892466274156191e-05, + "loss": 0.5903, + "step": 143040 + }, + { + "epoch": 1.2646086387665978, + "grad_norm": 3.2478721141815186, + "learning_rate": 2.892318935389004e-05, + "loss": 0.5048, + "step": 143050 + }, + { + "epoch": 1.26469704202691, + "grad_norm": 1.2484029531478882, + "learning_rate": 2.8921715966218173e-05, + "loss": 0.4639, + "step": 143060 + }, + { + "epoch": 1.2647854452872223, + "grad_norm": 2.8576207160949707, + "learning_rate": 2.8920242578546298e-05, + "loss": 0.6445, + "step": 143070 + }, + { + "epoch": 1.2648738485475344, + "grad_norm": 3.724398136138916, + "learning_rate": 2.891876919087443e-05, + "loss": 0.6831, + "step": 143080 + }, + { + "epoch": 1.2649622518078467, + "grad_norm": 4.604367733001709, + "learning_rate": 2.8917295803202555e-05, + "loss": 0.5652, + "step": 143090 + }, + { + "epoch": 1.2650506550681588, + "grad_norm": 3.145444631576538, + "learning_rate": 2.8915822415530686e-05, + "loss": 0.6068, + "step": 143100 + }, + { + "epoch": 1.2651390583284712, + "grad_norm": 2.3527021408081055, + "learning_rate": 2.8914349027858818e-05, + "loss": 0.6915, + "step": 143110 + }, + { + "epoch": 1.2652274615887835, + "grad_norm": 1.904188871383667, + "learning_rate": 2.8912875640186943e-05, + "loss": 0.6783, + "step": 143120 + }, + { + "epoch": 1.2653158648490956, + "grad_norm": 2.834195852279663, + "learning_rate": 2.8911402252515075e-05, + "loss": 0.6958, + "step": 143130 + }, + { + "epoch": 1.2654042681094078, + "grad_norm": 5.731857776641846, + "learning_rate": 2.8909928864843207e-05, + "loss": 0.582, + "step": 143140 + }, + { + "epoch": 1.26549267136972, + "grad_norm": 1.5149227380752563, + "learning_rate": 2.890845547717133e-05, + "loss": 0.5661, + "step": 143150 + }, + { + "epoch": 1.2655810746300324, + "grad_norm": 13.879504203796387, + "learning_rate": 2.8906982089499463e-05, + "loss": 0.5487, + "step": 143160 + }, + { + "epoch": 1.2656694778903446, + "grad_norm": 4.728702068328857, + "learning_rate": 2.8905508701827595e-05, + "loss": 0.5851, + "step": 143170 + }, + { + "epoch": 1.265757881150657, + "grad_norm": 1.6866681575775146, + "learning_rate": 2.890403531415572e-05, + "loss": 0.6349, + "step": 143180 + }, + { + "epoch": 1.265846284410969, + "grad_norm": 1.0111569166183472, + "learning_rate": 2.8902561926483852e-05, + "loss": 0.5505, + "step": 143190 + }, + { + "epoch": 1.2659346876712814, + "grad_norm": 1.425072193145752, + "learning_rate": 2.8901088538811977e-05, + "loss": 0.5108, + "step": 143200 + }, + { + "epoch": 1.2660230909315935, + "grad_norm": 1.514492392539978, + "learning_rate": 2.889961515114011e-05, + "loss": 0.6134, + "step": 143210 + }, + { + "epoch": 1.2661114941919058, + "grad_norm": 1.8787119388580322, + "learning_rate": 2.889814176346824e-05, + "loss": 0.5376, + "step": 143220 + }, + { + "epoch": 1.2661998974522182, + "grad_norm": 13.510932922363281, + "learning_rate": 2.8896668375796365e-05, + "loss": 0.6555, + "step": 143230 + }, + { + "epoch": 1.2662883007125303, + "grad_norm": 8.407179832458496, + "learning_rate": 2.8895194988124497e-05, + "loss": 0.6163, + "step": 143240 + }, + { + "epoch": 1.2663767039728424, + "grad_norm": 3.4686079025268555, + "learning_rate": 2.889372160045263e-05, + "loss": 0.6605, + "step": 143250 + }, + { + "epoch": 1.2664651072331548, + "grad_norm": 1.2689135074615479, + "learning_rate": 2.8892248212780754e-05, + "loss": 0.4418, + "step": 143260 + }, + { + "epoch": 1.266553510493467, + "grad_norm": 3.256218671798706, + "learning_rate": 2.8890774825108885e-05, + "loss": 0.5736, + "step": 143270 + }, + { + "epoch": 1.2666419137537792, + "grad_norm": 2.8123676776885986, + "learning_rate": 2.8889301437437017e-05, + "loss": 0.5124, + "step": 143280 + }, + { + "epoch": 1.2667303170140916, + "grad_norm": 1.206463098526001, + "learning_rate": 2.8887828049765142e-05, + "loss": 0.5684, + "step": 143290 + }, + { + "epoch": 1.2668187202744037, + "grad_norm": 2.8159615993499756, + "learning_rate": 2.8886354662093274e-05, + "loss": 0.6916, + "step": 143300 + }, + { + "epoch": 1.266907123534716, + "grad_norm": 0.7424335479736328, + "learning_rate": 2.8884881274421405e-05, + "loss": 0.5167, + "step": 143310 + }, + { + "epoch": 1.2669955267950281, + "grad_norm": 0.9518781900405884, + "learning_rate": 2.888340788674953e-05, + "loss": 0.6491, + "step": 143320 + }, + { + "epoch": 1.2670839300553405, + "grad_norm": 2.728700637817383, + "learning_rate": 2.8881934499077662e-05, + "loss": 0.6594, + "step": 143330 + }, + { + "epoch": 1.2671723333156526, + "grad_norm": 4.281972408294678, + "learning_rate": 2.8880461111405787e-05, + "loss": 0.6846, + "step": 143340 + }, + { + "epoch": 1.267260736575965, + "grad_norm": 4.021638870239258, + "learning_rate": 2.887898772373392e-05, + "loss": 0.4888, + "step": 143350 + }, + { + "epoch": 1.267349139836277, + "grad_norm": 7.736510753631592, + "learning_rate": 2.887751433606205e-05, + "loss": 0.6918, + "step": 143360 + }, + { + "epoch": 1.2674375430965894, + "grad_norm": 0.790366530418396, + "learning_rate": 2.8876040948390176e-05, + "loss": 0.59, + "step": 143370 + }, + { + "epoch": 1.2675259463569017, + "grad_norm": 1.8299994468688965, + "learning_rate": 2.8874567560718307e-05, + "loss": 0.6455, + "step": 143380 + }, + { + "epoch": 1.2676143496172139, + "grad_norm": 3.1271791458129883, + "learning_rate": 2.887309417304644e-05, + "loss": 0.5926, + "step": 143390 + }, + { + "epoch": 1.267702752877526, + "grad_norm": 2.715224266052246, + "learning_rate": 2.8871620785374564e-05, + "loss": 0.5584, + "step": 143400 + }, + { + "epoch": 1.2677911561378383, + "grad_norm": 6.12232780456543, + "learning_rate": 2.8870147397702696e-05, + "loss": 0.6501, + "step": 143410 + }, + { + "epoch": 1.2678795593981507, + "grad_norm": 5.59682035446167, + "learning_rate": 2.8868674010030828e-05, + "loss": 0.7469, + "step": 143420 + }, + { + "epoch": 1.2679679626584628, + "grad_norm": 6.036071300506592, + "learning_rate": 2.8867200622358953e-05, + "loss": 0.7425, + "step": 143430 + }, + { + "epoch": 1.2680563659187751, + "grad_norm": 1.6482362747192383, + "learning_rate": 2.8865727234687084e-05, + "loss": 0.606, + "step": 143440 + }, + { + "epoch": 1.2681447691790873, + "grad_norm": 2.166813850402832, + "learning_rate": 2.886425384701521e-05, + "loss": 0.667, + "step": 143450 + }, + { + "epoch": 1.2682331724393996, + "grad_norm": 1.209266185760498, + "learning_rate": 2.886278045934334e-05, + "loss": 0.6682, + "step": 143460 + }, + { + "epoch": 1.2683215756997117, + "grad_norm": 11.603596687316895, + "learning_rate": 2.8861307071671473e-05, + "loss": 0.5872, + "step": 143470 + }, + { + "epoch": 1.268409978960024, + "grad_norm": 1.7886656522750854, + "learning_rate": 2.8859833683999598e-05, + "loss": 0.5745, + "step": 143480 + }, + { + "epoch": 1.2684983822203364, + "grad_norm": 3.00777268409729, + "learning_rate": 2.885836029632773e-05, + "loss": 0.6183, + "step": 143490 + }, + { + "epoch": 1.2685867854806485, + "grad_norm": 1.610642433166504, + "learning_rate": 2.885688690865586e-05, + "loss": 0.6505, + "step": 143500 + }, + { + "epoch": 1.2686751887409606, + "grad_norm": 1.5638673305511475, + "learning_rate": 2.8855413520983986e-05, + "loss": 0.525, + "step": 143510 + }, + { + "epoch": 1.268763592001273, + "grad_norm": 3.448699712753296, + "learning_rate": 2.8853940133312118e-05, + "loss": 0.6483, + "step": 143520 + }, + { + "epoch": 1.2688519952615853, + "grad_norm": 13.250480651855469, + "learning_rate": 2.885246674564025e-05, + "loss": 0.4388, + "step": 143530 + }, + { + "epoch": 1.2689403985218974, + "grad_norm": 2.753995656967163, + "learning_rate": 2.8850993357968375e-05, + "loss": 0.6268, + "step": 143540 + }, + { + "epoch": 1.2690288017822098, + "grad_norm": 2.7080774307250977, + "learning_rate": 2.8849519970296506e-05, + "loss": 0.5594, + "step": 143550 + }, + { + "epoch": 1.269117205042522, + "grad_norm": 3.518550157546997, + "learning_rate": 2.884804658262463e-05, + "loss": 0.5845, + "step": 143560 + }, + { + "epoch": 1.2692056083028342, + "grad_norm": 1.6237034797668457, + "learning_rate": 2.8846573194952763e-05, + "loss": 0.5847, + "step": 143570 + }, + { + "epoch": 1.2692940115631464, + "grad_norm": 6.082953929901123, + "learning_rate": 2.8845099807280895e-05, + "loss": 0.722, + "step": 143580 + }, + { + "epoch": 1.2693824148234587, + "grad_norm": 3.562917947769165, + "learning_rate": 2.884362641960902e-05, + "loss": 0.6102, + "step": 143590 + }, + { + "epoch": 1.269470818083771, + "grad_norm": 1.9264031648635864, + "learning_rate": 2.884215303193715e-05, + "loss": 0.4509, + "step": 143600 + }, + { + "epoch": 1.2695592213440832, + "grad_norm": 1.300840139389038, + "learning_rate": 2.8840679644265283e-05, + "loss": 0.5835, + "step": 143610 + }, + { + "epoch": 1.2696476246043953, + "grad_norm": 1.7975136041641235, + "learning_rate": 2.8839206256593408e-05, + "loss": 0.7292, + "step": 143620 + }, + { + "epoch": 1.2697360278647076, + "grad_norm": 5.618459224700928, + "learning_rate": 2.883773286892154e-05, + "loss": 0.4774, + "step": 143630 + }, + { + "epoch": 1.26982443112502, + "grad_norm": 0.8197553753852844, + "learning_rate": 2.883625948124967e-05, + "loss": 0.6866, + "step": 143640 + }, + { + "epoch": 1.269912834385332, + "grad_norm": 6.46420955657959, + "learning_rate": 2.8834786093577797e-05, + "loss": 0.6666, + "step": 143650 + }, + { + "epoch": 1.2700012376456444, + "grad_norm": 7.9257612228393555, + "learning_rate": 2.883331270590593e-05, + "loss": 0.6584, + "step": 143660 + }, + { + "epoch": 1.2700896409059566, + "grad_norm": 3.0767781734466553, + "learning_rate": 2.8831839318234057e-05, + "loss": 0.6179, + "step": 143670 + }, + { + "epoch": 1.270178044166269, + "grad_norm": 1.7076400518417358, + "learning_rate": 2.8830365930562185e-05, + "loss": 0.5594, + "step": 143680 + }, + { + "epoch": 1.270266447426581, + "grad_norm": 3.3699827194213867, + "learning_rate": 2.8828892542890317e-05, + "loss": 0.5119, + "step": 143690 + }, + { + "epoch": 1.2703548506868934, + "grad_norm": 2.9758145809173584, + "learning_rate": 2.8827419155218445e-05, + "loss": 0.6506, + "step": 143700 + }, + { + "epoch": 1.2704432539472057, + "grad_norm": 2.0639407634735107, + "learning_rate": 2.8825945767546574e-05, + "loss": 0.6716, + "step": 143710 + }, + { + "epoch": 1.2705316572075178, + "grad_norm": 2.3735551834106445, + "learning_rate": 2.8824472379874705e-05, + "loss": 0.6813, + "step": 143720 + }, + { + "epoch": 1.27062006046783, + "grad_norm": 1.4210724830627441, + "learning_rate": 2.8822998992202834e-05, + "loss": 0.5981, + "step": 143730 + }, + { + "epoch": 1.2707084637281423, + "grad_norm": 1.2074196338653564, + "learning_rate": 2.8821525604530962e-05, + "loss": 0.5499, + "step": 143740 + }, + { + "epoch": 1.2707968669884546, + "grad_norm": 1.5849781036376953, + "learning_rate": 2.8820052216859094e-05, + "loss": 0.6157, + "step": 143750 + }, + { + "epoch": 1.2708852702487667, + "grad_norm": 7.992796897888184, + "learning_rate": 2.8818578829187222e-05, + "loss": 0.591, + "step": 143760 + }, + { + "epoch": 1.270973673509079, + "grad_norm": 5.470476150512695, + "learning_rate": 2.881710544151535e-05, + "loss": 0.5298, + "step": 143770 + }, + { + "epoch": 1.2710620767693912, + "grad_norm": 6.0549116134643555, + "learning_rate": 2.8815632053843482e-05, + "loss": 0.5797, + "step": 143780 + }, + { + "epoch": 1.2711504800297035, + "grad_norm": 1.3632700443267822, + "learning_rate": 2.881415866617161e-05, + "loss": 0.6716, + "step": 143790 + }, + { + "epoch": 1.2712388832900157, + "grad_norm": 2.3481719493865967, + "learning_rate": 2.881268527849974e-05, + "loss": 0.5658, + "step": 143800 + }, + { + "epoch": 1.271327286550328, + "grad_norm": 2.9604926109313965, + "learning_rate": 2.8811211890827867e-05, + "loss": 0.5784, + "step": 143810 + }, + { + "epoch": 1.2714156898106403, + "grad_norm": 8.447488784790039, + "learning_rate": 2.8809738503156e-05, + "loss": 0.5668, + "step": 143820 + }, + { + "epoch": 1.2715040930709525, + "grad_norm": 20.94457244873047, + "learning_rate": 2.8808265115484127e-05, + "loss": 0.5542, + "step": 143830 + }, + { + "epoch": 1.2715924963312646, + "grad_norm": 2.674136161804199, + "learning_rate": 2.8806791727812256e-05, + "loss": 0.6087, + "step": 143840 + }, + { + "epoch": 1.271680899591577, + "grad_norm": 4.967868328094482, + "learning_rate": 2.8805318340140387e-05, + "loss": 0.5514, + "step": 143850 + }, + { + "epoch": 1.2717693028518893, + "grad_norm": 5.074409484863281, + "learning_rate": 2.8803844952468516e-05, + "loss": 0.713, + "step": 143860 + }, + { + "epoch": 1.2718577061122014, + "grad_norm": 2.294990301132202, + "learning_rate": 2.8802371564796644e-05, + "loss": 0.6364, + "step": 143870 + }, + { + "epoch": 1.2719461093725137, + "grad_norm": 9.884675025939941, + "learning_rate": 2.8800898177124776e-05, + "loss": 0.5875, + "step": 143880 + }, + { + "epoch": 1.2720345126328259, + "grad_norm": 6.053366184234619, + "learning_rate": 2.8799424789452904e-05, + "loss": 0.6447, + "step": 143890 + }, + { + "epoch": 1.2721229158931382, + "grad_norm": 8.812887191772461, + "learning_rate": 2.8797951401781033e-05, + "loss": 0.7148, + "step": 143900 + }, + { + "epoch": 1.2722113191534503, + "grad_norm": 1.4976969957351685, + "learning_rate": 2.8796478014109164e-05, + "loss": 0.5782, + "step": 143910 + }, + { + "epoch": 1.2722997224137627, + "grad_norm": 5.260051727294922, + "learning_rate": 2.879500462643729e-05, + "loss": 0.646, + "step": 143920 + }, + { + "epoch": 1.2723881256740748, + "grad_norm": 8.015777587890625, + "learning_rate": 2.879353123876542e-05, + "loss": 0.677, + "step": 143930 + }, + { + "epoch": 1.2724765289343871, + "grad_norm": 11.101980209350586, + "learning_rate": 2.8792057851093553e-05, + "loss": 0.6675, + "step": 143940 + }, + { + "epoch": 1.2725649321946992, + "grad_norm": 5.310331344604492, + "learning_rate": 2.8790584463421678e-05, + "loss": 0.6534, + "step": 143950 + }, + { + "epoch": 1.2726533354550116, + "grad_norm": 5.071225166320801, + "learning_rate": 2.878911107574981e-05, + "loss": 0.5221, + "step": 143960 + }, + { + "epoch": 1.272741738715324, + "grad_norm": 4.7692155838012695, + "learning_rate": 2.878763768807794e-05, + "loss": 0.5558, + "step": 143970 + }, + { + "epoch": 1.272830141975636, + "grad_norm": 2.1279332637786865, + "learning_rate": 2.8786164300406066e-05, + "loss": 0.7547, + "step": 143980 + }, + { + "epoch": 1.2729185452359484, + "grad_norm": 3.2309956550598145, + "learning_rate": 2.8784690912734198e-05, + "loss": 0.6821, + "step": 143990 + }, + { + "epoch": 1.2730069484962605, + "grad_norm": 2.9643259048461914, + "learning_rate": 2.878321752506233e-05, + "loss": 0.6318, + "step": 144000 + }, + { + "epoch": 1.2730953517565728, + "grad_norm": 3.976378917694092, + "learning_rate": 2.8781744137390455e-05, + "loss": 0.6519, + "step": 144010 + }, + { + "epoch": 1.273183755016885, + "grad_norm": 1.9537153244018555, + "learning_rate": 2.8780270749718586e-05, + "loss": 0.6273, + "step": 144020 + }, + { + "epoch": 1.2732721582771973, + "grad_norm": 3.5452706813812256, + "learning_rate": 2.877879736204671e-05, + "loss": 0.56, + "step": 144030 + }, + { + "epoch": 1.2733605615375094, + "grad_norm": 1.5345059633255005, + "learning_rate": 2.8777323974374843e-05, + "loss": 0.5701, + "step": 144040 + }, + { + "epoch": 1.2734489647978218, + "grad_norm": 4.666220188140869, + "learning_rate": 2.8775850586702975e-05, + "loss": 0.5635, + "step": 144050 + }, + { + "epoch": 1.273537368058134, + "grad_norm": 2.9292423725128174, + "learning_rate": 2.87743771990311e-05, + "loss": 0.5471, + "step": 144060 + }, + { + "epoch": 1.2736257713184462, + "grad_norm": 1.0052542686462402, + "learning_rate": 2.877290381135923e-05, + "loss": 0.559, + "step": 144070 + }, + { + "epoch": 1.2737141745787586, + "grad_norm": 1.4468069076538086, + "learning_rate": 2.8771430423687363e-05, + "loss": 0.6737, + "step": 144080 + }, + { + "epoch": 1.2738025778390707, + "grad_norm": 1.5976800918579102, + "learning_rate": 2.8769957036015488e-05, + "loss": 0.6002, + "step": 144090 + }, + { + "epoch": 1.2738909810993828, + "grad_norm": 3.1899566650390625, + "learning_rate": 2.876848364834362e-05, + "loss": 0.6077, + "step": 144100 + }, + { + "epoch": 1.2739793843596952, + "grad_norm": 5.289868354797363, + "learning_rate": 2.8767010260671752e-05, + "loss": 0.5639, + "step": 144110 + }, + { + "epoch": 1.2740677876200075, + "grad_norm": 0.67649906873703, + "learning_rate": 2.8765536872999877e-05, + "loss": 0.5683, + "step": 144120 + }, + { + "epoch": 1.2741561908803196, + "grad_norm": 1.6645371913909912, + "learning_rate": 2.876406348532801e-05, + "loss": 0.6473, + "step": 144130 + }, + { + "epoch": 1.274244594140632, + "grad_norm": 1.9654217958450317, + "learning_rate": 2.8762590097656133e-05, + "loss": 0.7002, + "step": 144140 + }, + { + "epoch": 1.274332997400944, + "grad_norm": 7.0221662521362305, + "learning_rate": 2.8761116709984265e-05, + "loss": 0.5752, + "step": 144150 + }, + { + "epoch": 1.2744214006612564, + "grad_norm": 1.2534343004226685, + "learning_rate": 2.8759643322312397e-05, + "loss": 0.6451, + "step": 144160 + }, + { + "epoch": 1.2745098039215685, + "grad_norm": 2.1572318077087402, + "learning_rate": 2.8758169934640522e-05, + "loss": 0.5988, + "step": 144170 + }, + { + "epoch": 1.2745982071818809, + "grad_norm": 1.246850609779358, + "learning_rate": 2.8756696546968654e-05, + "loss": 0.6117, + "step": 144180 + }, + { + "epoch": 1.2746866104421932, + "grad_norm": 1.5250139236450195, + "learning_rate": 2.8755223159296785e-05, + "loss": 0.6395, + "step": 144190 + }, + { + "epoch": 1.2747750137025053, + "grad_norm": 10.86414623260498, + "learning_rate": 2.875374977162491e-05, + "loss": 0.6626, + "step": 144200 + }, + { + "epoch": 1.2748634169628175, + "grad_norm": 16.819005966186523, + "learning_rate": 2.8752276383953042e-05, + "loss": 0.6078, + "step": 144210 + }, + { + "epoch": 1.2749518202231298, + "grad_norm": 6.163760185241699, + "learning_rate": 2.8750802996281174e-05, + "loss": 0.5332, + "step": 144220 + }, + { + "epoch": 1.2750402234834421, + "grad_norm": 5.6115031242370605, + "learning_rate": 2.87493296086093e-05, + "loss": 0.7229, + "step": 144230 + }, + { + "epoch": 1.2751286267437543, + "grad_norm": 1.7008734941482544, + "learning_rate": 2.874785622093743e-05, + "loss": 0.6086, + "step": 144240 + }, + { + "epoch": 1.2752170300040666, + "grad_norm": 4.339953899383545, + "learning_rate": 2.8746382833265562e-05, + "loss": 0.7021, + "step": 144250 + }, + { + "epoch": 1.2753054332643787, + "grad_norm": 1.4352452754974365, + "learning_rate": 2.8744909445593687e-05, + "loss": 0.634, + "step": 144260 + }, + { + "epoch": 1.275393836524691, + "grad_norm": 3.6246347427368164, + "learning_rate": 2.874343605792182e-05, + "loss": 0.5599, + "step": 144270 + }, + { + "epoch": 1.2754822397850032, + "grad_norm": 2.590325355529785, + "learning_rate": 2.8741962670249944e-05, + "loss": 0.5862, + "step": 144280 + }, + { + "epoch": 1.2755706430453155, + "grad_norm": 6.288815498352051, + "learning_rate": 2.8740489282578076e-05, + "loss": 0.5432, + "step": 144290 + }, + { + "epoch": 1.2756590463056279, + "grad_norm": 3.886507749557495, + "learning_rate": 2.8739015894906207e-05, + "loss": 0.5607, + "step": 144300 + }, + { + "epoch": 1.27574744956594, + "grad_norm": 2.6747195720672607, + "learning_rate": 2.8737542507234332e-05, + "loss": 0.5247, + "step": 144310 + }, + { + "epoch": 1.2758358528262521, + "grad_norm": 1.249826192855835, + "learning_rate": 2.8736069119562464e-05, + "loss": 0.676, + "step": 144320 + }, + { + "epoch": 1.2759242560865645, + "grad_norm": 0.8176407217979431, + "learning_rate": 2.8734595731890596e-05, + "loss": 0.621, + "step": 144330 + }, + { + "epoch": 1.2760126593468768, + "grad_norm": 1.9243634939193726, + "learning_rate": 2.873312234421872e-05, + "loss": 0.6846, + "step": 144340 + }, + { + "epoch": 1.276101062607189, + "grad_norm": 2.7360658645629883, + "learning_rate": 2.8731648956546853e-05, + "loss": 0.5991, + "step": 144350 + }, + { + "epoch": 1.2761894658675013, + "grad_norm": 3.5600759983062744, + "learning_rate": 2.8730175568874984e-05, + "loss": 0.6171, + "step": 144360 + }, + { + "epoch": 1.2762778691278134, + "grad_norm": 3.715871572494507, + "learning_rate": 2.872870218120311e-05, + "loss": 0.668, + "step": 144370 + }, + { + "epoch": 1.2763662723881257, + "grad_norm": 15.773343086242676, + "learning_rate": 2.872722879353124e-05, + "loss": 0.6257, + "step": 144380 + }, + { + "epoch": 1.2764546756484378, + "grad_norm": 0.5877245664596558, + "learning_rate": 2.8725755405859366e-05, + "loss": 0.5009, + "step": 144390 + }, + { + "epoch": 1.2765430789087502, + "grad_norm": 22.888269424438477, + "learning_rate": 2.8724282018187498e-05, + "loss": 0.6638, + "step": 144400 + }, + { + "epoch": 1.2766314821690625, + "grad_norm": 2.3428499698638916, + "learning_rate": 2.872280863051563e-05, + "loss": 0.6117, + "step": 144410 + }, + { + "epoch": 1.2767198854293746, + "grad_norm": 3.7248363494873047, + "learning_rate": 2.8721335242843754e-05, + "loss": 0.6009, + "step": 144420 + }, + { + "epoch": 1.2768082886896868, + "grad_norm": 2.8315975666046143, + "learning_rate": 2.8719861855171886e-05, + "loss": 0.4665, + "step": 144430 + }, + { + "epoch": 1.276896691949999, + "grad_norm": 8.859705924987793, + "learning_rate": 2.8718388467500018e-05, + "loss": 0.6391, + "step": 144440 + }, + { + "epoch": 1.2769850952103114, + "grad_norm": 2.327495574951172, + "learning_rate": 2.8716915079828143e-05, + "loss": 0.601, + "step": 144450 + }, + { + "epoch": 1.2770734984706236, + "grad_norm": 1.9837312698364258, + "learning_rate": 2.8715441692156275e-05, + "loss": 0.7083, + "step": 144460 + }, + { + "epoch": 1.277161901730936, + "grad_norm": 1.566441297531128, + "learning_rate": 2.8713968304484406e-05, + "loss": 0.5664, + "step": 144470 + }, + { + "epoch": 1.277250304991248, + "grad_norm": 3.0444397926330566, + "learning_rate": 2.871249491681253e-05, + "loss": 0.5965, + "step": 144480 + }, + { + "epoch": 1.2773387082515604, + "grad_norm": 1.6361877918243408, + "learning_rate": 2.8711021529140663e-05, + "loss": 0.5821, + "step": 144490 + }, + { + "epoch": 1.2774271115118725, + "grad_norm": 1.4208300113677979, + "learning_rate": 2.8709548141468788e-05, + "loss": 0.5275, + "step": 144500 + }, + { + "epoch": 1.2775155147721848, + "grad_norm": 3.640321969985962, + "learning_rate": 2.870807475379692e-05, + "loss": 0.565, + "step": 144510 + }, + { + "epoch": 1.2776039180324972, + "grad_norm": 5.088964939117432, + "learning_rate": 2.870660136612505e-05, + "loss": 0.6934, + "step": 144520 + }, + { + "epoch": 1.2776923212928093, + "grad_norm": 2.4783012866973877, + "learning_rate": 2.8705127978453176e-05, + "loss": 0.5838, + "step": 144530 + }, + { + "epoch": 1.2777807245531214, + "grad_norm": 2.5613768100738525, + "learning_rate": 2.8703654590781308e-05, + "loss": 0.6613, + "step": 144540 + }, + { + "epoch": 1.2778691278134338, + "grad_norm": 1.3631033897399902, + "learning_rate": 2.870218120310944e-05, + "loss": 0.8001, + "step": 144550 + }, + { + "epoch": 1.277957531073746, + "grad_norm": 7.077154159545898, + "learning_rate": 2.8700707815437565e-05, + "loss": 0.6563, + "step": 144560 + }, + { + "epoch": 1.2780459343340582, + "grad_norm": 1.1418441534042358, + "learning_rate": 2.8699234427765697e-05, + "loss": 0.6233, + "step": 144570 + }, + { + "epoch": 1.2781343375943706, + "grad_norm": 7.944869041442871, + "learning_rate": 2.869776104009383e-05, + "loss": 0.4808, + "step": 144580 + }, + { + "epoch": 1.2782227408546827, + "grad_norm": 8.782295227050781, + "learning_rate": 2.8696287652421953e-05, + "loss": 0.6318, + "step": 144590 + }, + { + "epoch": 1.278311144114995, + "grad_norm": 3.41955304145813, + "learning_rate": 2.8694814264750085e-05, + "loss": 0.6655, + "step": 144600 + }, + { + "epoch": 1.2783995473753071, + "grad_norm": 2.343939781188965, + "learning_rate": 2.8693340877078213e-05, + "loss": 0.7034, + "step": 144610 + }, + { + "epoch": 1.2784879506356195, + "grad_norm": 0.9646592736244202, + "learning_rate": 2.8691867489406342e-05, + "loss": 0.706, + "step": 144620 + }, + { + "epoch": 1.2785763538959316, + "grad_norm": 2.6781017780303955, + "learning_rate": 2.8690394101734474e-05, + "loss": 0.5193, + "step": 144630 + }, + { + "epoch": 1.278664757156244, + "grad_norm": 4.066171169281006, + "learning_rate": 2.8688920714062602e-05, + "loss": 0.6338, + "step": 144640 + }, + { + "epoch": 1.278753160416556, + "grad_norm": 10.386735916137695, + "learning_rate": 2.868744732639073e-05, + "loss": 0.6423, + "step": 144650 + }, + { + "epoch": 1.2788415636768684, + "grad_norm": 0.9358965158462524, + "learning_rate": 2.8685973938718862e-05, + "loss": 0.5591, + "step": 144660 + }, + { + "epoch": 1.2789299669371808, + "grad_norm": 4.788538932800293, + "learning_rate": 2.868450055104699e-05, + "loss": 0.5652, + "step": 144670 + }, + { + "epoch": 1.2790183701974929, + "grad_norm": 1.6475518941879272, + "learning_rate": 2.868302716337512e-05, + "loss": 0.5754, + "step": 144680 + }, + { + "epoch": 1.279106773457805, + "grad_norm": 2.2971761226654053, + "learning_rate": 2.868155377570325e-05, + "loss": 0.6217, + "step": 144690 + }, + { + "epoch": 1.2791951767181173, + "grad_norm": 12.338384628295898, + "learning_rate": 2.868008038803138e-05, + "loss": 0.5459, + "step": 144700 + }, + { + "epoch": 1.2792835799784297, + "grad_norm": 14.136381149291992, + "learning_rate": 2.8678607000359507e-05, + "loss": 0.6504, + "step": 144710 + }, + { + "epoch": 1.2793719832387418, + "grad_norm": 1.352890968322754, + "learning_rate": 2.867713361268764e-05, + "loss": 0.6775, + "step": 144720 + }, + { + "epoch": 1.2794603864990541, + "grad_norm": 1.5157339572906494, + "learning_rate": 2.8675660225015767e-05, + "loss": 0.5784, + "step": 144730 + }, + { + "epoch": 1.2795487897593663, + "grad_norm": 2.1194818019866943, + "learning_rate": 2.8674186837343896e-05, + "loss": 0.5803, + "step": 144740 + }, + { + "epoch": 1.2796371930196786, + "grad_norm": 4.2607855796813965, + "learning_rate": 2.8672713449672024e-05, + "loss": 0.5947, + "step": 144750 + }, + { + "epoch": 1.2797255962799907, + "grad_norm": 6.949308395385742, + "learning_rate": 2.8671240062000156e-05, + "loss": 0.6539, + "step": 144760 + }, + { + "epoch": 1.279813999540303, + "grad_norm": 1.5105788707733154, + "learning_rate": 2.8669766674328284e-05, + "loss": 0.5601, + "step": 144770 + }, + { + "epoch": 1.2799024028006154, + "grad_norm": 1.5485703945159912, + "learning_rate": 2.8668293286656412e-05, + "loss": 0.6481, + "step": 144780 + }, + { + "epoch": 1.2799908060609275, + "grad_norm": 2.6597132682800293, + "learning_rate": 2.8666819898984544e-05, + "loss": 0.599, + "step": 144790 + }, + { + "epoch": 1.2800792093212396, + "grad_norm": 3.5043883323669434, + "learning_rate": 2.8665346511312673e-05, + "loss": 0.656, + "step": 144800 + }, + { + "epoch": 1.280167612581552, + "grad_norm": 0.8487949967384338, + "learning_rate": 2.86638731236408e-05, + "loss": 0.5207, + "step": 144810 + }, + { + "epoch": 1.2802560158418643, + "grad_norm": 2.381865978240967, + "learning_rate": 2.8662399735968933e-05, + "loss": 0.6365, + "step": 144820 + }, + { + "epoch": 1.2803444191021764, + "grad_norm": 4.491841793060303, + "learning_rate": 2.866092634829706e-05, + "loss": 0.6697, + "step": 144830 + }, + { + "epoch": 1.2804328223624888, + "grad_norm": 6.116016387939453, + "learning_rate": 2.865945296062519e-05, + "loss": 0.6145, + "step": 144840 + }, + { + "epoch": 1.280521225622801, + "grad_norm": 3.354546546936035, + "learning_rate": 2.865797957295332e-05, + "loss": 0.5288, + "step": 144850 + }, + { + "epoch": 1.2806096288831132, + "grad_norm": 4.211579322814941, + "learning_rate": 2.8656506185281446e-05, + "loss": 0.7173, + "step": 144860 + }, + { + "epoch": 1.2806980321434254, + "grad_norm": 8.3267183303833, + "learning_rate": 2.8655032797609578e-05, + "loss": 0.6671, + "step": 144870 + }, + { + "epoch": 1.2807864354037377, + "grad_norm": 11.029805183410645, + "learning_rate": 2.865355940993771e-05, + "loss": 0.6084, + "step": 144880 + }, + { + "epoch": 1.28087483866405, + "grad_norm": 3.079167366027832, + "learning_rate": 2.8652086022265834e-05, + "loss": 0.6042, + "step": 144890 + }, + { + "epoch": 1.2809632419243622, + "grad_norm": 1.026294231414795, + "learning_rate": 2.8650612634593966e-05, + "loss": 0.5229, + "step": 144900 + }, + { + "epoch": 1.2810516451846743, + "grad_norm": 2.931396245956421, + "learning_rate": 2.8649139246922098e-05, + "loss": 0.6454, + "step": 144910 + }, + { + "epoch": 1.2811400484449866, + "grad_norm": 3.786447048187256, + "learning_rate": 2.8647665859250223e-05, + "loss": 0.599, + "step": 144920 + }, + { + "epoch": 1.281228451705299, + "grad_norm": 2.0971574783325195, + "learning_rate": 2.8646192471578355e-05, + "loss": 0.6693, + "step": 144930 + }, + { + "epoch": 1.281316854965611, + "grad_norm": 4.028332710266113, + "learning_rate": 2.8644719083906486e-05, + "loss": 0.734, + "step": 144940 + }, + { + "epoch": 1.2814052582259234, + "grad_norm": 2.2117221355438232, + "learning_rate": 2.864324569623461e-05, + "loss": 0.5475, + "step": 144950 + }, + { + "epoch": 1.2814936614862356, + "grad_norm": 2.2046737670898438, + "learning_rate": 2.8641772308562743e-05, + "loss": 0.5736, + "step": 144960 + }, + { + "epoch": 1.281582064746548, + "grad_norm": 6.036850452423096, + "learning_rate": 2.8640298920890868e-05, + "loss": 0.6335, + "step": 144970 + }, + { + "epoch": 1.28167046800686, + "grad_norm": 2.936176061630249, + "learning_rate": 2.8638825533219e-05, + "loss": 0.6286, + "step": 144980 + }, + { + "epoch": 1.2817588712671724, + "grad_norm": 3.474818229675293, + "learning_rate": 2.863735214554713e-05, + "loss": 0.5419, + "step": 144990 + }, + { + "epoch": 1.2818472745274847, + "grad_norm": 13.819268226623535, + "learning_rate": 2.8635878757875257e-05, + "loss": 0.7727, + "step": 145000 + }, + { + "epoch": 1.2819356777877968, + "grad_norm": 9.787267684936523, + "learning_rate": 2.8634405370203388e-05, + "loss": 0.5939, + "step": 145010 + }, + { + "epoch": 1.282024081048109, + "grad_norm": 8.66058349609375, + "learning_rate": 2.863293198253152e-05, + "loss": 0.835, + "step": 145020 + }, + { + "epoch": 1.2821124843084213, + "grad_norm": 1.4130849838256836, + "learning_rate": 2.8631458594859645e-05, + "loss": 0.568, + "step": 145030 + }, + { + "epoch": 1.2822008875687336, + "grad_norm": 1.268191933631897, + "learning_rate": 2.8629985207187777e-05, + "loss": 0.4368, + "step": 145040 + }, + { + "epoch": 1.2822892908290457, + "grad_norm": 3.038203239440918, + "learning_rate": 2.862851181951591e-05, + "loss": 0.5463, + "step": 145050 + }, + { + "epoch": 1.282377694089358, + "grad_norm": 5.102221488952637, + "learning_rate": 2.8627038431844033e-05, + "loss": 0.7606, + "step": 145060 + }, + { + "epoch": 1.2824660973496702, + "grad_norm": 2.93377947807312, + "learning_rate": 2.8625565044172165e-05, + "loss": 0.7012, + "step": 145070 + }, + { + "epoch": 1.2825545006099826, + "grad_norm": 3.0152440071105957, + "learning_rate": 2.862409165650029e-05, + "loss": 0.6868, + "step": 145080 + }, + { + "epoch": 1.2826429038702947, + "grad_norm": 0.9875383973121643, + "learning_rate": 2.8622618268828422e-05, + "loss": 0.4813, + "step": 145090 + }, + { + "epoch": 1.282731307130607, + "grad_norm": 2.975111484527588, + "learning_rate": 2.8621144881156554e-05, + "loss": 0.7272, + "step": 145100 + }, + { + "epoch": 1.2828197103909194, + "grad_norm": 2.956238269805908, + "learning_rate": 2.861967149348468e-05, + "loss": 0.6224, + "step": 145110 + }, + { + "epoch": 1.2829081136512315, + "grad_norm": 6.5094895362854, + "learning_rate": 2.861819810581281e-05, + "loss": 0.6055, + "step": 145120 + }, + { + "epoch": 1.2829965169115436, + "grad_norm": 5.5012526512146, + "learning_rate": 2.8616724718140942e-05, + "loss": 0.5386, + "step": 145130 + }, + { + "epoch": 1.283084920171856, + "grad_norm": 3.321120023727417, + "learning_rate": 2.8615251330469067e-05, + "loss": 0.4525, + "step": 145140 + }, + { + "epoch": 1.2831733234321683, + "grad_norm": 2.4823215007781982, + "learning_rate": 2.86137779427972e-05, + "loss": 0.6326, + "step": 145150 + }, + { + "epoch": 1.2832617266924804, + "grad_norm": 9.892343521118164, + "learning_rate": 2.861230455512533e-05, + "loss": 0.5858, + "step": 145160 + }, + { + "epoch": 1.2833501299527927, + "grad_norm": 3.45646071434021, + "learning_rate": 2.8610831167453456e-05, + "loss": 0.5829, + "step": 145170 + }, + { + "epoch": 1.2834385332131049, + "grad_norm": 1.6325308084487915, + "learning_rate": 2.8609357779781587e-05, + "loss": 0.573, + "step": 145180 + }, + { + "epoch": 1.2835269364734172, + "grad_norm": 3.046881675720215, + "learning_rate": 2.860788439210972e-05, + "loss": 0.7265, + "step": 145190 + }, + { + "epoch": 1.2836153397337293, + "grad_norm": 6.056140422821045, + "learning_rate": 2.8606411004437844e-05, + "loss": 0.5527, + "step": 145200 + }, + { + "epoch": 1.2837037429940417, + "grad_norm": 3.9403016567230225, + "learning_rate": 2.8604937616765976e-05, + "loss": 0.5361, + "step": 145210 + }, + { + "epoch": 1.2837921462543538, + "grad_norm": 2.8443801403045654, + "learning_rate": 2.86034642290941e-05, + "loss": 0.5384, + "step": 145220 + }, + { + "epoch": 1.2838805495146661, + "grad_norm": 1.457789659500122, + "learning_rate": 2.8601990841422232e-05, + "loss": 0.5964, + "step": 145230 + }, + { + "epoch": 1.2839689527749782, + "grad_norm": 2.6842362880706787, + "learning_rate": 2.8600517453750364e-05, + "loss": 0.6268, + "step": 145240 + }, + { + "epoch": 1.2840573560352906, + "grad_norm": 2.956540584564209, + "learning_rate": 2.859904406607849e-05, + "loss": 0.6093, + "step": 145250 + }, + { + "epoch": 1.284145759295603, + "grad_norm": 1.686755657196045, + "learning_rate": 2.859757067840662e-05, + "loss": 0.5026, + "step": 145260 + }, + { + "epoch": 1.284234162555915, + "grad_norm": 4.004009246826172, + "learning_rate": 2.8596097290734753e-05, + "loss": 0.5831, + "step": 145270 + }, + { + "epoch": 1.2843225658162272, + "grad_norm": 1.9915940761566162, + "learning_rate": 2.8594623903062878e-05, + "loss": 0.5448, + "step": 145280 + }, + { + "epoch": 1.2844109690765395, + "grad_norm": 3.8886215686798096, + "learning_rate": 2.859315051539101e-05, + "loss": 0.6719, + "step": 145290 + }, + { + "epoch": 1.2844993723368519, + "grad_norm": 2.0705909729003906, + "learning_rate": 2.859167712771914e-05, + "loss": 0.5929, + "step": 145300 + }, + { + "epoch": 1.284587775597164, + "grad_norm": 3.3223421573638916, + "learning_rate": 2.8590203740047266e-05, + "loss": 0.5063, + "step": 145310 + }, + { + "epoch": 1.2846761788574763, + "grad_norm": 9.860546112060547, + "learning_rate": 2.8588730352375398e-05, + "loss": 0.5811, + "step": 145320 + }, + { + "epoch": 1.2847645821177884, + "grad_norm": 0.9417491555213928, + "learning_rate": 2.8587256964703523e-05, + "loss": 0.6186, + "step": 145330 + }, + { + "epoch": 1.2848529853781008, + "grad_norm": 3.1897246837615967, + "learning_rate": 2.8585783577031654e-05, + "loss": 0.6089, + "step": 145340 + }, + { + "epoch": 1.284941388638413, + "grad_norm": 1.8526476621627808, + "learning_rate": 2.8584310189359786e-05, + "loss": 0.6641, + "step": 145350 + }, + { + "epoch": 1.2850297918987252, + "grad_norm": 1.4419277906417847, + "learning_rate": 2.858283680168791e-05, + "loss": 0.6775, + "step": 145360 + }, + { + "epoch": 1.2851181951590376, + "grad_norm": 10.389781951904297, + "learning_rate": 2.8581363414016043e-05, + "loss": 0.5123, + "step": 145370 + }, + { + "epoch": 1.2852065984193497, + "grad_norm": 3.1473987102508545, + "learning_rate": 2.8579890026344175e-05, + "loss": 0.5282, + "step": 145380 + }, + { + "epoch": 1.2852950016796618, + "grad_norm": 1.3255176544189453, + "learning_rate": 2.85784166386723e-05, + "loss": 0.6065, + "step": 145390 + }, + { + "epoch": 1.2853834049399742, + "grad_norm": 6.769455909729004, + "learning_rate": 2.857694325100043e-05, + "loss": 0.6503, + "step": 145400 + }, + { + "epoch": 1.2854718082002865, + "grad_norm": 4.0513505935668945, + "learning_rate": 2.8575469863328563e-05, + "loss": 0.511, + "step": 145410 + }, + { + "epoch": 1.2855602114605986, + "grad_norm": 5.690187454223633, + "learning_rate": 2.8573996475656688e-05, + "loss": 0.6757, + "step": 145420 + }, + { + "epoch": 1.285648614720911, + "grad_norm": 0.7033282518386841, + "learning_rate": 2.857252308798482e-05, + "loss": 0.6104, + "step": 145430 + }, + { + "epoch": 1.285737017981223, + "grad_norm": 3.491544008255005, + "learning_rate": 2.8571049700312945e-05, + "loss": 0.5733, + "step": 145440 + }, + { + "epoch": 1.2858254212415354, + "grad_norm": 1.6139774322509766, + "learning_rate": 2.8569576312641077e-05, + "loss": 0.5101, + "step": 145450 + }, + { + "epoch": 1.2859138245018475, + "grad_norm": 7.899396896362305, + "learning_rate": 2.8568102924969208e-05, + "loss": 0.657, + "step": 145460 + }, + { + "epoch": 1.2860022277621599, + "grad_norm": 2.78837251663208, + "learning_rate": 2.8566629537297333e-05, + "loss": 0.6428, + "step": 145470 + }, + { + "epoch": 1.2860906310224722, + "grad_norm": 11.340784072875977, + "learning_rate": 2.8565156149625465e-05, + "loss": 0.7063, + "step": 145480 + }, + { + "epoch": 1.2861790342827844, + "grad_norm": 9.237870216369629, + "learning_rate": 2.8563682761953597e-05, + "loss": 0.596, + "step": 145490 + }, + { + "epoch": 1.2862674375430965, + "grad_norm": 2.3367531299591064, + "learning_rate": 2.856220937428172e-05, + "loss": 0.5436, + "step": 145500 + }, + { + "epoch": 1.2863558408034088, + "grad_norm": 2.500900983810425, + "learning_rate": 2.8560735986609853e-05, + "loss": 0.5673, + "step": 145510 + }, + { + "epoch": 1.2864442440637212, + "grad_norm": 1.7571214437484741, + "learning_rate": 2.8559262598937985e-05, + "loss": 0.5923, + "step": 145520 + }, + { + "epoch": 1.2865326473240333, + "grad_norm": 2.621816396713257, + "learning_rate": 2.855778921126611e-05, + "loss": 0.6661, + "step": 145530 + }, + { + "epoch": 1.2866210505843456, + "grad_norm": 1.5630762577056885, + "learning_rate": 2.8556315823594242e-05, + "loss": 0.6048, + "step": 145540 + }, + { + "epoch": 1.2867094538446577, + "grad_norm": 8.00423812866211, + "learning_rate": 2.8554842435922374e-05, + "loss": 0.6142, + "step": 145550 + }, + { + "epoch": 1.28679785710497, + "grad_norm": 1.4410505294799805, + "learning_rate": 2.85533690482505e-05, + "loss": 0.5269, + "step": 145560 + }, + { + "epoch": 1.2868862603652822, + "grad_norm": 3.0488429069519043, + "learning_rate": 2.855189566057863e-05, + "loss": 0.5433, + "step": 145570 + }, + { + "epoch": 1.2869746636255945, + "grad_norm": 6.676955223083496, + "learning_rate": 2.855042227290676e-05, + "loss": 0.5935, + "step": 145580 + }, + { + "epoch": 1.2870630668859069, + "grad_norm": 4.665261745452881, + "learning_rate": 2.8548948885234887e-05, + "loss": 0.6498, + "step": 145590 + }, + { + "epoch": 1.287151470146219, + "grad_norm": 3.844132661819458, + "learning_rate": 2.854747549756302e-05, + "loss": 0.6117, + "step": 145600 + }, + { + "epoch": 1.2872398734065311, + "grad_norm": 7.462503433227539, + "learning_rate": 2.8546002109891147e-05, + "loss": 0.5184, + "step": 145610 + }, + { + "epoch": 1.2873282766668435, + "grad_norm": 2.0592591762542725, + "learning_rate": 2.8544528722219275e-05, + "loss": 0.5423, + "step": 145620 + }, + { + "epoch": 1.2874166799271558, + "grad_norm": 7.324609279632568, + "learning_rate": 2.8543055334547407e-05, + "loss": 0.613, + "step": 145630 + }, + { + "epoch": 1.287505083187468, + "grad_norm": 1.4013365507125854, + "learning_rate": 2.8541581946875536e-05, + "loss": 0.7144, + "step": 145640 + }, + { + "epoch": 1.2875934864477803, + "grad_norm": 7.762128829956055, + "learning_rate": 2.8540108559203664e-05, + "loss": 0.6142, + "step": 145650 + }, + { + "epoch": 1.2876818897080924, + "grad_norm": 2.950435161590576, + "learning_rate": 2.8538635171531796e-05, + "loss": 0.7119, + "step": 145660 + }, + { + "epoch": 1.2877702929684047, + "grad_norm": 6.425849437713623, + "learning_rate": 2.8537161783859924e-05, + "loss": 0.6833, + "step": 145670 + }, + { + "epoch": 1.2878586962287168, + "grad_norm": 2.2536520957946777, + "learning_rate": 2.8535688396188052e-05, + "loss": 0.6732, + "step": 145680 + }, + { + "epoch": 1.2879470994890292, + "grad_norm": 16.938657760620117, + "learning_rate": 2.853421500851618e-05, + "loss": 0.6439, + "step": 145690 + }, + { + "epoch": 1.2880355027493415, + "grad_norm": 7.789825916290283, + "learning_rate": 2.8532741620844312e-05, + "loss": 0.6726, + "step": 145700 + }, + { + "epoch": 1.2881239060096537, + "grad_norm": 3.132148504257202, + "learning_rate": 2.853126823317244e-05, + "loss": 0.5345, + "step": 145710 + }, + { + "epoch": 1.2882123092699658, + "grad_norm": 2.9491019248962402, + "learning_rate": 2.852979484550057e-05, + "loss": 0.6191, + "step": 145720 + }, + { + "epoch": 1.2883007125302781, + "grad_norm": 0.851676881313324, + "learning_rate": 2.85283214578287e-05, + "loss": 0.6708, + "step": 145730 + }, + { + "epoch": 1.2883891157905905, + "grad_norm": 2.125684976577759, + "learning_rate": 2.852684807015683e-05, + "loss": 0.6749, + "step": 145740 + }, + { + "epoch": 1.2884775190509026, + "grad_norm": 7.701344013214111, + "learning_rate": 2.8525374682484958e-05, + "loss": 0.5242, + "step": 145750 + }, + { + "epoch": 1.288565922311215, + "grad_norm": 7.966821193695068, + "learning_rate": 2.852390129481309e-05, + "loss": 0.5904, + "step": 145760 + }, + { + "epoch": 1.288654325571527, + "grad_norm": 6.028903961181641, + "learning_rate": 2.8522427907141218e-05, + "loss": 0.554, + "step": 145770 + }, + { + "epoch": 1.2887427288318394, + "grad_norm": 1.3743746280670166, + "learning_rate": 2.8520954519469346e-05, + "loss": 0.6797, + "step": 145780 + }, + { + "epoch": 1.2888311320921515, + "grad_norm": 1.4279412031173706, + "learning_rate": 2.8519481131797478e-05, + "loss": 0.5223, + "step": 145790 + }, + { + "epoch": 1.2889195353524638, + "grad_norm": 2.4263710975646973, + "learning_rate": 2.8518007744125603e-05, + "loss": 0.5801, + "step": 145800 + }, + { + "epoch": 1.289007938612776, + "grad_norm": 7.836480617523193, + "learning_rate": 2.8516534356453735e-05, + "loss": 0.702, + "step": 145810 + }, + { + "epoch": 1.2890963418730883, + "grad_norm": 13.650886535644531, + "learning_rate": 2.8515060968781866e-05, + "loss": 0.5048, + "step": 145820 + }, + { + "epoch": 1.2891847451334004, + "grad_norm": 4.52866268157959, + "learning_rate": 2.851358758110999e-05, + "loss": 0.7947, + "step": 145830 + }, + { + "epoch": 1.2892731483937128, + "grad_norm": 6.007741928100586, + "learning_rate": 2.8512114193438123e-05, + "loss": 0.5939, + "step": 145840 + }, + { + "epoch": 1.289361551654025, + "grad_norm": 11.088205337524414, + "learning_rate": 2.8510640805766255e-05, + "loss": 0.5385, + "step": 145850 + }, + { + "epoch": 1.2894499549143372, + "grad_norm": 2.0562798976898193, + "learning_rate": 2.850916741809438e-05, + "loss": 0.6746, + "step": 145860 + }, + { + "epoch": 1.2895383581746493, + "grad_norm": 1.281401515007019, + "learning_rate": 2.850769403042251e-05, + "loss": 0.4941, + "step": 145870 + }, + { + "epoch": 1.2896267614349617, + "grad_norm": 7.045774459838867, + "learning_rate": 2.8506220642750643e-05, + "loss": 0.5254, + "step": 145880 + }, + { + "epoch": 1.289715164695274, + "grad_norm": 4.512178421020508, + "learning_rate": 2.8504747255078768e-05, + "loss": 0.5488, + "step": 145890 + }, + { + "epoch": 1.2898035679555861, + "grad_norm": 2.099147319793701, + "learning_rate": 2.85032738674069e-05, + "loss": 0.6524, + "step": 145900 + }, + { + "epoch": 1.2898919712158985, + "grad_norm": 3.3113958835601807, + "learning_rate": 2.8501800479735025e-05, + "loss": 0.6098, + "step": 145910 + }, + { + "epoch": 1.2899803744762106, + "grad_norm": 4.148066997528076, + "learning_rate": 2.8500327092063157e-05, + "loss": 0.5473, + "step": 145920 + }, + { + "epoch": 1.290068777736523, + "grad_norm": 2.951728582382202, + "learning_rate": 2.849885370439129e-05, + "loss": 0.6273, + "step": 145930 + }, + { + "epoch": 1.290157180996835, + "grad_norm": 2.621436595916748, + "learning_rate": 2.8497380316719413e-05, + "loss": 0.6373, + "step": 145940 + }, + { + "epoch": 1.2902455842571474, + "grad_norm": 1.4980254173278809, + "learning_rate": 2.8495906929047545e-05, + "loss": 0.5354, + "step": 145950 + }, + { + "epoch": 1.2903339875174598, + "grad_norm": 5.7778239250183105, + "learning_rate": 2.8494433541375677e-05, + "loss": 0.586, + "step": 145960 + }, + { + "epoch": 1.2904223907777719, + "grad_norm": 1.3316553831100464, + "learning_rate": 2.8492960153703802e-05, + "loss": 0.6285, + "step": 145970 + }, + { + "epoch": 1.290510794038084, + "grad_norm": 5.821597099304199, + "learning_rate": 2.8491486766031934e-05, + "loss": 0.5095, + "step": 145980 + }, + { + "epoch": 1.2905991972983963, + "grad_norm": 1.9012861251831055, + "learning_rate": 2.8490013378360065e-05, + "loss": 0.5334, + "step": 145990 + }, + { + "epoch": 1.2906876005587087, + "grad_norm": 1.3801084756851196, + "learning_rate": 2.848853999068819e-05, + "loss": 0.6532, + "step": 146000 + }, + { + "epoch": 1.2907760038190208, + "grad_norm": 2.3948774337768555, + "learning_rate": 2.8487066603016322e-05, + "loss": 0.6381, + "step": 146010 + }, + { + "epoch": 1.2908644070793331, + "grad_norm": 1.9329140186309814, + "learning_rate": 2.8485593215344454e-05, + "loss": 0.6742, + "step": 146020 + }, + { + "epoch": 1.2909528103396453, + "grad_norm": 0.8544408679008484, + "learning_rate": 2.848411982767258e-05, + "loss": 0.5949, + "step": 146030 + }, + { + "epoch": 1.2910412135999576, + "grad_norm": 2.083409309387207, + "learning_rate": 2.848264644000071e-05, + "loss": 0.5571, + "step": 146040 + }, + { + "epoch": 1.2911296168602697, + "grad_norm": 1.5912474393844604, + "learning_rate": 2.8481173052328835e-05, + "loss": 0.46, + "step": 146050 + }, + { + "epoch": 1.291218020120582, + "grad_norm": 3.4063994884490967, + "learning_rate": 2.8479699664656967e-05, + "loss": 0.6153, + "step": 146060 + }, + { + "epoch": 1.2913064233808944, + "grad_norm": 4.82628870010376, + "learning_rate": 2.84782262769851e-05, + "loss": 0.4818, + "step": 146070 + }, + { + "epoch": 1.2913948266412065, + "grad_norm": 5.40212869644165, + "learning_rate": 2.8476752889313224e-05, + "loss": 0.6233, + "step": 146080 + }, + { + "epoch": 1.2914832299015186, + "grad_norm": 1.9495534896850586, + "learning_rate": 2.8475279501641356e-05, + "loss": 0.585, + "step": 146090 + }, + { + "epoch": 1.291571633161831, + "grad_norm": 6.7265305519104, + "learning_rate": 2.8473806113969487e-05, + "loss": 0.725, + "step": 146100 + }, + { + "epoch": 1.2916600364221433, + "grad_norm": 0.8092676997184753, + "learning_rate": 2.8472332726297612e-05, + "loss": 0.5393, + "step": 146110 + }, + { + "epoch": 1.2917484396824555, + "grad_norm": 2.290491819381714, + "learning_rate": 2.8470859338625744e-05, + "loss": 0.4181, + "step": 146120 + }, + { + "epoch": 1.2918368429427678, + "grad_norm": 8.55141830444336, + "learning_rate": 2.8469385950953876e-05, + "loss": 0.7677, + "step": 146130 + }, + { + "epoch": 1.29192524620308, + "grad_norm": 2.789681911468506, + "learning_rate": 2.8467912563282e-05, + "loss": 0.5504, + "step": 146140 + }, + { + "epoch": 1.2920136494633923, + "grad_norm": 7.410167217254639, + "learning_rate": 2.8466439175610132e-05, + "loss": 0.6975, + "step": 146150 + }, + { + "epoch": 1.2921020527237044, + "grad_norm": 6.615194797515869, + "learning_rate": 2.8464965787938257e-05, + "loss": 0.671, + "step": 146160 + }, + { + "epoch": 1.2921904559840167, + "grad_norm": 1.2965937852859497, + "learning_rate": 2.846349240026639e-05, + "loss": 0.4572, + "step": 146170 + }, + { + "epoch": 1.292278859244329, + "grad_norm": 8.206915855407715, + "learning_rate": 2.846201901259452e-05, + "loss": 0.5175, + "step": 146180 + }, + { + "epoch": 1.2923672625046412, + "grad_norm": 6.692469120025635, + "learning_rate": 2.8460545624922646e-05, + "loss": 0.5762, + "step": 146190 + }, + { + "epoch": 1.2924556657649533, + "grad_norm": 5.608756065368652, + "learning_rate": 2.8459072237250778e-05, + "loss": 0.5313, + "step": 146200 + }, + { + "epoch": 1.2925440690252656, + "grad_norm": 1.3925907611846924, + "learning_rate": 2.845759884957891e-05, + "loss": 0.6933, + "step": 146210 + }, + { + "epoch": 1.292632472285578, + "grad_norm": 6.7219014167785645, + "learning_rate": 2.8456125461907034e-05, + "loss": 0.5021, + "step": 146220 + }, + { + "epoch": 1.29272087554589, + "grad_norm": 1.2453153133392334, + "learning_rate": 2.8454652074235166e-05, + "loss": 0.7405, + "step": 146230 + }, + { + "epoch": 1.2928092788062024, + "grad_norm": 2.447101593017578, + "learning_rate": 2.8453178686563298e-05, + "loss": 0.5882, + "step": 146240 + }, + { + "epoch": 1.2928976820665146, + "grad_norm": 3.116987466812134, + "learning_rate": 2.8451705298891423e-05, + "loss": 0.6017, + "step": 146250 + }, + { + "epoch": 1.292986085326827, + "grad_norm": 2.5585999488830566, + "learning_rate": 2.8450231911219555e-05, + "loss": 0.6149, + "step": 146260 + }, + { + "epoch": 1.293074488587139, + "grad_norm": 1.6837371587753296, + "learning_rate": 2.844875852354768e-05, + "loss": 0.5378, + "step": 146270 + }, + { + "epoch": 1.2931628918474514, + "grad_norm": 1.533752679824829, + "learning_rate": 2.844728513587581e-05, + "loss": 0.5095, + "step": 146280 + }, + { + "epoch": 1.2932512951077637, + "grad_norm": 3.9867093563079834, + "learning_rate": 2.8445811748203943e-05, + "loss": 0.6708, + "step": 146290 + }, + { + "epoch": 1.2933396983680758, + "grad_norm": 2.2079825401306152, + "learning_rate": 2.8444338360532068e-05, + "loss": 0.7006, + "step": 146300 + }, + { + "epoch": 1.293428101628388, + "grad_norm": 2.876032829284668, + "learning_rate": 2.84428649728602e-05, + "loss": 0.604, + "step": 146310 + }, + { + "epoch": 1.2935165048887003, + "grad_norm": 1.8037502765655518, + "learning_rate": 2.844139158518833e-05, + "loss": 0.6907, + "step": 146320 + }, + { + "epoch": 1.2936049081490126, + "grad_norm": 5.662286758422852, + "learning_rate": 2.8439918197516456e-05, + "loss": 0.5766, + "step": 146330 + }, + { + "epoch": 1.2936933114093248, + "grad_norm": 5.239223003387451, + "learning_rate": 2.8438444809844588e-05, + "loss": 0.5212, + "step": 146340 + }, + { + "epoch": 1.293781714669637, + "grad_norm": 1.3765106201171875, + "learning_rate": 2.843697142217272e-05, + "loss": 0.6878, + "step": 146350 + }, + { + "epoch": 1.2938701179299492, + "grad_norm": 3.2985055446624756, + "learning_rate": 2.8435498034500845e-05, + "loss": 0.6275, + "step": 146360 + }, + { + "epoch": 1.2939585211902616, + "grad_norm": 1.4100604057312012, + "learning_rate": 2.8434024646828977e-05, + "loss": 0.6671, + "step": 146370 + }, + { + "epoch": 1.2940469244505737, + "grad_norm": 4.344699382781982, + "learning_rate": 2.84325512591571e-05, + "loss": 0.6651, + "step": 146380 + }, + { + "epoch": 1.294135327710886, + "grad_norm": 5.090414047241211, + "learning_rate": 2.8431077871485233e-05, + "loss": 0.6956, + "step": 146390 + }, + { + "epoch": 1.2942237309711981, + "grad_norm": 1.7224764823913574, + "learning_rate": 2.8429604483813365e-05, + "loss": 0.5881, + "step": 146400 + }, + { + "epoch": 1.2943121342315105, + "grad_norm": 3.1565380096435547, + "learning_rate": 2.842813109614149e-05, + "loss": 0.7066, + "step": 146410 + }, + { + "epoch": 1.2944005374918226, + "grad_norm": 8.755756378173828, + "learning_rate": 2.8426657708469622e-05, + "loss": 0.65, + "step": 146420 + }, + { + "epoch": 1.294488940752135, + "grad_norm": 1.3175544738769531, + "learning_rate": 2.8425184320797753e-05, + "loss": 0.5405, + "step": 146430 + }, + { + "epoch": 1.2945773440124473, + "grad_norm": 1.4170767068862915, + "learning_rate": 2.842371093312588e-05, + "loss": 0.6276, + "step": 146440 + }, + { + "epoch": 1.2946657472727594, + "grad_norm": 2.0644989013671875, + "learning_rate": 2.842223754545401e-05, + "loss": 0.5847, + "step": 146450 + }, + { + "epoch": 1.2947541505330715, + "grad_norm": 3.948702812194824, + "learning_rate": 2.8420764157782142e-05, + "loss": 0.6221, + "step": 146460 + }, + { + "epoch": 1.2948425537933839, + "grad_norm": 4.97209358215332, + "learning_rate": 2.8419290770110267e-05, + "loss": 0.6305, + "step": 146470 + }, + { + "epoch": 1.2949309570536962, + "grad_norm": 2.394826889038086, + "learning_rate": 2.84178173824384e-05, + "loss": 0.6793, + "step": 146480 + }, + { + "epoch": 1.2950193603140083, + "grad_norm": 1.4477965831756592, + "learning_rate": 2.841634399476653e-05, + "loss": 0.7527, + "step": 146490 + }, + { + "epoch": 1.2951077635743207, + "grad_norm": 2.1424484252929688, + "learning_rate": 2.8414870607094655e-05, + "loss": 0.5579, + "step": 146500 + }, + { + "epoch": 1.2951961668346328, + "grad_norm": 5.241677284240723, + "learning_rate": 2.8413397219422787e-05, + "loss": 0.5501, + "step": 146510 + }, + { + "epoch": 1.2952845700949451, + "grad_norm": 1.9471867084503174, + "learning_rate": 2.8411923831750915e-05, + "loss": 0.6399, + "step": 146520 + }, + { + "epoch": 1.2953729733552573, + "grad_norm": 10.179215431213379, + "learning_rate": 2.8410450444079044e-05, + "loss": 0.5723, + "step": 146530 + }, + { + "epoch": 1.2954613766155696, + "grad_norm": 2.3967907428741455, + "learning_rate": 2.8408977056407176e-05, + "loss": 0.4262, + "step": 146540 + }, + { + "epoch": 1.295549779875882, + "grad_norm": 1.2617415189743042, + "learning_rate": 2.8407503668735304e-05, + "loss": 0.4513, + "step": 146550 + }, + { + "epoch": 1.295638183136194, + "grad_norm": 1.8135316371917725, + "learning_rate": 2.8406030281063432e-05, + "loss": 0.5743, + "step": 146560 + }, + { + "epoch": 1.2957265863965062, + "grad_norm": 2.815929651260376, + "learning_rate": 2.8404556893391564e-05, + "loss": 0.7244, + "step": 146570 + }, + { + "epoch": 1.2958149896568185, + "grad_norm": 2.680691957473755, + "learning_rate": 2.8403083505719692e-05, + "loss": 0.6356, + "step": 146580 + }, + { + "epoch": 1.2959033929171309, + "grad_norm": 2.231788396835327, + "learning_rate": 2.840161011804782e-05, + "loss": 0.5356, + "step": 146590 + }, + { + "epoch": 1.295991796177443, + "grad_norm": 1.815050721168518, + "learning_rate": 2.8400136730375952e-05, + "loss": 0.5368, + "step": 146600 + }, + { + "epoch": 1.2960801994377553, + "grad_norm": 1.2623122930526733, + "learning_rate": 2.839866334270408e-05, + "loss": 0.7523, + "step": 146610 + }, + { + "epoch": 1.2961686026980674, + "grad_norm": 1.5876820087432861, + "learning_rate": 2.839718995503221e-05, + "loss": 0.7272, + "step": 146620 + }, + { + "epoch": 1.2962570059583798, + "grad_norm": 1.8999567031860352, + "learning_rate": 2.8395716567360337e-05, + "loss": 0.6806, + "step": 146630 + }, + { + "epoch": 1.296345409218692, + "grad_norm": 4.445340156555176, + "learning_rate": 2.839424317968847e-05, + "loss": 0.5926, + "step": 146640 + }, + { + "epoch": 1.2964338124790042, + "grad_norm": 19.329952239990234, + "learning_rate": 2.8392769792016598e-05, + "loss": 0.6271, + "step": 146650 + }, + { + "epoch": 1.2965222157393166, + "grad_norm": 1.148324966430664, + "learning_rate": 2.8391296404344726e-05, + "loss": 0.5235, + "step": 146660 + }, + { + "epoch": 1.2966106189996287, + "grad_norm": 2.467998743057251, + "learning_rate": 2.8389823016672858e-05, + "loss": 0.5469, + "step": 146670 + }, + { + "epoch": 1.2966990222599408, + "grad_norm": 1.8557848930358887, + "learning_rate": 2.8388349629000986e-05, + "loss": 0.6129, + "step": 146680 + }, + { + "epoch": 1.2967874255202532, + "grad_norm": 8.01926326751709, + "learning_rate": 2.8386876241329114e-05, + "loss": 0.7484, + "step": 146690 + }, + { + "epoch": 1.2968758287805655, + "grad_norm": 5.732945442199707, + "learning_rate": 2.8385402853657246e-05, + "loss": 0.5079, + "step": 146700 + }, + { + "epoch": 1.2969642320408776, + "grad_norm": 2.4821481704711914, + "learning_rate": 2.8383929465985374e-05, + "loss": 0.6717, + "step": 146710 + }, + { + "epoch": 1.29705263530119, + "grad_norm": 6.604349613189697, + "learning_rate": 2.8382456078313503e-05, + "loss": 0.6876, + "step": 146720 + }, + { + "epoch": 1.297141038561502, + "grad_norm": 2.479754686355591, + "learning_rate": 2.8380982690641635e-05, + "loss": 0.5344, + "step": 146730 + }, + { + "epoch": 1.2972294418218144, + "grad_norm": 3.167471170425415, + "learning_rate": 2.837950930296976e-05, + "loss": 0.5937, + "step": 146740 + }, + { + "epoch": 1.2973178450821266, + "grad_norm": 2.060343027114868, + "learning_rate": 2.837803591529789e-05, + "loss": 0.5797, + "step": 146750 + }, + { + "epoch": 1.297406248342439, + "grad_norm": 4.025713920593262, + "learning_rate": 2.8376562527626023e-05, + "loss": 0.6287, + "step": 146760 + }, + { + "epoch": 1.2974946516027512, + "grad_norm": 6.159024715423584, + "learning_rate": 2.8375089139954148e-05, + "loss": 0.689, + "step": 146770 + }, + { + "epoch": 1.2975830548630634, + "grad_norm": 1.5408360958099365, + "learning_rate": 2.837361575228228e-05, + "loss": 0.505, + "step": 146780 + }, + { + "epoch": 1.2976714581233755, + "grad_norm": 2.154221534729004, + "learning_rate": 2.837214236461041e-05, + "loss": 0.6672, + "step": 146790 + }, + { + "epoch": 1.2977598613836878, + "grad_norm": 8.126728057861328, + "learning_rate": 2.8370668976938536e-05, + "loss": 0.6237, + "step": 146800 + }, + { + "epoch": 1.2978482646440002, + "grad_norm": 7.023776054382324, + "learning_rate": 2.8369195589266668e-05, + "loss": 0.4704, + "step": 146810 + }, + { + "epoch": 1.2979366679043123, + "grad_norm": 1.201236367225647, + "learning_rate": 2.83677222015948e-05, + "loss": 0.4981, + "step": 146820 + }, + { + "epoch": 1.2980250711646246, + "grad_norm": 5.6378655433654785, + "learning_rate": 2.8366248813922925e-05, + "loss": 0.675, + "step": 146830 + }, + { + "epoch": 1.2981134744249367, + "grad_norm": 6.840307712554932, + "learning_rate": 2.8364775426251057e-05, + "loss": 0.7116, + "step": 146840 + }, + { + "epoch": 1.298201877685249, + "grad_norm": 6.086009502410889, + "learning_rate": 2.836330203857918e-05, + "loss": 0.5584, + "step": 146850 + }, + { + "epoch": 1.2982902809455612, + "grad_norm": 3.1375789642333984, + "learning_rate": 2.8361828650907313e-05, + "loss": 0.4644, + "step": 146860 + }, + { + "epoch": 1.2983786842058735, + "grad_norm": 2.1103127002716064, + "learning_rate": 2.8360355263235445e-05, + "loss": 0.5786, + "step": 146870 + }, + { + "epoch": 1.2984670874661859, + "grad_norm": 7.147364139556885, + "learning_rate": 2.835888187556357e-05, + "loss": 0.5104, + "step": 146880 + }, + { + "epoch": 1.298555490726498, + "grad_norm": 1.9633492231369019, + "learning_rate": 2.8357408487891702e-05, + "loss": 0.6729, + "step": 146890 + }, + { + "epoch": 1.2986438939868101, + "grad_norm": 3.0465798377990723, + "learning_rate": 2.8355935100219834e-05, + "loss": 0.5347, + "step": 146900 + }, + { + "epoch": 1.2987322972471225, + "grad_norm": 2.7077319622039795, + "learning_rate": 2.835446171254796e-05, + "loss": 0.5477, + "step": 146910 + }, + { + "epoch": 1.2988207005074348, + "grad_norm": 1.6707226037979126, + "learning_rate": 2.835298832487609e-05, + "loss": 0.6702, + "step": 146920 + }, + { + "epoch": 1.298909103767747, + "grad_norm": 4.2694573402404785, + "learning_rate": 2.8351514937204222e-05, + "loss": 0.7365, + "step": 146930 + }, + { + "epoch": 1.2989975070280593, + "grad_norm": 2.267035961151123, + "learning_rate": 2.8350041549532347e-05, + "loss": 0.6377, + "step": 146940 + }, + { + "epoch": 1.2990859102883714, + "grad_norm": 1.021494746208191, + "learning_rate": 2.834856816186048e-05, + "loss": 0.5388, + "step": 146950 + }, + { + "epoch": 1.2991743135486837, + "grad_norm": 6.940701484680176, + "learning_rate": 2.834709477418861e-05, + "loss": 0.59, + "step": 146960 + }, + { + "epoch": 1.2992627168089959, + "grad_norm": 2.500112771987915, + "learning_rate": 2.8345621386516735e-05, + "loss": 0.5426, + "step": 146970 + }, + { + "epoch": 1.2993511200693082, + "grad_norm": 1.5883054733276367, + "learning_rate": 2.8344147998844867e-05, + "loss": 0.6212, + "step": 146980 + }, + { + "epoch": 1.2994395233296203, + "grad_norm": 2.620814800262451, + "learning_rate": 2.8342674611172992e-05, + "loss": 0.6575, + "step": 146990 + }, + { + "epoch": 1.2995279265899327, + "grad_norm": 2.0864148139953613, + "learning_rate": 2.8341201223501124e-05, + "loss": 0.5719, + "step": 147000 + }, + { + "epoch": 1.2996163298502448, + "grad_norm": 1.3372048139572144, + "learning_rate": 2.8339727835829256e-05, + "loss": 0.6043, + "step": 147010 + }, + { + "epoch": 1.2997047331105571, + "grad_norm": 2.6868886947631836, + "learning_rate": 2.833825444815738e-05, + "loss": 0.5885, + "step": 147020 + }, + { + "epoch": 1.2997931363708695, + "grad_norm": 1.4888969659805298, + "learning_rate": 2.8336781060485512e-05, + "loss": 0.5749, + "step": 147030 + }, + { + "epoch": 1.2998815396311816, + "grad_norm": 2.819533109664917, + "learning_rate": 2.8335307672813644e-05, + "loss": 0.5596, + "step": 147040 + }, + { + "epoch": 1.2999699428914937, + "grad_norm": 1.919136643409729, + "learning_rate": 2.833383428514177e-05, + "loss": 0.5749, + "step": 147050 + }, + { + "epoch": 1.300058346151806, + "grad_norm": 5.6132917404174805, + "learning_rate": 2.83323608974699e-05, + "loss": 0.603, + "step": 147060 + }, + { + "epoch": 1.3001467494121184, + "grad_norm": 12.341681480407715, + "learning_rate": 2.8330887509798033e-05, + "loss": 0.702, + "step": 147070 + }, + { + "epoch": 1.3002351526724305, + "grad_norm": 11.892692565917969, + "learning_rate": 2.8329414122126157e-05, + "loss": 0.5466, + "step": 147080 + }, + { + "epoch": 1.3003235559327428, + "grad_norm": 2.8739593029022217, + "learning_rate": 2.832794073445429e-05, + "loss": 0.5766, + "step": 147090 + }, + { + "epoch": 1.300411959193055, + "grad_norm": 8.932125091552734, + "learning_rate": 2.8326467346782414e-05, + "loss": 0.6745, + "step": 147100 + }, + { + "epoch": 1.3005003624533673, + "grad_norm": 2.2936980724334717, + "learning_rate": 2.8324993959110546e-05, + "loss": 0.5872, + "step": 147110 + }, + { + "epoch": 1.3005887657136794, + "grad_norm": 2.2789089679718018, + "learning_rate": 2.8323520571438678e-05, + "loss": 0.5136, + "step": 147120 + }, + { + "epoch": 1.3006771689739918, + "grad_norm": 3.512072801589966, + "learning_rate": 2.8322047183766803e-05, + "loss": 0.5661, + "step": 147130 + }, + { + "epoch": 1.3007655722343041, + "grad_norm": 2.171337604522705, + "learning_rate": 2.8320573796094934e-05, + "loss": 0.5836, + "step": 147140 + }, + { + "epoch": 1.3008539754946162, + "grad_norm": 2.246044635772705, + "learning_rate": 2.8319100408423066e-05, + "loss": 0.5959, + "step": 147150 + }, + { + "epoch": 1.3009423787549284, + "grad_norm": 1.1269278526306152, + "learning_rate": 2.831762702075119e-05, + "loss": 0.5188, + "step": 147160 + }, + { + "epoch": 1.3010307820152407, + "grad_norm": 3.1443018913269043, + "learning_rate": 2.8316153633079323e-05, + "loss": 0.5548, + "step": 147170 + }, + { + "epoch": 1.301119185275553, + "grad_norm": 4.107034683227539, + "learning_rate": 2.8314680245407455e-05, + "loss": 0.7024, + "step": 147180 + }, + { + "epoch": 1.3012075885358652, + "grad_norm": 0.7410919666290283, + "learning_rate": 2.831320685773558e-05, + "loss": 0.5675, + "step": 147190 + }, + { + "epoch": 1.3012959917961775, + "grad_norm": 2.4453177452087402, + "learning_rate": 2.831173347006371e-05, + "loss": 0.646, + "step": 147200 + }, + { + "epoch": 1.3013843950564896, + "grad_norm": 3.1775753498077393, + "learning_rate": 2.8310260082391836e-05, + "loss": 0.5698, + "step": 147210 + }, + { + "epoch": 1.301472798316802, + "grad_norm": 3.8935375213623047, + "learning_rate": 2.8308786694719968e-05, + "loss": 0.581, + "step": 147220 + }, + { + "epoch": 1.301561201577114, + "grad_norm": 1.562652587890625, + "learning_rate": 2.83073133070481e-05, + "loss": 0.6022, + "step": 147230 + }, + { + "epoch": 1.3016496048374264, + "grad_norm": 9.213784217834473, + "learning_rate": 2.8305839919376225e-05, + "loss": 0.6211, + "step": 147240 + }, + { + "epoch": 1.3017380080977388, + "grad_norm": 2.685655117034912, + "learning_rate": 2.8304366531704356e-05, + "loss": 0.5941, + "step": 147250 + }, + { + "epoch": 1.3018264113580509, + "grad_norm": 1.4757558107376099, + "learning_rate": 2.8302893144032488e-05, + "loss": 0.508, + "step": 147260 + }, + { + "epoch": 1.301914814618363, + "grad_norm": 1.7871500253677368, + "learning_rate": 2.8301419756360613e-05, + "loss": 0.7016, + "step": 147270 + }, + { + "epoch": 1.3020032178786753, + "grad_norm": 1.066924810409546, + "learning_rate": 2.8299946368688745e-05, + "loss": 0.6228, + "step": 147280 + }, + { + "epoch": 1.3020916211389877, + "grad_norm": 2.3574273586273193, + "learning_rate": 2.8298472981016877e-05, + "loss": 0.6102, + "step": 147290 + }, + { + "epoch": 1.3021800243992998, + "grad_norm": 8.243714332580566, + "learning_rate": 2.8296999593345e-05, + "loss": 0.6085, + "step": 147300 + }, + { + "epoch": 1.3022684276596121, + "grad_norm": 2.1952390670776367, + "learning_rate": 2.8295526205673133e-05, + "loss": 0.5912, + "step": 147310 + }, + { + "epoch": 1.3023568309199243, + "grad_norm": 2.745427370071411, + "learning_rate": 2.8294052818001258e-05, + "loss": 0.6867, + "step": 147320 + }, + { + "epoch": 1.3024452341802366, + "grad_norm": 1.7434247732162476, + "learning_rate": 2.829257943032939e-05, + "loss": 0.6049, + "step": 147330 + }, + { + "epoch": 1.3025336374405487, + "grad_norm": 2.7690441608428955, + "learning_rate": 2.8291106042657522e-05, + "loss": 0.6213, + "step": 147340 + }, + { + "epoch": 1.302622040700861, + "grad_norm": 3.6132118701934814, + "learning_rate": 2.8289632654985647e-05, + "loss": 0.6111, + "step": 147350 + }, + { + "epoch": 1.3027104439611734, + "grad_norm": 1.6341034173965454, + "learning_rate": 2.828815926731378e-05, + "loss": 0.5975, + "step": 147360 + }, + { + "epoch": 1.3027988472214855, + "grad_norm": 5.649781703948975, + "learning_rate": 2.828668587964191e-05, + "loss": 0.6634, + "step": 147370 + }, + { + "epoch": 1.3028872504817977, + "grad_norm": 1.2415521144866943, + "learning_rate": 2.8285212491970035e-05, + "loss": 0.4889, + "step": 147380 + }, + { + "epoch": 1.30297565374211, + "grad_norm": 2.9956023693084717, + "learning_rate": 2.8283739104298167e-05, + "loss": 0.5689, + "step": 147390 + }, + { + "epoch": 1.3030640570024223, + "grad_norm": 2.027679681777954, + "learning_rate": 2.82822657166263e-05, + "loss": 0.5782, + "step": 147400 + }, + { + "epoch": 1.3031524602627345, + "grad_norm": 8.587875366210938, + "learning_rate": 2.8280792328954424e-05, + "loss": 0.5384, + "step": 147410 + }, + { + "epoch": 1.3032408635230468, + "grad_norm": 2.378728151321411, + "learning_rate": 2.8279318941282555e-05, + "loss": 0.5795, + "step": 147420 + }, + { + "epoch": 1.303329266783359, + "grad_norm": 2.8450756072998047, + "learning_rate": 2.8277845553610687e-05, + "loss": 0.5438, + "step": 147430 + }, + { + "epoch": 1.3034176700436713, + "grad_norm": 4.5527496337890625, + "learning_rate": 2.8276372165938812e-05, + "loss": 0.6895, + "step": 147440 + }, + { + "epoch": 1.3035060733039834, + "grad_norm": 3.694490671157837, + "learning_rate": 2.8274898778266944e-05, + "loss": 0.6177, + "step": 147450 + }, + { + "epoch": 1.3035944765642957, + "grad_norm": 1.7594057321548462, + "learning_rate": 2.8273425390595072e-05, + "loss": 0.5451, + "step": 147460 + }, + { + "epoch": 1.303682879824608, + "grad_norm": 2.53791880607605, + "learning_rate": 2.82719520029232e-05, + "loss": 0.4787, + "step": 147470 + }, + { + "epoch": 1.3037712830849202, + "grad_norm": 5.925833225250244, + "learning_rate": 2.8270478615251332e-05, + "loss": 0.6858, + "step": 147480 + }, + { + "epoch": 1.3038596863452323, + "grad_norm": 2.1145951747894287, + "learning_rate": 2.826900522757946e-05, + "loss": 0.6803, + "step": 147490 + }, + { + "epoch": 1.3039480896055446, + "grad_norm": 2.356001615524292, + "learning_rate": 2.826753183990759e-05, + "loss": 0.5678, + "step": 147500 + }, + { + "epoch": 1.304036492865857, + "grad_norm": 0.8765555620193481, + "learning_rate": 2.826605845223572e-05, + "loss": 0.5352, + "step": 147510 + }, + { + "epoch": 1.304124896126169, + "grad_norm": 2.0448122024536133, + "learning_rate": 2.826458506456385e-05, + "loss": 0.5859, + "step": 147520 + }, + { + "epoch": 1.3042132993864815, + "grad_norm": 3.7324912548065186, + "learning_rate": 2.8263111676891977e-05, + "loss": 0.7037, + "step": 147530 + }, + { + "epoch": 1.3043017026467936, + "grad_norm": 6.3869147300720215, + "learning_rate": 2.826163828922011e-05, + "loss": 0.5142, + "step": 147540 + }, + { + "epoch": 1.304390105907106, + "grad_norm": 3.6988437175750732, + "learning_rate": 2.8260164901548238e-05, + "loss": 0.5108, + "step": 147550 + }, + { + "epoch": 1.304478509167418, + "grad_norm": 3.6429684162139893, + "learning_rate": 2.8258691513876366e-05, + "loss": 0.6565, + "step": 147560 + }, + { + "epoch": 1.3045669124277304, + "grad_norm": 2.3001108169555664, + "learning_rate": 2.8257218126204494e-05, + "loss": 0.5487, + "step": 147570 + }, + { + "epoch": 1.3046553156880425, + "grad_norm": 2.0703392028808594, + "learning_rate": 2.8255744738532626e-05, + "loss": 0.6662, + "step": 147580 + }, + { + "epoch": 1.3047437189483548, + "grad_norm": 4.685385704040527, + "learning_rate": 2.8254271350860754e-05, + "loss": 0.6427, + "step": 147590 + }, + { + "epoch": 1.304832122208667, + "grad_norm": 2.031373977661133, + "learning_rate": 2.8252797963188883e-05, + "loss": 0.7154, + "step": 147600 + }, + { + "epoch": 1.3049205254689793, + "grad_norm": 5.000436782836914, + "learning_rate": 2.8251324575517014e-05, + "loss": 0.6124, + "step": 147610 + }, + { + "epoch": 1.3050089287292916, + "grad_norm": 2.333563804626465, + "learning_rate": 2.8249851187845143e-05, + "loss": 0.6567, + "step": 147620 + }, + { + "epoch": 1.3050973319896038, + "grad_norm": 1.7319918870925903, + "learning_rate": 2.824837780017327e-05, + "loss": 0.5404, + "step": 147630 + }, + { + "epoch": 1.3051857352499159, + "grad_norm": 1.6542637348175049, + "learning_rate": 2.8246904412501403e-05, + "loss": 0.6069, + "step": 147640 + }, + { + "epoch": 1.3052741385102282, + "grad_norm": 8.284990310668945, + "learning_rate": 2.824543102482953e-05, + "loss": 0.5799, + "step": 147650 + }, + { + "epoch": 1.3053625417705406, + "grad_norm": 2.2346031665802, + "learning_rate": 2.824395763715766e-05, + "loss": 0.5605, + "step": 147660 + }, + { + "epoch": 1.3054509450308527, + "grad_norm": 12.373358726501465, + "learning_rate": 2.824248424948579e-05, + "loss": 0.5377, + "step": 147670 + }, + { + "epoch": 1.305539348291165, + "grad_norm": 2.6278672218322754, + "learning_rate": 2.8241010861813916e-05, + "loss": 0.5863, + "step": 147680 + }, + { + "epoch": 1.3056277515514771, + "grad_norm": 1.561033010482788, + "learning_rate": 2.8239537474142048e-05, + "loss": 0.6042, + "step": 147690 + }, + { + "epoch": 1.3057161548117895, + "grad_norm": 4.284996032714844, + "learning_rate": 2.823806408647018e-05, + "loss": 0.6056, + "step": 147700 + }, + { + "epoch": 1.3058045580721016, + "grad_norm": 1.2217775583267212, + "learning_rate": 2.8236590698798305e-05, + "loss": 0.5429, + "step": 147710 + }, + { + "epoch": 1.305892961332414, + "grad_norm": 1.2879341840744019, + "learning_rate": 2.8235117311126436e-05, + "loss": 0.5125, + "step": 147720 + }, + { + "epoch": 1.3059813645927263, + "grad_norm": 6.19607400894165, + "learning_rate": 2.8233643923454568e-05, + "loss": 0.5391, + "step": 147730 + }, + { + "epoch": 1.3060697678530384, + "grad_norm": 2.394674777984619, + "learning_rate": 2.8232170535782693e-05, + "loss": 0.5223, + "step": 147740 + }, + { + "epoch": 1.3061581711133505, + "grad_norm": 2.607071876525879, + "learning_rate": 2.8230697148110825e-05, + "loss": 0.5002, + "step": 147750 + }, + { + "epoch": 1.3062465743736629, + "grad_norm": 3.993256092071533, + "learning_rate": 2.8229223760438957e-05, + "loss": 0.6738, + "step": 147760 + }, + { + "epoch": 1.3063349776339752, + "grad_norm": 2.1710879802703857, + "learning_rate": 2.822775037276708e-05, + "loss": 0.5527, + "step": 147770 + }, + { + "epoch": 1.3064233808942873, + "grad_norm": 3.0856449604034424, + "learning_rate": 2.8226276985095213e-05, + "loss": 0.6058, + "step": 147780 + }, + { + "epoch": 1.3065117841545997, + "grad_norm": 1.846940040588379, + "learning_rate": 2.822480359742334e-05, + "loss": 0.6034, + "step": 147790 + }, + { + "epoch": 1.3066001874149118, + "grad_norm": 1.2857451438903809, + "learning_rate": 2.822333020975147e-05, + "loss": 0.5157, + "step": 147800 + }, + { + "epoch": 1.3066885906752241, + "grad_norm": 6.651855945587158, + "learning_rate": 2.8221856822079602e-05, + "loss": 0.7258, + "step": 147810 + }, + { + "epoch": 1.3067769939355363, + "grad_norm": 2.238550901412964, + "learning_rate": 2.8220383434407727e-05, + "loss": 0.4933, + "step": 147820 + }, + { + "epoch": 1.3068653971958486, + "grad_norm": 5.695883750915527, + "learning_rate": 2.821891004673586e-05, + "loss": 0.6126, + "step": 147830 + }, + { + "epoch": 1.306953800456161, + "grad_norm": 1.7693698406219482, + "learning_rate": 2.821743665906399e-05, + "loss": 0.5388, + "step": 147840 + }, + { + "epoch": 1.307042203716473, + "grad_norm": 2.918375015258789, + "learning_rate": 2.8215963271392115e-05, + "loss": 0.7853, + "step": 147850 + }, + { + "epoch": 1.3071306069767852, + "grad_norm": 1.3856827020645142, + "learning_rate": 2.8214489883720247e-05, + "loss": 0.6787, + "step": 147860 + }, + { + "epoch": 1.3072190102370975, + "grad_norm": 4.287835121154785, + "learning_rate": 2.821301649604838e-05, + "loss": 0.5841, + "step": 147870 + }, + { + "epoch": 1.3073074134974099, + "grad_norm": 5.601003170013428, + "learning_rate": 2.8211543108376504e-05, + "loss": 0.5665, + "step": 147880 + }, + { + "epoch": 1.307395816757722, + "grad_norm": 0.9920431971549988, + "learning_rate": 2.8210069720704635e-05, + "loss": 0.5784, + "step": 147890 + }, + { + "epoch": 1.3074842200180343, + "grad_norm": 1.321824073791504, + "learning_rate": 2.8208596333032767e-05, + "loss": 0.5604, + "step": 147900 + }, + { + "epoch": 1.3075726232783464, + "grad_norm": 2.0531821250915527, + "learning_rate": 2.8207122945360892e-05, + "loss": 0.6935, + "step": 147910 + }, + { + "epoch": 1.3076610265386588, + "grad_norm": 2.335388422012329, + "learning_rate": 2.8205649557689024e-05, + "loss": 0.6757, + "step": 147920 + }, + { + "epoch": 1.307749429798971, + "grad_norm": 2.0323305130004883, + "learning_rate": 2.820417617001715e-05, + "loss": 0.5602, + "step": 147930 + }, + { + "epoch": 1.3078378330592833, + "grad_norm": 2.154433250427246, + "learning_rate": 2.820270278234528e-05, + "loss": 0.585, + "step": 147940 + }, + { + "epoch": 1.3079262363195956, + "grad_norm": 7.445944309234619, + "learning_rate": 2.8201229394673412e-05, + "loss": 0.5075, + "step": 147950 + }, + { + "epoch": 1.3080146395799077, + "grad_norm": 2.845095157623291, + "learning_rate": 2.8199756007001537e-05, + "loss": 0.5999, + "step": 147960 + }, + { + "epoch": 1.3081030428402198, + "grad_norm": 3.5085434913635254, + "learning_rate": 2.819828261932967e-05, + "loss": 0.6902, + "step": 147970 + }, + { + "epoch": 1.3081914461005322, + "grad_norm": 1.5698795318603516, + "learning_rate": 2.81968092316578e-05, + "loss": 0.5557, + "step": 147980 + }, + { + "epoch": 1.3082798493608445, + "grad_norm": 2.0982470512390137, + "learning_rate": 2.8195335843985926e-05, + "loss": 0.598, + "step": 147990 + }, + { + "epoch": 1.3083682526211566, + "grad_norm": 5.38260555267334, + "learning_rate": 2.8193862456314058e-05, + "loss": 0.5967, + "step": 148000 + }, + { + "epoch": 1.308456655881469, + "grad_norm": 8.967691421508789, + "learning_rate": 2.819238906864219e-05, + "loss": 0.6884, + "step": 148010 + }, + { + "epoch": 1.308545059141781, + "grad_norm": 2.423182725906372, + "learning_rate": 2.8190915680970314e-05, + "loss": 0.5464, + "step": 148020 + }, + { + "epoch": 1.3086334624020934, + "grad_norm": 1.3502552509307861, + "learning_rate": 2.8189442293298446e-05, + "loss": 0.6193, + "step": 148030 + }, + { + "epoch": 1.3087218656624056, + "grad_norm": 1.9717388153076172, + "learning_rate": 2.818796890562657e-05, + "loss": 0.6013, + "step": 148040 + }, + { + "epoch": 1.308810268922718, + "grad_norm": 10.02807903289795, + "learning_rate": 2.8186495517954703e-05, + "loss": 0.5545, + "step": 148050 + }, + { + "epoch": 1.3088986721830302, + "grad_norm": 2.5539464950561523, + "learning_rate": 2.8185022130282834e-05, + "loss": 0.6676, + "step": 148060 + }, + { + "epoch": 1.3089870754433424, + "grad_norm": 11.810771942138672, + "learning_rate": 2.818354874261096e-05, + "loss": 0.7359, + "step": 148070 + }, + { + "epoch": 1.3090754787036545, + "grad_norm": 3.780825138092041, + "learning_rate": 2.818207535493909e-05, + "loss": 0.5766, + "step": 148080 + }, + { + "epoch": 1.3091638819639668, + "grad_norm": 2.876922369003296, + "learning_rate": 2.8180601967267223e-05, + "loss": 0.6735, + "step": 148090 + }, + { + "epoch": 1.3092522852242792, + "grad_norm": 8.37081241607666, + "learning_rate": 2.8179128579595348e-05, + "loss": 0.5598, + "step": 148100 + }, + { + "epoch": 1.3093406884845913, + "grad_norm": 2.4037575721740723, + "learning_rate": 2.817765519192348e-05, + "loss": 0.6646, + "step": 148110 + }, + { + "epoch": 1.3094290917449036, + "grad_norm": 1.8648908138275146, + "learning_rate": 2.817618180425161e-05, + "loss": 0.4849, + "step": 148120 + }, + { + "epoch": 1.3095174950052157, + "grad_norm": 12.503753662109375, + "learning_rate": 2.8174708416579736e-05, + "loss": 0.5475, + "step": 148130 + }, + { + "epoch": 1.309605898265528, + "grad_norm": 1.414025902748108, + "learning_rate": 2.8173235028907868e-05, + "loss": 0.6481, + "step": 148140 + }, + { + "epoch": 1.3096943015258402, + "grad_norm": 3.0190954208374023, + "learning_rate": 2.8171761641235993e-05, + "loss": 0.6249, + "step": 148150 + }, + { + "epoch": 1.3097827047861526, + "grad_norm": 3.135728120803833, + "learning_rate": 2.8170288253564125e-05, + "loss": 0.6229, + "step": 148160 + }, + { + "epoch": 1.3098711080464647, + "grad_norm": 3.5395970344543457, + "learning_rate": 2.8168814865892256e-05, + "loss": 0.6476, + "step": 148170 + }, + { + "epoch": 1.309959511306777, + "grad_norm": 3.3132314682006836, + "learning_rate": 2.816734147822038e-05, + "loss": 0.599, + "step": 148180 + }, + { + "epoch": 1.3100479145670891, + "grad_norm": 2.0649218559265137, + "learning_rate": 2.8165868090548513e-05, + "loss": 0.475, + "step": 148190 + }, + { + "epoch": 1.3101363178274015, + "grad_norm": 2.7086780071258545, + "learning_rate": 2.8164394702876645e-05, + "loss": 0.6042, + "step": 148200 + }, + { + "epoch": 1.3102247210877138, + "grad_norm": 2.1677048206329346, + "learning_rate": 2.816292131520477e-05, + "loss": 0.6134, + "step": 148210 + }, + { + "epoch": 1.310313124348026, + "grad_norm": 3.4466402530670166, + "learning_rate": 2.81614479275329e-05, + "loss": 0.6503, + "step": 148220 + }, + { + "epoch": 1.310401527608338, + "grad_norm": 2.27215576171875, + "learning_rate": 2.8159974539861033e-05, + "loss": 0.6277, + "step": 148230 + }, + { + "epoch": 1.3104899308686504, + "grad_norm": 1.3102741241455078, + "learning_rate": 2.815850115218916e-05, + "loss": 0.6983, + "step": 148240 + }, + { + "epoch": 1.3105783341289627, + "grad_norm": 6.3948540687561035, + "learning_rate": 2.815702776451729e-05, + "loss": 0.5522, + "step": 148250 + }, + { + "epoch": 1.3106667373892749, + "grad_norm": 4.992373943328857, + "learning_rate": 2.8155554376845415e-05, + "loss": 0.6298, + "step": 148260 + }, + { + "epoch": 1.3107551406495872, + "grad_norm": 3.187575340270996, + "learning_rate": 2.8154080989173547e-05, + "loss": 0.476, + "step": 148270 + }, + { + "epoch": 1.3108435439098993, + "grad_norm": 2.759039878845215, + "learning_rate": 2.815260760150168e-05, + "loss": 0.7035, + "step": 148280 + }, + { + "epoch": 1.3109319471702117, + "grad_norm": 5.789707183837891, + "learning_rate": 2.8151134213829803e-05, + "loss": 0.733, + "step": 148290 + }, + { + "epoch": 1.3110203504305238, + "grad_norm": 1.0783339738845825, + "learning_rate": 2.8149660826157935e-05, + "loss": 0.6591, + "step": 148300 + }, + { + "epoch": 1.3111087536908361, + "grad_norm": 2.655055046081543, + "learning_rate": 2.8148187438486067e-05, + "loss": 0.6063, + "step": 148310 + }, + { + "epoch": 1.3111971569511485, + "grad_norm": 3.78544020652771, + "learning_rate": 2.8146714050814192e-05, + "loss": 0.7063, + "step": 148320 + }, + { + "epoch": 1.3112855602114606, + "grad_norm": 4.102067470550537, + "learning_rate": 2.8145240663142324e-05, + "loss": 0.5678, + "step": 148330 + }, + { + "epoch": 1.3113739634717727, + "grad_norm": 1.2500070333480835, + "learning_rate": 2.8143767275470455e-05, + "loss": 0.5558, + "step": 148340 + }, + { + "epoch": 1.311462366732085, + "grad_norm": 0.7118691205978394, + "learning_rate": 2.814229388779858e-05, + "loss": 0.5641, + "step": 148350 + }, + { + "epoch": 1.3115507699923974, + "grad_norm": 2.446852207183838, + "learning_rate": 2.8140820500126712e-05, + "loss": 0.7724, + "step": 148360 + }, + { + "epoch": 1.3116391732527095, + "grad_norm": 2.7543790340423584, + "learning_rate": 2.8139347112454844e-05, + "loss": 0.6487, + "step": 148370 + }, + { + "epoch": 1.3117275765130219, + "grad_norm": 3.6966869831085205, + "learning_rate": 2.813787372478297e-05, + "loss": 0.5965, + "step": 148380 + }, + { + "epoch": 1.311815979773334, + "grad_norm": 3.447160243988037, + "learning_rate": 2.81364003371111e-05, + "loss": 0.5269, + "step": 148390 + }, + { + "epoch": 1.3119043830336463, + "grad_norm": 1.771708369255066, + "learning_rate": 2.813492694943923e-05, + "loss": 0.6768, + "step": 148400 + }, + { + "epoch": 1.3119927862939584, + "grad_norm": 0.8293766975402832, + "learning_rate": 2.8133453561767357e-05, + "loss": 0.57, + "step": 148410 + }, + { + "epoch": 1.3120811895542708, + "grad_norm": 1.6164108514785767, + "learning_rate": 2.813198017409549e-05, + "loss": 0.6959, + "step": 148420 + }, + { + "epoch": 1.3121695928145831, + "grad_norm": 4.570804595947266, + "learning_rate": 2.8130506786423617e-05, + "loss": 0.5991, + "step": 148430 + }, + { + "epoch": 1.3122579960748952, + "grad_norm": 2.8703620433807373, + "learning_rate": 2.8129033398751746e-05, + "loss": 0.7346, + "step": 148440 + }, + { + "epoch": 1.3123463993352074, + "grad_norm": 4.190461158752441, + "learning_rate": 2.8127560011079877e-05, + "loss": 0.5259, + "step": 148450 + }, + { + "epoch": 1.3124348025955197, + "grad_norm": 5.106935977935791, + "learning_rate": 2.8126086623408006e-05, + "loss": 0.6745, + "step": 148460 + }, + { + "epoch": 1.312523205855832, + "grad_norm": 1.2798593044281006, + "learning_rate": 2.8124613235736134e-05, + "loss": 0.6515, + "step": 148470 + }, + { + "epoch": 1.3126116091161442, + "grad_norm": 5.465696811676025, + "learning_rate": 2.8123139848064266e-05, + "loss": 0.6106, + "step": 148480 + }, + { + "epoch": 1.3127000123764565, + "grad_norm": 2.975137710571289, + "learning_rate": 2.8121666460392394e-05, + "loss": 0.541, + "step": 148490 + }, + { + "epoch": 1.3127884156367686, + "grad_norm": 1.5276206731796265, + "learning_rate": 2.8120193072720523e-05, + "loss": 0.6069, + "step": 148500 + }, + { + "epoch": 1.312876818897081, + "grad_norm": 10.58261775970459, + "learning_rate": 2.811871968504865e-05, + "loss": 0.559, + "step": 148510 + }, + { + "epoch": 1.312965222157393, + "grad_norm": 1.9151713848114014, + "learning_rate": 2.8117246297376783e-05, + "loss": 0.5856, + "step": 148520 + }, + { + "epoch": 1.3130536254177054, + "grad_norm": 2.3587288856506348, + "learning_rate": 2.811577290970491e-05, + "loss": 0.5826, + "step": 148530 + }, + { + "epoch": 1.3131420286780178, + "grad_norm": 2.363764524459839, + "learning_rate": 2.811429952203304e-05, + "loss": 0.6009, + "step": 148540 + }, + { + "epoch": 1.31323043193833, + "grad_norm": 4.240105628967285, + "learning_rate": 2.811282613436117e-05, + "loss": 0.6627, + "step": 148550 + }, + { + "epoch": 1.313318835198642, + "grad_norm": 1.2024303674697876, + "learning_rate": 2.81113527466893e-05, + "loss": 0.7069, + "step": 148560 + }, + { + "epoch": 1.3134072384589544, + "grad_norm": 4.543483734130859, + "learning_rate": 2.8109879359017428e-05, + "loss": 0.6394, + "step": 148570 + }, + { + "epoch": 1.3134956417192667, + "grad_norm": 1.2395130395889282, + "learning_rate": 2.810840597134556e-05, + "loss": 0.5426, + "step": 148580 + }, + { + "epoch": 1.3135840449795788, + "grad_norm": 1.3809128999710083, + "learning_rate": 2.8106932583673688e-05, + "loss": 0.7005, + "step": 148590 + }, + { + "epoch": 1.3136724482398912, + "grad_norm": 1.356872797012329, + "learning_rate": 2.8105459196001816e-05, + "loss": 0.5765, + "step": 148600 + }, + { + "epoch": 1.3137608515002033, + "grad_norm": 1.5789388418197632, + "learning_rate": 2.8103985808329948e-05, + "loss": 0.6714, + "step": 148610 + }, + { + "epoch": 1.3138492547605156, + "grad_norm": 2.0288944244384766, + "learning_rate": 2.8102512420658073e-05, + "loss": 0.597, + "step": 148620 + }, + { + "epoch": 1.3139376580208277, + "grad_norm": 1.8794320821762085, + "learning_rate": 2.8101039032986205e-05, + "loss": 0.5266, + "step": 148630 + }, + { + "epoch": 1.31402606128114, + "grad_norm": 1.0160455703735352, + "learning_rate": 2.8099565645314337e-05, + "loss": 0.6366, + "step": 148640 + }, + { + "epoch": 1.3141144645414524, + "grad_norm": 11.037582397460938, + "learning_rate": 2.809809225764246e-05, + "loss": 0.5294, + "step": 148650 + }, + { + "epoch": 1.3142028678017645, + "grad_norm": 3.0306789875030518, + "learning_rate": 2.8096618869970593e-05, + "loss": 0.6022, + "step": 148660 + }, + { + "epoch": 1.3142912710620767, + "grad_norm": 2.8430674076080322, + "learning_rate": 2.8095145482298725e-05, + "loss": 0.6238, + "step": 148670 + }, + { + "epoch": 1.314379674322389, + "grad_norm": 2.3491432666778564, + "learning_rate": 2.809367209462685e-05, + "loss": 0.6024, + "step": 148680 + }, + { + "epoch": 1.3144680775827013, + "grad_norm": 12.166789054870605, + "learning_rate": 2.8092198706954982e-05, + "loss": 0.6587, + "step": 148690 + }, + { + "epoch": 1.3145564808430135, + "grad_norm": 2.9210777282714844, + "learning_rate": 2.8090725319283113e-05, + "loss": 0.533, + "step": 148700 + }, + { + "epoch": 1.3146448841033258, + "grad_norm": 5.058513164520264, + "learning_rate": 2.808925193161124e-05, + "loss": 0.5783, + "step": 148710 + }, + { + "epoch": 1.314733287363638, + "grad_norm": 4.492527008056641, + "learning_rate": 2.808777854393937e-05, + "loss": 0.5599, + "step": 148720 + }, + { + "epoch": 1.3148216906239503, + "grad_norm": 2.55505633354187, + "learning_rate": 2.8086305156267502e-05, + "loss": 0.6899, + "step": 148730 + }, + { + "epoch": 1.3149100938842624, + "grad_norm": 4.788483142852783, + "learning_rate": 2.8084831768595627e-05, + "loss": 0.5483, + "step": 148740 + }, + { + "epoch": 1.3149984971445747, + "grad_norm": 4.912303447723389, + "learning_rate": 2.808335838092376e-05, + "loss": 0.6141, + "step": 148750 + }, + { + "epoch": 1.3150869004048868, + "grad_norm": 1.9395875930786133, + "learning_rate": 2.8081884993251884e-05, + "loss": 0.5256, + "step": 148760 + }, + { + "epoch": 1.3151753036651992, + "grad_norm": 0.8670976758003235, + "learning_rate": 2.8080411605580015e-05, + "loss": 0.6095, + "step": 148770 + }, + { + "epoch": 1.3152637069255113, + "grad_norm": 13.635018348693848, + "learning_rate": 2.8078938217908147e-05, + "loss": 0.6185, + "step": 148780 + }, + { + "epoch": 1.3153521101858237, + "grad_norm": 1.4558939933776855, + "learning_rate": 2.8077464830236272e-05, + "loss": 0.6478, + "step": 148790 + }, + { + "epoch": 1.315440513446136, + "grad_norm": 1.5235179662704468, + "learning_rate": 2.8075991442564404e-05, + "loss": 0.5987, + "step": 148800 + }, + { + "epoch": 1.3155289167064481, + "grad_norm": 2.690790891647339, + "learning_rate": 2.8074518054892535e-05, + "loss": 0.5572, + "step": 148810 + }, + { + "epoch": 1.3156173199667605, + "grad_norm": 8.002449989318848, + "learning_rate": 2.807304466722066e-05, + "loss": 0.6053, + "step": 148820 + }, + { + "epoch": 1.3157057232270726, + "grad_norm": 2.305943250656128, + "learning_rate": 2.8071571279548792e-05, + "loss": 0.6247, + "step": 148830 + }, + { + "epoch": 1.315794126487385, + "grad_norm": 2.879995107650757, + "learning_rate": 2.8070097891876924e-05, + "loss": 0.673, + "step": 148840 + }, + { + "epoch": 1.315882529747697, + "grad_norm": 1.278105616569519, + "learning_rate": 2.806862450420505e-05, + "loss": 0.4747, + "step": 148850 + }, + { + "epoch": 1.3159709330080094, + "grad_norm": 5.807000160217285, + "learning_rate": 2.806715111653318e-05, + "loss": 0.4995, + "step": 148860 + }, + { + "epoch": 1.3160593362683215, + "grad_norm": 1.4099643230438232, + "learning_rate": 2.8065677728861306e-05, + "loss": 0.5967, + "step": 148870 + }, + { + "epoch": 1.3161477395286338, + "grad_norm": 2.9102284908294678, + "learning_rate": 2.8064204341189437e-05, + "loss": 0.64, + "step": 148880 + }, + { + "epoch": 1.316236142788946, + "grad_norm": 1.4413174390792847, + "learning_rate": 2.806273095351757e-05, + "loss": 0.5785, + "step": 148890 + }, + { + "epoch": 1.3163245460492583, + "grad_norm": 4.8757123947143555, + "learning_rate": 2.8061257565845694e-05, + "loss": 0.6616, + "step": 148900 + }, + { + "epoch": 1.3164129493095706, + "grad_norm": 4.079837322235107, + "learning_rate": 2.8059784178173826e-05, + "loss": 0.6722, + "step": 148910 + }, + { + "epoch": 1.3165013525698828, + "grad_norm": 9.008342742919922, + "learning_rate": 2.8058310790501958e-05, + "loss": 0.5445, + "step": 148920 + }, + { + "epoch": 1.3165897558301949, + "grad_norm": 5.8628740310668945, + "learning_rate": 2.8056837402830083e-05, + "loss": 0.6926, + "step": 148930 + }, + { + "epoch": 1.3166781590905072, + "grad_norm": 3.9124386310577393, + "learning_rate": 2.8055364015158214e-05, + "loss": 0.5543, + "step": 148940 + }, + { + "epoch": 1.3167665623508196, + "grad_norm": 2.377822160720825, + "learning_rate": 2.8053890627486346e-05, + "loss": 0.6096, + "step": 148950 + }, + { + "epoch": 1.3168549656111317, + "grad_norm": 2.1724817752838135, + "learning_rate": 2.805241723981447e-05, + "loss": 0.7722, + "step": 148960 + }, + { + "epoch": 1.316943368871444, + "grad_norm": 1.0120038986206055, + "learning_rate": 2.8050943852142603e-05, + "loss": 0.4392, + "step": 148970 + }, + { + "epoch": 1.3170317721317562, + "grad_norm": 10.115762710571289, + "learning_rate": 2.8049470464470728e-05, + "loss": 0.5273, + "step": 148980 + }, + { + "epoch": 1.3171201753920685, + "grad_norm": 7.143669128417969, + "learning_rate": 2.804799707679886e-05, + "loss": 0.4651, + "step": 148990 + }, + { + "epoch": 1.3172085786523806, + "grad_norm": 0.8453541994094849, + "learning_rate": 2.804652368912699e-05, + "loss": 0.6832, + "step": 149000 + }, + { + "epoch": 1.317296981912693, + "grad_norm": 14.636137008666992, + "learning_rate": 2.8045050301455116e-05, + "loss": 0.6548, + "step": 149010 + }, + { + "epoch": 1.3173853851730053, + "grad_norm": 4.40684700012207, + "learning_rate": 2.8043576913783248e-05, + "loss": 0.5335, + "step": 149020 + }, + { + "epoch": 1.3174737884333174, + "grad_norm": 2.7863948345184326, + "learning_rate": 2.804210352611138e-05, + "loss": 0.5586, + "step": 149030 + }, + { + "epoch": 1.3175621916936295, + "grad_norm": 1.8299301862716675, + "learning_rate": 2.8040630138439505e-05, + "loss": 0.6923, + "step": 149040 + }, + { + "epoch": 1.3176505949539419, + "grad_norm": 2.4196178913116455, + "learning_rate": 2.8039156750767636e-05, + "loss": 0.4718, + "step": 149050 + }, + { + "epoch": 1.3177389982142542, + "grad_norm": 2.169053554534912, + "learning_rate": 2.8037683363095768e-05, + "loss": 0.7069, + "step": 149060 + }, + { + "epoch": 1.3178274014745663, + "grad_norm": 12.398027420043945, + "learning_rate": 2.8036209975423893e-05, + "loss": 0.6106, + "step": 149070 + }, + { + "epoch": 1.3179158047348787, + "grad_norm": 7.882556915283203, + "learning_rate": 2.8034736587752025e-05, + "loss": 0.6408, + "step": 149080 + }, + { + "epoch": 1.3180042079951908, + "grad_norm": 1.3828694820404053, + "learning_rate": 2.803326320008015e-05, + "loss": 0.5569, + "step": 149090 + }, + { + "epoch": 1.3180926112555031, + "grad_norm": 3.0277352333068848, + "learning_rate": 2.803178981240828e-05, + "loss": 0.5917, + "step": 149100 + }, + { + "epoch": 1.3181810145158153, + "grad_norm": 3.305237293243408, + "learning_rate": 2.8030316424736413e-05, + "loss": 0.6349, + "step": 149110 + }, + { + "epoch": 1.3182694177761276, + "grad_norm": 1.0532431602478027, + "learning_rate": 2.8028843037064538e-05, + "loss": 0.5667, + "step": 149120 + }, + { + "epoch": 1.31835782103644, + "grad_norm": 0.9725874662399292, + "learning_rate": 2.802736964939267e-05, + "loss": 0.5095, + "step": 149130 + }, + { + "epoch": 1.318446224296752, + "grad_norm": 1.2367267608642578, + "learning_rate": 2.80258962617208e-05, + "loss": 0.6494, + "step": 149140 + }, + { + "epoch": 1.3185346275570642, + "grad_norm": 2.5237839221954346, + "learning_rate": 2.8024422874048927e-05, + "loss": 0.6039, + "step": 149150 + }, + { + "epoch": 1.3186230308173765, + "grad_norm": 1.1270629167556763, + "learning_rate": 2.802294948637706e-05, + "loss": 0.7042, + "step": 149160 + }, + { + "epoch": 1.3187114340776889, + "grad_norm": 3.558521032333374, + "learning_rate": 2.802147609870519e-05, + "loss": 0.5468, + "step": 149170 + }, + { + "epoch": 1.318799837338001, + "grad_norm": 0.6784349083900452, + "learning_rate": 2.8020002711033315e-05, + "loss": 0.4122, + "step": 149180 + }, + { + "epoch": 1.3188882405983133, + "grad_norm": 4.909865379333496, + "learning_rate": 2.8018529323361447e-05, + "loss": 0.7655, + "step": 149190 + }, + { + "epoch": 1.3189766438586255, + "grad_norm": 1.2544851303100586, + "learning_rate": 2.801705593568958e-05, + "loss": 0.6183, + "step": 149200 + }, + { + "epoch": 1.3190650471189378, + "grad_norm": 2.1723010540008545, + "learning_rate": 2.8015582548017704e-05, + "loss": 0.5113, + "step": 149210 + }, + { + "epoch": 1.31915345037925, + "grad_norm": 2.934835195541382, + "learning_rate": 2.8014109160345835e-05, + "loss": 0.4953, + "step": 149220 + }, + { + "epoch": 1.3192418536395623, + "grad_norm": 3.200718879699707, + "learning_rate": 2.801263577267396e-05, + "loss": 0.5348, + "step": 149230 + }, + { + "epoch": 1.3193302568998746, + "grad_norm": 2.423452615737915, + "learning_rate": 2.8011162385002092e-05, + "loss": 0.6038, + "step": 149240 + }, + { + "epoch": 1.3194186601601867, + "grad_norm": 1.3244715929031372, + "learning_rate": 2.8009688997330224e-05, + "loss": 0.5236, + "step": 149250 + }, + { + "epoch": 1.3195070634204988, + "grad_norm": 2.3296327590942383, + "learning_rate": 2.800821560965835e-05, + "loss": 0.7165, + "step": 149260 + }, + { + "epoch": 1.3195954666808112, + "grad_norm": 0.9985233545303345, + "learning_rate": 2.800674222198648e-05, + "loss": 0.5384, + "step": 149270 + }, + { + "epoch": 1.3196838699411235, + "grad_norm": 2.6017701625823975, + "learning_rate": 2.8005268834314612e-05, + "loss": 0.5466, + "step": 149280 + }, + { + "epoch": 1.3197722732014356, + "grad_norm": 1.6243170499801636, + "learning_rate": 2.8003795446642737e-05, + "loss": 0.638, + "step": 149290 + }, + { + "epoch": 1.319860676461748, + "grad_norm": 3.723896026611328, + "learning_rate": 2.800232205897087e-05, + "loss": 0.6559, + "step": 149300 + }, + { + "epoch": 1.31994907972206, + "grad_norm": 0.9679158329963684, + "learning_rate": 2.8000848671299e-05, + "loss": 0.5498, + "step": 149310 + }, + { + "epoch": 1.3200374829823724, + "grad_norm": 8.074565887451172, + "learning_rate": 2.7999375283627126e-05, + "loss": 0.628, + "step": 149320 + }, + { + "epoch": 1.3201258862426846, + "grad_norm": 3.1651248931884766, + "learning_rate": 2.7997901895955257e-05, + "loss": 0.5715, + "step": 149330 + }, + { + "epoch": 1.320214289502997, + "grad_norm": 1.804006814956665, + "learning_rate": 2.7996428508283386e-05, + "loss": 0.4921, + "step": 149340 + }, + { + "epoch": 1.3203026927633092, + "grad_norm": 4.262528896331787, + "learning_rate": 2.7994955120611514e-05, + "loss": 0.5277, + "step": 149350 + }, + { + "epoch": 1.3203910960236214, + "grad_norm": 0.8347508311271667, + "learning_rate": 2.7993481732939646e-05, + "loss": 0.5887, + "step": 149360 + }, + { + "epoch": 1.3204794992839335, + "grad_norm": 2.1242294311523438, + "learning_rate": 2.7992008345267774e-05, + "loss": 0.7227, + "step": 149370 + }, + { + "epoch": 1.3205679025442458, + "grad_norm": 5.066576957702637, + "learning_rate": 2.7990534957595902e-05, + "loss": 0.6711, + "step": 149380 + }, + { + "epoch": 1.3206563058045582, + "grad_norm": 2.174452304840088, + "learning_rate": 2.7989061569924034e-05, + "loss": 0.6392, + "step": 149390 + }, + { + "epoch": 1.3207447090648703, + "grad_norm": 13.127554893493652, + "learning_rate": 2.7987588182252163e-05, + "loss": 0.5315, + "step": 149400 + }, + { + "epoch": 1.3208331123251826, + "grad_norm": 0.6841283440589905, + "learning_rate": 2.798611479458029e-05, + "loss": 0.5714, + "step": 149410 + }, + { + "epoch": 1.3209215155854948, + "grad_norm": 4.091153621673584, + "learning_rate": 2.7984641406908423e-05, + "loss": 0.54, + "step": 149420 + }, + { + "epoch": 1.321009918845807, + "grad_norm": 5.6124091148376465, + "learning_rate": 2.798316801923655e-05, + "loss": 0.6151, + "step": 149430 + }, + { + "epoch": 1.3210983221061192, + "grad_norm": 12.54110336303711, + "learning_rate": 2.798169463156468e-05, + "loss": 0.6296, + "step": 149440 + }, + { + "epoch": 1.3211867253664316, + "grad_norm": 3.029468536376953, + "learning_rate": 2.7980221243892808e-05, + "loss": 0.601, + "step": 149450 + }, + { + "epoch": 1.3212751286267437, + "grad_norm": 2.3717033863067627, + "learning_rate": 2.797874785622094e-05, + "loss": 0.6443, + "step": 149460 + }, + { + "epoch": 1.321363531887056, + "grad_norm": 2.2255585193634033, + "learning_rate": 2.7977274468549068e-05, + "loss": 0.6611, + "step": 149470 + }, + { + "epoch": 1.3214519351473681, + "grad_norm": 2.8330562114715576, + "learning_rate": 2.7975801080877196e-05, + "loss": 0.5535, + "step": 149480 + }, + { + "epoch": 1.3215403384076805, + "grad_norm": 2.395223617553711, + "learning_rate": 2.7974327693205328e-05, + "loss": 0.5336, + "step": 149490 + }, + { + "epoch": 1.3216287416679928, + "grad_norm": 6.6551361083984375, + "learning_rate": 2.7972854305533456e-05, + "loss": 0.6118, + "step": 149500 + }, + { + "epoch": 1.321717144928305, + "grad_norm": 1.2373433113098145, + "learning_rate": 2.7971380917861585e-05, + "loss": 0.4703, + "step": 149510 + }, + { + "epoch": 1.321805548188617, + "grad_norm": 4.395318508148193, + "learning_rate": 2.7969907530189716e-05, + "loss": 0.6532, + "step": 149520 + }, + { + "epoch": 1.3218939514489294, + "grad_norm": 0.9730364680290222, + "learning_rate": 2.7968434142517845e-05, + "loss": 0.5362, + "step": 149530 + }, + { + "epoch": 1.3219823547092417, + "grad_norm": 3.5485353469848633, + "learning_rate": 2.7966960754845973e-05, + "loss": 0.7579, + "step": 149540 + }, + { + "epoch": 1.3220707579695539, + "grad_norm": 2.8976399898529053, + "learning_rate": 2.7965487367174105e-05, + "loss": 0.7407, + "step": 149550 + }, + { + "epoch": 1.3221591612298662, + "grad_norm": 1.7230486869812012, + "learning_rate": 2.796401397950223e-05, + "loss": 0.6945, + "step": 149560 + }, + { + "epoch": 1.3222475644901783, + "grad_norm": 4.44954776763916, + "learning_rate": 2.796254059183036e-05, + "loss": 0.5641, + "step": 149570 + }, + { + "epoch": 1.3223359677504907, + "grad_norm": 5.69603967666626, + "learning_rate": 2.7961067204158493e-05, + "loss": 0.6739, + "step": 149580 + }, + { + "epoch": 1.3224243710108028, + "grad_norm": 4.622913837432861, + "learning_rate": 2.7959593816486618e-05, + "loss": 0.5676, + "step": 149590 + }, + { + "epoch": 1.3225127742711151, + "grad_norm": 3.3708529472351074, + "learning_rate": 2.795812042881475e-05, + "loss": 0.5343, + "step": 149600 + }, + { + "epoch": 1.3226011775314275, + "grad_norm": 3.0041346549987793, + "learning_rate": 2.7956647041142882e-05, + "loss": 0.6227, + "step": 149610 + }, + { + "epoch": 1.3226895807917396, + "grad_norm": 1.8608965873718262, + "learning_rate": 2.7955173653471007e-05, + "loss": 0.5901, + "step": 149620 + }, + { + "epoch": 1.3227779840520517, + "grad_norm": 1.8206435441970825, + "learning_rate": 2.795370026579914e-05, + "loss": 0.5853, + "step": 149630 + }, + { + "epoch": 1.322866387312364, + "grad_norm": 1.5332399606704712, + "learning_rate": 2.795222687812727e-05, + "loss": 0.7259, + "step": 149640 + }, + { + "epoch": 1.3229547905726764, + "grad_norm": 1.5915025472640991, + "learning_rate": 2.7950753490455395e-05, + "loss": 0.6571, + "step": 149650 + }, + { + "epoch": 1.3230431938329885, + "grad_norm": 1.6561287641525269, + "learning_rate": 2.7949280102783527e-05, + "loss": 0.6351, + "step": 149660 + }, + { + "epoch": 1.3231315970933009, + "grad_norm": 11.13310718536377, + "learning_rate": 2.794780671511166e-05, + "loss": 0.5854, + "step": 149670 + }, + { + "epoch": 1.323220000353613, + "grad_norm": 2.8928637504577637, + "learning_rate": 2.7946333327439784e-05, + "loss": 0.4542, + "step": 149680 + }, + { + "epoch": 1.3233084036139253, + "grad_norm": 1.6942949295043945, + "learning_rate": 2.7944859939767915e-05, + "loss": 0.6247, + "step": 149690 + }, + { + "epoch": 1.3233968068742374, + "grad_norm": 1.7843235731124878, + "learning_rate": 2.794338655209604e-05, + "loss": 0.6055, + "step": 149700 + }, + { + "epoch": 1.3234852101345498, + "grad_norm": 2.2763171195983887, + "learning_rate": 2.7941913164424172e-05, + "loss": 0.6487, + "step": 149710 + }, + { + "epoch": 1.3235736133948621, + "grad_norm": 1.9591131210327148, + "learning_rate": 2.7940439776752304e-05, + "loss": 0.5343, + "step": 149720 + }, + { + "epoch": 1.3236620166551742, + "grad_norm": 1.1911338567733765, + "learning_rate": 2.793896638908043e-05, + "loss": 0.621, + "step": 149730 + }, + { + "epoch": 1.3237504199154864, + "grad_norm": 1.1273411512374878, + "learning_rate": 2.793749300140856e-05, + "loss": 0.611, + "step": 149740 + }, + { + "epoch": 1.3238388231757987, + "grad_norm": 2.019596815109253, + "learning_rate": 2.7936019613736692e-05, + "loss": 0.5633, + "step": 149750 + }, + { + "epoch": 1.323927226436111, + "grad_norm": 2.4595444202423096, + "learning_rate": 2.7934546226064817e-05, + "loss": 0.6861, + "step": 149760 + }, + { + "epoch": 1.3240156296964232, + "grad_norm": 1.9519199132919312, + "learning_rate": 2.793307283839295e-05, + "loss": 0.5056, + "step": 149770 + }, + { + "epoch": 1.3241040329567355, + "grad_norm": 1.9743772745132446, + "learning_rate": 2.793159945072108e-05, + "loss": 0.7267, + "step": 149780 + }, + { + "epoch": 1.3241924362170476, + "grad_norm": 4.21689510345459, + "learning_rate": 2.7930126063049206e-05, + "loss": 0.5181, + "step": 149790 + }, + { + "epoch": 1.32428083947736, + "grad_norm": 1.7165127992630005, + "learning_rate": 2.7928652675377337e-05, + "loss": 0.67, + "step": 149800 + }, + { + "epoch": 1.324369242737672, + "grad_norm": 2.287660837173462, + "learning_rate": 2.7927179287705462e-05, + "loss": 0.5722, + "step": 149810 + }, + { + "epoch": 1.3244576459979844, + "grad_norm": 1.8989897966384888, + "learning_rate": 2.7925705900033594e-05, + "loss": 0.5072, + "step": 149820 + }, + { + "epoch": 1.3245460492582968, + "grad_norm": 2.7715299129486084, + "learning_rate": 2.7924232512361726e-05, + "loss": 0.6686, + "step": 149830 + }, + { + "epoch": 1.324634452518609, + "grad_norm": 1.2070786952972412, + "learning_rate": 2.792275912468985e-05, + "loss": 0.5873, + "step": 149840 + }, + { + "epoch": 1.324722855778921, + "grad_norm": 2.238295316696167, + "learning_rate": 2.7921285737017983e-05, + "loss": 0.6277, + "step": 149850 + }, + { + "epoch": 1.3248112590392334, + "grad_norm": 3.2079405784606934, + "learning_rate": 2.7919812349346114e-05, + "loss": 0.6083, + "step": 149860 + }, + { + "epoch": 1.3248996622995457, + "grad_norm": 1.5579689741134644, + "learning_rate": 2.791833896167424e-05, + "loss": 0.5147, + "step": 149870 + }, + { + "epoch": 1.3249880655598578, + "grad_norm": 1.6787261962890625, + "learning_rate": 2.791686557400237e-05, + "loss": 0.678, + "step": 149880 + }, + { + "epoch": 1.3250764688201702, + "grad_norm": 1.855412483215332, + "learning_rate": 2.7915392186330503e-05, + "loss": 0.5387, + "step": 149890 + }, + { + "epoch": 1.3251648720804823, + "grad_norm": 2.181928873062134, + "learning_rate": 2.7913918798658628e-05, + "loss": 0.5415, + "step": 149900 + }, + { + "epoch": 1.3252532753407946, + "grad_norm": 2.0913829803466797, + "learning_rate": 2.791244541098676e-05, + "loss": 0.5816, + "step": 149910 + }, + { + "epoch": 1.3253416786011067, + "grad_norm": 3.86319637298584, + "learning_rate": 2.7910972023314884e-05, + "loss": 0.5917, + "step": 149920 + }, + { + "epoch": 1.325430081861419, + "grad_norm": 3.071532964706421, + "learning_rate": 2.7909498635643016e-05, + "loss": 0.4953, + "step": 149930 + }, + { + "epoch": 1.3255184851217314, + "grad_norm": 1.556351900100708, + "learning_rate": 2.7908025247971148e-05, + "loss": 0.6378, + "step": 149940 + }, + { + "epoch": 1.3256068883820435, + "grad_norm": 7.783919811248779, + "learning_rate": 2.7906551860299273e-05, + "loss": 0.7176, + "step": 149950 + }, + { + "epoch": 1.3256952916423557, + "grad_norm": 3.308544635772705, + "learning_rate": 2.7905078472627405e-05, + "loss": 0.6186, + "step": 149960 + }, + { + "epoch": 1.325783694902668, + "grad_norm": 9.659269332885742, + "learning_rate": 2.7903605084955536e-05, + "loss": 0.6223, + "step": 149970 + }, + { + "epoch": 1.3258720981629804, + "grad_norm": 1.5997798442840576, + "learning_rate": 2.790213169728366e-05, + "loss": 0.6082, + "step": 149980 + }, + { + "epoch": 1.3259605014232925, + "grad_norm": 2.495012044906616, + "learning_rate": 2.7900658309611793e-05, + "loss": 0.6557, + "step": 149990 + }, + { + "epoch": 1.3260489046836048, + "grad_norm": 6.634875774383545, + "learning_rate": 2.7899184921939925e-05, + "loss": 0.6724, + "step": 150000 + }, + { + "epoch": 1.326137307943917, + "grad_norm": 1.9638028144836426, + "learning_rate": 2.789771153426805e-05, + "loss": 0.6969, + "step": 150010 + }, + { + "epoch": 1.3262257112042293, + "grad_norm": 1.1258223056793213, + "learning_rate": 2.789623814659618e-05, + "loss": 0.5816, + "step": 150020 + }, + { + "epoch": 1.3263141144645414, + "grad_norm": 13.562403678894043, + "learning_rate": 2.7894764758924306e-05, + "loss": 0.4861, + "step": 150030 + }, + { + "epoch": 1.3264025177248537, + "grad_norm": 4.885910511016846, + "learning_rate": 2.7893291371252438e-05, + "loss": 0.6611, + "step": 150040 + }, + { + "epoch": 1.3264909209851659, + "grad_norm": 10.201705932617188, + "learning_rate": 2.789181798358057e-05, + "loss": 0.6298, + "step": 150050 + }, + { + "epoch": 1.3265793242454782, + "grad_norm": 20.267230987548828, + "learning_rate": 2.7890344595908695e-05, + "loss": 0.6347, + "step": 150060 + }, + { + "epoch": 1.3266677275057903, + "grad_norm": 2.118298292160034, + "learning_rate": 2.7888871208236827e-05, + "loss": 0.6034, + "step": 150070 + }, + { + "epoch": 1.3267561307661027, + "grad_norm": 1.4050759077072144, + "learning_rate": 2.788739782056496e-05, + "loss": 0.5572, + "step": 150080 + }, + { + "epoch": 1.326844534026415, + "grad_norm": 14.408783912658691, + "learning_rate": 2.7885924432893083e-05, + "loss": 0.7032, + "step": 150090 + }, + { + "epoch": 1.3269329372867271, + "grad_norm": 4.709105014801025, + "learning_rate": 2.7884451045221215e-05, + "loss": 0.6664, + "step": 150100 + }, + { + "epoch": 1.3270213405470392, + "grad_norm": 2.413173198699951, + "learning_rate": 2.7882977657549347e-05, + "loss": 0.685, + "step": 150110 + }, + { + "epoch": 1.3271097438073516, + "grad_norm": 2.0782058238983154, + "learning_rate": 2.7881504269877472e-05, + "loss": 0.5607, + "step": 150120 + }, + { + "epoch": 1.327198147067664, + "grad_norm": 3.12268328666687, + "learning_rate": 2.7880030882205604e-05, + "loss": 0.6153, + "step": 150130 + }, + { + "epoch": 1.327286550327976, + "grad_norm": 0.6943034529685974, + "learning_rate": 2.7878557494533735e-05, + "loss": 0.4618, + "step": 150140 + }, + { + "epoch": 1.3273749535882884, + "grad_norm": 3.122631072998047, + "learning_rate": 2.787708410686186e-05, + "loss": 0.6025, + "step": 150150 + }, + { + "epoch": 1.3274633568486005, + "grad_norm": 4.157289981842041, + "learning_rate": 2.7875610719189992e-05, + "loss": 0.503, + "step": 150160 + }, + { + "epoch": 1.3275517601089128, + "grad_norm": 0.9801242351531982, + "learning_rate": 2.7874137331518117e-05, + "loss": 0.7158, + "step": 150170 + }, + { + "epoch": 1.327640163369225, + "grad_norm": 4.750873565673828, + "learning_rate": 2.787266394384625e-05, + "loss": 0.5517, + "step": 150180 + }, + { + "epoch": 1.3277285666295373, + "grad_norm": 2.4548332691192627, + "learning_rate": 2.787119055617438e-05, + "loss": 0.6068, + "step": 150190 + }, + { + "epoch": 1.3278169698898497, + "grad_norm": 1.4304416179656982, + "learning_rate": 2.7869717168502505e-05, + "loss": 0.701, + "step": 150200 + }, + { + "epoch": 1.3279053731501618, + "grad_norm": 3.8410096168518066, + "learning_rate": 2.7868243780830637e-05, + "loss": 0.502, + "step": 150210 + }, + { + "epoch": 1.327993776410474, + "grad_norm": 1.530448317527771, + "learning_rate": 2.786677039315877e-05, + "loss": 0.5823, + "step": 150220 + }, + { + "epoch": 1.3280821796707862, + "grad_norm": 1.39223051071167, + "learning_rate": 2.7865297005486894e-05, + "loss": 0.5787, + "step": 150230 + }, + { + "epoch": 1.3281705829310986, + "grad_norm": 2.7651476860046387, + "learning_rate": 2.7863823617815026e-05, + "loss": 0.667, + "step": 150240 + }, + { + "epoch": 1.3282589861914107, + "grad_norm": 2.3552401065826416, + "learning_rate": 2.7862350230143157e-05, + "loss": 0.5412, + "step": 150250 + }, + { + "epoch": 1.328347389451723, + "grad_norm": 1.4385193586349487, + "learning_rate": 2.7860876842471286e-05, + "loss": 0.6674, + "step": 150260 + }, + { + "epoch": 1.3284357927120352, + "grad_norm": 1.6505261659622192, + "learning_rate": 2.7859403454799414e-05, + "loss": 0.5412, + "step": 150270 + }, + { + "epoch": 1.3285241959723475, + "grad_norm": 1.7234666347503662, + "learning_rate": 2.7857930067127542e-05, + "loss": 0.6793, + "step": 150280 + }, + { + "epoch": 1.3286125992326596, + "grad_norm": 1.9166803359985352, + "learning_rate": 2.7856456679455674e-05, + "loss": 0.4958, + "step": 150290 + }, + { + "epoch": 1.328701002492972, + "grad_norm": 2.502293348312378, + "learning_rate": 2.7854983291783803e-05, + "loss": 0.5464, + "step": 150300 + }, + { + "epoch": 1.3287894057532843, + "grad_norm": 4.724217414855957, + "learning_rate": 2.785350990411193e-05, + "loss": 0.7003, + "step": 150310 + }, + { + "epoch": 1.3288778090135964, + "grad_norm": 2.1562182903289795, + "learning_rate": 2.7852036516440063e-05, + "loss": 0.5643, + "step": 150320 + }, + { + "epoch": 1.3289662122739085, + "grad_norm": 2.982635736465454, + "learning_rate": 2.785056312876819e-05, + "loss": 0.767, + "step": 150330 + }, + { + "epoch": 1.3290546155342209, + "grad_norm": 1.9141908884048462, + "learning_rate": 2.784908974109632e-05, + "loss": 0.604, + "step": 150340 + }, + { + "epoch": 1.3291430187945332, + "grad_norm": 4.531190872192383, + "learning_rate": 2.784761635342445e-05, + "loss": 0.7938, + "step": 150350 + }, + { + "epoch": 1.3292314220548453, + "grad_norm": 3.92395281791687, + "learning_rate": 2.784614296575258e-05, + "loss": 0.6323, + "step": 150360 + }, + { + "epoch": 1.3293198253151577, + "grad_norm": 1.299192190170288, + "learning_rate": 2.7844669578080708e-05, + "loss": 0.6143, + "step": 150370 + }, + { + "epoch": 1.3294082285754698, + "grad_norm": 7.759983062744141, + "learning_rate": 2.784319619040884e-05, + "loss": 0.6597, + "step": 150380 + }, + { + "epoch": 1.3294966318357821, + "grad_norm": 4.2938361167907715, + "learning_rate": 2.7841722802736965e-05, + "loss": 0.6369, + "step": 150390 + }, + { + "epoch": 1.3295850350960943, + "grad_norm": 27.397857666015625, + "learning_rate": 2.7840249415065096e-05, + "loss": 0.6365, + "step": 150400 + }, + { + "epoch": 1.3296734383564066, + "grad_norm": 1.1241165399551392, + "learning_rate": 2.7838776027393228e-05, + "loss": 0.5248, + "step": 150410 + }, + { + "epoch": 1.329761841616719, + "grad_norm": 5.163040637969971, + "learning_rate": 2.7837302639721353e-05, + "loss": 0.5497, + "step": 150420 + }, + { + "epoch": 1.329850244877031, + "grad_norm": 2.237762928009033, + "learning_rate": 2.7835829252049485e-05, + "loss": 0.6564, + "step": 150430 + }, + { + "epoch": 1.3299386481373432, + "grad_norm": 5.071065425872803, + "learning_rate": 2.7834355864377616e-05, + "loss": 0.5274, + "step": 150440 + }, + { + "epoch": 1.3300270513976555, + "grad_norm": 5.093015193939209, + "learning_rate": 2.783288247670574e-05, + "loss": 0.6916, + "step": 150450 + }, + { + "epoch": 1.3301154546579679, + "grad_norm": 1.2640419006347656, + "learning_rate": 2.7831409089033873e-05, + "loss": 0.6796, + "step": 150460 + }, + { + "epoch": 1.33020385791828, + "grad_norm": 24.784549713134766, + "learning_rate": 2.7829935701362005e-05, + "loss": 0.7552, + "step": 150470 + }, + { + "epoch": 1.3302922611785923, + "grad_norm": 2.175790548324585, + "learning_rate": 2.782846231369013e-05, + "loss": 0.5447, + "step": 150480 + }, + { + "epoch": 1.3303806644389045, + "grad_norm": 10.204320907592773, + "learning_rate": 2.782698892601826e-05, + "loss": 0.5061, + "step": 150490 + }, + { + "epoch": 1.3304690676992168, + "grad_norm": 1.5230896472930908, + "learning_rate": 2.7825515538346387e-05, + "loss": 0.6622, + "step": 150500 + }, + { + "epoch": 1.330557470959529, + "grad_norm": 2.734492063522339, + "learning_rate": 2.7824042150674518e-05, + "loss": 0.6224, + "step": 150510 + }, + { + "epoch": 1.3306458742198413, + "grad_norm": 3.1669328212738037, + "learning_rate": 2.782256876300265e-05, + "loss": 0.8145, + "step": 150520 + }, + { + "epoch": 1.3307342774801536, + "grad_norm": 1.6175446510314941, + "learning_rate": 2.7821095375330775e-05, + "loss": 0.4171, + "step": 150530 + }, + { + "epoch": 1.3308226807404657, + "grad_norm": 2.3916404247283936, + "learning_rate": 2.7819621987658907e-05, + "loss": 0.6381, + "step": 150540 + }, + { + "epoch": 1.3309110840007778, + "grad_norm": 6.5817108154296875, + "learning_rate": 2.781814859998704e-05, + "loss": 0.7172, + "step": 150550 + }, + { + "epoch": 1.3309994872610902, + "grad_norm": 2.919212579727173, + "learning_rate": 2.7816675212315163e-05, + "loss": 0.5731, + "step": 150560 + }, + { + "epoch": 1.3310878905214025, + "grad_norm": 2.1812314987182617, + "learning_rate": 2.7815201824643295e-05, + "loss": 0.6087, + "step": 150570 + }, + { + "epoch": 1.3311762937817146, + "grad_norm": 2.6658101081848145, + "learning_rate": 2.7813728436971427e-05, + "loss": 0.5804, + "step": 150580 + }, + { + "epoch": 1.331264697042027, + "grad_norm": 2.9856085777282715, + "learning_rate": 2.7812255049299552e-05, + "loss": 0.6247, + "step": 150590 + }, + { + "epoch": 1.331353100302339, + "grad_norm": 1.3335188627243042, + "learning_rate": 2.7810781661627684e-05, + "loss": 0.5555, + "step": 150600 + }, + { + "epoch": 1.3314415035626515, + "grad_norm": 3.168314218521118, + "learning_rate": 2.7809308273955815e-05, + "loss": 0.5928, + "step": 150610 + }, + { + "epoch": 1.3315299068229636, + "grad_norm": 3.5253214836120605, + "learning_rate": 2.780783488628394e-05, + "loss": 0.8322, + "step": 150620 + }, + { + "epoch": 1.331618310083276, + "grad_norm": 1.6161246299743652, + "learning_rate": 2.7806361498612072e-05, + "loss": 0.6468, + "step": 150630 + }, + { + "epoch": 1.331706713343588, + "grad_norm": 5.83316707611084, + "learning_rate": 2.7804888110940197e-05, + "loss": 0.6055, + "step": 150640 + }, + { + "epoch": 1.3317951166039004, + "grad_norm": 2.1565229892730713, + "learning_rate": 2.780341472326833e-05, + "loss": 0.6758, + "step": 150650 + }, + { + "epoch": 1.3318835198642125, + "grad_norm": 1.9444721937179565, + "learning_rate": 2.780194133559646e-05, + "loss": 0.6554, + "step": 150660 + }, + { + "epoch": 1.3319719231245248, + "grad_norm": 1.1099681854248047, + "learning_rate": 2.7800467947924586e-05, + "loss": 0.6158, + "step": 150670 + }, + { + "epoch": 1.3320603263848372, + "grad_norm": 2.171590566635132, + "learning_rate": 2.7798994560252717e-05, + "loss": 0.58, + "step": 150680 + }, + { + "epoch": 1.3321487296451493, + "grad_norm": 6.700047492980957, + "learning_rate": 2.779752117258085e-05, + "loss": 0.7676, + "step": 150690 + }, + { + "epoch": 1.3322371329054614, + "grad_norm": 2.354696750640869, + "learning_rate": 2.7796047784908974e-05, + "loss": 0.6369, + "step": 150700 + }, + { + "epoch": 1.3323255361657738, + "grad_norm": 3.3658783435821533, + "learning_rate": 2.7794574397237106e-05, + "loss": 0.4878, + "step": 150710 + }, + { + "epoch": 1.332413939426086, + "grad_norm": 2.440730571746826, + "learning_rate": 2.7793101009565237e-05, + "loss": 0.5149, + "step": 150720 + }, + { + "epoch": 1.3325023426863982, + "grad_norm": 3.293100595474243, + "learning_rate": 2.7791627621893362e-05, + "loss": 0.6807, + "step": 150730 + }, + { + "epoch": 1.3325907459467106, + "grad_norm": 1.672044277191162, + "learning_rate": 2.7790154234221494e-05, + "loss": 0.6752, + "step": 150740 + }, + { + "epoch": 1.3326791492070227, + "grad_norm": 1.8963091373443604, + "learning_rate": 2.778868084654962e-05, + "loss": 0.5227, + "step": 150750 + }, + { + "epoch": 1.332767552467335, + "grad_norm": 2.4444239139556885, + "learning_rate": 2.778720745887775e-05, + "loss": 0.5804, + "step": 150760 + }, + { + "epoch": 1.3328559557276471, + "grad_norm": 2.397829294204712, + "learning_rate": 2.7785734071205883e-05, + "loss": 0.6672, + "step": 150770 + }, + { + "epoch": 1.3329443589879595, + "grad_norm": 8.323978424072266, + "learning_rate": 2.7784260683534008e-05, + "loss": 0.6131, + "step": 150780 + }, + { + "epoch": 1.3330327622482718, + "grad_norm": 5.0845947265625, + "learning_rate": 2.778278729586214e-05, + "loss": 0.6846, + "step": 150790 + }, + { + "epoch": 1.333121165508584, + "grad_norm": 3.428443431854248, + "learning_rate": 2.778131390819027e-05, + "loss": 0.5184, + "step": 150800 + }, + { + "epoch": 1.333209568768896, + "grad_norm": 1.5157850980758667, + "learning_rate": 2.7779840520518396e-05, + "loss": 0.688, + "step": 150810 + }, + { + "epoch": 1.3332979720292084, + "grad_norm": 2.5739991664886475, + "learning_rate": 2.7778367132846528e-05, + "loss": 0.6031, + "step": 150820 + }, + { + "epoch": 1.3333863752895208, + "grad_norm": 3.486426830291748, + "learning_rate": 2.777689374517466e-05, + "loss": 0.7157, + "step": 150830 + }, + { + "epoch": 1.3334747785498329, + "grad_norm": 1.507509708404541, + "learning_rate": 2.7775420357502784e-05, + "loss": 0.5884, + "step": 150840 + }, + { + "epoch": 1.3335631818101452, + "grad_norm": 3.454697370529175, + "learning_rate": 2.7773946969830916e-05, + "loss": 0.6618, + "step": 150850 + }, + { + "epoch": 1.3336515850704573, + "grad_norm": 2.0091540813446045, + "learning_rate": 2.777247358215904e-05, + "loss": 0.6251, + "step": 150860 + }, + { + "epoch": 1.3337399883307697, + "grad_norm": 1.098272681236267, + "learning_rate": 2.7771000194487173e-05, + "loss": 0.5648, + "step": 150870 + }, + { + "epoch": 1.3338283915910818, + "grad_norm": 18.883617401123047, + "learning_rate": 2.7769526806815305e-05, + "loss": 0.5722, + "step": 150880 + }, + { + "epoch": 1.3339167948513941, + "grad_norm": 1.5560534000396729, + "learning_rate": 2.776805341914343e-05, + "loss": 0.5098, + "step": 150890 + }, + { + "epoch": 1.3340051981117065, + "grad_norm": 1.6480363607406616, + "learning_rate": 2.776658003147156e-05, + "loss": 0.6093, + "step": 150900 + }, + { + "epoch": 1.3340936013720186, + "grad_norm": 6.449525833129883, + "learning_rate": 2.7765106643799693e-05, + "loss": 0.6692, + "step": 150910 + }, + { + "epoch": 1.3341820046323307, + "grad_norm": 10.009638786315918, + "learning_rate": 2.7763633256127818e-05, + "loss": 0.5721, + "step": 150920 + }, + { + "epoch": 1.334270407892643, + "grad_norm": 4.691011428833008, + "learning_rate": 2.776215986845595e-05, + "loss": 0.7491, + "step": 150930 + }, + { + "epoch": 1.3343588111529554, + "grad_norm": 1.4988094568252563, + "learning_rate": 2.776068648078408e-05, + "loss": 0.6663, + "step": 150940 + }, + { + "epoch": 1.3344472144132675, + "grad_norm": 0.885347306728363, + "learning_rate": 2.7759213093112207e-05, + "loss": 0.6074, + "step": 150950 + }, + { + "epoch": 1.3345356176735799, + "grad_norm": 4.787322044372559, + "learning_rate": 2.7757739705440338e-05, + "loss": 0.6732, + "step": 150960 + }, + { + "epoch": 1.334624020933892, + "grad_norm": 2.074153423309326, + "learning_rate": 2.7756266317768463e-05, + "loss": 0.547, + "step": 150970 + }, + { + "epoch": 1.3347124241942043, + "grad_norm": 4.47625207901001, + "learning_rate": 2.7754792930096595e-05, + "loss": 0.5978, + "step": 150980 + }, + { + "epoch": 1.3348008274545164, + "grad_norm": 1.9079277515411377, + "learning_rate": 2.7753319542424727e-05, + "loss": 0.604, + "step": 150990 + }, + { + "epoch": 1.3348892307148288, + "grad_norm": 5.470847129821777, + "learning_rate": 2.775184615475285e-05, + "loss": 0.5262, + "step": 151000 + }, + { + "epoch": 1.3349776339751411, + "grad_norm": 1.6069300174713135, + "learning_rate": 2.7750372767080983e-05, + "loss": 0.5069, + "step": 151010 + }, + { + "epoch": 1.3350660372354533, + "grad_norm": 1.8883270025253296, + "learning_rate": 2.7748899379409115e-05, + "loss": 0.554, + "step": 151020 + }, + { + "epoch": 1.3351544404957654, + "grad_norm": 1.1260170936584473, + "learning_rate": 2.774742599173724e-05, + "loss": 0.5093, + "step": 151030 + }, + { + "epoch": 1.3352428437560777, + "grad_norm": 1.3982582092285156, + "learning_rate": 2.7745952604065372e-05, + "loss": 0.6464, + "step": 151040 + }, + { + "epoch": 1.33533124701639, + "grad_norm": 6.610744953155518, + "learning_rate": 2.7744479216393504e-05, + "loss": 0.5489, + "step": 151050 + }, + { + "epoch": 1.3354196502767022, + "grad_norm": 5.942538738250732, + "learning_rate": 2.774300582872163e-05, + "loss": 0.6697, + "step": 151060 + }, + { + "epoch": 1.3355080535370145, + "grad_norm": 1.7882872819900513, + "learning_rate": 2.774153244104976e-05, + "loss": 0.6564, + "step": 151070 + }, + { + "epoch": 1.3355964567973266, + "grad_norm": 1.5126078128814697, + "learning_rate": 2.7740059053377892e-05, + "loss": 0.5687, + "step": 151080 + }, + { + "epoch": 1.335684860057639, + "grad_norm": 8.000627517700195, + "learning_rate": 2.7738585665706017e-05, + "loss": 0.5835, + "step": 151090 + }, + { + "epoch": 1.335773263317951, + "grad_norm": 4.476342678070068, + "learning_rate": 2.773711227803415e-05, + "loss": 0.4402, + "step": 151100 + }, + { + "epoch": 1.3358616665782634, + "grad_norm": 1.719919204711914, + "learning_rate": 2.7735638890362277e-05, + "loss": 0.4522, + "step": 151110 + }, + { + "epoch": 1.3359500698385758, + "grad_norm": 3.59228515625, + "learning_rate": 2.7734165502690405e-05, + "loss": 0.4962, + "step": 151120 + }, + { + "epoch": 1.336038473098888, + "grad_norm": 1.5816543102264404, + "learning_rate": 2.7732692115018537e-05, + "loss": 0.6337, + "step": 151130 + }, + { + "epoch": 1.3361268763592, + "grad_norm": 3.1262903213500977, + "learning_rate": 2.7731218727346666e-05, + "loss": 0.6649, + "step": 151140 + }, + { + "epoch": 1.3362152796195124, + "grad_norm": 2.3579840660095215, + "learning_rate": 2.7729745339674794e-05, + "loss": 0.6421, + "step": 151150 + }, + { + "epoch": 1.3363036828798247, + "grad_norm": 2.981292963027954, + "learning_rate": 2.7728271952002926e-05, + "loss": 0.6245, + "step": 151160 + }, + { + "epoch": 1.3363920861401368, + "grad_norm": 1.7436307668685913, + "learning_rate": 2.7726798564331054e-05, + "loss": 0.4058, + "step": 151170 + }, + { + "epoch": 1.3364804894004492, + "grad_norm": 4.4524312019348145, + "learning_rate": 2.7725325176659182e-05, + "loss": 0.6272, + "step": 151180 + }, + { + "epoch": 1.3365688926607613, + "grad_norm": 8.533951759338379, + "learning_rate": 2.7723851788987314e-05, + "loss": 0.723, + "step": 151190 + }, + { + "epoch": 1.3366572959210736, + "grad_norm": 2.9351816177368164, + "learning_rate": 2.7722378401315442e-05, + "loss": 0.6489, + "step": 151200 + }, + { + "epoch": 1.3367456991813857, + "grad_norm": 2.2552337646484375, + "learning_rate": 2.772090501364357e-05, + "loss": 0.5402, + "step": 151210 + }, + { + "epoch": 1.336834102441698, + "grad_norm": 10.673256874084473, + "learning_rate": 2.77194316259717e-05, + "loss": 0.5567, + "step": 151220 + }, + { + "epoch": 1.3369225057020102, + "grad_norm": 3.3659915924072266, + "learning_rate": 2.771795823829983e-05, + "loss": 0.6305, + "step": 151230 + }, + { + "epoch": 1.3370109089623226, + "grad_norm": 1.2768335342407227, + "learning_rate": 2.771648485062796e-05, + "loss": 0.5264, + "step": 151240 + }, + { + "epoch": 1.3370993122226347, + "grad_norm": 21.0323486328125, + "learning_rate": 2.7715011462956088e-05, + "loss": 0.5905, + "step": 151250 + }, + { + "epoch": 1.337187715482947, + "grad_norm": 4.157071113586426, + "learning_rate": 2.771353807528422e-05, + "loss": 0.6217, + "step": 151260 + }, + { + "epoch": 1.3372761187432594, + "grad_norm": 5.403841018676758, + "learning_rate": 2.7712064687612348e-05, + "loss": 0.6086, + "step": 151270 + }, + { + "epoch": 1.3373645220035715, + "grad_norm": 8.941508293151855, + "learning_rate": 2.7710591299940476e-05, + "loss": 0.5825, + "step": 151280 + }, + { + "epoch": 1.3374529252638836, + "grad_norm": 0.8912774920463562, + "learning_rate": 2.7709117912268608e-05, + "loss": 0.5628, + "step": 151290 + }, + { + "epoch": 1.337541328524196, + "grad_norm": 4.192302703857422, + "learning_rate": 2.7707644524596736e-05, + "loss": 0.6462, + "step": 151300 + }, + { + "epoch": 1.3376297317845083, + "grad_norm": 2.739774227142334, + "learning_rate": 2.7706171136924865e-05, + "loss": 0.6257, + "step": 151310 + }, + { + "epoch": 1.3377181350448204, + "grad_norm": 4.046346187591553, + "learning_rate": 2.7704697749252996e-05, + "loss": 0.5678, + "step": 151320 + }, + { + "epoch": 1.3378065383051327, + "grad_norm": 2.878293037414551, + "learning_rate": 2.770322436158112e-05, + "loss": 0.6428, + "step": 151330 + }, + { + "epoch": 1.3378949415654449, + "grad_norm": 5.418063640594482, + "learning_rate": 2.7701750973909253e-05, + "loss": 0.6374, + "step": 151340 + }, + { + "epoch": 1.3379833448257572, + "grad_norm": 1.9555840492248535, + "learning_rate": 2.7700277586237385e-05, + "loss": 0.6421, + "step": 151350 + }, + { + "epoch": 1.3380717480860693, + "grad_norm": 3.0847692489624023, + "learning_rate": 2.769880419856551e-05, + "loss": 0.4415, + "step": 151360 + }, + { + "epoch": 1.3381601513463817, + "grad_norm": 1.3546347618103027, + "learning_rate": 2.769733081089364e-05, + "loss": 0.7576, + "step": 151370 + }, + { + "epoch": 1.338248554606694, + "grad_norm": 1.5298864841461182, + "learning_rate": 2.7695857423221773e-05, + "loss": 0.5746, + "step": 151380 + }, + { + "epoch": 1.3383369578670061, + "grad_norm": 2.744459867477417, + "learning_rate": 2.7694384035549898e-05, + "loss": 0.7344, + "step": 151390 + }, + { + "epoch": 1.3384253611273182, + "grad_norm": 1.2843074798583984, + "learning_rate": 2.769291064787803e-05, + "loss": 0.5737, + "step": 151400 + }, + { + "epoch": 1.3385137643876306, + "grad_norm": 1.539597749710083, + "learning_rate": 2.769143726020616e-05, + "loss": 0.6573, + "step": 151410 + }, + { + "epoch": 1.338602167647943, + "grad_norm": 9.883866310119629, + "learning_rate": 2.7689963872534287e-05, + "loss": 0.598, + "step": 151420 + }, + { + "epoch": 1.338690570908255, + "grad_norm": 1.1899820566177368, + "learning_rate": 2.768849048486242e-05, + "loss": 0.6141, + "step": 151430 + }, + { + "epoch": 1.3387789741685674, + "grad_norm": 3.887343168258667, + "learning_rate": 2.768701709719055e-05, + "loss": 0.6354, + "step": 151440 + }, + { + "epoch": 1.3388673774288795, + "grad_norm": 9.095852851867676, + "learning_rate": 2.7685543709518675e-05, + "loss": 0.554, + "step": 151450 + }, + { + "epoch": 1.3389557806891919, + "grad_norm": 1.7072582244873047, + "learning_rate": 2.7684070321846807e-05, + "loss": 0.6264, + "step": 151460 + }, + { + "epoch": 1.339044183949504, + "grad_norm": 2.419516086578369, + "learning_rate": 2.7682596934174932e-05, + "loss": 0.6815, + "step": 151470 + }, + { + "epoch": 1.3391325872098163, + "grad_norm": 11.242193222045898, + "learning_rate": 2.7681123546503064e-05, + "loss": 0.599, + "step": 151480 + }, + { + "epoch": 1.3392209904701287, + "grad_norm": 2.0562210083007812, + "learning_rate": 2.7679650158831195e-05, + "loss": 0.6353, + "step": 151490 + }, + { + "epoch": 1.3393093937304408, + "grad_norm": 5.687093257904053, + "learning_rate": 2.767817677115932e-05, + "loss": 0.5104, + "step": 151500 + }, + { + "epoch": 1.339397796990753, + "grad_norm": 1.5878093242645264, + "learning_rate": 2.7676703383487452e-05, + "loss": 0.5923, + "step": 151510 + }, + { + "epoch": 1.3394862002510652, + "grad_norm": 0.9044582843780518, + "learning_rate": 2.7675229995815584e-05, + "loss": 0.6265, + "step": 151520 + }, + { + "epoch": 1.3395746035113776, + "grad_norm": 3.482861042022705, + "learning_rate": 2.767375660814371e-05, + "loss": 0.6293, + "step": 151530 + }, + { + "epoch": 1.3396630067716897, + "grad_norm": 1.3492008447647095, + "learning_rate": 2.767228322047184e-05, + "loss": 0.5917, + "step": 151540 + }, + { + "epoch": 1.339751410032002, + "grad_norm": 6.387631416320801, + "learning_rate": 2.7670809832799972e-05, + "loss": 0.5207, + "step": 151550 + }, + { + "epoch": 1.3398398132923142, + "grad_norm": 6.104571342468262, + "learning_rate": 2.7669336445128097e-05, + "loss": 0.5676, + "step": 151560 + }, + { + "epoch": 1.3399282165526265, + "grad_norm": 11.630414962768555, + "learning_rate": 2.766786305745623e-05, + "loss": 0.5068, + "step": 151570 + }, + { + "epoch": 1.3400166198129386, + "grad_norm": 6.5402421951293945, + "learning_rate": 2.7666389669784354e-05, + "loss": 0.7379, + "step": 151580 + }, + { + "epoch": 1.340105023073251, + "grad_norm": 2.0985918045043945, + "learning_rate": 2.7664916282112486e-05, + "loss": 0.6612, + "step": 151590 + }, + { + "epoch": 1.3401934263335633, + "grad_norm": 2.4103410243988037, + "learning_rate": 2.7663442894440617e-05, + "loss": 0.5752, + "step": 151600 + }, + { + "epoch": 1.3402818295938754, + "grad_norm": 1.082924485206604, + "learning_rate": 2.7661969506768742e-05, + "loss": 0.5852, + "step": 151610 + }, + { + "epoch": 1.3403702328541875, + "grad_norm": 7.554436206817627, + "learning_rate": 2.7660496119096874e-05, + "loss": 0.588, + "step": 151620 + }, + { + "epoch": 1.3404586361145, + "grad_norm": 1.3122897148132324, + "learning_rate": 2.7659022731425006e-05, + "loss": 0.5607, + "step": 151630 + }, + { + "epoch": 1.3405470393748122, + "grad_norm": 1.1755517721176147, + "learning_rate": 2.765754934375313e-05, + "loss": 0.6246, + "step": 151640 + }, + { + "epoch": 1.3406354426351244, + "grad_norm": 2.0831754207611084, + "learning_rate": 2.7656075956081262e-05, + "loss": 0.6343, + "step": 151650 + }, + { + "epoch": 1.3407238458954367, + "grad_norm": 2.5282142162323, + "learning_rate": 2.7654602568409394e-05, + "loss": 0.6059, + "step": 151660 + }, + { + "epoch": 1.3408122491557488, + "grad_norm": 5.200699806213379, + "learning_rate": 2.765312918073752e-05, + "loss": 0.5954, + "step": 151670 + }, + { + "epoch": 1.3409006524160612, + "grad_norm": 2.8896148204803467, + "learning_rate": 2.765165579306565e-05, + "loss": 0.4922, + "step": 151680 + }, + { + "epoch": 1.3409890556763733, + "grad_norm": 1.2419229745864868, + "learning_rate": 2.7650182405393776e-05, + "loss": 0.5508, + "step": 151690 + }, + { + "epoch": 1.3410774589366856, + "grad_norm": 0.9479629397392273, + "learning_rate": 2.7648709017721908e-05, + "loss": 0.6371, + "step": 151700 + }, + { + "epoch": 1.341165862196998, + "grad_norm": 6.157392978668213, + "learning_rate": 2.764723563005004e-05, + "loss": 0.5914, + "step": 151710 + }, + { + "epoch": 1.34125426545731, + "grad_norm": 1.3271961212158203, + "learning_rate": 2.7645762242378164e-05, + "loss": 0.4498, + "step": 151720 + }, + { + "epoch": 1.3413426687176222, + "grad_norm": 1.174003005027771, + "learning_rate": 2.7644288854706296e-05, + "loss": 0.564, + "step": 151730 + }, + { + "epoch": 1.3414310719779345, + "grad_norm": 1.3384922742843628, + "learning_rate": 2.7642815467034428e-05, + "loss": 0.5889, + "step": 151740 + }, + { + "epoch": 1.3415194752382469, + "grad_norm": 4.7183661460876465, + "learning_rate": 2.7641342079362553e-05, + "loss": 0.5477, + "step": 151750 + }, + { + "epoch": 1.341607878498559, + "grad_norm": 3.477849006652832, + "learning_rate": 2.7639868691690685e-05, + "loss": 0.6169, + "step": 151760 + }, + { + "epoch": 1.3416962817588713, + "grad_norm": 1.535959243774414, + "learning_rate": 2.7638395304018816e-05, + "loss": 0.674, + "step": 151770 + }, + { + "epoch": 1.3417846850191835, + "grad_norm": 3.1993985176086426, + "learning_rate": 2.763692191634694e-05, + "loss": 0.6727, + "step": 151780 + }, + { + "epoch": 1.3418730882794958, + "grad_norm": 1.3129761219024658, + "learning_rate": 2.7635448528675073e-05, + "loss": 0.596, + "step": 151790 + }, + { + "epoch": 1.341961491539808, + "grad_norm": 9.249835014343262, + "learning_rate": 2.7633975141003198e-05, + "loss": 0.6903, + "step": 151800 + }, + { + "epoch": 1.3420498948001203, + "grad_norm": 4.096350193023682, + "learning_rate": 2.763250175333133e-05, + "loss": 0.5566, + "step": 151810 + }, + { + "epoch": 1.3421382980604324, + "grad_norm": 1.807973027229309, + "learning_rate": 2.763102836565946e-05, + "loss": 0.5722, + "step": 151820 + }, + { + "epoch": 1.3422267013207447, + "grad_norm": 2.0240368843078613, + "learning_rate": 2.7629554977987586e-05, + "loss": 0.6585, + "step": 151830 + }, + { + "epoch": 1.3423151045810569, + "grad_norm": 16.842954635620117, + "learning_rate": 2.7628081590315718e-05, + "loss": 0.5216, + "step": 151840 + }, + { + "epoch": 1.3424035078413692, + "grad_norm": 2.1680495738983154, + "learning_rate": 2.762660820264385e-05, + "loss": 0.6971, + "step": 151850 + }, + { + "epoch": 1.3424919111016815, + "grad_norm": 0.9763355255126953, + "learning_rate": 2.7625134814971975e-05, + "loss": 0.4742, + "step": 151860 + }, + { + "epoch": 1.3425803143619937, + "grad_norm": 2.114156484603882, + "learning_rate": 2.7623661427300107e-05, + "loss": 0.7132, + "step": 151870 + }, + { + "epoch": 1.3426687176223058, + "grad_norm": 2.558688163757324, + "learning_rate": 2.762218803962824e-05, + "loss": 0.6599, + "step": 151880 + }, + { + "epoch": 1.3427571208826181, + "grad_norm": 6.453088760375977, + "learning_rate": 2.7620714651956363e-05, + "loss": 0.6011, + "step": 151890 + }, + { + "epoch": 1.3428455241429305, + "grad_norm": 2.327014207839966, + "learning_rate": 2.7619241264284495e-05, + "loss": 0.5812, + "step": 151900 + }, + { + "epoch": 1.3429339274032426, + "grad_norm": 1.6957920789718628, + "learning_rate": 2.7617767876612627e-05, + "loss": 0.7609, + "step": 151910 + }, + { + "epoch": 1.343022330663555, + "grad_norm": 1.9946612119674683, + "learning_rate": 2.7616294488940752e-05, + "loss": 0.6668, + "step": 151920 + }, + { + "epoch": 1.343110733923867, + "grad_norm": 2.5781185626983643, + "learning_rate": 2.7614821101268883e-05, + "loss": 0.5896, + "step": 151930 + }, + { + "epoch": 1.3431991371841794, + "grad_norm": 3.432629108428955, + "learning_rate": 2.761334771359701e-05, + "loss": 0.5381, + "step": 151940 + }, + { + "epoch": 1.3432875404444915, + "grad_norm": 7.795658111572266, + "learning_rate": 2.761187432592514e-05, + "loss": 0.7388, + "step": 151950 + }, + { + "epoch": 1.3433759437048038, + "grad_norm": 1.2835272550582886, + "learning_rate": 2.7610400938253272e-05, + "loss": 0.5521, + "step": 151960 + }, + { + "epoch": 1.3434643469651162, + "grad_norm": 2.343592643737793, + "learning_rate": 2.7608927550581397e-05, + "loss": 0.5599, + "step": 151970 + }, + { + "epoch": 1.3435527502254283, + "grad_norm": 1.9953981637954712, + "learning_rate": 2.760745416290953e-05, + "loss": 0.5386, + "step": 151980 + }, + { + "epoch": 1.3436411534857404, + "grad_norm": 4.4138360023498535, + "learning_rate": 2.760598077523766e-05, + "loss": 0.6223, + "step": 151990 + }, + { + "epoch": 1.3437295567460528, + "grad_norm": 3.9960412979125977, + "learning_rate": 2.7604507387565785e-05, + "loss": 0.5672, + "step": 152000 + }, + { + "epoch": 1.343817960006365, + "grad_norm": 1.3082555532455444, + "learning_rate": 2.7603033999893917e-05, + "loss": 0.5066, + "step": 152010 + }, + { + "epoch": 1.3439063632666772, + "grad_norm": 6.851284027099609, + "learning_rate": 2.760156061222205e-05, + "loss": 0.7054, + "step": 152020 + }, + { + "epoch": 1.3439947665269896, + "grad_norm": 6.909270286560059, + "learning_rate": 2.7600087224550174e-05, + "loss": 0.6548, + "step": 152030 + }, + { + "epoch": 1.3440831697873017, + "grad_norm": 24.7659912109375, + "learning_rate": 2.7598613836878306e-05, + "loss": 0.6857, + "step": 152040 + }, + { + "epoch": 1.344171573047614, + "grad_norm": 1.9135289192199707, + "learning_rate": 2.7597140449206434e-05, + "loss": 0.6329, + "step": 152050 + }, + { + "epoch": 1.3442599763079262, + "grad_norm": 1.4441651105880737, + "learning_rate": 2.7595667061534562e-05, + "loss": 0.5593, + "step": 152060 + }, + { + "epoch": 1.3443483795682385, + "grad_norm": 1.8499969244003296, + "learning_rate": 2.7594193673862694e-05, + "loss": 0.57, + "step": 152070 + }, + { + "epoch": 1.3444367828285508, + "grad_norm": 8.440248489379883, + "learning_rate": 2.7592720286190822e-05, + "loss": 0.5884, + "step": 152080 + }, + { + "epoch": 1.344525186088863, + "grad_norm": 1.3343907594680786, + "learning_rate": 2.759124689851895e-05, + "loss": 0.6163, + "step": 152090 + }, + { + "epoch": 1.344613589349175, + "grad_norm": 3.77996826171875, + "learning_rate": 2.7589773510847082e-05, + "loss": 0.6739, + "step": 152100 + }, + { + "epoch": 1.3447019926094874, + "grad_norm": 2.175771951675415, + "learning_rate": 2.758830012317521e-05, + "loss": 0.7437, + "step": 152110 + }, + { + "epoch": 1.3447903958697998, + "grad_norm": 1.1218180656433105, + "learning_rate": 2.758682673550334e-05, + "loss": 0.6503, + "step": 152120 + }, + { + "epoch": 1.3448787991301119, + "grad_norm": 3.5241992473602295, + "learning_rate": 2.758535334783147e-05, + "loss": 0.4788, + "step": 152130 + }, + { + "epoch": 1.3449672023904242, + "grad_norm": 3.1352739334106445, + "learning_rate": 2.75838799601596e-05, + "loss": 0.5824, + "step": 152140 + }, + { + "epoch": 1.3450556056507363, + "grad_norm": 2.206943988800049, + "learning_rate": 2.7582406572487728e-05, + "loss": 0.5998, + "step": 152150 + }, + { + "epoch": 1.3451440089110487, + "grad_norm": 1.5829147100448608, + "learning_rate": 2.7580933184815856e-05, + "loss": 0.6989, + "step": 152160 + }, + { + "epoch": 1.3452324121713608, + "grad_norm": 2.106628179550171, + "learning_rate": 2.7579459797143988e-05, + "loss": 0.5974, + "step": 152170 + }, + { + "epoch": 1.3453208154316731, + "grad_norm": 6.176105976104736, + "learning_rate": 2.7577986409472116e-05, + "loss": 0.7609, + "step": 152180 + }, + { + "epoch": 1.3454092186919855, + "grad_norm": 1.2982898950576782, + "learning_rate": 2.7576513021800244e-05, + "loss": 0.5646, + "step": 152190 + }, + { + "epoch": 1.3454976219522976, + "grad_norm": 0.9172103404998779, + "learning_rate": 2.7575039634128376e-05, + "loss": 0.6589, + "step": 152200 + }, + { + "epoch": 1.3455860252126097, + "grad_norm": 2.8577544689178467, + "learning_rate": 2.7573566246456504e-05, + "loss": 0.5439, + "step": 152210 + }, + { + "epoch": 1.345674428472922, + "grad_norm": 1.339755654335022, + "learning_rate": 2.7572092858784633e-05, + "loss": 0.6511, + "step": 152220 + }, + { + "epoch": 1.3457628317332344, + "grad_norm": 4.50508451461792, + "learning_rate": 2.7570619471112765e-05, + "loss": 0.585, + "step": 152230 + }, + { + "epoch": 1.3458512349935465, + "grad_norm": 1.6642866134643555, + "learning_rate": 2.7569146083440893e-05, + "loss": 0.6008, + "step": 152240 + }, + { + "epoch": 1.3459396382538589, + "grad_norm": 2.5571939945220947, + "learning_rate": 2.756767269576902e-05, + "loss": 0.6415, + "step": 152250 + }, + { + "epoch": 1.346028041514171, + "grad_norm": 2.508664608001709, + "learning_rate": 2.7566199308097153e-05, + "loss": 0.6126, + "step": 152260 + }, + { + "epoch": 1.3461164447744833, + "grad_norm": 2.102313280105591, + "learning_rate": 2.7564725920425278e-05, + "loss": 0.6892, + "step": 152270 + }, + { + "epoch": 1.3462048480347955, + "grad_norm": 1.6047399044036865, + "learning_rate": 2.756325253275341e-05, + "loss": 0.597, + "step": 152280 + }, + { + "epoch": 1.3462932512951078, + "grad_norm": 3.6406137943267822, + "learning_rate": 2.756177914508154e-05, + "loss": 0.5922, + "step": 152290 + }, + { + "epoch": 1.3463816545554201, + "grad_norm": 3.566967010498047, + "learning_rate": 2.7560305757409666e-05, + "loss": 0.6462, + "step": 152300 + }, + { + "epoch": 1.3464700578157323, + "grad_norm": 1.0806243419647217, + "learning_rate": 2.7558832369737798e-05, + "loss": 0.5552, + "step": 152310 + }, + { + "epoch": 1.3465584610760444, + "grad_norm": 23.99992561340332, + "learning_rate": 2.755735898206593e-05, + "loss": 0.6007, + "step": 152320 + }, + { + "epoch": 1.3466468643363567, + "grad_norm": 4.343580722808838, + "learning_rate": 2.7555885594394055e-05, + "loss": 0.6207, + "step": 152330 + }, + { + "epoch": 1.346735267596669, + "grad_norm": 3.176987409591675, + "learning_rate": 2.7554412206722187e-05, + "loss": 0.5343, + "step": 152340 + }, + { + "epoch": 1.3468236708569812, + "grad_norm": 1.0751079320907593, + "learning_rate": 2.755293881905032e-05, + "loss": 0.5602, + "step": 152350 + }, + { + "epoch": 1.3469120741172935, + "grad_norm": 2.112555742263794, + "learning_rate": 2.7551465431378443e-05, + "loss": 0.4778, + "step": 152360 + }, + { + "epoch": 1.3470004773776056, + "grad_norm": 1.2999556064605713, + "learning_rate": 2.7549992043706575e-05, + "loss": 0.4669, + "step": 152370 + }, + { + "epoch": 1.347088880637918, + "grad_norm": 2.515124559402466, + "learning_rate": 2.7548518656034707e-05, + "loss": 0.585, + "step": 152380 + }, + { + "epoch": 1.34717728389823, + "grad_norm": 19.932363510131836, + "learning_rate": 2.7547045268362832e-05, + "loss": 0.4657, + "step": 152390 + }, + { + "epoch": 1.3472656871585424, + "grad_norm": 1.0412105321884155, + "learning_rate": 2.7545571880690964e-05, + "loss": 0.5654, + "step": 152400 + }, + { + "epoch": 1.3473540904188546, + "grad_norm": 4.872016429901123, + "learning_rate": 2.754409849301909e-05, + "loss": 0.6819, + "step": 152410 + }, + { + "epoch": 1.347442493679167, + "grad_norm": 3.62040114402771, + "learning_rate": 2.754262510534722e-05, + "loss": 0.5932, + "step": 152420 + }, + { + "epoch": 1.347530896939479, + "grad_norm": 10.900782585144043, + "learning_rate": 2.7541151717675352e-05, + "loss": 0.5669, + "step": 152430 + }, + { + "epoch": 1.3476193001997914, + "grad_norm": 13.340954780578613, + "learning_rate": 2.7539678330003477e-05, + "loss": 0.6505, + "step": 152440 + }, + { + "epoch": 1.3477077034601037, + "grad_norm": 1.7349427938461304, + "learning_rate": 2.753820494233161e-05, + "loss": 0.5847, + "step": 152450 + }, + { + "epoch": 1.3477961067204158, + "grad_norm": 9.308643341064453, + "learning_rate": 2.753673155465974e-05, + "loss": 0.8377, + "step": 152460 + }, + { + "epoch": 1.347884509980728, + "grad_norm": 3.295382499694824, + "learning_rate": 2.7535258166987865e-05, + "loss": 0.5619, + "step": 152470 + }, + { + "epoch": 1.3479729132410403, + "grad_norm": 18.71682357788086, + "learning_rate": 2.7533784779315997e-05, + "loss": 0.6208, + "step": 152480 + }, + { + "epoch": 1.3480613165013526, + "grad_norm": 14.244182586669922, + "learning_rate": 2.753231139164413e-05, + "loss": 0.6576, + "step": 152490 + }, + { + "epoch": 1.3481497197616648, + "grad_norm": 1.7026153802871704, + "learning_rate": 2.7530838003972254e-05, + "loss": 0.565, + "step": 152500 + }, + { + "epoch": 1.348238123021977, + "grad_norm": 1.675563097000122, + "learning_rate": 2.7529364616300386e-05, + "loss": 0.7649, + "step": 152510 + }, + { + "epoch": 1.3483265262822892, + "grad_norm": 1.9450074434280396, + "learning_rate": 2.752789122862851e-05, + "loss": 0.8025, + "step": 152520 + }, + { + "epoch": 1.3484149295426016, + "grad_norm": 1.710023045539856, + "learning_rate": 2.7526417840956642e-05, + "loss": 0.5054, + "step": 152530 + }, + { + "epoch": 1.3485033328029137, + "grad_norm": 2.0094408988952637, + "learning_rate": 2.7524944453284774e-05, + "loss": 0.5589, + "step": 152540 + }, + { + "epoch": 1.348591736063226, + "grad_norm": 1.119165301322937, + "learning_rate": 2.75234710656129e-05, + "loss": 0.5645, + "step": 152550 + }, + { + "epoch": 1.3486801393235384, + "grad_norm": 3.8932783603668213, + "learning_rate": 2.752199767794103e-05, + "loss": 0.4985, + "step": 152560 + }, + { + "epoch": 1.3487685425838505, + "grad_norm": 2.324028253555298, + "learning_rate": 2.7520524290269163e-05, + "loss": 0.717, + "step": 152570 + }, + { + "epoch": 1.3488569458441626, + "grad_norm": 9.660073280334473, + "learning_rate": 2.7519050902597287e-05, + "loss": 0.5335, + "step": 152580 + }, + { + "epoch": 1.348945349104475, + "grad_norm": 3.22464656829834, + "learning_rate": 2.751757751492542e-05, + "loss": 0.5849, + "step": 152590 + }, + { + "epoch": 1.3490337523647873, + "grad_norm": 1.327293872833252, + "learning_rate": 2.751610412725355e-05, + "loss": 0.5519, + "step": 152600 + }, + { + "epoch": 1.3491221556250994, + "grad_norm": 12.765607833862305, + "learning_rate": 2.7514630739581676e-05, + "loss": 0.5948, + "step": 152610 + }, + { + "epoch": 1.3492105588854117, + "grad_norm": 2.0518267154693604, + "learning_rate": 2.7513157351909808e-05, + "loss": 0.6375, + "step": 152620 + }, + { + "epoch": 1.3492989621457239, + "grad_norm": 1.2443517446517944, + "learning_rate": 2.7511683964237933e-05, + "loss": 0.6497, + "step": 152630 + }, + { + "epoch": 1.3493873654060362, + "grad_norm": 13.505154609680176, + "learning_rate": 2.7510210576566064e-05, + "loss": 0.6146, + "step": 152640 + }, + { + "epoch": 1.3494757686663483, + "grad_norm": 2.0449235439300537, + "learning_rate": 2.7508737188894196e-05, + "loss": 0.661, + "step": 152650 + }, + { + "epoch": 1.3495641719266607, + "grad_norm": 1.1711230278015137, + "learning_rate": 2.750726380122232e-05, + "loss": 0.6156, + "step": 152660 + }, + { + "epoch": 1.349652575186973, + "grad_norm": 2.5636491775512695, + "learning_rate": 2.7505790413550453e-05, + "loss": 0.6465, + "step": 152670 + }, + { + "epoch": 1.3497409784472851, + "grad_norm": 1.8045976161956787, + "learning_rate": 2.7504317025878585e-05, + "loss": 0.5404, + "step": 152680 + }, + { + "epoch": 1.3498293817075973, + "grad_norm": 1.880570650100708, + "learning_rate": 2.750284363820671e-05, + "loss": 0.5684, + "step": 152690 + }, + { + "epoch": 1.3499177849679096, + "grad_norm": 0.8692885637283325, + "learning_rate": 2.750137025053484e-05, + "loss": 0.5905, + "step": 152700 + }, + { + "epoch": 1.350006188228222, + "grad_norm": 1.9509234428405762, + "learning_rate": 2.7499896862862973e-05, + "loss": 0.6271, + "step": 152710 + }, + { + "epoch": 1.350094591488534, + "grad_norm": 1.200966715812683, + "learning_rate": 2.7498423475191098e-05, + "loss": 0.5505, + "step": 152720 + }, + { + "epoch": 1.3501829947488464, + "grad_norm": 2.834299087524414, + "learning_rate": 2.749695008751923e-05, + "loss": 0.6298, + "step": 152730 + }, + { + "epoch": 1.3502713980091585, + "grad_norm": 4.70060396194458, + "learning_rate": 2.7495476699847355e-05, + "loss": 0.7122, + "step": 152740 + }, + { + "epoch": 1.3503598012694709, + "grad_norm": 5.745482444763184, + "learning_rate": 2.7494003312175486e-05, + "loss": 0.55, + "step": 152750 + }, + { + "epoch": 1.350448204529783, + "grad_norm": 1.6211899518966675, + "learning_rate": 2.7492529924503618e-05, + "loss": 0.608, + "step": 152760 + }, + { + "epoch": 1.3505366077900953, + "grad_norm": 3.1293649673461914, + "learning_rate": 2.7491056536831743e-05, + "loss": 0.593, + "step": 152770 + }, + { + "epoch": 1.3506250110504077, + "grad_norm": 1.8907068967819214, + "learning_rate": 2.7489583149159875e-05, + "loss": 0.6112, + "step": 152780 + }, + { + "epoch": 1.3507134143107198, + "grad_norm": 1.62534499168396, + "learning_rate": 2.7488109761488007e-05, + "loss": 0.6411, + "step": 152790 + }, + { + "epoch": 1.350801817571032, + "grad_norm": 1.6003910303115845, + "learning_rate": 2.748663637381613e-05, + "loss": 0.5716, + "step": 152800 + }, + { + "epoch": 1.3508902208313442, + "grad_norm": 3.8206429481506348, + "learning_rate": 2.7485162986144263e-05, + "loss": 0.6705, + "step": 152810 + }, + { + "epoch": 1.3509786240916566, + "grad_norm": 2.7738959789276123, + "learning_rate": 2.7483689598472395e-05, + "loss": 0.5765, + "step": 152820 + }, + { + "epoch": 1.3510670273519687, + "grad_norm": 13.040949821472168, + "learning_rate": 2.748221621080052e-05, + "loss": 0.6266, + "step": 152830 + }, + { + "epoch": 1.351155430612281, + "grad_norm": 3.785677194595337, + "learning_rate": 2.7480742823128652e-05, + "loss": 0.6482, + "step": 152840 + }, + { + "epoch": 1.3512438338725932, + "grad_norm": 2.183802366256714, + "learning_rate": 2.7479269435456784e-05, + "loss": 0.6982, + "step": 152850 + }, + { + "epoch": 1.3513322371329055, + "grad_norm": 1.7078607082366943, + "learning_rate": 2.747779604778491e-05, + "loss": 0.5505, + "step": 152860 + }, + { + "epoch": 1.3514206403932176, + "grad_norm": 2.5055553913116455, + "learning_rate": 2.747632266011304e-05, + "loss": 0.5533, + "step": 152870 + }, + { + "epoch": 1.35150904365353, + "grad_norm": 2.6654164791107178, + "learning_rate": 2.7474849272441165e-05, + "loss": 0.6827, + "step": 152880 + }, + { + "epoch": 1.3515974469138423, + "grad_norm": 2.8913371562957764, + "learning_rate": 2.7473375884769297e-05, + "loss": 0.5377, + "step": 152890 + }, + { + "epoch": 1.3516858501741544, + "grad_norm": 0.7278856635093689, + "learning_rate": 2.747190249709743e-05, + "loss": 0.4603, + "step": 152900 + }, + { + "epoch": 1.3517742534344666, + "grad_norm": 1.6657037734985352, + "learning_rate": 2.7470429109425554e-05, + "loss": 0.5335, + "step": 152910 + }, + { + "epoch": 1.351862656694779, + "grad_norm": 3.048320770263672, + "learning_rate": 2.7468955721753685e-05, + "loss": 0.6495, + "step": 152920 + }, + { + "epoch": 1.3519510599550912, + "grad_norm": 1.6921703815460205, + "learning_rate": 2.7467482334081817e-05, + "loss": 0.5694, + "step": 152930 + }, + { + "epoch": 1.3520394632154034, + "grad_norm": 4.21888542175293, + "learning_rate": 2.7466008946409942e-05, + "loss": 0.5306, + "step": 152940 + }, + { + "epoch": 1.3521278664757157, + "grad_norm": 1.5649628639221191, + "learning_rate": 2.7464535558738074e-05, + "loss": 0.7282, + "step": 152950 + }, + { + "epoch": 1.3522162697360278, + "grad_norm": 13.663272857666016, + "learning_rate": 2.7463062171066206e-05, + "loss": 0.7725, + "step": 152960 + }, + { + "epoch": 1.3523046729963402, + "grad_norm": 7.526825904846191, + "learning_rate": 2.746158878339433e-05, + "loss": 0.6897, + "step": 152970 + }, + { + "epoch": 1.3523930762566523, + "grad_norm": 1.556532621383667, + "learning_rate": 2.7460115395722462e-05, + "loss": 0.6829, + "step": 152980 + }, + { + "epoch": 1.3524814795169646, + "grad_norm": 7.2153000831604, + "learning_rate": 2.745864200805059e-05, + "loss": 0.5147, + "step": 152990 + }, + { + "epoch": 1.3525698827772767, + "grad_norm": 1.6668345928192139, + "learning_rate": 2.745716862037872e-05, + "loss": 0.57, + "step": 153000 + }, + { + "epoch": 1.352658286037589, + "grad_norm": 5.36071252822876, + "learning_rate": 2.745569523270685e-05, + "loss": 0.5818, + "step": 153010 + }, + { + "epoch": 1.3527466892979012, + "grad_norm": 3.1386351585388184, + "learning_rate": 2.745422184503498e-05, + "loss": 0.6623, + "step": 153020 + }, + { + "epoch": 1.3528350925582135, + "grad_norm": 1.5236743688583374, + "learning_rate": 2.7452748457363107e-05, + "loss": 0.4788, + "step": 153030 + }, + { + "epoch": 1.352923495818526, + "grad_norm": 1.3319661617279053, + "learning_rate": 2.745127506969124e-05, + "loss": 0.6293, + "step": 153040 + }, + { + "epoch": 1.353011899078838, + "grad_norm": 2.216249942779541, + "learning_rate": 2.7449801682019368e-05, + "loss": 0.7444, + "step": 153050 + }, + { + "epoch": 1.3531003023391501, + "grad_norm": 4.037188529968262, + "learning_rate": 2.7448328294347496e-05, + "loss": 0.6448, + "step": 153060 + }, + { + "epoch": 1.3531887055994625, + "grad_norm": 1.6134544610977173, + "learning_rate": 2.7446854906675628e-05, + "loss": 0.5382, + "step": 153070 + }, + { + "epoch": 1.3532771088597748, + "grad_norm": 12.962087631225586, + "learning_rate": 2.7445381519003756e-05, + "loss": 0.6484, + "step": 153080 + }, + { + "epoch": 1.353365512120087, + "grad_norm": 1.941424012184143, + "learning_rate": 2.7443908131331884e-05, + "loss": 0.61, + "step": 153090 + }, + { + "epoch": 1.3534539153803993, + "grad_norm": 7.85158634185791, + "learning_rate": 2.7442434743660013e-05, + "loss": 0.6142, + "step": 153100 + }, + { + "epoch": 1.3535423186407114, + "grad_norm": 17.405946731567383, + "learning_rate": 2.7440961355988144e-05, + "loss": 0.64, + "step": 153110 + }, + { + "epoch": 1.3536307219010237, + "grad_norm": 2.0815839767456055, + "learning_rate": 2.7439487968316273e-05, + "loss": 0.5992, + "step": 153120 + }, + { + "epoch": 1.3537191251613359, + "grad_norm": 1.178189754486084, + "learning_rate": 2.74380145806444e-05, + "loss": 0.6646, + "step": 153130 + }, + { + "epoch": 1.3538075284216482, + "grad_norm": 4.812051296234131, + "learning_rate": 2.7436541192972533e-05, + "loss": 0.6036, + "step": 153140 + }, + { + "epoch": 1.3538959316819605, + "grad_norm": 8.786940574645996, + "learning_rate": 2.743506780530066e-05, + "loss": 0.617, + "step": 153150 + }, + { + "epoch": 1.3539843349422727, + "grad_norm": 3.088794231414795, + "learning_rate": 2.743359441762879e-05, + "loss": 0.5355, + "step": 153160 + }, + { + "epoch": 1.3540727382025848, + "grad_norm": 1.9327174425125122, + "learning_rate": 2.743212102995692e-05, + "loss": 0.4877, + "step": 153170 + }, + { + "epoch": 1.3541611414628971, + "grad_norm": 17.121843338012695, + "learning_rate": 2.743064764228505e-05, + "loss": 0.6143, + "step": 153180 + }, + { + "epoch": 1.3542495447232095, + "grad_norm": 1.1468859910964966, + "learning_rate": 2.7429174254613178e-05, + "loss": 0.5918, + "step": 153190 + }, + { + "epoch": 1.3543379479835216, + "grad_norm": 0.9244375228881836, + "learning_rate": 2.742770086694131e-05, + "loss": 0.636, + "step": 153200 + }, + { + "epoch": 1.354426351243834, + "grad_norm": 1.7848541736602783, + "learning_rate": 2.7426227479269435e-05, + "loss": 0.4962, + "step": 153210 + }, + { + "epoch": 1.354514754504146, + "grad_norm": 2.681871175765991, + "learning_rate": 2.7424754091597566e-05, + "loss": 0.5631, + "step": 153220 + }, + { + "epoch": 1.3546031577644584, + "grad_norm": 1.842670202255249, + "learning_rate": 2.7423280703925698e-05, + "loss": 0.6412, + "step": 153230 + }, + { + "epoch": 1.3546915610247705, + "grad_norm": 3.063415765762329, + "learning_rate": 2.7421807316253823e-05, + "loss": 0.61, + "step": 153240 + }, + { + "epoch": 1.3547799642850828, + "grad_norm": 13.67172622680664, + "learning_rate": 2.7420333928581955e-05, + "loss": 0.8023, + "step": 153250 + }, + { + "epoch": 1.3548683675453952, + "grad_norm": 2.581256866455078, + "learning_rate": 2.7418860540910087e-05, + "loss": 0.4521, + "step": 153260 + }, + { + "epoch": 1.3549567708057073, + "grad_norm": 0.9682684540748596, + "learning_rate": 2.741738715323821e-05, + "loss": 0.6063, + "step": 153270 + }, + { + "epoch": 1.3550451740660194, + "grad_norm": 2.6267082691192627, + "learning_rate": 2.7415913765566343e-05, + "loss": 0.5755, + "step": 153280 + }, + { + "epoch": 1.3551335773263318, + "grad_norm": 14.42762279510498, + "learning_rate": 2.7414440377894475e-05, + "loss": 0.578, + "step": 153290 + }, + { + "epoch": 1.3552219805866441, + "grad_norm": 7.375821113586426, + "learning_rate": 2.74129669902226e-05, + "loss": 0.544, + "step": 153300 + }, + { + "epoch": 1.3553103838469562, + "grad_norm": 3.555516242980957, + "learning_rate": 2.7411493602550732e-05, + "loss": 0.6956, + "step": 153310 + }, + { + "epoch": 1.3553987871072686, + "grad_norm": 2.0015869140625, + "learning_rate": 2.7410020214878864e-05, + "loss": 0.6575, + "step": 153320 + }, + { + "epoch": 1.3554871903675807, + "grad_norm": 3.1677801609039307, + "learning_rate": 2.740854682720699e-05, + "loss": 0.6726, + "step": 153330 + }, + { + "epoch": 1.355575593627893, + "grad_norm": 1.8652198314666748, + "learning_rate": 2.740707343953512e-05, + "loss": 0.7265, + "step": 153340 + }, + { + "epoch": 1.3556639968882052, + "grad_norm": 5.433342933654785, + "learning_rate": 2.7405600051863245e-05, + "loss": 0.585, + "step": 153350 + }, + { + "epoch": 1.3557524001485175, + "grad_norm": 9.56826114654541, + "learning_rate": 2.7404126664191377e-05, + "loss": 0.6202, + "step": 153360 + }, + { + "epoch": 1.3558408034088298, + "grad_norm": 1.6164175271987915, + "learning_rate": 2.740265327651951e-05, + "loss": 0.7429, + "step": 153370 + }, + { + "epoch": 1.355929206669142, + "grad_norm": 2.347870111465454, + "learning_rate": 2.7401179888847634e-05, + "loss": 0.6675, + "step": 153380 + }, + { + "epoch": 1.356017609929454, + "grad_norm": 4.753960609436035, + "learning_rate": 2.7399706501175765e-05, + "loss": 0.6834, + "step": 153390 + }, + { + "epoch": 1.3561060131897664, + "grad_norm": 4.895615100860596, + "learning_rate": 2.7398233113503897e-05, + "loss": 0.6348, + "step": 153400 + }, + { + "epoch": 1.3561944164500788, + "grad_norm": 1.5488193035125732, + "learning_rate": 2.7396759725832022e-05, + "loss": 0.6819, + "step": 153410 + }, + { + "epoch": 1.3562828197103909, + "grad_norm": 3.036613941192627, + "learning_rate": 2.7395286338160154e-05, + "loss": 0.5595, + "step": 153420 + }, + { + "epoch": 1.3563712229707032, + "grad_norm": 3.7243666648864746, + "learning_rate": 2.7393812950488286e-05, + "loss": 0.6052, + "step": 153430 + }, + { + "epoch": 1.3564596262310153, + "grad_norm": 3.369673013687134, + "learning_rate": 2.739233956281641e-05, + "loss": 0.7209, + "step": 153440 + }, + { + "epoch": 1.3565480294913277, + "grad_norm": 1.6963940858840942, + "learning_rate": 2.7390866175144542e-05, + "loss": 0.7019, + "step": 153450 + }, + { + "epoch": 1.3566364327516398, + "grad_norm": 3.1766574382781982, + "learning_rate": 2.7389392787472667e-05, + "loss": 0.4618, + "step": 153460 + }, + { + "epoch": 1.3567248360119522, + "grad_norm": 1.2094073295593262, + "learning_rate": 2.73879193998008e-05, + "loss": 0.6013, + "step": 153470 + }, + { + "epoch": 1.3568132392722645, + "grad_norm": 6.967949867248535, + "learning_rate": 2.738644601212893e-05, + "loss": 0.6755, + "step": 153480 + }, + { + "epoch": 1.3569016425325766, + "grad_norm": 1.9498794078826904, + "learning_rate": 2.7384972624457056e-05, + "loss": 0.5856, + "step": 153490 + }, + { + "epoch": 1.3569900457928887, + "grad_norm": 15.112519264221191, + "learning_rate": 2.7383499236785188e-05, + "loss": 0.6213, + "step": 153500 + }, + { + "epoch": 1.357078449053201, + "grad_norm": 1.4778276681900024, + "learning_rate": 2.738202584911332e-05, + "loss": 0.5613, + "step": 153510 + }, + { + "epoch": 1.3571668523135134, + "grad_norm": 2.414613723754883, + "learning_rate": 2.7380552461441444e-05, + "loss": 0.5225, + "step": 153520 + }, + { + "epoch": 1.3572552555738255, + "grad_norm": 6.527881622314453, + "learning_rate": 2.7379079073769576e-05, + "loss": 0.5431, + "step": 153530 + }, + { + "epoch": 1.3573436588341379, + "grad_norm": 1.3349475860595703, + "learning_rate": 2.7377605686097708e-05, + "loss": 0.7061, + "step": 153540 + }, + { + "epoch": 1.35743206209445, + "grad_norm": 3.2020413875579834, + "learning_rate": 2.7376132298425833e-05, + "loss": 0.5888, + "step": 153550 + }, + { + "epoch": 1.3575204653547623, + "grad_norm": 2.6947829723358154, + "learning_rate": 2.7374658910753964e-05, + "loss": 0.7561, + "step": 153560 + }, + { + "epoch": 1.3576088686150745, + "grad_norm": 4.183035373687744, + "learning_rate": 2.737318552308209e-05, + "loss": 0.6214, + "step": 153570 + }, + { + "epoch": 1.3576972718753868, + "grad_norm": 5.074467182159424, + "learning_rate": 2.737171213541022e-05, + "loss": 0.6269, + "step": 153580 + }, + { + "epoch": 1.357785675135699, + "grad_norm": 1.4343229532241821, + "learning_rate": 2.7370238747738353e-05, + "loss": 0.5334, + "step": 153590 + }, + { + "epoch": 1.3578740783960113, + "grad_norm": 1.9551198482513428, + "learning_rate": 2.7368765360066478e-05, + "loss": 0.6199, + "step": 153600 + }, + { + "epoch": 1.3579624816563234, + "grad_norm": 7.2975993156433105, + "learning_rate": 2.736729197239461e-05, + "loss": 0.6635, + "step": 153610 + }, + { + "epoch": 1.3580508849166357, + "grad_norm": 15.22257137298584, + "learning_rate": 2.736581858472274e-05, + "loss": 0.7037, + "step": 153620 + }, + { + "epoch": 1.358139288176948, + "grad_norm": 4.397807598114014, + "learning_rate": 2.7364345197050866e-05, + "loss": 0.625, + "step": 153630 + }, + { + "epoch": 1.3582276914372602, + "grad_norm": 23.78542709350586, + "learning_rate": 2.7362871809378998e-05, + "loss": 0.6663, + "step": 153640 + }, + { + "epoch": 1.3583160946975725, + "grad_norm": 4.08134126663208, + "learning_rate": 2.736139842170713e-05, + "loss": 0.6032, + "step": 153650 + }, + { + "epoch": 1.3584044979578846, + "grad_norm": 1.7195130586624146, + "learning_rate": 2.7359925034035255e-05, + "loss": 0.6663, + "step": 153660 + }, + { + "epoch": 1.358492901218197, + "grad_norm": 4.62075662612915, + "learning_rate": 2.7358451646363386e-05, + "loss": 0.6765, + "step": 153670 + }, + { + "epoch": 1.358581304478509, + "grad_norm": 1.8698654174804688, + "learning_rate": 2.735697825869151e-05, + "loss": 0.6794, + "step": 153680 + }, + { + "epoch": 1.3586697077388215, + "grad_norm": 8.863204956054688, + "learning_rate": 2.7355504871019643e-05, + "loss": 0.6126, + "step": 153690 + }, + { + "epoch": 1.3587581109991336, + "grad_norm": 3.2313244342803955, + "learning_rate": 2.7354031483347775e-05, + "loss": 0.6543, + "step": 153700 + }, + { + "epoch": 1.358846514259446, + "grad_norm": 11.454544067382812, + "learning_rate": 2.73525580956759e-05, + "loss": 0.6134, + "step": 153710 + }, + { + "epoch": 1.358934917519758, + "grad_norm": 7.350795269012451, + "learning_rate": 2.735108470800403e-05, + "loss": 0.515, + "step": 153720 + }, + { + "epoch": 1.3590233207800704, + "grad_norm": 1.5754120349884033, + "learning_rate": 2.7349611320332163e-05, + "loss": 0.5599, + "step": 153730 + }, + { + "epoch": 1.3591117240403827, + "grad_norm": 1.1125742197036743, + "learning_rate": 2.734813793266029e-05, + "loss": 0.546, + "step": 153740 + }, + { + "epoch": 1.3592001273006948, + "grad_norm": 2.5498523712158203, + "learning_rate": 2.734666454498842e-05, + "loss": 0.4945, + "step": 153750 + }, + { + "epoch": 1.359288530561007, + "grad_norm": 4.113958835601807, + "learning_rate": 2.7345191157316552e-05, + "loss": 0.6604, + "step": 153760 + }, + { + "epoch": 1.3593769338213193, + "grad_norm": 2.057119369506836, + "learning_rate": 2.7343717769644677e-05, + "loss": 0.4741, + "step": 153770 + }, + { + "epoch": 1.3594653370816316, + "grad_norm": 4.867197036743164, + "learning_rate": 2.734224438197281e-05, + "loss": 0.7166, + "step": 153780 + }, + { + "epoch": 1.3595537403419438, + "grad_norm": 2.3347949981689453, + "learning_rate": 2.734077099430094e-05, + "loss": 0.7042, + "step": 153790 + }, + { + "epoch": 1.359642143602256, + "grad_norm": 4.11530876159668, + "learning_rate": 2.7339297606629065e-05, + "loss": 0.7689, + "step": 153800 + }, + { + "epoch": 1.3597305468625682, + "grad_norm": 2.281843423843384, + "learning_rate": 2.7337824218957197e-05, + "loss": 0.6405, + "step": 153810 + }, + { + "epoch": 1.3598189501228806, + "grad_norm": 4.6040449142456055, + "learning_rate": 2.7336350831285322e-05, + "loss": 0.6455, + "step": 153820 + }, + { + "epoch": 1.3599073533831927, + "grad_norm": 2.226038694381714, + "learning_rate": 2.7334877443613454e-05, + "loss": 0.5921, + "step": 153830 + }, + { + "epoch": 1.359995756643505, + "grad_norm": 18.165918350219727, + "learning_rate": 2.7333404055941585e-05, + "loss": 0.5922, + "step": 153840 + }, + { + "epoch": 1.3600841599038174, + "grad_norm": 5.217060565948486, + "learning_rate": 2.733193066826971e-05, + "loss": 0.6279, + "step": 153850 + }, + { + "epoch": 1.3601725631641295, + "grad_norm": 1.1759393215179443, + "learning_rate": 2.7330457280597842e-05, + "loss": 0.5592, + "step": 153860 + }, + { + "epoch": 1.3602609664244416, + "grad_norm": 2.8378050327301025, + "learning_rate": 2.7328983892925974e-05, + "loss": 0.7516, + "step": 153870 + }, + { + "epoch": 1.360349369684754, + "grad_norm": 1.9023247957229614, + "learning_rate": 2.73275105052541e-05, + "loss": 0.6084, + "step": 153880 + }, + { + "epoch": 1.3604377729450663, + "grad_norm": 11.13372802734375, + "learning_rate": 2.732603711758223e-05, + "loss": 0.5913, + "step": 153890 + }, + { + "epoch": 1.3605261762053784, + "grad_norm": 1.1735844612121582, + "learning_rate": 2.7324563729910362e-05, + "loss": 0.7321, + "step": 153900 + }, + { + "epoch": 1.3606145794656908, + "grad_norm": 2.9435930252075195, + "learning_rate": 2.7323090342238487e-05, + "loss": 0.5642, + "step": 153910 + }, + { + "epoch": 1.3607029827260029, + "grad_norm": 9.626900672912598, + "learning_rate": 2.732161695456662e-05, + "loss": 0.6412, + "step": 153920 + }, + { + "epoch": 1.3607913859863152, + "grad_norm": 2.715320110321045, + "learning_rate": 2.7320143566894747e-05, + "loss": 0.6761, + "step": 153930 + }, + { + "epoch": 1.3608797892466273, + "grad_norm": 1.598920464515686, + "learning_rate": 2.7318670179222876e-05, + "loss": 0.6351, + "step": 153940 + }, + { + "epoch": 1.3609681925069397, + "grad_norm": 3.0665156841278076, + "learning_rate": 2.7317196791551007e-05, + "loss": 0.6981, + "step": 153950 + }, + { + "epoch": 1.361056595767252, + "grad_norm": 3.8954806327819824, + "learning_rate": 2.7315723403879136e-05, + "loss": 0.6504, + "step": 153960 + }, + { + "epoch": 1.3611449990275641, + "grad_norm": 3.292116403579712, + "learning_rate": 2.7314250016207264e-05, + "loss": 0.5992, + "step": 153970 + }, + { + "epoch": 1.3612334022878763, + "grad_norm": 1.0852166414260864, + "learning_rate": 2.7312776628535396e-05, + "loss": 0.5186, + "step": 153980 + }, + { + "epoch": 1.3613218055481886, + "grad_norm": 3.1986804008483887, + "learning_rate": 2.7311303240863524e-05, + "loss": 0.6638, + "step": 153990 + }, + { + "epoch": 1.361410208808501, + "grad_norm": 1.7799561023712158, + "learning_rate": 2.7309829853191653e-05, + "loss": 0.6368, + "step": 154000 + }, + { + "epoch": 1.361498612068813, + "grad_norm": 10.748804092407227, + "learning_rate": 2.7308356465519784e-05, + "loss": 0.6364, + "step": 154010 + }, + { + "epoch": 1.3615870153291254, + "grad_norm": 1.81399405002594, + "learning_rate": 2.7306883077847913e-05, + "loss": 0.5802, + "step": 154020 + }, + { + "epoch": 1.3616754185894375, + "grad_norm": 2.1766295433044434, + "learning_rate": 2.730540969017604e-05, + "loss": 0.6595, + "step": 154030 + }, + { + "epoch": 1.3617638218497499, + "grad_norm": 7.2332282066345215, + "learning_rate": 2.730393630250417e-05, + "loss": 0.6622, + "step": 154040 + }, + { + "epoch": 1.361852225110062, + "grad_norm": 4.610494613647461, + "learning_rate": 2.73024629148323e-05, + "loss": 0.5962, + "step": 154050 + }, + { + "epoch": 1.3619406283703743, + "grad_norm": 1.7780160903930664, + "learning_rate": 2.730098952716043e-05, + "loss": 0.5994, + "step": 154060 + }, + { + "epoch": 1.3620290316306867, + "grad_norm": 5.483056545257568, + "learning_rate": 2.7299516139488558e-05, + "loss": 0.6641, + "step": 154070 + }, + { + "epoch": 1.3621174348909988, + "grad_norm": 2.305213451385498, + "learning_rate": 2.729804275181669e-05, + "loss": 0.6899, + "step": 154080 + }, + { + "epoch": 1.362205838151311, + "grad_norm": 2.8561348915100098, + "learning_rate": 2.7296569364144818e-05, + "loss": 0.5653, + "step": 154090 + }, + { + "epoch": 1.3622942414116233, + "grad_norm": 7.68094539642334, + "learning_rate": 2.7295095976472946e-05, + "loss": 0.6154, + "step": 154100 + }, + { + "epoch": 1.3623826446719356, + "grad_norm": 0.6679326295852661, + "learning_rate": 2.7293622588801078e-05, + "loss": 0.5264, + "step": 154110 + }, + { + "epoch": 1.3624710479322477, + "grad_norm": 2.8364195823669434, + "learning_rate": 2.7292149201129206e-05, + "loss": 0.5505, + "step": 154120 + }, + { + "epoch": 1.36255945119256, + "grad_norm": 0.8018041849136353, + "learning_rate": 2.7290675813457335e-05, + "loss": 0.4773, + "step": 154130 + }, + { + "epoch": 1.3626478544528722, + "grad_norm": 1.8438262939453125, + "learning_rate": 2.7289202425785467e-05, + "loss": 0.6894, + "step": 154140 + }, + { + "epoch": 1.3627362577131845, + "grad_norm": 2.2903082370758057, + "learning_rate": 2.7287729038113595e-05, + "loss": 0.658, + "step": 154150 + }, + { + "epoch": 1.3628246609734966, + "grad_norm": 6.520436763763428, + "learning_rate": 2.7286255650441723e-05, + "loss": 0.668, + "step": 154160 + }, + { + "epoch": 1.362913064233809, + "grad_norm": 1.4125345945358276, + "learning_rate": 2.7284782262769855e-05, + "loss": 0.555, + "step": 154170 + }, + { + "epoch": 1.3630014674941213, + "grad_norm": 4.110620975494385, + "learning_rate": 2.728330887509798e-05, + "loss": 0.6559, + "step": 154180 + }, + { + "epoch": 1.3630898707544334, + "grad_norm": 2.6496288776397705, + "learning_rate": 2.7281835487426112e-05, + "loss": 0.6572, + "step": 154190 + }, + { + "epoch": 1.3631782740147456, + "grad_norm": 1.4097830057144165, + "learning_rate": 2.7280362099754243e-05, + "loss": 0.5737, + "step": 154200 + }, + { + "epoch": 1.363266677275058, + "grad_norm": 2.554978370666504, + "learning_rate": 2.727888871208237e-05, + "loss": 0.5654, + "step": 154210 + }, + { + "epoch": 1.3633550805353702, + "grad_norm": 4.791134834289551, + "learning_rate": 2.72774153244105e-05, + "loss": 0.798, + "step": 154220 + }, + { + "epoch": 1.3634434837956824, + "grad_norm": 5.3456621170043945, + "learning_rate": 2.7275941936738632e-05, + "loss": 0.4949, + "step": 154230 + }, + { + "epoch": 1.3635318870559947, + "grad_norm": 1.2459648847579956, + "learning_rate": 2.7274468549066757e-05, + "loss": 0.6048, + "step": 154240 + }, + { + "epoch": 1.3636202903163068, + "grad_norm": 3.840247869491577, + "learning_rate": 2.727299516139489e-05, + "loss": 0.6245, + "step": 154250 + }, + { + "epoch": 1.3637086935766192, + "grad_norm": 5.195685863494873, + "learning_rate": 2.727152177372302e-05, + "loss": 0.5703, + "step": 154260 + }, + { + "epoch": 1.3637970968369313, + "grad_norm": 1.270114541053772, + "learning_rate": 2.7270048386051145e-05, + "loss": 0.5616, + "step": 154270 + }, + { + "epoch": 1.3638855000972436, + "grad_norm": 2.738403081893921, + "learning_rate": 2.7268574998379277e-05, + "loss": 0.6586, + "step": 154280 + }, + { + "epoch": 1.3639739033575558, + "grad_norm": 2.4280507564544678, + "learning_rate": 2.7267101610707402e-05, + "loss": 0.5508, + "step": 154290 + }, + { + "epoch": 1.364062306617868, + "grad_norm": 3.822091579437256, + "learning_rate": 2.7265628223035534e-05, + "loss": 0.4753, + "step": 154300 + }, + { + "epoch": 1.3641507098781802, + "grad_norm": 9.027368545532227, + "learning_rate": 2.7264154835363666e-05, + "loss": 0.5569, + "step": 154310 + }, + { + "epoch": 1.3642391131384926, + "grad_norm": 9.11761474609375, + "learning_rate": 2.726268144769179e-05, + "loss": 0.7106, + "step": 154320 + }, + { + "epoch": 1.364327516398805, + "grad_norm": 1.0846797227859497, + "learning_rate": 2.7261208060019922e-05, + "loss": 0.5674, + "step": 154330 + }, + { + "epoch": 1.364415919659117, + "grad_norm": 1.4489777088165283, + "learning_rate": 2.7259734672348054e-05, + "loss": 0.5001, + "step": 154340 + }, + { + "epoch": 1.3645043229194291, + "grad_norm": 3.4873478412628174, + "learning_rate": 2.725826128467618e-05, + "loss": 0.6888, + "step": 154350 + }, + { + "epoch": 1.3645927261797415, + "grad_norm": 3.128005027770996, + "learning_rate": 2.725678789700431e-05, + "loss": 0.7207, + "step": 154360 + }, + { + "epoch": 1.3646811294400538, + "grad_norm": 4.4434428215026855, + "learning_rate": 2.7255314509332442e-05, + "loss": 0.7486, + "step": 154370 + }, + { + "epoch": 1.364769532700366, + "grad_norm": 7.020805358886719, + "learning_rate": 2.7253841121660567e-05, + "loss": 0.6439, + "step": 154380 + }, + { + "epoch": 1.3648579359606783, + "grad_norm": 1.9832072257995605, + "learning_rate": 2.72523677339887e-05, + "loss": 0.6318, + "step": 154390 + }, + { + "epoch": 1.3649463392209904, + "grad_norm": 8.416921615600586, + "learning_rate": 2.7250894346316824e-05, + "loss": 0.7716, + "step": 154400 + }, + { + "epoch": 1.3650347424813027, + "grad_norm": 17.864105224609375, + "learning_rate": 2.7249420958644956e-05, + "loss": 0.5384, + "step": 154410 + }, + { + "epoch": 1.3651231457416149, + "grad_norm": 2.9790709018707275, + "learning_rate": 2.7247947570973088e-05, + "loss": 0.6522, + "step": 154420 + }, + { + "epoch": 1.3652115490019272, + "grad_norm": 1.2897685766220093, + "learning_rate": 2.7246474183301213e-05, + "loss": 0.7234, + "step": 154430 + }, + { + "epoch": 1.3652999522622395, + "grad_norm": 12.144488334655762, + "learning_rate": 2.7245000795629344e-05, + "loss": 0.6173, + "step": 154440 + }, + { + "epoch": 1.3653883555225517, + "grad_norm": 8.407711029052734, + "learning_rate": 2.7243527407957476e-05, + "loss": 0.5009, + "step": 154450 + }, + { + "epoch": 1.3654767587828638, + "grad_norm": 0.5981799960136414, + "learning_rate": 2.72420540202856e-05, + "loss": 0.7288, + "step": 154460 + }, + { + "epoch": 1.3655651620431761, + "grad_norm": 4.110251426696777, + "learning_rate": 2.7240580632613733e-05, + "loss": 0.5727, + "step": 154470 + }, + { + "epoch": 1.3656535653034885, + "grad_norm": 2.2072997093200684, + "learning_rate": 2.7239107244941864e-05, + "loss": 0.5492, + "step": 154480 + }, + { + "epoch": 1.3657419685638006, + "grad_norm": 3.550320863723755, + "learning_rate": 2.723763385726999e-05, + "loss": 0.6136, + "step": 154490 + }, + { + "epoch": 1.365830371824113, + "grad_norm": 2.081270933151245, + "learning_rate": 2.723616046959812e-05, + "loss": 0.5203, + "step": 154500 + }, + { + "epoch": 1.365918775084425, + "grad_norm": 1.412699818611145, + "learning_rate": 2.7234687081926246e-05, + "loss": 0.5462, + "step": 154510 + }, + { + "epoch": 1.3660071783447374, + "grad_norm": 11.623519897460938, + "learning_rate": 2.7233213694254378e-05, + "loss": 0.4878, + "step": 154520 + }, + { + "epoch": 1.3660955816050495, + "grad_norm": 1.180148959159851, + "learning_rate": 2.723174030658251e-05, + "loss": 0.6776, + "step": 154530 + }, + { + "epoch": 1.3661839848653619, + "grad_norm": 4.216616153717041, + "learning_rate": 2.7230266918910635e-05, + "loss": 0.7044, + "step": 154540 + }, + { + "epoch": 1.3662723881256742, + "grad_norm": 3.270925521850586, + "learning_rate": 2.7228793531238766e-05, + "loss": 0.6919, + "step": 154550 + }, + { + "epoch": 1.3663607913859863, + "grad_norm": 1.9662703275680542, + "learning_rate": 2.7227320143566898e-05, + "loss": 0.6106, + "step": 154560 + }, + { + "epoch": 1.3664491946462984, + "grad_norm": 7.437923431396484, + "learning_rate": 2.7225846755895023e-05, + "loss": 0.5622, + "step": 154570 + }, + { + "epoch": 1.3665375979066108, + "grad_norm": 2.8365561962127686, + "learning_rate": 2.7224373368223155e-05, + "loss": 0.522, + "step": 154580 + }, + { + "epoch": 1.3666260011669231, + "grad_norm": 3.4499101638793945, + "learning_rate": 2.7222899980551287e-05, + "loss": 0.6886, + "step": 154590 + }, + { + "epoch": 1.3667144044272352, + "grad_norm": 2.752417802810669, + "learning_rate": 2.722142659287941e-05, + "loss": 0.6583, + "step": 154600 + }, + { + "epoch": 1.3668028076875476, + "grad_norm": 2.1933319568634033, + "learning_rate": 2.7219953205207543e-05, + "loss": 0.6067, + "step": 154610 + }, + { + "epoch": 1.3668912109478597, + "grad_norm": 6.355262756347656, + "learning_rate": 2.7218479817535675e-05, + "loss": 0.6092, + "step": 154620 + }, + { + "epoch": 1.366979614208172, + "grad_norm": 5.629115104675293, + "learning_rate": 2.72170064298638e-05, + "loss": 0.5589, + "step": 154630 + }, + { + "epoch": 1.3670680174684842, + "grad_norm": 1.2412418127059937, + "learning_rate": 2.721553304219193e-05, + "loss": 0.6126, + "step": 154640 + }, + { + "epoch": 1.3671564207287965, + "grad_norm": 6.742635250091553, + "learning_rate": 2.7214059654520057e-05, + "loss": 0.5539, + "step": 154650 + }, + { + "epoch": 1.3672448239891088, + "grad_norm": 3.1886813640594482, + "learning_rate": 2.721258626684819e-05, + "loss": 0.5567, + "step": 154660 + }, + { + "epoch": 1.367333227249421, + "grad_norm": 1.7838319540023804, + "learning_rate": 2.721111287917632e-05, + "loss": 0.7247, + "step": 154670 + }, + { + "epoch": 1.367421630509733, + "grad_norm": 3.322016954421997, + "learning_rate": 2.7209639491504445e-05, + "loss": 0.5633, + "step": 154680 + }, + { + "epoch": 1.3675100337700454, + "grad_norm": 4.518759727478027, + "learning_rate": 2.7208166103832577e-05, + "loss": 0.537, + "step": 154690 + }, + { + "epoch": 1.3675984370303578, + "grad_norm": 16.524864196777344, + "learning_rate": 2.720669271616071e-05, + "loss": 0.5959, + "step": 154700 + }, + { + "epoch": 1.36768684029067, + "grad_norm": 16.19443130493164, + "learning_rate": 2.7205219328488834e-05, + "loss": 0.6194, + "step": 154710 + }, + { + "epoch": 1.3677752435509822, + "grad_norm": 1.410196304321289, + "learning_rate": 2.7203745940816965e-05, + "loss": 0.6283, + "step": 154720 + }, + { + "epoch": 1.3678636468112944, + "grad_norm": 2.368764638900757, + "learning_rate": 2.7202272553145097e-05, + "loss": 0.6763, + "step": 154730 + }, + { + "epoch": 1.3679520500716067, + "grad_norm": 2.4012529850006104, + "learning_rate": 2.7200799165473222e-05, + "loss": 0.5435, + "step": 154740 + }, + { + "epoch": 1.3680404533319188, + "grad_norm": 0.8090259432792664, + "learning_rate": 2.7199325777801354e-05, + "loss": 0.5123, + "step": 154750 + }, + { + "epoch": 1.3681288565922312, + "grad_norm": 2.2534990310668945, + "learning_rate": 2.719785239012948e-05, + "loss": 0.4984, + "step": 154760 + }, + { + "epoch": 1.3682172598525435, + "grad_norm": 1.3646174669265747, + "learning_rate": 2.719637900245761e-05, + "loss": 0.5332, + "step": 154770 + }, + { + "epoch": 1.3683056631128556, + "grad_norm": 2.0068867206573486, + "learning_rate": 2.7194905614785742e-05, + "loss": 0.5136, + "step": 154780 + }, + { + "epoch": 1.3683940663731677, + "grad_norm": 4.147223949432373, + "learning_rate": 2.7193432227113867e-05, + "loss": 0.628, + "step": 154790 + }, + { + "epoch": 1.36848246963348, + "grad_norm": 1.8314428329467773, + "learning_rate": 2.7191958839442e-05, + "loss": 0.6477, + "step": 154800 + }, + { + "epoch": 1.3685708728937924, + "grad_norm": 4.222271919250488, + "learning_rate": 2.719048545177013e-05, + "loss": 0.6418, + "step": 154810 + }, + { + "epoch": 1.3686592761541045, + "grad_norm": 2.435983419418335, + "learning_rate": 2.7189012064098256e-05, + "loss": 0.7608, + "step": 154820 + }, + { + "epoch": 1.3687476794144169, + "grad_norm": 6.38040828704834, + "learning_rate": 2.7187538676426387e-05, + "loss": 0.6733, + "step": 154830 + }, + { + "epoch": 1.368836082674729, + "grad_norm": 8.850046157836914, + "learning_rate": 2.718606528875452e-05, + "loss": 0.6012, + "step": 154840 + }, + { + "epoch": 1.3689244859350413, + "grad_norm": 3.2060370445251465, + "learning_rate": 2.7184591901082644e-05, + "loss": 0.5737, + "step": 154850 + }, + { + "epoch": 1.3690128891953535, + "grad_norm": 1.6039284467697144, + "learning_rate": 2.7183118513410776e-05, + "loss": 0.5803, + "step": 154860 + }, + { + "epoch": 1.3691012924556658, + "grad_norm": 8.194977760314941, + "learning_rate": 2.7181645125738904e-05, + "loss": 0.6032, + "step": 154870 + }, + { + "epoch": 1.369189695715978, + "grad_norm": 1.7717314958572388, + "learning_rate": 2.7180171738067032e-05, + "loss": 0.6984, + "step": 154880 + }, + { + "epoch": 1.3692780989762903, + "grad_norm": 3.3422884941101074, + "learning_rate": 2.7178698350395164e-05, + "loss": 0.7046, + "step": 154890 + }, + { + "epoch": 1.3693665022366024, + "grad_norm": 3.9236302375793457, + "learning_rate": 2.7177224962723293e-05, + "loss": 0.5878, + "step": 154900 + }, + { + "epoch": 1.3694549054969147, + "grad_norm": 2.0146372318267822, + "learning_rate": 2.717575157505142e-05, + "loss": 0.6033, + "step": 154910 + }, + { + "epoch": 1.369543308757227, + "grad_norm": 1.5420422554016113, + "learning_rate": 2.7174278187379553e-05, + "loss": 0.6024, + "step": 154920 + }, + { + "epoch": 1.3696317120175392, + "grad_norm": 1.5074349641799927, + "learning_rate": 2.717280479970768e-05, + "loss": 0.5833, + "step": 154930 + }, + { + "epoch": 1.3697201152778513, + "grad_norm": 3.9746596813201904, + "learning_rate": 2.717133141203581e-05, + "loss": 0.5922, + "step": 154940 + }, + { + "epoch": 1.3698085185381637, + "grad_norm": 3.6697375774383545, + "learning_rate": 2.716985802436394e-05, + "loss": 0.5826, + "step": 154950 + }, + { + "epoch": 1.369896921798476, + "grad_norm": 4.591105937957764, + "learning_rate": 2.716838463669207e-05, + "loss": 0.5598, + "step": 154960 + }, + { + "epoch": 1.3699853250587881, + "grad_norm": 1.2530019283294678, + "learning_rate": 2.7166911249020198e-05, + "loss": 0.6354, + "step": 154970 + }, + { + "epoch": 1.3700737283191005, + "grad_norm": 1.9378292560577393, + "learning_rate": 2.7165437861348326e-05, + "loss": 0.5806, + "step": 154980 + }, + { + "epoch": 1.3701621315794126, + "grad_norm": 1.188019037246704, + "learning_rate": 2.7163964473676458e-05, + "loss": 0.636, + "step": 154990 + }, + { + "epoch": 1.370250534839725, + "grad_norm": 1.4662915468215942, + "learning_rate": 2.7162491086004586e-05, + "loss": 0.6019, + "step": 155000 + }, + { + "epoch": 1.370338938100037, + "grad_norm": 6.8162994384765625, + "learning_rate": 2.7161017698332715e-05, + "loss": 0.607, + "step": 155010 + }, + { + "epoch": 1.3704273413603494, + "grad_norm": 1.2052501440048218, + "learning_rate": 2.7159544310660846e-05, + "loss": 0.5793, + "step": 155020 + }, + { + "epoch": 1.3705157446206617, + "grad_norm": 4.539021968841553, + "learning_rate": 2.7158070922988975e-05, + "loss": 0.4963, + "step": 155030 + }, + { + "epoch": 1.3706041478809738, + "grad_norm": 10.318037033081055, + "learning_rate": 2.7156597535317103e-05, + "loss": 0.5552, + "step": 155040 + }, + { + "epoch": 1.370692551141286, + "grad_norm": 1.9288170337677002, + "learning_rate": 2.7155124147645235e-05, + "loss": 0.5725, + "step": 155050 + }, + { + "epoch": 1.3707809544015983, + "grad_norm": 1.8394007682800293, + "learning_rate": 2.7153650759973363e-05, + "loss": 0.5495, + "step": 155060 + }, + { + "epoch": 1.3708693576619106, + "grad_norm": 26.70475196838379, + "learning_rate": 2.715217737230149e-05, + "loss": 0.6242, + "step": 155070 + }, + { + "epoch": 1.3709577609222228, + "grad_norm": 1.2042925357818604, + "learning_rate": 2.7150703984629623e-05, + "loss": 0.5535, + "step": 155080 + }, + { + "epoch": 1.371046164182535, + "grad_norm": 4.55109167098999, + "learning_rate": 2.714923059695775e-05, + "loss": 0.4343, + "step": 155090 + }, + { + "epoch": 1.3711345674428472, + "grad_norm": 3.5097484588623047, + "learning_rate": 2.714775720928588e-05, + "loss": 0.7697, + "step": 155100 + }, + { + "epoch": 1.3712229707031596, + "grad_norm": 1.5943996906280518, + "learning_rate": 2.7146283821614012e-05, + "loss": 0.6223, + "step": 155110 + }, + { + "epoch": 1.3713113739634717, + "grad_norm": 1.7234963178634644, + "learning_rate": 2.7144810433942137e-05, + "loss": 0.6311, + "step": 155120 + }, + { + "epoch": 1.371399777223784, + "grad_norm": 3.5486643314361572, + "learning_rate": 2.714333704627027e-05, + "loss": 0.5866, + "step": 155130 + }, + { + "epoch": 1.3714881804840964, + "grad_norm": 4.045604705810547, + "learning_rate": 2.71418636585984e-05, + "loss": 0.6922, + "step": 155140 + }, + { + "epoch": 1.3715765837444085, + "grad_norm": 23.683698654174805, + "learning_rate": 2.7140390270926525e-05, + "loss": 0.7124, + "step": 155150 + }, + { + "epoch": 1.3716649870047206, + "grad_norm": 5.036553382873535, + "learning_rate": 2.7138916883254657e-05, + "loss": 0.6982, + "step": 155160 + }, + { + "epoch": 1.371753390265033, + "grad_norm": 1.9953233003616333, + "learning_rate": 2.713744349558279e-05, + "loss": 0.6876, + "step": 155170 + }, + { + "epoch": 1.3718417935253453, + "grad_norm": 1.812619924545288, + "learning_rate": 2.7135970107910914e-05, + "loss": 0.7092, + "step": 155180 + }, + { + "epoch": 1.3719301967856574, + "grad_norm": 2.9480655193328857, + "learning_rate": 2.7134496720239045e-05, + "loss": 0.7175, + "step": 155190 + }, + { + "epoch": 1.3720186000459698, + "grad_norm": 1.8385483026504517, + "learning_rate": 2.7133023332567177e-05, + "loss": 0.6191, + "step": 155200 + }, + { + "epoch": 1.3721070033062819, + "grad_norm": 2.125201940536499, + "learning_rate": 2.7131549944895302e-05, + "loss": 0.7503, + "step": 155210 + }, + { + "epoch": 1.3721954065665942, + "grad_norm": 2.1805617809295654, + "learning_rate": 2.7130076557223434e-05, + "loss": 0.6716, + "step": 155220 + }, + { + "epoch": 1.3722838098269063, + "grad_norm": 4.144437313079834, + "learning_rate": 2.712860316955156e-05, + "loss": 0.7142, + "step": 155230 + }, + { + "epoch": 1.3723722130872187, + "grad_norm": 6.431118965148926, + "learning_rate": 2.712712978187969e-05, + "loss": 0.6927, + "step": 155240 + }, + { + "epoch": 1.372460616347531, + "grad_norm": 2.286168336868286, + "learning_rate": 2.7125656394207822e-05, + "loss": 0.6384, + "step": 155250 + }, + { + "epoch": 1.3725490196078431, + "grad_norm": 2.2630951404571533, + "learning_rate": 2.7124183006535947e-05, + "loss": 0.6558, + "step": 155260 + }, + { + "epoch": 1.3726374228681553, + "grad_norm": 1.3777186870574951, + "learning_rate": 2.712270961886408e-05, + "loss": 0.6264, + "step": 155270 + }, + { + "epoch": 1.3727258261284676, + "grad_norm": 0.9174994230270386, + "learning_rate": 2.712123623119221e-05, + "loss": 0.5234, + "step": 155280 + }, + { + "epoch": 1.37281422938878, + "grad_norm": 19.313364028930664, + "learning_rate": 2.7119762843520336e-05, + "loss": 0.5636, + "step": 155290 + }, + { + "epoch": 1.372902632649092, + "grad_norm": 3.4840359687805176, + "learning_rate": 2.7118289455848467e-05, + "loss": 0.6314, + "step": 155300 + }, + { + "epoch": 1.3729910359094044, + "grad_norm": 1.7505651712417603, + "learning_rate": 2.71168160681766e-05, + "loss": 0.6878, + "step": 155310 + }, + { + "epoch": 1.3730794391697165, + "grad_norm": 2.405698537826538, + "learning_rate": 2.7115342680504724e-05, + "loss": 0.5962, + "step": 155320 + }, + { + "epoch": 1.3731678424300289, + "grad_norm": 3.2754852771759033, + "learning_rate": 2.7113869292832856e-05, + "loss": 0.6436, + "step": 155330 + }, + { + "epoch": 1.373256245690341, + "grad_norm": 3.2899110317230225, + "learning_rate": 2.711239590516098e-05, + "loss": 0.6518, + "step": 155340 + }, + { + "epoch": 1.3733446489506533, + "grad_norm": 6.16387939453125, + "learning_rate": 2.7110922517489113e-05, + "loss": 0.4967, + "step": 155350 + }, + { + "epoch": 1.3734330522109657, + "grad_norm": 1.9771013259887695, + "learning_rate": 2.7109449129817244e-05, + "loss": 0.6309, + "step": 155360 + }, + { + "epoch": 1.3735214554712778, + "grad_norm": 5.630110263824463, + "learning_rate": 2.710797574214537e-05, + "loss": 0.6041, + "step": 155370 + }, + { + "epoch": 1.37360985873159, + "grad_norm": 1.4134987592697144, + "learning_rate": 2.71065023544735e-05, + "loss": 0.703, + "step": 155380 + }, + { + "epoch": 1.3736982619919023, + "grad_norm": 1.9246699810028076, + "learning_rate": 2.7105028966801633e-05, + "loss": 0.5857, + "step": 155390 + }, + { + "epoch": 1.3737866652522146, + "grad_norm": 2.85746431350708, + "learning_rate": 2.7103555579129758e-05, + "loss": 0.5657, + "step": 155400 + }, + { + "epoch": 1.3738750685125267, + "grad_norm": 4.268984317779541, + "learning_rate": 2.710208219145789e-05, + "loss": 0.5559, + "step": 155410 + }, + { + "epoch": 1.373963471772839, + "grad_norm": 9.158341407775879, + "learning_rate": 2.710060880378602e-05, + "loss": 0.7064, + "step": 155420 + }, + { + "epoch": 1.3740518750331512, + "grad_norm": 3.0119051933288574, + "learning_rate": 2.7099135416114146e-05, + "loss": 0.6397, + "step": 155430 + }, + { + "epoch": 1.3741402782934635, + "grad_norm": 5.203845024108887, + "learning_rate": 2.7097662028442278e-05, + "loss": 0.6357, + "step": 155440 + }, + { + "epoch": 1.3742286815537756, + "grad_norm": 2.26387357711792, + "learning_rate": 2.7096188640770403e-05, + "loss": 0.6279, + "step": 155450 + }, + { + "epoch": 1.374317084814088, + "grad_norm": 19.37293243408203, + "learning_rate": 2.7094715253098535e-05, + "loss": 0.6368, + "step": 155460 + }, + { + "epoch": 1.3744054880744, + "grad_norm": 2.113950729370117, + "learning_rate": 2.7093241865426666e-05, + "loss": 0.5691, + "step": 155470 + }, + { + "epoch": 1.3744938913347124, + "grad_norm": 2.456430435180664, + "learning_rate": 2.709176847775479e-05, + "loss": 0.5471, + "step": 155480 + }, + { + "epoch": 1.3745822945950246, + "grad_norm": 1.3330222368240356, + "learning_rate": 2.7090295090082923e-05, + "loss": 0.6192, + "step": 155490 + }, + { + "epoch": 1.374670697855337, + "grad_norm": 3.0857956409454346, + "learning_rate": 2.7088821702411055e-05, + "loss": 0.7707, + "step": 155500 + }, + { + "epoch": 1.3747591011156493, + "grad_norm": 1.8446743488311768, + "learning_rate": 2.708734831473918e-05, + "loss": 0.6216, + "step": 155510 + }, + { + "epoch": 1.3748475043759614, + "grad_norm": 4.19819974899292, + "learning_rate": 2.708587492706731e-05, + "loss": 0.6936, + "step": 155520 + }, + { + "epoch": 1.3749359076362735, + "grad_norm": 1.715083122253418, + "learning_rate": 2.7084401539395443e-05, + "loss": 0.704, + "step": 155530 + }, + { + "epoch": 1.3750243108965858, + "grad_norm": 1.6971405744552612, + "learning_rate": 2.7082928151723568e-05, + "loss": 0.5975, + "step": 155540 + }, + { + "epoch": 1.3751127141568982, + "grad_norm": 4.815489768981934, + "learning_rate": 2.70814547640517e-05, + "loss": 0.7434, + "step": 155550 + }, + { + "epoch": 1.3752011174172103, + "grad_norm": 2.8287346363067627, + "learning_rate": 2.7079981376379832e-05, + "loss": 0.6777, + "step": 155560 + }, + { + "epoch": 1.3752895206775226, + "grad_norm": 19.006778717041016, + "learning_rate": 2.7078507988707957e-05, + "loss": 0.6244, + "step": 155570 + }, + { + "epoch": 1.3753779239378348, + "grad_norm": 10.056194305419922, + "learning_rate": 2.707703460103609e-05, + "loss": 0.5373, + "step": 155580 + }, + { + "epoch": 1.375466327198147, + "grad_norm": 5.858008861541748, + "learning_rate": 2.7075561213364213e-05, + "loss": 0.5824, + "step": 155590 + }, + { + "epoch": 1.3755547304584592, + "grad_norm": 1.447767972946167, + "learning_rate": 2.7074087825692345e-05, + "loss": 0.6308, + "step": 155600 + }, + { + "epoch": 1.3756431337187716, + "grad_norm": 2.093517541885376, + "learning_rate": 2.7072614438020477e-05, + "loss": 0.776, + "step": 155610 + }, + { + "epoch": 1.375731536979084, + "grad_norm": 2.2905070781707764, + "learning_rate": 2.7071141050348602e-05, + "loss": 0.7129, + "step": 155620 + }, + { + "epoch": 1.375819940239396, + "grad_norm": 2.227703809738159, + "learning_rate": 2.7069667662676734e-05, + "loss": 0.5256, + "step": 155630 + }, + { + "epoch": 1.3759083434997081, + "grad_norm": 2.6590676307678223, + "learning_rate": 2.7068194275004865e-05, + "loss": 0.6504, + "step": 155640 + }, + { + "epoch": 1.3759967467600205, + "grad_norm": 1.292912483215332, + "learning_rate": 2.706672088733299e-05, + "loss": 0.5684, + "step": 155650 + }, + { + "epoch": 1.3760851500203328, + "grad_norm": 10.864413261413574, + "learning_rate": 2.7065247499661122e-05, + "loss": 0.5916, + "step": 155660 + }, + { + "epoch": 1.376173553280645, + "grad_norm": 1.0657316446304321, + "learning_rate": 2.7063774111989254e-05, + "loss": 0.5872, + "step": 155670 + }, + { + "epoch": 1.3762619565409573, + "grad_norm": 1.5046132802963257, + "learning_rate": 2.706230072431738e-05, + "loss": 0.6135, + "step": 155680 + }, + { + "epoch": 1.3763503598012694, + "grad_norm": 2.8958046436309814, + "learning_rate": 2.706082733664551e-05, + "loss": 0.5094, + "step": 155690 + }, + { + "epoch": 1.3764387630615817, + "grad_norm": 4.554339408874512, + "learning_rate": 2.7059353948973635e-05, + "loss": 0.4478, + "step": 155700 + }, + { + "epoch": 1.3765271663218939, + "grad_norm": 3.73746919631958, + "learning_rate": 2.7057880561301767e-05, + "loss": 0.656, + "step": 155710 + }, + { + "epoch": 1.3766155695822062, + "grad_norm": 2.260776996612549, + "learning_rate": 2.70564071736299e-05, + "loss": 0.4898, + "step": 155720 + }, + { + "epoch": 1.3767039728425186, + "grad_norm": 3.22761869430542, + "learning_rate": 2.7054933785958024e-05, + "loss": 0.5594, + "step": 155730 + }, + { + "epoch": 1.3767923761028307, + "grad_norm": 1.7947300672531128, + "learning_rate": 2.7053460398286156e-05, + "loss": 0.7092, + "step": 155740 + }, + { + "epoch": 1.3768807793631428, + "grad_norm": 1.503580093383789, + "learning_rate": 2.7051987010614287e-05, + "loss": 0.6421, + "step": 155750 + }, + { + "epoch": 1.3769691826234551, + "grad_norm": 2.2446248531341553, + "learning_rate": 2.7050513622942412e-05, + "loss": 0.5834, + "step": 155760 + }, + { + "epoch": 1.3770575858837675, + "grad_norm": 2.4247617721557617, + "learning_rate": 2.7049040235270544e-05, + "loss": 0.556, + "step": 155770 + }, + { + "epoch": 1.3771459891440796, + "grad_norm": 2.387530565261841, + "learning_rate": 2.7047566847598676e-05, + "loss": 0.617, + "step": 155780 + }, + { + "epoch": 1.377234392404392, + "grad_norm": 2.995638132095337, + "learning_rate": 2.70460934599268e-05, + "loss": 0.6097, + "step": 155790 + }, + { + "epoch": 1.377322795664704, + "grad_norm": 3.863736867904663, + "learning_rate": 2.7044620072254933e-05, + "loss": 0.6089, + "step": 155800 + }, + { + "epoch": 1.3774111989250164, + "grad_norm": 1.823962926864624, + "learning_rate": 2.704314668458306e-05, + "loss": 0.7074, + "step": 155810 + }, + { + "epoch": 1.3774996021853285, + "grad_norm": 1.502862811088562, + "learning_rate": 2.704167329691119e-05, + "loss": 0.5894, + "step": 155820 + }, + { + "epoch": 1.3775880054456409, + "grad_norm": 3.4480679035186768, + "learning_rate": 2.704019990923932e-05, + "loss": 0.6318, + "step": 155830 + }, + { + "epoch": 1.3776764087059532, + "grad_norm": 1.622433066368103, + "learning_rate": 2.703872652156745e-05, + "loss": 0.6106, + "step": 155840 + }, + { + "epoch": 1.3777648119662653, + "grad_norm": 1.1865192651748657, + "learning_rate": 2.7037253133895578e-05, + "loss": 0.5732, + "step": 155850 + }, + { + "epoch": 1.3778532152265774, + "grad_norm": 3.9950077533721924, + "learning_rate": 2.703577974622371e-05, + "loss": 0.5774, + "step": 155860 + }, + { + "epoch": 1.3779416184868898, + "grad_norm": 1.5382553339004517, + "learning_rate": 2.7034306358551838e-05, + "loss": 0.6515, + "step": 155870 + }, + { + "epoch": 1.3780300217472021, + "grad_norm": 1.448900818824768, + "learning_rate": 2.7032832970879966e-05, + "loss": 0.5279, + "step": 155880 + }, + { + "epoch": 1.3781184250075142, + "grad_norm": 3.701958417892456, + "learning_rate": 2.7031359583208098e-05, + "loss": 0.6972, + "step": 155890 + }, + { + "epoch": 1.3782068282678266, + "grad_norm": 4.679746627807617, + "learning_rate": 2.7029886195536226e-05, + "loss": 0.5998, + "step": 155900 + }, + { + "epoch": 1.3782952315281387, + "grad_norm": 3.1549744606018066, + "learning_rate": 2.7028412807864355e-05, + "loss": 0.5547, + "step": 155910 + }, + { + "epoch": 1.378383634788451, + "grad_norm": 1.7143431901931763, + "learning_rate": 2.7026939420192483e-05, + "loss": 0.6025, + "step": 155920 + }, + { + "epoch": 1.3784720380487632, + "grad_norm": 1.9437109231948853, + "learning_rate": 2.7025466032520615e-05, + "loss": 0.6479, + "step": 155930 + }, + { + "epoch": 1.3785604413090755, + "grad_norm": 7.047440052032471, + "learning_rate": 2.7023992644848743e-05, + "loss": 0.54, + "step": 155940 + }, + { + "epoch": 1.3786488445693879, + "grad_norm": 3.4126367568969727, + "learning_rate": 2.702251925717687e-05, + "loss": 0.532, + "step": 155950 + }, + { + "epoch": 1.3787372478297, + "grad_norm": 3.4150843620300293, + "learning_rate": 2.7021045869505003e-05, + "loss": 0.4966, + "step": 155960 + }, + { + "epoch": 1.378825651090012, + "grad_norm": 2.4167957305908203, + "learning_rate": 2.701957248183313e-05, + "loss": 0.5523, + "step": 155970 + }, + { + "epoch": 1.3789140543503244, + "grad_norm": 3.489398241043091, + "learning_rate": 2.701809909416126e-05, + "loss": 0.6639, + "step": 155980 + }, + { + "epoch": 1.3790024576106368, + "grad_norm": 7.619278907775879, + "learning_rate": 2.701662570648939e-05, + "loss": 0.5892, + "step": 155990 + }, + { + "epoch": 1.379090860870949, + "grad_norm": 3.692145586013794, + "learning_rate": 2.701515231881752e-05, + "loss": 0.5844, + "step": 156000 + }, + { + "epoch": 1.3791792641312612, + "grad_norm": 1.9484286308288574, + "learning_rate": 2.7013678931145648e-05, + "loss": 0.6469, + "step": 156010 + }, + { + "epoch": 1.3792676673915734, + "grad_norm": 1.8545992374420166, + "learning_rate": 2.701220554347378e-05, + "loss": 0.6258, + "step": 156020 + }, + { + "epoch": 1.3793560706518857, + "grad_norm": 2.85139799118042, + "learning_rate": 2.701073215580191e-05, + "loss": 0.66, + "step": 156030 + }, + { + "epoch": 1.3794444739121978, + "grad_norm": 1.1166852712631226, + "learning_rate": 2.7009258768130037e-05, + "loss": 0.6271, + "step": 156040 + }, + { + "epoch": 1.3795328771725102, + "grad_norm": 2.727505683898926, + "learning_rate": 2.700778538045817e-05, + "loss": 0.6577, + "step": 156050 + }, + { + "epoch": 1.3796212804328223, + "grad_norm": 1.5364007949829102, + "learning_rate": 2.7006311992786293e-05, + "loss": 0.4364, + "step": 156060 + }, + { + "epoch": 1.3797096836931346, + "grad_norm": 4.598865032196045, + "learning_rate": 2.7004838605114425e-05, + "loss": 0.4635, + "step": 156070 + }, + { + "epoch": 1.3797980869534467, + "grad_norm": 6.509904861450195, + "learning_rate": 2.7003365217442557e-05, + "loss": 0.5533, + "step": 156080 + }, + { + "epoch": 1.379886490213759, + "grad_norm": 1.072874903678894, + "learning_rate": 2.7001891829770682e-05, + "loss": 0.6263, + "step": 156090 + }, + { + "epoch": 1.3799748934740714, + "grad_norm": 19.482872009277344, + "learning_rate": 2.7000418442098814e-05, + "loss": 0.5289, + "step": 156100 + }, + { + "epoch": 1.3800632967343835, + "grad_norm": 25.272069931030273, + "learning_rate": 2.6998945054426945e-05, + "loss": 0.6261, + "step": 156110 + }, + { + "epoch": 1.3801516999946957, + "grad_norm": 3.5382237434387207, + "learning_rate": 2.699747166675507e-05, + "loss": 0.6623, + "step": 156120 + }, + { + "epoch": 1.380240103255008, + "grad_norm": 6.000899791717529, + "learning_rate": 2.6995998279083202e-05, + "loss": 0.88, + "step": 156130 + }, + { + "epoch": 1.3803285065153204, + "grad_norm": 9.717784881591797, + "learning_rate": 2.6994524891411334e-05, + "loss": 0.667, + "step": 156140 + }, + { + "epoch": 1.3804169097756325, + "grad_norm": 6.183516502380371, + "learning_rate": 2.699305150373946e-05, + "loss": 0.6779, + "step": 156150 + }, + { + "epoch": 1.3805053130359448, + "grad_norm": 2.6247222423553467, + "learning_rate": 2.699157811606759e-05, + "loss": 0.619, + "step": 156160 + }, + { + "epoch": 1.380593716296257, + "grad_norm": 8.288039207458496, + "learning_rate": 2.6990104728395716e-05, + "loss": 0.7186, + "step": 156170 + }, + { + "epoch": 1.3806821195565693, + "grad_norm": 9.10761547088623, + "learning_rate": 2.6988631340723847e-05, + "loss": 0.732, + "step": 156180 + }, + { + "epoch": 1.3807705228168814, + "grad_norm": 7.843234539031982, + "learning_rate": 2.698715795305198e-05, + "loss": 0.6819, + "step": 156190 + }, + { + "epoch": 1.3808589260771937, + "grad_norm": 2.5948920249938965, + "learning_rate": 2.6985684565380104e-05, + "loss": 0.5545, + "step": 156200 + }, + { + "epoch": 1.380947329337506, + "grad_norm": 5.79525899887085, + "learning_rate": 2.6984211177708236e-05, + "loss": 0.6209, + "step": 156210 + }, + { + "epoch": 1.3810357325978182, + "grad_norm": 11.15489387512207, + "learning_rate": 2.6982737790036367e-05, + "loss": 0.7334, + "step": 156220 + }, + { + "epoch": 1.3811241358581303, + "grad_norm": 4.876621246337891, + "learning_rate": 2.6981264402364492e-05, + "loss": 0.593, + "step": 156230 + }, + { + "epoch": 1.3812125391184427, + "grad_norm": 1.5784831047058105, + "learning_rate": 2.6979791014692624e-05, + "loss": 0.5793, + "step": 156240 + }, + { + "epoch": 1.381300942378755, + "grad_norm": 2.1730284690856934, + "learning_rate": 2.6978317627020756e-05, + "loss": 0.5601, + "step": 156250 + }, + { + "epoch": 1.3813893456390671, + "grad_norm": 2.341017961502075, + "learning_rate": 2.697684423934888e-05, + "loss": 0.677, + "step": 156260 + }, + { + "epoch": 1.3814777488993795, + "grad_norm": 4.492663860321045, + "learning_rate": 2.6975370851677013e-05, + "loss": 0.62, + "step": 156270 + }, + { + "epoch": 1.3815661521596916, + "grad_norm": 2.9659740924835205, + "learning_rate": 2.6973897464005138e-05, + "loss": 0.6348, + "step": 156280 + }, + { + "epoch": 1.381654555420004, + "grad_norm": 4.358122825622559, + "learning_rate": 2.697242407633327e-05, + "loss": 0.7985, + "step": 156290 + }, + { + "epoch": 1.381742958680316, + "grad_norm": 1.2129637002944946, + "learning_rate": 2.69709506886614e-05, + "loss": 0.5281, + "step": 156300 + }, + { + "epoch": 1.3818313619406284, + "grad_norm": 2.309075117111206, + "learning_rate": 2.6969477300989526e-05, + "loss": 0.4839, + "step": 156310 + }, + { + "epoch": 1.3819197652009407, + "grad_norm": 1.069314956665039, + "learning_rate": 2.6968003913317658e-05, + "loss": 0.6128, + "step": 156320 + }, + { + "epoch": 1.3820081684612529, + "grad_norm": 4.080246448516846, + "learning_rate": 2.696653052564579e-05, + "loss": 0.7097, + "step": 156330 + }, + { + "epoch": 1.382096571721565, + "grad_norm": 3.4198551177978516, + "learning_rate": 2.6965057137973914e-05, + "loss": 0.6586, + "step": 156340 + }, + { + "epoch": 1.3821849749818773, + "grad_norm": 1.1350969076156616, + "learning_rate": 2.6963583750302046e-05, + "loss": 0.5668, + "step": 156350 + }, + { + "epoch": 1.3822733782421897, + "grad_norm": 1.9831739664077759, + "learning_rate": 2.6962110362630178e-05, + "loss": 0.4136, + "step": 156360 + }, + { + "epoch": 1.3823617815025018, + "grad_norm": 1.7095671892166138, + "learning_rate": 2.6960636974958303e-05, + "loss": 0.6271, + "step": 156370 + }, + { + "epoch": 1.3824501847628141, + "grad_norm": 5.296531677246094, + "learning_rate": 2.6959163587286435e-05, + "loss": 0.7692, + "step": 156380 + }, + { + "epoch": 1.3825385880231262, + "grad_norm": 4.982787132263184, + "learning_rate": 2.695769019961456e-05, + "loss": 0.4727, + "step": 156390 + }, + { + "epoch": 1.3826269912834386, + "grad_norm": 2.1806366443634033, + "learning_rate": 2.695621681194269e-05, + "loss": 0.5999, + "step": 156400 + }, + { + "epoch": 1.3827153945437507, + "grad_norm": 1.8754287958145142, + "learning_rate": 2.6954743424270823e-05, + "loss": 0.5037, + "step": 156410 + }, + { + "epoch": 1.382803797804063, + "grad_norm": 1.323127269744873, + "learning_rate": 2.6953270036598948e-05, + "loss": 0.5753, + "step": 156420 + }, + { + "epoch": 1.3828922010643754, + "grad_norm": 5.687112808227539, + "learning_rate": 2.695179664892708e-05, + "loss": 0.6044, + "step": 156430 + }, + { + "epoch": 1.3829806043246875, + "grad_norm": 4.270662307739258, + "learning_rate": 2.695032326125521e-05, + "loss": 0.5245, + "step": 156440 + }, + { + "epoch": 1.3830690075849996, + "grad_norm": 4.5958147048950195, + "learning_rate": 2.6948849873583337e-05, + "loss": 0.6225, + "step": 156450 + }, + { + "epoch": 1.383157410845312, + "grad_norm": 4.541618347167969, + "learning_rate": 2.6947376485911468e-05, + "loss": 0.5054, + "step": 156460 + }, + { + "epoch": 1.3832458141056243, + "grad_norm": 4.363905429840088, + "learning_rate": 2.69459030982396e-05, + "loss": 0.5859, + "step": 156470 + }, + { + "epoch": 1.3833342173659364, + "grad_norm": 2.2598202228546143, + "learning_rate": 2.6944429710567725e-05, + "loss": 0.6096, + "step": 156480 + }, + { + "epoch": 1.3834226206262488, + "grad_norm": 1.849174976348877, + "learning_rate": 2.6942956322895857e-05, + "loss": 0.581, + "step": 156490 + }, + { + "epoch": 1.3835110238865609, + "grad_norm": 1.1192522048950195, + "learning_rate": 2.694148293522399e-05, + "loss": 0.5989, + "step": 156500 + }, + { + "epoch": 1.3835994271468732, + "grad_norm": 2.742314100265503, + "learning_rate": 2.6940009547552113e-05, + "loss": 0.584, + "step": 156510 + }, + { + "epoch": 1.3836878304071853, + "grad_norm": 4.095992565155029, + "learning_rate": 2.6938536159880245e-05, + "loss": 0.6587, + "step": 156520 + }, + { + "epoch": 1.3837762336674977, + "grad_norm": 2.7218616008758545, + "learning_rate": 2.693706277220837e-05, + "loss": 0.6269, + "step": 156530 + }, + { + "epoch": 1.38386463692781, + "grad_norm": 5.992404460906982, + "learning_rate": 2.6935589384536502e-05, + "loss": 0.6211, + "step": 156540 + }, + { + "epoch": 1.3839530401881222, + "grad_norm": 1.512061357498169, + "learning_rate": 2.6934115996864634e-05, + "loss": 0.6085, + "step": 156550 + }, + { + "epoch": 1.3840414434484343, + "grad_norm": 3.023223876953125, + "learning_rate": 2.693264260919276e-05, + "loss": 0.7619, + "step": 156560 + }, + { + "epoch": 1.3841298467087466, + "grad_norm": 4.649730205535889, + "learning_rate": 2.693116922152089e-05, + "loss": 0.5324, + "step": 156570 + }, + { + "epoch": 1.384218249969059, + "grad_norm": 2.2508370876312256, + "learning_rate": 2.6929695833849022e-05, + "loss": 0.6693, + "step": 156580 + }, + { + "epoch": 1.384306653229371, + "grad_norm": 3.125150680541992, + "learning_rate": 2.6928222446177147e-05, + "loss": 0.7694, + "step": 156590 + }, + { + "epoch": 1.3843950564896834, + "grad_norm": 3.8792030811309814, + "learning_rate": 2.692674905850528e-05, + "loss": 0.4879, + "step": 156600 + }, + { + "epoch": 1.3844834597499955, + "grad_norm": 5.308789253234863, + "learning_rate": 2.692527567083341e-05, + "loss": 0.6282, + "step": 156610 + }, + { + "epoch": 1.3845718630103079, + "grad_norm": 0.6319215297698975, + "learning_rate": 2.6923802283161535e-05, + "loss": 0.6267, + "step": 156620 + }, + { + "epoch": 1.38466026627062, + "grad_norm": 4.450835704803467, + "learning_rate": 2.6922328895489667e-05, + "loss": 0.674, + "step": 156630 + }, + { + "epoch": 1.3847486695309323, + "grad_norm": 6.0222978591918945, + "learning_rate": 2.6920855507817792e-05, + "loss": 0.6524, + "step": 156640 + }, + { + "epoch": 1.3848370727912445, + "grad_norm": 1.9837299585342407, + "learning_rate": 2.6919382120145924e-05, + "loss": 0.6489, + "step": 156650 + }, + { + "epoch": 1.3849254760515568, + "grad_norm": 5.43040657043457, + "learning_rate": 2.6917908732474056e-05, + "loss": 0.6283, + "step": 156660 + }, + { + "epoch": 1.385013879311869, + "grad_norm": 3.1336097717285156, + "learning_rate": 2.691643534480218e-05, + "loss": 0.4765, + "step": 156670 + }, + { + "epoch": 1.3851022825721813, + "grad_norm": 1.0707249641418457, + "learning_rate": 2.6914961957130312e-05, + "loss": 0.5894, + "step": 156680 + }, + { + "epoch": 1.3851906858324936, + "grad_norm": 10.350285530090332, + "learning_rate": 2.6913488569458444e-05, + "loss": 0.5456, + "step": 156690 + }, + { + "epoch": 1.3852790890928057, + "grad_norm": 2.535919427871704, + "learning_rate": 2.691201518178657e-05, + "loss": 0.4919, + "step": 156700 + }, + { + "epoch": 1.3853674923531178, + "grad_norm": 2.2882843017578125, + "learning_rate": 2.69105417941147e-05, + "loss": 0.638, + "step": 156710 + }, + { + "epoch": 1.3854558956134302, + "grad_norm": 9.162206649780273, + "learning_rate": 2.6909068406442833e-05, + "loss": 0.5082, + "step": 156720 + }, + { + "epoch": 1.3855442988737425, + "grad_norm": 2.086186170578003, + "learning_rate": 2.6907595018770958e-05, + "loss": 0.5452, + "step": 156730 + }, + { + "epoch": 1.3856327021340547, + "grad_norm": 2.1356472969055176, + "learning_rate": 2.690612163109909e-05, + "loss": 0.6348, + "step": 156740 + }, + { + "epoch": 1.385721105394367, + "grad_norm": 5.605736255645752, + "learning_rate": 2.6904648243427218e-05, + "loss": 0.5824, + "step": 156750 + }, + { + "epoch": 1.3858095086546791, + "grad_norm": 8.289831161499023, + "learning_rate": 2.6903174855755346e-05, + "loss": 0.6705, + "step": 156760 + }, + { + "epoch": 1.3858979119149915, + "grad_norm": 1.900573492050171, + "learning_rate": 2.6901701468083478e-05, + "loss": 0.4792, + "step": 156770 + }, + { + "epoch": 1.3859863151753036, + "grad_norm": 0.6380146145820618, + "learning_rate": 2.6900228080411606e-05, + "loss": 0.5732, + "step": 156780 + }, + { + "epoch": 1.386074718435616, + "grad_norm": 1.428074836730957, + "learning_rate": 2.6898754692739734e-05, + "loss": 0.7172, + "step": 156790 + }, + { + "epoch": 1.3861631216959283, + "grad_norm": 1.7874327898025513, + "learning_rate": 2.6897281305067866e-05, + "loss": 0.5252, + "step": 156800 + }, + { + "epoch": 1.3862515249562404, + "grad_norm": 2.0756053924560547, + "learning_rate": 2.6895807917395995e-05, + "loss": 0.5948, + "step": 156810 + }, + { + "epoch": 1.3863399282165525, + "grad_norm": 1.0823822021484375, + "learning_rate": 2.6894334529724123e-05, + "loss": 0.624, + "step": 156820 + }, + { + "epoch": 1.3864283314768648, + "grad_norm": 1.5016306638717651, + "learning_rate": 2.6892861142052255e-05, + "loss": 0.5785, + "step": 156830 + }, + { + "epoch": 1.3865167347371772, + "grad_norm": 2.820362091064453, + "learning_rate": 2.6891387754380383e-05, + "loss": 0.6595, + "step": 156840 + }, + { + "epoch": 1.3866051379974893, + "grad_norm": 1.7089853286743164, + "learning_rate": 2.688991436670851e-05, + "loss": 0.6851, + "step": 156850 + }, + { + "epoch": 1.3866935412578016, + "grad_norm": 2.8538742065429688, + "learning_rate": 2.688844097903664e-05, + "loss": 0.5761, + "step": 156860 + }, + { + "epoch": 1.3867819445181138, + "grad_norm": 1.8295609951019287, + "learning_rate": 2.688696759136477e-05, + "loss": 0.5768, + "step": 156870 + }, + { + "epoch": 1.386870347778426, + "grad_norm": 11.0761079788208, + "learning_rate": 2.68854942036929e-05, + "loss": 0.5359, + "step": 156880 + }, + { + "epoch": 1.3869587510387382, + "grad_norm": 3.313791513442993, + "learning_rate": 2.6884020816021028e-05, + "loss": 0.5396, + "step": 156890 + }, + { + "epoch": 1.3870471542990506, + "grad_norm": 1.593228816986084, + "learning_rate": 2.688254742834916e-05, + "loss": 0.5603, + "step": 156900 + }, + { + "epoch": 1.387135557559363, + "grad_norm": 6.165848255157471, + "learning_rate": 2.6881074040677288e-05, + "loss": 0.6847, + "step": 156910 + }, + { + "epoch": 1.387223960819675, + "grad_norm": 3.118553876876831, + "learning_rate": 2.6879600653005417e-05, + "loss": 0.5191, + "step": 156920 + }, + { + "epoch": 1.3873123640799871, + "grad_norm": 10.501680374145508, + "learning_rate": 2.687812726533355e-05, + "loss": 0.6277, + "step": 156930 + }, + { + "epoch": 1.3874007673402995, + "grad_norm": 1.5400967597961426, + "learning_rate": 2.6876653877661677e-05, + "loss": 0.5237, + "step": 156940 + }, + { + "epoch": 1.3874891706006118, + "grad_norm": 6.471516132354736, + "learning_rate": 2.6875180489989805e-05, + "loss": 0.5892, + "step": 156950 + }, + { + "epoch": 1.387577573860924, + "grad_norm": 10.897212982177734, + "learning_rate": 2.6873707102317937e-05, + "loss": 0.5339, + "step": 156960 + }, + { + "epoch": 1.3876659771212363, + "grad_norm": 2.800159454345703, + "learning_rate": 2.6872233714646065e-05, + "loss": 0.5743, + "step": 156970 + }, + { + "epoch": 1.3877543803815484, + "grad_norm": 1.014312744140625, + "learning_rate": 2.6870760326974194e-05, + "loss": 0.6186, + "step": 156980 + }, + { + "epoch": 1.3878427836418608, + "grad_norm": 2.929683208465576, + "learning_rate": 2.6869286939302325e-05, + "loss": 0.6056, + "step": 156990 + }, + { + "epoch": 1.3879311869021729, + "grad_norm": 4.7371392250061035, + "learning_rate": 2.686781355163045e-05, + "loss": 0.6531, + "step": 157000 + }, + { + "epoch": 1.3880195901624852, + "grad_norm": 6.103567123413086, + "learning_rate": 2.6866340163958582e-05, + "loss": 0.6201, + "step": 157010 + }, + { + "epoch": 1.3881079934227976, + "grad_norm": 1.3877007961273193, + "learning_rate": 2.6864866776286714e-05, + "loss": 0.5622, + "step": 157020 + }, + { + "epoch": 1.3881963966831097, + "grad_norm": 1.0094410181045532, + "learning_rate": 2.686339338861484e-05, + "loss": 0.5699, + "step": 157030 + }, + { + "epoch": 1.3882847999434218, + "grad_norm": 5.75368070602417, + "learning_rate": 2.686192000094297e-05, + "loss": 0.6289, + "step": 157040 + }, + { + "epoch": 1.3883732032037341, + "grad_norm": 6.306117534637451, + "learning_rate": 2.6860446613271102e-05, + "loss": 0.7218, + "step": 157050 + }, + { + "epoch": 1.3884616064640465, + "grad_norm": 1.304063320159912, + "learning_rate": 2.6858973225599227e-05, + "loss": 0.7404, + "step": 157060 + }, + { + "epoch": 1.3885500097243586, + "grad_norm": 0.9833344221115112, + "learning_rate": 2.685749983792736e-05, + "loss": 0.5628, + "step": 157070 + }, + { + "epoch": 1.388638412984671, + "grad_norm": 1.5215866565704346, + "learning_rate": 2.685602645025549e-05, + "loss": 0.6346, + "step": 157080 + }, + { + "epoch": 1.388726816244983, + "grad_norm": 2.5768070220947266, + "learning_rate": 2.6854553062583616e-05, + "loss": 0.6523, + "step": 157090 + }, + { + "epoch": 1.3888152195052954, + "grad_norm": 2.755969285964966, + "learning_rate": 2.6853079674911747e-05, + "loss": 0.625, + "step": 157100 + }, + { + "epoch": 1.3889036227656075, + "grad_norm": 2.2574679851531982, + "learning_rate": 2.6851606287239872e-05, + "loss": 0.5981, + "step": 157110 + }, + { + "epoch": 1.3889920260259199, + "grad_norm": 1.1929677724838257, + "learning_rate": 2.6850132899568004e-05, + "loss": 0.5248, + "step": 157120 + }, + { + "epoch": 1.3890804292862322, + "grad_norm": 2.1432747840881348, + "learning_rate": 2.6848659511896136e-05, + "loss": 0.6067, + "step": 157130 + }, + { + "epoch": 1.3891688325465443, + "grad_norm": 2.543018102645874, + "learning_rate": 2.684718612422426e-05, + "loss": 0.6012, + "step": 157140 + }, + { + "epoch": 1.3892572358068564, + "grad_norm": 0.7401247620582581, + "learning_rate": 2.6845712736552392e-05, + "loss": 0.5428, + "step": 157150 + }, + { + "epoch": 1.3893456390671688, + "grad_norm": 1.8546061515808105, + "learning_rate": 2.6844239348880524e-05, + "loss": 0.688, + "step": 157160 + }, + { + "epoch": 1.3894340423274811, + "grad_norm": 1.0606430768966675, + "learning_rate": 2.684276596120865e-05, + "loss": 0.6133, + "step": 157170 + }, + { + "epoch": 1.3895224455877933, + "grad_norm": 1.0503008365631104, + "learning_rate": 2.684129257353678e-05, + "loss": 0.5446, + "step": 157180 + }, + { + "epoch": 1.3896108488481056, + "grad_norm": 1.493780493736267, + "learning_rate": 2.6839819185864913e-05, + "loss": 0.5217, + "step": 157190 + }, + { + "epoch": 1.3896992521084177, + "grad_norm": 2.809370994567871, + "learning_rate": 2.6838345798193038e-05, + "loss": 0.5819, + "step": 157200 + }, + { + "epoch": 1.38978765536873, + "grad_norm": 11.472502708435059, + "learning_rate": 2.683687241052117e-05, + "loss": 0.5705, + "step": 157210 + }, + { + "epoch": 1.3898760586290422, + "grad_norm": 3.870170831680298, + "learning_rate": 2.6835399022849294e-05, + "loss": 0.6996, + "step": 157220 + }, + { + "epoch": 1.3899644618893545, + "grad_norm": 7.817115783691406, + "learning_rate": 2.6833925635177426e-05, + "loss": 0.4974, + "step": 157230 + }, + { + "epoch": 1.3900528651496666, + "grad_norm": 30.183246612548828, + "learning_rate": 2.6832452247505558e-05, + "loss": 0.5176, + "step": 157240 + }, + { + "epoch": 1.390141268409979, + "grad_norm": 2.621539831161499, + "learning_rate": 2.6830978859833683e-05, + "loss": 0.6329, + "step": 157250 + }, + { + "epoch": 1.390229671670291, + "grad_norm": 5.485243320465088, + "learning_rate": 2.6829505472161815e-05, + "loss": 0.5713, + "step": 157260 + }, + { + "epoch": 1.3903180749306034, + "grad_norm": 1.9731295108795166, + "learning_rate": 2.6828032084489946e-05, + "loss": 0.6822, + "step": 157270 + }, + { + "epoch": 1.3904064781909158, + "grad_norm": 4.251471042633057, + "learning_rate": 2.682655869681807e-05, + "loss": 0.6475, + "step": 157280 + }, + { + "epoch": 1.390494881451228, + "grad_norm": 3.7204394340515137, + "learning_rate": 2.6825085309146203e-05, + "loss": 0.6763, + "step": 157290 + }, + { + "epoch": 1.39058328471154, + "grad_norm": 2.8035309314727783, + "learning_rate": 2.6823611921474335e-05, + "loss": 0.5201, + "step": 157300 + }, + { + "epoch": 1.3906716879718524, + "grad_norm": 2.1704540252685547, + "learning_rate": 2.682213853380246e-05, + "loss": 0.6815, + "step": 157310 + }, + { + "epoch": 1.3907600912321647, + "grad_norm": 2.329289197921753, + "learning_rate": 2.682066514613059e-05, + "loss": 0.6119, + "step": 157320 + }, + { + "epoch": 1.3908484944924768, + "grad_norm": 11.108922004699707, + "learning_rate": 2.6819191758458723e-05, + "loss": 0.5553, + "step": 157330 + }, + { + "epoch": 1.3909368977527892, + "grad_norm": 16.72702980041504, + "learning_rate": 2.6817718370786848e-05, + "loss": 0.4598, + "step": 157340 + }, + { + "epoch": 1.3910253010131013, + "grad_norm": 2.5221312046051025, + "learning_rate": 2.681624498311498e-05, + "loss": 0.6026, + "step": 157350 + }, + { + "epoch": 1.3911137042734136, + "grad_norm": 2.9180703163146973, + "learning_rate": 2.6814771595443105e-05, + "loss": 0.5419, + "step": 157360 + }, + { + "epoch": 1.3912021075337258, + "grad_norm": 0.9406756162643433, + "learning_rate": 2.6813298207771237e-05, + "loss": 0.5041, + "step": 157370 + }, + { + "epoch": 1.391290510794038, + "grad_norm": 1.0423914194107056, + "learning_rate": 2.681182482009937e-05, + "loss": 0.5697, + "step": 157380 + }, + { + "epoch": 1.3913789140543504, + "grad_norm": 1.1540026664733887, + "learning_rate": 2.6810351432427493e-05, + "loss": 0.5804, + "step": 157390 + }, + { + "epoch": 1.3914673173146626, + "grad_norm": 5.698990345001221, + "learning_rate": 2.6808878044755625e-05, + "loss": 0.5871, + "step": 157400 + }, + { + "epoch": 1.3915557205749747, + "grad_norm": 1.7291351556777954, + "learning_rate": 2.6807404657083757e-05, + "loss": 0.6036, + "step": 157410 + }, + { + "epoch": 1.391644123835287, + "grad_norm": 1.8603672981262207, + "learning_rate": 2.6805931269411882e-05, + "loss": 0.7682, + "step": 157420 + }, + { + "epoch": 1.3917325270955994, + "grad_norm": 2.7575790882110596, + "learning_rate": 2.6804457881740013e-05, + "loss": 0.633, + "step": 157430 + }, + { + "epoch": 1.3918209303559115, + "grad_norm": 5.373757839202881, + "learning_rate": 2.6802984494068145e-05, + "loss": 0.7215, + "step": 157440 + }, + { + "epoch": 1.3919093336162238, + "grad_norm": 1.2322953939437866, + "learning_rate": 2.680151110639627e-05, + "loss": 0.5785, + "step": 157450 + }, + { + "epoch": 1.391997736876536, + "grad_norm": 3.9250879287719727, + "learning_rate": 2.6800037718724402e-05, + "loss": 0.5582, + "step": 157460 + }, + { + "epoch": 1.3920861401368483, + "grad_norm": 2.4116668701171875, + "learning_rate": 2.6798564331052527e-05, + "loss": 0.6289, + "step": 157470 + }, + { + "epoch": 1.3921745433971604, + "grad_norm": 1.380910873413086, + "learning_rate": 2.679709094338066e-05, + "loss": 0.5894, + "step": 157480 + }, + { + "epoch": 1.3922629466574727, + "grad_norm": 1.8472890853881836, + "learning_rate": 2.679561755570879e-05, + "loss": 0.5763, + "step": 157490 + }, + { + "epoch": 1.392351349917785, + "grad_norm": 1.5728129148483276, + "learning_rate": 2.6794144168036915e-05, + "loss": 0.535, + "step": 157500 + }, + { + "epoch": 1.3924397531780972, + "grad_norm": 1.559787631034851, + "learning_rate": 2.6792670780365047e-05, + "loss": 0.5391, + "step": 157510 + }, + { + "epoch": 1.3925281564384093, + "grad_norm": 2.645859479904175, + "learning_rate": 2.679119739269318e-05, + "loss": 0.5894, + "step": 157520 + }, + { + "epoch": 1.3926165596987217, + "grad_norm": 17.005584716796875, + "learning_rate": 2.6789724005021304e-05, + "loss": 0.6811, + "step": 157530 + }, + { + "epoch": 1.392704962959034, + "grad_norm": 2.3037030696868896, + "learning_rate": 2.6788250617349436e-05, + "loss": 0.7113, + "step": 157540 + }, + { + "epoch": 1.3927933662193461, + "grad_norm": 6.945370674133301, + "learning_rate": 2.6786777229677567e-05, + "loss": 0.5779, + "step": 157550 + }, + { + "epoch": 1.3928817694796585, + "grad_norm": 2.1154115200042725, + "learning_rate": 2.6785303842005692e-05, + "loss": 0.6004, + "step": 157560 + }, + { + "epoch": 1.3929701727399706, + "grad_norm": 1.3923304080963135, + "learning_rate": 2.6783830454333824e-05, + "loss": 0.6148, + "step": 157570 + }, + { + "epoch": 1.393058576000283, + "grad_norm": 5.411924839019775, + "learning_rate": 2.678235706666195e-05, + "loss": 0.5789, + "step": 157580 + }, + { + "epoch": 1.393146979260595, + "grad_norm": 3.663203001022339, + "learning_rate": 2.678088367899008e-05, + "loss": 0.5842, + "step": 157590 + }, + { + "epoch": 1.3932353825209074, + "grad_norm": 1.2214984893798828, + "learning_rate": 2.6779410291318212e-05, + "loss": 0.636, + "step": 157600 + }, + { + "epoch": 1.3933237857812197, + "grad_norm": 2.6077592372894287, + "learning_rate": 2.6777936903646337e-05, + "loss": 0.6277, + "step": 157610 + }, + { + "epoch": 1.3934121890415319, + "grad_norm": 1.0479772090911865, + "learning_rate": 2.677646351597447e-05, + "loss": 0.6629, + "step": 157620 + }, + { + "epoch": 1.393500592301844, + "grad_norm": 9.980446815490723, + "learning_rate": 2.67749901283026e-05, + "loss": 0.574, + "step": 157630 + }, + { + "epoch": 1.3935889955621563, + "grad_norm": 11.375358581542969, + "learning_rate": 2.6773516740630726e-05, + "loss": 0.5764, + "step": 157640 + }, + { + "epoch": 1.3936773988224687, + "grad_norm": 1.025200605392456, + "learning_rate": 2.6772043352958858e-05, + "loss": 0.6117, + "step": 157650 + }, + { + "epoch": 1.3937658020827808, + "grad_norm": 1.7183079719543457, + "learning_rate": 2.677056996528699e-05, + "loss": 0.5069, + "step": 157660 + }, + { + "epoch": 1.3938542053430931, + "grad_norm": 1.4647001028060913, + "learning_rate": 2.6769096577615114e-05, + "loss": 0.6947, + "step": 157670 + }, + { + "epoch": 1.3939426086034052, + "grad_norm": 1.9194972515106201, + "learning_rate": 2.6767623189943246e-05, + "loss": 0.5888, + "step": 157680 + }, + { + "epoch": 1.3940310118637176, + "grad_norm": 5.285514831542969, + "learning_rate": 2.6766149802271374e-05, + "loss": 0.661, + "step": 157690 + }, + { + "epoch": 1.3941194151240297, + "grad_norm": 2.15682053565979, + "learning_rate": 2.6764676414599503e-05, + "loss": 0.6467, + "step": 157700 + }, + { + "epoch": 1.394207818384342, + "grad_norm": 2.0305368900299072, + "learning_rate": 2.6763203026927634e-05, + "loss": 0.698, + "step": 157710 + }, + { + "epoch": 1.3942962216446544, + "grad_norm": 3.1504745483398438, + "learning_rate": 2.6761729639255763e-05, + "loss": 0.6174, + "step": 157720 + }, + { + "epoch": 1.3943846249049665, + "grad_norm": 3.881854295730591, + "learning_rate": 2.676025625158389e-05, + "loss": 0.6993, + "step": 157730 + }, + { + "epoch": 1.3944730281652786, + "grad_norm": 2.65567684173584, + "learning_rate": 2.6758782863912023e-05, + "loss": 0.6831, + "step": 157740 + }, + { + "epoch": 1.394561431425591, + "grad_norm": 1.3427464962005615, + "learning_rate": 2.675730947624015e-05, + "loss": 0.5069, + "step": 157750 + }, + { + "epoch": 1.3946498346859033, + "grad_norm": 18.119110107421875, + "learning_rate": 2.675583608856828e-05, + "loss": 0.685, + "step": 157760 + }, + { + "epoch": 1.3947382379462154, + "grad_norm": 0.9084869027137756, + "learning_rate": 2.675436270089641e-05, + "loss": 0.4774, + "step": 157770 + }, + { + "epoch": 1.3948266412065278, + "grad_norm": 3.3470420837402344, + "learning_rate": 2.675288931322454e-05, + "loss": 0.6149, + "step": 157780 + }, + { + "epoch": 1.39491504446684, + "grad_norm": 0.980299174785614, + "learning_rate": 2.6751415925552668e-05, + "loss": 0.5974, + "step": 157790 + }, + { + "epoch": 1.3950034477271522, + "grad_norm": 7.012294769287109, + "learning_rate": 2.67499425378808e-05, + "loss": 0.7202, + "step": 157800 + }, + { + "epoch": 1.3950918509874644, + "grad_norm": 1.5065248012542725, + "learning_rate": 2.6748469150208928e-05, + "loss": 0.5093, + "step": 157810 + }, + { + "epoch": 1.3951802542477767, + "grad_norm": 5.286609649658203, + "learning_rate": 2.6746995762537057e-05, + "loss": 0.705, + "step": 157820 + }, + { + "epoch": 1.3952686575080888, + "grad_norm": 2.169598340988159, + "learning_rate": 2.6745522374865185e-05, + "loss": 0.6473, + "step": 157830 + }, + { + "epoch": 1.3953570607684012, + "grad_norm": 14.844645500183105, + "learning_rate": 2.6744048987193317e-05, + "loss": 0.672, + "step": 157840 + }, + { + "epoch": 1.3954454640287133, + "grad_norm": 2.981980800628662, + "learning_rate": 2.6742575599521445e-05, + "loss": 0.4646, + "step": 157850 + }, + { + "epoch": 1.3955338672890256, + "grad_norm": 1.0732083320617676, + "learning_rate": 2.6741102211849573e-05, + "loss": 0.6207, + "step": 157860 + }, + { + "epoch": 1.395622270549338, + "grad_norm": 2.8117666244506836, + "learning_rate": 2.6739628824177705e-05, + "loss": 0.58, + "step": 157870 + }, + { + "epoch": 1.39571067380965, + "grad_norm": 2.195852518081665, + "learning_rate": 2.6738155436505833e-05, + "loss": 0.662, + "step": 157880 + }, + { + "epoch": 1.3957990770699622, + "grad_norm": 1.915737509727478, + "learning_rate": 2.6736682048833962e-05, + "loss": 0.5721, + "step": 157890 + }, + { + "epoch": 1.3958874803302745, + "grad_norm": 6.411036968231201, + "learning_rate": 2.6735208661162094e-05, + "loss": 0.7025, + "step": 157900 + }, + { + "epoch": 1.3959758835905869, + "grad_norm": 1.482609748840332, + "learning_rate": 2.6733735273490222e-05, + "loss": 0.5779, + "step": 157910 + }, + { + "epoch": 1.396064286850899, + "grad_norm": 3.666043996810913, + "learning_rate": 2.673226188581835e-05, + "loss": 0.7195, + "step": 157920 + }, + { + "epoch": 1.3961526901112113, + "grad_norm": 2.3935749530792236, + "learning_rate": 2.6730788498146482e-05, + "loss": 0.627, + "step": 157930 + }, + { + "epoch": 1.3962410933715235, + "grad_norm": 2.3432655334472656, + "learning_rate": 2.6729315110474607e-05, + "loss": 0.4989, + "step": 157940 + }, + { + "epoch": 1.3963294966318358, + "grad_norm": 2.711402177810669, + "learning_rate": 2.672784172280274e-05, + "loss": 0.5546, + "step": 157950 + }, + { + "epoch": 1.396417899892148, + "grad_norm": 2.282696485519409, + "learning_rate": 2.672636833513087e-05, + "loss": 0.5195, + "step": 157960 + }, + { + "epoch": 1.3965063031524603, + "grad_norm": 9.077699661254883, + "learning_rate": 2.6724894947458995e-05, + "loss": 0.5887, + "step": 157970 + }, + { + "epoch": 1.3965947064127726, + "grad_norm": 17.72617530822754, + "learning_rate": 2.6723421559787127e-05, + "loss": 0.5331, + "step": 157980 + }, + { + "epoch": 1.3966831096730847, + "grad_norm": 1.9646657705307007, + "learning_rate": 2.672194817211526e-05, + "loss": 0.5663, + "step": 157990 + }, + { + "epoch": 1.3967715129333969, + "grad_norm": 3.6485767364501953, + "learning_rate": 2.6720474784443384e-05, + "loss": 0.6359, + "step": 158000 + }, + { + "epoch": 1.3968599161937092, + "grad_norm": 4.137617588043213, + "learning_rate": 2.6719001396771516e-05, + "loss": 0.6042, + "step": 158010 + }, + { + "epoch": 1.3969483194540215, + "grad_norm": 2.2869791984558105, + "learning_rate": 2.6717528009099647e-05, + "loss": 0.5228, + "step": 158020 + }, + { + "epoch": 1.3970367227143337, + "grad_norm": 1.1500113010406494, + "learning_rate": 2.6716054621427772e-05, + "loss": 0.5813, + "step": 158030 + }, + { + "epoch": 1.397125125974646, + "grad_norm": 1.4167678356170654, + "learning_rate": 2.6714581233755904e-05, + "loss": 0.5669, + "step": 158040 + }, + { + "epoch": 1.3972135292349581, + "grad_norm": 7.787479877471924, + "learning_rate": 2.671310784608403e-05, + "loss": 0.6369, + "step": 158050 + }, + { + "epoch": 1.3973019324952705, + "grad_norm": 1.4218847751617432, + "learning_rate": 2.671163445841216e-05, + "loss": 0.4492, + "step": 158060 + }, + { + "epoch": 1.3973903357555826, + "grad_norm": 1.9995653629302979, + "learning_rate": 2.6710161070740293e-05, + "loss": 0.5917, + "step": 158070 + }, + { + "epoch": 1.397478739015895, + "grad_norm": 4.255588531494141, + "learning_rate": 2.6708687683068417e-05, + "loss": 0.7148, + "step": 158080 + }, + { + "epoch": 1.3975671422762073, + "grad_norm": 1.1951148509979248, + "learning_rate": 2.670721429539655e-05, + "loss": 0.5146, + "step": 158090 + }, + { + "epoch": 1.3976555455365194, + "grad_norm": 2.986920118331909, + "learning_rate": 2.670574090772468e-05, + "loss": 0.5559, + "step": 158100 + }, + { + "epoch": 1.3977439487968315, + "grad_norm": 7.708232879638672, + "learning_rate": 2.6704267520052806e-05, + "loss": 0.6666, + "step": 158110 + }, + { + "epoch": 1.3978323520571438, + "grad_norm": 1.7252209186553955, + "learning_rate": 2.6702794132380938e-05, + "loss": 0.5863, + "step": 158120 + }, + { + "epoch": 1.3979207553174562, + "grad_norm": 2.579143524169922, + "learning_rate": 2.670132074470907e-05, + "loss": 0.6263, + "step": 158130 + }, + { + "epoch": 1.3980091585777683, + "grad_norm": 2.1502201557159424, + "learning_rate": 2.6699847357037194e-05, + "loss": 0.5237, + "step": 158140 + }, + { + "epoch": 1.3980975618380806, + "grad_norm": 6.006604194641113, + "learning_rate": 2.6698373969365326e-05, + "loss": 0.6211, + "step": 158150 + }, + { + "epoch": 1.3981859650983928, + "grad_norm": 1.392369270324707, + "learning_rate": 2.669690058169345e-05, + "loss": 0.7232, + "step": 158160 + }, + { + "epoch": 1.398274368358705, + "grad_norm": 3.991595983505249, + "learning_rate": 2.6695427194021583e-05, + "loss": 0.7598, + "step": 158170 + }, + { + "epoch": 1.3983627716190172, + "grad_norm": 12.469320297241211, + "learning_rate": 2.6693953806349715e-05, + "loss": 0.5641, + "step": 158180 + }, + { + "epoch": 1.3984511748793296, + "grad_norm": 2.5988597869873047, + "learning_rate": 2.669248041867784e-05, + "loss": 0.5599, + "step": 158190 + }, + { + "epoch": 1.398539578139642, + "grad_norm": 2.2182259559631348, + "learning_rate": 2.669100703100597e-05, + "loss": 0.5853, + "step": 158200 + }, + { + "epoch": 1.398627981399954, + "grad_norm": 3.1300642490386963, + "learning_rate": 2.6689533643334103e-05, + "loss": 0.5051, + "step": 158210 + }, + { + "epoch": 1.3987163846602662, + "grad_norm": 1.35454523563385, + "learning_rate": 2.6688060255662228e-05, + "loss": 0.4375, + "step": 158220 + }, + { + "epoch": 1.3988047879205785, + "grad_norm": 1.7835090160369873, + "learning_rate": 2.668658686799036e-05, + "loss": 0.6597, + "step": 158230 + }, + { + "epoch": 1.3988931911808908, + "grad_norm": 5.0810346603393555, + "learning_rate": 2.668511348031849e-05, + "loss": 0.6704, + "step": 158240 + }, + { + "epoch": 1.398981594441203, + "grad_norm": 4.184632301330566, + "learning_rate": 2.6683640092646616e-05, + "loss": 0.4617, + "step": 158250 + }, + { + "epoch": 1.3990699977015153, + "grad_norm": 1.9390873908996582, + "learning_rate": 2.6682166704974748e-05, + "loss": 0.5741, + "step": 158260 + }, + { + "epoch": 1.3991584009618274, + "grad_norm": 1.558897852897644, + "learning_rate": 2.668069331730288e-05, + "loss": 0.7577, + "step": 158270 + }, + { + "epoch": 1.3992468042221398, + "grad_norm": 4.806708812713623, + "learning_rate": 2.6679219929631005e-05, + "loss": 0.5403, + "step": 158280 + }, + { + "epoch": 1.3993352074824519, + "grad_norm": 2.3022091388702393, + "learning_rate": 2.6677746541959137e-05, + "loss": 0.6078, + "step": 158290 + }, + { + "epoch": 1.3994236107427642, + "grad_norm": 1.2062190771102905, + "learning_rate": 2.667627315428726e-05, + "loss": 0.6788, + "step": 158300 + }, + { + "epoch": 1.3995120140030766, + "grad_norm": 4.970940589904785, + "learning_rate": 2.6674799766615393e-05, + "loss": 0.6319, + "step": 158310 + }, + { + "epoch": 1.3996004172633887, + "grad_norm": 6.246344566345215, + "learning_rate": 2.6673326378943525e-05, + "loss": 0.7757, + "step": 158320 + }, + { + "epoch": 1.3996888205237008, + "grad_norm": 1.4345197677612305, + "learning_rate": 2.667185299127165e-05, + "loss": 0.609, + "step": 158330 + }, + { + "epoch": 1.3997772237840131, + "grad_norm": 1.247517466545105, + "learning_rate": 2.6670379603599782e-05, + "loss": 0.6853, + "step": 158340 + }, + { + "epoch": 1.3998656270443255, + "grad_norm": 2.3947219848632812, + "learning_rate": 2.6668906215927914e-05, + "loss": 0.5954, + "step": 158350 + }, + { + "epoch": 1.3999540303046376, + "grad_norm": 6.077724456787109, + "learning_rate": 2.666743282825604e-05, + "loss": 0.6926, + "step": 158360 + }, + { + "epoch": 1.40004243356495, + "grad_norm": 3.187283515930176, + "learning_rate": 2.666595944058417e-05, + "loss": 0.739, + "step": 158370 + }, + { + "epoch": 1.400130836825262, + "grad_norm": 3.419934034347534, + "learning_rate": 2.6664486052912302e-05, + "loss": 0.659, + "step": 158380 + }, + { + "epoch": 1.4002192400855744, + "grad_norm": 2.3514151573181152, + "learning_rate": 2.6663012665240427e-05, + "loss": 0.6331, + "step": 158390 + }, + { + "epoch": 1.4003076433458865, + "grad_norm": 1.5123956203460693, + "learning_rate": 2.666153927756856e-05, + "loss": 0.6871, + "step": 158400 + }, + { + "epoch": 1.4003960466061989, + "grad_norm": 2.739198923110962, + "learning_rate": 2.6660065889896684e-05, + "loss": 0.5129, + "step": 158410 + }, + { + "epoch": 1.400484449866511, + "grad_norm": 1.086140513420105, + "learning_rate": 2.6658592502224815e-05, + "loss": 0.564, + "step": 158420 + }, + { + "epoch": 1.4005728531268233, + "grad_norm": 3.4808900356292725, + "learning_rate": 2.6657119114552947e-05, + "loss": 0.6658, + "step": 158430 + }, + { + "epoch": 1.4006612563871355, + "grad_norm": 4.097965717315674, + "learning_rate": 2.6655645726881072e-05, + "loss": 0.5703, + "step": 158440 + }, + { + "epoch": 1.4007496596474478, + "grad_norm": 4.910766124725342, + "learning_rate": 2.6654172339209204e-05, + "loss": 0.6817, + "step": 158450 + }, + { + "epoch": 1.4008380629077601, + "grad_norm": 3.675067663192749, + "learning_rate": 2.6652698951537336e-05, + "loss": 0.6133, + "step": 158460 + }, + { + "epoch": 1.4009264661680723, + "grad_norm": 1.8795480728149414, + "learning_rate": 2.665122556386546e-05, + "loss": 0.6082, + "step": 158470 + }, + { + "epoch": 1.4010148694283846, + "grad_norm": 2.6172196865081787, + "learning_rate": 2.6649752176193592e-05, + "loss": 0.5722, + "step": 158480 + }, + { + "epoch": 1.4011032726886967, + "grad_norm": 13.755043029785156, + "learning_rate": 2.6648278788521724e-05, + "loss": 0.6059, + "step": 158490 + }, + { + "epoch": 1.401191675949009, + "grad_norm": 2.8032913208007812, + "learning_rate": 2.664680540084985e-05, + "loss": 0.7376, + "step": 158500 + }, + { + "epoch": 1.4012800792093212, + "grad_norm": 4.964893341064453, + "learning_rate": 2.664533201317798e-05, + "loss": 0.6779, + "step": 158510 + }, + { + "epoch": 1.4013684824696335, + "grad_norm": 3.5689101219177246, + "learning_rate": 2.6643858625506106e-05, + "loss": 0.6091, + "step": 158520 + }, + { + "epoch": 1.4014568857299456, + "grad_norm": 3.090955972671509, + "learning_rate": 2.6642385237834237e-05, + "loss": 0.6149, + "step": 158530 + }, + { + "epoch": 1.401545288990258, + "grad_norm": 1.0367218255996704, + "learning_rate": 2.664091185016237e-05, + "loss": 0.5828, + "step": 158540 + }, + { + "epoch": 1.40163369225057, + "grad_norm": 8.77690601348877, + "learning_rate": 2.6639438462490494e-05, + "loss": 0.6477, + "step": 158550 + }, + { + "epoch": 1.4017220955108824, + "grad_norm": 3.570495367050171, + "learning_rate": 2.6637965074818626e-05, + "loss": 0.5514, + "step": 158560 + }, + { + "epoch": 1.4018104987711948, + "grad_norm": 1.879351258277893, + "learning_rate": 2.6636491687146758e-05, + "loss": 0.628, + "step": 158570 + }, + { + "epoch": 1.401898902031507, + "grad_norm": 2.3856704235076904, + "learning_rate": 2.6635018299474883e-05, + "loss": 0.5588, + "step": 158580 + }, + { + "epoch": 1.401987305291819, + "grad_norm": 4.7609076499938965, + "learning_rate": 2.6633544911803014e-05, + "loss": 0.5506, + "step": 158590 + }, + { + "epoch": 1.4020757085521314, + "grad_norm": 9.955957412719727, + "learning_rate": 2.6632071524131146e-05, + "loss": 0.5023, + "step": 158600 + }, + { + "epoch": 1.4021641118124437, + "grad_norm": 3.885927438735962, + "learning_rate": 2.663059813645927e-05, + "loss": 0.7053, + "step": 158610 + }, + { + "epoch": 1.4022525150727558, + "grad_norm": 2.1473939418792725, + "learning_rate": 2.6629124748787403e-05, + "loss": 0.6117, + "step": 158620 + }, + { + "epoch": 1.4023409183330682, + "grad_norm": 1.6334092617034912, + "learning_rate": 2.662765136111553e-05, + "loss": 0.77, + "step": 158630 + }, + { + "epoch": 1.4024293215933803, + "grad_norm": 4.547221660614014, + "learning_rate": 2.662617797344366e-05, + "loss": 0.7352, + "step": 158640 + }, + { + "epoch": 1.4025177248536926, + "grad_norm": 1.7922965288162231, + "learning_rate": 2.662470458577179e-05, + "loss": 0.6089, + "step": 158650 + }, + { + "epoch": 1.4026061281140048, + "grad_norm": 4.786975860595703, + "learning_rate": 2.662323119809992e-05, + "loss": 0.5484, + "step": 158660 + }, + { + "epoch": 1.402694531374317, + "grad_norm": 5.477907657623291, + "learning_rate": 2.6621757810428048e-05, + "loss": 0.6103, + "step": 158670 + }, + { + "epoch": 1.4027829346346294, + "grad_norm": 3.5993690490722656, + "learning_rate": 2.662028442275618e-05, + "loss": 0.5428, + "step": 158680 + }, + { + "epoch": 1.4028713378949416, + "grad_norm": 0.7107141017913818, + "learning_rate": 2.6618811035084308e-05, + "loss": 0.5918, + "step": 158690 + }, + { + "epoch": 1.4029597411552537, + "grad_norm": 3.7796475887298584, + "learning_rate": 2.6617337647412436e-05, + "loss": 0.6606, + "step": 158700 + }, + { + "epoch": 1.403048144415566, + "grad_norm": 4.401938438415527, + "learning_rate": 2.6615864259740568e-05, + "loss": 0.6108, + "step": 158710 + }, + { + "epoch": 1.4031365476758784, + "grad_norm": 2.1124398708343506, + "learning_rate": 2.6614390872068697e-05, + "loss": 0.6109, + "step": 158720 + }, + { + "epoch": 1.4032249509361905, + "grad_norm": 1.367019772529602, + "learning_rate": 2.6612917484396825e-05, + "loss": 0.5916, + "step": 158730 + }, + { + "epoch": 1.4033133541965028, + "grad_norm": 2.4775075912475586, + "learning_rate": 2.6611444096724957e-05, + "loss": 0.6035, + "step": 158740 + }, + { + "epoch": 1.403401757456815, + "grad_norm": 1.916231393814087, + "learning_rate": 2.6609970709053085e-05, + "loss": 0.6342, + "step": 158750 + }, + { + "epoch": 1.4034901607171273, + "grad_norm": 4.126253604888916, + "learning_rate": 2.6608497321381213e-05, + "loss": 0.5982, + "step": 158760 + }, + { + "epoch": 1.4035785639774394, + "grad_norm": 2.7555999755859375, + "learning_rate": 2.660702393370934e-05, + "loss": 0.5341, + "step": 158770 + }, + { + "epoch": 1.4036669672377518, + "grad_norm": 1.5839358568191528, + "learning_rate": 2.6605550546037473e-05, + "loss": 0.5637, + "step": 158780 + }, + { + "epoch": 1.403755370498064, + "grad_norm": 2.5511538982391357, + "learning_rate": 2.6604077158365602e-05, + "loss": 0.5151, + "step": 158790 + }, + { + "epoch": 1.4038437737583762, + "grad_norm": 10.26308536529541, + "learning_rate": 2.660260377069373e-05, + "loss": 0.479, + "step": 158800 + }, + { + "epoch": 1.4039321770186883, + "grad_norm": 4.052664756774902, + "learning_rate": 2.6601130383021862e-05, + "loss": 0.5079, + "step": 158810 + }, + { + "epoch": 1.4040205802790007, + "grad_norm": 22.951419830322266, + "learning_rate": 2.659965699534999e-05, + "loss": 0.6827, + "step": 158820 + }, + { + "epoch": 1.404108983539313, + "grad_norm": 1.4759076833724976, + "learning_rate": 2.659818360767812e-05, + "loss": 0.5656, + "step": 158830 + }, + { + "epoch": 1.4041973867996251, + "grad_norm": 1.3229501247406006, + "learning_rate": 2.659671022000625e-05, + "loss": 0.5172, + "step": 158840 + }, + { + "epoch": 1.4042857900599375, + "grad_norm": 0.9039010405540466, + "learning_rate": 2.659523683233438e-05, + "loss": 0.6397, + "step": 158850 + }, + { + "epoch": 1.4043741933202496, + "grad_norm": 7.029833793640137, + "learning_rate": 2.6593763444662507e-05, + "loss": 0.7808, + "step": 158860 + }, + { + "epoch": 1.404462596580562, + "grad_norm": 7.425211429595947, + "learning_rate": 2.659229005699064e-05, + "loss": 0.4676, + "step": 158870 + }, + { + "epoch": 1.404550999840874, + "grad_norm": 3.775947093963623, + "learning_rate": 2.6590816669318764e-05, + "loss": 0.5577, + "step": 158880 + }, + { + "epoch": 1.4046394031011864, + "grad_norm": 6.46895694732666, + "learning_rate": 2.6589343281646895e-05, + "loss": 0.6114, + "step": 158890 + }, + { + "epoch": 1.4047278063614987, + "grad_norm": 1.6449109315872192, + "learning_rate": 2.6587869893975027e-05, + "loss": 0.6345, + "step": 158900 + }, + { + "epoch": 1.4048162096218109, + "grad_norm": 3.386824369430542, + "learning_rate": 2.6586396506303152e-05, + "loss": 0.6939, + "step": 158910 + }, + { + "epoch": 1.404904612882123, + "grad_norm": 2.3983635902404785, + "learning_rate": 2.6584923118631284e-05, + "loss": 0.5914, + "step": 158920 + }, + { + "epoch": 1.4049930161424353, + "grad_norm": 2.1223251819610596, + "learning_rate": 2.6583449730959416e-05, + "loss": 0.6003, + "step": 158930 + }, + { + "epoch": 1.4050814194027477, + "grad_norm": 11.794602394104004, + "learning_rate": 2.658197634328754e-05, + "loss": 0.6593, + "step": 158940 + }, + { + "epoch": 1.4051698226630598, + "grad_norm": 5.02594518661499, + "learning_rate": 2.6580502955615672e-05, + "loss": 0.6623, + "step": 158950 + }, + { + "epoch": 1.4052582259233721, + "grad_norm": 2.471235752105713, + "learning_rate": 2.6579029567943804e-05, + "loss": 0.5952, + "step": 158960 + }, + { + "epoch": 1.4053466291836842, + "grad_norm": 9.714351654052734, + "learning_rate": 2.657755618027193e-05, + "loss": 0.4271, + "step": 158970 + }, + { + "epoch": 1.4054350324439966, + "grad_norm": 1.7939386367797852, + "learning_rate": 2.657608279260006e-05, + "loss": 0.6081, + "step": 158980 + }, + { + "epoch": 1.4055234357043087, + "grad_norm": 3.073272228240967, + "learning_rate": 2.6574609404928186e-05, + "loss": 0.545, + "step": 158990 + }, + { + "epoch": 1.405611838964621, + "grad_norm": 4.8265228271484375, + "learning_rate": 2.6573136017256318e-05, + "loss": 0.8304, + "step": 159000 + }, + { + "epoch": 1.4057002422249334, + "grad_norm": 1.4160538911819458, + "learning_rate": 2.657166262958445e-05, + "loss": 0.5998, + "step": 159010 + }, + { + "epoch": 1.4057886454852455, + "grad_norm": 3.731612205505371, + "learning_rate": 2.6570189241912574e-05, + "loss": 0.712, + "step": 159020 + }, + { + "epoch": 1.4058770487455576, + "grad_norm": 8.12205696105957, + "learning_rate": 2.6568715854240706e-05, + "loss": 0.7205, + "step": 159030 + }, + { + "epoch": 1.40596545200587, + "grad_norm": 5.020598411560059, + "learning_rate": 2.6567242466568838e-05, + "loss": 0.5902, + "step": 159040 + }, + { + "epoch": 1.4060538552661823, + "grad_norm": 2.0906896591186523, + "learning_rate": 2.6565769078896963e-05, + "loss": 0.5751, + "step": 159050 + }, + { + "epoch": 1.4061422585264944, + "grad_norm": 6.701564311981201, + "learning_rate": 2.6564295691225094e-05, + "loss": 0.6301, + "step": 159060 + }, + { + "epoch": 1.4062306617868068, + "grad_norm": 3.2989742755889893, + "learning_rate": 2.6562822303553226e-05, + "loss": 0.5688, + "step": 159070 + }, + { + "epoch": 1.406319065047119, + "grad_norm": 1.159401774406433, + "learning_rate": 2.656134891588135e-05, + "loss": 0.4446, + "step": 159080 + }, + { + "epoch": 1.4064074683074312, + "grad_norm": 7.220983505249023, + "learning_rate": 2.6559875528209483e-05, + "loss": 0.5983, + "step": 159090 + }, + { + "epoch": 1.4064958715677434, + "grad_norm": 0.9707807302474976, + "learning_rate": 2.6558402140537608e-05, + "loss": 0.5463, + "step": 159100 + }, + { + "epoch": 1.4065842748280557, + "grad_norm": 1.8643614053726196, + "learning_rate": 2.655692875286574e-05, + "loss": 0.6093, + "step": 159110 + }, + { + "epoch": 1.4066726780883678, + "grad_norm": 3.016954183578491, + "learning_rate": 2.655545536519387e-05, + "loss": 0.682, + "step": 159120 + }, + { + "epoch": 1.4067610813486802, + "grad_norm": 1.7046053409576416, + "learning_rate": 2.6553981977521996e-05, + "loss": 0.5484, + "step": 159130 + }, + { + "epoch": 1.4068494846089923, + "grad_norm": 1.19966721534729, + "learning_rate": 2.6552508589850128e-05, + "loss": 0.5397, + "step": 159140 + }, + { + "epoch": 1.4069378878693046, + "grad_norm": 2.3112664222717285, + "learning_rate": 2.655103520217826e-05, + "loss": 0.6095, + "step": 159150 + }, + { + "epoch": 1.407026291129617, + "grad_norm": 2.450408935546875, + "learning_rate": 2.6549561814506385e-05, + "loss": 0.4302, + "step": 159160 + }, + { + "epoch": 1.407114694389929, + "grad_norm": 2.4362261295318604, + "learning_rate": 2.6548088426834516e-05, + "loss": 0.5715, + "step": 159170 + }, + { + "epoch": 1.4072030976502412, + "grad_norm": 4.261641979217529, + "learning_rate": 2.6546615039162648e-05, + "loss": 0.7641, + "step": 159180 + }, + { + "epoch": 1.4072915009105535, + "grad_norm": 3.0175554752349854, + "learning_rate": 2.6545141651490773e-05, + "loss": 0.5958, + "step": 159190 + }, + { + "epoch": 1.407379904170866, + "grad_norm": 7.465358257293701, + "learning_rate": 2.6543668263818905e-05, + "loss": 0.4886, + "step": 159200 + }, + { + "epoch": 1.407468307431178, + "grad_norm": 1.147223711013794, + "learning_rate": 2.6542194876147037e-05, + "loss": 0.6535, + "step": 159210 + }, + { + "epoch": 1.4075567106914904, + "grad_norm": 14.22011947631836, + "learning_rate": 2.654072148847516e-05, + "loss": 0.6468, + "step": 159220 + }, + { + "epoch": 1.4076451139518025, + "grad_norm": 1.5749155282974243, + "learning_rate": 2.6539248100803293e-05, + "loss": 0.6327, + "step": 159230 + }, + { + "epoch": 1.4077335172121148, + "grad_norm": 3.546191692352295, + "learning_rate": 2.653777471313142e-05, + "loss": 0.5151, + "step": 159240 + }, + { + "epoch": 1.407821920472427, + "grad_norm": 1.1169731616973877, + "learning_rate": 2.653630132545955e-05, + "loss": 0.5143, + "step": 159250 + }, + { + "epoch": 1.4079103237327393, + "grad_norm": 6.529577732086182, + "learning_rate": 2.6534827937787682e-05, + "loss": 0.6038, + "step": 159260 + }, + { + "epoch": 1.4079987269930516, + "grad_norm": 3.341996192932129, + "learning_rate": 2.6533354550115807e-05, + "loss": 0.6311, + "step": 159270 + }, + { + "epoch": 1.4080871302533637, + "grad_norm": 1.4324045181274414, + "learning_rate": 2.653188116244394e-05, + "loss": 0.6621, + "step": 159280 + }, + { + "epoch": 1.4081755335136759, + "grad_norm": 7.230814456939697, + "learning_rate": 2.653040777477207e-05, + "loss": 0.58, + "step": 159290 + }, + { + "epoch": 1.4082639367739882, + "grad_norm": 2.9738571643829346, + "learning_rate": 2.6528934387100195e-05, + "loss": 0.6256, + "step": 159300 + }, + { + "epoch": 1.4083523400343005, + "grad_norm": 4.846588134765625, + "learning_rate": 2.6527460999428327e-05, + "loss": 0.5342, + "step": 159310 + }, + { + "epoch": 1.4084407432946127, + "grad_norm": 2.3289072513580322, + "learning_rate": 2.652598761175646e-05, + "loss": 0.6672, + "step": 159320 + }, + { + "epoch": 1.408529146554925, + "grad_norm": 2.5556800365448, + "learning_rate": 2.6524514224084584e-05, + "loss": 0.5299, + "step": 159330 + }, + { + "epoch": 1.4086175498152371, + "grad_norm": 18.285154342651367, + "learning_rate": 2.6523040836412715e-05, + "loss": 0.5968, + "step": 159340 + }, + { + "epoch": 1.4087059530755495, + "grad_norm": 2.8540220260620117, + "learning_rate": 2.652156744874084e-05, + "loss": 0.68, + "step": 159350 + }, + { + "epoch": 1.4087943563358616, + "grad_norm": 2.933842658996582, + "learning_rate": 2.6520094061068972e-05, + "loss": 0.5594, + "step": 159360 + }, + { + "epoch": 1.408882759596174, + "grad_norm": 1.7848162651062012, + "learning_rate": 2.6518620673397104e-05, + "loss": 0.6775, + "step": 159370 + }, + { + "epoch": 1.4089711628564863, + "grad_norm": 5.437075138092041, + "learning_rate": 2.651714728572523e-05, + "loss": 0.5941, + "step": 159380 + }, + { + "epoch": 1.4090595661167984, + "grad_norm": 7.368473052978516, + "learning_rate": 2.651567389805336e-05, + "loss": 0.6777, + "step": 159390 + }, + { + "epoch": 1.4091479693771105, + "grad_norm": 1.5570006370544434, + "learning_rate": 2.6514200510381492e-05, + "loss": 0.5639, + "step": 159400 + }, + { + "epoch": 1.4092363726374229, + "grad_norm": 3.418001174926758, + "learning_rate": 2.6512727122709617e-05, + "loss": 0.6875, + "step": 159410 + }, + { + "epoch": 1.4093247758977352, + "grad_norm": 2.95265531539917, + "learning_rate": 2.651125373503775e-05, + "loss": 0.5776, + "step": 159420 + }, + { + "epoch": 1.4094131791580473, + "grad_norm": 1.4178526401519775, + "learning_rate": 2.650978034736588e-05, + "loss": 0.6171, + "step": 159430 + }, + { + "epoch": 1.4095015824183597, + "grad_norm": 3.679302930831909, + "learning_rate": 2.6508306959694006e-05, + "loss": 0.6201, + "step": 159440 + }, + { + "epoch": 1.4095899856786718, + "grad_norm": 2.993734359741211, + "learning_rate": 2.6506833572022137e-05, + "loss": 0.6115, + "step": 159450 + }, + { + "epoch": 1.4096783889389841, + "grad_norm": 3.4420623779296875, + "learning_rate": 2.6505360184350262e-05, + "loss": 0.6573, + "step": 159460 + }, + { + "epoch": 1.4097667921992962, + "grad_norm": 2.0916452407836914, + "learning_rate": 2.6503886796678394e-05, + "loss": 0.6566, + "step": 159470 + }, + { + "epoch": 1.4098551954596086, + "grad_norm": 1.305911660194397, + "learning_rate": 2.6502413409006526e-05, + "loss": 0.6405, + "step": 159480 + }, + { + "epoch": 1.409943598719921, + "grad_norm": 4.582038879394531, + "learning_rate": 2.650094002133465e-05, + "loss": 0.5164, + "step": 159490 + }, + { + "epoch": 1.410032001980233, + "grad_norm": 1.4076347351074219, + "learning_rate": 2.6499466633662783e-05, + "loss": 0.6497, + "step": 159500 + }, + { + "epoch": 1.4101204052405452, + "grad_norm": 1.8839563131332397, + "learning_rate": 2.6497993245990914e-05, + "loss": 0.6284, + "step": 159510 + }, + { + "epoch": 1.4102088085008575, + "grad_norm": 2.5630011558532715, + "learning_rate": 2.649651985831904e-05, + "loss": 0.6347, + "step": 159520 + }, + { + "epoch": 1.4102972117611698, + "grad_norm": 0.8112945556640625, + "learning_rate": 2.649504647064717e-05, + "loss": 0.6317, + "step": 159530 + }, + { + "epoch": 1.410385615021482, + "grad_norm": 2.7287089824676514, + "learning_rate": 2.6493573082975303e-05, + "loss": 0.6246, + "step": 159540 + }, + { + "epoch": 1.4104740182817943, + "grad_norm": 2.543942928314209, + "learning_rate": 2.6492099695303428e-05, + "loss": 0.6773, + "step": 159550 + }, + { + "epoch": 1.4105624215421064, + "grad_norm": 2.2022199630737305, + "learning_rate": 2.649062630763156e-05, + "loss": 0.5178, + "step": 159560 + }, + { + "epoch": 1.4106508248024188, + "grad_norm": 0.7935627102851868, + "learning_rate": 2.6489152919959688e-05, + "loss": 0.6608, + "step": 159570 + }, + { + "epoch": 1.4107392280627309, + "grad_norm": 1.3393452167510986, + "learning_rate": 2.648767953228782e-05, + "loss": 0.7416, + "step": 159580 + }, + { + "epoch": 1.4108276313230432, + "grad_norm": 2.2884154319763184, + "learning_rate": 2.6486206144615948e-05, + "loss": 0.7118, + "step": 159590 + }, + { + "epoch": 1.4109160345833556, + "grad_norm": 2.903428792953491, + "learning_rate": 2.6484732756944076e-05, + "loss": 0.5632, + "step": 159600 + }, + { + "epoch": 1.4110044378436677, + "grad_norm": 2.1919758319854736, + "learning_rate": 2.6483259369272208e-05, + "loss": 0.5821, + "step": 159610 + }, + { + "epoch": 1.4110928411039798, + "grad_norm": 4.861763000488281, + "learning_rate": 2.6481785981600336e-05, + "loss": 0.5739, + "step": 159620 + }, + { + "epoch": 1.4111812443642922, + "grad_norm": 3.7185757160186768, + "learning_rate": 2.6480312593928465e-05, + "loss": 0.5976, + "step": 159630 + }, + { + "epoch": 1.4112696476246045, + "grad_norm": 3.7825937271118164, + "learning_rate": 2.6478839206256597e-05, + "loss": 0.673, + "step": 159640 + }, + { + "epoch": 1.4113580508849166, + "grad_norm": 2.2832963466644287, + "learning_rate": 2.6477365818584725e-05, + "loss": 0.5858, + "step": 159650 + }, + { + "epoch": 1.411446454145229, + "grad_norm": 2.159510612487793, + "learning_rate": 2.6475892430912853e-05, + "loss": 0.5939, + "step": 159660 + }, + { + "epoch": 1.411534857405541, + "grad_norm": 7.305959701538086, + "learning_rate": 2.6474419043240985e-05, + "loss": 0.5744, + "step": 159670 + }, + { + "epoch": 1.4116232606658534, + "grad_norm": 1.4871844053268433, + "learning_rate": 2.6472945655569113e-05, + "loss": 0.6274, + "step": 159680 + }, + { + "epoch": 1.4117116639261655, + "grad_norm": 2.290940761566162, + "learning_rate": 2.6471472267897242e-05, + "loss": 0.5052, + "step": 159690 + }, + { + "epoch": 1.4118000671864779, + "grad_norm": 7.085480213165283, + "learning_rate": 2.6469998880225373e-05, + "loss": 0.5243, + "step": 159700 + }, + { + "epoch": 1.41188847044679, + "grad_norm": 4.206778526306152, + "learning_rate": 2.64685254925535e-05, + "loss": 0.5879, + "step": 159710 + }, + { + "epoch": 1.4119768737071023, + "grad_norm": 1.0725706815719604, + "learning_rate": 2.646705210488163e-05, + "loss": 0.5319, + "step": 159720 + }, + { + "epoch": 1.4120652769674145, + "grad_norm": 0.9569700956344604, + "learning_rate": 2.6465578717209762e-05, + "loss": 0.7327, + "step": 159730 + }, + { + "epoch": 1.4121536802277268, + "grad_norm": 6.424351692199707, + "learning_rate": 2.6464105329537887e-05, + "loss": 0.7711, + "step": 159740 + }, + { + "epoch": 1.4122420834880391, + "grad_norm": 17.041942596435547, + "learning_rate": 2.646263194186602e-05, + "loss": 0.5717, + "step": 159750 + }, + { + "epoch": 1.4123304867483513, + "grad_norm": 3.166179895401001, + "learning_rate": 2.646115855419415e-05, + "loss": 0.6865, + "step": 159760 + }, + { + "epoch": 1.4124188900086634, + "grad_norm": 1.822239637374878, + "learning_rate": 2.6459685166522275e-05, + "loss": 0.6063, + "step": 159770 + }, + { + "epoch": 1.4125072932689757, + "grad_norm": 14.279197692871094, + "learning_rate": 2.6458211778850407e-05, + "loss": 0.5028, + "step": 159780 + }, + { + "epoch": 1.412595696529288, + "grad_norm": 3.0916919708251953, + "learning_rate": 2.645673839117854e-05, + "loss": 0.6056, + "step": 159790 + }, + { + "epoch": 1.4126840997896002, + "grad_norm": 6.698751449584961, + "learning_rate": 2.6455265003506664e-05, + "loss": 0.6232, + "step": 159800 + }, + { + "epoch": 1.4127725030499125, + "grad_norm": 0.8570348024368286, + "learning_rate": 2.6453791615834796e-05, + "loss": 0.7168, + "step": 159810 + }, + { + "epoch": 1.4128609063102247, + "grad_norm": 1.9548002481460571, + "learning_rate": 2.645231822816292e-05, + "loss": 0.6415, + "step": 159820 + }, + { + "epoch": 1.412949309570537, + "grad_norm": 4.140524864196777, + "learning_rate": 2.6450844840491052e-05, + "loss": 0.5397, + "step": 159830 + }, + { + "epoch": 1.4130377128308491, + "grad_norm": 2.5898735523223877, + "learning_rate": 2.6449371452819184e-05, + "loss": 0.5723, + "step": 159840 + }, + { + "epoch": 1.4131261160911615, + "grad_norm": 7.782649040222168, + "learning_rate": 2.644789806514731e-05, + "loss": 0.7294, + "step": 159850 + }, + { + "epoch": 1.4132145193514738, + "grad_norm": 1.7335716485977173, + "learning_rate": 2.644642467747544e-05, + "loss": 0.5761, + "step": 159860 + }, + { + "epoch": 1.413302922611786, + "grad_norm": 1.9642634391784668, + "learning_rate": 2.6444951289803572e-05, + "loss": 0.5269, + "step": 159870 + }, + { + "epoch": 1.413391325872098, + "grad_norm": 1.5840822458267212, + "learning_rate": 2.6443477902131697e-05, + "loss": 0.7299, + "step": 159880 + }, + { + "epoch": 1.4134797291324104, + "grad_norm": 2.5457520484924316, + "learning_rate": 2.644200451445983e-05, + "loss": 0.7514, + "step": 159890 + }, + { + "epoch": 1.4135681323927227, + "grad_norm": 2.503714084625244, + "learning_rate": 2.644053112678796e-05, + "loss": 0.699, + "step": 159900 + }, + { + "epoch": 1.4136565356530348, + "grad_norm": 6.777674674987793, + "learning_rate": 2.6439057739116086e-05, + "loss": 0.7319, + "step": 159910 + }, + { + "epoch": 1.4137449389133472, + "grad_norm": 3.254171133041382, + "learning_rate": 2.6437584351444218e-05, + "loss": 0.6473, + "step": 159920 + }, + { + "epoch": 1.4138333421736593, + "grad_norm": 28.20862579345703, + "learning_rate": 2.6436110963772343e-05, + "loss": 0.657, + "step": 159930 + }, + { + "epoch": 1.4139217454339716, + "grad_norm": 2.420679807662964, + "learning_rate": 2.6434637576100474e-05, + "loss": 0.6593, + "step": 159940 + }, + { + "epoch": 1.4140101486942838, + "grad_norm": 3.686713933944702, + "learning_rate": 2.6433164188428606e-05, + "loss": 0.5086, + "step": 159950 + }, + { + "epoch": 1.414098551954596, + "grad_norm": 2.3647212982177734, + "learning_rate": 2.643169080075673e-05, + "loss": 0.5481, + "step": 159960 + }, + { + "epoch": 1.4141869552149084, + "grad_norm": 2.3430593013763428, + "learning_rate": 2.6430217413084863e-05, + "loss": 0.5618, + "step": 159970 + }, + { + "epoch": 1.4142753584752206, + "grad_norm": 7.406024932861328, + "learning_rate": 2.6428744025412994e-05, + "loss": 0.6182, + "step": 159980 + }, + { + "epoch": 1.4143637617355327, + "grad_norm": 1.0082461833953857, + "learning_rate": 2.642727063774112e-05, + "loss": 0.535, + "step": 159990 + }, + { + "epoch": 1.414452164995845, + "grad_norm": 7.226438522338867, + "learning_rate": 2.642579725006925e-05, + "loss": 0.5938, + "step": 160000 + }, + { + "epoch": 1.4145405682561574, + "grad_norm": 2.860383987426758, + "learning_rate": 2.6424323862397383e-05, + "loss": 0.7929, + "step": 160010 + }, + { + "epoch": 1.4146289715164695, + "grad_norm": 9.406736373901367, + "learning_rate": 2.6422850474725508e-05, + "loss": 0.5238, + "step": 160020 + }, + { + "epoch": 1.4147173747767818, + "grad_norm": 2.0454769134521484, + "learning_rate": 2.642137708705364e-05, + "loss": 0.5808, + "step": 160030 + }, + { + "epoch": 1.414805778037094, + "grad_norm": 3.2237532138824463, + "learning_rate": 2.641990369938177e-05, + "loss": 0.5386, + "step": 160040 + }, + { + "epoch": 1.4148941812974063, + "grad_norm": 3.441462278366089, + "learning_rate": 2.6418430311709896e-05, + "loss": 0.5946, + "step": 160050 + }, + { + "epoch": 1.4149825845577184, + "grad_norm": 1.8770806789398193, + "learning_rate": 2.6416956924038028e-05, + "loss": 0.5488, + "step": 160060 + }, + { + "epoch": 1.4150709878180308, + "grad_norm": 4.879007816314697, + "learning_rate": 2.6415483536366153e-05, + "loss": 0.6087, + "step": 160070 + }, + { + "epoch": 1.415159391078343, + "grad_norm": 0.9643071293830872, + "learning_rate": 2.6414010148694285e-05, + "loss": 0.6084, + "step": 160080 + }, + { + "epoch": 1.4152477943386552, + "grad_norm": 8.081360816955566, + "learning_rate": 2.6412536761022417e-05, + "loss": 0.6593, + "step": 160090 + }, + { + "epoch": 1.4153361975989673, + "grad_norm": 2.666673183441162, + "learning_rate": 2.641106337335054e-05, + "loss": 0.7004, + "step": 160100 + }, + { + "epoch": 1.4154246008592797, + "grad_norm": 3.298182725906372, + "learning_rate": 2.6409589985678673e-05, + "loss": 0.5644, + "step": 160110 + }, + { + "epoch": 1.415513004119592, + "grad_norm": 10.879555702209473, + "learning_rate": 2.6408116598006805e-05, + "loss": 0.5782, + "step": 160120 + }, + { + "epoch": 1.4156014073799041, + "grad_norm": 4.905949115753174, + "learning_rate": 2.640664321033493e-05, + "loss": 0.5191, + "step": 160130 + }, + { + "epoch": 1.4156898106402165, + "grad_norm": 17.285911560058594, + "learning_rate": 2.640516982266306e-05, + "loss": 0.5591, + "step": 160140 + }, + { + "epoch": 1.4157782139005286, + "grad_norm": 3.5555167198181152, + "learning_rate": 2.6403696434991193e-05, + "loss": 0.6025, + "step": 160150 + }, + { + "epoch": 1.415866617160841, + "grad_norm": 6.737716197967529, + "learning_rate": 2.640222304731932e-05, + "loss": 0.5683, + "step": 160160 + }, + { + "epoch": 1.415955020421153, + "grad_norm": 4.676901817321777, + "learning_rate": 2.640074965964745e-05, + "loss": 0.6096, + "step": 160170 + }, + { + "epoch": 1.4160434236814654, + "grad_norm": 1.5498571395874023, + "learning_rate": 2.6399276271975575e-05, + "loss": 0.4815, + "step": 160180 + }, + { + "epoch": 1.4161318269417777, + "grad_norm": 1.014817476272583, + "learning_rate": 2.6397802884303707e-05, + "loss": 0.5848, + "step": 160190 + }, + { + "epoch": 1.4162202302020899, + "grad_norm": 5.023947238922119, + "learning_rate": 2.639632949663184e-05, + "loss": 0.7855, + "step": 160200 + }, + { + "epoch": 1.416308633462402, + "grad_norm": 6.053834915161133, + "learning_rate": 2.6394856108959964e-05, + "loss": 0.5075, + "step": 160210 + }, + { + "epoch": 1.4163970367227143, + "grad_norm": 1.7487704753875732, + "learning_rate": 2.6393382721288095e-05, + "loss": 0.6308, + "step": 160220 + }, + { + "epoch": 1.4164854399830267, + "grad_norm": 1.0961353778839111, + "learning_rate": 2.6391909333616227e-05, + "loss": 0.6471, + "step": 160230 + }, + { + "epoch": 1.4165738432433388, + "grad_norm": 1.803863525390625, + "learning_rate": 2.6390435945944352e-05, + "loss": 0.5399, + "step": 160240 + }, + { + "epoch": 1.4166622465036511, + "grad_norm": 2.776106357574463, + "learning_rate": 2.6388962558272484e-05, + "loss": 0.5949, + "step": 160250 + }, + { + "epoch": 1.4167506497639633, + "grad_norm": 2.740386962890625, + "learning_rate": 2.6387489170600615e-05, + "loss": 0.6215, + "step": 160260 + }, + { + "epoch": 1.4168390530242756, + "grad_norm": 1.117795705795288, + "learning_rate": 2.638601578292874e-05, + "loss": 0.6517, + "step": 160270 + }, + { + "epoch": 1.4169274562845877, + "grad_norm": 2.1224026679992676, + "learning_rate": 2.6384542395256872e-05, + "loss": 0.6073, + "step": 160280 + }, + { + "epoch": 1.4170158595449, + "grad_norm": 2.0323450565338135, + "learning_rate": 2.6383069007584997e-05, + "loss": 0.6173, + "step": 160290 + }, + { + "epoch": 1.4171042628052122, + "grad_norm": 5.175277233123779, + "learning_rate": 2.638159561991313e-05, + "loss": 0.5712, + "step": 160300 + }, + { + "epoch": 1.4171926660655245, + "grad_norm": 2.6836841106414795, + "learning_rate": 2.638012223224126e-05, + "loss": 0.6467, + "step": 160310 + }, + { + "epoch": 1.4172810693258366, + "grad_norm": 2.9890382289886475, + "learning_rate": 2.6378648844569386e-05, + "loss": 0.6943, + "step": 160320 + }, + { + "epoch": 1.417369472586149, + "grad_norm": 4.35649299621582, + "learning_rate": 2.6377175456897517e-05, + "loss": 0.6489, + "step": 160330 + }, + { + "epoch": 1.4174578758464613, + "grad_norm": 1.9911431074142456, + "learning_rate": 2.637570206922565e-05, + "loss": 0.6182, + "step": 160340 + }, + { + "epoch": 1.4175462791067734, + "grad_norm": 1.7357922792434692, + "learning_rate": 2.6374228681553774e-05, + "loss": 0.5361, + "step": 160350 + }, + { + "epoch": 1.4176346823670856, + "grad_norm": 2.3180503845214844, + "learning_rate": 2.6372755293881906e-05, + "loss": 0.625, + "step": 160360 + }, + { + "epoch": 1.417723085627398, + "grad_norm": 1.7044185400009155, + "learning_rate": 2.6371281906210038e-05, + "loss": 0.5539, + "step": 160370 + }, + { + "epoch": 1.4178114888877102, + "grad_norm": 1.8271578550338745, + "learning_rate": 2.6369808518538163e-05, + "loss": 0.6551, + "step": 160380 + }, + { + "epoch": 1.4178998921480224, + "grad_norm": 2.401309013366699, + "learning_rate": 2.6368335130866294e-05, + "loss": 0.6728, + "step": 160390 + }, + { + "epoch": 1.4179882954083347, + "grad_norm": 1.3315385580062866, + "learning_rate": 2.6366861743194423e-05, + "loss": 0.5476, + "step": 160400 + }, + { + "epoch": 1.4180766986686468, + "grad_norm": 1.1570377349853516, + "learning_rate": 2.636538835552255e-05, + "loss": 0.6512, + "step": 160410 + }, + { + "epoch": 1.4181651019289592, + "grad_norm": 1.9309003353118896, + "learning_rate": 2.6363914967850683e-05, + "loss": 0.5329, + "step": 160420 + }, + { + "epoch": 1.4182535051892713, + "grad_norm": 3.001166820526123, + "learning_rate": 2.636244158017881e-05, + "loss": 0.5772, + "step": 160430 + }, + { + "epoch": 1.4183419084495836, + "grad_norm": 1.5392869710922241, + "learning_rate": 2.636096819250694e-05, + "loss": 0.6, + "step": 160440 + }, + { + "epoch": 1.418430311709896, + "grad_norm": 2.6735455989837646, + "learning_rate": 2.635949480483507e-05, + "loss": 0.4952, + "step": 160450 + }, + { + "epoch": 1.418518714970208, + "grad_norm": 1.7346298694610596, + "learning_rate": 2.63580214171632e-05, + "loss": 0.5309, + "step": 160460 + }, + { + "epoch": 1.4186071182305202, + "grad_norm": 4.668123722076416, + "learning_rate": 2.6356548029491328e-05, + "loss": 0.6442, + "step": 160470 + }, + { + "epoch": 1.4186955214908326, + "grad_norm": 8.75904369354248, + "learning_rate": 2.635507464181946e-05, + "loss": 0.4956, + "step": 160480 + }, + { + "epoch": 1.418783924751145, + "grad_norm": 8.470442771911621, + "learning_rate": 2.6353601254147588e-05, + "loss": 0.5853, + "step": 160490 + }, + { + "epoch": 1.418872328011457, + "grad_norm": 2.0491862297058105, + "learning_rate": 2.6352127866475716e-05, + "loss": 0.4914, + "step": 160500 + }, + { + "epoch": 1.4189607312717694, + "grad_norm": 2.6826112270355225, + "learning_rate": 2.6350654478803848e-05, + "loss": 0.4903, + "step": 160510 + }, + { + "epoch": 1.4190491345320815, + "grad_norm": 2.543534755706787, + "learning_rate": 2.6349181091131976e-05, + "loss": 0.7342, + "step": 160520 + }, + { + "epoch": 1.4191375377923938, + "grad_norm": 4.157847881317139, + "learning_rate": 2.6347707703460105e-05, + "loss": 0.8395, + "step": 160530 + }, + { + "epoch": 1.419225941052706, + "grad_norm": 2.462852954864502, + "learning_rate": 2.6346234315788233e-05, + "loss": 0.6625, + "step": 160540 + }, + { + "epoch": 1.4193143443130183, + "grad_norm": 2.1189374923706055, + "learning_rate": 2.6344760928116365e-05, + "loss": 0.5799, + "step": 160550 + }, + { + "epoch": 1.4194027475733306, + "grad_norm": 5.7805986404418945, + "learning_rate": 2.6343287540444493e-05, + "loss": 0.61, + "step": 160560 + }, + { + "epoch": 1.4194911508336427, + "grad_norm": 10.9738187789917, + "learning_rate": 2.634181415277262e-05, + "loss": 0.4662, + "step": 160570 + }, + { + "epoch": 1.4195795540939549, + "grad_norm": 1.3473695516586304, + "learning_rate": 2.6340340765100753e-05, + "loss": 0.6168, + "step": 160580 + }, + { + "epoch": 1.4196679573542672, + "grad_norm": 6.541378021240234, + "learning_rate": 2.633886737742888e-05, + "loss": 0.6607, + "step": 160590 + }, + { + "epoch": 1.4197563606145795, + "grad_norm": 1.9721786975860596, + "learning_rate": 2.633739398975701e-05, + "loss": 0.5672, + "step": 160600 + }, + { + "epoch": 1.4198447638748917, + "grad_norm": 4.219167709350586, + "learning_rate": 2.6335920602085142e-05, + "loss": 0.6612, + "step": 160610 + }, + { + "epoch": 1.419933167135204, + "grad_norm": 5.972991466522217, + "learning_rate": 2.633444721441327e-05, + "loss": 0.6709, + "step": 160620 + }, + { + "epoch": 1.4200215703955161, + "grad_norm": 4.19579553604126, + "learning_rate": 2.63329738267414e-05, + "loss": 0.6408, + "step": 160630 + }, + { + "epoch": 1.4201099736558285, + "grad_norm": 2.4083566665649414, + "learning_rate": 2.633150043906953e-05, + "loss": 0.5156, + "step": 160640 + }, + { + "epoch": 1.4201983769161406, + "grad_norm": 1.0021512508392334, + "learning_rate": 2.6330027051397655e-05, + "loss": 0.6418, + "step": 160650 + }, + { + "epoch": 1.420286780176453, + "grad_norm": 11.212654113769531, + "learning_rate": 2.6328553663725787e-05, + "loss": 0.6667, + "step": 160660 + }, + { + "epoch": 1.4203751834367653, + "grad_norm": 2.1402647495269775, + "learning_rate": 2.632708027605392e-05, + "loss": 0.6329, + "step": 160670 + }, + { + "epoch": 1.4204635866970774, + "grad_norm": 1.101129174232483, + "learning_rate": 2.6325606888382044e-05, + "loss": 0.6209, + "step": 160680 + }, + { + "epoch": 1.4205519899573895, + "grad_norm": 2.521413564682007, + "learning_rate": 2.6324133500710175e-05, + "loss": 0.7676, + "step": 160690 + }, + { + "epoch": 1.4206403932177019, + "grad_norm": 1.835071325302124, + "learning_rate": 2.6322660113038307e-05, + "loss": 0.5728, + "step": 160700 + }, + { + "epoch": 1.4207287964780142, + "grad_norm": 7.986724376678467, + "learning_rate": 2.6321186725366432e-05, + "loss": 0.6278, + "step": 160710 + }, + { + "epoch": 1.4208171997383263, + "grad_norm": 2.8970935344696045, + "learning_rate": 2.6319713337694564e-05, + "loss": 0.7514, + "step": 160720 + }, + { + "epoch": 1.4209056029986387, + "grad_norm": 3.4715514183044434, + "learning_rate": 2.6318239950022696e-05, + "loss": 0.6582, + "step": 160730 + }, + { + "epoch": 1.4209940062589508, + "grad_norm": 1.9553172588348389, + "learning_rate": 2.631676656235082e-05, + "loss": 0.5588, + "step": 160740 + }, + { + "epoch": 1.4210824095192631, + "grad_norm": 19.131589889526367, + "learning_rate": 2.6315293174678952e-05, + "loss": 0.6642, + "step": 160750 + }, + { + "epoch": 1.4211708127795752, + "grad_norm": 8.412508010864258, + "learning_rate": 2.6313819787007077e-05, + "loss": 0.6862, + "step": 160760 + }, + { + "epoch": 1.4212592160398876, + "grad_norm": 1.5035523176193237, + "learning_rate": 2.631234639933521e-05, + "loss": 0.594, + "step": 160770 + }, + { + "epoch": 1.4213476193002, + "grad_norm": 0.656141996383667, + "learning_rate": 2.631087301166334e-05, + "loss": 0.5416, + "step": 160780 + }, + { + "epoch": 1.421436022560512, + "grad_norm": 1.2475924491882324, + "learning_rate": 2.6309399623991466e-05, + "loss": 0.6128, + "step": 160790 + }, + { + "epoch": 1.4215244258208242, + "grad_norm": 1.5268205404281616, + "learning_rate": 2.6307926236319597e-05, + "loss": 0.6367, + "step": 160800 + }, + { + "epoch": 1.4216128290811365, + "grad_norm": 8.67261028289795, + "learning_rate": 2.630645284864773e-05, + "loss": 0.6368, + "step": 160810 + }, + { + "epoch": 1.4217012323414489, + "grad_norm": 1.3337799310684204, + "learning_rate": 2.6304979460975854e-05, + "loss": 0.511, + "step": 160820 + }, + { + "epoch": 1.421789635601761, + "grad_norm": 2.545447587966919, + "learning_rate": 2.6303506073303986e-05, + "loss": 0.6217, + "step": 160830 + }, + { + "epoch": 1.4218780388620733, + "grad_norm": 4.208387851715088, + "learning_rate": 2.6302032685632118e-05, + "loss": 0.6286, + "step": 160840 + }, + { + "epoch": 1.4219664421223854, + "grad_norm": 2.237874984741211, + "learning_rate": 2.6300559297960243e-05, + "loss": 0.5451, + "step": 160850 + }, + { + "epoch": 1.4220548453826978, + "grad_norm": 1.8355919122695923, + "learning_rate": 2.6299085910288374e-05, + "loss": 0.7452, + "step": 160860 + }, + { + "epoch": 1.42214324864301, + "grad_norm": 6.9248857498168945, + "learning_rate": 2.62976125226165e-05, + "loss": 0.7023, + "step": 160870 + }, + { + "epoch": 1.4222316519033222, + "grad_norm": 2.60052752494812, + "learning_rate": 2.629613913494463e-05, + "loss": 0.6069, + "step": 160880 + }, + { + "epoch": 1.4223200551636344, + "grad_norm": 1.3317762613296509, + "learning_rate": 2.6294665747272763e-05, + "loss": 0.5568, + "step": 160890 + }, + { + "epoch": 1.4224084584239467, + "grad_norm": 2.9765264987945557, + "learning_rate": 2.6293192359600888e-05, + "loss": 0.5409, + "step": 160900 + }, + { + "epoch": 1.4224968616842588, + "grad_norm": 2.9700498580932617, + "learning_rate": 2.629171897192902e-05, + "loss": 0.677, + "step": 160910 + }, + { + "epoch": 1.4225852649445712, + "grad_norm": 1.7976515293121338, + "learning_rate": 2.629024558425715e-05, + "loss": 0.5045, + "step": 160920 + }, + { + "epoch": 1.4226736682048835, + "grad_norm": 1.568293571472168, + "learning_rate": 2.6288772196585276e-05, + "loss": 0.6589, + "step": 160930 + }, + { + "epoch": 1.4227620714651956, + "grad_norm": 21.234567642211914, + "learning_rate": 2.6287298808913408e-05, + "loss": 0.5197, + "step": 160940 + }, + { + "epoch": 1.4228504747255077, + "grad_norm": 4.011499881744385, + "learning_rate": 2.628582542124154e-05, + "loss": 0.5433, + "step": 160950 + }, + { + "epoch": 1.42293887798582, + "grad_norm": 4.501284122467041, + "learning_rate": 2.6284352033569665e-05, + "loss": 0.4991, + "step": 160960 + }, + { + "epoch": 1.4230272812461324, + "grad_norm": 4.294429302215576, + "learning_rate": 2.6282878645897796e-05, + "loss": 0.613, + "step": 160970 + }, + { + "epoch": 1.4231156845064445, + "grad_norm": 2.0576703548431396, + "learning_rate": 2.6281405258225928e-05, + "loss": 0.6648, + "step": 160980 + }, + { + "epoch": 1.4232040877667569, + "grad_norm": 1.1466312408447266, + "learning_rate": 2.6279931870554053e-05, + "loss": 0.6383, + "step": 160990 + }, + { + "epoch": 1.423292491027069, + "grad_norm": 2.554060935974121, + "learning_rate": 2.6278458482882185e-05, + "loss": 0.7308, + "step": 161000 + }, + { + "epoch": 1.4233808942873813, + "grad_norm": 2.549853801727295, + "learning_rate": 2.627698509521031e-05, + "loss": 0.628, + "step": 161010 + }, + { + "epoch": 1.4234692975476935, + "grad_norm": 2.7782649993896484, + "learning_rate": 2.627551170753844e-05, + "loss": 0.6412, + "step": 161020 + }, + { + "epoch": 1.4235577008080058, + "grad_norm": 0.8129737973213196, + "learning_rate": 2.6274038319866573e-05, + "loss": 0.6006, + "step": 161030 + }, + { + "epoch": 1.4236461040683182, + "grad_norm": 7.073207378387451, + "learning_rate": 2.6272564932194698e-05, + "loss": 0.6677, + "step": 161040 + }, + { + "epoch": 1.4237345073286303, + "grad_norm": 1.9187803268432617, + "learning_rate": 2.627109154452283e-05, + "loss": 0.6267, + "step": 161050 + }, + { + "epoch": 1.4238229105889424, + "grad_norm": 0.6884849071502686, + "learning_rate": 2.6269618156850962e-05, + "loss": 0.6123, + "step": 161060 + }, + { + "epoch": 1.4239113138492547, + "grad_norm": 1.2558132410049438, + "learning_rate": 2.6268144769179087e-05, + "loss": 0.6612, + "step": 161070 + }, + { + "epoch": 1.423999717109567, + "grad_norm": 6.449822425842285, + "learning_rate": 2.626667138150722e-05, + "loss": 0.6341, + "step": 161080 + }, + { + "epoch": 1.4240881203698792, + "grad_norm": 1.1429452896118164, + "learning_rate": 2.626519799383535e-05, + "loss": 0.5647, + "step": 161090 + }, + { + "epoch": 1.4241765236301915, + "grad_norm": 3.5178356170654297, + "learning_rate": 2.6263724606163475e-05, + "loss": 0.6495, + "step": 161100 + }, + { + "epoch": 1.4242649268905037, + "grad_norm": 2.5806288719177246, + "learning_rate": 2.6262251218491607e-05, + "loss": 0.5313, + "step": 161110 + }, + { + "epoch": 1.424353330150816, + "grad_norm": 11.872706413269043, + "learning_rate": 2.6260777830819732e-05, + "loss": 0.685, + "step": 161120 + }, + { + "epoch": 1.4244417334111281, + "grad_norm": 1.9377710819244385, + "learning_rate": 2.6259304443147864e-05, + "loss": 0.713, + "step": 161130 + }, + { + "epoch": 1.4245301366714405, + "grad_norm": 4.7357683181762695, + "learning_rate": 2.6257831055475995e-05, + "loss": 0.6366, + "step": 161140 + }, + { + "epoch": 1.4246185399317528, + "grad_norm": 1.9626750946044922, + "learning_rate": 2.625635766780412e-05, + "loss": 0.5811, + "step": 161150 + }, + { + "epoch": 1.424706943192065, + "grad_norm": 4.0224223136901855, + "learning_rate": 2.6254884280132252e-05, + "loss": 0.4357, + "step": 161160 + }, + { + "epoch": 1.424795346452377, + "grad_norm": 4.830257892608643, + "learning_rate": 2.6253410892460384e-05, + "loss": 0.6453, + "step": 161170 + }, + { + "epoch": 1.4248837497126894, + "grad_norm": 4.101064205169678, + "learning_rate": 2.625193750478851e-05, + "loss": 0.681, + "step": 161180 + }, + { + "epoch": 1.4249721529730017, + "grad_norm": 1.207502841949463, + "learning_rate": 2.625046411711664e-05, + "loss": 0.4554, + "step": 161190 + }, + { + "epoch": 1.4250605562333138, + "grad_norm": 6.373531341552734, + "learning_rate": 2.6248990729444772e-05, + "loss": 0.7484, + "step": 161200 + }, + { + "epoch": 1.4251489594936262, + "grad_norm": 3.479339122772217, + "learning_rate": 2.6247517341772897e-05, + "loss": 0.6728, + "step": 161210 + }, + { + "epoch": 1.4252373627539383, + "grad_norm": 1.463140845298767, + "learning_rate": 2.624604395410103e-05, + "loss": 0.5925, + "step": 161220 + }, + { + "epoch": 1.4253257660142507, + "grad_norm": 8.887065887451172, + "learning_rate": 2.6244570566429154e-05, + "loss": 0.5616, + "step": 161230 + }, + { + "epoch": 1.4254141692745628, + "grad_norm": 1.3086721897125244, + "learning_rate": 2.6243097178757286e-05, + "loss": 0.5419, + "step": 161240 + }, + { + "epoch": 1.4255025725348751, + "grad_norm": 1.8700672388076782, + "learning_rate": 2.6241623791085417e-05, + "loss": 0.5145, + "step": 161250 + }, + { + "epoch": 1.4255909757951875, + "grad_norm": 2.630340337753296, + "learning_rate": 2.6240150403413542e-05, + "loss": 0.635, + "step": 161260 + }, + { + "epoch": 1.4256793790554996, + "grad_norm": 22.789064407348633, + "learning_rate": 2.6238677015741674e-05, + "loss": 0.5904, + "step": 161270 + }, + { + "epoch": 1.4257677823158117, + "grad_norm": 7.103511333465576, + "learning_rate": 2.6237203628069806e-05, + "loss": 0.5613, + "step": 161280 + }, + { + "epoch": 1.425856185576124, + "grad_norm": 1.8684812784194946, + "learning_rate": 2.623573024039793e-05, + "loss": 0.6027, + "step": 161290 + }, + { + "epoch": 1.4259445888364364, + "grad_norm": 1.2190165519714355, + "learning_rate": 2.6234256852726063e-05, + "loss": 0.5527, + "step": 161300 + }, + { + "epoch": 1.4260329920967485, + "grad_norm": 4.788529872894287, + "learning_rate": 2.6232783465054194e-05, + "loss": 0.5814, + "step": 161310 + }, + { + "epoch": 1.4261213953570608, + "grad_norm": 1.8191792964935303, + "learning_rate": 2.623131007738232e-05, + "loss": 0.6695, + "step": 161320 + }, + { + "epoch": 1.426209798617373, + "grad_norm": 0.710638165473938, + "learning_rate": 2.622983668971045e-05, + "loss": 0.6834, + "step": 161330 + }, + { + "epoch": 1.4262982018776853, + "grad_norm": 3.264592409133911, + "learning_rate": 2.622836330203858e-05, + "loss": 0.7281, + "step": 161340 + }, + { + "epoch": 1.4263866051379974, + "grad_norm": 1.5470978021621704, + "learning_rate": 2.6226889914366708e-05, + "loss": 0.618, + "step": 161350 + }, + { + "epoch": 1.4264750083983098, + "grad_norm": 1.203267216682434, + "learning_rate": 2.622541652669484e-05, + "loss": 0.6747, + "step": 161360 + }, + { + "epoch": 1.426563411658622, + "grad_norm": 1.1337255239486694, + "learning_rate": 2.6223943139022968e-05, + "loss": 0.4655, + "step": 161370 + }, + { + "epoch": 1.4266518149189342, + "grad_norm": 4.025588035583496, + "learning_rate": 2.6222469751351096e-05, + "loss": 0.5607, + "step": 161380 + }, + { + "epoch": 1.4267402181792463, + "grad_norm": 0.8390575051307678, + "learning_rate": 2.6220996363679228e-05, + "loss": 0.4946, + "step": 161390 + }, + { + "epoch": 1.4268286214395587, + "grad_norm": 2.9481260776519775, + "learning_rate": 2.6219522976007356e-05, + "loss": 0.533, + "step": 161400 + }, + { + "epoch": 1.426917024699871, + "grad_norm": 3.012434720993042, + "learning_rate": 2.6218049588335485e-05, + "loss": 0.4758, + "step": 161410 + }, + { + "epoch": 1.4270054279601831, + "grad_norm": 1.6087599992752075, + "learning_rate": 2.6216576200663616e-05, + "loss": 0.6749, + "step": 161420 + }, + { + "epoch": 1.4270938312204955, + "grad_norm": 1.224970817565918, + "learning_rate": 2.6215102812991745e-05, + "loss": 0.7121, + "step": 161430 + }, + { + "epoch": 1.4271822344808076, + "grad_norm": 1.0578094720840454, + "learning_rate": 2.6213629425319873e-05, + "loss": 0.5582, + "step": 161440 + }, + { + "epoch": 1.42727063774112, + "grad_norm": 9.980646133422852, + "learning_rate": 2.6212156037648005e-05, + "loss": 0.6451, + "step": 161450 + }, + { + "epoch": 1.427359041001432, + "grad_norm": 2.742506504058838, + "learning_rate": 2.6210682649976133e-05, + "loss": 0.6079, + "step": 161460 + }, + { + "epoch": 1.4274474442617444, + "grad_norm": 3.062483549118042, + "learning_rate": 2.620920926230426e-05, + "loss": 0.4081, + "step": 161470 + }, + { + "epoch": 1.4275358475220565, + "grad_norm": 4.115296840667725, + "learning_rate": 2.620773587463239e-05, + "loss": 0.71, + "step": 161480 + }, + { + "epoch": 1.4276242507823689, + "grad_norm": 2.8837597370147705, + "learning_rate": 2.620626248696052e-05, + "loss": 0.6073, + "step": 161490 + }, + { + "epoch": 1.427712654042681, + "grad_norm": 5.456223964691162, + "learning_rate": 2.620478909928865e-05, + "loss": 0.5854, + "step": 161500 + }, + { + "epoch": 1.4278010573029933, + "grad_norm": 2.9360814094543457, + "learning_rate": 2.620331571161678e-05, + "loss": 0.5733, + "step": 161510 + }, + { + "epoch": 1.4278894605633057, + "grad_norm": 1.341850757598877, + "learning_rate": 2.620184232394491e-05, + "loss": 0.5321, + "step": 161520 + }, + { + "epoch": 1.4279778638236178, + "grad_norm": 2.6372475624084473, + "learning_rate": 2.620036893627304e-05, + "loss": 0.6352, + "step": 161530 + }, + { + "epoch": 1.42806626708393, + "grad_norm": 3.1966259479522705, + "learning_rate": 2.6198895548601167e-05, + "loss": 0.5907, + "step": 161540 + }, + { + "epoch": 1.4281546703442423, + "grad_norm": 4.268582820892334, + "learning_rate": 2.61974221609293e-05, + "loss": 0.666, + "step": 161550 + }, + { + "epoch": 1.4282430736045546, + "grad_norm": 2.418524980545044, + "learning_rate": 2.6195948773257427e-05, + "loss": 0.5888, + "step": 161560 + }, + { + "epoch": 1.4283314768648667, + "grad_norm": 2.0414352416992188, + "learning_rate": 2.6194475385585555e-05, + "loss": 0.5097, + "step": 161570 + }, + { + "epoch": 1.428419880125179, + "grad_norm": 11.2678861618042, + "learning_rate": 2.6193001997913687e-05, + "loss": 0.596, + "step": 161580 + }, + { + "epoch": 1.4285082833854912, + "grad_norm": 12.495491981506348, + "learning_rate": 2.6191528610241812e-05, + "loss": 0.7022, + "step": 161590 + }, + { + "epoch": 1.4285966866458035, + "grad_norm": 2.315694808959961, + "learning_rate": 2.6190055222569944e-05, + "loss": 0.6095, + "step": 161600 + }, + { + "epoch": 1.4286850899061156, + "grad_norm": 3.6961419582366943, + "learning_rate": 2.6188581834898075e-05, + "loss": 0.6796, + "step": 161610 + }, + { + "epoch": 1.428773493166428, + "grad_norm": 2.283215284347534, + "learning_rate": 2.61871084472262e-05, + "loss": 0.5508, + "step": 161620 + }, + { + "epoch": 1.4288618964267403, + "grad_norm": 2.245480537414551, + "learning_rate": 2.6185635059554332e-05, + "loss": 0.6805, + "step": 161630 + }, + { + "epoch": 1.4289502996870524, + "grad_norm": 15.089563369750977, + "learning_rate": 2.6184161671882464e-05, + "loss": 0.6279, + "step": 161640 + }, + { + "epoch": 1.4290387029473646, + "grad_norm": 1.8494623899459839, + "learning_rate": 2.618268828421059e-05, + "loss": 0.6328, + "step": 161650 + }, + { + "epoch": 1.429127106207677, + "grad_norm": 3.1747725009918213, + "learning_rate": 2.618121489653872e-05, + "loss": 0.5979, + "step": 161660 + }, + { + "epoch": 1.4292155094679893, + "grad_norm": 1.5071965456008911, + "learning_rate": 2.6179741508866852e-05, + "loss": 0.5722, + "step": 161670 + }, + { + "epoch": 1.4293039127283014, + "grad_norm": 2.5523529052734375, + "learning_rate": 2.6178268121194977e-05, + "loss": 0.7458, + "step": 161680 + }, + { + "epoch": 1.4293923159886137, + "grad_norm": 3.0697720050811768, + "learning_rate": 2.617679473352311e-05, + "loss": 0.6483, + "step": 161690 + }, + { + "epoch": 1.4294807192489258, + "grad_norm": 6.666508674621582, + "learning_rate": 2.6175321345851234e-05, + "loss": 0.5427, + "step": 161700 + }, + { + "epoch": 1.4295691225092382, + "grad_norm": 4.020336151123047, + "learning_rate": 2.6173847958179366e-05, + "loss": 0.5794, + "step": 161710 + }, + { + "epoch": 1.4296575257695503, + "grad_norm": 3.8209495544433594, + "learning_rate": 2.6172374570507497e-05, + "loss": 0.5047, + "step": 161720 + }, + { + "epoch": 1.4297459290298626, + "grad_norm": 1.0950429439544678, + "learning_rate": 2.6170901182835622e-05, + "loss": 0.6292, + "step": 161730 + }, + { + "epoch": 1.429834332290175, + "grad_norm": 4.650197982788086, + "learning_rate": 2.6169427795163754e-05, + "loss": 0.686, + "step": 161740 + }, + { + "epoch": 1.429922735550487, + "grad_norm": 4.526788711547852, + "learning_rate": 2.6167954407491886e-05, + "loss": 0.5967, + "step": 161750 + }, + { + "epoch": 1.4300111388107992, + "grad_norm": 7.717674732208252, + "learning_rate": 2.616648101982001e-05, + "loss": 0.5825, + "step": 161760 + }, + { + "epoch": 1.4300995420711116, + "grad_norm": 2.3685898780822754, + "learning_rate": 2.6165007632148143e-05, + "loss": 0.6557, + "step": 161770 + }, + { + "epoch": 1.430187945331424, + "grad_norm": 3.4794111251831055, + "learning_rate": 2.6163534244476274e-05, + "loss": 0.5531, + "step": 161780 + }, + { + "epoch": 1.430276348591736, + "grad_norm": 4.8207316398620605, + "learning_rate": 2.61620608568044e-05, + "loss": 0.6737, + "step": 161790 + }, + { + "epoch": 1.4303647518520484, + "grad_norm": 2.528627634048462, + "learning_rate": 2.616058746913253e-05, + "loss": 0.5746, + "step": 161800 + }, + { + "epoch": 1.4304531551123605, + "grad_norm": 2.007368326187134, + "learning_rate": 2.6159114081460656e-05, + "loss": 0.5995, + "step": 161810 + }, + { + "epoch": 1.4305415583726728, + "grad_norm": 1.885964035987854, + "learning_rate": 2.6157640693788788e-05, + "loss": 0.611, + "step": 161820 + }, + { + "epoch": 1.430629961632985, + "grad_norm": 1.7199714183807373, + "learning_rate": 2.615616730611692e-05, + "loss": 0.6451, + "step": 161830 + }, + { + "epoch": 1.4307183648932973, + "grad_norm": 4.746889114379883, + "learning_rate": 2.6154693918445044e-05, + "loss": 0.4711, + "step": 161840 + }, + { + "epoch": 1.4308067681536096, + "grad_norm": 5.606736183166504, + "learning_rate": 2.6153220530773176e-05, + "loss": 0.7349, + "step": 161850 + }, + { + "epoch": 1.4308951714139218, + "grad_norm": 2.159475564956665, + "learning_rate": 2.6151747143101308e-05, + "loss": 0.5479, + "step": 161860 + }, + { + "epoch": 1.4309835746742339, + "grad_norm": 4.461320877075195, + "learning_rate": 2.6150273755429433e-05, + "loss": 0.6135, + "step": 161870 + }, + { + "epoch": 1.4310719779345462, + "grad_norm": 3.7734806537628174, + "learning_rate": 2.6148800367757565e-05, + "loss": 0.6893, + "step": 161880 + }, + { + "epoch": 1.4311603811948586, + "grad_norm": 5.611871719360352, + "learning_rate": 2.6147326980085696e-05, + "loss": 0.4517, + "step": 161890 + }, + { + "epoch": 1.4312487844551707, + "grad_norm": 3.5616016387939453, + "learning_rate": 2.614585359241382e-05, + "loss": 0.6392, + "step": 161900 + }, + { + "epoch": 1.431337187715483, + "grad_norm": 5.073832035064697, + "learning_rate": 2.6144380204741953e-05, + "loss": 0.5417, + "step": 161910 + }, + { + "epoch": 1.4314255909757951, + "grad_norm": 1.7248265743255615, + "learning_rate": 2.6142906817070085e-05, + "loss": 0.5248, + "step": 161920 + }, + { + "epoch": 1.4315139942361075, + "grad_norm": 2.3456459045410156, + "learning_rate": 2.614143342939821e-05, + "loss": 0.539, + "step": 161930 + }, + { + "epoch": 1.4316023974964196, + "grad_norm": 2.0335206985473633, + "learning_rate": 2.613996004172634e-05, + "loss": 0.5889, + "step": 161940 + }, + { + "epoch": 1.431690800756732, + "grad_norm": 5.259879112243652, + "learning_rate": 2.6138486654054467e-05, + "loss": 0.5908, + "step": 161950 + }, + { + "epoch": 1.4317792040170443, + "grad_norm": 1.8193250894546509, + "learning_rate": 2.6137013266382598e-05, + "loss": 0.5954, + "step": 161960 + }, + { + "epoch": 1.4318676072773564, + "grad_norm": 2.533898115158081, + "learning_rate": 2.613553987871073e-05, + "loss": 0.6349, + "step": 161970 + }, + { + "epoch": 1.4319560105376685, + "grad_norm": 7.703001499176025, + "learning_rate": 2.6134066491038855e-05, + "loss": 0.5896, + "step": 161980 + }, + { + "epoch": 1.4320444137979809, + "grad_norm": 4.382285118103027, + "learning_rate": 2.6132593103366987e-05, + "loss": 0.6507, + "step": 161990 + }, + { + "epoch": 1.4321328170582932, + "grad_norm": 6.28700590133667, + "learning_rate": 2.613111971569512e-05, + "loss": 0.5939, + "step": 162000 + }, + { + "epoch": 1.4322212203186053, + "grad_norm": 8.765546798706055, + "learning_rate": 2.6129646328023243e-05, + "loss": 0.597, + "step": 162010 + }, + { + "epoch": 1.4323096235789177, + "grad_norm": 2.822014808654785, + "learning_rate": 2.6128172940351375e-05, + "loss": 0.5538, + "step": 162020 + }, + { + "epoch": 1.4323980268392298, + "grad_norm": 2.5320301055908203, + "learning_rate": 2.6126699552679507e-05, + "loss": 0.545, + "step": 162030 + }, + { + "epoch": 1.4324864300995421, + "grad_norm": 3.5387613773345947, + "learning_rate": 2.6125226165007632e-05, + "loss": 0.5022, + "step": 162040 + }, + { + "epoch": 1.4325748333598542, + "grad_norm": 2.996692657470703, + "learning_rate": 2.6123752777335764e-05, + "loss": 0.6151, + "step": 162050 + }, + { + "epoch": 1.4326632366201666, + "grad_norm": 1.551916241645813, + "learning_rate": 2.612227938966389e-05, + "loss": 0.6087, + "step": 162060 + }, + { + "epoch": 1.4327516398804787, + "grad_norm": 1.2909657955169678, + "learning_rate": 2.612080600199202e-05, + "loss": 0.4813, + "step": 162070 + }, + { + "epoch": 1.432840043140791, + "grad_norm": 2.6515233516693115, + "learning_rate": 2.6119332614320152e-05, + "loss": 0.5572, + "step": 162080 + }, + { + "epoch": 1.4329284464011032, + "grad_norm": 1.394481897354126, + "learning_rate": 2.6117859226648277e-05, + "loss": 0.7136, + "step": 162090 + }, + { + "epoch": 1.4330168496614155, + "grad_norm": 2.159165620803833, + "learning_rate": 2.611638583897641e-05, + "loss": 0.497, + "step": 162100 + }, + { + "epoch": 1.4331052529217279, + "grad_norm": 2.1746418476104736, + "learning_rate": 2.611491245130454e-05, + "loss": 0.5899, + "step": 162110 + }, + { + "epoch": 1.43319365618204, + "grad_norm": 2.519761323928833, + "learning_rate": 2.6113439063632665e-05, + "loss": 0.5736, + "step": 162120 + }, + { + "epoch": 1.433282059442352, + "grad_norm": 2.9637722969055176, + "learning_rate": 2.6111965675960797e-05, + "loss": 0.6743, + "step": 162130 + }, + { + "epoch": 1.4333704627026644, + "grad_norm": 1.2997210025787354, + "learning_rate": 2.611049228828893e-05, + "loss": 0.5524, + "step": 162140 + }, + { + "epoch": 1.4334588659629768, + "grad_norm": 9.77399730682373, + "learning_rate": 2.6109018900617054e-05, + "loss": 0.6186, + "step": 162150 + }, + { + "epoch": 1.433547269223289, + "grad_norm": 1.4343018531799316, + "learning_rate": 2.6107545512945186e-05, + "loss": 0.5757, + "step": 162160 + }, + { + "epoch": 1.4336356724836012, + "grad_norm": 1.8964042663574219, + "learning_rate": 2.610607212527331e-05, + "loss": 0.678, + "step": 162170 + }, + { + "epoch": 1.4337240757439134, + "grad_norm": 2.3213462829589844, + "learning_rate": 2.6104598737601442e-05, + "loss": 0.5234, + "step": 162180 + }, + { + "epoch": 1.4338124790042257, + "grad_norm": 1.0815372467041016, + "learning_rate": 2.6103125349929574e-05, + "loss": 0.6681, + "step": 162190 + }, + { + "epoch": 1.4339008822645378, + "grad_norm": 2.6626393795013428, + "learning_rate": 2.61016519622577e-05, + "loss": 0.5253, + "step": 162200 + }, + { + "epoch": 1.4339892855248502, + "grad_norm": 1.4560598134994507, + "learning_rate": 2.610017857458583e-05, + "loss": 0.5838, + "step": 162210 + }, + { + "epoch": 1.4340776887851625, + "grad_norm": 1.633857011795044, + "learning_rate": 2.6098705186913963e-05, + "loss": 0.6727, + "step": 162220 + }, + { + "epoch": 1.4341660920454746, + "grad_norm": 3.4809625148773193, + "learning_rate": 2.6097231799242088e-05, + "loss": 0.6221, + "step": 162230 + }, + { + "epoch": 1.4342544953057867, + "grad_norm": 6.129171848297119, + "learning_rate": 2.609575841157022e-05, + "loss": 0.6702, + "step": 162240 + }, + { + "epoch": 1.434342898566099, + "grad_norm": 8.678262710571289, + "learning_rate": 2.609428502389835e-05, + "loss": 0.7219, + "step": 162250 + }, + { + "epoch": 1.4344313018264114, + "grad_norm": 3.63297700881958, + "learning_rate": 2.6092811636226476e-05, + "loss": 0.5457, + "step": 162260 + }, + { + "epoch": 1.4345197050867236, + "grad_norm": 3.3888328075408936, + "learning_rate": 2.6091338248554608e-05, + "loss": 0.554, + "step": 162270 + }, + { + "epoch": 1.434608108347036, + "grad_norm": 1.793877124786377, + "learning_rate": 2.6089864860882736e-05, + "loss": 0.5358, + "step": 162280 + }, + { + "epoch": 1.434696511607348, + "grad_norm": 3.278965473175049, + "learning_rate": 2.6088391473210864e-05, + "loss": 0.6237, + "step": 162290 + }, + { + "epoch": 1.4347849148676604, + "grad_norm": 1.1085797548294067, + "learning_rate": 2.6086918085538996e-05, + "loss": 0.6651, + "step": 162300 + }, + { + "epoch": 1.4348733181279725, + "grad_norm": 4.023129940032959, + "learning_rate": 2.6085444697867125e-05, + "loss": 0.6262, + "step": 162310 + }, + { + "epoch": 1.4349617213882848, + "grad_norm": 8.151616096496582, + "learning_rate": 2.6083971310195253e-05, + "loss": 0.5932, + "step": 162320 + }, + { + "epoch": 1.4350501246485972, + "grad_norm": 2.363712787628174, + "learning_rate": 2.6082497922523385e-05, + "loss": 0.5092, + "step": 162330 + }, + { + "epoch": 1.4351385279089093, + "grad_norm": 1.5764716863632202, + "learning_rate": 2.6081024534851513e-05, + "loss": 0.5242, + "step": 162340 + }, + { + "epoch": 1.4352269311692214, + "grad_norm": 1.7375887632369995, + "learning_rate": 2.607955114717964e-05, + "loss": 0.6371, + "step": 162350 + }, + { + "epoch": 1.4353153344295337, + "grad_norm": 4.226534366607666, + "learning_rate": 2.6078077759507773e-05, + "loss": 0.4833, + "step": 162360 + }, + { + "epoch": 1.435403737689846, + "grad_norm": 3.9281508922576904, + "learning_rate": 2.60766043718359e-05, + "loss": 0.5534, + "step": 162370 + }, + { + "epoch": 1.4354921409501582, + "grad_norm": 9.115564346313477, + "learning_rate": 2.607513098416403e-05, + "loss": 0.5556, + "step": 162380 + }, + { + "epoch": 1.4355805442104705, + "grad_norm": 7.714043617248535, + "learning_rate": 2.607365759649216e-05, + "loss": 0.4868, + "step": 162390 + }, + { + "epoch": 1.4356689474707827, + "grad_norm": 1.9511128664016724, + "learning_rate": 2.607218420882029e-05, + "loss": 0.5432, + "step": 162400 + }, + { + "epoch": 1.435757350731095, + "grad_norm": 1.6880097389221191, + "learning_rate": 2.6070710821148418e-05, + "loss": 0.6266, + "step": 162410 + }, + { + "epoch": 1.4358457539914071, + "grad_norm": 1.64362633228302, + "learning_rate": 2.6069237433476547e-05, + "loss": 0.6547, + "step": 162420 + }, + { + "epoch": 1.4359341572517195, + "grad_norm": 2.097661018371582, + "learning_rate": 2.606776404580468e-05, + "loss": 0.6609, + "step": 162430 + }, + { + "epoch": 1.4360225605120318, + "grad_norm": 1.2369998693466187, + "learning_rate": 2.6066290658132807e-05, + "loss": 0.5857, + "step": 162440 + }, + { + "epoch": 1.436110963772344, + "grad_norm": 4.781394958496094, + "learning_rate": 2.6064817270460935e-05, + "loss": 0.6015, + "step": 162450 + }, + { + "epoch": 1.436199367032656, + "grad_norm": 6.276465892791748, + "learning_rate": 2.6063343882789067e-05, + "loss": 0.5349, + "step": 162460 + }, + { + "epoch": 1.4362877702929684, + "grad_norm": 6.646275043487549, + "learning_rate": 2.6061870495117195e-05, + "loss": 0.5763, + "step": 162470 + }, + { + "epoch": 1.4363761735532807, + "grad_norm": 7.209773540496826, + "learning_rate": 2.6060397107445324e-05, + "loss": 0.5422, + "step": 162480 + }, + { + "epoch": 1.4364645768135929, + "grad_norm": 2.5789430141448975, + "learning_rate": 2.6058923719773455e-05, + "loss": 0.6021, + "step": 162490 + }, + { + "epoch": 1.4365529800739052, + "grad_norm": 3.2698092460632324, + "learning_rate": 2.6057450332101584e-05, + "loss": 0.6449, + "step": 162500 + }, + { + "epoch": 1.4366413833342173, + "grad_norm": 3.841197967529297, + "learning_rate": 2.6055976944429712e-05, + "loss": 0.6016, + "step": 162510 + }, + { + "epoch": 1.4367297865945297, + "grad_norm": 2.9219274520874023, + "learning_rate": 2.6054503556757844e-05, + "loss": 0.6316, + "step": 162520 + }, + { + "epoch": 1.4368181898548418, + "grad_norm": 4.1972503662109375, + "learning_rate": 2.605303016908597e-05, + "loss": 0.5689, + "step": 162530 + }, + { + "epoch": 1.4369065931151541, + "grad_norm": 2.036928415298462, + "learning_rate": 2.60515567814141e-05, + "loss": 0.5514, + "step": 162540 + }, + { + "epoch": 1.4369949963754665, + "grad_norm": 1.4042549133300781, + "learning_rate": 2.6050083393742232e-05, + "loss": 0.6013, + "step": 162550 + }, + { + "epoch": 1.4370833996357786, + "grad_norm": 2.5507240295410156, + "learning_rate": 2.6048610006070357e-05, + "loss": 0.5252, + "step": 162560 + }, + { + "epoch": 1.4371718028960907, + "grad_norm": 14.307161331176758, + "learning_rate": 2.604713661839849e-05, + "loss": 0.6689, + "step": 162570 + }, + { + "epoch": 1.437260206156403, + "grad_norm": 2.6624224185943604, + "learning_rate": 2.604566323072662e-05, + "loss": 0.5598, + "step": 162580 + }, + { + "epoch": 1.4373486094167154, + "grad_norm": 8.971811294555664, + "learning_rate": 2.6044189843054746e-05, + "loss": 0.5781, + "step": 162590 + }, + { + "epoch": 1.4374370126770275, + "grad_norm": 6.02656364440918, + "learning_rate": 2.6042716455382877e-05, + "loss": 0.5422, + "step": 162600 + }, + { + "epoch": 1.4375254159373398, + "grad_norm": 1.2006080150604248, + "learning_rate": 2.604124306771101e-05, + "loss": 0.6252, + "step": 162610 + }, + { + "epoch": 1.437613819197652, + "grad_norm": 3.102445602416992, + "learning_rate": 2.6039769680039134e-05, + "loss": 0.6512, + "step": 162620 + }, + { + "epoch": 1.4377022224579643, + "grad_norm": 5.744995594024658, + "learning_rate": 2.6038296292367266e-05, + "loss": 0.4888, + "step": 162630 + }, + { + "epoch": 1.4377906257182764, + "grad_norm": 1.6488481760025024, + "learning_rate": 2.603682290469539e-05, + "loss": 0.6474, + "step": 162640 + }, + { + "epoch": 1.4378790289785888, + "grad_norm": 14.50014591217041, + "learning_rate": 2.6035349517023522e-05, + "loss": 0.5481, + "step": 162650 + }, + { + "epoch": 1.4379674322389009, + "grad_norm": 1.5166046619415283, + "learning_rate": 2.6033876129351654e-05, + "loss": 0.5951, + "step": 162660 + }, + { + "epoch": 1.4380558354992132, + "grad_norm": 2.7222585678100586, + "learning_rate": 2.603240274167978e-05, + "loss": 0.5437, + "step": 162670 + }, + { + "epoch": 1.4381442387595254, + "grad_norm": 1.432569980621338, + "learning_rate": 2.603092935400791e-05, + "loss": 0.5863, + "step": 162680 + }, + { + "epoch": 1.4382326420198377, + "grad_norm": 3.668592691421509, + "learning_rate": 2.6029455966336043e-05, + "loss": 0.5204, + "step": 162690 + }, + { + "epoch": 1.43832104528015, + "grad_norm": 1.389432668685913, + "learning_rate": 2.6027982578664168e-05, + "loss": 0.6487, + "step": 162700 + }, + { + "epoch": 1.4384094485404622, + "grad_norm": 2.746288299560547, + "learning_rate": 2.60265091909923e-05, + "loss": 0.7084, + "step": 162710 + }, + { + "epoch": 1.4384978518007743, + "grad_norm": 3.2672362327575684, + "learning_rate": 2.602503580332043e-05, + "loss": 0.7816, + "step": 162720 + }, + { + "epoch": 1.4385862550610866, + "grad_norm": 12.6722412109375, + "learning_rate": 2.6023562415648556e-05, + "loss": 0.5416, + "step": 162730 + }, + { + "epoch": 1.438674658321399, + "grad_norm": 5.013515949249268, + "learning_rate": 2.6022089027976688e-05, + "loss": 0.5489, + "step": 162740 + }, + { + "epoch": 1.438763061581711, + "grad_norm": 2.7230336666107178, + "learning_rate": 2.6020615640304813e-05, + "loss": 0.525, + "step": 162750 + }, + { + "epoch": 1.4388514648420234, + "grad_norm": 5.559991836547852, + "learning_rate": 2.6019142252632945e-05, + "loss": 0.6051, + "step": 162760 + }, + { + "epoch": 1.4389398681023355, + "grad_norm": 6.633627891540527, + "learning_rate": 2.6017668864961076e-05, + "loss": 0.5731, + "step": 162770 + }, + { + "epoch": 1.4390282713626479, + "grad_norm": 2.0652992725372314, + "learning_rate": 2.60161954772892e-05, + "loss": 0.5908, + "step": 162780 + }, + { + "epoch": 1.43911667462296, + "grad_norm": 1.779591679573059, + "learning_rate": 2.6014722089617333e-05, + "loss": 0.4873, + "step": 162790 + }, + { + "epoch": 1.4392050778832723, + "grad_norm": 1.1020110845565796, + "learning_rate": 2.6013248701945465e-05, + "loss": 0.6138, + "step": 162800 + }, + { + "epoch": 1.4392934811435847, + "grad_norm": 0.9076933264732361, + "learning_rate": 2.601177531427359e-05, + "loss": 0.6028, + "step": 162810 + }, + { + "epoch": 1.4393818844038968, + "grad_norm": 2.0878868103027344, + "learning_rate": 2.601030192660172e-05, + "loss": 0.5928, + "step": 162820 + }, + { + "epoch": 1.439470287664209, + "grad_norm": 2.130429744720459, + "learning_rate": 2.6008828538929853e-05, + "loss": 0.5192, + "step": 162830 + }, + { + "epoch": 1.4395586909245213, + "grad_norm": 1.096369981765747, + "learning_rate": 2.6007355151257978e-05, + "loss": 0.5802, + "step": 162840 + }, + { + "epoch": 1.4396470941848336, + "grad_norm": 3.403939962387085, + "learning_rate": 2.600588176358611e-05, + "loss": 0.5427, + "step": 162850 + }, + { + "epoch": 1.4397354974451457, + "grad_norm": 3.4605319499969482, + "learning_rate": 2.600440837591424e-05, + "loss": 0.658, + "step": 162860 + }, + { + "epoch": 1.439823900705458, + "grad_norm": 1.0626569986343384, + "learning_rate": 2.6002934988242367e-05, + "loss": 0.4823, + "step": 162870 + }, + { + "epoch": 1.4399123039657702, + "grad_norm": 3.51475191116333, + "learning_rate": 2.60014616005705e-05, + "loss": 0.634, + "step": 162880 + }, + { + "epoch": 1.4400007072260825, + "grad_norm": 4.519690990447998, + "learning_rate": 2.5999988212898623e-05, + "loss": 0.6418, + "step": 162890 + }, + { + "epoch": 1.4400891104863947, + "grad_norm": 4.688043594360352, + "learning_rate": 2.5998514825226755e-05, + "loss": 0.6243, + "step": 162900 + }, + { + "epoch": 1.440177513746707, + "grad_norm": 3.764296293258667, + "learning_rate": 2.5997041437554887e-05, + "loss": 0.5926, + "step": 162910 + }, + { + "epoch": 1.4402659170070193, + "grad_norm": 1.5430827140808105, + "learning_rate": 2.5995568049883012e-05, + "loss": 0.73, + "step": 162920 + }, + { + "epoch": 1.4403543202673315, + "grad_norm": 1.4045923948287964, + "learning_rate": 2.5994094662211143e-05, + "loss": 0.56, + "step": 162930 + }, + { + "epoch": 1.4404427235276436, + "grad_norm": 11.848942756652832, + "learning_rate": 2.5992621274539275e-05, + "loss": 0.6137, + "step": 162940 + }, + { + "epoch": 1.440531126787956, + "grad_norm": 3.742100954055786, + "learning_rate": 2.59911478868674e-05, + "loss": 0.6206, + "step": 162950 + }, + { + "epoch": 1.4406195300482683, + "grad_norm": 1.1643280982971191, + "learning_rate": 2.5989674499195532e-05, + "loss": 0.5386, + "step": 162960 + }, + { + "epoch": 1.4407079333085804, + "grad_norm": 2.6120107173919678, + "learning_rate": 2.5988201111523664e-05, + "loss": 0.537, + "step": 162970 + }, + { + "epoch": 1.4407963365688927, + "grad_norm": 1.7390090227127075, + "learning_rate": 2.598672772385179e-05, + "loss": 0.5117, + "step": 162980 + }, + { + "epoch": 1.4408847398292048, + "grad_norm": 1.3304662704467773, + "learning_rate": 2.598525433617992e-05, + "loss": 0.4873, + "step": 162990 + }, + { + "epoch": 1.4409731430895172, + "grad_norm": 7.268707752227783, + "learning_rate": 2.5983780948508045e-05, + "loss": 0.6188, + "step": 163000 + }, + { + "epoch": 1.4410615463498293, + "grad_norm": 1.9802719354629517, + "learning_rate": 2.5982307560836177e-05, + "loss": 0.6798, + "step": 163010 + }, + { + "epoch": 1.4411499496101416, + "grad_norm": 2.593223810195923, + "learning_rate": 2.598083417316431e-05, + "loss": 0.5599, + "step": 163020 + }, + { + "epoch": 1.441238352870454, + "grad_norm": 6.6110334396362305, + "learning_rate": 2.5979360785492434e-05, + "loss": 0.7445, + "step": 163030 + }, + { + "epoch": 1.441326756130766, + "grad_norm": 1.256464958190918, + "learning_rate": 2.5977887397820566e-05, + "loss": 0.4669, + "step": 163040 + }, + { + "epoch": 1.4414151593910782, + "grad_norm": 1.5307859182357788, + "learning_rate": 2.5976414010148697e-05, + "loss": 0.5798, + "step": 163050 + }, + { + "epoch": 1.4415035626513906, + "grad_norm": 2.0247325897216797, + "learning_rate": 2.5974940622476822e-05, + "loss": 0.6276, + "step": 163060 + }, + { + "epoch": 1.441591965911703, + "grad_norm": 17.28339385986328, + "learning_rate": 2.5973467234804954e-05, + "loss": 0.5822, + "step": 163070 + }, + { + "epoch": 1.441680369172015, + "grad_norm": 2.3798680305480957, + "learning_rate": 2.5971993847133086e-05, + "loss": 0.4811, + "step": 163080 + }, + { + "epoch": 1.4417687724323274, + "grad_norm": 2.943125009536743, + "learning_rate": 2.597052045946121e-05, + "loss": 0.641, + "step": 163090 + }, + { + "epoch": 1.4418571756926395, + "grad_norm": 2.5619585514068604, + "learning_rate": 2.5969047071789342e-05, + "loss": 0.5306, + "step": 163100 + }, + { + "epoch": 1.4419455789529518, + "grad_norm": 1.1987885236740112, + "learning_rate": 2.5967573684117467e-05, + "loss": 0.5237, + "step": 163110 + }, + { + "epoch": 1.442033982213264, + "grad_norm": 5.159473419189453, + "learning_rate": 2.59661002964456e-05, + "loss": 0.6278, + "step": 163120 + }, + { + "epoch": 1.4421223854735763, + "grad_norm": 8.181326866149902, + "learning_rate": 2.596462690877373e-05, + "loss": 0.6036, + "step": 163130 + }, + { + "epoch": 1.4422107887338886, + "grad_norm": 2.690187692642212, + "learning_rate": 2.5963153521101856e-05, + "loss": 0.5328, + "step": 163140 + }, + { + "epoch": 1.4422991919942008, + "grad_norm": 1.867153525352478, + "learning_rate": 2.5961680133429988e-05, + "loss": 0.465, + "step": 163150 + }, + { + "epoch": 1.4423875952545129, + "grad_norm": 7.317404270172119, + "learning_rate": 2.596020674575812e-05, + "loss": 0.5974, + "step": 163160 + }, + { + "epoch": 1.4424759985148252, + "grad_norm": 3.129351854324341, + "learning_rate": 2.5958733358086244e-05, + "loss": 0.8205, + "step": 163170 + }, + { + "epoch": 1.4425644017751376, + "grad_norm": 1.3179301023483276, + "learning_rate": 2.5957259970414376e-05, + "loss": 0.6368, + "step": 163180 + }, + { + "epoch": 1.4426528050354497, + "grad_norm": 10.699811935424805, + "learning_rate": 2.5955786582742508e-05, + "loss": 0.5653, + "step": 163190 + }, + { + "epoch": 1.442741208295762, + "grad_norm": 6.10602331161499, + "learning_rate": 2.5954313195070633e-05, + "loss": 0.5601, + "step": 163200 + }, + { + "epoch": 1.4428296115560741, + "grad_norm": 10.145916938781738, + "learning_rate": 2.5952839807398764e-05, + "loss": 0.5393, + "step": 163210 + }, + { + "epoch": 1.4429180148163865, + "grad_norm": 3.739583730697632, + "learning_rate": 2.5951366419726896e-05, + "loss": 0.5284, + "step": 163220 + }, + { + "epoch": 1.4430064180766986, + "grad_norm": 6.051761150360107, + "learning_rate": 2.594989303205502e-05, + "loss": 0.5538, + "step": 163230 + }, + { + "epoch": 1.443094821337011, + "grad_norm": 1.2931458950042725, + "learning_rate": 2.5948419644383153e-05, + "loss": 0.6153, + "step": 163240 + }, + { + "epoch": 1.443183224597323, + "grad_norm": 18.104881286621094, + "learning_rate": 2.594694625671128e-05, + "loss": 0.6126, + "step": 163250 + }, + { + "epoch": 1.4432716278576354, + "grad_norm": 5.546779632568359, + "learning_rate": 2.594547286903941e-05, + "loss": 0.7233, + "step": 163260 + }, + { + "epoch": 1.4433600311179475, + "grad_norm": 2.75102162361145, + "learning_rate": 2.594399948136754e-05, + "loss": 0.7219, + "step": 163270 + }, + { + "epoch": 1.4434484343782599, + "grad_norm": 2.815718650817871, + "learning_rate": 2.594252609369567e-05, + "loss": 0.5152, + "step": 163280 + }, + { + "epoch": 1.4435368376385722, + "grad_norm": 4.630425930023193, + "learning_rate": 2.5941052706023798e-05, + "loss": 0.7018, + "step": 163290 + }, + { + "epoch": 1.4436252408988843, + "grad_norm": 3.211033344268799, + "learning_rate": 2.593957931835193e-05, + "loss": 0.5249, + "step": 163300 + }, + { + "epoch": 1.4437136441591967, + "grad_norm": 1.5600229501724243, + "learning_rate": 2.5938105930680058e-05, + "loss": 0.496, + "step": 163310 + }, + { + "epoch": 1.4438020474195088, + "grad_norm": 1.2481801509857178, + "learning_rate": 2.5936632543008187e-05, + "loss": 0.5194, + "step": 163320 + }, + { + "epoch": 1.4438904506798211, + "grad_norm": 2.3871798515319824, + "learning_rate": 2.5935159155336318e-05, + "loss": 0.6149, + "step": 163330 + }, + { + "epoch": 1.4439788539401333, + "grad_norm": 2.9217076301574707, + "learning_rate": 2.5933685767664447e-05, + "loss": 0.6472, + "step": 163340 + }, + { + "epoch": 1.4440672572004456, + "grad_norm": 2.064652442932129, + "learning_rate": 2.5932212379992575e-05, + "loss": 0.7448, + "step": 163350 + }, + { + "epoch": 1.4441556604607577, + "grad_norm": 4.032621383666992, + "learning_rate": 2.5930738992320703e-05, + "loss": 0.6343, + "step": 163360 + }, + { + "epoch": 1.44424406372107, + "grad_norm": 5.0219502449035645, + "learning_rate": 2.5929265604648835e-05, + "loss": 0.5416, + "step": 163370 + }, + { + "epoch": 1.4443324669813822, + "grad_norm": 1.3431707620620728, + "learning_rate": 2.5927792216976963e-05, + "loss": 0.5947, + "step": 163380 + }, + { + "epoch": 1.4444208702416945, + "grad_norm": 5.522706508636475, + "learning_rate": 2.5926318829305092e-05, + "loss": 0.6294, + "step": 163390 + }, + { + "epoch": 1.4445092735020069, + "grad_norm": 3.8069798946380615, + "learning_rate": 2.5924845441633224e-05, + "loss": 0.695, + "step": 163400 + }, + { + "epoch": 1.444597676762319, + "grad_norm": 3.3847084045410156, + "learning_rate": 2.5923372053961352e-05, + "loss": 0.5162, + "step": 163410 + }, + { + "epoch": 1.444686080022631, + "grad_norm": 1.954024314880371, + "learning_rate": 2.592189866628948e-05, + "loss": 0.5632, + "step": 163420 + }, + { + "epoch": 1.4447744832829434, + "grad_norm": 5.948233604431152, + "learning_rate": 2.5920425278617612e-05, + "loss": 0.5513, + "step": 163430 + }, + { + "epoch": 1.4448628865432558, + "grad_norm": 2.3392529487609863, + "learning_rate": 2.591895189094574e-05, + "loss": 0.5386, + "step": 163440 + }, + { + "epoch": 1.444951289803568, + "grad_norm": 1.4230355024337769, + "learning_rate": 2.591747850327387e-05, + "loss": 0.592, + "step": 163450 + }, + { + "epoch": 1.4450396930638802, + "grad_norm": 1.8998687267303467, + "learning_rate": 2.5916005115602e-05, + "loss": 0.5133, + "step": 163460 + }, + { + "epoch": 1.4451280963241924, + "grad_norm": 5.14849853515625, + "learning_rate": 2.5914531727930125e-05, + "loss": 0.614, + "step": 163470 + }, + { + "epoch": 1.4452164995845047, + "grad_norm": 5.489749908447266, + "learning_rate": 2.5913058340258257e-05, + "loss": 0.5105, + "step": 163480 + }, + { + "epoch": 1.4453049028448168, + "grad_norm": 1.4034628868103027, + "learning_rate": 2.591158495258639e-05, + "loss": 0.5288, + "step": 163490 + }, + { + "epoch": 1.4453933061051292, + "grad_norm": 17.642122268676758, + "learning_rate": 2.5910111564914514e-05, + "loss": 0.6285, + "step": 163500 + }, + { + "epoch": 1.4454817093654415, + "grad_norm": 4.625466823577881, + "learning_rate": 2.5908638177242646e-05, + "loss": 0.7202, + "step": 163510 + }, + { + "epoch": 1.4455701126257536, + "grad_norm": 2.3592336177825928, + "learning_rate": 2.5907164789570777e-05, + "loss": 0.5312, + "step": 163520 + }, + { + "epoch": 1.4456585158860658, + "grad_norm": 4.763549327850342, + "learning_rate": 2.5905691401898902e-05, + "loss": 0.6423, + "step": 163530 + }, + { + "epoch": 1.445746919146378, + "grad_norm": 6.493288993835449, + "learning_rate": 2.5904218014227034e-05, + "loss": 0.7212, + "step": 163540 + }, + { + "epoch": 1.4458353224066904, + "grad_norm": 11.763323783874512, + "learning_rate": 2.5902744626555166e-05, + "loss": 0.6255, + "step": 163550 + }, + { + "epoch": 1.4459237256670026, + "grad_norm": 4.125518321990967, + "learning_rate": 2.590127123888329e-05, + "loss": 0.5503, + "step": 163560 + }, + { + "epoch": 1.446012128927315, + "grad_norm": 4.00100564956665, + "learning_rate": 2.5899797851211423e-05, + "loss": 0.5956, + "step": 163570 + }, + { + "epoch": 1.446100532187627, + "grad_norm": 5.836422443389893, + "learning_rate": 2.5898324463539547e-05, + "loss": 0.5177, + "step": 163580 + }, + { + "epoch": 1.4461889354479394, + "grad_norm": 2.6270346641540527, + "learning_rate": 2.589685107586768e-05, + "loss": 0.554, + "step": 163590 + }, + { + "epoch": 1.4462773387082515, + "grad_norm": 4.215670108795166, + "learning_rate": 2.589537768819581e-05, + "loss": 0.5933, + "step": 163600 + }, + { + "epoch": 1.4463657419685638, + "grad_norm": 1.8026701211929321, + "learning_rate": 2.5893904300523936e-05, + "loss": 0.6172, + "step": 163610 + }, + { + "epoch": 1.4464541452288762, + "grad_norm": 6.321165084838867, + "learning_rate": 2.5892430912852068e-05, + "loss": 0.5316, + "step": 163620 + }, + { + "epoch": 1.4465425484891883, + "grad_norm": 2.622074842453003, + "learning_rate": 2.58909575251802e-05, + "loss": 0.5677, + "step": 163630 + }, + { + "epoch": 1.4466309517495004, + "grad_norm": 3.058978319168091, + "learning_rate": 2.5889484137508324e-05, + "loss": 0.6025, + "step": 163640 + }, + { + "epoch": 1.4467193550098127, + "grad_norm": 1.5226470232009888, + "learning_rate": 2.5888010749836456e-05, + "loss": 0.5846, + "step": 163650 + }, + { + "epoch": 1.446807758270125, + "grad_norm": 1.815367579460144, + "learning_rate": 2.5886537362164588e-05, + "loss": 0.5233, + "step": 163660 + }, + { + "epoch": 1.4468961615304372, + "grad_norm": 0.8955591320991516, + "learning_rate": 2.5885063974492713e-05, + "loss": 0.4743, + "step": 163670 + }, + { + "epoch": 1.4469845647907496, + "grad_norm": 13.716712951660156, + "learning_rate": 2.5883590586820845e-05, + "loss": 0.7029, + "step": 163680 + }, + { + "epoch": 1.4470729680510617, + "grad_norm": 2.3181943893432617, + "learning_rate": 2.5882117199148976e-05, + "loss": 0.6114, + "step": 163690 + }, + { + "epoch": 1.447161371311374, + "grad_norm": 1.5615142583847046, + "learning_rate": 2.58806438114771e-05, + "loss": 0.6424, + "step": 163700 + }, + { + "epoch": 1.4472497745716861, + "grad_norm": 2.9645166397094727, + "learning_rate": 2.5879170423805233e-05, + "loss": 0.6658, + "step": 163710 + }, + { + "epoch": 1.4473381778319985, + "grad_norm": 4.734382629394531, + "learning_rate": 2.5877697036133358e-05, + "loss": 0.5654, + "step": 163720 + }, + { + "epoch": 1.4474265810923108, + "grad_norm": 2.744168519973755, + "learning_rate": 2.587622364846149e-05, + "loss": 0.6321, + "step": 163730 + }, + { + "epoch": 1.447514984352623, + "grad_norm": 2.9295029640197754, + "learning_rate": 2.587475026078962e-05, + "loss": 0.6988, + "step": 163740 + }, + { + "epoch": 1.447603387612935, + "grad_norm": 3.176943302154541, + "learning_rate": 2.5873276873117746e-05, + "loss": 0.643, + "step": 163750 + }, + { + "epoch": 1.4476917908732474, + "grad_norm": 1.8153252601623535, + "learning_rate": 2.5871803485445878e-05, + "loss": 0.5405, + "step": 163760 + }, + { + "epoch": 1.4477801941335597, + "grad_norm": 1.4959596395492554, + "learning_rate": 2.587033009777401e-05, + "loss": 0.6259, + "step": 163770 + }, + { + "epoch": 1.4478685973938719, + "grad_norm": 2.9318108558654785, + "learning_rate": 2.5868856710102135e-05, + "loss": 0.5346, + "step": 163780 + }, + { + "epoch": 1.4479570006541842, + "grad_norm": 2.953251838684082, + "learning_rate": 2.5867383322430267e-05, + "loss": 0.5994, + "step": 163790 + }, + { + "epoch": 1.4480454039144963, + "grad_norm": 4.248293876647949, + "learning_rate": 2.58659099347584e-05, + "loss": 0.6384, + "step": 163800 + }, + { + "epoch": 1.4481338071748087, + "grad_norm": 2.5765395164489746, + "learning_rate": 2.5864436547086523e-05, + "loss": 0.5957, + "step": 163810 + }, + { + "epoch": 1.4482222104351208, + "grad_norm": 7.987663269042969, + "learning_rate": 2.5862963159414655e-05, + "loss": 0.6086, + "step": 163820 + }, + { + "epoch": 1.4483106136954331, + "grad_norm": 1.3870084285736084, + "learning_rate": 2.586148977174278e-05, + "loss": 0.6118, + "step": 163830 + }, + { + "epoch": 1.4483990169557455, + "grad_norm": 1.799506425857544, + "learning_rate": 2.5860016384070912e-05, + "loss": 0.5134, + "step": 163840 + }, + { + "epoch": 1.4484874202160576, + "grad_norm": 3.09377384185791, + "learning_rate": 2.5858542996399044e-05, + "loss": 0.4939, + "step": 163850 + }, + { + "epoch": 1.4485758234763697, + "grad_norm": 2.490576982498169, + "learning_rate": 2.585706960872717e-05, + "loss": 0.6821, + "step": 163860 + }, + { + "epoch": 1.448664226736682, + "grad_norm": 2.213803768157959, + "learning_rate": 2.58555962210553e-05, + "loss": 0.5501, + "step": 163870 + }, + { + "epoch": 1.4487526299969944, + "grad_norm": 8.636711120605469, + "learning_rate": 2.5854122833383432e-05, + "loss": 0.5457, + "step": 163880 + }, + { + "epoch": 1.4488410332573065, + "grad_norm": 2.0155556201934814, + "learning_rate": 2.5852649445711557e-05, + "loss": 0.4364, + "step": 163890 + }, + { + "epoch": 1.4489294365176189, + "grad_norm": 6.429178237915039, + "learning_rate": 2.585117605803969e-05, + "loss": 0.5651, + "step": 163900 + }, + { + "epoch": 1.449017839777931, + "grad_norm": 5.18787956237793, + "learning_rate": 2.584970267036782e-05, + "loss": 0.659, + "step": 163910 + }, + { + "epoch": 1.4491062430382433, + "grad_norm": 6.4320387840271, + "learning_rate": 2.5848229282695945e-05, + "loss": 0.4547, + "step": 163920 + }, + { + "epoch": 1.4491946462985554, + "grad_norm": 3.056828022003174, + "learning_rate": 2.5846755895024077e-05, + "loss": 0.5395, + "step": 163930 + }, + { + "epoch": 1.4492830495588678, + "grad_norm": 1.1863996982574463, + "learning_rate": 2.5845282507352202e-05, + "loss": 0.4899, + "step": 163940 + }, + { + "epoch": 1.44937145281918, + "grad_norm": 2.12358021736145, + "learning_rate": 2.5843809119680334e-05, + "loss": 0.5816, + "step": 163950 + }, + { + "epoch": 1.4494598560794922, + "grad_norm": 4.030679225921631, + "learning_rate": 2.5842335732008466e-05, + "loss": 0.6144, + "step": 163960 + }, + { + "epoch": 1.4495482593398044, + "grad_norm": 1.533742070198059, + "learning_rate": 2.584086234433659e-05, + "loss": 0.5092, + "step": 163970 + }, + { + "epoch": 1.4496366626001167, + "grad_norm": 3.387834072113037, + "learning_rate": 2.5839388956664722e-05, + "loss": 0.7097, + "step": 163980 + }, + { + "epoch": 1.449725065860429, + "grad_norm": 1.3237627744674683, + "learning_rate": 2.5837915568992854e-05, + "loss": 0.5897, + "step": 163990 + }, + { + "epoch": 1.4498134691207412, + "grad_norm": 1.4648076295852661, + "learning_rate": 2.583644218132098e-05, + "loss": 0.7495, + "step": 164000 + }, + { + "epoch": 1.4499018723810533, + "grad_norm": 3.0642778873443604, + "learning_rate": 2.583496879364911e-05, + "loss": 0.5616, + "step": 164010 + }, + { + "epoch": 1.4499902756413656, + "grad_norm": 3.9176270961761475, + "learning_rate": 2.5833495405977242e-05, + "loss": 0.6663, + "step": 164020 + }, + { + "epoch": 1.450078678901678, + "grad_norm": 17.731969833374023, + "learning_rate": 2.5832022018305367e-05, + "loss": 0.5695, + "step": 164030 + }, + { + "epoch": 1.45016708216199, + "grad_norm": 2.0136289596557617, + "learning_rate": 2.58305486306335e-05, + "loss": 0.6485, + "step": 164040 + }, + { + "epoch": 1.4502554854223024, + "grad_norm": 1.26592218875885, + "learning_rate": 2.5829075242961624e-05, + "loss": 0.4891, + "step": 164050 + }, + { + "epoch": 1.4503438886826145, + "grad_norm": 6.124168395996094, + "learning_rate": 2.5827601855289756e-05, + "loss": 0.5678, + "step": 164060 + }, + { + "epoch": 1.4504322919429269, + "grad_norm": 4.74151611328125, + "learning_rate": 2.5826128467617888e-05, + "loss": 0.6572, + "step": 164070 + }, + { + "epoch": 1.450520695203239, + "grad_norm": 3.4685003757476807, + "learning_rate": 2.5824655079946013e-05, + "loss": 0.5702, + "step": 164080 + }, + { + "epoch": 1.4506090984635513, + "grad_norm": 2.7056164741516113, + "learning_rate": 2.5823181692274144e-05, + "loss": 0.5537, + "step": 164090 + }, + { + "epoch": 1.4506975017238637, + "grad_norm": 6.796932220458984, + "learning_rate": 2.5821708304602276e-05, + "loss": 0.636, + "step": 164100 + }, + { + "epoch": 1.4507859049841758, + "grad_norm": 1.6616939306259155, + "learning_rate": 2.58202349169304e-05, + "loss": 0.5289, + "step": 164110 + }, + { + "epoch": 1.450874308244488, + "grad_norm": 3.142630100250244, + "learning_rate": 2.5818761529258533e-05, + "loss": 0.6268, + "step": 164120 + }, + { + "epoch": 1.4509627115048003, + "grad_norm": 1.2217828035354614, + "learning_rate": 2.5817288141586665e-05, + "loss": 0.6011, + "step": 164130 + }, + { + "epoch": 1.4510511147651126, + "grad_norm": 1.3970904350280762, + "learning_rate": 2.581581475391479e-05, + "loss": 0.5293, + "step": 164140 + }, + { + "epoch": 1.4511395180254247, + "grad_norm": 1.7399171590805054, + "learning_rate": 2.581434136624292e-05, + "loss": 0.5658, + "step": 164150 + }, + { + "epoch": 1.451227921285737, + "grad_norm": 2.3229031562805176, + "learning_rate": 2.5812867978571053e-05, + "loss": 0.599, + "step": 164160 + }, + { + "epoch": 1.4513163245460492, + "grad_norm": 7.8017258644104, + "learning_rate": 2.5811394590899178e-05, + "loss": 0.7117, + "step": 164170 + }, + { + "epoch": 1.4514047278063615, + "grad_norm": 6.35605525970459, + "learning_rate": 2.580992120322731e-05, + "loss": 0.7188, + "step": 164180 + }, + { + "epoch": 1.4514931310666737, + "grad_norm": 2.08288311958313, + "learning_rate": 2.5808447815555438e-05, + "loss": 0.5334, + "step": 164190 + }, + { + "epoch": 1.451581534326986, + "grad_norm": 15.069218635559082, + "learning_rate": 2.5806974427883566e-05, + "loss": 0.812, + "step": 164200 + }, + { + "epoch": 1.4516699375872983, + "grad_norm": 3.739450216293335, + "learning_rate": 2.5805501040211698e-05, + "loss": 0.6396, + "step": 164210 + }, + { + "epoch": 1.4517583408476105, + "grad_norm": 2.323925495147705, + "learning_rate": 2.5804027652539827e-05, + "loss": 0.6348, + "step": 164220 + }, + { + "epoch": 1.4518467441079226, + "grad_norm": 1.292046308517456, + "learning_rate": 2.5802554264867955e-05, + "loss": 0.6547, + "step": 164230 + }, + { + "epoch": 1.451935147368235, + "grad_norm": 6.096904754638672, + "learning_rate": 2.5801080877196087e-05, + "loss": 0.6436, + "step": 164240 + }, + { + "epoch": 1.4520235506285473, + "grad_norm": 5.194807529449463, + "learning_rate": 2.5799607489524215e-05, + "loss": 0.5368, + "step": 164250 + }, + { + "epoch": 1.4521119538888594, + "grad_norm": 1.9867806434631348, + "learning_rate": 2.5798134101852343e-05, + "loss": 0.5341, + "step": 164260 + }, + { + "epoch": 1.4522003571491717, + "grad_norm": 4.554685592651367, + "learning_rate": 2.5796660714180475e-05, + "loss": 0.5455, + "step": 164270 + }, + { + "epoch": 1.4522887604094838, + "grad_norm": 4.112213134765625, + "learning_rate": 2.5795187326508603e-05, + "loss": 0.5344, + "step": 164280 + }, + { + "epoch": 1.4523771636697962, + "grad_norm": 1.4324764013290405, + "learning_rate": 2.5793713938836732e-05, + "loss": 0.6195, + "step": 164290 + }, + { + "epoch": 1.4524655669301083, + "grad_norm": 1.9947705268859863, + "learning_rate": 2.579224055116486e-05, + "loss": 0.7024, + "step": 164300 + }, + { + "epoch": 1.4525539701904207, + "grad_norm": 2.0354230403900146, + "learning_rate": 2.5790767163492992e-05, + "loss": 0.6673, + "step": 164310 + }, + { + "epoch": 1.452642373450733, + "grad_norm": 1.0773022174835205, + "learning_rate": 2.578929377582112e-05, + "loss": 0.5314, + "step": 164320 + }, + { + "epoch": 1.4527307767110451, + "grad_norm": 11.21201229095459, + "learning_rate": 2.578782038814925e-05, + "loss": 0.5639, + "step": 164330 + }, + { + "epoch": 1.4528191799713572, + "grad_norm": 2.3989248275756836, + "learning_rate": 2.578634700047738e-05, + "loss": 0.6197, + "step": 164340 + }, + { + "epoch": 1.4529075832316696, + "grad_norm": 1.8855544328689575, + "learning_rate": 2.578487361280551e-05, + "loss": 0.6155, + "step": 164350 + }, + { + "epoch": 1.452995986491982, + "grad_norm": 1.1655921936035156, + "learning_rate": 2.5783400225133637e-05, + "loss": 0.5079, + "step": 164360 + }, + { + "epoch": 1.453084389752294, + "grad_norm": 4.279953479766846, + "learning_rate": 2.578192683746177e-05, + "loss": 0.6271, + "step": 164370 + }, + { + "epoch": 1.4531727930126064, + "grad_norm": 6.052027225494385, + "learning_rate": 2.5780453449789897e-05, + "loss": 0.6589, + "step": 164380 + }, + { + "epoch": 1.4532611962729185, + "grad_norm": 7.289895057678223, + "learning_rate": 2.5778980062118025e-05, + "loss": 0.6789, + "step": 164390 + }, + { + "epoch": 1.4533495995332308, + "grad_norm": 2.2724192142486572, + "learning_rate": 2.5777506674446157e-05, + "loss": 0.6214, + "step": 164400 + }, + { + "epoch": 1.453438002793543, + "grad_norm": 2.834459066390991, + "learning_rate": 2.5776033286774282e-05, + "loss": 0.5595, + "step": 164410 + }, + { + "epoch": 1.4535264060538553, + "grad_norm": 8.435432434082031, + "learning_rate": 2.5774559899102414e-05, + "loss": 0.7499, + "step": 164420 + }, + { + "epoch": 1.4536148093141676, + "grad_norm": 1.8576384782791138, + "learning_rate": 2.5773086511430546e-05, + "loss": 0.5031, + "step": 164430 + }, + { + "epoch": 1.4537032125744798, + "grad_norm": 3.54443621635437, + "learning_rate": 2.577161312375867e-05, + "loss": 0.7047, + "step": 164440 + }, + { + "epoch": 1.4537916158347919, + "grad_norm": 1.3779367208480835, + "learning_rate": 2.5770139736086802e-05, + "loss": 0.5273, + "step": 164450 + }, + { + "epoch": 1.4538800190951042, + "grad_norm": 0.951066792011261, + "learning_rate": 2.5768666348414934e-05, + "loss": 0.4368, + "step": 164460 + }, + { + "epoch": 1.4539684223554166, + "grad_norm": 6.272188663482666, + "learning_rate": 2.576719296074306e-05, + "loss": 0.6842, + "step": 164470 + }, + { + "epoch": 1.4540568256157287, + "grad_norm": 2.189135789871216, + "learning_rate": 2.576571957307119e-05, + "loss": 0.5512, + "step": 164480 + }, + { + "epoch": 1.454145228876041, + "grad_norm": 9.620318412780762, + "learning_rate": 2.5764246185399323e-05, + "loss": 0.5882, + "step": 164490 + }, + { + "epoch": 1.4542336321363531, + "grad_norm": 1.9308042526245117, + "learning_rate": 2.5762772797727448e-05, + "loss": 0.5067, + "step": 164500 + }, + { + "epoch": 1.4543220353966655, + "grad_norm": 2.0730581283569336, + "learning_rate": 2.576129941005558e-05, + "loss": 0.5153, + "step": 164510 + }, + { + "epoch": 1.4544104386569776, + "grad_norm": 1.6916236877441406, + "learning_rate": 2.5759826022383704e-05, + "loss": 0.6213, + "step": 164520 + }, + { + "epoch": 1.45449884191729, + "grad_norm": 5.143879413604736, + "learning_rate": 2.5758352634711836e-05, + "loss": 0.5907, + "step": 164530 + }, + { + "epoch": 1.454587245177602, + "grad_norm": 1.4263319969177246, + "learning_rate": 2.5756879247039968e-05, + "loss": 0.6434, + "step": 164540 + }, + { + "epoch": 1.4546756484379144, + "grad_norm": 32.212013244628906, + "learning_rate": 2.5755405859368093e-05, + "loss": 0.5096, + "step": 164550 + }, + { + "epoch": 1.4547640516982265, + "grad_norm": 1.8083791732788086, + "learning_rate": 2.5753932471696224e-05, + "loss": 0.4848, + "step": 164560 + }, + { + "epoch": 1.4548524549585389, + "grad_norm": 5.165712356567383, + "learning_rate": 2.5752459084024356e-05, + "loss": 0.6446, + "step": 164570 + }, + { + "epoch": 1.4549408582188512, + "grad_norm": 4.633554935455322, + "learning_rate": 2.575098569635248e-05, + "loss": 0.6794, + "step": 164580 + }, + { + "epoch": 1.4550292614791633, + "grad_norm": 9.161764144897461, + "learning_rate": 2.5749512308680613e-05, + "loss": 0.5444, + "step": 164590 + }, + { + "epoch": 1.4551176647394755, + "grad_norm": 3.4681951999664307, + "learning_rate": 2.5748038921008745e-05, + "loss": 0.5406, + "step": 164600 + }, + { + "epoch": 1.4552060679997878, + "grad_norm": 3.597238540649414, + "learning_rate": 2.574656553333687e-05, + "loss": 0.5073, + "step": 164610 + }, + { + "epoch": 1.4552944712601001, + "grad_norm": 3.0702738761901855, + "learning_rate": 2.5745092145665e-05, + "loss": 0.4819, + "step": 164620 + }, + { + "epoch": 1.4553828745204123, + "grad_norm": 3.031830310821533, + "learning_rate": 2.5743618757993133e-05, + "loss": 0.6888, + "step": 164630 + }, + { + "epoch": 1.4554712777807246, + "grad_norm": 3.0176455974578857, + "learning_rate": 2.5742145370321258e-05, + "loss": 0.6312, + "step": 164640 + }, + { + "epoch": 1.4555596810410367, + "grad_norm": 1.930683970451355, + "learning_rate": 2.574067198264939e-05, + "loss": 0.6988, + "step": 164650 + }, + { + "epoch": 1.455648084301349, + "grad_norm": 12.490296363830566, + "learning_rate": 2.5739198594977515e-05, + "loss": 0.623, + "step": 164660 + }, + { + "epoch": 1.4557364875616612, + "grad_norm": 2.5703468322753906, + "learning_rate": 2.5737725207305646e-05, + "loss": 0.6131, + "step": 164670 + }, + { + "epoch": 1.4558248908219735, + "grad_norm": 8.548884391784668, + "learning_rate": 2.5736251819633778e-05, + "loss": 0.5618, + "step": 164680 + }, + { + "epoch": 1.4559132940822859, + "grad_norm": 3.454601526260376, + "learning_rate": 2.5734778431961903e-05, + "loss": 0.6014, + "step": 164690 + }, + { + "epoch": 1.456001697342598, + "grad_norm": 2.2383291721343994, + "learning_rate": 2.5733305044290035e-05, + "loss": 0.5733, + "step": 164700 + }, + { + "epoch": 1.45609010060291, + "grad_norm": 4.370964527130127, + "learning_rate": 2.5731831656618167e-05, + "loss": 0.6482, + "step": 164710 + }, + { + "epoch": 1.4561785038632225, + "grad_norm": 8.252481460571289, + "learning_rate": 2.573035826894629e-05, + "loss": 0.646, + "step": 164720 + }, + { + "epoch": 1.4562669071235348, + "grad_norm": 1.504181146621704, + "learning_rate": 2.5728884881274423e-05, + "loss": 0.4821, + "step": 164730 + }, + { + "epoch": 1.456355310383847, + "grad_norm": 1.9877575635910034, + "learning_rate": 2.5727411493602555e-05, + "loss": 0.5806, + "step": 164740 + }, + { + "epoch": 1.4564437136441593, + "grad_norm": 1.5914674997329712, + "learning_rate": 2.572593810593068e-05, + "loss": 0.569, + "step": 164750 + }, + { + "epoch": 1.4565321169044714, + "grad_norm": 2.445352077484131, + "learning_rate": 2.5724464718258812e-05, + "loss": 0.5511, + "step": 164760 + }, + { + "epoch": 1.4566205201647837, + "grad_norm": 4.389750003814697, + "learning_rate": 2.5722991330586937e-05, + "loss": 0.6666, + "step": 164770 + }, + { + "epoch": 1.4567089234250958, + "grad_norm": 1.926706075668335, + "learning_rate": 2.572151794291507e-05, + "loss": 0.5022, + "step": 164780 + }, + { + "epoch": 1.4567973266854082, + "grad_norm": 5.218235015869141, + "learning_rate": 2.57200445552432e-05, + "loss": 0.6544, + "step": 164790 + }, + { + "epoch": 1.4568857299457205, + "grad_norm": 5.551985740661621, + "learning_rate": 2.5718571167571325e-05, + "loss": 0.6384, + "step": 164800 + }, + { + "epoch": 1.4569741332060326, + "grad_norm": 9.404993057250977, + "learning_rate": 2.5717097779899457e-05, + "loss": 0.7557, + "step": 164810 + }, + { + "epoch": 1.4570625364663448, + "grad_norm": 5.8141913414001465, + "learning_rate": 2.571562439222759e-05, + "loss": 0.5351, + "step": 164820 + }, + { + "epoch": 1.457150939726657, + "grad_norm": 3.627639055252075, + "learning_rate": 2.5714151004555714e-05, + "loss": 0.5949, + "step": 164830 + }, + { + "epoch": 1.4572393429869694, + "grad_norm": 12.041051864624023, + "learning_rate": 2.5712677616883845e-05, + "loss": 0.738, + "step": 164840 + }, + { + "epoch": 1.4573277462472816, + "grad_norm": 5.507813453674316, + "learning_rate": 2.5711204229211977e-05, + "loss": 0.5774, + "step": 164850 + }, + { + "epoch": 1.457416149507594, + "grad_norm": 6.31934118270874, + "learning_rate": 2.5709730841540102e-05, + "loss": 0.5821, + "step": 164860 + }, + { + "epoch": 1.457504552767906, + "grad_norm": 4.954954147338867, + "learning_rate": 2.5708257453868234e-05, + "loss": 0.6281, + "step": 164870 + }, + { + "epoch": 1.4575929560282184, + "grad_norm": 0.9911690950393677, + "learning_rate": 2.570678406619636e-05, + "loss": 0.4857, + "step": 164880 + }, + { + "epoch": 1.4576813592885305, + "grad_norm": 1.8980528116226196, + "learning_rate": 2.570531067852449e-05, + "loss": 0.5968, + "step": 164890 + }, + { + "epoch": 1.4577697625488428, + "grad_norm": 6.934421539306641, + "learning_rate": 2.5703837290852622e-05, + "loss": 0.5965, + "step": 164900 + }, + { + "epoch": 1.4578581658091552, + "grad_norm": 1.7270981073379517, + "learning_rate": 2.5702363903180747e-05, + "loss": 0.6563, + "step": 164910 + }, + { + "epoch": 1.4579465690694673, + "grad_norm": 1.2252440452575684, + "learning_rate": 2.570089051550888e-05, + "loss": 0.5351, + "step": 164920 + }, + { + "epoch": 1.4580349723297794, + "grad_norm": 7.718968868255615, + "learning_rate": 2.569941712783701e-05, + "loss": 0.5336, + "step": 164930 + }, + { + "epoch": 1.4581233755900918, + "grad_norm": 3.558626413345337, + "learning_rate": 2.5697943740165136e-05, + "loss": 0.6577, + "step": 164940 + }, + { + "epoch": 1.458211778850404, + "grad_norm": 2.1842081546783447, + "learning_rate": 2.5696470352493267e-05, + "loss": 0.6618, + "step": 164950 + }, + { + "epoch": 1.4583001821107162, + "grad_norm": 1.4713033437728882, + "learning_rate": 2.56949969648214e-05, + "loss": 0.5566, + "step": 164960 + }, + { + "epoch": 1.4583885853710286, + "grad_norm": 2.79213547706604, + "learning_rate": 2.5693523577149524e-05, + "loss": 0.7123, + "step": 164970 + }, + { + "epoch": 1.4584769886313407, + "grad_norm": 2.363684892654419, + "learning_rate": 2.5692050189477656e-05, + "loss": 0.7089, + "step": 164980 + }, + { + "epoch": 1.458565391891653, + "grad_norm": 3.16656231880188, + "learning_rate": 2.569057680180578e-05, + "loss": 0.653, + "step": 164990 + }, + { + "epoch": 1.4586537951519651, + "grad_norm": 1.1046537160873413, + "learning_rate": 2.5689103414133913e-05, + "loss": 0.5114, + "step": 165000 + }, + { + "epoch": 1.4587421984122775, + "grad_norm": 2.486225128173828, + "learning_rate": 2.5687630026462044e-05, + "loss": 0.5947, + "step": 165010 + }, + { + "epoch": 1.4588306016725898, + "grad_norm": 2.149235248565674, + "learning_rate": 2.568615663879017e-05, + "loss": 0.5024, + "step": 165020 + }, + { + "epoch": 1.458919004932902, + "grad_norm": 4.746137619018555, + "learning_rate": 2.56846832511183e-05, + "loss": 0.6324, + "step": 165030 + }, + { + "epoch": 1.459007408193214, + "grad_norm": 1.5279366970062256, + "learning_rate": 2.5683209863446433e-05, + "loss": 0.5474, + "step": 165040 + }, + { + "epoch": 1.4590958114535264, + "grad_norm": 9.404610633850098, + "learning_rate": 2.5681736475774558e-05, + "loss": 0.5092, + "step": 165050 + }, + { + "epoch": 1.4591842147138387, + "grad_norm": 1.1786949634552002, + "learning_rate": 2.568026308810269e-05, + "loss": 0.6351, + "step": 165060 + }, + { + "epoch": 1.4592726179741509, + "grad_norm": 3.8415687084198, + "learning_rate": 2.567878970043082e-05, + "loss": 0.5673, + "step": 165070 + }, + { + "epoch": 1.4593610212344632, + "grad_norm": 3.9637155532836914, + "learning_rate": 2.5677316312758946e-05, + "loss": 0.4905, + "step": 165080 + }, + { + "epoch": 1.4594494244947753, + "grad_norm": 1.7195208072662354, + "learning_rate": 2.5675842925087078e-05, + "loss": 0.6538, + "step": 165090 + }, + { + "epoch": 1.4595378277550877, + "grad_norm": 2.8560831546783447, + "learning_rate": 2.567436953741521e-05, + "loss": 0.6156, + "step": 165100 + }, + { + "epoch": 1.4596262310153998, + "grad_norm": 3.751354455947876, + "learning_rate": 2.5672896149743335e-05, + "loss": 0.6192, + "step": 165110 + }, + { + "epoch": 1.4597146342757121, + "grad_norm": 4.121771812438965, + "learning_rate": 2.5671422762071466e-05, + "loss": 0.5216, + "step": 165120 + }, + { + "epoch": 1.4598030375360243, + "grad_norm": 1.919204831123352, + "learning_rate": 2.5669949374399595e-05, + "loss": 0.558, + "step": 165130 + }, + { + "epoch": 1.4598914407963366, + "grad_norm": 2.993875741958618, + "learning_rate": 2.5668475986727723e-05, + "loss": 0.6175, + "step": 165140 + }, + { + "epoch": 1.4599798440566487, + "grad_norm": 2.063328742980957, + "learning_rate": 2.5667002599055855e-05, + "loss": 0.6634, + "step": 165150 + }, + { + "epoch": 1.460068247316961, + "grad_norm": 1.126875877380371, + "learning_rate": 2.5665529211383983e-05, + "loss": 0.6615, + "step": 165160 + }, + { + "epoch": 1.4601566505772734, + "grad_norm": 4.320643901824951, + "learning_rate": 2.566405582371211e-05, + "loss": 0.6034, + "step": 165170 + }, + { + "epoch": 1.4602450538375855, + "grad_norm": 2.498349905014038, + "learning_rate": 2.5662582436040243e-05, + "loss": 0.5868, + "step": 165180 + }, + { + "epoch": 1.4603334570978976, + "grad_norm": 1.547664999961853, + "learning_rate": 2.5661109048368372e-05, + "loss": 0.5582, + "step": 165190 + }, + { + "epoch": 1.46042186035821, + "grad_norm": 2.9772160053253174, + "learning_rate": 2.56596356606965e-05, + "loss": 0.6408, + "step": 165200 + }, + { + "epoch": 1.4605102636185223, + "grad_norm": 5.7648444175720215, + "learning_rate": 2.5658162273024632e-05, + "loss": 0.6789, + "step": 165210 + }, + { + "epoch": 1.4605986668788344, + "grad_norm": 2.8161935806274414, + "learning_rate": 2.565668888535276e-05, + "loss": 0.5534, + "step": 165220 + }, + { + "epoch": 1.4606870701391468, + "grad_norm": 0.6714740991592407, + "learning_rate": 2.565521549768089e-05, + "loss": 0.5508, + "step": 165230 + }, + { + "epoch": 1.460775473399459, + "grad_norm": 2.9895434379577637, + "learning_rate": 2.5653742110009017e-05, + "loss": 0.6576, + "step": 165240 + }, + { + "epoch": 1.4608638766597712, + "grad_norm": 3.760072708129883, + "learning_rate": 2.565226872233715e-05, + "loss": 0.5562, + "step": 165250 + }, + { + "epoch": 1.4609522799200834, + "grad_norm": 1.5829070806503296, + "learning_rate": 2.5650795334665277e-05, + "loss": 0.531, + "step": 165260 + }, + { + "epoch": 1.4610406831803957, + "grad_norm": 2.8461713790893555, + "learning_rate": 2.5649321946993405e-05, + "loss": 0.6239, + "step": 165270 + }, + { + "epoch": 1.461129086440708, + "grad_norm": 3.457423686981201, + "learning_rate": 2.5647848559321537e-05, + "loss": 0.5131, + "step": 165280 + }, + { + "epoch": 1.4612174897010202, + "grad_norm": 1.7850918769836426, + "learning_rate": 2.5646375171649665e-05, + "loss": 0.5816, + "step": 165290 + }, + { + "epoch": 1.4613058929613323, + "grad_norm": 3.1808416843414307, + "learning_rate": 2.5644901783977794e-05, + "loss": 0.6218, + "step": 165300 + }, + { + "epoch": 1.4613942962216446, + "grad_norm": 6.663628101348877, + "learning_rate": 2.5643428396305926e-05, + "loss": 0.7077, + "step": 165310 + }, + { + "epoch": 1.461482699481957, + "grad_norm": 3.2090237140655518, + "learning_rate": 2.5641955008634054e-05, + "loss": 0.5629, + "step": 165320 + }, + { + "epoch": 1.461571102742269, + "grad_norm": 1.8510327339172363, + "learning_rate": 2.5640481620962182e-05, + "loss": 0.5733, + "step": 165330 + }, + { + "epoch": 1.4616595060025814, + "grad_norm": 2.715728998184204, + "learning_rate": 2.5639008233290314e-05, + "loss": 0.5221, + "step": 165340 + }, + { + "epoch": 1.4617479092628936, + "grad_norm": 1.6793752908706665, + "learning_rate": 2.563753484561844e-05, + "loss": 0.6006, + "step": 165350 + }, + { + "epoch": 1.461836312523206, + "grad_norm": 1.9382816553115845, + "learning_rate": 2.563606145794657e-05, + "loss": 0.5766, + "step": 165360 + }, + { + "epoch": 1.461924715783518, + "grad_norm": 1.7276549339294434, + "learning_rate": 2.5634588070274702e-05, + "loss": 0.5043, + "step": 165370 + }, + { + "epoch": 1.4620131190438304, + "grad_norm": 3.428576946258545, + "learning_rate": 2.5633114682602827e-05, + "loss": 0.5291, + "step": 165380 + }, + { + "epoch": 1.4621015223041427, + "grad_norm": 1.6354753971099854, + "learning_rate": 2.563164129493096e-05, + "loss": 0.5526, + "step": 165390 + }, + { + "epoch": 1.4621899255644548, + "grad_norm": 2.0869579315185547, + "learning_rate": 2.563016790725909e-05, + "loss": 0.7208, + "step": 165400 + }, + { + "epoch": 1.462278328824767, + "grad_norm": 5.547192096710205, + "learning_rate": 2.5628694519587216e-05, + "loss": 0.5818, + "step": 165410 + }, + { + "epoch": 1.4623667320850793, + "grad_norm": 5.048770427703857, + "learning_rate": 2.5627221131915348e-05, + "loss": 0.6396, + "step": 165420 + }, + { + "epoch": 1.4624551353453916, + "grad_norm": 3.276597023010254, + "learning_rate": 2.562574774424348e-05, + "loss": 0.6493, + "step": 165430 + }, + { + "epoch": 1.4625435386057037, + "grad_norm": 4.226131916046143, + "learning_rate": 2.5624274356571604e-05, + "loss": 0.7202, + "step": 165440 + }, + { + "epoch": 1.462631941866016, + "grad_norm": 0.8856672048568726, + "learning_rate": 2.5622800968899736e-05, + "loss": 0.5749, + "step": 165450 + }, + { + "epoch": 1.4627203451263282, + "grad_norm": 1.2508918046951294, + "learning_rate": 2.562132758122786e-05, + "loss": 0.6631, + "step": 165460 + }, + { + "epoch": 1.4628087483866405, + "grad_norm": 3.146450996398926, + "learning_rate": 2.5619854193555993e-05, + "loss": 0.5841, + "step": 165470 + }, + { + "epoch": 1.4628971516469527, + "grad_norm": 2.5681655406951904, + "learning_rate": 2.5618380805884124e-05, + "loss": 0.6587, + "step": 165480 + }, + { + "epoch": 1.462985554907265, + "grad_norm": 5.823937892913818, + "learning_rate": 2.561690741821225e-05, + "loss": 0.5335, + "step": 165490 + }, + { + "epoch": 1.4630739581675773, + "grad_norm": 5.969976902008057, + "learning_rate": 2.561543403054038e-05, + "loss": 0.602, + "step": 165500 + }, + { + "epoch": 1.4631623614278895, + "grad_norm": 1.256791353225708, + "learning_rate": 2.5613960642868513e-05, + "loss": 0.5627, + "step": 165510 + }, + { + "epoch": 1.4632507646882016, + "grad_norm": 2.244436740875244, + "learning_rate": 2.5612487255196638e-05, + "loss": 0.6193, + "step": 165520 + }, + { + "epoch": 1.463339167948514, + "grad_norm": 5.083354949951172, + "learning_rate": 2.561101386752477e-05, + "loss": 0.5478, + "step": 165530 + }, + { + "epoch": 1.4634275712088263, + "grad_norm": 9.455780029296875, + "learning_rate": 2.56095404798529e-05, + "loss": 0.5386, + "step": 165540 + }, + { + "epoch": 1.4635159744691384, + "grad_norm": 2.285196304321289, + "learning_rate": 2.5608067092181026e-05, + "loss": 0.651, + "step": 165550 + }, + { + "epoch": 1.4636043777294507, + "grad_norm": 2.8001251220703125, + "learning_rate": 2.5606593704509158e-05, + "loss": 0.627, + "step": 165560 + }, + { + "epoch": 1.4636927809897629, + "grad_norm": 4.559179306030273, + "learning_rate": 2.560512031683729e-05, + "loss": 0.7675, + "step": 165570 + }, + { + "epoch": 1.4637811842500752, + "grad_norm": 3.3552405834198, + "learning_rate": 2.5603646929165415e-05, + "loss": 0.6884, + "step": 165580 + }, + { + "epoch": 1.4638695875103873, + "grad_norm": 6.944356918334961, + "learning_rate": 2.5602173541493547e-05, + "loss": 0.4882, + "step": 165590 + }, + { + "epoch": 1.4639579907706997, + "grad_norm": 1.5141106843948364, + "learning_rate": 2.560070015382167e-05, + "loss": 0.5562, + "step": 165600 + }, + { + "epoch": 1.464046394031012, + "grad_norm": 10.830549240112305, + "learning_rate": 2.5599226766149803e-05, + "loss": 0.6326, + "step": 165610 + }, + { + "epoch": 1.4641347972913241, + "grad_norm": 4.002711296081543, + "learning_rate": 2.5597753378477935e-05, + "loss": 0.6317, + "step": 165620 + }, + { + "epoch": 1.4642232005516362, + "grad_norm": 1.8983798027038574, + "learning_rate": 2.559627999080606e-05, + "loss": 0.6843, + "step": 165630 + }, + { + "epoch": 1.4643116038119486, + "grad_norm": 2.2025256156921387, + "learning_rate": 2.559480660313419e-05, + "loss": 0.6304, + "step": 165640 + }, + { + "epoch": 1.464400007072261, + "grad_norm": 2.338712453842163, + "learning_rate": 2.5593333215462323e-05, + "loss": 0.6261, + "step": 165650 + }, + { + "epoch": 1.464488410332573, + "grad_norm": 2.222493886947632, + "learning_rate": 2.559185982779045e-05, + "loss": 0.6161, + "step": 165660 + }, + { + "epoch": 1.4645768135928854, + "grad_norm": 2.706639051437378, + "learning_rate": 2.559038644011858e-05, + "loss": 0.6721, + "step": 165670 + }, + { + "epoch": 1.4646652168531975, + "grad_norm": 10.1292142868042, + "learning_rate": 2.5588913052446712e-05, + "loss": 0.5578, + "step": 165680 + }, + { + "epoch": 1.4647536201135098, + "grad_norm": 4.619641304016113, + "learning_rate": 2.5587439664774837e-05, + "loss": 0.5937, + "step": 165690 + }, + { + "epoch": 1.464842023373822, + "grad_norm": 3.47676420211792, + "learning_rate": 2.558596627710297e-05, + "loss": 0.6585, + "step": 165700 + }, + { + "epoch": 1.4649304266341343, + "grad_norm": 8.229005813598633, + "learning_rate": 2.5584492889431094e-05, + "loss": 0.5872, + "step": 165710 + }, + { + "epoch": 1.4650188298944464, + "grad_norm": 1.3553125858306885, + "learning_rate": 2.5583019501759225e-05, + "loss": 0.5232, + "step": 165720 + }, + { + "epoch": 1.4651072331547588, + "grad_norm": 0.8650517463684082, + "learning_rate": 2.5581546114087357e-05, + "loss": 0.5782, + "step": 165730 + }, + { + "epoch": 1.465195636415071, + "grad_norm": 0.8444440960884094, + "learning_rate": 2.5580072726415482e-05, + "loss": 0.6146, + "step": 165740 + }, + { + "epoch": 1.4652840396753832, + "grad_norm": 3.3970048427581787, + "learning_rate": 2.5578599338743614e-05, + "loss": 0.6311, + "step": 165750 + }, + { + "epoch": 1.4653724429356956, + "grad_norm": 2.781576633453369, + "learning_rate": 2.5577125951071745e-05, + "loss": 0.6764, + "step": 165760 + }, + { + "epoch": 1.4654608461960077, + "grad_norm": 1.3261603116989136, + "learning_rate": 2.557565256339987e-05, + "loss": 0.6909, + "step": 165770 + }, + { + "epoch": 1.4655492494563198, + "grad_norm": 1.5934619903564453, + "learning_rate": 2.5574179175728002e-05, + "loss": 0.6065, + "step": 165780 + }, + { + "epoch": 1.4656376527166322, + "grad_norm": 1.2074617147445679, + "learning_rate": 2.5572705788056134e-05, + "loss": 0.5175, + "step": 165790 + }, + { + "epoch": 1.4657260559769445, + "grad_norm": 0.938444197177887, + "learning_rate": 2.557123240038426e-05, + "loss": 0.565, + "step": 165800 + }, + { + "epoch": 1.4658144592372566, + "grad_norm": 1.1136524677276611, + "learning_rate": 2.556975901271239e-05, + "loss": 0.5749, + "step": 165810 + }, + { + "epoch": 1.465902862497569, + "grad_norm": 1.3726590871810913, + "learning_rate": 2.5568285625040516e-05, + "loss": 0.5493, + "step": 165820 + }, + { + "epoch": 1.465991265757881, + "grad_norm": 3.4139211177825928, + "learning_rate": 2.5566812237368647e-05, + "loss": 0.5185, + "step": 165830 + }, + { + "epoch": 1.4660796690181934, + "grad_norm": 2.2957217693328857, + "learning_rate": 2.556533884969678e-05, + "loss": 0.5733, + "step": 165840 + }, + { + "epoch": 1.4661680722785055, + "grad_norm": 3.5023183822631836, + "learning_rate": 2.5563865462024904e-05, + "loss": 0.6435, + "step": 165850 + }, + { + "epoch": 1.4662564755388179, + "grad_norm": 9.044766426086426, + "learning_rate": 2.5562392074353036e-05, + "loss": 0.551, + "step": 165860 + }, + { + "epoch": 1.4663448787991302, + "grad_norm": 1.5988744497299194, + "learning_rate": 2.5560918686681168e-05, + "loss": 0.5926, + "step": 165870 + }, + { + "epoch": 1.4664332820594423, + "grad_norm": 3.6065478324890137, + "learning_rate": 2.5559445299009293e-05, + "loss": 0.553, + "step": 165880 + }, + { + "epoch": 1.4665216853197545, + "grad_norm": 2.2264952659606934, + "learning_rate": 2.5557971911337424e-05, + "loss": 0.6058, + "step": 165890 + }, + { + "epoch": 1.4666100885800668, + "grad_norm": 1.6522276401519775, + "learning_rate": 2.5556498523665556e-05, + "loss": 0.5613, + "step": 165900 + }, + { + "epoch": 1.4666984918403791, + "grad_norm": 2.521946430206299, + "learning_rate": 2.555502513599368e-05, + "loss": 0.51, + "step": 165910 + }, + { + "epoch": 1.4667868951006913, + "grad_norm": 1.7782458066940308, + "learning_rate": 2.5553551748321813e-05, + "loss": 0.6272, + "step": 165920 + }, + { + "epoch": 1.4668752983610036, + "grad_norm": 2.4633171558380127, + "learning_rate": 2.5552078360649944e-05, + "loss": 0.6356, + "step": 165930 + }, + { + "epoch": 1.4669637016213157, + "grad_norm": 19.534011840820312, + "learning_rate": 2.555060497297807e-05, + "loss": 0.7104, + "step": 165940 + }, + { + "epoch": 1.467052104881628, + "grad_norm": 3.2198565006256104, + "learning_rate": 2.55491315853062e-05, + "loss": 0.6513, + "step": 165950 + }, + { + "epoch": 1.4671405081419402, + "grad_norm": 4.993585586547852, + "learning_rate": 2.5547658197634326e-05, + "loss": 0.6339, + "step": 165960 + }, + { + "epoch": 1.4672289114022525, + "grad_norm": 1.893470287322998, + "learning_rate": 2.5546184809962458e-05, + "loss": 0.5702, + "step": 165970 + }, + { + "epoch": 1.4673173146625649, + "grad_norm": 11.073661804199219, + "learning_rate": 2.554471142229059e-05, + "loss": 0.7347, + "step": 165980 + }, + { + "epoch": 1.467405717922877, + "grad_norm": 5.126662731170654, + "learning_rate": 2.5543238034618715e-05, + "loss": 0.5893, + "step": 165990 + }, + { + "epoch": 1.4674941211831891, + "grad_norm": 9.233426094055176, + "learning_rate": 2.5541764646946846e-05, + "loss": 0.6406, + "step": 166000 + }, + { + "epoch": 1.4675825244435015, + "grad_norm": 3.078634738922119, + "learning_rate": 2.5540291259274978e-05, + "loss": 0.5937, + "step": 166010 + }, + { + "epoch": 1.4676709277038138, + "grad_norm": 1.6421597003936768, + "learning_rate": 2.5538817871603103e-05, + "loss": 0.4789, + "step": 166020 + }, + { + "epoch": 1.467759330964126, + "grad_norm": 1.9199482202529907, + "learning_rate": 2.5537344483931235e-05, + "loss": 0.688, + "step": 166030 + }, + { + "epoch": 1.4678477342244383, + "grad_norm": 3.4600327014923096, + "learning_rate": 2.5535871096259366e-05, + "loss": 0.5997, + "step": 166040 + }, + { + "epoch": 1.4679361374847504, + "grad_norm": 10.555093765258789, + "learning_rate": 2.553439770858749e-05, + "loss": 0.6382, + "step": 166050 + }, + { + "epoch": 1.4680245407450627, + "grad_norm": 4.561395168304443, + "learning_rate": 2.5532924320915623e-05, + "loss": 0.5911, + "step": 166060 + }, + { + "epoch": 1.4681129440053748, + "grad_norm": 4.001600742340088, + "learning_rate": 2.553145093324375e-05, + "loss": 0.5947, + "step": 166070 + }, + { + "epoch": 1.4682013472656872, + "grad_norm": 1.355435848236084, + "learning_rate": 2.552997754557188e-05, + "loss": 0.5764, + "step": 166080 + }, + { + "epoch": 1.4682897505259995, + "grad_norm": 6.710354328155518, + "learning_rate": 2.552850415790001e-05, + "loss": 0.688, + "step": 166090 + }, + { + "epoch": 1.4683781537863116, + "grad_norm": 1.9244849681854248, + "learning_rate": 2.552703077022814e-05, + "loss": 0.6373, + "step": 166100 + }, + { + "epoch": 1.4684665570466238, + "grad_norm": 5.5034589767456055, + "learning_rate": 2.552555738255627e-05, + "loss": 0.6617, + "step": 166110 + }, + { + "epoch": 1.468554960306936, + "grad_norm": 1.4997817277908325, + "learning_rate": 2.55240839948844e-05, + "loss": 0.5782, + "step": 166120 + }, + { + "epoch": 1.4686433635672484, + "grad_norm": 2.0024592876434326, + "learning_rate": 2.552261060721253e-05, + "loss": 0.5451, + "step": 166130 + }, + { + "epoch": 1.4687317668275606, + "grad_norm": 2.064544677734375, + "learning_rate": 2.5521137219540657e-05, + "loss": 0.6011, + "step": 166140 + }, + { + "epoch": 1.468820170087873, + "grad_norm": 0.968387246131897, + "learning_rate": 2.551966383186879e-05, + "loss": 0.5261, + "step": 166150 + }, + { + "epoch": 1.468908573348185, + "grad_norm": 3.8145291805267334, + "learning_rate": 2.5518190444196917e-05, + "loss": 0.5823, + "step": 166160 + }, + { + "epoch": 1.4689969766084974, + "grad_norm": 1.5181794166564941, + "learning_rate": 2.5516717056525045e-05, + "loss": 0.6442, + "step": 166170 + }, + { + "epoch": 1.4690853798688095, + "grad_norm": 5.1150407791137695, + "learning_rate": 2.5515243668853174e-05, + "loss": 0.6484, + "step": 166180 + }, + { + "epoch": 1.4691737831291218, + "grad_norm": 5.140918254852295, + "learning_rate": 2.5513770281181305e-05, + "loss": 0.5803, + "step": 166190 + }, + { + "epoch": 1.4692621863894342, + "grad_norm": 4.786940574645996, + "learning_rate": 2.5512296893509434e-05, + "loss": 0.6295, + "step": 166200 + }, + { + "epoch": 1.4693505896497463, + "grad_norm": 5.206173896789551, + "learning_rate": 2.5510823505837562e-05, + "loss": 0.6806, + "step": 166210 + }, + { + "epoch": 1.4694389929100584, + "grad_norm": 12.629183769226074, + "learning_rate": 2.5509350118165694e-05, + "loss": 0.6354, + "step": 166220 + }, + { + "epoch": 1.4695273961703708, + "grad_norm": 1.6833518743515015, + "learning_rate": 2.5507876730493822e-05, + "loss": 0.4418, + "step": 166230 + }, + { + "epoch": 1.469615799430683, + "grad_norm": 5.06895637512207, + "learning_rate": 2.550640334282195e-05, + "loss": 0.6074, + "step": 166240 + }, + { + "epoch": 1.4697042026909952, + "grad_norm": 2.3329856395721436, + "learning_rate": 2.5504929955150082e-05, + "loss": 0.6471, + "step": 166250 + }, + { + "epoch": 1.4697926059513076, + "grad_norm": 2.237107753753662, + "learning_rate": 2.550345656747821e-05, + "loss": 0.4574, + "step": 166260 + }, + { + "epoch": 1.4698810092116197, + "grad_norm": 1.2353187799453735, + "learning_rate": 2.550198317980634e-05, + "loss": 0.5754, + "step": 166270 + }, + { + "epoch": 1.469969412471932, + "grad_norm": 3.021536350250244, + "learning_rate": 2.550050979213447e-05, + "loss": 0.6932, + "step": 166280 + }, + { + "epoch": 1.4700578157322441, + "grad_norm": 14.792726516723633, + "learning_rate": 2.5499036404462596e-05, + "loss": 0.6391, + "step": 166290 + }, + { + "epoch": 1.4701462189925565, + "grad_norm": 1.6500245332717896, + "learning_rate": 2.5497563016790727e-05, + "loss": 0.6702, + "step": 166300 + }, + { + "epoch": 1.4702346222528686, + "grad_norm": 1.6836272478103638, + "learning_rate": 2.549608962911886e-05, + "loss": 0.5673, + "step": 166310 + }, + { + "epoch": 1.470323025513181, + "grad_norm": 1.1950279474258423, + "learning_rate": 2.5494616241446984e-05, + "loss": 0.6478, + "step": 166320 + }, + { + "epoch": 1.470411428773493, + "grad_norm": 1.10524582862854, + "learning_rate": 2.5493142853775116e-05, + "loss": 0.5465, + "step": 166330 + }, + { + "epoch": 1.4704998320338054, + "grad_norm": 1.3094438314437866, + "learning_rate": 2.5491669466103248e-05, + "loss": 0.7214, + "step": 166340 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 1.426546335220337, + "learning_rate": 2.5490196078431373e-05, + "loss": 0.594, + "step": 166350 + }, + { + "epoch": 1.4706766385544299, + "grad_norm": 1.3956400156021118, + "learning_rate": 2.5488722690759504e-05, + "loss": 0.6851, + "step": 166360 + }, + { + "epoch": 1.470765041814742, + "grad_norm": 2.0667717456817627, + "learning_rate": 2.5487249303087636e-05, + "loss": 0.4455, + "step": 166370 + }, + { + "epoch": 1.4708534450750543, + "grad_norm": 2.319659948348999, + "learning_rate": 2.548577591541576e-05, + "loss": 0.6068, + "step": 166380 + }, + { + "epoch": 1.4709418483353667, + "grad_norm": 1.7326587438583374, + "learning_rate": 2.5484302527743893e-05, + "loss": 0.621, + "step": 166390 + }, + { + "epoch": 1.4710302515956788, + "grad_norm": 1.8600358963012695, + "learning_rate": 2.5482829140072025e-05, + "loss": 0.5259, + "step": 166400 + }, + { + "epoch": 1.4711186548559911, + "grad_norm": 2.192852020263672, + "learning_rate": 2.548135575240015e-05, + "loss": 0.6756, + "step": 166410 + }, + { + "epoch": 1.4712070581163033, + "grad_norm": 1.141935110092163, + "learning_rate": 2.547988236472828e-05, + "loss": 0.5778, + "step": 166420 + }, + { + "epoch": 1.4712954613766156, + "grad_norm": 5.724787712097168, + "learning_rate": 2.5478408977056406e-05, + "loss": 0.67, + "step": 166430 + }, + { + "epoch": 1.4713838646369277, + "grad_norm": 1.5245126485824585, + "learning_rate": 2.5476935589384538e-05, + "loss": 0.6764, + "step": 166440 + }, + { + "epoch": 1.47147226789724, + "grad_norm": 4.510645866394043, + "learning_rate": 2.547546220171267e-05, + "loss": 0.565, + "step": 166450 + }, + { + "epoch": 1.4715606711575524, + "grad_norm": 1.9344482421875, + "learning_rate": 2.5473988814040795e-05, + "loss": 0.6386, + "step": 166460 + }, + { + "epoch": 1.4716490744178645, + "grad_norm": 1.6704473495483398, + "learning_rate": 2.5472515426368926e-05, + "loss": 0.5401, + "step": 166470 + }, + { + "epoch": 1.4717374776781766, + "grad_norm": 3.4325690269470215, + "learning_rate": 2.5471042038697058e-05, + "loss": 0.7064, + "step": 166480 + }, + { + "epoch": 1.471825880938489, + "grad_norm": 2.9860715866088867, + "learning_rate": 2.5469568651025183e-05, + "loss": 0.6302, + "step": 166490 + }, + { + "epoch": 1.4719142841988013, + "grad_norm": 1.640015959739685, + "learning_rate": 2.5468095263353315e-05, + "loss": 0.5719, + "step": 166500 + }, + { + "epoch": 1.4720026874591134, + "grad_norm": 1.1518672704696655, + "learning_rate": 2.5466621875681447e-05, + "loss": 0.4811, + "step": 166510 + }, + { + "epoch": 1.4720910907194258, + "grad_norm": 2.573875904083252, + "learning_rate": 2.546514848800957e-05, + "loss": 0.6277, + "step": 166520 + }, + { + "epoch": 1.472179493979738, + "grad_norm": 3.281602144241333, + "learning_rate": 2.5463675100337703e-05, + "loss": 0.7033, + "step": 166530 + }, + { + "epoch": 1.4722678972400502, + "grad_norm": 3.3142848014831543, + "learning_rate": 2.5462201712665828e-05, + "loss": 0.5406, + "step": 166540 + }, + { + "epoch": 1.4723563005003624, + "grad_norm": 2.545706033706665, + "learning_rate": 2.546072832499396e-05, + "loss": 0.501, + "step": 166550 + }, + { + "epoch": 1.4724447037606747, + "grad_norm": 1.3589352369308472, + "learning_rate": 2.5459254937322092e-05, + "loss": 0.5504, + "step": 166560 + }, + { + "epoch": 1.472533107020987, + "grad_norm": 1.3167109489440918, + "learning_rate": 2.5457781549650217e-05, + "loss": 0.5171, + "step": 166570 + }, + { + "epoch": 1.4726215102812992, + "grad_norm": 1.404005765914917, + "learning_rate": 2.545630816197835e-05, + "loss": 0.3545, + "step": 166580 + }, + { + "epoch": 1.4727099135416113, + "grad_norm": 2.4481923580169678, + "learning_rate": 2.545483477430648e-05, + "loss": 0.6935, + "step": 166590 + }, + { + "epoch": 1.4727983168019236, + "grad_norm": 4.6668381690979, + "learning_rate": 2.5453361386634605e-05, + "loss": 0.5315, + "step": 166600 + }, + { + "epoch": 1.472886720062236, + "grad_norm": 1.568570852279663, + "learning_rate": 2.5451887998962737e-05, + "loss": 0.6689, + "step": 166610 + }, + { + "epoch": 1.472975123322548, + "grad_norm": 3.709521532058716, + "learning_rate": 2.545041461129087e-05, + "loss": 0.622, + "step": 166620 + }, + { + "epoch": 1.4730635265828604, + "grad_norm": 1.1138874292373657, + "learning_rate": 2.5448941223618994e-05, + "loss": 0.6033, + "step": 166630 + }, + { + "epoch": 1.4731519298431726, + "grad_norm": 1.392332673072815, + "learning_rate": 2.5447467835947125e-05, + "loss": 0.6351, + "step": 166640 + }, + { + "epoch": 1.473240333103485, + "grad_norm": 1.611612319946289, + "learning_rate": 2.544599444827525e-05, + "loss": 0.6065, + "step": 166650 + }, + { + "epoch": 1.473328736363797, + "grad_norm": 2.563417911529541, + "learning_rate": 2.5444521060603382e-05, + "loss": 0.7093, + "step": 166660 + }, + { + "epoch": 1.4734171396241094, + "grad_norm": 7.940690517425537, + "learning_rate": 2.5443047672931514e-05, + "loss": 0.6157, + "step": 166670 + }, + { + "epoch": 1.4735055428844217, + "grad_norm": 11.77758502960205, + "learning_rate": 2.544157428525964e-05, + "loss": 0.6166, + "step": 166680 + }, + { + "epoch": 1.4735939461447338, + "grad_norm": 2.736783981323242, + "learning_rate": 2.544010089758777e-05, + "loss": 0.5801, + "step": 166690 + }, + { + "epoch": 1.473682349405046, + "grad_norm": 2.061103582382202, + "learning_rate": 2.5438627509915902e-05, + "loss": 0.6122, + "step": 166700 + }, + { + "epoch": 1.4737707526653583, + "grad_norm": 4.539426803588867, + "learning_rate": 2.5437154122244027e-05, + "loss": 0.7098, + "step": 166710 + }, + { + "epoch": 1.4738591559256706, + "grad_norm": 12.13815689086914, + "learning_rate": 2.543568073457216e-05, + "loss": 0.5149, + "step": 166720 + }, + { + "epoch": 1.4739475591859827, + "grad_norm": 2.472132444381714, + "learning_rate": 2.543420734690029e-05, + "loss": 0.5887, + "step": 166730 + }, + { + "epoch": 1.474035962446295, + "grad_norm": 5.629611492156982, + "learning_rate": 2.5432733959228416e-05, + "loss": 0.5497, + "step": 166740 + }, + { + "epoch": 1.4741243657066072, + "grad_norm": 7.8986496925354, + "learning_rate": 2.5431260571556547e-05, + "loss": 0.6194, + "step": 166750 + }, + { + "epoch": 1.4742127689669196, + "grad_norm": 1.58394455909729, + "learning_rate": 2.5429787183884672e-05, + "loss": 0.62, + "step": 166760 + }, + { + "epoch": 1.4743011722272317, + "grad_norm": 2.47725772857666, + "learning_rate": 2.5428313796212804e-05, + "loss": 0.5576, + "step": 166770 + }, + { + "epoch": 1.474389575487544, + "grad_norm": 1.5111464262008667, + "learning_rate": 2.5426840408540936e-05, + "loss": 0.6828, + "step": 166780 + }, + { + "epoch": 1.4744779787478564, + "grad_norm": 29.438861846923828, + "learning_rate": 2.542536702086906e-05, + "loss": 0.487, + "step": 166790 + }, + { + "epoch": 1.4745663820081685, + "grad_norm": 2.4611611366271973, + "learning_rate": 2.5423893633197193e-05, + "loss": 0.624, + "step": 166800 + }, + { + "epoch": 1.4746547852684806, + "grad_norm": 1.5150115489959717, + "learning_rate": 2.5422420245525324e-05, + "loss": 0.5757, + "step": 166810 + }, + { + "epoch": 1.474743188528793, + "grad_norm": 6.518135070800781, + "learning_rate": 2.542094685785345e-05, + "loss": 0.6352, + "step": 166820 + }, + { + "epoch": 1.4748315917891053, + "grad_norm": 1.4559897184371948, + "learning_rate": 2.541947347018158e-05, + "loss": 0.6181, + "step": 166830 + }, + { + "epoch": 1.4749199950494174, + "grad_norm": 8.118577003479004, + "learning_rate": 2.5418000082509713e-05, + "loss": 0.5698, + "step": 166840 + }, + { + "epoch": 1.4750083983097297, + "grad_norm": 1.0413365364074707, + "learning_rate": 2.5416526694837838e-05, + "loss": 0.5416, + "step": 166850 + }, + { + "epoch": 1.4750968015700419, + "grad_norm": 1.469028353691101, + "learning_rate": 2.541505330716597e-05, + "loss": 0.5961, + "step": 166860 + }, + { + "epoch": 1.4751852048303542, + "grad_norm": 1.7227214574813843, + "learning_rate": 2.54135799194941e-05, + "loss": 0.6318, + "step": 166870 + }, + { + "epoch": 1.4752736080906663, + "grad_norm": 1.8608362674713135, + "learning_rate": 2.5412106531822226e-05, + "loss": 0.6335, + "step": 166880 + }, + { + "epoch": 1.4753620113509787, + "grad_norm": 2.001398801803589, + "learning_rate": 2.5410633144150358e-05, + "loss": 0.6015, + "step": 166890 + }, + { + "epoch": 1.4754504146112908, + "grad_norm": 2.1302404403686523, + "learning_rate": 2.5409159756478483e-05, + "loss": 0.6828, + "step": 166900 + }, + { + "epoch": 1.4755388178716031, + "grad_norm": 8.017047882080078, + "learning_rate": 2.5407686368806615e-05, + "loss": 0.5458, + "step": 166910 + }, + { + "epoch": 1.4756272211319152, + "grad_norm": 3.74044132232666, + "learning_rate": 2.5406212981134746e-05, + "loss": 0.6147, + "step": 166920 + }, + { + "epoch": 1.4757156243922276, + "grad_norm": 1.7153632640838623, + "learning_rate": 2.540473959346287e-05, + "loss": 0.5492, + "step": 166930 + }, + { + "epoch": 1.47580402765254, + "grad_norm": 1.6338903903961182, + "learning_rate": 2.5403266205791003e-05, + "loss": 0.6237, + "step": 166940 + }, + { + "epoch": 1.475892430912852, + "grad_norm": 2.5506057739257812, + "learning_rate": 2.5401792818119135e-05, + "loss": 0.7075, + "step": 166950 + }, + { + "epoch": 1.4759808341731642, + "grad_norm": 2.5842106342315674, + "learning_rate": 2.540031943044726e-05, + "loss": 0.5288, + "step": 166960 + }, + { + "epoch": 1.4760692374334765, + "grad_norm": 1.050004243850708, + "learning_rate": 2.539884604277539e-05, + "loss": 0.649, + "step": 166970 + }, + { + "epoch": 1.4761576406937889, + "grad_norm": 16.436973571777344, + "learning_rate": 2.5397372655103523e-05, + "loss": 0.4757, + "step": 166980 + }, + { + "epoch": 1.476246043954101, + "grad_norm": 1.9779506921768188, + "learning_rate": 2.5395899267431648e-05, + "loss": 0.6412, + "step": 166990 + }, + { + "epoch": 1.4763344472144133, + "grad_norm": 1.8107967376708984, + "learning_rate": 2.539442587975978e-05, + "loss": 0.5409, + "step": 167000 + }, + { + "epoch": 1.4764228504747254, + "grad_norm": 1.4559743404388428, + "learning_rate": 2.539295249208791e-05, + "loss": 0.6913, + "step": 167010 + }, + { + "epoch": 1.4765112537350378, + "grad_norm": 4.314670562744141, + "learning_rate": 2.5391479104416037e-05, + "loss": 0.605, + "step": 167020 + }, + { + "epoch": 1.47659965699535, + "grad_norm": 1.7426317930221558, + "learning_rate": 2.539000571674417e-05, + "loss": 0.5102, + "step": 167030 + }, + { + "epoch": 1.4766880602556622, + "grad_norm": 2.9428422451019287, + "learning_rate": 2.5388532329072297e-05, + "loss": 0.6702, + "step": 167040 + }, + { + "epoch": 1.4767764635159746, + "grad_norm": 1.1322271823883057, + "learning_rate": 2.5387058941400425e-05, + "loss": 0.6103, + "step": 167050 + }, + { + "epoch": 1.4768648667762867, + "grad_norm": 3.2456536293029785, + "learning_rate": 2.5385585553728557e-05, + "loss": 0.5782, + "step": 167060 + }, + { + "epoch": 1.4769532700365988, + "grad_norm": 2.7037160396575928, + "learning_rate": 2.5384112166056685e-05, + "loss": 0.5825, + "step": 167070 + }, + { + "epoch": 1.4770416732969112, + "grad_norm": 2.728236675262451, + "learning_rate": 2.5382638778384814e-05, + "loss": 0.5521, + "step": 167080 + }, + { + "epoch": 1.4771300765572235, + "grad_norm": 3.207359790802002, + "learning_rate": 2.5381165390712945e-05, + "loss": 0.5406, + "step": 167090 + }, + { + "epoch": 1.4772184798175356, + "grad_norm": 1.6934537887573242, + "learning_rate": 2.5379692003041074e-05, + "loss": 0.5483, + "step": 167100 + }, + { + "epoch": 1.477306883077848, + "grad_norm": 3.5528311729431152, + "learning_rate": 2.5378218615369202e-05, + "loss": 0.6066, + "step": 167110 + }, + { + "epoch": 1.47739528633816, + "grad_norm": 1.6753278970718384, + "learning_rate": 2.537674522769733e-05, + "loss": 0.5992, + "step": 167120 + }, + { + "epoch": 1.4774836895984724, + "grad_norm": 1.439624547958374, + "learning_rate": 2.5375271840025462e-05, + "loss": 0.5294, + "step": 167130 + }, + { + "epoch": 1.4775720928587845, + "grad_norm": 2.9812662601470947, + "learning_rate": 2.537379845235359e-05, + "loss": 0.6862, + "step": 167140 + }, + { + "epoch": 1.477660496119097, + "grad_norm": 2.172999143600464, + "learning_rate": 2.537232506468172e-05, + "loss": 0.6114, + "step": 167150 + }, + { + "epoch": 1.4777488993794092, + "grad_norm": 2.554595470428467, + "learning_rate": 2.537085167700985e-05, + "loss": 0.6167, + "step": 167160 + }, + { + "epoch": 1.4778373026397214, + "grad_norm": 4.447844982147217, + "learning_rate": 2.536937828933798e-05, + "loss": 0.58, + "step": 167170 + }, + { + "epoch": 1.4779257059000335, + "grad_norm": 1.9268163442611694, + "learning_rate": 2.5367904901666107e-05, + "loss": 0.6194, + "step": 167180 + }, + { + "epoch": 1.4780141091603458, + "grad_norm": 4.2392258644104, + "learning_rate": 2.536643151399424e-05, + "loss": 0.6735, + "step": 167190 + }, + { + "epoch": 1.4781025124206582, + "grad_norm": 2.439371109008789, + "learning_rate": 2.5364958126322367e-05, + "loss": 0.6448, + "step": 167200 + }, + { + "epoch": 1.4781909156809703, + "grad_norm": 1.1882117986679077, + "learning_rate": 2.5363484738650496e-05, + "loss": 0.4941, + "step": 167210 + }, + { + "epoch": 1.4782793189412826, + "grad_norm": 2.04378342628479, + "learning_rate": 2.5362011350978627e-05, + "loss": 0.4746, + "step": 167220 + }, + { + "epoch": 1.4783677222015947, + "grad_norm": 2.697740316390991, + "learning_rate": 2.5360537963306752e-05, + "loss": 0.764, + "step": 167230 + }, + { + "epoch": 1.478456125461907, + "grad_norm": 1.494955062866211, + "learning_rate": 2.5359064575634884e-05, + "loss": 0.6697, + "step": 167240 + }, + { + "epoch": 1.4785445287222192, + "grad_norm": 0.9937110543251038, + "learning_rate": 2.5357591187963016e-05, + "loss": 0.5706, + "step": 167250 + }, + { + "epoch": 1.4786329319825315, + "grad_norm": 1.5161702632904053, + "learning_rate": 2.535611780029114e-05, + "loss": 0.5916, + "step": 167260 + }, + { + "epoch": 1.4787213352428439, + "grad_norm": 2.774387836456299, + "learning_rate": 2.5354644412619273e-05, + "loss": 0.6247, + "step": 167270 + }, + { + "epoch": 1.478809738503156, + "grad_norm": 1.867197036743164, + "learning_rate": 2.5353171024947404e-05, + "loss": 0.6725, + "step": 167280 + }, + { + "epoch": 1.4788981417634681, + "grad_norm": 3.0254671573638916, + "learning_rate": 2.535169763727553e-05, + "loss": 0.5347, + "step": 167290 + }, + { + "epoch": 1.4789865450237805, + "grad_norm": 6.657210350036621, + "learning_rate": 2.535022424960366e-05, + "loss": 0.6858, + "step": 167300 + }, + { + "epoch": 1.4790749482840928, + "grad_norm": 5.8452558517456055, + "learning_rate": 2.5348750861931793e-05, + "loss": 0.5615, + "step": 167310 + }, + { + "epoch": 1.479163351544405, + "grad_norm": 8.66779899597168, + "learning_rate": 2.5347277474259918e-05, + "loss": 0.7571, + "step": 167320 + }, + { + "epoch": 1.4792517548047173, + "grad_norm": 2.419351577758789, + "learning_rate": 2.534580408658805e-05, + "loss": 0.6538, + "step": 167330 + }, + { + "epoch": 1.4793401580650294, + "grad_norm": 2.094560384750366, + "learning_rate": 2.534433069891618e-05, + "loss": 0.6737, + "step": 167340 + }, + { + "epoch": 1.4794285613253417, + "grad_norm": 4.466786861419678, + "learning_rate": 2.5342857311244306e-05, + "loss": 0.4474, + "step": 167350 + }, + { + "epoch": 1.4795169645856538, + "grad_norm": 2.4063034057617188, + "learning_rate": 2.5341383923572438e-05, + "loss": 0.5744, + "step": 167360 + }, + { + "epoch": 1.4796053678459662, + "grad_norm": 17.483577728271484, + "learning_rate": 2.5339910535900563e-05, + "loss": 0.5744, + "step": 167370 + }, + { + "epoch": 1.4796937711062785, + "grad_norm": 3.8088626861572266, + "learning_rate": 2.5338437148228695e-05, + "loss": 0.5904, + "step": 167380 + }, + { + "epoch": 1.4797821743665907, + "grad_norm": 9.049639701843262, + "learning_rate": 2.5336963760556826e-05, + "loss": 0.492, + "step": 167390 + }, + { + "epoch": 1.4798705776269028, + "grad_norm": 3.6838390827178955, + "learning_rate": 2.533549037288495e-05, + "loss": 0.5937, + "step": 167400 + }, + { + "epoch": 1.4799589808872151, + "grad_norm": 4.280373573303223, + "learning_rate": 2.5334016985213083e-05, + "loss": 0.5342, + "step": 167410 + }, + { + "epoch": 1.4800473841475275, + "grad_norm": 3.391746759414673, + "learning_rate": 2.5332543597541215e-05, + "loss": 0.6239, + "step": 167420 + }, + { + "epoch": 1.4801357874078396, + "grad_norm": 3.2820661067962646, + "learning_rate": 2.533107020986934e-05, + "loss": 0.4712, + "step": 167430 + }, + { + "epoch": 1.480224190668152, + "grad_norm": 3.8523433208465576, + "learning_rate": 2.532959682219747e-05, + "loss": 0.6637, + "step": 167440 + }, + { + "epoch": 1.480312593928464, + "grad_norm": 1.9412356615066528, + "learning_rate": 2.5328123434525603e-05, + "loss": 0.5292, + "step": 167450 + }, + { + "epoch": 1.4804009971887764, + "grad_norm": 2.5025839805603027, + "learning_rate": 2.5326650046853728e-05, + "loss": 0.5537, + "step": 167460 + }, + { + "epoch": 1.4804894004490885, + "grad_norm": 8.144207000732422, + "learning_rate": 2.532517665918186e-05, + "loss": 0.6397, + "step": 167470 + }, + { + "epoch": 1.4805778037094008, + "grad_norm": 5.0364580154418945, + "learning_rate": 2.5323703271509985e-05, + "loss": 0.6693, + "step": 167480 + }, + { + "epoch": 1.480666206969713, + "grad_norm": 3.9354279041290283, + "learning_rate": 2.5322229883838117e-05, + "loss": 0.6508, + "step": 167490 + }, + { + "epoch": 1.4807546102300253, + "grad_norm": 1.4934314489364624, + "learning_rate": 2.532075649616625e-05, + "loss": 0.5671, + "step": 167500 + }, + { + "epoch": 1.4808430134903374, + "grad_norm": 1.1495667695999146, + "learning_rate": 2.5319283108494373e-05, + "loss": 0.6857, + "step": 167510 + }, + { + "epoch": 1.4809314167506498, + "grad_norm": 2.913308620452881, + "learning_rate": 2.5317809720822505e-05, + "loss": 0.6824, + "step": 167520 + }, + { + "epoch": 1.481019820010962, + "grad_norm": 2.445190906524658, + "learning_rate": 2.5316336333150637e-05, + "loss": 0.5857, + "step": 167530 + }, + { + "epoch": 1.4811082232712742, + "grad_norm": 1.6554746627807617, + "learning_rate": 2.5314862945478762e-05, + "loss": 0.7662, + "step": 167540 + }, + { + "epoch": 1.4811966265315863, + "grad_norm": 1.173087477684021, + "learning_rate": 2.5313389557806894e-05, + "loss": 0.6402, + "step": 167550 + }, + { + "epoch": 1.4812850297918987, + "grad_norm": 1.0074834823608398, + "learning_rate": 2.5311916170135025e-05, + "loss": 0.5489, + "step": 167560 + }, + { + "epoch": 1.481373433052211, + "grad_norm": 3.9076647758483887, + "learning_rate": 2.531044278246315e-05, + "loss": 0.5338, + "step": 167570 + }, + { + "epoch": 1.4814618363125232, + "grad_norm": 3.726008415222168, + "learning_rate": 2.5308969394791282e-05, + "loss": 0.6305, + "step": 167580 + }, + { + "epoch": 1.4815502395728355, + "grad_norm": 2.1018142700195312, + "learning_rate": 2.5307496007119407e-05, + "loss": 0.5941, + "step": 167590 + }, + { + "epoch": 1.4816386428331476, + "grad_norm": 1.6468955278396606, + "learning_rate": 2.530602261944754e-05, + "loss": 0.5982, + "step": 167600 + }, + { + "epoch": 1.48172704609346, + "grad_norm": 7.451429843902588, + "learning_rate": 2.530454923177567e-05, + "loss": 0.5749, + "step": 167610 + }, + { + "epoch": 1.481815449353772, + "grad_norm": 2.754551887512207, + "learning_rate": 2.5303075844103796e-05, + "loss": 0.7129, + "step": 167620 + }, + { + "epoch": 1.4819038526140844, + "grad_norm": 4.118371486663818, + "learning_rate": 2.5301602456431927e-05, + "loss": 0.6339, + "step": 167630 + }, + { + "epoch": 1.4819922558743968, + "grad_norm": 2.545444965362549, + "learning_rate": 2.530012906876006e-05, + "loss": 0.5962, + "step": 167640 + }, + { + "epoch": 1.4820806591347089, + "grad_norm": 5.016569137573242, + "learning_rate": 2.5298655681088184e-05, + "loss": 0.4963, + "step": 167650 + }, + { + "epoch": 1.482169062395021, + "grad_norm": 2.780261278152466, + "learning_rate": 2.5297182293416316e-05, + "loss": 0.5706, + "step": 167660 + }, + { + "epoch": 1.4822574656553333, + "grad_norm": 5.90531587600708, + "learning_rate": 2.5295708905744447e-05, + "loss": 0.6377, + "step": 167670 + }, + { + "epoch": 1.4823458689156457, + "grad_norm": 17.049043655395508, + "learning_rate": 2.5294235518072572e-05, + "loss": 0.6117, + "step": 167680 + }, + { + "epoch": 1.4824342721759578, + "grad_norm": 1.530900478363037, + "learning_rate": 2.5292762130400704e-05, + "loss": 0.5135, + "step": 167690 + }, + { + "epoch": 1.4825226754362701, + "grad_norm": 4.286895275115967, + "learning_rate": 2.529128874272883e-05, + "loss": 0.5757, + "step": 167700 + }, + { + "epoch": 1.4826110786965823, + "grad_norm": 4.696872234344482, + "learning_rate": 2.528981535505696e-05, + "loss": 0.5842, + "step": 167710 + }, + { + "epoch": 1.4826994819568946, + "grad_norm": 2.500370502471924, + "learning_rate": 2.5288341967385093e-05, + "loss": 0.5894, + "step": 167720 + }, + { + "epoch": 1.4827878852172067, + "grad_norm": 8.937431335449219, + "learning_rate": 2.5286868579713218e-05, + "loss": 0.5779, + "step": 167730 + }, + { + "epoch": 1.482876288477519, + "grad_norm": 1.4781537055969238, + "learning_rate": 2.528539519204135e-05, + "loss": 0.6223, + "step": 167740 + }, + { + "epoch": 1.4829646917378314, + "grad_norm": 3.4986331462860107, + "learning_rate": 2.528392180436948e-05, + "loss": 0.5833, + "step": 167750 + }, + { + "epoch": 1.4830530949981435, + "grad_norm": 2.8727478981018066, + "learning_rate": 2.5282448416697606e-05, + "loss": 0.711, + "step": 167760 + }, + { + "epoch": 1.4831414982584556, + "grad_norm": 1.9648290872573853, + "learning_rate": 2.5280975029025738e-05, + "loss": 0.558, + "step": 167770 + }, + { + "epoch": 1.483229901518768, + "grad_norm": 2.376296281814575, + "learning_rate": 2.527950164135387e-05, + "loss": 0.5542, + "step": 167780 + }, + { + "epoch": 1.4833183047790803, + "grad_norm": 14.182267189025879, + "learning_rate": 2.5278028253681994e-05, + "loss": 0.6262, + "step": 167790 + }, + { + "epoch": 1.4834067080393925, + "grad_norm": 1.7571372985839844, + "learning_rate": 2.5276554866010126e-05, + "loss": 0.3986, + "step": 167800 + }, + { + "epoch": 1.4834951112997048, + "grad_norm": 2.912757158279419, + "learning_rate": 2.5275081478338258e-05, + "loss": 0.5501, + "step": 167810 + }, + { + "epoch": 1.483583514560017, + "grad_norm": 2.827099323272705, + "learning_rate": 2.5273608090666383e-05, + "loss": 0.7058, + "step": 167820 + }, + { + "epoch": 1.4836719178203293, + "grad_norm": 4.497413158416748, + "learning_rate": 2.5272134702994515e-05, + "loss": 0.6198, + "step": 167830 + }, + { + "epoch": 1.4837603210806414, + "grad_norm": 1.271637201309204, + "learning_rate": 2.527066131532264e-05, + "loss": 0.4485, + "step": 167840 + }, + { + "epoch": 1.4838487243409537, + "grad_norm": 1.8315919637680054, + "learning_rate": 2.526918792765077e-05, + "loss": 0.5869, + "step": 167850 + }, + { + "epoch": 1.483937127601266, + "grad_norm": 1.2556116580963135, + "learning_rate": 2.5267714539978903e-05, + "loss": 0.5199, + "step": 167860 + }, + { + "epoch": 1.4840255308615782, + "grad_norm": 2.636373996734619, + "learning_rate": 2.5266241152307028e-05, + "loss": 0.7162, + "step": 167870 + }, + { + "epoch": 1.4841139341218903, + "grad_norm": 2.2732982635498047, + "learning_rate": 2.526476776463516e-05, + "loss": 0.6404, + "step": 167880 + }, + { + "epoch": 1.4842023373822026, + "grad_norm": 13.094659805297852, + "learning_rate": 2.526329437696329e-05, + "loss": 0.6638, + "step": 167890 + }, + { + "epoch": 1.484290740642515, + "grad_norm": 10.085988998413086, + "learning_rate": 2.5261820989291417e-05, + "loss": 0.723, + "step": 167900 + }, + { + "epoch": 1.484379143902827, + "grad_norm": 1.838112473487854, + "learning_rate": 2.5260347601619548e-05, + "loss": 0.6037, + "step": 167910 + }, + { + "epoch": 1.4844675471631394, + "grad_norm": 2.0667428970336914, + "learning_rate": 2.525887421394768e-05, + "loss": 0.6192, + "step": 167920 + }, + { + "epoch": 1.4845559504234516, + "grad_norm": 1.7449721097946167, + "learning_rate": 2.5257400826275805e-05, + "loss": 0.4779, + "step": 167930 + }, + { + "epoch": 1.484644353683764, + "grad_norm": 3.658342123031616, + "learning_rate": 2.5255927438603937e-05, + "loss": 0.5418, + "step": 167940 + }, + { + "epoch": 1.484732756944076, + "grad_norm": 5.174700736999512, + "learning_rate": 2.5254454050932065e-05, + "loss": 0.505, + "step": 167950 + }, + { + "epoch": 1.4848211602043884, + "grad_norm": 2.033924102783203, + "learning_rate": 2.5252980663260193e-05, + "loss": 0.6025, + "step": 167960 + }, + { + "epoch": 1.4849095634647007, + "grad_norm": 2.3899896144866943, + "learning_rate": 2.5251507275588325e-05, + "loss": 0.5726, + "step": 167970 + }, + { + "epoch": 1.4849979667250128, + "grad_norm": 1.7553874254226685, + "learning_rate": 2.5250033887916454e-05, + "loss": 0.6192, + "step": 167980 + }, + { + "epoch": 1.485086369985325, + "grad_norm": 2.7527503967285156, + "learning_rate": 2.5248560500244582e-05, + "loss": 0.6492, + "step": 167990 + }, + { + "epoch": 1.4851747732456373, + "grad_norm": 4.926758766174316, + "learning_rate": 2.5247087112572714e-05, + "loss": 0.6981, + "step": 168000 + }, + { + "epoch": 1.4852631765059496, + "grad_norm": 5.197306156158447, + "learning_rate": 2.5245613724900842e-05, + "loss": 0.5863, + "step": 168010 + }, + { + "epoch": 1.4853515797662618, + "grad_norm": 10.739989280700684, + "learning_rate": 2.524414033722897e-05, + "loss": 0.5708, + "step": 168020 + }, + { + "epoch": 1.485439983026574, + "grad_norm": 1.9307574033737183, + "learning_rate": 2.5242666949557102e-05, + "loss": 0.5454, + "step": 168030 + }, + { + "epoch": 1.4855283862868862, + "grad_norm": 3.721250534057617, + "learning_rate": 2.524119356188523e-05, + "loss": 0.5725, + "step": 168040 + }, + { + "epoch": 1.4856167895471986, + "grad_norm": 4.485631465911865, + "learning_rate": 2.523972017421336e-05, + "loss": 0.6236, + "step": 168050 + }, + { + "epoch": 1.4857051928075107, + "grad_norm": 3.744614839553833, + "learning_rate": 2.5238246786541487e-05, + "loss": 0.6876, + "step": 168060 + }, + { + "epoch": 1.485793596067823, + "grad_norm": 2.1492726802825928, + "learning_rate": 2.523677339886962e-05, + "loss": 0.6738, + "step": 168070 + }, + { + "epoch": 1.4858819993281351, + "grad_norm": 2.7124409675598145, + "learning_rate": 2.5235300011197747e-05, + "loss": 0.4988, + "step": 168080 + }, + { + "epoch": 1.4859704025884475, + "grad_norm": 10.714515686035156, + "learning_rate": 2.5233826623525876e-05, + "loss": 0.5874, + "step": 168090 + }, + { + "epoch": 1.4860588058487596, + "grad_norm": 8.479536056518555, + "learning_rate": 2.5232353235854007e-05, + "loss": 0.5909, + "step": 168100 + }, + { + "epoch": 1.486147209109072, + "grad_norm": 14.649287223815918, + "learning_rate": 2.5230879848182136e-05, + "loss": 0.7773, + "step": 168110 + }, + { + "epoch": 1.4862356123693843, + "grad_norm": 6.432976245880127, + "learning_rate": 2.5229406460510264e-05, + "loss": 0.5694, + "step": 168120 + }, + { + "epoch": 1.4863240156296964, + "grad_norm": 3.3449676036834717, + "learning_rate": 2.5227933072838396e-05, + "loss": 0.6942, + "step": 168130 + }, + { + "epoch": 1.4864124188900087, + "grad_norm": 7.80836820602417, + "learning_rate": 2.5226459685166524e-05, + "loss": 0.563, + "step": 168140 + }, + { + "epoch": 1.4865008221503209, + "grad_norm": 1.676095962524414, + "learning_rate": 2.5224986297494652e-05, + "loss": 0.6054, + "step": 168150 + }, + { + "epoch": 1.4865892254106332, + "grad_norm": 3.417198419570923, + "learning_rate": 2.5223512909822784e-05, + "loss": 0.5569, + "step": 168160 + }, + { + "epoch": 1.4866776286709453, + "grad_norm": 1.5840697288513184, + "learning_rate": 2.522203952215091e-05, + "loss": 0.4732, + "step": 168170 + }, + { + "epoch": 1.4867660319312577, + "grad_norm": 3.6179959774017334, + "learning_rate": 2.522056613447904e-05, + "loss": 0.5414, + "step": 168180 + }, + { + "epoch": 1.4868544351915698, + "grad_norm": 4.9565043449401855, + "learning_rate": 2.5219092746807173e-05, + "loss": 0.6726, + "step": 168190 + }, + { + "epoch": 1.4869428384518821, + "grad_norm": 2.658311605453491, + "learning_rate": 2.5217619359135298e-05, + "loss": 0.6698, + "step": 168200 + }, + { + "epoch": 1.4870312417121943, + "grad_norm": 2.3924074172973633, + "learning_rate": 2.521614597146343e-05, + "loss": 0.5888, + "step": 168210 + }, + { + "epoch": 1.4871196449725066, + "grad_norm": 2.092282295227051, + "learning_rate": 2.521467258379156e-05, + "loss": 0.6933, + "step": 168220 + }, + { + "epoch": 1.487208048232819, + "grad_norm": 5.658856391906738, + "learning_rate": 2.5213199196119686e-05, + "loss": 0.5825, + "step": 168230 + }, + { + "epoch": 1.487296451493131, + "grad_norm": 1.1307692527770996, + "learning_rate": 2.5211725808447818e-05, + "loss": 0.6151, + "step": 168240 + }, + { + "epoch": 1.4873848547534432, + "grad_norm": 2.23170804977417, + "learning_rate": 2.521025242077595e-05, + "loss": 0.7011, + "step": 168250 + }, + { + "epoch": 1.4874732580137555, + "grad_norm": 1.663549780845642, + "learning_rate": 2.5208779033104075e-05, + "loss": 0.5642, + "step": 168260 + }, + { + "epoch": 1.4875616612740679, + "grad_norm": 3.4748644828796387, + "learning_rate": 2.5207305645432206e-05, + "loss": 0.5497, + "step": 168270 + }, + { + "epoch": 1.48765006453438, + "grad_norm": 1.6814273595809937, + "learning_rate": 2.5205832257760338e-05, + "loss": 0.5978, + "step": 168280 + }, + { + "epoch": 1.4877384677946923, + "grad_norm": 1.6505483388900757, + "learning_rate": 2.5204358870088463e-05, + "loss": 0.559, + "step": 168290 + }, + { + "epoch": 1.4878268710550044, + "grad_norm": 1.7569222450256348, + "learning_rate": 2.5202885482416595e-05, + "loss": 0.6306, + "step": 168300 + }, + { + "epoch": 1.4879152743153168, + "grad_norm": 1.5775505304336548, + "learning_rate": 2.520141209474472e-05, + "loss": 0.6264, + "step": 168310 + }, + { + "epoch": 1.488003677575629, + "grad_norm": 1.5059866905212402, + "learning_rate": 2.519993870707285e-05, + "loss": 0.5655, + "step": 168320 + }, + { + "epoch": 1.4880920808359412, + "grad_norm": 7.040750980377197, + "learning_rate": 2.5198465319400983e-05, + "loss": 0.5223, + "step": 168330 + }, + { + "epoch": 1.4881804840962536, + "grad_norm": 1.3542442321777344, + "learning_rate": 2.5196991931729108e-05, + "loss": 0.6205, + "step": 168340 + }, + { + "epoch": 1.4882688873565657, + "grad_norm": 8.73717975616455, + "learning_rate": 2.519551854405724e-05, + "loss": 0.4427, + "step": 168350 + }, + { + "epoch": 1.4883572906168778, + "grad_norm": 1.6536052227020264, + "learning_rate": 2.519404515638537e-05, + "loss": 0.6414, + "step": 168360 + }, + { + "epoch": 1.4884456938771902, + "grad_norm": 26.23335075378418, + "learning_rate": 2.5192571768713497e-05, + "loss": 0.4855, + "step": 168370 + }, + { + "epoch": 1.4885340971375025, + "grad_norm": 5.047027111053467, + "learning_rate": 2.519109838104163e-05, + "loss": 0.5632, + "step": 168380 + }, + { + "epoch": 1.4886225003978146, + "grad_norm": 1.5244139432907104, + "learning_rate": 2.518962499336976e-05, + "loss": 0.6747, + "step": 168390 + }, + { + "epoch": 1.488710903658127, + "grad_norm": 2.4863431453704834, + "learning_rate": 2.5188151605697885e-05, + "loss": 0.6225, + "step": 168400 + }, + { + "epoch": 1.488799306918439, + "grad_norm": 2.471019744873047, + "learning_rate": 2.5186678218026017e-05, + "loss": 0.5615, + "step": 168410 + }, + { + "epoch": 1.4888877101787514, + "grad_norm": 1.9932076930999756, + "learning_rate": 2.5185204830354142e-05, + "loss": 0.5905, + "step": 168420 + }, + { + "epoch": 1.4889761134390636, + "grad_norm": 1.5231190919876099, + "learning_rate": 2.5183731442682273e-05, + "loss": 0.6244, + "step": 168430 + }, + { + "epoch": 1.489064516699376, + "grad_norm": 1.7419872283935547, + "learning_rate": 2.5182258055010405e-05, + "loss": 0.6957, + "step": 168440 + }, + { + "epoch": 1.4891529199596882, + "grad_norm": 8.614608764648438, + "learning_rate": 2.518078466733853e-05, + "loss": 0.675, + "step": 168450 + }, + { + "epoch": 1.4892413232200004, + "grad_norm": 4.871392250061035, + "learning_rate": 2.5179311279666662e-05, + "loss": 0.537, + "step": 168460 + }, + { + "epoch": 1.4893297264803125, + "grad_norm": 3.458824634552002, + "learning_rate": 2.5177837891994794e-05, + "loss": 0.612, + "step": 168470 + }, + { + "epoch": 1.4894181297406248, + "grad_norm": 1.302211880683899, + "learning_rate": 2.517636450432292e-05, + "loss": 0.558, + "step": 168480 + }, + { + "epoch": 1.4895065330009372, + "grad_norm": 10.931535720825195, + "learning_rate": 2.517489111665105e-05, + "loss": 0.6323, + "step": 168490 + }, + { + "epoch": 1.4895949362612493, + "grad_norm": 5.303699493408203, + "learning_rate": 2.5173417728979182e-05, + "loss": 0.6159, + "step": 168500 + }, + { + "epoch": 1.4896833395215616, + "grad_norm": 2.347606658935547, + "learning_rate": 2.5171944341307307e-05, + "loss": 0.559, + "step": 168510 + }, + { + "epoch": 1.4897717427818737, + "grad_norm": 2.460277795791626, + "learning_rate": 2.517047095363544e-05, + "loss": 0.648, + "step": 168520 + }, + { + "epoch": 1.489860146042186, + "grad_norm": 2.0660288333892822, + "learning_rate": 2.5168997565963564e-05, + "loss": 0.5716, + "step": 168530 + }, + { + "epoch": 1.4899485493024982, + "grad_norm": 0.9978792667388916, + "learning_rate": 2.5167524178291696e-05, + "loss": 0.6552, + "step": 168540 + }, + { + "epoch": 1.4900369525628105, + "grad_norm": 4.9887003898620605, + "learning_rate": 2.5166050790619827e-05, + "loss": 0.579, + "step": 168550 + }, + { + "epoch": 1.4901253558231229, + "grad_norm": 1.4201104640960693, + "learning_rate": 2.5164577402947952e-05, + "loss": 0.5044, + "step": 168560 + }, + { + "epoch": 1.490213759083435, + "grad_norm": 3.3996100425720215, + "learning_rate": 2.5163104015276084e-05, + "loss": 0.6232, + "step": 168570 + }, + { + "epoch": 1.4903021623437471, + "grad_norm": 15.804523468017578, + "learning_rate": 2.5161630627604216e-05, + "loss": 0.5801, + "step": 168580 + }, + { + "epoch": 1.4903905656040595, + "grad_norm": 1.1996616125106812, + "learning_rate": 2.516015723993234e-05, + "loss": 0.5963, + "step": 168590 + }, + { + "epoch": 1.4904789688643718, + "grad_norm": 20.32427215576172, + "learning_rate": 2.5158683852260472e-05, + "loss": 0.5672, + "step": 168600 + }, + { + "epoch": 1.490567372124684, + "grad_norm": 7.99282693862915, + "learning_rate": 2.5157210464588604e-05, + "loss": 0.6394, + "step": 168610 + }, + { + "epoch": 1.4906557753849963, + "grad_norm": 16.958051681518555, + "learning_rate": 2.515573707691673e-05, + "loss": 0.5896, + "step": 168620 + }, + { + "epoch": 1.4907441786453084, + "grad_norm": 8.355259895324707, + "learning_rate": 2.515426368924486e-05, + "loss": 0.5606, + "step": 168630 + }, + { + "epoch": 1.4908325819056207, + "grad_norm": 3.4368736743927, + "learning_rate": 2.5152790301572993e-05, + "loss": 0.5371, + "step": 168640 + }, + { + "epoch": 1.4909209851659329, + "grad_norm": 1.612131953239441, + "learning_rate": 2.5151316913901118e-05, + "loss": 0.6349, + "step": 168650 + }, + { + "epoch": 1.4910093884262452, + "grad_norm": 4.0380048751831055, + "learning_rate": 2.514984352622925e-05, + "loss": 0.547, + "step": 168660 + }, + { + "epoch": 1.4910977916865575, + "grad_norm": 6.6345696449279785, + "learning_rate": 2.5148370138557374e-05, + "loss": 0.5238, + "step": 168670 + }, + { + "epoch": 1.4911861949468697, + "grad_norm": 3.4301276206970215, + "learning_rate": 2.5146896750885506e-05, + "loss": 0.6353, + "step": 168680 + }, + { + "epoch": 1.4912745982071818, + "grad_norm": 6.49104642868042, + "learning_rate": 2.5145423363213638e-05, + "loss": 0.6937, + "step": 168690 + }, + { + "epoch": 1.4913630014674941, + "grad_norm": 4.643334865570068, + "learning_rate": 2.5143949975541763e-05, + "loss": 0.5247, + "step": 168700 + }, + { + "epoch": 1.4914514047278065, + "grad_norm": 3.2362964153289795, + "learning_rate": 2.5142476587869895e-05, + "loss": 0.5551, + "step": 168710 + }, + { + "epoch": 1.4915398079881186, + "grad_norm": 5.963395595550537, + "learning_rate": 2.5141003200198026e-05, + "loss": 0.6472, + "step": 168720 + }, + { + "epoch": 1.491628211248431, + "grad_norm": 4.386768341064453, + "learning_rate": 2.513952981252615e-05, + "loss": 0.6609, + "step": 168730 + }, + { + "epoch": 1.491716614508743, + "grad_norm": 8.722784042358398, + "learning_rate": 2.5138056424854283e-05, + "loss": 0.8431, + "step": 168740 + }, + { + "epoch": 1.4918050177690554, + "grad_norm": 2.4228360652923584, + "learning_rate": 2.5136583037182415e-05, + "loss": 0.6162, + "step": 168750 + }, + { + "epoch": 1.4918934210293675, + "grad_norm": 0.815682590007782, + "learning_rate": 2.513510964951054e-05, + "loss": 0.5852, + "step": 168760 + }, + { + "epoch": 1.4919818242896798, + "grad_norm": 3.2544634342193604, + "learning_rate": 2.513363626183867e-05, + "loss": 0.6008, + "step": 168770 + }, + { + "epoch": 1.492070227549992, + "grad_norm": 1.5924484729766846, + "learning_rate": 2.5132162874166796e-05, + "loss": 0.5001, + "step": 168780 + }, + { + "epoch": 1.4921586308103043, + "grad_norm": 2.2395777702331543, + "learning_rate": 2.5130689486494928e-05, + "loss": 0.7412, + "step": 168790 + }, + { + "epoch": 1.4922470340706164, + "grad_norm": 3.179886817932129, + "learning_rate": 2.512921609882306e-05, + "loss": 0.6234, + "step": 168800 + }, + { + "epoch": 1.4923354373309288, + "grad_norm": 3.3182594776153564, + "learning_rate": 2.5127742711151185e-05, + "loss": 0.6121, + "step": 168810 + }, + { + "epoch": 1.4924238405912411, + "grad_norm": 1.2474032640457153, + "learning_rate": 2.5126269323479317e-05, + "loss": 0.552, + "step": 168820 + }, + { + "epoch": 1.4925122438515532, + "grad_norm": 3.5903759002685547, + "learning_rate": 2.5124795935807448e-05, + "loss": 0.6559, + "step": 168830 + }, + { + "epoch": 1.4926006471118654, + "grad_norm": 2.073939800262451, + "learning_rate": 2.5123322548135573e-05, + "loss": 0.5857, + "step": 168840 + }, + { + "epoch": 1.4926890503721777, + "grad_norm": 2.900148391723633, + "learning_rate": 2.5121849160463705e-05, + "loss": 0.587, + "step": 168850 + }, + { + "epoch": 1.49277745363249, + "grad_norm": 2.4644055366516113, + "learning_rate": 2.5120375772791837e-05, + "loss": 0.5949, + "step": 168860 + }, + { + "epoch": 1.4928658568928022, + "grad_norm": 1.1597604751586914, + "learning_rate": 2.5118902385119962e-05, + "loss": 0.6415, + "step": 168870 + }, + { + "epoch": 1.4929542601531145, + "grad_norm": 2.5917768478393555, + "learning_rate": 2.5117428997448093e-05, + "loss": 0.6046, + "step": 168880 + }, + { + "epoch": 1.4930426634134266, + "grad_norm": 2.9160399436950684, + "learning_rate": 2.5115955609776222e-05, + "loss": 0.5587, + "step": 168890 + }, + { + "epoch": 1.493131066673739, + "grad_norm": 1.3189928531646729, + "learning_rate": 2.5114482222104354e-05, + "loss": 0.5735, + "step": 168900 + }, + { + "epoch": 1.493219469934051, + "grad_norm": 1.201468825340271, + "learning_rate": 2.5113008834432482e-05, + "loss": 0.7179, + "step": 168910 + }, + { + "epoch": 1.4933078731943634, + "grad_norm": 3.8716065883636475, + "learning_rate": 2.511153544676061e-05, + "loss": 0.5129, + "step": 168920 + }, + { + "epoch": 1.4933962764546758, + "grad_norm": 4.122172832489014, + "learning_rate": 2.5110062059088742e-05, + "loss": 0.6779, + "step": 168930 + }, + { + "epoch": 1.4934846797149879, + "grad_norm": 3.85265851020813, + "learning_rate": 2.510858867141687e-05, + "loss": 0.7558, + "step": 168940 + }, + { + "epoch": 1.4935730829753, + "grad_norm": 6.949263095855713, + "learning_rate": 2.5107115283745e-05, + "loss": 0.5533, + "step": 168950 + }, + { + "epoch": 1.4936614862356123, + "grad_norm": 2.57426118850708, + "learning_rate": 2.510564189607313e-05, + "loss": 0.824, + "step": 168960 + }, + { + "epoch": 1.4937498894959247, + "grad_norm": 4.959727764129639, + "learning_rate": 2.510416850840126e-05, + "loss": 0.8049, + "step": 168970 + }, + { + "epoch": 1.4938382927562368, + "grad_norm": 1.691426157951355, + "learning_rate": 2.5102695120729387e-05, + "loss": 0.4823, + "step": 168980 + }, + { + "epoch": 1.4939266960165491, + "grad_norm": 3.8828234672546387, + "learning_rate": 2.510122173305752e-05, + "loss": 0.6448, + "step": 168990 + }, + { + "epoch": 1.4940150992768613, + "grad_norm": 14.066272735595703, + "learning_rate": 2.5099748345385644e-05, + "loss": 0.5908, + "step": 169000 + }, + { + "epoch": 1.4941035025371736, + "grad_norm": 1.3125174045562744, + "learning_rate": 2.5098274957713776e-05, + "loss": 0.6485, + "step": 169010 + }, + { + "epoch": 1.4941919057974857, + "grad_norm": 1.0090336799621582, + "learning_rate": 2.5096801570041907e-05, + "loss": 0.5247, + "step": 169020 + }, + { + "epoch": 1.494280309057798, + "grad_norm": 5.59339714050293, + "learning_rate": 2.5095328182370032e-05, + "loss": 0.4493, + "step": 169030 + }, + { + "epoch": 1.4943687123181104, + "grad_norm": 1.753861427307129, + "learning_rate": 2.5093854794698164e-05, + "loss": 0.687, + "step": 169040 + }, + { + "epoch": 1.4944571155784225, + "grad_norm": 1.7943350076675415, + "learning_rate": 2.5092381407026296e-05, + "loss": 0.5861, + "step": 169050 + }, + { + "epoch": 1.4945455188387347, + "grad_norm": 9.86168384552002, + "learning_rate": 2.509090801935442e-05, + "loss": 0.5855, + "step": 169060 + }, + { + "epoch": 1.494633922099047, + "grad_norm": 2.938364028930664, + "learning_rate": 2.5089434631682553e-05, + "loss": 0.6469, + "step": 169070 + }, + { + "epoch": 1.4947223253593593, + "grad_norm": 0.9118348956108093, + "learning_rate": 2.5087961244010684e-05, + "loss": 0.5248, + "step": 169080 + }, + { + "epoch": 1.4948107286196715, + "grad_norm": 3.6555280685424805, + "learning_rate": 2.508648785633881e-05, + "loss": 0.5285, + "step": 169090 + }, + { + "epoch": 1.4948991318799838, + "grad_norm": 2.906902313232422, + "learning_rate": 2.508501446866694e-05, + "loss": 0.6011, + "step": 169100 + }, + { + "epoch": 1.494987535140296, + "grad_norm": 1.880859136581421, + "learning_rate": 2.5083541080995073e-05, + "loss": 0.5405, + "step": 169110 + }, + { + "epoch": 1.4950759384006083, + "grad_norm": 1.6957145929336548, + "learning_rate": 2.5082067693323198e-05, + "loss": 0.5583, + "step": 169120 + }, + { + "epoch": 1.4951643416609204, + "grad_norm": 5.842836380004883, + "learning_rate": 2.508059430565133e-05, + "loss": 0.6368, + "step": 169130 + }, + { + "epoch": 1.4952527449212327, + "grad_norm": 7.523136138916016, + "learning_rate": 2.5079120917979454e-05, + "loss": 0.4566, + "step": 169140 + }, + { + "epoch": 1.495341148181545, + "grad_norm": 1.844063639640808, + "learning_rate": 2.5077647530307586e-05, + "loss": 0.7034, + "step": 169150 + }, + { + "epoch": 1.4954295514418572, + "grad_norm": 1.8485033512115479, + "learning_rate": 2.5076174142635718e-05, + "loss": 0.6989, + "step": 169160 + }, + { + "epoch": 1.4955179547021693, + "grad_norm": 1.600848913192749, + "learning_rate": 2.5074700754963843e-05, + "loss": 0.5078, + "step": 169170 + }, + { + "epoch": 1.4956063579624816, + "grad_norm": 2.792823314666748, + "learning_rate": 2.5073227367291975e-05, + "loss": 0.6132, + "step": 169180 + }, + { + "epoch": 1.495694761222794, + "grad_norm": 1.3458143472671509, + "learning_rate": 2.5071753979620106e-05, + "loss": 0.5546, + "step": 169190 + }, + { + "epoch": 1.495783164483106, + "grad_norm": 7.214616298675537, + "learning_rate": 2.507028059194823e-05, + "loss": 0.6309, + "step": 169200 + }, + { + "epoch": 1.4958715677434185, + "grad_norm": 4.552582740783691, + "learning_rate": 2.5068807204276363e-05, + "loss": 0.6711, + "step": 169210 + }, + { + "epoch": 1.4959599710037306, + "grad_norm": 1.661666750907898, + "learning_rate": 2.5067333816604495e-05, + "loss": 0.5191, + "step": 169220 + }, + { + "epoch": 1.496048374264043, + "grad_norm": 3.7032909393310547, + "learning_rate": 2.506586042893262e-05, + "loss": 0.7553, + "step": 169230 + }, + { + "epoch": 1.496136777524355, + "grad_norm": 5.18577766418457, + "learning_rate": 2.506438704126075e-05, + "loss": 0.5995, + "step": 169240 + }, + { + "epoch": 1.4962251807846674, + "grad_norm": 2.2874503135681152, + "learning_rate": 2.5062913653588876e-05, + "loss": 0.7453, + "step": 169250 + }, + { + "epoch": 1.4963135840449797, + "grad_norm": 6.964410305023193, + "learning_rate": 2.5061440265917008e-05, + "loss": 0.7524, + "step": 169260 + }, + { + "epoch": 1.4964019873052918, + "grad_norm": 3.547203540802002, + "learning_rate": 2.505996687824514e-05, + "loss": 0.5764, + "step": 169270 + }, + { + "epoch": 1.496490390565604, + "grad_norm": 1.2275030612945557, + "learning_rate": 2.5058493490573265e-05, + "loss": 0.4425, + "step": 169280 + }, + { + "epoch": 1.4965787938259163, + "grad_norm": 2.9658262729644775, + "learning_rate": 2.5057020102901397e-05, + "loss": 0.6042, + "step": 169290 + }, + { + "epoch": 1.4966671970862286, + "grad_norm": 5.516174793243408, + "learning_rate": 2.505554671522953e-05, + "loss": 0.4284, + "step": 169300 + }, + { + "epoch": 1.4967556003465408, + "grad_norm": 3.4635496139526367, + "learning_rate": 2.5054073327557653e-05, + "loss": 0.5782, + "step": 169310 + }, + { + "epoch": 1.496844003606853, + "grad_norm": 5.4861016273498535, + "learning_rate": 2.5052599939885785e-05, + "loss": 0.4534, + "step": 169320 + }, + { + "epoch": 1.4969324068671652, + "grad_norm": 1.2953311204910278, + "learning_rate": 2.5051126552213917e-05, + "loss": 0.59, + "step": 169330 + }, + { + "epoch": 1.4970208101274776, + "grad_norm": 14.16625690460205, + "learning_rate": 2.5049653164542042e-05, + "loss": 0.5066, + "step": 169340 + }, + { + "epoch": 1.4971092133877897, + "grad_norm": 2.735600233078003, + "learning_rate": 2.5048179776870174e-05, + "loss": 0.6294, + "step": 169350 + }, + { + "epoch": 1.497197616648102, + "grad_norm": 2.928574562072754, + "learning_rate": 2.50467063891983e-05, + "loss": 0.653, + "step": 169360 + }, + { + "epoch": 1.4972860199084141, + "grad_norm": 0.7642167210578918, + "learning_rate": 2.504523300152643e-05, + "loss": 0.5845, + "step": 169370 + }, + { + "epoch": 1.4973744231687265, + "grad_norm": 1.8937228918075562, + "learning_rate": 2.5043759613854562e-05, + "loss": 0.5038, + "step": 169380 + }, + { + "epoch": 1.4974628264290386, + "grad_norm": 1.8398478031158447, + "learning_rate": 2.5042286226182687e-05, + "loss": 0.6733, + "step": 169390 + }, + { + "epoch": 1.497551229689351, + "grad_norm": 2.191654682159424, + "learning_rate": 2.504081283851082e-05, + "loss": 0.6384, + "step": 169400 + }, + { + "epoch": 1.4976396329496633, + "grad_norm": 2.1948232650756836, + "learning_rate": 2.503933945083895e-05, + "loss": 0.5465, + "step": 169410 + }, + { + "epoch": 1.4977280362099754, + "grad_norm": 1.8171820640563965, + "learning_rate": 2.5037866063167075e-05, + "loss": 0.4783, + "step": 169420 + }, + { + "epoch": 1.4978164394702875, + "grad_norm": 4.705183506011963, + "learning_rate": 2.5036392675495207e-05, + "loss": 0.478, + "step": 169430 + }, + { + "epoch": 1.4979048427305999, + "grad_norm": 1.259000539779663, + "learning_rate": 2.503491928782334e-05, + "loss": 0.5277, + "step": 169440 + }, + { + "epoch": 1.4979932459909122, + "grad_norm": 4.042740345001221, + "learning_rate": 2.5033445900151464e-05, + "loss": 0.499, + "step": 169450 + }, + { + "epoch": 1.4980816492512243, + "grad_norm": 1.422348976135254, + "learning_rate": 2.5031972512479596e-05, + "loss": 0.5433, + "step": 169460 + }, + { + "epoch": 1.4981700525115367, + "grad_norm": 1.154599905014038, + "learning_rate": 2.503049912480772e-05, + "loss": 0.4343, + "step": 169470 + }, + { + "epoch": 1.4982584557718488, + "grad_norm": 3.9661824703216553, + "learning_rate": 2.5029025737135852e-05, + "loss": 0.6969, + "step": 169480 + }, + { + "epoch": 1.4983468590321611, + "grad_norm": 3.3000917434692383, + "learning_rate": 2.5027552349463984e-05, + "loss": 0.6381, + "step": 169490 + }, + { + "epoch": 1.4984352622924733, + "grad_norm": 1.0850319862365723, + "learning_rate": 2.502607896179211e-05, + "loss": 0.5702, + "step": 169500 + }, + { + "epoch": 1.4985236655527856, + "grad_norm": 6.473759174346924, + "learning_rate": 2.502460557412024e-05, + "loss": 0.6195, + "step": 169510 + }, + { + "epoch": 1.498612068813098, + "grad_norm": 1.8104168176651, + "learning_rate": 2.5023132186448372e-05, + "loss": 0.6375, + "step": 169520 + }, + { + "epoch": 1.49870047207341, + "grad_norm": 1.222034215927124, + "learning_rate": 2.5021658798776497e-05, + "loss": 0.6913, + "step": 169530 + }, + { + "epoch": 1.4987888753337222, + "grad_norm": 2.7687134742736816, + "learning_rate": 2.502018541110463e-05, + "loss": 0.6479, + "step": 169540 + }, + { + "epoch": 1.4988772785940345, + "grad_norm": 1.4761496782302856, + "learning_rate": 2.501871202343276e-05, + "loss": 0.4683, + "step": 169550 + }, + { + "epoch": 1.4989656818543469, + "grad_norm": 3.6298413276672363, + "learning_rate": 2.5017238635760886e-05, + "loss": 0.64, + "step": 169560 + }, + { + "epoch": 1.499054085114659, + "grad_norm": 7.204841136932373, + "learning_rate": 2.5015765248089018e-05, + "loss": 0.5428, + "step": 169570 + }, + { + "epoch": 1.4991424883749713, + "grad_norm": 0.9959343075752258, + "learning_rate": 2.501429186041715e-05, + "loss": 0.4581, + "step": 169580 + }, + { + "epoch": 1.4992308916352834, + "grad_norm": 2.387869358062744, + "learning_rate": 2.5012818472745274e-05, + "loss": 0.6253, + "step": 169590 + }, + { + "epoch": 1.4993192948955958, + "grad_norm": 1.51453697681427, + "learning_rate": 2.5011345085073406e-05, + "loss": 0.6093, + "step": 169600 + }, + { + "epoch": 1.499407698155908, + "grad_norm": 4.41741943359375, + "learning_rate": 2.500987169740153e-05, + "loss": 0.6319, + "step": 169610 + }, + { + "epoch": 1.4994961014162203, + "grad_norm": 2.2445061206817627, + "learning_rate": 2.5008398309729663e-05, + "loss": 0.6416, + "step": 169620 + }, + { + "epoch": 1.4995845046765326, + "grad_norm": 1.8241304159164429, + "learning_rate": 2.5006924922057795e-05, + "loss": 0.6524, + "step": 169630 + }, + { + "epoch": 1.4996729079368447, + "grad_norm": 2.927823543548584, + "learning_rate": 2.500545153438592e-05, + "loss": 0.5658, + "step": 169640 + }, + { + "epoch": 1.4997613111971568, + "grad_norm": 2.05664324760437, + "learning_rate": 2.500397814671405e-05, + "loss": 0.6525, + "step": 169650 + }, + { + "epoch": 1.4998497144574692, + "grad_norm": 6.470807075500488, + "learning_rate": 2.5002504759042183e-05, + "loss": 0.6434, + "step": 169660 + }, + { + "epoch": 1.4999381177177815, + "grad_norm": 2.6049599647521973, + "learning_rate": 2.5001031371370308e-05, + "loss": 0.5496, + "step": 169670 + }, + { + "epoch": 1.5000265209780936, + "grad_norm": 2.938055992126465, + "learning_rate": 2.499955798369844e-05, + "loss": 0.6717, + "step": 169680 + }, + { + "epoch": 1.5001149242384058, + "grad_norm": 1.9531197547912598, + "learning_rate": 2.4998084596026568e-05, + "loss": 0.5321, + "step": 169690 + }, + { + "epoch": 1.500203327498718, + "grad_norm": 2.913414478302002, + "learning_rate": 2.4996611208354696e-05, + "loss": 0.6191, + "step": 169700 + }, + { + "epoch": 1.5002917307590304, + "grad_norm": 5.947141170501709, + "learning_rate": 2.4995137820682828e-05, + "loss": 0.615, + "step": 169710 + }, + { + "epoch": 1.5003801340193426, + "grad_norm": 2.870274782180786, + "learning_rate": 2.4993664433010957e-05, + "loss": 0.6628, + "step": 169720 + }, + { + "epoch": 1.500468537279655, + "grad_norm": 2.479779005050659, + "learning_rate": 2.4992191045339085e-05, + "loss": 0.6681, + "step": 169730 + }, + { + "epoch": 1.5005569405399672, + "grad_norm": 2.949855089187622, + "learning_rate": 2.4990717657667213e-05, + "loss": 0.6598, + "step": 169740 + }, + { + "epoch": 1.5006453438002794, + "grad_norm": 2.6002488136291504, + "learning_rate": 2.4989244269995345e-05, + "loss": 0.5747, + "step": 169750 + }, + { + "epoch": 1.5007337470605915, + "grad_norm": 3.154245376586914, + "learning_rate": 2.4987770882323473e-05, + "loss": 0.6492, + "step": 169760 + }, + { + "epoch": 1.5008221503209038, + "grad_norm": 2.0533437728881836, + "learning_rate": 2.49862974946516e-05, + "loss": 0.5728, + "step": 169770 + }, + { + "epoch": 1.5009105535812162, + "grad_norm": 3.662990093231201, + "learning_rate": 2.4984824106979733e-05, + "loss": 0.5604, + "step": 169780 + }, + { + "epoch": 1.5009989568415283, + "grad_norm": 1.4692844152450562, + "learning_rate": 2.4983350719307862e-05, + "loss": 0.5606, + "step": 169790 + }, + { + "epoch": 1.5010873601018404, + "grad_norm": 1.2336918115615845, + "learning_rate": 2.498187733163599e-05, + "loss": 0.5632, + "step": 169800 + }, + { + "epoch": 1.5011757633621527, + "grad_norm": 10.304373741149902, + "learning_rate": 2.4980403943964122e-05, + "loss": 0.6256, + "step": 169810 + }, + { + "epoch": 1.501264166622465, + "grad_norm": 1.5401891469955444, + "learning_rate": 2.497893055629225e-05, + "loss": 0.5284, + "step": 169820 + }, + { + "epoch": 1.5013525698827772, + "grad_norm": 2.591830253601074, + "learning_rate": 2.497745716862038e-05, + "loss": 0.5906, + "step": 169830 + }, + { + "epoch": 1.5014409731430896, + "grad_norm": 2.6223156452178955, + "learning_rate": 2.497598378094851e-05, + "loss": 0.4506, + "step": 169840 + }, + { + "epoch": 1.501529376403402, + "grad_norm": 1.4776949882507324, + "learning_rate": 2.497451039327664e-05, + "loss": 0.6062, + "step": 169850 + }, + { + "epoch": 1.501617779663714, + "grad_norm": 1.1779335737228394, + "learning_rate": 2.4973037005604767e-05, + "loss": 0.4831, + "step": 169860 + }, + { + "epoch": 1.5017061829240261, + "grad_norm": 2.3911023139953613, + "learning_rate": 2.49715636179329e-05, + "loss": 0.6537, + "step": 169870 + }, + { + "epoch": 1.5017945861843385, + "grad_norm": 0.955909252166748, + "learning_rate": 2.4970090230261027e-05, + "loss": 0.4837, + "step": 169880 + }, + { + "epoch": 1.5018829894446508, + "grad_norm": 2.4581687450408936, + "learning_rate": 2.4968616842589155e-05, + "loss": 0.5608, + "step": 169890 + }, + { + "epoch": 1.501971392704963, + "grad_norm": 13.110949516296387, + "learning_rate": 2.4967143454917287e-05, + "loss": 0.6386, + "step": 169900 + }, + { + "epoch": 1.502059795965275, + "grad_norm": 15.316435813903809, + "learning_rate": 2.4965670067245416e-05, + "loss": 0.4615, + "step": 169910 + }, + { + "epoch": 1.5021481992255874, + "grad_norm": 2.9901394844055176, + "learning_rate": 2.4964196679573544e-05, + "loss": 0.7241, + "step": 169920 + }, + { + "epoch": 1.5022366024858997, + "grad_norm": 3.714289426803589, + "learning_rate": 2.4962723291901676e-05, + "loss": 0.54, + "step": 169930 + }, + { + "epoch": 1.5023250057462119, + "grad_norm": 1.6649699211120605, + "learning_rate": 2.4961249904229804e-05, + "loss": 0.5663, + "step": 169940 + }, + { + "epoch": 1.5024134090065242, + "grad_norm": 12.405657768249512, + "learning_rate": 2.4959776516557932e-05, + "loss": 0.6212, + "step": 169950 + }, + { + "epoch": 1.5025018122668365, + "grad_norm": 2.0155158042907715, + "learning_rate": 2.4958303128886064e-05, + "loss": 0.5299, + "step": 169960 + }, + { + "epoch": 1.5025902155271487, + "grad_norm": 3.5121774673461914, + "learning_rate": 2.4956829741214192e-05, + "loss": 0.5956, + "step": 169970 + }, + { + "epoch": 1.5026786187874608, + "grad_norm": 4.991064548492432, + "learning_rate": 2.495535635354232e-05, + "loss": 0.646, + "step": 169980 + }, + { + "epoch": 1.5027670220477731, + "grad_norm": 2.6751155853271484, + "learning_rate": 2.495388296587045e-05, + "loss": 0.5725, + "step": 169990 + }, + { + "epoch": 1.5028554253080855, + "grad_norm": 5.514303684234619, + "learning_rate": 2.495240957819858e-05, + "loss": 0.69, + "step": 170000 + }, + { + "epoch": 1.5029438285683976, + "grad_norm": 2.0796127319335938, + "learning_rate": 2.495093619052671e-05, + "loss": 0.6217, + "step": 170010 + }, + { + "epoch": 1.5030322318287097, + "grad_norm": 3.1454861164093018, + "learning_rate": 2.4949462802854838e-05, + "loss": 0.619, + "step": 170020 + }, + { + "epoch": 1.503120635089022, + "grad_norm": 2.3838064670562744, + "learning_rate": 2.4947989415182966e-05, + "loss": 0.72, + "step": 170030 + }, + { + "epoch": 1.5032090383493344, + "grad_norm": 9.640490531921387, + "learning_rate": 2.4946516027511098e-05, + "loss": 0.5371, + "step": 170040 + }, + { + "epoch": 1.5032974416096465, + "grad_norm": 3.0396595001220703, + "learning_rate": 2.4945042639839226e-05, + "loss": 0.614, + "step": 170050 + }, + { + "epoch": 1.5033858448699589, + "grad_norm": 2.500504732131958, + "learning_rate": 2.4943569252167354e-05, + "loss": 0.6032, + "step": 170060 + }, + { + "epoch": 1.5034742481302712, + "grad_norm": 1.6776894330978394, + "learning_rate": 2.4942095864495486e-05, + "loss": 0.5255, + "step": 170070 + }, + { + "epoch": 1.5035626513905833, + "grad_norm": 3.970067024230957, + "learning_rate": 2.4940622476823615e-05, + "loss": 0.7731, + "step": 170080 + }, + { + "epoch": 1.5036510546508954, + "grad_norm": 2.090362787246704, + "learning_rate": 2.4939149089151743e-05, + "loss": 0.4771, + "step": 170090 + }, + { + "epoch": 1.5037394579112078, + "grad_norm": 2.3698582649230957, + "learning_rate": 2.493767570147987e-05, + "loss": 0.744, + "step": 170100 + }, + { + "epoch": 1.5038278611715201, + "grad_norm": 2.4198873043060303, + "learning_rate": 2.4936202313808003e-05, + "loss": 0.4677, + "step": 170110 + }, + { + "epoch": 1.5039162644318322, + "grad_norm": 3.272989511489868, + "learning_rate": 2.493472892613613e-05, + "loss": 0.6182, + "step": 170120 + }, + { + "epoch": 1.5040046676921444, + "grad_norm": 2.344414234161377, + "learning_rate": 2.493325553846426e-05, + "loss": 0.7147, + "step": 170130 + }, + { + "epoch": 1.5040930709524567, + "grad_norm": 14.925524711608887, + "learning_rate": 2.493178215079239e-05, + "loss": 0.6598, + "step": 170140 + }, + { + "epoch": 1.504181474212769, + "grad_norm": 1.4058020114898682, + "learning_rate": 2.493030876312052e-05, + "loss": 0.5053, + "step": 170150 + }, + { + "epoch": 1.5042698774730812, + "grad_norm": 1.8325374126434326, + "learning_rate": 2.4928835375448648e-05, + "loss": 0.4343, + "step": 170160 + }, + { + "epoch": 1.5043582807333933, + "grad_norm": 2.2944347858428955, + "learning_rate": 2.4927361987776776e-05, + "loss": 0.5292, + "step": 170170 + }, + { + "epoch": 1.5044466839937058, + "grad_norm": 1.1431275606155396, + "learning_rate": 2.4925888600104908e-05, + "loss": 0.6514, + "step": 170180 + }, + { + "epoch": 1.504535087254018, + "grad_norm": 2.4520976543426514, + "learning_rate": 2.4924415212433037e-05, + "loss": 0.5479, + "step": 170190 + }, + { + "epoch": 1.50462349051433, + "grad_norm": 2.191321611404419, + "learning_rate": 2.4922941824761165e-05, + "loss": 0.5846, + "step": 170200 + }, + { + "epoch": 1.5047118937746424, + "grad_norm": 5.109443664550781, + "learning_rate": 2.4921468437089293e-05, + "loss": 0.6829, + "step": 170210 + }, + { + "epoch": 1.5048002970349548, + "grad_norm": 1.2909646034240723, + "learning_rate": 2.4919995049417425e-05, + "loss": 0.55, + "step": 170220 + }, + { + "epoch": 1.504888700295267, + "grad_norm": 1.490598201751709, + "learning_rate": 2.4918521661745553e-05, + "loss": 0.6029, + "step": 170230 + }, + { + "epoch": 1.504977103555579, + "grad_norm": 8.929780006408691, + "learning_rate": 2.4917048274073682e-05, + "loss": 0.4641, + "step": 170240 + }, + { + "epoch": 1.5050655068158914, + "grad_norm": 6.032838344573975, + "learning_rate": 2.4915574886401813e-05, + "loss": 0.6392, + "step": 170250 + }, + { + "epoch": 1.5051539100762037, + "grad_norm": 1.4174580574035645, + "learning_rate": 2.4914101498729942e-05, + "loss": 0.5078, + "step": 170260 + }, + { + "epoch": 1.5052423133365158, + "grad_norm": 1.6984152793884277, + "learning_rate": 2.491262811105807e-05, + "loss": 0.6429, + "step": 170270 + }, + { + "epoch": 1.505330716596828, + "grad_norm": 1.713139533996582, + "learning_rate": 2.49111547233862e-05, + "loss": 0.6461, + "step": 170280 + }, + { + "epoch": 1.5054191198571405, + "grad_norm": 19.080305099487305, + "learning_rate": 2.490968133571433e-05, + "loss": 0.7372, + "step": 170290 + }, + { + "epoch": 1.5055075231174526, + "grad_norm": 2.627061128616333, + "learning_rate": 2.490820794804246e-05, + "loss": 0.5528, + "step": 170300 + }, + { + "epoch": 1.5055959263777647, + "grad_norm": 4.7588419914245605, + "learning_rate": 2.4906734560370587e-05, + "loss": 0.6198, + "step": 170310 + }, + { + "epoch": 1.505684329638077, + "grad_norm": 0.9784073829650879, + "learning_rate": 2.4905261172698715e-05, + "loss": 0.423, + "step": 170320 + }, + { + "epoch": 1.5057727328983894, + "grad_norm": 3.155487537384033, + "learning_rate": 2.4903787785026847e-05, + "loss": 0.6677, + "step": 170330 + }, + { + "epoch": 1.5058611361587015, + "grad_norm": 3.9082417488098145, + "learning_rate": 2.4902314397354975e-05, + "loss": 0.5043, + "step": 170340 + }, + { + "epoch": 1.5059495394190137, + "grad_norm": 1.0406138896942139, + "learning_rate": 2.4900841009683104e-05, + "loss": 0.4758, + "step": 170350 + }, + { + "epoch": 1.506037942679326, + "grad_norm": 3.066814661026001, + "learning_rate": 2.4899367622011236e-05, + "loss": 0.5702, + "step": 170360 + }, + { + "epoch": 1.5061263459396383, + "grad_norm": 6.921902656555176, + "learning_rate": 2.4897894234339364e-05, + "loss": 0.6582, + "step": 170370 + }, + { + "epoch": 1.5062147491999505, + "grad_norm": 6.363106727600098, + "learning_rate": 2.4896420846667492e-05, + "loss": 0.754, + "step": 170380 + }, + { + "epoch": 1.5063031524602626, + "grad_norm": 9.011924743652344, + "learning_rate": 2.489494745899562e-05, + "loss": 0.5578, + "step": 170390 + }, + { + "epoch": 1.506391555720575, + "grad_norm": 2.6091227531433105, + "learning_rate": 2.4893474071323752e-05, + "loss": 0.6223, + "step": 170400 + }, + { + "epoch": 1.5064799589808873, + "grad_norm": 10.137401580810547, + "learning_rate": 2.489200068365188e-05, + "loss": 0.5603, + "step": 170410 + }, + { + "epoch": 1.5065683622411994, + "grad_norm": 5.111581325531006, + "learning_rate": 2.489052729598001e-05, + "loss": 0.5112, + "step": 170420 + }, + { + "epoch": 1.5066567655015117, + "grad_norm": 1.9903699159622192, + "learning_rate": 2.488905390830814e-05, + "loss": 0.5632, + "step": 170430 + }, + { + "epoch": 1.506745168761824, + "grad_norm": 2.1301333904266357, + "learning_rate": 2.488758052063627e-05, + "loss": 0.6118, + "step": 170440 + }, + { + "epoch": 1.5068335720221362, + "grad_norm": 2.1307950019836426, + "learning_rate": 2.4886107132964397e-05, + "loss": 0.6598, + "step": 170450 + }, + { + "epoch": 1.5069219752824483, + "grad_norm": 2.33685302734375, + "learning_rate": 2.4884633745292526e-05, + "loss": 0.5392, + "step": 170460 + }, + { + "epoch": 1.5070103785427607, + "grad_norm": 0.938329815864563, + "learning_rate": 2.4883160357620658e-05, + "loss": 0.533, + "step": 170470 + }, + { + "epoch": 1.507098781803073, + "grad_norm": 5.397551536560059, + "learning_rate": 2.4881686969948786e-05, + "loss": 0.6804, + "step": 170480 + }, + { + "epoch": 1.5071871850633851, + "grad_norm": 5.188610076904297, + "learning_rate": 2.4880213582276914e-05, + "loss": 0.6573, + "step": 170490 + }, + { + "epoch": 1.5072755883236972, + "grad_norm": 1.6878573894500732, + "learning_rate": 2.4878740194605043e-05, + "loss": 0.5312, + "step": 170500 + }, + { + "epoch": 1.5073639915840096, + "grad_norm": 1.3941048383712769, + "learning_rate": 2.4877266806933174e-05, + "loss": 0.6473, + "step": 170510 + }, + { + "epoch": 1.507452394844322, + "grad_norm": 4.4079718589782715, + "learning_rate": 2.4875793419261303e-05, + "loss": 0.7942, + "step": 170520 + }, + { + "epoch": 1.507540798104634, + "grad_norm": 1.4802908897399902, + "learning_rate": 2.487432003158943e-05, + "loss": 0.576, + "step": 170530 + }, + { + "epoch": 1.5076292013649464, + "grad_norm": 1.521902322769165, + "learning_rate": 2.4872846643917563e-05, + "loss": 0.5962, + "step": 170540 + }, + { + "epoch": 1.5077176046252587, + "grad_norm": 2.56095290184021, + "learning_rate": 2.487137325624569e-05, + "loss": 0.5965, + "step": 170550 + }, + { + "epoch": 1.5078060078855708, + "grad_norm": 2.7927162647247314, + "learning_rate": 2.486989986857382e-05, + "loss": 0.558, + "step": 170560 + }, + { + "epoch": 1.507894411145883, + "grad_norm": 1.9829643964767456, + "learning_rate": 2.4868426480901948e-05, + "loss": 0.5682, + "step": 170570 + }, + { + "epoch": 1.5079828144061953, + "grad_norm": 1.9201500415802002, + "learning_rate": 2.486695309323008e-05, + "loss": 0.7038, + "step": 170580 + }, + { + "epoch": 1.5080712176665076, + "grad_norm": 8.51537799835205, + "learning_rate": 2.4865479705558208e-05, + "loss": 0.6354, + "step": 170590 + }, + { + "epoch": 1.5081596209268198, + "grad_norm": 2.365753412246704, + "learning_rate": 2.4864006317886336e-05, + "loss": 0.5691, + "step": 170600 + }, + { + "epoch": 1.5082480241871319, + "grad_norm": 4.4125823974609375, + "learning_rate": 2.4862532930214468e-05, + "loss": 0.5021, + "step": 170610 + }, + { + "epoch": 1.5083364274474442, + "grad_norm": 2.620122194290161, + "learning_rate": 2.4861059542542596e-05, + "loss": 0.6237, + "step": 170620 + }, + { + "epoch": 1.5084248307077566, + "grad_norm": 14.48987865447998, + "learning_rate": 2.4859586154870725e-05, + "loss": 0.5036, + "step": 170630 + }, + { + "epoch": 1.5085132339680687, + "grad_norm": 2.271885395050049, + "learning_rate": 2.4858112767198853e-05, + "loss": 0.6153, + "step": 170640 + }, + { + "epoch": 1.508601637228381, + "grad_norm": 13.027172088623047, + "learning_rate": 2.4856639379526985e-05, + "loss": 0.5818, + "step": 170650 + }, + { + "epoch": 1.5086900404886934, + "grad_norm": 5.975130081176758, + "learning_rate": 2.4855165991855113e-05, + "loss": 0.4836, + "step": 170660 + }, + { + "epoch": 1.5087784437490055, + "grad_norm": 1.2923355102539062, + "learning_rate": 2.485369260418324e-05, + "loss": 0.6044, + "step": 170670 + }, + { + "epoch": 1.5088668470093176, + "grad_norm": 4.037400245666504, + "learning_rate": 2.485221921651137e-05, + "loss": 0.6181, + "step": 170680 + }, + { + "epoch": 1.50895525026963, + "grad_norm": 4.225475311279297, + "learning_rate": 2.4850745828839502e-05, + "loss": 0.5584, + "step": 170690 + }, + { + "epoch": 1.5090436535299423, + "grad_norm": 1.4339970350265503, + "learning_rate": 2.484927244116763e-05, + "loss": 0.4635, + "step": 170700 + }, + { + "epoch": 1.5091320567902544, + "grad_norm": 2.196194887161255, + "learning_rate": 2.484779905349576e-05, + "loss": 0.5799, + "step": 170710 + }, + { + "epoch": 1.5092204600505665, + "grad_norm": 10.085076332092285, + "learning_rate": 2.484632566582389e-05, + "loss": 0.6642, + "step": 170720 + }, + { + "epoch": 1.5093088633108789, + "grad_norm": 3.725231409072876, + "learning_rate": 2.484485227815202e-05, + "loss": 0.5709, + "step": 170730 + }, + { + "epoch": 1.5093972665711912, + "grad_norm": 3.753584146499634, + "learning_rate": 2.4843378890480147e-05, + "loss": 0.6377, + "step": 170740 + }, + { + "epoch": 1.5094856698315033, + "grad_norm": 3.035900831222534, + "learning_rate": 2.484190550280828e-05, + "loss": 0.5406, + "step": 170750 + }, + { + "epoch": 1.5095740730918155, + "grad_norm": 2.5373897552490234, + "learning_rate": 2.4840432115136407e-05, + "loss": 0.526, + "step": 170760 + }, + { + "epoch": 1.509662476352128, + "grad_norm": 1.967418909072876, + "learning_rate": 2.4838958727464535e-05, + "loss": 0.5389, + "step": 170770 + }, + { + "epoch": 1.5097508796124401, + "grad_norm": 0.7998499870300293, + "learning_rate": 2.4837485339792667e-05, + "loss": 0.6189, + "step": 170780 + }, + { + "epoch": 1.5098392828727523, + "grad_norm": 1.803086519241333, + "learning_rate": 2.4836011952120795e-05, + "loss": 0.4854, + "step": 170790 + }, + { + "epoch": 1.5099276861330646, + "grad_norm": 2.995997190475464, + "learning_rate": 2.4834538564448924e-05, + "loss": 0.8179, + "step": 170800 + }, + { + "epoch": 1.510016089393377, + "grad_norm": 6.703754901885986, + "learning_rate": 2.4833065176777056e-05, + "loss": 0.5792, + "step": 170810 + }, + { + "epoch": 1.510104492653689, + "grad_norm": 2.998743772506714, + "learning_rate": 2.4831591789105184e-05, + "loss": 0.6407, + "step": 170820 + }, + { + "epoch": 1.5101928959140012, + "grad_norm": 1.580256462097168, + "learning_rate": 2.4830118401433312e-05, + "loss": 0.5288, + "step": 170830 + }, + { + "epoch": 1.5102812991743135, + "grad_norm": 2.933837652206421, + "learning_rate": 2.4828645013761444e-05, + "loss": 0.5742, + "step": 170840 + }, + { + "epoch": 1.5103697024346259, + "grad_norm": 1.6946322917938232, + "learning_rate": 2.4827171626089572e-05, + "loss": 0.5555, + "step": 170850 + }, + { + "epoch": 1.510458105694938, + "grad_norm": 1.2750314474105835, + "learning_rate": 2.48256982384177e-05, + "loss": 0.4955, + "step": 170860 + }, + { + "epoch": 1.5105465089552501, + "grad_norm": 10.534096717834473, + "learning_rate": 2.4824224850745832e-05, + "loss": 0.4817, + "step": 170870 + }, + { + "epoch": 1.5106349122155627, + "grad_norm": 1.5416467189788818, + "learning_rate": 2.482275146307396e-05, + "loss": 0.5073, + "step": 170880 + }, + { + "epoch": 1.5107233154758748, + "grad_norm": 1.9794907569885254, + "learning_rate": 2.482127807540209e-05, + "loss": 0.5094, + "step": 170890 + }, + { + "epoch": 1.510811718736187, + "grad_norm": 3.277860403060913, + "learning_rate": 2.481980468773022e-05, + "loss": 0.5946, + "step": 170900 + }, + { + "epoch": 1.5109001219964993, + "grad_norm": 8.792659759521484, + "learning_rate": 2.481833130005835e-05, + "loss": 0.6764, + "step": 170910 + }, + { + "epoch": 1.5109885252568116, + "grad_norm": 1.4864585399627686, + "learning_rate": 2.4816857912386478e-05, + "loss": 0.528, + "step": 170920 + }, + { + "epoch": 1.5110769285171237, + "grad_norm": 2.291426420211792, + "learning_rate": 2.4815384524714606e-05, + "loss": 0.6346, + "step": 170930 + }, + { + "epoch": 1.5111653317774358, + "grad_norm": 5.386637210845947, + "learning_rate": 2.4813911137042738e-05, + "loss": 0.5311, + "step": 170940 + }, + { + "epoch": 1.5112537350377482, + "grad_norm": 8.516998291015625, + "learning_rate": 2.4812437749370866e-05, + "loss": 0.5453, + "step": 170950 + }, + { + "epoch": 1.5113421382980605, + "grad_norm": 7.321317672729492, + "learning_rate": 2.4810964361698994e-05, + "loss": 0.5791, + "step": 170960 + }, + { + "epoch": 1.5114305415583726, + "grad_norm": 2.563375949859619, + "learning_rate": 2.4809490974027123e-05, + "loss": 0.6834, + "step": 170970 + }, + { + "epoch": 1.5115189448186848, + "grad_norm": 22.751176834106445, + "learning_rate": 2.4808017586355254e-05, + "loss": 0.5943, + "step": 170980 + }, + { + "epoch": 1.511607348078997, + "grad_norm": 3.201568603515625, + "learning_rate": 2.4806544198683383e-05, + "loss": 0.5685, + "step": 170990 + }, + { + "epoch": 1.5116957513393094, + "grad_norm": 1.581493854522705, + "learning_rate": 2.480507081101151e-05, + "loss": 0.6274, + "step": 171000 + }, + { + "epoch": 1.5117841545996216, + "grad_norm": 1.1330076456069946, + "learning_rate": 2.4803597423339643e-05, + "loss": 0.4908, + "step": 171010 + }, + { + "epoch": 1.511872557859934, + "grad_norm": 1.00552499294281, + "learning_rate": 2.480212403566777e-05, + "loss": 0.426, + "step": 171020 + }, + { + "epoch": 1.5119609611202462, + "grad_norm": 6.762635231018066, + "learning_rate": 2.48006506479959e-05, + "loss": 0.498, + "step": 171030 + }, + { + "epoch": 1.5120493643805584, + "grad_norm": 8.788949966430664, + "learning_rate": 2.4799177260324028e-05, + "loss": 0.521, + "step": 171040 + }, + { + "epoch": 1.5121377676408705, + "grad_norm": 1.3053117990493774, + "learning_rate": 2.479770387265216e-05, + "loss": 0.6146, + "step": 171050 + }, + { + "epoch": 1.5122261709011828, + "grad_norm": 13.430984497070312, + "learning_rate": 2.4796230484980288e-05, + "loss": 0.6675, + "step": 171060 + }, + { + "epoch": 1.5123145741614952, + "grad_norm": 2.0273873805999756, + "learning_rate": 2.4794757097308416e-05, + "loss": 0.6122, + "step": 171070 + }, + { + "epoch": 1.5124029774218073, + "grad_norm": 2.3378665447235107, + "learning_rate": 2.4793283709636548e-05, + "loss": 0.5783, + "step": 171080 + }, + { + "epoch": 1.5124913806821194, + "grad_norm": 8.47735595703125, + "learning_rate": 2.4791810321964677e-05, + "loss": 0.5486, + "step": 171090 + }, + { + "epoch": 1.5125797839424318, + "grad_norm": 2.7977492809295654, + "learning_rate": 2.4790336934292805e-05, + "loss": 0.6106, + "step": 171100 + }, + { + "epoch": 1.512668187202744, + "grad_norm": 13.27365779876709, + "learning_rate": 2.4788863546620933e-05, + "loss": 0.695, + "step": 171110 + }, + { + "epoch": 1.5127565904630562, + "grad_norm": 15.970056533813477, + "learning_rate": 2.4787390158949065e-05, + "loss": 0.6467, + "step": 171120 + }, + { + "epoch": 1.5128449937233686, + "grad_norm": 1.6792930364608765, + "learning_rate": 2.4785916771277193e-05, + "loss": 0.5215, + "step": 171130 + }, + { + "epoch": 1.512933396983681, + "grad_norm": 9.103386878967285, + "learning_rate": 2.478444338360532e-05, + "loss": 0.6298, + "step": 171140 + }, + { + "epoch": 1.513021800243993, + "grad_norm": 6.686374187469482, + "learning_rate": 2.478296999593345e-05, + "loss": 0.4883, + "step": 171150 + }, + { + "epoch": 1.5131102035043051, + "grad_norm": 1.2885302305221558, + "learning_rate": 2.4781496608261582e-05, + "loss": 0.6489, + "step": 171160 + }, + { + "epoch": 1.5131986067646175, + "grad_norm": 2.2273309230804443, + "learning_rate": 2.478002322058971e-05, + "loss": 0.608, + "step": 171170 + }, + { + "epoch": 1.5132870100249298, + "grad_norm": 1.2896255254745483, + "learning_rate": 2.477854983291784e-05, + "loss": 0.5715, + "step": 171180 + }, + { + "epoch": 1.513375413285242, + "grad_norm": 1.0402296781539917, + "learning_rate": 2.477707644524597e-05, + "loss": 0.4456, + "step": 171190 + }, + { + "epoch": 1.513463816545554, + "grad_norm": 2.567488193511963, + "learning_rate": 2.47756030575741e-05, + "loss": 0.5819, + "step": 171200 + }, + { + "epoch": 1.5135522198058664, + "grad_norm": 1.212886095046997, + "learning_rate": 2.4774129669902227e-05, + "loss": 0.4918, + "step": 171210 + }, + { + "epoch": 1.5136406230661787, + "grad_norm": 3.427006244659424, + "learning_rate": 2.4772656282230355e-05, + "loss": 0.5295, + "step": 171220 + }, + { + "epoch": 1.5137290263264909, + "grad_norm": 1.9608615636825562, + "learning_rate": 2.4771182894558487e-05, + "loss": 0.56, + "step": 171230 + }, + { + "epoch": 1.5138174295868032, + "grad_norm": 2.8133926391601562, + "learning_rate": 2.4769709506886615e-05, + "loss": 0.573, + "step": 171240 + }, + { + "epoch": 1.5139058328471156, + "grad_norm": 1.1024353504180908, + "learning_rate": 2.4768236119214744e-05, + "loss": 0.645, + "step": 171250 + }, + { + "epoch": 1.5139942361074277, + "grad_norm": 2.839495897293091, + "learning_rate": 2.4766762731542875e-05, + "loss": 0.5933, + "step": 171260 + }, + { + "epoch": 1.5140826393677398, + "grad_norm": 7.581789970397949, + "learning_rate": 2.4765289343871004e-05, + "loss": 0.6872, + "step": 171270 + }, + { + "epoch": 1.5141710426280521, + "grad_norm": 2.31018328666687, + "learning_rate": 2.4763815956199132e-05, + "loss": 0.5183, + "step": 171280 + }, + { + "epoch": 1.5142594458883645, + "grad_norm": 1.6008466482162476, + "learning_rate": 2.476234256852726e-05, + "loss": 0.5156, + "step": 171290 + }, + { + "epoch": 1.5143478491486766, + "grad_norm": 4.402003288269043, + "learning_rate": 2.4760869180855392e-05, + "loss": 0.6073, + "step": 171300 + }, + { + "epoch": 1.5144362524089887, + "grad_norm": 1.1848864555358887, + "learning_rate": 2.475939579318352e-05, + "loss": 0.559, + "step": 171310 + }, + { + "epoch": 1.514524655669301, + "grad_norm": 3.13996958732605, + "learning_rate": 2.475792240551165e-05, + "loss": 0.7112, + "step": 171320 + }, + { + "epoch": 1.5146130589296134, + "grad_norm": 4.38145637512207, + "learning_rate": 2.4756449017839777e-05, + "loss": 0.6513, + "step": 171330 + }, + { + "epoch": 1.5147014621899255, + "grad_norm": 1.2336317300796509, + "learning_rate": 2.475497563016791e-05, + "loss": 0.6886, + "step": 171340 + }, + { + "epoch": 1.5147898654502379, + "grad_norm": 0.7369675636291504, + "learning_rate": 2.4753502242496037e-05, + "loss": 0.5116, + "step": 171350 + }, + { + "epoch": 1.5148782687105502, + "grad_norm": 0.9784601330757141, + "learning_rate": 2.4752028854824166e-05, + "loss": 0.5885, + "step": 171360 + }, + { + "epoch": 1.5149666719708623, + "grad_norm": 2.509460210800171, + "learning_rate": 2.4750555467152298e-05, + "loss": 0.6568, + "step": 171370 + }, + { + "epoch": 1.5150550752311744, + "grad_norm": 2.559572696685791, + "learning_rate": 2.4749082079480426e-05, + "loss": 0.5385, + "step": 171380 + }, + { + "epoch": 1.5151434784914868, + "grad_norm": 4.249107837677002, + "learning_rate": 2.4747608691808554e-05, + "loss": 0.6195, + "step": 171390 + }, + { + "epoch": 1.5152318817517991, + "grad_norm": 0.9475095272064209, + "learning_rate": 2.4746135304136683e-05, + "loss": 0.5925, + "step": 171400 + }, + { + "epoch": 1.5153202850121112, + "grad_norm": 1.9829034805297852, + "learning_rate": 2.4744661916464814e-05, + "loss": 0.6953, + "step": 171410 + }, + { + "epoch": 1.5154086882724234, + "grad_norm": 3.5664305686950684, + "learning_rate": 2.4743188528792943e-05, + "loss": 0.5652, + "step": 171420 + }, + { + "epoch": 1.5154970915327357, + "grad_norm": 6.073277950286865, + "learning_rate": 2.474171514112107e-05, + "loss": 0.6489, + "step": 171430 + }, + { + "epoch": 1.515585494793048, + "grad_norm": 1.3370921611785889, + "learning_rate": 2.47402417534492e-05, + "loss": 0.5475, + "step": 171440 + }, + { + "epoch": 1.5156738980533602, + "grad_norm": 2.5059022903442383, + "learning_rate": 2.473876836577733e-05, + "loss": 0.547, + "step": 171450 + }, + { + "epoch": 1.5157623013136723, + "grad_norm": 1.5488955974578857, + "learning_rate": 2.473729497810546e-05, + "loss": 0.6194, + "step": 171460 + }, + { + "epoch": 1.5158507045739849, + "grad_norm": 3.342101812362671, + "learning_rate": 2.4735821590433588e-05, + "loss": 0.6635, + "step": 171470 + }, + { + "epoch": 1.515939107834297, + "grad_norm": 3.384246349334717, + "learning_rate": 2.473434820276172e-05, + "loss": 0.5832, + "step": 171480 + }, + { + "epoch": 1.516027511094609, + "grad_norm": 2.568795919418335, + "learning_rate": 2.4732874815089848e-05, + "loss": 0.4513, + "step": 171490 + }, + { + "epoch": 1.5161159143549214, + "grad_norm": 3.3297486305236816, + "learning_rate": 2.4731401427417976e-05, + "loss": 0.7692, + "step": 171500 + }, + { + "epoch": 1.5162043176152338, + "grad_norm": 3.179825782775879, + "learning_rate": 2.4729928039746105e-05, + "loss": 0.6671, + "step": 171510 + }, + { + "epoch": 1.516292720875546, + "grad_norm": 1.2892937660217285, + "learning_rate": 2.4728454652074236e-05, + "loss": 0.5824, + "step": 171520 + }, + { + "epoch": 1.516381124135858, + "grad_norm": 1.6109046936035156, + "learning_rate": 2.4726981264402365e-05, + "loss": 0.6611, + "step": 171530 + }, + { + "epoch": 1.5164695273961704, + "grad_norm": 2.1983642578125, + "learning_rate": 2.4725507876730493e-05, + "loss": 0.6104, + "step": 171540 + }, + { + "epoch": 1.5165579306564827, + "grad_norm": 9.347017288208008, + "learning_rate": 2.4724034489058625e-05, + "loss": 0.4932, + "step": 171550 + }, + { + "epoch": 1.5166463339167948, + "grad_norm": 3.3557119369506836, + "learning_rate": 2.4722561101386753e-05, + "loss": 0.7232, + "step": 171560 + }, + { + "epoch": 1.516734737177107, + "grad_norm": 2.6021599769592285, + "learning_rate": 2.472108771371488e-05, + "loss": 0.5958, + "step": 171570 + }, + { + "epoch": 1.5168231404374193, + "grad_norm": 3.322150707244873, + "learning_rate": 2.471961432604301e-05, + "loss": 0.6378, + "step": 171580 + }, + { + "epoch": 1.5169115436977316, + "grad_norm": 0.9610885977745056, + "learning_rate": 2.471814093837114e-05, + "loss": 0.5144, + "step": 171590 + }, + { + "epoch": 1.5169999469580437, + "grad_norm": 4.658006191253662, + "learning_rate": 2.471666755069927e-05, + "loss": 0.5929, + "step": 171600 + }, + { + "epoch": 1.517088350218356, + "grad_norm": 4.58236837387085, + "learning_rate": 2.47151941630274e-05, + "loss": 0.6988, + "step": 171610 + }, + { + "epoch": 1.5171767534786684, + "grad_norm": 6.1837687492370605, + "learning_rate": 2.4713720775355527e-05, + "loss": 0.614, + "step": 171620 + }, + { + "epoch": 1.5172651567389805, + "grad_norm": 2.1432061195373535, + "learning_rate": 2.471224738768366e-05, + "loss": 0.5013, + "step": 171630 + }, + { + "epoch": 1.5173535599992927, + "grad_norm": 2.070702075958252, + "learning_rate": 2.4710774000011787e-05, + "loss": 0.6305, + "step": 171640 + }, + { + "epoch": 1.517441963259605, + "grad_norm": 1.5412945747375488, + "learning_rate": 2.4709300612339915e-05, + "loss": 0.6614, + "step": 171650 + }, + { + "epoch": 1.5175303665199174, + "grad_norm": 2.47554349899292, + "learning_rate": 2.4707827224668047e-05, + "loss": 0.5223, + "step": 171660 + }, + { + "epoch": 1.5176187697802295, + "grad_norm": 2.456794261932373, + "learning_rate": 2.4706353836996175e-05, + "loss": 0.6242, + "step": 171670 + }, + { + "epoch": 1.5177071730405416, + "grad_norm": 1.1966077089309692, + "learning_rate": 2.4704880449324304e-05, + "loss": 0.5622, + "step": 171680 + }, + { + "epoch": 1.517795576300854, + "grad_norm": 3.458970069885254, + "learning_rate": 2.4703407061652435e-05, + "loss": 0.564, + "step": 171690 + }, + { + "epoch": 1.5178839795611663, + "grad_norm": 2.3334169387817383, + "learning_rate": 2.4701933673980564e-05, + "loss": 0.5938, + "step": 171700 + }, + { + "epoch": 1.5179723828214784, + "grad_norm": 1.6035447120666504, + "learning_rate": 2.4700460286308692e-05, + "loss": 0.5376, + "step": 171710 + }, + { + "epoch": 1.5180607860817907, + "grad_norm": 1.7743476629257202, + "learning_rate": 2.4698986898636824e-05, + "loss": 0.5416, + "step": 171720 + }, + { + "epoch": 1.518149189342103, + "grad_norm": 3.096792697906494, + "learning_rate": 2.4697513510964952e-05, + "loss": 0.6403, + "step": 171730 + }, + { + "epoch": 1.5182375926024152, + "grad_norm": 1.065329909324646, + "learning_rate": 2.469604012329308e-05, + "loss": 0.572, + "step": 171740 + }, + { + "epoch": 1.5183259958627273, + "grad_norm": 2.8391494750976562, + "learning_rate": 2.4694566735621212e-05, + "loss": 0.7039, + "step": 171750 + }, + { + "epoch": 1.5184143991230397, + "grad_norm": 2.3468706607818604, + "learning_rate": 2.469309334794934e-05, + "loss": 0.6517, + "step": 171760 + }, + { + "epoch": 1.518502802383352, + "grad_norm": 3.8764102458953857, + "learning_rate": 2.469161996027747e-05, + "loss": 0.5581, + "step": 171770 + }, + { + "epoch": 1.5185912056436641, + "grad_norm": 1.846274495124817, + "learning_rate": 2.46901465726056e-05, + "loss": 0.7371, + "step": 171780 + }, + { + "epoch": 1.5186796089039762, + "grad_norm": 8.039772987365723, + "learning_rate": 2.468867318493373e-05, + "loss": 0.6233, + "step": 171790 + }, + { + "epoch": 1.5187680121642886, + "grad_norm": 2.1583986282348633, + "learning_rate": 2.4687199797261857e-05, + "loss": 0.582, + "step": 171800 + }, + { + "epoch": 1.518856415424601, + "grad_norm": 3.0295395851135254, + "learning_rate": 2.468572640958999e-05, + "loss": 0.5288, + "step": 171810 + }, + { + "epoch": 1.518944818684913, + "grad_norm": 7.18132209777832, + "learning_rate": 2.4684253021918118e-05, + "loss": 0.6494, + "step": 171820 + }, + { + "epoch": 1.5190332219452254, + "grad_norm": 6.014684200286865, + "learning_rate": 2.4682779634246246e-05, + "loss": 0.5979, + "step": 171830 + }, + { + "epoch": 1.5191216252055377, + "grad_norm": 1.6961146593093872, + "learning_rate": 2.4681306246574378e-05, + "loss": 0.5575, + "step": 171840 + }, + { + "epoch": 1.5192100284658498, + "grad_norm": 1.6369926929473877, + "learning_rate": 2.4679832858902506e-05, + "loss": 0.563, + "step": 171850 + }, + { + "epoch": 1.519298431726162, + "grad_norm": 1.037829041481018, + "learning_rate": 2.4678359471230634e-05, + "loss": 0.5281, + "step": 171860 + }, + { + "epoch": 1.5193868349864743, + "grad_norm": 1.8316482305526733, + "learning_rate": 2.4676886083558763e-05, + "loss": 0.5275, + "step": 171870 + }, + { + "epoch": 1.5194752382467867, + "grad_norm": 3.0014376640319824, + "learning_rate": 2.4675412695886894e-05, + "loss": 0.6327, + "step": 171880 + }, + { + "epoch": 1.5195636415070988, + "grad_norm": 1.9669630527496338, + "learning_rate": 2.4673939308215023e-05, + "loss": 0.4468, + "step": 171890 + }, + { + "epoch": 1.519652044767411, + "grad_norm": 4.4915595054626465, + "learning_rate": 2.467246592054315e-05, + "loss": 0.5648, + "step": 171900 + }, + { + "epoch": 1.5197404480277232, + "grad_norm": 1.330744743347168, + "learning_rate": 2.467099253287128e-05, + "loss": 0.5694, + "step": 171910 + }, + { + "epoch": 1.5198288512880356, + "grad_norm": 3.722687005996704, + "learning_rate": 2.466951914519941e-05, + "loss": 0.5921, + "step": 171920 + }, + { + "epoch": 1.5199172545483477, + "grad_norm": 1.5006351470947266, + "learning_rate": 2.466804575752754e-05, + "loss": 0.6337, + "step": 171930 + }, + { + "epoch": 1.52000565780866, + "grad_norm": 4.495100498199463, + "learning_rate": 2.4666572369855668e-05, + "loss": 0.7132, + "step": 171940 + }, + { + "epoch": 1.5200940610689724, + "grad_norm": 6.383693218231201, + "learning_rate": 2.46650989821838e-05, + "loss": 0.5993, + "step": 171950 + }, + { + "epoch": 1.5201824643292845, + "grad_norm": 1.008862018585205, + "learning_rate": 2.4663625594511928e-05, + "loss": 0.6129, + "step": 171960 + }, + { + "epoch": 1.5202708675895966, + "grad_norm": 2.4121975898742676, + "learning_rate": 2.4662152206840056e-05, + "loss": 0.7452, + "step": 171970 + }, + { + "epoch": 1.520359270849909, + "grad_norm": 1.813094973564148, + "learning_rate": 2.4660678819168185e-05, + "loss": 0.5145, + "step": 171980 + }, + { + "epoch": 1.5204476741102213, + "grad_norm": 1.5578382015228271, + "learning_rate": 2.4659205431496316e-05, + "loss": 0.6159, + "step": 171990 + }, + { + "epoch": 1.5205360773705334, + "grad_norm": 1.8867958784103394, + "learning_rate": 2.4657732043824445e-05, + "loss": 0.485, + "step": 172000 + }, + { + "epoch": 1.5206244806308455, + "grad_norm": 1.0533500909805298, + "learning_rate": 2.4656258656152573e-05, + "loss": 0.6231, + "step": 172010 + }, + { + "epoch": 1.5207128838911579, + "grad_norm": 1.967441439628601, + "learning_rate": 2.4654785268480705e-05, + "loss": 0.6675, + "step": 172020 + }, + { + "epoch": 1.5208012871514702, + "grad_norm": 0.9737207889556885, + "learning_rate": 2.4653311880808833e-05, + "loss": 0.4126, + "step": 172030 + }, + { + "epoch": 1.5208896904117823, + "grad_norm": 2.703584909439087, + "learning_rate": 2.465183849313696e-05, + "loss": 0.4827, + "step": 172040 + }, + { + "epoch": 1.5209780936720945, + "grad_norm": 4.685316562652588, + "learning_rate": 2.465036510546509e-05, + "loss": 0.7228, + "step": 172050 + }, + { + "epoch": 1.521066496932407, + "grad_norm": 2.4175374507904053, + "learning_rate": 2.4648891717793222e-05, + "loss": 0.5838, + "step": 172060 + }, + { + "epoch": 1.5211549001927192, + "grad_norm": 9.572955131530762, + "learning_rate": 2.464741833012135e-05, + "loss": 0.5546, + "step": 172070 + }, + { + "epoch": 1.5212433034530313, + "grad_norm": 10.905763626098633, + "learning_rate": 2.464594494244948e-05, + "loss": 0.6809, + "step": 172080 + }, + { + "epoch": 1.5213317067133436, + "grad_norm": 1.4791529178619385, + "learning_rate": 2.4644471554777607e-05, + "loss": 0.5037, + "step": 172090 + }, + { + "epoch": 1.521420109973656, + "grad_norm": 15.26163101196289, + "learning_rate": 2.464299816710574e-05, + "loss": 0.5762, + "step": 172100 + }, + { + "epoch": 1.521508513233968, + "grad_norm": 1.4081157445907593, + "learning_rate": 2.4641524779433867e-05, + "loss": 0.5283, + "step": 172110 + }, + { + "epoch": 1.5215969164942802, + "grad_norm": 1.4823800325393677, + "learning_rate": 2.4640051391761995e-05, + "loss": 0.603, + "step": 172120 + }, + { + "epoch": 1.5216853197545925, + "grad_norm": 1.1264511346817017, + "learning_rate": 2.4638578004090127e-05, + "loss": 0.6241, + "step": 172130 + }, + { + "epoch": 1.5217737230149049, + "grad_norm": 3.080552816390991, + "learning_rate": 2.4637104616418255e-05, + "loss": 0.6103, + "step": 172140 + }, + { + "epoch": 1.521862126275217, + "grad_norm": 8.09986686706543, + "learning_rate": 2.4635631228746384e-05, + "loss": 0.6419, + "step": 172150 + }, + { + "epoch": 1.5219505295355291, + "grad_norm": 2.6569294929504395, + "learning_rate": 2.4634157841074512e-05, + "loss": 0.6264, + "step": 172160 + }, + { + "epoch": 1.5220389327958415, + "grad_norm": 3.802151679992676, + "learning_rate": 2.4632684453402644e-05, + "loss": 0.5923, + "step": 172170 + }, + { + "epoch": 1.5221273360561538, + "grad_norm": 5.553664684295654, + "learning_rate": 2.4631211065730772e-05, + "loss": 0.7305, + "step": 172180 + }, + { + "epoch": 1.522215739316466, + "grad_norm": 2.139594554901123, + "learning_rate": 2.46297376780589e-05, + "loss": 0.5906, + "step": 172190 + }, + { + "epoch": 1.5223041425767783, + "grad_norm": 2.169487476348877, + "learning_rate": 2.4628264290387032e-05, + "loss": 0.4848, + "step": 172200 + }, + { + "epoch": 1.5223925458370906, + "grad_norm": 1.6875172853469849, + "learning_rate": 2.462679090271516e-05, + "loss": 0.6831, + "step": 172210 + }, + { + "epoch": 1.5224809490974027, + "grad_norm": 2.2224791049957275, + "learning_rate": 2.462531751504329e-05, + "loss": 0.5264, + "step": 172220 + }, + { + "epoch": 1.5225693523577148, + "grad_norm": 1.4047622680664062, + "learning_rate": 2.4623844127371417e-05, + "loss": 0.6143, + "step": 172230 + }, + { + "epoch": 1.5226577556180272, + "grad_norm": 1.6402615308761597, + "learning_rate": 2.462237073969955e-05, + "loss": 0.5274, + "step": 172240 + }, + { + "epoch": 1.5227461588783395, + "grad_norm": 2.0935678482055664, + "learning_rate": 2.4620897352027677e-05, + "loss": 0.6121, + "step": 172250 + }, + { + "epoch": 1.5228345621386516, + "grad_norm": 1.708573579788208, + "learning_rate": 2.4619423964355806e-05, + "loss": 0.525, + "step": 172260 + }, + { + "epoch": 1.5229229653989638, + "grad_norm": 2.7750730514526367, + "learning_rate": 2.4617950576683934e-05, + "loss": 0.6079, + "step": 172270 + }, + { + "epoch": 1.523011368659276, + "grad_norm": 5.222714424133301, + "learning_rate": 2.4616477189012066e-05, + "loss": 0.6303, + "step": 172280 + }, + { + "epoch": 1.5230997719195885, + "grad_norm": 3.571319818496704, + "learning_rate": 2.4615003801340194e-05, + "loss": 0.6871, + "step": 172290 + }, + { + "epoch": 1.5231881751799006, + "grad_norm": 6.444335460662842, + "learning_rate": 2.4613530413668323e-05, + "loss": 0.7221, + "step": 172300 + }, + { + "epoch": 1.523276578440213, + "grad_norm": 3.561591625213623, + "learning_rate": 2.4612057025996454e-05, + "loss": 0.5972, + "step": 172310 + }, + { + "epoch": 1.5233649817005253, + "grad_norm": 2.5546274185180664, + "learning_rate": 2.4610583638324583e-05, + "loss": 0.5113, + "step": 172320 + }, + { + "epoch": 1.5234533849608374, + "grad_norm": 1.630532145500183, + "learning_rate": 2.460911025065271e-05, + "loss": 0.6621, + "step": 172330 + }, + { + "epoch": 1.5235417882211495, + "grad_norm": 2.120156764984131, + "learning_rate": 2.460763686298084e-05, + "loss": 0.5563, + "step": 172340 + }, + { + "epoch": 1.5236301914814618, + "grad_norm": 2.6163783073425293, + "learning_rate": 2.460616347530897e-05, + "loss": 0.6366, + "step": 172350 + }, + { + "epoch": 1.5237185947417742, + "grad_norm": 3.093794345855713, + "learning_rate": 2.46046900876371e-05, + "loss": 0.6206, + "step": 172360 + }, + { + "epoch": 1.5238069980020863, + "grad_norm": 2.097590208053589, + "learning_rate": 2.4603216699965228e-05, + "loss": 0.6025, + "step": 172370 + }, + { + "epoch": 1.5238954012623984, + "grad_norm": 6.869895935058594, + "learning_rate": 2.460174331229336e-05, + "loss": 0.6307, + "step": 172380 + }, + { + "epoch": 1.5239838045227108, + "grad_norm": 1.4451054334640503, + "learning_rate": 2.4600269924621488e-05, + "loss": 0.538, + "step": 172390 + }, + { + "epoch": 1.524072207783023, + "grad_norm": 3.133430242538452, + "learning_rate": 2.4598796536949616e-05, + "loss": 0.5259, + "step": 172400 + }, + { + "epoch": 1.5241606110433352, + "grad_norm": 2.374271869659424, + "learning_rate": 2.4597323149277745e-05, + "loss": 0.6656, + "step": 172410 + }, + { + "epoch": 1.5242490143036476, + "grad_norm": 4.873725891113281, + "learning_rate": 2.4595849761605876e-05, + "loss": 0.5989, + "step": 172420 + }, + { + "epoch": 1.52433741756396, + "grad_norm": 2.8913698196411133, + "learning_rate": 2.4594376373934005e-05, + "loss": 0.6233, + "step": 172430 + }, + { + "epoch": 1.524425820824272, + "grad_norm": 2.974576950073242, + "learning_rate": 2.4592902986262133e-05, + "loss": 0.5089, + "step": 172440 + }, + { + "epoch": 1.5245142240845841, + "grad_norm": 6.621434688568115, + "learning_rate": 2.459142959859026e-05, + "loss": 0.5803, + "step": 172450 + }, + { + "epoch": 1.5246026273448965, + "grad_norm": 1.4491280317306519, + "learning_rate": 2.4589956210918393e-05, + "loss": 0.5787, + "step": 172460 + }, + { + "epoch": 1.5246910306052088, + "grad_norm": 1.1783498525619507, + "learning_rate": 2.458848282324652e-05, + "loss": 0.6971, + "step": 172470 + }, + { + "epoch": 1.524779433865521, + "grad_norm": 1.5873522758483887, + "learning_rate": 2.458700943557465e-05, + "loss": 0.5434, + "step": 172480 + }, + { + "epoch": 1.524867837125833, + "grad_norm": 6.424190044403076, + "learning_rate": 2.458553604790278e-05, + "loss": 0.5754, + "step": 172490 + }, + { + "epoch": 1.5249562403861454, + "grad_norm": 1.5184119939804077, + "learning_rate": 2.458406266023091e-05, + "loss": 0.6341, + "step": 172500 + }, + { + "epoch": 1.5250446436464578, + "grad_norm": 2.2332565784454346, + "learning_rate": 2.458258927255904e-05, + "loss": 0.6545, + "step": 172510 + }, + { + "epoch": 1.5251330469067699, + "grad_norm": 7.326929092407227, + "learning_rate": 2.4581115884887167e-05, + "loss": 0.686, + "step": 172520 + }, + { + "epoch": 1.5252214501670822, + "grad_norm": 2.210966110229492, + "learning_rate": 2.45796424972153e-05, + "loss": 0.5042, + "step": 172530 + }, + { + "epoch": 1.5253098534273946, + "grad_norm": 1.439150094985962, + "learning_rate": 2.4578169109543427e-05, + "loss": 0.6445, + "step": 172540 + }, + { + "epoch": 1.5253982566877067, + "grad_norm": 9.683670997619629, + "learning_rate": 2.4576695721871555e-05, + "loss": 0.5704, + "step": 172550 + }, + { + "epoch": 1.5254866599480188, + "grad_norm": 2.252748489379883, + "learning_rate": 2.4575222334199683e-05, + "loss": 0.6423, + "step": 172560 + }, + { + "epoch": 1.5255750632083311, + "grad_norm": 2.7010691165924072, + "learning_rate": 2.4573748946527815e-05, + "loss": 0.6829, + "step": 172570 + }, + { + "epoch": 1.5256634664686435, + "grad_norm": 4.710685729980469, + "learning_rate": 2.4572275558855944e-05, + "loss": 0.6447, + "step": 172580 + }, + { + "epoch": 1.5257518697289556, + "grad_norm": 0.9347019195556641, + "learning_rate": 2.4570802171184072e-05, + "loss": 0.5261, + "step": 172590 + }, + { + "epoch": 1.5258402729892677, + "grad_norm": 3.8516714572906494, + "learning_rate": 2.4569328783512204e-05, + "loss": 0.5437, + "step": 172600 + }, + { + "epoch": 1.52592867624958, + "grad_norm": 1.5092065334320068, + "learning_rate": 2.4567855395840332e-05, + "loss": 0.6155, + "step": 172610 + }, + { + "epoch": 1.5260170795098924, + "grad_norm": 2.351064443588257, + "learning_rate": 2.456638200816846e-05, + "loss": 0.6252, + "step": 172620 + }, + { + "epoch": 1.5261054827702045, + "grad_norm": 1.4649814367294312, + "learning_rate": 2.4564908620496592e-05, + "loss": 0.4769, + "step": 172630 + }, + { + "epoch": 1.5261938860305166, + "grad_norm": 2.654024600982666, + "learning_rate": 2.456343523282472e-05, + "loss": 0.6006, + "step": 172640 + }, + { + "epoch": 1.5262822892908292, + "grad_norm": 8.104409217834473, + "learning_rate": 2.456196184515285e-05, + "loss": 0.5854, + "step": 172650 + }, + { + "epoch": 1.5263706925511413, + "grad_norm": 6.593908786773682, + "learning_rate": 2.456048845748098e-05, + "loss": 0.5601, + "step": 172660 + }, + { + "epoch": 1.5264590958114534, + "grad_norm": 10.686739921569824, + "learning_rate": 2.455901506980911e-05, + "loss": 0.5823, + "step": 172670 + }, + { + "epoch": 1.5265474990717658, + "grad_norm": 2.0972142219543457, + "learning_rate": 2.4557541682137237e-05, + "loss": 0.5837, + "step": 172680 + }, + { + "epoch": 1.5266359023320781, + "grad_norm": 3.830202102661133, + "learning_rate": 2.455606829446537e-05, + "loss": 0.598, + "step": 172690 + }, + { + "epoch": 1.5267243055923903, + "grad_norm": 1.2510401010513306, + "learning_rate": 2.4554594906793497e-05, + "loss": 0.7634, + "step": 172700 + }, + { + "epoch": 1.5268127088527024, + "grad_norm": 1.7513103485107422, + "learning_rate": 2.4553121519121626e-05, + "loss": 0.5552, + "step": 172710 + }, + { + "epoch": 1.5269011121130147, + "grad_norm": 2.178109884262085, + "learning_rate": 2.4551648131449757e-05, + "loss": 0.624, + "step": 172720 + }, + { + "epoch": 1.526989515373327, + "grad_norm": 1.3737125396728516, + "learning_rate": 2.4550174743777886e-05, + "loss": 0.6883, + "step": 172730 + }, + { + "epoch": 1.5270779186336392, + "grad_norm": 1.8638834953308105, + "learning_rate": 2.4548701356106014e-05, + "loss": 0.4657, + "step": 172740 + }, + { + "epoch": 1.5271663218939513, + "grad_norm": 4.034111976623535, + "learning_rate": 2.4547227968434146e-05, + "loss": 0.6838, + "step": 172750 + }, + { + "epoch": 1.5272547251542636, + "grad_norm": 3.114433526992798, + "learning_rate": 2.4545754580762274e-05, + "loss": 0.6157, + "step": 172760 + }, + { + "epoch": 1.527343128414576, + "grad_norm": 11.8702974319458, + "learning_rate": 2.4544281193090403e-05, + "loss": 0.6267, + "step": 172770 + }, + { + "epoch": 1.527431531674888, + "grad_norm": 1.5491470098495483, + "learning_rate": 2.4542807805418534e-05, + "loss": 0.5999, + "step": 172780 + }, + { + "epoch": 1.5275199349352004, + "grad_norm": 1.8267556428909302, + "learning_rate": 2.4541334417746663e-05, + "loss": 0.6425, + "step": 172790 + }, + { + "epoch": 1.5276083381955128, + "grad_norm": 5.833799362182617, + "learning_rate": 2.453986103007479e-05, + "loss": 0.4546, + "step": 172800 + }, + { + "epoch": 1.527696741455825, + "grad_norm": 4.049919605255127, + "learning_rate": 2.453838764240292e-05, + "loss": 0.6137, + "step": 172810 + }, + { + "epoch": 1.527785144716137, + "grad_norm": 1.2713209390640259, + "learning_rate": 2.453691425473105e-05, + "loss": 0.5319, + "step": 172820 + }, + { + "epoch": 1.5278735479764494, + "grad_norm": 3.581970453262329, + "learning_rate": 2.453544086705918e-05, + "loss": 0.6672, + "step": 172830 + }, + { + "epoch": 1.5279619512367617, + "grad_norm": 1.4777510166168213, + "learning_rate": 2.4533967479387308e-05, + "loss": 0.6279, + "step": 172840 + }, + { + "epoch": 1.5280503544970738, + "grad_norm": 1.2001938819885254, + "learning_rate": 2.453249409171544e-05, + "loss": 0.6069, + "step": 172850 + }, + { + "epoch": 1.528138757757386, + "grad_norm": 1.0515836477279663, + "learning_rate": 2.4531020704043568e-05, + "loss": 0.5836, + "step": 172860 + }, + { + "epoch": 1.5282271610176983, + "grad_norm": 3.4392929077148438, + "learning_rate": 2.4529547316371696e-05, + "loss": 0.5784, + "step": 172870 + }, + { + "epoch": 1.5283155642780106, + "grad_norm": 10.642607688903809, + "learning_rate": 2.4528073928699825e-05, + "loss": 0.6758, + "step": 172880 + }, + { + "epoch": 1.5284039675383227, + "grad_norm": 2.718301773071289, + "learning_rate": 2.4526600541027956e-05, + "loss": 0.5985, + "step": 172890 + }, + { + "epoch": 1.528492370798635, + "grad_norm": 9.04906940460205, + "learning_rate": 2.4525127153356085e-05, + "loss": 0.5806, + "step": 172900 + }, + { + "epoch": 1.5285807740589474, + "grad_norm": 1.8521445989608765, + "learning_rate": 2.4523653765684213e-05, + "loss": 0.509, + "step": 172910 + }, + { + "epoch": 1.5286691773192596, + "grad_norm": 1.8146170377731323, + "learning_rate": 2.452218037801234e-05, + "loss": 0.6793, + "step": 172920 + }, + { + "epoch": 1.5287575805795717, + "grad_norm": 8.418856620788574, + "learning_rate": 2.4520706990340473e-05, + "loss": 0.5273, + "step": 172930 + }, + { + "epoch": 1.528845983839884, + "grad_norm": 5.599218845367432, + "learning_rate": 2.45192336026686e-05, + "loss": 0.5419, + "step": 172940 + }, + { + "epoch": 1.5289343871001964, + "grad_norm": 13.081548690795898, + "learning_rate": 2.451776021499673e-05, + "loss": 0.621, + "step": 172950 + }, + { + "epoch": 1.5290227903605085, + "grad_norm": 1.769519329071045, + "learning_rate": 2.451628682732486e-05, + "loss": 0.545, + "step": 172960 + }, + { + "epoch": 1.5291111936208206, + "grad_norm": 0.8284148573875427, + "learning_rate": 2.451481343965299e-05, + "loss": 0.5714, + "step": 172970 + }, + { + "epoch": 1.529199596881133, + "grad_norm": 3.3695456981658936, + "learning_rate": 2.451334005198112e-05, + "loss": 0.6492, + "step": 172980 + }, + { + "epoch": 1.5292880001414453, + "grad_norm": 12.29538631439209, + "learning_rate": 2.4511866664309247e-05, + "loss": 0.5491, + "step": 172990 + }, + { + "epoch": 1.5293764034017574, + "grad_norm": 1.382818579673767, + "learning_rate": 2.451039327663738e-05, + "loss": 0.7067, + "step": 173000 + }, + { + "epoch": 1.5294648066620697, + "grad_norm": 13.862791061401367, + "learning_rate": 2.4508919888965507e-05, + "loss": 0.6633, + "step": 173010 + }, + { + "epoch": 1.529553209922382, + "grad_norm": 2.1402299404144287, + "learning_rate": 2.4507446501293635e-05, + "loss": 0.6518, + "step": 173020 + }, + { + "epoch": 1.5296416131826942, + "grad_norm": 13.009121894836426, + "learning_rate": 2.4505973113621764e-05, + "loss": 0.7374, + "step": 173030 + }, + { + "epoch": 1.5297300164430063, + "grad_norm": 3.0197532176971436, + "learning_rate": 2.4504499725949895e-05, + "loss": 0.5881, + "step": 173040 + }, + { + "epoch": 1.5298184197033187, + "grad_norm": 3.2528882026672363, + "learning_rate": 2.4503026338278024e-05, + "loss": 0.5221, + "step": 173050 + }, + { + "epoch": 1.529906822963631, + "grad_norm": 6.419832229614258, + "learning_rate": 2.4501552950606152e-05, + "loss": 0.6583, + "step": 173060 + }, + { + "epoch": 1.5299952262239431, + "grad_norm": 14.746634483337402, + "learning_rate": 2.4500079562934284e-05, + "loss": 0.5746, + "step": 173070 + }, + { + "epoch": 1.5300836294842552, + "grad_norm": 1.8239799737930298, + "learning_rate": 2.4498606175262412e-05, + "loss": 0.4518, + "step": 173080 + }, + { + "epoch": 1.5301720327445676, + "grad_norm": 1.7478179931640625, + "learning_rate": 2.449713278759054e-05, + "loss": 0.5104, + "step": 173090 + }, + { + "epoch": 1.53026043600488, + "grad_norm": 2.063530921936035, + "learning_rate": 2.449565939991867e-05, + "loss": 0.5551, + "step": 173100 + }, + { + "epoch": 1.530348839265192, + "grad_norm": 7.434767723083496, + "learning_rate": 2.44941860122468e-05, + "loss": 0.7746, + "step": 173110 + }, + { + "epoch": 1.5304372425255044, + "grad_norm": 5.22447395324707, + "learning_rate": 2.449271262457493e-05, + "loss": 0.6282, + "step": 173120 + }, + { + "epoch": 1.5305256457858167, + "grad_norm": 4.341371536254883, + "learning_rate": 2.4491239236903057e-05, + "loss": 0.6859, + "step": 173130 + }, + { + "epoch": 1.5306140490461289, + "grad_norm": 2.9289324283599854, + "learning_rate": 2.448976584923119e-05, + "loss": 0.6033, + "step": 173140 + }, + { + "epoch": 1.530702452306441, + "grad_norm": 4.917721271514893, + "learning_rate": 2.4488292461559317e-05, + "loss": 0.5989, + "step": 173150 + }, + { + "epoch": 1.5307908555667533, + "grad_norm": 1.901949167251587, + "learning_rate": 2.4486819073887446e-05, + "loss": 0.6076, + "step": 173160 + }, + { + "epoch": 1.5308792588270657, + "grad_norm": 6.93796968460083, + "learning_rate": 2.4485345686215574e-05, + "loss": 0.7159, + "step": 173170 + }, + { + "epoch": 1.5309676620873778, + "grad_norm": 7.3602294921875, + "learning_rate": 2.4483872298543706e-05, + "loss": 0.4781, + "step": 173180 + }, + { + "epoch": 1.53105606534769, + "grad_norm": 2.3484385013580322, + "learning_rate": 2.4482398910871834e-05, + "loss": 0.5808, + "step": 173190 + }, + { + "epoch": 1.5311444686080022, + "grad_norm": 3.399115800857544, + "learning_rate": 2.4480925523199962e-05, + "loss": 0.6417, + "step": 173200 + }, + { + "epoch": 1.5312328718683146, + "grad_norm": 6.387072563171387, + "learning_rate": 2.447945213552809e-05, + "loss": 0.5766, + "step": 173210 + }, + { + "epoch": 1.5313212751286267, + "grad_norm": 6.067452907562256, + "learning_rate": 2.4477978747856223e-05, + "loss": 0.6603, + "step": 173220 + }, + { + "epoch": 1.5314096783889388, + "grad_norm": 4.355941295623779, + "learning_rate": 2.447650536018435e-05, + "loss": 0.6971, + "step": 173230 + }, + { + "epoch": 1.5314980816492514, + "grad_norm": 1.731299877166748, + "learning_rate": 2.447503197251248e-05, + "loss": 0.5693, + "step": 173240 + }, + { + "epoch": 1.5315864849095635, + "grad_norm": 1.1960794925689697, + "learning_rate": 2.447355858484061e-05, + "loss": 0.5911, + "step": 173250 + }, + { + "epoch": 1.5316748881698756, + "grad_norm": 7.097519874572754, + "learning_rate": 2.447208519716874e-05, + "loss": 0.5844, + "step": 173260 + }, + { + "epoch": 1.531763291430188, + "grad_norm": 4.278408050537109, + "learning_rate": 2.4470611809496868e-05, + "loss": 0.5859, + "step": 173270 + }, + { + "epoch": 1.5318516946905003, + "grad_norm": 3.260118007659912, + "learning_rate": 2.4469138421824996e-05, + "loss": 0.5447, + "step": 173280 + }, + { + "epoch": 1.5319400979508124, + "grad_norm": 0.9700481295585632, + "learning_rate": 2.4467665034153128e-05, + "loss": 0.5629, + "step": 173290 + }, + { + "epoch": 1.5320285012111245, + "grad_norm": 6.768033027648926, + "learning_rate": 2.4466191646481256e-05, + "loss": 0.5933, + "step": 173300 + }, + { + "epoch": 1.532116904471437, + "grad_norm": 1.0262489318847656, + "learning_rate": 2.4464718258809385e-05, + "loss": 0.4784, + "step": 173310 + }, + { + "epoch": 1.5322053077317492, + "grad_norm": 2.351008653640747, + "learning_rate": 2.4463244871137516e-05, + "loss": 0.5758, + "step": 173320 + }, + { + "epoch": 1.5322937109920614, + "grad_norm": 2.3888816833496094, + "learning_rate": 2.4461771483465645e-05, + "loss": 0.6582, + "step": 173330 + }, + { + "epoch": 1.5323821142523735, + "grad_norm": 1.2799350023269653, + "learning_rate": 2.4460298095793773e-05, + "loss": 0.5523, + "step": 173340 + }, + { + "epoch": 1.5324705175126858, + "grad_norm": 2.7748446464538574, + "learning_rate": 2.44588247081219e-05, + "loss": 0.5929, + "step": 173350 + }, + { + "epoch": 1.5325589207729982, + "grad_norm": 1.8020994663238525, + "learning_rate": 2.4457351320450033e-05, + "loss": 0.537, + "step": 173360 + }, + { + "epoch": 1.5326473240333103, + "grad_norm": 2.196061611175537, + "learning_rate": 2.445587793277816e-05, + "loss": 0.56, + "step": 173370 + }, + { + "epoch": 1.5327357272936226, + "grad_norm": 2.0812814235687256, + "learning_rate": 2.445440454510629e-05, + "loss": 0.5524, + "step": 173380 + }, + { + "epoch": 1.532824130553935, + "grad_norm": 1.4463287591934204, + "learning_rate": 2.4452931157434418e-05, + "loss": 0.605, + "step": 173390 + }, + { + "epoch": 1.532912533814247, + "grad_norm": 1.5608011484146118, + "learning_rate": 2.445145776976255e-05, + "loss": 0.7594, + "step": 173400 + }, + { + "epoch": 1.5330009370745592, + "grad_norm": 3.173502206802368, + "learning_rate": 2.4449984382090678e-05, + "loss": 0.6403, + "step": 173410 + }, + { + "epoch": 1.5330893403348715, + "grad_norm": 2.8024933338165283, + "learning_rate": 2.4448510994418807e-05, + "loss": 0.6108, + "step": 173420 + }, + { + "epoch": 1.5331777435951839, + "grad_norm": 2.6264004707336426, + "learning_rate": 2.444703760674694e-05, + "loss": 0.5982, + "step": 173430 + }, + { + "epoch": 1.533266146855496, + "grad_norm": 1.6531671285629272, + "learning_rate": 2.4445564219075067e-05, + "loss": 0.6804, + "step": 173440 + }, + { + "epoch": 1.5333545501158081, + "grad_norm": 2.6333839893341064, + "learning_rate": 2.4444090831403195e-05, + "loss": 0.4988, + "step": 173450 + }, + { + "epoch": 1.5334429533761205, + "grad_norm": 2.5326106548309326, + "learning_rate": 2.4442617443731323e-05, + "loss": 0.5212, + "step": 173460 + }, + { + "epoch": 1.5335313566364328, + "grad_norm": 14.80500316619873, + "learning_rate": 2.4441144056059455e-05, + "loss": 0.496, + "step": 173470 + }, + { + "epoch": 1.533619759896745, + "grad_norm": 0.9426552653312683, + "learning_rate": 2.4439670668387584e-05, + "loss": 0.5625, + "step": 173480 + }, + { + "epoch": 1.5337081631570573, + "grad_norm": 1.0415114164352417, + "learning_rate": 2.4438197280715712e-05, + "loss": 0.6049, + "step": 173490 + }, + { + "epoch": 1.5337965664173696, + "grad_norm": 4.935723304748535, + "learning_rate": 2.443672389304384e-05, + "loss": 0.6203, + "step": 173500 + }, + { + "epoch": 1.5338849696776817, + "grad_norm": 10.291781425476074, + "learning_rate": 2.4435250505371972e-05, + "loss": 0.5572, + "step": 173510 + }, + { + "epoch": 1.5339733729379939, + "grad_norm": 4.828888893127441, + "learning_rate": 2.44337771177001e-05, + "loss": 0.532, + "step": 173520 + }, + { + "epoch": 1.5340617761983062, + "grad_norm": 3.1975908279418945, + "learning_rate": 2.443230373002823e-05, + "loss": 0.5609, + "step": 173530 + }, + { + "epoch": 1.5341501794586185, + "grad_norm": 1.3279147148132324, + "learning_rate": 2.443083034235636e-05, + "loss": 0.4884, + "step": 173540 + }, + { + "epoch": 1.5342385827189307, + "grad_norm": 4.276978015899658, + "learning_rate": 2.442935695468449e-05, + "loss": 0.6079, + "step": 173550 + }, + { + "epoch": 1.5343269859792428, + "grad_norm": 8.07887077331543, + "learning_rate": 2.442788356701262e-05, + "loss": 0.5453, + "step": 173560 + }, + { + "epoch": 1.5344153892395551, + "grad_norm": 1.3652163743972778, + "learning_rate": 2.442641017934075e-05, + "loss": 0.5976, + "step": 173570 + }, + { + "epoch": 1.5345037924998675, + "grad_norm": 2.8738296031951904, + "learning_rate": 2.4424936791668877e-05, + "loss": 0.644, + "step": 173580 + }, + { + "epoch": 1.5345921957601796, + "grad_norm": 4.004851341247559, + "learning_rate": 2.442346340399701e-05, + "loss": 0.726, + "step": 173590 + }, + { + "epoch": 1.534680599020492, + "grad_norm": 2.2931952476501465, + "learning_rate": 2.4421990016325137e-05, + "loss": 0.583, + "step": 173600 + }, + { + "epoch": 1.5347690022808043, + "grad_norm": 2.7399141788482666, + "learning_rate": 2.4420516628653266e-05, + "loss": 0.5554, + "step": 173610 + }, + { + "epoch": 1.5348574055411164, + "grad_norm": 3.6382925510406494, + "learning_rate": 2.4419043240981397e-05, + "loss": 0.5901, + "step": 173620 + }, + { + "epoch": 1.5349458088014285, + "grad_norm": 17.632320404052734, + "learning_rate": 2.4417569853309526e-05, + "loss": 0.5753, + "step": 173630 + }, + { + "epoch": 1.5350342120617408, + "grad_norm": 6.246155261993408, + "learning_rate": 2.4416096465637654e-05, + "loss": 0.6553, + "step": 173640 + }, + { + "epoch": 1.5351226153220532, + "grad_norm": 2.1301167011260986, + "learning_rate": 2.4414623077965786e-05, + "loss": 0.6287, + "step": 173650 + }, + { + "epoch": 1.5352110185823653, + "grad_norm": 2.511080741882324, + "learning_rate": 2.4413149690293914e-05, + "loss": 0.5956, + "step": 173660 + }, + { + "epoch": 1.5352994218426774, + "grad_norm": 2.67580509185791, + "learning_rate": 2.4411676302622043e-05, + "loss": 0.5855, + "step": 173670 + }, + { + "epoch": 1.5353878251029898, + "grad_norm": 1.544997215270996, + "learning_rate": 2.441020291495017e-05, + "loss": 0.5898, + "step": 173680 + }, + { + "epoch": 1.535476228363302, + "grad_norm": 4.787408351898193, + "learning_rate": 2.4408729527278303e-05, + "loss": 0.6563, + "step": 173690 + }, + { + "epoch": 1.5355646316236142, + "grad_norm": 2.9431087970733643, + "learning_rate": 2.440725613960643e-05, + "loss": 0.6099, + "step": 173700 + }, + { + "epoch": 1.5356530348839266, + "grad_norm": 2.1662964820861816, + "learning_rate": 2.440578275193456e-05, + "loss": 0.5175, + "step": 173710 + }, + { + "epoch": 1.535741438144239, + "grad_norm": 1.3347502946853638, + "learning_rate": 2.440430936426269e-05, + "loss": 0.5018, + "step": 173720 + }, + { + "epoch": 1.535829841404551, + "grad_norm": 2.976702928543091, + "learning_rate": 2.440283597659082e-05, + "loss": 0.6384, + "step": 173730 + }, + { + "epoch": 1.5359182446648632, + "grad_norm": 2.4031283855438232, + "learning_rate": 2.4401362588918948e-05, + "loss": 0.5441, + "step": 173740 + }, + { + "epoch": 1.5360066479251755, + "grad_norm": 4.3219313621521, + "learning_rate": 2.4399889201247076e-05, + "loss": 0.6106, + "step": 173750 + }, + { + "epoch": 1.5360950511854878, + "grad_norm": 3.153588056564331, + "learning_rate": 2.4398415813575208e-05, + "loss": 0.6213, + "step": 173760 + }, + { + "epoch": 1.5361834544458, + "grad_norm": 1.836745023727417, + "learning_rate": 2.4396942425903336e-05, + "loss": 0.5739, + "step": 173770 + }, + { + "epoch": 1.536271857706112, + "grad_norm": 1.1322550773620605, + "learning_rate": 2.4395469038231465e-05, + "loss": 0.5097, + "step": 173780 + }, + { + "epoch": 1.5363602609664244, + "grad_norm": 3.6168644428253174, + "learning_rate": 2.4393995650559596e-05, + "loss": 0.7178, + "step": 173790 + }, + { + "epoch": 1.5364486642267368, + "grad_norm": 2.928410291671753, + "learning_rate": 2.4392522262887725e-05, + "loss": 0.5653, + "step": 173800 + }, + { + "epoch": 1.5365370674870489, + "grad_norm": 1.3587273359298706, + "learning_rate": 2.4391048875215853e-05, + "loss": 0.511, + "step": 173810 + }, + { + "epoch": 1.536625470747361, + "grad_norm": 4.807299613952637, + "learning_rate": 2.438957548754398e-05, + "loss": 0.5686, + "step": 173820 + }, + { + "epoch": 1.5367138740076736, + "grad_norm": 2.6591005325317383, + "learning_rate": 2.4388102099872113e-05, + "loss": 0.62, + "step": 173830 + }, + { + "epoch": 1.5368022772679857, + "grad_norm": 1.6205626726150513, + "learning_rate": 2.438662871220024e-05, + "loss": 0.5527, + "step": 173840 + }, + { + "epoch": 1.5368906805282978, + "grad_norm": 1.0479035377502441, + "learning_rate": 2.438515532452837e-05, + "loss": 0.7163, + "step": 173850 + }, + { + "epoch": 1.5369790837886101, + "grad_norm": 4.933063507080078, + "learning_rate": 2.4383681936856498e-05, + "loss": 0.6411, + "step": 173860 + }, + { + "epoch": 1.5370674870489225, + "grad_norm": 1.8709900379180908, + "learning_rate": 2.438220854918463e-05, + "loss": 0.5911, + "step": 173870 + }, + { + "epoch": 1.5371558903092346, + "grad_norm": 1.4639195203781128, + "learning_rate": 2.438073516151276e-05, + "loss": 0.6079, + "step": 173880 + }, + { + "epoch": 1.5372442935695467, + "grad_norm": 2.9190611839294434, + "learning_rate": 2.4379261773840887e-05, + "loss": 0.5513, + "step": 173890 + }, + { + "epoch": 1.537332696829859, + "grad_norm": 2.1594042778015137, + "learning_rate": 2.437778838616902e-05, + "loss": 0.6593, + "step": 173900 + }, + { + "epoch": 1.5374211000901714, + "grad_norm": 3.106978416442871, + "learning_rate": 2.4376314998497147e-05, + "loss": 0.5475, + "step": 173910 + }, + { + "epoch": 1.5375095033504835, + "grad_norm": 8.02630615234375, + "learning_rate": 2.4374841610825275e-05, + "loss": 0.7161, + "step": 173920 + }, + { + "epoch": 1.5375979066107957, + "grad_norm": 1.470693826675415, + "learning_rate": 2.4373368223153403e-05, + "loss": 0.5072, + "step": 173930 + }, + { + "epoch": 1.537686309871108, + "grad_norm": 1.2773759365081787, + "learning_rate": 2.4371894835481535e-05, + "loss": 0.5743, + "step": 173940 + }, + { + "epoch": 1.5377747131314203, + "grad_norm": 1.7460817098617554, + "learning_rate": 2.4370421447809664e-05, + "loss": 0.4711, + "step": 173950 + }, + { + "epoch": 1.5378631163917325, + "grad_norm": 2.5927224159240723, + "learning_rate": 2.4368948060137792e-05, + "loss": 0.7205, + "step": 173960 + }, + { + "epoch": 1.5379515196520448, + "grad_norm": 1.0669831037521362, + "learning_rate": 2.4367474672465924e-05, + "loss": 0.5475, + "step": 173970 + }, + { + "epoch": 1.5380399229123571, + "grad_norm": 1.3232972621917725, + "learning_rate": 2.4366001284794052e-05, + "loss": 0.6209, + "step": 173980 + }, + { + "epoch": 1.5381283261726693, + "grad_norm": 2.118748188018799, + "learning_rate": 2.436452789712218e-05, + "loss": 0.6935, + "step": 173990 + }, + { + "epoch": 1.5382167294329814, + "grad_norm": 3.4111011028289795, + "learning_rate": 2.436305450945031e-05, + "loss": 0.6815, + "step": 174000 + }, + { + "epoch": 1.5383051326932937, + "grad_norm": 1.8020133972167969, + "learning_rate": 2.436158112177844e-05, + "loss": 0.7691, + "step": 174010 + }, + { + "epoch": 1.538393535953606, + "grad_norm": 2.221857786178589, + "learning_rate": 2.436010773410657e-05, + "loss": 0.4535, + "step": 174020 + }, + { + "epoch": 1.5384819392139182, + "grad_norm": 1.7068196535110474, + "learning_rate": 2.4358634346434697e-05, + "loss": 0.552, + "step": 174030 + }, + { + "epoch": 1.5385703424742303, + "grad_norm": 3.848716974258423, + "learning_rate": 2.4357160958762826e-05, + "loss": 0.4983, + "step": 174040 + }, + { + "epoch": 1.5386587457345426, + "grad_norm": 1.0871692895889282, + "learning_rate": 2.4355687571090957e-05, + "loss": 0.5567, + "step": 174050 + }, + { + "epoch": 1.538747148994855, + "grad_norm": 3.127941131591797, + "learning_rate": 2.4354214183419086e-05, + "loss": 0.582, + "step": 174060 + }, + { + "epoch": 1.538835552255167, + "grad_norm": 2.89378023147583, + "learning_rate": 2.4352740795747214e-05, + "loss": 0.5841, + "step": 174070 + }, + { + "epoch": 1.5389239555154794, + "grad_norm": 2.551440715789795, + "learning_rate": 2.4351267408075346e-05, + "loss": 0.5411, + "step": 174080 + }, + { + "epoch": 1.5390123587757918, + "grad_norm": 2.0462491512298584, + "learning_rate": 2.4349794020403474e-05, + "loss": 0.7938, + "step": 174090 + }, + { + "epoch": 1.539100762036104, + "grad_norm": 1.5742062330245972, + "learning_rate": 2.4348320632731602e-05, + "loss": 0.7603, + "step": 174100 + }, + { + "epoch": 1.539189165296416, + "grad_norm": 1.2905710935592651, + "learning_rate": 2.434684724505973e-05, + "loss": 0.7115, + "step": 174110 + }, + { + "epoch": 1.5392775685567284, + "grad_norm": 1.3261444568634033, + "learning_rate": 2.4345373857387863e-05, + "loss": 0.6477, + "step": 174120 + }, + { + "epoch": 1.5393659718170407, + "grad_norm": 1.732905387878418, + "learning_rate": 2.434390046971599e-05, + "loss": 0.6756, + "step": 174130 + }, + { + "epoch": 1.5394543750773528, + "grad_norm": 1.3431121110916138, + "learning_rate": 2.434242708204412e-05, + "loss": 0.4962, + "step": 174140 + }, + { + "epoch": 1.539542778337665, + "grad_norm": 1.2139936685562134, + "learning_rate": 2.4340953694372248e-05, + "loss": 0.7039, + "step": 174150 + }, + { + "epoch": 1.5396311815979773, + "grad_norm": 3.871922016143799, + "learning_rate": 2.433948030670038e-05, + "loss": 0.5405, + "step": 174160 + }, + { + "epoch": 1.5397195848582896, + "grad_norm": 16.565261840820312, + "learning_rate": 2.4338006919028508e-05, + "loss": 0.5995, + "step": 174170 + }, + { + "epoch": 1.5398079881186018, + "grad_norm": 4.501935958862305, + "learning_rate": 2.4336533531356636e-05, + "loss": 0.5628, + "step": 174180 + }, + { + "epoch": 1.539896391378914, + "grad_norm": 10.195740699768066, + "learning_rate": 2.4335060143684768e-05, + "loss": 0.491, + "step": 174190 + }, + { + "epoch": 1.5399847946392264, + "grad_norm": 1.7179523706436157, + "learning_rate": 2.4333586756012896e-05, + "loss": 0.5862, + "step": 174200 + }, + { + "epoch": 1.5400731978995386, + "grad_norm": 3.060002326965332, + "learning_rate": 2.4332113368341025e-05, + "loss": 0.5974, + "step": 174210 + }, + { + "epoch": 1.5401616011598507, + "grad_norm": 4.0132670402526855, + "learning_rate": 2.4330639980669153e-05, + "loss": 0.593, + "step": 174220 + }, + { + "epoch": 1.540250004420163, + "grad_norm": 27.567434310913086, + "learning_rate": 2.4329166592997285e-05, + "loss": 0.682, + "step": 174230 + }, + { + "epoch": 1.5403384076804754, + "grad_norm": 11.265193939208984, + "learning_rate": 2.4327693205325413e-05, + "loss": 0.4424, + "step": 174240 + }, + { + "epoch": 1.5404268109407875, + "grad_norm": 2.146970272064209, + "learning_rate": 2.432621981765354e-05, + "loss": 0.5641, + "step": 174250 + }, + { + "epoch": 1.5405152142010996, + "grad_norm": 3.656827688217163, + "learning_rate": 2.4324746429981673e-05, + "loss": 0.5437, + "step": 174260 + }, + { + "epoch": 1.540603617461412, + "grad_norm": 7.898015022277832, + "learning_rate": 2.43232730423098e-05, + "loss": 0.5011, + "step": 174270 + }, + { + "epoch": 1.5406920207217243, + "grad_norm": 2.4155168533325195, + "learning_rate": 2.432179965463793e-05, + "loss": 0.6331, + "step": 174280 + }, + { + "epoch": 1.5407804239820364, + "grad_norm": 4.174191951751709, + "learning_rate": 2.4320326266966058e-05, + "loss": 0.5373, + "step": 174290 + }, + { + "epoch": 1.5408688272423487, + "grad_norm": 3.2757620811462402, + "learning_rate": 2.431885287929419e-05, + "loss": 0.6991, + "step": 174300 + }, + { + "epoch": 1.540957230502661, + "grad_norm": 8.390728950500488, + "learning_rate": 2.4317379491622318e-05, + "loss": 0.6275, + "step": 174310 + }, + { + "epoch": 1.5410456337629732, + "grad_norm": 1.6958503723144531, + "learning_rate": 2.4315906103950447e-05, + "loss": 0.4597, + "step": 174320 + }, + { + "epoch": 1.5411340370232853, + "grad_norm": 6.810363292694092, + "learning_rate": 2.4314432716278575e-05, + "loss": 0.6064, + "step": 174330 + }, + { + "epoch": 1.5412224402835977, + "grad_norm": 2.7560408115386963, + "learning_rate": 2.4312959328606707e-05, + "loss": 0.5504, + "step": 174340 + }, + { + "epoch": 1.54131084354391, + "grad_norm": 2.6676573753356934, + "learning_rate": 2.4311485940934835e-05, + "loss": 0.5673, + "step": 174350 + }, + { + "epoch": 1.5413992468042221, + "grad_norm": 2.806307077407837, + "learning_rate": 2.4310012553262963e-05, + "loss": 0.6896, + "step": 174360 + }, + { + "epoch": 1.5414876500645343, + "grad_norm": 4.78728723526001, + "learning_rate": 2.4308539165591095e-05, + "loss": 0.7291, + "step": 174370 + }, + { + "epoch": 1.5415760533248466, + "grad_norm": 0.6354554891586304, + "learning_rate": 2.4307065777919223e-05, + "loss": 0.4028, + "step": 174380 + }, + { + "epoch": 1.541664456585159, + "grad_norm": 2.0700461864471436, + "learning_rate": 2.4305592390247352e-05, + "loss": 0.7117, + "step": 174390 + }, + { + "epoch": 1.541752859845471, + "grad_norm": 5.30848503112793, + "learning_rate": 2.430411900257548e-05, + "loss": 0.6401, + "step": 174400 + }, + { + "epoch": 1.5418412631057832, + "grad_norm": 1.5651367902755737, + "learning_rate": 2.4302645614903612e-05, + "loss": 0.6562, + "step": 174410 + }, + { + "epoch": 1.5419296663660957, + "grad_norm": 14.498424530029297, + "learning_rate": 2.430117222723174e-05, + "loss": 0.6783, + "step": 174420 + }, + { + "epoch": 1.5420180696264079, + "grad_norm": 2.9781339168548584, + "learning_rate": 2.429969883955987e-05, + "loss": 0.4447, + "step": 174430 + }, + { + "epoch": 1.54210647288672, + "grad_norm": 2.2297732830047607, + "learning_rate": 2.4298225451888e-05, + "loss": 0.6672, + "step": 174440 + }, + { + "epoch": 1.5421948761470323, + "grad_norm": 6.882424354553223, + "learning_rate": 2.429675206421613e-05, + "loss": 0.6849, + "step": 174450 + }, + { + "epoch": 1.5422832794073447, + "grad_norm": 4.663057327270508, + "learning_rate": 2.4295278676544257e-05, + "loss": 0.7398, + "step": 174460 + }, + { + "epoch": 1.5423716826676568, + "grad_norm": 1.9308974742889404, + "learning_rate": 2.429380528887239e-05, + "loss": 0.5704, + "step": 174470 + }, + { + "epoch": 1.542460085927969, + "grad_norm": 3.26165771484375, + "learning_rate": 2.4292331901200517e-05, + "loss": 0.6808, + "step": 174480 + }, + { + "epoch": 1.5425484891882812, + "grad_norm": 2.3461320400238037, + "learning_rate": 2.4290858513528646e-05, + "loss": 0.5786, + "step": 174490 + }, + { + "epoch": 1.5426368924485936, + "grad_norm": 1.0739569664001465, + "learning_rate": 2.4289385125856777e-05, + "loss": 0.5507, + "step": 174500 + }, + { + "epoch": 1.5427252957089057, + "grad_norm": 1.6214160919189453, + "learning_rate": 2.4287911738184906e-05, + "loss": 0.5693, + "step": 174510 + }, + { + "epoch": 1.5428136989692178, + "grad_norm": 2.236431121826172, + "learning_rate": 2.4286438350513034e-05, + "loss": 0.6735, + "step": 174520 + }, + { + "epoch": 1.5429021022295302, + "grad_norm": 1.9672120809555054, + "learning_rate": 2.4284964962841166e-05, + "loss": 0.5716, + "step": 174530 + }, + { + "epoch": 1.5429905054898425, + "grad_norm": 2.1955885887145996, + "learning_rate": 2.4283491575169294e-05, + "loss": 0.5009, + "step": 174540 + }, + { + "epoch": 1.5430789087501546, + "grad_norm": 3.2552270889282227, + "learning_rate": 2.4282018187497422e-05, + "loss": 0.5852, + "step": 174550 + }, + { + "epoch": 1.543167312010467, + "grad_norm": 4.058462142944336, + "learning_rate": 2.4280544799825554e-05, + "loss": 0.668, + "step": 174560 + }, + { + "epoch": 1.5432557152707793, + "grad_norm": 8.593563079833984, + "learning_rate": 2.4279071412153683e-05, + "loss": 0.6956, + "step": 174570 + }, + { + "epoch": 1.5433441185310914, + "grad_norm": 3.849159002304077, + "learning_rate": 2.427759802448181e-05, + "loss": 0.5241, + "step": 174580 + }, + { + "epoch": 1.5434325217914036, + "grad_norm": 2.745166301727295, + "learning_rate": 2.4276124636809943e-05, + "loss": 0.5873, + "step": 174590 + }, + { + "epoch": 1.543520925051716, + "grad_norm": 2.396702527999878, + "learning_rate": 2.427465124913807e-05, + "loss": 0.5339, + "step": 174600 + }, + { + "epoch": 1.5436093283120282, + "grad_norm": 4.112366676330566, + "learning_rate": 2.42731778614662e-05, + "loss": 0.5718, + "step": 174610 + }, + { + "epoch": 1.5436977315723404, + "grad_norm": 2.43721342086792, + "learning_rate": 2.4271704473794328e-05, + "loss": 0.5891, + "step": 174620 + }, + { + "epoch": 1.5437861348326525, + "grad_norm": 11.213445663452148, + "learning_rate": 2.427023108612246e-05, + "loss": 0.5832, + "step": 174630 + }, + { + "epoch": 1.5438745380929648, + "grad_norm": 7.7145490646362305, + "learning_rate": 2.4268757698450588e-05, + "loss": 0.7186, + "step": 174640 + }, + { + "epoch": 1.5439629413532772, + "grad_norm": 2.1213717460632324, + "learning_rate": 2.4267284310778716e-05, + "loss": 0.6147, + "step": 174650 + }, + { + "epoch": 1.5440513446135893, + "grad_norm": 4.611488342285156, + "learning_rate": 2.4265810923106848e-05, + "loss": 0.6268, + "step": 174660 + }, + { + "epoch": 1.5441397478739016, + "grad_norm": 3.4685707092285156, + "learning_rate": 2.4264337535434976e-05, + "loss": 0.7543, + "step": 174670 + }, + { + "epoch": 1.544228151134214, + "grad_norm": 1.8858023881912231, + "learning_rate": 2.4262864147763105e-05, + "loss": 0.6068, + "step": 174680 + }, + { + "epoch": 1.544316554394526, + "grad_norm": 1.863261342048645, + "learning_rate": 2.4261390760091233e-05, + "loss": 0.5625, + "step": 174690 + }, + { + "epoch": 1.5444049576548382, + "grad_norm": 1.830488681793213, + "learning_rate": 2.4259917372419365e-05, + "loss": 0.5935, + "step": 174700 + }, + { + "epoch": 1.5444933609151505, + "grad_norm": 17.094575881958008, + "learning_rate": 2.4258443984747493e-05, + "loss": 0.5845, + "step": 174710 + }, + { + "epoch": 1.544581764175463, + "grad_norm": 3.5949769020080566, + "learning_rate": 2.425697059707562e-05, + "loss": 0.5959, + "step": 174720 + }, + { + "epoch": 1.544670167435775, + "grad_norm": 3.0940229892730713, + "learning_rate": 2.4255497209403753e-05, + "loss": 0.6586, + "step": 174730 + }, + { + "epoch": 1.5447585706960871, + "grad_norm": 1.356379508972168, + "learning_rate": 2.425402382173188e-05, + "loss": 0.6374, + "step": 174740 + }, + { + "epoch": 1.5448469739563995, + "grad_norm": 6.310287952423096, + "learning_rate": 2.425255043406001e-05, + "loss": 0.6854, + "step": 174750 + }, + { + "epoch": 1.5449353772167118, + "grad_norm": 5.824239730834961, + "learning_rate": 2.4251077046388138e-05, + "loss": 0.7386, + "step": 174760 + }, + { + "epoch": 1.545023780477024, + "grad_norm": 2.230116844177246, + "learning_rate": 2.424960365871627e-05, + "loss": 0.4884, + "step": 174770 + }, + { + "epoch": 1.5451121837373363, + "grad_norm": 1.2905182838439941, + "learning_rate": 2.4248130271044398e-05, + "loss": 0.5113, + "step": 174780 + }, + { + "epoch": 1.5452005869976486, + "grad_norm": 2.199789524078369, + "learning_rate": 2.4246656883372527e-05, + "loss": 0.5855, + "step": 174790 + }, + { + "epoch": 1.5452889902579607, + "grad_norm": 2.3680264949798584, + "learning_rate": 2.4245183495700655e-05, + "loss": 0.5911, + "step": 174800 + }, + { + "epoch": 1.5453773935182729, + "grad_norm": 1.7917861938476562, + "learning_rate": 2.4243710108028787e-05, + "loss": 0.6001, + "step": 174810 + }, + { + "epoch": 1.5454657967785852, + "grad_norm": 1.727258324623108, + "learning_rate": 2.4242236720356915e-05, + "loss": 0.6476, + "step": 174820 + }, + { + "epoch": 1.5455542000388975, + "grad_norm": 1.3490941524505615, + "learning_rate": 2.4240763332685043e-05, + "loss": 0.5915, + "step": 174830 + }, + { + "epoch": 1.5456426032992097, + "grad_norm": 1.7392785549163818, + "learning_rate": 2.4239289945013175e-05, + "loss": 0.5682, + "step": 174840 + }, + { + "epoch": 1.5457310065595218, + "grad_norm": 2.2393693923950195, + "learning_rate": 2.4237816557341304e-05, + "loss": 0.55, + "step": 174850 + }, + { + "epoch": 1.5458194098198341, + "grad_norm": 14.635035514831543, + "learning_rate": 2.4236343169669432e-05, + "loss": 0.5477, + "step": 174860 + }, + { + "epoch": 1.5459078130801465, + "grad_norm": 2.3071987628936768, + "learning_rate": 2.423486978199756e-05, + "loss": 0.7754, + "step": 174870 + }, + { + "epoch": 1.5459962163404586, + "grad_norm": 1.2465306520462036, + "learning_rate": 2.4233396394325692e-05, + "loss": 0.6085, + "step": 174880 + }, + { + "epoch": 1.546084619600771, + "grad_norm": 1.6105276346206665, + "learning_rate": 2.423192300665382e-05, + "loss": 0.694, + "step": 174890 + }, + { + "epoch": 1.5461730228610833, + "grad_norm": 0.9031888246536255, + "learning_rate": 2.423044961898195e-05, + "loss": 0.6845, + "step": 174900 + }, + { + "epoch": 1.5462614261213954, + "grad_norm": 1.2699271440505981, + "learning_rate": 2.422897623131008e-05, + "loss": 0.5156, + "step": 174910 + }, + { + "epoch": 1.5463498293817075, + "grad_norm": 3.84972882270813, + "learning_rate": 2.422750284363821e-05, + "loss": 0.6114, + "step": 174920 + }, + { + "epoch": 1.5464382326420198, + "grad_norm": 2.710536003112793, + "learning_rate": 2.4226029455966337e-05, + "loss": 0.6652, + "step": 174930 + }, + { + "epoch": 1.5465266359023322, + "grad_norm": 1.5496443510055542, + "learning_rate": 2.4224556068294465e-05, + "loss": 0.5837, + "step": 174940 + }, + { + "epoch": 1.5466150391626443, + "grad_norm": 4.660191535949707, + "learning_rate": 2.4223082680622597e-05, + "loss": 0.5843, + "step": 174950 + }, + { + "epoch": 1.5467034424229564, + "grad_norm": 4.089006423950195, + "learning_rate": 2.4221609292950726e-05, + "loss": 0.5533, + "step": 174960 + }, + { + "epoch": 1.5467918456832688, + "grad_norm": 5.7805256843566895, + "learning_rate": 2.4220135905278854e-05, + "loss": 0.5759, + "step": 174970 + }, + { + "epoch": 1.5468802489435811, + "grad_norm": 1.7269043922424316, + "learning_rate": 2.4218662517606982e-05, + "loss": 0.5542, + "step": 174980 + }, + { + "epoch": 1.5469686522038932, + "grad_norm": 3.431790828704834, + "learning_rate": 2.4217189129935114e-05, + "loss": 0.659, + "step": 174990 + }, + { + "epoch": 1.5470570554642054, + "grad_norm": 1.9091544151306152, + "learning_rate": 2.4215715742263242e-05, + "loss": 0.6308, + "step": 175000 + }, + { + "epoch": 1.547145458724518, + "grad_norm": 1.4534193277359009, + "learning_rate": 2.421424235459137e-05, + "loss": 0.5355, + "step": 175010 + }, + { + "epoch": 1.54723386198483, + "grad_norm": 2.6281485557556152, + "learning_rate": 2.4212768966919502e-05, + "loss": 0.6187, + "step": 175020 + }, + { + "epoch": 1.5473222652451422, + "grad_norm": 1.8663088083267212, + "learning_rate": 2.421129557924763e-05, + "loss": 0.7375, + "step": 175030 + }, + { + "epoch": 1.5474106685054545, + "grad_norm": 6.004947662353516, + "learning_rate": 2.420982219157576e-05, + "loss": 0.6483, + "step": 175040 + }, + { + "epoch": 1.5474990717657668, + "grad_norm": 3.082247018814087, + "learning_rate": 2.4208348803903888e-05, + "loss": 0.6405, + "step": 175050 + }, + { + "epoch": 1.547587475026079, + "grad_norm": 1.9165914058685303, + "learning_rate": 2.420687541623202e-05, + "loss": 0.4826, + "step": 175060 + }, + { + "epoch": 1.547675878286391, + "grad_norm": 4.244795322418213, + "learning_rate": 2.4205402028560148e-05, + "loss": 0.6104, + "step": 175070 + }, + { + "epoch": 1.5477642815467034, + "grad_norm": 5.788334846496582, + "learning_rate": 2.4203928640888276e-05, + "loss": 0.5436, + "step": 175080 + }, + { + "epoch": 1.5478526848070158, + "grad_norm": 1.8514665365219116, + "learning_rate": 2.4202455253216408e-05, + "loss": 0.6689, + "step": 175090 + }, + { + "epoch": 1.5479410880673279, + "grad_norm": 5.626691818237305, + "learning_rate": 2.4200981865544536e-05, + "loss": 0.6319, + "step": 175100 + }, + { + "epoch": 1.54802949132764, + "grad_norm": 4.971273899078369, + "learning_rate": 2.4199508477872664e-05, + "loss": 0.5258, + "step": 175110 + }, + { + "epoch": 1.5481178945879526, + "grad_norm": 4.934142112731934, + "learning_rate": 2.4198035090200793e-05, + "loss": 0.4408, + "step": 175120 + }, + { + "epoch": 1.5482062978482647, + "grad_norm": 14.0502347946167, + "learning_rate": 2.4196561702528925e-05, + "loss": 0.4426, + "step": 175130 + }, + { + "epoch": 1.5482947011085768, + "grad_norm": 2.326627016067505, + "learning_rate": 2.4195088314857053e-05, + "loss": 0.6075, + "step": 175140 + }, + { + "epoch": 1.5483831043688892, + "grad_norm": 2.2757251262664795, + "learning_rate": 2.419361492718518e-05, + "loss": 0.6883, + "step": 175150 + }, + { + "epoch": 1.5484715076292015, + "grad_norm": 2.406067371368408, + "learning_rate": 2.419214153951331e-05, + "loss": 0.6454, + "step": 175160 + }, + { + "epoch": 1.5485599108895136, + "grad_norm": 2.498495578765869, + "learning_rate": 2.419066815184144e-05, + "loss": 0.5722, + "step": 175170 + }, + { + "epoch": 1.5486483141498257, + "grad_norm": 3.516601800918579, + "learning_rate": 2.418919476416957e-05, + "loss": 0.5675, + "step": 175180 + }, + { + "epoch": 1.548736717410138, + "grad_norm": 1.4392104148864746, + "learning_rate": 2.4187721376497698e-05, + "loss": 0.5271, + "step": 175190 + }, + { + "epoch": 1.5488251206704504, + "grad_norm": 3.1388044357299805, + "learning_rate": 2.418624798882583e-05, + "loss": 0.6841, + "step": 175200 + }, + { + "epoch": 1.5489135239307625, + "grad_norm": 4.868514060974121, + "learning_rate": 2.4184774601153958e-05, + "loss": 0.5321, + "step": 175210 + }, + { + "epoch": 1.5490019271910747, + "grad_norm": 4.692134380340576, + "learning_rate": 2.4183301213482087e-05, + "loss": 0.6547, + "step": 175220 + }, + { + "epoch": 1.549090330451387, + "grad_norm": 1.6500064134597778, + "learning_rate": 2.4181827825810215e-05, + "loss": 0.6132, + "step": 175230 + }, + { + "epoch": 1.5491787337116993, + "grad_norm": 2.7626256942749023, + "learning_rate": 2.4180354438138347e-05, + "loss": 0.4775, + "step": 175240 + }, + { + "epoch": 1.5492671369720115, + "grad_norm": 1.8304368257522583, + "learning_rate": 2.4178881050466475e-05, + "loss": 0.7541, + "step": 175250 + }, + { + "epoch": 1.5493555402323238, + "grad_norm": 8.355866432189941, + "learning_rate": 2.4177407662794603e-05, + "loss": 0.7308, + "step": 175260 + }, + { + "epoch": 1.5494439434926361, + "grad_norm": 1.6950831413269043, + "learning_rate": 2.417593427512273e-05, + "loss": 0.5514, + "step": 175270 + }, + { + "epoch": 1.5495323467529483, + "grad_norm": 0.6586547493934631, + "learning_rate": 2.4174460887450863e-05, + "loss": 0.5595, + "step": 175280 + }, + { + "epoch": 1.5496207500132604, + "grad_norm": 9.266799926757812, + "learning_rate": 2.4172987499778992e-05, + "loss": 0.4993, + "step": 175290 + }, + { + "epoch": 1.5497091532735727, + "grad_norm": 1.9516414403915405, + "learning_rate": 2.417151411210712e-05, + "loss": 0.6373, + "step": 175300 + }, + { + "epoch": 1.549797556533885, + "grad_norm": 3.445497512817383, + "learning_rate": 2.4170040724435252e-05, + "loss": 0.6679, + "step": 175310 + }, + { + "epoch": 1.5498859597941972, + "grad_norm": 2.7462542057037354, + "learning_rate": 2.416856733676338e-05, + "loss": 0.4786, + "step": 175320 + }, + { + "epoch": 1.5499743630545093, + "grad_norm": 4.17148494720459, + "learning_rate": 2.416709394909151e-05, + "loss": 0.6029, + "step": 175330 + }, + { + "epoch": 1.5500627663148216, + "grad_norm": 1.4304940700531006, + "learning_rate": 2.4165620561419637e-05, + "loss": 0.466, + "step": 175340 + }, + { + "epoch": 1.550151169575134, + "grad_norm": 2.634665012359619, + "learning_rate": 2.416414717374777e-05, + "loss": 0.6591, + "step": 175350 + }, + { + "epoch": 1.5502395728354461, + "grad_norm": 6.376070499420166, + "learning_rate": 2.4162673786075897e-05, + "loss": 0.6488, + "step": 175360 + }, + { + "epoch": 1.5503279760957585, + "grad_norm": 2.5970404148101807, + "learning_rate": 2.4161200398404025e-05, + "loss": 0.5881, + "step": 175370 + }, + { + "epoch": 1.5504163793560708, + "grad_norm": 3.7440390586853027, + "learning_rate": 2.4159727010732157e-05, + "loss": 0.5171, + "step": 175380 + }, + { + "epoch": 1.550504782616383, + "grad_norm": 4.337168216705322, + "learning_rate": 2.4158253623060285e-05, + "loss": 0.5246, + "step": 175390 + }, + { + "epoch": 1.550593185876695, + "grad_norm": 1.1312719583511353, + "learning_rate": 2.4156780235388414e-05, + "loss": 0.5638, + "step": 175400 + }, + { + "epoch": 1.5506815891370074, + "grad_norm": 2.632840871810913, + "learning_rate": 2.4155306847716546e-05, + "loss": 0.5473, + "step": 175410 + }, + { + "epoch": 1.5507699923973197, + "grad_norm": 1.6288197040557861, + "learning_rate": 2.4153833460044674e-05, + "loss": 0.6525, + "step": 175420 + }, + { + "epoch": 1.5508583956576318, + "grad_norm": 6.673985481262207, + "learning_rate": 2.4152360072372802e-05, + "loss": 0.5225, + "step": 175430 + }, + { + "epoch": 1.550946798917944, + "grad_norm": 3.8512940406799316, + "learning_rate": 2.4150886684700934e-05, + "loss": 0.5995, + "step": 175440 + }, + { + "epoch": 1.5510352021782563, + "grad_norm": 2.3673620223999023, + "learning_rate": 2.4149413297029062e-05, + "loss": 0.5624, + "step": 175450 + }, + { + "epoch": 1.5511236054385686, + "grad_norm": 3.6392786502838135, + "learning_rate": 2.414793990935719e-05, + "loss": 0.4961, + "step": 175460 + }, + { + "epoch": 1.5512120086988808, + "grad_norm": 3.440269947052002, + "learning_rate": 2.4146466521685322e-05, + "loss": 0.5754, + "step": 175470 + }, + { + "epoch": 1.551300411959193, + "grad_norm": 1.4280959367752075, + "learning_rate": 2.414499313401345e-05, + "loss": 0.6204, + "step": 175480 + }, + { + "epoch": 1.5513888152195054, + "grad_norm": 1.9678702354431152, + "learning_rate": 2.414351974634158e-05, + "loss": 0.5157, + "step": 175490 + }, + { + "epoch": 1.5514772184798176, + "grad_norm": 8.589799880981445, + "learning_rate": 2.414204635866971e-05, + "loss": 0.6506, + "step": 175500 + }, + { + "epoch": 1.5515656217401297, + "grad_norm": 1.5467242002487183, + "learning_rate": 2.414057297099784e-05, + "loss": 0.5876, + "step": 175510 + }, + { + "epoch": 1.551654025000442, + "grad_norm": 6.826015472412109, + "learning_rate": 2.4139099583325968e-05, + "loss": 0.5367, + "step": 175520 + }, + { + "epoch": 1.5517424282607544, + "grad_norm": 2.0968873500823975, + "learning_rate": 2.41376261956541e-05, + "loss": 0.5819, + "step": 175530 + }, + { + "epoch": 1.5518308315210665, + "grad_norm": 1.9116806983947754, + "learning_rate": 2.4136152807982228e-05, + "loss": 0.7346, + "step": 175540 + }, + { + "epoch": 1.5519192347813786, + "grad_norm": 2.7878074645996094, + "learning_rate": 2.4134679420310356e-05, + "loss": 0.6796, + "step": 175550 + }, + { + "epoch": 1.552007638041691, + "grad_norm": 5.731505393981934, + "learning_rate": 2.4133206032638488e-05, + "loss": 0.5371, + "step": 175560 + }, + { + "epoch": 1.5520960413020033, + "grad_norm": 2.0684056282043457, + "learning_rate": 2.4131732644966616e-05, + "loss": 0.555, + "step": 175570 + }, + { + "epoch": 1.5521844445623154, + "grad_norm": 7.5566487312316895, + "learning_rate": 2.4130259257294745e-05, + "loss": 0.6837, + "step": 175580 + }, + { + "epoch": 1.5522728478226275, + "grad_norm": 3.572631597518921, + "learning_rate": 2.4128785869622873e-05, + "loss": 0.5693, + "step": 175590 + }, + { + "epoch": 1.55236125108294, + "grad_norm": 2.8073689937591553, + "learning_rate": 2.4127312481951005e-05, + "loss": 0.6157, + "step": 175600 + }, + { + "epoch": 1.5524496543432522, + "grad_norm": 1.7215214967727661, + "learning_rate": 2.4125839094279133e-05, + "loss": 0.6523, + "step": 175610 + }, + { + "epoch": 1.5525380576035643, + "grad_norm": 4.865016460418701, + "learning_rate": 2.412436570660726e-05, + "loss": 0.5473, + "step": 175620 + }, + { + "epoch": 1.5526264608638767, + "grad_norm": 1.9606802463531494, + "learning_rate": 2.412289231893539e-05, + "loss": 0.6233, + "step": 175630 + }, + { + "epoch": 1.552714864124189, + "grad_norm": 6.350049018859863, + "learning_rate": 2.412141893126352e-05, + "loss": 0.4618, + "step": 175640 + }, + { + "epoch": 1.5528032673845011, + "grad_norm": 13.56799030303955, + "learning_rate": 2.411994554359165e-05, + "loss": 0.4577, + "step": 175650 + }, + { + "epoch": 1.5528916706448133, + "grad_norm": 3.8896825313568115, + "learning_rate": 2.4118472155919778e-05, + "loss": 0.6352, + "step": 175660 + }, + { + "epoch": 1.5529800739051256, + "grad_norm": 2.473496675491333, + "learning_rate": 2.411699876824791e-05, + "loss": 0.6829, + "step": 175670 + }, + { + "epoch": 1.553068477165438, + "grad_norm": 3.9788055419921875, + "learning_rate": 2.4115525380576038e-05, + "loss": 0.5836, + "step": 175680 + }, + { + "epoch": 1.55315688042575, + "grad_norm": 2.101442575454712, + "learning_rate": 2.4114051992904167e-05, + "loss": 0.6256, + "step": 175690 + }, + { + "epoch": 1.5532452836860622, + "grad_norm": 4.8250041007995605, + "learning_rate": 2.4112578605232295e-05, + "loss": 0.6183, + "step": 175700 + }, + { + "epoch": 1.5533336869463747, + "grad_norm": 5.208817481994629, + "learning_rate": 2.4111105217560427e-05, + "loss": 0.6138, + "step": 175710 + }, + { + "epoch": 1.5534220902066869, + "grad_norm": 3.2474365234375, + "learning_rate": 2.4109631829888555e-05, + "loss": 0.5583, + "step": 175720 + }, + { + "epoch": 1.553510493466999, + "grad_norm": 2.2283647060394287, + "learning_rate": 2.4108158442216683e-05, + "loss": 0.5087, + "step": 175730 + }, + { + "epoch": 1.5535988967273113, + "grad_norm": 3.2932844161987305, + "learning_rate": 2.4106685054544812e-05, + "loss": 0.5104, + "step": 175740 + }, + { + "epoch": 1.5536872999876237, + "grad_norm": 3.431257963180542, + "learning_rate": 2.4105211666872943e-05, + "loss": 0.5497, + "step": 175750 + }, + { + "epoch": 1.5537757032479358, + "grad_norm": 1.9445186853408813, + "learning_rate": 2.4103738279201072e-05, + "loss": 0.5849, + "step": 175760 + }, + { + "epoch": 1.553864106508248, + "grad_norm": 8.358589172363281, + "learning_rate": 2.41022648915292e-05, + "loss": 0.6241, + "step": 175770 + }, + { + "epoch": 1.5539525097685603, + "grad_norm": 2.708808660507202, + "learning_rate": 2.4100791503857332e-05, + "loss": 0.7163, + "step": 175780 + }, + { + "epoch": 1.5540409130288726, + "grad_norm": 1.8191673755645752, + "learning_rate": 2.409931811618546e-05, + "loss": 0.5747, + "step": 175790 + }, + { + "epoch": 1.5541293162891847, + "grad_norm": 6.4892144203186035, + "learning_rate": 2.409784472851359e-05, + "loss": 0.6796, + "step": 175800 + }, + { + "epoch": 1.5542177195494968, + "grad_norm": 2.453010082244873, + "learning_rate": 2.4096371340841717e-05, + "loss": 0.5122, + "step": 175810 + }, + { + "epoch": 1.5543061228098092, + "grad_norm": 3.2654922008514404, + "learning_rate": 2.409489795316985e-05, + "loss": 0.619, + "step": 175820 + }, + { + "epoch": 1.5543945260701215, + "grad_norm": 3.2870142459869385, + "learning_rate": 2.4093424565497977e-05, + "loss": 0.6954, + "step": 175830 + }, + { + "epoch": 1.5544829293304336, + "grad_norm": 1.6599255800247192, + "learning_rate": 2.4091951177826105e-05, + "loss": 0.6912, + "step": 175840 + }, + { + "epoch": 1.554571332590746, + "grad_norm": 1.4668525457382202, + "learning_rate": 2.4090477790154237e-05, + "loss": 0.5591, + "step": 175850 + }, + { + "epoch": 1.5546597358510583, + "grad_norm": 5.512995719909668, + "learning_rate": 2.4089004402482366e-05, + "loss": 0.6056, + "step": 175860 + }, + { + "epoch": 1.5547481391113704, + "grad_norm": 1.3192214965820312, + "learning_rate": 2.4087531014810494e-05, + "loss": 0.4609, + "step": 175870 + }, + { + "epoch": 1.5548365423716826, + "grad_norm": 2.4650135040283203, + "learning_rate": 2.4086057627138622e-05, + "loss": 0.7683, + "step": 175880 + }, + { + "epoch": 1.554924945631995, + "grad_norm": 4.417902946472168, + "learning_rate": 2.4084584239466754e-05, + "loss": 0.5397, + "step": 175890 + }, + { + "epoch": 1.5550133488923072, + "grad_norm": 2.7622385025024414, + "learning_rate": 2.4083110851794882e-05, + "loss": 0.5373, + "step": 175900 + }, + { + "epoch": 1.5551017521526194, + "grad_norm": 3.459970474243164, + "learning_rate": 2.408163746412301e-05, + "loss": 0.6649, + "step": 175910 + }, + { + "epoch": 1.5551901554129315, + "grad_norm": 1.4808686971664429, + "learning_rate": 2.408016407645114e-05, + "loss": 0.6733, + "step": 175920 + }, + { + "epoch": 1.5552785586732438, + "grad_norm": 3.042663097381592, + "learning_rate": 2.407869068877927e-05, + "loss": 0.6793, + "step": 175930 + }, + { + "epoch": 1.5553669619335562, + "grad_norm": 8.79601001739502, + "learning_rate": 2.40772173011074e-05, + "loss": 0.5742, + "step": 175940 + }, + { + "epoch": 1.5554553651938683, + "grad_norm": 3.032383680343628, + "learning_rate": 2.4075743913435528e-05, + "loss": 0.6242, + "step": 175950 + }, + { + "epoch": 1.5555437684541806, + "grad_norm": 4.006124496459961, + "learning_rate": 2.407427052576366e-05, + "loss": 0.5409, + "step": 175960 + }, + { + "epoch": 1.555632171714493, + "grad_norm": 3.9470105171203613, + "learning_rate": 2.4072797138091788e-05, + "loss": 0.628, + "step": 175970 + }, + { + "epoch": 1.555720574974805, + "grad_norm": 6.362183094024658, + "learning_rate": 2.4071323750419916e-05, + "loss": 0.6373, + "step": 175980 + }, + { + "epoch": 1.5558089782351172, + "grad_norm": 1.9918493032455444, + "learning_rate": 2.4069850362748044e-05, + "loss": 0.5338, + "step": 175990 + }, + { + "epoch": 1.5558973814954296, + "grad_norm": 1.1010850667953491, + "learning_rate": 2.4068376975076176e-05, + "loss": 0.7065, + "step": 176000 + }, + { + "epoch": 1.555985784755742, + "grad_norm": 1.8384859561920166, + "learning_rate": 2.4066903587404304e-05, + "loss": 0.6622, + "step": 176010 + }, + { + "epoch": 1.556074188016054, + "grad_norm": 1.623852014541626, + "learning_rate": 2.4065430199732433e-05, + "loss": 0.6095, + "step": 176020 + }, + { + "epoch": 1.5561625912763661, + "grad_norm": 2.5296456813812256, + "learning_rate": 2.4063956812060564e-05, + "loss": 0.7166, + "step": 176030 + }, + { + "epoch": 1.5562509945366785, + "grad_norm": 4.086573123931885, + "learning_rate": 2.4062483424388693e-05, + "loss": 0.6613, + "step": 176040 + }, + { + "epoch": 1.5563393977969908, + "grad_norm": 3.509824275970459, + "learning_rate": 2.406101003671682e-05, + "loss": 0.6629, + "step": 176050 + }, + { + "epoch": 1.556427801057303, + "grad_norm": 2.155970811843872, + "learning_rate": 2.405953664904495e-05, + "loss": 0.4275, + "step": 176060 + }, + { + "epoch": 1.5565162043176153, + "grad_norm": 6.274196147918701, + "learning_rate": 2.405806326137308e-05, + "loss": 0.6362, + "step": 176070 + }, + { + "epoch": 1.5566046075779276, + "grad_norm": 1.7009046077728271, + "learning_rate": 2.405658987370121e-05, + "loss": 0.5711, + "step": 176080 + }, + { + "epoch": 1.5566930108382397, + "grad_norm": 4.581592559814453, + "learning_rate": 2.4055116486029338e-05, + "loss": 0.7456, + "step": 176090 + }, + { + "epoch": 1.5567814140985519, + "grad_norm": 6.589500427246094, + "learning_rate": 2.4053643098357466e-05, + "loss": 0.5122, + "step": 176100 + }, + { + "epoch": 1.5568698173588642, + "grad_norm": 6.289793014526367, + "learning_rate": 2.4052169710685598e-05, + "loss": 0.7746, + "step": 176110 + }, + { + "epoch": 1.5569582206191765, + "grad_norm": 1.7730607986450195, + "learning_rate": 2.4050696323013726e-05, + "loss": 0.5726, + "step": 176120 + }, + { + "epoch": 1.5570466238794887, + "grad_norm": 7.2734575271606445, + "learning_rate": 2.4049222935341855e-05, + "loss": 0.5616, + "step": 176130 + }, + { + "epoch": 1.5571350271398008, + "grad_norm": 2.77254581451416, + "learning_rate": 2.4047749547669987e-05, + "loss": 0.5998, + "step": 176140 + }, + { + "epoch": 1.5572234304001131, + "grad_norm": 1.6330997943878174, + "learning_rate": 2.4046276159998115e-05, + "loss": 0.4892, + "step": 176150 + }, + { + "epoch": 1.5573118336604255, + "grad_norm": 1.2648051977157593, + "learning_rate": 2.4044802772326243e-05, + "loss": 0.5772, + "step": 176160 + }, + { + "epoch": 1.5574002369207376, + "grad_norm": 8.252049446105957, + "learning_rate": 2.404332938465437e-05, + "loss": 0.6662, + "step": 176170 + }, + { + "epoch": 1.55748864018105, + "grad_norm": 9.004280090332031, + "learning_rate": 2.4041855996982503e-05, + "loss": 0.6576, + "step": 176180 + }, + { + "epoch": 1.5575770434413623, + "grad_norm": 2.663370132446289, + "learning_rate": 2.4040382609310632e-05, + "loss": 0.5891, + "step": 176190 + }, + { + "epoch": 1.5576654467016744, + "grad_norm": 8.976247787475586, + "learning_rate": 2.403890922163876e-05, + "loss": 0.554, + "step": 176200 + }, + { + "epoch": 1.5577538499619865, + "grad_norm": 10.197449684143066, + "learning_rate": 2.403743583396689e-05, + "loss": 0.6058, + "step": 176210 + }, + { + "epoch": 1.5578422532222989, + "grad_norm": 6.126716613769531, + "learning_rate": 2.403596244629502e-05, + "loss": 0.5604, + "step": 176220 + }, + { + "epoch": 1.5579306564826112, + "grad_norm": 1.966170072555542, + "learning_rate": 2.403448905862315e-05, + "loss": 0.5347, + "step": 176230 + }, + { + "epoch": 1.5580190597429233, + "grad_norm": 6.774153709411621, + "learning_rate": 2.4033015670951277e-05, + "loss": 0.6212, + "step": 176240 + }, + { + "epoch": 1.5581074630032354, + "grad_norm": 3.204974412918091, + "learning_rate": 2.403154228327941e-05, + "loss": 0.6954, + "step": 176250 + }, + { + "epoch": 1.5581958662635478, + "grad_norm": 2.6879467964172363, + "learning_rate": 2.4030068895607537e-05, + "loss": 0.595, + "step": 176260 + }, + { + "epoch": 1.5582842695238601, + "grad_norm": 2.4986608028411865, + "learning_rate": 2.4028595507935665e-05, + "loss": 0.4526, + "step": 176270 + }, + { + "epoch": 1.5583726727841722, + "grad_norm": 17.314674377441406, + "learning_rate": 2.4027122120263794e-05, + "loss": 0.5682, + "step": 176280 + }, + { + "epoch": 1.5584610760444844, + "grad_norm": 10.226405143737793, + "learning_rate": 2.4025648732591925e-05, + "loss": 0.6113, + "step": 176290 + }, + { + "epoch": 1.558549479304797, + "grad_norm": 2.781558036804199, + "learning_rate": 2.4024175344920054e-05, + "loss": 0.5478, + "step": 176300 + }, + { + "epoch": 1.558637882565109, + "grad_norm": 2.374439001083374, + "learning_rate": 2.4022701957248182e-05, + "loss": 0.4678, + "step": 176310 + }, + { + "epoch": 1.5587262858254212, + "grad_norm": 1.906740427017212, + "learning_rate": 2.4021228569576314e-05, + "loss": 0.6328, + "step": 176320 + }, + { + "epoch": 1.5588146890857335, + "grad_norm": 0.9516133069992065, + "learning_rate": 2.4019755181904442e-05, + "loss": 0.5999, + "step": 176330 + }, + { + "epoch": 1.5589030923460458, + "grad_norm": 2.762033462524414, + "learning_rate": 2.401828179423257e-05, + "loss": 0.6909, + "step": 176340 + }, + { + "epoch": 1.558991495606358, + "grad_norm": 1.5850287675857544, + "learning_rate": 2.4016808406560702e-05, + "loss": 0.5599, + "step": 176350 + }, + { + "epoch": 1.55907989886667, + "grad_norm": 1.2970778942108154, + "learning_rate": 2.401533501888883e-05, + "loss": 0.5156, + "step": 176360 + }, + { + "epoch": 1.5591683021269824, + "grad_norm": 2.068624973297119, + "learning_rate": 2.401386163121696e-05, + "loss": 0.6752, + "step": 176370 + }, + { + "epoch": 1.5592567053872948, + "grad_norm": 4.437560081481934, + "learning_rate": 2.401238824354509e-05, + "loss": 0.6849, + "step": 176380 + }, + { + "epoch": 1.559345108647607, + "grad_norm": 11.547019958496094, + "learning_rate": 2.401091485587322e-05, + "loss": 0.5907, + "step": 176390 + }, + { + "epoch": 1.559433511907919, + "grad_norm": 17.42321014404297, + "learning_rate": 2.4009441468201347e-05, + "loss": 0.7714, + "step": 176400 + }, + { + "epoch": 1.5595219151682314, + "grad_norm": 16.337127685546875, + "learning_rate": 2.400796808052948e-05, + "loss": 0.7394, + "step": 176410 + }, + { + "epoch": 1.5596103184285437, + "grad_norm": 4.904205799102783, + "learning_rate": 2.4006494692857608e-05, + "loss": 0.7249, + "step": 176420 + }, + { + "epoch": 1.5596987216888558, + "grad_norm": 5.072138786315918, + "learning_rate": 2.4005021305185736e-05, + "loss": 0.5635, + "step": 176430 + }, + { + "epoch": 1.5597871249491682, + "grad_norm": 0.8053647875785828, + "learning_rate": 2.4003547917513868e-05, + "loss": 0.5783, + "step": 176440 + }, + { + "epoch": 1.5598755282094805, + "grad_norm": 2.298722743988037, + "learning_rate": 2.4002074529841996e-05, + "loss": 0.5057, + "step": 176450 + }, + { + "epoch": 1.5599639314697926, + "grad_norm": 3.5129120349884033, + "learning_rate": 2.4000601142170124e-05, + "loss": 0.6155, + "step": 176460 + }, + { + "epoch": 1.5600523347301047, + "grad_norm": 6.439996242523193, + "learning_rate": 2.3999127754498256e-05, + "loss": 0.4902, + "step": 176470 + }, + { + "epoch": 1.560140737990417, + "grad_norm": 2.6693789958953857, + "learning_rate": 2.3997654366826384e-05, + "loss": 0.6106, + "step": 176480 + }, + { + "epoch": 1.5602291412507294, + "grad_norm": 2.057903528213501, + "learning_rate": 2.3996180979154513e-05, + "loss": 0.4878, + "step": 176490 + }, + { + "epoch": 1.5603175445110415, + "grad_norm": 4.72182035446167, + "learning_rate": 2.3994707591482645e-05, + "loss": 0.5856, + "step": 176500 + }, + { + "epoch": 1.5604059477713537, + "grad_norm": 7.757410049438477, + "learning_rate": 2.3993234203810773e-05, + "loss": 0.5483, + "step": 176510 + }, + { + "epoch": 1.560494351031666, + "grad_norm": 5.265251159667969, + "learning_rate": 2.39917608161389e-05, + "loss": 0.5218, + "step": 176520 + }, + { + "epoch": 1.5605827542919783, + "grad_norm": 6.178158283233643, + "learning_rate": 2.399028742846703e-05, + "loss": 0.5849, + "step": 176530 + }, + { + "epoch": 1.5606711575522905, + "grad_norm": 4.1125874519348145, + "learning_rate": 2.398881404079516e-05, + "loss": 0.6595, + "step": 176540 + }, + { + "epoch": 1.5607595608126028, + "grad_norm": 1.4558113813400269, + "learning_rate": 2.398734065312329e-05, + "loss": 0.5954, + "step": 176550 + }, + { + "epoch": 1.5608479640729152, + "grad_norm": 3.422471523284912, + "learning_rate": 2.3985867265451418e-05, + "loss": 0.551, + "step": 176560 + }, + { + "epoch": 1.5609363673332273, + "grad_norm": 2.989250659942627, + "learning_rate": 2.3984393877779546e-05, + "loss": 0.5014, + "step": 176570 + }, + { + "epoch": 1.5610247705935394, + "grad_norm": 9.05696964263916, + "learning_rate": 2.3982920490107678e-05, + "loss": 0.7459, + "step": 176580 + }, + { + "epoch": 1.5611131738538517, + "grad_norm": 2.3023223876953125, + "learning_rate": 2.3981447102435807e-05, + "loss": 0.5984, + "step": 176590 + }, + { + "epoch": 1.561201577114164, + "grad_norm": 6.538005352020264, + "learning_rate": 2.3979973714763935e-05, + "loss": 0.6604, + "step": 176600 + }, + { + "epoch": 1.5612899803744762, + "grad_norm": 7.982250690460205, + "learning_rate": 2.3978500327092067e-05, + "loss": 0.5231, + "step": 176610 + }, + { + "epoch": 1.5613783836347883, + "grad_norm": 4.236746311187744, + "learning_rate": 2.3977026939420195e-05, + "loss": 0.5771, + "step": 176620 + }, + { + "epoch": 1.5614667868951007, + "grad_norm": 1.882865071296692, + "learning_rate": 2.3975553551748323e-05, + "loss": 0.4318, + "step": 176630 + }, + { + "epoch": 1.561555190155413, + "grad_norm": 4.176548480987549, + "learning_rate": 2.397408016407645e-05, + "loss": 0.6464, + "step": 176640 + }, + { + "epoch": 1.5616435934157251, + "grad_norm": 8.582466125488281, + "learning_rate": 2.3972606776404583e-05, + "loss": 0.5813, + "step": 176650 + }, + { + "epoch": 1.5617319966760375, + "grad_norm": 5.795729637145996, + "learning_rate": 2.3971133388732712e-05, + "loss": 0.5705, + "step": 176660 + }, + { + "epoch": 1.5618203999363498, + "grad_norm": 6.867427349090576, + "learning_rate": 2.396966000106084e-05, + "loss": 0.5965, + "step": 176670 + }, + { + "epoch": 1.561908803196662, + "grad_norm": 1.722930908203125, + "learning_rate": 2.3968186613388972e-05, + "loss": 0.5778, + "step": 176680 + }, + { + "epoch": 1.561997206456974, + "grad_norm": 21.46084976196289, + "learning_rate": 2.39667132257171e-05, + "loss": 0.6877, + "step": 176690 + }, + { + "epoch": 1.5620856097172864, + "grad_norm": 1.9725826978683472, + "learning_rate": 2.396523983804523e-05, + "loss": 0.5208, + "step": 176700 + }, + { + "epoch": 1.5621740129775987, + "grad_norm": 2.8085975646972656, + "learning_rate": 2.3963766450373357e-05, + "loss": 0.5655, + "step": 176710 + }, + { + "epoch": 1.5622624162379108, + "grad_norm": 1.2130274772644043, + "learning_rate": 2.396229306270149e-05, + "loss": 0.5443, + "step": 176720 + }, + { + "epoch": 1.562350819498223, + "grad_norm": 1.321975588798523, + "learning_rate": 2.3960819675029617e-05, + "loss": 0.6458, + "step": 176730 + }, + { + "epoch": 1.5624392227585353, + "grad_norm": 2.276165008544922, + "learning_rate": 2.3959346287357745e-05, + "loss": 0.545, + "step": 176740 + }, + { + "epoch": 1.5625276260188476, + "grad_norm": 1.0494939088821411, + "learning_rate": 2.3957872899685874e-05, + "loss": 0.6316, + "step": 176750 + }, + { + "epoch": 1.5626160292791598, + "grad_norm": 3.4098010063171387, + "learning_rate": 2.3956399512014005e-05, + "loss": 0.5825, + "step": 176760 + }, + { + "epoch": 1.562704432539472, + "grad_norm": 1.3356003761291504, + "learning_rate": 2.3954926124342134e-05, + "loss": 0.5101, + "step": 176770 + }, + { + "epoch": 1.5627928357997845, + "grad_norm": 2.826198101043701, + "learning_rate": 2.3953452736670262e-05, + "loss": 0.6422, + "step": 176780 + }, + { + "epoch": 1.5628812390600966, + "grad_norm": 7.472723484039307, + "learning_rate": 2.3951979348998394e-05, + "loss": 0.5543, + "step": 176790 + }, + { + "epoch": 1.5629696423204087, + "grad_norm": 1.6344304084777832, + "learning_rate": 2.3950505961326522e-05, + "loss": 0.5944, + "step": 176800 + }, + { + "epoch": 1.563058045580721, + "grad_norm": 1.928816556930542, + "learning_rate": 2.394903257365465e-05, + "loss": 0.5692, + "step": 176810 + }, + { + "epoch": 1.5631464488410334, + "grad_norm": 1.4873366355895996, + "learning_rate": 2.394755918598278e-05, + "loss": 0.6319, + "step": 176820 + }, + { + "epoch": 1.5632348521013455, + "grad_norm": 3.338905096054077, + "learning_rate": 2.394608579831091e-05, + "loss": 0.576, + "step": 176830 + }, + { + "epoch": 1.5633232553616576, + "grad_norm": 10.962322235107422, + "learning_rate": 2.394461241063904e-05, + "loss": 0.6467, + "step": 176840 + }, + { + "epoch": 1.56341165862197, + "grad_norm": 2.717649221420288, + "learning_rate": 2.3943139022967167e-05, + "loss": 0.6829, + "step": 176850 + }, + { + "epoch": 1.5635000618822823, + "grad_norm": 6.951993942260742, + "learning_rate": 2.3941665635295296e-05, + "loss": 0.5802, + "step": 176860 + }, + { + "epoch": 1.5635884651425944, + "grad_norm": 1.9833133220672607, + "learning_rate": 2.3940192247623428e-05, + "loss": 0.5176, + "step": 176870 + }, + { + "epoch": 1.5636768684029065, + "grad_norm": 1.712316870689392, + "learning_rate": 2.3938718859951556e-05, + "loss": 0.5376, + "step": 176880 + }, + { + "epoch": 1.563765271663219, + "grad_norm": 1.2919236421585083, + "learning_rate": 2.3937245472279684e-05, + "loss": 0.5559, + "step": 176890 + }, + { + "epoch": 1.5638536749235312, + "grad_norm": 11.732109069824219, + "learning_rate": 2.3935772084607816e-05, + "loss": 0.7083, + "step": 176900 + }, + { + "epoch": 1.5639420781838433, + "grad_norm": 4.852503299713135, + "learning_rate": 2.3934298696935944e-05, + "loss": 0.513, + "step": 176910 + }, + { + "epoch": 1.5640304814441557, + "grad_norm": 1.8035823106765747, + "learning_rate": 2.3932825309264073e-05, + "loss": 0.6046, + "step": 176920 + }, + { + "epoch": 1.564118884704468, + "grad_norm": 1.3335148096084595, + "learning_rate": 2.39313519215922e-05, + "loss": 0.6209, + "step": 176930 + }, + { + "epoch": 1.5642072879647801, + "grad_norm": 2.9932949542999268, + "learning_rate": 2.3929878533920333e-05, + "loss": 0.5952, + "step": 176940 + }, + { + "epoch": 1.5642956912250923, + "grad_norm": 2.053060531616211, + "learning_rate": 2.392840514624846e-05, + "loss": 0.6696, + "step": 176950 + }, + { + "epoch": 1.5643840944854046, + "grad_norm": 8.876794815063477, + "learning_rate": 2.392693175857659e-05, + "loss": 0.6265, + "step": 176960 + }, + { + "epoch": 1.564472497745717, + "grad_norm": 1.3175852298736572, + "learning_rate": 2.392545837090472e-05, + "loss": 0.5908, + "step": 176970 + }, + { + "epoch": 1.564560901006029, + "grad_norm": 1.8960905075073242, + "learning_rate": 2.392398498323285e-05, + "loss": 0.5778, + "step": 176980 + }, + { + "epoch": 1.5646493042663412, + "grad_norm": 4.270967483520508, + "learning_rate": 2.3922511595560978e-05, + "loss": 0.6514, + "step": 176990 + }, + { + "epoch": 1.5647377075266535, + "grad_norm": 4.1663689613342285, + "learning_rate": 2.3921038207889106e-05, + "loss": 0.5853, + "step": 177000 + }, + { + "epoch": 1.5648261107869659, + "grad_norm": 1.3195915222167969, + "learning_rate": 2.3919564820217238e-05, + "loss": 0.6515, + "step": 177010 + }, + { + "epoch": 1.564914514047278, + "grad_norm": 12.04109001159668, + "learning_rate": 2.3918091432545366e-05, + "loss": 0.5227, + "step": 177020 + }, + { + "epoch": 1.5650029173075903, + "grad_norm": 1.4092333316802979, + "learning_rate": 2.3916618044873495e-05, + "loss": 0.6642, + "step": 177030 + }, + { + "epoch": 1.5650913205679027, + "grad_norm": 3.3699228763580322, + "learning_rate": 2.3915144657201623e-05, + "loss": 0.5208, + "step": 177040 + }, + { + "epoch": 1.5651797238282148, + "grad_norm": 2.845052480697632, + "learning_rate": 2.3913671269529755e-05, + "loss": 0.6047, + "step": 177050 + }, + { + "epoch": 1.565268127088527, + "grad_norm": 5.391455173492432, + "learning_rate": 2.3912197881857883e-05, + "loss": 0.5443, + "step": 177060 + }, + { + "epoch": 1.5653565303488393, + "grad_norm": 1.5082225799560547, + "learning_rate": 2.391072449418601e-05, + "loss": 0.6542, + "step": 177070 + }, + { + "epoch": 1.5654449336091516, + "grad_norm": 0.50357586145401, + "learning_rate": 2.3909251106514143e-05, + "loss": 0.4891, + "step": 177080 + }, + { + "epoch": 1.5655333368694637, + "grad_norm": 2.5022904872894287, + "learning_rate": 2.390777771884227e-05, + "loss": 0.6431, + "step": 177090 + }, + { + "epoch": 1.5656217401297758, + "grad_norm": 1.3826407194137573, + "learning_rate": 2.39063043311704e-05, + "loss": 0.5379, + "step": 177100 + }, + { + "epoch": 1.5657101433900882, + "grad_norm": 2.0808982849121094, + "learning_rate": 2.390483094349853e-05, + "loss": 0.4533, + "step": 177110 + }, + { + "epoch": 1.5657985466504005, + "grad_norm": 1.225738286972046, + "learning_rate": 2.390335755582666e-05, + "loss": 0.6256, + "step": 177120 + }, + { + "epoch": 1.5658869499107126, + "grad_norm": 1.853959321975708, + "learning_rate": 2.390188416815479e-05, + "loss": 0.5257, + "step": 177130 + }, + { + "epoch": 1.565975353171025, + "grad_norm": 1.8163670301437378, + "learning_rate": 2.3900410780482917e-05, + "loss": 0.5124, + "step": 177140 + }, + { + "epoch": 1.5660637564313373, + "grad_norm": 2.156202554702759, + "learning_rate": 2.389893739281105e-05, + "loss": 0.4939, + "step": 177150 + }, + { + "epoch": 1.5661521596916494, + "grad_norm": 1.2103588581085205, + "learning_rate": 2.3897464005139177e-05, + "loss": 0.527, + "step": 177160 + }, + { + "epoch": 1.5662405629519616, + "grad_norm": 7.839535713195801, + "learning_rate": 2.3895990617467305e-05, + "loss": 0.6572, + "step": 177170 + }, + { + "epoch": 1.566328966212274, + "grad_norm": 1.2732614278793335, + "learning_rate": 2.3894517229795434e-05, + "loss": 0.7046, + "step": 177180 + }, + { + "epoch": 1.5664173694725863, + "grad_norm": 1.1127547025680542, + "learning_rate": 2.3893043842123565e-05, + "loss": 0.6706, + "step": 177190 + }, + { + "epoch": 1.5665057727328984, + "grad_norm": 1.468625545501709, + "learning_rate": 2.3891570454451694e-05, + "loss": 0.5455, + "step": 177200 + }, + { + "epoch": 1.5665941759932105, + "grad_norm": 1.4811850786209106, + "learning_rate": 2.3890097066779822e-05, + "loss": 0.5632, + "step": 177210 + }, + { + "epoch": 1.5666825792535228, + "grad_norm": 13.830153465270996, + "learning_rate": 2.388862367910795e-05, + "loss": 0.6757, + "step": 177220 + }, + { + "epoch": 1.5667709825138352, + "grad_norm": 2.274662971496582, + "learning_rate": 2.3887150291436082e-05, + "loss": 0.5751, + "step": 177230 + }, + { + "epoch": 1.5668593857741473, + "grad_norm": 3.429499626159668, + "learning_rate": 2.388567690376421e-05, + "loss": 0.5757, + "step": 177240 + }, + { + "epoch": 1.5669477890344596, + "grad_norm": 4.459138870239258, + "learning_rate": 2.388420351609234e-05, + "loss": 0.6723, + "step": 177250 + }, + { + "epoch": 1.567036192294772, + "grad_norm": 2.4164700508117676, + "learning_rate": 2.388273012842047e-05, + "loss": 0.5153, + "step": 177260 + }, + { + "epoch": 1.567124595555084, + "grad_norm": 2.449877977371216, + "learning_rate": 2.38812567407486e-05, + "loss": 0.8246, + "step": 177270 + }, + { + "epoch": 1.5672129988153962, + "grad_norm": 3.5852701663970947, + "learning_rate": 2.3879783353076727e-05, + "loss": 0.7463, + "step": 177280 + }, + { + "epoch": 1.5673014020757086, + "grad_norm": 1.6659650802612305, + "learning_rate": 2.387830996540486e-05, + "loss": 0.5225, + "step": 177290 + }, + { + "epoch": 1.567389805336021, + "grad_norm": 2.45802903175354, + "learning_rate": 2.3876836577732987e-05, + "loss": 0.5015, + "step": 177300 + }, + { + "epoch": 1.567478208596333, + "grad_norm": 1.3427000045776367, + "learning_rate": 2.3875363190061116e-05, + "loss": 0.579, + "step": 177310 + }, + { + "epoch": 1.5675666118566451, + "grad_norm": 5.278824329376221, + "learning_rate": 2.3873889802389248e-05, + "loss": 0.6429, + "step": 177320 + }, + { + "epoch": 1.5676550151169575, + "grad_norm": 1.3126202821731567, + "learning_rate": 2.3872416414717376e-05, + "loss": 0.5997, + "step": 177330 + }, + { + "epoch": 1.5677434183772698, + "grad_norm": 2.5202386379241943, + "learning_rate": 2.3870943027045504e-05, + "loss": 0.5816, + "step": 177340 + }, + { + "epoch": 1.567831821637582, + "grad_norm": 5.427288055419922, + "learning_rate": 2.3869469639373636e-05, + "loss": 0.7357, + "step": 177350 + }, + { + "epoch": 1.5679202248978943, + "grad_norm": 5.297801971435547, + "learning_rate": 2.3867996251701764e-05, + "loss": 0.5988, + "step": 177360 + }, + { + "epoch": 1.5680086281582066, + "grad_norm": 2.1023435592651367, + "learning_rate": 2.3866522864029893e-05, + "loss": 0.6773, + "step": 177370 + }, + { + "epoch": 1.5680970314185187, + "grad_norm": 3.128377914428711, + "learning_rate": 2.3865049476358024e-05, + "loss": 0.458, + "step": 177380 + }, + { + "epoch": 1.5681854346788309, + "grad_norm": 1.211957335472107, + "learning_rate": 2.3863576088686153e-05, + "loss": 0.568, + "step": 177390 + }, + { + "epoch": 1.5682738379391432, + "grad_norm": 2.160508871078491, + "learning_rate": 2.386210270101428e-05, + "loss": 0.73, + "step": 177400 + }, + { + "epoch": 1.5683622411994556, + "grad_norm": 1.121775507926941, + "learning_rate": 2.3860629313342413e-05, + "loss": 0.6873, + "step": 177410 + }, + { + "epoch": 1.5684506444597677, + "grad_norm": 9.46325969696045, + "learning_rate": 2.385915592567054e-05, + "loss": 0.5065, + "step": 177420 + }, + { + "epoch": 1.5685390477200798, + "grad_norm": 10.104605674743652, + "learning_rate": 2.385768253799867e-05, + "loss": 0.6625, + "step": 177430 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 4.208607196807861, + "learning_rate": 2.38562091503268e-05, + "loss": 0.5357, + "step": 177440 + }, + { + "epoch": 1.5687158542407045, + "grad_norm": 7.169487476348877, + "learning_rate": 2.385473576265493e-05, + "loss": 0.6935, + "step": 177450 + }, + { + "epoch": 1.5688042575010166, + "grad_norm": 2.8854124546051025, + "learning_rate": 2.3853262374983058e-05, + "loss": 0.5694, + "step": 177460 + }, + { + "epoch": 1.5688926607613287, + "grad_norm": 2.144292116165161, + "learning_rate": 2.3851788987311186e-05, + "loss": 0.5011, + "step": 177470 + }, + { + "epoch": 1.5689810640216413, + "grad_norm": 2.791090965270996, + "learning_rate": 2.3850315599639318e-05, + "loss": 0.742, + "step": 177480 + }, + { + "epoch": 1.5690694672819534, + "grad_norm": 10.48210620880127, + "learning_rate": 2.3848842211967446e-05, + "loss": 0.4566, + "step": 177490 + }, + { + "epoch": 1.5691578705422655, + "grad_norm": 2.357089042663574, + "learning_rate": 2.3847368824295575e-05, + "loss": 0.5526, + "step": 177500 + }, + { + "epoch": 1.5692462738025779, + "grad_norm": 2.4185054302215576, + "learning_rate": 2.3845895436623703e-05, + "loss": 0.5614, + "step": 177510 + }, + { + "epoch": 1.5693346770628902, + "grad_norm": 2.3175907135009766, + "learning_rate": 2.3844422048951835e-05, + "loss": 0.6569, + "step": 177520 + }, + { + "epoch": 1.5694230803232023, + "grad_norm": 5.830840110778809, + "learning_rate": 2.3842948661279963e-05, + "loss": 0.6688, + "step": 177530 + }, + { + "epoch": 1.5695114835835144, + "grad_norm": 2.5840630531311035, + "learning_rate": 2.384147527360809e-05, + "loss": 0.5568, + "step": 177540 + }, + { + "epoch": 1.5695998868438268, + "grad_norm": 3.6981008052825928, + "learning_rate": 2.3840001885936223e-05, + "loss": 0.455, + "step": 177550 + }, + { + "epoch": 1.5696882901041391, + "grad_norm": 2.26568865776062, + "learning_rate": 2.3838528498264352e-05, + "loss": 0.6471, + "step": 177560 + }, + { + "epoch": 1.5697766933644512, + "grad_norm": 3.4982657432556152, + "learning_rate": 2.383705511059248e-05, + "loss": 0.6207, + "step": 177570 + }, + { + "epoch": 1.5698650966247634, + "grad_norm": 11.61485481262207, + "learning_rate": 2.383558172292061e-05, + "loss": 0.6342, + "step": 177580 + }, + { + "epoch": 1.5699534998850757, + "grad_norm": 2.927955150604248, + "learning_rate": 2.383410833524874e-05, + "loss": 0.6117, + "step": 177590 + }, + { + "epoch": 1.570041903145388, + "grad_norm": 3.3633296489715576, + "learning_rate": 2.383263494757687e-05, + "loss": 0.6254, + "step": 177600 + }, + { + "epoch": 1.5701303064057002, + "grad_norm": 3.2824857234954834, + "learning_rate": 2.3831161559904997e-05, + "loss": 0.6156, + "step": 177610 + }, + { + "epoch": 1.5702187096660125, + "grad_norm": 1.9438114166259766, + "learning_rate": 2.382968817223313e-05, + "loss": 0.6046, + "step": 177620 + }, + { + "epoch": 1.5703071129263249, + "grad_norm": 9.858379364013672, + "learning_rate": 2.3828214784561257e-05, + "loss": 0.5039, + "step": 177630 + }, + { + "epoch": 1.570395516186637, + "grad_norm": 1.7021639347076416, + "learning_rate": 2.3826741396889385e-05, + "loss": 0.5935, + "step": 177640 + }, + { + "epoch": 1.570483919446949, + "grad_norm": 2.1032676696777344, + "learning_rate": 2.3825268009217514e-05, + "loss": 0.5879, + "step": 177650 + }, + { + "epoch": 1.5705723227072614, + "grad_norm": 1.8284764289855957, + "learning_rate": 2.3823794621545645e-05, + "loss": 0.5119, + "step": 177660 + }, + { + "epoch": 1.5706607259675738, + "grad_norm": 1.3647922277450562, + "learning_rate": 2.3822321233873774e-05, + "loss": 0.5886, + "step": 177670 + }, + { + "epoch": 1.570749129227886, + "grad_norm": 2.7354862689971924, + "learning_rate": 2.3820847846201902e-05, + "loss": 0.6214, + "step": 177680 + }, + { + "epoch": 1.570837532488198, + "grad_norm": 3.5631704330444336, + "learning_rate": 2.381937445853003e-05, + "loss": 0.5461, + "step": 177690 + }, + { + "epoch": 1.5709259357485104, + "grad_norm": 1.2752279043197632, + "learning_rate": 2.3817901070858162e-05, + "loss": 0.5554, + "step": 177700 + }, + { + "epoch": 1.5710143390088227, + "grad_norm": 3.2722902297973633, + "learning_rate": 2.381642768318629e-05, + "loss": 0.6625, + "step": 177710 + }, + { + "epoch": 1.5711027422691348, + "grad_norm": 1.4920198917388916, + "learning_rate": 2.381495429551442e-05, + "loss": 0.6535, + "step": 177720 + }, + { + "epoch": 1.5711911455294472, + "grad_norm": 2.660507917404175, + "learning_rate": 2.381348090784255e-05, + "loss": 0.6356, + "step": 177730 + }, + { + "epoch": 1.5712795487897595, + "grad_norm": 2.067870616912842, + "learning_rate": 2.381200752017068e-05, + "loss": 0.5859, + "step": 177740 + }, + { + "epoch": 1.5713679520500716, + "grad_norm": 1.3267247676849365, + "learning_rate": 2.3810534132498807e-05, + "loss": 0.4893, + "step": 177750 + }, + { + "epoch": 1.5714563553103837, + "grad_norm": 1.94123375415802, + "learning_rate": 2.3809060744826936e-05, + "loss": 0.5941, + "step": 177760 + }, + { + "epoch": 1.571544758570696, + "grad_norm": 5.056488037109375, + "learning_rate": 2.3807587357155067e-05, + "loss": 0.6583, + "step": 177770 + }, + { + "epoch": 1.5716331618310084, + "grad_norm": 1.3272475004196167, + "learning_rate": 2.3806113969483196e-05, + "loss": 0.5946, + "step": 177780 + }, + { + "epoch": 1.5717215650913205, + "grad_norm": 4.6792497634887695, + "learning_rate": 2.3804640581811324e-05, + "loss": 0.6402, + "step": 177790 + }, + { + "epoch": 1.5718099683516327, + "grad_norm": 2.2302956581115723, + "learning_rate": 2.3803167194139453e-05, + "loss": 0.6164, + "step": 177800 + }, + { + "epoch": 1.571898371611945, + "grad_norm": 5.343714714050293, + "learning_rate": 2.3801693806467584e-05, + "loss": 0.7028, + "step": 177810 + }, + { + "epoch": 1.5719867748722574, + "grad_norm": 2.8348476886749268, + "learning_rate": 2.3800220418795713e-05, + "loss": 0.6744, + "step": 177820 + }, + { + "epoch": 1.5720751781325695, + "grad_norm": 3.4458703994750977, + "learning_rate": 2.379874703112384e-05, + "loss": 0.6044, + "step": 177830 + }, + { + "epoch": 1.5721635813928818, + "grad_norm": 3.3119430541992188, + "learning_rate": 2.3797273643451973e-05, + "loss": 0.6851, + "step": 177840 + }, + { + "epoch": 1.5722519846531942, + "grad_norm": 14.754148483276367, + "learning_rate": 2.37958002557801e-05, + "loss": 0.732, + "step": 177850 + }, + { + "epoch": 1.5723403879135063, + "grad_norm": 1.7317376136779785, + "learning_rate": 2.379432686810823e-05, + "loss": 0.6028, + "step": 177860 + }, + { + "epoch": 1.5724287911738184, + "grad_norm": 2.2867050170898438, + "learning_rate": 2.3792853480436358e-05, + "loss": 0.5016, + "step": 177870 + }, + { + "epoch": 1.5725171944341307, + "grad_norm": 3.202863931655884, + "learning_rate": 2.379138009276449e-05, + "loss": 0.529, + "step": 177880 + }, + { + "epoch": 1.572605597694443, + "grad_norm": 7.062049865722656, + "learning_rate": 2.3789906705092618e-05, + "loss": 0.5519, + "step": 177890 + }, + { + "epoch": 1.5726940009547552, + "grad_norm": 3.202885150909424, + "learning_rate": 2.3788433317420746e-05, + "loss": 0.7217, + "step": 177900 + }, + { + "epoch": 1.5727824042150673, + "grad_norm": 1.4440633058547974, + "learning_rate": 2.3786959929748878e-05, + "loss": 0.6615, + "step": 177910 + }, + { + "epoch": 1.5728708074753797, + "grad_norm": 3.9629647731781006, + "learning_rate": 2.3785486542077006e-05, + "loss": 0.6506, + "step": 177920 + }, + { + "epoch": 1.572959210735692, + "grad_norm": 1.7773234844207764, + "learning_rate": 2.3784013154405135e-05, + "loss": 0.6561, + "step": 177930 + }, + { + "epoch": 1.5730476139960041, + "grad_norm": 10.660492897033691, + "learning_rate": 2.3782539766733263e-05, + "loss": 0.5801, + "step": 177940 + }, + { + "epoch": 1.5731360172563165, + "grad_norm": 1.918804407119751, + "learning_rate": 2.3781066379061395e-05, + "loss": 0.6901, + "step": 177950 + }, + { + "epoch": 1.5732244205166288, + "grad_norm": 1.9095244407653809, + "learning_rate": 2.3779592991389523e-05, + "loss": 0.6185, + "step": 177960 + }, + { + "epoch": 1.573312823776941, + "grad_norm": 3.5318918228149414, + "learning_rate": 2.377811960371765e-05, + "loss": 0.5582, + "step": 177970 + }, + { + "epoch": 1.573401227037253, + "grad_norm": 0.9088351726531982, + "learning_rate": 2.377664621604578e-05, + "loss": 0.526, + "step": 177980 + }, + { + "epoch": 1.5734896302975654, + "grad_norm": 2.974895715713501, + "learning_rate": 2.377517282837391e-05, + "loss": 0.6306, + "step": 177990 + }, + { + "epoch": 1.5735780335578777, + "grad_norm": 2.7470595836639404, + "learning_rate": 2.377369944070204e-05, + "loss": 0.7147, + "step": 178000 + }, + { + "epoch": 1.5736664368181899, + "grad_norm": 6.477626323699951, + "learning_rate": 2.377222605303017e-05, + "loss": 0.6333, + "step": 178010 + }, + { + "epoch": 1.573754840078502, + "grad_norm": 0.9369567632675171, + "learning_rate": 2.37707526653583e-05, + "loss": 0.6072, + "step": 178020 + }, + { + "epoch": 1.5738432433388143, + "grad_norm": 4.91400146484375, + "learning_rate": 2.376927927768643e-05, + "loss": 0.556, + "step": 178030 + }, + { + "epoch": 1.5739316465991267, + "grad_norm": 4.29938268661499, + "learning_rate": 2.3767805890014557e-05, + "loss": 0.6233, + "step": 178040 + }, + { + "epoch": 1.5740200498594388, + "grad_norm": 8.595135688781738, + "learning_rate": 2.3766332502342685e-05, + "loss": 0.5871, + "step": 178050 + }, + { + "epoch": 1.574108453119751, + "grad_norm": 1.551999807357788, + "learning_rate": 2.3764859114670817e-05, + "loss": 0.6797, + "step": 178060 + }, + { + "epoch": 1.5741968563800635, + "grad_norm": 4.337893962860107, + "learning_rate": 2.3763385726998945e-05, + "loss": 0.569, + "step": 178070 + }, + { + "epoch": 1.5742852596403756, + "grad_norm": 1.0492699146270752, + "learning_rate": 2.3761912339327074e-05, + "loss": 0.5351, + "step": 178080 + }, + { + "epoch": 1.5743736629006877, + "grad_norm": 4.99405574798584, + "learning_rate": 2.3760438951655205e-05, + "loss": 0.5742, + "step": 178090 + }, + { + "epoch": 1.574462066161, + "grad_norm": 1.6093047857284546, + "learning_rate": 2.3758965563983334e-05, + "loss": 0.7021, + "step": 178100 + }, + { + "epoch": 1.5745504694213124, + "grad_norm": 9.563122749328613, + "learning_rate": 2.3757492176311462e-05, + "loss": 0.6464, + "step": 178110 + }, + { + "epoch": 1.5746388726816245, + "grad_norm": 4.5364508628845215, + "learning_rate": 2.375601878863959e-05, + "loss": 0.5776, + "step": 178120 + }, + { + "epoch": 1.5747272759419366, + "grad_norm": 3.5077686309814453, + "learning_rate": 2.3754545400967722e-05, + "loss": 0.5347, + "step": 178130 + }, + { + "epoch": 1.574815679202249, + "grad_norm": 10.026756286621094, + "learning_rate": 2.375307201329585e-05, + "loss": 0.6771, + "step": 178140 + }, + { + "epoch": 1.5749040824625613, + "grad_norm": 1.6047227382659912, + "learning_rate": 2.375159862562398e-05, + "loss": 0.5376, + "step": 178150 + }, + { + "epoch": 1.5749924857228734, + "grad_norm": 1.6588748693466187, + "learning_rate": 2.3750125237952107e-05, + "loss": 0.5695, + "step": 178160 + }, + { + "epoch": 1.5750808889831855, + "grad_norm": 2.7459299564361572, + "learning_rate": 2.374865185028024e-05, + "loss": 0.5176, + "step": 178170 + }, + { + "epoch": 1.5751692922434979, + "grad_norm": 4.993346214294434, + "learning_rate": 2.3747178462608367e-05, + "loss": 0.4711, + "step": 178180 + }, + { + "epoch": 1.5752576955038102, + "grad_norm": 3.7444307804107666, + "learning_rate": 2.3745705074936496e-05, + "loss": 0.4653, + "step": 178190 + }, + { + "epoch": 1.5753460987641223, + "grad_norm": 7.655012607574463, + "learning_rate": 2.3744231687264627e-05, + "loss": 0.5704, + "step": 178200 + }, + { + "epoch": 1.5754345020244347, + "grad_norm": 6.56545352935791, + "learning_rate": 2.3742758299592756e-05, + "loss": 0.6917, + "step": 178210 + }, + { + "epoch": 1.575522905284747, + "grad_norm": 1.851274013519287, + "learning_rate": 2.3741284911920887e-05, + "loss": 0.5544, + "step": 178220 + }, + { + "epoch": 1.5756113085450592, + "grad_norm": 2.2871317863464355, + "learning_rate": 2.3739811524249016e-05, + "loss": 0.4978, + "step": 178230 + }, + { + "epoch": 1.5756997118053713, + "grad_norm": 3.604883909225464, + "learning_rate": 2.3738338136577144e-05, + "loss": 0.6063, + "step": 178240 + }, + { + "epoch": 1.5757881150656836, + "grad_norm": 2.8319008350372314, + "learning_rate": 2.3736864748905276e-05, + "loss": 0.7282, + "step": 178250 + }, + { + "epoch": 1.575876518325996, + "grad_norm": 10.24183464050293, + "learning_rate": 2.3735391361233404e-05, + "loss": 0.6667, + "step": 178260 + }, + { + "epoch": 1.575964921586308, + "grad_norm": 2.5965993404388428, + "learning_rate": 2.3733917973561533e-05, + "loss": 0.5381, + "step": 178270 + }, + { + "epoch": 1.5760533248466202, + "grad_norm": 3.5896975994110107, + "learning_rate": 2.3732444585889664e-05, + "loss": 0.5891, + "step": 178280 + }, + { + "epoch": 1.5761417281069325, + "grad_norm": 6.681732177734375, + "learning_rate": 2.3730971198217793e-05, + "loss": 0.6866, + "step": 178290 + }, + { + "epoch": 1.5762301313672449, + "grad_norm": 2.867141008377075, + "learning_rate": 2.372949781054592e-05, + "loss": 0.6847, + "step": 178300 + }, + { + "epoch": 1.576318534627557, + "grad_norm": 1.4590797424316406, + "learning_rate": 2.3728024422874053e-05, + "loss": 0.5857, + "step": 178310 + }, + { + "epoch": 1.5764069378878693, + "grad_norm": 1.1472526788711548, + "learning_rate": 2.372655103520218e-05, + "loss": 0.6157, + "step": 178320 + }, + { + "epoch": 1.5764953411481817, + "grad_norm": 4.510525703430176, + "learning_rate": 2.372507764753031e-05, + "loss": 0.5637, + "step": 178330 + }, + { + "epoch": 1.5765837444084938, + "grad_norm": 6.114560604095459, + "learning_rate": 2.3723604259858438e-05, + "loss": 0.6339, + "step": 178340 + }, + { + "epoch": 1.576672147668806, + "grad_norm": 2.6327743530273438, + "learning_rate": 2.372213087218657e-05, + "loss": 0.5722, + "step": 178350 + }, + { + "epoch": 1.5767605509291183, + "grad_norm": 2.417829751968384, + "learning_rate": 2.3720657484514698e-05, + "loss": 0.6076, + "step": 178360 + }, + { + "epoch": 1.5768489541894306, + "grad_norm": 1.753642201423645, + "learning_rate": 2.3719184096842826e-05, + "loss": 0.5679, + "step": 178370 + }, + { + "epoch": 1.5769373574497427, + "grad_norm": 1.9006749391555786, + "learning_rate": 2.3717710709170958e-05, + "loss": 0.586, + "step": 178380 + }, + { + "epoch": 1.5770257607100548, + "grad_norm": 3.3040943145751953, + "learning_rate": 2.3716237321499086e-05, + "loss": 0.5879, + "step": 178390 + }, + { + "epoch": 1.5771141639703672, + "grad_norm": 2.73498272895813, + "learning_rate": 2.3714763933827215e-05, + "loss": 0.645, + "step": 178400 + }, + { + "epoch": 1.5772025672306795, + "grad_norm": 1.0006335973739624, + "learning_rate": 2.3713290546155343e-05, + "loss": 0.7305, + "step": 178410 + }, + { + "epoch": 1.5772909704909917, + "grad_norm": 2.9414331912994385, + "learning_rate": 2.3711817158483475e-05, + "loss": 0.7322, + "step": 178420 + }, + { + "epoch": 1.577379373751304, + "grad_norm": 1.0808783769607544, + "learning_rate": 2.3710343770811603e-05, + "loss": 0.5497, + "step": 178430 + }, + { + "epoch": 1.5774677770116163, + "grad_norm": 4.873090744018555, + "learning_rate": 2.370887038313973e-05, + "loss": 0.667, + "step": 178440 + }, + { + "epoch": 1.5775561802719285, + "grad_norm": 2.205275297164917, + "learning_rate": 2.370739699546786e-05, + "loss": 0.5755, + "step": 178450 + }, + { + "epoch": 1.5776445835322406, + "grad_norm": 3.3588781356811523, + "learning_rate": 2.370592360779599e-05, + "loss": 0.5621, + "step": 178460 + }, + { + "epoch": 1.577732986792553, + "grad_norm": 5.083580493927002, + "learning_rate": 2.370445022012412e-05, + "loss": 0.6306, + "step": 178470 + }, + { + "epoch": 1.5778213900528653, + "grad_norm": 4.142603874206543, + "learning_rate": 2.370297683245225e-05, + "loss": 0.5205, + "step": 178480 + }, + { + "epoch": 1.5779097933131774, + "grad_norm": 16.28163719177246, + "learning_rate": 2.370150344478038e-05, + "loss": 0.6707, + "step": 178490 + }, + { + "epoch": 1.5779981965734895, + "grad_norm": 3.596571445465088, + "learning_rate": 2.370003005710851e-05, + "loss": 0.5476, + "step": 178500 + }, + { + "epoch": 1.5780865998338018, + "grad_norm": 3.476890802383423, + "learning_rate": 2.3698556669436637e-05, + "loss": 0.5857, + "step": 178510 + }, + { + "epoch": 1.5781750030941142, + "grad_norm": 7.287332057952881, + "learning_rate": 2.3697083281764765e-05, + "loss": 0.7006, + "step": 178520 + }, + { + "epoch": 1.5782634063544263, + "grad_norm": 1.8955235481262207, + "learning_rate": 2.3695609894092897e-05, + "loss": 0.6019, + "step": 178530 + }, + { + "epoch": 1.5783518096147386, + "grad_norm": 1.8529415130615234, + "learning_rate": 2.3694136506421025e-05, + "loss": 0.6293, + "step": 178540 + }, + { + "epoch": 1.578440212875051, + "grad_norm": 11.360862731933594, + "learning_rate": 2.3692663118749154e-05, + "loss": 0.6061, + "step": 178550 + }, + { + "epoch": 1.578528616135363, + "grad_norm": 4.6779327392578125, + "learning_rate": 2.3691189731077285e-05, + "loss": 0.6599, + "step": 178560 + }, + { + "epoch": 1.5786170193956752, + "grad_norm": 1.919928789138794, + "learning_rate": 2.3689716343405414e-05, + "loss": 0.5257, + "step": 178570 + }, + { + "epoch": 1.5787054226559876, + "grad_norm": 4.567292213439941, + "learning_rate": 2.3688242955733542e-05, + "loss": 0.647, + "step": 178580 + }, + { + "epoch": 1.5787938259163, + "grad_norm": 2.994666814804077, + "learning_rate": 2.368676956806167e-05, + "loss": 0.5878, + "step": 178590 + }, + { + "epoch": 1.578882229176612, + "grad_norm": 2.907742738723755, + "learning_rate": 2.3685296180389802e-05, + "loss": 0.4936, + "step": 178600 + }, + { + "epoch": 1.5789706324369241, + "grad_norm": 5.553699970245361, + "learning_rate": 2.368382279271793e-05, + "loss": 0.5572, + "step": 178610 + }, + { + "epoch": 1.5790590356972365, + "grad_norm": 1.1139589548110962, + "learning_rate": 2.368234940504606e-05, + "loss": 0.4497, + "step": 178620 + }, + { + "epoch": 1.5791474389575488, + "grad_norm": 3.8644986152648926, + "learning_rate": 2.3680876017374187e-05, + "loss": 0.6286, + "step": 178630 + }, + { + "epoch": 1.579235842217861, + "grad_norm": 2.879363775253296, + "learning_rate": 2.367940262970232e-05, + "loss": 0.5567, + "step": 178640 + }, + { + "epoch": 1.579324245478173, + "grad_norm": 4.482773303985596, + "learning_rate": 2.3677929242030447e-05, + "loss": 0.5552, + "step": 178650 + }, + { + "epoch": 1.5794126487384856, + "grad_norm": 2.0084800720214844, + "learning_rate": 2.3676455854358576e-05, + "loss": 0.6186, + "step": 178660 + }, + { + "epoch": 1.5795010519987978, + "grad_norm": 1.4457485675811768, + "learning_rate": 2.3674982466686707e-05, + "loss": 0.5787, + "step": 178670 + }, + { + "epoch": 1.5795894552591099, + "grad_norm": 2.304863691329956, + "learning_rate": 2.3673509079014836e-05, + "loss": 0.5726, + "step": 178680 + }, + { + "epoch": 1.5796778585194222, + "grad_norm": 3.500981330871582, + "learning_rate": 2.3672035691342964e-05, + "loss": 0.5061, + "step": 178690 + }, + { + "epoch": 1.5797662617797346, + "grad_norm": 2.110438823699951, + "learning_rate": 2.3670562303671093e-05, + "loss": 0.6363, + "step": 178700 + }, + { + "epoch": 1.5798546650400467, + "grad_norm": 1.9318902492523193, + "learning_rate": 2.3669088915999224e-05, + "loss": 0.5963, + "step": 178710 + }, + { + "epoch": 1.5799430683003588, + "grad_norm": 3.0371882915496826, + "learning_rate": 2.3667615528327353e-05, + "loss": 0.5753, + "step": 178720 + }, + { + "epoch": 1.5800314715606711, + "grad_norm": 2.086472749710083, + "learning_rate": 2.366614214065548e-05, + "loss": 0.7206, + "step": 178730 + }, + { + "epoch": 1.5801198748209835, + "grad_norm": 3.5821118354797363, + "learning_rate": 2.3664668752983613e-05, + "loss": 0.4912, + "step": 178740 + }, + { + "epoch": 1.5802082780812956, + "grad_norm": 1.779372215270996, + "learning_rate": 2.366319536531174e-05, + "loss": 0.5519, + "step": 178750 + }, + { + "epoch": 1.5802966813416077, + "grad_norm": 1.6509251594543457, + "learning_rate": 2.366172197763987e-05, + "loss": 0.5721, + "step": 178760 + }, + { + "epoch": 1.58038508460192, + "grad_norm": 1.9885419607162476, + "learning_rate": 2.3660248589967998e-05, + "loss": 0.5759, + "step": 178770 + }, + { + "epoch": 1.5804734878622324, + "grad_norm": 1.6160720586776733, + "learning_rate": 2.365877520229613e-05, + "loss": 0.49, + "step": 178780 + }, + { + "epoch": 1.5805618911225445, + "grad_norm": 13.939750671386719, + "learning_rate": 2.3657301814624258e-05, + "loss": 0.667, + "step": 178790 + }, + { + "epoch": 1.5806502943828569, + "grad_norm": 1.304701328277588, + "learning_rate": 2.3655828426952386e-05, + "loss": 0.6548, + "step": 178800 + }, + { + "epoch": 1.5807386976431692, + "grad_norm": 1.8542201519012451, + "learning_rate": 2.3654355039280515e-05, + "loss": 0.6059, + "step": 178810 + }, + { + "epoch": 1.5808271009034813, + "grad_norm": 3.677777051925659, + "learning_rate": 2.3652881651608646e-05, + "loss": 0.5655, + "step": 178820 + }, + { + "epoch": 1.5809155041637935, + "grad_norm": 5.935771942138672, + "learning_rate": 2.3651408263936775e-05, + "loss": 0.6363, + "step": 178830 + }, + { + "epoch": 1.5810039074241058, + "grad_norm": 2.1556954383850098, + "learning_rate": 2.3649934876264903e-05, + "loss": 0.6915, + "step": 178840 + }, + { + "epoch": 1.5810923106844181, + "grad_norm": 1.132177710533142, + "learning_rate": 2.3648461488593035e-05, + "loss": 0.4895, + "step": 178850 + }, + { + "epoch": 1.5811807139447303, + "grad_norm": 2.6694698333740234, + "learning_rate": 2.3646988100921163e-05, + "loss": 0.5707, + "step": 178860 + }, + { + "epoch": 1.5812691172050424, + "grad_norm": 3.039940595626831, + "learning_rate": 2.364551471324929e-05, + "loss": 0.703, + "step": 178870 + }, + { + "epoch": 1.5813575204653547, + "grad_norm": 1.4182237386703491, + "learning_rate": 2.364404132557742e-05, + "loss": 0.6468, + "step": 178880 + }, + { + "epoch": 1.581445923725667, + "grad_norm": 1.8856532573699951, + "learning_rate": 2.364256793790555e-05, + "loss": 0.7067, + "step": 178890 + }, + { + "epoch": 1.5815343269859792, + "grad_norm": 0.8956825733184814, + "learning_rate": 2.364109455023368e-05, + "loss": 0.4886, + "step": 178900 + }, + { + "epoch": 1.5816227302462915, + "grad_norm": 2.1884191036224365, + "learning_rate": 2.3639621162561808e-05, + "loss": 0.5318, + "step": 178910 + }, + { + "epoch": 1.5817111335066039, + "grad_norm": 6.482180595397949, + "learning_rate": 2.3638147774889937e-05, + "loss": 0.6169, + "step": 178920 + }, + { + "epoch": 1.581799536766916, + "grad_norm": 2.9183382987976074, + "learning_rate": 2.363667438721807e-05, + "loss": 0.5812, + "step": 178930 + }, + { + "epoch": 1.581887940027228, + "grad_norm": 2.4691333770751953, + "learning_rate": 2.3635200999546197e-05, + "loss": 0.7067, + "step": 178940 + }, + { + "epoch": 1.5819763432875404, + "grad_norm": 4.402538299560547, + "learning_rate": 2.3633727611874325e-05, + "loss": 0.5928, + "step": 178950 + }, + { + "epoch": 1.5820647465478528, + "grad_norm": 7.086040496826172, + "learning_rate": 2.3632254224202457e-05, + "loss": 0.5833, + "step": 178960 + }, + { + "epoch": 1.582153149808165, + "grad_norm": 5.071657180786133, + "learning_rate": 2.3630780836530585e-05, + "loss": 0.7063, + "step": 178970 + }, + { + "epoch": 1.582241553068477, + "grad_norm": 0.8515512943267822, + "learning_rate": 2.3629307448858714e-05, + "loss": 0.631, + "step": 178980 + }, + { + "epoch": 1.5823299563287894, + "grad_norm": 2.0974810123443604, + "learning_rate": 2.3627834061186842e-05, + "loss": 0.5044, + "step": 178990 + }, + { + "epoch": 1.5824183595891017, + "grad_norm": 0.7980273365974426, + "learning_rate": 2.3626360673514974e-05, + "loss": 0.5441, + "step": 179000 + }, + { + "epoch": 1.5825067628494138, + "grad_norm": 5.945720672607422, + "learning_rate": 2.3624887285843102e-05, + "loss": 0.6803, + "step": 179010 + }, + { + "epoch": 1.5825951661097262, + "grad_norm": 4.092936992645264, + "learning_rate": 2.362341389817123e-05, + "loss": 0.5958, + "step": 179020 + }, + { + "epoch": 1.5826835693700385, + "grad_norm": 1.810025691986084, + "learning_rate": 2.3621940510499362e-05, + "loss": 0.6582, + "step": 179030 + }, + { + "epoch": 1.5827719726303506, + "grad_norm": 6.428379058837891, + "learning_rate": 2.362046712282749e-05, + "loss": 0.5424, + "step": 179040 + }, + { + "epoch": 1.5828603758906628, + "grad_norm": 9.304973602294922, + "learning_rate": 2.361899373515562e-05, + "loss": 0.5468, + "step": 179050 + }, + { + "epoch": 1.582948779150975, + "grad_norm": 2.233605146408081, + "learning_rate": 2.3617520347483747e-05, + "loss": 0.5723, + "step": 179060 + }, + { + "epoch": 1.5830371824112874, + "grad_norm": 4.325331211090088, + "learning_rate": 2.361604695981188e-05, + "loss": 0.5641, + "step": 179070 + }, + { + "epoch": 1.5831255856715996, + "grad_norm": 2.376910448074341, + "learning_rate": 2.3614573572140007e-05, + "loss": 0.5781, + "step": 179080 + }, + { + "epoch": 1.5832139889319117, + "grad_norm": 1.9221725463867188, + "learning_rate": 2.3613100184468136e-05, + "loss": 0.6365, + "step": 179090 + }, + { + "epoch": 1.583302392192224, + "grad_norm": 2.345121145248413, + "learning_rate": 2.3611626796796267e-05, + "loss": 0.6133, + "step": 179100 + }, + { + "epoch": 1.5833907954525364, + "grad_norm": 8.477579116821289, + "learning_rate": 2.3610153409124396e-05, + "loss": 0.5823, + "step": 179110 + }, + { + "epoch": 1.5834791987128485, + "grad_norm": 1.4636448621749878, + "learning_rate": 2.3608680021452524e-05, + "loss": 0.552, + "step": 179120 + }, + { + "epoch": 1.5835676019731608, + "grad_norm": 2.6343884468078613, + "learning_rate": 2.3607206633780656e-05, + "loss": 0.6246, + "step": 179130 + }, + { + "epoch": 1.5836560052334732, + "grad_norm": 4.477362155914307, + "learning_rate": 2.3605733246108784e-05, + "loss": 0.5574, + "step": 179140 + }, + { + "epoch": 1.5837444084937853, + "grad_norm": 1.9163240194320679, + "learning_rate": 2.3604259858436912e-05, + "loss": 0.6059, + "step": 179150 + }, + { + "epoch": 1.5838328117540974, + "grad_norm": 1.4452087879180908, + "learning_rate": 2.3602786470765044e-05, + "loss": 0.5309, + "step": 179160 + }, + { + "epoch": 1.5839212150144097, + "grad_norm": 2.4761500358581543, + "learning_rate": 2.3601313083093173e-05, + "loss": 0.6122, + "step": 179170 + }, + { + "epoch": 1.584009618274722, + "grad_norm": 2.7357590198516846, + "learning_rate": 2.35998396954213e-05, + "loss": 0.5505, + "step": 179180 + }, + { + "epoch": 1.5840980215350342, + "grad_norm": 1.9816958904266357, + "learning_rate": 2.3598366307749433e-05, + "loss": 0.4913, + "step": 179190 + }, + { + "epoch": 1.5841864247953463, + "grad_norm": 6.363073825836182, + "learning_rate": 2.359689292007756e-05, + "loss": 0.5269, + "step": 179200 + }, + { + "epoch": 1.5842748280556587, + "grad_norm": 4.007833957672119, + "learning_rate": 2.359541953240569e-05, + "loss": 0.608, + "step": 179210 + }, + { + "epoch": 1.584363231315971, + "grad_norm": 1.6797919273376465, + "learning_rate": 2.359394614473382e-05, + "loss": 0.4743, + "step": 179220 + }, + { + "epoch": 1.5844516345762831, + "grad_norm": 6.640074729919434, + "learning_rate": 2.359247275706195e-05, + "loss": 0.6119, + "step": 179230 + }, + { + "epoch": 1.5845400378365952, + "grad_norm": 2.887671947479248, + "learning_rate": 2.3590999369390078e-05, + "loss": 0.8554, + "step": 179240 + }, + { + "epoch": 1.5846284410969078, + "grad_norm": 7.402940273284912, + "learning_rate": 2.358952598171821e-05, + "loss": 0.6546, + "step": 179250 + }, + { + "epoch": 1.58471684435722, + "grad_norm": 1.115099549293518, + "learning_rate": 2.3588052594046338e-05, + "loss": 0.5681, + "step": 179260 + }, + { + "epoch": 1.584805247617532, + "grad_norm": 7.120483875274658, + "learning_rate": 2.3586579206374466e-05, + "loss": 0.6008, + "step": 179270 + }, + { + "epoch": 1.5848936508778444, + "grad_norm": 3.989267587661743, + "learning_rate": 2.3585105818702595e-05, + "loss": 0.6401, + "step": 179280 + }, + { + "epoch": 1.5849820541381567, + "grad_norm": 6.123172760009766, + "learning_rate": 2.3583632431030726e-05, + "loss": 0.6082, + "step": 179290 + }, + { + "epoch": 1.5850704573984689, + "grad_norm": 3.8420369625091553, + "learning_rate": 2.3582159043358855e-05, + "loss": 0.556, + "step": 179300 + }, + { + "epoch": 1.585158860658781, + "grad_norm": 5.0978684425354, + "learning_rate": 2.3580685655686983e-05, + "loss": 0.5908, + "step": 179310 + }, + { + "epoch": 1.5852472639190933, + "grad_norm": 2.9666075706481934, + "learning_rate": 2.3579212268015115e-05, + "loss": 0.5986, + "step": 179320 + }, + { + "epoch": 1.5853356671794057, + "grad_norm": 3.3843584060668945, + "learning_rate": 2.3577738880343243e-05, + "loss": 0.6425, + "step": 179330 + }, + { + "epoch": 1.5854240704397178, + "grad_norm": 9.74099063873291, + "learning_rate": 2.357626549267137e-05, + "loss": 0.5784, + "step": 179340 + }, + { + "epoch": 1.58551247370003, + "grad_norm": 2.8699123859405518, + "learning_rate": 2.35747921049995e-05, + "loss": 0.5442, + "step": 179350 + }, + { + "epoch": 1.5856008769603422, + "grad_norm": 2.280367374420166, + "learning_rate": 2.357331871732763e-05, + "loss": 0.6257, + "step": 179360 + }, + { + "epoch": 1.5856892802206546, + "grad_norm": 4.121230125427246, + "learning_rate": 2.357184532965576e-05, + "loss": 0.5657, + "step": 179370 + }, + { + "epoch": 1.5857776834809667, + "grad_norm": 3.2622361183166504, + "learning_rate": 2.357037194198389e-05, + "loss": 0.5927, + "step": 179380 + }, + { + "epoch": 1.585866086741279, + "grad_norm": 1.4865586757659912, + "learning_rate": 2.356889855431202e-05, + "loss": 0.6538, + "step": 179390 + }, + { + "epoch": 1.5859544900015914, + "grad_norm": 2.511223793029785, + "learning_rate": 2.356742516664015e-05, + "loss": 0.6043, + "step": 179400 + }, + { + "epoch": 1.5860428932619035, + "grad_norm": 9.320632934570312, + "learning_rate": 2.3565951778968277e-05, + "loss": 0.4915, + "step": 179410 + }, + { + "epoch": 1.5861312965222156, + "grad_norm": 7.399755954742432, + "learning_rate": 2.3564478391296405e-05, + "loss": 0.6575, + "step": 179420 + }, + { + "epoch": 1.586219699782528, + "grad_norm": 6.180196285247803, + "learning_rate": 2.3563005003624537e-05, + "loss": 0.6244, + "step": 179430 + }, + { + "epoch": 1.5863081030428403, + "grad_norm": 2.650404930114746, + "learning_rate": 2.3561531615952665e-05, + "loss": 0.5942, + "step": 179440 + }, + { + "epoch": 1.5863965063031524, + "grad_norm": 2.940699577331543, + "learning_rate": 2.3560058228280794e-05, + "loss": 0.4877, + "step": 179450 + }, + { + "epoch": 1.5864849095634646, + "grad_norm": 3.089730739593506, + "learning_rate": 2.3558584840608922e-05, + "loss": 0.6801, + "step": 179460 + }, + { + "epoch": 1.586573312823777, + "grad_norm": 2.5377092361450195, + "learning_rate": 2.3557111452937054e-05, + "loss": 0.5812, + "step": 179470 + }, + { + "epoch": 1.5866617160840892, + "grad_norm": 3.2368810176849365, + "learning_rate": 2.3555638065265182e-05, + "loss": 0.5587, + "step": 179480 + }, + { + "epoch": 1.5867501193444014, + "grad_norm": 8.294109344482422, + "learning_rate": 2.355416467759331e-05, + "loss": 0.5886, + "step": 179490 + }, + { + "epoch": 1.5868385226047137, + "grad_norm": 1.0442575216293335, + "learning_rate": 2.3552691289921442e-05, + "loss": 0.589, + "step": 179500 + }, + { + "epoch": 1.586926925865026, + "grad_norm": 2.7338919639587402, + "learning_rate": 2.355121790224957e-05, + "loss": 0.7372, + "step": 179510 + }, + { + "epoch": 1.5870153291253382, + "grad_norm": 1.5340664386749268, + "learning_rate": 2.35497445145777e-05, + "loss": 0.5771, + "step": 179520 + }, + { + "epoch": 1.5871037323856503, + "grad_norm": 1.4046498537063599, + "learning_rate": 2.3548271126905827e-05, + "loss": 0.4419, + "step": 179530 + }, + { + "epoch": 1.5871921356459626, + "grad_norm": 0.923066258430481, + "learning_rate": 2.354679773923396e-05, + "loss": 0.6282, + "step": 179540 + }, + { + "epoch": 1.587280538906275, + "grad_norm": 7.069698810577393, + "learning_rate": 2.3545324351562087e-05, + "loss": 0.5107, + "step": 179550 + }, + { + "epoch": 1.587368942166587, + "grad_norm": 9.05636978149414, + "learning_rate": 2.3543850963890216e-05, + "loss": 0.6176, + "step": 179560 + }, + { + "epoch": 1.5874573454268992, + "grad_norm": 3.305478572845459, + "learning_rate": 2.3542377576218344e-05, + "loss": 0.6397, + "step": 179570 + }, + { + "epoch": 1.5875457486872115, + "grad_norm": 3.001066207885742, + "learning_rate": 2.3540904188546476e-05, + "loss": 0.5991, + "step": 179580 + }, + { + "epoch": 1.5876341519475239, + "grad_norm": 1.7910176515579224, + "learning_rate": 2.3539430800874604e-05, + "loss": 0.4692, + "step": 179590 + }, + { + "epoch": 1.587722555207836, + "grad_norm": 3.8087406158447266, + "learning_rate": 2.3537957413202732e-05, + "loss": 0.6447, + "step": 179600 + }, + { + "epoch": 1.5878109584681483, + "grad_norm": 2.3797008991241455, + "learning_rate": 2.3536484025530864e-05, + "loss": 0.5771, + "step": 179610 + }, + { + "epoch": 1.5878993617284607, + "grad_norm": 1.7294301986694336, + "learning_rate": 2.3535010637858993e-05, + "loss": 0.6754, + "step": 179620 + }, + { + "epoch": 1.5879877649887728, + "grad_norm": 3.0239651203155518, + "learning_rate": 2.353353725018712e-05, + "loss": 0.7199, + "step": 179630 + }, + { + "epoch": 1.588076168249085, + "grad_norm": 2.4570260047912598, + "learning_rate": 2.353206386251525e-05, + "loss": 0.5504, + "step": 179640 + }, + { + "epoch": 1.5881645715093973, + "grad_norm": 3.3867244720458984, + "learning_rate": 2.353059047484338e-05, + "loss": 0.5894, + "step": 179650 + }, + { + "epoch": 1.5882529747697096, + "grad_norm": 2.0542361736297607, + "learning_rate": 2.352911708717151e-05, + "loss": 0.3638, + "step": 179660 + }, + { + "epoch": 1.5883413780300217, + "grad_norm": 3.210259437561035, + "learning_rate": 2.3527643699499638e-05, + "loss": 0.556, + "step": 179670 + }, + { + "epoch": 1.5884297812903339, + "grad_norm": 1.8866873979568481, + "learning_rate": 2.352617031182777e-05, + "loss": 0.5619, + "step": 179680 + }, + { + "epoch": 1.5885181845506462, + "grad_norm": 3.407763957977295, + "learning_rate": 2.3524696924155898e-05, + "loss": 0.6893, + "step": 179690 + }, + { + "epoch": 1.5886065878109585, + "grad_norm": 2.2238566875457764, + "learning_rate": 2.3523223536484026e-05, + "loss": 0.5486, + "step": 179700 + }, + { + "epoch": 1.5886949910712707, + "grad_norm": 2.908684730529785, + "learning_rate": 2.3521750148812155e-05, + "loss": 0.5935, + "step": 179710 + }, + { + "epoch": 1.588783394331583, + "grad_norm": 2.056279182434082, + "learning_rate": 2.3520276761140286e-05, + "loss": 0.5224, + "step": 179720 + }, + { + "epoch": 1.5888717975918953, + "grad_norm": 1.5335217714309692, + "learning_rate": 2.3518803373468415e-05, + "loss": 0.5014, + "step": 179730 + }, + { + "epoch": 1.5889602008522075, + "grad_norm": 1.1683319807052612, + "learning_rate": 2.3517329985796543e-05, + "loss": 0.6044, + "step": 179740 + }, + { + "epoch": 1.5890486041125196, + "grad_norm": 3.7608189582824707, + "learning_rate": 2.351585659812467e-05, + "loss": 0.6802, + "step": 179750 + }, + { + "epoch": 1.589137007372832, + "grad_norm": 2.6810014247894287, + "learning_rate": 2.3514383210452803e-05, + "loss": 0.6324, + "step": 179760 + }, + { + "epoch": 1.5892254106331443, + "grad_norm": 2.0969247817993164, + "learning_rate": 2.351290982278093e-05, + "loss": 0.5661, + "step": 179770 + }, + { + "epoch": 1.5893138138934564, + "grad_norm": 2.7428903579711914, + "learning_rate": 2.351143643510906e-05, + "loss": 0.5117, + "step": 179780 + }, + { + "epoch": 1.5894022171537685, + "grad_norm": 2.2435295581817627, + "learning_rate": 2.350996304743719e-05, + "loss": 0.4758, + "step": 179790 + }, + { + "epoch": 1.5894906204140808, + "grad_norm": 2.1977460384368896, + "learning_rate": 2.350848965976532e-05, + "loss": 0.6348, + "step": 179800 + }, + { + "epoch": 1.5895790236743932, + "grad_norm": 7.006791114807129, + "learning_rate": 2.3507016272093448e-05, + "loss": 0.5691, + "step": 179810 + }, + { + "epoch": 1.5896674269347053, + "grad_norm": 4.33966064453125, + "learning_rate": 2.3505542884421577e-05, + "loss": 0.6901, + "step": 179820 + }, + { + "epoch": 1.5897558301950174, + "grad_norm": 1.4744582176208496, + "learning_rate": 2.350406949674971e-05, + "loss": 0.5036, + "step": 179830 + }, + { + "epoch": 1.58984423345533, + "grad_norm": 1.8755768537521362, + "learning_rate": 2.3502596109077837e-05, + "loss": 0.6141, + "step": 179840 + }, + { + "epoch": 1.5899326367156421, + "grad_norm": 3.1658451557159424, + "learning_rate": 2.3501122721405965e-05, + "loss": 0.6172, + "step": 179850 + }, + { + "epoch": 1.5900210399759542, + "grad_norm": 2.5658719539642334, + "learning_rate": 2.3499649333734097e-05, + "loss": 0.6669, + "step": 179860 + }, + { + "epoch": 1.5901094432362666, + "grad_norm": 3.693065643310547, + "learning_rate": 2.3498175946062225e-05, + "loss": 0.6059, + "step": 179870 + }, + { + "epoch": 1.590197846496579, + "grad_norm": 2.051083564758301, + "learning_rate": 2.3496702558390353e-05, + "loss": 0.7024, + "step": 179880 + }, + { + "epoch": 1.590286249756891, + "grad_norm": 3.083063840866089, + "learning_rate": 2.3495229170718482e-05, + "loss": 0.5307, + "step": 179890 + }, + { + "epoch": 1.5903746530172032, + "grad_norm": 4.3811235427856445, + "learning_rate": 2.3493755783046614e-05, + "loss": 0.5674, + "step": 179900 + }, + { + "epoch": 1.5904630562775155, + "grad_norm": 1.8005938529968262, + "learning_rate": 2.3492282395374742e-05, + "loss": 0.5071, + "step": 179910 + }, + { + "epoch": 1.5905514595378278, + "grad_norm": 3.3282253742218018, + "learning_rate": 2.349080900770287e-05, + "loss": 0.6421, + "step": 179920 + }, + { + "epoch": 1.59063986279814, + "grad_norm": 3.116586208343506, + "learning_rate": 2.3489335620031e-05, + "loss": 0.6063, + "step": 179930 + }, + { + "epoch": 1.590728266058452, + "grad_norm": 2.2537944316864014, + "learning_rate": 2.348786223235913e-05, + "loss": 0.5991, + "step": 179940 + }, + { + "epoch": 1.5908166693187646, + "grad_norm": 13.848723411560059, + "learning_rate": 2.348638884468726e-05, + "loss": 0.6208, + "step": 179950 + }, + { + "epoch": 1.5909050725790768, + "grad_norm": 3.3459393978118896, + "learning_rate": 2.3484915457015387e-05, + "loss": 0.6962, + "step": 179960 + }, + { + "epoch": 1.5909934758393889, + "grad_norm": 2.123842716217041, + "learning_rate": 2.348344206934352e-05, + "loss": 0.5256, + "step": 179970 + }, + { + "epoch": 1.5910818790997012, + "grad_norm": 4.606895923614502, + "learning_rate": 2.3481968681671647e-05, + "loss": 0.7916, + "step": 179980 + }, + { + "epoch": 1.5911702823600136, + "grad_norm": 12.328638076782227, + "learning_rate": 2.3480495293999776e-05, + "loss": 0.6393, + "step": 179990 + }, + { + "epoch": 1.5912586856203257, + "grad_norm": 3.4472873210906982, + "learning_rate": 2.3479021906327904e-05, + "loss": 0.5403, + "step": 180000 + }, + { + "epoch": 1.5913470888806378, + "grad_norm": 0.8657683730125427, + "learning_rate": 2.3477548518656036e-05, + "loss": 0.5665, + "step": 180010 + }, + { + "epoch": 1.5914354921409501, + "grad_norm": 1.5501017570495605, + "learning_rate": 2.3476075130984164e-05, + "loss": 0.6192, + "step": 180020 + }, + { + "epoch": 1.5915238954012625, + "grad_norm": 1.8315023183822632, + "learning_rate": 2.3474601743312292e-05, + "loss": 0.6643, + "step": 180030 + }, + { + "epoch": 1.5916122986615746, + "grad_norm": 4.753173351287842, + "learning_rate": 2.3473128355640424e-05, + "loss": 0.6156, + "step": 180040 + }, + { + "epoch": 1.5917007019218867, + "grad_norm": 2.014275312423706, + "learning_rate": 2.3471654967968552e-05, + "loss": 0.6332, + "step": 180050 + }, + { + "epoch": 1.591789105182199, + "grad_norm": 1.5966054201126099, + "learning_rate": 2.347018158029668e-05, + "loss": 0.5239, + "step": 180060 + }, + { + "epoch": 1.5918775084425114, + "grad_norm": 8.995635032653809, + "learning_rate": 2.3468708192624813e-05, + "loss": 0.5499, + "step": 180070 + }, + { + "epoch": 1.5919659117028235, + "grad_norm": 5.3437628746032715, + "learning_rate": 2.346723480495294e-05, + "loss": 0.5464, + "step": 180080 + }, + { + "epoch": 1.5920543149631359, + "grad_norm": 3.6786534786224365, + "learning_rate": 2.346576141728107e-05, + "loss": 0.6158, + "step": 180090 + }, + { + "epoch": 1.5921427182234482, + "grad_norm": 2.0477163791656494, + "learning_rate": 2.34642880296092e-05, + "loss": 0.5796, + "step": 180100 + }, + { + "epoch": 1.5922311214837603, + "grad_norm": 1.4329932928085327, + "learning_rate": 2.346281464193733e-05, + "loss": 0.5692, + "step": 180110 + }, + { + "epoch": 1.5923195247440725, + "grad_norm": 1.3678836822509766, + "learning_rate": 2.3461341254265458e-05, + "loss": 0.6621, + "step": 180120 + }, + { + "epoch": 1.5924079280043848, + "grad_norm": 0.872795820236206, + "learning_rate": 2.345986786659359e-05, + "loss": 0.5538, + "step": 180130 + }, + { + "epoch": 1.5924963312646971, + "grad_norm": 1.3687347173690796, + "learning_rate": 2.3458394478921718e-05, + "loss": 0.5979, + "step": 180140 + }, + { + "epoch": 1.5925847345250093, + "grad_norm": 5.690780162811279, + "learning_rate": 2.3456921091249846e-05, + "loss": 0.6672, + "step": 180150 + }, + { + "epoch": 1.5926731377853214, + "grad_norm": 1.6220439672470093, + "learning_rate": 2.3455447703577978e-05, + "loss": 0.6538, + "step": 180160 + }, + { + "epoch": 1.5927615410456337, + "grad_norm": 5.372829437255859, + "learning_rate": 2.3453974315906106e-05, + "loss": 0.5669, + "step": 180170 + }, + { + "epoch": 1.592849944305946, + "grad_norm": 3.263195514678955, + "learning_rate": 2.3452500928234235e-05, + "loss": 0.5365, + "step": 180180 + }, + { + "epoch": 1.5929383475662582, + "grad_norm": 4.150403022766113, + "learning_rate": 2.3451027540562366e-05, + "loss": 0.622, + "step": 180190 + }, + { + "epoch": 1.5930267508265705, + "grad_norm": 2.364734411239624, + "learning_rate": 2.3449554152890495e-05, + "loss": 0.6447, + "step": 180200 + }, + { + "epoch": 1.5931151540868829, + "grad_norm": 3.08217453956604, + "learning_rate": 2.3448080765218623e-05, + "loss": 0.6072, + "step": 180210 + }, + { + "epoch": 1.593203557347195, + "grad_norm": 2.8130154609680176, + "learning_rate": 2.344660737754675e-05, + "loss": 0.5749, + "step": 180220 + }, + { + "epoch": 1.593291960607507, + "grad_norm": 61.3019905090332, + "learning_rate": 2.3445133989874883e-05, + "loss": 0.5995, + "step": 180230 + }, + { + "epoch": 1.5933803638678194, + "grad_norm": 6.596480846405029, + "learning_rate": 2.344366060220301e-05, + "loss": 0.7515, + "step": 180240 + }, + { + "epoch": 1.5934687671281318, + "grad_norm": 2.65738844871521, + "learning_rate": 2.344218721453114e-05, + "loss": 0.731, + "step": 180250 + }, + { + "epoch": 1.593557170388444, + "grad_norm": 5.716721534729004, + "learning_rate": 2.344071382685927e-05, + "loss": 0.6749, + "step": 180260 + }, + { + "epoch": 1.593645573648756, + "grad_norm": 2.220371723175049, + "learning_rate": 2.34392404391874e-05, + "loss": 0.535, + "step": 180270 + }, + { + "epoch": 1.5937339769090684, + "grad_norm": 5.855672836303711, + "learning_rate": 2.3437767051515528e-05, + "loss": 0.7136, + "step": 180280 + }, + { + "epoch": 1.5938223801693807, + "grad_norm": 1.289474368095398, + "learning_rate": 2.3436293663843657e-05, + "loss": 0.5728, + "step": 180290 + }, + { + "epoch": 1.5939107834296928, + "grad_norm": 2.866244077682495, + "learning_rate": 2.343482027617179e-05, + "loss": 0.7036, + "step": 180300 + }, + { + "epoch": 1.5939991866900052, + "grad_norm": 2.3711483478546143, + "learning_rate": 2.3433346888499917e-05, + "loss": 0.6784, + "step": 180310 + }, + { + "epoch": 1.5940875899503175, + "grad_norm": 5.846799373626709, + "learning_rate": 2.3431873500828045e-05, + "loss": 0.6402, + "step": 180320 + }, + { + "epoch": 1.5941759932106296, + "grad_norm": 2.150857448577881, + "learning_rate": 2.3430400113156177e-05, + "loss": 0.6215, + "step": 180330 + }, + { + "epoch": 1.5942643964709418, + "grad_norm": 2.2093544006347656, + "learning_rate": 2.3428926725484305e-05, + "loss": 0.5766, + "step": 180340 + }, + { + "epoch": 1.594352799731254, + "grad_norm": 3.9267990589141846, + "learning_rate": 2.3427453337812434e-05, + "loss": 0.587, + "step": 180350 + }, + { + "epoch": 1.5944412029915664, + "grad_norm": 2.444490432739258, + "learning_rate": 2.3425979950140562e-05, + "loss": 0.5162, + "step": 180360 + }, + { + "epoch": 1.5945296062518786, + "grad_norm": 1.5033422708511353, + "learning_rate": 2.3424506562468694e-05, + "loss": 0.653, + "step": 180370 + }, + { + "epoch": 1.5946180095121907, + "grad_norm": 2.6062347888946533, + "learning_rate": 2.3423033174796822e-05, + "loss": 0.7283, + "step": 180380 + }, + { + "epoch": 1.594706412772503, + "grad_norm": 1.7430806159973145, + "learning_rate": 2.342155978712495e-05, + "loss": 0.6121, + "step": 180390 + }, + { + "epoch": 1.5947948160328154, + "grad_norm": 7.023827075958252, + "learning_rate": 2.342008639945308e-05, + "loss": 0.6218, + "step": 180400 + }, + { + "epoch": 1.5948832192931275, + "grad_norm": 6.906120300292969, + "learning_rate": 2.341861301178121e-05, + "loss": 0.6109, + "step": 180410 + }, + { + "epoch": 1.5949716225534396, + "grad_norm": 1.745193600654602, + "learning_rate": 2.341713962410934e-05, + "loss": 0.6717, + "step": 180420 + }, + { + "epoch": 1.5950600258137522, + "grad_norm": 2.645542860031128, + "learning_rate": 2.3415666236437467e-05, + "loss": 0.6428, + "step": 180430 + }, + { + "epoch": 1.5951484290740643, + "grad_norm": 2.2267887592315674, + "learning_rate": 2.34141928487656e-05, + "loss": 0.4378, + "step": 180440 + }, + { + "epoch": 1.5952368323343764, + "grad_norm": 2.023374319076538, + "learning_rate": 2.3412719461093727e-05, + "loss": 0.709, + "step": 180450 + }, + { + "epoch": 1.5953252355946888, + "grad_norm": 1.7647895812988281, + "learning_rate": 2.3411246073421856e-05, + "loss": 0.5173, + "step": 180460 + }, + { + "epoch": 1.595413638855001, + "grad_norm": 5.248888969421387, + "learning_rate": 2.3409772685749984e-05, + "loss": 0.7386, + "step": 180470 + }, + { + "epoch": 1.5955020421153132, + "grad_norm": 2.8855655193328857, + "learning_rate": 2.3408299298078116e-05, + "loss": 0.7359, + "step": 180480 + }, + { + "epoch": 1.5955904453756253, + "grad_norm": 3.3585245609283447, + "learning_rate": 2.3406825910406244e-05, + "loss": 0.5645, + "step": 180490 + }, + { + "epoch": 1.5956788486359377, + "grad_norm": 1.4526543617248535, + "learning_rate": 2.3405352522734372e-05, + "loss": 0.5595, + "step": 180500 + }, + { + "epoch": 1.59576725189625, + "grad_norm": 4.439969062805176, + "learning_rate": 2.34038791350625e-05, + "loss": 0.5334, + "step": 180510 + }, + { + "epoch": 1.5958556551565621, + "grad_norm": 1.8360610008239746, + "learning_rate": 2.3402405747390632e-05, + "loss": 0.6466, + "step": 180520 + }, + { + "epoch": 1.5959440584168743, + "grad_norm": 1.8138060569763184, + "learning_rate": 2.340093235971876e-05, + "loss": 0.8035, + "step": 180530 + }, + { + "epoch": 1.5960324616771868, + "grad_norm": 2.0265145301818848, + "learning_rate": 2.339945897204689e-05, + "loss": 0.5551, + "step": 180540 + }, + { + "epoch": 1.596120864937499, + "grad_norm": 10.673317909240723, + "learning_rate": 2.339798558437502e-05, + "loss": 0.6168, + "step": 180550 + }, + { + "epoch": 1.596209268197811, + "grad_norm": 3.2672953605651855, + "learning_rate": 2.339651219670315e-05, + "loss": 0.606, + "step": 180560 + }, + { + "epoch": 1.5962976714581234, + "grad_norm": 1.986310601234436, + "learning_rate": 2.3395038809031278e-05, + "loss": 0.4415, + "step": 180570 + }, + { + "epoch": 1.5963860747184357, + "grad_norm": 1.5670112371444702, + "learning_rate": 2.3393565421359406e-05, + "loss": 0.5865, + "step": 180580 + }, + { + "epoch": 1.5964744779787479, + "grad_norm": 3.4498016834259033, + "learning_rate": 2.3392092033687538e-05, + "loss": 0.5926, + "step": 180590 + }, + { + "epoch": 1.59656288123906, + "grad_norm": 1.3564311265945435, + "learning_rate": 2.3390618646015666e-05, + "loss": 0.6366, + "step": 180600 + }, + { + "epoch": 1.5966512844993723, + "grad_norm": 20.663562774658203, + "learning_rate": 2.3389145258343794e-05, + "loss": 0.6622, + "step": 180610 + }, + { + "epoch": 1.5967396877596847, + "grad_norm": 1.8269972801208496, + "learning_rate": 2.3387671870671926e-05, + "loss": 0.5852, + "step": 180620 + }, + { + "epoch": 1.5968280910199968, + "grad_norm": 2.7022252082824707, + "learning_rate": 2.3386198483000055e-05, + "loss": 0.5378, + "step": 180630 + }, + { + "epoch": 1.596916494280309, + "grad_norm": 4.237645149230957, + "learning_rate": 2.3384725095328183e-05, + "loss": 0.5791, + "step": 180640 + }, + { + "epoch": 1.5970048975406212, + "grad_norm": 7.8528642654418945, + "learning_rate": 2.338325170765631e-05, + "loss": 0.5662, + "step": 180650 + }, + { + "epoch": 1.5970933008009336, + "grad_norm": 6.670100688934326, + "learning_rate": 2.3381778319984443e-05, + "loss": 0.5362, + "step": 180660 + }, + { + "epoch": 1.5971817040612457, + "grad_norm": 1.4665193557739258, + "learning_rate": 2.338030493231257e-05, + "loss": 0.4798, + "step": 180670 + }, + { + "epoch": 1.597270107321558, + "grad_norm": 1.8202050924301147, + "learning_rate": 2.33788315446407e-05, + "loss": 0.6075, + "step": 180680 + }, + { + "epoch": 1.5973585105818704, + "grad_norm": 1.1200059652328491, + "learning_rate": 2.3377358156968828e-05, + "loss": 0.5509, + "step": 180690 + }, + { + "epoch": 1.5974469138421825, + "grad_norm": 1.9601930379867554, + "learning_rate": 2.337588476929696e-05, + "loss": 0.601, + "step": 180700 + }, + { + "epoch": 1.5975353171024946, + "grad_norm": 2.990407943725586, + "learning_rate": 2.3374411381625088e-05, + "loss": 0.5793, + "step": 180710 + }, + { + "epoch": 1.597623720362807, + "grad_norm": 2.6189730167388916, + "learning_rate": 2.3372937993953217e-05, + "loss": 0.5415, + "step": 180720 + }, + { + "epoch": 1.5977121236231193, + "grad_norm": 4.03648567199707, + "learning_rate": 2.3371464606281348e-05, + "loss": 0.5235, + "step": 180730 + }, + { + "epoch": 1.5978005268834314, + "grad_norm": 1.577492594718933, + "learning_rate": 2.3369991218609477e-05, + "loss": 0.5668, + "step": 180740 + }, + { + "epoch": 1.5978889301437436, + "grad_norm": 4.022270679473877, + "learning_rate": 2.3368517830937605e-05, + "loss": 0.5031, + "step": 180750 + }, + { + "epoch": 1.597977333404056, + "grad_norm": 2.198739528656006, + "learning_rate": 2.3367044443265733e-05, + "loss": 0.5673, + "step": 180760 + }, + { + "epoch": 1.5980657366643682, + "grad_norm": 2.115494966506958, + "learning_rate": 2.3365571055593865e-05, + "loss": 0.7283, + "step": 180770 + }, + { + "epoch": 1.5981541399246804, + "grad_norm": 3.8738770484924316, + "learning_rate": 2.3364097667921993e-05, + "loss": 0.5925, + "step": 180780 + }, + { + "epoch": 1.5982425431849927, + "grad_norm": 3.2534127235412598, + "learning_rate": 2.3362624280250122e-05, + "loss": 0.4669, + "step": 180790 + }, + { + "epoch": 1.598330946445305, + "grad_norm": 2.4455721378326416, + "learning_rate": 2.3361150892578254e-05, + "loss": 0.759, + "step": 180800 + }, + { + "epoch": 1.5984193497056172, + "grad_norm": 1.9553617238998413, + "learning_rate": 2.3359677504906382e-05, + "loss": 0.5701, + "step": 180810 + }, + { + "epoch": 1.5985077529659293, + "grad_norm": 4.097979545593262, + "learning_rate": 2.335820411723451e-05, + "loss": 0.5689, + "step": 180820 + }, + { + "epoch": 1.5985961562262416, + "grad_norm": 6.519585609436035, + "learning_rate": 2.335673072956264e-05, + "loss": 0.5682, + "step": 180830 + }, + { + "epoch": 1.598684559486554, + "grad_norm": 2.102809190750122, + "learning_rate": 2.335525734189077e-05, + "loss": 0.5077, + "step": 180840 + }, + { + "epoch": 1.598772962746866, + "grad_norm": 3.9697105884552, + "learning_rate": 2.33537839542189e-05, + "loss": 0.614, + "step": 180850 + }, + { + "epoch": 1.5988613660071782, + "grad_norm": 5.887269020080566, + "learning_rate": 2.3352310566547027e-05, + "loss": 0.7094, + "step": 180860 + }, + { + "epoch": 1.5989497692674906, + "grad_norm": 16.581584930419922, + "learning_rate": 2.3350837178875155e-05, + "loss": 0.6375, + "step": 180870 + }, + { + "epoch": 1.599038172527803, + "grad_norm": 9.863439559936523, + "learning_rate": 2.3349363791203287e-05, + "loss": 0.6338, + "step": 180880 + }, + { + "epoch": 1.599126575788115, + "grad_norm": 2.2112786769866943, + "learning_rate": 2.3347890403531415e-05, + "loss": 0.5495, + "step": 180890 + }, + { + "epoch": 1.5992149790484274, + "grad_norm": 1.802366018295288, + "learning_rate": 2.3346417015859544e-05, + "loss": 0.4762, + "step": 180900 + }, + { + "epoch": 1.5993033823087397, + "grad_norm": 1.8756802082061768, + "learning_rate": 2.3344943628187676e-05, + "loss": 0.5056, + "step": 180910 + }, + { + "epoch": 1.5993917855690518, + "grad_norm": 5.013724327087402, + "learning_rate": 2.3343470240515804e-05, + "loss": 0.5394, + "step": 180920 + }, + { + "epoch": 1.599480188829364, + "grad_norm": 2.391293525695801, + "learning_rate": 2.3341996852843932e-05, + "loss": 0.5915, + "step": 180930 + }, + { + "epoch": 1.5995685920896763, + "grad_norm": 0.8728381395339966, + "learning_rate": 2.334052346517206e-05, + "loss": 0.5568, + "step": 180940 + }, + { + "epoch": 1.5996569953499886, + "grad_norm": 2.9804489612579346, + "learning_rate": 2.3339050077500192e-05, + "loss": 0.6745, + "step": 180950 + }, + { + "epoch": 1.5997453986103007, + "grad_norm": 4.190545082092285, + "learning_rate": 2.333757668982832e-05, + "loss": 0.7235, + "step": 180960 + }, + { + "epoch": 1.5998338018706129, + "grad_norm": 2.6208951473236084, + "learning_rate": 2.333610330215645e-05, + "loss": 0.6569, + "step": 180970 + }, + { + "epoch": 1.5999222051309252, + "grad_norm": 1.6110780239105225, + "learning_rate": 2.333462991448458e-05, + "loss": 0.6735, + "step": 180980 + }, + { + "epoch": 1.6000106083912375, + "grad_norm": 3.2944605350494385, + "learning_rate": 2.333315652681271e-05, + "loss": 0.6234, + "step": 180990 + }, + { + "epoch": 1.6000990116515497, + "grad_norm": 3.5745596885681152, + "learning_rate": 2.3331683139140838e-05, + "loss": 0.7434, + "step": 181000 + }, + { + "epoch": 1.6001874149118618, + "grad_norm": 1.958343267440796, + "learning_rate": 2.333020975146897e-05, + "loss": 0.574, + "step": 181010 + }, + { + "epoch": 1.6002758181721743, + "grad_norm": 1.6085907220840454, + "learning_rate": 2.3328736363797098e-05, + "loss": 0.6236, + "step": 181020 + }, + { + "epoch": 1.6003642214324865, + "grad_norm": 8.054312705993652, + "learning_rate": 2.3327262976125226e-05, + "loss": 0.5869, + "step": 181030 + }, + { + "epoch": 1.6004526246927986, + "grad_norm": 1.788875937461853, + "learning_rate": 2.3325789588453358e-05, + "loss": 0.5936, + "step": 181040 + }, + { + "epoch": 1.600541027953111, + "grad_norm": 16.9254093170166, + "learning_rate": 2.3324316200781486e-05, + "loss": 0.6252, + "step": 181050 + }, + { + "epoch": 1.6006294312134233, + "grad_norm": 1.7318662405014038, + "learning_rate": 2.3322842813109614e-05, + "loss": 0.5142, + "step": 181060 + }, + { + "epoch": 1.6007178344737354, + "grad_norm": 1.4358969926834106, + "learning_rate": 2.3321369425437746e-05, + "loss": 0.5436, + "step": 181070 + }, + { + "epoch": 1.6008062377340475, + "grad_norm": 2.0630404949188232, + "learning_rate": 2.3319896037765875e-05, + "loss": 0.5965, + "step": 181080 + }, + { + "epoch": 1.6008946409943599, + "grad_norm": 1.9878710508346558, + "learning_rate": 2.3318422650094003e-05, + "loss": 0.5732, + "step": 181090 + }, + { + "epoch": 1.6009830442546722, + "grad_norm": 9.042510986328125, + "learning_rate": 2.3316949262422135e-05, + "loss": 0.6844, + "step": 181100 + }, + { + "epoch": 1.6010714475149843, + "grad_norm": 2.026099920272827, + "learning_rate": 2.3315475874750263e-05, + "loss": 0.5458, + "step": 181110 + }, + { + "epoch": 1.6011598507752964, + "grad_norm": 5.122081279754639, + "learning_rate": 2.331400248707839e-05, + "loss": 0.5995, + "step": 181120 + }, + { + "epoch": 1.601248254035609, + "grad_norm": 6.760629653930664, + "learning_rate": 2.3312529099406523e-05, + "loss": 0.548, + "step": 181130 + }, + { + "epoch": 1.6013366572959211, + "grad_norm": 2.083120107650757, + "learning_rate": 2.331105571173465e-05, + "loss": 0.4942, + "step": 181140 + }, + { + "epoch": 1.6014250605562332, + "grad_norm": 2.0648934841156006, + "learning_rate": 2.330958232406278e-05, + "loss": 0.4688, + "step": 181150 + }, + { + "epoch": 1.6015134638165456, + "grad_norm": 10.854294776916504, + "learning_rate": 2.3308108936390908e-05, + "loss": 0.6666, + "step": 181160 + }, + { + "epoch": 1.601601867076858, + "grad_norm": 3.3238260746002197, + "learning_rate": 2.330663554871904e-05, + "loss": 0.5312, + "step": 181170 + }, + { + "epoch": 1.60169027033717, + "grad_norm": 1.9797765016555786, + "learning_rate": 2.3305162161047168e-05, + "loss": 0.5256, + "step": 181180 + }, + { + "epoch": 1.6017786735974822, + "grad_norm": 1.6275157928466797, + "learning_rate": 2.3303688773375297e-05, + "loss": 0.4706, + "step": 181190 + }, + { + "epoch": 1.6018670768577945, + "grad_norm": 1.9179420471191406, + "learning_rate": 2.330221538570343e-05, + "loss": 0.5285, + "step": 181200 + }, + { + "epoch": 1.6019554801181068, + "grad_norm": 4.8577880859375, + "learning_rate": 2.3300741998031557e-05, + "loss": 0.7118, + "step": 181210 + }, + { + "epoch": 1.602043883378419, + "grad_norm": 13.904053688049316, + "learning_rate": 2.3299268610359685e-05, + "loss": 0.4638, + "step": 181220 + }, + { + "epoch": 1.602132286638731, + "grad_norm": 4.712271213531494, + "learning_rate": 2.3297795222687813e-05, + "loss": 0.4728, + "step": 181230 + }, + { + "epoch": 1.6022206898990434, + "grad_norm": 4.088210105895996, + "learning_rate": 2.3296321835015945e-05, + "loss": 0.5832, + "step": 181240 + }, + { + "epoch": 1.6023090931593558, + "grad_norm": 4.0730109214782715, + "learning_rate": 2.3294848447344073e-05, + "loss": 0.6342, + "step": 181250 + }, + { + "epoch": 1.6023974964196679, + "grad_norm": 0.8740782737731934, + "learning_rate": 2.3293375059672202e-05, + "loss": 0.626, + "step": 181260 + }, + { + "epoch": 1.6024858996799802, + "grad_norm": 2.648021697998047, + "learning_rate": 2.3291901672000334e-05, + "loss": 0.6943, + "step": 181270 + }, + { + "epoch": 1.6025743029402926, + "grad_norm": 1.7694991827011108, + "learning_rate": 2.3290428284328462e-05, + "loss": 0.5814, + "step": 181280 + }, + { + "epoch": 1.6026627062006047, + "grad_norm": 5.823047637939453, + "learning_rate": 2.328895489665659e-05, + "loss": 0.5385, + "step": 181290 + }, + { + "epoch": 1.6027511094609168, + "grad_norm": 3.57016921043396, + "learning_rate": 2.328748150898472e-05, + "loss": 0.5916, + "step": 181300 + }, + { + "epoch": 1.6028395127212292, + "grad_norm": 2.9068450927734375, + "learning_rate": 2.328600812131285e-05, + "loss": 0.6711, + "step": 181310 + }, + { + "epoch": 1.6029279159815415, + "grad_norm": 1.2106068134307861, + "learning_rate": 2.328453473364098e-05, + "loss": 0.5868, + "step": 181320 + }, + { + "epoch": 1.6030163192418536, + "grad_norm": 12.682123184204102, + "learning_rate": 2.3283061345969107e-05, + "loss": 0.6893, + "step": 181330 + }, + { + "epoch": 1.6031047225021657, + "grad_norm": 3.611449956893921, + "learning_rate": 2.3281587958297235e-05, + "loss": 0.4965, + "step": 181340 + }, + { + "epoch": 1.603193125762478, + "grad_norm": 2.0053744316101074, + "learning_rate": 2.3280114570625367e-05, + "loss": 0.5402, + "step": 181350 + }, + { + "epoch": 1.6032815290227904, + "grad_norm": 2.563060998916626, + "learning_rate": 2.3278641182953496e-05, + "loss": 0.5686, + "step": 181360 + }, + { + "epoch": 1.6033699322831025, + "grad_norm": 13.727355003356934, + "learning_rate": 2.3277167795281624e-05, + "loss": 0.6338, + "step": 181370 + }, + { + "epoch": 1.6034583355434149, + "grad_norm": 2.278502941131592, + "learning_rate": 2.3275694407609756e-05, + "loss": 0.412, + "step": 181380 + }, + { + "epoch": 1.6035467388037272, + "grad_norm": 0.7340344190597534, + "learning_rate": 2.3274221019937884e-05, + "loss": 0.4774, + "step": 181390 + }, + { + "epoch": 1.6036351420640393, + "grad_norm": 16.899831771850586, + "learning_rate": 2.3272747632266012e-05, + "loss": 0.5788, + "step": 181400 + }, + { + "epoch": 1.6037235453243515, + "grad_norm": 1.7707161903381348, + "learning_rate": 2.327127424459414e-05, + "loss": 0.6372, + "step": 181410 + }, + { + "epoch": 1.6038119485846638, + "grad_norm": 2.171081066131592, + "learning_rate": 2.3269800856922272e-05, + "loss": 0.626, + "step": 181420 + }, + { + "epoch": 1.6039003518449761, + "grad_norm": 5.244316101074219, + "learning_rate": 2.32683274692504e-05, + "loss": 0.6371, + "step": 181430 + }, + { + "epoch": 1.6039887551052883, + "grad_norm": 1.4206849336624146, + "learning_rate": 2.326685408157853e-05, + "loss": 0.5649, + "step": 181440 + }, + { + "epoch": 1.6040771583656004, + "grad_norm": 0.7768887281417847, + "learning_rate": 2.326538069390666e-05, + "loss": 0.5145, + "step": 181450 + }, + { + "epoch": 1.6041655616259127, + "grad_norm": 2.4371511936187744, + "learning_rate": 2.326390730623479e-05, + "loss": 0.4989, + "step": 181460 + }, + { + "epoch": 1.604253964886225, + "grad_norm": 1.2818129062652588, + "learning_rate": 2.3262433918562918e-05, + "loss": 0.5708, + "step": 181470 + }, + { + "epoch": 1.6043423681465372, + "grad_norm": 5.305901527404785, + "learning_rate": 2.3260960530891046e-05, + "loss": 0.6164, + "step": 181480 + }, + { + "epoch": 1.6044307714068495, + "grad_norm": 4.615675449371338, + "learning_rate": 2.3259487143219178e-05, + "loss": 0.6454, + "step": 181490 + }, + { + "epoch": 1.6045191746671619, + "grad_norm": 1.7052054405212402, + "learning_rate": 2.3258013755547306e-05, + "loss": 0.497, + "step": 181500 + }, + { + "epoch": 1.604607577927474, + "grad_norm": 1.955332636833191, + "learning_rate": 2.3256540367875434e-05, + "loss": 0.6614, + "step": 181510 + }, + { + "epoch": 1.6046959811877861, + "grad_norm": 3.9738073348999023, + "learning_rate": 2.3255066980203563e-05, + "loss": 0.6741, + "step": 181520 + }, + { + "epoch": 1.6047843844480985, + "grad_norm": 5.6055121421813965, + "learning_rate": 2.3253593592531695e-05, + "loss": 0.5592, + "step": 181530 + }, + { + "epoch": 1.6048727877084108, + "grad_norm": 5.648557186126709, + "learning_rate": 2.3252120204859823e-05, + "loss": 0.6688, + "step": 181540 + }, + { + "epoch": 1.604961190968723, + "grad_norm": 3.946701765060425, + "learning_rate": 2.325064681718795e-05, + "loss": 0.5887, + "step": 181550 + }, + { + "epoch": 1.605049594229035, + "grad_norm": 1.046492338180542, + "learning_rate": 2.3249173429516083e-05, + "loss": 0.5832, + "step": 181560 + }, + { + "epoch": 1.6051379974893474, + "grad_norm": 9.238065719604492, + "learning_rate": 2.324770004184421e-05, + "loss": 0.5267, + "step": 181570 + }, + { + "epoch": 1.6052264007496597, + "grad_norm": 12.647065162658691, + "learning_rate": 2.324622665417234e-05, + "loss": 0.7567, + "step": 181580 + }, + { + "epoch": 1.6053148040099718, + "grad_norm": 3.0393753051757812, + "learning_rate": 2.3244753266500468e-05, + "loss": 0.6261, + "step": 181590 + }, + { + "epoch": 1.6054032072702842, + "grad_norm": 2.342805862426758, + "learning_rate": 2.32432798788286e-05, + "loss": 0.4822, + "step": 181600 + }, + { + "epoch": 1.6054916105305965, + "grad_norm": 4.9356279373168945, + "learning_rate": 2.3241806491156728e-05, + "loss": 0.5146, + "step": 181610 + }, + { + "epoch": 1.6055800137909086, + "grad_norm": 1.2645158767700195, + "learning_rate": 2.3240333103484856e-05, + "loss": 0.766, + "step": 181620 + }, + { + "epoch": 1.6056684170512208, + "grad_norm": 1.230264663696289, + "learning_rate": 2.3238859715812985e-05, + "loss": 0.6112, + "step": 181630 + }, + { + "epoch": 1.605756820311533, + "grad_norm": 1.6895595788955688, + "learning_rate": 2.3237386328141117e-05, + "loss": 0.5351, + "step": 181640 + }, + { + "epoch": 1.6058452235718454, + "grad_norm": 10.520718574523926, + "learning_rate": 2.3235912940469245e-05, + "loss": 0.6176, + "step": 181650 + }, + { + "epoch": 1.6059336268321576, + "grad_norm": 5.648880958557129, + "learning_rate": 2.3234439552797373e-05, + "loss": 0.7447, + "step": 181660 + }, + { + "epoch": 1.6060220300924697, + "grad_norm": 0.617428719997406, + "learning_rate": 2.3232966165125505e-05, + "loss": 0.6657, + "step": 181670 + }, + { + "epoch": 1.606110433352782, + "grad_norm": 7.144290924072266, + "learning_rate": 2.3231492777453633e-05, + "loss": 0.5915, + "step": 181680 + }, + { + "epoch": 1.6061988366130944, + "grad_norm": 2.25575590133667, + "learning_rate": 2.3230019389781762e-05, + "loss": 0.5199, + "step": 181690 + }, + { + "epoch": 1.6062872398734065, + "grad_norm": 2.3102056980133057, + "learning_rate": 2.322854600210989e-05, + "loss": 0.6036, + "step": 181700 + }, + { + "epoch": 1.6063756431337186, + "grad_norm": 1.210795521736145, + "learning_rate": 2.3227072614438022e-05, + "loss": 0.6067, + "step": 181710 + }, + { + "epoch": 1.6064640463940312, + "grad_norm": 1.6011179685592651, + "learning_rate": 2.322559922676615e-05, + "loss": 0.4687, + "step": 181720 + }, + { + "epoch": 1.6065524496543433, + "grad_norm": 4.61318826675415, + "learning_rate": 2.322412583909428e-05, + "loss": 0.7097, + "step": 181730 + }, + { + "epoch": 1.6066408529146554, + "grad_norm": 1.1694315671920776, + "learning_rate": 2.322265245142241e-05, + "loss": 0.5813, + "step": 181740 + }, + { + "epoch": 1.6067292561749678, + "grad_norm": 5.971678733825684, + "learning_rate": 2.322117906375054e-05, + "loss": 0.4506, + "step": 181750 + }, + { + "epoch": 1.60681765943528, + "grad_norm": 4.504195213317871, + "learning_rate": 2.3219705676078667e-05, + "loss": 0.5465, + "step": 181760 + }, + { + "epoch": 1.6069060626955922, + "grad_norm": 1.9132769107818604, + "learning_rate": 2.3218232288406795e-05, + "loss": 0.5942, + "step": 181770 + }, + { + "epoch": 1.6069944659559043, + "grad_norm": 7.634122848510742, + "learning_rate": 2.3216758900734927e-05, + "loss": 0.5525, + "step": 181780 + }, + { + "epoch": 1.6070828692162167, + "grad_norm": 9.518406867980957, + "learning_rate": 2.3215285513063055e-05, + "loss": 0.5315, + "step": 181790 + }, + { + "epoch": 1.607171272476529, + "grad_norm": 7.976248741149902, + "learning_rate": 2.3213812125391184e-05, + "loss": 0.6382, + "step": 181800 + }, + { + "epoch": 1.6072596757368411, + "grad_norm": 2.548813819885254, + "learning_rate": 2.3212338737719312e-05, + "loss": 0.5214, + "step": 181810 + }, + { + "epoch": 1.6073480789971533, + "grad_norm": 1.5473272800445557, + "learning_rate": 2.3210865350047444e-05, + "loss": 0.5252, + "step": 181820 + }, + { + "epoch": 1.6074364822574656, + "grad_norm": 2.76377010345459, + "learning_rate": 2.3209391962375572e-05, + "loss": 0.4888, + "step": 181830 + }, + { + "epoch": 1.607524885517778, + "grad_norm": 2.4098384380340576, + "learning_rate": 2.32079185747037e-05, + "loss": 0.5933, + "step": 181840 + }, + { + "epoch": 1.60761328877809, + "grad_norm": 3.50019907951355, + "learning_rate": 2.3206445187031832e-05, + "loss": 0.6098, + "step": 181850 + }, + { + "epoch": 1.6077016920384024, + "grad_norm": 1.6021910905838013, + "learning_rate": 2.320497179935996e-05, + "loss": 0.4224, + "step": 181860 + }, + { + "epoch": 1.6077900952987147, + "grad_norm": 3.7278025150299072, + "learning_rate": 2.320349841168809e-05, + "loss": 0.6151, + "step": 181870 + }, + { + "epoch": 1.6078784985590269, + "grad_norm": 1.9527655839920044, + "learning_rate": 2.3202025024016217e-05, + "loss": 0.571, + "step": 181880 + }, + { + "epoch": 1.607966901819339, + "grad_norm": 2.862412452697754, + "learning_rate": 2.320055163634435e-05, + "loss": 0.5115, + "step": 181890 + }, + { + "epoch": 1.6080553050796513, + "grad_norm": 2.3258020877838135, + "learning_rate": 2.3199078248672477e-05, + "loss": 0.6083, + "step": 181900 + }, + { + "epoch": 1.6081437083399637, + "grad_norm": 2.944877862930298, + "learning_rate": 2.3197604861000606e-05, + "loss": 0.5423, + "step": 181910 + }, + { + "epoch": 1.6082321116002758, + "grad_norm": 1.8457995653152466, + "learning_rate": 2.3196131473328738e-05, + "loss": 0.4872, + "step": 181920 + }, + { + "epoch": 1.608320514860588, + "grad_norm": 3.075913190841675, + "learning_rate": 2.3194658085656866e-05, + "loss": 0.6344, + "step": 181930 + }, + { + "epoch": 1.6084089181209003, + "grad_norm": 3.6242711544036865, + "learning_rate": 2.3193184697984994e-05, + "loss": 0.5162, + "step": 181940 + }, + { + "epoch": 1.6084973213812126, + "grad_norm": 2.1001744270324707, + "learning_rate": 2.3191711310313126e-05, + "loss": 0.4915, + "step": 181950 + }, + { + "epoch": 1.6085857246415247, + "grad_norm": 2.706205368041992, + "learning_rate": 2.3190237922641254e-05, + "loss": 0.7342, + "step": 181960 + }, + { + "epoch": 1.608674127901837, + "grad_norm": 0.8581461906433105, + "learning_rate": 2.3188764534969383e-05, + "loss": 0.5639, + "step": 181970 + }, + { + "epoch": 1.6087625311621494, + "grad_norm": 3.009218215942383, + "learning_rate": 2.3187291147297514e-05, + "loss": 0.5698, + "step": 181980 + }, + { + "epoch": 1.6088509344224615, + "grad_norm": 1.1869412660598755, + "learning_rate": 2.3185817759625643e-05, + "loss": 0.6848, + "step": 181990 + }, + { + "epoch": 1.6089393376827736, + "grad_norm": 2.014723539352417, + "learning_rate": 2.318434437195377e-05, + "loss": 0.6098, + "step": 182000 + }, + { + "epoch": 1.609027740943086, + "grad_norm": 8.267091751098633, + "learning_rate": 2.3182870984281903e-05, + "loss": 0.546, + "step": 182010 + }, + { + "epoch": 1.6091161442033983, + "grad_norm": 2.47029972076416, + "learning_rate": 2.318139759661003e-05, + "loss": 0.6097, + "step": 182020 + }, + { + "epoch": 1.6092045474637104, + "grad_norm": 1.279463768005371, + "learning_rate": 2.317992420893816e-05, + "loss": 0.5847, + "step": 182030 + }, + { + "epoch": 1.6092929507240226, + "grad_norm": 4.717491149902344, + "learning_rate": 2.317845082126629e-05, + "loss": 0.5985, + "step": 182040 + }, + { + "epoch": 1.609381353984335, + "grad_norm": 17.589853286743164, + "learning_rate": 2.317697743359442e-05, + "loss": 0.5873, + "step": 182050 + }, + { + "epoch": 1.6094697572446472, + "grad_norm": 3.970960855484009, + "learning_rate": 2.3175504045922548e-05, + "loss": 0.6575, + "step": 182060 + }, + { + "epoch": 1.6095581605049594, + "grad_norm": 2.1887660026550293, + "learning_rate": 2.317403065825068e-05, + "loss": 0.5719, + "step": 182070 + }, + { + "epoch": 1.6096465637652717, + "grad_norm": 6.2741007804870605, + "learning_rate": 2.3172557270578808e-05, + "loss": 0.6173, + "step": 182080 + }, + { + "epoch": 1.609734967025584, + "grad_norm": 4.65067195892334, + "learning_rate": 2.3171083882906937e-05, + "loss": 0.5542, + "step": 182090 + }, + { + "epoch": 1.6098233702858962, + "grad_norm": 3.2295310497283936, + "learning_rate": 2.3169610495235065e-05, + "loss": 0.5144, + "step": 182100 + }, + { + "epoch": 1.6099117735462083, + "grad_norm": 3.585876226425171, + "learning_rate": 2.3168137107563197e-05, + "loss": 0.6248, + "step": 182110 + }, + { + "epoch": 1.6100001768065206, + "grad_norm": 1.9754563570022583, + "learning_rate": 2.3166663719891325e-05, + "loss": 0.6352, + "step": 182120 + }, + { + "epoch": 1.610088580066833, + "grad_norm": 1.6661945581436157, + "learning_rate": 2.3165190332219453e-05, + "loss": 0.4967, + "step": 182130 + }, + { + "epoch": 1.610176983327145, + "grad_norm": 0.993270993232727, + "learning_rate": 2.3163716944547585e-05, + "loss": 0.5103, + "step": 182140 + }, + { + "epoch": 1.6102653865874572, + "grad_norm": 11.330484390258789, + "learning_rate": 2.3162243556875713e-05, + "loss": 0.5816, + "step": 182150 + }, + { + "epoch": 1.6103537898477696, + "grad_norm": 3.327075958251953, + "learning_rate": 2.3160770169203842e-05, + "loss": 0.5867, + "step": 182160 + }, + { + "epoch": 1.610442193108082, + "grad_norm": 4.12709903717041, + "learning_rate": 2.315929678153197e-05, + "loss": 0.5012, + "step": 182170 + }, + { + "epoch": 1.610530596368394, + "grad_norm": 2.376232624053955, + "learning_rate": 2.3157823393860102e-05, + "loss": 0.5629, + "step": 182180 + }, + { + "epoch": 1.6106189996287064, + "grad_norm": 1.1024261713027954, + "learning_rate": 2.315635000618823e-05, + "loss": 0.5488, + "step": 182190 + }, + { + "epoch": 1.6107074028890187, + "grad_norm": 5.935842514038086, + "learning_rate": 2.315487661851636e-05, + "loss": 0.5842, + "step": 182200 + }, + { + "epoch": 1.6107958061493308, + "grad_norm": 8.251925468444824, + "learning_rate": 2.315340323084449e-05, + "loss": 0.5993, + "step": 182210 + }, + { + "epoch": 1.610884209409643, + "grad_norm": 1.3976600170135498, + "learning_rate": 2.315192984317262e-05, + "loss": 0.6335, + "step": 182220 + }, + { + "epoch": 1.6109726126699553, + "grad_norm": 10.666242599487305, + "learning_rate": 2.3150456455500747e-05, + "loss": 0.6517, + "step": 182230 + }, + { + "epoch": 1.6110610159302676, + "grad_norm": 4.050312519073486, + "learning_rate": 2.3148983067828875e-05, + "loss": 0.595, + "step": 182240 + }, + { + "epoch": 1.6111494191905797, + "grad_norm": 5.496546268463135, + "learning_rate": 2.3147509680157007e-05, + "loss": 0.5611, + "step": 182250 + }, + { + "epoch": 1.6112378224508919, + "grad_norm": 1.9264631271362305, + "learning_rate": 2.3146036292485135e-05, + "loss": 0.5173, + "step": 182260 + }, + { + "epoch": 1.6113262257112042, + "grad_norm": 2.7983193397521973, + "learning_rate": 2.3144562904813264e-05, + "loss": 0.6491, + "step": 182270 + }, + { + "epoch": 1.6114146289715165, + "grad_norm": 5.818872451782227, + "learning_rate": 2.3143089517141392e-05, + "loss": 0.5475, + "step": 182280 + }, + { + "epoch": 1.6115030322318287, + "grad_norm": 1.6825203895568848, + "learning_rate": 2.3141616129469524e-05, + "loss": 0.6427, + "step": 182290 + }, + { + "epoch": 1.6115914354921408, + "grad_norm": 1.9343751668930054, + "learning_rate": 2.3140142741797652e-05, + "loss": 0.5137, + "step": 182300 + }, + { + "epoch": 1.6116798387524534, + "grad_norm": 13.85970401763916, + "learning_rate": 2.313866935412578e-05, + "loss": 0.6143, + "step": 182310 + }, + { + "epoch": 1.6117682420127655, + "grad_norm": 2.501319646835327, + "learning_rate": 2.3137195966453912e-05, + "loss": 0.5665, + "step": 182320 + }, + { + "epoch": 1.6118566452730776, + "grad_norm": 2.020782709121704, + "learning_rate": 2.313572257878204e-05, + "loss": 0.5982, + "step": 182330 + }, + { + "epoch": 1.61194504853339, + "grad_norm": 3.299557685852051, + "learning_rate": 2.313424919111017e-05, + "loss": 0.5345, + "step": 182340 + }, + { + "epoch": 1.6120334517937023, + "grad_norm": 2.0023138523101807, + "learning_rate": 2.3132775803438297e-05, + "loss": 0.6338, + "step": 182350 + }, + { + "epoch": 1.6121218550540144, + "grad_norm": 1.1032688617706299, + "learning_rate": 2.313130241576643e-05, + "loss": 0.7372, + "step": 182360 + }, + { + "epoch": 1.6122102583143265, + "grad_norm": 1.0570935010910034, + "learning_rate": 2.3129829028094558e-05, + "loss": 0.4742, + "step": 182370 + }, + { + "epoch": 1.6122986615746389, + "grad_norm": 4.266687393188477, + "learning_rate": 2.3128355640422686e-05, + "loss": 0.6086, + "step": 182380 + }, + { + "epoch": 1.6123870648349512, + "grad_norm": 4.972256183624268, + "learning_rate": 2.3126882252750818e-05, + "loss": 0.7079, + "step": 182390 + }, + { + "epoch": 1.6124754680952633, + "grad_norm": 1.728886604309082, + "learning_rate": 2.3125408865078946e-05, + "loss": 0.5929, + "step": 182400 + }, + { + "epoch": 1.6125638713555754, + "grad_norm": 1.6776295900344849, + "learning_rate": 2.3123935477407074e-05, + "loss": 0.6809, + "step": 182410 + }, + { + "epoch": 1.6126522746158878, + "grad_norm": 7.576881408691406, + "learning_rate": 2.3122462089735203e-05, + "loss": 0.6855, + "step": 182420 + }, + { + "epoch": 1.6127406778762001, + "grad_norm": 16.358537673950195, + "learning_rate": 2.3120988702063334e-05, + "loss": 0.6711, + "step": 182430 + }, + { + "epoch": 1.6128290811365122, + "grad_norm": 1.77681303024292, + "learning_rate": 2.3119515314391463e-05, + "loss": 0.5875, + "step": 182440 + }, + { + "epoch": 1.6129174843968246, + "grad_norm": 1.8097878694534302, + "learning_rate": 2.311804192671959e-05, + "loss": 0.5834, + "step": 182450 + }, + { + "epoch": 1.613005887657137, + "grad_norm": 4.5516180992126465, + "learning_rate": 2.311656853904772e-05, + "loss": 0.6121, + "step": 182460 + }, + { + "epoch": 1.613094290917449, + "grad_norm": 8.049764633178711, + "learning_rate": 2.311509515137585e-05, + "loss": 0.6117, + "step": 182470 + }, + { + "epoch": 1.6131826941777612, + "grad_norm": 3.6471707820892334, + "learning_rate": 2.311362176370398e-05, + "loss": 0.6182, + "step": 182480 + }, + { + "epoch": 1.6132710974380735, + "grad_norm": 1.2097395658493042, + "learning_rate": 2.3112148376032108e-05, + "loss": 0.6337, + "step": 182490 + }, + { + "epoch": 1.6133595006983859, + "grad_norm": 1.8433586359024048, + "learning_rate": 2.311067498836024e-05, + "loss": 0.6182, + "step": 182500 + }, + { + "epoch": 1.613447903958698, + "grad_norm": 1.2184592485427856, + "learning_rate": 2.3109201600688368e-05, + "loss": 0.7434, + "step": 182510 + }, + { + "epoch": 1.61353630721901, + "grad_norm": 9.369141578674316, + "learning_rate": 2.3107728213016496e-05, + "loss": 0.6229, + "step": 182520 + }, + { + "epoch": 1.6136247104793224, + "grad_norm": 13.255956649780273, + "learning_rate": 2.3106254825344625e-05, + "loss": 0.6211, + "step": 182530 + }, + { + "epoch": 1.6137131137396348, + "grad_norm": 10.364533424377441, + "learning_rate": 2.3104781437672757e-05, + "loss": 0.7024, + "step": 182540 + }, + { + "epoch": 1.613801516999947, + "grad_norm": 3.991870880126953, + "learning_rate": 2.3103308050000885e-05, + "loss": 0.6564, + "step": 182550 + }, + { + "epoch": 1.6138899202602592, + "grad_norm": 9.33375358581543, + "learning_rate": 2.3101834662329013e-05, + "loss": 0.6453, + "step": 182560 + }, + { + "epoch": 1.6139783235205716, + "grad_norm": 1.5691694021224976, + "learning_rate": 2.3100361274657145e-05, + "loss": 0.4641, + "step": 182570 + }, + { + "epoch": 1.6140667267808837, + "grad_norm": 2.3496265411376953, + "learning_rate": 2.3098887886985273e-05, + "loss": 0.5926, + "step": 182580 + }, + { + "epoch": 1.6141551300411958, + "grad_norm": 6.989462375640869, + "learning_rate": 2.30974144993134e-05, + "loss": 0.54, + "step": 182590 + }, + { + "epoch": 1.6142435333015082, + "grad_norm": 1.6402604579925537, + "learning_rate": 2.309594111164153e-05, + "loss": 0.4626, + "step": 182600 + }, + { + "epoch": 1.6143319365618205, + "grad_norm": 2.9409546852111816, + "learning_rate": 2.3094467723969662e-05, + "loss": 0.5875, + "step": 182610 + }, + { + "epoch": 1.6144203398221326, + "grad_norm": 10.978370666503906, + "learning_rate": 2.309299433629779e-05, + "loss": 0.604, + "step": 182620 + }, + { + "epoch": 1.6145087430824447, + "grad_norm": 5.703006744384766, + "learning_rate": 2.309152094862592e-05, + "loss": 0.5203, + "step": 182630 + }, + { + "epoch": 1.614597146342757, + "grad_norm": 3.4042088985443115, + "learning_rate": 2.3090047560954047e-05, + "loss": 0.7101, + "step": 182640 + }, + { + "epoch": 1.6146855496030694, + "grad_norm": 4.665914535522461, + "learning_rate": 2.308857417328218e-05, + "loss": 0.5858, + "step": 182650 + }, + { + "epoch": 1.6147739528633815, + "grad_norm": 2.900662660598755, + "learning_rate": 2.3087100785610307e-05, + "loss": 0.5946, + "step": 182660 + }, + { + "epoch": 1.6148623561236939, + "grad_norm": 5.946843147277832, + "learning_rate": 2.3085627397938435e-05, + "loss": 0.563, + "step": 182670 + }, + { + "epoch": 1.6149507593840062, + "grad_norm": 13.57431697845459, + "learning_rate": 2.3084154010266567e-05, + "loss": 0.6278, + "step": 182680 + }, + { + "epoch": 1.6150391626443183, + "grad_norm": 1.0160492658615112, + "learning_rate": 2.3082680622594695e-05, + "loss": 0.5475, + "step": 182690 + }, + { + "epoch": 1.6151275659046305, + "grad_norm": 1.1510932445526123, + "learning_rate": 2.3081207234922824e-05, + "loss": 0.5453, + "step": 182700 + }, + { + "epoch": 1.6152159691649428, + "grad_norm": 2.0452396869659424, + "learning_rate": 2.3079733847250952e-05, + "loss": 0.6262, + "step": 182710 + }, + { + "epoch": 1.6153043724252552, + "grad_norm": 1.8475239276885986, + "learning_rate": 2.3078260459579084e-05, + "loss": 0.5877, + "step": 182720 + }, + { + "epoch": 1.6153927756855673, + "grad_norm": 2.4507598876953125, + "learning_rate": 2.3076787071907212e-05, + "loss": 0.6673, + "step": 182730 + }, + { + "epoch": 1.6154811789458794, + "grad_norm": 8.006498336791992, + "learning_rate": 2.307531368423534e-05, + "loss": 0.5106, + "step": 182740 + }, + { + "epoch": 1.6155695822061917, + "grad_norm": 2.00193190574646, + "learning_rate": 2.307384029656347e-05, + "loss": 0.6629, + "step": 182750 + }, + { + "epoch": 1.615657985466504, + "grad_norm": 1.2723567485809326, + "learning_rate": 2.30723669088916e-05, + "loss": 0.6077, + "step": 182760 + }, + { + "epoch": 1.6157463887268162, + "grad_norm": 11.683198928833008, + "learning_rate": 2.307089352121973e-05, + "loss": 0.6445, + "step": 182770 + }, + { + "epoch": 1.6158347919871285, + "grad_norm": 2.3526580333709717, + "learning_rate": 2.3069420133547857e-05, + "loss": 0.5111, + "step": 182780 + }, + { + "epoch": 1.6159231952474409, + "grad_norm": 3.677626848220825, + "learning_rate": 2.306794674587599e-05, + "loss": 0.6075, + "step": 182790 + }, + { + "epoch": 1.616011598507753, + "grad_norm": 1.0365103483200073, + "learning_rate": 2.3066473358204117e-05, + "loss": 0.4695, + "step": 182800 + }, + { + "epoch": 1.6161000017680651, + "grad_norm": 1.7588698863983154, + "learning_rate": 2.3064999970532246e-05, + "loss": 0.5403, + "step": 182810 + }, + { + "epoch": 1.6161884050283775, + "grad_norm": 1.7166516780853271, + "learning_rate": 2.3063526582860374e-05, + "loss": 0.5617, + "step": 182820 + }, + { + "epoch": 1.6162768082886898, + "grad_norm": 4.186545372009277, + "learning_rate": 2.3062053195188506e-05, + "loss": 0.586, + "step": 182830 + }, + { + "epoch": 1.616365211549002, + "grad_norm": 13.704556465148926, + "learning_rate": 2.3060579807516634e-05, + "loss": 0.6511, + "step": 182840 + }, + { + "epoch": 1.616453614809314, + "grad_norm": 13.228602409362793, + "learning_rate": 2.3059106419844763e-05, + "loss": 0.5323, + "step": 182850 + }, + { + "epoch": 1.6165420180696264, + "grad_norm": 5.97099494934082, + "learning_rate": 2.3057633032172894e-05, + "loss": 0.597, + "step": 182860 + }, + { + "epoch": 1.6166304213299387, + "grad_norm": 1.8951526880264282, + "learning_rate": 2.3056159644501023e-05, + "loss": 0.5811, + "step": 182870 + }, + { + "epoch": 1.6167188245902508, + "grad_norm": 3.4787559509277344, + "learning_rate": 2.3054686256829154e-05, + "loss": 0.5543, + "step": 182880 + }, + { + "epoch": 1.616807227850563, + "grad_norm": 3.7139663696289062, + "learning_rate": 2.3053212869157283e-05, + "loss": 0.6589, + "step": 182890 + }, + { + "epoch": 1.6168956311108755, + "grad_norm": 5.257376670837402, + "learning_rate": 2.305173948148541e-05, + "loss": 0.6923, + "step": 182900 + }, + { + "epoch": 1.6169840343711877, + "grad_norm": 3.8011438846588135, + "learning_rate": 2.3050266093813543e-05, + "loss": 0.7006, + "step": 182910 + }, + { + "epoch": 1.6170724376314998, + "grad_norm": 2.4830210208892822, + "learning_rate": 2.304879270614167e-05, + "loss": 0.6573, + "step": 182920 + }, + { + "epoch": 1.6171608408918121, + "grad_norm": 0.9808798432350159, + "learning_rate": 2.30473193184698e-05, + "loss": 0.5592, + "step": 182930 + }, + { + "epoch": 1.6172492441521245, + "grad_norm": 1.3411725759506226, + "learning_rate": 2.304584593079793e-05, + "loss": 0.5643, + "step": 182940 + }, + { + "epoch": 1.6173376474124366, + "grad_norm": 1.004717469215393, + "learning_rate": 2.304437254312606e-05, + "loss": 0.5945, + "step": 182950 + }, + { + "epoch": 1.6174260506727487, + "grad_norm": 3.487877130508423, + "learning_rate": 2.3042899155454188e-05, + "loss": 0.6477, + "step": 182960 + }, + { + "epoch": 1.617514453933061, + "grad_norm": 5.3697638511657715, + "learning_rate": 2.304142576778232e-05, + "loss": 0.4594, + "step": 182970 + }, + { + "epoch": 1.6176028571933734, + "grad_norm": 2.046640396118164, + "learning_rate": 2.3039952380110448e-05, + "loss": 0.6383, + "step": 182980 + }, + { + "epoch": 1.6176912604536855, + "grad_norm": 9.25212574005127, + "learning_rate": 2.3038478992438576e-05, + "loss": 0.6077, + "step": 182990 + }, + { + "epoch": 1.6177796637139976, + "grad_norm": 2.4157204627990723, + "learning_rate": 2.3037005604766705e-05, + "loss": 0.6537, + "step": 183000 + }, + { + "epoch": 1.61786806697431, + "grad_norm": 5.2383713722229, + "learning_rate": 2.3035532217094837e-05, + "loss": 0.5277, + "step": 183010 + }, + { + "epoch": 1.6179564702346223, + "grad_norm": 5.388779163360596, + "learning_rate": 2.3034058829422965e-05, + "loss": 0.581, + "step": 183020 + }, + { + "epoch": 1.6180448734949344, + "grad_norm": 3.4251158237457275, + "learning_rate": 2.3032585441751093e-05, + "loss": 0.6993, + "step": 183030 + }, + { + "epoch": 1.6181332767552468, + "grad_norm": 0.8551393151283264, + "learning_rate": 2.3031112054079225e-05, + "loss": 0.6008, + "step": 183040 + }, + { + "epoch": 1.618221680015559, + "grad_norm": 1.4848769903182983, + "learning_rate": 2.3029638666407353e-05, + "loss": 0.5718, + "step": 183050 + }, + { + "epoch": 1.6183100832758712, + "grad_norm": 4.130042552947998, + "learning_rate": 2.3028165278735482e-05, + "loss": 0.5397, + "step": 183060 + }, + { + "epoch": 1.6183984865361833, + "grad_norm": 13.945141792297363, + "learning_rate": 2.302669189106361e-05, + "loss": 0.6131, + "step": 183070 + }, + { + "epoch": 1.6184868897964957, + "grad_norm": 1.6707444190979004, + "learning_rate": 2.3025218503391742e-05, + "loss": 0.439, + "step": 183080 + }, + { + "epoch": 1.618575293056808, + "grad_norm": 2.354923725128174, + "learning_rate": 2.302374511571987e-05, + "loss": 0.5689, + "step": 183090 + }, + { + "epoch": 1.6186636963171201, + "grad_norm": 1.6011011600494385, + "learning_rate": 2.3022271728048e-05, + "loss": 0.5743, + "step": 183100 + }, + { + "epoch": 1.6187520995774323, + "grad_norm": 1.0173894166946411, + "learning_rate": 2.3020798340376127e-05, + "loss": 0.5217, + "step": 183110 + }, + { + "epoch": 1.6188405028377446, + "grad_norm": 1.8395146131515503, + "learning_rate": 2.301932495270426e-05, + "loss": 0.5786, + "step": 183120 + }, + { + "epoch": 1.618928906098057, + "grad_norm": 4.783468246459961, + "learning_rate": 2.3017851565032387e-05, + "loss": 0.53, + "step": 183130 + }, + { + "epoch": 1.619017309358369, + "grad_norm": 5.537186145782471, + "learning_rate": 2.3016378177360515e-05, + "loss": 0.6554, + "step": 183140 + }, + { + "epoch": 1.6191057126186814, + "grad_norm": 2.122586727142334, + "learning_rate": 2.3014904789688647e-05, + "loss": 0.6046, + "step": 183150 + }, + { + "epoch": 1.6191941158789938, + "grad_norm": 7.516603469848633, + "learning_rate": 2.3013431402016775e-05, + "loss": 0.5209, + "step": 183160 + }, + { + "epoch": 1.6192825191393059, + "grad_norm": 2.1614632606506348, + "learning_rate": 2.3011958014344904e-05, + "loss": 0.5478, + "step": 183170 + }, + { + "epoch": 1.619370922399618, + "grad_norm": 2.4288723468780518, + "learning_rate": 2.3010484626673032e-05, + "loss": 0.6448, + "step": 183180 + }, + { + "epoch": 1.6194593256599303, + "grad_norm": 3.185554027557373, + "learning_rate": 2.3009011239001164e-05, + "loss": 0.4708, + "step": 183190 + }, + { + "epoch": 1.6195477289202427, + "grad_norm": 1.534436583518982, + "learning_rate": 2.3007537851329292e-05, + "loss": 0.5635, + "step": 183200 + }, + { + "epoch": 1.6196361321805548, + "grad_norm": 11.226266860961914, + "learning_rate": 2.300606446365742e-05, + "loss": 0.5708, + "step": 183210 + }, + { + "epoch": 1.619724535440867, + "grad_norm": 2.137450933456421, + "learning_rate": 2.300459107598555e-05, + "loss": 0.7895, + "step": 183220 + }, + { + "epoch": 1.6198129387011793, + "grad_norm": 4.291244029998779, + "learning_rate": 2.300311768831368e-05, + "loss": 0.6136, + "step": 183230 + }, + { + "epoch": 1.6199013419614916, + "grad_norm": 1.8652474880218506, + "learning_rate": 2.300164430064181e-05, + "loss": 0.6109, + "step": 183240 + }, + { + "epoch": 1.6199897452218037, + "grad_norm": 2.3695945739746094, + "learning_rate": 2.3000170912969937e-05, + "loss": 0.4908, + "step": 183250 + }, + { + "epoch": 1.620078148482116, + "grad_norm": 4.933897972106934, + "learning_rate": 2.299869752529807e-05, + "loss": 0.6101, + "step": 183260 + }, + { + "epoch": 1.6201665517424284, + "grad_norm": 1.580335259437561, + "learning_rate": 2.2997224137626197e-05, + "loss": 0.5333, + "step": 183270 + }, + { + "epoch": 1.6202549550027405, + "grad_norm": 2.3352818489074707, + "learning_rate": 2.2995750749954326e-05, + "loss": 0.6282, + "step": 183280 + }, + { + "epoch": 1.6203433582630526, + "grad_norm": 7.141417026519775, + "learning_rate": 2.2994277362282454e-05, + "loss": 0.7438, + "step": 183290 + }, + { + "epoch": 1.620431761523365, + "grad_norm": 3.556039810180664, + "learning_rate": 2.2992803974610586e-05, + "loss": 0.5468, + "step": 183300 + }, + { + "epoch": 1.6205201647836773, + "grad_norm": 4.072267055511475, + "learning_rate": 2.2991330586938714e-05, + "loss": 0.6712, + "step": 183310 + }, + { + "epoch": 1.6206085680439895, + "grad_norm": 2.012181282043457, + "learning_rate": 2.2989857199266843e-05, + "loss": 0.624, + "step": 183320 + }, + { + "epoch": 1.6206969713043016, + "grad_norm": 1.802600622177124, + "learning_rate": 2.2988383811594974e-05, + "loss": 0.5486, + "step": 183330 + }, + { + "epoch": 1.620785374564614, + "grad_norm": 9.85913372039795, + "learning_rate": 2.2986910423923103e-05, + "loss": 0.646, + "step": 183340 + }, + { + "epoch": 1.6208737778249263, + "grad_norm": 1.4825754165649414, + "learning_rate": 2.298543703625123e-05, + "loss": 0.6334, + "step": 183350 + }, + { + "epoch": 1.6209621810852384, + "grad_norm": 1.2595270872116089, + "learning_rate": 2.298396364857936e-05, + "loss": 0.6473, + "step": 183360 + }, + { + "epoch": 1.6210505843455507, + "grad_norm": 1.9350718259811401, + "learning_rate": 2.298249026090749e-05, + "loss": 0.6663, + "step": 183370 + }, + { + "epoch": 1.621138987605863, + "grad_norm": 2.0129005908966064, + "learning_rate": 2.298101687323562e-05, + "loss": 0.5177, + "step": 183380 + }, + { + "epoch": 1.6212273908661752, + "grad_norm": 2.1219167709350586, + "learning_rate": 2.2979543485563748e-05, + "loss": 0.4817, + "step": 183390 + }, + { + "epoch": 1.6213157941264873, + "grad_norm": 1.235468864440918, + "learning_rate": 2.2978070097891876e-05, + "loss": 0.5827, + "step": 183400 + }, + { + "epoch": 1.6214041973867996, + "grad_norm": 4.386280536651611, + "learning_rate": 2.2976596710220008e-05, + "loss": 0.5182, + "step": 183410 + }, + { + "epoch": 1.621492600647112, + "grad_norm": 1.6221117973327637, + "learning_rate": 2.2975123322548136e-05, + "loss": 0.5621, + "step": 183420 + }, + { + "epoch": 1.621581003907424, + "grad_norm": 1.7423888444900513, + "learning_rate": 2.2973649934876265e-05, + "loss": 0.5202, + "step": 183430 + }, + { + "epoch": 1.6216694071677362, + "grad_norm": 3.144033908843994, + "learning_rate": 2.2972176547204396e-05, + "loss": 0.6239, + "step": 183440 + }, + { + "epoch": 1.6217578104280486, + "grad_norm": 3.2087838649749756, + "learning_rate": 2.2970703159532525e-05, + "loss": 0.6817, + "step": 183450 + }, + { + "epoch": 1.621846213688361, + "grad_norm": 2.837109088897705, + "learning_rate": 2.2969229771860653e-05, + "loss": 0.6833, + "step": 183460 + }, + { + "epoch": 1.621934616948673, + "grad_norm": 1.2741280794143677, + "learning_rate": 2.296775638418878e-05, + "loss": 0.5932, + "step": 183470 + }, + { + "epoch": 1.6220230202089851, + "grad_norm": 3.0609824657440186, + "learning_rate": 2.2966282996516913e-05, + "loss": 0.5633, + "step": 183480 + }, + { + "epoch": 1.6221114234692977, + "grad_norm": 9.062402725219727, + "learning_rate": 2.296480960884504e-05, + "loss": 0.5761, + "step": 183490 + }, + { + "epoch": 1.6221998267296098, + "grad_norm": 1.9718917608261108, + "learning_rate": 2.296333622117317e-05, + "loss": 0.6681, + "step": 183500 + }, + { + "epoch": 1.622288229989922, + "grad_norm": 6.590805530548096, + "learning_rate": 2.2961862833501302e-05, + "loss": 0.5325, + "step": 183510 + }, + { + "epoch": 1.6223766332502343, + "grad_norm": 2.291339159011841, + "learning_rate": 2.296038944582943e-05, + "loss": 0.5877, + "step": 183520 + }, + { + "epoch": 1.6224650365105466, + "grad_norm": 2.2934045791625977, + "learning_rate": 2.295891605815756e-05, + "loss": 0.6308, + "step": 183530 + }, + { + "epoch": 1.6225534397708588, + "grad_norm": 2.509108781814575, + "learning_rate": 2.2957442670485687e-05, + "loss": 0.5279, + "step": 183540 + }, + { + "epoch": 1.6226418430311709, + "grad_norm": 2.1688010692596436, + "learning_rate": 2.295596928281382e-05, + "loss": 0.537, + "step": 183550 + }, + { + "epoch": 1.6227302462914832, + "grad_norm": 3.1773228645324707, + "learning_rate": 2.2954495895141947e-05, + "loss": 0.5855, + "step": 183560 + }, + { + "epoch": 1.6228186495517956, + "grad_norm": 1.686359167098999, + "learning_rate": 2.2953022507470075e-05, + "loss": 0.5335, + "step": 183570 + }, + { + "epoch": 1.6229070528121077, + "grad_norm": 2.2279937267303467, + "learning_rate": 2.2951549119798204e-05, + "loss": 0.6343, + "step": 183580 + }, + { + "epoch": 1.6229954560724198, + "grad_norm": 2.9656448364257812, + "learning_rate": 2.2950075732126335e-05, + "loss": 0.6324, + "step": 183590 + }, + { + "epoch": 1.6230838593327321, + "grad_norm": 31.457103729248047, + "learning_rate": 2.2948602344454464e-05, + "loss": 0.6671, + "step": 183600 + }, + { + "epoch": 1.6231722625930445, + "grad_norm": 2.75780987739563, + "learning_rate": 2.2947128956782592e-05, + "loss": 0.7377, + "step": 183610 + }, + { + "epoch": 1.6232606658533566, + "grad_norm": 1.5926953554153442, + "learning_rate": 2.2945655569110724e-05, + "loss": 0.603, + "step": 183620 + }, + { + "epoch": 1.623349069113669, + "grad_norm": 3.640963554382324, + "learning_rate": 2.2944182181438852e-05, + "loss": 0.583, + "step": 183630 + }, + { + "epoch": 1.6234374723739813, + "grad_norm": 2.1036155223846436, + "learning_rate": 2.294270879376698e-05, + "loss": 0.5616, + "step": 183640 + }, + { + "epoch": 1.6235258756342934, + "grad_norm": 4.902594089508057, + "learning_rate": 2.294123540609511e-05, + "loss": 0.6791, + "step": 183650 + }, + { + "epoch": 1.6236142788946055, + "grad_norm": 1.7234135866165161, + "learning_rate": 2.293976201842324e-05, + "loss": 0.6306, + "step": 183660 + }, + { + "epoch": 1.6237026821549179, + "grad_norm": 3.0011208057403564, + "learning_rate": 2.293828863075137e-05, + "loss": 0.5174, + "step": 183670 + }, + { + "epoch": 1.6237910854152302, + "grad_norm": 4.035562992095947, + "learning_rate": 2.2936815243079497e-05, + "loss": 0.4826, + "step": 183680 + }, + { + "epoch": 1.6238794886755423, + "grad_norm": 1.7754682302474976, + "learning_rate": 2.293534185540763e-05, + "loss": 0.6573, + "step": 183690 + }, + { + "epoch": 1.6239678919358544, + "grad_norm": 0.8616647720336914, + "learning_rate": 2.2933868467735757e-05, + "loss": 0.6085, + "step": 183700 + }, + { + "epoch": 1.6240562951961668, + "grad_norm": 4.318363666534424, + "learning_rate": 2.2932395080063886e-05, + "loss": 0.6725, + "step": 183710 + }, + { + "epoch": 1.6241446984564791, + "grad_norm": 2.096954345703125, + "learning_rate": 2.2930921692392014e-05, + "loss": 0.5702, + "step": 183720 + }, + { + "epoch": 1.6242331017167912, + "grad_norm": 4.886160373687744, + "learning_rate": 2.2929448304720146e-05, + "loss": 0.5545, + "step": 183730 + }, + { + "epoch": 1.6243215049771036, + "grad_norm": 1.7509719133377075, + "learning_rate": 2.2927974917048274e-05, + "loss": 0.6568, + "step": 183740 + }, + { + "epoch": 1.624409908237416, + "grad_norm": 1.8688541650772095, + "learning_rate": 2.2926501529376403e-05, + "loss": 0.7127, + "step": 183750 + }, + { + "epoch": 1.624498311497728, + "grad_norm": 1.086142659187317, + "learning_rate": 2.2925028141704534e-05, + "loss": 0.4725, + "step": 183760 + }, + { + "epoch": 1.6245867147580402, + "grad_norm": 5.558563709259033, + "learning_rate": 2.2923554754032663e-05, + "loss": 0.6067, + "step": 183770 + }, + { + "epoch": 1.6246751180183525, + "grad_norm": 2.00053071975708, + "learning_rate": 2.292208136636079e-05, + "loss": 0.5779, + "step": 183780 + }, + { + "epoch": 1.6247635212786649, + "grad_norm": 2.470217227935791, + "learning_rate": 2.2920607978688923e-05, + "loss": 0.6558, + "step": 183790 + }, + { + "epoch": 1.624851924538977, + "grad_norm": 1.4910800457000732, + "learning_rate": 2.291913459101705e-05, + "loss": 0.5049, + "step": 183800 + }, + { + "epoch": 1.624940327799289, + "grad_norm": 3.218100070953369, + "learning_rate": 2.291766120334518e-05, + "loss": 0.5279, + "step": 183810 + }, + { + "epoch": 1.6250287310596014, + "grad_norm": 4.51097297668457, + "learning_rate": 2.291618781567331e-05, + "loss": 0.6609, + "step": 183820 + }, + { + "epoch": 1.6251171343199138, + "grad_norm": 3.820086717605591, + "learning_rate": 2.291471442800144e-05, + "loss": 0.6391, + "step": 183830 + }, + { + "epoch": 1.625205537580226, + "grad_norm": 2.5522615909576416, + "learning_rate": 2.2913241040329568e-05, + "loss": 0.6297, + "step": 183840 + }, + { + "epoch": 1.6252939408405382, + "grad_norm": 1.307796597480774, + "learning_rate": 2.29117676526577e-05, + "loss": 0.6185, + "step": 183850 + }, + { + "epoch": 1.6253823441008506, + "grad_norm": 5.658438682556152, + "learning_rate": 2.2910294264985828e-05, + "loss": 0.765, + "step": 183860 + }, + { + "epoch": 1.6254707473611627, + "grad_norm": 2.9527199268341064, + "learning_rate": 2.2908820877313956e-05, + "loss": 0.5706, + "step": 183870 + }, + { + "epoch": 1.6255591506214748, + "grad_norm": 6.69216775894165, + "learning_rate": 2.2907347489642088e-05, + "loss": 0.5828, + "step": 183880 + }, + { + "epoch": 1.6256475538817872, + "grad_norm": 2.156167984008789, + "learning_rate": 2.2905874101970216e-05, + "loss": 0.5231, + "step": 183890 + }, + { + "epoch": 1.6257359571420995, + "grad_norm": 18.370405197143555, + "learning_rate": 2.2904400714298345e-05, + "loss": 0.7013, + "step": 183900 + }, + { + "epoch": 1.6258243604024116, + "grad_norm": 7.3644609451293945, + "learning_rate": 2.2902927326626477e-05, + "loss": 0.6461, + "step": 183910 + }, + { + "epoch": 1.6259127636627237, + "grad_norm": 5.743333339691162, + "learning_rate": 2.2901453938954605e-05, + "loss": 0.6395, + "step": 183920 + }, + { + "epoch": 1.626001166923036, + "grad_norm": 1.135197401046753, + "learning_rate": 2.2899980551282733e-05, + "loss": 0.6282, + "step": 183930 + }, + { + "epoch": 1.6260895701833484, + "grad_norm": 1.5497599840164185, + "learning_rate": 2.289850716361086e-05, + "loss": 0.5932, + "step": 183940 + }, + { + "epoch": 1.6261779734436606, + "grad_norm": 1.3106706142425537, + "learning_rate": 2.2897033775938993e-05, + "loss": 0.624, + "step": 183950 + }, + { + "epoch": 1.626266376703973, + "grad_norm": 1.3661928176879883, + "learning_rate": 2.289556038826712e-05, + "loss": 0.6013, + "step": 183960 + }, + { + "epoch": 1.6263547799642852, + "grad_norm": 1.613750696182251, + "learning_rate": 2.289408700059525e-05, + "loss": 0.5678, + "step": 183970 + }, + { + "epoch": 1.6264431832245974, + "grad_norm": 1.8886228799819946, + "learning_rate": 2.2892613612923382e-05, + "loss": 0.6511, + "step": 183980 + }, + { + "epoch": 1.6265315864849095, + "grad_norm": 3.687861204147339, + "learning_rate": 2.289114022525151e-05, + "loss": 0.5881, + "step": 183990 + }, + { + "epoch": 1.6266199897452218, + "grad_norm": 1.4639226198196411, + "learning_rate": 2.288966683757964e-05, + "loss": 0.6218, + "step": 184000 + }, + { + "epoch": 1.6267083930055342, + "grad_norm": 0.9775075316429138, + "learning_rate": 2.2888193449907767e-05, + "loss": 0.614, + "step": 184010 + }, + { + "epoch": 1.6267967962658463, + "grad_norm": 4.130499839782715, + "learning_rate": 2.28867200622359e-05, + "loss": 0.5666, + "step": 184020 + }, + { + "epoch": 1.6268851995261584, + "grad_norm": 7.859674453735352, + "learning_rate": 2.2885246674564027e-05, + "loss": 0.6281, + "step": 184030 + }, + { + "epoch": 1.6269736027864707, + "grad_norm": 0.9946154952049255, + "learning_rate": 2.2883773286892155e-05, + "loss": 0.5167, + "step": 184040 + }, + { + "epoch": 1.627062006046783, + "grad_norm": 5.320554733276367, + "learning_rate": 2.2882299899220284e-05, + "loss": 0.6229, + "step": 184050 + }, + { + "epoch": 1.6271504093070952, + "grad_norm": 1.5439127683639526, + "learning_rate": 2.2880826511548415e-05, + "loss": 0.576, + "step": 184060 + }, + { + "epoch": 1.6272388125674073, + "grad_norm": 6.699517250061035, + "learning_rate": 2.2879353123876544e-05, + "loss": 0.5049, + "step": 184070 + }, + { + "epoch": 1.6273272158277199, + "grad_norm": 2.4477782249450684, + "learning_rate": 2.2877879736204672e-05, + "loss": 0.4145, + "step": 184080 + }, + { + "epoch": 1.627415619088032, + "grad_norm": 3.1822073459625244, + "learning_rate": 2.2876406348532804e-05, + "loss": 0.6304, + "step": 184090 + }, + { + "epoch": 1.6275040223483441, + "grad_norm": 9.732050895690918, + "learning_rate": 2.2874932960860932e-05, + "loss": 0.773, + "step": 184100 + }, + { + "epoch": 1.6275924256086565, + "grad_norm": 1.589491844177246, + "learning_rate": 2.287345957318906e-05, + "loss": 0.7218, + "step": 184110 + }, + { + "epoch": 1.6276808288689688, + "grad_norm": 5.9442830085754395, + "learning_rate": 2.287198618551719e-05, + "loss": 0.665, + "step": 184120 + }, + { + "epoch": 1.627769232129281, + "grad_norm": 1.9216110706329346, + "learning_rate": 2.287051279784532e-05, + "loss": 0.6914, + "step": 184130 + }, + { + "epoch": 1.627857635389593, + "grad_norm": 2.534881591796875, + "learning_rate": 2.286903941017345e-05, + "loss": 0.6321, + "step": 184140 + }, + { + "epoch": 1.6279460386499054, + "grad_norm": 9.885744094848633, + "learning_rate": 2.2867566022501577e-05, + "loss": 0.6662, + "step": 184150 + }, + { + "epoch": 1.6280344419102177, + "grad_norm": 1.8704266548156738, + "learning_rate": 2.286609263482971e-05, + "loss": 0.6705, + "step": 184160 + }, + { + "epoch": 1.6281228451705299, + "grad_norm": 0.8257664442062378, + "learning_rate": 2.2864619247157837e-05, + "loss": 0.565, + "step": 184170 + }, + { + "epoch": 1.628211248430842, + "grad_norm": 4.256963729858398, + "learning_rate": 2.2863145859485966e-05, + "loss": 0.5459, + "step": 184180 + }, + { + "epoch": 1.6282996516911543, + "grad_norm": 1.2728620767593384, + "learning_rate": 2.2861672471814094e-05, + "loss": 0.7425, + "step": 184190 + }, + { + "epoch": 1.6283880549514667, + "grad_norm": 12.854273796081543, + "learning_rate": 2.2860199084142226e-05, + "loss": 0.5235, + "step": 184200 + }, + { + "epoch": 1.6284764582117788, + "grad_norm": 1.6517729759216309, + "learning_rate": 2.2858725696470354e-05, + "loss": 0.5652, + "step": 184210 + }, + { + "epoch": 1.6285648614720911, + "grad_norm": 1.1881617307662964, + "learning_rate": 2.2857252308798483e-05, + "loss": 0.5657, + "step": 184220 + }, + { + "epoch": 1.6286532647324035, + "grad_norm": 5.583603858947754, + "learning_rate": 2.285577892112661e-05, + "loss": 0.6541, + "step": 184230 + }, + { + "epoch": 1.6287416679927156, + "grad_norm": 2.6169891357421875, + "learning_rate": 2.2854305533454743e-05, + "loss": 0.6686, + "step": 184240 + }, + { + "epoch": 1.6288300712530277, + "grad_norm": 1.1197354793548584, + "learning_rate": 2.285283214578287e-05, + "loss": 0.7222, + "step": 184250 + }, + { + "epoch": 1.62891847451334, + "grad_norm": 1.1425940990447998, + "learning_rate": 2.2851358758111e-05, + "loss": 0.6223, + "step": 184260 + }, + { + "epoch": 1.6290068777736524, + "grad_norm": 5.089562892913818, + "learning_rate": 2.284988537043913e-05, + "loss": 0.5669, + "step": 184270 + }, + { + "epoch": 1.6290952810339645, + "grad_norm": 3.5676016807556152, + "learning_rate": 2.284841198276726e-05, + "loss": 0.6697, + "step": 184280 + }, + { + "epoch": 1.6291836842942766, + "grad_norm": 2.2371551990509033, + "learning_rate": 2.2846938595095388e-05, + "loss": 0.6177, + "step": 184290 + }, + { + "epoch": 1.629272087554589, + "grad_norm": 1.5458804368972778, + "learning_rate": 2.2845465207423516e-05, + "loss": 0.493, + "step": 184300 + }, + { + "epoch": 1.6293604908149013, + "grad_norm": 31.324037551879883, + "learning_rate": 2.2843991819751648e-05, + "loss": 0.4658, + "step": 184310 + }, + { + "epoch": 1.6294488940752134, + "grad_norm": 2.5619683265686035, + "learning_rate": 2.2842518432079776e-05, + "loss": 0.6893, + "step": 184320 + }, + { + "epoch": 1.6295372973355258, + "grad_norm": 7.988354682922363, + "learning_rate": 2.2841045044407905e-05, + "loss": 0.4634, + "step": 184330 + }, + { + "epoch": 1.6296257005958381, + "grad_norm": 2.203214406967163, + "learning_rate": 2.2839571656736033e-05, + "loss": 0.5366, + "step": 184340 + }, + { + "epoch": 1.6297141038561502, + "grad_norm": 6.620633602142334, + "learning_rate": 2.2838098269064165e-05, + "loss": 0.5371, + "step": 184350 + }, + { + "epoch": 1.6298025071164624, + "grad_norm": 6.454627990722656, + "learning_rate": 2.2836624881392293e-05, + "loss": 0.624, + "step": 184360 + }, + { + "epoch": 1.6298909103767747, + "grad_norm": 1.9121379852294922, + "learning_rate": 2.283515149372042e-05, + "loss": 0.5148, + "step": 184370 + }, + { + "epoch": 1.629979313637087, + "grad_norm": 2.0181052684783936, + "learning_rate": 2.2833678106048553e-05, + "loss": 0.5305, + "step": 184380 + }, + { + "epoch": 1.6300677168973992, + "grad_norm": 4.650597095489502, + "learning_rate": 2.283220471837668e-05, + "loss": 0.669, + "step": 184390 + }, + { + "epoch": 1.6301561201577113, + "grad_norm": 2.9797658920288086, + "learning_rate": 2.283073133070481e-05, + "loss": 0.7286, + "step": 184400 + }, + { + "epoch": 1.6302445234180236, + "grad_norm": 1.9700895547866821, + "learning_rate": 2.2829257943032938e-05, + "loss": 0.577, + "step": 184410 + }, + { + "epoch": 1.630332926678336, + "grad_norm": 2.63667893409729, + "learning_rate": 2.282778455536107e-05, + "loss": 0.5794, + "step": 184420 + }, + { + "epoch": 1.630421329938648, + "grad_norm": 1.9961471557617188, + "learning_rate": 2.28263111676892e-05, + "loss": 0.69, + "step": 184430 + }, + { + "epoch": 1.6305097331989604, + "grad_norm": 3.482682943344116, + "learning_rate": 2.2824837780017327e-05, + "loss": 0.6993, + "step": 184440 + }, + { + "epoch": 1.6305981364592728, + "grad_norm": 2.6156997680664062, + "learning_rate": 2.282336439234546e-05, + "loss": 0.5546, + "step": 184450 + }, + { + "epoch": 1.6306865397195849, + "grad_norm": 2.978020429611206, + "learning_rate": 2.2821891004673587e-05, + "loss": 0.5797, + "step": 184460 + }, + { + "epoch": 1.630774942979897, + "grad_norm": 3.4677491188049316, + "learning_rate": 2.2820417617001715e-05, + "loss": 0.6331, + "step": 184470 + }, + { + "epoch": 1.6308633462402093, + "grad_norm": 1.6658741235733032, + "learning_rate": 2.2818944229329844e-05, + "loss": 0.6836, + "step": 184480 + }, + { + "epoch": 1.6309517495005217, + "grad_norm": 1.7685045003890991, + "learning_rate": 2.2817470841657975e-05, + "loss": 0.5463, + "step": 184490 + }, + { + "epoch": 1.6310401527608338, + "grad_norm": 8.338022232055664, + "learning_rate": 2.2815997453986104e-05, + "loss": 0.6075, + "step": 184500 + }, + { + "epoch": 1.631128556021146, + "grad_norm": 3.6579251289367676, + "learning_rate": 2.2814524066314232e-05, + "loss": 0.6168, + "step": 184510 + }, + { + "epoch": 1.6312169592814583, + "grad_norm": 5.747318744659424, + "learning_rate": 2.281305067864236e-05, + "loss": 0.6771, + "step": 184520 + }, + { + "epoch": 1.6313053625417706, + "grad_norm": 3.40767502784729, + "learning_rate": 2.2811577290970492e-05, + "loss": 0.5803, + "step": 184530 + }, + { + "epoch": 1.6313937658020827, + "grad_norm": 3.503166437149048, + "learning_rate": 2.281010390329862e-05, + "loss": 0.5794, + "step": 184540 + }, + { + "epoch": 1.631482169062395, + "grad_norm": 2.796386480331421, + "learning_rate": 2.280863051562675e-05, + "loss": 0.7916, + "step": 184550 + }, + { + "epoch": 1.6315705723227074, + "grad_norm": 4.32317590713501, + "learning_rate": 2.280715712795488e-05, + "loss": 0.6251, + "step": 184560 + }, + { + "epoch": 1.6316589755830195, + "grad_norm": 1.8309497833251953, + "learning_rate": 2.280568374028301e-05, + "loss": 0.6394, + "step": 184570 + }, + { + "epoch": 1.6317473788433317, + "grad_norm": 4.816558837890625, + "learning_rate": 2.2804210352611137e-05, + "loss": 0.4691, + "step": 184580 + }, + { + "epoch": 1.631835782103644, + "grad_norm": 2.5757365226745605, + "learning_rate": 2.2802736964939266e-05, + "loss": 0.5826, + "step": 184590 + }, + { + "epoch": 1.6319241853639563, + "grad_norm": 3.247929334640503, + "learning_rate": 2.2801263577267397e-05, + "loss": 0.5432, + "step": 184600 + }, + { + "epoch": 1.6320125886242685, + "grad_norm": 1.2992980480194092, + "learning_rate": 2.2799790189595526e-05, + "loss": 0.5994, + "step": 184610 + }, + { + "epoch": 1.6321009918845806, + "grad_norm": 4.61171817779541, + "learning_rate": 2.2798316801923654e-05, + "loss": 0.6519, + "step": 184620 + }, + { + "epoch": 1.632189395144893, + "grad_norm": 3.0313308238983154, + "learning_rate": 2.2796843414251786e-05, + "loss": 0.6346, + "step": 184630 + }, + { + "epoch": 1.6322777984052053, + "grad_norm": 12.606894493103027, + "learning_rate": 2.2795370026579914e-05, + "loss": 0.5395, + "step": 184640 + }, + { + "epoch": 1.6323662016655174, + "grad_norm": 5.260139465332031, + "learning_rate": 2.2793896638908042e-05, + "loss": 0.589, + "step": 184650 + }, + { + "epoch": 1.6324546049258295, + "grad_norm": 1.856351375579834, + "learning_rate": 2.279242325123617e-05, + "loss": 0.5267, + "step": 184660 + }, + { + "epoch": 1.632543008186142, + "grad_norm": 0.9601485133171082, + "learning_rate": 2.2790949863564303e-05, + "loss": 0.4933, + "step": 184670 + }, + { + "epoch": 1.6326314114464542, + "grad_norm": 3.705890655517578, + "learning_rate": 2.278947647589243e-05, + "loss": 0.5195, + "step": 184680 + }, + { + "epoch": 1.6327198147067663, + "grad_norm": 3.7225093841552734, + "learning_rate": 2.278800308822056e-05, + "loss": 0.5904, + "step": 184690 + }, + { + "epoch": 1.6328082179670786, + "grad_norm": 1.198681116104126, + "learning_rate": 2.278652970054869e-05, + "loss": 0.5897, + "step": 184700 + }, + { + "epoch": 1.632896621227391, + "grad_norm": 2.5294363498687744, + "learning_rate": 2.278505631287682e-05, + "loss": 0.5875, + "step": 184710 + }, + { + "epoch": 1.632985024487703, + "grad_norm": 1.3712669610977173, + "learning_rate": 2.2783582925204948e-05, + "loss": 0.558, + "step": 184720 + }, + { + "epoch": 1.6330734277480152, + "grad_norm": 1.1586508750915527, + "learning_rate": 2.278210953753308e-05, + "loss": 0.5094, + "step": 184730 + }, + { + "epoch": 1.6331618310083276, + "grad_norm": 7.613958358764648, + "learning_rate": 2.2780636149861208e-05, + "loss": 0.5836, + "step": 184740 + }, + { + "epoch": 1.63325023426864, + "grad_norm": 2.813015937805176, + "learning_rate": 2.2779162762189336e-05, + "loss": 0.5914, + "step": 184750 + }, + { + "epoch": 1.633338637528952, + "grad_norm": 1.207080602645874, + "learning_rate": 2.2777689374517468e-05, + "loss": 0.567, + "step": 184760 + }, + { + "epoch": 1.6334270407892642, + "grad_norm": 3.395878553390503, + "learning_rate": 2.2776215986845596e-05, + "loss": 0.6267, + "step": 184770 + }, + { + "epoch": 1.6335154440495767, + "grad_norm": 1.9244418144226074, + "learning_rate": 2.2774742599173725e-05, + "loss": 0.6235, + "step": 184780 + }, + { + "epoch": 1.6336038473098888, + "grad_norm": 2.774043560028076, + "learning_rate": 2.2773269211501856e-05, + "loss": 0.6738, + "step": 184790 + }, + { + "epoch": 1.633692250570201, + "grad_norm": 1.2514560222625732, + "learning_rate": 2.2771795823829985e-05, + "loss": 0.4984, + "step": 184800 + }, + { + "epoch": 1.6337806538305133, + "grad_norm": 5.4205241203308105, + "learning_rate": 2.2770322436158113e-05, + "loss": 0.4232, + "step": 184810 + }, + { + "epoch": 1.6338690570908256, + "grad_norm": 2.8734962940216064, + "learning_rate": 2.2768849048486245e-05, + "loss": 0.6807, + "step": 184820 + }, + { + "epoch": 1.6339574603511378, + "grad_norm": 1.3748440742492676, + "learning_rate": 2.2767375660814373e-05, + "loss": 0.5818, + "step": 184830 + }, + { + "epoch": 1.6340458636114499, + "grad_norm": 5.2768731117248535, + "learning_rate": 2.27659022731425e-05, + "loss": 0.6752, + "step": 184840 + }, + { + "epoch": 1.6341342668717622, + "grad_norm": 2.147465944290161, + "learning_rate": 2.2764428885470633e-05, + "loss": 0.6051, + "step": 184850 + }, + { + "epoch": 1.6342226701320746, + "grad_norm": 1.3552194833755493, + "learning_rate": 2.276295549779876e-05, + "loss": 0.6242, + "step": 184860 + }, + { + "epoch": 1.6343110733923867, + "grad_norm": 3.005063056945801, + "learning_rate": 2.276148211012689e-05, + "loss": 0.546, + "step": 184870 + }, + { + "epoch": 1.6343994766526988, + "grad_norm": 2.38718581199646, + "learning_rate": 2.276000872245502e-05, + "loss": 0.6354, + "step": 184880 + }, + { + "epoch": 1.6344878799130111, + "grad_norm": 2.210580348968506, + "learning_rate": 2.275853533478315e-05, + "loss": 0.4162, + "step": 184890 + }, + { + "epoch": 1.6345762831733235, + "grad_norm": 1.0004842281341553, + "learning_rate": 2.275706194711128e-05, + "loss": 0.6462, + "step": 184900 + }, + { + "epoch": 1.6346646864336356, + "grad_norm": 2.734127998352051, + "learning_rate": 2.2755588559439407e-05, + "loss": 0.7612, + "step": 184910 + }, + { + "epoch": 1.634753089693948, + "grad_norm": 2.2676632404327393, + "learning_rate": 2.275411517176754e-05, + "loss": 0.6075, + "step": 184920 + }, + { + "epoch": 1.6348414929542603, + "grad_norm": 1.2737714052200317, + "learning_rate": 2.2752641784095667e-05, + "loss": 0.5965, + "step": 184930 + }, + { + "epoch": 1.6349298962145724, + "grad_norm": 3.9585015773773193, + "learning_rate": 2.2751168396423795e-05, + "loss": 0.6548, + "step": 184940 + }, + { + "epoch": 1.6350182994748845, + "grad_norm": 3.6612370014190674, + "learning_rate": 2.2749695008751924e-05, + "loss": 0.5771, + "step": 184950 + }, + { + "epoch": 1.6351067027351969, + "grad_norm": 2.543405055999756, + "learning_rate": 2.2748221621080055e-05, + "loss": 0.5941, + "step": 184960 + }, + { + "epoch": 1.6351951059955092, + "grad_norm": 5.730416297912598, + "learning_rate": 2.2746748233408184e-05, + "loss": 0.5093, + "step": 184970 + }, + { + "epoch": 1.6352835092558213, + "grad_norm": 3.4625625610351562, + "learning_rate": 2.2745274845736312e-05, + "loss": 0.7759, + "step": 184980 + }, + { + "epoch": 1.6353719125161335, + "grad_norm": 1.772454023361206, + "learning_rate": 2.274380145806444e-05, + "loss": 0.5607, + "step": 184990 + }, + { + "epoch": 1.6354603157764458, + "grad_norm": 1.370222806930542, + "learning_rate": 2.2742328070392572e-05, + "loss": 0.5706, + "step": 185000 + }, + { + "epoch": 1.6355487190367581, + "grad_norm": 3.0200443267822266, + "learning_rate": 2.27408546827207e-05, + "loss": 0.6513, + "step": 185010 + }, + { + "epoch": 1.6356371222970703, + "grad_norm": 7.69234037399292, + "learning_rate": 2.273938129504883e-05, + "loss": 0.5365, + "step": 185020 + }, + { + "epoch": 1.6357255255573826, + "grad_norm": 1.9520457983016968, + "learning_rate": 2.273790790737696e-05, + "loss": 0.6405, + "step": 185030 + }, + { + "epoch": 1.635813928817695, + "grad_norm": 2.9949679374694824, + "learning_rate": 2.273643451970509e-05, + "loss": 0.5321, + "step": 185040 + }, + { + "epoch": 1.635902332078007, + "grad_norm": 2.275728702545166, + "learning_rate": 2.2734961132033217e-05, + "loss": 0.4874, + "step": 185050 + }, + { + "epoch": 1.6359907353383192, + "grad_norm": 6.908626556396484, + "learning_rate": 2.2733487744361346e-05, + "loss": 0.6911, + "step": 185060 + }, + { + "epoch": 1.6360791385986315, + "grad_norm": 1.3334299325942993, + "learning_rate": 2.2732014356689477e-05, + "loss": 0.564, + "step": 185070 + }, + { + "epoch": 1.6361675418589439, + "grad_norm": 5.897999286651611, + "learning_rate": 2.2730540969017606e-05, + "loss": 0.6266, + "step": 185080 + }, + { + "epoch": 1.636255945119256, + "grad_norm": 1.4176234006881714, + "learning_rate": 2.2729067581345734e-05, + "loss": 0.4531, + "step": 185090 + }, + { + "epoch": 1.636344348379568, + "grad_norm": 3.0399603843688965, + "learning_rate": 2.2727594193673866e-05, + "loss": 0.6749, + "step": 185100 + }, + { + "epoch": 1.6364327516398804, + "grad_norm": 1.9612928628921509, + "learning_rate": 2.2726120806001994e-05, + "loss": 0.6035, + "step": 185110 + }, + { + "epoch": 1.6365211549001928, + "grad_norm": 5.286875247955322, + "learning_rate": 2.2724647418330123e-05, + "loss": 0.5198, + "step": 185120 + }, + { + "epoch": 1.636609558160505, + "grad_norm": 1.482245683670044, + "learning_rate": 2.272317403065825e-05, + "loss": 0.6298, + "step": 185130 + }, + { + "epoch": 1.6366979614208172, + "grad_norm": 2.0642032623291016, + "learning_rate": 2.2721700642986383e-05, + "loss": 0.6343, + "step": 185140 + }, + { + "epoch": 1.6367863646811296, + "grad_norm": 1.313356876373291, + "learning_rate": 2.272022725531451e-05, + "loss": 0.642, + "step": 185150 + }, + { + "epoch": 1.6368747679414417, + "grad_norm": 3.3046481609344482, + "learning_rate": 2.271875386764264e-05, + "loss": 0.5257, + "step": 185160 + }, + { + "epoch": 1.6369631712017538, + "grad_norm": 1.4577585458755493, + "learning_rate": 2.2717280479970768e-05, + "loss": 0.6171, + "step": 185170 + }, + { + "epoch": 1.6370515744620662, + "grad_norm": 2.3187859058380127, + "learning_rate": 2.27158070922989e-05, + "loss": 0.6, + "step": 185180 + }, + { + "epoch": 1.6371399777223785, + "grad_norm": 1.291504979133606, + "learning_rate": 2.2714333704627028e-05, + "loss": 0.6561, + "step": 185190 + }, + { + "epoch": 1.6372283809826906, + "grad_norm": 5.608502388000488, + "learning_rate": 2.2712860316955156e-05, + "loss": 0.6199, + "step": 185200 + }, + { + "epoch": 1.6373167842430028, + "grad_norm": 4.817999362945557, + "learning_rate": 2.2711386929283288e-05, + "loss": 0.788, + "step": 185210 + }, + { + "epoch": 1.637405187503315, + "grad_norm": 1.6362338066101074, + "learning_rate": 2.2709913541611416e-05, + "loss": 0.5305, + "step": 185220 + }, + { + "epoch": 1.6374935907636274, + "grad_norm": 3.609710693359375, + "learning_rate": 2.2708440153939545e-05, + "loss": 0.648, + "step": 185230 + }, + { + "epoch": 1.6375819940239396, + "grad_norm": 9.4567289352417, + "learning_rate": 2.2706966766267673e-05, + "loss": 0.6739, + "step": 185240 + }, + { + "epoch": 1.6376703972842517, + "grad_norm": 8.18065071105957, + "learning_rate": 2.2705493378595805e-05, + "loss": 0.6428, + "step": 185250 + }, + { + "epoch": 1.6377588005445642, + "grad_norm": 14.607341766357422, + "learning_rate": 2.2704019990923933e-05, + "loss": 0.5551, + "step": 185260 + }, + { + "epoch": 1.6378472038048764, + "grad_norm": 2.1119654178619385, + "learning_rate": 2.270254660325206e-05, + "loss": 0.6007, + "step": 185270 + }, + { + "epoch": 1.6379356070651885, + "grad_norm": 8.281624794006348, + "learning_rate": 2.2701073215580193e-05, + "loss": 0.6458, + "step": 185280 + }, + { + "epoch": 1.6380240103255008, + "grad_norm": 1.4578211307525635, + "learning_rate": 2.269959982790832e-05, + "loss": 0.6117, + "step": 185290 + }, + { + "epoch": 1.6381124135858132, + "grad_norm": 10.856989860534668, + "learning_rate": 2.269812644023645e-05, + "loss": 0.5313, + "step": 185300 + }, + { + "epoch": 1.6382008168461253, + "grad_norm": 1.363480806350708, + "learning_rate": 2.2696653052564578e-05, + "loss": 0.6278, + "step": 185310 + }, + { + "epoch": 1.6382892201064374, + "grad_norm": 3.4219608306884766, + "learning_rate": 2.269517966489271e-05, + "loss": 0.67, + "step": 185320 + }, + { + "epoch": 1.6383776233667497, + "grad_norm": 3.406155586242676, + "learning_rate": 2.269370627722084e-05, + "loss": 0.6604, + "step": 185330 + }, + { + "epoch": 1.638466026627062, + "grad_norm": 3.8006746768951416, + "learning_rate": 2.2692232889548967e-05, + "loss": 0.5872, + "step": 185340 + }, + { + "epoch": 1.6385544298873742, + "grad_norm": 1.6638083457946777, + "learning_rate": 2.2690759501877095e-05, + "loss": 0.5811, + "step": 185350 + }, + { + "epoch": 1.6386428331476863, + "grad_norm": 4.976661205291748, + "learning_rate": 2.2689286114205227e-05, + "loss": 0.5568, + "step": 185360 + }, + { + "epoch": 1.638731236407999, + "grad_norm": 4.77788782119751, + "learning_rate": 2.2687812726533355e-05, + "loss": 0.5337, + "step": 185370 + }, + { + "epoch": 1.638819639668311, + "grad_norm": 2.5900914669036865, + "learning_rate": 2.2686339338861483e-05, + "loss": 0.5267, + "step": 185380 + }, + { + "epoch": 1.6389080429286231, + "grad_norm": 2.8252573013305664, + "learning_rate": 2.2684865951189615e-05, + "loss": 0.6344, + "step": 185390 + }, + { + "epoch": 1.6389964461889355, + "grad_norm": 1.5785282850265503, + "learning_rate": 2.2683392563517744e-05, + "loss": 0.5995, + "step": 185400 + }, + { + "epoch": 1.6390848494492478, + "grad_norm": 9.918407440185547, + "learning_rate": 2.2681919175845872e-05, + "loss": 0.7484, + "step": 185410 + }, + { + "epoch": 1.63917325270956, + "grad_norm": 3.0052177906036377, + "learning_rate": 2.2680445788174e-05, + "loss": 0.6407, + "step": 185420 + }, + { + "epoch": 1.639261655969872, + "grad_norm": 1.9721765518188477, + "learning_rate": 2.2678972400502132e-05, + "loss": 0.5394, + "step": 185430 + }, + { + "epoch": 1.6393500592301844, + "grad_norm": 2.792327880859375, + "learning_rate": 2.267749901283026e-05, + "loss": 0.7258, + "step": 185440 + }, + { + "epoch": 1.6394384624904967, + "grad_norm": 2.254916191101074, + "learning_rate": 2.267602562515839e-05, + "loss": 0.6591, + "step": 185450 + }, + { + "epoch": 1.6395268657508089, + "grad_norm": 2.5821938514709473, + "learning_rate": 2.2674552237486517e-05, + "loss": 0.5923, + "step": 185460 + }, + { + "epoch": 1.639615269011121, + "grad_norm": 2.5696825981140137, + "learning_rate": 2.267307884981465e-05, + "loss": 0.5828, + "step": 185470 + }, + { + "epoch": 1.6397036722714333, + "grad_norm": 3.0272023677825928, + "learning_rate": 2.2671605462142777e-05, + "loss": 0.6408, + "step": 185480 + }, + { + "epoch": 1.6397920755317457, + "grad_norm": 9.991312026977539, + "learning_rate": 2.2670132074470906e-05, + "loss": 0.582, + "step": 185490 + }, + { + "epoch": 1.6398804787920578, + "grad_norm": 2.480916976928711, + "learning_rate": 2.2668658686799037e-05, + "loss": 0.5271, + "step": 185500 + }, + { + "epoch": 1.6399688820523701, + "grad_norm": 1.6791179180145264, + "learning_rate": 2.2667185299127166e-05, + "loss": 0.6126, + "step": 185510 + }, + { + "epoch": 1.6400572853126825, + "grad_norm": 1.5378644466400146, + "learning_rate": 2.2665711911455294e-05, + "loss": 0.6285, + "step": 185520 + }, + { + "epoch": 1.6401456885729946, + "grad_norm": 1.2760668992996216, + "learning_rate": 2.2664238523783422e-05, + "loss": 0.7315, + "step": 185530 + }, + { + "epoch": 1.6402340918333067, + "grad_norm": 3.544977903366089, + "learning_rate": 2.2662765136111554e-05, + "loss": 0.6292, + "step": 185540 + }, + { + "epoch": 1.640322495093619, + "grad_norm": 7.1313676834106445, + "learning_rate": 2.2661291748439682e-05, + "loss": 0.5463, + "step": 185550 + }, + { + "epoch": 1.6404108983539314, + "grad_norm": 1.6529300212860107, + "learning_rate": 2.265981836076781e-05, + "loss": 0.5482, + "step": 185560 + }, + { + "epoch": 1.6404993016142435, + "grad_norm": 2.0977251529693604, + "learning_rate": 2.2658344973095943e-05, + "loss": 0.5639, + "step": 185570 + }, + { + "epoch": 1.6405877048745556, + "grad_norm": 1.8324156999588013, + "learning_rate": 2.265687158542407e-05, + "loss": 0.5793, + "step": 185580 + }, + { + "epoch": 1.640676108134868, + "grad_norm": 1.737492561340332, + "learning_rate": 2.26553981977522e-05, + "loss": 0.5701, + "step": 185590 + }, + { + "epoch": 1.6407645113951803, + "grad_norm": 3.78633975982666, + "learning_rate": 2.2653924810080328e-05, + "loss": 0.7726, + "step": 185600 + }, + { + "epoch": 1.6408529146554924, + "grad_norm": 2.2083418369293213, + "learning_rate": 2.265245142240846e-05, + "loss": 0.5441, + "step": 185610 + }, + { + "epoch": 1.6409413179158048, + "grad_norm": 9.208758354187012, + "learning_rate": 2.2650978034736588e-05, + "loss": 0.5738, + "step": 185620 + }, + { + "epoch": 1.6410297211761171, + "grad_norm": 2.564452648162842, + "learning_rate": 2.2649504647064716e-05, + "loss": 0.6949, + "step": 185630 + }, + { + "epoch": 1.6411181244364292, + "grad_norm": 2.0815610885620117, + "learning_rate": 2.2648031259392848e-05, + "loss": 0.5191, + "step": 185640 + }, + { + "epoch": 1.6412065276967414, + "grad_norm": 3.158571243286133, + "learning_rate": 2.2646557871720976e-05, + "loss": 0.6093, + "step": 185650 + }, + { + "epoch": 1.6412949309570537, + "grad_norm": 2.1659717559814453, + "learning_rate": 2.2645084484049104e-05, + "loss": 0.5577, + "step": 185660 + }, + { + "epoch": 1.641383334217366, + "grad_norm": 6.634654521942139, + "learning_rate": 2.2643611096377236e-05, + "loss": 0.573, + "step": 185670 + }, + { + "epoch": 1.6414717374776782, + "grad_norm": 6.806859016418457, + "learning_rate": 2.2642137708705365e-05, + "loss": 0.61, + "step": 185680 + }, + { + "epoch": 1.6415601407379903, + "grad_norm": 3.863046646118164, + "learning_rate": 2.2640664321033493e-05, + "loss": 0.7126, + "step": 185690 + }, + { + "epoch": 1.6416485439983026, + "grad_norm": 5.153300762176514, + "learning_rate": 2.2639190933361625e-05, + "loss": 0.5262, + "step": 185700 + }, + { + "epoch": 1.641736947258615, + "grad_norm": 2.3899624347686768, + "learning_rate": 2.2637717545689753e-05, + "loss": 0.5768, + "step": 185710 + }, + { + "epoch": 1.641825350518927, + "grad_norm": 1.7550944089889526, + "learning_rate": 2.263624415801788e-05, + "loss": 0.6629, + "step": 185720 + }, + { + "epoch": 1.6419137537792394, + "grad_norm": 2.075427770614624, + "learning_rate": 2.2634770770346013e-05, + "loss": 0.589, + "step": 185730 + }, + { + "epoch": 1.6420021570395518, + "grad_norm": 9.012155532836914, + "learning_rate": 2.263329738267414e-05, + "loss": 0.5528, + "step": 185740 + }, + { + "epoch": 1.6420905602998639, + "grad_norm": 15.19962215423584, + "learning_rate": 2.263182399500227e-05, + "loss": 0.6877, + "step": 185750 + }, + { + "epoch": 1.642178963560176, + "grad_norm": 5.779613971710205, + "learning_rate": 2.26303506073304e-05, + "loss": 0.5426, + "step": 185760 + }, + { + "epoch": 1.6422673668204884, + "grad_norm": 2.744046688079834, + "learning_rate": 2.262887721965853e-05, + "loss": 0.4839, + "step": 185770 + }, + { + "epoch": 1.6423557700808007, + "grad_norm": 1.8332329988479614, + "learning_rate": 2.2627403831986658e-05, + "loss": 0.725, + "step": 185780 + }, + { + "epoch": 1.6424441733411128, + "grad_norm": 2.207960605621338, + "learning_rate": 2.262593044431479e-05, + "loss": 0.5962, + "step": 185790 + }, + { + "epoch": 1.642532576601425, + "grad_norm": 3.4198386669158936, + "learning_rate": 2.262445705664292e-05, + "loss": 0.6046, + "step": 185800 + }, + { + "epoch": 1.6426209798617373, + "grad_norm": 6.026602268218994, + "learning_rate": 2.2622983668971047e-05, + "loss": 0.5458, + "step": 185810 + }, + { + "epoch": 1.6427093831220496, + "grad_norm": 1.4739265441894531, + "learning_rate": 2.2621510281299175e-05, + "loss": 0.5088, + "step": 185820 + }, + { + "epoch": 1.6427977863823617, + "grad_norm": 4.951425075531006, + "learning_rate": 2.2620036893627307e-05, + "loss": 0.6929, + "step": 185830 + }, + { + "epoch": 1.6428861896426739, + "grad_norm": 7.2399797439575195, + "learning_rate": 2.2618563505955435e-05, + "loss": 0.5343, + "step": 185840 + }, + { + "epoch": 1.6429745929029864, + "grad_norm": 9.530874252319336, + "learning_rate": 2.2617090118283564e-05, + "loss": 0.5051, + "step": 185850 + }, + { + "epoch": 1.6430629961632985, + "grad_norm": 1.4886133670806885, + "learning_rate": 2.2615616730611695e-05, + "loss": 0.4965, + "step": 185860 + }, + { + "epoch": 1.6431513994236107, + "grad_norm": 10.15964126586914, + "learning_rate": 2.2614143342939824e-05, + "loss": 0.5204, + "step": 185870 + }, + { + "epoch": 1.643239802683923, + "grad_norm": 1.6887242794036865, + "learning_rate": 2.2612669955267952e-05, + "loss": 0.5784, + "step": 185880 + }, + { + "epoch": 1.6433282059442353, + "grad_norm": 0.7270352244377136, + "learning_rate": 2.261119656759608e-05, + "loss": 0.6224, + "step": 185890 + }, + { + "epoch": 1.6434166092045475, + "grad_norm": 7.137011528015137, + "learning_rate": 2.2609723179924212e-05, + "loss": 0.5258, + "step": 185900 + }, + { + "epoch": 1.6435050124648596, + "grad_norm": 2.0126235485076904, + "learning_rate": 2.260824979225234e-05, + "loss": 0.5014, + "step": 185910 + }, + { + "epoch": 1.643593415725172, + "grad_norm": 7.399680137634277, + "learning_rate": 2.260677640458047e-05, + "loss": 0.5448, + "step": 185920 + }, + { + "epoch": 1.6436818189854843, + "grad_norm": 3.8143908977508545, + "learning_rate": 2.2605303016908597e-05, + "loss": 0.6536, + "step": 185930 + }, + { + "epoch": 1.6437702222457964, + "grad_norm": 2.3221652507781982, + "learning_rate": 2.260382962923673e-05, + "loss": 0.5971, + "step": 185940 + }, + { + "epoch": 1.6438586255061085, + "grad_norm": 17.32271385192871, + "learning_rate": 2.2602356241564857e-05, + "loss": 0.5239, + "step": 185950 + }, + { + "epoch": 1.643947028766421, + "grad_norm": 3.2116289138793945, + "learning_rate": 2.2600882853892986e-05, + "loss": 0.5328, + "step": 185960 + }, + { + "epoch": 1.6440354320267332, + "grad_norm": 2.5078237056732178, + "learning_rate": 2.2599409466221117e-05, + "loss": 0.5402, + "step": 185970 + }, + { + "epoch": 1.6441238352870453, + "grad_norm": 1.8225317001342773, + "learning_rate": 2.2597936078549246e-05, + "loss": 0.5803, + "step": 185980 + }, + { + "epoch": 1.6442122385473577, + "grad_norm": 2.283177137374878, + "learning_rate": 2.2596462690877374e-05, + "loss": 0.6979, + "step": 185990 + }, + { + "epoch": 1.64430064180767, + "grad_norm": 5.670346260070801, + "learning_rate": 2.2594989303205502e-05, + "loss": 0.4854, + "step": 186000 + }, + { + "epoch": 1.6443890450679821, + "grad_norm": 2.1823768615722656, + "learning_rate": 2.2593515915533634e-05, + "loss": 0.5479, + "step": 186010 + }, + { + "epoch": 1.6444774483282942, + "grad_norm": 1.916414499282837, + "learning_rate": 2.2592042527861762e-05, + "loss": 0.5529, + "step": 186020 + }, + { + "epoch": 1.6445658515886066, + "grad_norm": 3.1143224239349365, + "learning_rate": 2.259056914018989e-05, + "loss": 0.6401, + "step": 186030 + }, + { + "epoch": 1.644654254848919, + "grad_norm": 2.6293599605560303, + "learning_rate": 2.2589095752518023e-05, + "loss": 0.6842, + "step": 186040 + }, + { + "epoch": 1.644742658109231, + "grad_norm": 3.2117276191711426, + "learning_rate": 2.258762236484615e-05, + "loss": 0.7098, + "step": 186050 + }, + { + "epoch": 1.6448310613695432, + "grad_norm": 1.3258217573165894, + "learning_rate": 2.258614897717428e-05, + "loss": 0.5693, + "step": 186060 + }, + { + "epoch": 1.6449194646298555, + "grad_norm": 2.0359179973602295, + "learning_rate": 2.2584675589502408e-05, + "loss": 0.6484, + "step": 186070 + }, + { + "epoch": 1.6450078678901678, + "grad_norm": 2.763855457305908, + "learning_rate": 2.258320220183054e-05, + "loss": 0.7361, + "step": 186080 + }, + { + "epoch": 1.64509627115048, + "grad_norm": 1.6098722219467163, + "learning_rate": 2.2581728814158668e-05, + "loss": 0.5642, + "step": 186090 + }, + { + "epoch": 1.6451846744107923, + "grad_norm": 1.175952434539795, + "learning_rate": 2.2580255426486796e-05, + "loss": 0.7005, + "step": 186100 + }, + { + "epoch": 1.6452730776711046, + "grad_norm": 4.795873165130615, + "learning_rate": 2.2578782038814924e-05, + "loss": 0.6346, + "step": 186110 + }, + { + "epoch": 1.6453614809314168, + "grad_norm": 1.2376720905303955, + "learning_rate": 2.2577308651143056e-05, + "loss": 0.5795, + "step": 186120 + }, + { + "epoch": 1.6454498841917289, + "grad_norm": 1.6838208436965942, + "learning_rate": 2.2575835263471185e-05, + "loss": 0.5601, + "step": 186130 + }, + { + "epoch": 1.6455382874520412, + "grad_norm": 3.498685359954834, + "learning_rate": 2.2574361875799313e-05, + "loss": 0.6309, + "step": 186140 + }, + { + "epoch": 1.6456266907123536, + "grad_norm": 2.9972593784332275, + "learning_rate": 2.2572888488127445e-05, + "loss": 0.6418, + "step": 186150 + }, + { + "epoch": 1.6457150939726657, + "grad_norm": 10.504207611083984, + "learning_rate": 2.2571415100455573e-05, + "loss": 0.5684, + "step": 186160 + }, + { + "epoch": 1.6458034972329778, + "grad_norm": 1.811249852180481, + "learning_rate": 2.25699417127837e-05, + "loss": 0.5437, + "step": 186170 + }, + { + "epoch": 1.6458919004932901, + "grad_norm": 4.171734809875488, + "learning_rate": 2.256846832511183e-05, + "loss": 0.6472, + "step": 186180 + }, + { + "epoch": 1.6459803037536025, + "grad_norm": 1.820402979850769, + "learning_rate": 2.256699493743996e-05, + "loss": 0.6145, + "step": 186190 + }, + { + "epoch": 1.6460687070139146, + "grad_norm": 1.4646812677383423, + "learning_rate": 2.256552154976809e-05, + "loss": 0.6537, + "step": 186200 + }, + { + "epoch": 1.646157110274227, + "grad_norm": 1.6161844730377197, + "learning_rate": 2.2564048162096218e-05, + "loss": 0.5979, + "step": 186210 + }, + { + "epoch": 1.6462455135345393, + "grad_norm": 2.2486977577209473, + "learning_rate": 2.256257477442435e-05, + "loss": 0.6974, + "step": 186220 + }, + { + "epoch": 1.6463339167948514, + "grad_norm": 2.3361196517944336, + "learning_rate": 2.2561101386752478e-05, + "loss": 0.5686, + "step": 186230 + }, + { + "epoch": 1.6464223200551635, + "grad_norm": 2.4169716835021973, + "learning_rate": 2.2559627999080607e-05, + "loss": 0.6224, + "step": 186240 + }, + { + "epoch": 1.6465107233154759, + "grad_norm": 5.070600509643555, + "learning_rate": 2.2558154611408735e-05, + "loss": 0.5573, + "step": 186250 + }, + { + "epoch": 1.6465991265757882, + "grad_norm": 13.2886381149292, + "learning_rate": 2.2556681223736867e-05, + "loss": 0.7492, + "step": 186260 + }, + { + "epoch": 1.6466875298361003, + "grad_norm": 9.86184310913086, + "learning_rate": 2.2555207836064995e-05, + "loss": 0.5797, + "step": 186270 + }, + { + "epoch": 1.6467759330964125, + "grad_norm": 1.712245225906372, + "learning_rate": 2.2553734448393123e-05, + "loss": 0.5541, + "step": 186280 + }, + { + "epoch": 1.6468643363567248, + "grad_norm": 1.4967656135559082, + "learning_rate": 2.2552261060721252e-05, + "loss": 0.5551, + "step": 186290 + }, + { + "epoch": 1.6469527396170371, + "grad_norm": 17.65376853942871, + "learning_rate": 2.2550787673049384e-05, + "loss": 0.6918, + "step": 186300 + }, + { + "epoch": 1.6470411428773493, + "grad_norm": 14.704512596130371, + "learning_rate": 2.2549314285377512e-05, + "loss": 0.546, + "step": 186310 + }, + { + "epoch": 1.6471295461376616, + "grad_norm": 4.653510570526123, + "learning_rate": 2.254784089770564e-05, + "loss": 0.659, + "step": 186320 + }, + { + "epoch": 1.647217949397974, + "grad_norm": 2.2873945236206055, + "learning_rate": 2.2546367510033772e-05, + "loss": 0.5374, + "step": 186330 + }, + { + "epoch": 1.647306352658286, + "grad_norm": 15.1098051071167, + "learning_rate": 2.25448941223619e-05, + "loss": 0.59, + "step": 186340 + }, + { + "epoch": 1.6473947559185982, + "grad_norm": 1.3340559005737305, + "learning_rate": 2.254342073469003e-05, + "loss": 0.5251, + "step": 186350 + }, + { + "epoch": 1.6474831591789105, + "grad_norm": 3.3727757930755615, + "learning_rate": 2.2541947347018157e-05, + "loss": 0.7079, + "step": 186360 + }, + { + "epoch": 1.6475715624392229, + "grad_norm": 1.6442217826843262, + "learning_rate": 2.254047395934629e-05, + "loss": 0.531, + "step": 186370 + }, + { + "epoch": 1.647659965699535, + "grad_norm": 5.728442668914795, + "learning_rate": 2.2539000571674417e-05, + "loss": 0.5283, + "step": 186380 + }, + { + "epoch": 1.647748368959847, + "grad_norm": 8.326568603515625, + "learning_rate": 2.2537527184002545e-05, + "loss": 0.6114, + "step": 186390 + }, + { + "epoch": 1.6478367722201595, + "grad_norm": 1.2240368127822876, + "learning_rate": 2.2536053796330674e-05, + "loss": 0.626, + "step": 186400 + }, + { + "epoch": 1.6479251754804718, + "grad_norm": 9.812299728393555, + "learning_rate": 2.2534580408658806e-05, + "loss": 0.6095, + "step": 186410 + }, + { + "epoch": 1.648013578740784, + "grad_norm": 4.076673984527588, + "learning_rate": 2.2533107020986934e-05, + "loss": 0.6195, + "step": 186420 + }, + { + "epoch": 1.6481019820010963, + "grad_norm": 1.2228164672851562, + "learning_rate": 2.2531633633315062e-05, + "loss": 0.4627, + "step": 186430 + }, + { + "epoch": 1.6481903852614086, + "grad_norm": 5.404401779174805, + "learning_rate": 2.2530160245643194e-05, + "loss": 0.4749, + "step": 186440 + }, + { + "epoch": 1.6482787885217207, + "grad_norm": 2.2291276454925537, + "learning_rate": 2.2528686857971322e-05, + "loss": 0.6116, + "step": 186450 + }, + { + "epoch": 1.6483671917820328, + "grad_norm": 1.830512523651123, + "learning_rate": 2.252721347029945e-05, + "loss": 0.57, + "step": 186460 + }, + { + "epoch": 1.6484555950423452, + "grad_norm": 2.566335439682007, + "learning_rate": 2.252574008262758e-05, + "loss": 0.4799, + "step": 186470 + }, + { + "epoch": 1.6485439983026575, + "grad_norm": 17.35238265991211, + "learning_rate": 2.252426669495571e-05, + "loss": 0.5629, + "step": 186480 + }, + { + "epoch": 1.6486324015629696, + "grad_norm": 4.275138854980469, + "learning_rate": 2.252279330728384e-05, + "loss": 0.6852, + "step": 186490 + }, + { + "epoch": 1.6487208048232818, + "grad_norm": 2.7943878173828125, + "learning_rate": 2.2521319919611968e-05, + "loss": 0.6412, + "step": 186500 + }, + { + "epoch": 1.648809208083594, + "grad_norm": 7.2049384117126465, + "learning_rate": 2.25198465319401e-05, + "loss": 0.6042, + "step": 186510 + }, + { + "epoch": 1.6488976113439064, + "grad_norm": 2.6606061458587646, + "learning_rate": 2.2518373144268228e-05, + "loss": 0.6231, + "step": 186520 + }, + { + "epoch": 1.6489860146042186, + "grad_norm": 2.138690948486328, + "learning_rate": 2.2516899756596356e-05, + "loss": 0.676, + "step": 186530 + }, + { + "epoch": 1.6490744178645307, + "grad_norm": 2.766820192337036, + "learning_rate": 2.2515426368924484e-05, + "loss": 0.5783, + "step": 186540 + }, + { + "epoch": 1.6491628211248432, + "grad_norm": 8.024649620056152, + "learning_rate": 2.2513952981252616e-05, + "loss": 0.8184, + "step": 186550 + }, + { + "epoch": 1.6492512243851554, + "grad_norm": 5.261190891265869, + "learning_rate": 2.2512479593580744e-05, + "loss": 0.6714, + "step": 186560 + }, + { + "epoch": 1.6493396276454675, + "grad_norm": 1.8569610118865967, + "learning_rate": 2.2511006205908873e-05, + "loss": 0.5392, + "step": 186570 + }, + { + "epoch": 1.6494280309057798, + "grad_norm": 2.904841184616089, + "learning_rate": 2.2509532818237005e-05, + "loss": 0.7231, + "step": 186580 + }, + { + "epoch": 1.6495164341660922, + "grad_norm": 3.9287497997283936, + "learning_rate": 2.2508059430565133e-05, + "loss": 0.5742, + "step": 186590 + }, + { + "epoch": 1.6496048374264043, + "grad_norm": 2.037377119064331, + "learning_rate": 2.250658604289326e-05, + "loss": 0.6421, + "step": 186600 + }, + { + "epoch": 1.6496932406867164, + "grad_norm": 1.6801235675811768, + "learning_rate": 2.2505112655221393e-05, + "loss": 0.5124, + "step": 186610 + }, + { + "epoch": 1.6497816439470288, + "grad_norm": 6.721368312835693, + "learning_rate": 2.250363926754952e-05, + "loss": 0.6706, + "step": 186620 + }, + { + "epoch": 1.649870047207341, + "grad_norm": 9.116744041442871, + "learning_rate": 2.250216587987765e-05, + "loss": 0.7733, + "step": 186630 + }, + { + "epoch": 1.6499584504676532, + "grad_norm": 0.9526000022888184, + "learning_rate": 2.250069249220578e-05, + "loss": 0.5411, + "step": 186640 + }, + { + "epoch": 1.6500468537279653, + "grad_norm": 4.075675010681152, + "learning_rate": 2.249921910453391e-05, + "loss": 0.5596, + "step": 186650 + }, + { + "epoch": 1.6501352569882777, + "grad_norm": 2.9119768142700195, + "learning_rate": 2.2497745716862038e-05, + "loss": 0.5673, + "step": 186660 + }, + { + "epoch": 1.65022366024859, + "grad_norm": 4.40766716003418, + "learning_rate": 2.249627232919017e-05, + "loss": 0.6125, + "step": 186670 + }, + { + "epoch": 1.6503120635089021, + "grad_norm": 8.399806022644043, + "learning_rate": 2.2494798941518298e-05, + "loss": 0.6156, + "step": 186680 + }, + { + "epoch": 1.6504004667692145, + "grad_norm": 1.9836020469665527, + "learning_rate": 2.2493325553846427e-05, + "loss": 0.6544, + "step": 186690 + }, + { + "epoch": 1.6504888700295268, + "grad_norm": 3.8788788318634033, + "learning_rate": 2.249185216617456e-05, + "loss": 0.5849, + "step": 186700 + }, + { + "epoch": 1.650577273289839, + "grad_norm": 2.894810199737549, + "learning_rate": 2.2490378778502687e-05, + "loss": 0.5491, + "step": 186710 + }, + { + "epoch": 1.650665676550151, + "grad_norm": 4.892343997955322, + "learning_rate": 2.2488905390830815e-05, + "loss": 0.6121, + "step": 186720 + }, + { + "epoch": 1.6507540798104634, + "grad_norm": 1.6945147514343262, + "learning_rate": 2.2487432003158947e-05, + "loss": 0.5917, + "step": 186730 + }, + { + "epoch": 1.6508424830707757, + "grad_norm": 4.941824436187744, + "learning_rate": 2.2485958615487075e-05, + "loss": 0.6246, + "step": 186740 + }, + { + "epoch": 1.6509308863310879, + "grad_norm": 3.3184974193573, + "learning_rate": 2.2484485227815203e-05, + "loss": 0.5743, + "step": 186750 + }, + { + "epoch": 1.6510192895914, + "grad_norm": 5.543766975402832, + "learning_rate": 2.2483011840143332e-05, + "loss": 0.6602, + "step": 186760 + }, + { + "epoch": 1.6511076928517123, + "grad_norm": 2.293617010116577, + "learning_rate": 2.2481538452471464e-05, + "loss": 0.5078, + "step": 186770 + }, + { + "epoch": 1.6511960961120247, + "grad_norm": 1.9293001890182495, + "learning_rate": 2.2480065064799592e-05, + "loss": 0.6742, + "step": 186780 + }, + { + "epoch": 1.6512844993723368, + "grad_norm": 4.787430286407471, + "learning_rate": 2.247859167712772e-05, + "loss": 0.4449, + "step": 186790 + }, + { + "epoch": 1.6513729026326491, + "grad_norm": 4.850201606750488, + "learning_rate": 2.2477118289455852e-05, + "loss": 0.6375, + "step": 186800 + }, + { + "epoch": 1.6514613058929615, + "grad_norm": 0.9525098204612732, + "learning_rate": 2.247564490178398e-05, + "loss": 0.6298, + "step": 186810 + }, + { + "epoch": 1.6515497091532736, + "grad_norm": 11.680496215820312, + "learning_rate": 2.247417151411211e-05, + "loss": 0.59, + "step": 186820 + }, + { + "epoch": 1.6516381124135857, + "grad_norm": 3.2707183361053467, + "learning_rate": 2.2472698126440237e-05, + "loss": 0.6066, + "step": 186830 + }, + { + "epoch": 1.651726515673898, + "grad_norm": 1.8392772674560547, + "learning_rate": 2.247122473876837e-05, + "loss": 0.7862, + "step": 186840 + }, + { + "epoch": 1.6518149189342104, + "grad_norm": 1.7167593240737915, + "learning_rate": 2.2469751351096497e-05, + "loss": 0.6628, + "step": 186850 + }, + { + "epoch": 1.6519033221945225, + "grad_norm": 3.199334144592285, + "learning_rate": 2.2468277963424626e-05, + "loss": 0.6035, + "step": 186860 + }, + { + "epoch": 1.6519917254548346, + "grad_norm": 1.1371521949768066, + "learning_rate": 2.2466804575752757e-05, + "loss": 0.5503, + "step": 186870 + }, + { + "epoch": 1.652080128715147, + "grad_norm": 4.007225513458252, + "learning_rate": 2.2465331188080886e-05, + "loss": 0.4445, + "step": 186880 + }, + { + "epoch": 1.6521685319754593, + "grad_norm": 2.58516263961792, + "learning_rate": 2.2463857800409014e-05, + "loss": 0.5687, + "step": 186890 + }, + { + "epoch": 1.6522569352357714, + "grad_norm": 3.320324420928955, + "learning_rate": 2.2462384412737142e-05, + "loss": 0.5398, + "step": 186900 + }, + { + "epoch": 1.6523453384960838, + "grad_norm": 1.634325385093689, + "learning_rate": 2.2460911025065274e-05, + "loss": 0.6049, + "step": 186910 + }, + { + "epoch": 1.6524337417563961, + "grad_norm": 4.976755142211914, + "learning_rate": 2.2459437637393402e-05, + "loss": 0.4744, + "step": 186920 + }, + { + "epoch": 1.6525221450167082, + "grad_norm": 12.777931213378906, + "learning_rate": 2.245796424972153e-05, + "loss": 0.5794, + "step": 186930 + }, + { + "epoch": 1.6526105482770204, + "grad_norm": 2.296560049057007, + "learning_rate": 2.245649086204966e-05, + "loss": 0.6496, + "step": 186940 + }, + { + "epoch": 1.6526989515373327, + "grad_norm": 3.4259567260742188, + "learning_rate": 2.245501747437779e-05, + "loss": 0.5835, + "step": 186950 + }, + { + "epoch": 1.652787354797645, + "grad_norm": 2.2072949409484863, + "learning_rate": 2.245354408670592e-05, + "loss": 0.5796, + "step": 186960 + }, + { + "epoch": 1.6528757580579572, + "grad_norm": 2.7551636695861816, + "learning_rate": 2.2452070699034048e-05, + "loss": 0.6548, + "step": 186970 + }, + { + "epoch": 1.6529641613182693, + "grad_norm": 0.8537187576293945, + "learning_rate": 2.245059731136218e-05, + "loss": 0.6141, + "step": 186980 + }, + { + "epoch": 1.6530525645785816, + "grad_norm": 4.9957756996154785, + "learning_rate": 2.2449123923690308e-05, + "loss": 0.5832, + "step": 186990 + }, + { + "epoch": 1.653140967838894, + "grad_norm": 3.157557964324951, + "learning_rate": 2.2447650536018436e-05, + "loss": 0.6262, + "step": 187000 + }, + { + "epoch": 1.653229371099206, + "grad_norm": 4.5432868003845215, + "learning_rate": 2.2446177148346564e-05, + "loss": 0.6355, + "step": 187010 + }, + { + "epoch": 1.6533177743595184, + "grad_norm": 9.247173309326172, + "learning_rate": 2.2444703760674696e-05, + "loss": 0.6208, + "step": 187020 + }, + { + "epoch": 1.6534061776198308, + "grad_norm": 0.7592854499816895, + "learning_rate": 2.2443230373002825e-05, + "loss": 0.6037, + "step": 187030 + }, + { + "epoch": 1.653494580880143, + "grad_norm": 2.9964187145233154, + "learning_rate": 2.2441756985330953e-05, + "loss": 0.5343, + "step": 187040 + }, + { + "epoch": 1.653582984140455, + "grad_norm": 3.456821918487549, + "learning_rate": 2.244028359765908e-05, + "loss": 0.6327, + "step": 187050 + }, + { + "epoch": 1.6536713874007674, + "grad_norm": 1.7094639539718628, + "learning_rate": 2.2438810209987213e-05, + "loss": 0.6772, + "step": 187060 + }, + { + "epoch": 1.6537597906610797, + "grad_norm": 0.9522131085395813, + "learning_rate": 2.243733682231534e-05, + "loss": 0.6629, + "step": 187070 + }, + { + "epoch": 1.6538481939213918, + "grad_norm": 3.7966177463531494, + "learning_rate": 2.243586343464347e-05, + "loss": 0.5795, + "step": 187080 + }, + { + "epoch": 1.653936597181704, + "grad_norm": 1.757983922958374, + "learning_rate": 2.24343900469716e-05, + "loss": 0.5867, + "step": 187090 + }, + { + "epoch": 1.6540250004420163, + "grad_norm": 1.2853597402572632, + "learning_rate": 2.243291665929973e-05, + "loss": 0.5369, + "step": 187100 + }, + { + "epoch": 1.6541134037023286, + "grad_norm": 3.481792688369751, + "learning_rate": 2.2431443271627858e-05, + "loss": 0.5699, + "step": 187110 + }, + { + "epoch": 1.6542018069626407, + "grad_norm": 3.536203384399414, + "learning_rate": 2.2429969883955986e-05, + "loss": 0.5452, + "step": 187120 + }, + { + "epoch": 1.6542902102229529, + "grad_norm": 2.491907835006714, + "learning_rate": 2.2428496496284118e-05, + "loss": 0.6593, + "step": 187130 + }, + { + "epoch": 1.6543786134832654, + "grad_norm": 2.9078681468963623, + "learning_rate": 2.2427023108612247e-05, + "loss": 0.6332, + "step": 187140 + }, + { + "epoch": 1.6544670167435775, + "grad_norm": 2.54404616355896, + "learning_rate": 2.2425549720940375e-05, + "loss": 0.643, + "step": 187150 + }, + { + "epoch": 1.6545554200038897, + "grad_norm": 4.271890163421631, + "learning_rate": 2.2424076333268507e-05, + "loss": 0.5725, + "step": 187160 + }, + { + "epoch": 1.654643823264202, + "grad_norm": 1.9113109111785889, + "learning_rate": 2.2422602945596635e-05, + "loss": 0.5966, + "step": 187170 + }, + { + "epoch": 1.6547322265245143, + "grad_norm": 7.789828300476074, + "learning_rate": 2.2421129557924763e-05, + "loss": 0.5529, + "step": 187180 + }, + { + "epoch": 1.6548206297848265, + "grad_norm": 13.340986251831055, + "learning_rate": 2.2419656170252892e-05, + "loss": 0.4414, + "step": 187190 + }, + { + "epoch": 1.6549090330451386, + "grad_norm": 1.3944019079208374, + "learning_rate": 2.2418182782581023e-05, + "loss": 0.6176, + "step": 187200 + }, + { + "epoch": 1.654997436305451, + "grad_norm": 1.931065559387207, + "learning_rate": 2.2416709394909152e-05, + "loss": 0.5971, + "step": 187210 + }, + { + "epoch": 1.6550858395657633, + "grad_norm": 4.512258052825928, + "learning_rate": 2.241523600723728e-05, + "loss": 0.55, + "step": 187220 + }, + { + "epoch": 1.6551742428260754, + "grad_norm": 3.3120241165161133, + "learning_rate": 2.241376261956541e-05, + "loss": 0.7053, + "step": 187230 + }, + { + "epoch": 1.6552626460863875, + "grad_norm": 2.242070436477661, + "learning_rate": 2.241228923189354e-05, + "loss": 0.636, + "step": 187240 + }, + { + "epoch": 1.6553510493466999, + "grad_norm": 1.2260303497314453, + "learning_rate": 2.241081584422167e-05, + "loss": 0.6742, + "step": 187250 + }, + { + "epoch": 1.6554394526070122, + "grad_norm": 3.577976942062378, + "learning_rate": 2.2409342456549797e-05, + "loss": 0.525, + "step": 187260 + }, + { + "epoch": 1.6555278558673243, + "grad_norm": 2.638819456100464, + "learning_rate": 2.240786906887793e-05, + "loss": 0.5726, + "step": 187270 + }, + { + "epoch": 1.6556162591276367, + "grad_norm": 1.5112980604171753, + "learning_rate": 2.2406395681206057e-05, + "loss": 0.5624, + "step": 187280 + }, + { + "epoch": 1.655704662387949, + "grad_norm": 3.5007877349853516, + "learning_rate": 2.2404922293534185e-05, + "loss": 0.5716, + "step": 187290 + }, + { + "epoch": 1.6557930656482611, + "grad_norm": 3.1316921710968018, + "learning_rate": 2.2403448905862314e-05, + "loss": 0.6398, + "step": 187300 + }, + { + "epoch": 1.6558814689085732, + "grad_norm": 6.054881572723389, + "learning_rate": 2.2401975518190446e-05, + "loss": 0.6693, + "step": 187310 + }, + { + "epoch": 1.6559698721688856, + "grad_norm": 10.04617977142334, + "learning_rate": 2.2400502130518574e-05, + "loss": 0.6034, + "step": 187320 + }, + { + "epoch": 1.656058275429198, + "grad_norm": 1.2988650798797607, + "learning_rate": 2.2399028742846702e-05, + "loss": 0.6341, + "step": 187330 + }, + { + "epoch": 1.65614667868951, + "grad_norm": 5.454225063323975, + "learning_rate": 2.2397555355174834e-05, + "loss": 0.5, + "step": 187340 + }, + { + "epoch": 1.6562350819498222, + "grad_norm": 3.382472515106201, + "learning_rate": 2.2396081967502962e-05, + "loss": 0.6223, + "step": 187350 + }, + { + "epoch": 1.6563234852101345, + "grad_norm": 12.317181587219238, + "learning_rate": 2.239460857983109e-05, + "loss": 0.5549, + "step": 187360 + }, + { + "epoch": 1.6564118884704468, + "grad_norm": 8.568225860595703, + "learning_rate": 2.239313519215922e-05, + "loss": 0.6905, + "step": 187370 + }, + { + "epoch": 1.656500291730759, + "grad_norm": 4.240734577178955, + "learning_rate": 2.239166180448735e-05, + "loss": 0.5609, + "step": 187380 + }, + { + "epoch": 1.6565886949910713, + "grad_norm": 4.741962432861328, + "learning_rate": 2.239018841681548e-05, + "loss": 0.6033, + "step": 187390 + }, + { + "epoch": 1.6566770982513837, + "grad_norm": 2.4022674560546875, + "learning_rate": 2.2388715029143607e-05, + "loss": 0.4941, + "step": 187400 + }, + { + "epoch": 1.6567655015116958, + "grad_norm": 1.1704354286193848, + "learning_rate": 2.2387241641471736e-05, + "loss": 0.6629, + "step": 187410 + }, + { + "epoch": 1.656853904772008, + "grad_norm": 3.423189163208008, + "learning_rate": 2.2385768253799868e-05, + "loss": 0.5746, + "step": 187420 + }, + { + "epoch": 1.6569423080323202, + "grad_norm": 5.525378227233887, + "learning_rate": 2.2384294866127996e-05, + "loss": 0.6669, + "step": 187430 + }, + { + "epoch": 1.6570307112926326, + "grad_norm": 4.234412670135498, + "learning_rate": 2.2382821478456124e-05, + "loss": 0.5415, + "step": 187440 + }, + { + "epoch": 1.6571191145529447, + "grad_norm": 4.066867351531982, + "learning_rate": 2.2381348090784256e-05, + "loss": 0.5628, + "step": 187450 + }, + { + "epoch": 1.6572075178132568, + "grad_norm": 1.7800112962722778, + "learning_rate": 2.2379874703112384e-05, + "loss": 0.5709, + "step": 187460 + }, + { + "epoch": 1.6572959210735692, + "grad_norm": 2.0689873695373535, + "learning_rate": 2.2378401315440513e-05, + "loss": 0.4691, + "step": 187470 + }, + { + "epoch": 1.6573843243338815, + "grad_norm": 1.8084672689437866, + "learning_rate": 2.237692792776864e-05, + "loss": 0.5711, + "step": 187480 + }, + { + "epoch": 1.6574727275941936, + "grad_norm": 1.3834961652755737, + "learning_rate": 2.2375454540096773e-05, + "loss": 0.4465, + "step": 187490 + }, + { + "epoch": 1.657561130854506, + "grad_norm": 1.350616693496704, + "learning_rate": 2.23739811524249e-05, + "loss": 0.562, + "step": 187500 + }, + { + "epoch": 1.6576495341148183, + "grad_norm": 16.401464462280273, + "learning_rate": 2.237250776475303e-05, + "loss": 0.6055, + "step": 187510 + }, + { + "epoch": 1.6577379373751304, + "grad_norm": 1.357333779335022, + "learning_rate": 2.237103437708116e-05, + "loss": 0.4337, + "step": 187520 + }, + { + "epoch": 1.6578263406354425, + "grad_norm": 5.806070327758789, + "learning_rate": 2.236956098940929e-05, + "loss": 0.4938, + "step": 187530 + }, + { + "epoch": 1.6579147438957549, + "grad_norm": 3.723224639892578, + "learning_rate": 2.236808760173742e-05, + "loss": 0.6482, + "step": 187540 + }, + { + "epoch": 1.6580031471560672, + "grad_norm": 2.085385799407959, + "learning_rate": 2.236661421406555e-05, + "loss": 0.6182, + "step": 187550 + }, + { + "epoch": 1.6580915504163793, + "grad_norm": 2.7253615856170654, + "learning_rate": 2.2365140826393678e-05, + "loss": 0.7215, + "step": 187560 + }, + { + "epoch": 1.6581799536766915, + "grad_norm": 5.712151050567627, + "learning_rate": 2.236366743872181e-05, + "loss": 0.5406, + "step": 187570 + }, + { + "epoch": 1.6582683569370038, + "grad_norm": 3.2005090713500977, + "learning_rate": 2.2362194051049938e-05, + "loss": 0.5711, + "step": 187580 + }, + { + "epoch": 1.6583567601973161, + "grad_norm": 1.4085785150527954, + "learning_rate": 2.2360720663378067e-05, + "loss": 0.6013, + "step": 187590 + }, + { + "epoch": 1.6584451634576283, + "grad_norm": 0.8115139007568359, + "learning_rate": 2.2359247275706198e-05, + "loss": 0.495, + "step": 187600 + }, + { + "epoch": 1.6585335667179406, + "grad_norm": 2.6341500282287598, + "learning_rate": 2.2357773888034327e-05, + "loss": 0.6219, + "step": 187610 + }, + { + "epoch": 1.658621969978253, + "grad_norm": 2.4017670154571533, + "learning_rate": 2.2356300500362455e-05, + "loss": 0.6649, + "step": 187620 + }, + { + "epoch": 1.658710373238565, + "grad_norm": 11.742932319641113, + "learning_rate": 2.2354827112690587e-05, + "loss": 0.5668, + "step": 187630 + }, + { + "epoch": 1.6587987764988772, + "grad_norm": 1.3605257272720337, + "learning_rate": 2.2353353725018715e-05, + "loss": 0.5387, + "step": 187640 + }, + { + "epoch": 1.6588871797591895, + "grad_norm": 1.6390085220336914, + "learning_rate": 2.2351880337346843e-05, + "loss": 0.494, + "step": 187650 + }, + { + "epoch": 1.6589755830195019, + "grad_norm": 2.0409624576568604, + "learning_rate": 2.2350406949674972e-05, + "loss": 0.5474, + "step": 187660 + }, + { + "epoch": 1.659063986279814, + "grad_norm": 1.4725699424743652, + "learning_rate": 2.2348933562003104e-05, + "loss": 0.5128, + "step": 187670 + }, + { + "epoch": 1.6591523895401261, + "grad_norm": 1.872623324394226, + "learning_rate": 2.2347460174331232e-05, + "loss": 0.585, + "step": 187680 + }, + { + "epoch": 1.6592407928004385, + "grad_norm": 1.105606198310852, + "learning_rate": 2.234598678665936e-05, + "loss": 0.6576, + "step": 187690 + }, + { + "epoch": 1.6593291960607508, + "grad_norm": 3.464085102081299, + "learning_rate": 2.234451339898749e-05, + "loss": 0.7388, + "step": 187700 + }, + { + "epoch": 1.659417599321063, + "grad_norm": 1.8075155019760132, + "learning_rate": 2.234304001131562e-05, + "loss": 0.4952, + "step": 187710 + }, + { + "epoch": 1.659506002581375, + "grad_norm": 2.524611711502075, + "learning_rate": 2.234156662364375e-05, + "loss": 0.4873, + "step": 187720 + }, + { + "epoch": 1.6595944058416876, + "grad_norm": 1.3582056760787964, + "learning_rate": 2.2340093235971877e-05, + "loss": 0.5825, + "step": 187730 + }, + { + "epoch": 1.6596828091019997, + "grad_norm": 2.472538471221924, + "learning_rate": 2.233861984830001e-05, + "loss": 0.7072, + "step": 187740 + }, + { + "epoch": 1.6597712123623118, + "grad_norm": 4.318940162658691, + "learning_rate": 2.2337146460628137e-05, + "loss": 0.4695, + "step": 187750 + }, + { + "epoch": 1.6598596156226242, + "grad_norm": 1.5569344758987427, + "learning_rate": 2.2335673072956265e-05, + "loss": 0.5672, + "step": 187760 + }, + { + "epoch": 1.6599480188829365, + "grad_norm": 1.4827817678451538, + "learning_rate": 2.2334199685284394e-05, + "loss": 0.4942, + "step": 187770 + }, + { + "epoch": 1.6600364221432486, + "grad_norm": 2.3764090538024902, + "learning_rate": 2.2332726297612526e-05, + "loss": 0.595, + "step": 187780 + }, + { + "epoch": 1.6601248254035608, + "grad_norm": 1.2101463079452515, + "learning_rate": 2.2331252909940654e-05, + "loss": 0.5768, + "step": 187790 + }, + { + "epoch": 1.660213228663873, + "grad_norm": 0.9474406242370605, + "learning_rate": 2.2329779522268782e-05, + "loss": 0.7007, + "step": 187800 + }, + { + "epoch": 1.6603016319241855, + "grad_norm": 2.228926181793213, + "learning_rate": 2.2328306134596914e-05, + "loss": 0.5875, + "step": 187810 + }, + { + "epoch": 1.6603900351844976, + "grad_norm": 2.193197250366211, + "learning_rate": 2.2326832746925042e-05, + "loss": 0.5612, + "step": 187820 + }, + { + "epoch": 1.6604784384448097, + "grad_norm": 1.5102007389068604, + "learning_rate": 2.232535935925317e-05, + "loss": 0.6149, + "step": 187830 + }, + { + "epoch": 1.660566841705122, + "grad_norm": 2.672593832015991, + "learning_rate": 2.23238859715813e-05, + "loss": 0.6846, + "step": 187840 + }, + { + "epoch": 1.6606552449654344, + "grad_norm": 5.263713359832764, + "learning_rate": 2.232241258390943e-05, + "loss": 0.7165, + "step": 187850 + }, + { + "epoch": 1.6607436482257465, + "grad_norm": 2.5864052772521973, + "learning_rate": 2.232093919623756e-05, + "loss": 0.5533, + "step": 187860 + }, + { + "epoch": 1.6608320514860588, + "grad_norm": 1.2693744897842407, + "learning_rate": 2.2319465808565688e-05, + "loss": 0.6, + "step": 187870 + }, + { + "epoch": 1.6609204547463712, + "grad_norm": 2.708627462387085, + "learning_rate": 2.2317992420893816e-05, + "loss": 0.5311, + "step": 187880 + }, + { + "epoch": 1.6610088580066833, + "grad_norm": 3.467881679534912, + "learning_rate": 2.2316519033221948e-05, + "loss": 0.66, + "step": 187890 + }, + { + "epoch": 1.6610972612669954, + "grad_norm": 14.021622657775879, + "learning_rate": 2.2315045645550076e-05, + "loss": 0.6753, + "step": 187900 + }, + { + "epoch": 1.6611856645273078, + "grad_norm": 2.3567147254943848, + "learning_rate": 2.2313572257878204e-05, + "loss": 0.4414, + "step": 187910 + }, + { + "epoch": 1.66127406778762, + "grad_norm": 1.0080100297927856, + "learning_rate": 2.2312098870206336e-05, + "loss": 0.4932, + "step": 187920 + }, + { + "epoch": 1.6613624710479322, + "grad_norm": 3.2714929580688477, + "learning_rate": 2.2310625482534464e-05, + "loss": 0.6729, + "step": 187930 + }, + { + "epoch": 1.6614508743082443, + "grad_norm": 1.458680272102356, + "learning_rate": 2.2309152094862593e-05, + "loss": 0.7108, + "step": 187940 + }, + { + "epoch": 1.6615392775685567, + "grad_norm": 1.31399667263031, + "learning_rate": 2.230767870719072e-05, + "loss": 0.6631, + "step": 187950 + }, + { + "epoch": 1.661627680828869, + "grad_norm": 2.3060646057128906, + "learning_rate": 2.2306205319518853e-05, + "loss": 0.6086, + "step": 187960 + }, + { + "epoch": 1.6617160840891811, + "grad_norm": 4.0019001960754395, + "learning_rate": 2.230473193184698e-05, + "loss": 0.5381, + "step": 187970 + }, + { + "epoch": 1.6618044873494935, + "grad_norm": 6.709582805633545, + "learning_rate": 2.230325854417511e-05, + "loss": 0.6134, + "step": 187980 + }, + { + "epoch": 1.6618928906098058, + "grad_norm": 1.386630654335022, + "learning_rate": 2.230178515650324e-05, + "loss": 0.5532, + "step": 187990 + }, + { + "epoch": 1.661981293870118, + "grad_norm": 4.375740051269531, + "learning_rate": 2.230031176883137e-05, + "loss": 0.6025, + "step": 188000 + }, + { + "epoch": 1.66206969713043, + "grad_norm": 3.1843533515930176, + "learning_rate": 2.2298838381159498e-05, + "loss": 0.5813, + "step": 188010 + }, + { + "epoch": 1.6621581003907424, + "grad_norm": 1.7038811445236206, + "learning_rate": 2.2297364993487626e-05, + "loss": 0.6368, + "step": 188020 + }, + { + "epoch": 1.6622465036510548, + "grad_norm": 5.219362735748291, + "learning_rate": 2.2295891605815758e-05, + "loss": 0.6255, + "step": 188030 + }, + { + "epoch": 1.6623349069113669, + "grad_norm": 2.0721471309661865, + "learning_rate": 2.2294418218143887e-05, + "loss": 0.5182, + "step": 188040 + }, + { + "epoch": 1.662423310171679, + "grad_norm": 1.7778161764144897, + "learning_rate": 2.2292944830472015e-05, + "loss": 0.6499, + "step": 188050 + }, + { + "epoch": 1.6625117134319913, + "grad_norm": 1.397182822227478, + "learning_rate": 2.2291471442800143e-05, + "loss": 0.5333, + "step": 188060 + }, + { + "epoch": 1.6626001166923037, + "grad_norm": 3.3567376136779785, + "learning_rate": 2.2289998055128275e-05, + "loss": 0.6544, + "step": 188070 + }, + { + "epoch": 1.6626885199526158, + "grad_norm": 3.1433956623077393, + "learning_rate": 2.2288524667456403e-05, + "loss": 0.6111, + "step": 188080 + }, + { + "epoch": 1.6627769232129281, + "grad_norm": 13.049280166625977, + "learning_rate": 2.228705127978453e-05, + "loss": 0.6801, + "step": 188090 + }, + { + "epoch": 1.6628653264732405, + "grad_norm": 1.7103008031845093, + "learning_rate": 2.2285577892112663e-05, + "loss": 0.5051, + "step": 188100 + }, + { + "epoch": 1.6629537297335526, + "grad_norm": 3.4923737049102783, + "learning_rate": 2.2284104504440792e-05, + "loss": 0.6392, + "step": 188110 + }, + { + "epoch": 1.6630421329938647, + "grad_norm": 1.9245039224624634, + "learning_rate": 2.228263111676892e-05, + "loss": 0.5564, + "step": 188120 + }, + { + "epoch": 1.663130536254177, + "grad_norm": 1.9640815258026123, + "learning_rate": 2.228115772909705e-05, + "loss": 0.6027, + "step": 188130 + }, + { + "epoch": 1.6632189395144894, + "grad_norm": 8.780740737915039, + "learning_rate": 2.227968434142518e-05, + "loss": 0.6337, + "step": 188140 + }, + { + "epoch": 1.6633073427748015, + "grad_norm": 2.6531083583831787, + "learning_rate": 2.227821095375331e-05, + "loss": 0.4749, + "step": 188150 + }, + { + "epoch": 1.6633957460351136, + "grad_norm": 1.0486245155334473, + "learning_rate": 2.2276737566081437e-05, + "loss": 0.4814, + "step": 188160 + }, + { + "epoch": 1.663484149295426, + "grad_norm": 5.010915756225586, + "learning_rate": 2.2275264178409565e-05, + "loss": 0.6252, + "step": 188170 + }, + { + "epoch": 1.6635725525557383, + "grad_norm": 12.290897369384766, + "learning_rate": 2.2273790790737697e-05, + "loss": 0.6341, + "step": 188180 + }, + { + "epoch": 1.6636609558160504, + "grad_norm": 8.985589981079102, + "learning_rate": 2.2272317403065825e-05, + "loss": 0.6347, + "step": 188190 + }, + { + "epoch": 1.6637493590763628, + "grad_norm": 11.642650604248047, + "learning_rate": 2.2270844015393954e-05, + "loss": 0.4847, + "step": 188200 + }, + { + "epoch": 1.6638377623366751, + "grad_norm": 5.4836320877075195, + "learning_rate": 2.2269370627722085e-05, + "loss": 0.628, + "step": 188210 + }, + { + "epoch": 1.6639261655969872, + "grad_norm": 9.160797119140625, + "learning_rate": 2.2267897240050214e-05, + "loss": 0.5779, + "step": 188220 + }, + { + "epoch": 1.6640145688572994, + "grad_norm": 4.764699935913086, + "learning_rate": 2.2266423852378342e-05, + "loss": 0.5482, + "step": 188230 + }, + { + "epoch": 1.6641029721176117, + "grad_norm": 1.3461828231811523, + "learning_rate": 2.226495046470647e-05, + "loss": 0.6048, + "step": 188240 + }, + { + "epoch": 1.664191375377924, + "grad_norm": 4.535086154937744, + "learning_rate": 2.2263477077034602e-05, + "loss": 0.5348, + "step": 188250 + }, + { + "epoch": 1.6642797786382362, + "grad_norm": 3.120342254638672, + "learning_rate": 2.226200368936273e-05, + "loss": 0.52, + "step": 188260 + }, + { + "epoch": 1.6643681818985483, + "grad_norm": 3.6663310527801514, + "learning_rate": 2.226053030169086e-05, + "loss": 0.5827, + "step": 188270 + }, + { + "epoch": 1.6644565851588606, + "grad_norm": 8.960272789001465, + "learning_rate": 2.225905691401899e-05, + "loss": 0.5495, + "step": 188280 + }, + { + "epoch": 1.664544988419173, + "grad_norm": 1.650512456893921, + "learning_rate": 2.225758352634712e-05, + "loss": 0.5691, + "step": 188290 + }, + { + "epoch": 1.664633391679485, + "grad_norm": 6.905547142028809, + "learning_rate": 2.2256110138675247e-05, + "loss": 0.6997, + "step": 188300 + }, + { + "epoch": 1.6647217949397972, + "grad_norm": 11.614811897277832, + "learning_rate": 2.2254636751003376e-05, + "loss": 0.6876, + "step": 188310 + }, + { + "epoch": 1.6648101982001098, + "grad_norm": 4.543526649475098, + "learning_rate": 2.2253163363331508e-05, + "loss": 0.5165, + "step": 188320 + }, + { + "epoch": 1.664898601460422, + "grad_norm": 6.2200727462768555, + "learning_rate": 2.2251689975659636e-05, + "loss": 0.5396, + "step": 188330 + }, + { + "epoch": 1.664987004720734, + "grad_norm": 5.304297924041748, + "learning_rate": 2.2250216587987764e-05, + "loss": 0.4579, + "step": 188340 + }, + { + "epoch": 1.6650754079810464, + "grad_norm": 4.436426162719727, + "learning_rate": 2.2248743200315893e-05, + "loss": 0.6397, + "step": 188350 + }, + { + "epoch": 1.6651638112413587, + "grad_norm": 11.11939525604248, + "learning_rate": 2.2247269812644024e-05, + "loss": 0.6046, + "step": 188360 + }, + { + "epoch": 1.6652522145016708, + "grad_norm": 1.316219687461853, + "learning_rate": 2.2245796424972153e-05, + "loss": 0.4938, + "step": 188370 + }, + { + "epoch": 1.665340617761983, + "grad_norm": 1.542055606842041, + "learning_rate": 2.224432303730028e-05, + "loss": 0.5233, + "step": 188380 + }, + { + "epoch": 1.6654290210222953, + "grad_norm": 8.293159484863281, + "learning_rate": 2.2242849649628413e-05, + "loss": 0.4731, + "step": 188390 + }, + { + "epoch": 1.6655174242826076, + "grad_norm": 15.16063404083252, + "learning_rate": 2.224137626195654e-05, + "loss": 0.6341, + "step": 188400 + }, + { + "epoch": 1.6656058275429197, + "grad_norm": 1.1575562953948975, + "learning_rate": 2.223990287428467e-05, + "loss": 0.4586, + "step": 188410 + }, + { + "epoch": 1.6656942308032319, + "grad_norm": 1.761497974395752, + "learning_rate": 2.22384294866128e-05, + "loss": 0.5505, + "step": 188420 + }, + { + "epoch": 1.6657826340635442, + "grad_norm": 4.031102180480957, + "learning_rate": 2.223695609894093e-05, + "loss": 0.5952, + "step": 188430 + }, + { + "epoch": 1.6658710373238566, + "grad_norm": 1.2118767499923706, + "learning_rate": 2.2235482711269058e-05, + "loss": 0.5478, + "step": 188440 + }, + { + "epoch": 1.6659594405841687, + "grad_norm": 1.559574007987976, + "learning_rate": 2.223400932359719e-05, + "loss": 0.5723, + "step": 188450 + }, + { + "epoch": 1.666047843844481, + "grad_norm": 3.166830062866211, + "learning_rate": 2.2232535935925318e-05, + "loss": 0.66, + "step": 188460 + }, + { + "epoch": 1.6661362471047934, + "grad_norm": 2.521289110183716, + "learning_rate": 2.2231062548253446e-05, + "loss": 0.5163, + "step": 188470 + }, + { + "epoch": 1.6662246503651055, + "grad_norm": 3.942953109741211, + "learning_rate": 2.2229589160581578e-05, + "loss": 0.4564, + "step": 188480 + }, + { + "epoch": 1.6663130536254176, + "grad_norm": 2.9778642654418945, + "learning_rate": 2.2228115772909706e-05, + "loss": 0.6761, + "step": 188490 + }, + { + "epoch": 1.66640145688573, + "grad_norm": 2.649951696395874, + "learning_rate": 2.2226642385237835e-05, + "loss": 0.5729, + "step": 188500 + }, + { + "epoch": 1.6664898601460423, + "grad_norm": 1.2398520708084106, + "learning_rate": 2.2225168997565967e-05, + "loss": 0.5507, + "step": 188510 + }, + { + "epoch": 1.6665782634063544, + "grad_norm": 1.4003452062606812, + "learning_rate": 2.2223695609894095e-05, + "loss": 0.5047, + "step": 188520 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 6.24932861328125, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.4166, + "step": 188530 + }, + { + "epoch": 1.6667550699269789, + "grad_norm": 2.1581618785858154, + "learning_rate": 2.2220748834550355e-05, + "loss": 0.7106, + "step": 188540 + }, + { + "epoch": 1.6668434731872912, + "grad_norm": 6.85422420501709, + "learning_rate": 2.2219275446878483e-05, + "loss": 0.6032, + "step": 188550 + }, + { + "epoch": 1.6669318764476033, + "grad_norm": 1.0539196729660034, + "learning_rate": 2.2217802059206612e-05, + "loss": 0.5044, + "step": 188560 + }, + { + "epoch": 1.6670202797079157, + "grad_norm": 1.972545862197876, + "learning_rate": 2.2216328671534743e-05, + "loss": 0.5386, + "step": 188570 + }, + { + "epoch": 1.667108682968228, + "grad_norm": 6.655059814453125, + "learning_rate": 2.2214855283862872e-05, + "loss": 0.5911, + "step": 188580 + }, + { + "epoch": 1.6671970862285401, + "grad_norm": 5.74491024017334, + "learning_rate": 2.2213381896191e-05, + "loss": 0.5598, + "step": 188590 + }, + { + "epoch": 1.6672854894888522, + "grad_norm": 1.4643230438232422, + "learning_rate": 2.221190850851913e-05, + "loss": 0.6169, + "step": 188600 + }, + { + "epoch": 1.6673738927491646, + "grad_norm": 2.5021920204162598, + "learning_rate": 2.221043512084726e-05, + "loss": 0.5406, + "step": 188610 + }, + { + "epoch": 1.667462296009477, + "grad_norm": 14.962636947631836, + "learning_rate": 2.220896173317539e-05, + "loss": 0.4999, + "step": 188620 + }, + { + "epoch": 1.667550699269789, + "grad_norm": 1.1757675409317017, + "learning_rate": 2.2207488345503517e-05, + "loss": 0.5597, + "step": 188630 + }, + { + "epoch": 1.6676391025301012, + "grad_norm": 3.5266504287719727, + "learning_rate": 2.2206014957831645e-05, + "loss": 0.5871, + "step": 188640 + }, + { + "epoch": 1.6677275057904135, + "grad_norm": 3.238380193710327, + "learning_rate": 2.2204541570159777e-05, + "loss": 0.5241, + "step": 188650 + }, + { + "epoch": 1.6678159090507259, + "grad_norm": 3.122004747390747, + "learning_rate": 2.2203068182487905e-05, + "loss": 0.6736, + "step": 188660 + }, + { + "epoch": 1.667904312311038, + "grad_norm": 1.0609664916992188, + "learning_rate": 2.2201594794816034e-05, + "loss": 0.6957, + "step": 188670 + }, + { + "epoch": 1.6679927155713503, + "grad_norm": 2.5533833503723145, + "learning_rate": 2.2200121407144166e-05, + "loss": 0.6652, + "step": 188680 + }, + { + "epoch": 1.6680811188316627, + "grad_norm": 3.396364450454712, + "learning_rate": 2.2198648019472294e-05, + "loss": 0.5739, + "step": 188690 + }, + { + "epoch": 1.6681695220919748, + "grad_norm": 14.063793182373047, + "learning_rate": 2.2197174631800422e-05, + "loss": 0.6358, + "step": 188700 + }, + { + "epoch": 1.668257925352287, + "grad_norm": 2.8764028549194336, + "learning_rate": 2.219570124412855e-05, + "loss": 0.5964, + "step": 188710 + }, + { + "epoch": 1.6683463286125992, + "grad_norm": 0.9605642557144165, + "learning_rate": 2.2194227856456682e-05, + "loss": 0.5841, + "step": 188720 + }, + { + "epoch": 1.6684347318729116, + "grad_norm": 2.392451047897339, + "learning_rate": 2.219275446878481e-05, + "loss": 0.5822, + "step": 188730 + }, + { + "epoch": 1.6685231351332237, + "grad_norm": 2.0240206718444824, + "learning_rate": 2.219128108111294e-05, + "loss": 0.5108, + "step": 188740 + }, + { + "epoch": 1.6686115383935358, + "grad_norm": 1.1215158700942993, + "learning_rate": 2.218980769344107e-05, + "loss": 0.4555, + "step": 188750 + }, + { + "epoch": 1.6686999416538482, + "grad_norm": 7.258119583129883, + "learning_rate": 2.21883343057692e-05, + "loss": 0.5292, + "step": 188760 + }, + { + "epoch": 1.6687883449141605, + "grad_norm": 2.3384623527526855, + "learning_rate": 2.2186860918097327e-05, + "loss": 0.6956, + "step": 188770 + }, + { + "epoch": 1.6688767481744726, + "grad_norm": 4.769532203674316, + "learning_rate": 2.2185387530425456e-05, + "loss": 0.6884, + "step": 188780 + }, + { + "epoch": 1.668965151434785, + "grad_norm": 1.6644866466522217, + "learning_rate": 2.2183914142753588e-05, + "loss": 0.7263, + "step": 188790 + }, + { + "epoch": 1.6690535546950973, + "grad_norm": 2.7710771560668945, + "learning_rate": 2.2182440755081716e-05, + "loss": 0.5692, + "step": 188800 + }, + { + "epoch": 1.6691419579554094, + "grad_norm": 1.0649518966674805, + "learning_rate": 2.2180967367409844e-05, + "loss": 0.6625, + "step": 188810 + }, + { + "epoch": 1.6692303612157215, + "grad_norm": 4.310955047607422, + "learning_rate": 2.2179493979737973e-05, + "loss": 0.5366, + "step": 188820 + }, + { + "epoch": 1.669318764476034, + "grad_norm": 6.253264904022217, + "learning_rate": 2.2178020592066104e-05, + "loss": 0.5959, + "step": 188830 + }, + { + "epoch": 1.6694071677363462, + "grad_norm": 0.9093913435935974, + "learning_rate": 2.2176547204394233e-05, + "loss": 0.5524, + "step": 188840 + }, + { + "epoch": 1.6694955709966584, + "grad_norm": 1.5057843923568726, + "learning_rate": 2.217507381672236e-05, + "loss": 0.585, + "step": 188850 + }, + { + "epoch": 1.6695839742569705, + "grad_norm": 4.095244407653809, + "learning_rate": 2.2173600429050493e-05, + "loss": 0.7318, + "step": 188860 + }, + { + "epoch": 1.6696723775172828, + "grad_norm": 3.4648289680480957, + "learning_rate": 2.217212704137862e-05, + "loss": 0.6113, + "step": 188870 + }, + { + "epoch": 1.6697607807775952, + "grad_norm": 5.554011821746826, + "learning_rate": 2.217065365370675e-05, + "loss": 0.5773, + "step": 188880 + }, + { + "epoch": 1.6698491840379073, + "grad_norm": 15.912015914916992, + "learning_rate": 2.2169180266034878e-05, + "loss": 0.6196, + "step": 188890 + }, + { + "epoch": 1.6699375872982194, + "grad_norm": 4.737215518951416, + "learning_rate": 2.216770687836301e-05, + "loss": 0.607, + "step": 188900 + }, + { + "epoch": 1.670025990558532, + "grad_norm": 2.334709644317627, + "learning_rate": 2.2166233490691138e-05, + "loss": 0.6417, + "step": 188910 + }, + { + "epoch": 1.670114393818844, + "grad_norm": 2.6589438915252686, + "learning_rate": 2.2164760103019266e-05, + "loss": 0.5442, + "step": 188920 + }, + { + "epoch": 1.6702027970791562, + "grad_norm": 0.9686906933784485, + "learning_rate": 2.2163286715347398e-05, + "loss": 0.548, + "step": 188930 + }, + { + "epoch": 1.6702912003394685, + "grad_norm": 2.9497954845428467, + "learning_rate": 2.2161813327675526e-05, + "loss": 0.5604, + "step": 188940 + }, + { + "epoch": 1.6703796035997809, + "grad_norm": 1.4715207815170288, + "learning_rate": 2.2160339940003655e-05, + "loss": 0.5868, + "step": 188950 + }, + { + "epoch": 1.670468006860093, + "grad_norm": 6.004668235778809, + "learning_rate": 2.2158866552331783e-05, + "loss": 0.6727, + "step": 188960 + }, + { + "epoch": 1.6705564101204051, + "grad_norm": 5.1338348388671875, + "learning_rate": 2.2157393164659915e-05, + "loss": 0.6981, + "step": 188970 + }, + { + "epoch": 1.6706448133807175, + "grad_norm": 2.2565853595733643, + "learning_rate": 2.2155919776988043e-05, + "loss": 0.5748, + "step": 188980 + }, + { + "epoch": 1.6707332166410298, + "grad_norm": 3.186152935028076, + "learning_rate": 2.215444638931617e-05, + "loss": 0.6113, + "step": 188990 + }, + { + "epoch": 1.670821619901342, + "grad_norm": 4.91539192199707, + "learning_rate": 2.21529730016443e-05, + "loss": 0.5266, + "step": 189000 + }, + { + "epoch": 1.670910023161654, + "grad_norm": 1.577486276626587, + "learning_rate": 2.2151499613972432e-05, + "loss": 0.4357, + "step": 189010 + }, + { + "epoch": 1.6709984264219664, + "grad_norm": 4.022982120513916, + "learning_rate": 2.215002622630056e-05, + "loss": 0.6327, + "step": 189020 + }, + { + "epoch": 1.6710868296822787, + "grad_norm": 1.7423595190048218, + "learning_rate": 2.214855283862869e-05, + "loss": 0.6096, + "step": 189030 + }, + { + "epoch": 1.6711752329425908, + "grad_norm": 11.674312591552734, + "learning_rate": 2.214707945095682e-05, + "loss": 0.573, + "step": 189040 + }, + { + "epoch": 1.6712636362029032, + "grad_norm": 1.4974020719528198, + "learning_rate": 2.214560606328495e-05, + "loss": 0.4743, + "step": 189050 + }, + { + "epoch": 1.6713520394632155, + "grad_norm": 2.0666520595550537, + "learning_rate": 2.2144132675613077e-05, + "loss": 0.7592, + "step": 189060 + }, + { + "epoch": 1.6714404427235277, + "grad_norm": 2.0698180198669434, + "learning_rate": 2.2142659287941205e-05, + "loss": 0.5692, + "step": 189070 + }, + { + "epoch": 1.6715288459838398, + "grad_norm": 4.766151428222656, + "learning_rate": 2.2141185900269337e-05, + "loss": 0.6432, + "step": 189080 + }, + { + "epoch": 1.6716172492441521, + "grad_norm": 2.26936411857605, + "learning_rate": 2.2139712512597465e-05, + "loss": 0.6432, + "step": 189090 + }, + { + "epoch": 1.6717056525044645, + "grad_norm": 8.839527130126953, + "learning_rate": 2.2138239124925594e-05, + "loss": 0.5871, + "step": 189100 + }, + { + "epoch": 1.6717940557647766, + "grad_norm": 7.770999908447266, + "learning_rate": 2.2136765737253722e-05, + "loss": 0.644, + "step": 189110 + }, + { + "epoch": 1.6718824590250887, + "grad_norm": 1.8214045763015747, + "learning_rate": 2.2135292349581854e-05, + "loss": 0.6545, + "step": 189120 + }, + { + "epoch": 1.671970862285401, + "grad_norm": 11.122117042541504, + "learning_rate": 2.2133818961909982e-05, + "loss": 0.4978, + "step": 189130 + }, + { + "epoch": 1.6720592655457134, + "grad_norm": 1.1969584226608276, + "learning_rate": 2.213234557423811e-05, + "loss": 0.7185, + "step": 189140 + }, + { + "epoch": 1.6721476688060255, + "grad_norm": 11.206798553466797, + "learning_rate": 2.2130872186566242e-05, + "loss": 0.5742, + "step": 189150 + }, + { + "epoch": 1.6722360720663378, + "grad_norm": 7.0461859703063965, + "learning_rate": 2.212939879889437e-05, + "loss": 0.6616, + "step": 189160 + }, + { + "epoch": 1.6723244753266502, + "grad_norm": 2.256686210632324, + "learning_rate": 2.21279254112225e-05, + "loss": 0.7487, + "step": 189170 + }, + { + "epoch": 1.6724128785869623, + "grad_norm": 3.1147313117980957, + "learning_rate": 2.2126452023550627e-05, + "loss": 0.5055, + "step": 189180 + }, + { + "epoch": 1.6725012818472744, + "grad_norm": 1.971746563911438, + "learning_rate": 2.212497863587876e-05, + "loss": 0.6966, + "step": 189190 + }, + { + "epoch": 1.6725896851075868, + "grad_norm": 3.3352856636047363, + "learning_rate": 2.2123505248206887e-05, + "loss": 0.6319, + "step": 189200 + }, + { + "epoch": 1.672678088367899, + "grad_norm": 1.1682038307189941, + "learning_rate": 2.2122031860535016e-05, + "loss": 0.5754, + "step": 189210 + }, + { + "epoch": 1.6727664916282112, + "grad_norm": 2.785252094268799, + "learning_rate": 2.2120558472863147e-05, + "loss": 0.5977, + "step": 189220 + }, + { + "epoch": 1.6728548948885233, + "grad_norm": 3.5565831661224365, + "learning_rate": 2.2119085085191276e-05, + "loss": 0.5839, + "step": 189230 + }, + { + "epoch": 1.6729432981488357, + "grad_norm": 3.4115993976593018, + "learning_rate": 2.2117611697519404e-05, + "loss": 0.6556, + "step": 189240 + }, + { + "epoch": 1.673031701409148, + "grad_norm": 4.680783271789551, + "learning_rate": 2.2116138309847533e-05, + "loss": 0.6052, + "step": 189250 + }, + { + "epoch": 1.6731201046694602, + "grad_norm": 1.8483526706695557, + "learning_rate": 2.2114664922175664e-05, + "loss": 0.6151, + "step": 189260 + }, + { + "epoch": 1.6732085079297725, + "grad_norm": 2.25266695022583, + "learning_rate": 2.2113191534503793e-05, + "loss": 0.6653, + "step": 189270 + }, + { + "epoch": 1.6732969111900848, + "grad_norm": 1.655000925064087, + "learning_rate": 2.211171814683192e-05, + "loss": 0.6066, + "step": 189280 + }, + { + "epoch": 1.673385314450397, + "grad_norm": 2.4040040969848633, + "learning_rate": 2.211024475916005e-05, + "loss": 0.5819, + "step": 189290 + }, + { + "epoch": 1.673473717710709, + "grad_norm": 2.0747737884521484, + "learning_rate": 2.210877137148818e-05, + "loss": 0.5092, + "step": 189300 + }, + { + "epoch": 1.6735621209710214, + "grad_norm": 4.702084064483643, + "learning_rate": 2.210729798381631e-05, + "loss": 0.5864, + "step": 189310 + }, + { + "epoch": 1.6736505242313338, + "grad_norm": 4.6156768798828125, + "learning_rate": 2.2105824596144438e-05, + "loss": 0.6299, + "step": 189320 + }, + { + "epoch": 1.6737389274916459, + "grad_norm": 5.976464748382568, + "learning_rate": 2.210435120847257e-05, + "loss": 0.7717, + "step": 189330 + }, + { + "epoch": 1.673827330751958, + "grad_norm": 1.6078767776489258, + "learning_rate": 2.2102877820800698e-05, + "loss": 0.547, + "step": 189340 + }, + { + "epoch": 1.6739157340122703, + "grad_norm": 3.873584747314453, + "learning_rate": 2.2101404433128826e-05, + "loss": 0.5983, + "step": 189350 + }, + { + "epoch": 1.6740041372725827, + "grad_norm": 1.2261841297149658, + "learning_rate": 2.2099931045456958e-05, + "loss": 0.6695, + "step": 189360 + }, + { + "epoch": 1.6740925405328948, + "grad_norm": 8.731363296508789, + "learning_rate": 2.2098457657785086e-05, + "loss": 0.6594, + "step": 189370 + }, + { + "epoch": 1.6741809437932071, + "grad_norm": 5.022148132324219, + "learning_rate": 2.2096984270113215e-05, + "loss": 0.6323, + "step": 189380 + }, + { + "epoch": 1.6742693470535195, + "grad_norm": 2.15146803855896, + "learning_rate": 2.2095510882441346e-05, + "loss": 0.5235, + "step": 189390 + }, + { + "epoch": 1.6743577503138316, + "grad_norm": 3.7636878490448, + "learning_rate": 2.2094037494769475e-05, + "loss": 0.601, + "step": 189400 + }, + { + "epoch": 1.6744461535741437, + "grad_norm": 2.561110496520996, + "learning_rate": 2.2092564107097603e-05, + "loss": 0.5239, + "step": 189410 + }, + { + "epoch": 1.674534556834456, + "grad_norm": 1.4131098985671997, + "learning_rate": 2.2091090719425735e-05, + "loss": 0.5494, + "step": 189420 + }, + { + "epoch": 1.6746229600947684, + "grad_norm": 0.7710692286491394, + "learning_rate": 2.2089617331753863e-05, + "loss": 0.5949, + "step": 189430 + }, + { + "epoch": 1.6747113633550805, + "grad_norm": 5.19439697265625, + "learning_rate": 2.208814394408199e-05, + "loss": 0.6324, + "step": 189440 + }, + { + "epoch": 1.6747997666153926, + "grad_norm": 1.2359737157821655, + "learning_rate": 2.2086670556410123e-05, + "loss": 0.5129, + "step": 189450 + }, + { + "epoch": 1.674888169875705, + "grad_norm": 2.959333658218384, + "learning_rate": 2.208519716873825e-05, + "loss": 0.5621, + "step": 189460 + }, + { + "epoch": 1.6749765731360173, + "grad_norm": 1.3144824504852295, + "learning_rate": 2.208372378106638e-05, + "loss": 0.7098, + "step": 189470 + }, + { + "epoch": 1.6750649763963295, + "grad_norm": 1.24237859249115, + "learning_rate": 2.2082250393394512e-05, + "loss": 0.5468, + "step": 189480 + }, + { + "epoch": 1.6751533796566416, + "grad_norm": 10.24328327178955, + "learning_rate": 2.208077700572264e-05, + "loss": 0.568, + "step": 189490 + }, + { + "epoch": 1.6752417829169541, + "grad_norm": 16.48509407043457, + "learning_rate": 2.207930361805077e-05, + "loss": 0.6656, + "step": 189500 + }, + { + "epoch": 1.6753301861772663, + "grad_norm": 8.129464149475098, + "learning_rate": 2.20778302303789e-05, + "loss": 0.5774, + "step": 189510 + }, + { + "epoch": 1.6754185894375784, + "grad_norm": 1.916213035583496, + "learning_rate": 2.207635684270703e-05, + "loss": 0.5001, + "step": 189520 + }, + { + "epoch": 1.6755069926978907, + "grad_norm": 1.5008715391159058, + "learning_rate": 2.2074883455035157e-05, + "loss": 0.6012, + "step": 189530 + }, + { + "epoch": 1.675595395958203, + "grad_norm": 3.7476067543029785, + "learning_rate": 2.2073410067363285e-05, + "loss": 0.6386, + "step": 189540 + }, + { + "epoch": 1.6756837992185152, + "grad_norm": 14.789105415344238, + "learning_rate": 2.2071936679691417e-05, + "loss": 0.5663, + "step": 189550 + }, + { + "epoch": 1.6757722024788273, + "grad_norm": 5.446702003479004, + "learning_rate": 2.2070463292019545e-05, + "loss": 0.6136, + "step": 189560 + }, + { + "epoch": 1.6758606057391396, + "grad_norm": 3.216346025466919, + "learning_rate": 2.2068989904347674e-05, + "loss": 0.5366, + "step": 189570 + }, + { + "epoch": 1.675949008999452, + "grad_norm": 8.363117218017578, + "learning_rate": 2.2067516516675805e-05, + "loss": 0.5766, + "step": 189580 + }, + { + "epoch": 1.676037412259764, + "grad_norm": 6.885417938232422, + "learning_rate": 2.2066043129003934e-05, + "loss": 0.492, + "step": 189590 + }, + { + "epoch": 1.6761258155200762, + "grad_norm": 5.379096984863281, + "learning_rate": 2.2064569741332062e-05, + "loss": 0.576, + "step": 189600 + }, + { + "epoch": 1.6762142187803888, + "grad_norm": 8.03687572479248, + "learning_rate": 2.206309635366019e-05, + "loss": 0.7067, + "step": 189610 + }, + { + "epoch": 1.676302622040701, + "grad_norm": 3.711484670639038, + "learning_rate": 2.2061622965988322e-05, + "loss": 0.6372, + "step": 189620 + }, + { + "epoch": 1.676391025301013, + "grad_norm": 11.158401489257812, + "learning_rate": 2.206014957831645e-05, + "loss": 0.6165, + "step": 189630 + }, + { + "epoch": 1.6764794285613254, + "grad_norm": 2.3779067993164062, + "learning_rate": 2.205867619064458e-05, + "loss": 0.5788, + "step": 189640 + }, + { + "epoch": 1.6765678318216377, + "grad_norm": 9.3578462600708, + "learning_rate": 2.2057202802972707e-05, + "loss": 0.5638, + "step": 189650 + }, + { + "epoch": 1.6766562350819498, + "grad_norm": 4.641378402709961, + "learning_rate": 2.205572941530084e-05, + "loss": 0.5555, + "step": 189660 + }, + { + "epoch": 1.676744638342262, + "grad_norm": 2.9508461952209473, + "learning_rate": 2.2054256027628967e-05, + "loss": 0.6674, + "step": 189670 + }, + { + "epoch": 1.6768330416025743, + "grad_norm": 1.9657361507415771, + "learning_rate": 2.2052782639957096e-05, + "loss": 0.6882, + "step": 189680 + }, + { + "epoch": 1.6769214448628866, + "grad_norm": 2.3473188877105713, + "learning_rate": 2.2051309252285228e-05, + "loss": 0.5233, + "step": 189690 + }, + { + "epoch": 1.6770098481231988, + "grad_norm": 0.8163551688194275, + "learning_rate": 2.2049835864613356e-05, + "loss": 0.5934, + "step": 189700 + }, + { + "epoch": 1.6770982513835109, + "grad_norm": 5.901861667633057, + "learning_rate": 2.2048362476941484e-05, + "loss": 0.5388, + "step": 189710 + }, + { + "epoch": 1.6771866546438232, + "grad_norm": 2.464370012283325, + "learning_rate": 2.2046889089269613e-05, + "loss": 0.6292, + "step": 189720 + }, + { + "epoch": 1.6772750579041356, + "grad_norm": 3.0845062732696533, + "learning_rate": 2.2045415701597744e-05, + "loss": 0.7139, + "step": 189730 + }, + { + "epoch": 1.6773634611644477, + "grad_norm": 3.8142809867858887, + "learning_rate": 2.2043942313925873e-05, + "loss": 0.6596, + "step": 189740 + }, + { + "epoch": 1.67745186442476, + "grad_norm": 6.545985221862793, + "learning_rate": 2.2042468926254e-05, + "loss": 0.4766, + "step": 189750 + }, + { + "epoch": 1.6775402676850724, + "grad_norm": 2.106898069381714, + "learning_rate": 2.204099553858213e-05, + "loss": 0.6604, + "step": 189760 + }, + { + "epoch": 1.6776286709453845, + "grad_norm": 8.585613250732422, + "learning_rate": 2.203952215091026e-05, + "loss": 0.5835, + "step": 189770 + }, + { + "epoch": 1.6777170742056966, + "grad_norm": 0.9737794995307922, + "learning_rate": 2.203804876323839e-05, + "loss": 0.6715, + "step": 189780 + }, + { + "epoch": 1.677805477466009, + "grad_norm": 2.4751617908477783, + "learning_rate": 2.2036575375566518e-05, + "loss": 0.5614, + "step": 189790 + }, + { + "epoch": 1.6778938807263213, + "grad_norm": 8.864995956420898, + "learning_rate": 2.203510198789465e-05, + "loss": 0.6392, + "step": 189800 + }, + { + "epoch": 1.6779822839866334, + "grad_norm": 10.63521671295166, + "learning_rate": 2.2033628600222778e-05, + "loss": 0.6142, + "step": 189810 + }, + { + "epoch": 1.6780706872469455, + "grad_norm": 1.2009896039962769, + "learning_rate": 2.2032155212550906e-05, + "loss": 0.6091, + "step": 189820 + }, + { + "epoch": 1.6781590905072579, + "grad_norm": 1.397385835647583, + "learning_rate": 2.2030681824879035e-05, + "loss": 0.7609, + "step": 189830 + }, + { + "epoch": 1.6782474937675702, + "grad_norm": 2.633423328399658, + "learning_rate": 2.2029208437207166e-05, + "loss": 0.5759, + "step": 189840 + }, + { + "epoch": 1.6783358970278823, + "grad_norm": 3.4063544273376465, + "learning_rate": 2.2027735049535295e-05, + "loss": 0.5684, + "step": 189850 + }, + { + "epoch": 1.6784243002881947, + "grad_norm": 10.041712760925293, + "learning_rate": 2.2026261661863423e-05, + "loss": 0.6201, + "step": 189860 + }, + { + "epoch": 1.678512703548507, + "grad_norm": 4.14884090423584, + "learning_rate": 2.2024788274191555e-05, + "loss": 0.5453, + "step": 189870 + }, + { + "epoch": 1.6786011068088191, + "grad_norm": 6.366163730621338, + "learning_rate": 2.2023314886519683e-05, + "loss": 0.6861, + "step": 189880 + }, + { + "epoch": 1.6786895100691313, + "grad_norm": 2.5637083053588867, + "learning_rate": 2.202184149884781e-05, + "loss": 0.6522, + "step": 189890 + }, + { + "epoch": 1.6787779133294436, + "grad_norm": 1.2532386779785156, + "learning_rate": 2.202036811117594e-05, + "loss": 0.6338, + "step": 189900 + }, + { + "epoch": 1.678866316589756, + "grad_norm": 4.722858428955078, + "learning_rate": 2.201889472350407e-05, + "loss": 0.5245, + "step": 189910 + }, + { + "epoch": 1.678954719850068, + "grad_norm": 3.1113734245300293, + "learning_rate": 2.20174213358322e-05, + "loss": 0.6467, + "step": 189920 + }, + { + "epoch": 1.6790431231103802, + "grad_norm": 1.4831795692443848, + "learning_rate": 2.201594794816033e-05, + "loss": 0.5092, + "step": 189930 + }, + { + "epoch": 1.6791315263706925, + "grad_norm": 3.9340319633483887, + "learning_rate": 2.2014474560488457e-05, + "loss": 0.6271, + "step": 189940 + }, + { + "epoch": 1.6792199296310049, + "grad_norm": 1.2145519256591797, + "learning_rate": 2.201300117281659e-05, + "loss": 0.5553, + "step": 189950 + }, + { + "epoch": 1.679308332891317, + "grad_norm": 1.2873629331588745, + "learning_rate": 2.2011527785144717e-05, + "loss": 0.6549, + "step": 189960 + }, + { + "epoch": 1.6793967361516293, + "grad_norm": 1.677676796913147, + "learning_rate": 2.2010054397472845e-05, + "loss": 0.5991, + "step": 189970 + }, + { + "epoch": 1.6794851394119417, + "grad_norm": 2.600574254989624, + "learning_rate": 2.2008581009800977e-05, + "loss": 0.5597, + "step": 189980 + }, + { + "epoch": 1.6795735426722538, + "grad_norm": 3.347747564315796, + "learning_rate": 2.2007107622129105e-05, + "loss": 0.6057, + "step": 189990 + }, + { + "epoch": 1.679661945932566, + "grad_norm": 1.7320047616958618, + "learning_rate": 2.2005634234457234e-05, + "loss": 0.6227, + "step": 190000 + }, + { + "epoch": 1.6797503491928782, + "grad_norm": 3.8314831256866455, + "learning_rate": 2.2004160846785362e-05, + "loss": 0.6426, + "step": 190010 + }, + { + "epoch": 1.6798387524531906, + "grad_norm": 4.242805004119873, + "learning_rate": 2.2002687459113494e-05, + "loss": 0.6173, + "step": 190020 + }, + { + "epoch": 1.6799271557135027, + "grad_norm": 1.511187195777893, + "learning_rate": 2.2001214071441622e-05, + "loss": 0.5629, + "step": 190030 + }, + { + "epoch": 1.6800155589738148, + "grad_norm": 2.0220277309417725, + "learning_rate": 2.199974068376975e-05, + "loss": 0.5588, + "step": 190040 + }, + { + "epoch": 1.6801039622341272, + "grad_norm": 4.622857570648193, + "learning_rate": 2.1998267296097882e-05, + "loss": 0.5901, + "step": 190050 + }, + { + "epoch": 1.6801923654944395, + "grad_norm": 2.2772791385650635, + "learning_rate": 2.199679390842601e-05, + "loss": 0.6768, + "step": 190060 + }, + { + "epoch": 1.6802807687547516, + "grad_norm": 9.772955894470215, + "learning_rate": 2.199532052075414e-05, + "loss": 0.559, + "step": 190070 + }, + { + "epoch": 1.6803691720150638, + "grad_norm": 3.5287930965423584, + "learning_rate": 2.1993847133082267e-05, + "loss": 0.6372, + "step": 190080 + }, + { + "epoch": 1.6804575752753763, + "grad_norm": 3.7590224742889404, + "learning_rate": 2.19923737454104e-05, + "loss": 0.7284, + "step": 190090 + }, + { + "epoch": 1.6805459785356884, + "grad_norm": 10.513998031616211, + "learning_rate": 2.1990900357738527e-05, + "loss": 0.5029, + "step": 190100 + }, + { + "epoch": 1.6806343817960006, + "grad_norm": 1.8067030906677246, + "learning_rate": 2.1989426970066656e-05, + "loss": 0.7068, + "step": 190110 + }, + { + "epoch": 1.680722785056313, + "grad_norm": 10.699889183044434, + "learning_rate": 2.1987953582394784e-05, + "loss": 0.5368, + "step": 190120 + }, + { + "epoch": 1.6808111883166252, + "grad_norm": 10.229439735412598, + "learning_rate": 2.1986480194722916e-05, + "loss": 0.5624, + "step": 190130 + }, + { + "epoch": 1.6808995915769374, + "grad_norm": 1.7228556871414185, + "learning_rate": 2.1985006807051044e-05, + "loss": 0.6638, + "step": 190140 + }, + { + "epoch": 1.6809879948372495, + "grad_norm": 2.5064215660095215, + "learning_rate": 2.1983533419379172e-05, + "loss": 0.5796, + "step": 190150 + }, + { + "epoch": 1.6810763980975618, + "grad_norm": 3.2687125205993652, + "learning_rate": 2.1982060031707304e-05, + "loss": 0.6425, + "step": 190160 + }, + { + "epoch": 1.6811648013578742, + "grad_norm": 2.8402090072631836, + "learning_rate": 2.1980586644035433e-05, + "loss": 0.5586, + "step": 190170 + }, + { + "epoch": 1.6812532046181863, + "grad_norm": 5.387648582458496, + "learning_rate": 2.197911325636356e-05, + "loss": 0.6745, + "step": 190180 + }, + { + "epoch": 1.6813416078784984, + "grad_norm": 1.676777958869934, + "learning_rate": 2.197763986869169e-05, + "loss": 0.5778, + "step": 190190 + }, + { + "epoch": 1.681430011138811, + "grad_norm": 6.165497779846191, + "learning_rate": 2.197616648101982e-05, + "loss": 0.5702, + "step": 190200 + }, + { + "epoch": 1.681518414399123, + "grad_norm": 1.4563077688217163, + "learning_rate": 2.197469309334795e-05, + "loss": 0.5723, + "step": 190210 + }, + { + "epoch": 1.6816068176594352, + "grad_norm": 2.4805495738983154, + "learning_rate": 2.1973219705676078e-05, + "loss": 0.5249, + "step": 190220 + }, + { + "epoch": 1.6816952209197475, + "grad_norm": 2.617976188659668, + "learning_rate": 2.1971746318004206e-05, + "loss": 0.7356, + "step": 190230 + }, + { + "epoch": 1.6817836241800599, + "grad_norm": 1.9330956935882568, + "learning_rate": 2.1970272930332338e-05, + "loss": 0.6424, + "step": 190240 + }, + { + "epoch": 1.681872027440372, + "grad_norm": 11.002070426940918, + "learning_rate": 2.1968799542660466e-05, + "loss": 0.6507, + "step": 190250 + }, + { + "epoch": 1.6819604307006841, + "grad_norm": 1.3374661207199097, + "learning_rate": 2.1967326154988595e-05, + "loss": 0.5391, + "step": 190260 + }, + { + "epoch": 1.6820488339609965, + "grad_norm": 3.5891616344451904, + "learning_rate": 2.1965852767316726e-05, + "loss": 0.4846, + "step": 190270 + }, + { + "epoch": 1.6821372372213088, + "grad_norm": 1.4183052778244019, + "learning_rate": 2.1964379379644855e-05, + "loss": 0.5757, + "step": 190280 + }, + { + "epoch": 1.682225640481621, + "grad_norm": 9.315398216247559, + "learning_rate": 2.1962905991972983e-05, + "loss": 0.5603, + "step": 190290 + }, + { + "epoch": 1.682314043741933, + "grad_norm": 9.497298240661621, + "learning_rate": 2.1961432604301115e-05, + "loss": 0.6526, + "step": 190300 + }, + { + "epoch": 1.6824024470022454, + "grad_norm": 1.1867121458053589, + "learning_rate": 2.1959959216629243e-05, + "loss": 0.6201, + "step": 190310 + }, + { + "epoch": 1.6824908502625577, + "grad_norm": 9.748201370239258, + "learning_rate": 2.195848582895737e-05, + "loss": 0.4557, + "step": 190320 + }, + { + "epoch": 1.6825792535228699, + "grad_norm": 5.4763336181640625, + "learning_rate": 2.1957012441285503e-05, + "loss": 0.5302, + "step": 190330 + }, + { + "epoch": 1.6826676567831822, + "grad_norm": 2.238471031188965, + "learning_rate": 2.195553905361363e-05, + "loss": 0.4887, + "step": 190340 + }, + { + "epoch": 1.6827560600434945, + "grad_norm": 2.178219795227051, + "learning_rate": 2.195406566594176e-05, + "loss": 0.6582, + "step": 190350 + }, + { + "epoch": 1.6828444633038067, + "grad_norm": 1.5363105535507202, + "learning_rate": 2.195259227826989e-05, + "loss": 0.6112, + "step": 190360 + }, + { + "epoch": 1.6829328665641188, + "grad_norm": 8.312849044799805, + "learning_rate": 2.195111889059802e-05, + "loss": 0.5893, + "step": 190370 + }, + { + "epoch": 1.6830212698244311, + "grad_norm": 2.584324359893799, + "learning_rate": 2.194964550292615e-05, + "loss": 0.7375, + "step": 190380 + }, + { + "epoch": 1.6831096730847435, + "grad_norm": 1.2932301759719849, + "learning_rate": 2.194817211525428e-05, + "loss": 0.5513, + "step": 190390 + }, + { + "epoch": 1.6831980763450556, + "grad_norm": 2.2087790966033936, + "learning_rate": 2.194669872758241e-05, + "loss": 0.4637, + "step": 190400 + }, + { + "epoch": 1.6832864796053677, + "grad_norm": 3.2174506187438965, + "learning_rate": 2.1945225339910537e-05, + "loss": 0.6108, + "step": 190410 + }, + { + "epoch": 1.68337488286568, + "grad_norm": 15.741214752197266, + "learning_rate": 2.194375195223867e-05, + "loss": 0.56, + "step": 190420 + }, + { + "epoch": 1.6834632861259924, + "grad_norm": 4.203559398651123, + "learning_rate": 2.1942278564566797e-05, + "loss": 0.5673, + "step": 190430 + }, + { + "epoch": 1.6835516893863045, + "grad_norm": 13.096375465393066, + "learning_rate": 2.1940805176894925e-05, + "loss": 0.5454, + "step": 190440 + }, + { + "epoch": 1.6836400926466168, + "grad_norm": 4.456557750701904, + "learning_rate": 2.1939331789223057e-05, + "loss": 0.7474, + "step": 190450 + }, + { + "epoch": 1.6837284959069292, + "grad_norm": 3.961533784866333, + "learning_rate": 2.1937858401551185e-05, + "loss": 0.5512, + "step": 190460 + }, + { + "epoch": 1.6838168991672413, + "grad_norm": 14.67947769165039, + "learning_rate": 2.1936385013879314e-05, + "loss": 0.652, + "step": 190470 + }, + { + "epoch": 1.6839053024275534, + "grad_norm": 1.3641750812530518, + "learning_rate": 2.1934911626207442e-05, + "loss": 0.6523, + "step": 190480 + }, + { + "epoch": 1.6839937056878658, + "grad_norm": 2.07220458984375, + "learning_rate": 2.1933438238535574e-05, + "loss": 0.4864, + "step": 190490 + }, + { + "epoch": 1.6840821089481781, + "grad_norm": 1.1580243110656738, + "learning_rate": 2.1931964850863702e-05, + "loss": 0.5769, + "step": 190500 + }, + { + "epoch": 1.6841705122084902, + "grad_norm": 1.169252634048462, + "learning_rate": 2.193049146319183e-05, + "loss": 0.5285, + "step": 190510 + }, + { + "epoch": 1.6842589154688024, + "grad_norm": 1.5489351749420166, + "learning_rate": 2.1929018075519962e-05, + "loss": 0.5721, + "step": 190520 + }, + { + "epoch": 1.6843473187291147, + "grad_norm": 1.0660382509231567, + "learning_rate": 2.192754468784809e-05, + "loss": 0.5511, + "step": 190530 + }, + { + "epoch": 1.684435721989427, + "grad_norm": 14.691634178161621, + "learning_rate": 2.192607130017622e-05, + "loss": 0.5029, + "step": 190540 + }, + { + "epoch": 1.6845241252497392, + "grad_norm": 2.0073564052581787, + "learning_rate": 2.1924597912504347e-05, + "loss": 0.5478, + "step": 190550 + }, + { + "epoch": 1.6846125285100515, + "grad_norm": 1.3661534786224365, + "learning_rate": 2.192312452483248e-05, + "loss": 0.67, + "step": 190560 + }, + { + "epoch": 1.6847009317703638, + "grad_norm": 6.212716102600098, + "learning_rate": 2.1921651137160607e-05, + "loss": 0.6697, + "step": 190570 + }, + { + "epoch": 1.684789335030676, + "grad_norm": 4.82363748550415, + "learning_rate": 2.1920177749488736e-05, + "loss": 0.5786, + "step": 190580 + }, + { + "epoch": 1.684877738290988, + "grad_norm": 4.08050012588501, + "learning_rate": 2.1918704361816864e-05, + "loss": 0.6144, + "step": 190590 + }, + { + "epoch": 1.6849661415513004, + "grad_norm": 8.38548469543457, + "learning_rate": 2.1917230974144996e-05, + "loss": 0.6661, + "step": 190600 + }, + { + "epoch": 1.6850545448116128, + "grad_norm": 2.0309576988220215, + "learning_rate": 2.1915757586473124e-05, + "loss": 0.5592, + "step": 190610 + }, + { + "epoch": 1.6851429480719249, + "grad_norm": 8.065064430236816, + "learning_rate": 2.1914284198801253e-05, + "loss": 0.5105, + "step": 190620 + }, + { + "epoch": 1.685231351332237, + "grad_norm": 3.297581434249878, + "learning_rate": 2.1912810811129384e-05, + "loss": 0.6416, + "step": 190630 + }, + { + "epoch": 1.6853197545925493, + "grad_norm": 1.1886439323425293, + "learning_rate": 2.1911337423457513e-05, + "loss": 0.6973, + "step": 190640 + }, + { + "epoch": 1.6854081578528617, + "grad_norm": 3.1800239086151123, + "learning_rate": 2.190986403578564e-05, + "loss": 0.6965, + "step": 190650 + }, + { + "epoch": 1.6854965611131738, + "grad_norm": 2.357185125350952, + "learning_rate": 2.190839064811377e-05, + "loss": 0.4668, + "step": 190660 + }, + { + "epoch": 1.685584964373486, + "grad_norm": 2.388274669647217, + "learning_rate": 2.19069172604419e-05, + "loss": 0.6018, + "step": 190670 + }, + { + "epoch": 1.6856733676337985, + "grad_norm": 6.639049053192139, + "learning_rate": 2.190544387277003e-05, + "loss": 0.7029, + "step": 190680 + }, + { + "epoch": 1.6857617708941106, + "grad_norm": 4.946562767028809, + "learning_rate": 2.1903970485098158e-05, + "loss": 0.5823, + "step": 190690 + }, + { + "epoch": 1.6858501741544227, + "grad_norm": 1.82542085647583, + "learning_rate": 2.1902497097426286e-05, + "loss": 0.5313, + "step": 190700 + }, + { + "epoch": 1.685938577414735, + "grad_norm": 2.4510159492492676, + "learning_rate": 2.1901023709754418e-05, + "loss": 0.5712, + "step": 190710 + }, + { + "epoch": 1.6860269806750474, + "grad_norm": 1.0207104682922363, + "learning_rate": 2.1899550322082546e-05, + "loss": 0.4856, + "step": 190720 + }, + { + "epoch": 1.6861153839353595, + "grad_norm": 3.2885923385620117, + "learning_rate": 2.1898076934410675e-05, + "loss": 0.5497, + "step": 190730 + }, + { + "epoch": 1.6862037871956717, + "grad_norm": 9.004790306091309, + "learning_rate": 2.1896603546738806e-05, + "loss": 0.5325, + "step": 190740 + }, + { + "epoch": 1.686292190455984, + "grad_norm": 4.0741119384765625, + "learning_rate": 2.1895130159066935e-05, + "loss": 0.5682, + "step": 190750 + }, + { + "epoch": 1.6863805937162963, + "grad_norm": 1.089857578277588, + "learning_rate": 2.1893656771395063e-05, + "loss": 0.5986, + "step": 190760 + }, + { + "epoch": 1.6864689969766085, + "grad_norm": 5.030314922332764, + "learning_rate": 2.189218338372319e-05, + "loss": 0.6099, + "step": 190770 + }, + { + "epoch": 1.6865574002369206, + "grad_norm": 3.630523204803467, + "learning_rate": 2.1890709996051323e-05, + "loss": 0.4763, + "step": 190780 + }, + { + "epoch": 1.6866458034972331, + "grad_norm": 1.2105838060379028, + "learning_rate": 2.188923660837945e-05, + "loss": 0.626, + "step": 190790 + }, + { + "epoch": 1.6867342067575453, + "grad_norm": 3.044017791748047, + "learning_rate": 2.188776322070758e-05, + "loss": 0.595, + "step": 190800 + }, + { + "epoch": 1.6868226100178574, + "grad_norm": 3.25032639503479, + "learning_rate": 2.188628983303571e-05, + "loss": 0.7181, + "step": 190810 + }, + { + "epoch": 1.6869110132781697, + "grad_norm": 2.1563756465911865, + "learning_rate": 2.188481644536384e-05, + "loss": 0.4322, + "step": 190820 + }, + { + "epoch": 1.686999416538482, + "grad_norm": 3.811094045639038, + "learning_rate": 2.188334305769197e-05, + "loss": 0.4748, + "step": 190830 + }, + { + "epoch": 1.6870878197987942, + "grad_norm": 5.4308648109436035, + "learning_rate": 2.1881869670020097e-05, + "loss": 0.5897, + "step": 190840 + }, + { + "epoch": 1.6871762230591063, + "grad_norm": 1.9380451440811157, + "learning_rate": 2.188039628234823e-05, + "loss": 0.5574, + "step": 190850 + }, + { + "epoch": 1.6872646263194186, + "grad_norm": 1.8837717771530151, + "learning_rate": 2.1878922894676357e-05, + "loss": 0.6488, + "step": 190860 + }, + { + "epoch": 1.687353029579731, + "grad_norm": 2.0130834579467773, + "learning_rate": 2.1877449507004485e-05, + "loss": 0.627, + "step": 190870 + }, + { + "epoch": 1.687441432840043, + "grad_norm": 1.5990207195281982, + "learning_rate": 2.1875976119332613e-05, + "loss": 0.5575, + "step": 190880 + }, + { + "epoch": 1.6875298361003552, + "grad_norm": 1.9707977771759033, + "learning_rate": 2.1874502731660745e-05, + "loss": 0.5864, + "step": 190890 + }, + { + "epoch": 1.6876182393606676, + "grad_norm": 3.9532060623168945, + "learning_rate": 2.1873029343988874e-05, + "loss": 0.6636, + "step": 190900 + }, + { + "epoch": 1.68770664262098, + "grad_norm": 2.506739854812622, + "learning_rate": 2.1871555956317002e-05, + "loss": 0.5359, + "step": 190910 + }, + { + "epoch": 1.687795045881292, + "grad_norm": 2.4119558334350586, + "learning_rate": 2.1870082568645134e-05, + "loss": 0.6798, + "step": 190920 + }, + { + "epoch": 1.6878834491416044, + "grad_norm": 4.066296100616455, + "learning_rate": 2.1868609180973262e-05, + "loss": 0.6281, + "step": 190930 + }, + { + "epoch": 1.6879718524019167, + "grad_norm": 3.9416284561157227, + "learning_rate": 2.186713579330139e-05, + "loss": 0.6315, + "step": 190940 + }, + { + "epoch": 1.6880602556622288, + "grad_norm": 2.1455953121185303, + "learning_rate": 2.186566240562952e-05, + "loss": 0.6335, + "step": 190950 + }, + { + "epoch": 1.688148658922541, + "grad_norm": 3.3208320140838623, + "learning_rate": 2.186418901795765e-05, + "loss": 0.5828, + "step": 190960 + }, + { + "epoch": 1.6882370621828533, + "grad_norm": 1.930897831916809, + "learning_rate": 2.186271563028578e-05, + "loss": 0.6321, + "step": 190970 + }, + { + "epoch": 1.6883254654431656, + "grad_norm": 3.0473833084106445, + "learning_rate": 2.1861242242613907e-05, + "loss": 0.6549, + "step": 190980 + }, + { + "epoch": 1.6884138687034778, + "grad_norm": 7.404590606689453, + "learning_rate": 2.185976885494204e-05, + "loss": 0.6192, + "step": 190990 + }, + { + "epoch": 1.6885022719637899, + "grad_norm": 1.7399927377700806, + "learning_rate": 2.1858295467270167e-05, + "loss": 0.6563, + "step": 191000 + }, + { + "epoch": 1.6885906752241022, + "grad_norm": 11.48447036743164, + "learning_rate": 2.1856822079598296e-05, + "loss": 0.5461, + "step": 191010 + }, + { + "epoch": 1.6886790784844146, + "grad_norm": 1.321085810661316, + "learning_rate": 2.1855348691926424e-05, + "loss": 0.5534, + "step": 191020 + }, + { + "epoch": 1.6887674817447267, + "grad_norm": 4.213069915771484, + "learning_rate": 2.1853875304254556e-05, + "loss": 0.5374, + "step": 191030 + }, + { + "epoch": 1.688855885005039, + "grad_norm": 1.8781251907348633, + "learning_rate": 2.1852401916582684e-05, + "loss": 0.5992, + "step": 191040 + }, + { + "epoch": 1.6889442882653514, + "grad_norm": 15.161416053771973, + "learning_rate": 2.1850928528910812e-05, + "loss": 0.5349, + "step": 191050 + }, + { + "epoch": 1.6890326915256635, + "grad_norm": 11.546005249023438, + "learning_rate": 2.184945514123894e-05, + "loss": 0.6534, + "step": 191060 + }, + { + "epoch": 1.6891210947859756, + "grad_norm": 7.424916744232178, + "learning_rate": 2.1847981753567073e-05, + "loss": 0.6056, + "step": 191070 + }, + { + "epoch": 1.689209498046288, + "grad_norm": 2.070619821548462, + "learning_rate": 2.18465083658952e-05, + "loss": 0.5307, + "step": 191080 + }, + { + "epoch": 1.6892979013066003, + "grad_norm": 0.9939049482345581, + "learning_rate": 2.184503497822333e-05, + "loss": 0.5588, + "step": 191090 + }, + { + "epoch": 1.6893863045669124, + "grad_norm": 4.085865020751953, + "learning_rate": 2.184356159055146e-05, + "loss": 0.5902, + "step": 191100 + }, + { + "epoch": 1.6894747078272245, + "grad_norm": 2.3851819038391113, + "learning_rate": 2.184208820287959e-05, + "loss": 0.5819, + "step": 191110 + }, + { + "epoch": 1.6895631110875369, + "grad_norm": 0.5734080076217651, + "learning_rate": 2.1840614815207718e-05, + "loss": 0.5529, + "step": 191120 + }, + { + "epoch": 1.6896515143478492, + "grad_norm": 3.099874258041382, + "learning_rate": 2.1839141427535846e-05, + "loss": 0.5503, + "step": 191130 + }, + { + "epoch": 1.6897399176081613, + "grad_norm": 1.7946574687957764, + "learning_rate": 2.1837668039863978e-05, + "loss": 0.7642, + "step": 191140 + }, + { + "epoch": 1.6898283208684737, + "grad_norm": 3.9887897968292236, + "learning_rate": 2.1836194652192106e-05, + "loss": 0.487, + "step": 191150 + }, + { + "epoch": 1.689916724128786, + "grad_norm": 2.7381110191345215, + "learning_rate": 2.1834721264520234e-05, + "loss": 0.704, + "step": 191160 + }, + { + "epoch": 1.6900051273890981, + "grad_norm": 2.4239556789398193, + "learning_rate": 2.1833247876848366e-05, + "loss": 0.5848, + "step": 191170 + }, + { + "epoch": 1.6900935306494103, + "grad_norm": 2.926347494125366, + "learning_rate": 2.1831774489176495e-05, + "loss": 0.4912, + "step": 191180 + }, + { + "epoch": 1.6901819339097226, + "grad_norm": 3.046971082687378, + "learning_rate": 2.1830301101504623e-05, + "loss": 0.5571, + "step": 191190 + }, + { + "epoch": 1.690270337170035, + "grad_norm": 2.106529951095581, + "learning_rate": 2.182882771383275e-05, + "loss": 0.4651, + "step": 191200 + }, + { + "epoch": 1.690358740430347, + "grad_norm": 2.125847578048706, + "learning_rate": 2.1827354326160883e-05, + "loss": 0.6046, + "step": 191210 + }, + { + "epoch": 1.6904471436906592, + "grad_norm": 1.7945876121520996, + "learning_rate": 2.182588093848901e-05, + "loss": 0.4768, + "step": 191220 + }, + { + "epoch": 1.6905355469509715, + "grad_norm": 5.428643703460693, + "learning_rate": 2.182440755081714e-05, + "loss": 0.5581, + "step": 191230 + }, + { + "epoch": 1.6906239502112839, + "grad_norm": 1.877885103225708, + "learning_rate": 2.182293416314527e-05, + "loss": 0.5371, + "step": 191240 + }, + { + "epoch": 1.690712353471596, + "grad_norm": 2.7024946212768555, + "learning_rate": 2.18214607754734e-05, + "loss": 0.6356, + "step": 191250 + }, + { + "epoch": 1.6908007567319083, + "grad_norm": 4.930706977844238, + "learning_rate": 2.1819987387801528e-05, + "loss": 0.5158, + "step": 191260 + }, + { + "epoch": 1.6908891599922207, + "grad_norm": 2.5163559913635254, + "learning_rate": 2.181851400012966e-05, + "loss": 0.6207, + "step": 191270 + }, + { + "epoch": 1.6909775632525328, + "grad_norm": 1.711630940437317, + "learning_rate": 2.1817040612457788e-05, + "loss": 0.5904, + "step": 191280 + }, + { + "epoch": 1.691065966512845, + "grad_norm": 7.230292320251465, + "learning_rate": 2.1815567224785917e-05, + "loss": 0.6343, + "step": 191290 + }, + { + "epoch": 1.6911543697731573, + "grad_norm": 4.234020709991455, + "learning_rate": 2.181409383711405e-05, + "loss": 0.583, + "step": 191300 + }, + { + "epoch": 1.6912427730334696, + "grad_norm": 2.0680196285247803, + "learning_rate": 2.1812620449442177e-05, + "loss": 0.679, + "step": 191310 + }, + { + "epoch": 1.6913311762937817, + "grad_norm": 2.2695934772491455, + "learning_rate": 2.1811147061770305e-05, + "loss": 0.5432, + "step": 191320 + }, + { + "epoch": 1.6914195795540938, + "grad_norm": 2.44629168510437, + "learning_rate": 2.1809673674098437e-05, + "loss": 0.6198, + "step": 191330 + }, + { + "epoch": 1.6915079828144062, + "grad_norm": 2.11736798286438, + "learning_rate": 2.1808200286426565e-05, + "loss": 0.5626, + "step": 191340 + }, + { + "epoch": 1.6915963860747185, + "grad_norm": 1.8716286420822144, + "learning_rate": 2.1806726898754694e-05, + "loss": 0.544, + "step": 191350 + }, + { + "epoch": 1.6916847893350306, + "grad_norm": 4.241274833679199, + "learning_rate": 2.1805253511082825e-05, + "loss": 0.3895, + "step": 191360 + }, + { + "epoch": 1.6917731925953428, + "grad_norm": 2.03879451751709, + "learning_rate": 2.1803780123410954e-05, + "loss": 0.5302, + "step": 191370 + }, + { + "epoch": 1.6918615958556553, + "grad_norm": 1.6585477590560913, + "learning_rate": 2.1802306735739082e-05, + "loss": 0.5525, + "step": 191380 + }, + { + "epoch": 1.6919499991159674, + "grad_norm": 1.5219755172729492, + "learning_rate": 2.1800833348067214e-05, + "loss": 0.5222, + "step": 191390 + }, + { + "epoch": 1.6920384023762796, + "grad_norm": 2.376049518585205, + "learning_rate": 2.1799359960395342e-05, + "loss": 0.601, + "step": 191400 + }, + { + "epoch": 1.692126805636592, + "grad_norm": 3.1819357872009277, + "learning_rate": 2.179788657272347e-05, + "loss": 0.5259, + "step": 191410 + }, + { + "epoch": 1.6922152088969042, + "grad_norm": 4.571484088897705, + "learning_rate": 2.17964131850516e-05, + "loss": 0.5156, + "step": 191420 + }, + { + "epoch": 1.6923036121572164, + "grad_norm": 7.575584411621094, + "learning_rate": 2.179493979737973e-05, + "loss": 0.7005, + "step": 191430 + }, + { + "epoch": 1.6923920154175285, + "grad_norm": 8.56787109375, + "learning_rate": 2.179346640970786e-05, + "loss": 0.6455, + "step": 191440 + }, + { + "epoch": 1.6924804186778408, + "grad_norm": 6.546863555908203, + "learning_rate": 2.1791993022035987e-05, + "loss": 0.7042, + "step": 191450 + }, + { + "epoch": 1.6925688219381532, + "grad_norm": 1.9080413579940796, + "learning_rate": 2.179051963436412e-05, + "loss": 0.6087, + "step": 191460 + }, + { + "epoch": 1.6926572251984653, + "grad_norm": 5.976744651794434, + "learning_rate": 2.1789046246692247e-05, + "loss": 0.5607, + "step": 191470 + }, + { + "epoch": 1.6927456284587774, + "grad_norm": 1.779448390007019, + "learning_rate": 2.1787572859020376e-05, + "loss": 0.628, + "step": 191480 + }, + { + "epoch": 1.6928340317190897, + "grad_norm": 2.0550694465637207, + "learning_rate": 2.1786099471348504e-05, + "loss": 0.5087, + "step": 191490 + }, + { + "epoch": 1.692922434979402, + "grad_norm": 1.993318796157837, + "learning_rate": 2.1784626083676636e-05, + "loss": 0.6802, + "step": 191500 + }, + { + "epoch": 1.6930108382397142, + "grad_norm": 2.724390983581543, + "learning_rate": 2.1783152696004764e-05, + "loss": 0.63, + "step": 191510 + }, + { + "epoch": 1.6930992415000266, + "grad_norm": 2.7625443935394287, + "learning_rate": 2.1781679308332893e-05, + "loss": 0.4151, + "step": 191520 + }, + { + "epoch": 1.693187644760339, + "grad_norm": 5.640219688415527, + "learning_rate": 2.178020592066102e-05, + "loss": 0.5271, + "step": 191530 + }, + { + "epoch": 1.693276048020651, + "grad_norm": 2.244678258895874, + "learning_rate": 2.1778732532989153e-05, + "loss": 0.5969, + "step": 191540 + }, + { + "epoch": 1.6933644512809631, + "grad_norm": 5.617044925689697, + "learning_rate": 2.177725914531728e-05, + "loss": 0.5922, + "step": 191550 + }, + { + "epoch": 1.6934528545412755, + "grad_norm": 1.4208678007125854, + "learning_rate": 2.177578575764541e-05, + "loss": 0.5552, + "step": 191560 + }, + { + "epoch": 1.6935412578015878, + "grad_norm": 1.5686712265014648, + "learning_rate": 2.177431236997354e-05, + "loss": 0.627, + "step": 191570 + }, + { + "epoch": 1.6936296610619, + "grad_norm": 6.022036552429199, + "learning_rate": 2.177283898230167e-05, + "loss": 0.6061, + "step": 191580 + }, + { + "epoch": 1.693718064322212, + "grad_norm": 18.178495407104492, + "learning_rate": 2.1771365594629798e-05, + "loss": 0.4567, + "step": 191590 + }, + { + "epoch": 1.6938064675825244, + "grad_norm": 2.7776973247528076, + "learning_rate": 2.1769892206957926e-05, + "loss": 0.5015, + "step": 191600 + }, + { + "epoch": 1.6938948708428367, + "grad_norm": 1.8666770458221436, + "learning_rate": 2.1768418819286058e-05, + "loss": 0.6053, + "step": 191610 + }, + { + "epoch": 1.6939832741031489, + "grad_norm": 1.6332348585128784, + "learning_rate": 2.1766945431614186e-05, + "loss": 0.6221, + "step": 191620 + }, + { + "epoch": 1.6940716773634612, + "grad_norm": 3.93522572517395, + "learning_rate": 2.1765472043942315e-05, + "loss": 0.7227, + "step": 191630 + }, + { + "epoch": 1.6941600806237735, + "grad_norm": 6.13995885848999, + "learning_rate": 2.1763998656270446e-05, + "loss": 0.5609, + "step": 191640 + }, + { + "epoch": 1.6942484838840857, + "grad_norm": 3.6988348960876465, + "learning_rate": 2.1762525268598575e-05, + "loss": 0.6668, + "step": 191650 + }, + { + "epoch": 1.6943368871443978, + "grad_norm": 16.96199607849121, + "learning_rate": 2.1761051880926703e-05, + "loss": 0.5778, + "step": 191660 + }, + { + "epoch": 1.6944252904047101, + "grad_norm": 2.165583848953247, + "learning_rate": 2.175957849325483e-05, + "loss": 0.5572, + "step": 191670 + }, + { + "epoch": 1.6945136936650225, + "grad_norm": 1.700186848640442, + "learning_rate": 2.1758105105582963e-05, + "loss": 0.6116, + "step": 191680 + }, + { + "epoch": 1.6946020969253346, + "grad_norm": 5.387173175811768, + "learning_rate": 2.175663171791109e-05, + "loss": 0.5563, + "step": 191690 + }, + { + "epoch": 1.6946905001856467, + "grad_norm": 4.737339019775391, + "learning_rate": 2.175515833023922e-05, + "loss": 0.5988, + "step": 191700 + }, + { + "epoch": 1.694778903445959, + "grad_norm": 6.355472087860107, + "learning_rate": 2.1753684942567348e-05, + "loss": 0.5269, + "step": 191710 + }, + { + "epoch": 1.6948673067062714, + "grad_norm": 8.328197479248047, + "learning_rate": 2.175221155489548e-05, + "loss": 0.605, + "step": 191720 + }, + { + "epoch": 1.6949557099665835, + "grad_norm": 4.873742580413818, + "learning_rate": 2.1750738167223608e-05, + "loss": 0.5307, + "step": 191730 + }, + { + "epoch": 1.6950441132268959, + "grad_norm": 11.200762748718262, + "learning_rate": 2.1749264779551737e-05, + "loss": 0.516, + "step": 191740 + }, + { + "epoch": 1.6951325164872082, + "grad_norm": 2.6554524898529053, + "learning_rate": 2.174779139187987e-05, + "loss": 0.6569, + "step": 191750 + }, + { + "epoch": 1.6952209197475203, + "grad_norm": 2.401576280593872, + "learning_rate": 2.1746318004207997e-05, + "loss": 0.6212, + "step": 191760 + }, + { + "epoch": 1.6953093230078324, + "grad_norm": 2.919806480407715, + "learning_rate": 2.1744844616536125e-05, + "loss": 0.5842, + "step": 191770 + }, + { + "epoch": 1.6953977262681448, + "grad_norm": 1.6942613124847412, + "learning_rate": 2.1743371228864253e-05, + "loss": 0.6789, + "step": 191780 + }, + { + "epoch": 1.6954861295284571, + "grad_norm": 1.4825233221054077, + "learning_rate": 2.1741897841192385e-05, + "loss": 0.5009, + "step": 191790 + }, + { + "epoch": 1.6955745327887692, + "grad_norm": 6.986950397491455, + "learning_rate": 2.1740424453520514e-05, + "loss": 0.4956, + "step": 191800 + }, + { + "epoch": 1.6956629360490814, + "grad_norm": 0.8059583306312561, + "learning_rate": 2.1738951065848642e-05, + "loss": 0.429, + "step": 191810 + }, + { + "epoch": 1.6957513393093937, + "grad_norm": 0.841052234172821, + "learning_rate": 2.173747767817677e-05, + "loss": 0.4548, + "step": 191820 + }, + { + "epoch": 1.695839742569706, + "grad_norm": 1.0270919799804688, + "learning_rate": 2.1736004290504902e-05, + "loss": 0.5977, + "step": 191830 + }, + { + "epoch": 1.6959281458300182, + "grad_norm": 3.210310220718384, + "learning_rate": 2.173453090283303e-05, + "loss": 0.654, + "step": 191840 + }, + { + "epoch": 1.6960165490903305, + "grad_norm": 1.1407127380371094, + "learning_rate": 2.173305751516116e-05, + "loss": 0.6051, + "step": 191850 + }, + { + "epoch": 1.6961049523506428, + "grad_norm": 6.326188087463379, + "learning_rate": 2.173158412748929e-05, + "loss": 0.6217, + "step": 191860 + }, + { + "epoch": 1.696193355610955, + "grad_norm": 2.7178454399108887, + "learning_rate": 2.173011073981742e-05, + "loss": 0.4836, + "step": 191870 + }, + { + "epoch": 1.696281758871267, + "grad_norm": 0.8690058588981628, + "learning_rate": 2.1728637352145547e-05, + "loss": 0.5596, + "step": 191880 + }, + { + "epoch": 1.6963701621315794, + "grad_norm": 3.9567224979400635, + "learning_rate": 2.1727163964473675e-05, + "loss": 0.5937, + "step": 191890 + }, + { + "epoch": 1.6964585653918918, + "grad_norm": 2.895535707473755, + "learning_rate": 2.1725690576801807e-05, + "loss": 0.6673, + "step": 191900 + }, + { + "epoch": 1.696546968652204, + "grad_norm": 1.5205992460250854, + "learning_rate": 2.1724217189129936e-05, + "loss": 0.6333, + "step": 191910 + }, + { + "epoch": 1.696635371912516, + "grad_norm": 1.8309000730514526, + "learning_rate": 2.1722743801458064e-05, + "loss": 0.6837, + "step": 191920 + }, + { + "epoch": 1.6967237751728284, + "grad_norm": 10.216381072998047, + "learning_rate": 2.1721270413786196e-05, + "loss": 0.5494, + "step": 191930 + }, + { + "epoch": 1.6968121784331407, + "grad_norm": 7.729549407958984, + "learning_rate": 2.1719797026114324e-05, + "loss": 0.5908, + "step": 191940 + }, + { + "epoch": 1.6969005816934528, + "grad_norm": 8.199275970458984, + "learning_rate": 2.1718323638442452e-05, + "loss": 0.5744, + "step": 191950 + }, + { + "epoch": 1.696988984953765, + "grad_norm": 1.3720570802688599, + "learning_rate": 2.171685025077058e-05, + "loss": 0.5484, + "step": 191960 + }, + { + "epoch": 1.6970773882140775, + "grad_norm": 2.224489212036133, + "learning_rate": 2.1715376863098712e-05, + "loss": 0.5152, + "step": 191970 + }, + { + "epoch": 1.6971657914743896, + "grad_norm": 1.7162714004516602, + "learning_rate": 2.171390347542684e-05, + "loss": 0.6841, + "step": 191980 + }, + { + "epoch": 1.6972541947347017, + "grad_norm": 2.6422953605651855, + "learning_rate": 2.171243008775497e-05, + "loss": 0.5728, + "step": 191990 + }, + { + "epoch": 1.697342597995014, + "grad_norm": 7.1870198249816895, + "learning_rate": 2.1710956700083098e-05, + "loss": 0.6655, + "step": 192000 + }, + { + "epoch": 1.6974310012553264, + "grad_norm": 1.5172876119613647, + "learning_rate": 2.170948331241123e-05, + "loss": 0.5648, + "step": 192010 + }, + { + "epoch": 1.6975194045156385, + "grad_norm": 1.8165618181228638, + "learning_rate": 2.1708009924739358e-05, + "loss": 0.6453, + "step": 192020 + }, + { + "epoch": 1.6976078077759507, + "grad_norm": 5.7464470863342285, + "learning_rate": 2.1706536537067486e-05, + "loss": 0.5617, + "step": 192030 + }, + { + "epoch": 1.697696211036263, + "grad_norm": 2.307619571685791, + "learning_rate": 2.1705063149395618e-05, + "loss": 0.4716, + "step": 192040 + }, + { + "epoch": 1.6977846142965753, + "grad_norm": 7.959567070007324, + "learning_rate": 2.1703589761723746e-05, + "loss": 0.5985, + "step": 192050 + }, + { + "epoch": 1.6978730175568875, + "grad_norm": 3.922034978866577, + "learning_rate": 2.1702116374051874e-05, + "loss": 0.6511, + "step": 192060 + }, + { + "epoch": 1.6979614208171996, + "grad_norm": 4.7937726974487305, + "learning_rate": 2.1700642986380003e-05, + "loss": 0.7351, + "step": 192070 + }, + { + "epoch": 1.698049824077512, + "grad_norm": 3.758579969406128, + "learning_rate": 2.1699169598708135e-05, + "loss": 0.6756, + "step": 192080 + }, + { + "epoch": 1.6981382273378243, + "grad_norm": 1.4163635969161987, + "learning_rate": 2.1697696211036263e-05, + "loss": 0.6984, + "step": 192090 + }, + { + "epoch": 1.6982266305981364, + "grad_norm": 6.909902572631836, + "learning_rate": 2.169622282336439e-05, + "loss": 0.6313, + "step": 192100 + }, + { + "epoch": 1.6983150338584487, + "grad_norm": 2.223724126815796, + "learning_rate": 2.1694749435692523e-05, + "loss": 0.5948, + "step": 192110 + }, + { + "epoch": 1.698403437118761, + "grad_norm": 2.031708240509033, + "learning_rate": 2.169327604802065e-05, + "loss": 0.5754, + "step": 192120 + }, + { + "epoch": 1.6984918403790732, + "grad_norm": 2.6645381450653076, + "learning_rate": 2.169180266034878e-05, + "loss": 0.5845, + "step": 192130 + }, + { + "epoch": 1.6985802436393853, + "grad_norm": 1.2635366916656494, + "learning_rate": 2.1690329272676908e-05, + "loss": 0.4683, + "step": 192140 + }, + { + "epoch": 1.6986686468996977, + "grad_norm": 1.452958583831787, + "learning_rate": 2.168885588500504e-05, + "loss": 0.5489, + "step": 192150 + }, + { + "epoch": 1.69875705016001, + "grad_norm": 2.634744167327881, + "learning_rate": 2.1687382497333168e-05, + "loss": 0.6094, + "step": 192160 + }, + { + "epoch": 1.6988454534203221, + "grad_norm": 1.571746587753296, + "learning_rate": 2.1685909109661296e-05, + "loss": 0.7726, + "step": 192170 + }, + { + "epoch": 1.6989338566806342, + "grad_norm": 1.0951273441314697, + "learning_rate": 2.1684435721989428e-05, + "loss": 0.5322, + "step": 192180 + }, + { + "epoch": 1.6990222599409466, + "grad_norm": 3.656393527984619, + "learning_rate": 2.1682962334317557e-05, + "loss": 0.5525, + "step": 192190 + }, + { + "epoch": 1.699110663201259, + "grad_norm": 7.627120494842529, + "learning_rate": 2.168148894664569e-05, + "loss": 0.5073, + "step": 192200 + }, + { + "epoch": 1.699199066461571, + "grad_norm": 8.105610847473145, + "learning_rate": 2.1680015558973817e-05, + "loss": 0.4916, + "step": 192210 + }, + { + "epoch": 1.6992874697218834, + "grad_norm": 11.996027946472168, + "learning_rate": 2.1678542171301945e-05, + "loss": 0.5124, + "step": 192220 + }, + { + "epoch": 1.6993758729821957, + "grad_norm": 4.262763023376465, + "learning_rate": 2.1677068783630077e-05, + "loss": 0.5752, + "step": 192230 + }, + { + "epoch": 1.6994642762425078, + "grad_norm": 2.279161214828491, + "learning_rate": 2.1675595395958205e-05, + "loss": 0.6714, + "step": 192240 + }, + { + "epoch": 1.69955267950282, + "grad_norm": 2.2455124855041504, + "learning_rate": 2.1674122008286333e-05, + "loss": 0.5061, + "step": 192250 + }, + { + "epoch": 1.6996410827631323, + "grad_norm": 3.2316677570343018, + "learning_rate": 2.1672648620614465e-05, + "loss": 0.4982, + "step": 192260 + }, + { + "epoch": 1.6997294860234446, + "grad_norm": 1.562880039215088, + "learning_rate": 2.1671175232942594e-05, + "loss": 0.7062, + "step": 192270 + }, + { + "epoch": 1.6998178892837568, + "grad_norm": 13.20670223236084, + "learning_rate": 2.1669701845270722e-05, + "loss": 0.5343, + "step": 192280 + }, + { + "epoch": 1.6999062925440689, + "grad_norm": 1.3940116167068481, + "learning_rate": 2.166822845759885e-05, + "loss": 0.574, + "step": 192290 + }, + { + "epoch": 1.6999946958043812, + "grad_norm": 2.756795644760132, + "learning_rate": 2.1666755069926982e-05, + "loss": 0.6782, + "step": 192300 + }, + { + "epoch": 1.7000830990646936, + "grad_norm": 2.0956616401672363, + "learning_rate": 2.166528168225511e-05, + "loss": 0.7429, + "step": 192310 + }, + { + "epoch": 1.7001715023250057, + "grad_norm": 4.234799385070801, + "learning_rate": 2.166380829458324e-05, + "loss": 0.6413, + "step": 192320 + }, + { + "epoch": 1.700259905585318, + "grad_norm": 3.032193183898926, + "learning_rate": 2.166233490691137e-05, + "loss": 0.6566, + "step": 192330 + }, + { + "epoch": 1.7003483088456304, + "grad_norm": 1.0524382591247559, + "learning_rate": 2.16608615192395e-05, + "loss": 0.5907, + "step": 192340 + }, + { + "epoch": 1.7004367121059425, + "grad_norm": 1.5335265398025513, + "learning_rate": 2.1659388131567627e-05, + "loss": 0.487, + "step": 192350 + }, + { + "epoch": 1.7005251153662546, + "grad_norm": 1.9411295652389526, + "learning_rate": 2.1657914743895756e-05, + "loss": 0.5349, + "step": 192360 + }, + { + "epoch": 1.700613518626567, + "grad_norm": 3.3382508754730225, + "learning_rate": 2.1656441356223887e-05, + "loss": 0.6519, + "step": 192370 + }, + { + "epoch": 1.7007019218868793, + "grad_norm": 2.6799464225769043, + "learning_rate": 2.1654967968552016e-05, + "loss": 0.5604, + "step": 192380 + }, + { + "epoch": 1.7007903251471914, + "grad_norm": 5.3011322021484375, + "learning_rate": 2.1653494580880144e-05, + "loss": 0.8506, + "step": 192390 + }, + { + "epoch": 1.7008787284075035, + "grad_norm": 7.716976165771484, + "learning_rate": 2.1652021193208276e-05, + "loss": 0.5934, + "step": 192400 + }, + { + "epoch": 1.7009671316678159, + "grad_norm": 1.4680860042572021, + "learning_rate": 2.1650547805536404e-05, + "loss": 0.6041, + "step": 192410 + }, + { + "epoch": 1.7010555349281282, + "grad_norm": 9.352996826171875, + "learning_rate": 2.1649074417864532e-05, + "loss": 0.6027, + "step": 192420 + }, + { + "epoch": 1.7011439381884403, + "grad_norm": 2.039008378982544, + "learning_rate": 2.164760103019266e-05, + "loss": 0.6861, + "step": 192430 + }, + { + "epoch": 1.7012323414487527, + "grad_norm": 3.9179604053497314, + "learning_rate": 2.1646127642520793e-05, + "loss": 0.7188, + "step": 192440 + }, + { + "epoch": 1.701320744709065, + "grad_norm": 5.1725029945373535, + "learning_rate": 2.164465425484892e-05, + "loss": 0.5822, + "step": 192450 + }, + { + "epoch": 1.7014091479693771, + "grad_norm": 2.0126705169677734, + "learning_rate": 2.164318086717705e-05, + "loss": 0.5184, + "step": 192460 + }, + { + "epoch": 1.7014975512296893, + "grad_norm": 2.5465290546417236, + "learning_rate": 2.1641707479505178e-05, + "loss": 0.5563, + "step": 192470 + }, + { + "epoch": 1.7015859544900016, + "grad_norm": 2.2157347202301025, + "learning_rate": 2.164023409183331e-05, + "loss": 0.6306, + "step": 192480 + }, + { + "epoch": 1.701674357750314, + "grad_norm": 1.3446446657180786, + "learning_rate": 2.1638760704161438e-05, + "loss": 0.6451, + "step": 192490 + }, + { + "epoch": 1.701762761010626, + "grad_norm": 3.229628086090088, + "learning_rate": 2.1637287316489566e-05, + "loss": 0.7066, + "step": 192500 + }, + { + "epoch": 1.7018511642709382, + "grad_norm": 1.7282774448394775, + "learning_rate": 2.1635813928817698e-05, + "loss": 0.5993, + "step": 192510 + }, + { + "epoch": 1.7019395675312505, + "grad_norm": 4.4794535636901855, + "learning_rate": 2.1634340541145826e-05, + "loss": 0.6174, + "step": 192520 + }, + { + "epoch": 1.7020279707915629, + "grad_norm": 2.4877657890319824, + "learning_rate": 2.1632867153473955e-05, + "loss": 0.5814, + "step": 192530 + }, + { + "epoch": 1.702116374051875, + "grad_norm": 2.519657850265503, + "learning_rate": 2.1631393765802083e-05, + "loss": 0.5948, + "step": 192540 + }, + { + "epoch": 1.7022047773121871, + "grad_norm": 1.9545468091964722, + "learning_rate": 2.1629920378130215e-05, + "loss": 0.6662, + "step": 192550 + }, + { + "epoch": 1.7022931805724997, + "grad_norm": 1.0247230529785156, + "learning_rate": 2.1628446990458343e-05, + "loss": 0.5043, + "step": 192560 + }, + { + "epoch": 1.7023815838328118, + "grad_norm": 1.5919438600540161, + "learning_rate": 2.162697360278647e-05, + "loss": 0.5179, + "step": 192570 + }, + { + "epoch": 1.702469987093124, + "grad_norm": 1.2989373207092285, + "learning_rate": 2.1625500215114603e-05, + "loss": 0.6397, + "step": 192580 + }, + { + "epoch": 1.7025583903534363, + "grad_norm": 1.4600801467895508, + "learning_rate": 2.162402682744273e-05, + "loss": 0.581, + "step": 192590 + }, + { + "epoch": 1.7026467936137486, + "grad_norm": 9.107261657714844, + "learning_rate": 2.162255343977086e-05, + "loss": 0.6276, + "step": 192600 + }, + { + "epoch": 1.7027351968740607, + "grad_norm": 1.6058019399642944, + "learning_rate": 2.1621080052098988e-05, + "loss": 0.6251, + "step": 192610 + }, + { + "epoch": 1.7028236001343728, + "grad_norm": 1.7118215560913086, + "learning_rate": 2.161960666442712e-05, + "loss": 0.5064, + "step": 192620 + }, + { + "epoch": 1.7029120033946852, + "grad_norm": 16.251148223876953, + "learning_rate": 2.1618133276755248e-05, + "loss": 0.6524, + "step": 192630 + }, + { + "epoch": 1.7030004066549975, + "grad_norm": 3.134155511856079, + "learning_rate": 2.1616659889083377e-05, + "loss": 0.5798, + "step": 192640 + }, + { + "epoch": 1.7030888099153096, + "grad_norm": 2.934706449508667, + "learning_rate": 2.1615186501411505e-05, + "loss": 0.6212, + "step": 192650 + }, + { + "epoch": 1.7031772131756218, + "grad_norm": 1.3024001121520996, + "learning_rate": 2.1613713113739637e-05, + "loss": 0.5679, + "step": 192660 + }, + { + "epoch": 1.703265616435934, + "grad_norm": 4.700631618499756, + "learning_rate": 2.1612239726067765e-05, + "loss": 0.5954, + "step": 192670 + }, + { + "epoch": 1.7033540196962464, + "grad_norm": 2.8760063648223877, + "learning_rate": 2.1610766338395893e-05, + "loss": 0.6942, + "step": 192680 + }, + { + "epoch": 1.7034424229565586, + "grad_norm": 1.6820541620254517, + "learning_rate": 2.1609292950724025e-05, + "loss": 0.782, + "step": 192690 + }, + { + "epoch": 1.703530826216871, + "grad_norm": 1.9463380575180054, + "learning_rate": 2.1607819563052153e-05, + "loss": 0.6404, + "step": 192700 + }, + { + "epoch": 1.7036192294771833, + "grad_norm": 1.1862205266952515, + "learning_rate": 2.1606346175380282e-05, + "loss": 0.5103, + "step": 192710 + }, + { + "epoch": 1.7037076327374954, + "grad_norm": 4.426752090454102, + "learning_rate": 2.160487278770841e-05, + "loss": 0.5889, + "step": 192720 + }, + { + "epoch": 1.7037960359978075, + "grad_norm": 0.9702103734016418, + "learning_rate": 2.1603399400036542e-05, + "loss": 0.607, + "step": 192730 + }, + { + "epoch": 1.7038844392581198, + "grad_norm": 22.196788787841797, + "learning_rate": 2.160192601236467e-05, + "loss": 0.8177, + "step": 192740 + }, + { + "epoch": 1.7039728425184322, + "grad_norm": 9.930220603942871, + "learning_rate": 2.16004526246928e-05, + "loss": 0.5192, + "step": 192750 + }, + { + "epoch": 1.7040612457787443, + "grad_norm": 2.2557296752929688, + "learning_rate": 2.159897923702093e-05, + "loss": 0.6364, + "step": 192760 + }, + { + "epoch": 1.7041496490390564, + "grad_norm": 3.7410945892333984, + "learning_rate": 2.159750584934906e-05, + "loss": 0.6099, + "step": 192770 + }, + { + "epoch": 1.7042380522993688, + "grad_norm": 2.8992271423339844, + "learning_rate": 2.1596032461677187e-05, + "loss": 0.5525, + "step": 192780 + }, + { + "epoch": 1.704326455559681, + "grad_norm": 2.923013210296631, + "learning_rate": 2.1594559074005315e-05, + "loss": 0.6137, + "step": 192790 + }, + { + "epoch": 1.7044148588199932, + "grad_norm": 7.2937726974487305, + "learning_rate": 2.1593085686333447e-05, + "loss": 0.7636, + "step": 192800 + }, + { + "epoch": 1.7045032620803056, + "grad_norm": 1.2805955410003662, + "learning_rate": 2.1591612298661576e-05, + "loss": 0.6087, + "step": 192810 + }, + { + "epoch": 1.704591665340618, + "grad_norm": 1.9098401069641113, + "learning_rate": 2.1590138910989704e-05, + "loss": 0.5969, + "step": 192820 + }, + { + "epoch": 1.70468006860093, + "grad_norm": 1.8037567138671875, + "learning_rate": 2.1588665523317832e-05, + "loss": 0.6318, + "step": 192830 + }, + { + "epoch": 1.7047684718612421, + "grad_norm": 3.3513031005859375, + "learning_rate": 2.1587192135645964e-05, + "loss": 0.5824, + "step": 192840 + }, + { + "epoch": 1.7048568751215545, + "grad_norm": 2.465480327606201, + "learning_rate": 2.1585718747974092e-05, + "loss": 0.606, + "step": 192850 + }, + { + "epoch": 1.7049452783818668, + "grad_norm": 7.786761283874512, + "learning_rate": 2.158424536030222e-05, + "loss": 0.5148, + "step": 192860 + }, + { + "epoch": 1.705033681642179, + "grad_norm": 1.2157894372940063, + "learning_rate": 2.1582771972630352e-05, + "loss": 0.5365, + "step": 192870 + }, + { + "epoch": 1.705122084902491, + "grad_norm": 10.837203025817871, + "learning_rate": 2.158129858495848e-05, + "loss": 0.5598, + "step": 192880 + }, + { + "epoch": 1.7052104881628034, + "grad_norm": 8.102943420410156, + "learning_rate": 2.157982519728661e-05, + "loss": 0.6252, + "step": 192890 + }, + { + "epoch": 1.7052988914231157, + "grad_norm": 1.599469542503357, + "learning_rate": 2.1578351809614737e-05, + "loss": 0.5073, + "step": 192900 + }, + { + "epoch": 1.7053872946834279, + "grad_norm": 1.0777111053466797, + "learning_rate": 2.157687842194287e-05, + "loss": 0.6193, + "step": 192910 + }, + { + "epoch": 1.7054756979437402, + "grad_norm": 1.2110395431518555, + "learning_rate": 2.1575405034270998e-05, + "loss": 0.6007, + "step": 192920 + }, + { + "epoch": 1.7055641012040526, + "grad_norm": 2.502366065979004, + "learning_rate": 2.1573931646599126e-05, + "loss": 0.5691, + "step": 192930 + }, + { + "epoch": 1.7056525044643647, + "grad_norm": 5.271129131317139, + "learning_rate": 2.1572458258927254e-05, + "loss": 0.5525, + "step": 192940 + }, + { + "epoch": 1.7057409077246768, + "grad_norm": 1.9179210662841797, + "learning_rate": 2.1570984871255386e-05, + "loss": 0.5809, + "step": 192950 + }, + { + "epoch": 1.7058293109849891, + "grad_norm": 2.8024744987487793, + "learning_rate": 2.1569511483583514e-05, + "loss": 0.6545, + "step": 192960 + }, + { + "epoch": 1.7059177142453015, + "grad_norm": 1.679518699645996, + "learning_rate": 2.1568038095911643e-05, + "loss": 0.6099, + "step": 192970 + }, + { + "epoch": 1.7060061175056136, + "grad_norm": 2.3610336780548096, + "learning_rate": 2.1566564708239774e-05, + "loss": 0.6501, + "step": 192980 + }, + { + "epoch": 1.7060945207659257, + "grad_norm": 2.0149097442626953, + "learning_rate": 2.1565091320567903e-05, + "loss": 0.6572, + "step": 192990 + }, + { + "epoch": 1.706182924026238, + "grad_norm": 1.6732184886932373, + "learning_rate": 2.156361793289603e-05, + "loss": 0.5489, + "step": 193000 + }, + { + "epoch": 1.7062713272865504, + "grad_norm": 8.428173065185547, + "learning_rate": 2.156214454522416e-05, + "loss": 0.5129, + "step": 193010 + }, + { + "epoch": 1.7063597305468625, + "grad_norm": 13.745948791503906, + "learning_rate": 2.156067115755229e-05, + "loss": 0.5867, + "step": 193020 + }, + { + "epoch": 1.7064481338071749, + "grad_norm": 4.421304225921631, + "learning_rate": 2.155919776988042e-05, + "loss": 0.4689, + "step": 193030 + }, + { + "epoch": 1.7065365370674872, + "grad_norm": 1.6220214366912842, + "learning_rate": 2.1557724382208548e-05, + "loss": 0.6558, + "step": 193040 + }, + { + "epoch": 1.7066249403277993, + "grad_norm": 1.6485824584960938, + "learning_rate": 2.155625099453668e-05, + "loss": 0.5118, + "step": 193050 + }, + { + "epoch": 1.7067133435881114, + "grad_norm": 2.4856295585632324, + "learning_rate": 2.1554777606864808e-05, + "loss": 0.7231, + "step": 193060 + }, + { + "epoch": 1.7068017468484238, + "grad_norm": 1.8700010776519775, + "learning_rate": 2.1553304219192936e-05, + "loss": 0.5572, + "step": 193070 + }, + { + "epoch": 1.7068901501087361, + "grad_norm": 2.4165444374084473, + "learning_rate": 2.1551830831521068e-05, + "loss": 0.6112, + "step": 193080 + }, + { + "epoch": 1.7069785533690482, + "grad_norm": 1.8147107362747192, + "learning_rate": 2.1550357443849197e-05, + "loss": 0.6198, + "step": 193090 + }, + { + "epoch": 1.7070669566293604, + "grad_norm": 1.855204463005066, + "learning_rate": 2.1548884056177325e-05, + "loss": 0.5507, + "step": 193100 + }, + { + "epoch": 1.7071553598896727, + "grad_norm": 4.025112628936768, + "learning_rate": 2.1547410668505457e-05, + "loss": 0.5764, + "step": 193110 + }, + { + "epoch": 1.707243763149985, + "grad_norm": 21.12237548828125, + "learning_rate": 2.1545937280833585e-05, + "loss": 0.7226, + "step": 193120 + }, + { + "epoch": 1.7073321664102972, + "grad_norm": 1.2159252166748047, + "learning_rate": 2.1544463893161713e-05, + "loss": 0.4942, + "step": 193130 + }, + { + "epoch": 1.7074205696706093, + "grad_norm": 2.432981491088867, + "learning_rate": 2.1542990505489845e-05, + "loss": 0.5872, + "step": 193140 + }, + { + "epoch": 1.7075089729309219, + "grad_norm": 1.2240477800369263, + "learning_rate": 2.1541517117817973e-05, + "loss": 0.516, + "step": 193150 + }, + { + "epoch": 1.707597376191234, + "grad_norm": 8.193679809570312, + "learning_rate": 2.1540043730146102e-05, + "loss": 0.6525, + "step": 193160 + }, + { + "epoch": 1.707685779451546, + "grad_norm": 1.132754921913147, + "learning_rate": 2.1538570342474234e-05, + "loss": 0.4889, + "step": 193170 + }, + { + "epoch": 1.7077741827118584, + "grad_norm": 2.784849166870117, + "learning_rate": 2.1537096954802362e-05, + "loss": 0.4858, + "step": 193180 + }, + { + "epoch": 1.7078625859721708, + "grad_norm": 3.904958486557007, + "learning_rate": 2.153562356713049e-05, + "loss": 0.5982, + "step": 193190 + }, + { + "epoch": 1.707950989232483, + "grad_norm": 0.7412169575691223, + "learning_rate": 2.1534150179458622e-05, + "loss": 0.5165, + "step": 193200 + }, + { + "epoch": 1.708039392492795, + "grad_norm": 1.7279950380325317, + "learning_rate": 2.153267679178675e-05, + "loss": 0.5379, + "step": 193210 + }, + { + "epoch": 1.7081277957531074, + "grad_norm": 1.994883418083191, + "learning_rate": 2.153120340411488e-05, + "loss": 0.7022, + "step": 193220 + }, + { + "epoch": 1.7082161990134197, + "grad_norm": 10.715351104736328, + "learning_rate": 2.152973001644301e-05, + "loss": 0.5129, + "step": 193230 + }, + { + "epoch": 1.7083046022737318, + "grad_norm": 2.887328624725342, + "learning_rate": 2.152825662877114e-05, + "loss": 0.555, + "step": 193240 + }, + { + "epoch": 1.708393005534044, + "grad_norm": 2.088953733444214, + "learning_rate": 2.1526783241099267e-05, + "loss": 0.5347, + "step": 193250 + }, + { + "epoch": 1.7084814087943563, + "grad_norm": 1.7319414615631104, + "learning_rate": 2.1525309853427395e-05, + "loss": 0.4997, + "step": 193260 + }, + { + "epoch": 1.7085698120546686, + "grad_norm": 2.9857470989227295, + "learning_rate": 2.1523836465755527e-05, + "loss": 0.7082, + "step": 193270 + }, + { + "epoch": 1.7086582153149807, + "grad_norm": 2.421107053756714, + "learning_rate": 2.1522363078083656e-05, + "loss": 0.6149, + "step": 193280 + }, + { + "epoch": 1.708746618575293, + "grad_norm": 4.203795433044434, + "learning_rate": 2.1520889690411784e-05, + "loss": 0.422, + "step": 193290 + }, + { + "epoch": 1.7088350218356054, + "grad_norm": 2.44531512260437, + "learning_rate": 2.1519416302739912e-05, + "loss": 0.6336, + "step": 193300 + }, + { + "epoch": 1.7089234250959175, + "grad_norm": 0.8148242235183716, + "learning_rate": 2.1517942915068044e-05, + "loss": 0.5725, + "step": 193310 + }, + { + "epoch": 1.7090118283562297, + "grad_norm": 2.558438777923584, + "learning_rate": 2.1516469527396172e-05, + "loss": 0.7034, + "step": 193320 + }, + { + "epoch": 1.709100231616542, + "grad_norm": 1.6167006492614746, + "learning_rate": 2.15149961397243e-05, + "loss": 0.6927, + "step": 193330 + }, + { + "epoch": 1.7091886348768544, + "grad_norm": 3.192253589630127, + "learning_rate": 2.1513522752052432e-05, + "loss": 0.4504, + "step": 193340 + }, + { + "epoch": 1.7092770381371665, + "grad_norm": 2.2594075202941895, + "learning_rate": 2.151204936438056e-05, + "loss": 0.6758, + "step": 193350 + }, + { + "epoch": 1.7093654413974786, + "grad_norm": 1.3042347431182861, + "learning_rate": 2.151057597670869e-05, + "loss": 0.7211, + "step": 193360 + }, + { + "epoch": 1.709453844657791, + "grad_norm": 8.780888557434082, + "learning_rate": 2.1509102589036818e-05, + "loss": 0.6545, + "step": 193370 + }, + { + "epoch": 1.7095422479181033, + "grad_norm": 2.702665090560913, + "learning_rate": 2.150762920136495e-05, + "loss": 0.7298, + "step": 193380 + }, + { + "epoch": 1.7096306511784154, + "grad_norm": 5.596799850463867, + "learning_rate": 2.1506155813693078e-05, + "loss": 0.5567, + "step": 193390 + }, + { + "epoch": 1.7097190544387277, + "grad_norm": 5.592110633850098, + "learning_rate": 2.1504682426021206e-05, + "loss": 0.5826, + "step": 193400 + }, + { + "epoch": 1.70980745769904, + "grad_norm": 6.443530559539795, + "learning_rate": 2.1503209038349334e-05, + "loss": 0.602, + "step": 193410 + }, + { + "epoch": 1.7098958609593522, + "grad_norm": 2.6245791912078857, + "learning_rate": 2.1501735650677466e-05, + "loss": 0.5969, + "step": 193420 + }, + { + "epoch": 1.7099842642196643, + "grad_norm": 1.3029557466506958, + "learning_rate": 2.1500262263005594e-05, + "loss": 0.6485, + "step": 193430 + }, + { + "epoch": 1.7100726674799767, + "grad_norm": 4.779838562011719, + "learning_rate": 2.1498788875333723e-05, + "loss": 0.6585, + "step": 193440 + }, + { + "epoch": 1.710161070740289, + "grad_norm": 3.536142349243164, + "learning_rate": 2.1497315487661855e-05, + "loss": 0.6322, + "step": 193450 + }, + { + "epoch": 1.7102494740006011, + "grad_norm": 2.38405704498291, + "learning_rate": 2.1495842099989983e-05, + "loss": 0.5583, + "step": 193460 + }, + { + "epoch": 1.7103378772609132, + "grad_norm": 2.292497396469116, + "learning_rate": 2.149436871231811e-05, + "loss": 0.5467, + "step": 193470 + }, + { + "epoch": 1.7104262805212256, + "grad_norm": 3.9541614055633545, + "learning_rate": 2.149289532464624e-05, + "loss": 0.7388, + "step": 193480 + }, + { + "epoch": 1.710514683781538, + "grad_norm": 12.266168594360352, + "learning_rate": 2.149142193697437e-05, + "loss": 0.5763, + "step": 193490 + }, + { + "epoch": 1.71060308704185, + "grad_norm": 1.5108264684677124, + "learning_rate": 2.14899485493025e-05, + "loss": 0.6221, + "step": 193500 + }, + { + "epoch": 1.7106914903021624, + "grad_norm": 2.0176055431365967, + "learning_rate": 2.1488475161630628e-05, + "loss": 0.6865, + "step": 193510 + }, + { + "epoch": 1.7107798935624747, + "grad_norm": 5.445321083068848, + "learning_rate": 2.148700177395876e-05, + "loss": 0.4985, + "step": 193520 + }, + { + "epoch": 1.7108682968227868, + "grad_norm": 1.921905755996704, + "learning_rate": 2.1485528386286888e-05, + "loss": 0.7662, + "step": 193530 + }, + { + "epoch": 1.710956700083099, + "grad_norm": 3.3987882137298584, + "learning_rate": 2.1484054998615017e-05, + "loss": 0.5557, + "step": 193540 + }, + { + "epoch": 1.7110451033434113, + "grad_norm": 2.8329806327819824, + "learning_rate": 2.1482581610943145e-05, + "loss": 0.5807, + "step": 193550 + }, + { + "epoch": 1.7111335066037237, + "grad_norm": 2.9834933280944824, + "learning_rate": 2.1481108223271277e-05, + "loss": 0.6431, + "step": 193560 + }, + { + "epoch": 1.7112219098640358, + "grad_norm": 3.6295862197875977, + "learning_rate": 2.1479634835599405e-05, + "loss": 0.7418, + "step": 193570 + }, + { + "epoch": 1.711310313124348, + "grad_norm": 4.6869025230407715, + "learning_rate": 2.1478161447927533e-05, + "loss": 0.5437, + "step": 193580 + }, + { + "epoch": 1.7113987163846602, + "grad_norm": 2.265542507171631, + "learning_rate": 2.147668806025566e-05, + "loss": 0.5905, + "step": 193590 + }, + { + "epoch": 1.7114871196449726, + "grad_norm": 5.112244129180908, + "learning_rate": 2.1475214672583793e-05, + "loss": 0.5551, + "step": 193600 + }, + { + "epoch": 1.7115755229052847, + "grad_norm": 3.033071756362915, + "learning_rate": 2.1473741284911922e-05, + "loss": 0.6883, + "step": 193610 + }, + { + "epoch": 1.711663926165597, + "grad_norm": 1.3255032300949097, + "learning_rate": 2.147226789724005e-05, + "loss": 0.7637, + "step": 193620 + }, + { + "epoch": 1.7117523294259094, + "grad_norm": 1.6674469709396362, + "learning_rate": 2.1470794509568182e-05, + "loss": 0.6135, + "step": 193630 + }, + { + "epoch": 1.7118407326862215, + "grad_norm": 1.3110415935516357, + "learning_rate": 2.146932112189631e-05, + "loss": 0.602, + "step": 193640 + }, + { + "epoch": 1.7119291359465336, + "grad_norm": 2.6006574630737305, + "learning_rate": 2.146784773422444e-05, + "loss": 0.6407, + "step": 193650 + }, + { + "epoch": 1.712017539206846, + "grad_norm": 1.43293297290802, + "learning_rate": 2.1466374346552567e-05, + "loss": 0.5702, + "step": 193660 + }, + { + "epoch": 1.7121059424671583, + "grad_norm": 11.81348991394043, + "learning_rate": 2.14649009588807e-05, + "loss": 0.6085, + "step": 193670 + }, + { + "epoch": 1.7121943457274704, + "grad_norm": 1.8563233613967896, + "learning_rate": 2.1463427571208827e-05, + "loss": 0.5778, + "step": 193680 + }, + { + "epoch": 1.7122827489877825, + "grad_norm": 1.7744039297103882, + "learning_rate": 2.1461954183536955e-05, + "loss": 0.4811, + "step": 193690 + }, + { + "epoch": 1.7123711522480949, + "grad_norm": 1.2777608633041382, + "learning_rate": 2.1460480795865087e-05, + "loss": 0.5048, + "step": 193700 + }, + { + "epoch": 1.7124595555084072, + "grad_norm": 2.096623659133911, + "learning_rate": 2.1459007408193215e-05, + "loss": 0.6469, + "step": 193710 + }, + { + "epoch": 1.7125479587687193, + "grad_norm": 2.577944755554199, + "learning_rate": 2.1457534020521344e-05, + "loss": 0.6443, + "step": 193720 + }, + { + "epoch": 1.7126363620290315, + "grad_norm": 7.665952205657959, + "learning_rate": 2.1456060632849472e-05, + "loss": 0.6474, + "step": 193730 + }, + { + "epoch": 1.712724765289344, + "grad_norm": 2.8538498878479004, + "learning_rate": 2.1454587245177604e-05, + "loss": 0.6347, + "step": 193740 + }, + { + "epoch": 1.7128131685496562, + "grad_norm": 1.5761281251907349, + "learning_rate": 2.1453113857505732e-05, + "loss": 0.4585, + "step": 193750 + }, + { + "epoch": 1.7129015718099683, + "grad_norm": 9.394773483276367, + "learning_rate": 2.145164046983386e-05, + "loss": 0.5066, + "step": 193760 + }, + { + "epoch": 1.7129899750702806, + "grad_norm": 1.5645577907562256, + "learning_rate": 2.145016708216199e-05, + "loss": 0.5753, + "step": 193770 + }, + { + "epoch": 1.713078378330593, + "grad_norm": 4.080010414123535, + "learning_rate": 2.144869369449012e-05, + "loss": 0.5519, + "step": 193780 + }, + { + "epoch": 1.713166781590905, + "grad_norm": 6.62410306930542, + "learning_rate": 2.144722030681825e-05, + "loss": 0.6189, + "step": 193790 + }, + { + "epoch": 1.7132551848512172, + "grad_norm": 9.186639785766602, + "learning_rate": 2.1445746919146377e-05, + "loss": 0.5834, + "step": 193800 + }, + { + "epoch": 1.7133435881115295, + "grad_norm": 8.02290153503418, + "learning_rate": 2.144427353147451e-05, + "loss": 0.4295, + "step": 193810 + }, + { + "epoch": 1.7134319913718419, + "grad_norm": 2.2746243476867676, + "learning_rate": 2.1442800143802638e-05, + "loss": 0.5328, + "step": 193820 + }, + { + "epoch": 1.713520394632154, + "grad_norm": 1.81694757938385, + "learning_rate": 2.1441326756130766e-05, + "loss": 0.6152, + "step": 193830 + }, + { + "epoch": 1.7136087978924661, + "grad_norm": 10.199980735778809, + "learning_rate": 2.1439853368458894e-05, + "loss": 0.5066, + "step": 193840 + }, + { + "epoch": 1.7136972011527785, + "grad_norm": 1.051020622253418, + "learning_rate": 2.1438379980787026e-05, + "loss": 0.6798, + "step": 193850 + }, + { + "epoch": 1.7137856044130908, + "grad_norm": 7.369716644287109, + "learning_rate": 2.1436906593115154e-05, + "loss": 0.611, + "step": 193860 + }, + { + "epoch": 1.713874007673403, + "grad_norm": 3.89119553565979, + "learning_rate": 2.1435433205443283e-05, + "loss": 0.4936, + "step": 193870 + }, + { + "epoch": 1.7139624109337153, + "grad_norm": 2.684659004211426, + "learning_rate": 2.1433959817771414e-05, + "loss": 0.5614, + "step": 193880 + }, + { + "epoch": 1.7140508141940276, + "grad_norm": 1.016061782836914, + "learning_rate": 2.1432486430099543e-05, + "loss": 0.552, + "step": 193890 + }, + { + "epoch": 1.7141392174543397, + "grad_norm": 4.650036334991455, + "learning_rate": 2.143101304242767e-05, + "loss": 0.6651, + "step": 193900 + }, + { + "epoch": 1.7142276207146518, + "grad_norm": 3.340902328491211, + "learning_rate": 2.14295396547558e-05, + "loss": 0.5723, + "step": 193910 + }, + { + "epoch": 1.7143160239749642, + "grad_norm": 2.3471693992614746, + "learning_rate": 2.142806626708393e-05, + "loss": 0.6426, + "step": 193920 + }, + { + "epoch": 1.7144044272352765, + "grad_norm": 2.278501033782959, + "learning_rate": 2.142659287941206e-05, + "loss": 0.5446, + "step": 193930 + }, + { + "epoch": 1.7144928304955886, + "grad_norm": 0.9082626700401306, + "learning_rate": 2.1425119491740188e-05, + "loss": 0.5715, + "step": 193940 + }, + { + "epoch": 1.7145812337559008, + "grad_norm": 2.4361538887023926, + "learning_rate": 2.1423646104068316e-05, + "loss": 0.4883, + "step": 193950 + }, + { + "epoch": 1.714669637016213, + "grad_norm": 2.2103195190429688, + "learning_rate": 2.1422172716396448e-05, + "loss": 0.7856, + "step": 193960 + }, + { + "epoch": 1.7147580402765255, + "grad_norm": 1.3517742156982422, + "learning_rate": 2.1420699328724576e-05, + "loss": 0.6893, + "step": 193970 + }, + { + "epoch": 1.7148464435368376, + "grad_norm": 1.9715032577514648, + "learning_rate": 2.1419225941052705e-05, + "loss": 0.6235, + "step": 193980 + }, + { + "epoch": 1.71493484679715, + "grad_norm": 3.0068297386169434, + "learning_rate": 2.1417752553380836e-05, + "loss": 0.6845, + "step": 193990 + }, + { + "epoch": 1.7150232500574623, + "grad_norm": 4.430449962615967, + "learning_rate": 2.1416279165708965e-05, + "loss": 0.4986, + "step": 194000 + }, + { + "epoch": 1.7151116533177744, + "grad_norm": 1.213348150253296, + "learning_rate": 2.1414805778037093e-05, + "loss": 0.741, + "step": 194010 + }, + { + "epoch": 1.7152000565780865, + "grad_norm": 1.1817817687988281, + "learning_rate": 2.1413332390365225e-05, + "loss": 0.4849, + "step": 194020 + }, + { + "epoch": 1.7152884598383988, + "grad_norm": 8.518665313720703, + "learning_rate": 2.1411859002693353e-05, + "loss": 0.5765, + "step": 194030 + }, + { + "epoch": 1.7153768630987112, + "grad_norm": 7.211263656616211, + "learning_rate": 2.141038561502148e-05, + "loss": 0.6467, + "step": 194040 + }, + { + "epoch": 1.7154652663590233, + "grad_norm": 5.086142539978027, + "learning_rate": 2.1408912227349613e-05, + "loss": 0.5901, + "step": 194050 + }, + { + "epoch": 1.7155536696193354, + "grad_norm": 1.3066221475601196, + "learning_rate": 2.1407438839677742e-05, + "loss": 0.5527, + "step": 194060 + }, + { + "epoch": 1.7156420728796478, + "grad_norm": 5.550687313079834, + "learning_rate": 2.140596545200587e-05, + "loss": 0.6424, + "step": 194070 + }, + { + "epoch": 1.71573047613996, + "grad_norm": 2.44869327545166, + "learning_rate": 2.1404492064334002e-05, + "loss": 0.535, + "step": 194080 + }, + { + "epoch": 1.7158188794002722, + "grad_norm": 1.9060211181640625, + "learning_rate": 2.140301867666213e-05, + "loss": 0.6116, + "step": 194090 + }, + { + "epoch": 1.7159072826605846, + "grad_norm": 4.8658952713012695, + "learning_rate": 2.140154528899026e-05, + "loss": 0.6883, + "step": 194100 + }, + { + "epoch": 1.715995685920897, + "grad_norm": 1.91483473777771, + "learning_rate": 2.140007190131839e-05, + "loss": 0.6174, + "step": 194110 + }, + { + "epoch": 1.716084089181209, + "grad_norm": 6.037035942077637, + "learning_rate": 2.139859851364652e-05, + "loss": 0.6025, + "step": 194120 + }, + { + "epoch": 1.7161724924415211, + "grad_norm": 7.2498779296875, + "learning_rate": 2.1397125125974647e-05, + "loss": 0.6472, + "step": 194130 + }, + { + "epoch": 1.7162608957018335, + "grad_norm": 7.297693252563477, + "learning_rate": 2.139565173830278e-05, + "loss": 0.5235, + "step": 194140 + }, + { + "epoch": 1.7163492989621458, + "grad_norm": 4.959930896759033, + "learning_rate": 2.1394178350630907e-05, + "loss": 0.6185, + "step": 194150 + }, + { + "epoch": 1.716437702222458, + "grad_norm": 1.1903307437896729, + "learning_rate": 2.1392704962959035e-05, + "loss": 0.5407, + "step": 194160 + }, + { + "epoch": 1.71652610548277, + "grad_norm": 2.4141032695770264, + "learning_rate": 2.1391231575287167e-05, + "loss": 0.7523, + "step": 194170 + }, + { + "epoch": 1.7166145087430824, + "grad_norm": 3.3654158115386963, + "learning_rate": 2.1389758187615296e-05, + "loss": 0.7131, + "step": 194180 + }, + { + "epoch": 1.7167029120033948, + "grad_norm": 5.278143882751465, + "learning_rate": 2.1388284799943424e-05, + "loss": 0.5624, + "step": 194190 + }, + { + "epoch": 1.7167913152637069, + "grad_norm": 1.1827096939086914, + "learning_rate": 2.1386811412271552e-05, + "loss": 0.7165, + "step": 194200 + }, + { + "epoch": 1.7168797185240192, + "grad_norm": 1.5173828601837158, + "learning_rate": 2.1385338024599684e-05, + "loss": 0.6331, + "step": 194210 + }, + { + "epoch": 1.7169681217843316, + "grad_norm": 1.3472874164581299, + "learning_rate": 2.1383864636927812e-05, + "loss": 0.5395, + "step": 194220 + }, + { + "epoch": 1.7170565250446437, + "grad_norm": 1.7874771356582642, + "learning_rate": 2.138239124925594e-05, + "loss": 0.656, + "step": 194230 + }, + { + "epoch": 1.7171449283049558, + "grad_norm": 2.1612870693206787, + "learning_rate": 2.138091786158407e-05, + "loss": 0.6968, + "step": 194240 + }, + { + "epoch": 1.7172333315652681, + "grad_norm": 2.093449354171753, + "learning_rate": 2.13794444739122e-05, + "loss": 0.5092, + "step": 194250 + }, + { + "epoch": 1.7173217348255805, + "grad_norm": 1.7155430316925049, + "learning_rate": 2.137797108624033e-05, + "loss": 0.5043, + "step": 194260 + }, + { + "epoch": 1.7174101380858926, + "grad_norm": 0.9649681448936462, + "learning_rate": 2.1376497698568458e-05, + "loss": 0.4129, + "step": 194270 + }, + { + "epoch": 1.7174985413462047, + "grad_norm": 1.9646459817886353, + "learning_rate": 2.137502431089659e-05, + "loss": 0.5339, + "step": 194280 + }, + { + "epoch": 1.717586944606517, + "grad_norm": 3.8654212951660156, + "learning_rate": 2.1373550923224718e-05, + "loss": 0.4797, + "step": 194290 + }, + { + "epoch": 1.7176753478668294, + "grad_norm": 1.408607840538025, + "learning_rate": 2.1372077535552846e-05, + "loss": 0.6064, + "step": 194300 + }, + { + "epoch": 1.7177637511271415, + "grad_norm": 13.348353385925293, + "learning_rate": 2.1370604147880974e-05, + "loss": 0.6391, + "step": 194310 + }, + { + "epoch": 1.7178521543874536, + "grad_norm": 2.575873851776123, + "learning_rate": 2.1369130760209106e-05, + "loss": 0.4392, + "step": 194320 + }, + { + "epoch": 1.7179405576477662, + "grad_norm": 3.7794172763824463, + "learning_rate": 2.1367657372537234e-05, + "loss": 0.5417, + "step": 194330 + }, + { + "epoch": 1.7180289609080783, + "grad_norm": 2.6534876823425293, + "learning_rate": 2.1366183984865363e-05, + "loss": 0.6409, + "step": 194340 + }, + { + "epoch": 1.7181173641683904, + "grad_norm": 3.612494707107544, + "learning_rate": 2.1364710597193494e-05, + "loss": 0.4685, + "step": 194350 + }, + { + "epoch": 1.7182057674287028, + "grad_norm": 6.548025608062744, + "learning_rate": 2.1363237209521623e-05, + "loss": 0.676, + "step": 194360 + }, + { + "epoch": 1.7182941706890151, + "grad_norm": 5.572389125823975, + "learning_rate": 2.136176382184975e-05, + "loss": 0.5471, + "step": 194370 + }, + { + "epoch": 1.7183825739493273, + "grad_norm": 14.386319160461426, + "learning_rate": 2.136029043417788e-05, + "loss": 0.5738, + "step": 194380 + }, + { + "epoch": 1.7184709772096394, + "grad_norm": 1.8911259174346924, + "learning_rate": 2.135881704650601e-05, + "loss": 0.6273, + "step": 194390 + }, + { + "epoch": 1.7185593804699517, + "grad_norm": 2.1299264430999756, + "learning_rate": 2.135734365883414e-05, + "loss": 0.5588, + "step": 194400 + }, + { + "epoch": 1.718647783730264, + "grad_norm": 1.7233922481536865, + "learning_rate": 2.1355870271162268e-05, + "loss": 0.6437, + "step": 194410 + }, + { + "epoch": 1.7187361869905762, + "grad_norm": 2.2364232540130615, + "learning_rate": 2.1354396883490396e-05, + "loss": 0.586, + "step": 194420 + }, + { + "epoch": 1.7188245902508883, + "grad_norm": 1.1255460977554321, + "learning_rate": 2.1352923495818528e-05, + "loss": 0.5092, + "step": 194430 + }, + { + "epoch": 1.7189129935112009, + "grad_norm": 1.157018780708313, + "learning_rate": 2.1351450108146656e-05, + "loss": 0.5651, + "step": 194440 + }, + { + "epoch": 1.719001396771513, + "grad_norm": 5.953611850738525, + "learning_rate": 2.1349976720474785e-05, + "loss": 0.729, + "step": 194450 + }, + { + "epoch": 1.719089800031825, + "grad_norm": 2.7032785415649414, + "learning_rate": 2.1348503332802917e-05, + "loss": 0.5417, + "step": 194460 + }, + { + "epoch": 1.7191782032921374, + "grad_norm": 2.5386767387390137, + "learning_rate": 2.1347029945131045e-05, + "loss": 0.5638, + "step": 194470 + }, + { + "epoch": 1.7192666065524498, + "grad_norm": 2.9284324645996094, + "learning_rate": 2.1345556557459173e-05, + "loss": 0.5439, + "step": 194480 + }, + { + "epoch": 1.719355009812762, + "grad_norm": 3.7722442150115967, + "learning_rate": 2.13440831697873e-05, + "loss": 0.604, + "step": 194490 + }, + { + "epoch": 1.719443413073074, + "grad_norm": 1.712037444114685, + "learning_rate": 2.1342609782115433e-05, + "loss": 0.5204, + "step": 194500 + }, + { + "epoch": 1.7195318163333864, + "grad_norm": 2.1617226600646973, + "learning_rate": 2.1341136394443562e-05, + "loss": 0.5705, + "step": 194510 + }, + { + "epoch": 1.7196202195936987, + "grad_norm": 3.638930320739746, + "learning_rate": 2.133966300677169e-05, + "loss": 0.5451, + "step": 194520 + }, + { + "epoch": 1.7197086228540108, + "grad_norm": 5.074330806732178, + "learning_rate": 2.133818961909982e-05, + "loss": 0.5068, + "step": 194530 + }, + { + "epoch": 1.719797026114323, + "grad_norm": 2.370124340057373, + "learning_rate": 2.133671623142795e-05, + "loss": 0.4832, + "step": 194540 + }, + { + "epoch": 1.7198854293746353, + "grad_norm": 7.430358409881592, + "learning_rate": 2.133524284375608e-05, + "loss": 0.5092, + "step": 194550 + }, + { + "epoch": 1.7199738326349476, + "grad_norm": 2.030440330505371, + "learning_rate": 2.1333769456084207e-05, + "loss": 0.5819, + "step": 194560 + }, + { + "epoch": 1.7200622358952598, + "grad_norm": 5.374444961547852, + "learning_rate": 2.133229606841234e-05, + "loss": 0.6293, + "step": 194570 + }, + { + "epoch": 1.720150639155572, + "grad_norm": 1.5253618955612183, + "learning_rate": 2.1330822680740467e-05, + "loss": 0.6682, + "step": 194580 + }, + { + "epoch": 1.7202390424158844, + "grad_norm": 3.1249170303344727, + "learning_rate": 2.1329349293068595e-05, + "loss": 0.5649, + "step": 194590 + }, + { + "epoch": 1.7203274456761966, + "grad_norm": 3.138165235519409, + "learning_rate": 2.1327875905396724e-05, + "loss": 0.5689, + "step": 194600 + }, + { + "epoch": 1.7204158489365087, + "grad_norm": 2.1322529315948486, + "learning_rate": 2.1326402517724855e-05, + "loss": 0.7504, + "step": 194610 + }, + { + "epoch": 1.720504252196821, + "grad_norm": 2.534820079803467, + "learning_rate": 2.1324929130052984e-05, + "loss": 0.6296, + "step": 194620 + }, + { + "epoch": 1.7205926554571334, + "grad_norm": 1.5047125816345215, + "learning_rate": 2.1323455742381112e-05, + "loss": 0.4259, + "step": 194630 + }, + { + "epoch": 1.7206810587174455, + "grad_norm": 1.553313136100769, + "learning_rate": 2.1321982354709244e-05, + "loss": 0.5223, + "step": 194640 + }, + { + "epoch": 1.7207694619777576, + "grad_norm": 7.213515281677246, + "learning_rate": 2.1320508967037372e-05, + "loss": 0.577, + "step": 194650 + }, + { + "epoch": 1.72085786523807, + "grad_norm": 9.564608573913574, + "learning_rate": 2.13190355793655e-05, + "loss": 0.5102, + "step": 194660 + }, + { + "epoch": 1.7209462684983823, + "grad_norm": 7.004220485687256, + "learning_rate": 2.131756219169363e-05, + "loss": 0.6399, + "step": 194670 + }, + { + "epoch": 1.7210346717586944, + "grad_norm": 4.490629196166992, + "learning_rate": 2.131608880402176e-05, + "loss": 0.5615, + "step": 194680 + }, + { + "epoch": 1.7211230750190067, + "grad_norm": 2.593158006668091, + "learning_rate": 2.131461541634989e-05, + "loss": 0.5914, + "step": 194690 + }, + { + "epoch": 1.721211478279319, + "grad_norm": 1.4362741708755493, + "learning_rate": 2.1313142028678017e-05, + "loss": 0.5115, + "step": 194700 + }, + { + "epoch": 1.7212998815396312, + "grad_norm": 2.6229753494262695, + "learning_rate": 2.1311668641006146e-05, + "loss": 0.6462, + "step": 194710 + }, + { + "epoch": 1.7213882847999433, + "grad_norm": 2.605205535888672, + "learning_rate": 2.1310195253334277e-05, + "loss": 0.6794, + "step": 194720 + }, + { + "epoch": 1.7214766880602557, + "grad_norm": 9.741536140441895, + "learning_rate": 2.1308721865662406e-05, + "loss": 0.629, + "step": 194730 + }, + { + "epoch": 1.721565091320568, + "grad_norm": 3.437342882156372, + "learning_rate": 2.1307248477990534e-05, + "loss": 0.55, + "step": 194740 + }, + { + "epoch": 1.7216534945808801, + "grad_norm": 10.987372398376465, + "learning_rate": 2.1305775090318666e-05, + "loss": 0.5065, + "step": 194750 + }, + { + "epoch": 1.7217418978411922, + "grad_norm": 1.3480275869369507, + "learning_rate": 2.1304301702646794e-05, + "loss": 0.5772, + "step": 194760 + }, + { + "epoch": 1.7218303011015046, + "grad_norm": 4.841872692108154, + "learning_rate": 2.1302828314974923e-05, + "loss": 0.6665, + "step": 194770 + }, + { + "epoch": 1.721918704361817, + "grad_norm": 2.1356678009033203, + "learning_rate": 2.130135492730305e-05, + "loss": 0.672, + "step": 194780 + }, + { + "epoch": 1.722007107622129, + "grad_norm": 1.9766780138015747, + "learning_rate": 2.1299881539631183e-05, + "loss": 0.6122, + "step": 194790 + }, + { + "epoch": 1.7220955108824414, + "grad_norm": 2.7788548469543457, + "learning_rate": 2.129840815195931e-05, + "loss": 0.5378, + "step": 194800 + }, + { + "epoch": 1.7221839141427537, + "grad_norm": 2.4216575622558594, + "learning_rate": 2.129693476428744e-05, + "loss": 0.694, + "step": 194810 + }, + { + "epoch": 1.7222723174030659, + "grad_norm": 1.8145804405212402, + "learning_rate": 2.129546137661557e-05, + "loss": 0.619, + "step": 194820 + }, + { + "epoch": 1.722360720663378, + "grad_norm": 3.8577568531036377, + "learning_rate": 2.12939879889437e-05, + "loss": 0.6129, + "step": 194830 + }, + { + "epoch": 1.7224491239236903, + "grad_norm": 9.400105476379395, + "learning_rate": 2.1292514601271828e-05, + "loss": 0.6411, + "step": 194840 + }, + { + "epoch": 1.7225375271840027, + "grad_norm": 8.117342948913574, + "learning_rate": 2.1291041213599956e-05, + "loss": 0.5218, + "step": 194850 + }, + { + "epoch": 1.7226259304443148, + "grad_norm": 2.0788090229034424, + "learning_rate": 2.1289567825928088e-05, + "loss": 0.5991, + "step": 194860 + }, + { + "epoch": 1.722714333704627, + "grad_norm": 2.073340654373169, + "learning_rate": 2.1288094438256216e-05, + "loss": 0.5795, + "step": 194870 + }, + { + "epoch": 1.7228027369649392, + "grad_norm": 12.70473861694336, + "learning_rate": 2.1286621050584345e-05, + "loss": 0.5457, + "step": 194880 + }, + { + "epoch": 1.7228911402252516, + "grad_norm": 1.126299500465393, + "learning_rate": 2.1285147662912473e-05, + "loss": 0.5223, + "step": 194890 + }, + { + "epoch": 1.7229795434855637, + "grad_norm": 2.4783525466918945, + "learning_rate": 2.1283674275240605e-05, + "loss": 0.5983, + "step": 194900 + }, + { + "epoch": 1.7230679467458758, + "grad_norm": 1.0184361934661865, + "learning_rate": 2.1282200887568733e-05, + "loss": 0.6414, + "step": 194910 + }, + { + "epoch": 1.7231563500061884, + "grad_norm": 1.2930160760879517, + "learning_rate": 2.128072749989686e-05, + "loss": 0.5334, + "step": 194920 + }, + { + "epoch": 1.7232447532665005, + "grad_norm": 1.700271487236023, + "learning_rate": 2.1279254112224993e-05, + "loss": 0.6356, + "step": 194930 + }, + { + "epoch": 1.7233331565268126, + "grad_norm": 2.276177167892456, + "learning_rate": 2.127778072455312e-05, + "loss": 0.5209, + "step": 194940 + }, + { + "epoch": 1.723421559787125, + "grad_norm": 5.536886215209961, + "learning_rate": 2.127630733688125e-05, + "loss": 0.5808, + "step": 194950 + }, + { + "epoch": 1.7235099630474373, + "grad_norm": 5.371890068054199, + "learning_rate": 2.127483394920938e-05, + "loss": 0.6612, + "step": 194960 + }, + { + "epoch": 1.7235983663077494, + "grad_norm": 4.695826053619385, + "learning_rate": 2.127336056153751e-05, + "loss": 0.6523, + "step": 194970 + }, + { + "epoch": 1.7236867695680615, + "grad_norm": 5.159978866577148, + "learning_rate": 2.127188717386564e-05, + "loss": 0.5542, + "step": 194980 + }, + { + "epoch": 1.723775172828374, + "grad_norm": 4.265750408172607, + "learning_rate": 2.127041378619377e-05, + "loss": 0.5552, + "step": 194990 + }, + { + "epoch": 1.7238635760886862, + "grad_norm": 3.749016284942627, + "learning_rate": 2.12689403985219e-05, + "loss": 0.514, + "step": 195000 + }, + { + "epoch": 1.7239519793489984, + "grad_norm": 4.934724807739258, + "learning_rate": 2.1267467010850027e-05, + "loss": 0.6437, + "step": 195010 + }, + { + "epoch": 1.7240403826093105, + "grad_norm": 1.63783860206604, + "learning_rate": 2.126599362317816e-05, + "loss": 0.5651, + "step": 195020 + }, + { + "epoch": 1.724128785869623, + "grad_norm": 2.97623348236084, + "learning_rate": 2.1264520235506287e-05, + "loss": 0.6143, + "step": 195030 + }, + { + "epoch": 1.7242171891299352, + "grad_norm": 6.014468669891357, + "learning_rate": 2.1263046847834415e-05, + "loss": 0.542, + "step": 195040 + }, + { + "epoch": 1.7243055923902473, + "grad_norm": 3.3367233276367188, + "learning_rate": 2.1261573460162547e-05, + "loss": 0.7033, + "step": 195050 + }, + { + "epoch": 1.7243939956505596, + "grad_norm": 2.835598945617676, + "learning_rate": 2.1260100072490675e-05, + "loss": 0.5744, + "step": 195060 + }, + { + "epoch": 1.724482398910872, + "grad_norm": 5.4065680503845215, + "learning_rate": 2.1258626684818804e-05, + "loss": 0.5361, + "step": 195070 + }, + { + "epoch": 1.724570802171184, + "grad_norm": 6.708798408508301, + "learning_rate": 2.1257153297146935e-05, + "loss": 0.5408, + "step": 195080 + }, + { + "epoch": 1.7246592054314962, + "grad_norm": 2.620743751525879, + "learning_rate": 2.1255679909475064e-05, + "loss": 0.5281, + "step": 195090 + }, + { + "epoch": 1.7247476086918085, + "grad_norm": 3.543836832046509, + "learning_rate": 2.1254206521803192e-05, + "loss": 0.6236, + "step": 195100 + }, + { + "epoch": 1.7248360119521209, + "grad_norm": 0.964089035987854, + "learning_rate": 2.1252733134131324e-05, + "loss": 0.5193, + "step": 195110 + }, + { + "epoch": 1.724924415212433, + "grad_norm": 2.3067169189453125, + "learning_rate": 2.1251259746459452e-05, + "loss": 0.5311, + "step": 195120 + }, + { + "epoch": 1.7250128184727451, + "grad_norm": 9.584707260131836, + "learning_rate": 2.124978635878758e-05, + "loss": 0.5483, + "step": 195130 + }, + { + "epoch": 1.7251012217330575, + "grad_norm": 6.576661586761475, + "learning_rate": 2.124831297111571e-05, + "loss": 0.5107, + "step": 195140 + }, + { + "epoch": 1.7251896249933698, + "grad_norm": 2.2746684551239014, + "learning_rate": 2.124683958344384e-05, + "loss": 0.5469, + "step": 195150 + }, + { + "epoch": 1.725278028253682, + "grad_norm": 3.8254854679107666, + "learning_rate": 2.124536619577197e-05, + "loss": 0.4967, + "step": 195160 + }, + { + "epoch": 1.7253664315139943, + "grad_norm": 4.170722961425781, + "learning_rate": 2.1243892808100097e-05, + "loss": 0.6071, + "step": 195170 + }, + { + "epoch": 1.7254548347743066, + "grad_norm": 6.850671768188477, + "learning_rate": 2.1242419420428226e-05, + "loss": 0.8257, + "step": 195180 + }, + { + "epoch": 1.7255432380346187, + "grad_norm": 1.8608275651931763, + "learning_rate": 2.1240946032756358e-05, + "loss": 0.6186, + "step": 195190 + }, + { + "epoch": 1.7256316412949309, + "grad_norm": 2.613795757293701, + "learning_rate": 2.1239472645084486e-05, + "loss": 0.6071, + "step": 195200 + }, + { + "epoch": 1.7257200445552432, + "grad_norm": 3.4038357734680176, + "learning_rate": 2.1237999257412614e-05, + "loss": 0.6723, + "step": 195210 + }, + { + "epoch": 1.7258084478155555, + "grad_norm": 4.461121082305908, + "learning_rate": 2.1236525869740746e-05, + "loss": 0.6479, + "step": 195220 + }, + { + "epoch": 1.7258968510758677, + "grad_norm": 1.9412622451782227, + "learning_rate": 2.1235052482068874e-05, + "loss": 0.4966, + "step": 195230 + }, + { + "epoch": 1.7259852543361798, + "grad_norm": 1.07846999168396, + "learning_rate": 2.1233579094397003e-05, + "loss": 0.5164, + "step": 195240 + }, + { + "epoch": 1.7260736575964921, + "grad_norm": 2.48948073387146, + "learning_rate": 2.123210570672513e-05, + "loss": 0.5181, + "step": 195250 + }, + { + "epoch": 1.7261620608568045, + "grad_norm": 2.4837334156036377, + "learning_rate": 2.1230632319053263e-05, + "loss": 0.6371, + "step": 195260 + }, + { + "epoch": 1.7262504641171166, + "grad_norm": 1.6007318496704102, + "learning_rate": 2.122915893138139e-05, + "loss": 0.5292, + "step": 195270 + }, + { + "epoch": 1.726338867377429, + "grad_norm": 1.8158955574035645, + "learning_rate": 2.122768554370952e-05, + "loss": 0.6204, + "step": 195280 + }, + { + "epoch": 1.7264272706377413, + "grad_norm": 1.3074404001235962, + "learning_rate": 2.122621215603765e-05, + "loss": 0.4778, + "step": 195290 + }, + { + "epoch": 1.7265156738980534, + "grad_norm": 5.726093292236328, + "learning_rate": 2.122473876836578e-05, + "loss": 0.5694, + "step": 195300 + }, + { + "epoch": 1.7266040771583655, + "grad_norm": 4.886521816253662, + "learning_rate": 2.1223265380693908e-05, + "loss": 0.4615, + "step": 195310 + }, + { + "epoch": 1.7266924804186778, + "grad_norm": 2.211855411529541, + "learning_rate": 2.1221791993022036e-05, + "loss": 0.6406, + "step": 195320 + }, + { + "epoch": 1.7267808836789902, + "grad_norm": 0.9785746932029724, + "learning_rate": 2.1220318605350168e-05, + "loss": 0.5325, + "step": 195330 + }, + { + "epoch": 1.7268692869393023, + "grad_norm": 1.7503172159194946, + "learning_rate": 2.1218845217678296e-05, + "loss": 0.4948, + "step": 195340 + }, + { + "epoch": 1.7269576901996144, + "grad_norm": 18.286426544189453, + "learning_rate": 2.1217371830006425e-05, + "loss": 0.7423, + "step": 195350 + }, + { + "epoch": 1.7270460934599268, + "grad_norm": 1.8167378902435303, + "learning_rate": 2.1215898442334553e-05, + "loss": 0.471, + "step": 195360 + }, + { + "epoch": 1.727134496720239, + "grad_norm": 7.867949962615967, + "learning_rate": 2.1214425054662685e-05, + "loss": 0.6855, + "step": 195370 + }, + { + "epoch": 1.7272228999805512, + "grad_norm": 8.186702728271484, + "learning_rate": 2.1212951666990813e-05, + "loss": 0.615, + "step": 195380 + }, + { + "epoch": 1.7273113032408636, + "grad_norm": 9.494102478027344, + "learning_rate": 2.121147827931894e-05, + "loss": 0.5212, + "step": 195390 + }, + { + "epoch": 1.727399706501176, + "grad_norm": 2.0706982612609863, + "learning_rate": 2.1210004891647073e-05, + "loss": 0.5912, + "step": 195400 + }, + { + "epoch": 1.727488109761488, + "grad_norm": 4.010214328765869, + "learning_rate": 2.12085315039752e-05, + "loss": 0.6671, + "step": 195410 + }, + { + "epoch": 1.7275765130218002, + "grad_norm": 2.9555420875549316, + "learning_rate": 2.120705811630333e-05, + "loss": 0.6518, + "step": 195420 + }, + { + "epoch": 1.7276649162821125, + "grad_norm": 6.859008312225342, + "learning_rate": 2.120558472863146e-05, + "loss": 0.6816, + "step": 195430 + }, + { + "epoch": 1.7277533195424248, + "grad_norm": 1.4763985872268677, + "learning_rate": 2.120411134095959e-05, + "loss": 0.5363, + "step": 195440 + }, + { + "epoch": 1.727841722802737, + "grad_norm": 3.245969295501709, + "learning_rate": 2.120263795328772e-05, + "loss": 0.5138, + "step": 195450 + }, + { + "epoch": 1.727930126063049, + "grad_norm": 1.5190030336380005, + "learning_rate": 2.1201164565615847e-05, + "loss": 0.6454, + "step": 195460 + }, + { + "epoch": 1.7280185293233614, + "grad_norm": 4.291130065917969, + "learning_rate": 2.119969117794398e-05, + "loss": 0.5923, + "step": 195470 + }, + { + "epoch": 1.7281069325836738, + "grad_norm": 1.679487943649292, + "learning_rate": 2.1198217790272107e-05, + "loss": 0.6685, + "step": 195480 + }, + { + "epoch": 1.7281953358439859, + "grad_norm": 12.517325401306152, + "learning_rate": 2.1196744402600235e-05, + "loss": 0.503, + "step": 195490 + }, + { + "epoch": 1.728283739104298, + "grad_norm": 2.771240234375, + "learning_rate": 2.1195271014928364e-05, + "loss": 0.4449, + "step": 195500 + }, + { + "epoch": 1.7283721423646106, + "grad_norm": 7.492249011993408, + "learning_rate": 2.1193797627256495e-05, + "loss": 0.4944, + "step": 195510 + }, + { + "epoch": 1.7284605456249227, + "grad_norm": 2.683053970336914, + "learning_rate": 2.1192324239584624e-05, + "loss": 0.6942, + "step": 195520 + }, + { + "epoch": 1.7285489488852348, + "grad_norm": 2.708479404449463, + "learning_rate": 2.1190850851912752e-05, + "loss": 0.47, + "step": 195530 + }, + { + "epoch": 1.7286373521455471, + "grad_norm": 2.0322928428649902, + "learning_rate": 2.118937746424088e-05, + "loss": 0.681, + "step": 195540 + }, + { + "epoch": 1.7287257554058595, + "grad_norm": 2.9554851055145264, + "learning_rate": 2.1187904076569012e-05, + "loss": 0.6341, + "step": 195550 + }, + { + "epoch": 1.7288141586661716, + "grad_norm": 3.1124467849731445, + "learning_rate": 2.118643068889714e-05, + "loss": 0.7117, + "step": 195560 + }, + { + "epoch": 1.7289025619264837, + "grad_norm": 15.172920227050781, + "learning_rate": 2.118495730122527e-05, + "loss": 0.7122, + "step": 195570 + }, + { + "epoch": 1.728990965186796, + "grad_norm": 30.813711166381836, + "learning_rate": 2.11834839135534e-05, + "loss": 0.5538, + "step": 195580 + }, + { + "epoch": 1.7290793684471084, + "grad_norm": 0.8557159304618835, + "learning_rate": 2.118201052588153e-05, + "loss": 0.4769, + "step": 195590 + }, + { + "epoch": 1.7291677717074205, + "grad_norm": 3.094269275665283, + "learning_rate": 2.1180537138209657e-05, + "loss": 0.5318, + "step": 195600 + }, + { + "epoch": 1.7292561749677327, + "grad_norm": 2.590012788772583, + "learning_rate": 2.1179063750537786e-05, + "loss": 0.6028, + "step": 195610 + }, + { + "epoch": 1.7293445782280452, + "grad_norm": 2.8997645378112793, + "learning_rate": 2.1177590362865917e-05, + "loss": 0.5047, + "step": 195620 + }, + { + "epoch": 1.7294329814883573, + "grad_norm": 1.8133465051651, + "learning_rate": 2.1176116975194046e-05, + "loss": 0.4875, + "step": 195630 + }, + { + "epoch": 1.7295213847486695, + "grad_norm": 7.082368850708008, + "learning_rate": 2.1174643587522174e-05, + "loss": 0.518, + "step": 195640 + }, + { + "epoch": 1.7296097880089818, + "grad_norm": 1.881662368774414, + "learning_rate": 2.1173170199850302e-05, + "loss": 0.6293, + "step": 195650 + }, + { + "epoch": 1.7296981912692941, + "grad_norm": 1.6363252401351929, + "learning_rate": 2.1171696812178434e-05, + "loss": 0.6062, + "step": 195660 + }, + { + "epoch": 1.7297865945296063, + "grad_norm": 3.21171236038208, + "learning_rate": 2.1170223424506563e-05, + "loss": 0.5207, + "step": 195670 + }, + { + "epoch": 1.7298749977899184, + "grad_norm": 5.394152641296387, + "learning_rate": 2.116875003683469e-05, + "loss": 0.542, + "step": 195680 + }, + { + "epoch": 1.7299634010502307, + "grad_norm": 2.825296640396118, + "learning_rate": 2.1167276649162823e-05, + "loss": 0.5008, + "step": 195690 + }, + { + "epoch": 1.730051804310543, + "grad_norm": 9.6980562210083, + "learning_rate": 2.116580326149095e-05, + "loss": 0.5799, + "step": 195700 + }, + { + "epoch": 1.7301402075708552, + "grad_norm": 1.1438497304916382, + "learning_rate": 2.116432987381908e-05, + "loss": 0.7066, + "step": 195710 + }, + { + "epoch": 1.7302286108311673, + "grad_norm": 2.870929002761841, + "learning_rate": 2.1162856486147208e-05, + "loss": 0.6094, + "step": 195720 + }, + { + "epoch": 1.7303170140914796, + "grad_norm": 8.667716026306152, + "learning_rate": 2.116138309847534e-05, + "loss": 0.4956, + "step": 195730 + }, + { + "epoch": 1.730405417351792, + "grad_norm": 1.7962353229522705, + "learning_rate": 2.1159909710803468e-05, + "loss": 0.5723, + "step": 195740 + }, + { + "epoch": 1.730493820612104, + "grad_norm": 3.418757915496826, + "learning_rate": 2.1158436323131596e-05, + "loss": 0.5362, + "step": 195750 + }, + { + "epoch": 1.7305822238724164, + "grad_norm": 1.270337462425232, + "learning_rate": 2.1156962935459728e-05, + "loss": 0.5453, + "step": 195760 + }, + { + "epoch": 1.7306706271327288, + "grad_norm": 5.31475830078125, + "learning_rate": 2.1155489547787856e-05, + "loss": 0.5389, + "step": 195770 + }, + { + "epoch": 1.730759030393041, + "grad_norm": 2.0658061504364014, + "learning_rate": 2.1154016160115985e-05, + "loss": 0.4715, + "step": 195780 + }, + { + "epoch": 1.730847433653353, + "grad_norm": 1.0598411560058594, + "learning_rate": 2.1152542772444113e-05, + "loss": 0.5568, + "step": 195790 + }, + { + "epoch": 1.7309358369136654, + "grad_norm": 2.3994977474212646, + "learning_rate": 2.1151069384772245e-05, + "loss": 0.5859, + "step": 195800 + }, + { + "epoch": 1.7310242401739777, + "grad_norm": 1.717003345489502, + "learning_rate": 2.1149595997100373e-05, + "loss": 0.6225, + "step": 195810 + }, + { + "epoch": 1.7311126434342898, + "grad_norm": 1.4952044486999512, + "learning_rate": 2.11481226094285e-05, + "loss": 0.5733, + "step": 195820 + }, + { + "epoch": 1.731201046694602, + "grad_norm": 5.4516730308532715, + "learning_rate": 2.114664922175663e-05, + "loss": 0.5928, + "step": 195830 + }, + { + "epoch": 1.7312894499549143, + "grad_norm": 1.2333766222000122, + "learning_rate": 2.114517583408476e-05, + "loss": 0.5295, + "step": 195840 + }, + { + "epoch": 1.7313778532152266, + "grad_norm": 1.6348472833633423, + "learning_rate": 2.114370244641289e-05, + "loss": 0.549, + "step": 195850 + }, + { + "epoch": 1.7314662564755388, + "grad_norm": 1.1696728467941284, + "learning_rate": 2.1142229058741018e-05, + "loss": 0.5897, + "step": 195860 + }, + { + "epoch": 1.731554659735851, + "grad_norm": 1.7708359956741333, + "learning_rate": 2.114075567106915e-05, + "loss": 0.6673, + "step": 195870 + }, + { + "epoch": 1.7316430629961634, + "grad_norm": 12.028105735778809, + "learning_rate": 2.113928228339728e-05, + "loss": 0.6415, + "step": 195880 + }, + { + "epoch": 1.7317314662564756, + "grad_norm": 12.218727111816406, + "learning_rate": 2.1137808895725407e-05, + "loss": 0.5422, + "step": 195890 + }, + { + "epoch": 1.7318198695167877, + "grad_norm": 6.059319496154785, + "learning_rate": 2.113633550805354e-05, + "loss": 0.6705, + "step": 195900 + }, + { + "epoch": 1.7319082727771, + "grad_norm": 5.332113265991211, + "learning_rate": 2.1134862120381667e-05, + "loss": 0.471, + "step": 195910 + }, + { + "epoch": 1.7319966760374124, + "grad_norm": 1.3191802501678467, + "learning_rate": 2.1133388732709795e-05, + "loss": 0.514, + "step": 195920 + }, + { + "epoch": 1.7320850792977245, + "grad_norm": 1.0523746013641357, + "learning_rate": 2.1131915345037927e-05, + "loss": 0.5035, + "step": 195930 + }, + { + "epoch": 1.7321734825580366, + "grad_norm": 6.753158092498779, + "learning_rate": 2.1130441957366055e-05, + "loss": 0.5398, + "step": 195940 + }, + { + "epoch": 1.732261885818349, + "grad_norm": 4.173696517944336, + "learning_rate": 2.1128968569694184e-05, + "loss": 0.6315, + "step": 195950 + }, + { + "epoch": 1.7323502890786613, + "grad_norm": 9.682572364807129, + "learning_rate": 2.1127495182022315e-05, + "loss": 0.5357, + "step": 195960 + }, + { + "epoch": 1.7324386923389734, + "grad_norm": 4.160720348358154, + "learning_rate": 2.1126021794350444e-05, + "loss": 0.5762, + "step": 195970 + }, + { + "epoch": 1.7325270955992857, + "grad_norm": 2.6927499771118164, + "learning_rate": 2.1124548406678572e-05, + "loss": 0.7313, + "step": 195980 + }, + { + "epoch": 1.732615498859598, + "grad_norm": 1.5778392553329468, + "learning_rate": 2.1123075019006704e-05, + "loss": 0.6011, + "step": 195990 + }, + { + "epoch": 1.7327039021199102, + "grad_norm": 1.2303720712661743, + "learning_rate": 2.1121601631334832e-05, + "loss": 0.6368, + "step": 196000 + }, + { + "epoch": 1.7327923053802223, + "grad_norm": 4.567232131958008, + "learning_rate": 2.112012824366296e-05, + "loss": 0.5589, + "step": 196010 + }, + { + "epoch": 1.7328807086405347, + "grad_norm": 1.4383995532989502, + "learning_rate": 2.1118654855991092e-05, + "loss": 0.5802, + "step": 196020 + }, + { + "epoch": 1.732969111900847, + "grad_norm": 1.431085228919983, + "learning_rate": 2.111718146831922e-05, + "loss": 0.6278, + "step": 196030 + }, + { + "epoch": 1.7330575151611591, + "grad_norm": 1.2765882015228271, + "learning_rate": 2.111570808064735e-05, + "loss": 0.6553, + "step": 196040 + }, + { + "epoch": 1.7331459184214713, + "grad_norm": 2.260514974594116, + "learning_rate": 2.111423469297548e-05, + "loss": 0.5194, + "step": 196050 + }, + { + "epoch": 1.7332343216817836, + "grad_norm": 2.344637632369995, + "learning_rate": 2.111276130530361e-05, + "loss": 0.5624, + "step": 196060 + }, + { + "epoch": 1.733322724942096, + "grad_norm": 2.5157487392425537, + "learning_rate": 2.1111287917631737e-05, + "loss": 0.6148, + "step": 196070 + }, + { + "epoch": 1.733411128202408, + "grad_norm": 1.5370826721191406, + "learning_rate": 2.1109814529959866e-05, + "loss": 0.6209, + "step": 196080 + }, + { + "epoch": 1.7334995314627204, + "grad_norm": 3.3523473739624023, + "learning_rate": 2.1108341142287997e-05, + "loss": 0.5228, + "step": 196090 + }, + { + "epoch": 1.7335879347230327, + "grad_norm": 0.8930099606513977, + "learning_rate": 2.1106867754616126e-05, + "loss": 0.5291, + "step": 196100 + }, + { + "epoch": 1.7336763379833449, + "grad_norm": 1.5018413066864014, + "learning_rate": 2.1105394366944254e-05, + "loss": 0.533, + "step": 196110 + }, + { + "epoch": 1.733764741243657, + "grad_norm": 3.6461799144744873, + "learning_rate": 2.1103920979272383e-05, + "loss": 0.531, + "step": 196120 + }, + { + "epoch": 1.7338531445039693, + "grad_norm": 1.418461561203003, + "learning_rate": 2.1102447591600514e-05, + "loss": 0.4567, + "step": 196130 + }, + { + "epoch": 1.7339415477642817, + "grad_norm": 3.9982662200927734, + "learning_rate": 2.1100974203928643e-05, + "loss": 0.5638, + "step": 196140 + }, + { + "epoch": 1.7340299510245938, + "grad_norm": 1.4564203023910522, + "learning_rate": 2.109950081625677e-05, + "loss": 0.6685, + "step": 196150 + }, + { + "epoch": 1.734118354284906, + "grad_norm": 7.852879047393799, + "learning_rate": 2.1098027428584903e-05, + "loss": 0.5937, + "step": 196160 + }, + { + "epoch": 1.7342067575452182, + "grad_norm": 3.3024935722351074, + "learning_rate": 2.109655404091303e-05, + "loss": 0.557, + "step": 196170 + }, + { + "epoch": 1.7342951608055306, + "grad_norm": 6.282588958740234, + "learning_rate": 2.109508065324116e-05, + "loss": 0.602, + "step": 196180 + }, + { + "epoch": 1.7343835640658427, + "grad_norm": 2.669245481491089, + "learning_rate": 2.1093607265569288e-05, + "loss": 0.7253, + "step": 196190 + }, + { + "epoch": 1.7344719673261548, + "grad_norm": 1.9435648918151855, + "learning_rate": 2.109213387789742e-05, + "loss": 0.6093, + "step": 196200 + }, + { + "epoch": 1.7345603705864674, + "grad_norm": 1.8497153520584106, + "learning_rate": 2.1090660490225548e-05, + "loss": 0.611, + "step": 196210 + }, + { + "epoch": 1.7346487738467795, + "grad_norm": 3.0626959800720215, + "learning_rate": 2.1089187102553676e-05, + "loss": 0.5759, + "step": 196220 + }, + { + "epoch": 1.7347371771070916, + "grad_norm": 1.7463003396987915, + "learning_rate": 2.1087713714881808e-05, + "loss": 0.5655, + "step": 196230 + }, + { + "epoch": 1.734825580367404, + "grad_norm": 3.1084718704223633, + "learning_rate": 2.1086240327209936e-05, + "loss": 0.6622, + "step": 196240 + }, + { + "epoch": 1.7349139836277163, + "grad_norm": 5.808989524841309, + "learning_rate": 2.1084766939538065e-05, + "loss": 0.6504, + "step": 196250 + }, + { + "epoch": 1.7350023868880284, + "grad_norm": 1.4292551279067993, + "learning_rate": 2.1083293551866193e-05, + "loss": 0.6286, + "step": 196260 + }, + { + "epoch": 1.7350907901483406, + "grad_norm": 2.5943634510040283, + "learning_rate": 2.1081820164194325e-05, + "loss": 0.4967, + "step": 196270 + }, + { + "epoch": 1.735179193408653, + "grad_norm": 5.778375148773193, + "learning_rate": 2.1080346776522453e-05, + "loss": 0.6326, + "step": 196280 + }, + { + "epoch": 1.7352675966689652, + "grad_norm": 6.108475685119629, + "learning_rate": 2.107887338885058e-05, + "loss": 0.6184, + "step": 196290 + }, + { + "epoch": 1.7353559999292774, + "grad_norm": 2.5158615112304688, + "learning_rate": 2.107740000117871e-05, + "loss": 0.4978, + "step": 196300 + }, + { + "epoch": 1.7354444031895895, + "grad_norm": 1.2883144617080688, + "learning_rate": 2.107592661350684e-05, + "loss": 0.525, + "step": 196310 + }, + { + "epoch": 1.7355328064499018, + "grad_norm": 4.02616548538208, + "learning_rate": 2.107445322583497e-05, + "loss": 0.564, + "step": 196320 + }, + { + "epoch": 1.7356212097102142, + "grad_norm": 11.756324768066406, + "learning_rate": 2.10729798381631e-05, + "loss": 0.6965, + "step": 196330 + }, + { + "epoch": 1.7357096129705263, + "grad_norm": 0.6863888502120972, + "learning_rate": 2.107150645049123e-05, + "loss": 0.6371, + "step": 196340 + }, + { + "epoch": 1.7357980162308386, + "grad_norm": 5.1146721839904785, + "learning_rate": 2.107003306281936e-05, + "loss": 0.6121, + "step": 196350 + }, + { + "epoch": 1.735886419491151, + "grad_norm": 1.0052541494369507, + "learning_rate": 2.1068559675147487e-05, + "loss": 0.5339, + "step": 196360 + }, + { + "epoch": 1.735974822751463, + "grad_norm": 1.4069644212722778, + "learning_rate": 2.1067086287475615e-05, + "loss": 0.4263, + "step": 196370 + }, + { + "epoch": 1.7360632260117752, + "grad_norm": 1.6357277631759644, + "learning_rate": 2.1065612899803747e-05, + "loss": 0.5789, + "step": 196380 + }, + { + "epoch": 1.7361516292720875, + "grad_norm": 1.6332443952560425, + "learning_rate": 2.1064139512131875e-05, + "loss": 0.6643, + "step": 196390 + }, + { + "epoch": 1.7362400325324, + "grad_norm": 8.895979881286621, + "learning_rate": 2.1062666124460004e-05, + "loss": 0.6415, + "step": 196400 + }, + { + "epoch": 1.736328435792712, + "grad_norm": 2.421715497970581, + "learning_rate": 2.1061192736788135e-05, + "loss": 0.6306, + "step": 196410 + }, + { + "epoch": 1.7364168390530241, + "grad_norm": 2.148012161254883, + "learning_rate": 2.1059719349116264e-05, + "loss": 0.528, + "step": 196420 + }, + { + "epoch": 1.7365052423133365, + "grad_norm": 2.4306044578552246, + "learning_rate": 2.1058245961444392e-05, + "loss": 0.6064, + "step": 196430 + }, + { + "epoch": 1.7365936455736488, + "grad_norm": 1.142021656036377, + "learning_rate": 2.105677257377252e-05, + "loss": 0.4912, + "step": 196440 + }, + { + "epoch": 1.736682048833961, + "grad_norm": 2.4800100326538086, + "learning_rate": 2.1055299186100652e-05, + "loss": 0.6269, + "step": 196450 + }, + { + "epoch": 1.7367704520942733, + "grad_norm": 4.973618507385254, + "learning_rate": 2.105382579842878e-05, + "loss": 0.6908, + "step": 196460 + }, + { + "epoch": 1.7368588553545856, + "grad_norm": 2.3804619312286377, + "learning_rate": 2.105235241075691e-05, + "loss": 0.6208, + "step": 196470 + }, + { + "epoch": 1.7369472586148977, + "grad_norm": 3.174193859100342, + "learning_rate": 2.1050879023085037e-05, + "loss": 0.5373, + "step": 196480 + }, + { + "epoch": 1.7370356618752099, + "grad_norm": 1.7836205959320068, + "learning_rate": 2.104940563541317e-05, + "loss": 0.5188, + "step": 196490 + }, + { + "epoch": 1.7371240651355222, + "grad_norm": 2.734497308731079, + "learning_rate": 2.1047932247741297e-05, + "loss": 0.4601, + "step": 196500 + }, + { + "epoch": 1.7372124683958345, + "grad_norm": 7.385402679443359, + "learning_rate": 2.1046458860069426e-05, + "loss": 0.717, + "step": 196510 + }, + { + "epoch": 1.7373008716561467, + "grad_norm": 2.8061513900756836, + "learning_rate": 2.1044985472397557e-05, + "loss": 0.6686, + "step": 196520 + }, + { + "epoch": 1.7373892749164588, + "grad_norm": 1.9530673027038574, + "learning_rate": 2.1043512084725686e-05, + "loss": 0.6468, + "step": 196530 + }, + { + "epoch": 1.7374776781767711, + "grad_norm": 2.8552348613739014, + "learning_rate": 2.1042038697053814e-05, + "loss": 0.6425, + "step": 196540 + }, + { + "epoch": 1.7375660814370835, + "grad_norm": 2.1397485733032227, + "learning_rate": 2.1040565309381942e-05, + "loss": 0.5901, + "step": 196550 + }, + { + "epoch": 1.7376544846973956, + "grad_norm": 1.5400224924087524, + "learning_rate": 2.1039091921710074e-05, + "loss": 0.544, + "step": 196560 + }, + { + "epoch": 1.737742887957708, + "grad_norm": 2.6097347736358643, + "learning_rate": 2.1037618534038203e-05, + "loss": 0.6636, + "step": 196570 + }, + { + "epoch": 1.7378312912180203, + "grad_norm": 1.6473885774612427, + "learning_rate": 2.103614514636633e-05, + "loss": 0.6324, + "step": 196580 + }, + { + "epoch": 1.7379196944783324, + "grad_norm": 8.564188957214355, + "learning_rate": 2.103467175869446e-05, + "loss": 0.586, + "step": 196590 + }, + { + "epoch": 1.7380080977386445, + "grad_norm": 3.2612433433532715, + "learning_rate": 2.103319837102259e-05, + "loss": 0.5477, + "step": 196600 + }, + { + "epoch": 1.7380965009989569, + "grad_norm": 1.9359999895095825, + "learning_rate": 2.103172498335072e-05, + "loss": 0.5392, + "step": 196610 + }, + { + "epoch": 1.7381849042592692, + "grad_norm": 4.035454273223877, + "learning_rate": 2.1030251595678848e-05, + "loss": 0.6259, + "step": 196620 + }, + { + "epoch": 1.7382733075195813, + "grad_norm": 5.301139831542969, + "learning_rate": 2.102877820800698e-05, + "loss": 0.5815, + "step": 196630 + }, + { + "epoch": 1.7383617107798934, + "grad_norm": 2.0500972270965576, + "learning_rate": 2.1027304820335108e-05, + "loss": 0.5499, + "step": 196640 + }, + { + "epoch": 1.7384501140402058, + "grad_norm": 1.364989161491394, + "learning_rate": 2.1025831432663236e-05, + "loss": 0.5388, + "step": 196650 + }, + { + "epoch": 1.7385385173005181, + "grad_norm": 3.1312942504882812, + "learning_rate": 2.1024358044991364e-05, + "loss": 0.6786, + "step": 196660 + }, + { + "epoch": 1.7386269205608302, + "grad_norm": 1.338382601737976, + "learning_rate": 2.1022884657319496e-05, + "loss": 0.5909, + "step": 196670 + }, + { + "epoch": 1.7387153238211426, + "grad_norm": 9.556388854980469, + "learning_rate": 2.1021411269647625e-05, + "loss": 0.6866, + "step": 196680 + }, + { + "epoch": 1.738803727081455, + "grad_norm": 1.7506722211837769, + "learning_rate": 2.1019937881975753e-05, + "loss": 0.5025, + "step": 196690 + }, + { + "epoch": 1.738892130341767, + "grad_norm": 3.2631466388702393, + "learning_rate": 2.1018464494303885e-05, + "loss": 0.5898, + "step": 196700 + }, + { + "epoch": 1.7389805336020792, + "grad_norm": 1.8236184120178223, + "learning_rate": 2.1016991106632013e-05, + "loss": 0.4798, + "step": 196710 + }, + { + "epoch": 1.7390689368623915, + "grad_norm": 4.561819076538086, + "learning_rate": 2.101551771896014e-05, + "loss": 0.5659, + "step": 196720 + }, + { + "epoch": 1.7391573401227038, + "grad_norm": 11.79699420928955, + "learning_rate": 2.101404433128827e-05, + "loss": 0.4901, + "step": 196730 + }, + { + "epoch": 1.739245743383016, + "grad_norm": 1.3858580589294434, + "learning_rate": 2.10125709436164e-05, + "loss": 0.6426, + "step": 196740 + }, + { + "epoch": 1.739334146643328, + "grad_norm": 7.432936668395996, + "learning_rate": 2.101109755594453e-05, + "loss": 0.6957, + "step": 196750 + }, + { + "epoch": 1.7394225499036404, + "grad_norm": 3.0839948654174805, + "learning_rate": 2.1009624168272658e-05, + "loss": 0.5432, + "step": 196760 + }, + { + "epoch": 1.7395109531639528, + "grad_norm": 3.8584437370300293, + "learning_rate": 2.1008150780600787e-05, + "loss": 0.5606, + "step": 196770 + }, + { + "epoch": 1.7395993564242649, + "grad_norm": 2.1494619846343994, + "learning_rate": 2.1006677392928918e-05, + "loss": 0.6634, + "step": 196780 + }, + { + "epoch": 1.739687759684577, + "grad_norm": 1.4211783409118652, + "learning_rate": 2.1005204005257047e-05, + "loss": 0.519, + "step": 196790 + }, + { + "epoch": 1.7397761629448896, + "grad_norm": 2.011362314224243, + "learning_rate": 2.1003730617585175e-05, + "loss": 0.5824, + "step": 196800 + }, + { + "epoch": 1.7398645662052017, + "grad_norm": 1.8710979223251343, + "learning_rate": 2.1002257229913307e-05, + "loss": 0.5322, + "step": 196810 + }, + { + "epoch": 1.7399529694655138, + "grad_norm": 1.362769365310669, + "learning_rate": 2.1000783842241435e-05, + "loss": 0.6559, + "step": 196820 + }, + { + "epoch": 1.7400413727258262, + "grad_norm": 3.9178881645202637, + "learning_rate": 2.0999310454569563e-05, + "loss": 0.5809, + "step": 196830 + }, + { + "epoch": 1.7401297759861385, + "grad_norm": 3.1922249794006348, + "learning_rate": 2.0997837066897695e-05, + "loss": 0.6025, + "step": 196840 + }, + { + "epoch": 1.7402181792464506, + "grad_norm": 1.3927693367004395, + "learning_rate": 2.0996363679225824e-05, + "loss": 0.569, + "step": 196850 + }, + { + "epoch": 1.7403065825067627, + "grad_norm": 3.245332956314087, + "learning_rate": 2.0994890291553955e-05, + "loss": 0.6722, + "step": 196860 + }, + { + "epoch": 1.740394985767075, + "grad_norm": 0.815080463886261, + "learning_rate": 2.0993416903882084e-05, + "loss": 0.6002, + "step": 196870 + }, + { + "epoch": 1.7404833890273874, + "grad_norm": 6.448976993560791, + "learning_rate": 2.0991943516210212e-05, + "loss": 0.6991, + "step": 196880 + }, + { + "epoch": 1.7405717922876995, + "grad_norm": 1.5082669258117676, + "learning_rate": 2.0990470128538344e-05, + "loss": 0.406, + "step": 196890 + }, + { + "epoch": 1.7406601955480117, + "grad_norm": 18.35371208190918, + "learning_rate": 2.0988996740866472e-05, + "loss": 0.69, + "step": 196900 + }, + { + "epoch": 1.740748598808324, + "grad_norm": 1.7704530954360962, + "learning_rate": 2.09875233531946e-05, + "loss": 0.4987, + "step": 196910 + }, + { + "epoch": 1.7408370020686363, + "grad_norm": 2.0578227043151855, + "learning_rate": 2.0986049965522732e-05, + "loss": 0.5107, + "step": 196920 + }, + { + "epoch": 1.7409254053289485, + "grad_norm": 12.334599494934082, + "learning_rate": 2.098457657785086e-05, + "loss": 0.5618, + "step": 196930 + }, + { + "epoch": 1.7410138085892608, + "grad_norm": 1.5528627634048462, + "learning_rate": 2.098310319017899e-05, + "loss": 0.6015, + "step": 196940 + }, + { + "epoch": 1.7411022118495731, + "grad_norm": 4.1872100830078125, + "learning_rate": 2.0981629802507117e-05, + "loss": 0.4583, + "step": 196950 + }, + { + "epoch": 1.7411906151098853, + "grad_norm": 3.3384766578674316, + "learning_rate": 2.098015641483525e-05, + "loss": 0.6458, + "step": 196960 + }, + { + "epoch": 1.7412790183701974, + "grad_norm": 3.9110372066497803, + "learning_rate": 2.0978683027163377e-05, + "loss": 0.5932, + "step": 196970 + }, + { + "epoch": 1.7413674216305097, + "grad_norm": 10.663474082946777, + "learning_rate": 2.0977209639491506e-05, + "loss": 0.5076, + "step": 196980 + }, + { + "epoch": 1.741455824890822, + "grad_norm": 4.7137064933776855, + "learning_rate": 2.0975736251819637e-05, + "loss": 0.5824, + "step": 196990 + }, + { + "epoch": 1.7415442281511342, + "grad_norm": 0.7676690220832825, + "learning_rate": 2.0974262864147766e-05, + "loss": 0.4205, + "step": 197000 + }, + { + "epoch": 1.7416326314114463, + "grad_norm": 9.2551851272583, + "learning_rate": 2.0972789476475894e-05, + "loss": 0.6729, + "step": 197010 + }, + { + "epoch": 1.7417210346717586, + "grad_norm": 7.426784038543701, + "learning_rate": 2.0971316088804023e-05, + "loss": 0.6624, + "step": 197020 + }, + { + "epoch": 1.741809437932071, + "grad_norm": 0.8745737075805664, + "learning_rate": 2.0969842701132154e-05, + "loss": 0.5729, + "step": 197030 + }, + { + "epoch": 1.7418978411923831, + "grad_norm": 8.710653305053711, + "learning_rate": 2.0968369313460283e-05, + "loss": 0.5401, + "step": 197040 + }, + { + "epoch": 1.7419862444526955, + "grad_norm": 1.2940747737884521, + "learning_rate": 2.096689592578841e-05, + "loss": 0.5226, + "step": 197050 + }, + { + "epoch": 1.7420746477130078, + "grad_norm": 20.898895263671875, + "learning_rate": 2.0965422538116543e-05, + "loss": 0.634, + "step": 197060 + }, + { + "epoch": 1.74216305097332, + "grad_norm": 12.74487590789795, + "learning_rate": 2.096394915044467e-05, + "loss": 0.4975, + "step": 197070 + }, + { + "epoch": 1.742251454233632, + "grad_norm": 3.2381784915924072, + "learning_rate": 2.09624757627728e-05, + "loss": 0.4833, + "step": 197080 + }, + { + "epoch": 1.7423398574939444, + "grad_norm": 2.0809082984924316, + "learning_rate": 2.0961002375100928e-05, + "loss": 0.6267, + "step": 197090 + }, + { + "epoch": 1.7424282607542567, + "grad_norm": 1.789787769317627, + "learning_rate": 2.095952898742906e-05, + "loss": 0.4638, + "step": 197100 + }, + { + "epoch": 1.7425166640145688, + "grad_norm": 1.7888133525848389, + "learning_rate": 2.0958055599757188e-05, + "loss": 0.6171, + "step": 197110 + }, + { + "epoch": 1.742605067274881, + "grad_norm": 6.754447937011719, + "learning_rate": 2.0956582212085316e-05, + "loss": 0.5902, + "step": 197120 + }, + { + "epoch": 1.7426934705351933, + "grad_norm": 2.8703441619873047, + "learning_rate": 2.0955108824413445e-05, + "loss": 0.6575, + "step": 197130 + }, + { + "epoch": 1.7427818737955056, + "grad_norm": 25.108125686645508, + "learning_rate": 2.0953635436741576e-05, + "loss": 0.6874, + "step": 197140 + }, + { + "epoch": 1.7428702770558178, + "grad_norm": 3.9465975761413574, + "learning_rate": 2.0952162049069705e-05, + "loss": 0.6163, + "step": 197150 + }, + { + "epoch": 1.74295868031613, + "grad_norm": 8.941535949707031, + "learning_rate": 2.0950688661397833e-05, + "loss": 0.7202, + "step": 197160 + }, + { + "epoch": 1.7430470835764424, + "grad_norm": 0.8280032873153687, + "learning_rate": 2.0949215273725965e-05, + "loss": 0.4764, + "step": 197170 + }, + { + "epoch": 1.7431354868367546, + "grad_norm": 1.437925100326538, + "learning_rate": 2.0947741886054093e-05, + "loss": 0.5309, + "step": 197180 + }, + { + "epoch": 1.7432238900970667, + "grad_norm": 3.3845760822296143, + "learning_rate": 2.094626849838222e-05, + "loss": 0.5555, + "step": 197190 + }, + { + "epoch": 1.743312293357379, + "grad_norm": 6.526894569396973, + "learning_rate": 2.094479511071035e-05, + "loss": 0.6583, + "step": 197200 + }, + { + "epoch": 1.7434006966176914, + "grad_norm": 2.2649056911468506, + "learning_rate": 2.094332172303848e-05, + "loss": 0.6363, + "step": 197210 + }, + { + "epoch": 1.7434890998780035, + "grad_norm": 1.5738238096237183, + "learning_rate": 2.094184833536661e-05, + "loss": 0.7025, + "step": 197220 + }, + { + "epoch": 1.7435775031383156, + "grad_norm": 1.7489577531814575, + "learning_rate": 2.0940374947694738e-05, + "loss": 0.6186, + "step": 197230 + }, + { + "epoch": 1.743665906398628, + "grad_norm": 6.811504364013672, + "learning_rate": 2.0938901560022867e-05, + "loss": 0.5891, + "step": 197240 + }, + { + "epoch": 1.7437543096589403, + "grad_norm": 2.0611021518707275, + "learning_rate": 2.0937428172351e-05, + "loss": 0.5561, + "step": 197250 + }, + { + "epoch": 1.7438427129192524, + "grad_norm": 1.748860478401184, + "learning_rate": 2.0935954784679127e-05, + "loss": 0.6354, + "step": 197260 + }, + { + "epoch": 1.7439311161795648, + "grad_norm": 2.872514486312866, + "learning_rate": 2.0934481397007255e-05, + "loss": 0.6794, + "step": 197270 + }, + { + "epoch": 1.744019519439877, + "grad_norm": 2.2312440872192383, + "learning_rate": 2.0933008009335387e-05, + "loss": 0.63, + "step": 197280 + }, + { + "epoch": 1.7441079227001892, + "grad_norm": 2.638317584991455, + "learning_rate": 2.0931534621663515e-05, + "loss": 0.6223, + "step": 197290 + }, + { + "epoch": 1.7441963259605013, + "grad_norm": 1.5555953979492188, + "learning_rate": 2.0930061233991644e-05, + "loss": 0.513, + "step": 197300 + }, + { + "epoch": 1.7442847292208137, + "grad_norm": 1.947716474533081, + "learning_rate": 2.0928587846319772e-05, + "loss": 0.5422, + "step": 197310 + }, + { + "epoch": 1.744373132481126, + "grad_norm": 5.751101493835449, + "learning_rate": 2.0927114458647904e-05, + "loss": 0.6981, + "step": 197320 + }, + { + "epoch": 1.7444615357414381, + "grad_norm": 1.7628252506256104, + "learning_rate": 2.0925641070976032e-05, + "loss": 0.5721, + "step": 197330 + }, + { + "epoch": 1.7445499390017503, + "grad_norm": 3.8472437858581543, + "learning_rate": 2.092416768330416e-05, + "loss": 0.5763, + "step": 197340 + }, + { + "epoch": 1.7446383422620626, + "grad_norm": 3.0430450439453125, + "learning_rate": 2.0922694295632292e-05, + "loss": 0.6133, + "step": 197350 + }, + { + "epoch": 1.744726745522375, + "grad_norm": 1.3532829284667969, + "learning_rate": 2.092122090796042e-05, + "loss": 0.5364, + "step": 197360 + }, + { + "epoch": 1.744815148782687, + "grad_norm": 3.3958280086517334, + "learning_rate": 2.091974752028855e-05, + "loss": 0.6856, + "step": 197370 + }, + { + "epoch": 1.7449035520429992, + "grad_norm": 1.0810480117797852, + "learning_rate": 2.0918274132616677e-05, + "loss": 0.6368, + "step": 197380 + }, + { + "epoch": 1.7449919553033117, + "grad_norm": 3.335617780685425, + "learning_rate": 2.091680074494481e-05, + "loss": 0.4823, + "step": 197390 + }, + { + "epoch": 1.7450803585636239, + "grad_norm": 4.657704830169678, + "learning_rate": 2.0915327357272937e-05, + "loss": 0.5041, + "step": 197400 + }, + { + "epoch": 1.745168761823936, + "grad_norm": 2.925278663635254, + "learning_rate": 2.0913853969601066e-05, + "loss": 0.7198, + "step": 197410 + }, + { + "epoch": 1.7452571650842483, + "grad_norm": 2.435164451599121, + "learning_rate": 2.0912380581929194e-05, + "loss": 0.6114, + "step": 197420 + }, + { + "epoch": 1.7453455683445607, + "grad_norm": 2.3864328861236572, + "learning_rate": 2.0910907194257326e-05, + "loss": 0.4957, + "step": 197430 + }, + { + "epoch": 1.7454339716048728, + "grad_norm": 2.1344692707061768, + "learning_rate": 2.0909433806585454e-05, + "loss": 0.6309, + "step": 197440 + }, + { + "epoch": 1.745522374865185, + "grad_norm": 8.217358589172363, + "learning_rate": 2.0907960418913582e-05, + "loss": 0.5852, + "step": 197450 + }, + { + "epoch": 1.7456107781254973, + "grad_norm": 2.3924551010131836, + "learning_rate": 2.0906487031241714e-05, + "loss": 0.5538, + "step": 197460 + }, + { + "epoch": 1.7456991813858096, + "grad_norm": 2.210322380065918, + "learning_rate": 2.0905013643569842e-05, + "loss": 0.6097, + "step": 197470 + }, + { + "epoch": 1.7457875846461217, + "grad_norm": 4.129487991333008, + "learning_rate": 2.090354025589797e-05, + "loss": 0.6492, + "step": 197480 + }, + { + "epoch": 1.7458759879064338, + "grad_norm": 4.150144577026367, + "learning_rate": 2.09020668682261e-05, + "loss": 0.6553, + "step": 197490 + }, + { + "epoch": 1.7459643911667462, + "grad_norm": 6.629619598388672, + "learning_rate": 2.090059348055423e-05, + "loss": 0.5711, + "step": 197500 + }, + { + "epoch": 1.7460527944270585, + "grad_norm": 1.9283958673477173, + "learning_rate": 2.089912009288236e-05, + "loss": 0.443, + "step": 197510 + }, + { + "epoch": 1.7461411976873706, + "grad_norm": 2.020054578781128, + "learning_rate": 2.0897646705210488e-05, + "loss": 0.6163, + "step": 197520 + }, + { + "epoch": 1.746229600947683, + "grad_norm": 8.528481483459473, + "learning_rate": 2.089617331753862e-05, + "loss": 0.5935, + "step": 197530 + }, + { + "epoch": 1.7463180042079953, + "grad_norm": 1.652199149131775, + "learning_rate": 2.0894699929866748e-05, + "loss": 0.5647, + "step": 197540 + }, + { + "epoch": 1.7464064074683074, + "grad_norm": 5.253749370574951, + "learning_rate": 2.0893226542194876e-05, + "loss": 0.6569, + "step": 197550 + }, + { + "epoch": 1.7464948107286196, + "grad_norm": 1.7958537340164185, + "learning_rate": 2.0891753154523004e-05, + "loss": 0.5579, + "step": 197560 + }, + { + "epoch": 1.746583213988932, + "grad_norm": 3.0275492668151855, + "learning_rate": 2.0890279766851136e-05, + "loss": 0.6173, + "step": 197570 + }, + { + "epoch": 1.7466716172492442, + "grad_norm": 1.8350715637207031, + "learning_rate": 2.0888806379179265e-05, + "loss": 0.6338, + "step": 197580 + }, + { + "epoch": 1.7467600205095564, + "grad_norm": 3.038846015930176, + "learning_rate": 2.0887332991507393e-05, + "loss": 0.5396, + "step": 197590 + }, + { + "epoch": 1.7468484237698685, + "grad_norm": 11.514623641967773, + "learning_rate": 2.088585960383552e-05, + "loss": 0.5687, + "step": 197600 + }, + { + "epoch": 1.7469368270301808, + "grad_norm": 4.025920867919922, + "learning_rate": 2.0884386216163653e-05, + "loss": 0.6045, + "step": 197610 + }, + { + "epoch": 1.7470252302904932, + "grad_norm": 1.6115721464157104, + "learning_rate": 2.088291282849178e-05, + "loss": 0.5957, + "step": 197620 + }, + { + "epoch": 1.7471136335508053, + "grad_norm": 4.450847625732422, + "learning_rate": 2.088143944081991e-05, + "loss": 0.5725, + "step": 197630 + }, + { + "epoch": 1.7472020368111176, + "grad_norm": 0.9861357808113098, + "learning_rate": 2.087996605314804e-05, + "loss": 0.555, + "step": 197640 + }, + { + "epoch": 1.74729044007143, + "grad_norm": 1.5301084518432617, + "learning_rate": 2.087849266547617e-05, + "loss": 0.6504, + "step": 197650 + }, + { + "epoch": 1.747378843331742, + "grad_norm": 1.3909530639648438, + "learning_rate": 2.0877019277804298e-05, + "loss": 0.5335, + "step": 197660 + }, + { + "epoch": 1.7474672465920542, + "grad_norm": 1.4576557874679565, + "learning_rate": 2.0875545890132426e-05, + "loss": 0.5952, + "step": 197670 + }, + { + "epoch": 1.7475556498523666, + "grad_norm": 1.7314077615737915, + "learning_rate": 2.0874072502460558e-05, + "loss": 0.719, + "step": 197680 + }, + { + "epoch": 1.747644053112679, + "grad_norm": 1.678776741027832, + "learning_rate": 2.0872599114788687e-05, + "loss": 0.546, + "step": 197690 + }, + { + "epoch": 1.747732456372991, + "grad_norm": 2.1321935653686523, + "learning_rate": 2.0871125727116815e-05, + "loss": 0.5656, + "step": 197700 + }, + { + "epoch": 1.7478208596333031, + "grad_norm": 2.211000919342041, + "learning_rate": 2.0869652339444947e-05, + "loss": 0.6632, + "step": 197710 + }, + { + "epoch": 1.7479092628936155, + "grad_norm": 1.558192491531372, + "learning_rate": 2.0868178951773075e-05, + "loss": 0.5655, + "step": 197720 + }, + { + "epoch": 1.7479976661539278, + "grad_norm": 3.241481304168701, + "learning_rate": 2.0866705564101203e-05, + "loss": 0.5737, + "step": 197730 + }, + { + "epoch": 1.74808606941424, + "grad_norm": 2.6850011348724365, + "learning_rate": 2.0865232176429335e-05, + "loss": 0.4636, + "step": 197740 + }, + { + "epoch": 1.7481744726745523, + "grad_norm": 2.521146297454834, + "learning_rate": 2.0863758788757463e-05, + "loss": 0.498, + "step": 197750 + }, + { + "epoch": 1.7482628759348646, + "grad_norm": 13.72861385345459, + "learning_rate": 2.0862285401085592e-05, + "loss": 0.4228, + "step": 197760 + }, + { + "epoch": 1.7483512791951767, + "grad_norm": 3.342637777328491, + "learning_rate": 2.0860812013413724e-05, + "loss": 0.6127, + "step": 197770 + }, + { + "epoch": 1.7484396824554889, + "grad_norm": 8.745201110839844, + "learning_rate": 2.0859338625741852e-05, + "loss": 0.6358, + "step": 197780 + }, + { + "epoch": 1.7485280857158012, + "grad_norm": 3.7885406017303467, + "learning_rate": 2.085786523806998e-05, + "loss": 0.4518, + "step": 197790 + }, + { + "epoch": 1.7486164889761135, + "grad_norm": 2.9585280418395996, + "learning_rate": 2.0856391850398112e-05, + "loss": 0.6938, + "step": 197800 + }, + { + "epoch": 1.7487048922364257, + "grad_norm": 2.1558680534362793, + "learning_rate": 2.085491846272624e-05, + "loss": 0.6789, + "step": 197810 + }, + { + "epoch": 1.7487932954967378, + "grad_norm": 2.011660575866699, + "learning_rate": 2.085344507505437e-05, + "loss": 0.5276, + "step": 197820 + }, + { + "epoch": 1.7488816987570501, + "grad_norm": 1.6391394138336182, + "learning_rate": 2.08519716873825e-05, + "loss": 0.6125, + "step": 197830 + }, + { + "epoch": 1.7489701020173625, + "grad_norm": 1.2173222303390503, + "learning_rate": 2.085049829971063e-05, + "loss": 0.6754, + "step": 197840 + }, + { + "epoch": 1.7490585052776746, + "grad_norm": 2.6116750240325928, + "learning_rate": 2.0849024912038757e-05, + "loss": 0.6366, + "step": 197850 + }, + { + "epoch": 1.749146908537987, + "grad_norm": 16.526519775390625, + "learning_rate": 2.084755152436689e-05, + "loss": 0.5286, + "step": 197860 + }, + { + "epoch": 1.7492353117982993, + "grad_norm": 4.576642990112305, + "learning_rate": 2.0846078136695017e-05, + "loss": 0.6201, + "step": 197870 + }, + { + "epoch": 1.7493237150586114, + "grad_norm": 5.97806453704834, + "learning_rate": 2.0844604749023146e-05, + "loss": 0.5511, + "step": 197880 + }, + { + "epoch": 1.7494121183189235, + "grad_norm": 2.474228858947754, + "learning_rate": 2.0843131361351274e-05, + "loss": 0.6874, + "step": 197890 + }, + { + "epoch": 1.7495005215792359, + "grad_norm": 3.8049893379211426, + "learning_rate": 2.0841657973679406e-05, + "loss": 0.6463, + "step": 197900 + }, + { + "epoch": 1.7495889248395482, + "grad_norm": 2.4274582862854004, + "learning_rate": 2.0840184586007534e-05, + "loss": 0.5975, + "step": 197910 + }, + { + "epoch": 1.7496773280998603, + "grad_norm": 2.680166244506836, + "learning_rate": 2.0838711198335662e-05, + "loss": 0.5186, + "step": 197920 + }, + { + "epoch": 1.7497657313601724, + "grad_norm": 5.076358795166016, + "learning_rate": 2.0837237810663794e-05, + "loss": 0.6417, + "step": 197930 + }, + { + "epoch": 1.7498541346204848, + "grad_norm": 1.2135424613952637, + "learning_rate": 2.0835764422991923e-05, + "loss": 0.6321, + "step": 197940 + }, + { + "epoch": 1.7499425378807971, + "grad_norm": 3.6951210498809814, + "learning_rate": 2.083429103532005e-05, + "loss": 0.6703, + "step": 197950 + }, + { + "epoch": 1.7500309411411092, + "grad_norm": 1.8483201265335083, + "learning_rate": 2.083281764764818e-05, + "loss": 0.6007, + "step": 197960 + }, + { + "epoch": 1.7501193444014214, + "grad_norm": 2.100478410720825, + "learning_rate": 2.083134425997631e-05, + "loss": 0.6677, + "step": 197970 + }, + { + "epoch": 1.750207747661734, + "grad_norm": 3.2420291900634766, + "learning_rate": 2.082987087230444e-05, + "loss": 0.4966, + "step": 197980 + }, + { + "epoch": 1.750296150922046, + "grad_norm": 1.5824224948883057, + "learning_rate": 2.0828397484632568e-05, + "loss": 0.5962, + "step": 197990 + }, + { + "epoch": 1.7503845541823582, + "grad_norm": 8.216656684875488, + "learning_rate": 2.08269240969607e-05, + "loss": 0.4532, + "step": 198000 + }, + { + "epoch": 1.7504729574426705, + "grad_norm": 0.8512338399887085, + "learning_rate": 2.0825450709288828e-05, + "loss": 0.5657, + "step": 198010 + }, + { + "epoch": 1.7505613607029828, + "grad_norm": 5.59337043762207, + "learning_rate": 2.0823977321616956e-05, + "loss": 0.5933, + "step": 198020 + }, + { + "epoch": 1.750649763963295, + "grad_norm": 1.7913964986801147, + "learning_rate": 2.0822503933945085e-05, + "loss": 0.5116, + "step": 198030 + }, + { + "epoch": 1.750738167223607, + "grad_norm": 3.5717811584472656, + "learning_rate": 2.0821030546273216e-05, + "loss": 0.5585, + "step": 198040 + }, + { + "epoch": 1.7508265704839194, + "grad_norm": 3.144894599914551, + "learning_rate": 2.0819557158601345e-05, + "loss": 0.6281, + "step": 198050 + }, + { + "epoch": 1.7509149737442318, + "grad_norm": 2.6191606521606445, + "learning_rate": 2.0818083770929473e-05, + "loss": 0.5447, + "step": 198060 + }, + { + "epoch": 1.751003377004544, + "grad_norm": 0.7619302868843079, + "learning_rate": 2.08166103832576e-05, + "loss": 0.4921, + "step": 198070 + }, + { + "epoch": 1.751091780264856, + "grad_norm": 4.799787998199463, + "learning_rate": 2.0815136995585733e-05, + "loss": 0.5855, + "step": 198080 + }, + { + "epoch": 1.7511801835251684, + "grad_norm": 1.671536922454834, + "learning_rate": 2.081366360791386e-05, + "loss": 0.5948, + "step": 198090 + }, + { + "epoch": 1.7512685867854807, + "grad_norm": 3.216219425201416, + "learning_rate": 2.081219022024199e-05, + "loss": 0.5781, + "step": 198100 + }, + { + "epoch": 1.7513569900457928, + "grad_norm": 8.024598121643066, + "learning_rate": 2.081071683257012e-05, + "loss": 0.5581, + "step": 198110 + }, + { + "epoch": 1.7514453933061052, + "grad_norm": 1.9712045192718506, + "learning_rate": 2.080924344489825e-05, + "loss": 0.5852, + "step": 198120 + }, + { + "epoch": 1.7515337965664175, + "grad_norm": 2.13334321975708, + "learning_rate": 2.0807770057226378e-05, + "loss": 0.8064, + "step": 198130 + }, + { + "epoch": 1.7516221998267296, + "grad_norm": 15.331814765930176, + "learning_rate": 2.0806296669554507e-05, + "loss": 0.558, + "step": 198140 + }, + { + "epoch": 1.7517106030870417, + "grad_norm": 1.680092215538025, + "learning_rate": 2.080482328188264e-05, + "loss": 0.6171, + "step": 198150 + }, + { + "epoch": 1.751799006347354, + "grad_norm": 1.7920665740966797, + "learning_rate": 2.0803349894210767e-05, + "loss": 0.5709, + "step": 198160 + }, + { + "epoch": 1.7518874096076664, + "grad_norm": 2.1616902351379395, + "learning_rate": 2.0801876506538895e-05, + "loss": 0.6331, + "step": 198170 + }, + { + "epoch": 1.7519758128679785, + "grad_norm": 1.8493404388427734, + "learning_rate": 2.0800403118867027e-05, + "loss": 0.6714, + "step": 198180 + }, + { + "epoch": 1.7520642161282907, + "grad_norm": 5.895579814910889, + "learning_rate": 2.0798929731195155e-05, + "loss": 0.6431, + "step": 198190 + }, + { + "epoch": 1.752152619388603, + "grad_norm": 5.15779972076416, + "learning_rate": 2.0797456343523283e-05, + "loss": 0.7133, + "step": 198200 + }, + { + "epoch": 1.7522410226489153, + "grad_norm": 2.4788951873779297, + "learning_rate": 2.0795982955851412e-05, + "loss": 0.6498, + "step": 198210 + }, + { + "epoch": 1.7523294259092275, + "grad_norm": 2.990628957748413, + "learning_rate": 2.0794509568179544e-05, + "loss": 0.5041, + "step": 198220 + }, + { + "epoch": 1.7524178291695398, + "grad_norm": 2.803309202194214, + "learning_rate": 2.0793036180507672e-05, + "loss": 0.6219, + "step": 198230 + }, + { + "epoch": 1.7525062324298522, + "grad_norm": 2.890803337097168, + "learning_rate": 2.07915627928358e-05, + "loss": 0.658, + "step": 198240 + }, + { + "epoch": 1.7525946356901643, + "grad_norm": 9.967921257019043, + "learning_rate": 2.079008940516393e-05, + "loss": 0.469, + "step": 198250 + }, + { + "epoch": 1.7526830389504764, + "grad_norm": 1.3004199266433716, + "learning_rate": 2.078861601749206e-05, + "loss": 0.6036, + "step": 198260 + }, + { + "epoch": 1.7527714422107887, + "grad_norm": 1.2337579727172852, + "learning_rate": 2.078714262982019e-05, + "loss": 0.6715, + "step": 198270 + }, + { + "epoch": 1.752859845471101, + "grad_norm": 2.868938446044922, + "learning_rate": 2.0785669242148317e-05, + "loss": 0.6415, + "step": 198280 + }, + { + "epoch": 1.7529482487314132, + "grad_norm": 0.981107234954834, + "learning_rate": 2.078419585447645e-05, + "loss": 0.5801, + "step": 198290 + }, + { + "epoch": 1.7530366519917253, + "grad_norm": 8.54338550567627, + "learning_rate": 2.0782722466804577e-05, + "loss": 0.7166, + "step": 198300 + }, + { + "epoch": 1.7531250552520377, + "grad_norm": 1.221792221069336, + "learning_rate": 2.0781249079132706e-05, + "loss": 0.551, + "step": 198310 + }, + { + "epoch": 1.75321345851235, + "grad_norm": 4.302059650421143, + "learning_rate": 2.0779775691460834e-05, + "loss": 0.6789, + "step": 198320 + }, + { + "epoch": 1.7533018617726621, + "grad_norm": 5.496481418609619, + "learning_rate": 2.0778302303788966e-05, + "loss": 0.4865, + "step": 198330 + }, + { + "epoch": 1.7533902650329745, + "grad_norm": 4.6418986320495605, + "learning_rate": 2.0776828916117094e-05, + "loss": 0.6185, + "step": 198340 + }, + { + "epoch": 1.7534786682932868, + "grad_norm": 12.174217224121094, + "learning_rate": 2.0775355528445222e-05, + "loss": 0.6402, + "step": 198350 + }, + { + "epoch": 1.753567071553599, + "grad_norm": 4.296413421630859, + "learning_rate": 2.077388214077335e-05, + "loss": 0.671, + "step": 198360 + }, + { + "epoch": 1.753655474813911, + "grad_norm": 8.448339462280273, + "learning_rate": 2.0772408753101482e-05, + "loss": 0.5399, + "step": 198370 + }, + { + "epoch": 1.7537438780742234, + "grad_norm": 1.545261025428772, + "learning_rate": 2.077093536542961e-05, + "loss": 0.5635, + "step": 198380 + }, + { + "epoch": 1.7538322813345357, + "grad_norm": 12.022567749023438, + "learning_rate": 2.076946197775774e-05, + "loss": 0.6601, + "step": 198390 + }, + { + "epoch": 1.7539206845948478, + "grad_norm": 2.1051766872406006, + "learning_rate": 2.076798859008587e-05, + "loss": 0.4195, + "step": 198400 + }, + { + "epoch": 1.75400908785516, + "grad_norm": 1.4720613956451416, + "learning_rate": 2.0766515202414e-05, + "loss": 0.6752, + "step": 198410 + }, + { + "epoch": 1.7540974911154723, + "grad_norm": 1.620879054069519, + "learning_rate": 2.0765041814742128e-05, + "loss": 0.4582, + "step": 198420 + }, + { + "epoch": 1.7541858943757846, + "grad_norm": 1.0339560508728027, + "learning_rate": 2.0763568427070256e-05, + "loss": 0.5416, + "step": 198430 + }, + { + "epoch": 1.7542742976360968, + "grad_norm": 3.3532519340515137, + "learning_rate": 2.0762095039398388e-05, + "loss": 0.5899, + "step": 198440 + }, + { + "epoch": 1.754362700896409, + "grad_norm": 1.8915557861328125, + "learning_rate": 2.0760621651726516e-05, + "loss": 0.6154, + "step": 198450 + }, + { + "epoch": 1.7544511041567215, + "grad_norm": 3.085219144821167, + "learning_rate": 2.0759148264054644e-05, + "loss": 0.6711, + "step": 198460 + }, + { + "epoch": 1.7545395074170336, + "grad_norm": 2.022360324859619, + "learning_rate": 2.0757674876382776e-05, + "loss": 0.5663, + "step": 198470 + }, + { + "epoch": 1.7546279106773457, + "grad_norm": 7.102045059204102, + "learning_rate": 2.0756201488710904e-05, + "loss": 0.5613, + "step": 198480 + }, + { + "epoch": 1.754716313937658, + "grad_norm": 4.3691325187683105, + "learning_rate": 2.0754728101039033e-05, + "loss": 0.6342, + "step": 198490 + }, + { + "epoch": 1.7548047171979704, + "grad_norm": 1.8911253213882446, + "learning_rate": 2.075325471336716e-05, + "loss": 0.6014, + "step": 198500 + }, + { + "epoch": 1.7548931204582825, + "grad_norm": 0.9943745136260986, + "learning_rate": 2.0751781325695293e-05, + "loss": 0.5776, + "step": 198510 + }, + { + "epoch": 1.7549815237185946, + "grad_norm": 1.7579480409622192, + "learning_rate": 2.075030793802342e-05, + "loss": 0.5863, + "step": 198520 + }, + { + "epoch": 1.755069926978907, + "grad_norm": 1.8364801406860352, + "learning_rate": 2.074883455035155e-05, + "loss": 0.4875, + "step": 198530 + }, + { + "epoch": 1.7551583302392193, + "grad_norm": 4.80426549911499, + "learning_rate": 2.0747361162679678e-05, + "loss": 0.4999, + "step": 198540 + }, + { + "epoch": 1.7552467334995314, + "grad_norm": 3.127359390258789, + "learning_rate": 2.074588777500781e-05, + "loss": 0.4488, + "step": 198550 + }, + { + "epoch": 1.7553351367598435, + "grad_norm": 1.0734896659851074, + "learning_rate": 2.0744414387335938e-05, + "loss": 0.5554, + "step": 198560 + }, + { + "epoch": 1.755423540020156, + "grad_norm": 2.630678653717041, + "learning_rate": 2.0742940999664066e-05, + "loss": 0.5699, + "step": 198570 + }, + { + "epoch": 1.7555119432804682, + "grad_norm": 7.725094318389893, + "learning_rate": 2.0741467611992198e-05, + "loss": 0.6702, + "step": 198580 + }, + { + "epoch": 1.7556003465407803, + "grad_norm": 1.153080701828003, + "learning_rate": 2.0739994224320327e-05, + "loss": 0.6543, + "step": 198590 + }, + { + "epoch": 1.7556887498010927, + "grad_norm": 1.0754648447036743, + "learning_rate": 2.0738520836648455e-05, + "loss": 0.5494, + "step": 198600 + }, + { + "epoch": 1.755777153061405, + "grad_norm": 2.021418333053589, + "learning_rate": 2.0737047448976583e-05, + "loss": 0.5057, + "step": 198610 + }, + { + "epoch": 1.7558655563217171, + "grad_norm": 2.577563762664795, + "learning_rate": 2.0735574061304715e-05, + "loss": 0.6664, + "step": 198620 + }, + { + "epoch": 1.7559539595820293, + "grad_norm": 1.5344913005828857, + "learning_rate": 2.0734100673632843e-05, + "loss": 0.5796, + "step": 198630 + }, + { + "epoch": 1.7560423628423416, + "grad_norm": 4.062793731689453, + "learning_rate": 2.0732627285960972e-05, + "loss": 0.6136, + "step": 198640 + }, + { + "epoch": 1.756130766102654, + "grad_norm": 4.947664260864258, + "learning_rate": 2.0731153898289103e-05, + "loss": 0.6089, + "step": 198650 + }, + { + "epoch": 1.756219169362966, + "grad_norm": 1.329883337020874, + "learning_rate": 2.0729680510617232e-05, + "loss": 0.5791, + "step": 198660 + }, + { + "epoch": 1.7563075726232782, + "grad_norm": 2.21018385887146, + "learning_rate": 2.072820712294536e-05, + "loss": 0.617, + "step": 198670 + }, + { + "epoch": 1.7563959758835905, + "grad_norm": 10.442187309265137, + "learning_rate": 2.0726733735273492e-05, + "loss": 0.546, + "step": 198680 + }, + { + "epoch": 1.7564843791439029, + "grad_norm": 2.224388360977173, + "learning_rate": 2.072526034760162e-05, + "loss": 0.4472, + "step": 198690 + }, + { + "epoch": 1.756572782404215, + "grad_norm": 1.3046085834503174, + "learning_rate": 2.072378695992975e-05, + "loss": 0.4822, + "step": 198700 + }, + { + "epoch": 1.7566611856645273, + "grad_norm": 20.326618194580078, + "learning_rate": 2.072231357225788e-05, + "loss": 0.7144, + "step": 198710 + }, + { + "epoch": 1.7567495889248397, + "grad_norm": 2.609830141067505, + "learning_rate": 2.072084018458601e-05, + "loss": 0.5783, + "step": 198720 + }, + { + "epoch": 1.7568379921851518, + "grad_norm": 1.1245803833007812, + "learning_rate": 2.0719366796914137e-05, + "loss": 0.4985, + "step": 198730 + }, + { + "epoch": 1.756926395445464, + "grad_norm": 10.702179908752441, + "learning_rate": 2.071789340924227e-05, + "loss": 0.5078, + "step": 198740 + }, + { + "epoch": 1.7570147987057763, + "grad_norm": 2.8106372356414795, + "learning_rate": 2.0716420021570397e-05, + "loss": 0.5444, + "step": 198750 + }, + { + "epoch": 1.7571032019660886, + "grad_norm": 3.579751968383789, + "learning_rate": 2.0714946633898525e-05, + "loss": 0.6045, + "step": 198760 + }, + { + "epoch": 1.7571916052264007, + "grad_norm": 2.9466452598571777, + "learning_rate": 2.0713473246226657e-05, + "loss": 0.5714, + "step": 198770 + }, + { + "epoch": 1.7572800084867128, + "grad_norm": 5.917065143585205, + "learning_rate": 2.0711999858554786e-05, + "loss": 0.6135, + "step": 198780 + }, + { + "epoch": 1.7573684117470252, + "grad_norm": 1.6902612447738647, + "learning_rate": 2.0710526470882914e-05, + "loss": 0.5649, + "step": 198790 + }, + { + "epoch": 1.7574568150073375, + "grad_norm": 9.794297218322754, + "learning_rate": 2.0709053083211046e-05, + "loss": 0.5662, + "step": 198800 + }, + { + "epoch": 1.7575452182676496, + "grad_norm": 2.114468574523926, + "learning_rate": 2.0707579695539174e-05, + "loss": 0.6114, + "step": 198810 + }, + { + "epoch": 1.757633621527962, + "grad_norm": 3.443908452987671, + "learning_rate": 2.0706106307867302e-05, + "loss": 0.7409, + "step": 198820 + }, + { + "epoch": 1.7577220247882743, + "grad_norm": 4.537910461425781, + "learning_rate": 2.070463292019543e-05, + "loss": 0.6048, + "step": 198830 + }, + { + "epoch": 1.7578104280485864, + "grad_norm": 1.9517021179199219, + "learning_rate": 2.0703159532523562e-05, + "loss": 0.5075, + "step": 198840 + }, + { + "epoch": 1.7578988313088986, + "grad_norm": 0.9750070571899414, + "learning_rate": 2.070168614485169e-05, + "loss": 0.7622, + "step": 198850 + }, + { + "epoch": 1.757987234569211, + "grad_norm": 2.8106045722961426, + "learning_rate": 2.070021275717982e-05, + "loss": 0.6603, + "step": 198860 + }, + { + "epoch": 1.7580756378295233, + "grad_norm": 1.1765936613082886, + "learning_rate": 2.069873936950795e-05, + "loss": 0.5412, + "step": 198870 + }, + { + "epoch": 1.7581640410898354, + "grad_norm": 8.268555641174316, + "learning_rate": 2.069726598183608e-05, + "loss": 0.6645, + "step": 198880 + }, + { + "epoch": 1.7582524443501475, + "grad_norm": 1.3632766008377075, + "learning_rate": 2.0695792594164208e-05, + "loss": 0.5744, + "step": 198890 + }, + { + "epoch": 1.7583408476104598, + "grad_norm": 1.5119342803955078, + "learning_rate": 2.0694319206492336e-05, + "loss": 0.6186, + "step": 198900 + }, + { + "epoch": 1.7584292508707722, + "grad_norm": 1.6961848735809326, + "learning_rate": 2.0692845818820468e-05, + "loss": 0.5894, + "step": 198910 + }, + { + "epoch": 1.7585176541310843, + "grad_norm": 5.72066593170166, + "learning_rate": 2.0691372431148596e-05, + "loss": 0.6347, + "step": 198920 + }, + { + "epoch": 1.7586060573913966, + "grad_norm": 0.654653787612915, + "learning_rate": 2.0689899043476724e-05, + "loss": 0.5837, + "step": 198930 + }, + { + "epoch": 1.758694460651709, + "grad_norm": 4.1402668952941895, + "learning_rate": 2.0688425655804856e-05, + "loss": 0.6437, + "step": 198940 + }, + { + "epoch": 1.758782863912021, + "grad_norm": 2.6822288036346436, + "learning_rate": 2.0686952268132985e-05, + "loss": 0.6118, + "step": 198950 + }, + { + "epoch": 1.7588712671723332, + "grad_norm": 2.4100704193115234, + "learning_rate": 2.0685478880461113e-05, + "loss": 0.6775, + "step": 198960 + }, + { + "epoch": 1.7589596704326456, + "grad_norm": 4.156137943267822, + "learning_rate": 2.068400549278924e-05, + "loss": 0.5525, + "step": 198970 + }, + { + "epoch": 1.759048073692958, + "grad_norm": 2.91312837600708, + "learning_rate": 2.0682532105117373e-05, + "loss": 0.5631, + "step": 198980 + }, + { + "epoch": 1.75913647695327, + "grad_norm": 5.039311408996582, + "learning_rate": 2.06810587174455e-05, + "loss": 0.4401, + "step": 198990 + }, + { + "epoch": 1.7592248802135821, + "grad_norm": 5.100133895874023, + "learning_rate": 2.067958532977363e-05, + "loss": 0.5406, + "step": 199000 + }, + { + "epoch": 1.7593132834738945, + "grad_norm": 2.2148382663726807, + "learning_rate": 2.0678111942101758e-05, + "loss": 0.5846, + "step": 199010 + }, + { + "epoch": 1.7594016867342068, + "grad_norm": 3.281672239303589, + "learning_rate": 2.067663855442989e-05, + "loss": 0.5773, + "step": 199020 + }, + { + "epoch": 1.759490089994519, + "grad_norm": 2.9445834159851074, + "learning_rate": 2.0675165166758018e-05, + "loss": 0.6853, + "step": 199030 + }, + { + "epoch": 1.7595784932548313, + "grad_norm": 1.9498964548110962, + "learning_rate": 2.0673691779086147e-05, + "loss": 0.7405, + "step": 199040 + }, + { + "epoch": 1.7596668965151436, + "grad_norm": 7.0430169105529785, + "learning_rate": 2.0672218391414278e-05, + "loss": 0.6344, + "step": 199050 + }, + { + "epoch": 1.7597552997754558, + "grad_norm": 1.1378570795059204, + "learning_rate": 2.0670745003742407e-05, + "loss": 0.5853, + "step": 199060 + }, + { + "epoch": 1.7598437030357679, + "grad_norm": 1.6694504022598267, + "learning_rate": 2.0669271616070535e-05, + "loss": 0.6564, + "step": 199070 + }, + { + "epoch": 1.7599321062960802, + "grad_norm": 4.245837211608887, + "learning_rate": 2.0667798228398663e-05, + "loss": 0.5587, + "step": 199080 + }, + { + "epoch": 1.7600205095563926, + "grad_norm": 2.624056577682495, + "learning_rate": 2.0666324840726795e-05, + "loss": 0.5788, + "step": 199090 + }, + { + "epoch": 1.7601089128167047, + "grad_norm": 5.049633502960205, + "learning_rate": 2.0664851453054923e-05, + "loss": 0.55, + "step": 199100 + }, + { + "epoch": 1.7601973160770168, + "grad_norm": 8.455689430236816, + "learning_rate": 2.0663378065383052e-05, + "loss": 0.8119, + "step": 199110 + }, + { + "epoch": 1.7602857193373291, + "grad_norm": 4.369366645812988, + "learning_rate": 2.0661904677711184e-05, + "loss": 0.7053, + "step": 199120 + }, + { + "epoch": 1.7603741225976415, + "grad_norm": 2.9583048820495605, + "learning_rate": 2.0660431290039312e-05, + "loss": 0.4824, + "step": 199130 + }, + { + "epoch": 1.7604625258579536, + "grad_norm": 1.6215863227844238, + "learning_rate": 2.065895790236744e-05, + "loss": 0.4069, + "step": 199140 + }, + { + "epoch": 1.7605509291182657, + "grad_norm": 11.74459457397461, + "learning_rate": 2.065748451469557e-05, + "loss": 0.4307, + "step": 199150 + }, + { + "epoch": 1.7606393323785783, + "grad_norm": 1.71657395362854, + "learning_rate": 2.06560111270237e-05, + "loss": 0.5258, + "step": 199160 + }, + { + "epoch": 1.7607277356388904, + "grad_norm": 12.330255508422852, + "learning_rate": 2.065453773935183e-05, + "loss": 0.5932, + "step": 199170 + }, + { + "epoch": 1.7608161388992025, + "grad_norm": 2.850137710571289, + "learning_rate": 2.0653064351679957e-05, + "loss": 0.7165, + "step": 199180 + }, + { + "epoch": 1.7609045421595149, + "grad_norm": 1.646945595741272, + "learning_rate": 2.0651590964008085e-05, + "loss": 0.5961, + "step": 199190 + }, + { + "epoch": 1.7609929454198272, + "grad_norm": 1.6394953727722168, + "learning_rate": 2.0650117576336217e-05, + "loss": 0.5964, + "step": 199200 + }, + { + "epoch": 1.7610813486801393, + "grad_norm": 1.420682668685913, + "learning_rate": 2.0648644188664345e-05, + "loss": 0.5166, + "step": 199210 + }, + { + "epoch": 1.7611697519404514, + "grad_norm": 1.4977213144302368, + "learning_rate": 2.0647170800992474e-05, + "loss": 0.5271, + "step": 199220 + }, + { + "epoch": 1.7612581552007638, + "grad_norm": 1.1657967567443848, + "learning_rate": 2.0645697413320606e-05, + "loss": 0.585, + "step": 199230 + }, + { + "epoch": 1.7613465584610761, + "grad_norm": 4.959208011627197, + "learning_rate": 2.0644224025648734e-05, + "loss": 0.616, + "step": 199240 + }, + { + "epoch": 1.7614349617213882, + "grad_norm": 2.369187593460083, + "learning_rate": 2.0642750637976862e-05, + "loss": 0.5312, + "step": 199250 + }, + { + "epoch": 1.7615233649817004, + "grad_norm": 3.511531114578247, + "learning_rate": 2.064127725030499e-05, + "loss": 0.5762, + "step": 199260 + }, + { + "epoch": 1.7616117682420127, + "grad_norm": 1.6372212171554565, + "learning_rate": 2.0639803862633122e-05, + "loss": 0.6159, + "step": 199270 + }, + { + "epoch": 1.761700171502325, + "grad_norm": 2.4326720237731934, + "learning_rate": 2.063833047496125e-05, + "loss": 0.543, + "step": 199280 + }, + { + "epoch": 1.7617885747626372, + "grad_norm": 2.585983991622925, + "learning_rate": 2.063685708728938e-05, + "loss": 0.548, + "step": 199290 + }, + { + "epoch": 1.7618769780229495, + "grad_norm": 1.2142208814620972, + "learning_rate": 2.0635383699617507e-05, + "loss": 0.5861, + "step": 199300 + }, + { + "epoch": 1.7619653812832619, + "grad_norm": 5.508725166320801, + "learning_rate": 2.063391031194564e-05, + "loss": 0.5201, + "step": 199310 + }, + { + "epoch": 1.762053784543574, + "grad_norm": 3.581573724746704, + "learning_rate": 2.0632436924273768e-05, + "loss": 0.6495, + "step": 199320 + }, + { + "epoch": 1.762142187803886, + "grad_norm": 5.776298522949219, + "learning_rate": 2.0630963536601896e-05, + "loss": 0.5674, + "step": 199330 + }, + { + "epoch": 1.7622305910641984, + "grad_norm": 2.4548747539520264, + "learning_rate": 2.0629490148930028e-05, + "loss": 0.6964, + "step": 199340 + }, + { + "epoch": 1.7623189943245108, + "grad_norm": 2.6389870643615723, + "learning_rate": 2.0628016761258156e-05, + "loss": 0.6252, + "step": 199350 + }, + { + "epoch": 1.762407397584823, + "grad_norm": 6.058359622955322, + "learning_rate": 2.0626543373586284e-05, + "loss": 0.5675, + "step": 199360 + }, + { + "epoch": 1.762495800845135, + "grad_norm": 0.8558288812637329, + "learning_rate": 2.0625069985914413e-05, + "loss": 0.6076, + "step": 199370 + }, + { + "epoch": 1.7625842041054474, + "grad_norm": 5.600949287414551, + "learning_rate": 2.0623596598242544e-05, + "loss": 0.5871, + "step": 199380 + }, + { + "epoch": 1.7626726073657597, + "grad_norm": 2.670600414276123, + "learning_rate": 2.0622123210570673e-05, + "loss": 0.5181, + "step": 199390 + }, + { + "epoch": 1.7627610106260718, + "grad_norm": 2.0761938095092773, + "learning_rate": 2.06206498228988e-05, + "loss": 0.5393, + "step": 199400 + }, + { + "epoch": 1.7628494138863842, + "grad_norm": 3.9695093631744385, + "learning_rate": 2.0619176435226933e-05, + "loss": 0.6408, + "step": 199410 + }, + { + "epoch": 1.7629378171466965, + "grad_norm": 10.492831230163574, + "learning_rate": 2.061770304755506e-05, + "loss": 0.6675, + "step": 199420 + }, + { + "epoch": 1.7630262204070086, + "grad_norm": 4.89743185043335, + "learning_rate": 2.061622965988319e-05, + "loss": 0.5598, + "step": 199430 + }, + { + "epoch": 1.7631146236673207, + "grad_norm": 2.589043378829956, + "learning_rate": 2.0614756272211318e-05, + "loss": 0.653, + "step": 199440 + }, + { + "epoch": 1.763203026927633, + "grad_norm": 1.5037132501602173, + "learning_rate": 2.061328288453945e-05, + "loss": 0.6573, + "step": 199450 + }, + { + "epoch": 1.7632914301879454, + "grad_norm": 2.9797708988189697, + "learning_rate": 2.0611809496867578e-05, + "loss": 0.6749, + "step": 199460 + }, + { + "epoch": 1.7633798334482575, + "grad_norm": 1.983686089515686, + "learning_rate": 2.0610336109195706e-05, + "loss": 0.6909, + "step": 199470 + }, + { + "epoch": 1.7634682367085697, + "grad_norm": 1.4411038160324097, + "learning_rate": 2.0608862721523835e-05, + "loss": 0.5829, + "step": 199480 + }, + { + "epoch": 1.763556639968882, + "grad_norm": 1.2027361392974854, + "learning_rate": 2.0607389333851966e-05, + "loss": 0.5746, + "step": 199490 + }, + { + "epoch": 1.7636450432291944, + "grad_norm": 1.7734391689300537, + "learning_rate": 2.0605915946180095e-05, + "loss": 0.7385, + "step": 199500 + }, + { + "epoch": 1.7637334464895065, + "grad_norm": 1.9979277849197388, + "learning_rate": 2.0604442558508223e-05, + "loss": 0.5195, + "step": 199510 + }, + { + "epoch": 1.7638218497498188, + "grad_norm": 2.0859720706939697, + "learning_rate": 2.0602969170836355e-05, + "loss": 0.6381, + "step": 199520 + }, + { + "epoch": 1.7639102530101312, + "grad_norm": 2.4914655685424805, + "learning_rate": 2.0601495783164483e-05, + "loss": 0.545, + "step": 199530 + }, + { + "epoch": 1.7639986562704433, + "grad_norm": 2.902784824371338, + "learning_rate": 2.060002239549261e-05, + "loss": 0.4389, + "step": 199540 + }, + { + "epoch": 1.7640870595307554, + "grad_norm": 1.6960073709487915, + "learning_rate": 2.059854900782074e-05, + "loss": 0.619, + "step": 199550 + }, + { + "epoch": 1.7641754627910677, + "grad_norm": 2.486156940460205, + "learning_rate": 2.0597075620148872e-05, + "loss": 0.5329, + "step": 199560 + }, + { + "epoch": 1.76426386605138, + "grad_norm": 3.1031014919281006, + "learning_rate": 2.0595602232477e-05, + "loss": 0.5805, + "step": 199570 + }, + { + "epoch": 1.7643522693116922, + "grad_norm": 1.4590438604354858, + "learning_rate": 2.059412884480513e-05, + "loss": 0.6192, + "step": 199580 + }, + { + "epoch": 1.7644406725720043, + "grad_norm": 2.6685523986816406, + "learning_rate": 2.059265545713326e-05, + "loss": 0.5931, + "step": 199590 + }, + { + "epoch": 1.7645290758323167, + "grad_norm": 4.324309825897217, + "learning_rate": 2.059118206946139e-05, + "loss": 0.6099, + "step": 199600 + }, + { + "epoch": 1.764617479092629, + "grad_norm": 1.6499725580215454, + "learning_rate": 2.0589708681789517e-05, + "loss": 0.4791, + "step": 199610 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 2.315854072570801, + "learning_rate": 2.058823529411765e-05, + "loss": 0.6319, + "step": 199620 + }, + { + "epoch": 1.7647942856132535, + "grad_norm": 3.2634072303771973, + "learning_rate": 2.0586761906445777e-05, + "loss": 0.586, + "step": 199630 + }, + { + "epoch": 1.7648826888735658, + "grad_norm": 4.387502193450928, + "learning_rate": 2.0585288518773905e-05, + "loss": 0.5497, + "step": 199640 + }, + { + "epoch": 1.764971092133878, + "grad_norm": 2.499368190765381, + "learning_rate": 2.0583815131102037e-05, + "loss": 0.6791, + "step": 199650 + }, + { + "epoch": 1.76505949539419, + "grad_norm": 1.9002959728240967, + "learning_rate": 2.0582341743430165e-05, + "loss": 0.5873, + "step": 199660 + }, + { + "epoch": 1.7651478986545024, + "grad_norm": 11.747011184692383, + "learning_rate": 2.0580868355758294e-05, + "loss": 0.6854, + "step": 199670 + }, + { + "epoch": 1.7652363019148147, + "grad_norm": 3.0913946628570557, + "learning_rate": 2.0579394968086426e-05, + "loss": 0.6433, + "step": 199680 + }, + { + "epoch": 1.7653247051751269, + "grad_norm": 2.5673446655273438, + "learning_rate": 2.0577921580414554e-05, + "loss": 0.5589, + "step": 199690 + }, + { + "epoch": 1.765413108435439, + "grad_norm": 5.331379413604736, + "learning_rate": 2.0576448192742682e-05, + "loss": 0.5776, + "step": 199700 + }, + { + "epoch": 1.7655015116957513, + "grad_norm": 2.3399834632873535, + "learning_rate": 2.0574974805070814e-05, + "loss": 0.4646, + "step": 199710 + }, + { + "epoch": 1.7655899149560637, + "grad_norm": 5.019132137298584, + "learning_rate": 2.0573501417398942e-05, + "loss": 0.6931, + "step": 199720 + }, + { + "epoch": 1.7656783182163758, + "grad_norm": 10.349445343017578, + "learning_rate": 2.057202802972707e-05, + "loss": 0.6039, + "step": 199730 + }, + { + "epoch": 1.765766721476688, + "grad_norm": 17.549537658691406, + "learning_rate": 2.0570554642055202e-05, + "loss": 0.6005, + "step": 199740 + }, + { + "epoch": 1.7658551247370005, + "grad_norm": 2.8126816749572754, + "learning_rate": 2.056908125438333e-05, + "loss": 0.692, + "step": 199750 + }, + { + "epoch": 1.7659435279973126, + "grad_norm": 3.191575050354004, + "learning_rate": 2.056760786671146e-05, + "loss": 0.5962, + "step": 199760 + }, + { + "epoch": 1.7660319312576247, + "grad_norm": 2.158874750137329, + "learning_rate": 2.056613447903959e-05, + "loss": 0.7582, + "step": 199770 + }, + { + "epoch": 1.766120334517937, + "grad_norm": 1.3560339212417603, + "learning_rate": 2.056466109136772e-05, + "loss": 0.5423, + "step": 199780 + }, + { + "epoch": 1.7662087377782494, + "grad_norm": 2.2288689613342285, + "learning_rate": 2.0563187703695848e-05, + "loss": 0.6104, + "step": 199790 + }, + { + "epoch": 1.7662971410385615, + "grad_norm": 1.4673881530761719, + "learning_rate": 2.0561714316023976e-05, + "loss": 0.6268, + "step": 199800 + }, + { + "epoch": 1.7663855442988736, + "grad_norm": 1.5412003993988037, + "learning_rate": 2.0560240928352108e-05, + "loss": 0.5949, + "step": 199810 + }, + { + "epoch": 1.766473947559186, + "grad_norm": 1.4000840187072754, + "learning_rate": 2.0558767540680236e-05, + "loss": 0.5762, + "step": 199820 + }, + { + "epoch": 1.7665623508194983, + "grad_norm": 1.1794644594192505, + "learning_rate": 2.0557294153008364e-05, + "loss": 0.6181, + "step": 199830 + }, + { + "epoch": 1.7666507540798104, + "grad_norm": 4.9438676834106445, + "learning_rate": 2.0555820765336493e-05, + "loss": 0.5657, + "step": 199840 + }, + { + "epoch": 1.7667391573401225, + "grad_norm": 1.6453328132629395, + "learning_rate": 2.0554347377664625e-05, + "loss": 0.4458, + "step": 199850 + }, + { + "epoch": 1.766827560600435, + "grad_norm": 1.2419538497924805, + "learning_rate": 2.0552873989992753e-05, + "loss": 0.5685, + "step": 199860 + }, + { + "epoch": 1.7669159638607472, + "grad_norm": 1.0484174489974976, + "learning_rate": 2.055140060232088e-05, + "loss": 0.605, + "step": 199870 + }, + { + "epoch": 1.7670043671210593, + "grad_norm": 1.3979978561401367, + "learning_rate": 2.0549927214649013e-05, + "loss": 0.6089, + "step": 199880 + }, + { + "epoch": 1.7670927703813717, + "grad_norm": 2.2974133491516113, + "learning_rate": 2.054845382697714e-05, + "loss": 0.5482, + "step": 199890 + }, + { + "epoch": 1.767181173641684, + "grad_norm": 14.60004997253418, + "learning_rate": 2.054698043930527e-05, + "loss": 0.6268, + "step": 199900 + }, + { + "epoch": 1.7672695769019962, + "grad_norm": 4.126373291015625, + "learning_rate": 2.0545507051633398e-05, + "loss": 0.5838, + "step": 199910 + }, + { + "epoch": 1.7673579801623083, + "grad_norm": 3.4054360389709473, + "learning_rate": 2.054403366396153e-05, + "loss": 0.6252, + "step": 199920 + }, + { + "epoch": 1.7674463834226206, + "grad_norm": 6.496740818023682, + "learning_rate": 2.0542560276289658e-05, + "loss": 0.587, + "step": 199930 + }, + { + "epoch": 1.767534786682933, + "grad_norm": 2.177032947540283, + "learning_rate": 2.0541086888617786e-05, + "loss": 0.5029, + "step": 199940 + }, + { + "epoch": 1.767623189943245, + "grad_norm": 2.3139541149139404, + "learning_rate": 2.0539613500945915e-05, + "loss": 0.6176, + "step": 199950 + }, + { + "epoch": 1.7677115932035572, + "grad_norm": 8.146818161010742, + "learning_rate": 2.0538140113274047e-05, + "loss": 0.6634, + "step": 199960 + }, + { + "epoch": 1.7677999964638695, + "grad_norm": 1.7606213092803955, + "learning_rate": 2.0536666725602175e-05, + "loss": 0.5378, + "step": 199970 + }, + { + "epoch": 1.7678883997241819, + "grad_norm": 3.540778398513794, + "learning_rate": 2.0535193337930303e-05, + "loss": 0.6441, + "step": 199980 + }, + { + "epoch": 1.767976802984494, + "grad_norm": 4.727563858032227, + "learning_rate": 2.0533719950258435e-05, + "loss": 0.6939, + "step": 199990 + }, + { + "epoch": 1.7680652062448063, + "grad_norm": 2.876376152038574, + "learning_rate": 2.0532246562586563e-05, + "loss": 0.5114, + "step": 200000 + }, + { + "epoch": 1.7681536095051187, + "grad_norm": 2.6525161266326904, + "learning_rate": 2.0530773174914692e-05, + "loss": 0.5812, + "step": 200010 + }, + { + "epoch": 1.7682420127654308, + "grad_norm": 3.602836847305298, + "learning_rate": 2.052929978724282e-05, + "loss": 0.4607, + "step": 200020 + }, + { + "epoch": 1.768330416025743, + "grad_norm": 0.9413797855377197, + "learning_rate": 2.0527826399570952e-05, + "loss": 0.5367, + "step": 200030 + }, + { + "epoch": 1.7684188192860553, + "grad_norm": 2.601632833480835, + "learning_rate": 2.052635301189908e-05, + "loss": 0.664, + "step": 200040 + }, + { + "epoch": 1.7685072225463676, + "grad_norm": 7.349104881286621, + "learning_rate": 2.052487962422721e-05, + "loss": 0.5638, + "step": 200050 + }, + { + "epoch": 1.7685956258066797, + "grad_norm": 3.8090150356292725, + "learning_rate": 2.052340623655534e-05, + "loss": 0.5718, + "step": 200060 + }, + { + "epoch": 1.7686840290669918, + "grad_norm": 5.953371047973633, + "learning_rate": 2.052193284888347e-05, + "loss": 0.6368, + "step": 200070 + }, + { + "epoch": 1.7687724323273042, + "grad_norm": 3.4271018505096436, + "learning_rate": 2.0520459461211597e-05, + "loss": 0.5389, + "step": 200080 + }, + { + "epoch": 1.7688608355876165, + "grad_norm": 1.8100402355194092, + "learning_rate": 2.0518986073539725e-05, + "loss": 0.6928, + "step": 200090 + }, + { + "epoch": 1.7689492388479287, + "grad_norm": 8.461389541625977, + "learning_rate": 2.0517512685867857e-05, + "loss": 0.665, + "step": 200100 + }, + { + "epoch": 1.769037642108241, + "grad_norm": 3.565764904022217, + "learning_rate": 2.0516039298195985e-05, + "loss": 0.6226, + "step": 200110 + }, + { + "epoch": 1.7691260453685533, + "grad_norm": 1.8190014362335205, + "learning_rate": 2.0514565910524114e-05, + "loss": 0.5624, + "step": 200120 + }, + { + "epoch": 1.7692144486288655, + "grad_norm": 17.462963104248047, + "learning_rate": 2.0513092522852242e-05, + "loss": 0.4893, + "step": 200130 + }, + { + "epoch": 1.7693028518891776, + "grad_norm": 5.460578441619873, + "learning_rate": 2.0511619135180374e-05, + "loss": 0.5877, + "step": 200140 + }, + { + "epoch": 1.76939125514949, + "grad_norm": 3.6186656951904297, + "learning_rate": 2.0510145747508502e-05, + "loss": 0.6442, + "step": 200150 + }, + { + "epoch": 1.7694796584098023, + "grad_norm": 0.7906415462493896, + "learning_rate": 2.050867235983663e-05, + "loss": 0.6767, + "step": 200160 + }, + { + "epoch": 1.7695680616701144, + "grad_norm": 3.094832420349121, + "learning_rate": 2.0507198972164762e-05, + "loss": 0.6337, + "step": 200170 + }, + { + "epoch": 1.7696564649304265, + "grad_norm": 2.70182204246521, + "learning_rate": 2.050572558449289e-05, + "loss": 0.5912, + "step": 200180 + }, + { + "epoch": 1.7697448681907388, + "grad_norm": 2.904658079147339, + "learning_rate": 2.050425219682102e-05, + "loss": 0.5743, + "step": 200190 + }, + { + "epoch": 1.7698332714510512, + "grad_norm": 7.599824905395508, + "learning_rate": 2.0502778809149147e-05, + "loss": 0.5784, + "step": 200200 + }, + { + "epoch": 1.7699216747113633, + "grad_norm": 2.563567876815796, + "learning_rate": 2.050130542147728e-05, + "loss": 0.476, + "step": 200210 + }, + { + "epoch": 1.7700100779716756, + "grad_norm": 3.0157346725463867, + "learning_rate": 2.0499832033805407e-05, + "loss": 0.5543, + "step": 200220 + }, + { + "epoch": 1.770098481231988, + "grad_norm": 2.0201637744903564, + "learning_rate": 2.0498358646133536e-05, + "loss": 0.6047, + "step": 200230 + }, + { + "epoch": 1.7701868844923, + "grad_norm": 2.5668652057647705, + "learning_rate": 2.0496885258461668e-05, + "loss": 0.7256, + "step": 200240 + }, + { + "epoch": 1.7702752877526122, + "grad_norm": 2.552210569381714, + "learning_rate": 2.0495411870789796e-05, + "loss": 0.6268, + "step": 200250 + }, + { + "epoch": 1.7703636910129246, + "grad_norm": 9.171015739440918, + "learning_rate": 2.0493938483117924e-05, + "loss": 0.5236, + "step": 200260 + }, + { + "epoch": 1.770452094273237, + "grad_norm": 2.7152488231658936, + "learning_rate": 2.0492465095446053e-05, + "loss": 0.6378, + "step": 200270 + }, + { + "epoch": 1.770540497533549, + "grad_norm": 7.269675254821777, + "learning_rate": 2.0490991707774184e-05, + "loss": 0.5716, + "step": 200280 + }, + { + "epoch": 1.7706289007938611, + "grad_norm": 1.826249361038208, + "learning_rate": 2.0489518320102313e-05, + "loss": 0.4896, + "step": 200290 + }, + { + "epoch": 1.7707173040541735, + "grad_norm": 1.7940057516098022, + "learning_rate": 2.048804493243044e-05, + "loss": 0.6147, + "step": 200300 + }, + { + "epoch": 1.7708057073144858, + "grad_norm": 1.2506428956985474, + "learning_rate": 2.048657154475857e-05, + "loss": 0.6026, + "step": 200310 + }, + { + "epoch": 1.770894110574798, + "grad_norm": 5.991634368896484, + "learning_rate": 2.04850981570867e-05, + "loss": 0.4239, + "step": 200320 + }, + { + "epoch": 1.77098251383511, + "grad_norm": 8.469026565551758, + "learning_rate": 2.048362476941483e-05, + "loss": 0.5772, + "step": 200330 + }, + { + "epoch": 1.7710709170954226, + "grad_norm": 2.0407445430755615, + "learning_rate": 2.0482151381742958e-05, + "loss": 0.5582, + "step": 200340 + }, + { + "epoch": 1.7711593203557348, + "grad_norm": 1.7195005416870117, + "learning_rate": 2.048067799407109e-05, + "loss": 0.5878, + "step": 200350 + }, + { + "epoch": 1.7712477236160469, + "grad_norm": 1.9609400033950806, + "learning_rate": 2.0479204606399218e-05, + "loss": 0.4709, + "step": 200360 + }, + { + "epoch": 1.7713361268763592, + "grad_norm": 3.1894423961639404, + "learning_rate": 2.0477731218727346e-05, + "loss": 0.653, + "step": 200370 + }, + { + "epoch": 1.7714245301366716, + "grad_norm": 2.48327898979187, + "learning_rate": 2.0476257831055475e-05, + "loss": 0.6086, + "step": 200380 + }, + { + "epoch": 1.7715129333969837, + "grad_norm": 2.1797211170196533, + "learning_rate": 2.0474784443383606e-05, + "loss": 0.7471, + "step": 200390 + }, + { + "epoch": 1.7716013366572958, + "grad_norm": 1.8306154012680054, + "learning_rate": 2.0473311055711735e-05, + "loss": 0.6601, + "step": 200400 + }, + { + "epoch": 1.7716897399176081, + "grad_norm": 3.669273614883423, + "learning_rate": 2.0471837668039863e-05, + "loss": 0.5518, + "step": 200410 + }, + { + "epoch": 1.7717781431779205, + "grad_norm": 2.3546721935272217, + "learning_rate": 2.047036428036799e-05, + "loss": 0.6494, + "step": 200420 + }, + { + "epoch": 1.7718665464382326, + "grad_norm": 8.568611145019531, + "learning_rate": 2.0468890892696123e-05, + "loss": 0.6568, + "step": 200430 + }, + { + "epoch": 1.7719549496985447, + "grad_norm": 13.436927795410156, + "learning_rate": 2.046741750502425e-05, + "loss": 0.5846, + "step": 200440 + }, + { + "epoch": 1.7720433529588573, + "grad_norm": 23.046981811523438, + "learning_rate": 2.046594411735238e-05, + "loss": 0.6407, + "step": 200450 + }, + { + "epoch": 1.7721317562191694, + "grad_norm": 0.9065667986869812, + "learning_rate": 2.046447072968051e-05, + "loss": 0.5297, + "step": 200460 + }, + { + "epoch": 1.7722201594794815, + "grad_norm": 2.932445526123047, + "learning_rate": 2.046299734200864e-05, + "loss": 0.5539, + "step": 200470 + }, + { + "epoch": 1.7723085627397939, + "grad_norm": 6.372934341430664, + "learning_rate": 2.046152395433677e-05, + "loss": 0.5858, + "step": 200480 + }, + { + "epoch": 1.7723969660001062, + "grad_norm": 1.7840017080307007, + "learning_rate": 2.0460050566664897e-05, + "loss": 0.6135, + "step": 200490 + }, + { + "epoch": 1.7724853692604183, + "grad_norm": 5.092711448669434, + "learning_rate": 2.045857717899303e-05, + "loss": 0.5872, + "step": 200500 + }, + { + "epoch": 1.7725737725207305, + "grad_norm": 4.186411380767822, + "learning_rate": 2.0457103791321157e-05, + "loss": 0.6856, + "step": 200510 + }, + { + "epoch": 1.7726621757810428, + "grad_norm": 4.1263275146484375, + "learning_rate": 2.0455630403649285e-05, + "loss": 0.5244, + "step": 200520 + }, + { + "epoch": 1.7727505790413551, + "grad_norm": 1.3254998922348022, + "learning_rate": 2.0454157015977417e-05, + "loss": 0.5811, + "step": 200530 + }, + { + "epoch": 1.7728389823016673, + "grad_norm": 3.523235559463501, + "learning_rate": 2.0452683628305545e-05, + "loss": 0.7075, + "step": 200540 + }, + { + "epoch": 1.7729273855619794, + "grad_norm": 6.180381774902344, + "learning_rate": 2.0451210240633674e-05, + "loss": 0.4999, + "step": 200550 + }, + { + "epoch": 1.7730157888222917, + "grad_norm": 6.181248664855957, + "learning_rate": 2.0449736852961805e-05, + "loss": 0.6462, + "step": 200560 + }, + { + "epoch": 1.773104192082604, + "grad_norm": 2.5143160820007324, + "learning_rate": 2.0448263465289934e-05, + "loss": 0.6671, + "step": 200570 + }, + { + "epoch": 1.7731925953429162, + "grad_norm": 3.0866026878356934, + "learning_rate": 2.0446790077618062e-05, + "loss": 0.5643, + "step": 200580 + }, + { + "epoch": 1.7732809986032285, + "grad_norm": 3.386183500289917, + "learning_rate": 2.0445316689946194e-05, + "loss": 0.6616, + "step": 200590 + }, + { + "epoch": 1.7733694018635409, + "grad_norm": 8.194581985473633, + "learning_rate": 2.0443843302274322e-05, + "loss": 0.4982, + "step": 200600 + }, + { + "epoch": 1.773457805123853, + "grad_norm": 1.739749550819397, + "learning_rate": 2.044236991460245e-05, + "loss": 0.5744, + "step": 200610 + }, + { + "epoch": 1.773546208384165, + "grad_norm": 5.339415073394775, + "learning_rate": 2.0440896526930582e-05, + "loss": 0.678, + "step": 200620 + }, + { + "epoch": 1.7736346116444774, + "grad_norm": 2.5839920043945312, + "learning_rate": 2.043942313925871e-05, + "loss": 0.574, + "step": 200630 + }, + { + "epoch": 1.7737230149047898, + "grad_norm": 1.4898864030838013, + "learning_rate": 2.043794975158684e-05, + "loss": 0.7052, + "step": 200640 + }, + { + "epoch": 1.773811418165102, + "grad_norm": 1.386272668838501, + "learning_rate": 2.043647636391497e-05, + "loss": 0.4552, + "step": 200650 + }, + { + "epoch": 1.773899821425414, + "grad_norm": 1.7696417570114136, + "learning_rate": 2.04350029762431e-05, + "loss": 0.5599, + "step": 200660 + }, + { + "epoch": 1.7739882246857264, + "grad_norm": 2.029740333557129, + "learning_rate": 2.0433529588571227e-05, + "loss": 0.5294, + "step": 200670 + }, + { + "epoch": 1.7740766279460387, + "grad_norm": 12.559075355529785, + "learning_rate": 2.043205620089936e-05, + "loss": 0.5891, + "step": 200680 + }, + { + "epoch": 1.7741650312063508, + "grad_norm": 6.567686557769775, + "learning_rate": 2.0430582813227488e-05, + "loss": 0.6184, + "step": 200690 + }, + { + "epoch": 1.7742534344666632, + "grad_norm": 5.650191307067871, + "learning_rate": 2.0429109425555616e-05, + "loss": 0.5118, + "step": 200700 + }, + { + "epoch": 1.7743418377269755, + "grad_norm": 4.373534202575684, + "learning_rate": 2.0427636037883748e-05, + "loss": 0.482, + "step": 200710 + }, + { + "epoch": 1.7744302409872876, + "grad_norm": 2.541811466217041, + "learning_rate": 2.0426162650211876e-05, + "loss": 0.5296, + "step": 200720 + }, + { + "epoch": 1.7745186442475998, + "grad_norm": 7.511800289154053, + "learning_rate": 2.0424689262540004e-05, + "loss": 0.5839, + "step": 200730 + }, + { + "epoch": 1.774607047507912, + "grad_norm": 5.731058120727539, + "learning_rate": 2.0423215874868133e-05, + "loss": 0.5132, + "step": 200740 + }, + { + "epoch": 1.7746954507682244, + "grad_norm": 1.3647056818008423, + "learning_rate": 2.0421742487196264e-05, + "loss": 0.7385, + "step": 200750 + }, + { + "epoch": 1.7747838540285366, + "grad_norm": 1.5348931550979614, + "learning_rate": 2.0420269099524393e-05, + "loss": 0.6126, + "step": 200760 + }, + { + "epoch": 1.7748722572888487, + "grad_norm": 11.28753662109375, + "learning_rate": 2.041879571185252e-05, + "loss": 0.6272, + "step": 200770 + }, + { + "epoch": 1.774960660549161, + "grad_norm": 4.322978496551514, + "learning_rate": 2.041732232418065e-05, + "loss": 0.5384, + "step": 200780 + }, + { + "epoch": 1.7750490638094734, + "grad_norm": 1.7618731260299683, + "learning_rate": 2.041584893650878e-05, + "loss": 0.6147, + "step": 200790 + }, + { + "epoch": 1.7751374670697855, + "grad_norm": 16.58060073852539, + "learning_rate": 2.041437554883691e-05, + "loss": 0.649, + "step": 200800 + }, + { + "epoch": 1.7752258703300978, + "grad_norm": 1.241051197052002, + "learning_rate": 2.0412902161165038e-05, + "loss": 0.592, + "step": 200810 + }, + { + "epoch": 1.7753142735904102, + "grad_norm": 2.0913937091827393, + "learning_rate": 2.041142877349317e-05, + "loss": 0.7277, + "step": 200820 + }, + { + "epoch": 1.7754026768507223, + "grad_norm": 2.897819995880127, + "learning_rate": 2.0409955385821298e-05, + "loss": 0.6609, + "step": 200830 + }, + { + "epoch": 1.7754910801110344, + "grad_norm": 2.172450065612793, + "learning_rate": 2.0408481998149426e-05, + "loss": 0.6171, + "step": 200840 + }, + { + "epoch": 1.7755794833713467, + "grad_norm": 1.255169153213501, + "learning_rate": 2.0407008610477555e-05, + "loss": 0.5725, + "step": 200850 + }, + { + "epoch": 1.775667886631659, + "grad_norm": 1.731946587562561, + "learning_rate": 2.0405535222805687e-05, + "loss": 0.5666, + "step": 200860 + }, + { + "epoch": 1.7757562898919712, + "grad_norm": 4.914487838745117, + "learning_rate": 2.0404061835133815e-05, + "loss": 0.701, + "step": 200870 + }, + { + "epoch": 1.7758446931522833, + "grad_norm": 2.3087074756622314, + "learning_rate": 2.0402588447461943e-05, + "loss": 0.5716, + "step": 200880 + }, + { + "epoch": 1.7759330964125957, + "grad_norm": 1.1194977760314941, + "learning_rate": 2.040111505979007e-05, + "loss": 0.5854, + "step": 200890 + }, + { + "epoch": 1.776021499672908, + "grad_norm": 1.1009505987167358, + "learning_rate": 2.0399641672118203e-05, + "loss": 0.565, + "step": 200900 + }, + { + "epoch": 1.7761099029332201, + "grad_norm": 2.3220832347869873, + "learning_rate": 2.039816828444633e-05, + "loss": 0.5319, + "step": 200910 + }, + { + "epoch": 1.7761983061935325, + "grad_norm": 7.326570510864258, + "learning_rate": 2.039669489677446e-05, + "loss": 0.7092, + "step": 200920 + }, + { + "epoch": 1.7762867094538448, + "grad_norm": 2.641589641571045, + "learning_rate": 2.0395221509102592e-05, + "loss": 0.5615, + "step": 200930 + }, + { + "epoch": 1.776375112714157, + "grad_norm": 1.013283610343933, + "learning_rate": 2.039374812143072e-05, + "loss": 0.5636, + "step": 200940 + }, + { + "epoch": 1.776463515974469, + "grad_norm": 1.1645312309265137, + "learning_rate": 2.039227473375885e-05, + "loss": 0.5778, + "step": 200950 + }, + { + "epoch": 1.7765519192347814, + "grad_norm": 2.746457815170288, + "learning_rate": 2.0390801346086977e-05, + "loss": 0.572, + "step": 200960 + }, + { + "epoch": 1.7766403224950937, + "grad_norm": 4.176667213439941, + "learning_rate": 2.038932795841511e-05, + "loss": 0.5899, + "step": 200970 + }, + { + "epoch": 1.7767287257554059, + "grad_norm": 3.438382387161255, + "learning_rate": 2.0387854570743237e-05, + "loss": 0.5251, + "step": 200980 + }, + { + "epoch": 1.776817129015718, + "grad_norm": 3.557908535003662, + "learning_rate": 2.0386381183071365e-05, + "loss": 0.5072, + "step": 200990 + }, + { + "epoch": 1.7769055322760303, + "grad_norm": 6.518289089202881, + "learning_rate": 2.0384907795399497e-05, + "loss": 0.563, + "step": 201000 + }, + { + "epoch": 1.7769939355363427, + "grad_norm": 1.248594880104065, + "learning_rate": 2.0383434407727625e-05, + "loss": 0.5615, + "step": 201010 + }, + { + "epoch": 1.7770823387966548, + "grad_norm": 6.089591979980469, + "learning_rate": 2.0381961020055754e-05, + "loss": 0.5475, + "step": 201020 + }, + { + "epoch": 1.777170742056967, + "grad_norm": 11.350852012634277, + "learning_rate": 2.0380487632383882e-05, + "loss": 0.5913, + "step": 201030 + }, + { + "epoch": 1.7772591453172795, + "grad_norm": 1.9228618144989014, + "learning_rate": 2.0379014244712014e-05, + "loss": 0.5296, + "step": 201040 + }, + { + "epoch": 1.7773475485775916, + "grad_norm": 7.04586935043335, + "learning_rate": 2.0377540857040142e-05, + "loss": 0.6224, + "step": 201050 + }, + { + "epoch": 1.7774359518379037, + "grad_norm": 11.666397094726562, + "learning_rate": 2.037606746936827e-05, + "loss": 0.5715, + "step": 201060 + }, + { + "epoch": 1.777524355098216, + "grad_norm": 15.812749862670898, + "learning_rate": 2.03745940816964e-05, + "loss": 0.611, + "step": 201070 + }, + { + "epoch": 1.7776127583585284, + "grad_norm": 5.996926784515381, + "learning_rate": 2.037312069402453e-05, + "loss": 0.6422, + "step": 201080 + }, + { + "epoch": 1.7777011616188405, + "grad_norm": 1.75562584400177, + "learning_rate": 2.037164730635266e-05, + "loss": 0.5885, + "step": 201090 + }, + { + "epoch": 1.7777895648791526, + "grad_norm": 1.073387622833252, + "learning_rate": 2.0370173918680787e-05, + "loss": 0.6348, + "step": 201100 + }, + { + "epoch": 1.777877968139465, + "grad_norm": 5.5727996826171875, + "learning_rate": 2.036870053100892e-05, + "loss": 0.5176, + "step": 201110 + }, + { + "epoch": 1.7779663713997773, + "grad_norm": 1.9805892705917358, + "learning_rate": 2.0367227143337047e-05, + "loss": 0.694, + "step": 201120 + }, + { + "epoch": 1.7780547746600894, + "grad_norm": 3.733828544616699, + "learning_rate": 2.0365753755665176e-05, + "loss": 0.7036, + "step": 201130 + }, + { + "epoch": 1.7781431779204016, + "grad_norm": 5.081565856933594, + "learning_rate": 2.0364280367993304e-05, + "loss": 0.7147, + "step": 201140 + }, + { + "epoch": 1.778231581180714, + "grad_norm": 3.3621277809143066, + "learning_rate": 2.0362806980321436e-05, + "loss": 0.5959, + "step": 201150 + }, + { + "epoch": 1.7783199844410262, + "grad_norm": 2.002547025680542, + "learning_rate": 2.0361333592649564e-05, + "loss": 0.5854, + "step": 201160 + }, + { + "epoch": 1.7784083877013384, + "grad_norm": 1.1885132789611816, + "learning_rate": 2.0359860204977693e-05, + "loss": 0.6929, + "step": 201170 + }, + { + "epoch": 1.7784967909616507, + "grad_norm": 1.0890119075775146, + "learning_rate": 2.0358386817305824e-05, + "loss": 0.5461, + "step": 201180 + }, + { + "epoch": 1.778585194221963, + "grad_norm": 2.4033539295196533, + "learning_rate": 2.0356913429633953e-05, + "loss": 0.5506, + "step": 201190 + }, + { + "epoch": 1.7786735974822752, + "grad_norm": 4.361912727355957, + "learning_rate": 2.035544004196208e-05, + "loss": 0.6374, + "step": 201200 + }, + { + "epoch": 1.7787620007425873, + "grad_norm": 1.5721828937530518, + "learning_rate": 2.035396665429021e-05, + "loss": 0.5158, + "step": 201210 + }, + { + "epoch": 1.7788504040028996, + "grad_norm": 3.3955421447753906, + "learning_rate": 2.035249326661834e-05, + "loss": 0.8143, + "step": 201220 + }, + { + "epoch": 1.778938807263212, + "grad_norm": 2.908384084701538, + "learning_rate": 2.035101987894647e-05, + "loss": 0.6386, + "step": 201230 + }, + { + "epoch": 1.779027210523524, + "grad_norm": 4.739225387573242, + "learning_rate": 2.0349546491274598e-05, + "loss": 0.674, + "step": 201240 + }, + { + "epoch": 1.7791156137838362, + "grad_norm": 2.748955488204956, + "learning_rate": 2.0348073103602726e-05, + "loss": 0.5438, + "step": 201250 + }, + { + "epoch": 1.7792040170441485, + "grad_norm": 2.6617844104766846, + "learning_rate": 2.0346599715930858e-05, + "loss": 0.6165, + "step": 201260 + }, + { + "epoch": 1.7792924203044609, + "grad_norm": 6.649594306945801, + "learning_rate": 2.0345126328258986e-05, + "loss": 0.4528, + "step": 201270 + }, + { + "epoch": 1.779380823564773, + "grad_norm": 2.950157880783081, + "learning_rate": 2.0343652940587115e-05, + "loss": 0.6148, + "step": 201280 + }, + { + "epoch": 1.7794692268250853, + "grad_norm": 1.084587574005127, + "learning_rate": 2.0342179552915246e-05, + "loss": 0.5, + "step": 201290 + }, + { + "epoch": 1.7795576300853977, + "grad_norm": 0.8753800392150879, + "learning_rate": 2.0340706165243375e-05, + "loss": 0.527, + "step": 201300 + }, + { + "epoch": 1.7796460333457098, + "grad_norm": 1.71090829372406, + "learning_rate": 2.0339232777571503e-05, + "loss": 0.5337, + "step": 201310 + }, + { + "epoch": 1.779734436606022, + "grad_norm": 2.7448437213897705, + "learning_rate": 2.033775938989963e-05, + "loss": 0.5433, + "step": 201320 + }, + { + "epoch": 1.7798228398663343, + "grad_norm": 4.972217082977295, + "learning_rate": 2.0336286002227763e-05, + "loss": 0.7051, + "step": 201330 + }, + { + "epoch": 1.7799112431266466, + "grad_norm": 2.407299518585205, + "learning_rate": 2.033481261455589e-05, + "loss": 0.6354, + "step": 201340 + }, + { + "epoch": 1.7799996463869587, + "grad_norm": 15.180646896362305, + "learning_rate": 2.033333922688402e-05, + "loss": 0.6935, + "step": 201350 + }, + { + "epoch": 1.7800880496472709, + "grad_norm": 1.1638933420181274, + "learning_rate": 2.033186583921215e-05, + "loss": 0.6537, + "step": 201360 + }, + { + "epoch": 1.7801764529075832, + "grad_norm": 2.0126402378082275, + "learning_rate": 2.033039245154028e-05, + "loss": 0.5021, + "step": 201370 + }, + { + "epoch": 1.7802648561678955, + "grad_norm": 1.1321524381637573, + "learning_rate": 2.032891906386841e-05, + "loss": 0.4838, + "step": 201380 + }, + { + "epoch": 1.7803532594282077, + "grad_norm": 4.253654956817627, + "learning_rate": 2.0327445676196537e-05, + "loss": 0.6997, + "step": 201390 + }, + { + "epoch": 1.78044166268852, + "grad_norm": 3.022303581237793, + "learning_rate": 2.032597228852467e-05, + "loss": 0.5443, + "step": 201400 + }, + { + "epoch": 1.7805300659488323, + "grad_norm": 3.3942525386810303, + "learning_rate": 2.0324498900852797e-05, + "loss": 0.479, + "step": 201410 + }, + { + "epoch": 1.7806184692091445, + "grad_norm": 6.061913967132568, + "learning_rate": 2.0323025513180925e-05, + "loss": 0.5148, + "step": 201420 + }, + { + "epoch": 1.7807068724694566, + "grad_norm": 3.260727882385254, + "learning_rate": 2.0321552125509054e-05, + "loss": 0.6581, + "step": 201430 + }, + { + "epoch": 1.780795275729769, + "grad_norm": 5.648933410644531, + "learning_rate": 2.0320078737837185e-05, + "loss": 0.6565, + "step": 201440 + }, + { + "epoch": 1.7808836789900813, + "grad_norm": 2.1177427768707275, + "learning_rate": 2.0318605350165314e-05, + "loss": 0.5784, + "step": 201450 + }, + { + "epoch": 1.7809720822503934, + "grad_norm": 3.3396449089050293, + "learning_rate": 2.0317131962493442e-05, + "loss": 0.6284, + "step": 201460 + }, + { + "epoch": 1.7810604855107055, + "grad_norm": 1.3688477277755737, + "learning_rate": 2.0315658574821574e-05, + "loss": 0.5571, + "step": 201470 + }, + { + "epoch": 1.7811488887710178, + "grad_norm": 1.3339372873306274, + "learning_rate": 2.0314185187149702e-05, + "loss": 0.477, + "step": 201480 + }, + { + "epoch": 1.7812372920313302, + "grad_norm": 23.196277618408203, + "learning_rate": 2.031271179947783e-05, + "loss": 0.5992, + "step": 201490 + }, + { + "epoch": 1.7813256952916423, + "grad_norm": 4.477624893188477, + "learning_rate": 2.0311238411805962e-05, + "loss": 0.6204, + "step": 201500 + }, + { + "epoch": 1.7814140985519547, + "grad_norm": 4.52294921875, + "learning_rate": 2.030976502413409e-05, + "loss": 0.6913, + "step": 201510 + }, + { + "epoch": 1.781502501812267, + "grad_norm": 1.8961204290390015, + "learning_rate": 2.0308291636462222e-05, + "loss": 0.4341, + "step": 201520 + }, + { + "epoch": 1.7815909050725791, + "grad_norm": 3.135321617126465, + "learning_rate": 2.030681824879035e-05, + "loss": 0.5982, + "step": 201530 + }, + { + "epoch": 1.7816793083328912, + "grad_norm": 1.6723718643188477, + "learning_rate": 2.030534486111848e-05, + "loss": 0.6095, + "step": 201540 + }, + { + "epoch": 1.7817677115932036, + "grad_norm": 9.863723754882812, + "learning_rate": 2.030387147344661e-05, + "loss": 0.5759, + "step": 201550 + }, + { + "epoch": 1.781856114853516, + "grad_norm": 3.2073895931243896, + "learning_rate": 2.030239808577474e-05, + "loss": 0.5872, + "step": 201560 + }, + { + "epoch": 1.781944518113828, + "grad_norm": 21.227392196655273, + "learning_rate": 2.0300924698102867e-05, + "loss": 0.5558, + "step": 201570 + }, + { + "epoch": 1.7820329213741402, + "grad_norm": 1.1915196180343628, + "learning_rate": 2.0299451310431e-05, + "loss": 0.6041, + "step": 201580 + }, + { + "epoch": 1.7821213246344525, + "grad_norm": 1.779495120048523, + "learning_rate": 2.0297977922759127e-05, + "loss": 0.6486, + "step": 201590 + }, + { + "epoch": 1.7822097278947648, + "grad_norm": 2.8377809524536133, + "learning_rate": 2.0296504535087256e-05, + "loss": 0.5171, + "step": 201600 + }, + { + "epoch": 1.782298131155077, + "grad_norm": 1.758028268814087, + "learning_rate": 2.0295031147415384e-05, + "loss": 0.5747, + "step": 201610 + }, + { + "epoch": 1.782386534415389, + "grad_norm": 6.24250602722168, + "learning_rate": 2.0293557759743516e-05, + "loss": 0.4505, + "step": 201620 + }, + { + "epoch": 1.7824749376757016, + "grad_norm": 2.263643741607666, + "learning_rate": 2.0292084372071644e-05, + "loss": 0.5904, + "step": 201630 + }, + { + "epoch": 1.7825633409360138, + "grad_norm": 0.8728529214859009, + "learning_rate": 2.0290610984399773e-05, + "loss": 0.7096, + "step": 201640 + }, + { + "epoch": 1.7826517441963259, + "grad_norm": 4.5690460205078125, + "learning_rate": 2.0289137596727904e-05, + "loss": 0.6131, + "step": 201650 + }, + { + "epoch": 1.7827401474566382, + "grad_norm": 1.0904308557510376, + "learning_rate": 2.0287664209056033e-05, + "loss": 0.6517, + "step": 201660 + }, + { + "epoch": 1.7828285507169506, + "grad_norm": 2.5785434246063232, + "learning_rate": 2.028619082138416e-05, + "loss": 0.5849, + "step": 201670 + }, + { + "epoch": 1.7829169539772627, + "grad_norm": 16.86878204345703, + "learning_rate": 2.028471743371229e-05, + "loss": 0.4943, + "step": 201680 + }, + { + "epoch": 1.7830053572375748, + "grad_norm": 2.302427053451538, + "learning_rate": 2.028324404604042e-05, + "loss": 0.6317, + "step": 201690 + }, + { + "epoch": 1.7830937604978871, + "grad_norm": 2.126096248626709, + "learning_rate": 2.028177065836855e-05, + "loss": 0.5878, + "step": 201700 + }, + { + "epoch": 1.7831821637581995, + "grad_norm": 1.160621166229248, + "learning_rate": 2.0280297270696678e-05, + "loss": 0.6054, + "step": 201710 + }, + { + "epoch": 1.7832705670185116, + "grad_norm": 1.9716119766235352, + "learning_rate": 2.0278823883024806e-05, + "loss": 0.4748, + "step": 201720 + }, + { + "epoch": 1.7833589702788237, + "grad_norm": 2.5228166580200195, + "learning_rate": 2.0277350495352938e-05, + "loss": 0.5856, + "step": 201730 + }, + { + "epoch": 1.783447373539136, + "grad_norm": 1.8789089918136597, + "learning_rate": 2.0275877107681066e-05, + "loss": 0.6641, + "step": 201740 + }, + { + "epoch": 1.7835357767994484, + "grad_norm": 2.8637356758117676, + "learning_rate": 2.0274403720009195e-05, + "loss": 0.5891, + "step": 201750 + }, + { + "epoch": 1.7836241800597605, + "grad_norm": 8.53894329071045, + "learning_rate": 2.0272930332337326e-05, + "loss": 0.5297, + "step": 201760 + }, + { + "epoch": 1.7837125833200729, + "grad_norm": 2.2200565338134766, + "learning_rate": 2.0271456944665455e-05, + "loss": 0.5071, + "step": 201770 + }, + { + "epoch": 1.7838009865803852, + "grad_norm": 2.633755683898926, + "learning_rate": 2.0269983556993583e-05, + "loss": 0.643, + "step": 201780 + }, + { + "epoch": 1.7838893898406973, + "grad_norm": 1.5142741203308105, + "learning_rate": 2.026851016932171e-05, + "loss": 0.4403, + "step": 201790 + }, + { + "epoch": 1.7839777931010095, + "grad_norm": 2.4997575283050537, + "learning_rate": 2.0267036781649843e-05, + "loss": 0.5829, + "step": 201800 + }, + { + "epoch": 1.7840661963613218, + "grad_norm": 1.807714581489563, + "learning_rate": 2.026556339397797e-05, + "loss": 0.7249, + "step": 201810 + }, + { + "epoch": 1.7841545996216341, + "grad_norm": 1.2535957098007202, + "learning_rate": 2.02640900063061e-05, + "loss": 0.5603, + "step": 201820 + }, + { + "epoch": 1.7842430028819463, + "grad_norm": 2.556269407272339, + "learning_rate": 2.0262616618634232e-05, + "loss": 0.5667, + "step": 201830 + }, + { + "epoch": 1.7843314061422584, + "grad_norm": 3.98608660697937, + "learning_rate": 2.026114323096236e-05, + "loss": 0.5177, + "step": 201840 + }, + { + "epoch": 1.7844198094025707, + "grad_norm": 1.199278712272644, + "learning_rate": 2.025966984329049e-05, + "loss": 0.4912, + "step": 201850 + }, + { + "epoch": 1.784508212662883, + "grad_norm": 2.08693265914917, + "learning_rate": 2.0258196455618617e-05, + "loss": 0.6314, + "step": 201860 + }, + { + "epoch": 1.7845966159231952, + "grad_norm": 5.786932468414307, + "learning_rate": 2.025672306794675e-05, + "loss": 0.6169, + "step": 201870 + }, + { + "epoch": 1.7846850191835075, + "grad_norm": 1.3747122287750244, + "learning_rate": 2.0255249680274877e-05, + "loss": 0.6953, + "step": 201880 + }, + { + "epoch": 1.7847734224438199, + "grad_norm": 1.3487671613693237, + "learning_rate": 2.0253776292603005e-05, + "loss": 0.6751, + "step": 201890 + }, + { + "epoch": 1.784861825704132, + "grad_norm": 2.8167760372161865, + "learning_rate": 2.0252302904931134e-05, + "loss": 0.6142, + "step": 201900 + }, + { + "epoch": 1.784950228964444, + "grad_norm": 4.856353282928467, + "learning_rate": 2.0250829517259265e-05, + "loss": 0.6448, + "step": 201910 + }, + { + "epoch": 1.7850386322247564, + "grad_norm": 2.498176097869873, + "learning_rate": 2.0249356129587394e-05, + "loss": 0.6247, + "step": 201920 + }, + { + "epoch": 1.7851270354850688, + "grad_norm": 2.4965102672576904, + "learning_rate": 2.0247882741915522e-05, + "loss": 0.6207, + "step": 201930 + }, + { + "epoch": 1.785215438745381, + "grad_norm": 3.6828131675720215, + "learning_rate": 2.0246409354243654e-05, + "loss": 0.6407, + "step": 201940 + }, + { + "epoch": 1.785303842005693, + "grad_norm": 2.793348789215088, + "learning_rate": 2.0244935966571782e-05, + "loss": 0.6003, + "step": 201950 + }, + { + "epoch": 1.7853922452660054, + "grad_norm": 1.7601300477981567, + "learning_rate": 2.024346257889991e-05, + "loss": 0.5063, + "step": 201960 + }, + { + "epoch": 1.7854806485263177, + "grad_norm": 2.9499454498291016, + "learning_rate": 2.024198919122804e-05, + "loss": 0.5465, + "step": 201970 + }, + { + "epoch": 1.7855690517866298, + "grad_norm": 6.607210636138916, + "learning_rate": 2.024051580355617e-05, + "loss": 0.6002, + "step": 201980 + }, + { + "epoch": 1.7856574550469422, + "grad_norm": 1.4262440204620361, + "learning_rate": 2.02390424158843e-05, + "loss": 0.5102, + "step": 201990 + }, + { + "epoch": 1.7857458583072545, + "grad_norm": 1.0396995544433594, + "learning_rate": 2.0237569028212427e-05, + "loss": 0.6954, + "step": 202000 + }, + { + "epoch": 1.7858342615675666, + "grad_norm": 2.5385825634002686, + "learning_rate": 2.0236095640540556e-05, + "loss": 0.6196, + "step": 202010 + }, + { + "epoch": 1.7859226648278788, + "grad_norm": 9.166196823120117, + "learning_rate": 2.0234622252868687e-05, + "loss": 0.5676, + "step": 202020 + }, + { + "epoch": 1.786011068088191, + "grad_norm": 3.221381902694702, + "learning_rate": 2.0233148865196816e-05, + "loss": 0.5448, + "step": 202030 + }, + { + "epoch": 1.7860994713485034, + "grad_norm": 2.0953943729400635, + "learning_rate": 2.0231675477524944e-05, + "loss": 0.5464, + "step": 202040 + }, + { + "epoch": 1.7861878746088156, + "grad_norm": 6.100296497344971, + "learning_rate": 2.0230202089853076e-05, + "loss": 0.5461, + "step": 202050 + }, + { + "epoch": 1.7862762778691277, + "grad_norm": 2.7867844104766846, + "learning_rate": 2.0228728702181204e-05, + "loss": 0.5864, + "step": 202060 + }, + { + "epoch": 1.78636468112944, + "grad_norm": 1.0863789319992065, + "learning_rate": 2.0227255314509333e-05, + "loss": 0.5319, + "step": 202070 + }, + { + "epoch": 1.7864530843897524, + "grad_norm": 1.249638557434082, + "learning_rate": 2.022578192683746e-05, + "loss": 0.6176, + "step": 202080 + }, + { + "epoch": 1.7865414876500645, + "grad_norm": 3.3248021602630615, + "learning_rate": 2.0224308539165593e-05, + "loss": 0.6684, + "step": 202090 + }, + { + "epoch": 1.7866298909103768, + "grad_norm": 2.549376964569092, + "learning_rate": 2.022283515149372e-05, + "loss": 0.4447, + "step": 202100 + }, + { + "epoch": 1.7867182941706892, + "grad_norm": 2.5520517826080322, + "learning_rate": 2.022136176382185e-05, + "loss": 0.5984, + "step": 202110 + }, + { + "epoch": 1.7868066974310013, + "grad_norm": 3.3038833141326904, + "learning_rate": 2.021988837614998e-05, + "loss": 0.6856, + "step": 202120 + }, + { + "epoch": 1.7868951006913134, + "grad_norm": 4.299644947052002, + "learning_rate": 2.021841498847811e-05, + "loss": 0.5675, + "step": 202130 + }, + { + "epoch": 1.7869835039516258, + "grad_norm": 4.799663543701172, + "learning_rate": 2.0216941600806238e-05, + "loss": 0.6688, + "step": 202140 + }, + { + "epoch": 1.787071907211938, + "grad_norm": 2.128201961517334, + "learning_rate": 2.0215468213134366e-05, + "loss": 0.6304, + "step": 202150 + }, + { + "epoch": 1.7871603104722502, + "grad_norm": 8.448138236999512, + "learning_rate": 2.0213994825462498e-05, + "loss": 0.5856, + "step": 202160 + }, + { + "epoch": 1.7872487137325623, + "grad_norm": 1.518473744392395, + "learning_rate": 2.0212521437790626e-05, + "loss": 0.4824, + "step": 202170 + }, + { + "epoch": 1.7873371169928747, + "grad_norm": 3.5119426250457764, + "learning_rate": 2.0211048050118755e-05, + "loss": 0.6747, + "step": 202180 + }, + { + "epoch": 1.787425520253187, + "grad_norm": 5.362548828125, + "learning_rate": 2.0209574662446883e-05, + "loss": 0.6061, + "step": 202190 + }, + { + "epoch": 1.7875139235134991, + "grad_norm": 2.3434488773345947, + "learning_rate": 2.0208101274775015e-05, + "loss": 0.5408, + "step": 202200 + }, + { + "epoch": 1.7876023267738113, + "grad_norm": 2.1973605155944824, + "learning_rate": 2.0206627887103143e-05, + "loss": 0.5496, + "step": 202210 + }, + { + "epoch": 1.7876907300341238, + "grad_norm": 5.755651950836182, + "learning_rate": 2.020515449943127e-05, + "loss": 0.5231, + "step": 202220 + }, + { + "epoch": 1.787779133294436, + "grad_norm": 16.390277862548828, + "learning_rate": 2.0203681111759403e-05, + "loss": 0.6467, + "step": 202230 + }, + { + "epoch": 1.787867536554748, + "grad_norm": 1.874927282333374, + "learning_rate": 2.020220772408753e-05, + "loss": 0.6742, + "step": 202240 + }, + { + "epoch": 1.7879559398150604, + "grad_norm": 5.835158824920654, + "learning_rate": 2.020073433641566e-05, + "loss": 0.5809, + "step": 202250 + }, + { + "epoch": 1.7880443430753727, + "grad_norm": 1.696328043937683, + "learning_rate": 2.0199260948743788e-05, + "loss": 0.5666, + "step": 202260 + }, + { + "epoch": 1.7881327463356849, + "grad_norm": 5.258299827575684, + "learning_rate": 2.019778756107192e-05, + "loss": 0.5327, + "step": 202270 + }, + { + "epoch": 1.788221149595997, + "grad_norm": 1.2496052980422974, + "learning_rate": 2.0196314173400048e-05, + "loss": 0.5961, + "step": 202280 + }, + { + "epoch": 1.7883095528563093, + "grad_norm": 1.8631751537322998, + "learning_rate": 2.0194840785728177e-05, + "loss": 0.5465, + "step": 202290 + }, + { + "epoch": 1.7883979561166217, + "grad_norm": 1.9512073993682861, + "learning_rate": 2.019336739805631e-05, + "loss": 0.6174, + "step": 202300 + }, + { + "epoch": 1.7884863593769338, + "grad_norm": 2.4947447776794434, + "learning_rate": 2.0191894010384437e-05, + "loss": 0.648, + "step": 202310 + }, + { + "epoch": 1.788574762637246, + "grad_norm": 1.7648309469223022, + "learning_rate": 2.0190420622712565e-05, + "loss": 0.5216, + "step": 202320 + }, + { + "epoch": 1.7886631658975582, + "grad_norm": 2.477572441101074, + "learning_rate": 2.0188947235040693e-05, + "loss": 0.6343, + "step": 202330 + }, + { + "epoch": 1.7887515691578706, + "grad_norm": 3.5414538383483887, + "learning_rate": 2.0187473847368825e-05, + "loss": 0.4303, + "step": 202340 + }, + { + "epoch": 1.7888399724181827, + "grad_norm": 2.2090277671813965, + "learning_rate": 2.0186000459696954e-05, + "loss": 0.5601, + "step": 202350 + }, + { + "epoch": 1.788928375678495, + "grad_norm": 1.491808295249939, + "learning_rate": 2.0184527072025082e-05, + "loss": 0.6144, + "step": 202360 + }, + { + "epoch": 1.7890167789388074, + "grad_norm": 1.3663378953933716, + "learning_rate": 2.0183053684353214e-05, + "loss": 0.435, + "step": 202370 + }, + { + "epoch": 1.7891051821991195, + "grad_norm": 12.870560646057129, + "learning_rate": 2.0181580296681342e-05, + "loss": 0.4637, + "step": 202380 + }, + { + "epoch": 1.7891935854594316, + "grad_norm": 3.9255971908569336, + "learning_rate": 2.018010690900947e-05, + "loss": 0.5703, + "step": 202390 + }, + { + "epoch": 1.789281988719744, + "grad_norm": 5.678790092468262, + "learning_rate": 2.0178633521337602e-05, + "loss": 0.6829, + "step": 202400 + }, + { + "epoch": 1.7893703919800563, + "grad_norm": 3.1971166133880615, + "learning_rate": 2.017716013366573e-05, + "loss": 0.5526, + "step": 202410 + }, + { + "epoch": 1.7894587952403684, + "grad_norm": 3.0424091815948486, + "learning_rate": 2.017568674599386e-05, + "loss": 0.5215, + "step": 202420 + }, + { + "epoch": 1.7895471985006806, + "grad_norm": 2.290457248687744, + "learning_rate": 2.017421335832199e-05, + "loss": 0.4215, + "step": 202430 + }, + { + "epoch": 1.789635601760993, + "grad_norm": 4.569860935211182, + "learning_rate": 2.017273997065012e-05, + "loss": 0.6219, + "step": 202440 + }, + { + "epoch": 1.7897240050213052, + "grad_norm": 3.1228394508361816, + "learning_rate": 2.0171266582978247e-05, + "loss": 0.478, + "step": 202450 + }, + { + "epoch": 1.7898124082816174, + "grad_norm": 2.4187965393066406, + "learning_rate": 2.016979319530638e-05, + "loss": 0.4837, + "step": 202460 + }, + { + "epoch": 1.7899008115419297, + "grad_norm": 2.5301291942596436, + "learning_rate": 2.0168319807634507e-05, + "loss": 0.6361, + "step": 202470 + }, + { + "epoch": 1.789989214802242, + "grad_norm": 3.850473642349243, + "learning_rate": 2.0166846419962636e-05, + "loss": 0.6645, + "step": 202480 + }, + { + "epoch": 1.7900776180625542, + "grad_norm": 6.328955173492432, + "learning_rate": 2.0165373032290767e-05, + "loss": 0.6638, + "step": 202490 + }, + { + "epoch": 1.7901660213228663, + "grad_norm": 2.0801162719726562, + "learning_rate": 2.0163899644618896e-05, + "loss": 0.7099, + "step": 202500 + }, + { + "epoch": 1.7902544245831786, + "grad_norm": 40.06092834472656, + "learning_rate": 2.0162426256947024e-05, + "loss": 0.5311, + "step": 202510 + }, + { + "epoch": 1.790342827843491, + "grad_norm": 8.495887756347656, + "learning_rate": 2.0160952869275156e-05, + "loss": 0.6118, + "step": 202520 + }, + { + "epoch": 1.790431231103803, + "grad_norm": 5.132534980773926, + "learning_rate": 2.0159479481603284e-05, + "loss": 0.6539, + "step": 202530 + }, + { + "epoch": 1.7905196343641152, + "grad_norm": 2.4099485874176025, + "learning_rate": 2.0158006093931413e-05, + "loss": 0.67, + "step": 202540 + }, + { + "epoch": 1.7906080376244276, + "grad_norm": 7.6244049072265625, + "learning_rate": 2.015653270625954e-05, + "loss": 0.6178, + "step": 202550 + }, + { + "epoch": 1.79069644088474, + "grad_norm": 3.099921703338623, + "learning_rate": 2.0155059318587673e-05, + "loss": 0.5688, + "step": 202560 + }, + { + "epoch": 1.790784844145052, + "grad_norm": 2.667494058609009, + "learning_rate": 2.01535859309158e-05, + "loss": 0.6662, + "step": 202570 + }, + { + "epoch": 1.7908732474053644, + "grad_norm": 7.829151153564453, + "learning_rate": 2.015211254324393e-05, + "loss": 0.5814, + "step": 202580 + }, + { + "epoch": 1.7909616506656767, + "grad_norm": 3.4630019664764404, + "learning_rate": 2.015063915557206e-05, + "loss": 0.5265, + "step": 202590 + }, + { + "epoch": 1.7910500539259888, + "grad_norm": 1.9397492408752441, + "learning_rate": 2.014916576790019e-05, + "loss": 0.652, + "step": 202600 + }, + { + "epoch": 1.791138457186301, + "grad_norm": 2.0695536136627197, + "learning_rate": 2.0147692380228318e-05, + "loss": 0.6052, + "step": 202610 + }, + { + "epoch": 1.7912268604466133, + "grad_norm": 1.3530495166778564, + "learning_rate": 2.0146218992556446e-05, + "loss": 0.544, + "step": 202620 + }, + { + "epoch": 1.7913152637069256, + "grad_norm": 3.372593402862549, + "learning_rate": 2.0144745604884578e-05, + "loss": 0.4413, + "step": 202630 + }, + { + "epoch": 1.7914036669672377, + "grad_norm": 10.179778099060059, + "learning_rate": 2.0143272217212706e-05, + "loss": 0.6925, + "step": 202640 + }, + { + "epoch": 1.7914920702275499, + "grad_norm": 1.1542190313339233, + "learning_rate": 2.0141798829540835e-05, + "loss": 0.5617, + "step": 202650 + }, + { + "epoch": 1.7915804734878622, + "grad_norm": 8.816036224365234, + "learning_rate": 2.0140325441868963e-05, + "loss": 0.6547, + "step": 202660 + }, + { + "epoch": 1.7916688767481745, + "grad_norm": 1.5335596799850464, + "learning_rate": 2.0138852054197095e-05, + "loss": 0.472, + "step": 202670 + }, + { + "epoch": 1.7917572800084867, + "grad_norm": 2.918468475341797, + "learning_rate": 2.0137378666525223e-05, + "loss": 0.6504, + "step": 202680 + }, + { + "epoch": 1.791845683268799, + "grad_norm": 4.687645435333252, + "learning_rate": 2.013590527885335e-05, + "loss": 0.6655, + "step": 202690 + }, + { + "epoch": 1.7919340865291113, + "grad_norm": 2.681749105453491, + "learning_rate": 2.0134431891181483e-05, + "loss": 0.5321, + "step": 202700 + }, + { + "epoch": 1.7920224897894235, + "grad_norm": 0.9618099927902222, + "learning_rate": 2.013295850350961e-05, + "loss": 0.6792, + "step": 202710 + }, + { + "epoch": 1.7921108930497356, + "grad_norm": 1.7380198240280151, + "learning_rate": 2.013148511583774e-05, + "loss": 0.6196, + "step": 202720 + }, + { + "epoch": 1.792199296310048, + "grad_norm": 2.081829786300659, + "learning_rate": 2.0130011728165868e-05, + "loss": 0.5425, + "step": 202730 + }, + { + "epoch": 1.7922876995703603, + "grad_norm": 1.2693742513656616, + "learning_rate": 2.0128538340494e-05, + "loss": 0.5532, + "step": 202740 + }, + { + "epoch": 1.7923761028306724, + "grad_norm": 3.4703986644744873, + "learning_rate": 2.012706495282213e-05, + "loss": 0.5527, + "step": 202750 + }, + { + "epoch": 1.7924645060909845, + "grad_norm": 8.27893352508545, + "learning_rate": 2.0125591565150257e-05, + "loss": 0.6402, + "step": 202760 + }, + { + "epoch": 1.7925529093512969, + "grad_norm": 1.1515133380889893, + "learning_rate": 2.012411817747839e-05, + "loss": 0.5985, + "step": 202770 + }, + { + "epoch": 1.7926413126116092, + "grad_norm": 1.263790488243103, + "learning_rate": 2.0122644789806517e-05, + "loss": 0.5517, + "step": 202780 + }, + { + "epoch": 1.7927297158719213, + "grad_norm": 5.390106678009033, + "learning_rate": 2.0121171402134645e-05, + "loss": 0.6238, + "step": 202790 + }, + { + "epoch": 1.7928181191322334, + "grad_norm": 4.11129093170166, + "learning_rate": 2.0119698014462774e-05, + "loss": 0.6483, + "step": 202800 + }, + { + "epoch": 1.792906522392546, + "grad_norm": 2.6334662437438965, + "learning_rate": 2.0118224626790905e-05, + "loss": 0.6785, + "step": 202810 + }, + { + "epoch": 1.7929949256528581, + "grad_norm": 2.90444278717041, + "learning_rate": 2.0116751239119034e-05, + "loss": 0.6102, + "step": 202820 + }, + { + "epoch": 1.7930833289131702, + "grad_norm": 1.446006178855896, + "learning_rate": 2.0115277851447162e-05, + "loss": 0.5848, + "step": 202830 + }, + { + "epoch": 1.7931717321734826, + "grad_norm": 2.223749876022339, + "learning_rate": 2.011380446377529e-05, + "loss": 0.5806, + "step": 202840 + }, + { + "epoch": 1.793260135433795, + "grad_norm": 1.503097414970398, + "learning_rate": 2.0112331076103422e-05, + "loss": 0.5039, + "step": 202850 + }, + { + "epoch": 1.793348538694107, + "grad_norm": 2.7936577796936035, + "learning_rate": 2.011085768843155e-05, + "loss": 0.6021, + "step": 202860 + }, + { + "epoch": 1.7934369419544192, + "grad_norm": 1.772740125656128, + "learning_rate": 2.010938430075968e-05, + "loss": 0.7105, + "step": 202870 + }, + { + "epoch": 1.7935253452147315, + "grad_norm": 1.6509367227554321, + "learning_rate": 2.010791091308781e-05, + "loss": 0.5605, + "step": 202880 + }, + { + "epoch": 1.7936137484750438, + "grad_norm": 1.566184639930725, + "learning_rate": 2.010643752541594e-05, + "loss": 0.55, + "step": 202890 + }, + { + "epoch": 1.793702151735356, + "grad_norm": 3.6672322750091553, + "learning_rate": 2.0104964137744067e-05, + "loss": 0.688, + "step": 202900 + }, + { + "epoch": 1.793790554995668, + "grad_norm": 3.762032985687256, + "learning_rate": 2.0103490750072196e-05, + "loss": 0.5696, + "step": 202910 + }, + { + "epoch": 1.7938789582559804, + "grad_norm": 8.605866432189941, + "learning_rate": 2.0102017362400327e-05, + "loss": 0.6872, + "step": 202920 + }, + { + "epoch": 1.7939673615162928, + "grad_norm": 2.1437554359436035, + "learning_rate": 2.0100543974728456e-05, + "loss": 0.5188, + "step": 202930 + }, + { + "epoch": 1.7940557647766049, + "grad_norm": 8.493627548217773, + "learning_rate": 2.0099070587056584e-05, + "loss": 0.6031, + "step": 202940 + }, + { + "epoch": 1.7941441680369172, + "grad_norm": 12.949837684631348, + "learning_rate": 2.0097597199384716e-05, + "loss": 0.555, + "step": 202950 + }, + { + "epoch": 1.7942325712972296, + "grad_norm": 5.7469353675842285, + "learning_rate": 2.0096123811712844e-05, + "loss": 0.5302, + "step": 202960 + }, + { + "epoch": 1.7943209745575417, + "grad_norm": 2.5767934322357178, + "learning_rate": 2.0094650424040972e-05, + "loss": 0.5887, + "step": 202970 + }, + { + "epoch": 1.7944093778178538, + "grad_norm": 1.2326583862304688, + "learning_rate": 2.00931770363691e-05, + "loss": 0.553, + "step": 202980 + }, + { + "epoch": 1.7944977810781662, + "grad_norm": 5.684199810028076, + "learning_rate": 2.0091703648697233e-05, + "loss": 0.5481, + "step": 202990 + }, + { + "epoch": 1.7945861843384785, + "grad_norm": 3.1634457111358643, + "learning_rate": 2.009023026102536e-05, + "loss": 0.6541, + "step": 203000 + }, + { + "epoch": 1.7946745875987906, + "grad_norm": 1.2563214302062988, + "learning_rate": 2.008875687335349e-05, + "loss": 0.6606, + "step": 203010 + }, + { + "epoch": 1.7947629908591027, + "grad_norm": 13.21400260925293, + "learning_rate": 2.0087283485681618e-05, + "loss": 0.6453, + "step": 203020 + }, + { + "epoch": 1.794851394119415, + "grad_norm": 23.091981887817383, + "learning_rate": 2.008581009800975e-05, + "loss": 0.5445, + "step": 203030 + }, + { + "epoch": 1.7949397973797274, + "grad_norm": 2.3949601650238037, + "learning_rate": 2.0084336710337878e-05, + "loss": 0.7609, + "step": 203040 + }, + { + "epoch": 1.7950282006400395, + "grad_norm": 1.2793152332305908, + "learning_rate": 2.0082863322666006e-05, + "loss": 0.4889, + "step": 203050 + }, + { + "epoch": 1.7951166039003519, + "grad_norm": 6.398225784301758, + "learning_rate": 2.0081389934994138e-05, + "loss": 0.6512, + "step": 203060 + }, + { + "epoch": 1.7952050071606642, + "grad_norm": 3.3708279132843018, + "learning_rate": 2.0079916547322266e-05, + "loss": 0.4781, + "step": 203070 + }, + { + "epoch": 1.7952934104209763, + "grad_norm": 0.9596735239028931, + "learning_rate": 2.0078443159650395e-05, + "loss": 0.5738, + "step": 203080 + }, + { + "epoch": 1.7953818136812885, + "grad_norm": 3.4195425510406494, + "learning_rate": 2.0076969771978523e-05, + "loss": 0.6649, + "step": 203090 + }, + { + "epoch": 1.7954702169416008, + "grad_norm": 2.021829128265381, + "learning_rate": 2.0075496384306655e-05, + "loss": 0.5668, + "step": 203100 + }, + { + "epoch": 1.7955586202019131, + "grad_norm": 2.659977912902832, + "learning_rate": 2.0074022996634783e-05, + "loss": 0.6641, + "step": 203110 + }, + { + "epoch": 1.7956470234622253, + "grad_norm": 7.592244625091553, + "learning_rate": 2.007254960896291e-05, + "loss": 0.6285, + "step": 203120 + }, + { + "epoch": 1.7957354267225374, + "grad_norm": 1.3313415050506592, + "learning_rate": 2.007107622129104e-05, + "loss": 0.469, + "step": 203130 + }, + { + "epoch": 1.7958238299828497, + "grad_norm": 11.043901443481445, + "learning_rate": 2.006960283361917e-05, + "loss": 0.5971, + "step": 203140 + }, + { + "epoch": 1.795912233243162, + "grad_norm": 1.4664943218231201, + "learning_rate": 2.00681294459473e-05, + "loss": 0.544, + "step": 203150 + }, + { + "epoch": 1.7960006365034742, + "grad_norm": 1.6054465770721436, + "learning_rate": 2.0066656058275428e-05, + "loss": 0.666, + "step": 203160 + }, + { + "epoch": 1.7960890397637865, + "grad_norm": 6.42251443862915, + "learning_rate": 2.006518267060356e-05, + "loss": 0.5357, + "step": 203170 + }, + { + "epoch": 1.7961774430240989, + "grad_norm": 5.198081970214844, + "learning_rate": 2.0063709282931688e-05, + "loss": 0.6537, + "step": 203180 + }, + { + "epoch": 1.796265846284411, + "grad_norm": 1.8932801485061646, + "learning_rate": 2.0062235895259817e-05, + "loss": 0.6992, + "step": 203190 + }, + { + "epoch": 1.7963542495447231, + "grad_norm": 2.0405161380767822, + "learning_rate": 2.0060762507587945e-05, + "loss": 0.6569, + "step": 203200 + }, + { + "epoch": 1.7964426528050355, + "grad_norm": 1.3182144165039062, + "learning_rate": 2.0059289119916077e-05, + "loss": 0.4165, + "step": 203210 + }, + { + "epoch": 1.7965310560653478, + "grad_norm": 3.6779301166534424, + "learning_rate": 2.0057815732244205e-05, + "loss": 0.6144, + "step": 203220 + }, + { + "epoch": 1.79661945932566, + "grad_norm": 2.590069055557251, + "learning_rate": 2.0056342344572333e-05, + "loss": 0.5455, + "step": 203230 + }, + { + "epoch": 1.796707862585972, + "grad_norm": 5.057015895843506, + "learning_rate": 2.0054868956900465e-05, + "loss": 0.5215, + "step": 203240 + }, + { + "epoch": 1.7967962658462844, + "grad_norm": 4.856685161590576, + "learning_rate": 2.0053395569228593e-05, + "loss": 0.8774, + "step": 203250 + }, + { + "epoch": 1.7968846691065967, + "grad_norm": 3.5035128593444824, + "learning_rate": 2.0051922181556722e-05, + "loss": 0.4991, + "step": 203260 + }, + { + "epoch": 1.7969730723669088, + "grad_norm": 2.6281065940856934, + "learning_rate": 2.005044879388485e-05, + "loss": 0.5703, + "step": 203270 + }, + { + "epoch": 1.7970614756272212, + "grad_norm": 3.6252622604370117, + "learning_rate": 2.0048975406212982e-05, + "loss": 0.6669, + "step": 203280 + }, + { + "epoch": 1.7971498788875335, + "grad_norm": 4.038813591003418, + "learning_rate": 2.004750201854111e-05, + "loss": 0.6555, + "step": 203290 + }, + { + "epoch": 1.7972382821478456, + "grad_norm": 1.6726902723312378, + "learning_rate": 2.004602863086924e-05, + "loss": 0.5421, + "step": 203300 + }, + { + "epoch": 1.7973266854081578, + "grad_norm": 8.145662307739258, + "learning_rate": 2.004455524319737e-05, + "loss": 0.6577, + "step": 203310 + }, + { + "epoch": 1.79741508866847, + "grad_norm": 1.3114120960235596, + "learning_rate": 2.00430818555255e-05, + "loss": 0.5077, + "step": 203320 + }, + { + "epoch": 1.7975034919287824, + "grad_norm": 3.426360845565796, + "learning_rate": 2.0041608467853627e-05, + "loss": 0.6362, + "step": 203330 + }, + { + "epoch": 1.7975918951890946, + "grad_norm": 1.7572712898254395, + "learning_rate": 2.004013508018176e-05, + "loss": 0.568, + "step": 203340 + }, + { + "epoch": 1.7976802984494067, + "grad_norm": 2.663112163543701, + "learning_rate": 2.0038661692509887e-05, + "loss": 0.5625, + "step": 203350 + }, + { + "epoch": 1.797768701709719, + "grad_norm": 3.7881641387939453, + "learning_rate": 2.0037188304838016e-05, + "loss": 0.6323, + "step": 203360 + }, + { + "epoch": 1.7978571049700314, + "grad_norm": 2.837618350982666, + "learning_rate": 2.0035714917166147e-05, + "loss": 0.6865, + "step": 203370 + }, + { + "epoch": 1.7979455082303435, + "grad_norm": 2.7353949546813965, + "learning_rate": 2.0034241529494276e-05, + "loss": 0.6266, + "step": 203380 + }, + { + "epoch": 1.7980339114906556, + "grad_norm": 3.1415469646453857, + "learning_rate": 2.0032768141822404e-05, + "loss": 0.6028, + "step": 203390 + }, + { + "epoch": 1.7981223147509682, + "grad_norm": 3.9146194458007812, + "learning_rate": 2.0031294754150536e-05, + "loss": 0.493, + "step": 203400 + }, + { + "epoch": 1.7982107180112803, + "grad_norm": 0.9928793907165527, + "learning_rate": 2.0029821366478664e-05, + "loss": 0.5807, + "step": 203410 + }, + { + "epoch": 1.7982991212715924, + "grad_norm": 2.017794132232666, + "learning_rate": 2.0028347978806792e-05, + "loss": 0.5576, + "step": 203420 + }, + { + "epoch": 1.7983875245319048, + "grad_norm": 9.675992012023926, + "learning_rate": 2.0026874591134924e-05, + "loss": 0.5098, + "step": 203430 + }, + { + "epoch": 1.798475927792217, + "grad_norm": 0.9910358786582947, + "learning_rate": 2.0025401203463053e-05, + "loss": 0.6325, + "step": 203440 + }, + { + "epoch": 1.7985643310525292, + "grad_norm": 3.145650625228882, + "learning_rate": 2.002392781579118e-05, + "loss": 0.5242, + "step": 203450 + }, + { + "epoch": 1.7986527343128413, + "grad_norm": 13.919158935546875, + "learning_rate": 2.0022454428119313e-05, + "loss": 0.6918, + "step": 203460 + }, + { + "epoch": 1.7987411375731537, + "grad_norm": 6.881532192230225, + "learning_rate": 2.002098104044744e-05, + "loss": 0.6199, + "step": 203470 + }, + { + "epoch": 1.798829540833466, + "grad_norm": 1.532932996749878, + "learning_rate": 2.001950765277557e-05, + "loss": 0.5106, + "step": 203480 + }, + { + "epoch": 1.7989179440937781, + "grad_norm": 3.1638576984405518, + "learning_rate": 2.0018034265103698e-05, + "loss": 0.5734, + "step": 203490 + }, + { + "epoch": 1.7990063473540903, + "grad_norm": 1.9062272310256958, + "learning_rate": 2.001656087743183e-05, + "loss": 0.7303, + "step": 203500 + }, + { + "epoch": 1.7990947506144026, + "grad_norm": 2.4404211044311523, + "learning_rate": 2.0015087489759958e-05, + "loss": 0.7045, + "step": 203510 + }, + { + "epoch": 1.799183153874715, + "grad_norm": 1.1322623491287231, + "learning_rate": 2.0013614102088086e-05, + "loss": 0.7148, + "step": 203520 + }, + { + "epoch": 1.799271557135027, + "grad_norm": 3.5437192916870117, + "learning_rate": 2.0012140714416218e-05, + "loss": 0.4968, + "step": 203530 + }, + { + "epoch": 1.7993599603953394, + "grad_norm": 2.8896358013153076, + "learning_rate": 2.0010667326744346e-05, + "loss": 0.5137, + "step": 203540 + }, + { + "epoch": 1.7994483636556518, + "grad_norm": 3.1829965114593506, + "learning_rate": 2.0009193939072475e-05, + "loss": 0.6372, + "step": 203550 + }, + { + "epoch": 1.7995367669159639, + "grad_norm": 4.139323711395264, + "learning_rate": 2.0007720551400603e-05, + "loss": 0.6396, + "step": 203560 + }, + { + "epoch": 1.799625170176276, + "grad_norm": 1.7024682760238647, + "learning_rate": 2.0006247163728735e-05, + "loss": 0.5441, + "step": 203570 + }, + { + "epoch": 1.7997135734365883, + "grad_norm": 4.200142860412598, + "learning_rate": 2.0004773776056863e-05, + "loss": 0.6474, + "step": 203580 + }, + { + "epoch": 1.7998019766969007, + "grad_norm": 7.579667091369629, + "learning_rate": 2.000330038838499e-05, + "loss": 0.6004, + "step": 203590 + }, + { + "epoch": 1.7998903799572128, + "grad_norm": 1.4747802019119263, + "learning_rate": 2.000182700071312e-05, + "loss": 0.6946, + "step": 203600 + }, + { + "epoch": 1.799978783217525, + "grad_norm": 1.8386363983154297, + "learning_rate": 2.000035361304125e-05, + "loss": 0.63, + "step": 203610 + }, + { + "epoch": 1.8000671864778373, + "grad_norm": 1.8821145296096802, + "learning_rate": 1.999888022536938e-05, + "loss": 0.4379, + "step": 203620 + }, + { + "epoch": 1.8001555897381496, + "grad_norm": 0.8716601133346558, + "learning_rate": 1.9997406837697508e-05, + "loss": 0.5653, + "step": 203630 + }, + { + "epoch": 1.8002439929984617, + "grad_norm": 6.133352756500244, + "learning_rate": 1.999593345002564e-05, + "loss": 0.6158, + "step": 203640 + }, + { + "epoch": 1.800332396258774, + "grad_norm": 2.3612663745880127, + "learning_rate": 1.999446006235377e-05, + "loss": 0.6145, + "step": 203650 + }, + { + "epoch": 1.8004207995190864, + "grad_norm": 7.695614814758301, + "learning_rate": 1.9992986674681897e-05, + "loss": 0.7172, + "step": 203660 + }, + { + "epoch": 1.8005092027793985, + "grad_norm": 2.7973742485046387, + "learning_rate": 1.9991513287010025e-05, + "loss": 0.5287, + "step": 203670 + }, + { + "epoch": 1.8005976060397106, + "grad_norm": 2.371340036392212, + "learning_rate": 1.9990039899338157e-05, + "loss": 0.5255, + "step": 203680 + }, + { + "epoch": 1.800686009300023, + "grad_norm": 1.6354972124099731, + "learning_rate": 1.9988566511666285e-05, + "loss": 0.5584, + "step": 203690 + }, + { + "epoch": 1.8007744125603353, + "grad_norm": 1.2594465017318726, + "learning_rate": 1.9987093123994413e-05, + "loss": 0.6533, + "step": 203700 + }, + { + "epoch": 1.8008628158206474, + "grad_norm": 6.658586025238037, + "learning_rate": 1.9985619736322545e-05, + "loss": 0.5343, + "step": 203710 + }, + { + "epoch": 1.8009512190809596, + "grad_norm": 21.469985961914062, + "learning_rate": 1.9984146348650674e-05, + "loss": 0.6156, + "step": 203720 + }, + { + "epoch": 1.801039622341272, + "grad_norm": 2.947948455810547, + "learning_rate": 1.9982672960978802e-05, + "loss": 0.594, + "step": 203730 + }, + { + "epoch": 1.8011280256015842, + "grad_norm": 10.709976196289062, + "learning_rate": 1.998119957330693e-05, + "loss": 0.5257, + "step": 203740 + }, + { + "epoch": 1.8012164288618964, + "grad_norm": 4.042881488800049, + "learning_rate": 1.9979726185635062e-05, + "loss": 0.7276, + "step": 203750 + }, + { + "epoch": 1.8013048321222087, + "grad_norm": 10.027393341064453, + "learning_rate": 1.997825279796319e-05, + "loss": 0.4674, + "step": 203760 + }, + { + "epoch": 1.801393235382521, + "grad_norm": 3.7841732501983643, + "learning_rate": 1.997677941029132e-05, + "loss": 0.5067, + "step": 203770 + }, + { + "epoch": 1.8014816386428332, + "grad_norm": 6.647843837738037, + "learning_rate": 1.9975306022619447e-05, + "loss": 0.6248, + "step": 203780 + }, + { + "epoch": 1.8015700419031453, + "grad_norm": 1.9358712434768677, + "learning_rate": 1.997383263494758e-05, + "loss": 0.5107, + "step": 203790 + }, + { + "epoch": 1.8016584451634576, + "grad_norm": 5.811150074005127, + "learning_rate": 1.9972359247275707e-05, + "loss": 0.5592, + "step": 203800 + }, + { + "epoch": 1.80174684842377, + "grad_norm": 2.651205539703369, + "learning_rate": 1.9970885859603836e-05, + "loss": 0.5803, + "step": 203810 + }, + { + "epoch": 1.801835251684082, + "grad_norm": 8.953720092773438, + "learning_rate": 1.9969412471931967e-05, + "loss": 0.8144, + "step": 203820 + }, + { + "epoch": 1.8019236549443942, + "grad_norm": 4.853878498077393, + "learning_rate": 1.9967939084260096e-05, + "loss": 0.5215, + "step": 203830 + }, + { + "epoch": 1.8020120582047066, + "grad_norm": 2.547917604446411, + "learning_rate": 1.9966465696588224e-05, + "loss": 0.6414, + "step": 203840 + }, + { + "epoch": 1.802100461465019, + "grad_norm": 1.9385179281234741, + "learning_rate": 1.9964992308916352e-05, + "loss": 0.6466, + "step": 203850 + }, + { + "epoch": 1.802188864725331, + "grad_norm": 1.986112117767334, + "learning_rate": 1.9963518921244484e-05, + "loss": 0.6, + "step": 203860 + }, + { + "epoch": 1.8022772679856434, + "grad_norm": 10.715702056884766, + "learning_rate": 1.9962045533572612e-05, + "loss": 0.5021, + "step": 203870 + }, + { + "epoch": 1.8023656712459557, + "grad_norm": 1.070197343826294, + "learning_rate": 1.996057214590074e-05, + "loss": 0.5645, + "step": 203880 + }, + { + "epoch": 1.8024540745062678, + "grad_norm": 2.2143959999084473, + "learning_rate": 1.9959098758228873e-05, + "loss": 0.6572, + "step": 203890 + }, + { + "epoch": 1.80254247776658, + "grad_norm": 4.326207160949707, + "learning_rate": 1.9957625370557e-05, + "loss": 0.5749, + "step": 203900 + }, + { + "epoch": 1.8026308810268923, + "grad_norm": 4.877932548522949, + "learning_rate": 1.995615198288513e-05, + "loss": 0.582, + "step": 203910 + }, + { + "epoch": 1.8027192842872046, + "grad_norm": 2.591742753982544, + "learning_rate": 1.9954678595213258e-05, + "loss": 0.5947, + "step": 203920 + }, + { + "epoch": 1.8028076875475167, + "grad_norm": 3.0118486881256104, + "learning_rate": 1.995320520754139e-05, + "loss": 0.5979, + "step": 203930 + }, + { + "epoch": 1.8028960908078289, + "grad_norm": 11.166177749633789, + "learning_rate": 1.9951731819869518e-05, + "loss": 0.6252, + "step": 203940 + }, + { + "epoch": 1.8029844940681412, + "grad_norm": 8.568842887878418, + "learning_rate": 1.9950258432197646e-05, + "loss": 0.6038, + "step": 203950 + }, + { + "epoch": 1.8030728973284535, + "grad_norm": 3.4849417209625244, + "learning_rate": 1.9948785044525774e-05, + "loss": 0.5048, + "step": 203960 + }, + { + "epoch": 1.8031613005887657, + "grad_norm": 3.739942789077759, + "learning_rate": 1.9947311656853906e-05, + "loss": 0.5712, + "step": 203970 + }, + { + "epoch": 1.8032497038490778, + "grad_norm": 2.258883476257324, + "learning_rate": 1.9945838269182034e-05, + "loss": 0.6069, + "step": 203980 + }, + { + "epoch": 1.8033381071093904, + "grad_norm": 1.2948611974716187, + "learning_rate": 1.9944364881510163e-05, + "loss": 0.5668, + "step": 203990 + }, + { + "epoch": 1.8034265103697025, + "grad_norm": 3.449302911758423, + "learning_rate": 1.9942891493838295e-05, + "loss": 0.6062, + "step": 204000 + }, + { + "epoch": 1.8035149136300146, + "grad_norm": 9.215113639831543, + "learning_rate": 1.9941418106166423e-05, + "loss": 0.562, + "step": 204010 + }, + { + "epoch": 1.803603316890327, + "grad_norm": 1.5984588861465454, + "learning_rate": 1.993994471849455e-05, + "loss": 0.64, + "step": 204020 + }, + { + "epoch": 1.8036917201506393, + "grad_norm": 1.348873496055603, + "learning_rate": 1.993847133082268e-05, + "loss": 0.5534, + "step": 204030 + }, + { + "epoch": 1.8037801234109514, + "grad_norm": 10.261507034301758, + "learning_rate": 1.993699794315081e-05, + "loss": 0.4839, + "step": 204040 + }, + { + "epoch": 1.8038685266712635, + "grad_norm": 5.606393814086914, + "learning_rate": 1.993552455547894e-05, + "loss": 0.5313, + "step": 204050 + }, + { + "epoch": 1.8039569299315759, + "grad_norm": 3.2035329341888428, + "learning_rate": 1.9934051167807068e-05, + "loss": 0.6767, + "step": 204060 + }, + { + "epoch": 1.8040453331918882, + "grad_norm": 2.819847822189331, + "learning_rate": 1.99325777801352e-05, + "loss": 0.6076, + "step": 204070 + }, + { + "epoch": 1.8041337364522003, + "grad_norm": 4.736661434173584, + "learning_rate": 1.9931104392463328e-05, + "loss": 0.7026, + "step": 204080 + }, + { + "epoch": 1.8042221397125124, + "grad_norm": 2.1682775020599365, + "learning_rate": 1.9929631004791457e-05, + "loss": 0.5879, + "step": 204090 + }, + { + "epoch": 1.8043105429728248, + "grad_norm": 1.8323850631713867, + "learning_rate": 1.9928157617119585e-05, + "loss": 0.5783, + "step": 204100 + }, + { + "epoch": 1.8043989462331371, + "grad_norm": 4.539486408233643, + "learning_rate": 1.9926684229447717e-05, + "loss": 0.7798, + "step": 204110 + }, + { + "epoch": 1.8044873494934492, + "grad_norm": 2.930159330368042, + "learning_rate": 1.9925210841775845e-05, + "loss": 0.513, + "step": 204120 + }, + { + "epoch": 1.8045757527537616, + "grad_norm": 2.8496460914611816, + "learning_rate": 1.9923737454103973e-05, + "loss": 0.6242, + "step": 204130 + }, + { + "epoch": 1.804664156014074, + "grad_norm": 6.112209796905518, + "learning_rate": 1.9922264066432102e-05, + "loss": 0.6255, + "step": 204140 + }, + { + "epoch": 1.804752559274386, + "grad_norm": 1.8608163595199585, + "learning_rate": 1.9920790678760233e-05, + "loss": 0.6518, + "step": 204150 + }, + { + "epoch": 1.8048409625346982, + "grad_norm": 1.0296214818954468, + "learning_rate": 1.9919317291088362e-05, + "loss": 0.6478, + "step": 204160 + }, + { + "epoch": 1.8049293657950105, + "grad_norm": 3.8804173469543457, + "learning_rate": 1.991784390341649e-05, + "loss": 0.5584, + "step": 204170 + }, + { + "epoch": 1.8050177690553229, + "grad_norm": 2.2194786071777344, + "learning_rate": 1.9916370515744622e-05, + "loss": 0.5766, + "step": 204180 + }, + { + "epoch": 1.805106172315635, + "grad_norm": 2.818420171737671, + "learning_rate": 1.991489712807275e-05, + "loss": 0.6371, + "step": 204190 + }, + { + "epoch": 1.805194575575947, + "grad_norm": 7.022854804992676, + "learning_rate": 1.991342374040088e-05, + "loss": 0.5697, + "step": 204200 + }, + { + "epoch": 1.8052829788362594, + "grad_norm": 1.2418639659881592, + "learning_rate": 1.9911950352729007e-05, + "loss": 0.6116, + "step": 204210 + }, + { + "epoch": 1.8053713820965718, + "grad_norm": 2.139376163482666, + "learning_rate": 1.991047696505714e-05, + "loss": 0.5006, + "step": 204220 + }, + { + "epoch": 1.805459785356884, + "grad_norm": 2.4272873401641846, + "learning_rate": 1.9909003577385267e-05, + "loss": 0.4331, + "step": 204230 + }, + { + "epoch": 1.8055481886171962, + "grad_norm": 1.1874834299087524, + "learning_rate": 1.9907530189713395e-05, + "loss": 0.4698, + "step": 204240 + }, + { + "epoch": 1.8056365918775086, + "grad_norm": 3.5929088592529297, + "learning_rate": 1.9906056802041527e-05, + "loss": 0.6432, + "step": 204250 + }, + { + "epoch": 1.8057249951378207, + "grad_norm": 1.9485775232315063, + "learning_rate": 1.9904583414369656e-05, + "loss": 0.581, + "step": 204260 + }, + { + "epoch": 1.8058133983981328, + "grad_norm": 2.9153568744659424, + "learning_rate": 1.9903110026697784e-05, + "loss": 0.6045, + "step": 204270 + }, + { + "epoch": 1.8059018016584452, + "grad_norm": 4.08786153793335, + "learning_rate": 1.9901636639025916e-05, + "loss": 0.5731, + "step": 204280 + }, + { + "epoch": 1.8059902049187575, + "grad_norm": 2.3672847747802734, + "learning_rate": 1.9900163251354044e-05, + "loss": 0.661, + "step": 204290 + }, + { + "epoch": 1.8060786081790696, + "grad_norm": 3.9267995357513428, + "learning_rate": 1.9898689863682172e-05, + "loss": 0.6448, + "step": 204300 + }, + { + "epoch": 1.8061670114393817, + "grad_norm": 4.543934345245361, + "learning_rate": 1.9897216476010304e-05, + "loss": 0.6241, + "step": 204310 + }, + { + "epoch": 1.806255414699694, + "grad_norm": 1.5214087963104248, + "learning_rate": 1.9895743088338432e-05, + "loss": 0.6662, + "step": 204320 + }, + { + "epoch": 1.8063438179600064, + "grad_norm": 2.0061838626861572, + "learning_rate": 1.989426970066656e-05, + "loss": 0.5999, + "step": 204330 + }, + { + "epoch": 1.8064322212203185, + "grad_norm": 1.2490204572677612, + "learning_rate": 1.9892796312994692e-05, + "loss": 0.6354, + "step": 204340 + }, + { + "epoch": 1.8065206244806309, + "grad_norm": 4.839119911193848, + "learning_rate": 1.989132292532282e-05, + "loss": 0.6157, + "step": 204350 + }, + { + "epoch": 1.8066090277409432, + "grad_norm": 2.272608995437622, + "learning_rate": 1.988984953765095e-05, + "loss": 0.5604, + "step": 204360 + }, + { + "epoch": 1.8066974310012553, + "grad_norm": 5.739010334014893, + "learning_rate": 1.988837614997908e-05, + "loss": 0.5527, + "step": 204370 + }, + { + "epoch": 1.8067858342615675, + "grad_norm": 1.696432113647461, + "learning_rate": 1.988690276230721e-05, + "loss": 0.6518, + "step": 204380 + }, + { + "epoch": 1.8068742375218798, + "grad_norm": 5.386918067932129, + "learning_rate": 1.9885429374635338e-05, + "loss": 0.6386, + "step": 204390 + }, + { + "epoch": 1.8069626407821922, + "grad_norm": 1.4314566850662231, + "learning_rate": 1.988395598696347e-05, + "loss": 0.6696, + "step": 204400 + }, + { + "epoch": 1.8070510440425043, + "grad_norm": 1.2361992597579956, + "learning_rate": 1.9882482599291598e-05, + "loss": 0.612, + "step": 204410 + }, + { + "epoch": 1.8071394473028164, + "grad_norm": 3.845834732055664, + "learning_rate": 1.9881009211619726e-05, + "loss": 0.5499, + "step": 204420 + }, + { + "epoch": 1.8072278505631287, + "grad_norm": 2.6744496822357178, + "learning_rate": 1.9879535823947854e-05, + "loss": 0.6015, + "step": 204430 + }, + { + "epoch": 1.807316253823441, + "grad_norm": 1.3522708415985107, + "learning_rate": 1.9878062436275986e-05, + "loss": 0.5649, + "step": 204440 + }, + { + "epoch": 1.8074046570837532, + "grad_norm": 0.5291694402694702, + "learning_rate": 1.9876589048604115e-05, + "loss": 0.6499, + "step": 204450 + }, + { + "epoch": 1.8074930603440655, + "grad_norm": 2.6592183113098145, + "learning_rate": 1.9875115660932243e-05, + "loss": 0.5, + "step": 204460 + }, + { + "epoch": 1.8075814636043779, + "grad_norm": 1.8216780424118042, + "learning_rate": 1.9873642273260375e-05, + "loss": 0.7019, + "step": 204470 + }, + { + "epoch": 1.80766986686469, + "grad_norm": 8.142723083496094, + "learning_rate": 1.9872168885588503e-05, + "loss": 0.5172, + "step": 204480 + }, + { + "epoch": 1.8077582701250021, + "grad_norm": 6.1031341552734375, + "learning_rate": 1.987069549791663e-05, + "loss": 0.4787, + "step": 204490 + }, + { + "epoch": 1.8078466733853145, + "grad_norm": 1.4311480522155762, + "learning_rate": 1.986922211024476e-05, + "loss": 0.6433, + "step": 204500 + }, + { + "epoch": 1.8079350766456268, + "grad_norm": 2.7168381214141846, + "learning_rate": 1.986774872257289e-05, + "loss": 0.5925, + "step": 204510 + }, + { + "epoch": 1.808023479905939, + "grad_norm": 2.2922656536102295, + "learning_rate": 1.986627533490102e-05, + "loss": 0.6095, + "step": 204520 + }, + { + "epoch": 1.808111883166251, + "grad_norm": 1.2676931619644165, + "learning_rate": 1.9864801947229148e-05, + "loss": 0.6203, + "step": 204530 + }, + { + "epoch": 1.8082002864265634, + "grad_norm": 1.2178192138671875, + "learning_rate": 1.986332855955728e-05, + "loss": 0.4626, + "step": 204540 + }, + { + "epoch": 1.8082886896868757, + "grad_norm": 6.804373741149902, + "learning_rate": 1.9861855171885408e-05, + "loss": 0.5906, + "step": 204550 + }, + { + "epoch": 1.8083770929471878, + "grad_norm": 1.9722411632537842, + "learning_rate": 1.9860381784213537e-05, + "loss": 0.5679, + "step": 204560 + }, + { + "epoch": 1.8084654962075, + "grad_norm": 1.387751579284668, + "learning_rate": 1.9858908396541665e-05, + "loss": 0.5495, + "step": 204570 + }, + { + "epoch": 1.8085538994678125, + "grad_norm": 2.8149945735931396, + "learning_rate": 1.9857435008869797e-05, + "loss": 0.598, + "step": 204580 + }, + { + "epoch": 1.8086423027281247, + "grad_norm": 0.8277774453163147, + "learning_rate": 1.9855961621197925e-05, + "loss": 0.627, + "step": 204590 + }, + { + "epoch": 1.8087307059884368, + "grad_norm": 4.5288543701171875, + "learning_rate": 1.9854488233526053e-05, + "loss": 0.6177, + "step": 204600 + }, + { + "epoch": 1.8088191092487491, + "grad_norm": 1.4247055053710938, + "learning_rate": 1.9853014845854182e-05, + "loss": 0.5941, + "step": 204610 + }, + { + "epoch": 1.8089075125090615, + "grad_norm": 2.537128448486328, + "learning_rate": 1.9851541458182314e-05, + "loss": 0.5751, + "step": 204620 + }, + { + "epoch": 1.8089959157693736, + "grad_norm": 3.9799728393554688, + "learning_rate": 1.9850068070510442e-05, + "loss": 0.5171, + "step": 204630 + }, + { + "epoch": 1.8090843190296857, + "grad_norm": 1.9004215002059937, + "learning_rate": 1.984859468283857e-05, + "loss": 0.694, + "step": 204640 + }, + { + "epoch": 1.809172722289998, + "grad_norm": 2.632812261581421, + "learning_rate": 1.9847121295166702e-05, + "loss": 0.5829, + "step": 204650 + }, + { + "epoch": 1.8092611255503104, + "grad_norm": 1.5173672437667847, + "learning_rate": 1.984564790749483e-05, + "loss": 0.5527, + "step": 204660 + }, + { + "epoch": 1.8093495288106225, + "grad_norm": 0.7835744619369507, + "learning_rate": 1.984417451982296e-05, + "loss": 0.5764, + "step": 204670 + }, + { + "epoch": 1.8094379320709346, + "grad_norm": 7.133487224578857, + "learning_rate": 1.9842701132151087e-05, + "loss": 0.6719, + "step": 204680 + }, + { + "epoch": 1.8095263353312472, + "grad_norm": 2.5694580078125, + "learning_rate": 1.984122774447922e-05, + "loss": 0.5588, + "step": 204690 + }, + { + "epoch": 1.8096147385915593, + "grad_norm": 2.642927646636963, + "learning_rate": 1.9839754356807347e-05, + "loss": 0.6477, + "step": 204700 + }, + { + "epoch": 1.8097031418518714, + "grad_norm": 3.1200151443481445, + "learning_rate": 1.9838280969135475e-05, + "loss": 0.5816, + "step": 204710 + }, + { + "epoch": 1.8097915451121838, + "grad_norm": 4.143441200256348, + "learning_rate": 1.9836807581463604e-05, + "loss": 0.5273, + "step": 204720 + }, + { + "epoch": 1.809879948372496, + "grad_norm": 4.186145782470703, + "learning_rate": 1.9835334193791736e-05, + "loss": 0.6081, + "step": 204730 + }, + { + "epoch": 1.8099683516328082, + "grad_norm": 32.160648345947266, + "learning_rate": 1.9833860806119864e-05, + "loss": 0.6195, + "step": 204740 + }, + { + "epoch": 1.8100567548931203, + "grad_norm": 1.5172966718673706, + "learning_rate": 1.9832387418447992e-05, + "loss": 0.6532, + "step": 204750 + }, + { + "epoch": 1.8101451581534327, + "grad_norm": 1.7198430299758911, + "learning_rate": 1.9830914030776124e-05, + "loss": 0.6428, + "step": 204760 + }, + { + "epoch": 1.810233561413745, + "grad_norm": 2.6470184326171875, + "learning_rate": 1.9829440643104252e-05, + "loss": 0.4808, + "step": 204770 + }, + { + "epoch": 1.8103219646740571, + "grad_norm": 5.744252681732178, + "learning_rate": 1.982796725543238e-05, + "loss": 0.6234, + "step": 204780 + }, + { + "epoch": 1.8104103679343693, + "grad_norm": 1.3214352130889893, + "learning_rate": 1.982649386776051e-05, + "loss": 0.4612, + "step": 204790 + }, + { + "epoch": 1.8104987711946816, + "grad_norm": 0.8004894852638245, + "learning_rate": 1.982502048008864e-05, + "loss": 0.47, + "step": 204800 + }, + { + "epoch": 1.810587174454994, + "grad_norm": 3.214850425720215, + "learning_rate": 1.982354709241677e-05, + "loss": 0.5348, + "step": 204810 + }, + { + "epoch": 1.810675577715306, + "grad_norm": 3.740530252456665, + "learning_rate": 1.9822073704744898e-05, + "loss": 0.628, + "step": 204820 + }, + { + "epoch": 1.8107639809756184, + "grad_norm": 1.620605230331421, + "learning_rate": 1.982060031707303e-05, + "loss": 0.4869, + "step": 204830 + }, + { + "epoch": 1.8108523842359308, + "grad_norm": 7.022885322570801, + "learning_rate": 1.9819126929401158e-05, + "loss": 0.5965, + "step": 204840 + }, + { + "epoch": 1.8109407874962429, + "grad_norm": 3.8251516819000244, + "learning_rate": 1.9817653541729286e-05, + "loss": 0.5974, + "step": 204850 + }, + { + "epoch": 1.811029190756555, + "grad_norm": 1.8026121854782104, + "learning_rate": 1.9816180154057414e-05, + "loss": 0.5771, + "step": 204860 + }, + { + "epoch": 1.8111175940168673, + "grad_norm": 2.1831490993499756, + "learning_rate": 1.9814706766385546e-05, + "loss": 0.6034, + "step": 204870 + }, + { + "epoch": 1.8112059972771797, + "grad_norm": 3.5884084701538086, + "learning_rate": 1.9813233378713674e-05, + "loss": 0.619, + "step": 204880 + }, + { + "epoch": 1.8112944005374918, + "grad_norm": 15.207366943359375, + "learning_rate": 1.9811759991041803e-05, + "loss": 0.5584, + "step": 204890 + }, + { + "epoch": 1.811382803797804, + "grad_norm": 2.334613084793091, + "learning_rate": 1.981028660336993e-05, + "loss": 0.593, + "step": 204900 + }, + { + "epoch": 1.8114712070581163, + "grad_norm": 0.8896350264549255, + "learning_rate": 1.9808813215698063e-05, + "loss": 0.6255, + "step": 204910 + }, + { + "epoch": 1.8115596103184286, + "grad_norm": 2.5466392040252686, + "learning_rate": 1.980733982802619e-05, + "loss": 0.6236, + "step": 204920 + }, + { + "epoch": 1.8116480135787407, + "grad_norm": 7.370975017547607, + "learning_rate": 1.980586644035432e-05, + "loss": 0.5788, + "step": 204930 + }, + { + "epoch": 1.811736416839053, + "grad_norm": 6.6383376121521, + "learning_rate": 1.980439305268245e-05, + "loss": 0.6388, + "step": 204940 + }, + { + "epoch": 1.8118248200993654, + "grad_norm": 2.832657814025879, + "learning_rate": 1.980291966501058e-05, + "loss": 0.655, + "step": 204950 + }, + { + "epoch": 1.8119132233596775, + "grad_norm": 1.4920225143432617, + "learning_rate": 1.9801446277338708e-05, + "loss": 0.5131, + "step": 204960 + }, + { + "epoch": 1.8120016266199896, + "grad_norm": 3.6986663341522217, + "learning_rate": 1.9799972889666836e-05, + "loss": 0.5755, + "step": 204970 + }, + { + "epoch": 1.812090029880302, + "grad_norm": 2.3363704681396484, + "learning_rate": 1.9798499501994968e-05, + "loss": 0.5117, + "step": 204980 + }, + { + "epoch": 1.8121784331406143, + "grad_norm": 0.8071462512016296, + "learning_rate": 1.9797026114323096e-05, + "loss": 0.5865, + "step": 204990 + }, + { + "epoch": 1.8122668364009265, + "grad_norm": 4.943005561828613, + "learning_rate": 1.9795552726651225e-05, + "loss": 0.6755, + "step": 205000 + }, + { + "epoch": 1.8123552396612386, + "grad_norm": 1.0413872003555298, + "learning_rate": 1.9794079338979357e-05, + "loss": 0.5493, + "step": 205010 + }, + { + "epoch": 1.812443642921551, + "grad_norm": 1.3888115882873535, + "learning_rate": 1.9792605951307485e-05, + "loss": 0.6143, + "step": 205020 + }, + { + "epoch": 1.8125320461818633, + "grad_norm": 3.122053861618042, + "learning_rate": 1.9791132563635613e-05, + "loss": 0.5599, + "step": 205030 + }, + { + "epoch": 1.8126204494421754, + "grad_norm": 1.4189289808273315, + "learning_rate": 1.978965917596374e-05, + "loss": 0.5805, + "step": 205040 + }, + { + "epoch": 1.8127088527024877, + "grad_norm": 5.1583638191223145, + "learning_rate": 1.9788185788291873e-05, + "loss": 0.4908, + "step": 205050 + }, + { + "epoch": 1.8127972559628, + "grad_norm": 2.4484779834747314, + "learning_rate": 1.9786712400620002e-05, + "loss": 0.5656, + "step": 205060 + }, + { + "epoch": 1.8128856592231122, + "grad_norm": 1.7282007932662964, + "learning_rate": 1.978523901294813e-05, + "loss": 0.6116, + "step": 205070 + }, + { + "epoch": 1.8129740624834243, + "grad_norm": 6.115036487579346, + "learning_rate": 1.978376562527626e-05, + "loss": 0.5016, + "step": 205080 + }, + { + "epoch": 1.8130624657437366, + "grad_norm": 3.375054121017456, + "learning_rate": 1.978229223760439e-05, + "loss": 0.5012, + "step": 205090 + }, + { + "epoch": 1.813150869004049, + "grad_norm": 2.5764636993408203, + "learning_rate": 1.978081884993252e-05, + "loss": 0.5606, + "step": 205100 + }, + { + "epoch": 1.813239272264361, + "grad_norm": 1.6634795665740967, + "learning_rate": 1.9779345462260647e-05, + "loss": 0.6111, + "step": 205110 + }, + { + "epoch": 1.8133276755246732, + "grad_norm": 7.311112403869629, + "learning_rate": 1.977787207458878e-05, + "loss": 0.5079, + "step": 205120 + }, + { + "epoch": 1.8134160787849856, + "grad_norm": 2.988346815109253, + "learning_rate": 1.9776398686916907e-05, + "loss": 0.5101, + "step": 205130 + }, + { + "epoch": 1.813504482045298, + "grad_norm": 2.2646923065185547, + "learning_rate": 1.9774925299245035e-05, + "loss": 0.6477, + "step": 205140 + }, + { + "epoch": 1.81359288530561, + "grad_norm": 5.640881538391113, + "learning_rate": 1.9773451911573164e-05, + "loss": 0.6112, + "step": 205150 + }, + { + "epoch": 1.8136812885659221, + "grad_norm": 4.360267162322998, + "learning_rate": 1.9771978523901295e-05, + "loss": 0.5578, + "step": 205160 + }, + { + "epoch": 1.8137696918262347, + "grad_norm": 2.666182041168213, + "learning_rate": 1.9770505136229424e-05, + "loss": 0.5626, + "step": 205170 + }, + { + "epoch": 1.8138580950865468, + "grad_norm": 1.572108268737793, + "learning_rate": 1.9769031748557552e-05, + "loss": 0.6473, + "step": 205180 + }, + { + "epoch": 1.813946498346859, + "grad_norm": 2.5407955646514893, + "learning_rate": 1.9767558360885684e-05, + "loss": 0.5364, + "step": 205190 + }, + { + "epoch": 1.8140349016071713, + "grad_norm": 5.172571182250977, + "learning_rate": 1.9766084973213812e-05, + "loss": 0.4815, + "step": 205200 + }, + { + "epoch": 1.8141233048674836, + "grad_norm": 4.145648956298828, + "learning_rate": 1.976461158554194e-05, + "loss": 0.6249, + "step": 205210 + }, + { + "epoch": 1.8142117081277958, + "grad_norm": 6.628396987915039, + "learning_rate": 1.9763138197870072e-05, + "loss": 0.4574, + "step": 205220 + }, + { + "epoch": 1.8143001113881079, + "grad_norm": 4.403090476989746, + "learning_rate": 1.97616648101982e-05, + "loss": 0.5821, + "step": 205230 + }, + { + "epoch": 1.8143885146484202, + "grad_norm": 3.767047882080078, + "learning_rate": 1.976019142252633e-05, + "loss": 0.5941, + "step": 205240 + }, + { + "epoch": 1.8144769179087326, + "grad_norm": 2.1737284660339355, + "learning_rate": 1.975871803485446e-05, + "loss": 0.5473, + "step": 205250 + }, + { + "epoch": 1.8145653211690447, + "grad_norm": 2.667391061782837, + "learning_rate": 1.975724464718259e-05, + "loss": 0.4895, + "step": 205260 + }, + { + "epoch": 1.8146537244293568, + "grad_norm": 16.818302154541016, + "learning_rate": 1.9755771259510718e-05, + "loss": 0.609, + "step": 205270 + }, + { + "epoch": 1.8147421276896694, + "grad_norm": 1.2136601209640503, + "learning_rate": 1.975429787183885e-05, + "loss": 0.5769, + "step": 205280 + }, + { + "epoch": 1.8148305309499815, + "grad_norm": 6.928625106811523, + "learning_rate": 1.9752824484166978e-05, + "loss": 0.6335, + "step": 205290 + }, + { + "epoch": 1.8149189342102936, + "grad_norm": 6.01639461517334, + "learning_rate": 1.9751351096495106e-05, + "loss": 0.4662, + "step": 205300 + }, + { + "epoch": 1.815007337470606, + "grad_norm": 9.724088668823242, + "learning_rate": 1.9749877708823238e-05, + "loss": 0.4604, + "step": 205310 + }, + { + "epoch": 1.8150957407309183, + "grad_norm": 3.2025539875030518, + "learning_rate": 1.9748404321151366e-05, + "loss": 0.8161, + "step": 205320 + }, + { + "epoch": 1.8151841439912304, + "grad_norm": 2.018144369125366, + "learning_rate": 1.9746930933479494e-05, + "loss": 0.5801, + "step": 205330 + }, + { + "epoch": 1.8152725472515425, + "grad_norm": 2.981337308883667, + "learning_rate": 1.9745457545807626e-05, + "loss": 0.5227, + "step": 205340 + }, + { + "epoch": 1.8153609505118549, + "grad_norm": 2.2382495403289795, + "learning_rate": 1.9743984158135755e-05, + "loss": 0.5619, + "step": 205350 + }, + { + "epoch": 1.8154493537721672, + "grad_norm": 1.552259087562561, + "learning_rate": 1.9742510770463883e-05, + "loss": 0.6834, + "step": 205360 + }, + { + "epoch": 1.8155377570324793, + "grad_norm": 1.2444350719451904, + "learning_rate": 1.974103738279201e-05, + "loss": 0.5761, + "step": 205370 + }, + { + "epoch": 1.8156261602927914, + "grad_norm": 3.165355920791626, + "learning_rate": 1.9739563995120143e-05, + "loss": 0.7512, + "step": 205380 + }, + { + "epoch": 1.8157145635531038, + "grad_norm": 2.6302196979522705, + "learning_rate": 1.973809060744827e-05, + "loss": 0.6307, + "step": 205390 + }, + { + "epoch": 1.8158029668134161, + "grad_norm": 2.225032091140747, + "learning_rate": 1.97366172197764e-05, + "loss": 0.5078, + "step": 205400 + }, + { + "epoch": 1.8158913700737283, + "grad_norm": 1.9014170169830322, + "learning_rate": 1.973514383210453e-05, + "loss": 0.5053, + "step": 205410 + }, + { + "epoch": 1.8159797733340406, + "grad_norm": 3.0980594158172607, + "learning_rate": 1.973367044443266e-05, + "loss": 0.56, + "step": 205420 + }, + { + "epoch": 1.816068176594353, + "grad_norm": 1.409366250038147, + "learning_rate": 1.9732197056760788e-05, + "loss": 0.3925, + "step": 205430 + }, + { + "epoch": 1.816156579854665, + "grad_norm": 5.884101867675781, + "learning_rate": 1.9730723669088916e-05, + "loss": 0.5774, + "step": 205440 + }, + { + "epoch": 1.8162449831149772, + "grad_norm": 5.426363468170166, + "learning_rate": 1.9729250281417048e-05, + "loss": 0.5905, + "step": 205450 + }, + { + "epoch": 1.8163333863752895, + "grad_norm": 1.9328361749649048, + "learning_rate": 1.9727776893745177e-05, + "loss": 0.6275, + "step": 205460 + }, + { + "epoch": 1.8164217896356019, + "grad_norm": 3.0021536350250244, + "learning_rate": 1.9726303506073305e-05, + "loss": 0.5065, + "step": 205470 + }, + { + "epoch": 1.816510192895914, + "grad_norm": 15.42954158782959, + "learning_rate": 1.9724830118401437e-05, + "loss": 0.5191, + "step": 205480 + }, + { + "epoch": 1.816598596156226, + "grad_norm": 5.228804588317871, + "learning_rate": 1.9723356730729565e-05, + "loss": 0.6386, + "step": 205490 + }, + { + "epoch": 1.8166869994165384, + "grad_norm": 0.9492586851119995, + "learning_rate": 1.9721883343057693e-05, + "loss": 0.5185, + "step": 205500 + }, + { + "epoch": 1.8167754026768508, + "grad_norm": 3.6664984226226807, + "learning_rate": 1.9720409955385822e-05, + "loss": 0.6452, + "step": 205510 + }, + { + "epoch": 1.816863805937163, + "grad_norm": 1.380125880241394, + "learning_rate": 1.9718936567713953e-05, + "loss": 0.687, + "step": 205520 + }, + { + "epoch": 1.8169522091974752, + "grad_norm": 2.0596303939819336, + "learning_rate": 1.9717463180042082e-05, + "loss": 0.6561, + "step": 205530 + }, + { + "epoch": 1.8170406124577876, + "grad_norm": 1.2801146507263184, + "learning_rate": 1.971598979237021e-05, + "loss": 0.6626, + "step": 205540 + }, + { + "epoch": 1.8171290157180997, + "grad_norm": 2.1601812839508057, + "learning_rate": 1.971451640469834e-05, + "loss": 0.5393, + "step": 205550 + }, + { + "epoch": 1.8172174189784118, + "grad_norm": 4.125202178955078, + "learning_rate": 1.971304301702647e-05, + "loss": 0.5652, + "step": 205560 + }, + { + "epoch": 1.8173058222387242, + "grad_norm": 9.637201309204102, + "learning_rate": 1.97115696293546e-05, + "loss": 0.5294, + "step": 205570 + }, + { + "epoch": 1.8173942254990365, + "grad_norm": 2.6966235637664795, + "learning_rate": 1.9710096241682727e-05, + "loss": 0.5471, + "step": 205580 + }, + { + "epoch": 1.8174826287593486, + "grad_norm": 1.6010628938674927, + "learning_rate": 1.970862285401086e-05, + "loss": 0.5847, + "step": 205590 + }, + { + "epoch": 1.8175710320196607, + "grad_norm": 1.595720887184143, + "learning_rate": 1.9707149466338987e-05, + "loss": 0.5632, + "step": 205600 + }, + { + "epoch": 1.817659435279973, + "grad_norm": 6.063867568969727, + "learning_rate": 1.9705676078667115e-05, + "loss": 0.6014, + "step": 205610 + }, + { + "epoch": 1.8177478385402854, + "grad_norm": 1.8563518524169922, + "learning_rate": 1.9704202690995244e-05, + "loss": 0.5215, + "step": 205620 + }, + { + "epoch": 1.8178362418005976, + "grad_norm": 6.146674156188965, + "learning_rate": 1.9702729303323376e-05, + "loss": 0.4971, + "step": 205630 + }, + { + "epoch": 1.81792464506091, + "grad_norm": 7.920614242553711, + "learning_rate": 1.9701255915651504e-05, + "loss": 0.7799, + "step": 205640 + }, + { + "epoch": 1.8180130483212222, + "grad_norm": 1.5419124364852905, + "learning_rate": 1.9699782527979632e-05, + "loss": 0.5308, + "step": 205650 + }, + { + "epoch": 1.8181014515815344, + "grad_norm": 4.718521595001221, + "learning_rate": 1.9698309140307764e-05, + "loss": 0.5452, + "step": 205660 + }, + { + "epoch": 1.8181898548418465, + "grad_norm": 3.180023193359375, + "learning_rate": 1.9696835752635892e-05, + "loss": 0.7371, + "step": 205670 + }, + { + "epoch": 1.8182782581021588, + "grad_norm": 4.409896373748779, + "learning_rate": 1.969536236496402e-05, + "loss": 0.6145, + "step": 205680 + }, + { + "epoch": 1.8183666613624712, + "grad_norm": 4.773332118988037, + "learning_rate": 1.969388897729215e-05, + "loss": 0.5614, + "step": 205690 + }, + { + "epoch": 1.8184550646227833, + "grad_norm": 8.93399429321289, + "learning_rate": 1.969241558962028e-05, + "loss": 0.5681, + "step": 205700 + }, + { + "epoch": 1.8185434678830954, + "grad_norm": 5.823650360107422, + "learning_rate": 1.969094220194841e-05, + "loss": 0.665, + "step": 205710 + }, + { + "epoch": 1.8186318711434077, + "grad_norm": 2.985886335372925, + "learning_rate": 1.9689468814276537e-05, + "loss": 0.7392, + "step": 205720 + }, + { + "epoch": 1.81872027440372, + "grad_norm": 15.467670440673828, + "learning_rate": 1.9687995426604666e-05, + "loss": 0.6906, + "step": 205730 + }, + { + "epoch": 1.8188086776640322, + "grad_norm": 8.271402359008789, + "learning_rate": 1.9686522038932798e-05, + "loss": 0.6446, + "step": 205740 + }, + { + "epoch": 1.8188970809243445, + "grad_norm": 3.8750271797180176, + "learning_rate": 1.9685048651260926e-05, + "loss": 0.5974, + "step": 205750 + }, + { + "epoch": 1.8189854841846569, + "grad_norm": 2.3784050941467285, + "learning_rate": 1.9683575263589054e-05, + "loss": 0.4477, + "step": 205760 + }, + { + "epoch": 1.819073887444969, + "grad_norm": 1.3976037502288818, + "learning_rate": 1.9682101875917186e-05, + "loss": 0.6801, + "step": 205770 + }, + { + "epoch": 1.8191622907052811, + "grad_norm": 6.066532611846924, + "learning_rate": 1.9680628488245314e-05, + "loss": 0.6771, + "step": 205780 + }, + { + "epoch": 1.8192506939655935, + "grad_norm": 5.840319633483887, + "learning_rate": 1.9679155100573443e-05, + "loss": 0.5399, + "step": 205790 + }, + { + "epoch": 1.8193390972259058, + "grad_norm": 4.8559489250183105, + "learning_rate": 1.967768171290157e-05, + "loss": 0.4738, + "step": 205800 + }, + { + "epoch": 1.819427500486218, + "grad_norm": 8.714665412902832, + "learning_rate": 1.9676208325229703e-05, + "loss": 0.7982, + "step": 205810 + }, + { + "epoch": 1.81951590374653, + "grad_norm": 6.71140718460083, + "learning_rate": 1.967473493755783e-05, + "loss": 0.5827, + "step": 205820 + }, + { + "epoch": 1.8196043070068424, + "grad_norm": 2.8837430477142334, + "learning_rate": 1.967326154988596e-05, + "loss": 0.6267, + "step": 205830 + }, + { + "epoch": 1.8196927102671547, + "grad_norm": 6.497797012329102, + "learning_rate": 1.9671788162214088e-05, + "loss": 0.6017, + "step": 205840 + }, + { + "epoch": 1.8197811135274669, + "grad_norm": 2.312929153442383, + "learning_rate": 1.967031477454222e-05, + "loss": 0.5694, + "step": 205850 + }, + { + "epoch": 1.819869516787779, + "grad_norm": 4.205124855041504, + "learning_rate": 1.9668841386870348e-05, + "loss": 0.6344, + "step": 205860 + }, + { + "epoch": 1.8199579200480915, + "grad_norm": 2.381417751312256, + "learning_rate": 1.9667367999198476e-05, + "loss": 0.5678, + "step": 205870 + }, + { + "epoch": 1.8200463233084037, + "grad_norm": 2.740489959716797, + "learning_rate": 1.9665894611526608e-05, + "loss": 0.6498, + "step": 205880 + }, + { + "epoch": 1.8201347265687158, + "grad_norm": 8.038272857666016, + "learning_rate": 1.9664421223854736e-05, + "loss": 0.6189, + "step": 205890 + }, + { + "epoch": 1.8202231298290281, + "grad_norm": 0.5989644527435303, + "learning_rate": 1.9662947836182865e-05, + "loss": 0.5195, + "step": 205900 + }, + { + "epoch": 1.8203115330893405, + "grad_norm": 7.084933280944824, + "learning_rate": 1.9661474448510993e-05, + "loss": 0.571, + "step": 205910 + }, + { + "epoch": 1.8203999363496526, + "grad_norm": 11.091043472290039, + "learning_rate": 1.9660001060839125e-05, + "loss": 0.6413, + "step": 205920 + }, + { + "epoch": 1.8204883396099647, + "grad_norm": 1.5782756805419922, + "learning_rate": 1.9658527673167253e-05, + "loss": 0.5706, + "step": 205930 + }, + { + "epoch": 1.820576742870277, + "grad_norm": 4.537599563598633, + "learning_rate": 1.965705428549538e-05, + "loss": 0.5001, + "step": 205940 + }, + { + "epoch": 1.8206651461305894, + "grad_norm": 1.3987971544265747, + "learning_rate": 1.9655580897823513e-05, + "loss": 0.5765, + "step": 205950 + }, + { + "epoch": 1.8207535493909015, + "grad_norm": 3.5085034370422363, + "learning_rate": 1.9654107510151642e-05, + "loss": 0.4726, + "step": 205960 + }, + { + "epoch": 1.8208419526512136, + "grad_norm": 4.559741020202637, + "learning_rate": 1.965263412247977e-05, + "loss": 0.5985, + "step": 205970 + }, + { + "epoch": 1.820930355911526, + "grad_norm": 1.4537702798843384, + "learning_rate": 1.96511607348079e-05, + "loss": 0.4433, + "step": 205980 + }, + { + "epoch": 1.8210187591718383, + "grad_norm": 15.174161911010742, + "learning_rate": 1.964968734713603e-05, + "loss": 0.5866, + "step": 205990 + }, + { + "epoch": 1.8211071624321504, + "grad_norm": 5.180905818939209, + "learning_rate": 1.964821395946416e-05, + "loss": 0.5105, + "step": 206000 + }, + { + "epoch": 1.8211955656924628, + "grad_norm": 1.7084853649139404, + "learning_rate": 1.9646740571792287e-05, + "loss": 0.6123, + "step": 206010 + }, + { + "epoch": 1.8212839689527751, + "grad_norm": 2.7740683555603027, + "learning_rate": 1.9645267184120415e-05, + "loss": 0.6021, + "step": 206020 + }, + { + "epoch": 1.8213723722130872, + "grad_norm": 4.2755913734436035, + "learning_rate": 1.9643793796448547e-05, + "loss": 0.5733, + "step": 206030 + }, + { + "epoch": 1.8214607754733994, + "grad_norm": 2.595569372177124, + "learning_rate": 1.9642320408776675e-05, + "loss": 0.5537, + "step": 206040 + }, + { + "epoch": 1.8215491787337117, + "grad_norm": 2.639146566390991, + "learning_rate": 1.9640847021104804e-05, + "loss": 0.7424, + "step": 206050 + }, + { + "epoch": 1.821637581994024, + "grad_norm": 1.8448108434677124, + "learning_rate": 1.9639373633432935e-05, + "loss": 0.5973, + "step": 206060 + }, + { + "epoch": 1.8217259852543362, + "grad_norm": 5.290491104125977, + "learning_rate": 1.9637900245761064e-05, + "loss": 0.685, + "step": 206070 + }, + { + "epoch": 1.8218143885146483, + "grad_norm": 3.2655539512634277, + "learning_rate": 1.9636426858089192e-05, + "loss": 0.5121, + "step": 206080 + }, + { + "epoch": 1.8219027917749606, + "grad_norm": 3.944739818572998, + "learning_rate": 1.963495347041732e-05, + "loss": 0.6442, + "step": 206090 + }, + { + "epoch": 1.821991195035273, + "grad_norm": 34.24992752075195, + "learning_rate": 1.9633480082745452e-05, + "loss": 0.6488, + "step": 206100 + }, + { + "epoch": 1.822079598295585, + "grad_norm": 1.8902596235275269, + "learning_rate": 1.963200669507358e-05, + "loss": 0.5305, + "step": 206110 + }, + { + "epoch": 1.8221680015558974, + "grad_norm": 2.227578639984131, + "learning_rate": 1.963053330740171e-05, + "loss": 0.5316, + "step": 206120 + }, + { + "epoch": 1.8222564048162098, + "grad_norm": 1.9944559335708618, + "learning_rate": 1.962905991972984e-05, + "loss": 0.5433, + "step": 206130 + }, + { + "epoch": 1.8223448080765219, + "grad_norm": 2.3997745513916016, + "learning_rate": 1.962758653205797e-05, + "loss": 0.7141, + "step": 206140 + }, + { + "epoch": 1.822433211336834, + "grad_norm": 1.6116048097610474, + "learning_rate": 1.9626113144386097e-05, + "loss": 0.631, + "step": 206150 + }, + { + "epoch": 1.8225216145971463, + "grad_norm": 1.3154655694961548, + "learning_rate": 1.962463975671423e-05, + "loss": 0.5164, + "step": 206160 + }, + { + "epoch": 1.8226100178574587, + "grad_norm": 5.549624919891357, + "learning_rate": 1.9623166369042357e-05, + "loss": 0.6743, + "step": 206170 + }, + { + "epoch": 1.8226984211177708, + "grad_norm": 2.2603037357330322, + "learning_rate": 1.9621692981370486e-05, + "loss": 0.5922, + "step": 206180 + }, + { + "epoch": 1.822786824378083, + "grad_norm": 3.52301025390625, + "learning_rate": 1.9620219593698618e-05, + "loss": 0.4822, + "step": 206190 + }, + { + "epoch": 1.8228752276383953, + "grad_norm": 5.003649711608887, + "learning_rate": 1.9618746206026746e-05, + "loss": 0.5354, + "step": 206200 + }, + { + "epoch": 1.8229636308987076, + "grad_norm": 2.923891305923462, + "learning_rate": 1.9617272818354878e-05, + "loss": 0.4701, + "step": 206210 + }, + { + "epoch": 1.8230520341590197, + "grad_norm": 0.8332355618476868, + "learning_rate": 1.9615799430683006e-05, + "loss": 0.5434, + "step": 206220 + }, + { + "epoch": 1.823140437419332, + "grad_norm": 1.7102841138839722, + "learning_rate": 1.9614326043011134e-05, + "loss": 0.5627, + "step": 206230 + }, + { + "epoch": 1.8232288406796444, + "grad_norm": 10.538195610046387, + "learning_rate": 1.9612852655339266e-05, + "loss": 0.6224, + "step": 206240 + }, + { + "epoch": 1.8233172439399565, + "grad_norm": 3.4315640926361084, + "learning_rate": 1.9611379267667394e-05, + "loss": 0.6502, + "step": 206250 + }, + { + "epoch": 1.8234056472002687, + "grad_norm": 1.2188180685043335, + "learning_rate": 1.9609905879995523e-05, + "loss": 0.6174, + "step": 206260 + }, + { + "epoch": 1.823494050460581, + "grad_norm": 6.603806018829346, + "learning_rate": 1.960843249232365e-05, + "loss": 0.4602, + "step": 206270 + }, + { + "epoch": 1.8235824537208933, + "grad_norm": 4.472281455993652, + "learning_rate": 1.9606959104651783e-05, + "loss": 0.537, + "step": 206280 + }, + { + "epoch": 1.8236708569812055, + "grad_norm": 1.5111479759216309, + "learning_rate": 1.960548571697991e-05, + "loss": 0.6231, + "step": 206290 + }, + { + "epoch": 1.8237592602415176, + "grad_norm": 1.1447303295135498, + "learning_rate": 1.960401232930804e-05, + "loss": 0.4651, + "step": 206300 + }, + { + "epoch": 1.82384766350183, + "grad_norm": 1.6166595220565796, + "learning_rate": 1.9602538941636168e-05, + "loss": 0.6349, + "step": 206310 + }, + { + "epoch": 1.8239360667621423, + "grad_norm": 3.1219429969787598, + "learning_rate": 1.96010655539643e-05, + "loss": 0.6483, + "step": 206320 + }, + { + "epoch": 1.8240244700224544, + "grad_norm": 1.9796806573867798, + "learning_rate": 1.9599592166292428e-05, + "loss": 0.6497, + "step": 206330 + }, + { + "epoch": 1.8241128732827667, + "grad_norm": 2.0331077575683594, + "learning_rate": 1.9598118778620556e-05, + "loss": 0.6726, + "step": 206340 + }, + { + "epoch": 1.824201276543079, + "grad_norm": 3.3719019889831543, + "learning_rate": 1.9596645390948688e-05, + "loss": 0.5446, + "step": 206350 + }, + { + "epoch": 1.8242896798033912, + "grad_norm": 1.7892955541610718, + "learning_rate": 1.9595172003276817e-05, + "loss": 0.5504, + "step": 206360 + }, + { + "epoch": 1.8243780830637033, + "grad_norm": 4.419246673583984, + "learning_rate": 1.9593698615604945e-05, + "loss": 0.515, + "step": 206370 + }, + { + "epoch": 1.8244664863240156, + "grad_norm": 2.4962549209594727, + "learning_rate": 1.9592225227933073e-05, + "loss": 0.5677, + "step": 206380 + }, + { + "epoch": 1.824554889584328, + "grad_norm": 15.308316230773926, + "learning_rate": 1.9590751840261205e-05, + "loss": 0.5564, + "step": 206390 + }, + { + "epoch": 1.82464329284464, + "grad_norm": 3.7508552074432373, + "learning_rate": 1.9589278452589333e-05, + "loss": 0.5001, + "step": 206400 + }, + { + "epoch": 1.8247316961049522, + "grad_norm": 0.7837414741516113, + "learning_rate": 1.958780506491746e-05, + "loss": 0.5693, + "step": 206410 + }, + { + "epoch": 1.8248200993652646, + "grad_norm": 9.835325241088867, + "learning_rate": 1.9586331677245593e-05, + "loss": 0.5738, + "step": 206420 + }, + { + "epoch": 1.824908502625577, + "grad_norm": 4.901032447814941, + "learning_rate": 1.9584858289573722e-05, + "loss": 0.5124, + "step": 206430 + }, + { + "epoch": 1.824996905885889, + "grad_norm": 3.4703445434570312, + "learning_rate": 1.958338490190185e-05, + "loss": 0.5813, + "step": 206440 + }, + { + "epoch": 1.8250853091462012, + "grad_norm": 3.4250540733337402, + "learning_rate": 1.958191151422998e-05, + "loss": 0.5784, + "step": 206450 + }, + { + "epoch": 1.8251737124065137, + "grad_norm": 1.2305299043655396, + "learning_rate": 1.958043812655811e-05, + "loss": 0.4881, + "step": 206460 + }, + { + "epoch": 1.8252621156668258, + "grad_norm": 2.2637906074523926, + "learning_rate": 1.957896473888624e-05, + "loss": 0.5897, + "step": 206470 + }, + { + "epoch": 1.825350518927138, + "grad_norm": 3.5360212326049805, + "learning_rate": 1.9577491351214367e-05, + "loss": 0.6305, + "step": 206480 + }, + { + "epoch": 1.8254389221874503, + "grad_norm": 9.073210716247559, + "learning_rate": 1.9576017963542495e-05, + "loss": 0.6322, + "step": 206490 + }, + { + "epoch": 1.8255273254477626, + "grad_norm": 1.8201502561569214, + "learning_rate": 1.9574544575870627e-05, + "loss": 0.4767, + "step": 206500 + }, + { + "epoch": 1.8256157287080748, + "grad_norm": 8.53696060180664, + "learning_rate": 1.9573071188198755e-05, + "loss": 0.663, + "step": 206510 + }, + { + "epoch": 1.8257041319683869, + "grad_norm": 1.5252351760864258, + "learning_rate": 1.9571597800526884e-05, + "loss": 0.5231, + "step": 206520 + }, + { + "epoch": 1.8257925352286992, + "grad_norm": 5.737728595733643, + "learning_rate": 1.9570124412855015e-05, + "loss": 0.5788, + "step": 206530 + }, + { + "epoch": 1.8258809384890116, + "grad_norm": 1.994388222694397, + "learning_rate": 1.9568651025183144e-05, + "loss": 0.6812, + "step": 206540 + }, + { + "epoch": 1.8259693417493237, + "grad_norm": 3.8378353118896484, + "learning_rate": 1.9567177637511272e-05, + "loss": 0.6631, + "step": 206550 + }, + { + "epoch": 1.8260577450096358, + "grad_norm": 1.4128046035766602, + "learning_rate": 1.95657042498394e-05, + "loss": 0.5389, + "step": 206560 + }, + { + "epoch": 1.8261461482699481, + "grad_norm": 6.838903903961182, + "learning_rate": 1.9564230862167532e-05, + "loss": 0.6888, + "step": 206570 + }, + { + "epoch": 1.8262345515302605, + "grad_norm": 7.118066787719727, + "learning_rate": 1.956275747449566e-05, + "loss": 0.5024, + "step": 206580 + }, + { + "epoch": 1.8263229547905726, + "grad_norm": 43.27200698852539, + "learning_rate": 1.956128408682379e-05, + "loss": 0.6193, + "step": 206590 + }, + { + "epoch": 1.826411358050885, + "grad_norm": 2.128223180770874, + "learning_rate": 1.955981069915192e-05, + "loss": 0.4912, + "step": 206600 + }, + { + "epoch": 1.8264997613111973, + "grad_norm": 4.299762725830078, + "learning_rate": 1.955833731148005e-05, + "loss": 0.4791, + "step": 206610 + }, + { + "epoch": 1.8265881645715094, + "grad_norm": 1.7000281810760498, + "learning_rate": 1.9556863923808177e-05, + "loss": 0.472, + "step": 206620 + }, + { + "epoch": 1.8266765678318215, + "grad_norm": 4.285869121551514, + "learning_rate": 1.9555390536136306e-05, + "loss": 0.7635, + "step": 206630 + }, + { + "epoch": 1.8267649710921339, + "grad_norm": 2.826352119445801, + "learning_rate": 1.9553917148464438e-05, + "loss": 0.5418, + "step": 206640 + }, + { + "epoch": 1.8268533743524462, + "grad_norm": 1.4064503908157349, + "learning_rate": 1.9552443760792566e-05, + "loss": 0.5626, + "step": 206650 + }, + { + "epoch": 1.8269417776127583, + "grad_norm": 1.5571008920669556, + "learning_rate": 1.9550970373120694e-05, + "loss": 0.5358, + "step": 206660 + }, + { + "epoch": 1.8270301808730705, + "grad_norm": 4.954158306121826, + "learning_rate": 1.9549496985448823e-05, + "loss": 0.5906, + "step": 206670 + }, + { + "epoch": 1.8271185841333828, + "grad_norm": 2.0399539470672607, + "learning_rate": 1.9548023597776954e-05, + "loss": 0.5853, + "step": 206680 + }, + { + "epoch": 1.8272069873936951, + "grad_norm": 0.9200155138969421, + "learning_rate": 1.9546550210105083e-05, + "loss": 0.6993, + "step": 206690 + }, + { + "epoch": 1.8272953906540073, + "grad_norm": 3.1453781127929688, + "learning_rate": 1.954507682243321e-05, + "loss": 0.6423, + "step": 206700 + }, + { + "epoch": 1.8273837939143196, + "grad_norm": 2.170257329940796, + "learning_rate": 1.9543603434761343e-05, + "loss": 0.6632, + "step": 206710 + }, + { + "epoch": 1.827472197174632, + "grad_norm": 2.5499367713928223, + "learning_rate": 1.954213004708947e-05, + "loss": 0.5534, + "step": 206720 + }, + { + "epoch": 1.827560600434944, + "grad_norm": 11.52445125579834, + "learning_rate": 1.95406566594176e-05, + "loss": 0.6607, + "step": 206730 + }, + { + "epoch": 1.8276490036952562, + "grad_norm": 1.7752468585968018, + "learning_rate": 1.9539183271745728e-05, + "loss": 0.5646, + "step": 206740 + }, + { + "epoch": 1.8277374069555685, + "grad_norm": 1.7741189002990723, + "learning_rate": 1.953770988407386e-05, + "loss": 0.527, + "step": 206750 + }, + { + "epoch": 1.8278258102158809, + "grad_norm": 3.750159502029419, + "learning_rate": 1.9536236496401988e-05, + "loss": 0.738, + "step": 206760 + }, + { + "epoch": 1.827914213476193, + "grad_norm": 3.8642637729644775, + "learning_rate": 1.9534763108730116e-05, + "loss": 0.6589, + "step": 206770 + }, + { + "epoch": 1.828002616736505, + "grad_norm": 4.268569469451904, + "learning_rate": 1.9533289721058248e-05, + "loss": 0.7822, + "step": 206780 + }, + { + "epoch": 1.8280910199968174, + "grad_norm": 6.819716453552246, + "learning_rate": 1.9531816333386376e-05, + "loss": 0.4744, + "step": 206790 + }, + { + "epoch": 1.8281794232571298, + "grad_norm": 1.9105000495910645, + "learning_rate": 1.9530342945714505e-05, + "loss": 0.5862, + "step": 206800 + }, + { + "epoch": 1.828267826517442, + "grad_norm": 2.9903974533081055, + "learning_rate": 1.9528869558042633e-05, + "loss": 0.6234, + "step": 206810 + }, + { + "epoch": 1.8283562297777542, + "grad_norm": 1.6201250553131104, + "learning_rate": 1.9527396170370765e-05, + "loss": 0.4748, + "step": 206820 + }, + { + "epoch": 1.8284446330380666, + "grad_norm": 1.895225167274475, + "learning_rate": 1.9525922782698893e-05, + "loss": 0.6045, + "step": 206830 + }, + { + "epoch": 1.8285330362983787, + "grad_norm": 1.1606491804122925, + "learning_rate": 1.952444939502702e-05, + "loss": 0.633, + "step": 206840 + }, + { + "epoch": 1.8286214395586908, + "grad_norm": 4.069981575012207, + "learning_rate": 1.952297600735515e-05, + "loss": 0.464, + "step": 206850 + }, + { + "epoch": 1.8287098428190032, + "grad_norm": 2.674436330795288, + "learning_rate": 1.952150261968328e-05, + "loss": 0.6591, + "step": 206860 + }, + { + "epoch": 1.8287982460793155, + "grad_norm": 0.8125573396682739, + "learning_rate": 1.952002923201141e-05, + "loss": 0.5678, + "step": 206870 + }, + { + "epoch": 1.8288866493396276, + "grad_norm": 17.65329933166504, + "learning_rate": 1.951855584433954e-05, + "loss": 0.6219, + "step": 206880 + }, + { + "epoch": 1.8289750525999398, + "grad_norm": 1.762826919555664, + "learning_rate": 1.951708245666767e-05, + "loss": 0.6162, + "step": 206890 + }, + { + "epoch": 1.829063455860252, + "grad_norm": 2.4930219650268555, + "learning_rate": 1.95156090689958e-05, + "loss": 0.554, + "step": 206900 + }, + { + "epoch": 1.8291518591205644, + "grad_norm": 1.571549892425537, + "learning_rate": 1.9514135681323927e-05, + "loss": 0.554, + "step": 206910 + }, + { + "epoch": 1.8292402623808766, + "grad_norm": 2.7987961769104004, + "learning_rate": 1.9512662293652055e-05, + "loss": 0.531, + "step": 206920 + }, + { + "epoch": 1.829328665641189, + "grad_norm": 10.366473197937012, + "learning_rate": 1.9511188905980187e-05, + "loss": 0.5679, + "step": 206930 + }, + { + "epoch": 1.8294170689015012, + "grad_norm": 2.8565518856048584, + "learning_rate": 1.9509715518308315e-05, + "loss": 0.7168, + "step": 206940 + }, + { + "epoch": 1.8295054721618134, + "grad_norm": 1.980823278427124, + "learning_rate": 1.9508242130636444e-05, + "loss": 0.5628, + "step": 206950 + }, + { + "epoch": 1.8295938754221255, + "grad_norm": 3.3893544673919678, + "learning_rate": 1.9506768742964572e-05, + "loss": 0.6066, + "step": 206960 + }, + { + "epoch": 1.8296822786824378, + "grad_norm": 0.9177488684654236, + "learning_rate": 1.9505295355292704e-05, + "loss": 0.6143, + "step": 206970 + }, + { + "epoch": 1.8297706819427502, + "grad_norm": 2.841909408569336, + "learning_rate": 1.9503821967620832e-05, + "loss": 0.5786, + "step": 206980 + }, + { + "epoch": 1.8298590852030623, + "grad_norm": 11.035330772399902, + "learning_rate": 1.950234857994896e-05, + "loss": 0.5647, + "step": 206990 + }, + { + "epoch": 1.8299474884633744, + "grad_norm": 6.5169148445129395, + "learning_rate": 1.9500875192277092e-05, + "loss": 0.5018, + "step": 207000 + }, + { + "epoch": 1.8300358917236867, + "grad_norm": 7.916037082672119, + "learning_rate": 1.949940180460522e-05, + "loss": 0.574, + "step": 207010 + }, + { + "epoch": 1.830124294983999, + "grad_norm": 1.599095106124878, + "learning_rate": 1.949792841693335e-05, + "loss": 0.5706, + "step": 207020 + }, + { + "epoch": 1.8302126982443112, + "grad_norm": 2.2294921875, + "learning_rate": 1.949645502926148e-05, + "loss": 0.5595, + "step": 207030 + }, + { + "epoch": 1.8303011015046233, + "grad_norm": 2.055271863937378, + "learning_rate": 1.949498164158961e-05, + "loss": 0.6021, + "step": 207040 + }, + { + "epoch": 1.830389504764936, + "grad_norm": 0.9322070479393005, + "learning_rate": 1.9493508253917737e-05, + "loss": 0.6375, + "step": 207050 + }, + { + "epoch": 1.830477908025248, + "grad_norm": 9.942620277404785, + "learning_rate": 1.949203486624587e-05, + "loss": 0.6126, + "step": 207060 + }, + { + "epoch": 1.8305663112855601, + "grad_norm": 6.288659572601318, + "learning_rate": 1.9490561478573997e-05, + "loss": 0.4625, + "step": 207070 + }, + { + "epoch": 1.8306547145458725, + "grad_norm": 2.519831657409668, + "learning_rate": 1.9489088090902126e-05, + "loss": 0.6937, + "step": 207080 + }, + { + "epoch": 1.8307431178061848, + "grad_norm": 3.0569796562194824, + "learning_rate": 1.9487614703230258e-05, + "loss": 0.5536, + "step": 207090 + }, + { + "epoch": 1.830831521066497, + "grad_norm": 4.443091869354248, + "learning_rate": 1.9486141315558386e-05, + "loss": 0.5247, + "step": 207100 + }, + { + "epoch": 1.830919924326809, + "grad_norm": 1.5366700887680054, + "learning_rate": 1.9484667927886514e-05, + "loss": 0.5371, + "step": 207110 + }, + { + "epoch": 1.8310083275871214, + "grad_norm": 2.45685076713562, + "learning_rate": 1.9483194540214646e-05, + "loss": 0.5259, + "step": 207120 + }, + { + "epoch": 1.8310967308474337, + "grad_norm": 1.5443123579025269, + "learning_rate": 1.9481721152542774e-05, + "loss": 0.6548, + "step": 207130 + }, + { + "epoch": 1.8311851341077459, + "grad_norm": 5.310937404632568, + "learning_rate": 1.9480247764870903e-05, + "loss": 0.4454, + "step": 207140 + }, + { + "epoch": 1.831273537368058, + "grad_norm": 1.7682256698608398, + "learning_rate": 1.9478774377199034e-05, + "loss": 0.5112, + "step": 207150 + }, + { + "epoch": 1.8313619406283703, + "grad_norm": 2.3581113815307617, + "learning_rate": 1.9477300989527163e-05, + "loss": 0.4136, + "step": 207160 + }, + { + "epoch": 1.8314503438886827, + "grad_norm": 4.200525283813477, + "learning_rate": 1.947582760185529e-05, + "loss": 0.5595, + "step": 207170 + }, + { + "epoch": 1.8315387471489948, + "grad_norm": 2.5899343490600586, + "learning_rate": 1.9474354214183423e-05, + "loss": 0.5646, + "step": 207180 + }, + { + "epoch": 1.8316271504093071, + "grad_norm": 3.1714022159576416, + "learning_rate": 1.947288082651155e-05, + "loss": 0.5637, + "step": 207190 + }, + { + "epoch": 1.8317155536696195, + "grad_norm": 2.9272472858428955, + "learning_rate": 1.947140743883968e-05, + "loss": 0.6306, + "step": 207200 + }, + { + "epoch": 1.8318039569299316, + "grad_norm": 3.404977798461914, + "learning_rate": 1.9469934051167808e-05, + "loss": 0.6518, + "step": 207210 + }, + { + "epoch": 1.8318923601902437, + "grad_norm": 5.2195210456848145, + "learning_rate": 1.946846066349594e-05, + "loss": 0.5652, + "step": 207220 + }, + { + "epoch": 1.831980763450556, + "grad_norm": 6.188805103302002, + "learning_rate": 1.9466987275824068e-05, + "loss": 0.5442, + "step": 207230 + }, + { + "epoch": 1.8320691667108684, + "grad_norm": 6.709285736083984, + "learning_rate": 1.9465513888152196e-05, + "loss": 0.716, + "step": 207240 + }, + { + "epoch": 1.8321575699711805, + "grad_norm": 1.6161854267120361, + "learning_rate": 1.9464040500480328e-05, + "loss": 0.5632, + "step": 207250 + }, + { + "epoch": 1.8322459732314926, + "grad_norm": 2.73622989654541, + "learning_rate": 1.9462567112808456e-05, + "loss": 0.617, + "step": 207260 + }, + { + "epoch": 1.832334376491805, + "grad_norm": 8.8794584274292, + "learning_rate": 1.9461093725136585e-05, + "loss": 0.6688, + "step": 207270 + }, + { + "epoch": 1.8324227797521173, + "grad_norm": 2.570241928100586, + "learning_rate": 1.9459620337464713e-05, + "loss": 0.4684, + "step": 207280 + }, + { + "epoch": 1.8325111830124294, + "grad_norm": 9.02534008026123, + "learning_rate": 1.9458146949792845e-05, + "loss": 0.7221, + "step": 207290 + }, + { + "epoch": 1.8325995862727418, + "grad_norm": 5.554074287414551, + "learning_rate": 1.9456673562120973e-05, + "loss": 0.6299, + "step": 207300 + }, + { + "epoch": 1.8326879895330541, + "grad_norm": 1.8187109231948853, + "learning_rate": 1.94552001744491e-05, + "loss": 0.5546, + "step": 207310 + }, + { + "epoch": 1.8327763927933662, + "grad_norm": 3.4909963607788086, + "learning_rate": 1.945372678677723e-05, + "loss": 0.4557, + "step": 207320 + }, + { + "epoch": 1.8328647960536784, + "grad_norm": 3.637523889541626, + "learning_rate": 1.9452253399105362e-05, + "loss": 0.5962, + "step": 207330 + }, + { + "epoch": 1.8329531993139907, + "grad_norm": 1.782825231552124, + "learning_rate": 1.945078001143349e-05, + "loss": 0.575, + "step": 207340 + }, + { + "epoch": 1.833041602574303, + "grad_norm": 1.9207335710525513, + "learning_rate": 1.944930662376162e-05, + "loss": 0.4993, + "step": 207350 + }, + { + "epoch": 1.8331300058346152, + "grad_norm": 1.6457518339157104, + "learning_rate": 1.944783323608975e-05, + "loss": 0.5708, + "step": 207360 + }, + { + "epoch": 1.8332184090949273, + "grad_norm": 0.7958346009254456, + "learning_rate": 1.944635984841788e-05, + "loss": 0.61, + "step": 207370 + }, + { + "epoch": 1.8333068123552396, + "grad_norm": 4.749642372131348, + "learning_rate": 1.9444886460746007e-05, + "loss": 0.6038, + "step": 207380 + }, + { + "epoch": 1.833395215615552, + "grad_norm": 13.015735626220703, + "learning_rate": 1.9443413073074135e-05, + "loss": 0.592, + "step": 207390 + }, + { + "epoch": 1.833483618875864, + "grad_norm": 2.589996337890625, + "learning_rate": 1.9441939685402267e-05, + "loss": 0.6938, + "step": 207400 + }, + { + "epoch": 1.8335720221361764, + "grad_norm": 1.5949350595474243, + "learning_rate": 1.9440466297730395e-05, + "loss": 0.6871, + "step": 207410 + }, + { + "epoch": 1.8336604253964888, + "grad_norm": 5.7265424728393555, + "learning_rate": 1.9438992910058524e-05, + "loss": 0.6874, + "step": 207420 + }, + { + "epoch": 1.8337488286568009, + "grad_norm": 3.134699821472168, + "learning_rate": 1.9437519522386652e-05, + "loss": 0.6307, + "step": 207430 + }, + { + "epoch": 1.833837231917113, + "grad_norm": 1.5864468812942505, + "learning_rate": 1.9436046134714784e-05, + "loss": 0.5999, + "step": 207440 + }, + { + "epoch": 1.8339256351774254, + "grad_norm": 2.123861789703369, + "learning_rate": 1.9434572747042912e-05, + "loss": 0.4582, + "step": 207450 + }, + { + "epoch": 1.8340140384377377, + "grad_norm": 3.683892250061035, + "learning_rate": 1.943309935937104e-05, + "loss": 0.6082, + "step": 207460 + }, + { + "epoch": 1.8341024416980498, + "grad_norm": 7.93398380279541, + "learning_rate": 1.9431625971699172e-05, + "loss": 0.6664, + "step": 207470 + }, + { + "epoch": 1.834190844958362, + "grad_norm": 0.6710127592086792, + "learning_rate": 1.94301525840273e-05, + "loss": 0.6653, + "step": 207480 + }, + { + "epoch": 1.8342792482186743, + "grad_norm": 3.0037691593170166, + "learning_rate": 1.942867919635543e-05, + "loss": 0.5595, + "step": 207490 + }, + { + "epoch": 1.8343676514789866, + "grad_norm": 1.559056282043457, + "learning_rate": 1.9427205808683557e-05, + "loss": 0.4848, + "step": 207500 + }, + { + "epoch": 1.8344560547392987, + "grad_norm": 2.6961400508880615, + "learning_rate": 1.942573242101169e-05, + "loss": 0.6084, + "step": 207510 + }, + { + "epoch": 1.834544457999611, + "grad_norm": 5.092144012451172, + "learning_rate": 1.9424259033339817e-05, + "loss": 0.6226, + "step": 207520 + }, + { + "epoch": 1.8346328612599234, + "grad_norm": 1.878910779953003, + "learning_rate": 1.9422785645667946e-05, + "loss": 0.5703, + "step": 207530 + }, + { + "epoch": 1.8347212645202355, + "grad_norm": 3.9596362113952637, + "learning_rate": 1.9421312257996077e-05, + "loss": 0.733, + "step": 207540 + }, + { + "epoch": 1.8348096677805477, + "grad_norm": 8.592870712280273, + "learning_rate": 1.9419838870324206e-05, + "loss": 0.5475, + "step": 207550 + }, + { + "epoch": 1.83489807104086, + "grad_norm": 2.708343505859375, + "learning_rate": 1.9418365482652334e-05, + "loss": 0.5893, + "step": 207560 + }, + { + "epoch": 1.8349864743011723, + "grad_norm": 2.026700496673584, + "learning_rate": 1.9416892094980463e-05, + "loss": 0.5527, + "step": 207570 + }, + { + "epoch": 1.8350748775614845, + "grad_norm": 1.2991694211959839, + "learning_rate": 1.9415418707308594e-05, + "loss": 0.6627, + "step": 207580 + }, + { + "epoch": 1.8351632808217966, + "grad_norm": 1.454665184020996, + "learning_rate": 1.9413945319636723e-05, + "loss": 0.5932, + "step": 207590 + }, + { + "epoch": 1.835251684082109, + "grad_norm": 1.2965185642242432, + "learning_rate": 1.941247193196485e-05, + "loss": 0.5742, + "step": 207600 + }, + { + "epoch": 1.8353400873424213, + "grad_norm": 1.6546359062194824, + "learning_rate": 1.941099854429298e-05, + "loss": 0.5657, + "step": 207610 + }, + { + "epoch": 1.8354284906027334, + "grad_norm": 1.7437026500701904, + "learning_rate": 1.940952515662111e-05, + "loss": 0.564, + "step": 207620 + }, + { + "epoch": 1.8355168938630455, + "grad_norm": 2.647946834564209, + "learning_rate": 1.940805176894924e-05, + "loss": 0.6428, + "step": 207630 + }, + { + "epoch": 1.835605297123358, + "grad_norm": 4.591547012329102, + "learning_rate": 1.9406578381277368e-05, + "loss": 0.5847, + "step": 207640 + }, + { + "epoch": 1.8356937003836702, + "grad_norm": 3.490241050720215, + "learning_rate": 1.94051049936055e-05, + "loss": 0.5666, + "step": 207650 + }, + { + "epoch": 1.8357821036439823, + "grad_norm": 2.4817020893096924, + "learning_rate": 1.9403631605933628e-05, + "loss": 0.4616, + "step": 207660 + }, + { + "epoch": 1.8358705069042947, + "grad_norm": 1.6254853010177612, + "learning_rate": 1.9402158218261756e-05, + "loss": 0.5828, + "step": 207670 + }, + { + "epoch": 1.835958910164607, + "grad_norm": 2.2969744205474854, + "learning_rate": 1.9400684830589885e-05, + "loss": 0.6024, + "step": 207680 + }, + { + "epoch": 1.8360473134249191, + "grad_norm": 3.1695868968963623, + "learning_rate": 1.9399211442918016e-05, + "loss": 0.6249, + "step": 207690 + }, + { + "epoch": 1.8361357166852312, + "grad_norm": 1.6998211145401, + "learning_rate": 1.9397738055246145e-05, + "loss": 0.4365, + "step": 207700 + }, + { + "epoch": 1.8362241199455436, + "grad_norm": 2.9159839153289795, + "learning_rate": 1.9396264667574273e-05, + "loss": 0.6375, + "step": 207710 + }, + { + "epoch": 1.836312523205856, + "grad_norm": 1.8462963104248047, + "learning_rate": 1.9394791279902405e-05, + "loss": 0.5091, + "step": 207720 + }, + { + "epoch": 1.836400926466168, + "grad_norm": 1.6872423887252808, + "learning_rate": 1.9393317892230533e-05, + "loss": 0.5515, + "step": 207730 + }, + { + "epoch": 1.8364893297264802, + "grad_norm": 2.599106550216675, + "learning_rate": 1.939184450455866e-05, + "loss": 0.7511, + "step": 207740 + }, + { + "epoch": 1.8365777329867925, + "grad_norm": 3.112389087677002, + "learning_rate": 1.939037111688679e-05, + "loss": 0.6244, + "step": 207750 + }, + { + "epoch": 1.8366661362471048, + "grad_norm": 1.9276559352874756, + "learning_rate": 1.938889772921492e-05, + "loss": 0.6984, + "step": 207760 + }, + { + "epoch": 1.836754539507417, + "grad_norm": 2.162914752960205, + "learning_rate": 1.938742434154305e-05, + "loss": 0.5041, + "step": 207770 + }, + { + "epoch": 1.8368429427677293, + "grad_norm": 1.7087122201919556, + "learning_rate": 1.9385950953871178e-05, + "loss": 0.5383, + "step": 207780 + }, + { + "epoch": 1.8369313460280416, + "grad_norm": 2.3233726024627686, + "learning_rate": 1.9384477566199307e-05, + "loss": 0.55, + "step": 207790 + }, + { + "epoch": 1.8370197492883538, + "grad_norm": 1.1416159868240356, + "learning_rate": 1.938300417852744e-05, + "loss": 0.6024, + "step": 207800 + }, + { + "epoch": 1.8371081525486659, + "grad_norm": 2.158411979675293, + "learning_rate": 1.9381530790855567e-05, + "loss": 0.5108, + "step": 207810 + }, + { + "epoch": 1.8371965558089782, + "grad_norm": 1.3921183347702026, + "learning_rate": 1.9380057403183695e-05, + "loss": 0.604, + "step": 207820 + }, + { + "epoch": 1.8372849590692906, + "grad_norm": 1.9411571025848389, + "learning_rate": 1.9378584015511827e-05, + "loss": 0.6256, + "step": 207830 + }, + { + "epoch": 1.8373733623296027, + "grad_norm": 1.8781229257583618, + "learning_rate": 1.9377110627839955e-05, + "loss": 0.5606, + "step": 207840 + }, + { + "epoch": 1.8374617655899148, + "grad_norm": 9.537175178527832, + "learning_rate": 1.9375637240168084e-05, + "loss": 0.5402, + "step": 207850 + }, + { + "epoch": 1.8375501688502272, + "grad_norm": 3.0181829929351807, + "learning_rate": 1.9374163852496212e-05, + "loss": 0.6142, + "step": 207860 + }, + { + "epoch": 1.8376385721105395, + "grad_norm": 11.303926467895508, + "learning_rate": 1.9372690464824344e-05, + "loss": 0.544, + "step": 207870 + }, + { + "epoch": 1.8377269753708516, + "grad_norm": 4.462282180786133, + "learning_rate": 1.9371217077152472e-05, + "loss": 0.6198, + "step": 207880 + }, + { + "epoch": 1.837815378631164, + "grad_norm": 6.00845193862915, + "learning_rate": 1.93697436894806e-05, + "loss": 0.5571, + "step": 207890 + }, + { + "epoch": 1.8379037818914763, + "grad_norm": 2.0074462890625, + "learning_rate": 1.936827030180873e-05, + "loss": 0.6813, + "step": 207900 + }, + { + "epoch": 1.8379921851517884, + "grad_norm": 2.754859447479248, + "learning_rate": 1.936679691413686e-05, + "loss": 0.6652, + "step": 207910 + }, + { + "epoch": 1.8380805884121005, + "grad_norm": 4.443384647369385, + "learning_rate": 1.936532352646499e-05, + "loss": 0.6517, + "step": 207920 + }, + { + "epoch": 1.8381689916724129, + "grad_norm": 2.1095924377441406, + "learning_rate": 1.9363850138793117e-05, + "loss": 0.6979, + "step": 207930 + }, + { + "epoch": 1.8382573949327252, + "grad_norm": 1.8302702903747559, + "learning_rate": 1.936237675112125e-05, + "loss": 0.506, + "step": 207940 + }, + { + "epoch": 1.8383457981930373, + "grad_norm": 0.7646289467811584, + "learning_rate": 1.9360903363449377e-05, + "loss": 0.5938, + "step": 207950 + }, + { + "epoch": 1.8384342014533495, + "grad_norm": 5.645435810089111, + "learning_rate": 1.9359429975777506e-05, + "loss": 0.5905, + "step": 207960 + }, + { + "epoch": 1.8385226047136618, + "grad_norm": 3.4635891914367676, + "learning_rate": 1.9357956588105637e-05, + "loss": 0.6362, + "step": 207970 + }, + { + "epoch": 1.8386110079739741, + "grad_norm": 14.734624862670898, + "learning_rate": 1.9356483200433766e-05, + "loss": 0.712, + "step": 207980 + }, + { + "epoch": 1.8386994112342863, + "grad_norm": 2.7411723136901855, + "learning_rate": 1.9355009812761894e-05, + "loss": 0.5717, + "step": 207990 + }, + { + "epoch": 1.8387878144945986, + "grad_norm": 3.2438783645629883, + "learning_rate": 1.9353536425090026e-05, + "loss": 0.5634, + "step": 208000 + }, + { + "epoch": 1.838876217754911, + "grad_norm": 3.8390941619873047, + "learning_rate": 1.9352063037418154e-05, + "loss": 0.5358, + "step": 208010 + }, + { + "epoch": 1.838964621015223, + "grad_norm": 2.427786350250244, + "learning_rate": 1.9350589649746283e-05, + "loss": 0.5674, + "step": 208020 + }, + { + "epoch": 1.8390530242755352, + "grad_norm": 12.458891868591309, + "learning_rate": 1.9349116262074414e-05, + "loss": 0.6475, + "step": 208030 + }, + { + "epoch": 1.8391414275358475, + "grad_norm": 1.6757439374923706, + "learning_rate": 1.9347642874402543e-05, + "loss": 0.5057, + "step": 208040 + }, + { + "epoch": 1.8392298307961599, + "grad_norm": 3.8869035243988037, + "learning_rate": 1.934616948673067e-05, + "loss": 0.5077, + "step": 208050 + }, + { + "epoch": 1.839318234056472, + "grad_norm": 1.2677150964736938, + "learning_rate": 1.9344696099058803e-05, + "loss": 0.6241, + "step": 208060 + }, + { + "epoch": 1.839406637316784, + "grad_norm": 1.1207103729248047, + "learning_rate": 1.934322271138693e-05, + "loss": 0.5549, + "step": 208070 + }, + { + "epoch": 1.8394950405770965, + "grad_norm": 1.5252139568328857, + "learning_rate": 1.934174932371506e-05, + "loss": 0.575, + "step": 208080 + }, + { + "epoch": 1.8395834438374088, + "grad_norm": 2.096590042114258, + "learning_rate": 1.934027593604319e-05, + "loss": 0.4791, + "step": 208090 + }, + { + "epoch": 1.839671847097721, + "grad_norm": 11.018218040466309, + "learning_rate": 1.933880254837132e-05, + "loss": 0.5244, + "step": 208100 + }, + { + "epoch": 1.8397602503580333, + "grad_norm": 8.555094718933105, + "learning_rate": 1.9337329160699448e-05, + "loss": 0.5983, + "step": 208110 + }, + { + "epoch": 1.8398486536183456, + "grad_norm": 2.8810555934906006, + "learning_rate": 1.933585577302758e-05, + "loss": 0.591, + "step": 208120 + }, + { + "epoch": 1.8399370568786577, + "grad_norm": 3.159071207046509, + "learning_rate": 1.9334382385355708e-05, + "loss": 0.704, + "step": 208130 + }, + { + "epoch": 1.8400254601389698, + "grad_norm": 3.9344112873077393, + "learning_rate": 1.9332908997683836e-05, + "loss": 0.7072, + "step": 208140 + }, + { + "epoch": 1.8401138633992822, + "grad_norm": 1.484507441520691, + "learning_rate": 1.9331435610011965e-05, + "loss": 0.6429, + "step": 208150 + }, + { + "epoch": 1.8402022666595945, + "grad_norm": 2.2243213653564453, + "learning_rate": 1.9329962222340096e-05, + "loss": 0.4745, + "step": 208160 + }, + { + "epoch": 1.8402906699199066, + "grad_norm": 3.3667314052581787, + "learning_rate": 1.9328488834668225e-05, + "loss": 0.5799, + "step": 208170 + }, + { + "epoch": 1.8403790731802188, + "grad_norm": 2.562797784805298, + "learning_rate": 1.9327015446996353e-05, + "loss": 0.6137, + "step": 208180 + }, + { + "epoch": 1.840467476440531, + "grad_norm": 3.30324649810791, + "learning_rate": 1.9325542059324485e-05, + "loss": 0.6696, + "step": 208190 + }, + { + "epoch": 1.8405558797008434, + "grad_norm": 2.3900535106658936, + "learning_rate": 1.9324068671652613e-05, + "loss": 0.5436, + "step": 208200 + }, + { + "epoch": 1.8406442829611556, + "grad_norm": 1.4462683200836182, + "learning_rate": 1.932259528398074e-05, + "loss": 0.5414, + "step": 208210 + }, + { + "epoch": 1.8407326862214677, + "grad_norm": 3.147698402404785, + "learning_rate": 1.932112189630887e-05, + "loss": 0.7038, + "step": 208220 + }, + { + "epoch": 1.8408210894817802, + "grad_norm": 1.601162314414978, + "learning_rate": 1.9319648508637e-05, + "loss": 0.62, + "step": 208230 + }, + { + "epoch": 1.8409094927420924, + "grad_norm": 2.883354902267456, + "learning_rate": 1.931817512096513e-05, + "loss": 0.6798, + "step": 208240 + }, + { + "epoch": 1.8409978960024045, + "grad_norm": 1.8171663284301758, + "learning_rate": 1.931670173329326e-05, + "loss": 0.6214, + "step": 208250 + }, + { + "epoch": 1.8410862992627168, + "grad_norm": 7.812248706817627, + "learning_rate": 1.9315228345621387e-05, + "loss": 0.5539, + "step": 208260 + }, + { + "epoch": 1.8411747025230292, + "grad_norm": 6.320882320404053, + "learning_rate": 1.931375495794952e-05, + "loss": 0.6669, + "step": 208270 + }, + { + "epoch": 1.8412631057833413, + "grad_norm": 1.2448458671569824, + "learning_rate": 1.9312281570277647e-05, + "loss": 0.5023, + "step": 208280 + }, + { + "epoch": 1.8413515090436534, + "grad_norm": 8.441811561584473, + "learning_rate": 1.9310808182605775e-05, + "loss": 0.472, + "step": 208290 + }, + { + "epoch": 1.8414399123039658, + "grad_norm": 3.837329387664795, + "learning_rate": 1.9309334794933907e-05, + "loss": 0.549, + "step": 208300 + }, + { + "epoch": 1.841528315564278, + "grad_norm": 2.837460517883301, + "learning_rate": 1.9307861407262035e-05, + "loss": 0.6207, + "step": 208310 + }, + { + "epoch": 1.8416167188245902, + "grad_norm": 1.691310167312622, + "learning_rate": 1.9306388019590164e-05, + "loss": 0.5241, + "step": 208320 + }, + { + "epoch": 1.8417051220849023, + "grad_norm": 1.5407449007034302, + "learning_rate": 1.9304914631918292e-05, + "loss": 0.5545, + "step": 208330 + }, + { + "epoch": 1.8417935253452147, + "grad_norm": 9.079808235168457, + "learning_rate": 1.9303441244246424e-05, + "loss": 0.5403, + "step": 208340 + }, + { + "epoch": 1.841881928605527, + "grad_norm": 3.774881601333618, + "learning_rate": 1.9301967856574552e-05, + "loss": 0.5198, + "step": 208350 + }, + { + "epoch": 1.8419703318658391, + "grad_norm": 3.2473204135894775, + "learning_rate": 1.930049446890268e-05, + "loss": 0.5796, + "step": 208360 + }, + { + "epoch": 1.8420587351261515, + "grad_norm": 2.596795082092285, + "learning_rate": 1.9299021081230812e-05, + "loss": 0.6532, + "step": 208370 + }, + { + "epoch": 1.8421471383864638, + "grad_norm": 1.7869517803192139, + "learning_rate": 1.929754769355894e-05, + "loss": 0.6459, + "step": 208380 + }, + { + "epoch": 1.842235541646776, + "grad_norm": 1.2131072282791138, + "learning_rate": 1.929607430588707e-05, + "loss": 0.4164, + "step": 208390 + }, + { + "epoch": 1.842323944907088, + "grad_norm": 0.9625169038772583, + "learning_rate": 1.9294600918215197e-05, + "loss": 0.529, + "step": 208400 + }, + { + "epoch": 1.8424123481674004, + "grad_norm": 6.193233966827393, + "learning_rate": 1.929312753054333e-05, + "loss": 0.6566, + "step": 208410 + }, + { + "epoch": 1.8425007514277127, + "grad_norm": 3.986889600753784, + "learning_rate": 1.9291654142871457e-05, + "loss": 0.56, + "step": 208420 + }, + { + "epoch": 1.8425891546880249, + "grad_norm": 8.127528190612793, + "learning_rate": 1.9290180755199586e-05, + "loss": 0.546, + "step": 208430 + }, + { + "epoch": 1.842677557948337, + "grad_norm": 1.231967568397522, + "learning_rate": 1.9288707367527714e-05, + "loss": 0.492, + "step": 208440 + }, + { + "epoch": 1.8427659612086493, + "grad_norm": 5.981503486633301, + "learning_rate": 1.9287233979855846e-05, + "loss": 0.6479, + "step": 208450 + }, + { + "epoch": 1.8428543644689617, + "grad_norm": 9.92072868347168, + "learning_rate": 1.9285760592183974e-05, + "loss": 0.6576, + "step": 208460 + }, + { + "epoch": 1.8429427677292738, + "grad_norm": 2.594308853149414, + "learning_rate": 1.9284287204512102e-05, + "loss": 0.5538, + "step": 208470 + }, + { + "epoch": 1.8430311709895861, + "grad_norm": 2.1643831729888916, + "learning_rate": 1.9282813816840234e-05, + "loss": 0.7178, + "step": 208480 + }, + { + "epoch": 1.8431195742498985, + "grad_norm": 2.940063714981079, + "learning_rate": 1.9281340429168363e-05, + "loss": 0.5861, + "step": 208490 + }, + { + "epoch": 1.8432079775102106, + "grad_norm": 3.8470075130462646, + "learning_rate": 1.927986704149649e-05, + "loss": 0.5032, + "step": 208500 + }, + { + "epoch": 1.8432963807705227, + "grad_norm": 4.685696125030518, + "learning_rate": 1.927839365382462e-05, + "loss": 0.5485, + "step": 208510 + }, + { + "epoch": 1.843384784030835, + "grad_norm": 6.664670467376709, + "learning_rate": 1.927692026615275e-05, + "loss": 0.7401, + "step": 208520 + }, + { + "epoch": 1.8434731872911474, + "grad_norm": 3.396160840988159, + "learning_rate": 1.927544687848088e-05, + "loss": 0.4842, + "step": 208530 + }, + { + "epoch": 1.8435615905514595, + "grad_norm": 1.1683683395385742, + "learning_rate": 1.9273973490809008e-05, + "loss": 0.5451, + "step": 208540 + }, + { + "epoch": 1.8436499938117716, + "grad_norm": 6.246324062347412, + "learning_rate": 1.9272500103137136e-05, + "loss": 0.5874, + "step": 208550 + }, + { + "epoch": 1.843738397072084, + "grad_norm": 2.184767961502075, + "learning_rate": 1.9271026715465268e-05, + "loss": 0.5323, + "step": 208560 + }, + { + "epoch": 1.8438268003323963, + "grad_norm": 2.2570149898529053, + "learning_rate": 1.9269553327793396e-05, + "loss": 0.6178, + "step": 208570 + }, + { + "epoch": 1.8439152035927084, + "grad_norm": 9.028661727905273, + "learning_rate": 1.9268079940121525e-05, + "loss": 0.479, + "step": 208580 + }, + { + "epoch": 1.8440036068530208, + "grad_norm": 1.981824278831482, + "learning_rate": 1.9266606552449656e-05, + "loss": 0.7059, + "step": 208590 + }, + { + "epoch": 1.8440920101133331, + "grad_norm": 1.6644154787063599, + "learning_rate": 1.9265133164777785e-05, + "loss": 0.6297, + "step": 208600 + }, + { + "epoch": 1.8441804133736452, + "grad_norm": 2.061619520187378, + "learning_rate": 1.9263659777105913e-05, + "loss": 0.4998, + "step": 208610 + }, + { + "epoch": 1.8442688166339574, + "grad_norm": 2.6741526126861572, + "learning_rate": 1.926218638943404e-05, + "loss": 0.6671, + "step": 208620 + }, + { + "epoch": 1.8443572198942697, + "grad_norm": 3.8932957649230957, + "learning_rate": 1.9260713001762173e-05, + "loss": 0.5225, + "step": 208630 + }, + { + "epoch": 1.844445623154582, + "grad_norm": 2.195492744445801, + "learning_rate": 1.92592396140903e-05, + "loss": 0.6988, + "step": 208640 + }, + { + "epoch": 1.8445340264148942, + "grad_norm": 2.725295305252075, + "learning_rate": 1.925776622641843e-05, + "loss": 0.4874, + "step": 208650 + }, + { + "epoch": 1.8446224296752063, + "grad_norm": 4.1461100578308105, + "learning_rate": 1.925629283874656e-05, + "loss": 0.4211, + "step": 208660 + }, + { + "epoch": 1.8447108329355186, + "grad_norm": 4.067321300506592, + "learning_rate": 1.925481945107469e-05, + "loss": 0.6168, + "step": 208670 + }, + { + "epoch": 1.844799236195831, + "grad_norm": 2.246029853820801, + "learning_rate": 1.9253346063402818e-05, + "loss": 0.5897, + "step": 208680 + }, + { + "epoch": 1.844887639456143, + "grad_norm": 1.4188412427902222, + "learning_rate": 1.9251872675730947e-05, + "loss": 0.5161, + "step": 208690 + }, + { + "epoch": 1.8449760427164554, + "grad_norm": 1.4774802923202515, + "learning_rate": 1.925039928805908e-05, + "loss": 0.5898, + "step": 208700 + }, + { + "epoch": 1.8450644459767678, + "grad_norm": 1.8297480344772339, + "learning_rate": 1.9248925900387207e-05, + "loss": 0.4712, + "step": 208710 + }, + { + "epoch": 1.84515284923708, + "grad_norm": 2.1271655559539795, + "learning_rate": 1.9247452512715335e-05, + "loss": 0.4736, + "step": 208720 + }, + { + "epoch": 1.845241252497392, + "grad_norm": 3.366549253463745, + "learning_rate": 1.9245979125043463e-05, + "loss": 0.4887, + "step": 208730 + }, + { + "epoch": 1.8453296557577044, + "grad_norm": 6.425593376159668, + "learning_rate": 1.9244505737371595e-05, + "loss": 0.489, + "step": 208740 + }, + { + "epoch": 1.8454180590180167, + "grad_norm": 2.044172525405884, + "learning_rate": 1.9243032349699723e-05, + "loss": 0.6317, + "step": 208750 + }, + { + "epoch": 1.8455064622783288, + "grad_norm": 1.8842204809188843, + "learning_rate": 1.9241558962027852e-05, + "loss": 0.4849, + "step": 208760 + }, + { + "epoch": 1.845594865538641, + "grad_norm": 4.034019947052002, + "learning_rate": 1.9240085574355984e-05, + "loss": 0.6941, + "step": 208770 + }, + { + "epoch": 1.8456832687989533, + "grad_norm": 1.5998018980026245, + "learning_rate": 1.9238612186684112e-05, + "loss": 0.4624, + "step": 208780 + }, + { + "epoch": 1.8457716720592656, + "grad_norm": 1.3839768171310425, + "learning_rate": 1.923713879901224e-05, + "loss": 0.6336, + "step": 208790 + }, + { + "epoch": 1.8458600753195777, + "grad_norm": 14.206204414367676, + "learning_rate": 1.923566541134037e-05, + "loss": 0.6308, + "step": 208800 + }, + { + "epoch": 1.8459484785798899, + "grad_norm": 5.289737224578857, + "learning_rate": 1.92341920236685e-05, + "loss": 0.6465, + "step": 208810 + }, + { + "epoch": 1.8460368818402024, + "grad_norm": 5.853087425231934, + "learning_rate": 1.923271863599663e-05, + "loss": 0.6347, + "step": 208820 + }, + { + "epoch": 1.8461252851005145, + "grad_norm": 1.3876986503601074, + "learning_rate": 1.9231245248324757e-05, + "loss": 0.4464, + "step": 208830 + }, + { + "epoch": 1.8462136883608267, + "grad_norm": 1.8678480386734009, + "learning_rate": 1.922977186065289e-05, + "loss": 0.5254, + "step": 208840 + }, + { + "epoch": 1.846302091621139, + "grad_norm": 4.040596008300781, + "learning_rate": 1.9228298472981017e-05, + "loss": 0.6068, + "step": 208850 + }, + { + "epoch": 1.8463904948814513, + "grad_norm": 15.859367370605469, + "learning_rate": 1.9226825085309146e-05, + "loss": 0.573, + "step": 208860 + }, + { + "epoch": 1.8464788981417635, + "grad_norm": 2.2348434925079346, + "learning_rate": 1.9225351697637274e-05, + "loss": 0.7017, + "step": 208870 + }, + { + "epoch": 1.8465673014020756, + "grad_norm": 2.014366865158081, + "learning_rate": 1.9223878309965406e-05, + "loss": 0.6181, + "step": 208880 + }, + { + "epoch": 1.846655704662388, + "grad_norm": 2.2384707927703857, + "learning_rate": 1.9222404922293534e-05, + "loss": 0.6106, + "step": 208890 + }, + { + "epoch": 1.8467441079227003, + "grad_norm": 2.4331438541412354, + "learning_rate": 1.9220931534621662e-05, + "loss": 0.5193, + "step": 208900 + }, + { + "epoch": 1.8468325111830124, + "grad_norm": 0.9146058559417725, + "learning_rate": 1.9219458146949794e-05, + "loss": 0.448, + "step": 208910 + }, + { + "epoch": 1.8469209144433245, + "grad_norm": 1.7593638896942139, + "learning_rate": 1.9217984759277922e-05, + "loss": 0.7223, + "step": 208920 + }, + { + "epoch": 1.8470093177036369, + "grad_norm": 7.308826446533203, + "learning_rate": 1.921651137160605e-05, + "loss": 0.7488, + "step": 208930 + }, + { + "epoch": 1.8470977209639492, + "grad_norm": 3.9544637203216553, + "learning_rate": 1.9215037983934183e-05, + "loss": 0.6527, + "step": 208940 + }, + { + "epoch": 1.8471861242242613, + "grad_norm": 6.821430206298828, + "learning_rate": 1.921356459626231e-05, + "loss": 0.5304, + "step": 208950 + }, + { + "epoch": 1.8472745274845737, + "grad_norm": 3.524042844772339, + "learning_rate": 1.921209120859044e-05, + "loss": 0.7257, + "step": 208960 + }, + { + "epoch": 1.847362930744886, + "grad_norm": 3.3266422748565674, + "learning_rate": 1.921061782091857e-05, + "loss": 0.5185, + "step": 208970 + }, + { + "epoch": 1.8474513340051981, + "grad_norm": 7.193199634552002, + "learning_rate": 1.92091444332467e-05, + "loss": 0.4839, + "step": 208980 + }, + { + "epoch": 1.8475397372655102, + "grad_norm": 4.568263530731201, + "learning_rate": 1.9207671045574828e-05, + "loss": 0.5547, + "step": 208990 + }, + { + "epoch": 1.8476281405258226, + "grad_norm": 1.9425479173660278, + "learning_rate": 1.920619765790296e-05, + "loss": 0.5305, + "step": 209000 + }, + { + "epoch": 1.847716543786135, + "grad_norm": 8.574569702148438, + "learning_rate": 1.9204724270231088e-05, + "loss": 0.5875, + "step": 209010 + }, + { + "epoch": 1.847804947046447, + "grad_norm": 2.6623642444610596, + "learning_rate": 1.9203250882559216e-05, + "loss": 0.6051, + "step": 209020 + }, + { + "epoch": 1.8478933503067592, + "grad_norm": 21.262672424316406, + "learning_rate": 1.9201777494887348e-05, + "loss": 0.5521, + "step": 209030 + }, + { + "epoch": 1.8479817535670715, + "grad_norm": 4.10367488861084, + "learning_rate": 1.9200304107215476e-05, + "loss": 0.6743, + "step": 209040 + }, + { + "epoch": 1.8480701568273838, + "grad_norm": 2.1683733463287354, + "learning_rate": 1.9198830719543605e-05, + "loss": 0.6803, + "step": 209050 + }, + { + "epoch": 1.848158560087696, + "grad_norm": 2.451362371444702, + "learning_rate": 1.9197357331871736e-05, + "loss": 0.5931, + "step": 209060 + }, + { + "epoch": 1.8482469633480083, + "grad_norm": 1.2647017240524292, + "learning_rate": 1.9195883944199865e-05, + "loss": 0.5594, + "step": 209070 + }, + { + "epoch": 1.8483353666083207, + "grad_norm": 2.7430880069732666, + "learning_rate": 1.9194410556527993e-05, + "loss": 0.5378, + "step": 209080 + }, + { + "epoch": 1.8484237698686328, + "grad_norm": 3.733816623687744, + "learning_rate": 1.919293716885612e-05, + "loss": 0.5103, + "step": 209090 + }, + { + "epoch": 1.848512173128945, + "grad_norm": 2.061666488647461, + "learning_rate": 1.9191463781184253e-05, + "loss": 0.6082, + "step": 209100 + }, + { + "epoch": 1.8486005763892572, + "grad_norm": 1.3504884243011475, + "learning_rate": 1.918999039351238e-05, + "loss": 0.4425, + "step": 209110 + }, + { + "epoch": 1.8486889796495696, + "grad_norm": 1.0352938175201416, + "learning_rate": 1.918851700584051e-05, + "loss": 0.5305, + "step": 209120 + }, + { + "epoch": 1.8487773829098817, + "grad_norm": 3.7901957035064697, + "learning_rate": 1.918704361816864e-05, + "loss": 0.5637, + "step": 209130 + }, + { + "epoch": 1.8488657861701938, + "grad_norm": 3.6319451332092285, + "learning_rate": 1.918557023049677e-05, + "loss": 0.521, + "step": 209140 + }, + { + "epoch": 1.8489541894305062, + "grad_norm": 1.8487882614135742, + "learning_rate": 1.91840968428249e-05, + "loss": 0.6178, + "step": 209150 + }, + { + "epoch": 1.8490425926908185, + "grad_norm": 2.701673746109009, + "learning_rate": 1.9182623455153027e-05, + "loss": 0.5902, + "step": 209160 + }, + { + "epoch": 1.8491309959511306, + "grad_norm": 1.8205074071884155, + "learning_rate": 1.918115006748116e-05, + "loss": 0.6549, + "step": 209170 + }, + { + "epoch": 1.849219399211443, + "grad_norm": 2.135723352432251, + "learning_rate": 1.9179676679809287e-05, + "loss": 0.4657, + "step": 209180 + }, + { + "epoch": 1.8493078024717553, + "grad_norm": 2.154141664505005, + "learning_rate": 1.9178203292137415e-05, + "loss": 0.5167, + "step": 209190 + }, + { + "epoch": 1.8493962057320674, + "grad_norm": 2.4006640911102295, + "learning_rate": 1.9176729904465543e-05, + "loss": 0.5883, + "step": 209200 + }, + { + "epoch": 1.8494846089923795, + "grad_norm": 3.1476542949676514, + "learning_rate": 1.9175256516793675e-05, + "loss": 0.5373, + "step": 209210 + }, + { + "epoch": 1.8495730122526919, + "grad_norm": 2.0557191371917725, + "learning_rate": 1.9173783129121804e-05, + "loss": 0.5275, + "step": 209220 + }, + { + "epoch": 1.8496614155130042, + "grad_norm": 2.2931723594665527, + "learning_rate": 1.9172309741449932e-05, + "loss": 0.6184, + "step": 209230 + }, + { + "epoch": 1.8497498187733163, + "grad_norm": 4.016658782958984, + "learning_rate": 1.9170836353778064e-05, + "loss": 0.6319, + "step": 209240 + }, + { + "epoch": 1.8498382220336285, + "grad_norm": 1.1208021640777588, + "learning_rate": 1.9169362966106192e-05, + "loss": 0.5133, + "step": 209250 + }, + { + "epoch": 1.8499266252939408, + "grad_norm": 1.048937439918518, + "learning_rate": 1.916788957843432e-05, + "loss": 0.6012, + "step": 209260 + }, + { + "epoch": 1.8500150285542531, + "grad_norm": 1.1958234310150146, + "learning_rate": 1.916641619076245e-05, + "loss": 0.6033, + "step": 209270 + }, + { + "epoch": 1.8501034318145653, + "grad_norm": 19.80698013305664, + "learning_rate": 1.916494280309058e-05, + "loss": 0.5972, + "step": 209280 + }, + { + "epoch": 1.8501918350748776, + "grad_norm": 2.8051559925079346, + "learning_rate": 1.916346941541871e-05, + "loss": 0.592, + "step": 209290 + }, + { + "epoch": 1.85028023833519, + "grad_norm": 1.0027694702148438, + "learning_rate": 1.9161996027746837e-05, + "loss": 0.5449, + "step": 209300 + }, + { + "epoch": 1.850368641595502, + "grad_norm": 11.03569221496582, + "learning_rate": 1.916052264007497e-05, + "loss": 0.5831, + "step": 209310 + }, + { + "epoch": 1.8504570448558142, + "grad_norm": 2.9402170181274414, + "learning_rate": 1.9159049252403097e-05, + "loss": 0.6162, + "step": 209320 + }, + { + "epoch": 1.8505454481161265, + "grad_norm": 10.135941505432129, + "learning_rate": 1.9157575864731226e-05, + "loss": 0.6953, + "step": 209330 + }, + { + "epoch": 1.8506338513764389, + "grad_norm": 13.804825782775879, + "learning_rate": 1.9156102477059354e-05, + "loss": 0.514, + "step": 209340 + }, + { + "epoch": 1.850722254636751, + "grad_norm": 2.008164644241333, + "learning_rate": 1.9154629089387486e-05, + "loss": 0.4948, + "step": 209350 + }, + { + "epoch": 1.8508106578970631, + "grad_norm": 1.5638909339904785, + "learning_rate": 1.9153155701715614e-05, + "loss": 0.6244, + "step": 209360 + }, + { + "epoch": 1.8508990611573755, + "grad_norm": 4.650651931762695, + "learning_rate": 1.9151682314043742e-05, + "loss": 0.677, + "step": 209370 + }, + { + "epoch": 1.8509874644176878, + "grad_norm": 1.0625505447387695, + "learning_rate": 1.915020892637187e-05, + "loss": 0.5537, + "step": 209380 + }, + { + "epoch": 1.851075867678, + "grad_norm": 1.3145408630371094, + "learning_rate": 1.9148735538700003e-05, + "loss": 0.5965, + "step": 209390 + }, + { + "epoch": 1.851164270938312, + "grad_norm": 1.9740639925003052, + "learning_rate": 1.914726215102813e-05, + "loss": 0.5444, + "step": 209400 + }, + { + "epoch": 1.8512526741986246, + "grad_norm": 2.028446912765503, + "learning_rate": 1.914578876335626e-05, + "loss": 0.6332, + "step": 209410 + }, + { + "epoch": 1.8513410774589367, + "grad_norm": 2.316175699234009, + "learning_rate": 1.914431537568439e-05, + "loss": 0.6294, + "step": 209420 + }, + { + "epoch": 1.8514294807192488, + "grad_norm": 3.143463373184204, + "learning_rate": 1.914284198801252e-05, + "loss": 0.5261, + "step": 209430 + }, + { + "epoch": 1.8515178839795612, + "grad_norm": 2.866745710372925, + "learning_rate": 1.9141368600340648e-05, + "loss": 0.4925, + "step": 209440 + }, + { + "epoch": 1.8516062872398735, + "grad_norm": 4.782702445983887, + "learning_rate": 1.9139895212668776e-05, + "loss": 0.68, + "step": 209450 + }, + { + "epoch": 1.8516946905001856, + "grad_norm": 6.505615234375, + "learning_rate": 1.9138421824996908e-05, + "loss": 0.6842, + "step": 209460 + }, + { + "epoch": 1.8517830937604978, + "grad_norm": 0.8906887769699097, + "learning_rate": 1.9136948437325036e-05, + "loss": 0.5339, + "step": 209470 + }, + { + "epoch": 1.85187149702081, + "grad_norm": 1.8407458066940308, + "learning_rate": 1.9135475049653164e-05, + "loss": 0.6347, + "step": 209480 + }, + { + "epoch": 1.8519599002811225, + "grad_norm": 3.4316773414611816, + "learning_rate": 1.9134001661981293e-05, + "loss": 0.4858, + "step": 209490 + }, + { + "epoch": 1.8520483035414346, + "grad_norm": 2.7154715061187744, + "learning_rate": 1.9132528274309425e-05, + "loss": 0.7528, + "step": 209500 + }, + { + "epoch": 1.8521367068017467, + "grad_norm": 2.3810553550720215, + "learning_rate": 1.9131054886637553e-05, + "loss": 0.675, + "step": 209510 + }, + { + "epoch": 1.8522251100620593, + "grad_norm": 2.987447500228882, + "learning_rate": 1.912958149896568e-05, + "loss": 0.6207, + "step": 209520 + }, + { + "epoch": 1.8523135133223714, + "grad_norm": 1.906565546989441, + "learning_rate": 1.9128108111293813e-05, + "loss": 0.5548, + "step": 209530 + }, + { + "epoch": 1.8524019165826835, + "grad_norm": 1.5837750434875488, + "learning_rate": 1.912663472362194e-05, + "loss": 0.4597, + "step": 209540 + }, + { + "epoch": 1.8524903198429958, + "grad_norm": 4.482527732849121, + "learning_rate": 1.912516133595007e-05, + "loss": 0.6536, + "step": 209550 + }, + { + "epoch": 1.8525787231033082, + "grad_norm": 1.0739617347717285, + "learning_rate": 1.9123687948278198e-05, + "loss": 0.5747, + "step": 209560 + }, + { + "epoch": 1.8526671263636203, + "grad_norm": 3.541281223297119, + "learning_rate": 1.912221456060633e-05, + "loss": 0.6488, + "step": 209570 + }, + { + "epoch": 1.8527555296239324, + "grad_norm": 2.5793895721435547, + "learning_rate": 1.9120741172934458e-05, + "loss": 0.5013, + "step": 209580 + }, + { + "epoch": 1.8528439328842448, + "grad_norm": 1.0419985055923462, + "learning_rate": 1.9119267785262587e-05, + "loss": 0.656, + "step": 209590 + }, + { + "epoch": 1.852932336144557, + "grad_norm": 4.573622226715088, + "learning_rate": 1.9117794397590718e-05, + "loss": 0.6409, + "step": 209600 + }, + { + "epoch": 1.8530207394048692, + "grad_norm": 2.8321332931518555, + "learning_rate": 1.9116321009918847e-05, + "loss": 0.5855, + "step": 209610 + }, + { + "epoch": 1.8531091426651813, + "grad_norm": 1.3490846157073975, + "learning_rate": 1.9114847622246975e-05, + "loss": 0.4741, + "step": 209620 + }, + { + "epoch": 1.8531975459254937, + "grad_norm": 2.3741469383239746, + "learning_rate": 1.9113374234575103e-05, + "loss": 0.5789, + "step": 209630 + }, + { + "epoch": 1.853285949185806, + "grad_norm": 4.7944841384887695, + "learning_rate": 1.9111900846903235e-05, + "loss": 0.4586, + "step": 209640 + }, + { + "epoch": 1.8533743524461181, + "grad_norm": 1.7278589010238647, + "learning_rate": 1.9110427459231363e-05, + "loss": 0.4589, + "step": 209650 + }, + { + "epoch": 1.8534627557064305, + "grad_norm": 2.2020726203918457, + "learning_rate": 1.9108954071559492e-05, + "loss": 0.6328, + "step": 209660 + }, + { + "epoch": 1.8535511589667428, + "grad_norm": 15.768167495727539, + "learning_rate": 1.910748068388762e-05, + "loss": 0.6843, + "step": 209670 + }, + { + "epoch": 1.853639562227055, + "grad_norm": 4.236842155456543, + "learning_rate": 1.9106007296215752e-05, + "loss": 0.5868, + "step": 209680 + }, + { + "epoch": 1.853727965487367, + "grad_norm": 4.5089640617370605, + "learning_rate": 1.910453390854388e-05, + "loss": 0.4748, + "step": 209690 + }, + { + "epoch": 1.8538163687476794, + "grad_norm": 3.02734112739563, + "learning_rate": 1.910306052087201e-05, + "loss": 0.6417, + "step": 209700 + }, + { + "epoch": 1.8539047720079918, + "grad_norm": 2.187847375869751, + "learning_rate": 1.910158713320014e-05, + "loss": 0.6094, + "step": 209710 + }, + { + "epoch": 1.8539931752683039, + "grad_norm": 2.2535903453826904, + "learning_rate": 1.910011374552827e-05, + "loss": 0.6177, + "step": 209720 + }, + { + "epoch": 1.854081578528616, + "grad_norm": 1.3434786796569824, + "learning_rate": 1.9098640357856397e-05, + "loss": 0.4544, + "step": 209730 + }, + { + "epoch": 1.8541699817889283, + "grad_norm": 16.987985610961914, + "learning_rate": 1.9097166970184525e-05, + "loss": 0.4421, + "step": 209740 + }, + { + "epoch": 1.8542583850492407, + "grad_norm": 1.3285064697265625, + "learning_rate": 1.9095693582512657e-05, + "loss": 0.5307, + "step": 209750 + }, + { + "epoch": 1.8543467883095528, + "grad_norm": 5.595705986022949, + "learning_rate": 1.9094220194840786e-05, + "loss": 0.5456, + "step": 209760 + }, + { + "epoch": 1.8544351915698651, + "grad_norm": 13.066436767578125, + "learning_rate": 1.9092746807168914e-05, + "loss": 0.4763, + "step": 209770 + }, + { + "epoch": 1.8545235948301775, + "grad_norm": 4.322331428527832, + "learning_rate": 1.9091273419497046e-05, + "loss": 0.4579, + "step": 209780 + }, + { + "epoch": 1.8546119980904896, + "grad_norm": 2.9211742877960205, + "learning_rate": 1.9089800031825174e-05, + "loss": 0.7345, + "step": 209790 + }, + { + "epoch": 1.8547004013508017, + "grad_norm": 2.2567601203918457, + "learning_rate": 1.9088326644153302e-05, + "loss": 0.6078, + "step": 209800 + }, + { + "epoch": 1.854788804611114, + "grad_norm": 3.767890453338623, + "learning_rate": 1.908685325648143e-05, + "loss": 0.7, + "step": 209810 + }, + { + "epoch": 1.8548772078714264, + "grad_norm": 2.635831117630005, + "learning_rate": 1.9085379868809562e-05, + "loss": 0.6286, + "step": 209820 + }, + { + "epoch": 1.8549656111317385, + "grad_norm": 2.658792734146118, + "learning_rate": 1.908390648113769e-05, + "loss": 0.6185, + "step": 209830 + }, + { + "epoch": 1.8550540143920506, + "grad_norm": 2.8339874744415283, + "learning_rate": 1.908243309346582e-05, + "loss": 0.6188, + "step": 209840 + }, + { + "epoch": 1.855142417652363, + "grad_norm": 1.5091569423675537, + "learning_rate": 1.908095970579395e-05, + "loss": 0.6727, + "step": 209850 + }, + { + "epoch": 1.8552308209126753, + "grad_norm": 2.574605941772461, + "learning_rate": 1.907948631812208e-05, + "loss": 0.6069, + "step": 209860 + }, + { + "epoch": 1.8553192241729874, + "grad_norm": 7.200460910797119, + "learning_rate": 1.9078012930450208e-05, + "loss": 0.5838, + "step": 209870 + }, + { + "epoch": 1.8554076274332998, + "grad_norm": 3.098416805267334, + "learning_rate": 1.907653954277834e-05, + "loss": 0.6059, + "step": 209880 + }, + { + "epoch": 1.8554960306936121, + "grad_norm": 3.291194438934326, + "learning_rate": 1.9075066155106468e-05, + "loss": 0.6237, + "step": 209890 + }, + { + "epoch": 1.8555844339539243, + "grad_norm": 1.4947134256362915, + "learning_rate": 1.9073592767434596e-05, + "loss": 0.5487, + "step": 209900 + }, + { + "epoch": 1.8556728372142364, + "grad_norm": 1.5824846029281616, + "learning_rate": 1.9072119379762728e-05, + "loss": 0.6922, + "step": 209910 + }, + { + "epoch": 1.8557612404745487, + "grad_norm": 2.8694868087768555, + "learning_rate": 1.9070645992090856e-05, + "loss": 0.449, + "step": 209920 + }, + { + "epoch": 1.855849643734861, + "grad_norm": 2.240093946456909, + "learning_rate": 1.9069172604418984e-05, + "loss": 0.6713, + "step": 209930 + }, + { + "epoch": 1.8559380469951732, + "grad_norm": 4.451633453369141, + "learning_rate": 1.9067699216747116e-05, + "loss": 0.4802, + "step": 209940 + }, + { + "epoch": 1.8560264502554853, + "grad_norm": 2.2405810356140137, + "learning_rate": 1.9066225829075245e-05, + "loss": 0.6286, + "step": 209950 + }, + { + "epoch": 1.8561148535157976, + "grad_norm": 2.9434351921081543, + "learning_rate": 1.9064752441403373e-05, + "loss": 0.6735, + "step": 209960 + }, + { + "epoch": 1.85620325677611, + "grad_norm": 9.438030242919922, + "learning_rate": 1.9063279053731505e-05, + "loss": 0.5048, + "step": 209970 + }, + { + "epoch": 1.856291660036422, + "grad_norm": 0.9189260601997375, + "learning_rate": 1.9061805666059633e-05, + "loss": 0.6372, + "step": 209980 + }, + { + "epoch": 1.8563800632967342, + "grad_norm": 2.6689350605010986, + "learning_rate": 1.906033227838776e-05, + "loss": 0.5439, + "step": 209990 + }, + { + "epoch": 1.8564684665570468, + "grad_norm": 2.3570046424865723, + "learning_rate": 1.9058858890715893e-05, + "loss": 0.7371, + "step": 210000 + }, + { + "epoch": 1.856556869817359, + "grad_norm": 3.1186368465423584, + "learning_rate": 1.905738550304402e-05, + "loss": 0.4917, + "step": 210010 + }, + { + "epoch": 1.856645273077671, + "grad_norm": 1.5461078882217407, + "learning_rate": 1.905591211537215e-05, + "loss": 0.6838, + "step": 210020 + }, + { + "epoch": 1.8567336763379834, + "grad_norm": 0.710598349571228, + "learning_rate": 1.9054438727700278e-05, + "loss": 0.5545, + "step": 210030 + }, + { + "epoch": 1.8568220795982957, + "grad_norm": 9.732213973999023, + "learning_rate": 1.905296534002841e-05, + "loss": 0.6883, + "step": 210040 + }, + { + "epoch": 1.8569104828586078, + "grad_norm": 1.8715342283248901, + "learning_rate": 1.9051491952356538e-05, + "loss": 0.5413, + "step": 210050 + }, + { + "epoch": 1.85699888611892, + "grad_norm": 7.001986026763916, + "learning_rate": 1.9050018564684667e-05, + "loss": 0.5849, + "step": 210060 + }, + { + "epoch": 1.8570872893792323, + "grad_norm": 2.913346290588379, + "learning_rate": 1.90485451770128e-05, + "loss": 0.5106, + "step": 210070 + }, + { + "epoch": 1.8571756926395446, + "grad_norm": 2.3131251335144043, + "learning_rate": 1.9047071789340927e-05, + "loss": 0.5835, + "step": 210080 + }, + { + "epoch": 1.8572640958998567, + "grad_norm": 5.071016788482666, + "learning_rate": 1.9045598401669055e-05, + "loss": 0.645, + "step": 210090 + }, + { + "epoch": 1.8573524991601689, + "grad_norm": 2.1975395679473877, + "learning_rate": 1.9044125013997183e-05, + "loss": 0.4592, + "step": 210100 + }, + { + "epoch": 1.8574409024204814, + "grad_norm": 1.8032022714614868, + "learning_rate": 1.9042651626325315e-05, + "loss": 0.7926, + "step": 210110 + }, + { + "epoch": 1.8575293056807936, + "grad_norm": 3.4323999881744385, + "learning_rate": 1.9041178238653444e-05, + "loss": 0.5741, + "step": 210120 + }, + { + "epoch": 1.8576177089411057, + "grad_norm": 2.3246331214904785, + "learning_rate": 1.9039704850981572e-05, + "loss": 0.5563, + "step": 210130 + }, + { + "epoch": 1.857706112201418, + "grad_norm": 2.060991048812866, + "learning_rate": 1.90382314633097e-05, + "loss": 0.544, + "step": 210140 + }, + { + "epoch": 1.8577945154617304, + "grad_norm": 4.852057456970215, + "learning_rate": 1.9036758075637832e-05, + "loss": 0.6653, + "step": 210150 + }, + { + "epoch": 1.8578829187220425, + "grad_norm": 27.585124969482422, + "learning_rate": 1.903528468796596e-05, + "loss": 0.6218, + "step": 210160 + }, + { + "epoch": 1.8579713219823546, + "grad_norm": 4.210274696350098, + "learning_rate": 1.903381130029409e-05, + "loss": 0.5708, + "step": 210170 + }, + { + "epoch": 1.858059725242667, + "grad_norm": 1.9030274152755737, + "learning_rate": 1.903233791262222e-05, + "loss": 0.4775, + "step": 210180 + }, + { + "epoch": 1.8581481285029793, + "grad_norm": 6.925843715667725, + "learning_rate": 1.903086452495035e-05, + "loss": 0.6747, + "step": 210190 + }, + { + "epoch": 1.8582365317632914, + "grad_norm": 1.128814935684204, + "learning_rate": 1.9029391137278477e-05, + "loss": 0.5575, + "step": 210200 + }, + { + "epoch": 1.8583249350236035, + "grad_norm": 3.0503857135772705, + "learning_rate": 1.9027917749606605e-05, + "loss": 0.6345, + "step": 210210 + }, + { + "epoch": 1.8584133382839159, + "grad_norm": 11.362192153930664, + "learning_rate": 1.9026444361934737e-05, + "loss": 0.5727, + "step": 210220 + }, + { + "epoch": 1.8585017415442282, + "grad_norm": 2.16227388381958, + "learning_rate": 1.9024970974262866e-05, + "loss": 0.4944, + "step": 210230 + }, + { + "epoch": 1.8585901448045403, + "grad_norm": 3.194079875946045, + "learning_rate": 1.9023497586590994e-05, + "loss": 0.644, + "step": 210240 + }, + { + "epoch": 1.8586785480648527, + "grad_norm": 1.760561466217041, + "learning_rate": 1.9022024198919126e-05, + "loss": 0.6066, + "step": 210250 + }, + { + "epoch": 1.858766951325165, + "grad_norm": 1.222104787826538, + "learning_rate": 1.9020550811247254e-05, + "loss": 0.5032, + "step": 210260 + }, + { + "epoch": 1.8588553545854771, + "grad_norm": 1.9924540519714355, + "learning_rate": 1.9019077423575382e-05, + "loss": 0.4528, + "step": 210270 + }, + { + "epoch": 1.8589437578457892, + "grad_norm": 1.879380702972412, + "learning_rate": 1.901760403590351e-05, + "loss": 0.6124, + "step": 210280 + }, + { + "epoch": 1.8590321611061016, + "grad_norm": 1.26064932346344, + "learning_rate": 1.9016130648231642e-05, + "loss": 0.5068, + "step": 210290 + }, + { + "epoch": 1.859120564366414, + "grad_norm": 1.4377273321151733, + "learning_rate": 1.901465726055977e-05, + "loss": 0.7046, + "step": 210300 + }, + { + "epoch": 1.859208967626726, + "grad_norm": 1.9383795261383057, + "learning_rate": 1.90131838728879e-05, + "loss": 0.6046, + "step": 210310 + }, + { + "epoch": 1.8592973708870382, + "grad_norm": 4.310141086578369, + "learning_rate": 1.9011710485216028e-05, + "loss": 0.6647, + "step": 210320 + }, + { + "epoch": 1.8593857741473505, + "grad_norm": 1.1649905443191528, + "learning_rate": 1.901023709754416e-05, + "loss": 0.5247, + "step": 210330 + }, + { + "epoch": 1.8594741774076629, + "grad_norm": 2.907787322998047, + "learning_rate": 1.9008763709872288e-05, + "loss": 0.5539, + "step": 210340 + }, + { + "epoch": 1.859562580667975, + "grad_norm": 4.768134593963623, + "learning_rate": 1.9007290322200416e-05, + "loss": 0.5305, + "step": 210350 + }, + { + "epoch": 1.8596509839282873, + "grad_norm": 2.1298699378967285, + "learning_rate": 1.9005816934528548e-05, + "loss": 0.6477, + "step": 210360 + }, + { + "epoch": 1.8597393871885997, + "grad_norm": 5.800304889678955, + "learning_rate": 1.9004343546856676e-05, + "loss": 0.568, + "step": 210370 + }, + { + "epoch": 1.8598277904489118, + "grad_norm": 4.090860366821289, + "learning_rate": 1.9002870159184804e-05, + "loss": 0.6223, + "step": 210380 + }, + { + "epoch": 1.859916193709224, + "grad_norm": 4.281871795654297, + "learning_rate": 1.9001396771512933e-05, + "loss": 0.5283, + "step": 210390 + }, + { + "epoch": 1.8600045969695362, + "grad_norm": 2.082164764404297, + "learning_rate": 1.8999923383841065e-05, + "loss": 0.7171, + "step": 210400 + }, + { + "epoch": 1.8600930002298486, + "grad_norm": 2.0262606143951416, + "learning_rate": 1.8998449996169193e-05, + "loss": 0.5147, + "step": 210410 + }, + { + "epoch": 1.8601814034901607, + "grad_norm": 6.9588470458984375, + "learning_rate": 1.899697660849732e-05, + "loss": 0.5693, + "step": 210420 + }, + { + "epoch": 1.8602698067504728, + "grad_norm": 0.828840970993042, + "learning_rate": 1.8995503220825453e-05, + "loss": 0.4583, + "step": 210430 + }, + { + "epoch": 1.8603582100107852, + "grad_norm": 1.6092866659164429, + "learning_rate": 1.899402983315358e-05, + "loss": 0.592, + "step": 210440 + }, + { + "epoch": 1.8604466132710975, + "grad_norm": 1.7956653833389282, + "learning_rate": 1.899255644548171e-05, + "loss": 0.6539, + "step": 210450 + }, + { + "epoch": 1.8605350165314096, + "grad_norm": 18.42283821105957, + "learning_rate": 1.8991083057809838e-05, + "loss": 0.5401, + "step": 210460 + }, + { + "epoch": 1.860623419791722, + "grad_norm": 0.954200804233551, + "learning_rate": 1.898960967013797e-05, + "loss": 0.5161, + "step": 210470 + }, + { + "epoch": 1.8607118230520343, + "grad_norm": 1.5571684837341309, + "learning_rate": 1.8988136282466098e-05, + "loss": 0.6886, + "step": 210480 + }, + { + "epoch": 1.8608002263123464, + "grad_norm": 2.7032861709594727, + "learning_rate": 1.8986662894794226e-05, + "loss": 0.5252, + "step": 210490 + }, + { + "epoch": 1.8608886295726585, + "grad_norm": 2.116373300552368, + "learning_rate": 1.8985189507122355e-05, + "loss": 0.5481, + "step": 210500 + }, + { + "epoch": 1.860977032832971, + "grad_norm": 2.9152045249938965, + "learning_rate": 1.8983716119450487e-05, + "loss": 0.4912, + "step": 210510 + }, + { + "epoch": 1.8610654360932832, + "grad_norm": 1.7530553340911865, + "learning_rate": 1.8982242731778615e-05, + "loss": 0.5536, + "step": 210520 + }, + { + "epoch": 1.8611538393535954, + "grad_norm": 2.242582321166992, + "learning_rate": 1.8980769344106743e-05, + "loss": 0.576, + "step": 210530 + }, + { + "epoch": 1.8612422426139075, + "grad_norm": 1.1391644477844238, + "learning_rate": 1.8979295956434875e-05, + "loss": 0.5832, + "step": 210540 + }, + { + "epoch": 1.8613306458742198, + "grad_norm": 2.775040864944458, + "learning_rate": 1.8977822568763003e-05, + "loss": 0.5738, + "step": 210550 + }, + { + "epoch": 1.8614190491345322, + "grad_norm": 6.068946361541748, + "learning_rate": 1.8976349181091132e-05, + "loss": 0.6366, + "step": 210560 + }, + { + "epoch": 1.8615074523948443, + "grad_norm": 0.9730151891708374, + "learning_rate": 1.897487579341926e-05, + "loss": 0.5877, + "step": 210570 + }, + { + "epoch": 1.8615958556551566, + "grad_norm": 8.074066162109375, + "learning_rate": 1.8973402405747392e-05, + "loss": 0.6844, + "step": 210580 + }, + { + "epoch": 1.861684258915469, + "grad_norm": 3.2558224201202393, + "learning_rate": 1.897192901807552e-05, + "loss": 0.5831, + "step": 210590 + }, + { + "epoch": 1.861772662175781, + "grad_norm": 3.300506114959717, + "learning_rate": 1.897045563040365e-05, + "loss": 0.6114, + "step": 210600 + }, + { + "epoch": 1.8618610654360932, + "grad_norm": 5.034607887268066, + "learning_rate": 1.8968982242731777e-05, + "loss": 0.52, + "step": 210610 + }, + { + "epoch": 1.8619494686964055, + "grad_norm": 1.6265817880630493, + "learning_rate": 1.896750885505991e-05, + "loss": 0.5916, + "step": 210620 + }, + { + "epoch": 1.8620378719567179, + "grad_norm": 3.206765651702881, + "learning_rate": 1.8966035467388037e-05, + "loss": 0.5087, + "step": 210630 + }, + { + "epoch": 1.86212627521703, + "grad_norm": 2.3355016708374023, + "learning_rate": 1.8964562079716165e-05, + "loss": 0.6369, + "step": 210640 + }, + { + "epoch": 1.8622146784773421, + "grad_norm": 1.6999640464782715, + "learning_rate": 1.8963088692044297e-05, + "loss": 0.6797, + "step": 210650 + }, + { + "epoch": 1.8623030817376545, + "grad_norm": 1.7024785280227661, + "learning_rate": 1.8961615304372425e-05, + "loss": 0.5105, + "step": 210660 + }, + { + "epoch": 1.8623914849979668, + "grad_norm": 0.689441978931427, + "learning_rate": 1.8960141916700554e-05, + "loss": 0.5794, + "step": 210670 + }, + { + "epoch": 1.862479888258279, + "grad_norm": 1.237181305885315, + "learning_rate": 1.8958668529028682e-05, + "loss": 0.4893, + "step": 210680 + }, + { + "epoch": 1.862568291518591, + "grad_norm": 2.740414619445801, + "learning_rate": 1.8957195141356814e-05, + "loss": 0.7547, + "step": 210690 + }, + { + "epoch": 1.8626566947789036, + "grad_norm": 2.399672031402588, + "learning_rate": 1.8955721753684942e-05, + "loss": 0.6757, + "step": 210700 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 10.054362297058105, + "learning_rate": 1.895424836601307e-05, + "loss": 0.6307, + "step": 210710 + }, + { + "epoch": 1.8628335012995278, + "grad_norm": 3.8515055179595947, + "learning_rate": 1.8952774978341202e-05, + "loss": 0.5937, + "step": 210720 + }, + { + "epoch": 1.8629219045598402, + "grad_norm": 3.577775716781616, + "learning_rate": 1.895130159066933e-05, + "loss": 0.5866, + "step": 210730 + }, + { + "epoch": 1.8630103078201525, + "grad_norm": 14.429736137390137, + "learning_rate": 1.894982820299746e-05, + "loss": 0.7103, + "step": 210740 + }, + { + "epoch": 1.8630987110804647, + "grad_norm": 2.594417095184326, + "learning_rate": 1.8948354815325587e-05, + "loss": 0.5822, + "step": 210750 + }, + { + "epoch": 1.8631871143407768, + "grad_norm": 1.7685571908950806, + "learning_rate": 1.894688142765372e-05, + "loss": 0.5286, + "step": 210760 + }, + { + "epoch": 1.8632755176010891, + "grad_norm": 2.778186559677124, + "learning_rate": 1.8945408039981848e-05, + "loss": 0.4858, + "step": 210770 + }, + { + "epoch": 1.8633639208614015, + "grad_norm": 2.275498628616333, + "learning_rate": 1.8943934652309976e-05, + "loss": 0.4612, + "step": 210780 + }, + { + "epoch": 1.8634523241217136, + "grad_norm": 5.5841779708862305, + "learning_rate": 1.8942461264638108e-05, + "loss": 0.6562, + "step": 210790 + }, + { + "epoch": 1.8635407273820257, + "grad_norm": 2.1666529178619385, + "learning_rate": 1.8940987876966236e-05, + "loss": 0.5021, + "step": 210800 + }, + { + "epoch": 1.863629130642338, + "grad_norm": 5.69851541519165, + "learning_rate": 1.8939514489294364e-05, + "loss": 0.5039, + "step": 210810 + }, + { + "epoch": 1.8637175339026504, + "grad_norm": 7.253466606140137, + "learning_rate": 1.8938041101622496e-05, + "loss": 0.6435, + "step": 210820 + }, + { + "epoch": 1.8638059371629625, + "grad_norm": 1.8968863487243652, + "learning_rate": 1.8936567713950624e-05, + "loss": 0.5547, + "step": 210830 + }, + { + "epoch": 1.8638943404232748, + "grad_norm": 3.991114616394043, + "learning_rate": 1.8935094326278753e-05, + "loss": 0.542, + "step": 210840 + }, + { + "epoch": 1.8639827436835872, + "grad_norm": 2.539324998855591, + "learning_rate": 1.8933620938606885e-05, + "loss": 0.6605, + "step": 210850 + }, + { + "epoch": 1.8640711469438993, + "grad_norm": 1.6203175783157349, + "learning_rate": 1.8932147550935013e-05, + "loss": 0.5179, + "step": 210860 + }, + { + "epoch": 1.8641595502042114, + "grad_norm": 5.415097713470459, + "learning_rate": 1.8930674163263145e-05, + "loss": 0.5718, + "step": 210870 + }, + { + "epoch": 1.8642479534645238, + "grad_norm": 3.519313335418701, + "learning_rate": 1.8929200775591273e-05, + "loss": 0.5557, + "step": 210880 + }, + { + "epoch": 1.864336356724836, + "grad_norm": 14.324116706848145, + "learning_rate": 1.89277273879194e-05, + "loss": 0.6179, + "step": 210890 + }, + { + "epoch": 1.8644247599851482, + "grad_norm": 2.7402307987213135, + "learning_rate": 1.8926254000247533e-05, + "loss": 0.62, + "step": 210900 + }, + { + "epoch": 1.8645131632454603, + "grad_norm": 1.7623101472854614, + "learning_rate": 1.892478061257566e-05, + "loss": 0.6093, + "step": 210910 + }, + { + "epoch": 1.8646015665057727, + "grad_norm": 1.7904170751571655, + "learning_rate": 1.892330722490379e-05, + "loss": 0.6941, + "step": 210920 + }, + { + "epoch": 1.864689969766085, + "grad_norm": 2.7217648029327393, + "learning_rate": 1.8921833837231918e-05, + "loss": 0.569, + "step": 210930 + }, + { + "epoch": 1.8647783730263972, + "grad_norm": 3.0506932735443115, + "learning_rate": 1.892036044956005e-05, + "loss": 0.6003, + "step": 210940 + }, + { + "epoch": 1.8648667762867095, + "grad_norm": 4.51693058013916, + "learning_rate": 1.8918887061888178e-05, + "loss": 0.7066, + "step": 210950 + }, + { + "epoch": 1.8649551795470218, + "grad_norm": 0.9649724960327148, + "learning_rate": 1.8917413674216307e-05, + "loss": 0.526, + "step": 210960 + }, + { + "epoch": 1.865043582807334, + "grad_norm": 5.553628444671631, + "learning_rate": 1.8915940286544435e-05, + "loss": 0.6509, + "step": 210970 + }, + { + "epoch": 1.865131986067646, + "grad_norm": 3.0489587783813477, + "learning_rate": 1.8914466898872567e-05, + "loss": 0.6754, + "step": 210980 + }, + { + "epoch": 1.8652203893279584, + "grad_norm": 10.719182014465332, + "learning_rate": 1.8912993511200695e-05, + "loss": 0.6287, + "step": 210990 + }, + { + "epoch": 1.8653087925882708, + "grad_norm": 5.158768653869629, + "learning_rate": 1.8911520123528823e-05, + "loss": 0.5916, + "step": 211000 + }, + { + "epoch": 1.8653971958485829, + "grad_norm": 5.014809608459473, + "learning_rate": 1.8910046735856955e-05, + "loss": 0.6263, + "step": 211010 + }, + { + "epoch": 1.865485599108895, + "grad_norm": 2.151365280151367, + "learning_rate": 1.8908573348185083e-05, + "loss": 0.4735, + "step": 211020 + }, + { + "epoch": 1.8655740023692073, + "grad_norm": 1.3339873552322388, + "learning_rate": 1.8907099960513212e-05, + "loss": 0.5552, + "step": 211030 + }, + { + "epoch": 1.8656624056295197, + "grad_norm": 2.4142069816589355, + "learning_rate": 1.890562657284134e-05, + "loss": 0.6132, + "step": 211040 + }, + { + "epoch": 1.8657508088898318, + "grad_norm": 5.738188743591309, + "learning_rate": 1.8904153185169472e-05, + "loss": 0.5236, + "step": 211050 + }, + { + "epoch": 1.8658392121501441, + "grad_norm": 0.7615451216697693, + "learning_rate": 1.89026797974976e-05, + "loss": 0.4567, + "step": 211060 + }, + { + "epoch": 1.8659276154104565, + "grad_norm": 3.8887619972229004, + "learning_rate": 1.890120640982573e-05, + "loss": 0.626, + "step": 211070 + }, + { + "epoch": 1.8660160186707686, + "grad_norm": 2.3460888862609863, + "learning_rate": 1.889973302215386e-05, + "loss": 0.636, + "step": 211080 + }, + { + "epoch": 1.8661044219310807, + "grad_norm": 9.921082496643066, + "learning_rate": 1.889825963448199e-05, + "loss": 0.6063, + "step": 211090 + }, + { + "epoch": 1.866192825191393, + "grad_norm": 3.5486340522766113, + "learning_rate": 1.8896786246810117e-05, + "loss": 0.6388, + "step": 211100 + }, + { + "epoch": 1.8662812284517054, + "grad_norm": 1.7861011028289795, + "learning_rate": 1.8895312859138245e-05, + "loss": 0.5378, + "step": 211110 + }, + { + "epoch": 1.8663696317120175, + "grad_norm": 4.1411237716674805, + "learning_rate": 1.8893839471466377e-05, + "loss": 0.5782, + "step": 211120 + }, + { + "epoch": 1.8664580349723296, + "grad_norm": 4.170247554779053, + "learning_rate": 1.8892366083794506e-05, + "loss": 0.5814, + "step": 211130 + }, + { + "epoch": 1.866546438232642, + "grad_norm": 4.27308988571167, + "learning_rate": 1.8890892696122634e-05, + "loss": 0.5502, + "step": 211140 + }, + { + "epoch": 1.8666348414929543, + "grad_norm": 2.343512773513794, + "learning_rate": 1.8889419308450762e-05, + "loss": 0.4894, + "step": 211150 + }, + { + "epoch": 1.8667232447532665, + "grad_norm": 3.4192721843719482, + "learning_rate": 1.8887945920778894e-05, + "loss": 0.6186, + "step": 211160 + }, + { + "epoch": 1.8668116480135788, + "grad_norm": 10.524596214294434, + "learning_rate": 1.8886472533107022e-05, + "loss": 0.6242, + "step": 211170 + }, + { + "epoch": 1.8669000512738911, + "grad_norm": 2.186183214187622, + "learning_rate": 1.888499914543515e-05, + "loss": 0.6875, + "step": 211180 + }, + { + "epoch": 1.8669884545342033, + "grad_norm": 3.4638583660125732, + "learning_rate": 1.8883525757763282e-05, + "loss": 0.4402, + "step": 211190 + }, + { + "epoch": 1.8670768577945154, + "grad_norm": 1.5348575115203857, + "learning_rate": 1.888205237009141e-05, + "loss": 0.4851, + "step": 211200 + }, + { + "epoch": 1.8671652610548277, + "grad_norm": 2.961879014968872, + "learning_rate": 1.888057898241954e-05, + "loss": 0.5269, + "step": 211210 + }, + { + "epoch": 1.86725366431514, + "grad_norm": 2.18208646774292, + "learning_rate": 1.8879105594747667e-05, + "loss": 0.5045, + "step": 211220 + }, + { + "epoch": 1.8673420675754522, + "grad_norm": 2.162020444869995, + "learning_rate": 1.88776322070758e-05, + "loss": 0.5443, + "step": 211230 + }, + { + "epoch": 1.8674304708357643, + "grad_norm": 16.945356369018555, + "learning_rate": 1.8876158819403928e-05, + "loss": 0.6668, + "step": 211240 + }, + { + "epoch": 1.8675188740960766, + "grad_norm": 1.7313823699951172, + "learning_rate": 1.8874685431732056e-05, + "loss": 0.6056, + "step": 211250 + }, + { + "epoch": 1.867607277356389, + "grad_norm": 1.8208528757095337, + "learning_rate": 1.8873212044060184e-05, + "loss": 0.6211, + "step": 211260 + }, + { + "epoch": 1.867695680616701, + "grad_norm": 1.1184802055358887, + "learning_rate": 1.8871738656388316e-05, + "loss": 0.5644, + "step": 211270 + }, + { + "epoch": 1.8677840838770132, + "grad_norm": 8.482893943786621, + "learning_rate": 1.8870265268716444e-05, + "loss": 0.4855, + "step": 211280 + }, + { + "epoch": 1.8678724871373258, + "grad_norm": 2.199266195297241, + "learning_rate": 1.8868791881044573e-05, + "loss": 0.4921, + "step": 211290 + }, + { + "epoch": 1.867960890397638, + "grad_norm": 1.4627994298934937, + "learning_rate": 1.8867318493372704e-05, + "loss": 0.5601, + "step": 211300 + }, + { + "epoch": 1.86804929365795, + "grad_norm": 4.6828413009643555, + "learning_rate": 1.8865845105700833e-05, + "loss": 0.5459, + "step": 211310 + }, + { + "epoch": 1.8681376969182624, + "grad_norm": 2.588606357574463, + "learning_rate": 1.886437171802896e-05, + "loss": 0.5119, + "step": 211320 + }, + { + "epoch": 1.8682261001785747, + "grad_norm": 3.95709228515625, + "learning_rate": 1.886289833035709e-05, + "loss": 0.5759, + "step": 211330 + }, + { + "epoch": 1.8683145034388868, + "grad_norm": 14.782010078430176, + "learning_rate": 1.886142494268522e-05, + "loss": 0.4809, + "step": 211340 + }, + { + "epoch": 1.868402906699199, + "grad_norm": 7.163753986358643, + "learning_rate": 1.885995155501335e-05, + "loss": 0.541, + "step": 211350 + }, + { + "epoch": 1.8684913099595113, + "grad_norm": 1.8535152673721313, + "learning_rate": 1.8858478167341478e-05, + "loss": 0.5425, + "step": 211360 + }, + { + "epoch": 1.8685797132198236, + "grad_norm": 2.3897852897644043, + "learning_rate": 1.885700477966961e-05, + "loss": 0.504, + "step": 211370 + }, + { + "epoch": 1.8686681164801358, + "grad_norm": 9.051289558410645, + "learning_rate": 1.8855531391997738e-05, + "loss": 0.6437, + "step": 211380 + }, + { + "epoch": 1.8687565197404479, + "grad_norm": 2.9049437046051025, + "learning_rate": 1.8854058004325866e-05, + "loss": 0.6196, + "step": 211390 + }, + { + "epoch": 1.8688449230007602, + "grad_norm": 9.958059310913086, + "learning_rate": 1.8852584616653995e-05, + "loss": 0.6195, + "step": 211400 + }, + { + "epoch": 1.8689333262610726, + "grad_norm": 5.780620574951172, + "learning_rate": 1.8851111228982127e-05, + "loss": 0.4466, + "step": 211410 + }, + { + "epoch": 1.8690217295213847, + "grad_norm": 1.4476335048675537, + "learning_rate": 1.8849637841310255e-05, + "loss": 0.624, + "step": 211420 + }, + { + "epoch": 1.869110132781697, + "grad_norm": 2.000783920288086, + "learning_rate": 1.8848164453638383e-05, + "loss": 0.4835, + "step": 211430 + }, + { + "epoch": 1.8691985360420094, + "grad_norm": 1.3840090036392212, + "learning_rate": 1.884669106596651e-05, + "loss": 0.5851, + "step": 211440 + }, + { + "epoch": 1.8692869393023215, + "grad_norm": 2.47864031791687, + "learning_rate": 1.8845217678294643e-05, + "loss": 0.6764, + "step": 211450 + }, + { + "epoch": 1.8693753425626336, + "grad_norm": 1.6236169338226318, + "learning_rate": 1.8843744290622772e-05, + "loss": 0.6003, + "step": 211460 + }, + { + "epoch": 1.869463745822946, + "grad_norm": 2.9928510189056396, + "learning_rate": 1.88422709029509e-05, + "loss": 0.5077, + "step": 211470 + }, + { + "epoch": 1.8695521490832583, + "grad_norm": 3.0410666465759277, + "learning_rate": 1.8840797515279032e-05, + "loss": 0.6633, + "step": 211480 + }, + { + "epoch": 1.8696405523435704, + "grad_norm": 1.3951866626739502, + "learning_rate": 1.883932412760716e-05, + "loss": 0.6126, + "step": 211490 + }, + { + "epoch": 1.8697289556038825, + "grad_norm": 1.5205440521240234, + "learning_rate": 1.883785073993529e-05, + "loss": 0.6052, + "step": 211500 + }, + { + "epoch": 1.8698173588641949, + "grad_norm": 2.1935489177703857, + "learning_rate": 1.8836377352263417e-05, + "loss": 0.6031, + "step": 211510 + }, + { + "epoch": 1.8699057621245072, + "grad_norm": 2.1376328468322754, + "learning_rate": 1.883490396459155e-05, + "loss": 0.5603, + "step": 211520 + }, + { + "epoch": 1.8699941653848193, + "grad_norm": 2.075496196746826, + "learning_rate": 1.8833430576919677e-05, + "loss": 0.5508, + "step": 211530 + }, + { + "epoch": 1.8700825686451317, + "grad_norm": 1.8145004510879517, + "learning_rate": 1.8831957189247805e-05, + "loss": 0.6853, + "step": 211540 + }, + { + "epoch": 1.870170971905444, + "grad_norm": 1.899678349494934, + "learning_rate": 1.8830483801575937e-05, + "loss": 0.6093, + "step": 211550 + }, + { + "epoch": 1.8702593751657561, + "grad_norm": 1.9398306608200073, + "learning_rate": 1.8829010413904065e-05, + "loss": 0.5764, + "step": 211560 + }, + { + "epoch": 1.8703477784260683, + "grad_norm": 0.985061526298523, + "learning_rate": 1.8827537026232194e-05, + "loss": 0.5547, + "step": 211570 + }, + { + "epoch": 1.8704361816863806, + "grad_norm": 2.241347551345825, + "learning_rate": 1.8826063638560322e-05, + "loss": 0.6419, + "step": 211580 + }, + { + "epoch": 1.870524584946693, + "grad_norm": 1.3498973846435547, + "learning_rate": 1.8824590250888454e-05, + "loss": 0.4481, + "step": 211590 + }, + { + "epoch": 1.870612988207005, + "grad_norm": 2.329902410507202, + "learning_rate": 1.8823116863216582e-05, + "loss": 0.548, + "step": 211600 + }, + { + "epoch": 1.8707013914673172, + "grad_norm": 3.2804064750671387, + "learning_rate": 1.882164347554471e-05, + "loss": 0.5119, + "step": 211610 + }, + { + "epoch": 1.8707897947276295, + "grad_norm": 15.255338668823242, + "learning_rate": 1.882017008787284e-05, + "loss": 0.6286, + "step": 211620 + }, + { + "epoch": 1.8708781979879419, + "grad_norm": 2.46301007270813, + "learning_rate": 1.881869670020097e-05, + "loss": 0.6802, + "step": 211630 + }, + { + "epoch": 1.870966601248254, + "grad_norm": 5.738197326660156, + "learning_rate": 1.88172233125291e-05, + "loss": 0.6169, + "step": 211640 + }, + { + "epoch": 1.8710550045085663, + "grad_norm": 3.685235023498535, + "learning_rate": 1.8815749924857227e-05, + "loss": 0.5951, + "step": 211650 + }, + { + "epoch": 1.8711434077688787, + "grad_norm": 12.558107376098633, + "learning_rate": 1.881427653718536e-05, + "loss": 0.6296, + "step": 211660 + }, + { + "epoch": 1.8712318110291908, + "grad_norm": 4.235104560852051, + "learning_rate": 1.8812803149513487e-05, + "loss": 0.55, + "step": 211670 + }, + { + "epoch": 1.871320214289503, + "grad_norm": 4.005383014678955, + "learning_rate": 1.8811329761841616e-05, + "loss": 0.5216, + "step": 211680 + }, + { + "epoch": 1.8714086175498152, + "grad_norm": 17.446208953857422, + "learning_rate": 1.8809856374169748e-05, + "loss": 0.7013, + "step": 211690 + }, + { + "epoch": 1.8714970208101276, + "grad_norm": 3.2882604598999023, + "learning_rate": 1.8808382986497876e-05, + "loss": 0.6057, + "step": 211700 + }, + { + "epoch": 1.8715854240704397, + "grad_norm": 1.3633511066436768, + "learning_rate": 1.8806909598826004e-05, + "loss": 0.4949, + "step": 211710 + }, + { + "epoch": 1.8716738273307518, + "grad_norm": 2.982347249984741, + "learning_rate": 1.8805436211154136e-05, + "loss": 0.5158, + "step": 211720 + }, + { + "epoch": 1.8717622305910642, + "grad_norm": 2.555624485015869, + "learning_rate": 1.8803962823482264e-05, + "loss": 0.5961, + "step": 211730 + }, + { + "epoch": 1.8718506338513765, + "grad_norm": 12.946317672729492, + "learning_rate": 1.8802489435810393e-05, + "loss": 0.6125, + "step": 211740 + }, + { + "epoch": 1.8719390371116886, + "grad_norm": 2.439051628112793, + "learning_rate": 1.8801016048138524e-05, + "loss": 0.6064, + "step": 211750 + }, + { + "epoch": 1.872027440372001, + "grad_norm": 2.308988571166992, + "learning_rate": 1.8799542660466653e-05, + "loss": 0.5771, + "step": 211760 + }, + { + "epoch": 1.8721158436323133, + "grad_norm": 19.356401443481445, + "learning_rate": 1.879806927279478e-05, + "loss": 0.6851, + "step": 211770 + }, + { + "epoch": 1.8722042468926254, + "grad_norm": 2.0891366004943848, + "learning_rate": 1.8796595885122913e-05, + "loss": 0.7139, + "step": 211780 + }, + { + "epoch": 1.8722926501529376, + "grad_norm": 3.92799973487854, + "learning_rate": 1.879512249745104e-05, + "loss": 0.5393, + "step": 211790 + }, + { + "epoch": 1.87238105341325, + "grad_norm": 2.865943193435669, + "learning_rate": 1.879364910977917e-05, + "loss": 0.5939, + "step": 211800 + }, + { + "epoch": 1.8724694566735622, + "grad_norm": 11.907809257507324, + "learning_rate": 1.87921757221073e-05, + "loss": 0.5798, + "step": 211810 + }, + { + "epoch": 1.8725578599338744, + "grad_norm": 1.8399838209152222, + "learning_rate": 1.879070233443543e-05, + "loss": 0.5951, + "step": 211820 + }, + { + "epoch": 1.8726462631941865, + "grad_norm": 11.052319526672363, + "learning_rate": 1.8789228946763558e-05, + "loss": 0.6618, + "step": 211830 + }, + { + "epoch": 1.8727346664544988, + "grad_norm": 1.8781675100326538, + "learning_rate": 1.878775555909169e-05, + "loss": 0.5829, + "step": 211840 + }, + { + "epoch": 1.8728230697148112, + "grad_norm": 3.443168878555298, + "learning_rate": 1.8786282171419818e-05, + "loss": 0.6249, + "step": 211850 + }, + { + "epoch": 1.8729114729751233, + "grad_norm": 1.7187124490737915, + "learning_rate": 1.8784808783747947e-05, + "loss": 0.5567, + "step": 211860 + }, + { + "epoch": 1.8729998762354354, + "grad_norm": 1.6366448402404785, + "learning_rate": 1.8783335396076075e-05, + "loss": 0.5983, + "step": 211870 + }, + { + "epoch": 1.873088279495748, + "grad_norm": 3.721496343612671, + "learning_rate": 1.8781862008404207e-05, + "loss": 0.6211, + "step": 211880 + }, + { + "epoch": 1.87317668275606, + "grad_norm": 2.662074089050293, + "learning_rate": 1.8780388620732335e-05, + "loss": 0.5681, + "step": 211890 + }, + { + "epoch": 1.8732650860163722, + "grad_norm": 3.5479068756103516, + "learning_rate": 1.8778915233060463e-05, + "loss": 0.6416, + "step": 211900 + }, + { + "epoch": 1.8733534892766845, + "grad_norm": 1.3248940706253052, + "learning_rate": 1.877744184538859e-05, + "loss": 0.6511, + "step": 211910 + }, + { + "epoch": 1.8734418925369969, + "grad_norm": 1.1158252954483032, + "learning_rate": 1.8775968457716723e-05, + "loss": 0.6935, + "step": 211920 + }, + { + "epoch": 1.873530295797309, + "grad_norm": 1.1364325284957886, + "learning_rate": 1.8774495070044852e-05, + "loss": 0.6279, + "step": 211930 + }, + { + "epoch": 1.8736186990576211, + "grad_norm": 5.12778902053833, + "learning_rate": 1.877302168237298e-05, + "loss": 0.5977, + "step": 211940 + }, + { + "epoch": 1.8737071023179335, + "grad_norm": 8.264141082763672, + "learning_rate": 1.8771548294701112e-05, + "loss": 0.6718, + "step": 211950 + }, + { + "epoch": 1.8737955055782458, + "grad_norm": 7.577733993530273, + "learning_rate": 1.877007490702924e-05, + "loss": 0.4922, + "step": 211960 + }, + { + "epoch": 1.873883908838558, + "grad_norm": 2.200882911682129, + "learning_rate": 1.876860151935737e-05, + "loss": 0.5648, + "step": 211970 + }, + { + "epoch": 1.87397231209887, + "grad_norm": 2.6261353492736816, + "learning_rate": 1.8767128131685497e-05, + "loss": 0.6939, + "step": 211980 + }, + { + "epoch": 1.8740607153591824, + "grad_norm": 7.190047264099121, + "learning_rate": 1.876565474401363e-05, + "loss": 0.6494, + "step": 211990 + }, + { + "epoch": 1.8741491186194947, + "grad_norm": 2.867805242538452, + "learning_rate": 1.8764181356341757e-05, + "loss": 0.7128, + "step": 212000 + }, + { + "epoch": 1.8742375218798069, + "grad_norm": 1.2611019611358643, + "learning_rate": 1.8762707968669885e-05, + "loss": 0.4783, + "step": 212010 + }, + { + "epoch": 1.8743259251401192, + "grad_norm": 8.136725425720215, + "learning_rate": 1.8761234580998017e-05, + "loss": 0.5401, + "step": 212020 + }, + { + "epoch": 1.8744143284004315, + "grad_norm": 2.3983137607574463, + "learning_rate": 1.8759761193326145e-05, + "loss": 0.5398, + "step": 212030 + }, + { + "epoch": 1.8745027316607437, + "grad_norm": 2.939603090286255, + "learning_rate": 1.8758287805654274e-05, + "loss": 0.5078, + "step": 212040 + }, + { + "epoch": 1.8745911349210558, + "grad_norm": 2.2624449729919434, + "learning_rate": 1.8756814417982402e-05, + "loss": 0.6563, + "step": 212050 + }, + { + "epoch": 1.8746795381813681, + "grad_norm": 11.68359088897705, + "learning_rate": 1.8755341030310534e-05, + "loss": 0.627, + "step": 212060 + }, + { + "epoch": 1.8747679414416805, + "grad_norm": 1.6339939832687378, + "learning_rate": 1.8753867642638662e-05, + "loss": 0.5462, + "step": 212070 + }, + { + "epoch": 1.8748563447019926, + "grad_norm": 6.112826824188232, + "learning_rate": 1.875239425496679e-05, + "loss": 0.6341, + "step": 212080 + }, + { + "epoch": 1.8749447479623047, + "grad_norm": 1.7221001386642456, + "learning_rate": 1.875092086729492e-05, + "loss": 0.6001, + "step": 212090 + }, + { + "epoch": 1.875033151222617, + "grad_norm": 1.1448161602020264, + "learning_rate": 1.874944747962305e-05, + "loss": 0.5859, + "step": 212100 + }, + { + "epoch": 1.8751215544829294, + "grad_norm": 3.3444106578826904, + "learning_rate": 1.874797409195118e-05, + "loss": 0.5248, + "step": 212110 + }, + { + "epoch": 1.8752099577432415, + "grad_norm": 4.8078508377075195, + "learning_rate": 1.8746500704279307e-05, + "loss": 0.5533, + "step": 212120 + }, + { + "epoch": 1.8752983610035538, + "grad_norm": 1.4071780443191528, + "learning_rate": 1.874502731660744e-05, + "loss": 0.6851, + "step": 212130 + }, + { + "epoch": 1.8753867642638662, + "grad_norm": 5.860563278198242, + "learning_rate": 1.8743553928935568e-05, + "loss": 0.5989, + "step": 212140 + }, + { + "epoch": 1.8754751675241783, + "grad_norm": 5.743417263031006, + "learning_rate": 1.8742080541263696e-05, + "loss": 0.5763, + "step": 212150 + }, + { + "epoch": 1.8755635707844904, + "grad_norm": 1.6394555568695068, + "learning_rate": 1.8740607153591824e-05, + "loss": 0.5499, + "step": 212160 + }, + { + "epoch": 1.8756519740448028, + "grad_norm": 1.9762955904006958, + "learning_rate": 1.8739133765919956e-05, + "loss": 0.565, + "step": 212170 + }, + { + "epoch": 1.8757403773051151, + "grad_norm": 2.414860725402832, + "learning_rate": 1.8737660378248084e-05, + "loss": 0.5682, + "step": 212180 + }, + { + "epoch": 1.8758287805654272, + "grad_norm": 2.592427968978882, + "learning_rate": 1.8736186990576213e-05, + "loss": 0.4106, + "step": 212190 + }, + { + "epoch": 1.8759171838257394, + "grad_norm": 8.744173049926758, + "learning_rate": 1.873471360290434e-05, + "loss": 0.8138, + "step": 212200 + }, + { + "epoch": 1.8760055870860517, + "grad_norm": 2.453145980834961, + "learning_rate": 1.8733240215232473e-05, + "loss": 0.5933, + "step": 212210 + }, + { + "epoch": 1.876093990346364, + "grad_norm": 1.2898881435394287, + "learning_rate": 1.87317668275606e-05, + "loss": 0.5223, + "step": 212220 + }, + { + "epoch": 1.8761823936066762, + "grad_norm": 2.6530442237854004, + "learning_rate": 1.873029343988873e-05, + "loss": 0.5596, + "step": 212230 + }, + { + "epoch": 1.8762707968669885, + "grad_norm": 1.509381890296936, + "learning_rate": 1.872882005221686e-05, + "loss": 0.4527, + "step": 212240 + }, + { + "epoch": 1.8763592001273008, + "grad_norm": 6.895394802093506, + "learning_rate": 1.872734666454499e-05, + "loss": 0.4692, + "step": 212250 + }, + { + "epoch": 1.876447603387613, + "grad_norm": 9.649444580078125, + "learning_rate": 1.8725873276873118e-05, + "loss": 0.5743, + "step": 212260 + }, + { + "epoch": 1.876536006647925, + "grad_norm": 2.2451863288879395, + "learning_rate": 1.8724399889201246e-05, + "loss": 0.6761, + "step": 212270 + }, + { + "epoch": 1.8766244099082374, + "grad_norm": 3.3644282817840576, + "learning_rate": 1.8722926501529378e-05, + "loss": 0.6537, + "step": 212280 + }, + { + "epoch": 1.8767128131685498, + "grad_norm": 3.8315041065216064, + "learning_rate": 1.8721453113857506e-05, + "loss": 0.553, + "step": 212290 + }, + { + "epoch": 1.8768012164288619, + "grad_norm": 7.9946794509887695, + "learning_rate": 1.8719979726185635e-05, + "loss": 0.7313, + "step": 212300 + }, + { + "epoch": 1.876889619689174, + "grad_norm": 1.6254922151565552, + "learning_rate": 1.8718506338513766e-05, + "loss": 0.5888, + "step": 212310 + }, + { + "epoch": 1.8769780229494863, + "grad_norm": 4.383457660675049, + "learning_rate": 1.8717032950841895e-05, + "loss": 0.5613, + "step": 212320 + }, + { + "epoch": 1.8770664262097987, + "grad_norm": 1.949416160583496, + "learning_rate": 1.8715559563170023e-05, + "loss": 0.5842, + "step": 212330 + }, + { + "epoch": 1.8771548294701108, + "grad_norm": 1.9556927680969238, + "learning_rate": 1.871408617549815e-05, + "loss": 0.5902, + "step": 212340 + }, + { + "epoch": 1.8772432327304232, + "grad_norm": 4.232676029205322, + "learning_rate": 1.8712612787826283e-05, + "loss": 0.6059, + "step": 212350 + }, + { + "epoch": 1.8773316359907355, + "grad_norm": 1.3433644771575928, + "learning_rate": 1.871113940015441e-05, + "loss": 0.7392, + "step": 212360 + }, + { + "epoch": 1.8774200392510476, + "grad_norm": 1.6884804964065552, + "learning_rate": 1.870966601248254e-05, + "loss": 0.5262, + "step": 212370 + }, + { + "epoch": 1.8775084425113597, + "grad_norm": 8.2138032913208, + "learning_rate": 1.870819262481067e-05, + "loss": 0.661, + "step": 212380 + }, + { + "epoch": 1.877596845771672, + "grad_norm": 1.5879830121994019, + "learning_rate": 1.87067192371388e-05, + "loss": 0.5244, + "step": 212390 + }, + { + "epoch": 1.8776852490319844, + "grad_norm": 2.5254714488983154, + "learning_rate": 1.870524584946693e-05, + "loss": 0.5861, + "step": 212400 + }, + { + "epoch": 1.8777736522922965, + "grad_norm": 1.065232753753662, + "learning_rate": 1.8703772461795057e-05, + "loss": 0.5534, + "step": 212410 + }, + { + "epoch": 1.8778620555526087, + "grad_norm": 2.345660924911499, + "learning_rate": 1.870229907412319e-05, + "loss": 0.573, + "step": 212420 + }, + { + "epoch": 1.877950458812921, + "grad_norm": 1.898077130317688, + "learning_rate": 1.8700825686451317e-05, + "loss": 0.4771, + "step": 212430 + }, + { + "epoch": 1.8780388620732333, + "grad_norm": 2.2095444202423096, + "learning_rate": 1.8699352298779445e-05, + "loss": 0.6287, + "step": 212440 + }, + { + "epoch": 1.8781272653335455, + "grad_norm": 1.463356852531433, + "learning_rate": 1.8697878911107574e-05, + "loss": 0.5482, + "step": 212450 + }, + { + "epoch": 1.8782156685938576, + "grad_norm": 3.3391504287719727, + "learning_rate": 1.8696405523435705e-05, + "loss": 0.5935, + "step": 212460 + }, + { + "epoch": 1.8783040718541701, + "grad_norm": 2.8792531490325928, + "learning_rate": 1.8694932135763834e-05, + "loss": 0.5928, + "step": 212470 + }, + { + "epoch": 1.8783924751144823, + "grad_norm": 2.7586793899536133, + "learning_rate": 1.8693458748091962e-05, + "loss": 0.5689, + "step": 212480 + }, + { + "epoch": 1.8784808783747944, + "grad_norm": 5.905708312988281, + "learning_rate": 1.8691985360420094e-05, + "loss": 0.5409, + "step": 212490 + }, + { + "epoch": 1.8785692816351067, + "grad_norm": 14.364883422851562, + "learning_rate": 1.8690511972748222e-05, + "loss": 0.6887, + "step": 212500 + }, + { + "epoch": 1.878657684895419, + "grad_norm": 2.8432703018188477, + "learning_rate": 1.868903858507635e-05, + "loss": 0.5137, + "step": 212510 + }, + { + "epoch": 1.8787460881557312, + "grad_norm": 1.5833494663238525, + "learning_rate": 1.868756519740448e-05, + "loss": 0.4548, + "step": 212520 + }, + { + "epoch": 1.8788344914160433, + "grad_norm": 1.6333998441696167, + "learning_rate": 1.868609180973261e-05, + "loss": 0.5316, + "step": 212530 + }, + { + "epoch": 1.8789228946763556, + "grad_norm": 1.2314115762710571, + "learning_rate": 1.868461842206074e-05, + "loss": 0.4912, + "step": 212540 + }, + { + "epoch": 1.879011297936668, + "grad_norm": 4.743088245391846, + "learning_rate": 1.8683145034388867e-05, + "loss": 0.5233, + "step": 212550 + }, + { + "epoch": 1.87909970119698, + "grad_norm": 2.5482184886932373, + "learning_rate": 1.8681671646716996e-05, + "loss": 0.6037, + "step": 212560 + }, + { + "epoch": 1.8791881044572922, + "grad_norm": 3.077543258666992, + "learning_rate": 1.8680198259045127e-05, + "loss": 0.6617, + "step": 212570 + }, + { + "epoch": 1.8792765077176046, + "grad_norm": 2.091076135635376, + "learning_rate": 1.8678724871373256e-05, + "loss": 0.515, + "step": 212580 + }, + { + "epoch": 1.879364910977917, + "grad_norm": 1.3151177167892456, + "learning_rate": 1.8677251483701384e-05, + "loss": 0.5848, + "step": 212590 + }, + { + "epoch": 1.879453314238229, + "grad_norm": 1.5070290565490723, + "learning_rate": 1.8675778096029516e-05, + "loss": 0.5968, + "step": 212600 + }, + { + "epoch": 1.8795417174985414, + "grad_norm": 4.931157112121582, + "learning_rate": 1.8674304708357644e-05, + "loss": 0.5331, + "step": 212610 + }, + { + "epoch": 1.8796301207588537, + "grad_norm": 2.310117244720459, + "learning_rate": 1.8672831320685773e-05, + "loss": 0.4836, + "step": 212620 + }, + { + "epoch": 1.8797185240191658, + "grad_norm": 1.8003199100494385, + "learning_rate": 1.8671357933013904e-05, + "loss": 0.5507, + "step": 212630 + }, + { + "epoch": 1.879806927279478, + "grad_norm": 1.0642681121826172, + "learning_rate": 1.8669884545342033e-05, + "loss": 0.5688, + "step": 212640 + }, + { + "epoch": 1.8798953305397903, + "grad_norm": 3.1406989097595215, + "learning_rate": 1.866841115767016e-05, + "loss": 0.5875, + "step": 212650 + }, + { + "epoch": 1.8799837338001026, + "grad_norm": 1.50808584690094, + "learning_rate": 1.8666937769998293e-05, + "loss": 0.5141, + "step": 212660 + }, + { + "epoch": 1.8800721370604148, + "grad_norm": 3.3878180980682373, + "learning_rate": 1.866546438232642e-05, + "loss": 0.5458, + "step": 212670 + }, + { + "epoch": 1.8801605403207269, + "grad_norm": 1.265716314315796, + "learning_rate": 1.866399099465455e-05, + "loss": 0.5068, + "step": 212680 + }, + { + "epoch": 1.8802489435810392, + "grad_norm": 3.274003028869629, + "learning_rate": 1.866251760698268e-05, + "loss": 0.5244, + "step": 212690 + }, + { + "epoch": 1.8803373468413516, + "grad_norm": 1.5882059335708618, + "learning_rate": 1.866104421931081e-05, + "loss": 0.5657, + "step": 212700 + }, + { + "epoch": 1.8804257501016637, + "grad_norm": 3.2886435985565186, + "learning_rate": 1.8659570831638938e-05, + "loss": 0.6797, + "step": 212710 + }, + { + "epoch": 1.880514153361976, + "grad_norm": 2.704702615737915, + "learning_rate": 1.865809744396707e-05, + "loss": 0.5966, + "step": 212720 + }, + { + "epoch": 1.8806025566222884, + "grad_norm": 2.2841033935546875, + "learning_rate": 1.8656624056295198e-05, + "loss": 0.5501, + "step": 212730 + }, + { + "epoch": 1.8806909598826005, + "grad_norm": 18.79593276977539, + "learning_rate": 1.8655150668623326e-05, + "loss": 0.601, + "step": 212740 + }, + { + "epoch": 1.8807793631429126, + "grad_norm": 2.4465584754943848, + "learning_rate": 1.8653677280951458e-05, + "loss": 0.5834, + "step": 212750 + }, + { + "epoch": 1.880867766403225, + "grad_norm": 1.4901008605957031, + "learning_rate": 1.8652203893279586e-05, + "loss": 0.5264, + "step": 212760 + }, + { + "epoch": 1.8809561696635373, + "grad_norm": 3.8159334659576416, + "learning_rate": 1.8650730505607715e-05, + "loss": 0.5935, + "step": 212770 + }, + { + "epoch": 1.8810445729238494, + "grad_norm": 2.6459484100341797, + "learning_rate": 1.8649257117935847e-05, + "loss": 0.5992, + "step": 212780 + }, + { + "epoch": 1.8811329761841615, + "grad_norm": 2.558025598526001, + "learning_rate": 1.8647783730263975e-05, + "loss": 0.5407, + "step": 212790 + }, + { + "epoch": 1.8812213794444739, + "grad_norm": 2.620041608810425, + "learning_rate": 1.8646310342592103e-05, + "loss": 0.493, + "step": 212800 + }, + { + "epoch": 1.8813097827047862, + "grad_norm": 1.0128344297409058, + "learning_rate": 1.864483695492023e-05, + "loss": 0.6075, + "step": 212810 + }, + { + "epoch": 1.8813981859650983, + "grad_norm": 2.3276541233062744, + "learning_rate": 1.8643363567248363e-05, + "loss": 0.7396, + "step": 212820 + }, + { + "epoch": 1.8814865892254107, + "grad_norm": 6.157994270324707, + "learning_rate": 1.8641890179576492e-05, + "loss": 0.5341, + "step": 212830 + }, + { + "epoch": 1.881574992485723, + "grad_norm": 3.8587722778320312, + "learning_rate": 1.864041679190462e-05, + "loss": 0.6586, + "step": 212840 + }, + { + "epoch": 1.8816633957460351, + "grad_norm": 2.199636936187744, + "learning_rate": 1.863894340423275e-05, + "loss": 0.5565, + "step": 212850 + }, + { + "epoch": 1.8817517990063473, + "grad_norm": 3.426286458969116, + "learning_rate": 1.863747001656088e-05, + "loss": 0.4645, + "step": 212860 + }, + { + "epoch": 1.8818402022666596, + "grad_norm": 0.8287149667739868, + "learning_rate": 1.863599662888901e-05, + "loss": 0.5721, + "step": 212870 + }, + { + "epoch": 1.881928605526972, + "grad_norm": 5.03294563293457, + "learning_rate": 1.8634523241217137e-05, + "loss": 0.4732, + "step": 212880 + }, + { + "epoch": 1.882017008787284, + "grad_norm": 1.1851072311401367, + "learning_rate": 1.863304985354527e-05, + "loss": 0.678, + "step": 212890 + }, + { + "epoch": 1.8821054120475962, + "grad_norm": 6.871645927429199, + "learning_rate": 1.8631576465873397e-05, + "loss": 0.7234, + "step": 212900 + }, + { + "epoch": 1.8821938153079085, + "grad_norm": 5.614889144897461, + "learning_rate": 1.8630103078201525e-05, + "loss": 0.5841, + "step": 212910 + }, + { + "epoch": 1.8822822185682209, + "grad_norm": 3.654690980911255, + "learning_rate": 1.8628629690529654e-05, + "loss": 0.4902, + "step": 212920 + }, + { + "epoch": 1.882370621828533, + "grad_norm": 9.863300323486328, + "learning_rate": 1.8627156302857785e-05, + "loss": 0.6219, + "step": 212930 + }, + { + "epoch": 1.8824590250888453, + "grad_norm": 1.1956835985183716, + "learning_rate": 1.8625682915185914e-05, + "loss": 0.5397, + "step": 212940 + }, + { + "epoch": 1.8825474283491577, + "grad_norm": 1.8851938247680664, + "learning_rate": 1.8624209527514042e-05, + "loss": 0.6684, + "step": 212950 + }, + { + "epoch": 1.8826358316094698, + "grad_norm": 2.18277645111084, + "learning_rate": 1.8622736139842174e-05, + "loss": 0.6259, + "step": 212960 + }, + { + "epoch": 1.882724234869782, + "grad_norm": 3.0681982040405273, + "learning_rate": 1.8621262752170302e-05, + "loss": 0.5543, + "step": 212970 + }, + { + "epoch": 1.8828126381300943, + "grad_norm": 2.2887508869171143, + "learning_rate": 1.861978936449843e-05, + "loss": 0.5822, + "step": 212980 + }, + { + "epoch": 1.8829010413904066, + "grad_norm": 11.29295539855957, + "learning_rate": 1.861831597682656e-05, + "loss": 0.5882, + "step": 212990 + }, + { + "epoch": 1.8829894446507187, + "grad_norm": 1.1387871503829956, + "learning_rate": 1.861684258915469e-05, + "loss": 0.7087, + "step": 213000 + }, + { + "epoch": 1.8830778479110308, + "grad_norm": 2.4769179821014404, + "learning_rate": 1.861536920148282e-05, + "loss": 0.6643, + "step": 213010 + }, + { + "epoch": 1.8831662511713432, + "grad_norm": 2.312826156616211, + "learning_rate": 1.8613895813810947e-05, + "loss": 0.5726, + "step": 213020 + }, + { + "epoch": 1.8832546544316555, + "grad_norm": 4.7945780754089355, + "learning_rate": 1.8612422426139076e-05, + "loss": 0.536, + "step": 213030 + }, + { + "epoch": 1.8833430576919676, + "grad_norm": 2.921363592147827, + "learning_rate": 1.8610949038467207e-05, + "loss": 0.6759, + "step": 213040 + }, + { + "epoch": 1.8834314609522798, + "grad_norm": 1.5335419178009033, + "learning_rate": 1.8609475650795336e-05, + "loss": 0.5513, + "step": 213050 + }, + { + "epoch": 1.8835198642125923, + "grad_norm": 2.939746141433716, + "learning_rate": 1.8608002263123464e-05, + "loss": 0.6185, + "step": 213060 + }, + { + "epoch": 1.8836082674729044, + "grad_norm": 10.422958374023438, + "learning_rate": 1.8606528875451596e-05, + "loss": 0.5928, + "step": 213070 + }, + { + "epoch": 1.8836966707332166, + "grad_norm": 1.5900689363479614, + "learning_rate": 1.8605055487779724e-05, + "loss": 0.5744, + "step": 213080 + }, + { + "epoch": 1.883785073993529, + "grad_norm": 1.7086278200149536, + "learning_rate": 1.8603582100107853e-05, + "loss": 0.6819, + "step": 213090 + }, + { + "epoch": 1.8838734772538412, + "grad_norm": 6.898779392242432, + "learning_rate": 1.860210871243598e-05, + "loss": 0.5775, + "step": 213100 + }, + { + "epoch": 1.8839618805141534, + "grad_norm": 1.625374436378479, + "learning_rate": 1.8600635324764113e-05, + "loss": 0.549, + "step": 213110 + }, + { + "epoch": 1.8840502837744655, + "grad_norm": 6.336486339569092, + "learning_rate": 1.859916193709224e-05, + "loss": 0.6147, + "step": 213120 + }, + { + "epoch": 1.8841386870347778, + "grad_norm": 2.499382972717285, + "learning_rate": 1.859768854942037e-05, + "loss": 0.6723, + "step": 213130 + }, + { + "epoch": 1.8842270902950902, + "grad_norm": 1.2628241777420044, + "learning_rate": 1.85962151617485e-05, + "loss": 0.6456, + "step": 213140 + }, + { + "epoch": 1.8843154935554023, + "grad_norm": 1.5055720806121826, + "learning_rate": 1.859474177407663e-05, + "loss": 0.595, + "step": 213150 + }, + { + "epoch": 1.8844038968157144, + "grad_norm": 3.127073049545288, + "learning_rate": 1.8593268386404758e-05, + "loss": 0.558, + "step": 213160 + }, + { + "epoch": 1.8844923000760267, + "grad_norm": 4.078506946563721, + "learning_rate": 1.8591794998732886e-05, + "loss": 0.6386, + "step": 213170 + }, + { + "epoch": 1.884580703336339, + "grad_norm": 1.4510366916656494, + "learning_rate": 1.8590321611061018e-05, + "loss": 0.5584, + "step": 213180 + }, + { + "epoch": 1.8846691065966512, + "grad_norm": 2.42274808883667, + "learning_rate": 1.8588848223389146e-05, + "loss": 0.5751, + "step": 213190 + }, + { + "epoch": 1.8847575098569636, + "grad_norm": 3.6079776287078857, + "learning_rate": 1.8587374835717275e-05, + "loss": 0.6377, + "step": 213200 + }, + { + "epoch": 1.884845913117276, + "grad_norm": 1.661149263381958, + "learning_rate": 1.8585901448045403e-05, + "loss": 0.5397, + "step": 213210 + }, + { + "epoch": 1.884934316377588, + "grad_norm": 0.6031556129455566, + "learning_rate": 1.8584428060373535e-05, + "loss": 0.547, + "step": 213220 + }, + { + "epoch": 1.8850227196379001, + "grad_norm": 15.123361587524414, + "learning_rate": 1.8582954672701663e-05, + "loss": 0.6655, + "step": 213230 + }, + { + "epoch": 1.8851111228982125, + "grad_norm": 4.451207637786865, + "learning_rate": 1.858148128502979e-05, + "loss": 0.687, + "step": 213240 + }, + { + "epoch": 1.8851995261585248, + "grad_norm": 3.892076015472412, + "learning_rate": 1.8580007897357923e-05, + "loss": 0.4964, + "step": 213250 + }, + { + "epoch": 1.885287929418837, + "grad_norm": 1.2234504222869873, + "learning_rate": 1.857853450968605e-05, + "loss": 0.6292, + "step": 213260 + }, + { + "epoch": 1.885376332679149, + "grad_norm": 1.309215784072876, + "learning_rate": 1.857706112201418e-05, + "loss": 0.5311, + "step": 213270 + }, + { + "epoch": 1.8854647359394614, + "grad_norm": 5.029854774475098, + "learning_rate": 1.8575587734342308e-05, + "loss": 0.5742, + "step": 213280 + }, + { + "epoch": 1.8855531391997737, + "grad_norm": 4.400570392608643, + "learning_rate": 1.857411434667044e-05, + "loss": 0.6355, + "step": 213290 + }, + { + "epoch": 1.8856415424600859, + "grad_norm": 4.826441287994385, + "learning_rate": 1.857264095899857e-05, + "loss": 0.6101, + "step": 213300 + }, + { + "epoch": 1.8857299457203982, + "grad_norm": 1.3306336402893066, + "learning_rate": 1.8571167571326697e-05, + "loss": 0.5845, + "step": 213310 + }, + { + "epoch": 1.8858183489807105, + "grad_norm": 5.465448379516602, + "learning_rate": 1.8569694183654825e-05, + "loss": 0.5639, + "step": 213320 + }, + { + "epoch": 1.8859067522410227, + "grad_norm": 1.3071844577789307, + "learning_rate": 1.8568220795982957e-05, + "loss": 0.5018, + "step": 213330 + }, + { + "epoch": 1.8859951555013348, + "grad_norm": 7.508504390716553, + "learning_rate": 1.8566747408311085e-05, + "loss": 0.5757, + "step": 213340 + }, + { + "epoch": 1.8860835587616471, + "grad_norm": 0.7872974276542664, + "learning_rate": 1.8565274020639214e-05, + "loss": 0.5064, + "step": 213350 + }, + { + "epoch": 1.8861719620219595, + "grad_norm": 1.6077816486358643, + "learning_rate": 1.8563800632967345e-05, + "loss": 0.5257, + "step": 213360 + }, + { + "epoch": 1.8862603652822716, + "grad_norm": 6.302602767944336, + "learning_rate": 1.8562327245295474e-05, + "loss": 0.6518, + "step": 213370 + }, + { + "epoch": 1.8863487685425837, + "grad_norm": 1.237106442451477, + "learning_rate": 1.8560853857623602e-05, + "loss": 0.638, + "step": 213380 + }, + { + "epoch": 1.886437171802896, + "grad_norm": 5.677822113037109, + "learning_rate": 1.855938046995173e-05, + "loss": 0.5992, + "step": 213390 + }, + { + "epoch": 1.8865255750632084, + "grad_norm": 7.012515068054199, + "learning_rate": 1.8557907082279862e-05, + "loss": 0.493, + "step": 213400 + }, + { + "epoch": 1.8866139783235205, + "grad_norm": 9.422869682312012, + "learning_rate": 1.855643369460799e-05, + "loss": 0.618, + "step": 213410 + }, + { + "epoch": 1.8867023815838329, + "grad_norm": 3.234034299850464, + "learning_rate": 1.855496030693612e-05, + "loss": 0.5519, + "step": 213420 + }, + { + "epoch": 1.8867907848441452, + "grad_norm": 1.369439959526062, + "learning_rate": 1.855348691926425e-05, + "loss": 0.6308, + "step": 213430 + }, + { + "epoch": 1.8868791881044573, + "grad_norm": 5.693788051605225, + "learning_rate": 1.855201353159238e-05, + "loss": 0.6196, + "step": 213440 + }, + { + "epoch": 1.8869675913647694, + "grad_norm": 1.55559241771698, + "learning_rate": 1.8550540143920507e-05, + "loss": 0.5714, + "step": 213450 + }, + { + "epoch": 1.8870559946250818, + "grad_norm": 6.621094226837158, + "learning_rate": 1.8549066756248636e-05, + "loss": 0.623, + "step": 213460 + }, + { + "epoch": 1.8871443978853941, + "grad_norm": 5.765220642089844, + "learning_rate": 1.8547593368576767e-05, + "loss": 0.5496, + "step": 213470 + }, + { + "epoch": 1.8872328011457062, + "grad_norm": 2.526282548904419, + "learning_rate": 1.8546119980904896e-05, + "loss": 0.6379, + "step": 213480 + }, + { + "epoch": 1.8873212044060184, + "grad_norm": 1.4172158241271973, + "learning_rate": 1.8544646593233024e-05, + "loss": 0.6885, + "step": 213490 + }, + { + "epoch": 1.8874096076663307, + "grad_norm": 8.146158218383789, + "learning_rate": 1.8543173205561152e-05, + "loss": 0.6146, + "step": 213500 + }, + { + "epoch": 1.887498010926643, + "grad_norm": 3.2238504886627197, + "learning_rate": 1.8541699817889284e-05, + "loss": 0.6524, + "step": 213510 + }, + { + "epoch": 1.8875864141869552, + "grad_norm": 2.985931873321533, + "learning_rate": 1.8540226430217413e-05, + "loss": 0.5647, + "step": 213520 + }, + { + "epoch": 1.8876748174472675, + "grad_norm": 2.8664724826812744, + "learning_rate": 1.853875304254554e-05, + "loss": 0.542, + "step": 213530 + }, + { + "epoch": 1.8877632207075798, + "grad_norm": 6.465889930725098, + "learning_rate": 1.8537279654873673e-05, + "loss": 0.6566, + "step": 213540 + }, + { + "epoch": 1.887851623967892, + "grad_norm": 17.41527557373047, + "learning_rate": 1.85358062672018e-05, + "loss": 0.5922, + "step": 213550 + }, + { + "epoch": 1.887940027228204, + "grad_norm": 2.6813340187072754, + "learning_rate": 1.853433287952993e-05, + "loss": 0.5266, + "step": 213560 + }, + { + "epoch": 1.8880284304885164, + "grad_norm": 2.6717472076416016, + "learning_rate": 1.853285949185806e-05, + "loss": 0.5942, + "step": 213570 + }, + { + "epoch": 1.8881168337488288, + "grad_norm": 1.4658883810043335, + "learning_rate": 1.853138610418619e-05, + "loss": 0.6243, + "step": 213580 + }, + { + "epoch": 1.888205237009141, + "grad_norm": 4.908879280090332, + "learning_rate": 1.8529912716514318e-05, + "loss": 0.5461, + "step": 213590 + }, + { + "epoch": 1.888293640269453, + "grad_norm": 8.682324409484863, + "learning_rate": 1.852843932884245e-05, + "loss": 0.555, + "step": 213600 + }, + { + "epoch": 1.8883820435297654, + "grad_norm": 3.517279624938965, + "learning_rate": 1.8526965941170578e-05, + "loss": 0.554, + "step": 213610 + }, + { + "epoch": 1.8884704467900777, + "grad_norm": 0.9724871516227722, + "learning_rate": 1.8525492553498706e-05, + "loss": 0.4883, + "step": 213620 + }, + { + "epoch": 1.8885588500503898, + "grad_norm": 4.339098930358887, + "learning_rate": 1.8524019165826838e-05, + "loss": 0.6278, + "step": 213630 + }, + { + "epoch": 1.888647253310702, + "grad_norm": 1.644022822380066, + "learning_rate": 1.8522545778154966e-05, + "loss": 0.7744, + "step": 213640 + }, + { + "epoch": 1.8887356565710145, + "grad_norm": 3.53022837638855, + "learning_rate": 1.8521072390483095e-05, + "loss": 0.549, + "step": 213650 + }, + { + "epoch": 1.8888240598313266, + "grad_norm": 1.4925897121429443, + "learning_rate": 1.8519599002811226e-05, + "loss": 0.6137, + "step": 213660 + }, + { + "epoch": 1.8889124630916387, + "grad_norm": 2.128223180770874, + "learning_rate": 1.8518125615139355e-05, + "loss": 0.6271, + "step": 213670 + }, + { + "epoch": 1.889000866351951, + "grad_norm": 5.156556606292725, + "learning_rate": 1.8516652227467483e-05, + "loss": 0.486, + "step": 213680 + }, + { + "epoch": 1.8890892696122634, + "grad_norm": 1.796038031578064, + "learning_rate": 1.8515178839795615e-05, + "loss": 0.534, + "step": 213690 + }, + { + "epoch": 1.8891776728725755, + "grad_norm": 3.3110363483428955, + "learning_rate": 1.8513705452123743e-05, + "loss": 0.424, + "step": 213700 + }, + { + "epoch": 1.8892660761328877, + "grad_norm": 2.35679292678833, + "learning_rate": 1.851223206445187e-05, + "loss": 0.5533, + "step": 213710 + }, + { + "epoch": 1.8893544793932, + "grad_norm": 2.322882890701294, + "learning_rate": 1.8510758676780003e-05, + "loss": 0.6785, + "step": 213720 + }, + { + "epoch": 1.8894428826535123, + "grad_norm": 2.069554090499878, + "learning_rate": 1.850928528910813e-05, + "loss": 0.6634, + "step": 213730 + }, + { + "epoch": 1.8895312859138245, + "grad_norm": 12.046005249023438, + "learning_rate": 1.850781190143626e-05, + "loss": 0.8362, + "step": 213740 + }, + { + "epoch": 1.8896196891741366, + "grad_norm": 3.466052293777466, + "learning_rate": 1.850633851376439e-05, + "loss": 0.5317, + "step": 213750 + }, + { + "epoch": 1.889708092434449, + "grad_norm": 4.101285934448242, + "learning_rate": 1.850486512609252e-05, + "loss": 0.6194, + "step": 213760 + }, + { + "epoch": 1.8897964956947613, + "grad_norm": 3.137982130050659, + "learning_rate": 1.850339173842065e-05, + "loss": 0.5138, + "step": 213770 + }, + { + "epoch": 1.8898848989550734, + "grad_norm": 1.803922176361084, + "learning_rate": 1.8501918350748777e-05, + "loss": 0.4798, + "step": 213780 + }, + { + "epoch": 1.8899733022153857, + "grad_norm": 2.399549961090088, + "learning_rate": 1.8500444963076905e-05, + "loss": 0.6397, + "step": 213790 + }, + { + "epoch": 1.890061705475698, + "grad_norm": 4.050331115722656, + "learning_rate": 1.8498971575405037e-05, + "loss": 0.6317, + "step": 213800 + }, + { + "epoch": 1.8901501087360102, + "grad_norm": 3.249687671661377, + "learning_rate": 1.8497498187733165e-05, + "loss": 0.5951, + "step": 213810 + }, + { + "epoch": 1.8902385119963223, + "grad_norm": 3.0217623710632324, + "learning_rate": 1.8496024800061294e-05, + "loss": 0.6623, + "step": 213820 + }, + { + "epoch": 1.8903269152566347, + "grad_norm": 1.2302578687667847, + "learning_rate": 1.8494551412389425e-05, + "loss": 0.6215, + "step": 213830 + }, + { + "epoch": 1.890415318516947, + "grad_norm": 1.431031584739685, + "learning_rate": 1.8493078024717554e-05, + "loss": 0.5895, + "step": 213840 + }, + { + "epoch": 1.8905037217772591, + "grad_norm": 1.6367743015289307, + "learning_rate": 1.8491604637045682e-05, + "loss": 0.5257, + "step": 213850 + }, + { + "epoch": 1.8905921250375712, + "grad_norm": 3.0997941493988037, + "learning_rate": 1.849013124937381e-05, + "loss": 0.6593, + "step": 213860 + }, + { + "epoch": 1.8906805282978836, + "grad_norm": 3.3189096450805664, + "learning_rate": 1.8488657861701942e-05, + "loss": 0.6237, + "step": 213870 + }, + { + "epoch": 1.890768931558196, + "grad_norm": 1.5525599718093872, + "learning_rate": 1.848718447403007e-05, + "loss": 0.5502, + "step": 213880 + }, + { + "epoch": 1.890857334818508, + "grad_norm": 3.968534231185913, + "learning_rate": 1.84857110863582e-05, + "loss": 0.6898, + "step": 213890 + }, + { + "epoch": 1.8909457380788204, + "grad_norm": 2.1128954887390137, + "learning_rate": 1.848423769868633e-05, + "loss": 0.6763, + "step": 213900 + }, + { + "epoch": 1.8910341413391327, + "grad_norm": 2.5275232791900635, + "learning_rate": 1.848276431101446e-05, + "loss": 0.6377, + "step": 213910 + }, + { + "epoch": 1.8911225445994448, + "grad_norm": 20.174570083618164, + "learning_rate": 1.8481290923342587e-05, + "loss": 0.641, + "step": 213920 + }, + { + "epoch": 1.891210947859757, + "grad_norm": 1.4616127014160156, + "learning_rate": 1.8479817535670716e-05, + "loss": 0.6087, + "step": 213930 + }, + { + "epoch": 1.8912993511200693, + "grad_norm": 2.63228440284729, + "learning_rate": 1.8478344147998847e-05, + "loss": 0.5483, + "step": 213940 + }, + { + "epoch": 1.8913877543803816, + "grad_norm": 1.7421663999557495, + "learning_rate": 1.8476870760326976e-05, + "loss": 0.5365, + "step": 213950 + }, + { + "epoch": 1.8914761576406938, + "grad_norm": 2.8034555912017822, + "learning_rate": 1.8475397372655104e-05, + "loss": 0.5906, + "step": 213960 + }, + { + "epoch": 1.8915645609010059, + "grad_norm": 1.3557761907577515, + "learning_rate": 1.8473923984983232e-05, + "loss": 0.5773, + "step": 213970 + }, + { + "epoch": 1.8916529641613182, + "grad_norm": 5.342085838317871, + "learning_rate": 1.8472450597311364e-05, + "loss": 0.4327, + "step": 213980 + }, + { + "epoch": 1.8917413674216306, + "grad_norm": 3.167161226272583, + "learning_rate": 1.8470977209639493e-05, + "loss": 0.6539, + "step": 213990 + }, + { + "epoch": 1.8918297706819427, + "grad_norm": 2.558377504348755, + "learning_rate": 1.846950382196762e-05, + "loss": 0.6095, + "step": 214000 + }, + { + "epoch": 1.891918173942255, + "grad_norm": 8.736021041870117, + "learning_rate": 1.8468030434295753e-05, + "loss": 0.5662, + "step": 214010 + }, + { + "epoch": 1.8920065772025674, + "grad_norm": 1.8692705631256104, + "learning_rate": 1.846655704662388e-05, + "loss": 0.5937, + "step": 214020 + }, + { + "epoch": 1.8920949804628795, + "grad_norm": 1.8699647188186646, + "learning_rate": 1.846508365895201e-05, + "loss": 0.5925, + "step": 214030 + }, + { + "epoch": 1.8921833837231916, + "grad_norm": 1.3535127639770508, + "learning_rate": 1.8463610271280138e-05, + "loss": 0.6094, + "step": 214040 + }, + { + "epoch": 1.892271786983504, + "grad_norm": 4.944995403289795, + "learning_rate": 1.846213688360827e-05, + "loss": 0.6154, + "step": 214050 + }, + { + "epoch": 1.8923601902438163, + "grad_norm": 4.698526859283447, + "learning_rate": 1.8460663495936398e-05, + "loss": 0.5083, + "step": 214060 + }, + { + "epoch": 1.8924485935041284, + "grad_norm": 1.3276554346084595, + "learning_rate": 1.8459190108264526e-05, + "loss": 0.5595, + "step": 214070 + }, + { + "epoch": 1.8925369967644405, + "grad_norm": 1.2327542304992676, + "learning_rate": 1.8457716720592658e-05, + "loss": 0.5325, + "step": 214080 + }, + { + "epoch": 1.8926254000247529, + "grad_norm": 1.654686689376831, + "learning_rate": 1.8456243332920786e-05, + "loss": 0.4257, + "step": 214090 + }, + { + "epoch": 1.8927138032850652, + "grad_norm": 2.887430429458618, + "learning_rate": 1.8454769945248915e-05, + "loss": 0.4465, + "step": 214100 + }, + { + "epoch": 1.8928022065453773, + "grad_norm": 1.4590517282485962, + "learning_rate": 1.8453296557577043e-05, + "loss": 0.4605, + "step": 214110 + }, + { + "epoch": 1.8928906098056897, + "grad_norm": 2.051868200302124, + "learning_rate": 1.8451823169905175e-05, + "loss": 0.641, + "step": 214120 + }, + { + "epoch": 1.892979013066002, + "grad_norm": 4.898158073425293, + "learning_rate": 1.8450349782233303e-05, + "loss": 0.5423, + "step": 214130 + }, + { + "epoch": 1.8930674163263141, + "grad_norm": 1.3424944877624512, + "learning_rate": 1.844887639456143e-05, + "loss": 0.6061, + "step": 214140 + }, + { + "epoch": 1.8931558195866263, + "grad_norm": 11.548501014709473, + "learning_rate": 1.844740300688956e-05, + "loss": 0.556, + "step": 214150 + }, + { + "epoch": 1.8932442228469386, + "grad_norm": 2.811197280883789, + "learning_rate": 1.844592961921769e-05, + "loss": 0.5081, + "step": 214160 + }, + { + "epoch": 1.893332626107251, + "grad_norm": 4.027631759643555, + "learning_rate": 1.844445623154582e-05, + "loss": 0.6443, + "step": 214170 + }, + { + "epoch": 1.893421029367563, + "grad_norm": 2.086879014968872, + "learning_rate": 1.8442982843873948e-05, + "loss": 0.6345, + "step": 214180 + }, + { + "epoch": 1.8935094326278752, + "grad_norm": 1.87912118434906, + "learning_rate": 1.844150945620208e-05, + "loss": 0.6381, + "step": 214190 + }, + { + "epoch": 1.8935978358881875, + "grad_norm": 9.014577865600586, + "learning_rate": 1.844003606853021e-05, + "loss": 0.6226, + "step": 214200 + }, + { + "epoch": 1.8936862391484999, + "grad_norm": 7.582598686218262, + "learning_rate": 1.8438562680858337e-05, + "loss": 0.6206, + "step": 214210 + }, + { + "epoch": 1.893774642408812, + "grad_norm": 1.9980887174606323, + "learning_rate": 1.8437089293186465e-05, + "loss": 0.6895, + "step": 214220 + }, + { + "epoch": 1.8938630456691241, + "grad_norm": 2.1076395511627197, + "learning_rate": 1.8435615905514597e-05, + "loss": 0.6247, + "step": 214230 + }, + { + "epoch": 1.8939514489294367, + "grad_norm": 1.788191556930542, + "learning_rate": 1.8434142517842725e-05, + "loss": 0.5424, + "step": 214240 + }, + { + "epoch": 1.8940398521897488, + "grad_norm": 2.757988214492798, + "learning_rate": 1.8432669130170854e-05, + "loss": 0.6193, + "step": 214250 + }, + { + "epoch": 1.894128255450061, + "grad_norm": 3.09478497505188, + "learning_rate": 1.8431195742498985e-05, + "loss": 0.487, + "step": 214260 + }, + { + "epoch": 1.8942166587103733, + "grad_norm": 2.8185760974884033, + "learning_rate": 1.8429722354827114e-05, + "loss": 0.6102, + "step": 214270 + }, + { + "epoch": 1.8943050619706856, + "grad_norm": 2.3654725551605225, + "learning_rate": 1.8428248967155242e-05, + "loss": 0.525, + "step": 214280 + }, + { + "epoch": 1.8943934652309977, + "grad_norm": 3.9461331367492676, + "learning_rate": 1.842677557948337e-05, + "loss": 0.5549, + "step": 214290 + }, + { + "epoch": 1.8944818684913098, + "grad_norm": 5.00105094909668, + "learning_rate": 1.8425302191811502e-05, + "loss": 0.5221, + "step": 214300 + }, + { + "epoch": 1.8945702717516222, + "grad_norm": 3.3477978706359863, + "learning_rate": 1.842382880413963e-05, + "loss": 0.6683, + "step": 214310 + }, + { + "epoch": 1.8946586750119345, + "grad_norm": 17.018566131591797, + "learning_rate": 1.842235541646776e-05, + "loss": 0.5197, + "step": 214320 + }, + { + "epoch": 1.8947470782722466, + "grad_norm": 2.96636700630188, + "learning_rate": 1.8420882028795887e-05, + "loss": 0.5335, + "step": 214330 + }, + { + "epoch": 1.8948354815325588, + "grad_norm": 17.807235717773438, + "learning_rate": 1.841940864112402e-05, + "loss": 0.5206, + "step": 214340 + }, + { + "epoch": 1.8949238847928713, + "grad_norm": 2.2948529720306396, + "learning_rate": 1.8417935253452147e-05, + "loss": 0.5527, + "step": 214350 + }, + { + "epoch": 1.8950122880531834, + "grad_norm": 1.9467228651046753, + "learning_rate": 1.8416461865780276e-05, + "loss": 0.6366, + "step": 214360 + }, + { + "epoch": 1.8951006913134956, + "grad_norm": 3.9395129680633545, + "learning_rate": 1.8414988478108407e-05, + "loss": 0.5505, + "step": 214370 + }, + { + "epoch": 1.895189094573808, + "grad_norm": 7.28964376449585, + "learning_rate": 1.8413515090436536e-05, + "loss": 0.5626, + "step": 214380 + }, + { + "epoch": 1.8952774978341203, + "grad_norm": 5.027669906616211, + "learning_rate": 1.8412041702764664e-05, + "loss": 0.5649, + "step": 214390 + }, + { + "epoch": 1.8953659010944324, + "grad_norm": 3.4936208724975586, + "learning_rate": 1.8410568315092792e-05, + "loss": 0.6373, + "step": 214400 + }, + { + "epoch": 1.8954543043547445, + "grad_norm": 3.587873697280884, + "learning_rate": 1.8409094927420924e-05, + "loss": 0.5265, + "step": 214410 + }, + { + "epoch": 1.8955427076150568, + "grad_norm": 2.6135525703430176, + "learning_rate": 1.8407621539749052e-05, + "loss": 0.524, + "step": 214420 + }, + { + "epoch": 1.8956311108753692, + "grad_norm": 1.8510371446609497, + "learning_rate": 1.840614815207718e-05, + "loss": 0.6583, + "step": 214430 + }, + { + "epoch": 1.8957195141356813, + "grad_norm": 5.618251800537109, + "learning_rate": 1.840467476440531e-05, + "loss": 0.5369, + "step": 214440 + }, + { + "epoch": 1.8958079173959934, + "grad_norm": 9.209452629089355, + "learning_rate": 1.840320137673344e-05, + "loss": 0.6332, + "step": 214450 + }, + { + "epoch": 1.8958963206563058, + "grad_norm": 2.0192315578460693, + "learning_rate": 1.840172798906157e-05, + "loss": 0.627, + "step": 214460 + }, + { + "epoch": 1.895984723916618, + "grad_norm": 1.686482548713684, + "learning_rate": 1.8400254601389698e-05, + "loss": 0.5642, + "step": 214470 + }, + { + "epoch": 1.8960731271769302, + "grad_norm": 1.6534452438354492, + "learning_rate": 1.839878121371783e-05, + "loss": 0.5933, + "step": 214480 + }, + { + "epoch": 1.8961615304372426, + "grad_norm": 2.584453582763672, + "learning_rate": 1.8397307826045958e-05, + "loss": 0.5834, + "step": 214490 + }, + { + "epoch": 1.896249933697555, + "grad_norm": 20.656749725341797, + "learning_rate": 1.8395834438374086e-05, + "loss": 0.5675, + "step": 214500 + }, + { + "epoch": 1.896338336957867, + "grad_norm": 4.196337699890137, + "learning_rate": 1.8394361050702218e-05, + "loss": 0.4978, + "step": 214510 + }, + { + "epoch": 1.8964267402181791, + "grad_norm": 5.75929594039917, + "learning_rate": 1.8392887663030346e-05, + "loss": 0.6418, + "step": 214520 + }, + { + "epoch": 1.8965151434784915, + "grad_norm": 2.27282452583313, + "learning_rate": 1.8391414275358475e-05, + "loss": 0.6251, + "step": 214530 + }, + { + "epoch": 1.8966035467388038, + "grad_norm": 2.5718026161193848, + "learning_rate": 1.8389940887686606e-05, + "loss": 0.5063, + "step": 214540 + }, + { + "epoch": 1.896691949999116, + "grad_norm": 9.644734382629395, + "learning_rate": 1.8388467500014735e-05, + "loss": 0.7092, + "step": 214550 + }, + { + "epoch": 1.896780353259428, + "grad_norm": 1.9957596063613892, + "learning_rate": 1.8386994112342863e-05, + "loss": 0.4627, + "step": 214560 + }, + { + "epoch": 1.8968687565197404, + "grad_norm": 2.2706170082092285, + "learning_rate": 1.8385520724670995e-05, + "loss": 0.3897, + "step": 214570 + }, + { + "epoch": 1.8969571597800527, + "grad_norm": 1.3469334840774536, + "learning_rate": 1.8384047336999123e-05, + "loss": 0.6298, + "step": 214580 + }, + { + "epoch": 1.8970455630403649, + "grad_norm": 1.8614156246185303, + "learning_rate": 1.838257394932725e-05, + "loss": 0.6533, + "step": 214590 + }, + { + "epoch": 1.8971339663006772, + "grad_norm": 1.2404825687408447, + "learning_rate": 1.8381100561655383e-05, + "loss": 0.5264, + "step": 214600 + }, + { + "epoch": 1.8972223695609896, + "grad_norm": 1.3020907640457153, + "learning_rate": 1.837962717398351e-05, + "loss": 0.5894, + "step": 214610 + }, + { + "epoch": 1.8973107728213017, + "grad_norm": 3.074523448944092, + "learning_rate": 1.837815378631164e-05, + "loss": 0.6098, + "step": 214620 + }, + { + "epoch": 1.8973991760816138, + "grad_norm": 5.36939811706543, + "learning_rate": 1.837668039863977e-05, + "loss": 0.4847, + "step": 214630 + }, + { + "epoch": 1.8974875793419261, + "grad_norm": 2.5673325061798096, + "learning_rate": 1.83752070109679e-05, + "loss": 0.7006, + "step": 214640 + }, + { + "epoch": 1.8975759826022385, + "grad_norm": 2.209733724594116, + "learning_rate": 1.837373362329603e-05, + "loss": 0.61, + "step": 214650 + }, + { + "epoch": 1.8976643858625506, + "grad_norm": 1.1435073614120483, + "learning_rate": 1.837226023562416e-05, + "loss": 0.4654, + "step": 214660 + }, + { + "epoch": 1.8977527891228627, + "grad_norm": 1.2431167364120483, + "learning_rate": 1.837078684795229e-05, + "loss": 0.7263, + "step": 214670 + }, + { + "epoch": 1.897841192383175, + "grad_norm": 1.6127747297286987, + "learning_rate": 1.8369313460280417e-05, + "loss": 0.6197, + "step": 214680 + }, + { + "epoch": 1.8979295956434874, + "grad_norm": 2.1076979637145996, + "learning_rate": 1.8367840072608545e-05, + "loss": 0.5103, + "step": 214690 + }, + { + "epoch": 1.8980179989037995, + "grad_norm": 1.5015486478805542, + "learning_rate": 1.8366366684936677e-05, + "loss": 0.4705, + "step": 214700 + }, + { + "epoch": 1.8981064021641119, + "grad_norm": 10.76135540008545, + "learning_rate": 1.8364893297264805e-05, + "loss": 0.5346, + "step": 214710 + }, + { + "epoch": 1.8981948054244242, + "grad_norm": 8.669085502624512, + "learning_rate": 1.8363419909592934e-05, + "loss": 0.5063, + "step": 214720 + }, + { + "epoch": 1.8982832086847363, + "grad_norm": 3.8375988006591797, + "learning_rate": 1.8361946521921065e-05, + "loss": 0.5008, + "step": 214730 + }, + { + "epoch": 1.8983716119450484, + "grad_norm": 1.3984936475753784, + "learning_rate": 1.8360473134249194e-05, + "loss": 0.5863, + "step": 214740 + }, + { + "epoch": 1.8984600152053608, + "grad_norm": 7.372394561767578, + "learning_rate": 1.8358999746577322e-05, + "loss": 0.6715, + "step": 214750 + }, + { + "epoch": 1.8985484184656731, + "grad_norm": 1.0428462028503418, + "learning_rate": 1.835752635890545e-05, + "loss": 0.4896, + "step": 214760 + }, + { + "epoch": 1.8986368217259852, + "grad_norm": 14.690001487731934, + "learning_rate": 1.8356052971233582e-05, + "loss": 0.6821, + "step": 214770 + }, + { + "epoch": 1.8987252249862974, + "grad_norm": 14.343873023986816, + "learning_rate": 1.835457958356171e-05, + "loss": 0.5994, + "step": 214780 + }, + { + "epoch": 1.8988136282466097, + "grad_norm": 2.0612404346466064, + "learning_rate": 1.835310619588984e-05, + "loss": 0.4487, + "step": 214790 + }, + { + "epoch": 1.898902031506922, + "grad_norm": 2.703902006149292, + "learning_rate": 1.8351632808217967e-05, + "loss": 0.5163, + "step": 214800 + }, + { + "epoch": 1.8989904347672342, + "grad_norm": 1.1653752326965332, + "learning_rate": 1.83501594205461e-05, + "loss": 0.5916, + "step": 214810 + }, + { + "epoch": 1.8990788380275463, + "grad_norm": 2.3188114166259766, + "learning_rate": 1.8348686032874227e-05, + "loss": 0.6301, + "step": 214820 + }, + { + "epoch": 1.8991672412878589, + "grad_norm": 1.8194137811660767, + "learning_rate": 1.8347212645202356e-05, + "loss": 0.4594, + "step": 214830 + }, + { + "epoch": 1.899255644548171, + "grad_norm": 1.9201371669769287, + "learning_rate": 1.8345739257530487e-05, + "loss": 0.5383, + "step": 214840 + }, + { + "epoch": 1.899344047808483, + "grad_norm": 5.971470355987549, + "learning_rate": 1.8344265869858616e-05, + "loss": 0.5505, + "step": 214850 + }, + { + "epoch": 1.8994324510687954, + "grad_norm": 3.129467248916626, + "learning_rate": 1.8342792482186744e-05, + "loss": 0.6249, + "step": 214860 + }, + { + "epoch": 1.8995208543291078, + "grad_norm": 6.293489456176758, + "learning_rate": 1.8341319094514872e-05, + "loss": 0.7571, + "step": 214870 + }, + { + "epoch": 1.89960925758942, + "grad_norm": 5.534122467041016, + "learning_rate": 1.8339845706843004e-05, + "loss": 0.5868, + "step": 214880 + }, + { + "epoch": 1.899697660849732, + "grad_norm": 1.0550578832626343, + "learning_rate": 1.8338372319171133e-05, + "loss": 0.53, + "step": 214890 + }, + { + "epoch": 1.8997860641100444, + "grad_norm": 1.8364028930664062, + "learning_rate": 1.833689893149926e-05, + "loss": 0.6311, + "step": 214900 + }, + { + "epoch": 1.8998744673703567, + "grad_norm": 5.171708583831787, + "learning_rate": 1.833542554382739e-05, + "loss": 0.5155, + "step": 214910 + }, + { + "epoch": 1.8999628706306688, + "grad_norm": 1.2820627689361572, + "learning_rate": 1.833395215615552e-05, + "loss": 0.5437, + "step": 214920 + }, + { + "epoch": 1.900051273890981, + "grad_norm": 11.707216262817383, + "learning_rate": 1.833247876848365e-05, + "loss": 0.5461, + "step": 214930 + }, + { + "epoch": 1.9001396771512935, + "grad_norm": 5.4688005447387695, + "learning_rate": 1.8331005380811778e-05, + "loss": 0.4533, + "step": 214940 + }, + { + "epoch": 1.9002280804116056, + "grad_norm": 2.838073253631592, + "learning_rate": 1.832953199313991e-05, + "loss": 0.4696, + "step": 214950 + }, + { + "epoch": 1.9003164836719177, + "grad_norm": 1.4912468194961548, + "learning_rate": 1.8328058605468038e-05, + "loss": 0.5507, + "step": 214960 + }, + { + "epoch": 1.90040488693223, + "grad_norm": 2.4036059379577637, + "learning_rate": 1.8326585217796166e-05, + "loss": 0.5545, + "step": 214970 + }, + { + "epoch": 1.9004932901925424, + "grad_norm": 2.126760482788086, + "learning_rate": 1.8325111830124294e-05, + "loss": 0.6015, + "step": 214980 + }, + { + "epoch": 1.9005816934528545, + "grad_norm": 4.676766872406006, + "learning_rate": 1.8323638442452426e-05, + "loss": 0.6376, + "step": 214990 + }, + { + "epoch": 1.9006700967131667, + "grad_norm": 12.944665908813477, + "learning_rate": 1.8322165054780555e-05, + "loss": 0.6542, + "step": 215000 + }, + { + "epoch": 1.900758499973479, + "grad_norm": 1.8676060438156128, + "learning_rate": 1.8320691667108683e-05, + "loss": 0.4389, + "step": 215010 + }, + { + "epoch": 1.9008469032337914, + "grad_norm": 2.112060070037842, + "learning_rate": 1.8319218279436815e-05, + "loss": 0.4921, + "step": 215020 + }, + { + "epoch": 1.9009353064941035, + "grad_norm": 1.8731809854507446, + "learning_rate": 1.8317744891764943e-05, + "loss": 0.5691, + "step": 215030 + }, + { + "epoch": 1.9010237097544156, + "grad_norm": 2.719052791595459, + "learning_rate": 1.831627150409307e-05, + "loss": 0.6618, + "step": 215040 + }, + { + "epoch": 1.901112113014728, + "grad_norm": 8.77170467376709, + "learning_rate": 1.83147981164212e-05, + "loss": 0.536, + "step": 215050 + }, + { + "epoch": 1.9012005162750403, + "grad_norm": 1.5281165838241577, + "learning_rate": 1.831332472874933e-05, + "loss": 0.4545, + "step": 215060 + }, + { + "epoch": 1.9012889195353524, + "grad_norm": 2.235581159591675, + "learning_rate": 1.831185134107746e-05, + "loss": 0.6655, + "step": 215070 + }, + { + "epoch": 1.9013773227956647, + "grad_norm": 2.4700381755828857, + "learning_rate": 1.8310377953405588e-05, + "loss": 0.5337, + "step": 215080 + }, + { + "epoch": 1.901465726055977, + "grad_norm": 2.260410785675049, + "learning_rate": 1.8308904565733717e-05, + "loss": 0.6076, + "step": 215090 + }, + { + "epoch": 1.9015541293162892, + "grad_norm": 2.535979747772217, + "learning_rate": 1.8307431178061848e-05, + "loss": 0.6101, + "step": 215100 + }, + { + "epoch": 1.9016425325766013, + "grad_norm": 1.87662672996521, + "learning_rate": 1.8305957790389977e-05, + "loss": 0.4419, + "step": 215110 + }, + { + "epoch": 1.9017309358369137, + "grad_norm": 2.4027507305145264, + "learning_rate": 1.8304484402718105e-05, + "loss": 0.5982, + "step": 215120 + }, + { + "epoch": 1.901819339097226, + "grad_norm": 4.070061683654785, + "learning_rate": 1.8303011015046237e-05, + "loss": 0.399, + "step": 215130 + }, + { + "epoch": 1.9019077423575381, + "grad_norm": 5.777319431304932, + "learning_rate": 1.8301537627374365e-05, + "loss": 0.5812, + "step": 215140 + }, + { + "epoch": 1.9019961456178502, + "grad_norm": 0.7077469825744629, + "learning_rate": 1.8300064239702493e-05, + "loss": 0.5997, + "step": 215150 + }, + { + "epoch": 1.9020845488781626, + "grad_norm": 6.09249210357666, + "learning_rate": 1.8298590852030622e-05, + "loss": 0.7451, + "step": 215160 + }, + { + "epoch": 1.902172952138475, + "grad_norm": 2.2320213317871094, + "learning_rate": 1.8297117464358754e-05, + "loss": 0.518, + "step": 215170 + }, + { + "epoch": 1.902261355398787, + "grad_norm": 1.11752188205719, + "learning_rate": 1.8295644076686882e-05, + "loss": 0.5834, + "step": 215180 + }, + { + "epoch": 1.9023497586590994, + "grad_norm": 2.3647751808166504, + "learning_rate": 1.829417068901501e-05, + "loss": 0.7619, + "step": 215190 + }, + { + "epoch": 1.9024381619194117, + "grad_norm": 4.228518009185791, + "learning_rate": 1.8292697301343142e-05, + "loss": 0.569, + "step": 215200 + }, + { + "epoch": 1.9025265651797238, + "grad_norm": 1.678530216217041, + "learning_rate": 1.829122391367127e-05, + "loss": 0.5302, + "step": 215210 + }, + { + "epoch": 1.902614968440036, + "grad_norm": 2.062027931213379, + "learning_rate": 1.82897505259994e-05, + "loss": 0.5292, + "step": 215220 + }, + { + "epoch": 1.9027033717003483, + "grad_norm": 6.109335422515869, + "learning_rate": 1.8288277138327527e-05, + "loss": 0.6924, + "step": 215230 + }, + { + "epoch": 1.9027917749606607, + "grad_norm": 2.0648276805877686, + "learning_rate": 1.828680375065566e-05, + "loss": 0.5308, + "step": 215240 + }, + { + "epoch": 1.9028801782209728, + "grad_norm": 1.5512999296188354, + "learning_rate": 1.8285330362983787e-05, + "loss": 0.4996, + "step": 215250 + }, + { + "epoch": 1.902968581481285, + "grad_norm": 7.731812953948975, + "learning_rate": 1.8283856975311916e-05, + "loss": 0.6653, + "step": 215260 + }, + { + "epoch": 1.9030569847415972, + "grad_norm": 7.800272464752197, + "learning_rate": 1.8282383587640044e-05, + "loss": 0.6671, + "step": 215270 + }, + { + "epoch": 1.9031453880019096, + "grad_norm": 1.48674476146698, + "learning_rate": 1.8280910199968176e-05, + "loss": 0.4932, + "step": 215280 + }, + { + "epoch": 1.9032337912622217, + "grad_norm": 2.0727829933166504, + "learning_rate": 1.8279436812296304e-05, + "loss": 0.5375, + "step": 215290 + }, + { + "epoch": 1.903322194522534, + "grad_norm": 2.819892644882202, + "learning_rate": 1.8277963424624432e-05, + "loss": 0.5211, + "step": 215300 + }, + { + "epoch": 1.9034105977828464, + "grad_norm": 3.371164321899414, + "learning_rate": 1.8276490036952564e-05, + "loss": 0.4364, + "step": 215310 + }, + { + "epoch": 1.9034990010431585, + "grad_norm": 3.080634355545044, + "learning_rate": 1.8275016649280692e-05, + "loss": 0.6259, + "step": 215320 + }, + { + "epoch": 1.9035874043034706, + "grad_norm": 2.3610153198242188, + "learning_rate": 1.827354326160882e-05, + "loss": 0.5589, + "step": 215330 + }, + { + "epoch": 1.903675807563783, + "grad_norm": 4.80729866027832, + "learning_rate": 1.827206987393695e-05, + "loss": 0.6519, + "step": 215340 + }, + { + "epoch": 1.9037642108240953, + "grad_norm": 1.9493257999420166, + "learning_rate": 1.827059648626508e-05, + "loss": 0.5813, + "step": 215350 + }, + { + "epoch": 1.9038526140844074, + "grad_norm": 2.0165605545043945, + "learning_rate": 1.826912309859321e-05, + "loss": 0.5949, + "step": 215360 + }, + { + "epoch": 1.9039410173447195, + "grad_norm": 2.412551164627075, + "learning_rate": 1.8267649710921338e-05, + "loss": 0.5228, + "step": 215370 + }, + { + "epoch": 1.9040294206050319, + "grad_norm": 3.516951322555542, + "learning_rate": 1.826617632324947e-05, + "loss": 0.5421, + "step": 215380 + }, + { + "epoch": 1.9041178238653442, + "grad_norm": 3.9508211612701416, + "learning_rate": 1.8264702935577598e-05, + "loss": 0.5422, + "step": 215390 + }, + { + "epoch": 1.9042062271256563, + "grad_norm": 4.987118244171143, + "learning_rate": 1.8263229547905726e-05, + "loss": 0.5906, + "step": 215400 + }, + { + "epoch": 1.9042946303859687, + "grad_norm": 2.879135847091675, + "learning_rate": 1.8261756160233854e-05, + "loss": 0.5828, + "step": 215410 + }, + { + "epoch": 1.904383033646281, + "grad_norm": 3.213564395904541, + "learning_rate": 1.8260282772561986e-05, + "loss": 0.5133, + "step": 215420 + }, + { + "epoch": 1.9044714369065932, + "grad_norm": 12.460219383239746, + "learning_rate": 1.8258809384890114e-05, + "loss": 0.6821, + "step": 215430 + }, + { + "epoch": 1.9045598401669053, + "grad_norm": 3.8772828578948975, + "learning_rate": 1.8257335997218243e-05, + "loss": 0.6225, + "step": 215440 + }, + { + "epoch": 1.9046482434272176, + "grad_norm": 3.252964735031128, + "learning_rate": 1.8255862609546375e-05, + "loss": 0.5598, + "step": 215450 + }, + { + "epoch": 1.90473664668753, + "grad_norm": 5.064020156860352, + "learning_rate": 1.8254389221874503e-05, + "loss": 0.4994, + "step": 215460 + }, + { + "epoch": 1.904825049947842, + "grad_norm": 1.0315037965774536, + "learning_rate": 1.825291583420263e-05, + "loss": 0.5049, + "step": 215470 + }, + { + "epoch": 1.9049134532081542, + "grad_norm": 3.551647663116455, + "learning_rate": 1.8251442446530763e-05, + "loss": 0.7413, + "step": 215480 + }, + { + "epoch": 1.9050018564684665, + "grad_norm": 1.5396426916122437, + "learning_rate": 1.824996905885889e-05, + "loss": 0.6819, + "step": 215490 + }, + { + "epoch": 1.9050902597287789, + "grad_norm": 10.421950340270996, + "learning_rate": 1.824849567118702e-05, + "loss": 0.6113, + "step": 215500 + }, + { + "epoch": 1.905178662989091, + "grad_norm": 2.161991834640503, + "learning_rate": 1.824702228351515e-05, + "loss": 0.5494, + "step": 215510 + }, + { + "epoch": 1.9052670662494031, + "grad_norm": 2.80000638961792, + "learning_rate": 1.824554889584328e-05, + "loss": 0.5843, + "step": 215520 + }, + { + "epoch": 1.9053554695097157, + "grad_norm": 0.9141148328781128, + "learning_rate": 1.824407550817141e-05, + "loss": 0.5035, + "step": 215530 + }, + { + "epoch": 1.9054438727700278, + "grad_norm": 1.3247166872024536, + "learning_rate": 1.824260212049954e-05, + "loss": 0.5271, + "step": 215540 + }, + { + "epoch": 1.90553227603034, + "grad_norm": 1.809760332107544, + "learning_rate": 1.8241128732827668e-05, + "loss": 0.5927, + "step": 215550 + }, + { + "epoch": 1.9056206792906523, + "grad_norm": 4.684435844421387, + "learning_rate": 1.8239655345155797e-05, + "loss": 0.5268, + "step": 215560 + }, + { + "epoch": 1.9057090825509646, + "grad_norm": 1.1501007080078125, + "learning_rate": 1.823818195748393e-05, + "loss": 0.5675, + "step": 215570 + }, + { + "epoch": 1.9057974858112767, + "grad_norm": 5.479037761688232, + "learning_rate": 1.8236708569812057e-05, + "loss": 0.7386, + "step": 215580 + }, + { + "epoch": 1.9058858890715888, + "grad_norm": 1.9849950075149536, + "learning_rate": 1.8235235182140185e-05, + "loss": 0.6442, + "step": 215590 + }, + { + "epoch": 1.9059742923319012, + "grad_norm": 5.336343765258789, + "learning_rate": 1.8233761794468317e-05, + "loss": 0.5115, + "step": 215600 + }, + { + "epoch": 1.9060626955922135, + "grad_norm": 2.0101168155670166, + "learning_rate": 1.8232288406796445e-05, + "loss": 0.6037, + "step": 215610 + }, + { + "epoch": 1.9061510988525256, + "grad_norm": 2.029799222946167, + "learning_rate": 1.8230815019124574e-05, + "loss": 0.5625, + "step": 215620 + }, + { + "epoch": 1.9062395021128378, + "grad_norm": 2.2444686889648438, + "learning_rate": 1.8229341631452702e-05, + "loss": 0.6678, + "step": 215630 + }, + { + "epoch": 1.9063279053731501, + "grad_norm": 1.8639298677444458, + "learning_rate": 1.8227868243780834e-05, + "loss": 0.6664, + "step": 215640 + }, + { + "epoch": 1.9064163086334625, + "grad_norm": 2.760430097579956, + "learning_rate": 1.8226394856108962e-05, + "loss": 0.6056, + "step": 215650 + }, + { + "epoch": 1.9065047118937746, + "grad_norm": 9.554108619689941, + "learning_rate": 1.822492146843709e-05, + "loss": 0.5387, + "step": 215660 + }, + { + "epoch": 1.906593115154087, + "grad_norm": 5.9488630294799805, + "learning_rate": 1.8223448080765222e-05, + "loss": 0.6218, + "step": 215670 + }, + { + "epoch": 1.9066815184143993, + "grad_norm": 2.389159917831421, + "learning_rate": 1.822197469309335e-05, + "loss": 0.5919, + "step": 215680 + }, + { + "epoch": 1.9067699216747114, + "grad_norm": 1.4990259408950806, + "learning_rate": 1.822050130542148e-05, + "loss": 0.4699, + "step": 215690 + }, + { + "epoch": 1.9068583249350235, + "grad_norm": 4.160340309143066, + "learning_rate": 1.8219027917749607e-05, + "loss": 0.502, + "step": 215700 + }, + { + "epoch": 1.9069467281953358, + "grad_norm": 3.2409324645996094, + "learning_rate": 1.821755453007774e-05, + "loss": 0.6041, + "step": 215710 + }, + { + "epoch": 1.9070351314556482, + "grad_norm": 3.227299451828003, + "learning_rate": 1.8216081142405867e-05, + "loss": 0.5844, + "step": 215720 + }, + { + "epoch": 1.9071235347159603, + "grad_norm": 1.404015064239502, + "learning_rate": 1.8214607754733996e-05, + "loss": 0.5235, + "step": 215730 + }, + { + "epoch": 1.9072119379762724, + "grad_norm": 7.0780720710754395, + "learning_rate": 1.8213134367062124e-05, + "loss": 0.5693, + "step": 215740 + }, + { + "epoch": 1.9073003412365848, + "grad_norm": 2.1195764541625977, + "learning_rate": 1.8211660979390256e-05, + "loss": 0.535, + "step": 215750 + }, + { + "epoch": 1.907388744496897, + "grad_norm": 0.7465678453445435, + "learning_rate": 1.8210187591718384e-05, + "loss": 0.493, + "step": 215760 + }, + { + "epoch": 1.9074771477572092, + "grad_norm": 4.116894245147705, + "learning_rate": 1.8208714204046512e-05, + "loss": 0.5397, + "step": 215770 + }, + { + "epoch": 1.9075655510175216, + "grad_norm": 1.049372673034668, + "learning_rate": 1.8207240816374644e-05, + "loss": 0.4521, + "step": 215780 + }, + { + "epoch": 1.907653954277834, + "grad_norm": 2.7427937984466553, + "learning_rate": 1.8205767428702772e-05, + "loss": 0.6194, + "step": 215790 + }, + { + "epoch": 1.907742357538146, + "grad_norm": 3.264143228530884, + "learning_rate": 1.82042940410309e-05, + "loss": 0.6236, + "step": 215800 + }, + { + "epoch": 1.9078307607984581, + "grad_norm": 3.0359134674072266, + "learning_rate": 1.820282065335903e-05, + "loss": 0.5412, + "step": 215810 + }, + { + "epoch": 1.9079191640587705, + "grad_norm": 11.858660697937012, + "learning_rate": 1.820134726568716e-05, + "loss": 0.5521, + "step": 215820 + }, + { + "epoch": 1.9080075673190828, + "grad_norm": 2.638056755065918, + "learning_rate": 1.819987387801529e-05, + "loss": 0.6447, + "step": 215830 + }, + { + "epoch": 1.908095970579395, + "grad_norm": 1.1822389364242554, + "learning_rate": 1.8198400490343418e-05, + "loss": 0.4912, + "step": 215840 + }, + { + "epoch": 1.908184373839707, + "grad_norm": 1.999218463897705, + "learning_rate": 1.819692710267155e-05, + "loss": 0.3623, + "step": 215850 + }, + { + "epoch": 1.9082727771000194, + "grad_norm": 1.8566445112228394, + "learning_rate": 1.8195453714999678e-05, + "loss": 0.5936, + "step": 215860 + }, + { + "epoch": 1.9083611803603318, + "grad_norm": 1.7797869443893433, + "learning_rate": 1.8193980327327806e-05, + "loss": 0.6518, + "step": 215870 + }, + { + "epoch": 1.9084495836206439, + "grad_norm": 4.77926778793335, + "learning_rate": 1.8192506939655934e-05, + "loss": 0.5155, + "step": 215880 + }, + { + "epoch": 1.9085379868809562, + "grad_norm": 2.640517473220825, + "learning_rate": 1.8191033551984066e-05, + "loss": 0.6423, + "step": 215890 + }, + { + "epoch": 1.9086263901412686, + "grad_norm": 4.8330841064453125, + "learning_rate": 1.8189560164312195e-05, + "loss": 0.5164, + "step": 215900 + }, + { + "epoch": 1.9087147934015807, + "grad_norm": 2.211228609085083, + "learning_rate": 1.8188086776640323e-05, + "loss": 0.5938, + "step": 215910 + }, + { + "epoch": 1.9088031966618928, + "grad_norm": 3.9917683601379395, + "learning_rate": 1.818661338896845e-05, + "loss": 0.5885, + "step": 215920 + }, + { + "epoch": 1.9088915999222051, + "grad_norm": 4.436807155609131, + "learning_rate": 1.8185140001296583e-05, + "loss": 0.6072, + "step": 215930 + }, + { + "epoch": 1.9089800031825175, + "grad_norm": 1.3593721389770508, + "learning_rate": 1.818366661362471e-05, + "loss": 0.5533, + "step": 215940 + }, + { + "epoch": 1.9090684064428296, + "grad_norm": 8.200021743774414, + "learning_rate": 1.818219322595284e-05, + "loss": 0.5177, + "step": 215950 + }, + { + "epoch": 1.9091568097031417, + "grad_norm": 2.670382022857666, + "learning_rate": 1.818071983828097e-05, + "loss": 0.541, + "step": 215960 + }, + { + "epoch": 1.909245212963454, + "grad_norm": 1.8656916618347168, + "learning_rate": 1.81792464506091e-05, + "loss": 0.6579, + "step": 215970 + }, + { + "epoch": 1.9093336162237664, + "grad_norm": 3.47935152053833, + "learning_rate": 1.8177773062937228e-05, + "loss": 0.561, + "step": 215980 + }, + { + "epoch": 1.9094220194840785, + "grad_norm": 2.670969247817993, + "learning_rate": 1.8176299675265356e-05, + "loss": 0.5686, + "step": 215990 + }, + { + "epoch": 1.9095104227443909, + "grad_norm": 2.4435932636260986, + "learning_rate": 1.8174826287593488e-05, + "loss": 0.588, + "step": 216000 + }, + { + "epoch": 1.9095988260047032, + "grad_norm": 2.103586435317993, + "learning_rate": 1.8173352899921617e-05, + "loss": 0.5022, + "step": 216010 + }, + { + "epoch": 1.9096872292650153, + "grad_norm": 6.973981857299805, + "learning_rate": 1.8171879512249745e-05, + "loss": 0.6578, + "step": 216020 + }, + { + "epoch": 1.9097756325253274, + "grad_norm": 0.8850047588348389, + "learning_rate": 1.8170406124577873e-05, + "loss": 0.7316, + "step": 216030 + }, + { + "epoch": 1.9098640357856398, + "grad_norm": 1.660649299621582, + "learning_rate": 1.8168932736906005e-05, + "loss": 0.544, + "step": 216040 + }, + { + "epoch": 1.9099524390459521, + "grad_norm": 4.22000789642334, + "learning_rate": 1.8167459349234133e-05, + "loss": 0.6118, + "step": 216050 + }, + { + "epoch": 1.9100408423062643, + "grad_norm": 3.014322280883789, + "learning_rate": 1.8165985961562262e-05, + "loss": 0.5632, + "step": 216060 + }, + { + "epoch": 1.9101292455665764, + "grad_norm": 4.061424255371094, + "learning_rate": 1.8164512573890393e-05, + "loss": 0.6439, + "step": 216070 + }, + { + "epoch": 1.9102176488268887, + "grad_norm": 2.7834503650665283, + "learning_rate": 1.8163039186218522e-05, + "loss": 0.6515, + "step": 216080 + }, + { + "epoch": 1.910306052087201, + "grad_norm": 2.2243871688842773, + "learning_rate": 1.816156579854665e-05, + "loss": 0.6516, + "step": 216090 + }, + { + "epoch": 1.9103944553475132, + "grad_norm": 8.861047744750977, + "learning_rate": 1.816009241087478e-05, + "loss": 0.6391, + "step": 216100 + }, + { + "epoch": 1.9104828586078253, + "grad_norm": 3.172020196914673, + "learning_rate": 1.815861902320291e-05, + "loss": 0.6371, + "step": 216110 + }, + { + "epoch": 1.9105712618681379, + "grad_norm": 11.315186500549316, + "learning_rate": 1.815714563553104e-05, + "loss": 0.5799, + "step": 216120 + }, + { + "epoch": 1.91065966512845, + "grad_norm": 1.1870808601379395, + "learning_rate": 1.8155672247859167e-05, + "loss": 0.5505, + "step": 216130 + }, + { + "epoch": 1.910748068388762, + "grad_norm": 2.998934507369995, + "learning_rate": 1.81541988601873e-05, + "loss": 0.6784, + "step": 216140 + }, + { + "epoch": 1.9108364716490744, + "grad_norm": 1.2750850915908813, + "learning_rate": 1.8152725472515427e-05, + "loss": 0.4746, + "step": 216150 + }, + { + "epoch": 1.9109248749093868, + "grad_norm": 1.8433935642242432, + "learning_rate": 1.8151252084843555e-05, + "loss": 0.5013, + "step": 216160 + }, + { + "epoch": 1.911013278169699, + "grad_norm": 1.366038203239441, + "learning_rate": 1.8149778697171684e-05, + "loss": 0.5223, + "step": 216170 + }, + { + "epoch": 1.911101681430011, + "grad_norm": 2.91780686378479, + "learning_rate": 1.8148305309499816e-05, + "loss": 0.6686, + "step": 216180 + }, + { + "epoch": 1.9111900846903234, + "grad_norm": 1.7641998529434204, + "learning_rate": 1.8146831921827944e-05, + "loss": 0.4539, + "step": 216190 + }, + { + "epoch": 1.9112784879506357, + "grad_norm": 1.4607517719268799, + "learning_rate": 1.8145358534156072e-05, + "loss": 0.6388, + "step": 216200 + }, + { + "epoch": 1.9113668912109478, + "grad_norm": 11.093215942382812, + "learning_rate": 1.81438851464842e-05, + "loss": 0.4907, + "step": 216210 + }, + { + "epoch": 1.91145529447126, + "grad_norm": 7.174412727355957, + "learning_rate": 1.8142411758812332e-05, + "loss": 0.6713, + "step": 216220 + }, + { + "epoch": 1.9115436977315723, + "grad_norm": 2.491236448287964, + "learning_rate": 1.814093837114046e-05, + "loss": 0.467, + "step": 216230 + }, + { + "epoch": 1.9116321009918846, + "grad_norm": 8.530804634094238, + "learning_rate": 1.813946498346859e-05, + "loss": 0.5929, + "step": 216240 + }, + { + "epoch": 1.9117205042521968, + "grad_norm": 1.5822875499725342, + "learning_rate": 1.813799159579672e-05, + "loss": 0.5307, + "step": 216250 + }, + { + "epoch": 1.911808907512509, + "grad_norm": 18.681421279907227, + "learning_rate": 1.813651820812485e-05, + "loss": 0.5232, + "step": 216260 + }, + { + "epoch": 1.9118973107728214, + "grad_norm": 1.1326762437820435, + "learning_rate": 1.8135044820452978e-05, + "loss": 0.5557, + "step": 216270 + }, + { + "epoch": 1.9119857140331336, + "grad_norm": 6.438721179962158, + "learning_rate": 1.8133571432781106e-05, + "loss": 0.6472, + "step": 216280 + }, + { + "epoch": 1.9120741172934457, + "grad_norm": 1.6215764284133911, + "learning_rate": 1.8132098045109238e-05, + "loss": 0.5147, + "step": 216290 + }, + { + "epoch": 1.912162520553758, + "grad_norm": 1.7961734533309937, + "learning_rate": 1.8130624657437366e-05, + "loss": 0.6435, + "step": 216300 + }, + { + "epoch": 1.9122509238140704, + "grad_norm": 1.5676223039627075, + "learning_rate": 1.8129151269765494e-05, + "loss": 0.5802, + "step": 216310 + }, + { + "epoch": 1.9123393270743825, + "grad_norm": 2.6900227069854736, + "learning_rate": 1.8127677882093626e-05, + "loss": 0.5485, + "step": 216320 + }, + { + "epoch": 1.9124277303346946, + "grad_norm": 15.534506797790527, + "learning_rate": 1.8126204494421754e-05, + "loss": 0.4505, + "step": 216330 + }, + { + "epoch": 1.912516133595007, + "grad_norm": 15.729284286499023, + "learning_rate": 1.8124731106749883e-05, + "loss": 0.7008, + "step": 216340 + }, + { + "epoch": 1.9126045368553193, + "grad_norm": 2.357236623764038, + "learning_rate": 1.8123257719078015e-05, + "loss": 0.5313, + "step": 216350 + }, + { + "epoch": 1.9126929401156314, + "grad_norm": 2.2297756671905518, + "learning_rate": 1.8121784331406143e-05, + "loss": 0.6155, + "step": 216360 + }, + { + "epoch": 1.9127813433759437, + "grad_norm": 1.433730959892273, + "learning_rate": 1.812031094373427e-05, + "loss": 0.5865, + "step": 216370 + }, + { + "epoch": 1.912869746636256, + "grad_norm": 1.051483392715454, + "learning_rate": 1.8118837556062403e-05, + "loss": 0.4809, + "step": 216380 + }, + { + "epoch": 1.9129581498965682, + "grad_norm": 2.712449073791504, + "learning_rate": 1.811736416839053e-05, + "loss": 0.572, + "step": 216390 + }, + { + "epoch": 1.9130465531568803, + "grad_norm": 1.288305640220642, + "learning_rate": 1.811589078071866e-05, + "loss": 0.5041, + "step": 216400 + }, + { + "epoch": 1.9131349564171927, + "grad_norm": 3.086259126663208, + "learning_rate": 1.811441739304679e-05, + "loss": 0.5439, + "step": 216410 + }, + { + "epoch": 1.913223359677505, + "grad_norm": 2.446774482727051, + "learning_rate": 1.811294400537492e-05, + "loss": 0.5209, + "step": 216420 + }, + { + "epoch": 1.9133117629378171, + "grad_norm": 2.3269996643066406, + "learning_rate": 1.8111470617703048e-05, + "loss": 0.4758, + "step": 216430 + }, + { + "epoch": 1.9134001661981292, + "grad_norm": 1.0755128860473633, + "learning_rate": 1.810999723003118e-05, + "loss": 0.607, + "step": 216440 + }, + { + "epoch": 1.9134885694584416, + "grad_norm": 3.610089063644409, + "learning_rate": 1.8108523842359308e-05, + "loss": 0.5732, + "step": 216450 + }, + { + "epoch": 1.913576972718754, + "grad_norm": 2.4240448474884033, + "learning_rate": 1.8107050454687437e-05, + "loss": 0.5613, + "step": 216460 + }, + { + "epoch": 1.913665375979066, + "grad_norm": 0.6894088387489319, + "learning_rate": 1.810557706701557e-05, + "loss": 0.4501, + "step": 216470 + }, + { + "epoch": 1.9137537792393784, + "grad_norm": 7.4250874519348145, + "learning_rate": 1.8104103679343697e-05, + "loss": 0.5233, + "step": 216480 + }, + { + "epoch": 1.9138421824996907, + "grad_norm": 1.0874172449111938, + "learning_rate": 1.8102630291671825e-05, + "loss": 0.525, + "step": 216490 + }, + { + "epoch": 1.9139305857600029, + "grad_norm": 1.8278764486312866, + "learning_rate": 1.8101156903999953e-05, + "loss": 0.6159, + "step": 216500 + }, + { + "epoch": 1.914018989020315, + "grad_norm": 5.650091171264648, + "learning_rate": 1.8099683516328085e-05, + "loss": 0.6809, + "step": 216510 + }, + { + "epoch": 1.9141073922806273, + "grad_norm": 3.1419782638549805, + "learning_rate": 1.8098210128656213e-05, + "loss": 0.585, + "step": 216520 + }, + { + "epoch": 1.9141957955409397, + "grad_norm": 8.009008407592773, + "learning_rate": 1.8096736740984342e-05, + "loss": 0.6204, + "step": 216530 + }, + { + "epoch": 1.9142841988012518, + "grad_norm": 2.5877749919891357, + "learning_rate": 1.8095263353312474e-05, + "loss": 0.4538, + "step": 216540 + }, + { + "epoch": 1.914372602061564, + "grad_norm": 0.9543381333351135, + "learning_rate": 1.8093789965640602e-05, + "loss": 0.6335, + "step": 216550 + }, + { + "epoch": 1.9144610053218762, + "grad_norm": 3.4716832637786865, + "learning_rate": 1.809231657796873e-05, + "loss": 0.5878, + "step": 216560 + }, + { + "epoch": 1.9145494085821886, + "grad_norm": 0.9770382046699524, + "learning_rate": 1.809084319029686e-05, + "loss": 0.5538, + "step": 216570 + }, + { + "epoch": 1.9146378118425007, + "grad_norm": 4.163053512573242, + "learning_rate": 1.808936980262499e-05, + "loss": 0.6316, + "step": 216580 + }, + { + "epoch": 1.914726215102813, + "grad_norm": 2.317775249481201, + "learning_rate": 1.808789641495312e-05, + "loss": 0.6029, + "step": 216590 + }, + { + "epoch": 1.9148146183631254, + "grad_norm": 1.217799186706543, + "learning_rate": 1.8086423027281247e-05, + "loss": 0.5065, + "step": 216600 + }, + { + "epoch": 1.9149030216234375, + "grad_norm": 2.2359209060668945, + "learning_rate": 1.808494963960938e-05, + "loss": 0.6597, + "step": 216610 + }, + { + "epoch": 1.9149914248837496, + "grad_norm": 2.1642236709594727, + "learning_rate": 1.8083476251937507e-05, + "loss": 0.7001, + "step": 216620 + }, + { + "epoch": 1.915079828144062, + "grad_norm": 1.2121096849441528, + "learning_rate": 1.8082002864265636e-05, + "loss": 0.6169, + "step": 216630 + }, + { + "epoch": 1.9151682314043743, + "grad_norm": 2.152376651763916, + "learning_rate": 1.8080529476593764e-05, + "loss": 0.472, + "step": 216640 + }, + { + "epoch": 1.9152566346646864, + "grad_norm": 1.3939648866653442, + "learning_rate": 1.8079056088921896e-05, + "loss": 0.595, + "step": 216650 + }, + { + "epoch": 1.9153450379249986, + "grad_norm": 1.5826573371887207, + "learning_rate": 1.8077582701250024e-05, + "loss": 0.6674, + "step": 216660 + }, + { + "epoch": 1.915433441185311, + "grad_norm": 1.1853008270263672, + "learning_rate": 1.8076109313578152e-05, + "loss": 0.5838, + "step": 216670 + }, + { + "epoch": 1.9155218444456232, + "grad_norm": 1.9050090312957764, + "learning_rate": 1.807463592590628e-05, + "loss": 0.647, + "step": 216680 + }, + { + "epoch": 1.9156102477059354, + "grad_norm": 4.199453830718994, + "learning_rate": 1.8073162538234412e-05, + "loss": 0.5724, + "step": 216690 + }, + { + "epoch": 1.9156986509662475, + "grad_norm": 1.700758695602417, + "learning_rate": 1.807168915056254e-05, + "loss": 0.5006, + "step": 216700 + }, + { + "epoch": 1.91578705422656, + "grad_norm": 22.516199111938477, + "learning_rate": 1.807021576289067e-05, + "loss": 0.6828, + "step": 216710 + }, + { + "epoch": 1.9158754574868722, + "grad_norm": 1.4677587747573853, + "learning_rate": 1.80687423752188e-05, + "loss": 0.5302, + "step": 216720 + }, + { + "epoch": 1.9159638607471843, + "grad_norm": 2.239807367324829, + "learning_rate": 1.806726898754693e-05, + "loss": 0.7154, + "step": 216730 + }, + { + "epoch": 1.9160522640074966, + "grad_norm": 9.4066743850708, + "learning_rate": 1.8065795599875058e-05, + "loss": 0.6155, + "step": 216740 + }, + { + "epoch": 1.916140667267809, + "grad_norm": 2.0142173767089844, + "learning_rate": 1.8064322212203186e-05, + "loss": 0.5795, + "step": 216750 + }, + { + "epoch": 1.916229070528121, + "grad_norm": 16.098608016967773, + "learning_rate": 1.8062848824531318e-05, + "loss": 0.5251, + "step": 216760 + }, + { + "epoch": 1.9163174737884332, + "grad_norm": 2.477219820022583, + "learning_rate": 1.8061375436859446e-05, + "loss": 0.5682, + "step": 216770 + }, + { + "epoch": 1.9164058770487455, + "grad_norm": 1.6210627555847168, + "learning_rate": 1.8059902049187574e-05, + "loss": 0.539, + "step": 216780 + }, + { + "epoch": 1.9164942803090579, + "grad_norm": 1.6682857275009155, + "learning_rate": 1.8058428661515706e-05, + "loss": 0.582, + "step": 216790 + }, + { + "epoch": 1.91658268356937, + "grad_norm": 1.1472681760787964, + "learning_rate": 1.8056955273843834e-05, + "loss": 0.5368, + "step": 216800 + }, + { + "epoch": 1.9166710868296821, + "grad_norm": 2.036644697189331, + "learning_rate": 1.8055481886171963e-05, + "loss": 0.579, + "step": 216810 + }, + { + "epoch": 1.9167594900899945, + "grad_norm": 6.5936479568481445, + "learning_rate": 1.805400849850009e-05, + "loss": 0.71, + "step": 216820 + }, + { + "epoch": 1.9168478933503068, + "grad_norm": 1.057573914527893, + "learning_rate": 1.8052535110828223e-05, + "loss": 0.5328, + "step": 216830 + }, + { + "epoch": 1.916936296610619, + "grad_norm": 1.4324116706848145, + "learning_rate": 1.805106172315635e-05, + "loss": 0.5921, + "step": 216840 + }, + { + "epoch": 1.9170246998709313, + "grad_norm": 1.5914580821990967, + "learning_rate": 1.804958833548448e-05, + "loss": 0.6378, + "step": 216850 + }, + { + "epoch": 1.9171131031312436, + "grad_norm": 3.1657867431640625, + "learning_rate": 1.8048114947812608e-05, + "loss": 0.5323, + "step": 216860 + }, + { + "epoch": 1.9172015063915557, + "grad_norm": 1.5273853540420532, + "learning_rate": 1.804664156014074e-05, + "loss": 0.4514, + "step": 216870 + }, + { + "epoch": 1.9172899096518679, + "grad_norm": 10.381452560424805, + "learning_rate": 1.8045168172468868e-05, + "loss": 0.5863, + "step": 216880 + }, + { + "epoch": 1.9173783129121802, + "grad_norm": 8.416742324829102, + "learning_rate": 1.8043694784796996e-05, + "loss": 0.5755, + "step": 216890 + }, + { + "epoch": 1.9174667161724925, + "grad_norm": 9.627915382385254, + "learning_rate": 1.8042221397125128e-05, + "loss": 0.6754, + "step": 216900 + }, + { + "epoch": 1.9175551194328047, + "grad_norm": 2.02286696434021, + "learning_rate": 1.8040748009453257e-05, + "loss": 0.565, + "step": 216910 + }, + { + "epoch": 1.9176435226931168, + "grad_norm": 1.7678122520446777, + "learning_rate": 1.8039274621781385e-05, + "loss": 0.4378, + "step": 216920 + }, + { + "epoch": 1.9177319259534291, + "grad_norm": 1.2858494520187378, + "learning_rate": 1.8037801234109513e-05, + "loss": 0.535, + "step": 216930 + }, + { + "epoch": 1.9178203292137415, + "grad_norm": 2.36051344871521, + "learning_rate": 1.8036327846437645e-05, + "loss": 0.4137, + "step": 216940 + }, + { + "epoch": 1.9179087324740536, + "grad_norm": 1.4821425676345825, + "learning_rate": 1.8034854458765773e-05, + "loss": 0.5902, + "step": 216950 + }, + { + "epoch": 1.917997135734366, + "grad_norm": 12.057268142700195, + "learning_rate": 1.8033381071093902e-05, + "loss": 0.5828, + "step": 216960 + }, + { + "epoch": 1.9180855389946783, + "grad_norm": 1.9689851999282837, + "learning_rate": 1.8031907683422033e-05, + "loss": 0.5527, + "step": 216970 + }, + { + "epoch": 1.9181739422549904, + "grad_norm": 1.2902332544326782, + "learning_rate": 1.8030434295750162e-05, + "loss": 0.5786, + "step": 216980 + }, + { + "epoch": 1.9182623455153025, + "grad_norm": 8.02831745147705, + "learning_rate": 1.802896090807829e-05, + "loss": 0.5161, + "step": 216990 + }, + { + "epoch": 1.9183507487756148, + "grad_norm": 2.5328893661499023, + "learning_rate": 1.802748752040642e-05, + "loss": 0.4753, + "step": 217000 + }, + { + "epoch": 1.9184391520359272, + "grad_norm": 1.1034444570541382, + "learning_rate": 1.802601413273455e-05, + "loss": 0.51, + "step": 217010 + }, + { + "epoch": 1.9185275552962393, + "grad_norm": 2.585118055343628, + "learning_rate": 1.802454074506268e-05, + "loss": 0.5873, + "step": 217020 + }, + { + "epoch": 1.9186159585565514, + "grad_norm": 1.4094904661178589, + "learning_rate": 1.8023067357390807e-05, + "loss": 0.5913, + "step": 217030 + }, + { + "epoch": 1.9187043618168638, + "grad_norm": 5.6701741218566895, + "learning_rate": 1.8021593969718935e-05, + "loss": 0.5623, + "step": 217040 + }, + { + "epoch": 1.918792765077176, + "grad_norm": 18.261985778808594, + "learning_rate": 1.8020120582047067e-05, + "loss": 0.5366, + "step": 217050 + }, + { + "epoch": 1.9188811683374882, + "grad_norm": 1.955998182296753, + "learning_rate": 1.8018647194375195e-05, + "loss": 0.5805, + "step": 217060 + }, + { + "epoch": 1.9189695715978006, + "grad_norm": 5.888978958129883, + "learning_rate": 1.8017173806703324e-05, + "loss": 0.5649, + "step": 217070 + }, + { + "epoch": 1.919057974858113, + "grad_norm": 8.626385688781738, + "learning_rate": 1.8015700419031456e-05, + "loss": 0.5281, + "step": 217080 + }, + { + "epoch": 1.919146378118425, + "grad_norm": 3.256411552429199, + "learning_rate": 1.8014227031359584e-05, + "loss": 0.5578, + "step": 217090 + }, + { + "epoch": 1.9192347813787372, + "grad_norm": 1.712684988975525, + "learning_rate": 1.8012753643687712e-05, + "loss": 0.5281, + "step": 217100 + }, + { + "epoch": 1.9193231846390495, + "grad_norm": 5.248591423034668, + "learning_rate": 1.801128025601584e-05, + "loss": 0.4879, + "step": 217110 + }, + { + "epoch": 1.9194115878993618, + "grad_norm": 5.866351127624512, + "learning_rate": 1.8009806868343972e-05, + "loss": 0.6175, + "step": 217120 + }, + { + "epoch": 1.919499991159674, + "grad_norm": 2.3572940826416016, + "learning_rate": 1.80083334806721e-05, + "loss": 0.5237, + "step": 217130 + }, + { + "epoch": 1.919588394419986, + "grad_norm": 2.529583215713501, + "learning_rate": 1.800686009300023e-05, + "loss": 0.5641, + "step": 217140 + }, + { + "epoch": 1.9196767976802984, + "grad_norm": 5.777018070220947, + "learning_rate": 1.8005386705328357e-05, + "loss": 0.5322, + "step": 217150 + }, + { + "epoch": 1.9197652009406108, + "grad_norm": 4.584105968475342, + "learning_rate": 1.800391331765649e-05, + "loss": 0.5333, + "step": 217160 + }, + { + "epoch": 1.9198536042009229, + "grad_norm": 7.387413024902344, + "learning_rate": 1.8002439929984617e-05, + "loss": 0.5365, + "step": 217170 + }, + { + "epoch": 1.9199420074612352, + "grad_norm": 15.710243225097656, + "learning_rate": 1.8000966542312746e-05, + "loss": 0.6798, + "step": 217180 + }, + { + "epoch": 1.9200304107215476, + "grad_norm": 3.0048375129699707, + "learning_rate": 1.7999493154640878e-05, + "loss": 0.4851, + "step": 217190 + }, + { + "epoch": 1.9201188139818597, + "grad_norm": 2.6067099571228027, + "learning_rate": 1.7998019766969006e-05, + "loss": 0.5282, + "step": 217200 + }, + { + "epoch": 1.9202072172421718, + "grad_norm": 5.25256872177124, + "learning_rate": 1.7996546379297134e-05, + "loss": 0.5931, + "step": 217210 + }, + { + "epoch": 1.9202956205024841, + "grad_norm": 3.8436293601989746, + "learning_rate": 1.7995072991625263e-05, + "loss": 0.555, + "step": 217220 + }, + { + "epoch": 1.9203840237627965, + "grad_norm": 2.1344540119171143, + "learning_rate": 1.7993599603953394e-05, + "loss": 0.4882, + "step": 217230 + }, + { + "epoch": 1.9204724270231086, + "grad_norm": 1.0375187397003174, + "learning_rate": 1.7992126216281523e-05, + "loss": 0.5093, + "step": 217240 + }, + { + "epoch": 1.9205608302834207, + "grad_norm": 8.884085655212402, + "learning_rate": 1.799065282860965e-05, + "loss": 0.5275, + "step": 217250 + }, + { + "epoch": 1.920649233543733, + "grad_norm": 2.383563756942749, + "learning_rate": 1.7989179440937783e-05, + "loss": 0.5701, + "step": 217260 + }, + { + "epoch": 1.9207376368040454, + "grad_norm": 1.274013638496399, + "learning_rate": 1.798770605326591e-05, + "loss": 0.5523, + "step": 217270 + }, + { + "epoch": 1.9208260400643575, + "grad_norm": 1.9956133365631104, + "learning_rate": 1.798623266559404e-05, + "loss": 0.7067, + "step": 217280 + }, + { + "epoch": 1.9209144433246697, + "grad_norm": 2.0668630599975586, + "learning_rate": 1.798475927792217e-05, + "loss": 0.6458, + "step": 217290 + }, + { + "epoch": 1.9210028465849822, + "grad_norm": 7.267787933349609, + "learning_rate": 1.79832858902503e-05, + "loss": 0.5819, + "step": 217300 + }, + { + "epoch": 1.9210912498452943, + "grad_norm": 3.4679811000823975, + "learning_rate": 1.7981812502578428e-05, + "loss": 0.6455, + "step": 217310 + }, + { + "epoch": 1.9211796531056065, + "grad_norm": 2.465635061264038, + "learning_rate": 1.798033911490656e-05, + "loss": 0.581, + "step": 217320 + }, + { + "epoch": 1.9212680563659188, + "grad_norm": 2.237391710281372, + "learning_rate": 1.7978865727234688e-05, + "loss": 0.545, + "step": 217330 + }, + { + "epoch": 1.9213564596262311, + "grad_norm": 2.048137903213501, + "learning_rate": 1.7977392339562816e-05, + "loss": 0.5683, + "step": 217340 + }, + { + "epoch": 1.9214448628865433, + "grad_norm": 9.93455982208252, + "learning_rate": 1.7975918951890948e-05, + "loss": 0.6699, + "step": 217350 + }, + { + "epoch": 1.9215332661468554, + "grad_norm": 2.37434458732605, + "learning_rate": 1.7974445564219077e-05, + "loss": 0.5423, + "step": 217360 + }, + { + "epoch": 1.9216216694071677, + "grad_norm": 8.60255241394043, + "learning_rate": 1.7972972176547205e-05, + "loss": 0.5786, + "step": 217370 + }, + { + "epoch": 1.92171007266748, + "grad_norm": 10.527209281921387, + "learning_rate": 1.7971498788875337e-05, + "loss": 0.5217, + "step": 217380 + }, + { + "epoch": 1.9217984759277922, + "grad_norm": 1.200935959815979, + "learning_rate": 1.7970025401203465e-05, + "loss": 0.4956, + "step": 217390 + }, + { + "epoch": 1.9218868791881043, + "grad_norm": 4.9400315284729, + "learning_rate": 1.7968552013531593e-05, + "loss": 0.5778, + "step": 217400 + }, + { + "epoch": 1.9219752824484166, + "grad_norm": 3.8515446186065674, + "learning_rate": 1.7967078625859725e-05, + "loss": 0.6323, + "step": 217410 + }, + { + "epoch": 1.922063685708729, + "grad_norm": 3.5395429134368896, + "learning_rate": 1.7965605238187853e-05, + "loss": 0.5324, + "step": 217420 + }, + { + "epoch": 1.922152088969041, + "grad_norm": 3.494119167327881, + "learning_rate": 1.7964131850515982e-05, + "loss": 0.6355, + "step": 217430 + }, + { + "epoch": 1.9222404922293534, + "grad_norm": 0.7550321817398071, + "learning_rate": 1.7962658462844114e-05, + "loss": 0.506, + "step": 217440 + }, + { + "epoch": 1.9223288954896658, + "grad_norm": 2.566480875015259, + "learning_rate": 1.7961185075172242e-05, + "loss": 0.4371, + "step": 217450 + }, + { + "epoch": 1.922417298749978, + "grad_norm": 4.824710369110107, + "learning_rate": 1.795971168750037e-05, + "loss": 0.5889, + "step": 217460 + }, + { + "epoch": 1.92250570201029, + "grad_norm": 1.0541762113571167, + "learning_rate": 1.79582382998285e-05, + "loss": 0.5156, + "step": 217470 + }, + { + "epoch": 1.9225941052706024, + "grad_norm": 3.598564386367798, + "learning_rate": 1.795676491215663e-05, + "loss": 0.5931, + "step": 217480 + }, + { + "epoch": 1.9226825085309147, + "grad_norm": 2.303901195526123, + "learning_rate": 1.795529152448476e-05, + "loss": 0.5181, + "step": 217490 + }, + { + "epoch": 1.9227709117912268, + "grad_norm": 2.465131998062134, + "learning_rate": 1.7953818136812887e-05, + "loss": 0.6425, + "step": 217500 + }, + { + "epoch": 1.922859315051539, + "grad_norm": 1.9853037595748901, + "learning_rate": 1.7952344749141015e-05, + "loss": 0.5342, + "step": 217510 + }, + { + "epoch": 1.9229477183118513, + "grad_norm": 2.2744500637054443, + "learning_rate": 1.7950871361469147e-05, + "loss": 0.6323, + "step": 217520 + }, + { + "epoch": 1.9230361215721636, + "grad_norm": 4.011896133422852, + "learning_rate": 1.7949397973797275e-05, + "loss": 0.5177, + "step": 217530 + }, + { + "epoch": 1.9231245248324758, + "grad_norm": 1.1928479671478271, + "learning_rate": 1.7947924586125404e-05, + "loss": 0.5628, + "step": 217540 + }, + { + "epoch": 1.923212928092788, + "grad_norm": 3.8024237155914307, + "learning_rate": 1.7946451198453536e-05, + "loss": 0.5659, + "step": 217550 + }, + { + "epoch": 1.9233013313531004, + "grad_norm": 3.6223812103271484, + "learning_rate": 1.7944977810781664e-05, + "loss": 0.5637, + "step": 217560 + }, + { + "epoch": 1.9233897346134126, + "grad_norm": 1.7467900514602661, + "learning_rate": 1.7943504423109792e-05, + "loss": 0.4282, + "step": 217570 + }, + { + "epoch": 1.9234781378737247, + "grad_norm": 1.9814823865890503, + "learning_rate": 1.794203103543792e-05, + "loss": 0.5673, + "step": 217580 + }, + { + "epoch": 1.923566541134037, + "grad_norm": 2.8140225410461426, + "learning_rate": 1.7940557647766052e-05, + "loss": 0.7701, + "step": 217590 + }, + { + "epoch": 1.9236549443943494, + "grad_norm": 2.0217604637145996, + "learning_rate": 1.793908426009418e-05, + "loss": 0.6374, + "step": 217600 + }, + { + "epoch": 1.9237433476546615, + "grad_norm": 2.1382977962493896, + "learning_rate": 1.793761087242231e-05, + "loss": 0.5756, + "step": 217610 + }, + { + "epoch": 1.9238317509149736, + "grad_norm": 2.7940778732299805, + "learning_rate": 1.7936137484750437e-05, + "loss": 0.4617, + "step": 217620 + }, + { + "epoch": 1.923920154175286, + "grad_norm": 2.182830810546875, + "learning_rate": 1.793466409707857e-05, + "loss": 0.5728, + "step": 217630 + }, + { + "epoch": 1.9240085574355983, + "grad_norm": 9.27804946899414, + "learning_rate": 1.7933190709406698e-05, + "loss": 0.6668, + "step": 217640 + }, + { + "epoch": 1.9240969606959104, + "grad_norm": 1.4358670711517334, + "learning_rate": 1.7931717321734826e-05, + "loss": 0.6601, + "step": 217650 + }, + { + "epoch": 1.9241853639562227, + "grad_norm": 1.3430434465408325, + "learning_rate": 1.7930243934062958e-05, + "loss": 0.5591, + "step": 217660 + }, + { + "epoch": 1.924273767216535, + "grad_norm": 2.20162296295166, + "learning_rate": 1.7928770546391086e-05, + "loss": 0.6552, + "step": 217670 + }, + { + "epoch": 1.9243621704768472, + "grad_norm": 6.015159606933594, + "learning_rate": 1.7927297158719214e-05, + "loss": 0.6009, + "step": 217680 + }, + { + "epoch": 1.9244505737371593, + "grad_norm": 1.3070344924926758, + "learning_rate": 1.7925823771047343e-05, + "loss": 0.5133, + "step": 217690 + }, + { + "epoch": 1.9245389769974717, + "grad_norm": 3.3622984886169434, + "learning_rate": 1.7924350383375474e-05, + "loss": 0.6422, + "step": 217700 + }, + { + "epoch": 1.924627380257784, + "grad_norm": 5.397711753845215, + "learning_rate": 1.7922876995703603e-05, + "loss": 0.5324, + "step": 217710 + }, + { + "epoch": 1.9247157835180961, + "grad_norm": 4.792809009552002, + "learning_rate": 1.792140360803173e-05, + "loss": 0.6001, + "step": 217720 + }, + { + "epoch": 1.9248041867784083, + "grad_norm": 2.090374708175659, + "learning_rate": 1.7919930220359863e-05, + "loss": 0.613, + "step": 217730 + }, + { + "epoch": 1.9248925900387206, + "grad_norm": 2.0831427574157715, + "learning_rate": 1.791845683268799e-05, + "loss": 0.5197, + "step": 217740 + }, + { + "epoch": 1.924980993299033, + "grad_norm": 9.81713581085205, + "learning_rate": 1.791698344501612e-05, + "loss": 0.5961, + "step": 217750 + }, + { + "epoch": 1.925069396559345, + "grad_norm": 2.418062925338745, + "learning_rate": 1.7915510057344248e-05, + "loss": 0.5523, + "step": 217760 + }, + { + "epoch": 1.9251577998196574, + "grad_norm": 5.5770392417907715, + "learning_rate": 1.791403666967238e-05, + "loss": 0.5343, + "step": 217770 + }, + { + "epoch": 1.9252462030799697, + "grad_norm": 3.117497444152832, + "learning_rate": 1.7912563282000508e-05, + "loss": 0.7334, + "step": 217780 + }, + { + "epoch": 1.9253346063402819, + "grad_norm": 1.4018239974975586, + "learning_rate": 1.7911089894328636e-05, + "loss": 0.5547, + "step": 217790 + }, + { + "epoch": 1.925423009600594, + "grad_norm": 1.0717302560806274, + "learning_rate": 1.7909616506656765e-05, + "loss": 0.6232, + "step": 217800 + }, + { + "epoch": 1.9255114128609063, + "grad_norm": 3.4991631507873535, + "learning_rate": 1.7908143118984896e-05, + "loss": 0.6773, + "step": 217810 + }, + { + "epoch": 1.9255998161212187, + "grad_norm": 2.2995593547821045, + "learning_rate": 1.7906669731313025e-05, + "loss": 0.6265, + "step": 217820 + }, + { + "epoch": 1.9256882193815308, + "grad_norm": 4.029776096343994, + "learning_rate": 1.7905196343641153e-05, + "loss": 0.482, + "step": 217830 + }, + { + "epoch": 1.925776622641843, + "grad_norm": 4.962882995605469, + "learning_rate": 1.7903722955969285e-05, + "loss": 0.5714, + "step": 217840 + }, + { + "epoch": 1.9258650259021552, + "grad_norm": 2.3954861164093018, + "learning_rate": 1.7902249568297413e-05, + "loss": 0.5717, + "step": 217850 + }, + { + "epoch": 1.9259534291624676, + "grad_norm": 1.405086636543274, + "learning_rate": 1.790077618062554e-05, + "loss": 0.564, + "step": 217860 + }, + { + "epoch": 1.9260418324227797, + "grad_norm": 2.1499006748199463, + "learning_rate": 1.789930279295367e-05, + "loss": 0.5252, + "step": 217870 + }, + { + "epoch": 1.9261302356830918, + "grad_norm": 1.116417646408081, + "learning_rate": 1.7897829405281802e-05, + "loss": 0.5586, + "step": 217880 + }, + { + "epoch": 1.9262186389434044, + "grad_norm": 2.3069539070129395, + "learning_rate": 1.789635601760993e-05, + "loss": 0.5022, + "step": 217890 + }, + { + "epoch": 1.9263070422037165, + "grad_norm": 3.9958276748657227, + "learning_rate": 1.789488262993806e-05, + "loss": 0.5591, + "step": 217900 + }, + { + "epoch": 1.9263954454640286, + "grad_norm": 3.916118860244751, + "learning_rate": 1.789340924226619e-05, + "loss": 0.5871, + "step": 217910 + }, + { + "epoch": 1.926483848724341, + "grad_norm": 1.1679779291152954, + "learning_rate": 1.789193585459432e-05, + "loss": 0.6553, + "step": 217920 + }, + { + "epoch": 1.9265722519846533, + "grad_norm": 3.2080304622650146, + "learning_rate": 1.7890462466922447e-05, + "loss": 0.494, + "step": 217930 + }, + { + "epoch": 1.9266606552449654, + "grad_norm": 1.95858895778656, + "learning_rate": 1.7888989079250575e-05, + "loss": 0.433, + "step": 217940 + }, + { + "epoch": 1.9267490585052776, + "grad_norm": 2.56058931350708, + "learning_rate": 1.7887515691578707e-05, + "loss": 0.658, + "step": 217950 + }, + { + "epoch": 1.92683746176559, + "grad_norm": 1.4632349014282227, + "learning_rate": 1.7886042303906835e-05, + "loss": 0.5436, + "step": 217960 + }, + { + "epoch": 1.9269258650259022, + "grad_norm": 8.628364562988281, + "learning_rate": 1.7884568916234964e-05, + "loss": 0.5751, + "step": 217970 + }, + { + "epoch": 1.9270142682862144, + "grad_norm": 2.563814640045166, + "learning_rate": 1.7883095528563092e-05, + "loss": 0.6268, + "step": 217980 + }, + { + "epoch": 1.9271026715465265, + "grad_norm": 2.3320868015289307, + "learning_rate": 1.7881622140891224e-05, + "loss": 0.5867, + "step": 217990 + }, + { + "epoch": 1.9271910748068388, + "grad_norm": 2.9975430965423584, + "learning_rate": 1.7880148753219352e-05, + "loss": 0.6685, + "step": 218000 + }, + { + "epoch": 1.9272794780671512, + "grad_norm": 5.7594170570373535, + "learning_rate": 1.787867536554748e-05, + "loss": 0.4779, + "step": 218010 + }, + { + "epoch": 1.9273678813274633, + "grad_norm": 4.022233486175537, + "learning_rate": 1.7877201977875612e-05, + "loss": 0.6474, + "step": 218020 + }, + { + "epoch": 1.9274562845877756, + "grad_norm": 1.2713489532470703, + "learning_rate": 1.787572859020374e-05, + "loss": 0.5509, + "step": 218030 + }, + { + "epoch": 1.927544687848088, + "grad_norm": 1.3132531642913818, + "learning_rate": 1.787425520253187e-05, + "loss": 0.5761, + "step": 218040 + }, + { + "epoch": 1.9276330911084, + "grad_norm": 3.1133525371551514, + "learning_rate": 1.7872781814859997e-05, + "loss": 0.586, + "step": 218050 + }, + { + "epoch": 1.9277214943687122, + "grad_norm": 1.876328468322754, + "learning_rate": 1.787130842718813e-05, + "loss": 0.5022, + "step": 218060 + }, + { + "epoch": 1.9278098976290245, + "grad_norm": 2.0243823528289795, + "learning_rate": 1.7869835039516257e-05, + "loss": 0.5588, + "step": 218070 + }, + { + "epoch": 1.927898300889337, + "grad_norm": 3.532790184020996, + "learning_rate": 1.7868361651844386e-05, + "loss": 0.4585, + "step": 218080 + }, + { + "epoch": 1.927986704149649, + "grad_norm": 3.1207637786865234, + "learning_rate": 1.7866888264172514e-05, + "loss": 0.6022, + "step": 218090 + }, + { + "epoch": 1.9280751074099611, + "grad_norm": 10.706979751586914, + "learning_rate": 1.7865414876500646e-05, + "loss": 0.4778, + "step": 218100 + }, + { + "epoch": 1.9281635106702735, + "grad_norm": 3.146976947784424, + "learning_rate": 1.7863941488828774e-05, + "loss": 0.4929, + "step": 218110 + }, + { + "epoch": 1.9282519139305858, + "grad_norm": 1.6596893072128296, + "learning_rate": 1.7862468101156903e-05, + "loss": 0.6311, + "step": 218120 + }, + { + "epoch": 1.928340317190898, + "grad_norm": 3.6359317302703857, + "learning_rate": 1.7860994713485034e-05, + "loss": 0.5591, + "step": 218130 + }, + { + "epoch": 1.9284287204512103, + "grad_norm": 3.193084955215454, + "learning_rate": 1.7859521325813163e-05, + "loss": 0.4278, + "step": 218140 + }, + { + "epoch": 1.9285171237115226, + "grad_norm": 1.1501078605651855, + "learning_rate": 1.785804793814129e-05, + "loss": 0.516, + "step": 218150 + }, + { + "epoch": 1.9286055269718347, + "grad_norm": 2.0530176162719727, + "learning_rate": 1.785657455046942e-05, + "loss": 0.5644, + "step": 218160 + }, + { + "epoch": 1.9286939302321469, + "grad_norm": 2.1958062648773193, + "learning_rate": 1.785510116279755e-05, + "loss": 0.5224, + "step": 218170 + }, + { + "epoch": 1.9287823334924592, + "grad_norm": 8.756962776184082, + "learning_rate": 1.785362777512568e-05, + "loss": 0.5628, + "step": 218180 + }, + { + "epoch": 1.9288707367527715, + "grad_norm": 2.0489983558654785, + "learning_rate": 1.7852154387453808e-05, + "loss": 0.5238, + "step": 218190 + }, + { + "epoch": 1.9289591400130837, + "grad_norm": 3.329068660736084, + "learning_rate": 1.785068099978194e-05, + "loss": 0.5265, + "step": 218200 + }, + { + "epoch": 1.9290475432733958, + "grad_norm": 2.248135566711426, + "learning_rate": 1.7849207612110068e-05, + "loss": 0.5407, + "step": 218210 + }, + { + "epoch": 1.9291359465337081, + "grad_norm": 1.5821142196655273, + "learning_rate": 1.7847734224438196e-05, + "loss": 0.6992, + "step": 218220 + }, + { + "epoch": 1.9292243497940205, + "grad_norm": 3.011342763900757, + "learning_rate": 1.7846260836766328e-05, + "loss": 0.47, + "step": 218230 + }, + { + "epoch": 1.9293127530543326, + "grad_norm": 3.4710450172424316, + "learning_rate": 1.7844787449094456e-05, + "loss": 0.6613, + "step": 218240 + }, + { + "epoch": 1.929401156314645, + "grad_norm": 3.8228461742401123, + "learning_rate": 1.7843314061422585e-05, + "loss": 0.7031, + "step": 218250 + }, + { + "epoch": 1.9294895595749573, + "grad_norm": 2.7415707111358643, + "learning_rate": 1.7841840673750716e-05, + "loss": 0.56, + "step": 218260 + }, + { + "epoch": 1.9295779628352694, + "grad_norm": 1.6685841083526611, + "learning_rate": 1.7840367286078845e-05, + "loss": 0.6108, + "step": 218270 + }, + { + "epoch": 1.9296663660955815, + "grad_norm": 9.429146766662598, + "learning_rate": 1.7838893898406973e-05, + "loss": 0.5643, + "step": 218280 + }, + { + "epoch": 1.9297547693558939, + "grad_norm": 2.2939939498901367, + "learning_rate": 1.7837420510735105e-05, + "loss": 0.5447, + "step": 218290 + }, + { + "epoch": 1.9298431726162062, + "grad_norm": 2.8022780418395996, + "learning_rate": 1.7835947123063233e-05, + "loss": 0.5759, + "step": 218300 + }, + { + "epoch": 1.9299315758765183, + "grad_norm": 3.5208373069763184, + "learning_rate": 1.783447373539136e-05, + "loss": 0.4799, + "step": 218310 + }, + { + "epoch": 1.9300199791368304, + "grad_norm": 1.9998112916946411, + "learning_rate": 1.7833000347719493e-05, + "loss": 0.6462, + "step": 218320 + }, + { + "epoch": 1.9301083823971428, + "grad_norm": 2.380908727645874, + "learning_rate": 1.7831526960047622e-05, + "loss": 0.5386, + "step": 218330 + }, + { + "epoch": 1.9301967856574551, + "grad_norm": 1.969097375869751, + "learning_rate": 1.783005357237575e-05, + "loss": 0.7182, + "step": 218340 + }, + { + "epoch": 1.9302851889177672, + "grad_norm": 9.583781242370605, + "learning_rate": 1.7828580184703882e-05, + "loss": 0.4504, + "step": 218350 + }, + { + "epoch": 1.9303735921780796, + "grad_norm": 14.661343574523926, + "learning_rate": 1.782710679703201e-05, + "loss": 0.4807, + "step": 218360 + }, + { + "epoch": 1.930461995438392, + "grad_norm": 2.680039882659912, + "learning_rate": 1.782563340936014e-05, + "loss": 0.5022, + "step": 218370 + }, + { + "epoch": 1.930550398698704, + "grad_norm": 22.676002502441406, + "learning_rate": 1.782416002168827e-05, + "loss": 0.6186, + "step": 218380 + }, + { + "epoch": 1.9306388019590162, + "grad_norm": 5.862962245941162, + "learning_rate": 1.78226866340164e-05, + "loss": 0.5968, + "step": 218390 + }, + { + "epoch": 1.9307272052193285, + "grad_norm": 3.8336522579193115, + "learning_rate": 1.7821213246344527e-05, + "loss": 0.5934, + "step": 218400 + }, + { + "epoch": 1.9308156084796408, + "grad_norm": 2.620485782623291, + "learning_rate": 1.7819739858672655e-05, + "loss": 0.6129, + "step": 218410 + }, + { + "epoch": 1.930904011739953, + "grad_norm": 3.7110278606414795, + "learning_rate": 1.7818266471000787e-05, + "loss": 0.5808, + "step": 218420 + }, + { + "epoch": 1.930992415000265, + "grad_norm": 1.9464308023452759, + "learning_rate": 1.7816793083328915e-05, + "loss": 0.6752, + "step": 218430 + }, + { + "epoch": 1.9310808182605774, + "grad_norm": 2.341123342514038, + "learning_rate": 1.7815319695657044e-05, + "loss": 0.6382, + "step": 218440 + }, + { + "epoch": 1.9311692215208898, + "grad_norm": 1.8072208166122437, + "learning_rate": 1.7813846307985172e-05, + "loss": 0.5496, + "step": 218450 + }, + { + "epoch": 1.9312576247812019, + "grad_norm": 7.883699893951416, + "learning_rate": 1.7812372920313304e-05, + "loss": 0.6209, + "step": 218460 + }, + { + "epoch": 1.931346028041514, + "grad_norm": 1.207894206047058, + "learning_rate": 1.7810899532641432e-05, + "loss": 0.6467, + "step": 218470 + }, + { + "epoch": 1.9314344313018266, + "grad_norm": 1.264348030090332, + "learning_rate": 1.780942614496956e-05, + "loss": 0.5884, + "step": 218480 + }, + { + "epoch": 1.9315228345621387, + "grad_norm": 2.026008367538452, + "learning_rate": 1.7807952757297692e-05, + "loss": 0.6138, + "step": 218490 + }, + { + "epoch": 1.9316112378224508, + "grad_norm": 1.5705889463424683, + "learning_rate": 1.780647936962582e-05, + "loss": 0.4809, + "step": 218500 + }, + { + "epoch": 1.9316996410827632, + "grad_norm": 4.483217239379883, + "learning_rate": 1.780500598195395e-05, + "loss": 0.5974, + "step": 218510 + }, + { + "epoch": 1.9317880443430755, + "grad_norm": 2.591930389404297, + "learning_rate": 1.7803532594282077e-05, + "loss": 0.6075, + "step": 218520 + }, + { + "epoch": 1.9318764476033876, + "grad_norm": 8.742344856262207, + "learning_rate": 1.780205920661021e-05, + "loss": 0.5389, + "step": 218530 + }, + { + "epoch": 1.9319648508636997, + "grad_norm": 1.6099954843521118, + "learning_rate": 1.7800585818938337e-05, + "loss": 0.5678, + "step": 218540 + }, + { + "epoch": 1.932053254124012, + "grad_norm": 2.3716413974761963, + "learning_rate": 1.7799112431266466e-05, + "loss": 0.4723, + "step": 218550 + }, + { + "epoch": 1.9321416573843244, + "grad_norm": 1.2897536754608154, + "learning_rate": 1.7797639043594598e-05, + "loss": 0.4575, + "step": 218560 + }, + { + "epoch": 1.9322300606446365, + "grad_norm": 8.33523941040039, + "learning_rate": 1.7796165655922726e-05, + "loss": 0.5717, + "step": 218570 + }, + { + "epoch": 1.9323184639049487, + "grad_norm": 9.135923385620117, + "learning_rate": 1.7794692268250854e-05, + "loss": 0.6403, + "step": 218580 + }, + { + "epoch": 1.932406867165261, + "grad_norm": 5.0192131996154785, + "learning_rate": 1.7793218880578983e-05, + "loss": 0.5846, + "step": 218590 + }, + { + "epoch": 1.9324952704255733, + "grad_norm": 1.7597485780715942, + "learning_rate": 1.7791745492907114e-05, + "loss": 0.4968, + "step": 218600 + }, + { + "epoch": 1.9325836736858855, + "grad_norm": 7.343288421630859, + "learning_rate": 1.7790272105235243e-05, + "loss": 0.5158, + "step": 218610 + }, + { + "epoch": 1.9326720769461978, + "grad_norm": 2.0759341716766357, + "learning_rate": 1.778879871756337e-05, + "loss": 0.5679, + "step": 218620 + }, + { + "epoch": 1.9327604802065101, + "grad_norm": 4.260840892791748, + "learning_rate": 1.77873253298915e-05, + "loss": 0.5032, + "step": 218630 + }, + { + "epoch": 1.9328488834668223, + "grad_norm": 2.3392951488494873, + "learning_rate": 1.778585194221963e-05, + "loss": 0.4972, + "step": 218640 + }, + { + "epoch": 1.9329372867271344, + "grad_norm": 2.7006070613861084, + "learning_rate": 1.778437855454776e-05, + "loss": 0.6097, + "step": 218650 + }, + { + "epoch": 1.9330256899874467, + "grad_norm": 2.1393356323242188, + "learning_rate": 1.7782905166875888e-05, + "loss": 0.6489, + "step": 218660 + }, + { + "epoch": 1.933114093247759, + "grad_norm": 2.202934741973877, + "learning_rate": 1.778143177920402e-05, + "loss": 0.5765, + "step": 218670 + }, + { + "epoch": 1.9332024965080712, + "grad_norm": 5.654677867889404, + "learning_rate": 1.7779958391532148e-05, + "loss": 0.6287, + "step": 218680 + }, + { + "epoch": 1.9332908997683833, + "grad_norm": 2.843629837036133, + "learning_rate": 1.7778485003860276e-05, + "loss": 0.6147, + "step": 218690 + }, + { + "epoch": 1.9333793030286957, + "grad_norm": 5.8555192947387695, + "learning_rate": 1.7777011616188405e-05, + "loss": 0.6377, + "step": 218700 + }, + { + "epoch": 1.933467706289008, + "grad_norm": 6.428351879119873, + "learning_rate": 1.7775538228516536e-05, + "loss": 0.59, + "step": 218710 + }, + { + "epoch": 1.9335561095493201, + "grad_norm": 1.2108049392700195, + "learning_rate": 1.7774064840844665e-05, + "loss": 0.601, + "step": 218720 + }, + { + "epoch": 1.9336445128096325, + "grad_norm": 24.356868743896484, + "learning_rate": 1.7772591453172793e-05, + "loss": 0.4592, + "step": 218730 + }, + { + "epoch": 1.9337329160699448, + "grad_norm": 1.771763801574707, + "learning_rate": 1.777111806550092e-05, + "loss": 0.5184, + "step": 218740 + }, + { + "epoch": 1.933821319330257, + "grad_norm": 2.1380646228790283, + "learning_rate": 1.7769644677829053e-05, + "loss": 0.5084, + "step": 218750 + }, + { + "epoch": 1.933909722590569, + "grad_norm": 1.9356966018676758, + "learning_rate": 1.776817129015718e-05, + "loss": 0.5513, + "step": 218760 + }, + { + "epoch": 1.9339981258508814, + "grad_norm": 1.8511749505996704, + "learning_rate": 1.776669790248531e-05, + "loss": 0.5839, + "step": 218770 + }, + { + "epoch": 1.9340865291111937, + "grad_norm": 1.9369524717330933, + "learning_rate": 1.776522451481344e-05, + "loss": 0.6438, + "step": 218780 + }, + { + "epoch": 1.9341749323715058, + "grad_norm": 7.3883466720581055, + "learning_rate": 1.776375112714157e-05, + "loss": 0.6322, + "step": 218790 + }, + { + "epoch": 1.934263335631818, + "grad_norm": 6.247851371765137, + "learning_rate": 1.77622777394697e-05, + "loss": 0.5515, + "step": 218800 + }, + { + "epoch": 1.9343517388921303, + "grad_norm": 2.7822794914245605, + "learning_rate": 1.7760804351797827e-05, + "loss": 0.5161, + "step": 218810 + }, + { + "epoch": 1.9344401421524426, + "grad_norm": 3.816363573074341, + "learning_rate": 1.775933096412596e-05, + "loss": 0.5905, + "step": 218820 + }, + { + "epoch": 1.9345285454127548, + "grad_norm": 2.320831537246704, + "learning_rate": 1.7757857576454087e-05, + "loss": 0.5651, + "step": 218830 + }, + { + "epoch": 1.934616948673067, + "grad_norm": 2.0087990760803223, + "learning_rate": 1.7756384188782215e-05, + "loss": 0.6182, + "step": 218840 + }, + { + "epoch": 1.9347053519333794, + "grad_norm": 1.3577136993408203, + "learning_rate": 1.7754910801110347e-05, + "loss": 0.5823, + "step": 218850 + }, + { + "epoch": 1.9347937551936916, + "grad_norm": 6.090202331542969, + "learning_rate": 1.7753437413438475e-05, + "loss": 0.5309, + "step": 218860 + }, + { + "epoch": 1.9348821584540037, + "grad_norm": 2.085124969482422, + "learning_rate": 1.7751964025766604e-05, + "loss": 0.5835, + "step": 218870 + }, + { + "epoch": 1.934970561714316, + "grad_norm": 1.6566983461380005, + "learning_rate": 1.7750490638094732e-05, + "loss": 0.5985, + "step": 218880 + }, + { + "epoch": 1.9350589649746284, + "grad_norm": 2.8751561641693115, + "learning_rate": 1.7749017250422864e-05, + "loss": 0.7127, + "step": 218890 + }, + { + "epoch": 1.9351473682349405, + "grad_norm": 2.1735904216766357, + "learning_rate": 1.7747543862750992e-05, + "loss": 0.5764, + "step": 218900 + }, + { + "epoch": 1.9352357714952526, + "grad_norm": 1.6033087968826294, + "learning_rate": 1.774607047507912e-05, + "loss": 0.56, + "step": 218910 + }, + { + "epoch": 1.935324174755565, + "grad_norm": 1.7736544609069824, + "learning_rate": 1.774459708740725e-05, + "loss": 0.6301, + "step": 218920 + }, + { + "epoch": 1.9354125780158773, + "grad_norm": 5.374022006988525, + "learning_rate": 1.774312369973538e-05, + "loss": 0.6536, + "step": 218930 + }, + { + "epoch": 1.9355009812761894, + "grad_norm": 5.016340732574463, + "learning_rate": 1.774165031206351e-05, + "loss": 0.693, + "step": 218940 + }, + { + "epoch": 1.9355893845365018, + "grad_norm": 2.5083281993865967, + "learning_rate": 1.7740176924391637e-05, + "loss": 0.4785, + "step": 218950 + }, + { + "epoch": 1.935677787796814, + "grad_norm": 1.362806797027588, + "learning_rate": 1.773870353671977e-05, + "loss": 0.5337, + "step": 218960 + }, + { + "epoch": 1.9357661910571262, + "grad_norm": 5.3241376876831055, + "learning_rate": 1.7737230149047897e-05, + "loss": 0.6107, + "step": 218970 + }, + { + "epoch": 1.9358545943174383, + "grad_norm": 5.739940166473389, + "learning_rate": 1.7735756761376026e-05, + "loss": 0.7272, + "step": 218980 + }, + { + "epoch": 1.9359429975777507, + "grad_norm": 5.452607154846191, + "learning_rate": 1.7734283373704154e-05, + "loss": 0.5908, + "step": 218990 + }, + { + "epoch": 1.936031400838063, + "grad_norm": 0.8039748072624207, + "learning_rate": 1.7732809986032286e-05, + "loss": 0.5703, + "step": 219000 + }, + { + "epoch": 1.9361198040983751, + "grad_norm": 1.631017804145813, + "learning_rate": 1.7731336598360414e-05, + "loss": 0.5345, + "step": 219010 + }, + { + "epoch": 1.9362082073586873, + "grad_norm": 5.559750556945801, + "learning_rate": 1.7729863210688543e-05, + "loss": 0.5953, + "step": 219020 + }, + { + "epoch": 1.9362966106189996, + "grad_norm": 1.3706836700439453, + "learning_rate": 1.7728389823016674e-05, + "loss": 0.6115, + "step": 219030 + }, + { + "epoch": 1.936385013879312, + "grad_norm": 6.054581642150879, + "learning_rate": 1.7726916435344803e-05, + "loss": 0.4715, + "step": 219040 + }, + { + "epoch": 1.936473417139624, + "grad_norm": 10.774863243103027, + "learning_rate": 1.772544304767293e-05, + "loss": 0.5946, + "step": 219050 + }, + { + "epoch": 1.9365618203999362, + "grad_norm": 3.723139524459839, + "learning_rate": 1.772396966000106e-05, + "loss": 0.7378, + "step": 219060 + }, + { + "epoch": 1.9366502236602487, + "grad_norm": 1.4909392595291138, + "learning_rate": 1.772249627232919e-05, + "loss": 0.6206, + "step": 219070 + }, + { + "epoch": 1.9367386269205609, + "grad_norm": 1.3338351249694824, + "learning_rate": 1.772102288465732e-05, + "loss": 0.5949, + "step": 219080 + }, + { + "epoch": 1.936827030180873, + "grad_norm": 1.9898263216018677, + "learning_rate": 1.7719549496985448e-05, + "loss": 0.4866, + "step": 219090 + }, + { + "epoch": 1.9369154334411853, + "grad_norm": 5.609694004058838, + "learning_rate": 1.7718076109313576e-05, + "loss": 0.583, + "step": 219100 + }, + { + "epoch": 1.9370038367014977, + "grad_norm": 4.765753269195557, + "learning_rate": 1.7716602721641708e-05, + "loss": 0.6616, + "step": 219110 + }, + { + "epoch": 1.9370922399618098, + "grad_norm": 4.974100112915039, + "learning_rate": 1.7715129333969836e-05, + "loss": 0.4614, + "step": 219120 + }, + { + "epoch": 1.937180643222122, + "grad_norm": 1.5515565872192383, + "learning_rate": 1.7713655946297965e-05, + "loss": 0.5789, + "step": 219130 + }, + { + "epoch": 1.9372690464824343, + "grad_norm": 6.178471565246582, + "learning_rate": 1.7712182558626096e-05, + "loss": 0.5698, + "step": 219140 + }, + { + "epoch": 1.9373574497427466, + "grad_norm": 6.909763336181641, + "learning_rate": 1.7710709170954225e-05, + "loss": 0.6136, + "step": 219150 + }, + { + "epoch": 1.9374458530030587, + "grad_norm": 1.6127444505691528, + "learning_rate": 1.7709235783282353e-05, + "loss": 0.5734, + "step": 219160 + }, + { + "epoch": 1.9375342562633708, + "grad_norm": 2.046926736831665, + "learning_rate": 1.7707762395610485e-05, + "loss": 0.5165, + "step": 219170 + }, + { + "epoch": 1.9376226595236834, + "grad_norm": 4.1073527336120605, + "learning_rate": 1.7706289007938613e-05, + "loss": 0.581, + "step": 219180 + }, + { + "epoch": 1.9377110627839955, + "grad_norm": 1.3950103521347046, + "learning_rate": 1.770481562026674e-05, + "loss": 0.6281, + "step": 219190 + }, + { + "epoch": 1.9377994660443076, + "grad_norm": 1.0525743961334229, + "learning_rate": 1.7703342232594873e-05, + "loss": 0.532, + "step": 219200 + }, + { + "epoch": 1.93788786930462, + "grad_norm": 0.8216108083724976, + "learning_rate": 1.7701868844923e-05, + "loss": 0.6186, + "step": 219210 + }, + { + "epoch": 1.9379762725649323, + "grad_norm": 7.460700035095215, + "learning_rate": 1.770039545725113e-05, + "loss": 0.5772, + "step": 219220 + }, + { + "epoch": 1.9380646758252444, + "grad_norm": 2.1443541049957275, + "learning_rate": 1.769892206957926e-05, + "loss": 0.5322, + "step": 219230 + }, + { + "epoch": 1.9381530790855566, + "grad_norm": 0.920703649520874, + "learning_rate": 1.769744868190739e-05, + "loss": 0.6011, + "step": 219240 + }, + { + "epoch": 1.938241482345869, + "grad_norm": 1.9932037591934204, + "learning_rate": 1.769597529423552e-05, + "loss": 0.6048, + "step": 219250 + }, + { + "epoch": 1.9383298856061812, + "grad_norm": 6.591262340545654, + "learning_rate": 1.769450190656365e-05, + "loss": 0.5356, + "step": 219260 + }, + { + "epoch": 1.9384182888664934, + "grad_norm": 4.121075630187988, + "learning_rate": 1.769302851889178e-05, + "loss": 0.5259, + "step": 219270 + }, + { + "epoch": 1.9385066921268055, + "grad_norm": 6.472079753875732, + "learning_rate": 1.7691555131219907e-05, + "loss": 0.625, + "step": 219280 + }, + { + "epoch": 1.9385950953871178, + "grad_norm": 2.06003999710083, + "learning_rate": 1.769008174354804e-05, + "loss": 0.5802, + "step": 219290 + }, + { + "epoch": 1.9386834986474302, + "grad_norm": 3.8098337650299072, + "learning_rate": 1.7688608355876167e-05, + "loss": 0.6639, + "step": 219300 + }, + { + "epoch": 1.9387719019077423, + "grad_norm": 2.501352548599243, + "learning_rate": 1.7687134968204295e-05, + "loss": 0.5246, + "step": 219310 + }, + { + "epoch": 1.9388603051680546, + "grad_norm": 5.230441093444824, + "learning_rate": 1.7685661580532427e-05, + "loss": 0.5643, + "step": 219320 + }, + { + "epoch": 1.938948708428367, + "grad_norm": 1.5599844455718994, + "learning_rate": 1.7684188192860555e-05, + "loss": 0.6277, + "step": 219330 + }, + { + "epoch": 1.939037111688679, + "grad_norm": 7.174968242645264, + "learning_rate": 1.7682714805188684e-05, + "loss": 0.5319, + "step": 219340 + }, + { + "epoch": 1.9391255149489912, + "grad_norm": 3.6672191619873047, + "learning_rate": 1.7681241417516812e-05, + "loss": 0.5435, + "step": 219350 + }, + { + "epoch": 1.9392139182093036, + "grad_norm": 6.531252861022949, + "learning_rate": 1.7679768029844944e-05, + "loss": 0.4753, + "step": 219360 + }, + { + "epoch": 1.939302321469616, + "grad_norm": 5.337547302246094, + "learning_rate": 1.7678294642173072e-05, + "loss": 0.6496, + "step": 219370 + }, + { + "epoch": 1.939390724729928, + "grad_norm": 1.5465481281280518, + "learning_rate": 1.76768212545012e-05, + "loss": 0.5702, + "step": 219380 + }, + { + "epoch": 1.9394791279902401, + "grad_norm": 4.0540642738342285, + "learning_rate": 1.767534786682933e-05, + "loss": 0.566, + "step": 219390 + }, + { + "epoch": 1.9395675312505525, + "grad_norm": 7.075825214385986, + "learning_rate": 1.767387447915746e-05, + "loss": 0.501, + "step": 219400 + }, + { + "epoch": 1.9396559345108648, + "grad_norm": 10.447091102600098, + "learning_rate": 1.767240109148559e-05, + "loss": 0.5543, + "step": 219410 + }, + { + "epoch": 1.939744337771177, + "grad_norm": 2.6997909545898438, + "learning_rate": 1.7670927703813717e-05, + "loss": 0.5825, + "step": 219420 + }, + { + "epoch": 1.9398327410314893, + "grad_norm": 2.0073082447052, + "learning_rate": 1.766945431614185e-05, + "loss": 0.5996, + "step": 219430 + }, + { + "epoch": 1.9399211442918016, + "grad_norm": 1.1852062940597534, + "learning_rate": 1.7667980928469977e-05, + "loss": 0.7153, + "step": 219440 + }, + { + "epoch": 1.9400095475521137, + "grad_norm": 1.176051378250122, + "learning_rate": 1.7666507540798106e-05, + "loss": 0.4779, + "step": 219450 + }, + { + "epoch": 1.9400979508124259, + "grad_norm": 1.9624440670013428, + "learning_rate": 1.7665034153126234e-05, + "loss": 0.4672, + "step": 219460 + }, + { + "epoch": 1.9401863540727382, + "grad_norm": 3.0061705112457275, + "learning_rate": 1.7663560765454366e-05, + "loss": 0.5718, + "step": 219470 + }, + { + "epoch": 1.9402747573330505, + "grad_norm": 3.341076612472534, + "learning_rate": 1.7662087377782494e-05, + "loss": 0.7723, + "step": 219480 + }, + { + "epoch": 1.9403631605933627, + "grad_norm": 8.552477836608887, + "learning_rate": 1.7660613990110623e-05, + "loss": 0.5439, + "step": 219490 + }, + { + "epoch": 1.9404515638536748, + "grad_norm": 12.897489547729492, + "learning_rate": 1.7659140602438754e-05, + "loss": 0.5182, + "step": 219500 + }, + { + "epoch": 1.9405399671139871, + "grad_norm": 33.714229583740234, + "learning_rate": 1.7657667214766883e-05, + "loss": 0.6536, + "step": 219510 + }, + { + "epoch": 1.9406283703742995, + "grad_norm": 1.814167857170105, + "learning_rate": 1.765619382709501e-05, + "loss": 0.5305, + "step": 219520 + }, + { + "epoch": 1.9407167736346116, + "grad_norm": 0.6747092604637146, + "learning_rate": 1.765472043942314e-05, + "loss": 0.5619, + "step": 219530 + }, + { + "epoch": 1.940805176894924, + "grad_norm": 3.3053078651428223, + "learning_rate": 1.765324705175127e-05, + "loss": 0.6962, + "step": 219540 + }, + { + "epoch": 1.9408935801552363, + "grad_norm": 1.7575353384017944, + "learning_rate": 1.76517736640794e-05, + "loss": 0.5206, + "step": 219550 + }, + { + "epoch": 1.9409819834155484, + "grad_norm": 3.9731557369232178, + "learning_rate": 1.7650300276407528e-05, + "loss": 0.6002, + "step": 219560 + }, + { + "epoch": 1.9410703866758605, + "grad_norm": 6.252354145050049, + "learning_rate": 1.7648826888735656e-05, + "loss": 0.5617, + "step": 219570 + }, + { + "epoch": 1.9411587899361729, + "grad_norm": 1.7864651679992676, + "learning_rate": 1.7647353501063788e-05, + "loss": 0.5981, + "step": 219580 + }, + { + "epoch": 1.9412471931964852, + "grad_norm": 0.9924711585044861, + "learning_rate": 1.7645880113391916e-05, + "loss": 0.7034, + "step": 219590 + }, + { + "epoch": 1.9413355964567973, + "grad_norm": 0.935824990272522, + "learning_rate": 1.7644406725720045e-05, + "loss": 0.5799, + "step": 219600 + }, + { + "epoch": 1.9414239997171094, + "grad_norm": 0.8380221724510193, + "learning_rate": 1.7642933338048176e-05, + "loss": 0.5999, + "step": 219610 + }, + { + "epoch": 1.9415124029774218, + "grad_norm": 5.587125778198242, + "learning_rate": 1.7641459950376305e-05, + "loss": 0.5707, + "step": 219620 + }, + { + "epoch": 1.9416008062377341, + "grad_norm": 2.5349559783935547, + "learning_rate": 1.7639986562704433e-05, + "loss": 0.5103, + "step": 219630 + }, + { + "epoch": 1.9416892094980462, + "grad_norm": 3.3946216106414795, + "learning_rate": 1.763851317503256e-05, + "loss": 0.644, + "step": 219640 + }, + { + "epoch": 1.9417776127583584, + "grad_norm": 1.6937671899795532, + "learning_rate": 1.7637039787360693e-05, + "loss": 0.6548, + "step": 219650 + }, + { + "epoch": 1.941866016018671, + "grad_norm": 10.187103271484375, + "learning_rate": 1.763556639968882e-05, + "loss": 0.5445, + "step": 219660 + }, + { + "epoch": 1.941954419278983, + "grad_norm": 3.1246190071105957, + "learning_rate": 1.763409301201695e-05, + "loss": 0.5554, + "step": 219670 + }, + { + "epoch": 1.9420428225392952, + "grad_norm": 1.1979979276657104, + "learning_rate": 1.763261962434508e-05, + "loss": 0.6211, + "step": 219680 + }, + { + "epoch": 1.9421312257996075, + "grad_norm": 2.406420946121216, + "learning_rate": 1.763114623667321e-05, + "loss": 0.5294, + "step": 219690 + }, + { + "epoch": 1.9422196290599198, + "grad_norm": 5.116988658905029, + "learning_rate": 1.762967284900134e-05, + "loss": 0.7088, + "step": 219700 + }, + { + "epoch": 1.942308032320232, + "grad_norm": 4.389537811279297, + "learning_rate": 1.7628199461329467e-05, + "loss": 0.4937, + "step": 219710 + }, + { + "epoch": 1.942396435580544, + "grad_norm": 1.7930338382720947, + "learning_rate": 1.76267260736576e-05, + "loss": 0.5966, + "step": 219720 + }, + { + "epoch": 1.9424848388408564, + "grad_norm": 3.2008001804351807, + "learning_rate": 1.7625252685985727e-05, + "loss": 0.5694, + "step": 219730 + }, + { + "epoch": 1.9425732421011688, + "grad_norm": 0.765235960483551, + "learning_rate": 1.7623779298313855e-05, + "loss": 0.6581, + "step": 219740 + }, + { + "epoch": 1.942661645361481, + "grad_norm": 1.841051697731018, + "learning_rate": 1.7622305910641984e-05, + "loss": 0.5654, + "step": 219750 + }, + { + "epoch": 1.942750048621793, + "grad_norm": 3.4378368854522705, + "learning_rate": 1.7620832522970115e-05, + "loss": 0.6789, + "step": 219760 + }, + { + "epoch": 1.9428384518821056, + "grad_norm": 4.095025539398193, + "learning_rate": 1.7619359135298244e-05, + "loss": 0.6061, + "step": 219770 + }, + { + "epoch": 1.9429268551424177, + "grad_norm": 2.454801082611084, + "learning_rate": 1.7617885747626372e-05, + "loss": 0.5717, + "step": 219780 + }, + { + "epoch": 1.9430152584027298, + "grad_norm": 1.8583295345306396, + "learning_rate": 1.7616412359954504e-05, + "loss": 0.6113, + "step": 219790 + }, + { + "epoch": 1.9431036616630422, + "grad_norm": 2.6790244579315186, + "learning_rate": 1.7614938972282632e-05, + "loss": 0.6239, + "step": 219800 + }, + { + "epoch": 1.9431920649233545, + "grad_norm": 3.1267857551574707, + "learning_rate": 1.761346558461076e-05, + "loss": 0.5703, + "step": 219810 + }, + { + "epoch": 1.9432804681836666, + "grad_norm": 3.0117502212524414, + "learning_rate": 1.761199219693889e-05, + "loss": 0.5695, + "step": 219820 + }, + { + "epoch": 1.9433688714439787, + "grad_norm": 3.4411027431488037, + "learning_rate": 1.761051880926702e-05, + "loss": 0.4603, + "step": 219830 + }, + { + "epoch": 1.943457274704291, + "grad_norm": 2.077807664871216, + "learning_rate": 1.760904542159515e-05, + "loss": 0.6571, + "step": 219840 + }, + { + "epoch": 1.9435456779646034, + "grad_norm": 1.2126491069793701, + "learning_rate": 1.7607572033923277e-05, + "loss": 0.5591, + "step": 219850 + }, + { + "epoch": 1.9436340812249155, + "grad_norm": 9.818131446838379, + "learning_rate": 1.7606098646251406e-05, + "loss": 0.5448, + "step": 219860 + }, + { + "epoch": 1.9437224844852277, + "grad_norm": 15.830206871032715, + "learning_rate": 1.7604625258579537e-05, + "loss": 0.5582, + "step": 219870 + }, + { + "epoch": 1.94381088774554, + "grad_norm": 2.1060850620269775, + "learning_rate": 1.7603151870907666e-05, + "loss": 0.6976, + "step": 219880 + }, + { + "epoch": 1.9438992910058523, + "grad_norm": 4.5368828773498535, + "learning_rate": 1.7601678483235794e-05, + "loss": 0.661, + "step": 219890 + }, + { + "epoch": 1.9439876942661645, + "grad_norm": 3.4462361335754395, + "learning_rate": 1.7600205095563926e-05, + "loss": 0.5099, + "step": 219900 + }, + { + "epoch": 1.9440760975264768, + "grad_norm": 1.9157859086990356, + "learning_rate": 1.7598731707892054e-05, + "loss": 0.5198, + "step": 219910 + }, + { + "epoch": 1.9441645007867892, + "grad_norm": 1.4814224243164062, + "learning_rate": 1.7597258320220182e-05, + "loss": 0.5405, + "step": 219920 + }, + { + "epoch": 1.9442529040471013, + "grad_norm": 1.172135353088379, + "learning_rate": 1.759578493254831e-05, + "loss": 0.5591, + "step": 219930 + }, + { + "epoch": 1.9443413073074134, + "grad_norm": 4.656274318695068, + "learning_rate": 1.7594311544876443e-05, + "loss": 0.7209, + "step": 219940 + }, + { + "epoch": 1.9444297105677257, + "grad_norm": 2.0200414657592773, + "learning_rate": 1.759283815720457e-05, + "loss": 0.6001, + "step": 219950 + }, + { + "epoch": 1.944518113828038, + "grad_norm": 12.98513412475586, + "learning_rate": 1.75913647695327e-05, + "loss": 0.532, + "step": 219960 + }, + { + "epoch": 1.9446065170883502, + "grad_norm": 1.2624541521072388, + "learning_rate": 1.758989138186083e-05, + "loss": 0.4632, + "step": 219970 + }, + { + "epoch": 1.9446949203486623, + "grad_norm": 2.138148307800293, + "learning_rate": 1.758841799418896e-05, + "loss": 0.6467, + "step": 219980 + }, + { + "epoch": 1.9447833236089747, + "grad_norm": 7.881981372833252, + "learning_rate": 1.7586944606517088e-05, + "loss": 0.5565, + "step": 219990 + }, + { + "epoch": 1.944871726869287, + "grad_norm": 2.5142242908477783, + "learning_rate": 1.7585471218845216e-05, + "loss": 0.615, + "step": 220000 + }, + { + "epoch": 1.9449601301295991, + "grad_norm": 1.5457180738449097, + "learning_rate": 1.7583997831173348e-05, + "loss": 0.5461, + "step": 220010 + }, + { + "epoch": 1.9450485333899115, + "grad_norm": 3.353332757949829, + "learning_rate": 1.7582524443501476e-05, + "loss": 0.5181, + "step": 220020 + }, + { + "epoch": 1.9451369366502238, + "grad_norm": 32.87668991088867, + "learning_rate": 1.7581051055829605e-05, + "loss": 0.627, + "step": 220030 + }, + { + "epoch": 1.945225339910536, + "grad_norm": 2.2376997470855713, + "learning_rate": 1.7579577668157733e-05, + "loss": 0.6101, + "step": 220040 + }, + { + "epoch": 1.945313743170848, + "grad_norm": 4.235456943511963, + "learning_rate": 1.7578104280485865e-05, + "loss": 0.4517, + "step": 220050 + }, + { + "epoch": 1.9454021464311604, + "grad_norm": 2.897719383239746, + "learning_rate": 1.7576630892813993e-05, + "loss": 0.529, + "step": 220060 + }, + { + "epoch": 1.9454905496914727, + "grad_norm": 3.140842914581299, + "learning_rate": 1.757515750514212e-05, + "loss": 0.7252, + "step": 220070 + }, + { + "epoch": 1.9455789529517848, + "grad_norm": 4.630125045776367, + "learning_rate": 1.7573684117470253e-05, + "loss": 0.6496, + "step": 220080 + }, + { + "epoch": 1.945667356212097, + "grad_norm": 2.1478161811828613, + "learning_rate": 1.757221072979838e-05, + "loss": 0.4344, + "step": 220090 + }, + { + "epoch": 1.9457557594724093, + "grad_norm": 2.4617857933044434, + "learning_rate": 1.757073734212651e-05, + "loss": 0.7266, + "step": 220100 + }, + { + "epoch": 1.9458441627327216, + "grad_norm": 3.54447603225708, + "learning_rate": 1.756926395445464e-05, + "loss": 0.6597, + "step": 220110 + }, + { + "epoch": 1.9459325659930338, + "grad_norm": 1.2158421277999878, + "learning_rate": 1.756779056678277e-05, + "loss": 0.4788, + "step": 220120 + }, + { + "epoch": 1.9460209692533461, + "grad_norm": 12.480134010314941, + "learning_rate": 1.7566317179110898e-05, + "loss": 0.6888, + "step": 220130 + }, + { + "epoch": 1.9461093725136585, + "grad_norm": 3.005518674850464, + "learning_rate": 1.756484379143903e-05, + "loss": 0.5886, + "step": 220140 + }, + { + "epoch": 1.9461977757739706, + "grad_norm": 4.216115474700928, + "learning_rate": 1.756337040376716e-05, + "loss": 0.7235, + "step": 220150 + }, + { + "epoch": 1.9462861790342827, + "grad_norm": 2.8140199184417725, + "learning_rate": 1.7561897016095287e-05, + "loss": 0.6644, + "step": 220160 + }, + { + "epoch": 1.946374582294595, + "grad_norm": 6.192380428314209, + "learning_rate": 1.756042362842342e-05, + "loss": 0.4998, + "step": 220170 + }, + { + "epoch": 1.9464629855549074, + "grad_norm": 5.766744613647461, + "learning_rate": 1.7558950240751547e-05, + "loss": 0.6183, + "step": 220180 + }, + { + "epoch": 1.9465513888152195, + "grad_norm": 5.080082416534424, + "learning_rate": 1.755747685307968e-05, + "loss": 0.6872, + "step": 220190 + }, + { + "epoch": 1.9466397920755316, + "grad_norm": 2.4956791400909424, + "learning_rate": 1.7556003465407807e-05, + "loss": 0.5551, + "step": 220200 + }, + { + "epoch": 1.946728195335844, + "grad_norm": 3.850264310836792, + "learning_rate": 1.7554530077735935e-05, + "loss": 0.685, + "step": 220210 + }, + { + "epoch": 1.9468165985961563, + "grad_norm": 1.7307443618774414, + "learning_rate": 1.7553056690064064e-05, + "loss": 0.7235, + "step": 220220 + }, + { + "epoch": 1.9469050018564684, + "grad_norm": 2.543159008026123, + "learning_rate": 1.7551583302392195e-05, + "loss": 0.6074, + "step": 220230 + }, + { + "epoch": 1.9469934051167808, + "grad_norm": 2.0163302421569824, + "learning_rate": 1.7550109914720324e-05, + "loss": 0.6399, + "step": 220240 + }, + { + "epoch": 1.947081808377093, + "grad_norm": 3.360020399093628, + "learning_rate": 1.7548636527048452e-05, + "loss": 0.5777, + "step": 220250 + }, + { + "epoch": 1.9471702116374052, + "grad_norm": 2.3825111389160156, + "learning_rate": 1.7547163139376584e-05, + "loss": 0.5344, + "step": 220260 + }, + { + "epoch": 1.9472586148977173, + "grad_norm": 3.441697835922241, + "learning_rate": 1.7545689751704712e-05, + "loss": 0.6566, + "step": 220270 + }, + { + "epoch": 1.9473470181580297, + "grad_norm": 4.799928188323975, + "learning_rate": 1.754421636403284e-05, + "loss": 0.648, + "step": 220280 + }, + { + "epoch": 1.947435421418342, + "grad_norm": 1.052435278892517, + "learning_rate": 1.754274297636097e-05, + "loss": 0.6379, + "step": 220290 + }, + { + "epoch": 1.9475238246786541, + "grad_norm": 0.9867194294929504, + "learning_rate": 1.75412695886891e-05, + "loss": 0.5768, + "step": 220300 + }, + { + "epoch": 1.9476122279389663, + "grad_norm": 2.3401873111724854, + "learning_rate": 1.753979620101723e-05, + "loss": 0.5194, + "step": 220310 + }, + { + "epoch": 1.9477006311992786, + "grad_norm": 1.3275898694992065, + "learning_rate": 1.7538322813345357e-05, + "loss": 0.7023, + "step": 220320 + }, + { + "epoch": 1.947789034459591, + "grad_norm": 1.9870930910110474, + "learning_rate": 1.7536849425673486e-05, + "loss": 0.6657, + "step": 220330 + }, + { + "epoch": 1.947877437719903, + "grad_norm": 14.915205001831055, + "learning_rate": 1.7535376038001617e-05, + "loss": 0.6116, + "step": 220340 + }, + { + "epoch": 1.9479658409802152, + "grad_norm": 1.971383810043335, + "learning_rate": 1.7533902650329746e-05, + "loss": 0.5179, + "step": 220350 + }, + { + "epoch": 1.9480542442405278, + "grad_norm": 2.5229861736297607, + "learning_rate": 1.7532429262657874e-05, + "loss": 0.6418, + "step": 220360 + }, + { + "epoch": 1.9481426475008399, + "grad_norm": 2.2057082653045654, + "learning_rate": 1.7530955874986006e-05, + "loss": 0.4316, + "step": 220370 + }, + { + "epoch": 1.948231050761152, + "grad_norm": 4.470913887023926, + "learning_rate": 1.7529482487314134e-05, + "loss": 0.6664, + "step": 220380 + }, + { + "epoch": 1.9483194540214643, + "grad_norm": 3.529062271118164, + "learning_rate": 1.7528009099642263e-05, + "loss": 0.6037, + "step": 220390 + }, + { + "epoch": 1.9484078572817767, + "grad_norm": 0.8591119050979614, + "learning_rate": 1.752653571197039e-05, + "loss": 0.4202, + "step": 220400 + }, + { + "epoch": 1.9484962605420888, + "grad_norm": 9.83202838897705, + "learning_rate": 1.7525062324298523e-05, + "loss": 0.6812, + "step": 220410 + }, + { + "epoch": 1.948584663802401, + "grad_norm": 2.8193583488464355, + "learning_rate": 1.752358893662665e-05, + "loss": 0.5509, + "step": 220420 + }, + { + "epoch": 1.9486730670627133, + "grad_norm": 3.4376721382141113, + "learning_rate": 1.752211554895478e-05, + "loss": 0.5605, + "step": 220430 + }, + { + "epoch": 1.9487614703230256, + "grad_norm": 2.1072752475738525, + "learning_rate": 1.752064216128291e-05, + "loss": 0.5252, + "step": 220440 + }, + { + "epoch": 1.9488498735833377, + "grad_norm": 8.790304183959961, + "learning_rate": 1.751916877361104e-05, + "loss": 0.6145, + "step": 220450 + }, + { + "epoch": 1.9489382768436498, + "grad_norm": 3.855455160140991, + "learning_rate": 1.7517695385939168e-05, + "loss": 0.4925, + "step": 220460 + }, + { + "epoch": 1.9490266801039622, + "grad_norm": 1.7364224195480347, + "learning_rate": 1.7516221998267296e-05, + "loss": 0.6553, + "step": 220470 + }, + { + "epoch": 1.9491150833642745, + "grad_norm": 3.5123841762542725, + "learning_rate": 1.7514748610595428e-05, + "loss": 0.5795, + "step": 220480 + }, + { + "epoch": 1.9492034866245866, + "grad_norm": 2.9621994495391846, + "learning_rate": 1.7513275222923556e-05, + "loss": 0.7128, + "step": 220490 + }, + { + "epoch": 1.949291889884899, + "grad_norm": 7.281950950622559, + "learning_rate": 1.7511801835251685e-05, + "loss": 0.6006, + "step": 220500 + }, + { + "epoch": 1.9493802931452113, + "grad_norm": 2.602921962738037, + "learning_rate": 1.7510328447579813e-05, + "loss": 0.6524, + "step": 220510 + }, + { + "epoch": 1.9494686964055234, + "grad_norm": 4.122191429138184, + "learning_rate": 1.7508855059907945e-05, + "loss": 0.5878, + "step": 220520 + }, + { + "epoch": 1.9495570996658356, + "grad_norm": 1.875119924545288, + "learning_rate": 1.7507381672236073e-05, + "loss": 0.5572, + "step": 220530 + }, + { + "epoch": 1.949645502926148, + "grad_norm": 1.1922593116760254, + "learning_rate": 1.75059082845642e-05, + "loss": 0.5543, + "step": 220540 + }, + { + "epoch": 1.9497339061864603, + "grad_norm": 1.9271132946014404, + "learning_rate": 1.7504434896892333e-05, + "loss": 0.6123, + "step": 220550 + }, + { + "epoch": 1.9498223094467724, + "grad_norm": 3.6488096714019775, + "learning_rate": 1.750296150922046e-05, + "loss": 0.6235, + "step": 220560 + }, + { + "epoch": 1.9499107127070845, + "grad_norm": 4.910078525543213, + "learning_rate": 1.750148812154859e-05, + "loss": 0.4749, + "step": 220570 + }, + { + "epoch": 1.9499991159673968, + "grad_norm": 10.244205474853516, + "learning_rate": 1.7500014733876718e-05, + "loss": 0.5819, + "step": 220580 + }, + { + "epoch": 1.9500875192277092, + "grad_norm": 1.7231125831604004, + "learning_rate": 1.749854134620485e-05, + "loss": 0.5535, + "step": 220590 + }, + { + "epoch": 1.9501759224880213, + "grad_norm": 2.103482961654663, + "learning_rate": 1.7497067958532978e-05, + "loss": 0.7089, + "step": 220600 + }, + { + "epoch": 1.9502643257483336, + "grad_norm": 2.507673501968384, + "learning_rate": 1.7495594570861107e-05, + "loss": 0.6831, + "step": 220610 + }, + { + "epoch": 1.950352729008646, + "grad_norm": 3.5907340049743652, + "learning_rate": 1.749412118318924e-05, + "loss": 0.5588, + "step": 220620 + }, + { + "epoch": 1.950441132268958, + "grad_norm": 1.2048869132995605, + "learning_rate": 1.7492647795517367e-05, + "loss": 0.5469, + "step": 220630 + }, + { + "epoch": 1.9505295355292702, + "grad_norm": 3.259854793548584, + "learning_rate": 1.7491174407845495e-05, + "loss": 0.605, + "step": 220640 + }, + { + "epoch": 1.9506179387895826, + "grad_norm": 1.5435246229171753, + "learning_rate": 1.7489701020173623e-05, + "loss": 0.5402, + "step": 220650 + }, + { + "epoch": 1.950706342049895, + "grad_norm": 19.203760147094727, + "learning_rate": 1.7488227632501755e-05, + "loss": 0.5638, + "step": 220660 + }, + { + "epoch": 1.950794745310207, + "grad_norm": 3.113534450531006, + "learning_rate": 1.7486754244829884e-05, + "loss": 0.5273, + "step": 220670 + }, + { + "epoch": 1.9508831485705191, + "grad_norm": 2.984529733657837, + "learning_rate": 1.7485280857158012e-05, + "loss": 0.4242, + "step": 220680 + }, + { + "epoch": 1.9509715518308315, + "grad_norm": 2.876054286956787, + "learning_rate": 1.748380746948614e-05, + "loss": 0.606, + "step": 220690 + }, + { + "epoch": 1.9510599550911438, + "grad_norm": 7.738475799560547, + "learning_rate": 1.7482334081814272e-05, + "loss": 0.6156, + "step": 220700 + }, + { + "epoch": 1.951148358351456, + "grad_norm": 2.8049612045288086, + "learning_rate": 1.74808606941424e-05, + "loss": 0.6601, + "step": 220710 + }, + { + "epoch": 1.9512367616117683, + "grad_norm": 2.8683419227600098, + "learning_rate": 1.747938730647053e-05, + "loss": 0.5211, + "step": 220720 + }, + { + "epoch": 1.9513251648720806, + "grad_norm": 9.514555931091309, + "learning_rate": 1.747791391879866e-05, + "loss": 0.5206, + "step": 220730 + }, + { + "epoch": 1.9514135681323928, + "grad_norm": 12.106396675109863, + "learning_rate": 1.747644053112679e-05, + "loss": 0.5466, + "step": 220740 + }, + { + "epoch": 1.9515019713927049, + "grad_norm": 3.4347753524780273, + "learning_rate": 1.7474967143454917e-05, + "loss": 0.6684, + "step": 220750 + }, + { + "epoch": 1.9515903746530172, + "grad_norm": 7.132259368896484, + "learning_rate": 1.7473493755783046e-05, + "loss": 0.7184, + "step": 220760 + }, + { + "epoch": 1.9516787779133296, + "grad_norm": 5.191409111022949, + "learning_rate": 1.7472020368111177e-05, + "loss": 0.5463, + "step": 220770 + }, + { + "epoch": 1.9517671811736417, + "grad_norm": 1.6866577863693237, + "learning_rate": 1.7470546980439306e-05, + "loss": 0.5319, + "step": 220780 + }, + { + "epoch": 1.9518555844339538, + "grad_norm": 1.7636666297912598, + "learning_rate": 1.7469073592767434e-05, + "loss": 0.5774, + "step": 220790 + }, + { + "epoch": 1.9519439876942661, + "grad_norm": 6.350882530212402, + "learning_rate": 1.7467600205095562e-05, + "loss": 0.5517, + "step": 220800 + }, + { + "epoch": 1.9520323909545785, + "grad_norm": 0.730274498462677, + "learning_rate": 1.7466126817423694e-05, + "loss": 0.5857, + "step": 220810 + }, + { + "epoch": 1.9521207942148906, + "grad_norm": 2.098360776901245, + "learning_rate": 1.7464653429751822e-05, + "loss": 0.6221, + "step": 220820 + }, + { + "epoch": 1.952209197475203, + "grad_norm": 3.16597843170166, + "learning_rate": 1.746318004207995e-05, + "loss": 0.5988, + "step": 220830 + }, + { + "epoch": 1.9522976007355153, + "grad_norm": 1.323750376701355, + "learning_rate": 1.7461706654408083e-05, + "loss": 0.586, + "step": 220840 + }, + { + "epoch": 1.9523860039958274, + "grad_norm": 1.8122895956039429, + "learning_rate": 1.746023326673621e-05, + "loss": 0.5637, + "step": 220850 + }, + { + "epoch": 1.9524744072561395, + "grad_norm": 2.3582842350006104, + "learning_rate": 1.745875987906434e-05, + "loss": 0.6373, + "step": 220860 + }, + { + "epoch": 1.9525628105164519, + "grad_norm": 1.4971257448196411, + "learning_rate": 1.7457286491392468e-05, + "loss": 0.4774, + "step": 220870 + }, + { + "epoch": 1.9526512137767642, + "grad_norm": 1.4984142780303955, + "learning_rate": 1.74558131037206e-05, + "loss": 0.412, + "step": 220880 + }, + { + "epoch": 1.9527396170370763, + "grad_norm": 3.6662018299102783, + "learning_rate": 1.7454339716048728e-05, + "loss": 0.5171, + "step": 220890 + }, + { + "epoch": 1.9528280202973884, + "grad_norm": 4.355205535888672, + "learning_rate": 1.7452866328376856e-05, + "loss": 0.5553, + "step": 220900 + }, + { + "epoch": 1.9529164235577008, + "grad_norm": 13.22236156463623, + "learning_rate": 1.7451392940704988e-05, + "loss": 0.4429, + "step": 220910 + }, + { + "epoch": 1.9530048268180131, + "grad_norm": 2.350409746170044, + "learning_rate": 1.7449919553033116e-05, + "loss": 0.5378, + "step": 220920 + }, + { + "epoch": 1.9530932300783252, + "grad_norm": 1.5182199478149414, + "learning_rate": 1.7448446165361244e-05, + "loss": 0.574, + "step": 220930 + }, + { + "epoch": 1.9531816333386374, + "grad_norm": 2.9196348190307617, + "learning_rate": 1.7446972777689373e-05, + "loss": 0.6858, + "step": 220940 + }, + { + "epoch": 1.95327003659895, + "grad_norm": 4.065303325653076, + "learning_rate": 1.7445499390017505e-05, + "loss": 0.4565, + "step": 220950 + }, + { + "epoch": 1.953358439859262, + "grad_norm": 2.180522918701172, + "learning_rate": 1.7444026002345633e-05, + "loss": 0.5924, + "step": 220960 + }, + { + "epoch": 1.9534468431195742, + "grad_norm": 4.2608489990234375, + "learning_rate": 1.744255261467376e-05, + "loss": 0.6149, + "step": 220970 + }, + { + "epoch": 1.9535352463798865, + "grad_norm": 4.069046497344971, + "learning_rate": 1.7441079227001893e-05, + "loss": 0.5646, + "step": 220980 + }, + { + "epoch": 1.9536236496401989, + "grad_norm": 5.257110595703125, + "learning_rate": 1.743960583933002e-05, + "loss": 0.5123, + "step": 220990 + }, + { + "epoch": 1.953712052900511, + "grad_norm": 3.287450075149536, + "learning_rate": 1.743813245165815e-05, + "loss": 0.4468, + "step": 221000 + }, + { + "epoch": 1.953800456160823, + "grad_norm": 2.3431289196014404, + "learning_rate": 1.743665906398628e-05, + "loss": 0.4935, + "step": 221010 + }, + { + "epoch": 1.9538888594211354, + "grad_norm": 1.9732731580734253, + "learning_rate": 1.743518567631441e-05, + "loss": 0.655, + "step": 221020 + }, + { + "epoch": 1.9539772626814478, + "grad_norm": 2.1418793201446533, + "learning_rate": 1.7433712288642538e-05, + "loss": 0.638, + "step": 221030 + }, + { + "epoch": 1.95406566594176, + "grad_norm": 1.3343536853790283, + "learning_rate": 1.743223890097067e-05, + "loss": 0.4028, + "step": 221040 + }, + { + "epoch": 1.954154069202072, + "grad_norm": 4.15090274810791, + "learning_rate": 1.7430765513298798e-05, + "loss": 0.6298, + "step": 221050 + }, + { + "epoch": 1.9542424724623844, + "grad_norm": 3.681791305541992, + "learning_rate": 1.7429292125626927e-05, + "loss": 0.5714, + "step": 221060 + }, + { + "epoch": 1.9543308757226967, + "grad_norm": 2.90159010887146, + "learning_rate": 1.742781873795506e-05, + "loss": 0.6419, + "step": 221070 + }, + { + "epoch": 1.9544192789830088, + "grad_norm": 2.430762529373169, + "learning_rate": 1.7426345350283187e-05, + "loss": 0.6476, + "step": 221080 + }, + { + "epoch": 1.9545076822433212, + "grad_norm": 1.8313844203948975, + "learning_rate": 1.7424871962611315e-05, + "loss": 0.5767, + "step": 221090 + }, + { + "epoch": 1.9545960855036335, + "grad_norm": 4.6836090087890625, + "learning_rate": 1.7423398574939447e-05, + "loss": 0.6324, + "step": 221100 + }, + { + "epoch": 1.9546844887639456, + "grad_norm": 2.1411750316619873, + "learning_rate": 1.7421925187267575e-05, + "loss": 0.46, + "step": 221110 + }, + { + "epoch": 1.9547728920242577, + "grad_norm": 3.8862268924713135, + "learning_rate": 1.7420451799595704e-05, + "loss": 0.6116, + "step": 221120 + }, + { + "epoch": 1.95486129528457, + "grad_norm": 2.3128890991210938, + "learning_rate": 1.7418978411923835e-05, + "loss": 0.6273, + "step": 221130 + }, + { + "epoch": 1.9549496985448824, + "grad_norm": 2.4391748905181885, + "learning_rate": 1.7417505024251964e-05, + "loss": 0.5168, + "step": 221140 + }, + { + "epoch": 1.9550381018051946, + "grad_norm": 4.619393825531006, + "learning_rate": 1.7416031636580092e-05, + "loss": 0.598, + "step": 221150 + }, + { + "epoch": 1.9551265050655067, + "grad_norm": 2.1790719032287598, + "learning_rate": 1.741455824890822e-05, + "loss": 0.5018, + "step": 221160 + }, + { + "epoch": 1.955214908325819, + "grad_norm": 6.281453609466553, + "learning_rate": 1.7413084861236352e-05, + "loss": 0.7997, + "step": 221170 + }, + { + "epoch": 1.9553033115861314, + "grad_norm": 2.5911691188812256, + "learning_rate": 1.741161147356448e-05, + "loss": 0.5766, + "step": 221180 + }, + { + "epoch": 1.9553917148464435, + "grad_norm": 8.28908920288086, + "learning_rate": 1.741013808589261e-05, + "loss": 0.6161, + "step": 221190 + }, + { + "epoch": 1.9554801181067558, + "grad_norm": 2.1980628967285156, + "learning_rate": 1.740866469822074e-05, + "loss": 0.6146, + "step": 221200 + }, + { + "epoch": 1.9555685213670682, + "grad_norm": 2.1065611839294434, + "learning_rate": 1.740719131054887e-05, + "loss": 0.5509, + "step": 221210 + }, + { + "epoch": 1.9556569246273803, + "grad_norm": 6.382716655731201, + "learning_rate": 1.7405717922876997e-05, + "loss": 0.5751, + "step": 221220 + }, + { + "epoch": 1.9557453278876924, + "grad_norm": 7.217451095581055, + "learning_rate": 1.7404244535205126e-05, + "loss": 0.4805, + "step": 221230 + }, + { + "epoch": 1.9558337311480047, + "grad_norm": 11.226778984069824, + "learning_rate": 1.7402771147533257e-05, + "loss": 0.6936, + "step": 221240 + }, + { + "epoch": 1.955922134408317, + "grad_norm": 2.2104413509368896, + "learning_rate": 1.7401297759861386e-05, + "loss": 0.5766, + "step": 221250 + }, + { + "epoch": 1.9560105376686292, + "grad_norm": 1.5729948282241821, + "learning_rate": 1.7399824372189514e-05, + "loss": 0.574, + "step": 221260 + }, + { + "epoch": 1.9560989409289413, + "grad_norm": 0.9875640273094177, + "learning_rate": 1.7398350984517646e-05, + "loss": 0.6298, + "step": 221270 + }, + { + "epoch": 1.9561873441892537, + "grad_norm": 10.27600383758545, + "learning_rate": 1.7396877596845774e-05, + "loss": 0.5781, + "step": 221280 + }, + { + "epoch": 1.956275747449566, + "grad_norm": 2.2132785320281982, + "learning_rate": 1.7395404209173902e-05, + "loss": 0.6105, + "step": 221290 + }, + { + "epoch": 1.9563641507098781, + "grad_norm": 1.3178938627243042, + "learning_rate": 1.739393082150203e-05, + "loss": 0.4922, + "step": 221300 + }, + { + "epoch": 1.9564525539701905, + "grad_norm": 1.907183051109314, + "learning_rate": 1.7392457433830163e-05, + "loss": 0.5972, + "step": 221310 + }, + { + "epoch": 1.9565409572305028, + "grad_norm": 4.384500980377197, + "learning_rate": 1.739098404615829e-05, + "loss": 0.7256, + "step": 221320 + }, + { + "epoch": 1.956629360490815, + "grad_norm": 7.438844203948975, + "learning_rate": 1.738951065848642e-05, + "loss": 0.673, + "step": 221330 + }, + { + "epoch": 1.956717763751127, + "grad_norm": 2.2127161026000977, + "learning_rate": 1.7388037270814548e-05, + "loss": 0.595, + "step": 221340 + }, + { + "epoch": 1.9568061670114394, + "grad_norm": 6.308504581451416, + "learning_rate": 1.738656388314268e-05, + "loss": 0.5618, + "step": 221350 + }, + { + "epoch": 1.9568945702717517, + "grad_norm": 3.3190057277679443, + "learning_rate": 1.7385090495470808e-05, + "loss": 0.5663, + "step": 221360 + }, + { + "epoch": 1.9569829735320639, + "grad_norm": 2.126783609390259, + "learning_rate": 1.7383617107798936e-05, + "loss": 0.56, + "step": 221370 + }, + { + "epoch": 1.957071376792376, + "grad_norm": 6.160731315612793, + "learning_rate": 1.7382143720127068e-05, + "loss": 0.6325, + "step": 221380 + }, + { + "epoch": 1.9571597800526883, + "grad_norm": 2.9567158222198486, + "learning_rate": 1.7380670332455196e-05, + "loss": 0.6028, + "step": 221390 + }, + { + "epoch": 1.9572481833130007, + "grad_norm": 1.7168047428131104, + "learning_rate": 1.7379196944783325e-05, + "loss": 0.6372, + "step": 221400 + }, + { + "epoch": 1.9573365865733128, + "grad_norm": 1.3090665340423584, + "learning_rate": 1.7377723557111453e-05, + "loss": 0.5813, + "step": 221410 + }, + { + "epoch": 1.9574249898336251, + "grad_norm": 3.2118301391601562, + "learning_rate": 1.7376250169439585e-05, + "loss": 0.6302, + "step": 221420 + }, + { + "epoch": 1.9575133930939375, + "grad_norm": 3.053651809692383, + "learning_rate": 1.7374776781767713e-05, + "loss": 0.4884, + "step": 221430 + }, + { + "epoch": 1.9576017963542496, + "grad_norm": 2.939345359802246, + "learning_rate": 1.737330339409584e-05, + "loss": 0.5157, + "step": 221440 + }, + { + "epoch": 1.9576901996145617, + "grad_norm": 5.004700660705566, + "learning_rate": 1.737183000642397e-05, + "loss": 0.6239, + "step": 221450 + }, + { + "epoch": 1.957778602874874, + "grad_norm": 1.8308154344558716, + "learning_rate": 1.73703566187521e-05, + "loss": 0.4895, + "step": 221460 + }, + { + "epoch": 1.9578670061351864, + "grad_norm": 1.0355510711669922, + "learning_rate": 1.736888323108023e-05, + "loss": 0.5118, + "step": 221470 + }, + { + "epoch": 1.9579554093954985, + "grad_norm": 2.7190637588500977, + "learning_rate": 1.7367409843408358e-05, + "loss": 0.6767, + "step": 221480 + }, + { + "epoch": 1.9580438126558106, + "grad_norm": 2.793962001800537, + "learning_rate": 1.736593645573649e-05, + "loss": 0.5561, + "step": 221490 + }, + { + "epoch": 1.958132215916123, + "grad_norm": 2.219176769256592, + "learning_rate": 1.7364463068064618e-05, + "loss": 0.6688, + "step": 221500 + }, + { + "epoch": 1.9582206191764353, + "grad_norm": 2.296807050704956, + "learning_rate": 1.7362989680392747e-05, + "loss": 0.6831, + "step": 221510 + }, + { + "epoch": 1.9583090224367474, + "grad_norm": 2.7684853076934814, + "learning_rate": 1.7361516292720875e-05, + "loss": 0.5368, + "step": 221520 + }, + { + "epoch": 1.9583974256970595, + "grad_norm": 3.423884630203247, + "learning_rate": 1.7360042905049007e-05, + "loss": 0.516, + "step": 221530 + }, + { + "epoch": 1.958485828957372, + "grad_norm": 2.5242016315460205, + "learning_rate": 1.7358569517377135e-05, + "loss": 0.6295, + "step": 221540 + }, + { + "epoch": 1.9585742322176842, + "grad_norm": 4.421204566955566, + "learning_rate": 1.7357096129705263e-05, + "loss": 0.6851, + "step": 221550 + }, + { + "epoch": 1.9586626354779963, + "grad_norm": 4.011968612670898, + "learning_rate": 1.7355622742033395e-05, + "loss": 0.5472, + "step": 221560 + }, + { + "epoch": 1.9587510387383087, + "grad_norm": 1.3766950368881226, + "learning_rate": 1.7354149354361523e-05, + "loss": 0.4817, + "step": 221570 + }, + { + "epoch": 1.958839441998621, + "grad_norm": 2.665876865386963, + "learning_rate": 1.7352675966689652e-05, + "loss": 0.626, + "step": 221580 + }, + { + "epoch": 1.9589278452589332, + "grad_norm": 2.626681327819824, + "learning_rate": 1.735120257901778e-05, + "loss": 0.6421, + "step": 221590 + }, + { + "epoch": 1.9590162485192453, + "grad_norm": 3.987231492996216, + "learning_rate": 1.7349729191345912e-05, + "loss": 0.5362, + "step": 221600 + }, + { + "epoch": 1.9591046517795576, + "grad_norm": 4.009259223937988, + "learning_rate": 1.734825580367404e-05, + "loss": 0.5257, + "step": 221610 + }, + { + "epoch": 1.95919305503987, + "grad_norm": 1.906485676765442, + "learning_rate": 1.734678241600217e-05, + "loss": 0.4726, + "step": 221620 + }, + { + "epoch": 1.959281458300182, + "grad_norm": 1.893676996231079, + "learning_rate": 1.7345309028330297e-05, + "loss": 0.4433, + "step": 221630 + }, + { + "epoch": 1.9593698615604942, + "grad_norm": 1.9703915119171143, + "learning_rate": 1.734383564065843e-05, + "loss": 0.598, + "step": 221640 + }, + { + "epoch": 1.9594582648208065, + "grad_norm": 3.427255153656006, + "learning_rate": 1.7342362252986557e-05, + "loss": 0.7, + "step": 221650 + }, + { + "epoch": 1.9595466680811189, + "grad_norm": 4.3255486488342285, + "learning_rate": 1.7340888865314685e-05, + "loss": 0.5803, + "step": 221660 + }, + { + "epoch": 1.959635071341431, + "grad_norm": 2.629002094268799, + "learning_rate": 1.7339415477642817e-05, + "loss": 0.5961, + "step": 221670 + }, + { + "epoch": 1.9597234746017433, + "grad_norm": 1.9308866262435913, + "learning_rate": 1.7337942089970946e-05, + "loss": 0.6365, + "step": 221680 + }, + { + "epoch": 1.9598118778620557, + "grad_norm": 2.3796756267547607, + "learning_rate": 1.7336468702299074e-05, + "loss": 0.667, + "step": 221690 + }, + { + "epoch": 1.9599002811223678, + "grad_norm": 4.674548149108887, + "learning_rate": 1.7334995314627202e-05, + "loss": 0.617, + "step": 221700 + }, + { + "epoch": 1.95998868438268, + "grad_norm": 2.9246327877044678, + "learning_rate": 1.7333521926955334e-05, + "loss": 0.619, + "step": 221710 + }, + { + "epoch": 1.9600770876429923, + "grad_norm": 1.9208478927612305, + "learning_rate": 1.7332048539283462e-05, + "loss": 0.6139, + "step": 221720 + }, + { + "epoch": 1.9601654909033046, + "grad_norm": 3.7790932655334473, + "learning_rate": 1.733057515161159e-05, + "loss": 0.6783, + "step": 221730 + }, + { + "epoch": 1.9602538941636167, + "grad_norm": 1.993432879447937, + "learning_rate": 1.7329101763939722e-05, + "loss": 0.5181, + "step": 221740 + }, + { + "epoch": 1.9603422974239288, + "grad_norm": 2.6456339359283447, + "learning_rate": 1.732762837626785e-05, + "loss": 0.5437, + "step": 221750 + }, + { + "epoch": 1.9604307006842412, + "grad_norm": 4.101899147033691, + "learning_rate": 1.732615498859598e-05, + "loss": 0.6074, + "step": 221760 + }, + { + "epoch": 1.9605191039445535, + "grad_norm": 1.424377202987671, + "learning_rate": 1.7324681600924108e-05, + "loss": 0.4778, + "step": 221770 + }, + { + "epoch": 1.9606075072048657, + "grad_norm": 4.055168151855469, + "learning_rate": 1.732320821325224e-05, + "loss": 0.4997, + "step": 221780 + }, + { + "epoch": 1.960695910465178, + "grad_norm": 3.0688679218292236, + "learning_rate": 1.7321734825580368e-05, + "loss": 0.6575, + "step": 221790 + }, + { + "epoch": 1.9607843137254903, + "grad_norm": 2.3685131072998047, + "learning_rate": 1.7320261437908496e-05, + "loss": 0.6747, + "step": 221800 + }, + { + "epoch": 1.9608727169858025, + "grad_norm": 5.343352794647217, + "learning_rate": 1.7318788050236624e-05, + "loss": 0.6151, + "step": 221810 + }, + { + "epoch": 1.9609611202461146, + "grad_norm": 4.508944988250732, + "learning_rate": 1.7317314662564756e-05, + "loss": 0.5315, + "step": 221820 + }, + { + "epoch": 1.961049523506427, + "grad_norm": 3.288653612136841, + "learning_rate": 1.7315841274892884e-05, + "loss": 0.7137, + "step": 221830 + }, + { + "epoch": 1.9611379267667393, + "grad_norm": 3.245378255844116, + "learning_rate": 1.7314367887221013e-05, + "loss": 0.4592, + "step": 221840 + }, + { + "epoch": 1.9612263300270514, + "grad_norm": 2.223806858062744, + "learning_rate": 1.7312894499549145e-05, + "loss": 0.5834, + "step": 221850 + }, + { + "epoch": 1.9613147332873635, + "grad_norm": 2.324831962585449, + "learning_rate": 1.7311421111877273e-05, + "loss": 0.4512, + "step": 221860 + }, + { + "epoch": 1.9614031365476758, + "grad_norm": 6.371609687805176, + "learning_rate": 1.73099477242054e-05, + "loss": 0.5267, + "step": 221870 + }, + { + "epoch": 1.9614915398079882, + "grad_norm": 1.8866710662841797, + "learning_rate": 1.730847433653353e-05, + "loss": 0.5281, + "step": 221880 + }, + { + "epoch": 1.9615799430683003, + "grad_norm": 1.9318867921829224, + "learning_rate": 1.730700094886166e-05, + "loss": 0.5328, + "step": 221890 + }, + { + "epoch": 1.9616683463286126, + "grad_norm": 2.1442458629608154, + "learning_rate": 1.730552756118979e-05, + "loss": 0.6054, + "step": 221900 + }, + { + "epoch": 1.961756749588925, + "grad_norm": 3.8569774627685547, + "learning_rate": 1.7304054173517918e-05, + "loss": 0.7032, + "step": 221910 + }, + { + "epoch": 1.961845152849237, + "grad_norm": 3.075002431869507, + "learning_rate": 1.730258078584605e-05, + "loss": 0.595, + "step": 221920 + }, + { + "epoch": 1.9619335561095492, + "grad_norm": 8.860819816589355, + "learning_rate": 1.7301107398174178e-05, + "loss": 0.5075, + "step": 221930 + }, + { + "epoch": 1.9620219593698616, + "grad_norm": 3.092060089111328, + "learning_rate": 1.7299634010502306e-05, + "loss": 0.4836, + "step": 221940 + }, + { + "epoch": 1.962110362630174, + "grad_norm": 1.856972575187683, + "learning_rate": 1.7298160622830438e-05, + "loss": 0.479, + "step": 221950 + }, + { + "epoch": 1.962198765890486, + "grad_norm": 1.5563569068908691, + "learning_rate": 1.7296687235158567e-05, + "loss": 0.6582, + "step": 221960 + }, + { + "epoch": 1.9622871691507981, + "grad_norm": 0.956799328327179, + "learning_rate": 1.7295213847486695e-05, + "loss": 0.529, + "step": 221970 + }, + { + "epoch": 1.9623755724111105, + "grad_norm": 3.0957534313201904, + "learning_rate": 1.7293740459814827e-05, + "loss": 0.6286, + "step": 221980 + }, + { + "epoch": 1.9624639756714228, + "grad_norm": 1.7938803434371948, + "learning_rate": 1.7292267072142955e-05, + "loss": 0.6059, + "step": 221990 + }, + { + "epoch": 1.962552378931735, + "grad_norm": 1.8811876773834229, + "learning_rate": 1.7290793684471083e-05, + "loss": 0.5837, + "step": 222000 + }, + { + "epoch": 1.9626407821920473, + "grad_norm": 4.525232791900635, + "learning_rate": 1.7289320296799215e-05, + "loss": 0.5446, + "step": 222010 + }, + { + "epoch": 1.9627291854523596, + "grad_norm": 1.8074058294296265, + "learning_rate": 1.7287846909127343e-05, + "loss": 0.4753, + "step": 222020 + }, + { + "epoch": 1.9628175887126718, + "grad_norm": 5.129820346832275, + "learning_rate": 1.7286373521455472e-05, + "loss": 0.6748, + "step": 222030 + }, + { + "epoch": 1.9629059919729839, + "grad_norm": 8.805209159851074, + "learning_rate": 1.7284900133783604e-05, + "loss": 0.6701, + "step": 222040 + }, + { + "epoch": 1.9629943952332962, + "grad_norm": 2.0817229747772217, + "learning_rate": 1.7283426746111732e-05, + "loss": 0.5398, + "step": 222050 + }, + { + "epoch": 1.9630827984936086, + "grad_norm": 2.0642473697662354, + "learning_rate": 1.728195335843986e-05, + "loss": 0.5827, + "step": 222060 + }, + { + "epoch": 1.9631712017539207, + "grad_norm": 2.650526762008667, + "learning_rate": 1.7280479970767992e-05, + "loss": 0.6029, + "step": 222070 + }, + { + "epoch": 1.9632596050142328, + "grad_norm": 0.8851326704025269, + "learning_rate": 1.727900658309612e-05, + "loss": 0.5382, + "step": 222080 + }, + { + "epoch": 1.9633480082745451, + "grad_norm": 6.130293846130371, + "learning_rate": 1.727753319542425e-05, + "loss": 0.4914, + "step": 222090 + }, + { + "epoch": 1.9634364115348575, + "grad_norm": 1.810418963432312, + "learning_rate": 1.7276059807752377e-05, + "loss": 0.5117, + "step": 222100 + }, + { + "epoch": 1.9635248147951696, + "grad_norm": 2.6192986965179443, + "learning_rate": 1.727458642008051e-05, + "loss": 0.5139, + "step": 222110 + }, + { + "epoch": 1.9636132180554817, + "grad_norm": 1.8537731170654297, + "learning_rate": 1.7273113032408637e-05, + "loss": 0.573, + "step": 222120 + }, + { + "epoch": 1.9637016213157943, + "grad_norm": 2.8963286876678467, + "learning_rate": 1.7271639644736766e-05, + "loss": 0.6338, + "step": 222130 + }, + { + "epoch": 1.9637900245761064, + "grad_norm": 3.3957526683807373, + "learning_rate": 1.7270166257064897e-05, + "loss": 0.6349, + "step": 222140 + }, + { + "epoch": 1.9638784278364185, + "grad_norm": 0.9304403066635132, + "learning_rate": 1.7268692869393026e-05, + "loss": 0.5048, + "step": 222150 + }, + { + "epoch": 1.9639668310967309, + "grad_norm": 1.9414174556732178, + "learning_rate": 1.7267219481721154e-05, + "loss": 0.4382, + "step": 222160 + }, + { + "epoch": 1.9640552343570432, + "grad_norm": 5.736922740936279, + "learning_rate": 1.7265746094049282e-05, + "loss": 0.4992, + "step": 222170 + }, + { + "epoch": 1.9641436376173553, + "grad_norm": 1.3160322904586792, + "learning_rate": 1.7264272706377414e-05, + "loss": 0.6757, + "step": 222180 + }, + { + "epoch": 1.9642320408776675, + "grad_norm": 4.047842025756836, + "learning_rate": 1.7262799318705542e-05, + "loss": 0.5021, + "step": 222190 + }, + { + "epoch": 1.9643204441379798, + "grad_norm": 3.707914113998413, + "learning_rate": 1.726132593103367e-05, + "loss": 0.5876, + "step": 222200 + }, + { + "epoch": 1.9644088473982921, + "grad_norm": 1.5215938091278076, + "learning_rate": 1.7259852543361803e-05, + "loss": 0.5758, + "step": 222210 + }, + { + "epoch": 1.9644972506586043, + "grad_norm": 5.436439037322998, + "learning_rate": 1.725837915568993e-05, + "loss": 0.5506, + "step": 222220 + }, + { + "epoch": 1.9645856539189164, + "grad_norm": 1.7833653688430786, + "learning_rate": 1.725690576801806e-05, + "loss": 0.6581, + "step": 222230 + }, + { + "epoch": 1.9646740571792287, + "grad_norm": 22.85245132446289, + "learning_rate": 1.7255432380346188e-05, + "loss": 0.6773, + "step": 222240 + }, + { + "epoch": 1.964762460439541, + "grad_norm": 2.7302658557891846, + "learning_rate": 1.725395899267432e-05, + "loss": 0.5082, + "step": 222250 + }, + { + "epoch": 1.9648508636998532, + "grad_norm": 10.478650093078613, + "learning_rate": 1.7252485605002448e-05, + "loss": 0.5939, + "step": 222260 + }, + { + "epoch": 1.9649392669601655, + "grad_norm": 1.1871991157531738, + "learning_rate": 1.7251012217330576e-05, + "loss": 0.6735, + "step": 222270 + }, + { + "epoch": 1.9650276702204779, + "grad_norm": 1.9310252666473389, + "learning_rate": 1.7249538829658704e-05, + "loss": 0.6796, + "step": 222280 + }, + { + "epoch": 1.96511607348079, + "grad_norm": 7.742509841918945, + "learning_rate": 1.7248065441986836e-05, + "loss": 0.6949, + "step": 222290 + }, + { + "epoch": 1.965204476741102, + "grad_norm": 4.2397780418396, + "learning_rate": 1.7246592054314964e-05, + "loss": 0.4619, + "step": 222300 + }, + { + "epoch": 1.9652928800014144, + "grad_norm": 2.080115795135498, + "learning_rate": 1.7245118666643093e-05, + "loss": 0.4737, + "step": 222310 + }, + { + "epoch": 1.9653812832617268, + "grad_norm": 5.406563758850098, + "learning_rate": 1.7243645278971225e-05, + "loss": 0.5169, + "step": 222320 + }, + { + "epoch": 1.965469686522039, + "grad_norm": 2.611933469772339, + "learning_rate": 1.7242171891299353e-05, + "loss": 0.5915, + "step": 222330 + }, + { + "epoch": 1.965558089782351, + "grad_norm": 2.100005626678467, + "learning_rate": 1.724069850362748e-05, + "loss": 0.5918, + "step": 222340 + }, + { + "epoch": 1.9656464930426634, + "grad_norm": 8.699605941772461, + "learning_rate": 1.723922511595561e-05, + "loss": 0.6256, + "step": 222350 + }, + { + "epoch": 1.9657348963029757, + "grad_norm": 4.119903087615967, + "learning_rate": 1.723775172828374e-05, + "loss": 0.5898, + "step": 222360 + }, + { + "epoch": 1.9658232995632878, + "grad_norm": 2.810309410095215, + "learning_rate": 1.723627834061187e-05, + "loss": 0.5349, + "step": 222370 + }, + { + "epoch": 1.9659117028236002, + "grad_norm": 8.267805099487305, + "learning_rate": 1.7234804952939998e-05, + "loss": 0.6738, + "step": 222380 + }, + { + "epoch": 1.9660001060839125, + "grad_norm": 2.084251642227173, + "learning_rate": 1.7233331565268126e-05, + "loss": 0.7415, + "step": 222390 + }, + { + "epoch": 1.9660885093442246, + "grad_norm": 1.9852428436279297, + "learning_rate": 1.7231858177596258e-05, + "loss": 0.5328, + "step": 222400 + }, + { + "epoch": 1.9661769126045368, + "grad_norm": 1.7430096864700317, + "learning_rate": 1.7230384789924387e-05, + "loss": 0.5559, + "step": 222410 + }, + { + "epoch": 1.966265315864849, + "grad_norm": 2.324712038040161, + "learning_rate": 1.7228911402252515e-05, + "loss": 0.6603, + "step": 222420 + }, + { + "epoch": 1.9663537191251614, + "grad_norm": 3.147503614425659, + "learning_rate": 1.7227438014580647e-05, + "loss": 0.5688, + "step": 222430 + }, + { + "epoch": 1.9664421223854736, + "grad_norm": 4.333643913269043, + "learning_rate": 1.7225964626908775e-05, + "loss": 0.5561, + "step": 222440 + }, + { + "epoch": 1.9665305256457857, + "grad_norm": 4.8724894523620605, + "learning_rate": 1.7224491239236903e-05, + "loss": 0.4943, + "step": 222450 + }, + { + "epoch": 1.966618928906098, + "grad_norm": 7.888250350952148, + "learning_rate": 1.7223017851565032e-05, + "loss": 0.5848, + "step": 222460 + }, + { + "epoch": 1.9667073321664104, + "grad_norm": 7.013293743133545, + "learning_rate": 1.7221544463893163e-05, + "loss": 0.4606, + "step": 222470 + }, + { + "epoch": 1.9667957354267225, + "grad_norm": 1.9578282833099365, + "learning_rate": 1.7220071076221292e-05, + "loss": 0.5711, + "step": 222480 + }, + { + "epoch": 1.9668841386870348, + "grad_norm": 3.4705073833465576, + "learning_rate": 1.721859768854942e-05, + "loss": 0.6787, + "step": 222490 + }, + { + "epoch": 1.9669725419473472, + "grad_norm": 1.9974721670150757, + "learning_rate": 1.7217124300877552e-05, + "loss": 0.6046, + "step": 222500 + }, + { + "epoch": 1.9670609452076593, + "grad_norm": 1.1950721740722656, + "learning_rate": 1.721565091320568e-05, + "loss": 0.5288, + "step": 222510 + }, + { + "epoch": 1.9671493484679714, + "grad_norm": 3.3858697414398193, + "learning_rate": 1.721417752553381e-05, + "loss": 0.5076, + "step": 222520 + }, + { + "epoch": 1.9672377517282837, + "grad_norm": 2.76285457611084, + "learning_rate": 1.7212704137861937e-05, + "loss": 0.5661, + "step": 222530 + }, + { + "epoch": 1.967326154988596, + "grad_norm": 3.6475632190704346, + "learning_rate": 1.721123075019007e-05, + "loss": 0.6155, + "step": 222540 + }, + { + "epoch": 1.9674145582489082, + "grad_norm": 1.3591192960739136, + "learning_rate": 1.7209757362518197e-05, + "loss": 0.5272, + "step": 222550 + }, + { + "epoch": 1.9675029615092203, + "grad_norm": 2.6487836837768555, + "learning_rate": 1.7208283974846325e-05, + "loss": 0.5823, + "step": 222560 + }, + { + "epoch": 1.9675913647695327, + "grad_norm": 1.8409322500228882, + "learning_rate": 1.7206810587174454e-05, + "loss": 0.6024, + "step": 222570 + }, + { + "epoch": 1.967679768029845, + "grad_norm": 4.987606048583984, + "learning_rate": 1.7205337199502586e-05, + "loss": 0.5538, + "step": 222580 + }, + { + "epoch": 1.9677681712901571, + "grad_norm": 7.849449157714844, + "learning_rate": 1.7203863811830714e-05, + "loss": 0.5508, + "step": 222590 + }, + { + "epoch": 1.9678565745504695, + "grad_norm": 2.5497729778289795, + "learning_rate": 1.7202390424158842e-05, + "loss": 0.4697, + "step": 222600 + }, + { + "epoch": 1.9679449778107818, + "grad_norm": 5.652052879333496, + "learning_rate": 1.7200917036486974e-05, + "loss": 0.5289, + "step": 222610 + }, + { + "epoch": 1.968033381071094, + "grad_norm": 6.274253845214844, + "learning_rate": 1.7199443648815102e-05, + "loss": 0.5764, + "step": 222620 + }, + { + "epoch": 1.968121784331406, + "grad_norm": 3.169924020767212, + "learning_rate": 1.719797026114323e-05, + "loss": 0.6174, + "step": 222630 + }, + { + "epoch": 1.9682101875917184, + "grad_norm": 5.025245189666748, + "learning_rate": 1.719649687347136e-05, + "loss": 0.5409, + "step": 222640 + }, + { + "epoch": 1.9682985908520307, + "grad_norm": 1.7776600122451782, + "learning_rate": 1.719502348579949e-05, + "loss": 0.6629, + "step": 222650 + }, + { + "epoch": 1.9683869941123429, + "grad_norm": 2.793548822402954, + "learning_rate": 1.719355009812762e-05, + "loss": 0.6304, + "step": 222660 + }, + { + "epoch": 1.968475397372655, + "grad_norm": 3.562716007232666, + "learning_rate": 1.7192076710455747e-05, + "loss": 0.5184, + "step": 222670 + }, + { + "epoch": 1.9685638006329673, + "grad_norm": 2.9733078479766846, + "learning_rate": 1.719060332278388e-05, + "loss": 0.5315, + "step": 222680 + }, + { + "epoch": 1.9686522038932797, + "grad_norm": 6.737588882446289, + "learning_rate": 1.7189129935112008e-05, + "loss": 0.601, + "step": 222690 + }, + { + "epoch": 1.9687406071535918, + "grad_norm": 2.0825278759002686, + "learning_rate": 1.7187656547440136e-05, + "loss": 0.6412, + "step": 222700 + }, + { + "epoch": 1.968829010413904, + "grad_norm": 2.5049214363098145, + "learning_rate": 1.7186183159768264e-05, + "loss": 0.6238, + "step": 222710 + }, + { + "epoch": 1.9689174136742165, + "grad_norm": 5.799861907958984, + "learning_rate": 1.7184709772096396e-05, + "loss": 0.5981, + "step": 222720 + }, + { + "epoch": 1.9690058169345286, + "grad_norm": 1.855312466621399, + "learning_rate": 1.7183236384424524e-05, + "loss": 0.4881, + "step": 222730 + }, + { + "epoch": 1.9690942201948407, + "grad_norm": 2.7078020572662354, + "learning_rate": 1.7181762996752653e-05, + "loss": 0.6108, + "step": 222740 + }, + { + "epoch": 1.969182623455153, + "grad_norm": 2.7934305667877197, + "learning_rate": 1.718028960908078e-05, + "loss": 0.543, + "step": 222750 + }, + { + "epoch": 1.9692710267154654, + "grad_norm": 2.9295403957366943, + "learning_rate": 1.7178816221408913e-05, + "loss": 0.6324, + "step": 222760 + }, + { + "epoch": 1.9693594299757775, + "grad_norm": 1.3017882108688354, + "learning_rate": 1.717734283373704e-05, + "loss": 0.4405, + "step": 222770 + }, + { + "epoch": 1.9694478332360896, + "grad_norm": 1.115747332572937, + "learning_rate": 1.717586944606517e-05, + "loss": 0.5383, + "step": 222780 + }, + { + "epoch": 1.969536236496402, + "grad_norm": 2.6722826957702637, + "learning_rate": 1.71743960583933e-05, + "loss": 0.5871, + "step": 222790 + }, + { + "epoch": 1.9696246397567143, + "grad_norm": 3.0671226978302, + "learning_rate": 1.717292267072143e-05, + "loss": 0.5741, + "step": 222800 + }, + { + "epoch": 1.9697130430170264, + "grad_norm": 1.6251859664916992, + "learning_rate": 1.7171449283049558e-05, + "loss": 0.6286, + "step": 222810 + }, + { + "epoch": 1.9698014462773386, + "grad_norm": 1.8720160722732544, + "learning_rate": 1.7169975895377686e-05, + "loss": 0.6504, + "step": 222820 + }, + { + "epoch": 1.969889849537651, + "grad_norm": 6.490540981292725, + "learning_rate": 1.7168502507705818e-05, + "loss": 0.6905, + "step": 222830 + }, + { + "epoch": 1.9699782527979632, + "grad_norm": 7.432729721069336, + "learning_rate": 1.7167029120033946e-05, + "loss": 0.6618, + "step": 222840 + }, + { + "epoch": 1.9700666560582754, + "grad_norm": 1.296164631843567, + "learning_rate": 1.7165555732362075e-05, + "loss": 0.5143, + "step": 222850 + }, + { + "epoch": 1.9701550593185877, + "grad_norm": 3.0250802040100098, + "learning_rate": 1.7164082344690207e-05, + "loss": 0.5507, + "step": 222860 + }, + { + "epoch": 1.9702434625789, + "grad_norm": 3.119205951690674, + "learning_rate": 1.7162608957018335e-05, + "loss": 0.6189, + "step": 222870 + }, + { + "epoch": 1.9703318658392122, + "grad_norm": 4.983765602111816, + "learning_rate": 1.7161135569346463e-05, + "loss": 0.5083, + "step": 222880 + }, + { + "epoch": 1.9704202690995243, + "grad_norm": 14.583525657653809, + "learning_rate": 1.7159662181674595e-05, + "loss": 0.4849, + "step": 222890 + }, + { + "epoch": 1.9705086723598366, + "grad_norm": 2.0588018894195557, + "learning_rate": 1.7158188794002723e-05, + "loss": 0.5834, + "step": 222900 + }, + { + "epoch": 1.970597075620149, + "grad_norm": 1.2134294509887695, + "learning_rate": 1.715671540633085e-05, + "loss": 0.5814, + "step": 222910 + }, + { + "epoch": 1.970685478880461, + "grad_norm": 3.06860089302063, + "learning_rate": 1.7155242018658983e-05, + "loss": 0.5837, + "step": 222920 + }, + { + "epoch": 1.9707738821407732, + "grad_norm": 2.7625577449798584, + "learning_rate": 1.7153768630987112e-05, + "loss": 0.6332, + "step": 222930 + }, + { + "epoch": 1.9708622854010855, + "grad_norm": 1.0985058546066284, + "learning_rate": 1.715229524331524e-05, + "loss": 0.393, + "step": 222940 + }, + { + "epoch": 1.9709506886613979, + "grad_norm": 6.152040958404541, + "learning_rate": 1.7150821855643372e-05, + "loss": 0.621, + "step": 222950 + }, + { + "epoch": 1.97103909192171, + "grad_norm": 10.569106101989746, + "learning_rate": 1.71493484679715e-05, + "loss": 0.5013, + "step": 222960 + }, + { + "epoch": 1.9711274951820223, + "grad_norm": 4.52792501449585, + "learning_rate": 1.714787508029963e-05, + "loss": 0.5851, + "step": 222970 + }, + { + "epoch": 1.9712158984423347, + "grad_norm": 5.344310283660889, + "learning_rate": 1.714640169262776e-05, + "loss": 0.6643, + "step": 222980 + }, + { + "epoch": 1.9713043017026468, + "grad_norm": 5.182750225067139, + "learning_rate": 1.714492830495589e-05, + "loss": 0.6485, + "step": 222990 + }, + { + "epoch": 1.971392704962959, + "grad_norm": 6.226066589355469, + "learning_rate": 1.7143454917284017e-05, + "loss": 0.5357, + "step": 223000 + }, + { + "epoch": 1.9714811082232713, + "grad_norm": 2.8686330318450928, + "learning_rate": 1.714198152961215e-05, + "loss": 0.5842, + "step": 223010 + }, + { + "epoch": 1.9715695114835836, + "grad_norm": 1.1921875476837158, + "learning_rate": 1.7140508141940277e-05, + "loss": 0.5514, + "step": 223020 + }, + { + "epoch": 1.9716579147438957, + "grad_norm": 1.88923978805542, + "learning_rate": 1.7139034754268405e-05, + "loss": 0.4687, + "step": 223030 + }, + { + "epoch": 1.9717463180042079, + "grad_norm": 1.6415197849273682, + "learning_rate": 1.7137561366596534e-05, + "loss": 0.6425, + "step": 223040 + }, + { + "epoch": 1.9718347212645202, + "grad_norm": 1.456709384918213, + "learning_rate": 1.7136087978924666e-05, + "loss": 0.5695, + "step": 223050 + }, + { + "epoch": 1.9719231245248325, + "grad_norm": 6.440140247344971, + "learning_rate": 1.7134614591252794e-05, + "loss": 0.6765, + "step": 223060 + }, + { + "epoch": 1.9720115277851447, + "grad_norm": 1.7574011087417603, + "learning_rate": 1.7133141203580922e-05, + "loss": 0.5549, + "step": 223070 + }, + { + "epoch": 1.972099931045457, + "grad_norm": 4.2524333000183105, + "learning_rate": 1.7131667815909054e-05, + "loss": 0.5492, + "step": 223080 + }, + { + "epoch": 1.9721883343057693, + "grad_norm": 1.1085690259933472, + "learning_rate": 1.7130194428237182e-05, + "loss": 0.5494, + "step": 223090 + }, + { + "epoch": 1.9722767375660815, + "grad_norm": 3.139369010925293, + "learning_rate": 1.712872104056531e-05, + "loss": 0.6718, + "step": 223100 + }, + { + "epoch": 1.9723651408263936, + "grad_norm": 3.9366915225982666, + "learning_rate": 1.712724765289344e-05, + "loss": 0.5698, + "step": 223110 + }, + { + "epoch": 1.972453544086706, + "grad_norm": 6.333446502685547, + "learning_rate": 1.712577426522157e-05, + "loss": 0.6644, + "step": 223120 + }, + { + "epoch": 1.9725419473470183, + "grad_norm": 1.8347561359405518, + "learning_rate": 1.71243008775497e-05, + "loss": 0.4329, + "step": 223130 + }, + { + "epoch": 1.9726303506073304, + "grad_norm": 2.3457577228546143, + "learning_rate": 1.7122827489877828e-05, + "loss": 0.5492, + "step": 223140 + }, + { + "epoch": 1.9727187538676425, + "grad_norm": 1.201335072517395, + "learning_rate": 1.712135410220596e-05, + "loss": 0.5605, + "step": 223150 + }, + { + "epoch": 1.9728071571279548, + "grad_norm": 4.401676654815674, + "learning_rate": 1.7119880714534088e-05, + "loss": 0.5035, + "step": 223160 + }, + { + "epoch": 1.9728955603882672, + "grad_norm": 2.910940408706665, + "learning_rate": 1.7118407326862216e-05, + "loss": 0.6122, + "step": 223170 + }, + { + "epoch": 1.9729839636485793, + "grad_norm": 8.407724380493164, + "learning_rate": 1.7116933939190344e-05, + "loss": 0.5584, + "step": 223180 + }, + { + "epoch": 1.9730723669088917, + "grad_norm": 11.145696640014648, + "learning_rate": 1.7115460551518476e-05, + "loss": 0.5517, + "step": 223190 + }, + { + "epoch": 1.973160770169204, + "grad_norm": 11.784648895263672, + "learning_rate": 1.7113987163846604e-05, + "loss": 0.6739, + "step": 223200 + }, + { + "epoch": 1.9732491734295161, + "grad_norm": 5.412535190582275, + "learning_rate": 1.7112513776174733e-05, + "loss": 0.5425, + "step": 223210 + }, + { + "epoch": 1.9733375766898282, + "grad_norm": 3.030336618423462, + "learning_rate": 1.711104038850286e-05, + "loss": 0.5982, + "step": 223220 + }, + { + "epoch": 1.9734259799501406, + "grad_norm": 1.529349446296692, + "learning_rate": 1.7109567000830993e-05, + "loss": 0.4851, + "step": 223230 + }, + { + "epoch": 1.973514383210453, + "grad_norm": 1.9232240915298462, + "learning_rate": 1.710809361315912e-05, + "loss": 0.6172, + "step": 223240 + }, + { + "epoch": 1.973602786470765, + "grad_norm": 6.125307559967041, + "learning_rate": 1.710662022548725e-05, + "loss": 0.6683, + "step": 223250 + }, + { + "epoch": 1.9736911897310772, + "grad_norm": 1.5917388200759888, + "learning_rate": 1.710514683781538e-05, + "loss": 0.6482, + "step": 223260 + }, + { + "epoch": 1.9737795929913895, + "grad_norm": 2.928166389465332, + "learning_rate": 1.710367345014351e-05, + "loss": 0.5313, + "step": 223270 + }, + { + "epoch": 1.9738679962517018, + "grad_norm": 8.321270942687988, + "learning_rate": 1.7102200062471638e-05, + "loss": 0.5604, + "step": 223280 + }, + { + "epoch": 1.973956399512014, + "grad_norm": 2.138234853744507, + "learning_rate": 1.7100726674799766e-05, + "loss": 0.6185, + "step": 223290 + }, + { + "epoch": 1.974044802772326, + "grad_norm": 3.251054525375366, + "learning_rate": 1.7099253287127898e-05, + "loss": 0.5883, + "step": 223300 + }, + { + "epoch": 1.9741332060326386, + "grad_norm": 5.7253594398498535, + "learning_rate": 1.7097779899456026e-05, + "loss": 0.6261, + "step": 223310 + }, + { + "epoch": 1.9742216092929508, + "grad_norm": 7.327157974243164, + "learning_rate": 1.7096306511784155e-05, + "loss": 0.6169, + "step": 223320 + }, + { + "epoch": 1.9743100125532629, + "grad_norm": 2.8174843788146973, + "learning_rate": 1.7094833124112287e-05, + "loss": 0.7236, + "step": 223330 + }, + { + "epoch": 1.9743984158135752, + "grad_norm": 5.326597690582275, + "learning_rate": 1.7093359736440415e-05, + "loss": 0.6744, + "step": 223340 + }, + { + "epoch": 1.9744868190738876, + "grad_norm": 2.247544050216675, + "learning_rate": 1.7091886348768543e-05, + "loss": 0.5186, + "step": 223350 + }, + { + "epoch": 1.9745752223341997, + "grad_norm": 1.2743721008300781, + "learning_rate": 1.709041296109667e-05, + "loss": 0.6286, + "step": 223360 + }, + { + "epoch": 1.9746636255945118, + "grad_norm": 5.933443546295166, + "learning_rate": 1.7088939573424803e-05, + "loss": 0.6492, + "step": 223370 + }, + { + "epoch": 1.9747520288548241, + "grad_norm": 2.982179880142212, + "learning_rate": 1.7087466185752932e-05, + "loss": 0.6541, + "step": 223380 + }, + { + "epoch": 1.9748404321151365, + "grad_norm": 10.137446403503418, + "learning_rate": 1.708599279808106e-05, + "loss": 0.6684, + "step": 223390 + }, + { + "epoch": 1.9749288353754486, + "grad_norm": 1.6906108856201172, + "learning_rate": 1.708451941040919e-05, + "loss": 0.5938, + "step": 223400 + }, + { + "epoch": 1.9750172386357607, + "grad_norm": 5.015181064605713, + "learning_rate": 1.708304602273732e-05, + "loss": 0.5494, + "step": 223410 + }, + { + "epoch": 1.975105641896073, + "grad_norm": 3.6553337574005127, + "learning_rate": 1.708157263506545e-05, + "loss": 0.4865, + "step": 223420 + }, + { + "epoch": 1.9751940451563854, + "grad_norm": 2.070610523223877, + "learning_rate": 1.7080099247393577e-05, + "loss": 0.5332, + "step": 223430 + }, + { + "epoch": 1.9752824484166975, + "grad_norm": 3.7528462409973145, + "learning_rate": 1.707862585972171e-05, + "loss": 0.5404, + "step": 223440 + }, + { + "epoch": 1.9753708516770099, + "grad_norm": 1.6363284587860107, + "learning_rate": 1.7077152472049837e-05, + "loss": 0.5117, + "step": 223450 + }, + { + "epoch": 1.9754592549373222, + "grad_norm": 7.974064350128174, + "learning_rate": 1.7075679084377965e-05, + "loss": 0.6744, + "step": 223460 + }, + { + "epoch": 1.9755476581976343, + "grad_norm": 1.0796862840652466, + "learning_rate": 1.7074205696706094e-05, + "loss": 0.6018, + "step": 223470 + }, + { + "epoch": 1.9756360614579465, + "grad_norm": 1.667531967163086, + "learning_rate": 1.7072732309034225e-05, + "loss": 0.6464, + "step": 223480 + }, + { + "epoch": 1.9757244647182588, + "grad_norm": 2.247187376022339, + "learning_rate": 1.7071258921362354e-05, + "loss": 0.6127, + "step": 223490 + }, + { + "epoch": 1.9758128679785711, + "grad_norm": 1.4988588094711304, + "learning_rate": 1.7069785533690482e-05, + "loss": 0.5959, + "step": 223500 + }, + { + "epoch": 1.9759012712388833, + "grad_norm": 1.8883183002471924, + "learning_rate": 1.706831214601861e-05, + "loss": 0.5806, + "step": 223510 + }, + { + "epoch": 1.9759896744991954, + "grad_norm": 4.116348743438721, + "learning_rate": 1.7066838758346742e-05, + "loss": 0.5542, + "step": 223520 + }, + { + "epoch": 1.9760780777595077, + "grad_norm": 2.1979382038116455, + "learning_rate": 1.706536537067487e-05, + "loss": 0.427, + "step": 223530 + }, + { + "epoch": 1.97616648101982, + "grad_norm": 2.150432825088501, + "learning_rate": 1.7063891983003e-05, + "loss": 0.6589, + "step": 223540 + }, + { + "epoch": 1.9762548842801322, + "grad_norm": 1.7369264364242554, + "learning_rate": 1.706241859533113e-05, + "loss": 0.6432, + "step": 223550 + }, + { + "epoch": 1.9763432875404445, + "grad_norm": 5.763035297393799, + "learning_rate": 1.706094520765926e-05, + "loss": 0.5455, + "step": 223560 + }, + { + "epoch": 1.9764316908007569, + "grad_norm": 8.456995010375977, + "learning_rate": 1.7059471819987387e-05, + "loss": 0.617, + "step": 223570 + }, + { + "epoch": 1.976520094061069, + "grad_norm": 6.879094123840332, + "learning_rate": 1.7057998432315516e-05, + "loss": 0.5605, + "step": 223580 + }, + { + "epoch": 1.976608497321381, + "grad_norm": 3.256601572036743, + "learning_rate": 1.7056525044643648e-05, + "loss": 0.6778, + "step": 223590 + }, + { + "epoch": 1.9766969005816935, + "grad_norm": 1.143511176109314, + "learning_rate": 1.7055051656971776e-05, + "loss": 0.6732, + "step": 223600 + }, + { + "epoch": 1.9767853038420058, + "grad_norm": 1.5223276615142822, + "learning_rate": 1.7053578269299904e-05, + "loss": 0.5141, + "step": 223610 + }, + { + "epoch": 1.976873707102318, + "grad_norm": 2.047243595123291, + "learning_rate": 1.7052104881628036e-05, + "loss": 0.695, + "step": 223620 + }, + { + "epoch": 1.97696211036263, + "grad_norm": 1.709922432899475, + "learning_rate": 1.7050631493956164e-05, + "loss": 0.6434, + "step": 223630 + }, + { + "epoch": 1.9770505136229424, + "grad_norm": 1.0415716171264648, + "learning_rate": 1.7049158106284293e-05, + "loss": 0.4897, + "step": 223640 + }, + { + "epoch": 1.9771389168832547, + "grad_norm": 0.822580873966217, + "learning_rate": 1.704768471861242e-05, + "loss": 0.5427, + "step": 223650 + }, + { + "epoch": 1.9772273201435668, + "grad_norm": 2.1559183597564697, + "learning_rate": 1.7046211330940553e-05, + "loss": 0.5389, + "step": 223660 + }, + { + "epoch": 1.9773157234038792, + "grad_norm": 1.7369835376739502, + "learning_rate": 1.704473794326868e-05, + "loss": 0.6673, + "step": 223670 + }, + { + "epoch": 1.9774041266641915, + "grad_norm": 2.9662587642669678, + "learning_rate": 1.704326455559681e-05, + "loss": 0.6322, + "step": 223680 + }, + { + "epoch": 1.9774925299245036, + "grad_norm": 1.961627721786499, + "learning_rate": 1.7041791167924938e-05, + "loss": 0.5055, + "step": 223690 + }, + { + "epoch": 1.9775809331848158, + "grad_norm": 1.8878660202026367, + "learning_rate": 1.704031778025307e-05, + "loss": 0.6352, + "step": 223700 + }, + { + "epoch": 1.977669336445128, + "grad_norm": 1.2861078977584839, + "learning_rate": 1.7038844392581198e-05, + "loss": 0.5676, + "step": 223710 + }, + { + "epoch": 1.9777577397054404, + "grad_norm": 13.850279808044434, + "learning_rate": 1.7037371004909326e-05, + "loss": 0.6429, + "step": 223720 + }, + { + "epoch": 1.9778461429657526, + "grad_norm": 3.532600164413452, + "learning_rate": 1.7035897617237458e-05, + "loss": 0.6275, + "step": 223730 + }, + { + "epoch": 1.9779345462260647, + "grad_norm": 2.6759657859802246, + "learning_rate": 1.7034424229565586e-05, + "loss": 0.5844, + "step": 223740 + }, + { + "epoch": 1.978022949486377, + "grad_norm": 1.7044777870178223, + "learning_rate": 1.7032950841893715e-05, + "loss": 0.4891, + "step": 223750 + }, + { + "epoch": 1.9781113527466894, + "grad_norm": 9.880300521850586, + "learning_rate": 1.7031477454221843e-05, + "loss": 0.6351, + "step": 223760 + }, + { + "epoch": 1.9781997560070015, + "grad_norm": 7.09929895401001, + "learning_rate": 1.7030004066549975e-05, + "loss": 0.4399, + "step": 223770 + }, + { + "epoch": 1.9782881592673138, + "grad_norm": 3.8416786193847656, + "learning_rate": 1.7028530678878103e-05, + "loss": 0.5856, + "step": 223780 + }, + { + "epoch": 1.9783765625276262, + "grad_norm": 1.2900749444961548, + "learning_rate": 1.702705729120623e-05, + "loss": 0.4727, + "step": 223790 + }, + { + "epoch": 1.9784649657879383, + "grad_norm": 3.191159963607788, + "learning_rate": 1.7025583903534363e-05, + "loss": 0.7087, + "step": 223800 + }, + { + "epoch": 1.9785533690482504, + "grad_norm": 3.7237839698791504, + "learning_rate": 1.702411051586249e-05, + "loss": 0.7018, + "step": 223810 + }, + { + "epoch": 1.9786417723085628, + "grad_norm": 3.7309305667877197, + "learning_rate": 1.702263712819062e-05, + "loss": 0.5741, + "step": 223820 + }, + { + "epoch": 1.978730175568875, + "grad_norm": 2.9731409549713135, + "learning_rate": 1.7021163740518752e-05, + "loss": 0.6159, + "step": 223830 + }, + { + "epoch": 1.9788185788291872, + "grad_norm": 2.7989695072174072, + "learning_rate": 1.701969035284688e-05, + "loss": 0.5392, + "step": 223840 + }, + { + "epoch": 1.9789069820894993, + "grad_norm": 1.714094638824463, + "learning_rate": 1.701821696517501e-05, + "loss": 0.576, + "step": 223850 + }, + { + "epoch": 1.9789953853498117, + "grad_norm": 6.915410995483398, + "learning_rate": 1.701674357750314e-05, + "loss": 0.5674, + "step": 223860 + }, + { + "epoch": 1.979083788610124, + "grad_norm": 4.811580181121826, + "learning_rate": 1.701527018983127e-05, + "loss": 0.5876, + "step": 223870 + }, + { + "epoch": 1.9791721918704361, + "grad_norm": 2.657339096069336, + "learning_rate": 1.7013796802159397e-05, + "loss": 0.7162, + "step": 223880 + }, + { + "epoch": 1.9792605951307483, + "grad_norm": 4.997399806976318, + "learning_rate": 1.701232341448753e-05, + "loss": 0.6316, + "step": 223890 + }, + { + "epoch": 1.9793489983910608, + "grad_norm": 4.83448600769043, + "learning_rate": 1.7010850026815657e-05, + "loss": 0.6614, + "step": 223900 + }, + { + "epoch": 1.979437401651373, + "grad_norm": 1.3343744277954102, + "learning_rate": 1.7009376639143785e-05, + "loss": 0.5769, + "step": 223910 + }, + { + "epoch": 1.979525804911685, + "grad_norm": 2.2980024814605713, + "learning_rate": 1.7007903251471917e-05, + "loss": 0.6001, + "step": 223920 + }, + { + "epoch": 1.9796142081719974, + "grad_norm": 1.0228185653686523, + "learning_rate": 1.7006429863800045e-05, + "loss": 0.6659, + "step": 223930 + }, + { + "epoch": 1.9797026114323097, + "grad_norm": 1.5537577867507935, + "learning_rate": 1.7004956476128174e-05, + "loss": 0.7196, + "step": 223940 + }, + { + "epoch": 1.9797910146926219, + "grad_norm": 1.3778533935546875, + "learning_rate": 1.7003483088456306e-05, + "loss": 0.6119, + "step": 223950 + }, + { + "epoch": 1.979879417952934, + "grad_norm": 1.0338380336761475, + "learning_rate": 1.7002009700784434e-05, + "loss": 0.4922, + "step": 223960 + }, + { + "epoch": 1.9799678212132463, + "grad_norm": 9.00503921508789, + "learning_rate": 1.7000536313112562e-05, + "loss": 0.7009, + "step": 223970 + }, + { + "epoch": 1.9800562244735587, + "grad_norm": 3.3645524978637695, + "learning_rate": 1.6999062925440694e-05, + "loss": 0.554, + "step": 223980 + }, + { + "epoch": 1.9801446277338708, + "grad_norm": 1.2249828577041626, + "learning_rate": 1.6997589537768822e-05, + "loss": 0.4951, + "step": 223990 + }, + { + "epoch": 1.980233030994183, + "grad_norm": 2.8988823890686035, + "learning_rate": 1.699611615009695e-05, + "loss": 0.6621, + "step": 224000 + }, + { + "epoch": 1.9803214342544955, + "grad_norm": 1.5652815103530884, + "learning_rate": 1.699464276242508e-05, + "loss": 0.4766, + "step": 224010 + }, + { + "epoch": 1.9804098375148076, + "grad_norm": 3.9531476497650146, + "learning_rate": 1.699316937475321e-05, + "loss": 0.5626, + "step": 224020 + }, + { + "epoch": 1.9804982407751197, + "grad_norm": 2.6915416717529297, + "learning_rate": 1.699169598708134e-05, + "loss": 0.5738, + "step": 224030 + }, + { + "epoch": 1.980586644035432, + "grad_norm": 2.531986713409424, + "learning_rate": 1.6990222599409467e-05, + "loss": 0.5644, + "step": 224040 + }, + { + "epoch": 1.9806750472957444, + "grad_norm": 1.614586353302002, + "learning_rate": 1.6988749211737596e-05, + "loss": 0.5142, + "step": 224050 + }, + { + "epoch": 1.9807634505560565, + "grad_norm": 1.6339393854141235, + "learning_rate": 1.6987275824065728e-05, + "loss": 0.5071, + "step": 224060 + }, + { + "epoch": 1.9808518538163686, + "grad_norm": 1.582903504371643, + "learning_rate": 1.6985802436393856e-05, + "loss": 0.5189, + "step": 224070 + }, + { + "epoch": 1.980940257076681, + "grad_norm": 4.0491557121276855, + "learning_rate": 1.6984329048721984e-05, + "loss": 0.5312, + "step": 224080 + }, + { + "epoch": 1.9810286603369933, + "grad_norm": 1.9501909017562866, + "learning_rate": 1.6982855661050116e-05, + "loss": 0.6763, + "step": 224090 + }, + { + "epoch": 1.9811170635973054, + "grad_norm": 11.996707916259766, + "learning_rate": 1.6981382273378244e-05, + "loss": 0.6142, + "step": 224100 + }, + { + "epoch": 1.9812054668576176, + "grad_norm": 4.226593494415283, + "learning_rate": 1.6979908885706373e-05, + "loss": 0.5013, + "step": 224110 + }, + { + "epoch": 1.98129387011793, + "grad_norm": 2.621124029159546, + "learning_rate": 1.69784354980345e-05, + "loss": 0.6391, + "step": 224120 + }, + { + "epoch": 1.9813822733782422, + "grad_norm": 2.4573519229888916, + "learning_rate": 1.6976962110362633e-05, + "loss": 0.5746, + "step": 224130 + }, + { + "epoch": 1.9814706766385544, + "grad_norm": 11.391940116882324, + "learning_rate": 1.697548872269076e-05, + "loss": 0.7617, + "step": 224140 + }, + { + "epoch": 1.9815590798988667, + "grad_norm": 7.178558826446533, + "learning_rate": 1.697401533501889e-05, + "loss": 0.5036, + "step": 224150 + }, + { + "epoch": 1.981647483159179, + "grad_norm": 2.7364306449890137, + "learning_rate": 1.6972541947347018e-05, + "loss": 0.6303, + "step": 224160 + }, + { + "epoch": 1.9817358864194912, + "grad_norm": 10.925390243530273, + "learning_rate": 1.697106855967515e-05, + "loss": 0.5193, + "step": 224170 + }, + { + "epoch": 1.9818242896798033, + "grad_norm": 1.7822974920272827, + "learning_rate": 1.6969595172003278e-05, + "loss": 0.5683, + "step": 224180 + }, + { + "epoch": 1.9819126929401156, + "grad_norm": 15.627070426940918, + "learning_rate": 1.6968121784331406e-05, + "loss": 0.6209, + "step": 224190 + }, + { + "epoch": 1.982001096200428, + "grad_norm": 5.662389755249023, + "learning_rate": 1.6966648396659538e-05, + "loss": 0.5114, + "step": 224200 + }, + { + "epoch": 1.98208949946074, + "grad_norm": 1.9230458736419678, + "learning_rate": 1.6965175008987666e-05, + "loss": 0.4327, + "step": 224210 + }, + { + "epoch": 1.9821779027210522, + "grad_norm": 3.2107596397399902, + "learning_rate": 1.6963701621315795e-05, + "loss": 0.4802, + "step": 224220 + }, + { + "epoch": 1.9822663059813646, + "grad_norm": 3.1691925525665283, + "learning_rate": 1.6962228233643923e-05, + "loss": 0.5961, + "step": 224230 + }, + { + "epoch": 1.982354709241677, + "grad_norm": 1.4854533672332764, + "learning_rate": 1.6960754845972055e-05, + "loss": 0.5751, + "step": 224240 + }, + { + "epoch": 1.982443112501989, + "grad_norm": 1.9018033742904663, + "learning_rate": 1.6959281458300183e-05, + "loss": 0.6033, + "step": 224250 + }, + { + "epoch": 1.9825315157623014, + "grad_norm": 1.478623390197754, + "learning_rate": 1.695780807062831e-05, + "loss": 0.6022, + "step": 224260 + }, + { + "epoch": 1.9826199190226137, + "grad_norm": 3.3851511478424072, + "learning_rate": 1.6956334682956443e-05, + "loss": 0.6146, + "step": 224270 + }, + { + "epoch": 1.9827083222829258, + "grad_norm": 2.368901014328003, + "learning_rate": 1.6954861295284572e-05, + "loss": 0.5482, + "step": 224280 + }, + { + "epoch": 1.982796725543238, + "grad_norm": 2.6048879623413086, + "learning_rate": 1.69533879076127e-05, + "loss": 0.7511, + "step": 224290 + }, + { + "epoch": 1.9828851288035503, + "grad_norm": 1.4162763357162476, + "learning_rate": 1.695191451994083e-05, + "loss": 0.5106, + "step": 224300 + }, + { + "epoch": 1.9829735320638626, + "grad_norm": 6.880292892456055, + "learning_rate": 1.695044113226896e-05, + "loss": 0.5504, + "step": 224310 + }, + { + "epoch": 1.9830619353241747, + "grad_norm": 1.9633214473724365, + "learning_rate": 1.694896774459709e-05, + "loss": 0.3765, + "step": 224320 + }, + { + "epoch": 1.9831503385844869, + "grad_norm": 3.665835380554199, + "learning_rate": 1.6947494356925217e-05, + "loss": 0.5829, + "step": 224330 + }, + { + "epoch": 1.9832387418447992, + "grad_norm": 1.6488367319107056, + "learning_rate": 1.6946020969253345e-05, + "loss": 0.5372, + "step": 224340 + }, + { + "epoch": 1.9833271451051115, + "grad_norm": 1.4566655158996582, + "learning_rate": 1.6944547581581477e-05, + "loss": 0.6111, + "step": 224350 + }, + { + "epoch": 1.9834155483654237, + "grad_norm": 15.666956901550293, + "learning_rate": 1.6943074193909605e-05, + "loss": 0.7757, + "step": 224360 + }, + { + "epoch": 1.983503951625736, + "grad_norm": 4.194991111755371, + "learning_rate": 1.6941600806237734e-05, + "loss": 0.4925, + "step": 224370 + }, + { + "epoch": 1.9835923548860483, + "grad_norm": 8.081720352172852, + "learning_rate": 1.6940127418565865e-05, + "loss": 0.5839, + "step": 224380 + }, + { + "epoch": 1.9836807581463605, + "grad_norm": 8.042984008789062, + "learning_rate": 1.6938654030893994e-05, + "loss": 0.5748, + "step": 224390 + }, + { + "epoch": 1.9837691614066726, + "grad_norm": 2.2797820568084717, + "learning_rate": 1.6937180643222122e-05, + "loss": 0.5712, + "step": 224400 + }, + { + "epoch": 1.983857564666985, + "grad_norm": 3.472729444503784, + "learning_rate": 1.693570725555025e-05, + "loss": 0.5932, + "step": 224410 + }, + { + "epoch": 1.9839459679272973, + "grad_norm": 6.893070220947266, + "learning_rate": 1.6934233867878382e-05, + "loss": 0.5428, + "step": 224420 + }, + { + "epoch": 1.9840343711876094, + "grad_norm": 6.0669403076171875, + "learning_rate": 1.693276048020651e-05, + "loss": 0.5413, + "step": 224430 + }, + { + "epoch": 1.9841227744479215, + "grad_norm": 0.7792112231254578, + "learning_rate": 1.693128709253464e-05, + "loss": 0.4702, + "step": 224440 + }, + { + "epoch": 1.9842111777082339, + "grad_norm": 1.6176273822784424, + "learning_rate": 1.692981370486277e-05, + "loss": 0.6868, + "step": 224450 + }, + { + "epoch": 1.9842995809685462, + "grad_norm": 4.055951118469238, + "learning_rate": 1.69283403171909e-05, + "loss": 0.5389, + "step": 224460 + }, + { + "epoch": 1.9843879842288583, + "grad_norm": 2.720865249633789, + "learning_rate": 1.6926866929519027e-05, + "loss": 0.6477, + "step": 224470 + }, + { + "epoch": 1.9844763874891704, + "grad_norm": 1.8405334949493408, + "learning_rate": 1.6925393541847156e-05, + "loss": 0.5865, + "step": 224480 + }, + { + "epoch": 1.984564790749483, + "grad_norm": 1.7914783954620361, + "learning_rate": 1.6923920154175287e-05, + "loss": 0.5495, + "step": 224490 + }, + { + "epoch": 1.9846531940097951, + "grad_norm": 3.9661803245544434, + "learning_rate": 1.6922446766503416e-05, + "loss": 0.4685, + "step": 224500 + }, + { + "epoch": 1.9847415972701072, + "grad_norm": 5.78141975402832, + "learning_rate": 1.6920973378831544e-05, + "loss": 0.7583, + "step": 224510 + }, + { + "epoch": 1.9848300005304196, + "grad_norm": 2.5443613529205322, + "learning_rate": 1.6919499991159673e-05, + "loss": 0.717, + "step": 224520 + }, + { + "epoch": 1.984918403790732, + "grad_norm": 1.7132166624069214, + "learning_rate": 1.6918026603487804e-05, + "loss": 0.5181, + "step": 224530 + }, + { + "epoch": 1.985006807051044, + "grad_norm": 3.477186441421509, + "learning_rate": 1.6916553215815933e-05, + "loss": 0.5612, + "step": 224540 + }, + { + "epoch": 1.9850952103113562, + "grad_norm": 1.499049186706543, + "learning_rate": 1.691507982814406e-05, + "loss": 0.5493, + "step": 224550 + }, + { + "epoch": 1.9851836135716685, + "grad_norm": 1.847678303718567, + "learning_rate": 1.6913606440472193e-05, + "loss": 0.6777, + "step": 224560 + }, + { + "epoch": 1.9852720168319808, + "grad_norm": 3.1537466049194336, + "learning_rate": 1.691213305280032e-05, + "loss": 0.4916, + "step": 224570 + }, + { + "epoch": 1.985360420092293, + "grad_norm": 3.695202350616455, + "learning_rate": 1.691065966512845e-05, + "loss": 0.5396, + "step": 224580 + }, + { + "epoch": 1.985448823352605, + "grad_norm": 7.094969749450684, + "learning_rate": 1.6909186277456578e-05, + "loss": 0.5074, + "step": 224590 + }, + { + "epoch": 1.9855372266129176, + "grad_norm": 2.075542688369751, + "learning_rate": 1.690771288978471e-05, + "loss": 0.5758, + "step": 224600 + }, + { + "epoch": 1.9856256298732298, + "grad_norm": 1.3143478631973267, + "learning_rate": 1.6906239502112838e-05, + "loss": 0.5097, + "step": 224610 + }, + { + "epoch": 1.985714033133542, + "grad_norm": 3.3322737216949463, + "learning_rate": 1.6904766114440966e-05, + "loss": 0.645, + "step": 224620 + }, + { + "epoch": 1.9858024363938542, + "grad_norm": 1.4347734451293945, + "learning_rate": 1.6903292726769095e-05, + "loss": 0.5514, + "step": 224630 + }, + { + "epoch": 1.9858908396541666, + "grad_norm": 3.8193883895874023, + "learning_rate": 1.6901819339097226e-05, + "loss": 0.6198, + "step": 224640 + }, + { + "epoch": 1.9859792429144787, + "grad_norm": 1.811673641204834, + "learning_rate": 1.6900345951425355e-05, + "loss": 0.5617, + "step": 224650 + }, + { + "epoch": 1.9860676461747908, + "grad_norm": 2.457360029220581, + "learning_rate": 1.6898872563753483e-05, + "loss": 0.5595, + "step": 224660 + }, + { + "epoch": 1.9861560494351032, + "grad_norm": 4.07906436920166, + "learning_rate": 1.6897399176081615e-05, + "loss": 0.5444, + "step": 224670 + }, + { + "epoch": 1.9862444526954155, + "grad_norm": 4.393835067749023, + "learning_rate": 1.6895925788409743e-05, + "loss": 0.6519, + "step": 224680 + }, + { + "epoch": 1.9863328559557276, + "grad_norm": 3.4482831954956055, + "learning_rate": 1.689445240073787e-05, + "loss": 0.502, + "step": 224690 + }, + { + "epoch": 1.9864212592160397, + "grad_norm": 1.3472683429718018, + "learning_rate": 1.6892979013066e-05, + "loss": 0.5358, + "step": 224700 + }, + { + "epoch": 1.986509662476352, + "grad_norm": 1.7818764448165894, + "learning_rate": 1.689150562539413e-05, + "loss": 0.4806, + "step": 224710 + }, + { + "epoch": 1.9865980657366644, + "grad_norm": 2.1879148483276367, + "learning_rate": 1.689003223772226e-05, + "loss": 0.5691, + "step": 224720 + }, + { + "epoch": 1.9866864689969765, + "grad_norm": 2.7464609146118164, + "learning_rate": 1.6888558850050388e-05, + "loss": 0.5308, + "step": 224730 + }, + { + "epoch": 1.9867748722572889, + "grad_norm": 2.4121510982513428, + "learning_rate": 1.688708546237852e-05, + "loss": 0.577, + "step": 224740 + }, + { + "epoch": 1.9868632755176012, + "grad_norm": 1.6558781862258911, + "learning_rate": 1.688561207470665e-05, + "loss": 0.6469, + "step": 224750 + }, + { + "epoch": 1.9869516787779133, + "grad_norm": 9.172469139099121, + "learning_rate": 1.6884138687034777e-05, + "loss": 0.5378, + "step": 224760 + }, + { + "epoch": 1.9870400820382255, + "grad_norm": 1.5967637300491333, + "learning_rate": 1.688266529936291e-05, + "loss": 0.6133, + "step": 224770 + }, + { + "epoch": 1.9871284852985378, + "grad_norm": 1.2075780630111694, + "learning_rate": 1.6881191911691037e-05, + "loss": 0.5244, + "step": 224780 + }, + { + "epoch": 1.9872168885588501, + "grad_norm": 0.8148615956306458, + "learning_rate": 1.6879718524019165e-05, + "loss": 0.628, + "step": 224790 + }, + { + "epoch": 1.9873052918191623, + "grad_norm": 1.3674981594085693, + "learning_rate": 1.6878245136347297e-05, + "loss": 0.5519, + "step": 224800 + }, + { + "epoch": 1.9873936950794744, + "grad_norm": 0.957788348197937, + "learning_rate": 1.6876771748675425e-05, + "loss": 0.5269, + "step": 224810 + }, + { + "epoch": 1.9874820983397867, + "grad_norm": 3.036616086959839, + "learning_rate": 1.6875298361003554e-05, + "loss": 0.5956, + "step": 224820 + }, + { + "epoch": 1.987570501600099, + "grad_norm": 1.987912893295288, + "learning_rate": 1.6873824973331685e-05, + "loss": 0.5781, + "step": 224830 + }, + { + "epoch": 1.9876589048604112, + "grad_norm": 6.283286094665527, + "learning_rate": 1.6872351585659814e-05, + "loss": 0.5509, + "step": 224840 + }, + { + "epoch": 1.9877473081207235, + "grad_norm": 1.9108779430389404, + "learning_rate": 1.6870878197987945e-05, + "loss": 0.6328, + "step": 224850 + }, + { + "epoch": 1.9878357113810359, + "grad_norm": 1.2929589748382568, + "learning_rate": 1.6869404810316074e-05, + "loss": 0.482, + "step": 224860 + }, + { + "epoch": 1.987924114641348, + "grad_norm": 4.726243495941162, + "learning_rate": 1.6867931422644202e-05, + "loss": 0.5229, + "step": 224870 + }, + { + "epoch": 1.9880125179016601, + "grad_norm": 1.9832627773284912, + "learning_rate": 1.686645803497233e-05, + "loss": 0.6598, + "step": 224880 + }, + { + "epoch": 1.9881009211619725, + "grad_norm": 1.8016328811645508, + "learning_rate": 1.6864984647300462e-05, + "loss": 0.5763, + "step": 224890 + }, + { + "epoch": 1.9881893244222848, + "grad_norm": 1.620561122894287, + "learning_rate": 1.686351125962859e-05, + "loss": 0.4979, + "step": 224900 + }, + { + "epoch": 1.988277727682597, + "grad_norm": 4.599083423614502, + "learning_rate": 1.686203787195672e-05, + "loss": 0.5075, + "step": 224910 + }, + { + "epoch": 1.988366130942909, + "grad_norm": 3.026254415512085, + "learning_rate": 1.686056448428485e-05, + "loss": 0.6115, + "step": 224920 + }, + { + "epoch": 1.9884545342032214, + "grad_norm": 1.3611140251159668, + "learning_rate": 1.685909109661298e-05, + "loss": 0.5479, + "step": 224930 + }, + { + "epoch": 1.9885429374635337, + "grad_norm": 7.72886323928833, + "learning_rate": 1.6857617708941107e-05, + "loss": 0.4639, + "step": 224940 + }, + { + "epoch": 1.9886313407238458, + "grad_norm": 4.344997406005859, + "learning_rate": 1.6856144321269236e-05, + "loss": 0.633, + "step": 224950 + }, + { + "epoch": 1.9887197439841582, + "grad_norm": 4.46109676361084, + "learning_rate": 1.6854670933597368e-05, + "loss": 0.6392, + "step": 224960 + }, + { + "epoch": 1.9888081472444705, + "grad_norm": 11.079893112182617, + "learning_rate": 1.6853197545925496e-05, + "loss": 0.6176, + "step": 224970 + }, + { + "epoch": 1.9888965505047826, + "grad_norm": 5.218672752380371, + "learning_rate": 1.6851724158253624e-05, + "loss": 0.5178, + "step": 224980 + }, + { + "epoch": 1.9889849537650948, + "grad_norm": 2.4219586849212646, + "learning_rate": 1.6850250770581753e-05, + "loss": 0.5192, + "step": 224990 + }, + { + "epoch": 1.989073357025407, + "grad_norm": 4.831282615661621, + "learning_rate": 1.6848777382909884e-05, + "loss": 0.5875, + "step": 225000 + }, + { + "epoch": 1.9891617602857194, + "grad_norm": 2.6583251953125, + "learning_rate": 1.6847303995238013e-05, + "loss": 0.5748, + "step": 225010 + }, + { + "epoch": 1.9892501635460316, + "grad_norm": 1.1268571615219116, + "learning_rate": 1.684583060756614e-05, + "loss": 0.5148, + "step": 225020 + }, + { + "epoch": 1.9893385668063437, + "grad_norm": 3.306156635284424, + "learning_rate": 1.6844357219894273e-05, + "loss": 0.6196, + "step": 225030 + }, + { + "epoch": 1.989426970066656, + "grad_norm": 1.5640621185302734, + "learning_rate": 1.68428838322224e-05, + "loss": 0.5988, + "step": 225040 + }, + { + "epoch": 1.9895153733269684, + "grad_norm": 2.0384891033172607, + "learning_rate": 1.684141044455053e-05, + "loss": 0.7041, + "step": 225050 + }, + { + "epoch": 1.9896037765872805, + "grad_norm": 2.0110831260681152, + "learning_rate": 1.6839937056878658e-05, + "loss": 0.6548, + "step": 225060 + }, + { + "epoch": 1.9896921798475928, + "grad_norm": 2.0337533950805664, + "learning_rate": 1.683846366920679e-05, + "loss": 0.6102, + "step": 225070 + }, + { + "epoch": 1.9897805831079052, + "grad_norm": 1.8111612796783447, + "learning_rate": 1.6836990281534918e-05, + "loss": 0.4591, + "step": 225080 + }, + { + "epoch": 1.9898689863682173, + "grad_norm": 1.1000893115997314, + "learning_rate": 1.6835516893863046e-05, + "loss": 0.5735, + "step": 225090 + }, + { + "epoch": 1.9899573896285294, + "grad_norm": 1.8954603672027588, + "learning_rate": 1.6834043506191175e-05, + "loss": 0.5119, + "step": 225100 + }, + { + "epoch": 1.9900457928888418, + "grad_norm": 1.3520907163619995, + "learning_rate": 1.6832570118519306e-05, + "loss": 0.6305, + "step": 225110 + }, + { + "epoch": 1.990134196149154, + "grad_norm": 5.1060967445373535, + "learning_rate": 1.6831096730847435e-05, + "loss": 0.6116, + "step": 225120 + }, + { + "epoch": 1.9902225994094662, + "grad_norm": 3.5641541481018066, + "learning_rate": 1.6829623343175563e-05, + "loss": 0.4878, + "step": 225130 + }, + { + "epoch": 1.9903110026697783, + "grad_norm": 5.637264728546143, + "learning_rate": 1.6828149955503695e-05, + "loss": 0.5807, + "step": 225140 + }, + { + "epoch": 1.9903994059300907, + "grad_norm": 3.1112334728240967, + "learning_rate": 1.6826676567831823e-05, + "loss": 0.6752, + "step": 225150 + }, + { + "epoch": 1.990487809190403, + "grad_norm": 4.424471855163574, + "learning_rate": 1.682520318015995e-05, + "loss": 0.4787, + "step": 225160 + }, + { + "epoch": 1.9905762124507151, + "grad_norm": 1.129172921180725, + "learning_rate": 1.682372979248808e-05, + "loss": 0.5729, + "step": 225170 + }, + { + "epoch": 1.9906646157110273, + "grad_norm": 0.8466700911521912, + "learning_rate": 1.682225640481621e-05, + "loss": 0.5369, + "step": 225180 + }, + { + "epoch": 1.9907530189713398, + "grad_norm": 2.4012086391448975, + "learning_rate": 1.682078301714434e-05, + "loss": 0.6113, + "step": 225190 + }, + { + "epoch": 1.990841422231652, + "grad_norm": 3.801825761795044, + "learning_rate": 1.681930962947247e-05, + "loss": 0.6716, + "step": 225200 + }, + { + "epoch": 1.990929825491964, + "grad_norm": 4.74006462097168, + "learning_rate": 1.68178362418006e-05, + "loss": 0.7159, + "step": 225210 + }, + { + "epoch": 1.9910182287522764, + "grad_norm": 2.6332221031188965, + "learning_rate": 1.681636285412873e-05, + "loss": 0.4632, + "step": 225220 + }, + { + "epoch": 1.9911066320125888, + "grad_norm": 1.048154592514038, + "learning_rate": 1.6814889466456857e-05, + "loss": 0.4734, + "step": 225230 + }, + { + "epoch": 1.9911950352729009, + "grad_norm": 13.232263565063477, + "learning_rate": 1.6813416078784985e-05, + "loss": 0.4885, + "step": 225240 + }, + { + "epoch": 1.991283438533213, + "grad_norm": 0.8549413084983826, + "learning_rate": 1.6811942691113117e-05, + "loss": 0.541, + "step": 225250 + }, + { + "epoch": 1.9913718417935253, + "grad_norm": 4.279587268829346, + "learning_rate": 1.6810469303441245e-05, + "loss": 0.6136, + "step": 225260 + }, + { + "epoch": 1.9914602450538377, + "grad_norm": 4.3734893798828125, + "learning_rate": 1.6808995915769374e-05, + "loss": 0.6915, + "step": 225270 + }, + { + "epoch": 1.9915486483141498, + "grad_norm": 0.9106436371803284, + "learning_rate": 1.6807522528097502e-05, + "loss": 0.5335, + "step": 225280 + }, + { + "epoch": 1.991637051574462, + "grad_norm": 1.1319162845611572, + "learning_rate": 1.6806049140425634e-05, + "loss": 0.5117, + "step": 225290 + }, + { + "epoch": 1.9917254548347743, + "grad_norm": 10.555523872375488, + "learning_rate": 1.6804575752753762e-05, + "loss": 0.58, + "step": 225300 + }, + { + "epoch": 1.9918138580950866, + "grad_norm": 5.483819961547852, + "learning_rate": 1.680310236508189e-05, + "loss": 0.5905, + "step": 225310 + }, + { + "epoch": 1.9919022613553987, + "grad_norm": 6.8603315353393555, + "learning_rate": 1.6801628977410022e-05, + "loss": 0.6074, + "step": 225320 + }, + { + "epoch": 1.991990664615711, + "grad_norm": 1.3294624090194702, + "learning_rate": 1.680015558973815e-05, + "loss": 0.5694, + "step": 225330 + }, + { + "epoch": 1.9920790678760234, + "grad_norm": 12.74087142944336, + "learning_rate": 1.679868220206628e-05, + "loss": 0.7074, + "step": 225340 + }, + { + "epoch": 1.9921674711363355, + "grad_norm": 1.852597713470459, + "learning_rate": 1.6797208814394407e-05, + "loss": 0.5257, + "step": 225350 + }, + { + "epoch": 1.9922558743966476, + "grad_norm": 11.889374732971191, + "learning_rate": 1.679573542672254e-05, + "loss": 0.4863, + "step": 225360 + }, + { + "epoch": 1.99234427765696, + "grad_norm": 2.9477877616882324, + "learning_rate": 1.6794262039050667e-05, + "loss": 0.4494, + "step": 225370 + }, + { + "epoch": 1.9924326809172723, + "grad_norm": 10.925512313842773, + "learning_rate": 1.6792788651378796e-05, + "loss": 0.6253, + "step": 225380 + }, + { + "epoch": 1.9925210841775844, + "grad_norm": 5.6802449226379395, + "learning_rate": 1.6791315263706927e-05, + "loss": 0.6502, + "step": 225390 + }, + { + "epoch": 1.9926094874378966, + "grad_norm": 1.6207715272903442, + "learning_rate": 1.6789841876035056e-05, + "loss": 0.5674, + "step": 225400 + }, + { + "epoch": 1.992697890698209, + "grad_norm": 18.84044075012207, + "learning_rate": 1.6788368488363184e-05, + "loss": 0.4871, + "step": 225410 + }, + { + "epoch": 1.9927862939585212, + "grad_norm": 1.179420828819275, + "learning_rate": 1.6786895100691312e-05, + "loss": 0.6922, + "step": 225420 + }, + { + "epoch": 1.9928746972188334, + "grad_norm": 0.961736261844635, + "learning_rate": 1.6785421713019444e-05, + "loss": 0.5083, + "step": 225430 + }, + { + "epoch": 1.9929631004791457, + "grad_norm": 5.554663181304932, + "learning_rate": 1.6783948325347573e-05, + "loss": 0.729, + "step": 225440 + }, + { + "epoch": 1.993051503739458, + "grad_norm": 8.339944839477539, + "learning_rate": 1.67824749376757e-05, + "loss": 0.5474, + "step": 225450 + }, + { + "epoch": 1.9931399069997702, + "grad_norm": 4.111626148223877, + "learning_rate": 1.678100155000383e-05, + "loss": 0.5849, + "step": 225460 + }, + { + "epoch": 1.9932283102600823, + "grad_norm": 2.2576897144317627, + "learning_rate": 1.677952816233196e-05, + "loss": 0.4451, + "step": 225470 + }, + { + "epoch": 1.9933167135203946, + "grad_norm": 4.997096061706543, + "learning_rate": 1.677805477466009e-05, + "loss": 0.604, + "step": 225480 + }, + { + "epoch": 1.993405116780707, + "grad_norm": 2.46445369720459, + "learning_rate": 1.6776581386988218e-05, + "loss": 0.7684, + "step": 225490 + }, + { + "epoch": 1.993493520041019, + "grad_norm": 2.6454429626464844, + "learning_rate": 1.677510799931635e-05, + "loss": 0.6637, + "step": 225500 + }, + { + "epoch": 1.9935819233013312, + "grad_norm": 4.765692234039307, + "learning_rate": 1.6773634611644478e-05, + "loss": 0.7253, + "step": 225510 + }, + { + "epoch": 1.9936703265616436, + "grad_norm": 1.414405107498169, + "learning_rate": 1.6772161223972606e-05, + "loss": 0.4885, + "step": 225520 + }, + { + "epoch": 1.993758729821956, + "grad_norm": 5.145718574523926, + "learning_rate": 1.6770687836300735e-05, + "loss": 0.6276, + "step": 225530 + }, + { + "epoch": 1.993847133082268, + "grad_norm": 2.5033962726593018, + "learning_rate": 1.6769214448628866e-05, + "loss": 0.5412, + "step": 225540 + }, + { + "epoch": 1.9939355363425804, + "grad_norm": 2.069969654083252, + "learning_rate": 1.6767741060956995e-05, + "loss": 0.7213, + "step": 225550 + }, + { + "epoch": 1.9940239396028927, + "grad_norm": 4.3891777992248535, + "learning_rate": 1.6766267673285123e-05, + "loss": 0.5545, + "step": 225560 + }, + { + "epoch": 1.9941123428632048, + "grad_norm": 1.4024473428726196, + "learning_rate": 1.6764794285613255e-05, + "loss": 0.6344, + "step": 225570 + }, + { + "epoch": 1.994200746123517, + "grad_norm": 3.2352919578552246, + "learning_rate": 1.6763320897941383e-05, + "loss": 0.5767, + "step": 225580 + }, + { + "epoch": 1.9942891493838293, + "grad_norm": 2.1905648708343506, + "learning_rate": 1.676184751026951e-05, + "loss": 0.5415, + "step": 225590 + }, + { + "epoch": 1.9943775526441416, + "grad_norm": 5.467777729034424, + "learning_rate": 1.676037412259764e-05, + "loss": 0.5505, + "step": 225600 + }, + { + "epoch": 1.9944659559044537, + "grad_norm": 5.634042739868164, + "learning_rate": 1.675890073492577e-05, + "loss": 0.7058, + "step": 225610 + }, + { + "epoch": 1.9945543591647659, + "grad_norm": 23.753089904785156, + "learning_rate": 1.67574273472539e-05, + "loss": 0.7105, + "step": 225620 + }, + { + "epoch": 1.9946427624250782, + "grad_norm": 2.8161308765411377, + "learning_rate": 1.6755953959582028e-05, + "loss": 0.5731, + "step": 225630 + }, + { + "epoch": 1.9947311656853906, + "grad_norm": 12.272192001342773, + "learning_rate": 1.675448057191016e-05, + "loss": 0.6786, + "step": 225640 + }, + { + "epoch": 1.9948195689457027, + "grad_norm": 2.276580572128296, + "learning_rate": 1.675300718423829e-05, + "loss": 0.6314, + "step": 225650 + }, + { + "epoch": 1.994907972206015, + "grad_norm": 1.5143632888793945, + "learning_rate": 1.6751533796566417e-05, + "loss": 0.6322, + "step": 225660 + }, + { + "epoch": 1.9949963754663274, + "grad_norm": 5.343900203704834, + "learning_rate": 1.675006040889455e-05, + "loss": 0.5836, + "step": 225670 + }, + { + "epoch": 1.9950847787266395, + "grad_norm": 3.547680139541626, + "learning_rate": 1.6748587021222677e-05, + "loss": 0.595, + "step": 225680 + }, + { + "epoch": 1.9951731819869516, + "grad_norm": 1.290130376815796, + "learning_rate": 1.6747113633550805e-05, + "loss": 0.5369, + "step": 225690 + }, + { + "epoch": 1.995261585247264, + "grad_norm": 1.1940443515777588, + "learning_rate": 1.6745640245878937e-05, + "loss": 0.591, + "step": 225700 + }, + { + "epoch": 1.9953499885075763, + "grad_norm": 4.946343898773193, + "learning_rate": 1.6744166858207065e-05, + "loss": 0.5971, + "step": 225710 + }, + { + "epoch": 1.9954383917678884, + "grad_norm": 11.978418350219727, + "learning_rate": 1.6742693470535194e-05, + "loss": 0.5623, + "step": 225720 + }, + { + "epoch": 1.9955267950282005, + "grad_norm": 1.3770660161972046, + "learning_rate": 1.6741220082863325e-05, + "loss": 0.6232, + "step": 225730 + }, + { + "epoch": 1.9956151982885129, + "grad_norm": 1.5341140031814575, + "learning_rate": 1.6739746695191454e-05, + "loss": 0.6196, + "step": 225740 + }, + { + "epoch": 1.9957036015488252, + "grad_norm": 1.2397600412368774, + "learning_rate": 1.6738273307519582e-05, + "loss": 0.4289, + "step": 225750 + }, + { + "epoch": 1.9957920048091373, + "grad_norm": 2.2048261165618896, + "learning_rate": 1.6736799919847714e-05, + "loss": 0.6551, + "step": 225760 + }, + { + "epoch": 1.9958804080694494, + "grad_norm": 2.451563835144043, + "learning_rate": 1.6735326532175842e-05, + "loss": 0.6384, + "step": 225770 + }, + { + "epoch": 1.995968811329762, + "grad_norm": 1.699855089187622, + "learning_rate": 1.673385314450397e-05, + "loss": 0.4734, + "step": 225780 + }, + { + "epoch": 1.9960572145900741, + "grad_norm": 3.1928694248199463, + "learning_rate": 1.6732379756832102e-05, + "loss": 0.567, + "step": 225790 + }, + { + "epoch": 1.9961456178503862, + "grad_norm": 2.2834367752075195, + "learning_rate": 1.673090636916023e-05, + "loss": 0.4598, + "step": 225800 + }, + { + "epoch": 1.9962340211106986, + "grad_norm": 1.8125332593917847, + "learning_rate": 1.672943298148836e-05, + "loss": 0.5784, + "step": 225810 + }, + { + "epoch": 1.996322424371011, + "grad_norm": 1.8841636180877686, + "learning_rate": 1.6727959593816487e-05, + "loss": 0.6716, + "step": 225820 + }, + { + "epoch": 1.996410827631323, + "grad_norm": 1.5954008102416992, + "learning_rate": 1.672648620614462e-05, + "loss": 0.5947, + "step": 225830 + }, + { + "epoch": 1.9964992308916352, + "grad_norm": 2.470388174057007, + "learning_rate": 1.6725012818472747e-05, + "loss": 0.6013, + "step": 225840 + }, + { + "epoch": 1.9965876341519475, + "grad_norm": 0.8296368718147278, + "learning_rate": 1.6723539430800876e-05, + "loss": 0.6322, + "step": 225850 + }, + { + "epoch": 1.9966760374122599, + "grad_norm": 2.4908385276794434, + "learning_rate": 1.6722066043129007e-05, + "loss": 0.6579, + "step": 225860 + }, + { + "epoch": 1.996764440672572, + "grad_norm": 3.1366007328033447, + "learning_rate": 1.6720592655457136e-05, + "loss": 0.6049, + "step": 225870 + }, + { + "epoch": 1.996852843932884, + "grad_norm": 2.1875829696655273, + "learning_rate": 1.6719119267785264e-05, + "loss": 0.5508, + "step": 225880 + }, + { + "epoch": 1.9969412471931964, + "grad_norm": 2.4100494384765625, + "learning_rate": 1.6717645880113393e-05, + "loss": 0.4558, + "step": 225890 + }, + { + "epoch": 1.9970296504535088, + "grad_norm": 2.559838056564331, + "learning_rate": 1.6716172492441524e-05, + "loss": 0.591, + "step": 225900 + }, + { + "epoch": 1.997118053713821, + "grad_norm": 1.209906816482544, + "learning_rate": 1.6714699104769653e-05, + "loss": 0.5157, + "step": 225910 + }, + { + "epoch": 1.9972064569741332, + "grad_norm": 4.076237678527832, + "learning_rate": 1.671322571709778e-05, + "loss": 0.5812, + "step": 225920 + }, + { + "epoch": 1.9972948602344456, + "grad_norm": 13.479727745056152, + "learning_rate": 1.671175232942591e-05, + "loss": 0.5305, + "step": 225930 + }, + { + "epoch": 1.9973832634947577, + "grad_norm": 2.303053617477417, + "learning_rate": 1.671027894175404e-05, + "loss": 0.5326, + "step": 225940 + }, + { + "epoch": 1.9974716667550698, + "grad_norm": 5.860489368438721, + "learning_rate": 1.670880555408217e-05, + "loss": 0.5122, + "step": 225950 + }, + { + "epoch": 1.9975600700153822, + "grad_norm": 8.860002517700195, + "learning_rate": 1.6707332166410298e-05, + "loss": 0.5731, + "step": 225960 + }, + { + "epoch": 1.9976484732756945, + "grad_norm": 1.4590798616409302, + "learning_rate": 1.670585877873843e-05, + "loss": 0.5888, + "step": 225970 + }, + { + "epoch": 1.9977368765360066, + "grad_norm": 7.0802388191223145, + "learning_rate": 1.6704385391066558e-05, + "loss": 0.6654, + "step": 225980 + }, + { + "epoch": 1.9978252797963187, + "grad_norm": 3.268148422241211, + "learning_rate": 1.6702912003394686e-05, + "loss": 0.6022, + "step": 225990 + }, + { + "epoch": 1.997913683056631, + "grad_norm": 6.230152606964111, + "learning_rate": 1.6701438615722815e-05, + "loss": 0.5004, + "step": 226000 + }, + { + "epoch": 1.9980020863169434, + "grad_norm": 2.543973445892334, + "learning_rate": 1.6699965228050946e-05, + "loss": 0.5898, + "step": 226010 + }, + { + "epoch": 1.9980904895772555, + "grad_norm": 10.80360221862793, + "learning_rate": 1.6698491840379075e-05, + "loss": 0.6368, + "step": 226020 + }, + { + "epoch": 1.9981788928375679, + "grad_norm": 1.3303838968276978, + "learning_rate": 1.6697018452707203e-05, + "loss": 0.6188, + "step": 226030 + }, + { + "epoch": 1.9982672960978802, + "grad_norm": 1.1822746992111206, + "learning_rate": 1.6695545065035335e-05, + "loss": 0.5125, + "step": 226040 + }, + { + "epoch": 1.9983556993581923, + "grad_norm": 3.7949411869049072, + "learning_rate": 1.6694071677363463e-05, + "loss": 0.5073, + "step": 226050 + }, + { + "epoch": 1.9984441026185045, + "grad_norm": 0.9324376583099365, + "learning_rate": 1.669259828969159e-05, + "loss": 0.5405, + "step": 226060 + }, + { + "epoch": 1.9985325058788168, + "grad_norm": 2.840153217315674, + "learning_rate": 1.669112490201972e-05, + "loss": 0.496, + "step": 226070 + }, + { + "epoch": 1.9986209091391292, + "grad_norm": 2.08803653717041, + "learning_rate": 1.668965151434785e-05, + "loss": 0.5232, + "step": 226080 + }, + { + "epoch": 1.9987093123994413, + "grad_norm": 2.0550999641418457, + "learning_rate": 1.668817812667598e-05, + "loss": 0.4621, + "step": 226090 + }, + { + "epoch": 1.9987977156597534, + "grad_norm": 1.2500879764556885, + "learning_rate": 1.6686704739004108e-05, + "loss": 0.5219, + "step": 226100 + }, + { + "epoch": 1.9988861189200657, + "grad_norm": 1.7983343601226807, + "learning_rate": 1.6685231351332237e-05, + "loss": 0.514, + "step": 226110 + }, + { + "epoch": 1.998974522180378, + "grad_norm": 4.482465744018555, + "learning_rate": 1.668375796366037e-05, + "loss": 0.569, + "step": 226120 + }, + { + "epoch": 1.9990629254406902, + "grad_norm": 4.159267425537109, + "learning_rate": 1.6682284575988497e-05, + "loss": 0.67, + "step": 226130 + }, + { + "epoch": 1.9991513287010025, + "grad_norm": 1.8467906713485718, + "learning_rate": 1.6680811188316625e-05, + "loss": 0.601, + "step": 226140 + }, + { + "epoch": 1.9992397319613149, + "grad_norm": 6.043004989624023, + "learning_rate": 1.6679337800644757e-05, + "loss": 0.4399, + "step": 226150 + }, + { + "epoch": 1.999328135221627, + "grad_norm": 2.3459484577178955, + "learning_rate": 1.6677864412972885e-05, + "loss": 0.645, + "step": 226160 + }, + { + "epoch": 1.9994165384819391, + "grad_norm": 2.997763156890869, + "learning_rate": 1.6676391025301014e-05, + "loss": 0.5968, + "step": 226170 + }, + { + "epoch": 1.9995049417422515, + "grad_norm": 19.69053077697754, + "learning_rate": 1.6674917637629142e-05, + "loss": 0.6163, + "step": 226180 + }, + { + "epoch": 1.9995933450025638, + "grad_norm": 2.2578210830688477, + "learning_rate": 1.6673444249957274e-05, + "loss": 0.5128, + "step": 226190 + }, + { + "epoch": 1.999681748262876, + "grad_norm": 1.19893479347229, + "learning_rate": 1.6671970862285402e-05, + "loss": 0.6074, + "step": 226200 + }, + { + "epoch": 1.999770151523188, + "grad_norm": 1.4854111671447754, + "learning_rate": 1.667049747461353e-05, + "loss": 0.6433, + "step": 226210 + }, + { + "epoch": 1.9998585547835004, + "grad_norm": 1.1448254585266113, + "learning_rate": 1.666902408694166e-05, + "loss": 0.6176, + "step": 226220 + }, + { + "epoch": 1.9999469580438127, + "grad_norm": 5.088480472564697, + "learning_rate": 1.666755069926979e-05, + "loss": 0.6296, + "step": 226230 + }, + { + "epoch": 2.0, + "eval_loss": 0.5896387696266174, + "eval_runtime": 1557.2969, + "eval_samples_per_second": 290.548, + "eval_steps_per_second": 18.16, + "step": 226236 + }, + { + "epoch": 2.000035361304125, + "grad_norm": 2.1874849796295166, + "learning_rate": 1.666607731159792e-05, + "loss": 0.4975, + "step": 226240 + }, + { + "epoch": 2.000123764564437, + "grad_norm": 1.5953342914581299, + "learning_rate": 1.6664603923926047e-05, + "loss": 0.5065, + "step": 226250 + }, + { + "epoch": 2.0002121678247495, + "grad_norm": 1.244445562362671, + "learning_rate": 1.666313053625418e-05, + "loss": 0.529, + "step": 226260 + }, + { + "epoch": 2.0003005710850617, + "grad_norm": 1.1338938474655151, + "learning_rate": 1.6661657148582307e-05, + "loss": 0.4876, + "step": 226270 + }, + { + "epoch": 2.0003889743453738, + "grad_norm": 5.26050329208374, + "learning_rate": 1.6660183760910436e-05, + "loss": 0.5572, + "step": 226280 + }, + { + "epoch": 2.000477377605686, + "grad_norm": 2.855520486831665, + "learning_rate": 1.6658710373238564e-05, + "loss": 0.6468, + "step": 226290 + }, + { + "epoch": 2.0005657808659985, + "grad_norm": 2.9620635509490967, + "learning_rate": 1.6657236985566696e-05, + "loss": 0.5322, + "step": 226300 + }, + { + "epoch": 2.0006541841263106, + "grad_norm": 4.931084156036377, + "learning_rate": 1.6655763597894824e-05, + "loss": 0.5446, + "step": 226310 + }, + { + "epoch": 2.0007425873866227, + "grad_norm": 1.6907137632369995, + "learning_rate": 1.6654290210222952e-05, + "loss": 0.5074, + "step": 226320 + }, + { + "epoch": 2.0008309906469353, + "grad_norm": 13.46829891204834, + "learning_rate": 1.6652816822551084e-05, + "loss": 0.5043, + "step": 226330 + }, + { + "epoch": 2.0009193939072474, + "grad_norm": 1.7841097116470337, + "learning_rate": 1.6651343434879213e-05, + "loss": 0.5115, + "step": 226340 + }, + { + "epoch": 2.0010077971675595, + "grad_norm": 4.074251651763916, + "learning_rate": 1.664987004720734e-05, + "loss": 0.4255, + "step": 226350 + }, + { + "epoch": 2.0010962004278716, + "grad_norm": 3.4969000816345215, + "learning_rate": 1.664839665953547e-05, + "loss": 0.528, + "step": 226360 + }, + { + "epoch": 2.001184603688184, + "grad_norm": 1.2476563453674316, + "learning_rate": 1.66469232718636e-05, + "loss": 0.5794, + "step": 226370 + }, + { + "epoch": 2.0012730069484963, + "grad_norm": 0.8434544801712036, + "learning_rate": 1.664544988419173e-05, + "loss": 0.4651, + "step": 226380 + }, + { + "epoch": 2.0013614102088084, + "grad_norm": 3.7285029888153076, + "learning_rate": 1.6643976496519858e-05, + "loss": 0.6182, + "step": 226390 + }, + { + "epoch": 2.0014498134691205, + "grad_norm": 4.3275980949401855, + "learning_rate": 1.6642503108847986e-05, + "loss": 0.5106, + "step": 226400 + }, + { + "epoch": 2.001538216729433, + "grad_norm": 1.6196305751800537, + "learning_rate": 1.6641029721176118e-05, + "loss": 0.5377, + "step": 226410 + }, + { + "epoch": 2.0016266199897452, + "grad_norm": 5.1456146240234375, + "learning_rate": 1.6639556333504246e-05, + "loss": 0.5615, + "step": 226420 + }, + { + "epoch": 2.0017150232500573, + "grad_norm": 1.9292058944702148, + "learning_rate": 1.6638082945832374e-05, + "loss": 0.6326, + "step": 226430 + }, + { + "epoch": 2.00180342651037, + "grad_norm": 3.6200435161590576, + "learning_rate": 1.6636609558160506e-05, + "loss": 0.5473, + "step": 226440 + }, + { + "epoch": 2.001891829770682, + "grad_norm": 1.1902130842208862, + "learning_rate": 1.6635136170488635e-05, + "loss": 0.5825, + "step": 226450 + }, + { + "epoch": 2.001980233030994, + "grad_norm": 6.912537097930908, + "learning_rate": 1.6633662782816763e-05, + "loss": 0.7111, + "step": 226460 + }, + { + "epoch": 2.0020686362913063, + "grad_norm": 2.652531623840332, + "learning_rate": 1.663218939514489e-05, + "loss": 0.5736, + "step": 226470 + }, + { + "epoch": 2.002157039551619, + "grad_norm": 4.639496803283691, + "learning_rate": 1.6630716007473023e-05, + "loss": 0.4918, + "step": 226480 + }, + { + "epoch": 2.002245442811931, + "grad_norm": 4.625110626220703, + "learning_rate": 1.662924261980115e-05, + "loss": 0.591, + "step": 226490 + }, + { + "epoch": 2.002333846072243, + "grad_norm": 1.416290044784546, + "learning_rate": 1.662776923212928e-05, + "loss": 0.7216, + "step": 226500 + }, + { + "epoch": 2.002422249332555, + "grad_norm": 1.9091465473175049, + "learning_rate": 1.662629584445741e-05, + "loss": 0.4762, + "step": 226510 + }, + { + "epoch": 2.0025106525928678, + "grad_norm": 1.1477149724960327, + "learning_rate": 1.662482245678554e-05, + "loss": 0.4799, + "step": 226520 + }, + { + "epoch": 2.00259905585318, + "grad_norm": 1.644767165184021, + "learning_rate": 1.6623349069113668e-05, + "loss": 0.3395, + "step": 226530 + }, + { + "epoch": 2.002687459113492, + "grad_norm": 4.657378196716309, + "learning_rate": 1.6621875681441797e-05, + "loss": 0.4455, + "step": 226540 + }, + { + "epoch": 2.0027758623738046, + "grad_norm": 2.1009159088134766, + "learning_rate": 1.6620402293769928e-05, + "loss": 0.6772, + "step": 226550 + }, + { + "epoch": 2.0028642656341167, + "grad_norm": 7.348577976226807, + "learning_rate": 1.6618928906098057e-05, + "loss": 0.472, + "step": 226560 + }, + { + "epoch": 2.002952668894429, + "grad_norm": 1.1804465055465698, + "learning_rate": 1.6617455518426185e-05, + "loss": 0.5798, + "step": 226570 + }, + { + "epoch": 2.003041072154741, + "grad_norm": 1.4105063676834106, + "learning_rate": 1.6615982130754317e-05, + "loss": 0.54, + "step": 226580 + }, + { + "epoch": 2.0031294754150535, + "grad_norm": 1.898439645767212, + "learning_rate": 1.6614508743082445e-05, + "loss": 0.4719, + "step": 226590 + }, + { + "epoch": 2.0032178786753656, + "grad_norm": 1.846827745437622, + "learning_rate": 1.6613035355410573e-05, + "loss": 0.6385, + "step": 226600 + }, + { + "epoch": 2.0033062819356777, + "grad_norm": 2.725637435913086, + "learning_rate": 1.6611561967738705e-05, + "loss": 0.4055, + "step": 226610 + }, + { + "epoch": 2.00339468519599, + "grad_norm": 2.948009490966797, + "learning_rate": 1.6610088580066834e-05, + "loss": 0.6092, + "step": 226620 + }, + { + "epoch": 2.0034830884563024, + "grad_norm": 3.7578041553497314, + "learning_rate": 1.6608615192394962e-05, + "loss": 0.5517, + "step": 226630 + }, + { + "epoch": 2.0035714917166145, + "grad_norm": 2.911978006362915, + "learning_rate": 1.6607141804723094e-05, + "loss": 0.4692, + "step": 226640 + }, + { + "epoch": 2.0036598949769266, + "grad_norm": 1.748624324798584, + "learning_rate": 1.6605668417051222e-05, + "loss": 0.5394, + "step": 226650 + }, + { + "epoch": 2.003748298237239, + "grad_norm": 4.847993850708008, + "learning_rate": 1.660419502937935e-05, + "loss": 0.5823, + "step": 226660 + }, + { + "epoch": 2.0038367014975513, + "grad_norm": 1.9426450729370117, + "learning_rate": 1.6602721641707482e-05, + "loss": 0.514, + "step": 226670 + }, + { + "epoch": 2.0039251047578635, + "grad_norm": 3.120805501937866, + "learning_rate": 1.660124825403561e-05, + "loss": 0.4897, + "step": 226680 + }, + { + "epoch": 2.0040135080181756, + "grad_norm": 1.5435906648635864, + "learning_rate": 1.659977486636374e-05, + "loss": 0.4323, + "step": 226690 + }, + { + "epoch": 2.004101911278488, + "grad_norm": 5.0077104568481445, + "learning_rate": 1.659830147869187e-05, + "loss": 0.603, + "step": 226700 + }, + { + "epoch": 2.0041903145388003, + "grad_norm": 1.593505859375, + "learning_rate": 1.659682809102e-05, + "loss": 0.5083, + "step": 226710 + }, + { + "epoch": 2.0042787177991124, + "grad_norm": 2.1605443954467773, + "learning_rate": 1.6595354703348127e-05, + "loss": 0.6612, + "step": 226720 + }, + { + "epoch": 2.0043671210594245, + "grad_norm": 4.325282096862793, + "learning_rate": 1.659388131567626e-05, + "loss": 0.4663, + "step": 226730 + }, + { + "epoch": 2.004455524319737, + "grad_norm": 2.1911299228668213, + "learning_rate": 1.6592407928004387e-05, + "loss": 0.651, + "step": 226740 + }, + { + "epoch": 2.004543927580049, + "grad_norm": 1.3787710666656494, + "learning_rate": 1.6590934540332516e-05, + "loss": 0.3709, + "step": 226750 + }, + { + "epoch": 2.0046323308403613, + "grad_norm": 1.088927149772644, + "learning_rate": 1.6589461152660644e-05, + "loss": 0.5471, + "step": 226760 + }, + { + "epoch": 2.0047207341006734, + "grad_norm": 1.6254916191101074, + "learning_rate": 1.6587987764988776e-05, + "loss": 0.58, + "step": 226770 + }, + { + "epoch": 2.004809137360986, + "grad_norm": 6.528554916381836, + "learning_rate": 1.6586514377316904e-05, + "loss": 0.54, + "step": 226780 + }, + { + "epoch": 2.004897540621298, + "grad_norm": 1.3381448984146118, + "learning_rate": 1.6585040989645032e-05, + "loss": 0.5161, + "step": 226790 + }, + { + "epoch": 2.00498594388161, + "grad_norm": 13.180055618286133, + "learning_rate": 1.6583567601973164e-05, + "loss": 0.4552, + "step": 226800 + }, + { + "epoch": 2.005074347141923, + "grad_norm": 3.95511794090271, + "learning_rate": 1.6582094214301293e-05, + "loss": 0.5348, + "step": 226810 + }, + { + "epoch": 2.005162750402235, + "grad_norm": 4.893316268920898, + "learning_rate": 1.658062082662942e-05, + "loss": 0.5603, + "step": 226820 + }, + { + "epoch": 2.005251153662547, + "grad_norm": 2.347336769104004, + "learning_rate": 1.657914743895755e-05, + "loss": 0.5761, + "step": 226830 + }, + { + "epoch": 2.005339556922859, + "grad_norm": 6.9926629066467285, + "learning_rate": 1.657767405128568e-05, + "loss": 0.5634, + "step": 226840 + }, + { + "epoch": 2.0054279601831717, + "grad_norm": 1.7365716695785522, + "learning_rate": 1.657620066361381e-05, + "loss": 0.5252, + "step": 226850 + }, + { + "epoch": 2.005516363443484, + "grad_norm": 2.2131664752960205, + "learning_rate": 1.6574727275941938e-05, + "loss": 0.6483, + "step": 226860 + }, + { + "epoch": 2.005604766703796, + "grad_norm": 7.041866302490234, + "learning_rate": 1.6573253888270066e-05, + "loss": 0.5209, + "step": 226870 + }, + { + "epoch": 2.005693169964108, + "grad_norm": 2.14982271194458, + "learning_rate": 1.6571780500598198e-05, + "loss": 0.6255, + "step": 226880 + }, + { + "epoch": 2.0057815732244206, + "grad_norm": 2.870508909225464, + "learning_rate": 1.6570307112926326e-05, + "loss": 0.5227, + "step": 226890 + }, + { + "epoch": 2.0058699764847328, + "grad_norm": 2.2876856327056885, + "learning_rate": 1.6568833725254455e-05, + "loss": 0.5065, + "step": 226900 + }, + { + "epoch": 2.005958379745045, + "grad_norm": 2.1253890991210938, + "learning_rate": 1.6567360337582586e-05, + "loss": 0.5136, + "step": 226910 + }, + { + "epoch": 2.0060467830053574, + "grad_norm": 2.4570226669311523, + "learning_rate": 1.6565886949910715e-05, + "loss": 0.371, + "step": 226920 + }, + { + "epoch": 2.0061351862656696, + "grad_norm": 2.8253185749053955, + "learning_rate": 1.6564413562238843e-05, + "loss": 0.5384, + "step": 226930 + }, + { + "epoch": 2.0062235895259817, + "grad_norm": 4.985726356506348, + "learning_rate": 1.656294017456697e-05, + "loss": 0.5847, + "step": 226940 + }, + { + "epoch": 2.006311992786294, + "grad_norm": 5.122780799865723, + "learning_rate": 1.6561466786895103e-05, + "loss": 0.574, + "step": 226950 + }, + { + "epoch": 2.0064003960466064, + "grad_norm": 0.8131890296936035, + "learning_rate": 1.655999339922323e-05, + "loss": 0.5828, + "step": 226960 + }, + { + "epoch": 2.0064887993069185, + "grad_norm": 3.6763460636138916, + "learning_rate": 1.655852001155136e-05, + "loss": 0.6303, + "step": 226970 + }, + { + "epoch": 2.0065772025672306, + "grad_norm": 4.815425395965576, + "learning_rate": 1.655704662387949e-05, + "loss": 0.4967, + "step": 226980 + }, + { + "epoch": 2.0066656058275427, + "grad_norm": 2.031402587890625, + "learning_rate": 1.655557323620762e-05, + "loss": 0.6119, + "step": 226990 + }, + { + "epoch": 2.0067540090878553, + "grad_norm": 2.284585475921631, + "learning_rate": 1.6554099848535748e-05, + "loss": 0.5346, + "step": 227000 + }, + { + "epoch": 2.0068424123481674, + "grad_norm": 3.7399697303771973, + "learning_rate": 1.6552626460863877e-05, + "loss": 0.643, + "step": 227010 + }, + { + "epoch": 2.0069308156084795, + "grad_norm": 3.23677134513855, + "learning_rate": 1.655115307319201e-05, + "loss": 0.572, + "step": 227020 + }, + { + "epoch": 2.007019218868792, + "grad_norm": 3.547565460205078, + "learning_rate": 1.6549679685520137e-05, + "loss": 0.5607, + "step": 227030 + }, + { + "epoch": 2.007107622129104, + "grad_norm": 1.7100212574005127, + "learning_rate": 1.6548206297848265e-05, + "loss": 0.6112, + "step": 227040 + }, + { + "epoch": 2.0071960253894163, + "grad_norm": 3.0017237663269043, + "learning_rate": 1.6546732910176393e-05, + "loss": 0.6235, + "step": 227050 + }, + { + "epoch": 2.0072844286497284, + "grad_norm": 2.881420850753784, + "learning_rate": 1.6545259522504525e-05, + "loss": 0.6315, + "step": 227060 + }, + { + "epoch": 2.007372831910041, + "grad_norm": 1.7031633853912354, + "learning_rate": 1.6543786134832654e-05, + "loss": 0.4615, + "step": 227070 + }, + { + "epoch": 2.007461235170353, + "grad_norm": 2.438322067260742, + "learning_rate": 1.6542312747160782e-05, + "loss": 0.6254, + "step": 227080 + }, + { + "epoch": 2.0075496384306653, + "grad_norm": 6.31378698348999, + "learning_rate": 1.6540839359488914e-05, + "loss": 0.7021, + "step": 227090 + }, + { + "epoch": 2.0076380416909774, + "grad_norm": 16.334516525268555, + "learning_rate": 1.6539365971817042e-05, + "loss": 0.6365, + "step": 227100 + }, + { + "epoch": 2.00772644495129, + "grad_norm": 2.1810507774353027, + "learning_rate": 1.653789258414517e-05, + "loss": 0.4821, + "step": 227110 + }, + { + "epoch": 2.007814848211602, + "grad_norm": 2.845224380493164, + "learning_rate": 1.65364191964733e-05, + "loss": 0.5724, + "step": 227120 + }, + { + "epoch": 2.007903251471914, + "grad_norm": 2.0504610538482666, + "learning_rate": 1.653494580880143e-05, + "loss": 0.4934, + "step": 227130 + }, + { + "epoch": 2.0079916547322267, + "grad_norm": 0.9096835255622864, + "learning_rate": 1.653347242112956e-05, + "loss": 0.5169, + "step": 227140 + }, + { + "epoch": 2.008080057992539, + "grad_norm": 3.366948366165161, + "learning_rate": 1.6531999033457687e-05, + "loss": 0.687, + "step": 227150 + }, + { + "epoch": 2.008168461252851, + "grad_norm": 1.6514769792556763, + "learning_rate": 1.653052564578582e-05, + "loss": 0.7267, + "step": 227160 + }, + { + "epoch": 2.008256864513163, + "grad_norm": 1.1792739629745483, + "learning_rate": 1.6529052258113947e-05, + "loss": 0.4379, + "step": 227170 + }, + { + "epoch": 2.0083452677734757, + "grad_norm": 2.729915142059326, + "learning_rate": 1.6527578870442076e-05, + "loss": 0.6659, + "step": 227180 + }, + { + "epoch": 2.008433671033788, + "grad_norm": 2.1973369121551514, + "learning_rate": 1.6526105482770204e-05, + "loss": 0.5277, + "step": 227190 + }, + { + "epoch": 2.0085220742941, + "grad_norm": 5.331556797027588, + "learning_rate": 1.6524632095098336e-05, + "loss": 0.5456, + "step": 227200 + }, + { + "epoch": 2.008610477554412, + "grad_norm": 1.4179096221923828, + "learning_rate": 1.6523158707426464e-05, + "loss": 0.3942, + "step": 227210 + }, + { + "epoch": 2.0086988808147246, + "grad_norm": 2.709688663482666, + "learning_rate": 1.6521685319754592e-05, + "loss": 0.724, + "step": 227220 + }, + { + "epoch": 2.0087872840750367, + "grad_norm": 1.9495868682861328, + "learning_rate": 1.652021193208272e-05, + "loss": 0.4749, + "step": 227230 + }, + { + "epoch": 2.008875687335349, + "grad_norm": 4.923674583435059, + "learning_rate": 1.6518738544410852e-05, + "loss": 0.5786, + "step": 227240 + }, + { + "epoch": 2.0089640905956614, + "grad_norm": 1.1810909509658813, + "learning_rate": 1.651726515673898e-05, + "loss": 0.5628, + "step": 227250 + }, + { + "epoch": 2.0090524938559735, + "grad_norm": 1.0802507400512695, + "learning_rate": 1.651579176906711e-05, + "loss": 0.5375, + "step": 227260 + }, + { + "epoch": 2.0091408971162856, + "grad_norm": 3.592555046081543, + "learning_rate": 1.651431838139524e-05, + "loss": 0.6352, + "step": 227270 + }, + { + "epoch": 2.0092293003765977, + "grad_norm": 3.3080804347991943, + "learning_rate": 1.651284499372337e-05, + "loss": 0.5522, + "step": 227280 + }, + { + "epoch": 2.0093177036369103, + "grad_norm": 1.9104849100112915, + "learning_rate": 1.6511371606051498e-05, + "loss": 0.569, + "step": 227290 + }, + { + "epoch": 2.0094061068972224, + "grad_norm": 2.5691516399383545, + "learning_rate": 1.6509898218379626e-05, + "loss": 0.522, + "step": 227300 + }, + { + "epoch": 2.0094945101575346, + "grad_norm": 2.498119592666626, + "learning_rate": 1.6508424830707758e-05, + "loss": 0.6434, + "step": 227310 + }, + { + "epoch": 2.0095829134178467, + "grad_norm": 5.327476978302002, + "learning_rate": 1.6506951443035886e-05, + "loss": 0.5519, + "step": 227320 + }, + { + "epoch": 2.0096713166781592, + "grad_norm": 4.476630210876465, + "learning_rate": 1.6505478055364014e-05, + "loss": 0.5052, + "step": 227330 + }, + { + "epoch": 2.0097597199384714, + "grad_norm": 6.138433456420898, + "learning_rate": 1.6504004667692143e-05, + "loss": 0.5323, + "step": 227340 + }, + { + "epoch": 2.0098481231987835, + "grad_norm": 2.814666509628296, + "learning_rate": 1.6502531280020275e-05, + "loss": 0.5128, + "step": 227350 + }, + { + "epoch": 2.0099365264590956, + "grad_norm": 4.052055835723877, + "learning_rate": 1.6501057892348403e-05, + "loss": 0.4138, + "step": 227360 + }, + { + "epoch": 2.010024929719408, + "grad_norm": 1.7143186330795288, + "learning_rate": 1.649958450467653e-05, + "loss": 0.4506, + "step": 227370 + }, + { + "epoch": 2.0101133329797203, + "grad_norm": 2.9098987579345703, + "learning_rate": 1.6498111117004663e-05, + "loss": 0.6141, + "step": 227380 + }, + { + "epoch": 2.0102017362400324, + "grad_norm": 3.0465667247772217, + "learning_rate": 1.649663772933279e-05, + "loss": 0.4896, + "step": 227390 + }, + { + "epoch": 2.010290139500345, + "grad_norm": 2.38472843170166, + "learning_rate": 1.649516434166092e-05, + "loss": 0.7076, + "step": 227400 + }, + { + "epoch": 2.010378542760657, + "grad_norm": 7.273229598999023, + "learning_rate": 1.6493690953989048e-05, + "loss": 0.5705, + "step": 227410 + }, + { + "epoch": 2.010466946020969, + "grad_norm": 1.634156584739685, + "learning_rate": 1.649221756631718e-05, + "loss": 0.5242, + "step": 227420 + }, + { + "epoch": 2.0105553492812813, + "grad_norm": 3.0493197441101074, + "learning_rate": 1.6490744178645308e-05, + "loss": 0.4826, + "step": 227430 + }, + { + "epoch": 2.010643752541594, + "grad_norm": 2.6604769229888916, + "learning_rate": 1.6489270790973436e-05, + "loss": 0.6233, + "step": 227440 + }, + { + "epoch": 2.010732155801906, + "grad_norm": 1.4654669761657715, + "learning_rate": 1.6487797403301568e-05, + "loss": 0.4838, + "step": 227450 + }, + { + "epoch": 2.010820559062218, + "grad_norm": 1.9044148921966553, + "learning_rate": 1.6486324015629697e-05, + "loss": 0.5075, + "step": 227460 + }, + { + "epoch": 2.0109089623225302, + "grad_norm": 1.9035595655441284, + "learning_rate": 1.6484850627957825e-05, + "loss": 0.6017, + "step": 227470 + }, + { + "epoch": 2.010997365582843, + "grad_norm": 1.1150312423706055, + "learning_rate": 1.6483377240285953e-05, + "loss": 0.5781, + "step": 227480 + }, + { + "epoch": 2.011085768843155, + "grad_norm": 1.375363826751709, + "learning_rate": 1.6481903852614085e-05, + "loss": 0.5594, + "step": 227490 + }, + { + "epoch": 2.011174172103467, + "grad_norm": 3.8840715885162354, + "learning_rate": 1.6480430464942213e-05, + "loss": 0.4573, + "step": 227500 + }, + { + "epoch": 2.0112625753637796, + "grad_norm": 33.09034729003906, + "learning_rate": 1.6478957077270342e-05, + "loss": 0.4896, + "step": 227510 + }, + { + "epoch": 2.0113509786240917, + "grad_norm": 2.974626302719116, + "learning_rate": 1.6477483689598473e-05, + "loss": 0.6094, + "step": 227520 + }, + { + "epoch": 2.011439381884404, + "grad_norm": 0.8014130592346191, + "learning_rate": 1.6476010301926602e-05, + "loss": 0.4753, + "step": 227530 + }, + { + "epoch": 2.011527785144716, + "grad_norm": 23.17989730834961, + "learning_rate": 1.647453691425473e-05, + "loss": 0.5512, + "step": 227540 + }, + { + "epoch": 2.0116161884050285, + "grad_norm": 2.1626498699188232, + "learning_rate": 1.6473063526582862e-05, + "loss": 0.5326, + "step": 227550 + }, + { + "epoch": 2.0117045916653407, + "grad_norm": 3.8709895610809326, + "learning_rate": 1.647159013891099e-05, + "loss": 0.4783, + "step": 227560 + }, + { + "epoch": 2.0117929949256528, + "grad_norm": 1.1059939861297607, + "learning_rate": 1.647011675123912e-05, + "loss": 0.5878, + "step": 227570 + }, + { + "epoch": 2.011881398185965, + "grad_norm": 1.666608214378357, + "learning_rate": 1.646864336356725e-05, + "loss": 0.452, + "step": 227580 + }, + { + "epoch": 2.0119698014462775, + "grad_norm": 2.9405999183654785, + "learning_rate": 1.646716997589538e-05, + "loss": 0.5087, + "step": 227590 + }, + { + "epoch": 2.0120582047065896, + "grad_norm": 4.023858547210693, + "learning_rate": 1.6465696588223507e-05, + "loss": 0.6071, + "step": 227600 + }, + { + "epoch": 2.0121466079669017, + "grad_norm": 1.6006312370300293, + "learning_rate": 1.646422320055164e-05, + "loss": 0.6083, + "step": 227610 + }, + { + "epoch": 2.0122350112272143, + "grad_norm": 1.4520686864852905, + "learning_rate": 1.6462749812879767e-05, + "loss": 0.5611, + "step": 227620 + }, + { + "epoch": 2.0123234144875264, + "grad_norm": 0.6792407631874084, + "learning_rate": 1.6461276425207896e-05, + "loss": 0.525, + "step": 227630 + }, + { + "epoch": 2.0124118177478385, + "grad_norm": 7.04536247253418, + "learning_rate": 1.6459803037536027e-05, + "loss": 0.4969, + "step": 227640 + }, + { + "epoch": 2.0125002210081506, + "grad_norm": 8.498085021972656, + "learning_rate": 1.6458329649864156e-05, + "loss": 0.429, + "step": 227650 + }, + { + "epoch": 2.012588624268463, + "grad_norm": 4.153021812438965, + "learning_rate": 1.6456856262192284e-05, + "loss": 0.5755, + "step": 227660 + }, + { + "epoch": 2.0126770275287753, + "grad_norm": 1.899706244468689, + "learning_rate": 1.6455382874520416e-05, + "loss": 0.5708, + "step": 227670 + }, + { + "epoch": 2.0127654307890874, + "grad_norm": 1.677290439605713, + "learning_rate": 1.6453909486848544e-05, + "loss": 0.4273, + "step": 227680 + }, + { + "epoch": 2.0128538340493995, + "grad_norm": 2.5684056282043457, + "learning_rate": 1.6452436099176672e-05, + "loss": 0.5815, + "step": 227690 + }, + { + "epoch": 2.012942237309712, + "grad_norm": 3.4926180839538574, + "learning_rate": 1.64509627115048e-05, + "loss": 0.5659, + "step": 227700 + }, + { + "epoch": 2.0130306405700242, + "grad_norm": 2.7543869018554688, + "learning_rate": 1.6449489323832933e-05, + "loss": 0.7153, + "step": 227710 + }, + { + "epoch": 2.0131190438303364, + "grad_norm": 2.20570707321167, + "learning_rate": 1.644801593616106e-05, + "loss": 0.3947, + "step": 227720 + }, + { + "epoch": 2.013207447090649, + "grad_norm": 4.9663920402526855, + "learning_rate": 1.644654254848919e-05, + "loss": 0.3981, + "step": 227730 + }, + { + "epoch": 2.013295850350961, + "grad_norm": 1.3579005002975464, + "learning_rate": 1.644506916081732e-05, + "loss": 0.5236, + "step": 227740 + }, + { + "epoch": 2.013384253611273, + "grad_norm": 9.973109245300293, + "learning_rate": 1.644359577314545e-05, + "loss": 0.536, + "step": 227750 + }, + { + "epoch": 2.0134726568715853, + "grad_norm": 10.4425048828125, + "learning_rate": 1.6442122385473578e-05, + "loss": 0.5622, + "step": 227760 + }, + { + "epoch": 2.013561060131898, + "grad_norm": 3.2580654621124268, + "learning_rate": 1.6440648997801706e-05, + "loss": 0.5668, + "step": 227770 + }, + { + "epoch": 2.01364946339221, + "grad_norm": 1.3026891946792603, + "learning_rate": 1.6439175610129838e-05, + "loss": 0.6028, + "step": 227780 + }, + { + "epoch": 2.013737866652522, + "grad_norm": 2.313434362411499, + "learning_rate": 1.6437702222457966e-05, + "loss": 0.6031, + "step": 227790 + }, + { + "epoch": 2.013826269912834, + "grad_norm": 3.5553131103515625, + "learning_rate": 1.6436228834786094e-05, + "loss": 0.5657, + "step": 227800 + }, + { + "epoch": 2.0139146731731468, + "grad_norm": 0.835229218006134, + "learning_rate": 1.6434755447114223e-05, + "loss": 0.5509, + "step": 227810 + }, + { + "epoch": 2.014003076433459, + "grad_norm": 5.415505886077881, + "learning_rate": 1.6433282059442355e-05, + "loss": 0.5286, + "step": 227820 + }, + { + "epoch": 2.014091479693771, + "grad_norm": 1.727968454360962, + "learning_rate": 1.6431808671770483e-05, + "loss": 0.4643, + "step": 227830 + }, + { + "epoch": 2.0141798829540836, + "grad_norm": 17.442506790161133, + "learning_rate": 1.643033528409861e-05, + "loss": 0.5278, + "step": 227840 + }, + { + "epoch": 2.0142682862143957, + "grad_norm": 12.224085807800293, + "learning_rate": 1.6428861896426743e-05, + "loss": 0.4699, + "step": 227850 + }, + { + "epoch": 2.014356689474708, + "grad_norm": 3.2613401412963867, + "learning_rate": 1.642738850875487e-05, + "loss": 0.5843, + "step": 227860 + }, + { + "epoch": 2.01444509273502, + "grad_norm": 1.1799752712249756, + "learning_rate": 1.6425915121083e-05, + "loss": 0.4491, + "step": 227870 + }, + { + "epoch": 2.0145334959953325, + "grad_norm": 9.673839569091797, + "learning_rate": 1.6424441733411128e-05, + "loss": 0.568, + "step": 227880 + }, + { + "epoch": 2.0146218992556446, + "grad_norm": 6.619576930999756, + "learning_rate": 1.642296834573926e-05, + "loss": 0.5526, + "step": 227890 + }, + { + "epoch": 2.0147103025159567, + "grad_norm": 2.2731289863586426, + "learning_rate": 1.6421494958067388e-05, + "loss": 0.5061, + "step": 227900 + }, + { + "epoch": 2.014798705776269, + "grad_norm": 2.8537862300872803, + "learning_rate": 1.6420021570395517e-05, + "loss": 0.6701, + "step": 227910 + }, + { + "epoch": 2.0148871090365814, + "grad_norm": 6.214486598968506, + "learning_rate": 1.6418548182723648e-05, + "loss": 0.451, + "step": 227920 + }, + { + "epoch": 2.0149755122968935, + "grad_norm": 1.036894679069519, + "learning_rate": 1.6417074795051777e-05, + "loss": 0.6088, + "step": 227930 + }, + { + "epoch": 2.0150639155572057, + "grad_norm": 2.699188709259033, + "learning_rate": 1.6415601407379905e-05, + "loss": 0.5631, + "step": 227940 + }, + { + "epoch": 2.0151523188175178, + "grad_norm": 3.862405776977539, + "learning_rate": 1.6414128019708033e-05, + "loss": 0.5338, + "step": 227950 + }, + { + "epoch": 2.0152407220778303, + "grad_norm": 3.273679494857788, + "learning_rate": 1.6412654632036165e-05, + "loss": 0.5928, + "step": 227960 + }, + { + "epoch": 2.0153291253381425, + "grad_norm": 1.4407564401626587, + "learning_rate": 1.6411181244364293e-05, + "loss": 0.5584, + "step": 227970 + }, + { + "epoch": 2.0154175285984546, + "grad_norm": 1.5911240577697754, + "learning_rate": 1.6409707856692422e-05, + "loss": 0.547, + "step": 227980 + }, + { + "epoch": 2.015505931858767, + "grad_norm": 1.2769109010696411, + "learning_rate": 1.640823446902055e-05, + "loss": 0.5782, + "step": 227990 + }, + { + "epoch": 2.0155943351190793, + "grad_norm": 1.0915662050247192, + "learning_rate": 1.6406761081348682e-05, + "loss": 0.6218, + "step": 228000 + }, + { + "epoch": 2.0156827383793914, + "grad_norm": 3.4682273864746094, + "learning_rate": 1.640528769367681e-05, + "loss": 0.4545, + "step": 228010 + }, + { + "epoch": 2.0157711416397035, + "grad_norm": 2.1165151596069336, + "learning_rate": 1.640381430600494e-05, + "loss": 0.5551, + "step": 228020 + }, + { + "epoch": 2.015859544900016, + "grad_norm": 2.606995105743408, + "learning_rate": 1.640234091833307e-05, + "loss": 0.5178, + "step": 228030 + }, + { + "epoch": 2.015947948160328, + "grad_norm": 4.12337589263916, + "learning_rate": 1.64008675306612e-05, + "loss": 0.5324, + "step": 228040 + }, + { + "epoch": 2.0160363514206403, + "grad_norm": 2.379568338394165, + "learning_rate": 1.6399394142989327e-05, + "loss": 0.7434, + "step": 228050 + }, + { + "epoch": 2.0161247546809524, + "grad_norm": 2.7963380813598633, + "learning_rate": 1.6397920755317455e-05, + "loss": 0.5625, + "step": 228060 + }, + { + "epoch": 2.016213157941265, + "grad_norm": 2.659884214401245, + "learning_rate": 1.6396447367645587e-05, + "loss": 0.4516, + "step": 228070 + }, + { + "epoch": 2.016301561201577, + "grad_norm": 2.048354387283325, + "learning_rate": 1.6394973979973716e-05, + "loss": 0.588, + "step": 228080 + }, + { + "epoch": 2.0163899644618892, + "grad_norm": 2.9511985778808594, + "learning_rate": 1.6393500592301844e-05, + "loss": 0.6976, + "step": 228090 + }, + { + "epoch": 2.016478367722202, + "grad_norm": 2.2042236328125, + "learning_rate": 1.6392027204629976e-05, + "loss": 0.4464, + "step": 228100 + }, + { + "epoch": 2.016566770982514, + "grad_norm": 2.330000877380371, + "learning_rate": 1.6390553816958104e-05, + "loss": 0.5334, + "step": 228110 + }, + { + "epoch": 2.016655174242826, + "grad_norm": 4.5644917488098145, + "learning_rate": 1.6389080429286232e-05, + "loss": 0.4493, + "step": 228120 + }, + { + "epoch": 2.016743577503138, + "grad_norm": 1.1940789222717285, + "learning_rate": 1.638760704161436e-05, + "loss": 0.4211, + "step": 228130 + }, + { + "epoch": 2.0168319807634507, + "grad_norm": 0.971814751625061, + "learning_rate": 1.6386133653942492e-05, + "loss": 0.4788, + "step": 228140 + }, + { + "epoch": 2.016920384023763, + "grad_norm": 1.2763389348983765, + "learning_rate": 1.638466026627062e-05, + "loss": 0.5389, + "step": 228150 + }, + { + "epoch": 2.017008787284075, + "grad_norm": 0.7047460079193115, + "learning_rate": 1.638318687859875e-05, + "loss": 0.5869, + "step": 228160 + }, + { + "epoch": 2.017097190544387, + "grad_norm": 4.484645366668701, + "learning_rate": 1.6381713490926877e-05, + "loss": 0.4268, + "step": 228170 + }, + { + "epoch": 2.0171855938046996, + "grad_norm": 2.5888359546661377, + "learning_rate": 1.638024010325501e-05, + "loss": 0.486, + "step": 228180 + }, + { + "epoch": 2.0172739970650118, + "grad_norm": 3.154191017150879, + "learning_rate": 1.6378766715583138e-05, + "loss": 0.5091, + "step": 228190 + }, + { + "epoch": 2.017362400325324, + "grad_norm": 1.6884115934371948, + "learning_rate": 1.6377293327911266e-05, + "loss": 0.5861, + "step": 228200 + }, + { + "epoch": 2.0174508035856364, + "grad_norm": 5.907921314239502, + "learning_rate": 1.6375819940239398e-05, + "loss": 0.5384, + "step": 228210 + }, + { + "epoch": 2.0175392068459486, + "grad_norm": 9.519783020019531, + "learning_rate": 1.6374346552567526e-05, + "loss": 0.4146, + "step": 228220 + }, + { + "epoch": 2.0176276101062607, + "grad_norm": 2.3608474731445312, + "learning_rate": 1.6372873164895654e-05, + "loss": 0.5274, + "step": 228230 + }, + { + "epoch": 2.017716013366573, + "grad_norm": 5.098592758178711, + "learning_rate": 1.6371399777223783e-05, + "loss": 0.7203, + "step": 228240 + }, + { + "epoch": 2.0178044166268854, + "grad_norm": 1.7296195030212402, + "learning_rate": 1.6369926389551914e-05, + "loss": 0.4959, + "step": 228250 + }, + { + "epoch": 2.0178928198871975, + "grad_norm": 4.513619422912598, + "learning_rate": 1.6368453001880043e-05, + "loss": 0.5093, + "step": 228260 + }, + { + "epoch": 2.0179812231475096, + "grad_norm": 2.996067523956299, + "learning_rate": 1.636697961420817e-05, + "loss": 0.5261, + "step": 228270 + }, + { + "epoch": 2.0180696264078217, + "grad_norm": 2.799471855163574, + "learning_rate": 1.63655062265363e-05, + "loss": 0.5919, + "step": 228280 + }, + { + "epoch": 2.0181580296681343, + "grad_norm": 5.429481029510498, + "learning_rate": 1.636403283886443e-05, + "loss": 0.6173, + "step": 228290 + }, + { + "epoch": 2.0182464329284464, + "grad_norm": 3.862074851989746, + "learning_rate": 1.636255945119256e-05, + "loss": 0.5796, + "step": 228300 + }, + { + "epoch": 2.0183348361887585, + "grad_norm": 1.4561879634857178, + "learning_rate": 1.6361086063520688e-05, + "loss": 0.5829, + "step": 228310 + }, + { + "epoch": 2.018423239449071, + "grad_norm": 9.587409019470215, + "learning_rate": 1.635961267584882e-05, + "loss": 0.6406, + "step": 228320 + }, + { + "epoch": 2.018511642709383, + "grad_norm": 2.2412660121917725, + "learning_rate": 1.6358139288176948e-05, + "loss": 0.5229, + "step": 228330 + }, + { + "epoch": 2.0186000459696953, + "grad_norm": 3.4114112854003906, + "learning_rate": 1.6356665900505076e-05, + "loss": 0.5496, + "step": 228340 + }, + { + "epoch": 2.0186884492300075, + "grad_norm": 6.944517612457275, + "learning_rate": 1.6355192512833205e-05, + "loss": 0.527, + "step": 228350 + }, + { + "epoch": 2.01877685249032, + "grad_norm": 9.703253746032715, + "learning_rate": 1.6353719125161337e-05, + "loss": 0.5583, + "step": 228360 + }, + { + "epoch": 2.018865255750632, + "grad_norm": 2.1236536502838135, + "learning_rate": 1.6352245737489465e-05, + "loss": 0.4558, + "step": 228370 + }, + { + "epoch": 2.0189536590109443, + "grad_norm": 3.578543186187744, + "learning_rate": 1.6350772349817593e-05, + "loss": 0.4549, + "step": 228380 + }, + { + "epoch": 2.0190420622712564, + "grad_norm": 2.0467212200164795, + "learning_rate": 1.6349298962145725e-05, + "loss": 0.4517, + "step": 228390 + }, + { + "epoch": 2.019130465531569, + "grad_norm": 1.3647750616073608, + "learning_rate": 1.6347825574473853e-05, + "loss": 0.5527, + "step": 228400 + }, + { + "epoch": 2.019218868791881, + "grad_norm": 2.7362964153289795, + "learning_rate": 1.634635218680198e-05, + "loss": 0.5425, + "step": 228410 + }, + { + "epoch": 2.019307272052193, + "grad_norm": 6.816350936889648, + "learning_rate": 1.634487879913011e-05, + "loss": 0.6023, + "step": 228420 + }, + { + "epoch": 2.0193956753125057, + "grad_norm": 3.7436351776123047, + "learning_rate": 1.6343405411458242e-05, + "loss": 0.6986, + "step": 228430 + }, + { + "epoch": 2.019484078572818, + "grad_norm": 3.248399496078491, + "learning_rate": 1.634193202378637e-05, + "loss": 0.6372, + "step": 228440 + }, + { + "epoch": 2.01957248183313, + "grad_norm": 5.249237060546875, + "learning_rate": 1.63404586361145e-05, + "loss": 0.4565, + "step": 228450 + }, + { + "epoch": 2.019660885093442, + "grad_norm": 2.8938376903533936, + "learning_rate": 1.633898524844263e-05, + "loss": 0.5514, + "step": 228460 + }, + { + "epoch": 2.0197492883537547, + "grad_norm": 1.290312647819519, + "learning_rate": 1.633751186077076e-05, + "loss": 0.6316, + "step": 228470 + }, + { + "epoch": 2.019837691614067, + "grad_norm": 3.1669769287109375, + "learning_rate": 1.6336038473098887e-05, + "loss": 0.5728, + "step": 228480 + }, + { + "epoch": 2.019926094874379, + "grad_norm": 2.6292402744293213, + "learning_rate": 1.633456508542702e-05, + "loss": 0.5222, + "step": 228490 + }, + { + "epoch": 2.020014498134691, + "grad_norm": 0.7374580502510071, + "learning_rate": 1.6333091697755147e-05, + "loss": 0.4178, + "step": 228500 + }, + { + "epoch": 2.0201029013950036, + "grad_norm": 0.5411592125892639, + "learning_rate": 1.6331618310083275e-05, + "loss": 0.5138, + "step": 228510 + }, + { + "epoch": 2.0201913046553157, + "grad_norm": 1.7475320100784302, + "learning_rate": 1.6330144922411407e-05, + "loss": 0.512, + "step": 228520 + }, + { + "epoch": 2.020279707915628, + "grad_norm": 11.850410461425781, + "learning_rate": 1.6328671534739535e-05, + "loss": 0.4531, + "step": 228530 + }, + { + "epoch": 2.02036811117594, + "grad_norm": 7.558352947235107, + "learning_rate": 1.6327198147067664e-05, + "loss": 0.6294, + "step": 228540 + }, + { + "epoch": 2.0204565144362525, + "grad_norm": 6.2877302169799805, + "learning_rate": 1.6325724759395796e-05, + "loss": 0.5224, + "step": 228550 + }, + { + "epoch": 2.0205449176965646, + "grad_norm": 9.639472961425781, + "learning_rate": 1.6324251371723924e-05, + "loss": 0.3981, + "step": 228560 + }, + { + "epoch": 2.0206333209568768, + "grad_norm": 2.081186294555664, + "learning_rate": 1.6322777984052052e-05, + "loss": 0.4902, + "step": 228570 + }, + { + "epoch": 2.0207217242171893, + "grad_norm": 1.9963366985321045, + "learning_rate": 1.6321304596380184e-05, + "loss": 0.5804, + "step": 228580 + }, + { + "epoch": 2.0208101274775014, + "grad_norm": 12.622114181518555, + "learning_rate": 1.6319831208708312e-05, + "loss": 0.5679, + "step": 228590 + }, + { + "epoch": 2.0208985307378136, + "grad_norm": 1.4177680015563965, + "learning_rate": 1.631835782103644e-05, + "loss": 0.4866, + "step": 228600 + }, + { + "epoch": 2.0209869339981257, + "grad_norm": 3.935985803604126, + "learning_rate": 1.6316884433364572e-05, + "loss": 0.6238, + "step": 228610 + }, + { + "epoch": 2.0210753372584382, + "grad_norm": 4.469666481018066, + "learning_rate": 1.63154110456927e-05, + "loss": 0.6369, + "step": 228620 + }, + { + "epoch": 2.0211637405187504, + "grad_norm": 2.5741097927093506, + "learning_rate": 1.631393765802083e-05, + "loss": 0.5018, + "step": 228630 + }, + { + "epoch": 2.0212521437790625, + "grad_norm": 3.194094181060791, + "learning_rate": 1.6312464270348958e-05, + "loss": 0.5554, + "step": 228640 + }, + { + "epoch": 2.0213405470393746, + "grad_norm": 1.7015621662139893, + "learning_rate": 1.631099088267709e-05, + "loss": 0.6713, + "step": 228650 + }, + { + "epoch": 2.021428950299687, + "grad_norm": 1.451069951057434, + "learning_rate": 1.6309517495005218e-05, + "loss": 0.57, + "step": 228660 + }, + { + "epoch": 2.0215173535599993, + "grad_norm": 5.557995796203613, + "learning_rate": 1.6308044107333346e-05, + "loss": 0.4164, + "step": 228670 + }, + { + "epoch": 2.0216057568203114, + "grad_norm": 4.164958477020264, + "learning_rate": 1.6306570719661478e-05, + "loss": 0.4591, + "step": 228680 + }, + { + "epoch": 2.021694160080624, + "grad_norm": 2.0704448223114014, + "learning_rate": 1.6305097331989606e-05, + "loss": 0.6275, + "step": 228690 + }, + { + "epoch": 2.021782563340936, + "grad_norm": 8.492364883422852, + "learning_rate": 1.6303623944317734e-05, + "loss": 0.5531, + "step": 228700 + }, + { + "epoch": 2.021870966601248, + "grad_norm": 4.493282318115234, + "learning_rate": 1.6302150556645863e-05, + "loss": 0.5746, + "step": 228710 + }, + { + "epoch": 2.0219593698615603, + "grad_norm": 2.9381349086761475, + "learning_rate": 1.6300677168973995e-05, + "loss": 0.517, + "step": 228720 + }, + { + "epoch": 2.022047773121873, + "grad_norm": 1.287728190422058, + "learning_rate": 1.6299203781302123e-05, + "loss": 0.5225, + "step": 228730 + }, + { + "epoch": 2.022136176382185, + "grad_norm": 1.636979579925537, + "learning_rate": 1.629773039363025e-05, + "loss": 0.4316, + "step": 228740 + }, + { + "epoch": 2.022224579642497, + "grad_norm": 3.075153350830078, + "learning_rate": 1.6296257005958383e-05, + "loss": 0.4355, + "step": 228750 + }, + { + "epoch": 2.0223129829028093, + "grad_norm": 5.874175548553467, + "learning_rate": 1.629478361828651e-05, + "loss": 0.6777, + "step": 228760 + }, + { + "epoch": 2.022401386163122, + "grad_norm": 4.0939040184021, + "learning_rate": 1.629331023061464e-05, + "loss": 0.452, + "step": 228770 + }, + { + "epoch": 2.022489789423434, + "grad_norm": 2.2117245197296143, + "learning_rate": 1.6291836842942768e-05, + "loss": 0.5496, + "step": 228780 + }, + { + "epoch": 2.022578192683746, + "grad_norm": 1.938408374786377, + "learning_rate": 1.62903634552709e-05, + "loss": 0.4294, + "step": 228790 + }, + { + "epoch": 2.0226665959440586, + "grad_norm": 0.6895377039909363, + "learning_rate": 1.6288890067599028e-05, + "loss": 0.5343, + "step": 228800 + }, + { + "epoch": 2.0227549992043707, + "grad_norm": 1.6633960008621216, + "learning_rate": 1.6287416679927156e-05, + "loss": 0.5003, + "step": 228810 + }, + { + "epoch": 2.022843402464683, + "grad_norm": 1.0484527349472046, + "learning_rate": 1.6285943292255285e-05, + "loss": 0.6513, + "step": 228820 + }, + { + "epoch": 2.022931805724995, + "grad_norm": 9.511815071105957, + "learning_rate": 1.6284469904583417e-05, + "loss": 0.5601, + "step": 228830 + }, + { + "epoch": 2.0230202089853075, + "grad_norm": 2.767503261566162, + "learning_rate": 1.6282996516911545e-05, + "loss": 0.4953, + "step": 228840 + }, + { + "epoch": 2.0231086122456197, + "grad_norm": 1.9104701280593872, + "learning_rate": 1.6281523129239673e-05, + "loss": 0.5541, + "step": 228850 + }, + { + "epoch": 2.023197015505932, + "grad_norm": 5.879551410675049, + "learning_rate": 1.6280049741567805e-05, + "loss": 0.5663, + "step": 228860 + }, + { + "epoch": 2.023285418766244, + "grad_norm": 2.3052818775177, + "learning_rate": 1.6278576353895933e-05, + "loss": 0.5639, + "step": 228870 + }, + { + "epoch": 2.0233738220265565, + "grad_norm": 2.7867023944854736, + "learning_rate": 1.6277102966224062e-05, + "loss": 0.4681, + "step": 228880 + }, + { + "epoch": 2.0234622252868686, + "grad_norm": 1.1979434490203857, + "learning_rate": 1.627562957855219e-05, + "loss": 0.4299, + "step": 228890 + }, + { + "epoch": 2.0235506285471807, + "grad_norm": 19.023008346557617, + "learning_rate": 1.6274156190880322e-05, + "loss": 0.4977, + "step": 228900 + }, + { + "epoch": 2.0236390318074933, + "grad_norm": 5.373170852661133, + "learning_rate": 1.627268280320845e-05, + "loss": 0.5959, + "step": 228910 + }, + { + "epoch": 2.0237274350678054, + "grad_norm": 2.863678455352783, + "learning_rate": 1.627120941553658e-05, + "loss": 0.701, + "step": 228920 + }, + { + "epoch": 2.0238158383281175, + "grad_norm": 4.221573352813721, + "learning_rate": 1.6269736027864707e-05, + "loss": 0.5692, + "step": 228930 + }, + { + "epoch": 2.0239042415884296, + "grad_norm": 3.291574478149414, + "learning_rate": 1.626826264019284e-05, + "loss": 0.6248, + "step": 228940 + }, + { + "epoch": 2.023992644848742, + "grad_norm": 4.181818962097168, + "learning_rate": 1.6266789252520967e-05, + "loss": 0.5786, + "step": 228950 + }, + { + "epoch": 2.0240810481090543, + "grad_norm": 3.5158119201660156, + "learning_rate": 1.6265315864849095e-05, + "loss": 0.5079, + "step": 228960 + }, + { + "epoch": 2.0241694513693664, + "grad_norm": 7.855576992034912, + "learning_rate": 1.6263842477177227e-05, + "loss": 0.6236, + "step": 228970 + }, + { + "epoch": 2.0242578546296786, + "grad_norm": 1.2731858491897583, + "learning_rate": 1.6262369089505355e-05, + "loss": 0.5543, + "step": 228980 + }, + { + "epoch": 2.024346257889991, + "grad_norm": 2.6927037239074707, + "learning_rate": 1.6260895701833484e-05, + "loss": 0.4893, + "step": 228990 + }, + { + "epoch": 2.0244346611503032, + "grad_norm": 2.910059690475464, + "learning_rate": 1.6259422314161612e-05, + "loss": 0.6607, + "step": 229000 + }, + { + "epoch": 2.0245230644106154, + "grad_norm": 2.7661659717559814, + "learning_rate": 1.6257948926489744e-05, + "loss": 0.617, + "step": 229010 + }, + { + "epoch": 2.024611467670928, + "grad_norm": 4.014902114868164, + "learning_rate": 1.6256475538817872e-05, + "loss": 0.6212, + "step": 229020 + }, + { + "epoch": 2.02469987093124, + "grad_norm": 2.1493892669677734, + "learning_rate": 1.6255002151146e-05, + "loss": 0.5765, + "step": 229030 + }, + { + "epoch": 2.024788274191552, + "grad_norm": 3.4505937099456787, + "learning_rate": 1.6253528763474132e-05, + "loss": 0.5298, + "step": 229040 + }, + { + "epoch": 2.0248766774518643, + "grad_norm": 3.3439042568206787, + "learning_rate": 1.625205537580226e-05, + "loss": 0.4587, + "step": 229050 + }, + { + "epoch": 2.024965080712177, + "grad_norm": 1.5941444635391235, + "learning_rate": 1.625058198813039e-05, + "loss": 0.4898, + "step": 229060 + }, + { + "epoch": 2.025053483972489, + "grad_norm": 2.025240182876587, + "learning_rate": 1.6249108600458517e-05, + "loss": 0.4352, + "step": 229070 + }, + { + "epoch": 2.025141887232801, + "grad_norm": 1.6272051334381104, + "learning_rate": 1.624763521278665e-05, + "loss": 0.5222, + "step": 229080 + }, + { + "epoch": 2.025230290493113, + "grad_norm": 4.905060768127441, + "learning_rate": 1.6246161825114778e-05, + "loss": 0.5722, + "step": 229090 + }, + { + "epoch": 2.0253186937534258, + "grad_norm": 2.5193679332733154, + "learning_rate": 1.6244688437442906e-05, + "loss": 0.5671, + "step": 229100 + }, + { + "epoch": 2.025407097013738, + "grad_norm": 1.9916996955871582, + "learning_rate": 1.6243215049771034e-05, + "loss": 0.5629, + "step": 229110 + }, + { + "epoch": 2.02549550027405, + "grad_norm": 11.41375732421875, + "learning_rate": 1.6241741662099166e-05, + "loss": 0.5914, + "step": 229120 + }, + { + "epoch": 2.025583903534362, + "grad_norm": 1.4500977993011475, + "learning_rate": 1.6240268274427294e-05, + "loss": 0.53, + "step": 229130 + }, + { + "epoch": 2.0256723067946747, + "grad_norm": 3.2911603450775146, + "learning_rate": 1.6238794886755423e-05, + "loss": 0.4745, + "step": 229140 + }, + { + "epoch": 2.025760710054987, + "grad_norm": 1.8262370824813843, + "learning_rate": 1.6237321499083554e-05, + "loss": 0.4569, + "step": 229150 + }, + { + "epoch": 2.025849113315299, + "grad_norm": 7.507793426513672, + "learning_rate": 1.6235848111411683e-05, + "loss": 0.5485, + "step": 229160 + }, + { + "epoch": 2.0259375165756115, + "grad_norm": 6.032270908355713, + "learning_rate": 1.623437472373981e-05, + "loss": 0.6051, + "step": 229170 + }, + { + "epoch": 2.0260259198359236, + "grad_norm": 4.390895366668701, + "learning_rate": 1.623290133606794e-05, + "loss": 0.5684, + "step": 229180 + }, + { + "epoch": 2.0261143230962357, + "grad_norm": 13.540802955627441, + "learning_rate": 1.623142794839607e-05, + "loss": 0.5797, + "step": 229190 + }, + { + "epoch": 2.026202726356548, + "grad_norm": 5.589757442474365, + "learning_rate": 1.62299545607242e-05, + "loss": 0.5265, + "step": 229200 + }, + { + "epoch": 2.0262911296168604, + "grad_norm": 1.5808405876159668, + "learning_rate": 1.6228481173052328e-05, + "loss": 0.5245, + "step": 229210 + }, + { + "epoch": 2.0263795328771725, + "grad_norm": 4.218026638031006, + "learning_rate": 1.622700778538046e-05, + "loss": 0.4266, + "step": 229220 + }, + { + "epoch": 2.0264679361374847, + "grad_norm": 2.0124611854553223, + "learning_rate": 1.6225534397708588e-05, + "loss": 0.5629, + "step": 229230 + }, + { + "epoch": 2.026556339397797, + "grad_norm": 0.9772027134895325, + "learning_rate": 1.6224061010036716e-05, + "loss": 0.4698, + "step": 229240 + }, + { + "epoch": 2.0266447426581093, + "grad_norm": 3.5696942806243896, + "learning_rate": 1.6222587622364845e-05, + "loss": 0.5288, + "step": 229250 + }, + { + "epoch": 2.0267331459184215, + "grad_norm": 1.0294303894042969, + "learning_rate": 1.6221114234692976e-05, + "loss": 0.5119, + "step": 229260 + }, + { + "epoch": 2.0268215491787336, + "grad_norm": 3.1618540287017822, + "learning_rate": 1.6219640847021105e-05, + "loss": 0.5377, + "step": 229270 + }, + { + "epoch": 2.026909952439046, + "grad_norm": 0.8885582685470581, + "learning_rate": 1.6218167459349233e-05, + "loss": 0.5275, + "step": 229280 + }, + { + "epoch": 2.0269983556993583, + "grad_norm": 7.362066745758057, + "learning_rate": 1.621669407167736e-05, + "loss": 0.5194, + "step": 229290 + }, + { + "epoch": 2.0270867589596704, + "grad_norm": 1.8983622789382935, + "learning_rate": 1.6215220684005493e-05, + "loss": 0.4672, + "step": 229300 + }, + { + "epoch": 2.0271751622199825, + "grad_norm": 2.092336893081665, + "learning_rate": 1.621374729633362e-05, + "loss": 0.5441, + "step": 229310 + }, + { + "epoch": 2.027263565480295, + "grad_norm": 5.057147026062012, + "learning_rate": 1.621227390866175e-05, + "loss": 0.5656, + "step": 229320 + }, + { + "epoch": 2.027351968740607, + "grad_norm": 1.4584579467773438, + "learning_rate": 1.6210800520989882e-05, + "loss": 0.7807, + "step": 229330 + }, + { + "epoch": 2.0274403720009193, + "grad_norm": 1.755975604057312, + "learning_rate": 1.620932713331801e-05, + "loss": 0.5694, + "step": 229340 + }, + { + "epoch": 2.0275287752612314, + "grad_norm": 1.3693058490753174, + "learning_rate": 1.620785374564614e-05, + "loss": 0.6092, + "step": 229350 + }, + { + "epoch": 2.027617178521544, + "grad_norm": 3.529750108718872, + "learning_rate": 1.6206380357974267e-05, + "loss": 0.4918, + "step": 229360 + }, + { + "epoch": 2.027705581781856, + "grad_norm": 9.242635726928711, + "learning_rate": 1.62049069703024e-05, + "loss": 0.4289, + "step": 229370 + }, + { + "epoch": 2.0277939850421682, + "grad_norm": 1.1579645872116089, + "learning_rate": 1.6203433582630527e-05, + "loss": 0.4208, + "step": 229380 + }, + { + "epoch": 2.027882388302481, + "grad_norm": 1.2972092628479004, + "learning_rate": 1.6201960194958655e-05, + "loss": 0.5731, + "step": 229390 + }, + { + "epoch": 2.027970791562793, + "grad_norm": 1.8126658201217651, + "learning_rate": 1.6200486807286787e-05, + "loss": 0.5519, + "step": 229400 + }, + { + "epoch": 2.028059194823105, + "grad_norm": 2.1758575439453125, + "learning_rate": 1.6199013419614915e-05, + "loss": 0.4955, + "step": 229410 + }, + { + "epoch": 2.028147598083417, + "grad_norm": 2.281951427459717, + "learning_rate": 1.6197540031943044e-05, + "loss": 0.4073, + "step": 229420 + }, + { + "epoch": 2.0282360013437297, + "grad_norm": 2.8807015419006348, + "learning_rate": 1.6196066644271175e-05, + "loss": 0.5219, + "step": 229430 + }, + { + "epoch": 2.028324404604042, + "grad_norm": 7.659378528594971, + "learning_rate": 1.6194593256599304e-05, + "loss": 0.5785, + "step": 229440 + }, + { + "epoch": 2.028412807864354, + "grad_norm": 1.7611111402511597, + "learning_rate": 1.6193119868927432e-05, + "loss": 0.54, + "step": 229450 + }, + { + "epoch": 2.028501211124666, + "grad_norm": 1.7269339561462402, + "learning_rate": 1.6191646481255564e-05, + "loss": 0.5244, + "step": 229460 + }, + { + "epoch": 2.0285896143849786, + "grad_norm": 11.142923355102539, + "learning_rate": 1.6190173093583692e-05, + "loss": 0.4364, + "step": 229470 + }, + { + "epoch": 2.0286780176452908, + "grad_norm": 2.6782939434051514, + "learning_rate": 1.618869970591182e-05, + "loss": 0.5453, + "step": 229480 + }, + { + "epoch": 2.028766420905603, + "grad_norm": 1.8467345237731934, + "learning_rate": 1.6187226318239952e-05, + "loss": 0.5287, + "step": 229490 + }, + { + "epoch": 2.0288548241659154, + "grad_norm": 5.344024181365967, + "learning_rate": 1.618575293056808e-05, + "loss": 0.4638, + "step": 229500 + }, + { + "epoch": 2.0289432274262276, + "grad_norm": 3.613802194595337, + "learning_rate": 1.6184279542896212e-05, + "loss": 0.5966, + "step": 229510 + }, + { + "epoch": 2.0290316306865397, + "grad_norm": 3.3633511066436768, + "learning_rate": 1.618280615522434e-05, + "loss": 0.4531, + "step": 229520 + }, + { + "epoch": 2.029120033946852, + "grad_norm": 3.6952359676361084, + "learning_rate": 1.618133276755247e-05, + "loss": 0.5251, + "step": 229530 + }, + { + "epoch": 2.0292084372071644, + "grad_norm": 1.788018822669983, + "learning_rate": 1.6179859379880597e-05, + "loss": 0.4846, + "step": 229540 + }, + { + "epoch": 2.0292968404674765, + "grad_norm": 3.194873332977295, + "learning_rate": 1.617838599220873e-05, + "loss": 0.4978, + "step": 229550 + }, + { + "epoch": 2.0293852437277886, + "grad_norm": 5.908033847808838, + "learning_rate": 1.6176912604536858e-05, + "loss": 0.5223, + "step": 229560 + }, + { + "epoch": 2.0294736469881007, + "grad_norm": 1.0066196918487549, + "learning_rate": 1.6175439216864986e-05, + "loss": 0.5567, + "step": 229570 + }, + { + "epoch": 2.0295620502484133, + "grad_norm": 6.052037715911865, + "learning_rate": 1.6173965829193114e-05, + "loss": 0.6556, + "step": 229580 + }, + { + "epoch": 2.0296504535087254, + "grad_norm": 1.9851878881454468, + "learning_rate": 1.6172492441521246e-05, + "loss": 0.6564, + "step": 229590 + }, + { + "epoch": 2.0297388567690375, + "grad_norm": 2.1681008338928223, + "learning_rate": 1.6171019053849374e-05, + "loss": 0.553, + "step": 229600 + }, + { + "epoch": 2.02982726002935, + "grad_norm": 32.91980743408203, + "learning_rate": 1.6169545666177503e-05, + "loss": 0.7339, + "step": 229610 + }, + { + "epoch": 2.029915663289662, + "grad_norm": 8.376395225524902, + "learning_rate": 1.6168072278505634e-05, + "loss": 0.6008, + "step": 229620 + }, + { + "epoch": 2.0300040665499743, + "grad_norm": 1.5229064226150513, + "learning_rate": 1.6166598890833763e-05, + "loss": 0.6176, + "step": 229630 + }, + { + "epoch": 2.0300924698102865, + "grad_norm": 1.6452652215957642, + "learning_rate": 1.616512550316189e-05, + "loss": 0.4481, + "step": 229640 + }, + { + "epoch": 2.030180873070599, + "grad_norm": 6.800448894500732, + "learning_rate": 1.616365211549002e-05, + "loss": 0.5481, + "step": 229650 + }, + { + "epoch": 2.030269276330911, + "grad_norm": 14.550088882446289, + "learning_rate": 1.616217872781815e-05, + "loss": 0.5765, + "step": 229660 + }, + { + "epoch": 2.0303576795912233, + "grad_norm": 1.9651539325714111, + "learning_rate": 1.616070534014628e-05, + "loss": 0.5845, + "step": 229670 + }, + { + "epoch": 2.0304460828515354, + "grad_norm": 4.479519367218018, + "learning_rate": 1.6159231952474408e-05, + "loss": 0.4683, + "step": 229680 + }, + { + "epoch": 2.030534486111848, + "grad_norm": 14.476015090942383, + "learning_rate": 1.615775856480254e-05, + "loss": 0.5485, + "step": 229690 + }, + { + "epoch": 2.03062288937216, + "grad_norm": 31.01997947692871, + "learning_rate": 1.6156285177130668e-05, + "loss": 0.486, + "step": 229700 + }, + { + "epoch": 2.030711292632472, + "grad_norm": 5.739177227020264, + "learning_rate": 1.6154811789458796e-05, + "loss": 0.5217, + "step": 229710 + }, + { + "epoch": 2.0307996958927843, + "grad_norm": 2.144521474838257, + "learning_rate": 1.6153338401786925e-05, + "loss": 0.5885, + "step": 229720 + }, + { + "epoch": 2.030888099153097, + "grad_norm": 3.9730498790740967, + "learning_rate": 1.6151865014115057e-05, + "loss": 0.5042, + "step": 229730 + }, + { + "epoch": 2.030976502413409, + "grad_norm": 4.673571586608887, + "learning_rate": 1.6150391626443185e-05, + "loss": 0.4904, + "step": 229740 + }, + { + "epoch": 2.031064905673721, + "grad_norm": 1.5394362211227417, + "learning_rate": 1.6148918238771313e-05, + "loss": 0.5018, + "step": 229750 + }, + { + "epoch": 2.0311533089340337, + "grad_norm": 5.146883487701416, + "learning_rate": 1.614744485109944e-05, + "loss": 0.348, + "step": 229760 + }, + { + "epoch": 2.031241712194346, + "grad_norm": 4.0596394538879395, + "learning_rate": 1.6145971463427573e-05, + "loss": 0.5426, + "step": 229770 + }, + { + "epoch": 2.031330115454658, + "grad_norm": 2.1151888370513916, + "learning_rate": 1.6144498075755702e-05, + "loss": 0.6099, + "step": 229780 + }, + { + "epoch": 2.03141851871497, + "grad_norm": 6.871828556060791, + "learning_rate": 1.614302468808383e-05, + "loss": 0.6526, + "step": 229790 + }, + { + "epoch": 2.0315069219752826, + "grad_norm": 4.6059441566467285, + "learning_rate": 1.6141551300411962e-05, + "loss": 0.6447, + "step": 229800 + }, + { + "epoch": 2.0315953252355947, + "grad_norm": 1.6523704528808594, + "learning_rate": 1.614007791274009e-05, + "loss": 0.533, + "step": 229810 + }, + { + "epoch": 2.031683728495907, + "grad_norm": 9.858926773071289, + "learning_rate": 1.613860452506822e-05, + "loss": 0.5816, + "step": 229820 + }, + { + "epoch": 2.031772131756219, + "grad_norm": 3.2328619956970215, + "learning_rate": 1.6137131137396347e-05, + "loss": 0.5605, + "step": 229830 + }, + { + "epoch": 2.0318605350165315, + "grad_norm": 2.9265825748443604, + "learning_rate": 1.613565774972448e-05, + "loss": 0.5129, + "step": 229840 + }, + { + "epoch": 2.0319489382768436, + "grad_norm": 2.972031593322754, + "learning_rate": 1.6134184362052607e-05, + "loss": 0.4007, + "step": 229850 + }, + { + "epoch": 2.0320373415371558, + "grad_norm": 6.154536247253418, + "learning_rate": 1.6132710974380735e-05, + "loss": 0.5397, + "step": 229860 + }, + { + "epoch": 2.0321257447974683, + "grad_norm": 2.6466572284698486, + "learning_rate": 1.6131237586708867e-05, + "loss": 0.6831, + "step": 229870 + }, + { + "epoch": 2.0322141480577804, + "grad_norm": 1.9684175252914429, + "learning_rate": 1.6129764199036995e-05, + "loss": 0.6095, + "step": 229880 + }, + { + "epoch": 2.0323025513180926, + "grad_norm": 12.264841079711914, + "learning_rate": 1.6128290811365124e-05, + "loss": 0.6465, + "step": 229890 + }, + { + "epoch": 2.0323909545784047, + "grad_norm": 3.7930564880371094, + "learning_rate": 1.6126817423693252e-05, + "loss": 0.4591, + "step": 229900 + }, + { + "epoch": 2.0324793578387172, + "grad_norm": 1.2273736000061035, + "learning_rate": 1.6125344036021384e-05, + "loss": 0.4509, + "step": 229910 + }, + { + "epoch": 2.0325677610990294, + "grad_norm": 9.830855369567871, + "learning_rate": 1.6123870648349512e-05, + "loss": 0.53, + "step": 229920 + }, + { + "epoch": 2.0326561643593415, + "grad_norm": 2.68046236038208, + "learning_rate": 1.612239726067764e-05, + "loss": 0.5792, + "step": 229930 + }, + { + "epoch": 2.0327445676196536, + "grad_norm": 0.8994482755661011, + "learning_rate": 1.612092387300577e-05, + "loss": 0.5331, + "step": 229940 + }, + { + "epoch": 2.032832970879966, + "grad_norm": 4.394510269165039, + "learning_rate": 1.61194504853339e-05, + "loss": 0.423, + "step": 229950 + }, + { + "epoch": 2.0329213741402783, + "grad_norm": 14.247312545776367, + "learning_rate": 1.611797709766203e-05, + "loss": 0.6085, + "step": 229960 + }, + { + "epoch": 2.0330097774005904, + "grad_norm": 2.572593927383423, + "learning_rate": 1.6116503709990157e-05, + "loss": 0.5429, + "step": 229970 + }, + { + "epoch": 2.033098180660903, + "grad_norm": 2.367778778076172, + "learning_rate": 1.611503032231829e-05, + "loss": 0.5322, + "step": 229980 + }, + { + "epoch": 2.033186583921215, + "grad_norm": 1.4375165700912476, + "learning_rate": 1.6113556934646417e-05, + "loss": 0.4808, + "step": 229990 + }, + { + "epoch": 2.033274987181527, + "grad_norm": 1.0224571228027344, + "learning_rate": 1.6112083546974546e-05, + "loss": 0.4775, + "step": 230000 + }, + { + "epoch": 2.0333633904418393, + "grad_norm": 2.178645610809326, + "learning_rate": 1.6110610159302674e-05, + "loss": 0.492, + "step": 230010 + }, + { + "epoch": 2.033451793702152, + "grad_norm": 3.8883605003356934, + "learning_rate": 1.6109136771630806e-05, + "loss": 0.5299, + "step": 230020 + }, + { + "epoch": 2.033540196962464, + "grad_norm": 3.6706905364990234, + "learning_rate": 1.6107663383958934e-05, + "loss": 0.4231, + "step": 230030 + }, + { + "epoch": 2.033628600222776, + "grad_norm": 3.5461013317108154, + "learning_rate": 1.6106189996287063e-05, + "loss": 0.5513, + "step": 230040 + }, + { + "epoch": 2.0337170034830883, + "grad_norm": 3.2488367557525635, + "learning_rate": 1.610471660861519e-05, + "loss": 0.6005, + "step": 230050 + }, + { + "epoch": 2.033805406743401, + "grad_norm": 1.6911059617996216, + "learning_rate": 1.6103243220943323e-05, + "loss": 0.656, + "step": 230060 + }, + { + "epoch": 2.033893810003713, + "grad_norm": 2.250363349914551, + "learning_rate": 1.610176983327145e-05, + "loss": 0.5167, + "step": 230070 + }, + { + "epoch": 2.033982213264025, + "grad_norm": 5.241191387176514, + "learning_rate": 1.610029644559958e-05, + "loss": 0.6186, + "step": 230080 + }, + { + "epoch": 2.0340706165243376, + "grad_norm": 3.793581008911133, + "learning_rate": 1.609882305792771e-05, + "loss": 0.64, + "step": 230090 + }, + { + "epoch": 2.0341590197846497, + "grad_norm": 3.0174548625946045, + "learning_rate": 1.609734967025584e-05, + "loss": 0.6497, + "step": 230100 + }, + { + "epoch": 2.034247423044962, + "grad_norm": 3.384330987930298, + "learning_rate": 1.6095876282583968e-05, + "loss": 0.505, + "step": 230110 + }, + { + "epoch": 2.034335826305274, + "grad_norm": 3.1132314205169678, + "learning_rate": 1.6094402894912096e-05, + "loss": 0.4963, + "step": 230120 + }, + { + "epoch": 2.0344242295655866, + "grad_norm": 1.7613157033920288, + "learning_rate": 1.6092929507240228e-05, + "loss": 0.4811, + "step": 230130 + }, + { + "epoch": 2.0345126328258987, + "grad_norm": 1.2226041555404663, + "learning_rate": 1.6091456119568356e-05, + "loss": 0.5884, + "step": 230140 + }, + { + "epoch": 2.034601036086211, + "grad_norm": 1.1077228784561157, + "learning_rate": 1.6089982731896485e-05, + "loss": 0.4423, + "step": 230150 + }, + { + "epoch": 2.034689439346523, + "grad_norm": 0.5286349058151245, + "learning_rate": 1.6088509344224616e-05, + "loss": 0.5073, + "step": 230160 + }, + { + "epoch": 2.0347778426068355, + "grad_norm": 4.304290294647217, + "learning_rate": 1.6087035956552745e-05, + "loss": 0.6418, + "step": 230170 + }, + { + "epoch": 2.0348662458671476, + "grad_norm": 3.060291290283203, + "learning_rate": 1.6085562568880873e-05, + "loss": 0.5925, + "step": 230180 + }, + { + "epoch": 2.0349546491274597, + "grad_norm": 9.908263206481934, + "learning_rate": 1.6084089181209e-05, + "loss": 0.4869, + "step": 230190 + }, + { + "epoch": 2.0350430523877723, + "grad_norm": 1.5792419910430908, + "learning_rate": 1.6082615793537133e-05, + "loss": 0.6503, + "step": 230200 + }, + { + "epoch": 2.0351314556480844, + "grad_norm": 4.307579517364502, + "learning_rate": 1.608114240586526e-05, + "loss": 0.4999, + "step": 230210 + }, + { + "epoch": 2.0352198589083965, + "grad_norm": 2.6218016147613525, + "learning_rate": 1.607966901819339e-05, + "loss": 0.3762, + "step": 230220 + }, + { + "epoch": 2.0353082621687086, + "grad_norm": 3.9510111808776855, + "learning_rate": 1.6078195630521518e-05, + "loss": 0.5473, + "step": 230230 + }, + { + "epoch": 2.035396665429021, + "grad_norm": 3.611645221710205, + "learning_rate": 1.607672224284965e-05, + "loss": 0.5204, + "step": 230240 + }, + { + "epoch": 2.0354850686893333, + "grad_norm": 9.18155288696289, + "learning_rate": 1.607524885517778e-05, + "loss": 0.507, + "step": 230250 + }, + { + "epoch": 2.0355734719496454, + "grad_norm": 1.4360222816467285, + "learning_rate": 1.6073775467505907e-05, + "loss": 0.5846, + "step": 230260 + }, + { + "epoch": 2.0356618752099576, + "grad_norm": 20.858274459838867, + "learning_rate": 1.607230207983404e-05, + "loss": 0.593, + "step": 230270 + }, + { + "epoch": 2.03575027847027, + "grad_norm": 4.4153008460998535, + "learning_rate": 1.6070828692162167e-05, + "loss": 0.3894, + "step": 230280 + }, + { + "epoch": 2.0358386817305822, + "grad_norm": 1.67811918258667, + "learning_rate": 1.6069355304490295e-05, + "loss": 0.5005, + "step": 230290 + }, + { + "epoch": 2.0359270849908944, + "grad_norm": 1.7915778160095215, + "learning_rate": 1.6067881916818427e-05, + "loss": 0.5513, + "step": 230300 + }, + { + "epoch": 2.0360154882512065, + "grad_norm": 2.4489235877990723, + "learning_rate": 1.6066408529146555e-05, + "loss": 0.5725, + "step": 230310 + }, + { + "epoch": 2.036103891511519, + "grad_norm": 3.9878432750701904, + "learning_rate": 1.6064935141474684e-05, + "loss": 0.537, + "step": 230320 + }, + { + "epoch": 2.036192294771831, + "grad_norm": 3.6167914867401123, + "learning_rate": 1.6063461753802815e-05, + "loss": 0.4207, + "step": 230330 + }, + { + "epoch": 2.0362806980321433, + "grad_norm": 2.4600930213928223, + "learning_rate": 1.6061988366130944e-05, + "loss": 0.5531, + "step": 230340 + }, + { + "epoch": 2.036369101292456, + "grad_norm": 1.1102458238601685, + "learning_rate": 1.6060514978459072e-05, + "loss": 0.4451, + "step": 230350 + }, + { + "epoch": 2.036457504552768, + "grad_norm": 2.7267119884490967, + "learning_rate": 1.6059041590787204e-05, + "loss": 0.5982, + "step": 230360 + }, + { + "epoch": 2.03654590781308, + "grad_norm": 2.651257276535034, + "learning_rate": 1.6057568203115332e-05, + "loss": 0.5401, + "step": 230370 + }, + { + "epoch": 2.036634311073392, + "grad_norm": 4.05678653717041, + "learning_rate": 1.605609481544346e-05, + "loss": 0.422, + "step": 230380 + }, + { + "epoch": 2.0367227143337048, + "grad_norm": 28.9031982421875, + "learning_rate": 1.6054621427771592e-05, + "loss": 0.5884, + "step": 230390 + }, + { + "epoch": 2.036811117594017, + "grad_norm": 2.3702309131622314, + "learning_rate": 1.605314804009972e-05, + "loss": 0.4749, + "step": 230400 + }, + { + "epoch": 2.036899520854329, + "grad_norm": 4.255746364593506, + "learning_rate": 1.605167465242785e-05, + "loss": 0.6272, + "step": 230410 + }, + { + "epoch": 2.036987924114641, + "grad_norm": 1.89402437210083, + "learning_rate": 1.605020126475598e-05, + "loss": 0.5652, + "step": 230420 + }, + { + "epoch": 2.0370763273749537, + "grad_norm": 1.6647453308105469, + "learning_rate": 1.604872787708411e-05, + "loss": 0.5397, + "step": 230430 + }, + { + "epoch": 2.037164730635266, + "grad_norm": 5.1396942138671875, + "learning_rate": 1.6047254489412237e-05, + "loss": 0.4468, + "step": 230440 + }, + { + "epoch": 2.037253133895578, + "grad_norm": 4.873159885406494, + "learning_rate": 1.604578110174037e-05, + "loss": 0.5571, + "step": 230450 + }, + { + "epoch": 2.0373415371558905, + "grad_norm": 2.042257070541382, + "learning_rate": 1.6044307714068498e-05, + "loss": 0.5132, + "step": 230460 + }, + { + "epoch": 2.0374299404162026, + "grad_norm": 4.800120830535889, + "learning_rate": 1.6042834326396626e-05, + "loss": 0.4918, + "step": 230470 + }, + { + "epoch": 2.0375183436765147, + "grad_norm": 2.9972023963928223, + "learning_rate": 1.6041360938724754e-05, + "loss": 0.5523, + "step": 230480 + }, + { + "epoch": 2.037606746936827, + "grad_norm": 5.6374359130859375, + "learning_rate": 1.6039887551052886e-05, + "loss": 0.5757, + "step": 230490 + }, + { + "epoch": 2.0376951501971394, + "grad_norm": 1.77998685836792, + "learning_rate": 1.6038414163381014e-05, + "loss": 0.5139, + "step": 230500 + }, + { + "epoch": 2.0377835534574515, + "grad_norm": 0.8495117425918579, + "learning_rate": 1.6036940775709143e-05, + "loss": 0.4064, + "step": 230510 + }, + { + "epoch": 2.0378719567177637, + "grad_norm": 2.0150158405303955, + "learning_rate": 1.603546738803727e-05, + "loss": 0.6262, + "step": 230520 + }, + { + "epoch": 2.037960359978076, + "grad_norm": 2.289886951446533, + "learning_rate": 1.6033994000365403e-05, + "loss": 0.4876, + "step": 230530 + }, + { + "epoch": 2.0380487632383884, + "grad_norm": 4.138984680175781, + "learning_rate": 1.603252061269353e-05, + "loss": 0.5278, + "step": 230540 + }, + { + "epoch": 2.0381371664987005, + "grad_norm": 1.4971106052398682, + "learning_rate": 1.603104722502166e-05, + "loss": 0.566, + "step": 230550 + }, + { + "epoch": 2.0382255697590126, + "grad_norm": 2.6828060150146484, + "learning_rate": 1.602957383734979e-05, + "loss": 0.5578, + "step": 230560 + }, + { + "epoch": 2.038313973019325, + "grad_norm": 4.939634799957275, + "learning_rate": 1.602810044967792e-05, + "loss": 0.5207, + "step": 230570 + }, + { + "epoch": 2.0384023762796373, + "grad_norm": 1.4824367761611938, + "learning_rate": 1.6026627062006048e-05, + "loss": 0.5221, + "step": 230580 + }, + { + "epoch": 2.0384907795399494, + "grad_norm": 3.943966865539551, + "learning_rate": 1.6025153674334176e-05, + "loss": 0.574, + "step": 230590 + }, + { + "epoch": 2.0385791828002615, + "grad_norm": 2.1735446453094482, + "learning_rate": 1.6023680286662308e-05, + "loss": 0.4432, + "step": 230600 + }, + { + "epoch": 2.038667586060574, + "grad_norm": 4.073056697845459, + "learning_rate": 1.6022206898990436e-05, + "loss": 0.6616, + "step": 230610 + }, + { + "epoch": 2.038755989320886, + "grad_norm": 5.897747993469238, + "learning_rate": 1.6020733511318565e-05, + "loss": 0.5529, + "step": 230620 + }, + { + "epoch": 2.0388443925811983, + "grad_norm": 3.40490984916687, + "learning_rate": 1.6019260123646696e-05, + "loss": 0.5051, + "step": 230630 + }, + { + "epoch": 2.0389327958415104, + "grad_norm": 1.811266303062439, + "learning_rate": 1.6017786735974825e-05, + "loss": 0.5653, + "step": 230640 + }, + { + "epoch": 2.039021199101823, + "grad_norm": 2.810957670211792, + "learning_rate": 1.6016313348302953e-05, + "loss": 0.5264, + "step": 230650 + }, + { + "epoch": 2.039109602362135, + "grad_norm": 11.011382102966309, + "learning_rate": 1.601483996063108e-05, + "loss": 0.548, + "step": 230660 + }, + { + "epoch": 2.0391980056224472, + "grad_norm": 5.041356086730957, + "learning_rate": 1.6013366572959213e-05, + "loss": 0.6021, + "step": 230670 + }, + { + "epoch": 2.03928640888276, + "grad_norm": 7.95573616027832, + "learning_rate": 1.601189318528734e-05, + "loss": 0.5488, + "step": 230680 + }, + { + "epoch": 2.039374812143072, + "grad_norm": 8.107582092285156, + "learning_rate": 1.601041979761547e-05, + "loss": 0.565, + "step": 230690 + }, + { + "epoch": 2.039463215403384, + "grad_norm": 1.7145506143569946, + "learning_rate": 1.60089464099436e-05, + "loss": 0.4471, + "step": 230700 + }, + { + "epoch": 2.039551618663696, + "grad_norm": 2.558162212371826, + "learning_rate": 1.600747302227173e-05, + "loss": 0.5024, + "step": 230710 + }, + { + "epoch": 2.0396400219240087, + "grad_norm": 3.2387874126434326, + "learning_rate": 1.600599963459986e-05, + "loss": 0.5292, + "step": 230720 + }, + { + "epoch": 2.039728425184321, + "grad_norm": 2.5009377002716064, + "learning_rate": 1.6004526246927987e-05, + "loss": 0.5391, + "step": 230730 + }, + { + "epoch": 2.039816828444633, + "grad_norm": 3.6669018268585205, + "learning_rate": 1.600305285925612e-05, + "loss": 0.5024, + "step": 230740 + }, + { + "epoch": 2.039905231704945, + "grad_norm": 4.420226097106934, + "learning_rate": 1.6001579471584247e-05, + "loss": 0.4873, + "step": 230750 + }, + { + "epoch": 2.0399936349652577, + "grad_norm": 7.20191764831543, + "learning_rate": 1.6000106083912375e-05, + "loss": 0.5932, + "step": 230760 + }, + { + "epoch": 2.0400820382255698, + "grad_norm": 2.255068302154541, + "learning_rate": 1.5998632696240504e-05, + "loss": 0.6193, + "step": 230770 + }, + { + "epoch": 2.040170441485882, + "grad_norm": 3.1381120681762695, + "learning_rate": 1.5997159308568635e-05, + "loss": 0.5877, + "step": 230780 + }, + { + "epoch": 2.0402588447461945, + "grad_norm": 2.5356435775756836, + "learning_rate": 1.5995685920896764e-05, + "loss": 0.656, + "step": 230790 + }, + { + "epoch": 2.0403472480065066, + "grad_norm": 10.565383911132812, + "learning_rate": 1.5994212533224892e-05, + "loss": 0.5585, + "step": 230800 + }, + { + "epoch": 2.0404356512668187, + "grad_norm": 4.737443923950195, + "learning_rate": 1.5992739145553024e-05, + "loss": 0.5062, + "step": 230810 + }, + { + "epoch": 2.040524054527131, + "grad_norm": 3.896827459335327, + "learning_rate": 1.5991265757881152e-05, + "loss": 0.6523, + "step": 230820 + }, + { + "epoch": 2.0406124577874434, + "grad_norm": 5.611128330230713, + "learning_rate": 1.598979237020928e-05, + "loss": 0.6357, + "step": 230830 + }, + { + "epoch": 2.0407008610477555, + "grad_norm": 6.95565128326416, + "learning_rate": 1.598831898253741e-05, + "loss": 0.4191, + "step": 230840 + }, + { + "epoch": 2.0407892643080676, + "grad_norm": 5.0833024978637695, + "learning_rate": 1.598684559486554e-05, + "loss": 0.5753, + "step": 230850 + }, + { + "epoch": 2.0408776675683797, + "grad_norm": 5.079162120819092, + "learning_rate": 1.598537220719367e-05, + "loss": 0.6186, + "step": 230860 + }, + { + "epoch": 2.0409660708286923, + "grad_norm": 2.6269021034240723, + "learning_rate": 1.5983898819521797e-05, + "loss": 0.5285, + "step": 230870 + }, + { + "epoch": 2.0410544740890044, + "grad_norm": 2.832679510116577, + "learning_rate": 1.5982425431849926e-05, + "loss": 0.4556, + "step": 230880 + }, + { + "epoch": 2.0411428773493165, + "grad_norm": 4.926564693450928, + "learning_rate": 1.5980952044178057e-05, + "loss": 0.5458, + "step": 230890 + }, + { + "epoch": 2.0412312806096287, + "grad_norm": 4.057921886444092, + "learning_rate": 1.5979478656506186e-05, + "loss": 0.5423, + "step": 230900 + }, + { + "epoch": 2.0413196838699412, + "grad_norm": 1.4843066930770874, + "learning_rate": 1.5978005268834314e-05, + "loss": 0.5034, + "step": 230910 + }, + { + "epoch": 2.0414080871302533, + "grad_norm": 1.8917179107666016, + "learning_rate": 1.5976531881162446e-05, + "loss": 0.4453, + "step": 230920 + }, + { + "epoch": 2.0414964903905655, + "grad_norm": 1.8411122560501099, + "learning_rate": 1.5975058493490574e-05, + "loss": 0.6605, + "step": 230930 + }, + { + "epoch": 2.041584893650878, + "grad_norm": 6.3782196044921875, + "learning_rate": 1.5973585105818703e-05, + "loss": 0.4622, + "step": 230940 + }, + { + "epoch": 2.04167329691119, + "grad_norm": 2.473829746246338, + "learning_rate": 1.597211171814683e-05, + "loss": 0.4222, + "step": 230950 + }, + { + "epoch": 2.0417617001715023, + "grad_norm": 9.5839262008667, + "learning_rate": 1.5970638330474963e-05, + "loss": 0.4995, + "step": 230960 + }, + { + "epoch": 2.0418501034318144, + "grad_norm": 1.3644075393676758, + "learning_rate": 1.596916494280309e-05, + "loss": 0.4704, + "step": 230970 + }, + { + "epoch": 2.041938506692127, + "grad_norm": 2.449341297149658, + "learning_rate": 1.596769155513122e-05, + "loss": 0.4723, + "step": 230980 + }, + { + "epoch": 2.042026909952439, + "grad_norm": 1.2071971893310547, + "learning_rate": 1.5966218167459348e-05, + "loss": 0.544, + "step": 230990 + }, + { + "epoch": 2.042115313212751, + "grad_norm": 2.1394152641296387, + "learning_rate": 1.596474477978748e-05, + "loss": 0.5622, + "step": 231000 + }, + { + "epoch": 2.0422037164730633, + "grad_norm": 7.570413112640381, + "learning_rate": 1.5963271392115608e-05, + "loss": 0.5643, + "step": 231010 + }, + { + "epoch": 2.042292119733376, + "grad_norm": 3.1707704067230225, + "learning_rate": 1.5961798004443736e-05, + "loss": 0.653, + "step": 231020 + }, + { + "epoch": 2.042380522993688, + "grad_norm": 2.7581520080566406, + "learning_rate": 1.5960324616771868e-05, + "loss": 0.5183, + "step": 231030 + }, + { + "epoch": 2.042468926254, + "grad_norm": 4.420993328094482, + "learning_rate": 1.5958851229099996e-05, + "loss": 0.4871, + "step": 231040 + }, + { + "epoch": 2.0425573295143127, + "grad_norm": 8.375741004943848, + "learning_rate": 1.5957377841428125e-05, + "loss": 0.558, + "step": 231050 + }, + { + "epoch": 2.042645732774625, + "grad_norm": 5.842360496520996, + "learning_rate": 1.5955904453756253e-05, + "loss": 0.6223, + "step": 231060 + }, + { + "epoch": 2.042734136034937, + "grad_norm": 1.5016493797302246, + "learning_rate": 1.5954431066084385e-05, + "loss": 0.5232, + "step": 231070 + }, + { + "epoch": 2.042822539295249, + "grad_norm": 1.6735752820968628, + "learning_rate": 1.5952957678412513e-05, + "loss": 0.3657, + "step": 231080 + }, + { + "epoch": 2.0429109425555616, + "grad_norm": 2.0286920070648193, + "learning_rate": 1.595148429074064e-05, + "loss": 0.5797, + "step": 231090 + }, + { + "epoch": 2.0429993458158737, + "grad_norm": 3.6199629306793213, + "learning_rate": 1.5950010903068773e-05, + "loss": 0.6444, + "step": 231100 + }, + { + "epoch": 2.043087749076186, + "grad_norm": 2.6665143966674805, + "learning_rate": 1.59485375153969e-05, + "loss": 0.5078, + "step": 231110 + }, + { + "epoch": 2.043176152336498, + "grad_norm": 6.822865009307861, + "learning_rate": 1.594706412772503e-05, + "loss": 0.5116, + "step": 231120 + }, + { + "epoch": 2.0432645555968105, + "grad_norm": 8.54440975189209, + "learning_rate": 1.5945590740053158e-05, + "loss": 0.4731, + "step": 231130 + }, + { + "epoch": 2.0433529588571226, + "grad_norm": 2.5880229473114014, + "learning_rate": 1.594411735238129e-05, + "loss": 0.629, + "step": 231140 + }, + { + "epoch": 2.0434413621174348, + "grad_norm": 3.211214065551758, + "learning_rate": 1.594264396470942e-05, + "loss": 0.5208, + "step": 231150 + }, + { + "epoch": 2.0435297653777473, + "grad_norm": 2.446917772293091, + "learning_rate": 1.5941170577037547e-05, + "loss": 0.646, + "step": 231160 + }, + { + "epoch": 2.0436181686380595, + "grad_norm": 2.2765862941741943, + "learning_rate": 1.5939697189365675e-05, + "loss": 0.4317, + "step": 231170 + }, + { + "epoch": 2.0437065718983716, + "grad_norm": 3.465106248855591, + "learning_rate": 1.5938223801693807e-05, + "loss": 0.4649, + "step": 231180 + }, + { + "epoch": 2.0437949751586837, + "grad_norm": 5.124727249145508, + "learning_rate": 1.5936750414021935e-05, + "loss": 0.4446, + "step": 231190 + }, + { + "epoch": 2.0438833784189963, + "grad_norm": 9.47276496887207, + "learning_rate": 1.5935277026350063e-05, + "loss": 0.4776, + "step": 231200 + }, + { + "epoch": 2.0439717816793084, + "grad_norm": 3.122164487838745, + "learning_rate": 1.5933803638678195e-05, + "loss": 0.5482, + "step": 231210 + }, + { + "epoch": 2.0440601849396205, + "grad_norm": 2.0621957778930664, + "learning_rate": 1.5932330251006324e-05, + "loss": 0.6024, + "step": 231220 + }, + { + "epoch": 2.0441485881999326, + "grad_norm": 1.182389259338379, + "learning_rate": 1.5930856863334452e-05, + "loss": 0.4999, + "step": 231230 + }, + { + "epoch": 2.044236991460245, + "grad_norm": 10.22768497467041, + "learning_rate": 1.5929383475662584e-05, + "loss": 0.533, + "step": 231240 + }, + { + "epoch": 2.0443253947205573, + "grad_norm": 9.51506233215332, + "learning_rate": 1.5927910087990712e-05, + "loss": 0.6077, + "step": 231250 + }, + { + "epoch": 2.0444137979808694, + "grad_norm": 2.62500262260437, + "learning_rate": 1.592643670031884e-05, + "loss": 0.581, + "step": 231260 + }, + { + "epoch": 2.044502201241182, + "grad_norm": 5.397249698638916, + "learning_rate": 1.5924963312646972e-05, + "loss": 0.5883, + "step": 231270 + }, + { + "epoch": 2.044590604501494, + "grad_norm": 2.7398130893707275, + "learning_rate": 1.59234899249751e-05, + "loss": 0.4314, + "step": 231280 + }, + { + "epoch": 2.044679007761806, + "grad_norm": 5.292287826538086, + "learning_rate": 1.592201653730323e-05, + "loss": 0.5505, + "step": 231290 + }, + { + "epoch": 2.0447674110221183, + "grad_norm": 1.9100251197814941, + "learning_rate": 1.592054314963136e-05, + "loss": 0.6001, + "step": 231300 + }, + { + "epoch": 2.044855814282431, + "grad_norm": 1.6888855695724487, + "learning_rate": 1.591906976195949e-05, + "loss": 0.5494, + "step": 231310 + }, + { + "epoch": 2.044944217542743, + "grad_norm": 2.5151782035827637, + "learning_rate": 1.5917596374287617e-05, + "loss": 0.441, + "step": 231320 + }, + { + "epoch": 2.045032620803055, + "grad_norm": 7.560589790344238, + "learning_rate": 1.591612298661575e-05, + "loss": 0.5611, + "step": 231330 + }, + { + "epoch": 2.0451210240633673, + "grad_norm": 4.330668926239014, + "learning_rate": 1.5914649598943877e-05, + "loss": 0.4653, + "step": 231340 + }, + { + "epoch": 2.04520942732368, + "grad_norm": 2.1402697563171387, + "learning_rate": 1.5913176211272006e-05, + "loss": 0.6049, + "step": 231350 + }, + { + "epoch": 2.045297830583992, + "grad_norm": 4.066859722137451, + "learning_rate": 1.5911702823600137e-05, + "loss": 0.4847, + "step": 231360 + }, + { + "epoch": 2.045386233844304, + "grad_norm": 1.5702037811279297, + "learning_rate": 1.5910229435928266e-05, + "loss": 0.5076, + "step": 231370 + }, + { + "epoch": 2.0454746371046166, + "grad_norm": 1.2722256183624268, + "learning_rate": 1.5908756048256394e-05, + "loss": 0.4251, + "step": 231380 + }, + { + "epoch": 2.0455630403649288, + "grad_norm": 3.5303449630737305, + "learning_rate": 1.5907282660584526e-05, + "loss": 0.5067, + "step": 231390 + }, + { + "epoch": 2.045651443625241, + "grad_norm": 3.3159615993499756, + "learning_rate": 1.5905809272912654e-05, + "loss": 0.4903, + "step": 231400 + }, + { + "epoch": 2.045739846885553, + "grad_norm": 2.8924272060394287, + "learning_rate": 1.5904335885240783e-05, + "loss": 0.525, + "step": 231410 + }, + { + "epoch": 2.0458282501458656, + "grad_norm": 6.138674259185791, + "learning_rate": 1.590286249756891e-05, + "loss": 0.5503, + "step": 231420 + }, + { + "epoch": 2.0459166534061777, + "grad_norm": 3.9708871841430664, + "learning_rate": 1.5901389109897043e-05, + "loss": 0.4996, + "step": 231430 + }, + { + "epoch": 2.04600505666649, + "grad_norm": 12.368253707885742, + "learning_rate": 1.589991572222517e-05, + "loss": 0.6593, + "step": 231440 + }, + { + "epoch": 2.046093459926802, + "grad_norm": 1.53944730758667, + "learning_rate": 1.58984423345533e-05, + "loss": 0.5426, + "step": 231450 + }, + { + "epoch": 2.0461818631871145, + "grad_norm": 1.8597759008407593, + "learning_rate": 1.589696894688143e-05, + "loss": 0.5248, + "step": 231460 + }, + { + "epoch": 2.0462702664474266, + "grad_norm": 3.1019186973571777, + "learning_rate": 1.589549555920956e-05, + "loss": 0.5465, + "step": 231470 + }, + { + "epoch": 2.0463586697077387, + "grad_norm": 1.8611092567443848, + "learning_rate": 1.5894022171537688e-05, + "loss": 0.4551, + "step": 231480 + }, + { + "epoch": 2.046447072968051, + "grad_norm": 1.612317681312561, + "learning_rate": 1.5892548783865816e-05, + "loss": 0.4774, + "step": 231490 + }, + { + "epoch": 2.0465354762283634, + "grad_norm": 6.439380645751953, + "learning_rate": 1.5891075396193948e-05, + "loss": 0.4837, + "step": 231500 + }, + { + "epoch": 2.0466238794886755, + "grad_norm": 8.131214141845703, + "learning_rate": 1.5889602008522076e-05, + "loss": 0.5379, + "step": 231510 + }, + { + "epoch": 2.0467122827489876, + "grad_norm": 5.663340091705322, + "learning_rate": 1.5888128620850205e-05, + "loss": 0.544, + "step": 231520 + }, + { + "epoch": 2.0468006860093, + "grad_norm": 2.7814502716064453, + "learning_rate": 1.5886655233178333e-05, + "loss": 0.5343, + "step": 231530 + }, + { + "epoch": 2.0468890892696123, + "grad_norm": 1.8832165002822876, + "learning_rate": 1.5885181845506465e-05, + "loss": 0.7706, + "step": 231540 + }, + { + "epoch": 2.0469774925299244, + "grad_norm": 21.537189483642578, + "learning_rate": 1.5883708457834593e-05, + "loss": 0.4784, + "step": 231550 + }, + { + "epoch": 2.0470658957902366, + "grad_norm": 2.2764692306518555, + "learning_rate": 1.588223507016272e-05, + "loss": 0.6641, + "step": 231560 + }, + { + "epoch": 2.047154299050549, + "grad_norm": 1.6080387830734253, + "learning_rate": 1.5880761682490853e-05, + "loss": 0.5055, + "step": 231570 + }, + { + "epoch": 2.0472427023108613, + "grad_norm": 3.7670226097106934, + "learning_rate": 1.587928829481898e-05, + "loss": 0.6187, + "step": 231580 + }, + { + "epoch": 2.0473311055711734, + "grad_norm": 2.2849948406219482, + "learning_rate": 1.587781490714711e-05, + "loss": 0.4523, + "step": 231590 + }, + { + "epoch": 2.0474195088314855, + "grad_norm": 7.861727237701416, + "learning_rate": 1.5876341519475238e-05, + "loss": 0.474, + "step": 231600 + }, + { + "epoch": 2.047507912091798, + "grad_norm": 7.686766147613525, + "learning_rate": 1.587486813180337e-05, + "loss": 0.5184, + "step": 231610 + }, + { + "epoch": 2.04759631535211, + "grad_norm": 12.9701509475708, + "learning_rate": 1.58733947441315e-05, + "loss": 0.4663, + "step": 231620 + }, + { + "epoch": 2.0476847186124223, + "grad_norm": 8.370609283447266, + "learning_rate": 1.5871921356459627e-05, + "loss": 0.5993, + "step": 231630 + }, + { + "epoch": 2.047773121872735, + "grad_norm": 3.748138427734375, + "learning_rate": 1.5870447968787755e-05, + "loss": 0.5399, + "step": 231640 + }, + { + "epoch": 2.047861525133047, + "grad_norm": 4.2841081619262695, + "learning_rate": 1.5868974581115887e-05, + "loss": 0.5568, + "step": 231650 + }, + { + "epoch": 2.047949928393359, + "grad_norm": 1.9802615642547607, + "learning_rate": 1.5867501193444015e-05, + "loss": 0.6382, + "step": 231660 + }, + { + "epoch": 2.048038331653671, + "grad_norm": 4.602457046508789, + "learning_rate": 1.5866027805772144e-05, + "loss": 0.5196, + "step": 231670 + }, + { + "epoch": 2.048126734913984, + "grad_norm": 2.451338052749634, + "learning_rate": 1.5864554418100275e-05, + "loss": 0.5595, + "step": 231680 + }, + { + "epoch": 2.048215138174296, + "grad_norm": 1.7430620193481445, + "learning_rate": 1.5863081030428404e-05, + "loss": 0.5137, + "step": 231690 + }, + { + "epoch": 2.048303541434608, + "grad_norm": 4.9068522453308105, + "learning_rate": 1.5861607642756532e-05, + "loss": 0.5266, + "step": 231700 + }, + { + "epoch": 2.04839194469492, + "grad_norm": 16.226215362548828, + "learning_rate": 1.586013425508466e-05, + "loss": 0.5763, + "step": 231710 + }, + { + "epoch": 2.0484803479552327, + "grad_norm": 3.0716404914855957, + "learning_rate": 1.5858660867412792e-05, + "loss": 0.5127, + "step": 231720 + }, + { + "epoch": 2.048568751215545, + "grad_norm": 3.158184051513672, + "learning_rate": 1.585718747974092e-05, + "loss": 0.563, + "step": 231730 + }, + { + "epoch": 2.048657154475857, + "grad_norm": 1.3168848752975464, + "learning_rate": 1.585571409206905e-05, + "loss": 0.5155, + "step": 231740 + }, + { + "epoch": 2.0487455577361695, + "grad_norm": 3.459087371826172, + "learning_rate": 1.585424070439718e-05, + "loss": 0.5293, + "step": 231750 + }, + { + "epoch": 2.0488339609964816, + "grad_norm": 1.5299887657165527, + "learning_rate": 1.585276731672531e-05, + "loss": 0.497, + "step": 231760 + }, + { + "epoch": 2.0489223642567937, + "grad_norm": 2.4056944847106934, + "learning_rate": 1.5851293929053437e-05, + "loss": 0.522, + "step": 231770 + }, + { + "epoch": 2.049010767517106, + "grad_norm": 4.995020866394043, + "learning_rate": 1.5849820541381566e-05, + "loss": 0.4196, + "step": 231780 + }, + { + "epoch": 2.0490991707774184, + "grad_norm": 5.26909065246582, + "learning_rate": 1.5848347153709697e-05, + "loss": 0.5549, + "step": 231790 + }, + { + "epoch": 2.0491875740377306, + "grad_norm": 3.6840174198150635, + "learning_rate": 1.5846873766037826e-05, + "loss": 0.5863, + "step": 231800 + }, + { + "epoch": 2.0492759772980427, + "grad_norm": 3.1539103984832764, + "learning_rate": 1.5845400378365954e-05, + "loss": 0.5518, + "step": 231810 + }, + { + "epoch": 2.049364380558355, + "grad_norm": 2.4849064350128174, + "learning_rate": 1.5843926990694082e-05, + "loss": 0.53, + "step": 231820 + }, + { + "epoch": 2.0494527838186674, + "grad_norm": 4.836664199829102, + "learning_rate": 1.5842453603022214e-05, + "loss": 0.5493, + "step": 231830 + }, + { + "epoch": 2.0495411870789795, + "grad_norm": 2.846653461456299, + "learning_rate": 1.5840980215350343e-05, + "loss": 0.541, + "step": 231840 + }, + { + "epoch": 2.0496295903392916, + "grad_norm": 2.632824659347534, + "learning_rate": 1.583950682767847e-05, + "loss": 0.4897, + "step": 231850 + }, + { + "epoch": 2.049717993599604, + "grad_norm": 9.093971252441406, + "learning_rate": 1.5838033440006603e-05, + "loss": 0.5715, + "step": 231860 + }, + { + "epoch": 2.0498063968599163, + "grad_norm": 1.6627018451690674, + "learning_rate": 1.583656005233473e-05, + "loss": 0.5028, + "step": 231870 + }, + { + "epoch": 2.0498948001202284, + "grad_norm": 3.376101493835449, + "learning_rate": 1.583508666466286e-05, + "loss": 0.5049, + "step": 231880 + }, + { + "epoch": 2.0499832033805405, + "grad_norm": 3.0024359226226807, + "learning_rate": 1.5833613276990988e-05, + "loss": 0.4028, + "step": 231890 + }, + { + "epoch": 2.050071606640853, + "grad_norm": 9.930014610290527, + "learning_rate": 1.583213988931912e-05, + "loss": 0.5391, + "step": 231900 + }, + { + "epoch": 2.050160009901165, + "grad_norm": 4.416248798370361, + "learning_rate": 1.5830666501647248e-05, + "loss": 0.4782, + "step": 231910 + }, + { + "epoch": 2.0502484131614773, + "grad_norm": 4.605274677276611, + "learning_rate": 1.5829193113975376e-05, + "loss": 0.4948, + "step": 231920 + }, + { + "epoch": 2.0503368164217894, + "grad_norm": 1.9919416904449463, + "learning_rate": 1.5827719726303508e-05, + "loss": 0.5023, + "step": 231930 + }, + { + "epoch": 2.050425219682102, + "grad_norm": 5.543881893157959, + "learning_rate": 1.5826246338631636e-05, + "loss": 0.6322, + "step": 231940 + }, + { + "epoch": 2.050513622942414, + "grad_norm": 3.325211763381958, + "learning_rate": 1.5824772950959765e-05, + "loss": 0.5921, + "step": 231950 + }, + { + "epoch": 2.0506020262027262, + "grad_norm": 0.8421772718429565, + "learning_rate": 1.5823299563287893e-05, + "loss": 0.6043, + "step": 231960 + }, + { + "epoch": 2.050690429463039, + "grad_norm": 5.795717239379883, + "learning_rate": 1.5821826175616025e-05, + "loss": 0.5545, + "step": 231970 + }, + { + "epoch": 2.050778832723351, + "grad_norm": 11.153934478759766, + "learning_rate": 1.5820352787944153e-05, + "loss": 0.4595, + "step": 231980 + }, + { + "epoch": 2.050867235983663, + "grad_norm": 3.954604387283325, + "learning_rate": 1.581887940027228e-05, + "loss": 0.4357, + "step": 231990 + }, + { + "epoch": 2.050955639243975, + "grad_norm": 3.5142061710357666, + "learning_rate": 1.581740601260041e-05, + "loss": 0.5391, + "step": 232000 + }, + { + "epoch": 2.0510440425042877, + "grad_norm": 3.8870229721069336, + "learning_rate": 1.581593262492854e-05, + "loss": 0.4903, + "step": 232010 + }, + { + "epoch": 2.0511324457646, + "grad_norm": 2.919037342071533, + "learning_rate": 1.581445923725667e-05, + "loss": 0.5548, + "step": 232020 + }, + { + "epoch": 2.051220849024912, + "grad_norm": 3.5879738330841064, + "learning_rate": 1.5812985849584798e-05, + "loss": 0.6236, + "step": 232030 + }, + { + "epoch": 2.051309252285224, + "grad_norm": 6.1330790519714355, + "learning_rate": 1.581151246191293e-05, + "loss": 0.5598, + "step": 232040 + }, + { + "epoch": 2.0513976555455367, + "grad_norm": 2.585853338241577, + "learning_rate": 1.5810039074241058e-05, + "loss": 0.5945, + "step": 232050 + }, + { + "epoch": 2.0514860588058488, + "grad_norm": 1.9390723705291748, + "learning_rate": 1.5808565686569187e-05, + "loss": 0.4926, + "step": 232060 + }, + { + "epoch": 2.051574462066161, + "grad_norm": 11.157991409301758, + "learning_rate": 1.5807092298897315e-05, + "loss": 0.5269, + "step": 232070 + }, + { + "epoch": 2.051662865326473, + "grad_norm": 2.5161056518554688, + "learning_rate": 1.5805618911225447e-05, + "loss": 0.5594, + "step": 232080 + }, + { + "epoch": 2.0517512685867856, + "grad_norm": 5.658996105194092, + "learning_rate": 1.5804145523553575e-05, + "loss": 0.6445, + "step": 232090 + }, + { + "epoch": 2.0518396718470977, + "grad_norm": 1.880904197692871, + "learning_rate": 1.5802672135881703e-05, + "loss": 0.4951, + "step": 232100 + }, + { + "epoch": 2.05192807510741, + "grad_norm": 6.212449550628662, + "learning_rate": 1.5801198748209832e-05, + "loss": 0.5525, + "step": 232110 + }, + { + "epoch": 2.0520164783677224, + "grad_norm": 1.9387264251708984, + "learning_rate": 1.5799725360537964e-05, + "loss": 0.4911, + "step": 232120 + }, + { + "epoch": 2.0521048816280345, + "grad_norm": 3.2162883281707764, + "learning_rate": 1.5798251972866092e-05, + "loss": 0.5017, + "step": 232130 + }, + { + "epoch": 2.0521932848883466, + "grad_norm": 4.569762706756592, + "learning_rate": 1.579677858519422e-05, + "loss": 0.448, + "step": 232140 + }, + { + "epoch": 2.0522816881486587, + "grad_norm": 2.8200905323028564, + "learning_rate": 1.5795305197522352e-05, + "loss": 0.5957, + "step": 232150 + }, + { + "epoch": 2.0523700914089713, + "grad_norm": 4.01318883895874, + "learning_rate": 1.579383180985048e-05, + "loss": 0.4658, + "step": 232160 + }, + { + "epoch": 2.0524584946692834, + "grad_norm": 2.6902830600738525, + "learning_rate": 1.579235842217861e-05, + "loss": 0.4794, + "step": 232170 + }, + { + "epoch": 2.0525468979295955, + "grad_norm": 14.536140441894531, + "learning_rate": 1.579088503450674e-05, + "loss": 0.4362, + "step": 232180 + }, + { + "epoch": 2.0526353011899077, + "grad_norm": 3.462282419204712, + "learning_rate": 1.578941164683487e-05, + "loss": 0.6445, + "step": 232190 + }, + { + "epoch": 2.0527237044502202, + "grad_norm": 2.5057718753814697, + "learning_rate": 1.5787938259162997e-05, + "loss": 0.6761, + "step": 232200 + }, + { + "epoch": 2.0528121077105324, + "grad_norm": 2.8004212379455566, + "learning_rate": 1.578646487149113e-05, + "loss": 0.6096, + "step": 232210 + }, + { + "epoch": 2.0529005109708445, + "grad_norm": 11.500385284423828, + "learning_rate": 1.5784991483819257e-05, + "loss": 0.5469, + "step": 232220 + }, + { + "epoch": 2.052988914231157, + "grad_norm": 2.389430522918701, + "learning_rate": 1.5783518096147386e-05, + "loss": 0.623, + "step": 232230 + }, + { + "epoch": 2.053077317491469, + "grad_norm": 1.9671823978424072, + "learning_rate": 1.5782044708475517e-05, + "loss": 0.5754, + "step": 232240 + }, + { + "epoch": 2.0531657207517813, + "grad_norm": 1.3044846057891846, + "learning_rate": 1.5780571320803646e-05, + "loss": 0.5779, + "step": 232250 + }, + { + "epoch": 2.0532541240120934, + "grad_norm": 2.362286329269409, + "learning_rate": 1.5779097933131774e-05, + "loss": 0.5335, + "step": 232260 + }, + { + "epoch": 2.053342527272406, + "grad_norm": 8.260251998901367, + "learning_rate": 1.5777624545459906e-05, + "loss": 0.6143, + "step": 232270 + }, + { + "epoch": 2.053430930532718, + "grad_norm": 4.929906845092773, + "learning_rate": 1.5776151157788034e-05, + "loss": 0.5484, + "step": 232280 + }, + { + "epoch": 2.05351933379303, + "grad_norm": 2.3731675148010254, + "learning_rate": 1.5774677770116162e-05, + "loss": 0.5058, + "step": 232290 + }, + { + "epoch": 2.0536077370533423, + "grad_norm": 2.4674441814422607, + "learning_rate": 1.5773204382444294e-05, + "loss": 0.4644, + "step": 232300 + }, + { + "epoch": 2.053696140313655, + "grad_norm": 1.9993362426757812, + "learning_rate": 1.5771730994772423e-05, + "loss": 0.5723, + "step": 232310 + }, + { + "epoch": 2.053784543573967, + "grad_norm": 3.6804347038269043, + "learning_rate": 1.577025760710055e-05, + "loss": 0.4972, + "step": 232320 + }, + { + "epoch": 2.053872946834279, + "grad_norm": 3.377811908721924, + "learning_rate": 1.5768784219428683e-05, + "loss": 0.478, + "step": 232330 + }, + { + "epoch": 2.0539613500945917, + "grad_norm": 16.1857852935791, + "learning_rate": 1.576731083175681e-05, + "loss": 0.5033, + "step": 232340 + }, + { + "epoch": 2.054049753354904, + "grad_norm": 10.684974670410156, + "learning_rate": 1.576583744408494e-05, + "loss": 0.4155, + "step": 232350 + }, + { + "epoch": 2.054138156615216, + "grad_norm": 3.017305612564087, + "learning_rate": 1.5764364056413068e-05, + "loss": 0.4439, + "step": 232360 + }, + { + "epoch": 2.054226559875528, + "grad_norm": 3.661722421646118, + "learning_rate": 1.57628906687412e-05, + "loss": 0.4798, + "step": 232370 + }, + { + "epoch": 2.0543149631358406, + "grad_norm": 4.736203193664551, + "learning_rate": 1.5761417281069328e-05, + "loss": 0.6297, + "step": 232380 + }, + { + "epoch": 2.0544033663961527, + "grad_norm": 0.7311581969261169, + "learning_rate": 1.5759943893397456e-05, + "loss": 0.4022, + "step": 232390 + }, + { + "epoch": 2.054491769656465, + "grad_norm": 2.1221816539764404, + "learning_rate": 1.5758470505725588e-05, + "loss": 0.4544, + "step": 232400 + }, + { + "epoch": 2.054580172916777, + "grad_norm": 3.8647336959838867, + "learning_rate": 1.5756997118053716e-05, + "loss": 0.609, + "step": 232410 + }, + { + "epoch": 2.0546685761770895, + "grad_norm": 6.023137092590332, + "learning_rate": 1.5755523730381845e-05, + "loss": 0.477, + "step": 232420 + }, + { + "epoch": 2.0547569794374017, + "grad_norm": 2.2725107669830322, + "learning_rate": 1.5754050342709973e-05, + "loss": 0.6767, + "step": 232430 + }, + { + "epoch": 2.0548453826977138, + "grad_norm": 2.400625705718994, + "learning_rate": 1.5752576955038105e-05, + "loss": 0.609, + "step": 232440 + }, + { + "epoch": 2.0549337859580263, + "grad_norm": 4.559525012969971, + "learning_rate": 1.5751103567366233e-05, + "loss": 0.4219, + "step": 232450 + }, + { + "epoch": 2.0550221892183385, + "grad_norm": 2.54927659034729, + "learning_rate": 1.574963017969436e-05, + "loss": 0.774, + "step": 232460 + }, + { + "epoch": 2.0551105924786506, + "grad_norm": 2.503234386444092, + "learning_rate": 1.574815679202249e-05, + "loss": 0.5763, + "step": 232470 + }, + { + "epoch": 2.0551989957389627, + "grad_norm": 2.6977767944335938, + "learning_rate": 1.574668340435062e-05, + "loss": 0.6144, + "step": 232480 + }, + { + "epoch": 2.0552873989992753, + "grad_norm": 0.8490464687347412, + "learning_rate": 1.574521001667875e-05, + "loss": 0.5294, + "step": 232490 + }, + { + "epoch": 2.0553758022595874, + "grad_norm": 5.797241687774658, + "learning_rate": 1.5743736629006878e-05, + "loss": 0.5325, + "step": 232500 + }, + { + "epoch": 2.0554642055198995, + "grad_norm": 6.328723430633545, + "learning_rate": 1.574226324133501e-05, + "loss": 0.5358, + "step": 232510 + }, + { + "epoch": 2.0555526087802116, + "grad_norm": 1.350393295288086, + "learning_rate": 1.574078985366314e-05, + "loss": 0.5322, + "step": 232520 + }, + { + "epoch": 2.055641012040524, + "grad_norm": 11.449645042419434, + "learning_rate": 1.5739316465991267e-05, + "loss": 0.455, + "step": 232530 + }, + { + "epoch": 2.0557294153008363, + "grad_norm": 2.4055213928222656, + "learning_rate": 1.5737843078319395e-05, + "loss": 0.502, + "step": 232540 + }, + { + "epoch": 2.0558178185611484, + "grad_norm": 6.425386905670166, + "learning_rate": 1.5736369690647527e-05, + "loss": 0.5976, + "step": 232550 + }, + { + "epoch": 2.055906221821461, + "grad_norm": 1.581001877784729, + "learning_rate": 1.5734896302975655e-05, + "loss": 0.507, + "step": 232560 + }, + { + "epoch": 2.055994625081773, + "grad_norm": 8.866503715515137, + "learning_rate": 1.5733422915303784e-05, + "loss": 0.4669, + "step": 232570 + }, + { + "epoch": 2.0560830283420852, + "grad_norm": 1.6587051153182983, + "learning_rate": 1.5731949527631912e-05, + "loss": 0.6174, + "step": 232580 + }, + { + "epoch": 2.0561714316023973, + "grad_norm": 3.1415131092071533, + "learning_rate": 1.5730476139960044e-05, + "loss": 0.5186, + "step": 232590 + }, + { + "epoch": 2.05625983486271, + "grad_norm": 15.773072242736816, + "learning_rate": 1.5729002752288172e-05, + "loss": 0.6621, + "step": 232600 + }, + { + "epoch": 2.056348238123022, + "grad_norm": 13.034499168395996, + "learning_rate": 1.57275293646163e-05, + "loss": 0.5948, + "step": 232610 + }, + { + "epoch": 2.056436641383334, + "grad_norm": 1.0541774034500122, + "learning_rate": 1.5726055976944432e-05, + "loss": 0.5414, + "step": 232620 + }, + { + "epoch": 2.0565250446436463, + "grad_norm": 22.101980209350586, + "learning_rate": 1.572458258927256e-05, + "loss": 0.5959, + "step": 232630 + }, + { + "epoch": 2.056613447903959, + "grad_norm": 3.8811333179473877, + "learning_rate": 1.572310920160069e-05, + "loss": 0.4809, + "step": 232640 + }, + { + "epoch": 2.056701851164271, + "grad_norm": 8.26369857788086, + "learning_rate": 1.5721635813928817e-05, + "loss": 0.5101, + "step": 232650 + }, + { + "epoch": 2.056790254424583, + "grad_norm": 1.261708378791809, + "learning_rate": 1.572016242625695e-05, + "loss": 0.6576, + "step": 232660 + }, + { + "epoch": 2.056878657684895, + "grad_norm": 4.858982086181641, + "learning_rate": 1.5718689038585077e-05, + "loss": 0.6659, + "step": 232670 + }, + { + "epoch": 2.0569670609452078, + "grad_norm": 3.236792802810669, + "learning_rate": 1.5717215650913206e-05, + "loss": 0.5968, + "step": 232680 + }, + { + "epoch": 2.05705546420552, + "grad_norm": 1.6625187397003174, + "learning_rate": 1.5715742263241337e-05, + "loss": 0.6351, + "step": 232690 + }, + { + "epoch": 2.057143867465832, + "grad_norm": 5.5977253913879395, + "learning_rate": 1.5714268875569466e-05, + "loss": 0.6515, + "step": 232700 + }, + { + "epoch": 2.0572322707261446, + "grad_norm": 4.204442501068115, + "learning_rate": 1.5712795487897594e-05, + "loss": 0.4435, + "step": 232710 + }, + { + "epoch": 2.0573206739864567, + "grad_norm": 1.6375216245651245, + "learning_rate": 1.5711322100225722e-05, + "loss": 0.5239, + "step": 232720 + }, + { + "epoch": 2.057409077246769, + "grad_norm": 2.1367201805114746, + "learning_rate": 1.5709848712553854e-05, + "loss": 0.5488, + "step": 232730 + }, + { + "epoch": 2.057497480507081, + "grad_norm": 3.4991295337677, + "learning_rate": 1.5708375324881982e-05, + "loss": 0.731, + "step": 232740 + }, + { + "epoch": 2.0575858837673935, + "grad_norm": 6.927596569061279, + "learning_rate": 1.570690193721011e-05, + "loss": 0.6827, + "step": 232750 + }, + { + "epoch": 2.0576742870277056, + "grad_norm": 0.9930360317230225, + "learning_rate": 1.570542854953824e-05, + "loss": 0.4404, + "step": 232760 + }, + { + "epoch": 2.0577626902880177, + "grad_norm": 2.7718727588653564, + "learning_rate": 1.570395516186637e-05, + "loss": 0.5412, + "step": 232770 + }, + { + "epoch": 2.05785109354833, + "grad_norm": 3.351569652557373, + "learning_rate": 1.57024817741945e-05, + "loss": 0.5702, + "step": 232780 + }, + { + "epoch": 2.0579394968086424, + "grad_norm": 1.893885850906372, + "learning_rate": 1.5701008386522628e-05, + "loss": 0.5435, + "step": 232790 + }, + { + "epoch": 2.0580279000689545, + "grad_norm": 1.5758800506591797, + "learning_rate": 1.569953499885076e-05, + "loss": 0.5147, + "step": 232800 + }, + { + "epoch": 2.0581163033292666, + "grad_norm": 5.5366315841674805, + "learning_rate": 1.5698061611178888e-05, + "loss": 0.4779, + "step": 232810 + }, + { + "epoch": 2.058204706589579, + "grad_norm": 5.055202960968018, + "learning_rate": 1.5696588223507016e-05, + "loss": 0.5835, + "step": 232820 + }, + { + "epoch": 2.0582931098498913, + "grad_norm": 2.474210500717163, + "learning_rate": 1.5695114835835144e-05, + "loss": 0.4905, + "step": 232830 + }, + { + "epoch": 2.0583815131102035, + "grad_norm": 10.510014533996582, + "learning_rate": 1.5693641448163276e-05, + "loss": 0.5369, + "step": 232840 + }, + { + "epoch": 2.0584699163705156, + "grad_norm": 6.502597332000732, + "learning_rate": 1.5692168060491405e-05, + "loss": 0.6071, + "step": 232850 + }, + { + "epoch": 2.058558319630828, + "grad_norm": 4.13629674911499, + "learning_rate": 1.5690694672819533e-05, + "loss": 0.5083, + "step": 232860 + }, + { + "epoch": 2.0586467228911403, + "grad_norm": 4.320540428161621, + "learning_rate": 1.5689221285147665e-05, + "loss": 0.5341, + "step": 232870 + }, + { + "epoch": 2.0587351261514524, + "grad_norm": 4.03468656539917, + "learning_rate": 1.5687747897475793e-05, + "loss": 0.4725, + "step": 232880 + }, + { + "epoch": 2.0588235294117645, + "grad_norm": 3.0790822505950928, + "learning_rate": 1.568627450980392e-05, + "loss": 0.7436, + "step": 232890 + }, + { + "epoch": 2.058911932672077, + "grad_norm": 1.4359678030014038, + "learning_rate": 1.568480112213205e-05, + "loss": 0.547, + "step": 232900 + }, + { + "epoch": 2.059000335932389, + "grad_norm": 4.174426555633545, + "learning_rate": 1.568332773446018e-05, + "loss": 0.6196, + "step": 232910 + }, + { + "epoch": 2.0590887391927013, + "grad_norm": 1.4196349382400513, + "learning_rate": 1.568185434678831e-05, + "loss": 0.5232, + "step": 232920 + }, + { + "epoch": 2.059177142453014, + "grad_norm": 1.4415967464447021, + "learning_rate": 1.5680380959116438e-05, + "loss": 0.5449, + "step": 232930 + }, + { + "epoch": 2.059265545713326, + "grad_norm": 1.9231523275375366, + "learning_rate": 1.5678907571444566e-05, + "loss": 0.4118, + "step": 232940 + }, + { + "epoch": 2.059353948973638, + "grad_norm": 2.2585837841033936, + "learning_rate": 1.5677434183772698e-05, + "loss": 0.4985, + "step": 232950 + }, + { + "epoch": 2.0594423522339502, + "grad_norm": 3.632296562194824, + "learning_rate": 1.5675960796100827e-05, + "loss": 0.5915, + "step": 232960 + }, + { + "epoch": 2.059530755494263, + "grad_norm": 11.843439102172852, + "learning_rate": 1.5674487408428955e-05, + "loss": 0.5089, + "step": 232970 + }, + { + "epoch": 2.059619158754575, + "grad_norm": 1.974780797958374, + "learning_rate": 1.5673014020757087e-05, + "loss": 0.47, + "step": 232980 + }, + { + "epoch": 2.059707562014887, + "grad_norm": 1.5138241052627563, + "learning_rate": 1.5671540633085215e-05, + "loss": 0.4956, + "step": 232990 + }, + { + "epoch": 2.059795965275199, + "grad_norm": 3.6658482551574707, + "learning_rate": 1.5670067245413343e-05, + "loss": 0.5793, + "step": 233000 + }, + { + "epoch": 2.0598843685355117, + "grad_norm": 1.1115801334381104, + "learning_rate": 1.5668593857741472e-05, + "loss": 0.4613, + "step": 233010 + }, + { + "epoch": 2.059972771795824, + "grad_norm": 0.8452221155166626, + "learning_rate": 1.5667120470069603e-05, + "loss": 0.5271, + "step": 233020 + }, + { + "epoch": 2.060061175056136, + "grad_norm": 2.7244746685028076, + "learning_rate": 1.5665647082397732e-05, + "loss": 0.4163, + "step": 233030 + }, + { + "epoch": 2.0601495783164485, + "grad_norm": 1.8605334758758545, + "learning_rate": 1.566417369472586e-05, + "loss": 0.4323, + "step": 233040 + }, + { + "epoch": 2.0602379815767606, + "grad_norm": 12.121393203735352, + "learning_rate": 1.5662700307053992e-05, + "loss": 0.6378, + "step": 233050 + }, + { + "epoch": 2.0603263848370728, + "grad_norm": 0.9520108699798584, + "learning_rate": 1.566122691938212e-05, + "loss": 0.5095, + "step": 233060 + }, + { + "epoch": 2.060414788097385, + "grad_norm": 2.1312050819396973, + "learning_rate": 1.565975353171025e-05, + "loss": 0.7, + "step": 233070 + }, + { + "epoch": 2.0605031913576974, + "grad_norm": 3.2304162979125977, + "learning_rate": 1.5658280144038377e-05, + "loss": 0.4861, + "step": 233080 + }, + { + "epoch": 2.0605915946180096, + "grad_norm": 1.5600264072418213, + "learning_rate": 1.565680675636651e-05, + "loss": 0.4468, + "step": 233090 + }, + { + "epoch": 2.0606799978783217, + "grad_norm": 3.028773546218872, + "learning_rate": 1.5655333368694637e-05, + "loss": 0.4079, + "step": 233100 + }, + { + "epoch": 2.060768401138634, + "grad_norm": 22.320194244384766, + "learning_rate": 1.5653859981022765e-05, + "loss": 0.5187, + "step": 233110 + }, + { + "epoch": 2.0608568043989464, + "grad_norm": 2.560748338699341, + "learning_rate": 1.5652386593350897e-05, + "loss": 0.5535, + "step": 233120 + }, + { + "epoch": 2.0609452076592585, + "grad_norm": 4.875572204589844, + "learning_rate": 1.5650913205679026e-05, + "loss": 0.5022, + "step": 233130 + }, + { + "epoch": 2.0610336109195706, + "grad_norm": 2.006239414215088, + "learning_rate": 1.5649439818007154e-05, + "loss": 0.5448, + "step": 233140 + }, + { + "epoch": 2.061122014179883, + "grad_norm": 2.4891035556793213, + "learning_rate": 1.5647966430335286e-05, + "loss": 0.4972, + "step": 233150 + }, + { + "epoch": 2.0612104174401953, + "grad_norm": 2.234797477722168, + "learning_rate": 1.5646493042663414e-05, + "loss": 0.5176, + "step": 233160 + }, + { + "epoch": 2.0612988207005074, + "grad_norm": 11.231446266174316, + "learning_rate": 1.5645019654991542e-05, + "loss": 0.603, + "step": 233170 + }, + { + "epoch": 2.0613872239608195, + "grad_norm": 1.632785677909851, + "learning_rate": 1.5643546267319674e-05, + "loss": 0.4669, + "step": 233180 + }, + { + "epoch": 2.061475627221132, + "grad_norm": 1.9496651887893677, + "learning_rate": 1.5642072879647802e-05, + "loss": 0.3762, + "step": 233190 + }, + { + "epoch": 2.061564030481444, + "grad_norm": 4.683070182800293, + "learning_rate": 1.564059949197593e-05, + "loss": 0.5643, + "step": 233200 + }, + { + "epoch": 2.0616524337417563, + "grad_norm": 1.6859397888183594, + "learning_rate": 1.5639126104304063e-05, + "loss": 0.4984, + "step": 233210 + }, + { + "epoch": 2.0617408370020684, + "grad_norm": 3.2659077644348145, + "learning_rate": 1.563765271663219e-05, + "loss": 0.5738, + "step": 233220 + }, + { + "epoch": 2.061829240262381, + "grad_norm": 2.148397445678711, + "learning_rate": 1.563617932896032e-05, + "loss": 0.6256, + "step": 233230 + }, + { + "epoch": 2.061917643522693, + "grad_norm": 2.4518280029296875, + "learning_rate": 1.563470594128845e-05, + "loss": 0.6655, + "step": 233240 + }, + { + "epoch": 2.0620060467830053, + "grad_norm": 5.915921688079834, + "learning_rate": 1.563323255361658e-05, + "loss": 0.5265, + "step": 233250 + }, + { + "epoch": 2.0620944500433174, + "grad_norm": 1.3652362823486328, + "learning_rate": 1.5631759165944708e-05, + "loss": 0.6484, + "step": 233260 + }, + { + "epoch": 2.06218285330363, + "grad_norm": 3.580735921859741, + "learning_rate": 1.563028577827284e-05, + "loss": 0.4635, + "step": 233270 + }, + { + "epoch": 2.062271256563942, + "grad_norm": 5.279926776885986, + "learning_rate": 1.5628812390600968e-05, + "loss": 0.5676, + "step": 233280 + }, + { + "epoch": 2.062359659824254, + "grad_norm": 8.966619491577148, + "learning_rate": 1.5627339002929096e-05, + "loss": 0.5093, + "step": 233290 + }, + { + "epoch": 2.0624480630845667, + "grad_norm": 1.4041380882263184, + "learning_rate": 1.5625865615257224e-05, + "loss": 0.6513, + "step": 233300 + }, + { + "epoch": 2.062536466344879, + "grad_norm": 0.9365440011024475, + "learning_rate": 1.5624392227585356e-05, + "loss": 0.4742, + "step": 233310 + }, + { + "epoch": 2.062624869605191, + "grad_norm": 7.722444534301758, + "learning_rate": 1.5622918839913485e-05, + "loss": 0.6596, + "step": 233320 + }, + { + "epoch": 2.062713272865503, + "grad_norm": 1.4477300643920898, + "learning_rate": 1.5621445452241613e-05, + "loss": 0.4947, + "step": 233330 + }, + { + "epoch": 2.0628016761258157, + "grad_norm": 2.665956735610962, + "learning_rate": 1.5619972064569745e-05, + "loss": 0.4813, + "step": 233340 + }, + { + "epoch": 2.062890079386128, + "grad_norm": 2.8050591945648193, + "learning_rate": 1.5618498676897873e-05, + "loss": 0.5149, + "step": 233350 + }, + { + "epoch": 2.06297848264644, + "grad_norm": 1.151573896408081, + "learning_rate": 1.5617025289226e-05, + "loss": 0.4962, + "step": 233360 + }, + { + "epoch": 2.0630668859067525, + "grad_norm": 2.794217348098755, + "learning_rate": 1.561555190155413e-05, + "loss": 0.5343, + "step": 233370 + }, + { + "epoch": 2.0631552891670646, + "grad_norm": 2.339550256729126, + "learning_rate": 1.561407851388226e-05, + "loss": 0.5268, + "step": 233380 + }, + { + "epoch": 2.0632436924273767, + "grad_norm": 13.287505149841309, + "learning_rate": 1.561260512621039e-05, + "loss": 0.5175, + "step": 233390 + }, + { + "epoch": 2.063332095687689, + "grad_norm": 4.312925815582275, + "learning_rate": 1.5611131738538518e-05, + "loss": 0.537, + "step": 233400 + }, + { + "epoch": 2.0634204989480014, + "grad_norm": 6.188470840454102, + "learning_rate": 1.5609658350866647e-05, + "loss": 0.5978, + "step": 233410 + }, + { + "epoch": 2.0635089022083135, + "grad_norm": 4.443595886230469, + "learning_rate": 1.5608184963194778e-05, + "loss": 0.5414, + "step": 233420 + }, + { + "epoch": 2.0635973054686256, + "grad_norm": 9.348119735717773, + "learning_rate": 1.5606711575522907e-05, + "loss": 0.5432, + "step": 233430 + }, + { + "epoch": 2.0636857087289378, + "grad_norm": 3.086570978164673, + "learning_rate": 1.5605238187851035e-05, + "loss": 0.5407, + "step": 233440 + }, + { + "epoch": 2.0637741119892503, + "grad_norm": 1.5600178241729736, + "learning_rate": 1.5603764800179167e-05, + "loss": 0.56, + "step": 233450 + }, + { + "epoch": 2.0638625152495624, + "grad_norm": 2.21791672706604, + "learning_rate": 1.5602291412507295e-05, + "loss": 0.6428, + "step": 233460 + }, + { + "epoch": 2.0639509185098746, + "grad_norm": 2.209655284881592, + "learning_rate": 1.5600818024835423e-05, + "loss": 0.5632, + "step": 233470 + }, + { + "epoch": 2.0640393217701867, + "grad_norm": 8.63415241241455, + "learning_rate": 1.5599344637163552e-05, + "loss": 0.4244, + "step": 233480 + }, + { + "epoch": 2.0641277250304992, + "grad_norm": 6.176987648010254, + "learning_rate": 1.5597871249491684e-05, + "loss": 0.4642, + "step": 233490 + }, + { + "epoch": 2.0642161282908114, + "grad_norm": 1.1406521797180176, + "learning_rate": 1.5596397861819812e-05, + "loss": 0.4976, + "step": 233500 + }, + { + "epoch": 2.0643045315511235, + "grad_norm": 1.9696946144104004, + "learning_rate": 1.559492447414794e-05, + "loss": 0.6077, + "step": 233510 + }, + { + "epoch": 2.064392934811436, + "grad_norm": 3.8707401752471924, + "learning_rate": 1.5593451086476072e-05, + "loss": 0.5526, + "step": 233520 + }, + { + "epoch": 2.064481338071748, + "grad_norm": 3.8376686573028564, + "learning_rate": 1.55919776988042e-05, + "loss": 0.5707, + "step": 233530 + }, + { + "epoch": 2.0645697413320603, + "grad_norm": 8.121301651000977, + "learning_rate": 1.559050431113233e-05, + "loss": 0.4393, + "step": 233540 + }, + { + "epoch": 2.0646581445923724, + "grad_norm": 4.857401371002197, + "learning_rate": 1.5589030923460457e-05, + "loss": 0.5704, + "step": 233550 + }, + { + "epoch": 2.064746547852685, + "grad_norm": 2.942481756210327, + "learning_rate": 1.558755753578859e-05, + "loss": 0.5439, + "step": 233560 + }, + { + "epoch": 2.064834951112997, + "grad_norm": 7.71659517288208, + "learning_rate": 1.5586084148116717e-05, + "loss": 0.4208, + "step": 233570 + }, + { + "epoch": 2.064923354373309, + "grad_norm": 4.945372581481934, + "learning_rate": 1.5584610760444846e-05, + "loss": 0.5623, + "step": 233580 + }, + { + "epoch": 2.0650117576336213, + "grad_norm": 1.2617449760437012, + "learning_rate": 1.5583137372772974e-05, + "loss": 0.5477, + "step": 233590 + }, + { + "epoch": 2.065100160893934, + "grad_norm": 7.661891460418701, + "learning_rate": 1.5581663985101106e-05, + "loss": 0.5612, + "step": 233600 + }, + { + "epoch": 2.065188564154246, + "grad_norm": 4.973225116729736, + "learning_rate": 1.5580190597429234e-05, + "loss": 0.5963, + "step": 233610 + }, + { + "epoch": 2.065276967414558, + "grad_norm": 1.3271300792694092, + "learning_rate": 1.5578717209757362e-05, + "loss": 0.5055, + "step": 233620 + }, + { + "epoch": 2.0653653706748707, + "grad_norm": 4.487350940704346, + "learning_rate": 1.5577243822085494e-05, + "loss": 0.5725, + "step": 233630 + }, + { + "epoch": 2.065453773935183, + "grad_norm": 2.7205007076263428, + "learning_rate": 1.5575770434413622e-05, + "loss": 0.6255, + "step": 233640 + }, + { + "epoch": 2.065542177195495, + "grad_norm": 1.0460608005523682, + "learning_rate": 1.557429704674175e-05, + "loss": 0.5447, + "step": 233650 + }, + { + "epoch": 2.065630580455807, + "grad_norm": 1.1489644050598145, + "learning_rate": 1.557282365906988e-05, + "loss": 0.5162, + "step": 233660 + }, + { + "epoch": 2.0657189837161196, + "grad_norm": 5.125308990478516, + "learning_rate": 1.557135027139801e-05, + "loss": 0.3771, + "step": 233670 + }, + { + "epoch": 2.0658073869764317, + "grad_norm": 3.046475887298584, + "learning_rate": 1.556987688372614e-05, + "loss": 0.5403, + "step": 233680 + }, + { + "epoch": 2.065895790236744, + "grad_norm": 2.562744379043579, + "learning_rate": 1.5568403496054268e-05, + "loss": 0.5943, + "step": 233690 + }, + { + "epoch": 2.065984193497056, + "grad_norm": 7.07082462310791, + "learning_rate": 1.5566930108382396e-05, + "loss": 0.5446, + "step": 233700 + }, + { + "epoch": 2.0660725967573685, + "grad_norm": 1.3667078018188477, + "learning_rate": 1.5565456720710528e-05, + "loss": 0.6118, + "step": 233710 + }, + { + "epoch": 2.0661610000176807, + "grad_norm": 9.14773178100586, + "learning_rate": 1.5563983333038656e-05, + "loss": 0.4832, + "step": 233720 + }, + { + "epoch": 2.066249403277993, + "grad_norm": 1.6121585369110107, + "learning_rate": 1.5562509945366784e-05, + "loss": 0.5198, + "step": 233730 + }, + { + "epoch": 2.0663378065383053, + "grad_norm": 3.192275285720825, + "learning_rate": 1.5561036557694916e-05, + "loss": 0.4742, + "step": 233740 + }, + { + "epoch": 2.0664262097986175, + "grad_norm": 2.521320343017578, + "learning_rate": 1.5559563170023044e-05, + "loss": 0.5701, + "step": 233750 + }, + { + "epoch": 2.0665146130589296, + "grad_norm": 2.6699490547180176, + "learning_rate": 1.5558089782351173e-05, + "loss": 0.4743, + "step": 233760 + }, + { + "epoch": 2.0666030163192417, + "grad_norm": 0.7513478398323059, + "learning_rate": 1.55566163946793e-05, + "loss": 0.6284, + "step": 233770 + }, + { + "epoch": 2.0666914195795543, + "grad_norm": 1.777999758720398, + "learning_rate": 1.5555143007007433e-05, + "loss": 0.4781, + "step": 233780 + }, + { + "epoch": 2.0667798228398664, + "grad_norm": 7.985141277313232, + "learning_rate": 1.555366961933556e-05, + "loss": 0.555, + "step": 233790 + }, + { + "epoch": 2.0668682261001785, + "grad_norm": 3.418877363204956, + "learning_rate": 1.555219623166369e-05, + "loss": 0.5213, + "step": 233800 + }, + { + "epoch": 2.0669566293604906, + "grad_norm": 2.5481297969818115, + "learning_rate": 1.555072284399182e-05, + "loss": 0.5429, + "step": 233810 + }, + { + "epoch": 2.067045032620803, + "grad_norm": 5.440483093261719, + "learning_rate": 1.554924945631995e-05, + "loss": 0.5796, + "step": 233820 + }, + { + "epoch": 2.0671334358811153, + "grad_norm": 2.590834140777588, + "learning_rate": 1.5547776068648078e-05, + "loss": 0.53, + "step": 233830 + }, + { + "epoch": 2.0672218391414274, + "grad_norm": 1.5293880701065063, + "learning_rate": 1.5546302680976206e-05, + "loss": 0.6418, + "step": 233840 + }, + { + "epoch": 2.0673102424017396, + "grad_norm": 6.0740132331848145, + "learning_rate": 1.5544829293304338e-05, + "loss": 0.5387, + "step": 233850 + }, + { + "epoch": 2.067398645662052, + "grad_norm": 1.8395590782165527, + "learning_rate": 1.5543355905632467e-05, + "loss": 0.6671, + "step": 233860 + }, + { + "epoch": 2.0674870489223642, + "grad_norm": 1.6989926099777222, + "learning_rate": 1.5541882517960595e-05, + "loss": 0.5309, + "step": 233870 + }, + { + "epoch": 2.0675754521826764, + "grad_norm": 4.231560707092285, + "learning_rate": 1.5540409130288723e-05, + "loss": 0.4813, + "step": 233880 + }, + { + "epoch": 2.067663855442989, + "grad_norm": 1.7609984874725342, + "learning_rate": 1.5538935742616855e-05, + "loss": 0.6201, + "step": 233890 + }, + { + "epoch": 2.067752258703301, + "grad_norm": 2.276296377182007, + "learning_rate": 1.5537462354944983e-05, + "loss": 0.5228, + "step": 233900 + }, + { + "epoch": 2.067840661963613, + "grad_norm": 2.1341712474823, + "learning_rate": 1.553598896727311e-05, + "loss": 0.5166, + "step": 233910 + }, + { + "epoch": 2.0679290652239253, + "grad_norm": 2.1887760162353516, + "learning_rate": 1.5534515579601243e-05, + "loss": 0.45, + "step": 233920 + }, + { + "epoch": 2.068017468484238, + "grad_norm": 1.114351511001587, + "learning_rate": 1.5533042191929372e-05, + "loss": 0.6854, + "step": 233930 + }, + { + "epoch": 2.06810587174455, + "grad_norm": 5.880609512329102, + "learning_rate": 1.55315688042575e-05, + "loss": 0.5934, + "step": 233940 + }, + { + "epoch": 2.068194275004862, + "grad_norm": 10.143177032470703, + "learning_rate": 1.553009541658563e-05, + "loss": 0.4088, + "step": 233950 + }, + { + "epoch": 2.0682826782651746, + "grad_norm": 13.683798789978027, + "learning_rate": 1.552862202891376e-05, + "loss": 0.5488, + "step": 233960 + }, + { + "epoch": 2.0683710815254868, + "grad_norm": 1.9536701440811157, + "learning_rate": 1.552714864124189e-05, + "loss": 0.5511, + "step": 233970 + }, + { + "epoch": 2.068459484785799, + "grad_norm": 2.739705801010132, + "learning_rate": 1.5525675253570017e-05, + "loss": 0.6373, + "step": 233980 + }, + { + "epoch": 2.068547888046111, + "grad_norm": 8.29617691040039, + "learning_rate": 1.552420186589815e-05, + "loss": 0.5589, + "step": 233990 + }, + { + "epoch": 2.0686362913064236, + "grad_norm": 1.4963918924331665, + "learning_rate": 1.5522728478226277e-05, + "loss": 0.4606, + "step": 234000 + }, + { + "epoch": 2.0687246945667357, + "grad_norm": 8.87845230102539, + "learning_rate": 1.5521255090554405e-05, + "loss": 0.704, + "step": 234010 + }, + { + "epoch": 2.068813097827048, + "grad_norm": 1.9716249704360962, + "learning_rate": 1.5519781702882534e-05, + "loss": 0.4921, + "step": 234020 + }, + { + "epoch": 2.06890150108736, + "grad_norm": 2.761329174041748, + "learning_rate": 1.5518308315210665e-05, + "loss": 0.6837, + "step": 234030 + }, + { + "epoch": 2.0689899043476725, + "grad_norm": 1.2886667251586914, + "learning_rate": 1.5516834927538794e-05, + "loss": 0.5232, + "step": 234040 + }, + { + "epoch": 2.0690783076079846, + "grad_norm": 1.1491878032684326, + "learning_rate": 1.5515361539866922e-05, + "loss": 0.5598, + "step": 234050 + }, + { + "epoch": 2.0691667108682967, + "grad_norm": 2.06026029586792, + "learning_rate": 1.5513888152195054e-05, + "loss": 0.5173, + "step": 234060 + }, + { + "epoch": 2.069255114128609, + "grad_norm": 2.9801318645477295, + "learning_rate": 1.5512414764523182e-05, + "loss": 0.5817, + "step": 234070 + }, + { + "epoch": 2.0693435173889214, + "grad_norm": 3.353050947189331, + "learning_rate": 1.551094137685131e-05, + "loss": 0.5218, + "step": 234080 + }, + { + "epoch": 2.0694319206492335, + "grad_norm": 2.208454132080078, + "learning_rate": 1.5509467989179442e-05, + "loss": 0.5997, + "step": 234090 + }, + { + "epoch": 2.0695203239095457, + "grad_norm": 4.6981635093688965, + "learning_rate": 1.550799460150757e-05, + "loss": 0.6896, + "step": 234100 + }, + { + "epoch": 2.069608727169858, + "grad_norm": 18.994739532470703, + "learning_rate": 1.55065212138357e-05, + "loss": 0.5624, + "step": 234110 + }, + { + "epoch": 2.0696971304301703, + "grad_norm": 3.5385324954986572, + "learning_rate": 1.550504782616383e-05, + "loss": 0.4574, + "step": 234120 + }, + { + "epoch": 2.0697855336904825, + "grad_norm": 1.6206737756729126, + "learning_rate": 1.550357443849196e-05, + "loss": 0.6044, + "step": 234130 + }, + { + "epoch": 2.0698739369507946, + "grad_norm": 4.207000255584717, + "learning_rate": 1.5502101050820088e-05, + "loss": 0.5549, + "step": 234140 + }, + { + "epoch": 2.069962340211107, + "grad_norm": 1.8997361660003662, + "learning_rate": 1.550062766314822e-05, + "loss": 0.5716, + "step": 234150 + }, + { + "epoch": 2.0700507434714193, + "grad_norm": 8.831233978271484, + "learning_rate": 1.5499154275476348e-05, + "loss": 0.704, + "step": 234160 + }, + { + "epoch": 2.0701391467317314, + "grad_norm": 1.6916531324386597, + "learning_rate": 1.549768088780448e-05, + "loss": 0.5589, + "step": 234170 + }, + { + "epoch": 2.0702275499920435, + "grad_norm": 2.135178804397583, + "learning_rate": 1.5496207500132608e-05, + "loss": 0.5306, + "step": 234180 + }, + { + "epoch": 2.070315953252356, + "grad_norm": 1.6312179565429688, + "learning_rate": 1.5494734112460736e-05, + "loss": 0.552, + "step": 234190 + }, + { + "epoch": 2.070404356512668, + "grad_norm": 3.7227983474731445, + "learning_rate": 1.5493260724788864e-05, + "loss": 0.6047, + "step": 234200 + }, + { + "epoch": 2.0704927597729803, + "grad_norm": 8.878120422363281, + "learning_rate": 1.5491787337116996e-05, + "loss": 0.4301, + "step": 234210 + }, + { + "epoch": 2.070581163033293, + "grad_norm": 4.666725158691406, + "learning_rate": 1.5490313949445125e-05, + "loss": 0.4194, + "step": 234220 + }, + { + "epoch": 2.070669566293605, + "grad_norm": 3.929607391357422, + "learning_rate": 1.5488840561773253e-05, + "loss": 0.4599, + "step": 234230 + }, + { + "epoch": 2.070757969553917, + "grad_norm": 5.024203777313232, + "learning_rate": 1.548736717410138e-05, + "loss": 0.4488, + "step": 234240 + }, + { + "epoch": 2.0708463728142292, + "grad_norm": 3.9222261905670166, + "learning_rate": 1.5485893786429513e-05, + "loss": 0.5288, + "step": 234250 + }, + { + "epoch": 2.070934776074542, + "grad_norm": 5.327709674835205, + "learning_rate": 1.548442039875764e-05, + "loss": 0.5388, + "step": 234260 + }, + { + "epoch": 2.071023179334854, + "grad_norm": 2.4685428142547607, + "learning_rate": 1.548294701108577e-05, + "loss": 0.6376, + "step": 234270 + }, + { + "epoch": 2.071111582595166, + "grad_norm": 4.4171319007873535, + "learning_rate": 1.54814736234139e-05, + "loss": 0.516, + "step": 234280 + }, + { + "epoch": 2.071199985855478, + "grad_norm": 1.029421091079712, + "learning_rate": 1.548000023574203e-05, + "loss": 0.4345, + "step": 234290 + }, + { + "epoch": 2.0712883891157907, + "grad_norm": 4.146646022796631, + "learning_rate": 1.5478526848070158e-05, + "loss": 0.5949, + "step": 234300 + }, + { + "epoch": 2.071376792376103, + "grad_norm": 2.0076234340667725, + "learning_rate": 1.5477053460398286e-05, + "loss": 0.4633, + "step": 234310 + }, + { + "epoch": 2.071465195636415, + "grad_norm": 1.624782681465149, + "learning_rate": 1.5475580072726418e-05, + "loss": 0.6027, + "step": 234320 + }, + { + "epoch": 2.0715535988967275, + "grad_norm": 4.828230381011963, + "learning_rate": 1.5474106685054547e-05, + "loss": 0.5022, + "step": 234330 + }, + { + "epoch": 2.0716420021570396, + "grad_norm": 1.8550783395767212, + "learning_rate": 1.5472633297382675e-05, + "loss": 0.5246, + "step": 234340 + }, + { + "epoch": 2.0717304054173518, + "grad_norm": 1.1485100984573364, + "learning_rate": 1.5471159909710803e-05, + "loss": 0.5113, + "step": 234350 + }, + { + "epoch": 2.071818808677664, + "grad_norm": 6.985781669616699, + "learning_rate": 1.5469686522038935e-05, + "loss": 0.4931, + "step": 234360 + }, + { + "epoch": 2.0719072119379764, + "grad_norm": 1.7253764867782593, + "learning_rate": 1.5468213134367063e-05, + "loss": 0.5619, + "step": 234370 + }, + { + "epoch": 2.0719956151982886, + "grad_norm": 2.0708229541778564, + "learning_rate": 1.5466739746695192e-05, + "loss": 0.5281, + "step": 234380 + }, + { + "epoch": 2.0720840184586007, + "grad_norm": 4.326268196105957, + "learning_rate": 1.5465266359023323e-05, + "loss": 0.528, + "step": 234390 + }, + { + "epoch": 2.072172421718913, + "grad_norm": 0.8208890557289124, + "learning_rate": 1.5463792971351452e-05, + "loss": 0.4746, + "step": 234400 + }, + { + "epoch": 2.0722608249792254, + "grad_norm": 7.302486896514893, + "learning_rate": 1.546231958367958e-05, + "loss": 0.4576, + "step": 234410 + }, + { + "epoch": 2.0723492282395375, + "grad_norm": 2.121028184890747, + "learning_rate": 1.546084619600771e-05, + "loss": 0.4114, + "step": 234420 + }, + { + "epoch": 2.0724376314998496, + "grad_norm": 4.57228946685791, + "learning_rate": 1.545937280833584e-05, + "loss": 0.5349, + "step": 234430 + }, + { + "epoch": 2.0725260347601617, + "grad_norm": 2.7882676124572754, + "learning_rate": 1.545789942066397e-05, + "loss": 0.6236, + "step": 234440 + }, + { + "epoch": 2.0726144380204743, + "grad_norm": 2.9251163005828857, + "learning_rate": 1.5456426032992097e-05, + "loss": 0.497, + "step": 234450 + }, + { + "epoch": 2.0727028412807864, + "grad_norm": 4.256241798400879, + "learning_rate": 1.545495264532023e-05, + "loss": 0.594, + "step": 234460 + }, + { + "epoch": 2.0727912445410985, + "grad_norm": 1.1736671924591064, + "learning_rate": 1.5453479257648357e-05, + "loss": 0.4422, + "step": 234470 + }, + { + "epoch": 2.072879647801411, + "grad_norm": 3.4411282539367676, + "learning_rate": 1.5452005869976485e-05, + "loss": 0.4923, + "step": 234480 + }, + { + "epoch": 2.072968051061723, + "grad_norm": 4.44528341293335, + "learning_rate": 1.5450532482304614e-05, + "loss": 0.5121, + "step": 234490 + }, + { + "epoch": 2.0730564543220353, + "grad_norm": 3.144818067550659, + "learning_rate": 1.5449059094632746e-05, + "loss": 0.6045, + "step": 234500 + }, + { + "epoch": 2.0731448575823475, + "grad_norm": 5.156505584716797, + "learning_rate": 1.5447585706960874e-05, + "loss": 0.5962, + "step": 234510 + }, + { + "epoch": 2.07323326084266, + "grad_norm": 2.140868663787842, + "learning_rate": 1.5446112319289002e-05, + "loss": 0.5929, + "step": 234520 + }, + { + "epoch": 2.073321664102972, + "grad_norm": 16.916118621826172, + "learning_rate": 1.544463893161713e-05, + "loss": 0.5797, + "step": 234530 + }, + { + "epoch": 2.0734100673632843, + "grad_norm": 3.05837345123291, + "learning_rate": 1.5443165543945262e-05, + "loss": 0.5981, + "step": 234540 + }, + { + "epoch": 2.073498470623597, + "grad_norm": 6.138031005859375, + "learning_rate": 1.544169215627339e-05, + "loss": 0.5632, + "step": 234550 + }, + { + "epoch": 2.073586873883909, + "grad_norm": 4.811116695404053, + "learning_rate": 1.544021876860152e-05, + "loss": 0.5512, + "step": 234560 + }, + { + "epoch": 2.073675277144221, + "grad_norm": 7.1277852058410645, + "learning_rate": 1.543874538092965e-05, + "loss": 0.5234, + "step": 234570 + }, + { + "epoch": 2.073763680404533, + "grad_norm": 1.6328223943710327, + "learning_rate": 1.543727199325778e-05, + "loss": 0.4299, + "step": 234580 + }, + { + "epoch": 2.0738520836648457, + "grad_norm": 7.803845405578613, + "learning_rate": 1.5435798605585908e-05, + "loss": 0.5979, + "step": 234590 + }, + { + "epoch": 2.073940486925158, + "grad_norm": 6.772291660308838, + "learning_rate": 1.5434325217914036e-05, + "loss": 0.6415, + "step": 234600 + }, + { + "epoch": 2.07402889018547, + "grad_norm": 3.3583824634552, + "learning_rate": 1.5432851830242168e-05, + "loss": 0.6676, + "step": 234610 + }, + { + "epoch": 2.074117293445782, + "grad_norm": 6.4028778076171875, + "learning_rate": 1.5431378442570296e-05, + "loss": 0.555, + "step": 234620 + }, + { + "epoch": 2.0742056967060947, + "grad_norm": 6.72327995300293, + "learning_rate": 1.5429905054898424e-05, + "loss": 0.7236, + "step": 234630 + }, + { + "epoch": 2.074294099966407, + "grad_norm": 3.046621799468994, + "learning_rate": 1.5428431667226556e-05, + "loss": 0.5834, + "step": 234640 + }, + { + "epoch": 2.074382503226719, + "grad_norm": 3.2505874633789062, + "learning_rate": 1.5426958279554684e-05, + "loss": 0.4786, + "step": 234650 + }, + { + "epoch": 2.074470906487031, + "grad_norm": 4.344619274139404, + "learning_rate": 1.5425484891882813e-05, + "loss": 0.5742, + "step": 234660 + }, + { + "epoch": 2.0745593097473436, + "grad_norm": 5.583522796630859, + "learning_rate": 1.542401150421094e-05, + "loss": 0.5498, + "step": 234670 + }, + { + "epoch": 2.0746477130076557, + "grad_norm": 1.871364712715149, + "learning_rate": 1.5422538116539073e-05, + "loss": 0.6503, + "step": 234680 + }, + { + "epoch": 2.074736116267968, + "grad_norm": 3.8546998500823975, + "learning_rate": 1.54210647288672e-05, + "loss": 0.4759, + "step": 234690 + }, + { + "epoch": 2.0748245195282804, + "grad_norm": 1.3926606178283691, + "learning_rate": 1.541959134119533e-05, + "loss": 0.6712, + "step": 234700 + }, + { + "epoch": 2.0749129227885925, + "grad_norm": 2.762019634246826, + "learning_rate": 1.5418117953523458e-05, + "loss": 0.4122, + "step": 234710 + }, + { + "epoch": 2.0750013260489046, + "grad_norm": 3.2734930515289307, + "learning_rate": 1.541664456585159e-05, + "loss": 0.3681, + "step": 234720 + }, + { + "epoch": 2.0750897293092168, + "grad_norm": 2.2399697303771973, + "learning_rate": 1.5415171178179718e-05, + "loss": 0.5341, + "step": 234730 + }, + { + "epoch": 2.0751781325695293, + "grad_norm": 2.1827428340911865, + "learning_rate": 1.5413697790507846e-05, + "loss": 0.4952, + "step": 234740 + }, + { + "epoch": 2.0752665358298414, + "grad_norm": 9.778084754943848, + "learning_rate": 1.5412224402835978e-05, + "loss": 0.4543, + "step": 234750 + }, + { + "epoch": 2.0753549390901536, + "grad_norm": 3.6555538177490234, + "learning_rate": 1.5410751015164106e-05, + "loss": 0.3807, + "step": 234760 + }, + { + "epoch": 2.0754433423504657, + "grad_norm": 2.367147922515869, + "learning_rate": 1.5409277627492235e-05, + "loss": 0.5097, + "step": 234770 + }, + { + "epoch": 2.0755317456107782, + "grad_norm": 11.634543418884277, + "learning_rate": 1.5407804239820363e-05, + "loss": 0.5382, + "step": 234780 + }, + { + "epoch": 2.0756201488710904, + "grad_norm": 1.1405590772628784, + "learning_rate": 1.5406330852148495e-05, + "loss": 0.4866, + "step": 234790 + }, + { + "epoch": 2.0757085521314025, + "grad_norm": 7.482063293457031, + "learning_rate": 1.5404857464476623e-05, + "loss": 0.5256, + "step": 234800 + }, + { + "epoch": 2.075796955391715, + "grad_norm": 2.7694332599639893, + "learning_rate": 1.540338407680475e-05, + "loss": 0.5886, + "step": 234810 + }, + { + "epoch": 2.075885358652027, + "grad_norm": 2.289332866668701, + "learning_rate": 1.540191068913288e-05, + "loss": 0.4949, + "step": 234820 + }, + { + "epoch": 2.0759737619123393, + "grad_norm": 5.992410182952881, + "learning_rate": 1.5400437301461012e-05, + "loss": 0.4775, + "step": 234830 + }, + { + "epoch": 2.0760621651726514, + "grad_norm": 4.14433479309082, + "learning_rate": 1.539896391378914e-05, + "loss": 0.5014, + "step": 234840 + }, + { + "epoch": 2.076150568432964, + "grad_norm": 6.455384254455566, + "learning_rate": 1.539749052611727e-05, + "loss": 0.5956, + "step": 234850 + }, + { + "epoch": 2.076238971693276, + "grad_norm": 3.178196907043457, + "learning_rate": 1.53960171384454e-05, + "loss": 0.5747, + "step": 234860 + }, + { + "epoch": 2.076327374953588, + "grad_norm": 6.349433422088623, + "learning_rate": 1.539454375077353e-05, + "loss": 0.6612, + "step": 234870 + }, + { + "epoch": 2.0764157782139003, + "grad_norm": 1.854823112487793, + "learning_rate": 1.5393070363101657e-05, + "loss": 0.472, + "step": 234880 + }, + { + "epoch": 2.076504181474213, + "grad_norm": 1.1186264753341675, + "learning_rate": 1.5391596975429785e-05, + "loss": 0.5323, + "step": 234890 + }, + { + "epoch": 2.076592584734525, + "grad_norm": 3.2885656356811523, + "learning_rate": 1.5390123587757917e-05, + "loss": 0.4748, + "step": 234900 + }, + { + "epoch": 2.076680987994837, + "grad_norm": 5.528682708740234, + "learning_rate": 1.5388650200086045e-05, + "loss": 0.6814, + "step": 234910 + }, + { + "epoch": 2.0767693912551497, + "grad_norm": 12.447989463806152, + "learning_rate": 1.5387176812414174e-05, + "loss": 0.5328, + "step": 234920 + }, + { + "epoch": 2.076857794515462, + "grad_norm": 1.835616111755371, + "learning_rate": 1.5385703424742305e-05, + "loss": 0.3571, + "step": 234930 + }, + { + "epoch": 2.076946197775774, + "grad_norm": 1.4236358404159546, + "learning_rate": 1.5384230037070434e-05, + "loss": 0.5674, + "step": 234940 + }, + { + "epoch": 2.077034601036086, + "grad_norm": 5.740789413452148, + "learning_rate": 1.5382756649398562e-05, + "loss": 0.5173, + "step": 234950 + }, + { + "epoch": 2.0771230042963986, + "grad_norm": 2.5573360919952393, + "learning_rate": 1.5381283261726694e-05, + "loss": 0.6147, + "step": 234960 + }, + { + "epoch": 2.0772114075567107, + "grad_norm": 14.161341667175293, + "learning_rate": 1.5379809874054822e-05, + "loss": 0.5519, + "step": 234970 + }, + { + "epoch": 2.077299810817023, + "grad_norm": 3.1678028106689453, + "learning_rate": 1.537833648638295e-05, + "loss": 0.5977, + "step": 234980 + }, + { + "epoch": 2.077388214077335, + "grad_norm": 4.525453090667725, + "learning_rate": 1.5376863098711082e-05, + "loss": 0.5413, + "step": 234990 + }, + { + "epoch": 2.0774766173376475, + "grad_norm": 18.43514060974121, + "learning_rate": 1.537538971103921e-05, + "loss": 0.4675, + "step": 235000 + }, + { + "epoch": 2.0775650205979597, + "grad_norm": 1.0014605522155762, + "learning_rate": 1.537391632336734e-05, + "loss": 0.5265, + "step": 235010 + }, + { + "epoch": 2.077653423858272, + "grad_norm": 3.759934663772583, + "learning_rate": 1.537244293569547e-05, + "loss": 0.6185, + "step": 235020 + }, + { + "epoch": 2.077741827118584, + "grad_norm": 12.303557395935059, + "learning_rate": 1.53709695480236e-05, + "loss": 0.5384, + "step": 235030 + }, + { + "epoch": 2.0778302303788965, + "grad_norm": 3.5496857166290283, + "learning_rate": 1.5369496160351727e-05, + "loss": 0.4346, + "step": 235040 + }, + { + "epoch": 2.0779186336392086, + "grad_norm": 4.4985246658325195, + "learning_rate": 1.536802277267986e-05, + "loss": 0.6134, + "step": 235050 + }, + { + "epoch": 2.0780070368995207, + "grad_norm": 1.347710132598877, + "learning_rate": 1.5366549385007988e-05, + "loss": 0.4994, + "step": 235060 + }, + { + "epoch": 2.0780954401598333, + "grad_norm": 2.122184991836548, + "learning_rate": 1.5365075997336116e-05, + "loss": 0.6011, + "step": 235070 + }, + { + "epoch": 2.0781838434201454, + "grad_norm": 5.015860080718994, + "learning_rate": 1.5363602609664248e-05, + "loss": 0.5405, + "step": 235080 + }, + { + "epoch": 2.0782722466804575, + "grad_norm": 3.3702826499938965, + "learning_rate": 1.5362129221992376e-05, + "loss": 0.4984, + "step": 235090 + }, + { + "epoch": 2.0783606499407696, + "grad_norm": 6.307773113250732, + "learning_rate": 1.5360655834320504e-05, + "loss": 0.4995, + "step": 235100 + }, + { + "epoch": 2.078449053201082, + "grad_norm": 0.8391619920730591, + "learning_rate": 1.5359182446648636e-05, + "loss": 0.5395, + "step": 235110 + }, + { + "epoch": 2.0785374564613943, + "grad_norm": 2.5374321937561035, + "learning_rate": 1.5357709058976764e-05, + "loss": 0.4819, + "step": 235120 + }, + { + "epoch": 2.0786258597217064, + "grad_norm": 1.2772438526153564, + "learning_rate": 1.5356235671304893e-05, + "loss": 0.4878, + "step": 235130 + }, + { + "epoch": 2.078714262982019, + "grad_norm": 1.3359925746917725, + "learning_rate": 1.535476228363302e-05, + "loss": 0.4253, + "step": 235140 + }, + { + "epoch": 2.078802666242331, + "grad_norm": 2.026435136795044, + "learning_rate": 1.5353288895961153e-05, + "loss": 0.6537, + "step": 235150 + }, + { + "epoch": 2.0788910695026432, + "grad_norm": 1.1564325094223022, + "learning_rate": 1.535181550828928e-05, + "loss": 0.6454, + "step": 235160 + }, + { + "epoch": 2.0789794727629554, + "grad_norm": 3.122222661972046, + "learning_rate": 1.535034212061741e-05, + "loss": 0.6337, + "step": 235170 + }, + { + "epoch": 2.079067876023268, + "grad_norm": 4.961802005767822, + "learning_rate": 1.5348868732945538e-05, + "loss": 0.6052, + "step": 235180 + }, + { + "epoch": 2.07915627928358, + "grad_norm": 2.787627935409546, + "learning_rate": 1.534739534527367e-05, + "loss": 0.5165, + "step": 235190 + }, + { + "epoch": 2.079244682543892, + "grad_norm": 5.852365493774414, + "learning_rate": 1.5345921957601798e-05, + "loss": 0.5237, + "step": 235200 + }, + { + "epoch": 2.0793330858042043, + "grad_norm": 2.236734628677368, + "learning_rate": 1.5344448569929926e-05, + "loss": 0.5799, + "step": 235210 + }, + { + "epoch": 2.079421489064517, + "grad_norm": 2.0226211547851562, + "learning_rate": 1.5342975182258058e-05, + "loss": 0.5642, + "step": 235220 + }, + { + "epoch": 2.079509892324829, + "grad_norm": 1.2063610553741455, + "learning_rate": 1.5341501794586187e-05, + "loss": 0.5408, + "step": 235230 + }, + { + "epoch": 2.079598295585141, + "grad_norm": 4.506719589233398, + "learning_rate": 1.5340028406914315e-05, + "loss": 0.6545, + "step": 235240 + }, + { + "epoch": 2.079686698845453, + "grad_norm": 2.8342316150665283, + "learning_rate": 1.5338555019242443e-05, + "loss": 0.4198, + "step": 235250 + }, + { + "epoch": 2.0797751021057658, + "grad_norm": 5.314000129699707, + "learning_rate": 1.5337081631570575e-05, + "loss": 0.6183, + "step": 235260 + }, + { + "epoch": 2.079863505366078, + "grad_norm": 3.649683713912964, + "learning_rate": 1.5335608243898703e-05, + "loss": 0.6198, + "step": 235270 + }, + { + "epoch": 2.07995190862639, + "grad_norm": 1.3316240310668945, + "learning_rate": 1.5334134856226832e-05, + "loss": 0.5285, + "step": 235280 + }, + { + "epoch": 2.0800403118867026, + "grad_norm": 2.8754093647003174, + "learning_rate": 1.533266146855496e-05, + "loss": 0.4777, + "step": 235290 + }, + { + "epoch": 2.0801287151470147, + "grad_norm": 2.555805206298828, + "learning_rate": 1.5331188080883092e-05, + "loss": 0.4467, + "step": 235300 + }, + { + "epoch": 2.080217118407327, + "grad_norm": 5.635794162750244, + "learning_rate": 1.532971469321122e-05, + "loss": 0.5944, + "step": 235310 + }, + { + "epoch": 2.080305521667639, + "grad_norm": 4.90559196472168, + "learning_rate": 1.532824130553935e-05, + "loss": 0.4174, + "step": 235320 + }, + { + "epoch": 2.0803939249279515, + "grad_norm": 2.9512712955474854, + "learning_rate": 1.532676791786748e-05, + "loss": 0.5773, + "step": 235330 + }, + { + "epoch": 2.0804823281882636, + "grad_norm": 3.4657816886901855, + "learning_rate": 1.532529453019561e-05, + "loss": 0.6139, + "step": 235340 + }, + { + "epoch": 2.0805707314485757, + "grad_norm": 3.0065407752990723, + "learning_rate": 1.5323821142523737e-05, + "loss": 0.4739, + "step": 235350 + }, + { + "epoch": 2.080659134708888, + "grad_norm": 6.14694881439209, + "learning_rate": 1.5322347754851865e-05, + "loss": 0.627, + "step": 235360 + }, + { + "epoch": 2.0807475379692004, + "grad_norm": 3.371912956237793, + "learning_rate": 1.5320874367179997e-05, + "loss": 0.4932, + "step": 235370 + }, + { + "epoch": 2.0808359412295125, + "grad_norm": 2.4259281158447266, + "learning_rate": 1.5319400979508125e-05, + "loss": 0.6639, + "step": 235380 + }, + { + "epoch": 2.0809243444898247, + "grad_norm": 9.96768569946289, + "learning_rate": 1.5317927591836254e-05, + "loss": 0.4122, + "step": 235390 + }, + { + "epoch": 2.0810127477501372, + "grad_norm": 1.895201563835144, + "learning_rate": 1.5316454204164386e-05, + "loss": 0.4557, + "step": 235400 + }, + { + "epoch": 2.0811011510104493, + "grad_norm": 4.7876434326171875, + "learning_rate": 1.5314980816492514e-05, + "loss": 0.4734, + "step": 235410 + }, + { + "epoch": 2.0811895542707615, + "grad_norm": 9.018458366394043, + "learning_rate": 1.5313507428820642e-05, + "loss": 0.6239, + "step": 235420 + }, + { + "epoch": 2.0812779575310736, + "grad_norm": 3.3068294525146484, + "learning_rate": 1.531203404114877e-05, + "loss": 0.4975, + "step": 235430 + }, + { + "epoch": 2.081366360791386, + "grad_norm": 3.5367536544799805, + "learning_rate": 1.5310560653476902e-05, + "loss": 0.4851, + "step": 235440 + }, + { + "epoch": 2.0814547640516983, + "grad_norm": 2.29510760307312, + "learning_rate": 1.530908726580503e-05, + "loss": 0.5466, + "step": 235450 + }, + { + "epoch": 2.0815431673120104, + "grad_norm": 4.672278881072998, + "learning_rate": 1.530761387813316e-05, + "loss": 0.5348, + "step": 235460 + }, + { + "epoch": 2.0816315705723225, + "grad_norm": 3.4688072204589844, + "learning_rate": 1.5306140490461287e-05, + "loss": 0.5612, + "step": 235470 + }, + { + "epoch": 2.081719973832635, + "grad_norm": 7.125442028045654, + "learning_rate": 1.530466710278942e-05, + "loss": 0.606, + "step": 235480 + }, + { + "epoch": 2.081808377092947, + "grad_norm": 1.4540719985961914, + "learning_rate": 1.5303193715117547e-05, + "loss": 0.6711, + "step": 235490 + }, + { + "epoch": 2.0818967803532593, + "grad_norm": 2.2034261226654053, + "learning_rate": 1.5301720327445676e-05, + "loss": 0.6025, + "step": 235500 + }, + { + "epoch": 2.081985183613572, + "grad_norm": 1.4936803579330444, + "learning_rate": 1.5300246939773808e-05, + "loss": 0.4445, + "step": 235510 + }, + { + "epoch": 2.082073586873884, + "grad_norm": 13.735296249389648, + "learning_rate": 1.5298773552101936e-05, + "loss": 0.4967, + "step": 235520 + }, + { + "epoch": 2.082161990134196, + "grad_norm": 2.3747169971466064, + "learning_rate": 1.5297300164430064e-05, + "loss": 0.4721, + "step": 235530 + }, + { + "epoch": 2.0822503933945082, + "grad_norm": 7.834507465362549, + "learning_rate": 1.5295826776758193e-05, + "loss": 0.4612, + "step": 235540 + }, + { + "epoch": 2.082338796654821, + "grad_norm": 7.189184665679932, + "learning_rate": 1.5294353389086324e-05, + "loss": 0.524, + "step": 235550 + }, + { + "epoch": 2.082427199915133, + "grad_norm": 2.304093837738037, + "learning_rate": 1.5292880001414453e-05, + "loss": 0.5178, + "step": 235560 + }, + { + "epoch": 2.082515603175445, + "grad_norm": 2.936788320541382, + "learning_rate": 1.529140661374258e-05, + "loss": 0.6299, + "step": 235570 + }, + { + "epoch": 2.082604006435757, + "grad_norm": 4.66724967956543, + "learning_rate": 1.5289933226070713e-05, + "loss": 0.448, + "step": 235580 + }, + { + "epoch": 2.0826924096960697, + "grad_norm": 1.8339934349060059, + "learning_rate": 1.528845983839884e-05, + "loss": 0.674, + "step": 235590 + }, + { + "epoch": 2.082780812956382, + "grad_norm": 3.142303466796875, + "learning_rate": 1.528698645072697e-05, + "loss": 0.508, + "step": 235600 + }, + { + "epoch": 2.082869216216694, + "grad_norm": 4.153080463409424, + "learning_rate": 1.5285513063055098e-05, + "loss": 0.5958, + "step": 235610 + }, + { + "epoch": 2.082957619477006, + "grad_norm": 8.030028343200684, + "learning_rate": 1.528403967538323e-05, + "loss": 0.4747, + "step": 235620 + }, + { + "epoch": 2.0830460227373186, + "grad_norm": 3.073543071746826, + "learning_rate": 1.5282566287711358e-05, + "loss": 0.5689, + "step": 235630 + }, + { + "epoch": 2.0831344259976308, + "grad_norm": 3.711402654647827, + "learning_rate": 1.5281092900039486e-05, + "loss": 0.4563, + "step": 235640 + }, + { + "epoch": 2.083222829257943, + "grad_norm": 1.053275227546692, + "learning_rate": 1.5279619512367615e-05, + "loss": 0.5512, + "step": 235650 + }, + { + "epoch": 2.0833112325182555, + "grad_norm": 1.645991325378418, + "learning_rate": 1.5278146124695746e-05, + "loss": 0.5817, + "step": 235660 + }, + { + "epoch": 2.0833996357785676, + "grad_norm": 3.234671115875244, + "learning_rate": 1.5276672737023875e-05, + "loss": 0.5683, + "step": 235670 + }, + { + "epoch": 2.0834880390388797, + "grad_norm": 0.6957482099533081, + "learning_rate": 1.5275199349352003e-05, + "loss": 0.5066, + "step": 235680 + }, + { + "epoch": 2.083576442299192, + "grad_norm": 2.1270506381988525, + "learning_rate": 1.5273725961680135e-05, + "loss": 0.4883, + "step": 235690 + }, + { + "epoch": 2.0836648455595044, + "grad_norm": 14.347164154052734, + "learning_rate": 1.5272252574008263e-05, + "loss": 0.6012, + "step": 235700 + }, + { + "epoch": 2.0837532488198165, + "grad_norm": 2.944049835205078, + "learning_rate": 1.527077918633639e-05, + "loss": 0.6247, + "step": 235710 + }, + { + "epoch": 2.0838416520801286, + "grad_norm": 3.2231602668762207, + "learning_rate": 1.526930579866452e-05, + "loss": 0.435, + "step": 235720 + }, + { + "epoch": 2.083930055340441, + "grad_norm": 5.679676055908203, + "learning_rate": 1.526783241099265e-05, + "loss": 0.5577, + "step": 235730 + }, + { + "epoch": 2.0840184586007533, + "grad_norm": 1.0155150890350342, + "learning_rate": 1.526635902332078e-05, + "loss": 0.5536, + "step": 235740 + }, + { + "epoch": 2.0841068618610654, + "grad_norm": 2.0221121311187744, + "learning_rate": 1.526488563564891e-05, + "loss": 0.5615, + "step": 235750 + }, + { + "epoch": 2.0841952651213775, + "grad_norm": 0.8385308980941772, + "learning_rate": 1.526341224797704e-05, + "loss": 0.3872, + "step": 235760 + }, + { + "epoch": 2.08428366838169, + "grad_norm": 3.0247085094451904, + "learning_rate": 1.526193886030517e-05, + "loss": 0.4588, + "step": 235770 + }, + { + "epoch": 2.0843720716420022, + "grad_norm": 6.403876781463623, + "learning_rate": 1.5260465472633297e-05, + "loss": 0.5743, + "step": 235780 + }, + { + "epoch": 2.0844604749023143, + "grad_norm": 3.4650983810424805, + "learning_rate": 1.5258992084961425e-05, + "loss": 0.6083, + "step": 235790 + }, + { + "epoch": 2.0845488781626265, + "grad_norm": 1.139920711517334, + "learning_rate": 1.5257518697289559e-05, + "loss": 0.5235, + "step": 235800 + }, + { + "epoch": 2.084637281422939, + "grad_norm": 1.955649495124817, + "learning_rate": 1.5256045309617687e-05, + "loss": 0.6542, + "step": 235810 + }, + { + "epoch": 2.084725684683251, + "grad_norm": 2.617950916290283, + "learning_rate": 1.5254571921945815e-05, + "loss": 0.5231, + "step": 235820 + }, + { + "epoch": 2.0848140879435633, + "grad_norm": 3.540703773498535, + "learning_rate": 1.5253098534273944e-05, + "loss": 0.6493, + "step": 235830 + }, + { + "epoch": 2.0849024912038754, + "grad_norm": 2.9357755184173584, + "learning_rate": 1.5251625146602075e-05, + "loss": 0.4868, + "step": 235840 + }, + { + "epoch": 2.084990894464188, + "grad_norm": 5.302179336547852, + "learning_rate": 1.5250151758930204e-05, + "loss": 0.4385, + "step": 235850 + }, + { + "epoch": 2.0850792977245, + "grad_norm": 1.0946992635726929, + "learning_rate": 1.5248678371258332e-05, + "loss": 0.4368, + "step": 235860 + }, + { + "epoch": 2.085167700984812, + "grad_norm": 3.8016645908355713, + "learning_rate": 1.5247204983586464e-05, + "loss": 0.5707, + "step": 235870 + }, + { + "epoch": 2.0852561042451248, + "grad_norm": 6.607137680053711, + "learning_rate": 1.5245731595914592e-05, + "loss": 0.5828, + "step": 235880 + }, + { + "epoch": 2.085344507505437, + "grad_norm": 0.760259211063385, + "learning_rate": 1.524425820824272e-05, + "loss": 0.534, + "step": 235890 + }, + { + "epoch": 2.085432910765749, + "grad_norm": 2.5225744247436523, + "learning_rate": 1.5242784820570849e-05, + "loss": 0.5225, + "step": 235900 + }, + { + "epoch": 2.085521314026061, + "grad_norm": 5.43554162979126, + "learning_rate": 1.524131143289898e-05, + "loss": 0.5796, + "step": 235910 + }, + { + "epoch": 2.0856097172863737, + "grad_norm": 1.2399890422821045, + "learning_rate": 1.5239838045227109e-05, + "loss": 0.4954, + "step": 235920 + }, + { + "epoch": 2.085698120546686, + "grad_norm": 3.1444294452667236, + "learning_rate": 1.5238364657555237e-05, + "loss": 0.5556, + "step": 235930 + }, + { + "epoch": 2.085786523806998, + "grad_norm": 3.526614189147949, + "learning_rate": 1.5236891269883366e-05, + "loss": 0.4991, + "step": 235940 + }, + { + "epoch": 2.08587492706731, + "grad_norm": 2.0699048042297363, + "learning_rate": 1.5235417882211497e-05, + "loss": 0.5922, + "step": 235950 + }, + { + "epoch": 2.0859633303276226, + "grad_norm": 3.9802422523498535, + "learning_rate": 1.5233944494539626e-05, + "loss": 0.5254, + "step": 235960 + }, + { + "epoch": 2.0860517335879347, + "grad_norm": 3.513880491256714, + "learning_rate": 1.5232471106867754e-05, + "loss": 0.6408, + "step": 235970 + }, + { + "epoch": 2.086140136848247, + "grad_norm": 21.687049865722656, + "learning_rate": 1.5230997719195886e-05, + "loss": 0.7368, + "step": 235980 + }, + { + "epoch": 2.0862285401085594, + "grad_norm": 1.7563294172286987, + "learning_rate": 1.5229524331524014e-05, + "loss": 0.6883, + "step": 235990 + }, + { + "epoch": 2.0863169433688715, + "grad_norm": 12.85245132446289, + "learning_rate": 1.5228050943852143e-05, + "loss": 0.4441, + "step": 236000 + }, + { + "epoch": 2.0864053466291836, + "grad_norm": 2.5863678455352783, + "learning_rate": 1.5226577556180271e-05, + "loss": 0.5027, + "step": 236010 + }, + { + "epoch": 2.0864937498894958, + "grad_norm": 6.722099304199219, + "learning_rate": 1.5225104168508403e-05, + "loss": 0.4599, + "step": 236020 + }, + { + "epoch": 2.0865821531498083, + "grad_norm": 7.559728145599365, + "learning_rate": 1.5223630780836531e-05, + "loss": 0.5695, + "step": 236030 + }, + { + "epoch": 2.0866705564101204, + "grad_norm": 4.051757335662842, + "learning_rate": 1.522215739316466e-05, + "loss": 0.4569, + "step": 236040 + }, + { + "epoch": 2.0867589596704326, + "grad_norm": 2.9302620887756348, + "learning_rate": 1.5220684005492791e-05, + "loss": 0.5115, + "step": 236050 + }, + { + "epoch": 2.0868473629307447, + "grad_norm": 3.010341167449951, + "learning_rate": 1.521921061782092e-05, + "loss": 0.4479, + "step": 236060 + }, + { + "epoch": 2.0869357661910573, + "grad_norm": 4.263801574707031, + "learning_rate": 1.5217737230149048e-05, + "loss": 0.589, + "step": 236070 + }, + { + "epoch": 2.0870241694513694, + "grad_norm": 4.449808597564697, + "learning_rate": 1.5216263842477176e-05, + "loss": 0.5178, + "step": 236080 + }, + { + "epoch": 2.0871125727116815, + "grad_norm": 2.3196849822998047, + "learning_rate": 1.5214790454805308e-05, + "loss": 0.6772, + "step": 236090 + }, + { + "epoch": 2.087200975971994, + "grad_norm": 1.4862501621246338, + "learning_rate": 1.5213317067133436e-05, + "loss": 0.5258, + "step": 236100 + }, + { + "epoch": 2.087289379232306, + "grad_norm": 3.0488100051879883, + "learning_rate": 1.5211843679461565e-05, + "loss": 0.5201, + "step": 236110 + }, + { + "epoch": 2.0873777824926183, + "grad_norm": 3.724518060684204, + "learning_rate": 1.5210370291789695e-05, + "loss": 0.5123, + "step": 236120 + }, + { + "epoch": 2.0874661857529304, + "grad_norm": 1.226907730102539, + "learning_rate": 1.5208896904117825e-05, + "loss": 0.5695, + "step": 236130 + }, + { + "epoch": 2.087554589013243, + "grad_norm": 4.974011421203613, + "learning_rate": 1.5207423516445953e-05, + "loss": 0.5339, + "step": 236140 + }, + { + "epoch": 2.087642992273555, + "grad_norm": 4.445333003997803, + "learning_rate": 1.5205950128774083e-05, + "loss": 0.7197, + "step": 236150 + }, + { + "epoch": 2.087731395533867, + "grad_norm": 1.3671797513961792, + "learning_rate": 1.5204476741102213e-05, + "loss": 0.5225, + "step": 236160 + }, + { + "epoch": 2.0878197987941793, + "grad_norm": 7.3110032081604, + "learning_rate": 1.5203003353430342e-05, + "loss": 0.5656, + "step": 236170 + }, + { + "epoch": 2.087908202054492, + "grad_norm": 3.4759411811828613, + "learning_rate": 1.5201529965758472e-05, + "loss": 0.4875, + "step": 236180 + }, + { + "epoch": 2.087996605314804, + "grad_norm": 2.7246603965759277, + "learning_rate": 1.52000565780866e-05, + "loss": 0.5016, + "step": 236190 + }, + { + "epoch": 2.088085008575116, + "grad_norm": 6.735143661499023, + "learning_rate": 1.519858319041473e-05, + "loss": 0.6102, + "step": 236200 + }, + { + "epoch": 2.0881734118354287, + "grad_norm": 1.5556901693344116, + "learning_rate": 1.519710980274286e-05, + "loss": 0.4417, + "step": 236210 + }, + { + "epoch": 2.088261815095741, + "grad_norm": 7.295099258422852, + "learning_rate": 1.5195636415070988e-05, + "loss": 0.5039, + "step": 236220 + }, + { + "epoch": 2.088350218356053, + "grad_norm": 1.7095205783843994, + "learning_rate": 1.5194163027399119e-05, + "loss": 0.4794, + "step": 236230 + }, + { + "epoch": 2.088438621616365, + "grad_norm": 2.8763840198516846, + "learning_rate": 1.5192689639727249e-05, + "loss": 0.5521, + "step": 236240 + }, + { + "epoch": 2.0885270248766776, + "grad_norm": 2.5834357738494873, + "learning_rate": 1.5191216252055377e-05, + "loss": 0.4559, + "step": 236250 + }, + { + "epoch": 2.0886154281369897, + "grad_norm": 2.189363479614258, + "learning_rate": 1.5189742864383505e-05, + "loss": 0.5111, + "step": 236260 + }, + { + "epoch": 2.088703831397302, + "grad_norm": 3.6508584022521973, + "learning_rate": 1.5188269476711637e-05, + "loss": 0.5026, + "step": 236270 + }, + { + "epoch": 2.088792234657614, + "grad_norm": 2.0389106273651123, + "learning_rate": 1.5186796089039765e-05, + "loss": 0.5432, + "step": 236280 + }, + { + "epoch": 2.0888806379179266, + "grad_norm": 2.299022674560547, + "learning_rate": 1.5185322701367894e-05, + "loss": 0.584, + "step": 236290 + }, + { + "epoch": 2.0889690411782387, + "grad_norm": 6.18102502822876, + "learning_rate": 1.5183849313696022e-05, + "loss": 0.4438, + "step": 236300 + }, + { + "epoch": 2.089057444438551, + "grad_norm": 1.33731210231781, + "learning_rate": 1.5182375926024154e-05, + "loss": 0.5689, + "step": 236310 + }, + { + "epoch": 2.0891458476988634, + "grad_norm": 1.2905175685882568, + "learning_rate": 1.5180902538352282e-05, + "loss": 0.6013, + "step": 236320 + }, + { + "epoch": 2.0892342509591755, + "grad_norm": 2.7056281566619873, + "learning_rate": 1.517942915068041e-05, + "loss": 0.5815, + "step": 236330 + }, + { + "epoch": 2.0893226542194876, + "grad_norm": 6.963068008422852, + "learning_rate": 1.5177955763008542e-05, + "loss": 0.5758, + "step": 236340 + }, + { + "epoch": 2.0894110574797997, + "grad_norm": 3.617516279220581, + "learning_rate": 1.517648237533667e-05, + "loss": 0.5251, + "step": 236350 + }, + { + "epoch": 2.0894994607401123, + "grad_norm": 5.311575412750244, + "learning_rate": 1.5175008987664799e-05, + "loss": 0.5757, + "step": 236360 + }, + { + "epoch": 2.0895878640004244, + "grad_norm": 11.443158149719238, + "learning_rate": 1.5173535599992927e-05, + "loss": 0.4668, + "step": 236370 + }, + { + "epoch": 2.0896762672607365, + "grad_norm": 2.051553249359131, + "learning_rate": 1.5172062212321059e-05, + "loss": 0.4774, + "step": 236380 + }, + { + "epoch": 2.0897646705210486, + "grad_norm": 0.9097640514373779, + "learning_rate": 1.5170588824649187e-05, + "loss": 0.5662, + "step": 236390 + }, + { + "epoch": 2.089853073781361, + "grad_norm": 2.522975444793701, + "learning_rate": 1.5169115436977316e-05, + "loss": 0.443, + "step": 236400 + }, + { + "epoch": 2.0899414770416733, + "grad_norm": 2.0570425987243652, + "learning_rate": 1.5167642049305444e-05, + "loss": 0.5101, + "step": 236410 + }, + { + "epoch": 2.0900298803019854, + "grad_norm": 3.4381279945373535, + "learning_rate": 1.5166168661633576e-05, + "loss": 0.5256, + "step": 236420 + }, + { + "epoch": 2.0901182835622976, + "grad_norm": 11.221440315246582, + "learning_rate": 1.5164695273961704e-05, + "loss": 0.5413, + "step": 236430 + }, + { + "epoch": 2.09020668682261, + "grad_norm": 2.4512016773223877, + "learning_rate": 1.5163221886289833e-05, + "loss": 0.4696, + "step": 236440 + }, + { + "epoch": 2.0902950900829222, + "grad_norm": 1.5028125047683716, + "learning_rate": 1.5161748498617964e-05, + "loss": 0.5954, + "step": 236450 + }, + { + "epoch": 2.0903834933432344, + "grad_norm": 2.024628162384033, + "learning_rate": 1.5160275110946093e-05, + "loss": 0.4738, + "step": 236460 + }, + { + "epoch": 2.090471896603547, + "grad_norm": 5.6808342933654785, + "learning_rate": 1.5158801723274221e-05, + "loss": 0.4638, + "step": 236470 + }, + { + "epoch": 2.090560299863859, + "grad_norm": 2.265744686126709, + "learning_rate": 1.515732833560235e-05, + "loss": 0.5757, + "step": 236480 + }, + { + "epoch": 2.090648703124171, + "grad_norm": 3.016300678253174, + "learning_rate": 1.5155854947930481e-05, + "loss": 0.5788, + "step": 236490 + }, + { + "epoch": 2.0907371063844833, + "grad_norm": 6.474188327789307, + "learning_rate": 1.515438156025861e-05, + "loss": 0.5535, + "step": 236500 + }, + { + "epoch": 2.090825509644796, + "grad_norm": 3.801280975341797, + "learning_rate": 1.5152908172586738e-05, + "loss": 0.6404, + "step": 236510 + }, + { + "epoch": 2.090913912905108, + "grad_norm": 1.3492531776428223, + "learning_rate": 1.515143478491487e-05, + "loss": 0.5074, + "step": 236520 + }, + { + "epoch": 2.09100231616542, + "grad_norm": 5.42991304397583, + "learning_rate": 1.5149961397242998e-05, + "loss": 0.5461, + "step": 236530 + }, + { + "epoch": 2.091090719425732, + "grad_norm": 4.947376251220703, + "learning_rate": 1.5148488009571126e-05, + "loss": 0.4793, + "step": 236540 + }, + { + "epoch": 2.0911791226860448, + "grad_norm": 3.0080995559692383, + "learning_rate": 1.5147014621899255e-05, + "loss": 0.7225, + "step": 236550 + }, + { + "epoch": 2.091267525946357, + "grad_norm": 5.010528564453125, + "learning_rate": 1.5145541234227386e-05, + "loss": 0.4251, + "step": 236560 + }, + { + "epoch": 2.091355929206669, + "grad_norm": 8.112334251403809, + "learning_rate": 1.5144067846555515e-05, + "loss": 0.5348, + "step": 236570 + }, + { + "epoch": 2.0914443324669816, + "grad_norm": 2.1110267639160156, + "learning_rate": 1.5142594458883643e-05, + "loss": 0.4468, + "step": 236580 + }, + { + "epoch": 2.0915327357272937, + "grad_norm": 4.412904739379883, + "learning_rate": 1.5141121071211773e-05, + "loss": 0.6212, + "step": 236590 + }, + { + "epoch": 2.091621138987606, + "grad_norm": 5.6892595291137695, + "learning_rate": 1.5139647683539903e-05, + "loss": 0.6316, + "step": 236600 + }, + { + "epoch": 2.091709542247918, + "grad_norm": 4.606527328491211, + "learning_rate": 1.5138174295868032e-05, + "loss": 0.543, + "step": 236610 + }, + { + "epoch": 2.0917979455082305, + "grad_norm": 2.120344877243042, + "learning_rate": 1.5136700908196162e-05, + "loss": 0.5107, + "step": 236620 + }, + { + "epoch": 2.0918863487685426, + "grad_norm": 1.9685099124908447, + "learning_rate": 1.5135227520524292e-05, + "loss": 0.6408, + "step": 236630 + }, + { + "epoch": 2.0919747520288547, + "grad_norm": 4.406557559967041, + "learning_rate": 1.513375413285242e-05, + "loss": 0.4415, + "step": 236640 + }, + { + "epoch": 2.092063155289167, + "grad_norm": 2.6652653217315674, + "learning_rate": 1.513228074518055e-05, + "loss": 0.5211, + "step": 236650 + }, + { + "epoch": 2.0921515585494794, + "grad_norm": 2.969414472579956, + "learning_rate": 1.5130807357508678e-05, + "loss": 0.4959, + "step": 236660 + }, + { + "epoch": 2.0922399618097915, + "grad_norm": 8.142206192016602, + "learning_rate": 1.5129333969836808e-05, + "loss": 0.5345, + "step": 236670 + }, + { + "epoch": 2.0923283650701037, + "grad_norm": 2.0199079513549805, + "learning_rate": 1.5127860582164938e-05, + "loss": 0.6277, + "step": 236680 + }, + { + "epoch": 2.0924167683304162, + "grad_norm": 2.9006593227386475, + "learning_rate": 1.5126387194493067e-05, + "loss": 0.6038, + "step": 236690 + }, + { + "epoch": 2.0925051715907284, + "grad_norm": 5.024865627288818, + "learning_rate": 1.5124913806821197e-05, + "loss": 0.5676, + "step": 236700 + }, + { + "epoch": 2.0925935748510405, + "grad_norm": 1.514017939567566, + "learning_rate": 1.5123440419149327e-05, + "loss": 0.4267, + "step": 236710 + }, + { + "epoch": 2.0926819781113526, + "grad_norm": 2.440743923187256, + "learning_rate": 1.5121967031477455e-05, + "loss": 0.529, + "step": 236720 + }, + { + "epoch": 2.092770381371665, + "grad_norm": 3.6691083908081055, + "learning_rate": 1.5120493643805584e-05, + "loss": 0.4276, + "step": 236730 + }, + { + "epoch": 2.0928587846319773, + "grad_norm": 3.64646577835083, + "learning_rate": 1.5119020256133715e-05, + "loss": 0.5247, + "step": 236740 + }, + { + "epoch": 2.0929471878922894, + "grad_norm": 2.3496129512786865, + "learning_rate": 1.5117546868461844e-05, + "loss": 0.5309, + "step": 236750 + }, + { + "epoch": 2.0930355911526015, + "grad_norm": 2.3559725284576416, + "learning_rate": 1.5116073480789972e-05, + "loss": 0.5136, + "step": 236760 + }, + { + "epoch": 2.093123994412914, + "grad_norm": 0.7633588910102844, + "learning_rate": 1.51146000931181e-05, + "loss": 0.3824, + "step": 236770 + }, + { + "epoch": 2.093212397673226, + "grad_norm": 5.384082317352295, + "learning_rate": 1.5113126705446232e-05, + "loss": 0.6581, + "step": 236780 + }, + { + "epoch": 2.0933008009335383, + "grad_norm": 1.5632063150405884, + "learning_rate": 1.511165331777436e-05, + "loss": 0.5161, + "step": 236790 + }, + { + "epoch": 2.093389204193851, + "grad_norm": 8.293729782104492, + "learning_rate": 1.5110179930102489e-05, + "loss": 0.614, + "step": 236800 + }, + { + "epoch": 2.093477607454163, + "grad_norm": 1.0564966201782227, + "learning_rate": 1.510870654243062e-05, + "loss": 0.5369, + "step": 236810 + }, + { + "epoch": 2.093566010714475, + "grad_norm": 3.4685585498809814, + "learning_rate": 1.5107233154758749e-05, + "loss": 0.7528, + "step": 236820 + }, + { + "epoch": 2.0936544139747872, + "grad_norm": 3.2802977561950684, + "learning_rate": 1.5105759767086877e-05, + "loss": 0.6132, + "step": 236830 + }, + { + "epoch": 2.0937428172351, + "grad_norm": 2.318263530731201, + "learning_rate": 1.5104286379415006e-05, + "loss": 0.5927, + "step": 236840 + }, + { + "epoch": 2.093831220495412, + "grad_norm": 6.369232177734375, + "learning_rate": 1.5102812991743137e-05, + "loss": 0.4954, + "step": 236850 + }, + { + "epoch": 2.093919623755724, + "grad_norm": 3.1779069900512695, + "learning_rate": 1.5101339604071266e-05, + "loss": 0.564, + "step": 236860 + }, + { + "epoch": 2.094008027016036, + "grad_norm": 29.086835861206055, + "learning_rate": 1.5099866216399394e-05, + "loss": 0.5224, + "step": 236870 + }, + { + "epoch": 2.0940964302763487, + "grad_norm": 4.037421703338623, + "learning_rate": 1.5098392828727522e-05, + "loss": 0.5562, + "step": 236880 + }, + { + "epoch": 2.094184833536661, + "grad_norm": 8.783136367797852, + "learning_rate": 1.5096919441055654e-05, + "loss": 0.6107, + "step": 236890 + }, + { + "epoch": 2.094273236796973, + "grad_norm": 0.9600059390068054, + "learning_rate": 1.5095446053383783e-05, + "loss": 0.6117, + "step": 236900 + }, + { + "epoch": 2.0943616400572855, + "grad_norm": 3.4379050731658936, + "learning_rate": 1.5093972665711911e-05, + "loss": 0.4785, + "step": 236910 + }, + { + "epoch": 2.0944500433175977, + "grad_norm": 1.5102381706237793, + "learning_rate": 1.5092499278040043e-05, + "loss": 0.674, + "step": 236920 + }, + { + "epoch": 2.0945384465779098, + "grad_norm": 1.5262420177459717, + "learning_rate": 1.5091025890368171e-05, + "loss": 0.5459, + "step": 236930 + }, + { + "epoch": 2.094626849838222, + "grad_norm": 3.689359664916992, + "learning_rate": 1.50895525026963e-05, + "loss": 0.5611, + "step": 236940 + }, + { + "epoch": 2.0947152530985345, + "grad_norm": 3.3904402256011963, + "learning_rate": 1.5088079115024428e-05, + "loss": 0.452, + "step": 236950 + }, + { + "epoch": 2.0948036563588466, + "grad_norm": 1.3054176568984985, + "learning_rate": 1.508660572735256e-05, + "loss": 0.497, + "step": 236960 + }, + { + "epoch": 2.0948920596191587, + "grad_norm": 2.2172956466674805, + "learning_rate": 1.5085132339680688e-05, + "loss": 0.5462, + "step": 236970 + }, + { + "epoch": 2.094980462879471, + "grad_norm": 0.8468037843704224, + "learning_rate": 1.5083658952008816e-05, + "loss": 0.4673, + "step": 236980 + }, + { + "epoch": 2.0950688661397834, + "grad_norm": 2.7758829593658447, + "learning_rate": 1.5082185564336948e-05, + "loss": 0.5229, + "step": 236990 + }, + { + "epoch": 2.0951572694000955, + "grad_norm": 2.9750425815582275, + "learning_rate": 1.5080712176665076e-05, + "loss": 0.4422, + "step": 237000 + }, + { + "epoch": 2.0952456726604076, + "grad_norm": 4.392441272735596, + "learning_rate": 1.5079238788993205e-05, + "loss": 0.5759, + "step": 237010 + }, + { + "epoch": 2.0953340759207197, + "grad_norm": 2.0458037853240967, + "learning_rate": 1.5077765401321333e-05, + "loss": 0.4795, + "step": 237020 + }, + { + "epoch": 2.0954224791810323, + "grad_norm": 3.5554721355438232, + "learning_rate": 1.5076292013649465e-05, + "loss": 0.649, + "step": 237030 + }, + { + "epoch": 2.0955108824413444, + "grad_norm": 2.2351343631744385, + "learning_rate": 1.5074818625977593e-05, + "loss": 0.4215, + "step": 237040 + }, + { + "epoch": 2.0955992857016565, + "grad_norm": 2.4167072772979736, + "learning_rate": 1.5073345238305721e-05, + "loss": 0.5289, + "step": 237050 + }, + { + "epoch": 2.095687688961969, + "grad_norm": 2.3876006603240967, + "learning_rate": 1.5071871850633852e-05, + "loss": 0.4903, + "step": 237060 + }, + { + "epoch": 2.0957760922222812, + "grad_norm": 5.646481990814209, + "learning_rate": 1.5070398462961982e-05, + "loss": 0.7481, + "step": 237070 + }, + { + "epoch": 2.0958644954825933, + "grad_norm": 8.636377334594727, + "learning_rate": 1.506892507529011e-05, + "loss": 0.7335, + "step": 237080 + }, + { + "epoch": 2.0959528987429055, + "grad_norm": 0.6873093843460083, + "learning_rate": 1.506745168761824e-05, + "loss": 0.5384, + "step": 237090 + }, + { + "epoch": 2.096041302003218, + "grad_norm": 3.322133779525757, + "learning_rate": 1.506597829994637e-05, + "loss": 0.4795, + "step": 237100 + }, + { + "epoch": 2.09612970526353, + "grad_norm": 1.4408599138259888, + "learning_rate": 1.5064504912274498e-05, + "loss": 0.4529, + "step": 237110 + }, + { + "epoch": 2.0962181085238423, + "grad_norm": 3.018019914627075, + "learning_rate": 1.5063031524602628e-05, + "loss": 0.4335, + "step": 237120 + }, + { + "epoch": 2.0963065117841544, + "grad_norm": 11.64940071105957, + "learning_rate": 1.5061558136930757e-05, + "loss": 0.5112, + "step": 237130 + }, + { + "epoch": 2.096394915044467, + "grad_norm": 5.700075626373291, + "learning_rate": 1.5060084749258887e-05, + "loss": 0.4738, + "step": 237140 + }, + { + "epoch": 2.096483318304779, + "grad_norm": 8.12649154663086, + "learning_rate": 1.5058611361587017e-05, + "loss": 0.5105, + "step": 237150 + }, + { + "epoch": 2.096571721565091, + "grad_norm": 2.7007148265838623, + "learning_rate": 1.5057137973915145e-05, + "loss": 0.4637, + "step": 237160 + }, + { + "epoch": 2.0966601248254038, + "grad_norm": 5.4591498374938965, + "learning_rate": 1.5055664586243275e-05, + "loss": 0.5589, + "step": 237170 + }, + { + "epoch": 2.096748528085716, + "grad_norm": 1.2159215211868286, + "learning_rate": 1.5054191198571405e-05, + "loss": 0.4207, + "step": 237180 + }, + { + "epoch": 2.096836931346028, + "grad_norm": 1.3613191843032837, + "learning_rate": 1.5052717810899534e-05, + "loss": 0.5057, + "step": 237190 + }, + { + "epoch": 2.09692533460634, + "grad_norm": 1.3767032623291016, + "learning_rate": 1.5051244423227662e-05, + "loss": 0.5671, + "step": 237200 + }, + { + "epoch": 2.0970137378666527, + "grad_norm": 2.874750852584839, + "learning_rate": 1.5049771035555794e-05, + "loss": 0.52, + "step": 237210 + }, + { + "epoch": 2.097102141126965, + "grad_norm": 2.850503921508789, + "learning_rate": 1.5048297647883922e-05, + "loss": 0.5181, + "step": 237220 + }, + { + "epoch": 2.097190544387277, + "grad_norm": 1.9614334106445312, + "learning_rate": 1.504682426021205e-05, + "loss": 0.5617, + "step": 237230 + }, + { + "epoch": 2.097278947647589, + "grad_norm": 1.5679880380630493, + "learning_rate": 1.5045350872540179e-05, + "loss": 0.5295, + "step": 237240 + }, + { + "epoch": 2.0973673509079016, + "grad_norm": 13.442856788635254, + "learning_rate": 1.504387748486831e-05, + "loss": 0.575, + "step": 237250 + }, + { + "epoch": 2.0974557541682137, + "grad_norm": 4.783178806304932, + "learning_rate": 1.5042404097196439e-05, + "loss": 0.4658, + "step": 237260 + }, + { + "epoch": 2.097544157428526, + "grad_norm": 0.8833920359611511, + "learning_rate": 1.5040930709524567e-05, + "loss": 0.4631, + "step": 237270 + }, + { + "epoch": 2.0976325606888384, + "grad_norm": 2.7543249130249023, + "learning_rate": 1.5039457321852699e-05, + "loss": 0.4907, + "step": 237280 + }, + { + "epoch": 2.0977209639491505, + "grad_norm": 2.043517589569092, + "learning_rate": 1.5037983934180827e-05, + "loss": 0.5803, + "step": 237290 + }, + { + "epoch": 2.0978093672094626, + "grad_norm": 3.1374499797821045, + "learning_rate": 1.5036510546508956e-05, + "loss": 0.5622, + "step": 237300 + }, + { + "epoch": 2.0978977704697748, + "grad_norm": 5.5435662269592285, + "learning_rate": 1.5035037158837084e-05, + "loss": 0.5926, + "step": 237310 + }, + { + "epoch": 2.0979861737300873, + "grad_norm": 6.9917378425598145, + "learning_rate": 1.5033563771165216e-05, + "loss": 0.5749, + "step": 237320 + }, + { + "epoch": 2.0980745769903995, + "grad_norm": 3.0614054203033447, + "learning_rate": 1.5032090383493344e-05, + "loss": 0.5752, + "step": 237330 + }, + { + "epoch": 2.0981629802507116, + "grad_norm": 17.501901626586914, + "learning_rate": 1.5030616995821473e-05, + "loss": 0.6123, + "step": 237340 + }, + { + "epoch": 2.0982513835110237, + "grad_norm": 4.263730049133301, + "learning_rate": 1.5029143608149604e-05, + "loss": 0.5416, + "step": 237350 + }, + { + "epoch": 2.0983397867713363, + "grad_norm": 2.4341745376586914, + "learning_rate": 1.5027670220477733e-05, + "loss": 0.5949, + "step": 237360 + }, + { + "epoch": 2.0984281900316484, + "grad_norm": 9.7907075881958, + "learning_rate": 1.5026196832805861e-05, + "loss": 0.5663, + "step": 237370 + }, + { + "epoch": 2.0985165932919605, + "grad_norm": 3.6279563903808594, + "learning_rate": 1.502472344513399e-05, + "loss": 0.566, + "step": 237380 + }, + { + "epoch": 2.098604996552273, + "grad_norm": 2.0721538066864014, + "learning_rate": 1.5023250057462121e-05, + "loss": 0.495, + "step": 237390 + }, + { + "epoch": 2.098693399812585, + "grad_norm": 6.47329044342041, + "learning_rate": 1.502177666979025e-05, + "loss": 0.4573, + "step": 237400 + }, + { + "epoch": 2.0987818030728973, + "grad_norm": 11.968038558959961, + "learning_rate": 1.5020303282118378e-05, + "loss": 0.4888, + "step": 237410 + }, + { + "epoch": 2.0988702063332094, + "grad_norm": 2.176527261734009, + "learning_rate": 1.5018829894446506e-05, + "loss": 0.4445, + "step": 237420 + }, + { + "epoch": 2.098958609593522, + "grad_norm": 3.60756254196167, + "learning_rate": 1.5017356506774638e-05, + "loss": 0.5006, + "step": 237430 + }, + { + "epoch": 2.099047012853834, + "grad_norm": 1.3114408254623413, + "learning_rate": 1.5015883119102766e-05, + "loss": 0.4382, + "step": 237440 + }, + { + "epoch": 2.0991354161141462, + "grad_norm": 20.686546325683594, + "learning_rate": 1.5014409731430895e-05, + "loss": 0.4575, + "step": 237450 + }, + { + "epoch": 2.0992238193744583, + "grad_norm": 2.048480749130249, + "learning_rate": 1.5012936343759026e-05, + "loss": 0.6248, + "step": 237460 + }, + { + "epoch": 2.099312222634771, + "grad_norm": 5.42110013961792, + "learning_rate": 1.5011462956087155e-05, + "loss": 0.5945, + "step": 237470 + }, + { + "epoch": 2.099400625895083, + "grad_norm": 6.611693859100342, + "learning_rate": 1.5009989568415283e-05, + "loss": 0.6209, + "step": 237480 + }, + { + "epoch": 2.099489029155395, + "grad_norm": 1.5068403482437134, + "learning_rate": 1.5008516180743411e-05, + "loss": 0.6102, + "step": 237490 + }, + { + "epoch": 2.0995774324157077, + "grad_norm": 7.817174911499023, + "learning_rate": 1.5007042793071543e-05, + "loss": 0.6661, + "step": 237500 + }, + { + "epoch": 2.09966583567602, + "grad_norm": 15.418646812438965, + "learning_rate": 1.5005569405399671e-05, + "loss": 0.5927, + "step": 237510 + }, + { + "epoch": 2.099754238936332, + "grad_norm": 5.1957688331604, + "learning_rate": 1.50040960177278e-05, + "loss": 0.5616, + "step": 237520 + }, + { + "epoch": 2.099842642196644, + "grad_norm": 2.176626682281494, + "learning_rate": 1.500262263005593e-05, + "loss": 0.5796, + "step": 237530 + }, + { + "epoch": 2.0999310454569566, + "grad_norm": 1.458861231803894, + "learning_rate": 1.500114924238406e-05, + "loss": 0.4589, + "step": 237540 + }, + { + "epoch": 2.1000194487172688, + "grad_norm": 2.943763494491577, + "learning_rate": 1.4999675854712188e-05, + "loss": 0.586, + "step": 237550 + }, + { + "epoch": 2.100107851977581, + "grad_norm": 0.9774618148803711, + "learning_rate": 1.4998202467040318e-05, + "loss": 0.5705, + "step": 237560 + }, + { + "epoch": 2.100196255237893, + "grad_norm": 5.886435031890869, + "learning_rate": 1.4996729079368448e-05, + "loss": 0.5419, + "step": 237570 + }, + { + "epoch": 2.1002846584982056, + "grad_norm": 1.4539377689361572, + "learning_rate": 1.4995255691696577e-05, + "loss": 0.5816, + "step": 237580 + }, + { + "epoch": 2.1003730617585177, + "grad_norm": 3.6237030029296875, + "learning_rate": 1.4993782304024707e-05, + "loss": 0.6194, + "step": 237590 + }, + { + "epoch": 2.10046146501883, + "grad_norm": 1.5658246278762817, + "learning_rate": 1.4992308916352835e-05, + "loss": 0.4023, + "step": 237600 + }, + { + "epoch": 2.100549868279142, + "grad_norm": 3.003751754760742, + "learning_rate": 1.4990835528680965e-05, + "loss": 0.5402, + "step": 237610 + }, + { + "epoch": 2.1006382715394545, + "grad_norm": 2.4334583282470703, + "learning_rate": 1.4989362141009095e-05, + "loss": 0.4678, + "step": 237620 + }, + { + "epoch": 2.1007266747997666, + "grad_norm": 2.2536182403564453, + "learning_rate": 1.4987888753337224e-05, + "loss": 0.5625, + "step": 237630 + }, + { + "epoch": 2.1008150780600787, + "grad_norm": 2.1046645641326904, + "learning_rate": 1.4986415365665354e-05, + "loss": 0.5114, + "step": 237640 + }, + { + "epoch": 2.1009034813203913, + "grad_norm": 1.916111946105957, + "learning_rate": 1.4984941977993484e-05, + "loss": 0.4076, + "step": 237650 + }, + { + "epoch": 2.1009918845807034, + "grad_norm": 1.2276091575622559, + "learning_rate": 1.4983468590321612e-05, + "loss": 0.5848, + "step": 237660 + }, + { + "epoch": 2.1010802878410155, + "grad_norm": 1.5937352180480957, + "learning_rate": 1.498199520264974e-05, + "loss": 0.5776, + "step": 237670 + }, + { + "epoch": 2.1011686911013276, + "grad_norm": 2.8371644020080566, + "learning_rate": 1.4980521814977872e-05, + "loss": 0.5818, + "step": 237680 + }, + { + "epoch": 2.10125709436164, + "grad_norm": 7.26334285736084, + "learning_rate": 1.4979048427306e-05, + "loss": 0.5022, + "step": 237690 + }, + { + "epoch": 2.1013454976219523, + "grad_norm": 6.608311176300049, + "learning_rate": 1.4977575039634129e-05, + "loss": 0.5666, + "step": 237700 + }, + { + "epoch": 2.1014339008822644, + "grad_norm": 1.737022876739502, + "learning_rate": 1.4976101651962257e-05, + "loss": 0.4289, + "step": 237710 + }, + { + "epoch": 2.1015223041425766, + "grad_norm": 5.353837966918945, + "learning_rate": 1.4974628264290389e-05, + "loss": 0.4946, + "step": 237720 + }, + { + "epoch": 2.101610707402889, + "grad_norm": 1.6901822090148926, + "learning_rate": 1.4973154876618517e-05, + "loss": 0.5146, + "step": 237730 + }, + { + "epoch": 2.1016991106632013, + "grad_norm": 2.412571430206299, + "learning_rate": 1.4971681488946646e-05, + "loss": 0.6614, + "step": 237740 + }, + { + "epoch": 2.1017875139235134, + "grad_norm": 1.8104140758514404, + "learning_rate": 1.4970208101274777e-05, + "loss": 0.4866, + "step": 237750 + }, + { + "epoch": 2.101875917183826, + "grad_norm": 4.805348873138428, + "learning_rate": 1.4968734713602906e-05, + "loss": 0.4431, + "step": 237760 + }, + { + "epoch": 2.101964320444138, + "grad_norm": 2.9549319744110107, + "learning_rate": 1.4967261325931034e-05, + "loss": 0.636, + "step": 237770 + }, + { + "epoch": 2.10205272370445, + "grad_norm": 7.668278217315674, + "learning_rate": 1.4965787938259162e-05, + "loss": 0.581, + "step": 237780 + }, + { + "epoch": 2.1021411269647623, + "grad_norm": 24.124935150146484, + "learning_rate": 1.4964314550587294e-05, + "loss": 0.5167, + "step": 237790 + }, + { + "epoch": 2.102229530225075, + "grad_norm": 3.9696388244628906, + "learning_rate": 1.4962841162915423e-05, + "loss": 0.5202, + "step": 237800 + }, + { + "epoch": 2.102317933485387, + "grad_norm": 3.2452330589294434, + "learning_rate": 1.4961367775243551e-05, + "loss": 0.6678, + "step": 237810 + }, + { + "epoch": 2.102406336745699, + "grad_norm": 0.6988434195518494, + "learning_rate": 1.4959894387571683e-05, + "loss": 0.3734, + "step": 237820 + }, + { + "epoch": 2.102494740006011, + "grad_norm": 2.8945388793945312, + "learning_rate": 1.4958420999899811e-05, + "loss": 0.3976, + "step": 237830 + }, + { + "epoch": 2.102583143266324, + "grad_norm": 2.218108892440796, + "learning_rate": 1.495694761222794e-05, + "loss": 0.5212, + "step": 237840 + }, + { + "epoch": 2.102671546526636, + "grad_norm": 2.6857848167419434, + "learning_rate": 1.4955474224556068e-05, + "loss": 0.4743, + "step": 237850 + }, + { + "epoch": 2.102759949786948, + "grad_norm": 5.626918792724609, + "learning_rate": 1.49540008368842e-05, + "loss": 0.5142, + "step": 237860 + }, + { + "epoch": 2.1028483530472606, + "grad_norm": 3.2249343395233154, + "learning_rate": 1.4952527449212328e-05, + "loss": 0.5029, + "step": 237870 + }, + { + "epoch": 2.1029367563075727, + "grad_norm": 3.0192246437072754, + "learning_rate": 1.4951054061540456e-05, + "loss": 0.4317, + "step": 237880 + }, + { + "epoch": 2.103025159567885, + "grad_norm": 1.6168016195297241, + "learning_rate": 1.4949580673868585e-05, + "loss": 0.5965, + "step": 237890 + }, + { + "epoch": 2.103113562828197, + "grad_norm": 1.4848724603652954, + "learning_rate": 1.4948107286196716e-05, + "loss": 0.5002, + "step": 237900 + }, + { + "epoch": 2.1032019660885095, + "grad_norm": 1.1011241674423218, + "learning_rate": 1.4946633898524845e-05, + "loss": 0.4727, + "step": 237910 + }, + { + "epoch": 2.1032903693488216, + "grad_norm": 2.617119789123535, + "learning_rate": 1.4945160510852973e-05, + "loss": 0.5445, + "step": 237920 + }, + { + "epoch": 2.1033787726091338, + "grad_norm": 4.667477130889893, + "learning_rate": 1.4943687123181105e-05, + "loss": 0.487, + "step": 237930 + }, + { + "epoch": 2.103467175869446, + "grad_norm": 3.617231607437134, + "learning_rate": 1.4942213735509233e-05, + "loss": 0.5438, + "step": 237940 + }, + { + "epoch": 2.1035555791297584, + "grad_norm": 7.8957695960998535, + "learning_rate": 1.4940740347837361e-05, + "loss": 0.5154, + "step": 237950 + }, + { + "epoch": 2.1036439823900706, + "grad_norm": 5.4811296463012695, + "learning_rate": 1.493926696016549e-05, + "loss": 0.5853, + "step": 237960 + }, + { + "epoch": 2.1037323856503827, + "grad_norm": 2.7541918754577637, + "learning_rate": 1.4937793572493621e-05, + "loss": 0.5871, + "step": 237970 + }, + { + "epoch": 2.1038207889106952, + "grad_norm": 2.7675342559814453, + "learning_rate": 1.493632018482175e-05, + "loss": 0.567, + "step": 237980 + }, + { + "epoch": 2.1039091921710074, + "grad_norm": 1.9097161293029785, + "learning_rate": 1.4934846797149878e-05, + "loss": 0.4957, + "step": 237990 + }, + { + "epoch": 2.1039975954313195, + "grad_norm": 3.7136216163635254, + "learning_rate": 1.4933373409478008e-05, + "loss": 0.6293, + "step": 238000 + }, + { + "epoch": 2.1040859986916316, + "grad_norm": 5.730007648468018, + "learning_rate": 1.4931900021806138e-05, + "loss": 0.5238, + "step": 238010 + }, + { + "epoch": 2.104174401951944, + "grad_norm": 3.8481740951538086, + "learning_rate": 1.4930426634134267e-05, + "loss": 0.4485, + "step": 238020 + }, + { + "epoch": 2.1042628052122563, + "grad_norm": 1.767208456993103, + "learning_rate": 1.4928953246462397e-05, + "loss": 0.5673, + "step": 238030 + }, + { + "epoch": 2.1043512084725684, + "grad_norm": 11.271956443786621, + "learning_rate": 1.4927479858790527e-05, + "loss": 0.5367, + "step": 238040 + }, + { + "epoch": 2.1044396117328805, + "grad_norm": 12.260025978088379, + "learning_rate": 1.4926006471118655e-05, + "loss": 0.5789, + "step": 238050 + }, + { + "epoch": 2.104528014993193, + "grad_norm": 4.41155481338501, + "learning_rate": 1.4924533083446785e-05, + "loss": 0.5739, + "step": 238060 + }, + { + "epoch": 2.104616418253505, + "grad_norm": 7.858800888061523, + "learning_rate": 1.4923059695774914e-05, + "loss": 0.5601, + "step": 238070 + }, + { + "epoch": 2.1047048215138173, + "grad_norm": 2.0014281272888184, + "learning_rate": 1.4921586308103044e-05, + "loss": 0.5697, + "step": 238080 + }, + { + "epoch": 2.10479322477413, + "grad_norm": 3.575439929962158, + "learning_rate": 1.4920112920431174e-05, + "loss": 0.5861, + "step": 238090 + }, + { + "epoch": 2.104881628034442, + "grad_norm": 6.622105598449707, + "learning_rate": 1.4918639532759302e-05, + "loss": 0.5669, + "step": 238100 + }, + { + "epoch": 2.104970031294754, + "grad_norm": 4.358893394470215, + "learning_rate": 1.4917166145087432e-05, + "loss": 0.5853, + "step": 238110 + }, + { + "epoch": 2.1050584345550662, + "grad_norm": 2.1446502208709717, + "learning_rate": 1.4915692757415562e-05, + "loss": 0.6179, + "step": 238120 + }, + { + "epoch": 2.105146837815379, + "grad_norm": 2.5073153972625732, + "learning_rate": 1.491421936974369e-05, + "loss": 0.4749, + "step": 238130 + }, + { + "epoch": 2.105235241075691, + "grad_norm": 8.518044471740723, + "learning_rate": 1.4912745982071819e-05, + "loss": 0.6447, + "step": 238140 + }, + { + "epoch": 2.105323644336003, + "grad_norm": 6.561489105224609, + "learning_rate": 1.491127259439995e-05, + "loss": 0.5217, + "step": 238150 + }, + { + "epoch": 2.105412047596315, + "grad_norm": 1.191451907157898, + "learning_rate": 1.4909799206728079e-05, + "loss": 0.5737, + "step": 238160 + }, + { + "epoch": 2.1055004508566277, + "grad_norm": 3.5979175567626953, + "learning_rate": 1.4908325819056207e-05, + "loss": 0.3855, + "step": 238170 + }, + { + "epoch": 2.10558885411694, + "grad_norm": 6.246284008026123, + "learning_rate": 1.4906852431384336e-05, + "loss": 0.4644, + "step": 238180 + }, + { + "epoch": 2.105677257377252, + "grad_norm": 4.42651891708374, + "learning_rate": 1.4905379043712467e-05, + "loss": 0.5563, + "step": 238190 + }, + { + "epoch": 2.105765660637564, + "grad_norm": 6.559763431549072, + "learning_rate": 1.4903905656040596e-05, + "loss": 0.5209, + "step": 238200 + }, + { + "epoch": 2.1058540638978767, + "grad_norm": 3.0911660194396973, + "learning_rate": 1.4902432268368724e-05, + "loss": 0.3455, + "step": 238210 + }, + { + "epoch": 2.105942467158189, + "grad_norm": 46.80445098876953, + "learning_rate": 1.4900958880696856e-05, + "loss": 0.4458, + "step": 238220 + }, + { + "epoch": 2.106030870418501, + "grad_norm": 3.212737798690796, + "learning_rate": 1.4899485493024984e-05, + "loss": 0.536, + "step": 238230 + }, + { + "epoch": 2.1061192736788135, + "grad_norm": 6.231499671936035, + "learning_rate": 1.4898012105353112e-05, + "loss": 0.6462, + "step": 238240 + }, + { + "epoch": 2.1062076769391256, + "grad_norm": 1.6078251600265503, + "learning_rate": 1.489653871768124e-05, + "loss": 0.5258, + "step": 238250 + }, + { + "epoch": 2.1062960801994377, + "grad_norm": 2.271446704864502, + "learning_rate": 1.4895065330009373e-05, + "loss": 0.6202, + "step": 238260 + }, + { + "epoch": 2.10638448345975, + "grad_norm": 1.0816800594329834, + "learning_rate": 1.4893591942337501e-05, + "loss": 0.4988, + "step": 238270 + }, + { + "epoch": 2.1064728867200624, + "grad_norm": 6.455515384674072, + "learning_rate": 1.489211855466563e-05, + "loss": 0.5112, + "step": 238280 + }, + { + "epoch": 2.1065612899803745, + "grad_norm": 1.790582537651062, + "learning_rate": 1.4890645166993761e-05, + "loss": 0.4774, + "step": 238290 + }, + { + "epoch": 2.1066496932406866, + "grad_norm": 1.855430245399475, + "learning_rate": 1.488917177932189e-05, + "loss": 0.5849, + "step": 238300 + }, + { + "epoch": 2.1067380965009987, + "grad_norm": 2.6242358684539795, + "learning_rate": 1.4887698391650018e-05, + "loss": 0.5672, + "step": 238310 + }, + { + "epoch": 2.1068264997613113, + "grad_norm": 5.402157783508301, + "learning_rate": 1.4886225003978146e-05, + "loss": 0.5674, + "step": 238320 + }, + { + "epoch": 2.1069149030216234, + "grad_norm": 1.5425413846969604, + "learning_rate": 1.4884751616306278e-05, + "loss": 0.4508, + "step": 238330 + }, + { + "epoch": 2.1070033062819356, + "grad_norm": 2.3321855068206787, + "learning_rate": 1.4883278228634406e-05, + "loss": 0.5807, + "step": 238340 + }, + { + "epoch": 2.107091709542248, + "grad_norm": 5.780003070831299, + "learning_rate": 1.4881804840962535e-05, + "loss": 0.494, + "step": 238350 + }, + { + "epoch": 2.1071801128025602, + "grad_norm": 22.208255767822266, + "learning_rate": 1.4880331453290663e-05, + "loss": 0.6129, + "step": 238360 + }, + { + "epoch": 2.1072685160628724, + "grad_norm": 2.8133487701416016, + "learning_rate": 1.4878858065618795e-05, + "loss": 0.5153, + "step": 238370 + }, + { + "epoch": 2.1073569193231845, + "grad_norm": 1.3363138437271118, + "learning_rate": 1.4877384677946923e-05, + "loss": 0.5993, + "step": 238380 + }, + { + "epoch": 2.107445322583497, + "grad_norm": 2.6398799419403076, + "learning_rate": 1.4875911290275051e-05, + "loss": 0.6955, + "step": 238390 + }, + { + "epoch": 2.107533725843809, + "grad_norm": 3.545785427093506, + "learning_rate": 1.4874437902603183e-05, + "loss": 0.4177, + "step": 238400 + }, + { + "epoch": 2.1076221291041213, + "grad_norm": 6.453826904296875, + "learning_rate": 1.4872964514931311e-05, + "loss": 0.4651, + "step": 238410 + }, + { + "epoch": 2.1077105323644334, + "grad_norm": 3.5629420280456543, + "learning_rate": 1.487149112725944e-05, + "loss": 0.5026, + "step": 238420 + }, + { + "epoch": 2.107798935624746, + "grad_norm": 3.5814507007598877, + "learning_rate": 1.4870017739587568e-05, + "loss": 0.4177, + "step": 238430 + }, + { + "epoch": 2.107887338885058, + "grad_norm": 1.973318338394165, + "learning_rate": 1.48685443519157e-05, + "loss": 0.4393, + "step": 238440 + }, + { + "epoch": 2.10797574214537, + "grad_norm": 2.7584149837493896, + "learning_rate": 1.4867070964243828e-05, + "loss": 0.537, + "step": 238450 + }, + { + "epoch": 2.1080641454056828, + "grad_norm": 3.6803719997406006, + "learning_rate": 1.4865597576571957e-05, + "loss": 0.5761, + "step": 238460 + }, + { + "epoch": 2.108152548665995, + "grad_norm": 4.396089553833008, + "learning_rate": 1.4864124188900088e-05, + "loss": 0.5793, + "step": 238470 + }, + { + "epoch": 2.108240951926307, + "grad_norm": 1.2036890983581543, + "learning_rate": 1.4862650801228217e-05, + "loss": 0.5794, + "step": 238480 + }, + { + "epoch": 2.108329355186619, + "grad_norm": 7.297264575958252, + "learning_rate": 1.4861177413556345e-05, + "loss": 0.6597, + "step": 238490 + }, + { + "epoch": 2.1084177584469317, + "grad_norm": 20.098400115966797, + "learning_rate": 1.4859704025884475e-05, + "loss": 0.4617, + "step": 238500 + }, + { + "epoch": 2.108506161707244, + "grad_norm": 1.1539673805236816, + "learning_rate": 1.4858230638212605e-05, + "loss": 0.4774, + "step": 238510 + }, + { + "epoch": 2.108594564967556, + "grad_norm": 2.4030628204345703, + "learning_rate": 1.4856757250540733e-05, + "loss": 0.4173, + "step": 238520 + }, + { + "epoch": 2.108682968227868, + "grad_norm": 4.7356791496276855, + "learning_rate": 1.4855283862868864e-05, + "loss": 0.6376, + "step": 238530 + }, + { + "epoch": 2.1087713714881806, + "grad_norm": 2.8331377506256104, + "learning_rate": 1.4853810475196992e-05, + "loss": 0.5353, + "step": 238540 + }, + { + "epoch": 2.1088597747484927, + "grad_norm": 4.224395275115967, + "learning_rate": 1.4852337087525122e-05, + "loss": 0.493, + "step": 238550 + }, + { + "epoch": 2.108948178008805, + "grad_norm": 5.981100559234619, + "learning_rate": 1.4850863699853252e-05, + "loss": 0.5468, + "step": 238560 + }, + { + "epoch": 2.1090365812691174, + "grad_norm": 2.0761823654174805, + "learning_rate": 1.484939031218138e-05, + "loss": 0.609, + "step": 238570 + }, + { + "epoch": 2.1091249845294295, + "grad_norm": 1.196777582168579, + "learning_rate": 1.484791692450951e-05, + "loss": 0.4578, + "step": 238580 + }, + { + "epoch": 2.1092133877897417, + "grad_norm": 7.692698955535889, + "learning_rate": 1.484644353683764e-05, + "loss": 0.4439, + "step": 238590 + }, + { + "epoch": 2.1093017910500538, + "grad_norm": 2.7484052181243896, + "learning_rate": 1.4844970149165769e-05, + "loss": 0.7145, + "step": 238600 + }, + { + "epoch": 2.1093901943103663, + "grad_norm": 2.7875328063964844, + "learning_rate": 1.4843496761493897e-05, + "loss": 0.5511, + "step": 238610 + }, + { + "epoch": 2.1094785975706785, + "grad_norm": 0.7642548680305481, + "learning_rate": 1.4842023373822029e-05, + "loss": 0.4454, + "step": 238620 + }, + { + "epoch": 2.1095670008309906, + "grad_norm": 1.0987783670425415, + "learning_rate": 1.4840549986150157e-05, + "loss": 0.5315, + "step": 238630 + }, + { + "epoch": 2.1096554040913027, + "grad_norm": 1.633946180343628, + "learning_rate": 1.4839076598478286e-05, + "loss": 0.5497, + "step": 238640 + }, + { + "epoch": 2.1097438073516153, + "grad_norm": 6.50010347366333, + "learning_rate": 1.4837603210806414e-05, + "loss": 0.648, + "step": 238650 + }, + { + "epoch": 2.1098322106119274, + "grad_norm": 1.9326896667480469, + "learning_rate": 1.4836129823134546e-05, + "loss": 0.4389, + "step": 238660 + }, + { + "epoch": 2.1099206138722395, + "grad_norm": 10.61047077178955, + "learning_rate": 1.4834656435462674e-05, + "loss": 0.5823, + "step": 238670 + }, + { + "epoch": 2.110009017132552, + "grad_norm": 8.093788146972656, + "learning_rate": 1.4833183047790802e-05, + "loss": 0.729, + "step": 238680 + }, + { + "epoch": 2.110097420392864, + "grad_norm": 3.6855928897857666, + "learning_rate": 1.4831709660118934e-05, + "loss": 0.6578, + "step": 238690 + }, + { + "epoch": 2.1101858236531763, + "grad_norm": 1.067094087600708, + "learning_rate": 1.4830236272447062e-05, + "loss": 0.3426, + "step": 238700 + }, + { + "epoch": 2.1102742269134884, + "grad_norm": 1.6818755865097046, + "learning_rate": 1.482876288477519e-05, + "loss": 0.5291, + "step": 238710 + }, + { + "epoch": 2.110362630173801, + "grad_norm": 15.636614799499512, + "learning_rate": 1.482728949710332e-05, + "loss": 0.6048, + "step": 238720 + }, + { + "epoch": 2.110451033434113, + "grad_norm": 5.51932954788208, + "learning_rate": 1.4825816109431451e-05, + "loss": 0.459, + "step": 238730 + }, + { + "epoch": 2.1105394366944252, + "grad_norm": 8.058509826660156, + "learning_rate": 1.482434272175958e-05, + "loss": 0.5598, + "step": 238740 + }, + { + "epoch": 2.1106278399547374, + "grad_norm": 0.9066150188446045, + "learning_rate": 1.4822869334087708e-05, + "loss": 0.5541, + "step": 238750 + }, + { + "epoch": 2.11071624321505, + "grad_norm": 1.2833402156829834, + "learning_rate": 1.482139594641584e-05, + "loss": 0.4337, + "step": 238760 + }, + { + "epoch": 2.110804646475362, + "grad_norm": 1.6819376945495605, + "learning_rate": 1.4819922558743968e-05, + "loss": 0.5072, + "step": 238770 + }, + { + "epoch": 2.110893049735674, + "grad_norm": 2.32165265083313, + "learning_rate": 1.4818449171072096e-05, + "loss": 0.5675, + "step": 238780 + }, + { + "epoch": 2.1109814529959863, + "grad_norm": 5.2871317863464355, + "learning_rate": 1.4816975783400224e-05, + "loss": 0.605, + "step": 238790 + }, + { + "epoch": 2.111069856256299, + "grad_norm": 8.594915390014648, + "learning_rate": 1.4815502395728356e-05, + "loss": 0.4812, + "step": 238800 + }, + { + "epoch": 2.111158259516611, + "grad_norm": 4.369677543640137, + "learning_rate": 1.4814029008056485e-05, + "loss": 0.5351, + "step": 238810 + }, + { + "epoch": 2.111246662776923, + "grad_norm": 4.0479230880737305, + "learning_rate": 1.4812555620384613e-05, + "loss": 0.5815, + "step": 238820 + }, + { + "epoch": 2.1113350660372356, + "grad_norm": 2.4949591159820557, + "learning_rate": 1.4811082232712741e-05, + "loss": 0.6291, + "step": 238830 + }, + { + "epoch": 2.1114234692975478, + "grad_norm": 3.4854602813720703, + "learning_rate": 1.4809608845040873e-05, + "loss": 0.5262, + "step": 238840 + }, + { + "epoch": 2.11151187255786, + "grad_norm": 3.713670015335083, + "learning_rate": 1.4808135457369001e-05, + "loss": 0.6583, + "step": 238850 + }, + { + "epoch": 2.111600275818172, + "grad_norm": 2.322791814804077, + "learning_rate": 1.480666206969713e-05, + "loss": 0.4637, + "step": 238860 + }, + { + "epoch": 2.1116886790784846, + "grad_norm": 0.7642722129821777, + "learning_rate": 1.4805188682025261e-05, + "loss": 0.4395, + "step": 238870 + }, + { + "epoch": 2.1117770823387967, + "grad_norm": 2.579500675201416, + "learning_rate": 1.480371529435339e-05, + "loss": 0.4862, + "step": 238880 + }, + { + "epoch": 2.111865485599109, + "grad_norm": 4.47136116027832, + "learning_rate": 1.4802241906681518e-05, + "loss": 0.3981, + "step": 238890 + }, + { + "epoch": 2.111953888859421, + "grad_norm": 2.0278234481811523, + "learning_rate": 1.4800768519009647e-05, + "loss": 0.3951, + "step": 238900 + }, + { + "epoch": 2.1120422921197335, + "grad_norm": 2.0030274391174316, + "learning_rate": 1.4799295131337778e-05, + "loss": 0.5377, + "step": 238910 + }, + { + "epoch": 2.1121306953800456, + "grad_norm": 2.9303743839263916, + "learning_rate": 1.4797821743665907e-05, + "loss": 0.5321, + "step": 238920 + }, + { + "epoch": 2.1122190986403577, + "grad_norm": 1.6655513048171997, + "learning_rate": 1.4796348355994035e-05, + "loss": 0.6132, + "step": 238930 + }, + { + "epoch": 2.1123075019006703, + "grad_norm": 1.7388615608215332, + "learning_rate": 1.4794874968322167e-05, + "loss": 0.5742, + "step": 238940 + }, + { + "epoch": 2.1123959051609824, + "grad_norm": 1.2142493724822998, + "learning_rate": 1.4793401580650295e-05, + "loss": 0.528, + "step": 238950 + }, + { + "epoch": 2.1124843084212945, + "grad_norm": 1.1040047407150269, + "learning_rate": 1.4791928192978423e-05, + "loss": 0.5153, + "step": 238960 + }, + { + "epoch": 2.1125727116816067, + "grad_norm": 1.3043032884597778, + "learning_rate": 1.4790454805306553e-05, + "loss": 0.4423, + "step": 238970 + }, + { + "epoch": 2.112661114941919, + "grad_norm": 5.860005855560303, + "learning_rate": 1.4788981417634684e-05, + "loss": 0.4866, + "step": 238980 + }, + { + "epoch": 2.1127495182022313, + "grad_norm": 1.1819249391555786, + "learning_rate": 1.4787508029962812e-05, + "loss": 0.4157, + "step": 238990 + }, + { + "epoch": 2.1128379214625435, + "grad_norm": 6.6261773109436035, + "learning_rate": 1.4786034642290942e-05, + "loss": 0.498, + "step": 239000 + }, + { + "epoch": 2.1129263247228556, + "grad_norm": 5.813360214233398, + "learning_rate": 1.478456125461907e-05, + "loss": 0.5674, + "step": 239010 + }, + { + "epoch": 2.113014727983168, + "grad_norm": 2.2305688858032227, + "learning_rate": 1.4783087866947202e-05, + "loss": 0.5309, + "step": 239020 + }, + { + "epoch": 2.1131031312434803, + "grad_norm": 1.9112749099731445, + "learning_rate": 1.478161447927533e-05, + "loss": 0.4543, + "step": 239030 + }, + { + "epoch": 2.1131915345037924, + "grad_norm": 2.131376028060913, + "learning_rate": 1.4780141091603459e-05, + "loss": 0.6203, + "step": 239040 + }, + { + "epoch": 2.113279937764105, + "grad_norm": 0.9026879072189331, + "learning_rate": 1.477866770393159e-05, + "loss": 0.4244, + "step": 239050 + }, + { + "epoch": 2.113368341024417, + "grad_norm": 3.9795291423797607, + "learning_rate": 1.4777194316259719e-05, + "loss": 0.5734, + "step": 239060 + }, + { + "epoch": 2.113456744284729, + "grad_norm": 5.941439628601074, + "learning_rate": 1.4775720928587847e-05, + "loss": 0.6044, + "step": 239070 + }, + { + "epoch": 2.1135451475450413, + "grad_norm": 5.388944149017334, + "learning_rate": 1.4774247540915976e-05, + "loss": 0.5136, + "step": 239080 + }, + { + "epoch": 2.113633550805354, + "grad_norm": 1.27601158618927, + "learning_rate": 1.4772774153244107e-05, + "loss": 0.5826, + "step": 239090 + }, + { + "epoch": 2.113721954065666, + "grad_norm": 2.571349859237671, + "learning_rate": 1.4771300765572236e-05, + "loss": 0.6702, + "step": 239100 + }, + { + "epoch": 2.113810357325978, + "grad_norm": 1.1360399723052979, + "learning_rate": 1.4769827377900364e-05, + "loss": 0.4699, + "step": 239110 + }, + { + "epoch": 2.1138987605862902, + "grad_norm": 4.069033622741699, + "learning_rate": 1.4768353990228492e-05, + "loss": 0.513, + "step": 239120 + }, + { + "epoch": 2.113987163846603, + "grad_norm": 37.805999755859375, + "learning_rate": 1.4766880602556624e-05, + "loss": 0.5162, + "step": 239130 + }, + { + "epoch": 2.114075567106915, + "grad_norm": 3.572377920150757, + "learning_rate": 1.4765407214884752e-05, + "loss": 0.4434, + "step": 239140 + }, + { + "epoch": 2.114163970367227, + "grad_norm": 2.8229846954345703, + "learning_rate": 1.476393382721288e-05, + "loss": 0.5313, + "step": 239150 + }, + { + "epoch": 2.1142523736275396, + "grad_norm": 7.769279956817627, + "learning_rate": 1.4762460439541013e-05, + "loss": 0.5447, + "step": 239160 + }, + { + "epoch": 2.1143407768878517, + "grad_norm": 2.465287685394287, + "learning_rate": 1.4760987051869141e-05, + "loss": 0.5602, + "step": 239170 + }, + { + "epoch": 2.114429180148164, + "grad_norm": 3.515223264694214, + "learning_rate": 1.475951366419727e-05, + "loss": 0.611, + "step": 239180 + }, + { + "epoch": 2.114517583408476, + "grad_norm": 1.3344496488571167, + "learning_rate": 1.4758040276525398e-05, + "loss": 0.7813, + "step": 239190 + }, + { + "epoch": 2.1146059866687885, + "grad_norm": 4.002016067504883, + "learning_rate": 1.475656688885353e-05, + "loss": 0.4801, + "step": 239200 + }, + { + "epoch": 2.1146943899291006, + "grad_norm": 4.12450647354126, + "learning_rate": 1.4755093501181658e-05, + "loss": 0.5985, + "step": 239210 + }, + { + "epoch": 2.1147827931894128, + "grad_norm": 13.5738525390625, + "learning_rate": 1.4753620113509786e-05, + "loss": 0.5042, + "step": 239220 + }, + { + "epoch": 2.114871196449725, + "grad_norm": 1.0182147026062012, + "learning_rate": 1.4752146725837918e-05, + "loss": 0.4008, + "step": 239230 + }, + { + "epoch": 2.1149595997100374, + "grad_norm": 3.359891176223755, + "learning_rate": 1.4750673338166046e-05, + "loss": 0.5221, + "step": 239240 + }, + { + "epoch": 2.1150480029703496, + "grad_norm": 1.3687238693237305, + "learning_rate": 1.4749199950494174e-05, + "loss": 0.7355, + "step": 239250 + }, + { + "epoch": 2.1151364062306617, + "grad_norm": 0.8647713661193848, + "learning_rate": 1.4747726562822303e-05, + "loss": 0.5382, + "step": 239260 + }, + { + "epoch": 2.1152248094909742, + "grad_norm": 1.5926364660263062, + "learning_rate": 1.4746253175150435e-05, + "loss": 0.5993, + "step": 239270 + }, + { + "epoch": 2.1153132127512864, + "grad_norm": 0.9589020013809204, + "learning_rate": 1.4744779787478563e-05, + "loss": 0.5141, + "step": 239280 + }, + { + "epoch": 2.1154016160115985, + "grad_norm": 4.250946998596191, + "learning_rate": 1.4743306399806691e-05, + "loss": 0.4968, + "step": 239290 + }, + { + "epoch": 2.1154900192719106, + "grad_norm": 3.426882266998291, + "learning_rate": 1.474183301213482e-05, + "loss": 0.6261, + "step": 239300 + }, + { + "epoch": 2.115578422532223, + "grad_norm": 3.2096705436706543, + "learning_rate": 1.4740359624462951e-05, + "loss": 0.7452, + "step": 239310 + }, + { + "epoch": 2.1156668257925353, + "grad_norm": 4.541516304016113, + "learning_rate": 1.473888623679108e-05, + "loss": 0.4577, + "step": 239320 + }, + { + "epoch": 2.1157552290528474, + "grad_norm": 1.3465654850006104, + "learning_rate": 1.4737412849119208e-05, + "loss": 0.442, + "step": 239330 + }, + { + "epoch": 2.1158436323131595, + "grad_norm": 28.251665115356445, + "learning_rate": 1.473593946144734e-05, + "loss": 0.5087, + "step": 239340 + }, + { + "epoch": 2.115932035573472, + "grad_norm": 2.3388333320617676, + "learning_rate": 1.4734466073775468e-05, + "loss": 0.588, + "step": 239350 + }, + { + "epoch": 2.116020438833784, + "grad_norm": 2.374920606613159, + "learning_rate": 1.4732992686103597e-05, + "loss": 0.5343, + "step": 239360 + }, + { + "epoch": 2.1161088420940963, + "grad_norm": 2.124237298965454, + "learning_rate": 1.4731519298431727e-05, + "loss": 0.5187, + "step": 239370 + }, + { + "epoch": 2.1161972453544085, + "grad_norm": 3.680851697921753, + "learning_rate": 1.4730045910759857e-05, + "loss": 0.5669, + "step": 239380 + }, + { + "epoch": 2.116285648614721, + "grad_norm": 7.963100433349609, + "learning_rate": 1.4728572523087985e-05, + "loss": 0.4747, + "step": 239390 + }, + { + "epoch": 2.116374051875033, + "grad_norm": 0.6645820736885071, + "learning_rate": 1.4727099135416115e-05, + "loss": 0.5245, + "step": 239400 + }, + { + "epoch": 2.1164624551353453, + "grad_norm": 3.356074571609497, + "learning_rate": 1.4725625747744245e-05, + "loss": 0.5869, + "step": 239410 + }, + { + "epoch": 2.116550858395658, + "grad_norm": 4.547163009643555, + "learning_rate": 1.4724152360072373e-05, + "loss": 0.4528, + "step": 239420 + }, + { + "epoch": 2.11663926165597, + "grad_norm": 4.649807929992676, + "learning_rate": 1.4722678972400503e-05, + "loss": 0.4009, + "step": 239430 + }, + { + "epoch": 2.116727664916282, + "grad_norm": 18.266130447387695, + "learning_rate": 1.4721205584728632e-05, + "loss": 0.4729, + "step": 239440 + }, + { + "epoch": 2.116816068176594, + "grad_norm": 0.870461106300354, + "learning_rate": 1.4719732197056762e-05, + "loss": 0.5182, + "step": 239450 + }, + { + "epoch": 2.1169044714369067, + "grad_norm": 2.4320878982543945, + "learning_rate": 1.4718258809384892e-05, + "loss": 0.5308, + "step": 239460 + }, + { + "epoch": 2.116992874697219, + "grad_norm": 3.1683943271636963, + "learning_rate": 1.471678542171302e-05, + "loss": 0.5897, + "step": 239470 + }, + { + "epoch": 2.117081277957531, + "grad_norm": 13.254822731018066, + "learning_rate": 1.4715312034041149e-05, + "loss": 0.5837, + "step": 239480 + }, + { + "epoch": 2.117169681217843, + "grad_norm": 3.3362035751342773, + "learning_rate": 1.471383864636928e-05, + "loss": 0.5129, + "step": 239490 + }, + { + "epoch": 2.1172580844781557, + "grad_norm": 16.666479110717773, + "learning_rate": 1.4712365258697409e-05, + "loss": 0.5636, + "step": 239500 + }, + { + "epoch": 2.117346487738468, + "grad_norm": 1.2788805961608887, + "learning_rate": 1.4710891871025537e-05, + "loss": 0.5284, + "step": 239510 + }, + { + "epoch": 2.11743489099878, + "grad_norm": 2.704812526702881, + "learning_rate": 1.4709418483353669e-05, + "loss": 0.5016, + "step": 239520 + }, + { + "epoch": 2.1175232942590925, + "grad_norm": 3.2600371837615967, + "learning_rate": 1.4707945095681797e-05, + "loss": 0.5721, + "step": 239530 + }, + { + "epoch": 2.1176116975194046, + "grad_norm": 4.601171016693115, + "learning_rate": 1.4706471708009926e-05, + "loss": 0.5045, + "step": 239540 + }, + { + "epoch": 2.1177001007797167, + "grad_norm": 2.2275550365448, + "learning_rate": 1.4704998320338054e-05, + "loss": 0.595, + "step": 239550 + }, + { + "epoch": 2.117788504040029, + "grad_norm": 7.594853401184082, + "learning_rate": 1.4703524932666186e-05, + "loss": 0.4131, + "step": 239560 + }, + { + "epoch": 2.1178769073003414, + "grad_norm": 1.4593853950500488, + "learning_rate": 1.4702051544994314e-05, + "loss": 0.4397, + "step": 239570 + }, + { + "epoch": 2.1179653105606535, + "grad_norm": 5.190990447998047, + "learning_rate": 1.4700578157322442e-05, + "loss": 0.6137, + "step": 239580 + }, + { + "epoch": 2.1180537138209656, + "grad_norm": 2.062546491622925, + "learning_rate": 1.469910476965057e-05, + "loss": 0.5236, + "step": 239590 + }, + { + "epoch": 2.1181421170812778, + "grad_norm": 2.0268757343292236, + "learning_rate": 1.4697631381978702e-05, + "loss": 0.5296, + "step": 239600 + }, + { + "epoch": 2.1182305203415903, + "grad_norm": 5.959784984588623, + "learning_rate": 1.469615799430683e-05, + "loss": 0.517, + "step": 239610 + }, + { + "epoch": 2.1183189236019024, + "grad_norm": 2.1720612049102783, + "learning_rate": 1.4694684606634959e-05, + "loss": 0.5292, + "step": 239620 + }, + { + "epoch": 2.1184073268622146, + "grad_norm": 2.898484706878662, + "learning_rate": 1.4693211218963091e-05, + "loss": 0.6853, + "step": 239630 + }, + { + "epoch": 2.118495730122527, + "grad_norm": 2.5823893547058105, + "learning_rate": 1.469173783129122e-05, + "loss": 0.4534, + "step": 239640 + }, + { + "epoch": 2.1185841333828392, + "grad_norm": 2.075148344039917, + "learning_rate": 1.4690264443619348e-05, + "loss": 0.5011, + "step": 239650 + }, + { + "epoch": 2.1186725366431514, + "grad_norm": 2.1702382564544678, + "learning_rate": 1.4688791055947476e-05, + "loss": 0.5633, + "step": 239660 + }, + { + "epoch": 2.1187609399034635, + "grad_norm": 4.684069633483887, + "learning_rate": 1.4687317668275608e-05, + "loss": 0.4393, + "step": 239670 + }, + { + "epoch": 2.118849343163776, + "grad_norm": 4.709403038024902, + "learning_rate": 1.4685844280603736e-05, + "loss": 0.5822, + "step": 239680 + }, + { + "epoch": 2.118937746424088, + "grad_norm": 4.740092754364014, + "learning_rate": 1.4684370892931864e-05, + "loss": 0.5749, + "step": 239690 + }, + { + "epoch": 2.1190261496844003, + "grad_norm": 3.7029666900634766, + "learning_rate": 1.4682897505259996e-05, + "loss": 0.6394, + "step": 239700 + }, + { + "epoch": 2.1191145529447124, + "grad_norm": 1.884626865386963, + "learning_rate": 1.4681424117588124e-05, + "loss": 0.5851, + "step": 239710 + }, + { + "epoch": 2.119202956205025, + "grad_norm": 1.279578685760498, + "learning_rate": 1.4679950729916253e-05, + "loss": 0.6275, + "step": 239720 + }, + { + "epoch": 2.119291359465337, + "grad_norm": 4.489234924316406, + "learning_rate": 1.4678477342244381e-05, + "loss": 0.5098, + "step": 239730 + }, + { + "epoch": 2.119379762725649, + "grad_norm": 1.7651032209396362, + "learning_rate": 1.4677003954572513e-05, + "loss": 0.5101, + "step": 239740 + }, + { + "epoch": 2.1194681659859618, + "grad_norm": 3.341102123260498, + "learning_rate": 1.4675530566900641e-05, + "loss": 0.6612, + "step": 239750 + }, + { + "epoch": 2.119556569246274, + "grad_norm": 1.4641309976577759, + "learning_rate": 1.467405717922877e-05, + "loss": 0.4361, + "step": 239760 + }, + { + "epoch": 2.119644972506586, + "grad_norm": 2.6191601753234863, + "learning_rate": 1.4672583791556898e-05, + "loss": 0.4896, + "step": 239770 + }, + { + "epoch": 2.119733375766898, + "grad_norm": 1.4509540796279907, + "learning_rate": 1.467111040388503e-05, + "loss": 0.5171, + "step": 239780 + }, + { + "epoch": 2.1198217790272107, + "grad_norm": 1.1035059690475464, + "learning_rate": 1.4669637016213158e-05, + "loss": 0.5522, + "step": 239790 + }, + { + "epoch": 2.119910182287523, + "grad_norm": 1.4631824493408203, + "learning_rate": 1.4668163628541286e-05, + "loss": 0.5095, + "step": 239800 + }, + { + "epoch": 2.119998585547835, + "grad_norm": 11.814665794372559, + "learning_rate": 1.4666690240869418e-05, + "loss": 0.6297, + "step": 239810 + }, + { + "epoch": 2.120086988808147, + "grad_norm": 7.164376735687256, + "learning_rate": 1.4665216853197547e-05, + "loss": 0.5018, + "step": 239820 + }, + { + "epoch": 2.1201753920684596, + "grad_norm": 4.910740852355957, + "learning_rate": 1.4663743465525675e-05, + "loss": 0.5741, + "step": 239830 + }, + { + "epoch": 2.1202637953287717, + "grad_norm": 1.492996335029602, + "learning_rate": 1.4662270077853805e-05, + "loss": 0.4588, + "step": 239840 + }, + { + "epoch": 2.120352198589084, + "grad_norm": 8.212862014770508, + "learning_rate": 1.4660796690181935e-05, + "loss": 0.4392, + "step": 239850 + }, + { + "epoch": 2.1204406018493964, + "grad_norm": 2.2045745849609375, + "learning_rate": 1.4659323302510063e-05, + "loss": 0.473, + "step": 239860 + }, + { + "epoch": 2.1205290051097085, + "grad_norm": 2.7689998149871826, + "learning_rate": 1.4657849914838193e-05, + "loss": 0.4974, + "step": 239870 + }, + { + "epoch": 2.1206174083700207, + "grad_norm": 14.793066024780273, + "learning_rate": 1.4656376527166323e-05, + "loss": 0.5493, + "step": 239880 + }, + { + "epoch": 2.120705811630333, + "grad_norm": 2.9704573154449463, + "learning_rate": 1.4654903139494452e-05, + "loss": 0.5057, + "step": 239890 + }, + { + "epoch": 2.1207942148906453, + "grad_norm": 3.312669038772583, + "learning_rate": 1.4653429751822582e-05, + "loss": 0.582, + "step": 239900 + }, + { + "epoch": 2.1208826181509575, + "grad_norm": 10.208150863647461, + "learning_rate": 1.465195636415071e-05, + "loss": 0.5423, + "step": 239910 + }, + { + "epoch": 2.1209710214112696, + "grad_norm": 1.3462343215942383, + "learning_rate": 1.465048297647884e-05, + "loss": 0.5643, + "step": 239920 + }, + { + "epoch": 2.1210594246715817, + "grad_norm": 5.49409818649292, + "learning_rate": 1.464900958880697e-05, + "loss": 0.5376, + "step": 239930 + }, + { + "epoch": 2.1211478279318943, + "grad_norm": 3.385828733444214, + "learning_rate": 1.4647536201135099e-05, + "loss": 0.5656, + "step": 239940 + }, + { + "epoch": 2.1212362311922064, + "grad_norm": 2.6967039108276367, + "learning_rate": 1.4646062813463227e-05, + "loss": 0.4111, + "step": 239950 + }, + { + "epoch": 2.1213246344525185, + "grad_norm": 1.0082072019577026, + "learning_rate": 1.4644589425791359e-05, + "loss": 0.637, + "step": 239960 + }, + { + "epoch": 2.1214130377128306, + "grad_norm": 14.05224323272705, + "learning_rate": 1.4643116038119487e-05, + "loss": 0.3949, + "step": 239970 + }, + { + "epoch": 2.121501440973143, + "grad_norm": 5.751608848571777, + "learning_rate": 1.4641642650447615e-05, + "loss": 0.6334, + "step": 239980 + }, + { + "epoch": 2.1215898442334553, + "grad_norm": 11.240212440490723, + "learning_rate": 1.4640169262775747e-05, + "loss": 0.6054, + "step": 239990 + }, + { + "epoch": 2.1216782474937674, + "grad_norm": 4.4324164390563965, + "learning_rate": 1.4638695875103876e-05, + "loss": 0.6219, + "step": 240000 + }, + { + "epoch": 2.12176665075408, + "grad_norm": 4.325161457061768, + "learning_rate": 1.4637222487432004e-05, + "loss": 0.6089, + "step": 240010 + }, + { + "epoch": 2.121855054014392, + "grad_norm": 12.561410903930664, + "learning_rate": 1.4635749099760132e-05, + "loss": 0.5758, + "step": 240020 + }, + { + "epoch": 2.1219434572747042, + "grad_norm": 1.894711971282959, + "learning_rate": 1.4634275712088264e-05, + "loss": 0.5498, + "step": 240030 + }, + { + "epoch": 2.1220318605350164, + "grad_norm": 2.867130994796753, + "learning_rate": 1.4632802324416392e-05, + "loss": 0.6331, + "step": 240040 + }, + { + "epoch": 2.122120263795329, + "grad_norm": 1.132964015007019, + "learning_rate": 1.463132893674452e-05, + "loss": 0.51, + "step": 240050 + }, + { + "epoch": 2.122208667055641, + "grad_norm": 3.9914145469665527, + "learning_rate": 1.4629855549072652e-05, + "loss": 0.5685, + "step": 240060 + }, + { + "epoch": 2.122297070315953, + "grad_norm": 5.3202996253967285, + "learning_rate": 1.462838216140078e-05, + "loss": 0.5738, + "step": 240070 + }, + { + "epoch": 2.1223854735762653, + "grad_norm": 5.403987884521484, + "learning_rate": 1.462690877372891e-05, + "loss": 0.5635, + "step": 240080 + }, + { + "epoch": 2.122473876836578, + "grad_norm": 3.1679487228393555, + "learning_rate": 1.4625435386057038e-05, + "loss": 0.464, + "step": 240090 + }, + { + "epoch": 2.12256228009689, + "grad_norm": 5.563746929168701, + "learning_rate": 1.462396199838517e-05, + "loss": 0.5178, + "step": 240100 + }, + { + "epoch": 2.122650683357202, + "grad_norm": 3.053244113922119, + "learning_rate": 1.4622488610713298e-05, + "loss": 0.5913, + "step": 240110 + }, + { + "epoch": 2.1227390866175146, + "grad_norm": 10.388944625854492, + "learning_rate": 1.4621015223041426e-05, + "loss": 0.6379, + "step": 240120 + }, + { + "epoch": 2.1228274898778268, + "grad_norm": 1.8436764478683472, + "learning_rate": 1.4619541835369554e-05, + "loss": 0.6943, + "step": 240130 + }, + { + "epoch": 2.122915893138139, + "grad_norm": 5.256056785583496, + "learning_rate": 1.4618068447697686e-05, + "loss": 0.5418, + "step": 240140 + }, + { + "epoch": 2.123004296398451, + "grad_norm": 1.485425591468811, + "learning_rate": 1.4616595060025814e-05, + "loss": 0.546, + "step": 240150 + }, + { + "epoch": 2.1230926996587636, + "grad_norm": 6.504018783569336, + "learning_rate": 1.4615121672353943e-05, + "loss": 0.5055, + "step": 240160 + }, + { + "epoch": 2.1231811029190757, + "grad_norm": 1.1064144372940063, + "learning_rate": 1.4613648284682075e-05, + "loss": 0.4839, + "step": 240170 + }, + { + "epoch": 2.123269506179388, + "grad_norm": 1.7591400146484375, + "learning_rate": 1.4612174897010203e-05, + "loss": 0.4896, + "step": 240180 + }, + { + "epoch": 2.1233579094397, + "grad_norm": 3.680651903152466, + "learning_rate": 1.4610701509338331e-05, + "loss": 0.5946, + "step": 240190 + }, + { + "epoch": 2.1234463127000125, + "grad_norm": 4.794328689575195, + "learning_rate": 1.460922812166646e-05, + "loss": 0.57, + "step": 240200 + }, + { + "epoch": 2.1235347159603246, + "grad_norm": 3.464540719985962, + "learning_rate": 1.4607754733994591e-05, + "loss": 0.6601, + "step": 240210 + }, + { + "epoch": 2.1236231192206367, + "grad_norm": 1.2451211214065552, + "learning_rate": 1.460628134632272e-05, + "loss": 0.365, + "step": 240220 + }, + { + "epoch": 2.1237115224809493, + "grad_norm": 2.2480506896972656, + "learning_rate": 1.4604807958650848e-05, + "loss": 0.7481, + "step": 240230 + }, + { + "epoch": 2.1237999257412614, + "grad_norm": 1.6372555494308472, + "learning_rate": 1.4603334570978976e-05, + "loss": 0.4024, + "step": 240240 + }, + { + "epoch": 2.1238883290015735, + "grad_norm": 1.4410884380340576, + "learning_rate": 1.4601861183307108e-05, + "loss": 0.4883, + "step": 240250 + }, + { + "epoch": 2.1239767322618857, + "grad_norm": 2.8486905097961426, + "learning_rate": 1.4600387795635236e-05, + "loss": 0.5715, + "step": 240260 + }, + { + "epoch": 2.1240651355221982, + "grad_norm": 3.2499794960021973, + "learning_rate": 1.4598914407963365e-05, + "loss": 0.5148, + "step": 240270 + }, + { + "epoch": 2.1241535387825103, + "grad_norm": 5.381415367126465, + "learning_rate": 1.4597441020291497e-05, + "loss": 0.6452, + "step": 240280 + }, + { + "epoch": 2.1242419420428225, + "grad_norm": 1.2976635694503784, + "learning_rate": 1.4595967632619625e-05, + "loss": 0.4863, + "step": 240290 + }, + { + "epoch": 2.1243303453031346, + "grad_norm": 5.2817277908325195, + "learning_rate": 1.4594494244947753e-05, + "loss": 0.5275, + "step": 240300 + }, + { + "epoch": 2.124418748563447, + "grad_norm": 2.4230539798736572, + "learning_rate": 1.4593020857275883e-05, + "loss": 0.469, + "step": 240310 + }, + { + "epoch": 2.1245071518237593, + "grad_norm": 5.153696060180664, + "learning_rate": 1.4591547469604013e-05, + "loss": 0.628, + "step": 240320 + }, + { + "epoch": 2.1245955550840714, + "grad_norm": 7.4757399559021, + "learning_rate": 1.4590074081932142e-05, + "loss": 0.6016, + "step": 240330 + }, + { + "epoch": 2.124683958344384, + "grad_norm": 24.21209716796875, + "learning_rate": 1.4588600694260272e-05, + "loss": 0.5469, + "step": 240340 + }, + { + "epoch": 2.124772361604696, + "grad_norm": 2.855147361755371, + "learning_rate": 1.4587127306588402e-05, + "loss": 0.4387, + "step": 240350 + }, + { + "epoch": 2.124860764865008, + "grad_norm": 1.47322678565979, + "learning_rate": 1.458565391891653e-05, + "loss": 0.456, + "step": 240360 + }, + { + "epoch": 2.1249491681253203, + "grad_norm": 2.173752546310425, + "learning_rate": 1.458418053124466e-05, + "loss": 0.6575, + "step": 240370 + }, + { + "epoch": 2.125037571385633, + "grad_norm": 3.1523139476776123, + "learning_rate": 1.4582707143572789e-05, + "loss": 0.4298, + "step": 240380 + }, + { + "epoch": 2.125125974645945, + "grad_norm": 1.9576984643936157, + "learning_rate": 1.4581233755900919e-05, + "loss": 0.4515, + "step": 240390 + }, + { + "epoch": 2.125214377906257, + "grad_norm": 1.0822162628173828, + "learning_rate": 1.4579760368229049e-05, + "loss": 0.5338, + "step": 240400 + }, + { + "epoch": 2.1253027811665692, + "grad_norm": 1.6529641151428223, + "learning_rate": 1.4578286980557177e-05, + "loss": 0.5293, + "step": 240410 + }, + { + "epoch": 2.125391184426882, + "grad_norm": 3.1430909633636475, + "learning_rate": 1.4576813592885305e-05, + "loss": 0.5365, + "step": 240420 + }, + { + "epoch": 2.125479587687194, + "grad_norm": 0.7883678674697876, + "learning_rate": 1.4575340205213437e-05, + "loss": 0.4112, + "step": 240430 + }, + { + "epoch": 2.125567990947506, + "grad_norm": 1.439388632774353, + "learning_rate": 1.4573866817541565e-05, + "loss": 0.3975, + "step": 240440 + }, + { + "epoch": 2.1256563942078186, + "grad_norm": 1.7812825441360474, + "learning_rate": 1.4572393429869694e-05, + "loss": 0.6317, + "step": 240450 + }, + { + "epoch": 2.1257447974681307, + "grad_norm": 2.912522554397583, + "learning_rate": 1.4570920042197826e-05, + "loss": 0.6094, + "step": 240460 + }, + { + "epoch": 2.125833200728443, + "grad_norm": 2.550605535507202, + "learning_rate": 1.4569446654525954e-05, + "loss": 0.6188, + "step": 240470 + }, + { + "epoch": 2.125921603988755, + "grad_norm": 2.2892959117889404, + "learning_rate": 1.4567973266854082e-05, + "loss": 0.6075, + "step": 240480 + }, + { + "epoch": 2.1260100072490675, + "grad_norm": 1.751401662826538, + "learning_rate": 1.456649987918221e-05, + "loss": 0.6068, + "step": 240490 + }, + { + "epoch": 2.1260984105093796, + "grad_norm": 3.084197998046875, + "learning_rate": 1.4565026491510342e-05, + "loss": 0.5281, + "step": 240500 + }, + { + "epoch": 2.1261868137696918, + "grad_norm": 3.2959957122802734, + "learning_rate": 1.456355310383847e-05, + "loss": 0.6392, + "step": 240510 + }, + { + "epoch": 2.126275217030004, + "grad_norm": 5.311089992523193, + "learning_rate": 1.4562079716166599e-05, + "loss": 0.5264, + "step": 240520 + }, + { + "epoch": 2.1263636202903164, + "grad_norm": 5.699232578277588, + "learning_rate": 1.456060632849473e-05, + "loss": 0.6636, + "step": 240530 + }, + { + "epoch": 2.1264520235506286, + "grad_norm": 5.25744104385376, + "learning_rate": 1.455913294082286e-05, + "loss": 0.5323, + "step": 240540 + }, + { + "epoch": 2.1265404268109407, + "grad_norm": 4.4053802490234375, + "learning_rate": 1.4557659553150988e-05, + "loss": 0.4922, + "step": 240550 + }, + { + "epoch": 2.126628830071253, + "grad_norm": 2.7832424640655518, + "learning_rate": 1.4556186165479116e-05, + "loss": 0.6121, + "step": 240560 + }, + { + "epoch": 2.1267172333315654, + "grad_norm": 2.1833748817443848, + "learning_rate": 1.4554712777807248e-05, + "loss": 0.5498, + "step": 240570 + }, + { + "epoch": 2.1268056365918775, + "grad_norm": 4.73268985748291, + "learning_rate": 1.4553239390135376e-05, + "loss": 0.4989, + "step": 240580 + }, + { + "epoch": 2.1268940398521896, + "grad_norm": 1.5983659029006958, + "learning_rate": 1.4551766002463504e-05, + "loss": 0.449, + "step": 240590 + }, + { + "epoch": 2.126982443112502, + "grad_norm": 1.3055710792541504, + "learning_rate": 1.4550292614791633e-05, + "loss": 0.5206, + "step": 240600 + }, + { + "epoch": 2.1270708463728143, + "grad_norm": 4.789244174957275, + "learning_rate": 1.4548819227119764e-05, + "loss": 0.5636, + "step": 240610 + }, + { + "epoch": 2.1271592496331264, + "grad_norm": 3.334754228591919, + "learning_rate": 1.4547345839447893e-05, + "loss": 0.5347, + "step": 240620 + }, + { + "epoch": 2.1272476528934385, + "grad_norm": 2.9668731689453125, + "learning_rate": 1.4545872451776021e-05, + "loss": 0.5231, + "step": 240630 + }, + { + "epoch": 2.127336056153751, + "grad_norm": 2.6539628505706787, + "learning_rate": 1.4544399064104153e-05, + "loss": 0.4069, + "step": 240640 + }, + { + "epoch": 2.127424459414063, + "grad_norm": 7.577880859375, + "learning_rate": 1.4542925676432281e-05, + "loss": 0.5211, + "step": 240650 + }, + { + "epoch": 2.1275128626743753, + "grad_norm": 2.1609697341918945, + "learning_rate": 1.454145228876041e-05, + "loss": 0.3814, + "step": 240660 + }, + { + "epoch": 2.127601265934688, + "grad_norm": 2.936253547668457, + "learning_rate": 1.4539978901088538e-05, + "loss": 0.5424, + "step": 240670 + }, + { + "epoch": 2.127689669195, + "grad_norm": 0.964122474193573, + "learning_rate": 1.453850551341667e-05, + "loss": 0.5818, + "step": 240680 + }, + { + "epoch": 2.127778072455312, + "grad_norm": 1.289310097694397, + "learning_rate": 1.4537032125744798e-05, + "loss": 0.4823, + "step": 240690 + }, + { + "epoch": 2.1278664757156243, + "grad_norm": 4.428369998931885, + "learning_rate": 1.4535558738072926e-05, + "loss": 0.5208, + "step": 240700 + }, + { + "epoch": 2.127954878975937, + "grad_norm": 4.573816299438477, + "learning_rate": 1.4534085350401055e-05, + "loss": 0.4631, + "step": 240710 + }, + { + "epoch": 2.128043282236249, + "grad_norm": 7.7769999504089355, + "learning_rate": 1.4532611962729186e-05, + "loss": 0.6083, + "step": 240720 + }, + { + "epoch": 2.128131685496561, + "grad_norm": 3.773874282836914, + "learning_rate": 1.4531138575057315e-05, + "loss": 0.5394, + "step": 240730 + }, + { + "epoch": 2.128220088756873, + "grad_norm": 1.633716344833374, + "learning_rate": 1.4529665187385443e-05, + "loss": 0.6209, + "step": 240740 + }, + { + "epoch": 2.1283084920171857, + "grad_norm": 8.172629356384277, + "learning_rate": 1.4528191799713575e-05, + "loss": 0.5643, + "step": 240750 + }, + { + "epoch": 2.128396895277498, + "grad_norm": 3.142880439758301, + "learning_rate": 1.4526718412041703e-05, + "loss": 0.4009, + "step": 240760 + }, + { + "epoch": 2.12848529853781, + "grad_norm": 16.025484085083008, + "learning_rate": 1.4525245024369832e-05, + "loss": 0.5322, + "step": 240770 + }, + { + "epoch": 2.128573701798122, + "grad_norm": 12.173168182373047, + "learning_rate": 1.4523771636697962e-05, + "loss": 0.5905, + "step": 240780 + }, + { + "epoch": 2.1286621050584347, + "grad_norm": 1.3065463304519653, + "learning_rate": 1.4522298249026092e-05, + "loss": 0.5894, + "step": 240790 + }, + { + "epoch": 2.128750508318747, + "grad_norm": 2.994091272354126, + "learning_rate": 1.452082486135422e-05, + "loss": 0.4063, + "step": 240800 + }, + { + "epoch": 2.128838911579059, + "grad_norm": 1.2291150093078613, + "learning_rate": 1.451935147368235e-05, + "loss": 0.6098, + "step": 240810 + }, + { + "epoch": 2.1289273148393715, + "grad_norm": 2.171867609024048, + "learning_rate": 1.451787808601048e-05, + "loss": 0.6846, + "step": 240820 + }, + { + "epoch": 2.1290157180996836, + "grad_norm": 2.645055055618286, + "learning_rate": 1.4516404698338609e-05, + "loss": 0.4745, + "step": 240830 + }, + { + "epoch": 2.1291041213599957, + "grad_norm": 2.92521071434021, + "learning_rate": 1.4514931310666739e-05, + "loss": 0.484, + "step": 240840 + }, + { + "epoch": 2.129192524620308, + "grad_norm": 1.6796516180038452, + "learning_rate": 1.4513457922994867e-05, + "loss": 0.5992, + "step": 240850 + }, + { + "epoch": 2.1292809278806204, + "grad_norm": 2.1034462451934814, + "learning_rate": 1.4511984535322997e-05, + "loss": 0.4698, + "step": 240860 + }, + { + "epoch": 2.1293693311409325, + "grad_norm": 3.301942825317383, + "learning_rate": 1.4510511147651127e-05, + "loss": 0.6137, + "step": 240870 + }, + { + "epoch": 2.1294577344012446, + "grad_norm": 1.9830807447433472, + "learning_rate": 1.4509037759979255e-05, + "loss": 0.4982, + "step": 240880 + }, + { + "epoch": 2.1295461376615568, + "grad_norm": 7.278713226318359, + "learning_rate": 1.4507564372307384e-05, + "loss": 0.536, + "step": 240890 + }, + { + "epoch": 2.1296345409218693, + "grad_norm": 4.218784809112549, + "learning_rate": 1.4506090984635516e-05, + "loss": 0.5146, + "step": 240900 + }, + { + "epoch": 2.1297229441821814, + "grad_norm": 1.3913666009902954, + "learning_rate": 1.4504617596963644e-05, + "loss": 0.4343, + "step": 240910 + }, + { + "epoch": 2.1298113474424936, + "grad_norm": 4.6573896408081055, + "learning_rate": 1.4503144209291772e-05, + "loss": 0.4404, + "step": 240920 + }, + { + "epoch": 2.129899750702806, + "grad_norm": 4.129805564880371, + "learning_rate": 1.4501670821619904e-05, + "loss": 0.5619, + "step": 240930 + }, + { + "epoch": 2.1299881539631182, + "grad_norm": 2.1862449645996094, + "learning_rate": 1.4500197433948032e-05, + "loss": 0.5501, + "step": 240940 + }, + { + "epoch": 2.1300765572234304, + "grad_norm": 2.336277723312378, + "learning_rate": 1.449872404627616e-05, + "loss": 0.5119, + "step": 240950 + }, + { + "epoch": 2.1301649604837425, + "grad_norm": 4.518657684326172, + "learning_rate": 1.4497250658604289e-05, + "loss": 0.5453, + "step": 240960 + }, + { + "epoch": 2.130253363744055, + "grad_norm": 0.8558495044708252, + "learning_rate": 1.449577727093242e-05, + "loss": 0.6047, + "step": 240970 + }, + { + "epoch": 2.130341767004367, + "grad_norm": 6.045229434967041, + "learning_rate": 1.4494303883260549e-05, + "loss": 0.6039, + "step": 240980 + }, + { + "epoch": 2.1304301702646793, + "grad_norm": 2.793973207473755, + "learning_rate": 1.4492830495588677e-05, + "loss": 0.5223, + "step": 240990 + }, + { + "epoch": 2.1305185735249914, + "grad_norm": 9.690314292907715, + "learning_rate": 1.449135710791681e-05, + "loss": 0.577, + "step": 241000 + }, + { + "epoch": 2.130606976785304, + "grad_norm": 3.1432371139526367, + "learning_rate": 1.4489883720244938e-05, + "loss": 0.6153, + "step": 241010 + }, + { + "epoch": 2.130695380045616, + "grad_norm": 3.075500249862671, + "learning_rate": 1.4488410332573066e-05, + "loss": 0.5478, + "step": 241020 + }, + { + "epoch": 2.130783783305928, + "grad_norm": 1.6417487859725952, + "learning_rate": 1.4486936944901194e-05, + "loss": 0.5323, + "step": 241030 + }, + { + "epoch": 2.1308721865662408, + "grad_norm": 5.175327777862549, + "learning_rate": 1.4485463557229326e-05, + "loss": 0.4571, + "step": 241040 + }, + { + "epoch": 2.130960589826553, + "grad_norm": 1.320513129234314, + "learning_rate": 1.4483990169557454e-05, + "loss": 0.5943, + "step": 241050 + }, + { + "epoch": 2.131048993086865, + "grad_norm": 10.010064125061035, + "learning_rate": 1.4482516781885583e-05, + "loss": 0.4862, + "step": 241060 + }, + { + "epoch": 2.131137396347177, + "grad_norm": 2.8325459957122803, + "learning_rate": 1.4481043394213711e-05, + "loss": 0.5712, + "step": 241070 + }, + { + "epoch": 2.1312257996074897, + "grad_norm": 9.87783145904541, + "learning_rate": 1.4479570006541843e-05, + "loss": 0.5292, + "step": 241080 + }, + { + "epoch": 2.131314202867802, + "grad_norm": 4.5042829513549805, + "learning_rate": 1.4478096618869971e-05, + "loss": 0.5512, + "step": 241090 + }, + { + "epoch": 2.131402606128114, + "grad_norm": 2.3752403259277344, + "learning_rate": 1.44766232311981e-05, + "loss": 0.5118, + "step": 241100 + }, + { + "epoch": 2.131491009388426, + "grad_norm": 1.4001080989837646, + "learning_rate": 1.4475149843526231e-05, + "loss": 0.5193, + "step": 241110 + }, + { + "epoch": 2.1315794126487386, + "grad_norm": 41.40862274169922, + "learning_rate": 1.447367645585436e-05, + "loss": 0.5044, + "step": 241120 + }, + { + "epoch": 2.1316678159090507, + "grad_norm": 8.68869686126709, + "learning_rate": 1.4472203068182488e-05, + "loss": 0.5161, + "step": 241130 + }, + { + "epoch": 2.131756219169363, + "grad_norm": 2.687761068344116, + "learning_rate": 1.4470729680510616e-05, + "loss": 0.4795, + "step": 241140 + }, + { + "epoch": 2.131844622429675, + "grad_norm": 2.6580259799957275, + "learning_rate": 1.4469256292838748e-05, + "loss": 0.4793, + "step": 241150 + }, + { + "epoch": 2.1319330256899875, + "grad_norm": 3.525467872619629, + "learning_rate": 1.4467782905166876e-05, + "loss": 0.5413, + "step": 241160 + }, + { + "epoch": 2.1320214289502997, + "grad_norm": 1.0800466537475586, + "learning_rate": 1.4466309517495005e-05, + "loss": 0.6325, + "step": 241170 + }, + { + "epoch": 2.132109832210612, + "grad_norm": 3.723456382751465, + "learning_rate": 1.4464836129823133e-05, + "loss": 0.5811, + "step": 241180 + }, + { + "epoch": 2.1321982354709244, + "grad_norm": 2.3076610565185547, + "learning_rate": 1.4463362742151265e-05, + "loss": 0.5064, + "step": 241190 + }, + { + "epoch": 2.1322866387312365, + "grad_norm": 1.6674011945724487, + "learning_rate": 1.4461889354479393e-05, + "loss": 0.4917, + "step": 241200 + }, + { + "epoch": 2.1323750419915486, + "grad_norm": 10.855932235717773, + "learning_rate": 1.4460415966807522e-05, + "loss": 0.5223, + "step": 241210 + }, + { + "epoch": 2.1324634452518607, + "grad_norm": 2.292621612548828, + "learning_rate": 1.4458942579135653e-05, + "loss": 0.509, + "step": 241220 + }, + { + "epoch": 2.1325518485121733, + "grad_norm": 5.0880208015441895, + "learning_rate": 1.4457469191463782e-05, + "loss": 0.5604, + "step": 241230 + }, + { + "epoch": 2.1326402517724854, + "grad_norm": 2.9229114055633545, + "learning_rate": 1.445599580379191e-05, + "loss": 0.568, + "step": 241240 + }, + { + "epoch": 2.1327286550327975, + "grad_norm": 20.93067169189453, + "learning_rate": 1.445452241612004e-05, + "loss": 0.5155, + "step": 241250 + }, + { + "epoch": 2.13281705829311, + "grad_norm": 3.92238712310791, + "learning_rate": 1.445304902844817e-05, + "loss": 0.4974, + "step": 241260 + }, + { + "epoch": 2.132905461553422, + "grad_norm": 1.362905502319336, + "learning_rate": 1.4451575640776298e-05, + "loss": 0.6001, + "step": 241270 + }, + { + "epoch": 2.1329938648137343, + "grad_norm": 5.493089199066162, + "learning_rate": 1.4450102253104429e-05, + "loss": 0.542, + "step": 241280 + }, + { + "epoch": 2.1330822680740464, + "grad_norm": 0.8251736164093018, + "learning_rate": 1.4448628865432559e-05, + "loss": 0.5299, + "step": 241290 + }, + { + "epoch": 2.133170671334359, + "grad_norm": 2.2211060523986816, + "learning_rate": 1.4447155477760687e-05, + "loss": 0.6785, + "step": 241300 + }, + { + "epoch": 2.133259074594671, + "grad_norm": 2.0152032375335693, + "learning_rate": 1.4445682090088817e-05, + "loss": 0.5667, + "step": 241310 + }, + { + "epoch": 2.1333474778549832, + "grad_norm": 5.118082046508789, + "learning_rate": 1.4444208702416945e-05, + "loss": 0.5581, + "step": 241320 + }, + { + "epoch": 2.1334358811152954, + "grad_norm": 5.4591264724731445, + "learning_rate": 1.4442735314745075e-05, + "loss": 0.5292, + "step": 241330 + }, + { + "epoch": 2.133524284375608, + "grad_norm": 0.7229009866714478, + "learning_rate": 1.4441261927073205e-05, + "loss": 0.4424, + "step": 241340 + }, + { + "epoch": 2.13361268763592, + "grad_norm": 3.3428032398223877, + "learning_rate": 1.4439788539401334e-05, + "loss": 0.4474, + "step": 241350 + }, + { + "epoch": 2.133701090896232, + "grad_norm": 6.701232433319092, + "learning_rate": 1.4438315151729462e-05, + "loss": 0.4876, + "step": 241360 + }, + { + "epoch": 2.1337894941565443, + "grad_norm": 4.500949382781982, + "learning_rate": 1.4436841764057594e-05, + "loss": 0.3816, + "step": 241370 + }, + { + "epoch": 2.133877897416857, + "grad_norm": 6.844239711761475, + "learning_rate": 1.4435368376385722e-05, + "loss": 0.5475, + "step": 241380 + }, + { + "epoch": 2.133966300677169, + "grad_norm": 4.111814022064209, + "learning_rate": 1.443389498871385e-05, + "loss": 0.5708, + "step": 241390 + }, + { + "epoch": 2.134054703937481, + "grad_norm": 0.7619613409042358, + "learning_rate": 1.4432421601041982e-05, + "loss": 0.4919, + "step": 241400 + }, + { + "epoch": 2.1341431071977937, + "grad_norm": 5.253554821014404, + "learning_rate": 1.443094821337011e-05, + "loss": 0.3953, + "step": 241410 + }, + { + "epoch": 2.1342315104581058, + "grad_norm": 2.977701187133789, + "learning_rate": 1.4429474825698239e-05, + "loss": 0.522, + "step": 241420 + }, + { + "epoch": 2.134319913718418, + "grad_norm": 0.973832368850708, + "learning_rate": 1.4428001438026367e-05, + "loss": 0.4609, + "step": 241430 + }, + { + "epoch": 2.13440831697873, + "grad_norm": 3.1162588596343994, + "learning_rate": 1.4426528050354499e-05, + "loss": 0.5457, + "step": 241440 + }, + { + "epoch": 2.1344967202390426, + "grad_norm": 5.190722942352295, + "learning_rate": 1.4425054662682627e-05, + "loss": 0.5208, + "step": 241450 + }, + { + "epoch": 2.1345851234993547, + "grad_norm": 2.410080909729004, + "learning_rate": 1.4423581275010756e-05, + "loss": 0.6353, + "step": 241460 + }, + { + "epoch": 2.134673526759667, + "grad_norm": 5.906299114227295, + "learning_rate": 1.4422107887338888e-05, + "loss": 0.6147, + "step": 241470 + }, + { + "epoch": 2.134761930019979, + "grad_norm": 1.2651969194412231, + "learning_rate": 1.4420634499667016e-05, + "loss": 0.6037, + "step": 241480 + }, + { + "epoch": 2.1348503332802915, + "grad_norm": 2.931360960006714, + "learning_rate": 1.4419161111995144e-05, + "loss": 0.4676, + "step": 241490 + }, + { + "epoch": 2.1349387365406036, + "grad_norm": 3.6155264377593994, + "learning_rate": 1.4417687724323273e-05, + "loss": 0.4576, + "step": 241500 + }, + { + "epoch": 2.1350271398009157, + "grad_norm": 4.940629959106445, + "learning_rate": 1.4416214336651404e-05, + "loss": 0.5469, + "step": 241510 + }, + { + "epoch": 2.1351155430612283, + "grad_norm": 1.8617632389068604, + "learning_rate": 1.4414740948979533e-05, + "loss": 0.5119, + "step": 241520 + }, + { + "epoch": 2.1352039463215404, + "grad_norm": 1.7448190450668335, + "learning_rate": 1.4413267561307661e-05, + "loss": 0.6365, + "step": 241530 + }, + { + "epoch": 2.1352923495818525, + "grad_norm": 6.863341331481934, + "learning_rate": 1.441179417363579e-05, + "loss": 0.5401, + "step": 241540 + }, + { + "epoch": 2.1353807528421647, + "grad_norm": 2.572873115539551, + "learning_rate": 1.4410320785963921e-05, + "loss": 0.6393, + "step": 241550 + }, + { + "epoch": 2.1354691561024772, + "grad_norm": 11.500815391540527, + "learning_rate": 1.440884739829205e-05, + "loss": 0.4541, + "step": 241560 + }, + { + "epoch": 2.1355575593627893, + "grad_norm": 1.7326067686080933, + "learning_rate": 1.4407374010620178e-05, + "loss": 0.4239, + "step": 241570 + }, + { + "epoch": 2.1356459626231015, + "grad_norm": 2.4487578868865967, + "learning_rate": 1.440590062294831e-05, + "loss": 0.5302, + "step": 241580 + }, + { + "epoch": 2.1357343658834136, + "grad_norm": 7.502598762512207, + "learning_rate": 1.4404427235276438e-05, + "loss": 0.6234, + "step": 241590 + }, + { + "epoch": 2.135822769143726, + "grad_norm": 2.2589612007141113, + "learning_rate": 1.4402953847604566e-05, + "loss": 0.5065, + "step": 241600 + }, + { + "epoch": 2.1359111724040383, + "grad_norm": 6.206423282623291, + "learning_rate": 1.4401480459932695e-05, + "loss": 0.57, + "step": 241610 + }, + { + "epoch": 2.1359995756643504, + "grad_norm": 5.917253494262695, + "learning_rate": 1.4400007072260826e-05, + "loss": 0.4815, + "step": 241620 + }, + { + "epoch": 2.136087978924663, + "grad_norm": 1.267506718635559, + "learning_rate": 1.4398533684588955e-05, + "loss": 0.4889, + "step": 241630 + }, + { + "epoch": 2.136176382184975, + "grad_norm": 1.6015568971633911, + "learning_rate": 1.4397060296917083e-05, + "loss": 0.5747, + "step": 241640 + }, + { + "epoch": 2.136264785445287, + "grad_norm": 7.800164699554443, + "learning_rate": 1.4395586909245215e-05, + "loss": 0.4963, + "step": 241650 + }, + { + "epoch": 2.1363531887055993, + "grad_norm": 3.359905242919922, + "learning_rate": 1.4394113521573343e-05, + "loss": 0.6034, + "step": 241660 + }, + { + "epoch": 2.136441591965912, + "grad_norm": 3.055669069290161, + "learning_rate": 1.4392640133901472e-05, + "loss": 0.3775, + "step": 241670 + }, + { + "epoch": 2.136529995226224, + "grad_norm": 0.7454774975776672, + "learning_rate": 1.43911667462296e-05, + "loss": 0.4697, + "step": 241680 + }, + { + "epoch": 2.136618398486536, + "grad_norm": 1.6458839178085327, + "learning_rate": 1.4389693358557732e-05, + "loss": 0.555, + "step": 241690 + }, + { + "epoch": 2.1367068017468482, + "grad_norm": 3.0612423419952393, + "learning_rate": 1.438821997088586e-05, + "loss": 0.659, + "step": 241700 + }, + { + "epoch": 2.136795205007161, + "grad_norm": 2.6412100791931152, + "learning_rate": 1.4386746583213988e-05, + "loss": 0.4425, + "step": 241710 + }, + { + "epoch": 2.136883608267473, + "grad_norm": 4.7102766036987305, + "learning_rate": 1.4385273195542118e-05, + "loss": 0.6199, + "step": 241720 + }, + { + "epoch": 2.136972011527785, + "grad_norm": 28.148395538330078, + "learning_rate": 1.4383799807870249e-05, + "loss": 0.6223, + "step": 241730 + }, + { + "epoch": 2.137060414788097, + "grad_norm": 2.2630279064178467, + "learning_rate": 1.4382326420198377e-05, + "loss": 0.5906, + "step": 241740 + }, + { + "epoch": 2.1371488180484097, + "grad_norm": 6.718053340911865, + "learning_rate": 1.4380853032526507e-05, + "loss": 0.5314, + "step": 241750 + }, + { + "epoch": 2.137237221308722, + "grad_norm": 1.4393151998519897, + "learning_rate": 1.4379379644854637e-05, + "loss": 0.5464, + "step": 241760 + }, + { + "epoch": 2.137325624569034, + "grad_norm": 2.1033742427825928, + "learning_rate": 1.4377906257182765e-05, + "loss": 0.4539, + "step": 241770 + }, + { + "epoch": 2.1374140278293465, + "grad_norm": 3.394801378250122, + "learning_rate": 1.4376432869510895e-05, + "loss": 0.4945, + "step": 241780 + }, + { + "epoch": 2.1375024310896586, + "grad_norm": 16.7426815032959, + "learning_rate": 1.4374959481839024e-05, + "loss": 0.4795, + "step": 241790 + }, + { + "epoch": 2.1375908343499708, + "grad_norm": 6.900665760040283, + "learning_rate": 1.4373486094167154e-05, + "loss": 0.6598, + "step": 241800 + }, + { + "epoch": 2.137679237610283, + "grad_norm": 1.9208617210388184, + "learning_rate": 1.4372012706495284e-05, + "loss": 0.52, + "step": 241810 + }, + { + "epoch": 2.1377676408705955, + "grad_norm": 1.8118481636047363, + "learning_rate": 1.4370539318823412e-05, + "loss": 0.5103, + "step": 241820 + }, + { + "epoch": 2.1378560441309076, + "grad_norm": 3.6516449451446533, + "learning_rate": 1.436906593115154e-05, + "loss": 0.6061, + "step": 241830 + }, + { + "epoch": 2.1379444473912197, + "grad_norm": 2.785080671310425, + "learning_rate": 1.4367592543479672e-05, + "loss": 0.4296, + "step": 241840 + }, + { + "epoch": 2.1380328506515323, + "grad_norm": 1.504050374031067, + "learning_rate": 1.43661191558078e-05, + "loss": 0.5205, + "step": 241850 + }, + { + "epoch": 2.1381212539118444, + "grad_norm": 17.64576530456543, + "learning_rate": 1.4364645768135929e-05, + "loss": 0.5244, + "step": 241860 + }, + { + "epoch": 2.1382096571721565, + "grad_norm": 2.4146389961242676, + "learning_rate": 1.436317238046406e-05, + "loss": 0.4442, + "step": 241870 + }, + { + "epoch": 2.1382980604324686, + "grad_norm": 0.9571672677993774, + "learning_rate": 1.4361698992792189e-05, + "loss": 0.545, + "step": 241880 + }, + { + "epoch": 2.138386463692781, + "grad_norm": 5.250718116760254, + "learning_rate": 1.4360225605120317e-05, + "loss": 0.679, + "step": 241890 + }, + { + "epoch": 2.1384748669530933, + "grad_norm": 28.875904083251953, + "learning_rate": 1.4358752217448446e-05, + "loss": 0.4861, + "step": 241900 + }, + { + "epoch": 2.1385632702134054, + "grad_norm": 4.673433303833008, + "learning_rate": 1.4357278829776578e-05, + "loss": 0.4213, + "step": 241910 + }, + { + "epoch": 2.1386516734737175, + "grad_norm": 2.102203845977783, + "learning_rate": 1.4355805442104706e-05, + "loss": 0.6095, + "step": 241920 + }, + { + "epoch": 2.13874007673403, + "grad_norm": 4.014671802520752, + "learning_rate": 1.4354332054432834e-05, + "loss": 0.5444, + "step": 241930 + }, + { + "epoch": 2.1388284799943422, + "grad_norm": 2.92116379737854, + "learning_rate": 1.4352858666760966e-05, + "loss": 0.5391, + "step": 241940 + }, + { + "epoch": 2.1389168832546543, + "grad_norm": 1.087288498878479, + "learning_rate": 1.4351385279089094e-05, + "loss": 0.5779, + "step": 241950 + }, + { + "epoch": 2.1390052865149665, + "grad_norm": 1.7543346881866455, + "learning_rate": 1.4349911891417223e-05, + "loss": 0.6116, + "step": 241960 + }, + { + "epoch": 2.139093689775279, + "grad_norm": 5.162784576416016, + "learning_rate": 1.4348438503745351e-05, + "loss": 0.5345, + "step": 241970 + }, + { + "epoch": 2.139182093035591, + "grad_norm": 6.423407554626465, + "learning_rate": 1.4346965116073483e-05, + "loss": 0.4691, + "step": 241980 + }, + { + "epoch": 2.1392704962959033, + "grad_norm": 9.25793743133545, + "learning_rate": 1.4345491728401611e-05, + "loss": 0.5937, + "step": 241990 + }, + { + "epoch": 2.139358899556216, + "grad_norm": 5.816666126251221, + "learning_rate": 1.434401834072974e-05, + "loss": 0.6699, + "step": 242000 + }, + { + "epoch": 2.139447302816528, + "grad_norm": 3.4566822052001953, + "learning_rate": 1.4342544953057868e-05, + "loss": 0.5898, + "step": 242010 + }, + { + "epoch": 2.13953570607684, + "grad_norm": 2.7299349308013916, + "learning_rate": 1.4341071565386e-05, + "loss": 0.6148, + "step": 242020 + }, + { + "epoch": 2.139624109337152, + "grad_norm": 6.660886764526367, + "learning_rate": 1.4339598177714128e-05, + "loss": 0.585, + "step": 242030 + }, + { + "epoch": 2.1397125125974648, + "grad_norm": 5.433264255523682, + "learning_rate": 1.4338124790042256e-05, + "loss": 0.6168, + "step": 242040 + }, + { + "epoch": 2.139800915857777, + "grad_norm": 4.035323143005371, + "learning_rate": 1.4336651402370388e-05, + "loss": 0.5179, + "step": 242050 + }, + { + "epoch": 2.139889319118089, + "grad_norm": 4.869418144226074, + "learning_rate": 1.4335178014698516e-05, + "loss": 0.5753, + "step": 242060 + }, + { + "epoch": 2.139977722378401, + "grad_norm": 1.796901822090149, + "learning_rate": 1.4333704627026645e-05, + "loss": 0.4814, + "step": 242070 + }, + { + "epoch": 2.1400661256387137, + "grad_norm": 1.966572880744934, + "learning_rate": 1.4332231239354773e-05, + "loss": 0.4475, + "step": 242080 + }, + { + "epoch": 2.140154528899026, + "grad_norm": 15.745786666870117, + "learning_rate": 1.4330757851682905e-05, + "loss": 0.5573, + "step": 242090 + }, + { + "epoch": 2.140242932159338, + "grad_norm": 2.192887306213379, + "learning_rate": 1.4329284464011033e-05, + "loss": 0.6546, + "step": 242100 + }, + { + "epoch": 2.1403313354196505, + "grad_norm": 2.9849705696105957, + "learning_rate": 1.4327811076339162e-05, + "loss": 0.5612, + "step": 242110 + }, + { + "epoch": 2.1404197386799626, + "grad_norm": 6.382410049438477, + "learning_rate": 1.4326337688667293e-05, + "loss": 0.5895, + "step": 242120 + }, + { + "epoch": 2.1405081419402747, + "grad_norm": 1.4512547254562378, + "learning_rate": 1.4324864300995422e-05, + "loss": 0.5142, + "step": 242130 + }, + { + "epoch": 2.140596545200587, + "grad_norm": 4.293490409851074, + "learning_rate": 1.432339091332355e-05, + "loss": 0.4949, + "step": 242140 + }, + { + "epoch": 2.1406849484608994, + "grad_norm": 9.325529098510742, + "learning_rate": 1.4321917525651678e-05, + "loss": 0.524, + "step": 242150 + }, + { + "epoch": 2.1407733517212115, + "grad_norm": 1.982619047164917, + "learning_rate": 1.432044413797981e-05, + "loss": 0.5948, + "step": 242160 + }, + { + "epoch": 2.1408617549815236, + "grad_norm": 1.9328582286834717, + "learning_rate": 1.4318970750307938e-05, + "loss": 0.5281, + "step": 242170 + }, + { + "epoch": 2.1409501582418358, + "grad_norm": 13.996715545654297, + "learning_rate": 1.4317497362636067e-05, + "loss": 0.5422, + "step": 242180 + }, + { + "epoch": 2.1410385615021483, + "grad_norm": 6.919266700744629, + "learning_rate": 1.4316023974964197e-05, + "loss": 0.5878, + "step": 242190 + }, + { + "epoch": 2.1411269647624604, + "grad_norm": 5.402243137359619, + "learning_rate": 1.4314550587292327e-05, + "loss": 0.5434, + "step": 242200 + }, + { + "epoch": 2.1412153680227726, + "grad_norm": 5.773210048675537, + "learning_rate": 1.4313077199620455e-05, + "loss": 0.5433, + "step": 242210 + }, + { + "epoch": 2.141303771283085, + "grad_norm": 5.391111373901367, + "learning_rate": 1.4311603811948585e-05, + "loss": 0.6053, + "step": 242220 + }, + { + "epoch": 2.1413921745433973, + "grad_norm": 4.08708381652832, + "learning_rate": 1.4310130424276715e-05, + "loss": 0.5569, + "step": 242230 + }, + { + "epoch": 2.1414805778037094, + "grad_norm": 2.8274664878845215, + "learning_rate": 1.4308657036604844e-05, + "loss": 0.4783, + "step": 242240 + }, + { + "epoch": 2.1415689810640215, + "grad_norm": 8.11133861541748, + "learning_rate": 1.4307183648932974e-05, + "loss": 0.4669, + "step": 242250 + }, + { + "epoch": 2.141657384324334, + "grad_norm": 5.682794094085693, + "learning_rate": 1.4305710261261102e-05, + "loss": 0.4711, + "step": 242260 + }, + { + "epoch": 2.141745787584646, + "grad_norm": 2.2159953117370605, + "learning_rate": 1.4304236873589232e-05, + "loss": 0.6263, + "step": 242270 + }, + { + "epoch": 2.1418341908449583, + "grad_norm": 8.61824893951416, + "learning_rate": 1.4302763485917362e-05, + "loss": 0.5808, + "step": 242280 + }, + { + "epoch": 2.1419225941052704, + "grad_norm": 11.9248046875, + "learning_rate": 1.430129009824549e-05, + "loss": 0.4896, + "step": 242290 + }, + { + "epoch": 2.142010997365583, + "grad_norm": 1.136541485786438, + "learning_rate": 1.4299816710573619e-05, + "loss": 0.4602, + "step": 242300 + }, + { + "epoch": 2.142099400625895, + "grad_norm": 2.0425612926483154, + "learning_rate": 1.429834332290175e-05, + "loss": 0.4981, + "step": 242310 + }, + { + "epoch": 2.142187803886207, + "grad_norm": 3.8917834758758545, + "learning_rate": 1.4296869935229879e-05, + "loss": 0.5946, + "step": 242320 + }, + { + "epoch": 2.1422762071465193, + "grad_norm": 3.478379726409912, + "learning_rate": 1.4295396547558007e-05, + "loss": 0.5897, + "step": 242330 + }, + { + "epoch": 2.142364610406832, + "grad_norm": 2.584498167037964, + "learning_rate": 1.4293923159886139e-05, + "loss": 0.6232, + "step": 242340 + }, + { + "epoch": 2.142453013667144, + "grad_norm": 2.9658308029174805, + "learning_rate": 1.4292449772214267e-05, + "loss": 0.5424, + "step": 242350 + }, + { + "epoch": 2.142541416927456, + "grad_norm": 1.8993122577667236, + "learning_rate": 1.4290976384542396e-05, + "loss": 0.4749, + "step": 242360 + }, + { + "epoch": 2.1426298201877687, + "grad_norm": 4.842706680297852, + "learning_rate": 1.4289502996870524e-05, + "loss": 0.6059, + "step": 242370 + }, + { + "epoch": 2.142718223448081, + "grad_norm": 0.6682385802268982, + "learning_rate": 1.4288029609198656e-05, + "loss": 0.4473, + "step": 242380 + }, + { + "epoch": 2.142806626708393, + "grad_norm": 7.5713114738464355, + "learning_rate": 1.4286556221526784e-05, + "loss": 0.5488, + "step": 242390 + }, + { + "epoch": 2.142895029968705, + "grad_norm": 8.595520973205566, + "learning_rate": 1.4285082833854913e-05, + "loss": 0.6164, + "step": 242400 + }, + { + "epoch": 2.1429834332290176, + "grad_norm": 2.4477131366729736, + "learning_rate": 1.4283609446183044e-05, + "loss": 0.4624, + "step": 242410 + }, + { + "epoch": 2.1430718364893298, + "grad_norm": 2.379791259765625, + "learning_rate": 1.4282136058511173e-05, + "loss": 0.5402, + "step": 242420 + }, + { + "epoch": 2.143160239749642, + "grad_norm": 2.250732660293579, + "learning_rate": 1.4280662670839301e-05, + "loss": 0.574, + "step": 242430 + }, + { + "epoch": 2.1432486430099544, + "grad_norm": 5.373231410980225, + "learning_rate": 1.427918928316743e-05, + "loss": 0.5613, + "step": 242440 + }, + { + "epoch": 2.1433370462702666, + "grad_norm": 1.385927438735962, + "learning_rate": 1.4277715895495561e-05, + "loss": 0.4959, + "step": 242450 + }, + { + "epoch": 2.1434254495305787, + "grad_norm": 3.0531952381134033, + "learning_rate": 1.427624250782369e-05, + "loss": 0.5113, + "step": 242460 + }, + { + "epoch": 2.143513852790891, + "grad_norm": 0.6860807538032532, + "learning_rate": 1.4274769120151818e-05, + "loss": 0.6248, + "step": 242470 + }, + { + "epoch": 2.1436022560512034, + "grad_norm": 4.633317470550537, + "learning_rate": 1.4273295732479946e-05, + "loss": 0.4027, + "step": 242480 + }, + { + "epoch": 2.1436906593115155, + "grad_norm": 5.374399185180664, + "learning_rate": 1.4271822344808078e-05, + "loss": 0.4662, + "step": 242490 + }, + { + "epoch": 2.1437790625718276, + "grad_norm": 9.23890209197998, + "learning_rate": 1.4270348957136206e-05, + "loss": 0.5763, + "step": 242500 + }, + { + "epoch": 2.1438674658321397, + "grad_norm": 3.057779312133789, + "learning_rate": 1.4268875569464335e-05, + "loss": 0.5484, + "step": 242510 + }, + { + "epoch": 2.1439558690924523, + "grad_norm": 5.137360572814941, + "learning_rate": 1.4267402181792466e-05, + "loss": 0.5025, + "step": 242520 + }, + { + "epoch": 2.1440442723527644, + "grad_norm": 1.0035243034362793, + "learning_rate": 1.4265928794120595e-05, + "loss": 0.4326, + "step": 242530 + }, + { + "epoch": 2.1441326756130765, + "grad_norm": 1.8420356512069702, + "learning_rate": 1.4264455406448723e-05, + "loss": 0.4649, + "step": 242540 + }, + { + "epoch": 2.1442210788733886, + "grad_norm": 3.611940383911133, + "learning_rate": 1.4262982018776851e-05, + "loss": 0.5158, + "step": 242550 + }, + { + "epoch": 2.144309482133701, + "grad_norm": 5.660635471343994, + "learning_rate": 1.4261508631104983e-05, + "loss": 0.4927, + "step": 242560 + }, + { + "epoch": 2.1443978853940133, + "grad_norm": 1.4465100765228271, + "learning_rate": 1.4260035243433112e-05, + "loss": 0.5378, + "step": 242570 + }, + { + "epoch": 2.1444862886543254, + "grad_norm": 2.076045274734497, + "learning_rate": 1.425856185576124e-05, + "loss": 0.4524, + "step": 242580 + }, + { + "epoch": 2.144574691914638, + "grad_norm": 6.39243745803833, + "learning_rate": 1.4257088468089372e-05, + "loss": 0.5133, + "step": 242590 + }, + { + "epoch": 2.14466309517495, + "grad_norm": 4.7603583335876465, + "learning_rate": 1.42556150804175e-05, + "loss": 0.6123, + "step": 242600 + }, + { + "epoch": 2.1447514984352622, + "grad_norm": 2.5386927127838135, + "learning_rate": 1.4254141692745628e-05, + "loss": 0.6442, + "step": 242610 + }, + { + "epoch": 2.1448399016955744, + "grad_norm": 2.9273979663848877, + "learning_rate": 1.4252668305073757e-05, + "loss": 0.6885, + "step": 242620 + }, + { + "epoch": 2.144928304955887, + "grad_norm": 1.6224405765533447, + "learning_rate": 1.4251194917401888e-05, + "loss": 0.5061, + "step": 242630 + }, + { + "epoch": 2.145016708216199, + "grad_norm": 2.5285868644714355, + "learning_rate": 1.4249721529730017e-05, + "loss": 0.6212, + "step": 242640 + }, + { + "epoch": 2.145105111476511, + "grad_norm": 2.4672160148620605, + "learning_rate": 1.4248248142058145e-05, + "loss": 0.6273, + "step": 242650 + }, + { + "epoch": 2.1451935147368233, + "grad_norm": 3.798577308654785, + "learning_rate": 1.4246774754386275e-05, + "loss": 0.5571, + "step": 242660 + }, + { + "epoch": 2.145281917997136, + "grad_norm": 13.11314582824707, + "learning_rate": 1.4245301366714405e-05, + "loss": 0.587, + "step": 242670 + }, + { + "epoch": 2.145370321257448, + "grad_norm": 3.123300552368164, + "learning_rate": 1.4243827979042534e-05, + "loss": 0.5052, + "step": 242680 + }, + { + "epoch": 2.14545872451776, + "grad_norm": 7.283506870269775, + "learning_rate": 1.4242354591370664e-05, + "loss": 0.6203, + "step": 242690 + }, + { + "epoch": 2.1455471277780727, + "grad_norm": 18.284273147583008, + "learning_rate": 1.4240881203698794e-05, + "loss": 0.4269, + "step": 242700 + }, + { + "epoch": 2.145635531038385, + "grad_norm": 9.643710136413574, + "learning_rate": 1.4239407816026922e-05, + "loss": 0.5335, + "step": 242710 + }, + { + "epoch": 2.145723934298697, + "grad_norm": 1.1225578784942627, + "learning_rate": 1.4237934428355052e-05, + "loss": 0.4317, + "step": 242720 + }, + { + "epoch": 2.145812337559009, + "grad_norm": 2.470534086227417, + "learning_rate": 1.423646104068318e-05, + "loss": 0.6215, + "step": 242730 + }, + { + "epoch": 2.1459007408193216, + "grad_norm": 2.7008957862854004, + "learning_rate": 1.423498765301131e-05, + "loss": 0.5555, + "step": 242740 + }, + { + "epoch": 2.1459891440796337, + "grad_norm": 3.037323474884033, + "learning_rate": 1.423351426533944e-05, + "loss": 0.6183, + "step": 242750 + }, + { + "epoch": 2.146077547339946, + "grad_norm": 2.7267367839813232, + "learning_rate": 1.4232040877667569e-05, + "loss": 0.5153, + "step": 242760 + }, + { + "epoch": 2.146165950600258, + "grad_norm": 2.1468639373779297, + "learning_rate": 1.4230567489995699e-05, + "loss": 0.6118, + "step": 242770 + }, + { + "epoch": 2.1462543538605705, + "grad_norm": 16.27389144897461, + "learning_rate": 1.4229094102323829e-05, + "loss": 0.4848, + "step": 242780 + }, + { + "epoch": 2.1463427571208826, + "grad_norm": 5.013876914978027, + "learning_rate": 1.4227620714651957e-05, + "loss": 0.4837, + "step": 242790 + }, + { + "epoch": 2.1464311603811947, + "grad_norm": 5.544818878173828, + "learning_rate": 1.4226147326980086e-05, + "loss": 0.5458, + "step": 242800 + }, + { + "epoch": 2.1465195636415073, + "grad_norm": 3.3300654888153076, + "learning_rate": 1.4224673939308217e-05, + "loss": 0.5519, + "step": 242810 + }, + { + "epoch": 2.1466079669018194, + "grad_norm": 1.6979732513427734, + "learning_rate": 1.4223200551636346e-05, + "loss": 0.5769, + "step": 242820 + }, + { + "epoch": 2.1466963701621316, + "grad_norm": 1.1829543113708496, + "learning_rate": 1.4221727163964474e-05, + "loss": 0.5548, + "step": 242830 + }, + { + "epoch": 2.1467847734224437, + "grad_norm": 4.9494123458862305, + "learning_rate": 1.4220253776292603e-05, + "loss": 0.4306, + "step": 242840 + }, + { + "epoch": 2.1468731766827562, + "grad_norm": 3.2819621562957764, + "learning_rate": 1.4218780388620734e-05, + "loss": 0.5528, + "step": 242850 + }, + { + "epoch": 2.1469615799430684, + "grad_norm": 7.365408897399902, + "learning_rate": 1.4217307000948863e-05, + "loss": 0.6366, + "step": 242860 + }, + { + "epoch": 2.1470499832033805, + "grad_norm": 7.6473565101623535, + "learning_rate": 1.4215833613276991e-05, + "loss": 0.6102, + "step": 242870 + }, + { + "epoch": 2.1471383864636926, + "grad_norm": 5.077911376953125, + "learning_rate": 1.4214360225605123e-05, + "loss": 0.5057, + "step": 242880 + }, + { + "epoch": 2.147226789724005, + "grad_norm": 4.500606060028076, + "learning_rate": 1.4212886837933251e-05, + "loss": 0.5648, + "step": 242890 + }, + { + "epoch": 2.1473151929843173, + "grad_norm": 1.1757597923278809, + "learning_rate": 1.421141345026138e-05, + "loss": 0.6111, + "step": 242900 + }, + { + "epoch": 2.1474035962446294, + "grad_norm": 2.015333414077759, + "learning_rate": 1.4209940062589508e-05, + "loss": 0.5132, + "step": 242910 + }, + { + "epoch": 2.1474919995049415, + "grad_norm": 3.463022470474243, + "learning_rate": 1.420846667491764e-05, + "loss": 0.4902, + "step": 242920 + }, + { + "epoch": 2.147580402765254, + "grad_norm": 2.312302589416504, + "learning_rate": 1.4206993287245768e-05, + "loss": 0.5324, + "step": 242930 + }, + { + "epoch": 2.147668806025566, + "grad_norm": 19.547388076782227, + "learning_rate": 1.4205519899573896e-05, + "loss": 0.5791, + "step": 242940 + }, + { + "epoch": 2.1477572092858783, + "grad_norm": 8.90135383605957, + "learning_rate": 1.4204046511902025e-05, + "loss": 0.4676, + "step": 242950 + }, + { + "epoch": 2.147845612546191, + "grad_norm": 1.9959759712219238, + "learning_rate": 1.4202573124230156e-05, + "loss": 0.5748, + "step": 242960 + }, + { + "epoch": 2.147934015806503, + "grad_norm": 2.963587999343872, + "learning_rate": 1.4201099736558285e-05, + "loss": 0.4803, + "step": 242970 + }, + { + "epoch": 2.148022419066815, + "grad_norm": 22.07484245300293, + "learning_rate": 1.4199626348886413e-05, + "loss": 0.535, + "step": 242980 + }, + { + "epoch": 2.1481108223271272, + "grad_norm": 3.7510414123535156, + "learning_rate": 1.4198152961214545e-05, + "loss": 0.5776, + "step": 242990 + }, + { + "epoch": 2.14819922558744, + "grad_norm": 7.482873439788818, + "learning_rate": 1.4196679573542673e-05, + "loss": 0.535, + "step": 243000 + }, + { + "epoch": 2.148287628847752, + "grad_norm": 5.228713512420654, + "learning_rate": 1.4195206185870801e-05, + "loss": 0.5299, + "step": 243010 + }, + { + "epoch": 2.148376032108064, + "grad_norm": 4.087660312652588, + "learning_rate": 1.419373279819893e-05, + "loss": 0.5808, + "step": 243020 + }, + { + "epoch": 2.1484644353683766, + "grad_norm": 4.328578472137451, + "learning_rate": 1.4192259410527062e-05, + "loss": 0.5598, + "step": 243030 + }, + { + "epoch": 2.1485528386286887, + "grad_norm": 6.337801933288574, + "learning_rate": 1.419078602285519e-05, + "loss": 0.4681, + "step": 243040 + }, + { + "epoch": 2.148641241889001, + "grad_norm": 2.118286609649658, + "learning_rate": 1.4189312635183318e-05, + "loss": 0.505, + "step": 243050 + }, + { + "epoch": 2.148729645149313, + "grad_norm": 5.208235263824463, + "learning_rate": 1.418783924751145e-05, + "loss": 0.4822, + "step": 243060 + }, + { + "epoch": 2.1488180484096255, + "grad_norm": 3.5704526901245117, + "learning_rate": 1.4186365859839578e-05, + "loss": 0.4362, + "step": 243070 + }, + { + "epoch": 2.1489064516699377, + "grad_norm": 5.069797039031982, + "learning_rate": 1.4184892472167707e-05, + "loss": 0.5719, + "step": 243080 + }, + { + "epoch": 2.1489948549302498, + "grad_norm": 16.605297088623047, + "learning_rate": 1.4183419084495835e-05, + "loss": 0.5458, + "step": 243090 + }, + { + "epoch": 2.149083258190562, + "grad_norm": 4.501530647277832, + "learning_rate": 1.4181945696823967e-05, + "loss": 0.5595, + "step": 243100 + }, + { + "epoch": 2.1491716614508745, + "grad_norm": 0.9349772334098816, + "learning_rate": 1.4180472309152095e-05, + "loss": 0.4654, + "step": 243110 + }, + { + "epoch": 2.1492600647111866, + "grad_norm": 1.8278956413269043, + "learning_rate": 1.4178998921480224e-05, + "loss": 0.4924, + "step": 243120 + }, + { + "epoch": 2.1493484679714987, + "grad_norm": 1.8389554023742676, + "learning_rate": 1.4177525533808354e-05, + "loss": 0.5094, + "step": 243130 + }, + { + "epoch": 2.149436871231811, + "grad_norm": 4.045785903930664, + "learning_rate": 1.4176052146136484e-05, + "loss": 0.5638, + "step": 243140 + }, + { + "epoch": 2.1495252744921234, + "grad_norm": 1.785209059715271, + "learning_rate": 1.4174578758464612e-05, + "loss": 0.5695, + "step": 243150 + }, + { + "epoch": 2.1496136777524355, + "grad_norm": 3.6308679580688477, + "learning_rate": 1.4173105370792742e-05, + "loss": 0.5674, + "step": 243160 + }, + { + "epoch": 2.1497020810127476, + "grad_norm": 2.0617215633392334, + "learning_rate": 1.4171631983120872e-05, + "loss": 0.5044, + "step": 243170 + }, + { + "epoch": 2.14979048427306, + "grad_norm": 1.6697742938995361, + "learning_rate": 1.4170158595449e-05, + "loss": 0.5729, + "step": 243180 + }, + { + "epoch": 2.1498788875333723, + "grad_norm": 5.6597771644592285, + "learning_rate": 1.416868520777713e-05, + "loss": 0.6924, + "step": 243190 + }, + { + "epoch": 2.1499672907936844, + "grad_norm": 2.2514662742614746, + "learning_rate": 1.4167211820105259e-05, + "loss": 0.4477, + "step": 243200 + }, + { + "epoch": 2.1500556940539965, + "grad_norm": 4.054137229919434, + "learning_rate": 1.4165738432433389e-05, + "loss": 0.522, + "step": 243210 + }, + { + "epoch": 2.150144097314309, + "grad_norm": 3.878481864929199, + "learning_rate": 1.4164265044761519e-05, + "loss": 0.5467, + "step": 243220 + }, + { + "epoch": 2.1502325005746212, + "grad_norm": 1.781954288482666, + "learning_rate": 1.4162791657089647e-05, + "loss": 0.5784, + "step": 243230 + }, + { + "epoch": 2.1503209038349334, + "grad_norm": 2.1596295833587646, + "learning_rate": 1.4161318269417777e-05, + "loss": 0.5393, + "step": 243240 + }, + { + "epoch": 2.1504093070952455, + "grad_norm": 5.978092670440674, + "learning_rate": 1.4159844881745907e-05, + "loss": 0.5502, + "step": 243250 + }, + { + "epoch": 2.150497710355558, + "grad_norm": 2.007143259048462, + "learning_rate": 1.4158371494074036e-05, + "loss": 0.5631, + "step": 243260 + }, + { + "epoch": 2.15058611361587, + "grad_norm": 2.132291078567505, + "learning_rate": 1.4156898106402164e-05, + "loss": 0.5119, + "step": 243270 + }, + { + "epoch": 2.1506745168761823, + "grad_norm": 2.355161190032959, + "learning_rate": 1.4155424718730296e-05, + "loss": 0.5385, + "step": 243280 + }, + { + "epoch": 2.150762920136495, + "grad_norm": 1.3130526542663574, + "learning_rate": 1.4153951331058424e-05, + "loss": 0.4757, + "step": 243290 + }, + { + "epoch": 2.150851323396807, + "grad_norm": 1.5809767246246338, + "learning_rate": 1.4152477943386553e-05, + "loss": 0.5905, + "step": 243300 + }, + { + "epoch": 2.150939726657119, + "grad_norm": 18.600439071655273, + "learning_rate": 1.4151004555714681e-05, + "loss": 0.5049, + "step": 243310 + }, + { + "epoch": 2.151028129917431, + "grad_norm": 4.024776935577393, + "learning_rate": 1.4149531168042813e-05, + "loss": 0.5275, + "step": 243320 + }, + { + "epoch": 2.1511165331777438, + "grad_norm": 1.5048987865447998, + "learning_rate": 1.4148057780370941e-05, + "loss": 0.4114, + "step": 243330 + }, + { + "epoch": 2.151204936438056, + "grad_norm": 4.712319374084473, + "learning_rate": 1.414658439269907e-05, + "loss": 0.6641, + "step": 243340 + }, + { + "epoch": 2.151293339698368, + "grad_norm": 1.437583565711975, + "learning_rate": 1.4145111005027201e-05, + "loss": 0.4208, + "step": 243350 + }, + { + "epoch": 2.15138174295868, + "grad_norm": 5.430643081665039, + "learning_rate": 1.414363761735533e-05, + "loss": 0.5979, + "step": 243360 + }, + { + "epoch": 2.1514701462189927, + "grad_norm": 2.116624116897583, + "learning_rate": 1.4142164229683458e-05, + "loss": 0.5769, + "step": 243370 + }, + { + "epoch": 2.151558549479305, + "grad_norm": 3.3818469047546387, + "learning_rate": 1.4140690842011586e-05, + "loss": 0.556, + "step": 243380 + }, + { + "epoch": 2.151646952739617, + "grad_norm": 4.797572135925293, + "learning_rate": 1.4139217454339718e-05, + "loss": 0.4472, + "step": 243390 + }, + { + "epoch": 2.1517353559999295, + "grad_norm": 3.4348905086517334, + "learning_rate": 1.4137744066667846e-05, + "loss": 0.5901, + "step": 243400 + }, + { + "epoch": 2.1518237592602416, + "grad_norm": 3.245023250579834, + "learning_rate": 1.4136270678995975e-05, + "loss": 0.5512, + "step": 243410 + }, + { + "epoch": 2.1519121625205537, + "grad_norm": 2.7222514152526855, + "learning_rate": 1.4134797291324103e-05, + "loss": 0.4633, + "step": 243420 + }, + { + "epoch": 2.152000565780866, + "grad_norm": 2.101318120956421, + "learning_rate": 1.4133323903652235e-05, + "loss": 0.6493, + "step": 243430 + }, + { + "epoch": 2.1520889690411784, + "grad_norm": 1.0674017667770386, + "learning_rate": 1.4131850515980363e-05, + "loss": 0.5322, + "step": 243440 + }, + { + "epoch": 2.1521773723014905, + "grad_norm": 4.016197204589844, + "learning_rate": 1.4130377128308491e-05, + "loss": 0.4698, + "step": 243450 + }, + { + "epoch": 2.1522657755618027, + "grad_norm": 5.009634494781494, + "learning_rate": 1.4128903740636623e-05, + "loss": 0.4983, + "step": 243460 + }, + { + "epoch": 2.1523541788221148, + "grad_norm": 0.9461274147033691, + "learning_rate": 1.4127430352964752e-05, + "loss": 0.4518, + "step": 243470 + }, + { + "epoch": 2.1524425820824273, + "grad_norm": 4.652108192443848, + "learning_rate": 1.412595696529288e-05, + "loss": 0.5045, + "step": 243480 + }, + { + "epoch": 2.1525309853427395, + "grad_norm": 3.8266632556915283, + "learning_rate": 1.4124483577621008e-05, + "loss": 0.6037, + "step": 243490 + }, + { + "epoch": 2.1526193886030516, + "grad_norm": 3.0139670372009277, + "learning_rate": 1.412301018994914e-05, + "loss": 0.5259, + "step": 243500 + }, + { + "epoch": 2.1527077918633637, + "grad_norm": 1.3237189054489136, + "learning_rate": 1.4121536802277268e-05, + "loss": 0.5152, + "step": 243510 + }, + { + "epoch": 2.1527961951236763, + "grad_norm": 7.67440128326416, + "learning_rate": 1.4120063414605397e-05, + "loss": 0.5884, + "step": 243520 + }, + { + "epoch": 2.1528845983839884, + "grad_norm": 1.374855399131775, + "learning_rate": 1.4118590026933528e-05, + "loss": 0.5226, + "step": 243530 + }, + { + "epoch": 2.1529730016443005, + "grad_norm": 1.9073383808135986, + "learning_rate": 1.4117116639261657e-05, + "loss": 0.4565, + "step": 243540 + }, + { + "epoch": 2.153061404904613, + "grad_norm": 7.997211933135986, + "learning_rate": 1.4115643251589785e-05, + "loss": 0.6821, + "step": 243550 + }, + { + "epoch": 2.153149808164925, + "grad_norm": 2.6752657890319824, + "learning_rate": 1.4114169863917913e-05, + "loss": 0.5862, + "step": 243560 + }, + { + "epoch": 2.1532382114252373, + "grad_norm": 2.375593662261963, + "learning_rate": 1.4112696476246045e-05, + "loss": 0.5359, + "step": 243570 + }, + { + "epoch": 2.1533266146855494, + "grad_norm": 2.9395029544830322, + "learning_rate": 1.4111223088574174e-05, + "loss": 0.4453, + "step": 243580 + }, + { + "epoch": 2.153415017945862, + "grad_norm": 5.228000640869141, + "learning_rate": 1.4109749700902302e-05, + "loss": 0.5479, + "step": 243590 + }, + { + "epoch": 2.153503421206174, + "grad_norm": 1.9900497198104858, + "learning_rate": 1.4108276313230432e-05, + "loss": 0.44, + "step": 243600 + }, + { + "epoch": 2.1535918244664862, + "grad_norm": 1.2646338939666748, + "learning_rate": 1.4106802925558562e-05, + "loss": 0.5087, + "step": 243610 + }, + { + "epoch": 2.153680227726799, + "grad_norm": 6.573325157165527, + "learning_rate": 1.410532953788669e-05, + "loss": 0.6127, + "step": 243620 + }, + { + "epoch": 2.153768630987111, + "grad_norm": 7.747644901275635, + "learning_rate": 1.410385615021482e-05, + "loss": 0.4292, + "step": 243630 + }, + { + "epoch": 2.153857034247423, + "grad_norm": 10.362239837646484, + "learning_rate": 1.410238276254295e-05, + "loss": 0.4352, + "step": 243640 + }, + { + "epoch": 2.153945437507735, + "grad_norm": 3.3149967193603516, + "learning_rate": 1.4100909374871079e-05, + "loss": 0.5836, + "step": 243650 + }, + { + "epoch": 2.1540338407680477, + "grad_norm": 1.4797831773757935, + "learning_rate": 1.4099435987199209e-05, + "loss": 0.4638, + "step": 243660 + }, + { + "epoch": 2.15412224402836, + "grad_norm": 12.024620056152344, + "learning_rate": 1.4097962599527337e-05, + "loss": 0.5758, + "step": 243670 + }, + { + "epoch": 2.154210647288672, + "grad_norm": 3.7230045795440674, + "learning_rate": 1.4096489211855469e-05, + "loss": 0.4682, + "step": 243680 + }, + { + "epoch": 2.154299050548984, + "grad_norm": 3.8605735301971436, + "learning_rate": 1.4095015824183597e-05, + "loss": 0.5119, + "step": 243690 + }, + { + "epoch": 2.1543874538092966, + "grad_norm": 3.4832050800323486, + "learning_rate": 1.4093542436511726e-05, + "loss": 0.5583, + "step": 243700 + }, + { + "epoch": 2.1544758570696088, + "grad_norm": 4.797885417938232, + "learning_rate": 1.4092069048839857e-05, + "loss": 0.529, + "step": 243710 + }, + { + "epoch": 2.154564260329921, + "grad_norm": 1.958357810974121, + "learning_rate": 1.4090595661167986e-05, + "loss": 0.7129, + "step": 243720 + }, + { + "epoch": 2.154652663590233, + "grad_norm": 2.1061131954193115, + "learning_rate": 1.4089122273496114e-05, + "loss": 0.5779, + "step": 243730 + }, + { + "epoch": 2.1547410668505456, + "grad_norm": 3.5940310955047607, + "learning_rate": 1.4087648885824242e-05, + "loss": 0.524, + "step": 243740 + }, + { + "epoch": 2.1548294701108577, + "grad_norm": 8.640286445617676, + "learning_rate": 1.4086175498152374e-05, + "loss": 0.503, + "step": 243750 + }, + { + "epoch": 2.15491787337117, + "grad_norm": 2.501406192779541, + "learning_rate": 1.4084702110480503e-05, + "loss": 0.5646, + "step": 243760 + }, + { + "epoch": 2.1550062766314824, + "grad_norm": 2.376133680343628, + "learning_rate": 1.4083228722808631e-05, + "loss": 0.6129, + "step": 243770 + }, + { + "epoch": 2.1550946798917945, + "grad_norm": 7.6234588623046875, + "learning_rate": 1.408175533513676e-05, + "loss": 0.5366, + "step": 243780 + }, + { + "epoch": 2.1551830831521066, + "grad_norm": 1.8829786777496338, + "learning_rate": 1.4080281947464891e-05, + "loss": 0.5431, + "step": 243790 + }, + { + "epoch": 2.1552714864124187, + "grad_norm": 1.1410290002822876, + "learning_rate": 1.407880855979302e-05, + "loss": 0.3913, + "step": 243800 + }, + { + "epoch": 2.1553598896727313, + "grad_norm": 2.5937092304229736, + "learning_rate": 1.4077335172121148e-05, + "loss": 0.7068, + "step": 243810 + }, + { + "epoch": 2.1554482929330434, + "grad_norm": 4.540855884552002, + "learning_rate": 1.407586178444928e-05, + "loss": 0.5477, + "step": 243820 + }, + { + "epoch": 2.1555366961933555, + "grad_norm": 2.1727147102355957, + "learning_rate": 1.4074388396777408e-05, + "loss": 0.5254, + "step": 243830 + }, + { + "epoch": 2.1556250994536676, + "grad_norm": 4.82479190826416, + "learning_rate": 1.4072915009105536e-05, + "loss": 0.4104, + "step": 243840 + }, + { + "epoch": 2.15571350271398, + "grad_norm": 23.717605590820312, + "learning_rate": 1.4071441621433665e-05, + "loss": 0.4393, + "step": 243850 + }, + { + "epoch": 2.1558019059742923, + "grad_norm": 12.543879508972168, + "learning_rate": 1.4069968233761796e-05, + "loss": 0.616, + "step": 243860 + }, + { + "epoch": 2.1558903092346045, + "grad_norm": 2.2181859016418457, + "learning_rate": 1.4068494846089925e-05, + "loss": 0.548, + "step": 243870 + }, + { + "epoch": 2.155978712494917, + "grad_norm": 3.189884901046753, + "learning_rate": 1.4067021458418053e-05, + "loss": 0.653, + "step": 243880 + }, + { + "epoch": 2.156067115755229, + "grad_norm": 3.093968152999878, + "learning_rate": 1.4065548070746181e-05, + "loss": 0.4418, + "step": 243890 + }, + { + "epoch": 2.1561555190155413, + "grad_norm": 2.7692604064941406, + "learning_rate": 1.4064074683074313e-05, + "loss": 0.5483, + "step": 243900 + }, + { + "epoch": 2.1562439222758534, + "grad_norm": 4.377284526824951, + "learning_rate": 1.4062601295402441e-05, + "loss": 0.5313, + "step": 243910 + }, + { + "epoch": 2.156332325536166, + "grad_norm": 1.1519109010696411, + "learning_rate": 1.406112790773057e-05, + "loss": 0.5374, + "step": 243920 + }, + { + "epoch": 2.156420728796478, + "grad_norm": 3.7586669921875, + "learning_rate": 1.4059654520058702e-05, + "loss": 0.5953, + "step": 243930 + }, + { + "epoch": 2.15650913205679, + "grad_norm": 1.9818472862243652, + "learning_rate": 1.405818113238683e-05, + "loss": 0.4939, + "step": 243940 + }, + { + "epoch": 2.1565975353171023, + "grad_norm": 4.222870826721191, + "learning_rate": 1.4056707744714958e-05, + "loss": 0.528, + "step": 243950 + }, + { + "epoch": 2.156685938577415, + "grad_norm": 2.9066803455352783, + "learning_rate": 1.4055234357043087e-05, + "loss": 0.6258, + "step": 243960 + }, + { + "epoch": 2.156774341837727, + "grad_norm": 4.5635247230529785, + "learning_rate": 1.4053760969371218e-05, + "loss": 0.4973, + "step": 243970 + }, + { + "epoch": 2.156862745098039, + "grad_norm": 16.13134765625, + "learning_rate": 1.4052287581699347e-05, + "loss": 0.6346, + "step": 243980 + }, + { + "epoch": 2.1569511483583517, + "grad_norm": 2.2910966873168945, + "learning_rate": 1.4050814194027475e-05, + "loss": 0.4616, + "step": 243990 + }, + { + "epoch": 2.157039551618664, + "grad_norm": 1.913004994392395, + "learning_rate": 1.4049340806355607e-05, + "loss": 0.5095, + "step": 244000 + }, + { + "epoch": 2.157127954878976, + "grad_norm": 21.787546157836914, + "learning_rate": 1.4047867418683735e-05, + "loss": 0.5429, + "step": 244010 + }, + { + "epoch": 2.157216358139288, + "grad_norm": 1.3679674863815308, + "learning_rate": 1.4046394031011863e-05, + "loss": 0.5798, + "step": 244020 + }, + { + "epoch": 2.1573047613996006, + "grad_norm": 1.7564347982406616, + "learning_rate": 1.4044920643339994e-05, + "loss": 0.5611, + "step": 244030 + }, + { + "epoch": 2.1573931646599127, + "grad_norm": 3.607762098312378, + "learning_rate": 1.4043447255668124e-05, + "loss": 0.4732, + "step": 244040 + }, + { + "epoch": 2.157481567920225, + "grad_norm": 0.5492611527442932, + "learning_rate": 1.4041973867996252e-05, + "loss": 0.4291, + "step": 244050 + }, + { + "epoch": 2.157569971180537, + "grad_norm": 4.104165077209473, + "learning_rate": 1.4040500480324382e-05, + "loss": 0.5458, + "step": 244060 + }, + { + "epoch": 2.1576583744408495, + "grad_norm": 1.4264072179794312, + "learning_rate": 1.403902709265251e-05, + "loss": 0.6206, + "step": 244070 + }, + { + "epoch": 2.1577467777011616, + "grad_norm": 5.074565410614014, + "learning_rate": 1.403755370498064e-05, + "loss": 0.5389, + "step": 244080 + }, + { + "epoch": 2.1578351809614738, + "grad_norm": 3.419053316116333, + "learning_rate": 1.403608031730877e-05, + "loss": 0.5098, + "step": 244090 + }, + { + "epoch": 2.157923584221786, + "grad_norm": 1.3161959648132324, + "learning_rate": 1.4034606929636899e-05, + "loss": 0.4712, + "step": 244100 + }, + { + "epoch": 2.1580119874820984, + "grad_norm": 3.7547218799591064, + "learning_rate": 1.4033133541965029e-05, + "loss": 0.521, + "step": 244110 + }, + { + "epoch": 2.1581003907424106, + "grad_norm": 0.9869753122329712, + "learning_rate": 1.4031660154293159e-05, + "loss": 0.5712, + "step": 244120 + }, + { + "epoch": 2.1581887940027227, + "grad_norm": 7.619582176208496, + "learning_rate": 1.4030186766621287e-05, + "loss": 0.5717, + "step": 244130 + }, + { + "epoch": 2.1582771972630352, + "grad_norm": 9.656731605529785, + "learning_rate": 1.4028713378949416e-05, + "loss": 0.6281, + "step": 244140 + }, + { + "epoch": 2.1583656005233474, + "grad_norm": 1.5996860265731812, + "learning_rate": 1.4027239991277547e-05, + "loss": 0.6208, + "step": 244150 + }, + { + "epoch": 2.1584540037836595, + "grad_norm": 3.011223316192627, + "learning_rate": 1.4025766603605676e-05, + "loss": 0.4602, + "step": 244160 + }, + { + "epoch": 2.1585424070439716, + "grad_norm": 7.06212043762207, + "learning_rate": 1.4024293215933804e-05, + "loss": 0.4739, + "step": 244170 + }, + { + "epoch": 2.158630810304284, + "grad_norm": 2.929180860519409, + "learning_rate": 1.4022819828261936e-05, + "loss": 0.4309, + "step": 244180 + }, + { + "epoch": 2.1587192135645963, + "grad_norm": 3.3005049228668213, + "learning_rate": 1.4021346440590064e-05, + "loss": 0.5599, + "step": 244190 + }, + { + "epoch": 2.1588076168249084, + "grad_norm": 5.65521764755249, + "learning_rate": 1.4019873052918192e-05, + "loss": 0.4833, + "step": 244200 + }, + { + "epoch": 2.158896020085221, + "grad_norm": 3.765920639038086, + "learning_rate": 1.401839966524632e-05, + "loss": 0.4913, + "step": 244210 + }, + { + "epoch": 2.158984423345533, + "grad_norm": 4.740515232086182, + "learning_rate": 1.4016926277574453e-05, + "loss": 0.4676, + "step": 244220 + }, + { + "epoch": 2.159072826605845, + "grad_norm": 1.7302119731903076, + "learning_rate": 1.4015452889902581e-05, + "loss": 0.5435, + "step": 244230 + }, + { + "epoch": 2.1591612298661573, + "grad_norm": 1.803101897239685, + "learning_rate": 1.401397950223071e-05, + "loss": 0.5711, + "step": 244240 + }, + { + "epoch": 2.15924963312647, + "grad_norm": 2.203212022781372, + "learning_rate": 1.4012506114558838e-05, + "loss": 0.6152, + "step": 244250 + }, + { + "epoch": 2.159338036386782, + "grad_norm": 1.3150070905685425, + "learning_rate": 1.401103272688697e-05, + "loss": 0.4575, + "step": 244260 + }, + { + "epoch": 2.159426439647094, + "grad_norm": 2.411425828933716, + "learning_rate": 1.4009559339215098e-05, + "loss": 0.5459, + "step": 244270 + }, + { + "epoch": 2.1595148429074063, + "grad_norm": 1.25325345993042, + "learning_rate": 1.4008085951543226e-05, + "loss": 0.4423, + "step": 244280 + }, + { + "epoch": 2.159603246167719, + "grad_norm": 1.5778499841690063, + "learning_rate": 1.4006612563871358e-05, + "loss": 0.5208, + "step": 244290 + }, + { + "epoch": 2.159691649428031, + "grad_norm": 11.144630432128906, + "learning_rate": 1.4005139176199486e-05, + "loss": 0.4481, + "step": 244300 + }, + { + "epoch": 2.159780052688343, + "grad_norm": 4.063504219055176, + "learning_rate": 1.4003665788527615e-05, + "loss": 0.6327, + "step": 244310 + }, + { + "epoch": 2.159868455948655, + "grad_norm": 2.275707244873047, + "learning_rate": 1.4002192400855743e-05, + "loss": 0.4797, + "step": 244320 + }, + { + "epoch": 2.1599568592089677, + "grad_norm": 1.0021597146987915, + "learning_rate": 1.4000719013183875e-05, + "loss": 0.5657, + "step": 244330 + }, + { + "epoch": 2.16004526246928, + "grad_norm": 2.9870662689208984, + "learning_rate": 1.3999245625512003e-05, + "loss": 0.5142, + "step": 244340 + }, + { + "epoch": 2.160133665729592, + "grad_norm": 10.80369758605957, + "learning_rate": 1.3997772237840131e-05, + "loss": 0.5662, + "step": 244350 + }, + { + "epoch": 2.1602220689899045, + "grad_norm": 2.751176595687866, + "learning_rate": 1.3996298850168263e-05, + "loss": 0.5611, + "step": 244360 + }, + { + "epoch": 2.1603104722502167, + "grad_norm": 2.8473587036132812, + "learning_rate": 1.3994825462496391e-05, + "loss": 0.566, + "step": 244370 + }, + { + "epoch": 2.160398875510529, + "grad_norm": 4.801595211029053, + "learning_rate": 1.399335207482452e-05, + "loss": 0.536, + "step": 244380 + }, + { + "epoch": 2.160487278770841, + "grad_norm": 1.6389212608337402, + "learning_rate": 1.3991878687152648e-05, + "loss": 0.5479, + "step": 244390 + }, + { + "epoch": 2.1605756820311535, + "grad_norm": 8.041728019714355, + "learning_rate": 1.399040529948078e-05, + "loss": 0.5684, + "step": 244400 + }, + { + "epoch": 2.1606640852914656, + "grad_norm": 4.559194564819336, + "learning_rate": 1.3988931911808908e-05, + "loss": 0.6588, + "step": 244410 + }, + { + "epoch": 2.1607524885517777, + "grad_norm": 2.1719346046447754, + "learning_rate": 1.3987458524137037e-05, + "loss": 0.5658, + "step": 244420 + }, + { + "epoch": 2.16084089181209, + "grad_norm": 2.13505482673645, + "learning_rate": 1.3985985136465165e-05, + "loss": 0.5392, + "step": 244430 + }, + { + "epoch": 2.1609292950724024, + "grad_norm": 4.424104690551758, + "learning_rate": 1.3984511748793297e-05, + "loss": 0.6399, + "step": 244440 + }, + { + "epoch": 2.1610176983327145, + "grad_norm": 2.958709478378296, + "learning_rate": 1.3983038361121425e-05, + "loss": 0.5419, + "step": 244450 + }, + { + "epoch": 2.1611061015930266, + "grad_norm": 8.076685905456543, + "learning_rate": 1.3981564973449553e-05, + "loss": 0.5349, + "step": 244460 + }, + { + "epoch": 2.161194504853339, + "grad_norm": 3.134028434753418, + "learning_rate": 1.3980091585777685e-05, + "loss": 0.6062, + "step": 244470 + }, + { + "epoch": 2.1612829081136513, + "grad_norm": 2.1475369930267334, + "learning_rate": 1.3978618198105814e-05, + "loss": 0.4731, + "step": 244480 + }, + { + "epoch": 2.1613713113739634, + "grad_norm": 0.6279566884040833, + "learning_rate": 1.3977144810433942e-05, + "loss": 0.5298, + "step": 244490 + }, + { + "epoch": 2.1614597146342756, + "grad_norm": 3.532468795776367, + "learning_rate": 1.3975671422762072e-05, + "loss": 0.5596, + "step": 244500 + }, + { + "epoch": 2.161548117894588, + "grad_norm": 9.180599212646484, + "learning_rate": 1.3974198035090202e-05, + "loss": 0.6088, + "step": 244510 + }, + { + "epoch": 2.1616365211549002, + "grad_norm": 5.718137741088867, + "learning_rate": 1.397272464741833e-05, + "loss": 0.4327, + "step": 244520 + }, + { + "epoch": 2.1617249244152124, + "grad_norm": 2.650045871734619, + "learning_rate": 1.397125125974646e-05, + "loss": 0.5135, + "step": 244530 + }, + { + "epoch": 2.1618133276755245, + "grad_norm": 2.1892130374908447, + "learning_rate": 1.3969777872074589e-05, + "loss": 0.5408, + "step": 244540 + }, + { + "epoch": 2.161901730935837, + "grad_norm": 2.976047992706299, + "learning_rate": 1.3968304484402719e-05, + "loss": 0.6716, + "step": 244550 + }, + { + "epoch": 2.161990134196149, + "grad_norm": 6.948282241821289, + "learning_rate": 1.3966831096730849e-05, + "loss": 0.5853, + "step": 244560 + }, + { + "epoch": 2.1620785374564613, + "grad_norm": 5.530930995941162, + "learning_rate": 1.3965357709058977e-05, + "loss": 0.5061, + "step": 244570 + }, + { + "epoch": 2.162166940716774, + "grad_norm": 3.9617760181427, + "learning_rate": 1.3963884321387107e-05, + "loss": 0.5118, + "step": 244580 + }, + { + "epoch": 2.162255343977086, + "grad_norm": 1.3141119480133057, + "learning_rate": 1.3962410933715237e-05, + "loss": 0.5132, + "step": 244590 + }, + { + "epoch": 2.162343747237398, + "grad_norm": 4.2620439529418945, + "learning_rate": 1.3960937546043366e-05, + "loss": 0.6221, + "step": 244600 + }, + { + "epoch": 2.16243215049771, + "grad_norm": 19.701045989990234, + "learning_rate": 1.3959464158371494e-05, + "loss": 0.6749, + "step": 244610 + }, + { + "epoch": 2.1625205537580228, + "grad_norm": 1.7902919054031372, + "learning_rate": 1.3957990770699626e-05, + "loss": 0.6546, + "step": 244620 + }, + { + "epoch": 2.162608957018335, + "grad_norm": 3.999694347381592, + "learning_rate": 1.3956517383027754e-05, + "loss": 0.5574, + "step": 244630 + }, + { + "epoch": 2.162697360278647, + "grad_norm": 4.620487213134766, + "learning_rate": 1.3955043995355882e-05, + "loss": 0.5332, + "step": 244640 + }, + { + "epoch": 2.162785763538959, + "grad_norm": 1.3593881130218506, + "learning_rate": 1.3953570607684014e-05, + "loss": 0.5028, + "step": 244650 + }, + { + "epoch": 2.1628741667992717, + "grad_norm": 3.8344950675964355, + "learning_rate": 1.3952097220012143e-05, + "loss": 0.5862, + "step": 244660 + }, + { + "epoch": 2.162962570059584, + "grad_norm": 5.104284763336182, + "learning_rate": 1.3950623832340271e-05, + "loss": 0.3894, + "step": 244670 + }, + { + "epoch": 2.163050973319896, + "grad_norm": 2.437725782394409, + "learning_rate": 1.39491504446684e-05, + "loss": 0.5869, + "step": 244680 + }, + { + "epoch": 2.163139376580208, + "grad_norm": 2.034870147705078, + "learning_rate": 1.3947677056996531e-05, + "loss": 0.5663, + "step": 244690 + }, + { + "epoch": 2.1632277798405206, + "grad_norm": 4.022502422332764, + "learning_rate": 1.394620366932466e-05, + "loss": 0.5079, + "step": 244700 + }, + { + "epoch": 2.1633161831008327, + "grad_norm": 5.6952362060546875, + "learning_rate": 1.3944730281652788e-05, + "loss": 0.633, + "step": 244710 + }, + { + "epoch": 2.163404586361145, + "grad_norm": 10.168376922607422, + "learning_rate": 1.3943256893980916e-05, + "loss": 0.4814, + "step": 244720 + }, + { + "epoch": 2.1634929896214574, + "grad_norm": 4.023350238800049, + "learning_rate": 1.3941783506309048e-05, + "loss": 0.3932, + "step": 244730 + }, + { + "epoch": 2.1635813928817695, + "grad_norm": 5.350805759429932, + "learning_rate": 1.3940310118637176e-05, + "loss": 0.4575, + "step": 244740 + }, + { + "epoch": 2.1636697961420817, + "grad_norm": 4.876666069030762, + "learning_rate": 1.3938836730965304e-05, + "loss": 0.6068, + "step": 244750 + }, + { + "epoch": 2.163758199402394, + "grad_norm": 6.854573726654053, + "learning_rate": 1.3937363343293436e-05, + "loss": 0.5514, + "step": 244760 + }, + { + "epoch": 2.1638466026627063, + "grad_norm": 3.881807565689087, + "learning_rate": 1.3935889955621565e-05, + "loss": 0.5478, + "step": 244770 + }, + { + "epoch": 2.1639350059230185, + "grad_norm": 1.4720745086669922, + "learning_rate": 1.3934416567949693e-05, + "loss": 0.4722, + "step": 244780 + }, + { + "epoch": 2.1640234091833306, + "grad_norm": 2.5812907218933105, + "learning_rate": 1.3932943180277821e-05, + "loss": 0.5301, + "step": 244790 + }, + { + "epoch": 2.164111812443643, + "grad_norm": 5.832734107971191, + "learning_rate": 1.3931469792605953e-05, + "loss": 0.5796, + "step": 244800 + }, + { + "epoch": 2.1642002157039553, + "grad_norm": 9.020278930664062, + "learning_rate": 1.3929996404934081e-05, + "loss": 0.6099, + "step": 244810 + }, + { + "epoch": 2.1642886189642674, + "grad_norm": 1.697989583015442, + "learning_rate": 1.392852301726221e-05, + "loss": 0.4551, + "step": 244820 + }, + { + "epoch": 2.1643770222245795, + "grad_norm": 12.095008850097656, + "learning_rate": 1.3927049629590341e-05, + "loss": 0.6112, + "step": 244830 + }, + { + "epoch": 2.164465425484892, + "grad_norm": 5.4530029296875, + "learning_rate": 1.392557624191847e-05, + "loss": 0.5937, + "step": 244840 + }, + { + "epoch": 2.164553828745204, + "grad_norm": 3.7663493156433105, + "learning_rate": 1.3924102854246598e-05, + "loss": 0.5613, + "step": 244850 + }, + { + "epoch": 2.1646422320055163, + "grad_norm": 1.8486216068267822, + "learning_rate": 1.3922629466574727e-05, + "loss": 0.5238, + "step": 244860 + }, + { + "epoch": 2.1647306352658284, + "grad_norm": 1.256035566329956, + "learning_rate": 1.3921156078902858e-05, + "loss": 0.5199, + "step": 244870 + }, + { + "epoch": 2.164819038526141, + "grad_norm": 3.0461976528167725, + "learning_rate": 1.3919682691230987e-05, + "loss": 0.4549, + "step": 244880 + }, + { + "epoch": 2.164907441786453, + "grad_norm": 1.387467384338379, + "learning_rate": 1.3918209303559115e-05, + "loss": 0.6556, + "step": 244890 + }, + { + "epoch": 2.1649958450467652, + "grad_norm": 6.598106384277344, + "learning_rate": 1.3916735915887243e-05, + "loss": 0.5243, + "step": 244900 + }, + { + "epoch": 2.1650842483070774, + "grad_norm": 2.5628185272216797, + "learning_rate": 1.3915262528215375e-05, + "loss": 0.4777, + "step": 244910 + }, + { + "epoch": 2.16517265156739, + "grad_norm": 1.0419151782989502, + "learning_rate": 1.3913789140543503e-05, + "loss": 0.5036, + "step": 244920 + }, + { + "epoch": 2.165261054827702, + "grad_norm": 3.077003240585327, + "learning_rate": 1.3912315752871632e-05, + "loss": 0.5703, + "step": 244930 + }, + { + "epoch": 2.165349458088014, + "grad_norm": 6.428390026092529, + "learning_rate": 1.3910842365199764e-05, + "loss": 0.5011, + "step": 244940 + }, + { + "epoch": 2.1654378613483267, + "grad_norm": 9.683162689208984, + "learning_rate": 1.3909368977527892e-05, + "loss": 0.6318, + "step": 244950 + }, + { + "epoch": 2.165526264608639, + "grad_norm": 3.1095309257507324, + "learning_rate": 1.390789558985602e-05, + "loss": 0.5257, + "step": 244960 + }, + { + "epoch": 2.165614667868951, + "grad_norm": 2.2861905097961426, + "learning_rate": 1.390642220218415e-05, + "loss": 0.4275, + "step": 244970 + }, + { + "epoch": 2.165703071129263, + "grad_norm": 0.7386719584465027, + "learning_rate": 1.390494881451228e-05, + "loss": 0.4954, + "step": 244980 + }, + { + "epoch": 2.1657914743895756, + "grad_norm": 2.898801326751709, + "learning_rate": 1.3903475426840409e-05, + "loss": 0.4591, + "step": 244990 + }, + { + "epoch": 2.1658798776498878, + "grad_norm": 1.53287672996521, + "learning_rate": 1.3902002039168539e-05, + "loss": 0.543, + "step": 245000 + }, + { + "epoch": 2.1659682809102, + "grad_norm": 20.700708389282227, + "learning_rate": 1.3900528651496667e-05, + "loss": 0.5818, + "step": 245010 + }, + { + "epoch": 2.166056684170512, + "grad_norm": 2.0985934734344482, + "learning_rate": 1.3899055263824797e-05, + "loss": 0.6065, + "step": 245020 + }, + { + "epoch": 2.1661450874308246, + "grad_norm": 9.296022415161133, + "learning_rate": 1.3897581876152927e-05, + "loss": 0.4693, + "step": 245030 + }, + { + "epoch": 2.1662334906911367, + "grad_norm": 2.9874932765960693, + "learning_rate": 1.3896108488481056e-05, + "loss": 0.5708, + "step": 245040 + }, + { + "epoch": 2.166321893951449, + "grad_norm": 0.6178944110870361, + "learning_rate": 1.3894635100809186e-05, + "loss": 0.4885, + "step": 245050 + }, + { + "epoch": 2.1664102972117614, + "grad_norm": 2.0273022651672363, + "learning_rate": 1.3893161713137316e-05, + "loss": 0.6483, + "step": 245060 + }, + { + "epoch": 2.1664987004720735, + "grad_norm": 1.7737313508987427, + "learning_rate": 1.3891688325465444e-05, + "loss": 0.5575, + "step": 245070 + }, + { + "epoch": 2.1665871037323856, + "grad_norm": 4.955204486846924, + "learning_rate": 1.3890214937793572e-05, + "loss": 0.564, + "step": 245080 + }, + { + "epoch": 2.1666755069926977, + "grad_norm": 2.2948811054229736, + "learning_rate": 1.3888741550121704e-05, + "loss": 0.5669, + "step": 245090 + }, + { + "epoch": 2.1667639102530103, + "grad_norm": 1.6553735733032227, + "learning_rate": 1.3887268162449832e-05, + "loss": 0.5176, + "step": 245100 + }, + { + "epoch": 2.1668523135133224, + "grad_norm": 2.119971513748169, + "learning_rate": 1.388579477477796e-05, + "loss": 0.4297, + "step": 245110 + }, + { + "epoch": 2.1669407167736345, + "grad_norm": 2.4013822078704834, + "learning_rate": 1.3884321387106093e-05, + "loss": 0.6021, + "step": 245120 + }, + { + "epoch": 2.1670291200339467, + "grad_norm": 2.5578408241271973, + "learning_rate": 1.3882847999434221e-05, + "loss": 0.5492, + "step": 245130 + }, + { + "epoch": 2.167117523294259, + "grad_norm": 2.7995684146881104, + "learning_rate": 1.388137461176235e-05, + "loss": 0.4916, + "step": 245140 + }, + { + "epoch": 2.1672059265545713, + "grad_norm": 0.8490287661552429, + "learning_rate": 1.3879901224090478e-05, + "loss": 0.5844, + "step": 245150 + }, + { + "epoch": 2.1672943298148835, + "grad_norm": 17.122573852539062, + "learning_rate": 1.387842783641861e-05, + "loss": 0.5429, + "step": 245160 + }, + { + "epoch": 2.167382733075196, + "grad_norm": 3.265955924987793, + "learning_rate": 1.3876954448746738e-05, + "loss": 0.5262, + "step": 245170 + }, + { + "epoch": 2.167471136335508, + "grad_norm": 7.569705486297607, + "learning_rate": 1.3875481061074866e-05, + "loss": 0.5062, + "step": 245180 + }, + { + "epoch": 2.1675595395958203, + "grad_norm": 3.7553534507751465, + "learning_rate": 1.3874007673402994e-05, + "loss": 0.4766, + "step": 245190 + }, + { + "epoch": 2.1676479428561324, + "grad_norm": 14.884906768798828, + "learning_rate": 1.3872534285731126e-05, + "loss": 0.534, + "step": 245200 + }, + { + "epoch": 2.167736346116445, + "grad_norm": 1.4473294019699097, + "learning_rate": 1.3871060898059254e-05, + "loss": 0.5547, + "step": 245210 + }, + { + "epoch": 2.167824749376757, + "grad_norm": 0.48898178339004517, + "learning_rate": 1.3869587510387383e-05, + "loss": 0.4961, + "step": 245220 + }, + { + "epoch": 2.167913152637069, + "grad_norm": 2.8773393630981445, + "learning_rate": 1.3868114122715515e-05, + "loss": 0.5075, + "step": 245230 + }, + { + "epoch": 2.1680015558973813, + "grad_norm": 8.390589714050293, + "learning_rate": 1.3866640735043643e-05, + "loss": 0.5704, + "step": 245240 + }, + { + "epoch": 2.168089959157694, + "grad_norm": 5.169947624206543, + "learning_rate": 1.3865167347371771e-05, + "loss": 0.4539, + "step": 245250 + }, + { + "epoch": 2.168178362418006, + "grad_norm": 3.6591403484344482, + "learning_rate": 1.38636939596999e-05, + "loss": 0.4864, + "step": 245260 + }, + { + "epoch": 2.168266765678318, + "grad_norm": 7.300472736358643, + "learning_rate": 1.3862220572028031e-05, + "loss": 0.59, + "step": 245270 + }, + { + "epoch": 2.1683551689386302, + "grad_norm": 0.6648169159889221, + "learning_rate": 1.386074718435616e-05, + "loss": 0.5483, + "step": 245280 + }, + { + "epoch": 2.168443572198943, + "grad_norm": 2.337102174758911, + "learning_rate": 1.3859273796684288e-05, + "loss": 0.4632, + "step": 245290 + }, + { + "epoch": 2.168531975459255, + "grad_norm": 3.4456276893615723, + "learning_rate": 1.385780040901242e-05, + "loss": 0.545, + "step": 245300 + }, + { + "epoch": 2.168620378719567, + "grad_norm": 5.791783809661865, + "learning_rate": 1.3856327021340548e-05, + "loss": 0.6413, + "step": 245310 + }, + { + "epoch": 2.1687087819798796, + "grad_norm": 4.634941577911377, + "learning_rate": 1.3854853633668677e-05, + "loss": 0.6397, + "step": 245320 + }, + { + "epoch": 2.1687971852401917, + "grad_norm": 2.8367810249328613, + "learning_rate": 1.3853380245996805e-05, + "loss": 0.6296, + "step": 245330 + }, + { + "epoch": 2.168885588500504, + "grad_norm": 3.423731565475464, + "learning_rate": 1.3851906858324937e-05, + "loss": 0.5284, + "step": 245340 + }, + { + "epoch": 2.168973991760816, + "grad_norm": 0.7320992350578308, + "learning_rate": 1.3850433470653065e-05, + "loss": 0.4735, + "step": 245350 + }, + { + "epoch": 2.1690623950211285, + "grad_norm": 5.876013278961182, + "learning_rate": 1.3848960082981193e-05, + "loss": 0.5311, + "step": 245360 + }, + { + "epoch": 2.1691507982814406, + "grad_norm": 3.7625656127929688, + "learning_rate": 1.3847486695309322e-05, + "loss": 0.4789, + "step": 245370 + }, + { + "epoch": 2.1692392015417528, + "grad_norm": 4.662148952484131, + "learning_rate": 1.3846013307637453e-05, + "loss": 0.5266, + "step": 245380 + }, + { + "epoch": 2.1693276048020653, + "grad_norm": 2.7653744220733643, + "learning_rate": 1.3844539919965582e-05, + "loss": 0.6138, + "step": 245390 + }, + { + "epoch": 2.1694160080623774, + "grad_norm": 1.9007302522659302, + "learning_rate": 1.384306653229371e-05, + "loss": 0.556, + "step": 245400 + }, + { + "epoch": 2.1695044113226896, + "grad_norm": 1.6720101833343506, + "learning_rate": 1.3841593144621842e-05, + "loss": 0.5818, + "step": 245410 + }, + { + "epoch": 2.1695928145830017, + "grad_norm": 4.869603633880615, + "learning_rate": 1.384011975694997e-05, + "loss": 0.5234, + "step": 245420 + }, + { + "epoch": 2.1696812178433142, + "grad_norm": 3.6686697006225586, + "learning_rate": 1.3838646369278099e-05, + "loss": 0.4062, + "step": 245430 + }, + { + "epoch": 2.1697696211036264, + "grad_norm": 4.259734153747559, + "learning_rate": 1.3837172981606229e-05, + "loss": 0.6217, + "step": 245440 + }, + { + "epoch": 2.1698580243639385, + "grad_norm": 4.924173831939697, + "learning_rate": 1.3835699593934359e-05, + "loss": 0.5244, + "step": 245450 + }, + { + "epoch": 2.1699464276242506, + "grad_norm": 5.799219608306885, + "learning_rate": 1.3834226206262487e-05, + "loss": 0.5239, + "step": 245460 + }, + { + "epoch": 2.170034830884563, + "grad_norm": 13.323660850524902, + "learning_rate": 1.3832752818590617e-05, + "loss": 0.565, + "step": 245470 + }, + { + "epoch": 2.1701232341448753, + "grad_norm": 1.5805681943893433, + "learning_rate": 1.3831279430918745e-05, + "loss": 0.52, + "step": 245480 + }, + { + "epoch": 2.1702116374051874, + "grad_norm": 2.4813132286071777, + "learning_rate": 1.3829806043246876e-05, + "loss": 0.5119, + "step": 245490 + }, + { + "epoch": 2.1703000406654995, + "grad_norm": 4.1714348793029785, + "learning_rate": 1.3828332655575006e-05, + "loss": 0.5406, + "step": 245500 + }, + { + "epoch": 2.170388443925812, + "grad_norm": 3.914215326309204, + "learning_rate": 1.3826859267903134e-05, + "loss": 0.4963, + "step": 245510 + }, + { + "epoch": 2.170476847186124, + "grad_norm": 2.773785352706909, + "learning_rate": 1.3825385880231264e-05, + "loss": 0.544, + "step": 245520 + }, + { + "epoch": 2.1705652504464363, + "grad_norm": 3.136577606201172, + "learning_rate": 1.3823912492559394e-05, + "loss": 0.6345, + "step": 245530 + }, + { + "epoch": 2.170653653706749, + "grad_norm": 1.8958722352981567, + "learning_rate": 1.3822439104887522e-05, + "loss": 0.6177, + "step": 245540 + }, + { + "epoch": 2.170742056967061, + "grad_norm": 3.0869932174682617, + "learning_rate": 1.382096571721565e-05, + "loss": 0.3911, + "step": 245550 + }, + { + "epoch": 2.170830460227373, + "grad_norm": 1.8408747911453247, + "learning_rate": 1.3819492329543782e-05, + "loss": 0.5038, + "step": 245560 + }, + { + "epoch": 2.1709188634876853, + "grad_norm": 5.619261264801025, + "learning_rate": 1.381801894187191e-05, + "loss": 0.4941, + "step": 245570 + }, + { + "epoch": 2.171007266747998, + "grad_norm": 1.8387924432754517, + "learning_rate": 1.381654555420004e-05, + "loss": 0.4695, + "step": 245580 + }, + { + "epoch": 2.17109567000831, + "grad_norm": 1.2662813663482666, + "learning_rate": 1.3815072166528171e-05, + "loss": 0.5093, + "step": 245590 + }, + { + "epoch": 2.171184073268622, + "grad_norm": 37.69186019897461, + "learning_rate": 1.38135987788563e-05, + "loss": 0.4581, + "step": 245600 + }, + { + "epoch": 2.171272476528934, + "grad_norm": 6.407205104827881, + "learning_rate": 1.3812125391184428e-05, + "loss": 0.4725, + "step": 245610 + }, + { + "epoch": 2.1713608797892467, + "grad_norm": 2.777477979660034, + "learning_rate": 1.3810652003512556e-05, + "loss": 0.5003, + "step": 245620 + }, + { + "epoch": 2.171449283049559, + "grad_norm": 1.3655823469161987, + "learning_rate": 1.3809178615840688e-05, + "loss": 0.552, + "step": 245630 + }, + { + "epoch": 2.171537686309871, + "grad_norm": 3.8200771808624268, + "learning_rate": 1.3807705228168816e-05, + "loss": 0.5033, + "step": 245640 + }, + { + "epoch": 2.1716260895701835, + "grad_norm": 1.716650366783142, + "learning_rate": 1.3806231840496944e-05, + "loss": 0.4564, + "step": 245650 + }, + { + "epoch": 2.1717144928304957, + "grad_norm": 3.9431941509246826, + "learning_rate": 1.3804758452825073e-05, + "loss": 0.5929, + "step": 245660 + }, + { + "epoch": 2.171802896090808, + "grad_norm": 4.676994800567627, + "learning_rate": 1.3803285065153205e-05, + "loss": 0.6618, + "step": 245670 + }, + { + "epoch": 2.17189129935112, + "grad_norm": 2.5879907608032227, + "learning_rate": 1.3801811677481333e-05, + "loss": 0.4448, + "step": 245680 + }, + { + "epoch": 2.1719797026114325, + "grad_norm": 8.072145462036133, + "learning_rate": 1.3800338289809461e-05, + "loss": 0.6706, + "step": 245690 + }, + { + "epoch": 2.1720681058717446, + "grad_norm": 3.1810572147369385, + "learning_rate": 1.3798864902137593e-05, + "loss": 0.4324, + "step": 245700 + }, + { + "epoch": 2.1721565091320567, + "grad_norm": 1.5665669441223145, + "learning_rate": 1.3797391514465721e-05, + "loss": 0.4903, + "step": 245710 + }, + { + "epoch": 2.172244912392369, + "grad_norm": 7.954720497131348, + "learning_rate": 1.379591812679385e-05, + "loss": 0.6431, + "step": 245720 + }, + { + "epoch": 2.1723333156526814, + "grad_norm": 11.870237350463867, + "learning_rate": 1.3794444739121978e-05, + "loss": 0.5042, + "step": 245730 + }, + { + "epoch": 2.1724217189129935, + "grad_norm": 13.580682754516602, + "learning_rate": 1.379297135145011e-05, + "loss": 0.5661, + "step": 245740 + }, + { + "epoch": 2.1725101221733056, + "grad_norm": 1.8155741691589355, + "learning_rate": 1.3791497963778238e-05, + "loss": 0.433, + "step": 245750 + }, + { + "epoch": 2.172598525433618, + "grad_norm": 1.4799396991729736, + "learning_rate": 1.3790024576106366e-05, + "loss": 0.4621, + "step": 245760 + }, + { + "epoch": 2.1726869286939303, + "grad_norm": 2.6888537406921387, + "learning_rate": 1.3788551188434498e-05, + "loss": 0.4635, + "step": 245770 + }, + { + "epoch": 2.1727753319542424, + "grad_norm": 1.4573020935058594, + "learning_rate": 1.3787077800762627e-05, + "loss": 0.5438, + "step": 245780 + }, + { + "epoch": 2.1728637352145546, + "grad_norm": 3.1951091289520264, + "learning_rate": 1.3785604413090755e-05, + "loss": 0.4487, + "step": 245790 + }, + { + "epoch": 2.172952138474867, + "grad_norm": 5.12869930267334, + "learning_rate": 1.3784131025418883e-05, + "loss": 0.5098, + "step": 245800 + }, + { + "epoch": 2.1730405417351792, + "grad_norm": 3.0785319805145264, + "learning_rate": 1.3782657637747015e-05, + "loss": 0.4477, + "step": 245810 + }, + { + "epoch": 2.1731289449954914, + "grad_norm": 4.8019280433654785, + "learning_rate": 1.3781184250075143e-05, + "loss": 0.5572, + "step": 245820 + }, + { + "epoch": 2.1732173482558035, + "grad_norm": 2.286085367202759, + "learning_rate": 1.3779710862403272e-05, + "loss": 0.5842, + "step": 245830 + }, + { + "epoch": 2.173305751516116, + "grad_norm": 1.1346417665481567, + "learning_rate": 1.37782374747314e-05, + "loss": 0.3996, + "step": 245840 + }, + { + "epoch": 2.173394154776428, + "grad_norm": 3.9003689289093018, + "learning_rate": 1.3776764087059532e-05, + "loss": 0.567, + "step": 245850 + }, + { + "epoch": 2.1734825580367403, + "grad_norm": 3.531090259552002, + "learning_rate": 1.377529069938766e-05, + "loss": 0.5206, + "step": 245860 + }, + { + "epoch": 2.1735709612970524, + "grad_norm": 2.1370325088500977, + "learning_rate": 1.3773817311715789e-05, + "loss": 0.5331, + "step": 245870 + }, + { + "epoch": 2.173659364557365, + "grad_norm": 1.0407413244247437, + "learning_rate": 1.377234392404392e-05, + "loss": 0.4386, + "step": 245880 + }, + { + "epoch": 2.173747767817677, + "grad_norm": 5.557190895080566, + "learning_rate": 1.3770870536372049e-05, + "loss": 0.5894, + "step": 245890 + }, + { + "epoch": 2.173836171077989, + "grad_norm": 2.4773123264312744, + "learning_rate": 1.3769397148700177e-05, + "loss": 0.6203, + "step": 245900 + }, + { + "epoch": 2.1739245743383018, + "grad_norm": 5.096643447875977, + "learning_rate": 1.3767923761028307e-05, + "loss": 0.4908, + "step": 245910 + }, + { + "epoch": 2.174012977598614, + "grad_norm": 9.131943702697754, + "learning_rate": 1.3766450373356437e-05, + "loss": 0.6065, + "step": 245920 + }, + { + "epoch": 2.174101380858926, + "grad_norm": 11.351020812988281, + "learning_rate": 1.3764976985684565e-05, + "loss": 0.4938, + "step": 245930 + }, + { + "epoch": 2.174189784119238, + "grad_norm": 2.4357900619506836, + "learning_rate": 1.3763503598012695e-05, + "loss": 0.4997, + "step": 245940 + }, + { + "epoch": 2.1742781873795507, + "grad_norm": 7.056451797485352, + "learning_rate": 1.3762030210340826e-05, + "loss": 0.647, + "step": 245950 + }, + { + "epoch": 2.174366590639863, + "grad_norm": 10.277010917663574, + "learning_rate": 1.3760556822668954e-05, + "loss": 0.5614, + "step": 245960 + }, + { + "epoch": 2.174454993900175, + "grad_norm": 3.8671648502349854, + "learning_rate": 1.3759083434997084e-05, + "loss": 0.5069, + "step": 245970 + }, + { + "epoch": 2.1745433971604875, + "grad_norm": 3.8851795196533203, + "learning_rate": 1.3757610047325212e-05, + "loss": 0.5353, + "step": 245980 + }, + { + "epoch": 2.1746318004207996, + "grad_norm": 1.939041256904602, + "learning_rate": 1.3756136659653342e-05, + "loss": 0.5456, + "step": 245990 + }, + { + "epoch": 2.1747202036811117, + "grad_norm": 2.4722182750701904, + "learning_rate": 1.3754663271981472e-05, + "loss": 0.5782, + "step": 246000 + }, + { + "epoch": 2.174808606941424, + "grad_norm": 3.121884346008301, + "learning_rate": 1.37531898843096e-05, + "loss": 0.4163, + "step": 246010 + }, + { + "epoch": 2.1748970102017364, + "grad_norm": 3.3458499908447266, + "learning_rate": 1.3751716496637729e-05, + "loss": 0.479, + "step": 246020 + }, + { + "epoch": 2.1749854134620485, + "grad_norm": 1.8416224718093872, + "learning_rate": 1.375024310896586e-05, + "loss": 0.3875, + "step": 246030 + }, + { + "epoch": 2.1750738167223607, + "grad_norm": 2.9351820945739746, + "learning_rate": 1.374876972129399e-05, + "loss": 0.5163, + "step": 246040 + }, + { + "epoch": 2.175162219982673, + "grad_norm": 3.1195194721221924, + "learning_rate": 1.3747296333622118e-05, + "loss": 0.4308, + "step": 246050 + }, + { + "epoch": 2.1752506232429853, + "grad_norm": 30.92794418334961, + "learning_rate": 1.374582294595025e-05, + "loss": 0.495, + "step": 246060 + }, + { + "epoch": 2.1753390265032975, + "grad_norm": 1.9470129013061523, + "learning_rate": 1.3744349558278378e-05, + "loss": 0.5012, + "step": 246070 + }, + { + "epoch": 2.1754274297636096, + "grad_norm": 2.940351724624634, + "learning_rate": 1.3742876170606506e-05, + "loss": 0.5844, + "step": 246080 + }, + { + "epoch": 2.1755158330239217, + "grad_norm": 2.4406871795654297, + "learning_rate": 1.3741402782934634e-05, + "loss": 0.6179, + "step": 246090 + }, + { + "epoch": 2.1756042362842343, + "grad_norm": 2.3817877769470215, + "learning_rate": 1.3739929395262766e-05, + "loss": 0.6074, + "step": 246100 + }, + { + "epoch": 2.1756926395445464, + "grad_norm": 5.012096405029297, + "learning_rate": 1.3738456007590894e-05, + "loss": 0.4643, + "step": 246110 + }, + { + "epoch": 2.1757810428048585, + "grad_norm": 10.282967567443848, + "learning_rate": 1.3736982619919023e-05, + "loss": 0.6129, + "step": 246120 + }, + { + "epoch": 2.175869446065171, + "grad_norm": 5.316123008728027, + "learning_rate": 1.3735509232247151e-05, + "loss": 0.3888, + "step": 246130 + }, + { + "epoch": 2.175957849325483, + "grad_norm": 1.7234137058258057, + "learning_rate": 1.3734035844575283e-05, + "loss": 0.5174, + "step": 246140 + }, + { + "epoch": 2.1760462525857953, + "grad_norm": 8.489676475524902, + "learning_rate": 1.3732562456903411e-05, + "loss": 0.4989, + "step": 246150 + }, + { + "epoch": 2.1761346558461074, + "grad_norm": 2.837161064147949, + "learning_rate": 1.373108906923154e-05, + "loss": 0.4619, + "step": 246160 + }, + { + "epoch": 2.17622305910642, + "grad_norm": 3.0441277027130127, + "learning_rate": 1.3729615681559671e-05, + "loss": 0.6477, + "step": 246170 + }, + { + "epoch": 2.176311462366732, + "grad_norm": 21.759906768798828, + "learning_rate": 1.37281422938878e-05, + "loss": 0.5273, + "step": 246180 + }, + { + "epoch": 2.1763998656270442, + "grad_norm": 4.97874641418457, + "learning_rate": 1.3726668906215928e-05, + "loss": 0.5874, + "step": 246190 + }, + { + "epoch": 2.1764882688873564, + "grad_norm": 2.662759304046631, + "learning_rate": 1.3725195518544056e-05, + "loss": 0.5468, + "step": 246200 + }, + { + "epoch": 2.176576672147669, + "grad_norm": 5.262113571166992, + "learning_rate": 1.3723722130872188e-05, + "loss": 0.4108, + "step": 246210 + }, + { + "epoch": 2.176665075407981, + "grad_norm": 1.4069370031356812, + "learning_rate": 1.3722248743200317e-05, + "loss": 0.5497, + "step": 246220 + }, + { + "epoch": 2.176753478668293, + "grad_norm": 14.456093788146973, + "learning_rate": 1.3720775355528445e-05, + "loss": 0.5925, + "step": 246230 + }, + { + "epoch": 2.1768418819286057, + "grad_norm": 2.736205816268921, + "learning_rate": 1.3719301967856577e-05, + "loss": 0.6606, + "step": 246240 + }, + { + "epoch": 2.176930285188918, + "grad_norm": 15.487725257873535, + "learning_rate": 1.3717828580184705e-05, + "loss": 0.6164, + "step": 246250 + }, + { + "epoch": 2.17701868844923, + "grad_norm": 2.6263227462768555, + "learning_rate": 1.3716355192512833e-05, + "loss": 0.5671, + "step": 246260 + }, + { + "epoch": 2.177107091709542, + "grad_norm": 1.2716829776763916, + "learning_rate": 1.3714881804840962e-05, + "loss": 0.5125, + "step": 246270 + }, + { + "epoch": 2.1771954949698546, + "grad_norm": 4.498462200164795, + "learning_rate": 1.3713408417169093e-05, + "loss": 0.6794, + "step": 246280 + }, + { + "epoch": 2.1772838982301668, + "grad_norm": 3.1482064723968506, + "learning_rate": 1.3711935029497222e-05, + "loss": 0.5315, + "step": 246290 + }, + { + "epoch": 2.177372301490479, + "grad_norm": 2.5860178470611572, + "learning_rate": 1.371046164182535e-05, + "loss": 0.6001, + "step": 246300 + }, + { + "epoch": 2.177460704750791, + "grad_norm": 13.322017669677734, + "learning_rate": 1.3708988254153478e-05, + "loss": 0.4918, + "step": 246310 + }, + { + "epoch": 2.1775491080111036, + "grad_norm": 4.020044326782227, + "learning_rate": 1.370751486648161e-05, + "loss": 0.5262, + "step": 246320 + }, + { + "epoch": 2.1776375112714157, + "grad_norm": 2.113619327545166, + "learning_rate": 1.3706041478809739e-05, + "loss": 0.6233, + "step": 246330 + }, + { + "epoch": 2.177725914531728, + "grad_norm": 6.302050590515137, + "learning_rate": 1.3704568091137867e-05, + "loss": 0.556, + "step": 246340 + }, + { + "epoch": 2.1778143177920404, + "grad_norm": 1.2457382678985596, + "learning_rate": 1.3703094703465999e-05, + "loss": 0.5308, + "step": 246350 + }, + { + "epoch": 2.1779027210523525, + "grad_norm": 6.223342418670654, + "learning_rate": 1.3701621315794127e-05, + "loss": 0.5265, + "step": 246360 + }, + { + "epoch": 2.1779911243126646, + "grad_norm": 0.8979325294494629, + "learning_rate": 1.3700147928122255e-05, + "loss": 0.5403, + "step": 246370 + }, + { + "epoch": 2.1780795275729767, + "grad_norm": 2.1822152137756348, + "learning_rate": 1.3698674540450385e-05, + "loss": 0.5595, + "step": 246380 + }, + { + "epoch": 2.1781679308332893, + "grad_norm": 2.5853798389434814, + "learning_rate": 1.3697201152778515e-05, + "loss": 0.5542, + "step": 246390 + }, + { + "epoch": 2.1782563340936014, + "grad_norm": 2.2700555324554443, + "learning_rate": 1.3695727765106644e-05, + "loss": 0.5328, + "step": 246400 + }, + { + "epoch": 2.1783447373539135, + "grad_norm": 4.468782424926758, + "learning_rate": 1.3694254377434774e-05, + "loss": 0.5143, + "step": 246410 + }, + { + "epoch": 2.1784331406142257, + "grad_norm": 23.558744430541992, + "learning_rate": 1.3692780989762904e-05, + "loss": 0.4305, + "step": 246420 + }, + { + "epoch": 2.1785215438745382, + "grad_norm": 1.8454546928405762, + "learning_rate": 1.3691307602091032e-05, + "loss": 0.5466, + "step": 246430 + }, + { + "epoch": 2.1786099471348503, + "grad_norm": 1.1160887479782104, + "learning_rate": 1.3689834214419162e-05, + "loss": 0.4399, + "step": 246440 + }, + { + "epoch": 2.1786983503951625, + "grad_norm": 2.1913986206054688, + "learning_rate": 1.368836082674729e-05, + "loss": 0.586, + "step": 246450 + }, + { + "epoch": 2.1787867536554746, + "grad_norm": 1.1190730333328247, + "learning_rate": 1.368688743907542e-05, + "loss": 0.4676, + "step": 246460 + }, + { + "epoch": 2.178875156915787, + "grad_norm": 4.154147624969482, + "learning_rate": 1.368541405140355e-05, + "loss": 0.6278, + "step": 246470 + }, + { + "epoch": 2.1789635601760993, + "grad_norm": 1.7494193315505981, + "learning_rate": 1.3683940663731679e-05, + "loss": 0.5704, + "step": 246480 + }, + { + "epoch": 2.1790519634364114, + "grad_norm": 2.4169821739196777, + "learning_rate": 1.3682467276059807e-05, + "loss": 0.4803, + "step": 246490 + }, + { + "epoch": 2.179140366696724, + "grad_norm": 10.102002143859863, + "learning_rate": 1.368099388838794e-05, + "loss": 0.5329, + "step": 246500 + }, + { + "epoch": 2.179228769957036, + "grad_norm": 18.877544403076172, + "learning_rate": 1.3679520500716068e-05, + "loss": 0.6607, + "step": 246510 + }, + { + "epoch": 2.179317173217348, + "grad_norm": 8.207087516784668, + "learning_rate": 1.3678047113044196e-05, + "loss": 0.4887, + "step": 246520 + }, + { + "epoch": 2.1794055764776603, + "grad_norm": 8.116870880126953, + "learning_rate": 1.3676573725372328e-05, + "loss": 0.5716, + "step": 246530 + }, + { + "epoch": 2.179493979737973, + "grad_norm": 1.2831543684005737, + "learning_rate": 1.3675100337700456e-05, + "loss": 0.3899, + "step": 246540 + }, + { + "epoch": 2.179582382998285, + "grad_norm": 3.179790735244751, + "learning_rate": 1.3673626950028584e-05, + "loss": 0.4758, + "step": 246550 + }, + { + "epoch": 2.179670786258597, + "grad_norm": 6.929795265197754, + "learning_rate": 1.3672153562356713e-05, + "loss": 0.5783, + "step": 246560 + }, + { + "epoch": 2.1797591895189097, + "grad_norm": 4.522343158721924, + "learning_rate": 1.3670680174684844e-05, + "loss": 0.5082, + "step": 246570 + }, + { + "epoch": 2.179847592779222, + "grad_norm": 3.1433193683624268, + "learning_rate": 1.3669206787012973e-05, + "loss": 0.5057, + "step": 246580 + }, + { + "epoch": 2.179935996039534, + "grad_norm": 4.062282085418701, + "learning_rate": 1.3667733399341101e-05, + "loss": 0.6302, + "step": 246590 + }, + { + "epoch": 2.180024399299846, + "grad_norm": 4.1055145263671875, + "learning_rate": 1.366626001166923e-05, + "loss": 0.5074, + "step": 246600 + }, + { + "epoch": 2.1801128025601586, + "grad_norm": 1.5637820959091187, + "learning_rate": 1.3664786623997361e-05, + "loss": 0.4739, + "step": 246610 + }, + { + "epoch": 2.1802012058204707, + "grad_norm": 1.666195034980774, + "learning_rate": 1.366331323632549e-05, + "loss": 0.5394, + "step": 246620 + }, + { + "epoch": 2.180289609080783, + "grad_norm": 4.35031795501709, + "learning_rate": 1.3661839848653618e-05, + "loss": 0.5235, + "step": 246630 + }, + { + "epoch": 2.180378012341095, + "grad_norm": 3.142624616622925, + "learning_rate": 1.366036646098175e-05, + "loss": 0.4949, + "step": 246640 + }, + { + "epoch": 2.1804664156014075, + "grad_norm": 1.2951722145080566, + "learning_rate": 1.3658893073309878e-05, + "loss": 0.5552, + "step": 246650 + }, + { + "epoch": 2.1805548188617196, + "grad_norm": 2.6370766162872314, + "learning_rate": 1.3657419685638006e-05, + "loss": 0.6229, + "step": 246660 + }, + { + "epoch": 2.1806432221220318, + "grad_norm": 18.470544815063477, + "learning_rate": 1.3655946297966135e-05, + "loss": 0.6222, + "step": 246670 + }, + { + "epoch": 2.180731625382344, + "grad_norm": 6.603714942932129, + "learning_rate": 1.3654472910294267e-05, + "loss": 0.5231, + "step": 246680 + }, + { + "epoch": 2.1808200286426564, + "grad_norm": 1.8312325477600098, + "learning_rate": 1.3652999522622395e-05, + "loss": 0.5662, + "step": 246690 + }, + { + "epoch": 2.1809084319029686, + "grad_norm": 1.8746618032455444, + "learning_rate": 1.3651526134950523e-05, + "loss": 0.5281, + "step": 246700 + }, + { + "epoch": 2.1809968351632807, + "grad_norm": 11.97623348236084, + "learning_rate": 1.3650052747278655e-05, + "loss": 0.6048, + "step": 246710 + }, + { + "epoch": 2.1810852384235933, + "grad_norm": 4.407632827758789, + "learning_rate": 1.3648579359606783e-05, + "loss": 0.5664, + "step": 246720 + }, + { + "epoch": 2.1811736416839054, + "grad_norm": 3.3378896713256836, + "learning_rate": 1.3647105971934912e-05, + "loss": 0.5478, + "step": 246730 + }, + { + "epoch": 2.1812620449442175, + "grad_norm": 1.7167526483535767, + "learning_rate": 1.364563258426304e-05, + "loss": 0.5062, + "step": 246740 + }, + { + "epoch": 2.1813504482045296, + "grad_norm": 0.7564432621002197, + "learning_rate": 1.3644159196591172e-05, + "loss": 0.5083, + "step": 246750 + }, + { + "epoch": 2.181438851464842, + "grad_norm": 9.128724098205566, + "learning_rate": 1.36426858089193e-05, + "loss": 0.5609, + "step": 246760 + }, + { + "epoch": 2.1815272547251543, + "grad_norm": 2.7882425785064697, + "learning_rate": 1.3641212421247428e-05, + "loss": 0.6504, + "step": 246770 + }, + { + "epoch": 2.1816156579854664, + "grad_norm": 1.1797698736190796, + "learning_rate": 1.3639739033575557e-05, + "loss": 0.6186, + "step": 246780 + }, + { + "epoch": 2.1817040612457785, + "grad_norm": 5.56332540512085, + "learning_rate": 1.3638265645903689e-05, + "loss": 0.5978, + "step": 246790 + }, + { + "epoch": 2.181792464506091, + "grad_norm": 3.211848497390747, + "learning_rate": 1.3636792258231817e-05, + "loss": 0.524, + "step": 246800 + }, + { + "epoch": 2.181880867766403, + "grad_norm": 5.251622200012207, + "learning_rate": 1.3635318870559945e-05, + "loss": 0.5718, + "step": 246810 + }, + { + "epoch": 2.1819692710267153, + "grad_norm": 4.092431545257568, + "learning_rate": 1.3633845482888077e-05, + "loss": 0.5831, + "step": 246820 + }, + { + "epoch": 2.182057674287028, + "grad_norm": 1.3686481714248657, + "learning_rate": 1.3632372095216205e-05, + "loss": 0.5996, + "step": 246830 + }, + { + "epoch": 2.18214607754734, + "grad_norm": 1.2413418292999268, + "learning_rate": 1.3630898707544334e-05, + "loss": 0.5227, + "step": 246840 + }, + { + "epoch": 2.182234480807652, + "grad_norm": 2.8372156620025635, + "learning_rate": 1.3629425319872464e-05, + "loss": 0.4212, + "step": 246850 + }, + { + "epoch": 2.1823228840679643, + "grad_norm": 5.697726726531982, + "learning_rate": 1.3627951932200594e-05, + "loss": 0.4158, + "step": 246860 + }, + { + "epoch": 2.182411287328277, + "grad_norm": 3.255807638168335, + "learning_rate": 1.3626478544528722e-05, + "loss": 0.4927, + "step": 246870 + }, + { + "epoch": 2.182499690588589, + "grad_norm": 2.7179830074310303, + "learning_rate": 1.3625005156856852e-05, + "loss": 0.4467, + "step": 246880 + }, + { + "epoch": 2.182588093848901, + "grad_norm": 1.6089831590652466, + "learning_rate": 1.3623531769184982e-05, + "loss": 0.4907, + "step": 246890 + }, + { + "epoch": 2.182676497109213, + "grad_norm": 2.2919106483459473, + "learning_rate": 1.362205838151311e-05, + "loss": 0.4628, + "step": 246900 + }, + { + "epoch": 2.1827649003695258, + "grad_norm": 10.823596000671387, + "learning_rate": 1.362058499384124e-05, + "loss": 0.5692, + "step": 246910 + }, + { + "epoch": 2.182853303629838, + "grad_norm": 2.7689125537872314, + "learning_rate": 1.3619111606169369e-05, + "loss": 0.5002, + "step": 246920 + }, + { + "epoch": 2.18294170689015, + "grad_norm": 3.01531982421875, + "learning_rate": 1.3617638218497499e-05, + "loss": 0.4995, + "step": 246930 + }, + { + "epoch": 2.1830301101504626, + "grad_norm": 1.2634992599487305, + "learning_rate": 1.3616164830825629e-05, + "loss": 0.4982, + "step": 246940 + }, + { + "epoch": 2.1831185134107747, + "grad_norm": 3.51444149017334, + "learning_rate": 1.3614691443153757e-05, + "loss": 0.5695, + "step": 246950 + }, + { + "epoch": 2.183206916671087, + "grad_norm": 2.664560556411743, + "learning_rate": 1.3613218055481886e-05, + "loss": 0.4744, + "step": 246960 + }, + { + "epoch": 2.183295319931399, + "grad_norm": 1.4890944957733154, + "learning_rate": 1.3611744667810018e-05, + "loss": 0.5306, + "step": 246970 + }, + { + "epoch": 2.1833837231917115, + "grad_norm": 2.1953353881835938, + "learning_rate": 1.3610271280138146e-05, + "loss": 0.4721, + "step": 246980 + }, + { + "epoch": 2.1834721264520236, + "grad_norm": 5.770285606384277, + "learning_rate": 1.3608797892466274e-05, + "loss": 0.5095, + "step": 246990 + }, + { + "epoch": 2.1835605297123357, + "grad_norm": 4.705709457397461, + "learning_rate": 1.3607324504794406e-05, + "loss": 0.5324, + "step": 247000 + }, + { + "epoch": 2.183648932972648, + "grad_norm": 1.074044942855835, + "learning_rate": 1.3605851117122534e-05, + "loss": 0.4377, + "step": 247010 + }, + { + "epoch": 2.1837373362329604, + "grad_norm": 4.647189140319824, + "learning_rate": 1.3604377729450663e-05, + "loss": 0.5123, + "step": 247020 + }, + { + "epoch": 2.1838257394932725, + "grad_norm": 1.1642022132873535, + "learning_rate": 1.3602904341778791e-05, + "loss": 0.5342, + "step": 247030 + }, + { + "epoch": 2.1839141427535846, + "grad_norm": 2.552166223526001, + "learning_rate": 1.3601430954106923e-05, + "loss": 0.6953, + "step": 247040 + }, + { + "epoch": 2.1840025460138968, + "grad_norm": 13.36396312713623, + "learning_rate": 1.3599957566435051e-05, + "loss": 0.5103, + "step": 247050 + }, + { + "epoch": 2.1840909492742093, + "grad_norm": 1.8164278268814087, + "learning_rate": 1.359848417876318e-05, + "loss": 0.5395, + "step": 247060 + }, + { + "epoch": 2.1841793525345214, + "grad_norm": 11.132706642150879, + "learning_rate": 1.3597010791091311e-05, + "loss": 0.5122, + "step": 247070 + }, + { + "epoch": 2.1842677557948336, + "grad_norm": 5.581788063049316, + "learning_rate": 1.359553740341944e-05, + "loss": 0.4983, + "step": 247080 + }, + { + "epoch": 2.184356159055146, + "grad_norm": 2.7674078941345215, + "learning_rate": 1.3594064015747568e-05, + "loss": 0.528, + "step": 247090 + }, + { + "epoch": 2.1844445623154582, + "grad_norm": 1.7639862298965454, + "learning_rate": 1.3592590628075696e-05, + "loss": 0.5862, + "step": 247100 + }, + { + "epoch": 2.1845329655757704, + "grad_norm": 2.098076105117798, + "learning_rate": 1.3591117240403828e-05, + "loss": 0.5694, + "step": 247110 + }, + { + "epoch": 2.1846213688360825, + "grad_norm": 1.3820040225982666, + "learning_rate": 1.3589643852731956e-05, + "loss": 0.6859, + "step": 247120 + }, + { + "epoch": 2.184709772096395, + "grad_norm": 10.822224617004395, + "learning_rate": 1.3588170465060085e-05, + "loss": 0.6197, + "step": 247130 + }, + { + "epoch": 2.184798175356707, + "grad_norm": 2.72259259223938, + "learning_rate": 1.3586697077388213e-05, + "loss": 0.4962, + "step": 247140 + }, + { + "epoch": 2.1848865786170193, + "grad_norm": 2.1395621299743652, + "learning_rate": 1.3585223689716345e-05, + "loss": 0.512, + "step": 247150 + }, + { + "epoch": 2.184974981877332, + "grad_norm": 9.799553871154785, + "learning_rate": 1.3583750302044473e-05, + "loss": 0.5989, + "step": 247160 + }, + { + "epoch": 2.185063385137644, + "grad_norm": 2.2822749614715576, + "learning_rate": 1.3582276914372602e-05, + "loss": 0.4974, + "step": 247170 + }, + { + "epoch": 2.185151788397956, + "grad_norm": 7.460197448730469, + "learning_rate": 1.3580803526700733e-05, + "loss": 0.4834, + "step": 247180 + }, + { + "epoch": 2.185240191658268, + "grad_norm": 14.248702049255371, + "learning_rate": 1.3579330139028862e-05, + "loss": 0.5068, + "step": 247190 + }, + { + "epoch": 2.185328594918581, + "grad_norm": 1.573256015777588, + "learning_rate": 1.357785675135699e-05, + "loss": 0.4844, + "step": 247200 + }, + { + "epoch": 2.185416998178893, + "grad_norm": 3.189751625061035, + "learning_rate": 1.3576383363685118e-05, + "loss": 0.49, + "step": 247210 + }, + { + "epoch": 2.185505401439205, + "grad_norm": 3.4552056789398193, + "learning_rate": 1.357490997601325e-05, + "loss": 0.5075, + "step": 247220 + }, + { + "epoch": 2.185593804699517, + "grad_norm": 2.482994318008423, + "learning_rate": 1.3573436588341379e-05, + "loss": 0.5796, + "step": 247230 + }, + { + "epoch": 2.1856822079598297, + "grad_norm": 2.9146435260772705, + "learning_rate": 1.3571963200669507e-05, + "loss": 0.5249, + "step": 247240 + }, + { + "epoch": 2.185770611220142, + "grad_norm": 3.2425994873046875, + "learning_rate": 1.3570489812997635e-05, + "loss": 0.597, + "step": 247250 + }, + { + "epoch": 2.185859014480454, + "grad_norm": 4.335749626159668, + "learning_rate": 1.3569016425325767e-05, + "loss": 0.5688, + "step": 247260 + }, + { + "epoch": 2.185947417740766, + "grad_norm": 2.110487937927246, + "learning_rate": 1.3567543037653895e-05, + "loss": 0.5022, + "step": 247270 + }, + { + "epoch": 2.1860358210010786, + "grad_norm": 2.5514917373657227, + "learning_rate": 1.3566069649982024e-05, + "loss": 0.5973, + "step": 247280 + }, + { + "epoch": 2.1861242242613907, + "grad_norm": 1.9982377290725708, + "learning_rate": 1.3564596262310155e-05, + "loss": 0.5071, + "step": 247290 + }, + { + "epoch": 2.186212627521703, + "grad_norm": 7.5051774978637695, + "learning_rate": 1.3563122874638284e-05, + "loss": 0.5216, + "step": 247300 + }, + { + "epoch": 2.1863010307820154, + "grad_norm": 5.773554801940918, + "learning_rate": 1.3561649486966412e-05, + "loss": 0.561, + "step": 247310 + }, + { + "epoch": 2.1863894340423276, + "grad_norm": 3.029366970062256, + "learning_rate": 1.3560176099294542e-05, + "loss": 0.5379, + "step": 247320 + }, + { + "epoch": 2.1864778373026397, + "grad_norm": 4.5431036949157715, + "learning_rate": 1.3558702711622672e-05, + "loss": 0.5313, + "step": 247330 + }, + { + "epoch": 2.186566240562952, + "grad_norm": 2.5761382579803467, + "learning_rate": 1.35572293239508e-05, + "loss": 0.413, + "step": 247340 + }, + { + "epoch": 2.1866546438232644, + "grad_norm": 2.7519407272338867, + "learning_rate": 1.355575593627893e-05, + "loss": 0.6355, + "step": 247350 + }, + { + "epoch": 2.1867430470835765, + "grad_norm": 2.0490541458129883, + "learning_rate": 1.355428254860706e-05, + "loss": 0.4491, + "step": 247360 + }, + { + "epoch": 2.1868314503438886, + "grad_norm": 2.183389186859131, + "learning_rate": 1.3552809160935189e-05, + "loss": 0.5078, + "step": 247370 + }, + { + "epoch": 2.186919853604201, + "grad_norm": 3.79795503616333, + "learning_rate": 1.3551335773263319e-05, + "loss": 0.5543, + "step": 247380 + }, + { + "epoch": 2.1870082568645133, + "grad_norm": 0.8931883573532104, + "learning_rate": 1.3549862385591447e-05, + "loss": 0.571, + "step": 247390 + }, + { + "epoch": 2.1870966601248254, + "grad_norm": 1.9521468877792358, + "learning_rate": 1.3548388997919577e-05, + "loss": 0.5864, + "step": 247400 + }, + { + "epoch": 2.1871850633851375, + "grad_norm": 2.5125038623809814, + "learning_rate": 1.3546915610247708e-05, + "loss": 0.6021, + "step": 247410 + }, + { + "epoch": 2.18727346664545, + "grad_norm": 1.7364736795425415, + "learning_rate": 1.3545442222575836e-05, + "loss": 0.4599, + "step": 247420 + }, + { + "epoch": 2.187361869905762, + "grad_norm": 2.0616772174835205, + "learning_rate": 1.3543968834903964e-05, + "loss": 0.4675, + "step": 247430 + }, + { + "epoch": 2.1874502731660743, + "grad_norm": 14.144537925720215, + "learning_rate": 1.3542495447232096e-05, + "loss": 0.4927, + "step": 247440 + }, + { + "epoch": 2.1875386764263864, + "grad_norm": 2.120788097381592, + "learning_rate": 1.3541022059560224e-05, + "loss": 0.533, + "step": 247450 + }, + { + "epoch": 2.187627079686699, + "grad_norm": 4.679813861846924, + "learning_rate": 1.3539548671888353e-05, + "loss": 0.4059, + "step": 247460 + }, + { + "epoch": 2.187715482947011, + "grad_norm": 1.8896130323410034, + "learning_rate": 1.3538075284216484e-05, + "loss": 0.4189, + "step": 247470 + }, + { + "epoch": 2.1878038862073232, + "grad_norm": 7.276263236999512, + "learning_rate": 1.3536601896544613e-05, + "loss": 0.4476, + "step": 247480 + }, + { + "epoch": 2.1878922894676354, + "grad_norm": 4.5130486488342285, + "learning_rate": 1.3535128508872741e-05, + "loss": 0.5407, + "step": 247490 + }, + { + "epoch": 2.187980692727948, + "grad_norm": 3.916213274002075, + "learning_rate": 1.353365512120087e-05, + "loss": 0.5075, + "step": 247500 + }, + { + "epoch": 2.18806909598826, + "grad_norm": 4.324140548706055, + "learning_rate": 1.3532181733529001e-05, + "loss": 0.5561, + "step": 247510 + }, + { + "epoch": 2.188157499248572, + "grad_norm": 2.5763211250305176, + "learning_rate": 1.353070834585713e-05, + "loss": 0.5688, + "step": 247520 + }, + { + "epoch": 2.1882459025088847, + "grad_norm": 1.2428698539733887, + "learning_rate": 1.3529234958185258e-05, + "loss": 0.5472, + "step": 247530 + }, + { + "epoch": 2.188334305769197, + "grad_norm": 2.391094207763672, + "learning_rate": 1.352776157051339e-05, + "loss": 0.5318, + "step": 247540 + }, + { + "epoch": 2.188422709029509, + "grad_norm": 4.65841817855835, + "learning_rate": 1.3526288182841518e-05, + "loss": 0.6427, + "step": 247550 + }, + { + "epoch": 2.188511112289821, + "grad_norm": 2.0346181392669678, + "learning_rate": 1.3524814795169646e-05, + "loss": 0.4725, + "step": 247560 + }, + { + "epoch": 2.1885995155501337, + "grad_norm": 5.212941646575928, + "learning_rate": 1.3523341407497775e-05, + "loss": 0.6663, + "step": 247570 + }, + { + "epoch": 2.1886879188104458, + "grad_norm": 1.4639272689819336, + "learning_rate": 1.3521868019825906e-05, + "loss": 0.5491, + "step": 247580 + }, + { + "epoch": 2.188776322070758, + "grad_norm": 2.0487008094787598, + "learning_rate": 1.3520394632154035e-05, + "loss": 0.5449, + "step": 247590 + }, + { + "epoch": 2.18886472533107, + "grad_norm": 9.501411437988281, + "learning_rate": 1.3518921244482163e-05, + "loss": 0.5155, + "step": 247600 + }, + { + "epoch": 2.1889531285913826, + "grad_norm": 3.623779773712158, + "learning_rate": 1.3517447856810292e-05, + "loss": 0.4581, + "step": 247610 + }, + { + "epoch": 2.1890415318516947, + "grad_norm": 1.9156826734542847, + "learning_rate": 1.3515974469138423e-05, + "loss": 0.393, + "step": 247620 + }, + { + "epoch": 2.189129935112007, + "grad_norm": 3.184713363647461, + "learning_rate": 1.3514501081466552e-05, + "loss": 0.4906, + "step": 247630 + }, + { + "epoch": 2.189218338372319, + "grad_norm": 7.753109931945801, + "learning_rate": 1.351302769379468e-05, + "loss": 0.5886, + "step": 247640 + }, + { + "epoch": 2.1893067416326315, + "grad_norm": 7.002075672149658, + "learning_rate": 1.3511554306122812e-05, + "loss": 0.6324, + "step": 247650 + }, + { + "epoch": 2.1893951448929436, + "grad_norm": 2.602299213409424, + "learning_rate": 1.351008091845094e-05, + "loss": 0.5468, + "step": 247660 + }, + { + "epoch": 2.1894835481532557, + "grad_norm": 5.828254699707031, + "learning_rate": 1.3508607530779068e-05, + "loss": 0.542, + "step": 247670 + }, + { + "epoch": 2.1895719514135683, + "grad_norm": 1.429974913597107, + "learning_rate": 1.3507134143107197e-05, + "loss": 0.46, + "step": 247680 + }, + { + "epoch": 2.1896603546738804, + "grad_norm": 2.3256983757019043, + "learning_rate": 1.3505660755435329e-05, + "loss": 0.5497, + "step": 247690 + }, + { + "epoch": 2.1897487579341925, + "grad_norm": 0.7099183797836304, + "learning_rate": 1.3504187367763457e-05, + "loss": 0.586, + "step": 247700 + }, + { + "epoch": 2.1898371611945047, + "grad_norm": 2.024054765701294, + "learning_rate": 1.3502713980091585e-05, + "loss": 0.6126, + "step": 247710 + }, + { + "epoch": 2.1899255644548172, + "grad_norm": 5.463651180267334, + "learning_rate": 1.3501240592419714e-05, + "loss": 0.4912, + "step": 247720 + }, + { + "epoch": 2.1900139677151294, + "grad_norm": 5.231754302978516, + "learning_rate": 1.3499767204747845e-05, + "loss": 0.4863, + "step": 247730 + }, + { + "epoch": 2.1901023709754415, + "grad_norm": 3.2312517166137695, + "learning_rate": 1.3498293817075974e-05, + "loss": 0.5996, + "step": 247740 + }, + { + "epoch": 2.190190774235754, + "grad_norm": 3.621776819229126, + "learning_rate": 1.3496820429404102e-05, + "loss": 0.5378, + "step": 247750 + }, + { + "epoch": 2.190279177496066, + "grad_norm": 1.9060697555541992, + "learning_rate": 1.3495347041732234e-05, + "loss": 0.5534, + "step": 247760 + }, + { + "epoch": 2.1903675807563783, + "grad_norm": 1.1750483512878418, + "learning_rate": 1.3493873654060362e-05, + "loss": 0.539, + "step": 247770 + }, + { + "epoch": 2.1904559840166904, + "grad_norm": 18.78888511657715, + "learning_rate": 1.349240026638849e-05, + "loss": 0.5812, + "step": 247780 + }, + { + "epoch": 2.190544387277003, + "grad_norm": 6.772854804992676, + "learning_rate": 1.349092687871662e-05, + "loss": 0.5029, + "step": 247790 + }, + { + "epoch": 2.190632790537315, + "grad_norm": 5.093850135803223, + "learning_rate": 1.348945349104475e-05, + "loss": 0.532, + "step": 247800 + }, + { + "epoch": 2.190721193797627, + "grad_norm": 6.806595325469971, + "learning_rate": 1.3487980103372879e-05, + "loss": 0.5704, + "step": 247810 + }, + { + "epoch": 2.1908095970579393, + "grad_norm": 10.253061294555664, + "learning_rate": 1.3486506715701009e-05, + "loss": 0.4534, + "step": 247820 + }, + { + "epoch": 2.190898000318252, + "grad_norm": 3.1542418003082275, + "learning_rate": 1.3485033328029139e-05, + "loss": 0.6127, + "step": 247830 + }, + { + "epoch": 2.190986403578564, + "grad_norm": 4.665557384490967, + "learning_rate": 1.3483559940357267e-05, + "loss": 0.5931, + "step": 247840 + }, + { + "epoch": 2.191074806838876, + "grad_norm": 4.058244705200195, + "learning_rate": 1.3482086552685397e-05, + "loss": 0.5956, + "step": 247850 + }, + { + "epoch": 2.1911632100991882, + "grad_norm": 2.2743899822235107, + "learning_rate": 1.3480613165013526e-05, + "loss": 0.5266, + "step": 247860 + }, + { + "epoch": 2.191251613359501, + "grad_norm": 8.486218452453613, + "learning_rate": 1.3479139777341656e-05, + "loss": 0.4448, + "step": 247870 + }, + { + "epoch": 2.191340016619813, + "grad_norm": 7.624509334564209, + "learning_rate": 1.3477666389669786e-05, + "loss": 0.6627, + "step": 247880 + }, + { + "epoch": 2.191428419880125, + "grad_norm": 2.049185276031494, + "learning_rate": 1.3476193001997914e-05, + "loss": 0.5748, + "step": 247890 + }, + { + "epoch": 2.1915168231404376, + "grad_norm": 2.933204174041748, + "learning_rate": 1.3474719614326043e-05, + "loss": 0.4976, + "step": 247900 + }, + { + "epoch": 2.1916052264007497, + "grad_norm": 4.831140041351318, + "learning_rate": 1.3473246226654174e-05, + "loss": 0.4995, + "step": 247910 + }, + { + "epoch": 2.191693629661062, + "grad_norm": 1.058968424797058, + "learning_rate": 1.3471772838982303e-05, + "loss": 0.475, + "step": 247920 + }, + { + "epoch": 2.191782032921374, + "grad_norm": 1.9651315212249756, + "learning_rate": 1.3470299451310431e-05, + "loss": 0.425, + "step": 247930 + }, + { + "epoch": 2.1918704361816865, + "grad_norm": 7.354053974151611, + "learning_rate": 1.3468826063638563e-05, + "loss": 0.5704, + "step": 247940 + }, + { + "epoch": 2.1919588394419987, + "grad_norm": 2.5158638954162598, + "learning_rate": 1.3467352675966691e-05, + "loss": 0.472, + "step": 247950 + }, + { + "epoch": 2.1920472427023108, + "grad_norm": 2.033168077468872, + "learning_rate": 1.346587928829482e-05, + "loss": 0.6133, + "step": 247960 + }, + { + "epoch": 2.1921356459626233, + "grad_norm": 3.3521625995635986, + "learning_rate": 1.3464405900622948e-05, + "loss": 0.5425, + "step": 247970 + }, + { + "epoch": 2.1922240492229355, + "grad_norm": 4.782717704772949, + "learning_rate": 1.346293251295108e-05, + "loss": 0.5774, + "step": 247980 + }, + { + "epoch": 2.1923124524832476, + "grad_norm": 2.2091333866119385, + "learning_rate": 1.3461459125279208e-05, + "loss": 0.5426, + "step": 247990 + }, + { + "epoch": 2.1924008557435597, + "grad_norm": 32.41534423828125, + "learning_rate": 1.3459985737607336e-05, + "loss": 0.6794, + "step": 248000 + }, + { + "epoch": 2.1924892590038723, + "grad_norm": 2.869694948196411, + "learning_rate": 1.3458512349935468e-05, + "loss": 0.5782, + "step": 248010 + }, + { + "epoch": 2.1925776622641844, + "grad_norm": 5.095350742340088, + "learning_rate": 1.3457038962263596e-05, + "loss": 0.5509, + "step": 248020 + }, + { + "epoch": 2.1926660655244965, + "grad_norm": 2.3132121562957764, + "learning_rate": 1.3455565574591725e-05, + "loss": 0.4913, + "step": 248030 + }, + { + "epoch": 2.1927544687848086, + "grad_norm": 4.364688873291016, + "learning_rate": 1.3454092186919853e-05, + "loss": 0.5258, + "step": 248040 + }, + { + "epoch": 2.192842872045121, + "grad_norm": 2.4316611289978027, + "learning_rate": 1.3452618799247985e-05, + "loss": 0.4778, + "step": 248050 + }, + { + "epoch": 2.1929312753054333, + "grad_norm": 2.218245506286621, + "learning_rate": 1.3451145411576113e-05, + "loss": 0.5331, + "step": 248060 + }, + { + "epoch": 2.1930196785657454, + "grad_norm": 1.2283589839935303, + "learning_rate": 1.3449672023904242e-05, + "loss": 0.461, + "step": 248070 + }, + { + "epoch": 2.1931080818260575, + "grad_norm": 3.451308488845825, + "learning_rate": 1.344819863623237e-05, + "loss": 0.5336, + "step": 248080 + }, + { + "epoch": 2.19319648508637, + "grad_norm": 2.578212261199951, + "learning_rate": 1.3446725248560502e-05, + "loss": 0.5903, + "step": 248090 + }, + { + "epoch": 2.1932848883466822, + "grad_norm": 4.578711986541748, + "learning_rate": 1.344525186088863e-05, + "loss": 0.656, + "step": 248100 + }, + { + "epoch": 2.1933732916069943, + "grad_norm": 4.933976650238037, + "learning_rate": 1.3443778473216758e-05, + "loss": 0.442, + "step": 248110 + }, + { + "epoch": 2.193461694867307, + "grad_norm": 4.5620036125183105, + "learning_rate": 1.344230508554489e-05, + "loss": 0.5356, + "step": 248120 + }, + { + "epoch": 2.193550098127619, + "grad_norm": 1.3037015199661255, + "learning_rate": 1.3440831697873018e-05, + "loss": 0.543, + "step": 248130 + }, + { + "epoch": 2.193638501387931, + "grad_norm": 7.304600715637207, + "learning_rate": 1.3439358310201147e-05, + "loss": 0.614, + "step": 248140 + }, + { + "epoch": 2.1937269046482433, + "grad_norm": 1.267075538635254, + "learning_rate": 1.3437884922529275e-05, + "loss": 0.531, + "step": 248150 + }, + { + "epoch": 2.193815307908556, + "grad_norm": 5.666834354400635, + "learning_rate": 1.3436411534857407e-05, + "loss": 0.5295, + "step": 248160 + }, + { + "epoch": 2.193903711168868, + "grad_norm": 3.5872297286987305, + "learning_rate": 1.3434938147185535e-05, + "loss": 0.3633, + "step": 248170 + }, + { + "epoch": 2.19399211442918, + "grad_norm": 8.192139625549316, + "learning_rate": 1.3433464759513664e-05, + "loss": 0.4571, + "step": 248180 + }, + { + "epoch": 2.194080517689492, + "grad_norm": 4.5500688552856445, + "learning_rate": 1.3431991371841792e-05, + "loss": 0.6117, + "step": 248190 + }, + { + "epoch": 2.1941689209498048, + "grad_norm": 0.9038406610488892, + "learning_rate": 1.3430517984169924e-05, + "loss": 0.4218, + "step": 248200 + }, + { + "epoch": 2.194257324210117, + "grad_norm": 2.1371798515319824, + "learning_rate": 1.3429044596498052e-05, + "loss": 0.4766, + "step": 248210 + }, + { + "epoch": 2.194345727470429, + "grad_norm": 8.208730697631836, + "learning_rate": 1.342757120882618e-05, + "loss": 0.479, + "step": 248220 + }, + { + "epoch": 2.194434130730741, + "grad_norm": 2.32049822807312, + "learning_rate": 1.3426097821154312e-05, + "loss": 0.4093, + "step": 248230 + }, + { + "epoch": 2.1945225339910537, + "grad_norm": 3.3404295444488525, + "learning_rate": 1.342462443348244e-05, + "loss": 0.5052, + "step": 248240 + }, + { + "epoch": 2.194610937251366, + "grad_norm": 3.0793960094451904, + "learning_rate": 1.3423151045810569e-05, + "loss": 0.6201, + "step": 248250 + }, + { + "epoch": 2.194699340511678, + "grad_norm": 35.796329498291016, + "learning_rate": 1.3421677658138699e-05, + "loss": 0.6822, + "step": 248260 + }, + { + "epoch": 2.1947877437719905, + "grad_norm": 5.138563632965088, + "learning_rate": 1.3420204270466829e-05, + "loss": 0.7885, + "step": 248270 + }, + { + "epoch": 2.1948761470323026, + "grad_norm": 3.4745380878448486, + "learning_rate": 1.3418730882794957e-05, + "loss": 0.5765, + "step": 248280 + }, + { + "epoch": 2.1949645502926147, + "grad_norm": 6.935659408569336, + "learning_rate": 1.3417257495123087e-05, + "loss": 0.6093, + "step": 248290 + }, + { + "epoch": 2.195052953552927, + "grad_norm": 1.4496848583221436, + "learning_rate": 1.3415784107451217e-05, + "loss": 0.614, + "step": 248300 + }, + { + "epoch": 2.1951413568132394, + "grad_norm": 2.736081123352051, + "learning_rate": 1.3414310719779346e-05, + "loss": 0.5063, + "step": 248310 + }, + { + "epoch": 2.1952297600735515, + "grad_norm": 2.9513230323791504, + "learning_rate": 1.3412837332107476e-05, + "loss": 0.5127, + "step": 248320 + }, + { + "epoch": 2.1953181633338636, + "grad_norm": 1.1803243160247803, + "learning_rate": 1.3411363944435604e-05, + "loss": 0.6258, + "step": 248330 + }, + { + "epoch": 2.195406566594176, + "grad_norm": 3.999048948287964, + "learning_rate": 1.3409890556763736e-05, + "loss": 0.4908, + "step": 248340 + }, + { + "epoch": 2.1954949698544883, + "grad_norm": 1.4178131818771362, + "learning_rate": 1.3408417169091864e-05, + "loss": 0.469, + "step": 248350 + }, + { + "epoch": 2.1955833731148005, + "grad_norm": 3.1010172367095947, + "learning_rate": 1.3406943781419993e-05, + "loss": 0.4981, + "step": 248360 + }, + { + "epoch": 2.1956717763751126, + "grad_norm": 2.83014178276062, + "learning_rate": 1.3405470393748121e-05, + "loss": 0.4479, + "step": 248370 + }, + { + "epoch": 2.195760179635425, + "grad_norm": 4.243435382843018, + "learning_rate": 1.3403997006076253e-05, + "loss": 0.4798, + "step": 248380 + }, + { + "epoch": 2.1958485828957373, + "grad_norm": 6.018049716949463, + "learning_rate": 1.3402523618404381e-05, + "loss": 0.5454, + "step": 248390 + }, + { + "epoch": 2.1959369861560494, + "grad_norm": 2.2778513431549072, + "learning_rate": 1.340105023073251e-05, + "loss": 0.5496, + "step": 248400 + }, + { + "epoch": 2.1960253894163615, + "grad_norm": 2.533500909805298, + "learning_rate": 1.3399576843060641e-05, + "loss": 0.4309, + "step": 248410 + }, + { + "epoch": 2.196113792676674, + "grad_norm": 4.050847053527832, + "learning_rate": 1.339810345538877e-05, + "loss": 0.5442, + "step": 248420 + }, + { + "epoch": 2.196202195936986, + "grad_norm": 3.163923978805542, + "learning_rate": 1.3396630067716898e-05, + "loss": 0.6295, + "step": 248430 + }, + { + "epoch": 2.1962905991972983, + "grad_norm": 1.9944928884506226, + "learning_rate": 1.3395156680045026e-05, + "loss": 0.4354, + "step": 248440 + }, + { + "epoch": 2.1963790024576104, + "grad_norm": 3.093656539916992, + "learning_rate": 1.3393683292373158e-05, + "loss": 0.5283, + "step": 248450 + }, + { + "epoch": 2.196467405717923, + "grad_norm": 5.619362831115723, + "learning_rate": 1.3392209904701286e-05, + "loss": 0.5574, + "step": 248460 + }, + { + "epoch": 2.196555808978235, + "grad_norm": 5.0072526931762695, + "learning_rate": 1.3390736517029415e-05, + "loss": 0.4315, + "step": 248470 + }, + { + "epoch": 2.1966442122385472, + "grad_norm": 2.0619728565216064, + "learning_rate": 1.3389263129357546e-05, + "loss": 0.4966, + "step": 248480 + }, + { + "epoch": 2.19673261549886, + "grad_norm": 2.8122880458831787, + "learning_rate": 1.3387789741685675e-05, + "loss": 0.5742, + "step": 248490 + }, + { + "epoch": 2.196821018759172, + "grad_norm": 1.3270502090454102, + "learning_rate": 1.3386316354013803e-05, + "loss": 0.6397, + "step": 248500 + }, + { + "epoch": 2.196909422019484, + "grad_norm": 1.6752490997314453, + "learning_rate": 1.3384842966341931e-05, + "loss": 0.4741, + "step": 248510 + }, + { + "epoch": 2.196997825279796, + "grad_norm": 1.8148902654647827, + "learning_rate": 1.3383369578670063e-05, + "loss": 0.4602, + "step": 248520 + }, + { + "epoch": 2.1970862285401087, + "grad_norm": 4.225399971008301, + "learning_rate": 1.3381896190998192e-05, + "loss": 0.4487, + "step": 248530 + }, + { + "epoch": 2.197174631800421, + "grad_norm": 6.169032573699951, + "learning_rate": 1.338042280332632e-05, + "loss": 0.4907, + "step": 248540 + }, + { + "epoch": 2.197263035060733, + "grad_norm": 5.886810302734375, + "learning_rate": 1.3378949415654448e-05, + "loss": 0.496, + "step": 248550 + }, + { + "epoch": 2.1973514383210455, + "grad_norm": 3.5278451442718506, + "learning_rate": 1.337747602798258e-05, + "loss": 0.4627, + "step": 248560 + }, + { + "epoch": 2.1974398415813576, + "grad_norm": 2.5107581615448, + "learning_rate": 1.3376002640310708e-05, + "loss": 0.522, + "step": 248570 + }, + { + "epoch": 2.1975282448416698, + "grad_norm": 6.515069007873535, + "learning_rate": 1.3374529252638837e-05, + "loss": 0.6172, + "step": 248580 + }, + { + "epoch": 2.197616648101982, + "grad_norm": 2.8458285331726074, + "learning_rate": 1.3373055864966968e-05, + "loss": 0.4554, + "step": 248590 + }, + { + "epoch": 2.1977050513622944, + "grad_norm": 5.031824111938477, + "learning_rate": 1.3371582477295097e-05, + "loss": 0.4179, + "step": 248600 + }, + { + "epoch": 2.1977934546226066, + "grad_norm": 2.114513874053955, + "learning_rate": 1.3370109089623225e-05, + "loss": 0.6778, + "step": 248610 + }, + { + "epoch": 2.1978818578829187, + "grad_norm": 2.5759754180908203, + "learning_rate": 1.3368635701951354e-05, + "loss": 0.6144, + "step": 248620 + }, + { + "epoch": 2.197970261143231, + "grad_norm": 1.5666327476501465, + "learning_rate": 1.3367162314279485e-05, + "loss": 0.4739, + "step": 248630 + }, + { + "epoch": 2.1980586644035434, + "grad_norm": 2.527386426925659, + "learning_rate": 1.3365688926607614e-05, + "loss": 0.6023, + "step": 248640 + }, + { + "epoch": 2.1981470676638555, + "grad_norm": 11.160932540893555, + "learning_rate": 1.3364215538935742e-05, + "loss": 0.4996, + "step": 248650 + }, + { + "epoch": 2.1982354709241676, + "grad_norm": 3.554805278778076, + "learning_rate": 1.3362742151263874e-05, + "loss": 0.5242, + "step": 248660 + }, + { + "epoch": 2.1983238741844797, + "grad_norm": 8.004654884338379, + "learning_rate": 1.3361268763592002e-05, + "loss": 0.5743, + "step": 248670 + }, + { + "epoch": 2.1984122774447923, + "grad_norm": 7.881980895996094, + "learning_rate": 1.335979537592013e-05, + "loss": 0.6251, + "step": 248680 + }, + { + "epoch": 2.1985006807051044, + "grad_norm": 1.4040323495864868, + "learning_rate": 1.335832198824826e-05, + "loss": 0.6106, + "step": 248690 + }, + { + "epoch": 2.1985890839654165, + "grad_norm": 1.475546956062317, + "learning_rate": 1.335684860057639e-05, + "loss": 0.445, + "step": 248700 + }, + { + "epoch": 2.198677487225729, + "grad_norm": 1.6685012578964233, + "learning_rate": 1.3355375212904519e-05, + "loss": 0.5305, + "step": 248710 + }, + { + "epoch": 2.198765890486041, + "grad_norm": 5.823601245880127, + "learning_rate": 1.3353901825232649e-05, + "loss": 0.5807, + "step": 248720 + }, + { + "epoch": 2.1988542937463533, + "grad_norm": 5.065922260284424, + "learning_rate": 1.3352428437560777e-05, + "loss": 0.5159, + "step": 248730 + }, + { + "epoch": 2.1989426970066654, + "grad_norm": 2.789523124694824, + "learning_rate": 1.3350955049888907e-05, + "loss": 0.5042, + "step": 248740 + }, + { + "epoch": 2.199031100266978, + "grad_norm": 1.2181733846664429, + "learning_rate": 1.3349481662217037e-05, + "loss": 0.5228, + "step": 248750 + }, + { + "epoch": 2.19911950352729, + "grad_norm": 3.462846040725708, + "learning_rate": 1.3348008274545166e-05, + "loss": 0.5804, + "step": 248760 + }, + { + "epoch": 2.1992079067876023, + "grad_norm": 6.696045875549316, + "learning_rate": 1.3346534886873296e-05, + "loss": 0.5334, + "step": 248770 + }, + { + "epoch": 2.1992963100479144, + "grad_norm": 13.107380867004395, + "learning_rate": 1.3345061499201426e-05, + "loss": 0.5431, + "step": 248780 + }, + { + "epoch": 2.199384713308227, + "grad_norm": 2.6388185024261475, + "learning_rate": 1.3343588111529554e-05, + "loss": 0.3763, + "step": 248790 + }, + { + "epoch": 2.199473116568539, + "grad_norm": 2.246396064758301, + "learning_rate": 1.3342114723857683e-05, + "loss": 0.5584, + "step": 248800 + }, + { + "epoch": 2.199561519828851, + "grad_norm": 2.575255870819092, + "learning_rate": 1.3340641336185814e-05, + "loss": 0.4359, + "step": 248810 + }, + { + "epoch": 2.1996499230891633, + "grad_norm": 2.8880059719085693, + "learning_rate": 1.3339167948513943e-05, + "loss": 0.556, + "step": 248820 + }, + { + "epoch": 2.199738326349476, + "grad_norm": 2.218728542327881, + "learning_rate": 1.3337694560842071e-05, + "loss": 0.5695, + "step": 248830 + }, + { + "epoch": 2.199826729609788, + "grad_norm": 3.548462390899658, + "learning_rate": 1.33362211731702e-05, + "loss": 0.4176, + "step": 248840 + }, + { + "epoch": 2.1999151328701, + "grad_norm": 2.9193859100341797, + "learning_rate": 1.3334747785498331e-05, + "loss": 0.5695, + "step": 248850 + }, + { + "epoch": 2.2000035361304127, + "grad_norm": 16.740568161010742, + "learning_rate": 1.333327439782646e-05, + "loss": 0.517, + "step": 248860 + }, + { + "epoch": 2.200091939390725, + "grad_norm": 2.8154659271240234, + "learning_rate": 1.3331801010154588e-05, + "loss": 0.4914, + "step": 248870 + }, + { + "epoch": 2.200180342651037, + "grad_norm": 4.678836822509766, + "learning_rate": 1.333032762248272e-05, + "loss": 0.3929, + "step": 248880 + }, + { + "epoch": 2.200268745911349, + "grad_norm": 2.0656960010528564, + "learning_rate": 1.3328854234810848e-05, + "loss": 0.5093, + "step": 248890 + }, + { + "epoch": 2.2003571491716616, + "grad_norm": 2.987643241882324, + "learning_rate": 1.3327380847138976e-05, + "loss": 0.4645, + "step": 248900 + }, + { + "epoch": 2.2004455524319737, + "grad_norm": 12.750919342041016, + "learning_rate": 1.3325907459467105e-05, + "loss": 0.6356, + "step": 248910 + }, + { + "epoch": 2.200533955692286, + "grad_norm": 4.590280532836914, + "learning_rate": 1.3324434071795236e-05, + "loss": 0.5877, + "step": 248920 + }, + { + "epoch": 2.2006223589525984, + "grad_norm": 1.230583667755127, + "learning_rate": 1.3322960684123365e-05, + "loss": 0.5495, + "step": 248930 + }, + { + "epoch": 2.2007107622129105, + "grad_norm": 1.9174563884735107, + "learning_rate": 1.3321487296451493e-05, + "loss": 0.5655, + "step": 248940 + }, + { + "epoch": 2.2007991654732226, + "grad_norm": 4.2815842628479, + "learning_rate": 1.3320013908779625e-05, + "loss": 0.4846, + "step": 248950 + }, + { + "epoch": 2.2008875687335347, + "grad_norm": 18.049541473388672, + "learning_rate": 1.3318540521107753e-05, + "loss": 0.5998, + "step": 248960 + }, + { + "epoch": 2.2009759719938473, + "grad_norm": 4.250127792358398, + "learning_rate": 1.3317067133435882e-05, + "loss": 0.4781, + "step": 248970 + }, + { + "epoch": 2.2010643752541594, + "grad_norm": 25.395647048950195, + "learning_rate": 1.331559374576401e-05, + "loss": 0.496, + "step": 248980 + }, + { + "epoch": 2.2011527785144716, + "grad_norm": 5.661912441253662, + "learning_rate": 1.3314120358092142e-05, + "loss": 0.4074, + "step": 248990 + }, + { + "epoch": 2.2012411817747837, + "grad_norm": 1.278296947479248, + "learning_rate": 1.331264697042027e-05, + "loss": 0.5053, + "step": 249000 + }, + { + "epoch": 2.2013295850350962, + "grad_norm": 5.121157646179199, + "learning_rate": 1.3311173582748398e-05, + "loss": 0.4942, + "step": 249010 + }, + { + "epoch": 2.2014179882954084, + "grad_norm": 1.1101226806640625, + "learning_rate": 1.3309700195076527e-05, + "loss": 0.5041, + "step": 249020 + }, + { + "epoch": 2.2015063915557205, + "grad_norm": 2.6902756690979004, + "learning_rate": 1.3308226807404658e-05, + "loss": 0.5656, + "step": 249030 + }, + { + "epoch": 2.2015947948160326, + "grad_norm": 1.427984595298767, + "learning_rate": 1.3306753419732787e-05, + "loss": 0.4527, + "step": 249040 + }, + { + "epoch": 2.201683198076345, + "grad_norm": 0.9949051141738892, + "learning_rate": 1.3305280032060915e-05, + "loss": 0.5445, + "step": 249050 + }, + { + "epoch": 2.2017716013366573, + "grad_norm": 4.321620464324951, + "learning_rate": 1.3303806644389047e-05, + "loss": 0.515, + "step": 249060 + }, + { + "epoch": 2.2018600045969694, + "grad_norm": 2.191041946411133, + "learning_rate": 1.3302333256717175e-05, + "loss": 0.6025, + "step": 249070 + }, + { + "epoch": 2.201948407857282, + "grad_norm": 7.499270439147949, + "learning_rate": 1.3300859869045304e-05, + "loss": 0.4739, + "step": 249080 + }, + { + "epoch": 2.202036811117594, + "grad_norm": 16.23049545288086, + "learning_rate": 1.3299386481373432e-05, + "loss": 0.4368, + "step": 249090 + }, + { + "epoch": 2.202125214377906, + "grad_norm": 1.9177802801132202, + "learning_rate": 1.3297913093701564e-05, + "loss": 0.4264, + "step": 249100 + }, + { + "epoch": 2.2022136176382183, + "grad_norm": 7.329319953918457, + "learning_rate": 1.3296439706029692e-05, + "loss": 0.5225, + "step": 249110 + }, + { + "epoch": 2.202302020898531, + "grad_norm": 2.6626486778259277, + "learning_rate": 1.329496631835782e-05, + "loss": 0.5472, + "step": 249120 + }, + { + "epoch": 2.202390424158843, + "grad_norm": 3.6199307441711426, + "learning_rate": 1.3293492930685952e-05, + "loss": 0.3916, + "step": 249130 + }, + { + "epoch": 2.202478827419155, + "grad_norm": 1.6247435808181763, + "learning_rate": 1.329201954301408e-05, + "loss": 0.6559, + "step": 249140 + }, + { + "epoch": 2.2025672306794677, + "grad_norm": 2.7864363193511963, + "learning_rate": 1.3290546155342209e-05, + "loss": 0.592, + "step": 249150 + }, + { + "epoch": 2.20265563393978, + "grad_norm": 2.7513508796691895, + "learning_rate": 1.3289072767670339e-05, + "loss": 0.569, + "step": 249160 + }, + { + "epoch": 2.202744037200092, + "grad_norm": 1.8982609510421753, + "learning_rate": 1.3287599379998469e-05, + "loss": 0.5285, + "step": 249170 + }, + { + "epoch": 2.202832440460404, + "grad_norm": 3.4049758911132812, + "learning_rate": 1.3286125992326597e-05, + "loss": 0.7192, + "step": 249180 + }, + { + "epoch": 2.2029208437207166, + "grad_norm": 2.8480541706085205, + "learning_rate": 1.3284652604654727e-05, + "loss": 0.6053, + "step": 249190 + }, + { + "epoch": 2.2030092469810287, + "grad_norm": 4.120934963226318, + "learning_rate": 1.3283179216982856e-05, + "loss": 0.616, + "step": 249200 + }, + { + "epoch": 2.203097650241341, + "grad_norm": 20.30929946899414, + "learning_rate": 1.3281705829310986e-05, + "loss": 0.6394, + "step": 249210 + }, + { + "epoch": 2.203186053501653, + "grad_norm": 1.7026211023330688, + "learning_rate": 1.3280232441639116e-05, + "loss": 0.474, + "step": 249220 + }, + { + "epoch": 2.2032744567619655, + "grad_norm": 1.9374653100967407, + "learning_rate": 1.3278759053967244e-05, + "loss": 0.6231, + "step": 249230 + }, + { + "epoch": 2.2033628600222777, + "grad_norm": 1.2990700006484985, + "learning_rate": 1.3277285666295374e-05, + "loss": 0.4195, + "step": 249240 + }, + { + "epoch": 2.20345126328259, + "grad_norm": 2.4881269931793213, + "learning_rate": 1.3275812278623504e-05, + "loss": 0.6474, + "step": 249250 + }, + { + "epoch": 2.203539666542902, + "grad_norm": 3.403794527053833, + "learning_rate": 1.3274338890951633e-05, + "loss": 0.4742, + "step": 249260 + }, + { + "epoch": 2.2036280698032145, + "grad_norm": 1.1583021879196167, + "learning_rate": 1.3272865503279761e-05, + "loss": 0.6569, + "step": 249270 + }, + { + "epoch": 2.2037164730635266, + "grad_norm": 4.36346960067749, + "learning_rate": 1.3271392115607893e-05, + "loss": 0.5025, + "step": 249280 + }, + { + "epoch": 2.2038048763238387, + "grad_norm": 1.3842864036560059, + "learning_rate": 1.3269918727936021e-05, + "loss": 0.5962, + "step": 249290 + }, + { + "epoch": 2.2038932795841513, + "grad_norm": 3.1008200645446777, + "learning_rate": 1.326844534026415e-05, + "loss": 0.4901, + "step": 249300 + }, + { + "epoch": 2.2039816828444634, + "grad_norm": 6.165328025817871, + "learning_rate": 1.3266971952592278e-05, + "loss": 0.569, + "step": 249310 + }, + { + "epoch": 2.2040700861047755, + "grad_norm": 14.500444412231445, + "learning_rate": 1.326549856492041e-05, + "loss": 0.5134, + "step": 249320 + }, + { + "epoch": 2.2041584893650876, + "grad_norm": 7.207456111907959, + "learning_rate": 1.3264025177248538e-05, + "loss": 0.4805, + "step": 249330 + }, + { + "epoch": 2.2042468926254, + "grad_norm": 2.824608087539673, + "learning_rate": 1.3262551789576666e-05, + "loss": 0.494, + "step": 249340 + }, + { + "epoch": 2.2043352958857123, + "grad_norm": 4.00406551361084, + "learning_rate": 1.3261078401904798e-05, + "loss": 0.7035, + "step": 249350 + }, + { + "epoch": 2.2044236991460244, + "grad_norm": 8.290523529052734, + "learning_rate": 1.3259605014232926e-05, + "loss": 0.6158, + "step": 249360 + }, + { + "epoch": 2.2045121024063365, + "grad_norm": 1.4723474979400635, + "learning_rate": 1.3258131626561055e-05, + "loss": 0.5062, + "step": 249370 + }, + { + "epoch": 2.204600505666649, + "grad_norm": 3.1293463706970215, + "learning_rate": 1.3256658238889183e-05, + "loss": 0.6395, + "step": 249380 + }, + { + "epoch": 2.2046889089269612, + "grad_norm": 1.1376453638076782, + "learning_rate": 1.3255184851217315e-05, + "loss": 0.5612, + "step": 249390 + }, + { + "epoch": 2.2047773121872734, + "grad_norm": 3.9921152591705322, + "learning_rate": 1.3253711463545443e-05, + "loss": 0.4707, + "step": 249400 + }, + { + "epoch": 2.2048657154475855, + "grad_norm": 1.0629435777664185, + "learning_rate": 1.3252238075873571e-05, + "loss": 0.5229, + "step": 249410 + }, + { + "epoch": 2.204954118707898, + "grad_norm": 1.6712992191314697, + "learning_rate": 1.3250764688201703e-05, + "loss": 0.4403, + "step": 249420 + }, + { + "epoch": 2.20504252196821, + "grad_norm": 5.825390815734863, + "learning_rate": 1.3249291300529832e-05, + "loss": 0.5589, + "step": 249430 + }, + { + "epoch": 2.2051309252285223, + "grad_norm": 1.9167630672454834, + "learning_rate": 1.324781791285796e-05, + "loss": 0.6318, + "step": 249440 + }, + { + "epoch": 2.205219328488835, + "grad_norm": 2.319549560546875, + "learning_rate": 1.3246344525186088e-05, + "loss": 0.4354, + "step": 249450 + }, + { + "epoch": 2.205307731749147, + "grad_norm": 2.477796792984009, + "learning_rate": 1.324487113751422e-05, + "loss": 0.5805, + "step": 249460 + }, + { + "epoch": 2.205396135009459, + "grad_norm": 14.294662475585938, + "learning_rate": 1.3243397749842348e-05, + "loss": 0.5212, + "step": 249470 + }, + { + "epoch": 2.205484538269771, + "grad_norm": 1.6851885318756104, + "learning_rate": 1.3241924362170477e-05, + "loss": 0.4292, + "step": 249480 + }, + { + "epoch": 2.2055729415300838, + "grad_norm": 5.46026086807251, + "learning_rate": 1.3240450974498605e-05, + "loss": 0.5939, + "step": 249490 + }, + { + "epoch": 2.205661344790396, + "grad_norm": 3.6346561908721924, + "learning_rate": 1.3238977586826737e-05, + "loss": 0.4709, + "step": 249500 + }, + { + "epoch": 2.205749748050708, + "grad_norm": 20.783916473388672, + "learning_rate": 1.3237504199154865e-05, + "loss": 0.7271, + "step": 249510 + }, + { + "epoch": 2.2058381513110206, + "grad_norm": 4.792198657989502, + "learning_rate": 1.3236030811482993e-05, + "loss": 0.5136, + "step": 249520 + }, + { + "epoch": 2.2059265545713327, + "grad_norm": 2.3931844234466553, + "learning_rate": 1.3234557423811125e-05, + "loss": 0.5434, + "step": 249530 + }, + { + "epoch": 2.206014957831645, + "grad_norm": 1.5297844409942627, + "learning_rate": 1.3233084036139254e-05, + "loss": 0.5472, + "step": 249540 + }, + { + "epoch": 2.206103361091957, + "grad_norm": 3.235701322555542, + "learning_rate": 1.3231610648467382e-05, + "loss": 0.5051, + "step": 249550 + }, + { + "epoch": 2.2061917643522695, + "grad_norm": 3.0994412899017334, + "learning_rate": 1.323013726079551e-05, + "loss": 0.6526, + "step": 249560 + }, + { + "epoch": 2.2062801676125816, + "grad_norm": 10.671908378601074, + "learning_rate": 1.3228663873123642e-05, + "loss": 0.4997, + "step": 249570 + }, + { + "epoch": 2.2063685708728937, + "grad_norm": 2.1372947692871094, + "learning_rate": 1.322719048545177e-05, + "loss": 0.5286, + "step": 249580 + }, + { + "epoch": 2.206456974133206, + "grad_norm": 5.531351566314697, + "learning_rate": 1.3225717097779899e-05, + "loss": 0.6492, + "step": 249590 + }, + { + "epoch": 2.2065453773935184, + "grad_norm": 5.7973480224609375, + "learning_rate": 1.322424371010803e-05, + "loss": 0.5715, + "step": 249600 + }, + { + "epoch": 2.2066337806538305, + "grad_norm": 11.998841285705566, + "learning_rate": 1.3222770322436159e-05, + "loss": 0.5825, + "step": 249610 + }, + { + "epoch": 2.2067221839141427, + "grad_norm": 18.290002822875977, + "learning_rate": 1.3221296934764287e-05, + "loss": 0.6014, + "step": 249620 + }, + { + "epoch": 2.2068105871744548, + "grad_norm": 8.915258407592773, + "learning_rate": 1.3219823547092417e-05, + "loss": 0.492, + "step": 249630 + }, + { + "epoch": 2.2068989904347673, + "grad_norm": 16.542482376098633, + "learning_rate": 1.3218350159420547e-05, + "loss": 0.557, + "step": 249640 + }, + { + "epoch": 2.2069873936950795, + "grad_norm": 1.616204857826233, + "learning_rate": 1.3216876771748676e-05, + "loss": 0.5499, + "step": 249650 + }, + { + "epoch": 2.2070757969553916, + "grad_norm": 12.764683723449707, + "learning_rate": 1.3215403384076806e-05, + "loss": 0.5415, + "step": 249660 + }, + { + "epoch": 2.207164200215704, + "grad_norm": 17.56515121459961, + "learning_rate": 1.3213929996404934e-05, + "loss": 0.5744, + "step": 249670 + }, + { + "epoch": 2.2072526034760163, + "grad_norm": 2.567283868789673, + "learning_rate": 1.3212456608733064e-05, + "loss": 0.5331, + "step": 249680 + }, + { + "epoch": 2.2073410067363284, + "grad_norm": 2.7432751655578613, + "learning_rate": 1.3210983221061194e-05, + "loss": 0.5019, + "step": 249690 + }, + { + "epoch": 2.2074294099966405, + "grad_norm": 4.4268574714660645, + "learning_rate": 1.3209509833389322e-05, + "loss": 0.4422, + "step": 249700 + }, + { + "epoch": 2.207517813256953, + "grad_norm": 1.8631535768508911, + "learning_rate": 1.3208036445717453e-05, + "loss": 0.5278, + "step": 249710 + }, + { + "epoch": 2.207606216517265, + "grad_norm": 10.096842765808105, + "learning_rate": 1.3206563058045583e-05, + "loss": 0.5649, + "step": 249720 + }, + { + "epoch": 2.2076946197775773, + "grad_norm": 1.3873947858810425, + "learning_rate": 1.3205089670373711e-05, + "loss": 0.5165, + "step": 249730 + }, + { + "epoch": 2.20778302303789, + "grad_norm": 1.835329294204712, + "learning_rate": 1.320361628270184e-05, + "loss": 0.6649, + "step": 249740 + }, + { + "epoch": 2.207871426298202, + "grad_norm": 3.0300354957580566, + "learning_rate": 1.3202142895029971e-05, + "loss": 0.6477, + "step": 249750 + }, + { + "epoch": 2.207959829558514, + "grad_norm": 7.400020122528076, + "learning_rate": 1.32006695073581e-05, + "loss": 0.5611, + "step": 249760 + }, + { + "epoch": 2.2080482328188262, + "grad_norm": 3.683628559112549, + "learning_rate": 1.3199196119686228e-05, + "loss": 0.613, + "step": 249770 + }, + { + "epoch": 2.208136636079139, + "grad_norm": 2.0177690982818604, + "learning_rate": 1.3197722732014356e-05, + "loss": 0.5586, + "step": 249780 + }, + { + "epoch": 2.208225039339451, + "grad_norm": 3.679809808731079, + "learning_rate": 1.3196249344342488e-05, + "loss": 0.5817, + "step": 249790 + }, + { + "epoch": 2.208313442599763, + "grad_norm": 2.6718428134918213, + "learning_rate": 1.3194775956670616e-05, + "loss": 0.6356, + "step": 249800 + }, + { + "epoch": 2.208401845860075, + "grad_norm": 2.1872098445892334, + "learning_rate": 1.3193302568998745e-05, + "loss": 0.5112, + "step": 249810 + }, + { + "epoch": 2.2084902491203877, + "grad_norm": 2.9174561500549316, + "learning_rate": 1.3191829181326876e-05, + "loss": 0.5967, + "step": 249820 + }, + { + "epoch": 2.2085786523807, + "grad_norm": 0.9797311425209045, + "learning_rate": 1.3190355793655005e-05, + "loss": 0.485, + "step": 249830 + }, + { + "epoch": 2.208667055641012, + "grad_norm": 1.0028798580169678, + "learning_rate": 1.3188882405983133e-05, + "loss": 0.6527, + "step": 249840 + }, + { + "epoch": 2.208755458901324, + "grad_norm": 5.774487495422363, + "learning_rate": 1.3187409018311261e-05, + "loss": 0.5032, + "step": 249850 + }, + { + "epoch": 2.2088438621616366, + "grad_norm": 0.944429337978363, + "learning_rate": 1.3185935630639393e-05, + "loss": 0.3892, + "step": 249860 + }, + { + "epoch": 2.2089322654219488, + "grad_norm": 3.6548397541046143, + "learning_rate": 1.3184462242967521e-05, + "loss": 0.5396, + "step": 249870 + }, + { + "epoch": 2.209020668682261, + "grad_norm": 1.8926538228988647, + "learning_rate": 1.318298885529565e-05, + "loss": 0.506, + "step": 249880 + }, + { + "epoch": 2.2091090719425734, + "grad_norm": 1.3070541620254517, + "learning_rate": 1.3181515467623782e-05, + "loss": 0.5989, + "step": 249890 + }, + { + "epoch": 2.2091974752028856, + "grad_norm": 1.947528600692749, + "learning_rate": 1.318004207995191e-05, + "loss": 0.4, + "step": 249900 + }, + { + "epoch": 2.2092858784631977, + "grad_norm": 2.311702251434326, + "learning_rate": 1.3178568692280038e-05, + "loss": 0.4817, + "step": 249910 + }, + { + "epoch": 2.20937428172351, + "grad_norm": 3.1572346687316895, + "learning_rate": 1.3177095304608167e-05, + "loss": 0.422, + "step": 249920 + }, + { + "epoch": 2.2094626849838224, + "grad_norm": 1.8141001462936401, + "learning_rate": 1.3175621916936298e-05, + "loss": 0.4922, + "step": 249930 + }, + { + "epoch": 2.2095510882441345, + "grad_norm": 2.8884246349334717, + "learning_rate": 1.3174148529264427e-05, + "loss": 0.483, + "step": 249940 + }, + { + "epoch": 2.2096394915044466, + "grad_norm": 28.952098846435547, + "learning_rate": 1.3172675141592555e-05, + "loss": 0.5192, + "step": 249950 + }, + { + "epoch": 2.2097278947647587, + "grad_norm": 5.061415195465088, + "learning_rate": 1.3171201753920683e-05, + "loss": 0.5539, + "step": 249960 + }, + { + "epoch": 2.2098162980250713, + "grad_norm": 7.288519382476807, + "learning_rate": 1.3169728366248815e-05, + "loss": 0.4502, + "step": 249970 + }, + { + "epoch": 2.2099047012853834, + "grad_norm": 3.347747802734375, + "learning_rate": 1.3168254978576944e-05, + "loss": 0.4968, + "step": 249980 + }, + { + "epoch": 2.2099931045456955, + "grad_norm": 1.8496769666671753, + "learning_rate": 1.3166781590905072e-05, + "loss": 0.4053, + "step": 249990 + }, + { + "epoch": 2.2100815078060077, + "grad_norm": 5.128690242767334, + "learning_rate": 1.3165308203233204e-05, + "loss": 0.4705, + "step": 250000 + }, + { + "epoch": 2.21016991106632, + "grad_norm": 15.506339073181152, + "learning_rate": 1.3163834815561332e-05, + "loss": 0.5704, + "step": 250010 + }, + { + "epoch": 2.2102583143266323, + "grad_norm": 2.200875759124756, + "learning_rate": 1.316236142788946e-05, + "loss": 0.503, + "step": 250020 + }, + { + "epoch": 2.2103467175869445, + "grad_norm": 20.887386322021484, + "learning_rate": 1.3160888040217589e-05, + "loss": 0.637, + "step": 250030 + }, + { + "epoch": 2.210435120847257, + "grad_norm": 2.7311174869537354, + "learning_rate": 1.315941465254572e-05, + "loss": 0.7389, + "step": 250040 + }, + { + "epoch": 2.210523524107569, + "grad_norm": 15.473363876342773, + "learning_rate": 1.3157941264873849e-05, + "loss": 0.6386, + "step": 250050 + }, + { + "epoch": 2.2106119273678813, + "grad_norm": 20.06948471069336, + "learning_rate": 1.3156467877201977e-05, + "loss": 0.5676, + "step": 250060 + }, + { + "epoch": 2.2107003306281934, + "grad_norm": 3.180919885635376, + "learning_rate": 1.3154994489530109e-05, + "loss": 0.6262, + "step": 250070 + }, + { + "epoch": 2.210788733888506, + "grad_norm": 17.40102195739746, + "learning_rate": 1.3153521101858237e-05, + "loss": 0.5395, + "step": 250080 + }, + { + "epoch": 2.210877137148818, + "grad_norm": 5.067879676818848, + "learning_rate": 1.3152047714186366e-05, + "loss": 0.5308, + "step": 250090 + }, + { + "epoch": 2.21096554040913, + "grad_norm": 2.846417188644409, + "learning_rate": 1.3150574326514496e-05, + "loss": 0.5168, + "step": 250100 + }, + { + "epoch": 2.2110539436694427, + "grad_norm": 3.202138662338257, + "learning_rate": 1.3149100938842626e-05, + "loss": 0.6521, + "step": 250110 + }, + { + "epoch": 2.211142346929755, + "grad_norm": 4.375974178314209, + "learning_rate": 1.3147627551170754e-05, + "loss": 0.5925, + "step": 250120 + }, + { + "epoch": 2.211230750190067, + "grad_norm": 4.778468608856201, + "learning_rate": 1.3146154163498884e-05, + "loss": 0.5562, + "step": 250130 + }, + { + "epoch": 2.211319153450379, + "grad_norm": 1.2279138565063477, + "learning_rate": 1.3144680775827012e-05, + "loss": 0.4069, + "step": 250140 + }, + { + "epoch": 2.2114075567106917, + "grad_norm": 2.6827704906463623, + "learning_rate": 1.3143207388155142e-05, + "loss": 0.524, + "step": 250150 + }, + { + "epoch": 2.211495959971004, + "grad_norm": 1.9626140594482422, + "learning_rate": 1.3141734000483273e-05, + "loss": 0.5278, + "step": 250160 + }, + { + "epoch": 2.211584363231316, + "grad_norm": 1.427968978881836, + "learning_rate": 1.3140260612811401e-05, + "loss": 0.4319, + "step": 250170 + }, + { + "epoch": 2.211672766491628, + "grad_norm": 2.100080966949463, + "learning_rate": 1.3138787225139531e-05, + "loss": 0.6057, + "step": 250180 + }, + { + "epoch": 2.2117611697519406, + "grad_norm": 1.9762159585952759, + "learning_rate": 1.3137313837467661e-05, + "loss": 0.6444, + "step": 250190 + }, + { + "epoch": 2.2118495730122527, + "grad_norm": 1.9531619548797607, + "learning_rate": 1.313584044979579e-05, + "loss": 0.6142, + "step": 250200 + }, + { + "epoch": 2.211937976272565, + "grad_norm": 2.0177347660064697, + "learning_rate": 1.3134367062123918e-05, + "loss": 0.4971, + "step": 250210 + }, + { + "epoch": 2.212026379532877, + "grad_norm": 2.060743808746338, + "learning_rate": 1.313289367445205e-05, + "loss": 0.573, + "step": 250220 + }, + { + "epoch": 2.2121147827931895, + "grad_norm": 13.040575981140137, + "learning_rate": 1.3131420286780178e-05, + "loss": 0.4113, + "step": 250230 + }, + { + "epoch": 2.2122031860535016, + "grad_norm": 2.7994396686553955, + "learning_rate": 1.3129946899108306e-05, + "loss": 0.6191, + "step": 250240 + }, + { + "epoch": 2.2122915893138138, + "grad_norm": 13.928500175476074, + "learning_rate": 1.3128473511436438e-05, + "loss": 0.5997, + "step": 250250 + }, + { + "epoch": 2.2123799925741263, + "grad_norm": 2.0316214561462402, + "learning_rate": 1.3127000123764566e-05, + "loss": 0.5178, + "step": 250260 + }, + { + "epoch": 2.2124683958344384, + "grad_norm": 2.9346396923065186, + "learning_rate": 1.3125526736092695e-05, + "loss": 0.5862, + "step": 250270 + }, + { + "epoch": 2.2125567990947506, + "grad_norm": 4.441427230834961, + "learning_rate": 1.3124053348420823e-05, + "loss": 0.5446, + "step": 250280 + }, + { + "epoch": 2.2126452023550627, + "grad_norm": 5.7374444007873535, + "learning_rate": 1.3122579960748955e-05, + "loss": 0.5231, + "step": 250290 + }, + { + "epoch": 2.2127336056153752, + "grad_norm": 6.379061698913574, + "learning_rate": 1.3121106573077083e-05, + "loss": 0.4691, + "step": 250300 + }, + { + "epoch": 2.2128220088756874, + "grad_norm": 3.822597026824951, + "learning_rate": 1.3119633185405211e-05, + "loss": 0.7504, + "step": 250310 + }, + { + "epoch": 2.2129104121359995, + "grad_norm": 3.1277577877044678, + "learning_rate": 1.311815979773334e-05, + "loss": 0.5661, + "step": 250320 + }, + { + "epoch": 2.212998815396312, + "grad_norm": 15.616717338562012, + "learning_rate": 1.3116686410061471e-05, + "loss": 0.6885, + "step": 250330 + }, + { + "epoch": 2.213087218656624, + "grad_norm": 9.788675308227539, + "learning_rate": 1.31152130223896e-05, + "loss": 0.4751, + "step": 250340 + }, + { + "epoch": 2.2131756219169363, + "grad_norm": 12.667468070983887, + "learning_rate": 1.3113739634717728e-05, + "loss": 0.4783, + "step": 250350 + }, + { + "epoch": 2.2132640251772484, + "grad_norm": 3.4218883514404297, + "learning_rate": 1.311226624704586e-05, + "loss": 0.5724, + "step": 250360 + }, + { + "epoch": 2.213352428437561, + "grad_norm": 3.4602630138397217, + "learning_rate": 1.3110792859373988e-05, + "loss": 0.5479, + "step": 250370 + }, + { + "epoch": 2.213440831697873, + "grad_norm": 1.3644003868103027, + "learning_rate": 1.3109319471702117e-05, + "loss": 0.4288, + "step": 250380 + }, + { + "epoch": 2.213529234958185, + "grad_norm": 11.019760131835938, + "learning_rate": 1.3107846084030245e-05, + "loss": 0.4437, + "step": 250390 + }, + { + "epoch": 2.2136176382184973, + "grad_norm": 4.914597034454346, + "learning_rate": 1.3106372696358377e-05, + "loss": 0.5639, + "step": 250400 + }, + { + "epoch": 2.21370604147881, + "grad_norm": 18.239688873291016, + "learning_rate": 1.3104899308686505e-05, + "loss": 0.5483, + "step": 250410 + }, + { + "epoch": 2.213794444739122, + "grad_norm": 5.6542253494262695, + "learning_rate": 1.3103425921014633e-05, + "loss": 0.5435, + "step": 250420 + }, + { + "epoch": 2.213882847999434, + "grad_norm": 1.4107170104980469, + "learning_rate": 1.3101952533342762e-05, + "loss": 0.4337, + "step": 250430 + }, + { + "epoch": 2.2139712512597463, + "grad_norm": 5.753830432891846, + "learning_rate": 1.3100479145670894e-05, + "loss": 0.5942, + "step": 250440 + }, + { + "epoch": 2.214059654520059, + "grad_norm": 2.7613542079925537, + "learning_rate": 1.3099005757999022e-05, + "loss": 0.5568, + "step": 250450 + }, + { + "epoch": 2.214148057780371, + "grad_norm": 6.365067005157471, + "learning_rate": 1.309753237032715e-05, + "loss": 0.4732, + "step": 250460 + }, + { + "epoch": 2.214236461040683, + "grad_norm": 9.881786346435547, + "learning_rate": 1.3096058982655282e-05, + "loss": 0.73, + "step": 250470 + }, + { + "epoch": 2.2143248643009956, + "grad_norm": 1.865573525428772, + "learning_rate": 1.309458559498341e-05, + "loss": 0.5379, + "step": 250480 + }, + { + "epoch": 2.2144132675613077, + "grad_norm": 5.85319709777832, + "learning_rate": 1.3093112207311539e-05, + "loss": 0.5496, + "step": 250490 + }, + { + "epoch": 2.21450167082162, + "grad_norm": 4.4764204025268555, + "learning_rate": 1.3091638819639667e-05, + "loss": 0.5492, + "step": 250500 + }, + { + "epoch": 2.214590074081932, + "grad_norm": 1.9360861778259277, + "learning_rate": 1.3090165431967799e-05, + "loss": 0.4469, + "step": 250510 + }, + { + "epoch": 2.2146784773422445, + "grad_norm": 2.052361249923706, + "learning_rate": 1.3088692044295927e-05, + "loss": 0.4997, + "step": 250520 + }, + { + "epoch": 2.2147668806025567, + "grad_norm": 3.2892239093780518, + "learning_rate": 1.3087218656624055e-05, + "loss": 0.5166, + "step": 250530 + }, + { + "epoch": 2.214855283862869, + "grad_norm": 4.837767601013184, + "learning_rate": 1.3085745268952187e-05, + "loss": 0.5299, + "step": 250540 + }, + { + "epoch": 2.214943687123181, + "grad_norm": 2.3411800861358643, + "learning_rate": 1.3084271881280316e-05, + "loss": 0.4841, + "step": 250550 + }, + { + "epoch": 2.2150320903834935, + "grad_norm": 10.265049934387207, + "learning_rate": 1.3082798493608444e-05, + "loss": 0.3998, + "step": 250560 + }, + { + "epoch": 2.2151204936438056, + "grad_norm": 1.4159842729568481, + "learning_rate": 1.3081325105936574e-05, + "loss": 0.4791, + "step": 250570 + }, + { + "epoch": 2.2152088969041177, + "grad_norm": 1.1874366998672485, + "learning_rate": 1.3079851718264704e-05, + "loss": 0.513, + "step": 250580 + }, + { + "epoch": 2.2152973001644303, + "grad_norm": 1.23835289478302, + "learning_rate": 1.3078378330592832e-05, + "loss": 0.5922, + "step": 250590 + }, + { + "epoch": 2.2153857034247424, + "grad_norm": 2.998436450958252, + "learning_rate": 1.3076904942920962e-05, + "loss": 0.3826, + "step": 250600 + }, + { + "epoch": 2.2154741066850545, + "grad_norm": 1.2800798416137695, + "learning_rate": 1.307543155524909e-05, + "loss": 0.5311, + "step": 250610 + }, + { + "epoch": 2.2155625099453666, + "grad_norm": 2.0059814453125, + "learning_rate": 1.307395816757722e-05, + "loss": 0.5175, + "step": 250620 + }, + { + "epoch": 2.215650913205679, + "grad_norm": 2.7817113399505615, + "learning_rate": 1.3072484779905351e-05, + "loss": 0.4945, + "step": 250630 + }, + { + "epoch": 2.2157393164659913, + "grad_norm": 6.259496212005615, + "learning_rate": 1.307101139223348e-05, + "loss": 0.5047, + "step": 250640 + }, + { + "epoch": 2.2158277197263034, + "grad_norm": 3.6685280799865723, + "learning_rate": 1.306953800456161e-05, + "loss": 0.4349, + "step": 250650 + }, + { + "epoch": 2.2159161229866156, + "grad_norm": 6.061193466186523, + "learning_rate": 1.306806461688974e-05, + "loss": 0.6341, + "step": 250660 + }, + { + "epoch": 2.216004526246928, + "grad_norm": 2.4200689792633057, + "learning_rate": 1.3066591229217868e-05, + "loss": 0.4906, + "step": 250670 + }, + { + "epoch": 2.2160929295072402, + "grad_norm": 0.8486049771308899, + "learning_rate": 1.3065117841545996e-05, + "loss": 0.4882, + "step": 250680 + }, + { + "epoch": 2.2161813327675524, + "grad_norm": 1.0026763677597046, + "learning_rate": 1.3063644453874128e-05, + "loss": 0.4565, + "step": 250690 + }, + { + "epoch": 2.216269736027865, + "grad_norm": 1.6266553401947021, + "learning_rate": 1.3062171066202256e-05, + "loss": 0.5308, + "step": 250700 + }, + { + "epoch": 2.216358139288177, + "grad_norm": 5.071847438812256, + "learning_rate": 1.3060697678530384e-05, + "loss": 0.5433, + "step": 250710 + }, + { + "epoch": 2.216446542548489, + "grad_norm": 5.902851581573486, + "learning_rate": 1.3059224290858516e-05, + "loss": 0.5199, + "step": 250720 + }, + { + "epoch": 2.2165349458088013, + "grad_norm": 20.987396240234375, + "learning_rate": 1.3057750903186645e-05, + "loss": 0.498, + "step": 250730 + }, + { + "epoch": 2.216623349069114, + "grad_norm": 2.8624637126922607, + "learning_rate": 1.3056277515514773e-05, + "loss": 0.6672, + "step": 250740 + }, + { + "epoch": 2.216711752329426, + "grad_norm": 3.5764219760894775, + "learning_rate": 1.3054804127842901e-05, + "loss": 0.6343, + "step": 250750 + }, + { + "epoch": 2.216800155589738, + "grad_norm": 9.28814697265625, + "learning_rate": 1.3053330740171033e-05, + "loss": 0.6247, + "step": 250760 + }, + { + "epoch": 2.21688855885005, + "grad_norm": 4.469008922576904, + "learning_rate": 1.3051857352499161e-05, + "loss": 0.5117, + "step": 250770 + }, + { + "epoch": 2.2169769621103628, + "grad_norm": 6.303928852081299, + "learning_rate": 1.305038396482729e-05, + "loss": 0.4678, + "step": 250780 + }, + { + "epoch": 2.217065365370675, + "grad_norm": 9.007296562194824, + "learning_rate": 1.3048910577155418e-05, + "loss": 0.4676, + "step": 250790 + }, + { + "epoch": 2.217153768630987, + "grad_norm": 1.3194639682769775, + "learning_rate": 1.304743718948355e-05, + "loss": 0.5065, + "step": 250800 + }, + { + "epoch": 2.217242171891299, + "grad_norm": 1.3456107378005981, + "learning_rate": 1.3045963801811678e-05, + "loss": 0.5362, + "step": 250810 + }, + { + "epoch": 2.2173305751516117, + "grad_norm": 4.393184185028076, + "learning_rate": 1.3044490414139807e-05, + "loss": 0.5786, + "step": 250820 + }, + { + "epoch": 2.217418978411924, + "grad_norm": 10.508270263671875, + "learning_rate": 1.3043017026467938e-05, + "loss": 0.6076, + "step": 250830 + }, + { + "epoch": 2.217507381672236, + "grad_norm": 4.869872093200684, + "learning_rate": 1.3041543638796067e-05, + "loss": 0.5668, + "step": 250840 + }, + { + "epoch": 2.2175957849325485, + "grad_norm": 3.502800464630127, + "learning_rate": 1.3040070251124195e-05, + "loss": 0.5215, + "step": 250850 + }, + { + "epoch": 2.2176841881928606, + "grad_norm": 1.9118003845214844, + "learning_rate": 1.3038596863452323e-05, + "loss": 0.4904, + "step": 250860 + }, + { + "epoch": 2.2177725914531727, + "grad_norm": 4.929308891296387, + "learning_rate": 1.3037123475780455e-05, + "loss": 0.578, + "step": 250870 + }, + { + "epoch": 2.217860994713485, + "grad_norm": 14.072076797485352, + "learning_rate": 1.3035650088108583e-05, + "loss": 0.5918, + "step": 250880 + }, + { + "epoch": 2.2179493979737974, + "grad_norm": 3.3183164596557617, + "learning_rate": 1.3034176700436712e-05, + "loss": 0.6682, + "step": 250890 + }, + { + "epoch": 2.2180378012341095, + "grad_norm": 1.2496317625045776, + "learning_rate": 1.303270331276484e-05, + "loss": 0.5412, + "step": 250900 + }, + { + "epoch": 2.2181262044944217, + "grad_norm": 2.265411138534546, + "learning_rate": 1.3031229925092972e-05, + "loss": 0.4503, + "step": 250910 + }, + { + "epoch": 2.2182146077547342, + "grad_norm": 4.204479694366455, + "learning_rate": 1.30297565374211e-05, + "loss": 0.5027, + "step": 250920 + }, + { + "epoch": 2.2183030110150463, + "grad_norm": 1.2375704050064087, + "learning_rate": 1.3028283149749229e-05, + "loss": 0.4573, + "step": 250930 + }, + { + "epoch": 2.2183914142753585, + "grad_norm": 1.9128835201263428, + "learning_rate": 1.302680976207736e-05, + "loss": 0.4071, + "step": 250940 + }, + { + "epoch": 2.2184798175356706, + "grad_norm": 1.0355994701385498, + "learning_rate": 1.3025336374405489e-05, + "loss": 0.5175, + "step": 250950 + }, + { + "epoch": 2.218568220795983, + "grad_norm": 5.9021077156066895, + "learning_rate": 1.3023862986733617e-05, + "loss": 0.5306, + "step": 250960 + }, + { + "epoch": 2.2186566240562953, + "grad_norm": 1.997849702835083, + "learning_rate": 1.3022389599061745e-05, + "loss": 0.5514, + "step": 250970 + }, + { + "epoch": 2.2187450273166074, + "grad_norm": 4.175297260284424, + "learning_rate": 1.3020916211389877e-05, + "loss": 0.639, + "step": 250980 + }, + { + "epoch": 2.2188334305769195, + "grad_norm": 1.1240904331207275, + "learning_rate": 1.3019442823718006e-05, + "loss": 0.5432, + "step": 250990 + }, + { + "epoch": 2.218921833837232, + "grad_norm": 1.2035881280899048, + "learning_rate": 1.3017969436046134e-05, + "loss": 0.451, + "step": 251000 + }, + { + "epoch": 2.219010237097544, + "grad_norm": 4.94524621963501, + "learning_rate": 1.3016496048374266e-05, + "loss": 0.505, + "step": 251010 + }, + { + "epoch": 2.2190986403578563, + "grad_norm": 2.4814610481262207, + "learning_rate": 1.3015022660702394e-05, + "loss": 0.506, + "step": 251020 + }, + { + "epoch": 2.2191870436181684, + "grad_norm": 1.489661455154419, + "learning_rate": 1.3013549273030522e-05, + "loss": 0.5252, + "step": 251030 + }, + { + "epoch": 2.219275446878481, + "grad_norm": 0.9940539598464966, + "learning_rate": 1.3012075885358652e-05, + "loss": 0.5745, + "step": 251040 + }, + { + "epoch": 2.219363850138793, + "grad_norm": 5.394950866699219, + "learning_rate": 1.3010602497686782e-05, + "loss": 0.5623, + "step": 251050 + }, + { + "epoch": 2.2194522533991052, + "grad_norm": 4.761203765869141, + "learning_rate": 1.300912911001491e-05, + "loss": 0.5065, + "step": 251060 + }, + { + "epoch": 2.219540656659418, + "grad_norm": 3.1393861770629883, + "learning_rate": 1.300765572234304e-05, + "loss": 0.4636, + "step": 251070 + }, + { + "epoch": 2.21962905991973, + "grad_norm": 3.055935859680176, + "learning_rate": 1.300618233467117e-05, + "loss": 0.6159, + "step": 251080 + }, + { + "epoch": 2.219717463180042, + "grad_norm": 6.493808746337891, + "learning_rate": 1.30047089469993e-05, + "loss": 0.4645, + "step": 251090 + }, + { + "epoch": 2.219805866440354, + "grad_norm": 1.7242510318756104, + "learning_rate": 1.300323555932743e-05, + "loss": 0.5832, + "step": 251100 + }, + { + "epoch": 2.2198942697006667, + "grad_norm": 7.8766398429870605, + "learning_rate": 1.3001762171655558e-05, + "loss": 0.3661, + "step": 251110 + }, + { + "epoch": 2.219982672960979, + "grad_norm": 2.2419965267181396, + "learning_rate": 1.3000288783983688e-05, + "loss": 0.5548, + "step": 251120 + }, + { + "epoch": 2.220071076221291, + "grad_norm": 2.390048027038574, + "learning_rate": 1.2998815396311818e-05, + "loss": 0.5293, + "step": 251130 + }, + { + "epoch": 2.220159479481603, + "grad_norm": 3.228367567062378, + "learning_rate": 1.2997342008639946e-05, + "loss": 0.5785, + "step": 251140 + }, + { + "epoch": 2.2202478827419156, + "grad_norm": 1.8755384683609009, + "learning_rate": 1.2995868620968074e-05, + "loss": 0.4946, + "step": 251150 + }, + { + "epoch": 2.2203362860022278, + "grad_norm": 3.8253231048583984, + "learning_rate": 1.2994395233296206e-05, + "loss": 0.5815, + "step": 251160 + }, + { + "epoch": 2.22042468926254, + "grad_norm": 2.3487422466278076, + "learning_rate": 1.2992921845624335e-05, + "loss": 0.5235, + "step": 251170 + }, + { + "epoch": 2.2205130925228524, + "grad_norm": 2.546074867248535, + "learning_rate": 1.2991448457952463e-05, + "loss": 0.6607, + "step": 251180 + }, + { + "epoch": 2.2206014957831646, + "grad_norm": 2.2555971145629883, + "learning_rate": 1.2989975070280595e-05, + "loss": 0.5858, + "step": 251190 + }, + { + "epoch": 2.2206898990434767, + "grad_norm": 1.7068660259246826, + "learning_rate": 1.2988501682608723e-05, + "loss": 0.6252, + "step": 251200 + }, + { + "epoch": 2.220778302303789, + "grad_norm": 3.384754180908203, + "learning_rate": 1.2987028294936851e-05, + "loss": 0.5243, + "step": 251210 + }, + { + "epoch": 2.2208667055641014, + "grad_norm": 1.065903663635254, + "learning_rate": 1.298555490726498e-05, + "loss": 0.5593, + "step": 251220 + }, + { + "epoch": 2.2209551088244135, + "grad_norm": 1.9604142904281616, + "learning_rate": 1.2984081519593111e-05, + "loss": 0.5052, + "step": 251230 + }, + { + "epoch": 2.2210435120847256, + "grad_norm": 4.627053260803223, + "learning_rate": 1.298260813192124e-05, + "loss": 0.4844, + "step": 251240 + }, + { + "epoch": 2.2211319153450377, + "grad_norm": 3.03295636177063, + "learning_rate": 1.2981134744249368e-05, + "loss": 0.4929, + "step": 251250 + }, + { + "epoch": 2.2212203186053503, + "grad_norm": 4.665919780731201, + "learning_rate": 1.2979661356577496e-05, + "loss": 0.5522, + "step": 251260 + }, + { + "epoch": 2.2213087218656624, + "grad_norm": 4.7059645652771, + "learning_rate": 1.2978187968905628e-05, + "loss": 0.5387, + "step": 251270 + }, + { + "epoch": 2.2213971251259745, + "grad_norm": 5.020578384399414, + "learning_rate": 1.2976714581233757e-05, + "loss": 0.5342, + "step": 251280 + }, + { + "epoch": 2.221485528386287, + "grad_norm": 1.9202998876571655, + "learning_rate": 1.2975241193561885e-05, + "loss": 0.5363, + "step": 251290 + }, + { + "epoch": 2.221573931646599, + "grad_norm": 1.8208755254745483, + "learning_rate": 1.2973767805890017e-05, + "loss": 0.6843, + "step": 251300 + }, + { + "epoch": 2.2216623349069113, + "grad_norm": 3.4782025814056396, + "learning_rate": 1.2972294418218145e-05, + "loss": 0.512, + "step": 251310 + }, + { + "epoch": 2.2217507381672235, + "grad_norm": 2.3582818508148193, + "learning_rate": 1.2970821030546273e-05, + "loss": 0.5021, + "step": 251320 + }, + { + "epoch": 2.221839141427536, + "grad_norm": 4.982050895690918, + "learning_rate": 1.2969347642874402e-05, + "loss": 0.6149, + "step": 251330 + }, + { + "epoch": 2.221927544687848, + "grad_norm": 2.42169451713562, + "learning_rate": 1.2967874255202533e-05, + "loss": 0.534, + "step": 251340 + }, + { + "epoch": 2.2220159479481603, + "grad_norm": 2.4647717475891113, + "learning_rate": 1.2966400867530662e-05, + "loss": 0.6714, + "step": 251350 + }, + { + "epoch": 2.2221043512084724, + "grad_norm": 1.5188722610473633, + "learning_rate": 1.296492747985879e-05, + "loss": 0.4405, + "step": 251360 + }, + { + "epoch": 2.222192754468785, + "grad_norm": 1.3679540157318115, + "learning_rate": 1.2963454092186922e-05, + "loss": 0.4994, + "step": 251370 + }, + { + "epoch": 2.222281157729097, + "grad_norm": 2.6775290966033936, + "learning_rate": 1.296198070451505e-05, + "loss": 0.4825, + "step": 251380 + }, + { + "epoch": 2.222369560989409, + "grad_norm": 15.519209861755371, + "learning_rate": 1.2960507316843179e-05, + "loss": 0.51, + "step": 251390 + }, + { + "epoch": 2.2224579642497213, + "grad_norm": 1.7371797561645508, + "learning_rate": 1.2959033929171307e-05, + "loss": 0.6239, + "step": 251400 + }, + { + "epoch": 2.222546367510034, + "grad_norm": 11.913886070251465, + "learning_rate": 1.2957560541499439e-05, + "loss": 0.5269, + "step": 251410 + }, + { + "epoch": 2.222634770770346, + "grad_norm": 3.497894525527954, + "learning_rate": 1.2956087153827567e-05, + "loss": 0.6057, + "step": 251420 + }, + { + "epoch": 2.222723174030658, + "grad_norm": 2.2373251914978027, + "learning_rate": 1.2954613766155695e-05, + "loss": 0.5233, + "step": 251430 + }, + { + "epoch": 2.2228115772909707, + "grad_norm": 19.650920867919922, + "learning_rate": 1.2953140378483824e-05, + "loss": 0.5841, + "step": 251440 + }, + { + "epoch": 2.222899980551283, + "grad_norm": 3.3620429039001465, + "learning_rate": 1.2951666990811956e-05, + "loss": 0.5507, + "step": 251450 + }, + { + "epoch": 2.222988383811595, + "grad_norm": 3.928896427154541, + "learning_rate": 1.2950193603140084e-05, + "loss": 0.4853, + "step": 251460 + }, + { + "epoch": 2.223076787071907, + "grad_norm": 2.944913625717163, + "learning_rate": 1.2948720215468212e-05, + "loss": 0.5125, + "step": 251470 + }, + { + "epoch": 2.2231651903322196, + "grad_norm": 3.157196521759033, + "learning_rate": 1.2947246827796344e-05, + "loss": 0.4779, + "step": 251480 + }, + { + "epoch": 2.2232535935925317, + "grad_norm": 3.581127166748047, + "learning_rate": 1.2945773440124472e-05, + "loss": 0.5178, + "step": 251490 + }, + { + "epoch": 2.223341996852844, + "grad_norm": 1.490012526512146, + "learning_rate": 1.29443000524526e-05, + "loss": 0.4586, + "step": 251500 + }, + { + "epoch": 2.2234304001131564, + "grad_norm": 19.005556106567383, + "learning_rate": 1.294282666478073e-05, + "loss": 0.5967, + "step": 251510 + }, + { + "epoch": 2.2235188033734685, + "grad_norm": 1.8652236461639404, + "learning_rate": 1.294135327710886e-05, + "loss": 0.5773, + "step": 251520 + }, + { + "epoch": 2.2236072066337806, + "grad_norm": 1.6523195505142212, + "learning_rate": 1.2939879889436989e-05, + "loss": 0.476, + "step": 251530 + }, + { + "epoch": 2.2236956098940928, + "grad_norm": 4.979263782501221, + "learning_rate": 1.293840650176512e-05, + "loss": 0.5682, + "step": 251540 + }, + { + "epoch": 2.2237840131544053, + "grad_norm": 1.4981769323349, + "learning_rate": 1.2936933114093248e-05, + "loss": 0.5668, + "step": 251550 + }, + { + "epoch": 2.2238724164147174, + "grad_norm": 4.9290385246276855, + "learning_rate": 1.2935459726421378e-05, + "loss": 0.6552, + "step": 251560 + }, + { + "epoch": 2.2239608196750296, + "grad_norm": 2.8024373054504395, + "learning_rate": 1.2933986338749508e-05, + "loss": 0.5074, + "step": 251570 + }, + { + "epoch": 2.2240492229353417, + "grad_norm": 3.6692256927490234, + "learning_rate": 1.2932512951077636e-05, + "loss": 0.5373, + "step": 251580 + }, + { + "epoch": 2.2241376261956542, + "grad_norm": 3.208085536956787, + "learning_rate": 1.2931039563405766e-05, + "loss": 0.4299, + "step": 251590 + }, + { + "epoch": 2.2242260294559664, + "grad_norm": 1.979636549949646, + "learning_rate": 1.2929566175733896e-05, + "loss": 0.5451, + "step": 251600 + }, + { + "epoch": 2.2243144327162785, + "grad_norm": 2.6537797451019287, + "learning_rate": 1.2928092788062024e-05, + "loss": 0.497, + "step": 251610 + }, + { + "epoch": 2.2244028359765906, + "grad_norm": 1.0650426149368286, + "learning_rate": 1.2926619400390153e-05, + "loss": 0.3901, + "step": 251620 + }, + { + "epoch": 2.224491239236903, + "grad_norm": 8.66428279876709, + "learning_rate": 1.2925146012718285e-05, + "loss": 0.4437, + "step": 251630 + }, + { + "epoch": 2.2245796424972153, + "grad_norm": 3.461270570755005, + "learning_rate": 1.2923672625046413e-05, + "loss": 0.5135, + "step": 251640 + }, + { + "epoch": 2.2246680457575274, + "grad_norm": 0.9741714000701904, + "learning_rate": 1.2922199237374541e-05, + "loss": 0.4176, + "step": 251650 + }, + { + "epoch": 2.22475644901784, + "grad_norm": 1.9178307056427002, + "learning_rate": 1.2920725849702673e-05, + "loss": 0.623, + "step": 251660 + }, + { + "epoch": 2.224844852278152, + "grad_norm": 3.3437185287475586, + "learning_rate": 1.2919252462030801e-05, + "loss": 0.5815, + "step": 251670 + }, + { + "epoch": 2.224933255538464, + "grad_norm": 1.3660871982574463, + "learning_rate": 1.291777907435893e-05, + "loss": 0.5708, + "step": 251680 + }, + { + "epoch": 2.2250216587987763, + "grad_norm": 4.318828582763672, + "learning_rate": 1.2916305686687058e-05, + "loss": 0.6222, + "step": 251690 + }, + { + "epoch": 2.225110062059089, + "grad_norm": 16.12813949584961, + "learning_rate": 1.291483229901519e-05, + "loss": 0.4357, + "step": 251700 + }, + { + "epoch": 2.225198465319401, + "grad_norm": 3.2588956356048584, + "learning_rate": 1.2913358911343318e-05, + "loss": 0.5343, + "step": 251710 + }, + { + "epoch": 2.225286868579713, + "grad_norm": 4.00527811050415, + "learning_rate": 1.2911885523671447e-05, + "loss": 0.5278, + "step": 251720 + }, + { + "epoch": 2.2253752718400253, + "grad_norm": 14.245950698852539, + "learning_rate": 1.2910412135999575e-05, + "loss": 0.5042, + "step": 251730 + }, + { + "epoch": 2.225463675100338, + "grad_norm": 1.178683876991272, + "learning_rate": 1.2908938748327707e-05, + "loss": 0.724, + "step": 251740 + }, + { + "epoch": 2.22555207836065, + "grad_norm": 2.9642488956451416, + "learning_rate": 1.2907465360655835e-05, + "loss": 0.4014, + "step": 251750 + }, + { + "epoch": 2.225640481620962, + "grad_norm": 1.0188406705856323, + "learning_rate": 1.2905991972983963e-05, + "loss": 0.5191, + "step": 251760 + }, + { + "epoch": 2.2257288848812746, + "grad_norm": 1.8935588598251343, + "learning_rate": 1.2904518585312095e-05, + "loss": 0.5046, + "step": 251770 + }, + { + "epoch": 2.2258172881415867, + "grad_norm": 3.6448495388031006, + "learning_rate": 1.2903045197640223e-05, + "loss": 0.6374, + "step": 251780 + }, + { + "epoch": 2.225905691401899, + "grad_norm": 2.740370750427246, + "learning_rate": 1.2901571809968352e-05, + "loss": 0.4176, + "step": 251790 + }, + { + "epoch": 2.225994094662211, + "grad_norm": 2.1083390712738037, + "learning_rate": 1.290009842229648e-05, + "loss": 0.7013, + "step": 251800 + }, + { + "epoch": 2.2260824979225236, + "grad_norm": 1.4592236280441284, + "learning_rate": 1.2898625034624612e-05, + "loss": 0.6769, + "step": 251810 + }, + { + "epoch": 2.2261709011828357, + "grad_norm": 4.473559856414795, + "learning_rate": 1.289715164695274e-05, + "loss": 0.5414, + "step": 251820 + }, + { + "epoch": 2.226259304443148, + "grad_norm": 4.057676315307617, + "learning_rate": 1.2895678259280869e-05, + "loss": 0.501, + "step": 251830 + }, + { + "epoch": 2.22634770770346, + "grad_norm": 8.632678031921387, + "learning_rate": 1.2894204871609e-05, + "loss": 0.5896, + "step": 251840 + }, + { + "epoch": 2.2264361109637725, + "grad_norm": 5.231939792633057, + "learning_rate": 1.2892731483937129e-05, + "loss": 0.5562, + "step": 251850 + }, + { + "epoch": 2.2265245142240846, + "grad_norm": 1.4357541799545288, + "learning_rate": 1.2891258096265257e-05, + "loss": 0.4948, + "step": 251860 + }, + { + "epoch": 2.2266129174843967, + "grad_norm": 4.080999374389648, + "learning_rate": 1.2889784708593385e-05, + "loss": 0.5871, + "step": 251870 + }, + { + "epoch": 2.2267013207447093, + "grad_norm": 4.765201568603516, + "learning_rate": 1.2888311320921517e-05, + "loss": 0.5753, + "step": 251880 + }, + { + "epoch": 2.2267897240050214, + "grad_norm": 5.448455333709717, + "learning_rate": 1.2886837933249645e-05, + "loss": 0.5956, + "step": 251890 + }, + { + "epoch": 2.2268781272653335, + "grad_norm": 1.631020426750183, + "learning_rate": 1.2885364545577774e-05, + "loss": 0.6184, + "step": 251900 + }, + { + "epoch": 2.2269665305256456, + "grad_norm": 1.9771192073822021, + "learning_rate": 1.2883891157905902e-05, + "loss": 0.5254, + "step": 251910 + }, + { + "epoch": 2.227054933785958, + "grad_norm": 2.0855929851531982, + "learning_rate": 1.2882417770234034e-05, + "loss": 0.627, + "step": 251920 + }, + { + "epoch": 2.2271433370462703, + "grad_norm": 4.3385796546936035, + "learning_rate": 1.2880944382562162e-05, + "loss": 0.534, + "step": 251930 + }, + { + "epoch": 2.2272317403065824, + "grad_norm": 2.951432704925537, + "learning_rate": 1.287947099489029e-05, + "loss": 0.6013, + "step": 251940 + }, + { + "epoch": 2.2273201435668946, + "grad_norm": 1.6343220472335815, + "learning_rate": 1.2877997607218422e-05, + "loss": 0.5583, + "step": 251950 + }, + { + "epoch": 2.227408546827207, + "grad_norm": 5.996504306793213, + "learning_rate": 1.287652421954655e-05, + "loss": 0.4504, + "step": 251960 + }, + { + "epoch": 2.2274969500875192, + "grad_norm": 2.0284783840179443, + "learning_rate": 1.2875050831874679e-05, + "loss": 0.5436, + "step": 251970 + }, + { + "epoch": 2.2275853533478314, + "grad_norm": 5.314773082733154, + "learning_rate": 1.2873577444202809e-05, + "loss": 0.5427, + "step": 251980 + }, + { + "epoch": 2.2276737566081435, + "grad_norm": 10.48105239868164, + "learning_rate": 1.287210405653094e-05, + "loss": 0.5152, + "step": 251990 + }, + { + "epoch": 2.227762159868456, + "grad_norm": 3.721132278442383, + "learning_rate": 1.2870630668859068e-05, + "loss": 0.5652, + "step": 252000 + }, + { + "epoch": 2.227850563128768, + "grad_norm": 2.040672779083252, + "learning_rate": 1.2869157281187198e-05, + "loss": 0.53, + "step": 252010 + }, + { + "epoch": 2.2279389663890803, + "grad_norm": 1.7377985715866089, + "learning_rate": 1.2867683893515326e-05, + "loss": 0.4586, + "step": 252020 + }, + { + "epoch": 2.228027369649393, + "grad_norm": 4.129240989685059, + "learning_rate": 1.2866210505843456e-05, + "loss": 0.6313, + "step": 252030 + }, + { + "epoch": 2.228115772909705, + "grad_norm": 1.191815972328186, + "learning_rate": 1.2864737118171586e-05, + "loss": 0.4692, + "step": 252040 + }, + { + "epoch": 2.228204176170017, + "grad_norm": 1.776800274848938, + "learning_rate": 1.2863263730499714e-05, + "loss": 0.5805, + "step": 252050 + }, + { + "epoch": 2.228292579430329, + "grad_norm": 7.243698596954346, + "learning_rate": 1.2861790342827844e-05, + "loss": 0.6249, + "step": 252060 + }, + { + "epoch": 2.2283809826906418, + "grad_norm": 2.1829023361206055, + "learning_rate": 1.2860316955155974e-05, + "loss": 0.4543, + "step": 252070 + }, + { + "epoch": 2.228469385950954, + "grad_norm": 6.014856815338135, + "learning_rate": 1.2858843567484103e-05, + "loss": 0.6046, + "step": 252080 + }, + { + "epoch": 2.228557789211266, + "grad_norm": 2.359017848968506, + "learning_rate": 1.2857370179812231e-05, + "loss": 0.5837, + "step": 252090 + }, + { + "epoch": 2.2286461924715786, + "grad_norm": 1.3298579454421997, + "learning_rate": 1.2855896792140363e-05, + "loss": 0.5752, + "step": 252100 + }, + { + "epoch": 2.2287345957318907, + "grad_norm": 3.300748109817505, + "learning_rate": 1.2854423404468491e-05, + "loss": 0.5002, + "step": 252110 + }, + { + "epoch": 2.228822998992203, + "grad_norm": 1.2174992561340332, + "learning_rate": 1.285295001679662e-05, + "loss": 0.4948, + "step": 252120 + }, + { + "epoch": 2.228911402252515, + "grad_norm": 4.190326690673828, + "learning_rate": 1.2851476629124751e-05, + "loss": 0.8226, + "step": 252130 + }, + { + "epoch": 2.2289998055128275, + "grad_norm": 6.610119819641113, + "learning_rate": 1.285000324145288e-05, + "loss": 0.5054, + "step": 252140 + }, + { + "epoch": 2.2290882087731396, + "grad_norm": 6.981377601623535, + "learning_rate": 1.2848529853781008e-05, + "loss": 0.5041, + "step": 252150 + }, + { + "epoch": 2.2291766120334517, + "grad_norm": 3.08777117729187, + "learning_rate": 1.2847056466109136e-05, + "loss": 0.5306, + "step": 252160 + }, + { + "epoch": 2.229265015293764, + "grad_norm": 5.857673645019531, + "learning_rate": 1.2845583078437268e-05, + "loss": 0.5125, + "step": 252170 + }, + { + "epoch": 2.2293534185540764, + "grad_norm": 5.254504203796387, + "learning_rate": 1.2844109690765397e-05, + "loss": 0.4734, + "step": 252180 + }, + { + "epoch": 2.2294418218143885, + "grad_norm": 2.037724494934082, + "learning_rate": 1.2842636303093525e-05, + "loss": 0.4883, + "step": 252190 + }, + { + "epoch": 2.2295302250747007, + "grad_norm": 3.5297746658325195, + "learning_rate": 1.2841162915421653e-05, + "loss": 0.6519, + "step": 252200 + }, + { + "epoch": 2.229618628335013, + "grad_norm": 2.6495227813720703, + "learning_rate": 1.2839689527749785e-05, + "loss": 0.5703, + "step": 252210 + }, + { + "epoch": 2.2297070315953254, + "grad_norm": 4.143001079559326, + "learning_rate": 1.2838216140077913e-05, + "loss": 0.5114, + "step": 252220 + }, + { + "epoch": 2.2297954348556375, + "grad_norm": 1.650437593460083, + "learning_rate": 1.2836742752406042e-05, + "loss": 0.5586, + "step": 252230 + }, + { + "epoch": 2.2298838381159496, + "grad_norm": 1.3252032995224, + "learning_rate": 1.2835269364734173e-05, + "loss": 0.4758, + "step": 252240 + }, + { + "epoch": 2.229972241376262, + "grad_norm": 5.453187942504883, + "learning_rate": 1.2833795977062302e-05, + "loss": 0.3802, + "step": 252250 + }, + { + "epoch": 2.2300606446365743, + "grad_norm": 2.7457239627838135, + "learning_rate": 1.283232258939043e-05, + "loss": 0.6407, + "step": 252260 + }, + { + "epoch": 2.2301490478968864, + "grad_norm": 1.229357123374939, + "learning_rate": 1.2830849201718558e-05, + "loss": 0.4973, + "step": 252270 + }, + { + "epoch": 2.2302374511571985, + "grad_norm": 3.304680585861206, + "learning_rate": 1.282937581404669e-05, + "loss": 0.5929, + "step": 252280 + }, + { + "epoch": 2.230325854417511, + "grad_norm": 10.364500999450684, + "learning_rate": 1.2827902426374819e-05, + "loss": 0.5565, + "step": 252290 + }, + { + "epoch": 2.230414257677823, + "grad_norm": 5.381203651428223, + "learning_rate": 1.2826429038702947e-05, + "loss": 0.5041, + "step": 252300 + }, + { + "epoch": 2.2305026609381353, + "grad_norm": 25.48887825012207, + "learning_rate": 1.2824955651031079e-05, + "loss": 0.7178, + "step": 252310 + }, + { + "epoch": 2.2305910641984474, + "grad_norm": 2.867090940475464, + "learning_rate": 1.2823482263359207e-05, + "loss": 0.4819, + "step": 252320 + }, + { + "epoch": 2.23067946745876, + "grad_norm": 6.2493767738342285, + "learning_rate": 1.2822008875687335e-05, + "loss": 0.649, + "step": 252330 + }, + { + "epoch": 2.230767870719072, + "grad_norm": 4.989572525024414, + "learning_rate": 1.2820535488015464e-05, + "loss": 0.42, + "step": 252340 + }, + { + "epoch": 2.2308562739793842, + "grad_norm": 7.1690826416015625, + "learning_rate": 1.2819062100343595e-05, + "loss": 0.6327, + "step": 252350 + }, + { + "epoch": 2.230944677239697, + "grad_norm": 2.3390285968780518, + "learning_rate": 1.2817588712671724e-05, + "loss": 0.5819, + "step": 252360 + }, + { + "epoch": 2.231033080500009, + "grad_norm": 4.922679424285889, + "learning_rate": 1.2816115324999852e-05, + "loss": 0.6136, + "step": 252370 + }, + { + "epoch": 2.231121483760321, + "grad_norm": 0.8719506859779358, + "learning_rate": 1.281464193732798e-05, + "loss": 0.5189, + "step": 252380 + }, + { + "epoch": 2.231209887020633, + "grad_norm": 1.4049886465072632, + "learning_rate": 1.2813168549656112e-05, + "loss": 0.5735, + "step": 252390 + }, + { + "epoch": 2.2312982902809457, + "grad_norm": 2.7326948642730713, + "learning_rate": 1.281169516198424e-05, + "loss": 0.515, + "step": 252400 + }, + { + "epoch": 2.231386693541258, + "grad_norm": 2.548985242843628, + "learning_rate": 1.2810221774312369e-05, + "loss": 0.469, + "step": 252410 + }, + { + "epoch": 2.23147509680157, + "grad_norm": 4.484524726867676, + "learning_rate": 1.28087483866405e-05, + "loss": 0.4558, + "step": 252420 + }, + { + "epoch": 2.231563500061882, + "grad_norm": 6.092586994171143, + "learning_rate": 1.2807274998968629e-05, + "loss": 0.497, + "step": 252430 + }, + { + "epoch": 2.2316519033221947, + "grad_norm": 3.130527973175049, + "learning_rate": 1.2805801611296757e-05, + "loss": 0.5305, + "step": 252440 + }, + { + "epoch": 2.2317403065825068, + "grad_norm": 1.534021019935608, + "learning_rate": 1.2804328223624887e-05, + "loss": 0.4454, + "step": 252450 + }, + { + "epoch": 2.231828709842819, + "grad_norm": 9.00811767578125, + "learning_rate": 1.2802854835953018e-05, + "loss": 0.4921, + "step": 252460 + }, + { + "epoch": 2.2319171131031315, + "grad_norm": 2.9687247276306152, + "learning_rate": 1.2801381448281146e-05, + "loss": 0.5433, + "step": 252470 + }, + { + "epoch": 2.2320055163634436, + "grad_norm": 1.6613713502883911, + "learning_rate": 1.2799908060609276e-05, + "loss": 0.4217, + "step": 252480 + }, + { + "epoch": 2.2320939196237557, + "grad_norm": 1.9242967367172241, + "learning_rate": 1.2798434672937404e-05, + "loss": 0.5595, + "step": 252490 + }, + { + "epoch": 2.232182322884068, + "grad_norm": 3.7425146102905273, + "learning_rate": 1.2796961285265534e-05, + "loss": 0.6372, + "step": 252500 + }, + { + "epoch": 2.2322707261443804, + "grad_norm": 3.1541969776153564, + "learning_rate": 1.2795487897593664e-05, + "loss": 0.516, + "step": 252510 + }, + { + "epoch": 2.2323591294046925, + "grad_norm": 1.4329630136489868, + "learning_rate": 1.2794014509921793e-05, + "loss": 0.5435, + "step": 252520 + }, + { + "epoch": 2.2324475326650046, + "grad_norm": 1.028855323791504, + "learning_rate": 1.2792541122249923e-05, + "loss": 0.3797, + "step": 252530 + }, + { + "epoch": 2.2325359359253167, + "grad_norm": 11.909648895263672, + "learning_rate": 1.2791067734578053e-05, + "loss": 0.551, + "step": 252540 + }, + { + "epoch": 2.2326243391856293, + "grad_norm": 1.7132185697555542, + "learning_rate": 1.2789594346906181e-05, + "loss": 0.5334, + "step": 252550 + }, + { + "epoch": 2.2327127424459414, + "grad_norm": 3.23384690284729, + "learning_rate": 1.278812095923431e-05, + "loss": 0.5618, + "step": 252560 + }, + { + "epoch": 2.2328011457062535, + "grad_norm": 7.422555446624756, + "learning_rate": 1.2786647571562441e-05, + "loss": 0.4807, + "step": 252570 + }, + { + "epoch": 2.2328895489665657, + "grad_norm": 9.031991958618164, + "learning_rate": 1.278517418389057e-05, + "loss": 0.6263, + "step": 252580 + }, + { + "epoch": 2.2329779522268782, + "grad_norm": 2.659722089767456, + "learning_rate": 1.2783700796218698e-05, + "loss": 0.4034, + "step": 252590 + }, + { + "epoch": 2.2330663554871903, + "grad_norm": 2.255070447921753, + "learning_rate": 1.278222740854683e-05, + "loss": 0.4997, + "step": 252600 + }, + { + "epoch": 2.2331547587475025, + "grad_norm": 2.232422113418579, + "learning_rate": 1.2780754020874958e-05, + "loss": 0.5091, + "step": 252610 + }, + { + "epoch": 2.233243162007815, + "grad_norm": 3.2916576862335205, + "learning_rate": 1.2779280633203086e-05, + "loss": 0.5166, + "step": 252620 + }, + { + "epoch": 2.233331565268127, + "grad_norm": 1.1393431425094604, + "learning_rate": 1.2777807245531215e-05, + "loss": 0.4774, + "step": 252630 + }, + { + "epoch": 2.2334199685284393, + "grad_norm": 3.4115540981292725, + "learning_rate": 1.2776333857859347e-05, + "loss": 0.5458, + "step": 252640 + }, + { + "epoch": 2.2335083717887514, + "grad_norm": 9.526936531066895, + "learning_rate": 1.2774860470187475e-05, + "loss": 0.6762, + "step": 252650 + }, + { + "epoch": 2.233596775049064, + "grad_norm": 2.7988791465759277, + "learning_rate": 1.2773387082515603e-05, + "loss": 0.5531, + "step": 252660 + }, + { + "epoch": 2.233685178309376, + "grad_norm": 4.183450698852539, + "learning_rate": 1.2771913694843732e-05, + "loss": 0.513, + "step": 252670 + }, + { + "epoch": 2.233773581569688, + "grad_norm": 2.849205493927002, + "learning_rate": 1.2770440307171863e-05, + "loss": 0.4268, + "step": 252680 + }, + { + "epoch": 2.2338619848300008, + "grad_norm": 5.757079124450684, + "learning_rate": 1.2768966919499992e-05, + "loss": 0.5889, + "step": 252690 + }, + { + "epoch": 2.233950388090313, + "grad_norm": 2.3108673095703125, + "learning_rate": 1.276749353182812e-05, + "loss": 0.4853, + "step": 252700 + }, + { + "epoch": 2.234038791350625, + "grad_norm": 2.659597873687744, + "learning_rate": 1.2766020144156252e-05, + "loss": 0.5665, + "step": 252710 + }, + { + "epoch": 2.234127194610937, + "grad_norm": 3.3465418815612793, + "learning_rate": 1.276454675648438e-05, + "loss": 0.7121, + "step": 252720 + }, + { + "epoch": 2.2342155978712497, + "grad_norm": 3.4034671783447266, + "learning_rate": 1.2763073368812509e-05, + "loss": 0.4975, + "step": 252730 + }, + { + "epoch": 2.234304001131562, + "grad_norm": 1.2039121389389038, + "learning_rate": 1.2761599981140637e-05, + "loss": 0.5606, + "step": 252740 + }, + { + "epoch": 2.234392404391874, + "grad_norm": 2.21065092086792, + "learning_rate": 1.2760126593468769e-05, + "loss": 0.6474, + "step": 252750 + }, + { + "epoch": 2.234480807652186, + "grad_norm": 0.7184696197509766, + "learning_rate": 1.2758653205796897e-05, + "loss": 0.4903, + "step": 252760 + }, + { + "epoch": 2.2345692109124986, + "grad_norm": 4.29422664642334, + "learning_rate": 1.2757179818125025e-05, + "loss": 0.489, + "step": 252770 + }, + { + "epoch": 2.2346576141728107, + "grad_norm": 3.3389880657196045, + "learning_rate": 1.2755706430453157e-05, + "loss": 0.4977, + "step": 252780 + }, + { + "epoch": 2.234746017433123, + "grad_norm": 4.231547832489014, + "learning_rate": 1.2754233042781285e-05, + "loss": 0.4857, + "step": 252790 + }, + { + "epoch": 2.234834420693435, + "grad_norm": 3.541996717453003, + "learning_rate": 1.2752759655109414e-05, + "loss": 0.5902, + "step": 252800 + }, + { + "epoch": 2.2349228239537475, + "grad_norm": 3.5669679641723633, + "learning_rate": 1.2751286267437542e-05, + "loss": 0.6906, + "step": 252810 + }, + { + "epoch": 2.2350112272140596, + "grad_norm": 4.42146110534668, + "learning_rate": 1.2749812879765674e-05, + "loss": 0.6565, + "step": 252820 + }, + { + "epoch": 2.2350996304743718, + "grad_norm": 7.496128559112549, + "learning_rate": 1.2748339492093802e-05, + "loss": 0.5414, + "step": 252830 + }, + { + "epoch": 2.2351880337346843, + "grad_norm": 4.723792552947998, + "learning_rate": 1.274686610442193e-05, + "loss": 0.4971, + "step": 252840 + }, + { + "epoch": 2.2352764369949965, + "grad_norm": 2.3797905445098877, + "learning_rate": 1.2745392716750059e-05, + "loss": 0.4733, + "step": 252850 + }, + { + "epoch": 2.2353648402553086, + "grad_norm": 5.321717262268066, + "learning_rate": 1.274391932907819e-05, + "loss": 0.5216, + "step": 252860 + }, + { + "epoch": 2.2354532435156207, + "grad_norm": 5.5020341873168945, + "learning_rate": 1.2742445941406319e-05, + "loss": 0.493, + "step": 252870 + }, + { + "epoch": 2.2355416467759333, + "grad_norm": 1.8313398361206055, + "learning_rate": 1.2740972553734447e-05, + "loss": 0.6152, + "step": 252880 + }, + { + "epoch": 2.2356300500362454, + "grad_norm": 1.9191946983337402, + "learning_rate": 1.2739499166062579e-05, + "loss": 0.5675, + "step": 252890 + }, + { + "epoch": 2.2357184532965575, + "grad_norm": 5.448577404022217, + "learning_rate": 1.2738025778390707e-05, + "loss": 0.6644, + "step": 252900 + }, + { + "epoch": 2.2358068565568696, + "grad_norm": 2.274205446243286, + "learning_rate": 1.2736552390718836e-05, + "loss": 0.5727, + "step": 252910 + }, + { + "epoch": 2.235895259817182, + "grad_norm": 6.07928991317749, + "learning_rate": 1.2735079003046966e-05, + "loss": 0.6219, + "step": 252920 + }, + { + "epoch": 2.2359836630774943, + "grad_norm": 1.2950937747955322, + "learning_rate": 1.2733605615375096e-05, + "loss": 0.4991, + "step": 252930 + }, + { + "epoch": 2.2360720663378064, + "grad_norm": 7.558318138122559, + "learning_rate": 1.2732132227703224e-05, + "loss": 0.4309, + "step": 252940 + }, + { + "epoch": 2.236160469598119, + "grad_norm": 4.042947769165039, + "learning_rate": 1.2730658840031354e-05, + "loss": 0.4827, + "step": 252950 + }, + { + "epoch": 2.236248872858431, + "grad_norm": 16.065204620361328, + "learning_rate": 1.2729185452359484e-05, + "loss": 0.4637, + "step": 252960 + }, + { + "epoch": 2.2363372761187432, + "grad_norm": 1.9434490203857422, + "learning_rate": 1.2727712064687613e-05, + "loss": 0.4854, + "step": 252970 + }, + { + "epoch": 2.2364256793790553, + "grad_norm": 5.589130401611328, + "learning_rate": 1.2726238677015743e-05, + "loss": 0.592, + "step": 252980 + }, + { + "epoch": 2.236514082639368, + "grad_norm": 2.047581672668457, + "learning_rate": 1.2724765289343871e-05, + "loss": 0.5523, + "step": 252990 + }, + { + "epoch": 2.23660248589968, + "grad_norm": 2.790074348449707, + "learning_rate": 1.2723291901672003e-05, + "loss": 0.5383, + "step": 253000 + }, + { + "epoch": 2.236690889159992, + "grad_norm": 1.294431209564209, + "learning_rate": 1.2721818514000131e-05, + "loss": 0.4837, + "step": 253010 + }, + { + "epoch": 2.2367792924203043, + "grad_norm": 2.0613300800323486, + "learning_rate": 1.272034512632826e-05, + "loss": 0.552, + "step": 253020 + }, + { + "epoch": 2.236867695680617, + "grad_norm": 2.1567509174346924, + "learning_rate": 1.2718871738656388e-05, + "loss": 0.5437, + "step": 253030 + }, + { + "epoch": 2.236956098940929, + "grad_norm": 1.073787808418274, + "learning_rate": 1.271739835098452e-05, + "loss": 0.5926, + "step": 253040 + }, + { + "epoch": 2.237044502201241, + "grad_norm": 1.5064496994018555, + "learning_rate": 1.2715924963312648e-05, + "loss": 0.508, + "step": 253050 + }, + { + "epoch": 2.2371329054615536, + "grad_norm": 2.2105588912963867, + "learning_rate": 1.2714451575640776e-05, + "loss": 0.5524, + "step": 253060 + }, + { + "epoch": 2.2372213087218658, + "grad_norm": 5.310718059539795, + "learning_rate": 1.2712978187968908e-05, + "loss": 0.3843, + "step": 253070 + }, + { + "epoch": 2.237309711982178, + "grad_norm": 3.119602680206299, + "learning_rate": 1.2711504800297036e-05, + "loss": 0.5267, + "step": 253080 + }, + { + "epoch": 2.23739811524249, + "grad_norm": 4.5277228355407715, + "learning_rate": 1.2710031412625165e-05, + "loss": 0.5373, + "step": 253090 + }, + { + "epoch": 2.2374865185028026, + "grad_norm": 2.650120735168457, + "learning_rate": 1.2708558024953293e-05, + "loss": 0.4452, + "step": 253100 + }, + { + "epoch": 2.2375749217631147, + "grad_norm": 2.1366372108459473, + "learning_rate": 1.2707084637281425e-05, + "loss": 0.6294, + "step": 253110 + }, + { + "epoch": 2.237663325023427, + "grad_norm": 1.8659820556640625, + "learning_rate": 1.2705611249609553e-05, + "loss": 0.572, + "step": 253120 + }, + { + "epoch": 2.237751728283739, + "grad_norm": 1.5331933498382568, + "learning_rate": 1.2704137861937682e-05, + "loss": 0.5263, + "step": 253130 + }, + { + "epoch": 2.2378401315440515, + "grad_norm": 1.188280701637268, + "learning_rate": 1.270266447426581e-05, + "loss": 0.5215, + "step": 253140 + }, + { + "epoch": 2.2379285348043636, + "grad_norm": 12.35371208190918, + "learning_rate": 1.2701191086593942e-05, + "loss": 0.6255, + "step": 253150 + }, + { + "epoch": 2.2380169380646757, + "grad_norm": 1.900578498840332, + "learning_rate": 1.269971769892207e-05, + "loss": 0.6826, + "step": 253160 + }, + { + "epoch": 2.238105341324988, + "grad_norm": 19.32665252685547, + "learning_rate": 1.2698244311250198e-05, + "loss": 0.5105, + "step": 253170 + }, + { + "epoch": 2.2381937445853004, + "grad_norm": 2.8707218170166016, + "learning_rate": 1.269677092357833e-05, + "loss": 0.5701, + "step": 253180 + }, + { + "epoch": 2.2382821478456125, + "grad_norm": 2.170468807220459, + "learning_rate": 1.2695297535906459e-05, + "loss": 0.4897, + "step": 253190 + }, + { + "epoch": 2.2383705511059246, + "grad_norm": 11.327629089355469, + "learning_rate": 1.2693824148234587e-05, + "loss": 0.4551, + "step": 253200 + }, + { + "epoch": 2.238458954366237, + "grad_norm": 2.2333009243011475, + "learning_rate": 1.2692350760562715e-05, + "loss": 0.5158, + "step": 253210 + }, + { + "epoch": 2.2385473576265493, + "grad_norm": 1.180032730102539, + "learning_rate": 1.2690877372890847e-05, + "loss": 0.5586, + "step": 253220 + }, + { + "epoch": 2.2386357608868614, + "grad_norm": 1.9853150844573975, + "learning_rate": 1.2689403985218975e-05, + "loss": 0.5951, + "step": 253230 + }, + { + "epoch": 2.2387241641471736, + "grad_norm": 6.762989521026611, + "learning_rate": 1.2687930597547104e-05, + "loss": 0.5975, + "step": 253240 + }, + { + "epoch": 2.238812567407486, + "grad_norm": 7.288366794586182, + "learning_rate": 1.2686457209875235e-05, + "loss": 0.5182, + "step": 253250 + }, + { + "epoch": 2.2389009706677983, + "grad_norm": 4.0701584815979, + "learning_rate": 1.2684983822203364e-05, + "loss": 0.6479, + "step": 253260 + }, + { + "epoch": 2.2389893739281104, + "grad_norm": 2.479717493057251, + "learning_rate": 1.2683510434531492e-05, + "loss": 0.5079, + "step": 253270 + }, + { + "epoch": 2.239077777188423, + "grad_norm": 2.8263113498687744, + "learning_rate": 1.268203704685962e-05, + "loss": 0.4693, + "step": 253280 + }, + { + "epoch": 2.239166180448735, + "grad_norm": 2.0348870754241943, + "learning_rate": 1.2680563659187752e-05, + "loss": 0.6475, + "step": 253290 + }, + { + "epoch": 2.239254583709047, + "grad_norm": 1.4950881004333496, + "learning_rate": 1.267909027151588e-05, + "loss": 0.5941, + "step": 253300 + }, + { + "epoch": 2.2393429869693593, + "grad_norm": 4.31201696395874, + "learning_rate": 1.2677616883844009e-05, + "loss": 0.4868, + "step": 253310 + }, + { + "epoch": 2.239431390229672, + "grad_norm": 2.190439462661743, + "learning_rate": 1.2676143496172137e-05, + "loss": 0.5231, + "step": 253320 + }, + { + "epoch": 2.239519793489984, + "grad_norm": 3.3000316619873047, + "learning_rate": 1.2674670108500269e-05, + "loss": 0.5488, + "step": 253330 + }, + { + "epoch": 2.239608196750296, + "grad_norm": 6.819434642791748, + "learning_rate": 1.2673196720828397e-05, + "loss": 0.5154, + "step": 253340 + }, + { + "epoch": 2.239696600010608, + "grad_norm": 3.1311023235321045, + "learning_rate": 1.2671723333156527e-05, + "loss": 0.5199, + "step": 253350 + }, + { + "epoch": 2.239785003270921, + "grad_norm": 2.7600440979003906, + "learning_rate": 1.2670249945484657e-05, + "loss": 0.5353, + "step": 253360 + }, + { + "epoch": 2.239873406531233, + "grad_norm": 4.6110053062438965, + "learning_rate": 1.2668776557812786e-05, + "loss": 0.5093, + "step": 253370 + }, + { + "epoch": 2.239961809791545, + "grad_norm": 2.678877115249634, + "learning_rate": 1.2667303170140916e-05, + "loss": 0.6114, + "step": 253380 + }, + { + "epoch": 2.240050213051857, + "grad_norm": 0.7888281941413879, + "learning_rate": 1.2665829782469044e-05, + "loss": 0.5252, + "step": 253390 + }, + { + "epoch": 2.2401386163121697, + "grad_norm": 1.9236689805984497, + "learning_rate": 1.2664356394797174e-05, + "loss": 0.5889, + "step": 253400 + }, + { + "epoch": 2.240227019572482, + "grad_norm": 16.878978729248047, + "learning_rate": 1.2662883007125304e-05, + "loss": 0.5331, + "step": 253410 + }, + { + "epoch": 2.240315422832794, + "grad_norm": 4.042402267456055, + "learning_rate": 1.2661409619453433e-05, + "loss": 0.5011, + "step": 253420 + }, + { + "epoch": 2.2404038260931065, + "grad_norm": 1.8915503025054932, + "learning_rate": 1.2659936231781563e-05, + "loss": 0.4815, + "step": 253430 + }, + { + "epoch": 2.2404922293534186, + "grad_norm": 1.3140441179275513, + "learning_rate": 1.2658462844109693e-05, + "loss": 0.5546, + "step": 253440 + }, + { + "epoch": 2.2405806326137307, + "grad_norm": 0.9059475064277649, + "learning_rate": 1.2656989456437821e-05, + "loss": 0.5099, + "step": 253450 + }, + { + "epoch": 2.240669035874043, + "grad_norm": 6.309267997741699, + "learning_rate": 1.265551606876595e-05, + "loss": 0.4781, + "step": 253460 + }, + { + "epoch": 2.2407574391343554, + "grad_norm": 4.785647392272949, + "learning_rate": 1.2654042681094081e-05, + "loss": 0.451, + "step": 253470 + }, + { + "epoch": 2.2408458423946676, + "grad_norm": 4.263225078582764, + "learning_rate": 1.265256929342221e-05, + "loss": 0.5996, + "step": 253480 + }, + { + "epoch": 2.2409342456549797, + "grad_norm": 5.103342056274414, + "learning_rate": 1.2651095905750338e-05, + "loss": 0.4936, + "step": 253490 + }, + { + "epoch": 2.241022648915292, + "grad_norm": 2.0941078662872314, + "learning_rate": 1.2649622518078466e-05, + "loss": 0.422, + "step": 253500 + }, + { + "epoch": 2.2411110521756044, + "grad_norm": 9.05228328704834, + "learning_rate": 1.2648149130406598e-05, + "loss": 0.5048, + "step": 253510 + }, + { + "epoch": 2.2411994554359165, + "grad_norm": 4.322164058685303, + "learning_rate": 1.2646675742734726e-05, + "loss": 0.6255, + "step": 253520 + }, + { + "epoch": 2.2412878586962286, + "grad_norm": 10.949601173400879, + "learning_rate": 1.2645202355062855e-05, + "loss": 0.4939, + "step": 253530 + }, + { + "epoch": 2.241376261956541, + "grad_norm": 2.197873830795288, + "learning_rate": 1.2643728967390986e-05, + "loss": 0.4102, + "step": 253540 + }, + { + "epoch": 2.2414646652168533, + "grad_norm": 2.491154670715332, + "learning_rate": 1.2642255579719115e-05, + "loss": 0.6301, + "step": 253550 + }, + { + "epoch": 2.2415530684771654, + "grad_norm": 3.007502317428589, + "learning_rate": 1.2640782192047243e-05, + "loss": 0.4157, + "step": 253560 + }, + { + "epoch": 2.2416414717374775, + "grad_norm": 2.2519590854644775, + "learning_rate": 1.2639308804375372e-05, + "loss": 0.5409, + "step": 253570 + }, + { + "epoch": 2.24172987499779, + "grad_norm": 2.9471848011016846, + "learning_rate": 1.2637835416703503e-05, + "loss": 0.4524, + "step": 253580 + }, + { + "epoch": 2.241818278258102, + "grad_norm": 6.25692892074585, + "learning_rate": 1.2636362029031632e-05, + "loss": 0.4706, + "step": 253590 + }, + { + "epoch": 2.2419066815184143, + "grad_norm": 2.9500205516815186, + "learning_rate": 1.263488864135976e-05, + "loss": 0.5092, + "step": 253600 + }, + { + "epoch": 2.2419950847787264, + "grad_norm": 2.1385011672973633, + "learning_rate": 1.2633415253687888e-05, + "loss": 0.3702, + "step": 253610 + }, + { + "epoch": 2.242083488039039, + "grad_norm": 2.6108431816101074, + "learning_rate": 1.263194186601602e-05, + "loss": 0.5418, + "step": 253620 + }, + { + "epoch": 2.242171891299351, + "grad_norm": 15.481287002563477, + "learning_rate": 1.2630468478344148e-05, + "loss": 0.6351, + "step": 253630 + }, + { + "epoch": 2.2422602945596632, + "grad_norm": 2.1267478466033936, + "learning_rate": 1.2628995090672277e-05, + "loss": 0.5907, + "step": 253640 + }, + { + "epoch": 2.242348697819976, + "grad_norm": 2.009594440460205, + "learning_rate": 1.2627521703000409e-05, + "loss": 0.4892, + "step": 253650 + }, + { + "epoch": 2.242437101080288, + "grad_norm": 4.490423679351807, + "learning_rate": 1.2626048315328537e-05, + "loss": 0.5032, + "step": 253660 + }, + { + "epoch": 2.2425255043406, + "grad_norm": 1.422493577003479, + "learning_rate": 1.2624574927656665e-05, + "loss": 0.4887, + "step": 253670 + }, + { + "epoch": 2.242613907600912, + "grad_norm": 4.48104190826416, + "learning_rate": 1.2623101539984794e-05, + "loss": 0.5548, + "step": 253680 + }, + { + "epoch": 2.2427023108612247, + "grad_norm": 8.379036903381348, + "learning_rate": 1.2621628152312925e-05, + "loss": 0.5543, + "step": 253690 + }, + { + "epoch": 2.242790714121537, + "grad_norm": 9.65436840057373, + "learning_rate": 1.2620154764641054e-05, + "loss": 0.5755, + "step": 253700 + }, + { + "epoch": 2.242879117381849, + "grad_norm": 4.702982425689697, + "learning_rate": 1.2618681376969182e-05, + "loss": 0.5228, + "step": 253710 + }, + { + "epoch": 2.242967520642161, + "grad_norm": 4.421905994415283, + "learning_rate": 1.2617207989297314e-05, + "loss": 0.4951, + "step": 253720 + }, + { + "epoch": 2.2430559239024737, + "grad_norm": 2.08052134513855, + "learning_rate": 1.2615734601625442e-05, + "loss": 0.4895, + "step": 253730 + }, + { + "epoch": 2.243144327162786, + "grad_norm": 2.2632317543029785, + "learning_rate": 1.261426121395357e-05, + "loss": 0.5038, + "step": 253740 + }, + { + "epoch": 2.243232730423098, + "grad_norm": 2.700002431869507, + "learning_rate": 1.2612787826281699e-05, + "loss": 0.4643, + "step": 253750 + }, + { + "epoch": 2.24332113368341, + "grad_norm": 1.9125080108642578, + "learning_rate": 1.261131443860983e-05, + "loss": 0.521, + "step": 253760 + }, + { + "epoch": 2.2434095369437226, + "grad_norm": 4.061237335205078, + "learning_rate": 1.2609841050937959e-05, + "loss": 0.4498, + "step": 253770 + }, + { + "epoch": 2.2434979402040347, + "grad_norm": 2.245572090148926, + "learning_rate": 1.2608367663266087e-05, + "loss": 0.5663, + "step": 253780 + }, + { + "epoch": 2.243586343464347, + "grad_norm": 2.123220205307007, + "learning_rate": 1.2606894275594217e-05, + "loss": 0.6304, + "step": 253790 + }, + { + "epoch": 2.2436747467246594, + "grad_norm": 4.161200046539307, + "learning_rate": 1.2605420887922347e-05, + "loss": 0.6291, + "step": 253800 + }, + { + "epoch": 2.2437631499849715, + "grad_norm": 3.6126692295074463, + "learning_rate": 1.2603947500250476e-05, + "loss": 0.4772, + "step": 253810 + }, + { + "epoch": 2.2438515532452836, + "grad_norm": 1.2575958967208862, + "learning_rate": 1.2602474112578606e-05, + "loss": 0.594, + "step": 253820 + }, + { + "epoch": 2.2439399565055957, + "grad_norm": 8.3369779586792, + "learning_rate": 1.2601000724906736e-05, + "loss": 0.754, + "step": 253830 + }, + { + "epoch": 2.2440283597659083, + "grad_norm": 2.0817759037017822, + "learning_rate": 1.2599527337234864e-05, + "loss": 0.4352, + "step": 253840 + }, + { + "epoch": 2.2441167630262204, + "grad_norm": 2.63472580909729, + "learning_rate": 1.2598053949562994e-05, + "loss": 0.4854, + "step": 253850 + }, + { + "epoch": 2.2442051662865325, + "grad_norm": 1.1414251327514648, + "learning_rate": 1.2596580561891123e-05, + "loss": 0.5831, + "step": 253860 + }, + { + "epoch": 2.244293569546845, + "grad_norm": 1.8685020208358765, + "learning_rate": 1.2595107174219253e-05, + "loss": 0.4466, + "step": 253870 + }, + { + "epoch": 2.2443819728071572, + "grad_norm": 4.88785457611084, + "learning_rate": 1.2593633786547383e-05, + "loss": 0.557, + "step": 253880 + }, + { + "epoch": 2.2444703760674694, + "grad_norm": 1.8433657884597778, + "learning_rate": 1.2592160398875511e-05, + "loss": 0.5595, + "step": 253890 + }, + { + "epoch": 2.2445587793277815, + "grad_norm": 8.696371078491211, + "learning_rate": 1.2590687011203641e-05, + "loss": 0.4665, + "step": 253900 + }, + { + "epoch": 2.244647182588094, + "grad_norm": 3.271641254425049, + "learning_rate": 1.2589213623531771e-05, + "loss": 0.5, + "step": 253910 + }, + { + "epoch": 2.244735585848406, + "grad_norm": 5.972082138061523, + "learning_rate": 1.25877402358599e-05, + "loss": 0.6581, + "step": 253920 + }, + { + "epoch": 2.2448239891087183, + "grad_norm": 1.2643530368804932, + "learning_rate": 1.2586266848188028e-05, + "loss": 0.3885, + "step": 253930 + }, + { + "epoch": 2.2449123923690304, + "grad_norm": 1.4845143556594849, + "learning_rate": 1.258479346051616e-05, + "loss": 0.4653, + "step": 253940 + }, + { + "epoch": 2.245000795629343, + "grad_norm": 2.980423927307129, + "learning_rate": 1.2583320072844288e-05, + "loss": 0.4499, + "step": 253950 + }, + { + "epoch": 2.245089198889655, + "grad_norm": 4.259295463562012, + "learning_rate": 1.2581846685172416e-05, + "loss": 0.4639, + "step": 253960 + }, + { + "epoch": 2.245177602149967, + "grad_norm": 11.17354965209961, + "learning_rate": 1.2580373297500545e-05, + "loss": 0.482, + "step": 253970 + }, + { + "epoch": 2.2452660054102793, + "grad_norm": 4.9159417152404785, + "learning_rate": 1.2578899909828676e-05, + "loss": 0.613, + "step": 253980 + }, + { + "epoch": 2.245354408670592, + "grad_norm": 6.1804423332214355, + "learning_rate": 1.2577426522156805e-05, + "loss": 0.6085, + "step": 253990 + }, + { + "epoch": 2.245442811930904, + "grad_norm": 27.09356689453125, + "learning_rate": 1.2575953134484933e-05, + "loss": 0.6052, + "step": 254000 + }, + { + "epoch": 2.245531215191216, + "grad_norm": 2.2024269104003906, + "learning_rate": 1.2574479746813065e-05, + "loss": 0.4723, + "step": 254010 + }, + { + "epoch": 2.2456196184515287, + "grad_norm": 4.0012431144714355, + "learning_rate": 1.2573006359141193e-05, + "loss": 0.58, + "step": 254020 + }, + { + "epoch": 2.245708021711841, + "grad_norm": 1.7850067615509033, + "learning_rate": 1.2571532971469322e-05, + "loss": 0.532, + "step": 254030 + }, + { + "epoch": 2.245796424972153, + "grad_norm": 14.720090866088867, + "learning_rate": 1.257005958379745e-05, + "loss": 0.452, + "step": 254040 + }, + { + "epoch": 2.245884828232465, + "grad_norm": 18.2075138092041, + "learning_rate": 1.2568586196125582e-05, + "loss": 0.6598, + "step": 254050 + }, + { + "epoch": 2.2459732314927776, + "grad_norm": 0.8161367774009705, + "learning_rate": 1.256711280845371e-05, + "loss": 0.4517, + "step": 254060 + }, + { + "epoch": 2.2460616347530897, + "grad_norm": 4.753641128540039, + "learning_rate": 1.2565639420781838e-05, + "loss": 0.5894, + "step": 254070 + }, + { + "epoch": 2.246150038013402, + "grad_norm": 27.06513214111328, + "learning_rate": 1.2564166033109967e-05, + "loss": 0.5184, + "step": 254080 + }, + { + "epoch": 2.246238441273714, + "grad_norm": 7.429715156555176, + "learning_rate": 1.2562692645438098e-05, + "loss": 0.5692, + "step": 254090 + }, + { + "epoch": 2.2463268445340265, + "grad_norm": 4.486611366271973, + "learning_rate": 1.2561219257766227e-05, + "loss": 0.5949, + "step": 254100 + }, + { + "epoch": 2.2464152477943387, + "grad_norm": 3.4978559017181396, + "learning_rate": 1.2559745870094355e-05, + "loss": 0.4887, + "step": 254110 + }, + { + "epoch": 2.2465036510546508, + "grad_norm": 3.2686331272125244, + "learning_rate": 1.2558272482422487e-05, + "loss": 0.5583, + "step": 254120 + }, + { + "epoch": 2.2465920543149633, + "grad_norm": 2.3942692279815674, + "learning_rate": 1.2556799094750615e-05, + "loss": 0.4085, + "step": 254130 + }, + { + "epoch": 2.2466804575752755, + "grad_norm": 1.7464889287948608, + "learning_rate": 1.2555325707078744e-05, + "loss": 0.5785, + "step": 254140 + }, + { + "epoch": 2.2467688608355876, + "grad_norm": 4.90798807144165, + "learning_rate": 1.2553852319406872e-05, + "loss": 0.4688, + "step": 254150 + }, + { + "epoch": 2.2468572640958997, + "grad_norm": 2.7325403690338135, + "learning_rate": 1.2552378931735004e-05, + "loss": 0.5461, + "step": 254160 + }, + { + "epoch": 2.2469456673562123, + "grad_norm": 4.062154293060303, + "learning_rate": 1.2550905544063132e-05, + "loss": 0.6166, + "step": 254170 + }, + { + "epoch": 2.2470340706165244, + "grad_norm": 4.622369766235352, + "learning_rate": 1.254943215639126e-05, + "loss": 0.5817, + "step": 254180 + }, + { + "epoch": 2.2471224738768365, + "grad_norm": 2.112797975540161, + "learning_rate": 1.2547958768719392e-05, + "loss": 0.5684, + "step": 254190 + }, + { + "epoch": 2.2472108771371486, + "grad_norm": 5.074954509735107, + "learning_rate": 1.254648538104752e-05, + "loss": 0.4742, + "step": 254200 + }, + { + "epoch": 2.247299280397461, + "grad_norm": 3.340444564819336, + "learning_rate": 1.2545011993375649e-05, + "loss": 0.5428, + "step": 254210 + }, + { + "epoch": 2.2473876836577733, + "grad_norm": 5.45438814163208, + "learning_rate": 1.2543538605703777e-05, + "loss": 0.4252, + "step": 254220 + }, + { + "epoch": 2.2474760869180854, + "grad_norm": 11.2264986038208, + "learning_rate": 1.2542065218031909e-05, + "loss": 0.5215, + "step": 254230 + }, + { + "epoch": 2.247564490178398, + "grad_norm": 4.039429664611816, + "learning_rate": 1.2540591830360037e-05, + "loss": 0.5207, + "step": 254240 + }, + { + "epoch": 2.24765289343871, + "grad_norm": 7.626267433166504, + "learning_rate": 1.2539118442688166e-05, + "loss": 0.5763, + "step": 254250 + }, + { + "epoch": 2.2477412966990222, + "grad_norm": 3.957284927368164, + "learning_rate": 1.2537645055016296e-05, + "loss": 0.47, + "step": 254260 + }, + { + "epoch": 2.2478296999593343, + "grad_norm": 1.972067952156067, + "learning_rate": 1.2536171667344426e-05, + "loss": 0.4428, + "step": 254270 + }, + { + "epoch": 2.247918103219647, + "grad_norm": 24.079561233520508, + "learning_rate": 1.2534698279672554e-05, + "loss": 0.5892, + "step": 254280 + }, + { + "epoch": 2.248006506479959, + "grad_norm": 2.013171911239624, + "learning_rate": 1.2533224892000684e-05, + "loss": 0.5226, + "step": 254290 + }, + { + "epoch": 2.248094909740271, + "grad_norm": 3.6540002822875977, + "learning_rate": 1.2531751504328814e-05, + "loss": 0.4826, + "step": 254300 + }, + { + "epoch": 2.2481833130005833, + "grad_norm": 2.2159876823425293, + "learning_rate": 1.2530278116656943e-05, + "loss": 0.5734, + "step": 254310 + }, + { + "epoch": 2.248271716260896, + "grad_norm": 2.8464362621307373, + "learning_rate": 1.2528804728985073e-05, + "loss": 0.535, + "step": 254320 + }, + { + "epoch": 2.248360119521208, + "grad_norm": 2.653597354888916, + "learning_rate": 1.2527331341313201e-05, + "loss": 0.6385, + "step": 254330 + }, + { + "epoch": 2.24844852278152, + "grad_norm": 1.1675798892974854, + "learning_rate": 1.2525857953641331e-05, + "loss": 0.471, + "step": 254340 + }, + { + "epoch": 2.248536926041832, + "grad_norm": 5.7221550941467285, + "learning_rate": 1.2524384565969461e-05, + "loss": 0.5717, + "step": 254350 + }, + { + "epoch": 2.2486253293021448, + "grad_norm": 2.717066764831543, + "learning_rate": 1.252291117829759e-05, + "loss": 0.6477, + "step": 254360 + }, + { + "epoch": 2.248713732562457, + "grad_norm": 6.439792633056641, + "learning_rate": 1.252143779062572e-05, + "loss": 0.5469, + "step": 254370 + }, + { + "epoch": 2.248802135822769, + "grad_norm": 2.816739559173584, + "learning_rate": 1.251996440295385e-05, + "loss": 0.587, + "step": 254380 + }, + { + "epoch": 2.2488905390830816, + "grad_norm": 3.1749541759490967, + "learning_rate": 1.2518491015281978e-05, + "loss": 0.4316, + "step": 254390 + }, + { + "epoch": 2.2489789423433937, + "grad_norm": 12.242215156555176, + "learning_rate": 1.2517017627610106e-05, + "loss": 0.4677, + "step": 254400 + }, + { + "epoch": 2.249067345603706, + "grad_norm": 16.74303436279297, + "learning_rate": 1.2515544239938238e-05, + "loss": 0.4928, + "step": 254410 + }, + { + "epoch": 2.249155748864018, + "grad_norm": 1.9944812059402466, + "learning_rate": 1.2514070852266366e-05, + "loss": 0.5239, + "step": 254420 + }, + { + "epoch": 2.2492441521243305, + "grad_norm": 1.888506293296814, + "learning_rate": 1.2512597464594495e-05, + "loss": 0.4201, + "step": 254430 + }, + { + "epoch": 2.2493325553846426, + "grad_norm": 6.789795398712158, + "learning_rate": 1.2511124076922623e-05, + "loss": 0.5114, + "step": 254440 + }, + { + "epoch": 2.2494209586449547, + "grad_norm": 1.5835825204849243, + "learning_rate": 1.2509650689250755e-05, + "loss": 0.4078, + "step": 254450 + }, + { + "epoch": 2.2495093619052673, + "grad_norm": 5.006653785705566, + "learning_rate": 1.2508177301578883e-05, + "loss": 0.548, + "step": 254460 + }, + { + "epoch": 2.2495977651655794, + "grad_norm": 3.0346696376800537, + "learning_rate": 1.2506703913907012e-05, + "loss": 0.6902, + "step": 254470 + }, + { + "epoch": 2.2496861684258915, + "grad_norm": 2.0062708854675293, + "learning_rate": 1.2505230526235143e-05, + "loss": 0.4606, + "step": 254480 + }, + { + "epoch": 2.2497745716862037, + "grad_norm": 7.4129414558410645, + "learning_rate": 1.2503757138563272e-05, + "loss": 0.479, + "step": 254490 + }, + { + "epoch": 2.249862974946516, + "grad_norm": 2.5161235332489014, + "learning_rate": 1.25022837508914e-05, + "loss": 0.4919, + "step": 254500 + }, + { + "epoch": 2.2499513782068283, + "grad_norm": 1.773936152458191, + "learning_rate": 1.2500810363219528e-05, + "loss": 0.5062, + "step": 254510 + }, + { + "epoch": 2.2500397814671405, + "grad_norm": 1.8533095121383667, + "learning_rate": 1.2499336975547658e-05, + "loss": 0.5066, + "step": 254520 + }, + { + "epoch": 2.2501281847274526, + "grad_norm": 0.8794558644294739, + "learning_rate": 1.2497863587875788e-05, + "loss": 0.5833, + "step": 254530 + }, + { + "epoch": 2.250216587987765, + "grad_norm": 7.538434982299805, + "learning_rate": 1.2496390200203917e-05, + "loss": 0.4922, + "step": 254540 + }, + { + "epoch": 2.2503049912480773, + "grad_norm": 10.649346351623535, + "learning_rate": 1.2494916812532047e-05, + "loss": 0.5166, + "step": 254550 + }, + { + "epoch": 2.2503933945083894, + "grad_norm": 12.515463829040527, + "learning_rate": 1.2493443424860177e-05, + "loss": 0.4875, + "step": 254560 + }, + { + "epoch": 2.2504817977687015, + "grad_norm": 1.9790345430374146, + "learning_rate": 1.2491970037188305e-05, + "loss": 0.6578, + "step": 254570 + }, + { + "epoch": 2.250570201029014, + "grad_norm": 21.184968948364258, + "learning_rate": 1.2490496649516435e-05, + "loss": 0.629, + "step": 254580 + }, + { + "epoch": 2.250658604289326, + "grad_norm": 7.999340534210205, + "learning_rate": 1.2489023261844564e-05, + "loss": 0.6012, + "step": 254590 + }, + { + "epoch": 2.2507470075496383, + "grad_norm": 1.70009446144104, + "learning_rate": 1.2487549874172694e-05, + "loss": 0.5859, + "step": 254600 + }, + { + "epoch": 2.250835410809951, + "grad_norm": 4.166402339935303, + "learning_rate": 1.2486076486500822e-05, + "loss": 0.4683, + "step": 254610 + }, + { + "epoch": 2.250923814070263, + "grad_norm": 11.924890518188477, + "learning_rate": 1.2484603098828952e-05, + "loss": 0.6322, + "step": 254620 + }, + { + "epoch": 2.251012217330575, + "grad_norm": 5.191905975341797, + "learning_rate": 1.248312971115708e-05, + "loss": 0.4793, + "step": 254630 + }, + { + "epoch": 2.2511006205908872, + "grad_norm": 6.981032848358154, + "learning_rate": 1.248165632348521e-05, + "loss": 0.4316, + "step": 254640 + }, + { + "epoch": 2.2511890238512, + "grad_norm": 2.285388231277466, + "learning_rate": 1.248018293581334e-05, + "loss": 0.5785, + "step": 254650 + }, + { + "epoch": 2.251277427111512, + "grad_norm": 8.516698837280273, + "learning_rate": 1.2478709548141469e-05, + "loss": 0.5462, + "step": 254660 + }, + { + "epoch": 2.251365830371824, + "grad_norm": 1.1132254600524902, + "learning_rate": 1.2477236160469599e-05, + "loss": 0.5005, + "step": 254670 + }, + { + "epoch": 2.2514542336321366, + "grad_norm": 5.401992321014404, + "learning_rate": 1.2475762772797727e-05, + "loss": 0.5473, + "step": 254680 + }, + { + "epoch": 2.2515426368924487, + "grad_norm": 2.9870247840881348, + "learning_rate": 1.2474289385125857e-05, + "loss": 0.4606, + "step": 254690 + }, + { + "epoch": 2.251631040152761, + "grad_norm": 6.145971298217773, + "learning_rate": 1.2472815997453986e-05, + "loss": 0.6392, + "step": 254700 + }, + { + "epoch": 2.251719443413073, + "grad_norm": 3.847421884536743, + "learning_rate": 1.2471342609782116e-05, + "loss": 0.5264, + "step": 254710 + }, + { + "epoch": 2.251807846673385, + "grad_norm": 1.330945372581482, + "learning_rate": 1.2469869222110244e-05, + "loss": 0.3225, + "step": 254720 + }, + { + "epoch": 2.2518962499336976, + "grad_norm": 11.13337516784668, + "learning_rate": 1.2468395834438374e-05, + "loss": 0.5739, + "step": 254730 + }, + { + "epoch": 2.2519846531940098, + "grad_norm": 4.372508525848389, + "learning_rate": 1.2466922446766504e-05, + "loss": 0.5352, + "step": 254740 + }, + { + "epoch": 2.252073056454322, + "grad_norm": 1.6097230911254883, + "learning_rate": 1.2465449059094633e-05, + "loss": 0.5109, + "step": 254750 + }, + { + "epoch": 2.2521614597146344, + "grad_norm": 2.898754119873047, + "learning_rate": 1.2463975671422763e-05, + "loss": 0.5025, + "step": 254760 + }, + { + "epoch": 2.2522498629749466, + "grad_norm": 6.849977970123291, + "learning_rate": 1.2462502283750891e-05, + "loss": 0.4392, + "step": 254770 + }, + { + "epoch": 2.2523382662352587, + "grad_norm": 3.324328660964966, + "learning_rate": 1.2461028896079021e-05, + "loss": 0.4903, + "step": 254780 + }, + { + "epoch": 2.252426669495571, + "grad_norm": 6.11318826675415, + "learning_rate": 1.2459555508407151e-05, + "loss": 0.6489, + "step": 254790 + }, + { + "epoch": 2.2525150727558834, + "grad_norm": 1.9623678922653198, + "learning_rate": 1.245808212073528e-05, + "loss": 0.4872, + "step": 254800 + }, + { + "epoch": 2.2526034760161955, + "grad_norm": 1.0237971544265747, + "learning_rate": 1.245660873306341e-05, + "loss": 0.5021, + "step": 254810 + }, + { + "epoch": 2.2526918792765076, + "grad_norm": 3.5560898780822754, + "learning_rate": 1.245513534539154e-05, + "loss": 0.5436, + "step": 254820 + }, + { + "epoch": 2.25278028253682, + "grad_norm": 3.721842050552368, + "learning_rate": 1.2453661957719668e-05, + "loss": 0.5811, + "step": 254830 + }, + { + "epoch": 2.2528686857971323, + "grad_norm": 1.4190164804458618, + "learning_rate": 1.2452188570047798e-05, + "loss": 0.4279, + "step": 254840 + }, + { + "epoch": 2.2529570890574444, + "grad_norm": 4.494285583496094, + "learning_rate": 1.2450715182375928e-05, + "loss": 0.5744, + "step": 254850 + }, + { + "epoch": 2.2530454923177565, + "grad_norm": 6.1833086013793945, + "learning_rate": 1.2449241794704056e-05, + "loss": 0.5565, + "step": 254860 + }, + { + "epoch": 2.253133895578069, + "grad_norm": 3.736898183822632, + "learning_rate": 1.2447768407032186e-05, + "loss": 0.396, + "step": 254870 + }, + { + "epoch": 2.253222298838381, + "grad_norm": 4.520800590515137, + "learning_rate": 1.2446295019360315e-05, + "loss": 0.4891, + "step": 254880 + }, + { + "epoch": 2.2533107020986933, + "grad_norm": 2.0803773403167725, + "learning_rate": 1.2444821631688445e-05, + "loss": 0.4994, + "step": 254890 + }, + { + "epoch": 2.2533991053590054, + "grad_norm": 3.111959218978882, + "learning_rate": 1.2443348244016573e-05, + "loss": 0.4701, + "step": 254900 + }, + { + "epoch": 2.253487508619318, + "grad_norm": 2.3763391971588135, + "learning_rate": 1.2441874856344703e-05, + "loss": 0.5678, + "step": 254910 + }, + { + "epoch": 2.25357591187963, + "grad_norm": 0.9147385358810425, + "learning_rate": 1.2440401468672831e-05, + "loss": 0.4784, + "step": 254920 + }, + { + "epoch": 2.2536643151399423, + "grad_norm": 12.261999130249023, + "learning_rate": 1.2438928081000962e-05, + "loss": 0.5703, + "step": 254930 + }, + { + "epoch": 2.2537527184002544, + "grad_norm": 2.900031805038452, + "learning_rate": 1.2437454693329092e-05, + "loss": 0.5604, + "step": 254940 + }, + { + "epoch": 2.253841121660567, + "grad_norm": 4.749475479125977, + "learning_rate": 1.243598130565722e-05, + "loss": 0.4584, + "step": 254950 + }, + { + "epoch": 2.253929524920879, + "grad_norm": 0.8546419143676758, + "learning_rate": 1.243450791798535e-05, + "loss": 0.5717, + "step": 254960 + }, + { + "epoch": 2.254017928181191, + "grad_norm": 1.6981145143508911, + "learning_rate": 1.2433034530313478e-05, + "loss": 0.5039, + "step": 254970 + }, + { + "epoch": 2.2541063314415037, + "grad_norm": 13.539844512939453, + "learning_rate": 1.2431561142641608e-05, + "loss": 0.4946, + "step": 254980 + }, + { + "epoch": 2.254194734701816, + "grad_norm": 1.5104293823242188, + "learning_rate": 1.2430087754969737e-05, + "loss": 0.6101, + "step": 254990 + }, + { + "epoch": 2.254283137962128, + "grad_norm": 2.8734290599823, + "learning_rate": 1.2428614367297867e-05, + "loss": 0.6035, + "step": 255000 + }, + { + "epoch": 2.25437154122244, + "grad_norm": 2.4359500408172607, + "learning_rate": 1.2427140979625995e-05, + "loss": 0.4662, + "step": 255010 + }, + { + "epoch": 2.2544599444827527, + "grad_norm": 3.7035248279571533, + "learning_rate": 1.2425667591954125e-05, + "loss": 0.6319, + "step": 255020 + }, + { + "epoch": 2.254548347743065, + "grad_norm": 4.708608627319336, + "learning_rate": 1.2424194204282255e-05, + "loss": 0.7186, + "step": 255030 + }, + { + "epoch": 2.254636751003377, + "grad_norm": 1.7487807273864746, + "learning_rate": 1.2422720816610384e-05, + "loss": 0.5276, + "step": 255040 + }, + { + "epoch": 2.2547251542636895, + "grad_norm": 3.5769524574279785, + "learning_rate": 1.2421247428938514e-05, + "loss": 0.5537, + "step": 255050 + }, + { + "epoch": 2.2548135575240016, + "grad_norm": 3.0523736476898193, + "learning_rate": 1.2419774041266642e-05, + "loss": 0.5497, + "step": 255060 + }, + { + "epoch": 2.2549019607843137, + "grad_norm": 1.8914066553115845, + "learning_rate": 1.2418300653594772e-05, + "loss": 0.5136, + "step": 255070 + }, + { + "epoch": 2.254990364044626, + "grad_norm": 3.3064990043640137, + "learning_rate": 1.24168272659229e-05, + "loss": 0.4922, + "step": 255080 + }, + { + "epoch": 2.2550787673049384, + "grad_norm": 9.225730895996094, + "learning_rate": 1.241535387825103e-05, + "loss": 0.4671, + "step": 255090 + }, + { + "epoch": 2.2551671705652505, + "grad_norm": 1.8057348728179932, + "learning_rate": 1.2413880490579159e-05, + "loss": 0.5935, + "step": 255100 + }, + { + "epoch": 2.2552555738255626, + "grad_norm": 3.3664047718048096, + "learning_rate": 1.2412407102907289e-05, + "loss": 0.6321, + "step": 255110 + }, + { + "epoch": 2.2553439770858748, + "grad_norm": 2.1597049236297607, + "learning_rate": 1.2410933715235419e-05, + "loss": 0.5498, + "step": 255120 + }, + { + "epoch": 2.2554323803461873, + "grad_norm": 2.0296764373779297, + "learning_rate": 1.2409460327563547e-05, + "loss": 0.4023, + "step": 255130 + }, + { + "epoch": 2.2555207836064994, + "grad_norm": 4.146529674530029, + "learning_rate": 1.2407986939891677e-05, + "loss": 0.5291, + "step": 255140 + }, + { + "epoch": 2.2556091868668116, + "grad_norm": 9.243882179260254, + "learning_rate": 1.2406513552219806e-05, + "loss": 0.5248, + "step": 255150 + }, + { + "epoch": 2.2556975901271237, + "grad_norm": 6.318830966949463, + "learning_rate": 1.2405040164547936e-05, + "loss": 0.5348, + "step": 255160 + }, + { + "epoch": 2.2557859933874362, + "grad_norm": 3.6144251823425293, + "learning_rate": 1.2403566776876064e-05, + "loss": 0.5011, + "step": 255170 + }, + { + "epoch": 2.2558743966477484, + "grad_norm": 3.0251057147979736, + "learning_rate": 1.2402093389204194e-05, + "loss": 0.572, + "step": 255180 + }, + { + "epoch": 2.2559627999080605, + "grad_norm": 6.978640556335449, + "learning_rate": 1.2400620001532322e-05, + "loss": 0.489, + "step": 255190 + }, + { + "epoch": 2.256051203168373, + "grad_norm": 4.725232124328613, + "learning_rate": 1.2399146613860452e-05, + "loss": 0.5163, + "step": 255200 + }, + { + "epoch": 2.256139606428685, + "grad_norm": 2.562138795852661, + "learning_rate": 1.2397673226188583e-05, + "loss": 0.5927, + "step": 255210 + }, + { + "epoch": 2.2562280096889973, + "grad_norm": 1.662571907043457, + "learning_rate": 1.2396199838516711e-05, + "loss": 0.5352, + "step": 255220 + }, + { + "epoch": 2.2563164129493094, + "grad_norm": 4.596323490142822, + "learning_rate": 1.2394726450844841e-05, + "loss": 0.5416, + "step": 255230 + }, + { + "epoch": 2.256404816209622, + "grad_norm": 9.5223388671875, + "learning_rate": 1.239325306317297e-05, + "loss": 0.4765, + "step": 255240 + }, + { + "epoch": 2.256493219469934, + "grad_norm": 5.149219036102295, + "learning_rate": 1.23917796755011e-05, + "loss": 0.6229, + "step": 255250 + }, + { + "epoch": 2.256581622730246, + "grad_norm": 1.2231996059417725, + "learning_rate": 1.239030628782923e-05, + "loss": 0.5234, + "step": 255260 + }, + { + "epoch": 2.2566700259905588, + "grad_norm": 1.9591525793075562, + "learning_rate": 1.2388832900157358e-05, + "loss": 0.5861, + "step": 255270 + }, + { + "epoch": 2.256758429250871, + "grad_norm": 3.915600061416626, + "learning_rate": 1.2387359512485488e-05, + "loss": 0.6798, + "step": 255280 + }, + { + "epoch": 2.256846832511183, + "grad_norm": 6.022317886352539, + "learning_rate": 1.2385886124813618e-05, + "loss": 0.6523, + "step": 255290 + }, + { + "epoch": 2.256935235771495, + "grad_norm": 15.137800216674805, + "learning_rate": 1.2384412737141746e-05, + "loss": 0.6019, + "step": 255300 + }, + { + "epoch": 2.2570236390318072, + "grad_norm": 3.018139123916626, + "learning_rate": 1.2382939349469876e-05, + "loss": 0.5325, + "step": 255310 + }, + { + "epoch": 2.25711204229212, + "grad_norm": 4.1557297706604, + "learning_rate": 1.2381465961798006e-05, + "loss": 0.5029, + "step": 255320 + }, + { + "epoch": 2.257200445552432, + "grad_norm": 2.345231533050537, + "learning_rate": 1.2379992574126135e-05, + "loss": 0.4609, + "step": 255330 + }, + { + "epoch": 2.257288848812744, + "grad_norm": 4.470200061798096, + "learning_rate": 1.2378519186454265e-05, + "loss": 0.5345, + "step": 255340 + }, + { + "epoch": 2.2573772520730566, + "grad_norm": 6.620648384094238, + "learning_rate": 1.2377045798782393e-05, + "loss": 0.5129, + "step": 255350 + }, + { + "epoch": 2.2574656553333687, + "grad_norm": 2.9955432415008545, + "learning_rate": 1.2375572411110523e-05, + "loss": 0.5803, + "step": 255360 + }, + { + "epoch": 2.257554058593681, + "grad_norm": 2.3380014896392822, + "learning_rate": 1.2374099023438651e-05, + "loss": 0.5436, + "step": 255370 + }, + { + "epoch": 2.257642461853993, + "grad_norm": 5.035484313964844, + "learning_rate": 1.2372625635766782e-05, + "loss": 0.4697, + "step": 255380 + }, + { + "epoch": 2.2577308651143055, + "grad_norm": 2.703923463821411, + "learning_rate": 1.2371152248094912e-05, + "loss": 0.4745, + "step": 255390 + }, + { + "epoch": 2.2578192683746177, + "grad_norm": 3.5494959354400635, + "learning_rate": 1.236967886042304e-05, + "loss": 0.4651, + "step": 255400 + }, + { + "epoch": 2.25790767163493, + "grad_norm": 2.6410255432128906, + "learning_rate": 1.236820547275117e-05, + "loss": 0.4729, + "step": 255410 + }, + { + "epoch": 2.2579960748952423, + "grad_norm": 4.607995510101318, + "learning_rate": 1.2366732085079298e-05, + "loss": 0.4822, + "step": 255420 + }, + { + "epoch": 2.2580844781555545, + "grad_norm": 1.8334124088287354, + "learning_rate": 1.2365258697407428e-05, + "loss": 0.4207, + "step": 255430 + }, + { + "epoch": 2.2581728814158666, + "grad_norm": 3.925304889678955, + "learning_rate": 1.2363785309735557e-05, + "loss": 0.4512, + "step": 255440 + }, + { + "epoch": 2.2582612846761787, + "grad_norm": 2.3761160373687744, + "learning_rate": 1.2362311922063687e-05, + "loss": 0.579, + "step": 255450 + }, + { + "epoch": 2.2583496879364913, + "grad_norm": 5.155466079711914, + "learning_rate": 1.2360838534391815e-05, + "loss": 0.6578, + "step": 255460 + }, + { + "epoch": 2.2584380911968034, + "grad_norm": 1.8481146097183228, + "learning_rate": 1.2359365146719945e-05, + "loss": 0.4664, + "step": 255470 + }, + { + "epoch": 2.2585264944571155, + "grad_norm": 7.703135967254639, + "learning_rate": 1.2357891759048074e-05, + "loss": 0.449, + "step": 255480 + }, + { + "epoch": 2.2586148977174276, + "grad_norm": 11.660038948059082, + "learning_rate": 1.2356418371376204e-05, + "loss": 0.6149, + "step": 255490 + }, + { + "epoch": 2.25870330097774, + "grad_norm": 2.447523593902588, + "learning_rate": 1.2354944983704334e-05, + "loss": 0.4462, + "step": 255500 + }, + { + "epoch": 2.2587917042380523, + "grad_norm": 3.674255132675171, + "learning_rate": 1.2353471596032462e-05, + "loss": 0.5922, + "step": 255510 + }, + { + "epoch": 2.2588801074983644, + "grad_norm": 4.263308048248291, + "learning_rate": 1.2351998208360592e-05, + "loss": 0.6364, + "step": 255520 + }, + { + "epoch": 2.2589685107586766, + "grad_norm": 2.226649284362793, + "learning_rate": 1.235052482068872e-05, + "loss": 0.6609, + "step": 255530 + }, + { + "epoch": 2.259056914018989, + "grad_norm": 4.706161022186279, + "learning_rate": 1.234905143301685e-05, + "loss": 0.632, + "step": 255540 + }, + { + "epoch": 2.2591453172793012, + "grad_norm": 4.2752814292907715, + "learning_rate": 1.2347578045344979e-05, + "loss": 0.5526, + "step": 255550 + }, + { + "epoch": 2.2592337205396134, + "grad_norm": 12.716913223266602, + "learning_rate": 1.2346104657673109e-05, + "loss": 0.4833, + "step": 255560 + }, + { + "epoch": 2.259322123799926, + "grad_norm": 15.33011531829834, + "learning_rate": 1.2344631270001237e-05, + "loss": 0.4802, + "step": 255570 + }, + { + "epoch": 2.259410527060238, + "grad_norm": 8.596412658691406, + "learning_rate": 1.2343157882329367e-05, + "loss": 0.6423, + "step": 255580 + }, + { + "epoch": 2.25949893032055, + "grad_norm": 5.074573993682861, + "learning_rate": 1.2341684494657497e-05, + "loss": 0.6476, + "step": 255590 + }, + { + "epoch": 2.2595873335808623, + "grad_norm": 1.3483860492706299, + "learning_rate": 1.2340211106985626e-05, + "loss": 0.4415, + "step": 255600 + }, + { + "epoch": 2.259675736841175, + "grad_norm": 2.9938066005706787, + "learning_rate": 1.2338737719313756e-05, + "loss": 0.4846, + "step": 255610 + }, + { + "epoch": 2.259764140101487, + "grad_norm": 2.0936272144317627, + "learning_rate": 1.2337264331641884e-05, + "loss": 0.5258, + "step": 255620 + }, + { + "epoch": 2.259852543361799, + "grad_norm": 1.044798493385315, + "learning_rate": 1.2335790943970014e-05, + "loss": 0.7325, + "step": 255630 + }, + { + "epoch": 2.2599409466221116, + "grad_norm": 4.462527751922607, + "learning_rate": 1.2334317556298142e-05, + "loss": 0.5379, + "step": 255640 + }, + { + "epoch": 2.2600293498824238, + "grad_norm": 2.3184447288513184, + "learning_rate": 1.2332844168626272e-05, + "loss": 0.5184, + "step": 255650 + }, + { + "epoch": 2.260117753142736, + "grad_norm": 2.1777007579803467, + "learning_rate": 1.23313707809544e-05, + "loss": 0.4173, + "step": 255660 + }, + { + "epoch": 2.260206156403048, + "grad_norm": 1.389666199684143, + "learning_rate": 1.2329897393282531e-05, + "loss": 0.3849, + "step": 255670 + }, + { + "epoch": 2.2602945596633606, + "grad_norm": 3.0835914611816406, + "learning_rate": 1.2328424005610661e-05, + "loss": 0.5389, + "step": 255680 + }, + { + "epoch": 2.2603829629236727, + "grad_norm": 8.09292221069336, + "learning_rate": 1.232695061793879e-05, + "loss": 0.635, + "step": 255690 + }, + { + "epoch": 2.260471366183985, + "grad_norm": 3.8848934173583984, + "learning_rate": 1.232547723026692e-05, + "loss": 0.5702, + "step": 255700 + }, + { + "epoch": 2.260559769444297, + "grad_norm": 1.0839723348617554, + "learning_rate": 1.232400384259505e-05, + "loss": 0.6042, + "step": 255710 + }, + { + "epoch": 2.2606481727046095, + "grad_norm": 2.0277256965637207, + "learning_rate": 1.2322530454923178e-05, + "loss": 0.5336, + "step": 255720 + }, + { + "epoch": 2.2607365759649216, + "grad_norm": 0.8966629505157471, + "learning_rate": 1.2321057067251308e-05, + "loss": 0.5328, + "step": 255730 + }, + { + "epoch": 2.2608249792252337, + "grad_norm": 8.7720308303833, + "learning_rate": 1.2319583679579438e-05, + "loss": 0.4866, + "step": 255740 + }, + { + "epoch": 2.260913382485546, + "grad_norm": 1.3665432929992676, + "learning_rate": 1.2318110291907566e-05, + "loss": 0.5898, + "step": 255750 + }, + { + "epoch": 2.2610017857458584, + "grad_norm": 10.735663414001465, + "learning_rate": 1.2316636904235696e-05, + "loss": 0.5896, + "step": 255760 + }, + { + "epoch": 2.2610901890061705, + "grad_norm": 1.3133783340454102, + "learning_rate": 1.2315163516563826e-05, + "loss": 0.6041, + "step": 255770 + }, + { + "epoch": 2.2611785922664827, + "grad_norm": 3.5805182456970215, + "learning_rate": 1.2313690128891955e-05, + "loss": 0.5229, + "step": 255780 + }, + { + "epoch": 2.261266995526795, + "grad_norm": 4.125779151916504, + "learning_rate": 1.2312216741220085e-05, + "loss": 0.6677, + "step": 255790 + }, + { + "epoch": 2.2613553987871073, + "grad_norm": 4.046088218688965, + "learning_rate": 1.2310743353548213e-05, + "loss": 0.6227, + "step": 255800 + }, + { + "epoch": 2.2614438020474195, + "grad_norm": 2.371525526046753, + "learning_rate": 1.2309269965876343e-05, + "loss": 0.5309, + "step": 255810 + }, + { + "epoch": 2.2615322053077316, + "grad_norm": 5.229541778564453, + "learning_rate": 1.2307796578204471e-05, + "loss": 0.5076, + "step": 255820 + }, + { + "epoch": 2.261620608568044, + "grad_norm": 1.8187837600708008, + "learning_rate": 1.2306323190532601e-05, + "loss": 0.5226, + "step": 255830 + }, + { + "epoch": 2.2617090118283563, + "grad_norm": 5.304487228393555, + "learning_rate": 1.230484980286073e-05, + "loss": 0.542, + "step": 255840 + }, + { + "epoch": 2.2617974150886684, + "grad_norm": 1.9385650157928467, + "learning_rate": 1.230337641518886e-05, + "loss": 0.4593, + "step": 255850 + }, + { + "epoch": 2.261885818348981, + "grad_norm": 31.126737594604492, + "learning_rate": 1.230190302751699e-05, + "loss": 0.5759, + "step": 255860 + }, + { + "epoch": 2.261974221609293, + "grad_norm": 1.3266245126724243, + "learning_rate": 1.2300429639845118e-05, + "loss": 0.4611, + "step": 255870 + }, + { + "epoch": 2.262062624869605, + "grad_norm": 6.022707939147949, + "learning_rate": 1.2298956252173248e-05, + "loss": 0.5145, + "step": 255880 + }, + { + "epoch": 2.2621510281299173, + "grad_norm": 10.080405235290527, + "learning_rate": 1.2297482864501377e-05, + "loss": 0.663, + "step": 255890 + }, + { + "epoch": 2.2622394313902294, + "grad_norm": 3.342852830886841, + "learning_rate": 1.2296009476829507e-05, + "loss": 0.5947, + "step": 255900 + }, + { + "epoch": 2.262327834650542, + "grad_norm": 3.1081442832946777, + "learning_rate": 1.2294536089157635e-05, + "loss": 0.5403, + "step": 255910 + }, + { + "epoch": 2.262416237910854, + "grad_norm": 2.4926364421844482, + "learning_rate": 1.2293062701485765e-05, + "loss": 0.5622, + "step": 255920 + }, + { + "epoch": 2.2625046411711662, + "grad_norm": 6.445644855499268, + "learning_rate": 1.2291589313813893e-05, + "loss": 0.5033, + "step": 255930 + }, + { + "epoch": 2.262593044431479, + "grad_norm": 4.099185466766357, + "learning_rate": 1.2290115926142024e-05, + "loss": 0.4332, + "step": 255940 + }, + { + "epoch": 2.262681447691791, + "grad_norm": 1.075320839881897, + "learning_rate": 1.2288642538470152e-05, + "loss": 0.5947, + "step": 255950 + }, + { + "epoch": 2.262769850952103, + "grad_norm": 1.7791204452514648, + "learning_rate": 1.2287169150798282e-05, + "loss": 0.5089, + "step": 255960 + }, + { + "epoch": 2.262858254212415, + "grad_norm": 2.284778118133545, + "learning_rate": 1.2285695763126412e-05, + "loss": 0.4353, + "step": 255970 + }, + { + "epoch": 2.2629466574727277, + "grad_norm": 2.2140800952911377, + "learning_rate": 1.228422237545454e-05, + "loss": 0.5075, + "step": 255980 + }, + { + "epoch": 2.26303506073304, + "grad_norm": 1.5500562191009521, + "learning_rate": 1.228274898778267e-05, + "loss": 0.7154, + "step": 255990 + }, + { + "epoch": 2.263123463993352, + "grad_norm": 3.9767866134643555, + "learning_rate": 1.2281275600110799e-05, + "loss": 0.5184, + "step": 256000 + }, + { + "epoch": 2.2632118672536645, + "grad_norm": 2.2944114208221436, + "learning_rate": 1.2279802212438929e-05, + "loss": 0.4603, + "step": 256010 + }, + { + "epoch": 2.2633002705139766, + "grad_norm": 4.410185813903809, + "learning_rate": 1.2278328824767057e-05, + "loss": 0.5113, + "step": 256020 + }, + { + "epoch": 2.2633886737742888, + "grad_norm": 4.310354709625244, + "learning_rate": 1.2276855437095187e-05, + "loss": 0.4404, + "step": 256030 + }, + { + "epoch": 2.263477077034601, + "grad_norm": 1.403725504875183, + "learning_rate": 1.2275382049423316e-05, + "loss": 0.5422, + "step": 256040 + }, + { + "epoch": 2.2635654802949134, + "grad_norm": 7.670761585235596, + "learning_rate": 1.2273908661751446e-05, + "loss": 0.4919, + "step": 256050 + }, + { + "epoch": 2.2636538835552256, + "grad_norm": 0.9476408958435059, + "learning_rate": 1.2272435274079576e-05, + "loss": 0.4937, + "step": 256060 + }, + { + "epoch": 2.2637422868155377, + "grad_norm": 18.7071475982666, + "learning_rate": 1.2270961886407704e-05, + "loss": 0.4973, + "step": 256070 + }, + { + "epoch": 2.26383069007585, + "grad_norm": 1.3405749797821045, + "learning_rate": 1.2269488498735834e-05, + "loss": 0.4024, + "step": 256080 + }, + { + "epoch": 2.2639190933361624, + "grad_norm": 1.6818350553512573, + "learning_rate": 1.2268015111063962e-05, + "loss": 0.4933, + "step": 256090 + }, + { + "epoch": 2.2640074965964745, + "grad_norm": 2.4634318351745605, + "learning_rate": 1.2266541723392092e-05, + "loss": 0.4371, + "step": 256100 + }, + { + "epoch": 2.2640958998567866, + "grad_norm": 4.300954818725586, + "learning_rate": 1.226506833572022e-05, + "loss": 0.6437, + "step": 256110 + }, + { + "epoch": 2.2641843031170987, + "grad_norm": 4.779867649078369, + "learning_rate": 1.226359494804835e-05, + "loss": 0.6309, + "step": 256120 + }, + { + "epoch": 2.2642727063774113, + "grad_norm": 1.3897210359573364, + "learning_rate": 1.226212156037648e-05, + "loss": 0.5043, + "step": 256130 + }, + { + "epoch": 2.2643611096377234, + "grad_norm": 3.259211301803589, + "learning_rate": 1.226064817270461e-05, + "loss": 0.5369, + "step": 256140 + }, + { + "epoch": 2.2644495128980355, + "grad_norm": 2.18986177444458, + "learning_rate": 1.225917478503274e-05, + "loss": 0.5238, + "step": 256150 + }, + { + "epoch": 2.264537916158348, + "grad_norm": 2.116725444793701, + "learning_rate": 1.2257701397360868e-05, + "loss": 0.5404, + "step": 256160 + }, + { + "epoch": 2.26462631941866, + "grad_norm": 13.943422317504883, + "learning_rate": 1.2256228009688998e-05, + "loss": 0.4884, + "step": 256170 + }, + { + "epoch": 2.2647147226789723, + "grad_norm": 1.749947428703308, + "learning_rate": 1.2254754622017128e-05, + "loss": 0.4792, + "step": 256180 + }, + { + "epoch": 2.2648031259392845, + "grad_norm": 0.6772105693817139, + "learning_rate": 1.2253281234345256e-05, + "loss": 0.4416, + "step": 256190 + }, + { + "epoch": 2.264891529199597, + "grad_norm": 2.1291120052337646, + "learning_rate": 1.2251807846673386e-05, + "loss": 0.5206, + "step": 256200 + }, + { + "epoch": 2.264979932459909, + "grad_norm": 31.80188751220703, + "learning_rate": 1.2250334459001516e-05, + "loss": 0.568, + "step": 256210 + }, + { + "epoch": 2.2650683357202213, + "grad_norm": 2.914248466491699, + "learning_rate": 1.2248861071329645e-05, + "loss": 0.6213, + "step": 256220 + }, + { + "epoch": 2.265156738980534, + "grad_norm": 2.0712673664093018, + "learning_rate": 1.2247387683657775e-05, + "loss": 0.3931, + "step": 256230 + }, + { + "epoch": 2.265245142240846, + "grad_norm": 2.5015454292297363, + "learning_rate": 1.2245914295985905e-05, + "loss": 0.533, + "step": 256240 + }, + { + "epoch": 2.265333545501158, + "grad_norm": 2.9545364379882812, + "learning_rate": 1.2244440908314033e-05, + "loss": 0.5824, + "step": 256250 + }, + { + "epoch": 2.26542194876147, + "grad_norm": 1.3912540674209595, + "learning_rate": 1.2242967520642163e-05, + "loss": 0.5876, + "step": 256260 + }, + { + "epoch": 2.2655103520217827, + "grad_norm": 3.313477039337158, + "learning_rate": 1.2241494132970291e-05, + "loss": 0.6538, + "step": 256270 + }, + { + "epoch": 2.265598755282095, + "grad_norm": 0.6336605548858643, + "learning_rate": 1.2240020745298421e-05, + "loss": 0.5383, + "step": 256280 + }, + { + "epoch": 2.265687158542407, + "grad_norm": 3.955780029296875, + "learning_rate": 1.223854735762655e-05, + "loss": 0.5119, + "step": 256290 + }, + { + "epoch": 2.265775561802719, + "grad_norm": 3.838972806930542, + "learning_rate": 1.223707396995468e-05, + "loss": 0.485, + "step": 256300 + }, + { + "epoch": 2.2658639650630317, + "grad_norm": 0.9034501314163208, + "learning_rate": 1.2235600582282808e-05, + "loss": 0.5105, + "step": 256310 + }, + { + "epoch": 2.265952368323344, + "grad_norm": 1.616733431816101, + "learning_rate": 1.2234127194610938e-05, + "loss": 0.6425, + "step": 256320 + }, + { + "epoch": 2.266040771583656, + "grad_norm": 2.7299728393554688, + "learning_rate": 1.2232653806939068e-05, + "loss": 0.6411, + "step": 256330 + }, + { + "epoch": 2.266129174843968, + "grad_norm": 5.742094993591309, + "learning_rate": 1.2231180419267197e-05, + "loss": 0.4631, + "step": 256340 + }, + { + "epoch": 2.2662175781042806, + "grad_norm": 10.26740550994873, + "learning_rate": 1.2229707031595327e-05, + "loss": 0.6141, + "step": 256350 + }, + { + "epoch": 2.2663059813645927, + "grad_norm": 4.101613998413086, + "learning_rate": 1.2228233643923455e-05, + "loss": 0.7162, + "step": 256360 + }, + { + "epoch": 2.266394384624905, + "grad_norm": 4.683797359466553, + "learning_rate": 1.2226760256251585e-05, + "loss": 0.4657, + "step": 256370 + }, + { + "epoch": 2.2664827878852174, + "grad_norm": 3.339047431945801, + "learning_rate": 1.2225286868579713e-05, + "loss": 0.5161, + "step": 256380 + }, + { + "epoch": 2.2665711911455295, + "grad_norm": 5.020615577697754, + "learning_rate": 1.2223813480907844e-05, + "loss": 0.5915, + "step": 256390 + }, + { + "epoch": 2.2666595944058416, + "grad_norm": 2.0556511878967285, + "learning_rate": 1.2222340093235972e-05, + "loss": 0.473, + "step": 256400 + }, + { + "epoch": 2.2667479976661538, + "grad_norm": 1.0549181699752808, + "learning_rate": 1.2220866705564102e-05, + "loss": 0.5268, + "step": 256410 + }, + { + "epoch": 2.2668364009264663, + "grad_norm": 1.5736749172210693, + "learning_rate": 1.2219393317892232e-05, + "loss": 0.4928, + "step": 256420 + }, + { + "epoch": 2.2669248041867784, + "grad_norm": 8.89582633972168, + "learning_rate": 1.221791993022036e-05, + "loss": 0.5108, + "step": 256430 + }, + { + "epoch": 2.2670132074470906, + "grad_norm": 4.237354278564453, + "learning_rate": 1.221644654254849e-05, + "loss": 0.4895, + "step": 256440 + }, + { + "epoch": 2.267101610707403, + "grad_norm": 4.074317932128906, + "learning_rate": 1.2214973154876619e-05, + "loss": 0.5062, + "step": 256450 + }, + { + "epoch": 2.2671900139677152, + "grad_norm": 1.5946393013000488, + "learning_rate": 1.2213499767204749e-05, + "loss": 0.443, + "step": 256460 + }, + { + "epoch": 2.2672784172280274, + "grad_norm": 6.549824237823486, + "learning_rate": 1.2212026379532877e-05, + "loss": 0.5719, + "step": 256470 + }, + { + "epoch": 2.2673668204883395, + "grad_norm": 9.953897476196289, + "learning_rate": 1.2210552991861007e-05, + "loss": 0.6029, + "step": 256480 + }, + { + "epoch": 2.2674552237486516, + "grad_norm": 3.770750045776367, + "learning_rate": 1.2209079604189136e-05, + "loss": 0.5227, + "step": 256490 + }, + { + "epoch": 2.267543627008964, + "grad_norm": 3.7640373706817627, + "learning_rate": 1.2207606216517266e-05, + "loss": 0.558, + "step": 256500 + }, + { + "epoch": 2.2676320302692763, + "grad_norm": 4.207760334014893, + "learning_rate": 1.2206132828845394e-05, + "loss": 0.561, + "step": 256510 + }, + { + "epoch": 2.2677204335295884, + "grad_norm": 3.5463318824768066, + "learning_rate": 1.2204659441173524e-05, + "loss": 0.6616, + "step": 256520 + }, + { + "epoch": 2.267808836789901, + "grad_norm": 5.338455677032471, + "learning_rate": 1.2203186053501654e-05, + "loss": 0.4991, + "step": 256530 + }, + { + "epoch": 2.267897240050213, + "grad_norm": 5.483229637145996, + "learning_rate": 1.2201712665829782e-05, + "loss": 0.7103, + "step": 256540 + }, + { + "epoch": 2.267985643310525, + "grad_norm": 2.8585658073425293, + "learning_rate": 1.2200239278157912e-05, + "loss": 0.5162, + "step": 256550 + }, + { + "epoch": 2.2680740465708373, + "grad_norm": 5.535957336425781, + "learning_rate": 1.219876589048604e-05, + "loss": 0.5844, + "step": 256560 + }, + { + "epoch": 2.26816244983115, + "grad_norm": 2.413275957107544, + "learning_rate": 1.219729250281417e-05, + "loss": 0.4197, + "step": 256570 + }, + { + "epoch": 2.268250853091462, + "grad_norm": 5.532749652862549, + "learning_rate": 1.21958191151423e-05, + "loss": 0.61, + "step": 256580 + }, + { + "epoch": 2.268339256351774, + "grad_norm": 1.1271675825119019, + "learning_rate": 1.219434572747043e-05, + "loss": 0.54, + "step": 256590 + }, + { + "epoch": 2.2684276596120867, + "grad_norm": 4.621586322784424, + "learning_rate": 1.2192872339798558e-05, + "loss": 0.5328, + "step": 256600 + }, + { + "epoch": 2.268516062872399, + "grad_norm": 2.8275818824768066, + "learning_rate": 1.2191398952126688e-05, + "loss": 0.5702, + "step": 256610 + }, + { + "epoch": 2.268604466132711, + "grad_norm": 2.2571187019348145, + "learning_rate": 1.2189925564454818e-05, + "loss": 0.469, + "step": 256620 + }, + { + "epoch": 2.268692869393023, + "grad_norm": 2.3537802696228027, + "learning_rate": 1.2188452176782946e-05, + "loss": 0.53, + "step": 256630 + }, + { + "epoch": 2.2687812726533356, + "grad_norm": 0.8906327486038208, + "learning_rate": 1.2186978789111076e-05, + "loss": 0.607, + "step": 256640 + }, + { + "epoch": 2.2688696759136477, + "grad_norm": 15.257574081420898, + "learning_rate": 1.2185505401439206e-05, + "loss": 0.4729, + "step": 256650 + }, + { + "epoch": 2.26895807917396, + "grad_norm": 2.9876699447631836, + "learning_rate": 1.2184032013767334e-05, + "loss": 0.4523, + "step": 256660 + }, + { + "epoch": 2.269046482434272, + "grad_norm": 3.075740098953247, + "learning_rate": 1.2182558626095465e-05, + "loss": 0.5061, + "step": 256670 + }, + { + "epoch": 2.2691348856945845, + "grad_norm": 3.7285847663879395, + "learning_rate": 1.2181085238423595e-05, + "loss": 0.4992, + "step": 256680 + }, + { + "epoch": 2.2692232889548967, + "grad_norm": 1.6629009246826172, + "learning_rate": 1.2179611850751723e-05, + "loss": 0.6155, + "step": 256690 + }, + { + "epoch": 2.269311692215209, + "grad_norm": 2.0929501056671143, + "learning_rate": 1.2178138463079853e-05, + "loss": 0.3886, + "step": 256700 + }, + { + "epoch": 2.269400095475521, + "grad_norm": 3.255666971206665, + "learning_rate": 1.2176665075407983e-05, + "loss": 0.5865, + "step": 256710 + }, + { + "epoch": 2.2694884987358335, + "grad_norm": 5.300893783569336, + "learning_rate": 1.2175191687736111e-05, + "loss": 0.5362, + "step": 256720 + }, + { + "epoch": 2.2695769019961456, + "grad_norm": 3.8484947681427, + "learning_rate": 1.2173718300064241e-05, + "loss": 0.5541, + "step": 256730 + }, + { + "epoch": 2.2696653052564577, + "grad_norm": 12.394248008728027, + "learning_rate": 1.217224491239237e-05, + "loss": 0.5504, + "step": 256740 + }, + { + "epoch": 2.2697537085167703, + "grad_norm": 4.940649032592773, + "learning_rate": 1.21707715247205e-05, + "loss": 0.6111, + "step": 256750 + }, + { + "epoch": 2.2698421117770824, + "grad_norm": 14.621428489685059, + "learning_rate": 1.2169298137048628e-05, + "loss": 0.4877, + "step": 256760 + }, + { + "epoch": 2.2699305150373945, + "grad_norm": 0.7064706087112427, + "learning_rate": 1.2167824749376758e-05, + "loss": 0.5015, + "step": 256770 + }, + { + "epoch": 2.2700189182977066, + "grad_norm": 6.232082843780518, + "learning_rate": 1.2166351361704887e-05, + "loss": 0.5657, + "step": 256780 + }, + { + "epoch": 2.270107321558019, + "grad_norm": 1.979737639427185, + "learning_rate": 1.2164877974033017e-05, + "loss": 0.4194, + "step": 256790 + }, + { + "epoch": 2.2701957248183313, + "grad_norm": 1.1046439409255981, + "learning_rate": 1.2163404586361147e-05, + "loss": 0.5114, + "step": 256800 + }, + { + "epoch": 2.2702841280786434, + "grad_norm": 2.063493490219116, + "learning_rate": 1.2161931198689275e-05, + "loss": 0.5779, + "step": 256810 + }, + { + "epoch": 2.270372531338956, + "grad_norm": 4.440842151641846, + "learning_rate": 1.2160457811017405e-05, + "loss": 0.5486, + "step": 256820 + }, + { + "epoch": 2.270460934599268, + "grad_norm": 1.8029438257217407, + "learning_rate": 1.2158984423345533e-05, + "loss": 0.4739, + "step": 256830 + }, + { + "epoch": 2.2705493378595802, + "grad_norm": 4.93871545791626, + "learning_rate": 1.2157511035673663e-05, + "loss": 0.3891, + "step": 256840 + }, + { + "epoch": 2.2706377411198924, + "grad_norm": 2.8103349208831787, + "learning_rate": 1.2156037648001792e-05, + "loss": 0.6576, + "step": 256850 + }, + { + "epoch": 2.270726144380205, + "grad_norm": 0.7174580097198486, + "learning_rate": 1.2154564260329922e-05, + "loss": 0.5128, + "step": 256860 + }, + { + "epoch": 2.270814547640517, + "grad_norm": 4.446401596069336, + "learning_rate": 1.215309087265805e-05, + "loss": 0.5157, + "step": 256870 + }, + { + "epoch": 2.270902950900829, + "grad_norm": 4.263336181640625, + "learning_rate": 1.215161748498618e-05, + "loss": 0.5936, + "step": 256880 + }, + { + "epoch": 2.2709913541611413, + "grad_norm": 2.9699087142944336, + "learning_rate": 1.215014409731431e-05, + "loss": 0.5945, + "step": 256890 + }, + { + "epoch": 2.271079757421454, + "grad_norm": 8.017019271850586, + "learning_rate": 1.2148670709642439e-05, + "loss": 0.6334, + "step": 256900 + }, + { + "epoch": 2.271168160681766, + "grad_norm": 2.1226766109466553, + "learning_rate": 1.2147197321970569e-05, + "loss": 0.6624, + "step": 256910 + }, + { + "epoch": 2.271256563942078, + "grad_norm": 2.6994259357452393, + "learning_rate": 1.2145723934298697e-05, + "loss": 0.6816, + "step": 256920 + }, + { + "epoch": 2.27134496720239, + "grad_norm": 2.064814805984497, + "learning_rate": 1.2144250546626827e-05, + "loss": 0.7107, + "step": 256930 + }, + { + "epoch": 2.2714333704627028, + "grad_norm": 3.131283760070801, + "learning_rate": 1.2142777158954955e-05, + "loss": 0.5547, + "step": 256940 + }, + { + "epoch": 2.271521773723015, + "grad_norm": 3.0357348918914795, + "learning_rate": 1.2141303771283086e-05, + "loss": 0.5, + "step": 256950 + }, + { + "epoch": 2.271610176983327, + "grad_norm": 6.600270748138428, + "learning_rate": 1.2139830383611214e-05, + "loss": 0.5161, + "step": 256960 + }, + { + "epoch": 2.2716985802436396, + "grad_norm": 0.9859300255775452, + "learning_rate": 1.2138356995939344e-05, + "loss": 0.4942, + "step": 256970 + }, + { + "epoch": 2.2717869835039517, + "grad_norm": 5.276820182800293, + "learning_rate": 1.2136883608267474e-05, + "loss": 0.6758, + "step": 256980 + }, + { + "epoch": 2.271875386764264, + "grad_norm": 14.828718185424805, + "learning_rate": 1.2135410220595602e-05, + "loss": 0.5689, + "step": 256990 + }, + { + "epoch": 2.271963790024576, + "grad_norm": 2.359741687774658, + "learning_rate": 1.2133936832923732e-05, + "loss": 0.5748, + "step": 257000 + }, + { + "epoch": 2.2720521932848885, + "grad_norm": 3.5737345218658447, + "learning_rate": 1.213246344525186e-05, + "loss": 0.5599, + "step": 257010 + }, + { + "epoch": 2.2721405965452006, + "grad_norm": 3.130422592163086, + "learning_rate": 1.213099005757999e-05, + "loss": 0.5793, + "step": 257020 + }, + { + "epoch": 2.2722289998055127, + "grad_norm": 2.18151593208313, + "learning_rate": 1.2129516669908119e-05, + "loss": 0.5026, + "step": 257030 + }, + { + "epoch": 2.2723174030658253, + "grad_norm": 1.1375659704208374, + "learning_rate": 1.212804328223625e-05, + "loss": 0.557, + "step": 257040 + }, + { + "epoch": 2.2724058063261374, + "grad_norm": 8.349773406982422, + "learning_rate": 1.2126569894564378e-05, + "loss": 0.5494, + "step": 257050 + }, + { + "epoch": 2.2724942095864495, + "grad_norm": 1.8922837972640991, + "learning_rate": 1.2125096506892508e-05, + "loss": 0.4477, + "step": 257060 + }, + { + "epoch": 2.2725826128467617, + "grad_norm": 2.655022144317627, + "learning_rate": 1.2123623119220636e-05, + "loss": 0.4935, + "step": 257070 + }, + { + "epoch": 2.272671016107074, + "grad_norm": 2.261911153793335, + "learning_rate": 1.2122149731548766e-05, + "loss": 0.558, + "step": 257080 + }, + { + "epoch": 2.2727594193673863, + "grad_norm": 8.469675064086914, + "learning_rate": 1.2120676343876896e-05, + "loss": 0.612, + "step": 257090 + }, + { + "epoch": 2.2728478226276985, + "grad_norm": 11.797723770141602, + "learning_rate": 1.2119202956205024e-05, + "loss": 0.6052, + "step": 257100 + }, + { + "epoch": 2.2729362258880106, + "grad_norm": 4.545322418212891, + "learning_rate": 1.2117729568533154e-05, + "loss": 0.5101, + "step": 257110 + }, + { + "epoch": 2.273024629148323, + "grad_norm": 4.627824306488037, + "learning_rate": 1.2116256180861284e-05, + "loss": 0.6319, + "step": 257120 + }, + { + "epoch": 2.2731130324086353, + "grad_norm": 2.59601092338562, + "learning_rate": 1.2114782793189413e-05, + "loss": 0.5022, + "step": 257130 + }, + { + "epoch": 2.2732014356689474, + "grad_norm": 6.187968730926514, + "learning_rate": 1.2113309405517543e-05, + "loss": 0.473, + "step": 257140 + }, + { + "epoch": 2.2732898389292595, + "grad_norm": 3.958462715148926, + "learning_rate": 1.2111836017845673e-05, + "loss": 0.506, + "step": 257150 + }, + { + "epoch": 2.273378242189572, + "grad_norm": 12.807205200195312, + "learning_rate": 1.2110362630173801e-05, + "loss": 0.4751, + "step": 257160 + }, + { + "epoch": 2.273466645449884, + "grad_norm": 7.627584934234619, + "learning_rate": 1.2108889242501931e-05, + "loss": 0.4843, + "step": 257170 + }, + { + "epoch": 2.2735550487101963, + "grad_norm": 1.9021185636520386, + "learning_rate": 1.2107415854830061e-05, + "loss": 0.5799, + "step": 257180 + }, + { + "epoch": 2.273643451970509, + "grad_norm": 2.973775625228882, + "learning_rate": 1.210594246715819e-05, + "loss": 0.5653, + "step": 257190 + }, + { + "epoch": 2.273731855230821, + "grad_norm": 3.4285504817962646, + "learning_rate": 1.210446907948632e-05, + "loss": 0.5013, + "step": 257200 + }, + { + "epoch": 2.273820258491133, + "grad_norm": 5.657044887542725, + "learning_rate": 1.2102995691814448e-05, + "loss": 0.5618, + "step": 257210 + }, + { + "epoch": 2.2739086617514452, + "grad_norm": 3.053189992904663, + "learning_rate": 1.2101522304142578e-05, + "loss": 0.5113, + "step": 257220 + }, + { + "epoch": 2.273997065011758, + "grad_norm": 5.467819690704346, + "learning_rate": 1.2100048916470707e-05, + "loss": 0.4871, + "step": 257230 + }, + { + "epoch": 2.27408546827207, + "grad_norm": 5.454168319702148, + "learning_rate": 1.2098575528798837e-05, + "loss": 0.5247, + "step": 257240 + }, + { + "epoch": 2.274173871532382, + "grad_norm": 3.0872576236724854, + "learning_rate": 1.2097102141126965e-05, + "loss": 0.5357, + "step": 257250 + }, + { + "epoch": 2.274262274792694, + "grad_norm": 2.358774185180664, + "learning_rate": 1.2095628753455095e-05, + "loss": 0.4578, + "step": 257260 + }, + { + "epoch": 2.2743506780530067, + "grad_norm": 2.785620927810669, + "learning_rate": 1.2094155365783225e-05, + "loss": 0.5672, + "step": 257270 + }, + { + "epoch": 2.274439081313319, + "grad_norm": 7.234309196472168, + "learning_rate": 1.2092681978111353e-05, + "loss": 0.4416, + "step": 257280 + }, + { + "epoch": 2.274527484573631, + "grad_norm": 3.061631917953491, + "learning_rate": 1.2091208590439483e-05, + "loss": 0.5965, + "step": 257290 + }, + { + "epoch": 2.274615887833943, + "grad_norm": 3.1282992362976074, + "learning_rate": 1.2089735202767612e-05, + "loss": 0.5278, + "step": 257300 + }, + { + "epoch": 2.2747042910942556, + "grad_norm": 7.260063648223877, + "learning_rate": 1.2088261815095742e-05, + "loss": 0.6603, + "step": 257310 + }, + { + "epoch": 2.2747926943545678, + "grad_norm": 3.4713950157165527, + "learning_rate": 1.208678842742387e-05, + "loss": 0.5292, + "step": 257320 + }, + { + "epoch": 2.27488109761488, + "grad_norm": 3.445079803466797, + "learning_rate": 1.2085315039752e-05, + "loss": 0.4907, + "step": 257330 + }, + { + "epoch": 2.2749695008751925, + "grad_norm": 3.704183340072632, + "learning_rate": 1.2083841652080129e-05, + "loss": 0.5658, + "step": 257340 + }, + { + "epoch": 2.2750579041355046, + "grad_norm": 5.850945949554443, + "learning_rate": 1.2082368264408259e-05, + "loss": 0.5413, + "step": 257350 + }, + { + "epoch": 2.2751463073958167, + "grad_norm": 3.839923620223999, + "learning_rate": 1.2080894876736389e-05, + "loss": 0.5699, + "step": 257360 + }, + { + "epoch": 2.275234710656129, + "grad_norm": 2.3737738132476807, + "learning_rate": 1.2079421489064517e-05, + "loss": 0.4837, + "step": 257370 + }, + { + "epoch": 2.2753231139164414, + "grad_norm": 3.6750924587249756, + "learning_rate": 1.2077948101392647e-05, + "loss": 0.5249, + "step": 257380 + }, + { + "epoch": 2.2754115171767535, + "grad_norm": 2.100832939147949, + "learning_rate": 1.2076474713720775e-05, + "loss": 0.5758, + "step": 257390 + }, + { + "epoch": 2.2754999204370656, + "grad_norm": 2.5104315280914307, + "learning_rate": 1.2075001326048906e-05, + "loss": 0.5083, + "step": 257400 + }, + { + "epoch": 2.275588323697378, + "grad_norm": 7.526862144470215, + "learning_rate": 1.2073527938377034e-05, + "loss": 0.5499, + "step": 257410 + }, + { + "epoch": 2.2756767269576903, + "grad_norm": 3.375678777694702, + "learning_rate": 1.2072054550705164e-05, + "loss": 0.5774, + "step": 257420 + }, + { + "epoch": 2.2757651302180024, + "grad_norm": 5.9180498123168945, + "learning_rate": 1.2070581163033292e-05, + "loss": 0.4259, + "step": 257430 + }, + { + "epoch": 2.2758535334783145, + "grad_norm": 1.4465411901474, + "learning_rate": 1.2069107775361422e-05, + "loss": 0.4919, + "step": 257440 + }, + { + "epoch": 2.275941936738627, + "grad_norm": 3.994340181350708, + "learning_rate": 1.2067634387689552e-05, + "loss": 0.3891, + "step": 257450 + }, + { + "epoch": 2.2760303399989392, + "grad_norm": 3.8028199672698975, + "learning_rate": 1.206616100001768e-05, + "loss": 0.5627, + "step": 257460 + }, + { + "epoch": 2.2761187432592513, + "grad_norm": 2.522184371948242, + "learning_rate": 1.206468761234581e-05, + "loss": 0.4983, + "step": 257470 + }, + { + "epoch": 2.2762071465195635, + "grad_norm": 3.2262680530548096, + "learning_rate": 1.2063214224673939e-05, + "loss": 0.5612, + "step": 257480 + }, + { + "epoch": 2.276295549779876, + "grad_norm": 3.436816930770874, + "learning_rate": 1.206174083700207e-05, + "loss": 0.6102, + "step": 257490 + }, + { + "epoch": 2.276383953040188, + "grad_norm": 1.9686957597732544, + "learning_rate": 1.2060267449330198e-05, + "loss": 0.5994, + "step": 257500 + }, + { + "epoch": 2.2764723563005003, + "grad_norm": 8.895249366760254, + "learning_rate": 1.2058794061658328e-05, + "loss": 0.5862, + "step": 257510 + }, + { + "epoch": 2.2765607595608124, + "grad_norm": 2.765110969543457, + "learning_rate": 1.2057320673986456e-05, + "loss": 0.7231, + "step": 257520 + }, + { + "epoch": 2.276649162821125, + "grad_norm": 1.8135273456573486, + "learning_rate": 1.2055847286314586e-05, + "loss": 0.4574, + "step": 257530 + }, + { + "epoch": 2.276737566081437, + "grad_norm": 2.70473051071167, + "learning_rate": 1.2054373898642716e-05, + "loss": 0.4871, + "step": 257540 + }, + { + "epoch": 2.276825969341749, + "grad_norm": 2.354494094848633, + "learning_rate": 1.2052900510970844e-05, + "loss": 0.4677, + "step": 257550 + }, + { + "epoch": 2.2769143726020618, + "grad_norm": 0.8699732422828674, + "learning_rate": 1.2051427123298974e-05, + "loss": 0.3962, + "step": 257560 + }, + { + "epoch": 2.277002775862374, + "grad_norm": 2.9168198108673096, + "learning_rate": 1.2049953735627103e-05, + "loss": 0.5364, + "step": 257570 + }, + { + "epoch": 2.277091179122686, + "grad_norm": 15.934507369995117, + "learning_rate": 1.2048480347955233e-05, + "loss": 0.4928, + "step": 257580 + }, + { + "epoch": 2.277179582382998, + "grad_norm": 1.9501093626022339, + "learning_rate": 1.2047006960283363e-05, + "loss": 0.4654, + "step": 257590 + }, + { + "epoch": 2.2772679856433107, + "grad_norm": 2.8067963123321533, + "learning_rate": 1.2045533572611491e-05, + "loss": 0.4312, + "step": 257600 + }, + { + "epoch": 2.277356388903623, + "grad_norm": 3.625459909439087, + "learning_rate": 1.2044060184939621e-05, + "loss": 0.6171, + "step": 257610 + }, + { + "epoch": 2.277444792163935, + "grad_norm": 1.984890341758728, + "learning_rate": 1.2042586797267751e-05, + "loss": 0.6302, + "step": 257620 + }, + { + "epoch": 2.2775331954242475, + "grad_norm": 0.6746512055397034, + "learning_rate": 1.204111340959588e-05, + "loss": 0.5051, + "step": 257630 + }, + { + "epoch": 2.2776215986845596, + "grad_norm": 2.4498205184936523, + "learning_rate": 1.203964002192401e-05, + "loss": 0.4885, + "step": 257640 + }, + { + "epoch": 2.2777100019448717, + "grad_norm": 3.004611015319824, + "learning_rate": 1.203816663425214e-05, + "loss": 0.6117, + "step": 257650 + }, + { + "epoch": 2.277798405205184, + "grad_norm": 3.427130937576294, + "learning_rate": 1.2036693246580268e-05, + "loss": 0.4036, + "step": 257660 + }, + { + "epoch": 2.277886808465496, + "grad_norm": 5.553164482116699, + "learning_rate": 1.2035219858908398e-05, + "loss": 0.4981, + "step": 257670 + }, + { + "epoch": 2.2779752117258085, + "grad_norm": 1.5807390213012695, + "learning_rate": 1.2033746471236527e-05, + "loss": 0.4916, + "step": 257680 + }, + { + "epoch": 2.2780636149861206, + "grad_norm": 1.3949272632598877, + "learning_rate": 1.2032273083564657e-05, + "loss": 0.6158, + "step": 257690 + }, + { + "epoch": 2.2781520182464328, + "grad_norm": 21.30250358581543, + "learning_rate": 1.2030799695892785e-05, + "loss": 0.5937, + "step": 257700 + }, + { + "epoch": 2.2782404215067453, + "grad_norm": 7.827817916870117, + "learning_rate": 1.2029326308220915e-05, + "loss": 0.5007, + "step": 257710 + }, + { + "epoch": 2.2783288247670574, + "grad_norm": 4.24570369720459, + "learning_rate": 1.2027852920549043e-05, + "loss": 0.5061, + "step": 257720 + }, + { + "epoch": 2.2784172280273696, + "grad_norm": 3.041670560836792, + "learning_rate": 1.2026379532877173e-05, + "loss": 0.5823, + "step": 257730 + }, + { + "epoch": 2.2785056312876817, + "grad_norm": 6.5597615242004395, + "learning_rate": 1.2024906145205303e-05, + "loss": 0.512, + "step": 257740 + }, + { + "epoch": 2.2785940345479943, + "grad_norm": 2.318195104598999, + "learning_rate": 1.2023432757533432e-05, + "loss": 0.579, + "step": 257750 + }, + { + "epoch": 2.2786824378083064, + "grad_norm": 1.8276273012161255, + "learning_rate": 1.2021959369861562e-05, + "loss": 0.5296, + "step": 257760 + }, + { + "epoch": 2.2787708410686185, + "grad_norm": 10.683591842651367, + "learning_rate": 1.202048598218969e-05, + "loss": 0.5555, + "step": 257770 + }, + { + "epoch": 2.278859244328931, + "grad_norm": 1.8185906410217285, + "learning_rate": 1.201901259451782e-05, + "loss": 0.485, + "step": 257780 + }, + { + "epoch": 2.278947647589243, + "grad_norm": 2.2257261276245117, + "learning_rate": 1.2017539206845949e-05, + "loss": 0.5491, + "step": 257790 + }, + { + "epoch": 2.2790360508495553, + "grad_norm": 16.583499908447266, + "learning_rate": 1.2016065819174079e-05, + "loss": 0.507, + "step": 257800 + }, + { + "epoch": 2.2791244541098674, + "grad_norm": 1.5681357383728027, + "learning_rate": 1.2014592431502207e-05, + "loss": 0.4877, + "step": 257810 + }, + { + "epoch": 2.27921285737018, + "grad_norm": 2.391857624053955, + "learning_rate": 1.2013119043830337e-05, + "loss": 0.5696, + "step": 257820 + }, + { + "epoch": 2.279301260630492, + "grad_norm": 3.5998051166534424, + "learning_rate": 1.2011645656158467e-05, + "loss": 0.4974, + "step": 257830 + }, + { + "epoch": 2.279389663890804, + "grad_norm": 4.532112121582031, + "learning_rate": 1.2010172268486595e-05, + "loss": 0.5149, + "step": 257840 + }, + { + "epoch": 2.279478067151117, + "grad_norm": 3.8546736240386963, + "learning_rate": 1.2008698880814725e-05, + "loss": 0.5484, + "step": 257850 + }, + { + "epoch": 2.279566470411429, + "grad_norm": 2.537308931350708, + "learning_rate": 1.2007225493142854e-05, + "loss": 0.6183, + "step": 257860 + }, + { + "epoch": 2.279654873671741, + "grad_norm": 1.743908405303955, + "learning_rate": 1.2005752105470984e-05, + "loss": 0.4728, + "step": 257870 + }, + { + "epoch": 2.279743276932053, + "grad_norm": 2.922927141189575, + "learning_rate": 1.2004278717799112e-05, + "loss": 0.4872, + "step": 257880 + }, + { + "epoch": 2.2798316801923653, + "grad_norm": 4.022650718688965, + "learning_rate": 1.2002805330127242e-05, + "loss": 0.5466, + "step": 257890 + }, + { + "epoch": 2.279920083452678, + "grad_norm": 3.061516761779785, + "learning_rate": 1.200133194245537e-05, + "loss": 0.5321, + "step": 257900 + }, + { + "epoch": 2.28000848671299, + "grad_norm": 5.341427326202393, + "learning_rate": 1.19998585547835e-05, + "loss": 0.5644, + "step": 257910 + }, + { + "epoch": 2.280096889973302, + "grad_norm": 7.379515171051025, + "learning_rate": 1.199838516711163e-05, + "loss": 0.5072, + "step": 257920 + }, + { + "epoch": 2.2801852932336146, + "grad_norm": 7.7087721824646, + "learning_rate": 1.1996911779439759e-05, + "loss": 0.5109, + "step": 257930 + }, + { + "epoch": 2.2802736964939267, + "grad_norm": 4.801506996154785, + "learning_rate": 1.1995438391767889e-05, + "loss": 0.5138, + "step": 257940 + }, + { + "epoch": 2.280362099754239, + "grad_norm": 1.6994314193725586, + "learning_rate": 1.1993965004096017e-05, + "loss": 0.4416, + "step": 257950 + }, + { + "epoch": 2.280450503014551, + "grad_norm": 1.6736160516738892, + "learning_rate": 1.1992491616424148e-05, + "loss": 0.5297, + "step": 257960 + }, + { + "epoch": 2.2805389062748636, + "grad_norm": 3.1785218715667725, + "learning_rate": 1.1991018228752276e-05, + "loss": 0.4632, + "step": 257970 + }, + { + "epoch": 2.2806273095351757, + "grad_norm": 3.2986831665039062, + "learning_rate": 1.1989544841080406e-05, + "loss": 0.4605, + "step": 257980 + }, + { + "epoch": 2.280715712795488, + "grad_norm": 4.126906394958496, + "learning_rate": 1.1988071453408534e-05, + "loss": 0.4339, + "step": 257990 + }, + { + "epoch": 2.2808041160558004, + "grad_norm": 4.183578968048096, + "learning_rate": 1.1986598065736664e-05, + "loss": 0.4532, + "step": 258000 + }, + { + "epoch": 2.2808925193161125, + "grad_norm": 3.4795427322387695, + "learning_rate": 1.1985124678064794e-05, + "loss": 0.5329, + "step": 258010 + }, + { + "epoch": 2.2809809225764246, + "grad_norm": 6.167972564697266, + "learning_rate": 1.1983651290392923e-05, + "loss": 0.5113, + "step": 258020 + }, + { + "epoch": 2.2810693258367367, + "grad_norm": 2.9759292602539062, + "learning_rate": 1.1982177902721053e-05, + "loss": 0.4586, + "step": 258030 + }, + { + "epoch": 2.2811577290970493, + "grad_norm": 6.749821186065674, + "learning_rate": 1.1980704515049183e-05, + "loss": 0.5645, + "step": 258040 + }, + { + "epoch": 2.2812461323573614, + "grad_norm": 10.755722045898438, + "learning_rate": 1.1979231127377311e-05, + "loss": 0.4966, + "step": 258050 + }, + { + "epoch": 2.2813345356176735, + "grad_norm": 10.28490161895752, + "learning_rate": 1.1977757739705441e-05, + "loss": 0.5233, + "step": 258060 + }, + { + "epoch": 2.2814229388779856, + "grad_norm": 3.647273302078247, + "learning_rate": 1.1976284352033571e-05, + "loss": 0.5903, + "step": 258070 + }, + { + "epoch": 2.281511342138298, + "grad_norm": 2.8119008541107178, + "learning_rate": 1.19748109643617e-05, + "loss": 0.5792, + "step": 258080 + }, + { + "epoch": 2.2815997453986103, + "grad_norm": 4.545162677764893, + "learning_rate": 1.197333757668983e-05, + "loss": 0.562, + "step": 258090 + }, + { + "epoch": 2.2816881486589224, + "grad_norm": 2.222421407699585, + "learning_rate": 1.1971864189017958e-05, + "loss": 0.5717, + "step": 258100 + }, + { + "epoch": 2.2817765519192346, + "grad_norm": 3.116619110107422, + "learning_rate": 1.1970390801346088e-05, + "loss": 0.5473, + "step": 258110 + }, + { + "epoch": 2.281864955179547, + "grad_norm": 4.362210750579834, + "learning_rate": 1.1968917413674218e-05, + "loss": 0.4792, + "step": 258120 + }, + { + "epoch": 2.2819533584398592, + "grad_norm": 3.3410651683807373, + "learning_rate": 1.1967444026002347e-05, + "loss": 0.4871, + "step": 258130 + }, + { + "epoch": 2.2820417617001714, + "grad_norm": 5.461167812347412, + "learning_rate": 1.1965970638330477e-05, + "loss": 0.5307, + "step": 258140 + }, + { + "epoch": 2.282130164960484, + "grad_norm": 4.457417011260986, + "learning_rate": 1.1964497250658605e-05, + "loss": 0.6861, + "step": 258150 + }, + { + "epoch": 2.282218568220796, + "grad_norm": 11.809749603271484, + "learning_rate": 1.1963023862986735e-05, + "loss": 0.5524, + "step": 258160 + }, + { + "epoch": 2.282306971481108, + "grad_norm": 1.8864740133285522, + "learning_rate": 1.1961550475314863e-05, + "loss": 0.542, + "step": 258170 + }, + { + "epoch": 2.2823953747414203, + "grad_norm": 3.415985107421875, + "learning_rate": 1.1960077087642993e-05, + "loss": 0.4811, + "step": 258180 + }, + { + "epoch": 2.282483778001733, + "grad_norm": 3.3992748260498047, + "learning_rate": 1.1958603699971122e-05, + "loss": 0.5767, + "step": 258190 + }, + { + "epoch": 2.282572181262045, + "grad_norm": 2.3915534019470215, + "learning_rate": 1.1957130312299252e-05, + "loss": 0.4374, + "step": 258200 + }, + { + "epoch": 2.282660584522357, + "grad_norm": 2.4218013286590576, + "learning_rate": 1.1955656924627382e-05, + "loss": 0.5352, + "step": 258210 + }, + { + "epoch": 2.2827489877826697, + "grad_norm": 20.22212791442871, + "learning_rate": 1.195418353695551e-05, + "loss": 0.4714, + "step": 258220 + }, + { + "epoch": 2.282837391042982, + "grad_norm": 4.120296955108643, + "learning_rate": 1.195271014928364e-05, + "loss": 0.5105, + "step": 258230 + }, + { + "epoch": 2.282925794303294, + "grad_norm": 5.363488674163818, + "learning_rate": 1.1951236761611769e-05, + "loss": 0.4162, + "step": 258240 + }, + { + "epoch": 2.283014197563606, + "grad_norm": 4.634644985198975, + "learning_rate": 1.1949763373939899e-05, + "loss": 0.43, + "step": 258250 + }, + { + "epoch": 2.283102600823918, + "grad_norm": 5.528831481933594, + "learning_rate": 1.1948289986268027e-05, + "loss": 0.3917, + "step": 258260 + }, + { + "epoch": 2.2831910040842307, + "grad_norm": 2.7219016551971436, + "learning_rate": 1.1946816598596157e-05, + "loss": 0.4736, + "step": 258270 + }, + { + "epoch": 2.283279407344543, + "grad_norm": 1.8297898769378662, + "learning_rate": 1.1945343210924285e-05, + "loss": 0.5504, + "step": 258280 + }, + { + "epoch": 2.283367810604855, + "grad_norm": 3.397207498550415, + "learning_rate": 1.1943869823252415e-05, + "loss": 0.6746, + "step": 258290 + }, + { + "epoch": 2.2834562138651675, + "grad_norm": 8.466011047363281, + "learning_rate": 1.1942396435580545e-05, + "loss": 0.5444, + "step": 258300 + }, + { + "epoch": 2.2835446171254796, + "grad_norm": 1.5106641054153442, + "learning_rate": 1.1940923047908674e-05, + "loss": 0.5551, + "step": 258310 + }, + { + "epoch": 2.2836330203857917, + "grad_norm": 3.4426984786987305, + "learning_rate": 1.1939449660236804e-05, + "loss": 0.5531, + "step": 258320 + }, + { + "epoch": 2.283721423646104, + "grad_norm": 1.59598708152771, + "learning_rate": 1.1937976272564932e-05, + "loss": 0.4471, + "step": 258330 + }, + { + "epoch": 2.2838098269064164, + "grad_norm": 1.3417335748672485, + "learning_rate": 1.1936502884893062e-05, + "loss": 0.564, + "step": 258340 + }, + { + "epoch": 2.2838982301667285, + "grad_norm": 3.8677334785461426, + "learning_rate": 1.193502949722119e-05, + "loss": 0.4324, + "step": 258350 + }, + { + "epoch": 2.2839866334270407, + "grad_norm": 4.39150857925415, + "learning_rate": 1.193355610954932e-05, + "loss": 0.499, + "step": 258360 + }, + { + "epoch": 2.2840750366873532, + "grad_norm": 3.9934914112091064, + "learning_rate": 1.1932082721877449e-05, + "loss": 0.5003, + "step": 258370 + }, + { + "epoch": 2.2841634399476654, + "grad_norm": 9.883804321289062, + "learning_rate": 1.1930609334205579e-05, + "loss": 0.6834, + "step": 258380 + }, + { + "epoch": 2.2842518432079775, + "grad_norm": 2.0604612827301025, + "learning_rate": 1.1929135946533709e-05, + "loss": 0.5774, + "step": 258390 + }, + { + "epoch": 2.2843402464682896, + "grad_norm": 1.7123113870620728, + "learning_rate": 1.1927662558861837e-05, + "loss": 0.5205, + "step": 258400 + }, + { + "epoch": 2.284428649728602, + "grad_norm": 1.9771491289138794, + "learning_rate": 1.1926189171189968e-05, + "loss": 0.5431, + "step": 258410 + }, + { + "epoch": 2.2845170529889143, + "grad_norm": 3.4352104663848877, + "learning_rate": 1.1924715783518096e-05, + "loss": 0.5851, + "step": 258420 + }, + { + "epoch": 2.2846054562492264, + "grad_norm": 1.6965303421020508, + "learning_rate": 1.1923242395846226e-05, + "loss": 0.4958, + "step": 258430 + }, + { + "epoch": 2.284693859509539, + "grad_norm": 1.9104902744293213, + "learning_rate": 1.1921769008174354e-05, + "loss": 0.5554, + "step": 258440 + }, + { + "epoch": 2.284782262769851, + "grad_norm": 2.674818277359009, + "learning_rate": 1.1920295620502484e-05, + "loss": 0.5183, + "step": 258450 + }, + { + "epoch": 2.284870666030163, + "grad_norm": 3.179213047027588, + "learning_rate": 1.1918822232830613e-05, + "loss": 0.4555, + "step": 258460 + }, + { + "epoch": 2.2849590692904753, + "grad_norm": 3.423804998397827, + "learning_rate": 1.1917348845158743e-05, + "loss": 0.5412, + "step": 258470 + }, + { + "epoch": 2.2850474725507874, + "grad_norm": 2.788823366165161, + "learning_rate": 1.1915875457486873e-05, + "loss": 0.616, + "step": 258480 + }, + { + "epoch": 2.2851358758111, + "grad_norm": 42.05954360961914, + "learning_rate": 1.1914402069815001e-05, + "loss": 0.6124, + "step": 258490 + }, + { + "epoch": 2.285224279071412, + "grad_norm": 10.734573364257812, + "learning_rate": 1.1912928682143131e-05, + "loss": 0.467, + "step": 258500 + }, + { + "epoch": 2.2853126823317242, + "grad_norm": 6.3564629554748535, + "learning_rate": 1.1911455294471261e-05, + "loss": 0.4821, + "step": 258510 + }, + { + "epoch": 2.285401085592037, + "grad_norm": 3.089716672897339, + "learning_rate": 1.190998190679939e-05, + "loss": 0.4974, + "step": 258520 + }, + { + "epoch": 2.285489488852349, + "grad_norm": 2.300266742706299, + "learning_rate": 1.190850851912752e-05, + "loss": 0.536, + "step": 258530 + }, + { + "epoch": 2.285577892112661, + "grad_norm": 1.7699977159500122, + "learning_rate": 1.190703513145565e-05, + "loss": 0.5838, + "step": 258540 + }, + { + "epoch": 2.285666295372973, + "grad_norm": 1.4589766263961792, + "learning_rate": 1.1905561743783778e-05, + "loss": 0.524, + "step": 258550 + }, + { + "epoch": 2.2857546986332857, + "grad_norm": 14.989256858825684, + "learning_rate": 1.1904088356111908e-05, + "loss": 0.6197, + "step": 258560 + }, + { + "epoch": 2.285843101893598, + "grad_norm": 3.5018091201782227, + "learning_rate": 1.1902614968440038e-05, + "loss": 0.5282, + "step": 258570 + }, + { + "epoch": 2.28593150515391, + "grad_norm": 2.7327961921691895, + "learning_rate": 1.1901141580768166e-05, + "loss": 0.5886, + "step": 258580 + }, + { + "epoch": 2.2860199084142225, + "grad_norm": 2.5236175060272217, + "learning_rate": 1.1899668193096297e-05, + "loss": 0.5264, + "step": 258590 + }, + { + "epoch": 2.2861083116745347, + "grad_norm": 0.9406690001487732, + "learning_rate": 1.1898194805424425e-05, + "loss": 0.4416, + "step": 258600 + }, + { + "epoch": 2.2861967149348468, + "grad_norm": 5.510041236877441, + "learning_rate": 1.1896721417752555e-05, + "loss": 0.6049, + "step": 258610 + }, + { + "epoch": 2.286285118195159, + "grad_norm": 3.0902516841888428, + "learning_rate": 1.1895248030080683e-05, + "loss": 0.4187, + "step": 258620 + }, + { + "epoch": 2.2863735214554715, + "grad_norm": 1.1525007486343384, + "learning_rate": 1.1893774642408813e-05, + "loss": 0.3806, + "step": 258630 + }, + { + "epoch": 2.2864619247157836, + "grad_norm": 1.0578961372375488, + "learning_rate": 1.1892301254736942e-05, + "loss": 0.4561, + "step": 258640 + }, + { + "epoch": 2.2865503279760957, + "grad_norm": 1.8693323135375977, + "learning_rate": 1.1890827867065072e-05, + "loss": 0.5813, + "step": 258650 + }, + { + "epoch": 2.286638731236408, + "grad_norm": 6.01668119430542, + "learning_rate": 1.18893544793932e-05, + "loss": 0.5522, + "step": 258660 + }, + { + "epoch": 2.2867271344967204, + "grad_norm": 2.4038376808166504, + "learning_rate": 1.188788109172133e-05, + "loss": 0.4428, + "step": 258670 + }, + { + "epoch": 2.2868155377570325, + "grad_norm": 4.510138988494873, + "learning_rate": 1.188640770404946e-05, + "loss": 0.4426, + "step": 258680 + }, + { + "epoch": 2.2869039410173446, + "grad_norm": 1.566523790359497, + "learning_rate": 1.1884934316377589e-05, + "loss": 0.4735, + "step": 258690 + }, + { + "epoch": 2.2869923442776567, + "grad_norm": 2.271803855895996, + "learning_rate": 1.1883460928705719e-05, + "loss": 0.6069, + "step": 258700 + }, + { + "epoch": 2.2870807475379693, + "grad_norm": 1.780442237854004, + "learning_rate": 1.1881987541033847e-05, + "loss": 0.4849, + "step": 258710 + }, + { + "epoch": 2.2871691507982814, + "grad_norm": 22.652313232421875, + "learning_rate": 1.1880514153361977e-05, + "loss": 0.5077, + "step": 258720 + }, + { + "epoch": 2.2872575540585935, + "grad_norm": 5.076672077178955, + "learning_rate": 1.1879040765690105e-05, + "loss": 0.5224, + "step": 258730 + }, + { + "epoch": 2.287345957318906, + "grad_norm": 1.894026279449463, + "learning_rate": 1.1877567378018235e-05, + "loss": 0.4461, + "step": 258740 + }, + { + "epoch": 2.2874343605792182, + "grad_norm": 3.7971251010894775, + "learning_rate": 1.1876093990346364e-05, + "loss": 0.621, + "step": 258750 + }, + { + "epoch": 2.2875227638395303, + "grad_norm": 11.883145332336426, + "learning_rate": 1.1874620602674494e-05, + "loss": 0.4764, + "step": 258760 + }, + { + "epoch": 2.2876111670998425, + "grad_norm": 1.9330343008041382, + "learning_rate": 1.1873147215002624e-05, + "loss": 0.6377, + "step": 258770 + }, + { + "epoch": 2.287699570360155, + "grad_norm": 1.7178391218185425, + "learning_rate": 1.1871673827330752e-05, + "loss": 0.4984, + "step": 258780 + }, + { + "epoch": 2.287787973620467, + "grad_norm": 3.2954182624816895, + "learning_rate": 1.1870200439658882e-05, + "loss": 0.5428, + "step": 258790 + }, + { + "epoch": 2.2878763768807793, + "grad_norm": 1.9847383499145508, + "learning_rate": 1.186872705198701e-05, + "loss": 0.4789, + "step": 258800 + }, + { + "epoch": 2.287964780141092, + "grad_norm": 5.910218715667725, + "learning_rate": 1.186725366431514e-05, + "loss": 0.5858, + "step": 258810 + }, + { + "epoch": 2.288053183401404, + "grad_norm": 1.8916573524475098, + "learning_rate": 1.1865780276643269e-05, + "loss": 0.4418, + "step": 258820 + }, + { + "epoch": 2.288141586661716, + "grad_norm": 1.009761095046997, + "learning_rate": 1.1864306888971399e-05, + "loss": 0.5589, + "step": 258830 + }, + { + "epoch": 2.288229989922028, + "grad_norm": 7.7262282371521, + "learning_rate": 1.1862833501299527e-05, + "loss": 0.5038, + "step": 258840 + }, + { + "epoch": 2.2883183931823403, + "grad_norm": 2.463284492492676, + "learning_rate": 1.1861360113627657e-05, + "loss": 0.4602, + "step": 258850 + }, + { + "epoch": 2.288406796442653, + "grad_norm": 1.4684208631515503, + "learning_rate": 1.1859886725955787e-05, + "loss": 0.462, + "step": 258860 + }, + { + "epoch": 2.288495199702965, + "grad_norm": 6.230657577514648, + "learning_rate": 1.1858413338283916e-05, + "loss": 0.5283, + "step": 258870 + }, + { + "epoch": 2.288583602963277, + "grad_norm": 0.9634785652160645, + "learning_rate": 1.1856939950612046e-05, + "loss": 0.6528, + "step": 258880 + }, + { + "epoch": 2.2886720062235897, + "grad_norm": 6.552861213684082, + "learning_rate": 1.1855466562940174e-05, + "loss": 0.4822, + "step": 258890 + }, + { + "epoch": 2.288760409483902, + "grad_norm": 0.686669111251831, + "learning_rate": 1.1853993175268304e-05, + "loss": 0.4671, + "step": 258900 + }, + { + "epoch": 2.288848812744214, + "grad_norm": 25.99290657043457, + "learning_rate": 1.1852519787596433e-05, + "loss": 0.5131, + "step": 258910 + }, + { + "epoch": 2.288937216004526, + "grad_norm": 3.1465871334075928, + "learning_rate": 1.1851046399924563e-05, + "loss": 0.5746, + "step": 258920 + }, + { + "epoch": 2.2890256192648386, + "grad_norm": 3.5952112674713135, + "learning_rate": 1.1849573012252691e-05, + "loss": 0.4345, + "step": 258930 + }, + { + "epoch": 2.2891140225251507, + "grad_norm": 1.6725726127624512, + "learning_rate": 1.1848099624580821e-05, + "loss": 0.5366, + "step": 258940 + }, + { + "epoch": 2.289202425785463, + "grad_norm": 1.3325248956680298, + "learning_rate": 1.1846626236908951e-05, + "loss": 0.5247, + "step": 258950 + }, + { + "epoch": 2.2892908290457754, + "grad_norm": 1.8758389949798584, + "learning_rate": 1.184515284923708e-05, + "loss": 0.6758, + "step": 258960 + }, + { + "epoch": 2.2893792323060875, + "grad_norm": 15.745492935180664, + "learning_rate": 1.184367946156521e-05, + "loss": 0.5078, + "step": 258970 + }, + { + "epoch": 2.2894676355663997, + "grad_norm": 7.054400444030762, + "learning_rate": 1.184220607389334e-05, + "loss": 0.62, + "step": 258980 + }, + { + "epoch": 2.2895560388267118, + "grad_norm": 6.443073749542236, + "learning_rate": 1.1840732686221468e-05, + "loss": 0.5836, + "step": 258990 + }, + { + "epoch": 2.2896444420870243, + "grad_norm": 3.5590736865997314, + "learning_rate": 1.1839259298549598e-05, + "loss": 0.5717, + "step": 259000 + }, + { + "epoch": 2.2897328453473365, + "grad_norm": 3.524709463119507, + "learning_rate": 1.1837785910877728e-05, + "loss": 0.5703, + "step": 259010 + }, + { + "epoch": 2.2898212486076486, + "grad_norm": 4.948880672454834, + "learning_rate": 1.1836312523205856e-05, + "loss": 0.5683, + "step": 259020 + }, + { + "epoch": 2.289909651867961, + "grad_norm": 3.3407719135284424, + "learning_rate": 1.1834839135533986e-05, + "loss": 0.5978, + "step": 259030 + }, + { + "epoch": 2.2899980551282733, + "grad_norm": 16.509796142578125, + "learning_rate": 1.1833365747862116e-05, + "loss": 0.4859, + "step": 259040 + }, + { + "epoch": 2.2900864583885854, + "grad_norm": 3.7032783031463623, + "learning_rate": 1.1831892360190245e-05, + "loss": 0.58, + "step": 259050 + }, + { + "epoch": 2.2901748616488975, + "grad_norm": 3.724459648132324, + "learning_rate": 1.1830418972518375e-05, + "loss": 0.4608, + "step": 259060 + }, + { + "epoch": 2.2902632649092096, + "grad_norm": 2.8998777866363525, + "learning_rate": 1.1828945584846503e-05, + "loss": 0.5822, + "step": 259070 + }, + { + "epoch": 2.290351668169522, + "grad_norm": 2.2037901878356934, + "learning_rate": 1.1827472197174633e-05, + "loss": 0.4227, + "step": 259080 + }, + { + "epoch": 2.2904400714298343, + "grad_norm": 1.5630687475204468, + "learning_rate": 1.1825998809502762e-05, + "loss": 0.5035, + "step": 259090 + }, + { + "epoch": 2.2905284746901464, + "grad_norm": 2.820253610610962, + "learning_rate": 1.1824525421830892e-05, + "loss": 0.4252, + "step": 259100 + }, + { + "epoch": 2.290616877950459, + "grad_norm": 6.118350028991699, + "learning_rate": 1.182305203415902e-05, + "loss": 0.5519, + "step": 259110 + }, + { + "epoch": 2.290705281210771, + "grad_norm": 3.6298069953918457, + "learning_rate": 1.182157864648715e-05, + "loss": 0.6494, + "step": 259120 + }, + { + "epoch": 2.2907936844710832, + "grad_norm": 1.9646481275558472, + "learning_rate": 1.182010525881528e-05, + "loss": 0.5167, + "step": 259130 + }, + { + "epoch": 2.2908820877313953, + "grad_norm": 1.477258563041687, + "learning_rate": 1.1818631871143409e-05, + "loss": 0.5375, + "step": 259140 + }, + { + "epoch": 2.290970490991708, + "grad_norm": 1.2633521556854248, + "learning_rate": 1.1817158483471539e-05, + "loss": 0.4596, + "step": 259150 + }, + { + "epoch": 2.29105889425202, + "grad_norm": 2.5734190940856934, + "learning_rate": 1.1815685095799667e-05, + "loss": 0.508, + "step": 259160 + }, + { + "epoch": 2.291147297512332, + "grad_norm": 2.1885807514190674, + "learning_rate": 1.1814211708127797e-05, + "loss": 0.5122, + "step": 259170 + }, + { + "epoch": 2.2912357007726447, + "grad_norm": 2.1508920192718506, + "learning_rate": 1.1812738320455925e-05, + "loss": 0.5494, + "step": 259180 + }, + { + "epoch": 2.291324104032957, + "grad_norm": 5.04841947555542, + "learning_rate": 1.1811264932784055e-05, + "loss": 0.6155, + "step": 259190 + }, + { + "epoch": 2.291412507293269, + "grad_norm": 2.3796944618225098, + "learning_rate": 1.1809791545112184e-05, + "loss": 0.5916, + "step": 259200 + }, + { + "epoch": 2.291500910553581, + "grad_norm": 4.415163993835449, + "learning_rate": 1.1808318157440314e-05, + "loss": 0.5679, + "step": 259210 + }, + { + "epoch": 2.2915893138138936, + "grad_norm": 10.343878746032715, + "learning_rate": 1.1806844769768442e-05, + "loss": 0.5363, + "step": 259220 + }, + { + "epoch": 2.2916777170742058, + "grad_norm": 2.0149106979370117, + "learning_rate": 1.1805371382096572e-05, + "loss": 0.5828, + "step": 259230 + }, + { + "epoch": 2.291766120334518, + "grad_norm": 9.882540702819824, + "learning_rate": 1.1803897994424702e-05, + "loss": 0.4893, + "step": 259240 + }, + { + "epoch": 2.29185452359483, + "grad_norm": 1.2446866035461426, + "learning_rate": 1.180242460675283e-05, + "loss": 0.409, + "step": 259250 + }, + { + "epoch": 2.2919429268551426, + "grad_norm": 9.53824234008789, + "learning_rate": 1.180095121908096e-05, + "loss": 0.5499, + "step": 259260 + }, + { + "epoch": 2.2920313301154547, + "grad_norm": 2.638638973236084, + "learning_rate": 1.1799477831409089e-05, + "loss": 0.5121, + "step": 259270 + }, + { + "epoch": 2.292119733375767, + "grad_norm": 2.914440870285034, + "learning_rate": 1.1798004443737219e-05, + "loss": 0.4283, + "step": 259280 + }, + { + "epoch": 2.292208136636079, + "grad_norm": 6.956794261932373, + "learning_rate": 1.1796531056065347e-05, + "loss": 0.5573, + "step": 259290 + }, + { + "epoch": 2.2922965398963915, + "grad_norm": 5.326164722442627, + "learning_rate": 1.1795057668393477e-05, + "loss": 0.4765, + "step": 259300 + }, + { + "epoch": 2.2923849431567036, + "grad_norm": 1.9975029230117798, + "learning_rate": 1.1793584280721606e-05, + "loss": 0.5406, + "step": 259310 + }, + { + "epoch": 2.2924733464170157, + "grad_norm": 6.317074298858643, + "learning_rate": 1.1792110893049736e-05, + "loss": 0.6291, + "step": 259320 + }, + { + "epoch": 2.2925617496773283, + "grad_norm": 5.532962799072266, + "learning_rate": 1.1790637505377866e-05, + "loss": 0.4357, + "step": 259330 + }, + { + "epoch": 2.2926501529376404, + "grad_norm": 20.285133361816406, + "learning_rate": 1.1789164117705994e-05, + "loss": 0.4665, + "step": 259340 + }, + { + "epoch": 2.2927385561979525, + "grad_norm": 2.6194851398468018, + "learning_rate": 1.1787690730034124e-05, + "loss": 0.6543, + "step": 259350 + }, + { + "epoch": 2.2928269594582646, + "grad_norm": 21.400066375732422, + "learning_rate": 1.1786217342362253e-05, + "loss": 0.5906, + "step": 259360 + }, + { + "epoch": 2.292915362718577, + "grad_norm": 4.700669765472412, + "learning_rate": 1.1784743954690383e-05, + "loss": 0.493, + "step": 259370 + }, + { + "epoch": 2.2930037659788893, + "grad_norm": 2.9618608951568604, + "learning_rate": 1.1783270567018511e-05, + "loss": 0.5003, + "step": 259380 + }, + { + "epoch": 2.2930921692392014, + "grad_norm": 3.7596728801727295, + "learning_rate": 1.1781797179346641e-05, + "loss": 0.5646, + "step": 259390 + }, + { + "epoch": 2.293180572499514, + "grad_norm": 2.868093252182007, + "learning_rate": 1.178032379167477e-05, + "loss": 0.5789, + "step": 259400 + }, + { + "epoch": 2.293268975759826, + "grad_norm": 3.376652956008911, + "learning_rate": 1.17788504040029e-05, + "loss": 0.4352, + "step": 259410 + }, + { + "epoch": 2.2933573790201383, + "grad_norm": 1.5657767057418823, + "learning_rate": 1.177737701633103e-05, + "loss": 0.5543, + "step": 259420 + }, + { + "epoch": 2.2934457822804504, + "grad_norm": 7.428125381469727, + "learning_rate": 1.1775903628659158e-05, + "loss": 0.5641, + "step": 259430 + }, + { + "epoch": 2.293534185540763, + "grad_norm": 4.301126003265381, + "learning_rate": 1.1774430240987288e-05, + "loss": 0.4795, + "step": 259440 + }, + { + "epoch": 2.293622588801075, + "grad_norm": 3.2257776260375977, + "learning_rate": 1.1772956853315418e-05, + "loss": 0.6258, + "step": 259450 + }, + { + "epoch": 2.293710992061387, + "grad_norm": 2.8956475257873535, + "learning_rate": 1.1771483465643546e-05, + "loss": 0.5686, + "step": 259460 + }, + { + "epoch": 2.2937993953216993, + "grad_norm": 4.203882694244385, + "learning_rate": 1.1770010077971676e-05, + "loss": 0.6313, + "step": 259470 + }, + { + "epoch": 2.293887798582012, + "grad_norm": 2.544503927230835, + "learning_rate": 1.1768536690299806e-05, + "loss": 0.5208, + "step": 259480 + }, + { + "epoch": 2.293976201842324, + "grad_norm": 8.471534729003906, + "learning_rate": 1.1767063302627935e-05, + "loss": 0.5446, + "step": 259490 + }, + { + "epoch": 2.294064605102636, + "grad_norm": 1.479592204093933, + "learning_rate": 1.1765589914956065e-05, + "loss": 0.3981, + "step": 259500 + }, + { + "epoch": 2.294153008362948, + "grad_norm": 2.575593948364258, + "learning_rate": 1.1764116527284195e-05, + "loss": 0.6184, + "step": 259510 + }, + { + "epoch": 2.294241411623261, + "grad_norm": 5.490714073181152, + "learning_rate": 1.1762643139612323e-05, + "loss": 0.5172, + "step": 259520 + }, + { + "epoch": 2.294329814883573, + "grad_norm": 5.864655017852783, + "learning_rate": 1.1761169751940453e-05, + "loss": 0.631, + "step": 259530 + }, + { + "epoch": 2.294418218143885, + "grad_norm": 3.9755678176879883, + "learning_rate": 1.1759696364268582e-05, + "loss": 0.4409, + "step": 259540 + }, + { + "epoch": 2.2945066214041976, + "grad_norm": 11.667485237121582, + "learning_rate": 1.1758222976596712e-05, + "loss": 0.4517, + "step": 259550 + }, + { + "epoch": 2.2945950246645097, + "grad_norm": 1.5370354652404785, + "learning_rate": 1.175674958892484e-05, + "loss": 0.6126, + "step": 259560 + }, + { + "epoch": 2.294683427924822, + "grad_norm": 1.5665957927703857, + "learning_rate": 1.175527620125297e-05, + "loss": 0.5634, + "step": 259570 + }, + { + "epoch": 2.294771831185134, + "grad_norm": 2.717686891555786, + "learning_rate": 1.1753802813581098e-05, + "loss": 0.5574, + "step": 259580 + }, + { + "epoch": 2.2948602344454465, + "grad_norm": 1.919175386428833, + "learning_rate": 1.1752329425909228e-05, + "loss": 0.4756, + "step": 259590 + }, + { + "epoch": 2.2949486377057586, + "grad_norm": 2.1500051021575928, + "learning_rate": 1.1750856038237359e-05, + "loss": 0.5383, + "step": 259600 + }, + { + "epoch": 2.2950370409660708, + "grad_norm": 1.8897526264190674, + "learning_rate": 1.1749382650565487e-05, + "loss": 0.4825, + "step": 259610 + }, + { + "epoch": 2.2951254442263833, + "grad_norm": 1.8276630640029907, + "learning_rate": 1.1747909262893617e-05, + "loss": 0.5032, + "step": 259620 + }, + { + "epoch": 2.2952138474866954, + "grad_norm": 1.1113568544387817, + "learning_rate": 1.1746435875221745e-05, + "loss": 0.443, + "step": 259630 + }, + { + "epoch": 2.2953022507470076, + "grad_norm": 16.36684226989746, + "learning_rate": 1.1744962487549875e-05, + "loss": 0.5344, + "step": 259640 + }, + { + "epoch": 2.2953906540073197, + "grad_norm": 1.8042906522750854, + "learning_rate": 1.1743489099878004e-05, + "loss": 0.5775, + "step": 259650 + }, + { + "epoch": 2.295479057267632, + "grad_norm": 1.5815187692642212, + "learning_rate": 1.1742015712206134e-05, + "loss": 0.5523, + "step": 259660 + }, + { + "epoch": 2.2955674605279444, + "grad_norm": 4.199748992919922, + "learning_rate": 1.1740542324534262e-05, + "loss": 0.488, + "step": 259670 + }, + { + "epoch": 2.2956558637882565, + "grad_norm": 1.9949684143066406, + "learning_rate": 1.1739068936862392e-05, + "loss": 0.479, + "step": 259680 + }, + { + "epoch": 2.2957442670485686, + "grad_norm": 2.7820065021514893, + "learning_rate": 1.1737595549190522e-05, + "loss": 0.6095, + "step": 259690 + }, + { + "epoch": 2.295832670308881, + "grad_norm": 3.1942410469055176, + "learning_rate": 1.173612216151865e-05, + "loss": 0.5294, + "step": 259700 + }, + { + "epoch": 2.2959210735691933, + "grad_norm": 36.8575439453125, + "learning_rate": 1.173464877384678e-05, + "loss": 0.7163, + "step": 259710 + }, + { + "epoch": 2.2960094768295054, + "grad_norm": 1.7731789350509644, + "learning_rate": 1.1733175386174909e-05, + "loss": 0.4639, + "step": 259720 + }, + { + "epoch": 2.2960978800898175, + "grad_norm": 1.7278555631637573, + "learning_rate": 1.1731701998503039e-05, + "loss": 0.5644, + "step": 259730 + }, + { + "epoch": 2.29618628335013, + "grad_norm": 4.494910717010498, + "learning_rate": 1.1730228610831167e-05, + "loss": 0.7134, + "step": 259740 + }, + { + "epoch": 2.296274686610442, + "grad_norm": 1.5782073736190796, + "learning_rate": 1.1728755223159297e-05, + "loss": 0.5007, + "step": 259750 + }, + { + "epoch": 2.2963630898707543, + "grad_norm": 1.5910489559173584, + "learning_rate": 1.1727281835487426e-05, + "loss": 0.5561, + "step": 259760 + }, + { + "epoch": 2.296451493131067, + "grad_norm": 10.488365173339844, + "learning_rate": 1.1725808447815556e-05, + "loss": 0.5841, + "step": 259770 + }, + { + "epoch": 2.296539896391379, + "grad_norm": 2.0290231704711914, + "learning_rate": 1.1724335060143684e-05, + "loss": 0.4539, + "step": 259780 + }, + { + "epoch": 2.296628299651691, + "grad_norm": 22.63119125366211, + "learning_rate": 1.1722861672471814e-05, + "loss": 0.577, + "step": 259790 + }, + { + "epoch": 2.2967167029120032, + "grad_norm": 8.117563247680664, + "learning_rate": 1.1721388284799944e-05, + "loss": 0.5049, + "step": 259800 + }, + { + "epoch": 2.296805106172316, + "grad_norm": 2.6464293003082275, + "learning_rate": 1.1719914897128073e-05, + "loss": 0.6043, + "step": 259810 + }, + { + "epoch": 2.296893509432628, + "grad_norm": 4.437290191650391, + "learning_rate": 1.1718441509456203e-05, + "loss": 0.6869, + "step": 259820 + }, + { + "epoch": 2.29698191269294, + "grad_norm": 1.2371470928192139, + "learning_rate": 1.1716968121784331e-05, + "loss": 0.4598, + "step": 259830 + }, + { + "epoch": 2.297070315953252, + "grad_norm": 1.4526609182357788, + "learning_rate": 1.1715494734112461e-05, + "loss": 0.4266, + "step": 259840 + }, + { + "epoch": 2.2971587192135647, + "grad_norm": 2.115576982498169, + "learning_rate": 1.171402134644059e-05, + "loss": 0.5383, + "step": 259850 + }, + { + "epoch": 2.297247122473877, + "grad_norm": 2.6705033779144287, + "learning_rate": 1.171254795876872e-05, + "loss": 0.4703, + "step": 259860 + }, + { + "epoch": 2.297335525734189, + "grad_norm": 14.162151336669922, + "learning_rate": 1.1711074571096848e-05, + "loss": 0.4861, + "step": 259870 + }, + { + "epoch": 2.297423928994501, + "grad_norm": 9.871244430541992, + "learning_rate": 1.1709601183424978e-05, + "loss": 0.5509, + "step": 259880 + }, + { + "epoch": 2.2975123322548137, + "grad_norm": 2.3719735145568848, + "learning_rate": 1.1708127795753108e-05, + "loss": 0.6301, + "step": 259890 + }, + { + "epoch": 2.297600735515126, + "grad_norm": 1.8303548097610474, + "learning_rate": 1.1706654408081236e-05, + "loss": 0.532, + "step": 259900 + }, + { + "epoch": 2.297689138775438, + "grad_norm": 18.160720825195312, + "learning_rate": 1.1705181020409366e-05, + "loss": 0.4285, + "step": 259910 + }, + { + "epoch": 2.2977775420357505, + "grad_norm": 3.547769784927368, + "learning_rate": 1.1703707632737496e-05, + "loss": 0.4046, + "step": 259920 + }, + { + "epoch": 2.2978659452960626, + "grad_norm": 4.944892883300781, + "learning_rate": 1.1702234245065625e-05, + "loss": 0.3728, + "step": 259930 + }, + { + "epoch": 2.2979543485563747, + "grad_norm": 3.186312675476074, + "learning_rate": 1.1700760857393755e-05, + "loss": 0.5599, + "step": 259940 + }, + { + "epoch": 2.298042751816687, + "grad_norm": 3.336904764175415, + "learning_rate": 1.1699287469721885e-05, + "loss": 0.4692, + "step": 259950 + }, + { + "epoch": 2.2981311550769994, + "grad_norm": 2.4898266792297363, + "learning_rate": 1.1697814082050013e-05, + "loss": 0.5468, + "step": 259960 + }, + { + "epoch": 2.2982195583373115, + "grad_norm": 3.863327980041504, + "learning_rate": 1.1696340694378143e-05, + "loss": 0.6132, + "step": 259970 + }, + { + "epoch": 2.2983079615976236, + "grad_norm": 2.7698843479156494, + "learning_rate": 1.1694867306706273e-05, + "loss": 0.6736, + "step": 259980 + }, + { + "epoch": 2.298396364857936, + "grad_norm": 3.1243488788604736, + "learning_rate": 1.1693393919034402e-05, + "loss": 0.5625, + "step": 259990 + }, + { + "epoch": 2.2984847681182483, + "grad_norm": 7.693816184997559, + "learning_rate": 1.1691920531362532e-05, + "loss": 0.4567, + "step": 260000 + }, + { + "epoch": 2.2985731713785604, + "grad_norm": 7.671384811401367, + "learning_rate": 1.169044714369066e-05, + "loss": 0.4625, + "step": 260010 + }, + { + "epoch": 2.2986615746388726, + "grad_norm": 3.516448736190796, + "learning_rate": 1.168897375601879e-05, + "loss": 0.6316, + "step": 260020 + }, + { + "epoch": 2.298749977899185, + "grad_norm": 3.407794237136841, + "learning_rate": 1.1687500368346918e-05, + "loss": 0.7159, + "step": 260030 + }, + { + "epoch": 2.2988383811594972, + "grad_norm": 12.269466400146484, + "learning_rate": 1.1686026980675048e-05, + "loss": 0.539, + "step": 260040 + }, + { + "epoch": 2.2989267844198094, + "grad_norm": 3.24448823928833, + "learning_rate": 1.1684553593003177e-05, + "loss": 0.464, + "step": 260050 + }, + { + "epoch": 2.2990151876801215, + "grad_norm": 6.8791184425354, + "learning_rate": 1.1683080205331307e-05, + "loss": 0.5094, + "step": 260060 + }, + { + "epoch": 2.299103590940434, + "grad_norm": 30.3896427154541, + "learning_rate": 1.1681606817659437e-05, + "loss": 0.5604, + "step": 260070 + }, + { + "epoch": 2.299191994200746, + "grad_norm": 2.8767282962799072, + "learning_rate": 1.1680133429987565e-05, + "loss": 0.58, + "step": 260080 + }, + { + "epoch": 2.2992803974610583, + "grad_norm": 4.976644515991211, + "learning_rate": 1.1678660042315695e-05, + "loss": 0.5274, + "step": 260090 + }, + { + "epoch": 2.2993688007213704, + "grad_norm": 8.127227783203125, + "learning_rate": 1.1677186654643824e-05, + "loss": 0.5777, + "step": 260100 + }, + { + "epoch": 2.299457203981683, + "grad_norm": 2.42294979095459, + "learning_rate": 1.1675713266971954e-05, + "loss": 0.537, + "step": 260110 + }, + { + "epoch": 2.299545607241995, + "grad_norm": 2.5756170749664307, + "learning_rate": 1.1674239879300082e-05, + "loss": 0.5743, + "step": 260120 + }, + { + "epoch": 2.299634010502307, + "grad_norm": 14.413482666015625, + "learning_rate": 1.1672766491628212e-05, + "loss": 0.5425, + "step": 260130 + }, + { + "epoch": 2.2997224137626198, + "grad_norm": 1.389936089515686, + "learning_rate": 1.167129310395634e-05, + "loss": 0.4103, + "step": 260140 + }, + { + "epoch": 2.299810817022932, + "grad_norm": 3.133547067642212, + "learning_rate": 1.166981971628447e-05, + "loss": 0.5634, + "step": 260150 + }, + { + "epoch": 2.299899220283244, + "grad_norm": 5.357043266296387, + "learning_rate": 1.16683463286126e-05, + "loss": 0.4737, + "step": 260160 + }, + { + "epoch": 2.299987623543556, + "grad_norm": 2.403163433074951, + "learning_rate": 1.1666872940940729e-05, + "loss": 0.6937, + "step": 260170 + }, + { + "epoch": 2.3000760268038687, + "grad_norm": 6.109233379364014, + "learning_rate": 1.1665399553268859e-05, + "loss": 0.4773, + "step": 260180 + }, + { + "epoch": 2.300164430064181, + "grad_norm": 3.702396869659424, + "learning_rate": 1.1663926165596987e-05, + "loss": 0.5838, + "step": 260190 + }, + { + "epoch": 2.300252833324493, + "grad_norm": 36.53094482421875, + "learning_rate": 1.1662452777925117e-05, + "loss": 0.4976, + "step": 260200 + }, + { + "epoch": 2.3003412365848055, + "grad_norm": 3.5208187103271484, + "learning_rate": 1.1660979390253246e-05, + "loss": 0.4813, + "step": 260210 + }, + { + "epoch": 2.3004296398451176, + "grad_norm": 3.7541160583496094, + "learning_rate": 1.1659506002581376e-05, + "loss": 0.5244, + "step": 260220 + }, + { + "epoch": 2.3005180431054297, + "grad_norm": 1.875852346420288, + "learning_rate": 1.1658032614909504e-05, + "loss": 0.6299, + "step": 260230 + }, + { + "epoch": 2.300606446365742, + "grad_norm": 1.2459274530410767, + "learning_rate": 1.1656559227237634e-05, + "loss": 0.4676, + "step": 260240 + }, + { + "epoch": 2.300694849626054, + "grad_norm": 18.139230728149414, + "learning_rate": 1.1655085839565763e-05, + "loss": 0.501, + "step": 260250 + }, + { + "epoch": 2.3007832528863665, + "grad_norm": 2.0327394008636475, + "learning_rate": 1.1653612451893893e-05, + "loss": 0.4711, + "step": 260260 + }, + { + "epoch": 2.3008716561466787, + "grad_norm": 2.1820924282073975, + "learning_rate": 1.1652139064222023e-05, + "loss": 0.5608, + "step": 260270 + }, + { + "epoch": 2.3009600594069908, + "grad_norm": 1.7463101148605347, + "learning_rate": 1.1650665676550151e-05, + "loss": 0.5117, + "step": 260280 + }, + { + "epoch": 2.3010484626673033, + "grad_norm": 2.014204263687134, + "learning_rate": 1.1649192288878281e-05, + "loss": 0.5768, + "step": 260290 + }, + { + "epoch": 2.3011368659276155, + "grad_norm": 4.82535982131958, + "learning_rate": 1.164771890120641e-05, + "loss": 0.5536, + "step": 260300 + }, + { + "epoch": 2.3012252691879276, + "grad_norm": 4.064444065093994, + "learning_rate": 1.164624551353454e-05, + "loss": 0.486, + "step": 260310 + }, + { + "epoch": 2.3013136724482397, + "grad_norm": 7.141782760620117, + "learning_rate": 1.1644772125862668e-05, + "loss": 0.5717, + "step": 260320 + }, + { + "epoch": 2.3014020757085523, + "grad_norm": 2.5722694396972656, + "learning_rate": 1.1643298738190798e-05, + "loss": 0.5345, + "step": 260330 + }, + { + "epoch": 2.3014904789688644, + "grad_norm": 1.8838657140731812, + "learning_rate": 1.1641825350518928e-05, + "loss": 0.5496, + "step": 260340 + }, + { + "epoch": 2.3015788822291765, + "grad_norm": 1.673690915107727, + "learning_rate": 1.1640351962847056e-05, + "loss": 0.5497, + "step": 260350 + }, + { + "epoch": 2.301667285489489, + "grad_norm": 15.379714965820312, + "learning_rate": 1.1638878575175186e-05, + "loss": 0.444, + "step": 260360 + }, + { + "epoch": 2.301755688749801, + "grad_norm": 2.20792293548584, + "learning_rate": 1.1637405187503316e-05, + "loss": 0.5177, + "step": 260370 + }, + { + "epoch": 2.3018440920101133, + "grad_norm": 3.899547815322876, + "learning_rate": 1.1635931799831445e-05, + "loss": 0.5752, + "step": 260380 + }, + { + "epoch": 2.3019324952704254, + "grad_norm": 9.445771217346191, + "learning_rate": 1.1634458412159575e-05, + "loss": 0.4654, + "step": 260390 + }, + { + "epoch": 2.302020898530738, + "grad_norm": 4.3855485916137695, + "learning_rate": 1.1632985024487705e-05, + "loss": 0.6035, + "step": 260400 + }, + { + "epoch": 2.30210930179105, + "grad_norm": 5.71193265914917, + "learning_rate": 1.1631511636815833e-05, + "loss": 0.5561, + "step": 260410 + }, + { + "epoch": 2.3021977050513622, + "grad_norm": 1.701006531715393, + "learning_rate": 1.1630038249143963e-05, + "loss": 0.561, + "step": 260420 + }, + { + "epoch": 2.3022861083116744, + "grad_norm": 2.9355671405792236, + "learning_rate": 1.1628564861472092e-05, + "loss": 0.6254, + "step": 260430 + }, + { + "epoch": 2.302374511571987, + "grad_norm": 2.986804485321045, + "learning_rate": 1.1627091473800222e-05, + "loss": 0.4329, + "step": 260440 + }, + { + "epoch": 2.302462914832299, + "grad_norm": 1.7263169288635254, + "learning_rate": 1.1625618086128352e-05, + "loss": 0.4635, + "step": 260450 + }, + { + "epoch": 2.302551318092611, + "grad_norm": 4.169259071350098, + "learning_rate": 1.162414469845648e-05, + "loss": 0.5581, + "step": 260460 + }, + { + "epoch": 2.3026397213529233, + "grad_norm": 0.9589517116546631, + "learning_rate": 1.162267131078461e-05, + "loss": 0.4979, + "step": 260470 + }, + { + "epoch": 2.302728124613236, + "grad_norm": 26.01097869873047, + "learning_rate": 1.1621197923112738e-05, + "loss": 0.4964, + "step": 260480 + }, + { + "epoch": 2.302816527873548, + "grad_norm": 4.100005149841309, + "learning_rate": 1.1619724535440868e-05, + "loss": 0.5235, + "step": 260490 + }, + { + "epoch": 2.30290493113386, + "grad_norm": 3.302241325378418, + "learning_rate": 1.1618251147768997e-05, + "loss": 0.7354, + "step": 260500 + }, + { + "epoch": 2.3029933343941726, + "grad_norm": 2.692145347595215, + "learning_rate": 1.1616777760097127e-05, + "loss": 0.5187, + "step": 260510 + }, + { + "epoch": 2.3030817376544848, + "grad_norm": 4.891733646392822, + "learning_rate": 1.1615304372425255e-05, + "loss": 0.5142, + "step": 260520 + }, + { + "epoch": 2.303170140914797, + "grad_norm": 3.0714941024780273, + "learning_rate": 1.1613830984753385e-05, + "loss": 0.5104, + "step": 260530 + }, + { + "epoch": 2.303258544175109, + "grad_norm": 2.824841022491455, + "learning_rate": 1.1612357597081515e-05, + "loss": 0.6095, + "step": 260540 + }, + { + "epoch": 2.3033469474354216, + "grad_norm": 3.762983798980713, + "learning_rate": 1.1610884209409644e-05, + "loss": 0.6292, + "step": 260550 + }, + { + "epoch": 2.3034353506957337, + "grad_norm": 2.7360942363739014, + "learning_rate": 1.1609410821737774e-05, + "loss": 0.539, + "step": 260560 + }, + { + "epoch": 2.303523753956046, + "grad_norm": 3.6470401287078857, + "learning_rate": 1.1607937434065902e-05, + "loss": 0.6164, + "step": 260570 + }, + { + "epoch": 2.3036121572163584, + "grad_norm": 1.9141167402267456, + "learning_rate": 1.1606464046394032e-05, + "loss": 0.6039, + "step": 260580 + }, + { + "epoch": 2.3037005604766705, + "grad_norm": 1.4375101327896118, + "learning_rate": 1.160499065872216e-05, + "loss": 0.4806, + "step": 260590 + }, + { + "epoch": 2.3037889637369826, + "grad_norm": 9.089749336242676, + "learning_rate": 1.160351727105029e-05, + "loss": 0.6076, + "step": 260600 + }, + { + "epoch": 2.3038773669972947, + "grad_norm": 15.387625694274902, + "learning_rate": 1.1602043883378419e-05, + "loss": 0.6446, + "step": 260610 + }, + { + "epoch": 2.3039657702576073, + "grad_norm": 6.75402307510376, + "learning_rate": 1.1600570495706549e-05, + "loss": 0.4428, + "step": 260620 + }, + { + "epoch": 2.3040541735179194, + "grad_norm": 3.396256923675537, + "learning_rate": 1.1599097108034679e-05, + "loss": 0.6655, + "step": 260630 + }, + { + "epoch": 2.3041425767782315, + "grad_norm": 4.039883136749268, + "learning_rate": 1.1597623720362807e-05, + "loss": 0.6577, + "step": 260640 + }, + { + "epoch": 2.3042309800385437, + "grad_norm": 3.0208046436309814, + "learning_rate": 1.1596150332690937e-05, + "loss": 0.5912, + "step": 260650 + }, + { + "epoch": 2.304319383298856, + "grad_norm": 1.3529431819915771, + "learning_rate": 1.1594676945019066e-05, + "loss": 0.4374, + "step": 260660 + }, + { + "epoch": 2.3044077865591683, + "grad_norm": 1.0717918872833252, + "learning_rate": 1.1593203557347196e-05, + "loss": 0.5477, + "step": 260670 + }, + { + "epoch": 2.3044961898194805, + "grad_norm": 2.8305838108062744, + "learning_rate": 1.1591730169675324e-05, + "loss": 0.5764, + "step": 260680 + }, + { + "epoch": 2.3045845930797926, + "grad_norm": 1.5964465141296387, + "learning_rate": 1.1590256782003454e-05, + "loss": 0.5998, + "step": 260690 + }, + { + "epoch": 2.304672996340105, + "grad_norm": 1.6720601320266724, + "learning_rate": 1.1588783394331582e-05, + "loss": 0.5119, + "step": 260700 + }, + { + "epoch": 2.3047613996004173, + "grad_norm": 4.124337673187256, + "learning_rate": 1.1587310006659713e-05, + "loss": 0.4502, + "step": 260710 + }, + { + "epoch": 2.3048498028607294, + "grad_norm": 2.109267473220825, + "learning_rate": 1.1585836618987843e-05, + "loss": 0.6462, + "step": 260720 + }, + { + "epoch": 2.304938206121042, + "grad_norm": 3.8557846546173096, + "learning_rate": 1.1584363231315971e-05, + "loss": 0.5913, + "step": 260730 + }, + { + "epoch": 2.305026609381354, + "grad_norm": 5.26356315612793, + "learning_rate": 1.1582889843644101e-05, + "loss": 0.6426, + "step": 260740 + }, + { + "epoch": 2.305115012641666, + "grad_norm": 6.908241271972656, + "learning_rate": 1.158141645597223e-05, + "loss": 0.6772, + "step": 260750 + }, + { + "epoch": 2.3052034159019783, + "grad_norm": 4.98683500289917, + "learning_rate": 1.157994306830036e-05, + "loss": 0.6125, + "step": 260760 + }, + { + "epoch": 2.305291819162291, + "grad_norm": 6.300786018371582, + "learning_rate": 1.1578469680628488e-05, + "loss": 0.4945, + "step": 260770 + }, + { + "epoch": 2.305380222422603, + "grad_norm": 1.6561038494110107, + "learning_rate": 1.1576996292956618e-05, + "loss": 0.5224, + "step": 260780 + }, + { + "epoch": 2.305468625682915, + "grad_norm": 5.930668830871582, + "learning_rate": 1.1575522905284746e-05, + "loss": 0.5933, + "step": 260790 + }, + { + "epoch": 2.3055570289432277, + "grad_norm": 2.530850887298584, + "learning_rate": 1.1574049517612876e-05, + "loss": 0.4975, + "step": 260800 + }, + { + "epoch": 2.30564543220354, + "grad_norm": 5.522804260253906, + "learning_rate": 1.1572576129941006e-05, + "loss": 0.5921, + "step": 260810 + }, + { + "epoch": 2.305733835463852, + "grad_norm": 3.040428400039673, + "learning_rate": 1.1571102742269135e-05, + "loss": 0.506, + "step": 260820 + }, + { + "epoch": 2.305822238724164, + "grad_norm": 6.883382320404053, + "learning_rate": 1.1569629354597265e-05, + "loss": 0.5784, + "step": 260830 + }, + { + "epoch": 2.305910641984476, + "grad_norm": 3.4975194931030273, + "learning_rate": 1.1568155966925395e-05, + "loss": 0.5443, + "step": 260840 + }, + { + "epoch": 2.3059990452447887, + "grad_norm": 8.90981674194336, + "learning_rate": 1.1566682579253523e-05, + "loss": 0.4486, + "step": 260850 + }, + { + "epoch": 2.306087448505101, + "grad_norm": 4.9335036277771, + "learning_rate": 1.1565209191581653e-05, + "loss": 0.4775, + "step": 260860 + }, + { + "epoch": 2.306175851765413, + "grad_norm": 6.9347243309021, + "learning_rate": 1.1563735803909783e-05, + "loss": 0.5577, + "step": 260870 + }, + { + "epoch": 2.3062642550257255, + "grad_norm": 1.0258523225784302, + "learning_rate": 1.1562262416237912e-05, + "loss": 0.5724, + "step": 260880 + }, + { + "epoch": 2.3063526582860376, + "grad_norm": 1.7703065872192383, + "learning_rate": 1.1560789028566042e-05, + "loss": 0.4884, + "step": 260890 + }, + { + "epoch": 2.3064410615463498, + "grad_norm": 4.082756996154785, + "learning_rate": 1.155931564089417e-05, + "loss": 0.4975, + "step": 260900 + }, + { + "epoch": 2.306529464806662, + "grad_norm": 3.144211769104004, + "learning_rate": 1.15578422532223e-05, + "loss": 0.6755, + "step": 260910 + }, + { + "epoch": 2.3066178680669744, + "grad_norm": 1.9882434606552124, + "learning_rate": 1.155636886555043e-05, + "loss": 0.4652, + "step": 260920 + }, + { + "epoch": 2.3067062713272866, + "grad_norm": 1.5635277032852173, + "learning_rate": 1.1554895477878558e-05, + "loss": 0.5101, + "step": 260930 + }, + { + "epoch": 2.3067946745875987, + "grad_norm": 1.4135533571243286, + "learning_rate": 1.1553422090206688e-05, + "loss": 0.5334, + "step": 260940 + }, + { + "epoch": 2.3068830778479112, + "grad_norm": 5.424489974975586, + "learning_rate": 1.1551948702534817e-05, + "loss": 0.4539, + "step": 260950 + }, + { + "epoch": 2.3069714811082234, + "grad_norm": 2.264775037765503, + "learning_rate": 1.1550475314862947e-05, + "loss": 0.5645, + "step": 260960 + }, + { + "epoch": 2.3070598843685355, + "grad_norm": 2.246954917907715, + "learning_rate": 1.1549001927191075e-05, + "loss": 0.4448, + "step": 260970 + }, + { + "epoch": 2.3071482876288476, + "grad_norm": 1.2962473630905151, + "learning_rate": 1.1547528539519205e-05, + "loss": 0.5258, + "step": 260980 + }, + { + "epoch": 2.30723669088916, + "grad_norm": 1.9151854515075684, + "learning_rate": 1.1546055151847334e-05, + "loss": 0.4975, + "step": 260990 + }, + { + "epoch": 2.3073250941494723, + "grad_norm": 3.39237117767334, + "learning_rate": 1.1544581764175464e-05, + "loss": 0.6804, + "step": 261000 + }, + { + "epoch": 2.3074134974097844, + "grad_norm": 5.640404224395752, + "learning_rate": 1.1543108376503594e-05, + "loss": 0.5093, + "step": 261010 + }, + { + "epoch": 2.3075019006700965, + "grad_norm": 3.2271201610565186, + "learning_rate": 1.1541634988831722e-05, + "loss": 0.6048, + "step": 261020 + }, + { + "epoch": 2.307590303930409, + "grad_norm": 10.674870491027832, + "learning_rate": 1.1540161601159852e-05, + "loss": 0.5055, + "step": 261030 + }, + { + "epoch": 2.307678707190721, + "grad_norm": 2.4844326972961426, + "learning_rate": 1.153868821348798e-05, + "loss": 0.5045, + "step": 261040 + }, + { + "epoch": 2.3077671104510333, + "grad_norm": 2.056361436843872, + "learning_rate": 1.153721482581611e-05, + "loss": 0.5135, + "step": 261050 + }, + { + "epoch": 2.3078555137113455, + "grad_norm": 2.7626991271972656, + "learning_rate": 1.1535741438144239e-05, + "loss": 0.6279, + "step": 261060 + }, + { + "epoch": 2.307943916971658, + "grad_norm": 3.9846351146698, + "learning_rate": 1.1534268050472369e-05, + "loss": 0.6626, + "step": 261070 + }, + { + "epoch": 2.30803232023197, + "grad_norm": 1.8369050025939941, + "learning_rate": 1.1532794662800497e-05, + "loss": 0.6301, + "step": 261080 + }, + { + "epoch": 2.3081207234922823, + "grad_norm": 10.969522476196289, + "learning_rate": 1.1531321275128627e-05, + "loss": 0.6027, + "step": 261090 + }, + { + "epoch": 2.308209126752595, + "grad_norm": 4.47543478012085, + "learning_rate": 1.1529847887456757e-05, + "loss": 0.5193, + "step": 261100 + }, + { + "epoch": 2.308297530012907, + "grad_norm": 5.171657562255859, + "learning_rate": 1.1528374499784886e-05, + "loss": 0.5391, + "step": 261110 + }, + { + "epoch": 2.308385933273219, + "grad_norm": 3.5178897380828857, + "learning_rate": 1.1526901112113016e-05, + "loss": 0.4195, + "step": 261120 + }, + { + "epoch": 2.308474336533531, + "grad_norm": 11.048431396484375, + "learning_rate": 1.1525427724441144e-05, + "loss": 0.5289, + "step": 261130 + }, + { + "epoch": 2.3085627397938437, + "grad_norm": 2.4356436729431152, + "learning_rate": 1.1523954336769274e-05, + "loss": 0.5525, + "step": 261140 + }, + { + "epoch": 2.308651143054156, + "grad_norm": 3.1807281970977783, + "learning_rate": 1.1522480949097402e-05, + "loss": 0.4939, + "step": 261150 + }, + { + "epoch": 2.308739546314468, + "grad_norm": 1.8932641744613647, + "learning_rate": 1.1521007561425533e-05, + "loss": 0.5602, + "step": 261160 + }, + { + "epoch": 2.3088279495747805, + "grad_norm": 19.360504150390625, + "learning_rate": 1.1519534173753661e-05, + "loss": 0.4881, + "step": 261170 + }, + { + "epoch": 2.3089163528350927, + "grad_norm": 3.3536391258239746, + "learning_rate": 1.1518060786081791e-05, + "loss": 0.6564, + "step": 261180 + }, + { + "epoch": 2.309004756095405, + "grad_norm": 6.785184860229492, + "learning_rate": 1.1516587398409921e-05, + "loss": 0.7045, + "step": 261190 + }, + { + "epoch": 2.309093159355717, + "grad_norm": 2.142981767654419, + "learning_rate": 1.151511401073805e-05, + "loss": 0.4341, + "step": 261200 + }, + { + "epoch": 2.3091815626160295, + "grad_norm": 2.677133798599243, + "learning_rate": 1.151364062306618e-05, + "loss": 0.6409, + "step": 261210 + }, + { + "epoch": 2.3092699658763416, + "grad_norm": 7.470454216003418, + "learning_rate": 1.1512167235394308e-05, + "loss": 0.5758, + "step": 261220 + }, + { + "epoch": 2.3093583691366537, + "grad_norm": 2.547539234161377, + "learning_rate": 1.1510693847722438e-05, + "loss": 0.6731, + "step": 261230 + }, + { + "epoch": 2.309446772396966, + "grad_norm": 5.427080154418945, + "learning_rate": 1.1509220460050566e-05, + "loss": 0.4373, + "step": 261240 + }, + { + "epoch": 2.3095351756572784, + "grad_norm": 2.362111806869507, + "learning_rate": 1.1507747072378696e-05, + "loss": 0.5005, + "step": 261250 + }, + { + "epoch": 2.3096235789175905, + "grad_norm": 6.859429359436035, + "learning_rate": 1.1506273684706825e-05, + "loss": 0.4435, + "step": 261260 + }, + { + "epoch": 2.3097119821779026, + "grad_norm": 2.0631189346313477, + "learning_rate": 1.1504800297034955e-05, + "loss": 0.5681, + "step": 261270 + }, + { + "epoch": 2.3098003854382148, + "grad_norm": 4.012839317321777, + "learning_rate": 1.1503326909363085e-05, + "loss": 0.5785, + "step": 261280 + }, + { + "epoch": 2.3098887886985273, + "grad_norm": 3.4789767265319824, + "learning_rate": 1.1501853521691213e-05, + "loss": 0.6202, + "step": 261290 + }, + { + "epoch": 2.3099771919588394, + "grad_norm": 5.885256290435791, + "learning_rate": 1.1500380134019343e-05, + "loss": 0.4798, + "step": 261300 + }, + { + "epoch": 2.3100655952191516, + "grad_norm": 14.56280517578125, + "learning_rate": 1.1498906746347473e-05, + "loss": 0.5249, + "step": 261310 + }, + { + "epoch": 2.310153998479464, + "grad_norm": 2.3001198768615723, + "learning_rate": 1.1497433358675601e-05, + "loss": 0.5533, + "step": 261320 + }, + { + "epoch": 2.3102424017397762, + "grad_norm": 2.3670241832733154, + "learning_rate": 1.1495959971003731e-05, + "loss": 0.5703, + "step": 261330 + }, + { + "epoch": 2.3103308050000884, + "grad_norm": 1.8588852882385254, + "learning_rate": 1.1494486583331862e-05, + "loss": 0.6896, + "step": 261340 + }, + { + "epoch": 2.3104192082604005, + "grad_norm": 3.7617111206054688, + "learning_rate": 1.149301319565999e-05, + "loss": 0.4371, + "step": 261350 + }, + { + "epoch": 2.310507611520713, + "grad_norm": 5.21024227142334, + "learning_rate": 1.149153980798812e-05, + "loss": 0.6038, + "step": 261360 + }, + { + "epoch": 2.310596014781025, + "grad_norm": 1.6285313367843628, + "learning_rate": 1.1490066420316248e-05, + "loss": 0.4831, + "step": 261370 + }, + { + "epoch": 2.3106844180413373, + "grad_norm": 3.5718438625335693, + "learning_rate": 1.1488593032644378e-05, + "loss": 0.5245, + "step": 261380 + }, + { + "epoch": 2.31077282130165, + "grad_norm": 2.3800911903381348, + "learning_rate": 1.1487119644972508e-05, + "loss": 0.5815, + "step": 261390 + }, + { + "epoch": 2.310861224561962, + "grad_norm": 5.7724609375, + "learning_rate": 1.1485646257300637e-05, + "loss": 0.6021, + "step": 261400 + }, + { + "epoch": 2.310949627822274, + "grad_norm": 1.658455491065979, + "learning_rate": 1.1484172869628767e-05, + "loss": 0.4102, + "step": 261410 + }, + { + "epoch": 2.311038031082586, + "grad_norm": 5.555984973907471, + "learning_rate": 1.1482699481956895e-05, + "loss": 0.5619, + "step": 261420 + }, + { + "epoch": 2.3111264343428983, + "grad_norm": 1.9994621276855469, + "learning_rate": 1.1481226094285025e-05, + "loss": 0.3846, + "step": 261430 + }, + { + "epoch": 2.311214837603211, + "grad_norm": 3.239980459213257, + "learning_rate": 1.1479752706613154e-05, + "loss": 0.5813, + "step": 261440 + }, + { + "epoch": 2.311303240863523, + "grad_norm": 2.6692845821380615, + "learning_rate": 1.1478279318941284e-05, + "loss": 0.4822, + "step": 261450 + }, + { + "epoch": 2.311391644123835, + "grad_norm": 8.870869636535645, + "learning_rate": 1.1476805931269412e-05, + "loss": 0.4549, + "step": 261460 + }, + { + "epoch": 2.3114800473841477, + "grad_norm": 3.2965152263641357, + "learning_rate": 1.1475332543597542e-05, + "loss": 0.5029, + "step": 261470 + }, + { + "epoch": 2.31156845064446, + "grad_norm": 0.6670087575912476, + "learning_rate": 1.1473859155925672e-05, + "loss": 0.6258, + "step": 261480 + }, + { + "epoch": 2.311656853904772, + "grad_norm": 1.9396076202392578, + "learning_rate": 1.14723857682538e-05, + "loss": 0.6149, + "step": 261490 + }, + { + "epoch": 2.311745257165084, + "grad_norm": 2.1890978813171387, + "learning_rate": 1.147091238058193e-05, + "loss": 0.5061, + "step": 261500 + }, + { + "epoch": 2.3118336604253966, + "grad_norm": 2.2735447883605957, + "learning_rate": 1.1469438992910059e-05, + "loss": 0.5127, + "step": 261510 + }, + { + "epoch": 2.3119220636857087, + "grad_norm": 3.548892021179199, + "learning_rate": 1.1467965605238189e-05, + "loss": 0.7002, + "step": 261520 + }, + { + "epoch": 2.312010466946021, + "grad_norm": 5.472440719604492, + "learning_rate": 1.1466492217566317e-05, + "loss": 0.4479, + "step": 261530 + }, + { + "epoch": 2.3120988702063334, + "grad_norm": 5.139155864715576, + "learning_rate": 1.1465018829894447e-05, + "loss": 0.5858, + "step": 261540 + }, + { + "epoch": 2.3121872734666455, + "grad_norm": 11.460111618041992, + "learning_rate": 1.1463545442222576e-05, + "loss": 0.5115, + "step": 261550 + }, + { + "epoch": 2.3122756767269577, + "grad_norm": 5.76637601852417, + "learning_rate": 1.1462072054550706e-05, + "loss": 0.4866, + "step": 261560 + }, + { + "epoch": 2.31236407998727, + "grad_norm": 2.169355869293213, + "learning_rate": 1.1460598666878836e-05, + "loss": 0.3954, + "step": 261570 + }, + { + "epoch": 2.3124524832475823, + "grad_norm": 2.8808040618896484, + "learning_rate": 1.1459125279206964e-05, + "loss": 0.4781, + "step": 261580 + }, + { + "epoch": 2.3125408865078945, + "grad_norm": 5.853160858154297, + "learning_rate": 1.1457651891535094e-05, + "loss": 0.479, + "step": 261590 + }, + { + "epoch": 2.3126292897682066, + "grad_norm": 1.9566152095794678, + "learning_rate": 1.1456178503863222e-05, + "loss": 0.5334, + "step": 261600 + }, + { + "epoch": 2.3127176930285187, + "grad_norm": 6.38908576965332, + "learning_rate": 1.1454705116191352e-05, + "loss": 0.6326, + "step": 261610 + }, + { + "epoch": 2.3128060962888313, + "grad_norm": 2.4594509601593018, + "learning_rate": 1.1453231728519481e-05, + "loss": 0.4317, + "step": 261620 + }, + { + "epoch": 2.3128944995491434, + "grad_norm": 3.1883063316345215, + "learning_rate": 1.1451758340847611e-05, + "loss": 0.5532, + "step": 261630 + }, + { + "epoch": 2.3129829028094555, + "grad_norm": 2.9559600353240967, + "learning_rate": 1.145028495317574e-05, + "loss": 0.5249, + "step": 261640 + }, + { + "epoch": 2.3130713060697676, + "grad_norm": 2.0421640872955322, + "learning_rate": 1.144881156550387e-05, + "loss": 0.3761, + "step": 261650 + }, + { + "epoch": 2.31315970933008, + "grad_norm": 0.9813166260719299, + "learning_rate": 1.1447338177832e-05, + "loss": 0.4074, + "step": 261660 + }, + { + "epoch": 2.3132481125903923, + "grad_norm": 1.4246805906295776, + "learning_rate": 1.1445864790160128e-05, + "loss": 0.4708, + "step": 261670 + }, + { + "epoch": 2.3133365158507044, + "grad_norm": 4.807336807250977, + "learning_rate": 1.1444391402488258e-05, + "loss": 0.6772, + "step": 261680 + }, + { + "epoch": 2.313424919111017, + "grad_norm": 1.5083609819412231, + "learning_rate": 1.1442918014816386e-05, + "loss": 0.47, + "step": 261690 + }, + { + "epoch": 2.313513322371329, + "grad_norm": 8.073775291442871, + "learning_rate": 1.1441444627144516e-05, + "loss": 0.5624, + "step": 261700 + }, + { + "epoch": 2.3136017256316412, + "grad_norm": 16.49439811706543, + "learning_rate": 1.1439971239472645e-05, + "loss": 0.5505, + "step": 261710 + }, + { + "epoch": 2.3136901288919534, + "grad_norm": 2.7310657501220703, + "learning_rate": 1.1438497851800775e-05, + "loss": 0.6645, + "step": 261720 + }, + { + "epoch": 2.313778532152266, + "grad_norm": 1.1191768646240234, + "learning_rate": 1.1437024464128903e-05, + "loss": 0.4558, + "step": 261730 + }, + { + "epoch": 2.313866935412578, + "grad_norm": 1.8698821067810059, + "learning_rate": 1.1435551076457033e-05, + "loss": 0.4125, + "step": 261740 + }, + { + "epoch": 2.31395533867289, + "grad_norm": 1.5336493253707886, + "learning_rate": 1.1434077688785163e-05, + "loss": 0.4538, + "step": 261750 + }, + { + "epoch": 2.3140437419332027, + "grad_norm": 1.5305899381637573, + "learning_rate": 1.1432604301113291e-05, + "loss": 0.3811, + "step": 261760 + }, + { + "epoch": 2.314132145193515, + "grad_norm": 6.184157371520996, + "learning_rate": 1.1431130913441421e-05, + "loss": 0.5833, + "step": 261770 + }, + { + "epoch": 2.314220548453827, + "grad_norm": 2.557711362838745, + "learning_rate": 1.1429657525769551e-05, + "loss": 0.4672, + "step": 261780 + }, + { + "epoch": 2.314308951714139, + "grad_norm": 5.785394668579102, + "learning_rate": 1.142818413809768e-05, + "loss": 0.5449, + "step": 261790 + }, + { + "epoch": 2.3143973549744516, + "grad_norm": 1.8953760862350464, + "learning_rate": 1.142671075042581e-05, + "loss": 0.4785, + "step": 261800 + }, + { + "epoch": 2.3144857582347638, + "grad_norm": 4.025540351867676, + "learning_rate": 1.142523736275394e-05, + "loss": 0.627, + "step": 261810 + }, + { + "epoch": 2.314574161495076, + "grad_norm": 2.050048828125, + "learning_rate": 1.1423763975082068e-05, + "loss": 0.4604, + "step": 261820 + }, + { + "epoch": 2.314662564755388, + "grad_norm": 3.5236241817474365, + "learning_rate": 1.1422290587410198e-05, + "loss": 0.4448, + "step": 261830 + }, + { + "epoch": 2.3147509680157006, + "grad_norm": 2.667032480239868, + "learning_rate": 1.1420817199738328e-05, + "loss": 0.6291, + "step": 261840 + }, + { + "epoch": 2.3148393712760127, + "grad_norm": 1.888251543045044, + "learning_rate": 1.1419343812066457e-05, + "loss": 0.5282, + "step": 261850 + }, + { + "epoch": 2.314927774536325, + "grad_norm": 8.361710548400879, + "learning_rate": 1.1417870424394587e-05, + "loss": 0.5954, + "step": 261860 + }, + { + "epoch": 2.315016177796637, + "grad_norm": 3.185600519180298, + "learning_rate": 1.1416397036722715e-05, + "loss": 0.5082, + "step": 261870 + }, + { + "epoch": 2.3151045810569495, + "grad_norm": 4.112468719482422, + "learning_rate": 1.1414923649050845e-05, + "loss": 0.4611, + "step": 261880 + }, + { + "epoch": 2.3151929843172616, + "grad_norm": 3.9376842975616455, + "learning_rate": 1.1413450261378974e-05, + "loss": 0.431, + "step": 261890 + }, + { + "epoch": 2.3152813875775737, + "grad_norm": 3.4204277992248535, + "learning_rate": 1.1411976873707104e-05, + "loss": 0.4491, + "step": 261900 + }, + { + "epoch": 2.3153697908378863, + "grad_norm": 3.9410367012023926, + "learning_rate": 1.1410503486035232e-05, + "loss": 0.5005, + "step": 261910 + }, + { + "epoch": 2.3154581940981984, + "grad_norm": 1.5162773132324219, + "learning_rate": 1.1409030098363362e-05, + "loss": 0.6017, + "step": 261920 + }, + { + "epoch": 2.3155465973585105, + "grad_norm": 4.00655460357666, + "learning_rate": 1.140755671069149e-05, + "loss": 0.4065, + "step": 261930 + }, + { + "epoch": 2.3156350006188227, + "grad_norm": 3.7437045574188232, + "learning_rate": 1.140608332301962e-05, + "loss": 0.5423, + "step": 261940 + }, + { + "epoch": 2.3157234038791352, + "grad_norm": 5.372136116027832, + "learning_rate": 1.140460993534775e-05, + "loss": 0.5993, + "step": 261950 + }, + { + "epoch": 2.3158118071394473, + "grad_norm": 11.19925308227539, + "learning_rate": 1.1403136547675879e-05, + "loss": 0.5229, + "step": 261960 + }, + { + "epoch": 2.3159002103997595, + "grad_norm": 14.256199836730957, + "learning_rate": 1.1401663160004009e-05, + "loss": 0.6595, + "step": 261970 + }, + { + "epoch": 2.315988613660072, + "grad_norm": 3.478200912475586, + "learning_rate": 1.1400189772332137e-05, + "loss": 0.6008, + "step": 261980 + }, + { + "epoch": 2.316077016920384, + "grad_norm": 4.118014812469482, + "learning_rate": 1.1398716384660267e-05, + "loss": 0.3846, + "step": 261990 + }, + { + "epoch": 2.3161654201806963, + "grad_norm": 0.3967991769313812, + "learning_rate": 1.1397242996988396e-05, + "loss": 0.4423, + "step": 262000 + }, + { + "epoch": 2.3162538234410084, + "grad_norm": 4.186169624328613, + "learning_rate": 1.1395769609316526e-05, + "loss": 0.5426, + "step": 262010 + }, + { + "epoch": 2.3163422267013205, + "grad_norm": 1.7973713874816895, + "learning_rate": 1.1394296221644654e-05, + "loss": 0.4845, + "step": 262020 + }, + { + "epoch": 2.316430629961633, + "grad_norm": 2.6376407146453857, + "learning_rate": 1.1392822833972784e-05, + "loss": 0.5866, + "step": 262030 + }, + { + "epoch": 2.316519033221945, + "grad_norm": 4.77910852432251, + "learning_rate": 1.1391349446300914e-05, + "loss": 0.5798, + "step": 262040 + }, + { + "epoch": 2.3166074364822573, + "grad_norm": 0.914598822593689, + "learning_rate": 1.1389876058629042e-05, + "loss": 0.5865, + "step": 262050 + }, + { + "epoch": 2.31669583974257, + "grad_norm": 1.9854319095611572, + "learning_rate": 1.1388402670957172e-05, + "loss": 0.5882, + "step": 262060 + }, + { + "epoch": 2.316784243002882, + "grad_norm": 1.3589730262756348, + "learning_rate": 1.13869292832853e-05, + "loss": 0.567, + "step": 262070 + }, + { + "epoch": 2.316872646263194, + "grad_norm": 9.54977798461914, + "learning_rate": 1.1385455895613431e-05, + "loss": 0.6412, + "step": 262080 + }, + { + "epoch": 2.3169610495235062, + "grad_norm": 2.556987762451172, + "learning_rate": 1.138398250794156e-05, + "loss": 0.4866, + "step": 262090 + }, + { + "epoch": 2.317049452783819, + "grad_norm": 1.2140403985977173, + "learning_rate": 1.138250912026969e-05, + "loss": 0.5977, + "step": 262100 + }, + { + "epoch": 2.317137856044131, + "grad_norm": 2.8864893913269043, + "learning_rate": 1.1381035732597818e-05, + "loss": 0.617, + "step": 262110 + }, + { + "epoch": 2.317226259304443, + "grad_norm": 5.037142276763916, + "learning_rate": 1.1379562344925948e-05, + "loss": 0.5298, + "step": 262120 + }, + { + "epoch": 2.3173146625647556, + "grad_norm": 2.6532399654388428, + "learning_rate": 1.1378088957254078e-05, + "loss": 0.4827, + "step": 262130 + }, + { + "epoch": 2.3174030658250677, + "grad_norm": 1.2438966035842896, + "learning_rate": 1.1376615569582206e-05, + "loss": 0.477, + "step": 262140 + }, + { + "epoch": 2.31749146908538, + "grad_norm": 2.220479726791382, + "learning_rate": 1.1375142181910336e-05, + "loss": 0.4504, + "step": 262150 + }, + { + "epoch": 2.317579872345692, + "grad_norm": 1.8779276609420776, + "learning_rate": 1.1373668794238464e-05, + "loss": 0.4003, + "step": 262160 + }, + { + "epoch": 2.3176682756060045, + "grad_norm": 6.72541618347168, + "learning_rate": 1.1372195406566595e-05, + "loss": 0.6083, + "step": 262170 + }, + { + "epoch": 2.3177566788663166, + "grad_norm": 3.583116054534912, + "learning_rate": 1.1370722018894723e-05, + "loss": 0.575, + "step": 262180 + }, + { + "epoch": 2.3178450821266288, + "grad_norm": 13.330500602722168, + "learning_rate": 1.1369248631222853e-05, + "loss": 0.4945, + "step": 262190 + }, + { + "epoch": 2.317933485386941, + "grad_norm": 6.678737163543701, + "learning_rate": 1.1367775243550981e-05, + "loss": 0.5078, + "step": 262200 + }, + { + "epoch": 2.3180218886472534, + "grad_norm": 4.092459201812744, + "learning_rate": 1.1366301855879111e-05, + "loss": 0.4788, + "step": 262210 + }, + { + "epoch": 2.3181102919075656, + "grad_norm": 2.7053415775299072, + "learning_rate": 1.1364828468207241e-05, + "loss": 0.5749, + "step": 262220 + }, + { + "epoch": 2.3181986951678777, + "grad_norm": 2.189988613128662, + "learning_rate": 1.136335508053537e-05, + "loss": 0.5197, + "step": 262230 + }, + { + "epoch": 2.31828709842819, + "grad_norm": 5.539681911468506, + "learning_rate": 1.13618816928635e-05, + "loss": 0.5336, + "step": 262240 + }, + { + "epoch": 2.3183755016885024, + "grad_norm": 5.167107582092285, + "learning_rate": 1.136040830519163e-05, + "loss": 0.4983, + "step": 262250 + }, + { + "epoch": 2.3184639049488145, + "grad_norm": 2.6046574115753174, + "learning_rate": 1.1358934917519758e-05, + "loss": 0.5303, + "step": 262260 + }, + { + "epoch": 2.3185523082091266, + "grad_norm": 1.291885256767273, + "learning_rate": 1.1357461529847888e-05, + "loss": 0.5414, + "step": 262270 + }, + { + "epoch": 2.318640711469439, + "grad_norm": 2.9678471088409424, + "learning_rate": 1.1355988142176018e-05, + "loss": 0.4614, + "step": 262280 + }, + { + "epoch": 2.3187291147297513, + "grad_norm": 4.128335475921631, + "learning_rate": 1.1354514754504147e-05, + "loss": 0.5475, + "step": 262290 + }, + { + "epoch": 2.3188175179900634, + "grad_norm": 5.211457252502441, + "learning_rate": 1.1353041366832277e-05, + "loss": 0.5549, + "step": 262300 + }, + { + "epoch": 2.3189059212503755, + "grad_norm": 8.215897560119629, + "learning_rate": 1.1351567979160407e-05, + "loss": 0.6479, + "step": 262310 + }, + { + "epoch": 2.318994324510688, + "grad_norm": 3.5134286880493164, + "learning_rate": 1.1350094591488535e-05, + "loss": 0.4417, + "step": 262320 + }, + { + "epoch": 2.319082727771, + "grad_norm": 5.294400215148926, + "learning_rate": 1.1348621203816665e-05, + "loss": 0.432, + "step": 262330 + }, + { + "epoch": 2.3191711310313123, + "grad_norm": 2.7355918884277344, + "learning_rate": 1.1347147816144793e-05, + "loss": 0.5703, + "step": 262340 + }, + { + "epoch": 2.319259534291625, + "grad_norm": 8.647595405578613, + "learning_rate": 1.1345674428472924e-05, + "loss": 0.657, + "step": 262350 + }, + { + "epoch": 2.319347937551937, + "grad_norm": 3.210958957672119, + "learning_rate": 1.1344201040801052e-05, + "loss": 0.5474, + "step": 262360 + }, + { + "epoch": 2.319436340812249, + "grad_norm": 2.8235840797424316, + "learning_rate": 1.1342727653129182e-05, + "loss": 0.4292, + "step": 262370 + }, + { + "epoch": 2.3195247440725613, + "grad_norm": 14.592721939086914, + "learning_rate": 1.134125426545731e-05, + "loss": 0.6124, + "step": 262380 + }, + { + "epoch": 2.319613147332874, + "grad_norm": 4.21879768371582, + "learning_rate": 1.133978087778544e-05, + "loss": 0.6714, + "step": 262390 + }, + { + "epoch": 2.319701550593186, + "grad_norm": 2.902940034866333, + "learning_rate": 1.1338307490113569e-05, + "loss": 0.532, + "step": 262400 + }, + { + "epoch": 2.319789953853498, + "grad_norm": 2.179553985595703, + "learning_rate": 1.1336834102441699e-05, + "loss": 0.5744, + "step": 262410 + }, + { + "epoch": 2.31987835711381, + "grad_norm": 2.3487141132354736, + "learning_rate": 1.1335360714769829e-05, + "loss": 0.4969, + "step": 262420 + }, + { + "epoch": 2.3199667603741227, + "grad_norm": 2.229466676712036, + "learning_rate": 1.1333887327097957e-05, + "loss": 0.5327, + "step": 262430 + }, + { + "epoch": 2.320055163634435, + "grad_norm": 2.3279354572296143, + "learning_rate": 1.1332413939426087e-05, + "loss": 0.4154, + "step": 262440 + }, + { + "epoch": 2.320143566894747, + "grad_norm": 3.157691717147827, + "learning_rate": 1.1330940551754216e-05, + "loss": 0.5137, + "step": 262450 + }, + { + "epoch": 2.320231970155059, + "grad_norm": 1.2090648412704468, + "learning_rate": 1.1329467164082346e-05, + "loss": 0.5208, + "step": 262460 + }, + { + "epoch": 2.3203203734153717, + "grad_norm": 2.1250813007354736, + "learning_rate": 1.1327993776410474e-05, + "loss": 0.5077, + "step": 262470 + }, + { + "epoch": 2.320408776675684, + "grad_norm": 14.0779447555542, + "learning_rate": 1.1326520388738604e-05, + "loss": 0.5271, + "step": 262480 + }, + { + "epoch": 2.320497179935996, + "grad_norm": 2.728998899459839, + "learning_rate": 1.1325047001066732e-05, + "loss": 0.5287, + "step": 262490 + }, + { + "epoch": 2.3205855831963085, + "grad_norm": 7.338333606719971, + "learning_rate": 1.1323573613394862e-05, + "loss": 0.4539, + "step": 262500 + }, + { + "epoch": 2.3206739864566206, + "grad_norm": 2.7718162536621094, + "learning_rate": 1.1322100225722992e-05, + "loss": 0.5296, + "step": 262510 + }, + { + "epoch": 2.3207623897169327, + "grad_norm": 2.6329658031463623, + "learning_rate": 1.132062683805112e-05, + "loss": 0.5143, + "step": 262520 + }, + { + "epoch": 2.320850792977245, + "grad_norm": 7.664357662200928, + "learning_rate": 1.131915345037925e-05, + "loss": 0.4744, + "step": 262530 + }, + { + "epoch": 2.3209391962375574, + "grad_norm": 17.006023406982422, + "learning_rate": 1.131768006270738e-05, + "loss": 0.6116, + "step": 262540 + }, + { + "epoch": 2.3210275994978695, + "grad_norm": 1.4228782653808594, + "learning_rate": 1.131620667503551e-05, + "loss": 0.3913, + "step": 262550 + }, + { + "epoch": 2.3211160027581816, + "grad_norm": 3.759024143218994, + "learning_rate": 1.1314733287363638e-05, + "loss": 0.5093, + "step": 262560 + }, + { + "epoch": 2.321204406018494, + "grad_norm": 2.181114435195923, + "learning_rate": 1.1313259899691768e-05, + "loss": 0.5487, + "step": 262570 + }, + { + "epoch": 2.3212928092788063, + "grad_norm": 1.2080780267715454, + "learning_rate": 1.1311786512019896e-05, + "loss": 0.47, + "step": 262580 + }, + { + "epoch": 2.3213812125391184, + "grad_norm": 6.25924825668335, + "learning_rate": 1.1310313124348026e-05, + "loss": 0.6496, + "step": 262590 + }, + { + "epoch": 2.3214696157994306, + "grad_norm": 1.735942006111145, + "learning_rate": 1.1308839736676156e-05, + "loss": 0.4572, + "step": 262600 + }, + { + "epoch": 2.3215580190597427, + "grad_norm": 14.306096076965332, + "learning_rate": 1.1307366349004284e-05, + "loss": 0.7158, + "step": 262610 + }, + { + "epoch": 2.3216464223200552, + "grad_norm": 4.456296443939209, + "learning_rate": 1.1305892961332415e-05, + "loss": 0.4643, + "step": 262620 + }, + { + "epoch": 2.3217348255803674, + "grad_norm": 1.6086517572402954, + "learning_rate": 1.1304419573660543e-05, + "loss": 0.4761, + "step": 262630 + }, + { + "epoch": 2.3218232288406795, + "grad_norm": 2.8987674713134766, + "learning_rate": 1.1302946185988673e-05, + "loss": 0.5722, + "step": 262640 + }, + { + "epoch": 2.321911632100992, + "grad_norm": 3.1908435821533203, + "learning_rate": 1.1301472798316801e-05, + "loss": 0.5405, + "step": 262650 + }, + { + "epoch": 2.322000035361304, + "grad_norm": 2.3831844329833984, + "learning_rate": 1.1299999410644931e-05, + "loss": 0.4692, + "step": 262660 + }, + { + "epoch": 2.3220884386216163, + "grad_norm": 2.6252729892730713, + "learning_rate": 1.1298526022973061e-05, + "loss": 0.5977, + "step": 262670 + }, + { + "epoch": 2.3221768418819284, + "grad_norm": 6.886342525482178, + "learning_rate": 1.129705263530119e-05, + "loss": 0.6234, + "step": 262680 + }, + { + "epoch": 2.322265245142241, + "grad_norm": 6.958355903625488, + "learning_rate": 1.129557924762932e-05, + "loss": 0.4457, + "step": 262690 + }, + { + "epoch": 2.322353648402553, + "grad_norm": 1.2958979606628418, + "learning_rate": 1.129410585995745e-05, + "loss": 0.5853, + "step": 262700 + }, + { + "epoch": 2.322442051662865, + "grad_norm": 2.9697608947753906, + "learning_rate": 1.1292632472285578e-05, + "loss": 0.6468, + "step": 262710 + }, + { + "epoch": 2.322530454923178, + "grad_norm": 9.384261131286621, + "learning_rate": 1.1291159084613708e-05, + "loss": 0.4747, + "step": 262720 + }, + { + "epoch": 2.32261885818349, + "grad_norm": 1.7182663679122925, + "learning_rate": 1.1289685696941838e-05, + "loss": 0.5327, + "step": 262730 + }, + { + "epoch": 2.322707261443802, + "grad_norm": 2.4280385971069336, + "learning_rate": 1.1288212309269967e-05, + "loss": 0.6335, + "step": 262740 + }, + { + "epoch": 2.322795664704114, + "grad_norm": 10.692617416381836, + "learning_rate": 1.1286738921598097e-05, + "loss": 0.565, + "step": 262750 + }, + { + "epoch": 2.3228840679644267, + "grad_norm": 5.929415702819824, + "learning_rate": 1.1285265533926225e-05, + "loss": 0.4734, + "step": 262760 + }, + { + "epoch": 2.322972471224739, + "grad_norm": 2.1688432693481445, + "learning_rate": 1.1283792146254355e-05, + "loss": 0.4126, + "step": 262770 + }, + { + "epoch": 2.323060874485051, + "grad_norm": 4.644767761230469, + "learning_rate": 1.1282318758582485e-05, + "loss": 0.437, + "step": 262780 + }, + { + "epoch": 2.323149277745363, + "grad_norm": 4.648106575012207, + "learning_rate": 1.1280845370910613e-05, + "loss": 0.5068, + "step": 262790 + }, + { + "epoch": 2.3232376810056756, + "grad_norm": 3.6522891521453857, + "learning_rate": 1.1279371983238744e-05, + "loss": 0.4816, + "step": 262800 + }, + { + "epoch": 2.3233260842659877, + "grad_norm": 2.1884336471557617, + "learning_rate": 1.1277898595566872e-05, + "loss": 0.5026, + "step": 262810 + }, + { + "epoch": 2.3234144875263, + "grad_norm": 3.593816041946411, + "learning_rate": 1.1276425207895002e-05, + "loss": 0.5948, + "step": 262820 + }, + { + "epoch": 2.323502890786612, + "grad_norm": 7.264957427978516, + "learning_rate": 1.127495182022313e-05, + "loss": 0.5195, + "step": 262830 + }, + { + "epoch": 2.3235912940469245, + "grad_norm": 15.512468338012695, + "learning_rate": 1.127347843255126e-05, + "loss": 0.5924, + "step": 262840 + }, + { + "epoch": 2.3236796973072367, + "grad_norm": 5.179112434387207, + "learning_rate": 1.1272005044879389e-05, + "loss": 0.5666, + "step": 262850 + }, + { + "epoch": 2.323768100567549, + "grad_norm": 0.8464191555976868, + "learning_rate": 1.1270531657207519e-05, + "loss": 0.4136, + "step": 262860 + }, + { + "epoch": 2.3238565038278614, + "grad_norm": 13.765921592712402, + "learning_rate": 1.1269058269535649e-05, + "loss": 0.4876, + "step": 262870 + }, + { + "epoch": 2.3239449070881735, + "grad_norm": 11.316739082336426, + "learning_rate": 1.1267584881863777e-05, + "loss": 0.6293, + "step": 262880 + }, + { + "epoch": 2.3240333103484856, + "grad_norm": 11.494611740112305, + "learning_rate": 1.1266111494191907e-05, + "loss": 0.5029, + "step": 262890 + }, + { + "epoch": 2.3241217136087977, + "grad_norm": 1.7371128797531128, + "learning_rate": 1.1264638106520036e-05, + "loss": 0.4536, + "step": 262900 + }, + { + "epoch": 2.3242101168691103, + "grad_norm": 3.7005696296691895, + "learning_rate": 1.1263164718848166e-05, + "loss": 0.5947, + "step": 262910 + }, + { + "epoch": 2.3242985201294224, + "grad_norm": 9.563356399536133, + "learning_rate": 1.1261691331176294e-05, + "loss": 0.5691, + "step": 262920 + }, + { + "epoch": 2.3243869233897345, + "grad_norm": 3.74212384223938, + "learning_rate": 1.1260217943504424e-05, + "loss": 0.5538, + "step": 262930 + }, + { + "epoch": 2.324475326650047, + "grad_norm": 4.585826396942139, + "learning_rate": 1.1258744555832552e-05, + "loss": 0.5899, + "step": 262940 + }, + { + "epoch": 2.324563729910359, + "grad_norm": 0.9881808161735535, + "learning_rate": 1.1257271168160682e-05, + "loss": 0.4761, + "step": 262950 + }, + { + "epoch": 2.3246521331706713, + "grad_norm": 1.425551176071167, + "learning_rate": 1.125579778048881e-05, + "loss": 0.5681, + "step": 262960 + }, + { + "epoch": 2.3247405364309834, + "grad_norm": 3.2959940433502197, + "learning_rate": 1.125432439281694e-05, + "loss": 0.5461, + "step": 262970 + }, + { + "epoch": 2.324828939691296, + "grad_norm": 4.497406482696533, + "learning_rate": 1.125285100514507e-05, + "loss": 0.542, + "step": 262980 + }, + { + "epoch": 2.324917342951608, + "grad_norm": 3.1114721298217773, + "learning_rate": 1.12513776174732e-05, + "loss": 0.4914, + "step": 262990 + }, + { + "epoch": 2.3250057462119202, + "grad_norm": 2.1042990684509277, + "learning_rate": 1.124990422980133e-05, + "loss": 0.4957, + "step": 263000 + }, + { + "epoch": 2.3250941494722324, + "grad_norm": 2.000885486602783, + "learning_rate": 1.1248430842129458e-05, + "loss": 0.4336, + "step": 263010 + }, + { + "epoch": 2.325182552732545, + "grad_norm": 3.578127384185791, + "learning_rate": 1.1246957454457588e-05, + "loss": 0.67, + "step": 263020 + }, + { + "epoch": 2.325270955992857, + "grad_norm": 8.505927085876465, + "learning_rate": 1.1245484066785716e-05, + "loss": 0.5655, + "step": 263030 + }, + { + "epoch": 2.325359359253169, + "grad_norm": 2.6246249675750732, + "learning_rate": 1.1244010679113846e-05, + "loss": 0.5558, + "step": 263040 + }, + { + "epoch": 2.3254477625134813, + "grad_norm": 7.133711338043213, + "learning_rate": 1.1242537291441974e-05, + "loss": 0.5159, + "step": 263050 + }, + { + "epoch": 2.325536165773794, + "grad_norm": 5.1766228675842285, + "learning_rate": 1.1241063903770104e-05, + "loss": 0.4072, + "step": 263060 + }, + { + "epoch": 2.325624569034106, + "grad_norm": 13.603132247924805, + "learning_rate": 1.1239590516098234e-05, + "loss": 0.6132, + "step": 263070 + }, + { + "epoch": 2.325712972294418, + "grad_norm": 3.0010509490966797, + "learning_rate": 1.1238117128426363e-05, + "loss": 0.6413, + "step": 263080 + }, + { + "epoch": 2.3258013755547307, + "grad_norm": 0.787135124206543, + "learning_rate": 1.1236643740754493e-05, + "loss": 0.603, + "step": 263090 + }, + { + "epoch": 2.3258897788150428, + "grad_norm": 3.3527355194091797, + "learning_rate": 1.1235170353082621e-05, + "loss": 0.466, + "step": 263100 + }, + { + "epoch": 2.325978182075355, + "grad_norm": 2.469621419906616, + "learning_rate": 1.1233696965410751e-05, + "loss": 0.5232, + "step": 263110 + }, + { + "epoch": 2.326066585335667, + "grad_norm": 1.0837697982788086, + "learning_rate": 1.123222357773888e-05, + "loss": 0.4525, + "step": 263120 + }, + { + "epoch": 2.3261549885959796, + "grad_norm": 1.349210500717163, + "learning_rate": 1.123075019006701e-05, + "loss": 0.602, + "step": 263130 + }, + { + "epoch": 2.3262433918562917, + "grad_norm": 5.8354268074035645, + "learning_rate": 1.122927680239514e-05, + "loss": 0.5785, + "step": 263140 + }, + { + "epoch": 2.326331795116604, + "grad_norm": 2.006589889526367, + "learning_rate": 1.1227803414723268e-05, + "loss": 0.5842, + "step": 263150 + }, + { + "epoch": 2.3264201983769164, + "grad_norm": 4.483150959014893, + "learning_rate": 1.1226330027051398e-05, + "loss": 0.5502, + "step": 263160 + }, + { + "epoch": 2.3265086016372285, + "grad_norm": 2.4719207286834717, + "learning_rate": 1.1224856639379528e-05, + "loss": 0.4593, + "step": 263170 + }, + { + "epoch": 2.3265970048975406, + "grad_norm": 3.347445249557495, + "learning_rate": 1.1223383251707657e-05, + "loss": 0.5299, + "step": 263180 + }, + { + "epoch": 2.3266854081578527, + "grad_norm": 3.73453688621521, + "learning_rate": 1.1221909864035787e-05, + "loss": 0.4441, + "step": 263190 + }, + { + "epoch": 2.326773811418165, + "grad_norm": 6.937019348144531, + "learning_rate": 1.1220436476363917e-05, + "loss": 0.536, + "step": 263200 + }, + { + "epoch": 2.3268622146784774, + "grad_norm": 10.463818550109863, + "learning_rate": 1.1218963088692045e-05, + "loss": 0.5846, + "step": 263210 + }, + { + "epoch": 2.3269506179387895, + "grad_norm": 0.6591556072235107, + "learning_rate": 1.1217489701020175e-05, + "loss": 0.4403, + "step": 263220 + }, + { + "epoch": 2.3270390211991017, + "grad_norm": 4.485307216644287, + "learning_rate": 1.1216016313348303e-05, + "loss": 0.4766, + "step": 263230 + }, + { + "epoch": 2.3271274244594142, + "grad_norm": 3.9524989128112793, + "learning_rate": 1.1214542925676433e-05, + "loss": 0.4726, + "step": 263240 + }, + { + "epoch": 2.3272158277197263, + "grad_norm": 1.6099746227264404, + "learning_rate": 1.1213069538004563e-05, + "loss": 0.4834, + "step": 263250 + }, + { + "epoch": 2.3273042309800385, + "grad_norm": 1.415482521057129, + "learning_rate": 1.1211596150332692e-05, + "loss": 0.604, + "step": 263260 + }, + { + "epoch": 2.3273926342403506, + "grad_norm": 28.913860321044922, + "learning_rate": 1.1210122762660822e-05, + "loss": 0.5483, + "step": 263270 + }, + { + "epoch": 2.327481037500663, + "grad_norm": 2.822252035140991, + "learning_rate": 1.120864937498895e-05, + "loss": 0.5091, + "step": 263280 + }, + { + "epoch": 2.3275694407609753, + "grad_norm": 4.249111175537109, + "learning_rate": 1.120717598731708e-05, + "loss": 0.5059, + "step": 263290 + }, + { + "epoch": 2.3276578440212874, + "grad_norm": 4.301076412200928, + "learning_rate": 1.1205702599645209e-05, + "loss": 0.5652, + "step": 263300 + }, + { + "epoch": 2.3277462472816, + "grad_norm": 2.177920341491699, + "learning_rate": 1.1204229211973339e-05, + "loss": 0.6751, + "step": 263310 + }, + { + "epoch": 2.327834650541912, + "grad_norm": 2.3259756565093994, + "learning_rate": 1.1202755824301467e-05, + "loss": 0.4279, + "step": 263320 + }, + { + "epoch": 2.327923053802224, + "grad_norm": 3.4177920818328857, + "learning_rate": 1.1201282436629597e-05, + "loss": 0.3969, + "step": 263330 + }, + { + "epoch": 2.3280114570625363, + "grad_norm": 1.215247392654419, + "learning_rate": 1.1199809048957727e-05, + "loss": 0.4636, + "step": 263340 + }, + { + "epoch": 2.328099860322849, + "grad_norm": 1.3323938846588135, + "learning_rate": 1.1198335661285855e-05, + "loss": 0.6007, + "step": 263350 + }, + { + "epoch": 2.328188263583161, + "grad_norm": 3.1231987476348877, + "learning_rate": 1.1196862273613986e-05, + "loss": 0.549, + "step": 263360 + }, + { + "epoch": 2.328276666843473, + "grad_norm": 1.8065677881240845, + "learning_rate": 1.1195388885942114e-05, + "loss": 0.5594, + "step": 263370 + }, + { + "epoch": 2.3283650701037852, + "grad_norm": 5.018585681915283, + "learning_rate": 1.1193915498270244e-05, + "loss": 0.5324, + "step": 263380 + }, + { + "epoch": 2.328453473364098, + "grad_norm": 4.450254440307617, + "learning_rate": 1.1192442110598372e-05, + "loss": 0.5364, + "step": 263390 + }, + { + "epoch": 2.32854187662441, + "grad_norm": 3.5244553089141846, + "learning_rate": 1.1190968722926502e-05, + "loss": 0.444, + "step": 263400 + }, + { + "epoch": 2.328630279884722, + "grad_norm": 2.3630120754241943, + "learning_rate": 1.118949533525463e-05, + "loss": 0.5912, + "step": 263410 + }, + { + "epoch": 2.328718683145034, + "grad_norm": 7.678308010101318, + "learning_rate": 1.118802194758276e-05, + "loss": 0.5656, + "step": 263420 + }, + { + "epoch": 2.3288070864053467, + "grad_norm": 2.5644547939300537, + "learning_rate": 1.118654855991089e-05, + "loss": 0.6792, + "step": 263430 + }, + { + "epoch": 2.328895489665659, + "grad_norm": 1.6038157939910889, + "learning_rate": 1.1185075172239019e-05, + "loss": 0.5823, + "step": 263440 + }, + { + "epoch": 2.328983892925971, + "grad_norm": 1.6336392164230347, + "learning_rate": 1.118360178456715e-05, + "loss": 0.5199, + "step": 263450 + }, + { + "epoch": 2.3290722961862835, + "grad_norm": 5.257998466491699, + "learning_rate": 1.1182128396895278e-05, + "loss": 0.5378, + "step": 263460 + }, + { + "epoch": 2.3291606994465957, + "grad_norm": 3.6875667572021484, + "learning_rate": 1.1180655009223408e-05, + "loss": 0.6073, + "step": 263470 + }, + { + "epoch": 2.3292491027069078, + "grad_norm": 2.119178056716919, + "learning_rate": 1.1179181621551536e-05, + "loss": 0.4977, + "step": 263480 + }, + { + "epoch": 2.32933750596722, + "grad_norm": 3.5257105827331543, + "learning_rate": 1.1177708233879666e-05, + "loss": 0.6052, + "step": 263490 + }, + { + "epoch": 2.3294259092275325, + "grad_norm": 5.364933967590332, + "learning_rate": 1.1176234846207794e-05, + "loss": 0.5734, + "step": 263500 + }, + { + "epoch": 2.3295143124878446, + "grad_norm": 6.212423801422119, + "learning_rate": 1.1174761458535924e-05, + "loss": 0.5599, + "step": 263510 + }, + { + "epoch": 2.3296027157481567, + "grad_norm": 3.9252190589904785, + "learning_rate": 1.1173288070864053e-05, + "loss": 0.4777, + "step": 263520 + }, + { + "epoch": 2.3296911190084693, + "grad_norm": 2.833355188369751, + "learning_rate": 1.1171814683192183e-05, + "loss": 0.5417, + "step": 263530 + }, + { + "epoch": 2.3297795222687814, + "grad_norm": 3.1803719997406006, + "learning_rate": 1.1170341295520313e-05, + "loss": 0.6123, + "step": 263540 + }, + { + "epoch": 2.3298679255290935, + "grad_norm": 2.3743393421173096, + "learning_rate": 1.1168867907848441e-05, + "loss": 0.4863, + "step": 263550 + }, + { + "epoch": 2.3299563287894056, + "grad_norm": 1.4525270462036133, + "learning_rate": 1.1167394520176571e-05, + "loss": 0.4448, + "step": 263560 + }, + { + "epoch": 2.330044732049718, + "grad_norm": 4.939568042755127, + "learning_rate": 1.11659211325047e-05, + "loss": 0.4315, + "step": 263570 + }, + { + "epoch": 2.3301331353100303, + "grad_norm": 3.064539670944214, + "learning_rate": 1.116444774483283e-05, + "loss": 0.4851, + "step": 263580 + }, + { + "epoch": 2.3302215385703424, + "grad_norm": 3.111398935317993, + "learning_rate": 1.1162974357160958e-05, + "loss": 0.548, + "step": 263590 + }, + { + "epoch": 2.3303099418306545, + "grad_norm": 3.9201464653015137, + "learning_rate": 1.1161500969489088e-05, + "loss": 0.7357, + "step": 263600 + }, + { + "epoch": 2.330398345090967, + "grad_norm": 3.934558391571045, + "learning_rate": 1.1160027581817218e-05, + "loss": 0.5994, + "step": 263610 + }, + { + "epoch": 2.3304867483512792, + "grad_norm": 5.140555381774902, + "learning_rate": 1.1158554194145346e-05, + "loss": 0.585, + "step": 263620 + }, + { + "epoch": 2.3305751516115913, + "grad_norm": 3.7709429264068604, + "learning_rate": 1.1157080806473477e-05, + "loss": 0.5114, + "step": 263630 + }, + { + "epoch": 2.3306635548719035, + "grad_norm": 3.417423725128174, + "learning_rate": 1.1155607418801607e-05, + "loss": 0.5659, + "step": 263640 + }, + { + "epoch": 2.330751958132216, + "grad_norm": 1.1489049196243286, + "learning_rate": 1.1154134031129735e-05, + "loss": 0.556, + "step": 263650 + }, + { + "epoch": 2.330840361392528, + "grad_norm": 5.2950334548950195, + "learning_rate": 1.1152660643457865e-05, + "loss": 0.5872, + "step": 263660 + }, + { + "epoch": 2.3309287646528403, + "grad_norm": 2.1337592601776123, + "learning_rate": 1.1151187255785995e-05, + "loss": 0.6356, + "step": 263670 + }, + { + "epoch": 2.331017167913153, + "grad_norm": 2.423402786254883, + "learning_rate": 1.1149713868114123e-05, + "loss": 0.4258, + "step": 263680 + }, + { + "epoch": 2.331105571173465, + "grad_norm": 2.1931188106536865, + "learning_rate": 1.1148240480442253e-05, + "loss": 0.4444, + "step": 263690 + }, + { + "epoch": 2.331193974433777, + "grad_norm": 1.8426101207733154, + "learning_rate": 1.1146767092770382e-05, + "loss": 0.4386, + "step": 263700 + }, + { + "epoch": 2.331282377694089, + "grad_norm": 1.044671893119812, + "learning_rate": 1.1145293705098512e-05, + "loss": 0.5873, + "step": 263710 + }, + { + "epoch": 2.3313707809544018, + "grad_norm": 3.5918009281158447, + "learning_rate": 1.1143820317426642e-05, + "loss": 0.5397, + "step": 263720 + }, + { + "epoch": 2.331459184214714, + "grad_norm": 2.6830813884735107, + "learning_rate": 1.114234692975477e-05, + "loss": 0.5549, + "step": 263730 + }, + { + "epoch": 2.331547587475026, + "grad_norm": 3.9444191455841064, + "learning_rate": 1.11408735420829e-05, + "loss": 0.5241, + "step": 263740 + }, + { + "epoch": 2.3316359907353386, + "grad_norm": 7.664246082305908, + "learning_rate": 1.1139400154411029e-05, + "loss": 0.7164, + "step": 263750 + }, + { + "epoch": 2.3317243939956507, + "grad_norm": 2.4927613735198975, + "learning_rate": 1.1137926766739159e-05, + "loss": 0.5213, + "step": 263760 + }, + { + "epoch": 2.331812797255963, + "grad_norm": 8.30559253692627, + "learning_rate": 1.1136453379067287e-05, + "loss": 0.6287, + "step": 263770 + }, + { + "epoch": 2.331901200516275, + "grad_norm": 3.414565324783325, + "learning_rate": 1.1134979991395417e-05, + "loss": 0.4277, + "step": 263780 + }, + { + "epoch": 2.331989603776587, + "grad_norm": 2.5618326663970947, + "learning_rate": 1.1133506603723545e-05, + "loss": 0.4706, + "step": 263790 + }, + { + "epoch": 2.3320780070368996, + "grad_norm": 6.029736042022705, + "learning_rate": 1.1132033216051675e-05, + "loss": 0.503, + "step": 263800 + }, + { + "epoch": 2.3321664102972117, + "grad_norm": 8.374373435974121, + "learning_rate": 1.1130559828379806e-05, + "loss": 0.5504, + "step": 263810 + }, + { + "epoch": 2.332254813557524, + "grad_norm": 3.025097131729126, + "learning_rate": 1.1129086440707934e-05, + "loss": 0.5515, + "step": 263820 + }, + { + "epoch": 2.3323432168178364, + "grad_norm": 13.599252700805664, + "learning_rate": 1.1127613053036064e-05, + "loss": 0.4802, + "step": 263830 + }, + { + "epoch": 2.3324316200781485, + "grad_norm": 1.1892255544662476, + "learning_rate": 1.1126139665364192e-05, + "loss": 0.528, + "step": 263840 + }, + { + "epoch": 2.3325200233384606, + "grad_norm": 2.06760573387146, + "learning_rate": 1.1124666277692322e-05, + "loss": 0.6, + "step": 263850 + }, + { + "epoch": 2.3326084265987728, + "grad_norm": 3.613736867904663, + "learning_rate": 1.112319289002045e-05, + "loss": 0.4269, + "step": 263860 + }, + { + "epoch": 2.3326968298590853, + "grad_norm": 6.264893054962158, + "learning_rate": 1.112171950234858e-05, + "loss": 0.4139, + "step": 263870 + }, + { + "epoch": 2.3327852331193974, + "grad_norm": 4.6732916831970215, + "learning_rate": 1.1120246114676709e-05, + "loss": 0.5778, + "step": 263880 + }, + { + "epoch": 2.3328736363797096, + "grad_norm": 4.467507362365723, + "learning_rate": 1.1118772727004839e-05, + "loss": 0.491, + "step": 263890 + }, + { + "epoch": 2.332962039640022, + "grad_norm": 1.1290229558944702, + "learning_rate": 1.111729933933297e-05, + "loss": 0.4626, + "step": 263900 + }, + { + "epoch": 2.3330504429003343, + "grad_norm": 2.1149673461914062, + "learning_rate": 1.1115825951661098e-05, + "loss": 0.4602, + "step": 263910 + }, + { + "epoch": 2.3331388461606464, + "grad_norm": 4.979875087738037, + "learning_rate": 1.1114352563989228e-05, + "loss": 0.4976, + "step": 263920 + }, + { + "epoch": 2.3332272494209585, + "grad_norm": 3.031094789505005, + "learning_rate": 1.1112879176317356e-05, + "loss": 0.6834, + "step": 263930 + }, + { + "epoch": 2.333315652681271, + "grad_norm": 6.601892948150635, + "learning_rate": 1.1111405788645486e-05, + "loss": 0.558, + "step": 263940 + }, + { + "epoch": 2.333404055941583, + "grad_norm": 8.478073120117188, + "learning_rate": 1.1109932400973614e-05, + "loss": 0.4863, + "step": 263950 + }, + { + "epoch": 2.3334924592018953, + "grad_norm": 0.8916829228401184, + "learning_rate": 1.1108459013301744e-05, + "loss": 0.6048, + "step": 263960 + }, + { + "epoch": 2.3335808624622074, + "grad_norm": 1.7561638355255127, + "learning_rate": 1.1106985625629873e-05, + "loss": 0.4901, + "step": 263970 + }, + { + "epoch": 2.33366926572252, + "grad_norm": 3.3298914432525635, + "learning_rate": 1.1105512237958003e-05, + "loss": 0.6473, + "step": 263980 + }, + { + "epoch": 2.333757668982832, + "grad_norm": 15.714461326599121, + "learning_rate": 1.1104038850286133e-05, + "loss": 0.5253, + "step": 263990 + }, + { + "epoch": 2.333846072243144, + "grad_norm": 2.4954428672790527, + "learning_rate": 1.1102565462614261e-05, + "loss": 0.5381, + "step": 264000 + }, + { + "epoch": 2.3339344755034563, + "grad_norm": 20.387210845947266, + "learning_rate": 1.1101092074942391e-05, + "loss": 0.4597, + "step": 264010 + }, + { + "epoch": 2.334022878763769, + "grad_norm": 2.3003387451171875, + "learning_rate": 1.109961868727052e-05, + "loss": 0.5513, + "step": 264020 + }, + { + "epoch": 2.334111282024081, + "grad_norm": 4.5786662101745605, + "learning_rate": 1.109814529959865e-05, + "loss": 0.6336, + "step": 264030 + }, + { + "epoch": 2.334199685284393, + "grad_norm": 2.9842865467071533, + "learning_rate": 1.1096671911926778e-05, + "loss": 0.5067, + "step": 264040 + }, + { + "epoch": 2.3342880885447057, + "grad_norm": 2.4921152591705322, + "learning_rate": 1.1095198524254908e-05, + "loss": 0.5039, + "step": 264050 + }, + { + "epoch": 2.334376491805018, + "grad_norm": 2.260565996170044, + "learning_rate": 1.1093725136583036e-05, + "loss": 0.5653, + "step": 264060 + }, + { + "epoch": 2.33446489506533, + "grad_norm": 2.3350141048431396, + "learning_rate": 1.1092251748911166e-05, + "loss": 0.5753, + "step": 264070 + }, + { + "epoch": 2.334553298325642, + "grad_norm": 27.242765426635742, + "learning_rate": 1.1090778361239296e-05, + "loss": 0.5733, + "step": 264080 + }, + { + "epoch": 2.3346417015859546, + "grad_norm": 17.811683654785156, + "learning_rate": 1.1089304973567425e-05, + "loss": 0.7876, + "step": 264090 + }, + { + "epoch": 2.3347301048462668, + "grad_norm": 3.792261838912964, + "learning_rate": 1.1087831585895555e-05, + "loss": 0.5278, + "step": 264100 + }, + { + "epoch": 2.334818508106579, + "grad_norm": 1.6810028553009033, + "learning_rate": 1.1086358198223685e-05, + "loss": 0.5272, + "step": 264110 + }, + { + "epoch": 2.3349069113668914, + "grad_norm": 11.522461891174316, + "learning_rate": 1.1084884810551813e-05, + "loss": 0.5618, + "step": 264120 + }, + { + "epoch": 2.3349953146272036, + "grad_norm": 1.427977204322815, + "learning_rate": 1.1083411422879943e-05, + "loss": 0.5191, + "step": 264130 + }, + { + "epoch": 2.3350837178875157, + "grad_norm": 11.260397911071777, + "learning_rate": 1.1081938035208073e-05, + "loss": 0.4585, + "step": 264140 + }, + { + "epoch": 2.335172121147828, + "grad_norm": 4.484994411468506, + "learning_rate": 1.1080464647536202e-05, + "loss": 0.4409, + "step": 264150 + }, + { + "epoch": 2.3352605244081404, + "grad_norm": 16.667970657348633, + "learning_rate": 1.1078991259864332e-05, + "loss": 0.5168, + "step": 264160 + }, + { + "epoch": 2.3353489276684525, + "grad_norm": 2.6021366119384766, + "learning_rate": 1.107751787219246e-05, + "loss": 0.5103, + "step": 264170 + }, + { + "epoch": 2.3354373309287646, + "grad_norm": 4.210467338562012, + "learning_rate": 1.107604448452059e-05, + "loss": 0.5715, + "step": 264180 + }, + { + "epoch": 2.3355257341890767, + "grad_norm": 6.384367942810059, + "learning_rate": 1.107457109684872e-05, + "loss": 0.5131, + "step": 264190 + }, + { + "epoch": 2.3356141374493893, + "grad_norm": 18.68454360961914, + "learning_rate": 1.1073097709176849e-05, + "loss": 0.6596, + "step": 264200 + }, + { + "epoch": 2.3357025407097014, + "grad_norm": 3.566671848297119, + "learning_rate": 1.1071624321504979e-05, + "loss": 0.5482, + "step": 264210 + }, + { + "epoch": 2.3357909439700135, + "grad_norm": 2.0996203422546387, + "learning_rate": 1.1070150933833107e-05, + "loss": 0.4777, + "step": 264220 + }, + { + "epoch": 2.3358793472303256, + "grad_norm": 6.523705959320068, + "learning_rate": 1.1068677546161237e-05, + "loss": 0.5428, + "step": 264230 + }, + { + "epoch": 2.335967750490638, + "grad_norm": 1.3725429773330688, + "learning_rate": 1.1067204158489365e-05, + "loss": 0.5262, + "step": 264240 + }, + { + "epoch": 2.3360561537509503, + "grad_norm": 1.8962433338165283, + "learning_rate": 1.1065730770817495e-05, + "loss": 0.5706, + "step": 264250 + }, + { + "epoch": 2.3361445570112624, + "grad_norm": 4.392897129058838, + "learning_rate": 1.1064257383145624e-05, + "loss": 0.6556, + "step": 264260 + }, + { + "epoch": 2.336232960271575, + "grad_norm": 1.6586635112762451, + "learning_rate": 1.1062783995473754e-05, + "loss": 0.413, + "step": 264270 + }, + { + "epoch": 2.336321363531887, + "grad_norm": 2.8835926055908203, + "learning_rate": 1.1061310607801884e-05, + "loss": 0.4457, + "step": 264280 + }, + { + "epoch": 2.3364097667921992, + "grad_norm": 2.2460098266601562, + "learning_rate": 1.1059837220130012e-05, + "loss": 0.5021, + "step": 264290 + }, + { + "epoch": 2.3364981700525114, + "grad_norm": 2.7871673107147217, + "learning_rate": 1.1058363832458142e-05, + "loss": 0.5088, + "step": 264300 + }, + { + "epoch": 2.336586573312824, + "grad_norm": 5.215290546417236, + "learning_rate": 1.105689044478627e-05, + "loss": 0.5732, + "step": 264310 + }, + { + "epoch": 2.336674976573136, + "grad_norm": 1.8743544816970825, + "learning_rate": 1.10554170571144e-05, + "loss": 0.4532, + "step": 264320 + }, + { + "epoch": 2.336763379833448, + "grad_norm": 6.248709201812744, + "learning_rate": 1.1053943669442529e-05, + "loss": 0.6176, + "step": 264330 + }, + { + "epoch": 2.3368517830937607, + "grad_norm": 2.255681037902832, + "learning_rate": 1.1052470281770659e-05, + "loss": 0.5445, + "step": 264340 + }, + { + "epoch": 2.336940186354073, + "grad_norm": 3.0044658184051514, + "learning_rate": 1.1050996894098787e-05, + "loss": 0.4514, + "step": 264350 + }, + { + "epoch": 2.337028589614385, + "grad_norm": 4.518677234649658, + "learning_rate": 1.1049523506426917e-05, + "loss": 0.5985, + "step": 264360 + }, + { + "epoch": 2.337116992874697, + "grad_norm": 2.842318296432495, + "learning_rate": 1.1048050118755048e-05, + "loss": 0.5323, + "step": 264370 + }, + { + "epoch": 2.337205396135009, + "grad_norm": 1.1750260591506958, + "learning_rate": 1.1046576731083176e-05, + "loss": 0.3668, + "step": 264380 + }, + { + "epoch": 2.337293799395322, + "grad_norm": 0.9989945292472839, + "learning_rate": 1.1045103343411306e-05, + "loss": 0.5715, + "step": 264390 + }, + { + "epoch": 2.337382202655634, + "grad_norm": 6.6348347663879395, + "learning_rate": 1.1043629955739434e-05, + "loss": 0.5384, + "step": 264400 + }, + { + "epoch": 2.337470605915946, + "grad_norm": 1.716619849205017, + "learning_rate": 1.1042156568067564e-05, + "loss": 0.4547, + "step": 264410 + }, + { + "epoch": 2.3375590091762586, + "grad_norm": 5.419493675231934, + "learning_rate": 1.1040683180395693e-05, + "loss": 0.4815, + "step": 264420 + }, + { + "epoch": 2.3376474124365707, + "grad_norm": 7.4886932373046875, + "learning_rate": 1.1039209792723823e-05, + "loss": 0.5987, + "step": 264430 + }, + { + "epoch": 2.337735815696883, + "grad_norm": 11.017470359802246, + "learning_rate": 1.1037736405051951e-05, + "loss": 0.5742, + "step": 264440 + }, + { + "epoch": 2.337824218957195, + "grad_norm": 1.5609923601150513, + "learning_rate": 1.1036263017380081e-05, + "loss": 0.588, + "step": 264450 + }, + { + "epoch": 2.3379126222175075, + "grad_norm": 5.921296119689941, + "learning_rate": 1.1034789629708211e-05, + "loss": 0.5166, + "step": 264460 + }, + { + "epoch": 2.3380010254778196, + "grad_norm": 1.9246691465377808, + "learning_rate": 1.103331624203634e-05, + "loss": 0.5459, + "step": 264470 + }, + { + "epoch": 2.3380894287381317, + "grad_norm": 1.9507017135620117, + "learning_rate": 1.103184285436447e-05, + "loss": 0.4456, + "step": 264480 + }, + { + "epoch": 2.3381778319984443, + "grad_norm": 7.1076836585998535, + "learning_rate": 1.1030369466692598e-05, + "loss": 0.5802, + "step": 264490 + }, + { + "epoch": 2.3382662352587564, + "grad_norm": 4.487006664276123, + "learning_rate": 1.1028896079020728e-05, + "loss": 0.5568, + "step": 264500 + }, + { + "epoch": 2.3383546385190686, + "grad_norm": 2.428027868270874, + "learning_rate": 1.1027422691348856e-05, + "loss": 0.6071, + "step": 264510 + }, + { + "epoch": 2.3384430417793807, + "grad_norm": 2.338336229324341, + "learning_rate": 1.1025949303676986e-05, + "loss": 0.4923, + "step": 264520 + }, + { + "epoch": 2.3385314450396932, + "grad_norm": 3.2151901721954346, + "learning_rate": 1.1024475916005115e-05, + "loss": 0.5905, + "step": 264530 + }, + { + "epoch": 2.3386198483000054, + "grad_norm": 6.261946678161621, + "learning_rate": 1.1023002528333245e-05, + "loss": 0.6017, + "step": 264540 + }, + { + "epoch": 2.3387082515603175, + "grad_norm": 1.7213959693908691, + "learning_rate": 1.1021529140661375e-05, + "loss": 0.6917, + "step": 264550 + }, + { + "epoch": 2.3387966548206296, + "grad_norm": 1.2203795909881592, + "learning_rate": 1.1020055752989503e-05, + "loss": 0.341, + "step": 264560 + }, + { + "epoch": 2.338885058080942, + "grad_norm": 8.812543869018555, + "learning_rate": 1.1018582365317633e-05, + "loss": 0.6365, + "step": 264570 + }, + { + "epoch": 2.3389734613412543, + "grad_norm": 8.105429649353027, + "learning_rate": 1.1017108977645763e-05, + "loss": 0.4279, + "step": 264580 + }, + { + "epoch": 2.3390618646015664, + "grad_norm": 2.6383073329925537, + "learning_rate": 1.1015635589973892e-05, + "loss": 0.4448, + "step": 264590 + }, + { + "epoch": 2.3391502678618785, + "grad_norm": 1.8231126070022583, + "learning_rate": 1.1014162202302022e-05, + "loss": 0.5669, + "step": 264600 + }, + { + "epoch": 2.339238671122191, + "grad_norm": 7.228283882141113, + "learning_rate": 1.1012688814630152e-05, + "loss": 0.5835, + "step": 264610 + }, + { + "epoch": 2.339327074382503, + "grad_norm": 2.3388895988464355, + "learning_rate": 1.101121542695828e-05, + "loss": 0.5211, + "step": 264620 + }, + { + "epoch": 2.3394154776428153, + "grad_norm": 5.498096466064453, + "learning_rate": 1.100974203928641e-05, + "loss": 0.4728, + "step": 264630 + }, + { + "epoch": 2.339503880903128, + "grad_norm": 3.4583115577697754, + "learning_rate": 1.1008268651614539e-05, + "loss": 0.4141, + "step": 264640 + }, + { + "epoch": 2.33959228416344, + "grad_norm": 0.7154993414878845, + "learning_rate": 1.1006795263942669e-05, + "loss": 0.4742, + "step": 264650 + }, + { + "epoch": 2.339680687423752, + "grad_norm": 3.7648627758026123, + "learning_rate": 1.1005321876270799e-05, + "loss": 0.5528, + "step": 264660 + }, + { + "epoch": 2.3397690906840642, + "grad_norm": 3.8903188705444336, + "learning_rate": 1.1003848488598927e-05, + "loss": 0.5502, + "step": 264670 + }, + { + "epoch": 2.339857493944377, + "grad_norm": 2.3824405670166016, + "learning_rate": 1.1002375100927057e-05, + "loss": 0.6311, + "step": 264680 + }, + { + "epoch": 2.339945897204689, + "grad_norm": 4.849879264831543, + "learning_rate": 1.1000901713255185e-05, + "loss": 0.5772, + "step": 264690 + }, + { + "epoch": 2.340034300465001, + "grad_norm": 2.008281707763672, + "learning_rate": 1.0999428325583315e-05, + "loss": 0.4081, + "step": 264700 + }, + { + "epoch": 2.3401227037253136, + "grad_norm": 8.36697769165039, + "learning_rate": 1.0997954937911444e-05, + "loss": 0.6133, + "step": 264710 + }, + { + "epoch": 2.3402111069856257, + "grad_norm": 1.7575979232788086, + "learning_rate": 1.0996481550239574e-05, + "loss": 0.6119, + "step": 264720 + }, + { + "epoch": 2.340299510245938, + "grad_norm": 2.4420318603515625, + "learning_rate": 1.0995008162567702e-05, + "loss": 0.393, + "step": 264730 + }, + { + "epoch": 2.34038791350625, + "grad_norm": 3.2822046279907227, + "learning_rate": 1.0993534774895832e-05, + "loss": 0.4565, + "step": 264740 + }, + { + "epoch": 2.3404763167665625, + "grad_norm": 2.3320486545562744, + "learning_rate": 1.0992061387223962e-05, + "loss": 0.3886, + "step": 264750 + }, + { + "epoch": 2.3405647200268747, + "grad_norm": 2.514420986175537, + "learning_rate": 1.099058799955209e-05, + "loss": 0.5854, + "step": 264760 + }, + { + "epoch": 2.3406531232871868, + "grad_norm": 3.5219833850860596, + "learning_rate": 1.098911461188022e-05, + "loss": 0.4827, + "step": 264770 + }, + { + "epoch": 2.340741526547499, + "grad_norm": 2.6608619689941406, + "learning_rate": 1.0987641224208349e-05, + "loss": 0.6419, + "step": 264780 + }, + { + "epoch": 2.3408299298078115, + "grad_norm": 1.706734538078308, + "learning_rate": 1.0986167836536479e-05, + "loss": 0.6836, + "step": 264790 + }, + { + "epoch": 2.3409183330681236, + "grad_norm": 2.4161760807037354, + "learning_rate": 1.0984694448864607e-05, + "loss": 0.4515, + "step": 264800 + }, + { + "epoch": 2.3410067363284357, + "grad_norm": 3.9371163845062256, + "learning_rate": 1.0983221061192737e-05, + "loss": 0.5026, + "step": 264810 + }, + { + "epoch": 2.341095139588748, + "grad_norm": 1.5785771608352661, + "learning_rate": 1.0981747673520866e-05, + "loss": 0.5469, + "step": 264820 + }, + { + "epoch": 2.3411835428490604, + "grad_norm": 11.621973991394043, + "learning_rate": 1.0980274285848996e-05, + "loss": 0.5073, + "step": 264830 + }, + { + "epoch": 2.3412719461093725, + "grad_norm": 6.450642108917236, + "learning_rate": 1.0978800898177126e-05, + "loss": 0.3901, + "step": 264840 + }, + { + "epoch": 2.3413603493696846, + "grad_norm": 1.8314085006713867, + "learning_rate": 1.0977327510505254e-05, + "loss": 0.6101, + "step": 264850 + }, + { + "epoch": 2.341448752629997, + "grad_norm": 3.1587727069854736, + "learning_rate": 1.0975854122833384e-05, + "loss": 0.5509, + "step": 264860 + }, + { + "epoch": 2.3415371558903093, + "grad_norm": 2.38997745513916, + "learning_rate": 1.0974380735161513e-05, + "loss": 0.5921, + "step": 264870 + }, + { + "epoch": 2.3416255591506214, + "grad_norm": 21.149335861206055, + "learning_rate": 1.0972907347489643e-05, + "loss": 0.6317, + "step": 264880 + }, + { + "epoch": 2.3417139624109335, + "grad_norm": 2.6147732734680176, + "learning_rate": 1.0971433959817771e-05, + "loss": 0.6178, + "step": 264890 + }, + { + "epoch": 2.341802365671246, + "grad_norm": 7.103963851928711, + "learning_rate": 1.0969960572145901e-05, + "loss": 0.4291, + "step": 264900 + }, + { + "epoch": 2.3418907689315582, + "grad_norm": 7.600110054016113, + "learning_rate": 1.096848718447403e-05, + "loss": 0.536, + "step": 264910 + }, + { + "epoch": 2.3419791721918704, + "grad_norm": 2.0762665271759033, + "learning_rate": 1.096701379680216e-05, + "loss": 0.5013, + "step": 264920 + }, + { + "epoch": 2.342067575452183, + "grad_norm": 5.964752197265625, + "learning_rate": 1.096554040913029e-05, + "loss": 0.5229, + "step": 264930 + }, + { + "epoch": 2.342155978712495, + "grad_norm": 6.002354621887207, + "learning_rate": 1.0964067021458418e-05, + "loss": 0.5032, + "step": 264940 + }, + { + "epoch": 2.342244381972807, + "grad_norm": 3.6536498069763184, + "learning_rate": 1.0962593633786548e-05, + "loss": 0.4993, + "step": 264950 + }, + { + "epoch": 2.3423327852331193, + "grad_norm": 2.8985087871551514, + "learning_rate": 1.0961120246114676e-05, + "loss": 0.576, + "step": 264960 + }, + { + "epoch": 2.3424211884934314, + "grad_norm": 3.024523973464966, + "learning_rate": 1.0959646858442806e-05, + "loss": 0.6114, + "step": 264970 + }, + { + "epoch": 2.342509591753744, + "grad_norm": 8.951346397399902, + "learning_rate": 1.0958173470770935e-05, + "loss": 0.598, + "step": 264980 + }, + { + "epoch": 2.342597995014056, + "grad_norm": 1.533031702041626, + "learning_rate": 1.0956700083099065e-05, + "loss": 0.4604, + "step": 264990 + }, + { + "epoch": 2.342686398274368, + "grad_norm": 4.690630912780762, + "learning_rate": 1.0955226695427195e-05, + "loss": 0.6027, + "step": 265000 + }, + { + "epoch": 2.3427748015346808, + "grad_norm": 1.8269938230514526, + "learning_rate": 1.0953753307755323e-05, + "loss": 0.4877, + "step": 265010 + }, + { + "epoch": 2.342863204794993, + "grad_norm": 1.8884024620056152, + "learning_rate": 1.0952279920083453e-05, + "loss": 0.4631, + "step": 265020 + }, + { + "epoch": 2.342951608055305, + "grad_norm": 1.4869071245193481, + "learning_rate": 1.0950806532411583e-05, + "loss": 0.5378, + "step": 265030 + }, + { + "epoch": 2.343040011315617, + "grad_norm": 7.121763706207275, + "learning_rate": 1.0949333144739712e-05, + "loss": 0.5814, + "step": 265040 + }, + { + "epoch": 2.3431284145759297, + "grad_norm": 2.199887752532959, + "learning_rate": 1.0947859757067842e-05, + "loss": 0.3779, + "step": 265050 + }, + { + "epoch": 2.343216817836242, + "grad_norm": 9.647836685180664, + "learning_rate": 1.0946386369395972e-05, + "loss": 0.5118, + "step": 265060 + }, + { + "epoch": 2.343305221096554, + "grad_norm": 2.7302582263946533, + "learning_rate": 1.09449129817241e-05, + "loss": 0.653, + "step": 265070 + }, + { + "epoch": 2.3433936243568665, + "grad_norm": 1.3745814561843872, + "learning_rate": 1.094343959405223e-05, + "loss": 0.5601, + "step": 265080 + }, + { + "epoch": 2.3434820276171786, + "grad_norm": 2.306387186050415, + "learning_rate": 1.0941966206380358e-05, + "loss": 0.4774, + "step": 265090 + }, + { + "epoch": 2.3435704308774907, + "grad_norm": 13.696290969848633, + "learning_rate": 1.0940492818708489e-05, + "loss": 0.535, + "step": 265100 + }, + { + "epoch": 2.343658834137803, + "grad_norm": 3.054443597793579, + "learning_rate": 1.0939019431036617e-05, + "loss": 0.5297, + "step": 265110 + }, + { + "epoch": 2.3437472373981154, + "grad_norm": 1.937474012374878, + "learning_rate": 1.0937546043364747e-05, + "loss": 0.5122, + "step": 265120 + }, + { + "epoch": 2.3438356406584275, + "grad_norm": 5.511300563812256, + "learning_rate": 1.0936072655692877e-05, + "loss": 0.6352, + "step": 265130 + }, + { + "epoch": 2.3439240439187397, + "grad_norm": 6.677384853363037, + "learning_rate": 1.0934599268021005e-05, + "loss": 0.5547, + "step": 265140 + }, + { + "epoch": 2.3440124471790518, + "grad_norm": 2.875537872314453, + "learning_rate": 1.0933125880349135e-05, + "loss": 0.6393, + "step": 265150 + }, + { + "epoch": 2.3441008504393643, + "grad_norm": 1.9043227434158325, + "learning_rate": 1.0931652492677264e-05, + "loss": 0.4872, + "step": 265160 + }, + { + "epoch": 2.3441892536996765, + "grad_norm": 1.874245285987854, + "learning_rate": 1.0930179105005394e-05, + "loss": 0.473, + "step": 265170 + }, + { + "epoch": 2.3442776569599886, + "grad_norm": 2.006753444671631, + "learning_rate": 1.0928705717333522e-05, + "loss": 0.5786, + "step": 265180 + }, + { + "epoch": 2.3443660602203007, + "grad_norm": 3.080805540084839, + "learning_rate": 1.0927232329661652e-05, + "loss": 0.4227, + "step": 265190 + }, + { + "epoch": 2.3444544634806133, + "grad_norm": 7.576858997344971, + "learning_rate": 1.092575894198978e-05, + "loss": 0.5107, + "step": 265200 + }, + { + "epoch": 2.3445428667409254, + "grad_norm": 4.060778617858887, + "learning_rate": 1.092428555431791e-05, + "loss": 0.6061, + "step": 265210 + }, + { + "epoch": 2.3446312700012375, + "grad_norm": 10.754429817199707, + "learning_rate": 1.092281216664604e-05, + "loss": 0.4436, + "step": 265220 + }, + { + "epoch": 2.34471967326155, + "grad_norm": 5.928440093994141, + "learning_rate": 1.0921338778974169e-05, + "loss": 0.5174, + "step": 265230 + }, + { + "epoch": 2.344808076521862, + "grad_norm": 4.152365684509277, + "learning_rate": 1.0919865391302299e-05, + "loss": 0.5239, + "step": 265240 + }, + { + "epoch": 2.3448964797821743, + "grad_norm": 2.884516954421997, + "learning_rate": 1.0918392003630427e-05, + "loss": 0.4891, + "step": 265250 + }, + { + "epoch": 2.3449848830424864, + "grad_norm": 4.263916969299316, + "learning_rate": 1.0916918615958557e-05, + "loss": 0.5391, + "step": 265260 + }, + { + "epoch": 2.345073286302799, + "grad_norm": 13.530688285827637, + "learning_rate": 1.0915445228286686e-05, + "loss": 0.5136, + "step": 265270 + }, + { + "epoch": 2.345161689563111, + "grad_norm": 4.557342529296875, + "learning_rate": 1.0913971840614816e-05, + "loss": 0.531, + "step": 265280 + }, + { + "epoch": 2.3452500928234232, + "grad_norm": 11.014352798461914, + "learning_rate": 1.0912498452942944e-05, + "loss": 0.5325, + "step": 265290 + }, + { + "epoch": 2.345338496083736, + "grad_norm": 13.571048736572266, + "learning_rate": 1.0911025065271074e-05, + "loss": 0.5836, + "step": 265300 + }, + { + "epoch": 2.345426899344048, + "grad_norm": 2.542391061782837, + "learning_rate": 1.0909551677599204e-05, + "loss": 0.5954, + "step": 265310 + }, + { + "epoch": 2.34551530260436, + "grad_norm": 1.3372920751571655, + "learning_rate": 1.0908078289927333e-05, + "loss": 0.3417, + "step": 265320 + }, + { + "epoch": 2.345603705864672, + "grad_norm": 2.027765989303589, + "learning_rate": 1.0906604902255463e-05, + "loss": 0.486, + "step": 265330 + }, + { + "epoch": 2.3456921091249847, + "grad_norm": 2.5210866928100586, + "learning_rate": 1.0905131514583591e-05, + "loss": 0.5858, + "step": 265340 + }, + { + "epoch": 2.345780512385297, + "grad_norm": 2.299269199371338, + "learning_rate": 1.0903658126911721e-05, + "loss": 0.4152, + "step": 265350 + }, + { + "epoch": 2.345868915645609, + "grad_norm": 1.639922022819519, + "learning_rate": 1.090218473923985e-05, + "loss": 0.4929, + "step": 265360 + }, + { + "epoch": 2.345957318905921, + "grad_norm": 10.728738784790039, + "learning_rate": 1.090071135156798e-05, + "loss": 0.5567, + "step": 265370 + }, + { + "epoch": 2.3460457221662336, + "grad_norm": 1.065611720085144, + "learning_rate": 1.0899237963896108e-05, + "loss": 0.47, + "step": 265380 + }, + { + "epoch": 2.3461341254265458, + "grad_norm": 1.4403209686279297, + "learning_rate": 1.0897764576224238e-05, + "loss": 0.4685, + "step": 265390 + }, + { + "epoch": 2.346222528686858, + "grad_norm": 3.2502377033233643, + "learning_rate": 1.0896291188552368e-05, + "loss": 0.4266, + "step": 265400 + }, + { + "epoch": 2.34631093194717, + "grad_norm": 4.497636318206787, + "learning_rate": 1.0894817800880496e-05, + "loss": 0.6585, + "step": 265410 + }, + { + "epoch": 2.3463993352074826, + "grad_norm": 0.7818334102630615, + "learning_rate": 1.0893344413208626e-05, + "loss": 0.5259, + "step": 265420 + }, + { + "epoch": 2.3464877384677947, + "grad_norm": 8.22137451171875, + "learning_rate": 1.0891871025536755e-05, + "loss": 0.4879, + "step": 265430 + }, + { + "epoch": 2.346576141728107, + "grad_norm": 2.664548397064209, + "learning_rate": 1.0890397637864885e-05, + "loss": 0.5658, + "step": 265440 + }, + { + "epoch": 2.3466645449884194, + "grad_norm": 2.0297653675079346, + "learning_rate": 1.0888924250193013e-05, + "loss": 0.4937, + "step": 265450 + }, + { + "epoch": 2.3467529482487315, + "grad_norm": 2.221503257751465, + "learning_rate": 1.0887450862521143e-05, + "loss": 0.5045, + "step": 265460 + }, + { + "epoch": 2.3468413515090436, + "grad_norm": 3.2406458854675293, + "learning_rate": 1.0885977474849273e-05, + "loss": 0.567, + "step": 265470 + }, + { + "epoch": 2.3469297547693557, + "grad_norm": 2.5304384231567383, + "learning_rate": 1.0884504087177402e-05, + "loss": 0.5347, + "step": 265480 + }, + { + "epoch": 2.3470181580296683, + "grad_norm": 3.3791022300720215, + "learning_rate": 1.0883030699505532e-05, + "loss": 0.5511, + "step": 265490 + }, + { + "epoch": 2.3471065612899804, + "grad_norm": 1.2670260667800903, + "learning_rate": 1.0881557311833662e-05, + "loss": 0.5707, + "step": 265500 + }, + { + "epoch": 2.3471949645502925, + "grad_norm": 5.210492134094238, + "learning_rate": 1.088008392416179e-05, + "loss": 0.6367, + "step": 265510 + }, + { + "epoch": 2.347283367810605, + "grad_norm": 11.994673728942871, + "learning_rate": 1.087861053648992e-05, + "loss": 0.5228, + "step": 265520 + }, + { + "epoch": 2.347371771070917, + "grad_norm": 3.422161817550659, + "learning_rate": 1.087713714881805e-05, + "loss": 0.5852, + "step": 265530 + }, + { + "epoch": 2.3474601743312293, + "grad_norm": 2.1705973148345947, + "learning_rate": 1.0875663761146178e-05, + "loss": 0.4507, + "step": 265540 + }, + { + "epoch": 2.3475485775915415, + "grad_norm": 2.6213321685791016, + "learning_rate": 1.0874190373474309e-05, + "loss": 0.5803, + "step": 265550 + }, + { + "epoch": 2.3476369808518536, + "grad_norm": 2.2767229080200195, + "learning_rate": 1.0872716985802437e-05, + "loss": 0.6341, + "step": 265560 + }, + { + "epoch": 2.347725384112166, + "grad_norm": 25.35391616821289, + "learning_rate": 1.0871243598130567e-05, + "loss": 0.5242, + "step": 265570 + }, + { + "epoch": 2.3478137873724783, + "grad_norm": 2.092524766921997, + "learning_rate": 1.0869770210458697e-05, + "loss": 0.6775, + "step": 265580 + }, + { + "epoch": 2.3479021906327904, + "grad_norm": 3.0572240352630615, + "learning_rate": 1.0868296822786825e-05, + "loss": 0.529, + "step": 265590 + }, + { + "epoch": 2.347990593893103, + "grad_norm": 5.669069766998291, + "learning_rate": 1.0866823435114955e-05, + "loss": 0.4546, + "step": 265600 + }, + { + "epoch": 2.348078997153415, + "grad_norm": 2.621131181716919, + "learning_rate": 1.0865350047443084e-05, + "loss": 0.5202, + "step": 265610 + }, + { + "epoch": 2.348167400413727, + "grad_norm": 2.0205681324005127, + "learning_rate": 1.0863876659771214e-05, + "loss": 0.469, + "step": 265620 + }, + { + "epoch": 2.3482558036740393, + "grad_norm": 1.0203722715377808, + "learning_rate": 1.0862403272099342e-05, + "loss": 0.5978, + "step": 265630 + }, + { + "epoch": 2.348344206934352, + "grad_norm": 8.732378005981445, + "learning_rate": 1.0860929884427472e-05, + "loss": 0.4717, + "step": 265640 + }, + { + "epoch": 2.348432610194664, + "grad_norm": 3.712832450866699, + "learning_rate": 1.08594564967556e-05, + "loss": 0.604, + "step": 265650 + }, + { + "epoch": 2.348521013454976, + "grad_norm": 2.4171671867370605, + "learning_rate": 1.085798310908373e-05, + "loss": 0.579, + "step": 265660 + }, + { + "epoch": 2.3486094167152887, + "grad_norm": 1.8733044862747192, + "learning_rate": 1.0856509721411859e-05, + "loss": 0.4841, + "step": 265670 + }, + { + "epoch": 2.348697819975601, + "grad_norm": 2.6557648181915283, + "learning_rate": 1.0855036333739989e-05, + "loss": 0.4875, + "step": 265680 + }, + { + "epoch": 2.348786223235913, + "grad_norm": 3.593625068664551, + "learning_rate": 1.0853562946068119e-05, + "loss": 0.5051, + "step": 265690 + }, + { + "epoch": 2.348874626496225, + "grad_norm": 1.1997578144073486, + "learning_rate": 1.0852089558396247e-05, + "loss": 0.503, + "step": 265700 + }, + { + "epoch": 2.3489630297565376, + "grad_norm": 1.3594717979431152, + "learning_rate": 1.0850616170724377e-05, + "loss": 0.4686, + "step": 265710 + }, + { + "epoch": 2.3490514330168497, + "grad_norm": 1.7126946449279785, + "learning_rate": 1.0849142783052506e-05, + "loss": 0.6453, + "step": 265720 + }, + { + "epoch": 2.349139836277162, + "grad_norm": 3.2126312255859375, + "learning_rate": 1.0847669395380636e-05, + "loss": 0.6691, + "step": 265730 + }, + { + "epoch": 2.349228239537474, + "grad_norm": 3.3555870056152344, + "learning_rate": 1.0846196007708764e-05, + "loss": 0.6155, + "step": 265740 + }, + { + "epoch": 2.3493166427977865, + "grad_norm": 1.7889728546142578, + "learning_rate": 1.0844722620036894e-05, + "loss": 0.4577, + "step": 265750 + }, + { + "epoch": 2.3494050460580986, + "grad_norm": 2.5857152938842773, + "learning_rate": 1.0843249232365023e-05, + "loss": 0.6115, + "step": 265760 + }, + { + "epoch": 2.3494934493184108, + "grad_norm": 25.376846313476562, + "learning_rate": 1.0841775844693153e-05, + "loss": 0.5733, + "step": 265770 + }, + { + "epoch": 2.349581852578723, + "grad_norm": 2.4583969116210938, + "learning_rate": 1.0840302457021283e-05, + "loss": 0.4712, + "step": 265780 + }, + { + "epoch": 2.3496702558390354, + "grad_norm": 1.5990146398544312, + "learning_rate": 1.0838829069349411e-05, + "loss": 0.5304, + "step": 265790 + }, + { + "epoch": 2.3497586590993476, + "grad_norm": 1.6529035568237305, + "learning_rate": 1.0837355681677541e-05, + "loss": 0.5519, + "step": 265800 + }, + { + "epoch": 2.3498470623596597, + "grad_norm": 1.3092199563980103, + "learning_rate": 1.083588229400567e-05, + "loss": 0.4671, + "step": 265810 + }, + { + "epoch": 2.3499354656199722, + "grad_norm": 2.0789682865142822, + "learning_rate": 1.08344089063338e-05, + "loss": 0.4967, + "step": 265820 + }, + { + "epoch": 2.3500238688802844, + "grad_norm": 2.705392599105835, + "learning_rate": 1.0832935518661928e-05, + "loss": 0.4452, + "step": 265830 + }, + { + "epoch": 2.3501122721405965, + "grad_norm": 4.83109188079834, + "learning_rate": 1.0831462130990058e-05, + "loss": 0.7125, + "step": 265840 + }, + { + "epoch": 2.3502006754009086, + "grad_norm": 3.691737651824951, + "learning_rate": 1.0829988743318186e-05, + "loss": 0.5807, + "step": 265850 + }, + { + "epoch": 2.350289078661221, + "grad_norm": 5.639939308166504, + "learning_rate": 1.0828515355646316e-05, + "loss": 0.5021, + "step": 265860 + }, + { + "epoch": 2.3503774819215333, + "grad_norm": 6.028113842010498, + "learning_rate": 1.0827041967974446e-05, + "loss": 0.5268, + "step": 265870 + }, + { + "epoch": 2.3504658851818454, + "grad_norm": 4.481994152069092, + "learning_rate": 1.0825568580302575e-05, + "loss": 0.5245, + "step": 265880 + }, + { + "epoch": 2.350554288442158, + "grad_norm": 1.6069895029067993, + "learning_rate": 1.0824095192630705e-05, + "loss": 0.4186, + "step": 265890 + }, + { + "epoch": 2.35064269170247, + "grad_norm": 3.9204647541046143, + "learning_rate": 1.0822621804958833e-05, + "loss": 0.5295, + "step": 265900 + }, + { + "epoch": 2.350731094962782, + "grad_norm": 2.5019829273223877, + "learning_rate": 1.0821148417286963e-05, + "loss": 0.5096, + "step": 265910 + }, + { + "epoch": 2.3508194982230943, + "grad_norm": 1.9129271507263184, + "learning_rate": 1.0819675029615091e-05, + "loss": 0.5209, + "step": 265920 + }, + { + "epoch": 2.350907901483407, + "grad_norm": 1.9530349969863892, + "learning_rate": 1.0818201641943222e-05, + "loss": 0.3762, + "step": 265930 + }, + { + "epoch": 2.350996304743719, + "grad_norm": 4.232728481292725, + "learning_rate": 1.0816728254271352e-05, + "loss": 0.5631, + "step": 265940 + }, + { + "epoch": 2.351084708004031, + "grad_norm": 9.467439651489258, + "learning_rate": 1.081525486659948e-05, + "loss": 0.5903, + "step": 265950 + }, + { + "epoch": 2.3511731112643433, + "grad_norm": 9.449173927307129, + "learning_rate": 1.081378147892761e-05, + "loss": 0.5723, + "step": 265960 + }, + { + "epoch": 2.351261514524656, + "grad_norm": 4.2920403480529785, + "learning_rate": 1.081230809125574e-05, + "loss": 0.5008, + "step": 265970 + }, + { + "epoch": 2.351349917784968, + "grad_norm": 2.6516189575195312, + "learning_rate": 1.0810834703583868e-05, + "loss": 0.5894, + "step": 265980 + }, + { + "epoch": 2.35143832104528, + "grad_norm": 5.120555877685547, + "learning_rate": 1.0809361315911998e-05, + "loss": 0.5322, + "step": 265990 + }, + { + "epoch": 2.351526724305592, + "grad_norm": 2.227701425552368, + "learning_rate": 1.0807887928240128e-05, + "loss": 0.4537, + "step": 266000 + }, + { + "epoch": 2.3516151275659047, + "grad_norm": 2.9680206775665283, + "learning_rate": 1.0806414540568257e-05, + "loss": 0.4745, + "step": 266010 + }, + { + "epoch": 2.351703530826217, + "grad_norm": 4.773060321807861, + "learning_rate": 1.0804941152896387e-05, + "loss": 0.4389, + "step": 266020 + }, + { + "epoch": 2.351791934086529, + "grad_norm": 4.358973979949951, + "learning_rate": 1.0803467765224515e-05, + "loss": 0.5274, + "step": 266030 + }, + { + "epoch": 2.3518803373468415, + "grad_norm": 3.2884342670440674, + "learning_rate": 1.0801994377552645e-05, + "loss": 0.5586, + "step": 266040 + }, + { + "epoch": 2.3519687406071537, + "grad_norm": 2.6858139038085938, + "learning_rate": 1.0800520989880775e-05, + "loss": 0.4673, + "step": 266050 + }, + { + "epoch": 2.352057143867466, + "grad_norm": 2.051637887954712, + "learning_rate": 1.0799047602208904e-05, + "loss": 0.4615, + "step": 266060 + }, + { + "epoch": 2.352145547127778, + "grad_norm": 5.067936897277832, + "learning_rate": 1.0797574214537034e-05, + "loss": 0.5209, + "step": 266070 + }, + { + "epoch": 2.3522339503880905, + "grad_norm": 9.270831108093262, + "learning_rate": 1.0796100826865162e-05, + "loss": 0.4944, + "step": 266080 + }, + { + "epoch": 2.3523223536484026, + "grad_norm": 3.265824317932129, + "learning_rate": 1.0794627439193292e-05, + "loss": 0.5543, + "step": 266090 + }, + { + "epoch": 2.3524107569087147, + "grad_norm": 2.119288921356201, + "learning_rate": 1.079315405152142e-05, + "loss": 0.5157, + "step": 266100 + }, + { + "epoch": 2.3524991601690273, + "grad_norm": 3.2506062984466553, + "learning_rate": 1.079168066384955e-05, + "loss": 0.6664, + "step": 266110 + }, + { + "epoch": 2.3525875634293394, + "grad_norm": 2.876891613006592, + "learning_rate": 1.0790207276177679e-05, + "loss": 0.5512, + "step": 266120 + }, + { + "epoch": 2.3526759666896515, + "grad_norm": 4.880911350250244, + "learning_rate": 1.0788733888505809e-05, + "loss": 0.5569, + "step": 266130 + }, + { + "epoch": 2.3527643699499636, + "grad_norm": 1.8779159784317017, + "learning_rate": 1.0787260500833939e-05, + "loss": 0.3757, + "step": 266140 + }, + { + "epoch": 2.3528527732102757, + "grad_norm": 16.469324111938477, + "learning_rate": 1.0785787113162067e-05, + "loss": 0.5424, + "step": 266150 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 2.432617425918579, + "learning_rate": 1.0784313725490197e-05, + "loss": 0.6331, + "step": 266160 + }, + { + "epoch": 2.3530295797309004, + "grad_norm": 3.543848991394043, + "learning_rate": 1.0782840337818326e-05, + "loss": 0.5759, + "step": 266170 + }, + { + "epoch": 2.3531179829912126, + "grad_norm": 3.909695625305176, + "learning_rate": 1.0781366950146456e-05, + "loss": 0.5923, + "step": 266180 + }, + { + "epoch": 2.353206386251525, + "grad_norm": 12.865134239196777, + "learning_rate": 1.0779893562474584e-05, + "loss": 0.5031, + "step": 266190 + }, + { + "epoch": 2.3532947895118372, + "grad_norm": 2.5058038234710693, + "learning_rate": 1.0778420174802714e-05, + "loss": 0.5137, + "step": 266200 + }, + { + "epoch": 2.3533831927721494, + "grad_norm": 0.9657379388809204, + "learning_rate": 1.0776946787130843e-05, + "loss": 0.5016, + "step": 266210 + }, + { + "epoch": 2.3534715960324615, + "grad_norm": 2.7462363243103027, + "learning_rate": 1.0775473399458973e-05, + "loss": 0.4397, + "step": 266220 + }, + { + "epoch": 2.353559999292774, + "grad_norm": 3.7030341625213623, + "learning_rate": 1.0774000011787101e-05, + "loss": 0.5311, + "step": 266230 + }, + { + "epoch": 2.353648402553086, + "grad_norm": 3.423203229904175, + "learning_rate": 1.0772526624115231e-05, + "loss": 0.4909, + "step": 266240 + }, + { + "epoch": 2.3537368058133983, + "grad_norm": 2.0918869972229004, + "learning_rate": 1.0771053236443361e-05, + "loss": 0.443, + "step": 266250 + }, + { + "epoch": 2.353825209073711, + "grad_norm": 11.963713645935059, + "learning_rate": 1.076957984877149e-05, + "loss": 0.4927, + "step": 266260 + }, + { + "epoch": 2.353913612334023, + "grad_norm": 1.3468090295791626, + "learning_rate": 1.076810646109962e-05, + "loss": 0.5592, + "step": 266270 + }, + { + "epoch": 2.354002015594335, + "grad_norm": 3.9712274074554443, + "learning_rate": 1.0766633073427748e-05, + "loss": 0.5366, + "step": 266280 + }, + { + "epoch": 2.354090418854647, + "grad_norm": 6.155745506286621, + "learning_rate": 1.0765159685755878e-05, + "loss": 0.648, + "step": 266290 + }, + { + "epoch": 2.3541788221149598, + "grad_norm": 0.975818395614624, + "learning_rate": 1.0763686298084006e-05, + "loss": 0.4991, + "step": 266300 + }, + { + "epoch": 2.354267225375272, + "grad_norm": 33.68168640136719, + "learning_rate": 1.0762212910412136e-05, + "loss": 0.51, + "step": 266310 + }, + { + "epoch": 2.354355628635584, + "grad_norm": 7.030093669891357, + "learning_rate": 1.0760739522740265e-05, + "loss": 0.4556, + "step": 266320 + }, + { + "epoch": 2.354444031895896, + "grad_norm": 5.943902492523193, + "learning_rate": 1.0759266135068395e-05, + "loss": 0.502, + "step": 266330 + }, + { + "epoch": 2.3545324351562087, + "grad_norm": 0.779185950756073, + "learning_rate": 1.0757792747396525e-05, + "loss": 0.5017, + "step": 266340 + }, + { + "epoch": 2.354620838416521, + "grad_norm": 2.8590362071990967, + "learning_rate": 1.0756319359724653e-05, + "loss": 0.4601, + "step": 266350 + }, + { + "epoch": 2.354709241676833, + "grad_norm": 3.1071391105651855, + "learning_rate": 1.0754845972052783e-05, + "loss": 0.5294, + "step": 266360 + }, + { + "epoch": 2.354797644937145, + "grad_norm": 6.179594993591309, + "learning_rate": 1.0753372584380911e-05, + "loss": 0.6024, + "step": 266370 + }, + { + "epoch": 2.3548860481974576, + "grad_norm": 4.02847957611084, + "learning_rate": 1.0751899196709042e-05, + "loss": 0.4523, + "step": 266380 + }, + { + "epoch": 2.3549744514577697, + "grad_norm": 2.964663028717041, + "learning_rate": 1.075042580903717e-05, + "loss": 0.4659, + "step": 266390 + }, + { + "epoch": 2.355062854718082, + "grad_norm": 1.7525633573532104, + "learning_rate": 1.07489524213653e-05, + "loss": 0.5447, + "step": 266400 + }, + { + "epoch": 2.3551512579783944, + "grad_norm": 12.660274505615234, + "learning_rate": 1.074747903369343e-05, + "loss": 0.4236, + "step": 266410 + }, + { + "epoch": 2.3552396612387065, + "grad_norm": 2.977996587753296, + "learning_rate": 1.0746005646021558e-05, + "loss": 0.681, + "step": 266420 + }, + { + "epoch": 2.3553280644990187, + "grad_norm": 1.7301517724990845, + "learning_rate": 1.0744532258349688e-05, + "loss": 0.4119, + "step": 266430 + }, + { + "epoch": 2.355416467759331, + "grad_norm": 4.535831928253174, + "learning_rate": 1.0743058870677818e-05, + "loss": 0.5709, + "step": 266440 + }, + { + "epoch": 2.3555048710196433, + "grad_norm": 16.50196075439453, + "learning_rate": 1.0741585483005947e-05, + "loss": 0.5641, + "step": 266450 + }, + { + "epoch": 2.3555932742799555, + "grad_norm": 2.3947536945343018, + "learning_rate": 1.0740112095334077e-05, + "loss": 0.6184, + "step": 266460 + }, + { + "epoch": 2.3556816775402676, + "grad_norm": 2.8569209575653076, + "learning_rate": 1.0738638707662207e-05, + "loss": 0.4243, + "step": 266470 + }, + { + "epoch": 2.35577008080058, + "grad_norm": 7.11621618270874, + "learning_rate": 1.0737165319990335e-05, + "loss": 0.4765, + "step": 266480 + }, + { + "epoch": 2.3558584840608923, + "grad_norm": 3.094392776489258, + "learning_rate": 1.0735691932318465e-05, + "loss": 0.5817, + "step": 266490 + }, + { + "epoch": 2.3559468873212044, + "grad_norm": 1.9154013395309448, + "learning_rate": 1.0734218544646594e-05, + "loss": 0.4836, + "step": 266500 + }, + { + "epoch": 2.3560352905815165, + "grad_norm": 2.1581265926361084, + "learning_rate": 1.0732745156974724e-05, + "loss": 0.6661, + "step": 266510 + }, + { + "epoch": 2.356123693841829, + "grad_norm": 3.931621789932251, + "learning_rate": 1.0731271769302854e-05, + "loss": 0.6285, + "step": 266520 + }, + { + "epoch": 2.356212097102141, + "grad_norm": 2.7157530784606934, + "learning_rate": 1.0729798381630982e-05, + "loss": 0.5493, + "step": 266530 + }, + { + "epoch": 2.3563005003624533, + "grad_norm": 4.408050537109375, + "learning_rate": 1.0728324993959112e-05, + "loss": 0.5322, + "step": 266540 + }, + { + "epoch": 2.3563889036227654, + "grad_norm": 1.5121294260025024, + "learning_rate": 1.072685160628724e-05, + "loss": 0.4554, + "step": 266550 + }, + { + "epoch": 2.356477306883078, + "grad_norm": 2.8865609169006348, + "learning_rate": 1.072537821861537e-05, + "loss": 0.5829, + "step": 266560 + }, + { + "epoch": 2.35656571014339, + "grad_norm": 4.438077449798584, + "learning_rate": 1.0723904830943499e-05, + "loss": 0.4418, + "step": 266570 + }, + { + "epoch": 2.3566541134037022, + "grad_norm": 15.565384864807129, + "learning_rate": 1.0722431443271629e-05, + "loss": 0.7582, + "step": 266580 + }, + { + "epoch": 2.3567425166640144, + "grad_norm": 4.928466796875, + "learning_rate": 1.0720958055599757e-05, + "loss": 0.4345, + "step": 266590 + }, + { + "epoch": 2.356830919924327, + "grad_norm": 2.529482841491699, + "learning_rate": 1.0719484667927887e-05, + "loss": 0.5714, + "step": 266600 + }, + { + "epoch": 2.356919323184639, + "grad_norm": 9.485407829284668, + "learning_rate": 1.0718011280256017e-05, + "loss": 0.5774, + "step": 266610 + }, + { + "epoch": 2.357007726444951, + "grad_norm": 7.032020092010498, + "learning_rate": 1.0716537892584146e-05, + "loss": 0.5578, + "step": 266620 + }, + { + "epoch": 2.3570961297052637, + "grad_norm": 0.9230557680130005, + "learning_rate": 1.0715064504912276e-05, + "loss": 0.3915, + "step": 266630 + }, + { + "epoch": 2.357184532965576, + "grad_norm": 3.1832642555236816, + "learning_rate": 1.0713591117240404e-05, + "loss": 0.5015, + "step": 266640 + }, + { + "epoch": 2.357272936225888, + "grad_norm": 2.2086899280548096, + "learning_rate": 1.0712117729568534e-05, + "loss": 0.4243, + "step": 266650 + }, + { + "epoch": 2.3573613394862, + "grad_norm": 4.035462856292725, + "learning_rate": 1.0710644341896663e-05, + "loss": 0.5442, + "step": 266660 + }, + { + "epoch": 2.3574497427465126, + "grad_norm": 1.9105241298675537, + "learning_rate": 1.0709170954224793e-05, + "loss": 0.4807, + "step": 266670 + }, + { + "epoch": 2.3575381460068248, + "grad_norm": 4.258758544921875, + "learning_rate": 1.0707697566552921e-05, + "loss": 0.5203, + "step": 266680 + }, + { + "epoch": 2.357626549267137, + "grad_norm": 2.2199692726135254, + "learning_rate": 1.0706224178881051e-05, + "loss": 0.528, + "step": 266690 + }, + { + "epoch": 2.3577149525274494, + "grad_norm": 6.960660934448242, + "learning_rate": 1.070475079120918e-05, + "loss": 0.5941, + "step": 266700 + }, + { + "epoch": 2.3578033557877616, + "grad_norm": 4.588650703430176, + "learning_rate": 1.070327740353731e-05, + "loss": 0.4768, + "step": 266710 + }, + { + "epoch": 2.3578917590480737, + "grad_norm": 12.704652786254883, + "learning_rate": 1.070180401586544e-05, + "loss": 0.4347, + "step": 266720 + }, + { + "epoch": 2.357980162308386, + "grad_norm": 1.0430245399475098, + "learning_rate": 1.0700330628193568e-05, + "loss": 0.5155, + "step": 266730 + }, + { + "epoch": 2.358068565568698, + "grad_norm": 1.6629669666290283, + "learning_rate": 1.0698857240521698e-05, + "loss": 0.5683, + "step": 266740 + }, + { + "epoch": 2.3581569688290105, + "grad_norm": 4.251067161560059, + "learning_rate": 1.0697383852849826e-05, + "loss": 0.5407, + "step": 266750 + }, + { + "epoch": 2.3582453720893226, + "grad_norm": 1.997297763824463, + "learning_rate": 1.0695910465177956e-05, + "loss": 0.4004, + "step": 266760 + }, + { + "epoch": 2.3583337753496347, + "grad_norm": 1.8764537572860718, + "learning_rate": 1.0694437077506085e-05, + "loss": 0.5587, + "step": 266770 + }, + { + "epoch": 2.3584221786099473, + "grad_norm": 3.6154286861419678, + "learning_rate": 1.0692963689834215e-05, + "loss": 0.5377, + "step": 266780 + }, + { + "epoch": 2.3585105818702594, + "grad_norm": 7.308650493621826, + "learning_rate": 1.0691490302162343e-05, + "loss": 0.5132, + "step": 266790 + }, + { + "epoch": 2.3585989851305715, + "grad_norm": 1.529338002204895, + "learning_rate": 1.0690016914490473e-05, + "loss": 0.5671, + "step": 266800 + }, + { + "epoch": 2.3586873883908837, + "grad_norm": 1.8873608112335205, + "learning_rate": 1.0688543526818603e-05, + "loss": 0.5077, + "step": 266810 + }, + { + "epoch": 2.358775791651196, + "grad_norm": 4.4783124923706055, + "learning_rate": 1.0687070139146731e-05, + "loss": 0.5482, + "step": 266820 + }, + { + "epoch": 2.3588641949115083, + "grad_norm": 2.58103346824646, + "learning_rate": 1.0685596751474861e-05, + "loss": 0.5009, + "step": 266830 + }, + { + "epoch": 2.3589525981718205, + "grad_norm": 1.833702564239502, + "learning_rate": 1.068412336380299e-05, + "loss": 0.4548, + "step": 266840 + }, + { + "epoch": 2.359041001432133, + "grad_norm": 2.4454853534698486, + "learning_rate": 1.068264997613112e-05, + "loss": 0.4676, + "step": 266850 + }, + { + "epoch": 2.359129404692445, + "grad_norm": 3.8333752155303955, + "learning_rate": 1.0681176588459248e-05, + "loss": 0.4654, + "step": 266860 + }, + { + "epoch": 2.3592178079527573, + "grad_norm": 9.573997497558594, + "learning_rate": 1.0679703200787378e-05, + "loss": 0.4948, + "step": 266870 + }, + { + "epoch": 2.3593062112130694, + "grad_norm": 3.344494104385376, + "learning_rate": 1.0678229813115508e-05, + "loss": 0.5166, + "step": 266880 + }, + { + "epoch": 2.359394614473382, + "grad_norm": 3.0302648544311523, + "learning_rate": 1.0676756425443637e-05, + "loss": 0.4543, + "step": 266890 + }, + { + "epoch": 2.359483017733694, + "grad_norm": 4.296350479125977, + "learning_rate": 1.0675283037771767e-05, + "loss": 0.5845, + "step": 266900 + }, + { + "epoch": 2.359571420994006, + "grad_norm": 0.971002459526062, + "learning_rate": 1.0673809650099897e-05, + "loss": 0.4991, + "step": 266910 + }, + { + "epoch": 2.3596598242543183, + "grad_norm": 3.3627185821533203, + "learning_rate": 1.0672336262428025e-05, + "loss": 0.5077, + "step": 266920 + }, + { + "epoch": 2.359748227514631, + "grad_norm": 18.80883026123047, + "learning_rate": 1.0670862874756155e-05, + "loss": 0.5524, + "step": 266930 + }, + { + "epoch": 2.359836630774943, + "grad_norm": 2.6087090969085693, + "learning_rate": 1.0669389487084285e-05, + "loss": 0.5417, + "step": 266940 + }, + { + "epoch": 2.359925034035255, + "grad_norm": 1.3001375198364258, + "learning_rate": 1.0667916099412414e-05, + "loss": 0.4963, + "step": 266950 + }, + { + "epoch": 2.3600134372955672, + "grad_norm": 3.968001365661621, + "learning_rate": 1.0666442711740544e-05, + "loss": 0.4634, + "step": 266960 + }, + { + "epoch": 2.36010184055588, + "grad_norm": 5.662290096282959, + "learning_rate": 1.0664969324068672e-05, + "loss": 0.5258, + "step": 266970 + }, + { + "epoch": 2.360190243816192, + "grad_norm": 2.9515790939331055, + "learning_rate": 1.0663495936396802e-05, + "loss": 0.4598, + "step": 266980 + }, + { + "epoch": 2.360278647076504, + "grad_norm": 3.13879132270813, + "learning_rate": 1.0662022548724932e-05, + "loss": 0.4322, + "step": 266990 + }, + { + "epoch": 2.3603670503368166, + "grad_norm": 12.312248229980469, + "learning_rate": 1.066054916105306e-05, + "loss": 0.6099, + "step": 267000 + }, + { + "epoch": 2.3604554535971287, + "grad_norm": 5.5174479484558105, + "learning_rate": 1.065907577338119e-05, + "loss": 0.6044, + "step": 267010 + }, + { + "epoch": 2.360543856857441, + "grad_norm": 7.5163702964782715, + "learning_rate": 1.0657602385709319e-05, + "loss": 0.4932, + "step": 267020 + }, + { + "epoch": 2.360632260117753, + "grad_norm": 1.4617258310317993, + "learning_rate": 1.0656128998037449e-05, + "loss": 0.5603, + "step": 267030 + }, + { + "epoch": 2.3607206633780655, + "grad_norm": 0.9048759937286377, + "learning_rate": 1.0654655610365577e-05, + "loss": 0.6471, + "step": 267040 + }, + { + "epoch": 2.3608090666383776, + "grad_norm": 6.789186954498291, + "learning_rate": 1.0653182222693707e-05, + "loss": 0.4682, + "step": 267050 + }, + { + "epoch": 2.3608974698986898, + "grad_norm": 1.4875516891479492, + "learning_rate": 1.0651708835021836e-05, + "loss": 0.5371, + "step": 267060 + }, + { + "epoch": 2.3609858731590023, + "grad_norm": 1.5061345100402832, + "learning_rate": 1.0650235447349966e-05, + "loss": 0.5792, + "step": 267070 + }, + { + "epoch": 2.3610742764193144, + "grad_norm": 5.743527412414551, + "learning_rate": 1.0648762059678096e-05, + "loss": 0.5582, + "step": 267080 + }, + { + "epoch": 2.3611626796796266, + "grad_norm": 2.113719940185547, + "learning_rate": 1.0647288672006224e-05, + "loss": 0.4634, + "step": 267090 + }, + { + "epoch": 2.3612510829399387, + "grad_norm": 4.000491619110107, + "learning_rate": 1.0645815284334354e-05, + "loss": 0.5443, + "step": 267100 + }, + { + "epoch": 2.3613394862002512, + "grad_norm": 3.118182420730591, + "learning_rate": 1.0644341896662482e-05, + "loss": 0.5427, + "step": 267110 + }, + { + "epoch": 2.3614278894605634, + "grad_norm": 3.1913819313049316, + "learning_rate": 1.0642868508990613e-05, + "loss": 0.6035, + "step": 267120 + }, + { + "epoch": 2.3615162927208755, + "grad_norm": 13.146281242370605, + "learning_rate": 1.0641395121318741e-05, + "loss": 0.522, + "step": 267130 + }, + { + "epoch": 2.3616046959811876, + "grad_norm": 4.925789833068848, + "learning_rate": 1.0639921733646871e-05, + "loss": 0.5869, + "step": 267140 + }, + { + "epoch": 2.3616930992415, + "grad_norm": 2.276764154434204, + "learning_rate": 1.0638448345975e-05, + "loss": 0.5004, + "step": 267150 + }, + { + "epoch": 2.3617815025018123, + "grad_norm": 2.6815664768218994, + "learning_rate": 1.063697495830313e-05, + "loss": 0.4442, + "step": 267160 + }, + { + "epoch": 2.3618699057621244, + "grad_norm": 2.0601794719696045, + "learning_rate": 1.063550157063126e-05, + "loss": 0.4918, + "step": 267170 + }, + { + "epoch": 2.3619583090224365, + "grad_norm": 3.3552732467651367, + "learning_rate": 1.0634028182959388e-05, + "loss": 0.5201, + "step": 267180 + }, + { + "epoch": 2.362046712282749, + "grad_norm": 2.1623666286468506, + "learning_rate": 1.0632554795287518e-05, + "loss": 0.4398, + "step": 267190 + }, + { + "epoch": 2.362135115543061, + "grad_norm": 13.163881301879883, + "learning_rate": 1.0631081407615646e-05, + "loss": 0.519, + "step": 267200 + }, + { + "epoch": 2.3622235188033733, + "grad_norm": 2.6387622356414795, + "learning_rate": 1.0629608019943776e-05, + "loss": 0.5657, + "step": 267210 + }, + { + "epoch": 2.362311922063686, + "grad_norm": 2.4927306175231934, + "learning_rate": 1.0628134632271905e-05, + "loss": 0.4495, + "step": 267220 + }, + { + "epoch": 2.362400325323998, + "grad_norm": 1.3983500003814697, + "learning_rate": 1.0626661244600035e-05, + "loss": 0.5647, + "step": 267230 + }, + { + "epoch": 2.36248872858431, + "grad_norm": 1.6910669803619385, + "learning_rate": 1.0625187856928163e-05, + "loss": 0.4954, + "step": 267240 + }, + { + "epoch": 2.3625771318446223, + "grad_norm": 4.140649318695068, + "learning_rate": 1.0623714469256293e-05, + "loss": 0.4736, + "step": 267250 + }, + { + "epoch": 2.362665535104935, + "grad_norm": 2.310429096221924, + "learning_rate": 1.0622241081584421e-05, + "loss": 0.5278, + "step": 267260 + }, + { + "epoch": 2.362753938365247, + "grad_norm": 4.5334343910217285, + "learning_rate": 1.0620767693912551e-05, + "loss": 0.5576, + "step": 267270 + }, + { + "epoch": 2.362842341625559, + "grad_norm": 2.872365951538086, + "learning_rate": 1.0619294306240681e-05, + "loss": 0.5092, + "step": 267280 + }, + { + "epoch": 2.3629307448858716, + "grad_norm": 8.915101051330566, + "learning_rate": 1.061782091856881e-05, + "loss": 0.4047, + "step": 267290 + }, + { + "epoch": 2.3630191481461837, + "grad_norm": 6.720763683319092, + "learning_rate": 1.061634753089694e-05, + "loss": 0.3608, + "step": 267300 + }, + { + "epoch": 2.363107551406496, + "grad_norm": 6.567525863647461, + "learning_rate": 1.0614874143225068e-05, + "loss": 0.5234, + "step": 267310 + }, + { + "epoch": 2.363195954666808, + "grad_norm": 3.0626003742218018, + "learning_rate": 1.0613400755553198e-05, + "loss": 0.5356, + "step": 267320 + }, + { + "epoch": 2.36328435792712, + "grad_norm": 4.544857501983643, + "learning_rate": 1.0611927367881328e-05, + "loss": 0.4588, + "step": 267330 + }, + { + "epoch": 2.3633727611874327, + "grad_norm": 4.862277507781982, + "learning_rate": 1.0610453980209457e-05, + "loss": 0.4275, + "step": 267340 + }, + { + "epoch": 2.363461164447745, + "grad_norm": 1.4431540966033936, + "learning_rate": 1.0608980592537587e-05, + "loss": 0.5922, + "step": 267350 + }, + { + "epoch": 2.363549567708057, + "grad_norm": 6.352346897125244, + "learning_rate": 1.0607507204865717e-05, + "loss": 0.5059, + "step": 267360 + }, + { + "epoch": 2.3636379709683695, + "grad_norm": 4.442579746246338, + "learning_rate": 1.0606033817193845e-05, + "loss": 0.6529, + "step": 267370 + }, + { + "epoch": 2.3637263742286816, + "grad_norm": 1.796593427658081, + "learning_rate": 1.0604560429521975e-05, + "loss": 0.5657, + "step": 267380 + }, + { + "epoch": 2.3638147774889937, + "grad_norm": 3.2986831665039062, + "learning_rate": 1.0603087041850105e-05, + "loss": 0.4919, + "step": 267390 + }, + { + "epoch": 2.363903180749306, + "grad_norm": 4.67545223236084, + "learning_rate": 1.0601613654178234e-05, + "loss": 0.59, + "step": 267400 + }, + { + "epoch": 2.3639915840096184, + "grad_norm": 5.602180480957031, + "learning_rate": 1.0600140266506364e-05, + "loss": 0.6031, + "step": 267410 + }, + { + "epoch": 2.3640799872699305, + "grad_norm": 2.452129602432251, + "learning_rate": 1.0598666878834492e-05, + "loss": 0.5287, + "step": 267420 + }, + { + "epoch": 2.3641683905302426, + "grad_norm": 3.4835939407348633, + "learning_rate": 1.0597193491162622e-05, + "loss": 0.4868, + "step": 267430 + }, + { + "epoch": 2.364256793790555, + "grad_norm": 6.531728744506836, + "learning_rate": 1.059572010349075e-05, + "loss": 0.616, + "step": 267440 + }, + { + "epoch": 2.3643451970508673, + "grad_norm": 7.4394683837890625, + "learning_rate": 1.059424671581888e-05, + "loss": 0.5807, + "step": 267450 + }, + { + "epoch": 2.3644336003111794, + "grad_norm": 2.75474214553833, + "learning_rate": 1.059277332814701e-05, + "loss": 0.459, + "step": 267460 + }, + { + "epoch": 2.3645220035714916, + "grad_norm": 4.104860782623291, + "learning_rate": 1.0591299940475139e-05, + "loss": 0.4858, + "step": 267470 + }, + { + "epoch": 2.364610406831804, + "grad_norm": 5.923753261566162, + "learning_rate": 1.0589826552803269e-05, + "loss": 0.526, + "step": 267480 + }, + { + "epoch": 2.3646988100921162, + "grad_norm": 9.74465560913086, + "learning_rate": 1.0588353165131397e-05, + "loss": 0.4984, + "step": 267490 + }, + { + "epoch": 2.3647872133524284, + "grad_norm": 11.351550102233887, + "learning_rate": 1.0586879777459527e-05, + "loss": 0.4572, + "step": 267500 + }, + { + "epoch": 2.364875616612741, + "grad_norm": 1.6184992790222168, + "learning_rate": 1.0585406389787656e-05, + "loss": 0.5544, + "step": 267510 + }, + { + "epoch": 2.364964019873053, + "grad_norm": 1.3692867755889893, + "learning_rate": 1.0583933002115786e-05, + "loss": 0.4592, + "step": 267520 + }, + { + "epoch": 2.365052423133365, + "grad_norm": 13.093110084533691, + "learning_rate": 1.0582459614443914e-05, + "loss": 0.5029, + "step": 267530 + }, + { + "epoch": 2.3651408263936773, + "grad_norm": 10.24616813659668, + "learning_rate": 1.0580986226772044e-05, + "loss": 0.7099, + "step": 267540 + }, + { + "epoch": 2.3652292296539894, + "grad_norm": 7.093361854553223, + "learning_rate": 1.0579512839100174e-05, + "loss": 0.5586, + "step": 267550 + }, + { + "epoch": 2.365317632914302, + "grad_norm": 2.9705684185028076, + "learning_rate": 1.0578039451428302e-05, + "loss": 0.5273, + "step": 267560 + }, + { + "epoch": 2.365406036174614, + "grad_norm": 1.2570576667785645, + "learning_rate": 1.0576566063756433e-05, + "loss": 0.4254, + "step": 267570 + }, + { + "epoch": 2.365494439434926, + "grad_norm": 1.2267999649047852, + "learning_rate": 1.0575092676084561e-05, + "loss": 0.5901, + "step": 267580 + }, + { + "epoch": 2.3655828426952388, + "grad_norm": 1.03463876247406, + "learning_rate": 1.0573619288412691e-05, + "loss": 0.4605, + "step": 267590 + }, + { + "epoch": 2.365671245955551, + "grad_norm": 3.2759487628936768, + "learning_rate": 1.057214590074082e-05, + "loss": 0.5806, + "step": 267600 + }, + { + "epoch": 2.365759649215863, + "grad_norm": 1.9477211236953735, + "learning_rate": 1.057067251306895e-05, + "loss": 0.5385, + "step": 267610 + }, + { + "epoch": 2.365848052476175, + "grad_norm": 8.940404891967773, + "learning_rate": 1.0569199125397078e-05, + "loss": 0.4508, + "step": 267620 + }, + { + "epoch": 2.3659364557364877, + "grad_norm": 3.807974338531494, + "learning_rate": 1.0567725737725208e-05, + "loss": 0.5554, + "step": 267630 + }, + { + "epoch": 2.3660248589968, + "grad_norm": 1.588126301765442, + "learning_rate": 1.0566252350053338e-05, + "loss": 0.4918, + "step": 267640 + }, + { + "epoch": 2.366113262257112, + "grad_norm": 1.1419310569763184, + "learning_rate": 1.0564778962381466e-05, + "loss": 0.5995, + "step": 267650 + }, + { + "epoch": 2.3662016655174245, + "grad_norm": 12.654397010803223, + "learning_rate": 1.0563305574709596e-05, + "loss": 0.4816, + "step": 267660 + }, + { + "epoch": 2.3662900687777366, + "grad_norm": 1.5989019870758057, + "learning_rate": 1.0561832187037725e-05, + "loss": 0.5864, + "step": 267670 + }, + { + "epoch": 2.3663784720380487, + "grad_norm": 2.1467111110687256, + "learning_rate": 1.0560358799365855e-05, + "loss": 0.5795, + "step": 267680 + }, + { + "epoch": 2.366466875298361, + "grad_norm": 2.9139997959136963, + "learning_rate": 1.0558885411693983e-05, + "loss": 0.5308, + "step": 267690 + }, + { + "epoch": 2.3665552785586734, + "grad_norm": 6.5293192863464355, + "learning_rate": 1.0557412024022113e-05, + "loss": 0.4683, + "step": 267700 + }, + { + "epoch": 2.3666436818189855, + "grad_norm": 1.9134337902069092, + "learning_rate": 1.0555938636350241e-05, + "loss": 0.4554, + "step": 267710 + }, + { + "epoch": 2.3667320850792977, + "grad_norm": 10.486961364746094, + "learning_rate": 1.0554465248678371e-05, + "loss": 0.4352, + "step": 267720 + }, + { + "epoch": 2.36682048833961, + "grad_norm": 2.3389949798583984, + "learning_rate": 1.0552991861006501e-05, + "loss": 0.5183, + "step": 267730 + }, + { + "epoch": 2.3669088915999223, + "grad_norm": 1.7407852411270142, + "learning_rate": 1.055151847333463e-05, + "loss": 0.5376, + "step": 267740 + }, + { + "epoch": 2.3669972948602345, + "grad_norm": 3.7924721240997314, + "learning_rate": 1.055004508566276e-05, + "loss": 0.5136, + "step": 267750 + }, + { + "epoch": 2.3670856981205466, + "grad_norm": 2.5379927158355713, + "learning_rate": 1.0548571697990888e-05, + "loss": 0.5782, + "step": 267760 + }, + { + "epoch": 2.3671741013808587, + "grad_norm": 2.0480353832244873, + "learning_rate": 1.0547098310319018e-05, + "loss": 0.6539, + "step": 267770 + }, + { + "epoch": 2.3672625046411713, + "grad_norm": 2.2821316719055176, + "learning_rate": 1.0545624922647147e-05, + "loss": 0.5382, + "step": 267780 + }, + { + "epoch": 2.3673509079014834, + "grad_norm": 3.1826725006103516, + "learning_rate": 1.0544151534975277e-05, + "loss": 0.4447, + "step": 267790 + }, + { + "epoch": 2.3674393111617955, + "grad_norm": 10.553441047668457, + "learning_rate": 1.0542678147303407e-05, + "loss": 0.662, + "step": 267800 + }, + { + "epoch": 2.367527714422108, + "grad_norm": 3.786693572998047, + "learning_rate": 1.0541204759631535e-05, + "loss": 0.5905, + "step": 267810 + }, + { + "epoch": 2.36761611768242, + "grad_norm": 2.6224558353424072, + "learning_rate": 1.0539731371959665e-05, + "loss": 0.4744, + "step": 267820 + }, + { + "epoch": 2.3677045209427323, + "grad_norm": 3.645904064178467, + "learning_rate": 1.0538257984287795e-05, + "loss": 0.596, + "step": 267830 + }, + { + "epoch": 2.3677929242030444, + "grad_norm": 2.4209532737731934, + "learning_rate": 1.0536784596615923e-05, + "loss": 0.5206, + "step": 267840 + }, + { + "epoch": 2.367881327463357, + "grad_norm": 1.5487247705459595, + "learning_rate": 1.0535311208944054e-05, + "loss": 0.622, + "step": 267850 + }, + { + "epoch": 2.367969730723669, + "grad_norm": 5.018542766571045, + "learning_rate": 1.0533837821272184e-05, + "loss": 0.5179, + "step": 267860 + }, + { + "epoch": 2.3680581339839812, + "grad_norm": 13.594481468200684, + "learning_rate": 1.0532364433600312e-05, + "loss": 0.4467, + "step": 267870 + }, + { + "epoch": 2.368146537244294, + "grad_norm": 6.387732028961182, + "learning_rate": 1.0530891045928442e-05, + "loss": 0.4926, + "step": 267880 + }, + { + "epoch": 2.368234940504606, + "grad_norm": 3.698925256729126, + "learning_rate": 1.052941765825657e-05, + "loss": 0.5482, + "step": 267890 + }, + { + "epoch": 2.368323343764918, + "grad_norm": 0.7485411167144775, + "learning_rate": 1.05279442705847e-05, + "loss": 0.495, + "step": 267900 + }, + { + "epoch": 2.36841174702523, + "grad_norm": 3.384700059890747, + "learning_rate": 1.0526470882912829e-05, + "loss": 0.6291, + "step": 267910 + }, + { + "epoch": 2.3685001502855423, + "grad_norm": 3.680511951446533, + "learning_rate": 1.0524997495240959e-05, + "loss": 0.4808, + "step": 267920 + }, + { + "epoch": 2.368588553545855, + "grad_norm": 2.650510787963867, + "learning_rate": 1.0523524107569089e-05, + "loss": 0.5302, + "step": 267930 + }, + { + "epoch": 2.368676956806167, + "grad_norm": 1.835325837135315, + "learning_rate": 1.0522050719897217e-05, + "loss": 0.4864, + "step": 267940 + }, + { + "epoch": 2.368765360066479, + "grad_norm": 2.3717846870422363, + "learning_rate": 1.0520577332225347e-05, + "loss": 0.5095, + "step": 267950 + }, + { + "epoch": 2.3688537633267917, + "grad_norm": 28.654253005981445, + "learning_rate": 1.0519103944553476e-05, + "loss": 0.5813, + "step": 267960 + }, + { + "epoch": 2.3689421665871038, + "grad_norm": 3.4435923099517822, + "learning_rate": 1.0517630556881606e-05, + "loss": 0.4689, + "step": 267970 + }, + { + "epoch": 2.369030569847416, + "grad_norm": 1.7633137702941895, + "learning_rate": 1.0516157169209734e-05, + "loss": 0.5685, + "step": 267980 + }, + { + "epoch": 2.369118973107728, + "grad_norm": 1.9393572807312012, + "learning_rate": 1.0514683781537864e-05, + "loss": 0.5453, + "step": 267990 + }, + { + "epoch": 2.3692073763680406, + "grad_norm": 7.056755065917969, + "learning_rate": 1.0513210393865992e-05, + "loss": 0.64, + "step": 268000 + }, + { + "epoch": 2.3692957796283527, + "grad_norm": 2.0740885734558105, + "learning_rate": 1.0511737006194122e-05, + "loss": 0.6033, + "step": 268010 + }, + { + "epoch": 2.369384182888665, + "grad_norm": 2.7878575325012207, + "learning_rate": 1.0510263618522252e-05, + "loss": 0.6089, + "step": 268020 + }, + { + "epoch": 2.3694725861489774, + "grad_norm": 1.8140082359313965, + "learning_rate": 1.0508790230850381e-05, + "loss": 0.5394, + "step": 268030 + }, + { + "epoch": 2.3695609894092895, + "grad_norm": 2.669248580932617, + "learning_rate": 1.0507316843178511e-05, + "loss": 0.6072, + "step": 268040 + }, + { + "epoch": 2.3696493926696016, + "grad_norm": 1.8801496028900146, + "learning_rate": 1.050584345550664e-05, + "loss": 0.5004, + "step": 268050 + }, + { + "epoch": 2.3697377959299137, + "grad_norm": 3.226942539215088, + "learning_rate": 1.050437006783477e-05, + "loss": 0.6262, + "step": 268060 + }, + { + "epoch": 2.3698261991902263, + "grad_norm": 2.5638487339019775, + "learning_rate": 1.0502896680162898e-05, + "loss": 0.4892, + "step": 268070 + }, + { + "epoch": 2.3699146024505384, + "grad_norm": 4.405429840087891, + "learning_rate": 1.0501423292491028e-05, + "loss": 0.5004, + "step": 268080 + }, + { + "epoch": 2.3700030057108505, + "grad_norm": 4.0503411293029785, + "learning_rate": 1.0499949904819156e-05, + "loss": 0.4315, + "step": 268090 + }, + { + "epoch": 2.370091408971163, + "grad_norm": 3.289562225341797, + "learning_rate": 1.0498476517147286e-05, + "loss": 0.4933, + "step": 268100 + }, + { + "epoch": 2.3701798122314752, + "grad_norm": 1.3532932996749878, + "learning_rate": 1.0497003129475416e-05, + "loss": 0.5806, + "step": 268110 + }, + { + "epoch": 2.3702682154917873, + "grad_norm": 4.798042297363281, + "learning_rate": 1.0495529741803545e-05, + "loss": 0.5337, + "step": 268120 + }, + { + "epoch": 2.3703566187520995, + "grad_norm": 4.104560852050781, + "learning_rate": 1.0494056354131675e-05, + "loss": 0.5214, + "step": 268130 + }, + { + "epoch": 2.3704450220124116, + "grad_norm": 5.147737979888916, + "learning_rate": 1.0492582966459803e-05, + "loss": 0.5225, + "step": 268140 + }, + { + "epoch": 2.370533425272724, + "grad_norm": 3.019810914993286, + "learning_rate": 1.0491109578787933e-05, + "loss": 0.5428, + "step": 268150 + }, + { + "epoch": 2.3706218285330363, + "grad_norm": 1.8444162607192993, + "learning_rate": 1.0489636191116061e-05, + "loss": 0.5738, + "step": 268160 + }, + { + "epoch": 2.3707102317933484, + "grad_norm": 5.808875560760498, + "learning_rate": 1.0488162803444191e-05, + "loss": 0.5726, + "step": 268170 + }, + { + "epoch": 2.370798635053661, + "grad_norm": 9.993462562561035, + "learning_rate": 1.048668941577232e-05, + "loss": 0.5116, + "step": 268180 + }, + { + "epoch": 2.370887038313973, + "grad_norm": 5.8873796463012695, + "learning_rate": 1.048521602810045e-05, + "loss": 0.5613, + "step": 268190 + }, + { + "epoch": 2.370975441574285, + "grad_norm": 1.3064581155776978, + "learning_rate": 1.048374264042858e-05, + "loss": 0.4036, + "step": 268200 + }, + { + "epoch": 2.3710638448345973, + "grad_norm": 3.13145112991333, + "learning_rate": 1.0482269252756708e-05, + "loss": 0.5118, + "step": 268210 + }, + { + "epoch": 2.37115224809491, + "grad_norm": 18.852420806884766, + "learning_rate": 1.0480795865084838e-05, + "loss": 0.5842, + "step": 268220 + }, + { + "epoch": 2.371240651355222, + "grad_norm": 2.4069325923919678, + "learning_rate": 1.0479322477412967e-05, + "loss": 0.5473, + "step": 268230 + }, + { + "epoch": 2.371329054615534, + "grad_norm": 3.210300922393799, + "learning_rate": 1.0477849089741097e-05, + "loss": 0.52, + "step": 268240 + }, + { + "epoch": 2.3714174578758467, + "grad_norm": 3.5760252475738525, + "learning_rate": 1.0476375702069225e-05, + "loss": 0.499, + "step": 268250 + }, + { + "epoch": 2.371505861136159, + "grad_norm": 2.8062496185302734, + "learning_rate": 1.0474902314397355e-05, + "loss": 0.4112, + "step": 268260 + }, + { + "epoch": 2.371594264396471, + "grad_norm": 1.0548356771469116, + "learning_rate": 1.0473428926725485e-05, + "loss": 0.5772, + "step": 268270 + }, + { + "epoch": 2.371682667656783, + "grad_norm": 2.648998260498047, + "learning_rate": 1.0471955539053613e-05, + "loss": 0.4279, + "step": 268280 + }, + { + "epoch": 2.3717710709170956, + "grad_norm": 6.546137809753418, + "learning_rate": 1.0470482151381743e-05, + "loss": 0.6649, + "step": 268290 + }, + { + "epoch": 2.3718594741774077, + "grad_norm": 5.6072492599487305, + "learning_rate": 1.0469008763709874e-05, + "loss": 0.6018, + "step": 268300 + }, + { + "epoch": 2.37194787743772, + "grad_norm": 1.9877279996871948, + "learning_rate": 1.0467535376038002e-05, + "loss": 0.5149, + "step": 268310 + }, + { + "epoch": 2.372036280698032, + "grad_norm": 2.0390663146972656, + "learning_rate": 1.0466061988366132e-05, + "loss": 0.5832, + "step": 268320 + }, + { + "epoch": 2.3721246839583445, + "grad_norm": 1.5471065044403076, + "learning_rate": 1.0464588600694262e-05, + "loss": 0.3425, + "step": 268330 + }, + { + "epoch": 2.3722130872186566, + "grad_norm": 7.606821537017822, + "learning_rate": 1.046311521302239e-05, + "loss": 0.654, + "step": 268340 + }, + { + "epoch": 2.3723014904789688, + "grad_norm": 7.5691046714782715, + "learning_rate": 1.046164182535052e-05, + "loss": 0.5359, + "step": 268350 + }, + { + "epoch": 2.372389893739281, + "grad_norm": 2.7267143726348877, + "learning_rate": 1.0460168437678649e-05, + "loss": 0.555, + "step": 268360 + }, + { + "epoch": 2.3724782969995935, + "grad_norm": 5.629044055938721, + "learning_rate": 1.0458695050006779e-05, + "loss": 0.5156, + "step": 268370 + }, + { + "epoch": 2.3725667002599056, + "grad_norm": 3.6360065937042236, + "learning_rate": 1.0457221662334907e-05, + "loss": 0.5766, + "step": 268380 + }, + { + "epoch": 2.3726551035202177, + "grad_norm": 6.314613342285156, + "learning_rate": 1.0455748274663037e-05, + "loss": 0.5888, + "step": 268390 + }, + { + "epoch": 2.3727435067805303, + "grad_norm": 3.71598744392395, + "learning_rate": 1.0454274886991167e-05, + "loss": 0.5982, + "step": 268400 + }, + { + "epoch": 2.3728319100408424, + "grad_norm": 2.0751256942749023, + "learning_rate": 1.0452801499319296e-05, + "loss": 0.4865, + "step": 268410 + }, + { + "epoch": 2.3729203133011545, + "grad_norm": 1.5248242616653442, + "learning_rate": 1.0451328111647426e-05, + "loss": 0.5304, + "step": 268420 + }, + { + "epoch": 2.3730087165614666, + "grad_norm": 7.278916835784912, + "learning_rate": 1.0449854723975554e-05, + "loss": 0.5311, + "step": 268430 + }, + { + "epoch": 2.373097119821779, + "grad_norm": 4.816013813018799, + "learning_rate": 1.0448381336303684e-05, + "loss": 0.5048, + "step": 268440 + }, + { + "epoch": 2.3731855230820913, + "grad_norm": 5.68934965133667, + "learning_rate": 1.0446907948631812e-05, + "loss": 0.4247, + "step": 268450 + }, + { + "epoch": 2.3732739263424034, + "grad_norm": 4.859226226806641, + "learning_rate": 1.0445434560959942e-05, + "loss": 0.5949, + "step": 268460 + }, + { + "epoch": 2.373362329602716, + "grad_norm": 3.2475335597991943, + "learning_rate": 1.044396117328807e-05, + "loss": 0.4915, + "step": 268470 + }, + { + "epoch": 2.373450732863028, + "grad_norm": 1.0875184535980225, + "learning_rate": 1.04424877856162e-05, + "loss": 0.4571, + "step": 268480 + }, + { + "epoch": 2.37353913612334, + "grad_norm": 2.8910584449768066, + "learning_rate": 1.0441014397944331e-05, + "loss": 0.6016, + "step": 268490 + }, + { + "epoch": 2.3736275393836523, + "grad_norm": 3.2240102291107178, + "learning_rate": 1.043954101027246e-05, + "loss": 0.5609, + "step": 268500 + }, + { + "epoch": 2.3737159426439645, + "grad_norm": 4.96482515335083, + "learning_rate": 1.043806762260059e-05, + "loss": 0.6755, + "step": 268510 + }, + { + "epoch": 2.373804345904277, + "grad_norm": 7.482293128967285, + "learning_rate": 1.0436594234928718e-05, + "loss": 0.6835, + "step": 268520 + }, + { + "epoch": 2.373892749164589, + "grad_norm": 2.5998477935791016, + "learning_rate": 1.0435120847256848e-05, + "loss": 0.4799, + "step": 268530 + }, + { + "epoch": 2.3739811524249013, + "grad_norm": 2.821077585220337, + "learning_rate": 1.0433647459584976e-05, + "loss": 0.6241, + "step": 268540 + }, + { + "epoch": 2.374069555685214, + "grad_norm": 1.1901018619537354, + "learning_rate": 1.0432174071913106e-05, + "loss": 0.4843, + "step": 268550 + }, + { + "epoch": 2.374157958945526, + "grad_norm": 8.085489273071289, + "learning_rate": 1.0430700684241234e-05, + "loss": 0.4592, + "step": 268560 + }, + { + "epoch": 2.374246362205838, + "grad_norm": 2.1854147911071777, + "learning_rate": 1.0429227296569364e-05, + "loss": 0.5139, + "step": 268570 + }, + { + "epoch": 2.37433476546615, + "grad_norm": 1.9591411352157593, + "learning_rate": 1.0427753908897495e-05, + "loss": 0.5406, + "step": 268580 + }, + { + "epoch": 2.3744231687264628, + "grad_norm": 2.6547787189483643, + "learning_rate": 1.0426280521225623e-05, + "loss": 0.4934, + "step": 268590 + }, + { + "epoch": 2.374511571986775, + "grad_norm": 11.639860153198242, + "learning_rate": 1.0424807133553753e-05, + "loss": 0.617, + "step": 268600 + }, + { + "epoch": 2.374599975247087, + "grad_norm": 9.088811874389648, + "learning_rate": 1.0423333745881881e-05, + "loss": 0.4564, + "step": 268610 + }, + { + "epoch": 2.3746883785073996, + "grad_norm": 6.50138521194458, + "learning_rate": 1.0421860358210011e-05, + "loss": 0.6671, + "step": 268620 + }, + { + "epoch": 2.3747767817677117, + "grad_norm": 3.110938549041748, + "learning_rate": 1.042038697053814e-05, + "loss": 0.5738, + "step": 268630 + }, + { + "epoch": 2.374865185028024, + "grad_norm": 2.389756202697754, + "learning_rate": 1.041891358286627e-05, + "loss": 0.5857, + "step": 268640 + }, + { + "epoch": 2.374953588288336, + "grad_norm": 12.576469421386719, + "learning_rate": 1.0417440195194398e-05, + "loss": 0.5261, + "step": 268650 + }, + { + "epoch": 2.3750419915486485, + "grad_norm": 37.55697250366211, + "learning_rate": 1.0415966807522528e-05, + "loss": 0.6739, + "step": 268660 + }, + { + "epoch": 2.3751303948089606, + "grad_norm": 2.8087682723999023, + "learning_rate": 1.0414493419850658e-05, + "loss": 0.5126, + "step": 268670 + }, + { + "epoch": 2.3752187980692727, + "grad_norm": 3.7757980823516846, + "learning_rate": 1.0413020032178787e-05, + "loss": 0.6672, + "step": 268680 + }, + { + "epoch": 2.3753072013295853, + "grad_norm": 2.1079413890838623, + "learning_rate": 1.0411546644506917e-05, + "loss": 0.5072, + "step": 268690 + }, + { + "epoch": 2.3753956045898974, + "grad_norm": 1.7493926286697388, + "learning_rate": 1.0410073256835045e-05, + "loss": 0.4675, + "step": 268700 + }, + { + "epoch": 2.3754840078502095, + "grad_norm": 2.48154354095459, + "learning_rate": 1.0408599869163175e-05, + "loss": 0.5932, + "step": 268710 + }, + { + "epoch": 2.3755724111105216, + "grad_norm": 1.0905494689941406, + "learning_rate": 1.0407126481491303e-05, + "loss": 0.5434, + "step": 268720 + }, + { + "epoch": 2.3756608143708338, + "grad_norm": 3.0281429290771484, + "learning_rate": 1.0405653093819433e-05, + "loss": 0.5962, + "step": 268730 + }, + { + "epoch": 2.3757492176311463, + "grad_norm": 3.3174421787261963, + "learning_rate": 1.0404179706147563e-05, + "loss": 0.512, + "step": 268740 + }, + { + "epoch": 2.3758376208914584, + "grad_norm": 3.618191719055176, + "learning_rate": 1.0402706318475692e-05, + "loss": 0.4622, + "step": 268750 + }, + { + "epoch": 2.3759260241517706, + "grad_norm": 7.096739768981934, + "learning_rate": 1.0401232930803822e-05, + "loss": 0.5002, + "step": 268760 + }, + { + "epoch": 2.376014427412083, + "grad_norm": 5.997368335723877, + "learning_rate": 1.0399759543131952e-05, + "loss": 0.487, + "step": 268770 + }, + { + "epoch": 2.3761028306723952, + "grad_norm": 3.4730618000030518, + "learning_rate": 1.039828615546008e-05, + "loss": 0.4689, + "step": 268780 + }, + { + "epoch": 2.3761912339327074, + "grad_norm": 12.206145286560059, + "learning_rate": 1.039681276778821e-05, + "loss": 0.4876, + "step": 268790 + }, + { + "epoch": 2.3762796371930195, + "grad_norm": 2.867727756500244, + "learning_rate": 1.039533938011634e-05, + "loss": 0.5653, + "step": 268800 + }, + { + "epoch": 2.376368040453332, + "grad_norm": 3.940499782562256, + "learning_rate": 1.0393865992444469e-05, + "loss": 0.4325, + "step": 268810 + }, + { + "epoch": 2.376456443713644, + "grad_norm": 1.125211477279663, + "learning_rate": 1.0392392604772599e-05, + "loss": 0.4187, + "step": 268820 + }, + { + "epoch": 2.3765448469739563, + "grad_norm": 1.4447335004806519, + "learning_rate": 1.0390919217100727e-05, + "loss": 0.4804, + "step": 268830 + }, + { + "epoch": 2.376633250234269, + "grad_norm": 5.612671852111816, + "learning_rate": 1.0389445829428857e-05, + "loss": 0.5829, + "step": 268840 + }, + { + "epoch": 2.376721653494581, + "grad_norm": 3.165249824523926, + "learning_rate": 1.0387972441756985e-05, + "loss": 0.623, + "step": 268850 + }, + { + "epoch": 2.376810056754893, + "grad_norm": 0.7272610068321228, + "learning_rate": 1.0386499054085116e-05, + "loss": 0.4412, + "step": 268860 + }, + { + "epoch": 2.376898460015205, + "grad_norm": 5.067653179168701, + "learning_rate": 1.0385025666413246e-05, + "loss": 0.4634, + "step": 268870 + }, + { + "epoch": 2.376986863275518, + "grad_norm": 6.481470584869385, + "learning_rate": 1.0383552278741374e-05, + "loss": 0.6017, + "step": 268880 + }, + { + "epoch": 2.37707526653583, + "grad_norm": 3.2562546730041504, + "learning_rate": 1.0382078891069504e-05, + "loss": 0.4263, + "step": 268890 + }, + { + "epoch": 2.377163669796142, + "grad_norm": 3.8159937858581543, + "learning_rate": 1.0380605503397632e-05, + "loss": 0.5352, + "step": 268900 + }, + { + "epoch": 2.377252073056454, + "grad_norm": 1.803598165512085, + "learning_rate": 1.0379132115725762e-05, + "loss": 0.5478, + "step": 268910 + }, + { + "epoch": 2.3773404763167667, + "grad_norm": 1.0188977718353271, + "learning_rate": 1.037765872805389e-05, + "loss": 0.4548, + "step": 268920 + }, + { + "epoch": 2.377428879577079, + "grad_norm": 2.2794251441955566, + "learning_rate": 1.037618534038202e-05, + "loss": 0.6169, + "step": 268930 + }, + { + "epoch": 2.377517282837391, + "grad_norm": 14.106587409973145, + "learning_rate": 1.0374711952710149e-05, + "loss": 0.5339, + "step": 268940 + }, + { + "epoch": 2.377605686097703, + "grad_norm": 8.33643913269043, + "learning_rate": 1.037323856503828e-05, + "loss": 0.5997, + "step": 268950 + }, + { + "epoch": 2.3776940893580156, + "grad_norm": 11.899035453796387, + "learning_rate": 1.037176517736641e-05, + "loss": 0.5471, + "step": 268960 + }, + { + "epoch": 2.3777824926183277, + "grad_norm": 9.357743263244629, + "learning_rate": 1.0370291789694538e-05, + "loss": 0.5575, + "step": 268970 + }, + { + "epoch": 2.37787089587864, + "grad_norm": 26.253660202026367, + "learning_rate": 1.0368818402022668e-05, + "loss": 0.542, + "step": 268980 + }, + { + "epoch": 2.3779592991389524, + "grad_norm": 7.131432056427002, + "learning_rate": 1.0367345014350796e-05, + "loss": 0.5044, + "step": 268990 + }, + { + "epoch": 2.3780477023992646, + "grad_norm": 1.9082821607589722, + "learning_rate": 1.0365871626678926e-05, + "loss": 0.6181, + "step": 269000 + }, + { + "epoch": 2.3781361056595767, + "grad_norm": 2.4148592948913574, + "learning_rate": 1.0364398239007054e-05, + "loss": 0.5583, + "step": 269010 + }, + { + "epoch": 2.378224508919889, + "grad_norm": 2.201098680496216, + "learning_rate": 1.0362924851335184e-05, + "loss": 0.414, + "step": 269020 + }, + { + "epoch": 2.3783129121802014, + "grad_norm": 0.797221839427948, + "learning_rate": 1.0361451463663313e-05, + "loss": 0.5224, + "step": 269030 + }, + { + "epoch": 2.3784013154405135, + "grad_norm": 2.4749295711517334, + "learning_rate": 1.0359978075991443e-05, + "loss": 0.4855, + "step": 269040 + }, + { + "epoch": 2.3784897187008256, + "grad_norm": 1.4483765363693237, + "learning_rate": 1.0358504688319573e-05, + "loss": 0.5725, + "step": 269050 + }, + { + "epoch": 2.378578121961138, + "grad_norm": 6.001196384429932, + "learning_rate": 1.0357031300647701e-05, + "loss": 0.567, + "step": 269060 + }, + { + "epoch": 2.3786665252214503, + "grad_norm": 9.875116348266602, + "learning_rate": 1.0355557912975831e-05, + "loss": 0.4934, + "step": 269070 + }, + { + "epoch": 2.3787549284817624, + "grad_norm": 3.6754891872406006, + "learning_rate": 1.035408452530396e-05, + "loss": 0.5081, + "step": 269080 + }, + { + "epoch": 2.3788433317420745, + "grad_norm": 1.4550973176956177, + "learning_rate": 1.035261113763209e-05, + "loss": 0.6278, + "step": 269090 + }, + { + "epoch": 2.378931735002387, + "grad_norm": 1.4718693494796753, + "learning_rate": 1.0351137749960218e-05, + "loss": 0.5084, + "step": 269100 + }, + { + "epoch": 2.379020138262699, + "grad_norm": 1.7124398946762085, + "learning_rate": 1.0349664362288348e-05, + "loss": 0.5763, + "step": 269110 + }, + { + "epoch": 2.3791085415230113, + "grad_norm": 4.643787860870361, + "learning_rate": 1.0348190974616476e-05, + "loss": 0.5804, + "step": 269120 + }, + { + "epoch": 2.3791969447833234, + "grad_norm": 4.960026264190674, + "learning_rate": 1.0346717586944607e-05, + "loss": 0.4955, + "step": 269130 + }, + { + "epoch": 2.379285348043636, + "grad_norm": 3.580540657043457, + "learning_rate": 1.0345244199272737e-05, + "loss": 0.5223, + "step": 269140 + }, + { + "epoch": 2.379373751303948, + "grad_norm": 0.9013394713401794, + "learning_rate": 1.0343770811600865e-05, + "loss": 0.4302, + "step": 269150 + }, + { + "epoch": 2.3794621545642602, + "grad_norm": 3.0956506729125977, + "learning_rate": 1.0342297423928995e-05, + "loss": 0.5821, + "step": 269160 + }, + { + "epoch": 2.3795505578245724, + "grad_norm": 2.9031383991241455, + "learning_rate": 1.0340824036257123e-05, + "loss": 0.5015, + "step": 269170 + }, + { + "epoch": 2.379638961084885, + "grad_norm": 2.5241217613220215, + "learning_rate": 1.0339350648585253e-05, + "loss": 0.5259, + "step": 269180 + }, + { + "epoch": 2.379727364345197, + "grad_norm": 3.2159502506256104, + "learning_rate": 1.0337877260913382e-05, + "loss": 0.5428, + "step": 269190 + }, + { + "epoch": 2.379815767605509, + "grad_norm": 2.6210579872131348, + "learning_rate": 1.0336403873241512e-05, + "loss": 0.5484, + "step": 269200 + }, + { + "epoch": 2.3799041708658217, + "grad_norm": 2.6794769763946533, + "learning_rate": 1.0334930485569642e-05, + "loss": 0.5999, + "step": 269210 + }, + { + "epoch": 2.379992574126134, + "grad_norm": 2.628788471221924, + "learning_rate": 1.033345709789777e-05, + "loss": 0.4292, + "step": 269220 + }, + { + "epoch": 2.380080977386446, + "grad_norm": 2.3190953731536865, + "learning_rate": 1.03319837102259e-05, + "loss": 0.4094, + "step": 269230 + }, + { + "epoch": 2.380169380646758, + "grad_norm": 2.883040428161621, + "learning_rate": 1.033051032255403e-05, + "loss": 0.5369, + "step": 269240 + }, + { + "epoch": 2.3802577839070707, + "grad_norm": 4.569016456604004, + "learning_rate": 1.0329036934882159e-05, + "loss": 0.5833, + "step": 269250 + }, + { + "epoch": 2.3803461871673828, + "grad_norm": 6.294579982757568, + "learning_rate": 1.0327563547210289e-05, + "loss": 0.5189, + "step": 269260 + }, + { + "epoch": 2.380434590427695, + "grad_norm": 3.220876455307007, + "learning_rate": 1.0326090159538419e-05, + "loss": 0.5064, + "step": 269270 + }, + { + "epoch": 2.3805229936880075, + "grad_norm": 2.863380193710327, + "learning_rate": 1.0324616771866547e-05, + "loss": 0.5699, + "step": 269280 + }, + { + "epoch": 2.3806113969483196, + "grad_norm": 3.5879478454589844, + "learning_rate": 1.0323143384194677e-05, + "loss": 0.4511, + "step": 269290 + }, + { + "epoch": 2.3806998002086317, + "grad_norm": 2.7566163539886475, + "learning_rate": 1.0321669996522805e-05, + "loss": 0.6107, + "step": 269300 + }, + { + "epoch": 2.380788203468944, + "grad_norm": 4.304871082305908, + "learning_rate": 1.0320196608850936e-05, + "loss": 0.5618, + "step": 269310 + }, + { + "epoch": 2.380876606729256, + "grad_norm": 9.95263957977295, + "learning_rate": 1.0318723221179066e-05, + "loss": 0.394, + "step": 269320 + }, + { + "epoch": 2.3809650099895685, + "grad_norm": 1.1057329177856445, + "learning_rate": 1.0317249833507194e-05, + "loss": 0.4618, + "step": 269330 + }, + { + "epoch": 2.3810534132498806, + "grad_norm": 4.528639793395996, + "learning_rate": 1.0315776445835324e-05, + "loss": 0.4047, + "step": 269340 + }, + { + "epoch": 2.3811418165101927, + "grad_norm": 1.8387010097503662, + "learning_rate": 1.0314303058163452e-05, + "loss": 0.3985, + "step": 269350 + }, + { + "epoch": 2.3812302197705053, + "grad_norm": 5.313090801239014, + "learning_rate": 1.0312829670491582e-05, + "loss": 0.5668, + "step": 269360 + }, + { + "epoch": 2.3813186230308174, + "grad_norm": 1.9584068059921265, + "learning_rate": 1.031135628281971e-05, + "loss": 0.5543, + "step": 269370 + }, + { + "epoch": 2.3814070262911295, + "grad_norm": 2.9929375648498535, + "learning_rate": 1.030988289514784e-05, + "loss": 0.499, + "step": 269380 + }, + { + "epoch": 2.3814954295514417, + "grad_norm": 2.0636188983917236, + "learning_rate": 1.0308409507475969e-05, + "loss": 0.52, + "step": 269390 + }, + { + "epoch": 2.3815838328117542, + "grad_norm": 2.896277666091919, + "learning_rate": 1.03069361198041e-05, + "loss": 0.5496, + "step": 269400 + }, + { + "epoch": 2.3816722360720664, + "grad_norm": 5.849462509155273, + "learning_rate": 1.0305462732132228e-05, + "loss": 0.6022, + "step": 269410 + }, + { + "epoch": 2.3817606393323785, + "grad_norm": 2.7253360748291016, + "learning_rate": 1.0303989344460358e-05, + "loss": 0.4252, + "step": 269420 + }, + { + "epoch": 2.381849042592691, + "grad_norm": 6.911833763122559, + "learning_rate": 1.0302515956788488e-05, + "loss": 0.5579, + "step": 269430 + }, + { + "epoch": 2.381937445853003, + "grad_norm": 9.368270874023438, + "learning_rate": 1.0301042569116616e-05, + "loss": 0.5654, + "step": 269440 + }, + { + "epoch": 2.3820258491133153, + "grad_norm": 2.4520423412323, + "learning_rate": 1.0299569181444746e-05, + "loss": 0.4792, + "step": 269450 + }, + { + "epoch": 2.3821142523736274, + "grad_norm": 5.691624641418457, + "learning_rate": 1.0298095793772874e-05, + "loss": 0.485, + "step": 269460 + }, + { + "epoch": 2.38220265563394, + "grad_norm": 3.3446576595306396, + "learning_rate": 1.0296622406101004e-05, + "loss": 0.4331, + "step": 269470 + }, + { + "epoch": 2.382291058894252, + "grad_norm": 3.3430044651031494, + "learning_rate": 1.0295149018429133e-05, + "loss": 0.5821, + "step": 269480 + }, + { + "epoch": 2.382379462154564, + "grad_norm": 1.9228525161743164, + "learning_rate": 1.0293675630757263e-05, + "loss": 0.6158, + "step": 269490 + }, + { + "epoch": 2.3824678654148763, + "grad_norm": 2.7172353267669678, + "learning_rate": 1.0292202243085391e-05, + "loss": 0.5404, + "step": 269500 + }, + { + "epoch": 2.382556268675189, + "grad_norm": 9.817855834960938, + "learning_rate": 1.0290728855413521e-05, + "loss": 0.5409, + "step": 269510 + }, + { + "epoch": 2.382644671935501, + "grad_norm": 1.7205349206924438, + "learning_rate": 1.0289255467741651e-05, + "loss": 0.4087, + "step": 269520 + }, + { + "epoch": 2.382733075195813, + "grad_norm": 4.105278491973877, + "learning_rate": 1.028778208006978e-05, + "loss": 0.4383, + "step": 269530 + }, + { + "epoch": 2.3828214784561252, + "grad_norm": 2.3368594646453857, + "learning_rate": 1.028630869239791e-05, + "loss": 0.6045, + "step": 269540 + }, + { + "epoch": 2.382909881716438, + "grad_norm": 5.582050323486328, + "learning_rate": 1.0284835304726038e-05, + "loss": 0.5981, + "step": 269550 + }, + { + "epoch": 2.38299828497675, + "grad_norm": 3.623868227005005, + "learning_rate": 1.0283361917054168e-05, + "loss": 0.5961, + "step": 269560 + }, + { + "epoch": 2.383086688237062, + "grad_norm": 3.158557415008545, + "learning_rate": 1.0281888529382296e-05, + "loss": 0.6004, + "step": 269570 + }, + { + "epoch": 2.3831750914973746, + "grad_norm": 2.41996431350708, + "learning_rate": 1.0280415141710426e-05, + "loss": 0.5945, + "step": 269580 + }, + { + "epoch": 2.3832634947576867, + "grad_norm": 1.2165207862854004, + "learning_rate": 1.0278941754038555e-05, + "loss": 0.5514, + "step": 269590 + }, + { + "epoch": 2.383351898017999, + "grad_norm": 2.8475911617279053, + "learning_rate": 1.0277468366366685e-05, + "loss": 0.5422, + "step": 269600 + }, + { + "epoch": 2.383440301278311, + "grad_norm": 2.3954765796661377, + "learning_rate": 1.0275994978694815e-05, + "loss": 0.5638, + "step": 269610 + }, + { + "epoch": 2.3835287045386235, + "grad_norm": 4.627337455749512, + "learning_rate": 1.0274521591022943e-05, + "loss": 0.6525, + "step": 269620 + }, + { + "epoch": 2.3836171077989357, + "grad_norm": 1.0566006898880005, + "learning_rate": 1.0273048203351073e-05, + "loss": 0.4213, + "step": 269630 + }, + { + "epoch": 2.3837055110592478, + "grad_norm": 5.218748092651367, + "learning_rate": 1.0271574815679202e-05, + "loss": 0.5858, + "step": 269640 + }, + { + "epoch": 2.3837939143195603, + "grad_norm": 13.563329696655273, + "learning_rate": 1.0270101428007332e-05, + "loss": 0.622, + "step": 269650 + }, + { + "epoch": 2.3838823175798725, + "grad_norm": 5.850147247314453, + "learning_rate": 1.0268628040335462e-05, + "loss": 0.4701, + "step": 269660 + }, + { + "epoch": 2.3839707208401846, + "grad_norm": 4.803499698638916, + "learning_rate": 1.026715465266359e-05, + "loss": 0.5682, + "step": 269670 + }, + { + "epoch": 2.3840591241004967, + "grad_norm": 2.6381311416625977, + "learning_rate": 1.026568126499172e-05, + "loss": 0.4022, + "step": 269680 + }, + { + "epoch": 2.3841475273608093, + "grad_norm": 2.2220003604888916, + "learning_rate": 1.026420787731985e-05, + "loss": 0.5662, + "step": 269690 + }, + { + "epoch": 2.3842359306211214, + "grad_norm": 4.810914039611816, + "learning_rate": 1.0262734489647979e-05, + "loss": 0.4318, + "step": 269700 + }, + { + "epoch": 2.3843243338814335, + "grad_norm": 1.6528156995773315, + "learning_rate": 1.0261261101976109e-05, + "loss": 0.5945, + "step": 269710 + }, + { + "epoch": 2.3844127371417456, + "grad_norm": 1.7090619802474976, + "learning_rate": 1.0259787714304239e-05, + "loss": 0.5279, + "step": 269720 + }, + { + "epoch": 2.384501140402058, + "grad_norm": 5.552154064178467, + "learning_rate": 1.0258314326632367e-05, + "loss": 0.6065, + "step": 269730 + }, + { + "epoch": 2.3845895436623703, + "grad_norm": 6.339583396911621, + "learning_rate": 1.0256840938960497e-05, + "loss": 0.5337, + "step": 269740 + }, + { + "epoch": 2.3846779469226824, + "grad_norm": 1.3679378032684326, + "learning_rate": 1.0255367551288625e-05, + "loss": 0.5739, + "step": 269750 + }, + { + "epoch": 2.3847663501829945, + "grad_norm": 2.4793381690979004, + "learning_rate": 1.0253894163616755e-05, + "loss": 0.4761, + "step": 269760 + }, + { + "epoch": 2.384854753443307, + "grad_norm": 7.534440517425537, + "learning_rate": 1.0252420775944884e-05, + "loss": 0.4123, + "step": 269770 + }, + { + "epoch": 2.3849431567036192, + "grad_norm": 1.44501531124115, + "learning_rate": 1.0250947388273014e-05, + "loss": 0.389, + "step": 269780 + }, + { + "epoch": 2.3850315599639313, + "grad_norm": 4.855055332183838, + "learning_rate": 1.0249474000601144e-05, + "loss": 0.4857, + "step": 269790 + }, + { + "epoch": 2.385119963224244, + "grad_norm": 3.912416934967041, + "learning_rate": 1.0248000612929272e-05, + "loss": 0.4405, + "step": 269800 + }, + { + "epoch": 2.385208366484556, + "grad_norm": 7.479511737823486, + "learning_rate": 1.0246527225257402e-05, + "loss": 0.4659, + "step": 269810 + }, + { + "epoch": 2.385296769744868, + "grad_norm": 3.1216299533843994, + "learning_rate": 1.024505383758553e-05, + "loss": 0.4685, + "step": 269820 + }, + { + "epoch": 2.3853851730051803, + "grad_norm": 6.287917137145996, + "learning_rate": 1.024358044991366e-05, + "loss": 0.4853, + "step": 269830 + }, + { + "epoch": 2.385473576265493, + "grad_norm": 1.3076577186584473, + "learning_rate": 1.0242107062241789e-05, + "loss": 0.5317, + "step": 269840 + }, + { + "epoch": 2.385561979525805, + "grad_norm": 5.219674587249756, + "learning_rate": 1.0240633674569919e-05, + "loss": 0.4943, + "step": 269850 + }, + { + "epoch": 2.385650382786117, + "grad_norm": 4.8685688972473145, + "learning_rate": 1.0239160286898047e-05, + "loss": 0.4622, + "step": 269860 + }, + { + "epoch": 2.3857387860464296, + "grad_norm": 6.442781448364258, + "learning_rate": 1.0237686899226178e-05, + "loss": 0.5584, + "step": 269870 + }, + { + "epoch": 2.3858271893067418, + "grad_norm": 4.2437744140625, + "learning_rate": 1.0236213511554308e-05, + "loss": 0.5443, + "step": 269880 + }, + { + "epoch": 2.385915592567054, + "grad_norm": 4.545530796051025, + "learning_rate": 1.0234740123882436e-05, + "loss": 0.6784, + "step": 269890 + }, + { + "epoch": 2.386003995827366, + "grad_norm": 4.3721160888671875, + "learning_rate": 1.0233266736210566e-05, + "loss": 0.4089, + "step": 269900 + }, + { + "epoch": 2.386092399087678, + "grad_norm": 2.751387357711792, + "learning_rate": 1.0231793348538694e-05, + "loss": 0.4633, + "step": 269910 + }, + { + "epoch": 2.3861808023479907, + "grad_norm": 3.204629421234131, + "learning_rate": 1.0230319960866824e-05, + "loss": 0.5967, + "step": 269920 + }, + { + "epoch": 2.386269205608303, + "grad_norm": 1.9774569272994995, + "learning_rate": 1.0228846573194953e-05, + "loss": 0.4588, + "step": 269930 + }, + { + "epoch": 2.386357608868615, + "grad_norm": 3.3836848735809326, + "learning_rate": 1.0227373185523083e-05, + "loss": 0.5228, + "step": 269940 + }, + { + "epoch": 2.3864460121289275, + "grad_norm": 2.5960891246795654, + "learning_rate": 1.0225899797851211e-05, + "loss": 0.3542, + "step": 269950 + }, + { + "epoch": 2.3865344153892396, + "grad_norm": 5.405077934265137, + "learning_rate": 1.0224426410179341e-05, + "loss": 0.6287, + "step": 269960 + }, + { + "epoch": 2.3866228186495517, + "grad_norm": 0.8766354918479919, + "learning_rate": 1.022295302250747e-05, + "loss": 0.5708, + "step": 269970 + }, + { + "epoch": 2.386711221909864, + "grad_norm": 3.4500255584716797, + "learning_rate": 1.02214796348356e-05, + "loss": 0.42, + "step": 269980 + }, + { + "epoch": 2.3867996251701764, + "grad_norm": 7.967315196990967, + "learning_rate": 1.022000624716373e-05, + "loss": 0.4345, + "step": 269990 + }, + { + "epoch": 2.3868880284304885, + "grad_norm": 4.680641174316406, + "learning_rate": 1.0218532859491858e-05, + "loss": 0.4209, + "step": 270000 + }, + { + "epoch": 2.3869764316908006, + "grad_norm": 2.6986820697784424, + "learning_rate": 1.0217059471819988e-05, + "loss": 0.4625, + "step": 270010 + }, + { + "epoch": 2.387064834951113, + "grad_norm": 3.4898786544799805, + "learning_rate": 1.0215586084148116e-05, + "loss": 0.5325, + "step": 270020 + }, + { + "epoch": 2.3871532382114253, + "grad_norm": 1.4452722072601318, + "learning_rate": 1.0214112696476246e-05, + "loss": 0.4833, + "step": 270030 + }, + { + "epoch": 2.3872416414717375, + "grad_norm": 6.5640130043029785, + "learning_rate": 1.0212639308804375e-05, + "loss": 0.4443, + "step": 270040 + }, + { + "epoch": 2.3873300447320496, + "grad_norm": 1.2794337272644043, + "learning_rate": 1.0211165921132505e-05, + "loss": 0.6816, + "step": 270050 + }, + { + "epoch": 2.387418447992362, + "grad_norm": 10.370240211486816, + "learning_rate": 1.0209692533460633e-05, + "loss": 0.5869, + "step": 270060 + }, + { + "epoch": 2.3875068512526743, + "grad_norm": 2.374508857727051, + "learning_rate": 1.0208219145788763e-05, + "loss": 0.5683, + "step": 270070 + }, + { + "epoch": 2.3875952545129864, + "grad_norm": 4.483141899108887, + "learning_rate": 1.0206745758116893e-05, + "loss": 0.4991, + "step": 270080 + }, + { + "epoch": 2.3876836577732985, + "grad_norm": 0.6656047105789185, + "learning_rate": 1.0205272370445022e-05, + "loss": 0.4649, + "step": 270090 + }, + { + "epoch": 2.387772061033611, + "grad_norm": 4.154426574707031, + "learning_rate": 1.0203798982773152e-05, + "loss": 0.5241, + "step": 270100 + }, + { + "epoch": 2.387860464293923, + "grad_norm": 5.786174774169922, + "learning_rate": 1.020232559510128e-05, + "loss": 0.6235, + "step": 270110 + }, + { + "epoch": 2.3879488675542353, + "grad_norm": 1.144124984741211, + "learning_rate": 1.020085220742941e-05, + "loss": 0.5011, + "step": 270120 + }, + { + "epoch": 2.3880372708145474, + "grad_norm": 5.234827995300293, + "learning_rate": 1.019937881975754e-05, + "loss": 0.4586, + "step": 270130 + }, + { + "epoch": 2.38812567407486, + "grad_norm": 2.4835782051086426, + "learning_rate": 1.0197905432085669e-05, + "loss": 0.6048, + "step": 270140 + }, + { + "epoch": 2.388214077335172, + "grad_norm": 1.6747742891311646, + "learning_rate": 1.0196432044413799e-05, + "loss": 0.6203, + "step": 270150 + }, + { + "epoch": 2.3883024805954842, + "grad_norm": 0.7653728723526001, + "learning_rate": 1.0194958656741929e-05, + "loss": 0.4575, + "step": 270160 + }, + { + "epoch": 2.388390883855797, + "grad_norm": 5.206302642822266, + "learning_rate": 1.0193485269070057e-05, + "loss": 0.4116, + "step": 270170 + }, + { + "epoch": 2.388479287116109, + "grad_norm": 7.729319095611572, + "learning_rate": 1.0192011881398187e-05, + "loss": 0.5712, + "step": 270180 + }, + { + "epoch": 2.388567690376421, + "grad_norm": 6.867127418518066, + "learning_rate": 1.0190538493726317e-05, + "loss": 0.4638, + "step": 270190 + }, + { + "epoch": 2.388656093636733, + "grad_norm": 11.5264253616333, + "learning_rate": 1.0189065106054445e-05, + "loss": 0.6165, + "step": 270200 + }, + { + "epoch": 2.3887444968970457, + "grad_norm": 2.0064918994903564, + "learning_rate": 1.0187591718382575e-05, + "loss": 0.6703, + "step": 270210 + }, + { + "epoch": 2.388832900157358, + "grad_norm": 4.633963584899902, + "learning_rate": 1.0186118330710704e-05, + "loss": 0.3879, + "step": 270220 + }, + { + "epoch": 2.38892130341767, + "grad_norm": 2.1169826984405518, + "learning_rate": 1.0184644943038834e-05, + "loss": 0.6241, + "step": 270230 + }, + { + "epoch": 2.3890097066779825, + "grad_norm": 6.269118309020996, + "learning_rate": 1.0183171555366962e-05, + "loss": 0.5659, + "step": 270240 + }, + { + "epoch": 2.3890981099382946, + "grad_norm": 6.601999282836914, + "learning_rate": 1.0181698167695092e-05, + "loss": 0.6289, + "step": 270250 + }, + { + "epoch": 2.3891865131986068, + "grad_norm": 1.4737735986709595, + "learning_rate": 1.0180224780023222e-05, + "loss": 0.4203, + "step": 270260 + }, + { + "epoch": 2.389274916458919, + "grad_norm": 3.278348207473755, + "learning_rate": 1.017875139235135e-05, + "loss": 0.6761, + "step": 270270 + }, + { + "epoch": 2.3893633197192314, + "grad_norm": 3.7813169956207275, + "learning_rate": 1.017727800467948e-05, + "loss": 0.4442, + "step": 270280 + }, + { + "epoch": 2.3894517229795436, + "grad_norm": 3.2910594940185547, + "learning_rate": 1.0175804617007609e-05, + "loss": 0.5627, + "step": 270290 + }, + { + "epoch": 2.3895401262398557, + "grad_norm": 2.8817059993743896, + "learning_rate": 1.0174331229335739e-05, + "loss": 0.4881, + "step": 270300 + }, + { + "epoch": 2.389628529500168, + "grad_norm": 0.8311653137207031, + "learning_rate": 1.0172857841663867e-05, + "loss": 0.4684, + "step": 270310 + }, + { + "epoch": 2.3897169327604804, + "grad_norm": 22.28907012939453, + "learning_rate": 1.0171384453991998e-05, + "loss": 0.5357, + "step": 270320 + }, + { + "epoch": 2.3898053360207925, + "grad_norm": 3.0497853755950928, + "learning_rate": 1.0169911066320126e-05, + "loss": 0.6142, + "step": 270330 + }, + { + "epoch": 2.3898937392811046, + "grad_norm": 10.457246780395508, + "learning_rate": 1.0168437678648256e-05, + "loss": 0.6023, + "step": 270340 + }, + { + "epoch": 2.3899821425414167, + "grad_norm": 1.5752679109573364, + "learning_rate": 1.0166964290976386e-05, + "loss": 0.446, + "step": 270350 + }, + { + "epoch": 2.3900705458017293, + "grad_norm": 3.573310613632202, + "learning_rate": 1.0165490903304514e-05, + "loss": 0.5259, + "step": 270360 + }, + { + "epoch": 2.3901589490620414, + "grad_norm": 1.118364930152893, + "learning_rate": 1.0164017515632644e-05, + "loss": 0.5703, + "step": 270370 + }, + { + "epoch": 2.3902473523223535, + "grad_norm": 1.6907751560211182, + "learning_rate": 1.0162544127960773e-05, + "loss": 0.5801, + "step": 270380 + }, + { + "epoch": 2.390335755582666, + "grad_norm": 16.94002342224121, + "learning_rate": 1.0161070740288903e-05, + "loss": 0.5474, + "step": 270390 + }, + { + "epoch": 2.390424158842978, + "grad_norm": 1.4883209466934204, + "learning_rate": 1.0159597352617031e-05, + "loss": 0.6331, + "step": 270400 + }, + { + "epoch": 2.3905125621032903, + "grad_norm": 2.2582292556762695, + "learning_rate": 1.0158123964945161e-05, + "loss": 0.5219, + "step": 270410 + }, + { + "epoch": 2.3906009653636024, + "grad_norm": 2.3434250354766846, + "learning_rate": 1.015665057727329e-05, + "loss": 0.4803, + "step": 270420 + }, + { + "epoch": 2.390689368623915, + "grad_norm": 2.3987996578216553, + "learning_rate": 1.015517718960142e-05, + "loss": 0.4541, + "step": 270430 + }, + { + "epoch": 2.390777771884227, + "grad_norm": 6.009228229522705, + "learning_rate": 1.015370380192955e-05, + "loss": 0.5506, + "step": 270440 + }, + { + "epoch": 2.3908661751445393, + "grad_norm": 15.873747825622559, + "learning_rate": 1.0152230414257678e-05, + "loss": 0.6048, + "step": 270450 + }, + { + "epoch": 2.390954578404852, + "grad_norm": 4.273069381713867, + "learning_rate": 1.0150757026585808e-05, + "loss": 0.4975, + "step": 270460 + }, + { + "epoch": 2.391042981665164, + "grad_norm": 7.338464260101318, + "learning_rate": 1.0149283638913936e-05, + "loss": 0.6197, + "step": 270470 + }, + { + "epoch": 2.391131384925476, + "grad_norm": 3.042360544204712, + "learning_rate": 1.0147810251242066e-05, + "loss": 0.4923, + "step": 270480 + }, + { + "epoch": 2.391219788185788, + "grad_norm": 1.5245252847671509, + "learning_rate": 1.0146336863570195e-05, + "loss": 0.5029, + "step": 270490 + }, + { + "epoch": 2.3913081914461003, + "grad_norm": 3.8659040927886963, + "learning_rate": 1.0144863475898325e-05, + "loss": 0.5787, + "step": 270500 + }, + { + "epoch": 2.391396594706413, + "grad_norm": 2.440183401107788, + "learning_rate": 1.0143390088226453e-05, + "loss": 0.5718, + "step": 270510 + }, + { + "epoch": 2.391484997966725, + "grad_norm": 10.039068222045898, + "learning_rate": 1.0141916700554583e-05, + "loss": 0.6617, + "step": 270520 + }, + { + "epoch": 2.391573401227037, + "grad_norm": 2.0790419578552246, + "learning_rate": 1.0140443312882712e-05, + "loss": 0.6587, + "step": 270530 + }, + { + "epoch": 2.3916618044873497, + "grad_norm": 1.9918771982192993, + "learning_rate": 1.0138969925210842e-05, + "loss": 0.5804, + "step": 270540 + }, + { + "epoch": 2.391750207747662, + "grad_norm": 2.684877634048462, + "learning_rate": 1.0137496537538972e-05, + "loss": 0.4576, + "step": 270550 + }, + { + "epoch": 2.391838611007974, + "grad_norm": 6.15983247756958, + "learning_rate": 1.01360231498671e-05, + "loss": 0.5109, + "step": 270560 + }, + { + "epoch": 2.391927014268286, + "grad_norm": 2.6234772205352783, + "learning_rate": 1.013454976219523e-05, + "loss": 0.6379, + "step": 270570 + }, + { + "epoch": 2.3920154175285986, + "grad_norm": 1.866553544998169, + "learning_rate": 1.0133076374523358e-05, + "loss": 0.5381, + "step": 270580 + }, + { + "epoch": 2.3921038207889107, + "grad_norm": 1.5235004425048828, + "learning_rate": 1.0131602986851488e-05, + "loss": 0.625, + "step": 270590 + }, + { + "epoch": 2.392192224049223, + "grad_norm": 5.443690299987793, + "learning_rate": 1.0130129599179619e-05, + "loss": 0.5205, + "step": 270600 + }, + { + "epoch": 2.3922806273095354, + "grad_norm": 4.432121276855469, + "learning_rate": 1.0128656211507747e-05, + "loss": 0.5717, + "step": 270610 + }, + { + "epoch": 2.3923690305698475, + "grad_norm": 4.578058242797852, + "learning_rate": 1.0127182823835877e-05, + "loss": 0.509, + "step": 270620 + }, + { + "epoch": 2.3924574338301596, + "grad_norm": 1.7980115413665771, + "learning_rate": 1.0125709436164007e-05, + "loss": 0.6239, + "step": 270630 + }, + { + "epoch": 2.3925458370904717, + "grad_norm": 4.169879913330078, + "learning_rate": 1.0124236048492135e-05, + "loss": 0.5718, + "step": 270640 + }, + { + "epoch": 2.3926342403507843, + "grad_norm": 1.4170373678207397, + "learning_rate": 1.0122762660820265e-05, + "loss": 0.5413, + "step": 270650 + }, + { + "epoch": 2.3927226436110964, + "grad_norm": 2.7052266597747803, + "learning_rate": 1.0121289273148395e-05, + "loss": 0.5404, + "step": 270660 + }, + { + "epoch": 2.3928110468714086, + "grad_norm": 3.309792995452881, + "learning_rate": 1.0119815885476524e-05, + "loss": 0.5731, + "step": 270670 + }, + { + "epoch": 2.3928994501317207, + "grad_norm": 2.4979023933410645, + "learning_rate": 1.0118342497804654e-05, + "loss": 0.5381, + "step": 270680 + }, + { + "epoch": 2.3929878533920332, + "grad_norm": 13.860910415649414, + "learning_rate": 1.0116869110132782e-05, + "loss": 0.5947, + "step": 270690 + }, + { + "epoch": 2.3930762566523454, + "grad_norm": 6.622684478759766, + "learning_rate": 1.0115395722460912e-05, + "loss": 0.5241, + "step": 270700 + }, + { + "epoch": 2.3931646599126575, + "grad_norm": 3.840116500854492, + "learning_rate": 1.011392233478904e-05, + "loss": 0.5625, + "step": 270710 + }, + { + "epoch": 2.3932530631729696, + "grad_norm": 12.145888328552246, + "learning_rate": 1.011244894711717e-05, + "loss": 0.482, + "step": 270720 + }, + { + "epoch": 2.393341466433282, + "grad_norm": 9.320114135742188, + "learning_rate": 1.01109755594453e-05, + "loss": 0.5966, + "step": 270730 + }, + { + "epoch": 2.3934298696935943, + "grad_norm": 2.361046075820923, + "learning_rate": 1.0109502171773429e-05, + "loss": 0.4257, + "step": 270740 + }, + { + "epoch": 2.3935182729539064, + "grad_norm": 1.5295840501785278, + "learning_rate": 1.0108028784101559e-05, + "loss": 0.4628, + "step": 270750 + }, + { + "epoch": 2.393606676214219, + "grad_norm": 1.841159701347351, + "learning_rate": 1.0106555396429687e-05, + "loss": 0.3872, + "step": 270760 + }, + { + "epoch": 2.393695079474531, + "grad_norm": 2.034297466278076, + "learning_rate": 1.0105082008757817e-05, + "loss": 0.5463, + "step": 270770 + }, + { + "epoch": 2.393783482734843, + "grad_norm": 1.8758708238601685, + "learning_rate": 1.0103608621085946e-05, + "loss": 0.5247, + "step": 270780 + }, + { + "epoch": 2.3938718859951553, + "grad_norm": 2.346585750579834, + "learning_rate": 1.0102135233414076e-05, + "loss": 0.3176, + "step": 270790 + }, + { + "epoch": 2.393960289255468, + "grad_norm": 2.0685362815856934, + "learning_rate": 1.0100661845742204e-05, + "loss": 0.5112, + "step": 270800 + }, + { + "epoch": 2.39404869251578, + "grad_norm": 1.0122367143630981, + "learning_rate": 1.0099188458070334e-05, + "loss": 0.4424, + "step": 270810 + }, + { + "epoch": 2.394137095776092, + "grad_norm": 8.27646255493164, + "learning_rate": 1.0097715070398464e-05, + "loss": 0.5783, + "step": 270820 + }, + { + "epoch": 2.3942254990364047, + "grad_norm": 5.160140514373779, + "learning_rate": 1.0096241682726593e-05, + "loss": 0.5022, + "step": 270830 + }, + { + "epoch": 2.394313902296717, + "grad_norm": 1.5714423656463623, + "learning_rate": 1.0094768295054723e-05, + "loss": 0.4921, + "step": 270840 + }, + { + "epoch": 2.394402305557029, + "grad_norm": 4.984143257141113, + "learning_rate": 1.0093294907382851e-05, + "loss": 0.541, + "step": 270850 + }, + { + "epoch": 2.394490708817341, + "grad_norm": 5.461113929748535, + "learning_rate": 1.0091821519710981e-05, + "loss": 0.5342, + "step": 270860 + }, + { + "epoch": 2.3945791120776536, + "grad_norm": 7.008866310119629, + "learning_rate": 1.009034813203911e-05, + "loss": 0.4754, + "step": 270870 + }, + { + "epoch": 2.3946675153379657, + "grad_norm": 2.2870001792907715, + "learning_rate": 1.008887474436724e-05, + "loss": 0.6033, + "step": 270880 + }, + { + "epoch": 2.394755918598278, + "grad_norm": 3.814098596572876, + "learning_rate": 1.0087401356695368e-05, + "loss": 0.4906, + "step": 270890 + }, + { + "epoch": 2.39484432185859, + "grad_norm": 2.1824166774749756, + "learning_rate": 1.0085927969023498e-05, + "loss": 0.544, + "step": 270900 + }, + { + "epoch": 2.3949327251189025, + "grad_norm": 6.303493499755859, + "learning_rate": 1.0084454581351628e-05, + "loss": 0.5313, + "step": 270910 + }, + { + "epoch": 2.3950211283792147, + "grad_norm": 1.274641990661621, + "learning_rate": 1.0082981193679756e-05, + "loss": 0.4462, + "step": 270920 + }, + { + "epoch": 2.395109531639527, + "grad_norm": 1.8361639976501465, + "learning_rate": 1.0081507806007886e-05, + "loss": 0.5542, + "step": 270930 + }, + { + "epoch": 2.395197934899839, + "grad_norm": 4.124182224273682, + "learning_rate": 1.0080034418336015e-05, + "loss": 0.3522, + "step": 270940 + }, + { + "epoch": 2.3952863381601515, + "grad_norm": 2.264514684677124, + "learning_rate": 1.0078561030664145e-05, + "loss": 0.4243, + "step": 270950 + }, + { + "epoch": 2.3953747414204636, + "grad_norm": 5.958105564117432, + "learning_rate": 1.0077087642992273e-05, + "loss": 0.4974, + "step": 270960 + }, + { + "epoch": 2.3954631446807757, + "grad_norm": 2.1438181400299072, + "learning_rate": 1.0075614255320403e-05, + "loss": 0.4315, + "step": 270970 + }, + { + "epoch": 2.3955515479410883, + "grad_norm": 1.8404405117034912, + "learning_rate": 1.0074140867648532e-05, + "loss": 0.5776, + "step": 270980 + }, + { + "epoch": 2.3956399512014004, + "grad_norm": 2.686272621154785, + "learning_rate": 1.0072667479976662e-05, + "loss": 0.5078, + "step": 270990 + }, + { + "epoch": 2.3957283544617125, + "grad_norm": 3.905007839202881, + "learning_rate": 1.007119409230479e-05, + "loss": 0.4586, + "step": 271000 + }, + { + "epoch": 2.3958167577220246, + "grad_norm": 1.1040127277374268, + "learning_rate": 1.006972070463292e-05, + "loss": 0.5158, + "step": 271010 + }, + { + "epoch": 2.395905160982337, + "grad_norm": 5.521117687225342, + "learning_rate": 1.006824731696105e-05, + "loss": 0.5336, + "step": 271020 + }, + { + "epoch": 2.3959935642426493, + "grad_norm": 4.252713203430176, + "learning_rate": 1.0066773929289178e-05, + "loss": 0.5299, + "step": 271030 + }, + { + "epoch": 2.3960819675029614, + "grad_norm": 2.8275563716888428, + "learning_rate": 1.0065300541617308e-05, + "loss": 0.5536, + "step": 271040 + }, + { + "epoch": 2.396170370763274, + "grad_norm": 2.7608859539031982, + "learning_rate": 1.0063827153945437e-05, + "loss": 0.5095, + "step": 271050 + }, + { + "epoch": 2.396258774023586, + "grad_norm": 3.25068998336792, + "learning_rate": 1.0062353766273567e-05, + "loss": 0.6593, + "step": 271060 + }, + { + "epoch": 2.3963471772838982, + "grad_norm": 3.306769371032715, + "learning_rate": 1.0060880378601697e-05, + "loss": 0.448, + "step": 271070 + }, + { + "epoch": 2.3964355805442104, + "grad_norm": 0.7224762439727783, + "learning_rate": 1.0059406990929825e-05, + "loss": 0.4482, + "step": 271080 + }, + { + "epoch": 2.3965239838045225, + "grad_norm": 7.455730438232422, + "learning_rate": 1.0057933603257955e-05, + "loss": 0.6157, + "step": 271090 + }, + { + "epoch": 2.396612387064835, + "grad_norm": 6.88818359375, + "learning_rate": 1.0056460215586085e-05, + "loss": 0.632, + "step": 271100 + }, + { + "epoch": 2.396700790325147, + "grad_norm": 5.41624116897583, + "learning_rate": 1.0054986827914214e-05, + "loss": 0.597, + "step": 271110 + }, + { + "epoch": 2.3967891935854593, + "grad_norm": 5.46660852432251, + "learning_rate": 1.0053513440242344e-05, + "loss": 0.4147, + "step": 271120 + }, + { + "epoch": 2.396877596845772, + "grad_norm": 3.5830507278442383, + "learning_rate": 1.0052040052570474e-05, + "loss": 0.4332, + "step": 271130 + }, + { + "epoch": 2.396966000106084, + "grad_norm": 4.885222911834717, + "learning_rate": 1.0050566664898602e-05, + "loss": 0.562, + "step": 271140 + }, + { + "epoch": 2.397054403366396, + "grad_norm": 5.83073616027832, + "learning_rate": 1.0049093277226732e-05, + "loss": 0.6205, + "step": 271150 + }, + { + "epoch": 2.397142806626708, + "grad_norm": 54.45116424560547, + "learning_rate": 1.004761988955486e-05, + "loss": 0.5756, + "step": 271160 + }, + { + "epoch": 2.3972312098870208, + "grad_norm": 9.130830764770508, + "learning_rate": 1.004614650188299e-05, + "loss": 0.6358, + "step": 271170 + }, + { + "epoch": 2.397319613147333, + "grad_norm": 2.96807599067688, + "learning_rate": 1.0044673114211119e-05, + "loss": 0.4709, + "step": 271180 + }, + { + "epoch": 2.397408016407645, + "grad_norm": 1.5605251789093018, + "learning_rate": 1.0043199726539249e-05, + "loss": 0.5283, + "step": 271190 + }, + { + "epoch": 2.3974964196679576, + "grad_norm": 7.940244197845459, + "learning_rate": 1.0041726338867379e-05, + "loss": 0.7409, + "step": 271200 + }, + { + "epoch": 2.3975848229282697, + "grad_norm": 2.0741915702819824, + "learning_rate": 1.0040252951195507e-05, + "loss": 0.5917, + "step": 271210 + }, + { + "epoch": 2.397673226188582, + "grad_norm": 4.918076515197754, + "learning_rate": 1.0038779563523637e-05, + "loss": 0.5633, + "step": 271220 + }, + { + "epoch": 2.397761629448894, + "grad_norm": 7.381714820861816, + "learning_rate": 1.0037306175851766e-05, + "loss": 0.5491, + "step": 271230 + }, + { + "epoch": 2.3978500327092065, + "grad_norm": 1.7731115818023682, + "learning_rate": 1.0035832788179896e-05, + "loss": 0.5237, + "step": 271240 + }, + { + "epoch": 2.3979384359695186, + "grad_norm": 1.6022820472717285, + "learning_rate": 1.0034359400508024e-05, + "loss": 0.5183, + "step": 271250 + }, + { + "epoch": 2.3980268392298307, + "grad_norm": 4.726632118225098, + "learning_rate": 1.0032886012836154e-05, + "loss": 0.5266, + "step": 271260 + }, + { + "epoch": 2.398115242490143, + "grad_norm": 4.5221333503723145, + "learning_rate": 1.0031412625164283e-05, + "loss": 0.4527, + "step": 271270 + }, + { + "epoch": 2.3982036457504554, + "grad_norm": 4.6693220138549805, + "learning_rate": 1.0029939237492413e-05, + "loss": 0.4683, + "step": 271280 + }, + { + "epoch": 2.3982920490107675, + "grad_norm": 3.3452343940734863, + "learning_rate": 1.0028465849820543e-05, + "loss": 0.5152, + "step": 271290 + }, + { + "epoch": 2.3983804522710797, + "grad_norm": 2.5060203075408936, + "learning_rate": 1.0026992462148671e-05, + "loss": 0.4476, + "step": 271300 + }, + { + "epoch": 2.3984688555313918, + "grad_norm": 3.6070330142974854, + "learning_rate": 1.0025519074476801e-05, + "loss": 0.619, + "step": 271310 + }, + { + "epoch": 2.3985572587917043, + "grad_norm": 1.4778404235839844, + "learning_rate": 1.002404568680493e-05, + "loss": 0.5515, + "step": 271320 + }, + { + "epoch": 2.3986456620520165, + "grad_norm": 1.3632951974868774, + "learning_rate": 1.002257229913306e-05, + "loss": 0.4323, + "step": 271330 + }, + { + "epoch": 2.3987340653123286, + "grad_norm": 3.4011316299438477, + "learning_rate": 1.0021098911461188e-05, + "loss": 0.4716, + "step": 271340 + }, + { + "epoch": 2.398822468572641, + "grad_norm": 5.901969909667969, + "learning_rate": 1.0019625523789318e-05, + "loss": 0.3791, + "step": 271350 + }, + { + "epoch": 2.3989108718329533, + "grad_norm": 4.737884521484375, + "learning_rate": 1.0018152136117446e-05, + "loss": 0.5308, + "step": 271360 + }, + { + "epoch": 2.3989992750932654, + "grad_norm": 4.57314920425415, + "learning_rate": 1.0016678748445576e-05, + "loss": 0.7397, + "step": 271370 + }, + { + "epoch": 2.3990876783535775, + "grad_norm": 1.1393789052963257, + "learning_rate": 1.0015205360773706e-05, + "loss": 0.5133, + "step": 271380 + }, + { + "epoch": 2.39917608161389, + "grad_norm": 1.137664556503296, + "learning_rate": 1.0013731973101835e-05, + "loss": 0.462, + "step": 271390 + }, + { + "epoch": 2.399264484874202, + "grad_norm": 5.227400779724121, + "learning_rate": 1.0012258585429965e-05, + "loss": 0.4531, + "step": 271400 + }, + { + "epoch": 2.3993528881345143, + "grad_norm": 1.472112774848938, + "learning_rate": 1.0010785197758093e-05, + "loss": 0.5175, + "step": 271410 + }, + { + "epoch": 2.399441291394827, + "grad_norm": 3.797260046005249, + "learning_rate": 1.0009311810086223e-05, + "loss": 0.5855, + "step": 271420 + }, + { + "epoch": 2.399529694655139, + "grad_norm": 5.754822254180908, + "learning_rate": 1.0007838422414352e-05, + "loss": 0.4639, + "step": 271430 + }, + { + "epoch": 2.399618097915451, + "grad_norm": 15.236405372619629, + "learning_rate": 1.0006365034742482e-05, + "loss": 0.5127, + "step": 271440 + }, + { + "epoch": 2.3997065011757632, + "grad_norm": 1.6619501113891602, + "learning_rate": 1.000489164707061e-05, + "loss": 0.5055, + "step": 271450 + }, + { + "epoch": 2.399794904436076, + "grad_norm": 5.916011333465576, + "learning_rate": 1.000341825939874e-05, + "loss": 0.5593, + "step": 271460 + }, + { + "epoch": 2.399883307696388, + "grad_norm": 3.485762119293213, + "learning_rate": 1.000194487172687e-05, + "loss": 0.5122, + "step": 271470 + }, + { + "epoch": 2.3999717109567, + "grad_norm": 2.050906181335449, + "learning_rate": 1.0000471484054998e-05, + "loss": 0.545, + "step": 271480 + }, + { + "epoch": 2.400060114217012, + "grad_norm": 3.4743406772613525, + "learning_rate": 9.998998096383128e-06, + "loss": 0.4686, + "step": 271490 + }, + { + "epoch": 2.4001485174773247, + "grad_norm": 5.36237907409668, + "learning_rate": 9.997524708711257e-06, + "loss": 0.6283, + "step": 271500 + }, + { + "epoch": 2.400236920737637, + "grad_norm": 10.186990737915039, + "learning_rate": 9.996051321039387e-06, + "loss": 0.5188, + "step": 271510 + }, + { + "epoch": 2.400325323997949, + "grad_norm": 2.935375213623047, + "learning_rate": 9.994577933367515e-06, + "loss": 0.5123, + "step": 271520 + }, + { + "epoch": 2.400413727258261, + "grad_norm": 7.252697467803955, + "learning_rate": 9.993104545695645e-06, + "loss": 0.5239, + "step": 271530 + }, + { + "epoch": 2.4005021305185736, + "grad_norm": 1.832800030708313, + "learning_rate": 9.991631158023775e-06, + "loss": 0.6382, + "step": 271540 + }, + { + "epoch": 2.4005905337788858, + "grad_norm": 1.7258086204528809, + "learning_rate": 9.990157770351904e-06, + "loss": 0.5501, + "step": 271550 + }, + { + "epoch": 2.400678937039198, + "grad_norm": 4.60171365737915, + "learning_rate": 9.988684382680034e-06, + "loss": 0.5714, + "step": 271560 + }, + { + "epoch": 2.4007673402995104, + "grad_norm": 2.393079996109009, + "learning_rate": 9.987210995008164e-06, + "loss": 0.6068, + "step": 271570 + }, + { + "epoch": 2.4008557435598226, + "grad_norm": 4.238519191741943, + "learning_rate": 9.985737607336292e-06, + "loss": 0.4466, + "step": 271580 + }, + { + "epoch": 2.4009441468201347, + "grad_norm": 12.40735149383545, + "learning_rate": 9.984264219664422e-06, + "loss": 0.5673, + "step": 271590 + }, + { + "epoch": 2.401032550080447, + "grad_norm": 6.224597930908203, + "learning_rate": 9.982790831992552e-06, + "loss": 0.5615, + "step": 271600 + }, + { + "epoch": 2.4011209533407594, + "grad_norm": 2.5352542400360107, + "learning_rate": 9.98131744432068e-06, + "loss": 0.6992, + "step": 271610 + }, + { + "epoch": 2.4012093566010715, + "grad_norm": 2.3690073490142822, + "learning_rate": 9.97984405664881e-06, + "loss": 0.5149, + "step": 271620 + }, + { + "epoch": 2.4012977598613836, + "grad_norm": 4.6905012130737305, + "learning_rate": 9.978370668976939e-06, + "loss": 0.4543, + "step": 271630 + }, + { + "epoch": 2.401386163121696, + "grad_norm": 1.5505967140197754, + "learning_rate": 9.976897281305069e-06, + "loss": 0.5766, + "step": 271640 + }, + { + "epoch": 2.4014745663820083, + "grad_norm": 2.0948071479797363, + "learning_rate": 9.975423893633197e-06, + "loss": 0.4935, + "step": 271650 + }, + { + "epoch": 2.4015629696423204, + "grad_norm": 1.778010368347168, + "learning_rate": 9.973950505961327e-06, + "loss": 0.5162, + "step": 271660 + }, + { + "epoch": 2.4016513729026325, + "grad_norm": 3.313521385192871, + "learning_rate": 9.972477118289457e-06, + "loss": 0.5928, + "step": 271670 + }, + { + "epoch": 2.4017397761629447, + "grad_norm": 15.33132553100586, + "learning_rate": 9.971003730617586e-06, + "loss": 0.4641, + "step": 271680 + }, + { + "epoch": 2.401828179423257, + "grad_norm": 1.2696748971939087, + "learning_rate": 9.969530342945716e-06, + "loss": 0.5495, + "step": 271690 + }, + { + "epoch": 2.4019165826835693, + "grad_norm": 2.203951835632324, + "learning_rate": 9.968056955273844e-06, + "loss": 0.5482, + "step": 271700 + }, + { + "epoch": 2.4020049859438815, + "grad_norm": 2.220449209213257, + "learning_rate": 9.966583567601974e-06, + "loss": 0.5915, + "step": 271710 + }, + { + "epoch": 2.402093389204194, + "grad_norm": 11.854196548461914, + "learning_rate": 9.965110179930103e-06, + "loss": 0.4814, + "step": 271720 + }, + { + "epoch": 2.402181792464506, + "grad_norm": 4.568637371063232, + "learning_rate": 9.963636792258233e-06, + "loss": 0.5131, + "step": 271730 + }, + { + "epoch": 2.4022701957248183, + "grad_norm": 5.077901840209961, + "learning_rate": 9.962163404586361e-06, + "loss": 0.6065, + "step": 271740 + }, + { + "epoch": 2.4023585989851304, + "grad_norm": 1.357389211654663, + "learning_rate": 9.960690016914491e-06, + "loss": 0.4345, + "step": 271750 + }, + { + "epoch": 2.402447002245443, + "grad_norm": 13.22885513305664, + "learning_rate": 9.959216629242621e-06, + "loss": 0.5755, + "step": 271760 + }, + { + "epoch": 2.402535405505755, + "grad_norm": 2.609036922454834, + "learning_rate": 9.95774324157075e-06, + "loss": 0.4712, + "step": 271770 + }, + { + "epoch": 2.402623808766067, + "grad_norm": 4.233473300933838, + "learning_rate": 9.95626985389888e-06, + "loss": 0.5299, + "step": 271780 + }, + { + "epoch": 2.4027122120263797, + "grad_norm": 0.8128421306610107, + "learning_rate": 9.954796466227008e-06, + "loss": 0.4654, + "step": 271790 + }, + { + "epoch": 2.402800615286692, + "grad_norm": 3.110783338546753, + "learning_rate": 9.953323078555138e-06, + "loss": 0.5476, + "step": 271800 + }, + { + "epoch": 2.402889018547004, + "grad_norm": 1.4992506504058838, + "learning_rate": 9.951849690883266e-06, + "loss": 0.5069, + "step": 271810 + }, + { + "epoch": 2.402977421807316, + "grad_norm": 2.67812180519104, + "learning_rate": 9.950376303211396e-06, + "loss": 0.6515, + "step": 271820 + }, + { + "epoch": 2.4030658250676287, + "grad_norm": 2.234647512435913, + "learning_rate": 9.948902915539525e-06, + "loss": 0.5601, + "step": 271830 + }, + { + "epoch": 2.403154228327941, + "grad_norm": 2.665834665298462, + "learning_rate": 9.947429527867655e-06, + "loss": 0.6153, + "step": 271840 + }, + { + "epoch": 2.403242631588253, + "grad_norm": 5.025918960571289, + "learning_rate": 9.945956140195785e-06, + "loss": 0.5334, + "step": 271850 + }, + { + "epoch": 2.403331034848565, + "grad_norm": 2.4264047145843506, + "learning_rate": 9.944482752523913e-06, + "loss": 0.4681, + "step": 271860 + }, + { + "epoch": 2.4034194381088776, + "grad_norm": 19.460878372192383, + "learning_rate": 9.943009364852043e-06, + "loss": 0.5539, + "step": 271870 + }, + { + "epoch": 2.4035078413691897, + "grad_norm": 5.403330326080322, + "learning_rate": 9.941535977180172e-06, + "loss": 0.5717, + "step": 271880 + }, + { + "epoch": 2.403596244629502, + "grad_norm": 2.745450973510742, + "learning_rate": 9.940062589508302e-06, + "loss": 0.4411, + "step": 271890 + }, + { + "epoch": 2.403684647889814, + "grad_norm": 1.9871909618377686, + "learning_rate": 9.93858920183643e-06, + "loss": 0.5674, + "step": 271900 + }, + { + "epoch": 2.4037730511501265, + "grad_norm": 1.91422438621521, + "learning_rate": 9.93711581416456e-06, + "loss": 0.4491, + "step": 271910 + }, + { + "epoch": 2.4038614544104386, + "grad_norm": 7.535366535186768, + "learning_rate": 9.935642426492688e-06, + "loss": 0.5561, + "step": 271920 + }, + { + "epoch": 2.4039498576707508, + "grad_norm": 2.751833200454712, + "learning_rate": 9.934169038820818e-06, + "loss": 0.5654, + "step": 271930 + }, + { + "epoch": 2.4040382609310633, + "grad_norm": 12.822623252868652, + "learning_rate": 9.932695651148948e-06, + "loss": 0.5499, + "step": 271940 + }, + { + "epoch": 2.4041266641913754, + "grad_norm": 1.7176774740219116, + "learning_rate": 9.931222263477077e-06, + "loss": 0.4894, + "step": 271950 + }, + { + "epoch": 2.4042150674516876, + "grad_norm": 2.0951550006866455, + "learning_rate": 9.929748875805207e-06, + "loss": 0.4797, + "step": 271960 + }, + { + "epoch": 2.4043034707119997, + "grad_norm": 3.932072401046753, + "learning_rate": 9.928275488133335e-06, + "loss": 0.545, + "step": 271970 + }, + { + "epoch": 2.4043918739723122, + "grad_norm": 8.882614135742188, + "learning_rate": 9.926802100461465e-06, + "loss": 0.5762, + "step": 271980 + }, + { + "epoch": 2.4044802772326244, + "grad_norm": 2.924015522003174, + "learning_rate": 9.925328712789595e-06, + "loss": 0.4349, + "step": 271990 + }, + { + "epoch": 2.4045686804929365, + "grad_norm": 4.149078369140625, + "learning_rate": 9.923855325117724e-06, + "loss": 0.6452, + "step": 272000 + }, + { + "epoch": 2.404657083753249, + "grad_norm": 12.459444999694824, + "learning_rate": 9.922381937445854e-06, + "loss": 0.4408, + "step": 272010 + }, + { + "epoch": 2.404745487013561, + "grad_norm": 0.9029993414878845, + "learning_rate": 9.920908549773984e-06, + "loss": 0.6048, + "step": 272020 + }, + { + "epoch": 2.4048338902738733, + "grad_norm": 3.146458864212036, + "learning_rate": 9.919435162102112e-06, + "loss": 0.3601, + "step": 272030 + }, + { + "epoch": 2.4049222935341854, + "grad_norm": 4.779111385345459, + "learning_rate": 9.917961774430242e-06, + "loss": 0.6246, + "step": 272040 + }, + { + "epoch": 2.405010696794498, + "grad_norm": 2.9467854499816895, + "learning_rate": 9.916488386758372e-06, + "loss": 0.5682, + "step": 272050 + }, + { + "epoch": 2.40509910005481, + "grad_norm": 4.590737819671631, + "learning_rate": 9.9150149990865e-06, + "loss": 0.5902, + "step": 272060 + }, + { + "epoch": 2.405187503315122, + "grad_norm": 2.312718629837036, + "learning_rate": 9.91354161141463e-06, + "loss": 0.6307, + "step": 272070 + }, + { + "epoch": 2.4052759065754343, + "grad_norm": 11.805683135986328, + "learning_rate": 9.912068223742759e-06, + "loss": 0.5315, + "step": 272080 + }, + { + "epoch": 2.405364309835747, + "grad_norm": 2.7962048053741455, + "learning_rate": 9.910594836070889e-06, + "loss": 0.5413, + "step": 272090 + }, + { + "epoch": 2.405452713096059, + "grad_norm": 4.95560359954834, + "learning_rate": 9.909121448399017e-06, + "loss": 0.6223, + "step": 272100 + }, + { + "epoch": 2.405541116356371, + "grad_norm": 17.27062225341797, + "learning_rate": 9.907648060727147e-06, + "loss": 0.5578, + "step": 272110 + }, + { + "epoch": 2.4056295196166833, + "grad_norm": 1.3969810009002686, + "learning_rate": 9.906174673055276e-06, + "loss": 0.5929, + "step": 272120 + }, + { + "epoch": 2.405717922876996, + "grad_norm": 7.018454074859619, + "learning_rate": 9.904701285383406e-06, + "loss": 0.4559, + "step": 272130 + }, + { + "epoch": 2.405806326137308, + "grad_norm": 5.957291126251221, + "learning_rate": 9.903227897711536e-06, + "loss": 0.595, + "step": 272140 + }, + { + "epoch": 2.40589472939762, + "grad_norm": 9.577057838439941, + "learning_rate": 9.901754510039664e-06, + "loss": 0.4438, + "step": 272150 + }, + { + "epoch": 2.4059831326579326, + "grad_norm": 1.6927099227905273, + "learning_rate": 9.900281122367794e-06, + "loss": 0.4839, + "step": 272160 + }, + { + "epoch": 2.4060715359182447, + "grad_norm": 1.5683380365371704, + "learning_rate": 9.898807734695923e-06, + "loss": 0.51, + "step": 272170 + }, + { + "epoch": 2.406159939178557, + "grad_norm": 1.924176812171936, + "learning_rate": 9.897334347024053e-06, + "loss": 0.459, + "step": 272180 + }, + { + "epoch": 2.406248342438869, + "grad_norm": 4.766420841217041, + "learning_rate": 9.895860959352181e-06, + "loss": 0.6669, + "step": 272190 + }, + { + "epoch": 2.4063367456991815, + "grad_norm": 2.691772937774658, + "learning_rate": 9.894387571680311e-06, + "loss": 0.5557, + "step": 272200 + }, + { + "epoch": 2.4064251489594937, + "grad_norm": 6.616300106048584, + "learning_rate": 9.89291418400844e-06, + "loss": 0.4881, + "step": 272210 + }, + { + "epoch": 2.406513552219806, + "grad_norm": 2.770777702331543, + "learning_rate": 9.89144079633657e-06, + "loss": 0.4424, + "step": 272220 + }, + { + "epoch": 2.4066019554801183, + "grad_norm": 1.1278291940689087, + "learning_rate": 9.8899674086647e-06, + "loss": 0.4521, + "step": 272230 + }, + { + "epoch": 2.4066903587404305, + "grad_norm": 1.4228655099868774, + "learning_rate": 9.888494020992828e-06, + "loss": 0.5732, + "step": 272240 + }, + { + "epoch": 2.4067787620007426, + "grad_norm": 9.771646499633789, + "learning_rate": 9.887020633320958e-06, + "loss": 0.4305, + "step": 272250 + }, + { + "epoch": 2.4068671652610547, + "grad_norm": 10.909440040588379, + "learning_rate": 9.885547245649086e-06, + "loss": 0.6082, + "step": 272260 + }, + { + "epoch": 2.406955568521367, + "grad_norm": 2.8904762268066406, + "learning_rate": 9.884073857977216e-06, + "loss": 0.4831, + "step": 272270 + }, + { + "epoch": 2.4070439717816794, + "grad_norm": 9.778183937072754, + "learning_rate": 9.882600470305345e-06, + "loss": 0.5642, + "step": 272280 + }, + { + "epoch": 2.4071323750419915, + "grad_norm": 10.268649101257324, + "learning_rate": 9.881127082633475e-06, + "loss": 0.5931, + "step": 272290 + }, + { + "epoch": 2.4072207783023036, + "grad_norm": 1.2844117879867554, + "learning_rate": 9.879653694961603e-06, + "loss": 0.5767, + "step": 272300 + }, + { + "epoch": 2.407309181562616, + "grad_norm": 1.688664436340332, + "learning_rate": 9.878180307289733e-06, + "loss": 0.6472, + "step": 272310 + }, + { + "epoch": 2.4073975848229283, + "grad_norm": 1.7121423482894897, + "learning_rate": 9.876706919617863e-06, + "loss": 0.6453, + "step": 272320 + }, + { + "epoch": 2.4074859880832404, + "grad_norm": 3.721167802810669, + "learning_rate": 9.875233531945991e-06, + "loss": 0.4995, + "step": 272330 + }, + { + "epoch": 2.4075743913435526, + "grad_norm": 4.000843524932861, + "learning_rate": 9.873760144274122e-06, + "loss": 0.4601, + "step": 272340 + }, + { + "epoch": 2.407662794603865, + "grad_norm": 3.733942747116089, + "learning_rate": 9.87228675660225e-06, + "loss": 0.5155, + "step": 272350 + }, + { + "epoch": 2.4077511978641772, + "grad_norm": 6.740336894989014, + "learning_rate": 9.87081336893038e-06, + "loss": 0.6768, + "step": 272360 + }, + { + "epoch": 2.4078396011244894, + "grad_norm": 10.299590110778809, + "learning_rate": 9.869339981258508e-06, + "loss": 0.4126, + "step": 272370 + }, + { + "epoch": 2.407928004384802, + "grad_norm": 9.492457389831543, + "learning_rate": 9.867866593586638e-06, + "loss": 0.5611, + "step": 272380 + }, + { + "epoch": 2.408016407645114, + "grad_norm": 2.8086373805999756, + "learning_rate": 9.866393205914767e-06, + "loss": 0.6884, + "step": 272390 + }, + { + "epoch": 2.408104810905426, + "grad_norm": 4.114748477935791, + "learning_rate": 9.864919818242897e-06, + "loss": 0.568, + "step": 272400 + }, + { + "epoch": 2.4081932141657383, + "grad_norm": 7.901828765869141, + "learning_rate": 9.863446430571027e-06, + "loss": 0.4939, + "step": 272410 + }, + { + "epoch": 2.408281617426051, + "grad_norm": 3.5253586769104004, + "learning_rate": 9.861973042899155e-06, + "loss": 0.5124, + "step": 272420 + }, + { + "epoch": 2.408370020686363, + "grad_norm": 2.900696277618408, + "learning_rate": 9.860499655227285e-06, + "loss": 0.6247, + "step": 272430 + }, + { + "epoch": 2.408458423946675, + "grad_norm": 2.1373016834259033, + "learning_rate": 9.859026267555414e-06, + "loss": 0.5264, + "step": 272440 + }, + { + "epoch": 2.408546827206987, + "grad_norm": 3.447903871536255, + "learning_rate": 9.857552879883544e-06, + "loss": 0.5491, + "step": 272450 + }, + { + "epoch": 2.4086352304672998, + "grad_norm": 3.1018877029418945, + "learning_rate": 9.856079492211674e-06, + "loss": 0.4263, + "step": 272460 + }, + { + "epoch": 2.408723633727612, + "grad_norm": 5.40636682510376, + "learning_rate": 9.854606104539802e-06, + "loss": 0.5862, + "step": 272470 + }, + { + "epoch": 2.408812036987924, + "grad_norm": 3.7513816356658936, + "learning_rate": 9.853132716867932e-06, + "loss": 0.6295, + "step": 272480 + }, + { + "epoch": 2.408900440248236, + "grad_norm": 3.7380807399749756, + "learning_rate": 9.851659329196062e-06, + "loss": 0.5668, + "step": 272490 + }, + { + "epoch": 2.4089888435085487, + "grad_norm": 10.218631744384766, + "learning_rate": 9.85018594152419e-06, + "loss": 0.5546, + "step": 272500 + }, + { + "epoch": 2.409077246768861, + "grad_norm": 1.8346728086471558, + "learning_rate": 9.84871255385232e-06, + "loss": 0.5814, + "step": 272510 + }, + { + "epoch": 2.409165650029173, + "grad_norm": 5.490819931030273, + "learning_rate": 9.84723916618045e-06, + "loss": 0.3789, + "step": 272520 + }, + { + "epoch": 2.4092540532894855, + "grad_norm": 2.8767004013061523, + "learning_rate": 9.845765778508579e-06, + "loss": 0.5435, + "step": 272530 + }, + { + "epoch": 2.4093424565497976, + "grad_norm": 3.174288511276245, + "learning_rate": 9.844292390836709e-06, + "loss": 0.6483, + "step": 272540 + }, + { + "epoch": 2.4094308598101097, + "grad_norm": 2.516206741333008, + "learning_rate": 9.842819003164837e-06, + "loss": 0.5612, + "step": 272550 + }, + { + "epoch": 2.409519263070422, + "grad_norm": 3.0741617679595947, + "learning_rate": 9.841345615492967e-06, + "loss": 0.5681, + "step": 272560 + }, + { + "epoch": 2.4096076663307344, + "grad_norm": 9.813926696777344, + "learning_rate": 9.839872227821096e-06, + "loss": 0.4596, + "step": 272570 + }, + { + "epoch": 2.4096960695910465, + "grad_norm": 2.2851955890655518, + "learning_rate": 9.838398840149226e-06, + "loss": 0.5468, + "step": 272580 + }, + { + "epoch": 2.4097844728513587, + "grad_norm": 1.9165594577789307, + "learning_rate": 9.836925452477354e-06, + "loss": 0.449, + "step": 272590 + }, + { + "epoch": 2.4098728761116712, + "grad_norm": 2.1104698181152344, + "learning_rate": 9.835452064805484e-06, + "loss": 0.5182, + "step": 272600 + }, + { + "epoch": 2.4099612793719833, + "grad_norm": 4.566133499145508, + "learning_rate": 9.833978677133614e-06, + "loss": 0.5382, + "step": 272610 + }, + { + "epoch": 2.4100496826322955, + "grad_norm": 2.5275843143463135, + "learning_rate": 9.832505289461743e-06, + "loss": 0.5566, + "step": 272620 + }, + { + "epoch": 2.4101380858926076, + "grad_norm": 2.761704921722412, + "learning_rate": 9.831031901789873e-06, + "loss": 0.6162, + "step": 272630 + }, + { + "epoch": 2.41022648915292, + "grad_norm": 6.038506984710693, + "learning_rate": 9.829558514118001e-06, + "loss": 0.5356, + "step": 272640 + }, + { + "epoch": 2.4103148924132323, + "grad_norm": 3.559037208557129, + "learning_rate": 9.828085126446131e-06, + "loss": 0.5135, + "step": 272650 + }, + { + "epoch": 2.4104032956735444, + "grad_norm": 0.7312003374099731, + "learning_rate": 9.82661173877426e-06, + "loss": 0.4422, + "step": 272660 + }, + { + "epoch": 2.4104916989338565, + "grad_norm": 3.4069743156433105, + "learning_rate": 9.82513835110239e-06, + "loss": 0.4596, + "step": 272670 + }, + { + "epoch": 2.410580102194169, + "grad_norm": 33.18650436401367, + "learning_rate": 9.823664963430518e-06, + "loss": 0.5591, + "step": 272680 + }, + { + "epoch": 2.410668505454481, + "grad_norm": 3.0791397094726562, + "learning_rate": 9.822191575758648e-06, + "loss": 0.5292, + "step": 272690 + }, + { + "epoch": 2.4107569087147933, + "grad_norm": 1.5453648567199707, + "learning_rate": 9.820718188086778e-06, + "loss": 0.4832, + "step": 272700 + }, + { + "epoch": 2.4108453119751054, + "grad_norm": 2.436760663986206, + "learning_rate": 9.819244800414906e-06, + "loss": 0.7065, + "step": 272710 + }, + { + "epoch": 2.410933715235418, + "grad_norm": 3.1814160346984863, + "learning_rate": 9.817771412743036e-06, + "loss": 0.4513, + "step": 272720 + }, + { + "epoch": 2.41102211849573, + "grad_norm": 0.8832152485847473, + "learning_rate": 9.816298025071165e-06, + "loss": 0.5113, + "step": 272730 + }, + { + "epoch": 2.4111105217560422, + "grad_norm": 0.9359428882598877, + "learning_rate": 9.814824637399295e-06, + "loss": 0.5558, + "step": 272740 + }, + { + "epoch": 2.411198925016355, + "grad_norm": 1.2240461111068726, + "learning_rate": 9.813351249727423e-06, + "loss": 0.5465, + "step": 272750 + }, + { + "epoch": 2.411287328276667, + "grad_norm": 2.619905948638916, + "learning_rate": 9.811877862055553e-06, + "loss": 0.4937, + "step": 272760 + }, + { + "epoch": 2.411375731536979, + "grad_norm": 5.219773292541504, + "learning_rate": 9.810404474383681e-06, + "loss": 0.4807, + "step": 272770 + }, + { + "epoch": 2.411464134797291, + "grad_norm": 2.655771493911743, + "learning_rate": 9.808931086711811e-06, + "loss": 0.4505, + "step": 272780 + }, + { + "epoch": 2.4115525380576037, + "grad_norm": 3.6397838592529297, + "learning_rate": 9.807457699039942e-06, + "loss": 0.5224, + "step": 272790 + }, + { + "epoch": 2.411640941317916, + "grad_norm": 5.147067070007324, + "learning_rate": 9.80598431136807e-06, + "loss": 0.6767, + "step": 272800 + }, + { + "epoch": 2.411729344578228, + "grad_norm": 8.224040031433105, + "learning_rate": 9.8045109236962e-06, + "loss": 0.5384, + "step": 272810 + }, + { + "epoch": 2.4118177478385405, + "grad_norm": 1.1884547472000122, + "learning_rate": 9.803037536024328e-06, + "loss": 0.59, + "step": 272820 + }, + { + "epoch": 2.4119061510988526, + "grad_norm": 3.3857288360595703, + "learning_rate": 9.801564148352458e-06, + "loss": 0.4744, + "step": 272830 + }, + { + "epoch": 2.4119945543591648, + "grad_norm": 2.232840061187744, + "learning_rate": 9.800090760680587e-06, + "loss": 0.503, + "step": 272840 + }, + { + "epoch": 2.412082957619477, + "grad_norm": 1.6897424459457397, + "learning_rate": 9.798617373008717e-06, + "loss": 0.5326, + "step": 272850 + }, + { + "epoch": 2.412171360879789, + "grad_norm": 0.809542715549469, + "learning_rate": 9.797143985336845e-06, + "loss": 0.4236, + "step": 272860 + }, + { + "epoch": 2.4122597641401016, + "grad_norm": 7.579285144805908, + "learning_rate": 9.795670597664975e-06, + "loss": 0.4191, + "step": 272870 + }, + { + "epoch": 2.4123481674004137, + "grad_norm": 1.955091118812561, + "learning_rate": 9.794197209993105e-06, + "loss": 0.6126, + "step": 272880 + }, + { + "epoch": 2.412436570660726, + "grad_norm": 2.5681798458099365, + "learning_rate": 9.792723822321234e-06, + "loss": 0.618, + "step": 272890 + }, + { + "epoch": 2.4125249739210384, + "grad_norm": 2.167111396789551, + "learning_rate": 9.791250434649364e-06, + "loss": 0.518, + "step": 272900 + }, + { + "epoch": 2.4126133771813505, + "grad_norm": 1.5094304084777832, + "learning_rate": 9.789777046977492e-06, + "loss": 0.5335, + "step": 272910 + }, + { + "epoch": 2.4127017804416626, + "grad_norm": 3.1732146739959717, + "learning_rate": 9.788303659305622e-06, + "loss": 0.6107, + "step": 272920 + }, + { + "epoch": 2.4127901837019747, + "grad_norm": 2.2523739337921143, + "learning_rate": 9.786830271633752e-06, + "loss": 0.5884, + "step": 272930 + }, + { + "epoch": 2.4128785869622873, + "grad_norm": 1.341930866241455, + "learning_rate": 9.78535688396188e-06, + "loss": 0.4717, + "step": 272940 + }, + { + "epoch": 2.4129669902225994, + "grad_norm": 1.6904247999191284, + "learning_rate": 9.78388349629001e-06, + "loss": 0.4854, + "step": 272950 + }, + { + "epoch": 2.4130553934829115, + "grad_norm": 2.420478343963623, + "learning_rate": 9.78241010861814e-06, + "loss": 0.4552, + "step": 272960 + }, + { + "epoch": 2.413143796743224, + "grad_norm": 0.6735581159591675, + "learning_rate": 9.780936720946269e-06, + "loss": 0.4986, + "step": 272970 + }, + { + "epoch": 2.413232200003536, + "grad_norm": 3.854632616043091, + "learning_rate": 9.779463333274399e-06, + "loss": 0.5558, + "step": 272980 + }, + { + "epoch": 2.4133206032638483, + "grad_norm": 2.0315351486206055, + "learning_rate": 9.777989945602529e-06, + "loss": 0.5658, + "step": 272990 + }, + { + "epoch": 2.4134090065241605, + "grad_norm": 2.45462965965271, + "learning_rate": 9.776516557930657e-06, + "loss": 0.5774, + "step": 273000 + }, + { + "epoch": 2.413497409784473, + "grad_norm": 6.000032901763916, + "learning_rate": 9.775043170258787e-06, + "loss": 0.438, + "step": 273010 + }, + { + "epoch": 2.413585813044785, + "grad_norm": 2.2658371925354004, + "learning_rate": 9.773569782586916e-06, + "loss": 0.5284, + "step": 273020 + }, + { + "epoch": 2.4136742163050973, + "grad_norm": 3.0718185901641846, + "learning_rate": 9.772096394915046e-06, + "loss": 0.5378, + "step": 273030 + }, + { + "epoch": 2.4137626195654094, + "grad_norm": 2.440939426422119, + "learning_rate": 9.770623007243174e-06, + "loss": 0.4555, + "step": 273040 + }, + { + "epoch": 2.413851022825722, + "grad_norm": 12.628743171691895, + "learning_rate": 9.769149619571304e-06, + "loss": 0.5516, + "step": 273050 + }, + { + "epoch": 2.413939426086034, + "grad_norm": 10.182336807250977, + "learning_rate": 9.767676231899434e-06, + "loss": 0.549, + "step": 273060 + }, + { + "epoch": 2.414027829346346, + "grad_norm": 2.0704445838928223, + "learning_rate": 9.766202844227563e-06, + "loss": 0.4132, + "step": 273070 + }, + { + "epoch": 2.4141162326066583, + "grad_norm": 1.2147208452224731, + "learning_rate": 9.764729456555693e-06, + "loss": 0.6791, + "step": 273080 + }, + { + "epoch": 2.414204635866971, + "grad_norm": 1.5606064796447754, + "learning_rate": 9.763256068883821e-06, + "loss": 0.4444, + "step": 273090 + }, + { + "epoch": 2.414293039127283, + "grad_norm": 2.366584062576294, + "learning_rate": 9.761782681211951e-06, + "loss": 0.4951, + "step": 273100 + }, + { + "epoch": 2.414381442387595, + "grad_norm": 15.92019271850586, + "learning_rate": 9.76030929354008e-06, + "loss": 0.6475, + "step": 273110 + }, + { + "epoch": 2.4144698456479077, + "grad_norm": 6.672244548797607, + "learning_rate": 9.75883590586821e-06, + "loss": 0.5563, + "step": 273120 + }, + { + "epoch": 2.41455824890822, + "grad_norm": 3.1834802627563477, + "learning_rate": 9.757362518196338e-06, + "loss": 0.4663, + "step": 273130 + }, + { + "epoch": 2.414646652168532, + "grad_norm": 1.6511309146881104, + "learning_rate": 9.755889130524468e-06, + "loss": 0.3645, + "step": 273140 + }, + { + "epoch": 2.414735055428844, + "grad_norm": 1.9919216632843018, + "learning_rate": 9.754415742852596e-06, + "loss": 0.4702, + "step": 273150 + }, + { + "epoch": 2.4148234586891566, + "grad_norm": 1.393186092376709, + "learning_rate": 9.752942355180726e-06, + "loss": 0.5131, + "step": 273160 + }, + { + "epoch": 2.4149118619494687, + "grad_norm": 2.044095516204834, + "learning_rate": 9.751468967508856e-06, + "loss": 0.5417, + "step": 273170 + }, + { + "epoch": 2.415000265209781, + "grad_norm": 2.545093536376953, + "learning_rate": 9.749995579836985e-06, + "loss": 0.4591, + "step": 273180 + }, + { + "epoch": 2.4150886684700934, + "grad_norm": 2.147650957107544, + "learning_rate": 9.748522192165115e-06, + "loss": 0.5034, + "step": 273190 + }, + { + "epoch": 2.4151770717304055, + "grad_norm": 2.435088872909546, + "learning_rate": 9.747048804493243e-06, + "loss": 0.5673, + "step": 273200 + }, + { + "epoch": 2.4152654749907176, + "grad_norm": 2.7031285762786865, + "learning_rate": 9.745575416821373e-06, + "loss": 0.5024, + "step": 273210 + }, + { + "epoch": 2.4153538782510298, + "grad_norm": 26.57193946838379, + "learning_rate": 9.744102029149501e-06, + "loss": 0.4181, + "step": 273220 + }, + { + "epoch": 2.4154422815113423, + "grad_norm": 6.533260822296143, + "learning_rate": 9.742628641477631e-06, + "loss": 0.5361, + "step": 273230 + }, + { + "epoch": 2.4155306847716544, + "grad_norm": 7.906519412994385, + "learning_rate": 9.74115525380576e-06, + "loss": 0.564, + "step": 273240 + }, + { + "epoch": 2.4156190880319666, + "grad_norm": 5.095401763916016, + "learning_rate": 9.73968186613389e-06, + "loss": 0.5446, + "step": 273250 + }, + { + "epoch": 2.4157074912922787, + "grad_norm": 1.116929531097412, + "learning_rate": 9.73820847846202e-06, + "loss": 0.4199, + "step": 273260 + }, + { + "epoch": 2.4157958945525912, + "grad_norm": 4.61077880859375, + "learning_rate": 9.736735090790148e-06, + "loss": 0.4812, + "step": 273270 + }, + { + "epoch": 2.4158842978129034, + "grad_norm": 25.67048454284668, + "learning_rate": 9.735261703118278e-06, + "loss": 0.6545, + "step": 273280 + }, + { + "epoch": 2.4159727010732155, + "grad_norm": 1.2074404954910278, + "learning_rate": 9.733788315446407e-06, + "loss": 0.4577, + "step": 273290 + }, + { + "epoch": 2.4160611043335276, + "grad_norm": 8.640055656433105, + "learning_rate": 9.732314927774537e-06, + "loss": 0.502, + "step": 273300 + }, + { + "epoch": 2.41614950759384, + "grad_norm": 3.7312371730804443, + "learning_rate": 9.730841540102665e-06, + "loss": 0.5228, + "step": 273310 + }, + { + "epoch": 2.4162379108541523, + "grad_norm": 2.499600648880005, + "learning_rate": 9.729368152430795e-06, + "loss": 0.6091, + "step": 273320 + }, + { + "epoch": 2.4163263141144644, + "grad_norm": 3.4854743480682373, + "learning_rate": 9.727894764758923e-06, + "loss": 0.5391, + "step": 273330 + }, + { + "epoch": 2.416414717374777, + "grad_norm": 8.846802711486816, + "learning_rate": 9.726421377087053e-06, + "loss": 0.5605, + "step": 273340 + }, + { + "epoch": 2.416503120635089, + "grad_norm": 2.543076276779175, + "learning_rate": 9.724947989415184e-06, + "loss": 0.4674, + "step": 273350 + }, + { + "epoch": 2.416591523895401, + "grad_norm": 1.3799370527267456, + "learning_rate": 9.723474601743312e-06, + "loss": 0.4583, + "step": 273360 + }, + { + "epoch": 2.4166799271557133, + "grad_norm": 1.2838613986968994, + "learning_rate": 9.722001214071442e-06, + "loss": 0.4646, + "step": 273370 + }, + { + "epoch": 2.416768330416026, + "grad_norm": 7.580527305603027, + "learning_rate": 9.72052782639957e-06, + "loss": 0.6642, + "step": 273380 + }, + { + "epoch": 2.416856733676338, + "grad_norm": 12.6409912109375, + "learning_rate": 9.7190544387277e-06, + "loss": 0.5418, + "step": 273390 + }, + { + "epoch": 2.41694513693665, + "grad_norm": 3.713469982147217, + "learning_rate": 9.71758105105583e-06, + "loss": 0.6536, + "step": 273400 + }, + { + "epoch": 2.4170335401969627, + "grad_norm": 3.258639335632324, + "learning_rate": 9.716107663383959e-06, + "loss": 0.5841, + "step": 273410 + }, + { + "epoch": 2.417121943457275, + "grad_norm": 4.626204967498779, + "learning_rate": 9.714634275712089e-06, + "loss": 0.5814, + "step": 273420 + }, + { + "epoch": 2.417210346717587, + "grad_norm": 5.078734874725342, + "learning_rate": 9.713160888040219e-06, + "loss": 0.4921, + "step": 273430 + }, + { + "epoch": 2.417298749977899, + "grad_norm": 4.9207258224487305, + "learning_rate": 9.711687500368347e-06, + "loss": 0.5362, + "step": 273440 + }, + { + "epoch": 2.417387153238211, + "grad_norm": 1.752839207649231, + "learning_rate": 9.710214112696477e-06, + "loss": 0.5852, + "step": 273450 + }, + { + "epoch": 2.4174755564985237, + "grad_norm": 2.7418971061706543, + "learning_rate": 9.708740725024607e-06, + "loss": 0.4344, + "step": 273460 + }, + { + "epoch": 2.417563959758836, + "grad_norm": 3.407564640045166, + "learning_rate": 9.707267337352736e-06, + "loss": 0.5902, + "step": 273470 + }, + { + "epoch": 2.417652363019148, + "grad_norm": 1.1563420295715332, + "learning_rate": 9.705793949680866e-06, + "loss": 0.4941, + "step": 273480 + }, + { + "epoch": 2.4177407662794606, + "grad_norm": 2.876108169555664, + "learning_rate": 9.704320562008994e-06, + "loss": 0.6022, + "step": 273490 + }, + { + "epoch": 2.4178291695397727, + "grad_norm": 2.6516411304473877, + "learning_rate": 9.702847174337124e-06, + "loss": 0.4758, + "step": 273500 + }, + { + "epoch": 2.417917572800085, + "grad_norm": 3.197925567626953, + "learning_rate": 9.701373786665252e-06, + "loss": 0.536, + "step": 273510 + }, + { + "epoch": 2.418005976060397, + "grad_norm": 2.0400187969207764, + "learning_rate": 9.699900398993382e-06, + "loss": 0.532, + "step": 273520 + }, + { + "epoch": 2.4180943793207095, + "grad_norm": 4.167843818664551, + "learning_rate": 9.698427011321513e-06, + "loss": 0.5856, + "step": 273530 + }, + { + "epoch": 2.4181827825810216, + "grad_norm": 1.7051159143447876, + "learning_rate": 9.696953623649641e-06, + "loss": 0.5866, + "step": 273540 + }, + { + "epoch": 2.4182711858413337, + "grad_norm": 1.641491174697876, + "learning_rate": 9.695480235977771e-06, + "loss": 0.5106, + "step": 273550 + }, + { + "epoch": 2.4183595891016463, + "grad_norm": 9.266183853149414, + "learning_rate": 9.6940068483059e-06, + "loss": 0.6101, + "step": 273560 + }, + { + "epoch": 2.4184479923619584, + "grad_norm": 7.35838508605957, + "learning_rate": 9.69253346063403e-06, + "loss": 0.5367, + "step": 273570 + }, + { + "epoch": 2.4185363956222705, + "grad_norm": 7.926471710205078, + "learning_rate": 9.691060072962158e-06, + "loss": 0.4722, + "step": 273580 + }, + { + "epoch": 2.4186247988825826, + "grad_norm": 3.4108781814575195, + "learning_rate": 9.689586685290288e-06, + "loss": 0.572, + "step": 273590 + }, + { + "epoch": 2.418713202142895, + "grad_norm": 1.6841480731964111, + "learning_rate": 9.688113297618416e-06, + "loss": 0.5318, + "step": 273600 + }, + { + "epoch": 2.4188016054032073, + "grad_norm": 17.081989288330078, + "learning_rate": 9.686639909946546e-06, + "loss": 0.5999, + "step": 273610 + }, + { + "epoch": 2.4188900086635194, + "grad_norm": 11.655868530273438, + "learning_rate": 9.685166522274676e-06, + "loss": 0.6096, + "step": 273620 + }, + { + "epoch": 2.4189784119238316, + "grad_norm": 7.120723724365234, + "learning_rate": 9.683693134602805e-06, + "loss": 0.5624, + "step": 273630 + }, + { + "epoch": 2.419066815184144, + "grad_norm": 2.0725138187408447, + "learning_rate": 9.682219746930935e-06, + "loss": 0.4292, + "step": 273640 + }, + { + "epoch": 2.4191552184444562, + "grad_norm": 3.1960926055908203, + "learning_rate": 9.680746359259063e-06, + "loss": 0.5196, + "step": 273650 + }, + { + "epoch": 2.4192436217047684, + "grad_norm": 17.543848037719727, + "learning_rate": 9.679272971587193e-06, + "loss": 0.5048, + "step": 273660 + }, + { + "epoch": 2.4193320249650805, + "grad_norm": 4.966705799102783, + "learning_rate": 9.677799583915321e-06, + "loss": 0.4644, + "step": 273670 + }, + { + "epoch": 2.419420428225393, + "grad_norm": 2.985823631286621, + "learning_rate": 9.676326196243451e-06, + "loss": 0.571, + "step": 273680 + }, + { + "epoch": 2.419508831485705, + "grad_norm": 10.069761276245117, + "learning_rate": 9.67485280857158e-06, + "loss": 0.6081, + "step": 273690 + }, + { + "epoch": 2.4195972347460173, + "grad_norm": 6.268047332763672, + "learning_rate": 9.67337942089971e-06, + "loss": 0.5321, + "step": 273700 + }, + { + "epoch": 2.41968563800633, + "grad_norm": 3.0476996898651123, + "learning_rate": 9.671906033227838e-06, + "loss": 0.5757, + "step": 273710 + }, + { + "epoch": 2.419774041266642, + "grad_norm": 7.89447546005249, + "learning_rate": 9.670432645555968e-06, + "loss": 0.5313, + "step": 273720 + }, + { + "epoch": 2.419862444526954, + "grad_norm": 1.644480586051941, + "learning_rate": 9.668959257884098e-06, + "loss": 0.4422, + "step": 273730 + }, + { + "epoch": 2.419950847787266, + "grad_norm": 3.636242389678955, + "learning_rate": 9.667485870212227e-06, + "loss": 0.4547, + "step": 273740 + }, + { + "epoch": 2.4200392510475788, + "grad_norm": 7.814889907836914, + "learning_rate": 9.666012482540357e-06, + "loss": 0.3936, + "step": 273750 + }, + { + "epoch": 2.420127654307891, + "grad_norm": 0.7745106816291809, + "learning_rate": 9.664539094868485e-06, + "loss": 0.5223, + "step": 273760 + }, + { + "epoch": 2.420216057568203, + "grad_norm": 3.569542646408081, + "learning_rate": 9.663065707196615e-06, + "loss": 0.5831, + "step": 273770 + }, + { + "epoch": 2.4203044608285156, + "grad_norm": 1.3134101629257202, + "learning_rate": 9.661592319524743e-06, + "loss": 0.5627, + "step": 273780 + }, + { + "epoch": 2.4203928640888277, + "grad_norm": 2.7121479511260986, + "learning_rate": 9.660118931852873e-06, + "loss": 0.516, + "step": 273790 + }, + { + "epoch": 2.42048126734914, + "grad_norm": 2.615586996078491, + "learning_rate": 9.658645544181002e-06, + "loss": 0.4893, + "step": 273800 + }, + { + "epoch": 2.420569670609452, + "grad_norm": 0.47503337264060974, + "learning_rate": 9.657172156509132e-06, + "loss": 0.3791, + "step": 273810 + }, + { + "epoch": 2.4206580738697645, + "grad_norm": 2.186204671859741, + "learning_rate": 9.655698768837262e-06, + "loss": 0.5541, + "step": 273820 + }, + { + "epoch": 2.4207464771300766, + "grad_norm": 1.5587568283081055, + "learning_rate": 9.65422538116539e-06, + "loss": 0.543, + "step": 273830 + }, + { + "epoch": 2.4208348803903887, + "grad_norm": 2.74202823638916, + "learning_rate": 9.65275199349352e-06, + "loss": 0.5521, + "step": 273840 + }, + { + "epoch": 2.420923283650701, + "grad_norm": 4.577587127685547, + "learning_rate": 9.651278605821649e-06, + "loss": 0.5518, + "step": 273850 + }, + { + "epoch": 2.4210116869110134, + "grad_norm": 1.6315829753875732, + "learning_rate": 9.649805218149779e-06, + "loss": 0.5393, + "step": 273860 + }, + { + "epoch": 2.4211000901713255, + "grad_norm": 4.993988037109375, + "learning_rate": 9.648331830477909e-06, + "loss": 0.5315, + "step": 273870 + }, + { + "epoch": 2.4211884934316377, + "grad_norm": 3.278857469558716, + "learning_rate": 9.646858442806037e-06, + "loss": 0.6051, + "step": 273880 + }, + { + "epoch": 2.42127689669195, + "grad_norm": 1.9941222667694092, + "learning_rate": 9.645385055134167e-06, + "loss": 0.5448, + "step": 273890 + }, + { + "epoch": 2.4213652999522624, + "grad_norm": 5.928884983062744, + "learning_rate": 9.643911667462297e-06, + "loss": 0.5618, + "step": 273900 + }, + { + "epoch": 2.4214537032125745, + "grad_norm": 3.1210789680480957, + "learning_rate": 9.642438279790426e-06, + "loss": 0.4736, + "step": 273910 + }, + { + "epoch": 2.4215421064728866, + "grad_norm": 1.6536107063293457, + "learning_rate": 9.640964892118556e-06, + "loss": 0.5814, + "step": 273920 + }, + { + "epoch": 2.421630509733199, + "grad_norm": 2.1482691764831543, + "learning_rate": 9.639491504446686e-06, + "loss": 0.5033, + "step": 273930 + }, + { + "epoch": 2.4217189129935113, + "grad_norm": 1.955352783203125, + "learning_rate": 9.638018116774814e-06, + "loss": 0.621, + "step": 273940 + }, + { + "epoch": 2.4218073162538234, + "grad_norm": 1.5744571685791016, + "learning_rate": 9.636544729102944e-06, + "loss": 0.5394, + "step": 273950 + }, + { + "epoch": 2.4218957195141355, + "grad_norm": 8.022942543029785, + "learning_rate": 9.635071341431072e-06, + "loss": 0.4044, + "step": 273960 + }, + { + "epoch": 2.421984122774448, + "grad_norm": 6.0239787101745605, + "learning_rate": 9.633597953759202e-06, + "loss": 0.521, + "step": 273970 + }, + { + "epoch": 2.42207252603476, + "grad_norm": 3.286203384399414, + "learning_rate": 9.63212456608733e-06, + "loss": 0.4277, + "step": 273980 + }, + { + "epoch": 2.4221609292950723, + "grad_norm": 3.4836041927337646, + "learning_rate": 9.630651178415461e-06, + "loss": 0.6853, + "step": 273990 + }, + { + "epoch": 2.422249332555385, + "grad_norm": 11.060029983520508, + "learning_rate": 9.629177790743591e-06, + "loss": 0.5045, + "step": 274000 + }, + { + "epoch": 2.422337735815697, + "grad_norm": 8.782930374145508, + "learning_rate": 9.62770440307172e-06, + "loss": 0.5124, + "step": 274010 + }, + { + "epoch": 2.422426139076009, + "grad_norm": 15.665619850158691, + "learning_rate": 9.62623101539985e-06, + "loss": 0.4437, + "step": 274020 + }, + { + "epoch": 2.4225145423363212, + "grad_norm": 4.348913192749023, + "learning_rate": 9.624757627727978e-06, + "loss": 0.5517, + "step": 274030 + }, + { + "epoch": 2.4226029455966334, + "grad_norm": 5.2956719398498535, + "learning_rate": 9.623284240056108e-06, + "loss": 0.4427, + "step": 274040 + }, + { + "epoch": 2.422691348856946, + "grad_norm": 5.346617698669434, + "learning_rate": 9.621810852384236e-06, + "loss": 0.4717, + "step": 274050 + }, + { + "epoch": 2.422779752117258, + "grad_norm": 5.6556806564331055, + "learning_rate": 9.620337464712366e-06, + "loss": 0.3947, + "step": 274060 + }, + { + "epoch": 2.42286815537757, + "grad_norm": 2.938232660293579, + "learning_rate": 9.618864077040494e-06, + "loss": 0.5941, + "step": 274070 + }, + { + "epoch": 2.4229565586378827, + "grad_norm": 10.743523597717285, + "learning_rate": 9.617390689368625e-06, + "loss": 0.518, + "step": 274080 + }, + { + "epoch": 2.423044961898195, + "grad_norm": 2.49768328666687, + "learning_rate": 9.615917301696755e-06, + "loss": 0.5445, + "step": 274090 + }, + { + "epoch": 2.423133365158507, + "grad_norm": 5.208009243011475, + "learning_rate": 9.614443914024883e-06, + "loss": 0.592, + "step": 274100 + }, + { + "epoch": 2.423221768418819, + "grad_norm": 6.417023181915283, + "learning_rate": 9.612970526353013e-06, + "loss": 0.58, + "step": 274110 + }, + { + "epoch": 2.4233101716791317, + "grad_norm": 1.771368384361267, + "learning_rate": 9.611497138681141e-06, + "loss": 0.5309, + "step": 274120 + }, + { + "epoch": 2.4233985749394438, + "grad_norm": 2.1932241916656494, + "learning_rate": 9.610023751009271e-06, + "loss": 0.5531, + "step": 274130 + }, + { + "epoch": 2.423486978199756, + "grad_norm": 3.519335985183716, + "learning_rate": 9.6085503633374e-06, + "loss": 0.56, + "step": 274140 + }, + { + "epoch": 2.4235753814600685, + "grad_norm": 8.614008903503418, + "learning_rate": 9.60707697566553e-06, + "loss": 0.6022, + "step": 274150 + }, + { + "epoch": 2.4236637847203806, + "grad_norm": 3.8653314113616943, + "learning_rate": 9.605603587993658e-06, + "loss": 0.5261, + "step": 274160 + }, + { + "epoch": 2.4237521879806927, + "grad_norm": 1.0673904418945312, + "learning_rate": 9.604130200321788e-06, + "loss": 0.555, + "step": 274170 + }, + { + "epoch": 2.423840591241005, + "grad_norm": 33.81007385253906, + "learning_rate": 9.602656812649918e-06, + "loss": 0.5178, + "step": 274180 + }, + { + "epoch": 2.4239289945013174, + "grad_norm": 1.6326946020126343, + "learning_rate": 9.601183424978047e-06, + "loss": 0.4938, + "step": 274190 + }, + { + "epoch": 2.4240173977616295, + "grad_norm": 2.763127326965332, + "learning_rate": 9.599710037306177e-06, + "loss": 0.5401, + "step": 274200 + }, + { + "epoch": 2.4241058010219416, + "grad_norm": 6.555072784423828, + "learning_rate": 9.598236649634305e-06, + "loss": 0.5707, + "step": 274210 + }, + { + "epoch": 2.4241942042822537, + "grad_norm": 1.7792187929153442, + "learning_rate": 9.596763261962435e-06, + "loss": 0.5539, + "step": 274220 + }, + { + "epoch": 2.4242826075425663, + "grad_norm": 3.1833059787750244, + "learning_rate": 9.595289874290563e-06, + "loss": 0.4381, + "step": 274230 + }, + { + "epoch": 2.4243710108028784, + "grad_norm": 3.8469159603118896, + "learning_rate": 9.593816486618693e-06, + "loss": 0.4832, + "step": 274240 + }, + { + "epoch": 2.4244594140631905, + "grad_norm": 3.511960029602051, + "learning_rate": 9.592343098946822e-06, + "loss": 0.5485, + "step": 274250 + }, + { + "epoch": 2.4245478173235027, + "grad_norm": 2.8462107181549072, + "learning_rate": 9.590869711274952e-06, + "loss": 0.4738, + "step": 274260 + }, + { + "epoch": 2.4246362205838152, + "grad_norm": 2.352555751800537, + "learning_rate": 9.58939632360308e-06, + "loss": 0.5013, + "step": 274270 + }, + { + "epoch": 2.4247246238441273, + "grad_norm": 2.632732629776001, + "learning_rate": 9.58792293593121e-06, + "loss": 0.5403, + "step": 274280 + }, + { + "epoch": 2.4248130271044395, + "grad_norm": 4.696986198425293, + "learning_rate": 9.58644954825934e-06, + "loss": 0.6109, + "step": 274290 + }, + { + "epoch": 2.424901430364752, + "grad_norm": 2.784381866455078, + "learning_rate": 9.584976160587469e-06, + "loss": 0.5928, + "step": 274300 + }, + { + "epoch": 2.424989833625064, + "grad_norm": 6.12005090713501, + "learning_rate": 9.583502772915599e-06, + "loss": 0.584, + "step": 274310 + }, + { + "epoch": 2.4250782368853763, + "grad_norm": 2.422222852706909, + "learning_rate": 9.582029385243729e-06, + "loss": 0.5022, + "step": 274320 + }, + { + "epoch": 2.4251666401456884, + "grad_norm": 4.038949489593506, + "learning_rate": 9.580555997571857e-06, + "loss": 0.5847, + "step": 274330 + }, + { + "epoch": 2.425255043406001, + "grad_norm": 6.234360694885254, + "learning_rate": 9.579082609899987e-06, + "loss": 0.5619, + "step": 274340 + }, + { + "epoch": 2.425343446666313, + "grad_norm": 2.0728347301483154, + "learning_rate": 9.577609222228117e-06, + "loss": 0.5547, + "step": 274350 + }, + { + "epoch": 2.425431849926625, + "grad_norm": 1.4261667728424072, + "learning_rate": 9.576135834556246e-06, + "loss": 0.4716, + "step": 274360 + }, + { + "epoch": 2.4255202531869378, + "grad_norm": 15.780448913574219, + "learning_rate": 9.574662446884376e-06, + "loss": 0.5606, + "step": 274370 + }, + { + "epoch": 2.42560865644725, + "grad_norm": 5.384955883026123, + "learning_rate": 9.573189059212506e-06, + "loss": 0.5514, + "step": 274380 + }, + { + "epoch": 2.425697059707562, + "grad_norm": 1.0565736293792725, + "learning_rate": 9.571715671540634e-06, + "loss": 0.4658, + "step": 274390 + }, + { + "epoch": 2.425785462967874, + "grad_norm": 1.656499981880188, + "learning_rate": 9.570242283868764e-06, + "loss": 0.518, + "step": 274400 + }, + { + "epoch": 2.4258738662281867, + "grad_norm": 3.8356378078460693, + "learning_rate": 9.568768896196892e-06, + "loss": 0.6722, + "step": 274410 + }, + { + "epoch": 2.425962269488499, + "grad_norm": 13.769882202148438, + "learning_rate": 9.567295508525022e-06, + "loss": 0.4508, + "step": 274420 + }, + { + "epoch": 2.426050672748811, + "grad_norm": 4.419256210327148, + "learning_rate": 9.56582212085315e-06, + "loss": 0.6168, + "step": 274430 + }, + { + "epoch": 2.426139076009123, + "grad_norm": 8.246675491333008, + "learning_rate": 9.564348733181281e-06, + "loss": 0.6318, + "step": 274440 + }, + { + "epoch": 2.4262274792694356, + "grad_norm": 3.1597399711608887, + "learning_rate": 9.56287534550941e-06, + "loss": 0.3949, + "step": 274450 + }, + { + "epoch": 2.4263158825297477, + "grad_norm": 3.2368342876434326, + "learning_rate": 9.56140195783754e-06, + "loss": 0.6323, + "step": 274460 + }, + { + "epoch": 2.42640428579006, + "grad_norm": 5.149718284606934, + "learning_rate": 9.55992857016567e-06, + "loss": 0.4158, + "step": 274470 + }, + { + "epoch": 2.426492689050372, + "grad_norm": 2.0629663467407227, + "learning_rate": 9.558455182493798e-06, + "loss": 0.5916, + "step": 274480 + }, + { + "epoch": 2.4265810923106845, + "grad_norm": 9.234391212463379, + "learning_rate": 9.556981794821928e-06, + "loss": 0.5843, + "step": 274490 + }, + { + "epoch": 2.4266694955709966, + "grad_norm": 4.267177104949951, + "learning_rate": 9.555508407150056e-06, + "loss": 0.5641, + "step": 274500 + }, + { + "epoch": 2.4267578988313088, + "grad_norm": 11.680285453796387, + "learning_rate": 9.554035019478186e-06, + "loss": 0.4161, + "step": 274510 + }, + { + "epoch": 2.4268463020916213, + "grad_norm": 5.7796220779418945, + "learning_rate": 9.552561631806314e-06, + "loss": 0.5693, + "step": 274520 + }, + { + "epoch": 2.4269347053519335, + "grad_norm": 13.492436408996582, + "learning_rate": 9.551088244134445e-06, + "loss": 0.5853, + "step": 274530 + }, + { + "epoch": 2.4270231086122456, + "grad_norm": 3.039944887161255, + "learning_rate": 9.549614856462573e-06, + "loss": 0.6383, + "step": 274540 + }, + { + "epoch": 2.4271115118725577, + "grad_norm": 2.206707000732422, + "learning_rate": 9.548141468790703e-06, + "loss": 0.5826, + "step": 274550 + }, + { + "epoch": 2.4271999151328703, + "grad_norm": 2.218820571899414, + "learning_rate": 9.546668081118833e-06, + "loss": 0.4275, + "step": 274560 + }, + { + "epoch": 2.4272883183931824, + "grad_norm": 2.4991564750671387, + "learning_rate": 9.545194693446961e-06, + "loss": 0.487, + "step": 274570 + }, + { + "epoch": 2.4273767216534945, + "grad_norm": 3.583858013153076, + "learning_rate": 9.543721305775091e-06, + "loss": 0.5308, + "step": 274580 + }, + { + "epoch": 2.427465124913807, + "grad_norm": 2.3332507610321045, + "learning_rate": 9.54224791810322e-06, + "loss": 0.4649, + "step": 274590 + }, + { + "epoch": 2.427553528174119, + "grad_norm": 3.360567569732666, + "learning_rate": 9.54077453043135e-06, + "loss": 0.5294, + "step": 274600 + }, + { + "epoch": 2.4276419314344313, + "grad_norm": 4.162355899810791, + "learning_rate": 9.539301142759478e-06, + "loss": 0.5348, + "step": 274610 + }, + { + "epoch": 2.4277303346947434, + "grad_norm": 1.7762751579284668, + "learning_rate": 9.537827755087608e-06, + "loss": 0.5205, + "step": 274620 + }, + { + "epoch": 2.4278187379550555, + "grad_norm": 2.9642934799194336, + "learning_rate": 9.536354367415737e-06, + "loss": 0.5664, + "step": 274630 + }, + { + "epoch": 2.427907141215368, + "grad_norm": 12.983196258544922, + "learning_rate": 9.534880979743867e-06, + "loss": 0.5274, + "step": 274640 + }, + { + "epoch": 2.4279955444756802, + "grad_norm": 1.9089516401290894, + "learning_rate": 9.533407592071997e-06, + "loss": 0.4948, + "step": 274650 + }, + { + "epoch": 2.4280839477359923, + "grad_norm": 2.0614426136016846, + "learning_rate": 9.531934204400125e-06, + "loss": 0.4724, + "step": 274660 + }, + { + "epoch": 2.428172350996305, + "grad_norm": 3.1169776916503906, + "learning_rate": 9.530460816728255e-06, + "loss": 0.4962, + "step": 274670 + }, + { + "epoch": 2.428260754256617, + "grad_norm": 3.4747650623321533, + "learning_rate": 9.528987429056383e-06, + "loss": 0.4371, + "step": 274680 + }, + { + "epoch": 2.428349157516929, + "grad_norm": 3.701220750808716, + "learning_rate": 9.527514041384513e-06, + "loss": 0.5619, + "step": 274690 + }, + { + "epoch": 2.4284375607772413, + "grad_norm": 3.1220686435699463, + "learning_rate": 9.526040653712642e-06, + "loss": 0.6243, + "step": 274700 + }, + { + "epoch": 2.428525964037554, + "grad_norm": 2.888411045074463, + "learning_rate": 9.524567266040772e-06, + "loss": 0.4963, + "step": 274710 + }, + { + "epoch": 2.428614367297866, + "grad_norm": 1.9905142784118652, + "learning_rate": 9.5230938783689e-06, + "loss": 0.4234, + "step": 274720 + }, + { + "epoch": 2.428702770558178, + "grad_norm": 3.5030405521392822, + "learning_rate": 9.52162049069703e-06, + "loss": 0.5122, + "step": 274730 + }, + { + "epoch": 2.4287911738184906, + "grad_norm": 3.4790432453155518, + "learning_rate": 9.520147103025159e-06, + "loss": 0.533, + "step": 274740 + }, + { + "epoch": 2.4288795770788028, + "grad_norm": 2.41606068611145, + "learning_rate": 9.518673715353289e-06, + "loss": 0.5088, + "step": 274750 + }, + { + "epoch": 2.428967980339115, + "grad_norm": 2.5328526496887207, + "learning_rate": 9.517200327681419e-06, + "loss": 0.7224, + "step": 274760 + }, + { + "epoch": 2.429056383599427, + "grad_norm": 2.4455597400665283, + "learning_rate": 9.515726940009547e-06, + "loss": 0.5615, + "step": 274770 + }, + { + "epoch": 2.4291447868597396, + "grad_norm": 2.0215494632720947, + "learning_rate": 9.514253552337677e-06, + "loss": 0.494, + "step": 274780 + }, + { + "epoch": 2.4292331901200517, + "grad_norm": 3.167290687561035, + "learning_rate": 9.512780164665807e-06, + "loss": 0.473, + "step": 274790 + }, + { + "epoch": 2.429321593380364, + "grad_norm": 1.3631892204284668, + "learning_rate": 9.511306776993935e-06, + "loss": 0.4853, + "step": 274800 + }, + { + "epoch": 2.429409996640676, + "grad_norm": 1.3453123569488525, + "learning_rate": 9.509833389322066e-06, + "loss": 0.5688, + "step": 274810 + }, + { + "epoch": 2.4294983999009885, + "grad_norm": 108.22843933105469, + "learning_rate": 9.508360001650196e-06, + "loss": 0.5362, + "step": 274820 + }, + { + "epoch": 2.4295868031613006, + "grad_norm": 1.9028550386428833, + "learning_rate": 9.506886613978324e-06, + "loss": 0.4113, + "step": 274830 + }, + { + "epoch": 2.4296752064216127, + "grad_norm": 0.909525990486145, + "learning_rate": 9.505413226306454e-06, + "loss": 0.5673, + "step": 274840 + }, + { + "epoch": 2.429763609681925, + "grad_norm": 2.3468515872955322, + "learning_rate": 9.503939838634584e-06, + "loss": 0.5162, + "step": 274850 + }, + { + "epoch": 2.4298520129422374, + "grad_norm": 3.1360085010528564, + "learning_rate": 9.502466450962712e-06, + "loss": 0.5302, + "step": 274860 + }, + { + "epoch": 2.4299404162025495, + "grad_norm": 0.8796341419219971, + "learning_rate": 9.500993063290842e-06, + "loss": 0.4247, + "step": 274870 + }, + { + "epoch": 2.4300288194628616, + "grad_norm": 1.2784000635147095, + "learning_rate": 9.49951967561897e-06, + "loss": 0.5224, + "step": 274880 + }, + { + "epoch": 2.430117222723174, + "grad_norm": 1.767113208770752, + "learning_rate": 9.4980462879471e-06, + "loss": 0.4712, + "step": 274890 + }, + { + "epoch": 2.4302056259834863, + "grad_norm": 3.322242259979248, + "learning_rate": 9.49657290027523e-06, + "loss": 0.4974, + "step": 274900 + }, + { + "epoch": 2.4302940292437984, + "grad_norm": 1.891144871711731, + "learning_rate": 9.49509951260336e-06, + "loss": 0.5539, + "step": 274910 + }, + { + "epoch": 2.4303824325041106, + "grad_norm": 2.4760639667510986, + "learning_rate": 9.493626124931488e-06, + "loss": 0.6049, + "step": 274920 + }, + { + "epoch": 2.430470835764423, + "grad_norm": 3.7245733737945557, + "learning_rate": 9.492152737259618e-06, + "loss": 0.6728, + "step": 274930 + }, + { + "epoch": 2.4305592390247353, + "grad_norm": 3.5492613315582275, + "learning_rate": 9.490679349587748e-06, + "loss": 0.4907, + "step": 274940 + }, + { + "epoch": 2.4306476422850474, + "grad_norm": 1.5347416400909424, + "learning_rate": 9.489205961915876e-06, + "loss": 0.4541, + "step": 274950 + }, + { + "epoch": 2.43073604554536, + "grad_norm": 3.249051094055176, + "learning_rate": 9.487732574244006e-06, + "loss": 0.552, + "step": 274960 + }, + { + "epoch": 2.430824448805672, + "grad_norm": 3.2552995681762695, + "learning_rate": 9.486259186572134e-06, + "loss": 0.5163, + "step": 274970 + }, + { + "epoch": 2.430912852065984, + "grad_norm": 6.5465922355651855, + "learning_rate": 9.484785798900264e-06, + "loss": 0.5757, + "step": 274980 + }, + { + "epoch": 2.4310012553262963, + "grad_norm": 3.186040163040161, + "learning_rate": 9.483312411228393e-06, + "loss": 0.5438, + "step": 274990 + }, + { + "epoch": 2.431089658586609, + "grad_norm": 2.1956894397735596, + "learning_rate": 9.481839023556523e-06, + "loss": 0.5049, + "step": 275000 + }, + { + "epoch": 2.431178061846921, + "grad_norm": 4.920975685119629, + "learning_rate": 9.480365635884651e-06, + "loss": 0.6868, + "step": 275010 + }, + { + "epoch": 2.431266465107233, + "grad_norm": 8.296487808227539, + "learning_rate": 9.478892248212781e-06, + "loss": 0.4707, + "step": 275020 + }, + { + "epoch": 2.431354868367545, + "grad_norm": 4.06593132019043, + "learning_rate": 9.477418860540911e-06, + "loss": 0.4533, + "step": 275030 + }, + { + "epoch": 2.431443271627858, + "grad_norm": 2.353095531463623, + "learning_rate": 9.47594547286904e-06, + "loss": 0.3776, + "step": 275040 + }, + { + "epoch": 2.43153167488817, + "grad_norm": 3.7956490516662598, + "learning_rate": 9.47447208519717e-06, + "loss": 0.4573, + "step": 275050 + }, + { + "epoch": 2.431620078148482, + "grad_norm": 7.946572303771973, + "learning_rate": 9.472998697525298e-06, + "loss": 0.5478, + "step": 275060 + }, + { + "epoch": 2.431708481408794, + "grad_norm": 4.614199638366699, + "learning_rate": 9.471525309853428e-06, + "loss": 0.3634, + "step": 275070 + }, + { + "epoch": 2.4317968846691067, + "grad_norm": 10.042034149169922, + "learning_rate": 9.470051922181556e-06, + "loss": 0.6874, + "step": 275080 + }, + { + "epoch": 2.431885287929419, + "grad_norm": 15.884081840515137, + "learning_rate": 9.468578534509687e-06, + "loss": 0.7178, + "step": 275090 + }, + { + "epoch": 2.431973691189731, + "grad_norm": 5.1295928955078125, + "learning_rate": 9.467105146837815e-06, + "loss": 0.4939, + "step": 275100 + }, + { + "epoch": 2.4320620944500435, + "grad_norm": 4.123824596405029, + "learning_rate": 9.465631759165945e-06, + "loss": 0.4654, + "step": 275110 + }, + { + "epoch": 2.4321504977103556, + "grad_norm": 1.7362514734268188, + "learning_rate": 9.464158371494075e-06, + "loss": 0.5815, + "step": 275120 + }, + { + "epoch": 2.4322389009706677, + "grad_norm": 1.3770450353622437, + "learning_rate": 9.462684983822203e-06, + "loss": 0.4647, + "step": 275130 + }, + { + "epoch": 2.43232730423098, + "grad_norm": 6.261582851409912, + "learning_rate": 9.461211596150333e-06, + "loss": 0.3775, + "step": 275140 + }, + { + "epoch": 2.4324157074912924, + "grad_norm": 1.517290472984314, + "learning_rate": 9.459738208478462e-06, + "loss": 0.5467, + "step": 275150 + }, + { + "epoch": 2.4325041107516046, + "grad_norm": 8.262085914611816, + "learning_rate": 9.458264820806592e-06, + "loss": 0.5234, + "step": 275160 + }, + { + "epoch": 2.4325925140119167, + "grad_norm": 1.7775261402130127, + "learning_rate": 9.45679143313472e-06, + "loss": 0.4867, + "step": 275170 + }, + { + "epoch": 2.4326809172722292, + "grad_norm": 8.84912395477295, + "learning_rate": 9.45531804546285e-06, + "loss": 0.53, + "step": 275180 + }, + { + "epoch": 2.4327693205325414, + "grad_norm": 11.21257495880127, + "learning_rate": 9.453844657790979e-06, + "loss": 0.516, + "step": 275190 + }, + { + "epoch": 2.4328577237928535, + "grad_norm": 6.543550968170166, + "learning_rate": 9.452371270119109e-06, + "loss": 0.5783, + "step": 275200 + }, + { + "epoch": 2.4329461270531656, + "grad_norm": 4.068567276000977, + "learning_rate": 9.450897882447239e-06, + "loss": 0.4387, + "step": 275210 + }, + { + "epoch": 2.4330345303134777, + "grad_norm": 4.725688934326172, + "learning_rate": 9.449424494775367e-06, + "loss": 0.7081, + "step": 275220 + }, + { + "epoch": 2.4331229335737903, + "grad_norm": 1.7915492057800293, + "learning_rate": 9.447951107103497e-06, + "loss": 0.5336, + "step": 275230 + }, + { + "epoch": 2.4332113368341024, + "grad_norm": 13.126238822937012, + "learning_rate": 9.446477719431625e-06, + "loss": 0.6103, + "step": 275240 + }, + { + "epoch": 2.4332997400944145, + "grad_norm": 2.726179361343384, + "learning_rate": 9.445004331759755e-06, + "loss": 0.4819, + "step": 275250 + }, + { + "epoch": 2.433388143354727, + "grad_norm": 1.7211021184921265, + "learning_rate": 9.443530944087885e-06, + "loss": 0.4749, + "step": 275260 + }, + { + "epoch": 2.433476546615039, + "grad_norm": 3.0714826583862305, + "learning_rate": 9.442057556416014e-06, + "loss": 0.6988, + "step": 275270 + }, + { + "epoch": 2.4335649498753513, + "grad_norm": 4.459081172943115, + "learning_rate": 9.440584168744144e-06, + "loss": 0.5787, + "step": 275280 + }, + { + "epoch": 2.4336533531356634, + "grad_norm": 4.705945014953613, + "learning_rate": 9.439110781072274e-06, + "loss": 0.4836, + "step": 275290 + }, + { + "epoch": 2.433741756395976, + "grad_norm": 3.2161948680877686, + "learning_rate": 9.437637393400402e-06, + "loss": 0.4899, + "step": 275300 + }, + { + "epoch": 2.433830159656288, + "grad_norm": 5.932947158813477, + "learning_rate": 9.436164005728532e-06, + "loss": 0.5575, + "step": 275310 + }, + { + "epoch": 2.4339185629166002, + "grad_norm": 1.9309208393096924, + "learning_rate": 9.434690618056662e-06, + "loss": 0.5311, + "step": 275320 + }, + { + "epoch": 2.434006966176913, + "grad_norm": 3.493452310562134, + "learning_rate": 9.43321723038479e-06, + "loss": 0.5372, + "step": 275330 + }, + { + "epoch": 2.434095369437225, + "grad_norm": 0.5759629011154175, + "learning_rate": 9.43174384271292e-06, + "loss": 0.5126, + "step": 275340 + }, + { + "epoch": 2.434183772697537, + "grad_norm": 6.624363899230957, + "learning_rate": 9.430270455041049e-06, + "loss": 0.5876, + "step": 275350 + }, + { + "epoch": 2.434272175957849, + "grad_norm": 1.2824883460998535, + "learning_rate": 9.42879706736918e-06, + "loss": 0.5066, + "step": 275360 + }, + { + "epoch": 2.4343605792181617, + "grad_norm": 6.870247840881348, + "learning_rate": 9.427323679697308e-06, + "loss": 0.5694, + "step": 275370 + }, + { + "epoch": 2.434448982478474, + "grad_norm": 6.681769847869873, + "learning_rate": 9.425850292025438e-06, + "loss": 0.4871, + "step": 275380 + }, + { + "epoch": 2.434537385738786, + "grad_norm": 1.6385177373886108, + "learning_rate": 9.424376904353566e-06, + "loss": 0.4318, + "step": 275390 + }, + { + "epoch": 2.434625788999098, + "grad_norm": 3.1085715293884277, + "learning_rate": 9.422903516681696e-06, + "loss": 0.3885, + "step": 275400 + }, + { + "epoch": 2.4347141922594107, + "grad_norm": 6.883274078369141, + "learning_rate": 9.421430129009826e-06, + "loss": 0.4966, + "step": 275410 + }, + { + "epoch": 2.434802595519723, + "grad_norm": 5.34896993637085, + "learning_rate": 9.419956741337954e-06, + "loss": 0.6071, + "step": 275420 + }, + { + "epoch": 2.434890998780035, + "grad_norm": 6.047584533691406, + "learning_rate": 9.418483353666084e-06, + "loss": 0.6047, + "step": 275430 + }, + { + "epoch": 2.434979402040347, + "grad_norm": 3.018876075744629, + "learning_rate": 9.417009965994213e-06, + "loss": 0.5223, + "step": 275440 + }, + { + "epoch": 2.4350678053006596, + "grad_norm": 3.50710129737854, + "learning_rate": 9.415536578322343e-06, + "loss": 0.5451, + "step": 275450 + }, + { + "epoch": 2.4351562085609717, + "grad_norm": 2.8582825660705566, + "learning_rate": 9.414063190650471e-06, + "loss": 0.6691, + "step": 275460 + }, + { + "epoch": 2.435244611821284, + "grad_norm": 2.903977155685425, + "learning_rate": 9.412589802978601e-06, + "loss": 0.5619, + "step": 275470 + }, + { + "epoch": 2.4353330150815964, + "grad_norm": 2.537665843963623, + "learning_rate": 9.41111641530673e-06, + "loss": 0.5245, + "step": 275480 + }, + { + "epoch": 2.4354214183419085, + "grad_norm": 4.153640270233154, + "learning_rate": 9.40964302763486e-06, + "loss": 0.4985, + "step": 275490 + }, + { + "epoch": 2.4355098216022206, + "grad_norm": 1.5298527479171753, + "learning_rate": 9.40816963996299e-06, + "loss": 0.465, + "step": 275500 + }, + { + "epoch": 2.4355982248625327, + "grad_norm": 3.4456586837768555, + "learning_rate": 9.406696252291118e-06, + "loss": 0.6096, + "step": 275510 + }, + { + "epoch": 2.4356866281228453, + "grad_norm": 3.2404286861419678, + "learning_rate": 9.405222864619248e-06, + "loss": 0.5325, + "step": 275520 + }, + { + "epoch": 2.4357750313831574, + "grad_norm": 5.25468635559082, + "learning_rate": 9.403749476947376e-06, + "loss": 0.5499, + "step": 275530 + }, + { + "epoch": 2.4358634346434695, + "grad_norm": 11.65117359161377, + "learning_rate": 9.402276089275507e-06, + "loss": 0.4166, + "step": 275540 + }, + { + "epoch": 2.435951837903782, + "grad_norm": 3.229675531387329, + "learning_rate": 9.400802701603635e-06, + "loss": 0.63, + "step": 275550 + }, + { + "epoch": 2.4360402411640942, + "grad_norm": 9.502424240112305, + "learning_rate": 9.399329313931765e-06, + "loss": 0.6546, + "step": 275560 + }, + { + "epoch": 2.4361286444244064, + "grad_norm": 4.837954521179199, + "learning_rate": 9.397855926259893e-06, + "loss": 0.5589, + "step": 275570 + }, + { + "epoch": 2.4362170476847185, + "grad_norm": 7.017982482910156, + "learning_rate": 9.396382538588023e-06, + "loss": 0.5309, + "step": 275580 + }, + { + "epoch": 2.436305450945031, + "grad_norm": 3.6404783725738525, + "learning_rate": 9.394909150916153e-06, + "loss": 0.5995, + "step": 275590 + }, + { + "epoch": 2.436393854205343, + "grad_norm": 1.9234553575515747, + "learning_rate": 9.393435763244282e-06, + "loss": 0.6037, + "step": 275600 + }, + { + "epoch": 2.4364822574656553, + "grad_norm": 2.044924736022949, + "learning_rate": 9.391962375572412e-06, + "loss": 0.423, + "step": 275610 + }, + { + "epoch": 2.4365706607259674, + "grad_norm": 14.83303451538086, + "learning_rate": 9.39048898790054e-06, + "loss": 0.4743, + "step": 275620 + }, + { + "epoch": 2.43665906398628, + "grad_norm": 3.4794130325317383, + "learning_rate": 9.38901560022867e-06, + "loss": 0.6393, + "step": 275630 + }, + { + "epoch": 2.436747467246592, + "grad_norm": 2.2462127208709717, + "learning_rate": 9.387542212556799e-06, + "loss": 0.6894, + "step": 275640 + }, + { + "epoch": 2.436835870506904, + "grad_norm": 1.4684219360351562, + "learning_rate": 9.386068824884929e-06, + "loss": 0.4926, + "step": 275650 + }, + { + "epoch": 2.4369242737672163, + "grad_norm": 4.5692830085754395, + "learning_rate": 9.384595437213057e-06, + "loss": 0.6574, + "step": 275660 + }, + { + "epoch": 2.437012677027529, + "grad_norm": 2.281569480895996, + "learning_rate": 9.383122049541187e-06, + "loss": 0.6209, + "step": 275670 + }, + { + "epoch": 2.437101080287841, + "grad_norm": 2.687037229537964, + "learning_rate": 9.381648661869317e-06, + "loss": 0.5172, + "step": 275680 + }, + { + "epoch": 2.437189483548153, + "grad_norm": 5.891260623931885, + "learning_rate": 9.380175274197445e-06, + "loss": 0.565, + "step": 275690 + }, + { + "epoch": 2.4372778868084657, + "grad_norm": 2.9725887775421143, + "learning_rate": 9.378701886525575e-06, + "loss": 0.4962, + "step": 275700 + }, + { + "epoch": 2.437366290068778, + "grad_norm": 3.57210636138916, + "learning_rate": 9.377228498853704e-06, + "loss": 0.4895, + "step": 275710 + }, + { + "epoch": 2.43745469332909, + "grad_norm": 6.164513111114502, + "learning_rate": 9.375755111181834e-06, + "loss": 0.5205, + "step": 275720 + }, + { + "epoch": 2.437543096589402, + "grad_norm": 2.112860679626465, + "learning_rate": 9.374281723509964e-06, + "loss": 0.6786, + "step": 275730 + }, + { + "epoch": 2.4376314998497146, + "grad_norm": 3.215009927749634, + "learning_rate": 9.372808335838092e-06, + "loss": 0.5426, + "step": 275740 + }, + { + "epoch": 2.4377199031100267, + "grad_norm": 5.732872486114502, + "learning_rate": 9.371334948166222e-06, + "loss": 0.6748, + "step": 275750 + }, + { + "epoch": 2.437808306370339, + "grad_norm": 12.513802528381348, + "learning_rate": 9.369861560494352e-06, + "loss": 0.6112, + "step": 275760 + }, + { + "epoch": 2.4378967096306514, + "grad_norm": 4.313749313354492, + "learning_rate": 9.36838817282248e-06, + "loss": 0.4677, + "step": 275770 + }, + { + "epoch": 2.4379851128909635, + "grad_norm": 13.271227836608887, + "learning_rate": 9.36691478515061e-06, + "loss": 0.5239, + "step": 275780 + }, + { + "epoch": 2.4380735161512757, + "grad_norm": 2.6610186100006104, + "learning_rate": 9.36544139747874e-06, + "loss": 0.5016, + "step": 275790 + }, + { + "epoch": 2.4381619194115878, + "grad_norm": 2.6465847492218018, + "learning_rate": 9.363968009806869e-06, + "loss": 0.4622, + "step": 275800 + }, + { + "epoch": 2.4382503226719, + "grad_norm": 3.0981929302215576, + "learning_rate": 9.362494622135e-06, + "loss": 0.4171, + "step": 275810 + }, + { + "epoch": 2.4383387259322125, + "grad_norm": 4.330734729766846, + "learning_rate": 9.361021234463128e-06, + "loss": 0.6913, + "step": 275820 + }, + { + "epoch": 2.4384271291925246, + "grad_norm": 3.3819305896759033, + "learning_rate": 9.359547846791258e-06, + "loss": 0.5143, + "step": 275830 + }, + { + "epoch": 2.4385155324528367, + "grad_norm": 2.9043498039245605, + "learning_rate": 9.358074459119386e-06, + "loss": 0.5494, + "step": 275840 + }, + { + "epoch": 2.4386039357131493, + "grad_norm": 4.12640905380249, + "learning_rate": 9.356601071447516e-06, + "loss": 0.5121, + "step": 275850 + }, + { + "epoch": 2.4386923389734614, + "grad_norm": 1.394702434539795, + "learning_rate": 9.355127683775644e-06, + "loss": 0.5498, + "step": 275860 + }, + { + "epoch": 2.4387807422337735, + "grad_norm": 3.507603883743286, + "learning_rate": 9.353654296103774e-06, + "loss": 0.5236, + "step": 275870 + }, + { + "epoch": 2.4388691454940856, + "grad_norm": 3.84596586227417, + "learning_rate": 9.352180908431904e-06, + "loss": 0.4508, + "step": 275880 + }, + { + "epoch": 2.438957548754398, + "grad_norm": 2.5614352226257324, + "learning_rate": 9.350707520760033e-06, + "loss": 0.5657, + "step": 275890 + }, + { + "epoch": 2.4390459520147103, + "grad_norm": 5.441175937652588, + "learning_rate": 9.349234133088163e-06, + "loss": 0.5296, + "step": 275900 + }, + { + "epoch": 2.4391343552750224, + "grad_norm": 2.1744983196258545, + "learning_rate": 9.347760745416291e-06, + "loss": 0.4747, + "step": 275910 + }, + { + "epoch": 2.439222758535335, + "grad_norm": 0.952954113483429, + "learning_rate": 9.346287357744421e-06, + "loss": 0.5099, + "step": 275920 + }, + { + "epoch": 2.439311161795647, + "grad_norm": 1.4656267166137695, + "learning_rate": 9.34481397007255e-06, + "loss": 0.6224, + "step": 275930 + }, + { + "epoch": 2.4393995650559592, + "grad_norm": 4.055540084838867, + "learning_rate": 9.34334058240068e-06, + "loss": 0.5276, + "step": 275940 + }, + { + "epoch": 2.4394879683162713, + "grad_norm": 1.743790626525879, + "learning_rate": 9.341867194728808e-06, + "loss": 0.4514, + "step": 275950 + }, + { + "epoch": 2.439576371576584, + "grad_norm": 8.91683292388916, + "learning_rate": 9.340393807056938e-06, + "loss": 0.4492, + "step": 275960 + }, + { + "epoch": 2.439664774836896, + "grad_norm": 5.189273834228516, + "learning_rate": 9.338920419385068e-06, + "loss": 0.402, + "step": 275970 + }, + { + "epoch": 2.439753178097208, + "grad_norm": 1.9415736198425293, + "learning_rate": 9.337447031713196e-06, + "loss": 0.5437, + "step": 275980 + }, + { + "epoch": 2.4398415813575203, + "grad_norm": 3.826516628265381, + "learning_rate": 9.335973644041326e-06, + "loss": 0.4839, + "step": 275990 + }, + { + "epoch": 2.439929984617833, + "grad_norm": 32.81159973144531, + "learning_rate": 9.334500256369455e-06, + "loss": 0.5599, + "step": 276000 + }, + { + "epoch": 2.440018387878145, + "grad_norm": 0.7898046374320984, + "learning_rate": 9.333026868697585e-06, + "loss": 0.5699, + "step": 276010 + }, + { + "epoch": 2.440106791138457, + "grad_norm": 11.163710594177246, + "learning_rate": 9.331553481025713e-06, + "loss": 0.5149, + "step": 276020 + }, + { + "epoch": 2.440195194398769, + "grad_norm": 2.5160610675811768, + "learning_rate": 9.330080093353843e-06, + "loss": 0.4834, + "step": 276030 + }, + { + "epoch": 2.4402835976590818, + "grad_norm": 4.776188850402832, + "learning_rate": 9.328606705681972e-06, + "loss": 0.5511, + "step": 276040 + }, + { + "epoch": 2.440372000919394, + "grad_norm": 4.214691638946533, + "learning_rate": 9.327133318010102e-06, + "loss": 0.5482, + "step": 276050 + }, + { + "epoch": 2.440460404179706, + "grad_norm": 3.4533531665802, + "learning_rate": 9.325659930338232e-06, + "loss": 0.7192, + "step": 276060 + }, + { + "epoch": 2.4405488074400186, + "grad_norm": 16.397098541259766, + "learning_rate": 9.32418654266636e-06, + "loss": 0.4739, + "step": 276070 + }, + { + "epoch": 2.4406372107003307, + "grad_norm": 10.143166542053223, + "learning_rate": 9.32271315499449e-06, + "loss": 0.4569, + "step": 276080 + }, + { + "epoch": 2.440725613960643, + "grad_norm": 1.5317139625549316, + "learning_rate": 9.321239767322618e-06, + "loss": 0.675, + "step": 276090 + }, + { + "epoch": 2.440814017220955, + "grad_norm": 6.358989715576172, + "learning_rate": 9.319766379650749e-06, + "loss": 0.508, + "step": 276100 + }, + { + "epoch": 2.4409024204812675, + "grad_norm": 1.321579098701477, + "learning_rate": 9.318292991978877e-06, + "loss": 0.5081, + "step": 276110 + }, + { + "epoch": 2.4409908237415796, + "grad_norm": 15.484801292419434, + "learning_rate": 9.316819604307007e-06, + "loss": 0.6012, + "step": 276120 + }, + { + "epoch": 2.4410792270018917, + "grad_norm": 2.7216854095458984, + "learning_rate": 9.315346216635135e-06, + "loss": 0.5291, + "step": 276130 + }, + { + "epoch": 2.4411676302622043, + "grad_norm": 2.382138967514038, + "learning_rate": 9.313872828963265e-06, + "loss": 0.7058, + "step": 276140 + }, + { + "epoch": 2.4412560335225164, + "grad_norm": 2.3453261852264404, + "learning_rate": 9.312399441291395e-06, + "loss": 0.5046, + "step": 276150 + }, + { + "epoch": 2.4413444367828285, + "grad_norm": 4.968954563140869, + "learning_rate": 9.310926053619524e-06, + "loss": 0.5988, + "step": 276160 + }, + { + "epoch": 2.4414328400431407, + "grad_norm": 20.372310638427734, + "learning_rate": 9.309452665947654e-06, + "loss": 0.4568, + "step": 276170 + }, + { + "epoch": 2.441521243303453, + "grad_norm": 3.5111563205718994, + "learning_rate": 9.307979278275782e-06, + "loss": 0.4642, + "step": 276180 + }, + { + "epoch": 2.4416096465637653, + "grad_norm": 4.325994491577148, + "learning_rate": 9.306505890603912e-06, + "loss": 0.6743, + "step": 276190 + }, + { + "epoch": 2.4416980498240775, + "grad_norm": 6.010775566101074, + "learning_rate": 9.305032502932042e-06, + "loss": 0.4631, + "step": 276200 + }, + { + "epoch": 2.4417864530843896, + "grad_norm": 1.2930092811584473, + "learning_rate": 9.30355911526017e-06, + "loss": 0.4662, + "step": 276210 + }, + { + "epoch": 2.441874856344702, + "grad_norm": 9.439176559448242, + "learning_rate": 9.3020857275883e-06, + "loss": 0.4176, + "step": 276220 + }, + { + "epoch": 2.4419632596050143, + "grad_norm": 15.269810676574707, + "learning_rate": 9.30061233991643e-06, + "loss": 0.5053, + "step": 276230 + }, + { + "epoch": 2.4420516628653264, + "grad_norm": 1.8706769943237305, + "learning_rate": 9.299138952244559e-06, + "loss": 0.4778, + "step": 276240 + }, + { + "epoch": 2.4421400661256385, + "grad_norm": 3.9606001377105713, + "learning_rate": 9.297665564572689e-06, + "loss": 0.5391, + "step": 276250 + }, + { + "epoch": 2.442228469385951, + "grad_norm": 1.8553210496902466, + "learning_rate": 9.296192176900819e-06, + "loss": 0.5806, + "step": 276260 + }, + { + "epoch": 2.442316872646263, + "grad_norm": 4.870859622955322, + "learning_rate": 9.294718789228947e-06, + "loss": 0.5733, + "step": 276270 + }, + { + "epoch": 2.4424052759065753, + "grad_norm": 1.4659407138824463, + "learning_rate": 9.293245401557078e-06, + "loss": 0.4872, + "step": 276280 + }, + { + "epoch": 2.442493679166888, + "grad_norm": 2.1185669898986816, + "learning_rate": 9.291772013885206e-06, + "loss": 0.4693, + "step": 276290 + }, + { + "epoch": 2.4425820824272, + "grad_norm": 6.1047258377075195, + "learning_rate": 9.290298626213336e-06, + "loss": 0.5677, + "step": 276300 + }, + { + "epoch": 2.442670485687512, + "grad_norm": 4.8709797859191895, + "learning_rate": 9.288825238541464e-06, + "loss": 0.5725, + "step": 276310 + }, + { + "epoch": 2.4427588889478242, + "grad_norm": 10.027135848999023, + "learning_rate": 9.287351850869594e-06, + "loss": 0.5265, + "step": 276320 + }, + { + "epoch": 2.442847292208137, + "grad_norm": 2.8124148845672607, + "learning_rate": 9.285878463197724e-06, + "loss": 0.3692, + "step": 276330 + }, + { + "epoch": 2.442935695468449, + "grad_norm": 5.537686824798584, + "learning_rate": 9.284405075525853e-06, + "loss": 0.4853, + "step": 276340 + }, + { + "epoch": 2.443024098728761, + "grad_norm": 0.994949996471405, + "learning_rate": 9.282931687853983e-06, + "loss": 0.3603, + "step": 276350 + }, + { + "epoch": 2.4431125019890736, + "grad_norm": 1.1316746473312378, + "learning_rate": 9.281458300182111e-06, + "loss": 0.5028, + "step": 276360 + }, + { + "epoch": 2.4432009052493857, + "grad_norm": 2.7985599040985107, + "learning_rate": 9.279984912510241e-06, + "loss": 0.6345, + "step": 276370 + }, + { + "epoch": 2.443289308509698, + "grad_norm": 2.3769326210021973, + "learning_rate": 9.27851152483837e-06, + "loss": 0.4947, + "step": 276380 + }, + { + "epoch": 2.44337771177001, + "grad_norm": 4.077784061431885, + "learning_rate": 9.2770381371665e-06, + "loss": 0.5504, + "step": 276390 + }, + { + "epoch": 2.443466115030322, + "grad_norm": 6.154041290283203, + "learning_rate": 9.275564749494628e-06, + "loss": 0.5288, + "step": 276400 + }, + { + "epoch": 2.4435545182906346, + "grad_norm": 6.90071439743042, + "learning_rate": 9.274091361822758e-06, + "loss": 0.5085, + "step": 276410 + }, + { + "epoch": 2.4436429215509468, + "grad_norm": 3.236362934112549, + "learning_rate": 9.272617974150886e-06, + "loss": 0.518, + "step": 276420 + }, + { + "epoch": 2.443731324811259, + "grad_norm": 13.265640258789062, + "learning_rate": 9.271144586479016e-06, + "loss": 0.5144, + "step": 276430 + }, + { + "epoch": 2.4438197280715714, + "grad_norm": 1.7951140403747559, + "learning_rate": 9.269671198807146e-06, + "loss": 0.5738, + "step": 276440 + }, + { + "epoch": 2.4439081313318836, + "grad_norm": 16.34493064880371, + "learning_rate": 9.268197811135275e-06, + "loss": 0.5712, + "step": 276450 + }, + { + "epoch": 2.4439965345921957, + "grad_norm": 2.778292179107666, + "learning_rate": 9.266724423463405e-06, + "loss": 0.4463, + "step": 276460 + }, + { + "epoch": 2.444084937852508, + "grad_norm": 6.708696365356445, + "learning_rate": 9.265251035791533e-06, + "loss": 0.4353, + "step": 276470 + }, + { + "epoch": 2.4441733411128204, + "grad_norm": 1.8605765104293823, + "learning_rate": 9.263777648119663e-06, + "loss": 0.4422, + "step": 276480 + }, + { + "epoch": 2.4442617443731325, + "grad_norm": 10.61630916595459, + "learning_rate": 9.262304260447792e-06, + "loss": 0.6056, + "step": 276490 + }, + { + "epoch": 2.4443501476334446, + "grad_norm": 1.6700153350830078, + "learning_rate": 9.260830872775922e-06, + "loss": 0.4818, + "step": 276500 + }, + { + "epoch": 2.444438550893757, + "grad_norm": 2.969968557357788, + "learning_rate": 9.25935748510405e-06, + "loss": 0.3734, + "step": 276510 + }, + { + "epoch": 2.4445269541540693, + "grad_norm": 1.702420949935913, + "learning_rate": 9.25788409743218e-06, + "loss": 0.4452, + "step": 276520 + }, + { + "epoch": 2.4446153574143814, + "grad_norm": 2.8807756900787354, + "learning_rate": 9.25641070976031e-06, + "loss": 0.5579, + "step": 276530 + }, + { + "epoch": 2.4447037606746935, + "grad_norm": 2.426752805709839, + "learning_rate": 9.254937322088438e-06, + "loss": 0.4855, + "step": 276540 + }, + { + "epoch": 2.444792163935006, + "grad_norm": 0.9633365273475647, + "learning_rate": 9.253463934416569e-06, + "loss": 0.5084, + "step": 276550 + }, + { + "epoch": 2.444880567195318, + "grad_norm": 8.241148948669434, + "learning_rate": 9.251990546744697e-06, + "loss": 0.4469, + "step": 276560 + }, + { + "epoch": 2.4449689704556303, + "grad_norm": 0.8652594089508057, + "learning_rate": 9.250517159072827e-06, + "loss": 0.4074, + "step": 276570 + }, + { + "epoch": 2.4450573737159425, + "grad_norm": 1.1323769092559814, + "learning_rate": 9.249043771400955e-06, + "loss": 0.5205, + "step": 276580 + }, + { + "epoch": 2.445145776976255, + "grad_norm": 4.059826374053955, + "learning_rate": 9.247570383729085e-06, + "loss": 0.552, + "step": 276590 + }, + { + "epoch": 2.445234180236567, + "grad_norm": 1.4894708395004272, + "learning_rate": 9.246096996057214e-06, + "loss": 0.457, + "step": 276600 + }, + { + "epoch": 2.4453225834968793, + "grad_norm": 1.1660248041152954, + "learning_rate": 9.244623608385344e-06, + "loss": 0.4301, + "step": 276610 + }, + { + "epoch": 2.4454109867571914, + "grad_norm": 3.956131935119629, + "learning_rate": 9.243150220713474e-06, + "loss": 0.5021, + "step": 276620 + }, + { + "epoch": 2.445499390017504, + "grad_norm": 1.6093757152557373, + "learning_rate": 9.241676833041602e-06, + "loss": 0.4269, + "step": 276630 + }, + { + "epoch": 2.445587793277816, + "grad_norm": 5.969840049743652, + "learning_rate": 9.240203445369732e-06, + "loss": 0.5116, + "step": 276640 + }, + { + "epoch": 2.445676196538128, + "grad_norm": 12.5451021194458, + "learning_rate": 9.238730057697862e-06, + "loss": 0.4874, + "step": 276650 + }, + { + "epoch": 2.4457645997984407, + "grad_norm": 2.523571014404297, + "learning_rate": 9.23725667002599e-06, + "loss": 0.5184, + "step": 276660 + }, + { + "epoch": 2.445853003058753, + "grad_norm": 3.144442319869995, + "learning_rate": 9.23578328235412e-06, + "loss": 0.7336, + "step": 276670 + }, + { + "epoch": 2.445941406319065, + "grad_norm": 3.9734861850738525, + "learning_rate": 9.23430989468225e-06, + "loss": 0.58, + "step": 276680 + }, + { + "epoch": 2.446029809579377, + "grad_norm": 2.516308546066284, + "learning_rate": 9.232836507010379e-06, + "loss": 0.5225, + "step": 276690 + }, + { + "epoch": 2.4461182128396897, + "grad_norm": 1.0627186298370361, + "learning_rate": 9.231363119338509e-06, + "loss": 0.5748, + "step": 276700 + }, + { + "epoch": 2.446206616100002, + "grad_norm": 7.756067276000977, + "learning_rate": 9.229889731666639e-06, + "loss": 0.4703, + "step": 276710 + }, + { + "epoch": 2.446295019360314, + "grad_norm": 2.34893536567688, + "learning_rate": 9.228416343994767e-06, + "loss": 0.5094, + "step": 276720 + }, + { + "epoch": 2.4463834226206265, + "grad_norm": 4.948226451873779, + "learning_rate": 9.226942956322898e-06, + "loss": 0.5009, + "step": 276730 + }, + { + "epoch": 2.4464718258809386, + "grad_norm": 4.499366283416748, + "learning_rate": 9.225469568651026e-06, + "loss": 0.4668, + "step": 276740 + }, + { + "epoch": 2.4465602291412507, + "grad_norm": 8.326952934265137, + "learning_rate": 9.223996180979156e-06, + "loss": 0.4415, + "step": 276750 + }, + { + "epoch": 2.446648632401563, + "grad_norm": 3.1218013763427734, + "learning_rate": 9.222522793307284e-06, + "loss": 0.4982, + "step": 276760 + }, + { + "epoch": 2.4467370356618754, + "grad_norm": 5.243674278259277, + "learning_rate": 9.221049405635414e-06, + "loss": 0.452, + "step": 276770 + }, + { + "epoch": 2.4468254389221875, + "grad_norm": 1.858563780784607, + "learning_rate": 9.219576017963543e-06, + "loss": 0.5262, + "step": 276780 + }, + { + "epoch": 2.4469138421824996, + "grad_norm": 5.033310890197754, + "learning_rate": 9.218102630291673e-06, + "loss": 0.6754, + "step": 276790 + }, + { + "epoch": 2.4470022454428118, + "grad_norm": 1.3772488832473755, + "learning_rate": 9.216629242619803e-06, + "loss": 0.4459, + "step": 276800 + }, + { + "epoch": 2.4470906487031243, + "grad_norm": 7.822283744812012, + "learning_rate": 9.215155854947931e-06, + "loss": 0.5441, + "step": 276810 + }, + { + "epoch": 2.4471790519634364, + "grad_norm": 9.572296142578125, + "learning_rate": 9.213682467276061e-06, + "loss": 0.5367, + "step": 276820 + }, + { + "epoch": 2.4472674552237486, + "grad_norm": 3.640643358230591, + "learning_rate": 9.21220907960419e-06, + "loss": 0.4217, + "step": 276830 + }, + { + "epoch": 2.4473558584840607, + "grad_norm": 1.529314637184143, + "learning_rate": 9.21073569193232e-06, + "loss": 0.4741, + "step": 276840 + }, + { + "epoch": 2.4474442617443732, + "grad_norm": 3.299189329147339, + "learning_rate": 9.209262304260448e-06, + "loss": 0.5742, + "step": 276850 + }, + { + "epoch": 2.4475326650046854, + "grad_norm": 9.18659496307373, + "learning_rate": 9.207788916588578e-06, + "loss": 0.5095, + "step": 276860 + }, + { + "epoch": 2.4476210682649975, + "grad_norm": 8.069513320922852, + "learning_rate": 9.206315528916706e-06, + "loss": 0.3607, + "step": 276870 + }, + { + "epoch": 2.44770947152531, + "grad_norm": 10.63515567779541, + "learning_rate": 9.204842141244836e-06, + "loss": 0.39, + "step": 276880 + }, + { + "epoch": 2.447797874785622, + "grad_norm": 2.7619380950927734, + "learning_rate": 9.203368753572965e-06, + "loss": 0.5498, + "step": 276890 + }, + { + "epoch": 2.4478862780459343, + "grad_norm": 3.983799934387207, + "learning_rate": 9.201895365901095e-06, + "loss": 0.5163, + "step": 276900 + }, + { + "epoch": 2.4479746813062464, + "grad_norm": 2.0461935997009277, + "learning_rate": 9.200421978229225e-06, + "loss": 0.4425, + "step": 276910 + }, + { + "epoch": 2.448063084566559, + "grad_norm": 2.248751640319824, + "learning_rate": 9.198948590557353e-06, + "loss": 0.4107, + "step": 276920 + }, + { + "epoch": 2.448151487826871, + "grad_norm": 3.585782289505005, + "learning_rate": 9.197475202885483e-06, + "loss": 0.5839, + "step": 276930 + }, + { + "epoch": 2.448239891087183, + "grad_norm": 1.697689414024353, + "learning_rate": 9.196001815213612e-06, + "loss": 0.5361, + "step": 276940 + }, + { + "epoch": 2.4483282943474958, + "grad_norm": 4.995443344116211, + "learning_rate": 9.194528427541742e-06, + "loss": 0.4344, + "step": 276950 + }, + { + "epoch": 2.448416697607808, + "grad_norm": 3.0811960697174072, + "learning_rate": 9.19305503986987e-06, + "loss": 0.5657, + "step": 276960 + }, + { + "epoch": 2.44850510086812, + "grad_norm": 9.26603889465332, + "learning_rate": 9.191581652198e-06, + "loss": 0.3914, + "step": 276970 + }, + { + "epoch": 2.448593504128432, + "grad_norm": 4.173733234405518, + "learning_rate": 9.190108264526128e-06, + "loss": 0.509, + "step": 276980 + }, + { + "epoch": 2.4486819073887442, + "grad_norm": 6.430305004119873, + "learning_rate": 9.188634876854258e-06, + "loss": 0.6749, + "step": 276990 + }, + { + "epoch": 2.448770310649057, + "grad_norm": 2.0441970825195312, + "learning_rate": 9.187161489182388e-06, + "loss": 0.3664, + "step": 277000 + }, + { + "epoch": 2.448858713909369, + "grad_norm": 2.6792104244232178, + "learning_rate": 9.185688101510517e-06, + "loss": 0.5622, + "step": 277010 + }, + { + "epoch": 2.448947117169681, + "grad_norm": 5.308959484100342, + "learning_rate": 9.184214713838647e-06, + "loss": 0.4586, + "step": 277020 + }, + { + "epoch": 2.4490355204299936, + "grad_norm": 2.1115612983703613, + "learning_rate": 9.182741326166775e-06, + "loss": 0.496, + "step": 277030 + }, + { + "epoch": 2.4491239236903057, + "grad_norm": 2.8341550827026367, + "learning_rate": 9.181267938494905e-06, + "loss": 0.5016, + "step": 277040 + }, + { + "epoch": 2.449212326950618, + "grad_norm": 4.8668999671936035, + "learning_rate": 9.179794550823034e-06, + "loss": 0.5096, + "step": 277050 + }, + { + "epoch": 2.44930073021093, + "grad_norm": 1.364045262336731, + "learning_rate": 9.178321163151164e-06, + "loss": 0.42, + "step": 277060 + }, + { + "epoch": 2.4493891334712425, + "grad_norm": 6.28797721862793, + "learning_rate": 9.176847775479292e-06, + "loss": 0.5944, + "step": 277070 + }, + { + "epoch": 2.4494775367315547, + "grad_norm": 3.7153682708740234, + "learning_rate": 9.175374387807422e-06, + "loss": 0.4977, + "step": 277080 + }, + { + "epoch": 2.449565939991867, + "grad_norm": 1.5027225017547607, + "learning_rate": 9.173901000135552e-06, + "loss": 0.5759, + "step": 277090 + }, + { + "epoch": 2.4496543432521793, + "grad_norm": 4.300380229949951, + "learning_rate": 9.17242761246368e-06, + "loss": 0.4692, + "step": 277100 + }, + { + "epoch": 2.4497427465124915, + "grad_norm": 1.5401822328567505, + "learning_rate": 9.17095422479181e-06, + "loss": 0.4501, + "step": 277110 + }, + { + "epoch": 2.4498311497728036, + "grad_norm": 3.2877602577209473, + "learning_rate": 9.16948083711994e-06, + "loss": 0.354, + "step": 277120 + }, + { + "epoch": 2.4499195530331157, + "grad_norm": 11.52132797241211, + "learning_rate": 9.168007449448069e-06, + "loss": 0.6295, + "step": 277130 + }, + { + "epoch": 2.4500079562934283, + "grad_norm": 2.189457416534424, + "learning_rate": 9.166534061776199e-06, + "loss": 0.3921, + "step": 277140 + }, + { + "epoch": 2.4500963595537404, + "grad_norm": 1.0676435232162476, + "learning_rate": 9.165060674104329e-06, + "loss": 0.5598, + "step": 277150 + }, + { + "epoch": 2.4501847628140525, + "grad_norm": 3.3207485675811768, + "learning_rate": 9.163587286432457e-06, + "loss": 0.4146, + "step": 277160 + }, + { + "epoch": 2.450273166074365, + "grad_norm": 1.4816399812698364, + "learning_rate": 9.162113898760587e-06, + "loss": 0.5742, + "step": 277170 + }, + { + "epoch": 2.450361569334677, + "grad_norm": 4.218057155609131, + "learning_rate": 9.160640511088717e-06, + "loss": 0.5989, + "step": 277180 + }, + { + "epoch": 2.4504499725949893, + "grad_norm": 11.334430694580078, + "learning_rate": 9.159167123416846e-06, + "loss": 0.5998, + "step": 277190 + }, + { + "epoch": 2.4505383758553014, + "grad_norm": 3.9680697917938232, + "learning_rate": 9.157693735744976e-06, + "loss": 0.4658, + "step": 277200 + }, + { + "epoch": 2.4506267791156136, + "grad_norm": 2.3843700885772705, + "learning_rate": 9.156220348073104e-06, + "loss": 0.4313, + "step": 277210 + }, + { + "epoch": 2.450715182375926, + "grad_norm": 2.469425916671753, + "learning_rate": 9.154746960401234e-06, + "loss": 0.5161, + "step": 277220 + }, + { + "epoch": 2.4508035856362382, + "grad_norm": 3.8371565341949463, + "learning_rate": 9.153273572729363e-06, + "loss": 0.6049, + "step": 277230 + }, + { + "epoch": 2.4508919888965504, + "grad_norm": 2.046497344970703, + "learning_rate": 9.151800185057493e-06, + "loss": 0.6331, + "step": 277240 + }, + { + "epoch": 2.450980392156863, + "grad_norm": 7.034142971038818, + "learning_rate": 9.150326797385621e-06, + "loss": 0.5885, + "step": 277250 + }, + { + "epoch": 2.451068795417175, + "grad_norm": 3.3888649940490723, + "learning_rate": 9.148853409713751e-06, + "loss": 0.5429, + "step": 277260 + }, + { + "epoch": 2.451157198677487, + "grad_norm": 8.747509956359863, + "learning_rate": 9.147380022041881e-06, + "loss": 0.5309, + "step": 277270 + }, + { + "epoch": 2.4512456019377993, + "grad_norm": 4.312733173370361, + "learning_rate": 9.14590663437001e-06, + "loss": 0.4275, + "step": 277280 + }, + { + "epoch": 2.451334005198112, + "grad_norm": 2.0600297451019287, + "learning_rate": 9.14443324669814e-06, + "loss": 0.4558, + "step": 277290 + }, + { + "epoch": 2.451422408458424, + "grad_norm": 5.636139392852783, + "learning_rate": 9.142959859026268e-06, + "loss": 0.4636, + "step": 277300 + }, + { + "epoch": 2.451510811718736, + "grad_norm": 4.72311544418335, + "learning_rate": 9.141486471354398e-06, + "loss": 0.5194, + "step": 277310 + }, + { + "epoch": 2.4515992149790486, + "grad_norm": 3.738414764404297, + "learning_rate": 9.140013083682526e-06, + "loss": 0.5808, + "step": 277320 + }, + { + "epoch": 2.4516876182393608, + "grad_norm": 1.6347148418426514, + "learning_rate": 9.138539696010656e-06, + "loss": 0.4697, + "step": 277330 + }, + { + "epoch": 2.451776021499673, + "grad_norm": 3.8469510078430176, + "learning_rate": 9.137066308338785e-06, + "loss": 0.4728, + "step": 277340 + }, + { + "epoch": 2.451864424759985, + "grad_norm": 13.908392906188965, + "learning_rate": 9.135592920666915e-06, + "loss": 0.5205, + "step": 277350 + }, + { + "epoch": 2.4519528280202976, + "grad_norm": 1.6144424676895142, + "learning_rate": 9.134119532995045e-06, + "loss": 0.5219, + "step": 277360 + }, + { + "epoch": 2.4520412312806097, + "grad_norm": 1.936342716217041, + "learning_rate": 9.132646145323173e-06, + "loss": 0.5408, + "step": 277370 + }, + { + "epoch": 2.452129634540922, + "grad_norm": 3.9691081047058105, + "learning_rate": 9.131172757651303e-06, + "loss": 0.583, + "step": 277380 + }, + { + "epoch": 2.452218037801234, + "grad_norm": 2.051069974899292, + "learning_rate": 9.129699369979432e-06, + "loss": 0.5872, + "step": 277390 + }, + { + "epoch": 2.4523064410615465, + "grad_norm": 18.215654373168945, + "learning_rate": 9.128225982307562e-06, + "loss": 0.4704, + "step": 277400 + }, + { + "epoch": 2.4523948443218586, + "grad_norm": 2.757842540740967, + "learning_rate": 9.12675259463569e-06, + "loss": 0.5193, + "step": 277410 + }, + { + "epoch": 2.4524832475821707, + "grad_norm": 3.601551055908203, + "learning_rate": 9.12527920696382e-06, + "loss": 0.6806, + "step": 277420 + }, + { + "epoch": 2.452571650842483, + "grad_norm": 8.143007278442383, + "learning_rate": 9.123805819291948e-06, + "loss": 0.7034, + "step": 277430 + }, + { + "epoch": 2.4526600541027954, + "grad_norm": 2.840022325515747, + "learning_rate": 9.122332431620078e-06, + "loss": 0.5093, + "step": 277440 + }, + { + "epoch": 2.4527484573631075, + "grad_norm": 3.765103578567505, + "learning_rate": 9.120859043948207e-06, + "loss": 0.4602, + "step": 277450 + }, + { + "epoch": 2.4528368606234197, + "grad_norm": 1.7268537282943726, + "learning_rate": 9.119385656276337e-06, + "loss": 0.5449, + "step": 277460 + }, + { + "epoch": 2.452925263883732, + "grad_norm": 4.921513080596924, + "learning_rate": 9.117912268604467e-06, + "loss": 0.5094, + "step": 277470 + }, + { + "epoch": 2.4530136671440443, + "grad_norm": 21.85819435119629, + "learning_rate": 9.116438880932595e-06, + "loss": 0.5668, + "step": 277480 + }, + { + "epoch": 2.4531020704043565, + "grad_norm": 1.1985058784484863, + "learning_rate": 9.114965493260725e-06, + "loss": 0.4219, + "step": 277490 + }, + { + "epoch": 2.4531904736646686, + "grad_norm": 0.5383462905883789, + "learning_rate": 9.113492105588854e-06, + "loss": 0.4671, + "step": 277500 + }, + { + "epoch": 2.453278876924981, + "grad_norm": 1.5497316122055054, + "learning_rate": 9.112018717916984e-06, + "loss": 0.4673, + "step": 277510 + }, + { + "epoch": 2.4533672801852933, + "grad_norm": 4.816950798034668, + "learning_rate": 9.110545330245112e-06, + "loss": 0.4883, + "step": 277520 + }, + { + "epoch": 2.4534556834456054, + "grad_norm": 2.5877349376678467, + "learning_rate": 9.109071942573242e-06, + "loss": 0.6908, + "step": 277530 + }, + { + "epoch": 2.453544086705918, + "grad_norm": 1.7199257612228394, + "learning_rate": 9.10759855490137e-06, + "loss": 0.4777, + "step": 277540 + }, + { + "epoch": 2.45363248996623, + "grad_norm": 9.36799144744873, + "learning_rate": 9.1061251672295e-06, + "loss": 0.6492, + "step": 277550 + }, + { + "epoch": 2.453720893226542, + "grad_norm": 3.107431650161743, + "learning_rate": 9.10465177955763e-06, + "loss": 0.5113, + "step": 277560 + }, + { + "epoch": 2.4538092964868543, + "grad_norm": 1.5678917169570923, + "learning_rate": 9.103178391885759e-06, + "loss": 0.4478, + "step": 277570 + }, + { + "epoch": 2.4538976997471664, + "grad_norm": 6.660762786865234, + "learning_rate": 9.101705004213889e-06, + "loss": 0.5423, + "step": 277580 + }, + { + "epoch": 2.453986103007479, + "grad_norm": 8.619209289550781, + "learning_rate": 9.100231616542019e-06, + "loss": 0.5795, + "step": 277590 + }, + { + "epoch": 2.454074506267791, + "grad_norm": 3.647238254547119, + "learning_rate": 9.098758228870147e-06, + "loss": 0.4427, + "step": 277600 + }, + { + "epoch": 2.4541629095281032, + "grad_norm": 4.61834716796875, + "learning_rate": 9.097284841198277e-06, + "loss": 0.6193, + "step": 277610 + }, + { + "epoch": 2.454251312788416, + "grad_norm": 2.300610065460205, + "learning_rate": 9.095811453526407e-06, + "loss": 0.4888, + "step": 277620 + }, + { + "epoch": 2.454339716048728, + "grad_norm": 1.9904025793075562, + "learning_rate": 9.094338065854536e-06, + "loss": 0.6237, + "step": 277630 + }, + { + "epoch": 2.45442811930904, + "grad_norm": 3.149369478225708, + "learning_rate": 9.092864678182666e-06, + "loss": 0.4863, + "step": 277640 + }, + { + "epoch": 2.454516522569352, + "grad_norm": 2.5886433124542236, + "learning_rate": 9.091391290510796e-06, + "loss": 0.6468, + "step": 277650 + }, + { + "epoch": 2.4546049258296647, + "grad_norm": 3.9433348178863525, + "learning_rate": 9.089917902838924e-06, + "loss": 0.577, + "step": 277660 + }, + { + "epoch": 2.454693329089977, + "grad_norm": 2.773956537246704, + "learning_rate": 9.088444515167054e-06, + "loss": 0.5748, + "step": 277670 + }, + { + "epoch": 2.454781732350289, + "grad_norm": 4.597047328948975, + "learning_rate": 9.086971127495183e-06, + "loss": 0.6469, + "step": 277680 + }, + { + "epoch": 2.4548701356106015, + "grad_norm": 2.0632991790771484, + "learning_rate": 9.085497739823313e-06, + "loss": 0.4398, + "step": 277690 + }, + { + "epoch": 2.4549585388709136, + "grad_norm": 1.9625736474990845, + "learning_rate": 9.084024352151441e-06, + "loss": 0.4082, + "step": 277700 + }, + { + "epoch": 2.4550469421312258, + "grad_norm": 1.3134914636611938, + "learning_rate": 9.082550964479571e-06, + "loss": 0.4739, + "step": 277710 + }, + { + "epoch": 2.455135345391538, + "grad_norm": 2.16113543510437, + "learning_rate": 9.0810775768077e-06, + "loss": 0.5441, + "step": 277720 + }, + { + "epoch": 2.4552237486518504, + "grad_norm": 4.131745338439941, + "learning_rate": 9.07960418913583e-06, + "loss": 0.4991, + "step": 277730 + }, + { + "epoch": 2.4553121519121626, + "grad_norm": 1.9327106475830078, + "learning_rate": 9.07813080146396e-06, + "loss": 0.6118, + "step": 277740 + }, + { + "epoch": 2.4554005551724747, + "grad_norm": 1.2636629343032837, + "learning_rate": 9.076657413792088e-06, + "loss": 0.4737, + "step": 277750 + }, + { + "epoch": 2.4554889584327872, + "grad_norm": 6.049433708190918, + "learning_rate": 9.075184026120218e-06, + "loss": 0.5908, + "step": 277760 + }, + { + "epoch": 2.4555773616930994, + "grad_norm": 1.233872652053833, + "learning_rate": 9.073710638448346e-06, + "loss": 0.3984, + "step": 277770 + }, + { + "epoch": 2.4556657649534115, + "grad_norm": 8.730361938476562, + "learning_rate": 9.072237250776476e-06, + "loss": 0.5938, + "step": 277780 + }, + { + "epoch": 2.4557541682137236, + "grad_norm": 1.788015365600586, + "learning_rate": 9.070763863104605e-06, + "loss": 0.5983, + "step": 277790 + }, + { + "epoch": 2.4558425714740357, + "grad_norm": 1.576796531677246, + "learning_rate": 9.069290475432735e-06, + "loss": 0.5533, + "step": 277800 + }, + { + "epoch": 2.4559309747343483, + "grad_norm": 1.8875197172164917, + "learning_rate": 9.067817087760863e-06, + "loss": 0.5068, + "step": 277810 + }, + { + "epoch": 2.4560193779946604, + "grad_norm": 2.511263370513916, + "learning_rate": 9.066343700088993e-06, + "loss": 0.4924, + "step": 277820 + }, + { + "epoch": 2.4561077812549725, + "grad_norm": 4.155569553375244, + "learning_rate": 9.064870312417123e-06, + "loss": 0.6546, + "step": 277830 + }, + { + "epoch": 2.456196184515285, + "grad_norm": 1.8589391708374023, + "learning_rate": 9.063396924745252e-06, + "loss": 0.4046, + "step": 277840 + }, + { + "epoch": 2.456284587775597, + "grad_norm": 3.757981300354004, + "learning_rate": 9.061923537073382e-06, + "loss": 0.4793, + "step": 277850 + }, + { + "epoch": 2.4563729910359093, + "grad_norm": 2.838810443878174, + "learning_rate": 9.06045014940151e-06, + "loss": 0.6678, + "step": 277860 + }, + { + "epoch": 2.4564613942962215, + "grad_norm": 2.639932870864868, + "learning_rate": 9.05897676172964e-06, + "loss": 0.4822, + "step": 277870 + }, + { + "epoch": 2.456549797556534, + "grad_norm": 1.974440097808838, + "learning_rate": 9.057503374057768e-06, + "loss": 0.6157, + "step": 277880 + }, + { + "epoch": 2.456638200816846, + "grad_norm": 6.1879353523254395, + "learning_rate": 9.056029986385898e-06, + "loss": 0.6141, + "step": 277890 + }, + { + "epoch": 2.4567266040771583, + "grad_norm": 0.8226235508918762, + "learning_rate": 9.054556598714027e-06, + "loss": 0.5511, + "step": 277900 + }, + { + "epoch": 2.456815007337471, + "grad_norm": 0.800304651260376, + "learning_rate": 9.053083211042157e-06, + "loss": 0.5836, + "step": 277910 + }, + { + "epoch": 2.456903410597783, + "grad_norm": 1.9048895835876465, + "learning_rate": 9.051609823370287e-06, + "loss": 0.5214, + "step": 277920 + }, + { + "epoch": 2.456991813858095, + "grad_norm": 3.4578726291656494, + "learning_rate": 9.050136435698415e-06, + "loss": 0.4715, + "step": 277930 + }, + { + "epoch": 2.457080217118407, + "grad_norm": 3.1140670776367188, + "learning_rate": 9.048663048026545e-06, + "loss": 0.5826, + "step": 277940 + }, + { + "epoch": 2.4571686203787197, + "grad_norm": 1.4995731115341187, + "learning_rate": 9.047189660354674e-06, + "loss": 0.5128, + "step": 277950 + }, + { + "epoch": 2.457257023639032, + "grad_norm": 1.2491511106491089, + "learning_rate": 9.045716272682804e-06, + "loss": 0.4468, + "step": 277960 + }, + { + "epoch": 2.457345426899344, + "grad_norm": 49.29486846923828, + "learning_rate": 9.044242885010932e-06, + "loss": 0.592, + "step": 277970 + }, + { + "epoch": 2.457433830159656, + "grad_norm": 2.291843891143799, + "learning_rate": 9.042769497339062e-06, + "loss": 0.5301, + "step": 277980 + }, + { + "epoch": 2.4575222334199687, + "grad_norm": 0.8804166316986084, + "learning_rate": 9.04129610966719e-06, + "loss": 0.473, + "step": 277990 + }, + { + "epoch": 2.457610636680281, + "grad_norm": 2.0686120986938477, + "learning_rate": 9.03982272199532e-06, + "loss": 0.408, + "step": 278000 + }, + { + "epoch": 2.457699039940593, + "grad_norm": 4.864226818084717, + "learning_rate": 9.038349334323449e-06, + "loss": 0.4721, + "step": 278010 + }, + { + "epoch": 2.457787443200905, + "grad_norm": 3.3655405044555664, + "learning_rate": 9.036875946651579e-06, + "loss": 0.4811, + "step": 278020 + }, + { + "epoch": 2.4578758464612176, + "grad_norm": 3.476593494415283, + "learning_rate": 9.035402558979709e-06, + "loss": 0.4079, + "step": 278030 + }, + { + "epoch": 2.4579642497215297, + "grad_norm": 2.003105401992798, + "learning_rate": 9.033929171307837e-06, + "loss": 0.583, + "step": 278040 + }, + { + "epoch": 2.458052652981842, + "grad_norm": 15.368377685546875, + "learning_rate": 9.032455783635967e-06, + "loss": 0.4737, + "step": 278050 + }, + { + "epoch": 2.4581410562421544, + "grad_norm": 1.7434989213943481, + "learning_rate": 9.030982395964097e-06, + "loss": 0.6639, + "step": 278060 + }, + { + "epoch": 2.4582294595024665, + "grad_norm": 2.840785503387451, + "learning_rate": 9.029509008292226e-06, + "loss": 0.5822, + "step": 278070 + }, + { + "epoch": 2.4583178627627786, + "grad_norm": 14.644192695617676, + "learning_rate": 9.028035620620356e-06, + "loss": 0.5459, + "step": 278080 + }, + { + "epoch": 2.4584062660230908, + "grad_norm": 7.964188098907471, + "learning_rate": 9.026562232948486e-06, + "loss": 0.472, + "step": 278090 + }, + { + "epoch": 2.4584946692834033, + "grad_norm": 3.678581476211548, + "learning_rate": 9.025088845276614e-06, + "loss": 0.451, + "step": 278100 + }, + { + "epoch": 2.4585830725437154, + "grad_norm": 26.814067840576172, + "learning_rate": 9.023615457604744e-06, + "loss": 0.663, + "step": 278110 + }, + { + "epoch": 2.4586714758040276, + "grad_norm": 1.815945029258728, + "learning_rate": 9.022142069932874e-06, + "loss": 0.4915, + "step": 278120 + }, + { + "epoch": 2.45875987906434, + "grad_norm": 2.881195306777954, + "learning_rate": 9.020668682261003e-06, + "loss": 0.4729, + "step": 278130 + }, + { + "epoch": 2.4588482823246522, + "grad_norm": 2.809406280517578, + "learning_rate": 9.019195294589133e-06, + "loss": 0.5565, + "step": 278140 + }, + { + "epoch": 2.4589366855849644, + "grad_norm": 3.491137742996216, + "learning_rate": 9.017721906917261e-06, + "loss": 0.543, + "step": 278150 + }, + { + "epoch": 2.4590250888452765, + "grad_norm": 2.174309015274048, + "learning_rate": 9.016248519245391e-06, + "loss": 0.6366, + "step": 278160 + }, + { + "epoch": 2.4591134921055886, + "grad_norm": 11.90239143371582, + "learning_rate": 9.01477513157352e-06, + "loss": 0.5636, + "step": 278170 + }, + { + "epoch": 2.459201895365901, + "grad_norm": 2.682421922683716, + "learning_rate": 9.01330174390165e-06, + "loss": 0.5287, + "step": 278180 + }, + { + "epoch": 2.4592902986262133, + "grad_norm": 4.465697288513184, + "learning_rate": 9.011828356229778e-06, + "loss": 0.6292, + "step": 278190 + }, + { + "epoch": 2.4593787018865254, + "grad_norm": 11.926170349121094, + "learning_rate": 9.010354968557908e-06, + "loss": 0.436, + "step": 278200 + }, + { + "epoch": 2.459467105146838, + "grad_norm": 7.71286153793335, + "learning_rate": 9.008881580886038e-06, + "loss": 0.5285, + "step": 278210 + }, + { + "epoch": 2.45955550840715, + "grad_norm": 5.045874118804932, + "learning_rate": 9.007408193214166e-06, + "loss": 0.5578, + "step": 278220 + }, + { + "epoch": 2.459643911667462, + "grad_norm": 3.9753828048706055, + "learning_rate": 9.005934805542296e-06, + "loss": 0.5032, + "step": 278230 + }, + { + "epoch": 2.4597323149277743, + "grad_norm": 1.1797770261764526, + "learning_rate": 9.004461417870425e-06, + "loss": 0.5568, + "step": 278240 + }, + { + "epoch": 2.459820718188087, + "grad_norm": 3.024487257003784, + "learning_rate": 9.002988030198555e-06, + "loss": 0.5739, + "step": 278250 + }, + { + "epoch": 2.459909121448399, + "grad_norm": 1.4203662872314453, + "learning_rate": 9.001514642526683e-06, + "loss": 0.6197, + "step": 278260 + }, + { + "epoch": 2.459997524708711, + "grad_norm": 3.594736099243164, + "learning_rate": 9.000041254854813e-06, + "loss": 0.471, + "step": 278270 + }, + { + "epoch": 2.4600859279690237, + "grad_norm": 1.618754506111145, + "learning_rate": 8.998567867182941e-06, + "loss": 0.4736, + "step": 278280 + }, + { + "epoch": 2.460174331229336, + "grad_norm": 1.48981773853302, + "learning_rate": 8.997094479511072e-06, + "loss": 0.5891, + "step": 278290 + }, + { + "epoch": 2.460262734489648, + "grad_norm": 25.403709411621094, + "learning_rate": 8.995621091839202e-06, + "loss": 0.5374, + "step": 278300 + }, + { + "epoch": 2.46035113774996, + "grad_norm": 3.075941324234009, + "learning_rate": 8.99414770416733e-06, + "loss": 0.5694, + "step": 278310 + }, + { + "epoch": 2.4604395410102726, + "grad_norm": 1.719422698020935, + "learning_rate": 8.99267431649546e-06, + "loss": 0.497, + "step": 278320 + }, + { + "epoch": 2.4605279442705847, + "grad_norm": 2.1839253902435303, + "learning_rate": 8.991200928823588e-06, + "loss": 0.4408, + "step": 278330 + }, + { + "epoch": 2.460616347530897, + "grad_norm": 10.394698143005371, + "learning_rate": 8.989727541151718e-06, + "loss": 0.5875, + "step": 278340 + }, + { + "epoch": 2.4607047507912094, + "grad_norm": 1.3305435180664062, + "learning_rate": 8.988254153479847e-06, + "loss": 0.5023, + "step": 278350 + }, + { + "epoch": 2.4607931540515215, + "grad_norm": 2.4985926151275635, + "learning_rate": 8.986780765807977e-06, + "loss": 0.595, + "step": 278360 + }, + { + "epoch": 2.4608815573118337, + "grad_norm": 3.1970651149749756, + "learning_rate": 8.985307378136105e-06, + "loss": 0.5194, + "step": 278370 + }, + { + "epoch": 2.460969960572146, + "grad_norm": 1.9151890277862549, + "learning_rate": 8.983833990464235e-06, + "loss": 0.5553, + "step": 278380 + }, + { + "epoch": 2.461058363832458, + "grad_norm": 3.6754579544067383, + "learning_rate": 8.982360602792365e-06, + "loss": 0.4788, + "step": 278390 + }, + { + "epoch": 2.4611467670927705, + "grad_norm": 1.8113216161727905, + "learning_rate": 8.980887215120494e-06, + "loss": 0.4538, + "step": 278400 + }, + { + "epoch": 2.4612351703530826, + "grad_norm": 4.184200286865234, + "learning_rate": 8.979413827448624e-06, + "loss": 0.67, + "step": 278410 + }, + { + "epoch": 2.4613235736133947, + "grad_norm": 3.02575945854187, + "learning_rate": 8.977940439776752e-06, + "loss": 0.5832, + "step": 278420 + }, + { + "epoch": 2.4614119768737073, + "grad_norm": 1.9528453350067139, + "learning_rate": 8.976467052104882e-06, + "loss": 0.5243, + "step": 278430 + }, + { + "epoch": 2.4615003801340194, + "grad_norm": 5.9663190841674805, + "learning_rate": 8.97499366443301e-06, + "loss": 0.55, + "step": 278440 + }, + { + "epoch": 2.4615887833943315, + "grad_norm": 5.256905555725098, + "learning_rate": 8.97352027676114e-06, + "loss": 0.5124, + "step": 278450 + }, + { + "epoch": 2.4616771866546436, + "grad_norm": 3.268441915512085, + "learning_rate": 8.972046889089269e-06, + "loss": 0.579, + "step": 278460 + }, + { + "epoch": 2.461765589914956, + "grad_norm": 2.2425568103790283, + "learning_rate": 8.970573501417399e-06, + "loss": 0.5671, + "step": 278470 + }, + { + "epoch": 2.4618539931752683, + "grad_norm": 15.117268562316895, + "learning_rate": 8.969100113745529e-06, + "loss": 0.543, + "step": 278480 + }, + { + "epoch": 2.4619423964355804, + "grad_norm": 16.285367965698242, + "learning_rate": 8.967626726073657e-06, + "loss": 0.5786, + "step": 278490 + }, + { + "epoch": 2.462030799695893, + "grad_norm": 6.938075542449951, + "learning_rate": 8.966153338401787e-06, + "loss": 0.6606, + "step": 278500 + }, + { + "epoch": 2.462119202956205, + "grad_norm": 2.721885919570923, + "learning_rate": 8.964679950729916e-06, + "loss": 0.6639, + "step": 278510 + }, + { + "epoch": 2.4622076062165172, + "grad_norm": 2.1587629318237305, + "learning_rate": 8.963206563058046e-06, + "loss": 0.4085, + "step": 278520 + }, + { + "epoch": 2.4622960094768294, + "grad_norm": 1.7514179944992065, + "learning_rate": 8.961733175386176e-06, + "loss": 0.4765, + "step": 278530 + }, + { + "epoch": 2.462384412737142, + "grad_norm": 2.278528928756714, + "learning_rate": 8.960259787714304e-06, + "loss": 0.4446, + "step": 278540 + }, + { + "epoch": 2.462472815997454, + "grad_norm": 8.467496871948242, + "learning_rate": 8.958786400042434e-06, + "loss": 0.4602, + "step": 278550 + }, + { + "epoch": 2.462561219257766, + "grad_norm": 1.9095488786697388, + "learning_rate": 8.957313012370564e-06, + "loss": 0.4695, + "step": 278560 + }, + { + "epoch": 2.4626496225180783, + "grad_norm": 10.082772254943848, + "learning_rate": 8.955839624698693e-06, + "loss": 0.6871, + "step": 278570 + }, + { + "epoch": 2.462738025778391, + "grad_norm": 3.9859702587127686, + "learning_rate": 8.954366237026823e-06, + "loss": 0.327, + "step": 278580 + }, + { + "epoch": 2.462826429038703, + "grad_norm": 6.79245138168335, + "learning_rate": 8.952892849354953e-06, + "loss": 0.5513, + "step": 278590 + }, + { + "epoch": 2.462914832299015, + "grad_norm": 5.840973854064941, + "learning_rate": 8.951419461683081e-06, + "loss": 0.6446, + "step": 278600 + }, + { + "epoch": 2.463003235559327, + "grad_norm": 7.121212959289551, + "learning_rate": 8.949946074011211e-06, + "loss": 0.519, + "step": 278610 + }, + { + "epoch": 2.4630916388196398, + "grad_norm": 1.8103245496749878, + "learning_rate": 8.94847268633934e-06, + "loss": 0.5638, + "step": 278620 + }, + { + "epoch": 2.463180042079952, + "grad_norm": 1.2135047912597656, + "learning_rate": 8.94699929866747e-06, + "loss": 0.6012, + "step": 278630 + }, + { + "epoch": 2.463268445340264, + "grad_norm": 20.79791831970215, + "learning_rate": 8.945525910995598e-06, + "loss": 0.6127, + "step": 278640 + }, + { + "epoch": 2.4633568486005766, + "grad_norm": 1.1593999862670898, + "learning_rate": 8.944052523323728e-06, + "loss": 0.4171, + "step": 278650 + }, + { + "epoch": 2.4634452518608887, + "grad_norm": 2.357166290283203, + "learning_rate": 8.942579135651856e-06, + "loss": 0.4876, + "step": 278660 + }, + { + "epoch": 2.463533655121201, + "grad_norm": 5.355009078979492, + "learning_rate": 8.941105747979986e-06, + "loss": 0.5878, + "step": 278670 + }, + { + "epoch": 2.463622058381513, + "grad_norm": 1.1452454328536987, + "learning_rate": 8.939632360308116e-06, + "loss": 0.4308, + "step": 278680 + }, + { + "epoch": 2.4637104616418255, + "grad_norm": 1.0365371704101562, + "learning_rate": 8.938158972636245e-06, + "loss": 0.5663, + "step": 278690 + }, + { + "epoch": 2.4637988649021376, + "grad_norm": 10.548683166503906, + "learning_rate": 8.936685584964375e-06, + "loss": 0.6031, + "step": 278700 + }, + { + "epoch": 2.4638872681624497, + "grad_norm": 1.6188198328018188, + "learning_rate": 8.935212197292503e-06, + "loss": 0.3799, + "step": 278710 + }, + { + "epoch": 2.4639756714227623, + "grad_norm": 4.639346599578857, + "learning_rate": 8.933738809620633e-06, + "loss": 0.5181, + "step": 278720 + }, + { + "epoch": 2.4640640746830744, + "grad_norm": 2.4012889862060547, + "learning_rate": 8.932265421948761e-06, + "loss": 0.5148, + "step": 278730 + }, + { + "epoch": 2.4641524779433865, + "grad_norm": 2.2438278198242188, + "learning_rate": 8.930792034276891e-06, + "loss": 0.4728, + "step": 278740 + }, + { + "epoch": 2.4642408812036987, + "grad_norm": 2.394061326980591, + "learning_rate": 8.92931864660502e-06, + "loss": 0.6982, + "step": 278750 + }, + { + "epoch": 2.464329284464011, + "grad_norm": 1.3215831518173218, + "learning_rate": 8.92784525893315e-06, + "loss": 0.4026, + "step": 278760 + }, + { + "epoch": 2.4644176877243233, + "grad_norm": 2.612156867980957, + "learning_rate": 8.92637187126128e-06, + "loss": 0.4847, + "step": 278770 + }, + { + "epoch": 2.4645060909846355, + "grad_norm": 3.399695634841919, + "learning_rate": 8.924898483589408e-06, + "loss": 0.4614, + "step": 278780 + }, + { + "epoch": 2.4645944942449476, + "grad_norm": 3.2174317836761475, + "learning_rate": 8.923425095917538e-06, + "loss": 0.5139, + "step": 278790 + }, + { + "epoch": 2.46468289750526, + "grad_norm": 4.068019866943359, + "learning_rate": 8.921951708245667e-06, + "loss": 0.6634, + "step": 278800 + }, + { + "epoch": 2.4647713007655723, + "grad_norm": 3.690866231918335, + "learning_rate": 8.920478320573797e-06, + "loss": 0.6828, + "step": 278810 + }, + { + "epoch": 2.4648597040258844, + "grad_norm": 10.65010929107666, + "learning_rate": 8.919004932901925e-06, + "loss": 0.6417, + "step": 278820 + }, + { + "epoch": 2.4649481072861965, + "grad_norm": 1.7076348066329956, + "learning_rate": 8.917531545230055e-06, + "loss": 0.4217, + "step": 278830 + }, + { + "epoch": 2.465036510546509, + "grad_norm": 16.449064254760742, + "learning_rate": 8.916058157558183e-06, + "loss": 0.5462, + "step": 278840 + }, + { + "epoch": 2.465124913806821, + "grad_norm": 3.682584524154663, + "learning_rate": 8.914584769886314e-06, + "loss": 0.5134, + "step": 278850 + }, + { + "epoch": 2.4652133170671333, + "grad_norm": 2.9417827129364014, + "learning_rate": 8.913111382214444e-06, + "loss": 0.657, + "step": 278860 + }, + { + "epoch": 2.465301720327446, + "grad_norm": 5.235904216766357, + "learning_rate": 8.911637994542572e-06, + "loss": 0.4955, + "step": 278870 + }, + { + "epoch": 2.465390123587758, + "grad_norm": 2.85418438911438, + "learning_rate": 8.910164606870702e-06, + "loss": 0.5467, + "step": 278880 + }, + { + "epoch": 2.46547852684807, + "grad_norm": 4.484996318817139, + "learning_rate": 8.90869121919883e-06, + "loss": 0.568, + "step": 278890 + }, + { + "epoch": 2.4655669301083822, + "grad_norm": 4.642464637756348, + "learning_rate": 8.90721783152696e-06, + "loss": 0.5597, + "step": 278900 + }, + { + "epoch": 2.465655333368695, + "grad_norm": 8.43842601776123, + "learning_rate": 8.905744443855089e-06, + "loss": 0.4443, + "step": 278910 + }, + { + "epoch": 2.465743736629007, + "grad_norm": 2.0890414714813232, + "learning_rate": 8.904271056183219e-06, + "loss": 0.5461, + "step": 278920 + }, + { + "epoch": 2.465832139889319, + "grad_norm": 1.7912542819976807, + "learning_rate": 8.902797668511347e-06, + "loss": 0.6614, + "step": 278930 + }, + { + "epoch": 2.4659205431496316, + "grad_norm": 1.12247896194458, + "learning_rate": 8.901324280839477e-06, + "loss": 0.6762, + "step": 278940 + }, + { + "epoch": 2.4660089464099437, + "grad_norm": 4.553225994110107, + "learning_rate": 8.899850893167607e-06, + "loss": 0.5254, + "step": 278950 + }, + { + "epoch": 2.466097349670256, + "grad_norm": 2.7139194011688232, + "learning_rate": 8.898377505495736e-06, + "loss": 0.5027, + "step": 278960 + }, + { + "epoch": 2.466185752930568, + "grad_norm": 2.999685525894165, + "learning_rate": 8.896904117823866e-06, + "loss": 0.6016, + "step": 278970 + }, + { + "epoch": 2.46627415619088, + "grad_norm": 3.9958088397979736, + "learning_rate": 8.895430730151996e-06, + "loss": 0.5733, + "step": 278980 + }, + { + "epoch": 2.4663625594511926, + "grad_norm": 0.9494467377662659, + "learning_rate": 8.893957342480124e-06, + "loss": 0.4632, + "step": 278990 + }, + { + "epoch": 2.4664509627115048, + "grad_norm": 15.465521812438965, + "learning_rate": 8.892483954808254e-06, + "loss": 0.5736, + "step": 279000 + }, + { + "epoch": 2.466539365971817, + "grad_norm": 1.9387810230255127, + "learning_rate": 8.891010567136384e-06, + "loss": 0.5548, + "step": 279010 + }, + { + "epoch": 2.4666277692321295, + "grad_norm": 3.2911527156829834, + "learning_rate": 8.889537179464513e-06, + "loss": 0.5984, + "step": 279020 + }, + { + "epoch": 2.4667161724924416, + "grad_norm": 4.234828948974609, + "learning_rate": 8.888063791792643e-06, + "loss": 0.5038, + "step": 279030 + }, + { + "epoch": 2.4668045757527537, + "grad_norm": 6.808634281158447, + "learning_rate": 8.886590404120771e-06, + "loss": 0.5515, + "step": 279040 + }, + { + "epoch": 2.466892979013066, + "grad_norm": 3.6760826110839844, + "learning_rate": 8.885117016448901e-06, + "loss": 0.5227, + "step": 279050 + }, + { + "epoch": 2.4669813822733784, + "grad_norm": 2.5972983837127686, + "learning_rate": 8.883643628777031e-06, + "loss": 0.4592, + "step": 279060 + }, + { + "epoch": 2.4670697855336905, + "grad_norm": 4.843863487243652, + "learning_rate": 8.88217024110516e-06, + "loss": 0.4707, + "step": 279070 + }, + { + "epoch": 2.4671581887940026, + "grad_norm": 3.175769805908203, + "learning_rate": 8.88069685343329e-06, + "loss": 0.4931, + "step": 279080 + }, + { + "epoch": 2.467246592054315, + "grad_norm": 3.6731014251708984, + "learning_rate": 8.879223465761418e-06, + "loss": 0.4352, + "step": 279090 + }, + { + "epoch": 2.4673349953146273, + "grad_norm": 6.7193379402160645, + "learning_rate": 8.877750078089548e-06, + "loss": 0.5062, + "step": 279100 + }, + { + "epoch": 2.4674233985749394, + "grad_norm": 15.310567855834961, + "learning_rate": 8.876276690417676e-06, + "loss": 0.6043, + "step": 279110 + }, + { + "epoch": 2.4675118018352515, + "grad_norm": 2.9731454849243164, + "learning_rate": 8.874803302745806e-06, + "loss": 0.5358, + "step": 279120 + }, + { + "epoch": 2.467600205095564, + "grad_norm": 2.29075026512146, + "learning_rate": 8.873329915073935e-06, + "loss": 0.4634, + "step": 279130 + }, + { + "epoch": 2.4676886083558762, + "grad_norm": 3.3483269214630127, + "learning_rate": 8.871856527402065e-06, + "loss": 0.45, + "step": 279140 + }, + { + "epoch": 2.4677770116161883, + "grad_norm": 3.7314980030059814, + "learning_rate": 8.870383139730195e-06, + "loss": 0.5536, + "step": 279150 + }, + { + "epoch": 2.4678654148765005, + "grad_norm": 2.491741180419922, + "learning_rate": 8.868909752058323e-06, + "loss": 0.5567, + "step": 279160 + }, + { + "epoch": 2.467953818136813, + "grad_norm": 24.607330322265625, + "learning_rate": 8.867436364386453e-06, + "loss": 0.4573, + "step": 279170 + }, + { + "epoch": 2.468042221397125, + "grad_norm": 2.64412522315979, + "learning_rate": 8.865962976714581e-06, + "loss": 0.4701, + "step": 279180 + }, + { + "epoch": 2.4681306246574373, + "grad_norm": 6.952174186706543, + "learning_rate": 8.864489589042711e-06, + "loss": 0.5962, + "step": 279190 + }, + { + "epoch": 2.4682190279177494, + "grad_norm": 11.303531646728516, + "learning_rate": 8.86301620137084e-06, + "loss": 0.6884, + "step": 279200 + }, + { + "epoch": 2.468307431178062, + "grad_norm": 1.497503638267517, + "learning_rate": 8.86154281369897e-06, + "loss": 0.425, + "step": 279210 + }, + { + "epoch": 2.468395834438374, + "grad_norm": 3.2397687435150146, + "learning_rate": 8.860069426027098e-06, + "loss": 0.6064, + "step": 279220 + }, + { + "epoch": 2.468484237698686, + "grad_norm": 4.007812023162842, + "learning_rate": 8.858596038355228e-06, + "loss": 0.6171, + "step": 279230 + }, + { + "epoch": 2.4685726409589988, + "grad_norm": 10.464337348937988, + "learning_rate": 8.857122650683358e-06, + "loss": 0.473, + "step": 279240 + }, + { + "epoch": 2.468661044219311, + "grad_norm": 4.028013229370117, + "learning_rate": 8.855649263011487e-06, + "loss": 0.5571, + "step": 279250 + }, + { + "epoch": 2.468749447479623, + "grad_norm": 2.2863919734954834, + "learning_rate": 8.854175875339617e-06, + "loss": 0.5534, + "step": 279260 + }, + { + "epoch": 2.468837850739935, + "grad_norm": 5.449743270874023, + "learning_rate": 8.852702487667745e-06, + "loss": 0.4609, + "step": 279270 + }, + { + "epoch": 2.4689262540002477, + "grad_norm": 2.434565305709839, + "learning_rate": 8.851229099995875e-06, + "loss": 0.5774, + "step": 279280 + }, + { + "epoch": 2.46901465726056, + "grad_norm": 6.549703598022461, + "learning_rate": 8.849755712324003e-06, + "loss": 0.3695, + "step": 279290 + }, + { + "epoch": 2.469103060520872, + "grad_norm": 2.0515575408935547, + "learning_rate": 8.848282324652134e-06, + "loss": 0.5663, + "step": 279300 + }, + { + "epoch": 2.4691914637811845, + "grad_norm": 3.920596122741699, + "learning_rate": 8.846808936980262e-06, + "loss": 0.5178, + "step": 279310 + }, + { + "epoch": 2.4692798670414966, + "grad_norm": 4.704256057739258, + "learning_rate": 8.845335549308392e-06, + "loss": 0.5629, + "step": 279320 + }, + { + "epoch": 2.4693682703018087, + "grad_norm": 4.0584492683410645, + "learning_rate": 8.843862161636522e-06, + "loss": 0.4712, + "step": 279330 + }, + { + "epoch": 2.469456673562121, + "grad_norm": 5.0062384605407715, + "learning_rate": 8.84238877396465e-06, + "loss": 0.5565, + "step": 279340 + }, + { + "epoch": 2.4695450768224334, + "grad_norm": 3.6819145679473877, + "learning_rate": 8.84091538629278e-06, + "loss": 0.4181, + "step": 279350 + }, + { + "epoch": 2.4696334800827455, + "grad_norm": 2.2153780460357666, + "learning_rate": 8.839441998620909e-06, + "loss": 0.5074, + "step": 279360 + }, + { + "epoch": 2.4697218833430576, + "grad_norm": 1.71326744556427, + "learning_rate": 8.837968610949039e-06, + "loss": 0.5257, + "step": 279370 + }, + { + "epoch": 2.4698102866033698, + "grad_norm": 7.515169143676758, + "learning_rate": 8.836495223277167e-06, + "loss": 0.589, + "step": 279380 + }, + { + "epoch": 2.4698986898636823, + "grad_norm": 4.922698974609375, + "learning_rate": 8.835021835605297e-06, + "loss": 0.5071, + "step": 279390 + }, + { + "epoch": 2.4699870931239944, + "grad_norm": 2.6033997535705566, + "learning_rate": 8.833548447933426e-06, + "loss": 0.526, + "step": 279400 + }, + { + "epoch": 2.4700754963843066, + "grad_norm": 2.8752946853637695, + "learning_rate": 8.832075060261556e-06, + "loss": 0.4552, + "step": 279410 + }, + { + "epoch": 2.4701638996446187, + "grad_norm": 6.904942512512207, + "learning_rate": 8.830601672589686e-06, + "loss": 0.5297, + "step": 279420 + }, + { + "epoch": 2.4702523029049313, + "grad_norm": 4.504876136779785, + "learning_rate": 8.829128284917814e-06, + "loss": 0.5497, + "step": 279430 + }, + { + "epoch": 2.4703407061652434, + "grad_norm": 2.3968935012817383, + "learning_rate": 8.827654897245944e-06, + "loss": 0.5988, + "step": 279440 + }, + { + "epoch": 2.4704291094255555, + "grad_norm": 4.612353801727295, + "learning_rate": 8.826181509574074e-06, + "loss": 0.5709, + "step": 279450 + }, + { + "epoch": 2.470517512685868, + "grad_norm": 2.423128843307495, + "learning_rate": 8.824708121902202e-06, + "loss": 0.5534, + "step": 279460 + }, + { + "epoch": 2.47060591594618, + "grad_norm": 4.652339935302734, + "learning_rate": 8.823234734230332e-06, + "loss": 0.5505, + "step": 279470 + }, + { + "epoch": 2.4706943192064923, + "grad_norm": 6.157441139221191, + "learning_rate": 8.821761346558463e-06, + "loss": 0.3255, + "step": 279480 + }, + { + "epoch": 2.4707827224668044, + "grad_norm": 1.469695806503296, + "learning_rate": 8.820287958886591e-06, + "loss": 0.4073, + "step": 279490 + }, + { + "epoch": 2.470871125727117, + "grad_norm": 6.608531951904297, + "learning_rate": 8.818814571214721e-06, + "loss": 0.4731, + "step": 279500 + }, + { + "epoch": 2.470959528987429, + "grad_norm": 3.216278076171875, + "learning_rate": 8.817341183542851e-06, + "loss": 0.3586, + "step": 279510 + }, + { + "epoch": 2.471047932247741, + "grad_norm": 1.6763254404067993, + "learning_rate": 8.81586779587098e-06, + "loss": 0.496, + "step": 279520 + }, + { + "epoch": 2.471136335508054, + "grad_norm": 45.39899826049805, + "learning_rate": 8.81439440819911e-06, + "loss": 0.6119, + "step": 279530 + }, + { + "epoch": 2.471224738768366, + "grad_norm": 2.287515163421631, + "learning_rate": 8.812921020527238e-06, + "loss": 0.6209, + "step": 279540 + }, + { + "epoch": 2.471313142028678, + "grad_norm": 5.900786876678467, + "learning_rate": 8.811447632855368e-06, + "loss": 0.523, + "step": 279550 + }, + { + "epoch": 2.47140154528899, + "grad_norm": 8.56271743774414, + "learning_rate": 8.809974245183496e-06, + "loss": 0.5622, + "step": 279560 + }, + { + "epoch": 2.4714899485493023, + "grad_norm": 10.41976261138916, + "learning_rate": 8.808500857511626e-06, + "loss": 0.4118, + "step": 279570 + }, + { + "epoch": 2.471578351809615, + "grad_norm": 5.628787517547607, + "learning_rate": 8.807027469839755e-06, + "loss": 0.5707, + "step": 279580 + }, + { + "epoch": 2.471666755069927, + "grad_norm": 7.575383186340332, + "learning_rate": 8.805554082167885e-06, + "loss": 0.5075, + "step": 279590 + }, + { + "epoch": 2.471755158330239, + "grad_norm": 1.7244447469711304, + "learning_rate": 8.804080694496013e-06, + "loss": 0.5269, + "step": 279600 + }, + { + "epoch": 2.4718435615905516, + "grad_norm": 1.9261603355407715, + "learning_rate": 8.802607306824143e-06, + "loss": 0.5215, + "step": 279610 + }, + { + "epoch": 2.4719319648508637, + "grad_norm": 1.9488099813461304, + "learning_rate": 8.801133919152273e-06, + "loss": 0.3697, + "step": 279620 + }, + { + "epoch": 2.472020368111176, + "grad_norm": 2.441909074783325, + "learning_rate": 8.799660531480401e-06, + "loss": 0.6431, + "step": 279630 + }, + { + "epoch": 2.472108771371488, + "grad_norm": 1.8047938346862793, + "learning_rate": 8.798187143808531e-06, + "loss": 0.6013, + "step": 279640 + }, + { + "epoch": 2.4721971746318006, + "grad_norm": 2.603288412094116, + "learning_rate": 8.79671375613666e-06, + "loss": 0.5097, + "step": 279650 + }, + { + "epoch": 2.4722855778921127, + "grad_norm": 4.542216777801514, + "learning_rate": 8.79524036846479e-06, + "loss": 0.5634, + "step": 279660 + }, + { + "epoch": 2.472373981152425, + "grad_norm": 2.180781841278076, + "learning_rate": 8.793766980792918e-06, + "loss": 0.3992, + "step": 279670 + }, + { + "epoch": 2.4724623844127374, + "grad_norm": 14.174686431884766, + "learning_rate": 8.792293593121048e-06, + "loss": 0.5825, + "step": 279680 + }, + { + "epoch": 2.4725507876730495, + "grad_norm": 1.327408790588379, + "learning_rate": 8.790820205449177e-06, + "loss": 0.5816, + "step": 279690 + }, + { + "epoch": 2.4726391909333616, + "grad_norm": 7.664493083953857, + "learning_rate": 8.789346817777307e-06, + "loss": 0.5145, + "step": 279700 + }, + { + "epoch": 2.4727275941936737, + "grad_norm": 2.2441799640655518, + "learning_rate": 8.787873430105437e-06, + "loss": 0.5375, + "step": 279710 + }, + { + "epoch": 2.4728159974539863, + "grad_norm": 7.796451568603516, + "learning_rate": 8.786400042433565e-06, + "loss": 0.483, + "step": 279720 + }, + { + "epoch": 2.4729044007142984, + "grad_norm": 1.1989774703979492, + "learning_rate": 8.784926654761695e-06, + "loss": 0.6032, + "step": 279730 + }, + { + "epoch": 2.4729928039746105, + "grad_norm": 3.4178082942962646, + "learning_rate": 8.783453267089823e-06, + "loss": 0.6184, + "step": 279740 + }, + { + "epoch": 2.4730812072349226, + "grad_norm": 5.040160655975342, + "learning_rate": 8.781979879417953e-06, + "loss": 0.5602, + "step": 279750 + }, + { + "epoch": 2.473169610495235, + "grad_norm": 1.3584271669387817, + "learning_rate": 8.780506491746082e-06, + "loss": 0.451, + "step": 279760 + }, + { + "epoch": 2.4732580137555473, + "grad_norm": 13.72913932800293, + "learning_rate": 8.779033104074212e-06, + "loss": 0.718, + "step": 279770 + }, + { + "epoch": 2.4733464170158594, + "grad_norm": 1.5241626501083374, + "learning_rate": 8.77755971640234e-06, + "loss": 0.5629, + "step": 279780 + }, + { + "epoch": 2.4734348202761716, + "grad_norm": 4.8715314865112305, + "learning_rate": 8.77608632873047e-06, + "loss": 0.6094, + "step": 279790 + }, + { + "epoch": 2.473523223536484, + "grad_norm": 14.018818855285645, + "learning_rate": 8.7746129410586e-06, + "loss": 0.5072, + "step": 279800 + }, + { + "epoch": 2.4736116267967962, + "grad_norm": 2.086625814437866, + "learning_rate": 8.773139553386729e-06, + "loss": 0.4854, + "step": 279810 + }, + { + "epoch": 2.4737000300571084, + "grad_norm": 5.687159061431885, + "learning_rate": 8.771666165714859e-06, + "loss": 0.4416, + "step": 279820 + }, + { + "epoch": 2.473788433317421, + "grad_norm": 2.7168376445770264, + "learning_rate": 8.770192778042987e-06, + "loss": 0.3741, + "step": 279830 + }, + { + "epoch": 2.473876836577733, + "grad_norm": 0.974591076374054, + "learning_rate": 8.768719390371117e-06, + "loss": 0.4119, + "step": 279840 + }, + { + "epoch": 2.473965239838045, + "grad_norm": 5.34660005569458, + "learning_rate": 8.767246002699246e-06, + "loss": 0.4567, + "step": 279850 + }, + { + "epoch": 2.4740536430983573, + "grad_norm": 8.104817390441895, + "learning_rate": 8.765772615027376e-06, + "loss": 0.5503, + "step": 279860 + }, + { + "epoch": 2.47414204635867, + "grad_norm": 0.6286687254905701, + "learning_rate": 8.764299227355504e-06, + "loss": 0.4216, + "step": 279870 + }, + { + "epoch": 2.474230449618982, + "grad_norm": 9.457762718200684, + "learning_rate": 8.762825839683634e-06, + "loss": 0.4374, + "step": 279880 + }, + { + "epoch": 2.474318852879294, + "grad_norm": 4.143723964691162, + "learning_rate": 8.761352452011764e-06, + "loss": 0.5597, + "step": 279890 + }, + { + "epoch": 2.4744072561396067, + "grad_norm": 1.4859076738357544, + "learning_rate": 8.759879064339892e-06, + "loss": 0.5907, + "step": 279900 + }, + { + "epoch": 2.474495659399919, + "grad_norm": 1.999810814857483, + "learning_rate": 8.758405676668022e-06, + "loss": 0.3783, + "step": 279910 + }, + { + "epoch": 2.474584062660231, + "grad_norm": 34.57439041137695, + "learning_rate": 8.756932288996152e-06, + "loss": 0.5996, + "step": 279920 + }, + { + "epoch": 2.474672465920543, + "grad_norm": 0.7583003640174866, + "learning_rate": 8.75545890132428e-06, + "loss": 0.5103, + "step": 279930 + }, + { + "epoch": 2.4747608691808556, + "grad_norm": 1.9173041582107544, + "learning_rate": 8.753985513652411e-06, + "loss": 0.6134, + "step": 279940 + }, + { + "epoch": 2.4748492724411677, + "grad_norm": 4.092799663543701, + "learning_rate": 8.752512125980541e-06, + "loss": 0.5964, + "step": 279950 + }, + { + "epoch": 2.47493767570148, + "grad_norm": 1.8084808588027954, + "learning_rate": 8.75103873830867e-06, + "loss": 0.522, + "step": 279960 + }, + { + "epoch": 2.475026078961792, + "grad_norm": 3.3327982425689697, + "learning_rate": 8.7495653506368e-06, + "loss": 0.703, + "step": 279970 + }, + { + "epoch": 2.4751144822221045, + "grad_norm": 2.514735460281372, + "learning_rate": 8.74809196296493e-06, + "loss": 0.581, + "step": 279980 + }, + { + "epoch": 2.4752028854824166, + "grad_norm": 4.8310346603393555, + "learning_rate": 8.746618575293058e-06, + "loss": 0.3779, + "step": 279990 + }, + { + "epoch": 2.4752912887427287, + "grad_norm": 2.6557085514068604, + "learning_rate": 8.745145187621188e-06, + "loss": 0.6751, + "step": 280000 + }, + { + "epoch": 2.475379692003041, + "grad_norm": 2.1326472759246826, + "learning_rate": 8.743671799949316e-06, + "loss": 0.5238, + "step": 280010 + }, + { + "epoch": 2.4754680952633534, + "grad_norm": 8.372288703918457, + "learning_rate": 8.742198412277446e-06, + "loss": 0.5683, + "step": 280020 + }, + { + "epoch": 2.4755564985236655, + "grad_norm": 12.401871681213379, + "learning_rate": 8.740725024605575e-06, + "loss": 0.3999, + "step": 280030 + }, + { + "epoch": 2.4756449017839777, + "grad_norm": 8.29393482208252, + "learning_rate": 8.739251636933705e-06, + "loss": 0.5438, + "step": 280040 + }, + { + "epoch": 2.4757333050442902, + "grad_norm": 5.286712169647217, + "learning_rate": 8.737778249261833e-06, + "loss": 0.5323, + "step": 280050 + }, + { + "epoch": 2.4758217083046024, + "grad_norm": 4.551589488983154, + "learning_rate": 8.736304861589963e-06, + "loss": 0.5302, + "step": 280060 + }, + { + "epoch": 2.4759101115649145, + "grad_norm": 2.8299341201782227, + "learning_rate": 8.734831473918093e-06, + "loss": 0.5963, + "step": 280070 + }, + { + "epoch": 2.4759985148252266, + "grad_norm": 3.377981662750244, + "learning_rate": 8.733358086246221e-06, + "loss": 0.4869, + "step": 280080 + }, + { + "epoch": 2.476086918085539, + "grad_norm": 5.381898880004883, + "learning_rate": 8.731884698574351e-06, + "loss": 0.5033, + "step": 280090 + }, + { + "epoch": 2.4761753213458513, + "grad_norm": 4.8384552001953125, + "learning_rate": 8.73041131090248e-06, + "loss": 0.4562, + "step": 280100 + }, + { + "epoch": 2.4762637246061634, + "grad_norm": 1.7524962425231934, + "learning_rate": 8.72893792323061e-06, + "loss": 0.6116, + "step": 280110 + }, + { + "epoch": 2.476352127866476, + "grad_norm": 2.2583200931549072, + "learning_rate": 8.727464535558738e-06, + "loss": 0.511, + "step": 280120 + }, + { + "epoch": 2.476440531126788, + "grad_norm": 1.9098998308181763, + "learning_rate": 8.725991147886868e-06, + "loss": 0.4843, + "step": 280130 + }, + { + "epoch": 2.4765289343871, + "grad_norm": 2.645768165588379, + "learning_rate": 8.724517760214997e-06, + "loss": 0.5475, + "step": 280140 + }, + { + "epoch": 2.4766173376474123, + "grad_norm": 3.0055527687072754, + "learning_rate": 8.723044372543127e-06, + "loss": 0.58, + "step": 280150 + }, + { + "epoch": 2.4767057409077244, + "grad_norm": 6.285213947296143, + "learning_rate": 8.721570984871255e-06, + "loss": 0.5897, + "step": 280160 + }, + { + "epoch": 2.476794144168037, + "grad_norm": 9.82533073425293, + "learning_rate": 8.720097597199385e-06, + "loss": 0.4066, + "step": 280170 + }, + { + "epoch": 2.476882547428349, + "grad_norm": 7.395061016082764, + "learning_rate": 8.718624209527515e-06, + "loss": 0.4054, + "step": 280180 + }, + { + "epoch": 2.4769709506886612, + "grad_norm": 1.7233260869979858, + "learning_rate": 8.717150821855643e-06, + "loss": 0.4964, + "step": 280190 + }, + { + "epoch": 2.477059353948974, + "grad_norm": 1.8852282762527466, + "learning_rate": 8.715677434183773e-06, + "loss": 0.4805, + "step": 280200 + }, + { + "epoch": 2.477147757209286, + "grad_norm": 1.494240403175354, + "learning_rate": 8.714204046511902e-06, + "loss": 0.5652, + "step": 280210 + }, + { + "epoch": 2.477236160469598, + "grad_norm": 17.284461975097656, + "learning_rate": 8.712730658840032e-06, + "loss": 0.5163, + "step": 280220 + }, + { + "epoch": 2.47732456372991, + "grad_norm": 11.939537048339844, + "learning_rate": 8.71125727116816e-06, + "loss": 0.5263, + "step": 280230 + }, + { + "epoch": 2.4774129669902227, + "grad_norm": 6.5691704750061035, + "learning_rate": 8.70978388349629e-06, + "loss": 0.5775, + "step": 280240 + }, + { + "epoch": 2.477501370250535, + "grad_norm": 6.829984188079834, + "learning_rate": 8.708310495824419e-06, + "loss": 0.554, + "step": 280250 + }, + { + "epoch": 2.477589773510847, + "grad_norm": 40.959556579589844, + "learning_rate": 8.706837108152549e-06, + "loss": 0.5331, + "step": 280260 + }, + { + "epoch": 2.4776781767711595, + "grad_norm": 2.2689621448516846, + "learning_rate": 8.705363720480679e-06, + "loss": 0.5919, + "step": 280270 + }, + { + "epoch": 2.4777665800314717, + "grad_norm": 10.086898803710938, + "learning_rate": 8.703890332808807e-06, + "loss": 0.4938, + "step": 280280 + }, + { + "epoch": 2.4778549832917838, + "grad_norm": 3.680077314376831, + "learning_rate": 8.702416945136937e-06, + "loss": 0.4747, + "step": 280290 + }, + { + "epoch": 2.477943386552096, + "grad_norm": 27.755935668945312, + "learning_rate": 8.700943557465065e-06, + "loss": 0.4316, + "step": 280300 + }, + { + "epoch": 2.4780317898124085, + "grad_norm": 7.645777702331543, + "learning_rate": 8.699470169793196e-06, + "loss": 0.5277, + "step": 280310 + }, + { + "epoch": 2.4781201930727206, + "grad_norm": 3.409411668777466, + "learning_rate": 8.697996782121324e-06, + "loss": 0.5007, + "step": 280320 + }, + { + "epoch": 2.4782085963330327, + "grad_norm": 1.5698580741882324, + "learning_rate": 8.696523394449454e-06, + "loss": 0.4815, + "step": 280330 + }, + { + "epoch": 2.478296999593345, + "grad_norm": 3.612891435623169, + "learning_rate": 8.695050006777582e-06, + "loss": 0.6325, + "step": 280340 + }, + { + "epoch": 2.4783854028536574, + "grad_norm": 8.465577125549316, + "learning_rate": 8.693576619105712e-06, + "loss": 0.5934, + "step": 280350 + }, + { + "epoch": 2.4784738061139695, + "grad_norm": 7.3054585456848145, + "learning_rate": 8.692103231433842e-06, + "loss": 0.545, + "step": 280360 + }, + { + "epoch": 2.4785622093742816, + "grad_norm": 8.37590217590332, + "learning_rate": 8.69062984376197e-06, + "loss": 0.5997, + "step": 280370 + }, + { + "epoch": 2.4786506126345937, + "grad_norm": 3.992547035217285, + "learning_rate": 8.6891564560901e-06, + "loss": 0.5857, + "step": 280380 + }, + { + "epoch": 2.4787390158949063, + "grad_norm": 1.1773899793624878, + "learning_rate": 8.68768306841823e-06, + "loss": 0.4888, + "step": 280390 + }, + { + "epoch": 2.4788274191552184, + "grad_norm": 18.061519622802734, + "learning_rate": 8.68620968074636e-06, + "loss": 0.6183, + "step": 280400 + }, + { + "epoch": 2.4789158224155305, + "grad_norm": 1.3086496591567993, + "learning_rate": 8.68473629307449e-06, + "loss": 0.5249, + "step": 280410 + }, + { + "epoch": 2.479004225675843, + "grad_norm": 2.0957720279693604, + "learning_rate": 8.68326290540262e-06, + "loss": 0.4296, + "step": 280420 + }, + { + "epoch": 2.4790926289361552, + "grad_norm": 5.2236504554748535, + "learning_rate": 8.681789517730748e-06, + "loss": 0.5281, + "step": 280430 + }, + { + "epoch": 2.4791810321964673, + "grad_norm": 18.861337661743164, + "learning_rate": 8.680316130058878e-06, + "loss": 0.6253, + "step": 280440 + }, + { + "epoch": 2.4792694354567795, + "grad_norm": 38.343780517578125, + "learning_rate": 8.678842742387008e-06, + "loss": 0.4979, + "step": 280450 + }, + { + "epoch": 2.479357838717092, + "grad_norm": 2.6812262535095215, + "learning_rate": 8.677369354715136e-06, + "loss": 0.5202, + "step": 280460 + }, + { + "epoch": 2.479446241977404, + "grad_norm": 4.478364944458008, + "learning_rate": 8.675895967043266e-06, + "loss": 0.5468, + "step": 280470 + }, + { + "epoch": 2.4795346452377163, + "grad_norm": 1.8681137561798096, + "learning_rate": 8.674422579371394e-06, + "loss": 0.4879, + "step": 280480 + }, + { + "epoch": 2.479623048498029, + "grad_norm": 5.974477767944336, + "learning_rate": 8.672949191699525e-06, + "loss": 0.5476, + "step": 280490 + }, + { + "epoch": 2.479711451758341, + "grad_norm": 30.592294692993164, + "learning_rate": 8.671475804027653e-06, + "loss": 0.5641, + "step": 280500 + }, + { + "epoch": 2.479799855018653, + "grad_norm": 1.6908665895462036, + "learning_rate": 8.670002416355783e-06, + "loss": 0.5418, + "step": 280510 + }, + { + "epoch": 2.479888258278965, + "grad_norm": 1.754019856452942, + "learning_rate": 8.668529028683911e-06, + "loss": 0.4043, + "step": 280520 + }, + { + "epoch": 2.4799766615392778, + "grad_norm": 4.153541564941406, + "learning_rate": 8.667055641012041e-06, + "loss": 0.5338, + "step": 280530 + }, + { + "epoch": 2.48006506479959, + "grad_norm": 3.8224616050720215, + "learning_rate": 8.665582253340171e-06, + "loss": 0.4941, + "step": 280540 + }, + { + "epoch": 2.480153468059902, + "grad_norm": 0.7912634015083313, + "learning_rate": 8.6641088656683e-06, + "loss": 0.5133, + "step": 280550 + }, + { + "epoch": 2.480241871320214, + "grad_norm": 4.991604328155518, + "learning_rate": 8.66263547799643e-06, + "loss": 0.4897, + "step": 280560 + }, + { + "epoch": 2.4803302745805267, + "grad_norm": 8.659408569335938, + "learning_rate": 8.661162090324558e-06, + "loss": 0.5241, + "step": 280570 + }, + { + "epoch": 2.480418677840839, + "grad_norm": 1.233142614364624, + "learning_rate": 8.659688702652688e-06, + "loss": 0.618, + "step": 280580 + }, + { + "epoch": 2.480507081101151, + "grad_norm": 1.8357526063919067, + "learning_rate": 8.658215314980817e-06, + "loss": 0.6196, + "step": 280590 + }, + { + "epoch": 2.480595484361463, + "grad_norm": 1.249118447303772, + "learning_rate": 8.656741927308947e-06, + "loss": 0.5286, + "step": 280600 + }, + { + "epoch": 2.4806838876217756, + "grad_norm": 3.538130044937134, + "learning_rate": 8.655268539637075e-06, + "loss": 0.479, + "step": 280610 + }, + { + "epoch": 2.4807722908820877, + "grad_norm": 4.472984790802002, + "learning_rate": 8.653795151965205e-06, + "loss": 0.4328, + "step": 280620 + }, + { + "epoch": 2.4808606941424, + "grad_norm": 1.365333914756775, + "learning_rate": 8.652321764293335e-06, + "loss": 0.5221, + "step": 280630 + }, + { + "epoch": 2.4809490974027124, + "grad_norm": 5.702578067779541, + "learning_rate": 8.650848376621463e-06, + "loss": 0.5305, + "step": 280640 + }, + { + "epoch": 2.4810375006630245, + "grad_norm": 0.5363750457763672, + "learning_rate": 8.649374988949593e-06, + "loss": 0.4453, + "step": 280650 + }, + { + "epoch": 2.4811259039233367, + "grad_norm": 1.4233465194702148, + "learning_rate": 8.647901601277722e-06, + "loss": 0.5918, + "step": 280660 + }, + { + "epoch": 2.4812143071836488, + "grad_norm": 5.91023588180542, + "learning_rate": 8.646428213605852e-06, + "loss": 0.5558, + "step": 280670 + }, + { + "epoch": 2.4813027104439613, + "grad_norm": 1.1945688724517822, + "learning_rate": 8.64495482593398e-06, + "loss": 0.6688, + "step": 280680 + }, + { + "epoch": 2.4813911137042735, + "grad_norm": 5.114912986755371, + "learning_rate": 8.64348143826211e-06, + "loss": 0.6635, + "step": 280690 + }, + { + "epoch": 2.4814795169645856, + "grad_norm": 3.9804768562316895, + "learning_rate": 8.642008050590239e-06, + "loss": 0.5077, + "step": 280700 + }, + { + "epoch": 2.481567920224898, + "grad_norm": 3.037600040435791, + "learning_rate": 8.640534662918369e-06, + "loss": 0.4761, + "step": 280710 + }, + { + "epoch": 2.4816563234852103, + "grad_norm": 2.6919808387756348, + "learning_rate": 8.639061275246497e-06, + "loss": 0.5811, + "step": 280720 + }, + { + "epoch": 2.4817447267455224, + "grad_norm": 4.860576152801514, + "learning_rate": 8.637587887574627e-06, + "loss": 0.4928, + "step": 280730 + }, + { + "epoch": 2.4818331300058345, + "grad_norm": 2.758634090423584, + "learning_rate": 8.636114499902757e-06, + "loss": 0.447, + "step": 280740 + }, + { + "epoch": 2.4819215332661466, + "grad_norm": 4.195562839508057, + "learning_rate": 8.634641112230885e-06, + "loss": 0.3972, + "step": 280750 + }, + { + "epoch": 2.482009936526459, + "grad_norm": 11.501033782958984, + "learning_rate": 8.633167724559015e-06, + "loss": 0.4922, + "step": 280760 + }, + { + "epoch": 2.4820983397867713, + "grad_norm": 3.8611598014831543, + "learning_rate": 8.631694336887144e-06, + "loss": 0.6368, + "step": 280770 + }, + { + "epoch": 2.4821867430470834, + "grad_norm": 5.654477119445801, + "learning_rate": 8.630220949215274e-06, + "loss": 0.4123, + "step": 280780 + }, + { + "epoch": 2.482275146307396, + "grad_norm": 10.805825233459473, + "learning_rate": 8.628747561543402e-06, + "loss": 0.4917, + "step": 280790 + }, + { + "epoch": 2.482363549567708, + "grad_norm": 64.3514633178711, + "learning_rate": 8.627274173871532e-06, + "loss": 0.4913, + "step": 280800 + }, + { + "epoch": 2.4824519528280202, + "grad_norm": 1.2455426454544067, + "learning_rate": 8.62580078619966e-06, + "loss": 0.5111, + "step": 280810 + }, + { + "epoch": 2.4825403560883323, + "grad_norm": 7.522809028625488, + "learning_rate": 8.62432739852779e-06, + "loss": 0.5313, + "step": 280820 + }, + { + "epoch": 2.482628759348645, + "grad_norm": 1.4702270030975342, + "learning_rate": 8.62285401085592e-06, + "loss": 0.5181, + "step": 280830 + }, + { + "epoch": 2.482717162608957, + "grad_norm": 4.660195827484131, + "learning_rate": 8.621380623184049e-06, + "loss": 0.4643, + "step": 280840 + }, + { + "epoch": 2.482805565869269, + "grad_norm": 4.490960121154785, + "learning_rate": 8.619907235512179e-06, + "loss": 0.6361, + "step": 280850 + }, + { + "epoch": 2.4828939691295817, + "grad_norm": 5.657462120056152, + "learning_rate": 8.61843384784031e-06, + "loss": 0.5115, + "step": 280860 + }, + { + "epoch": 2.482982372389894, + "grad_norm": 1.205670714378357, + "learning_rate": 8.616960460168438e-06, + "loss": 0.605, + "step": 280870 + }, + { + "epoch": 2.483070775650206, + "grad_norm": 6.201824188232422, + "learning_rate": 8.615487072496568e-06, + "loss": 0.5096, + "step": 280880 + }, + { + "epoch": 2.483159178910518, + "grad_norm": 2.838062286376953, + "learning_rate": 8.614013684824698e-06, + "loss": 0.5238, + "step": 280890 + }, + { + "epoch": 2.4832475821708306, + "grad_norm": 5.0207905769348145, + "learning_rate": 8.612540297152826e-06, + "loss": 0.5729, + "step": 280900 + }, + { + "epoch": 2.4833359854311428, + "grad_norm": 5.678329944610596, + "learning_rate": 8.611066909480956e-06, + "loss": 0.4861, + "step": 280910 + }, + { + "epoch": 2.483424388691455, + "grad_norm": 1.2118099927902222, + "learning_rate": 8.609593521809086e-06, + "loss": 0.436, + "step": 280920 + }, + { + "epoch": 2.483512791951767, + "grad_norm": 3.1135568618774414, + "learning_rate": 8.608120134137214e-06, + "loss": 0.503, + "step": 280930 + }, + { + "epoch": 2.4836011952120796, + "grad_norm": 2.998896598815918, + "learning_rate": 8.606646746465345e-06, + "loss": 0.594, + "step": 280940 + }, + { + "epoch": 2.4836895984723917, + "grad_norm": 0.7018662095069885, + "learning_rate": 8.605173358793473e-06, + "loss": 0.4848, + "step": 280950 + }, + { + "epoch": 2.483778001732704, + "grad_norm": 5.051451683044434, + "learning_rate": 8.603699971121603e-06, + "loss": 0.5771, + "step": 280960 + }, + { + "epoch": 2.483866404993016, + "grad_norm": 2.9823131561279297, + "learning_rate": 8.602226583449731e-06, + "loss": 0.5789, + "step": 280970 + }, + { + "epoch": 2.4839548082533285, + "grad_norm": 4.216987133026123, + "learning_rate": 8.600753195777861e-06, + "loss": 0.6014, + "step": 280980 + }, + { + "epoch": 2.4840432115136406, + "grad_norm": 2.78532338142395, + "learning_rate": 8.59927980810599e-06, + "loss": 0.5111, + "step": 280990 + }, + { + "epoch": 2.4841316147739527, + "grad_norm": 1.1874133348464966, + "learning_rate": 8.59780642043412e-06, + "loss": 0.4694, + "step": 281000 + }, + { + "epoch": 2.4842200180342653, + "grad_norm": 1.9408236742019653, + "learning_rate": 8.59633303276225e-06, + "loss": 0.4872, + "step": 281010 + }, + { + "epoch": 2.4843084212945774, + "grad_norm": 1.8688185214996338, + "learning_rate": 8.594859645090378e-06, + "loss": 0.5837, + "step": 281020 + }, + { + "epoch": 2.4843968245548895, + "grad_norm": 11.582292556762695, + "learning_rate": 8.593386257418508e-06, + "loss": 0.5423, + "step": 281030 + }, + { + "epoch": 2.4844852278152016, + "grad_norm": 3.4980642795562744, + "learning_rate": 8.591912869746637e-06, + "loss": 0.4472, + "step": 281040 + }, + { + "epoch": 2.484573631075514, + "grad_norm": 4.363236427307129, + "learning_rate": 8.590439482074767e-06, + "loss": 0.5184, + "step": 281050 + }, + { + "epoch": 2.4846620343358263, + "grad_norm": 2.5643560886383057, + "learning_rate": 8.588966094402895e-06, + "loss": 0.407, + "step": 281060 + }, + { + "epoch": 2.4847504375961385, + "grad_norm": 4.944018840789795, + "learning_rate": 8.587492706731025e-06, + "loss": 0.4122, + "step": 281070 + }, + { + "epoch": 2.484838840856451, + "grad_norm": 4.728646278381348, + "learning_rate": 8.586019319059153e-06, + "loss": 0.5884, + "step": 281080 + }, + { + "epoch": 2.484927244116763, + "grad_norm": 7.758571147918701, + "learning_rate": 8.584545931387283e-06, + "loss": 0.4076, + "step": 281090 + }, + { + "epoch": 2.4850156473770753, + "grad_norm": 1.5260275602340698, + "learning_rate": 8.583072543715413e-06, + "loss": 0.4963, + "step": 281100 + }, + { + "epoch": 2.4851040506373874, + "grad_norm": 2.4456989765167236, + "learning_rate": 8.581599156043542e-06, + "loss": 0.6015, + "step": 281110 + }, + { + "epoch": 2.4851924538977, + "grad_norm": 1.8170934915542603, + "learning_rate": 8.580125768371672e-06, + "loss": 0.5013, + "step": 281120 + }, + { + "epoch": 2.485280857158012, + "grad_norm": 3.5320749282836914, + "learning_rate": 8.5786523806998e-06, + "loss": 0.4921, + "step": 281130 + }, + { + "epoch": 2.485369260418324, + "grad_norm": 1.078714370727539, + "learning_rate": 8.57717899302793e-06, + "loss": 0.674, + "step": 281140 + }, + { + "epoch": 2.4854576636786363, + "grad_norm": 0.8263960480690002, + "learning_rate": 8.575705605356059e-06, + "loss": 0.5166, + "step": 281150 + }, + { + "epoch": 2.485546066938949, + "grad_norm": 7.065682411193848, + "learning_rate": 8.574232217684189e-06, + "loss": 0.4669, + "step": 281160 + }, + { + "epoch": 2.485634470199261, + "grad_norm": 4.668022632598877, + "learning_rate": 8.572758830012317e-06, + "loss": 0.4748, + "step": 281170 + }, + { + "epoch": 2.485722873459573, + "grad_norm": 1.461403250694275, + "learning_rate": 8.571285442340447e-06, + "loss": 0.5498, + "step": 281180 + }, + { + "epoch": 2.485811276719885, + "grad_norm": 5.174498081207275, + "learning_rate": 8.569812054668575e-06, + "loss": 0.5999, + "step": 281190 + }, + { + "epoch": 2.485899679980198, + "grad_norm": 7.912564754486084, + "learning_rate": 8.568338666996705e-06, + "loss": 0.6067, + "step": 281200 + }, + { + "epoch": 2.48598808324051, + "grad_norm": 5.963522434234619, + "learning_rate": 8.566865279324835e-06, + "loss": 0.3783, + "step": 281210 + }, + { + "epoch": 2.486076486500822, + "grad_norm": 5.653098106384277, + "learning_rate": 8.565391891652964e-06, + "loss": 0.631, + "step": 281220 + }, + { + "epoch": 2.4861648897611346, + "grad_norm": 3.8909225463867188, + "learning_rate": 8.563918503981094e-06, + "loss": 0.533, + "step": 281230 + }, + { + "epoch": 2.4862532930214467, + "grad_norm": 1.6424452066421509, + "learning_rate": 8.562445116309222e-06, + "loss": 0.4367, + "step": 281240 + }, + { + "epoch": 2.486341696281759, + "grad_norm": 2.385033369064331, + "learning_rate": 8.560971728637352e-06, + "loss": 0.5855, + "step": 281250 + }, + { + "epoch": 2.486430099542071, + "grad_norm": 3.5890748500823975, + "learning_rate": 8.55949834096548e-06, + "loss": 0.5096, + "step": 281260 + }, + { + "epoch": 2.4865185028023835, + "grad_norm": 1.9801634550094604, + "learning_rate": 8.55802495329361e-06, + "loss": 0.4209, + "step": 281270 + }, + { + "epoch": 2.4866069060626956, + "grad_norm": 3.4534502029418945, + "learning_rate": 8.556551565621739e-06, + "loss": 0.545, + "step": 281280 + }, + { + "epoch": 2.4866953093230078, + "grad_norm": 4.221946716308594, + "learning_rate": 8.555078177949869e-06, + "loss": 0.5066, + "step": 281290 + }, + { + "epoch": 2.4867837125833203, + "grad_norm": 3.55307674407959, + "learning_rate": 8.553604790277999e-06, + "loss": 0.4754, + "step": 281300 + }, + { + "epoch": 2.4868721158436324, + "grad_norm": 5.689570426940918, + "learning_rate": 8.55213140260613e-06, + "loss": 0.6028, + "step": 281310 + }, + { + "epoch": 2.4869605191039446, + "grad_norm": 3.0139012336730957, + "learning_rate": 8.550658014934258e-06, + "loss": 0.5351, + "step": 281320 + }, + { + "epoch": 2.4870489223642567, + "grad_norm": 1.628691554069519, + "learning_rate": 8.549184627262388e-06, + "loss": 0.6315, + "step": 281330 + }, + { + "epoch": 2.487137325624569, + "grad_norm": 3.7076468467712402, + "learning_rate": 8.547711239590518e-06, + "loss": 0.7061, + "step": 281340 + }, + { + "epoch": 2.4872257288848814, + "grad_norm": 4.740584850311279, + "learning_rate": 8.546237851918646e-06, + "loss": 0.4771, + "step": 281350 + }, + { + "epoch": 2.4873141321451935, + "grad_norm": 2.7139227390289307, + "learning_rate": 8.544764464246776e-06, + "loss": 0.4738, + "step": 281360 + }, + { + "epoch": 2.4874025354055056, + "grad_norm": 3.0072925090789795, + "learning_rate": 8.543291076574904e-06, + "loss": 0.5436, + "step": 281370 + }, + { + "epoch": 2.487490938665818, + "grad_norm": 1.100867748260498, + "learning_rate": 8.541817688903034e-06, + "loss": 0.4268, + "step": 281380 + }, + { + "epoch": 2.4875793419261303, + "grad_norm": 2.4990413188934326, + "learning_rate": 8.540344301231164e-06, + "loss": 0.5028, + "step": 281390 + }, + { + "epoch": 2.4876677451864424, + "grad_norm": 8.967830657958984, + "learning_rate": 8.538870913559293e-06, + "loss": 0.6233, + "step": 281400 + }, + { + "epoch": 2.4877561484467545, + "grad_norm": 2.8646252155303955, + "learning_rate": 8.537397525887423e-06, + "loss": 0.5599, + "step": 281410 + }, + { + "epoch": 2.487844551707067, + "grad_norm": 4.335108757019043, + "learning_rate": 8.535924138215551e-06, + "loss": 0.5871, + "step": 281420 + }, + { + "epoch": 2.487932954967379, + "grad_norm": 4.101443290710449, + "learning_rate": 8.534450750543681e-06, + "loss": 0.5366, + "step": 281430 + }, + { + "epoch": 2.4880213582276913, + "grad_norm": 2.2054922580718994, + "learning_rate": 8.53297736287181e-06, + "loss": 0.4645, + "step": 281440 + }, + { + "epoch": 2.488109761488004, + "grad_norm": 1.0682452917099, + "learning_rate": 8.53150397519994e-06, + "loss": 0.5094, + "step": 281450 + }, + { + "epoch": 2.488198164748316, + "grad_norm": 2.073755979537964, + "learning_rate": 8.530030587528068e-06, + "loss": 0.5108, + "step": 281460 + }, + { + "epoch": 2.488286568008628, + "grad_norm": 3.8056628704071045, + "learning_rate": 8.528557199856198e-06, + "loss": 0.6125, + "step": 281470 + }, + { + "epoch": 2.4883749712689403, + "grad_norm": 2.1050186157226562, + "learning_rate": 8.527083812184328e-06, + "loss": 0.4964, + "step": 281480 + }, + { + "epoch": 2.488463374529253, + "grad_norm": 1.4747847318649292, + "learning_rate": 8.525610424512456e-06, + "loss": 0.5204, + "step": 281490 + }, + { + "epoch": 2.488551777789565, + "grad_norm": 2.428779363632202, + "learning_rate": 8.524137036840587e-06, + "loss": 0.5975, + "step": 281500 + }, + { + "epoch": 2.488640181049877, + "grad_norm": 6.490080833435059, + "learning_rate": 8.522663649168715e-06, + "loss": 0.4722, + "step": 281510 + }, + { + "epoch": 2.488728584310189, + "grad_norm": 1.9261754751205444, + "learning_rate": 8.521190261496845e-06, + "loss": 0.4868, + "step": 281520 + }, + { + "epoch": 2.4888169875705017, + "grad_norm": 2.593219041824341, + "learning_rate": 8.519716873824973e-06, + "loss": 0.5527, + "step": 281530 + }, + { + "epoch": 2.488905390830814, + "grad_norm": 4.059865951538086, + "learning_rate": 8.518243486153103e-06, + "loss": 0.557, + "step": 281540 + }, + { + "epoch": 2.488993794091126, + "grad_norm": 2.8033132553100586, + "learning_rate": 8.516770098481232e-06, + "loss": 0.3892, + "step": 281550 + }, + { + "epoch": 2.489082197351438, + "grad_norm": 15.426776885986328, + "learning_rate": 8.515296710809362e-06, + "loss": 0.6197, + "step": 281560 + }, + { + "epoch": 2.4891706006117507, + "grad_norm": 3.1917407512664795, + "learning_rate": 8.513823323137492e-06, + "loss": 0.7474, + "step": 281570 + }, + { + "epoch": 2.489259003872063, + "grad_norm": 1.2229033708572388, + "learning_rate": 8.51234993546562e-06, + "loss": 0.402, + "step": 281580 + }, + { + "epoch": 2.489347407132375, + "grad_norm": 4.075740337371826, + "learning_rate": 8.51087654779375e-06, + "loss": 0.5922, + "step": 281590 + }, + { + "epoch": 2.4894358103926875, + "grad_norm": 43.40414047241211, + "learning_rate": 8.509403160121879e-06, + "loss": 0.4926, + "step": 281600 + }, + { + "epoch": 2.4895242136529996, + "grad_norm": 4.036749839782715, + "learning_rate": 8.507929772450009e-06, + "loss": 0.4573, + "step": 281610 + }, + { + "epoch": 2.4896126169133117, + "grad_norm": 4.528214454650879, + "learning_rate": 8.506456384778137e-06, + "loss": 0.4909, + "step": 281620 + }, + { + "epoch": 2.489701020173624, + "grad_norm": 1.656269907951355, + "learning_rate": 8.504982997106267e-06, + "loss": 0.4415, + "step": 281630 + }, + { + "epoch": 2.4897894234339364, + "grad_norm": 5.91782283782959, + "learning_rate": 8.503509609434395e-06, + "loss": 0.5143, + "step": 281640 + }, + { + "epoch": 2.4898778266942485, + "grad_norm": 3.8469767570495605, + "learning_rate": 8.502036221762525e-06, + "loss": 0.4667, + "step": 281650 + }, + { + "epoch": 2.4899662299545606, + "grad_norm": 3.194531202316284, + "learning_rate": 8.500562834090655e-06, + "loss": 0.5938, + "step": 281660 + }, + { + "epoch": 2.490054633214873, + "grad_norm": 0.7501530647277832, + "learning_rate": 8.499089446418784e-06, + "loss": 0.5044, + "step": 281670 + }, + { + "epoch": 2.4901430364751853, + "grad_norm": 2.487901449203491, + "learning_rate": 8.497616058746914e-06, + "loss": 0.5237, + "step": 281680 + }, + { + "epoch": 2.4902314397354974, + "grad_norm": 3.3846025466918945, + "learning_rate": 8.496142671075042e-06, + "loss": 0.4832, + "step": 281690 + }, + { + "epoch": 2.4903198429958096, + "grad_norm": 7.866753101348877, + "learning_rate": 8.494669283403172e-06, + "loss": 0.6588, + "step": 281700 + }, + { + "epoch": 2.490408246256122, + "grad_norm": 1.2356256246566772, + "learning_rate": 8.4931958957313e-06, + "loss": 0.6025, + "step": 281710 + }, + { + "epoch": 2.4904966495164342, + "grad_norm": 3.8161275386810303, + "learning_rate": 8.49172250805943e-06, + "loss": 0.5128, + "step": 281720 + }, + { + "epoch": 2.4905850527767464, + "grad_norm": 9.588460922241211, + "learning_rate": 8.490249120387559e-06, + "loss": 0.4944, + "step": 281730 + }, + { + "epoch": 2.4906734560370585, + "grad_norm": 28.814027786254883, + "learning_rate": 8.488775732715689e-06, + "loss": 0.5162, + "step": 281740 + }, + { + "epoch": 2.490761859297371, + "grad_norm": 15.532085418701172, + "learning_rate": 8.487302345043819e-06, + "loss": 0.4668, + "step": 281750 + }, + { + "epoch": 2.490850262557683, + "grad_norm": 5.467520713806152, + "learning_rate": 8.485828957371947e-06, + "loss": 0.5189, + "step": 281760 + }, + { + "epoch": 2.4909386658179953, + "grad_norm": 1.3105249404907227, + "learning_rate": 8.484355569700078e-06, + "loss": 0.5132, + "step": 281770 + }, + { + "epoch": 2.4910270690783074, + "grad_norm": 6.966648101806641, + "learning_rate": 8.482882182028208e-06, + "loss": 0.528, + "step": 281780 + }, + { + "epoch": 2.49111547233862, + "grad_norm": 3.982682704925537, + "learning_rate": 8.481408794356336e-06, + "loss": 0.6125, + "step": 281790 + }, + { + "epoch": 2.491203875598932, + "grad_norm": 1.2228527069091797, + "learning_rate": 8.479935406684466e-06, + "loss": 0.4523, + "step": 281800 + }, + { + "epoch": 2.491292278859244, + "grad_norm": 1.767803430557251, + "learning_rate": 8.478462019012596e-06, + "loss": 0.5796, + "step": 281810 + }, + { + "epoch": 2.4913806821195568, + "grad_norm": 3.2275495529174805, + "learning_rate": 8.476988631340724e-06, + "loss": 0.5732, + "step": 281820 + }, + { + "epoch": 2.491469085379869, + "grad_norm": 20.889909744262695, + "learning_rate": 8.475515243668854e-06, + "loss": 0.618, + "step": 281830 + }, + { + "epoch": 2.491557488640181, + "grad_norm": 2.5429656505584717, + "learning_rate": 8.474041855996983e-06, + "loss": 0.4168, + "step": 281840 + }, + { + "epoch": 2.491645891900493, + "grad_norm": 0.9328232407569885, + "learning_rate": 8.472568468325113e-06, + "loss": 0.4643, + "step": 281850 + }, + { + "epoch": 2.4917342951608057, + "grad_norm": 1.9609278440475464, + "learning_rate": 8.471095080653243e-06, + "loss": 0.631, + "step": 281860 + }, + { + "epoch": 2.491822698421118, + "grad_norm": 4.67589807510376, + "learning_rate": 8.469621692981371e-06, + "loss": 0.4983, + "step": 281870 + }, + { + "epoch": 2.49191110168143, + "grad_norm": 2.662517786026001, + "learning_rate": 8.468148305309501e-06, + "loss": 0.5045, + "step": 281880 + }, + { + "epoch": 2.4919995049417425, + "grad_norm": 2.484492778778076, + "learning_rate": 8.46667491763763e-06, + "loss": 0.5595, + "step": 281890 + }, + { + "epoch": 2.4920879082020546, + "grad_norm": 1.3070499897003174, + "learning_rate": 8.46520152996576e-06, + "loss": 0.6185, + "step": 281900 + }, + { + "epoch": 2.4921763114623667, + "grad_norm": 2.236037254333496, + "learning_rate": 8.463728142293888e-06, + "loss": 0.6669, + "step": 281910 + }, + { + "epoch": 2.492264714722679, + "grad_norm": 2.9687857627868652, + "learning_rate": 8.462254754622018e-06, + "loss": 0.4875, + "step": 281920 + }, + { + "epoch": 2.492353117982991, + "grad_norm": 6.485469341278076, + "learning_rate": 8.460781366950146e-06, + "loss": 0.4002, + "step": 281930 + }, + { + "epoch": 2.4924415212433035, + "grad_norm": 9.570846557617188, + "learning_rate": 8.459307979278276e-06, + "loss": 0.5352, + "step": 281940 + }, + { + "epoch": 2.4925299245036157, + "grad_norm": 3.4257442951202393, + "learning_rate": 8.457834591606407e-06, + "loss": 0.4577, + "step": 281950 + }, + { + "epoch": 2.4926183277639278, + "grad_norm": 3.264303684234619, + "learning_rate": 8.456361203934535e-06, + "loss": 0.4939, + "step": 281960 + }, + { + "epoch": 2.4927067310242403, + "grad_norm": 3.071416139602661, + "learning_rate": 8.454887816262665e-06, + "loss": 0.5715, + "step": 281970 + }, + { + "epoch": 2.4927951342845525, + "grad_norm": 3.4302475452423096, + "learning_rate": 8.453414428590793e-06, + "loss": 0.5151, + "step": 281980 + }, + { + "epoch": 2.4928835375448646, + "grad_norm": 12.072526931762695, + "learning_rate": 8.451941040918923e-06, + "loss": 0.4581, + "step": 281990 + }, + { + "epoch": 2.4929719408051767, + "grad_norm": 1.7053427696228027, + "learning_rate": 8.450467653247052e-06, + "loss": 0.3533, + "step": 282000 + }, + { + "epoch": 2.4930603440654893, + "grad_norm": 3.108088493347168, + "learning_rate": 8.448994265575182e-06, + "loss": 0.5749, + "step": 282010 + }, + { + "epoch": 2.4931487473258014, + "grad_norm": 3.9293594360351562, + "learning_rate": 8.44752087790331e-06, + "loss": 0.5793, + "step": 282020 + }, + { + "epoch": 2.4932371505861135, + "grad_norm": 17.70081901550293, + "learning_rate": 8.44604749023144e-06, + "loss": 0.5503, + "step": 282030 + }, + { + "epoch": 2.493325553846426, + "grad_norm": 3.2956697940826416, + "learning_rate": 8.44457410255957e-06, + "loss": 0.5051, + "step": 282040 + }, + { + "epoch": 2.493413957106738, + "grad_norm": 7.7607622146606445, + "learning_rate": 8.443100714887699e-06, + "loss": 0.5376, + "step": 282050 + }, + { + "epoch": 2.4935023603670503, + "grad_norm": 2.3999249935150146, + "learning_rate": 8.441627327215829e-06, + "loss": 0.5335, + "step": 282060 + }, + { + "epoch": 2.4935907636273624, + "grad_norm": 4.55825138092041, + "learning_rate": 8.440153939543957e-06, + "loss": 0.5876, + "step": 282070 + }, + { + "epoch": 2.493679166887675, + "grad_norm": 4.155837535858154, + "learning_rate": 8.438680551872087e-06, + "loss": 0.4983, + "step": 282080 + }, + { + "epoch": 2.493767570147987, + "grad_norm": 1.5405964851379395, + "learning_rate": 8.437207164200215e-06, + "loss": 0.4825, + "step": 282090 + }, + { + "epoch": 2.4938559734082992, + "grad_norm": 7.533063888549805, + "learning_rate": 8.435733776528345e-06, + "loss": 0.6489, + "step": 282100 + }, + { + "epoch": 2.4939443766686114, + "grad_norm": 6.364786624908447, + "learning_rate": 8.434260388856474e-06, + "loss": 0.5727, + "step": 282110 + }, + { + "epoch": 2.494032779928924, + "grad_norm": 11.733186721801758, + "learning_rate": 8.432787001184604e-06, + "loss": 0.551, + "step": 282120 + }, + { + "epoch": 2.494121183189236, + "grad_norm": 1.088929533958435, + "learning_rate": 8.431313613512734e-06, + "loss": 0.5677, + "step": 282130 + }, + { + "epoch": 2.494209586449548, + "grad_norm": 3.619508981704712, + "learning_rate": 8.429840225840862e-06, + "loss": 0.3784, + "step": 282140 + }, + { + "epoch": 2.4942979897098603, + "grad_norm": 1.536409616470337, + "learning_rate": 8.428366838168992e-06, + "loss": 0.5132, + "step": 282150 + }, + { + "epoch": 2.494386392970173, + "grad_norm": 6.3133134841918945, + "learning_rate": 8.42689345049712e-06, + "loss": 0.6055, + "step": 282160 + }, + { + "epoch": 2.494474796230485, + "grad_norm": 4.42936372756958, + "learning_rate": 8.42542006282525e-06, + "loss": 0.4861, + "step": 282170 + }, + { + "epoch": 2.494563199490797, + "grad_norm": 6.376729488372803, + "learning_rate": 8.423946675153379e-06, + "loss": 0.5513, + "step": 282180 + }, + { + "epoch": 2.4946516027511096, + "grad_norm": 2.3278706073760986, + "learning_rate": 8.422473287481509e-06, + "loss": 0.4346, + "step": 282190 + }, + { + "epoch": 2.4947400060114218, + "grad_norm": 3.8369626998901367, + "learning_rate": 8.420999899809637e-06, + "loss": 0.5225, + "step": 282200 + }, + { + "epoch": 2.494828409271734, + "grad_norm": 19.759029388427734, + "learning_rate": 8.419526512137767e-06, + "loss": 0.5128, + "step": 282210 + }, + { + "epoch": 2.494916812532046, + "grad_norm": 1.8629357814788818, + "learning_rate": 8.418053124465897e-06, + "loss": 0.5519, + "step": 282220 + }, + { + "epoch": 2.4950052157923586, + "grad_norm": 2.852888345718384, + "learning_rate": 8.416579736794026e-06, + "loss": 0.583, + "step": 282230 + }, + { + "epoch": 2.4950936190526707, + "grad_norm": 2.9338607788085938, + "learning_rate": 8.415106349122156e-06, + "loss": 0.5548, + "step": 282240 + }, + { + "epoch": 2.495182022312983, + "grad_norm": 4.635824680328369, + "learning_rate": 8.413632961450286e-06, + "loss": 0.463, + "step": 282250 + }, + { + "epoch": 2.4952704255732954, + "grad_norm": 2.531167507171631, + "learning_rate": 8.412159573778414e-06, + "loss": 0.5576, + "step": 282260 + }, + { + "epoch": 2.4953588288336075, + "grad_norm": 4.038435935974121, + "learning_rate": 8.410686186106544e-06, + "loss": 0.5246, + "step": 282270 + }, + { + "epoch": 2.4954472320939196, + "grad_norm": 1.5529333353042603, + "learning_rate": 8.409212798434674e-06, + "loss": 0.4584, + "step": 282280 + }, + { + "epoch": 2.4955356353542317, + "grad_norm": 2.3736820220947266, + "learning_rate": 8.407739410762803e-06, + "loss": 0.5232, + "step": 282290 + }, + { + "epoch": 2.4956240386145443, + "grad_norm": 9.81212329864502, + "learning_rate": 8.406266023090933e-06, + "loss": 0.5425, + "step": 282300 + }, + { + "epoch": 2.4957124418748564, + "grad_norm": 3.5196049213409424, + "learning_rate": 8.404792635419061e-06, + "loss": 0.5163, + "step": 282310 + }, + { + "epoch": 2.4958008451351685, + "grad_norm": 4.252503871917725, + "learning_rate": 8.403319247747191e-06, + "loss": 0.4761, + "step": 282320 + }, + { + "epoch": 2.4958892483954807, + "grad_norm": 4.996754169464111, + "learning_rate": 8.401845860075321e-06, + "loss": 0.5332, + "step": 282330 + }, + { + "epoch": 2.495977651655793, + "grad_norm": 0.7880841493606567, + "learning_rate": 8.40037247240345e-06, + "loss": 0.4504, + "step": 282340 + }, + { + "epoch": 2.4960660549161053, + "grad_norm": 4.20579195022583, + "learning_rate": 8.39889908473158e-06, + "loss": 0.5314, + "step": 282350 + }, + { + "epoch": 2.4961544581764175, + "grad_norm": 11.602977752685547, + "learning_rate": 8.397425697059708e-06, + "loss": 0.6066, + "step": 282360 + }, + { + "epoch": 2.4962428614367296, + "grad_norm": 7.056084632873535, + "learning_rate": 8.395952309387838e-06, + "loss": 0.612, + "step": 282370 + }, + { + "epoch": 2.496331264697042, + "grad_norm": 2.4426889419555664, + "learning_rate": 8.394478921715966e-06, + "loss": 0.5637, + "step": 282380 + }, + { + "epoch": 2.4964196679573543, + "grad_norm": 1.0739600658416748, + "learning_rate": 8.393005534044096e-06, + "loss": 0.5628, + "step": 282390 + }, + { + "epoch": 2.4965080712176664, + "grad_norm": 3.9649317264556885, + "learning_rate": 8.391532146372225e-06, + "loss": 0.4629, + "step": 282400 + }, + { + "epoch": 2.496596474477979, + "grad_norm": 3.7625186443328857, + "learning_rate": 8.390058758700355e-06, + "loss": 0.6612, + "step": 282410 + }, + { + "epoch": 2.496684877738291, + "grad_norm": 3.587451696395874, + "learning_rate": 8.388585371028485e-06, + "loss": 0.6141, + "step": 282420 + }, + { + "epoch": 2.496773280998603, + "grad_norm": 5.605487823486328, + "learning_rate": 8.387111983356613e-06, + "loss": 0.5399, + "step": 282430 + }, + { + "epoch": 2.4968616842589153, + "grad_norm": 2.208239793777466, + "learning_rate": 8.385638595684743e-06, + "loss": 0.4437, + "step": 282440 + }, + { + "epoch": 2.496950087519228, + "grad_norm": 4.524482250213623, + "learning_rate": 8.384165208012872e-06, + "loss": 0.5825, + "step": 282450 + }, + { + "epoch": 2.49703849077954, + "grad_norm": 3.1360158920288086, + "learning_rate": 8.382691820341002e-06, + "loss": 0.5361, + "step": 282460 + }, + { + "epoch": 2.497126894039852, + "grad_norm": 3.8154613971710205, + "learning_rate": 8.38121843266913e-06, + "loss": 0.6004, + "step": 282470 + }, + { + "epoch": 2.4972152973001647, + "grad_norm": 3.6918675899505615, + "learning_rate": 8.37974504499726e-06, + "loss": 0.6194, + "step": 282480 + }, + { + "epoch": 2.497303700560477, + "grad_norm": 2.067413568496704, + "learning_rate": 8.378271657325388e-06, + "loss": 0.6178, + "step": 282490 + }, + { + "epoch": 2.497392103820789, + "grad_norm": 1.400896668434143, + "learning_rate": 8.376798269653518e-06, + "loss": 0.4325, + "step": 282500 + }, + { + "epoch": 2.497480507081101, + "grad_norm": 3.5295584201812744, + "learning_rate": 8.375324881981649e-06, + "loss": 0.4715, + "step": 282510 + }, + { + "epoch": 2.497568910341413, + "grad_norm": 10.502676963806152, + "learning_rate": 8.373851494309777e-06, + "loss": 0.5969, + "step": 282520 + }, + { + "epoch": 2.4976573136017257, + "grad_norm": 2.815983295440674, + "learning_rate": 8.372378106637907e-06, + "loss": 0.555, + "step": 282530 + }, + { + "epoch": 2.497745716862038, + "grad_norm": 3.5778989791870117, + "learning_rate": 8.370904718966035e-06, + "loss": 0.4768, + "step": 282540 + }, + { + "epoch": 2.49783412012235, + "grad_norm": 2.3839025497436523, + "learning_rate": 8.369431331294165e-06, + "loss": 0.5822, + "step": 282550 + }, + { + "epoch": 2.4979225233826625, + "grad_norm": 2.570106029510498, + "learning_rate": 8.367957943622294e-06, + "loss": 0.402, + "step": 282560 + }, + { + "epoch": 2.4980109266429746, + "grad_norm": 4.572789669036865, + "learning_rate": 8.366484555950424e-06, + "loss": 0.5835, + "step": 282570 + }, + { + "epoch": 2.4980993299032868, + "grad_norm": 2.415663003921509, + "learning_rate": 8.365011168278552e-06, + "loss": 0.683, + "step": 282580 + }, + { + "epoch": 2.498187733163599, + "grad_norm": 1.1543906927108765, + "learning_rate": 8.363537780606682e-06, + "loss": 0.4975, + "step": 282590 + }, + { + "epoch": 2.4982761364239114, + "grad_norm": 4.541924953460693, + "learning_rate": 8.362064392934812e-06, + "loss": 0.5249, + "step": 282600 + }, + { + "epoch": 2.4983645396842236, + "grad_norm": 6.1519880294799805, + "learning_rate": 8.36059100526294e-06, + "loss": 0.402, + "step": 282610 + }, + { + "epoch": 2.4984529429445357, + "grad_norm": 11.62427043914795, + "learning_rate": 8.35911761759107e-06, + "loss": 0.5583, + "step": 282620 + }, + { + "epoch": 2.4985413462048482, + "grad_norm": 6.399318218231201, + "learning_rate": 8.357644229919199e-06, + "loss": 0.4036, + "step": 282630 + }, + { + "epoch": 2.4986297494651604, + "grad_norm": 0.8110967874526978, + "learning_rate": 8.356170842247329e-06, + "loss": 0.6052, + "step": 282640 + }, + { + "epoch": 2.4987181527254725, + "grad_norm": 3.0520076751708984, + "learning_rate": 8.354697454575457e-06, + "loss": 0.649, + "step": 282650 + }, + { + "epoch": 2.4988065559857846, + "grad_norm": 2.5646777153015137, + "learning_rate": 8.353224066903587e-06, + "loss": 0.4811, + "step": 282660 + }, + { + "epoch": 2.498894959246097, + "grad_norm": 8.107065200805664, + "learning_rate": 8.351750679231716e-06, + "loss": 0.6034, + "step": 282670 + }, + { + "epoch": 2.4989833625064093, + "grad_norm": 3.6549158096313477, + "learning_rate": 8.350277291559846e-06, + "loss": 0.5952, + "step": 282680 + }, + { + "epoch": 2.4990717657667214, + "grad_norm": 1.6305829286575317, + "learning_rate": 8.348803903887976e-06, + "loss": 0.5673, + "step": 282690 + }, + { + "epoch": 2.4991601690270335, + "grad_norm": 12.164896965026855, + "learning_rate": 8.347330516216104e-06, + "loss": 0.5102, + "step": 282700 + }, + { + "epoch": 2.499248572287346, + "grad_norm": 2.8028204441070557, + "learning_rate": 8.345857128544234e-06, + "loss": 0.5663, + "step": 282710 + }, + { + "epoch": 2.499336975547658, + "grad_norm": 0.9155920743942261, + "learning_rate": 8.344383740872364e-06, + "loss": 0.4575, + "step": 282720 + }, + { + "epoch": 2.4994253788079703, + "grad_norm": 5.100389003753662, + "learning_rate": 8.342910353200493e-06, + "loss": 0.4309, + "step": 282730 + }, + { + "epoch": 2.4995137820682825, + "grad_norm": 4.109800815582275, + "learning_rate": 8.341436965528623e-06, + "loss": 0.5228, + "step": 282740 + }, + { + "epoch": 2.499602185328595, + "grad_norm": 7.470064640045166, + "learning_rate": 8.339963577856753e-06, + "loss": 0.4917, + "step": 282750 + }, + { + "epoch": 2.499690588588907, + "grad_norm": 4.0781965255737305, + "learning_rate": 8.338490190184881e-06, + "loss": 0.6, + "step": 282760 + }, + { + "epoch": 2.4997789918492193, + "grad_norm": 2.2115156650543213, + "learning_rate": 8.337016802513011e-06, + "loss": 0.5375, + "step": 282770 + }, + { + "epoch": 2.499867395109532, + "grad_norm": 3.0898985862731934, + "learning_rate": 8.335543414841141e-06, + "loss": 0.3892, + "step": 282780 + }, + { + "epoch": 2.499955798369844, + "grad_norm": 9.893712997436523, + "learning_rate": 8.33407002716927e-06, + "loss": 0.5571, + "step": 282790 + }, + { + "epoch": 2.500044201630156, + "grad_norm": 4.688666820526123, + "learning_rate": 8.3325966394974e-06, + "loss": 0.4231, + "step": 282800 + }, + { + "epoch": 2.500132604890468, + "grad_norm": 0.885964572429657, + "learning_rate": 8.331123251825528e-06, + "loss": 0.5584, + "step": 282810 + }, + { + "epoch": 2.5002210081507807, + "grad_norm": 13.84163761138916, + "learning_rate": 8.329649864153658e-06, + "loss": 0.5816, + "step": 282820 + }, + { + "epoch": 2.500309411411093, + "grad_norm": 9.960810661315918, + "learning_rate": 8.328176476481786e-06, + "loss": 0.5707, + "step": 282830 + }, + { + "epoch": 2.500397814671405, + "grad_norm": 0.9061553478240967, + "learning_rate": 8.326703088809916e-06, + "loss": 0.4889, + "step": 282840 + }, + { + "epoch": 2.5004862179317175, + "grad_norm": 1.2679194211959839, + "learning_rate": 8.325229701138045e-06, + "loss": 0.5325, + "step": 282850 + }, + { + "epoch": 2.5005746211920297, + "grad_norm": 1.1562618017196655, + "learning_rate": 8.323756313466175e-06, + "loss": 0.5139, + "step": 282860 + }, + { + "epoch": 2.500663024452342, + "grad_norm": 3.4108707904815674, + "learning_rate": 8.322282925794303e-06, + "loss": 0.4333, + "step": 282870 + }, + { + "epoch": 2.500751427712654, + "grad_norm": 2.0151000022888184, + "learning_rate": 8.320809538122433e-06, + "loss": 0.505, + "step": 282880 + }, + { + "epoch": 2.500839830972966, + "grad_norm": 1.5522539615631104, + "learning_rate": 8.319336150450563e-06, + "loss": 0.3912, + "step": 282890 + }, + { + "epoch": 2.5009282342332786, + "grad_norm": 2.103593349456787, + "learning_rate": 8.317862762778692e-06, + "loss": 0.4548, + "step": 282900 + }, + { + "epoch": 2.5010166374935907, + "grad_norm": 1.4984196424484253, + "learning_rate": 8.316389375106822e-06, + "loss": 0.4413, + "step": 282910 + }, + { + "epoch": 2.5011050407539033, + "grad_norm": 1.7137110233306885, + "learning_rate": 8.31491598743495e-06, + "loss": 0.3275, + "step": 282920 + }, + { + "epoch": 2.5011934440142154, + "grad_norm": 6.976639270782471, + "learning_rate": 8.31344259976308e-06, + "loss": 0.5531, + "step": 282930 + }, + { + "epoch": 2.5012818472745275, + "grad_norm": 2.3462517261505127, + "learning_rate": 8.311969212091208e-06, + "loss": 0.5615, + "step": 282940 + }, + { + "epoch": 2.5013702505348396, + "grad_norm": 8.246173858642578, + "learning_rate": 8.310495824419338e-06, + "loss": 0.5603, + "step": 282950 + }, + { + "epoch": 2.5014586537951518, + "grad_norm": 5.574619293212891, + "learning_rate": 8.309022436747467e-06, + "loss": 0.5714, + "step": 282960 + }, + { + "epoch": 2.5015470570554643, + "grad_norm": 3.404309034347534, + "learning_rate": 8.307549049075597e-06, + "loss": 0.5309, + "step": 282970 + }, + { + "epoch": 2.5016354603157764, + "grad_norm": 4.257323741912842, + "learning_rate": 8.306075661403727e-06, + "loss": 0.5545, + "step": 282980 + }, + { + "epoch": 2.5017238635760886, + "grad_norm": 3.8314976692199707, + "learning_rate": 8.304602273731855e-06, + "loss": 0.5734, + "step": 282990 + }, + { + "epoch": 2.501812266836401, + "grad_norm": 10.687641143798828, + "learning_rate": 8.303128886059985e-06, + "loss": 0.6306, + "step": 283000 + }, + { + "epoch": 2.5019006700967132, + "grad_norm": 6.344671249389648, + "learning_rate": 8.301655498388114e-06, + "loss": 0.5083, + "step": 283010 + }, + { + "epoch": 2.5019890733570254, + "grad_norm": 4.737275123596191, + "learning_rate": 8.300182110716244e-06, + "loss": 0.4685, + "step": 283020 + }, + { + "epoch": 2.5020774766173375, + "grad_norm": 2.0172455310821533, + "learning_rate": 8.298708723044372e-06, + "loss": 0.4736, + "step": 283030 + }, + { + "epoch": 2.50216587987765, + "grad_norm": 5.942038536071777, + "learning_rate": 8.297235335372502e-06, + "loss": 0.5753, + "step": 283040 + }, + { + "epoch": 2.502254283137962, + "grad_norm": 1.630398154258728, + "learning_rate": 8.29576194770063e-06, + "loss": 0.4383, + "step": 283050 + }, + { + "epoch": 2.5023426863982743, + "grad_norm": 17.377410888671875, + "learning_rate": 8.29428856002876e-06, + "loss": 0.569, + "step": 283060 + }, + { + "epoch": 2.502431089658587, + "grad_norm": 2.087979555130005, + "learning_rate": 8.29281517235689e-06, + "loss": 0.4753, + "step": 283070 + }, + { + "epoch": 2.502519492918899, + "grad_norm": 5.346923351287842, + "learning_rate": 8.291341784685019e-06, + "loss": 0.5571, + "step": 283080 + }, + { + "epoch": 2.502607896179211, + "grad_norm": 1.605104684829712, + "learning_rate": 8.289868397013149e-06, + "loss": 0.5224, + "step": 283090 + }, + { + "epoch": 2.502696299439523, + "grad_norm": 2.8299989700317383, + "learning_rate": 8.288395009341277e-06, + "loss": 0.6353, + "step": 283100 + }, + { + "epoch": 2.5027847026998353, + "grad_norm": 1.7810285091400146, + "learning_rate": 8.286921621669407e-06, + "loss": 0.4878, + "step": 283110 + }, + { + "epoch": 2.502873105960148, + "grad_norm": 3.4415059089660645, + "learning_rate": 8.285448233997536e-06, + "loss": 0.5406, + "step": 283120 + }, + { + "epoch": 2.50296150922046, + "grad_norm": 5.289724826812744, + "learning_rate": 8.283974846325666e-06, + "loss": 0.4746, + "step": 283130 + }, + { + "epoch": 2.503049912480772, + "grad_norm": 1.4514684677124023, + "learning_rate": 8.282501458653794e-06, + "loss": 0.4981, + "step": 283140 + }, + { + "epoch": 2.5031383157410847, + "grad_norm": 4.5988264083862305, + "learning_rate": 8.281028070981924e-06, + "loss": 0.4692, + "step": 283150 + }, + { + "epoch": 2.503226719001397, + "grad_norm": 3.5386507511138916, + "learning_rate": 8.279554683310054e-06, + "loss": 0.4639, + "step": 283160 + }, + { + "epoch": 2.503315122261709, + "grad_norm": 1.4415496587753296, + "learning_rate": 8.278081295638183e-06, + "loss": 0.3089, + "step": 283170 + }, + { + "epoch": 2.503403525522021, + "grad_norm": 2.738708972930908, + "learning_rate": 8.276607907966313e-06, + "loss": 0.6242, + "step": 283180 + }, + { + "epoch": 2.5034919287823336, + "grad_norm": 4.074089527130127, + "learning_rate": 8.275134520294443e-06, + "loss": 0.5879, + "step": 283190 + }, + { + "epoch": 2.5035803320426457, + "grad_norm": 5.190864086151123, + "learning_rate": 8.273661132622571e-06, + "loss": 0.6302, + "step": 283200 + }, + { + "epoch": 2.503668735302958, + "grad_norm": 7.544196605682373, + "learning_rate": 8.272187744950701e-06, + "loss": 0.5146, + "step": 283210 + }, + { + "epoch": 2.5037571385632704, + "grad_norm": 17.348711013793945, + "learning_rate": 8.270714357278831e-06, + "loss": 0.4906, + "step": 283220 + }, + { + "epoch": 2.5038455418235825, + "grad_norm": 1.9198873043060303, + "learning_rate": 8.26924096960696e-06, + "loss": 0.4977, + "step": 283230 + }, + { + "epoch": 2.5039339450838947, + "grad_norm": 3.754368782043457, + "learning_rate": 8.26776758193509e-06, + "loss": 0.344, + "step": 283240 + }, + { + "epoch": 2.504022348344207, + "grad_norm": 13.282536506652832, + "learning_rate": 8.26629419426322e-06, + "loss": 0.5045, + "step": 283250 + }, + { + "epoch": 2.504110751604519, + "grad_norm": 5.047104358673096, + "learning_rate": 8.264820806591348e-06, + "loss": 0.4723, + "step": 283260 + }, + { + "epoch": 2.5041991548648315, + "grad_norm": 14.395648956298828, + "learning_rate": 8.263347418919478e-06, + "loss": 0.4171, + "step": 283270 + }, + { + "epoch": 2.5042875581251436, + "grad_norm": 1.1836960315704346, + "learning_rate": 8.261874031247606e-06, + "loss": 0.493, + "step": 283280 + }, + { + "epoch": 2.504375961385456, + "grad_norm": 4.023506164550781, + "learning_rate": 8.260400643575736e-06, + "loss": 0.5561, + "step": 283290 + }, + { + "epoch": 2.5044643646457683, + "grad_norm": 4.2190728187561035, + "learning_rate": 8.258927255903865e-06, + "loss": 0.5494, + "step": 283300 + }, + { + "epoch": 2.5045527679060804, + "grad_norm": 3.6400537490844727, + "learning_rate": 8.257453868231995e-06, + "loss": 0.619, + "step": 283310 + }, + { + "epoch": 2.5046411711663925, + "grad_norm": 3.3298096656799316, + "learning_rate": 8.255980480560123e-06, + "loss": 0.4232, + "step": 283320 + }, + { + "epoch": 2.5047295744267046, + "grad_norm": 12.253691673278809, + "learning_rate": 8.254507092888253e-06, + "loss": 0.5643, + "step": 283330 + }, + { + "epoch": 2.504817977687017, + "grad_norm": 5.061431407928467, + "learning_rate": 8.253033705216382e-06, + "loss": 0.5763, + "step": 283340 + }, + { + "epoch": 2.5049063809473293, + "grad_norm": 1.220438838005066, + "learning_rate": 8.251560317544512e-06, + "loss": 0.5573, + "step": 283350 + }, + { + "epoch": 2.5049947842076414, + "grad_norm": 1.6105468273162842, + "learning_rate": 8.250086929872642e-06, + "loss": 0.4327, + "step": 283360 + }, + { + "epoch": 2.505083187467954, + "grad_norm": 1.299250841140747, + "learning_rate": 8.24861354220077e-06, + "loss": 0.4937, + "step": 283370 + }, + { + "epoch": 2.505171590728266, + "grad_norm": 9.718131065368652, + "learning_rate": 8.2471401545289e-06, + "loss": 0.5688, + "step": 283380 + }, + { + "epoch": 2.5052599939885782, + "grad_norm": 4.985573768615723, + "learning_rate": 8.245666766857028e-06, + "loss": 0.5351, + "step": 283390 + }, + { + "epoch": 2.5053483972488904, + "grad_norm": 1.2967532873153687, + "learning_rate": 8.244193379185158e-06, + "loss": 0.5316, + "step": 283400 + }, + { + "epoch": 2.505436800509203, + "grad_norm": 3.4755115509033203, + "learning_rate": 8.242719991513287e-06, + "loss": 0.4693, + "step": 283410 + }, + { + "epoch": 2.505525203769515, + "grad_norm": 9.113896369934082, + "learning_rate": 8.241246603841417e-06, + "loss": 0.6466, + "step": 283420 + }, + { + "epoch": 2.505613607029827, + "grad_norm": 3.6700973510742188, + "learning_rate": 8.239773216169545e-06, + "loss": 0.6219, + "step": 283430 + }, + { + "epoch": 2.5057020102901397, + "grad_norm": 1.2556768655776978, + "learning_rate": 8.238299828497675e-06, + "loss": 0.4534, + "step": 283440 + }, + { + "epoch": 2.505790413550452, + "grad_norm": 6.134596347808838, + "learning_rate": 8.236826440825805e-06, + "loss": 0.5842, + "step": 283450 + }, + { + "epoch": 2.505878816810764, + "grad_norm": 7.759485721588135, + "learning_rate": 8.235353053153934e-06, + "loss": 0.4965, + "step": 283460 + }, + { + "epoch": 2.505967220071076, + "grad_norm": 11.855794906616211, + "learning_rate": 8.233879665482064e-06, + "loss": 0.524, + "step": 283470 + }, + { + "epoch": 2.506055623331388, + "grad_norm": 6.993310928344727, + "learning_rate": 8.232406277810192e-06, + "loss": 0.6009, + "step": 283480 + }, + { + "epoch": 2.5061440265917008, + "grad_norm": 3.4013047218322754, + "learning_rate": 8.230932890138322e-06, + "loss": 0.4513, + "step": 283490 + }, + { + "epoch": 2.506232429852013, + "grad_norm": 4.600027561187744, + "learning_rate": 8.22945950246645e-06, + "loss": 0.4225, + "step": 283500 + }, + { + "epoch": 2.5063208331123255, + "grad_norm": 3.4561259746551514, + "learning_rate": 8.22798611479458e-06, + "loss": 0.4706, + "step": 283510 + }, + { + "epoch": 2.5064092363726376, + "grad_norm": 3.264570474624634, + "learning_rate": 8.226512727122709e-06, + "loss": 0.3816, + "step": 283520 + }, + { + "epoch": 2.5064976396329497, + "grad_norm": 4.131143093109131, + "learning_rate": 8.225039339450839e-06, + "loss": 0.6047, + "step": 283530 + }, + { + "epoch": 2.506586042893262, + "grad_norm": 3.4034388065338135, + "learning_rate": 8.223565951778969e-06, + "loss": 0.5166, + "step": 283540 + }, + { + "epoch": 2.506674446153574, + "grad_norm": 5.570876598358154, + "learning_rate": 8.222092564107097e-06, + "loss": 0.5245, + "step": 283550 + }, + { + "epoch": 2.5067628494138865, + "grad_norm": 4.567625522613525, + "learning_rate": 8.220619176435227e-06, + "loss": 0.4906, + "step": 283560 + }, + { + "epoch": 2.5068512526741986, + "grad_norm": 1.4228917360305786, + "learning_rate": 8.219145788763356e-06, + "loss": 0.4843, + "step": 283570 + }, + { + "epoch": 2.5069396559345107, + "grad_norm": 0.7783921957015991, + "learning_rate": 8.217672401091486e-06, + "loss": 0.4324, + "step": 283580 + }, + { + "epoch": 2.5070280591948233, + "grad_norm": 4.211084842681885, + "learning_rate": 8.216199013419614e-06, + "loss": 0.6621, + "step": 283590 + }, + { + "epoch": 2.5071164624551354, + "grad_norm": 17.209699630737305, + "learning_rate": 8.214725625747744e-06, + "loss": 0.4499, + "step": 283600 + }, + { + "epoch": 2.5072048657154475, + "grad_norm": 3.6773178577423096, + "learning_rate": 8.213252238075873e-06, + "loss": 0.4504, + "step": 283610 + }, + { + "epoch": 2.5072932689757597, + "grad_norm": 14.957329750061035, + "learning_rate": 8.211778850404003e-06, + "loss": 0.5655, + "step": 283620 + }, + { + "epoch": 2.5073816722360722, + "grad_norm": 8.335701942443848, + "learning_rate": 8.210305462732133e-06, + "loss": 0.5744, + "step": 283630 + }, + { + "epoch": 2.5074700754963843, + "grad_norm": 2.501271963119507, + "learning_rate": 8.208832075060263e-06, + "loss": 0.558, + "step": 283640 + }, + { + "epoch": 2.5075584787566965, + "grad_norm": 4.401176452636719, + "learning_rate": 8.207358687388391e-06, + "loss": 0.4861, + "step": 283650 + }, + { + "epoch": 2.507646882017009, + "grad_norm": 9.325634956359863, + "learning_rate": 8.205885299716521e-06, + "loss": 0.6177, + "step": 283660 + }, + { + "epoch": 2.507735285277321, + "grad_norm": 0.9984008073806763, + "learning_rate": 8.204411912044651e-06, + "loss": 0.4262, + "step": 283670 + }, + { + "epoch": 2.5078236885376333, + "grad_norm": 9.274428367614746, + "learning_rate": 8.20293852437278e-06, + "loss": 0.5993, + "step": 283680 + }, + { + "epoch": 2.5079120917979454, + "grad_norm": 20.13018798828125, + "learning_rate": 8.20146513670091e-06, + "loss": 0.48, + "step": 283690 + }, + { + "epoch": 2.5080004950582575, + "grad_norm": 5.627990245819092, + "learning_rate": 8.199991749029038e-06, + "loss": 0.4401, + "step": 283700 + }, + { + "epoch": 2.50808889831857, + "grad_norm": 9.396977424621582, + "learning_rate": 8.198518361357168e-06, + "loss": 0.5588, + "step": 283710 + }, + { + "epoch": 2.508177301578882, + "grad_norm": 0.932692289352417, + "learning_rate": 8.197044973685298e-06, + "loss": 0.5101, + "step": 283720 + }, + { + "epoch": 2.5082657048391943, + "grad_norm": 1.281574010848999, + "learning_rate": 8.195571586013426e-06, + "loss": 0.4129, + "step": 283730 + }, + { + "epoch": 2.508354108099507, + "grad_norm": 5.273996353149414, + "learning_rate": 8.194098198341556e-06, + "loss": 0.5173, + "step": 283740 + }, + { + "epoch": 2.508442511359819, + "grad_norm": 5.139627456665039, + "learning_rate": 8.192624810669685e-06, + "loss": 0.4489, + "step": 283750 + }, + { + "epoch": 2.508530914620131, + "grad_norm": 3.6157968044281006, + "learning_rate": 8.191151422997815e-06, + "loss": 0.5323, + "step": 283760 + }, + { + "epoch": 2.5086193178804432, + "grad_norm": 3.942720890045166, + "learning_rate": 8.189678035325943e-06, + "loss": 0.6934, + "step": 283770 + }, + { + "epoch": 2.508707721140756, + "grad_norm": 3.5075747966766357, + "learning_rate": 8.188204647654073e-06, + "loss": 0.5104, + "step": 283780 + }, + { + "epoch": 2.508796124401068, + "grad_norm": 7.920588970184326, + "learning_rate": 8.186731259982202e-06, + "loss": 0.4049, + "step": 283790 + }, + { + "epoch": 2.50888452766138, + "grad_norm": 2.3003451824188232, + "learning_rate": 8.185257872310332e-06, + "loss": 0.401, + "step": 283800 + }, + { + "epoch": 2.5089729309216926, + "grad_norm": 5.230231285095215, + "learning_rate": 8.183784484638462e-06, + "loss": 0.6291, + "step": 283810 + }, + { + "epoch": 2.5090613341820047, + "grad_norm": 6.106650352478027, + "learning_rate": 8.18231109696659e-06, + "loss": 0.4799, + "step": 283820 + }, + { + "epoch": 2.509149737442317, + "grad_norm": 0.7861815690994263, + "learning_rate": 8.18083770929472e-06, + "loss": 0.4924, + "step": 283830 + }, + { + "epoch": 2.509238140702629, + "grad_norm": 11.70028305053711, + "learning_rate": 8.179364321622848e-06, + "loss": 0.493, + "step": 283840 + }, + { + "epoch": 2.509326543962941, + "grad_norm": 3.2741527557373047, + "learning_rate": 8.177890933950978e-06, + "loss": 0.5006, + "step": 283850 + }, + { + "epoch": 2.5094149472232536, + "grad_norm": 1.11196768283844, + "learning_rate": 8.176417546279107e-06, + "loss": 0.3608, + "step": 283860 + }, + { + "epoch": 2.5095033504835658, + "grad_norm": 2.5301461219787598, + "learning_rate": 8.174944158607237e-06, + "loss": 0.4348, + "step": 283870 + }, + { + "epoch": 2.5095917537438783, + "grad_norm": 3.492032289505005, + "learning_rate": 8.173470770935365e-06, + "loss": 0.5104, + "step": 283880 + }, + { + "epoch": 2.5096801570041904, + "grad_norm": 2.832463264465332, + "learning_rate": 8.171997383263495e-06, + "loss": 0.4884, + "step": 283890 + }, + { + "epoch": 2.5097685602645026, + "grad_norm": 2.433389186859131, + "learning_rate": 8.170523995591624e-06, + "loss": 0.3756, + "step": 283900 + }, + { + "epoch": 2.5098569635248147, + "grad_norm": 0.6511511206626892, + "learning_rate": 8.169050607919754e-06, + "loss": 0.6875, + "step": 283910 + }, + { + "epoch": 2.509945366785127, + "grad_norm": 4.971965312957764, + "learning_rate": 8.167577220247884e-06, + "loss": 0.5451, + "step": 283920 + }, + { + "epoch": 2.5100337700454394, + "grad_norm": 2.470003128051758, + "learning_rate": 8.166103832576012e-06, + "loss": 0.5767, + "step": 283930 + }, + { + "epoch": 2.5101221733057515, + "grad_norm": 4.662142276763916, + "learning_rate": 8.164630444904142e-06, + "loss": 0.4973, + "step": 283940 + }, + { + "epoch": 2.5102105765660636, + "grad_norm": 9.756182670593262, + "learning_rate": 8.16315705723227e-06, + "loss": 0.603, + "step": 283950 + }, + { + "epoch": 2.510298979826376, + "grad_norm": 12.652859687805176, + "learning_rate": 8.1616836695604e-06, + "loss": 0.5709, + "step": 283960 + }, + { + "epoch": 2.5103873830866883, + "grad_norm": 2.4602155685424805, + "learning_rate": 8.160210281888529e-06, + "loss": 0.5243, + "step": 283970 + }, + { + "epoch": 2.5104757863470004, + "grad_norm": 1.4728790521621704, + "learning_rate": 8.158736894216659e-06, + "loss": 0.5014, + "step": 283980 + }, + { + "epoch": 2.5105641896073125, + "grad_norm": 7.382372856140137, + "learning_rate": 8.157263506544787e-06, + "loss": 0.5392, + "step": 283990 + }, + { + "epoch": 2.510652592867625, + "grad_norm": 5.169096946716309, + "learning_rate": 8.155790118872917e-06, + "loss": 0.6105, + "step": 284000 + }, + { + "epoch": 2.510740996127937, + "grad_norm": 2.6902663707733154, + "learning_rate": 8.154316731201047e-06, + "loss": 0.5407, + "step": 284010 + }, + { + "epoch": 2.5108293993882493, + "grad_norm": 6.588034152984619, + "learning_rate": 8.152843343529176e-06, + "loss": 0.5278, + "step": 284020 + }, + { + "epoch": 2.510917802648562, + "grad_norm": 1.740822196006775, + "learning_rate": 8.151369955857306e-06, + "loss": 0.5932, + "step": 284030 + }, + { + "epoch": 2.511006205908874, + "grad_norm": 4.1499433517456055, + "learning_rate": 8.149896568185434e-06, + "loss": 0.584, + "step": 284040 + }, + { + "epoch": 2.511094609169186, + "grad_norm": 6.272344589233398, + "learning_rate": 8.148423180513564e-06, + "loss": 0.5823, + "step": 284050 + }, + { + "epoch": 2.5111830124294983, + "grad_norm": 7.822166919708252, + "learning_rate": 8.146949792841692e-06, + "loss": 0.4918, + "step": 284060 + }, + { + "epoch": 2.5112714156898104, + "grad_norm": 6.076910018920898, + "learning_rate": 8.145476405169823e-06, + "loss": 0.5657, + "step": 284070 + }, + { + "epoch": 2.511359818950123, + "grad_norm": 4.978733062744141, + "learning_rate": 8.144003017497953e-06, + "loss": 0.5785, + "step": 284080 + }, + { + "epoch": 2.511448222210435, + "grad_norm": 2.1968917846679688, + "learning_rate": 8.142529629826081e-06, + "loss": 0.4651, + "step": 284090 + }, + { + "epoch": 2.5115366254707476, + "grad_norm": 4.566445350646973, + "learning_rate": 8.141056242154211e-06, + "loss": 0.566, + "step": 284100 + }, + { + "epoch": 2.5116250287310597, + "grad_norm": 8.96408462524414, + "learning_rate": 8.139582854482341e-06, + "loss": 0.4638, + "step": 284110 + }, + { + "epoch": 2.511713431991372, + "grad_norm": 6.356683254241943, + "learning_rate": 8.13810946681047e-06, + "loss": 0.5142, + "step": 284120 + }, + { + "epoch": 2.511801835251684, + "grad_norm": 1.720367193222046, + "learning_rate": 8.1366360791386e-06, + "loss": 0.4977, + "step": 284130 + }, + { + "epoch": 2.511890238511996, + "grad_norm": 3.8178820610046387, + "learning_rate": 8.13516269146673e-06, + "loss": 0.6199, + "step": 284140 + }, + { + "epoch": 2.5119786417723087, + "grad_norm": 5.948451995849609, + "learning_rate": 8.133689303794858e-06, + "loss": 0.5244, + "step": 284150 + }, + { + "epoch": 2.512067045032621, + "grad_norm": 2.1877660751342773, + "learning_rate": 8.132215916122988e-06, + "loss": 0.5378, + "step": 284160 + }, + { + "epoch": 2.512155448292933, + "grad_norm": 3.7048399448394775, + "learning_rate": 8.130742528451116e-06, + "loss": 0.4832, + "step": 284170 + }, + { + "epoch": 2.5122438515532455, + "grad_norm": 6.498835563659668, + "learning_rate": 8.129269140779246e-06, + "loss": 0.5415, + "step": 284180 + }, + { + "epoch": 2.5123322548135576, + "grad_norm": 3.925696849822998, + "learning_rate": 8.127795753107376e-06, + "loss": 0.6067, + "step": 284190 + }, + { + "epoch": 2.5124206580738697, + "grad_norm": 1.812637209892273, + "learning_rate": 8.126322365435505e-06, + "loss": 0.4514, + "step": 284200 + }, + { + "epoch": 2.512509061334182, + "grad_norm": 20.243974685668945, + "learning_rate": 8.124848977763635e-06, + "loss": 0.4661, + "step": 284210 + }, + { + "epoch": 2.5125974645944944, + "grad_norm": 5.276867866516113, + "learning_rate": 8.123375590091763e-06, + "loss": 0.5968, + "step": 284220 + }, + { + "epoch": 2.5126858678548065, + "grad_norm": 5.121358871459961, + "learning_rate": 8.121902202419893e-06, + "loss": 0.5923, + "step": 284230 + }, + { + "epoch": 2.5127742711151186, + "grad_norm": 2.524695634841919, + "learning_rate": 8.120428814748021e-06, + "loss": 0.5763, + "step": 284240 + }, + { + "epoch": 2.512862674375431, + "grad_norm": 5.203725337982178, + "learning_rate": 8.118955427076152e-06, + "loss": 0.4668, + "step": 284250 + }, + { + "epoch": 2.5129510776357433, + "grad_norm": 0.9321063756942749, + "learning_rate": 8.11748203940428e-06, + "loss": 0.4734, + "step": 284260 + }, + { + "epoch": 2.5130394808960554, + "grad_norm": 7.609432697296143, + "learning_rate": 8.11600865173241e-06, + "loss": 0.4338, + "step": 284270 + }, + { + "epoch": 2.5131278841563676, + "grad_norm": 3.924804449081421, + "learning_rate": 8.11453526406054e-06, + "loss": 0.7087, + "step": 284280 + }, + { + "epoch": 2.5132162874166797, + "grad_norm": 3.7510783672332764, + "learning_rate": 8.113061876388668e-06, + "loss": 0.4606, + "step": 284290 + }, + { + "epoch": 2.5133046906769922, + "grad_norm": 11.888921737670898, + "learning_rate": 8.111588488716798e-06, + "loss": 0.4659, + "step": 284300 + }, + { + "epoch": 2.5133930939373044, + "grad_norm": 1.4243167638778687, + "learning_rate": 8.110115101044927e-06, + "loss": 0.4502, + "step": 284310 + }, + { + "epoch": 2.5134814971976165, + "grad_norm": 3.5701160430908203, + "learning_rate": 8.108641713373057e-06, + "loss": 0.4932, + "step": 284320 + }, + { + "epoch": 2.513569900457929, + "grad_norm": 5.150989055633545, + "learning_rate": 8.107168325701185e-06, + "loss": 0.661, + "step": 284330 + }, + { + "epoch": 2.513658303718241, + "grad_norm": 1.4589215517044067, + "learning_rate": 8.105694938029315e-06, + "loss": 0.4605, + "step": 284340 + }, + { + "epoch": 2.5137467069785533, + "grad_norm": 12.544736862182617, + "learning_rate": 8.104221550357444e-06, + "loss": 0.5533, + "step": 284350 + }, + { + "epoch": 2.5138351102388654, + "grad_norm": 1.611133337020874, + "learning_rate": 8.102748162685574e-06, + "loss": 0.3292, + "step": 284360 + }, + { + "epoch": 2.513923513499178, + "grad_norm": 1.8159013986587524, + "learning_rate": 8.101274775013704e-06, + "loss": 0.5363, + "step": 284370 + }, + { + "epoch": 2.51401191675949, + "grad_norm": 5.922914028167725, + "learning_rate": 8.099801387341832e-06, + "loss": 0.53, + "step": 284380 + }, + { + "epoch": 2.514100320019802, + "grad_norm": 6.474987983703613, + "learning_rate": 8.098327999669962e-06, + "loss": 0.5194, + "step": 284390 + }, + { + "epoch": 2.514188723280115, + "grad_norm": 4.41485071182251, + "learning_rate": 8.09685461199809e-06, + "loss": 0.5766, + "step": 284400 + }, + { + "epoch": 2.514277126540427, + "grad_norm": 5.353525638580322, + "learning_rate": 8.09538122432622e-06, + "loss": 0.5255, + "step": 284410 + }, + { + "epoch": 2.514365529800739, + "grad_norm": 1.9315181970596313, + "learning_rate": 8.093907836654349e-06, + "loss": 0.4591, + "step": 284420 + }, + { + "epoch": 2.514453933061051, + "grad_norm": 3.5911264419555664, + "learning_rate": 8.092434448982479e-06, + "loss": 0.4734, + "step": 284430 + }, + { + "epoch": 2.5145423363213633, + "grad_norm": 2.9769234657287598, + "learning_rate": 8.090961061310607e-06, + "loss": 0.461, + "step": 284440 + }, + { + "epoch": 2.514630739581676, + "grad_norm": 2.008563756942749, + "learning_rate": 8.089487673638737e-06, + "loss": 0.6257, + "step": 284450 + }, + { + "epoch": 2.514719142841988, + "grad_norm": 0.8366401195526123, + "learning_rate": 8.088014285966866e-06, + "loss": 0.5885, + "step": 284460 + }, + { + "epoch": 2.5148075461023005, + "grad_norm": 3.453355550765991, + "learning_rate": 8.086540898294996e-06, + "loss": 0.4628, + "step": 284470 + }, + { + "epoch": 2.5148959493626126, + "grad_norm": 17.409400939941406, + "learning_rate": 8.085067510623126e-06, + "loss": 0.4507, + "step": 284480 + }, + { + "epoch": 2.5149843526229247, + "grad_norm": 2.0561177730560303, + "learning_rate": 8.083594122951254e-06, + "loss": 0.4764, + "step": 284490 + }, + { + "epoch": 2.515072755883237, + "grad_norm": 22.6203556060791, + "learning_rate": 8.082120735279384e-06, + "loss": 0.6468, + "step": 284500 + }, + { + "epoch": 2.515161159143549, + "grad_norm": 3.133798122406006, + "learning_rate": 8.080647347607512e-06, + "loss": 0.6097, + "step": 284510 + }, + { + "epoch": 2.5152495624038615, + "grad_norm": 4.902829647064209, + "learning_rate": 8.079173959935643e-06, + "loss": 0.5495, + "step": 284520 + }, + { + "epoch": 2.5153379656641737, + "grad_norm": 5.009650230407715, + "learning_rate": 8.077700572263771e-06, + "loss": 0.4657, + "step": 284530 + }, + { + "epoch": 2.515426368924486, + "grad_norm": 4.245798587799072, + "learning_rate": 8.076227184591901e-06, + "loss": 0.5721, + "step": 284540 + }, + { + "epoch": 2.5155147721847984, + "grad_norm": 1.174284815788269, + "learning_rate": 8.074753796920031e-06, + "loss": 0.524, + "step": 284550 + }, + { + "epoch": 2.5156031754451105, + "grad_norm": 12.179656028747559, + "learning_rate": 8.07328040924816e-06, + "loss": 0.4809, + "step": 284560 + }, + { + "epoch": 2.5156915787054226, + "grad_norm": 4.143700122833252, + "learning_rate": 8.07180702157629e-06, + "loss": 0.457, + "step": 284570 + }, + { + "epoch": 2.5157799819657347, + "grad_norm": 9.005715370178223, + "learning_rate": 8.07033363390442e-06, + "loss": 0.4912, + "step": 284580 + }, + { + "epoch": 2.5158683852260473, + "grad_norm": 3.577296495437622, + "learning_rate": 8.068860246232548e-06, + "loss": 0.6759, + "step": 284590 + }, + { + "epoch": 2.5159567884863594, + "grad_norm": 5.947630405426025, + "learning_rate": 8.067386858560678e-06, + "loss": 0.6321, + "step": 284600 + }, + { + "epoch": 2.5160451917466715, + "grad_norm": 8.030614852905273, + "learning_rate": 8.065913470888808e-06, + "loss": 0.5423, + "step": 284610 + }, + { + "epoch": 2.516133595006984, + "grad_norm": 3.5088863372802734, + "learning_rate": 8.064440083216936e-06, + "loss": 0.5049, + "step": 284620 + }, + { + "epoch": 2.516221998267296, + "grad_norm": 2.9089581966400146, + "learning_rate": 8.062966695545066e-06, + "loss": 0.596, + "step": 284630 + }, + { + "epoch": 2.5163104015276083, + "grad_norm": 1.4804167747497559, + "learning_rate": 8.061493307873195e-06, + "loss": 0.7781, + "step": 284640 + }, + { + "epoch": 2.5163988047879204, + "grad_norm": 3.9879026412963867, + "learning_rate": 8.060019920201325e-06, + "loss": 0.4086, + "step": 284650 + }, + { + "epoch": 2.5164872080482326, + "grad_norm": 3.6423180103302, + "learning_rate": 8.058546532529455e-06, + "loss": 0.4023, + "step": 284660 + }, + { + "epoch": 2.516575611308545, + "grad_norm": 2.377091646194458, + "learning_rate": 8.057073144857583e-06, + "loss": 0.5209, + "step": 284670 + }, + { + "epoch": 2.5166640145688572, + "grad_norm": 4.763335227966309, + "learning_rate": 8.055599757185713e-06, + "loss": 0.56, + "step": 284680 + }, + { + "epoch": 2.51675241782917, + "grad_norm": 4.940949440002441, + "learning_rate": 8.054126369513841e-06, + "loss": 0.5894, + "step": 284690 + }, + { + "epoch": 2.516840821089482, + "grad_norm": 3.714641571044922, + "learning_rate": 8.052652981841972e-06, + "loss": 0.5532, + "step": 284700 + }, + { + "epoch": 2.516929224349794, + "grad_norm": 6.0090413093566895, + "learning_rate": 8.0511795941701e-06, + "loss": 0.5679, + "step": 284710 + }, + { + "epoch": 2.517017627610106, + "grad_norm": 3.608367443084717, + "learning_rate": 8.04970620649823e-06, + "loss": 0.582, + "step": 284720 + }, + { + "epoch": 2.5171060308704183, + "grad_norm": 4.069309234619141, + "learning_rate": 8.048232818826358e-06, + "loss": 0.581, + "step": 284730 + }, + { + "epoch": 2.517194434130731, + "grad_norm": 9.374773979187012, + "learning_rate": 8.046759431154488e-06, + "loss": 0.547, + "step": 284740 + }, + { + "epoch": 2.517282837391043, + "grad_norm": 3.41222882270813, + "learning_rate": 8.045286043482618e-06, + "loss": 0.6445, + "step": 284750 + }, + { + "epoch": 2.517371240651355, + "grad_norm": 6.10033655166626, + "learning_rate": 8.043812655810747e-06, + "loss": 0.47, + "step": 284760 + }, + { + "epoch": 2.5174596439116677, + "grad_norm": 2.366708517074585, + "learning_rate": 8.042339268138877e-06, + "loss": 0.5707, + "step": 284770 + }, + { + "epoch": 2.5175480471719798, + "grad_norm": 2.246891736984253, + "learning_rate": 8.040865880467005e-06, + "loss": 0.4576, + "step": 284780 + }, + { + "epoch": 2.517636450432292, + "grad_norm": 4.519936561584473, + "learning_rate": 8.039392492795135e-06, + "loss": 0.598, + "step": 284790 + }, + { + "epoch": 2.517724853692604, + "grad_norm": 2.941627025604248, + "learning_rate": 8.037919105123264e-06, + "loss": 0.4874, + "step": 284800 + }, + { + "epoch": 2.5178132569529166, + "grad_norm": 3.6706650257110596, + "learning_rate": 8.036445717451394e-06, + "loss": 0.5025, + "step": 284810 + }, + { + "epoch": 2.5179016602132287, + "grad_norm": 2.8622887134552, + "learning_rate": 8.034972329779522e-06, + "loss": 0.4728, + "step": 284820 + }, + { + "epoch": 2.517990063473541, + "grad_norm": 3.3458313941955566, + "learning_rate": 8.033498942107652e-06, + "loss": 0.6303, + "step": 284830 + }, + { + "epoch": 2.5180784667338534, + "grad_norm": 1.51913583278656, + "learning_rate": 8.032025554435782e-06, + "loss": 0.524, + "step": 284840 + }, + { + "epoch": 2.5181668699941655, + "grad_norm": 6.185166835784912, + "learning_rate": 8.03055216676391e-06, + "loss": 0.5087, + "step": 284850 + }, + { + "epoch": 2.5182552732544776, + "grad_norm": 5.479124546051025, + "learning_rate": 8.02907877909204e-06, + "loss": 0.4844, + "step": 284860 + }, + { + "epoch": 2.5183436765147897, + "grad_norm": 4.263346195220947, + "learning_rate": 8.027605391420169e-06, + "loss": 0.5544, + "step": 284870 + }, + { + "epoch": 2.518432079775102, + "grad_norm": 4.35559606552124, + "learning_rate": 8.026132003748299e-06, + "loss": 0.6066, + "step": 284880 + }, + { + "epoch": 2.5185204830354144, + "grad_norm": 2.8270905017852783, + "learning_rate": 8.024658616076427e-06, + "loss": 0.4988, + "step": 284890 + }, + { + "epoch": 2.5186088862957265, + "grad_norm": 15.642151832580566, + "learning_rate": 8.023185228404557e-06, + "loss": 0.5929, + "step": 284900 + }, + { + "epoch": 2.5186972895560387, + "grad_norm": 2.0456905364990234, + "learning_rate": 8.021711840732686e-06, + "loss": 0.4333, + "step": 284910 + }, + { + "epoch": 2.5187856928163512, + "grad_norm": 5.191150188446045, + "learning_rate": 8.020238453060816e-06, + "loss": 0.6823, + "step": 284920 + }, + { + "epoch": 2.5188740960766633, + "grad_norm": 3.9505081176757812, + "learning_rate": 8.018765065388946e-06, + "loss": 0.5598, + "step": 284930 + }, + { + "epoch": 2.5189624993369755, + "grad_norm": 3.269458055496216, + "learning_rate": 8.017291677717074e-06, + "loss": 0.4647, + "step": 284940 + }, + { + "epoch": 2.5190509025972876, + "grad_norm": 2.646437406539917, + "learning_rate": 8.015818290045204e-06, + "loss": 0.3421, + "step": 284950 + }, + { + "epoch": 2.5191393058576, + "grad_norm": 23.99907875061035, + "learning_rate": 8.014344902373332e-06, + "loss": 0.52, + "step": 284960 + }, + { + "epoch": 2.5192277091179123, + "grad_norm": 8.771585464477539, + "learning_rate": 8.012871514701462e-06, + "loss": 0.4703, + "step": 284970 + }, + { + "epoch": 2.5193161123782244, + "grad_norm": 3.865586042404175, + "learning_rate": 8.01139812702959e-06, + "loss": 0.6166, + "step": 284980 + }, + { + "epoch": 2.519404515638537, + "grad_norm": 3.621429443359375, + "learning_rate": 8.009924739357721e-06, + "loss": 0.6008, + "step": 284990 + }, + { + "epoch": 2.519492918898849, + "grad_norm": 13.97508716583252, + "learning_rate": 8.00845135168585e-06, + "loss": 0.4748, + "step": 285000 + }, + { + "epoch": 2.519581322159161, + "grad_norm": 1.4926892518997192, + "learning_rate": 8.00697796401398e-06, + "loss": 0.4363, + "step": 285010 + }, + { + "epoch": 2.5196697254194733, + "grad_norm": 5.335996150970459, + "learning_rate": 8.00550457634211e-06, + "loss": 0.5223, + "step": 285020 + }, + { + "epoch": 2.5197581286797854, + "grad_norm": 5.187483310699463, + "learning_rate": 8.004031188670238e-06, + "loss": 0.6092, + "step": 285030 + }, + { + "epoch": 2.519846531940098, + "grad_norm": 9.49863338470459, + "learning_rate": 8.002557800998368e-06, + "loss": 0.5314, + "step": 285040 + }, + { + "epoch": 2.51993493520041, + "grad_norm": 4.209654331207275, + "learning_rate": 8.001084413326498e-06, + "loss": 0.6126, + "step": 285050 + }, + { + "epoch": 2.5200233384607227, + "grad_norm": 16.442276000976562, + "learning_rate": 7.999611025654626e-06, + "loss": 0.5016, + "step": 285060 + }, + { + "epoch": 2.520111741721035, + "grad_norm": 1.5299516916275024, + "learning_rate": 7.998137637982756e-06, + "loss": 0.569, + "step": 285070 + }, + { + "epoch": 2.520200144981347, + "grad_norm": 6.598069190979004, + "learning_rate": 7.996664250310886e-06, + "loss": 0.5264, + "step": 285080 + }, + { + "epoch": 2.520288548241659, + "grad_norm": 17.544580459594727, + "learning_rate": 7.995190862639015e-06, + "loss": 0.5371, + "step": 285090 + }, + { + "epoch": 2.520376951501971, + "grad_norm": 2.8827285766601562, + "learning_rate": 7.993717474967145e-06, + "loss": 0.5558, + "step": 285100 + }, + { + "epoch": 2.5204653547622837, + "grad_norm": 8.586620330810547, + "learning_rate": 7.992244087295273e-06, + "loss": 0.4822, + "step": 285110 + }, + { + "epoch": 2.520553758022596, + "grad_norm": 2.4233059883117676, + "learning_rate": 7.990770699623403e-06, + "loss": 0.5946, + "step": 285120 + }, + { + "epoch": 2.520642161282908, + "grad_norm": 8.802680015563965, + "learning_rate": 7.989297311951533e-06, + "loss": 0.5165, + "step": 285130 + }, + { + "epoch": 2.5207305645432205, + "grad_norm": 6.78078556060791, + "learning_rate": 7.987823924279661e-06, + "loss": 0.4631, + "step": 285140 + }, + { + "epoch": 2.5208189678035327, + "grad_norm": 4.485081195831299, + "learning_rate": 7.986350536607791e-06, + "loss": 0.6652, + "step": 285150 + }, + { + "epoch": 2.5209073710638448, + "grad_norm": 3.837928533554077, + "learning_rate": 7.98487714893592e-06, + "loss": 0.531, + "step": 285160 + }, + { + "epoch": 2.520995774324157, + "grad_norm": 2.1815431118011475, + "learning_rate": 7.98340376126405e-06, + "loss": 0.5962, + "step": 285170 + }, + { + "epoch": 2.5210841775844695, + "grad_norm": 2.091980218887329, + "learning_rate": 7.981930373592178e-06, + "loss": 0.5645, + "step": 285180 + }, + { + "epoch": 2.5211725808447816, + "grad_norm": 3.2688543796539307, + "learning_rate": 7.980456985920308e-06, + "loss": 0.4998, + "step": 285190 + }, + { + "epoch": 2.5212609841050937, + "grad_norm": 7.320805072784424, + "learning_rate": 7.978983598248437e-06, + "loss": 0.5685, + "step": 285200 + }, + { + "epoch": 2.5213493873654063, + "grad_norm": 2.412719488143921, + "learning_rate": 7.977510210576567e-06, + "loss": 0.5135, + "step": 285210 + }, + { + "epoch": 2.5214377906257184, + "grad_norm": 8.243139266967773, + "learning_rate": 7.976036822904697e-06, + "loss": 0.5853, + "step": 285220 + }, + { + "epoch": 2.5215261938860305, + "grad_norm": 2.015704870223999, + "learning_rate": 7.974563435232825e-06, + "loss": 0.3513, + "step": 285230 + }, + { + "epoch": 2.5216145971463426, + "grad_norm": 5.223895072937012, + "learning_rate": 7.973090047560955e-06, + "loss": 0.3799, + "step": 285240 + }, + { + "epoch": 2.5217030004066547, + "grad_norm": 3.446650981903076, + "learning_rate": 7.971616659889083e-06, + "loss": 0.4942, + "step": 285250 + }, + { + "epoch": 2.5217914036669673, + "grad_norm": 9.74634838104248, + "learning_rate": 7.970143272217214e-06, + "loss": 0.4566, + "step": 285260 + }, + { + "epoch": 2.5218798069272794, + "grad_norm": 4.906155586242676, + "learning_rate": 7.968669884545342e-06, + "loss": 0.4774, + "step": 285270 + }, + { + "epoch": 2.521968210187592, + "grad_norm": 7.329070568084717, + "learning_rate": 7.967196496873472e-06, + "loss": 0.5704, + "step": 285280 + }, + { + "epoch": 2.522056613447904, + "grad_norm": 1.9099117517471313, + "learning_rate": 7.9657231092016e-06, + "loss": 0.5253, + "step": 285290 + }, + { + "epoch": 2.5221450167082162, + "grad_norm": 4.758684158325195, + "learning_rate": 7.96424972152973e-06, + "loss": 0.5323, + "step": 285300 + }, + { + "epoch": 2.5222334199685283, + "grad_norm": 3.719243288040161, + "learning_rate": 7.96277633385786e-06, + "loss": 0.6457, + "step": 285310 + }, + { + "epoch": 2.5223218232288405, + "grad_norm": 2.3642401695251465, + "learning_rate": 7.961302946185989e-06, + "loss": 0.3831, + "step": 285320 + }, + { + "epoch": 2.522410226489153, + "grad_norm": 3.843763828277588, + "learning_rate": 7.959829558514119e-06, + "loss": 0.5901, + "step": 285330 + }, + { + "epoch": 2.522498629749465, + "grad_norm": 4.994195461273193, + "learning_rate": 7.958356170842247e-06, + "loss": 0.5765, + "step": 285340 + }, + { + "epoch": 2.5225870330097773, + "grad_norm": 1.0104937553405762, + "learning_rate": 7.956882783170377e-06, + "loss": 0.3789, + "step": 285350 + }, + { + "epoch": 2.52267543627009, + "grad_norm": 1.8495179414749146, + "learning_rate": 7.955409395498506e-06, + "loss": 0.5472, + "step": 285360 + }, + { + "epoch": 2.522763839530402, + "grad_norm": 3.515901565551758, + "learning_rate": 7.953936007826636e-06, + "loss": 0.6433, + "step": 285370 + }, + { + "epoch": 2.522852242790714, + "grad_norm": 0.8285255432128906, + "learning_rate": 7.952462620154764e-06, + "loss": 0.4206, + "step": 285380 + }, + { + "epoch": 2.522940646051026, + "grad_norm": 3.3500874042510986, + "learning_rate": 7.950989232482894e-06, + "loss": 0.5853, + "step": 285390 + }, + { + "epoch": 2.5230290493113388, + "grad_norm": 5.859566688537598, + "learning_rate": 7.949515844811024e-06, + "loss": 0.5792, + "step": 285400 + }, + { + "epoch": 2.523117452571651, + "grad_norm": 9.180251121520996, + "learning_rate": 7.948042457139152e-06, + "loss": 0.5273, + "step": 285410 + }, + { + "epoch": 2.523205855831963, + "grad_norm": 6.887299060821533, + "learning_rate": 7.946569069467282e-06, + "loss": 0.5157, + "step": 285420 + }, + { + "epoch": 2.5232942590922756, + "grad_norm": 4.658280849456787, + "learning_rate": 7.94509568179541e-06, + "loss": 0.5096, + "step": 285430 + }, + { + "epoch": 2.5233826623525877, + "grad_norm": 3.90352725982666, + "learning_rate": 7.943622294123541e-06, + "loss": 0.5336, + "step": 285440 + }, + { + "epoch": 2.5234710656129, + "grad_norm": 2.142873764038086, + "learning_rate": 7.94214890645167e-06, + "loss": 0.5682, + "step": 285450 + }, + { + "epoch": 2.523559468873212, + "grad_norm": 2.315516710281372, + "learning_rate": 7.9406755187798e-06, + "loss": 0.5122, + "step": 285460 + }, + { + "epoch": 2.523647872133524, + "grad_norm": 1.7395604848861694, + "learning_rate": 7.939202131107928e-06, + "loss": 0.4801, + "step": 285470 + }, + { + "epoch": 2.5237362753938366, + "grad_norm": 3.2431488037109375, + "learning_rate": 7.937728743436058e-06, + "loss": 0.4399, + "step": 285480 + }, + { + "epoch": 2.5238246786541487, + "grad_norm": 1.265762448310852, + "learning_rate": 7.936255355764188e-06, + "loss": 0.4416, + "step": 285490 + }, + { + "epoch": 2.523913081914461, + "grad_norm": 1.471287488937378, + "learning_rate": 7.934781968092316e-06, + "loss": 0.4619, + "step": 285500 + }, + { + "epoch": 2.5240014851747734, + "grad_norm": 2.2483925819396973, + "learning_rate": 7.933308580420446e-06, + "loss": 0.5165, + "step": 285510 + }, + { + "epoch": 2.5240898884350855, + "grad_norm": 1.1861714124679565, + "learning_rate": 7.931835192748576e-06, + "loss": 0.4951, + "step": 285520 + }, + { + "epoch": 2.5241782916953976, + "grad_norm": 16.745702743530273, + "learning_rate": 7.930361805076705e-06, + "loss": 0.5658, + "step": 285530 + }, + { + "epoch": 2.5242666949557098, + "grad_norm": 1.836665391921997, + "learning_rate": 7.928888417404835e-06, + "loss": 0.4508, + "step": 285540 + }, + { + "epoch": 2.5243550982160223, + "grad_norm": 5.893899917602539, + "learning_rate": 7.927415029732965e-06, + "loss": 0.5711, + "step": 285550 + }, + { + "epoch": 2.5244435014763345, + "grad_norm": 6.721429347991943, + "learning_rate": 7.925941642061093e-06, + "loss": 0.4565, + "step": 285560 + }, + { + "epoch": 2.5245319047366466, + "grad_norm": 3.507948637008667, + "learning_rate": 7.924468254389223e-06, + "loss": 0.5966, + "step": 285570 + }, + { + "epoch": 2.524620307996959, + "grad_norm": 2.5289061069488525, + "learning_rate": 7.922994866717351e-06, + "loss": 0.4455, + "step": 285580 + }, + { + "epoch": 2.5247087112572713, + "grad_norm": 2.543667793273926, + "learning_rate": 7.921521479045481e-06, + "loss": 0.5112, + "step": 285590 + }, + { + "epoch": 2.5247971145175834, + "grad_norm": 10.163717269897461, + "learning_rate": 7.920048091373611e-06, + "loss": 0.5506, + "step": 285600 + }, + { + "epoch": 2.5248855177778955, + "grad_norm": 5.831507682800293, + "learning_rate": 7.91857470370174e-06, + "loss": 0.5095, + "step": 285610 + }, + { + "epoch": 2.5249739210382076, + "grad_norm": 4.267759799957275, + "learning_rate": 7.91710131602987e-06, + "loss": 0.5943, + "step": 285620 + }, + { + "epoch": 2.52506232429852, + "grad_norm": 2.155897378921509, + "learning_rate": 7.915627928357998e-06, + "loss": 0.4329, + "step": 285630 + }, + { + "epoch": 2.5251507275588323, + "grad_norm": 3.509528875350952, + "learning_rate": 7.914154540686128e-06, + "loss": 0.5387, + "step": 285640 + }, + { + "epoch": 2.525239130819145, + "grad_norm": 19.964876174926758, + "learning_rate": 7.912681153014257e-06, + "loss": 0.4495, + "step": 285650 + }, + { + "epoch": 2.525327534079457, + "grad_norm": 4.115735054016113, + "learning_rate": 7.911207765342387e-06, + "loss": 0.6311, + "step": 285660 + }, + { + "epoch": 2.525415937339769, + "grad_norm": 6.249667644500732, + "learning_rate": 7.909734377670515e-06, + "loss": 0.5264, + "step": 285670 + }, + { + "epoch": 2.525504340600081, + "grad_norm": 2.703172206878662, + "learning_rate": 7.908260989998645e-06, + "loss": 0.4356, + "step": 285680 + }, + { + "epoch": 2.5255927438603933, + "grad_norm": 2.406681537628174, + "learning_rate": 7.906787602326775e-06, + "loss": 0.471, + "step": 285690 + }, + { + "epoch": 2.525681147120706, + "grad_norm": 21.97186851501465, + "learning_rate": 7.905314214654903e-06, + "loss": 0.5496, + "step": 285700 + }, + { + "epoch": 2.525769550381018, + "grad_norm": 2.309353828430176, + "learning_rate": 7.903840826983034e-06, + "loss": 0.6431, + "step": 285710 + }, + { + "epoch": 2.52585795364133, + "grad_norm": 3.2547459602355957, + "learning_rate": 7.902367439311162e-06, + "loss": 0.5702, + "step": 285720 + }, + { + "epoch": 2.5259463569016427, + "grad_norm": 2.398329019546509, + "learning_rate": 7.900894051639292e-06, + "loss": 0.6499, + "step": 285730 + }, + { + "epoch": 2.526034760161955, + "grad_norm": 2.045269727706909, + "learning_rate": 7.89942066396742e-06, + "loss": 0.555, + "step": 285740 + }, + { + "epoch": 2.526123163422267, + "grad_norm": 6.729933738708496, + "learning_rate": 7.89794727629555e-06, + "loss": 0.5394, + "step": 285750 + }, + { + "epoch": 2.526211566682579, + "grad_norm": 8.337625503540039, + "learning_rate": 7.896473888623679e-06, + "loss": 0.4679, + "step": 285760 + }, + { + "epoch": 2.5262999699428916, + "grad_norm": 2.296076536178589, + "learning_rate": 7.895000500951809e-06, + "loss": 0.484, + "step": 285770 + }, + { + "epoch": 2.5263883732032038, + "grad_norm": 1.688645839691162, + "learning_rate": 7.893527113279939e-06, + "loss": 0.5757, + "step": 285780 + }, + { + "epoch": 2.526476776463516, + "grad_norm": 3.2601144313812256, + "learning_rate": 7.892053725608067e-06, + "loss": 0.4907, + "step": 285790 + }, + { + "epoch": 2.5265651797238284, + "grad_norm": 8.00379753112793, + "learning_rate": 7.890580337936197e-06, + "loss": 0.5225, + "step": 285800 + }, + { + "epoch": 2.5266535829841406, + "grad_norm": 6.364208698272705, + "learning_rate": 7.889106950264326e-06, + "loss": 0.5614, + "step": 285810 + }, + { + "epoch": 2.5267419862444527, + "grad_norm": 2.5358405113220215, + "learning_rate": 7.887633562592456e-06, + "loss": 0.458, + "step": 285820 + }, + { + "epoch": 2.526830389504765, + "grad_norm": 4.11875581741333, + "learning_rate": 7.886160174920584e-06, + "loss": 0.6205, + "step": 285830 + }, + { + "epoch": 2.526918792765077, + "grad_norm": 2.1603145599365234, + "learning_rate": 7.884686787248714e-06, + "loss": 0.4862, + "step": 285840 + }, + { + "epoch": 2.5270071960253895, + "grad_norm": 3.5929601192474365, + "learning_rate": 7.883213399576842e-06, + "loss": 0.6016, + "step": 285850 + }, + { + "epoch": 2.5270955992857016, + "grad_norm": 13.038309097290039, + "learning_rate": 7.881740011904972e-06, + "loss": 0.5915, + "step": 285860 + }, + { + "epoch": 2.527184002546014, + "grad_norm": 0.7464901208877563, + "learning_rate": 7.880266624233102e-06, + "loss": 0.4608, + "step": 285870 + }, + { + "epoch": 2.5272724058063263, + "grad_norm": 7.891518592834473, + "learning_rate": 7.87879323656123e-06, + "loss": 0.6646, + "step": 285880 + }, + { + "epoch": 2.5273608090666384, + "grad_norm": 5.050043106079102, + "learning_rate": 7.87731984888936e-06, + "loss": 0.4994, + "step": 285890 + }, + { + "epoch": 2.5274492123269505, + "grad_norm": 3.198244094848633, + "learning_rate": 7.87584646121749e-06, + "loss": 0.435, + "step": 285900 + }, + { + "epoch": 2.5275376155872626, + "grad_norm": 3.6682167053222656, + "learning_rate": 7.87437307354562e-06, + "loss": 0.4843, + "step": 285910 + }, + { + "epoch": 2.527626018847575, + "grad_norm": 1.3074642419815063, + "learning_rate": 7.872899685873748e-06, + "loss": 0.637, + "step": 285920 + }, + { + "epoch": 2.5277144221078873, + "grad_norm": 2.422226905822754, + "learning_rate": 7.871426298201878e-06, + "loss": 0.4879, + "step": 285930 + }, + { + "epoch": 2.5278028253681994, + "grad_norm": 3.34745192527771, + "learning_rate": 7.869952910530006e-06, + "loss": 0.5452, + "step": 285940 + }, + { + "epoch": 2.527891228628512, + "grad_norm": 5.619089603424072, + "learning_rate": 7.868479522858136e-06, + "loss": 0.52, + "step": 285950 + }, + { + "epoch": 2.527979631888824, + "grad_norm": 5.036675453186035, + "learning_rate": 7.867006135186266e-06, + "loss": 0.6164, + "step": 285960 + }, + { + "epoch": 2.5280680351491363, + "grad_norm": 2.5913846492767334, + "learning_rate": 7.865532747514396e-06, + "loss": 0.5135, + "step": 285970 + }, + { + "epoch": 2.5281564384094484, + "grad_norm": 4.11860466003418, + "learning_rate": 7.864059359842524e-06, + "loss": 0.5601, + "step": 285980 + }, + { + "epoch": 2.528244841669761, + "grad_norm": 2.692342519760132, + "learning_rate": 7.862585972170655e-06, + "loss": 0.6018, + "step": 285990 + }, + { + "epoch": 2.528333244930073, + "grad_norm": 1.7527639865875244, + "learning_rate": 7.861112584498785e-06, + "loss": 0.5279, + "step": 286000 + }, + { + "epoch": 2.528421648190385, + "grad_norm": 3.239560842514038, + "learning_rate": 7.859639196826913e-06, + "loss": 0.5221, + "step": 286010 + }, + { + "epoch": 2.5285100514506977, + "grad_norm": 9.328505516052246, + "learning_rate": 7.858165809155043e-06, + "loss": 0.444, + "step": 286020 + }, + { + "epoch": 2.52859845471101, + "grad_norm": 5.25129508972168, + "learning_rate": 7.856692421483171e-06, + "loss": 0.5425, + "step": 286030 + }, + { + "epoch": 2.528686857971322, + "grad_norm": 6.002930641174316, + "learning_rate": 7.855219033811301e-06, + "loss": 0.6988, + "step": 286040 + }, + { + "epoch": 2.528775261231634, + "grad_norm": 2.5140509605407715, + "learning_rate": 7.85374564613943e-06, + "loss": 0.5721, + "step": 286050 + }, + { + "epoch": 2.528863664491946, + "grad_norm": 3.058155059814453, + "learning_rate": 7.85227225846756e-06, + "loss": 0.4818, + "step": 286060 + }, + { + "epoch": 2.528952067752259, + "grad_norm": 1.0566374063491821, + "learning_rate": 7.85079887079569e-06, + "loss": 0.434, + "step": 286070 + }, + { + "epoch": 2.529040471012571, + "grad_norm": 2.33178973197937, + "learning_rate": 7.849325483123818e-06, + "loss": 0.6049, + "step": 286080 + }, + { + "epoch": 2.529128874272883, + "grad_norm": 5.536881923675537, + "learning_rate": 7.847852095451948e-06, + "loss": 0.5334, + "step": 286090 + }, + { + "epoch": 2.5292172775331956, + "grad_norm": 3.4606094360351562, + "learning_rate": 7.846378707780077e-06, + "loss": 0.5334, + "step": 286100 + }, + { + "epoch": 2.5293056807935077, + "grad_norm": 4.192465782165527, + "learning_rate": 7.844905320108207e-06, + "loss": 0.4137, + "step": 286110 + }, + { + "epoch": 2.52939408405382, + "grad_norm": 4.2886176109313965, + "learning_rate": 7.843431932436335e-06, + "loss": 0.5316, + "step": 286120 + }, + { + "epoch": 2.529482487314132, + "grad_norm": 6.275772571563721, + "learning_rate": 7.841958544764465e-06, + "loss": 0.6407, + "step": 286130 + }, + { + "epoch": 2.5295708905744445, + "grad_norm": 1.620099663734436, + "learning_rate": 7.840485157092593e-06, + "loss": 0.5061, + "step": 286140 + }, + { + "epoch": 2.5296592938347566, + "grad_norm": 6.006455421447754, + "learning_rate": 7.839011769420723e-06, + "loss": 0.6367, + "step": 286150 + }, + { + "epoch": 2.5297476970950687, + "grad_norm": 4.350001811981201, + "learning_rate": 7.837538381748853e-06, + "loss": 0.5886, + "step": 286160 + }, + { + "epoch": 2.5298361003553813, + "grad_norm": 2.814239740371704, + "learning_rate": 7.836064994076982e-06, + "loss": 0.4668, + "step": 286170 + }, + { + "epoch": 2.5299245036156934, + "grad_norm": 6.204505920410156, + "learning_rate": 7.834591606405112e-06, + "loss": 0.5431, + "step": 286180 + }, + { + "epoch": 2.5300129068760056, + "grad_norm": 1.5662897825241089, + "learning_rate": 7.83311821873324e-06, + "loss": 0.5734, + "step": 286190 + }, + { + "epoch": 2.5301013101363177, + "grad_norm": 3.9877796173095703, + "learning_rate": 7.83164483106137e-06, + "loss": 0.6215, + "step": 286200 + }, + { + "epoch": 2.53018971339663, + "grad_norm": 4.389085292816162, + "learning_rate": 7.830171443389499e-06, + "loss": 0.5789, + "step": 286210 + }, + { + "epoch": 2.5302781166569424, + "grad_norm": 0.9232724905014038, + "learning_rate": 7.828698055717629e-06, + "loss": 0.411, + "step": 286220 + }, + { + "epoch": 2.5303665199172545, + "grad_norm": 3.4917714595794678, + "learning_rate": 7.827224668045757e-06, + "loss": 0.6508, + "step": 286230 + }, + { + "epoch": 2.530454923177567, + "grad_norm": 4.707198619842529, + "learning_rate": 7.825751280373887e-06, + "loss": 0.5096, + "step": 286240 + }, + { + "epoch": 2.530543326437879, + "grad_norm": 1.6495981216430664, + "learning_rate": 7.824277892702017e-06, + "loss": 0.5024, + "step": 286250 + }, + { + "epoch": 2.5306317296981913, + "grad_norm": 9.021656036376953, + "learning_rate": 7.822804505030145e-06, + "loss": 0.5687, + "step": 286260 + }, + { + "epoch": 2.5307201329585034, + "grad_norm": 4.228960037231445, + "learning_rate": 7.821331117358276e-06, + "loss": 0.6378, + "step": 286270 + }, + { + "epoch": 2.5308085362188155, + "grad_norm": 2.770382881164551, + "learning_rate": 7.819857729686404e-06, + "loss": 0.4613, + "step": 286280 + }, + { + "epoch": 2.530896939479128, + "grad_norm": 1.8547618389129639, + "learning_rate": 7.818384342014534e-06, + "loss": 0.5107, + "step": 286290 + }, + { + "epoch": 2.53098534273944, + "grad_norm": 8.497518539428711, + "learning_rate": 7.816910954342662e-06, + "loss": 0.6147, + "step": 286300 + }, + { + "epoch": 2.5310737459997523, + "grad_norm": 7.653506755828857, + "learning_rate": 7.815437566670792e-06, + "loss": 0.529, + "step": 286310 + }, + { + "epoch": 2.531162149260065, + "grad_norm": 1.659049153327942, + "learning_rate": 7.81396417899892e-06, + "loss": 0.5495, + "step": 286320 + }, + { + "epoch": 2.531250552520377, + "grad_norm": 4.075590133666992, + "learning_rate": 7.81249079132705e-06, + "loss": 0.4711, + "step": 286330 + }, + { + "epoch": 2.531338955780689, + "grad_norm": 3.572044610977173, + "learning_rate": 7.81101740365518e-06, + "loss": 0.5107, + "step": 286340 + }, + { + "epoch": 2.5314273590410012, + "grad_norm": 2.2925355434417725, + "learning_rate": 7.80954401598331e-06, + "loss": 0.4846, + "step": 286350 + }, + { + "epoch": 2.531515762301314, + "grad_norm": 3.0930464267730713, + "learning_rate": 7.80807062831144e-06, + "loss": 0.58, + "step": 286360 + }, + { + "epoch": 2.531604165561626, + "grad_norm": 1.9412472248077393, + "learning_rate": 7.806597240639568e-06, + "loss": 0.6485, + "step": 286370 + }, + { + "epoch": 2.531692568821938, + "grad_norm": 6.316715240478516, + "learning_rate": 7.805123852967698e-06, + "loss": 0.4849, + "step": 286380 + }, + { + "epoch": 2.5317809720822506, + "grad_norm": 4.329470157623291, + "learning_rate": 7.803650465295826e-06, + "loss": 0.4672, + "step": 286390 + }, + { + "epoch": 2.5318693753425627, + "grad_norm": 5.358566761016846, + "learning_rate": 7.802177077623956e-06, + "loss": 0.5013, + "step": 286400 + }, + { + "epoch": 2.531957778602875, + "grad_norm": 11.09455394744873, + "learning_rate": 7.800703689952086e-06, + "loss": 0.4755, + "step": 286410 + }, + { + "epoch": 2.532046181863187, + "grad_norm": 4.014906406402588, + "learning_rate": 7.799230302280214e-06, + "loss": 0.5816, + "step": 286420 + }, + { + "epoch": 2.532134585123499, + "grad_norm": 4.507041931152344, + "learning_rate": 7.797756914608344e-06, + "loss": 0.4776, + "step": 286430 + }, + { + "epoch": 2.5322229883838117, + "grad_norm": 2.967893362045288, + "learning_rate": 7.796283526936475e-06, + "loss": 0.5361, + "step": 286440 + }, + { + "epoch": 2.5323113916441238, + "grad_norm": 1.9803436994552612, + "learning_rate": 7.794810139264603e-06, + "loss": 0.575, + "step": 286450 + }, + { + "epoch": 2.5323997949044363, + "grad_norm": 2.3964085578918457, + "learning_rate": 7.793336751592733e-06, + "loss": 0.597, + "step": 286460 + }, + { + "epoch": 2.5324881981647485, + "grad_norm": 2.447114944458008, + "learning_rate": 7.791863363920863e-06, + "loss": 0.6115, + "step": 286470 + }, + { + "epoch": 2.5325766014250606, + "grad_norm": 3.0418710708618164, + "learning_rate": 7.790389976248991e-06, + "loss": 0.5161, + "step": 286480 + }, + { + "epoch": 2.5326650046853727, + "grad_norm": 4.332446575164795, + "learning_rate": 7.788916588577121e-06, + "loss": 0.6536, + "step": 286490 + }, + { + "epoch": 2.532753407945685, + "grad_norm": 5.376768112182617, + "learning_rate": 7.78744320090525e-06, + "loss": 0.5026, + "step": 286500 + }, + { + "epoch": 2.5328418112059974, + "grad_norm": 2.551091432571411, + "learning_rate": 7.78596981323338e-06, + "loss": 0.5244, + "step": 286510 + }, + { + "epoch": 2.5329302144663095, + "grad_norm": 5.198960781097412, + "learning_rate": 7.78449642556151e-06, + "loss": 0.6733, + "step": 286520 + }, + { + "epoch": 2.5330186177266216, + "grad_norm": 8.839171409606934, + "learning_rate": 7.783023037889638e-06, + "loss": 0.4308, + "step": 286530 + }, + { + "epoch": 2.533107020986934, + "grad_norm": 1.6078214645385742, + "learning_rate": 7.781549650217768e-06, + "loss": 0.5614, + "step": 286540 + }, + { + "epoch": 2.5331954242472463, + "grad_norm": 2.688883066177368, + "learning_rate": 7.780076262545897e-06, + "loss": 0.5393, + "step": 286550 + }, + { + "epoch": 2.5332838275075584, + "grad_norm": 4.848813056945801, + "learning_rate": 7.778602874874027e-06, + "loss": 0.3664, + "step": 286560 + }, + { + "epoch": 2.5333722307678705, + "grad_norm": 6.28578519821167, + "learning_rate": 7.777129487202155e-06, + "loss": 0.4627, + "step": 286570 + }, + { + "epoch": 2.533460634028183, + "grad_norm": 7.091159343719482, + "learning_rate": 7.775656099530285e-06, + "loss": 0.6567, + "step": 286580 + }, + { + "epoch": 2.5335490372884952, + "grad_norm": 2.273583173751831, + "learning_rate": 7.774182711858413e-06, + "loss": 0.4541, + "step": 286590 + }, + { + "epoch": 2.5336374405488074, + "grad_norm": 3.016464948654175, + "learning_rate": 7.772709324186543e-06, + "loss": 0.4429, + "step": 286600 + }, + { + "epoch": 2.53372584380912, + "grad_norm": 2.592296600341797, + "learning_rate": 7.771235936514672e-06, + "loss": 0.4462, + "step": 286610 + }, + { + "epoch": 2.533814247069432, + "grad_norm": 10.184073448181152, + "learning_rate": 7.769762548842802e-06, + "loss": 0.5393, + "step": 286620 + }, + { + "epoch": 2.533902650329744, + "grad_norm": 7.26361083984375, + "learning_rate": 7.768289161170932e-06, + "loss": 0.483, + "step": 286630 + }, + { + "epoch": 2.5339910535900563, + "grad_norm": 1.7661869525909424, + "learning_rate": 7.76681577349906e-06, + "loss": 0.4898, + "step": 286640 + }, + { + "epoch": 2.5340794568503684, + "grad_norm": 3.3786630630493164, + "learning_rate": 7.76534238582719e-06, + "loss": 0.6268, + "step": 286650 + }, + { + "epoch": 2.534167860110681, + "grad_norm": 2.779499053955078, + "learning_rate": 7.763868998155319e-06, + "loss": 0.4831, + "step": 286660 + }, + { + "epoch": 2.534256263370993, + "grad_norm": 6.296142101287842, + "learning_rate": 7.762395610483449e-06, + "loss": 0.5946, + "step": 286670 + }, + { + "epoch": 2.534344666631305, + "grad_norm": 0.8611108660697937, + "learning_rate": 7.760922222811577e-06, + "loss": 0.4776, + "step": 286680 + }, + { + "epoch": 2.5344330698916178, + "grad_norm": 3.2260260581970215, + "learning_rate": 7.759448835139707e-06, + "loss": 0.5627, + "step": 286690 + }, + { + "epoch": 2.53452147315193, + "grad_norm": 3.4291598796844482, + "learning_rate": 7.757975447467835e-06, + "loss": 0.5459, + "step": 286700 + }, + { + "epoch": 2.534609876412242, + "grad_norm": 13.68843936920166, + "learning_rate": 7.756502059795965e-06, + "loss": 0.6613, + "step": 286710 + }, + { + "epoch": 2.534698279672554, + "grad_norm": 3.484387159347534, + "learning_rate": 7.755028672124096e-06, + "loss": 0.5186, + "step": 286720 + }, + { + "epoch": 2.5347866829328667, + "grad_norm": 2.6502256393432617, + "learning_rate": 7.753555284452224e-06, + "loss": 0.6134, + "step": 286730 + }, + { + "epoch": 2.534875086193179, + "grad_norm": 5.837625980377197, + "learning_rate": 7.752081896780354e-06, + "loss": 0.4419, + "step": 286740 + }, + { + "epoch": 2.534963489453491, + "grad_norm": 1.045156717300415, + "learning_rate": 7.750608509108482e-06, + "loss": 0.4537, + "step": 286750 + }, + { + "epoch": 2.5350518927138035, + "grad_norm": 4.720183849334717, + "learning_rate": 7.749135121436612e-06, + "loss": 0.6337, + "step": 286760 + }, + { + "epoch": 2.5351402959741156, + "grad_norm": 6.471834182739258, + "learning_rate": 7.74766173376474e-06, + "loss": 0.5556, + "step": 286770 + }, + { + "epoch": 2.5352286992344277, + "grad_norm": 4.858801364898682, + "learning_rate": 7.74618834609287e-06, + "loss": 0.5648, + "step": 286780 + }, + { + "epoch": 2.53531710249474, + "grad_norm": 3.7188448905944824, + "learning_rate": 7.744714958420999e-06, + "loss": 0.4823, + "step": 286790 + }, + { + "epoch": 2.535405505755052, + "grad_norm": 8.574637413024902, + "learning_rate": 7.743241570749129e-06, + "loss": 0.5529, + "step": 286800 + }, + { + "epoch": 2.5354939090153645, + "grad_norm": 1.2031058073043823, + "learning_rate": 7.74176818307726e-06, + "loss": 0.5514, + "step": 286810 + }, + { + "epoch": 2.5355823122756767, + "grad_norm": 2.1952860355377197, + "learning_rate": 7.740294795405388e-06, + "loss": 0.4927, + "step": 286820 + }, + { + "epoch": 2.535670715535989, + "grad_norm": 7.358974933624268, + "learning_rate": 7.738821407733518e-06, + "loss": 0.5534, + "step": 286830 + }, + { + "epoch": 2.5357591187963013, + "grad_norm": 7.538877487182617, + "learning_rate": 7.737348020061646e-06, + "loss": 0.4962, + "step": 286840 + }, + { + "epoch": 2.5358475220566135, + "grad_norm": 1.9855834245681763, + "learning_rate": 7.735874632389776e-06, + "loss": 0.4525, + "step": 286850 + }, + { + "epoch": 2.5359359253169256, + "grad_norm": 6.743661403656006, + "learning_rate": 7.734401244717904e-06, + "loss": 0.5284, + "step": 286860 + }, + { + "epoch": 2.5360243285772377, + "grad_norm": 4.425868034362793, + "learning_rate": 7.732927857046034e-06, + "loss": 0.5241, + "step": 286870 + }, + { + "epoch": 2.5361127318375503, + "grad_norm": 3.071683406829834, + "learning_rate": 7.731454469374164e-06, + "loss": 0.4724, + "step": 286880 + }, + { + "epoch": 2.5362011350978624, + "grad_norm": 1.3647109270095825, + "learning_rate": 7.729981081702293e-06, + "loss": 0.4318, + "step": 286890 + }, + { + "epoch": 2.5362895383581745, + "grad_norm": 1.320531964302063, + "learning_rate": 7.728507694030423e-06, + "loss": 0.4842, + "step": 286900 + }, + { + "epoch": 2.536377941618487, + "grad_norm": 1.5489968061447144, + "learning_rate": 7.727034306358553e-06, + "loss": 0.5047, + "step": 286910 + }, + { + "epoch": 2.536466344878799, + "grad_norm": 6.981136322021484, + "learning_rate": 7.725560918686681e-06, + "loss": 0.6146, + "step": 286920 + }, + { + "epoch": 2.5365547481391113, + "grad_norm": 1.7232416868209839, + "learning_rate": 7.724087531014811e-06, + "loss": 0.4144, + "step": 286930 + }, + { + "epoch": 2.5366431513994234, + "grad_norm": 6.03798246383667, + "learning_rate": 7.722614143342941e-06, + "loss": 0.5402, + "step": 286940 + }, + { + "epoch": 2.536731554659736, + "grad_norm": 2.836902141571045, + "learning_rate": 7.72114075567107e-06, + "loss": 0.5668, + "step": 286950 + }, + { + "epoch": 2.536819957920048, + "grad_norm": 2.465095043182373, + "learning_rate": 7.7196673679992e-06, + "loss": 0.4979, + "step": 286960 + }, + { + "epoch": 2.5369083611803602, + "grad_norm": 2.241001844406128, + "learning_rate": 7.718193980327328e-06, + "loss": 0.5448, + "step": 286970 + }, + { + "epoch": 2.536996764440673, + "grad_norm": 2.500088930130005, + "learning_rate": 7.716720592655458e-06, + "loss": 0.5883, + "step": 286980 + }, + { + "epoch": 2.537085167700985, + "grad_norm": 1.067237377166748, + "learning_rate": 7.715247204983588e-06, + "loss": 0.4411, + "step": 286990 + }, + { + "epoch": 2.537173570961297, + "grad_norm": 2.942096471786499, + "learning_rate": 7.713773817311717e-06, + "loss": 0.5307, + "step": 287000 + }, + { + "epoch": 2.537261974221609, + "grad_norm": 7.226024627685547, + "learning_rate": 7.712300429639847e-06, + "loss": 0.5751, + "step": 287010 + }, + { + "epoch": 2.5373503774819213, + "grad_norm": 2.6269121170043945, + "learning_rate": 7.710827041967975e-06, + "loss": 0.5319, + "step": 287020 + }, + { + "epoch": 2.537438780742234, + "grad_norm": 3.9821181297302246, + "learning_rate": 7.709353654296105e-06, + "loss": 0.5144, + "step": 287030 + }, + { + "epoch": 2.537527184002546, + "grad_norm": 4.948983192443848, + "learning_rate": 7.707880266624233e-06, + "loss": 0.4664, + "step": 287040 + }, + { + "epoch": 2.5376155872628585, + "grad_norm": 1.7080854177474976, + "learning_rate": 7.706406878952363e-06, + "loss": 0.4748, + "step": 287050 + }, + { + "epoch": 2.5377039905231706, + "grad_norm": 12.415894508361816, + "learning_rate": 7.704933491280492e-06, + "loss": 0.5582, + "step": 287060 + }, + { + "epoch": 2.5377923937834828, + "grad_norm": 4.383171081542969, + "learning_rate": 7.703460103608622e-06, + "loss": 0.5418, + "step": 287070 + }, + { + "epoch": 2.537880797043795, + "grad_norm": 19.72639274597168, + "learning_rate": 7.701986715936752e-06, + "loss": 0.6326, + "step": 287080 + }, + { + "epoch": 2.537969200304107, + "grad_norm": 1.7344335317611694, + "learning_rate": 7.70051332826488e-06, + "loss": 0.4366, + "step": 287090 + }, + { + "epoch": 2.5380576035644196, + "grad_norm": 1.2666950225830078, + "learning_rate": 7.69903994059301e-06, + "loss": 0.5136, + "step": 287100 + }, + { + "epoch": 2.5381460068247317, + "grad_norm": 3.3127048015594482, + "learning_rate": 7.697566552921139e-06, + "loss": 0.5386, + "step": 287110 + }, + { + "epoch": 2.538234410085044, + "grad_norm": 2.4044265747070312, + "learning_rate": 7.696093165249269e-06, + "loss": 0.5595, + "step": 287120 + }, + { + "epoch": 2.5383228133453564, + "grad_norm": 3.1260387897491455, + "learning_rate": 7.694619777577397e-06, + "loss": 0.5184, + "step": 287130 + }, + { + "epoch": 2.5384112166056685, + "grad_norm": 3.493544101715088, + "learning_rate": 7.693146389905527e-06, + "loss": 0.4319, + "step": 287140 + }, + { + "epoch": 2.5384996198659806, + "grad_norm": 1.1025927066802979, + "learning_rate": 7.691673002233655e-06, + "loss": 0.3948, + "step": 287150 + }, + { + "epoch": 2.5385880231262927, + "grad_norm": 1.9067635536193848, + "learning_rate": 7.690199614561785e-06, + "loss": 0.5826, + "step": 287160 + }, + { + "epoch": 2.5386764263866053, + "grad_norm": 2.339031934738159, + "learning_rate": 7.688726226889914e-06, + "loss": 0.4062, + "step": 287170 + }, + { + "epoch": 2.5387648296469174, + "grad_norm": 18.674962997436523, + "learning_rate": 7.687252839218044e-06, + "loss": 0.4819, + "step": 287180 + }, + { + "epoch": 2.5388532329072295, + "grad_norm": 3.701847791671753, + "learning_rate": 7.685779451546174e-06, + "loss": 0.4958, + "step": 287190 + }, + { + "epoch": 2.538941636167542, + "grad_norm": 2.6562771797180176, + "learning_rate": 7.684306063874302e-06, + "loss": 0.6174, + "step": 287200 + }, + { + "epoch": 2.539030039427854, + "grad_norm": 1.335186243057251, + "learning_rate": 7.682832676202432e-06, + "loss": 0.655, + "step": 287210 + }, + { + "epoch": 2.5391184426881663, + "grad_norm": 8.398321151733398, + "learning_rate": 7.68135928853056e-06, + "loss": 0.4876, + "step": 287220 + }, + { + "epoch": 2.5392068459484785, + "grad_norm": 5.006386756896973, + "learning_rate": 7.67988590085869e-06, + "loss": 0.547, + "step": 287230 + }, + { + "epoch": 2.5392952492087906, + "grad_norm": 3.9371137619018555, + "learning_rate": 7.678412513186819e-06, + "loss": 0.5415, + "step": 287240 + }, + { + "epoch": 2.539383652469103, + "grad_norm": 2.336393117904663, + "learning_rate": 7.676939125514949e-06, + "loss": 0.5037, + "step": 287250 + }, + { + "epoch": 2.5394720557294153, + "grad_norm": 2.98300838470459, + "learning_rate": 7.675465737843077e-06, + "loss": 0.6387, + "step": 287260 + }, + { + "epoch": 2.5395604589897274, + "grad_norm": 1.468071699142456, + "learning_rate": 7.673992350171208e-06, + "loss": 0.5621, + "step": 287270 + }, + { + "epoch": 2.53964886225004, + "grad_norm": 2.779430627822876, + "learning_rate": 7.672518962499338e-06, + "loss": 0.4375, + "step": 287280 + }, + { + "epoch": 2.539737265510352, + "grad_norm": 4.306379795074463, + "learning_rate": 7.671045574827466e-06, + "loss": 0.5634, + "step": 287290 + }, + { + "epoch": 2.539825668770664, + "grad_norm": 1.411899447441101, + "learning_rate": 7.669572187155596e-06, + "loss": 0.5405, + "step": 287300 + }, + { + "epoch": 2.5399140720309763, + "grad_norm": 1.5174697637557983, + "learning_rate": 7.668098799483724e-06, + "loss": 0.5007, + "step": 287310 + }, + { + "epoch": 2.540002475291289, + "grad_norm": 2.888972520828247, + "learning_rate": 7.666625411811854e-06, + "loss": 0.4679, + "step": 287320 + }, + { + "epoch": 2.540090878551601, + "grad_norm": 8.835606575012207, + "learning_rate": 7.665152024139983e-06, + "loss": 0.4993, + "step": 287330 + }, + { + "epoch": 2.540179281811913, + "grad_norm": 0.6512501239776611, + "learning_rate": 7.663678636468113e-06, + "loss": 0.4608, + "step": 287340 + }, + { + "epoch": 2.5402676850722257, + "grad_norm": 1.5570520162582397, + "learning_rate": 7.662205248796243e-06, + "loss": 0.3581, + "step": 287350 + }, + { + "epoch": 2.540356088332538, + "grad_norm": 4.186254978179932, + "learning_rate": 7.660731861124371e-06, + "loss": 0.5012, + "step": 287360 + }, + { + "epoch": 2.54044449159285, + "grad_norm": 3.807509422302246, + "learning_rate": 7.659258473452501e-06, + "loss": 0.5755, + "step": 287370 + }, + { + "epoch": 2.540532894853162, + "grad_norm": 1.7260503768920898, + "learning_rate": 7.657785085780631e-06, + "loss": 0.5427, + "step": 287380 + }, + { + "epoch": 2.5406212981134746, + "grad_norm": 6.286538600921631, + "learning_rate": 7.65631169810876e-06, + "loss": 0.4447, + "step": 287390 + }, + { + "epoch": 2.5407097013737867, + "grad_norm": 3.4455792903900146, + "learning_rate": 7.65483831043689e-06, + "loss": 0.6092, + "step": 287400 + }, + { + "epoch": 2.540798104634099, + "grad_norm": 1.6823108196258545, + "learning_rate": 7.65336492276502e-06, + "loss": 0.4445, + "step": 287410 + }, + { + "epoch": 2.5408865078944114, + "grad_norm": 2.945420265197754, + "learning_rate": 7.651891535093148e-06, + "loss": 0.4966, + "step": 287420 + }, + { + "epoch": 2.5409749111547235, + "grad_norm": 1.4819896221160889, + "learning_rate": 7.650418147421278e-06, + "loss": 0.5911, + "step": 287430 + }, + { + "epoch": 2.5410633144150356, + "grad_norm": 2.8629438877105713, + "learning_rate": 7.648944759749406e-06, + "loss": 0.5367, + "step": 287440 + }, + { + "epoch": 2.5411517176753478, + "grad_norm": 4.631710529327393, + "learning_rate": 7.647471372077537e-06, + "loss": 0.5759, + "step": 287450 + }, + { + "epoch": 2.54124012093566, + "grad_norm": 9.926572799682617, + "learning_rate": 7.645997984405667e-06, + "loss": 0.5818, + "step": 287460 + }, + { + "epoch": 2.5413285241959724, + "grad_norm": 1.5947034358978271, + "learning_rate": 7.644524596733795e-06, + "loss": 0.4368, + "step": 287470 + }, + { + "epoch": 2.5414169274562846, + "grad_norm": 11.596885681152344, + "learning_rate": 7.643051209061925e-06, + "loss": 0.6223, + "step": 287480 + }, + { + "epoch": 2.5415053307165967, + "grad_norm": 5.698416709899902, + "learning_rate": 7.641577821390053e-06, + "loss": 0.4939, + "step": 287490 + }, + { + "epoch": 2.5415937339769092, + "grad_norm": 3.4243383407592773, + "learning_rate": 7.640104433718183e-06, + "loss": 0.6681, + "step": 287500 + }, + { + "epoch": 2.5416821372372214, + "grad_norm": 4.191904544830322, + "learning_rate": 7.638631046046312e-06, + "loss": 0.5033, + "step": 287510 + }, + { + "epoch": 2.5417705404975335, + "grad_norm": 3.6062238216400146, + "learning_rate": 7.637157658374442e-06, + "loss": 0.5686, + "step": 287520 + }, + { + "epoch": 2.5418589437578456, + "grad_norm": 2.142219066619873, + "learning_rate": 7.63568427070257e-06, + "loss": 0.5096, + "step": 287530 + }, + { + "epoch": 2.541947347018158, + "grad_norm": 3.0834274291992188, + "learning_rate": 7.6342108830307e-06, + "loss": 0.4614, + "step": 287540 + }, + { + "epoch": 2.5420357502784703, + "grad_norm": 2.9206361770629883, + "learning_rate": 7.63273749535883e-06, + "loss": 0.5201, + "step": 287550 + }, + { + "epoch": 2.5421241535387824, + "grad_norm": 3.406435012817383, + "learning_rate": 7.631264107686959e-06, + "loss": 0.5082, + "step": 287560 + }, + { + "epoch": 2.542212556799095, + "grad_norm": 16.342370986938477, + "learning_rate": 7.629790720015089e-06, + "loss": 0.5779, + "step": 287570 + }, + { + "epoch": 2.542300960059407, + "grad_norm": 1.1872094869613647, + "learning_rate": 7.628317332343217e-06, + "loss": 0.471, + "step": 287580 + }, + { + "epoch": 2.542389363319719, + "grad_norm": 5.09975004196167, + "learning_rate": 7.626843944671347e-06, + "loss": 0.4484, + "step": 287590 + }, + { + "epoch": 2.5424777665800313, + "grad_norm": 5.478202819824219, + "learning_rate": 7.625370556999475e-06, + "loss": 0.4882, + "step": 287600 + }, + { + "epoch": 2.5425661698403434, + "grad_norm": 3.9645943641662598, + "learning_rate": 7.623897169327605e-06, + "loss": 0.4542, + "step": 287610 + }, + { + "epoch": 2.542654573100656, + "grad_norm": 1.436501145362854, + "learning_rate": 7.622423781655734e-06, + "loss": 0.6447, + "step": 287620 + }, + { + "epoch": 2.542742976360968, + "grad_norm": 6.531096935272217, + "learning_rate": 7.620950393983864e-06, + "loss": 0.4989, + "step": 287630 + }, + { + "epoch": 2.5428313796212807, + "grad_norm": 1.4232882261276245, + "learning_rate": 7.619477006311992e-06, + "loss": 0.4308, + "step": 287640 + }, + { + "epoch": 2.542919782881593, + "grad_norm": 2.676994562149048, + "learning_rate": 7.618003618640122e-06, + "loss": 0.4605, + "step": 287650 + }, + { + "epoch": 2.543008186141905, + "grad_norm": 0.8580169081687927, + "learning_rate": 7.616530230968252e-06, + "loss": 0.4421, + "step": 287660 + }, + { + "epoch": 2.543096589402217, + "grad_norm": 2.9297618865966797, + "learning_rate": 7.615056843296381e-06, + "loss": 0.4993, + "step": 287670 + }, + { + "epoch": 2.543184992662529, + "grad_norm": 1.6806273460388184, + "learning_rate": 7.613583455624511e-06, + "loss": 0.4455, + "step": 287680 + }, + { + "epoch": 2.5432733959228417, + "grad_norm": 3.5181283950805664, + "learning_rate": 7.612110067952639e-06, + "loss": 0.5745, + "step": 287690 + }, + { + "epoch": 2.543361799183154, + "grad_norm": 3.801945209503174, + "learning_rate": 7.610636680280769e-06, + "loss": 0.4493, + "step": 287700 + }, + { + "epoch": 2.543450202443466, + "grad_norm": 5.969425678253174, + "learning_rate": 7.609163292608898e-06, + "loss": 0.5222, + "step": 287710 + }, + { + "epoch": 2.5435386057037785, + "grad_norm": 3.3236567974090576, + "learning_rate": 7.6076899049370275e-06, + "loss": 0.5838, + "step": 287720 + }, + { + "epoch": 2.5436270089640907, + "grad_norm": 6.591331481933594, + "learning_rate": 7.606216517265157e-06, + "loss": 0.4587, + "step": 287730 + }, + { + "epoch": 2.543715412224403, + "grad_norm": 5.448455810546875, + "learning_rate": 7.604743129593287e-06, + "loss": 0.4863, + "step": 287740 + }, + { + "epoch": 2.543803815484715, + "grad_norm": 1.3115828037261963, + "learning_rate": 7.603269741921416e-06, + "loss": 0.4834, + "step": 287750 + }, + { + "epoch": 2.5438922187450275, + "grad_norm": 18.662612915039062, + "learning_rate": 7.601796354249545e-06, + "loss": 0.5037, + "step": 287760 + }, + { + "epoch": 2.5439806220053396, + "grad_norm": 2.492130756378174, + "learning_rate": 7.600322966577675e-06, + "loss": 0.5286, + "step": 287770 + }, + { + "epoch": 2.5440690252656517, + "grad_norm": 1.1799880266189575, + "learning_rate": 7.5988495789058035e-06, + "loss": 0.4964, + "step": 287780 + }, + { + "epoch": 2.5441574285259643, + "grad_norm": 4.9721856117248535, + "learning_rate": 7.597376191233934e-06, + "loss": 0.476, + "step": 287790 + }, + { + "epoch": 2.5442458317862764, + "grad_norm": 4.168481826782227, + "learning_rate": 7.595902803562062e-06, + "loss": 0.5051, + "step": 287800 + }, + { + "epoch": 2.5443342350465885, + "grad_norm": 3.5024402141571045, + "learning_rate": 7.594429415890192e-06, + "loss": 0.4801, + "step": 287810 + }, + { + "epoch": 2.5444226383069006, + "grad_norm": 0.5403833985328674, + "learning_rate": 7.59295602821832e-06, + "loss": 0.4329, + "step": 287820 + }, + { + "epoch": 2.5445110415672128, + "grad_norm": 3.9743525981903076, + "learning_rate": 7.59148264054645e-06, + "loss": 0.5731, + "step": 287830 + }, + { + "epoch": 2.5445994448275253, + "grad_norm": 5.898166656494141, + "learning_rate": 7.5900092528745804e-06, + "loss": 0.4603, + "step": 287840 + }, + { + "epoch": 2.5446878480878374, + "grad_norm": 4.906414031982422, + "learning_rate": 7.588535865202709e-06, + "loss": 0.541, + "step": 287850 + }, + { + "epoch": 2.5447762513481496, + "grad_norm": 0.9401805400848389, + "learning_rate": 7.587062477530839e-06, + "loss": 0.4661, + "step": 287860 + }, + { + "epoch": 2.544864654608462, + "grad_norm": 2.631016492843628, + "learning_rate": 7.585589089858967e-06, + "loss": 0.4544, + "step": 287870 + }, + { + "epoch": 2.5449530578687742, + "grad_norm": 1.4852603673934937, + "learning_rate": 7.584115702187097e-06, + "loss": 0.5552, + "step": 287880 + }, + { + "epoch": 2.5450414611290864, + "grad_norm": 4.700036525726318, + "learning_rate": 7.582642314515226e-06, + "loss": 0.6933, + "step": 287890 + }, + { + "epoch": 2.5451298643893985, + "grad_norm": 3.8480851650238037, + "learning_rate": 7.581168926843356e-06, + "loss": 0.5051, + "step": 287900 + }, + { + "epoch": 2.545218267649711, + "grad_norm": 1.3549734354019165, + "learning_rate": 7.579695539171484e-06, + "loss": 0.4754, + "step": 287910 + }, + { + "epoch": 2.545306670910023, + "grad_norm": 1.5471570491790771, + "learning_rate": 7.578222151499614e-06, + "loss": 0.4475, + "step": 287920 + }, + { + "epoch": 2.5453950741703353, + "grad_norm": 1.918792963027954, + "learning_rate": 7.576748763827744e-06, + "loss": 0.5487, + "step": 287930 + }, + { + "epoch": 2.545483477430648, + "grad_norm": 1.8972417116165161, + "learning_rate": 7.5752753761558724e-06, + "loss": 0.4699, + "step": 287940 + }, + { + "epoch": 2.54557188069096, + "grad_norm": 4.712443828582764, + "learning_rate": 7.5738019884840025e-06, + "loss": 0.463, + "step": 287950 + }, + { + "epoch": 2.545660283951272, + "grad_norm": 7.277156829833984, + "learning_rate": 7.572328600812132e-06, + "loss": 0.4538, + "step": 287960 + }, + { + "epoch": 2.545748687211584, + "grad_norm": 2.732403516769409, + "learning_rate": 7.570855213140261e-06, + "loss": 0.5891, + "step": 287970 + }, + { + "epoch": 2.5458370904718968, + "grad_norm": 4.637290954589844, + "learning_rate": 7.56938182546839e-06, + "loss": 0.439, + "step": 287980 + }, + { + "epoch": 2.545925493732209, + "grad_norm": 7.159592628479004, + "learning_rate": 7.56790843779652e-06, + "loss": 0.5514, + "step": 287990 + }, + { + "epoch": 2.546013896992521, + "grad_norm": 3.3373186588287354, + "learning_rate": 7.5664350501246485e-06, + "loss": 0.5767, + "step": 288000 + }, + { + "epoch": 2.5461023002528336, + "grad_norm": 1.3921257257461548, + "learning_rate": 7.5649616624527785e-06, + "loss": 0.4562, + "step": 288010 + }, + { + "epoch": 2.5461907035131457, + "grad_norm": 6.581860065460205, + "learning_rate": 7.563488274780909e-06, + "loss": 0.4473, + "step": 288020 + }, + { + "epoch": 2.546279106773458, + "grad_norm": 2.108926296234131, + "learning_rate": 7.562014887109037e-06, + "loss": 0.5705, + "step": 288030 + }, + { + "epoch": 2.54636751003377, + "grad_norm": 4.0386786460876465, + "learning_rate": 7.560541499437167e-06, + "loss": 0.4995, + "step": 288040 + }, + { + "epoch": 2.546455913294082, + "grad_norm": 2.7508914470672607, + "learning_rate": 7.559068111765295e-06, + "loss": 0.4357, + "step": 288050 + }, + { + "epoch": 2.5465443165543946, + "grad_norm": 2.8584437370300293, + "learning_rate": 7.557594724093425e-06, + "loss": 0.4592, + "step": 288060 + }, + { + "epoch": 2.5466327198147067, + "grad_norm": 4.5629777908325195, + "learning_rate": 7.556121336421554e-06, + "loss": 0.6923, + "step": 288070 + }, + { + "epoch": 2.546721123075019, + "grad_norm": 7.38628625869751, + "learning_rate": 7.554647948749684e-06, + "loss": 0.4393, + "step": 288080 + }, + { + "epoch": 2.5468095263353314, + "grad_norm": 10.627935409545898, + "learning_rate": 7.553174561077812e-06, + "loss": 0.5868, + "step": 288090 + }, + { + "epoch": 2.5468979295956435, + "grad_norm": 8.133939743041992, + "learning_rate": 7.551701173405942e-06, + "loss": 0.4977, + "step": 288100 + }, + { + "epoch": 2.5469863328559557, + "grad_norm": 1.4550999402999878, + "learning_rate": 7.550227785734072e-06, + "loss": 0.4726, + "step": 288110 + }, + { + "epoch": 2.547074736116268, + "grad_norm": 7.214728355407715, + "learning_rate": 7.548754398062201e-06, + "loss": 0.4331, + "step": 288120 + }, + { + "epoch": 2.5471631393765803, + "grad_norm": 1.5697007179260254, + "learning_rate": 7.547281010390331e-06, + "loss": 0.4329, + "step": 288130 + }, + { + "epoch": 2.5472515426368925, + "grad_norm": 3.026517868041992, + "learning_rate": 7.545807622718459e-06, + "loss": 0.3879, + "step": 288140 + }, + { + "epoch": 2.5473399458972046, + "grad_norm": 10.820934295654297, + "learning_rate": 7.544334235046589e-06, + "loss": 0.4683, + "step": 288150 + }, + { + "epoch": 2.547428349157517, + "grad_norm": 4.766831398010254, + "learning_rate": 7.542860847374717e-06, + "loss": 0.5723, + "step": 288160 + }, + { + "epoch": 2.5475167524178293, + "grad_norm": 22.554092407226562, + "learning_rate": 7.5413874597028475e-06, + "loss": 0.5877, + "step": 288170 + }, + { + "epoch": 2.5476051556781414, + "grad_norm": 5.7433953285217285, + "learning_rate": 7.539914072030977e-06, + "loss": 0.525, + "step": 288180 + }, + { + "epoch": 2.5476935589384535, + "grad_norm": 1.37574303150177, + "learning_rate": 7.538440684359106e-06, + "loss": 0.5665, + "step": 288190 + }, + { + "epoch": 2.5477819621987656, + "grad_norm": 5.310640335083008, + "learning_rate": 7.536967296687235e-06, + "loss": 0.477, + "step": 288200 + }, + { + "epoch": 2.547870365459078, + "grad_norm": 7.173449993133545, + "learning_rate": 7.535493909015365e-06, + "loss": 0.4969, + "step": 288210 + }, + { + "epoch": 2.5479587687193903, + "grad_norm": 14.117474555969238, + "learning_rate": 7.534020521343495e-06, + "loss": 0.4963, + "step": 288220 + }, + { + "epoch": 2.548047171979703, + "grad_norm": 3.8011958599090576, + "learning_rate": 7.5325471336716235e-06, + "loss": 0.6617, + "step": 288230 + }, + { + "epoch": 2.548135575240015, + "grad_norm": 2.217721700668335, + "learning_rate": 7.5310737459997536e-06, + "loss": 0.4306, + "step": 288240 + }, + { + "epoch": 2.548223978500327, + "grad_norm": 2.5621862411499023, + "learning_rate": 7.529600358327882e-06, + "loss": 0.4273, + "step": 288250 + }, + { + "epoch": 2.5483123817606392, + "grad_norm": 3.2029781341552734, + "learning_rate": 7.528126970656012e-06, + "loss": 0.4439, + "step": 288260 + }, + { + "epoch": 2.5484007850209514, + "grad_norm": 1.6673387289047241, + "learning_rate": 7.52665358298414e-06, + "loss": 0.4725, + "step": 288270 + }, + { + "epoch": 2.548489188281264, + "grad_norm": 2.0773589611053467, + "learning_rate": 7.52518019531227e-06, + "loss": 0.5456, + "step": 288280 + }, + { + "epoch": 2.548577591541576, + "grad_norm": 15.503325462341309, + "learning_rate": 7.523706807640399e-06, + "loss": 0.3625, + "step": 288290 + }, + { + "epoch": 2.548665994801888, + "grad_norm": 5.971371173858643, + "learning_rate": 7.522233419968529e-06, + "loss": 0.4997, + "step": 288300 + }, + { + "epoch": 2.5487543980622007, + "grad_norm": 2.1780734062194824, + "learning_rate": 7.520760032296659e-06, + "loss": 0.5797, + "step": 288310 + }, + { + "epoch": 2.548842801322513, + "grad_norm": 0.9466203451156616, + "learning_rate": 7.519286644624787e-06, + "loss": 0.5794, + "step": 288320 + }, + { + "epoch": 2.548931204582825, + "grad_norm": 4.00316047668457, + "learning_rate": 7.517813256952917e-06, + "loss": 0.6601, + "step": 288330 + }, + { + "epoch": 2.549019607843137, + "grad_norm": 6.595563888549805, + "learning_rate": 7.5163398692810456e-06, + "loss": 0.4013, + "step": 288340 + }, + { + "epoch": 2.5491080111034496, + "grad_norm": 4.037397861480713, + "learning_rate": 7.514866481609176e-06, + "loss": 0.4854, + "step": 288350 + }, + { + "epoch": 2.5491964143637618, + "grad_norm": 4.102362155914307, + "learning_rate": 7.513393093937304e-06, + "loss": 0.5063, + "step": 288360 + }, + { + "epoch": 2.549284817624074, + "grad_norm": 4.432204723358154, + "learning_rate": 7.511919706265434e-06, + "loss": 0.547, + "step": 288370 + }, + { + "epoch": 2.5493732208843864, + "grad_norm": 3.4532527923583984, + "learning_rate": 7.510446318593563e-06, + "loss": 0.5041, + "step": 288380 + }, + { + "epoch": 2.5494616241446986, + "grad_norm": 5.701144218444824, + "learning_rate": 7.508972930921692e-06, + "loss": 0.427, + "step": 288390 + }, + { + "epoch": 2.5495500274050107, + "grad_norm": 9.492115020751953, + "learning_rate": 7.5074995432498225e-06, + "loss": 0.5125, + "step": 288400 + }, + { + "epoch": 2.549638430665323, + "grad_norm": 2.7386903762817383, + "learning_rate": 7.506026155577952e-06, + "loss": 0.5794, + "step": 288410 + }, + { + "epoch": 2.549726833925635, + "grad_norm": 6.434320449829102, + "learning_rate": 7.504552767906081e-06, + "loss": 0.6265, + "step": 288420 + }, + { + "epoch": 2.5498152371859475, + "grad_norm": 1.556598424911499, + "learning_rate": 7.50307938023421e-06, + "loss": 0.5269, + "step": 288430 + }, + { + "epoch": 2.5499036404462596, + "grad_norm": 7.327907085418701, + "learning_rate": 7.50160599256234e-06, + "loss": 0.5541, + "step": 288440 + }, + { + "epoch": 2.549992043706572, + "grad_norm": 2.636584520339966, + "learning_rate": 7.5001326048904685e-06, + "loss": 0.4922, + "step": 288450 + }, + { + "epoch": 2.5500804469668843, + "grad_norm": 2.9060170650482178, + "learning_rate": 7.4986592172185985e-06, + "loss": 0.5062, + "step": 288460 + }, + { + "epoch": 2.5501688502271964, + "grad_norm": 3.3446786403656006, + "learning_rate": 7.497185829546727e-06, + "loss": 0.4581, + "step": 288470 + }, + { + "epoch": 2.5502572534875085, + "grad_norm": 3.0771312713623047, + "learning_rate": 7.495712441874857e-06, + "loss": 0.3957, + "step": 288480 + }, + { + "epoch": 2.5503456567478207, + "grad_norm": 2.3189308643341064, + "learning_rate": 7.494239054202987e-06, + "loss": 0.4651, + "step": 288490 + }, + { + "epoch": 2.550434060008133, + "grad_norm": 3.915235996246338, + "learning_rate": 7.492765666531115e-06, + "loss": 0.5186, + "step": 288500 + }, + { + "epoch": 2.5505224632684453, + "grad_norm": 1.9587384462356567, + "learning_rate": 7.491292278859245e-06, + "loss": 0.5228, + "step": 288510 + }, + { + "epoch": 2.5506108665287575, + "grad_norm": 6.063168525695801, + "learning_rate": 7.489818891187374e-06, + "loss": 0.6126, + "step": 288520 + }, + { + "epoch": 2.55069926978907, + "grad_norm": 2.0042033195495605, + "learning_rate": 7.488345503515504e-06, + "loss": 0.4727, + "step": 288530 + }, + { + "epoch": 2.550787673049382, + "grad_norm": 1.0631511211395264, + "learning_rate": 7.486872115843632e-06, + "loss": 0.4919, + "step": 288540 + }, + { + "epoch": 2.5508760763096943, + "grad_norm": 6.184615135192871, + "learning_rate": 7.485398728171762e-06, + "loss": 0.4941, + "step": 288550 + }, + { + "epoch": 2.5509644795700064, + "grad_norm": 4.3307414054870605, + "learning_rate": 7.4839253404998905e-06, + "loss": 0.5045, + "step": 288560 + }, + { + "epoch": 2.551052882830319, + "grad_norm": 8.403402328491211, + "learning_rate": 7.4824519528280206e-06, + "loss": 0.7089, + "step": 288570 + }, + { + "epoch": 2.551141286090631, + "grad_norm": 3.295703172683716, + "learning_rate": 7.480978565156151e-06, + "loss": 0.4495, + "step": 288580 + }, + { + "epoch": 2.551229689350943, + "grad_norm": 2.9816770553588867, + "learning_rate": 7.479505177484279e-06, + "loss": 0.5427, + "step": 288590 + }, + { + "epoch": 2.5513180926112558, + "grad_norm": 6.340722560882568, + "learning_rate": 7.478031789812409e-06, + "loss": 0.5153, + "step": 288600 + }, + { + "epoch": 2.551406495871568, + "grad_norm": 17.17133903503418, + "learning_rate": 7.476558402140537e-06, + "loss": 0.4755, + "step": 288610 + }, + { + "epoch": 2.55149489913188, + "grad_norm": 6.44795560836792, + "learning_rate": 7.4750850144686674e-06, + "loss": 0.5617, + "step": 288620 + }, + { + "epoch": 2.551583302392192, + "grad_norm": 46.98590087890625, + "learning_rate": 7.473611626796797e-06, + "loss": 0.6496, + "step": 288630 + }, + { + "epoch": 2.5516717056525042, + "grad_norm": 2.0616977214813232, + "learning_rate": 7.472138239124926e-06, + "loss": 0.6587, + "step": 288640 + }, + { + "epoch": 2.551760108912817, + "grad_norm": 1.2357394695281982, + "learning_rate": 7.470664851453055e-06, + "loss": 0.385, + "step": 288650 + }, + { + "epoch": 2.551848512173129, + "grad_norm": 4.948126316070557, + "learning_rate": 7.469191463781185e-06, + "loss": 0.4637, + "step": 288660 + }, + { + "epoch": 2.551936915433441, + "grad_norm": 3.465017795562744, + "learning_rate": 7.467718076109314e-06, + "loss": 0.5471, + "step": 288670 + }, + { + "epoch": 2.5520253186937536, + "grad_norm": 2.486004590988159, + "learning_rate": 7.4662446884374435e-06, + "loss": 0.483, + "step": 288680 + }, + { + "epoch": 2.5521137219540657, + "grad_norm": 2.462327241897583, + "learning_rate": 7.4647713007655735e-06, + "loss": 0.4673, + "step": 288690 + }, + { + "epoch": 2.552202125214378, + "grad_norm": 2.756190538406372, + "learning_rate": 7.463297913093702e-06, + "loss": 0.4488, + "step": 288700 + }, + { + "epoch": 2.55229052847469, + "grad_norm": 1.2396339178085327, + "learning_rate": 7.461824525421832e-06, + "loss": 0.4555, + "step": 288710 + }, + { + "epoch": 2.5523789317350025, + "grad_norm": 20.345592498779297, + "learning_rate": 7.46035113774996e-06, + "loss": 0.5413, + "step": 288720 + }, + { + "epoch": 2.5524673349953146, + "grad_norm": 5.657917499542236, + "learning_rate": 7.45887775007809e-06, + "loss": 0.5979, + "step": 288730 + }, + { + "epoch": 2.5525557382556268, + "grad_norm": 1.488834023475647, + "learning_rate": 7.457404362406219e-06, + "loss": 0.5817, + "step": 288740 + }, + { + "epoch": 2.5526441415159393, + "grad_norm": 4.423856735229492, + "learning_rate": 7.455930974734349e-06, + "loss": 0.4312, + "step": 288750 + }, + { + "epoch": 2.5527325447762514, + "grad_norm": 1.8119628429412842, + "learning_rate": 7.454457587062477e-06, + "loss": 0.4986, + "step": 288760 + }, + { + "epoch": 2.5528209480365636, + "grad_norm": 2.106029987335205, + "learning_rate": 7.452984199390607e-06, + "loss": 0.6323, + "step": 288770 + }, + { + "epoch": 2.5529093512968757, + "grad_norm": 9.449013710021973, + "learning_rate": 7.451510811718737e-06, + "loss": 0.6259, + "step": 288780 + }, + { + "epoch": 2.552997754557188, + "grad_norm": 2.4413962364196777, + "learning_rate": 7.4500374240468655e-06, + "loss": 0.6129, + "step": 288790 + }, + { + "epoch": 2.5530861578175004, + "grad_norm": 3.4728240966796875, + "learning_rate": 7.448564036374996e-06, + "loss": 0.4255, + "step": 288800 + }, + { + "epoch": 2.5531745610778125, + "grad_norm": 5.483280181884766, + "learning_rate": 7.447090648703124e-06, + "loss": 0.4888, + "step": 288810 + }, + { + "epoch": 2.553262964338125, + "grad_norm": 5.852596759796143, + "learning_rate": 7.445617261031254e-06, + "loss": 0.3708, + "step": 288820 + }, + { + "epoch": 2.553351367598437, + "grad_norm": 1.9813244342803955, + "learning_rate": 7.444143873359382e-06, + "loss": 0.4485, + "step": 288830 + }, + { + "epoch": 2.5534397708587493, + "grad_norm": 2.313133478164673, + "learning_rate": 7.442670485687512e-06, + "loss": 0.6047, + "step": 288840 + }, + { + "epoch": 2.5535281741190614, + "grad_norm": 2.6115400791168213, + "learning_rate": 7.441197098015642e-06, + "loss": 0.6604, + "step": 288850 + }, + { + "epoch": 2.5536165773793735, + "grad_norm": 3.4180970191955566, + "learning_rate": 7.439723710343771e-06, + "loss": 0.5479, + "step": 288860 + }, + { + "epoch": 2.553704980639686, + "grad_norm": 2.477982759475708, + "learning_rate": 7.438250322671901e-06, + "loss": 0.6051, + "step": 288870 + }, + { + "epoch": 2.553793383899998, + "grad_norm": 12.328113555908203, + "learning_rate": 7.43677693500003e-06, + "loss": 0.7127, + "step": 288880 + }, + { + "epoch": 2.5538817871603103, + "grad_norm": 2.4864354133605957, + "learning_rate": 7.435303547328159e-06, + "loss": 0.5025, + "step": 288890 + }, + { + "epoch": 2.553970190420623, + "grad_norm": 1.300728440284729, + "learning_rate": 7.4338301596562884e-06, + "loss": 0.4443, + "step": 288900 + }, + { + "epoch": 2.554058593680935, + "grad_norm": 3.4025566577911377, + "learning_rate": 7.4323567719844185e-06, + "loss": 0.6441, + "step": 288910 + }, + { + "epoch": 2.554146996941247, + "grad_norm": 2.5414278507232666, + "learning_rate": 7.430883384312547e-06, + "loss": 0.5672, + "step": 288920 + }, + { + "epoch": 2.5542354002015593, + "grad_norm": 13.851362228393555, + "learning_rate": 7.429409996640677e-06, + "loss": 0.5694, + "step": 288930 + }, + { + "epoch": 2.554323803461872, + "grad_norm": 3.0876426696777344, + "learning_rate": 7.427936608968805e-06, + "loss": 0.5746, + "step": 288940 + }, + { + "epoch": 2.554412206722184, + "grad_norm": 3.293219804763794, + "learning_rate": 7.426463221296935e-06, + "loss": 0.377, + "step": 288950 + }, + { + "epoch": 2.554500609982496, + "grad_norm": 5.033736705780029, + "learning_rate": 7.424989833625065e-06, + "loss": 0.4306, + "step": 288960 + }, + { + "epoch": 2.5545890132428086, + "grad_norm": 1.5958973169326782, + "learning_rate": 7.423516445953194e-06, + "loss": 0.5038, + "step": 288970 + }, + { + "epoch": 2.5546774165031207, + "grad_norm": 5.800029754638672, + "learning_rate": 7.422043058281324e-06, + "loss": 0.4344, + "step": 288980 + }, + { + "epoch": 2.554765819763433, + "grad_norm": 51.02348327636719, + "learning_rate": 7.420569670609452e-06, + "loss": 0.57, + "step": 288990 + }, + { + "epoch": 2.554854223023745, + "grad_norm": 3.9069178104400635, + "learning_rate": 7.419096282937582e-06, + "loss": 0.5591, + "step": 289000 + }, + { + "epoch": 2.554942626284057, + "grad_norm": 4.30499267578125, + "learning_rate": 7.4176228952657105e-06, + "loss": 0.546, + "step": 289010 + }, + { + "epoch": 2.5550310295443697, + "grad_norm": 1.7015527486801147, + "learning_rate": 7.4161495075938405e-06, + "loss": 0.591, + "step": 289020 + }, + { + "epoch": 2.555119432804682, + "grad_norm": 8.115812301635742, + "learning_rate": 7.414676119921969e-06, + "loss": 0.5356, + "step": 289030 + }, + { + "epoch": 2.5552078360649944, + "grad_norm": 1.6194324493408203, + "learning_rate": 7.413202732250099e-06, + "loss": 0.3999, + "step": 289040 + }, + { + "epoch": 2.5552962393253065, + "grad_norm": 3.5197219848632812, + "learning_rate": 7.411729344578229e-06, + "loss": 0.5674, + "step": 289050 + }, + { + "epoch": 2.5553846425856186, + "grad_norm": 1.7533838748931885, + "learning_rate": 7.410255956906357e-06, + "loss": 0.6426, + "step": 289060 + }, + { + "epoch": 2.5554730458459307, + "grad_norm": 3.514777660369873, + "learning_rate": 7.408782569234487e-06, + "loss": 0.5536, + "step": 289070 + }, + { + "epoch": 2.555561449106243, + "grad_norm": 3.3735907077789307, + "learning_rate": 7.407309181562616e-06, + "loss": 0.6197, + "step": 289080 + }, + { + "epoch": 2.5556498523665554, + "grad_norm": 2.8233120441436768, + "learning_rate": 7.405835793890746e-06, + "loss": 0.501, + "step": 289090 + }, + { + "epoch": 2.5557382556268675, + "grad_norm": 2.9651243686676025, + "learning_rate": 7.404362406218875e-06, + "loss": 0.482, + "step": 289100 + }, + { + "epoch": 2.5558266588871796, + "grad_norm": 3.160036563873291, + "learning_rate": 7.402889018547004e-06, + "loss": 0.4727, + "step": 289110 + }, + { + "epoch": 2.555915062147492, + "grad_norm": 7.303202152252197, + "learning_rate": 7.401415630875133e-06, + "loss": 0.54, + "step": 289120 + }, + { + "epoch": 2.5560034654078043, + "grad_norm": 1.9307538270950317, + "learning_rate": 7.3999422432032635e-06, + "loss": 0.4667, + "step": 289130 + }, + { + "epoch": 2.5560918686681164, + "grad_norm": 3.0684335231781006, + "learning_rate": 7.398468855531393e-06, + "loss": 0.5085, + "step": 289140 + }, + { + "epoch": 2.5561802719284286, + "grad_norm": 2.5713863372802734, + "learning_rate": 7.396995467859522e-06, + "loss": 0.4097, + "step": 289150 + }, + { + "epoch": 2.556268675188741, + "grad_norm": 2.3631577491760254, + "learning_rate": 7.395522080187652e-06, + "loss": 0.4416, + "step": 289160 + }, + { + "epoch": 2.5563570784490532, + "grad_norm": 10.671918869018555, + "learning_rate": 7.39404869251578e-06, + "loss": 0.5517, + "step": 289170 + }, + { + "epoch": 2.5564454817093654, + "grad_norm": 2.983289957046509, + "learning_rate": 7.39257530484391e-06, + "loss": 0.5037, + "step": 289180 + }, + { + "epoch": 2.556533884969678, + "grad_norm": 1.5256379842758179, + "learning_rate": 7.391101917172039e-06, + "loss": 0.4745, + "step": 289190 + }, + { + "epoch": 2.55662228822999, + "grad_norm": 2.385537624359131, + "learning_rate": 7.389628529500169e-06, + "loss": 0.4606, + "step": 289200 + }, + { + "epoch": 2.556710691490302, + "grad_norm": 1.3256375789642334, + "learning_rate": 7.388155141828297e-06, + "loss": 0.4821, + "step": 289210 + }, + { + "epoch": 2.5567990947506143, + "grad_norm": 6.994513511657715, + "learning_rate": 7.386681754156427e-06, + "loss": 0.4806, + "step": 289220 + }, + { + "epoch": 2.5568874980109264, + "grad_norm": 4.499807357788086, + "learning_rate": 7.385208366484557e-06, + "loss": 0.5499, + "step": 289230 + }, + { + "epoch": 2.556975901271239, + "grad_norm": 5.967978000640869, + "learning_rate": 7.3837349788126855e-06, + "loss": 0.3781, + "step": 289240 + }, + { + "epoch": 2.557064304531551, + "grad_norm": 3.9279184341430664, + "learning_rate": 7.3822615911408156e-06, + "loss": 0.4088, + "step": 289250 + }, + { + "epoch": 2.557152707791863, + "grad_norm": 2.390481948852539, + "learning_rate": 7.380788203468944e-06, + "loss": 0.5383, + "step": 289260 + }, + { + "epoch": 2.5572411110521758, + "grad_norm": 4.9234161376953125, + "learning_rate": 7.379314815797074e-06, + "loss": 0.4928, + "step": 289270 + }, + { + "epoch": 2.557329514312488, + "grad_norm": 9.354085922241211, + "learning_rate": 7.377841428125202e-06, + "loss": 0.5886, + "step": 289280 + }, + { + "epoch": 2.5574179175728, + "grad_norm": 8.710415840148926, + "learning_rate": 7.376368040453332e-06, + "loss": 0.5059, + "step": 289290 + }, + { + "epoch": 2.557506320833112, + "grad_norm": 4.809811592102051, + "learning_rate": 7.374894652781461e-06, + "loss": 0.5804, + "step": 289300 + }, + { + "epoch": 2.5575947240934247, + "grad_norm": 1.9694995880126953, + "learning_rate": 7.373421265109591e-06, + "loss": 0.4929, + "step": 289310 + }, + { + "epoch": 2.557683127353737, + "grad_norm": 3.194542646408081, + "learning_rate": 7.37194787743772e-06, + "loss": 0.5697, + "step": 289320 + }, + { + "epoch": 2.557771530614049, + "grad_norm": 11.284690856933594, + "learning_rate": 7.370474489765849e-06, + "loss": 0.4332, + "step": 289330 + }, + { + "epoch": 2.5578599338743615, + "grad_norm": 20.603179931640625, + "learning_rate": 7.369001102093979e-06, + "loss": 0.557, + "step": 289340 + }, + { + "epoch": 2.5579483371346736, + "grad_norm": 6.385110378265381, + "learning_rate": 7.367527714422108e-06, + "loss": 0.5416, + "step": 289350 + }, + { + "epoch": 2.5580367403949857, + "grad_norm": 6.905675411224365, + "learning_rate": 7.366054326750238e-06, + "loss": 0.6174, + "step": 289360 + }, + { + "epoch": 2.558125143655298, + "grad_norm": 2.5988142490386963, + "learning_rate": 7.364580939078367e-06, + "loss": 0.5376, + "step": 289370 + }, + { + "epoch": 2.55821354691561, + "grad_norm": 1.014302134513855, + "learning_rate": 7.363107551406497e-06, + "loss": 0.4273, + "step": 289380 + }, + { + "epoch": 2.5583019501759225, + "grad_norm": 3.3601183891296387, + "learning_rate": 7.361634163734625e-06, + "loss": 0.5682, + "step": 289390 + }, + { + "epoch": 2.5583903534362347, + "grad_norm": 4.320243835449219, + "learning_rate": 7.360160776062755e-06, + "loss": 0.5831, + "step": 289400 + }, + { + "epoch": 2.5584787566965472, + "grad_norm": 2.5010766983032227, + "learning_rate": 7.358687388390884e-06, + "loss": 0.484, + "step": 289410 + }, + { + "epoch": 2.5585671599568593, + "grad_norm": 16.152938842773438, + "learning_rate": 7.357214000719014e-06, + "loss": 0.6187, + "step": 289420 + }, + { + "epoch": 2.5586555632171715, + "grad_norm": 6.436952114105225, + "learning_rate": 7.355740613047144e-06, + "loss": 0.4488, + "step": 289430 + }, + { + "epoch": 2.5587439664774836, + "grad_norm": 3.5600287914276123, + "learning_rate": 7.354267225375272e-06, + "loss": 0.6033, + "step": 289440 + }, + { + "epoch": 2.5588323697377957, + "grad_norm": 1.751481533050537, + "learning_rate": 7.352793837703402e-06, + "loss": 0.4164, + "step": 289450 + }, + { + "epoch": 2.5589207729981083, + "grad_norm": 5.907622814178467, + "learning_rate": 7.3513204500315305e-06, + "loss": 0.5234, + "step": 289460 + }, + { + "epoch": 2.5590091762584204, + "grad_norm": 1.165494441986084, + "learning_rate": 7.3498470623596605e-06, + "loss": 0.5968, + "step": 289470 + }, + { + "epoch": 2.5590975795187325, + "grad_norm": 3.626953601837158, + "learning_rate": 7.348373674687789e-06, + "loss": 0.6168, + "step": 289480 + }, + { + "epoch": 2.559185982779045, + "grad_norm": 1.5662055015563965, + "learning_rate": 7.346900287015919e-06, + "loss": 0.6314, + "step": 289490 + }, + { + "epoch": 2.559274386039357, + "grad_norm": 2.851679801940918, + "learning_rate": 7.345426899344047e-06, + "loss": 0.5478, + "step": 289500 + }, + { + "epoch": 2.5593627892996693, + "grad_norm": 5.357203483581543, + "learning_rate": 7.343953511672177e-06, + "loss": 0.5547, + "step": 289510 + }, + { + "epoch": 2.5594511925599814, + "grad_norm": 1.9946211576461792, + "learning_rate": 7.342480124000307e-06, + "loss": 0.4308, + "step": 289520 + }, + { + "epoch": 2.559539595820294, + "grad_norm": 3.8406543731689453, + "learning_rate": 7.341006736328436e-06, + "loss": 0.6529, + "step": 289530 + }, + { + "epoch": 2.559627999080606, + "grad_norm": 5.054392337799072, + "learning_rate": 7.339533348656566e-06, + "loss": 0.6175, + "step": 289540 + }, + { + "epoch": 2.5597164023409182, + "grad_norm": 1.9775962829589844, + "learning_rate": 7.338059960984694e-06, + "loss": 0.4405, + "step": 289550 + }, + { + "epoch": 2.559804805601231, + "grad_norm": 4.63855504989624, + "learning_rate": 7.336586573312824e-06, + "loss": 0.495, + "step": 289560 + }, + { + "epoch": 2.559893208861543, + "grad_norm": 4.357264518737793, + "learning_rate": 7.335113185640953e-06, + "loss": 0.7235, + "step": 289570 + }, + { + "epoch": 2.559981612121855, + "grad_norm": 5.323146343231201, + "learning_rate": 7.333639797969083e-06, + "loss": 0.4522, + "step": 289580 + }, + { + "epoch": 2.560070015382167, + "grad_norm": 12.841565132141113, + "learning_rate": 7.332166410297212e-06, + "loss": 0.6023, + "step": 289590 + }, + { + "epoch": 2.5601584186424793, + "grad_norm": 6.739912033081055, + "learning_rate": 7.330693022625342e-06, + "loss": 0.5159, + "step": 289600 + }, + { + "epoch": 2.560246821902792, + "grad_norm": 2.474158525466919, + "learning_rate": 7.329219634953471e-06, + "loss": 0.4574, + "step": 289610 + }, + { + "epoch": 2.560335225163104, + "grad_norm": 5.095026016235352, + "learning_rate": 7.3277462472816e-06, + "loss": 0.6088, + "step": 289620 + }, + { + "epoch": 2.5604236284234165, + "grad_norm": 7.5096635818481445, + "learning_rate": 7.32627285960973e-06, + "loss": 0.6056, + "step": 289630 + }, + { + "epoch": 2.5605120316837287, + "grad_norm": 15.26649284362793, + "learning_rate": 7.324799471937859e-06, + "loss": 0.5708, + "step": 289640 + }, + { + "epoch": 2.5606004349440408, + "grad_norm": 1.852070689201355, + "learning_rate": 7.323326084265989e-06, + "loss": 0.4363, + "step": 289650 + }, + { + "epoch": 2.560688838204353, + "grad_norm": 2.5601327419281006, + "learning_rate": 7.321852696594117e-06, + "loss": 0.4456, + "step": 289660 + }, + { + "epoch": 2.560777241464665, + "grad_norm": 2.5782365798950195, + "learning_rate": 7.320379308922247e-06, + "loss": 0.4979, + "step": 289670 + }, + { + "epoch": 2.5608656447249776, + "grad_norm": 2.7474570274353027, + "learning_rate": 7.3189059212503754e-06, + "loss": 0.5044, + "step": 289680 + }, + { + "epoch": 2.5609540479852897, + "grad_norm": 2.6767737865448, + "learning_rate": 7.3174325335785055e-06, + "loss": 0.5402, + "step": 289690 + }, + { + "epoch": 2.561042451245602, + "grad_norm": 3.9953954219818115, + "learning_rate": 7.3159591459066355e-06, + "loss": 0.6154, + "step": 289700 + }, + { + "epoch": 2.5611308545059144, + "grad_norm": 3.7851874828338623, + "learning_rate": 7.314485758234764e-06, + "loss": 0.481, + "step": 289710 + }, + { + "epoch": 2.5612192577662265, + "grad_norm": 1.0429242849349976, + "learning_rate": 7.313012370562894e-06, + "loss": 0.4333, + "step": 289720 + }, + { + "epoch": 2.5613076610265386, + "grad_norm": 4.879161834716797, + "learning_rate": 7.311538982891022e-06, + "loss": 0.5033, + "step": 289730 + }, + { + "epoch": 2.5613960642868507, + "grad_norm": 2.217686414718628, + "learning_rate": 7.310065595219152e-06, + "loss": 0.5857, + "step": 289740 + }, + { + "epoch": 2.5614844675471633, + "grad_norm": 12.372574806213379, + "learning_rate": 7.308592207547281e-06, + "loss": 0.5438, + "step": 289750 + }, + { + "epoch": 2.5615728708074754, + "grad_norm": 6.3479132652282715, + "learning_rate": 7.307118819875411e-06, + "loss": 0.5033, + "step": 289760 + }, + { + "epoch": 2.5616612740677875, + "grad_norm": 1.4256010055541992, + "learning_rate": 7.305645432203539e-06, + "loss": 0.3269, + "step": 289770 + }, + { + "epoch": 2.5617496773281, + "grad_norm": 2.4600794315338135, + "learning_rate": 7.304172044531669e-06, + "loss": 0.4992, + "step": 289780 + }, + { + "epoch": 2.5618380805884122, + "grad_norm": 8.055398941040039, + "learning_rate": 7.302698656859798e-06, + "loss": 0.4656, + "step": 289790 + }, + { + "epoch": 2.5619264838487243, + "grad_norm": 6.813675403594971, + "learning_rate": 7.3012252691879275e-06, + "loss": 0.7118, + "step": 289800 + }, + { + "epoch": 2.5620148871090365, + "grad_norm": 2.5903589725494385, + "learning_rate": 7.299751881516058e-06, + "loss": 0.6152, + "step": 289810 + }, + { + "epoch": 2.5621032903693486, + "grad_norm": 4.925954341888428, + "learning_rate": 7.298278493844187e-06, + "loss": 0.5554, + "step": 289820 + }, + { + "epoch": 2.562191693629661, + "grad_norm": 4.228261947631836, + "learning_rate": 7.296805106172316e-06, + "loss": 0.5972, + "step": 289830 + }, + { + "epoch": 2.5622800968899733, + "grad_norm": 10.470797538757324, + "learning_rate": 7.295331718500445e-06, + "loss": 0.4441, + "step": 289840 + }, + { + "epoch": 2.5623685001502854, + "grad_norm": 4.072735786437988, + "learning_rate": 7.293858330828575e-06, + "loss": 0.5067, + "step": 289850 + }, + { + "epoch": 2.562456903410598, + "grad_norm": 25.817529678344727, + "learning_rate": 7.292384943156704e-06, + "loss": 0.5751, + "step": 289860 + }, + { + "epoch": 2.56254530667091, + "grad_norm": 1.558405876159668, + "learning_rate": 7.290911555484834e-06, + "loss": 0.4515, + "step": 289870 + }, + { + "epoch": 2.562633709931222, + "grad_norm": 2.9240307807922363, + "learning_rate": 7.289438167812962e-06, + "loss": 0.4351, + "step": 289880 + }, + { + "epoch": 2.5627221131915343, + "grad_norm": 2.818044662475586, + "learning_rate": 7.287964780141092e-06, + "loss": 0.4795, + "step": 289890 + }, + { + "epoch": 2.562810516451847, + "grad_norm": 5.411013603210449, + "learning_rate": 7.286491392469222e-06, + "loss": 0.7459, + "step": 289900 + }, + { + "epoch": 2.562898919712159, + "grad_norm": 1.4867072105407715, + "learning_rate": 7.2850180047973504e-06, + "loss": 0.5907, + "step": 289910 + }, + { + "epoch": 2.562987322972471, + "grad_norm": 7.69048547744751, + "learning_rate": 7.2835446171254805e-06, + "loss": 0.6165, + "step": 289920 + }, + { + "epoch": 2.5630757262327837, + "grad_norm": 2.236253261566162, + "learning_rate": 7.282071229453609e-06, + "loss": 0.6116, + "step": 289930 + }, + { + "epoch": 2.563164129493096, + "grad_norm": 2.4824655055999756, + "learning_rate": 7.280597841781739e-06, + "loss": 0.502, + "step": 289940 + }, + { + "epoch": 2.563252532753408, + "grad_norm": 12.701619148254395, + "learning_rate": 7.279124454109867e-06, + "loss": 0.4889, + "step": 289950 + }, + { + "epoch": 2.56334093601372, + "grad_norm": 15.19467830657959, + "learning_rate": 7.277651066437997e-06, + "loss": 0.6758, + "step": 289960 + }, + { + "epoch": 2.563429339274032, + "grad_norm": 4.099213600158691, + "learning_rate": 7.276177678766126e-06, + "loss": 0.5362, + "step": 289970 + }, + { + "epoch": 2.5635177425343447, + "grad_norm": 1.921074628829956, + "learning_rate": 7.274704291094256e-06, + "loss": 0.4708, + "step": 289980 + }, + { + "epoch": 2.563606145794657, + "grad_norm": 2.5999786853790283, + "learning_rate": 7.273230903422386e-06, + "loss": 0.4985, + "step": 289990 + }, + { + "epoch": 2.5636945490549694, + "grad_norm": 2.8438796997070312, + "learning_rate": 7.271757515750514e-06, + "loss": 0.5694, + "step": 290000 + }, + { + "epoch": 2.5637829523152815, + "grad_norm": 4.097357749938965, + "learning_rate": 7.270284128078644e-06, + "loss": 0.329, + "step": 290010 + }, + { + "epoch": 2.5638713555755936, + "grad_norm": 1.739839792251587, + "learning_rate": 7.2688107404067725e-06, + "loss": 0.5045, + "step": 290020 + }, + { + "epoch": 2.5639597588359058, + "grad_norm": 1.1718409061431885, + "learning_rate": 7.2673373527349026e-06, + "loss": 0.5482, + "step": 290030 + }, + { + "epoch": 2.564048162096218, + "grad_norm": 6.105393409729004, + "learning_rate": 7.265863965063032e-06, + "loss": 0.5028, + "step": 290040 + }, + { + "epoch": 2.5641365653565305, + "grad_norm": 2.0577073097229004, + "learning_rate": 7.264390577391161e-06, + "loss": 0.5378, + "step": 290050 + }, + { + "epoch": 2.5642249686168426, + "grad_norm": 11.294700622558594, + "learning_rate": 7.26291718971929e-06, + "loss": 0.5817, + "step": 290060 + }, + { + "epoch": 2.5643133718771547, + "grad_norm": 5.985532283782959, + "learning_rate": 7.26144380204742e-06, + "loss": 0.6717, + "step": 290070 + }, + { + "epoch": 2.5644017751374673, + "grad_norm": 2.467278242111206, + "learning_rate": 7.259970414375549e-06, + "loss": 0.4045, + "step": 290080 + }, + { + "epoch": 2.5644901783977794, + "grad_norm": 6.797497272491455, + "learning_rate": 7.258497026703679e-06, + "loss": 0.4784, + "step": 290090 + }, + { + "epoch": 2.5645785816580915, + "grad_norm": 4.217753887176514, + "learning_rate": 7.257023639031809e-06, + "loss": 0.605, + "step": 290100 + }, + { + "epoch": 2.5646669849184036, + "grad_norm": 4.380706787109375, + "learning_rate": 7.255550251359937e-06, + "loss": 0.5064, + "step": 290110 + }, + { + "epoch": 2.564755388178716, + "grad_norm": 4.851982116699219, + "learning_rate": 7.254076863688067e-06, + "loss": 0.5467, + "step": 290120 + }, + { + "epoch": 2.5648437914390283, + "grad_norm": 5.873217582702637, + "learning_rate": 7.252603476016195e-06, + "loss": 0.5483, + "step": 290130 + }, + { + "epoch": 2.5649321946993404, + "grad_norm": 3.8391072750091553, + "learning_rate": 7.2511300883443255e-06, + "loss": 0.5438, + "step": 290140 + }, + { + "epoch": 2.565020597959653, + "grad_norm": 15.624885559082031, + "learning_rate": 7.249656700672454e-06, + "loss": 0.4947, + "step": 290150 + }, + { + "epoch": 2.565109001219965, + "grad_norm": 3.353505849838257, + "learning_rate": 7.248183313000584e-06, + "loss": 0.5457, + "step": 290160 + }, + { + "epoch": 2.565197404480277, + "grad_norm": 4.867993354797363, + "learning_rate": 7.246709925328714e-06, + "loss": 0.5246, + "step": 290170 + }, + { + "epoch": 2.5652858077405893, + "grad_norm": 4.1318864822387695, + "learning_rate": 7.245236537656842e-06, + "loss": 0.4198, + "step": 290180 + }, + { + "epoch": 2.5653742110009015, + "grad_norm": 3.071706771850586, + "learning_rate": 7.243763149984972e-06, + "loss": 0.5207, + "step": 290190 + }, + { + "epoch": 2.565462614261214, + "grad_norm": 1.5515491962432861, + "learning_rate": 7.242289762313101e-06, + "loss": 0.5754, + "step": 290200 + }, + { + "epoch": 2.565551017521526, + "grad_norm": 3.758007287979126, + "learning_rate": 7.240816374641231e-06, + "loss": 0.5174, + "step": 290210 + }, + { + "epoch": 2.5656394207818387, + "grad_norm": 3.4036102294921875, + "learning_rate": 7.239342986969359e-06, + "loss": 0.5411, + "step": 290220 + }, + { + "epoch": 2.565727824042151, + "grad_norm": 4.982196807861328, + "learning_rate": 7.237869599297489e-06, + "loss": 0.5457, + "step": 290230 + }, + { + "epoch": 2.565816227302463, + "grad_norm": 2.163620948791504, + "learning_rate": 7.2363962116256175e-06, + "loss": 0.6119, + "step": 290240 + }, + { + "epoch": 2.565904630562775, + "grad_norm": 1.7580196857452393, + "learning_rate": 7.2349228239537475e-06, + "loss": 0.5434, + "step": 290250 + }, + { + "epoch": 2.565993033823087, + "grad_norm": 2.9584124088287354, + "learning_rate": 7.2334494362818776e-06, + "loss": 0.5638, + "step": 290260 + }, + { + "epoch": 2.5660814370833998, + "grad_norm": 14.857179641723633, + "learning_rate": 7.231976048610006e-06, + "loss": 0.5925, + "step": 290270 + }, + { + "epoch": 2.566169840343712, + "grad_norm": 2.808340549468994, + "learning_rate": 7.230502660938136e-06, + "loss": 0.6126, + "step": 290280 + }, + { + "epoch": 2.566258243604024, + "grad_norm": 2.5668716430664062, + "learning_rate": 7.229029273266265e-06, + "loss": 0.6546, + "step": 290290 + }, + { + "epoch": 2.5663466468643366, + "grad_norm": 1.6905690431594849, + "learning_rate": 7.227555885594394e-06, + "loss": 0.467, + "step": 290300 + }, + { + "epoch": 2.5664350501246487, + "grad_norm": 2.563905715942383, + "learning_rate": 7.2260824979225236e-06, + "loss": 0.5593, + "step": 290310 + }, + { + "epoch": 2.566523453384961, + "grad_norm": 6.002048969268799, + "learning_rate": 7.224609110250654e-06, + "loss": 0.6478, + "step": 290320 + }, + { + "epoch": 2.566611856645273, + "grad_norm": 17.270204544067383, + "learning_rate": 7.223135722578782e-06, + "loss": 0.5905, + "step": 290330 + }, + { + "epoch": 2.5667002599055855, + "grad_norm": 2.3734970092773438, + "learning_rate": 7.221662334906912e-06, + "loss": 0.426, + "step": 290340 + }, + { + "epoch": 2.5667886631658976, + "grad_norm": 4.115058898925781, + "learning_rate": 7.22018894723504e-06, + "loss": 0.5624, + "step": 290350 + }, + { + "epoch": 2.5668770664262097, + "grad_norm": 3.7801525592803955, + "learning_rate": 7.21871555956317e-06, + "loss": 0.6508, + "step": 290360 + }, + { + "epoch": 2.5669654696865223, + "grad_norm": 14.938491821289062, + "learning_rate": 7.2172421718913005e-06, + "loss": 0.5962, + "step": 290370 + }, + { + "epoch": 2.5670538729468344, + "grad_norm": 8.27540397644043, + "learning_rate": 7.215768784219429e-06, + "loss": 0.5725, + "step": 290380 + }, + { + "epoch": 2.5671422762071465, + "grad_norm": 6.46809196472168, + "learning_rate": 7.214295396547559e-06, + "loss": 0.5122, + "step": 290390 + }, + { + "epoch": 2.5672306794674586, + "grad_norm": 2.815462112426758, + "learning_rate": 7.212822008875687e-06, + "loss": 0.6068, + "step": 290400 + }, + { + "epoch": 2.5673190827277708, + "grad_norm": 2.6178343296051025, + "learning_rate": 7.211348621203817e-06, + "loss": 0.555, + "step": 290410 + }, + { + "epoch": 2.5674074859880833, + "grad_norm": 1.7370020151138306, + "learning_rate": 7.209875233531946e-06, + "loss": 0.4563, + "step": 290420 + }, + { + "epoch": 2.5674958892483954, + "grad_norm": 3.5109968185424805, + "learning_rate": 7.208401845860076e-06, + "loss": 0.6008, + "step": 290430 + }, + { + "epoch": 2.5675842925087076, + "grad_norm": 1.8429661989212036, + "learning_rate": 7.206928458188204e-06, + "loss": 0.3752, + "step": 290440 + }, + { + "epoch": 2.56767269576902, + "grad_norm": 3.4955079555511475, + "learning_rate": 7.205455070516334e-06, + "loss": 0.4237, + "step": 290450 + }, + { + "epoch": 2.5677610990293323, + "grad_norm": 1.7983194589614868, + "learning_rate": 7.203981682844464e-06, + "loss": 0.5043, + "step": 290460 + }, + { + "epoch": 2.5678495022896444, + "grad_norm": 4.6329145431518555, + "learning_rate": 7.2025082951725925e-06, + "loss": 0.4621, + "step": 290470 + }, + { + "epoch": 2.5679379055499565, + "grad_norm": 1.4638735055923462, + "learning_rate": 7.2010349075007225e-06, + "loss": 0.4997, + "step": 290480 + }, + { + "epoch": 2.568026308810269, + "grad_norm": 2.703951358795166, + "learning_rate": 7.199561519828851e-06, + "loss": 0.6752, + "step": 290490 + }, + { + "epoch": 2.568114712070581, + "grad_norm": 10.676141738891602, + "learning_rate": 7.198088132156981e-06, + "loss": 0.4257, + "step": 290500 + }, + { + "epoch": 2.5682031153308933, + "grad_norm": 12.674745559692383, + "learning_rate": 7.19661474448511e-06, + "loss": 0.4279, + "step": 290510 + }, + { + "epoch": 2.568291518591206, + "grad_norm": 3.6587488651275635, + "learning_rate": 7.195141356813239e-06, + "loss": 0.544, + "step": 290520 + }, + { + "epoch": 2.568379921851518, + "grad_norm": 6.788463592529297, + "learning_rate": 7.1936679691413685e-06, + "loss": 0.5196, + "step": 290530 + }, + { + "epoch": 2.56846832511183, + "grad_norm": 2.264103651046753, + "learning_rate": 7.192194581469499e-06, + "loss": 0.5002, + "step": 290540 + }, + { + "epoch": 2.568556728372142, + "grad_norm": 7.4535393714904785, + "learning_rate": 7.190721193797629e-06, + "loss": 0.5253, + "step": 290550 + }, + { + "epoch": 2.5686451316324543, + "grad_norm": 3.7010390758514404, + "learning_rate": 7.189247806125757e-06, + "loss": 0.5149, + "step": 290560 + }, + { + "epoch": 2.568733534892767, + "grad_norm": 4.439531326293945, + "learning_rate": 7.187774418453887e-06, + "loss": 0.4655, + "step": 290570 + }, + { + "epoch": 2.568821938153079, + "grad_norm": 5.954063415527344, + "learning_rate": 7.186301030782015e-06, + "loss": 0.558, + "step": 290580 + }, + { + "epoch": 2.5689103414133916, + "grad_norm": 5.66795015335083, + "learning_rate": 7.1848276431101454e-06, + "loss": 0.4945, + "step": 290590 + }, + { + "epoch": 2.5689987446737037, + "grad_norm": 11.57717514038086, + "learning_rate": 7.183354255438274e-06, + "loss": 0.5607, + "step": 290600 + }, + { + "epoch": 2.569087147934016, + "grad_norm": 11.056740760803223, + "learning_rate": 7.181880867766404e-06, + "loss": 0.5257, + "step": 290610 + }, + { + "epoch": 2.569175551194328, + "grad_norm": 4.887670516967773, + "learning_rate": 7.180407480094532e-06, + "loss": 0.545, + "step": 290620 + }, + { + "epoch": 2.56926395445464, + "grad_norm": 2.2843985557556152, + "learning_rate": 7.178934092422662e-06, + "loss": 0.4112, + "step": 290630 + }, + { + "epoch": 2.5693523577149526, + "grad_norm": 2.801109790802002, + "learning_rate": 7.177460704750792e-06, + "loss": 0.4686, + "step": 290640 + }, + { + "epoch": 2.5694407609752647, + "grad_norm": 6.5461015701293945, + "learning_rate": 7.175987317078921e-06, + "loss": 0.6039, + "step": 290650 + }, + { + "epoch": 2.569529164235577, + "grad_norm": 2.2077338695526123, + "learning_rate": 7.174513929407051e-06, + "loss": 0.4934, + "step": 290660 + }, + { + "epoch": 2.5696175674958894, + "grad_norm": 1.1087141036987305, + "learning_rate": 7.173040541735179e-06, + "loss": 0.4724, + "step": 290670 + }, + { + "epoch": 2.5697059707562016, + "grad_norm": 3.260634422302246, + "learning_rate": 7.171567154063309e-06, + "loss": 0.5141, + "step": 290680 + }, + { + "epoch": 2.5697943740165137, + "grad_norm": 1.350961685180664, + "learning_rate": 7.1700937663914374e-06, + "loss": 0.4749, + "step": 290690 + }, + { + "epoch": 2.569882777276826, + "grad_norm": 3.247637987136841, + "learning_rate": 7.1686203787195675e-06, + "loss": 0.5743, + "step": 290700 + }, + { + "epoch": 2.5699711805371384, + "grad_norm": 8.757539749145508, + "learning_rate": 7.167146991047696e-06, + "loss": 0.5917, + "step": 290710 + }, + { + "epoch": 2.5700595837974505, + "grad_norm": 2.064133882522583, + "learning_rate": 7.165673603375826e-06, + "loss": 0.3803, + "step": 290720 + }, + { + "epoch": 2.5701479870577626, + "grad_norm": 9.388968467712402, + "learning_rate": 7.164200215703956e-06, + "loss": 0.6197, + "step": 290730 + }, + { + "epoch": 2.570236390318075, + "grad_norm": 3.8373959064483643, + "learning_rate": 7.162726828032085e-06, + "loss": 0.5814, + "step": 290740 + }, + { + "epoch": 2.5703247935783873, + "grad_norm": 4.260650634765625, + "learning_rate": 7.161253440360214e-06, + "loss": 0.496, + "step": 290750 + }, + { + "epoch": 2.5704131968386994, + "grad_norm": 2.1455929279327393, + "learning_rate": 7.1597800526883435e-06, + "loss": 0.5604, + "step": 290760 + }, + { + "epoch": 2.5705016000990115, + "grad_norm": 2.7175867557525635, + "learning_rate": 7.158306665016474e-06, + "loss": 0.4768, + "step": 290770 + }, + { + "epoch": 2.5705900033593236, + "grad_norm": 6.203077793121338, + "learning_rate": 7.156833277344602e-06, + "loss": 0.6, + "step": 290780 + }, + { + "epoch": 2.570678406619636, + "grad_norm": 1.0497604608535767, + "learning_rate": 7.155359889672732e-06, + "loss": 0.5461, + "step": 290790 + }, + { + "epoch": 2.5707668098799483, + "grad_norm": 5.892660140991211, + "learning_rate": 7.15388650200086e-06, + "loss": 0.4605, + "step": 290800 + }, + { + "epoch": 2.570855213140261, + "grad_norm": 5.133480072021484, + "learning_rate": 7.15241311432899e-06, + "loss": 0.5128, + "step": 290810 + }, + { + "epoch": 2.570943616400573, + "grad_norm": 2.116699457168579, + "learning_rate": 7.1509397266571204e-06, + "loss": 0.4629, + "step": 290820 + }, + { + "epoch": 2.571032019660885, + "grad_norm": 5.21714448928833, + "learning_rate": 7.149466338985249e-06, + "loss": 0.5383, + "step": 290830 + }, + { + "epoch": 2.5711204229211972, + "grad_norm": 3.169443130493164, + "learning_rate": 7.147992951313379e-06, + "loss": 0.4531, + "step": 290840 + }, + { + "epoch": 2.5712088261815094, + "grad_norm": 10.467610359191895, + "learning_rate": 7.146519563641507e-06, + "loss": 0.5511, + "step": 290850 + }, + { + "epoch": 2.571297229441822, + "grad_norm": 65.54949188232422, + "learning_rate": 7.145046175969637e-06, + "loss": 0.5364, + "step": 290860 + }, + { + "epoch": 2.571385632702134, + "grad_norm": 1.7400555610656738, + "learning_rate": 7.143572788297766e-06, + "loss": 0.5827, + "step": 290870 + }, + { + "epoch": 2.571474035962446, + "grad_norm": 4.752604007720947, + "learning_rate": 7.142099400625896e-06, + "loss": 0.5929, + "step": 290880 + }, + { + "epoch": 2.5715624392227587, + "grad_norm": 4.8134870529174805, + "learning_rate": 7.140626012954024e-06, + "loss": 0.5946, + "step": 290890 + }, + { + "epoch": 2.571650842483071, + "grad_norm": 8.24623966217041, + "learning_rate": 7.139152625282154e-06, + "loss": 0.4232, + "step": 290900 + }, + { + "epoch": 2.571739245743383, + "grad_norm": 2.1741130352020264, + "learning_rate": 7.137679237610282e-06, + "loss": 0.5338, + "step": 290910 + }, + { + "epoch": 2.571827649003695, + "grad_norm": 1.4585745334625244, + "learning_rate": 7.1362058499384125e-06, + "loss": 0.4754, + "step": 290920 + }, + { + "epoch": 2.5719160522640077, + "grad_norm": 4.138350963592529, + "learning_rate": 7.1347324622665425e-06, + "loss": 0.5529, + "step": 290930 + }, + { + "epoch": 2.5720044555243198, + "grad_norm": 0.8282493352890015, + "learning_rate": 7.133259074594671e-06, + "loss": 0.4525, + "step": 290940 + }, + { + "epoch": 2.572092858784632, + "grad_norm": 5.585307598114014, + "learning_rate": 7.131785686922801e-06, + "loss": 0.5183, + "step": 290950 + }, + { + "epoch": 2.5721812620449445, + "grad_norm": 11.064130783081055, + "learning_rate": 7.13031229925093e-06, + "loss": 0.5314, + "step": 290960 + }, + { + "epoch": 2.5722696653052566, + "grad_norm": 1.457417368888855, + "learning_rate": 7.128838911579059e-06, + "loss": 0.3305, + "step": 290970 + }, + { + "epoch": 2.5723580685655687, + "grad_norm": 4.607873916625977, + "learning_rate": 7.1273655239071885e-06, + "loss": 0.576, + "step": 290980 + }, + { + "epoch": 2.572446471825881, + "grad_norm": 1.4441145658493042, + "learning_rate": 7.1258921362353186e-06, + "loss": 0.4073, + "step": 290990 + }, + { + "epoch": 2.572534875086193, + "grad_norm": 3.987887144088745, + "learning_rate": 7.124418748563447e-06, + "loss": 0.5382, + "step": 291000 + }, + { + "epoch": 2.5726232783465055, + "grad_norm": 2.1909964084625244, + "learning_rate": 7.122945360891577e-06, + "loss": 0.5161, + "step": 291010 + }, + { + "epoch": 2.5727116816068176, + "grad_norm": 3.21271014213562, + "learning_rate": 7.121471973219707e-06, + "loss": 0.5984, + "step": 291020 + }, + { + "epoch": 2.5728000848671297, + "grad_norm": 3.0968217849731445, + "learning_rate": 7.119998585547835e-06, + "loss": 0.4589, + "step": 291030 + }, + { + "epoch": 2.5728884881274423, + "grad_norm": 7.352105617523193, + "learning_rate": 7.118525197875965e-06, + "loss": 0.5079, + "step": 291040 + }, + { + "epoch": 2.5729768913877544, + "grad_norm": 1.0211142301559448, + "learning_rate": 7.117051810204094e-06, + "loss": 0.5232, + "step": 291050 + }, + { + "epoch": 2.5730652946480665, + "grad_norm": 3.4918508529663086, + "learning_rate": 7.115578422532224e-06, + "loss": 0.4498, + "step": 291060 + }, + { + "epoch": 2.5731536979083787, + "grad_norm": 3.679516315460205, + "learning_rate": 7.114105034860352e-06, + "loss": 0.4595, + "step": 291070 + }, + { + "epoch": 2.5732421011686912, + "grad_norm": 4.871335983276367, + "learning_rate": 7.112631647188482e-06, + "loss": 0.5449, + "step": 291080 + }, + { + "epoch": 2.5733305044290034, + "grad_norm": 5.770642280578613, + "learning_rate": 7.1111582595166106e-06, + "loss": 0.3627, + "step": 291090 + }, + { + "epoch": 2.5734189076893155, + "grad_norm": 4.444887161254883, + "learning_rate": 7.109684871844741e-06, + "loss": 0.4889, + "step": 291100 + }, + { + "epoch": 2.573507310949628, + "grad_norm": 8.004246711730957, + "learning_rate": 7.108211484172871e-06, + "loss": 0.4644, + "step": 291110 + }, + { + "epoch": 2.57359571420994, + "grad_norm": 5.199616432189941, + "learning_rate": 7.106738096500999e-06, + "loss": 0.582, + "step": 291120 + }, + { + "epoch": 2.5736841174702523, + "grad_norm": 4.122292995452881, + "learning_rate": 7.105264708829129e-06, + "loss": 0.5134, + "step": 291130 + }, + { + "epoch": 2.5737725207305644, + "grad_norm": 2.202420711517334, + "learning_rate": 7.103791321157257e-06, + "loss": 0.509, + "step": 291140 + }, + { + "epoch": 2.5738609239908765, + "grad_norm": 6.41200065612793, + "learning_rate": 7.1023179334853875e-06, + "loss": 0.6085, + "step": 291150 + }, + { + "epoch": 2.573949327251189, + "grad_norm": 3.095994234085083, + "learning_rate": 7.100844545813516e-06, + "loss": 0.5597, + "step": 291160 + }, + { + "epoch": 2.574037730511501, + "grad_norm": 4.163857936859131, + "learning_rate": 7.099371158141646e-06, + "loss": 0.5281, + "step": 291170 + }, + { + "epoch": 2.5741261337718138, + "grad_norm": 8.113866806030273, + "learning_rate": 7.097897770469775e-06, + "loss": 0.4756, + "step": 291180 + }, + { + "epoch": 2.574214537032126, + "grad_norm": 2.458099365234375, + "learning_rate": 7.096424382797904e-06, + "loss": 0.6037, + "step": 291190 + }, + { + "epoch": 2.574302940292438, + "grad_norm": 0.977841854095459, + "learning_rate": 7.094950995126034e-06, + "loss": 0.4939, + "step": 291200 + }, + { + "epoch": 2.57439134355275, + "grad_norm": 2.308629274368286, + "learning_rate": 7.0934776074541635e-06, + "loss": 0.4899, + "step": 291210 + }, + { + "epoch": 2.5744797468130622, + "grad_norm": 2.4277594089508057, + "learning_rate": 7.092004219782293e-06, + "loss": 0.5108, + "step": 291220 + }, + { + "epoch": 2.574568150073375, + "grad_norm": 1.97001314163208, + "learning_rate": 7.090530832110422e-06, + "loss": 0.5088, + "step": 291230 + }, + { + "epoch": 2.574656553333687, + "grad_norm": 11.155823707580566, + "learning_rate": 7.089057444438552e-06, + "loss": 0.6684, + "step": 291240 + }, + { + "epoch": 2.574744956593999, + "grad_norm": 21.078155517578125, + "learning_rate": 7.08758405676668e-06, + "loss": 0.4301, + "step": 291250 + }, + { + "epoch": 2.5748333598543116, + "grad_norm": 3.722567558288574, + "learning_rate": 7.08611066909481e-06, + "loss": 0.5173, + "step": 291260 + }, + { + "epoch": 2.5749217631146237, + "grad_norm": 3.959219455718994, + "learning_rate": 7.084637281422939e-06, + "loss": 0.6406, + "step": 291270 + }, + { + "epoch": 2.575010166374936, + "grad_norm": 1.082443356513977, + "learning_rate": 7.083163893751069e-06, + "loss": 0.6446, + "step": 291280 + }, + { + "epoch": 2.575098569635248, + "grad_norm": 15.530632019042969, + "learning_rate": 7.081690506079199e-06, + "loss": 0.5264, + "step": 291290 + }, + { + "epoch": 2.5751869728955605, + "grad_norm": 1.4701746702194214, + "learning_rate": 7.080217118407327e-06, + "loss": 0.4655, + "step": 291300 + }, + { + "epoch": 2.5752753761558727, + "grad_norm": 9.3839750289917, + "learning_rate": 7.078743730735457e-06, + "loss": 0.6085, + "step": 291310 + }, + { + "epoch": 2.5753637794161848, + "grad_norm": 4.507706165313721, + "learning_rate": 7.0772703430635856e-06, + "loss": 0.607, + "step": 291320 + }, + { + "epoch": 2.5754521826764973, + "grad_norm": 7.476678371429443, + "learning_rate": 7.075796955391716e-06, + "loss": 0.6124, + "step": 291330 + }, + { + "epoch": 2.5755405859368095, + "grad_norm": 4.539484024047852, + "learning_rate": 7.074323567719844e-06, + "loss": 0.6032, + "step": 291340 + }, + { + "epoch": 2.5756289891971216, + "grad_norm": 3.76851224899292, + "learning_rate": 7.072850180047974e-06, + "loss": 0.383, + "step": 291350 + }, + { + "epoch": 2.5757173924574337, + "grad_norm": 1.861653447151184, + "learning_rate": 7.071376792376102e-06, + "loss": 0.5198, + "step": 291360 + }, + { + "epoch": 2.575805795717746, + "grad_norm": 0.5511834025382996, + "learning_rate": 7.0699034047042324e-06, + "loss": 0.4603, + "step": 291370 + }, + { + "epoch": 2.5758941989780584, + "grad_norm": 2.32466721534729, + "learning_rate": 7.0684300170323625e-06, + "loss": 0.514, + "step": 291380 + }, + { + "epoch": 2.5759826022383705, + "grad_norm": 4.2438788414001465, + "learning_rate": 7.066956629360491e-06, + "loss": 0.5226, + "step": 291390 + }, + { + "epoch": 2.576071005498683, + "grad_norm": 1.0155575275421143, + "learning_rate": 7.065483241688621e-06, + "loss": 0.4726, + "step": 291400 + }, + { + "epoch": 2.576159408758995, + "grad_norm": 9.247658729553223, + "learning_rate": 7.064009854016749e-06, + "loss": 0.5221, + "step": 291410 + }, + { + "epoch": 2.5762478120193073, + "grad_norm": 18.262351989746094, + "learning_rate": 7.062536466344879e-06, + "loss": 0.6033, + "step": 291420 + }, + { + "epoch": 2.5763362152796194, + "grad_norm": 6.048531532287598, + "learning_rate": 7.0610630786730085e-06, + "loss": 0.5338, + "step": 291430 + }, + { + "epoch": 2.5764246185399315, + "grad_norm": 5.348221302032471, + "learning_rate": 7.059589691001138e-06, + "loss": 0.5067, + "step": 291440 + }, + { + "epoch": 2.576513021800244, + "grad_norm": 4.324065208435059, + "learning_rate": 7.058116303329267e-06, + "loss": 0.5147, + "step": 291450 + }, + { + "epoch": 2.5766014250605562, + "grad_norm": 2.665686845779419, + "learning_rate": 7.056642915657397e-06, + "loss": 0.4813, + "step": 291460 + }, + { + "epoch": 2.5766898283208683, + "grad_norm": 4.55673885345459, + "learning_rate": 7.055169527985525e-06, + "loss": 0.4666, + "step": 291470 + }, + { + "epoch": 2.576778231581181, + "grad_norm": 3.13246750831604, + "learning_rate": 7.053696140313655e-06, + "loss": 0.5852, + "step": 291480 + }, + { + "epoch": 2.576866634841493, + "grad_norm": 2.843867301940918, + "learning_rate": 7.052222752641785e-06, + "loss": 0.4238, + "step": 291490 + }, + { + "epoch": 2.576955038101805, + "grad_norm": 10.108036994934082, + "learning_rate": 7.050749364969914e-06, + "loss": 0.442, + "step": 291500 + }, + { + "epoch": 2.5770434413621173, + "grad_norm": 1.5499472618103027, + "learning_rate": 7.049275977298044e-06, + "loss": 0.5429, + "step": 291510 + }, + { + "epoch": 2.57713184462243, + "grad_norm": 0.5012868642807007, + "learning_rate": 7.047802589626172e-06, + "loss": 0.5415, + "step": 291520 + }, + { + "epoch": 2.577220247882742, + "grad_norm": 5.1707868576049805, + "learning_rate": 7.046329201954302e-06, + "loss": 0.6645, + "step": 291530 + }, + { + "epoch": 2.577308651143054, + "grad_norm": 10.672884941101074, + "learning_rate": 7.0448558142824305e-06, + "loss": 0.473, + "step": 291540 + }, + { + "epoch": 2.5773970544033666, + "grad_norm": 2.686453342437744, + "learning_rate": 7.043382426610561e-06, + "loss": 0.4776, + "step": 291550 + }, + { + "epoch": 2.5774854576636788, + "grad_norm": 2.4415714740753174, + "learning_rate": 7.041909038938689e-06, + "loss": 0.5348, + "step": 291560 + }, + { + "epoch": 2.577573860923991, + "grad_norm": 2.472630500793457, + "learning_rate": 7.040435651266819e-06, + "loss": 0.4956, + "step": 291570 + }, + { + "epoch": 2.577662264184303, + "grad_norm": 2.7661163806915283, + "learning_rate": 7.038962263594949e-06, + "loss": 0.6924, + "step": 291580 + }, + { + "epoch": 2.577750667444615, + "grad_norm": 3.8075571060180664, + "learning_rate": 7.037488875923077e-06, + "loss": 0.4977, + "step": 291590 + }, + { + "epoch": 2.5778390707049277, + "grad_norm": 4.390771389007568, + "learning_rate": 7.0360154882512074e-06, + "loss": 0.4698, + "step": 291600 + }, + { + "epoch": 2.57792747396524, + "grad_norm": 3.386058807373047, + "learning_rate": 7.034542100579336e-06, + "loss": 0.515, + "step": 291610 + }, + { + "epoch": 2.578015877225552, + "grad_norm": 4.600453853607178, + "learning_rate": 7.033068712907466e-06, + "loss": 0.5238, + "step": 291620 + }, + { + "epoch": 2.5781042804858645, + "grad_norm": 10.396167755126953, + "learning_rate": 7.031595325235594e-06, + "loss": 0.6643, + "step": 291630 + }, + { + "epoch": 2.5781926837461766, + "grad_norm": 1.3148412704467773, + "learning_rate": 7.030121937563724e-06, + "loss": 0.5198, + "step": 291640 + }, + { + "epoch": 2.5782810870064887, + "grad_norm": 3.406574249267578, + "learning_rate": 7.0286485498918534e-06, + "loss": 0.4649, + "step": 291650 + }, + { + "epoch": 2.578369490266801, + "grad_norm": 5.700249671936035, + "learning_rate": 7.027175162219983e-06, + "loss": 0.5073, + "step": 291660 + }, + { + "epoch": 2.5784578935271134, + "grad_norm": 7.972904205322266, + "learning_rate": 7.025701774548113e-06, + "loss": 0.5497, + "step": 291670 + }, + { + "epoch": 2.5785462967874255, + "grad_norm": 2.5727906227111816, + "learning_rate": 7.024228386876242e-06, + "loss": 0.5618, + "step": 291680 + }, + { + "epoch": 2.5786347000477376, + "grad_norm": 3.232928514480591, + "learning_rate": 7.022754999204371e-06, + "loss": 0.5731, + "step": 291690 + }, + { + "epoch": 2.57872310330805, + "grad_norm": 14.807567596435547, + "learning_rate": 7.0212816115325e-06, + "loss": 0.5747, + "step": 291700 + }, + { + "epoch": 2.5788115065683623, + "grad_norm": 4.503212928771973, + "learning_rate": 7.01980822386063e-06, + "loss": 0.6211, + "step": 291710 + }, + { + "epoch": 2.5788999098286745, + "grad_norm": 14.518218040466309, + "learning_rate": 7.018334836188759e-06, + "loss": 0.6095, + "step": 291720 + }, + { + "epoch": 2.5789883130889866, + "grad_norm": 1.5002453327178955, + "learning_rate": 7.016861448516889e-06, + "loss": 0.4681, + "step": 291730 + }, + { + "epoch": 2.5790767163492987, + "grad_norm": 4.636672496795654, + "learning_rate": 7.015388060845017e-06, + "loss": 0.4873, + "step": 291740 + }, + { + "epoch": 2.5791651196096113, + "grad_norm": 5.031120777130127, + "learning_rate": 7.013914673173147e-06, + "loss": 0.4381, + "step": 291750 + }, + { + "epoch": 2.5792535228699234, + "grad_norm": 1.4020417928695679, + "learning_rate": 7.012441285501277e-06, + "loss": 0.4993, + "step": 291760 + }, + { + "epoch": 2.579341926130236, + "grad_norm": 16.746610641479492, + "learning_rate": 7.0109678978294056e-06, + "loss": 0.5543, + "step": 291770 + }, + { + "epoch": 2.579430329390548, + "grad_norm": 4.492101192474365, + "learning_rate": 7.009494510157536e-06, + "loss": 0.4905, + "step": 291780 + }, + { + "epoch": 2.57951873265086, + "grad_norm": 8.429959297180176, + "learning_rate": 7.008021122485664e-06, + "loss": 0.5347, + "step": 291790 + }, + { + "epoch": 2.5796071359111723, + "grad_norm": 12.928462028503418, + "learning_rate": 7.006547734813794e-06, + "loss": 0.5724, + "step": 291800 + }, + { + "epoch": 2.5796955391714844, + "grad_norm": 3.802482843399048, + "learning_rate": 7.005074347141922e-06, + "loss": 0.5424, + "step": 291810 + }, + { + "epoch": 2.579783942431797, + "grad_norm": 2.240936756134033, + "learning_rate": 7.003600959470052e-06, + "loss": 0.5276, + "step": 291820 + }, + { + "epoch": 2.579872345692109, + "grad_norm": 1.8809236288070679, + "learning_rate": 7.002127571798181e-06, + "loss": 0.5818, + "step": 291830 + }, + { + "epoch": 2.5799607489524212, + "grad_norm": 4.016777038574219, + "learning_rate": 7.000654184126311e-06, + "loss": 0.4492, + "step": 291840 + }, + { + "epoch": 2.580049152212734, + "grad_norm": 1.139290690422058, + "learning_rate": 6.999180796454441e-06, + "loss": 0.4718, + "step": 291850 + }, + { + "epoch": 2.580137555473046, + "grad_norm": 9.587358474731445, + "learning_rate": 6.997707408782569e-06, + "loss": 0.5372, + "step": 291860 + }, + { + "epoch": 2.580225958733358, + "grad_norm": 3.1464290618896484, + "learning_rate": 6.996234021110699e-06, + "loss": 0.6249, + "step": 291870 + }, + { + "epoch": 2.58031436199367, + "grad_norm": 2.6658852100372314, + "learning_rate": 6.994760633438828e-06, + "loss": 0.4294, + "step": 291880 + }, + { + "epoch": 2.5804027652539827, + "grad_norm": 3.9911692142486572, + "learning_rate": 6.993287245766958e-06, + "loss": 0.6288, + "step": 291890 + }, + { + "epoch": 2.580491168514295, + "grad_norm": 10.46533203125, + "learning_rate": 6.991813858095087e-06, + "loss": 0.5068, + "step": 291900 + }, + { + "epoch": 2.580579571774607, + "grad_norm": 2.743298053741455, + "learning_rate": 6.990340470423216e-06, + "loss": 0.4005, + "step": 291910 + }, + { + "epoch": 2.5806679750349195, + "grad_norm": 1.4546257257461548, + "learning_rate": 6.988867082751345e-06, + "loss": 0.6281, + "step": 291920 + }, + { + "epoch": 2.5807563782952316, + "grad_norm": 1.837690830230713, + "learning_rate": 6.987393695079475e-06, + "loss": 0.5702, + "step": 291930 + }, + { + "epoch": 2.5808447815555438, + "grad_norm": 27.234432220458984, + "learning_rate": 6.985920307407604e-06, + "loss": 0.5076, + "step": 291940 + }, + { + "epoch": 2.580933184815856, + "grad_norm": 5.142980098724365, + "learning_rate": 6.984446919735734e-06, + "loss": 0.5061, + "step": 291950 + }, + { + "epoch": 2.581021588076168, + "grad_norm": 1.4400602579116821, + "learning_rate": 6.982973532063864e-06, + "loss": 0.4657, + "step": 291960 + }, + { + "epoch": 2.5811099913364806, + "grad_norm": 2.229456901550293, + "learning_rate": 6.981500144391992e-06, + "loss": 0.5392, + "step": 291970 + }, + { + "epoch": 2.5811983945967927, + "grad_norm": 8.779215812683105, + "learning_rate": 6.980026756720122e-06, + "loss": 0.5213, + "step": 291980 + }, + { + "epoch": 2.5812867978571052, + "grad_norm": 3.2285149097442627, + "learning_rate": 6.9785533690482505e-06, + "loss": 0.3628, + "step": 291990 + }, + { + "epoch": 2.5813752011174174, + "grad_norm": 2.145385503768921, + "learning_rate": 6.9770799813763806e-06, + "loss": 0.4221, + "step": 292000 + }, + { + "epoch": 2.5814636043777295, + "grad_norm": 2.6706178188323975, + "learning_rate": 6.975606593704509e-06, + "loss": 0.5243, + "step": 292010 + }, + { + "epoch": 2.5815520076380416, + "grad_norm": 1.8861569166183472, + "learning_rate": 6.974133206032639e-06, + "loss": 0.6026, + "step": 292020 + }, + { + "epoch": 2.5816404108983537, + "grad_norm": 3.7468388080596924, + "learning_rate": 6.972659818360767e-06, + "loss": 0.4442, + "step": 292030 + }, + { + "epoch": 2.5817288141586663, + "grad_norm": 2.6750619411468506, + "learning_rate": 6.971186430688897e-06, + "loss": 0.623, + "step": 292040 + }, + { + "epoch": 2.5818172174189784, + "grad_norm": 1.7387633323669434, + "learning_rate": 6.969713043017027e-06, + "loss": 0.4962, + "step": 292050 + }, + { + "epoch": 2.5819056206792905, + "grad_norm": 4.47970724105835, + "learning_rate": 6.968239655345156e-06, + "loss": 0.4912, + "step": 292060 + }, + { + "epoch": 2.581994023939603, + "grad_norm": 3.878159999847412, + "learning_rate": 6.966766267673286e-06, + "loss": 0.4464, + "step": 292070 + }, + { + "epoch": 2.582082427199915, + "grad_norm": 1.6054596900939941, + "learning_rate": 6.965292880001414e-06, + "loss": 0.4501, + "step": 292080 + }, + { + "epoch": 2.5821708304602273, + "grad_norm": 7.172598838806152, + "learning_rate": 6.963819492329544e-06, + "loss": 0.4703, + "step": 292090 + }, + { + "epoch": 2.5822592337205394, + "grad_norm": 1.799329161643982, + "learning_rate": 6.9623461046576726e-06, + "loss": 0.5882, + "step": 292100 + }, + { + "epoch": 2.582347636980852, + "grad_norm": 5.130559921264648, + "learning_rate": 6.960872716985803e-06, + "loss": 0.6287, + "step": 292110 + }, + { + "epoch": 2.582436040241164, + "grad_norm": 1.756178855895996, + "learning_rate": 6.959399329313932e-06, + "loss": 0.4578, + "step": 292120 + }, + { + "epoch": 2.5825244435014763, + "grad_norm": 2.928774356842041, + "learning_rate": 6.957925941642061e-06, + "loss": 0.5361, + "step": 292130 + }, + { + "epoch": 2.582612846761789, + "grad_norm": 2.8234333992004395, + "learning_rate": 6.956452553970191e-06, + "loss": 0.5118, + "step": 292140 + }, + { + "epoch": 2.582701250022101, + "grad_norm": 2.947486162185669, + "learning_rate": 6.95497916629832e-06, + "loss": 0.533, + "step": 292150 + }, + { + "epoch": 2.582789653282413, + "grad_norm": 4.809210300445557, + "learning_rate": 6.9535057786264495e-06, + "loss": 0.4666, + "step": 292160 + }, + { + "epoch": 2.582878056542725, + "grad_norm": 3.416128158569336, + "learning_rate": 6.952032390954579e-06, + "loss": 0.5151, + "step": 292170 + }, + { + "epoch": 2.5829664598030373, + "grad_norm": 2.537780523300171, + "learning_rate": 6.950559003282709e-06, + "loss": 0.5173, + "step": 292180 + }, + { + "epoch": 2.58305486306335, + "grad_norm": 3.0182573795318604, + "learning_rate": 6.949085615610837e-06, + "loss": 0.6276, + "step": 292190 + }, + { + "epoch": 2.583143266323662, + "grad_norm": 2.347623586654663, + "learning_rate": 6.947612227938967e-06, + "loss": 0.4996, + "step": 292200 + }, + { + "epoch": 2.583231669583974, + "grad_norm": 3.604372262954712, + "learning_rate": 6.9461388402670955e-06, + "loss": 0.6324, + "step": 292210 + }, + { + "epoch": 2.5833200728442867, + "grad_norm": 3.270852565765381, + "learning_rate": 6.9446654525952255e-06, + "loss": 0.5419, + "step": 292220 + }, + { + "epoch": 2.583408476104599, + "grad_norm": 2.040982961654663, + "learning_rate": 6.9431920649233556e-06, + "loss": 0.4502, + "step": 292230 + }, + { + "epoch": 2.583496879364911, + "grad_norm": 2.343367576599121, + "learning_rate": 6.941718677251484e-06, + "loss": 0.4344, + "step": 292240 + }, + { + "epoch": 2.583585282625223, + "grad_norm": 2.3239080905914307, + "learning_rate": 6.940245289579614e-06, + "loss": 0.5203, + "step": 292250 + }, + { + "epoch": 2.5836736858855356, + "grad_norm": 2.8480818271636963, + "learning_rate": 6.938771901907742e-06, + "loss": 0.4559, + "step": 292260 + }, + { + "epoch": 2.5837620891458477, + "grad_norm": 3.2248990535736084, + "learning_rate": 6.937298514235872e-06, + "loss": 0.5474, + "step": 292270 + }, + { + "epoch": 2.58385049240616, + "grad_norm": 2.3616273403167725, + "learning_rate": 6.935825126564001e-06, + "loss": 0.5285, + "step": 292280 + }, + { + "epoch": 2.5839388956664724, + "grad_norm": 2.1086254119873047, + "learning_rate": 6.934351738892131e-06, + "loss": 0.4911, + "step": 292290 + }, + { + "epoch": 2.5840272989267845, + "grad_norm": 2.063842535018921, + "learning_rate": 6.932878351220259e-06, + "loss": 0.4746, + "step": 292300 + }, + { + "epoch": 2.5841157021870966, + "grad_norm": 14.700190544128418, + "learning_rate": 6.931404963548389e-06, + "loss": 0.55, + "step": 292310 + }, + { + "epoch": 2.5842041054474088, + "grad_norm": 1.7784148454666138, + "learning_rate": 6.929931575876519e-06, + "loss": 0.5671, + "step": 292320 + }, + { + "epoch": 2.584292508707721, + "grad_norm": 3.315866470336914, + "learning_rate": 6.928458188204648e-06, + "loss": 0.5638, + "step": 292330 + }, + { + "epoch": 2.5843809119680334, + "grad_norm": 2.8437485694885254, + "learning_rate": 6.926984800532778e-06, + "loss": 0.4881, + "step": 292340 + }, + { + "epoch": 2.5844693152283456, + "grad_norm": 3.2945058345794678, + "learning_rate": 6.925511412860906e-06, + "loss": 0.5116, + "step": 292350 + }, + { + "epoch": 2.584557718488658, + "grad_norm": 9.973116874694824, + "learning_rate": 6.924038025189036e-06, + "loss": 0.557, + "step": 292360 + }, + { + "epoch": 2.5846461217489702, + "grad_norm": 8.151418685913086, + "learning_rate": 6.922564637517165e-06, + "loss": 0.4313, + "step": 292370 + }, + { + "epoch": 2.5847345250092824, + "grad_norm": 2.0133256912231445, + "learning_rate": 6.9210912498452944e-06, + "loss": 0.3309, + "step": 292380 + }, + { + "epoch": 2.5848229282695945, + "grad_norm": 2.5932140350341797, + "learning_rate": 6.919617862173424e-06, + "loss": 0.6006, + "step": 292390 + }, + { + "epoch": 2.5849113315299066, + "grad_norm": 1.9027341604232788, + "learning_rate": 6.918144474501554e-06, + "loss": 0.5741, + "step": 292400 + }, + { + "epoch": 2.584999734790219, + "grad_norm": 4.40666389465332, + "learning_rate": 6.916671086829683e-06, + "loss": 0.5399, + "step": 292410 + }, + { + "epoch": 2.5850881380505313, + "grad_norm": 26.223644256591797, + "learning_rate": 6.915197699157812e-06, + "loss": 0.414, + "step": 292420 + }, + { + "epoch": 2.5851765413108434, + "grad_norm": 2.5686771869659424, + "learning_rate": 6.913724311485942e-06, + "loss": 0.5013, + "step": 292430 + }, + { + "epoch": 2.585264944571156, + "grad_norm": 3.8418638706207275, + "learning_rate": 6.9122509238140705e-06, + "loss": 0.6072, + "step": 292440 + }, + { + "epoch": 2.585353347831468, + "grad_norm": 1.150220513343811, + "learning_rate": 6.9107775361422005e-06, + "loss": 0.6113, + "step": 292450 + }, + { + "epoch": 2.58544175109178, + "grad_norm": 4.6357340812683105, + "learning_rate": 6.909304148470329e-06, + "loss": 0.5366, + "step": 292460 + }, + { + "epoch": 2.5855301543520923, + "grad_norm": 2.094116687774658, + "learning_rate": 6.907830760798459e-06, + "loss": 0.6715, + "step": 292470 + }, + { + "epoch": 2.585618557612405, + "grad_norm": 6.502612113952637, + "learning_rate": 6.906357373126587e-06, + "loss": 0.584, + "step": 292480 + }, + { + "epoch": 2.585706960872717, + "grad_norm": 3.7684459686279297, + "learning_rate": 6.904883985454717e-06, + "loss": 0.4474, + "step": 292490 + }, + { + "epoch": 2.585795364133029, + "grad_norm": 6.002575397491455, + "learning_rate": 6.903410597782846e-06, + "loss": 0.6087, + "step": 292500 + }, + { + "epoch": 2.5858837673933417, + "grad_norm": 8.938801765441895, + "learning_rate": 6.901937210110976e-06, + "loss": 0.4872, + "step": 292510 + }, + { + "epoch": 2.585972170653654, + "grad_norm": 3.5787723064422607, + "learning_rate": 6.900463822439106e-06, + "loss": 0.4206, + "step": 292520 + }, + { + "epoch": 2.586060573913966, + "grad_norm": 3.0640437602996826, + "learning_rate": 6.898990434767234e-06, + "loss": 0.5418, + "step": 292530 + }, + { + "epoch": 2.586148977174278, + "grad_norm": 4.498950958251953, + "learning_rate": 6.897517047095364e-06, + "loss": 0.4714, + "step": 292540 + }, + { + "epoch": 2.58623738043459, + "grad_norm": 3.8163726329803467, + "learning_rate": 6.8960436594234925e-06, + "loss": 0.4589, + "step": 292550 + }, + { + "epoch": 2.5863257836949027, + "grad_norm": 2.2846148014068604, + "learning_rate": 6.894570271751623e-06, + "loss": 0.4821, + "step": 292560 + }, + { + "epoch": 2.586414186955215, + "grad_norm": 2.3698744773864746, + "learning_rate": 6.893096884079751e-06, + "loss": 0.508, + "step": 292570 + }, + { + "epoch": 2.5865025902155274, + "grad_norm": 2.436260461807251, + "learning_rate": 6.891623496407881e-06, + "loss": 0.4885, + "step": 292580 + }, + { + "epoch": 2.5865909934758395, + "grad_norm": 2.490861177444458, + "learning_rate": 6.89015010873601e-06, + "loss": 0.4432, + "step": 292590 + }, + { + "epoch": 2.5866793967361517, + "grad_norm": 5.595514297485352, + "learning_rate": 6.888676721064139e-06, + "loss": 0.5604, + "step": 292600 + }, + { + "epoch": 2.586767799996464, + "grad_norm": 6.280392169952393, + "learning_rate": 6.8872033333922694e-06, + "loss": 0.553, + "step": 292610 + }, + { + "epoch": 2.586856203256776, + "grad_norm": 4.978307247161865, + "learning_rate": 6.885729945720399e-06, + "loss": 0.4413, + "step": 292620 + }, + { + "epoch": 2.5869446065170885, + "grad_norm": 2.991804361343384, + "learning_rate": 6.884256558048528e-06, + "loss": 0.4585, + "step": 292630 + }, + { + "epoch": 2.5870330097774006, + "grad_norm": 1.7155733108520508, + "learning_rate": 6.882783170376657e-06, + "loss": 0.4433, + "step": 292640 + }, + { + "epoch": 2.5871214130377127, + "grad_norm": 3.4233663082122803, + "learning_rate": 6.881309782704787e-06, + "loss": 0.5758, + "step": 292650 + }, + { + "epoch": 2.5872098162980253, + "grad_norm": 3.8794593811035156, + "learning_rate": 6.8798363950329155e-06, + "loss": 0.5169, + "step": 292660 + }, + { + "epoch": 2.5872982195583374, + "grad_norm": 3.6183128356933594, + "learning_rate": 6.8783630073610455e-06, + "loss": 0.4333, + "step": 292670 + }, + { + "epoch": 2.5873866228186495, + "grad_norm": 2.976161479949951, + "learning_rate": 6.876889619689174e-06, + "loss": 0.5968, + "step": 292680 + }, + { + "epoch": 2.5874750260789616, + "grad_norm": 2.957042932510376, + "learning_rate": 6.875416232017304e-06, + "loss": 0.5034, + "step": 292690 + }, + { + "epoch": 2.587563429339274, + "grad_norm": 1.2600058317184448, + "learning_rate": 6.873942844345434e-06, + "loss": 0.6782, + "step": 292700 + }, + { + "epoch": 2.5876518325995863, + "grad_norm": 5.446626663208008, + "learning_rate": 6.872469456673562e-06, + "loss": 0.6031, + "step": 292710 + }, + { + "epoch": 2.5877402358598984, + "grad_norm": 1.6346627473831177, + "learning_rate": 6.870996069001692e-06, + "loss": 0.5173, + "step": 292720 + }, + { + "epoch": 2.587828639120211, + "grad_norm": 2.048769235610962, + "learning_rate": 6.869522681329821e-06, + "loss": 0.6187, + "step": 292730 + }, + { + "epoch": 2.587917042380523, + "grad_norm": 2.956887722015381, + "learning_rate": 6.868049293657951e-06, + "loss": 0.4667, + "step": 292740 + }, + { + "epoch": 2.5880054456408352, + "grad_norm": 2.4051804542541504, + "learning_rate": 6.866575905986079e-06, + "loss": 0.417, + "step": 292750 + }, + { + "epoch": 2.5880938489011474, + "grad_norm": 10.679434776306152, + "learning_rate": 6.865102518314209e-06, + "loss": 0.5392, + "step": 292760 + }, + { + "epoch": 2.5881822521614595, + "grad_norm": 2.4372687339782715, + "learning_rate": 6.8636291306423375e-06, + "loss": 0.5385, + "step": 292770 + }, + { + "epoch": 2.588270655421772, + "grad_norm": 7.295755863189697, + "learning_rate": 6.8621557429704676e-06, + "loss": 0.4872, + "step": 292780 + }, + { + "epoch": 2.588359058682084, + "grad_norm": 3.7196269035339355, + "learning_rate": 6.860682355298598e-06, + "loss": 0.6283, + "step": 292790 + }, + { + "epoch": 2.5884474619423963, + "grad_norm": 4.572228908538818, + "learning_rate": 6.859208967626726e-06, + "loss": 0.479, + "step": 292800 + }, + { + "epoch": 2.588535865202709, + "grad_norm": 2.9899957180023193, + "learning_rate": 6.857735579954856e-06, + "loss": 0.4329, + "step": 292810 + }, + { + "epoch": 2.588624268463021, + "grad_norm": 1.0502655506134033, + "learning_rate": 6.856262192282984e-06, + "loss": 0.5552, + "step": 292820 + }, + { + "epoch": 2.588712671723333, + "grad_norm": 2.199028253555298, + "learning_rate": 6.854788804611114e-06, + "loss": 0.5716, + "step": 292830 + }, + { + "epoch": 2.588801074983645, + "grad_norm": 3.34179425239563, + "learning_rate": 6.853315416939244e-06, + "loss": 0.4623, + "step": 292840 + }, + { + "epoch": 2.5888894782439578, + "grad_norm": 5.211669921875, + "learning_rate": 6.851842029267373e-06, + "loss": 0.4642, + "step": 292850 + }, + { + "epoch": 2.58897788150427, + "grad_norm": 1.247335433959961, + "learning_rate": 6.850368641595502e-06, + "loss": 0.5526, + "step": 292860 + }, + { + "epoch": 2.589066284764582, + "grad_norm": 6.531829833984375, + "learning_rate": 6.848895253923632e-06, + "loss": 0.5922, + "step": 292870 + }, + { + "epoch": 2.5891546880248946, + "grad_norm": 3.9670703411102295, + "learning_rate": 6.847421866251762e-06, + "loss": 0.6429, + "step": 292880 + }, + { + "epoch": 2.5892430912852067, + "grad_norm": 2.0874271392822266, + "learning_rate": 6.8459484785798905e-06, + "loss": 0.5423, + "step": 292890 + }, + { + "epoch": 2.589331494545519, + "grad_norm": 3.6124894618988037, + "learning_rate": 6.8444750909080205e-06, + "loss": 0.6657, + "step": 292900 + }, + { + "epoch": 2.589419897805831, + "grad_norm": 6.686690330505371, + "learning_rate": 6.843001703236149e-06, + "loss": 0.4737, + "step": 292910 + }, + { + "epoch": 2.589508301066143, + "grad_norm": 6.700619697570801, + "learning_rate": 6.841528315564279e-06, + "loss": 0.5393, + "step": 292920 + }, + { + "epoch": 2.5895967043264556, + "grad_norm": 2.9612913131713867, + "learning_rate": 6.840054927892407e-06, + "loss": 0.5218, + "step": 292930 + }, + { + "epoch": 2.5896851075867677, + "grad_norm": 4.346780300140381, + "learning_rate": 6.838581540220537e-06, + "loss": 0.5137, + "step": 292940 + }, + { + "epoch": 2.5897735108470803, + "grad_norm": 10.16402530670166, + "learning_rate": 6.837108152548666e-06, + "loss": 0.5192, + "step": 292950 + }, + { + "epoch": 2.5898619141073924, + "grad_norm": 1.476745367050171, + "learning_rate": 6.835634764876796e-06, + "loss": 0.5833, + "step": 292960 + }, + { + "epoch": 2.5899503173677045, + "grad_norm": 4.687344551086426, + "learning_rate": 6.834161377204926e-06, + "loss": 0.5234, + "step": 292970 + }, + { + "epoch": 2.5900387206280167, + "grad_norm": 5.503157138824463, + "learning_rate": 6.832687989533054e-06, + "loss": 0.5556, + "step": 292980 + }, + { + "epoch": 2.5901271238883288, + "grad_norm": 1.8212943077087402, + "learning_rate": 6.831214601861184e-06, + "loss": 0.5415, + "step": 292990 + }, + { + "epoch": 2.5902155271486413, + "grad_norm": 1.8990623950958252, + "learning_rate": 6.8297412141893125e-06, + "loss": 0.5219, + "step": 293000 + }, + { + "epoch": 2.5903039304089535, + "grad_norm": 2.9374685287475586, + "learning_rate": 6.8282678265174426e-06, + "loss": 0.5493, + "step": 293010 + }, + { + "epoch": 2.5903923336692656, + "grad_norm": 1.1792315244674683, + "learning_rate": 6.826794438845571e-06, + "loss": 0.5484, + "step": 293020 + }, + { + "epoch": 2.590480736929578, + "grad_norm": 3.6119019985198975, + "learning_rate": 6.825321051173701e-06, + "loss": 0.4867, + "step": 293030 + }, + { + "epoch": 2.5905691401898903, + "grad_norm": 2.456906795501709, + "learning_rate": 6.823847663501829e-06, + "loss": 0.5835, + "step": 293040 + }, + { + "epoch": 2.5906575434502024, + "grad_norm": 1.5770152807235718, + "learning_rate": 6.822374275829959e-06, + "loss": 0.4664, + "step": 293050 + }, + { + "epoch": 2.5907459467105145, + "grad_norm": 12.938225746154785, + "learning_rate": 6.8209008881580886e-06, + "loss": 0.5325, + "step": 293060 + }, + { + "epoch": 2.590834349970827, + "grad_norm": 2.6568989753723145, + "learning_rate": 6.819427500486219e-06, + "loss": 0.57, + "step": 293070 + }, + { + "epoch": 2.590922753231139, + "grad_norm": 3.836038112640381, + "learning_rate": 6.817954112814348e-06, + "loss": 0.4892, + "step": 293080 + }, + { + "epoch": 2.5910111564914513, + "grad_norm": 4.444672107696533, + "learning_rate": 6.816480725142477e-06, + "loss": 0.5601, + "step": 293090 + }, + { + "epoch": 2.591099559751764, + "grad_norm": 3.7013204097747803, + "learning_rate": 6.815007337470607e-06, + "loss": 0.5911, + "step": 293100 + }, + { + "epoch": 2.591187963012076, + "grad_norm": 10.285282135009766, + "learning_rate": 6.8135339497987354e-06, + "loss": 0.4658, + "step": 293110 + }, + { + "epoch": 2.591276366272388, + "grad_norm": 1.0226889848709106, + "learning_rate": 6.8120605621268655e-06, + "loss": 0.5265, + "step": 293120 + }, + { + "epoch": 2.5913647695327002, + "grad_norm": 6.298889636993408, + "learning_rate": 6.810587174454994e-06, + "loss": 0.4475, + "step": 293130 + }, + { + "epoch": 2.5914531727930123, + "grad_norm": 3.454969882965088, + "learning_rate": 6.809113786783124e-06, + "loss": 0.5474, + "step": 293140 + }, + { + "epoch": 2.591541576053325, + "grad_norm": 3.0586862564086914, + "learning_rate": 6.807640399111252e-06, + "loss": 0.69, + "step": 293150 + }, + { + "epoch": 2.591629979313637, + "grad_norm": 2.520907163619995, + "learning_rate": 6.806167011439382e-06, + "loss": 0.5198, + "step": 293160 + }, + { + "epoch": 2.5917183825739496, + "grad_norm": 2.093618154525757, + "learning_rate": 6.804693623767512e-06, + "loss": 0.5669, + "step": 293170 + }, + { + "epoch": 2.5918067858342617, + "grad_norm": 2.001154899597168, + "learning_rate": 6.803220236095641e-06, + "loss": 0.5879, + "step": 293180 + }, + { + "epoch": 2.591895189094574, + "grad_norm": 2.7250523567199707, + "learning_rate": 6.801746848423771e-06, + "loss": 0.5414, + "step": 293190 + }, + { + "epoch": 2.591983592354886, + "grad_norm": 6.527849197387695, + "learning_rate": 6.800273460751899e-06, + "loss": 0.5388, + "step": 293200 + }, + { + "epoch": 2.592071995615198, + "grad_norm": 1.1496959924697876, + "learning_rate": 6.798800073080029e-06, + "loss": 0.443, + "step": 293210 + }, + { + "epoch": 2.5921603988755106, + "grad_norm": 16.28754425048828, + "learning_rate": 6.7973266854081575e-06, + "loss": 0.5199, + "step": 293220 + }, + { + "epoch": 2.5922488021358228, + "grad_norm": 1.639220118522644, + "learning_rate": 6.7958532977362875e-06, + "loss": 0.5044, + "step": 293230 + }, + { + "epoch": 2.592337205396135, + "grad_norm": 0.9519252777099609, + "learning_rate": 6.794379910064416e-06, + "loss": 0.6048, + "step": 293240 + }, + { + "epoch": 2.5924256086564474, + "grad_norm": 9.06883716583252, + "learning_rate": 6.792906522392546e-06, + "loss": 0.4995, + "step": 293250 + }, + { + "epoch": 2.5925140119167596, + "grad_norm": 1.7972859144210815, + "learning_rate": 6.791433134720676e-06, + "loss": 0.5331, + "step": 293260 + }, + { + "epoch": 2.5926024151770717, + "grad_norm": 12.903730392456055, + "learning_rate": 6.789959747048804e-06, + "loss": 0.4519, + "step": 293270 + }, + { + "epoch": 2.592690818437384, + "grad_norm": 2.6765573024749756, + "learning_rate": 6.788486359376934e-06, + "loss": 0.5069, + "step": 293280 + }, + { + "epoch": 2.5927792216976964, + "grad_norm": 4.464567184448242, + "learning_rate": 6.787012971705064e-06, + "loss": 0.5176, + "step": 293290 + }, + { + "epoch": 2.5928676249580085, + "grad_norm": 3.4707229137420654, + "learning_rate": 6.785539584033193e-06, + "loss": 0.5272, + "step": 293300 + }, + { + "epoch": 2.5929560282183206, + "grad_norm": 3.1571478843688965, + "learning_rate": 6.784066196361322e-06, + "loss": 0.4716, + "step": 293310 + }, + { + "epoch": 2.593044431478633, + "grad_norm": 7.455725193023682, + "learning_rate": 6.782592808689452e-06, + "loss": 0.5002, + "step": 293320 + }, + { + "epoch": 2.5931328347389453, + "grad_norm": 4.7907538414001465, + "learning_rate": 6.78111942101758e-06, + "loss": 0.5718, + "step": 293330 + }, + { + "epoch": 2.5932212379992574, + "grad_norm": 0.5910280346870422, + "learning_rate": 6.7796460333457104e-06, + "loss": 0.5333, + "step": 293340 + }, + { + "epoch": 2.5933096412595695, + "grad_norm": 1.8960814476013184, + "learning_rate": 6.7781726456738405e-06, + "loss": 0.5654, + "step": 293350 + }, + { + "epoch": 2.5933980445198817, + "grad_norm": 1.122652292251587, + "learning_rate": 6.776699258001969e-06, + "loss": 0.4571, + "step": 293360 + }, + { + "epoch": 2.593486447780194, + "grad_norm": 2.042668104171753, + "learning_rate": 6.775225870330099e-06, + "loss": 0.4593, + "step": 293370 + }, + { + "epoch": 2.5935748510405063, + "grad_norm": 3.7132017612457275, + "learning_rate": 6.773752482658227e-06, + "loss": 0.4972, + "step": 293380 + }, + { + "epoch": 2.5936632543008185, + "grad_norm": 22.958904266357422, + "learning_rate": 6.772279094986357e-06, + "loss": 0.4888, + "step": 293390 + }, + { + "epoch": 2.593751657561131, + "grad_norm": 2.0450029373168945, + "learning_rate": 6.770805707314486e-06, + "loss": 0.602, + "step": 293400 + }, + { + "epoch": 2.593840060821443, + "grad_norm": 4.021274566650391, + "learning_rate": 6.769332319642616e-06, + "loss": 0.5367, + "step": 293410 + }, + { + "epoch": 2.5939284640817553, + "grad_norm": 4.320457935333252, + "learning_rate": 6.767858931970744e-06, + "loss": 0.5382, + "step": 293420 + }, + { + "epoch": 2.5940168673420674, + "grad_norm": 3.3327600955963135, + "learning_rate": 6.766385544298874e-06, + "loss": 0.699, + "step": 293430 + }, + { + "epoch": 2.59410527060238, + "grad_norm": 1.795789122581482, + "learning_rate": 6.764912156627004e-06, + "loss": 0.4653, + "step": 293440 + }, + { + "epoch": 2.594193673862692, + "grad_norm": 1.9834449291229248, + "learning_rate": 6.7634387689551325e-06, + "loss": 0.5062, + "step": 293450 + }, + { + "epoch": 2.594282077123004, + "grad_norm": 1.3940629959106445, + "learning_rate": 6.7619653812832625e-06, + "loss": 0.6478, + "step": 293460 + }, + { + "epoch": 2.5943704803833167, + "grad_norm": 0.9117072820663452, + "learning_rate": 6.760491993611391e-06, + "loss": 0.4628, + "step": 293470 + }, + { + "epoch": 2.594458883643629, + "grad_norm": 1.6469322443008423, + "learning_rate": 6.759018605939521e-06, + "loss": 0.51, + "step": 293480 + }, + { + "epoch": 2.594547286903941, + "grad_norm": 2.0187251567840576, + "learning_rate": 6.757545218267649e-06, + "loss": 0.4796, + "step": 293490 + }, + { + "epoch": 2.594635690164253, + "grad_norm": 2.477034568786621, + "learning_rate": 6.756071830595779e-06, + "loss": 0.4319, + "step": 293500 + }, + { + "epoch": 2.5947240934245652, + "grad_norm": 0.7550190091133118, + "learning_rate": 6.7545984429239085e-06, + "loss": 0.5292, + "step": 293510 + }, + { + "epoch": 2.594812496684878, + "grad_norm": 3.6989614963531494, + "learning_rate": 6.753125055252038e-06, + "loss": 0.3882, + "step": 293520 + }, + { + "epoch": 2.59490089994519, + "grad_norm": 2.6704907417297363, + "learning_rate": 6.751651667580168e-06, + "loss": 0.5552, + "step": 293530 + }, + { + "epoch": 2.5949893032055025, + "grad_norm": 2.1917121410369873, + "learning_rate": 6.750178279908297e-06, + "loss": 0.5563, + "step": 293540 + }, + { + "epoch": 2.5950777064658146, + "grad_norm": 2.030796527862549, + "learning_rate": 6.748704892236426e-06, + "loss": 0.5373, + "step": 293550 + }, + { + "epoch": 2.5951661097261267, + "grad_norm": 3.814898729324341, + "learning_rate": 6.747231504564555e-06, + "loss": 0.4544, + "step": 293560 + }, + { + "epoch": 2.595254512986439, + "grad_norm": 1.8269858360290527, + "learning_rate": 6.7457581168926854e-06, + "loss": 0.4529, + "step": 293570 + }, + { + "epoch": 2.595342916246751, + "grad_norm": 3.126007318496704, + "learning_rate": 6.744284729220814e-06, + "loss": 0.5682, + "step": 293580 + }, + { + "epoch": 2.5954313195070635, + "grad_norm": 4.134472370147705, + "learning_rate": 6.742811341548944e-06, + "loss": 0.6549, + "step": 293590 + }, + { + "epoch": 2.5955197227673756, + "grad_norm": 19.152082443237305, + "learning_rate": 6.741337953877072e-06, + "loss": 0.4568, + "step": 293600 + }, + { + "epoch": 2.5956081260276878, + "grad_norm": 2.3416574001312256, + "learning_rate": 6.739864566205202e-06, + "loss": 0.4323, + "step": 293610 + }, + { + "epoch": 2.5956965292880003, + "grad_norm": 5.511506080627441, + "learning_rate": 6.738391178533331e-06, + "loss": 0.4715, + "step": 293620 + }, + { + "epoch": 2.5957849325483124, + "grad_norm": 3.00129771232605, + "learning_rate": 6.736917790861461e-06, + "loss": 0.5729, + "step": 293630 + }, + { + "epoch": 2.5958733358086246, + "grad_norm": 4.726401329040527, + "learning_rate": 6.735444403189591e-06, + "loss": 0.5573, + "step": 293640 + }, + { + "epoch": 2.5959617390689367, + "grad_norm": 9.304644584655762, + "learning_rate": 6.733971015517719e-06, + "loss": 0.4493, + "step": 293650 + }, + { + "epoch": 2.5960501423292492, + "grad_norm": 1.8431310653686523, + "learning_rate": 6.732497627845849e-06, + "loss": 0.6012, + "step": 293660 + }, + { + "epoch": 2.5961385455895614, + "grad_norm": 4.896688938140869, + "learning_rate": 6.7310242401739775e-06, + "loss": 0.6326, + "step": 293670 + }, + { + "epoch": 2.5962269488498735, + "grad_norm": 3.830872058868408, + "learning_rate": 6.7295508525021075e-06, + "loss": 0.6058, + "step": 293680 + }, + { + "epoch": 2.596315352110186, + "grad_norm": 9.97799301147461, + "learning_rate": 6.728077464830236e-06, + "loss": 0.5738, + "step": 293690 + }, + { + "epoch": 2.596403755370498, + "grad_norm": 5.134629249572754, + "learning_rate": 6.726604077158366e-06, + "loss": 0.5596, + "step": 293700 + }, + { + "epoch": 2.5964921586308103, + "grad_norm": 5.130037784576416, + "learning_rate": 6.725130689486494e-06, + "loss": 0.3979, + "step": 293710 + }, + { + "epoch": 2.5965805618911224, + "grad_norm": 1.2459595203399658, + "learning_rate": 6.723657301814624e-06, + "loss": 0.5936, + "step": 293720 + }, + { + "epoch": 2.5966689651514345, + "grad_norm": 6.082571983337402, + "learning_rate": 6.722183914142754e-06, + "loss": 0.5168, + "step": 293730 + }, + { + "epoch": 2.596757368411747, + "grad_norm": 3.468639612197876, + "learning_rate": 6.720710526470883e-06, + "loss": 0.405, + "step": 293740 + }, + { + "epoch": 2.596845771672059, + "grad_norm": 2.8851664066314697, + "learning_rate": 6.719237138799013e-06, + "loss": 0.5681, + "step": 293750 + }, + { + "epoch": 2.5969341749323718, + "grad_norm": 4.932288646697998, + "learning_rate": 6.717763751127142e-06, + "loss": 0.5939, + "step": 293760 + }, + { + "epoch": 2.597022578192684, + "grad_norm": 3.668128728866577, + "learning_rate": 6.716290363455271e-06, + "loss": 0.4469, + "step": 293770 + }, + { + "epoch": 2.597110981452996, + "grad_norm": 2.994962453842163, + "learning_rate": 6.7148169757834e-06, + "loss": 0.5038, + "step": 293780 + }, + { + "epoch": 2.597199384713308, + "grad_norm": 21.70534896850586, + "learning_rate": 6.71334358811153e-06, + "loss": 0.4317, + "step": 293790 + }, + { + "epoch": 2.5972877879736203, + "grad_norm": 3.046926975250244, + "learning_rate": 6.711870200439659e-06, + "loss": 0.5244, + "step": 293800 + }, + { + "epoch": 2.597376191233933, + "grad_norm": 3.289400339126587, + "learning_rate": 6.710396812767789e-06, + "loss": 0.56, + "step": 293810 + }, + { + "epoch": 2.597464594494245, + "grad_norm": 2.5781710147857666, + "learning_rate": 6.708923425095919e-06, + "loss": 0.4709, + "step": 293820 + }, + { + "epoch": 2.597552997754557, + "grad_norm": 4.468500137329102, + "learning_rate": 6.707450037424047e-06, + "loss": 0.4754, + "step": 293830 + }, + { + "epoch": 2.5976414010148696, + "grad_norm": 2.1405742168426514, + "learning_rate": 6.705976649752177e-06, + "loss": 0.5149, + "step": 293840 + }, + { + "epoch": 2.5977298042751817, + "grad_norm": 8.292662620544434, + "learning_rate": 6.704503262080306e-06, + "loss": 0.431, + "step": 293850 + }, + { + "epoch": 2.597818207535494, + "grad_norm": 19.339670181274414, + "learning_rate": 6.703029874408436e-06, + "loss": 0.4069, + "step": 293860 + }, + { + "epoch": 2.597906610795806, + "grad_norm": 7.4842400550842285, + "learning_rate": 6.701556486736564e-06, + "loss": 0.6326, + "step": 293870 + }, + { + "epoch": 2.5979950140561185, + "grad_norm": 6.548213958740234, + "learning_rate": 6.700083099064694e-06, + "loss": 0.4255, + "step": 293880 + }, + { + "epoch": 2.5980834173164307, + "grad_norm": 4.842371940612793, + "learning_rate": 6.698609711392822e-06, + "loss": 0.5078, + "step": 293890 + }, + { + "epoch": 2.598171820576743, + "grad_norm": 1.9100135564804077, + "learning_rate": 6.6971363237209525e-06, + "loss": 0.5079, + "step": 293900 + }, + { + "epoch": 2.5982602238370553, + "grad_norm": 7.436350345611572, + "learning_rate": 6.6956629360490825e-06, + "loss": 0.6143, + "step": 293910 + }, + { + "epoch": 2.5983486270973675, + "grad_norm": 5.110711097717285, + "learning_rate": 6.694189548377211e-06, + "loss": 0.5565, + "step": 293920 + }, + { + "epoch": 2.5984370303576796, + "grad_norm": 3.47007417678833, + "learning_rate": 6.692716160705341e-06, + "loss": 0.4626, + "step": 293930 + }, + { + "epoch": 2.5985254336179917, + "grad_norm": 15.934464454650879, + "learning_rate": 6.691242773033469e-06, + "loss": 0.6034, + "step": 293940 + }, + { + "epoch": 2.598613836878304, + "grad_norm": 4.21327543258667, + "learning_rate": 6.689769385361599e-06, + "loss": 0.5518, + "step": 293950 + }, + { + "epoch": 2.5987022401386164, + "grad_norm": 1.4430396556854248, + "learning_rate": 6.688295997689728e-06, + "loss": 0.4524, + "step": 293960 + }, + { + "epoch": 2.5987906433989285, + "grad_norm": 10.417230606079102, + "learning_rate": 6.686822610017858e-06, + "loss": 0.5492, + "step": 293970 + }, + { + "epoch": 2.5988790466592406, + "grad_norm": 0.7799577116966248, + "learning_rate": 6.685349222345987e-06, + "loss": 0.5255, + "step": 293980 + }, + { + "epoch": 2.598967449919553, + "grad_norm": 3.623959541320801, + "learning_rate": 6.683875834674116e-06, + "loss": 0.6195, + "step": 293990 + }, + { + "epoch": 2.5990558531798653, + "grad_norm": 1.9375436305999756, + "learning_rate": 6.682402447002246e-06, + "loss": 0.5615, + "step": 294000 + }, + { + "epoch": 2.5991442564401774, + "grad_norm": 2.9097063541412354, + "learning_rate": 6.680929059330375e-06, + "loss": 0.395, + "step": 294010 + }, + { + "epoch": 2.5992326597004896, + "grad_norm": 4.241057395935059, + "learning_rate": 6.6794556716585046e-06, + "loss": 0.5622, + "step": 294020 + }, + { + "epoch": 2.599321062960802, + "grad_norm": 3.7978179454803467, + "learning_rate": 6.677982283986634e-06, + "loss": 0.5256, + "step": 294030 + }, + { + "epoch": 2.5994094662211142, + "grad_norm": 1.8502787351608276, + "learning_rate": 6.676508896314764e-06, + "loss": 0.4731, + "step": 294040 + }, + { + "epoch": 2.5994978694814264, + "grad_norm": 7.767433166503906, + "learning_rate": 6.675035508642892e-06, + "loss": 0.489, + "step": 294050 + }, + { + "epoch": 2.599586272741739, + "grad_norm": 2.4131364822387695, + "learning_rate": 6.673562120971022e-06, + "loss": 0.5716, + "step": 294060 + }, + { + "epoch": 2.599674676002051, + "grad_norm": 4.022740364074707, + "learning_rate": 6.672088733299151e-06, + "loss": 0.5756, + "step": 294070 + }, + { + "epoch": 2.599763079262363, + "grad_norm": 3.4983978271484375, + "learning_rate": 6.670615345627281e-06, + "loss": 0.4598, + "step": 294080 + }, + { + "epoch": 2.5998514825226753, + "grad_norm": 3.036024332046509, + "learning_rate": 6.669141957955409e-06, + "loss": 0.4327, + "step": 294090 + }, + { + "epoch": 2.5999398857829874, + "grad_norm": 5.99903678894043, + "learning_rate": 6.667668570283539e-06, + "loss": 0.4774, + "step": 294100 + }, + { + "epoch": 2.6000282890433, + "grad_norm": 3.827770233154297, + "learning_rate": 6.666195182611669e-06, + "loss": 0.5405, + "step": 294110 + }, + { + "epoch": 2.600116692303612, + "grad_norm": 4.010746955871582, + "learning_rate": 6.6647217949397974e-06, + "loss": 0.591, + "step": 294120 + }, + { + "epoch": 2.6002050955639247, + "grad_norm": 4.592586517333984, + "learning_rate": 6.6632484072679275e-06, + "loss": 0.5647, + "step": 294130 + }, + { + "epoch": 2.6002934988242368, + "grad_norm": 4.812319278717041, + "learning_rate": 6.661775019596056e-06, + "loss": 0.3299, + "step": 294140 + }, + { + "epoch": 2.600381902084549, + "grad_norm": 4.428249359130859, + "learning_rate": 6.660301631924186e-06, + "loss": 0.4563, + "step": 294150 + }, + { + "epoch": 2.600470305344861, + "grad_norm": 2.4861788749694824, + "learning_rate": 6.658828244252314e-06, + "loss": 0.5905, + "step": 294160 + }, + { + "epoch": 2.600558708605173, + "grad_norm": 3.9979467391967773, + "learning_rate": 6.657354856580444e-06, + "loss": 0.5213, + "step": 294170 + }, + { + "epoch": 2.6006471118654857, + "grad_norm": 4.149348258972168, + "learning_rate": 6.655881468908573e-06, + "loss": 0.4888, + "step": 294180 + }, + { + "epoch": 2.600735515125798, + "grad_norm": 19.041601181030273, + "learning_rate": 6.654408081236703e-06, + "loss": 0.6328, + "step": 294190 + }, + { + "epoch": 2.60082391838611, + "grad_norm": 4.1885223388671875, + "learning_rate": 6.652934693564833e-06, + "loss": 0.5663, + "step": 294200 + }, + { + "epoch": 2.6009123216464225, + "grad_norm": 3.6196227073669434, + "learning_rate": 6.651461305892961e-06, + "loss": 0.5812, + "step": 294210 + }, + { + "epoch": 2.6010007249067346, + "grad_norm": 2.882425308227539, + "learning_rate": 6.649987918221091e-06, + "loss": 0.5345, + "step": 294220 + }, + { + "epoch": 2.6010891281670467, + "grad_norm": 5.204586982727051, + "learning_rate": 6.64851453054922e-06, + "loss": 0.6145, + "step": 294230 + }, + { + "epoch": 2.601177531427359, + "grad_norm": 0.5813703536987305, + "learning_rate": 6.6470411428773495e-06, + "loss": 0.4315, + "step": 294240 + }, + { + "epoch": 2.6012659346876714, + "grad_norm": 5.114157676696777, + "learning_rate": 6.645567755205479e-06, + "loss": 0.6201, + "step": 294250 + }, + { + "epoch": 2.6013543379479835, + "grad_norm": 4.988226413726807, + "learning_rate": 6.644094367533609e-06, + "loss": 0.5207, + "step": 294260 + }, + { + "epoch": 2.6014427412082957, + "grad_norm": 2.31034255027771, + "learning_rate": 6.642620979861737e-06, + "loss": 0.6349, + "step": 294270 + }, + { + "epoch": 2.6015311444686082, + "grad_norm": 8.841472625732422, + "learning_rate": 6.641147592189867e-06, + "loss": 0.5121, + "step": 294280 + }, + { + "epoch": 2.6016195477289203, + "grad_norm": 5.945249080657959, + "learning_rate": 6.639674204517997e-06, + "loss": 0.5307, + "step": 294290 + }, + { + "epoch": 2.6017079509892325, + "grad_norm": 3.0977702140808105, + "learning_rate": 6.638200816846126e-06, + "loss": 0.4943, + "step": 294300 + }, + { + "epoch": 2.6017963542495446, + "grad_norm": 8.614341735839844, + "learning_rate": 6.636727429174256e-06, + "loss": 0.489, + "step": 294310 + }, + { + "epoch": 2.6018847575098567, + "grad_norm": 8.183008193969727, + "learning_rate": 6.635254041502384e-06, + "loss": 0.5965, + "step": 294320 + }, + { + "epoch": 2.6019731607701693, + "grad_norm": 4.534794330596924, + "learning_rate": 6.633780653830514e-06, + "loss": 0.5275, + "step": 294330 + }, + { + "epoch": 2.6020615640304814, + "grad_norm": 5.798236846923828, + "learning_rate": 6.632307266158642e-06, + "loss": 0.5803, + "step": 294340 + }, + { + "epoch": 2.602149967290794, + "grad_norm": 7.930055618286133, + "learning_rate": 6.6308338784867724e-06, + "loss": 0.5408, + "step": 294350 + }, + { + "epoch": 2.602238370551106, + "grad_norm": 1.3219870328903198, + "learning_rate": 6.629360490814901e-06, + "loss": 0.3898, + "step": 294360 + }, + { + "epoch": 2.602326773811418, + "grad_norm": 2.019101142883301, + "learning_rate": 6.627887103143031e-06, + "loss": 0.5708, + "step": 294370 + }, + { + "epoch": 2.6024151770717303, + "grad_norm": 1.1345967054367065, + "learning_rate": 6.626413715471161e-06, + "loss": 0.4346, + "step": 294380 + }, + { + "epoch": 2.6025035803320424, + "grad_norm": 1.4842997789382935, + "learning_rate": 6.624940327799289e-06, + "loss": 0.4475, + "step": 294390 + }, + { + "epoch": 2.602591983592355, + "grad_norm": 5.906774520874023, + "learning_rate": 6.623466940127419e-06, + "loss": 0.4843, + "step": 294400 + }, + { + "epoch": 2.602680386852667, + "grad_norm": 25.593761444091797, + "learning_rate": 6.621993552455548e-06, + "loss": 0.6191, + "step": 294410 + }, + { + "epoch": 2.6027687901129792, + "grad_norm": 0.4262900650501251, + "learning_rate": 6.620520164783678e-06, + "loss": 0.4631, + "step": 294420 + }, + { + "epoch": 2.602857193373292, + "grad_norm": 4.19683837890625, + "learning_rate": 6.619046777111806e-06, + "loss": 0.5038, + "step": 294430 + }, + { + "epoch": 2.602945596633604, + "grad_norm": 2.8311450481414795, + "learning_rate": 6.617573389439936e-06, + "loss": 0.4861, + "step": 294440 + }, + { + "epoch": 2.603033999893916, + "grad_norm": 3.3605363368988037, + "learning_rate": 6.616100001768065e-06, + "loss": 0.5362, + "step": 294450 + }, + { + "epoch": 2.603122403154228, + "grad_norm": 2.149733781814575, + "learning_rate": 6.6146266140961945e-06, + "loss": 0.5163, + "step": 294460 + }, + { + "epoch": 2.6032108064145407, + "grad_norm": 1.564103603363037, + "learning_rate": 6.6131532264243245e-06, + "loss": 0.4522, + "step": 294470 + }, + { + "epoch": 2.603299209674853, + "grad_norm": 13.78667163848877, + "learning_rate": 6.611679838752454e-06, + "loss": 0.3908, + "step": 294480 + }, + { + "epoch": 2.603387612935165, + "grad_norm": 2.2815980911254883, + "learning_rate": 6.610206451080583e-06, + "loss": 0.5693, + "step": 294490 + }, + { + "epoch": 2.6034760161954775, + "grad_norm": 14.377361297607422, + "learning_rate": 6.608733063408712e-06, + "loss": 0.4357, + "step": 294500 + }, + { + "epoch": 2.6035644194557896, + "grad_norm": 17.185197830200195, + "learning_rate": 6.607259675736842e-06, + "loss": 0.4498, + "step": 294510 + }, + { + "epoch": 2.6036528227161018, + "grad_norm": 2.4470043182373047, + "learning_rate": 6.6057862880649706e-06, + "loss": 0.5786, + "step": 294520 + }, + { + "epoch": 2.603741225976414, + "grad_norm": 1.9529424905776978, + "learning_rate": 6.604312900393101e-06, + "loss": 0.6092, + "step": 294530 + }, + { + "epoch": 2.603829629236726, + "grad_norm": 2.8253867626190186, + "learning_rate": 6.602839512721229e-06, + "loss": 0.5673, + "step": 294540 + }, + { + "epoch": 2.6039180324970386, + "grad_norm": 2.709608554840088, + "learning_rate": 6.601366125049359e-06, + "loss": 0.5416, + "step": 294550 + }, + { + "epoch": 2.6040064357573507, + "grad_norm": 4.189993858337402, + "learning_rate": 6.599892737377489e-06, + "loss": 0.6709, + "step": 294560 + }, + { + "epoch": 2.604094839017663, + "grad_norm": 3.480025053024292, + "learning_rate": 6.598419349705617e-06, + "loss": 0.5455, + "step": 294570 + }, + { + "epoch": 2.6041832422779754, + "grad_norm": 3.716611623764038, + "learning_rate": 6.5969459620337475e-06, + "loss": 0.4465, + "step": 294580 + }, + { + "epoch": 2.6042716455382875, + "grad_norm": 5.842087745666504, + "learning_rate": 6.595472574361876e-06, + "loss": 0.3958, + "step": 294590 + }, + { + "epoch": 2.6043600487985996, + "grad_norm": 2.6398844718933105, + "learning_rate": 6.593999186690006e-06, + "loss": 0.4635, + "step": 294600 + }, + { + "epoch": 2.6044484520589117, + "grad_norm": 2.7728965282440186, + "learning_rate": 6.592525799018134e-06, + "loss": 0.4535, + "step": 294610 + }, + { + "epoch": 2.6045368553192243, + "grad_norm": 5.942488193511963, + "learning_rate": 6.591052411346264e-06, + "loss": 0.6228, + "step": 294620 + }, + { + "epoch": 2.6046252585795364, + "grad_norm": 4.281318664550781, + "learning_rate": 6.589579023674393e-06, + "loss": 0.4826, + "step": 294630 + }, + { + "epoch": 2.6047136618398485, + "grad_norm": 2.5259275436401367, + "learning_rate": 6.588105636002523e-06, + "loss": 0.4459, + "step": 294640 + }, + { + "epoch": 2.604802065100161, + "grad_norm": 8.609475135803223, + "learning_rate": 6.586632248330651e-06, + "loss": 0.6345, + "step": 294650 + }, + { + "epoch": 2.604890468360473, + "grad_norm": 7.913760662078857, + "learning_rate": 6.585158860658781e-06, + "loss": 0.5316, + "step": 294660 + }, + { + "epoch": 2.6049788716207853, + "grad_norm": 4.236713886260986, + "learning_rate": 6.583685472986911e-06, + "loss": 0.5704, + "step": 294670 + }, + { + "epoch": 2.6050672748810975, + "grad_norm": 4.06890344619751, + "learning_rate": 6.5822120853150395e-06, + "loss": 0.4717, + "step": 294680 + }, + { + "epoch": 2.6051556781414096, + "grad_norm": 1.7448612451553345, + "learning_rate": 6.5807386976431695e-06, + "loss": 0.5658, + "step": 294690 + }, + { + "epoch": 2.605244081401722, + "grad_norm": 4.548469543457031, + "learning_rate": 6.579265309971299e-06, + "loss": 0.611, + "step": 294700 + }, + { + "epoch": 2.6053324846620343, + "grad_norm": 3.5198123455047607, + "learning_rate": 6.577791922299428e-06, + "loss": 0.5353, + "step": 294710 + }, + { + "epoch": 2.605420887922347, + "grad_norm": 4.240504741668701, + "learning_rate": 6.576318534627557e-06, + "loss": 0.7312, + "step": 294720 + }, + { + "epoch": 2.605509291182659, + "grad_norm": 4.603386402130127, + "learning_rate": 6.574845146955687e-06, + "loss": 0.762, + "step": 294730 + }, + { + "epoch": 2.605597694442971, + "grad_norm": 5.610389709472656, + "learning_rate": 6.5733717592838155e-06, + "loss": 0.4753, + "step": 294740 + }, + { + "epoch": 2.605686097703283, + "grad_norm": 5.485790252685547, + "learning_rate": 6.5718983716119456e-06, + "loss": 0.4144, + "step": 294750 + }, + { + "epoch": 2.6057745009635953, + "grad_norm": 1.7303723096847534, + "learning_rate": 6.570424983940076e-06, + "loss": 0.485, + "step": 294760 + }, + { + "epoch": 2.605862904223908, + "grad_norm": 10.420187950134277, + "learning_rate": 6.568951596268204e-06, + "loss": 0.5543, + "step": 294770 + }, + { + "epoch": 2.60595130748422, + "grad_norm": 2.686077356338501, + "learning_rate": 6.567478208596334e-06, + "loss": 0.4763, + "step": 294780 + }, + { + "epoch": 2.606039710744532, + "grad_norm": 1.9455047845840454, + "learning_rate": 6.566004820924462e-06, + "loss": 0.5671, + "step": 294790 + }, + { + "epoch": 2.6061281140048447, + "grad_norm": 4.6845598220825195, + "learning_rate": 6.564531433252592e-06, + "loss": 0.4713, + "step": 294800 + }, + { + "epoch": 2.606216517265157, + "grad_norm": 4.09244966506958, + "learning_rate": 6.563058045580721e-06, + "loss": 0.6113, + "step": 294810 + }, + { + "epoch": 2.606304920525469, + "grad_norm": 1.8062664270401, + "learning_rate": 6.561584657908851e-06, + "loss": 0.4796, + "step": 294820 + }, + { + "epoch": 2.606393323785781, + "grad_norm": 3.2552642822265625, + "learning_rate": 6.560111270236979e-06, + "loss": 0.5772, + "step": 294830 + }, + { + "epoch": 2.6064817270460936, + "grad_norm": 1.9139745235443115, + "learning_rate": 6.558637882565109e-06, + "loss": 0.5469, + "step": 294840 + }, + { + "epoch": 2.6065701303064057, + "grad_norm": 1.2338123321533203, + "learning_rate": 6.557164494893239e-06, + "loss": 0.459, + "step": 294850 + }, + { + "epoch": 2.606658533566718, + "grad_norm": 2.266824960708618, + "learning_rate": 6.555691107221368e-06, + "loss": 0.4653, + "step": 294860 + }, + { + "epoch": 2.6067469368270304, + "grad_norm": 13.217970848083496, + "learning_rate": 6.554217719549498e-06, + "loss": 0.4681, + "step": 294870 + }, + { + "epoch": 2.6068353400873425, + "grad_norm": 2.7653298377990723, + "learning_rate": 6.552744331877626e-06, + "loss": 0.588, + "step": 294880 + }, + { + "epoch": 2.6069237433476546, + "grad_norm": 8.68925952911377, + "learning_rate": 6.551270944205756e-06, + "loss": 0.6198, + "step": 294890 + }, + { + "epoch": 2.6070121466079668, + "grad_norm": 0.6620548367500305, + "learning_rate": 6.5497975565338844e-06, + "loss": 0.5084, + "step": 294900 + }, + { + "epoch": 2.607100549868279, + "grad_norm": 3.233049154281616, + "learning_rate": 6.5483241688620145e-06, + "loss": 0.5362, + "step": 294910 + }, + { + "epoch": 2.6071889531285914, + "grad_norm": 1.5578560829162598, + "learning_rate": 6.546850781190144e-06, + "loss": 0.5091, + "step": 294920 + }, + { + "epoch": 2.6072773563889036, + "grad_norm": 9.52846908569336, + "learning_rate": 6.545377393518273e-06, + "loss": 0.5732, + "step": 294930 + }, + { + "epoch": 2.607365759649216, + "grad_norm": 6.020577907562256, + "learning_rate": 6.543904005846403e-06, + "loss": 0.4327, + "step": 294940 + }, + { + "epoch": 2.6074541629095283, + "grad_norm": 4.114213943481445, + "learning_rate": 6.542430618174532e-06, + "loss": 0.5594, + "step": 294950 + }, + { + "epoch": 2.6075425661698404, + "grad_norm": 2.4443471431732178, + "learning_rate": 6.540957230502661e-06, + "loss": 0.6929, + "step": 294960 + }, + { + "epoch": 2.6076309694301525, + "grad_norm": 10.181063652038574, + "learning_rate": 6.5394838428307905e-06, + "loss": 0.5781, + "step": 294970 + }, + { + "epoch": 2.6077193726904646, + "grad_norm": 2.558556318283081, + "learning_rate": 6.5380104551589206e-06, + "loss": 0.6699, + "step": 294980 + }, + { + "epoch": 2.607807775950777, + "grad_norm": 5.0048089027404785, + "learning_rate": 6.536537067487049e-06, + "loss": 0.5695, + "step": 294990 + }, + { + "epoch": 2.6078961792110893, + "grad_norm": 2.582946300506592, + "learning_rate": 6.535063679815179e-06, + "loss": 0.5266, + "step": 295000 + }, + { + "epoch": 2.6079845824714014, + "grad_norm": 2.7035293579101562, + "learning_rate": 6.533590292143307e-06, + "loss": 0.5285, + "step": 295010 + }, + { + "epoch": 2.608072985731714, + "grad_norm": 3.119770050048828, + "learning_rate": 6.532116904471437e-06, + "loss": 0.4614, + "step": 295020 + }, + { + "epoch": 2.608161388992026, + "grad_norm": 7.1202497482299805, + "learning_rate": 6.5306435167995674e-06, + "loss": 0.4712, + "step": 295030 + }, + { + "epoch": 2.608249792252338, + "grad_norm": 3.456211805343628, + "learning_rate": 6.529170129127696e-06, + "loss": 0.5824, + "step": 295040 + }, + { + "epoch": 2.6083381955126503, + "grad_norm": 3.8283636569976807, + "learning_rate": 6.527696741455826e-06, + "loss": 0.5749, + "step": 295050 + }, + { + "epoch": 2.608426598772963, + "grad_norm": 4.8175177574157715, + "learning_rate": 6.526223353783954e-06, + "loss": 0.5627, + "step": 295060 + }, + { + "epoch": 2.608515002033275, + "grad_norm": 2.8647117614746094, + "learning_rate": 6.524749966112084e-06, + "loss": 0.4498, + "step": 295070 + }, + { + "epoch": 2.608603405293587, + "grad_norm": 4.8081183433532715, + "learning_rate": 6.523276578440213e-06, + "loss": 0.5963, + "step": 295080 + }, + { + "epoch": 2.6086918085538997, + "grad_norm": 3.2403852939605713, + "learning_rate": 6.521803190768343e-06, + "loss": 0.4746, + "step": 295090 + }, + { + "epoch": 2.608780211814212, + "grad_norm": 3.2688169479370117, + "learning_rate": 6.520329803096471e-06, + "loss": 0.4825, + "step": 295100 + }, + { + "epoch": 2.608868615074524, + "grad_norm": 3.053961992263794, + "learning_rate": 6.518856415424601e-06, + "loss": 0.568, + "step": 295110 + }, + { + "epoch": 2.608957018334836, + "grad_norm": 3.0238821506500244, + "learning_rate": 6.517383027752731e-06, + "loss": 0.535, + "step": 295120 + }, + { + "epoch": 2.609045421595148, + "grad_norm": 4.611100673675537, + "learning_rate": 6.5159096400808594e-06, + "loss": 0.4116, + "step": 295130 + }, + { + "epoch": 2.6091338248554607, + "grad_norm": 1.1340751647949219, + "learning_rate": 6.5144362524089895e-06, + "loss": 0.6321, + "step": 295140 + }, + { + "epoch": 2.609222228115773, + "grad_norm": 7.315913677215576, + "learning_rate": 6.512962864737118e-06, + "loss": 0.3749, + "step": 295150 + }, + { + "epoch": 2.609310631376085, + "grad_norm": 2.373044013977051, + "learning_rate": 6.511489477065248e-06, + "loss": 0.4782, + "step": 295160 + }, + { + "epoch": 2.6093990346363976, + "grad_norm": 5.066211223602295, + "learning_rate": 6.510016089393377e-06, + "loss": 0.5507, + "step": 295170 + }, + { + "epoch": 2.6094874378967097, + "grad_norm": 4.372122287750244, + "learning_rate": 6.508542701721506e-06, + "loss": 0.5077, + "step": 295180 + }, + { + "epoch": 2.609575841157022, + "grad_norm": 34.79286575317383, + "learning_rate": 6.5070693140496355e-06, + "loss": 0.5765, + "step": 295190 + }, + { + "epoch": 2.609664244417334, + "grad_norm": 2.283400774002075, + "learning_rate": 6.5055959263777655e-06, + "loss": 0.506, + "step": 295200 + }, + { + "epoch": 2.6097526476776465, + "grad_norm": 1.5855636596679688, + "learning_rate": 6.504122538705894e-06, + "loss": 0.465, + "step": 295210 + }, + { + "epoch": 2.6098410509379586, + "grad_norm": 4.901074409484863, + "learning_rate": 6.502649151034024e-06, + "loss": 0.5712, + "step": 295220 + }, + { + "epoch": 2.6099294541982707, + "grad_norm": 3.131885528564453, + "learning_rate": 6.501175763362154e-06, + "loss": 0.4717, + "step": 295230 + }, + { + "epoch": 2.6100178574585833, + "grad_norm": 0.9222553372383118, + "learning_rate": 6.499702375690282e-06, + "loss": 0.5442, + "step": 295240 + }, + { + "epoch": 2.6101062607188954, + "grad_norm": 3.8717024326324463, + "learning_rate": 6.498228988018412e-06, + "loss": 0.5787, + "step": 295250 + }, + { + "epoch": 2.6101946639792075, + "grad_norm": 5.620165824890137, + "learning_rate": 6.496755600346541e-06, + "loss": 0.3918, + "step": 295260 + }, + { + "epoch": 2.6102830672395196, + "grad_norm": 3.5896263122558594, + "learning_rate": 6.495282212674671e-06, + "loss": 0.5327, + "step": 295270 + }, + { + "epoch": 2.6103714704998318, + "grad_norm": 7.251724720001221, + "learning_rate": 6.493808825002799e-06, + "loss": 0.4974, + "step": 295280 + }, + { + "epoch": 2.6104598737601443, + "grad_norm": 12.559406280517578, + "learning_rate": 6.492335437330929e-06, + "loss": 0.4657, + "step": 295290 + }, + { + "epoch": 2.6105482770204564, + "grad_norm": 3.254058837890625, + "learning_rate": 6.4908620496590575e-06, + "loss": 0.5075, + "step": 295300 + }, + { + "epoch": 2.610636680280769, + "grad_norm": 1.4775601625442505, + "learning_rate": 6.489388661987188e-06, + "loss": 0.5021, + "step": 295310 + }, + { + "epoch": 2.610725083541081, + "grad_norm": 3.553403377532959, + "learning_rate": 6.487915274315318e-06, + "loss": 0.6395, + "step": 295320 + }, + { + "epoch": 2.6108134868013932, + "grad_norm": 1.5500555038452148, + "learning_rate": 6.486441886643446e-06, + "loss": 0.5514, + "step": 295330 + }, + { + "epoch": 2.6109018900617054, + "grad_norm": 1.6556321382522583, + "learning_rate": 6.484968498971576e-06, + "loss": 0.533, + "step": 295340 + }, + { + "epoch": 2.6109902933220175, + "grad_norm": 3.6689071655273438, + "learning_rate": 6.483495111299704e-06, + "loss": 0.566, + "step": 295350 + }, + { + "epoch": 2.61107869658233, + "grad_norm": 2.0752320289611816, + "learning_rate": 6.4820217236278344e-06, + "loss": 0.466, + "step": 295360 + }, + { + "epoch": 2.611167099842642, + "grad_norm": 5.016627311706543, + "learning_rate": 6.480548335955963e-06, + "loss": 0.6317, + "step": 295370 + }, + { + "epoch": 2.6112555031029543, + "grad_norm": 2.0759778022766113, + "learning_rate": 6.479074948284093e-06, + "loss": 0.4166, + "step": 295380 + }, + { + "epoch": 2.611343906363267, + "grad_norm": 7.147825717926025, + "learning_rate": 6.477601560612222e-06, + "loss": 0.5166, + "step": 295390 + }, + { + "epoch": 2.611432309623579, + "grad_norm": 4.670569896697998, + "learning_rate": 6.476128172940352e-06, + "loss": 0.6275, + "step": 295400 + }, + { + "epoch": 2.611520712883891, + "grad_norm": 5.886756896972656, + "learning_rate": 6.474654785268481e-06, + "loss": 0.4947, + "step": 295410 + }, + { + "epoch": 2.611609116144203, + "grad_norm": 4.263431549072266, + "learning_rate": 6.4731813975966105e-06, + "loss": 0.4693, + "step": 295420 + }, + { + "epoch": 2.6116975194045158, + "grad_norm": 5.373870372772217, + "learning_rate": 6.4717080099247405e-06, + "loss": 0.7466, + "step": 295430 + }, + { + "epoch": 2.611785922664828, + "grad_norm": 5.727255344390869, + "learning_rate": 6.470234622252869e-06, + "loss": 0.4962, + "step": 295440 + }, + { + "epoch": 2.61187432592514, + "grad_norm": 2.2324180603027344, + "learning_rate": 6.468761234580999e-06, + "loss": 0.5855, + "step": 295450 + }, + { + "epoch": 2.6119627291854526, + "grad_norm": 1.9341845512390137, + "learning_rate": 6.467287846909127e-06, + "loss": 0.4977, + "step": 295460 + }, + { + "epoch": 2.6120511324457647, + "grad_norm": 1.8359780311584473, + "learning_rate": 6.465814459237257e-06, + "loss": 0.4061, + "step": 295470 + }, + { + "epoch": 2.612139535706077, + "grad_norm": 3.6558942794799805, + "learning_rate": 6.464341071565386e-06, + "loss": 0.5063, + "step": 295480 + }, + { + "epoch": 2.612227938966389, + "grad_norm": 1.5317894220352173, + "learning_rate": 6.462867683893516e-06, + "loss": 0.4088, + "step": 295490 + }, + { + "epoch": 2.612316342226701, + "grad_norm": 1.278588891029358, + "learning_rate": 6.461394296221646e-06, + "loss": 0.4809, + "step": 295500 + }, + { + "epoch": 2.6124047454870136, + "grad_norm": 2.6961216926574707, + "learning_rate": 6.459920908549774e-06, + "loss": 0.5297, + "step": 295510 + }, + { + "epoch": 2.6124931487473257, + "grad_norm": 1.2983503341674805, + "learning_rate": 6.458447520877904e-06, + "loss": 0.4953, + "step": 295520 + }, + { + "epoch": 2.6125815520076383, + "grad_norm": 4.777713775634766, + "learning_rate": 6.4569741332060326e-06, + "loss": 0.5633, + "step": 295530 + }, + { + "epoch": 2.6126699552679504, + "grad_norm": 1.5024484395980835, + "learning_rate": 6.455500745534163e-06, + "loss": 0.6282, + "step": 295540 + }, + { + "epoch": 2.6127583585282625, + "grad_norm": 8.324627876281738, + "learning_rate": 6.454027357862291e-06, + "loss": 0.4405, + "step": 295550 + }, + { + "epoch": 2.6128467617885747, + "grad_norm": 4.747460842132568, + "learning_rate": 6.452553970190421e-06, + "loss": 0.4721, + "step": 295560 + }, + { + "epoch": 2.612935165048887, + "grad_norm": 1.4829277992248535, + "learning_rate": 6.451080582518549e-06, + "loss": 0.4288, + "step": 295570 + }, + { + "epoch": 2.6130235683091994, + "grad_norm": 2.3253026008605957, + "learning_rate": 6.449607194846679e-06, + "loss": 0.6025, + "step": 295580 + }, + { + "epoch": 2.6131119715695115, + "grad_norm": 5.304039001464844, + "learning_rate": 6.4481338071748095e-06, + "loss": 0.5986, + "step": 295590 + }, + { + "epoch": 2.6132003748298236, + "grad_norm": 3.4374938011169434, + "learning_rate": 6.446660419502938e-06, + "loss": 0.4915, + "step": 295600 + }, + { + "epoch": 2.613288778090136, + "grad_norm": 11.307974815368652, + "learning_rate": 6.445187031831068e-06, + "loss": 0.5455, + "step": 295610 + }, + { + "epoch": 2.6133771813504483, + "grad_norm": 4.568840026855469, + "learning_rate": 6.443713644159197e-06, + "loss": 0.5881, + "step": 295620 + }, + { + "epoch": 2.6134655846107604, + "grad_norm": 10.218194007873535, + "learning_rate": 6.442240256487326e-06, + "loss": 0.5428, + "step": 295630 + }, + { + "epoch": 2.6135539878710725, + "grad_norm": 2.4347074031829834, + "learning_rate": 6.4407668688154555e-06, + "loss": 0.5482, + "step": 295640 + }, + { + "epoch": 2.613642391131385, + "grad_norm": 2.92037034034729, + "learning_rate": 6.4392934811435855e-06, + "loss": 0.5427, + "step": 295650 + }, + { + "epoch": 2.613730794391697, + "grad_norm": 6.603312969207764, + "learning_rate": 6.437820093471714e-06, + "loss": 0.4085, + "step": 295660 + }, + { + "epoch": 2.6138191976520093, + "grad_norm": 3.341090679168701, + "learning_rate": 6.436346705799844e-06, + "loss": 0.5631, + "step": 295670 + }, + { + "epoch": 2.613907600912322, + "grad_norm": 9.069631576538086, + "learning_rate": 6.434873318127974e-06, + "loss": 0.5305, + "step": 295680 + }, + { + "epoch": 2.613996004172634, + "grad_norm": 26.81316566467285, + "learning_rate": 6.433399930456102e-06, + "loss": 0.5583, + "step": 295690 + }, + { + "epoch": 2.614084407432946, + "grad_norm": 12.440065383911133, + "learning_rate": 6.431926542784232e-06, + "loss": 0.6216, + "step": 295700 + }, + { + "epoch": 2.6141728106932582, + "grad_norm": 6.866952896118164, + "learning_rate": 6.430453155112361e-06, + "loss": 0.5417, + "step": 295710 + }, + { + "epoch": 2.6142612139535704, + "grad_norm": 2.784376382827759, + "learning_rate": 6.428979767440491e-06, + "loss": 0.553, + "step": 295720 + }, + { + "epoch": 2.614349617213883, + "grad_norm": 8.563126564025879, + "learning_rate": 6.427506379768619e-06, + "loss": 0.5208, + "step": 295730 + }, + { + "epoch": 2.614438020474195, + "grad_norm": 1.9254460334777832, + "learning_rate": 6.426032992096749e-06, + "loss": 0.5612, + "step": 295740 + }, + { + "epoch": 2.614526423734507, + "grad_norm": 8.535990715026855, + "learning_rate": 6.4245596044248775e-06, + "loss": 0.5065, + "step": 295750 + }, + { + "epoch": 2.6146148269948197, + "grad_norm": 3.4993112087249756, + "learning_rate": 6.4230862167530076e-06, + "loss": 0.4529, + "step": 295760 + }, + { + "epoch": 2.614703230255132, + "grad_norm": 3.451051712036133, + "learning_rate": 6.421612829081136e-06, + "loss": 0.5086, + "step": 295770 + }, + { + "epoch": 2.614791633515444, + "grad_norm": 1.5521279573440552, + "learning_rate": 6.420139441409266e-06, + "loss": 0.4988, + "step": 295780 + }, + { + "epoch": 2.614880036775756, + "grad_norm": 1.9985406398773193, + "learning_rate": 6.418666053737396e-06, + "loss": 0.5538, + "step": 295790 + }, + { + "epoch": 2.6149684400360687, + "grad_norm": 3.6476516723632812, + "learning_rate": 6.417192666065524e-06, + "loss": 0.494, + "step": 295800 + }, + { + "epoch": 2.6150568432963808, + "grad_norm": 5.944014072418213, + "learning_rate": 6.415719278393654e-06, + "loss": 0.6357, + "step": 295810 + }, + { + "epoch": 2.615145246556693, + "grad_norm": 6.136425971984863, + "learning_rate": 6.414245890721783e-06, + "loss": 0.4235, + "step": 295820 + }, + { + "epoch": 2.6152336498170055, + "grad_norm": 2.5026488304138184, + "learning_rate": 6.412772503049913e-06, + "loss": 0.4285, + "step": 295830 + }, + { + "epoch": 2.6153220530773176, + "grad_norm": 4.820497989654541, + "learning_rate": 6.411299115378042e-06, + "loss": 0.4201, + "step": 295840 + }, + { + "epoch": 2.6154104563376297, + "grad_norm": 4.459312438964844, + "learning_rate": 6.409825727706171e-06, + "loss": 0.4821, + "step": 295850 + }, + { + "epoch": 2.615498859597942, + "grad_norm": 5.378005027770996, + "learning_rate": 6.4083523400343004e-06, + "loss": 0.537, + "step": 295860 + }, + { + "epoch": 2.615587262858254, + "grad_norm": 1.8292925357818604, + "learning_rate": 6.4068789523624305e-06, + "loss": 0.398, + "step": 295870 + }, + { + "epoch": 2.6156756661185665, + "grad_norm": 1.692501425743103, + "learning_rate": 6.40540556469056e-06, + "loss": 0.503, + "step": 295880 + }, + { + "epoch": 2.6157640693788786, + "grad_norm": 10.318418502807617, + "learning_rate": 6.403932177018689e-06, + "loss": 0.4371, + "step": 295890 + }, + { + "epoch": 2.615852472639191, + "grad_norm": 1.959692120552063, + "learning_rate": 6.402458789346819e-06, + "loss": 0.455, + "step": 295900 + }, + { + "epoch": 2.6159408758995033, + "grad_norm": 1.047804832458496, + "learning_rate": 6.400985401674947e-06, + "loss": 0.5864, + "step": 295910 + }, + { + "epoch": 2.6160292791598154, + "grad_norm": 12.511993408203125, + "learning_rate": 6.399512014003077e-06, + "loss": 0.5174, + "step": 295920 + }, + { + "epoch": 2.6161176824201275, + "grad_norm": 2.961270570755005, + "learning_rate": 6.398038626331206e-06, + "loss": 0.5568, + "step": 295930 + }, + { + "epoch": 2.6162060856804397, + "grad_norm": 3.1383965015411377, + "learning_rate": 6.396565238659336e-06, + "loss": 0.4004, + "step": 295940 + }, + { + "epoch": 2.6162944889407522, + "grad_norm": 1.4550758600234985, + "learning_rate": 6.395091850987464e-06, + "loss": 0.5274, + "step": 295950 + }, + { + "epoch": 2.6163828922010643, + "grad_norm": 5.6759419441223145, + "learning_rate": 6.393618463315594e-06, + "loss": 0.5784, + "step": 295960 + }, + { + "epoch": 2.6164712954613765, + "grad_norm": 8.059830665588379, + "learning_rate": 6.392145075643724e-06, + "loss": 0.5348, + "step": 295970 + }, + { + "epoch": 2.616559698721689, + "grad_norm": 1.4539852142333984, + "learning_rate": 6.3906716879718525e-06, + "loss": 0.4853, + "step": 295980 + }, + { + "epoch": 2.616648101982001, + "grad_norm": 2.2790212631225586, + "learning_rate": 6.389198300299983e-06, + "loss": 0.4969, + "step": 295990 + }, + { + "epoch": 2.6167365052423133, + "grad_norm": 2.3548951148986816, + "learning_rate": 6.387724912628111e-06, + "loss": 0.5576, + "step": 296000 + }, + { + "epoch": 2.6168249085026254, + "grad_norm": 6.299037933349609, + "learning_rate": 6.386251524956241e-06, + "loss": 0.6166, + "step": 296010 + }, + { + "epoch": 2.616913311762938, + "grad_norm": 5.5241169929504395, + "learning_rate": 6.384778137284369e-06, + "loss": 0.5591, + "step": 296020 + }, + { + "epoch": 2.61700171502325, + "grad_norm": 16.244754791259766, + "learning_rate": 6.383304749612499e-06, + "loss": 0.5942, + "step": 296030 + }, + { + "epoch": 2.617090118283562, + "grad_norm": 1.6393706798553467, + "learning_rate": 6.381831361940628e-06, + "loss": 0.4775, + "step": 296040 + }, + { + "epoch": 2.6171785215438748, + "grad_norm": 1.9955651760101318, + "learning_rate": 6.380357974268758e-06, + "loss": 0.5541, + "step": 296050 + }, + { + "epoch": 2.617266924804187, + "grad_norm": 29.92074966430664, + "learning_rate": 6.378884586596888e-06, + "loss": 0.5205, + "step": 296060 + }, + { + "epoch": 2.617355328064499, + "grad_norm": 4.758095741271973, + "learning_rate": 6.377411198925016e-06, + "loss": 0.6286, + "step": 296070 + }, + { + "epoch": 2.617443731324811, + "grad_norm": 3.717350482940674, + "learning_rate": 6.375937811253146e-06, + "loss": 0.4387, + "step": 296080 + }, + { + "epoch": 2.6175321345851232, + "grad_norm": 23.48966407775879, + "learning_rate": 6.3744644235812754e-06, + "loss": 0.4854, + "step": 296090 + }, + { + "epoch": 2.617620537845436, + "grad_norm": 6.0007734298706055, + "learning_rate": 6.372991035909405e-06, + "loss": 0.5436, + "step": 296100 + }, + { + "epoch": 2.617708941105748, + "grad_norm": 13.467168807983398, + "learning_rate": 6.371517648237534e-06, + "loss": 0.6672, + "step": 296110 + }, + { + "epoch": 2.6177973443660605, + "grad_norm": 3.542092800140381, + "learning_rate": 6.370044260565664e-06, + "loss": 0.4961, + "step": 296120 + }, + { + "epoch": 2.6178857476263726, + "grad_norm": 6.920450687408447, + "learning_rate": 6.368570872893792e-06, + "loss": 0.5428, + "step": 296130 + }, + { + "epoch": 2.6179741508866847, + "grad_norm": 3.242772102355957, + "learning_rate": 6.367097485221922e-06, + "loss": 0.5405, + "step": 296140 + }, + { + "epoch": 2.618062554146997, + "grad_norm": 3.2154452800750732, + "learning_rate": 6.365624097550052e-06, + "loss": 0.5318, + "step": 296150 + }, + { + "epoch": 2.618150957407309, + "grad_norm": 1.7301647663116455, + "learning_rate": 6.364150709878181e-06, + "loss": 0.4777, + "step": 296160 + }, + { + "epoch": 2.6182393606676215, + "grad_norm": 3.400926351547241, + "learning_rate": 6.362677322206311e-06, + "loss": 0.4759, + "step": 296170 + }, + { + "epoch": 2.6183277639279336, + "grad_norm": 4.166918754577637, + "learning_rate": 6.361203934534439e-06, + "loss": 0.4804, + "step": 296180 + }, + { + "epoch": 2.6184161671882458, + "grad_norm": 5.007298469543457, + "learning_rate": 6.359730546862569e-06, + "loss": 0.5594, + "step": 296190 + }, + { + "epoch": 2.6185045704485583, + "grad_norm": 5.849729537963867, + "learning_rate": 6.3582571591906975e-06, + "loss": 0.4734, + "step": 296200 + }, + { + "epoch": 2.6185929737088705, + "grad_norm": 2.276496171951294, + "learning_rate": 6.3567837715188275e-06, + "loss": 0.4147, + "step": 296210 + }, + { + "epoch": 2.6186813769691826, + "grad_norm": 12.297639846801758, + "learning_rate": 6.355310383846956e-06, + "loss": 0.556, + "step": 296220 + }, + { + "epoch": 2.6187697802294947, + "grad_norm": 3.2539777755737305, + "learning_rate": 6.353836996175086e-06, + "loss": 0.5545, + "step": 296230 + }, + { + "epoch": 2.6188581834898073, + "grad_norm": 1.0577375888824463, + "learning_rate": 6.352363608503214e-06, + "loss": 0.4767, + "step": 296240 + }, + { + "epoch": 2.6189465867501194, + "grad_norm": 15.501708984375, + "learning_rate": 6.350890220831344e-06, + "loss": 0.4997, + "step": 296250 + }, + { + "epoch": 2.6190349900104315, + "grad_norm": 1.3124366998672485, + "learning_rate": 6.349416833159474e-06, + "loss": 0.4657, + "step": 296260 + }, + { + "epoch": 2.619123393270744, + "grad_norm": 6.996619701385498, + "learning_rate": 6.347943445487603e-06, + "loss": 0.4854, + "step": 296270 + }, + { + "epoch": 2.619211796531056, + "grad_norm": 5.269251346588135, + "learning_rate": 6.346470057815733e-06, + "loss": 0.5907, + "step": 296280 + }, + { + "epoch": 2.6193001997913683, + "grad_norm": 1.6882117986679077, + "learning_rate": 6.344996670143861e-06, + "loss": 0.4952, + "step": 296290 + }, + { + "epoch": 2.6193886030516804, + "grad_norm": 4.241954803466797, + "learning_rate": 6.343523282471991e-06, + "loss": 0.3672, + "step": 296300 + }, + { + "epoch": 2.6194770063119925, + "grad_norm": 1.915203332901001, + "learning_rate": 6.34204989480012e-06, + "loss": 0.4992, + "step": 296310 + }, + { + "epoch": 2.619565409572305, + "grad_norm": 5.362600803375244, + "learning_rate": 6.34057650712825e-06, + "loss": 0.4946, + "step": 296320 + }, + { + "epoch": 2.6196538128326172, + "grad_norm": 3.6194677352905273, + "learning_rate": 6.339103119456379e-06, + "loss": 0.3771, + "step": 296330 + }, + { + "epoch": 2.6197422160929293, + "grad_norm": 4.911238193511963, + "learning_rate": 6.337629731784509e-06, + "loss": 0.5431, + "step": 296340 + }, + { + "epoch": 2.619830619353242, + "grad_norm": 8.62452507019043, + "learning_rate": 6.336156344112638e-06, + "loss": 0.4884, + "step": 296350 + }, + { + "epoch": 2.619919022613554, + "grad_norm": 3.0525918006896973, + "learning_rate": 6.334682956440767e-06, + "loss": 0.4002, + "step": 296360 + }, + { + "epoch": 2.620007425873866, + "grad_norm": 2.655877113342285, + "learning_rate": 6.333209568768897e-06, + "loss": 0.5548, + "step": 296370 + }, + { + "epoch": 2.6200958291341783, + "grad_norm": 1.306393027305603, + "learning_rate": 6.331736181097026e-06, + "loss": 0.6212, + "step": 296380 + }, + { + "epoch": 2.620184232394491, + "grad_norm": 3.6695759296417236, + "learning_rate": 6.330262793425156e-06, + "loss": 0.463, + "step": 296390 + }, + { + "epoch": 2.620272635654803, + "grad_norm": 2.5554301738739014, + "learning_rate": 6.328789405753284e-06, + "loss": 0.5222, + "step": 296400 + }, + { + "epoch": 2.620361038915115, + "grad_norm": 8.394983291625977, + "learning_rate": 6.327316018081414e-06, + "loss": 0.5494, + "step": 296410 + }, + { + "epoch": 2.6204494421754276, + "grad_norm": 2.1416468620300293, + "learning_rate": 6.3258426304095425e-06, + "loss": 0.6161, + "step": 296420 + }, + { + "epoch": 2.6205378454357398, + "grad_norm": 4.673511028289795, + "learning_rate": 6.3243692427376725e-06, + "loss": 0.4177, + "step": 296430 + }, + { + "epoch": 2.620626248696052, + "grad_norm": 0.8206360340118408, + "learning_rate": 6.3228958550658026e-06, + "loss": 0.4225, + "step": 296440 + }, + { + "epoch": 2.620714651956364, + "grad_norm": 2.6879420280456543, + "learning_rate": 6.321422467393931e-06, + "loss": 0.4746, + "step": 296450 + }, + { + "epoch": 2.620803055216676, + "grad_norm": 9.219498634338379, + "learning_rate": 6.319949079722061e-06, + "loss": 0.515, + "step": 296460 + }, + { + "epoch": 2.6208914584769887, + "grad_norm": 3.6780478954315186, + "learning_rate": 6.318475692050189e-06, + "loss": 0.486, + "step": 296470 + }, + { + "epoch": 2.620979861737301, + "grad_norm": 3.7745745182037354, + "learning_rate": 6.317002304378319e-06, + "loss": 0.5542, + "step": 296480 + }, + { + "epoch": 2.6210682649976134, + "grad_norm": 7.742487907409668, + "learning_rate": 6.315528916706448e-06, + "loss": 0.6318, + "step": 296490 + }, + { + "epoch": 2.6211566682579255, + "grad_norm": 1.1137757301330566, + "learning_rate": 6.314055529034578e-06, + "loss": 0.4833, + "step": 296500 + }, + { + "epoch": 2.6212450715182376, + "grad_norm": 2.218658924102783, + "learning_rate": 6.312582141362706e-06, + "loss": 0.4847, + "step": 296510 + }, + { + "epoch": 2.6213334747785497, + "grad_norm": 5.869383811950684, + "learning_rate": 6.311108753690836e-06, + "loss": 0.4849, + "step": 296520 + }, + { + "epoch": 2.621421878038862, + "grad_norm": 9.216531753540039, + "learning_rate": 6.309635366018966e-06, + "loss": 0.5373, + "step": 296530 + }, + { + "epoch": 2.6215102812991744, + "grad_norm": 11.333541870117188, + "learning_rate": 6.3081619783470946e-06, + "loss": 0.5365, + "step": 296540 + }, + { + "epoch": 2.6215986845594865, + "grad_norm": 12.88354778289795, + "learning_rate": 6.306688590675225e-06, + "loss": 0.5964, + "step": 296550 + }, + { + "epoch": 2.6216870878197986, + "grad_norm": 4.658304214477539, + "learning_rate": 6.305215203003354e-06, + "loss": 0.6351, + "step": 296560 + }, + { + "epoch": 2.621775491080111, + "grad_norm": 3.0914313793182373, + "learning_rate": 6.303741815331483e-06, + "loss": 0.4622, + "step": 296570 + }, + { + "epoch": 2.6218638943404233, + "grad_norm": 2.002366542816162, + "learning_rate": 6.302268427659612e-06, + "loss": 0.4658, + "step": 296580 + }, + { + "epoch": 2.6219522976007354, + "grad_norm": 4.644899368286133, + "learning_rate": 6.300795039987742e-06, + "loss": 0.4246, + "step": 296590 + }, + { + "epoch": 2.6220407008610476, + "grad_norm": 7.045590877532959, + "learning_rate": 6.299321652315871e-06, + "loss": 0.6131, + "step": 296600 + }, + { + "epoch": 2.62212910412136, + "grad_norm": 1.9321274757385254, + "learning_rate": 6.297848264644001e-06, + "loss": 0.3823, + "step": 296610 + }, + { + "epoch": 2.6222175073816723, + "grad_norm": 4.618926048278809, + "learning_rate": 6.296374876972131e-06, + "loss": 0.6513, + "step": 296620 + }, + { + "epoch": 2.6223059106419844, + "grad_norm": 1.745328426361084, + "learning_rate": 6.294901489300259e-06, + "loss": 0.5596, + "step": 296630 + }, + { + "epoch": 2.622394313902297, + "grad_norm": 8.227736473083496, + "learning_rate": 6.293428101628389e-06, + "loss": 0.626, + "step": 296640 + }, + { + "epoch": 2.622482717162609, + "grad_norm": 3.6181082725524902, + "learning_rate": 6.2919547139565175e-06, + "loss": 0.4272, + "step": 296650 + }, + { + "epoch": 2.622571120422921, + "grad_norm": 1.8214160203933716, + "learning_rate": 6.2904813262846475e-06, + "loss": 0.3556, + "step": 296660 + }, + { + "epoch": 2.6226595236832333, + "grad_norm": 0.9679660797119141, + "learning_rate": 6.289007938612776e-06, + "loss": 0.4126, + "step": 296670 + }, + { + "epoch": 2.6227479269435454, + "grad_norm": 9.050695419311523, + "learning_rate": 6.287534550940906e-06, + "loss": 0.5833, + "step": 296680 + }, + { + "epoch": 2.622836330203858, + "grad_norm": 4.009555816650391, + "learning_rate": 6.286061163269034e-06, + "loss": 0.497, + "step": 296690 + }, + { + "epoch": 2.62292473346417, + "grad_norm": 12.374807357788086, + "learning_rate": 6.284587775597164e-06, + "loss": 0.5209, + "step": 296700 + }, + { + "epoch": 2.6230131367244827, + "grad_norm": 3.205704689025879, + "learning_rate": 6.283114387925294e-06, + "loss": 0.5184, + "step": 296710 + }, + { + "epoch": 2.623101539984795, + "grad_norm": 4.3931403160095215, + "learning_rate": 6.281641000253423e-06, + "loss": 0.5176, + "step": 296720 + }, + { + "epoch": 2.623189943245107, + "grad_norm": 11.490986824035645, + "learning_rate": 6.280167612581553e-06, + "loss": 0.4594, + "step": 296730 + }, + { + "epoch": 2.623278346505419, + "grad_norm": 2.898409843444824, + "learning_rate": 6.278694224909681e-06, + "loss": 0.4566, + "step": 296740 + }, + { + "epoch": 2.623366749765731, + "grad_norm": 5.786645889282227, + "learning_rate": 6.277220837237811e-06, + "loss": 0.4333, + "step": 296750 + }, + { + "epoch": 2.6234551530260437, + "grad_norm": 3.61681866645813, + "learning_rate": 6.2757474495659395e-06, + "loss": 0.5771, + "step": 296760 + }, + { + "epoch": 2.623543556286356, + "grad_norm": 2.9245007038116455, + "learning_rate": 6.2742740618940696e-06, + "loss": 0.5912, + "step": 296770 + }, + { + "epoch": 2.623631959546668, + "grad_norm": 2.760175943374634, + "learning_rate": 6.272800674222199e-06, + "loss": 0.4354, + "step": 296780 + }, + { + "epoch": 2.6237203628069805, + "grad_norm": 13.331583023071289, + "learning_rate": 6.271327286550328e-06, + "loss": 0.4828, + "step": 296790 + }, + { + "epoch": 2.6238087660672926, + "grad_norm": 3.166128396987915, + "learning_rate": 6.269853898878457e-06, + "loss": 0.6222, + "step": 296800 + }, + { + "epoch": 2.6238971693276048, + "grad_norm": 1.9002878665924072, + "learning_rate": 6.268380511206587e-06, + "loss": 0.6267, + "step": 296810 + }, + { + "epoch": 2.623985572587917, + "grad_norm": 3.622171640396118, + "learning_rate": 6.2669071235347164e-06, + "loss": 0.5171, + "step": 296820 + }, + { + "epoch": 2.6240739758482294, + "grad_norm": 2.750575065612793, + "learning_rate": 6.265433735862846e-06, + "loss": 0.4944, + "step": 296830 + }, + { + "epoch": 2.6241623791085416, + "grad_norm": 1.3927528858184814, + "learning_rate": 6.263960348190976e-06, + "loss": 0.3345, + "step": 296840 + }, + { + "epoch": 2.6242507823688537, + "grad_norm": 3.6370465755462646, + "learning_rate": 6.262486960519104e-06, + "loss": 0.6134, + "step": 296850 + }, + { + "epoch": 2.6243391856291662, + "grad_norm": 6.518866062164307, + "learning_rate": 6.261013572847234e-06, + "loss": 0.6218, + "step": 296860 + }, + { + "epoch": 2.6244275888894784, + "grad_norm": 5.840656280517578, + "learning_rate": 6.2595401851753624e-06, + "loss": 0.4267, + "step": 296870 + }, + { + "epoch": 2.6245159921497905, + "grad_norm": 17.089797973632812, + "learning_rate": 6.2580667975034925e-06, + "loss": 0.645, + "step": 296880 + }, + { + "epoch": 2.6246043954101026, + "grad_norm": 3.8047397136688232, + "learning_rate": 6.256593409831621e-06, + "loss": 0.3247, + "step": 296890 + }, + { + "epoch": 2.6246927986704147, + "grad_norm": 0.7819920182228088, + "learning_rate": 6.255120022159751e-06, + "loss": 0.4151, + "step": 296900 + }, + { + "epoch": 2.6247812019307273, + "grad_norm": 1.3967572450637817, + "learning_rate": 6.253646634487881e-06, + "loss": 0.4841, + "step": 296910 + }, + { + "epoch": 2.6248696051910394, + "grad_norm": 10.407998085021973, + "learning_rate": 6.252173246816009e-06, + "loss": 0.6357, + "step": 296920 + }, + { + "epoch": 2.6249580084513515, + "grad_norm": 4.912079811096191, + "learning_rate": 6.250699859144139e-06, + "loss": 0.5237, + "step": 296930 + }, + { + "epoch": 2.625046411711664, + "grad_norm": 3.5336008071899414, + "learning_rate": 6.249226471472268e-06, + "loss": 0.5889, + "step": 296940 + }, + { + "epoch": 2.625134814971976, + "grad_norm": 1.1987332105636597, + "learning_rate": 6.247753083800398e-06, + "loss": 0.5473, + "step": 296950 + }, + { + "epoch": 2.6252232182322883, + "grad_norm": 2.105146884918213, + "learning_rate": 6.246279696128527e-06, + "loss": 0.5406, + "step": 296960 + }, + { + "epoch": 2.6253116214926004, + "grad_norm": 4.773249626159668, + "learning_rate": 6.244806308456656e-06, + "loss": 0.4766, + "step": 296970 + }, + { + "epoch": 2.625400024752913, + "grad_norm": 1.4825936555862427, + "learning_rate": 6.243332920784785e-06, + "loss": 0.5661, + "step": 296980 + }, + { + "epoch": 2.625488428013225, + "grad_norm": 4.038149833679199, + "learning_rate": 6.2418595331129145e-06, + "loss": 0.5748, + "step": 296990 + }, + { + "epoch": 2.6255768312735372, + "grad_norm": 2.2282874584198, + "learning_rate": 6.240386145441044e-06, + "loss": 0.7397, + "step": 297000 + }, + { + "epoch": 2.62566523453385, + "grad_norm": 2.0650060176849365, + "learning_rate": 6.238912757769173e-06, + "loss": 0.6212, + "step": 297010 + }, + { + "epoch": 2.625753637794162, + "grad_norm": 1.7456274032592773, + "learning_rate": 6.237439370097303e-06, + "loss": 0.5434, + "step": 297020 + }, + { + "epoch": 2.625842041054474, + "grad_norm": 5.306339740753174, + "learning_rate": 6.235965982425432e-06, + "loss": 0.4947, + "step": 297030 + }, + { + "epoch": 2.625930444314786, + "grad_norm": 4.161145210266113, + "learning_rate": 6.234492594753561e-06, + "loss": 0.6229, + "step": 297040 + }, + { + "epoch": 2.6260188475750987, + "grad_norm": 8.775486946105957, + "learning_rate": 6.2330192070816914e-06, + "loss": 0.5847, + "step": 297050 + }, + { + "epoch": 2.626107250835411, + "grad_norm": 4.392618179321289, + "learning_rate": 6.231545819409821e-06, + "loss": 0.3722, + "step": 297060 + }, + { + "epoch": 2.626195654095723, + "grad_norm": 3.022367000579834, + "learning_rate": 6.23007243173795e-06, + "loss": 0.5912, + "step": 297070 + }, + { + "epoch": 2.6262840573560355, + "grad_norm": 1.4011672735214233, + "learning_rate": 6.228599044066079e-06, + "loss": 0.5609, + "step": 297080 + }, + { + "epoch": 2.6263724606163477, + "grad_norm": 3.212547540664673, + "learning_rate": 6.227125656394208e-06, + "loss": 0.5974, + "step": 297090 + }, + { + "epoch": 2.62646086387666, + "grad_norm": 6.005535125732422, + "learning_rate": 6.2256522687223374e-06, + "loss": 0.5471, + "step": 297100 + }, + { + "epoch": 2.626549267136972, + "grad_norm": 6.0251641273498535, + "learning_rate": 6.224178881050467e-06, + "loss": 0.5076, + "step": 297110 + }, + { + "epoch": 2.626637670397284, + "grad_norm": 1.9463199377059937, + "learning_rate": 6.222705493378596e-06, + "loss": 0.4646, + "step": 297120 + }, + { + "epoch": 2.6267260736575966, + "grad_norm": 5.07763671875, + "learning_rate": 6.221232105706726e-06, + "loss": 0.4799, + "step": 297130 + }, + { + "epoch": 2.6268144769179087, + "grad_norm": 4.568909645080566, + "learning_rate": 6.219758718034855e-06, + "loss": 0.5343, + "step": 297140 + }, + { + "epoch": 2.626902880178221, + "grad_norm": 2.5149874687194824, + "learning_rate": 6.218285330362984e-06, + "loss": 0.4321, + "step": 297150 + }, + { + "epoch": 2.6269912834385334, + "grad_norm": 7.902552127838135, + "learning_rate": 6.2168119426911135e-06, + "loss": 0.5388, + "step": 297160 + }, + { + "epoch": 2.6270796866988455, + "grad_norm": 16.79007911682129, + "learning_rate": 6.215338555019243e-06, + "loss": 0.5918, + "step": 297170 + }, + { + "epoch": 2.6271680899591576, + "grad_norm": 1.9529039859771729, + "learning_rate": 6.213865167347372e-06, + "loss": 0.4506, + "step": 297180 + }, + { + "epoch": 2.6272564932194697, + "grad_norm": 2.665576696395874, + "learning_rate": 6.212391779675501e-06, + "loss": 0.5714, + "step": 297190 + }, + { + "epoch": 2.6273448964797823, + "grad_norm": 3.071577787399292, + "learning_rate": 6.21091839200363e-06, + "loss": 0.498, + "step": 297200 + }, + { + "epoch": 2.6274332997400944, + "grad_norm": 2.7690770626068115, + "learning_rate": 6.2094450043317595e-06, + "loss": 0.4887, + "step": 297210 + }, + { + "epoch": 2.6275217030004065, + "grad_norm": 2.4199378490448, + "learning_rate": 6.207971616659889e-06, + "loss": 0.4274, + "step": 297220 + }, + { + "epoch": 2.627610106260719, + "grad_norm": 8.888883590698242, + "learning_rate": 6.206498228988019e-06, + "loss": 0.5173, + "step": 297230 + }, + { + "epoch": 2.6276985095210312, + "grad_norm": 0.6115412712097168, + "learning_rate": 6.205024841316148e-06, + "loss": 0.5687, + "step": 297240 + }, + { + "epoch": 2.6277869127813434, + "grad_norm": 2.4089760780334473, + "learning_rate": 6.203551453644277e-06, + "loss": 0.4274, + "step": 297250 + }, + { + "epoch": 2.6278753160416555, + "grad_norm": 6.167481899261475, + "learning_rate": 6.202078065972406e-06, + "loss": 0.4952, + "step": 297260 + }, + { + "epoch": 2.6279637193019676, + "grad_norm": 2.8998336791992188, + "learning_rate": 6.200604678300536e-06, + "loss": 0.5102, + "step": 297270 + }, + { + "epoch": 2.62805212256228, + "grad_norm": 5.65235710144043, + "learning_rate": 6.199131290628666e-06, + "loss": 0.4894, + "step": 297280 + }, + { + "epoch": 2.6281405258225923, + "grad_norm": 8.634903907775879, + "learning_rate": 6.197657902956795e-06, + "loss": 0.6137, + "step": 297290 + }, + { + "epoch": 2.628228929082905, + "grad_norm": 2.3781168460845947, + "learning_rate": 6.196184515284924e-06, + "loss": 0.4702, + "step": 297300 + }, + { + "epoch": 2.628317332343217, + "grad_norm": 3.450167179107666, + "learning_rate": 6.194711127613053e-06, + "loss": 0.5068, + "step": 297310 + }, + { + "epoch": 2.628405735603529, + "grad_norm": 3.9967963695526123, + "learning_rate": 6.193237739941183e-06, + "loss": 0.6346, + "step": 297320 + }, + { + "epoch": 2.628494138863841, + "grad_norm": 1.5162642002105713, + "learning_rate": 6.1917643522693125e-06, + "loss": 0.4777, + "step": 297330 + }, + { + "epoch": 2.6285825421241533, + "grad_norm": 17.418315887451172, + "learning_rate": 6.190290964597442e-06, + "loss": 0.5336, + "step": 297340 + }, + { + "epoch": 2.628670945384466, + "grad_norm": 3.172459602355957, + "learning_rate": 6.188817576925571e-06, + "loss": 0.6941, + "step": 297350 + }, + { + "epoch": 2.628759348644778, + "grad_norm": 1.3191639184951782, + "learning_rate": 6.1873441892537e-06, + "loss": 0.5218, + "step": 297360 + }, + { + "epoch": 2.62884775190509, + "grad_norm": 3.4024739265441895, + "learning_rate": 6.185870801581829e-06, + "loss": 0.5633, + "step": 297370 + }, + { + "epoch": 2.6289361551654027, + "grad_norm": 5.1784868240356445, + "learning_rate": 6.1843974139099585e-06, + "loss": 0.5607, + "step": 297380 + }, + { + "epoch": 2.629024558425715, + "grad_norm": 2.687730550765991, + "learning_rate": 6.182924026238088e-06, + "loss": 0.5526, + "step": 297390 + }, + { + "epoch": 2.629112961686027, + "grad_norm": 4.80757999420166, + "learning_rate": 6.181450638566217e-06, + "loss": 0.581, + "step": 297400 + }, + { + "epoch": 2.629201364946339, + "grad_norm": 3.332218885421753, + "learning_rate": 6.179977250894347e-06, + "loss": 0.7056, + "step": 297410 + }, + { + "epoch": 2.6292897682066516, + "grad_norm": 4.545067310333252, + "learning_rate": 6.178503863222476e-06, + "loss": 0.3939, + "step": 297420 + }, + { + "epoch": 2.6293781714669637, + "grad_norm": 1.8762322664260864, + "learning_rate": 6.177030475550605e-06, + "loss": 0.5506, + "step": 297430 + }, + { + "epoch": 2.629466574727276, + "grad_norm": 3.2548396587371826, + "learning_rate": 6.1755570878787345e-06, + "loss": 0.553, + "step": 297440 + }, + { + "epoch": 2.6295549779875884, + "grad_norm": 3.747608184814453, + "learning_rate": 6.174083700206864e-06, + "loss": 0.5455, + "step": 297450 + }, + { + "epoch": 2.6296433812479005, + "grad_norm": 1.17307448387146, + "learning_rate": 6.172610312534993e-06, + "loss": 0.5023, + "step": 297460 + }, + { + "epoch": 2.6297317845082127, + "grad_norm": 1.6170775890350342, + "learning_rate": 6.171136924863122e-06, + "loss": 0.5615, + "step": 297470 + }, + { + "epoch": 2.6298201877685248, + "grad_norm": 3.600369453430176, + "learning_rate": 6.169663537191251e-06, + "loss": 0.5224, + "step": 297480 + }, + { + "epoch": 2.629908591028837, + "grad_norm": 4.478697776794434, + "learning_rate": 6.168190149519381e-06, + "loss": 0.4766, + "step": 297490 + }, + { + "epoch": 2.6299969942891495, + "grad_norm": 8.118295669555664, + "learning_rate": 6.1667167618475106e-06, + "loss": 0.4758, + "step": 297500 + }, + { + "epoch": 2.6300853975494616, + "grad_norm": 4.310422420501709, + "learning_rate": 6.16524337417564e-06, + "loss": 0.479, + "step": 297510 + }, + { + "epoch": 2.6301738008097737, + "grad_norm": 10.327524185180664, + "learning_rate": 6.16376998650377e-06, + "loss": 0.4883, + "step": 297520 + }, + { + "epoch": 2.6302622040700863, + "grad_norm": 7.541807651519775, + "learning_rate": 6.162296598831899e-06, + "loss": 0.5687, + "step": 297530 + }, + { + "epoch": 2.6303506073303984, + "grad_norm": 6.903156757354736, + "learning_rate": 6.160823211160028e-06, + "loss": 0.546, + "step": 297540 + }, + { + "epoch": 2.6304390105907105, + "grad_norm": 7.52591609954834, + "learning_rate": 6.159349823488157e-06, + "loss": 0.5057, + "step": 297550 + }, + { + "epoch": 2.6305274138510226, + "grad_norm": 2.3479747772216797, + "learning_rate": 6.157876435816287e-06, + "loss": 0.5581, + "step": 297560 + }, + { + "epoch": 2.630615817111335, + "grad_norm": 0.7813774943351746, + "learning_rate": 6.156403048144416e-06, + "loss": 0.4039, + "step": 297570 + }, + { + "epoch": 2.6307042203716473, + "grad_norm": 2.942129135131836, + "learning_rate": 6.154929660472545e-06, + "loss": 0.5301, + "step": 297580 + }, + { + "epoch": 2.6307926236319594, + "grad_norm": 3.4932315349578857, + "learning_rate": 6.153456272800674e-06, + "loss": 0.6009, + "step": 297590 + }, + { + "epoch": 2.630881026892272, + "grad_norm": 6.496263027191162, + "learning_rate": 6.151982885128804e-06, + "loss": 0.5354, + "step": 297600 + }, + { + "epoch": 2.630969430152584, + "grad_norm": 1.9308843612670898, + "learning_rate": 6.1505094974569335e-06, + "loss": 0.6506, + "step": 297610 + }, + { + "epoch": 2.6310578334128962, + "grad_norm": 2.2697267532348633, + "learning_rate": 6.149036109785063e-06, + "loss": 0.4687, + "step": 297620 + }, + { + "epoch": 2.6311462366732083, + "grad_norm": 5.300718307495117, + "learning_rate": 6.147562722113192e-06, + "loss": 0.5862, + "step": 297630 + }, + { + "epoch": 2.631234639933521, + "grad_norm": 1.401847004890442, + "learning_rate": 6.146089334441321e-06, + "loss": 0.4283, + "step": 297640 + }, + { + "epoch": 2.631323043193833, + "grad_norm": 3.2144014835357666, + "learning_rate": 6.14461594676945e-06, + "loss": 0.4447, + "step": 297650 + }, + { + "epoch": 2.631411446454145, + "grad_norm": 2.336986780166626, + "learning_rate": 6.1431425590975795e-06, + "loss": 0.5029, + "step": 297660 + }, + { + "epoch": 2.6314998497144577, + "grad_norm": 1.436829686164856, + "learning_rate": 6.141669171425709e-06, + "loss": 0.4843, + "step": 297670 + }, + { + "epoch": 2.63158825297477, + "grad_norm": 1.3371931314468384, + "learning_rate": 6.140195783753838e-06, + "loss": 0.4881, + "step": 297680 + }, + { + "epoch": 2.631676656235082, + "grad_norm": 6.887965679168701, + "learning_rate": 6.138722396081968e-06, + "loss": 0.6153, + "step": 297690 + }, + { + "epoch": 2.631765059495394, + "grad_norm": 5.459323883056641, + "learning_rate": 6.137249008410097e-06, + "loss": 0.6082, + "step": 297700 + }, + { + "epoch": 2.631853462755706, + "grad_norm": 6.077756404876709, + "learning_rate": 6.135775620738226e-06, + "loss": 0.486, + "step": 297710 + }, + { + "epoch": 2.6319418660160188, + "grad_norm": 5.706859111785889, + "learning_rate": 6.1343022330663555e-06, + "loss": 0.4996, + "step": 297720 + }, + { + "epoch": 2.632030269276331, + "grad_norm": 2.643465280532837, + "learning_rate": 6.1328288453944856e-06, + "loss": 0.459, + "step": 297730 + }, + { + "epoch": 2.632118672536643, + "grad_norm": 2.345921039581299, + "learning_rate": 6.131355457722615e-06, + "loss": 0.4537, + "step": 297740 + }, + { + "epoch": 2.6322070757969556, + "grad_norm": 1.4083094596862793, + "learning_rate": 6.129882070050744e-06, + "loss": 0.4447, + "step": 297750 + }, + { + "epoch": 2.6322954790572677, + "grad_norm": 3.8890533447265625, + "learning_rate": 6.128408682378873e-06, + "loss": 0.5766, + "step": 297760 + }, + { + "epoch": 2.63238388231758, + "grad_norm": 18.33075714111328, + "learning_rate": 6.126935294707002e-06, + "loss": 0.4819, + "step": 297770 + }, + { + "epoch": 2.632472285577892, + "grad_norm": 6.183882713317871, + "learning_rate": 6.125461907035132e-06, + "loss": 0.6478, + "step": 297780 + }, + { + "epoch": 2.6325606888382045, + "grad_norm": 11.201942443847656, + "learning_rate": 6.123988519363262e-06, + "loss": 0.55, + "step": 297790 + }, + { + "epoch": 2.6326490920985166, + "grad_norm": 8.282132148742676, + "learning_rate": 6.122515131691391e-06, + "loss": 0.4467, + "step": 297800 + }, + { + "epoch": 2.6327374953588287, + "grad_norm": 2.1623849868774414, + "learning_rate": 6.12104174401952e-06, + "loss": 0.5566, + "step": 297810 + }, + { + "epoch": 2.6328258986191413, + "grad_norm": 10.263544082641602, + "learning_rate": 6.119568356347649e-06, + "loss": 0.6309, + "step": 297820 + }, + { + "epoch": 2.6329143018794534, + "grad_norm": 3.179201602935791, + "learning_rate": 6.1180949686757784e-06, + "loss": 0.6064, + "step": 297830 + }, + { + "epoch": 2.6330027051397655, + "grad_norm": 1.2198222875595093, + "learning_rate": 6.116621581003908e-06, + "loss": 0.4593, + "step": 297840 + }, + { + "epoch": 2.6330911084000777, + "grad_norm": 4.032303333282471, + "learning_rate": 6.115148193332037e-06, + "loss": 0.5622, + "step": 297850 + }, + { + "epoch": 2.6331795116603898, + "grad_norm": 13.47317886352539, + "learning_rate": 6.113674805660166e-06, + "loss": 0.478, + "step": 297860 + }, + { + "epoch": 2.6332679149207023, + "grad_norm": 0.8424588441848755, + "learning_rate": 6.112201417988295e-06, + "loss": 0.4061, + "step": 297870 + }, + { + "epoch": 2.6333563181810145, + "grad_norm": 1.3572304248809814, + "learning_rate": 6.110728030316425e-06, + "loss": 0.4795, + "step": 297880 + }, + { + "epoch": 2.633444721441327, + "grad_norm": 2.0501112937927246, + "learning_rate": 6.1092546426445545e-06, + "loss": 0.5393, + "step": 297890 + }, + { + "epoch": 2.633533124701639, + "grad_norm": 4.382127285003662, + "learning_rate": 6.107781254972684e-06, + "loss": 0.593, + "step": 297900 + }, + { + "epoch": 2.6336215279619513, + "grad_norm": 3.276899576187134, + "learning_rate": 6.106307867300813e-06, + "loss": 0.5218, + "step": 297910 + }, + { + "epoch": 2.6337099312222634, + "grad_norm": 2.657844066619873, + "learning_rate": 6.104834479628942e-06, + "loss": 0.5399, + "step": 297920 + }, + { + "epoch": 2.6337983344825755, + "grad_norm": 2.038175106048584, + "learning_rate": 6.103361091957071e-06, + "loss": 0.5707, + "step": 297930 + }, + { + "epoch": 2.633886737742888, + "grad_norm": 6.554128646850586, + "learning_rate": 6.1018877042852005e-06, + "loss": 0.5606, + "step": 297940 + }, + { + "epoch": 2.6339751410032, + "grad_norm": 0.9881811738014221, + "learning_rate": 6.1004143166133305e-06, + "loss": 0.4072, + "step": 297950 + }, + { + "epoch": 2.6340635442635123, + "grad_norm": 4.747561931610107, + "learning_rate": 6.09894092894146e-06, + "loss": 0.5304, + "step": 297960 + }, + { + "epoch": 2.634151947523825, + "grad_norm": 2.0391249656677246, + "learning_rate": 6.097467541269589e-06, + "loss": 0.4099, + "step": 297970 + }, + { + "epoch": 2.634240350784137, + "grad_norm": 7.429102420806885, + "learning_rate": 6.095994153597719e-06, + "loss": 0.6363, + "step": 297980 + }, + { + "epoch": 2.634328754044449, + "grad_norm": 3.3847761154174805, + "learning_rate": 6.094520765925848e-06, + "loss": 0.4509, + "step": 297990 + }, + { + "epoch": 2.6344171573047612, + "grad_norm": 3.925161600112915, + "learning_rate": 6.093047378253977e-06, + "loss": 0.6115, + "step": 298000 + }, + { + "epoch": 2.634505560565074, + "grad_norm": 3.301105260848999, + "learning_rate": 6.091573990582107e-06, + "loss": 0.5669, + "step": 298010 + }, + { + "epoch": 2.634593963825386, + "grad_norm": 1.752500057220459, + "learning_rate": 6.090100602910236e-06, + "loss": 0.4387, + "step": 298020 + }, + { + "epoch": 2.634682367085698, + "grad_norm": 1.7501568794250488, + "learning_rate": 6.088627215238365e-06, + "loss": 0.4563, + "step": 298030 + }, + { + "epoch": 2.6347707703460106, + "grad_norm": 3.3367063999176025, + "learning_rate": 6.087153827566494e-06, + "loss": 0.4511, + "step": 298040 + }, + { + "epoch": 2.6348591736063227, + "grad_norm": 6.2698974609375, + "learning_rate": 6.085680439894623e-06, + "loss": 0.5242, + "step": 298050 + }, + { + "epoch": 2.634947576866635, + "grad_norm": 2.725039482116699, + "learning_rate": 6.084207052222753e-06, + "loss": 0.6176, + "step": 298060 + }, + { + "epoch": 2.635035980126947, + "grad_norm": 5.542684555053711, + "learning_rate": 6.082733664550883e-06, + "loss": 0.5283, + "step": 298070 + }, + { + "epoch": 2.635124383387259, + "grad_norm": 7.4037675857543945, + "learning_rate": 6.081260276879012e-06, + "loss": 0.5963, + "step": 298080 + }, + { + "epoch": 2.6352127866475716, + "grad_norm": 9.794493675231934, + "learning_rate": 6.079786889207141e-06, + "loss": 0.5064, + "step": 298090 + }, + { + "epoch": 2.6353011899078838, + "grad_norm": 2.133608102798462, + "learning_rate": 6.07831350153527e-06, + "loss": 0.5314, + "step": 298100 + }, + { + "epoch": 2.6353895931681963, + "grad_norm": 4.548186779022217, + "learning_rate": 6.0768401138633994e-06, + "loss": 0.7655, + "step": 298110 + }, + { + "epoch": 2.6354779964285084, + "grad_norm": 3.131840467453003, + "learning_rate": 6.075366726191529e-06, + "loss": 0.3113, + "step": 298120 + }, + { + "epoch": 2.6355663996888206, + "grad_norm": 9.687224388122559, + "learning_rate": 6.073893338519658e-06, + "loss": 0.5218, + "step": 298130 + }, + { + "epoch": 2.6356548029491327, + "grad_norm": 10.10106086730957, + "learning_rate": 6.072419950847787e-06, + "loss": 0.5161, + "step": 298140 + }, + { + "epoch": 2.635743206209445, + "grad_norm": 3.3224666118621826, + "learning_rate": 6.070946563175916e-06, + "loss": 0.6377, + "step": 298150 + }, + { + "epoch": 2.6358316094697574, + "grad_norm": 2.8907089233398438, + "learning_rate": 6.069473175504046e-06, + "loss": 0.4454, + "step": 298160 + }, + { + "epoch": 2.6359200127300695, + "grad_norm": 6.396096706390381, + "learning_rate": 6.0679997878321755e-06, + "loss": 0.6378, + "step": 298170 + }, + { + "epoch": 2.6360084159903816, + "grad_norm": 3.955477476119995, + "learning_rate": 6.066526400160305e-06, + "loss": 0.5183, + "step": 298180 + }, + { + "epoch": 2.636096819250694, + "grad_norm": 3.1286420822143555, + "learning_rate": 6.065053012488434e-06, + "loss": 0.414, + "step": 298190 + }, + { + "epoch": 2.6361852225110063, + "grad_norm": 2.1106762886047363, + "learning_rate": 6.063579624816564e-06, + "loss": 0.4327, + "step": 298200 + }, + { + "epoch": 2.6362736257713184, + "grad_norm": 2.377924919128418, + "learning_rate": 6.062106237144693e-06, + "loss": 0.4772, + "step": 298210 + }, + { + "epoch": 2.6363620290316305, + "grad_norm": 0.9422692656517029, + "learning_rate": 6.060632849472822e-06, + "loss": 0.5167, + "step": 298220 + }, + { + "epoch": 2.636450432291943, + "grad_norm": 2.3086581230163574, + "learning_rate": 6.0591594618009516e-06, + "loss": 0.4347, + "step": 298230 + }, + { + "epoch": 2.636538835552255, + "grad_norm": 3.3164267539978027, + "learning_rate": 6.057686074129081e-06, + "loss": 0.5373, + "step": 298240 + }, + { + "epoch": 2.6366272388125673, + "grad_norm": 2.795579671859741, + "learning_rate": 6.05621268645721e-06, + "loss": 0.4071, + "step": 298250 + }, + { + "epoch": 2.63671564207288, + "grad_norm": 3.5417966842651367, + "learning_rate": 6.05473929878534e-06, + "loss": 0.6484, + "step": 298260 + }, + { + "epoch": 2.636804045333192, + "grad_norm": 3.901432752609253, + "learning_rate": 6.053265911113469e-06, + "loss": 0.5315, + "step": 298270 + }, + { + "epoch": 2.636892448593504, + "grad_norm": 2.1611790657043457, + "learning_rate": 6.051792523441598e-06, + "loss": 0.4918, + "step": 298280 + }, + { + "epoch": 2.6369808518538163, + "grad_norm": 2.7642641067504883, + "learning_rate": 6.050319135769728e-06, + "loss": 0.4498, + "step": 298290 + }, + { + "epoch": 2.6370692551141284, + "grad_norm": 3.69091534614563, + "learning_rate": 6.048845748097857e-06, + "loss": 0.4827, + "step": 298300 + }, + { + "epoch": 2.637157658374441, + "grad_norm": 2.3950765132904053, + "learning_rate": 6.047372360425986e-06, + "loss": 0.563, + "step": 298310 + }, + { + "epoch": 2.637246061634753, + "grad_norm": 3.0076823234558105, + "learning_rate": 6.045898972754115e-06, + "loss": 0.4773, + "step": 298320 + }, + { + "epoch": 2.637334464895065, + "grad_norm": 4.34055757522583, + "learning_rate": 6.044425585082244e-06, + "loss": 0.3947, + "step": 298330 + }, + { + "epoch": 2.6374228681553777, + "grad_norm": 2.9598424434661865, + "learning_rate": 6.042952197410374e-06, + "loss": 0.4494, + "step": 298340 + }, + { + "epoch": 2.63751127141569, + "grad_norm": 3.435628890991211, + "learning_rate": 6.041478809738504e-06, + "loss": 0.5276, + "step": 298350 + }, + { + "epoch": 2.637599674676002, + "grad_norm": 3.2643373012542725, + "learning_rate": 6.040005422066633e-06, + "loss": 0.4759, + "step": 298360 + }, + { + "epoch": 2.637688077936314, + "grad_norm": 4.635200023651123, + "learning_rate": 6.038532034394762e-06, + "loss": 0.4888, + "step": 298370 + }, + { + "epoch": 2.6377764811966267, + "grad_norm": 3.117058515548706, + "learning_rate": 6.037058646722891e-06, + "loss": 0.5432, + "step": 298380 + }, + { + "epoch": 2.637864884456939, + "grad_norm": 1.4245867729187012, + "learning_rate": 6.0355852590510205e-06, + "loss": 0.4429, + "step": 298390 + }, + { + "epoch": 2.637953287717251, + "grad_norm": 1.2259520292282104, + "learning_rate": 6.03411187137915e-06, + "loss": 0.3787, + "step": 298400 + }, + { + "epoch": 2.6380416909775635, + "grad_norm": 17.554363250732422, + "learning_rate": 6.032638483707279e-06, + "loss": 0.6545, + "step": 298410 + }, + { + "epoch": 2.6381300942378756, + "grad_norm": 2.7785677909851074, + "learning_rate": 6.031165096035409e-06, + "loss": 0.3861, + "step": 298420 + }, + { + "epoch": 2.6382184974981877, + "grad_norm": 2.6734721660614014, + "learning_rate": 6.029691708363538e-06, + "loss": 0.4381, + "step": 298430 + }, + { + "epoch": 2.6383069007585, + "grad_norm": 2.8754873275756836, + "learning_rate": 6.028218320691667e-06, + "loss": 0.4965, + "step": 298440 + }, + { + "epoch": 2.638395304018812, + "grad_norm": 1.9562495946884155, + "learning_rate": 6.026744933019797e-06, + "loss": 0.4593, + "step": 298450 + }, + { + "epoch": 2.6384837072791245, + "grad_norm": 4.370481967926025, + "learning_rate": 6.0252715453479266e-06, + "loss": 0.5482, + "step": 298460 + }, + { + "epoch": 2.6385721105394366, + "grad_norm": 2.0795414447784424, + "learning_rate": 6.023798157676056e-06, + "loss": 0.4656, + "step": 298470 + }, + { + "epoch": 2.638660513799749, + "grad_norm": 2.1200146675109863, + "learning_rate": 6.022324770004185e-06, + "loss": 0.5801, + "step": 298480 + }, + { + "epoch": 2.6387489170600613, + "grad_norm": 2.834552764892578, + "learning_rate": 6.020851382332314e-06, + "loss": 0.5546, + "step": 298490 + }, + { + "epoch": 2.6388373203203734, + "grad_norm": 1.9214617013931274, + "learning_rate": 6.019377994660443e-06, + "loss": 0.4289, + "step": 298500 + }, + { + "epoch": 2.6389257235806856, + "grad_norm": 5.596981048583984, + "learning_rate": 6.0179046069885726e-06, + "loss": 0.4782, + "step": 298510 + }, + { + "epoch": 2.6390141268409977, + "grad_norm": 3.602888822555542, + "learning_rate": 6.016431219316702e-06, + "loss": 0.6203, + "step": 298520 + }, + { + "epoch": 2.6391025301013102, + "grad_norm": 2.0379583835601807, + "learning_rate": 6.014957831644831e-06, + "loss": 0.5519, + "step": 298530 + }, + { + "epoch": 2.6391909333616224, + "grad_norm": 0.9603956341743469, + "learning_rate": 6.013484443972961e-06, + "loss": 0.4392, + "step": 298540 + }, + { + "epoch": 2.6392793366219345, + "grad_norm": 1.7124792337417603, + "learning_rate": 6.01201105630109e-06, + "loss": 0.5637, + "step": 298550 + }, + { + "epoch": 2.639367739882247, + "grad_norm": 3.723620891571045, + "learning_rate": 6.010537668629219e-06, + "loss": 0.6346, + "step": 298560 + }, + { + "epoch": 2.639456143142559, + "grad_norm": 2.2513539791107178, + "learning_rate": 6.009064280957349e-06, + "loss": 0.3625, + "step": 298570 + }, + { + "epoch": 2.6395445464028713, + "grad_norm": 3.4692537784576416, + "learning_rate": 6.007590893285478e-06, + "loss": 0.4994, + "step": 298580 + }, + { + "epoch": 2.6396329496631834, + "grad_norm": 1.8920928239822388, + "learning_rate": 6.006117505613607e-06, + "loss": 0.5068, + "step": 298590 + }, + { + "epoch": 2.639721352923496, + "grad_norm": 9.385475158691406, + "learning_rate": 6.004644117941736e-06, + "loss": 0.463, + "step": 298600 + }, + { + "epoch": 2.639809756183808, + "grad_norm": 4.002129077911377, + "learning_rate": 6.0031707302698654e-06, + "loss": 0.6679, + "step": 298610 + }, + { + "epoch": 2.63989815944412, + "grad_norm": 3.3416197299957275, + "learning_rate": 6.001697342597995e-06, + "loss": 0.4667, + "step": 298620 + }, + { + "epoch": 2.6399865627044328, + "grad_norm": 2.6004343032836914, + "learning_rate": 6.000223954926125e-06, + "loss": 0.4338, + "step": 298630 + }, + { + "epoch": 2.640074965964745, + "grad_norm": 1.1845167875289917, + "learning_rate": 5.998750567254254e-06, + "loss": 0.5767, + "step": 298640 + }, + { + "epoch": 2.640163369225057, + "grad_norm": 2.5302135944366455, + "learning_rate": 5.997277179582383e-06, + "loss": 0.4934, + "step": 298650 + }, + { + "epoch": 2.640251772485369, + "grad_norm": 2.9948487281799316, + "learning_rate": 5.995803791910512e-06, + "loss": 0.6585, + "step": 298660 + }, + { + "epoch": 2.6403401757456813, + "grad_norm": 2.1750521659851074, + "learning_rate": 5.994330404238642e-06, + "loss": 0.4909, + "step": 298670 + }, + { + "epoch": 2.640428579005994, + "grad_norm": 2.5716795921325684, + "learning_rate": 5.9928570165667715e-06, + "loss": 0.5582, + "step": 298680 + }, + { + "epoch": 2.640516982266306, + "grad_norm": 6.446140766143799, + "learning_rate": 5.991383628894901e-06, + "loss": 0.4769, + "step": 298690 + }, + { + "epoch": 2.6406053855266185, + "grad_norm": 3.36838960647583, + "learning_rate": 5.98991024122303e-06, + "loss": 0.5356, + "step": 298700 + }, + { + "epoch": 2.6406937887869306, + "grad_norm": 1.8072950839996338, + "learning_rate": 5.988436853551159e-06, + "loss": 0.4563, + "step": 298710 + }, + { + "epoch": 2.6407821920472427, + "grad_norm": 4.696130752563477, + "learning_rate": 5.986963465879289e-06, + "loss": 0.5848, + "step": 298720 + }, + { + "epoch": 2.640870595307555, + "grad_norm": 4.027421474456787, + "learning_rate": 5.985490078207418e-06, + "loss": 0.5612, + "step": 298730 + }, + { + "epoch": 2.640958998567867, + "grad_norm": 3.902841567993164, + "learning_rate": 5.984016690535548e-06, + "loss": 0.4223, + "step": 298740 + }, + { + "epoch": 2.6410474018281795, + "grad_norm": 4.4279656410217285, + "learning_rate": 5.982543302863677e-06, + "loss": 0.5344, + "step": 298750 + }, + { + "epoch": 2.6411358050884917, + "grad_norm": 5.622204303741455, + "learning_rate": 5.981069915191806e-06, + "loss": 0.464, + "step": 298760 + }, + { + "epoch": 2.641224208348804, + "grad_norm": 2.7881548404693604, + "learning_rate": 5.979596527519935e-06, + "loss": 0.4306, + "step": 298770 + }, + { + "epoch": 2.6413126116091163, + "grad_norm": 1.774688720703125, + "learning_rate": 5.978123139848064e-06, + "loss": 0.5026, + "step": 298780 + }, + { + "epoch": 2.6414010148694285, + "grad_norm": 16.61012840270996, + "learning_rate": 5.976649752176194e-06, + "loss": 0.446, + "step": 298790 + }, + { + "epoch": 2.6414894181297406, + "grad_norm": 4.308602809906006, + "learning_rate": 5.975176364504323e-06, + "loss": 0.5023, + "step": 298800 + }, + { + "epoch": 2.6415778213900527, + "grad_norm": 5.65335750579834, + "learning_rate": 5.973702976832452e-06, + "loss": 0.4937, + "step": 298810 + }, + { + "epoch": 2.6416662246503653, + "grad_norm": 8.54035758972168, + "learning_rate": 5.972229589160582e-06, + "loss": 0.4615, + "step": 298820 + }, + { + "epoch": 2.6417546279106774, + "grad_norm": 3.2146060466766357, + "learning_rate": 5.970756201488711e-06, + "loss": 0.3976, + "step": 298830 + }, + { + "epoch": 2.6418430311709895, + "grad_norm": 0.89947509765625, + "learning_rate": 5.9692828138168404e-06, + "loss": 0.4464, + "step": 298840 + }, + { + "epoch": 2.641931434431302, + "grad_norm": 14.663556098937988, + "learning_rate": 5.96780942614497e-06, + "loss": 0.4662, + "step": 298850 + }, + { + "epoch": 2.642019837691614, + "grad_norm": 1.265819787979126, + "learning_rate": 5.966336038473099e-06, + "loss": 0.5287, + "step": 298860 + }, + { + "epoch": 2.6421082409519263, + "grad_norm": 1.633187174797058, + "learning_rate": 5.964862650801228e-06, + "loss": 0.6153, + "step": 298870 + }, + { + "epoch": 2.6421966442122384, + "grad_norm": 2.5062053203582764, + "learning_rate": 5.963389263129358e-06, + "loss": 0.5658, + "step": 298880 + }, + { + "epoch": 2.6422850474725506, + "grad_norm": 7.828660488128662, + "learning_rate": 5.961915875457487e-06, + "loss": 0.5225, + "step": 298890 + }, + { + "epoch": 2.642373450732863, + "grad_norm": 2.026047468185425, + "learning_rate": 5.9604424877856165e-06, + "loss": 0.5114, + "step": 298900 + }, + { + "epoch": 2.6424618539931752, + "grad_norm": 6.628760814666748, + "learning_rate": 5.9589691001137465e-06, + "loss": 0.3881, + "step": 298910 + }, + { + "epoch": 2.6425502572534874, + "grad_norm": 5.229498863220215, + "learning_rate": 5.957495712441876e-06, + "loss": 0.4201, + "step": 298920 + }, + { + "epoch": 2.6426386605138, + "grad_norm": 5.549785137176514, + "learning_rate": 5.956022324770005e-06, + "loss": 0.526, + "step": 298930 + }, + { + "epoch": 2.642727063774112, + "grad_norm": 2.6045281887054443, + "learning_rate": 5.954548937098134e-06, + "loss": 0.4948, + "step": 298940 + }, + { + "epoch": 2.642815467034424, + "grad_norm": 5.304214000701904, + "learning_rate": 5.953075549426263e-06, + "loss": 0.613, + "step": 298950 + }, + { + "epoch": 2.6429038702947363, + "grad_norm": 5.568345546722412, + "learning_rate": 5.9516021617543925e-06, + "loss": 0.5832, + "step": 298960 + }, + { + "epoch": 2.642992273555049, + "grad_norm": 2.4487364292144775, + "learning_rate": 5.950128774082522e-06, + "loss": 0.451, + "step": 298970 + }, + { + "epoch": 2.643080676815361, + "grad_norm": 7.256718158721924, + "learning_rate": 5.948655386410651e-06, + "loss": 0.4474, + "step": 298980 + }, + { + "epoch": 2.643169080075673, + "grad_norm": 3.2705841064453125, + "learning_rate": 5.94718199873878e-06, + "loss": 0.5076, + "step": 298990 + }, + { + "epoch": 2.6432574833359856, + "grad_norm": 1.1302958726882935, + "learning_rate": 5.94570861106691e-06, + "loss": 0.5114, + "step": 299000 + }, + { + "epoch": 2.6433458865962978, + "grad_norm": 1.537200927734375, + "learning_rate": 5.944235223395039e-06, + "loss": 0.4477, + "step": 299010 + }, + { + "epoch": 2.64343428985661, + "grad_norm": 15.24630069732666, + "learning_rate": 5.942761835723169e-06, + "loss": 0.5097, + "step": 299020 + }, + { + "epoch": 2.643522693116922, + "grad_norm": 3.599844455718994, + "learning_rate": 5.941288448051298e-06, + "loss": 0.5074, + "step": 299030 + }, + { + "epoch": 2.643611096377234, + "grad_norm": 2.6808018684387207, + "learning_rate": 5.939815060379427e-06, + "loss": 0.5151, + "step": 299040 + }, + { + "epoch": 2.6436994996375467, + "grad_norm": 5.694947242736816, + "learning_rate": 5.938341672707556e-06, + "loss": 0.5375, + "step": 299050 + }, + { + "epoch": 2.643787902897859, + "grad_norm": 1.4874112606048584, + "learning_rate": 5.936868285035685e-06, + "loss": 0.534, + "step": 299060 + }, + { + "epoch": 2.6438763061581714, + "grad_norm": 3.332756519317627, + "learning_rate": 5.935394897363815e-06, + "loss": 0.4978, + "step": 299070 + }, + { + "epoch": 2.6439647094184835, + "grad_norm": 2.509071111679077, + "learning_rate": 5.933921509691944e-06, + "loss": 0.5769, + "step": 299080 + }, + { + "epoch": 2.6440531126787956, + "grad_norm": 2.9371838569641113, + "learning_rate": 5.932448122020073e-06, + "loss": 0.5396, + "step": 299090 + }, + { + "epoch": 2.6441415159391077, + "grad_norm": 0.8642210960388184, + "learning_rate": 5.930974734348203e-06, + "loss": 0.5949, + "step": 299100 + }, + { + "epoch": 2.64422991919942, + "grad_norm": 4.730560302734375, + "learning_rate": 5.929501346676332e-06, + "loss": 0.5591, + "step": 299110 + }, + { + "epoch": 2.6443183224597324, + "grad_norm": 5.395290851593018, + "learning_rate": 5.9280279590044615e-06, + "loss": 0.3596, + "step": 299120 + }, + { + "epoch": 2.6444067257200445, + "grad_norm": 2.94832706451416, + "learning_rate": 5.9265545713325915e-06, + "loss": 0.4724, + "step": 299130 + }, + { + "epoch": 2.6444951289803567, + "grad_norm": 2.6348061561584473, + "learning_rate": 5.925081183660721e-06, + "loss": 0.5037, + "step": 299140 + }, + { + "epoch": 2.644583532240669, + "grad_norm": 23.515432357788086, + "learning_rate": 5.92360779598885e-06, + "loss": 0.4817, + "step": 299150 + }, + { + "epoch": 2.6446719355009813, + "grad_norm": 1.7091264724731445, + "learning_rate": 5.922134408316979e-06, + "loss": 0.6203, + "step": 299160 + }, + { + "epoch": 2.6447603387612935, + "grad_norm": 16.170026779174805, + "learning_rate": 5.920661020645108e-06, + "loss": 0.5681, + "step": 299170 + }, + { + "epoch": 2.6448487420216056, + "grad_norm": 3.3658134937286377, + "learning_rate": 5.9191876329732375e-06, + "loss": 0.5005, + "step": 299180 + }, + { + "epoch": 2.644937145281918, + "grad_norm": 2.3344123363494873, + "learning_rate": 5.9177142453013676e-06, + "loss": 0.5975, + "step": 299190 + }, + { + "epoch": 2.6450255485422303, + "grad_norm": 2.658853769302368, + "learning_rate": 5.916240857629497e-06, + "loss": 0.6061, + "step": 299200 + }, + { + "epoch": 2.6451139518025424, + "grad_norm": 1.1098947525024414, + "learning_rate": 5.914767469957626e-06, + "loss": 0.4736, + "step": 299210 + }, + { + "epoch": 2.645202355062855, + "grad_norm": 6.524179458618164, + "learning_rate": 5.913294082285755e-06, + "loss": 0.4175, + "step": 299220 + }, + { + "epoch": 2.645290758323167, + "grad_norm": 5.043398857116699, + "learning_rate": 5.911820694613884e-06, + "loss": 0.6791, + "step": 299230 + }, + { + "epoch": 2.645379161583479, + "grad_norm": 1.1387453079223633, + "learning_rate": 5.9103473069420136e-06, + "loss": 0.4213, + "step": 299240 + }, + { + "epoch": 2.6454675648437913, + "grad_norm": 28.131587982177734, + "learning_rate": 5.908873919270143e-06, + "loss": 0.4273, + "step": 299250 + }, + { + "epoch": 2.6455559681041034, + "grad_norm": 3.276510715484619, + "learning_rate": 5.907400531598272e-06, + "loss": 0.5385, + "step": 299260 + }, + { + "epoch": 2.645644371364416, + "grad_norm": 4.549676895141602, + "learning_rate": 5.905927143926401e-06, + "loss": 0.593, + "step": 299270 + }, + { + "epoch": 2.645732774624728, + "grad_norm": 13.61014461517334, + "learning_rate": 5.904453756254531e-06, + "loss": 0.6263, + "step": 299280 + }, + { + "epoch": 2.6458211778850407, + "grad_norm": 4.668663501739502, + "learning_rate": 5.90298036858266e-06, + "loss": 0.5349, + "step": 299290 + }, + { + "epoch": 2.645909581145353, + "grad_norm": 8.08745002746582, + "learning_rate": 5.90150698091079e-06, + "loss": 0.6506, + "step": 299300 + }, + { + "epoch": 2.645997984405665, + "grad_norm": 5.145612716674805, + "learning_rate": 5.900033593238919e-06, + "loss": 0.5538, + "step": 299310 + }, + { + "epoch": 2.646086387665977, + "grad_norm": 4.72701358795166, + "learning_rate": 5.898560205567048e-06, + "loss": 0.4629, + "step": 299320 + }, + { + "epoch": 2.646174790926289, + "grad_norm": 8.899473190307617, + "learning_rate": 5.897086817895177e-06, + "loss": 0.4628, + "step": 299330 + }, + { + "epoch": 2.6462631941866017, + "grad_norm": 2.317150115966797, + "learning_rate": 5.895613430223306e-06, + "loss": 0.4427, + "step": 299340 + }, + { + "epoch": 2.646351597446914, + "grad_norm": 4.938594341278076, + "learning_rate": 5.8941400425514365e-06, + "loss": 0.5586, + "step": 299350 + }, + { + "epoch": 2.646440000707226, + "grad_norm": 2.9530131816864014, + "learning_rate": 5.892666654879566e-06, + "loss": 0.5091, + "step": 299360 + }, + { + "epoch": 2.6465284039675385, + "grad_norm": 6.7409210205078125, + "learning_rate": 5.891193267207695e-06, + "loss": 0.5635, + "step": 299370 + }, + { + "epoch": 2.6466168072278506, + "grad_norm": 3.0353035926818848, + "learning_rate": 5.889719879535825e-06, + "loss": 0.4461, + "step": 299380 + }, + { + "epoch": 2.6467052104881628, + "grad_norm": 2.4791598320007324, + "learning_rate": 5.888246491863954e-06, + "loss": 0.4304, + "step": 299390 + }, + { + "epoch": 2.646793613748475, + "grad_norm": 5.3606061935424805, + "learning_rate": 5.886773104192083e-06, + "loss": 0.5006, + "step": 299400 + }, + { + "epoch": 2.6468820170087874, + "grad_norm": 1.8334242105484009, + "learning_rate": 5.8852997165202125e-06, + "loss": 0.5189, + "step": 299410 + }, + { + "epoch": 2.6469704202690996, + "grad_norm": 2.621605634689331, + "learning_rate": 5.883826328848342e-06, + "loss": 0.5286, + "step": 299420 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 2.837853193283081, + "learning_rate": 5.882352941176471e-06, + "loss": 0.6425, + "step": 299430 + }, + { + "epoch": 2.6471472267897243, + "grad_norm": 2.478813409805298, + "learning_rate": 5.8808795535046e-06, + "loss": 0.6021, + "step": 299440 + }, + { + "epoch": 2.6472356300500364, + "grad_norm": 9.855226516723633, + "learning_rate": 5.879406165832729e-06, + "loss": 0.5001, + "step": 299450 + }, + { + "epoch": 2.6473240333103485, + "grad_norm": 7.30198860168457, + "learning_rate": 5.8779327781608585e-06, + "loss": 0.5817, + "step": 299460 + }, + { + "epoch": 2.6474124365706606, + "grad_norm": 2.5789685249328613, + "learning_rate": 5.8764593904889886e-06, + "loss": 0.5077, + "step": 299470 + }, + { + "epoch": 2.6475008398309727, + "grad_norm": 4.303621768951416, + "learning_rate": 5.874986002817118e-06, + "loss": 0.5378, + "step": 299480 + }, + { + "epoch": 2.6475892430912853, + "grad_norm": 2.1290602684020996, + "learning_rate": 5.873512615145247e-06, + "loss": 0.4819, + "step": 299490 + }, + { + "epoch": 2.6476776463515974, + "grad_norm": 12.2060546875, + "learning_rate": 5.872039227473376e-06, + "loss": 0.5185, + "step": 299500 + }, + { + "epoch": 2.6477660496119095, + "grad_norm": 4.694787502288818, + "learning_rate": 5.870565839801505e-06, + "loss": 0.4665, + "step": 299510 + }, + { + "epoch": 2.647854452872222, + "grad_norm": 1.4408296346664429, + "learning_rate": 5.8690924521296346e-06, + "loss": 0.4896, + "step": 299520 + }, + { + "epoch": 2.647942856132534, + "grad_norm": 3.8447821140289307, + "learning_rate": 5.867619064457764e-06, + "loss": 0.5506, + "step": 299530 + }, + { + "epoch": 2.6480312593928463, + "grad_norm": 3.135713577270508, + "learning_rate": 5.866145676785893e-06, + "loss": 0.5827, + "step": 299540 + }, + { + "epoch": 2.6481196626531585, + "grad_norm": 8.433113098144531, + "learning_rate": 5.864672289114022e-06, + "loss": 0.4045, + "step": 299550 + }, + { + "epoch": 2.648208065913471, + "grad_norm": 4.110930442810059, + "learning_rate": 5.863198901442152e-06, + "loss": 0.5038, + "step": 299560 + }, + { + "epoch": 2.648296469173783, + "grad_norm": 6.022458553314209, + "learning_rate": 5.8617255137702814e-06, + "loss": 0.3739, + "step": 299570 + }, + { + "epoch": 2.6483848724340953, + "grad_norm": 14.867831230163574, + "learning_rate": 5.860252126098411e-06, + "loss": 0.5308, + "step": 299580 + }, + { + "epoch": 2.648473275694408, + "grad_norm": 6.608479976654053, + "learning_rate": 5.85877873842654e-06, + "loss": 0.3688, + "step": 299590 + }, + { + "epoch": 2.64856167895472, + "grad_norm": 2.9386496543884277, + "learning_rate": 5.85730535075467e-06, + "loss": 0.5071, + "step": 299600 + }, + { + "epoch": 2.648650082215032, + "grad_norm": 3.149509906768799, + "learning_rate": 5.855831963082799e-06, + "loss": 0.5451, + "step": 299610 + }, + { + "epoch": 2.648738485475344, + "grad_norm": 4.39550256729126, + "learning_rate": 5.854358575410928e-06, + "loss": 0.5347, + "step": 299620 + }, + { + "epoch": 2.6488268887356563, + "grad_norm": 19.8382511138916, + "learning_rate": 5.8528851877390575e-06, + "loss": 0.5089, + "step": 299630 + }, + { + "epoch": 2.648915291995969, + "grad_norm": 0.8128330111503601, + "learning_rate": 5.851411800067187e-06, + "loss": 0.5105, + "step": 299640 + }, + { + "epoch": 2.649003695256281, + "grad_norm": 5.530133247375488, + "learning_rate": 5.849938412395316e-06, + "loss": 0.535, + "step": 299650 + }, + { + "epoch": 2.6490920985165936, + "grad_norm": 3.8686304092407227, + "learning_rate": 5.848465024723446e-06, + "loss": 0.5472, + "step": 299660 + }, + { + "epoch": 2.6491805017769057, + "grad_norm": 2.1397953033447266, + "learning_rate": 5.846991637051575e-06, + "loss": 0.4242, + "step": 299670 + }, + { + "epoch": 2.649268905037218, + "grad_norm": 4.364826679229736, + "learning_rate": 5.845518249379704e-06, + "loss": 0.549, + "step": 299680 + }, + { + "epoch": 2.64935730829753, + "grad_norm": 2.3832647800445557, + "learning_rate": 5.8440448617078335e-06, + "loss": 0.5056, + "step": 299690 + }, + { + "epoch": 2.649445711557842, + "grad_norm": 2.3302247524261475, + "learning_rate": 5.842571474035963e-06, + "loss": 0.5494, + "step": 299700 + }, + { + "epoch": 2.6495341148181546, + "grad_norm": 4.154373645782471, + "learning_rate": 5.841098086364092e-06, + "loss": 0.6258, + "step": 299710 + }, + { + "epoch": 2.6496225180784667, + "grad_norm": 5.3590240478515625, + "learning_rate": 5.839624698692221e-06, + "loss": 0.4401, + "step": 299720 + }, + { + "epoch": 2.649710921338779, + "grad_norm": 2.2340009212493896, + "learning_rate": 5.83815131102035e-06, + "loss": 0.4106, + "step": 299730 + }, + { + "epoch": 2.6497993245990914, + "grad_norm": 5.404051780700684, + "learning_rate": 5.8366779233484795e-06, + "loss": 0.4981, + "step": 299740 + }, + { + "epoch": 2.6498877278594035, + "grad_norm": 19.81913185119629, + "learning_rate": 5.83520453567661e-06, + "loss": 0.5447, + "step": 299750 + }, + { + "epoch": 2.6499761311197156, + "grad_norm": 2.7846453189849854, + "learning_rate": 5.833731148004739e-06, + "loss": 0.5201, + "step": 299760 + }, + { + "epoch": 2.6500645343800278, + "grad_norm": 2.806823968887329, + "learning_rate": 5.832257760332868e-06, + "loss": 0.5472, + "step": 299770 + }, + { + "epoch": 2.6501529376403403, + "grad_norm": 0.4839473366737366, + "learning_rate": 5.830784372660997e-06, + "loss": 0.5756, + "step": 299780 + }, + { + "epoch": 2.6502413409006524, + "grad_norm": 1.425189733505249, + "learning_rate": 5.829310984989126e-06, + "loss": 0.5348, + "step": 299790 + }, + { + "epoch": 2.6503297441609646, + "grad_norm": 8.070107460021973, + "learning_rate": 5.827837597317256e-06, + "loss": 0.4514, + "step": 299800 + }, + { + "epoch": 2.650418147421277, + "grad_norm": 1.2442187070846558, + "learning_rate": 5.826364209645385e-06, + "loss": 0.4449, + "step": 299810 + }, + { + "epoch": 2.6505065506815892, + "grad_norm": 2.933011770248413, + "learning_rate": 5.824890821973515e-06, + "loss": 0.6237, + "step": 299820 + }, + { + "epoch": 2.6505949539419014, + "grad_norm": 1.857027292251587, + "learning_rate": 5.823417434301644e-06, + "loss": 0.5155, + "step": 299830 + }, + { + "epoch": 2.6506833572022135, + "grad_norm": 12.621766090393066, + "learning_rate": 5.821944046629773e-06, + "loss": 0.5361, + "step": 299840 + }, + { + "epoch": 2.6507717604625256, + "grad_norm": 3.977991819381714, + "learning_rate": 5.820470658957903e-06, + "loss": 0.4982, + "step": 299850 + }, + { + "epoch": 2.650860163722838, + "grad_norm": 3.1608829498291016, + "learning_rate": 5.8189972712860325e-06, + "loss": 0.471, + "step": 299860 + }, + { + "epoch": 2.6509485669831503, + "grad_norm": 3.116307020187378, + "learning_rate": 5.817523883614162e-06, + "loss": 0.5001, + "step": 299870 + }, + { + "epoch": 2.651036970243463, + "grad_norm": 4.319490432739258, + "learning_rate": 5.816050495942291e-06, + "loss": 0.4699, + "step": 299880 + }, + { + "epoch": 2.651125373503775, + "grad_norm": 14.125265121459961, + "learning_rate": 5.81457710827042e-06, + "loss": 0.509, + "step": 299890 + }, + { + "epoch": 2.651213776764087, + "grad_norm": 4.610196590423584, + "learning_rate": 5.813103720598549e-06, + "loss": 0.5164, + "step": 299900 + }, + { + "epoch": 2.651302180024399, + "grad_norm": 4.259591102600098, + "learning_rate": 5.8116303329266785e-06, + "loss": 0.5883, + "step": 299910 + }, + { + "epoch": 2.6513905832847113, + "grad_norm": 2.2315778732299805, + "learning_rate": 5.810156945254808e-06, + "loss": 0.5803, + "step": 299920 + }, + { + "epoch": 2.651478986545024, + "grad_norm": 1.113316535949707, + "learning_rate": 5.808683557582937e-06, + "loss": 0.54, + "step": 299930 + }, + { + "epoch": 2.651567389805336, + "grad_norm": 1.659622311592102, + "learning_rate": 5.807210169911067e-06, + "loss": 0.5955, + "step": 299940 + }, + { + "epoch": 2.651655793065648, + "grad_norm": 7.951066970825195, + "learning_rate": 5.805736782239196e-06, + "loss": 0.6165, + "step": 299950 + }, + { + "epoch": 2.6517441963259607, + "grad_norm": 15.481574058532715, + "learning_rate": 5.804263394567325e-06, + "loss": 0.5056, + "step": 299960 + }, + { + "epoch": 2.651832599586273, + "grad_norm": 4.089448928833008, + "learning_rate": 5.8027900068954545e-06, + "loss": 0.4495, + "step": 299970 + }, + { + "epoch": 2.651921002846585, + "grad_norm": 3.9420595169067383, + "learning_rate": 5.801316619223584e-06, + "loss": 0.4088, + "step": 299980 + }, + { + "epoch": 2.652009406106897, + "grad_norm": 6.601164817810059, + "learning_rate": 5.799843231551713e-06, + "loss": 0.5424, + "step": 299990 + }, + { + "epoch": 2.6520978093672096, + "grad_norm": 2.6445364952087402, + "learning_rate": 5.798369843879842e-06, + "loss": 0.5192, + "step": 300000 + }, + { + "epoch": 2.6521862126275217, + "grad_norm": 2.27213978767395, + "learning_rate": 5.796896456207971e-06, + "loss": 0.6865, + "step": 300010 + }, + { + "epoch": 2.652274615887834, + "grad_norm": 7.261443614959717, + "learning_rate": 5.7954230685361006e-06, + "loss": 0.6031, + "step": 300020 + }, + { + "epoch": 2.6523630191481464, + "grad_norm": 3.0194954872131348, + "learning_rate": 5.793949680864231e-06, + "loss": 0.577, + "step": 300030 + }, + { + "epoch": 2.6524514224084585, + "grad_norm": 6.733114719390869, + "learning_rate": 5.79247629319236e-06, + "loss": 0.508, + "step": 300040 + }, + { + "epoch": 2.6525398256687707, + "grad_norm": 13.974281311035156, + "learning_rate": 5.791002905520489e-06, + "loss": 0.5472, + "step": 300050 + }, + { + "epoch": 2.652628228929083, + "grad_norm": 3.835413932800293, + "learning_rate": 5.789529517848619e-06, + "loss": 0.5654, + "step": 300060 + }, + { + "epoch": 2.652716632189395, + "grad_norm": 6.005604267120361, + "learning_rate": 5.788056130176748e-06, + "loss": 0.408, + "step": 300070 + }, + { + "epoch": 2.6528050354497075, + "grad_norm": 3.0700583457946777, + "learning_rate": 5.7865827425048775e-06, + "loss": 0.524, + "step": 300080 + }, + { + "epoch": 2.6528934387100196, + "grad_norm": 0.752582311630249, + "learning_rate": 5.785109354833007e-06, + "loss": 0.5264, + "step": 300090 + }, + { + "epoch": 2.6529818419703317, + "grad_norm": 1.2316105365753174, + "learning_rate": 5.783635967161136e-06, + "loss": 0.5403, + "step": 300100 + }, + { + "epoch": 2.6530702452306443, + "grad_norm": 15.251638412475586, + "learning_rate": 5.782162579489265e-06, + "loss": 0.4685, + "step": 300110 + }, + { + "epoch": 2.6531586484909564, + "grad_norm": 4.075425148010254, + "learning_rate": 5.780689191817394e-06, + "loss": 0.5099, + "step": 300120 + }, + { + "epoch": 2.6532470517512685, + "grad_norm": 2.235830068588257, + "learning_rate": 5.779215804145524e-06, + "loss": 0.5116, + "step": 300130 + }, + { + "epoch": 2.6533354550115806, + "grad_norm": 3.590787172317505, + "learning_rate": 5.7777424164736535e-06, + "loss": 0.6546, + "step": 300140 + }, + { + "epoch": 2.653423858271893, + "grad_norm": 3.9922258853912354, + "learning_rate": 5.776269028801783e-06, + "loss": 0.5579, + "step": 300150 + }, + { + "epoch": 2.6535122615322053, + "grad_norm": 13.267293930053711, + "learning_rate": 5.774795641129912e-06, + "loss": 0.5806, + "step": 300160 + }, + { + "epoch": 2.6536006647925174, + "grad_norm": 3.2832469940185547, + "learning_rate": 5.773322253458041e-06, + "loss": 0.5084, + "step": 300170 + }, + { + "epoch": 2.65368906805283, + "grad_norm": 9.728435516357422, + "learning_rate": 5.77184886578617e-06, + "loss": 0.5018, + "step": 300180 + }, + { + "epoch": 2.653777471313142, + "grad_norm": 1.3365392684936523, + "learning_rate": 5.7703754781142995e-06, + "loss": 0.5514, + "step": 300190 + }, + { + "epoch": 2.6538658745734542, + "grad_norm": 4.373548984527588, + "learning_rate": 5.768902090442429e-06, + "loss": 0.5287, + "step": 300200 + }, + { + "epoch": 2.6539542778337664, + "grad_norm": 2.74937105178833, + "learning_rate": 5.767428702770558e-06, + "loss": 0.4976, + "step": 300210 + }, + { + "epoch": 2.6540426810940785, + "grad_norm": 2.6327850818634033, + "learning_rate": 5.765955315098688e-06, + "loss": 0.4847, + "step": 300220 + }, + { + "epoch": 2.654131084354391, + "grad_norm": 11.242212295532227, + "learning_rate": 5.764481927426817e-06, + "loss": 0.522, + "step": 300230 + }, + { + "epoch": 2.654219487614703, + "grad_norm": 0.8164722323417664, + "learning_rate": 5.763008539754946e-06, + "loss": 0.4864, + "step": 300240 + }, + { + "epoch": 2.6543078908750157, + "grad_norm": 3.2890796661376953, + "learning_rate": 5.7615351520830756e-06, + "loss": 0.5712, + "step": 300250 + }, + { + "epoch": 2.654396294135328, + "grad_norm": 3.0327653884887695, + "learning_rate": 5.760061764411205e-06, + "loss": 0.5256, + "step": 300260 + }, + { + "epoch": 2.65448469739564, + "grad_norm": 1.4070243835449219, + "learning_rate": 5.758588376739334e-06, + "loss": 0.4277, + "step": 300270 + }, + { + "epoch": 2.654573100655952, + "grad_norm": 1.8568750619888306, + "learning_rate": 5.757114989067464e-06, + "loss": 0.4465, + "step": 300280 + }, + { + "epoch": 2.654661503916264, + "grad_norm": 1.6444200277328491, + "learning_rate": 5.755641601395593e-06, + "loss": 0.6054, + "step": 300290 + }, + { + "epoch": 2.6547499071765768, + "grad_norm": 2.996772527694702, + "learning_rate": 5.754168213723722e-06, + "loss": 0.5288, + "step": 300300 + }, + { + "epoch": 2.654838310436889, + "grad_norm": 6.399709224700928, + "learning_rate": 5.7526948260518525e-06, + "loss": 0.5264, + "step": 300310 + }, + { + "epoch": 2.654926713697201, + "grad_norm": 1.8706581592559814, + "learning_rate": 5.751221438379982e-06, + "loss": 0.4974, + "step": 300320 + }, + { + "epoch": 2.6550151169575136, + "grad_norm": 2.782745122909546, + "learning_rate": 5.749748050708111e-06, + "loss": 0.5865, + "step": 300330 + }, + { + "epoch": 2.6551035202178257, + "grad_norm": 10.717726707458496, + "learning_rate": 5.74827466303624e-06, + "loss": 0.5301, + "step": 300340 + }, + { + "epoch": 2.655191923478138, + "grad_norm": 4.803606033325195, + "learning_rate": 5.746801275364369e-06, + "loss": 0.5111, + "step": 300350 + }, + { + "epoch": 2.65528032673845, + "grad_norm": 8.158602714538574, + "learning_rate": 5.7453278876924985e-06, + "loss": 0.5378, + "step": 300360 + }, + { + "epoch": 2.6553687299987625, + "grad_norm": 5.614989757537842, + "learning_rate": 5.743854500020628e-06, + "loss": 0.6271, + "step": 300370 + }, + { + "epoch": 2.6554571332590746, + "grad_norm": 11.185340881347656, + "learning_rate": 5.742381112348757e-06, + "loss": 0.4735, + "step": 300380 + }, + { + "epoch": 2.6555455365193867, + "grad_norm": 8.443015098571777, + "learning_rate": 5.740907724676886e-06, + "loss": 0.4542, + "step": 300390 + }, + { + "epoch": 2.6556339397796993, + "grad_norm": 1.0916564464569092, + "learning_rate": 5.739434337005015e-06, + "loss": 0.5715, + "step": 300400 + }, + { + "epoch": 2.6557223430400114, + "grad_norm": 6.001841068267822, + "learning_rate": 5.737960949333145e-06, + "loss": 0.4279, + "step": 300410 + }, + { + "epoch": 2.6558107463003235, + "grad_norm": 5.107616424560547, + "learning_rate": 5.7364875616612745e-06, + "loss": 0.4565, + "step": 300420 + }, + { + "epoch": 2.6558991495606357, + "grad_norm": 2.251768112182617, + "learning_rate": 5.735014173989404e-06, + "loss": 0.4966, + "step": 300430 + }, + { + "epoch": 2.655987552820948, + "grad_norm": 6.247438430786133, + "learning_rate": 5.733540786317533e-06, + "loss": 0.5541, + "step": 300440 + }, + { + "epoch": 2.6560759560812603, + "grad_norm": 2.577139139175415, + "learning_rate": 5.732067398645662e-06, + "loss": 0.5205, + "step": 300450 + }, + { + "epoch": 2.6561643593415725, + "grad_norm": 3.4754040241241455, + "learning_rate": 5.730594010973791e-06, + "loss": 0.5372, + "step": 300460 + }, + { + "epoch": 2.656252762601885, + "grad_norm": 6.42793607711792, + "learning_rate": 5.7291206233019205e-06, + "loss": 0.5804, + "step": 300470 + }, + { + "epoch": 2.656341165862197, + "grad_norm": 9.925573348999023, + "learning_rate": 5.72764723563005e-06, + "loss": 0.5005, + "step": 300480 + }, + { + "epoch": 2.6564295691225093, + "grad_norm": 3.759094715118408, + "learning_rate": 5.726173847958179e-06, + "loss": 0.6291, + "step": 300490 + }, + { + "epoch": 2.6565179723828214, + "grad_norm": 2.2888121604919434, + "learning_rate": 5.724700460286309e-06, + "loss": 0.4572, + "step": 300500 + }, + { + "epoch": 2.6566063756431335, + "grad_norm": 2.2736122608184814, + "learning_rate": 5.723227072614438e-06, + "loss": 0.4868, + "step": 300510 + }, + { + "epoch": 2.656694778903446, + "grad_norm": 2.2523977756500244, + "learning_rate": 5.721753684942567e-06, + "loss": 0.5437, + "step": 300520 + }, + { + "epoch": 2.656783182163758, + "grad_norm": 1.0190181732177734, + "learning_rate": 5.7202802972706974e-06, + "loss": 0.5656, + "step": 300530 + }, + { + "epoch": 2.6568715854240703, + "grad_norm": 2.9609930515289307, + "learning_rate": 5.718806909598827e-06, + "loss": 0.5471, + "step": 300540 + }, + { + "epoch": 2.656959988684383, + "grad_norm": 2.9780542850494385, + "learning_rate": 5.717333521926956e-06, + "loss": 0.5797, + "step": 300550 + }, + { + "epoch": 2.657048391944695, + "grad_norm": 2.1229498386383057, + "learning_rate": 5.715860134255085e-06, + "loss": 0.4667, + "step": 300560 + }, + { + "epoch": 2.657136795205007, + "grad_norm": 3.5915369987487793, + "learning_rate": 5.714386746583214e-06, + "loss": 0.4764, + "step": 300570 + }, + { + "epoch": 2.6572251984653192, + "grad_norm": 6.785897254943848, + "learning_rate": 5.7129133589113434e-06, + "loss": 0.4274, + "step": 300580 + }, + { + "epoch": 2.657313601725632, + "grad_norm": 4.94478702545166, + "learning_rate": 5.7114399712394735e-06, + "loss": 0.4312, + "step": 300590 + }, + { + "epoch": 2.657402004985944, + "grad_norm": 10.942651748657227, + "learning_rate": 5.709966583567603e-06, + "loss": 0.5689, + "step": 300600 + }, + { + "epoch": 2.657490408246256, + "grad_norm": 1.731431007385254, + "learning_rate": 5.708493195895732e-06, + "loss": 0.4297, + "step": 300610 + }, + { + "epoch": 2.6575788115065686, + "grad_norm": 1.2004914283752441, + "learning_rate": 5.707019808223861e-06, + "loss": 0.5003, + "step": 300620 + }, + { + "epoch": 2.6576672147668807, + "grad_norm": 37.25926208496094, + "learning_rate": 5.70554642055199e-06, + "loss": 0.5509, + "step": 300630 + }, + { + "epoch": 2.657755618027193, + "grad_norm": 1.7246923446655273, + "learning_rate": 5.7040730328801195e-06, + "loss": 0.5864, + "step": 300640 + }, + { + "epoch": 2.657844021287505, + "grad_norm": 1.5300794839859009, + "learning_rate": 5.702599645208249e-06, + "loss": 0.565, + "step": 300650 + }, + { + "epoch": 2.657932424547817, + "grad_norm": 13.067383766174316, + "learning_rate": 5.701126257536378e-06, + "loss": 0.6117, + "step": 300660 + }, + { + "epoch": 2.6580208278081296, + "grad_norm": 5.0918803215026855, + "learning_rate": 5.699652869864507e-06, + "loss": 0.5292, + "step": 300670 + }, + { + "epoch": 2.6581092310684418, + "grad_norm": 3.541083335876465, + "learning_rate": 5.698179482192636e-06, + "loss": 0.6058, + "step": 300680 + }, + { + "epoch": 2.658197634328754, + "grad_norm": 1.7961088418960571, + "learning_rate": 5.696706094520766e-06, + "loss": 0.4658, + "step": 300690 + }, + { + "epoch": 2.6582860375890665, + "grad_norm": 1.1669496297836304, + "learning_rate": 5.6952327068488955e-06, + "loss": 0.5003, + "step": 300700 + }, + { + "epoch": 2.6583744408493786, + "grad_norm": 2.243532419204712, + "learning_rate": 5.693759319177025e-06, + "loss": 0.4756, + "step": 300710 + }, + { + "epoch": 2.6584628441096907, + "grad_norm": 5.7882280349731445, + "learning_rate": 5.692285931505154e-06, + "loss": 0.5027, + "step": 300720 + }, + { + "epoch": 2.658551247370003, + "grad_norm": 2.8899123668670654, + "learning_rate": 5.690812543833283e-06, + "loss": 0.4425, + "step": 300730 + }, + { + "epoch": 2.6586396506303154, + "grad_norm": 5.9657721519470215, + "learning_rate": 5.689339156161412e-06, + "loss": 0.5696, + "step": 300740 + }, + { + "epoch": 2.6587280538906275, + "grad_norm": 11.625365257263184, + "learning_rate": 5.687865768489542e-06, + "loss": 0.5972, + "step": 300750 + }, + { + "epoch": 2.6588164571509396, + "grad_norm": 1.6489592790603638, + "learning_rate": 5.686392380817672e-06, + "loss": 0.4773, + "step": 300760 + }, + { + "epoch": 2.658904860411252, + "grad_norm": 2.2007253170013428, + "learning_rate": 5.684918993145801e-06, + "loss": 0.5605, + "step": 300770 + }, + { + "epoch": 2.6589932636715643, + "grad_norm": 7.205025672912598, + "learning_rate": 5.683445605473931e-06, + "loss": 0.6166, + "step": 300780 + }, + { + "epoch": 2.6590816669318764, + "grad_norm": 5.11989688873291, + "learning_rate": 5.68197221780206e-06, + "loss": 0.4813, + "step": 300790 + }, + { + "epoch": 2.6591700701921885, + "grad_norm": 1.0751843452453613, + "learning_rate": 5.680498830130189e-06, + "loss": 0.5477, + "step": 300800 + }, + { + "epoch": 2.6592584734525007, + "grad_norm": 3.2121779918670654, + "learning_rate": 5.6790254424583184e-06, + "loss": 0.4948, + "step": 300810 + }, + { + "epoch": 2.6593468767128132, + "grad_norm": 2.543372631072998, + "learning_rate": 5.677552054786448e-06, + "loss": 0.5711, + "step": 300820 + }, + { + "epoch": 2.6594352799731253, + "grad_norm": 5.342015266418457, + "learning_rate": 5.676078667114577e-06, + "loss": 0.4881, + "step": 300830 + }, + { + "epoch": 2.659523683233438, + "grad_norm": 2.0400288105010986, + "learning_rate": 5.674605279442706e-06, + "loss": 0.5297, + "step": 300840 + }, + { + "epoch": 2.65961208649375, + "grad_norm": 1.5678961277008057, + "learning_rate": 5.673131891770835e-06, + "loss": 0.5145, + "step": 300850 + }, + { + "epoch": 2.659700489754062, + "grad_norm": 2.512176275253296, + "learning_rate": 5.6716585040989644e-06, + "loss": 0.5659, + "step": 300860 + }, + { + "epoch": 2.6597888930143743, + "grad_norm": 6.987335681915283, + "learning_rate": 5.6701851164270945e-06, + "loss": 0.596, + "step": 300870 + }, + { + "epoch": 2.6598772962746864, + "grad_norm": 7.671796798706055, + "learning_rate": 5.668711728755224e-06, + "loss": 0.4995, + "step": 300880 + }, + { + "epoch": 2.659965699534999, + "grad_norm": 4.143398761749268, + "learning_rate": 5.667238341083353e-06, + "loss": 0.5524, + "step": 300890 + }, + { + "epoch": 2.660054102795311, + "grad_norm": 10.083620071411133, + "learning_rate": 5.665764953411482e-06, + "loss": 0.4111, + "step": 300900 + }, + { + "epoch": 2.660142506055623, + "grad_norm": 5.977963447570801, + "learning_rate": 5.664291565739611e-06, + "loss": 0.5593, + "step": 300910 + }, + { + "epoch": 2.6602309093159358, + "grad_norm": 2.3573315143585205, + "learning_rate": 5.6628181780677405e-06, + "loss": 0.5361, + "step": 300920 + }, + { + "epoch": 2.660319312576248, + "grad_norm": 2.668576717376709, + "learning_rate": 5.66134479039587e-06, + "loss": 0.6138, + "step": 300930 + }, + { + "epoch": 2.66040771583656, + "grad_norm": 1.605341911315918, + "learning_rate": 5.659871402723999e-06, + "loss": 0.4521, + "step": 300940 + }, + { + "epoch": 2.660496119096872, + "grad_norm": 4.285220146179199, + "learning_rate": 5.658398015052128e-06, + "loss": 0.5857, + "step": 300950 + }, + { + "epoch": 2.6605845223571847, + "grad_norm": 5.173956871032715, + "learning_rate": 5.656924627380257e-06, + "loss": 0.4852, + "step": 300960 + }, + { + "epoch": 2.660672925617497, + "grad_norm": 1.4389410018920898, + "learning_rate": 5.655451239708387e-06, + "loss": 0.4773, + "step": 300970 + }, + { + "epoch": 2.660761328877809, + "grad_norm": 4.682857990264893, + "learning_rate": 5.6539778520365166e-06, + "loss": 0.5507, + "step": 300980 + }, + { + "epoch": 2.6608497321381215, + "grad_norm": 5.534067630767822, + "learning_rate": 5.652504464364646e-06, + "loss": 0.3882, + "step": 300990 + }, + { + "epoch": 2.6609381353984336, + "grad_norm": 3.249068021774292, + "learning_rate": 5.651031076692776e-06, + "loss": 0.4982, + "step": 301000 + }, + { + "epoch": 2.6610265386587457, + "grad_norm": 4.706718921661377, + "learning_rate": 5.649557689020905e-06, + "loss": 0.677, + "step": 301010 + }, + { + "epoch": 2.661114941919058, + "grad_norm": 3.097825527191162, + "learning_rate": 5.648084301349034e-06, + "loss": 0.6008, + "step": 301020 + }, + { + "epoch": 2.66120334517937, + "grad_norm": 1.7805726528167725, + "learning_rate": 5.646610913677163e-06, + "loss": 0.379, + "step": 301030 + }, + { + "epoch": 2.6612917484396825, + "grad_norm": 11.988027572631836, + "learning_rate": 5.645137526005293e-06, + "loss": 0.5643, + "step": 301040 + }, + { + "epoch": 2.6613801516999946, + "grad_norm": 13.552850723266602, + "learning_rate": 5.643664138333422e-06, + "loss": 0.5785, + "step": 301050 + }, + { + "epoch": 2.661468554960307, + "grad_norm": 1.9119873046875, + "learning_rate": 5.642190750661552e-06, + "loss": 0.4756, + "step": 301060 + }, + { + "epoch": 2.6615569582206193, + "grad_norm": 3.5115344524383545, + "learning_rate": 5.640717362989681e-06, + "loss": 0.572, + "step": 301070 + }, + { + "epoch": 2.6616453614809314, + "grad_norm": 3.2847979068756104, + "learning_rate": 5.63924397531781e-06, + "loss": 0.4697, + "step": 301080 + }, + { + "epoch": 2.6617337647412436, + "grad_norm": 3.409365653991699, + "learning_rate": 5.6377705876459395e-06, + "loss": 0.5859, + "step": 301090 + }, + { + "epoch": 2.6618221680015557, + "grad_norm": 6.033799171447754, + "learning_rate": 5.636297199974069e-06, + "loss": 0.6115, + "step": 301100 + }, + { + "epoch": 2.6619105712618683, + "grad_norm": 9.483908653259277, + "learning_rate": 5.634823812302198e-06, + "loss": 0.5657, + "step": 301110 + }, + { + "epoch": 2.6619989745221804, + "grad_norm": 4.218178749084473, + "learning_rate": 5.633350424630327e-06, + "loss": 0.5072, + "step": 301120 + }, + { + "epoch": 2.6620873777824925, + "grad_norm": 1.0219837427139282, + "learning_rate": 5.631877036958456e-06, + "loss": 0.5073, + "step": 301130 + }, + { + "epoch": 2.662175781042805, + "grad_norm": 4.797374725341797, + "learning_rate": 5.6304036492865855e-06, + "loss": 0.4828, + "step": 301140 + }, + { + "epoch": 2.662264184303117, + "grad_norm": 2.7135396003723145, + "learning_rate": 5.6289302616147155e-06, + "loss": 0.5397, + "step": 301150 + }, + { + "epoch": 2.6623525875634293, + "grad_norm": 13.253921508789062, + "learning_rate": 5.627456873942845e-06, + "loss": 0.6397, + "step": 301160 + }, + { + "epoch": 2.6624409908237414, + "grad_norm": 2.1119282245635986, + "learning_rate": 5.625983486270974e-06, + "loss": 0.5104, + "step": 301170 + }, + { + "epoch": 2.662529394084054, + "grad_norm": 2.9189627170562744, + "learning_rate": 5.624510098599103e-06, + "loss": 0.6125, + "step": 301180 + }, + { + "epoch": 2.662617797344366, + "grad_norm": 9.283437728881836, + "learning_rate": 5.623036710927232e-06, + "loss": 0.4654, + "step": 301190 + }, + { + "epoch": 2.662706200604678, + "grad_norm": 5.333782196044922, + "learning_rate": 5.6215633232553615e-06, + "loss": 0.6332, + "step": 301200 + }, + { + "epoch": 2.662794603864991, + "grad_norm": 1.74278724193573, + "learning_rate": 5.6200899355834916e-06, + "loss": 0.4755, + "step": 301210 + }, + { + "epoch": 2.662883007125303, + "grad_norm": 2.923628568649292, + "learning_rate": 5.618616547911621e-06, + "loss": 0.5886, + "step": 301220 + }, + { + "epoch": 2.662971410385615, + "grad_norm": 3.357079029083252, + "learning_rate": 5.61714316023975e-06, + "loss": 0.4391, + "step": 301230 + }, + { + "epoch": 2.663059813645927, + "grad_norm": 7.992538928985596, + "learning_rate": 5.615669772567879e-06, + "loss": 0.553, + "step": 301240 + }, + { + "epoch": 2.6631482169062393, + "grad_norm": 3.8884100914001465, + "learning_rate": 5.614196384896009e-06, + "loss": 0.5282, + "step": 301250 + }, + { + "epoch": 2.663236620166552, + "grad_norm": 2.2267849445343018, + "learning_rate": 5.612722997224138e-06, + "loss": 0.4591, + "step": 301260 + }, + { + "epoch": 2.663325023426864, + "grad_norm": 6.391140937805176, + "learning_rate": 5.611249609552268e-06, + "loss": 0.4822, + "step": 301270 + }, + { + "epoch": 2.663413426687176, + "grad_norm": 2.1567375659942627, + "learning_rate": 5.609776221880397e-06, + "loss": 0.4687, + "step": 301280 + }, + { + "epoch": 2.6635018299474886, + "grad_norm": 4.799533367156982, + "learning_rate": 5.608302834208526e-06, + "loss": 0.4859, + "step": 301290 + }, + { + "epoch": 2.6635902332078008, + "grad_norm": 16.771812438964844, + "learning_rate": 5.606829446536655e-06, + "loss": 0.5654, + "step": 301300 + }, + { + "epoch": 2.663678636468113, + "grad_norm": 6.400337219238281, + "learning_rate": 5.6053560588647844e-06, + "loss": 0.5399, + "step": 301310 + }, + { + "epoch": 2.663767039728425, + "grad_norm": 1.5505577325820923, + "learning_rate": 5.603882671192914e-06, + "loss": 0.4835, + "step": 301320 + }, + { + "epoch": 2.6638554429887376, + "grad_norm": 5.508503437042236, + "learning_rate": 5.602409283521043e-06, + "loss": 0.5816, + "step": 301330 + }, + { + "epoch": 2.6639438462490497, + "grad_norm": 0.7409241199493408, + "learning_rate": 5.600935895849173e-06, + "loss": 0.4607, + "step": 301340 + }, + { + "epoch": 2.664032249509362, + "grad_norm": 1.4656535387039185, + "learning_rate": 5.599462508177302e-06, + "loss": 0.584, + "step": 301350 + }, + { + "epoch": 2.6641206527696744, + "grad_norm": 7.501648426055908, + "learning_rate": 5.597989120505431e-06, + "loss": 0.3906, + "step": 301360 + }, + { + "epoch": 2.6642090560299865, + "grad_norm": 1.3327298164367676, + "learning_rate": 5.5965157328335605e-06, + "loss": 0.4429, + "step": 301370 + }, + { + "epoch": 2.6642974592902986, + "grad_norm": 1.8524249792099, + "learning_rate": 5.59504234516169e-06, + "loss": 0.5443, + "step": 301380 + }, + { + "epoch": 2.6643858625506107, + "grad_norm": 3.1454224586486816, + "learning_rate": 5.593568957489819e-06, + "loss": 0.5647, + "step": 301390 + }, + { + "epoch": 2.664474265810923, + "grad_norm": 6.328191757202148, + "learning_rate": 5.592095569817948e-06, + "loss": 0.5615, + "step": 301400 + }, + { + "epoch": 2.6645626690712354, + "grad_norm": 6.902084827423096, + "learning_rate": 5.590622182146077e-06, + "loss": 0.3227, + "step": 301410 + }, + { + "epoch": 2.6646510723315475, + "grad_norm": 5.0680341720581055, + "learning_rate": 5.5891487944742065e-06, + "loss": 0.5623, + "step": 301420 + }, + { + "epoch": 2.66473947559186, + "grad_norm": 4.588562488555908, + "learning_rate": 5.5876754068023365e-06, + "loss": 0.5584, + "step": 301430 + }, + { + "epoch": 2.664827878852172, + "grad_norm": 4.472071170806885, + "learning_rate": 5.586202019130466e-06, + "loss": 0.528, + "step": 301440 + }, + { + "epoch": 2.6649162821124843, + "grad_norm": 4.120019912719727, + "learning_rate": 5.584728631458595e-06, + "loss": 0.515, + "step": 301450 + }, + { + "epoch": 2.6650046853727964, + "grad_norm": 2.610083818435669, + "learning_rate": 5.583255243786725e-06, + "loss": 0.5329, + "step": 301460 + }, + { + "epoch": 2.6650930886331086, + "grad_norm": 6.903878211975098, + "learning_rate": 5.581781856114854e-06, + "loss": 0.4676, + "step": 301470 + }, + { + "epoch": 2.665181491893421, + "grad_norm": 2.3650033473968506, + "learning_rate": 5.580308468442983e-06, + "loss": 0.4556, + "step": 301480 + }, + { + "epoch": 2.6652698951537332, + "grad_norm": 3.4545586109161377, + "learning_rate": 5.578835080771113e-06, + "loss": 0.5035, + "step": 301490 + }, + { + "epoch": 2.6653582984140454, + "grad_norm": 3.1813039779663086, + "learning_rate": 5.577361693099242e-06, + "loss": 0.4643, + "step": 301500 + }, + { + "epoch": 2.665446701674358, + "grad_norm": 2.6331918239593506, + "learning_rate": 5.575888305427371e-06, + "loss": 0.6299, + "step": 301510 + }, + { + "epoch": 2.66553510493467, + "grad_norm": 2.7613770961761475, + "learning_rate": 5.5744149177555e-06, + "loss": 0.522, + "step": 301520 + }, + { + "epoch": 2.665623508194982, + "grad_norm": 8.8428316116333, + "learning_rate": 5.57294153008363e-06, + "loss": 0.5302, + "step": 301530 + }, + { + "epoch": 2.6657119114552943, + "grad_norm": 2.1212480068206787, + "learning_rate": 5.5714681424117594e-06, + "loss": 0.544, + "step": 301540 + }, + { + "epoch": 2.665800314715607, + "grad_norm": 4.884010314941406, + "learning_rate": 5.569994754739889e-06, + "loss": 0.5216, + "step": 301550 + }, + { + "epoch": 2.665888717975919, + "grad_norm": 2.911592483520508, + "learning_rate": 5.568521367068018e-06, + "loss": 0.4081, + "step": 301560 + }, + { + "epoch": 2.665977121236231, + "grad_norm": 0.9001580476760864, + "learning_rate": 5.567047979396147e-06, + "loss": 0.5598, + "step": 301570 + }, + { + "epoch": 2.6660655244965437, + "grad_norm": 0.6461151242256165, + "learning_rate": 5.565574591724276e-06, + "loss": 0.5145, + "step": 301580 + }, + { + "epoch": 2.666153927756856, + "grad_norm": 1.9078935384750366, + "learning_rate": 5.5641012040524054e-06, + "loss": 0.4894, + "step": 301590 + }, + { + "epoch": 2.666242331017168, + "grad_norm": 2.8448331356048584, + "learning_rate": 5.562627816380535e-06, + "loss": 0.5352, + "step": 301600 + }, + { + "epoch": 2.66633073427748, + "grad_norm": 4.8345184326171875, + "learning_rate": 5.561154428708664e-06, + "loss": 0.5653, + "step": 301610 + }, + { + "epoch": 2.666419137537792, + "grad_norm": 3.7088980674743652, + "learning_rate": 5.559681041036794e-06, + "loss": 0.5975, + "step": 301620 + }, + { + "epoch": 2.6665075407981047, + "grad_norm": 3.10241436958313, + "learning_rate": 5.558207653364923e-06, + "loss": 0.5893, + "step": 301630 + }, + { + "epoch": 2.666595944058417, + "grad_norm": 3.9488871097564697, + "learning_rate": 5.556734265693052e-06, + "loss": 0.5433, + "step": 301640 + }, + { + "epoch": 2.6666843473187294, + "grad_norm": 2.434945821762085, + "learning_rate": 5.5552608780211815e-06, + "loss": 0.4685, + "step": 301650 + }, + { + "epoch": 2.6667727505790415, + "grad_norm": 2.459001302719116, + "learning_rate": 5.553787490349311e-06, + "loss": 0.5504, + "step": 301660 + }, + { + "epoch": 2.6668611538393536, + "grad_norm": 3.773650884628296, + "learning_rate": 5.55231410267744e-06, + "loss": 0.4964, + "step": 301670 + }, + { + "epoch": 2.6669495570996657, + "grad_norm": 11.671462059020996, + "learning_rate": 5.55084071500557e-06, + "loss": 0.5329, + "step": 301680 + }, + { + "epoch": 2.667037960359978, + "grad_norm": 5.7890706062316895, + "learning_rate": 5.549367327333699e-06, + "loss": 0.726, + "step": 301690 + }, + { + "epoch": 2.6671263636202904, + "grad_norm": 2.0478944778442383, + "learning_rate": 5.547893939661828e-06, + "loss": 0.4547, + "step": 301700 + }, + { + "epoch": 2.6672147668806026, + "grad_norm": 1.684216856956482, + "learning_rate": 5.546420551989958e-06, + "loss": 0.4901, + "step": 301710 + }, + { + "epoch": 2.6673031701409147, + "grad_norm": 3.330371618270874, + "learning_rate": 5.544947164318088e-06, + "loss": 0.5683, + "step": 301720 + }, + { + "epoch": 2.6673915734012272, + "grad_norm": 12.519342422485352, + "learning_rate": 5.543473776646217e-06, + "loss": 0.5749, + "step": 301730 + }, + { + "epoch": 2.6674799766615394, + "grad_norm": 3.5055530071258545, + "learning_rate": 5.542000388974346e-06, + "loss": 0.4858, + "step": 301740 + }, + { + "epoch": 2.6675683799218515, + "grad_norm": 2.8673346042633057, + "learning_rate": 5.540527001302475e-06, + "loss": 0.5102, + "step": 301750 + }, + { + "epoch": 2.6676567831821636, + "grad_norm": 3.654740810394287, + "learning_rate": 5.539053613630604e-06, + "loss": 0.3957, + "step": 301760 + }, + { + "epoch": 2.667745186442476, + "grad_norm": 7.914402484893799, + "learning_rate": 5.537580225958734e-06, + "loss": 0.6026, + "step": 301770 + }, + { + "epoch": 2.6678335897027883, + "grad_norm": 2.577216148376465, + "learning_rate": 5.536106838286863e-06, + "loss": 0.4497, + "step": 301780 + }, + { + "epoch": 2.6679219929631004, + "grad_norm": 2.546255588531494, + "learning_rate": 5.534633450614992e-06, + "loss": 0.5182, + "step": 301790 + }, + { + "epoch": 2.668010396223413, + "grad_norm": 3.1177425384521484, + "learning_rate": 5.533160062943121e-06, + "loss": 0.4173, + "step": 301800 + }, + { + "epoch": 2.668098799483725, + "grad_norm": 2.4915287494659424, + "learning_rate": 5.531686675271251e-06, + "loss": 0.6075, + "step": 301810 + }, + { + "epoch": 2.668187202744037, + "grad_norm": 1.7169303894042969, + "learning_rate": 5.5302132875993805e-06, + "loss": 0.4887, + "step": 301820 + }, + { + "epoch": 2.6682756060043493, + "grad_norm": 2.875230550765991, + "learning_rate": 5.52873989992751e-06, + "loss": 0.5397, + "step": 301830 + }, + { + "epoch": 2.6683640092646614, + "grad_norm": 0.7961872220039368, + "learning_rate": 5.527266512255639e-06, + "loss": 0.5093, + "step": 301840 + }, + { + "epoch": 2.668452412524974, + "grad_norm": 1.0021085739135742, + "learning_rate": 5.525793124583768e-06, + "loss": 0.481, + "step": 301850 + }, + { + "epoch": 2.668540815785286, + "grad_norm": 4.522593021392822, + "learning_rate": 5.524319736911897e-06, + "loss": 0.532, + "step": 301860 + }, + { + "epoch": 2.6686292190455982, + "grad_norm": 4.6358232498168945, + "learning_rate": 5.5228463492400265e-06, + "loss": 0.5176, + "step": 301870 + }, + { + "epoch": 2.668717622305911, + "grad_norm": 5.157197952270508, + "learning_rate": 5.521372961568156e-06, + "loss": 0.6845, + "step": 301880 + }, + { + "epoch": 2.668806025566223, + "grad_norm": 6.0866804122924805, + "learning_rate": 5.519899573896285e-06, + "loss": 0.5352, + "step": 301890 + }, + { + "epoch": 2.668894428826535, + "grad_norm": 4.129942417144775, + "learning_rate": 5.518426186224415e-06, + "loss": 0.521, + "step": 301900 + }, + { + "epoch": 2.668982832086847, + "grad_norm": 6.449389457702637, + "learning_rate": 5.516952798552544e-06, + "loss": 0.5793, + "step": 301910 + }, + { + "epoch": 2.6690712353471597, + "grad_norm": 1.9693869352340698, + "learning_rate": 5.515479410880673e-06, + "loss": 0.505, + "step": 301920 + }, + { + "epoch": 2.669159638607472, + "grad_norm": 11.20102596282959, + "learning_rate": 5.514006023208803e-06, + "loss": 0.5327, + "step": 301930 + }, + { + "epoch": 2.669248041867784, + "grad_norm": 1.9359103441238403, + "learning_rate": 5.5125326355369326e-06, + "loss": 0.4857, + "step": 301940 + }, + { + "epoch": 2.6693364451280965, + "grad_norm": 5.270946502685547, + "learning_rate": 5.511059247865062e-06, + "loss": 0.4831, + "step": 301950 + }, + { + "epoch": 2.6694248483884087, + "grad_norm": 3.9061062335968018, + "learning_rate": 5.509585860193191e-06, + "loss": 0.4933, + "step": 301960 + }, + { + "epoch": 2.6695132516487208, + "grad_norm": 4.486837863922119, + "learning_rate": 5.50811247252132e-06, + "loss": 0.5953, + "step": 301970 + }, + { + "epoch": 2.669601654909033, + "grad_norm": 2.5730206966400146, + "learning_rate": 5.506639084849449e-06, + "loss": 0.4676, + "step": 301980 + }, + { + "epoch": 2.669690058169345, + "grad_norm": 2.126091957092285, + "learning_rate": 5.505165697177579e-06, + "loss": 0.5013, + "step": 301990 + }, + { + "epoch": 2.6697784614296576, + "grad_norm": 3.117980480194092, + "learning_rate": 5.503692309505709e-06, + "loss": 0.5199, + "step": 302000 + }, + { + "epoch": 2.6698668646899697, + "grad_norm": 1.484261393547058, + "learning_rate": 5.502218921833838e-06, + "loss": 0.4779, + "step": 302010 + }, + { + "epoch": 2.6699552679502823, + "grad_norm": 4.341452598571777, + "learning_rate": 5.500745534161967e-06, + "loss": 0.4872, + "step": 302020 + }, + { + "epoch": 2.6700436712105944, + "grad_norm": 1.6142317056655884, + "learning_rate": 5.499272146490096e-06, + "loss": 0.512, + "step": 302030 + }, + { + "epoch": 2.6701320744709065, + "grad_norm": 3.001720428466797, + "learning_rate": 5.497798758818225e-06, + "loss": 0.518, + "step": 302040 + }, + { + "epoch": 2.6702204777312186, + "grad_norm": 1.9478737115859985, + "learning_rate": 5.496325371146355e-06, + "loss": 0.4518, + "step": 302050 + }, + { + "epoch": 2.6703088809915307, + "grad_norm": 3.868239641189575, + "learning_rate": 5.494851983474484e-06, + "loss": 0.5357, + "step": 302060 + }, + { + "epoch": 2.6703972842518433, + "grad_norm": 7.828040599822998, + "learning_rate": 5.493378595802613e-06, + "loss": 0.5003, + "step": 302070 + }, + { + "epoch": 2.6704856875121554, + "grad_norm": 2.681230306625366, + "learning_rate": 5.491905208130742e-06, + "loss": 0.4752, + "step": 302080 + }, + { + "epoch": 2.6705740907724675, + "grad_norm": 1.2626796960830688, + "learning_rate": 5.490431820458872e-06, + "loss": 0.4494, + "step": 302090 + }, + { + "epoch": 2.67066249403278, + "grad_norm": 12.661581039428711, + "learning_rate": 5.4889584327870015e-06, + "loss": 0.5345, + "step": 302100 + }, + { + "epoch": 2.6707508972930922, + "grad_norm": 4.706273078918457, + "learning_rate": 5.487485045115131e-06, + "loss": 0.6033, + "step": 302110 + }, + { + "epoch": 2.6708393005534043, + "grad_norm": 5.178382873535156, + "learning_rate": 5.48601165744326e-06, + "loss": 0.6868, + "step": 302120 + }, + { + "epoch": 2.6709277038137165, + "grad_norm": 4.399640083312988, + "learning_rate": 5.484538269771389e-06, + "loss": 0.4924, + "step": 302130 + }, + { + "epoch": 2.671016107074029, + "grad_norm": 3.5717556476593018, + "learning_rate": 5.483064882099518e-06, + "loss": 0.7367, + "step": 302140 + }, + { + "epoch": 2.671104510334341, + "grad_norm": 13.272159576416016, + "learning_rate": 5.481591494427648e-06, + "loss": 0.493, + "step": 302150 + }, + { + "epoch": 2.6711929135946533, + "grad_norm": 3.4548048973083496, + "learning_rate": 5.4801181067557775e-06, + "loss": 0.5494, + "step": 302160 + }, + { + "epoch": 2.671281316854966, + "grad_norm": 4.442200660705566, + "learning_rate": 5.478644719083907e-06, + "loss": 0.5068, + "step": 302170 + }, + { + "epoch": 2.671369720115278, + "grad_norm": 1.649648904800415, + "learning_rate": 5.477171331412037e-06, + "loss": 0.5552, + "step": 302180 + }, + { + "epoch": 2.67145812337559, + "grad_norm": 5.470216751098633, + "learning_rate": 5.475697943740166e-06, + "loss": 0.572, + "step": 302190 + }, + { + "epoch": 2.671546526635902, + "grad_norm": 2.338379383087158, + "learning_rate": 5.474224556068295e-06, + "loss": 0.3942, + "step": 302200 + }, + { + "epoch": 2.6716349298962143, + "grad_norm": 5.666273593902588, + "learning_rate": 5.472751168396424e-06, + "loss": 0.4784, + "step": 302210 + }, + { + "epoch": 2.671723333156527, + "grad_norm": 3.2859082221984863, + "learning_rate": 5.4712777807245536e-06, + "loss": 0.6851, + "step": 302220 + }, + { + "epoch": 2.671811736416839, + "grad_norm": 8.032526016235352, + "learning_rate": 5.469804393052683e-06, + "loss": 0.4566, + "step": 302230 + }, + { + "epoch": 2.6719001396771516, + "grad_norm": 2.420863389968872, + "learning_rate": 5.468331005380812e-06, + "loss": 0.4387, + "step": 302240 + }, + { + "epoch": 2.6719885429374637, + "grad_norm": 5.99383020401001, + "learning_rate": 5.466857617708941e-06, + "loss": 0.529, + "step": 302250 + }, + { + "epoch": 2.672076946197776, + "grad_norm": 1.1828259229660034, + "learning_rate": 5.46538423003707e-06, + "loss": 0.5541, + "step": 302260 + }, + { + "epoch": 2.672165349458088, + "grad_norm": 10.188935279846191, + "learning_rate": 5.4639108423652e-06, + "loss": 0.5041, + "step": 302270 + }, + { + "epoch": 2.6722537527184, + "grad_norm": 2.4744949340820312, + "learning_rate": 5.46243745469333e-06, + "loss": 0.5558, + "step": 302280 + }, + { + "epoch": 2.6723421559787126, + "grad_norm": 5.058260440826416, + "learning_rate": 5.460964067021459e-06, + "loss": 0.4483, + "step": 302290 + }, + { + "epoch": 2.6724305592390247, + "grad_norm": 2.7376341819763184, + "learning_rate": 5.459490679349588e-06, + "loss": 0.6755, + "step": 302300 + }, + { + "epoch": 2.672518962499337, + "grad_norm": 4.367979049682617, + "learning_rate": 5.458017291677717e-06, + "loss": 0.5293, + "step": 302310 + }, + { + "epoch": 2.6726073657596494, + "grad_norm": 2.6225290298461914, + "learning_rate": 5.4565439040058464e-06, + "loss": 0.5181, + "step": 302320 + }, + { + "epoch": 2.6726957690199615, + "grad_norm": 2.049832820892334, + "learning_rate": 5.455070516333976e-06, + "loss": 0.4864, + "step": 302330 + }, + { + "epoch": 2.6727841722802737, + "grad_norm": 4.427591800689697, + "learning_rate": 5.453597128662105e-06, + "loss": 0.4887, + "step": 302340 + }, + { + "epoch": 2.6728725755405858, + "grad_norm": 9.5797700881958, + "learning_rate": 5.452123740990234e-06, + "loss": 0.5231, + "step": 302350 + }, + { + "epoch": 2.6729609788008983, + "grad_norm": 1.400490164756775, + "learning_rate": 5.450650353318363e-06, + "loss": 0.3862, + "step": 302360 + }, + { + "epoch": 2.6730493820612105, + "grad_norm": 6.769214153289795, + "learning_rate": 5.449176965646493e-06, + "loss": 0.4996, + "step": 302370 + }, + { + "epoch": 2.6731377853215226, + "grad_norm": 2.169408082962036, + "learning_rate": 5.4477035779746225e-06, + "loss": 0.4482, + "step": 302380 + }, + { + "epoch": 2.673226188581835, + "grad_norm": 13.289419174194336, + "learning_rate": 5.4462301903027525e-06, + "loss": 0.6088, + "step": 302390 + }, + { + "epoch": 2.6733145918421473, + "grad_norm": 3.629873514175415, + "learning_rate": 5.444756802630882e-06, + "loss": 0.565, + "step": 302400 + }, + { + "epoch": 2.6734029951024594, + "grad_norm": 4.769453525543213, + "learning_rate": 5.443283414959011e-06, + "loss": 0.4161, + "step": 302410 + }, + { + "epoch": 2.6734913983627715, + "grad_norm": 11.909154891967773, + "learning_rate": 5.44181002728714e-06, + "loss": 0.5766, + "step": 302420 + }, + { + "epoch": 2.6735798016230836, + "grad_norm": 2.6878128051757812, + "learning_rate": 5.440336639615269e-06, + "loss": 0.4251, + "step": 302430 + }, + { + "epoch": 2.673668204883396, + "grad_norm": 4.733994960784912, + "learning_rate": 5.4388632519433985e-06, + "loss": 0.399, + "step": 302440 + }, + { + "epoch": 2.6737566081437083, + "grad_norm": 28.92795181274414, + "learning_rate": 5.437389864271528e-06, + "loss": 0.5221, + "step": 302450 + }, + { + "epoch": 2.6738450114040204, + "grad_norm": 2.340141773223877, + "learning_rate": 5.435916476599658e-06, + "loss": 0.5072, + "step": 302460 + }, + { + "epoch": 2.673933414664333, + "grad_norm": 4.148924827575684, + "learning_rate": 5.434443088927787e-06, + "loss": 0.6745, + "step": 302470 + }, + { + "epoch": 2.674021817924645, + "grad_norm": 2.6892285346984863, + "learning_rate": 5.432969701255916e-06, + "loss": 0.643, + "step": 302480 + }, + { + "epoch": 2.6741102211849572, + "grad_norm": 21.851425170898438, + "learning_rate": 5.431496313584045e-06, + "loss": 0.5397, + "step": 302490 + }, + { + "epoch": 2.6741986244452693, + "grad_norm": 9.677970886230469, + "learning_rate": 5.430022925912175e-06, + "loss": 0.6544, + "step": 302500 + }, + { + "epoch": 2.674287027705582, + "grad_norm": 3.128013849258423, + "learning_rate": 5.428549538240304e-06, + "loss": 0.4714, + "step": 302510 + }, + { + "epoch": 2.674375430965894, + "grad_norm": 2.0185248851776123, + "learning_rate": 5.427076150568433e-06, + "loss": 0.4818, + "step": 302520 + }, + { + "epoch": 2.674463834226206, + "grad_norm": 1.8542896509170532, + "learning_rate": 5.425602762896562e-06, + "loss": 0.6081, + "step": 302530 + }, + { + "epoch": 2.6745522374865187, + "grad_norm": 1.636214256286621, + "learning_rate": 5.424129375224691e-06, + "loss": 0.4912, + "step": 302540 + }, + { + "epoch": 2.674640640746831, + "grad_norm": 5.564589023590088, + "learning_rate": 5.422655987552821e-06, + "loss": 0.486, + "step": 302550 + }, + { + "epoch": 2.674729044007143, + "grad_norm": 3.543088436126709, + "learning_rate": 5.421182599880951e-06, + "loss": 0.6586, + "step": 302560 + }, + { + "epoch": 2.674817447267455, + "grad_norm": 22.539907455444336, + "learning_rate": 5.41970921220908e-06, + "loss": 0.5589, + "step": 302570 + }, + { + "epoch": 2.674905850527767, + "grad_norm": 2.136669635772705, + "learning_rate": 5.418235824537209e-06, + "loss": 0.4166, + "step": 302580 + }, + { + "epoch": 2.6749942537880798, + "grad_norm": 4.085372447967529, + "learning_rate": 5.416762436865338e-06, + "loss": 0.5463, + "step": 302590 + }, + { + "epoch": 2.675082657048392, + "grad_norm": 1.9284427165985107, + "learning_rate": 5.4152890491934674e-06, + "loss": 0.4261, + "step": 302600 + }, + { + "epoch": 2.6751710603087044, + "grad_norm": 4.164818286895752, + "learning_rate": 5.4138156615215975e-06, + "loss": 0.5388, + "step": 302610 + }, + { + "epoch": 2.6752594635690166, + "grad_norm": 5.401543617248535, + "learning_rate": 5.412342273849727e-06, + "loss": 0.4424, + "step": 302620 + }, + { + "epoch": 2.6753478668293287, + "grad_norm": 1.0231529474258423, + "learning_rate": 5.410868886177856e-06, + "loss": 0.4731, + "step": 302630 + }, + { + "epoch": 2.675436270089641, + "grad_norm": 6.887064456939697, + "learning_rate": 5.409395498505985e-06, + "loss": 0.5279, + "step": 302640 + }, + { + "epoch": 2.675524673349953, + "grad_norm": 34.765045166015625, + "learning_rate": 5.407922110834115e-06, + "loss": 0.4895, + "step": 302650 + }, + { + "epoch": 2.6756130766102655, + "grad_norm": 17.966190338134766, + "learning_rate": 5.406448723162244e-06, + "loss": 0.4569, + "step": 302660 + }, + { + "epoch": 2.6757014798705776, + "grad_norm": 3.0450644493103027, + "learning_rate": 5.4049753354903735e-06, + "loss": 0.4435, + "step": 302670 + }, + { + "epoch": 2.6757898831308897, + "grad_norm": 3.982543706893921, + "learning_rate": 5.403501947818503e-06, + "loss": 0.505, + "step": 302680 + }, + { + "epoch": 2.6758782863912023, + "grad_norm": 1.277186393737793, + "learning_rate": 5.402028560146632e-06, + "loss": 0.4566, + "step": 302690 + }, + { + "epoch": 2.6759666896515144, + "grad_norm": 1.328387975692749, + "learning_rate": 5.400555172474761e-06, + "loss": 0.4802, + "step": 302700 + }, + { + "epoch": 2.6760550929118265, + "grad_norm": 2.6515378952026367, + "learning_rate": 5.39908178480289e-06, + "loss": 0.5677, + "step": 302710 + }, + { + "epoch": 2.6761434961721386, + "grad_norm": 3.1805973052978516, + "learning_rate": 5.3976083971310196e-06, + "loss": 0.5343, + "step": 302720 + }, + { + "epoch": 2.676231899432451, + "grad_norm": 2.2420814037323, + "learning_rate": 5.396135009459149e-06, + "loss": 0.5178, + "step": 302730 + }, + { + "epoch": 2.6763203026927633, + "grad_norm": 9.476059913635254, + "learning_rate": 5.394661621787279e-06, + "loss": 0.4099, + "step": 302740 + }, + { + "epoch": 2.6764087059530755, + "grad_norm": 6.7633442878723145, + "learning_rate": 5.393188234115408e-06, + "loss": 0.5134, + "step": 302750 + }, + { + "epoch": 2.676497109213388, + "grad_norm": 6.382756233215332, + "learning_rate": 5.391714846443537e-06, + "loss": 0.4742, + "step": 302760 + }, + { + "epoch": 2.6765855124737, + "grad_norm": 4.136101245880127, + "learning_rate": 5.390241458771666e-06, + "loss": 0.5122, + "step": 302770 + }, + { + "epoch": 2.6766739157340123, + "grad_norm": 1.4335135221481323, + "learning_rate": 5.388768071099796e-06, + "loss": 0.609, + "step": 302780 + }, + { + "epoch": 2.6767623189943244, + "grad_norm": 1.8573977947235107, + "learning_rate": 5.387294683427925e-06, + "loss": 0.4378, + "step": 302790 + }, + { + "epoch": 2.6768507222546365, + "grad_norm": 12.257981300354004, + "learning_rate": 5.385821295756054e-06, + "loss": 0.4779, + "step": 302800 + }, + { + "epoch": 2.676939125514949, + "grad_norm": 3.2209792137145996, + "learning_rate": 5.384347908084183e-06, + "loss": 0.5168, + "step": 302810 + }, + { + "epoch": 2.677027528775261, + "grad_norm": 1.6314536333084106, + "learning_rate": 5.382874520412312e-06, + "loss": 0.5486, + "step": 302820 + }, + { + "epoch": 2.6771159320355737, + "grad_norm": 6.111200332641602, + "learning_rate": 5.3814011327404425e-06, + "loss": 0.6139, + "step": 302830 + }, + { + "epoch": 2.677204335295886, + "grad_norm": 2.9685919284820557, + "learning_rate": 5.379927745068572e-06, + "loss": 0.5925, + "step": 302840 + }, + { + "epoch": 2.677292738556198, + "grad_norm": 4.899242401123047, + "learning_rate": 5.378454357396701e-06, + "loss": 0.4559, + "step": 302850 + }, + { + "epoch": 2.67738114181651, + "grad_norm": 3.698613166809082, + "learning_rate": 5.376980969724831e-06, + "loss": 0.5172, + "step": 302860 + }, + { + "epoch": 2.677469545076822, + "grad_norm": 1.911853551864624, + "learning_rate": 5.37550758205296e-06, + "loss": 0.4531, + "step": 302870 + }, + { + "epoch": 2.677557948337135, + "grad_norm": 12.07934284210205, + "learning_rate": 5.374034194381089e-06, + "loss": 0.4354, + "step": 302880 + }, + { + "epoch": 2.677646351597447, + "grad_norm": 2.4559009075164795, + "learning_rate": 5.3725608067092185e-06, + "loss": 0.5335, + "step": 302890 + }, + { + "epoch": 2.677734754857759, + "grad_norm": 6.569911956787109, + "learning_rate": 5.371087419037348e-06, + "loss": 0.6165, + "step": 302900 + }, + { + "epoch": 2.6778231581180716, + "grad_norm": 4.634341716766357, + "learning_rate": 5.369614031365477e-06, + "loss": 0.468, + "step": 302910 + }, + { + "epoch": 2.6779115613783837, + "grad_norm": 2.7582099437713623, + "learning_rate": 5.368140643693606e-06, + "loss": 0.5584, + "step": 302920 + }, + { + "epoch": 2.677999964638696, + "grad_norm": 3.7023744583129883, + "learning_rate": 5.366667256021736e-06, + "loss": 0.4401, + "step": 302930 + }, + { + "epoch": 2.678088367899008, + "grad_norm": 3.2657506465911865, + "learning_rate": 5.365193868349865e-06, + "loss": 0.4527, + "step": 302940 + }, + { + "epoch": 2.6781767711593205, + "grad_norm": 15.05247974395752, + "learning_rate": 5.3637204806779946e-06, + "loss": 0.6186, + "step": 302950 + }, + { + "epoch": 2.6782651744196326, + "grad_norm": 5.730356693267822, + "learning_rate": 5.362247093006124e-06, + "loss": 0.42, + "step": 302960 + }, + { + "epoch": 2.6783535776799448, + "grad_norm": 2.8500046730041504, + "learning_rate": 5.360773705334253e-06, + "loss": 0.503, + "step": 302970 + }, + { + "epoch": 2.6784419809402573, + "grad_norm": 3.8009018898010254, + "learning_rate": 5.359300317662382e-06, + "loss": 0.5075, + "step": 302980 + }, + { + "epoch": 2.6785303842005694, + "grad_norm": 5.082824230194092, + "learning_rate": 5.357826929990511e-06, + "loss": 0.4324, + "step": 302990 + }, + { + "epoch": 2.6786187874608816, + "grad_norm": 11.777118682861328, + "learning_rate": 5.3563535423186406e-06, + "loss": 0.6909, + "step": 303000 + }, + { + "epoch": 2.6787071907211937, + "grad_norm": 3.928022623062134, + "learning_rate": 5.35488015464677e-06, + "loss": 0.4983, + "step": 303010 + }, + { + "epoch": 2.678795593981506, + "grad_norm": 1.5744333267211914, + "learning_rate": 5.3534067669749e-06, + "loss": 0.5501, + "step": 303020 + }, + { + "epoch": 2.6788839972418184, + "grad_norm": 4.098039150238037, + "learning_rate": 5.351933379303029e-06, + "loss": 0.4907, + "step": 303030 + }, + { + "epoch": 2.6789724005021305, + "grad_norm": 4.3304619789123535, + "learning_rate": 5.350459991631158e-06, + "loss": 0.5208, + "step": 303040 + }, + { + "epoch": 2.6790608037624426, + "grad_norm": 3.8969082832336426, + "learning_rate": 5.348986603959287e-06, + "loss": 0.5737, + "step": 303050 + }, + { + "epoch": 2.679149207022755, + "grad_norm": 6.298733711242676, + "learning_rate": 5.347513216287417e-06, + "loss": 0.5381, + "step": 303060 + }, + { + "epoch": 2.6792376102830673, + "grad_norm": 5.0616841316223145, + "learning_rate": 5.346039828615546e-06, + "loss": 0.5793, + "step": 303070 + }, + { + "epoch": 2.6793260135433794, + "grad_norm": 4.555797100067139, + "learning_rate": 5.344566440943676e-06, + "loss": 0.6406, + "step": 303080 + }, + { + "epoch": 2.6794144168036915, + "grad_norm": 0.9544357657432556, + "learning_rate": 5.343093053271805e-06, + "loss": 0.4607, + "step": 303090 + }, + { + "epoch": 2.679502820064004, + "grad_norm": 3.831953525543213, + "learning_rate": 5.341619665599934e-06, + "loss": 0.5024, + "step": 303100 + }, + { + "epoch": 2.679591223324316, + "grad_norm": 2.5447030067443848, + "learning_rate": 5.3401462779280635e-06, + "loss": 0.5389, + "step": 303110 + }, + { + "epoch": 2.6796796265846283, + "grad_norm": 4.42680549621582, + "learning_rate": 5.3386728902561935e-06, + "loss": 0.4762, + "step": 303120 + }, + { + "epoch": 2.679768029844941, + "grad_norm": 4.528481960296631, + "learning_rate": 5.337199502584323e-06, + "loss": 0.6241, + "step": 303130 + }, + { + "epoch": 2.679856433105253, + "grad_norm": 30.611507415771484, + "learning_rate": 5.335726114912452e-06, + "loss": 0.61, + "step": 303140 + }, + { + "epoch": 2.679944836365565, + "grad_norm": 1.1384614706039429, + "learning_rate": 5.334252727240581e-06, + "loss": 0.4625, + "step": 303150 + }, + { + "epoch": 2.6800332396258773, + "grad_norm": 3.9268546104431152, + "learning_rate": 5.33277933956871e-06, + "loss": 0.5599, + "step": 303160 + }, + { + "epoch": 2.6801216428861894, + "grad_norm": 10.25774097442627, + "learning_rate": 5.3313059518968395e-06, + "loss": 0.4905, + "step": 303170 + }, + { + "epoch": 2.680210046146502, + "grad_norm": 4.164135456085205, + "learning_rate": 5.329832564224969e-06, + "loss": 0.6126, + "step": 303180 + }, + { + "epoch": 2.680298449406814, + "grad_norm": 3.9200124740600586, + "learning_rate": 5.328359176553098e-06, + "loss": 0.5762, + "step": 303190 + }, + { + "epoch": 2.6803868526671266, + "grad_norm": 15.083598136901855, + "learning_rate": 5.326885788881227e-06, + "loss": 0.4791, + "step": 303200 + }, + { + "epoch": 2.6804752559274387, + "grad_norm": 4.153018474578857, + "learning_rate": 5.325412401209357e-06, + "loss": 0.5849, + "step": 303210 + }, + { + "epoch": 2.680563659187751, + "grad_norm": 1.8608185052871704, + "learning_rate": 5.323939013537486e-06, + "loss": 0.5294, + "step": 303220 + }, + { + "epoch": 2.680652062448063, + "grad_norm": 3.5563313961029053, + "learning_rate": 5.322465625865616e-06, + "loss": 0.4558, + "step": 303230 + }, + { + "epoch": 2.680740465708375, + "grad_norm": 3.1188220977783203, + "learning_rate": 5.320992238193745e-06, + "loss": 0.6002, + "step": 303240 + }, + { + "epoch": 2.6808288689686877, + "grad_norm": 1.3672758340835571, + "learning_rate": 5.319518850521874e-06, + "loss": 0.5758, + "step": 303250 + }, + { + "epoch": 2.680917272229, + "grad_norm": 1.3673694133758545, + "learning_rate": 5.318045462850003e-06, + "loss": 0.4545, + "step": 303260 + }, + { + "epoch": 2.681005675489312, + "grad_norm": 16.112228393554688, + "learning_rate": 5.316572075178132e-06, + "loss": 0.5438, + "step": 303270 + }, + { + "epoch": 2.6810940787496245, + "grad_norm": 8.461275100708008, + "learning_rate": 5.315098687506262e-06, + "loss": 0.4666, + "step": 303280 + }, + { + "epoch": 2.6811824820099366, + "grad_norm": 4.210044860839844, + "learning_rate": 5.313625299834391e-06, + "loss": 0.4485, + "step": 303290 + }, + { + "epoch": 2.6812708852702487, + "grad_norm": 6.568799018859863, + "learning_rate": 5.312151912162521e-06, + "loss": 0.6395, + "step": 303300 + }, + { + "epoch": 2.681359288530561, + "grad_norm": 11.744365692138672, + "learning_rate": 5.31067852449065e-06, + "loss": 0.5989, + "step": 303310 + }, + { + "epoch": 2.6814476917908734, + "grad_norm": 7.4961161613464355, + "learning_rate": 5.309205136818779e-06, + "loss": 0.603, + "step": 303320 + }, + { + "epoch": 2.6815360950511855, + "grad_norm": 7.277774810791016, + "learning_rate": 5.307731749146909e-06, + "loss": 0.4926, + "step": 303330 + }, + { + "epoch": 2.6816244983114976, + "grad_norm": 8.511844635009766, + "learning_rate": 5.3062583614750385e-06, + "loss": 0.4615, + "step": 303340 + }, + { + "epoch": 2.68171290157181, + "grad_norm": 1.808846354484558, + "learning_rate": 5.304784973803168e-06, + "loss": 0.4313, + "step": 303350 + }, + { + "epoch": 2.6818013048321223, + "grad_norm": 8.552785873413086, + "learning_rate": 5.303311586131297e-06, + "loss": 0.4917, + "step": 303360 + }, + { + "epoch": 2.6818897080924344, + "grad_norm": 3.4046761989593506, + "learning_rate": 5.301838198459426e-06, + "loss": 0.4706, + "step": 303370 + }, + { + "epoch": 2.6819781113527466, + "grad_norm": 3.102161407470703, + "learning_rate": 5.300364810787555e-06, + "loss": 0.4061, + "step": 303380 + }, + { + "epoch": 2.6820665146130587, + "grad_norm": 1.5648913383483887, + "learning_rate": 5.2988914231156845e-06, + "loss": 0.4441, + "step": 303390 + }, + { + "epoch": 2.6821549178733712, + "grad_norm": 3.935070037841797, + "learning_rate": 5.2974180354438145e-06, + "loss": 0.4765, + "step": 303400 + }, + { + "epoch": 2.6822433211336834, + "grad_norm": 2.0763630867004395, + "learning_rate": 5.295944647771944e-06, + "loss": 0.4667, + "step": 303410 + }, + { + "epoch": 2.682331724393996, + "grad_norm": 4.288541793823242, + "learning_rate": 5.294471260100073e-06, + "loss": 0.4776, + "step": 303420 + }, + { + "epoch": 2.682420127654308, + "grad_norm": 3.3474552631378174, + "learning_rate": 5.292997872428202e-06, + "loss": 0.5121, + "step": 303430 + }, + { + "epoch": 2.68250853091462, + "grad_norm": 2.760092258453369, + "learning_rate": 5.291524484756331e-06, + "loss": 0.5418, + "step": 303440 + }, + { + "epoch": 2.6825969341749323, + "grad_norm": 6.885657787322998, + "learning_rate": 5.2900510970844605e-06, + "loss": 0.5815, + "step": 303450 + }, + { + "epoch": 2.6826853374352444, + "grad_norm": 9.905792236328125, + "learning_rate": 5.28857770941259e-06, + "loss": 0.5677, + "step": 303460 + }, + { + "epoch": 2.682773740695557, + "grad_norm": 6.114774227142334, + "learning_rate": 5.287104321740719e-06, + "loss": 0.4985, + "step": 303470 + }, + { + "epoch": 2.682862143955869, + "grad_norm": 2.3239240646362305, + "learning_rate": 5.285630934068848e-06, + "loss": 0.5396, + "step": 303480 + }, + { + "epoch": 2.682950547216181, + "grad_norm": 9.232110023498535, + "learning_rate": 5.284157546396978e-06, + "loss": 0.4812, + "step": 303490 + }, + { + "epoch": 2.6830389504764938, + "grad_norm": 1.7250202894210815, + "learning_rate": 5.282684158725107e-06, + "loss": 0.5078, + "step": 303500 + }, + { + "epoch": 2.683127353736806, + "grad_norm": 4.680698871612549, + "learning_rate": 5.281210771053237e-06, + "loss": 0.4744, + "step": 303510 + }, + { + "epoch": 2.683215756997118, + "grad_norm": 8.749503135681152, + "learning_rate": 5.279737383381366e-06, + "loss": 0.593, + "step": 303520 + }, + { + "epoch": 2.68330416025743, + "grad_norm": 2.837489366531372, + "learning_rate": 5.278263995709495e-06, + "loss": 0.3687, + "step": 303530 + }, + { + "epoch": 2.6833925635177427, + "grad_norm": 3.36238169670105, + "learning_rate": 5.276790608037625e-06, + "loss": 0.431, + "step": 303540 + }, + { + "epoch": 2.683480966778055, + "grad_norm": 2.7064499855041504, + "learning_rate": 5.275317220365754e-06, + "loss": 0.517, + "step": 303550 + }, + { + "epoch": 2.683569370038367, + "grad_norm": 6.5852837562561035, + "learning_rate": 5.2738438326938834e-06, + "loss": 0.5744, + "step": 303560 + }, + { + "epoch": 2.6836577732986795, + "grad_norm": 1.0790187120437622, + "learning_rate": 5.272370445022013e-06, + "loss": 0.4711, + "step": 303570 + }, + { + "epoch": 2.6837461765589916, + "grad_norm": 3.9425899982452393, + "learning_rate": 5.270897057350143e-06, + "loss": 0.6841, + "step": 303580 + }, + { + "epoch": 2.6838345798193037, + "grad_norm": 1.790626049041748, + "learning_rate": 5.269423669678272e-06, + "loss": 0.4997, + "step": 303590 + }, + { + "epoch": 2.683922983079616, + "grad_norm": 1.0502065420150757, + "learning_rate": 5.267950282006401e-06, + "loss": 0.5242, + "step": 303600 + }, + { + "epoch": 2.684011386339928, + "grad_norm": 6.75415563583374, + "learning_rate": 5.26647689433453e-06, + "loss": 0.5659, + "step": 303610 + }, + { + "epoch": 2.6840997896002405, + "grad_norm": 2.3441059589385986, + "learning_rate": 5.2650035066626595e-06, + "loss": 0.3558, + "step": 303620 + }, + { + "epoch": 2.6841881928605527, + "grad_norm": 1.2501672506332397, + "learning_rate": 5.263530118990789e-06, + "loss": 0.5218, + "step": 303630 + }, + { + "epoch": 2.6842765961208648, + "grad_norm": 7.788269996643066, + "learning_rate": 5.262056731318918e-06, + "loss": 0.5522, + "step": 303640 + }, + { + "epoch": 2.6843649993811773, + "grad_norm": 2.984762191772461, + "learning_rate": 5.260583343647047e-06, + "loss": 0.5527, + "step": 303650 + }, + { + "epoch": 2.6844534026414895, + "grad_norm": 1.3655368089675903, + "learning_rate": 5.259109955975176e-06, + "loss": 0.3898, + "step": 303660 + }, + { + "epoch": 2.6845418059018016, + "grad_norm": 4.3142852783203125, + "learning_rate": 5.2576365683033055e-06, + "loss": 0.5998, + "step": 303670 + }, + { + "epoch": 2.6846302091621137, + "grad_norm": 7.04757833480835, + "learning_rate": 5.2561631806314356e-06, + "loss": 0.6141, + "step": 303680 + }, + { + "epoch": 2.6847186124224263, + "grad_norm": 3.148176908493042, + "learning_rate": 5.254689792959565e-06, + "loss": 0.5555, + "step": 303690 + }, + { + "epoch": 2.6848070156827384, + "grad_norm": 2.3714401721954346, + "learning_rate": 5.253216405287694e-06, + "loss": 0.5131, + "step": 303700 + }, + { + "epoch": 2.6848954189430505, + "grad_norm": 1.686798095703125, + "learning_rate": 5.251743017615823e-06, + "loss": 0.5172, + "step": 303710 + }, + { + "epoch": 2.684983822203363, + "grad_norm": 10.804594039916992, + "learning_rate": 5.250269629943952e-06, + "loss": 0.4528, + "step": 303720 + }, + { + "epoch": 2.685072225463675, + "grad_norm": 1.4765125513076782, + "learning_rate": 5.2487962422720816e-06, + "loss": 0.508, + "step": 303730 + }, + { + "epoch": 2.6851606287239873, + "grad_norm": 5.291695594787598, + "learning_rate": 5.247322854600211e-06, + "loss": 0.5863, + "step": 303740 + }, + { + "epoch": 2.6852490319842994, + "grad_norm": 2.2572453022003174, + "learning_rate": 5.24584946692834e-06, + "loss": 0.4949, + "step": 303750 + }, + { + "epoch": 2.6853374352446115, + "grad_norm": 8.285826683044434, + "learning_rate": 5.24437607925647e-06, + "loss": 0.4872, + "step": 303760 + }, + { + "epoch": 2.685425838504924, + "grad_norm": 2.1910860538482666, + "learning_rate": 5.242902691584599e-06, + "loss": 0.5668, + "step": 303770 + }, + { + "epoch": 2.6855142417652362, + "grad_norm": 1.7143421173095703, + "learning_rate": 5.241429303912728e-06, + "loss": 0.5347, + "step": 303780 + }, + { + "epoch": 2.685602645025549, + "grad_norm": 3.5705294609069824, + "learning_rate": 5.2399559162408585e-06, + "loss": 0.5582, + "step": 303790 + }, + { + "epoch": 2.685691048285861, + "grad_norm": 4.528110504150391, + "learning_rate": 5.238482528568988e-06, + "loss": 0.4467, + "step": 303800 + }, + { + "epoch": 2.685779451546173, + "grad_norm": 2.5488765239715576, + "learning_rate": 5.237009140897117e-06, + "loss": 0.4999, + "step": 303810 + }, + { + "epoch": 2.685867854806485, + "grad_norm": 8.090842247009277, + "learning_rate": 5.235535753225246e-06, + "loss": 0.5498, + "step": 303820 + }, + { + "epoch": 2.6859562580667973, + "grad_norm": 3.198418378829956, + "learning_rate": 5.234062365553375e-06, + "loss": 0.4544, + "step": 303830 + }, + { + "epoch": 2.68604466132711, + "grad_norm": 7.77910041809082, + "learning_rate": 5.2325889778815045e-06, + "loss": 0.493, + "step": 303840 + }, + { + "epoch": 2.686133064587422, + "grad_norm": 3.1737751960754395, + "learning_rate": 5.231115590209634e-06, + "loss": 0.4586, + "step": 303850 + }, + { + "epoch": 2.686221467847734, + "grad_norm": 2.3401639461517334, + "learning_rate": 5.229642202537764e-06, + "loss": 0.4925, + "step": 303860 + }, + { + "epoch": 2.6863098711080466, + "grad_norm": 6.336198806762695, + "learning_rate": 5.228168814865893e-06, + "loss": 0.4398, + "step": 303870 + }, + { + "epoch": 2.6863982743683588, + "grad_norm": 1.3003123998641968, + "learning_rate": 5.226695427194022e-06, + "loss": 0.4746, + "step": 303880 + }, + { + "epoch": 2.686486677628671, + "grad_norm": 14.118050575256348, + "learning_rate": 5.225222039522151e-06, + "loss": 0.6152, + "step": 303890 + }, + { + "epoch": 2.686575080888983, + "grad_norm": 4.220850467681885, + "learning_rate": 5.2237486518502805e-06, + "loss": 0.5872, + "step": 303900 + }, + { + "epoch": 2.6866634841492956, + "grad_norm": 2.5853283405303955, + "learning_rate": 5.22227526417841e-06, + "loss": 0.4614, + "step": 303910 + }, + { + "epoch": 2.6867518874096077, + "grad_norm": 2.590224504470825, + "learning_rate": 5.220801876506539e-06, + "loss": 0.4513, + "step": 303920 + }, + { + "epoch": 2.68684029066992, + "grad_norm": 8.737375259399414, + "learning_rate": 5.219328488834668e-06, + "loss": 0.5384, + "step": 303930 + }, + { + "epoch": 2.6869286939302324, + "grad_norm": 33.869449615478516, + "learning_rate": 5.217855101162797e-06, + "loss": 0.3951, + "step": 303940 + }, + { + "epoch": 2.6870170971905445, + "grad_norm": 2.8471426963806152, + "learning_rate": 5.2163817134909265e-06, + "loss": 0.6101, + "step": 303950 + }, + { + "epoch": 2.6871055004508566, + "grad_norm": 4.481380939483643, + "learning_rate": 5.2149083258190566e-06, + "loss": 0.611, + "step": 303960 + }, + { + "epoch": 2.6871939037111687, + "grad_norm": 6.052866458892822, + "learning_rate": 5.213434938147186e-06, + "loss": 0.4941, + "step": 303970 + }, + { + "epoch": 2.687282306971481, + "grad_norm": 1.142432689666748, + "learning_rate": 5.211961550475315e-06, + "loss": 0.4412, + "step": 303980 + }, + { + "epoch": 2.6873707102317934, + "grad_norm": 1.202817678451538, + "learning_rate": 5.210488162803444e-06, + "loss": 0.6047, + "step": 303990 + }, + { + "epoch": 2.6874591134921055, + "grad_norm": 5.727240085601807, + "learning_rate": 5.209014775131573e-06, + "loss": 0.4896, + "step": 304000 + }, + { + "epoch": 2.687547516752418, + "grad_norm": 3.7152345180511475, + "learning_rate": 5.207541387459703e-06, + "loss": 0.5451, + "step": 304010 + }, + { + "epoch": 2.68763592001273, + "grad_norm": 3.8011415004730225, + "learning_rate": 5.206067999787833e-06, + "loss": 0.4936, + "step": 304020 + }, + { + "epoch": 2.6877243232730423, + "grad_norm": 4.174756050109863, + "learning_rate": 5.204594612115962e-06, + "loss": 0.4825, + "step": 304030 + }, + { + "epoch": 2.6878127265333545, + "grad_norm": 3.6618595123291016, + "learning_rate": 5.203121224444091e-06, + "loss": 0.5285, + "step": 304040 + }, + { + "epoch": 2.6879011297936666, + "grad_norm": 2.3968346118927, + "learning_rate": 5.201647836772221e-06, + "loss": 0.6063, + "step": 304050 + }, + { + "epoch": 2.687989533053979, + "grad_norm": 12.229643821716309, + "learning_rate": 5.20017444910035e-06, + "loss": 0.653, + "step": 304060 + }, + { + "epoch": 2.6880779363142913, + "grad_norm": 28.310379028320312, + "learning_rate": 5.1987010614284795e-06, + "loss": 0.6635, + "step": 304070 + }, + { + "epoch": 2.6881663395746034, + "grad_norm": 5.151130676269531, + "learning_rate": 5.197227673756609e-06, + "loss": 0.5341, + "step": 304080 + }, + { + "epoch": 2.688254742834916, + "grad_norm": 7.600317478179932, + "learning_rate": 5.195754286084738e-06, + "loss": 0.5185, + "step": 304090 + }, + { + "epoch": 2.688343146095228, + "grad_norm": 16.325515747070312, + "learning_rate": 5.194280898412867e-06, + "loss": 0.4819, + "step": 304100 + }, + { + "epoch": 2.68843154935554, + "grad_norm": 2.951824188232422, + "learning_rate": 5.192807510740996e-06, + "loss": 0.579, + "step": 304110 + }, + { + "epoch": 2.6885199526158523, + "grad_norm": 2.002959966659546, + "learning_rate": 5.1913341230691255e-06, + "loss": 0.5274, + "step": 304120 + }, + { + "epoch": 2.688608355876165, + "grad_norm": 8.000849723815918, + "learning_rate": 5.189860735397255e-06, + "loss": 0.5234, + "step": 304130 + }, + { + "epoch": 2.688696759136477, + "grad_norm": 5.44380521774292, + "learning_rate": 5.188387347725385e-06, + "loss": 0.646, + "step": 304140 + }, + { + "epoch": 2.688785162396789, + "grad_norm": 1.9217720031738281, + "learning_rate": 5.186913960053514e-06, + "loss": 0.3993, + "step": 304150 + }, + { + "epoch": 2.6888735656571017, + "grad_norm": 2.448744058609009, + "learning_rate": 5.185440572381643e-06, + "loss": 0.5967, + "step": 304160 + }, + { + "epoch": 2.688961968917414, + "grad_norm": 14.254759788513184, + "learning_rate": 5.183967184709772e-06, + "loss": 0.4715, + "step": 304170 + }, + { + "epoch": 2.689050372177726, + "grad_norm": 3.043780565261841, + "learning_rate": 5.1824937970379015e-06, + "loss": 0.4926, + "step": 304180 + }, + { + "epoch": 2.689138775438038, + "grad_norm": 7.765686511993408, + "learning_rate": 5.181020409366031e-06, + "loss": 0.5384, + "step": 304190 + }, + { + "epoch": 2.68922717869835, + "grad_norm": 5.148855209350586, + "learning_rate": 5.17954702169416e-06, + "loss": 0.4771, + "step": 304200 + }, + { + "epoch": 2.6893155819586627, + "grad_norm": 11.200129508972168, + "learning_rate": 5.178073634022289e-06, + "loss": 0.5537, + "step": 304210 + }, + { + "epoch": 2.689403985218975, + "grad_norm": 4.458745956420898, + "learning_rate": 5.176600246350418e-06, + "loss": 0.6004, + "step": 304220 + }, + { + "epoch": 2.689492388479287, + "grad_norm": 7.6333794593811035, + "learning_rate": 5.175126858678548e-06, + "loss": 0.6074, + "step": 304230 + }, + { + "epoch": 2.6895807917395995, + "grad_norm": 12.279265403747559, + "learning_rate": 5.173653471006678e-06, + "loss": 0.6182, + "step": 304240 + }, + { + "epoch": 2.6896691949999116, + "grad_norm": 4.669636249542236, + "learning_rate": 5.172180083334807e-06, + "loss": 0.4808, + "step": 304250 + }, + { + "epoch": 2.6897575982602238, + "grad_norm": 7.15441370010376, + "learning_rate": 5.170706695662937e-06, + "loss": 0.5934, + "step": 304260 + }, + { + "epoch": 2.689846001520536, + "grad_norm": 3.280238151550293, + "learning_rate": 5.169233307991066e-06, + "loss": 0.5024, + "step": 304270 + }, + { + "epoch": 2.6899344047808484, + "grad_norm": 3.7015554904937744, + "learning_rate": 5.167759920319195e-06, + "loss": 0.5658, + "step": 304280 + }, + { + "epoch": 2.6900228080411606, + "grad_norm": 8.073193550109863, + "learning_rate": 5.1662865326473244e-06, + "loss": 0.4889, + "step": 304290 + }, + { + "epoch": 2.6901112113014727, + "grad_norm": 1.7262942790985107, + "learning_rate": 5.164813144975454e-06, + "loss": 0.492, + "step": 304300 + }, + { + "epoch": 2.6901996145617852, + "grad_norm": 4.214289665222168, + "learning_rate": 5.163339757303583e-06, + "loss": 0.6812, + "step": 304310 + }, + { + "epoch": 2.6902880178220974, + "grad_norm": 8.710677146911621, + "learning_rate": 5.161866369631712e-06, + "loss": 0.6657, + "step": 304320 + }, + { + "epoch": 2.6903764210824095, + "grad_norm": 12.615281105041504, + "learning_rate": 5.160392981959842e-06, + "loss": 0.5013, + "step": 304330 + }, + { + "epoch": 2.6904648243427216, + "grad_norm": 3.8869073390960693, + "learning_rate": 5.158919594287971e-06, + "loss": 0.5366, + "step": 304340 + }, + { + "epoch": 2.6905532276030337, + "grad_norm": 1.1334365606307983, + "learning_rate": 5.1574462066161005e-06, + "loss": 0.4863, + "step": 304350 + }, + { + "epoch": 2.6906416308633463, + "grad_norm": 3.609147071838379, + "learning_rate": 5.15597281894423e-06, + "loss": 0.5457, + "step": 304360 + }, + { + "epoch": 2.6907300341236584, + "grad_norm": 4.512438774108887, + "learning_rate": 5.154499431272359e-06, + "loss": 0.4717, + "step": 304370 + }, + { + "epoch": 2.690818437383971, + "grad_norm": 3.523669958114624, + "learning_rate": 5.153026043600488e-06, + "loss": 0.4513, + "step": 304380 + }, + { + "epoch": 2.690906840644283, + "grad_norm": 2.6042463779449463, + "learning_rate": 5.151552655928617e-06, + "loss": 0.5179, + "step": 304390 + }, + { + "epoch": 2.690995243904595, + "grad_norm": 3.7346861362457275, + "learning_rate": 5.1500792682567465e-06, + "loss": 0.5294, + "step": 304400 + }, + { + "epoch": 2.6910836471649073, + "grad_norm": 2.3024046421051025, + "learning_rate": 5.148605880584876e-06, + "loss": 0.5297, + "step": 304410 + }, + { + "epoch": 2.6911720504252195, + "grad_norm": 5.957935810089111, + "learning_rate": 5.147132492913005e-06, + "loss": 0.6125, + "step": 304420 + }, + { + "epoch": 2.691260453685532, + "grad_norm": 10.231656074523926, + "learning_rate": 5.145659105241135e-06, + "loss": 0.5142, + "step": 304430 + }, + { + "epoch": 2.691348856945844, + "grad_norm": 27.415607452392578, + "learning_rate": 5.144185717569264e-06, + "loss": 0.5478, + "step": 304440 + }, + { + "epoch": 2.6914372602061563, + "grad_norm": 2.151015520095825, + "learning_rate": 5.142712329897393e-06, + "loss": 0.5226, + "step": 304450 + }, + { + "epoch": 2.691525663466469, + "grad_norm": 3.008965015411377, + "learning_rate": 5.1412389422255225e-06, + "loss": 0.5809, + "step": 304460 + }, + { + "epoch": 2.691614066726781, + "grad_norm": 6.550081729888916, + "learning_rate": 5.139765554553652e-06, + "loss": 0.536, + "step": 304470 + }, + { + "epoch": 2.691702469987093, + "grad_norm": 0.8426406979560852, + "learning_rate": 5.138292166881782e-06, + "loss": 0.4184, + "step": 304480 + }, + { + "epoch": 2.691790873247405, + "grad_norm": 3.137456178665161, + "learning_rate": 5.136818779209911e-06, + "loss": 0.4757, + "step": 304490 + }, + { + "epoch": 2.6918792765077177, + "grad_norm": 2.9646377563476562, + "learning_rate": 5.13534539153804e-06, + "loss": 0.5965, + "step": 304500 + }, + { + "epoch": 2.69196767976803, + "grad_norm": 4.439608097076416, + "learning_rate": 5.133872003866169e-06, + "loss": 0.6412, + "step": 304510 + }, + { + "epoch": 2.692056083028342, + "grad_norm": 1.5348854064941406, + "learning_rate": 5.1323986161942994e-06, + "loss": 0.5533, + "step": 304520 + }, + { + "epoch": 2.6921444862886545, + "grad_norm": 4.926344394683838, + "learning_rate": 5.130925228522429e-06, + "loss": 0.5759, + "step": 304530 + }, + { + "epoch": 2.6922328895489667, + "grad_norm": 6.440145969390869, + "learning_rate": 5.129451840850558e-06, + "loss": 0.5027, + "step": 304540 + }, + { + "epoch": 2.692321292809279, + "grad_norm": 3.622314929962158, + "learning_rate": 5.127978453178687e-06, + "loss": 0.5269, + "step": 304550 + }, + { + "epoch": 2.692409696069591, + "grad_norm": 8.192954063415527, + "learning_rate": 5.126505065506816e-06, + "loss": 0.4449, + "step": 304560 + }, + { + "epoch": 2.692498099329903, + "grad_norm": 3.306488513946533, + "learning_rate": 5.1250316778349455e-06, + "loss": 0.4185, + "step": 304570 + }, + { + "epoch": 2.6925865025902156, + "grad_norm": 3.844407796859741, + "learning_rate": 5.123558290163075e-06, + "loss": 0.5388, + "step": 304580 + }, + { + "epoch": 2.6926749058505277, + "grad_norm": 2.2042462825775146, + "learning_rate": 5.122084902491204e-06, + "loss": 0.5203, + "step": 304590 + }, + { + "epoch": 2.6927633091108403, + "grad_norm": 5.135961055755615, + "learning_rate": 5.120611514819333e-06, + "loss": 0.5047, + "step": 304600 + }, + { + "epoch": 2.6928517123711524, + "grad_norm": 11.14318561553955, + "learning_rate": 5.119138127147463e-06, + "loss": 0.5217, + "step": 304610 + }, + { + "epoch": 2.6929401156314645, + "grad_norm": 0.8528537750244141, + "learning_rate": 5.117664739475592e-06, + "loss": 0.4711, + "step": 304620 + }, + { + "epoch": 2.6930285188917766, + "grad_norm": 6.76836633682251, + "learning_rate": 5.1161913518037215e-06, + "loss": 0.4942, + "step": 304630 + }, + { + "epoch": 2.6931169221520888, + "grad_norm": 2.4129586219787598, + "learning_rate": 5.114717964131851e-06, + "loss": 0.5372, + "step": 304640 + }, + { + "epoch": 2.6932053254124013, + "grad_norm": 3.7991762161254883, + "learning_rate": 5.11324457645998e-06, + "loss": 0.4153, + "step": 304650 + }, + { + "epoch": 2.6932937286727134, + "grad_norm": 2.0539019107818604, + "learning_rate": 5.111771188788109e-06, + "loss": 0.5573, + "step": 304660 + }, + { + "epoch": 2.6933821319330256, + "grad_norm": 6.7425312995910645, + "learning_rate": 5.110297801116238e-06, + "loss": 0.5004, + "step": 304670 + }, + { + "epoch": 2.693470535193338, + "grad_norm": 3.147634506225586, + "learning_rate": 5.1088244134443675e-06, + "loss": 0.6358, + "step": 304680 + }, + { + "epoch": 2.6935589384536502, + "grad_norm": 6.6060404777526855, + "learning_rate": 5.107351025772497e-06, + "loss": 0.5078, + "step": 304690 + }, + { + "epoch": 2.6936473417139624, + "grad_norm": 2.8020741939544678, + "learning_rate": 5.105877638100627e-06, + "loss": 0.5861, + "step": 304700 + }, + { + "epoch": 2.6937357449742745, + "grad_norm": 3.6724865436553955, + "learning_rate": 5.104404250428756e-06, + "loss": 0.5312, + "step": 304710 + }, + { + "epoch": 2.693824148234587, + "grad_norm": 1.4471160173416138, + "learning_rate": 5.102930862756886e-06, + "loss": 0.5223, + "step": 304720 + }, + { + "epoch": 2.693912551494899, + "grad_norm": 1.9359064102172852, + "learning_rate": 5.101457475085015e-06, + "loss": 0.5779, + "step": 304730 + }, + { + "epoch": 2.6940009547552113, + "grad_norm": 2.4951581954956055, + "learning_rate": 5.099984087413144e-06, + "loss": 0.4349, + "step": 304740 + }, + { + "epoch": 2.694089358015524, + "grad_norm": 1.8190711736679077, + "learning_rate": 5.098510699741274e-06, + "loss": 0.4585, + "step": 304750 + }, + { + "epoch": 2.694177761275836, + "grad_norm": 2.2708699703216553, + "learning_rate": 5.097037312069403e-06, + "loss": 0.4909, + "step": 304760 + }, + { + "epoch": 2.694266164536148, + "grad_norm": 12.485867500305176, + "learning_rate": 5.095563924397532e-06, + "loss": 0.5897, + "step": 304770 + }, + { + "epoch": 2.69435456779646, + "grad_norm": 2.8988871574401855, + "learning_rate": 5.094090536725661e-06, + "loss": 0.5457, + "step": 304780 + }, + { + "epoch": 2.6944429710567723, + "grad_norm": 4.139178276062012, + "learning_rate": 5.09261714905379e-06, + "loss": 0.5564, + "step": 304790 + }, + { + "epoch": 2.694531374317085, + "grad_norm": 4.717947006225586, + "learning_rate": 5.0911437613819205e-06, + "loss": 0.5077, + "step": 304800 + }, + { + "epoch": 2.694619777577397, + "grad_norm": 2.159031867980957, + "learning_rate": 5.08967037371005e-06, + "loss": 0.4986, + "step": 304810 + }, + { + "epoch": 2.694708180837709, + "grad_norm": 2.99231219291687, + "learning_rate": 5.088196986038179e-06, + "loss": 0.4408, + "step": 304820 + }, + { + "epoch": 2.6947965840980217, + "grad_norm": 1.9744359254837036, + "learning_rate": 5.086723598366308e-06, + "loss": 0.5299, + "step": 304830 + }, + { + "epoch": 2.694884987358334, + "grad_norm": 2.285630702972412, + "learning_rate": 5.085250210694437e-06, + "loss": 0.5406, + "step": 304840 + }, + { + "epoch": 2.694973390618646, + "grad_norm": 16.391162872314453, + "learning_rate": 5.0837768230225665e-06, + "loss": 0.5016, + "step": 304850 + }, + { + "epoch": 2.695061793878958, + "grad_norm": 4.171940326690674, + "learning_rate": 5.082303435350696e-06, + "loss": 0.5923, + "step": 304860 + }, + { + "epoch": 2.6951501971392706, + "grad_norm": 1.8695321083068848, + "learning_rate": 5.080830047678825e-06, + "loss": 0.4679, + "step": 304870 + }, + { + "epoch": 2.6952386003995827, + "grad_norm": 5.673379421234131, + "learning_rate": 5.079356660006954e-06, + "loss": 0.4532, + "step": 304880 + }, + { + "epoch": 2.695327003659895, + "grad_norm": 10.101022720336914, + "learning_rate": 5.077883272335084e-06, + "loss": 0.5413, + "step": 304890 + }, + { + "epoch": 2.6954154069202074, + "grad_norm": 25.99165153503418, + "learning_rate": 5.076409884663213e-06, + "loss": 0.5247, + "step": 304900 + }, + { + "epoch": 2.6955038101805195, + "grad_norm": 1.7904083728790283, + "learning_rate": 5.0749364969913425e-06, + "loss": 0.4237, + "step": 304910 + }, + { + "epoch": 2.6955922134408317, + "grad_norm": 1.5648753643035889, + "learning_rate": 5.073463109319472e-06, + "loss": 0.6303, + "step": 304920 + }, + { + "epoch": 2.695680616701144, + "grad_norm": 6.062257289886475, + "learning_rate": 5.071989721647601e-06, + "loss": 0.5306, + "step": 304930 + }, + { + "epoch": 2.695769019961456, + "grad_norm": 4.524470329284668, + "learning_rate": 5.070516333975731e-06, + "loss": 0.5238, + "step": 304940 + }, + { + "epoch": 2.6958574232217685, + "grad_norm": 9.806415557861328, + "learning_rate": 5.06904294630386e-06, + "loss": 0.451, + "step": 304950 + }, + { + "epoch": 2.6959458264820806, + "grad_norm": 7.924041748046875, + "learning_rate": 5.067569558631989e-06, + "loss": 0.5692, + "step": 304960 + }, + { + "epoch": 2.696034229742393, + "grad_norm": 4.951496601104736, + "learning_rate": 5.0660961709601186e-06, + "loss": 0.5696, + "step": 304970 + }, + { + "epoch": 2.6961226330027053, + "grad_norm": 2.0635156631469727, + "learning_rate": 5.064622783288248e-06, + "loss": 0.5213, + "step": 304980 + }, + { + "epoch": 2.6962110362630174, + "grad_norm": 2.098074197769165, + "learning_rate": 5.063149395616378e-06, + "loss": 0.4243, + "step": 304990 + }, + { + "epoch": 2.6962994395233295, + "grad_norm": 1.5199066400527954, + "learning_rate": 5.061676007944507e-06, + "loss": 0.5563, + "step": 305000 + }, + { + "epoch": 2.6963878427836416, + "grad_norm": 3.1635637283325195, + "learning_rate": 5.060202620272636e-06, + "loss": 0.5185, + "step": 305010 + }, + { + "epoch": 2.696476246043954, + "grad_norm": 13.294730186462402, + "learning_rate": 5.0587292326007654e-06, + "loss": 0.4477, + "step": 305020 + }, + { + "epoch": 2.6965646493042663, + "grad_norm": 3.2783524990081787, + "learning_rate": 5.057255844928895e-06, + "loss": 0.5279, + "step": 305030 + }, + { + "epoch": 2.6966530525645784, + "grad_norm": 1.5741766691207886, + "learning_rate": 5.055782457257024e-06, + "loss": 0.6262, + "step": 305040 + }, + { + "epoch": 2.696741455824891, + "grad_norm": 3.015608787536621, + "learning_rate": 5.054309069585153e-06, + "loss": 0.5037, + "step": 305050 + }, + { + "epoch": 2.696829859085203, + "grad_norm": 2.934304714202881, + "learning_rate": 5.052835681913282e-06, + "loss": 0.5689, + "step": 305060 + }, + { + "epoch": 2.6969182623455152, + "grad_norm": 8.073206901550293, + "learning_rate": 5.0513622942414114e-06, + "loss": 0.5459, + "step": 305070 + }, + { + "epoch": 2.6970066656058274, + "grad_norm": 2.048814535140991, + "learning_rate": 5.0498889065695415e-06, + "loss": 0.4087, + "step": 305080 + }, + { + "epoch": 2.69709506886614, + "grad_norm": 10.43022346496582, + "learning_rate": 5.048415518897671e-06, + "loss": 0.5536, + "step": 305090 + }, + { + "epoch": 2.697183472126452, + "grad_norm": 2.7701218128204346, + "learning_rate": 5.0469421312258e-06, + "loss": 0.5511, + "step": 305100 + }, + { + "epoch": 2.697271875386764, + "grad_norm": 5.275886058807373, + "learning_rate": 5.045468743553929e-06, + "loss": 0.494, + "step": 305110 + }, + { + "epoch": 2.6973602786470767, + "grad_norm": 2.441899299621582, + "learning_rate": 5.043995355882058e-06, + "loss": 0.482, + "step": 305120 + }, + { + "epoch": 2.697448681907389, + "grad_norm": 15.174245834350586, + "learning_rate": 5.0425219682101875e-06, + "loss": 0.5154, + "step": 305130 + }, + { + "epoch": 2.697537085167701, + "grad_norm": 13.988923072814941, + "learning_rate": 5.041048580538317e-06, + "loss": 0.5573, + "step": 305140 + }, + { + "epoch": 2.697625488428013, + "grad_norm": 3.235213279724121, + "learning_rate": 5.039575192866446e-06, + "loss": 0.4775, + "step": 305150 + }, + { + "epoch": 2.697713891688325, + "grad_norm": 1.6386951208114624, + "learning_rate": 5.038101805194576e-06, + "loss": 0.5965, + "step": 305160 + }, + { + "epoch": 2.6978022949486378, + "grad_norm": 7.500009059906006, + "learning_rate": 5.036628417522705e-06, + "loss": 0.5833, + "step": 305170 + }, + { + "epoch": 2.69789069820895, + "grad_norm": 1.1206668615341187, + "learning_rate": 5.035155029850834e-06, + "loss": 0.395, + "step": 305180 + }, + { + "epoch": 2.6979791014692625, + "grad_norm": 1.5461429357528687, + "learning_rate": 5.033681642178964e-06, + "loss": 0.3592, + "step": 305190 + }, + { + "epoch": 2.6980675047295746, + "grad_norm": 4.08279275894165, + "learning_rate": 5.032208254507094e-06, + "loss": 0.576, + "step": 305200 + }, + { + "epoch": 2.6981559079898867, + "grad_norm": 2.918729066848755, + "learning_rate": 5.030734866835223e-06, + "loss": 0.3436, + "step": 305210 + }, + { + "epoch": 2.698244311250199, + "grad_norm": 3.2693448066711426, + "learning_rate": 5.029261479163352e-06, + "loss": 0.4849, + "step": 305220 + }, + { + "epoch": 2.698332714510511, + "grad_norm": 1.3926005363464355, + "learning_rate": 5.027788091491481e-06, + "loss": 0.4586, + "step": 305230 + }, + { + "epoch": 2.6984211177708235, + "grad_norm": 2.1644041538238525, + "learning_rate": 5.02631470381961e-06, + "loss": 0.5951, + "step": 305240 + }, + { + "epoch": 2.6985095210311356, + "grad_norm": 2.614445447921753, + "learning_rate": 5.02484131614774e-06, + "loss": 0.5497, + "step": 305250 + }, + { + "epoch": 2.6985979242914477, + "grad_norm": 4.083572864532471, + "learning_rate": 5.023367928475869e-06, + "loss": 0.4505, + "step": 305260 + }, + { + "epoch": 2.6986863275517603, + "grad_norm": 8.443065643310547, + "learning_rate": 5.021894540803999e-06, + "loss": 0.4937, + "step": 305270 + }, + { + "epoch": 2.6987747308120724, + "grad_norm": 4.084976673126221, + "learning_rate": 5.020421153132128e-06, + "loss": 0.4106, + "step": 305280 + }, + { + "epoch": 2.6988631340723845, + "grad_norm": 1.364749789237976, + "learning_rate": 5.018947765460257e-06, + "loss": 0.4436, + "step": 305290 + }, + { + "epoch": 2.6989515373326967, + "grad_norm": 4.1475677490234375, + "learning_rate": 5.0174743777883864e-06, + "loss": 0.5816, + "step": 305300 + }, + { + "epoch": 2.6990399405930092, + "grad_norm": 6.451510906219482, + "learning_rate": 5.016000990116516e-06, + "loss": 0.5806, + "step": 305310 + }, + { + "epoch": 2.6991283438533213, + "grad_norm": 5.912315368652344, + "learning_rate": 5.014527602444645e-06, + "loss": 0.486, + "step": 305320 + }, + { + "epoch": 2.6992167471136335, + "grad_norm": 2.5851171016693115, + "learning_rate": 5.013054214772774e-06, + "loss": 0.3807, + "step": 305330 + }, + { + "epoch": 2.699305150373946, + "grad_norm": 1.0022609233856201, + "learning_rate": 5.011580827100903e-06, + "loss": 0.4576, + "step": 305340 + }, + { + "epoch": 2.699393553634258, + "grad_norm": 3.3389413356781006, + "learning_rate": 5.0101074394290324e-06, + "loss": 0.4194, + "step": 305350 + }, + { + "epoch": 2.6994819568945703, + "grad_norm": 10.668707847595215, + "learning_rate": 5.0086340517571625e-06, + "loss": 0.5842, + "step": 305360 + }, + { + "epoch": 2.6995703601548824, + "grad_norm": 2.6948225498199463, + "learning_rate": 5.007160664085292e-06, + "loss": 0.505, + "step": 305370 + }, + { + "epoch": 2.6996587634151945, + "grad_norm": 4.195977210998535, + "learning_rate": 5.005687276413421e-06, + "loss": 0.4165, + "step": 305380 + }, + { + "epoch": 2.699747166675507, + "grad_norm": 5.1476359367370605, + "learning_rate": 5.00421388874155e-06, + "loss": 0.5212, + "step": 305390 + }, + { + "epoch": 2.699835569935819, + "grad_norm": 0.7674698829650879, + "learning_rate": 5.002740501069679e-06, + "loss": 0.5943, + "step": 305400 + }, + { + "epoch": 2.6999239731961313, + "grad_norm": 3.4698004722595215, + "learning_rate": 5.001267113397809e-06, + "loss": 0.5079, + "step": 305410 + }, + { + "epoch": 2.700012376456444, + "grad_norm": 2.1887943744659424, + "learning_rate": 4.9997937257259385e-06, + "loss": 0.5143, + "step": 305420 + }, + { + "epoch": 2.700100779716756, + "grad_norm": 0.9462206363677979, + "learning_rate": 4.998320338054068e-06, + "loss": 0.4475, + "step": 305430 + }, + { + "epoch": 2.700189182977068, + "grad_norm": 3.1105103492736816, + "learning_rate": 4.996846950382197e-06, + "loss": 0.5685, + "step": 305440 + }, + { + "epoch": 2.7002775862373802, + "grad_norm": 3.1628613471984863, + "learning_rate": 4.995373562710327e-06, + "loss": 0.5831, + "step": 305450 + }, + { + "epoch": 2.700365989497693, + "grad_norm": 8.302790641784668, + "learning_rate": 4.993900175038456e-06, + "loss": 0.5974, + "step": 305460 + }, + { + "epoch": 2.700454392758005, + "grad_norm": 7.648561477661133, + "learning_rate": 4.992426787366585e-06, + "loss": 0.4969, + "step": 305470 + }, + { + "epoch": 2.700542796018317, + "grad_norm": 3.627429723739624, + "learning_rate": 4.990953399694715e-06, + "loss": 0.5177, + "step": 305480 + }, + { + "epoch": 2.7006311992786296, + "grad_norm": 4.932847023010254, + "learning_rate": 4.989480012022844e-06, + "loss": 0.5555, + "step": 305490 + }, + { + "epoch": 2.7007196025389417, + "grad_norm": 8.044657707214355, + "learning_rate": 4.988006624350973e-06, + "loss": 0.5556, + "step": 305500 + }, + { + "epoch": 2.700808005799254, + "grad_norm": 5.339785575866699, + "learning_rate": 4.986533236679102e-06, + "loss": 0.4733, + "step": 305510 + }, + { + "epoch": 2.700896409059566, + "grad_norm": 3.3689916133880615, + "learning_rate": 4.985059849007231e-06, + "loss": 0.518, + "step": 305520 + }, + { + "epoch": 2.700984812319878, + "grad_norm": 5.937283992767334, + "learning_rate": 4.983586461335361e-06, + "loss": 0.6059, + "step": 305530 + }, + { + "epoch": 2.7010732155801906, + "grad_norm": 2.47560715675354, + "learning_rate": 4.98211307366349e-06, + "loss": 0.5639, + "step": 305540 + }, + { + "epoch": 2.7011616188405028, + "grad_norm": 3.758863925933838, + "learning_rate": 4.98063968599162e-06, + "loss": 0.4839, + "step": 305550 + }, + { + "epoch": 2.7012500221008153, + "grad_norm": 9.420431137084961, + "learning_rate": 4.979166298319749e-06, + "loss": 0.4313, + "step": 305560 + }, + { + "epoch": 2.7013384253611274, + "grad_norm": 3.308715581893921, + "learning_rate": 4.977692910647878e-06, + "loss": 0.5164, + "step": 305570 + }, + { + "epoch": 2.7014268286214396, + "grad_norm": 2.8270397186279297, + "learning_rate": 4.9762195229760075e-06, + "loss": 0.5489, + "step": 305580 + }, + { + "epoch": 2.7015152318817517, + "grad_norm": 1.293128490447998, + "learning_rate": 4.974746135304137e-06, + "loss": 0.433, + "step": 305590 + }, + { + "epoch": 2.701603635142064, + "grad_norm": 15.91666030883789, + "learning_rate": 4.973272747632266e-06, + "loss": 0.5511, + "step": 305600 + }, + { + "epoch": 2.7016920384023764, + "grad_norm": 20.885360717773438, + "learning_rate": 4.971799359960395e-06, + "loss": 0.4675, + "step": 305610 + }, + { + "epoch": 2.7017804416626885, + "grad_norm": 1.8178162574768066, + "learning_rate": 4.970325972288524e-06, + "loss": 0.4423, + "step": 305620 + }, + { + "epoch": 2.7018688449230006, + "grad_norm": 1.8399800062179565, + "learning_rate": 4.968852584616654e-06, + "loss": 0.3304, + "step": 305630 + }, + { + "epoch": 2.701957248183313, + "grad_norm": 3.349952459335327, + "learning_rate": 4.9673791969447835e-06, + "loss": 0.6609, + "step": 305640 + }, + { + "epoch": 2.7020456514436253, + "grad_norm": 7.2338480949401855, + "learning_rate": 4.965905809272913e-06, + "loss": 0.3814, + "step": 305650 + }, + { + "epoch": 2.7021340547039374, + "grad_norm": 6.654005527496338, + "learning_rate": 4.964432421601043e-06, + "loss": 0.5555, + "step": 305660 + }, + { + "epoch": 2.7022224579642495, + "grad_norm": 3.160285472869873, + "learning_rate": 4.962959033929172e-06, + "loss": 0.6137, + "step": 305670 + }, + { + "epoch": 2.702310861224562, + "grad_norm": 2.390618324279785, + "learning_rate": 4.961485646257301e-06, + "loss": 0.4562, + "step": 305680 + }, + { + "epoch": 2.702399264484874, + "grad_norm": 6.356454849243164, + "learning_rate": 4.96001225858543e-06, + "loss": 0.5888, + "step": 305690 + }, + { + "epoch": 2.7024876677451863, + "grad_norm": 2.8771305084228516, + "learning_rate": 4.9585388709135596e-06, + "loss": 0.4826, + "step": 305700 + }, + { + "epoch": 2.702576071005499, + "grad_norm": 5.236699104309082, + "learning_rate": 4.957065483241689e-06, + "loss": 0.5524, + "step": 305710 + }, + { + "epoch": 2.702664474265811, + "grad_norm": 1.8030084371566772, + "learning_rate": 4.955592095569818e-06, + "loss": 0.3407, + "step": 305720 + }, + { + "epoch": 2.702752877526123, + "grad_norm": 1.272134780883789, + "learning_rate": 4.954118707897948e-06, + "loss": 0.5417, + "step": 305730 + }, + { + "epoch": 2.7028412807864353, + "grad_norm": 9.863807678222656, + "learning_rate": 4.952645320226077e-06, + "loss": 0.7363, + "step": 305740 + }, + { + "epoch": 2.7029296840467474, + "grad_norm": 3.6512272357940674, + "learning_rate": 4.951171932554206e-06, + "loss": 0.506, + "step": 305750 + }, + { + "epoch": 2.70301808730706, + "grad_norm": 5.944571018218994, + "learning_rate": 4.949698544882336e-06, + "loss": 0.6016, + "step": 305760 + }, + { + "epoch": 2.703106490567372, + "grad_norm": 1.6403510570526123, + "learning_rate": 4.948225157210465e-06, + "loss": 0.5679, + "step": 305770 + }, + { + "epoch": 2.7031948938276846, + "grad_norm": 2.3833608627319336, + "learning_rate": 4.946751769538594e-06, + "loss": 0.3951, + "step": 305780 + }, + { + "epoch": 2.7032832970879968, + "grad_norm": 1.573286771774292, + "learning_rate": 4.945278381866723e-06, + "loss": 0.4797, + "step": 305790 + }, + { + "epoch": 2.703371700348309, + "grad_norm": 26.004886627197266, + "learning_rate": 4.943804994194852e-06, + "loss": 0.4511, + "step": 305800 + }, + { + "epoch": 2.703460103608621, + "grad_norm": 3.2662312984466553, + "learning_rate": 4.942331606522982e-06, + "loss": 0.5872, + "step": 305810 + }, + { + "epoch": 2.703548506868933, + "grad_norm": 1.4175481796264648, + "learning_rate": 4.940858218851111e-06, + "loss": 0.6189, + "step": 305820 + }, + { + "epoch": 2.7036369101292457, + "grad_norm": 2.1821706295013428, + "learning_rate": 4.939384831179241e-06, + "loss": 0.6229, + "step": 305830 + }, + { + "epoch": 2.703725313389558, + "grad_norm": 4.832816123962402, + "learning_rate": 4.93791144350737e-06, + "loss": 0.509, + "step": 305840 + }, + { + "epoch": 2.70381371664987, + "grad_norm": 2.705495834350586, + "learning_rate": 4.936438055835499e-06, + "loss": 0.5591, + "step": 305850 + }, + { + "epoch": 2.7039021199101825, + "grad_norm": 12.247186660766602, + "learning_rate": 4.9349646681636285e-06, + "loss": 0.5673, + "step": 305860 + }, + { + "epoch": 2.7039905231704946, + "grad_norm": 3.0930089950561523, + "learning_rate": 4.9334912804917585e-06, + "loss": 0.4236, + "step": 305870 + }, + { + "epoch": 2.7040789264308067, + "grad_norm": 2.013141632080078, + "learning_rate": 4.932017892819888e-06, + "loss": 0.5694, + "step": 305880 + }, + { + "epoch": 2.704167329691119, + "grad_norm": 5.1823883056640625, + "learning_rate": 4.930544505148017e-06, + "loss": 0.6192, + "step": 305890 + }, + { + "epoch": 2.7042557329514314, + "grad_norm": 3.5799787044525146, + "learning_rate": 4.929071117476146e-06, + "loss": 0.467, + "step": 305900 + }, + { + "epoch": 2.7043441362117435, + "grad_norm": 1.2546056509017944, + "learning_rate": 4.927597729804275e-06, + "loss": 0.5253, + "step": 305910 + }, + { + "epoch": 2.7044325394720556, + "grad_norm": 3.1605639457702637, + "learning_rate": 4.926124342132405e-06, + "loss": 0.4209, + "step": 305920 + }, + { + "epoch": 2.704520942732368, + "grad_norm": 3.8731274604797363, + "learning_rate": 4.9246509544605346e-06, + "loss": 0.5198, + "step": 305930 + }, + { + "epoch": 2.7046093459926803, + "grad_norm": 4.296875476837158, + "learning_rate": 4.923177566788664e-06, + "loss": 0.4282, + "step": 305940 + }, + { + "epoch": 2.7046977492529924, + "grad_norm": 3.9034345149993896, + "learning_rate": 4.921704179116793e-06, + "loss": 0.4995, + "step": 305950 + }, + { + "epoch": 2.7047861525133046, + "grad_norm": 3.7499492168426514, + "learning_rate": 4.920230791444922e-06, + "loss": 0.5717, + "step": 305960 + }, + { + "epoch": 2.7048745557736167, + "grad_norm": 10.720964431762695, + "learning_rate": 4.918757403773051e-06, + "loss": 0.5537, + "step": 305970 + }, + { + "epoch": 2.7049629590339292, + "grad_norm": 2.3999290466308594, + "learning_rate": 4.917284016101181e-06, + "loss": 0.5079, + "step": 305980 + }, + { + "epoch": 2.7050513622942414, + "grad_norm": 0.8681366443634033, + "learning_rate": 4.91581062842931e-06, + "loss": 0.4181, + "step": 305990 + }, + { + "epoch": 2.7051397655545535, + "grad_norm": 3.6031997203826904, + "learning_rate": 4.914337240757439e-06, + "loss": 0.6615, + "step": 306000 + }, + { + "epoch": 2.705228168814866, + "grad_norm": 12.544000625610352, + "learning_rate": 4.912863853085569e-06, + "loss": 0.5652, + "step": 306010 + }, + { + "epoch": 2.705316572075178, + "grad_norm": 4.743675231933594, + "learning_rate": 4.911390465413698e-06, + "loss": 0.569, + "step": 306020 + }, + { + "epoch": 2.7054049753354903, + "grad_norm": 3.093681812286377, + "learning_rate": 4.9099170777418274e-06, + "loss": 0.558, + "step": 306030 + }, + { + "epoch": 2.7054933785958024, + "grad_norm": 8.65329647064209, + "learning_rate": 4.908443690069957e-06, + "loss": 0.5148, + "step": 306040 + }, + { + "epoch": 2.705581781856115, + "grad_norm": 2.3796122074127197, + "learning_rate": 4.906970302398086e-06, + "loss": 0.457, + "step": 306050 + }, + { + "epoch": 2.705670185116427, + "grad_norm": 17.594324111938477, + "learning_rate": 4.905496914726215e-06, + "loss": 0.4564, + "step": 306060 + }, + { + "epoch": 2.705758588376739, + "grad_norm": 3.580618381500244, + "learning_rate": 4.904023527054344e-06, + "loss": 0.5084, + "step": 306070 + }, + { + "epoch": 2.705846991637052, + "grad_norm": 2.404059648513794, + "learning_rate": 4.9025501393824734e-06, + "loss": 0.4313, + "step": 306080 + }, + { + "epoch": 2.705935394897364, + "grad_norm": 1.2665178775787354, + "learning_rate": 4.9010767517106035e-06, + "loss": 0.498, + "step": 306090 + }, + { + "epoch": 2.706023798157676, + "grad_norm": 5.3060994148254395, + "learning_rate": 4.899603364038733e-06, + "loss": 0.4676, + "step": 306100 + }, + { + "epoch": 2.706112201417988, + "grad_norm": 4.784682273864746, + "learning_rate": 4.898129976366862e-06, + "loss": 0.4689, + "step": 306110 + }, + { + "epoch": 2.7062006046783003, + "grad_norm": 11.771567344665527, + "learning_rate": 4.896656588694992e-06, + "loss": 0.6151, + "step": 306120 + }, + { + "epoch": 2.706289007938613, + "grad_norm": 1.203100323677063, + "learning_rate": 4.895183201023121e-06, + "loss": 0.5119, + "step": 306130 + }, + { + "epoch": 2.706377411198925, + "grad_norm": 6.187151908874512, + "learning_rate": 4.89370981335125e-06, + "loss": 0.477, + "step": 306140 + }, + { + "epoch": 2.7064658144592375, + "grad_norm": 0.8258188366889954, + "learning_rate": 4.8922364256793795e-06, + "loss": 0.5444, + "step": 306150 + }, + { + "epoch": 2.7065542177195496, + "grad_norm": 1.7154414653778076, + "learning_rate": 4.890763038007509e-06, + "loss": 0.5421, + "step": 306160 + }, + { + "epoch": 2.7066426209798617, + "grad_norm": 0.45739251375198364, + "learning_rate": 4.889289650335638e-06, + "loss": 0.3872, + "step": 306170 + }, + { + "epoch": 2.706731024240174, + "grad_norm": 1.3779003620147705, + "learning_rate": 4.887816262663767e-06, + "loss": 0.4266, + "step": 306180 + }, + { + "epoch": 2.706819427500486, + "grad_norm": 12.026466369628906, + "learning_rate": 4.886342874991896e-06, + "loss": 0.3653, + "step": 306190 + }, + { + "epoch": 2.7069078307607986, + "grad_norm": 3.3614554405212402, + "learning_rate": 4.884869487320026e-06, + "loss": 0.5908, + "step": 306200 + }, + { + "epoch": 2.7069962340211107, + "grad_norm": 2.8692736625671387, + "learning_rate": 4.883396099648156e-06, + "loss": 0.5444, + "step": 306210 + }, + { + "epoch": 2.707084637281423, + "grad_norm": 1.4308867454528809, + "learning_rate": 4.881922711976285e-06, + "loss": 0.4759, + "step": 306220 + }, + { + "epoch": 2.7071730405417354, + "grad_norm": 1.6482778787612915, + "learning_rate": 4.880449324304414e-06, + "loss": 0.4827, + "step": 306230 + }, + { + "epoch": 2.7072614438020475, + "grad_norm": 3.5661845207214355, + "learning_rate": 4.878975936632543e-06, + "loss": 0.5241, + "step": 306240 + }, + { + "epoch": 2.7073498470623596, + "grad_norm": 6.495173931121826, + "learning_rate": 4.877502548960672e-06, + "loss": 0.561, + "step": 306250 + }, + { + "epoch": 2.7074382503226717, + "grad_norm": 3.3596203327178955, + "learning_rate": 4.876029161288802e-06, + "loss": 0.4786, + "step": 306260 + }, + { + "epoch": 2.7075266535829843, + "grad_norm": 1.5276426076889038, + "learning_rate": 4.874555773616931e-06, + "loss": 0.3697, + "step": 306270 + }, + { + "epoch": 2.7076150568432964, + "grad_norm": 6.529496192932129, + "learning_rate": 4.87308238594506e-06, + "loss": 0.7047, + "step": 306280 + }, + { + "epoch": 2.7077034601036085, + "grad_norm": 0.7560232877731323, + "learning_rate": 4.87160899827319e-06, + "loss": 0.6394, + "step": 306290 + }, + { + "epoch": 2.707791863363921, + "grad_norm": 10.134455680847168, + "learning_rate": 4.870135610601319e-06, + "loss": 0.5713, + "step": 306300 + }, + { + "epoch": 2.707880266624233, + "grad_norm": 3.7337677478790283, + "learning_rate": 4.8686622229294484e-06, + "loss": 0.4746, + "step": 306310 + }, + { + "epoch": 2.7079686698845453, + "grad_norm": 1.2915356159210205, + "learning_rate": 4.867188835257578e-06, + "loss": 0.5917, + "step": 306320 + }, + { + "epoch": 2.7080570731448574, + "grad_norm": 1.3934372663497925, + "learning_rate": 4.865715447585707e-06, + "loss": 0.5105, + "step": 306330 + }, + { + "epoch": 2.7081454764051696, + "grad_norm": 2.1479454040527344, + "learning_rate": 4.864242059913837e-06, + "loss": 0.4849, + "step": 306340 + }, + { + "epoch": 2.708233879665482, + "grad_norm": 1.6645327806472778, + "learning_rate": 4.862768672241966e-06, + "loss": 0.4761, + "step": 306350 + }, + { + "epoch": 2.7083222829257942, + "grad_norm": 8.331875801086426, + "learning_rate": 4.861295284570095e-06, + "loss": 0.4989, + "step": 306360 + }, + { + "epoch": 2.708410686186107, + "grad_norm": 1.1463252305984497, + "learning_rate": 4.8598218968982245e-06, + "loss": 0.5567, + "step": 306370 + }, + { + "epoch": 2.708499089446419, + "grad_norm": 21.594709396362305, + "learning_rate": 4.858348509226354e-06, + "loss": 0.6648, + "step": 306380 + }, + { + "epoch": 2.708587492706731, + "grad_norm": 3.7359704971313477, + "learning_rate": 4.856875121554484e-06, + "loss": 0.5587, + "step": 306390 + }, + { + "epoch": 2.708675895967043, + "grad_norm": 2.0848639011383057, + "learning_rate": 4.855401733882613e-06, + "loss": 0.5706, + "step": 306400 + }, + { + "epoch": 2.7087642992273553, + "grad_norm": 3.728639841079712, + "learning_rate": 4.853928346210742e-06, + "loss": 0.5289, + "step": 306410 + }, + { + "epoch": 2.708852702487668, + "grad_norm": 1.6024951934814453, + "learning_rate": 4.852454958538871e-06, + "loss": 0.5353, + "step": 306420 + }, + { + "epoch": 2.70894110574798, + "grad_norm": 2.4785218238830566, + "learning_rate": 4.8509815708670006e-06, + "loss": 0.608, + "step": 306430 + }, + { + "epoch": 2.709029509008292, + "grad_norm": 2.474334239959717, + "learning_rate": 4.84950818319513e-06, + "loss": 0.5358, + "step": 306440 + }, + { + "epoch": 2.7091179122686047, + "grad_norm": 3.9277865886688232, + "learning_rate": 4.848034795523259e-06, + "loss": 0.5399, + "step": 306450 + }, + { + "epoch": 2.7092063155289168, + "grad_norm": 1.3416361808776855, + "learning_rate": 4.846561407851388e-06, + "loss": 0.4357, + "step": 306460 + }, + { + "epoch": 2.709294718789229, + "grad_norm": 1.0916869640350342, + "learning_rate": 4.845088020179517e-06, + "loss": 0.5097, + "step": 306470 + }, + { + "epoch": 2.709383122049541, + "grad_norm": 7.868186950683594, + "learning_rate": 4.843614632507647e-06, + "loss": 0.4639, + "step": 306480 + }, + { + "epoch": 2.7094715253098536, + "grad_norm": 3.072694778442383, + "learning_rate": 4.842141244835777e-06, + "loss": 0.4245, + "step": 306490 + }, + { + "epoch": 2.7095599285701657, + "grad_norm": 4.582046031951904, + "learning_rate": 4.840667857163906e-06, + "loss": 0.6677, + "step": 306500 + }, + { + "epoch": 2.709648331830478, + "grad_norm": 1.4977056980133057, + "learning_rate": 4.839194469492035e-06, + "loss": 0.5797, + "step": 306510 + }, + { + "epoch": 2.7097367350907904, + "grad_norm": 2.487002372741699, + "learning_rate": 4.837721081820164e-06, + "loss": 0.4914, + "step": 306520 + }, + { + "epoch": 2.7098251383511025, + "grad_norm": 1.7460474967956543, + "learning_rate": 4.836247694148293e-06, + "loss": 0.4944, + "step": 306530 + }, + { + "epoch": 2.7099135416114146, + "grad_norm": 1.7526038885116577, + "learning_rate": 4.834774306476423e-06, + "loss": 0.4828, + "step": 306540 + }, + { + "epoch": 2.7100019448717267, + "grad_norm": 2.07344126701355, + "learning_rate": 4.833300918804552e-06, + "loss": 0.4774, + "step": 306550 + }, + { + "epoch": 2.710090348132039, + "grad_norm": 3.209352493286133, + "learning_rate": 4.831827531132682e-06, + "loss": 0.5152, + "step": 306560 + }, + { + "epoch": 2.7101787513923514, + "grad_norm": 4.8180317878723145, + "learning_rate": 4.830354143460811e-06, + "loss": 0.4835, + "step": 306570 + }, + { + "epoch": 2.7102671546526635, + "grad_norm": 6.858720779418945, + "learning_rate": 4.82888075578894e-06, + "loss": 0.4776, + "step": 306580 + }, + { + "epoch": 2.7103555579129757, + "grad_norm": 1.3836487531661987, + "learning_rate": 4.82740736811707e-06, + "loss": 0.5212, + "step": 306590 + }, + { + "epoch": 2.7104439611732882, + "grad_norm": 3.3431246280670166, + "learning_rate": 4.8259339804451995e-06, + "loss": 0.616, + "step": 306600 + }, + { + "epoch": 2.7105323644336003, + "grad_norm": 20.798389434814453, + "learning_rate": 4.824460592773329e-06, + "loss": 0.4492, + "step": 306610 + }, + { + "epoch": 2.7106207676939125, + "grad_norm": 4.936484336853027, + "learning_rate": 4.822987205101458e-06, + "loss": 0.4801, + "step": 306620 + }, + { + "epoch": 2.7107091709542246, + "grad_norm": 1.257745623588562, + "learning_rate": 4.821513817429587e-06, + "loss": 0.5498, + "step": 306630 + }, + { + "epoch": 2.710797574214537, + "grad_norm": 4.747032642364502, + "learning_rate": 4.820040429757716e-06, + "loss": 0.6687, + "step": 306640 + }, + { + "epoch": 2.7108859774748493, + "grad_norm": 9.273734092712402, + "learning_rate": 4.8185670420858455e-06, + "loss": 0.5747, + "step": 306650 + }, + { + "epoch": 2.7109743807351614, + "grad_norm": 5.110916614532471, + "learning_rate": 4.817093654413975e-06, + "loss": 0.4279, + "step": 306660 + }, + { + "epoch": 2.711062783995474, + "grad_norm": 3.457000255584717, + "learning_rate": 4.815620266742105e-06, + "loss": 0.5066, + "step": 306670 + }, + { + "epoch": 2.711151187255786, + "grad_norm": 2.347907781600952, + "learning_rate": 4.814146879070234e-06, + "loss": 0.5659, + "step": 306680 + }, + { + "epoch": 2.711239590516098, + "grad_norm": 4.281490325927734, + "learning_rate": 4.812673491398363e-06, + "loss": 0.4541, + "step": 306690 + }, + { + "epoch": 2.7113279937764103, + "grad_norm": 1.6621384620666504, + "learning_rate": 4.811200103726492e-06, + "loss": 0.6795, + "step": 306700 + }, + { + "epoch": 2.711416397036723, + "grad_norm": 2.77822208404541, + "learning_rate": 4.8097267160546216e-06, + "loss": 0.415, + "step": 306710 + }, + { + "epoch": 2.711504800297035, + "grad_norm": 4.5654826164245605, + "learning_rate": 4.808253328382751e-06, + "loss": 0.453, + "step": 306720 + }, + { + "epoch": 2.711593203557347, + "grad_norm": 3.5075597763061523, + "learning_rate": 4.80677994071088e-06, + "loss": 0.5014, + "step": 306730 + }, + { + "epoch": 2.7116816068176597, + "grad_norm": 1.828865647315979, + "learning_rate": 4.805306553039009e-06, + "loss": 0.3548, + "step": 306740 + }, + { + "epoch": 2.711770010077972, + "grad_norm": 4.245809555053711, + "learning_rate": 4.803833165367138e-06, + "loss": 0.5095, + "step": 306750 + }, + { + "epoch": 2.711858413338284, + "grad_norm": 3.298287868499756, + "learning_rate": 4.802359777695268e-06, + "loss": 0.6879, + "step": 306760 + }, + { + "epoch": 2.711946816598596, + "grad_norm": 3.272134304046631, + "learning_rate": 4.800886390023398e-06, + "loss": 0.5441, + "step": 306770 + }, + { + "epoch": 2.712035219858908, + "grad_norm": 6.342971324920654, + "learning_rate": 4.799413002351527e-06, + "loss": 0.4898, + "step": 306780 + }, + { + "epoch": 2.7121236231192207, + "grad_norm": 2.6247565746307373, + "learning_rate": 4.797939614679656e-06, + "loss": 0.5331, + "step": 306790 + }, + { + "epoch": 2.712212026379533, + "grad_norm": 3.309504985809326, + "learning_rate": 4.796466227007785e-06, + "loss": 0.4143, + "step": 306800 + }, + { + "epoch": 2.712300429639845, + "grad_norm": 14.298945426940918, + "learning_rate": 4.794992839335915e-06, + "loss": 0.5207, + "step": 306810 + }, + { + "epoch": 2.7123888329001575, + "grad_norm": 2.02609920501709, + "learning_rate": 4.7935194516640445e-06, + "loss": 0.4904, + "step": 306820 + }, + { + "epoch": 2.7124772361604697, + "grad_norm": 6.414795398712158, + "learning_rate": 4.792046063992174e-06, + "loss": 0.4586, + "step": 306830 + }, + { + "epoch": 2.7125656394207818, + "grad_norm": 11.552958488464355, + "learning_rate": 4.790572676320303e-06, + "loss": 0.5119, + "step": 306840 + }, + { + "epoch": 2.712654042681094, + "grad_norm": 6.555601119995117, + "learning_rate": 4.789099288648432e-06, + "loss": 0.5574, + "step": 306850 + }, + { + "epoch": 2.7127424459414065, + "grad_norm": 0.9532069563865662, + "learning_rate": 4.787625900976562e-06, + "loss": 0.4049, + "step": 306860 + }, + { + "epoch": 2.7128308492017186, + "grad_norm": 3.2684450149536133, + "learning_rate": 4.786152513304691e-06, + "loss": 0.4947, + "step": 306870 + }, + { + "epoch": 2.7129192524620307, + "grad_norm": 8.185196876525879, + "learning_rate": 4.7846791256328205e-06, + "loss": 0.4928, + "step": 306880 + }, + { + "epoch": 2.7130076557223433, + "grad_norm": 9.50861644744873, + "learning_rate": 4.78320573796095e-06, + "loss": 0.523, + "step": 306890 + }, + { + "epoch": 2.7130960589826554, + "grad_norm": 6.288022994995117, + "learning_rate": 4.781732350289079e-06, + "loss": 0.5577, + "step": 306900 + }, + { + "epoch": 2.7131844622429675, + "grad_norm": 1.9895803928375244, + "learning_rate": 4.780258962617208e-06, + "loss": 0.5566, + "step": 306910 + }, + { + "epoch": 2.7132728655032796, + "grad_norm": 7.056479454040527, + "learning_rate": 4.778785574945337e-06, + "loss": 0.5205, + "step": 306920 + }, + { + "epoch": 2.7133612687635917, + "grad_norm": 3.698262929916382, + "learning_rate": 4.7773121872734665e-06, + "loss": 0.5562, + "step": 306930 + }, + { + "epoch": 2.7134496720239043, + "grad_norm": 1.8502739667892456, + "learning_rate": 4.775838799601596e-06, + "loss": 0.4677, + "step": 306940 + }, + { + "epoch": 2.7135380752842164, + "grad_norm": 1.3407901525497437, + "learning_rate": 4.774365411929726e-06, + "loss": 0.5905, + "step": 306950 + }, + { + "epoch": 2.713626478544529, + "grad_norm": 3.219248056411743, + "learning_rate": 4.772892024257855e-06, + "loss": 0.6008, + "step": 306960 + }, + { + "epoch": 2.713714881804841, + "grad_norm": 5.953789234161377, + "learning_rate": 4.771418636585984e-06, + "loss": 0.6233, + "step": 306970 + }, + { + "epoch": 2.7138032850651532, + "grad_norm": 2.2064707279205322, + "learning_rate": 4.769945248914113e-06, + "loss": 0.5077, + "step": 306980 + }, + { + "epoch": 2.7138916883254653, + "grad_norm": 1.46455979347229, + "learning_rate": 4.768471861242243e-06, + "loss": 0.5492, + "step": 306990 + }, + { + "epoch": 2.7139800915857775, + "grad_norm": 7.648400783538818, + "learning_rate": 4.766998473570372e-06, + "loss": 0.4108, + "step": 307000 + }, + { + "epoch": 2.71406849484609, + "grad_norm": 1.5661089420318604, + "learning_rate": 4.765525085898501e-06, + "loss": 0.4898, + "step": 307010 + }, + { + "epoch": 2.714156898106402, + "grad_norm": 2.9874460697174072, + "learning_rate": 4.76405169822663e-06, + "loss": 0.4531, + "step": 307020 + }, + { + "epoch": 2.7142453013667143, + "grad_norm": 6.541859149932861, + "learning_rate": 4.76257831055476e-06, + "loss": 0.5821, + "step": 307030 + }, + { + "epoch": 2.714333704627027, + "grad_norm": 5.66218900680542, + "learning_rate": 4.7611049228828894e-06, + "loss": 0.6815, + "step": 307040 + }, + { + "epoch": 2.714422107887339, + "grad_norm": 16.782941818237305, + "learning_rate": 4.7596315352110195e-06, + "loss": 0.4133, + "step": 307050 + }, + { + "epoch": 2.714510511147651, + "grad_norm": 2.2009572982788086, + "learning_rate": 4.758158147539149e-06, + "loss": 0.53, + "step": 307060 + }, + { + "epoch": 2.714598914407963, + "grad_norm": 2.8232693672180176, + "learning_rate": 4.756684759867278e-06, + "loss": 0.5151, + "step": 307070 + }, + { + "epoch": 2.7146873176682758, + "grad_norm": 2.6369144916534424, + "learning_rate": 4.755211372195407e-06, + "loss": 0.4441, + "step": 307080 + }, + { + "epoch": 2.714775720928588, + "grad_norm": 9.173973083496094, + "learning_rate": 4.753737984523536e-06, + "loss": 0.5239, + "step": 307090 + }, + { + "epoch": 2.7148641241889, + "grad_norm": 1.0194803476333618, + "learning_rate": 4.7522645968516655e-06, + "loss": 0.4232, + "step": 307100 + }, + { + "epoch": 2.7149525274492126, + "grad_norm": 3.3199355602264404, + "learning_rate": 4.750791209179795e-06, + "loss": 0.4979, + "step": 307110 + }, + { + "epoch": 2.7150409307095247, + "grad_norm": 1.2627931833267212, + "learning_rate": 4.749317821507924e-06, + "loss": 0.4891, + "step": 307120 + }, + { + "epoch": 2.715129333969837, + "grad_norm": 1.9573094844818115, + "learning_rate": 4.747844433836053e-06, + "loss": 0.5306, + "step": 307130 + }, + { + "epoch": 2.715217737230149, + "grad_norm": 5.026834964752197, + "learning_rate": 4.746371046164183e-06, + "loss": 0.5719, + "step": 307140 + }, + { + "epoch": 2.715306140490461, + "grad_norm": 17.811717987060547, + "learning_rate": 4.744897658492312e-06, + "loss": 0.5767, + "step": 307150 + }, + { + "epoch": 2.7153945437507736, + "grad_norm": 5.955763816833496, + "learning_rate": 4.7434242708204415e-06, + "loss": 0.5204, + "step": 307160 + }, + { + "epoch": 2.7154829470110857, + "grad_norm": 4.444065570831299, + "learning_rate": 4.741950883148571e-06, + "loss": 0.5707, + "step": 307170 + }, + { + "epoch": 2.715571350271398, + "grad_norm": 11.609379768371582, + "learning_rate": 4.7404774954767e-06, + "loss": 0.6439, + "step": 307180 + }, + { + "epoch": 2.7156597535317104, + "grad_norm": 11.784463882446289, + "learning_rate": 4.739004107804829e-06, + "loss": 0.458, + "step": 307190 + }, + { + "epoch": 2.7157481567920225, + "grad_norm": 6.817044734954834, + "learning_rate": 4.737530720132958e-06, + "loss": 0.5322, + "step": 307200 + }, + { + "epoch": 2.7158365600523346, + "grad_norm": 3.7703089714050293, + "learning_rate": 4.7360573324610875e-06, + "loss": 0.4178, + "step": 307210 + }, + { + "epoch": 2.7159249633126468, + "grad_norm": 3.222608804702759, + "learning_rate": 4.734583944789217e-06, + "loss": 0.5135, + "step": 307220 + }, + { + "epoch": 2.7160133665729593, + "grad_norm": 4.323358535766602, + "learning_rate": 4.733110557117347e-06, + "loss": 0.5228, + "step": 307230 + }, + { + "epoch": 2.7161017698332715, + "grad_norm": 4.7871904373168945, + "learning_rate": 4.731637169445476e-06, + "loss": 0.5353, + "step": 307240 + }, + { + "epoch": 2.7161901730935836, + "grad_norm": 1.3348867893218994, + "learning_rate": 4.730163781773605e-06, + "loss": 0.4238, + "step": 307250 + }, + { + "epoch": 2.716278576353896, + "grad_norm": 4.134737014770508, + "learning_rate": 4.728690394101734e-06, + "loss": 0.6109, + "step": 307260 + }, + { + "epoch": 2.7163669796142083, + "grad_norm": 6.454626083374023, + "learning_rate": 4.7272170064298644e-06, + "loss": 0.467, + "step": 307270 + }, + { + "epoch": 2.7164553828745204, + "grad_norm": 3.7531886100769043, + "learning_rate": 4.725743618757994e-06, + "loss": 0.5297, + "step": 307280 + }, + { + "epoch": 2.7165437861348325, + "grad_norm": 7.208763122558594, + "learning_rate": 4.724270231086123e-06, + "loss": 0.7434, + "step": 307290 + }, + { + "epoch": 2.716632189395145, + "grad_norm": 0.770551323890686, + "learning_rate": 4.722796843414252e-06, + "loss": 0.574, + "step": 307300 + }, + { + "epoch": 2.716720592655457, + "grad_norm": 6.403985500335693, + "learning_rate": 4.721323455742381e-06, + "loss": 0.6184, + "step": 307310 + }, + { + "epoch": 2.7168089959157693, + "grad_norm": 3.245232343673706, + "learning_rate": 4.719850068070511e-06, + "loss": 0.4296, + "step": 307320 + }, + { + "epoch": 2.716897399176082, + "grad_norm": 1.9715083837509155, + "learning_rate": 4.7183766803986405e-06, + "loss": 0.57, + "step": 307330 + }, + { + "epoch": 2.716985802436394, + "grad_norm": 3.480679512023926, + "learning_rate": 4.71690329272677e-06, + "loss": 0.4826, + "step": 307340 + }, + { + "epoch": 2.717074205696706, + "grad_norm": 2.974647045135498, + "learning_rate": 4.715429905054899e-06, + "loss": 0.4271, + "step": 307350 + }, + { + "epoch": 2.717162608957018, + "grad_norm": 2.0715465545654297, + "learning_rate": 4.713956517383028e-06, + "loss": 0.4615, + "step": 307360 + }, + { + "epoch": 2.7172510122173303, + "grad_norm": 1.320739984512329, + "learning_rate": 4.712483129711157e-06, + "loss": 0.5553, + "step": 307370 + }, + { + "epoch": 2.717339415477643, + "grad_norm": 12.477655410766602, + "learning_rate": 4.7110097420392865e-06, + "loss": 0.5725, + "step": 307380 + }, + { + "epoch": 2.717427818737955, + "grad_norm": 5.050520896911621, + "learning_rate": 4.709536354367416e-06, + "loss": 0.5362, + "step": 307390 + }, + { + "epoch": 2.717516221998267, + "grad_norm": 1.8381577730178833, + "learning_rate": 4.708062966695545e-06, + "loss": 0.5894, + "step": 307400 + }, + { + "epoch": 2.7176046252585797, + "grad_norm": 2.6618881225585938, + "learning_rate": 4.706589579023674e-06, + "loss": 0.6063, + "step": 307410 + }, + { + "epoch": 2.717693028518892, + "grad_norm": 7.886129379272461, + "learning_rate": 4.705116191351804e-06, + "loss": 0.5764, + "step": 307420 + }, + { + "epoch": 2.717781431779204, + "grad_norm": 2.4612836837768555, + "learning_rate": 4.703642803679933e-06, + "loss": 0.4582, + "step": 307430 + }, + { + "epoch": 2.717869835039516, + "grad_norm": 1.5649223327636719, + "learning_rate": 4.7021694160080626e-06, + "loss": 0.4379, + "step": 307440 + }, + { + "epoch": 2.7179582382998286, + "grad_norm": 2.0884270668029785, + "learning_rate": 4.700696028336192e-06, + "loss": 0.523, + "step": 307450 + }, + { + "epoch": 2.7180466415601408, + "grad_norm": 10.133622169494629, + "learning_rate": 4.699222640664321e-06, + "loss": 0.4465, + "step": 307460 + }, + { + "epoch": 2.718135044820453, + "grad_norm": 1.0647538900375366, + "learning_rate": 4.69774925299245e-06, + "loss": 0.5256, + "step": 307470 + }, + { + "epoch": 2.7182234480807654, + "grad_norm": 1.916248083114624, + "learning_rate": 4.696275865320579e-06, + "loss": 0.517, + "step": 307480 + }, + { + "epoch": 2.7183118513410776, + "grad_norm": 1.9882888793945312, + "learning_rate": 4.694802477648709e-06, + "loss": 0.4499, + "step": 307490 + }, + { + "epoch": 2.7184002546013897, + "grad_norm": 7.151304721832275, + "learning_rate": 4.693329089976839e-06, + "loss": 0.6311, + "step": 307500 + }, + { + "epoch": 2.718488657861702, + "grad_norm": 2.910839319229126, + "learning_rate": 4.691855702304968e-06, + "loss": 0.5411, + "step": 307510 + }, + { + "epoch": 2.718577061122014, + "grad_norm": 2.6628551483154297, + "learning_rate": 4.690382314633098e-06, + "loss": 0.4974, + "step": 307520 + }, + { + "epoch": 2.7186654643823265, + "grad_norm": 3.6052024364471436, + "learning_rate": 4.688908926961227e-06, + "loss": 0.5675, + "step": 307530 + }, + { + "epoch": 2.7187538676426386, + "grad_norm": 2.1737895011901855, + "learning_rate": 4.687435539289356e-06, + "loss": 0.5268, + "step": 307540 + }, + { + "epoch": 2.718842270902951, + "grad_norm": 5.653605937957764, + "learning_rate": 4.6859621516174855e-06, + "loss": 0.4924, + "step": 307550 + }, + { + "epoch": 2.7189306741632633, + "grad_norm": 2.9534454345703125, + "learning_rate": 4.684488763945615e-06, + "loss": 0.4421, + "step": 307560 + }, + { + "epoch": 2.7190190774235754, + "grad_norm": 3.192934036254883, + "learning_rate": 4.683015376273744e-06, + "loss": 0.5503, + "step": 307570 + }, + { + "epoch": 2.7191074806838875, + "grad_norm": 3.0693092346191406, + "learning_rate": 4.681541988601873e-06, + "loss": 0.4407, + "step": 307580 + }, + { + "epoch": 2.7191958839441996, + "grad_norm": 3.1779017448425293, + "learning_rate": 4.680068600930002e-06, + "loss": 0.576, + "step": 307590 + }, + { + "epoch": 2.719284287204512, + "grad_norm": 4.402355194091797, + "learning_rate": 4.678595213258132e-06, + "loss": 0.5934, + "step": 307600 + }, + { + "epoch": 2.7193726904648243, + "grad_norm": 2.3391706943511963, + "learning_rate": 4.6771218255862615e-06, + "loss": 0.4606, + "step": 307610 + }, + { + "epoch": 2.7194610937251364, + "grad_norm": 2.8908207416534424, + "learning_rate": 4.675648437914391e-06, + "loss": 0.5238, + "step": 307620 + }, + { + "epoch": 2.719549496985449, + "grad_norm": 3.138995409011841, + "learning_rate": 4.67417505024252e-06, + "loss": 0.4848, + "step": 307630 + }, + { + "epoch": 2.719637900245761, + "grad_norm": 4.864989757537842, + "learning_rate": 4.672701662570649e-06, + "loss": 0.5006, + "step": 307640 + }, + { + "epoch": 2.7197263035060733, + "grad_norm": 2.580040216445923, + "learning_rate": 4.671228274898778e-06, + "loss": 0.4417, + "step": 307650 + }, + { + "epoch": 2.7198147067663854, + "grad_norm": 3.338360548019409, + "learning_rate": 4.6697548872269075e-06, + "loss": 0.4593, + "step": 307660 + }, + { + "epoch": 2.719903110026698, + "grad_norm": 2.7992045879364014, + "learning_rate": 4.668281499555037e-06, + "loss": 0.376, + "step": 307670 + }, + { + "epoch": 2.71999151328701, + "grad_norm": 1.3117823600769043, + "learning_rate": 4.666808111883166e-06, + "loss": 0.4634, + "step": 307680 + }, + { + "epoch": 2.720079916547322, + "grad_norm": 3.2029244899749756, + "learning_rate": 4.665334724211295e-06, + "loss": 0.5841, + "step": 307690 + }, + { + "epoch": 2.7201683198076347, + "grad_norm": 3.664480209350586, + "learning_rate": 4.663861336539425e-06, + "loss": 0.6106, + "step": 307700 + }, + { + "epoch": 2.720256723067947, + "grad_norm": 8.204463005065918, + "learning_rate": 4.662387948867554e-06, + "loss": 0.5321, + "step": 307710 + }, + { + "epoch": 2.720345126328259, + "grad_norm": 7.873061180114746, + "learning_rate": 4.6609145611956836e-06, + "loss": 0.5992, + "step": 307720 + }, + { + "epoch": 2.720433529588571, + "grad_norm": 6.086936950683594, + "learning_rate": 4.659441173523813e-06, + "loss": 0.4903, + "step": 307730 + }, + { + "epoch": 2.720521932848883, + "grad_norm": 3.047480583190918, + "learning_rate": 4.657967785851943e-06, + "loss": 0.6562, + "step": 307740 + }, + { + "epoch": 2.720610336109196, + "grad_norm": 3.5589864253997803, + "learning_rate": 4.656494398180072e-06, + "loss": 0.376, + "step": 307750 + }, + { + "epoch": 2.720698739369508, + "grad_norm": 4.177574157714844, + "learning_rate": 4.655021010508201e-06, + "loss": 0.5474, + "step": 307760 + }, + { + "epoch": 2.72078714262982, + "grad_norm": 0.9950869679450989, + "learning_rate": 4.6535476228363304e-06, + "loss": 0.5276, + "step": 307770 + }, + { + "epoch": 2.7208755458901326, + "grad_norm": 4.3952507972717285, + "learning_rate": 4.65207423516446e-06, + "loss": 0.4208, + "step": 307780 + }, + { + "epoch": 2.7209639491504447, + "grad_norm": 4.07802152633667, + "learning_rate": 4.65060084749259e-06, + "loss": 0.4422, + "step": 307790 + }, + { + "epoch": 2.721052352410757, + "grad_norm": 3.2651007175445557, + "learning_rate": 4.649127459820719e-06, + "loss": 0.6646, + "step": 307800 + }, + { + "epoch": 2.721140755671069, + "grad_norm": 3.7057371139526367, + "learning_rate": 4.647654072148848e-06, + "loss": 0.5863, + "step": 307810 + }, + { + "epoch": 2.7212291589313815, + "grad_norm": 7.1900715827941895, + "learning_rate": 4.646180684476977e-06, + "loss": 0.568, + "step": 307820 + }, + { + "epoch": 2.7213175621916936, + "grad_norm": 2.384350061416626, + "learning_rate": 4.6447072968051065e-06, + "loss": 0.5165, + "step": 307830 + }, + { + "epoch": 2.7214059654520057, + "grad_norm": 3.8018267154693604, + "learning_rate": 4.643233909133236e-06, + "loss": 0.578, + "step": 307840 + }, + { + "epoch": 2.7214943687123183, + "grad_norm": 2.8463051319122314, + "learning_rate": 4.641760521461365e-06, + "loss": 0.3843, + "step": 307850 + }, + { + "epoch": 2.7215827719726304, + "grad_norm": 10.359753608703613, + "learning_rate": 4.640287133789494e-06, + "loss": 0.413, + "step": 307860 + }, + { + "epoch": 2.7216711752329426, + "grad_norm": 6.021923542022705, + "learning_rate": 4.638813746117623e-06, + "loss": 0.467, + "step": 307870 + }, + { + "epoch": 2.7217595784932547, + "grad_norm": 3.2788777351379395, + "learning_rate": 4.637340358445753e-06, + "loss": 0.4989, + "step": 307880 + }, + { + "epoch": 2.7218479817535672, + "grad_norm": 5.772153854370117, + "learning_rate": 4.6358669707738825e-06, + "loss": 0.4277, + "step": 307890 + }, + { + "epoch": 2.7219363850138794, + "grad_norm": 4.52836799621582, + "learning_rate": 4.634393583102012e-06, + "loss": 0.4946, + "step": 307900 + }, + { + "epoch": 2.7220247882741915, + "grad_norm": 2.0275895595550537, + "learning_rate": 4.632920195430141e-06, + "loss": 0.5336, + "step": 307910 + }, + { + "epoch": 2.722113191534504, + "grad_norm": 2.508457660675049, + "learning_rate": 4.63144680775827e-06, + "loss": 0.4822, + "step": 307920 + }, + { + "epoch": 2.722201594794816, + "grad_norm": 4.619881629943848, + "learning_rate": 4.629973420086399e-06, + "loss": 0.5353, + "step": 307930 + }, + { + "epoch": 2.7222899980551283, + "grad_norm": 3.319636821746826, + "learning_rate": 4.6285000324145285e-06, + "loss": 0.5308, + "step": 307940 + }, + { + "epoch": 2.7223784013154404, + "grad_norm": 2.532325506210327, + "learning_rate": 4.627026644742658e-06, + "loss": 0.5033, + "step": 307950 + }, + { + "epoch": 2.7224668045757525, + "grad_norm": 1.8139959573745728, + "learning_rate": 4.625553257070788e-06, + "loss": 0.5984, + "step": 307960 + }, + { + "epoch": 2.722555207836065, + "grad_norm": 7.2776994705200195, + "learning_rate": 4.624079869398917e-06, + "loss": 0.6517, + "step": 307970 + }, + { + "epoch": 2.722643611096377, + "grad_norm": 5.847015857696533, + "learning_rate": 4.622606481727046e-06, + "loss": 0.4846, + "step": 307980 + }, + { + "epoch": 2.7227320143566893, + "grad_norm": 8.451143264770508, + "learning_rate": 4.621133094055176e-06, + "loss": 0.6566, + "step": 307990 + }, + { + "epoch": 2.722820417617002, + "grad_norm": 4.490633964538574, + "learning_rate": 4.6196597063833054e-06, + "loss": 0.4093, + "step": 308000 + }, + { + "epoch": 2.722908820877314, + "grad_norm": 3.3057940006256104, + "learning_rate": 4.618186318711435e-06, + "loss": 0.5925, + "step": 308010 + }, + { + "epoch": 2.722997224137626, + "grad_norm": 1.505448341369629, + "learning_rate": 4.616712931039564e-06, + "loss": 0.5651, + "step": 308020 + }, + { + "epoch": 2.7230856273979382, + "grad_norm": 6.78761100769043, + "learning_rate": 4.615239543367693e-06, + "loss": 0.4643, + "step": 308030 + }, + { + "epoch": 2.723174030658251, + "grad_norm": 2.9143340587615967, + "learning_rate": 4.613766155695822e-06, + "loss": 0.4414, + "step": 308040 + }, + { + "epoch": 2.723262433918563, + "grad_norm": 8.659344673156738, + "learning_rate": 4.6122927680239514e-06, + "loss": 0.5754, + "step": 308050 + }, + { + "epoch": 2.723350837178875, + "grad_norm": 1.6104087829589844, + "learning_rate": 4.610819380352081e-06, + "loss": 0.6062, + "step": 308060 + }, + { + "epoch": 2.7234392404391876, + "grad_norm": 1.1534006595611572, + "learning_rate": 4.609345992680211e-06, + "loss": 0.4604, + "step": 308070 + }, + { + "epoch": 2.7235276436994997, + "grad_norm": 1.646970510482788, + "learning_rate": 4.60787260500834e-06, + "loss": 0.4878, + "step": 308080 + }, + { + "epoch": 2.723616046959812, + "grad_norm": 1.5671457052230835, + "learning_rate": 4.606399217336469e-06, + "loss": 0.5572, + "step": 308090 + }, + { + "epoch": 2.723704450220124, + "grad_norm": 1.273639440536499, + "learning_rate": 4.604925829664598e-06, + "loss": 0.4321, + "step": 308100 + }, + { + "epoch": 2.723792853480436, + "grad_norm": 3.3727028369903564, + "learning_rate": 4.6034524419927275e-06, + "loss": 0.5473, + "step": 308110 + }, + { + "epoch": 2.7238812567407487, + "grad_norm": 3.2422690391540527, + "learning_rate": 4.601979054320857e-06, + "loss": 0.5945, + "step": 308120 + }, + { + "epoch": 2.7239696600010608, + "grad_norm": 2.21574068069458, + "learning_rate": 4.600505666648986e-06, + "loss": 0.5026, + "step": 308130 + }, + { + "epoch": 2.7240580632613733, + "grad_norm": 4.853679656982422, + "learning_rate": 4.599032278977115e-06, + "loss": 0.5556, + "step": 308140 + }, + { + "epoch": 2.7241464665216855, + "grad_norm": 3.434802770614624, + "learning_rate": 4.597558891305244e-06, + "loss": 0.4708, + "step": 308150 + }, + { + "epoch": 2.7242348697819976, + "grad_norm": 24.03137969970703, + "learning_rate": 4.596085503633374e-06, + "loss": 0.5478, + "step": 308160 + }, + { + "epoch": 2.7243232730423097, + "grad_norm": 14.209810256958008, + "learning_rate": 4.5946121159615035e-06, + "loss": 0.6419, + "step": 308170 + }, + { + "epoch": 2.724411676302622, + "grad_norm": 5.826821804046631, + "learning_rate": 4.593138728289633e-06, + "loss": 0.5437, + "step": 308180 + }, + { + "epoch": 2.7245000795629344, + "grad_norm": 4.302548885345459, + "learning_rate": 4.591665340617762e-06, + "loss": 0.4975, + "step": 308190 + }, + { + "epoch": 2.7245884828232465, + "grad_norm": 3.1304078102111816, + "learning_rate": 4.590191952945892e-06, + "loss": 0.5285, + "step": 308200 + }, + { + "epoch": 2.7246768860835586, + "grad_norm": 3.8407771587371826, + "learning_rate": 4.588718565274021e-06, + "loss": 0.4807, + "step": 308210 + }, + { + "epoch": 2.724765289343871, + "grad_norm": 0.7773879766464233, + "learning_rate": 4.58724517760215e-06, + "loss": 0.4865, + "step": 308220 + }, + { + "epoch": 2.7248536926041833, + "grad_norm": 5.732974052429199, + "learning_rate": 4.58577178993028e-06, + "loss": 0.6093, + "step": 308230 + }, + { + "epoch": 2.7249420958644954, + "grad_norm": 7.179583549499512, + "learning_rate": 4.584298402258409e-06, + "loss": 0.5478, + "step": 308240 + }, + { + "epoch": 2.7250304991248075, + "grad_norm": 8.003015518188477, + "learning_rate": 4.582825014586538e-06, + "loss": 0.5608, + "step": 308250 + }, + { + "epoch": 2.72511890238512, + "grad_norm": 4.7416181564331055, + "learning_rate": 4.581351626914668e-06, + "loss": 0.4322, + "step": 308260 + }, + { + "epoch": 2.7252073056454322, + "grad_norm": 3.707581043243408, + "learning_rate": 4.579878239242797e-06, + "loss": 0.6628, + "step": 308270 + }, + { + "epoch": 2.7252957089057444, + "grad_norm": 7.204919338226318, + "learning_rate": 4.5784048515709265e-06, + "loss": 0.4721, + "step": 308280 + }, + { + "epoch": 2.725384112166057, + "grad_norm": 4.720126152038574, + "learning_rate": 4.576931463899056e-06, + "loss": 0.536, + "step": 308290 + }, + { + "epoch": 2.725472515426369, + "grad_norm": 2.1542885303497314, + "learning_rate": 4.575458076227185e-06, + "loss": 0.5095, + "step": 308300 + }, + { + "epoch": 2.725560918686681, + "grad_norm": 4.358950138092041, + "learning_rate": 4.573984688555314e-06, + "loss": 0.4377, + "step": 308310 + }, + { + "epoch": 2.7256493219469933, + "grad_norm": 4.417023181915283, + "learning_rate": 4.572511300883443e-06, + "loss": 0.5161, + "step": 308320 + }, + { + "epoch": 2.7257377252073054, + "grad_norm": 2.548159122467041, + "learning_rate": 4.5710379132115725e-06, + "loss": 0.4315, + "step": 308330 + }, + { + "epoch": 2.725826128467618, + "grad_norm": 4.15629243850708, + "learning_rate": 4.569564525539702e-06, + "loss": 0.5582, + "step": 308340 + }, + { + "epoch": 2.72591453172793, + "grad_norm": 4.223159313201904, + "learning_rate": 4.568091137867832e-06, + "loss": 0.5345, + "step": 308350 + }, + { + "epoch": 2.7260029349882426, + "grad_norm": 3.272223472595215, + "learning_rate": 4.566617750195961e-06, + "loss": 0.473, + "step": 308360 + }, + { + "epoch": 2.7260913382485548, + "grad_norm": 4.485598564147949, + "learning_rate": 4.56514436252409e-06, + "loss": 0.4893, + "step": 308370 + }, + { + "epoch": 2.726179741508867, + "grad_norm": 2.322117567062378, + "learning_rate": 4.563670974852219e-06, + "loss": 0.5502, + "step": 308380 + }, + { + "epoch": 2.726268144769179, + "grad_norm": 2.087444543838501, + "learning_rate": 4.5621975871803485e-06, + "loss": 0.4495, + "step": 308390 + }, + { + "epoch": 2.726356548029491, + "grad_norm": 10.942352294921875, + "learning_rate": 4.560724199508478e-06, + "loss": 0.5951, + "step": 308400 + }, + { + "epoch": 2.7264449512898037, + "grad_norm": 1.6002633571624756, + "learning_rate": 4.559250811836607e-06, + "loss": 0.5184, + "step": 308410 + }, + { + "epoch": 2.726533354550116, + "grad_norm": 2.15305757522583, + "learning_rate": 4.557777424164737e-06, + "loss": 0.4818, + "step": 308420 + }, + { + "epoch": 2.726621757810428, + "grad_norm": 2.1972765922546387, + "learning_rate": 4.556304036492866e-06, + "loss": 0.5429, + "step": 308430 + }, + { + "epoch": 2.7267101610707405, + "grad_norm": 8.116562843322754, + "learning_rate": 4.554830648820995e-06, + "loss": 0.4316, + "step": 308440 + }, + { + "epoch": 2.7267985643310526, + "grad_norm": 2.3947181701660156, + "learning_rate": 4.553357261149125e-06, + "loss": 0.6356, + "step": 308450 + }, + { + "epoch": 2.7268869675913647, + "grad_norm": 2.81655216217041, + "learning_rate": 4.551883873477255e-06, + "loss": 0.6837, + "step": 308460 + }, + { + "epoch": 2.726975370851677, + "grad_norm": 2.6851742267608643, + "learning_rate": 4.550410485805384e-06, + "loss": 0.5018, + "step": 308470 + }, + { + "epoch": 2.7270637741119894, + "grad_norm": 10.4423246383667, + "learning_rate": 4.548937098133513e-06, + "loss": 0.4861, + "step": 308480 + }, + { + "epoch": 2.7271521773723015, + "grad_norm": 7.208816051483154, + "learning_rate": 4.547463710461642e-06, + "loss": 0.4862, + "step": 308490 + }, + { + "epoch": 2.7272405806326137, + "grad_norm": 2.3396527767181396, + "learning_rate": 4.545990322789771e-06, + "loss": 0.5684, + "step": 308500 + }, + { + "epoch": 2.727328983892926, + "grad_norm": 4.623304843902588, + "learning_rate": 4.544516935117901e-06, + "loss": 0.5608, + "step": 308510 + }, + { + "epoch": 2.7274173871532383, + "grad_norm": 5.48579216003418, + "learning_rate": 4.54304354744603e-06, + "loss": 0.4797, + "step": 308520 + }, + { + "epoch": 2.7275057904135505, + "grad_norm": 1.4830689430236816, + "learning_rate": 4.541570159774159e-06, + "loss": 0.5404, + "step": 308530 + }, + { + "epoch": 2.7275941936738626, + "grad_norm": 9.875204086303711, + "learning_rate": 4.540096772102289e-06, + "loss": 0.3894, + "step": 308540 + }, + { + "epoch": 2.7276825969341747, + "grad_norm": 12.222371101379395, + "learning_rate": 4.538623384430418e-06, + "loss": 0.4913, + "step": 308550 + }, + { + "epoch": 2.7277710001944873, + "grad_norm": 4.658017635345459, + "learning_rate": 4.5371499967585475e-06, + "loss": 0.4788, + "step": 308560 + }, + { + "epoch": 2.7278594034547994, + "grad_norm": 11.471343994140625, + "learning_rate": 4.535676609086677e-06, + "loss": 0.4469, + "step": 308570 + }, + { + "epoch": 2.7279478067151115, + "grad_norm": 2.4746885299682617, + "learning_rate": 4.534203221414806e-06, + "loss": 0.5121, + "step": 308580 + }, + { + "epoch": 2.728036209975424, + "grad_norm": 6.388030052185059, + "learning_rate": 4.532729833742935e-06, + "loss": 0.4725, + "step": 308590 + }, + { + "epoch": 2.728124613235736, + "grad_norm": 11.925979614257812, + "learning_rate": 4.531256446071064e-06, + "loss": 0.533, + "step": 308600 + }, + { + "epoch": 2.7282130164960483, + "grad_norm": 5.331080913543701, + "learning_rate": 4.5297830583991935e-06, + "loss": 0.4579, + "step": 308610 + }, + { + "epoch": 2.7283014197563604, + "grad_norm": 1.576750636100769, + "learning_rate": 4.528309670727323e-06, + "loss": 0.53, + "step": 308620 + }, + { + "epoch": 2.728389823016673, + "grad_norm": 9.610515594482422, + "learning_rate": 4.526836283055453e-06, + "loss": 0.5796, + "step": 308630 + }, + { + "epoch": 2.728478226276985, + "grad_norm": 1.8436787128448486, + "learning_rate": 4.525362895383582e-06, + "loss": 0.4973, + "step": 308640 + }, + { + "epoch": 2.7285666295372972, + "grad_norm": 2.0000622272491455, + "learning_rate": 4.523889507711711e-06, + "loss": 0.4954, + "step": 308650 + }, + { + "epoch": 2.72865503279761, + "grad_norm": 2.324134111404419, + "learning_rate": 4.52241612003984e-06, + "loss": 0.4706, + "step": 308660 + }, + { + "epoch": 2.728743436057922, + "grad_norm": 4.807832717895508, + "learning_rate": 4.52094273236797e-06, + "loss": 0.4748, + "step": 308670 + }, + { + "epoch": 2.728831839318234, + "grad_norm": 2.9453125, + "learning_rate": 4.5194693446960996e-06, + "loss": 0.4885, + "step": 308680 + }, + { + "epoch": 2.728920242578546, + "grad_norm": 2.786518096923828, + "learning_rate": 4.517995957024229e-06, + "loss": 0.6277, + "step": 308690 + }, + { + "epoch": 2.7290086458388583, + "grad_norm": 3.8035213947296143, + "learning_rate": 4.516522569352358e-06, + "loss": 0.511, + "step": 308700 + }, + { + "epoch": 2.729097049099171, + "grad_norm": 0.9898993968963623, + "learning_rate": 4.515049181680487e-06, + "loss": 0.509, + "step": 308710 + }, + { + "epoch": 2.729185452359483, + "grad_norm": 4.414059162139893, + "learning_rate": 4.513575794008616e-06, + "loss": 0.6032, + "step": 308720 + }, + { + "epoch": 2.7292738556197955, + "grad_norm": 3.2360787391662598, + "learning_rate": 4.5121024063367464e-06, + "loss": 0.4355, + "step": 308730 + }, + { + "epoch": 2.7293622588801076, + "grad_norm": 28.36467933654785, + "learning_rate": 4.510629018664876e-06, + "loss": 0.5423, + "step": 308740 + }, + { + "epoch": 2.7294506621404198, + "grad_norm": 3.132004737854004, + "learning_rate": 4.509155630993005e-06, + "loss": 0.5846, + "step": 308750 + }, + { + "epoch": 2.729539065400732, + "grad_norm": 1.3512994050979614, + "learning_rate": 4.507682243321134e-06, + "loss": 0.499, + "step": 308760 + }, + { + "epoch": 2.729627468661044, + "grad_norm": 2.743884801864624, + "learning_rate": 4.506208855649263e-06, + "loss": 0.5143, + "step": 308770 + }, + { + "epoch": 2.7297158719213566, + "grad_norm": 1.607499122619629, + "learning_rate": 4.5047354679773924e-06, + "loss": 0.5079, + "step": 308780 + }, + { + "epoch": 2.7298042751816687, + "grad_norm": 8.794159889221191, + "learning_rate": 4.503262080305522e-06, + "loss": 0.6017, + "step": 308790 + }, + { + "epoch": 2.729892678441981, + "grad_norm": 11.583786964416504, + "learning_rate": 4.501788692633651e-06, + "loss": 0.4681, + "step": 308800 + }, + { + "epoch": 2.7299810817022934, + "grad_norm": 2.3600728511810303, + "learning_rate": 4.50031530496178e-06, + "loss": 0.5514, + "step": 308810 + }, + { + "epoch": 2.7300694849626055, + "grad_norm": 3.192382574081421, + "learning_rate": 4.49884191728991e-06, + "loss": 0.4644, + "step": 308820 + }, + { + "epoch": 2.7301578882229176, + "grad_norm": 6.264559268951416, + "learning_rate": 4.497368529618039e-06, + "loss": 0.5533, + "step": 308830 + }, + { + "epoch": 2.7302462914832297, + "grad_norm": 6.074662685394287, + "learning_rate": 4.4958951419461685e-06, + "loss": 0.5104, + "step": 308840 + }, + { + "epoch": 2.7303346947435423, + "grad_norm": 2.009617328643799, + "learning_rate": 4.494421754274298e-06, + "loss": 0.449, + "step": 308850 + }, + { + "epoch": 2.7304230980038544, + "grad_norm": 10.792549133300781, + "learning_rate": 4.492948366602427e-06, + "loss": 0.5431, + "step": 308860 + }, + { + "epoch": 2.7305115012641665, + "grad_norm": 1.311892032623291, + "learning_rate": 4.491474978930556e-06, + "loss": 0.5645, + "step": 308870 + }, + { + "epoch": 2.730599904524479, + "grad_norm": 4.740288734436035, + "learning_rate": 4.490001591258685e-06, + "loss": 0.4325, + "step": 308880 + }, + { + "epoch": 2.730688307784791, + "grad_norm": 4.378518581390381, + "learning_rate": 4.488528203586815e-06, + "loss": 0.5666, + "step": 308890 + }, + { + "epoch": 2.7307767110451033, + "grad_norm": 3.4715986251831055, + "learning_rate": 4.4870548159149445e-06, + "loss": 0.4945, + "step": 308900 + }, + { + "epoch": 2.7308651143054155, + "grad_norm": 5.706183433532715, + "learning_rate": 4.485581428243074e-06, + "loss": 0.4994, + "step": 308910 + }, + { + "epoch": 2.7309535175657276, + "grad_norm": 1.3120520114898682, + "learning_rate": 4.484108040571204e-06, + "loss": 0.5784, + "step": 308920 + }, + { + "epoch": 2.73104192082604, + "grad_norm": 5.862990379333496, + "learning_rate": 4.482634652899333e-06, + "loss": 0.48, + "step": 308930 + }, + { + "epoch": 2.7311303240863523, + "grad_norm": 5.470860958099365, + "learning_rate": 4.481161265227462e-06, + "loss": 0.53, + "step": 308940 + }, + { + "epoch": 2.731218727346665, + "grad_norm": 4.290978908538818, + "learning_rate": 4.479687877555591e-06, + "loss": 0.4669, + "step": 308950 + }, + { + "epoch": 2.731307130606977, + "grad_norm": 1.2046207189559937, + "learning_rate": 4.478214489883721e-06, + "loss": 0.4629, + "step": 308960 + }, + { + "epoch": 2.731395533867289, + "grad_norm": 1.9210127592086792, + "learning_rate": 4.47674110221185e-06, + "loss": 0.5607, + "step": 308970 + }, + { + "epoch": 2.731483937127601, + "grad_norm": 26.72182273864746, + "learning_rate": 4.475267714539979e-06, + "loss": 0.5871, + "step": 308980 + }, + { + "epoch": 2.7315723403879133, + "grad_norm": 0.6122230291366577, + "learning_rate": 4.473794326868108e-06, + "loss": 0.4265, + "step": 308990 + }, + { + "epoch": 2.731660743648226, + "grad_norm": 13.199585914611816, + "learning_rate": 4.472320939196237e-06, + "loss": 0.418, + "step": 309000 + }, + { + "epoch": 2.731749146908538, + "grad_norm": 1.4211528301239014, + "learning_rate": 4.4708475515243674e-06, + "loss": 0.3987, + "step": 309010 + }, + { + "epoch": 2.73183755016885, + "grad_norm": 4.280864238739014, + "learning_rate": 4.469374163852497e-06, + "loss": 0.6201, + "step": 309020 + }, + { + "epoch": 2.7319259534291627, + "grad_norm": 2.015130043029785, + "learning_rate": 4.467900776180626e-06, + "loss": 0.504, + "step": 309030 + }, + { + "epoch": 2.732014356689475, + "grad_norm": 11.65554141998291, + "learning_rate": 4.466427388508755e-06, + "loss": 0.5956, + "step": 309040 + }, + { + "epoch": 2.732102759949787, + "grad_norm": 3.1664018630981445, + "learning_rate": 4.464954000836884e-06, + "loss": 0.6243, + "step": 309050 + }, + { + "epoch": 2.732191163210099, + "grad_norm": 2.682587146759033, + "learning_rate": 4.4634806131650134e-06, + "loss": 0.55, + "step": 309060 + }, + { + "epoch": 2.7322795664704116, + "grad_norm": 3.1288442611694336, + "learning_rate": 4.462007225493143e-06, + "loss": 0.5154, + "step": 309070 + }, + { + "epoch": 2.7323679697307237, + "grad_norm": 5.173380374908447, + "learning_rate": 4.460533837821272e-06, + "loss": 0.4827, + "step": 309080 + }, + { + "epoch": 2.732456372991036, + "grad_norm": 5.918436527252197, + "learning_rate": 4.459060450149401e-06, + "loss": 0.5398, + "step": 309090 + }, + { + "epoch": 2.7325447762513484, + "grad_norm": 1.9713407754898071, + "learning_rate": 4.457587062477531e-06, + "loss": 0.4766, + "step": 309100 + }, + { + "epoch": 2.7326331795116605, + "grad_norm": 3.5990195274353027, + "learning_rate": 4.45611367480566e-06, + "loss": 0.5059, + "step": 309110 + }, + { + "epoch": 2.7327215827719726, + "grad_norm": 2.0139029026031494, + "learning_rate": 4.4546402871337895e-06, + "loss": 0.5835, + "step": 309120 + }, + { + "epoch": 2.7328099860322848, + "grad_norm": 3.647632598876953, + "learning_rate": 4.453166899461919e-06, + "loss": 0.5227, + "step": 309130 + }, + { + "epoch": 2.732898389292597, + "grad_norm": 3.2084431648254395, + "learning_rate": 4.451693511790049e-06, + "loss": 0.5216, + "step": 309140 + }, + { + "epoch": 2.7329867925529094, + "grad_norm": 4.266152858734131, + "learning_rate": 4.450220124118178e-06, + "loss": 0.5448, + "step": 309150 + }, + { + "epoch": 2.7330751958132216, + "grad_norm": 2.388455390930176, + "learning_rate": 4.448746736446307e-06, + "loss": 0.4537, + "step": 309160 + }, + { + "epoch": 2.7331635990735337, + "grad_norm": 5.222240447998047, + "learning_rate": 4.447273348774436e-06, + "loss": 0.4998, + "step": 309170 + }, + { + "epoch": 2.7332520023338462, + "grad_norm": 5.603033065795898, + "learning_rate": 4.4457999611025656e-06, + "loss": 0.5147, + "step": 309180 + }, + { + "epoch": 2.7333404055941584, + "grad_norm": 9.6637544631958, + "learning_rate": 4.444326573430696e-06, + "loss": 0.4091, + "step": 309190 + }, + { + "epoch": 2.7334288088544705, + "grad_norm": 3.92553973197937, + "learning_rate": 4.442853185758825e-06, + "loss": 0.5727, + "step": 309200 + }, + { + "epoch": 2.7335172121147826, + "grad_norm": 5.789129257202148, + "learning_rate": 4.441379798086954e-06, + "loss": 0.5884, + "step": 309210 + }, + { + "epoch": 2.733605615375095, + "grad_norm": 4.118077278137207, + "learning_rate": 4.439906410415083e-06, + "loss": 0.4593, + "step": 309220 + }, + { + "epoch": 2.7336940186354073, + "grad_norm": 2.3017101287841797, + "learning_rate": 4.438433022743212e-06, + "loss": 0.5552, + "step": 309230 + }, + { + "epoch": 2.7337824218957194, + "grad_norm": 5.293052673339844, + "learning_rate": 4.436959635071342e-06, + "loss": 0.4905, + "step": 309240 + }, + { + "epoch": 2.733870825156032, + "grad_norm": 3.0306174755096436, + "learning_rate": 4.435486247399471e-06, + "loss": 0.5087, + "step": 309250 + }, + { + "epoch": 2.733959228416344, + "grad_norm": 9.159019470214844, + "learning_rate": 4.4340128597276e-06, + "loss": 0.632, + "step": 309260 + }, + { + "epoch": 2.734047631676656, + "grad_norm": 2.0829122066497803, + "learning_rate": 4.432539472055729e-06, + "loss": 0.4679, + "step": 309270 + }, + { + "epoch": 2.7341360349369683, + "grad_norm": 2.8774375915527344, + "learning_rate": 4.431066084383858e-06, + "loss": 0.4937, + "step": 309280 + }, + { + "epoch": 2.7342244381972804, + "grad_norm": 11.86851978302002, + "learning_rate": 4.4295926967119885e-06, + "loss": 0.4594, + "step": 309290 + }, + { + "epoch": 2.734312841457593, + "grad_norm": 1.3080328702926636, + "learning_rate": 4.428119309040118e-06, + "loss": 0.4651, + "step": 309300 + }, + { + "epoch": 2.734401244717905, + "grad_norm": 3.6766040325164795, + "learning_rate": 4.426645921368247e-06, + "loss": 0.618, + "step": 309310 + }, + { + "epoch": 2.7344896479782177, + "grad_norm": 5.729684829711914, + "learning_rate": 4.425172533696376e-06, + "loss": 0.4391, + "step": 309320 + }, + { + "epoch": 2.73457805123853, + "grad_norm": 3.730123281478882, + "learning_rate": 4.423699146024505e-06, + "loss": 0.5082, + "step": 309330 + }, + { + "epoch": 2.734666454498842, + "grad_norm": 5.438342094421387, + "learning_rate": 4.4222257583526345e-06, + "loss": 0.454, + "step": 309340 + }, + { + "epoch": 2.734754857759154, + "grad_norm": 5.108583450317383, + "learning_rate": 4.420752370680764e-06, + "loss": 0.5747, + "step": 309350 + }, + { + "epoch": 2.734843261019466, + "grad_norm": 3.3494455814361572, + "learning_rate": 4.419278983008894e-06, + "loss": 0.5086, + "step": 309360 + }, + { + "epoch": 2.7349316642797787, + "grad_norm": 5.901094913482666, + "learning_rate": 4.417805595337023e-06, + "loss": 0.4409, + "step": 309370 + }, + { + "epoch": 2.735020067540091, + "grad_norm": 2.9476566314697266, + "learning_rate": 4.416332207665153e-06, + "loss": 0.5477, + "step": 309380 + }, + { + "epoch": 2.735108470800403, + "grad_norm": 3.5348947048187256, + "learning_rate": 4.414858819993282e-06, + "loss": 0.3757, + "step": 309390 + }, + { + "epoch": 2.7351968740607155, + "grad_norm": 3.197547435760498, + "learning_rate": 4.413385432321411e-06, + "loss": 0.5202, + "step": 309400 + }, + { + "epoch": 2.7352852773210277, + "grad_norm": 2.380253314971924, + "learning_rate": 4.4119120446495406e-06, + "loss": 0.5843, + "step": 309410 + }, + { + "epoch": 2.73537368058134, + "grad_norm": 11.694599151611328, + "learning_rate": 4.41043865697767e-06, + "loss": 0.4245, + "step": 309420 + }, + { + "epoch": 2.735462083841652, + "grad_norm": 0.948406457901001, + "learning_rate": 4.408965269305799e-06, + "loss": 0.3432, + "step": 309430 + }, + { + "epoch": 2.7355504871019645, + "grad_norm": 2.5637826919555664, + "learning_rate": 4.407491881633928e-06, + "loss": 0.5056, + "step": 309440 + }, + { + "epoch": 2.7356388903622766, + "grad_norm": 4.749460220336914, + "learning_rate": 4.406018493962057e-06, + "loss": 0.539, + "step": 309450 + }, + { + "epoch": 2.7357272936225887, + "grad_norm": 6.845168590545654, + "learning_rate": 4.4045451062901866e-06, + "loss": 0.665, + "step": 309460 + }, + { + "epoch": 2.7358156968829013, + "grad_norm": 3.7553021907806396, + "learning_rate": 4.403071718618317e-06, + "loss": 0.4797, + "step": 309470 + }, + { + "epoch": 2.7359041001432134, + "grad_norm": 5.21061897277832, + "learning_rate": 4.401598330946446e-06, + "loss": 0.5824, + "step": 309480 + }, + { + "epoch": 2.7359925034035255, + "grad_norm": 2.533268451690674, + "learning_rate": 4.400124943274575e-06, + "loss": 0.4471, + "step": 309490 + }, + { + "epoch": 2.7360809066638376, + "grad_norm": 2.6658082008361816, + "learning_rate": 4.398651555602704e-06, + "loss": 0.5217, + "step": 309500 + }, + { + "epoch": 2.7361693099241498, + "grad_norm": 2.0318522453308105, + "learning_rate": 4.397178167930833e-06, + "loss": 0.4773, + "step": 309510 + }, + { + "epoch": 2.7362577131844623, + "grad_norm": 1.8851032257080078, + "learning_rate": 4.395704780258963e-06, + "loss": 0.4538, + "step": 309520 + }, + { + "epoch": 2.7363461164447744, + "grad_norm": 4.529411792755127, + "learning_rate": 4.394231392587092e-06, + "loss": 0.5457, + "step": 309530 + }, + { + "epoch": 2.736434519705087, + "grad_norm": 27.861452102661133, + "learning_rate": 4.392758004915221e-06, + "loss": 0.5915, + "step": 309540 + }, + { + "epoch": 2.736522922965399, + "grad_norm": 1.0695983171463013, + "learning_rate": 4.39128461724335e-06, + "loss": 0.5334, + "step": 309550 + }, + { + "epoch": 2.7366113262257112, + "grad_norm": 6.138192653656006, + "learning_rate": 4.3898112295714794e-06, + "loss": 0.5246, + "step": 309560 + }, + { + "epoch": 2.7366997294860234, + "grad_norm": 6.642462253570557, + "learning_rate": 4.3883378418996095e-06, + "loss": 0.5525, + "step": 309570 + }, + { + "epoch": 2.7367881327463355, + "grad_norm": 6.904299259185791, + "learning_rate": 4.386864454227739e-06, + "loss": 0.4346, + "step": 309580 + }, + { + "epoch": 2.736876536006648, + "grad_norm": 18.40342903137207, + "learning_rate": 4.385391066555868e-06, + "loss": 0.5234, + "step": 309590 + }, + { + "epoch": 2.73696493926696, + "grad_norm": 7.367995262145996, + "learning_rate": 4.383917678883998e-06, + "loss": 0.594, + "step": 309600 + }, + { + "epoch": 2.7370533425272723, + "grad_norm": 5.50579833984375, + "learning_rate": 4.382444291212127e-06, + "loss": 0.5741, + "step": 309610 + }, + { + "epoch": 2.737141745787585, + "grad_norm": 3.1250879764556885, + "learning_rate": 4.380970903540256e-06, + "loss": 0.5841, + "step": 309620 + }, + { + "epoch": 2.737230149047897, + "grad_norm": 1.5239273309707642, + "learning_rate": 4.3794975158683855e-06, + "loss": 0.4778, + "step": 309630 + }, + { + "epoch": 2.737318552308209, + "grad_norm": 3.1652448177337646, + "learning_rate": 4.378024128196515e-06, + "loss": 0.4685, + "step": 309640 + }, + { + "epoch": 2.737406955568521, + "grad_norm": 8.03756046295166, + "learning_rate": 4.376550740524644e-06, + "loss": 0.4738, + "step": 309650 + }, + { + "epoch": 2.7374953588288338, + "grad_norm": 2.428725242614746, + "learning_rate": 4.375077352852774e-06, + "loss": 0.4685, + "step": 309660 + }, + { + "epoch": 2.737583762089146, + "grad_norm": 5.233055591583252, + "learning_rate": 4.373603965180903e-06, + "loss": 0.575, + "step": 309670 + }, + { + "epoch": 2.737672165349458, + "grad_norm": 1.878367304801941, + "learning_rate": 4.372130577509032e-06, + "loss": 0.4154, + "step": 309680 + }, + { + "epoch": 2.7377605686097706, + "grad_norm": 4.876142501831055, + "learning_rate": 4.370657189837162e-06, + "loss": 0.4701, + "step": 309690 + }, + { + "epoch": 2.7378489718700827, + "grad_norm": 2.0882532596588135, + "learning_rate": 4.369183802165291e-06, + "loss": 0.5249, + "step": 309700 + }, + { + "epoch": 2.737937375130395, + "grad_norm": 2.43402361869812, + "learning_rate": 4.36771041449342e-06, + "loss": 0.4391, + "step": 309710 + }, + { + "epoch": 2.738025778390707, + "grad_norm": 1.9767513275146484, + "learning_rate": 4.366237026821549e-06, + "loss": 0.4238, + "step": 309720 + }, + { + "epoch": 2.738114181651019, + "grad_norm": 1.7493829727172852, + "learning_rate": 4.364763639149678e-06, + "loss": 0.4327, + "step": 309730 + }, + { + "epoch": 2.7382025849113316, + "grad_norm": 3.756796360015869, + "learning_rate": 4.363290251477808e-06, + "loss": 0.3714, + "step": 309740 + }, + { + "epoch": 2.7382909881716437, + "grad_norm": 4.072164058685303, + "learning_rate": 4.361816863805938e-06, + "loss": 0.5566, + "step": 309750 + }, + { + "epoch": 2.738379391431956, + "grad_norm": 13.501974105834961, + "learning_rate": 4.360343476134067e-06, + "loss": 0.4797, + "step": 309760 + }, + { + "epoch": 2.7384677946922684, + "grad_norm": 3.604445695877075, + "learning_rate": 4.358870088462196e-06, + "loss": 0.5394, + "step": 309770 + }, + { + "epoch": 2.7385561979525805, + "grad_norm": 4.3917059898376465, + "learning_rate": 4.357396700790325e-06, + "loss": 0.5465, + "step": 309780 + }, + { + "epoch": 2.7386446012128927, + "grad_norm": 7.677353858947754, + "learning_rate": 4.3559233131184544e-06, + "loss": 0.4692, + "step": 309790 + }, + { + "epoch": 2.738733004473205, + "grad_norm": 0.7763603925704956, + "learning_rate": 4.354449925446584e-06, + "loss": 0.4984, + "step": 309800 + }, + { + "epoch": 2.7388214077335173, + "grad_norm": 6.682476997375488, + "learning_rate": 4.352976537774713e-06, + "loss": 0.46, + "step": 309810 + }, + { + "epoch": 2.7389098109938295, + "grad_norm": 1.3972442150115967, + "learning_rate": 4.351503150102843e-06, + "loss": 0.4966, + "step": 309820 + }, + { + "epoch": 2.7389982142541416, + "grad_norm": 4.804217338562012, + "learning_rate": 4.350029762430972e-06, + "loss": 0.4629, + "step": 309830 + }, + { + "epoch": 2.739086617514454, + "grad_norm": 1.5853723287582397, + "learning_rate": 4.348556374759101e-06, + "loss": 0.6473, + "step": 309840 + }, + { + "epoch": 2.7391750207747663, + "grad_norm": 5.926552772521973, + "learning_rate": 4.347082987087231e-06, + "loss": 0.5709, + "step": 309850 + }, + { + "epoch": 2.7392634240350784, + "grad_norm": 3.3321571350097656, + "learning_rate": 4.3456095994153605e-06, + "loss": 0.4519, + "step": 309860 + }, + { + "epoch": 2.7393518272953905, + "grad_norm": 3.814429521560669, + "learning_rate": 4.34413621174349e-06, + "loss": 0.5354, + "step": 309870 + }, + { + "epoch": 2.7394402305557026, + "grad_norm": 2.9110610485076904, + "learning_rate": 4.342662824071619e-06, + "loss": 0.5602, + "step": 309880 + }, + { + "epoch": 2.739528633816015, + "grad_norm": 3.658970832824707, + "learning_rate": 4.341189436399748e-06, + "loss": 0.3913, + "step": 309890 + }, + { + "epoch": 2.7396170370763273, + "grad_norm": 2.8041181564331055, + "learning_rate": 4.339716048727877e-06, + "loss": 0.3817, + "step": 309900 + }, + { + "epoch": 2.73970544033664, + "grad_norm": 4.020094394683838, + "learning_rate": 4.3382426610560065e-06, + "loss": 0.5041, + "step": 309910 + }, + { + "epoch": 2.739793843596952, + "grad_norm": 6.321782112121582, + "learning_rate": 4.336769273384136e-06, + "loss": 0.5886, + "step": 309920 + }, + { + "epoch": 2.739882246857264, + "grad_norm": 4.117957592010498, + "learning_rate": 4.335295885712265e-06, + "loss": 0.5833, + "step": 309930 + }, + { + "epoch": 2.7399706501175762, + "grad_norm": 3.875746726989746, + "learning_rate": 4.333822498040395e-06, + "loss": 0.6521, + "step": 309940 + }, + { + "epoch": 2.7400590533778884, + "grad_norm": 2.2878987789154053, + "learning_rate": 4.332349110368524e-06, + "loss": 0.4934, + "step": 309950 + }, + { + "epoch": 2.740147456638201, + "grad_norm": 4.554445743560791, + "learning_rate": 4.330875722696653e-06, + "loss": 0.4664, + "step": 309960 + }, + { + "epoch": 2.740235859898513, + "grad_norm": 4.670612335205078, + "learning_rate": 4.329402335024783e-06, + "loss": 0.4277, + "step": 309970 + }, + { + "epoch": 2.740324263158825, + "grad_norm": 11.62319278717041, + "learning_rate": 4.327928947352912e-06, + "loss": 0.5507, + "step": 309980 + }, + { + "epoch": 2.7404126664191377, + "grad_norm": 6.153392314910889, + "learning_rate": 4.326455559681041e-06, + "loss": 0.6265, + "step": 309990 + }, + { + "epoch": 2.74050106967945, + "grad_norm": 7.498966217041016, + "learning_rate": 4.32498217200917e-06, + "loss": 0.6213, + "step": 310000 + }, + { + "epoch": 2.740589472939762, + "grad_norm": 2.6893744468688965, + "learning_rate": 4.323508784337299e-06, + "loss": 0.3703, + "step": 310010 + }, + { + "epoch": 2.740677876200074, + "grad_norm": 0.9502105116844177, + "learning_rate": 4.322035396665429e-06, + "loss": 0.5711, + "step": 310020 + }, + { + "epoch": 2.7407662794603866, + "grad_norm": 1.3683098554611206, + "learning_rate": 4.320562008993559e-06, + "loss": 0.5807, + "step": 310030 + }, + { + "epoch": 2.7408546827206988, + "grad_norm": 3.4604854583740234, + "learning_rate": 4.319088621321688e-06, + "loss": 0.5523, + "step": 310040 + }, + { + "epoch": 2.740943085981011, + "grad_norm": 4.280843734741211, + "learning_rate": 4.317615233649817e-06, + "loss": 0.4886, + "step": 310050 + }, + { + "epoch": 2.7410314892413234, + "grad_norm": 3.0317602157592773, + "learning_rate": 4.316141845977946e-06, + "loss": 0.6478, + "step": 310060 + }, + { + "epoch": 2.7411198925016356, + "grad_norm": 5.746598243713379, + "learning_rate": 4.314668458306076e-06, + "loss": 0.5693, + "step": 310070 + }, + { + "epoch": 2.7412082957619477, + "grad_norm": 4.868401050567627, + "learning_rate": 4.3131950706342055e-06, + "loss": 0.4911, + "step": 310080 + }, + { + "epoch": 2.74129669902226, + "grad_norm": 3.701336145401001, + "learning_rate": 4.311721682962335e-06, + "loss": 0.5762, + "step": 310090 + }, + { + "epoch": 2.741385102282572, + "grad_norm": 2.6852245330810547, + "learning_rate": 4.310248295290464e-06, + "loss": 0.5566, + "step": 310100 + }, + { + "epoch": 2.7414735055428845, + "grad_norm": 5.890885829925537, + "learning_rate": 4.308774907618593e-06, + "loss": 0.481, + "step": 310110 + }, + { + "epoch": 2.7415619088031966, + "grad_norm": 5.784653663635254, + "learning_rate": 4.307301519946722e-06, + "loss": 0.5846, + "step": 310120 + }, + { + "epoch": 2.741650312063509, + "grad_norm": 2.557201385498047, + "learning_rate": 4.305828132274852e-06, + "loss": 0.5712, + "step": 310130 + }, + { + "epoch": 2.7417387153238213, + "grad_norm": 1.7738829851150513, + "learning_rate": 4.3043547446029816e-06, + "loss": 0.5131, + "step": 310140 + }, + { + "epoch": 2.7418271185841334, + "grad_norm": 6.751017093658447, + "learning_rate": 4.302881356931111e-06, + "loss": 0.6269, + "step": 310150 + }, + { + "epoch": 2.7419155218444455, + "grad_norm": 1.5525835752487183, + "learning_rate": 4.30140796925924e-06, + "loss": 0.3674, + "step": 310160 + }, + { + "epoch": 2.7420039251047577, + "grad_norm": 5.935557842254639, + "learning_rate": 4.299934581587369e-06, + "loss": 0.4501, + "step": 310170 + }, + { + "epoch": 2.74209232836507, + "grad_norm": 1.842021107673645, + "learning_rate": 4.298461193915498e-06, + "loss": 0.5699, + "step": 310180 + }, + { + "epoch": 2.7421807316253823, + "grad_norm": 2.018071174621582, + "learning_rate": 4.2969878062436276e-06, + "loss": 0.5534, + "step": 310190 + }, + { + "epoch": 2.7422691348856945, + "grad_norm": 2.455156087875366, + "learning_rate": 4.295514418571757e-06, + "loss": 0.5365, + "step": 310200 + }, + { + "epoch": 2.742357538146007, + "grad_norm": 3.342062473297119, + "learning_rate": 4.294041030899886e-06, + "loss": 0.4883, + "step": 310210 + }, + { + "epoch": 2.742445941406319, + "grad_norm": 1.5286797285079956, + "learning_rate": 4.292567643228016e-06, + "loss": 0.4876, + "step": 310220 + }, + { + "epoch": 2.7425343446666313, + "grad_norm": 2.032230854034424, + "learning_rate": 4.291094255556145e-06, + "loss": 0.5153, + "step": 310230 + }, + { + "epoch": 2.7426227479269434, + "grad_norm": 2.9771738052368164, + "learning_rate": 4.289620867884274e-06, + "loss": 0.5296, + "step": 310240 + }, + { + "epoch": 2.742711151187256, + "grad_norm": 5.1212968826293945, + "learning_rate": 4.288147480212404e-06, + "loss": 0.5085, + "step": 310250 + }, + { + "epoch": 2.742799554447568, + "grad_norm": 2.337545871734619, + "learning_rate": 4.286674092540533e-06, + "loss": 0.4346, + "step": 310260 + }, + { + "epoch": 2.74288795770788, + "grad_norm": 3.45487904548645, + "learning_rate": 4.285200704868662e-06, + "loss": 0.4563, + "step": 310270 + }, + { + "epoch": 2.7429763609681928, + "grad_norm": 6.3340606689453125, + "learning_rate": 4.283727317196791e-06, + "loss": 0.5406, + "step": 310280 + }, + { + "epoch": 2.743064764228505, + "grad_norm": 5.061007976531982, + "learning_rate": 4.282253929524921e-06, + "loss": 0.505, + "step": 310290 + }, + { + "epoch": 2.743153167488817, + "grad_norm": 23.724197387695312, + "learning_rate": 4.2807805418530505e-06, + "loss": 0.4657, + "step": 310300 + }, + { + "epoch": 2.743241570749129, + "grad_norm": 2.3001925945281982, + "learning_rate": 4.27930715418118e-06, + "loss": 0.4166, + "step": 310310 + }, + { + "epoch": 2.7433299740094412, + "grad_norm": 9.601128578186035, + "learning_rate": 4.27783376650931e-06, + "loss": 0.5325, + "step": 310320 + }, + { + "epoch": 2.743418377269754, + "grad_norm": 4.860722541809082, + "learning_rate": 4.276360378837439e-06, + "loss": 0.464, + "step": 310330 + }, + { + "epoch": 2.743506780530066, + "grad_norm": 13.872221946716309, + "learning_rate": 4.274886991165568e-06, + "loss": 0.6308, + "step": 310340 + }, + { + "epoch": 2.743595183790378, + "grad_norm": 2.7602405548095703, + "learning_rate": 4.273413603493697e-06, + "loss": 0.4922, + "step": 310350 + }, + { + "epoch": 2.7436835870506906, + "grad_norm": 2.514472246170044, + "learning_rate": 4.2719402158218265e-06, + "loss": 0.3882, + "step": 310360 + }, + { + "epoch": 2.7437719903110027, + "grad_norm": 3.6654014587402344, + "learning_rate": 4.270466828149956e-06, + "loss": 0.5505, + "step": 310370 + }, + { + "epoch": 2.743860393571315, + "grad_norm": 4.544604301452637, + "learning_rate": 4.268993440478085e-06, + "loss": 0.4432, + "step": 310380 + }, + { + "epoch": 2.743948796831627, + "grad_norm": 1.4363627433776855, + "learning_rate": 4.267520052806214e-06, + "loss": 0.47, + "step": 310390 + }, + { + "epoch": 2.7440372000919395, + "grad_norm": 5.486488342285156, + "learning_rate": 4.266046665134343e-06, + "loss": 0.689, + "step": 310400 + }, + { + "epoch": 2.7441256033522516, + "grad_norm": 18.910341262817383, + "learning_rate": 4.264573277462473e-06, + "loss": 0.5832, + "step": 310410 + }, + { + "epoch": 2.7442140066125638, + "grad_norm": 2.291567802429199, + "learning_rate": 4.2630998897906026e-06, + "loss": 0.4225, + "step": 310420 + }, + { + "epoch": 2.7443024098728763, + "grad_norm": 2.049788475036621, + "learning_rate": 4.261626502118732e-06, + "loss": 0.511, + "step": 310430 + }, + { + "epoch": 2.7443908131331884, + "grad_norm": 5.315803050994873, + "learning_rate": 4.260153114446861e-06, + "loss": 0.6097, + "step": 310440 + }, + { + "epoch": 2.7444792163935006, + "grad_norm": 1.5796189308166504, + "learning_rate": 4.25867972677499e-06, + "loss": 0.5426, + "step": 310450 + }, + { + "epoch": 2.7445676196538127, + "grad_norm": 2.4931023120880127, + "learning_rate": 4.257206339103119e-06, + "loss": 0.5292, + "step": 310460 + }, + { + "epoch": 2.744656022914125, + "grad_norm": 2.8577463626861572, + "learning_rate": 4.255732951431249e-06, + "loss": 0.5762, + "step": 310470 + }, + { + "epoch": 2.7447444261744374, + "grad_norm": 7.171093463897705, + "learning_rate": 4.254259563759378e-06, + "loss": 0.6369, + "step": 310480 + }, + { + "epoch": 2.7448328294347495, + "grad_norm": 5.842810153961182, + "learning_rate": 4.252786176087507e-06, + "loss": 0.5218, + "step": 310490 + }, + { + "epoch": 2.744921232695062, + "grad_norm": 2.171663522720337, + "learning_rate": 4.251312788415637e-06, + "loss": 0.5221, + "step": 310500 + }, + { + "epoch": 2.745009635955374, + "grad_norm": 1.6800086498260498, + "learning_rate": 4.249839400743766e-06, + "loss": 0.4334, + "step": 310510 + }, + { + "epoch": 2.7450980392156863, + "grad_norm": 2.293623447418213, + "learning_rate": 4.2483660130718954e-06, + "loss": 0.4397, + "step": 310520 + }, + { + "epoch": 2.7451864424759984, + "grad_norm": 1.1111395359039307, + "learning_rate": 4.2468926254000255e-06, + "loss": 0.3379, + "step": 310530 + }, + { + "epoch": 2.7452748457363105, + "grad_norm": 4.009139537811279, + "learning_rate": 4.245419237728155e-06, + "loss": 0.4185, + "step": 310540 + }, + { + "epoch": 2.745363248996623, + "grad_norm": 4.833608150482178, + "learning_rate": 4.243945850056284e-06, + "loss": 0.5544, + "step": 310550 + }, + { + "epoch": 2.745451652256935, + "grad_norm": 2.409183979034424, + "learning_rate": 4.242472462384413e-06, + "loss": 0.4918, + "step": 310560 + }, + { + "epoch": 2.7455400555172473, + "grad_norm": 8.90221881866455, + "learning_rate": 4.240999074712542e-06, + "loss": 0.5575, + "step": 310570 + }, + { + "epoch": 2.74562845877756, + "grad_norm": 0.9937192797660828, + "learning_rate": 4.2395256870406715e-06, + "loss": 0.508, + "step": 310580 + }, + { + "epoch": 2.745716862037872, + "grad_norm": 11.122431755065918, + "learning_rate": 4.2380522993688015e-06, + "loss": 0.4452, + "step": 310590 + }, + { + "epoch": 2.745805265298184, + "grad_norm": 13.329570770263672, + "learning_rate": 4.236578911696931e-06, + "loss": 0.5781, + "step": 310600 + }, + { + "epoch": 2.7458936685584963, + "grad_norm": 2.0345585346221924, + "learning_rate": 4.23510552402506e-06, + "loss": 0.5496, + "step": 310610 + }, + { + "epoch": 2.745982071818809, + "grad_norm": 3.381053924560547, + "learning_rate": 4.233632136353189e-06, + "loss": 0.5761, + "step": 310620 + }, + { + "epoch": 2.746070475079121, + "grad_norm": 4.535212516784668, + "learning_rate": 4.232158748681318e-06, + "loss": 0.5108, + "step": 310630 + }, + { + "epoch": 2.746158878339433, + "grad_norm": 3.7819912433624268, + "learning_rate": 4.2306853610094475e-06, + "loss": 0.511, + "step": 310640 + }, + { + "epoch": 2.7462472815997456, + "grad_norm": 4.422667503356934, + "learning_rate": 4.229211973337577e-06, + "loss": 0.4908, + "step": 310650 + }, + { + "epoch": 2.7463356848600577, + "grad_norm": 9.369030952453613, + "learning_rate": 4.227738585665706e-06, + "loss": 0.5413, + "step": 310660 + }, + { + "epoch": 2.74642408812037, + "grad_norm": 3.3814117908477783, + "learning_rate": 4.226265197993835e-06, + "loss": 0.3895, + "step": 310670 + }, + { + "epoch": 2.746512491380682, + "grad_norm": 1.0876178741455078, + "learning_rate": 4.224791810321964e-06, + "loss": 0.586, + "step": 310680 + }, + { + "epoch": 2.746600894640994, + "grad_norm": 5.616076469421387, + "learning_rate": 4.223318422650094e-06, + "loss": 0.6068, + "step": 310690 + }, + { + "epoch": 2.7466892979013067, + "grad_norm": 2.7823214530944824, + "learning_rate": 4.221845034978224e-06, + "loss": 0.4956, + "step": 310700 + }, + { + "epoch": 2.746777701161619, + "grad_norm": 12.884786605834961, + "learning_rate": 4.220371647306353e-06, + "loss": 0.4751, + "step": 310710 + }, + { + "epoch": 2.7468661044219314, + "grad_norm": 1.9448703527450562, + "learning_rate": 4.218898259634482e-06, + "loss": 0.4391, + "step": 310720 + }, + { + "epoch": 2.7469545076822435, + "grad_norm": 3.9907803535461426, + "learning_rate": 4.217424871962611e-06, + "loss": 0.5483, + "step": 310730 + }, + { + "epoch": 2.7470429109425556, + "grad_norm": 2.5379152297973633, + "learning_rate": 4.21595148429074e-06, + "loss": 0.5714, + "step": 310740 + }, + { + "epoch": 2.7471313142028677, + "grad_norm": 3.697946071624756, + "learning_rate": 4.2144780966188704e-06, + "loss": 0.5373, + "step": 310750 + }, + { + "epoch": 2.74721971746318, + "grad_norm": 14.645133018493652, + "learning_rate": 4.213004708947e-06, + "loss": 0.6191, + "step": 310760 + }, + { + "epoch": 2.7473081207234924, + "grad_norm": 5.872289657592773, + "learning_rate": 4.211531321275129e-06, + "loss": 0.5745, + "step": 310770 + }, + { + "epoch": 2.7473965239838045, + "grad_norm": 8.340093612670898, + "learning_rate": 4.210057933603259e-06, + "loss": 0.4995, + "step": 310780 + }, + { + "epoch": 2.7474849272441166, + "grad_norm": 1.288814902305603, + "learning_rate": 4.208584545931388e-06, + "loss": 0.403, + "step": 310790 + }, + { + "epoch": 2.747573330504429, + "grad_norm": 1.1323134899139404, + "learning_rate": 4.207111158259517e-06, + "loss": 0.4699, + "step": 310800 + }, + { + "epoch": 2.7476617337647413, + "grad_norm": 11.422972679138184, + "learning_rate": 4.2056377705876465e-06, + "loss": 0.5719, + "step": 310810 + }, + { + "epoch": 2.7477501370250534, + "grad_norm": 1.542243242263794, + "learning_rate": 4.204164382915776e-06, + "loss": 0.5023, + "step": 310820 + }, + { + "epoch": 2.7478385402853656, + "grad_norm": 1.2537554502487183, + "learning_rate": 4.202690995243905e-06, + "loss": 0.4069, + "step": 310830 + }, + { + "epoch": 2.747926943545678, + "grad_norm": 10.33382511138916, + "learning_rate": 4.201217607572034e-06, + "loss": 0.6268, + "step": 310840 + }, + { + "epoch": 2.7480153468059902, + "grad_norm": 7.193251132965088, + "learning_rate": 4.199744219900163e-06, + "loss": 0.5737, + "step": 310850 + }, + { + "epoch": 2.7481037500663024, + "grad_norm": 5.271790504455566, + "learning_rate": 4.1982708322282925e-06, + "loss": 0.4845, + "step": 310860 + }, + { + "epoch": 2.748192153326615, + "grad_norm": 2.501217842102051, + "learning_rate": 4.196797444556422e-06, + "loss": 0.4219, + "step": 310870 + }, + { + "epoch": 2.748280556586927, + "grad_norm": 2.3030991554260254, + "learning_rate": 4.195324056884552e-06, + "loss": 0.5288, + "step": 310880 + }, + { + "epoch": 2.748368959847239, + "grad_norm": 4.215305805206299, + "learning_rate": 4.193850669212681e-06, + "loss": 0.5728, + "step": 310890 + }, + { + "epoch": 2.7484573631075513, + "grad_norm": 4.298797130584717, + "learning_rate": 4.19237728154081e-06, + "loss": 0.4485, + "step": 310900 + }, + { + "epoch": 2.7485457663678634, + "grad_norm": 5.303741931915283, + "learning_rate": 4.190903893868939e-06, + "loss": 0.6283, + "step": 310910 + }, + { + "epoch": 2.748634169628176, + "grad_norm": 1.7534091472625732, + "learning_rate": 4.1894305061970686e-06, + "loss": 0.4331, + "step": 310920 + }, + { + "epoch": 2.748722572888488, + "grad_norm": 1.2544584274291992, + "learning_rate": 4.187957118525198e-06, + "loss": 0.6171, + "step": 310930 + }, + { + "epoch": 2.7488109761488, + "grad_norm": 9.667861938476562, + "learning_rate": 4.186483730853327e-06, + "loss": 0.5412, + "step": 310940 + }, + { + "epoch": 2.7488993794091128, + "grad_norm": 3.246981382369995, + "learning_rate": 4.185010343181456e-06, + "loss": 0.4675, + "step": 310950 + }, + { + "epoch": 2.748987782669425, + "grad_norm": 2.5172016620635986, + "learning_rate": 4.183536955509585e-06, + "loss": 0.4655, + "step": 310960 + }, + { + "epoch": 2.749076185929737, + "grad_norm": 4.5355401039123535, + "learning_rate": 4.182063567837715e-06, + "loss": 0.5697, + "step": 310970 + }, + { + "epoch": 2.749164589190049, + "grad_norm": 3.382397174835205, + "learning_rate": 4.180590180165845e-06, + "loss": 0.4913, + "step": 310980 + }, + { + "epoch": 2.7492529924503617, + "grad_norm": 5.520689010620117, + "learning_rate": 4.179116792493974e-06, + "loss": 0.5453, + "step": 310990 + }, + { + "epoch": 2.749341395710674, + "grad_norm": 3.137706756591797, + "learning_rate": 4.177643404822104e-06, + "loss": 0.3796, + "step": 311000 + }, + { + "epoch": 2.749429798970986, + "grad_norm": 1.4193823337554932, + "learning_rate": 4.176170017150233e-06, + "loss": 0.6043, + "step": 311010 + }, + { + "epoch": 2.7495182022312985, + "grad_norm": 6.07499885559082, + "learning_rate": 4.174696629478362e-06, + "loss": 0.4555, + "step": 311020 + }, + { + "epoch": 2.7496066054916106, + "grad_norm": 3.5134835243225098, + "learning_rate": 4.1732232418064915e-06, + "loss": 0.4456, + "step": 311030 + }, + { + "epoch": 2.7496950087519227, + "grad_norm": 6.692794322967529, + "learning_rate": 4.171749854134621e-06, + "loss": 0.5783, + "step": 311040 + }, + { + "epoch": 2.749783412012235, + "grad_norm": 6.108872890472412, + "learning_rate": 4.17027646646275e-06, + "loss": 0.5115, + "step": 311050 + }, + { + "epoch": 2.749871815272547, + "grad_norm": 3.6859216690063477, + "learning_rate": 4.16880307879088e-06, + "loss": 0.3989, + "step": 311060 + }, + { + "epoch": 2.7499602185328595, + "grad_norm": 4.44670295715332, + "learning_rate": 4.167329691119009e-06, + "loss": 0.5899, + "step": 311070 + }, + { + "epoch": 2.7500486217931717, + "grad_norm": 2.267184019088745, + "learning_rate": 4.165856303447138e-06, + "loss": 0.4901, + "step": 311080 + }, + { + "epoch": 2.7501370250534842, + "grad_norm": 1.2395979166030884, + "learning_rate": 4.1643829157752675e-06, + "loss": 0.5438, + "step": 311090 + }, + { + "epoch": 2.7502254283137963, + "grad_norm": 5.9711012840271, + "learning_rate": 4.162909528103397e-06, + "loss": 0.4806, + "step": 311100 + }, + { + "epoch": 2.7503138315741085, + "grad_norm": 10.732582092285156, + "learning_rate": 4.161436140431526e-06, + "loss": 0.4941, + "step": 311110 + }, + { + "epoch": 2.7504022348344206, + "grad_norm": 1.0830386877059937, + "learning_rate": 4.159962752759655e-06, + "loss": 0.4252, + "step": 311120 + }, + { + "epoch": 2.7504906380947327, + "grad_norm": 8.90915584564209, + "learning_rate": 4.158489365087784e-06, + "loss": 0.5856, + "step": 311130 + }, + { + "epoch": 2.7505790413550453, + "grad_norm": 9.096912384033203, + "learning_rate": 4.1570159774159135e-06, + "loss": 0.5854, + "step": 311140 + }, + { + "epoch": 2.7506674446153574, + "grad_norm": 4.531406402587891, + "learning_rate": 4.155542589744043e-06, + "loss": 0.536, + "step": 311150 + }, + { + "epoch": 2.7507558478756695, + "grad_norm": 1.5468474626541138, + "learning_rate": 4.154069202072173e-06, + "loss": 0.5113, + "step": 311160 + }, + { + "epoch": 2.750844251135982, + "grad_norm": 4.183215618133545, + "learning_rate": 4.152595814400302e-06, + "loss": 0.4977, + "step": 311170 + }, + { + "epoch": 2.750932654396294, + "grad_norm": 1.7057058811187744, + "learning_rate": 4.151122426728431e-06, + "loss": 0.5986, + "step": 311180 + }, + { + "epoch": 2.7510210576566063, + "grad_norm": 14.612842559814453, + "learning_rate": 4.14964903905656e-06, + "loss": 0.486, + "step": 311190 + }, + { + "epoch": 2.7511094609169184, + "grad_norm": 4.643563747406006, + "learning_rate": 4.1481756513846896e-06, + "loss": 0.5013, + "step": 311200 + }, + { + "epoch": 2.751197864177231, + "grad_norm": 2.3424503803253174, + "learning_rate": 4.146702263712819e-06, + "loss": 0.4566, + "step": 311210 + }, + { + "epoch": 2.751286267437543, + "grad_norm": 1.2019096612930298, + "learning_rate": 4.145228876040949e-06, + "loss": 0.5342, + "step": 311220 + }, + { + "epoch": 2.7513746706978552, + "grad_norm": 2.274130344390869, + "learning_rate": 4.143755488369078e-06, + "loss": 0.4594, + "step": 311230 + }, + { + "epoch": 2.751463073958168, + "grad_norm": 5.481594562530518, + "learning_rate": 4.142282100697207e-06, + "loss": 0.5106, + "step": 311240 + }, + { + "epoch": 2.75155147721848, + "grad_norm": 3.239806652069092, + "learning_rate": 4.140808713025337e-06, + "loss": 0.5441, + "step": 311250 + }, + { + "epoch": 2.751639880478792, + "grad_norm": 12.574962615966797, + "learning_rate": 4.1393353253534665e-06, + "loss": 0.4555, + "step": 311260 + }, + { + "epoch": 2.751728283739104, + "grad_norm": 1.9007207155227661, + "learning_rate": 4.137861937681596e-06, + "loss": 0.7076, + "step": 311270 + }, + { + "epoch": 2.7518166869994163, + "grad_norm": 7.38585901260376, + "learning_rate": 4.136388550009725e-06, + "loss": 0.546, + "step": 311280 + }, + { + "epoch": 2.751905090259729, + "grad_norm": 3.3192026615142822, + "learning_rate": 4.134915162337854e-06, + "loss": 0.5179, + "step": 311290 + }, + { + "epoch": 2.751993493520041, + "grad_norm": 4.19621467590332, + "learning_rate": 4.133441774665983e-06, + "loss": 0.5867, + "step": 311300 + }, + { + "epoch": 2.7520818967803535, + "grad_norm": 1.884798288345337, + "learning_rate": 4.1319683869941125e-06, + "loss": 0.4267, + "step": 311310 + }, + { + "epoch": 2.7521703000406657, + "grad_norm": 4.640639305114746, + "learning_rate": 4.130494999322242e-06, + "loss": 0.5521, + "step": 311320 + }, + { + "epoch": 2.7522587033009778, + "grad_norm": 6.992824077606201, + "learning_rate": 4.129021611650371e-06, + "loss": 0.4498, + "step": 311330 + }, + { + "epoch": 2.75234710656129, + "grad_norm": 1.6819438934326172, + "learning_rate": 4.127548223978501e-06, + "loss": 0.6274, + "step": 311340 + }, + { + "epoch": 2.752435509821602, + "grad_norm": 2.5880956649780273, + "learning_rate": 4.12607483630663e-06, + "loss": 0.6196, + "step": 311350 + }, + { + "epoch": 2.7525239130819146, + "grad_norm": 3.7420873641967773, + "learning_rate": 4.124601448634759e-06, + "loss": 0.5227, + "step": 311360 + }, + { + "epoch": 2.7526123163422267, + "grad_norm": 4.760318756103516, + "learning_rate": 4.1231280609628885e-06, + "loss": 0.3619, + "step": 311370 + }, + { + "epoch": 2.752700719602539, + "grad_norm": 6.7428131103515625, + "learning_rate": 4.121654673291018e-06, + "loss": 0.4404, + "step": 311380 + }, + { + "epoch": 2.7527891228628514, + "grad_norm": 2.128917932510376, + "learning_rate": 4.120181285619147e-06, + "loss": 0.5462, + "step": 311390 + }, + { + "epoch": 2.7528775261231635, + "grad_norm": 7.5485076904296875, + "learning_rate": 4.118707897947276e-06, + "loss": 0.4324, + "step": 311400 + }, + { + "epoch": 2.7529659293834756, + "grad_norm": 1.3759175539016724, + "learning_rate": 4.117234510275405e-06, + "loss": 0.4758, + "step": 311410 + }, + { + "epoch": 2.7530543326437877, + "grad_norm": 1.271715760231018, + "learning_rate": 4.1157611226035345e-06, + "loss": 0.5256, + "step": 311420 + }, + { + "epoch": 2.7531427359041003, + "grad_norm": 4.763925552368164, + "learning_rate": 4.114287734931664e-06, + "loss": 0.5187, + "step": 311430 + }, + { + "epoch": 2.7532311391644124, + "grad_norm": 3.6790504455566406, + "learning_rate": 4.112814347259794e-06, + "loss": 0.5755, + "step": 311440 + }, + { + "epoch": 2.7533195424247245, + "grad_norm": 3.4164021015167236, + "learning_rate": 4.111340959587923e-06, + "loss": 0.4722, + "step": 311450 + }, + { + "epoch": 2.753407945685037, + "grad_norm": 2.615795612335205, + "learning_rate": 4.109867571916052e-06, + "loss": 0.4382, + "step": 311460 + }, + { + "epoch": 2.7534963489453492, + "grad_norm": 1.6163922548294067, + "learning_rate": 4.108394184244182e-06, + "loss": 0.5056, + "step": 311470 + }, + { + "epoch": 2.7535847522056613, + "grad_norm": 5.3408331871032715, + "learning_rate": 4.1069207965723114e-06, + "loss": 0.5337, + "step": 311480 + }, + { + "epoch": 2.7536731554659735, + "grad_norm": 2.2284963130950928, + "learning_rate": 4.105447408900441e-06, + "loss": 0.4527, + "step": 311490 + }, + { + "epoch": 2.7537615587262856, + "grad_norm": 1.4742189645767212, + "learning_rate": 4.10397402122857e-06, + "loss": 0.484, + "step": 311500 + }, + { + "epoch": 2.753849961986598, + "grad_norm": 3.5935487747192383, + "learning_rate": 4.102500633556699e-06, + "loss": 0.6359, + "step": 311510 + }, + { + "epoch": 2.7539383652469103, + "grad_norm": 2.3631880283355713, + "learning_rate": 4.101027245884828e-06, + "loss": 0.5591, + "step": 311520 + }, + { + "epoch": 2.7540267685072224, + "grad_norm": 4.791767120361328, + "learning_rate": 4.099553858212958e-06, + "loss": 0.4401, + "step": 311530 + }, + { + "epoch": 2.754115171767535, + "grad_norm": 3.591062307357788, + "learning_rate": 4.0980804705410875e-06, + "loss": 0.4402, + "step": 311540 + }, + { + "epoch": 2.754203575027847, + "grad_norm": 2.8310892581939697, + "learning_rate": 4.096607082869217e-06, + "loss": 0.5146, + "step": 311550 + }, + { + "epoch": 2.754291978288159, + "grad_norm": 3.6860995292663574, + "learning_rate": 4.095133695197346e-06, + "loss": 0.4514, + "step": 311560 + }, + { + "epoch": 2.7543803815484713, + "grad_norm": 3.534336805343628, + "learning_rate": 4.093660307525475e-06, + "loss": 0.5714, + "step": 311570 + }, + { + "epoch": 2.754468784808784, + "grad_norm": 2.2244205474853516, + "learning_rate": 4.092186919853604e-06, + "loss": 0.6316, + "step": 311580 + }, + { + "epoch": 2.754557188069096, + "grad_norm": 2.7490835189819336, + "learning_rate": 4.0907135321817335e-06, + "loss": 0.4237, + "step": 311590 + }, + { + "epoch": 2.754645591329408, + "grad_norm": 12.087945938110352, + "learning_rate": 4.089240144509863e-06, + "loss": 0.5143, + "step": 311600 + }, + { + "epoch": 2.7547339945897207, + "grad_norm": 7.3489484786987305, + "learning_rate": 4.087766756837992e-06, + "loss": 0.5166, + "step": 311610 + }, + { + "epoch": 2.754822397850033, + "grad_norm": 1.800536036491394, + "learning_rate": 4.086293369166122e-06, + "loss": 0.6018, + "step": 311620 + }, + { + "epoch": 2.754910801110345, + "grad_norm": 1.526631474494934, + "learning_rate": 4.084819981494251e-06, + "loss": 0.482, + "step": 311630 + }, + { + "epoch": 2.754999204370657, + "grad_norm": 1.588115930557251, + "learning_rate": 4.08334659382238e-06, + "loss": 0.551, + "step": 311640 + }, + { + "epoch": 2.755087607630969, + "grad_norm": 2.296157121658325, + "learning_rate": 4.0818732061505095e-06, + "loss": 0.5559, + "step": 311650 + }, + { + "epoch": 2.7551760108912817, + "grad_norm": 1.8940088748931885, + "learning_rate": 4.080399818478639e-06, + "loss": 0.469, + "step": 311660 + }, + { + "epoch": 2.755264414151594, + "grad_norm": 3.4308464527130127, + "learning_rate": 4.078926430806768e-06, + "loss": 0.691, + "step": 311670 + }, + { + "epoch": 2.7553528174119064, + "grad_norm": 10.30882453918457, + "learning_rate": 4.077453043134897e-06, + "loss": 0.4854, + "step": 311680 + }, + { + "epoch": 2.7554412206722185, + "grad_norm": 2.748020887374878, + "learning_rate": 4.075979655463027e-06, + "loss": 0.469, + "step": 311690 + }, + { + "epoch": 2.7555296239325306, + "grad_norm": 2.3029520511627197, + "learning_rate": 4.074506267791156e-06, + "loss": 0.5434, + "step": 311700 + }, + { + "epoch": 2.7556180271928428, + "grad_norm": 1.730787754058838, + "learning_rate": 4.073032880119286e-06, + "loss": 0.3786, + "step": 311710 + }, + { + "epoch": 2.755706430453155, + "grad_norm": 8.167707443237305, + "learning_rate": 4.071559492447416e-06, + "loss": 0.6043, + "step": 311720 + }, + { + "epoch": 2.7557948337134675, + "grad_norm": 6.129695415496826, + "learning_rate": 4.070086104775545e-06, + "loss": 0.4718, + "step": 311730 + }, + { + "epoch": 2.7558832369737796, + "grad_norm": 7.6747050285339355, + "learning_rate": 4.068612717103674e-06, + "loss": 0.6358, + "step": 311740 + }, + { + "epoch": 2.7559716402340917, + "grad_norm": 7.469211101531982, + "learning_rate": 4.067139329431803e-06, + "loss": 0.5782, + "step": 311750 + }, + { + "epoch": 2.7560600434944043, + "grad_norm": 4.469886779785156, + "learning_rate": 4.0656659417599324e-06, + "loss": 0.462, + "step": 311760 + }, + { + "epoch": 2.7561484467547164, + "grad_norm": 5.875805854797363, + "learning_rate": 4.064192554088062e-06, + "loss": 0.5988, + "step": 311770 + }, + { + "epoch": 2.7562368500150285, + "grad_norm": 2.7115981578826904, + "learning_rate": 4.062719166416191e-06, + "loss": 0.4364, + "step": 311780 + }, + { + "epoch": 2.7563252532753406, + "grad_norm": 6.00388240814209, + "learning_rate": 4.06124577874432e-06, + "loss": 0.4455, + "step": 311790 + }, + { + "epoch": 2.756413656535653, + "grad_norm": 4.775798320770264, + "learning_rate": 4.059772391072449e-06, + "loss": 0.5533, + "step": 311800 + }, + { + "epoch": 2.7565020597959653, + "grad_norm": 2.243093252182007, + "learning_rate": 4.058299003400579e-06, + "loss": 0.5445, + "step": 311810 + }, + { + "epoch": 2.7565904630562774, + "grad_norm": 2.6046195030212402, + "learning_rate": 4.0568256157287085e-06, + "loss": 0.4702, + "step": 311820 + }, + { + "epoch": 2.75667886631659, + "grad_norm": 5.8427815437316895, + "learning_rate": 4.055352228056838e-06, + "loss": 0.5024, + "step": 311830 + }, + { + "epoch": 2.756767269576902, + "grad_norm": 6.901120662689209, + "learning_rate": 4.053878840384967e-06, + "loss": 0.3984, + "step": 311840 + }, + { + "epoch": 2.756855672837214, + "grad_norm": 4.699443817138672, + "learning_rate": 4.052405452713096e-06, + "loss": 0.4978, + "step": 311850 + }, + { + "epoch": 2.7569440760975263, + "grad_norm": 2.134182929992676, + "learning_rate": 4.050932065041225e-06, + "loss": 0.6322, + "step": 311860 + }, + { + "epoch": 2.7570324793578385, + "grad_norm": 1.438490390777588, + "learning_rate": 4.0494586773693545e-06, + "loss": 0.5742, + "step": 311870 + }, + { + "epoch": 2.757120882618151, + "grad_norm": 10.687232971191406, + "learning_rate": 4.047985289697484e-06, + "loss": 0.6108, + "step": 311880 + }, + { + "epoch": 2.757209285878463, + "grad_norm": 24.164409637451172, + "learning_rate": 4.046511902025613e-06, + "loss": 0.4948, + "step": 311890 + }, + { + "epoch": 2.7572976891387757, + "grad_norm": 2.5200576782226562, + "learning_rate": 4.045038514353743e-06, + "loss": 0.5735, + "step": 311900 + }, + { + "epoch": 2.757386092399088, + "grad_norm": 6.063928127288818, + "learning_rate": 4.043565126681872e-06, + "loss": 0.5654, + "step": 311910 + }, + { + "epoch": 2.7574744956594, + "grad_norm": 4.0724406242370605, + "learning_rate": 4.042091739010001e-06, + "loss": 0.4928, + "step": 311920 + }, + { + "epoch": 2.757562898919712, + "grad_norm": 3.0141866207122803, + "learning_rate": 4.040618351338131e-06, + "loss": 0.5609, + "step": 311930 + }, + { + "epoch": 2.757651302180024, + "grad_norm": 2.002556562423706, + "learning_rate": 4.039144963666261e-06, + "loss": 0.4625, + "step": 311940 + }, + { + "epoch": 2.7577397054403368, + "grad_norm": 1.331599473953247, + "learning_rate": 4.03767157599439e-06, + "loss": 0.4393, + "step": 311950 + }, + { + "epoch": 2.757828108700649, + "grad_norm": 6.2539777755737305, + "learning_rate": 4.036198188322519e-06, + "loss": 0.5996, + "step": 311960 + }, + { + "epoch": 2.757916511960961, + "grad_norm": 9.592437744140625, + "learning_rate": 4.034724800650648e-06, + "loss": 0.5183, + "step": 311970 + }, + { + "epoch": 2.7580049152212736, + "grad_norm": 2.288874864578247, + "learning_rate": 4.033251412978777e-06, + "loss": 0.5883, + "step": 311980 + }, + { + "epoch": 2.7580933184815857, + "grad_norm": 13.750874519348145, + "learning_rate": 4.031778025306907e-06, + "loss": 0.52, + "step": 311990 + }, + { + "epoch": 2.758181721741898, + "grad_norm": 3.4621822834014893, + "learning_rate": 4.030304637635037e-06, + "loss": 0.5187, + "step": 312000 + }, + { + "epoch": 2.75827012500221, + "grad_norm": 6.372236728668213, + "learning_rate": 4.028831249963166e-06, + "loss": 0.5535, + "step": 312010 + }, + { + "epoch": 2.7583585282625225, + "grad_norm": 7.8886308670043945, + "learning_rate": 4.027357862291295e-06, + "loss": 0.5425, + "step": 312020 + }, + { + "epoch": 2.7584469315228346, + "grad_norm": 2.8021533489227295, + "learning_rate": 4.025884474619424e-06, + "loss": 0.5429, + "step": 312030 + }, + { + "epoch": 2.7585353347831467, + "grad_norm": 2.4144370555877686, + "learning_rate": 4.0244110869475535e-06, + "loss": 0.3367, + "step": 312040 + }, + { + "epoch": 2.7586237380434593, + "grad_norm": 3.8783626556396484, + "learning_rate": 4.022937699275683e-06, + "loss": 0.512, + "step": 312050 + }, + { + "epoch": 2.7587121413037714, + "grad_norm": 2.958847761154175, + "learning_rate": 4.021464311603812e-06, + "loss": 0.4791, + "step": 312060 + }, + { + "epoch": 2.7588005445640835, + "grad_norm": 1.2143197059631348, + "learning_rate": 4.019990923931941e-06, + "loss": 0.5015, + "step": 312070 + }, + { + "epoch": 2.7588889478243956, + "grad_norm": 2.274629831314087, + "learning_rate": 4.01851753626007e-06, + "loss": 0.5437, + "step": 312080 + }, + { + "epoch": 2.7589773510847078, + "grad_norm": 0.9452793002128601, + "learning_rate": 4.0170441485882e-06, + "loss": 0.4733, + "step": 312090 + }, + { + "epoch": 2.7590657543450203, + "grad_norm": 8.437507629394531, + "learning_rate": 4.0155707609163295e-06, + "loss": 0.5104, + "step": 312100 + }, + { + "epoch": 2.7591541576053324, + "grad_norm": 2.702364683151245, + "learning_rate": 4.014097373244459e-06, + "loss": 0.4829, + "step": 312110 + }, + { + "epoch": 2.7592425608656446, + "grad_norm": 4.68883752822876, + "learning_rate": 4.012623985572588e-06, + "loss": 0.6833, + "step": 312120 + }, + { + "epoch": 2.759330964125957, + "grad_norm": 2.8142035007476807, + "learning_rate": 4.011150597900717e-06, + "loss": 0.4949, + "step": 312130 + }, + { + "epoch": 2.7594193673862693, + "grad_norm": 3.471673011779785, + "learning_rate": 4.009677210228846e-06, + "loss": 0.5683, + "step": 312140 + }, + { + "epoch": 2.7595077706465814, + "grad_norm": 3.1681041717529297, + "learning_rate": 4.008203822556976e-06, + "loss": 0.5524, + "step": 312150 + }, + { + "epoch": 2.7595961739068935, + "grad_norm": 2.890620708465576, + "learning_rate": 4.0067304348851056e-06, + "loss": 0.5091, + "step": 312160 + }, + { + "epoch": 2.759684577167206, + "grad_norm": 3.975911855697632, + "learning_rate": 4.005257047213235e-06, + "loss": 0.7117, + "step": 312170 + }, + { + "epoch": 2.759772980427518, + "grad_norm": 3.776538848876953, + "learning_rate": 4.003783659541365e-06, + "loss": 0.7188, + "step": 312180 + }, + { + "epoch": 2.7598613836878303, + "grad_norm": 3.0265610218048096, + "learning_rate": 4.002310271869494e-06, + "loss": 0.4514, + "step": 312190 + }, + { + "epoch": 2.759949786948143, + "grad_norm": 2.3018572330474854, + "learning_rate": 4.000836884197623e-06, + "loss": 0.4465, + "step": 312200 + }, + { + "epoch": 2.760038190208455, + "grad_norm": 4.536286354064941, + "learning_rate": 3.999363496525752e-06, + "loss": 0.6307, + "step": 312210 + }, + { + "epoch": 2.760126593468767, + "grad_norm": 3.2282087802886963, + "learning_rate": 3.997890108853882e-06, + "loss": 0.5723, + "step": 312220 + }, + { + "epoch": 2.760214996729079, + "grad_norm": 3.3575642108917236, + "learning_rate": 3.996416721182011e-06, + "loss": 0.4294, + "step": 312230 + }, + { + "epoch": 2.7603033999893913, + "grad_norm": 2.539982557296753, + "learning_rate": 3.99494333351014e-06, + "loss": 0.5108, + "step": 312240 + }, + { + "epoch": 2.760391803249704, + "grad_norm": 3.505828619003296, + "learning_rate": 3.993469945838269e-06, + "loss": 0.5792, + "step": 312250 + }, + { + "epoch": 2.760480206510016, + "grad_norm": 3.772106409072876, + "learning_rate": 3.9919965581663984e-06, + "loss": 0.4971, + "step": 312260 + }, + { + "epoch": 2.7605686097703286, + "grad_norm": 1.2430014610290527, + "learning_rate": 3.990523170494528e-06, + "loss": 0.5186, + "step": 312270 + }, + { + "epoch": 2.7606570130306407, + "grad_norm": 17.073930740356445, + "learning_rate": 3.989049782822658e-06, + "loss": 0.5475, + "step": 312280 + }, + { + "epoch": 2.760745416290953, + "grad_norm": 1.283502221107483, + "learning_rate": 3.987576395150787e-06, + "loss": 0.5663, + "step": 312290 + }, + { + "epoch": 2.760833819551265, + "grad_norm": 13.895061492919922, + "learning_rate": 3.986103007478916e-06, + "loss": 0.4843, + "step": 312300 + }, + { + "epoch": 2.760922222811577, + "grad_norm": 3.797361135482788, + "learning_rate": 3.984629619807045e-06, + "loss": 0.5237, + "step": 312310 + }, + { + "epoch": 2.7610106260718896, + "grad_norm": 3.1553823947906494, + "learning_rate": 3.9831562321351745e-06, + "loss": 0.5287, + "step": 312320 + }, + { + "epoch": 2.7610990293322017, + "grad_norm": 2.7755846977233887, + "learning_rate": 3.981682844463304e-06, + "loss": 0.5173, + "step": 312330 + }, + { + "epoch": 2.761187432592514, + "grad_norm": 0.8041819334030151, + "learning_rate": 3.980209456791433e-06, + "loss": 0.4154, + "step": 312340 + }, + { + "epoch": 2.7612758358528264, + "grad_norm": 20.5621280670166, + "learning_rate": 3.978736069119562e-06, + "loss": 0.5426, + "step": 312350 + }, + { + "epoch": 2.7613642391131386, + "grad_norm": 30.442272186279297, + "learning_rate": 3.977262681447691e-06, + "loss": 0.5589, + "step": 312360 + }, + { + "epoch": 2.7614526423734507, + "grad_norm": 4.429216384887695, + "learning_rate": 3.975789293775821e-06, + "loss": 0.4862, + "step": 312370 + }, + { + "epoch": 2.761541045633763, + "grad_norm": 3.3470284938812256, + "learning_rate": 3.9743159061039505e-06, + "loss": 0.5888, + "step": 312380 + }, + { + "epoch": 2.7616294488940754, + "grad_norm": 3.8954663276672363, + "learning_rate": 3.97284251843208e-06, + "loss": 0.5438, + "step": 312390 + }, + { + "epoch": 2.7617178521543875, + "grad_norm": 4.538197994232178, + "learning_rate": 3.97136913076021e-06, + "loss": 0.5491, + "step": 312400 + }, + { + "epoch": 2.7618062554146996, + "grad_norm": 9.053424835205078, + "learning_rate": 3.969895743088339e-06, + "loss": 0.5115, + "step": 312410 + }, + { + "epoch": 2.761894658675012, + "grad_norm": 1.8404406309127808, + "learning_rate": 3.968422355416468e-06, + "loss": 0.4248, + "step": 312420 + }, + { + "epoch": 2.7619830619353243, + "grad_norm": 12.388656616210938, + "learning_rate": 3.966948967744597e-06, + "loss": 0.4658, + "step": 312430 + }, + { + "epoch": 2.7620714651956364, + "grad_norm": 1.1792241334915161, + "learning_rate": 3.965475580072727e-06, + "loss": 0.454, + "step": 312440 + }, + { + "epoch": 2.7621598684559485, + "grad_norm": 3.182039976119995, + "learning_rate": 3.964002192400856e-06, + "loss": 0.4575, + "step": 312450 + }, + { + "epoch": 2.7622482717162606, + "grad_norm": 7.864195346832275, + "learning_rate": 3.962528804728986e-06, + "loss": 0.4154, + "step": 312460 + }, + { + "epoch": 2.762336674976573, + "grad_norm": 2.0335917472839355, + "learning_rate": 3.961055417057115e-06, + "loss": 0.4299, + "step": 312470 + }, + { + "epoch": 2.7624250782368853, + "grad_norm": 4.184572219848633, + "learning_rate": 3.959582029385244e-06, + "loss": 0.5706, + "step": 312480 + }, + { + "epoch": 2.762513481497198, + "grad_norm": 1.5720021724700928, + "learning_rate": 3.9581086417133734e-06, + "loss": 0.4392, + "step": 312490 + }, + { + "epoch": 2.76260188475751, + "grad_norm": 3.5863630771636963, + "learning_rate": 3.956635254041503e-06, + "loss": 0.4971, + "step": 312500 + }, + { + "epoch": 2.762690288017822, + "grad_norm": 3.7470898628234863, + "learning_rate": 3.955161866369632e-06, + "loss": 0.5377, + "step": 312510 + }, + { + "epoch": 2.7627786912781342, + "grad_norm": 4.014135360717773, + "learning_rate": 3.953688478697761e-06, + "loss": 0.5506, + "step": 312520 + }, + { + "epoch": 2.7628670945384464, + "grad_norm": 4.8938446044921875, + "learning_rate": 3.95221509102589e-06, + "loss": 0.641, + "step": 312530 + }, + { + "epoch": 2.762955497798759, + "grad_norm": 2.693080186843872, + "learning_rate": 3.9507417033540194e-06, + "loss": 0.5338, + "step": 312540 + }, + { + "epoch": 2.763043901059071, + "grad_norm": 2.035135269165039, + "learning_rate": 3.949268315682149e-06, + "loss": 0.4787, + "step": 312550 + }, + { + "epoch": 2.763132304319383, + "grad_norm": 7.42504358291626, + "learning_rate": 3.947794928010279e-06, + "loss": 0.4422, + "step": 312560 + }, + { + "epoch": 2.7632207075796957, + "grad_norm": 9.1323881149292, + "learning_rate": 3.946321540338408e-06, + "loss": 0.6193, + "step": 312570 + }, + { + "epoch": 2.763309110840008, + "grad_norm": 1.304225206375122, + "learning_rate": 3.944848152666537e-06, + "loss": 0.5006, + "step": 312580 + }, + { + "epoch": 2.76339751410032, + "grad_norm": 3.983018159866333, + "learning_rate": 3.943374764994666e-06, + "loss": 0.4646, + "step": 312590 + }, + { + "epoch": 2.763485917360632, + "grad_norm": 3.4444353580474854, + "learning_rate": 3.9419013773227955e-06, + "loss": 0.6064, + "step": 312600 + }, + { + "epoch": 2.7635743206209447, + "grad_norm": 4.971999168395996, + "learning_rate": 3.940427989650925e-06, + "loss": 0.556, + "step": 312610 + }, + { + "epoch": 2.7636627238812568, + "grad_norm": 1.6894086599349976, + "learning_rate": 3.938954601979055e-06, + "loss": 0.5167, + "step": 312620 + }, + { + "epoch": 2.763751127141569, + "grad_norm": 2.81687068939209, + "learning_rate": 3.937481214307184e-06, + "loss": 0.5727, + "step": 312630 + }, + { + "epoch": 2.7638395304018815, + "grad_norm": 2.331437826156616, + "learning_rate": 3.936007826635313e-06, + "loss": 0.3673, + "step": 312640 + }, + { + "epoch": 2.7639279336621936, + "grad_norm": 3.3395450115203857, + "learning_rate": 3.934534438963443e-06, + "loss": 0.595, + "step": 312650 + }, + { + "epoch": 2.7640163369225057, + "grad_norm": 3.27608323097229, + "learning_rate": 3.933061051291572e-06, + "loss": 0.4796, + "step": 312660 + }, + { + "epoch": 2.764104740182818, + "grad_norm": 4.486461162567139, + "learning_rate": 3.931587663619702e-06, + "loss": 0.5069, + "step": 312670 + }, + { + "epoch": 2.76419314344313, + "grad_norm": 4.696019172668457, + "learning_rate": 3.930114275947831e-06, + "loss": 0.5326, + "step": 312680 + }, + { + "epoch": 2.7642815467034425, + "grad_norm": 2.9816436767578125, + "learning_rate": 3.92864088827596e-06, + "loss": 0.4887, + "step": 312690 + }, + { + "epoch": 2.7643699499637546, + "grad_norm": 4.879244327545166, + "learning_rate": 3.927167500604089e-06, + "loss": 0.5598, + "step": 312700 + }, + { + "epoch": 2.7644583532240667, + "grad_norm": 4.139683723449707, + "learning_rate": 3.925694112932218e-06, + "loss": 0.6453, + "step": 312710 + }, + { + "epoch": 2.7645467564843793, + "grad_norm": 12.80509090423584, + "learning_rate": 3.924220725260348e-06, + "loss": 0.5949, + "step": 312720 + }, + { + "epoch": 2.7646351597446914, + "grad_norm": 2.495053291320801, + "learning_rate": 3.922747337588477e-06, + "loss": 0.493, + "step": 312730 + }, + { + "epoch": 2.7647235630050035, + "grad_norm": 27.798368453979492, + "learning_rate": 3.921273949916606e-06, + "loss": 0.5411, + "step": 312740 + }, + { + "epoch": 2.7648119662653157, + "grad_norm": 2.0860347747802734, + "learning_rate": 3.919800562244736e-06, + "loss": 0.4841, + "step": 312750 + }, + { + "epoch": 2.7649003695256282, + "grad_norm": 6.871469974517822, + "learning_rate": 3.918327174572865e-06, + "loss": 0.5045, + "step": 312760 + }, + { + "epoch": 2.7649887727859404, + "grad_norm": 1.883770227432251, + "learning_rate": 3.9168537869009945e-06, + "loss": 0.4754, + "step": 312770 + }, + { + "epoch": 2.7650771760462525, + "grad_norm": 5.663798809051514, + "learning_rate": 3.915380399229124e-06, + "loss": 0.4897, + "step": 312780 + }, + { + "epoch": 2.765165579306565, + "grad_norm": 1.368546485900879, + "learning_rate": 3.913907011557253e-06, + "loss": 0.529, + "step": 312790 + }, + { + "epoch": 2.765253982566877, + "grad_norm": 7.203599452972412, + "learning_rate": 3.912433623885382e-06, + "loss": 0.5225, + "step": 312800 + }, + { + "epoch": 2.7653423858271893, + "grad_norm": 1.7829362154006958, + "learning_rate": 3.910960236213511e-06, + "loss": 0.4711, + "step": 312810 + }, + { + "epoch": 2.7654307890875014, + "grad_norm": 13.742483139038086, + "learning_rate": 3.9094868485416405e-06, + "loss": 0.5049, + "step": 312820 + }, + { + "epoch": 2.7655191923478135, + "grad_norm": 3.759416103363037, + "learning_rate": 3.90801346086977e-06, + "loss": 0.509, + "step": 312830 + }, + { + "epoch": 2.765607595608126, + "grad_norm": 4.912713050842285, + "learning_rate": 3.9065400731979e-06, + "loss": 0.5621, + "step": 312840 + }, + { + "epoch": 2.765695998868438, + "grad_norm": 6.19514274597168, + "learning_rate": 3.905066685526029e-06, + "loss": 0.6134, + "step": 312850 + }, + { + "epoch": 2.7657844021287508, + "grad_norm": 4.04016637802124, + "learning_rate": 3.903593297854159e-06, + "loss": 0.4308, + "step": 312860 + }, + { + "epoch": 2.765872805389063, + "grad_norm": 7.4898295402526855, + "learning_rate": 3.902119910182288e-06, + "loss": 0.6066, + "step": 312870 + }, + { + "epoch": 2.765961208649375, + "grad_norm": 3.961318254470825, + "learning_rate": 3.900646522510417e-06, + "loss": 0.5417, + "step": 312880 + }, + { + "epoch": 2.766049611909687, + "grad_norm": 3.1101455688476562, + "learning_rate": 3.8991731348385466e-06, + "loss": 0.4317, + "step": 312890 + }, + { + "epoch": 2.7661380151699992, + "grad_norm": 5.902585029602051, + "learning_rate": 3.897699747166676e-06, + "loss": 0.5321, + "step": 312900 + }, + { + "epoch": 2.766226418430312, + "grad_norm": 2.630932331085205, + "learning_rate": 3.896226359494805e-06, + "loss": 0.4835, + "step": 312910 + }, + { + "epoch": 2.766314821690624, + "grad_norm": 4.235104560852051, + "learning_rate": 3.894752971822934e-06, + "loss": 0.4668, + "step": 312920 + }, + { + "epoch": 2.766403224950936, + "grad_norm": 4.690268516540527, + "learning_rate": 3.893279584151064e-06, + "loss": 0.4628, + "step": 312930 + }, + { + "epoch": 2.7664916282112486, + "grad_norm": 18.463048934936523, + "learning_rate": 3.891806196479193e-06, + "loss": 0.5757, + "step": 312940 + }, + { + "epoch": 2.7665800314715607, + "grad_norm": 2.744863748550415, + "learning_rate": 3.890332808807323e-06, + "loss": 0.5859, + "step": 312950 + }, + { + "epoch": 2.766668434731873, + "grad_norm": 1.6896358728408813, + "learning_rate": 3.888859421135452e-06, + "loss": 0.4693, + "step": 312960 + }, + { + "epoch": 2.766756837992185, + "grad_norm": 2.6269755363464355, + "learning_rate": 3.887386033463581e-06, + "loss": 0.5169, + "step": 312970 + }, + { + "epoch": 2.7668452412524975, + "grad_norm": 3.954969644546509, + "learning_rate": 3.88591264579171e-06, + "loss": 0.5649, + "step": 312980 + }, + { + "epoch": 2.7669336445128097, + "grad_norm": 2.292372465133667, + "learning_rate": 3.884439258119839e-06, + "loss": 0.5398, + "step": 312990 + }, + { + "epoch": 2.7670220477731218, + "grad_norm": 2.1327450275421143, + "learning_rate": 3.882965870447969e-06, + "loss": 0.5675, + "step": 313000 + }, + { + "epoch": 2.7671104510334343, + "grad_norm": 13.396915435791016, + "learning_rate": 3.881492482776098e-06, + "loss": 0.5227, + "step": 313010 + }, + { + "epoch": 2.7671988542937465, + "grad_norm": 3.5894582271575928, + "learning_rate": 3.880019095104227e-06, + "loss": 0.5831, + "step": 313020 + }, + { + "epoch": 2.7672872575540586, + "grad_norm": 5.093552112579346, + "learning_rate": 3.878545707432357e-06, + "loss": 0.4549, + "step": 313030 + }, + { + "epoch": 2.7673756608143707, + "grad_norm": 4.5539631843566895, + "learning_rate": 3.877072319760486e-06, + "loss": 0.5616, + "step": 313040 + }, + { + "epoch": 2.767464064074683, + "grad_norm": 10.130762100219727, + "learning_rate": 3.8755989320886155e-06, + "loss": 0.5243, + "step": 313050 + }, + { + "epoch": 2.7675524673349954, + "grad_norm": 3.250615358352661, + "learning_rate": 3.874125544416745e-06, + "loss": 0.4278, + "step": 313060 + }, + { + "epoch": 2.7676408705953075, + "grad_norm": 1.1886354684829712, + "learning_rate": 3.872652156744874e-06, + "loss": 0.531, + "step": 313070 + }, + { + "epoch": 2.76772927385562, + "grad_norm": 2.883136510848999, + "learning_rate": 3.871178769073004e-06, + "loss": 0.4733, + "step": 313080 + }, + { + "epoch": 2.767817677115932, + "grad_norm": 6.4930009841918945, + "learning_rate": 3.869705381401133e-06, + "loss": 0.5012, + "step": 313090 + }, + { + "epoch": 2.7679060803762443, + "grad_norm": 2.402261734008789, + "learning_rate": 3.868231993729262e-06, + "loss": 0.4867, + "step": 313100 + }, + { + "epoch": 2.7679944836365564, + "grad_norm": 3.221879720687866, + "learning_rate": 3.8667586060573915e-06, + "loss": 0.5136, + "step": 313110 + }, + { + "epoch": 2.7680828868968685, + "grad_norm": 5.681375026702881, + "learning_rate": 3.8652852183855216e-06, + "loss": 0.4234, + "step": 313120 + }, + { + "epoch": 2.768171290157181, + "grad_norm": 4.598705768585205, + "learning_rate": 3.863811830713651e-06, + "loss": 0.4139, + "step": 313130 + }, + { + "epoch": 2.7682596934174932, + "grad_norm": 1.6188185214996338, + "learning_rate": 3.86233844304178e-06, + "loss": 0.4956, + "step": 313140 + }, + { + "epoch": 2.7683480966778053, + "grad_norm": 6.759064197540283, + "learning_rate": 3.860865055369909e-06, + "loss": 0.6233, + "step": 313150 + }, + { + "epoch": 2.768436499938118, + "grad_norm": 1.0389097929000854, + "learning_rate": 3.859391667698038e-06, + "loss": 0.4853, + "step": 313160 + }, + { + "epoch": 2.76852490319843, + "grad_norm": 2.5873327255249023, + "learning_rate": 3.8579182800261676e-06, + "loss": 0.514, + "step": 313170 + }, + { + "epoch": 2.768613306458742, + "grad_norm": 7.9622015953063965, + "learning_rate": 3.856444892354297e-06, + "loss": 0.5855, + "step": 313180 + }, + { + "epoch": 2.7687017097190543, + "grad_norm": 3.9871623516082764, + "learning_rate": 3.854971504682426e-06, + "loss": 0.4595, + "step": 313190 + }, + { + "epoch": 2.768790112979367, + "grad_norm": 0.9736635684967041, + "learning_rate": 3.853498117010555e-06, + "loss": 0.4731, + "step": 313200 + }, + { + "epoch": 2.768878516239679, + "grad_norm": 3.7336676120758057, + "learning_rate": 3.852024729338685e-06, + "loss": 0.4607, + "step": 313210 + }, + { + "epoch": 2.768966919499991, + "grad_norm": 2.697938919067383, + "learning_rate": 3.8505513416668144e-06, + "loss": 0.572, + "step": 313220 + }, + { + "epoch": 2.7690553227603036, + "grad_norm": 7.739188194274902, + "learning_rate": 3.849077953994944e-06, + "loss": 0.4966, + "step": 313230 + }, + { + "epoch": 2.7691437260206158, + "grad_norm": 8.691061019897461, + "learning_rate": 3.847604566323073e-06, + "loss": 0.552, + "step": 313240 + }, + { + "epoch": 2.769232129280928, + "grad_norm": 1.4232304096221924, + "learning_rate": 3.846131178651202e-06, + "loss": 0.5146, + "step": 313250 + }, + { + "epoch": 2.76932053254124, + "grad_norm": 2.6203510761260986, + "learning_rate": 3.844657790979331e-06, + "loss": 0.4884, + "step": 313260 + }, + { + "epoch": 2.769408935801552, + "grad_norm": 5.082408428192139, + "learning_rate": 3.8431844033074604e-06, + "loss": 0.5668, + "step": 313270 + }, + { + "epoch": 2.7694973390618647, + "grad_norm": 2.272325277328491, + "learning_rate": 3.84171101563559e-06, + "loss": 0.4511, + "step": 313280 + }, + { + "epoch": 2.769585742322177, + "grad_norm": 12.044944763183594, + "learning_rate": 3.840237627963719e-06, + "loss": 0.5535, + "step": 313290 + }, + { + "epoch": 2.769674145582489, + "grad_norm": 1.328391671180725, + "learning_rate": 3.838764240291849e-06, + "loss": 0.4916, + "step": 313300 + }, + { + "epoch": 2.7697625488428015, + "grad_norm": 1.3189427852630615, + "learning_rate": 3.837290852619978e-06, + "loss": 0.506, + "step": 313310 + }, + { + "epoch": 2.7698509521031136, + "grad_norm": 12.740005493164062, + "learning_rate": 3.835817464948107e-06, + "loss": 0.5083, + "step": 313320 + }, + { + "epoch": 2.7699393553634257, + "grad_norm": 1.0671952962875366, + "learning_rate": 3.834344077276237e-06, + "loss": 0.3931, + "step": 313330 + }, + { + "epoch": 2.770027758623738, + "grad_norm": 4.176941871643066, + "learning_rate": 3.8328706896043665e-06, + "loss": 0.5638, + "step": 313340 + }, + { + "epoch": 2.7701161618840504, + "grad_norm": 1.1260337829589844, + "learning_rate": 3.831397301932496e-06, + "loss": 0.4187, + "step": 313350 + }, + { + "epoch": 2.7702045651443625, + "grad_norm": 9.250699043273926, + "learning_rate": 3.829923914260625e-06, + "loss": 0.4459, + "step": 313360 + }, + { + "epoch": 2.7702929684046746, + "grad_norm": 6.7021260261535645, + "learning_rate": 3.828450526588754e-06, + "loss": 0.404, + "step": 313370 + }, + { + "epoch": 2.770381371664987, + "grad_norm": 20.261720657348633, + "learning_rate": 3.826977138916883e-06, + "loss": 0.5971, + "step": 313380 + }, + { + "epoch": 2.7704697749252993, + "grad_norm": 3.922633171081543, + "learning_rate": 3.8255037512450125e-06, + "loss": 0.4997, + "step": 313390 + }, + { + "epoch": 2.7705581781856115, + "grad_norm": 6.340877532958984, + "learning_rate": 3.824030363573143e-06, + "loss": 0.4607, + "step": 313400 + }, + { + "epoch": 2.7706465814459236, + "grad_norm": 2.4211339950561523, + "learning_rate": 3.822556975901272e-06, + "loss": 0.4564, + "step": 313410 + }, + { + "epoch": 2.7707349847062357, + "grad_norm": 3.2860107421875, + "learning_rate": 3.821083588229401e-06, + "loss": 0.6063, + "step": 313420 + }, + { + "epoch": 2.7708233879665483, + "grad_norm": 7.685926914215088, + "learning_rate": 3.81961020055753e-06, + "loss": 0.5166, + "step": 313430 + }, + { + "epoch": 2.7709117912268604, + "grad_norm": 2.749436855316162, + "learning_rate": 3.818136812885659e-06, + "loss": 0.4573, + "step": 313440 + }, + { + "epoch": 2.771000194487173, + "grad_norm": 1.249379277229309, + "learning_rate": 3.816663425213789e-06, + "loss": 0.4484, + "step": 313450 + }, + { + "epoch": 2.771088597747485, + "grad_norm": 1.9956929683685303, + "learning_rate": 3.815190037541918e-06, + "loss": 0.4472, + "step": 313460 + }, + { + "epoch": 2.771177001007797, + "grad_norm": 8.914753913879395, + "learning_rate": 3.813716649870047e-06, + "loss": 0.6255, + "step": 313470 + }, + { + "epoch": 2.7712654042681093, + "grad_norm": 3.2763524055480957, + "learning_rate": 3.8122432621981766e-06, + "loss": 0.6043, + "step": 313480 + }, + { + "epoch": 2.7713538075284214, + "grad_norm": 5.121274471282959, + "learning_rate": 3.8107698745263062e-06, + "loss": 0.5555, + "step": 313490 + }, + { + "epoch": 2.771442210788734, + "grad_norm": 18.74786376953125, + "learning_rate": 3.8092964868544354e-06, + "loss": 0.41, + "step": 313500 + }, + { + "epoch": 2.771530614049046, + "grad_norm": 20.469131469726562, + "learning_rate": 3.807823099182565e-06, + "loss": 0.6346, + "step": 313510 + }, + { + "epoch": 2.7716190173093582, + "grad_norm": 3.9166696071624756, + "learning_rate": 3.8063497115106943e-06, + "loss": 0.4967, + "step": 313520 + }, + { + "epoch": 2.771707420569671, + "grad_norm": 2.0088136196136475, + "learning_rate": 3.8048763238388235e-06, + "loss": 0.5189, + "step": 313530 + }, + { + "epoch": 2.771795823829983, + "grad_norm": 1.5406255722045898, + "learning_rate": 3.8034029361669527e-06, + "loss": 0.4486, + "step": 313540 + }, + { + "epoch": 2.771884227090295, + "grad_norm": 3.5250675678253174, + "learning_rate": 3.801929548495082e-06, + "loss": 0.4775, + "step": 313550 + }, + { + "epoch": 2.771972630350607, + "grad_norm": 0.7514532804489136, + "learning_rate": 3.800456160823211e-06, + "loss": 0.5381, + "step": 313560 + }, + { + "epoch": 2.7720610336109197, + "grad_norm": 3.070068597793579, + "learning_rate": 3.7989827731513403e-06, + "loss": 0.5215, + "step": 313570 + }, + { + "epoch": 2.772149436871232, + "grad_norm": 6.7211737632751465, + "learning_rate": 3.7975093854794695e-06, + "loss": 0.507, + "step": 313580 + }, + { + "epoch": 2.772237840131544, + "grad_norm": 1.6284677982330322, + "learning_rate": 3.7960359978075995e-06, + "loss": 0.4214, + "step": 313590 + }, + { + "epoch": 2.7723262433918565, + "grad_norm": 3.1945836544036865, + "learning_rate": 3.7945626101357287e-06, + "loss": 0.4423, + "step": 313600 + }, + { + "epoch": 2.7724146466521686, + "grad_norm": 2.008345127105713, + "learning_rate": 3.793089222463858e-06, + "loss": 0.5987, + "step": 313610 + }, + { + "epoch": 2.7725030499124808, + "grad_norm": 5.114248275756836, + "learning_rate": 3.7916158347919875e-06, + "loss": 0.5603, + "step": 313620 + }, + { + "epoch": 2.772591453172793, + "grad_norm": 3.6853740215301514, + "learning_rate": 3.7901424471201167e-06, + "loss": 0.6753, + "step": 313630 + }, + { + "epoch": 2.772679856433105, + "grad_norm": 6.843773365020752, + "learning_rate": 3.788669059448246e-06, + "loss": 0.5863, + "step": 313640 + }, + { + "epoch": 2.7727682596934176, + "grad_norm": 2.0101888179779053, + "learning_rate": 3.787195671776375e-06, + "loss": 0.4326, + "step": 313650 + }, + { + "epoch": 2.7728566629537297, + "grad_norm": 1.8246397972106934, + "learning_rate": 3.7857222841045044e-06, + "loss": 0.4532, + "step": 313660 + }, + { + "epoch": 2.7729450662140422, + "grad_norm": 5.767509460449219, + "learning_rate": 3.7842488964326336e-06, + "loss": 0.348, + "step": 313670 + }, + { + "epoch": 2.7730334694743544, + "grad_norm": 2.4356300830841064, + "learning_rate": 3.7827755087607636e-06, + "loss": 0.5171, + "step": 313680 + }, + { + "epoch": 2.7731218727346665, + "grad_norm": 3.3924214839935303, + "learning_rate": 3.781302121088893e-06, + "loss": 0.4516, + "step": 313690 + }, + { + "epoch": 2.7732102759949786, + "grad_norm": 4.0258049964904785, + "learning_rate": 3.779828733417022e-06, + "loss": 0.5957, + "step": 313700 + }, + { + "epoch": 2.7732986792552907, + "grad_norm": 6.361627578735352, + "learning_rate": 3.778355345745151e-06, + "loss": 0.493, + "step": 313710 + }, + { + "epoch": 2.7733870825156033, + "grad_norm": 2.784024477005005, + "learning_rate": 3.7768819580732804e-06, + "loss": 0.4629, + "step": 313720 + }, + { + "epoch": 2.7734754857759154, + "grad_norm": 29.80612564086914, + "learning_rate": 3.77540857040141e-06, + "loss": 0.6374, + "step": 313730 + }, + { + "epoch": 2.7735638890362275, + "grad_norm": 2.4582245349884033, + "learning_rate": 3.7739351827295392e-06, + "loss": 0.4834, + "step": 313740 + }, + { + "epoch": 2.77365229229654, + "grad_norm": 6.3922905921936035, + "learning_rate": 3.7724617950576684e-06, + "loss": 0.5737, + "step": 313750 + }, + { + "epoch": 2.773740695556852, + "grad_norm": 1.4135488271713257, + "learning_rate": 3.7709884073857976e-06, + "loss": 0.5479, + "step": 313760 + }, + { + "epoch": 2.7738290988171643, + "grad_norm": 2.377000570297241, + "learning_rate": 3.7695150197139277e-06, + "loss": 0.578, + "step": 313770 + }, + { + "epoch": 2.7739175020774764, + "grad_norm": 3.3794493675231934, + "learning_rate": 3.768041632042057e-06, + "loss": 0.4449, + "step": 313780 + }, + { + "epoch": 2.774005905337789, + "grad_norm": 2.67445969581604, + "learning_rate": 3.766568244370186e-06, + "loss": 0.385, + "step": 313790 + }, + { + "epoch": 2.774094308598101, + "grad_norm": 6.490968704223633, + "learning_rate": 3.7650948566983153e-06, + "loss": 0.5551, + "step": 313800 + }, + { + "epoch": 2.7741827118584133, + "grad_norm": 2.0637314319610596, + "learning_rate": 3.7636214690264445e-06, + "loss": 0.5571, + "step": 313810 + }, + { + "epoch": 2.774271115118726, + "grad_norm": 5.656533241271973, + "learning_rate": 3.7621480813545737e-06, + "loss": 0.534, + "step": 313820 + }, + { + "epoch": 2.774359518379038, + "grad_norm": 1.9322290420532227, + "learning_rate": 3.760674693682703e-06, + "loss": 0.4277, + "step": 313830 + }, + { + "epoch": 2.77444792163935, + "grad_norm": 1.2548762559890747, + "learning_rate": 3.7592013060108325e-06, + "loss": 0.3814, + "step": 313840 + }, + { + "epoch": 2.774536324899662, + "grad_norm": 3.8693995475769043, + "learning_rate": 3.7577279183389617e-06, + "loss": 0.6072, + "step": 313850 + }, + { + "epoch": 2.7746247281599743, + "grad_norm": 4.182264804840088, + "learning_rate": 3.756254530667091e-06, + "loss": 0.3806, + "step": 313860 + }, + { + "epoch": 2.774713131420287, + "grad_norm": 1.6564732789993286, + "learning_rate": 3.754781142995221e-06, + "loss": 0.5649, + "step": 313870 + }, + { + "epoch": 2.774801534680599, + "grad_norm": 18.119361877441406, + "learning_rate": 3.75330775532335e-06, + "loss": 0.6424, + "step": 313880 + }, + { + "epoch": 2.774889937940911, + "grad_norm": 2.9986746311187744, + "learning_rate": 3.7518343676514794e-06, + "loss": 0.6263, + "step": 313890 + }, + { + "epoch": 2.7749783412012237, + "grad_norm": 0.9548075199127197, + "learning_rate": 3.7503609799796086e-06, + "loss": 0.4014, + "step": 313900 + }, + { + "epoch": 2.775066744461536, + "grad_norm": 1.5472418069839478, + "learning_rate": 3.7488875923077378e-06, + "loss": 0.4867, + "step": 313910 + }, + { + "epoch": 2.775155147721848, + "grad_norm": 6.006728649139404, + "learning_rate": 3.747414204635867e-06, + "loss": 0.4374, + "step": 313920 + }, + { + "epoch": 2.77524355098216, + "grad_norm": 3.8330087661743164, + "learning_rate": 3.745940816963996e-06, + "loss": 0.5356, + "step": 313930 + }, + { + "epoch": 2.7753319542424726, + "grad_norm": 14.76555061340332, + "learning_rate": 3.7444674292921254e-06, + "loss": 0.4845, + "step": 313940 + }, + { + "epoch": 2.7754203575027847, + "grad_norm": 3.4459660053253174, + "learning_rate": 3.742994041620255e-06, + "loss": 0.5394, + "step": 313950 + }, + { + "epoch": 2.775508760763097, + "grad_norm": 0.8229383230209351, + "learning_rate": 3.7415206539483846e-06, + "loss": 0.5321, + "step": 313960 + }, + { + "epoch": 2.7755971640234094, + "grad_norm": 11.196011543273926, + "learning_rate": 3.740047266276514e-06, + "loss": 0.5304, + "step": 313970 + }, + { + "epoch": 2.7756855672837215, + "grad_norm": 3.323005437850952, + "learning_rate": 3.7385738786046434e-06, + "loss": 0.5227, + "step": 313980 + }, + { + "epoch": 2.7757739705440336, + "grad_norm": 6.54466438293457, + "learning_rate": 3.7371004909327726e-06, + "loss": 0.5822, + "step": 313990 + }, + { + "epoch": 2.7758623738043458, + "grad_norm": 4.9192280769348145, + "learning_rate": 3.735627103260902e-06, + "loss": 0.5537, + "step": 314000 + }, + { + "epoch": 2.775950777064658, + "grad_norm": 2.7808403968811035, + "learning_rate": 3.734153715589031e-06, + "loss": 0.6014, + "step": 314010 + }, + { + "epoch": 2.7760391803249704, + "grad_norm": 2.377004623413086, + "learning_rate": 3.7326803279171602e-06, + "loss": 0.4809, + "step": 314020 + }, + { + "epoch": 2.7761275835852826, + "grad_norm": 17.623794555664062, + "learning_rate": 3.7312069402452894e-06, + "loss": 0.5362, + "step": 314030 + }, + { + "epoch": 2.776215986845595, + "grad_norm": 3.262930393218994, + "learning_rate": 3.7297335525734186e-06, + "loss": 0.5589, + "step": 314040 + }, + { + "epoch": 2.7763043901059072, + "grad_norm": 1.8514186143875122, + "learning_rate": 3.7282601649015487e-06, + "loss": 0.542, + "step": 314050 + }, + { + "epoch": 2.7763927933662194, + "grad_norm": 5.502357482910156, + "learning_rate": 3.726786777229678e-06, + "loss": 0.4841, + "step": 314060 + }, + { + "epoch": 2.7764811966265315, + "grad_norm": 7.487277030944824, + "learning_rate": 3.725313389557807e-06, + "loss": 0.5064, + "step": 314070 + }, + { + "epoch": 2.7765695998868436, + "grad_norm": 1.1674416065216064, + "learning_rate": 3.7238400018859367e-06, + "loss": 0.494, + "step": 314080 + }, + { + "epoch": 2.776658003147156, + "grad_norm": 2.620814085006714, + "learning_rate": 3.722366614214066e-06, + "loss": 0.5208, + "step": 314090 + }, + { + "epoch": 2.7767464064074683, + "grad_norm": 4.034411430358887, + "learning_rate": 3.720893226542195e-06, + "loss": 0.522, + "step": 314100 + }, + { + "epoch": 2.7768348096677804, + "grad_norm": 6.212573051452637, + "learning_rate": 3.7194198388703243e-06, + "loss": 0.7441, + "step": 314110 + }, + { + "epoch": 2.776923212928093, + "grad_norm": 5.3345417976379395, + "learning_rate": 3.7179464511984535e-06, + "loss": 0.529, + "step": 314120 + }, + { + "epoch": 2.777011616188405, + "grad_norm": 21.909818649291992, + "learning_rate": 3.7164730635265827e-06, + "loss": 0.5073, + "step": 314130 + }, + { + "epoch": 2.777100019448717, + "grad_norm": 2.4027044773101807, + "learning_rate": 3.714999675854712e-06, + "loss": 0.5532, + "step": 314140 + }, + { + "epoch": 2.7771884227090293, + "grad_norm": 2.6120500564575195, + "learning_rate": 3.713526288182842e-06, + "loss": 0.5433, + "step": 314150 + }, + { + "epoch": 2.777276825969342, + "grad_norm": 6.438935279846191, + "learning_rate": 3.712052900510971e-06, + "loss": 0.5249, + "step": 314160 + }, + { + "epoch": 2.777365229229654, + "grad_norm": 4.62705659866333, + "learning_rate": 3.7105795128391004e-06, + "loss": 0.4919, + "step": 314170 + }, + { + "epoch": 2.777453632489966, + "grad_norm": 2.9837143421173096, + "learning_rate": 3.7091061251672296e-06, + "loss": 0.474, + "step": 314180 + }, + { + "epoch": 2.7775420357502787, + "grad_norm": 3.710601806640625, + "learning_rate": 3.707632737495359e-06, + "loss": 0.5916, + "step": 314190 + }, + { + "epoch": 2.777630439010591, + "grad_norm": 8.859443664550781, + "learning_rate": 3.7061593498234884e-06, + "loss": 0.548, + "step": 314200 + }, + { + "epoch": 2.777718842270903, + "grad_norm": 5.2106614112854, + "learning_rate": 3.7046859621516176e-06, + "loss": 0.4997, + "step": 314210 + }, + { + "epoch": 2.777807245531215, + "grad_norm": 5.250215530395508, + "learning_rate": 3.703212574479747e-06, + "loss": 0.5477, + "step": 314220 + }, + { + "epoch": 2.777895648791527, + "grad_norm": 9.506060600280762, + "learning_rate": 3.701739186807876e-06, + "loss": 0.3889, + "step": 314230 + }, + { + "epoch": 2.7779840520518397, + "grad_norm": 2.166919469833374, + "learning_rate": 3.700265799136006e-06, + "loss": 0.5536, + "step": 314240 + }, + { + "epoch": 2.778072455312152, + "grad_norm": 0.8262235522270203, + "learning_rate": 3.6987924114641353e-06, + "loss": 0.5273, + "step": 314250 + }, + { + "epoch": 2.7781608585724644, + "grad_norm": 2.15671443939209, + "learning_rate": 3.6973190237922645e-06, + "loss": 0.5512, + "step": 314260 + }, + { + "epoch": 2.7782492618327765, + "grad_norm": 1.7014296054840088, + "learning_rate": 3.6958456361203937e-06, + "loss": 0.4179, + "step": 314270 + }, + { + "epoch": 2.7783376650930887, + "grad_norm": 9.909805297851562, + "learning_rate": 3.694372248448523e-06, + "loss": 0.5132, + "step": 314280 + }, + { + "epoch": 2.778426068353401, + "grad_norm": 4.898975372314453, + "learning_rate": 3.692898860776652e-06, + "loss": 0.694, + "step": 314290 + }, + { + "epoch": 2.778514471613713, + "grad_norm": 4.458024501800537, + "learning_rate": 3.6914254731047817e-06, + "loss": 0.5918, + "step": 314300 + }, + { + "epoch": 2.7786028748740255, + "grad_norm": 1.7523009777069092, + "learning_rate": 3.689952085432911e-06, + "loss": 0.5101, + "step": 314310 + }, + { + "epoch": 2.7786912781343376, + "grad_norm": 1.046875, + "learning_rate": 3.68847869776104e-06, + "loss": 0.4534, + "step": 314320 + }, + { + "epoch": 2.7787796813946497, + "grad_norm": 6.153555393218994, + "learning_rate": 3.68700531008917e-06, + "loss": 0.6034, + "step": 314330 + }, + { + "epoch": 2.7788680846549623, + "grad_norm": 5.820525169372559, + "learning_rate": 3.6855319224172993e-06, + "loss": 0.5568, + "step": 314340 + }, + { + "epoch": 2.7789564879152744, + "grad_norm": 8.369475364685059, + "learning_rate": 3.6840585347454285e-06, + "loss": 0.556, + "step": 314350 + }, + { + "epoch": 2.7790448911755865, + "grad_norm": 3.767817497253418, + "learning_rate": 3.6825851470735577e-06, + "loss": 0.5706, + "step": 314360 + }, + { + "epoch": 2.7791332944358986, + "grad_norm": 3.99900484085083, + "learning_rate": 3.681111759401687e-06, + "loss": 0.4659, + "step": 314370 + }, + { + "epoch": 2.779221697696211, + "grad_norm": 1.510669231414795, + "learning_rate": 3.679638371729816e-06, + "loss": 0.4955, + "step": 314380 + }, + { + "epoch": 2.7793101009565233, + "grad_norm": 4.215996742248535, + "learning_rate": 3.6781649840579453e-06, + "loss": 0.38, + "step": 314390 + }, + { + "epoch": 2.7793985042168354, + "grad_norm": 1.108500361442566, + "learning_rate": 3.6766915963860745e-06, + "loss": 0.4957, + "step": 314400 + }, + { + "epoch": 2.779486907477148, + "grad_norm": 1.6522547006607056, + "learning_rate": 3.675218208714204e-06, + "loss": 0.5406, + "step": 314410 + }, + { + "epoch": 2.77957531073746, + "grad_norm": 1.7331931591033936, + "learning_rate": 3.6737448210423334e-06, + "loss": 0.4881, + "step": 314420 + }, + { + "epoch": 2.7796637139977722, + "grad_norm": 4.7645463943481445, + "learning_rate": 3.672271433370463e-06, + "loss": 0.4948, + "step": 314430 + }, + { + "epoch": 2.7797521172580844, + "grad_norm": 9.948087692260742, + "learning_rate": 3.6707980456985926e-06, + "loss": 0.47, + "step": 314440 + }, + { + "epoch": 2.7798405205183965, + "grad_norm": 1.2352344989776611, + "learning_rate": 3.669324658026722e-06, + "loss": 0.4052, + "step": 314450 + }, + { + "epoch": 2.779928923778709, + "grad_norm": 9.345041275024414, + "learning_rate": 3.667851270354851e-06, + "loss": 0.3767, + "step": 314460 + }, + { + "epoch": 2.780017327039021, + "grad_norm": 4.915680885314941, + "learning_rate": 3.6663778826829802e-06, + "loss": 0.6126, + "step": 314470 + }, + { + "epoch": 2.7801057302993333, + "grad_norm": 2.2520575523376465, + "learning_rate": 3.6649044950111094e-06, + "loss": 0.6684, + "step": 314480 + }, + { + "epoch": 2.780194133559646, + "grad_norm": 4.001309394836426, + "learning_rate": 3.6634311073392386e-06, + "loss": 0.5048, + "step": 314490 + }, + { + "epoch": 2.780282536819958, + "grad_norm": 3.950244188308716, + "learning_rate": 3.661957719667368e-06, + "loss": 0.4775, + "step": 314500 + }, + { + "epoch": 2.78037094008027, + "grad_norm": 2.3484959602355957, + "learning_rate": 3.660484331995497e-06, + "loss": 0.5236, + "step": 314510 + }, + { + "epoch": 2.780459343340582, + "grad_norm": 3.1119847297668457, + "learning_rate": 3.659010944323627e-06, + "loss": 0.4395, + "step": 314520 + }, + { + "epoch": 2.7805477466008948, + "grad_norm": 2.2461459636688232, + "learning_rate": 3.6575375566517563e-06, + "loss": 0.5011, + "step": 314530 + }, + { + "epoch": 2.780636149861207, + "grad_norm": 2.1546945571899414, + "learning_rate": 3.6560641689798855e-06, + "loss": 0.5131, + "step": 314540 + }, + { + "epoch": 2.780724553121519, + "grad_norm": 4.654511451721191, + "learning_rate": 3.654590781308015e-06, + "loss": 0.4014, + "step": 314550 + }, + { + "epoch": 2.7808129563818316, + "grad_norm": 7.661648273468018, + "learning_rate": 3.6531173936361443e-06, + "loss": 0.5311, + "step": 314560 + }, + { + "epoch": 2.7809013596421437, + "grad_norm": 2.1793150901794434, + "learning_rate": 3.6516440059642735e-06, + "loss": 0.4593, + "step": 314570 + }, + { + "epoch": 2.780989762902456, + "grad_norm": 3.7710986137390137, + "learning_rate": 3.6501706182924027e-06, + "loss": 0.4531, + "step": 314580 + }, + { + "epoch": 2.781078166162768, + "grad_norm": 3.729295253753662, + "learning_rate": 3.648697230620532e-06, + "loss": 0.4976, + "step": 314590 + }, + { + "epoch": 2.78116656942308, + "grad_norm": 2.133481025695801, + "learning_rate": 3.647223842948661e-06, + "loss": 0.6849, + "step": 314600 + }, + { + "epoch": 2.7812549726833926, + "grad_norm": 4.2851786613464355, + "learning_rate": 3.645750455276791e-06, + "loss": 0.59, + "step": 314610 + }, + { + "epoch": 2.7813433759437047, + "grad_norm": 2.902780294418335, + "learning_rate": 3.6442770676049204e-06, + "loss": 0.5535, + "step": 314620 + }, + { + "epoch": 2.7814317792040173, + "grad_norm": 8.727551460266113, + "learning_rate": 3.6428036799330496e-06, + "loss": 0.6289, + "step": 314630 + }, + { + "epoch": 2.7815201824643294, + "grad_norm": 9.369818687438965, + "learning_rate": 3.6413302922611788e-06, + "loss": 0.4416, + "step": 314640 + }, + { + "epoch": 2.7816085857246415, + "grad_norm": 6.459123611450195, + "learning_rate": 3.639856904589308e-06, + "loss": 0.6533, + "step": 314650 + }, + { + "epoch": 2.7816969889849537, + "grad_norm": 2.0000269412994385, + "learning_rate": 3.6383835169174376e-06, + "loss": 0.5016, + "step": 314660 + }, + { + "epoch": 2.7817853922452658, + "grad_norm": 5.809001445770264, + "learning_rate": 3.6369101292455668e-06, + "loss": 0.4538, + "step": 314670 + }, + { + "epoch": 2.7818737955055783, + "grad_norm": 1.7735404968261719, + "learning_rate": 3.635436741573696e-06, + "loss": 0.4974, + "step": 314680 + }, + { + "epoch": 2.7819621987658905, + "grad_norm": 2.2799549102783203, + "learning_rate": 3.633963353901825e-06, + "loss": 0.257, + "step": 314690 + }, + { + "epoch": 2.7820506020262026, + "grad_norm": 3.503387928009033, + "learning_rate": 3.6324899662299544e-06, + "loss": 0.4766, + "step": 314700 + }, + { + "epoch": 2.782139005286515, + "grad_norm": 3.3732802867889404, + "learning_rate": 3.6310165785580844e-06, + "loss": 0.5454, + "step": 314710 + }, + { + "epoch": 2.7822274085468273, + "grad_norm": 1.9952423572540283, + "learning_rate": 3.6295431908862136e-06, + "loss": 0.5338, + "step": 314720 + }, + { + "epoch": 2.7823158118071394, + "grad_norm": 1.1369709968566895, + "learning_rate": 3.628069803214343e-06, + "loss": 0.445, + "step": 314730 + }, + { + "epoch": 2.7824042150674515, + "grad_norm": 5.167178153991699, + "learning_rate": 3.626596415542472e-06, + "loss": 0.5208, + "step": 314740 + }, + { + "epoch": 2.782492618327764, + "grad_norm": 5.099582672119141, + "learning_rate": 3.6251230278706012e-06, + "loss": 0.5532, + "step": 314750 + }, + { + "epoch": 2.782581021588076, + "grad_norm": 7.773534774780273, + "learning_rate": 3.6236496401987304e-06, + "loss": 0.5675, + "step": 314760 + }, + { + "epoch": 2.7826694248483883, + "grad_norm": 0.9430314898490906, + "learning_rate": 3.62217625252686e-06, + "loss": 0.4223, + "step": 314770 + }, + { + "epoch": 2.782757828108701, + "grad_norm": 2.689375162124634, + "learning_rate": 3.6207028648549893e-06, + "loss": 0.5118, + "step": 314780 + }, + { + "epoch": 2.782846231369013, + "grad_norm": 2.6311779022216797, + "learning_rate": 3.6192294771831185e-06, + "loss": 0.5312, + "step": 314790 + }, + { + "epoch": 2.782934634629325, + "grad_norm": 1.7061364650726318, + "learning_rate": 3.6177560895112485e-06, + "loss": 0.49, + "step": 314800 + }, + { + "epoch": 2.7830230378896372, + "grad_norm": 1.4590952396392822, + "learning_rate": 3.6162827018393777e-06, + "loss": 0.5112, + "step": 314810 + }, + { + "epoch": 2.7831114411499493, + "grad_norm": 3.315056800842285, + "learning_rate": 3.614809314167507e-06, + "loss": 0.5625, + "step": 314820 + }, + { + "epoch": 2.783199844410262, + "grad_norm": 3.8450679779052734, + "learning_rate": 3.613335926495636e-06, + "loss": 0.5875, + "step": 314830 + }, + { + "epoch": 2.783288247670574, + "grad_norm": 0.9007291793823242, + "learning_rate": 3.6118625388237653e-06, + "loss": 0.3947, + "step": 314840 + }, + { + "epoch": 2.7833766509308866, + "grad_norm": 1.677483320236206, + "learning_rate": 3.6103891511518945e-06, + "loss": 0.6217, + "step": 314850 + }, + { + "epoch": 2.7834650541911987, + "grad_norm": 1.5345635414123535, + "learning_rate": 3.6089157634800237e-06, + "loss": 0.5196, + "step": 314860 + }, + { + "epoch": 2.783553457451511, + "grad_norm": 3.9862959384918213, + "learning_rate": 3.607442375808153e-06, + "loss": 0.436, + "step": 314870 + }, + { + "epoch": 2.783641860711823, + "grad_norm": 3.793017625808716, + "learning_rate": 3.6059689881362825e-06, + "loss": 0.4247, + "step": 314880 + }, + { + "epoch": 2.783730263972135, + "grad_norm": 2.2815451622009277, + "learning_rate": 3.6044956004644117e-06, + "loss": 0.4803, + "step": 314890 + }, + { + "epoch": 2.7838186672324476, + "grad_norm": 2.2748966217041016, + "learning_rate": 3.6030222127925414e-06, + "loss": 0.5965, + "step": 314900 + }, + { + "epoch": 2.7839070704927598, + "grad_norm": 2.6525423526763916, + "learning_rate": 3.601548825120671e-06, + "loss": 0.6062, + "step": 314910 + }, + { + "epoch": 2.783995473753072, + "grad_norm": 1.8727397918701172, + "learning_rate": 3.6000754374488e-06, + "loss": 0.4773, + "step": 314920 + }, + { + "epoch": 2.7840838770133844, + "grad_norm": 2.5068228244781494, + "learning_rate": 3.5986020497769294e-06, + "loss": 0.5789, + "step": 314930 + }, + { + "epoch": 2.7841722802736966, + "grad_norm": 1.7504264116287231, + "learning_rate": 3.5971286621050586e-06, + "loss": 0.5293, + "step": 314940 + }, + { + "epoch": 2.7842606835340087, + "grad_norm": 2.522470474243164, + "learning_rate": 3.595655274433188e-06, + "loss": 0.5053, + "step": 314950 + }, + { + "epoch": 2.784349086794321, + "grad_norm": 5.1688385009765625, + "learning_rate": 3.594181886761317e-06, + "loss": 0.5874, + "step": 314960 + }, + { + "epoch": 2.7844374900546334, + "grad_norm": 2.033275604248047, + "learning_rate": 3.592708499089446e-06, + "loss": 0.5146, + "step": 314970 + }, + { + "epoch": 2.7845258933149455, + "grad_norm": 8.585947036743164, + "learning_rate": 3.5912351114175754e-06, + "loss": 0.5565, + "step": 314980 + }, + { + "epoch": 2.7846142965752576, + "grad_norm": 1.4980531930923462, + "learning_rate": 3.5897617237457054e-06, + "loss": 0.5224, + "step": 314990 + }, + { + "epoch": 2.78470269983557, + "grad_norm": 2.8766579627990723, + "learning_rate": 3.5882883360738346e-06, + "loss": 0.4601, + "step": 315000 + }, + { + "epoch": 2.7847911030958823, + "grad_norm": 3.115124225616455, + "learning_rate": 3.586814948401964e-06, + "loss": 0.6896, + "step": 315010 + }, + { + "epoch": 2.7848795063561944, + "grad_norm": 6.378185272216797, + "learning_rate": 3.5853415607300935e-06, + "loss": 0.4983, + "step": 315020 + }, + { + "epoch": 2.7849679096165065, + "grad_norm": 5.290971279144287, + "learning_rate": 3.5838681730582227e-06, + "loss": 0.6141, + "step": 315030 + }, + { + "epoch": 2.7850563128768187, + "grad_norm": 2.778111696243286, + "learning_rate": 3.582394785386352e-06, + "loss": 0.5296, + "step": 315040 + }, + { + "epoch": 2.785144716137131, + "grad_norm": 3.0261588096618652, + "learning_rate": 3.580921397714481e-06, + "loss": 0.5191, + "step": 315050 + }, + { + "epoch": 2.7852331193974433, + "grad_norm": 10.091453552246094, + "learning_rate": 3.5794480100426103e-06, + "loss": 0.5346, + "step": 315060 + }, + { + "epoch": 2.7853215226577555, + "grad_norm": 14.921371459960938, + "learning_rate": 3.5779746223707395e-06, + "loss": 0.4886, + "step": 315070 + }, + { + "epoch": 2.785409925918068, + "grad_norm": 9.065103530883789, + "learning_rate": 3.5765012346988695e-06, + "loss": 0.4201, + "step": 315080 + }, + { + "epoch": 2.78549832917838, + "grad_norm": 2.206791639328003, + "learning_rate": 3.5750278470269987e-06, + "loss": 0.4923, + "step": 315090 + }, + { + "epoch": 2.7855867324386923, + "grad_norm": 3.4465575218200684, + "learning_rate": 3.573554459355128e-06, + "loss": 0.4936, + "step": 315100 + }, + { + "epoch": 2.7856751356990044, + "grad_norm": 1.9652825593948364, + "learning_rate": 3.572081071683257e-06, + "loss": 0.6878, + "step": 315110 + }, + { + "epoch": 2.785763538959317, + "grad_norm": 1.6592928171157837, + "learning_rate": 3.5706076840113863e-06, + "loss": 0.4998, + "step": 315120 + }, + { + "epoch": 2.785851942219629, + "grad_norm": 1.8584434986114502, + "learning_rate": 3.569134296339516e-06, + "loss": 0.4141, + "step": 315130 + }, + { + "epoch": 2.785940345479941, + "grad_norm": 1.7030935287475586, + "learning_rate": 3.567660908667645e-06, + "loss": 0.4891, + "step": 315140 + }, + { + "epoch": 2.7860287487402537, + "grad_norm": 4.7447509765625, + "learning_rate": 3.5661875209957744e-06, + "loss": 0.5917, + "step": 315150 + }, + { + "epoch": 2.786117152000566, + "grad_norm": 1.9999200105667114, + "learning_rate": 3.5647141333239036e-06, + "loss": 0.5029, + "step": 315160 + }, + { + "epoch": 2.786205555260878, + "grad_norm": 5.038844108581543, + "learning_rate": 3.5632407456520328e-06, + "loss": 0.5025, + "step": 315170 + }, + { + "epoch": 2.78629395852119, + "grad_norm": 1.6309843063354492, + "learning_rate": 3.561767357980163e-06, + "loss": 0.5354, + "step": 315180 + }, + { + "epoch": 2.7863823617815022, + "grad_norm": 5.616239070892334, + "learning_rate": 3.560293970308292e-06, + "loss": 0.5225, + "step": 315190 + }, + { + "epoch": 2.786470765041815, + "grad_norm": 12.187385559082031, + "learning_rate": 3.558820582636421e-06, + "loss": 0.5134, + "step": 315200 + }, + { + "epoch": 2.786559168302127, + "grad_norm": 3.022188663482666, + "learning_rate": 3.5573471949645504e-06, + "loss": 0.4409, + "step": 315210 + }, + { + "epoch": 2.7866475715624395, + "grad_norm": 2.8706214427948, + "learning_rate": 3.5558738072926796e-06, + "loss": 0.4442, + "step": 315220 + }, + { + "epoch": 2.7867359748227516, + "grad_norm": 9.049537658691406, + "learning_rate": 3.554400419620809e-06, + "loss": 0.4605, + "step": 315230 + }, + { + "epoch": 2.7868243780830637, + "grad_norm": 1.4372684955596924, + "learning_rate": 3.5529270319489384e-06, + "loss": 0.567, + "step": 315240 + }, + { + "epoch": 2.786912781343376, + "grad_norm": 1.9576040506362915, + "learning_rate": 3.5514536442770676e-06, + "loss": 0.4575, + "step": 315250 + }, + { + "epoch": 2.787001184603688, + "grad_norm": 14.241401672363281, + "learning_rate": 3.549980256605197e-06, + "loss": 0.5547, + "step": 315260 + }, + { + "epoch": 2.7870895878640005, + "grad_norm": 9.769283294677734, + "learning_rate": 3.548506868933327e-06, + "loss": 0.6012, + "step": 315270 + }, + { + "epoch": 2.7871779911243126, + "grad_norm": 27.382343292236328, + "learning_rate": 3.547033481261456e-06, + "loss": 0.5994, + "step": 315280 + }, + { + "epoch": 2.7872663943846248, + "grad_norm": 14.368111610412598, + "learning_rate": 3.5455600935895853e-06, + "loss": 0.4907, + "step": 315290 + }, + { + "epoch": 2.7873547976449373, + "grad_norm": 3.8697962760925293, + "learning_rate": 3.5440867059177145e-06, + "loss": 0.5532, + "step": 315300 + }, + { + "epoch": 2.7874432009052494, + "grad_norm": 1.9240440130233765, + "learning_rate": 3.5426133182458437e-06, + "loss": 0.6021, + "step": 315310 + }, + { + "epoch": 2.7875316041655616, + "grad_norm": 0.9321041107177734, + "learning_rate": 3.541139930573973e-06, + "loss": 0.5048, + "step": 315320 + }, + { + "epoch": 2.7876200074258737, + "grad_norm": 6.301486015319824, + "learning_rate": 3.539666542902102e-06, + "loss": 0.4544, + "step": 315330 + }, + { + "epoch": 2.7877084106861862, + "grad_norm": 1.3316552639007568, + "learning_rate": 3.5381931552302317e-06, + "loss": 0.4343, + "step": 315340 + }, + { + "epoch": 2.7877968139464984, + "grad_norm": 2.830575942993164, + "learning_rate": 3.536719767558361e-06, + "loss": 0.6627, + "step": 315350 + }, + { + "epoch": 2.7878852172068105, + "grad_norm": 2.2694380283355713, + "learning_rate": 3.5352463798864905e-06, + "loss": 0.7868, + "step": 315360 + }, + { + "epoch": 2.787973620467123, + "grad_norm": 3.855942964553833, + "learning_rate": 3.53377299221462e-06, + "loss": 0.5386, + "step": 315370 + }, + { + "epoch": 2.788062023727435, + "grad_norm": 2.58528733253479, + "learning_rate": 3.5322996045427494e-06, + "loss": 0.547, + "step": 315380 + }, + { + "epoch": 2.7881504269877473, + "grad_norm": 8.249451637268066, + "learning_rate": 3.5308262168708786e-06, + "loss": 0.5252, + "step": 315390 + }, + { + "epoch": 2.7882388302480594, + "grad_norm": 2.664510488510132, + "learning_rate": 3.5293528291990078e-06, + "loss": 0.4652, + "step": 315400 + }, + { + "epoch": 2.7883272335083715, + "grad_norm": 2.7741477489471436, + "learning_rate": 3.527879441527137e-06, + "loss": 0.5688, + "step": 315410 + }, + { + "epoch": 2.788415636768684, + "grad_norm": 3.7868034839630127, + "learning_rate": 3.526406053855266e-06, + "loss": 0.4568, + "step": 315420 + }, + { + "epoch": 2.788504040028996, + "grad_norm": 5.020224571228027, + "learning_rate": 3.5249326661833954e-06, + "loss": 0.4998, + "step": 315430 + }, + { + "epoch": 2.7885924432893088, + "grad_norm": 1.4374889135360718, + "learning_rate": 3.5234592785115246e-06, + "loss": 0.4464, + "step": 315440 + }, + { + "epoch": 2.788680846549621, + "grad_norm": 4.824401378631592, + "learning_rate": 3.521985890839654e-06, + "loss": 0.4715, + "step": 315450 + }, + { + "epoch": 2.788769249809933, + "grad_norm": 1.415050745010376, + "learning_rate": 3.520512503167784e-06, + "loss": 0.5845, + "step": 315460 + }, + { + "epoch": 2.788857653070245, + "grad_norm": 2.807365655899048, + "learning_rate": 3.519039115495913e-06, + "loss": 0.4794, + "step": 315470 + }, + { + "epoch": 2.7889460563305573, + "grad_norm": 22.411500930786133, + "learning_rate": 3.5175657278240426e-06, + "loss": 0.5322, + "step": 315480 + }, + { + "epoch": 2.78903445959087, + "grad_norm": 4.122489929199219, + "learning_rate": 3.516092340152172e-06, + "loss": 0.5315, + "step": 315490 + }, + { + "epoch": 2.789122862851182, + "grad_norm": 1.3590798377990723, + "learning_rate": 3.514618952480301e-06, + "loss": 0.3651, + "step": 315500 + }, + { + "epoch": 2.789211266111494, + "grad_norm": 6.56985330581665, + "learning_rate": 3.5131455648084303e-06, + "loss": 0.5142, + "step": 315510 + }, + { + "epoch": 2.7892996693718066, + "grad_norm": 7.9228410720825195, + "learning_rate": 3.5116721771365595e-06, + "loss": 0.5949, + "step": 315520 + }, + { + "epoch": 2.7893880726321187, + "grad_norm": 1.5531790256500244, + "learning_rate": 3.5101987894646887e-06, + "loss": 0.4935, + "step": 315530 + }, + { + "epoch": 2.789476475892431, + "grad_norm": 1.947703242301941, + "learning_rate": 3.508725401792818e-06, + "loss": 0.4175, + "step": 315540 + }, + { + "epoch": 2.789564879152743, + "grad_norm": 4.345638275146484, + "learning_rate": 3.507252014120948e-06, + "loss": 0.5711, + "step": 315550 + }, + { + "epoch": 2.7896532824130555, + "grad_norm": 7.262941360473633, + "learning_rate": 3.505778626449077e-06, + "loss": 0.5898, + "step": 315560 + }, + { + "epoch": 2.7897416856733677, + "grad_norm": 10.144116401672363, + "learning_rate": 3.5043052387772063e-06, + "loss": 0.3618, + "step": 315570 + }, + { + "epoch": 2.78983008893368, + "grad_norm": 11.406083106994629, + "learning_rate": 3.5028318511053355e-06, + "loss": 0.5088, + "step": 315580 + }, + { + "epoch": 2.7899184921939923, + "grad_norm": 3.460592269897461, + "learning_rate": 3.501358463433465e-06, + "loss": 0.492, + "step": 315590 + }, + { + "epoch": 2.7900068954543045, + "grad_norm": 2.385711193084717, + "learning_rate": 3.4998850757615943e-06, + "loss": 0.5609, + "step": 315600 + }, + { + "epoch": 2.7900952987146166, + "grad_norm": 18.359283447265625, + "learning_rate": 3.4984116880897235e-06, + "loss": 0.5788, + "step": 315610 + }, + { + "epoch": 2.7901837019749287, + "grad_norm": 0.6721057891845703, + "learning_rate": 3.4969383004178527e-06, + "loss": 0.5594, + "step": 315620 + }, + { + "epoch": 2.790272105235241, + "grad_norm": 1.2047542333602905, + "learning_rate": 3.495464912745982e-06, + "loss": 0.6547, + "step": 315630 + }, + { + "epoch": 2.7903605084955534, + "grad_norm": 2.357652425765991, + "learning_rate": 3.493991525074112e-06, + "loss": 0.6346, + "step": 315640 + }, + { + "epoch": 2.7904489117558655, + "grad_norm": 4.811675548553467, + "learning_rate": 3.492518137402241e-06, + "loss": 0.3982, + "step": 315650 + }, + { + "epoch": 2.7905373150161776, + "grad_norm": 2.223862886428833, + "learning_rate": 3.4910447497303704e-06, + "loss": 0.5753, + "step": 315660 + }, + { + "epoch": 2.79062571827649, + "grad_norm": 7.684101104736328, + "learning_rate": 3.4895713620584996e-06, + "loss": 0.5331, + "step": 315670 + }, + { + "epoch": 2.7907141215368023, + "grad_norm": 5.19871187210083, + "learning_rate": 3.4880979743866288e-06, + "loss": 0.5187, + "step": 315680 + }, + { + "epoch": 2.7908025247971144, + "grad_norm": 2.7575316429138184, + "learning_rate": 3.486624586714758e-06, + "loss": 0.4798, + "step": 315690 + }, + { + "epoch": 2.7908909280574266, + "grad_norm": 3.566830635070801, + "learning_rate": 3.4851511990428876e-06, + "loss": 0.5231, + "step": 315700 + }, + { + "epoch": 2.790979331317739, + "grad_norm": 3.455552339553833, + "learning_rate": 3.483677811371017e-06, + "loss": 0.3846, + "step": 315710 + }, + { + "epoch": 2.7910677345780512, + "grad_norm": 2.898930788040161, + "learning_rate": 3.482204423699146e-06, + "loss": 0.6001, + "step": 315720 + }, + { + "epoch": 2.7911561378383634, + "grad_norm": 2.0450007915496826, + "learning_rate": 3.4807310360272752e-06, + "loss": 0.6715, + "step": 315730 + }, + { + "epoch": 2.791244541098676, + "grad_norm": 1.2463994026184082, + "learning_rate": 3.4792576483554053e-06, + "loss": 0.4761, + "step": 315740 + }, + { + "epoch": 2.791332944358988, + "grad_norm": 1.6303141117095947, + "learning_rate": 3.4777842606835345e-06, + "loss": 0.5241, + "step": 315750 + }, + { + "epoch": 2.7914213476193, + "grad_norm": 12.755999565124512, + "learning_rate": 3.4763108730116637e-06, + "loss": 0.4896, + "step": 315760 + }, + { + "epoch": 2.7915097508796123, + "grad_norm": 3.331711769104004, + "learning_rate": 3.474837485339793e-06, + "loss": 0.391, + "step": 315770 + }, + { + "epoch": 2.7915981541399244, + "grad_norm": 7.006025791168213, + "learning_rate": 3.473364097667922e-06, + "loss": 0.6631, + "step": 315780 + }, + { + "epoch": 2.791686557400237, + "grad_norm": 1.881930947303772, + "learning_rate": 3.4718907099960513e-06, + "loss": 0.4995, + "step": 315790 + }, + { + "epoch": 2.791774960660549, + "grad_norm": 5.827757358551025, + "learning_rate": 3.4704173223241805e-06, + "loss": 0.6473, + "step": 315800 + }, + { + "epoch": 2.7918633639208617, + "grad_norm": 1.9754022359848022, + "learning_rate": 3.46894393465231e-06, + "loss": 0.4105, + "step": 315810 + }, + { + "epoch": 2.7919517671811738, + "grad_norm": 6.273451805114746, + "learning_rate": 3.4674705469804393e-06, + "loss": 0.5379, + "step": 315820 + }, + { + "epoch": 2.792040170441486, + "grad_norm": 7.113163471221924, + "learning_rate": 3.465997159308569e-06, + "loss": 0.5905, + "step": 315830 + }, + { + "epoch": 2.792128573701798, + "grad_norm": 9.949630737304688, + "learning_rate": 3.4645237716366985e-06, + "loss": 0.5833, + "step": 315840 + }, + { + "epoch": 2.79221697696211, + "grad_norm": 0.7881326675415039, + "learning_rate": 3.4630503839648277e-06, + "loss": 0.5248, + "step": 315850 + }, + { + "epoch": 2.7923053802224227, + "grad_norm": 4.36805534362793, + "learning_rate": 3.461576996292957e-06, + "loss": 0.4878, + "step": 315860 + }, + { + "epoch": 2.792393783482735, + "grad_norm": 4.341488838195801, + "learning_rate": 3.460103608621086e-06, + "loss": 0.5421, + "step": 315870 + }, + { + "epoch": 2.792482186743047, + "grad_norm": 6.827460765838623, + "learning_rate": 3.4586302209492153e-06, + "loss": 0.4992, + "step": 315880 + }, + { + "epoch": 2.7925705900033595, + "grad_norm": 3.4273860454559326, + "learning_rate": 3.4571568332773445e-06, + "loss": 0.5828, + "step": 315890 + }, + { + "epoch": 2.7926589932636716, + "grad_norm": 4.405358791351318, + "learning_rate": 3.4556834456054737e-06, + "loss": 0.5295, + "step": 315900 + }, + { + "epoch": 2.7927473965239837, + "grad_norm": 2.8594918251037598, + "learning_rate": 3.454210057933603e-06, + "loss": 0.5727, + "step": 315910 + }, + { + "epoch": 2.792835799784296, + "grad_norm": 6.616241931915283, + "learning_rate": 3.452736670261733e-06, + "loss": 0.6786, + "step": 315920 + }, + { + "epoch": 2.7929242030446084, + "grad_norm": 1.90521240234375, + "learning_rate": 3.451263282589862e-06, + "loss": 0.4917, + "step": 315930 + }, + { + "epoch": 2.7930126063049205, + "grad_norm": 3.6010022163391113, + "learning_rate": 3.4497898949179914e-06, + "loss": 0.5979, + "step": 315940 + }, + { + "epoch": 2.7931010095652327, + "grad_norm": 1.911860466003418, + "learning_rate": 3.448316507246121e-06, + "loss": 0.4746, + "step": 315950 + }, + { + "epoch": 2.7931894128255452, + "grad_norm": 2.4402472972869873, + "learning_rate": 3.4468431195742502e-06, + "loss": 0.5011, + "step": 315960 + }, + { + "epoch": 2.7932778160858573, + "grad_norm": 1.9842174053192139, + "learning_rate": 3.4453697319023794e-06, + "loss": 0.459, + "step": 315970 + }, + { + "epoch": 2.7933662193461695, + "grad_norm": 1.417262315750122, + "learning_rate": 3.4438963442305086e-06, + "loss": 0.5338, + "step": 315980 + }, + { + "epoch": 2.7934546226064816, + "grad_norm": 1.1330608129501343, + "learning_rate": 3.442422956558638e-06, + "loss": 0.4063, + "step": 315990 + }, + { + "epoch": 2.7935430258667937, + "grad_norm": 3.597907781600952, + "learning_rate": 3.440949568886767e-06, + "loss": 0.5889, + "step": 316000 + }, + { + "epoch": 2.7936314291271063, + "grad_norm": 9.780816078186035, + "learning_rate": 3.4394761812148962e-06, + "loss": 0.5619, + "step": 316010 + }, + { + "epoch": 2.7937198323874184, + "grad_norm": 3.4249701499938965, + "learning_rate": 3.4380027935430263e-06, + "loss": 0.4824, + "step": 316020 + }, + { + "epoch": 2.793808235647731, + "grad_norm": 8.41297721862793, + "learning_rate": 3.4365294058711555e-06, + "loss": 0.5377, + "step": 316030 + }, + { + "epoch": 2.793896638908043, + "grad_norm": 5.747102737426758, + "learning_rate": 3.4350560181992847e-06, + "loss": 0.5321, + "step": 316040 + }, + { + "epoch": 2.793985042168355, + "grad_norm": 4.7774248123168945, + "learning_rate": 3.433582630527414e-06, + "loss": 0.552, + "step": 316050 + }, + { + "epoch": 2.7940734454286673, + "grad_norm": 11.267451286315918, + "learning_rate": 3.4321092428555435e-06, + "loss": 0.4907, + "step": 316060 + }, + { + "epoch": 2.7941618486889794, + "grad_norm": 6.276189804077148, + "learning_rate": 3.4306358551836727e-06, + "loss": 0.6249, + "step": 316070 + }, + { + "epoch": 2.794250251949292, + "grad_norm": 4.563801288604736, + "learning_rate": 3.429162467511802e-06, + "loss": 0.5989, + "step": 316080 + }, + { + "epoch": 2.794338655209604, + "grad_norm": 7.289098739624023, + "learning_rate": 3.427689079839931e-06, + "loss": 0.506, + "step": 316090 + }, + { + "epoch": 2.7944270584699162, + "grad_norm": 2.8097290992736816, + "learning_rate": 3.4262156921680603e-06, + "loss": 0.4677, + "step": 316100 + }, + { + "epoch": 2.794515461730229, + "grad_norm": 4.082925796508789, + "learning_rate": 3.4247423044961904e-06, + "loss": 0.4404, + "step": 316110 + }, + { + "epoch": 2.794603864990541, + "grad_norm": 15.366942405700684, + "learning_rate": 3.4232689168243196e-06, + "loss": 0.5448, + "step": 316120 + }, + { + "epoch": 2.794692268250853, + "grad_norm": 3.604694366455078, + "learning_rate": 3.4217955291524488e-06, + "loss": 0.4713, + "step": 316130 + }, + { + "epoch": 2.794780671511165, + "grad_norm": 6.8090500831604, + "learning_rate": 3.420322141480578e-06, + "loss": 0.4682, + "step": 316140 + }, + { + "epoch": 2.7948690747714777, + "grad_norm": 4.504559516906738, + "learning_rate": 3.418848753808707e-06, + "loss": 0.4965, + "step": 316150 + }, + { + "epoch": 2.79495747803179, + "grad_norm": 7.475257873535156, + "learning_rate": 3.4173753661368364e-06, + "loss": 0.6206, + "step": 316160 + }, + { + "epoch": 2.795045881292102, + "grad_norm": 5.156727313995361, + "learning_rate": 3.415901978464966e-06, + "loss": 0.6004, + "step": 316170 + }, + { + "epoch": 2.7951342845524145, + "grad_norm": 23.317222595214844, + "learning_rate": 3.414428590793095e-06, + "loss": 0.4937, + "step": 316180 + }, + { + "epoch": 2.7952226878127266, + "grad_norm": 3.496389389038086, + "learning_rate": 3.4129552031212244e-06, + "loss": 0.4919, + "step": 316190 + }, + { + "epoch": 2.7953110910730388, + "grad_norm": 2.1733217239379883, + "learning_rate": 3.4114818154493544e-06, + "loss": 0.4436, + "step": 316200 + }, + { + "epoch": 2.795399494333351, + "grad_norm": 2.097191333770752, + "learning_rate": 3.4100084277774836e-06, + "loss": 0.5101, + "step": 316210 + }, + { + "epoch": 2.795487897593663, + "grad_norm": 3.5893635749816895, + "learning_rate": 3.408535040105613e-06, + "loss": 0.4796, + "step": 316220 + }, + { + "epoch": 2.7955763008539756, + "grad_norm": 2.4396169185638428, + "learning_rate": 3.407061652433742e-06, + "loss": 0.4752, + "step": 316230 + }, + { + "epoch": 2.7956647041142877, + "grad_norm": 15.608474731445312, + "learning_rate": 3.4055882647618712e-06, + "loss": 0.508, + "step": 316240 + }, + { + "epoch": 2.7957531073746, + "grad_norm": 1.4705015420913696, + "learning_rate": 3.4041148770900004e-06, + "loss": 0.6162, + "step": 316250 + }, + { + "epoch": 2.7958415106349124, + "grad_norm": 2.2263405323028564, + "learning_rate": 3.4026414894181296e-06, + "loss": 0.4584, + "step": 316260 + }, + { + "epoch": 2.7959299138952245, + "grad_norm": 4.9143781661987305, + "learning_rate": 3.401168101746259e-06, + "loss": 0.5705, + "step": 316270 + }, + { + "epoch": 2.7960183171555366, + "grad_norm": 3.5259768962860107, + "learning_rate": 3.3996947140743885e-06, + "loss": 0.6306, + "step": 316280 + }, + { + "epoch": 2.7961067204158487, + "grad_norm": 2.4419898986816406, + "learning_rate": 3.3982213264025177e-06, + "loss": 0.527, + "step": 316290 + }, + { + "epoch": 2.7961951236761613, + "grad_norm": 3.584538221359253, + "learning_rate": 3.3967479387306473e-06, + "loss": 0.5764, + "step": 316300 + }, + { + "epoch": 2.7962835269364734, + "grad_norm": 5.626967430114746, + "learning_rate": 3.395274551058777e-06, + "loss": 0.5698, + "step": 316310 + }, + { + "epoch": 2.7963719301967855, + "grad_norm": 6.198531627655029, + "learning_rate": 3.393801163386906e-06, + "loss": 0.5507, + "step": 316320 + }, + { + "epoch": 2.796460333457098, + "grad_norm": 1.3613569736480713, + "learning_rate": 3.3923277757150353e-06, + "loss": 0.6327, + "step": 316330 + }, + { + "epoch": 2.79654873671741, + "grad_norm": 3.3473286628723145, + "learning_rate": 3.3908543880431645e-06, + "loss": 0.5926, + "step": 316340 + }, + { + "epoch": 2.7966371399777223, + "grad_norm": 11.495025634765625, + "learning_rate": 3.3893810003712937e-06, + "loss": 0.5651, + "step": 316350 + }, + { + "epoch": 2.7967255432380345, + "grad_norm": 1.827826738357544, + "learning_rate": 3.387907612699423e-06, + "loss": 0.4708, + "step": 316360 + }, + { + "epoch": 2.796813946498347, + "grad_norm": 0.8324751853942871, + "learning_rate": 3.386434225027552e-06, + "loss": 0.507, + "step": 316370 + }, + { + "epoch": 2.796902349758659, + "grad_norm": 4.098889350891113, + "learning_rate": 3.3849608373556813e-06, + "loss": 0.6492, + "step": 316380 + }, + { + "epoch": 2.7969907530189713, + "grad_norm": 5.06341552734375, + "learning_rate": 3.3834874496838114e-06, + "loss": 0.5984, + "step": 316390 + }, + { + "epoch": 2.797079156279284, + "grad_norm": 1.4121606349945068, + "learning_rate": 3.3820140620119406e-06, + "loss": 0.5783, + "step": 316400 + }, + { + "epoch": 2.797167559539596, + "grad_norm": 5.909835338592529, + "learning_rate": 3.38054067434007e-06, + "loss": 0.5264, + "step": 316410 + }, + { + "epoch": 2.797255962799908, + "grad_norm": 2.0341577529907227, + "learning_rate": 3.3790672866681994e-06, + "loss": 0.5221, + "step": 316420 + }, + { + "epoch": 2.79734436606022, + "grad_norm": 29.817270278930664, + "learning_rate": 3.3775938989963286e-06, + "loss": 0.5154, + "step": 316430 + }, + { + "epoch": 2.7974327693205323, + "grad_norm": 4.186281204223633, + "learning_rate": 3.376120511324458e-06, + "loss": 0.4586, + "step": 316440 + }, + { + "epoch": 2.797521172580845, + "grad_norm": 8.473283767700195, + "learning_rate": 3.374647123652587e-06, + "loss": 0.5251, + "step": 316450 + }, + { + "epoch": 2.797609575841157, + "grad_norm": 1.9577298164367676, + "learning_rate": 3.373173735980716e-06, + "loss": 0.5124, + "step": 316460 + }, + { + "epoch": 2.797697979101469, + "grad_norm": 4.581801891326904, + "learning_rate": 3.3717003483088454e-06, + "loss": 0.5146, + "step": 316470 + }, + { + "epoch": 2.7977863823617817, + "grad_norm": 11.704327583312988, + "learning_rate": 3.3702269606369755e-06, + "loss": 0.3649, + "step": 316480 + }, + { + "epoch": 2.797874785622094, + "grad_norm": 2.907139539718628, + "learning_rate": 3.3687535729651047e-06, + "loss": 0.3697, + "step": 316490 + }, + { + "epoch": 2.797963188882406, + "grad_norm": 14.64498233795166, + "learning_rate": 3.367280185293234e-06, + "loss": 0.5954, + "step": 316500 + }, + { + "epoch": 2.798051592142718, + "grad_norm": 2.8214335441589355, + "learning_rate": 3.365806797621363e-06, + "loss": 0.4209, + "step": 316510 + }, + { + "epoch": 2.7981399954030306, + "grad_norm": 2.229665517807007, + "learning_rate": 3.3643334099494927e-06, + "loss": 0.7479, + "step": 316520 + }, + { + "epoch": 2.7982283986633427, + "grad_norm": 4.745194435119629, + "learning_rate": 3.362860022277622e-06, + "loss": 0.4393, + "step": 316530 + }, + { + "epoch": 2.798316801923655, + "grad_norm": 3.56323504447937, + "learning_rate": 3.361386634605751e-06, + "loss": 0.4264, + "step": 316540 + }, + { + "epoch": 2.7984052051839674, + "grad_norm": 1.326399803161621, + "learning_rate": 3.3599132469338803e-06, + "loss": 0.3723, + "step": 316550 + }, + { + "epoch": 2.7984936084442795, + "grad_norm": 2.4960999488830566, + "learning_rate": 3.3584398592620095e-06, + "loss": 0.5155, + "step": 316560 + }, + { + "epoch": 2.7985820117045916, + "grad_norm": 4.969238758087158, + "learning_rate": 3.3569664715901387e-06, + "loss": 0.5765, + "step": 316570 + }, + { + "epoch": 2.7986704149649038, + "grad_norm": 3.0714259147644043, + "learning_rate": 3.3554930839182687e-06, + "loss": 0.4659, + "step": 316580 + }, + { + "epoch": 2.798758818225216, + "grad_norm": 7.677590847015381, + "learning_rate": 3.354019696246398e-06, + "loss": 0.4658, + "step": 316590 + }, + { + "epoch": 2.7988472214855284, + "grad_norm": 14.531184196472168, + "learning_rate": 3.352546308574527e-06, + "loss": 0.4429, + "step": 316600 + }, + { + "epoch": 2.7989356247458406, + "grad_norm": 4.265249729156494, + "learning_rate": 3.3510729209026563e-06, + "loss": 0.6652, + "step": 316610 + }, + { + "epoch": 2.799024028006153, + "grad_norm": 6.806464195251465, + "learning_rate": 3.3495995332307855e-06, + "loss": 0.4394, + "step": 316620 + }, + { + "epoch": 2.7991124312664653, + "grad_norm": 3.8701651096343994, + "learning_rate": 3.348126145558915e-06, + "loss": 0.5671, + "step": 316630 + }, + { + "epoch": 2.7992008345267774, + "grad_norm": 5.114289283752441, + "learning_rate": 3.3466527578870444e-06, + "loss": 0.5602, + "step": 316640 + }, + { + "epoch": 2.7992892377870895, + "grad_norm": 2.6029398441314697, + "learning_rate": 3.3451793702151736e-06, + "loss": 0.6035, + "step": 316650 + }, + { + "epoch": 2.7993776410474016, + "grad_norm": 7.183295249938965, + "learning_rate": 3.3437059825433028e-06, + "loss": 0.5304, + "step": 316660 + }, + { + "epoch": 2.799466044307714, + "grad_norm": 3.8018765449523926, + "learning_rate": 3.342232594871433e-06, + "loss": 0.4887, + "step": 316670 + }, + { + "epoch": 2.7995544475680263, + "grad_norm": 5.042635917663574, + "learning_rate": 3.340759207199562e-06, + "loss": 0.4578, + "step": 316680 + }, + { + "epoch": 2.7996428508283384, + "grad_norm": 2.366415023803711, + "learning_rate": 3.3392858195276912e-06, + "loss": 0.4925, + "step": 316690 + }, + { + "epoch": 2.799731254088651, + "grad_norm": 4.675685405731201, + "learning_rate": 3.3378124318558204e-06, + "loss": 0.4854, + "step": 316700 + }, + { + "epoch": 2.799819657348963, + "grad_norm": 3.097524404525757, + "learning_rate": 3.3363390441839496e-06, + "loss": 0.5405, + "step": 316710 + }, + { + "epoch": 2.799908060609275, + "grad_norm": 1.3045450448989868, + "learning_rate": 3.334865656512079e-06, + "loss": 0.435, + "step": 316720 + }, + { + "epoch": 2.7999964638695873, + "grad_norm": 8.550625801086426, + "learning_rate": 3.333392268840208e-06, + "loss": 0.6445, + "step": 316730 + }, + { + "epoch": 2.8000848671299, + "grad_norm": 22.851213455200195, + "learning_rate": 3.3319188811683376e-06, + "loss": 0.5738, + "step": 316740 + }, + { + "epoch": 2.800173270390212, + "grad_norm": 2.1186325550079346, + "learning_rate": 3.330445493496467e-06, + "loss": 0.5707, + "step": 316750 + }, + { + "epoch": 2.800261673650524, + "grad_norm": 2.0649573802948, + "learning_rate": 3.3289721058245965e-06, + "loss": 0.4528, + "step": 316760 + }, + { + "epoch": 2.8003500769108367, + "grad_norm": 2.785313606262207, + "learning_rate": 3.327498718152726e-06, + "loss": 0.4528, + "step": 316770 + }, + { + "epoch": 2.800438480171149, + "grad_norm": 4.223390579223633, + "learning_rate": 3.3260253304808553e-06, + "loss": 0.5716, + "step": 316780 + }, + { + "epoch": 2.800526883431461, + "grad_norm": 8.758340835571289, + "learning_rate": 3.3245519428089845e-06, + "loss": 0.5799, + "step": 316790 + }, + { + "epoch": 2.800615286691773, + "grad_norm": 2.9694573879241943, + "learning_rate": 3.3230785551371137e-06, + "loss": 0.5855, + "step": 316800 + }, + { + "epoch": 2.800703689952085, + "grad_norm": 2.77433443069458, + "learning_rate": 3.321605167465243e-06, + "loss": 0.4619, + "step": 316810 + }, + { + "epoch": 2.8007920932123977, + "grad_norm": 1.9937633275985718, + "learning_rate": 3.320131779793372e-06, + "loss": 0.5445, + "step": 316820 + }, + { + "epoch": 2.80088049647271, + "grad_norm": 2.5312106609344482, + "learning_rate": 3.3186583921215013e-06, + "loss": 0.4643, + "step": 316830 + }, + { + "epoch": 2.800968899733022, + "grad_norm": 6.294669151306152, + "learning_rate": 3.3171850044496305e-06, + "loss": 0.5811, + "step": 316840 + }, + { + "epoch": 2.8010573029933346, + "grad_norm": 2.081190347671509, + "learning_rate": 3.31571161677776e-06, + "loss": 0.5362, + "step": 316850 + }, + { + "epoch": 2.8011457062536467, + "grad_norm": 4.318580627441406, + "learning_rate": 3.3142382291058898e-06, + "loss": 0.5174, + "step": 316860 + }, + { + "epoch": 2.801234109513959, + "grad_norm": 4.193192958831787, + "learning_rate": 3.312764841434019e-06, + "loss": 0.4231, + "step": 316870 + }, + { + "epoch": 2.801322512774271, + "grad_norm": 2.072411060333252, + "learning_rate": 3.3112914537621486e-06, + "loss": 0.483, + "step": 316880 + }, + { + "epoch": 2.8014109160345835, + "grad_norm": 5.717396259307861, + "learning_rate": 3.3098180660902778e-06, + "loss": 0.4958, + "step": 316890 + }, + { + "epoch": 2.8014993192948956, + "grad_norm": 7.502676486968994, + "learning_rate": 3.308344678418407e-06, + "loss": 0.4802, + "step": 316900 + }, + { + "epoch": 2.8015877225552077, + "grad_norm": 4.701310157775879, + "learning_rate": 3.306871290746536e-06, + "loss": 0.401, + "step": 316910 + }, + { + "epoch": 2.8016761258155203, + "grad_norm": 6.088459491729736, + "learning_rate": 3.3053979030746654e-06, + "loss": 0.54, + "step": 316920 + }, + { + "epoch": 2.8017645290758324, + "grad_norm": 19.008962631225586, + "learning_rate": 3.3039245154027946e-06, + "loss": 0.4294, + "step": 316930 + }, + { + "epoch": 2.8018529323361445, + "grad_norm": 4.736425876617432, + "learning_rate": 3.3024511277309238e-06, + "loss": 0.4678, + "step": 316940 + }, + { + "epoch": 2.8019413355964566, + "grad_norm": 3.8191049098968506, + "learning_rate": 3.300977740059054e-06, + "loss": 0.541, + "step": 316950 + }, + { + "epoch": 2.802029738856769, + "grad_norm": 3.996642589569092, + "learning_rate": 3.299504352387183e-06, + "loss": 0.4686, + "step": 316960 + }, + { + "epoch": 2.8021181421170813, + "grad_norm": 5.43536901473999, + "learning_rate": 3.2980309647153122e-06, + "loss": 0.4794, + "step": 316970 + }, + { + "epoch": 2.8022065453773934, + "grad_norm": 1.035733938217163, + "learning_rate": 3.2965575770434414e-06, + "loss": 0.4062, + "step": 316980 + }, + { + "epoch": 2.802294948637706, + "grad_norm": 4.179128170013428, + "learning_rate": 3.295084189371571e-06, + "loss": 0.5797, + "step": 316990 + }, + { + "epoch": 2.802383351898018, + "grad_norm": 6.245996952056885, + "learning_rate": 3.2936108016997003e-06, + "loss": 0.4562, + "step": 317000 + }, + { + "epoch": 2.8024717551583302, + "grad_norm": 0.8434953093528748, + "learning_rate": 3.2921374140278295e-06, + "loss": 0.5728, + "step": 317010 + }, + { + "epoch": 2.8025601584186424, + "grad_norm": 2.6545403003692627, + "learning_rate": 3.2906640263559587e-06, + "loss": 0.6006, + "step": 317020 + }, + { + "epoch": 2.8026485616789545, + "grad_norm": 6.36753511428833, + "learning_rate": 3.289190638684088e-06, + "loss": 0.5765, + "step": 317030 + }, + { + "epoch": 2.802736964939267, + "grad_norm": 2.3544185161590576, + "learning_rate": 3.287717251012217e-06, + "loss": 0.6383, + "step": 317040 + }, + { + "epoch": 2.802825368199579, + "grad_norm": 2.414541721343994, + "learning_rate": 3.286243863340347e-06, + "loss": 0.4955, + "step": 317050 + }, + { + "epoch": 2.8029137714598913, + "grad_norm": 4.283695220947266, + "learning_rate": 3.2847704756684763e-06, + "loss": 0.4806, + "step": 317060 + }, + { + "epoch": 2.803002174720204, + "grad_norm": 5.096637725830078, + "learning_rate": 3.2832970879966055e-06, + "loss": 0.4805, + "step": 317070 + }, + { + "epoch": 2.803090577980516, + "grad_norm": 5.0474677085876465, + "learning_rate": 3.2818237003247347e-06, + "loss": 0.5414, + "step": 317080 + }, + { + "epoch": 2.803178981240828, + "grad_norm": 2.161116123199463, + "learning_rate": 3.280350312652864e-06, + "loss": 0.5621, + "step": 317090 + }, + { + "epoch": 2.80326738450114, + "grad_norm": 9.718421936035156, + "learning_rate": 3.2788769249809935e-06, + "loss": 0.5023, + "step": 317100 + }, + { + "epoch": 2.8033557877614528, + "grad_norm": 1.4252122640609741, + "learning_rate": 3.2774035373091227e-06, + "loss": 0.5464, + "step": 317110 + }, + { + "epoch": 2.803444191021765, + "grad_norm": 4.159720420837402, + "learning_rate": 3.275930149637252e-06, + "loss": 0.4501, + "step": 317120 + }, + { + "epoch": 2.803532594282077, + "grad_norm": 7.504578590393066, + "learning_rate": 3.274456761965381e-06, + "loss": 0.5712, + "step": 317130 + }, + { + "epoch": 2.8036209975423896, + "grad_norm": 5.193378448486328, + "learning_rate": 3.272983374293511e-06, + "loss": 0.6168, + "step": 317140 + }, + { + "epoch": 2.8037094008027017, + "grad_norm": 2.400256872177124, + "learning_rate": 3.2715099866216404e-06, + "loss": 0.327, + "step": 317150 + }, + { + "epoch": 2.803797804063014, + "grad_norm": 2.8662333488464355, + "learning_rate": 3.2700365989497696e-06, + "loss": 0.4929, + "step": 317160 + }, + { + "epoch": 2.803886207323326, + "grad_norm": 6.8134765625, + "learning_rate": 3.268563211277899e-06, + "loss": 0.5261, + "step": 317170 + }, + { + "epoch": 2.803974610583638, + "grad_norm": 1.7591893672943115, + "learning_rate": 3.267089823606028e-06, + "loss": 0.5509, + "step": 317180 + }, + { + "epoch": 2.8040630138439506, + "grad_norm": 1.9095944166183472, + "learning_rate": 3.265616435934157e-06, + "loss": 0.508, + "step": 317190 + }, + { + "epoch": 2.8041514171042627, + "grad_norm": 1.9152454137802124, + "learning_rate": 3.2641430482622864e-06, + "loss": 0.4784, + "step": 317200 + }, + { + "epoch": 2.8042398203645753, + "grad_norm": 3.3404035568237305, + "learning_rate": 3.262669660590416e-06, + "loss": 0.5187, + "step": 317210 + }, + { + "epoch": 2.8043282236248874, + "grad_norm": 6.612401008605957, + "learning_rate": 3.2611962729185452e-06, + "loss": 0.5639, + "step": 317220 + }, + { + "epoch": 2.8044166268851995, + "grad_norm": 6.979229927062988, + "learning_rate": 3.259722885246675e-06, + "loss": 0.4266, + "step": 317230 + }, + { + "epoch": 2.8045050301455117, + "grad_norm": 3.7207529544830322, + "learning_rate": 3.2582494975748045e-06, + "loss": 0.5068, + "step": 317240 + }, + { + "epoch": 2.804593433405824, + "grad_norm": 7.120367527008057, + "learning_rate": 3.2567761099029337e-06, + "loss": 0.5451, + "step": 317250 + }, + { + "epoch": 2.8046818366661364, + "grad_norm": 1.345583200454712, + "learning_rate": 3.255302722231063e-06, + "loss": 0.5108, + "step": 317260 + }, + { + "epoch": 2.8047702399264485, + "grad_norm": 3.6579792499542236, + "learning_rate": 3.253829334559192e-06, + "loss": 0.5074, + "step": 317270 + }, + { + "epoch": 2.8048586431867606, + "grad_norm": 6.121996879577637, + "learning_rate": 3.2523559468873213e-06, + "loss": 0.5591, + "step": 317280 + }, + { + "epoch": 2.804947046447073, + "grad_norm": 6.628709316253662, + "learning_rate": 3.2508825592154505e-06, + "loss": 0.4981, + "step": 317290 + }, + { + "epoch": 2.8050354497073853, + "grad_norm": 5.491447448730469, + "learning_rate": 3.2494091715435797e-06, + "loss": 0.4741, + "step": 317300 + }, + { + "epoch": 2.8051238529676974, + "grad_norm": 1.134204387664795, + "learning_rate": 3.247935783871709e-06, + "loss": 0.4427, + "step": 317310 + }, + { + "epoch": 2.8052122562280095, + "grad_norm": 3.9976818561553955, + "learning_rate": 3.2464623961998385e-06, + "loss": 0.528, + "step": 317320 + }, + { + "epoch": 2.805300659488322, + "grad_norm": 1.056890606880188, + "learning_rate": 3.244989008527968e-06, + "loss": 0.4443, + "step": 317330 + }, + { + "epoch": 2.805389062748634, + "grad_norm": 2.4385509490966797, + "learning_rate": 3.2435156208560973e-06, + "loss": 0.5021, + "step": 317340 + }, + { + "epoch": 2.8054774660089463, + "grad_norm": 3.9052770137786865, + "learning_rate": 3.242042233184227e-06, + "loss": 0.464, + "step": 317350 + }, + { + "epoch": 2.805565869269259, + "grad_norm": 3.243152618408203, + "learning_rate": 3.240568845512356e-06, + "loss": 0.4332, + "step": 317360 + }, + { + "epoch": 2.805654272529571, + "grad_norm": 2.302799940109253, + "learning_rate": 3.2390954578404854e-06, + "loss": 0.4675, + "step": 317370 + }, + { + "epoch": 2.805742675789883, + "grad_norm": 7.455169677734375, + "learning_rate": 3.2376220701686146e-06, + "loss": 0.568, + "step": 317380 + }, + { + "epoch": 2.8058310790501952, + "grad_norm": 5.4449567794799805, + "learning_rate": 3.2361486824967438e-06, + "loss": 0.4106, + "step": 317390 + }, + { + "epoch": 2.8059194823105074, + "grad_norm": 4.617025375366211, + "learning_rate": 3.234675294824873e-06, + "loss": 0.5033, + "step": 317400 + }, + { + "epoch": 2.80600788557082, + "grad_norm": 0.9714881181716919, + "learning_rate": 3.233201907153002e-06, + "loss": 0.495, + "step": 317410 + }, + { + "epoch": 2.806096288831132, + "grad_norm": 6.989334583282471, + "learning_rate": 3.231728519481132e-06, + "loss": 0.4907, + "step": 317420 + }, + { + "epoch": 2.806184692091444, + "grad_norm": 7.635326862335205, + "learning_rate": 3.2302551318092614e-06, + "loss": 0.4793, + "step": 317430 + }, + { + "epoch": 2.8062730953517567, + "grad_norm": 30.75075912475586, + "learning_rate": 3.2287817441373906e-06, + "loss": 0.4891, + "step": 317440 + }, + { + "epoch": 2.806361498612069, + "grad_norm": 2.9454832077026367, + "learning_rate": 3.22730835646552e-06, + "loss": 0.4966, + "step": 317450 + }, + { + "epoch": 2.806449901872381, + "grad_norm": 1.8423027992248535, + "learning_rate": 3.2258349687936494e-06, + "loss": 0.5899, + "step": 317460 + }, + { + "epoch": 2.806538305132693, + "grad_norm": 3.6583192348480225, + "learning_rate": 3.2243615811217786e-06, + "loss": 0.6076, + "step": 317470 + }, + { + "epoch": 2.8066267083930057, + "grad_norm": 6.313131809234619, + "learning_rate": 3.222888193449908e-06, + "loss": 0.5687, + "step": 317480 + }, + { + "epoch": 2.8067151116533178, + "grad_norm": 1.48883056640625, + "learning_rate": 3.221414805778037e-06, + "loss": 0.4823, + "step": 317490 + }, + { + "epoch": 2.80680351491363, + "grad_norm": 10.509977340698242, + "learning_rate": 3.2199414181061662e-06, + "loss": 0.5223, + "step": 317500 + }, + { + "epoch": 2.8068919181739425, + "grad_norm": 8.709662437438965, + "learning_rate": 3.2184680304342963e-06, + "loss": 0.6578, + "step": 317510 + }, + { + "epoch": 2.8069803214342546, + "grad_norm": 4.655099868774414, + "learning_rate": 3.2169946427624255e-06, + "loss": 0.417, + "step": 317520 + }, + { + "epoch": 2.8070687246945667, + "grad_norm": 6.736572742462158, + "learning_rate": 3.2155212550905547e-06, + "loss": 0.4288, + "step": 317530 + }, + { + "epoch": 2.807157127954879, + "grad_norm": 1.7427921295166016, + "learning_rate": 3.214047867418684e-06, + "loss": 0.4999, + "step": 317540 + }, + { + "epoch": 2.8072455312151914, + "grad_norm": 7.275914192199707, + "learning_rate": 3.212574479746813e-06, + "loss": 0.5699, + "step": 317550 + }, + { + "epoch": 2.8073339344755035, + "grad_norm": 3.7134149074554443, + "learning_rate": 3.2111010920749423e-06, + "loss": 0.4075, + "step": 317560 + }, + { + "epoch": 2.8074223377358156, + "grad_norm": 6.911228656768799, + "learning_rate": 3.209627704403072e-06, + "loss": 0.5537, + "step": 317570 + }, + { + "epoch": 2.807510740996128, + "grad_norm": 24.236318588256836, + "learning_rate": 3.208154316731201e-06, + "loss": 0.4566, + "step": 317580 + }, + { + "epoch": 2.8075991442564403, + "grad_norm": 3.9818384647369385, + "learning_rate": 3.2066809290593303e-06, + "loss": 0.5981, + "step": 317590 + }, + { + "epoch": 2.8076875475167524, + "grad_norm": 1.5294129848480225, + "learning_rate": 3.2052075413874595e-06, + "loss": 0.4925, + "step": 317600 + }, + { + "epoch": 2.8077759507770645, + "grad_norm": 5.709980010986328, + "learning_rate": 3.2037341537155896e-06, + "loss": 0.3993, + "step": 317610 + }, + { + "epoch": 2.8078643540373767, + "grad_norm": 4.084619998931885, + "learning_rate": 3.2022607660437188e-06, + "loss": 0.5816, + "step": 317620 + }, + { + "epoch": 2.8079527572976892, + "grad_norm": 8.903585433959961, + "learning_rate": 3.200787378371848e-06, + "loss": 0.4654, + "step": 317630 + }, + { + "epoch": 2.8080411605580013, + "grad_norm": 1.020298957824707, + "learning_rate": 3.199313990699977e-06, + "loss": 0.4109, + "step": 317640 + }, + { + "epoch": 2.8081295638183135, + "grad_norm": 2.880502223968506, + "learning_rate": 3.1978406030281064e-06, + "loss": 0.6659, + "step": 317650 + }, + { + "epoch": 2.808217967078626, + "grad_norm": 3.5430502891540527, + "learning_rate": 3.1963672153562356e-06, + "loss": 0.4858, + "step": 317660 + }, + { + "epoch": 2.808306370338938, + "grad_norm": 4.267662048339844, + "learning_rate": 3.194893827684365e-06, + "loss": 0.4957, + "step": 317670 + }, + { + "epoch": 2.8083947735992503, + "grad_norm": 3.0471396446228027, + "learning_rate": 3.1934204400124944e-06, + "loss": 0.5376, + "step": 317680 + }, + { + "epoch": 2.8084831768595624, + "grad_norm": 4.042324542999268, + "learning_rate": 3.1919470523406236e-06, + "loss": 0.5372, + "step": 317690 + }, + { + "epoch": 2.808571580119875, + "grad_norm": 3.419016122817993, + "learning_rate": 3.1904736646687536e-06, + "loss": 0.5472, + "step": 317700 + }, + { + "epoch": 2.808659983380187, + "grad_norm": 4.618264198303223, + "learning_rate": 3.189000276996883e-06, + "loss": 0.518, + "step": 317710 + }, + { + "epoch": 2.808748386640499, + "grad_norm": 1.619488000869751, + "learning_rate": 3.187526889325012e-06, + "loss": 0.5968, + "step": 317720 + }, + { + "epoch": 2.8088367899008118, + "grad_norm": 12.677444458007812, + "learning_rate": 3.1860535016531412e-06, + "loss": 0.5384, + "step": 317730 + }, + { + "epoch": 2.808925193161124, + "grad_norm": 1.711972951889038, + "learning_rate": 3.1845801139812704e-06, + "loss": 0.4712, + "step": 317740 + }, + { + "epoch": 2.809013596421436, + "grad_norm": 2.2986130714416504, + "learning_rate": 3.1831067263093997e-06, + "loss": 0.5571, + "step": 317750 + }, + { + "epoch": 2.809101999681748, + "grad_norm": 1.2690691947937012, + "learning_rate": 3.181633338637529e-06, + "loss": 0.5004, + "step": 317760 + }, + { + "epoch": 2.8091904029420602, + "grad_norm": 3.6663033962249756, + "learning_rate": 3.180159950965658e-06, + "loss": 0.5989, + "step": 317770 + }, + { + "epoch": 2.809278806202373, + "grad_norm": 5.541961669921875, + "learning_rate": 3.1786865632937877e-06, + "loss": 0.5536, + "step": 317780 + }, + { + "epoch": 2.809367209462685, + "grad_norm": 4.446836948394775, + "learning_rate": 3.1772131756219173e-06, + "loss": 0.6364, + "step": 317790 + }, + { + "epoch": 2.8094556127229975, + "grad_norm": 3.2406675815582275, + "learning_rate": 3.1757397879500465e-06, + "loss": 0.5499, + "step": 317800 + }, + { + "epoch": 2.8095440159833096, + "grad_norm": 1.4515262842178345, + "learning_rate": 3.174266400278176e-06, + "loss": 0.4561, + "step": 317810 + }, + { + "epoch": 2.8096324192436217, + "grad_norm": 2.817840814590454, + "learning_rate": 3.1727930126063053e-06, + "loss": 0.5313, + "step": 317820 + }, + { + "epoch": 2.809720822503934, + "grad_norm": 2.5170490741729736, + "learning_rate": 3.1713196249344345e-06, + "loss": 0.6108, + "step": 317830 + }, + { + "epoch": 2.809809225764246, + "grad_norm": 6.60693359375, + "learning_rate": 3.1698462372625637e-06, + "loss": 0.4377, + "step": 317840 + }, + { + "epoch": 2.8098976290245585, + "grad_norm": 1.6508914232254028, + "learning_rate": 3.168372849590693e-06, + "loss": 0.5918, + "step": 317850 + }, + { + "epoch": 2.8099860322848706, + "grad_norm": 2.920069932937622, + "learning_rate": 3.166899461918822e-06, + "loss": 0.4661, + "step": 317860 + }, + { + "epoch": 2.8100744355451828, + "grad_norm": 8.365301132202148, + "learning_rate": 3.1654260742469513e-06, + "loss": 0.4961, + "step": 317870 + }, + { + "epoch": 2.8101628388054953, + "grad_norm": 13.88684368133545, + "learning_rate": 3.1639526865750805e-06, + "loss": 0.463, + "step": 317880 + }, + { + "epoch": 2.8102512420658075, + "grad_norm": 4.069801330566406, + "learning_rate": 3.1624792989032106e-06, + "loss": 0.5603, + "step": 317890 + }, + { + "epoch": 2.8103396453261196, + "grad_norm": 18.376052856445312, + "learning_rate": 3.1610059112313398e-06, + "loss": 0.4855, + "step": 317900 + }, + { + "epoch": 2.8104280485864317, + "grad_norm": 5.103705883026123, + "learning_rate": 3.159532523559469e-06, + "loss": 0.5311, + "step": 317910 + }, + { + "epoch": 2.8105164518467443, + "grad_norm": 2.1216611862182617, + "learning_rate": 3.1580591358875986e-06, + "loss": 0.5612, + "step": 317920 + }, + { + "epoch": 2.8106048551070564, + "grad_norm": 7.454061031341553, + "learning_rate": 3.156585748215728e-06, + "loss": 0.5623, + "step": 317930 + }, + { + "epoch": 2.8106932583673685, + "grad_norm": 2.682760715484619, + "learning_rate": 3.155112360543857e-06, + "loss": 0.5998, + "step": 317940 + }, + { + "epoch": 2.810781661627681, + "grad_norm": 5.006621360778809, + "learning_rate": 3.153638972871986e-06, + "loss": 0.6001, + "step": 317950 + }, + { + "epoch": 2.810870064887993, + "grad_norm": 1.488314151763916, + "learning_rate": 3.1521655852001154e-06, + "loss": 0.5015, + "step": 317960 + }, + { + "epoch": 2.8109584681483053, + "grad_norm": 19.563249588012695, + "learning_rate": 3.1506921975282446e-06, + "loss": 0.4758, + "step": 317970 + }, + { + "epoch": 2.8110468714086174, + "grad_norm": 1.8307292461395264, + "learning_rate": 3.1492188098563747e-06, + "loss": 0.6066, + "step": 317980 + }, + { + "epoch": 2.8111352746689295, + "grad_norm": 3.515204668045044, + "learning_rate": 3.147745422184504e-06, + "loss": 0.4323, + "step": 317990 + }, + { + "epoch": 2.811223677929242, + "grad_norm": 4.134374141693115, + "learning_rate": 3.146272034512633e-06, + "loss": 0.5612, + "step": 318000 + }, + { + "epoch": 2.8113120811895542, + "grad_norm": 3.0184972286224365, + "learning_rate": 3.1447986468407623e-06, + "loss": 0.5635, + "step": 318010 + }, + { + "epoch": 2.811400484449867, + "grad_norm": 2.293455123901367, + "learning_rate": 3.1433252591688915e-06, + "loss": 0.5872, + "step": 318020 + }, + { + "epoch": 2.811488887710179, + "grad_norm": 4.590289115905762, + "learning_rate": 3.141851871497021e-06, + "loss": 0.4683, + "step": 318030 + }, + { + "epoch": 2.811577290970491, + "grad_norm": 4.414993762969971, + "learning_rate": 3.1403784838251503e-06, + "loss": 0.5728, + "step": 318040 + }, + { + "epoch": 2.811665694230803, + "grad_norm": 5.1081061363220215, + "learning_rate": 3.1389050961532795e-06, + "loss": 0.4971, + "step": 318050 + }, + { + "epoch": 2.8117540974911153, + "grad_norm": 7.924398899078369, + "learning_rate": 3.1374317084814087e-06, + "loss": 0.4332, + "step": 318060 + }, + { + "epoch": 2.811842500751428, + "grad_norm": 4.059812545776367, + "learning_rate": 3.1359583208095387e-06, + "loss": 0.6481, + "step": 318070 + }, + { + "epoch": 2.81193090401174, + "grad_norm": 4.473089218139648, + "learning_rate": 3.134484933137668e-06, + "loss": 0.4643, + "step": 318080 + }, + { + "epoch": 2.812019307272052, + "grad_norm": 14.363041877746582, + "learning_rate": 3.133011545465797e-06, + "loss": 0.4938, + "step": 318090 + }, + { + "epoch": 2.8121077105323646, + "grad_norm": 6.33618688583374, + "learning_rate": 3.1315381577939263e-06, + "loss": 0.5474, + "step": 318100 + }, + { + "epoch": 2.8121961137926768, + "grad_norm": 1.772671103477478, + "learning_rate": 3.1300647701220555e-06, + "loss": 0.4143, + "step": 318110 + }, + { + "epoch": 2.812284517052989, + "grad_norm": 10.429116249084473, + "learning_rate": 3.1285913824501847e-06, + "loss": 0.5577, + "step": 318120 + }, + { + "epoch": 2.812372920313301, + "grad_norm": 13.98965072631836, + "learning_rate": 3.127117994778314e-06, + "loss": 0.5059, + "step": 318130 + }, + { + "epoch": 2.8124613235736136, + "grad_norm": 3.578916072845459, + "learning_rate": 3.1256446071064436e-06, + "loss": 0.4532, + "step": 318140 + }, + { + "epoch": 2.8125497268339257, + "grad_norm": 6.41947078704834, + "learning_rate": 3.124171219434573e-06, + "loss": 0.5505, + "step": 318150 + }, + { + "epoch": 2.812638130094238, + "grad_norm": 3.626469612121582, + "learning_rate": 3.1226978317627024e-06, + "loss": 0.4412, + "step": 318160 + }, + { + "epoch": 2.8127265333545504, + "grad_norm": 3.551513433456421, + "learning_rate": 3.1212244440908316e-06, + "loss": 0.7086, + "step": 318170 + }, + { + "epoch": 2.8128149366148625, + "grad_norm": 3.6516458988189697, + "learning_rate": 3.119751056418961e-06, + "loss": 0.4636, + "step": 318180 + }, + { + "epoch": 2.8129033398751746, + "grad_norm": 11.853553771972656, + "learning_rate": 3.1182776687470904e-06, + "loss": 0.5445, + "step": 318190 + }, + { + "epoch": 2.8129917431354867, + "grad_norm": 5.5324482917785645, + "learning_rate": 3.1168042810752196e-06, + "loss": 0.518, + "step": 318200 + }, + { + "epoch": 2.813080146395799, + "grad_norm": 2.5874714851379395, + "learning_rate": 3.115330893403349e-06, + "loss": 0.5097, + "step": 318210 + }, + { + "epoch": 2.8131685496561114, + "grad_norm": 7.236701965332031, + "learning_rate": 3.113857505731478e-06, + "loss": 0.5565, + "step": 318220 + }, + { + "epoch": 2.8132569529164235, + "grad_norm": 5.5959906578063965, + "learning_rate": 3.1123841180596072e-06, + "loss": 0.5011, + "step": 318230 + }, + { + "epoch": 2.8133453561767356, + "grad_norm": 2.522216796875, + "learning_rate": 3.110910730387737e-06, + "loss": 0.5142, + "step": 318240 + }, + { + "epoch": 2.813433759437048, + "grad_norm": 1.681321144104004, + "learning_rate": 3.109437342715866e-06, + "loss": 0.7069, + "step": 318250 + }, + { + "epoch": 2.8135221626973603, + "grad_norm": 1.5855354070663452, + "learning_rate": 3.1079639550439957e-06, + "loss": 0.5143, + "step": 318260 + }, + { + "epoch": 2.8136105659576724, + "grad_norm": 1.6644141674041748, + "learning_rate": 3.106490567372125e-06, + "loss": 0.4413, + "step": 318270 + }, + { + "epoch": 2.8136989692179846, + "grad_norm": 7.180301189422607, + "learning_rate": 3.1050171797002545e-06, + "loss": 0.5055, + "step": 318280 + }, + { + "epoch": 2.813787372478297, + "grad_norm": 6.7147417068481445, + "learning_rate": 3.1035437920283837e-06, + "loss": 0.4583, + "step": 318290 + }, + { + "epoch": 2.8138757757386093, + "grad_norm": 4.757834434509277, + "learning_rate": 3.102070404356513e-06, + "loss": 0.583, + "step": 318300 + }, + { + "epoch": 2.8139641789989214, + "grad_norm": 5.431620121002197, + "learning_rate": 3.100597016684642e-06, + "loss": 0.6002, + "step": 318310 + }, + { + "epoch": 2.814052582259234, + "grad_norm": 3.5931894779205322, + "learning_rate": 3.0991236290127713e-06, + "loss": 0.5266, + "step": 318320 + }, + { + "epoch": 2.814140985519546, + "grad_norm": 1.9563512802124023, + "learning_rate": 3.097650241340901e-06, + "loss": 0.4772, + "step": 318330 + }, + { + "epoch": 2.814229388779858, + "grad_norm": 8.47156047821045, + "learning_rate": 3.09617685366903e-06, + "loss": 0.3885, + "step": 318340 + }, + { + "epoch": 2.8143177920401703, + "grad_norm": 1.8804881572723389, + "learning_rate": 3.0947034659971593e-06, + "loss": 0.5141, + "step": 318350 + }, + { + "epoch": 2.8144061953004824, + "grad_norm": 9.132364273071289, + "learning_rate": 3.0932300783252885e-06, + "loss": 0.5299, + "step": 318360 + }, + { + "epoch": 2.814494598560795, + "grad_norm": 3.401134967803955, + "learning_rate": 3.091756690653418e-06, + "loss": 0.5187, + "step": 318370 + }, + { + "epoch": 2.814583001821107, + "grad_norm": 6.111754417419434, + "learning_rate": 3.0902833029815474e-06, + "loss": 0.593, + "step": 318380 + }, + { + "epoch": 2.8146714050814197, + "grad_norm": 1.5910080671310425, + "learning_rate": 3.088809915309677e-06, + "loss": 0.5263, + "step": 318390 + }, + { + "epoch": 2.814759808341732, + "grad_norm": 7.19528865814209, + "learning_rate": 3.087336527637806e-06, + "loss": 0.5422, + "step": 318400 + }, + { + "epoch": 2.814848211602044, + "grad_norm": 3.3504996299743652, + "learning_rate": 3.0858631399659354e-06, + "loss": 0.5324, + "step": 318410 + }, + { + "epoch": 2.814936614862356, + "grad_norm": 0.8754217624664307, + "learning_rate": 3.084389752294065e-06, + "loss": 0.4836, + "step": 318420 + }, + { + "epoch": 2.815025018122668, + "grad_norm": 2.0246968269348145, + "learning_rate": 3.0829163646221942e-06, + "loss": 0.5531, + "step": 318430 + }, + { + "epoch": 2.8151134213829807, + "grad_norm": 4.311032772064209, + "learning_rate": 3.0814429769503234e-06, + "loss": 0.5285, + "step": 318440 + }, + { + "epoch": 2.815201824643293, + "grad_norm": 4.255753040313721, + "learning_rate": 3.0799695892784526e-06, + "loss": 0.4702, + "step": 318450 + }, + { + "epoch": 2.815290227903605, + "grad_norm": 1.3690388202667236, + "learning_rate": 3.078496201606582e-06, + "loss": 0.457, + "step": 318460 + }, + { + "epoch": 2.8153786311639175, + "grad_norm": 1.8425508737564087, + "learning_rate": 3.0770228139347114e-06, + "loss": 0.4069, + "step": 318470 + }, + { + "epoch": 2.8154670344242296, + "grad_norm": 5.521236419677734, + "learning_rate": 3.0755494262628406e-06, + "loss": 0.5032, + "step": 318480 + }, + { + "epoch": 2.8155554376845418, + "grad_norm": 2.8384995460510254, + "learning_rate": 3.07407603859097e-06, + "loss": 0.5795, + "step": 318490 + }, + { + "epoch": 2.815643840944854, + "grad_norm": 1.3849979639053345, + "learning_rate": 3.0726026509190995e-06, + "loss": 0.3954, + "step": 318500 + }, + { + "epoch": 2.8157322442051664, + "grad_norm": 12.218595504760742, + "learning_rate": 3.0711292632472287e-06, + "loss": 0.6123, + "step": 318510 + }, + { + "epoch": 2.8158206474654786, + "grad_norm": 5.291456699371338, + "learning_rate": 3.0696558755753583e-06, + "loss": 0.5375, + "step": 318520 + }, + { + "epoch": 2.8159090507257907, + "grad_norm": 1.8429216146469116, + "learning_rate": 3.0681824879034875e-06, + "loss": 0.4773, + "step": 318530 + }, + { + "epoch": 2.8159974539861032, + "grad_norm": 3.3048911094665527, + "learning_rate": 3.0667091002316167e-06, + "loss": 0.458, + "step": 318540 + }, + { + "epoch": 2.8160858572464154, + "grad_norm": 3.867744207382202, + "learning_rate": 3.065235712559746e-06, + "loss": 0.5052, + "step": 318550 + }, + { + "epoch": 2.8161742605067275, + "grad_norm": 7.496212482452393, + "learning_rate": 3.0637623248878755e-06, + "loss": 0.5435, + "step": 318560 + }, + { + "epoch": 2.8162626637670396, + "grad_norm": 3.5208585262298584, + "learning_rate": 3.0622889372160047e-06, + "loss": 0.5918, + "step": 318570 + }, + { + "epoch": 2.8163510670273517, + "grad_norm": 9.90245532989502, + "learning_rate": 3.060815549544134e-06, + "loss": 0.5316, + "step": 318580 + }, + { + "epoch": 2.8164394702876643, + "grad_norm": 7.1530537605285645, + "learning_rate": 3.059342161872263e-06, + "loss": 0.4121, + "step": 318590 + }, + { + "epoch": 2.8165278735479764, + "grad_norm": 3.095024585723877, + "learning_rate": 3.0578687742003923e-06, + "loss": 0.5661, + "step": 318600 + }, + { + "epoch": 2.816616276808289, + "grad_norm": 1.0167540311813354, + "learning_rate": 3.056395386528522e-06, + "loss": 0.4897, + "step": 318610 + }, + { + "epoch": 2.816704680068601, + "grad_norm": 4.652667999267578, + "learning_rate": 3.0549219988566516e-06, + "loss": 0.4988, + "step": 318620 + }, + { + "epoch": 2.816793083328913, + "grad_norm": 3.2474265098571777, + "learning_rate": 3.0534486111847808e-06, + "loss": 0.4487, + "step": 318630 + }, + { + "epoch": 2.8168814865892253, + "grad_norm": 1.5228556394577026, + "learning_rate": 3.05197522351291e-06, + "loss": 0.4593, + "step": 318640 + }, + { + "epoch": 2.8169698898495374, + "grad_norm": 10.38929557800293, + "learning_rate": 3.050501835841039e-06, + "loss": 0.4597, + "step": 318650 + }, + { + "epoch": 2.81705829310985, + "grad_norm": 8.094940185546875, + "learning_rate": 3.049028448169169e-06, + "loss": 0.6889, + "step": 318660 + }, + { + "epoch": 2.817146696370162, + "grad_norm": 10.575318336486816, + "learning_rate": 3.047555060497298e-06, + "loss": 0.447, + "step": 318670 + }, + { + "epoch": 2.8172350996304742, + "grad_norm": 1.2702466249465942, + "learning_rate": 3.046081672825427e-06, + "loss": 0.6263, + "step": 318680 + }, + { + "epoch": 2.817323502890787, + "grad_norm": 3.251462936401367, + "learning_rate": 3.0446082851535564e-06, + "loss": 0.6275, + "step": 318690 + }, + { + "epoch": 2.817411906151099, + "grad_norm": 1.7296732664108276, + "learning_rate": 3.043134897481686e-06, + "loss": 0.4238, + "step": 318700 + }, + { + "epoch": 2.817500309411411, + "grad_norm": 5.216441631317139, + "learning_rate": 3.0416615098098152e-06, + "loss": 0.545, + "step": 318710 + }, + { + "epoch": 2.817588712671723, + "grad_norm": 6.230571269989014, + "learning_rate": 3.0401881221379444e-06, + "loss": 0.5087, + "step": 318720 + }, + { + "epoch": 2.8176771159320357, + "grad_norm": 4.871128559112549, + "learning_rate": 3.038714734466074e-06, + "loss": 0.5215, + "step": 318730 + }, + { + "epoch": 2.817765519192348, + "grad_norm": 1.4826536178588867, + "learning_rate": 3.0372413467942033e-06, + "loss": 0.5558, + "step": 318740 + }, + { + "epoch": 2.81785392245266, + "grad_norm": 1.8344775438308716, + "learning_rate": 3.035767959122333e-06, + "loss": 0.5063, + "step": 318750 + }, + { + "epoch": 2.8179423257129725, + "grad_norm": 1.0839455127716064, + "learning_rate": 3.034294571450462e-06, + "loss": 0.4202, + "step": 318760 + }, + { + "epoch": 2.8180307289732847, + "grad_norm": 3.559135675430298, + "learning_rate": 3.0328211837785913e-06, + "loss": 0.4513, + "step": 318770 + }, + { + "epoch": 2.818119132233597, + "grad_norm": 3.519164562225342, + "learning_rate": 3.0313477961067205e-06, + "loss": 0.7333, + "step": 318780 + }, + { + "epoch": 2.818207535493909, + "grad_norm": 4.783641338348389, + "learning_rate": 3.0298744084348497e-06, + "loss": 0.567, + "step": 318790 + }, + { + "epoch": 2.818295938754221, + "grad_norm": 2.493481397628784, + "learning_rate": 3.0284010207629793e-06, + "loss": 0.5108, + "step": 318800 + }, + { + "epoch": 2.8183843420145336, + "grad_norm": 1.0327508449554443, + "learning_rate": 3.0269276330911085e-06, + "loss": 0.5734, + "step": 318810 + }, + { + "epoch": 2.8184727452748457, + "grad_norm": 5.404900550842285, + "learning_rate": 3.0254542454192377e-06, + "loss": 0.5786, + "step": 318820 + }, + { + "epoch": 2.818561148535158, + "grad_norm": 10.335110664367676, + "learning_rate": 3.023980857747367e-06, + "loss": 0.4678, + "step": 318830 + }, + { + "epoch": 2.8186495517954704, + "grad_norm": 9.558374404907227, + "learning_rate": 3.0225074700754965e-06, + "loss": 0.5371, + "step": 318840 + }, + { + "epoch": 2.8187379550557825, + "grad_norm": 3.3539257049560547, + "learning_rate": 3.021034082403626e-06, + "loss": 0.4986, + "step": 318850 + }, + { + "epoch": 2.8188263583160946, + "grad_norm": 2.9771907329559326, + "learning_rate": 3.0195606947317554e-06, + "loss": 0.5433, + "step": 318860 + }, + { + "epoch": 2.8189147615764067, + "grad_norm": 2.3960626125335693, + "learning_rate": 3.0180873070598846e-06, + "loss": 0.5656, + "step": 318870 + }, + { + "epoch": 2.8190031648367193, + "grad_norm": 3.7716081142425537, + "learning_rate": 3.0166139193880138e-06, + "loss": 0.5473, + "step": 318880 + }, + { + "epoch": 2.8190915680970314, + "grad_norm": 17.027372360229492, + "learning_rate": 3.0151405317161434e-06, + "loss": 0.4793, + "step": 318890 + }, + { + "epoch": 2.8191799713573436, + "grad_norm": 2.175257921218872, + "learning_rate": 3.0136671440442726e-06, + "loss": 0.4873, + "step": 318900 + }, + { + "epoch": 2.819268374617656, + "grad_norm": 2.0843234062194824, + "learning_rate": 3.012193756372402e-06, + "loss": 0.4271, + "step": 318910 + }, + { + "epoch": 2.8193567778779682, + "grad_norm": 5.018442153930664, + "learning_rate": 3.010720368700531e-06, + "loss": 0.5242, + "step": 318920 + }, + { + "epoch": 2.8194451811382804, + "grad_norm": 2.8797502517700195, + "learning_rate": 3.00924698102866e-06, + "loss": 0.6025, + "step": 318930 + }, + { + "epoch": 2.8195335843985925, + "grad_norm": 16.86820411682129, + "learning_rate": 3.00777359335679e-06, + "loss": 0.5549, + "step": 318940 + }, + { + "epoch": 2.8196219876589046, + "grad_norm": 2.9608280658721924, + "learning_rate": 3.006300205684919e-06, + "loss": 0.5839, + "step": 318950 + }, + { + "epoch": 2.819710390919217, + "grad_norm": 2.235496997833252, + "learning_rate": 3.0048268180130486e-06, + "loss": 0.508, + "step": 318960 + }, + { + "epoch": 2.8197987941795293, + "grad_norm": 3.245119571685791, + "learning_rate": 3.003353430341178e-06, + "loss": 0.491, + "step": 318970 + }, + { + "epoch": 2.819887197439842, + "grad_norm": 2.520087718963623, + "learning_rate": 3.0018800426693075e-06, + "loss": 0.5733, + "step": 318980 + }, + { + "epoch": 2.819975600700154, + "grad_norm": 1.0327550172805786, + "learning_rate": 3.0004066549974367e-06, + "loss": 0.4735, + "step": 318990 + }, + { + "epoch": 2.820064003960466, + "grad_norm": 3.475147247314453, + "learning_rate": 2.998933267325566e-06, + "loss": 0.4038, + "step": 319000 + }, + { + "epoch": 2.820152407220778, + "grad_norm": 3.1793363094329834, + "learning_rate": 2.997459879653695e-06, + "loss": 0.6079, + "step": 319010 + }, + { + "epoch": 2.8202408104810903, + "grad_norm": 2.6943163871765137, + "learning_rate": 2.9959864919818243e-06, + "loss": 0.6387, + "step": 319020 + }, + { + "epoch": 2.820329213741403, + "grad_norm": 2.548457384109497, + "learning_rate": 2.994513104309954e-06, + "loss": 0.4335, + "step": 319030 + }, + { + "epoch": 2.820417617001715, + "grad_norm": 1.7644470930099487, + "learning_rate": 2.993039716638083e-06, + "loss": 0.3371, + "step": 319040 + }, + { + "epoch": 2.820506020262027, + "grad_norm": 2.50935435295105, + "learning_rate": 2.9915663289662123e-06, + "loss": 0.6549, + "step": 319050 + }, + { + "epoch": 2.8205944235223397, + "grad_norm": 4.992284297943115, + "learning_rate": 2.9900929412943415e-06, + "loss": 0.4155, + "step": 319060 + }, + { + "epoch": 2.820682826782652, + "grad_norm": 9.04786205291748, + "learning_rate": 2.988619553622471e-06, + "loss": 0.5963, + "step": 319070 + }, + { + "epoch": 2.820771230042964, + "grad_norm": 4.404948711395264, + "learning_rate": 2.9871461659506003e-06, + "loss": 0.5768, + "step": 319080 + }, + { + "epoch": 2.820859633303276, + "grad_norm": 2.3655965328216553, + "learning_rate": 2.98567277827873e-06, + "loss": 0.4425, + "step": 319090 + }, + { + "epoch": 2.8209480365635886, + "grad_norm": 6.288800239562988, + "learning_rate": 2.984199390606859e-06, + "loss": 0.5783, + "step": 319100 + }, + { + "epoch": 2.8210364398239007, + "grad_norm": 7.06682825088501, + "learning_rate": 2.9827260029349883e-06, + "loss": 0.4392, + "step": 319110 + }, + { + "epoch": 2.821124843084213, + "grad_norm": 7.508475303649902, + "learning_rate": 2.9812526152631176e-06, + "loss": 0.4217, + "step": 319120 + }, + { + "epoch": 2.8212132463445254, + "grad_norm": 3.141927480697632, + "learning_rate": 2.979779227591247e-06, + "loss": 0.4844, + "step": 319130 + }, + { + "epoch": 2.8213016496048375, + "grad_norm": 1.0030770301818848, + "learning_rate": 2.9783058399193764e-06, + "loss": 0.6217, + "step": 319140 + }, + { + "epoch": 2.8213900528651497, + "grad_norm": 4.3085432052612305, + "learning_rate": 2.9768324522475056e-06, + "loss": 0.6007, + "step": 319150 + }, + { + "epoch": 2.8214784561254618, + "grad_norm": 5.895828723907471, + "learning_rate": 2.9753590645756348e-06, + "loss": 0.562, + "step": 319160 + }, + { + "epoch": 2.821566859385774, + "grad_norm": 2.0401134490966797, + "learning_rate": 2.9738856769037644e-06, + "loss": 0.5583, + "step": 319170 + }, + { + "epoch": 2.8216552626460865, + "grad_norm": 9.162633895874023, + "learning_rate": 2.9724122892318936e-06, + "loss": 0.5896, + "step": 319180 + }, + { + "epoch": 2.8217436659063986, + "grad_norm": 5.820855617523193, + "learning_rate": 2.970938901560023e-06, + "loss": 0.5827, + "step": 319190 + }, + { + "epoch": 2.821832069166711, + "grad_norm": 1.6822099685668945, + "learning_rate": 2.9694655138881524e-06, + "loss": 0.3479, + "step": 319200 + }, + { + "epoch": 2.8219204724270233, + "grad_norm": 4.062804222106934, + "learning_rate": 2.9679921262162816e-06, + "loss": 0.5493, + "step": 319210 + }, + { + "epoch": 2.8220088756873354, + "grad_norm": 9.257307052612305, + "learning_rate": 2.9665187385444113e-06, + "loss": 0.5183, + "step": 319220 + }, + { + "epoch": 2.8220972789476475, + "grad_norm": 19.613361358642578, + "learning_rate": 2.9650453508725405e-06, + "loss": 0.4813, + "step": 319230 + }, + { + "epoch": 2.8221856822079596, + "grad_norm": 5.1841888427734375, + "learning_rate": 2.9635719632006697e-06, + "loss": 0.4956, + "step": 319240 + }, + { + "epoch": 2.822274085468272, + "grad_norm": 1.572402000427246, + "learning_rate": 2.962098575528799e-06, + "loss": 0.5371, + "step": 319250 + }, + { + "epoch": 2.8223624887285843, + "grad_norm": 2.5470056533813477, + "learning_rate": 2.960625187856928e-06, + "loss": 0.475, + "step": 319260 + }, + { + "epoch": 2.8224508919888964, + "grad_norm": 6.872549533843994, + "learning_rate": 2.9591518001850577e-06, + "loss": 0.479, + "step": 319270 + }, + { + "epoch": 2.822539295249209, + "grad_norm": 2.4473085403442383, + "learning_rate": 2.957678412513187e-06, + "loss": 0.5007, + "step": 319280 + }, + { + "epoch": 2.822627698509521, + "grad_norm": 1.357988953590393, + "learning_rate": 2.956205024841316e-06, + "loss": 0.5942, + "step": 319290 + }, + { + "epoch": 2.8227161017698332, + "grad_norm": 8.377309799194336, + "learning_rate": 2.9547316371694453e-06, + "loss": 0.5341, + "step": 319300 + }, + { + "epoch": 2.8228045050301454, + "grad_norm": 2.2492763996124268, + "learning_rate": 2.953258249497575e-06, + "loss": 0.4894, + "step": 319310 + }, + { + "epoch": 2.822892908290458, + "grad_norm": 7.917896747589111, + "learning_rate": 2.9517848618257045e-06, + "loss": 0.5955, + "step": 319320 + }, + { + "epoch": 2.82298131155077, + "grad_norm": 3.302199363708496, + "learning_rate": 2.9503114741538337e-06, + "loss": 0.548, + "step": 319330 + }, + { + "epoch": 2.823069714811082, + "grad_norm": 2.8650763034820557, + "learning_rate": 2.948838086481963e-06, + "loss": 0.4756, + "step": 319340 + }, + { + "epoch": 2.8231581180713947, + "grad_norm": 1.7041981220245361, + "learning_rate": 2.947364698810092e-06, + "loss": 0.5718, + "step": 319350 + }, + { + "epoch": 2.823246521331707, + "grad_norm": 4.709727764129639, + "learning_rate": 2.9458913111382218e-06, + "loss": 0.4989, + "step": 319360 + }, + { + "epoch": 2.823334924592019, + "grad_norm": 4.196396827697754, + "learning_rate": 2.944417923466351e-06, + "loss": 0.612, + "step": 319370 + }, + { + "epoch": 2.823423327852331, + "grad_norm": 2.4867563247680664, + "learning_rate": 2.94294453579448e-06, + "loss": 0.5427, + "step": 319380 + }, + { + "epoch": 2.823511731112643, + "grad_norm": 2.0735673904418945, + "learning_rate": 2.9414711481226094e-06, + "loss": 0.5895, + "step": 319390 + }, + { + "epoch": 2.8236001343729558, + "grad_norm": 12.526673316955566, + "learning_rate": 2.9399977604507386e-06, + "loss": 0.5811, + "step": 319400 + }, + { + "epoch": 2.823688537633268, + "grad_norm": 4.0456767082214355, + "learning_rate": 2.938524372778868e-06, + "loss": 0.5046, + "step": 319410 + }, + { + "epoch": 2.82377694089358, + "grad_norm": 6.582129001617432, + "learning_rate": 2.9370509851069974e-06, + "loss": 0.6409, + "step": 319420 + }, + { + "epoch": 2.8238653441538926, + "grad_norm": 1.9784539937973022, + "learning_rate": 2.935577597435127e-06, + "loss": 0.454, + "step": 319430 + }, + { + "epoch": 2.8239537474142047, + "grad_norm": 7.415929317474365, + "learning_rate": 2.9341042097632562e-06, + "loss": 0.5158, + "step": 319440 + }, + { + "epoch": 2.824042150674517, + "grad_norm": 2.080134391784668, + "learning_rate": 2.932630822091386e-06, + "loss": 0.5283, + "step": 319450 + }, + { + "epoch": 2.824130553934829, + "grad_norm": 2.6108665466308594, + "learning_rate": 2.931157434419515e-06, + "loss": 0.5217, + "step": 319460 + }, + { + "epoch": 2.8242189571951415, + "grad_norm": 1.670150637626648, + "learning_rate": 2.9296840467476442e-06, + "loss": 0.6005, + "step": 319470 + }, + { + "epoch": 2.8243073604554536, + "grad_norm": 2.1977717876434326, + "learning_rate": 2.9282106590757734e-06, + "loss": 0.3694, + "step": 319480 + }, + { + "epoch": 2.8243957637157657, + "grad_norm": 1.8020737171173096, + "learning_rate": 2.9267372714039026e-06, + "loss": 0.4252, + "step": 319490 + }, + { + "epoch": 2.8244841669760783, + "grad_norm": 9.188392639160156, + "learning_rate": 2.9252638837320323e-06, + "loss": 0.541, + "step": 319500 + }, + { + "epoch": 2.8245725702363904, + "grad_norm": 1.6046125888824463, + "learning_rate": 2.9237904960601615e-06, + "loss": 0.3897, + "step": 319510 + }, + { + "epoch": 2.8246609734967025, + "grad_norm": 4.14688777923584, + "learning_rate": 2.9223171083882907e-06, + "loss": 0.5001, + "step": 319520 + }, + { + "epoch": 2.8247493767570147, + "grad_norm": 4.160635471343994, + "learning_rate": 2.92084372071642e-06, + "loss": 0.5219, + "step": 319530 + }, + { + "epoch": 2.8248377800173268, + "grad_norm": 7.976750373840332, + "learning_rate": 2.9193703330445495e-06, + "loss": 0.5042, + "step": 319540 + }, + { + "epoch": 2.8249261832776393, + "grad_norm": 3.764474391937256, + "learning_rate": 2.917896945372679e-06, + "loss": 0.4632, + "step": 319550 + }, + { + "epoch": 2.8250145865379515, + "grad_norm": 3.3990299701690674, + "learning_rate": 2.9164235577008083e-06, + "loss": 0.5834, + "step": 319560 + }, + { + "epoch": 2.825102989798264, + "grad_norm": 5.269881248474121, + "learning_rate": 2.9149501700289375e-06, + "loss": 0.5216, + "step": 319570 + }, + { + "epoch": 2.825191393058576, + "grad_norm": 4.9020676612854, + "learning_rate": 2.9134767823570667e-06, + "loss": 0.6644, + "step": 319580 + }, + { + "epoch": 2.8252797963188883, + "grad_norm": 9.441620826721191, + "learning_rate": 2.9120033946851964e-06, + "loss": 0.5347, + "step": 319590 + }, + { + "epoch": 2.8253681995792004, + "grad_norm": 6.5171732902526855, + "learning_rate": 2.9105300070133256e-06, + "loss": 0.4977, + "step": 319600 + }, + { + "epoch": 2.8254566028395125, + "grad_norm": 4.160897731781006, + "learning_rate": 2.9090566193414548e-06, + "loss": 0.4363, + "step": 319610 + }, + { + "epoch": 2.825545006099825, + "grad_norm": 5.47243595123291, + "learning_rate": 2.907583231669584e-06, + "loss": 0.4702, + "step": 319620 + }, + { + "epoch": 2.825633409360137, + "grad_norm": 9.186187744140625, + "learning_rate": 2.906109843997713e-06, + "loss": 0.4636, + "step": 319630 + }, + { + "epoch": 2.8257218126204493, + "grad_norm": 4.915459632873535, + "learning_rate": 2.9046364563258428e-06, + "loss": 0.5483, + "step": 319640 + }, + { + "epoch": 2.825810215880762, + "grad_norm": 13.296531677246094, + "learning_rate": 2.903163068653972e-06, + "loss": 0.4828, + "step": 319650 + }, + { + "epoch": 2.825898619141074, + "grad_norm": 2.6074862480163574, + "learning_rate": 2.9016896809821016e-06, + "loss": 0.5141, + "step": 319660 + }, + { + "epoch": 2.825987022401386, + "grad_norm": 3.4658656120300293, + "learning_rate": 2.900216293310231e-06, + "loss": 0.4989, + "step": 319670 + }, + { + "epoch": 2.8260754256616982, + "grad_norm": 3.1879770755767822, + "learning_rate": 2.89874290563836e-06, + "loss": 0.3903, + "step": 319680 + }, + { + "epoch": 2.826163828922011, + "grad_norm": 3.524627447128296, + "learning_rate": 2.8972695179664896e-06, + "loss": 0.3959, + "step": 319690 + }, + { + "epoch": 2.826252232182323, + "grad_norm": 8.707279205322266, + "learning_rate": 2.895796130294619e-06, + "loss": 0.4175, + "step": 319700 + }, + { + "epoch": 2.826340635442635, + "grad_norm": 4.875030994415283, + "learning_rate": 2.894322742622748e-06, + "loss": 0.55, + "step": 319710 + }, + { + "epoch": 2.8264290387029476, + "grad_norm": 1.7041890621185303, + "learning_rate": 2.8928493549508772e-06, + "loss": 0.4342, + "step": 319720 + }, + { + "epoch": 2.8265174419632597, + "grad_norm": 1.8235247135162354, + "learning_rate": 2.891375967279007e-06, + "loss": 0.4356, + "step": 319730 + }, + { + "epoch": 2.826605845223572, + "grad_norm": 3.6502981185913086, + "learning_rate": 2.889902579607136e-06, + "loss": 0.3727, + "step": 319740 + }, + { + "epoch": 2.826694248483884, + "grad_norm": 9.848337173461914, + "learning_rate": 2.8884291919352653e-06, + "loss": 0.459, + "step": 319750 + }, + { + "epoch": 2.826782651744196, + "grad_norm": 4.746179580688477, + "learning_rate": 2.8869558042633945e-06, + "loss": 0.597, + "step": 319760 + }, + { + "epoch": 2.8268710550045086, + "grad_norm": 1.7334163188934326, + "learning_rate": 2.885482416591524e-06, + "loss": 0.4817, + "step": 319770 + }, + { + "epoch": 2.8269594582648208, + "grad_norm": 4.38770866394043, + "learning_rate": 2.8840090289196533e-06, + "loss": 0.4605, + "step": 319780 + }, + { + "epoch": 2.8270478615251333, + "grad_norm": 3.350916862487793, + "learning_rate": 2.882535641247783e-06, + "loss": 0.4905, + "step": 319790 + }, + { + "epoch": 2.8271362647854454, + "grad_norm": 1.4260892868041992, + "learning_rate": 2.881062253575912e-06, + "loss": 0.6014, + "step": 319800 + }, + { + "epoch": 2.8272246680457576, + "grad_norm": 0.8022703528404236, + "learning_rate": 2.8795888659040413e-06, + "loss": 0.3744, + "step": 319810 + }, + { + "epoch": 2.8273130713060697, + "grad_norm": 8.555363655090332, + "learning_rate": 2.8781154782321705e-06, + "loss": 0.4181, + "step": 319820 + }, + { + "epoch": 2.827401474566382, + "grad_norm": 1.9865620136260986, + "learning_rate": 2.8766420905603e-06, + "loss": 0.4399, + "step": 319830 + }, + { + "epoch": 2.8274898778266944, + "grad_norm": 3.2394399642944336, + "learning_rate": 2.8751687028884293e-06, + "loss": 0.5083, + "step": 319840 + }, + { + "epoch": 2.8275782810870065, + "grad_norm": 3.6408774852752686, + "learning_rate": 2.8736953152165585e-06, + "loss": 0.4884, + "step": 319850 + }, + { + "epoch": 2.8276666843473186, + "grad_norm": 6.943711280822754, + "learning_rate": 2.8722219275446877e-06, + "loss": 0.5981, + "step": 319860 + }, + { + "epoch": 2.827755087607631, + "grad_norm": 3.7883684635162354, + "learning_rate": 2.8707485398728174e-06, + "loss": 0.5695, + "step": 319870 + }, + { + "epoch": 2.8278434908679433, + "grad_norm": 2.133979320526123, + "learning_rate": 2.8692751522009466e-06, + "loss": 0.4636, + "step": 319880 + }, + { + "epoch": 2.8279318941282554, + "grad_norm": 5.6743316650390625, + "learning_rate": 2.8678017645290758e-06, + "loss": 0.6659, + "step": 319890 + }, + { + "epoch": 2.8280202973885675, + "grad_norm": 3.8006649017333984, + "learning_rate": 2.8663283768572054e-06, + "loss": 0.4633, + "step": 319900 + }, + { + "epoch": 2.82810870064888, + "grad_norm": 1.2830321788787842, + "learning_rate": 2.8648549891853346e-06, + "loss": 0.4312, + "step": 319910 + }, + { + "epoch": 2.828197103909192, + "grad_norm": 1.7304478883743286, + "learning_rate": 2.8633816015134642e-06, + "loss": 0.6003, + "step": 319920 + }, + { + "epoch": 2.8282855071695043, + "grad_norm": 6.292008399963379, + "learning_rate": 2.8619082138415934e-06, + "loss": 0.6124, + "step": 319930 + }, + { + "epoch": 2.828373910429817, + "grad_norm": 0.9378353357315063, + "learning_rate": 2.8604348261697226e-06, + "loss": 0.5356, + "step": 319940 + }, + { + "epoch": 2.828462313690129, + "grad_norm": 3.9875128269195557, + "learning_rate": 2.858961438497852e-06, + "loss": 0.5272, + "step": 319950 + }, + { + "epoch": 2.828550716950441, + "grad_norm": 6.307404041290283, + "learning_rate": 2.857488050825981e-06, + "loss": 0.5982, + "step": 319960 + }, + { + "epoch": 2.8286391202107533, + "grad_norm": 1.5944725275039673, + "learning_rate": 2.8560146631541106e-06, + "loss": 0.5151, + "step": 319970 + }, + { + "epoch": 2.8287275234710654, + "grad_norm": 2.4033868312835693, + "learning_rate": 2.85454127548224e-06, + "loss": 0.5634, + "step": 319980 + }, + { + "epoch": 2.828815926731378, + "grad_norm": 3.7014923095703125, + "learning_rate": 2.853067887810369e-06, + "loss": 0.4374, + "step": 319990 + }, + { + "epoch": 2.82890432999169, + "grad_norm": 2.3390560150146484, + "learning_rate": 2.8515945001384987e-06, + "loss": 0.4822, + "step": 320000 + }, + { + "epoch": 2.828992733252002, + "grad_norm": 7.392993927001953, + "learning_rate": 2.850121112466628e-06, + "loss": 0.4697, + "step": 320010 + }, + { + "epoch": 2.8290811365123147, + "grad_norm": 4.862769603729248, + "learning_rate": 2.8486477247947575e-06, + "loss": 0.4294, + "step": 320020 + }, + { + "epoch": 2.829169539772627, + "grad_norm": 7.103195667266846, + "learning_rate": 2.8471743371228867e-06, + "loss": 0.4889, + "step": 320030 + }, + { + "epoch": 2.829257943032939, + "grad_norm": 4.9096269607543945, + "learning_rate": 2.845700949451016e-06, + "loss": 0.5364, + "step": 320040 + }, + { + "epoch": 2.829346346293251, + "grad_norm": 3.667140483856201, + "learning_rate": 2.844227561779145e-06, + "loss": 0.516, + "step": 320050 + }, + { + "epoch": 2.8294347495535637, + "grad_norm": 2.319427728652954, + "learning_rate": 2.8427541741072747e-06, + "loss": 0.4933, + "step": 320060 + }, + { + "epoch": 2.829523152813876, + "grad_norm": 5.266237735748291, + "learning_rate": 2.841280786435404e-06, + "loss": 0.5977, + "step": 320070 + }, + { + "epoch": 2.829611556074188, + "grad_norm": 1.047705888748169, + "learning_rate": 2.839807398763533e-06, + "loss": 0.4192, + "step": 320080 + }, + { + "epoch": 2.8296999593345005, + "grad_norm": 6.065146446228027, + "learning_rate": 2.8383340110916623e-06, + "loss": 0.4994, + "step": 320090 + }, + { + "epoch": 2.8297883625948126, + "grad_norm": 9.643455505371094, + "learning_rate": 2.8368606234197915e-06, + "loss": 0.3888, + "step": 320100 + }, + { + "epoch": 2.8298767658551247, + "grad_norm": 3.715761661529541, + "learning_rate": 2.835387235747921e-06, + "loss": 0.7013, + "step": 320110 + }, + { + "epoch": 2.829965169115437, + "grad_norm": 8.16971492767334, + "learning_rate": 2.8339138480760504e-06, + "loss": 0.547, + "step": 320120 + }, + { + "epoch": 2.830053572375749, + "grad_norm": 2.613426446914673, + "learning_rate": 2.83244046040418e-06, + "loss": 0.5554, + "step": 320130 + }, + { + "epoch": 2.8301419756360615, + "grad_norm": 2.0418660640716553, + "learning_rate": 2.830967072732309e-06, + "loss": 0.5861, + "step": 320140 + }, + { + "epoch": 2.8302303788963736, + "grad_norm": 2.721752405166626, + "learning_rate": 2.829493685060439e-06, + "loss": 0.5472, + "step": 320150 + }, + { + "epoch": 2.830318782156686, + "grad_norm": 5.364828586578369, + "learning_rate": 2.828020297388568e-06, + "loss": 0.549, + "step": 320160 + }, + { + "epoch": 2.8304071854169983, + "grad_norm": 3.1262035369873047, + "learning_rate": 2.826546909716697e-06, + "loss": 0.4494, + "step": 320170 + }, + { + "epoch": 2.8304955886773104, + "grad_norm": 4.244606018066406, + "learning_rate": 2.8250735220448264e-06, + "loss": 0.4958, + "step": 320180 + }, + { + "epoch": 2.8305839919376226, + "grad_norm": 4.136528015136719, + "learning_rate": 2.8236001343729556e-06, + "loss": 0.4434, + "step": 320190 + }, + { + "epoch": 2.8306723951979347, + "grad_norm": 3.7892463207244873, + "learning_rate": 2.8221267467010852e-06, + "loss": 0.42, + "step": 320200 + }, + { + "epoch": 2.8307607984582472, + "grad_norm": 10.647525787353516, + "learning_rate": 2.8206533590292144e-06, + "loss": 0.4824, + "step": 320210 + }, + { + "epoch": 2.8308492017185594, + "grad_norm": 10.533933639526367, + "learning_rate": 2.8191799713573436e-06, + "loss": 0.5471, + "step": 320220 + }, + { + "epoch": 2.8309376049788715, + "grad_norm": 1.6404069662094116, + "learning_rate": 2.817706583685473e-06, + "loss": 0.5346, + "step": 320230 + }, + { + "epoch": 2.831026008239184, + "grad_norm": 1.1358014345169067, + "learning_rate": 2.8162331960136025e-06, + "loss": 0.5423, + "step": 320240 + }, + { + "epoch": 2.831114411499496, + "grad_norm": 3.742095708847046, + "learning_rate": 2.814759808341732e-06, + "loss": 0.513, + "step": 320250 + }, + { + "epoch": 2.8312028147598083, + "grad_norm": 0.4711519777774811, + "learning_rate": 2.8132864206698613e-06, + "loss": 0.506, + "step": 320260 + }, + { + "epoch": 2.8312912180201204, + "grad_norm": 2.6556239128112793, + "learning_rate": 2.8118130329979905e-06, + "loss": 0.5479, + "step": 320270 + }, + { + "epoch": 2.831379621280433, + "grad_norm": 4.5092668533325195, + "learning_rate": 2.8103396453261197e-06, + "loss": 0.4661, + "step": 320280 + }, + { + "epoch": 2.831468024540745, + "grad_norm": 2.873588800430298, + "learning_rate": 2.8088662576542493e-06, + "loss": 0.5165, + "step": 320290 + }, + { + "epoch": 2.831556427801057, + "grad_norm": 26.377384185791016, + "learning_rate": 2.8073928699823785e-06, + "loss": 0.5783, + "step": 320300 + }, + { + "epoch": 2.8316448310613698, + "grad_norm": 8.803069114685059, + "learning_rate": 2.8059194823105077e-06, + "loss": 0.3908, + "step": 320310 + }, + { + "epoch": 2.831733234321682, + "grad_norm": 4.055894374847412, + "learning_rate": 2.804446094638637e-06, + "loss": 0.4975, + "step": 320320 + }, + { + "epoch": 2.831821637581994, + "grad_norm": 5.277040481567383, + "learning_rate": 2.802972706966766e-06, + "loss": 0.5454, + "step": 320330 + }, + { + "epoch": 2.831910040842306, + "grad_norm": 30.90617561340332, + "learning_rate": 2.8014993192948957e-06, + "loss": 0.4849, + "step": 320340 + }, + { + "epoch": 2.8319984441026183, + "grad_norm": 2.420535087585449, + "learning_rate": 2.800025931623025e-06, + "loss": 0.5335, + "step": 320350 + }, + { + "epoch": 2.832086847362931, + "grad_norm": 6.099735260009766, + "learning_rate": 2.7985525439511546e-06, + "loss": 0.5501, + "step": 320360 + }, + { + "epoch": 2.832175250623243, + "grad_norm": 2.1413612365722656, + "learning_rate": 2.7970791562792838e-06, + "loss": 0.5617, + "step": 320370 + }, + { + "epoch": 2.8322636538835555, + "grad_norm": 3.780735731124878, + "learning_rate": 2.795605768607413e-06, + "loss": 0.4866, + "step": 320380 + }, + { + "epoch": 2.8323520571438676, + "grad_norm": 7.038763999938965, + "learning_rate": 2.7941323809355426e-06, + "loss": 0.442, + "step": 320390 + }, + { + "epoch": 2.8324404604041797, + "grad_norm": 4.3805108070373535, + "learning_rate": 2.792658993263672e-06, + "loss": 0.5785, + "step": 320400 + }, + { + "epoch": 2.832528863664492, + "grad_norm": 3.499676465988159, + "learning_rate": 2.791185605591801e-06, + "loss": 0.5229, + "step": 320410 + }, + { + "epoch": 2.832617266924804, + "grad_norm": 21.558731079101562, + "learning_rate": 2.78971221791993e-06, + "loss": 0.5347, + "step": 320420 + }, + { + "epoch": 2.8327056701851165, + "grad_norm": 11.243224143981934, + "learning_rate": 2.78823883024806e-06, + "loss": 0.5093, + "step": 320430 + }, + { + "epoch": 2.8327940734454287, + "grad_norm": 3.4583890438079834, + "learning_rate": 2.786765442576189e-06, + "loss": 0.5474, + "step": 320440 + }, + { + "epoch": 2.832882476705741, + "grad_norm": 1.4822912216186523, + "learning_rate": 2.7852920549043182e-06, + "loss": 0.4878, + "step": 320450 + }, + { + "epoch": 2.8329708799660533, + "grad_norm": 6.4161906242370605, + "learning_rate": 2.7838186672324474e-06, + "loss": 0.4668, + "step": 320460 + }, + { + "epoch": 2.8330592832263655, + "grad_norm": 4.058376312255859, + "learning_rate": 2.782345279560577e-06, + "loss": 0.5397, + "step": 320470 + }, + { + "epoch": 2.8331476864866776, + "grad_norm": 1.1773748397827148, + "learning_rate": 2.7808718918887067e-06, + "loss": 0.7001, + "step": 320480 + }, + { + "epoch": 2.8332360897469897, + "grad_norm": 2.281991958618164, + "learning_rate": 2.779398504216836e-06, + "loss": 0.5313, + "step": 320490 + }, + { + "epoch": 2.8333244930073023, + "grad_norm": 1.3617335557937622, + "learning_rate": 2.777925116544965e-06, + "loss": 0.5065, + "step": 320500 + }, + { + "epoch": 2.8334128962676144, + "grad_norm": 3.8236989974975586, + "learning_rate": 2.7764517288730943e-06, + "loss": 0.5939, + "step": 320510 + }, + { + "epoch": 2.8335012995279265, + "grad_norm": 3.1058225631713867, + "learning_rate": 2.7749783412012235e-06, + "loss": 0.5556, + "step": 320520 + }, + { + "epoch": 2.833589702788239, + "grad_norm": 4.043028831481934, + "learning_rate": 2.773504953529353e-06, + "loss": 0.4924, + "step": 320530 + }, + { + "epoch": 2.833678106048551, + "grad_norm": 6.474728584289551, + "learning_rate": 2.7720315658574823e-06, + "loss": 0.5469, + "step": 320540 + }, + { + "epoch": 2.8337665093088633, + "grad_norm": 8.388376235961914, + "learning_rate": 2.7705581781856115e-06, + "loss": 0.4538, + "step": 320550 + }, + { + "epoch": 2.8338549125691754, + "grad_norm": 3.886340379714966, + "learning_rate": 2.7690847905137407e-06, + "loss": 0.5257, + "step": 320560 + }, + { + "epoch": 2.8339433158294876, + "grad_norm": 2.4706411361694336, + "learning_rate": 2.7676114028418703e-06, + "loss": 0.4956, + "step": 320570 + }, + { + "epoch": 2.8340317190898, + "grad_norm": 3.8481764793395996, + "learning_rate": 2.7661380151699995e-06, + "loss": 0.4209, + "step": 320580 + }, + { + "epoch": 2.8341201223501122, + "grad_norm": 6.623083114624023, + "learning_rate": 2.764664627498129e-06, + "loss": 0.5027, + "step": 320590 + }, + { + "epoch": 2.8342085256104244, + "grad_norm": 7.432358264923096, + "learning_rate": 2.7631912398262584e-06, + "loss": 0.5483, + "step": 320600 + }, + { + "epoch": 2.834296928870737, + "grad_norm": 4.6673431396484375, + "learning_rate": 2.7617178521543876e-06, + "loss": 0.5368, + "step": 320610 + }, + { + "epoch": 2.834385332131049, + "grad_norm": 1.9592279195785522, + "learning_rate": 2.760244464482517e-06, + "loss": 0.4888, + "step": 320620 + }, + { + "epoch": 2.834473735391361, + "grad_norm": 3.129708766937256, + "learning_rate": 2.7587710768106464e-06, + "loss": 0.5965, + "step": 320630 + }, + { + "epoch": 2.8345621386516733, + "grad_norm": 1.9867818355560303, + "learning_rate": 2.7572976891387756e-06, + "loss": 0.4094, + "step": 320640 + }, + { + "epoch": 2.834650541911986, + "grad_norm": 8.950523376464844, + "learning_rate": 2.7558243014669048e-06, + "loss": 0.5592, + "step": 320650 + }, + { + "epoch": 2.834738945172298, + "grad_norm": 2.5632882118225098, + "learning_rate": 2.754350913795034e-06, + "loss": 0.4965, + "step": 320660 + }, + { + "epoch": 2.83482734843261, + "grad_norm": 12.0087308883667, + "learning_rate": 2.7528775261231636e-06, + "loss": 0.4734, + "step": 320670 + }, + { + "epoch": 2.8349157516929226, + "grad_norm": 1.716140627861023, + "learning_rate": 2.751404138451293e-06, + "loss": 0.5311, + "step": 320680 + }, + { + "epoch": 2.8350041549532348, + "grad_norm": 9.47512435913086, + "learning_rate": 2.749930750779422e-06, + "loss": 0.5115, + "step": 320690 + }, + { + "epoch": 2.835092558213547, + "grad_norm": 2.376523971557617, + "learning_rate": 2.7484573631075516e-06, + "loss": 0.5889, + "step": 320700 + }, + { + "epoch": 2.835180961473859, + "grad_norm": 3.346954584121704, + "learning_rate": 2.746983975435681e-06, + "loss": 0.6419, + "step": 320710 + }, + { + "epoch": 2.835269364734171, + "grad_norm": 3.7310307025909424, + "learning_rate": 2.7455105877638105e-06, + "loss": 0.6812, + "step": 320720 + }, + { + "epoch": 2.8353577679944837, + "grad_norm": 2.1778087615966797, + "learning_rate": 2.7440372000919397e-06, + "loss": 0.6122, + "step": 320730 + }, + { + "epoch": 2.835446171254796, + "grad_norm": 3.5528817176818848, + "learning_rate": 2.742563812420069e-06, + "loss": 0.416, + "step": 320740 + }, + { + "epoch": 2.8355345745151084, + "grad_norm": 4.414902687072754, + "learning_rate": 2.741090424748198e-06, + "loss": 0.6096, + "step": 320750 + }, + { + "epoch": 2.8356229777754205, + "grad_norm": 2.3934266567230225, + "learning_rate": 2.7396170370763277e-06, + "loss": 0.4458, + "step": 320760 + }, + { + "epoch": 2.8357113810357326, + "grad_norm": 5.330607891082764, + "learning_rate": 2.738143649404457e-06, + "loss": 0.4902, + "step": 320770 + }, + { + "epoch": 2.8357997842960447, + "grad_norm": 5.098132133483887, + "learning_rate": 2.736670261732586e-06, + "loss": 0.4486, + "step": 320780 + }, + { + "epoch": 2.835888187556357, + "grad_norm": 5.329583168029785, + "learning_rate": 2.7351968740607153e-06, + "loss": 0.6126, + "step": 320790 + }, + { + "epoch": 2.8359765908166694, + "grad_norm": 1.0196245908737183, + "learning_rate": 2.7337234863888445e-06, + "loss": 0.5793, + "step": 320800 + }, + { + "epoch": 2.8360649940769815, + "grad_norm": 4.630887508392334, + "learning_rate": 2.732250098716974e-06, + "loss": 0.4792, + "step": 320810 + }, + { + "epoch": 2.8361533973372937, + "grad_norm": 7.250920295715332, + "learning_rate": 2.7307767110451033e-06, + "loss": 0.5585, + "step": 320820 + }, + { + "epoch": 2.836241800597606, + "grad_norm": 3.960911989212036, + "learning_rate": 2.729303323373233e-06, + "loss": 0.4026, + "step": 320830 + }, + { + "epoch": 2.8363302038579183, + "grad_norm": 2.811328887939453, + "learning_rate": 2.727829935701362e-06, + "loss": 0.5226, + "step": 320840 + }, + { + "epoch": 2.8364186071182305, + "grad_norm": 3.571503162384033, + "learning_rate": 2.7263565480294918e-06, + "loss": 0.4829, + "step": 320850 + }, + { + "epoch": 2.8365070103785426, + "grad_norm": 2.0158417224884033, + "learning_rate": 2.724883160357621e-06, + "loss": 0.4526, + "step": 320860 + }, + { + "epoch": 2.836595413638855, + "grad_norm": 1.433997631072998, + "learning_rate": 2.72340977268575e-06, + "loss": 0.4645, + "step": 320870 + }, + { + "epoch": 2.8366838168991673, + "grad_norm": 5.823851585388184, + "learning_rate": 2.7219363850138794e-06, + "loss": 0.5043, + "step": 320880 + }, + { + "epoch": 2.8367722201594794, + "grad_norm": 7.595434665679932, + "learning_rate": 2.7204629973420086e-06, + "loss": 0.4932, + "step": 320890 + }, + { + "epoch": 2.836860623419792, + "grad_norm": 4.22359037399292, + "learning_rate": 2.718989609670138e-06, + "loss": 0.4694, + "step": 320900 + }, + { + "epoch": 2.836949026680104, + "grad_norm": 4.6248321533203125, + "learning_rate": 2.7175162219982674e-06, + "loss": 0.4232, + "step": 320910 + }, + { + "epoch": 2.837037429940416, + "grad_norm": 2.351776123046875, + "learning_rate": 2.7160428343263966e-06, + "loss": 0.4126, + "step": 320920 + }, + { + "epoch": 2.8371258332007283, + "grad_norm": 1.995771050453186, + "learning_rate": 2.714569446654526e-06, + "loss": 0.5218, + "step": 320930 + }, + { + "epoch": 2.8372142364610404, + "grad_norm": 6.280935287475586, + "learning_rate": 2.7130960589826554e-06, + "loss": 0.5631, + "step": 320940 + }, + { + "epoch": 2.837302639721353, + "grad_norm": 5.618125915527344, + "learning_rate": 2.711622671310785e-06, + "loss": 0.5386, + "step": 320950 + }, + { + "epoch": 2.837391042981665, + "grad_norm": 1.8222819566726685, + "learning_rate": 2.7101492836389143e-06, + "loss": 0.5088, + "step": 320960 + }, + { + "epoch": 2.8374794462419777, + "grad_norm": 3.1643238067626953, + "learning_rate": 2.7086758959670435e-06, + "loss": 0.4982, + "step": 320970 + }, + { + "epoch": 2.83756784950229, + "grad_norm": 1.2630916833877563, + "learning_rate": 2.7072025082951727e-06, + "loss": 0.4563, + "step": 320980 + }, + { + "epoch": 2.837656252762602, + "grad_norm": 2.950472116470337, + "learning_rate": 2.7057291206233023e-06, + "loss": 0.5815, + "step": 320990 + }, + { + "epoch": 2.837744656022914, + "grad_norm": 8.653281211853027, + "learning_rate": 2.7042557329514315e-06, + "loss": 0.5392, + "step": 321000 + }, + { + "epoch": 2.837833059283226, + "grad_norm": 4.772243499755859, + "learning_rate": 2.7027823452795607e-06, + "loss": 0.5708, + "step": 321010 + }, + { + "epoch": 2.8379214625435387, + "grad_norm": 2.9767322540283203, + "learning_rate": 2.70130895760769e-06, + "loss": 0.5428, + "step": 321020 + }, + { + "epoch": 2.838009865803851, + "grad_norm": 8.250003814697266, + "learning_rate": 2.699835569935819e-06, + "loss": 0.5204, + "step": 321030 + }, + { + "epoch": 2.838098269064163, + "grad_norm": 7.220468997955322, + "learning_rate": 2.6983621822639487e-06, + "loss": 0.4528, + "step": 321040 + }, + { + "epoch": 2.8381866723244755, + "grad_norm": 3.6625571250915527, + "learning_rate": 2.696888794592078e-06, + "loss": 0.5018, + "step": 321050 + }, + { + "epoch": 2.8382750755847876, + "grad_norm": 17.014972686767578, + "learning_rate": 2.6954154069202075e-06, + "loss": 0.5423, + "step": 321060 + }, + { + "epoch": 2.8383634788450998, + "grad_norm": 2.9764623641967773, + "learning_rate": 2.6939420192483367e-06, + "loss": 0.6201, + "step": 321070 + }, + { + "epoch": 2.838451882105412, + "grad_norm": 3.1523489952087402, + "learning_rate": 2.692468631576466e-06, + "loss": 0.5819, + "step": 321080 + }, + { + "epoch": 2.8385402853657244, + "grad_norm": 3.302539348602295, + "learning_rate": 2.6909952439045956e-06, + "loss": 0.5339, + "step": 321090 + }, + { + "epoch": 2.8386286886260366, + "grad_norm": 1.6603301763534546, + "learning_rate": 2.6895218562327248e-06, + "loss": 0.4338, + "step": 321100 + }, + { + "epoch": 2.8387170918863487, + "grad_norm": 1.2521287202835083, + "learning_rate": 2.688048468560854e-06, + "loss": 0.5051, + "step": 321110 + }, + { + "epoch": 2.8388054951466613, + "grad_norm": 3.110442876815796, + "learning_rate": 2.686575080888983e-06, + "loss": 0.4643, + "step": 321120 + }, + { + "epoch": 2.8388938984069734, + "grad_norm": 13.533472061157227, + "learning_rate": 2.6851016932171128e-06, + "loss": 0.5637, + "step": 321130 + }, + { + "epoch": 2.8389823016672855, + "grad_norm": 5.287076473236084, + "learning_rate": 2.683628305545242e-06, + "loss": 0.6051, + "step": 321140 + }, + { + "epoch": 2.8390707049275976, + "grad_norm": 2.253727674484253, + "learning_rate": 2.682154917873371e-06, + "loss": 0.5885, + "step": 321150 + }, + { + "epoch": 2.8391591081879097, + "grad_norm": 2.284377336502075, + "learning_rate": 2.6806815302015004e-06, + "loss": 0.5859, + "step": 321160 + }, + { + "epoch": 2.8392475114482223, + "grad_norm": 2.5332977771759033, + "learning_rate": 2.67920814252963e-06, + "loss": 0.4243, + "step": 321170 + }, + { + "epoch": 2.8393359147085344, + "grad_norm": 17.42315673828125, + "learning_rate": 2.6777347548577596e-06, + "loss": 0.5423, + "step": 321180 + }, + { + "epoch": 2.8394243179688465, + "grad_norm": 2.3888180255889893, + "learning_rate": 2.676261367185889e-06, + "loss": 0.5486, + "step": 321190 + }, + { + "epoch": 2.839512721229159, + "grad_norm": 3.110473155975342, + "learning_rate": 2.674787979514018e-06, + "loss": 0.3993, + "step": 321200 + }, + { + "epoch": 2.839601124489471, + "grad_norm": 4.4572834968566895, + "learning_rate": 2.6733145918421472e-06, + "loss": 0.6295, + "step": 321210 + }, + { + "epoch": 2.8396895277497833, + "grad_norm": 2.134809970855713, + "learning_rate": 2.6718412041702764e-06, + "loss": 0.38, + "step": 321220 + }, + { + "epoch": 2.8397779310100955, + "grad_norm": 4.484951972961426, + "learning_rate": 2.670367816498406e-06, + "loss": 0.5745, + "step": 321230 + }, + { + "epoch": 2.839866334270408, + "grad_norm": 1.114912509918213, + "learning_rate": 2.6688944288265353e-06, + "loss": 0.4323, + "step": 321240 + }, + { + "epoch": 2.83995473753072, + "grad_norm": 2.7757911682128906, + "learning_rate": 2.6674210411546645e-06, + "loss": 0.48, + "step": 321250 + }, + { + "epoch": 2.8400431407910323, + "grad_norm": 2.7817702293395996, + "learning_rate": 2.6659476534827937e-06, + "loss": 0.5876, + "step": 321260 + }, + { + "epoch": 2.840131544051345, + "grad_norm": 4.38533878326416, + "learning_rate": 2.664474265810923e-06, + "loss": 0.5504, + "step": 321270 + }, + { + "epoch": 2.840219947311657, + "grad_norm": 4.602242946624756, + "learning_rate": 2.6630008781390525e-06, + "loss": 0.5104, + "step": 321280 + }, + { + "epoch": 2.840308350571969, + "grad_norm": 7.516955852508545, + "learning_rate": 2.661527490467182e-06, + "loss": 0.602, + "step": 321290 + }, + { + "epoch": 2.840396753832281, + "grad_norm": 1.6488475799560547, + "learning_rate": 2.6600541027953113e-06, + "loss": 0.4107, + "step": 321300 + }, + { + "epoch": 2.8404851570925933, + "grad_norm": 10.285978317260742, + "learning_rate": 2.6585807151234405e-06, + "loss": 0.5364, + "step": 321310 + }, + { + "epoch": 2.840573560352906, + "grad_norm": 6.2079758644104, + "learning_rate": 2.65710732745157e-06, + "loss": 0.4875, + "step": 321320 + }, + { + "epoch": 2.840661963613218, + "grad_norm": 3.4796502590179443, + "learning_rate": 2.6556339397796993e-06, + "loss": 0.5045, + "step": 321330 + }, + { + "epoch": 2.8407503668735306, + "grad_norm": 4.852774143218994, + "learning_rate": 2.6541605521078285e-06, + "loss": 0.3956, + "step": 321340 + }, + { + "epoch": 2.8408387701338427, + "grad_norm": 2.3879034519195557, + "learning_rate": 2.6526871644359577e-06, + "loss": 0.4467, + "step": 321350 + }, + { + "epoch": 2.840927173394155, + "grad_norm": 2.5320961475372314, + "learning_rate": 2.651213776764087e-06, + "loss": 0.5434, + "step": 321360 + }, + { + "epoch": 2.841015576654467, + "grad_norm": 1.737464427947998, + "learning_rate": 2.6497403890922166e-06, + "loss": 0.4767, + "step": 321370 + }, + { + "epoch": 2.841103979914779, + "grad_norm": 2.985214948654175, + "learning_rate": 2.6482670014203458e-06, + "loss": 0.4722, + "step": 321380 + }, + { + "epoch": 2.8411923831750916, + "grad_norm": 2.76536226272583, + "learning_rate": 2.646793613748475e-06, + "loss": 0.5203, + "step": 321390 + }, + { + "epoch": 2.8412807864354037, + "grad_norm": 8.85761833190918, + "learning_rate": 2.6453202260766046e-06, + "loss": 0.4987, + "step": 321400 + }, + { + "epoch": 2.841369189695716, + "grad_norm": 3.2395670413970947, + "learning_rate": 2.643846838404734e-06, + "loss": 0.4307, + "step": 321410 + }, + { + "epoch": 2.8414575929560284, + "grad_norm": 2.41862416267395, + "learning_rate": 2.6423734507328634e-06, + "loss": 0.481, + "step": 321420 + }, + { + "epoch": 2.8415459962163405, + "grad_norm": 1.7745498418807983, + "learning_rate": 2.6409000630609926e-06, + "loss": 0.6049, + "step": 321430 + }, + { + "epoch": 2.8416343994766526, + "grad_norm": 2.4958975315093994, + "learning_rate": 2.639426675389122e-06, + "loss": 0.4803, + "step": 321440 + }, + { + "epoch": 2.8417228027369648, + "grad_norm": 4.101349353790283, + "learning_rate": 2.637953287717251e-06, + "loss": 0.5228, + "step": 321450 + }, + { + "epoch": 2.8418112059972773, + "grad_norm": 8.85032844543457, + "learning_rate": 2.6364799000453807e-06, + "loss": 0.533, + "step": 321460 + }, + { + "epoch": 2.8418996092575894, + "grad_norm": 1.4603569507598877, + "learning_rate": 2.63500651237351e-06, + "loss": 0.513, + "step": 321470 + }, + { + "epoch": 2.8419880125179016, + "grad_norm": 3.6566152572631836, + "learning_rate": 2.633533124701639e-06, + "loss": 0.4776, + "step": 321480 + }, + { + "epoch": 2.842076415778214, + "grad_norm": 6.091458797454834, + "learning_rate": 2.6320597370297683e-06, + "loss": 0.3912, + "step": 321490 + }, + { + "epoch": 2.8421648190385262, + "grad_norm": 4.349668979644775, + "learning_rate": 2.6305863493578975e-06, + "loss": 0.4877, + "step": 321500 + }, + { + "epoch": 2.8422532222988384, + "grad_norm": 3.504810333251953, + "learning_rate": 2.629112961686027e-06, + "loss": 0.5445, + "step": 321510 + }, + { + "epoch": 2.8423416255591505, + "grad_norm": 1.497635841369629, + "learning_rate": 2.6276395740141563e-06, + "loss": 0.4358, + "step": 321520 + }, + { + "epoch": 2.8424300288194626, + "grad_norm": 8.037155151367188, + "learning_rate": 2.626166186342286e-06, + "loss": 0.4939, + "step": 321530 + }, + { + "epoch": 2.842518432079775, + "grad_norm": 7.606583118438721, + "learning_rate": 2.624692798670415e-06, + "loss": 0.51, + "step": 321540 + }, + { + "epoch": 2.8426068353400873, + "grad_norm": 4.614182472229004, + "learning_rate": 2.6232194109985443e-06, + "loss": 0.5308, + "step": 321550 + }, + { + "epoch": 2.8426952386004, + "grad_norm": 4.267144680023193, + "learning_rate": 2.621746023326674e-06, + "loss": 0.4395, + "step": 321560 + }, + { + "epoch": 2.842783641860712, + "grad_norm": 3.73848295211792, + "learning_rate": 2.620272635654803e-06, + "loss": 0.4517, + "step": 321570 + }, + { + "epoch": 2.842872045121024, + "grad_norm": 6.187122821807861, + "learning_rate": 2.6187992479829323e-06, + "loss": 0.5462, + "step": 321580 + }, + { + "epoch": 2.842960448381336, + "grad_norm": 1.7913357019424438, + "learning_rate": 2.6173258603110615e-06, + "loss": 0.6611, + "step": 321590 + }, + { + "epoch": 2.8430488516416483, + "grad_norm": 2.680617332458496, + "learning_rate": 2.615852472639191e-06, + "loss": 0.4888, + "step": 321600 + }, + { + "epoch": 2.843137254901961, + "grad_norm": 3.516681432723999, + "learning_rate": 2.6143790849673204e-06, + "loss": 0.4043, + "step": 321610 + }, + { + "epoch": 2.843225658162273, + "grad_norm": 4.490594387054443, + "learning_rate": 2.6129056972954496e-06, + "loss": 0.6756, + "step": 321620 + }, + { + "epoch": 2.843314061422585, + "grad_norm": 5.518166542053223, + "learning_rate": 2.6114323096235788e-06, + "loss": 0.5004, + "step": 321630 + }, + { + "epoch": 2.8434024646828977, + "grad_norm": 2.958787441253662, + "learning_rate": 2.6099589219517084e-06, + "loss": 0.4847, + "step": 321640 + }, + { + "epoch": 2.84349086794321, + "grad_norm": 3.6258602142333984, + "learning_rate": 2.608485534279838e-06, + "loss": 0.5432, + "step": 321650 + }, + { + "epoch": 2.843579271203522, + "grad_norm": 2.7412967681884766, + "learning_rate": 2.6070121466079672e-06, + "loss": 0.5193, + "step": 321660 + }, + { + "epoch": 2.843667674463834, + "grad_norm": 2.6814892292022705, + "learning_rate": 2.6055387589360964e-06, + "loss": 0.6054, + "step": 321670 + }, + { + "epoch": 2.8437560777241466, + "grad_norm": 6.616231918334961, + "learning_rate": 2.6040653712642256e-06, + "loss": 0.5864, + "step": 321680 + }, + { + "epoch": 2.8438444809844587, + "grad_norm": 1.6874979734420776, + "learning_rate": 2.602591983592355e-06, + "loss": 0.3995, + "step": 321690 + }, + { + "epoch": 2.843932884244771, + "grad_norm": 5.684721946716309, + "learning_rate": 2.6011185959204844e-06, + "loss": 0.4886, + "step": 321700 + }, + { + "epoch": 2.8440212875050834, + "grad_norm": 5.349959850311279, + "learning_rate": 2.5996452082486136e-06, + "loss": 0.4804, + "step": 321710 + }, + { + "epoch": 2.8441096907653955, + "grad_norm": 5.315876483917236, + "learning_rate": 2.598171820576743e-06, + "loss": 0.4332, + "step": 321720 + }, + { + "epoch": 2.8441980940257077, + "grad_norm": 1.9461898803710938, + "learning_rate": 2.596698432904872e-06, + "loss": 0.4079, + "step": 321730 + }, + { + "epoch": 2.84428649728602, + "grad_norm": 3.2027692794799805, + "learning_rate": 2.5952250452330017e-06, + "loss": 0.6106, + "step": 321740 + }, + { + "epoch": 2.844374900546332, + "grad_norm": 7.205440998077393, + "learning_rate": 2.593751657561131e-06, + "loss": 0.509, + "step": 321750 + }, + { + "epoch": 2.8444633038066445, + "grad_norm": 8.760283470153809, + "learning_rate": 2.5922782698892605e-06, + "loss": 0.4985, + "step": 321760 + }, + { + "epoch": 2.8445517070669566, + "grad_norm": 6.23874044418335, + "learning_rate": 2.5908048822173897e-06, + "loss": 0.5008, + "step": 321770 + }, + { + "epoch": 2.8446401103272687, + "grad_norm": 3.504628896713257, + "learning_rate": 2.589331494545519e-06, + "loss": 0.6679, + "step": 321780 + }, + { + "epoch": 2.8447285135875813, + "grad_norm": 0.9346033930778503, + "learning_rate": 2.5878581068736485e-06, + "loss": 0.4236, + "step": 321790 + }, + { + "epoch": 2.8448169168478934, + "grad_norm": 2.1191515922546387, + "learning_rate": 2.5863847192017777e-06, + "loss": 0.4529, + "step": 321800 + }, + { + "epoch": 2.8449053201082055, + "grad_norm": 6.145602226257324, + "learning_rate": 2.584911331529907e-06, + "loss": 0.5507, + "step": 321810 + }, + { + "epoch": 2.8449937233685176, + "grad_norm": 2.604897975921631, + "learning_rate": 2.583437943858036e-06, + "loss": 0.4613, + "step": 321820 + }, + { + "epoch": 2.84508212662883, + "grad_norm": 2.070343255996704, + "learning_rate": 2.5819645561861653e-06, + "loss": 0.3955, + "step": 321830 + }, + { + "epoch": 2.8451705298891423, + "grad_norm": 7.008542060852051, + "learning_rate": 2.580491168514295e-06, + "loss": 0.4793, + "step": 321840 + }, + { + "epoch": 2.8452589331494544, + "grad_norm": 1.9860254526138306, + "learning_rate": 2.579017780842424e-06, + "loss": 0.4987, + "step": 321850 + }, + { + "epoch": 2.845347336409767, + "grad_norm": 9.205671310424805, + "learning_rate": 2.5775443931705534e-06, + "loss": 0.4245, + "step": 321860 + }, + { + "epoch": 2.845435739670079, + "grad_norm": 5.481868743896484, + "learning_rate": 2.576071005498683e-06, + "loss": 0.5208, + "step": 321870 + }, + { + "epoch": 2.8455241429303912, + "grad_norm": 5.615553855895996, + "learning_rate": 2.5745976178268126e-06, + "loss": 0.6409, + "step": 321880 + }, + { + "epoch": 2.8456125461907034, + "grad_norm": 1.8004544973373413, + "learning_rate": 2.573124230154942e-06, + "loss": 0.5139, + "step": 321890 + }, + { + "epoch": 2.8457009494510155, + "grad_norm": 1.5015536546707153, + "learning_rate": 2.571650842483071e-06, + "loss": 0.4799, + "step": 321900 + }, + { + "epoch": 2.845789352711328, + "grad_norm": 2.887176513671875, + "learning_rate": 2.5701774548112e-06, + "loss": 0.6214, + "step": 321910 + }, + { + "epoch": 2.84587775597164, + "grad_norm": 18.113481521606445, + "learning_rate": 2.5687040671393294e-06, + "loss": 0.4954, + "step": 321920 + }, + { + "epoch": 2.8459661592319527, + "grad_norm": 2.9676010608673096, + "learning_rate": 2.567230679467459e-06, + "loss": 0.564, + "step": 321930 + }, + { + "epoch": 2.846054562492265, + "grad_norm": 22.52900505065918, + "learning_rate": 2.5657572917955882e-06, + "loss": 0.6064, + "step": 321940 + }, + { + "epoch": 2.846142965752577, + "grad_norm": 8.80559253692627, + "learning_rate": 2.5642839041237174e-06, + "loss": 0.4435, + "step": 321950 + }, + { + "epoch": 2.846231369012889, + "grad_norm": 1.9780410528182983, + "learning_rate": 2.5628105164518466e-06, + "loss": 0.5117, + "step": 321960 + }, + { + "epoch": 2.846319772273201, + "grad_norm": 1.9441864490509033, + "learning_rate": 2.561337128779976e-06, + "loss": 0.6094, + "step": 321970 + }, + { + "epoch": 2.8464081755335138, + "grad_norm": 3.5115206241607666, + "learning_rate": 2.5598637411081055e-06, + "loss": 0.5408, + "step": 321980 + }, + { + "epoch": 2.846496578793826, + "grad_norm": 3.417027711868286, + "learning_rate": 2.558390353436235e-06, + "loss": 0.6325, + "step": 321990 + }, + { + "epoch": 2.846584982054138, + "grad_norm": 5.484719753265381, + "learning_rate": 2.5569169657643643e-06, + "loss": 0.504, + "step": 322000 + }, + { + "epoch": 2.8466733853144506, + "grad_norm": 2.501359701156616, + "learning_rate": 2.5554435780924935e-06, + "loss": 0.5693, + "step": 322010 + }, + { + "epoch": 2.8467617885747627, + "grad_norm": 2.6285605430603027, + "learning_rate": 2.553970190420623e-06, + "loss": 0.4559, + "step": 322020 + }, + { + "epoch": 2.846850191835075, + "grad_norm": 4.846668720245361, + "learning_rate": 2.5524968027487523e-06, + "loss": 0.4764, + "step": 322030 + }, + { + "epoch": 2.846938595095387, + "grad_norm": 2.048436403274536, + "learning_rate": 2.5510234150768815e-06, + "loss": 0.5085, + "step": 322040 + }, + { + "epoch": 2.8470269983556995, + "grad_norm": 2.134925365447998, + "learning_rate": 2.5495500274050107e-06, + "loss": 0.4822, + "step": 322050 + }, + { + "epoch": 2.8471154016160116, + "grad_norm": 5.984210968017578, + "learning_rate": 2.54807663973314e-06, + "loss": 0.5256, + "step": 322060 + }, + { + "epoch": 2.8472038048763237, + "grad_norm": 0.9003944993019104, + "learning_rate": 2.5466032520612695e-06, + "loss": 0.5705, + "step": 322070 + }, + { + "epoch": 2.8472922081366363, + "grad_norm": 10.43937873840332, + "learning_rate": 2.5451298643893987e-06, + "loss": 0.4143, + "step": 322080 + }, + { + "epoch": 2.8473806113969484, + "grad_norm": 2.6982827186584473, + "learning_rate": 2.543656476717528e-06, + "loss": 0.5051, + "step": 322090 + }, + { + "epoch": 2.8474690146572605, + "grad_norm": 10.960773468017578, + "learning_rate": 2.5421830890456576e-06, + "loss": 0.459, + "step": 322100 + }, + { + "epoch": 2.8475574179175727, + "grad_norm": 4.532067775726318, + "learning_rate": 2.5407097013737868e-06, + "loss": 0.5319, + "step": 322110 + }, + { + "epoch": 2.847645821177885, + "grad_norm": 6.553077697753906, + "learning_rate": 2.5392363137019164e-06, + "loss": 0.6025, + "step": 322120 + }, + { + "epoch": 2.8477342244381973, + "grad_norm": 4.231441497802734, + "learning_rate": 2.5377629260300456e-06, + "loss": 0.57, + "step": 322130 + }, + { + "epoch": 2.8478226276985095, + "grad_norm": 7.847822189331055, + "learning_rate": 2.536289538358175e-06, + "loss": 0.5752, + "step": 322140 + }, + { + "epoch": 2.847911030958822, + "grad_norm": 4.723659992218018, + "learning_rate": 2.534816150686304e-06, + "loss": 0.4467, + "step": 322150 + }, + { + "epoch": 2.847999434219134, + "grad_norm": 4.620954513549805, + "learning_rate": 2.5333427630144336e-06, + "loss": 0.4893, + "step": 322160 + }, + { + "epoch": 2.8480878374794463, + "grad_norm": 2.449892520904541, + "learning_rate": 2.531869375342563e-06, + "loss": 0.476, + "step": 322170 + }, + { + "epoch": 2.8481762407397584, + "grad_norm": 1.2823439836502075, + "learning_rate": 2.530395987670692e-06, + "loss": 0.5241, + "step": 322180 + }, + { + "epoch": 2.8482646440000705, + "grad_norm": 2.342435121536255, + "learning_rate": 2.5289225999988212e-06, + "loss": 0.5005, + "step": 322190 + }, + { + "epoch": 2.848353047260383, + "grad_norm": 4.058268070220947, + "learning_rate": 2.5274492123269504e-06, + "loss": 0.5203, + "step": 322200 + }, + { + "epoch": 2.848441450520695, + "grad_norm": 6.4408769607543945, + "learning_rate": 2.52597582465508e-06, + "loss": 0.5065, + "step": 322210 + }, + { + "epoch": 2.8485298537810073, + "grad_norm": 4.201756954193115, + "learning_rate": 2.5245024369832092e-06, + "loss": 0.4759, + "step": 322220 + }, + { + "epoch": 2.84861825704132, + "grad_norm": 1.843934178352356, + "learning_rate": 2.523029049311339e-06, + "loss": 0.4789, + "step": 322230 + }, + { + "epoch": 2.848706660301632, + "grad_norm": 6.152200698852539, + "learning_rate": 2.521555661639468e-06, + "loss": 0.5544, + "step": 322240 + }, + { + "epoch": 2.848795063561944, + "grad_norm": 3.3256542682647705, + "learning_rate": 2.5200822739675973e-06, + "loss": 0.4891, + "step": 322250 + }, + { + "epoch": 2.8488834668222562, + "grad_norm": 3.080350160598755, + "learning_rate": 2.518608886295727e-06, + "loss": 0.4836, + "step": 322260 + }, + { + "epoch": 2.848971870082569, + "grad_norm": 4.999409198760986, + "learning_rate": 2.517135498623856e-06, + "loss": 0.5886, + "step": 322270 + }, + { + "epoch": 2.849060273342881, + "grad_norm": 1.7969616651535034, + "learning_rate": 2.5156621109519853e-06, + "loss": 0.4741, + "step": 322280 + }, + { + "epoch": 2.849148676603193, + "grad_norm": 1.8246426582336426, + "learning_rate": 2.5141887232801145e-06, + "loss": 0.4244, + "step": 322290 + }, + { + "epoch": 2.8492370798635056, + "grad_norm": 7.247693061828613, + "learning_rate": 2.512715335608244e-06, + "loss": 0.5051, + "step": 322300 + }, + { + "epoch": 2.8493254831238177, + "grad_norm": 14.418137550354004, + "learning_rate": 2.5112419479363733e-06, + "loss": 0.5761, + "step": 322310 + }, + { + "epoch": 2.84941388638413, + "grad_norm": 2.016348123550415, + "learning_rate": 2.5097685602645025e-06, + "loss": 0.5323, + "step": 322320 + }, + { + "epoch": 2.849502289644442, + "grad_norm": 6.297048091888428, + "learning_rate": 2.508295172592632e-06, + "loss": 0.5094, + "step": 322330 + }, + { + "epoch": 2.849590692904754, + "grad_norm": 2.3600378036499023, + "learning_rate": 2.5068217849207614e-06, + "loss": 0.6348, + "step": 322340 + }, + { + "epoch": 2.8496790961650666, + "grad_norm": 13.093294143676758, + "learning_rate": 2.505348397248891e-06, + "loss": 0.4721, + "step": 322350 + }, + { + "epoch": 2.8497674994253788, + "grad_norm": 2.258080244064331, + "learning_rate": 2.50387500957702e-06, + "loss": 0.5703, + "step": 322360 + }, + { + "epoch": 2.849855902685691, + "grad_norm": 7.927944183349609, + "learning_rate": 2.5024016219051494e-06, + "loss": 0.5556, + "step": 322370 + }, + { + "epoch": 2.8499443059460035, + "grad_norm": 1.5520873069763184, + "learning_rate": 2.5009282342332786e-06, + "loss": 0.4576, + "step": 322380 + }, + { + "epoch": 2.8500327092063156, + "grad_norm": 1.0915415287017822, + "learning_rate": 2.4994548465614078e-06, + "loss": 0.4408, + "step": 322390 + }, + { + "epoch": 2.8501211124666277, + "grad_norm": 8.82461166381836, + "learning_rate": 2.4979814588895374e-06, + "loss": 0.4595, + "step": 322400 + }, + { + "epoch": 2.85020951572694, + "grad_norm": 6.1891913414001465, + "learning_rate": 2.4965080712176666e-06, + "loss": 0.5071, + "step": 322410 + }, + { + "epoch": 2.8502979189872524, + "grad_norm": 3.733389139175415, + "learning_rate": 2.495034683545796e-06, + "loss": 0.5015, + "step": 322420 + }, + { + "epoch": 2.8503863222475645, + "grad_norm": 4.169348239898682, + "learning_rate": 2.493561295873925e-06, + "loss": 0.5091, + "step": 322430 + }, + { + "epoch": 2.8504747255078766, + "grad_norm": 4.321398735046387, + "learning_rate": 2.4920879082020546e-06, + "loss": 0.4026, + "step": 322440 + }, + { + "epoch": 2.850563128768189, + "grad_norm": 4.430682182312012, + "learning_rate": 2.490614520530184e-06, + "loss": 0.4597, + "step": 322450 + }, + { + "epoch": 2.8506515320285013, + "grad_norm": 6.171957015991211, + "learning_rate": 2.4891411328583135e-06, + "loss": 0.501, + "step": 322460 + }, + { + "epoch": 2.8507399352888134, + "grad_norm": 3.4733426570892334, + "learning_rate": 2.4876677451864427e-06, + "loss": 0.4945, + "step": 322470 + }, + { + "epoch": 2.8508283385491255, + "grad_norm": 18.624425888061523, + "learning_rate": 2.486194357514572e-06, + "loss": 0.5786, + "step": 322480 + }, + { + "epoch": 2.8509167418094377, + "grad_norm": 8.537191390991211, + "learning_rate": 2.4847209698427015e-06, + "loss": 0.55, + "step": 322490 + }, + { + "epoch": 2.8510051450697502, + "grad_norm": 5.5934648513793945, + "learning_rate": 2.4832475821708307e-06, + "loss": 0.3846, + "step": 322500 + }, + { + "epoch": 2.8510935483300623, + "grad_norm": 3.4468371868133545, + "learning_rate": 2.48177419449896e-06, + "loss": 0.4818, + "step": 322510 + }, + { + "epoch": 2.851181951590375, + "grad_norm": 2.7148985862731934, + "learning_rate": 2.480300806827089e-06, + "loss": 0.5423, + "step": 322520 + }, + { + "epoch": 2.851270354850687, + "grad_norm": 30.219058990478516, + "learning_rate": 2.4788274191552183e-06, + "loss": 0.5469, + "step": 322530 + }, + { + "epoch": 2.851358758110999, + "grad_norm": 1.7040406465530396, + "learning_rate": 2.477354031483348e-06, + "loss": 0.4193, + "step": 322540 + }, + { + "epoch": 2.8514471613713113, + "grad_norm": 12.084625244140625, + "learning_rate": 2.475880643811477e-06, + "loss": 0.5527, + "step": 322550 + }, + { + "epoch": 2.8515355646316234, + "grad_norm": 1.476606845855713, + "learning_rate": 2.4744072561396063e-06, + "loss": 0.3814, + "step": 322560 + }, + { + "epoch": 2.851623967891936, + "grad_norm": 2.7390220165252686, + "learning_rate": 2.472933868467736e-06, + "loss": 0.5724, + "step": 322570 + }, + { + "epoch": 2.851712371152248, + "grad_norm": 1.79238760471344, + "learning_rate": 2.4714604807958656e-06, + "loss": 0.4944, + "step": 322580 + }, + { + "epoch": 2.85180077441256, + "grad_norm": 3.2560853958129883, + "learning_rate": 2.4699870931239948e-06, + "loss": 0.5798, + "step": 322590 + }, + { + "epoch": 2.8518891776728728, + "grad_norm": 11.03203296661377, + "learning_rate": 2.468513705452124e-06, + "loss": 0.4632, + "step": 322600 + }, + { + "epoch": 2.851977580933185, + "grad_norm": 1.5827000141143799, + "learning_rate": 2.467040317780253e-06, + "loss": 0.5058, + "step": 322610 + }, + { + "epoch": 2.852065984193497, + "grad_norm": 4.816565036773682, + "learning_rate": 2.4655669301083824e-06, + "loss": 0.4302, + "step": 322620 + }, + { + "epoch": 2.852154387453809, + "grad_norm": 3.1450741291046143, + "learning_rate": 2.464093542436512e-06, + "loss": 0.5138, + "step": 322630 + }, + { + "epoch": 2.8522427907141217, + "grad_norm": 1.99415922164917, + "learning_rate": 2.462620154764641e-06, + "loss": 0.6593, + "step": 322640 + }, + { + "epoch": 2.852331193974434, + "grad_norm": 8.680097579956055, + "learning_rate": 2.4611467670927704e-06, + "loss": 0.6346, + "step": 322650 + }, + { + "epoch": 2.852419597234746, + "grad_norm": 2.0591444969177246, + "learning_rate": 2.4596733794208996e-06, + "loss": 0.5309, + "step": 322660 + }, + { + "epoch": 2.8525080004950585, + "grad_norm": 8.292585372924805, + "learning_rate": 2.458199991749029e-06, + "loss": 0.7304, + "step": 322670 + }, + { + "epoch": 2.8525964037553706, + "grad_norm": 14.335709571838379, + "learning_rate": 2.4567266040771584e-06, + "loss": 0.3884, + "step": 322680 + }, + { + "epoch": 2.8526848070156827, + "grad_norm": 2.2650768756866455, + "learning_rate": 2.455253216405288e-06, + "loss": 0.7242, + "step": 322690 + }, + { + "epoch": 2.852773210275995, + "grad_norm": 1.4074316024780273, + "learning_rate": 2.4537798287334172e-06, + "loss": 0.5331, + "step": 322700 + }, + { + "epoch": 2.852861613536307, + "grad_norm": 2.3847389221191406, + "learning_rate": 2.4523064410615464e-06, + "loss": 0.5856, + "step": 322710 + }, + { + "epoch": 2.8529500167966195, + "grad_norm": 6.262715816497803, + "learning_rate": 2.450833053389676e-06, + "loss": 0.5791, + "step": 322720 + }, + { + "epoch": 2.8530384200569316, + "grad_norm": 8.615385055541992, + "learning_rate": 2.4493596657178053e-06, + "loss": 0.6071, + "step": 322730 + }, + { + "epoch": 2.853126823317244, + "grad_norm": 2.5600342750549316, + "learning_rate": 2.4478862780459345e-06, + "loss": 0.4915, + "step": 322740 + }, + { + "epoch": 2.8532152265775563, + "grad_norm": 2.530064344406128, + "learning_rate": 2.4464128903740637e-06, + "loss": 0.3988, + "step": 322750 + }, + { + "epoch": 2.8533036298378684, + "grad_norm": 14.401690483093262, + "learning_rate": 2.444939502702193e-06, + "loss": 0.5474, + "step": 322760 + }, + { + "epoch": 2.8533920330981806, + "grad_norm": 8.175921440124512, + "learning_rate": 2.4434661150303225e-06, + "loss": 0.5749, + "step": 322770 + }, + { + "epoch": 2.8534804363584927, + "grad_norm": 1.0237481594085693, + "learning_rate": 2.4419927273584517e-06, + "loss": 0.5161, + "step": 322780 + }, + { + "epoch": 2.8535688396188053, + "grad_norm": 2.112799882888794, + "learning_rate": 2.440519339686581e-06, + "loss": 0.5661, + "step": 322790 + }, + { + "epoch": 2.8536572428791174, + "grad_norm": 4.651673316955566, + "learning_rate": 2.4390459520147105e-06, + "loss": 0.5759, + "step": 322800 + }, + { + "epoch": 2.8537456461394295, + "grad_norm": 3.2302842140197754, + "learning_rate": 2.4375725643428397e-06, + "loss": 0.5312, + "step": 322810 + }, + { + "epoch": 2.853834049399742, + "grad_norm": 3.8983519077301025, + "learning_rate": 2.4360991766709694e-06, + "loss": 0.534, + "step": 322820 + }, + { + "epoch": 2.853922452660054, + "grad_norm": 2.9067816734313965, + "learning_rate": 2.4346257889990986e-06, + "loss": 0.5586, + "step": 322830 + }, + { + "epoch": 2.8540108559203663, + "grad_norm": 1.8135998249053955, + "learning_rate": 2.4331524013272278e-06, + "loss": 0.5189, + "step": 322840 + }, + { + "epoch": 2.8540992591806784, + "grad_norm": 4.360698699951172, + "learning_rate": 2.431679013655357e-06, + "loss": 0.4918, + "step": 322850 + }, + { + "epoch": 2.854187662440991, + "grad_norm": 3.8568782806396484, + "learning_rate": 2.4302056259834866e-06, + "loss": 0.4244, + "step": 322860 + }, + { + "epoch": 2.854276065701303, + "grad_norm": 4.379212379455566, + "learning_rate": 2.4287322383116158e-06, + "loss": 0.5816, + "step": 322870 + }, + { + "epoch": 2.854364468961615, + "grad_norm": 2.3963305950164795, + "learning_rate": 2.427258850639745e-06, + "loss": 0.4853, + "step": 322880 + }, + { + "epoch": 2.854452872221928, + "grad_norm": 1.2426772117614746, + "learning_rate": 2.425785462967874e-06, + "loss": 0.5137, + "step": 322890 + }, + { + "epoch": 2.85454127548224, + "grad_norm": 4.319767951965332, + "learning_rate": 2.4243120752960034e-06, + "loss": 0.5158, + "step": 322900 + }, + { + "epoch": 2.854629678742552, + "grad_norm": 2.128247022628784, + "learning_rate": 2.422838687624133e-06, + "loss": 0.5516, + "step": 322910 + }, + { + "epoch": 2.854718082002864, + "grad_norm": 2.520658016204834, + "learning_rate": 2.4213652999522626e-06, + "loss": 0.4515, + "step": 322920 + }, + { + "epoch": 2.8548064852631763, + "grad_norm": 9.739389419555664, + "learning_rate": 2.419891912280392e-06, + "loss": 0.6147, + "step": 322930 + }, + { + "epoch": 2.854894888523489, + "grad_norm": 4.729218006134033, + "learning_rate": 2.418418524608521e-06, + "loss": 0.4228, + "step": 322940 + }, + { + "epoch": 2.854983291783801, + "grad_norm": 2.9320287704467773, + "learning_rate": 2.4169451369366502e-06, + "loss": 0.4907, + "step": 322950 + }, + { + "epoch": 2.855071695044113, + "grad_norm": 2.672203540802002, + "learning_rate": 2.41547174926478e-06, + "loss": 0.4998, + "step": 322960 + }, + { + "epoch": 2.8551600983044256, + "grad_norm": 1.786895513534546, + "learning_rate": 2.413998361592909e-06, + "loss": 0.6193, + "step": 322970 + }, + { + "epoch": 2.8552485015647378, + "grad_norm": 5.49282169342041, + "learning_rate": 2.4125249739210383e-06, + "loss": 0.539, + "step": 322980 + }, + { + "epoch": 2.85533690482505, + "grad_norm": 4.714466094970703, + "learning_rate": 2.4110515862491675e-06, + "loss": 0.5073, + "step": 322990 + }, + { + "epoch": 2.855425308085362, + "grad_norm": 3.1261026859283447, + "learning_rate": 2.409578198577297e-06, + "loss": 0.4325, + "step": 323000 + }, + { + "epoch": 2.8555137113456746, + "grad_norm": 6.31460428237915, + "learning_rate": 2.4081048109054263e-06, + "loss": 0.4131, + "step": 323010 + }, + { + "epoch": 2.8556021146059867, + "grad_norm": 2.175708532333374, + "learning_rate": 2.4066314232335555e-06, + "loss": 0.6076, + "step": 323020 + }, + { + "epoch": 2.855690517866299, + "grad_norm": 5.574202537536621, + "learning_rate": 2.405158035561685e-06, + "loss": 0.4265, + "step": 323030 + }, + { + "epoch": 2.8557789211266114, + "grad_norm": 4.459157466888428, + "learning_rate": 2.4036846478898143e-06, + "loss": 0.4583, + "step": 323040 + }, + { + "epoch": 2.8558673243869235, + "grad_norm": 2.660440444946289, + "learning_rate": 2.402211260217944e-06, + "loss": 0.531, + "step": 323050 + }, + { + "epoch": 2.8559557276472356, + "grad_norm": 1.6954104900360107, + "learning_rate": 2.400737872546073e-06, + "loss": 0.739, + "step": 323060 + }, + { + "epoch": 2.8560441309075477, + "grad_norm": 2.671832799911499, + "learning_rate": 2.3992644848742023e-06, + "loss": 0.6002, + "step": 323070 + }, + { + "epoch": 2.85613253416786, + "grad_norm": 7.0304741859436035, + "learning_rate": 2.3977910972023315e-06, + "loss": 0.3974, + "step": 323080 + }, + { + "epoch": 2.8562209374281724, + "grad_norm": 5.060092449188232, + "learning_rate": 2.3963177095304607e-06, + "loss": 0.4934, + "step": 323090 + }, + { + "epoch": 2.8563093406884845, + "grad_norm": 2.5122907161712646, + "learning_rate": 2.3948443218585904e-06, + "loss": 0.4304, + "step": 323100 + }, + { + "epoch": 2.856397743948797, + "grad_norm": 14.072664260864258, + "learning_rate": 2.3933709341867196e-06, + "loss": 0.5287, + "step": 323110 + }, + { + "epoch": 2.856486147209109, + "grad_norm": 3.6246588230133057, + "learning_rate": 2.3918975465148488e-06, + "loss": 0.6781, + "step": 323120 + }, + { + "epoch": 2.8565745504694213, + "grad_norm": 2.3993287086486816, + "learning_rate": 2.390424158842978e-06, + "loss": 0.5944, + "step": 323130 + }, + { + "epoch": 2.8566629537297334, + "grad_norm": 1.4929784536361694, + "learning_rate": 2.3889507711711076e-06, + "loss": 0.4762, + "step": 323140 + }, + { + "epoch": 2.8567513569900456, + "grad_norm": 7.9040069580078125, + "learning_rate": 2.387477383499237e-06, + "loss": 0.4216, + "step": 323150 + }, + { + "epoch": 2.856839760250358, + "grad_norm": 20.463563919067383, + "learning_rate": 2.3860039958273664e-06, + "loss": 0.4858, + "step": 323160 + }, + { + "epoch": 2.8569281635106702, + "grad_norm": 5.8424506187438965, + "learning_rate": 2.3845306081554956e-06, + "loss": 0.5154, + "step": 323170 + }, + { + "epoch": 2.8570165667709824, + "grad_norm": 2.986959457397461, + "learning_rate": 2.383057220483625e-06, + "loss": 0.5379, + "step": 323180 + }, + { + "epoch": 2.857104970031295, + "grad_norm": 2.2924411296844482, + "learning_rate": 2.3815838328117544e-06, + "loss": 0.6251, + "step": 323190 + }, + { + "epoch": 2.857193373291607, + "grad_norm": 2.444554090499878, + "learning_rate": 2.3801104451398836e-06, + "loss": 0.4769, + "step": 323200 + }, + { + "epoch": 2.857281776551919, + "grad_norm": 3.6077592372894287, + "learning_rate": 2.378637057468013e-06, + "loss": 0.4589, + "step": 323210 + }, + { + "epoch": 2.8573701798122313, + "grad_norm": 4.74035120010376, + "learning_rate": 2.377163669796142e-06, + "loss": 0.4233, + "step": 323220 + }, + { + "epoch": 2.857458583072544, + "grad_norm": 1.7760213613510132, + "learning_rate": 2.3756902821242713e-06, + "loss": 0.5295, + "step": 323230 + }, + { + "epoch": 2.857546986332856, + "grad_norm": 7.020007610321045, + "learning_rate": 2.374216894452401e-06, + "loss": 0.48, + "step": 323240 + }, + { + "epoch": 2.857635389593168, + "grad_norm": 0.7351688742637634, + "learning_rate": 2.37274350678053e-06, + "loss": 0.3543, + "step": 323250 + }, + { + "epoch": 2.8577237928534807, + "grad_norm": 1.4446607828140259, + "learning_rate": 2.3712701191086593e-06, + "loss": 0.4145, + "step": 323260 + }, + { + "epoch": 2.857812196113793, + "grad_norm": 2.798844814300537, + "learning_rate": 2.369796731436789e-06, + "loss": 0.4741, + "step": 323270 + }, + { + "epoch": 2.857900599374105, + "grad_norm": 2.7984848022460938, + "learning_rate": 2.3683233437649185e-06, + "loss": 0.7469, + "step": 323280 + }, + { + "epoch": 2.857989002634417, + "grad_norm": 4.4381489753723145, + "learning_rate": 2.3668499560930477e-06, + "loss": 0.497, + "step": 323290 + }, + { + "epoch": 2.858077405894729, + "grad_norm": 1.6505285501480103, + "learning_rate": 2.365376568421177e-06, + "loss": 0.5013, + "step": 323300 + }, + { + "epoch": 2.8581658091550417, + "grad_norm": 27.01951789855957, + "learning_rate": 2.363903180749306e-06, + "loss": 0.4554, + "step": 323310 + }, + { + "epoch": 2.858254212415354, + "grad_norm": 2.2667481899261475, + "learning_rate": 2.3624297930774353e-06, + "loss": 0.5323, + "step": 323320 + }, + { + "epoch": 2.8583426156756664, + "grad_norm": 2.7288005352020264, + "learning_rate": 2.360956405405565e-06, + "loss": 0.4468, + "step": 323330 + }, + { + "epoch": 2.8584310189359785, + "grad_norm": 1.1488837003707886, + "learning_rate": 2.359483017733694e-06, + "loss": 0.4446, + "step": 323340 + }, + { + "epoch": 2.8585194221962906, + "grad_norm": 5.074249267578125, + "learning_rate": 2.3580096300618234e-06, + "loss": 0.5087, + "step": 323350 + }, + { + "epoch": 2.8586078254566027, + "grad_norm": 4.626770973205566, + "learning_rate": 2.3565362423899526e-06, + "loss": 0.5137, + "step": 323360 + }, + { + "epoch": 2.858696228716915, + "grad_norm": 1.7006064653396606, + "learning_rate": 2.3550628547180818e-06, + "loss": 0.4662, + "step": 323370 + }, + { + "epoch": 2.8587846319772274, + "grad_norm": 28.063636779785156, + "learning_rate": 2.3535894670462114e-06, + "loss": 0.5449, + "step": 323380 + }, + { + "epoch": 2.8588730352375396, + "grad_norm": 1.171941876411438, + "learning_rate": 2.352116079374341e-06, + "loss": 0.4747, + "step": 323390 + }, + { + "epoch": 2.8589614384978517, + "grad_norm": 22.89394760131836, + "learning_rate": 2.35064269170247e-06, + "loss": 0.6028, + "step": 323400 + }, + { + "epoch": 2.8590498417581642, + "grad_norm": 10.560433387756348, + "learning_rate": 2.3491693040305994e-06, + "loss": 0.5932, + "step": 323410 + }, + { + "epoch": 2.8591382450184764, + "grad_norm": 2.940001964569092, + "learning_rate": 2.3476959163587286e-06, + "loss": 0.3348, + "step": 323420 + }, + { + "epoch": 2.8592266482787885, + "grad_norm": 5.67319393157959, + "learning_rate": 2.3462225286868582e-06, + "loss": 0.552, + "step": 323430 + }, + { + "epoch": 2.8593150515391006, + "grad_norm": 2.4129743576049805, + "learning_rate": 2.3447491410149874e-06, + "loss": 0.5153, + "step": 323440 + }, + { + "epoch": 2.859403454799413, + "grad_norm": 3.3993313312530518, + "learning_rate": 2.3432757533431166e-06, + "loss": 0.3663, + "step": 323450 + }, + { + "epoch": 2.8594918580597253, + "grad_norm": 9.200560569763184, + "learning_rate": 2.341802365671246e-06, + "loss": 0.5172, + "step": 323460 + }, + { + "epoch": 2.8595802613200374, + "grad_norm": 12.493908882141113, + "learning_rate": 2.3403289779993755e-06, + "loss": 0.4707, + "step": 323470 + }, + { + "epoch": 2.85966866458035, + "grad_norm": 1.375604510307312, + "learning_rate": 2.3388555903275047e-06, + "loss": 0.5058, + "step": 323480 + }, + { + "epoch": 2.859757067840662, + "grad_norm": 3.882176160812378, + "learning_rate": 2.337382202655634e-06, + "loss": 0.7143, + "step": 323490 + }, + { + "epoch": 2.859845471100974, + "grad_norm": 8.690019607543945, + "learning_rate": 2.3359088149837635e-06, + "loss": 0.5628, + "step": 323500 + }, + { + "epoch": 2.8599338743612863, + "grad_norm": 12.735404014587402, + "learning_rate": 2.3344354273118927e-06, + "loss": 0.5061, + "step": 323510 + }, + { + "epoch": 2.8600222776215984, + "grad_norm": 2.6853861808776855, + "learning_rate": 2.3329620396400223e-06, + "loss": 0.4717, + "step": 323520 + }, + { + "epoch": 2.860110680881911, + "grad_norm": 0.7675589323043823, + "learning_rate": 2.3314886519681515e-06, + "loss": 0.3845, + "step": 323530 + }, + { + "epoch": 2.860199084142223, + "grad_norm": 7.7468438148498535, + "learning_rate": 2.3300152642962807e-06, + "loss": 0.5781, + "step": 323540 + }, + { + "epoch": 2.8602874874025352, + "grad_norm": 4.604738235473633, + "learning_rate": 2.32854187662441e-06, + "loss": 0.5122, + "step": 323550 + }, + { + "epoch": 2.860375890662848, + "grad_norm": 3.437903881072998, + "learning_rate": 2.327068488952539e-06, + "loss": 0.3983, + "step": 323560 + }, + { + "epoch": 2.86046429392316, + "grad_norm": 66.16860961914062, + "learning_rate": 2.3255951012806687e-06, + "loss": 0.5146, + "step": 323570 + }, + { + "epoch": 2.860552697183472, + "grad_norm": 2.7321293354034424, + "learning_rate": 2.324121713608798e-06, + "loss": 0.4903, + "step": 323580 + }, + { + "epoch": 2.860641100443784, + "grad_norm": 5.0876593589782715, + "learning_rate": 2.322648325936927e-06, + "loss": 0.4554, + "step": 323590 + }, + { + "epoch": 2.8607295037040967, + "grad_norm": 5.821722984313965, + "learning_rate": 2.3211749382650563e-06, + "loss": 0.6072, + "step": 323600 + }, + { + "epoch": 2.860817906964409, + "grad_norm": 6.856667518615723, + "learning_rate": 2.319701550593186e-06, + "loss": 0.5719, + "step": 323610 + }, + { + "epoch": 2.860906310224721, + "grad_norm": 2.7449638843536377, + "learning_rate": 2.3182281629213156e-06, + "loss": 0.4954, + "step": 323620 + }, + { + "epoch": 2.8609947134850335, + "grad_norm": 8.951770782470703, + "learning_rate": 2.316754775249445e-06, + "loss": 0.553, + "step": 323630 + }, + { + "epoch": 2.8610831167453457, + "grad_norm": 3.170900583267212, + "learning_rate": 2.315281387577574e-06, + "loss": 0.5533, + "step": 323640 + }, + { + "epoch": 2.8611715200056578, + "grad_norm": 0.9313485622406006, + "learning_rate": 2.313807999905703e-06, + "loss": 0.4024, + "step": 323650 + }, + { + "epoch": 2.86125992326597, + "grad_norm": 8.741159439086914, + "learning_rate": 2.312334612233833e-06, + "loss": 0.4343, + "step": 323660 + }, + { + "epoch": 2.861348326526282, + "grad_norm": 2.208693027496338, + "learning_rate": 2.310861224561962e-06, + "loss": 0.592, + "step": 323670 + }, + { + "epoch": 2.8614367297865946, + "grad_norm": 2.2736258506774902, + "learning_rate": 2.3093878368900912e-06, + "loss": 0.3901, + "step": 323680 + }, + { + "epoch": 2.8615251330469067, + "grad_norm": 3.253662109375, + "learning_rate": 2.3079144492182204e-06, + "loss": 0.6001, + "step": 323690 + }, + { + "epoch": 2.8616135363072193, + "grad_norm": 3.1734092235565186, + "learning_rate": 2.3064410615463496e-06, + "loss": 0.5049, + "step": 323700 + }, + { + "epoch": 2.8617019395675314, + "grad_norm": 4.44614839553833, + "learning_rate": 2.3049676738744793e-06, + "loss": 0.4654, + "step": 323710 + }, + { + "epoch": 2.8617903428278435, + "grad_norm": 5.254923343658447, + "learning_rate": 2.3034942862026085e-06, + "loss": 0.7, + "step": 323720 + }, + { + "epoch": 2.8618787460881556, + "grad_norm": 3.5484185218811035, + "learning_rate": 2.302020898530738e-06, + "loss": 0.5819, + "step": 323730 + }, + { + "epoch": 2.8619671493484677, + "grad_norm": 17.276716232299805, + "learning_rate": 2.3005475108588673e-06, + "loss": 0.5593, + "step": 323740 + }, + { + "epoch": 2.8620555526087803, + "grad_norm": 2.1375069618225098, + "learning_rate": 2.299074123186997e-06, + "loss": 0.485, + "step": 323750 + }, + { + "epoch": 2.8621439558690924, + "grad_norm": 2.970452308654785, + "learning_rate": 2.297600735515126e-06, + "loss": 0.4817, + "step": 323760 + }, + { + "epoch": 2.8622323591294045, + "grad_norm": 1.268919825553894, + "learning_rate": 2.2961273478432553e-06, + "loss": 0.5064, + "step": 323770 + }, + { + "epoch": 2.862320762389717, + "grad_norm": 15.980216979980469, + "learning_rate": 2.2946539601713845e-06, + "loss": 0.5247, + "step": 323780 + }, + { + "epoch": 2.8624091656500292, + "grad_norm": 3.908273458480835, + "learning_rate": 2.2931805724995137e-06, + "loss": 0.4499, + "step": 323790 + }, + { + "epoch": 2.8624975689103414, + "grad_norm": 8.635895729064941, + "learning_rate": 2.2917071848276433e-06, + "loss": 0.5475, + "step": 323800 + }, + { + "epoch": 2.8625859721706535, + "grad_norm": 4.5046515464782715, + "learning_rate": 2.2902337971557725e-06, + "loss": 0.5852, + "step": 323810 + }, + { + "epoch": 2.862674375430966, + "grad_norm": 4.917535781860352, + "learning_rate": 2.2887604094839017e-06, + "loss": 0.4536, + "step": 323820 + }, + { + "epoch": 2.862762778691278, + "grad_norm": 6.882230281829834, + "learning_rate": 2.287287021812031e-06, + "loss": 0.6047, + "step": 323830 + }, + { + "epoch": 2.8628511819515903, + "grad_norm": 4.0178632736206055, + "learning_rate": 2.2858136341401606e-06, + "loss": 0.5533, + "step": 323840 + }, + { + "epoch": 2.862939585211903, + "grad_norm": 1.7185235023498535, + "learning_rate": 2.2843402464682898e-06, + "loss": 0.5424, + "step": 323850 + }, + { + "epoch": 2.863027988472215, + "grad_norm": 9.293070793151855, + "learning_rate": 2.2828668587964194e-06, + "loss": 0.5344, + "step": 323860 + }, + { + "epoch": 2.863116391732527, + "grad_norm": 1.4359995126724243, + "learning_rate": 2.2813934711245486e-06, + "loss": 0.5135, + "step": 323870 + }, + { + "epoch": 2.863204794992839, + "grad_norm": 4.001980304718018, + "learning_rate": 2.2799200834526778e-06, + "loss": 0.5011, + "step": 323880 + }, + { + "epoch": 2.8632931982531513, + "grad_norm": 9.679963111877441, + "learning_rate": 2.2784466957808074e-06, + "loss": 0.6522, + "step": 323890 + }, + { + "epoch": 2.863381601513464, + "grad_norm": 3.0284554958343506, + "learning_rate": 2.2769733081089366e-06, + "loss": 0.6541, + "step": 323900 + }, + { + "epoch": 2.863470004773776, + "grad_norm": 3.833906412124634, + "learning_rate": 2.275499920437066e-06, + "loss": 0.5693, + "step": 323910 + }, + { + "epoch": 2.8635584080340886, + "grad_norm": 4.452915191650391, + "learning_rate": 2.274026532765195e-06, + "loss": 0.6131, + "step": 323920 + }, + { + "epoch": 2.8636468112944007, + "grad_norm": 4.430716514587402, + "learning_rate": 2.2725531450933242e-06, + "loss": 0.5577, + "step": 323930 + }, + { + "epoch": 2.863735214554713, + "grad_norm": 10.176749229431152, + "learning_rate": 2.271079757421454e-06, + "loss": 0.552, + "step": 323940 + }, + { + "epoch": 2.863823617815025, + "grad_norm": 12.94589900970459, + "learning_rate": 2.269606369749583e-06, + "loss": 0.5015, + "step": 323950 + }, + { + "epoch": 2.863912021075337, + "grad_norm": 2.531728506088257, + "learning_rate": 2.2681329820777122e-06, + "loss": 0.4954, + "step": 323960 + }, + { + "epoch": 2.8640004243356496, + "grad_norm": 9.018269538879395, + "learning_rate": 2.266659594405842e-06, + "loss": 0.4809, + "step": 323970 + }, + { + "epoch": 2.8640888275959617, + "grad_norm": 4.361026287078857, + "learning_rate": 2.265186206733971e-06, + "loss": 0.6561, + "step": 323980 + }, + { + "epoch": 2.864177230856274, + "grad_norm": 1.5763158798217773, + "learning_rate": 2.2637128190621007e-06, + "loss": 0.5293, + "step": 323990 + }, + { + "epoch": 2.8642656341165864, + "grad_norm": 4.505405426025391, + "learning_rate": 2.26223943139023e-06, + "loss": 0.5279, + "step": 324000 + }, + { + "epoch": 2.8643540373768985, + "grad_norm": 1.4393938779830933, + "learning_rate": 2.260766043718359e-06, + "loss": 0.5028, + "step": 324010 + }, + { + "epoch": 2.8644424406372107, + "grad_norm": 3.262199878692627, + "learning_rate": 2.2592926560464883e-06, + "loss": 0.4814, + "step": 324020 + }, + { + "epoch": 2.8645308438975228, + "grad_norm": 16.903759002685547, + "learning_rate": 2.257819268374618e-06, + "loss": 0.5765, + "step": 324030 + }, + { + "epoch": 2.8646192471578353, + "grad_norm": 3.6462056636810303, + "learning_rate": 2.256345880702747e-06, + "loss": 0.6697, + "step": 324040 + }, + { + "epoch": 2.8647076504181475, + "grad_norm": 5.030492782592773, + "learning_rate": 2.2548724930308763e-06, + "loss": 0.375, + "step": 324050 + }, + { + "epoch": 2.8647960536784596, + "grad_norm": 3.978754758834839, + "learning_rate": 2.2533991053590055e-06, + "loss": 0.4331, + "step": 324060 + }, + { + "epoch": 2.864884456938772, + "grad_norm": 1.1516093015670776, + "learning_rate": 2.251925717687135e-06, + "loss": 0.5011, + "step": 324070 + }, + { + "epoch": 2.8649728601990843, + "grad_norm": 2.7295987606048584, + "learning_rate": 2.2504523300152643e-06, + "loss": 0.5761, + "step": 324080 + }, + { + "epoch": 2.8650612634593964, + "grad_norm": 2.6356916427612305, + "learning_rate": 2.248978942343394e-06, + "loss": 0.4628, + "step": 324090 + }, + { + "epoch": 2.8651496667197085, + "grad_norm": 8.083111763000488, + "learning_rate": 2.247505554671523e-06, + "loss": 0.4444, + "step": 324100 + }, + { + "epoch": 2.8652380699800206, + "grad_norm": 1.539612054824829, + "learning_rate": 2.2460321669996524e-06, + "loss": 0.4325, + "step": 324110 + }, + { + "epoch": 2.865326473240333, + "grad_norm": 2.582695722579956, + "learning_rate": 2.2445587793277816e-06, + "loss": 0.5563, + "step": 324120 + }, + { + "epoch": 2.8654148765006453, + "grad_norm": 7.672632217407227, + "learning_rate": 2.243085391655911e-06, + "loss": 0.4607, + "step": 324130 + }, + { + "epoch": 2.8655032797609574, + "grad_norm": 1.8240798711776733, + "learning_rate": 2.2416120039840404e-06, + "loss": 0.5434, + "step": 324140 + }, + { + "epoch": 2.86559168302127, + "grad_norm": 6.052487850189209, + "learning_rate": 2.2401386163121696e-06, + "loss": 0.5976, + "step": 324150 + }, + { + "epoch": 2.865680086281582, + "grad_norm": 2.344109535217285, + "learning_rate": 2.238665228640299e-06, + "loss": 0.5408, + "step": 324160 + }, + { + "epoch": 2.8657684895418942, + "grad_norm": 2.6917836666107178, + "learning_rate": 2.2371918409684284e-06, + "loss": 0.5172, + "step": 324170 + }, + { + "epoch": 2.8658568928022063, + "grad_norm": 1.9922194480895996, + "learning_rate": 2.2357184532965576e-06, + "loss": 0.6053, + "step": 324180 + }, + { + "epoch": 2.865945296062519, + "grad_norm": 1.7533833980560303, + "learning_rate": 2.234245065624687e-06, + "loss": 0.494, + "step": 324190 + }, + { + "epoch": 2.866033699322831, + "grad_norm": 2.256603717803955, + "learning_rate": 2.2327716779528165e-06, + "loss": 0.3897, + "step": 324200 + }, + { + "epoch": 2.866122102583143, + "grad_norm": 3.6444382667541504, + "learning_rate": 2.2312982902809457e-06, + "loss": 0.5184, + "step": 324210 + }, + { + "epoch": 2.8662105058434557, + "grad_norm": 9.612709999084473, + "learning_rate": 2.2298249026090753e-06, + "loss": 0.478, + "step": 324220 + }, + { + "epoch": 2.866298909103768, + "grad_norm": 2.840296745300293, + "learning_rate": 2.2283515149372045e-06, + "loss": 0.4183, + "step": 324230 + }, + { + "epoch": 2.86638731236408, + "grad_norm": 1.9260231256484985, + "learning_rate": 2.2268781272653337e-06, + "loss": 0.4846, + "step": 324240 + }, + { + "epoch": 2.866475715624392, + "grad_norm": 2.42600154876709, + "learning_rate": 2.225404739593463e-06, + "loss": 0.5347, + "step": 324250 + }, + { + "epoch": 2.866564118884704, + "grad_norm": 3.75242280960083, + "learning_rate": 2.223931351921592e-06, + "loss": 0.48, + "step": 324260 + }, + { + "epoch": 2.8666525221450168, + "grad_norm": 4.124344348907471, + "learning_rate": 2.2224579642497217e-06, + "loss": 0.5832, + "step": 324270 + }, + { + "epoch": 2.866740925405329, + "grad_norm": 7.703774929046631, + "learning_rate": 2.220984576577851e-06, + "loss": 0.5121, + "step": 324280 + }, + { + "epoch": 2.8668293286656414, + "grad_norm": 3.8484106063842773, + "learning_rate": 2.21951118890598e-06, + "loss": 0.4422, + "step": 324290 + }, + { + "epoch": 2.8669177319259536, + "grad_norm": 4.39552116394043, + "learning_rate": 2.2180378012341093e-06, + "loss": 0.639, + "step": 324300 + }, + { + "epoch": 2.8670061351862657, + "grad_norm": 5.5721659660339355, + "learning_rate": 2.216564413562239e-06, + "loss": 0.5238, + "step": 324310 + }, + { + "epoch": 2.867094538446578, + "grad_norm": 5.615044116973877, + "learning_rate": 2.2150910258903686e-06, + "loss": 0.4479, + "step": 324320 + }, + { + "epoch": 2.86718294170689, + "grad_norm": 3.850879430770874, + "learning_rate": 2.2136176382184978e-06, + "loss": 0.4563, + "step": 324330 + }, + { + "epoch": 2.8672713449672025, + "grad_norm": 2.300100326538086, + "learning_rate": 2.212144250546627e-06, + "loss": 0.4812, + "step": 324340 + }, + { + "epoch": 2.8673597482275146, + "grad_norm": 1.526123046875, + "learning_rate": 2.210670862874756e-06, + "loss": 0.5662, + "step": 324350 + }, + { + "epoch": 2.8674481514878267, + "grad_norm": 1.6370068788528442, + "learning_rate": 2.2091974752028858e-06, + "loss": 0.3818, + "step": 324360 + }, + { + "epoch": 2.8675365547481393, + "grad_norm": 3.8691463470458984, + "learning_rate": 2.207724087531015e-06, + "loss": 0.5069, + "step": 324370 + }, + { + "epoch": 2.8676249580084514, + "grad_norm": 7.7925591468811035, + "learning_rate": 2.206250699859144e-06, + "loss": 0.5294, + "step": 324380 + }, + { + "epoch": 2.8677133612687635, + "grad_norm": 4.941892623901367, + "learning_rate": 2.2047773121872734e-06, + "loss": 0.5347, + "step": 324390 + }, + { + "epoch": 2.8678017645290756, + "grad_norm": 3.979600667953491, + "learning_rate": 2.2033039245154026e-06, + "loss": 0.5503, + "step": 324400 + }, + { + "epoch": 2.867890167789388, + "grad_norm": 10.902095794677734, + "learning_rate": 2.2018305368435322e-06, + "loss": 0.4616, + "step": 324410 + }, + { + "epoch": 2.8679785710497003, + "grad_norm": 2.0861146450042725, + "learning_rate": 2.2003571491716614e-06, + "loss": 0.4305, + "step": 324420 + }, + { + "epoch": 2.8680669743100125, + "grad_norm": 6.714892387390137, + "learning_rate": 2.198883761499791e-06, + "loss": 0.5142, + "step": 324430 + }, + { + "epoch": 2.868155377570325, + "grad_norm": 16.94078826904297, + "learning_rate": 2.1974103738279202e-06, + "loss": 0.5836, + "step": 324440 + }, + { + "epoch": 2.868243780830637, + "grad_norm": 2.0547664165496826, + "learning_rate": 2.19593698615605e-06, + "loss": 0.525, + "step": 324450 + }, + { + "epoch": 2.8683321840909493, + "grad_norm": 6.170354843139648, + "learning_rate": 2.194463598484179e-06, + "loss": 0.56, + "step": 324460 + }, + { + "epoch": 2.8684205873512614, + "grad_norm": 2.960392713546753, + "learning_rate": 2.1929902108123083e-06, + "loss": 0.5514, + "step": 324470 + }, + { + "epoch": 2.8685089906115735, + "grad_norm": 2.039905071258545, + "learning_rate": 2.1915168231404375e-06, + "loss": 0.5724, + "step": 324480 + }, + { + "epoch": 2.868597393871886, + "grad_norm": 1.9044936895370483, + "learning_rate": 2.1900434354685667e-06, + "loss": 0.4542, + "step": 324490 + }, + { + "epoch": 2.868685797132198, + "grad_norm": 7.488118648529053, + "learning_rate": 2.1885700477966963e-06, + "loss": 0.5409, + "step": 324500 + }, + { + "epoch": 2.8687742003925107, + "grad_norm": 6.114682197570801, + "learning_rate": 2.1870966601248255e-06, + "loss": 0.4995, + "step": 324510 + }, + { + "epoch": 2.868862603652823, + "grad_norm": 2.538630723953247, + "learning_rate": 2.1856232724529547e-06, + "loss": 0.4884, + "step": 324520 + }, + { + "epoch": 2.868951006913135, + "grad_norm": 5.3265156745910645, + "learning_rate": 2.184149884781084e-06, + "loss": 0.462, + "step": 324530 + }, + { + "epoch": 2.869039410173447, + "grad_norm": 1.1605141162872314, + "learning_rate": 2.1826764971092135e-06, + "loss": 0.6165, + "step": 324540 + }, + { + "epoch": 2.8691278134337592, + "grad_norm": 6.388484477996826, + "learning_rate": 2.1812031094373427e-06, + "loss": 0.5077, + "step": 324550 + }, + { + "epoch": 2.869216216694072, + "grad_norm": 8.77233600616455, + "learning_rate": 2.1797297217654723e-06, + "loss": 0.5389, + "step": 324560 + }, + { + "epoch": 2.869304619954384, + "grad_norm": 4.996603965759277, + "learning_rate": 2.1782563340936015e-06, + "loss": 0.6067, + "step": 324570 + }, + { + "epoch": 2.869393023214696, + "grad_norm": 0.934570848941803, + "learning_rate": 2.1767829464217307e-06, + "loss": 0.6321, + "step": 324580 + }, + { + "epoch": 2.8694814264750086, + "grad_norm": 3.881012201309204, + "learning_rate": 2.1753095587498604e-06, + "loss": 0.421, + "step": 324590 + }, + { + "epoch": 2.8695698297353207, + "grad_norm": 1.8305047750473022, + "learning_rate": 2.1738361710779896e-06, + "loss": 0.4669, + "step": 324600 + }, + { + "epoch": 2.869658232995633, + "grad_norm": 4.05060338973999, + "learning_rate": 2.1723627834061188e-06, + "loss": 0.5926, + "step": 324610 + }, + { + "epoch": 2.869746636255945, + "grad_norm": 5.0326995849609375, + "learning_rate": 2.170889395734248e-06, + "loss": 0.4314, + "step": 324620 + }, + { + "epoch": 2.8698350395162575, + "grad_norm": 4.150937080383301, + "learning_rate": 2.169416008062377e-06, + "loss": 0.585, + "step": 324630 + }, + { + "epoch": 2.8699234427765696, + "grad_norm": 4.166959285736084, + "learning_rate": 2.167942620390507e-06, + "loss": 0.7887, + "step": 324640 + }, + { + "epoch": 2.8700118460368818, + "grad_norm": 15.034402847290039, + "learning_rate": 2.166469232718636e-06, + "loss": 0.4814, + "step": 324650 + }, + { + "epoch": 2.8701002492971943, + "grad_norm": 2.674250602722168, + "learning_rate": 2.1649958450467656e-06, + "loss": 0.3559, + "step": 324660 + }, + { + "epoch": 2.8701886525575064, + "grad_norm": 3.957174777984619, + "learning_rate": 2.163522457374895e-06, + "loss": 0.5492, + "step": 324670 + }, + { + "epoch": 2.8702770558178186, + "grad_norm": 2.615548610687256, + "learning_rate": 2.162049069703024e-06, + "loss": 0.527, + "step": 324680 + }, + { + "epoch": 2.8703654590781307, + "grad_norm": 1.732437252998352, + "learning_rate": 2.1605756820311537e-06, + "loss": 0.5038, + "step": 324690 + }, + { + "epoch": 2.870453862338443, + "grad_norm": 1.9608999490737915, + "learning_rate": 2.159102294359283e-06, + "loss": 0.6001, + "step": 324700 + }, + { + "epoch": 2.8705422655987554, + "grad_norm": 4.679266929626465, + "learning_rate": 2.157628906687412e-06, + "loss": 0.5869, + "step": 324710 + }, + { + "epoch": 2.8706306688590675, + "grad_norm": 7.025559425354004, + "learning_rate": 2.1561555190155413e-06, + "loss": 0.5789, + "step": 324720 + }, + { + "epoch": 2.8707190721193796, + "grad_norm": 6.001691818237305, + "learning_rate": 2.154682131343671e-06, + "loss": 0.5765, + "step": 324730 + }, + { + "epoch": 2.870807475379692, + "grad_norm": 2.4095239639282227, + "learning_rate": 2.1532087436718e-06, + "loss": 0.5766, + "step": 324740 + }, + { + "epoch": 2.8708958786400043, + "grad_norm": 50.643314361572266, + "learning_rate": 2.1517353559999293e-06, + "loss": 0.5155, + "step": 324750 + }, + { + "epoch": 2.8709842819003164, + "grad_norm": 4.597681045532227, + "learning_rate": 2.1502619683280585e-06, + "loss": 0.5294, + "step": 324760 + }, + { + "epoch": 2.8710726851606285, + "grad_norm": 1.0004401206970215, + "learning_rate": 2.148788580656188e-06, + "loss": 0.3963, + "step": 324770 + }, + { + "epoch": 2.871161088420941, + "grad_norm": 1.0169686079025269, + "learning_rate": 2.1473151929843173e-06, + "loss": 0.4308, + "step": 324780 + }, + { + "epoch": 2.871249491681253, + "grad_norm": 1.1444339752197266, + "learning_rate": 2.145841805312447e-06, + "loss": 0.5, + "step": 324790 + }, + { + "epoch": 2.8713378949415653, + "grad_norm": 8.463526725769043, + "learning_rate": 2.144368417640576e-06, + "loss": 0.4614, + "step": 324800 + }, + { + "epoch": 2.871426298201878, + "grad_norm": 4.549572944641113, + "learning_rate": 2.1428950299687053e-06, + "loss": 0.534, + "step": 324810 + }, + { + "epoch": 2.87151470146219, + "grad_norm": 2.8273067474365234, + "learning_rate": 2.1414216422968345e-06, + "loss": 0.5521, + "step": 324820 + }, + { + "epoch": 2.871603104722502, + "grad_norm": 1.9880224466323853, + "learning_rate": 2.139948254624964e-06, + "loss": 0.5216, + "step": 324830 + }, + { + "epoch": 2.8716915079828143, + "grad_norm": 3.2347054481506348, + "learning_rate": 2.1384748669530934e-06, + "loss": 0.4905, + "step": 324840 + }, + { + "epoch": 2.8717799112431264, + "grad_norm": 13.598522186279297, + "learning_rate": 2.1370014792812226e-06, + "loss": 0.6321, + "step": 324850 + }, + { + "epoch": 2.871868314503439, + "grad_norm": 5.749077320098877, + "learning_rate": 2.1355280916093518e-06, + "loss": 0.5126, + "step": 324860 + }, + { + "epoch": 2.871956717763751, + "grad_norm": 3.0459580421447754, + "learning_rate": 2.1340547039374814e-06, + "loss": 0.5017, + "step": 324870 + }, + { + "epoch": 2.8720451210240636, + "grad_norm": 4.361216068267822, + "learning_rate": 2.1325813162656106e-06, + "loss": 0.4837, + "step": 324880 + }, + { + "epoch": 2.8721335242843757, + "grad_norm": 4.136706829071045, + "learning_rate": 2.13110792859374e-06, + "loss": 0.6039, + "step": 324890 + }, + { + "epoch": 2.872221927544688, + "grad_norm": 5.799802780151367, + "learning_rate": 2.1296345409218694e-06, + "loss": 0.6057, + "step": 324900 + }, + { + "epoch": 2.872310330805, + "grad_norm": 1.4000558853149414, + "learning_rate": 2.1281611532499986e-06, + "loss": 0.5053, + "step": 324910 + }, + { + "epoch": 2.872398734065312, + "grad_norm": 1.4195168018341064, + "learning_rate": 2.1266877655781282e-06, + "loss": 0.4718, + "step": 324920 + }, + { + "epoch": 2.8724871373256247, + "grad_norm": 2.933170795440674, + "learning_rate": 2.1252143779062574e-06, + "loss": 0.517, + "step": 324930 + }, + { + "epoch": 2.872575540585937, + "grad_norm": 2.8698065280914307, + "learning_rate": 2.1237409902343866e-06, + "loss": 0.6307, + "step": 324940 + }, + { + "epoch": 2.872663943846249, + "grad_norm": 10.875107765197754, + "learning_rate": 2.122267602562516e-06, + "loss": 0.5202, + "step": 324950 + }, + { + "epoch": 2.8727523471065615, + "grad_norm": 2.8895647525787354, + "learning_rate": 2.120794214890645e-06, + "loss": 0.608, + "step": 324960 + }, + { + "epoch": 2.8728407503668736, + "grad_norm": 2.8035802841186523, + "learning_rate": 2.1193208272187747e-06, + "loss": 0.4873, + "step": 324970 + }, + { + "epoch": 2.8729291536271857, + "grad_norm": 18.11378288269043, + "learning_rate": 2.117847439546904e-06, + "loss": 0.4714, + "step": 324980 + }, + { + "epoch": 2.873017556887498, + "grad_norm": 4.180607795715332, + "learning_rate": 2.116374051875033e-06, + "loss": 0.6314, + "step": 324990 + }, + { + "epoch": 2.8731059601478104, + "grad_norm": 1.967132568359375, + "learning_rate": 2.1149006642031623e-06, + "loss": 0.5255, + "step": 325000 + }, + { + "epoch": 2.8731943634081225, + "grad_norm": 1.7877442836761475, + "learning_rate": 2.113427276531292e-06, + "loss": 0.5983, + "step": 325010 + }, + { + "epoch": 2.8732827666684346, + "grad_norm": 4.623711109161377, + "learning_rate": 2.1119538888594215e-06, + "loss": 0.4387, + "step": 325020 + }, + { + "epoch": 2.873371169928747, + "grad_norm": 3.4951980113983154, + "learning_rate": 2.1104805011875507e-06, + "loss": 0.5757, + "step": 325030 + }, + { + "epoch": 2.8734595731890593, + "grad_norm": 5.037763595581055, + "learning_rate": 2.10900711351568e-06, + "loss": 0.4629, + "step": 325040 + }, + { + "epoch": 2.8735479764493714, + "grad_norm": 2.2107810974121094, + "learning_rate": 2.107533725843809e-06, + "loss": 0.4492, + "step": 325050 + }, + { + "epoch": 2.8736363797096836, + "grad_norm": 3.0558576583862305, + "learning_rate": 2.1060603381719388e-06, + "loss": 0.5141, + "step": 325060 + }, + { + "epoch": 2.8737247829699957, + "grad_norm": 4.005914211273193, + "learning_rate": 2.104586950500068e-06, + "loss": 0.4839, + "step": 325070 + }, + { + "epoch": 2.8738131862303082, + "grad_norm": 1.2621577978134155, + "learning_rate": 2.103113562828197e-06, + "loss": 0.5929, + "step": 325080 + }, + { + "epoch": 2.8739015894906204, + "grad_norm": 3.342411756515503, + "learning_rate": 2.1016401751563264e-06, + "loss": 0.5732, + "step": 325090 + }, + { + "epoch": 2.873989992750933, + "grad_norm": 6.214699745178223, + "learning_rate": 2.1001667874844556e-06, + "loss": 0.5034, + "step": 325100 + }, + { + "epoch": 2.874078396011245, + "grad_norm": 17.245046615600586, + "learning_rate": 2.098693399812585e-06, + "loss": 0.4865, + "step": 325110 + }, + { + "epoch": 2.874166799271557, + "grad_norm": 5.060104846954346, + "learning_rate": 2.0972200121407144e-06, + "loss": 0.554, + "step": 325120 + }, + { + "epoch": 2.8742552025318693, + "grad_norm": 3.815547227859497, + "learning_rate": 2.095746624468844e-06, + "loss": 0.6553, + "step": 325130 + }, + { + "epoch": 2.8743436057921814, + "grad_norm": 3.3565585613250732, + "learning_rate": 2.094273236796973e-06, + "loss": 0.5682, + "step": 325140 + }, + { + "epoch": 2.874432009052494, + "grad_norm": 8.988341331481934, + "learning_rate": 2.092799849125103e-06, + "loss": 0.456, + "step": 325150 + }, + { + "epoch": 2.874520412312806, + "grad_norm": 3.5963211059570312, + "learning_rate": 2.091326461453232e-06, + "loss": 0.4621, + "step": 325160 + }, + { + "epoch": 2.874608815573118, + "grad_norm": 6.121003150939941, + "learning_rate": 2.0898530737813612e-06, + "loss": 0.4859, + "step": 325170 + }, + { + "epoch": 2.8746972188334308, + "grad_norm": 6.0079755783081055, + "learning_rate": 2.0883796861094904e-06, + "loss": 0.5107, + "step": 325180 + }, + { + "epoch": 2.874785622093743, + "grad_norm": 9.351767539978027, + "learning_rate": 2.0869062984376196e-06, + "loss": 0.5115, + "step": 325190 + }, + { + "epoch": 2.874874025354055, + "grad_norm": 8.60627269744873, + "learning_rate": 2.0854329107657493e-06, + "loss": 0.5653, + "step": 325200 + }, + { + "epoch": 2.874962428614367, + "grad_norm": 10.776061058044434, + "learning_rate": 2.0839595230938785e-06, + "loss": 0.5098, + "step": 325210 + }, + { + "epoch": 2.8750508318746797, + "grad_norm": 4.348128318786621, + "learning_rate": 2.0824861354220077e-06, + "loss": 0.4806, + "step": 325220 + }, + { + "epoch": 2.875139235134992, + "grad_norm": 5.7360100746154785, + "learning_rate": 2.081012747750137e-06, + "loss": 0.4688, + "step": 325230 + }, + { + "epoch": 2.875227638395304, + "grad_norm": 5.262650966644287, + "learning_rate": 2.0795393600782665e-06, + "loss": 0.5151, + "step": 325240 + }, + { + "epoch": 2.8753160416556165, + "grad_norm": 2.6698975563049316, + "learning_rate": 2.078065972406396e-06, + "loss": 0.5367, + "step": 325250 + }, + { + "epoch": 2.8754044449159286, + "grad_norm": 5.914938926696777, + "learning_rate": 2.0765925847345253e-06, + "loss": 0.4804, + "step": 325260 + }, + { + "epoch": 2.8754928481762407, + "grad_norm": 1.1700711250305176, + "learning_rate": 2.0751191970626545e-06, + "loss": 0.5603, + "step": 325270 + }, + { + "epoch": 2.875581251436553, + "grad_norm": 2.2784769535064697, + "learning_rate": 2.0736458093907837e-06, + "loss": 0.5183, + "step": 325280 + }, + { + "epoch": 2.875669654696865, + "grad_norm": 4.057373046875, + "learning_rate": 2.0721724217189133e-06, + "loss": 0.4799, + "step": 325290 + }, + { + "epoch": 2.8757580579571775, + "grad_norm": 3.6027638912200928, + "learning_rate": 2.0706990340470425e-06, + "loss": 0.5642, + "step": 325300 + }, + { + "epoch": 2.8758464612174897, + "grad_norm": 8.45533561706543, + "learning_rate": 2.0692256463751717e-06, + "loss": 0.6227, + "step": 325310 + }, + { + "epoch": 2.8759348644778018, + "grad_norm": 4.914971351623535, + "learning_rate": 2.067752258703301e-06, + "loss": 0.515, + "step": 325320 + }, + { + "epoch": 2.8760232677381143, + "grad_norm": 4.519179344177246, + "learning_rate": 2.06627887103143e-06, + "loss": 0.4992, + "step": 325330 + }, + { + "epoch": 2.8761116709984265, + "grad_norm": 6.7234697341918945, + "learning_rate": 2.0648054833595598e-06, + "loss": 0.4785, + "step": 325340 + }, + { + "epoch": 2.8762000742587386, + "grad_norm": 3.4284188747406006, + "learning_rate": 2.063332095687689e-06, + "loss": 0.476, + "step": 325350 + }, + { + "epoch": 2.8762884775190507, + "grad_norm": 1.104881763458252, + "learning_rate": 2.0618587080158186e-06, + "loss": 0.4971, + "step": 325360 + }, + { + "epoch": 2.8763768807793633, + "grad_norm": 4.497557640075684, + "learning_rate": 2.060385320343948e-06, + "loss": 0.49, + "step": 325370 + }, + { + "epoch": 2.8764652840396754, + "grad_norm": 1.29360032081604, + "learning_rate": 2.058911932672077e-06, + "loss": 0.6064, + "step": 325380 + }, + { + "epoch": 2.8765536872999875, + "grad_norm": 3.116621971130371, + "learning_rate": 2.0574385450002066e-06, + "loss": 0.4596, + "step": 325390 + }, + { + "epoch": 2.8766420905603, + "grad_norm": 4.737308979034424, + "learning_rate": 2.055965157328336e-06, + "loss": 0.5176, + "step": 325400 + }, + { + "epoch": 2.876730493820612, + "grad_norm": 2.1892004013061523, + "learning_rate": 2.054491769656465e-06, + "loss": 0.537, + "step": 325410 + }, + { + "epoch": 2.8768188970809243, + "grad_norm": 2.0510261058807373, + "learning_rate": 2.0530183819845942e-06, + "loss": 0.5554, + "step": 325420 + }, + { + "epoch": 2.8769073003412364, + "grad_norm": 5.756936073303223, + "learning_rate": 2.0515449943127234e-06, + "loss": 0.5011, + "step": 325430 + }, + { + "epoch": 2.8769957036015485, + "grad_norm": 4.138476848602295, + "learning_rate": 2.050071606640853e-06, + "loss": 0.6605, + "step": 325440 + }, + { + "epoch": 2.877084106861861, + "grad_norm": 3.4855713844299316, + "learning_rate": 2.0485982189689822e-06, + "loss": 0.5244, + "step": 325450 + }, + { + "epoch": 2.8771725101221732, + "grad_norm": 1.4688725471496582, + "learning_rate": 2.0471248312971114e-06, + "loss": 0.5161, + "step": 325460 + }, + { + "epoch": 2.877260913382486, + "grad_norm": 3.5676357746124268, + "learning_rate": 2.045651443625241e-06, + "loss": 0.549, + "step": 325470 + }, + { + "epoch": 2.877349316642798, + "grad_norm": 4.219442367553711, + "learning_rate": 2.0441780559533703e-06, + "loss": 0.5647, + "step": 325480 + }, + { + "epoch": 2.87743771990311, + "grad_norm": 5.494997978210449, + "learning_rate": 2.0427046682815e-06, + "loss": 0.4117, + "step": 325490 + }, + { + "epoch": 2.877526123163422, + "grad_norm": 5.346526145935059, + "learning_rate": 2.041231280609629e-06, + "loss": 0.627, + "step": 325500 + }, + { + "epoch": 2.8776145264237343, + "grad_norm": 4.406563758850098, + "learning_rate": 2.0397578929377583e-06, + "loss": 0.4548, + "step": 325510 + }, + { + "epoch": 2.877702929684047, + "grad_norm": 1.4897617101669312, + "learning_rate": 2.0382845052658875e-06, + "loss": 0.498, + "step": 325520 + }, + { + "epoch": 2.877791332944359, + "grad_norm": 7.638703346252441, + "learning_rate": 2.036811117594017e-06, + "loss": 0.5912, + "step": 325530 + }, + { + "epoch": 2.877879736204671, + "grad_norm": 5.09198522567749, + "learning_rate": 2.0353377299221463e-06, + "loss": 0.6102, + "step": 325540 + }, + { + "epoch": 2.8779681394649836, + "grad_norm": 12.819684028625488, + "learning_rate": 2.0338643422502755e-06, + "loss": 0.5575, + "step": 325550 + }, + { + "epoch": 2.8780565427252958, + "grad_norm": 4.041370868682861, + "learning_rate": 2.0323909545784047e-06, + "loss": 0.5581, + "step": 325560 + }, + { + "epoch": 2.878144945985608, + "grad_norm": 1.7960469722747803, + "learning_rate": 2.030917566906534e-06, + "loss": 0.5503, + "step": 325570 + }, + { + "epoch": 2.87823334924592, + "grad_norm": 16.633859634399414, + "learning_rate": 2.0294441792346636e-06, + "loss": 0.5808, + "step": 325580 + }, + { + "epoch": 2.8783217525062326, + "grad_norm": 2.9126791954040527, + "learning_rate": 2.0279707915627928e-06, + "loss": 0.5384, + "step": 325590 + }, + { + "epoch": 2.8784101557665447, + "grad_norm": 5.147716045379639, + "learning_rate": 2.0264974038909224e-06, + "loss": 0.4283, + "step": 325600 + }, + { + "epoch": 2.878498559026857, + "grad_norm": 7.216915130615234, + "learning_rate": 2.0250240162190516e-06, + "loss": 0.6068, + "step": 325610 + }, + { + "epoch": 2.8785869622871694, + "grad_norm": 18.38793182373047, + "learning_rate": 2.023550628547181e-06, + "loss": 0.5861, + "step": 325620 + }, + { + "epoch": 2.8786753655474815, + "grad_norm": 4.967648506164551, + "learning_rate": 2.0220772408753104e-06, + "loss": 0.6212, + "step": 325630 + }, + { + "epoch": 2.8787637688077936, + "grad_norm": 5.622519493103027, + "learning_rate": 2.0206038532034396e-06, + "loss": 0.5034, + "step": 325640 + }, + { + "epoch": 2.8788521720681057, + "grad_norm": 3.0590147972106934, + "learning_rate": 2.019130465531569e-06, + "loss": 0.444, + "step": 325650 + }, + { + "epoch": 2.878940575328418, + "grad_norm": 2.4198293685913086, + "learning_rate": 2.017657077859698e-06, + "loss": 0.5079, + "step": 325660 + }, + { + "epoch": 2.8790289785887304, + "grad_norm": 0.7011827230453491, + "learning_rate": 2.0161836901878276e-06, + "loss": 0.47, + "step": 325670 + }, + { + "epoch": 2.8791173818490425, + "grad_norm": 7.16183614730835, + "learning_rate": 2.014710302515957e-06, + "loss": 0.5031, + "step": 325680 + }, + { + "epoch": 2.879205785109355, + "grad_norm": 3.3182597160339355, + "learning_rate": 2.013236914844086e-06, + "loss": 0.4451, + "step": 325690 + }, + { + "epoch": 2.879294188369667, + "grad_norm": 1.4101160764694214, + "learning_rate": 2.0117635271722152e-06, + "loss": 0.4777, + "step": 325700 + }, + { + "epoch": 2.8793825916299793, + "grad_norm": 5.474085807800293, + "learning_rate": 2.010290139500345e-06, + "loss": 0.5223, + "step": 325710 + }, + { + "epoch": 2.8794709948902915, + "grad_norm": 9.018508911132812, + "learning_rate": 2.0088167518284745e-06, + "loss": 0.5603, + "step": 325720 + }, + { + "epoch": 2.8795593981506036, + "grad_norm": 3.0293173789978027, + "learning_rate": 2.0073433641566037e-06, + "loss": 0.6247, + "step": 325730 + }, + { + "epoch": 2.879647801410916, + "grad_norm": 3.8511834144592285, + "learning_rate": 2.005869976484733e-06, + "loss": 0.482, + "step": 325740 + }, + { + "epoch": 2.8797362046712283, + "grad_norm": 2.922930955886841, + "learning_rate": 2.004396588812862e-06, + "loss": 0.6254, + "step": 325750 + }, + { + "epoch": 2.8798246079315404, + "grad_norm": 4.9177680015563965, + "learning_rate": 2.0029232011409917e-06, + "loss": 0.4571, + "step": 325760 + }, + { + "epoch": 2.879913011191853, + "grad_norm": 10.039872169494629, + "learning_rate": 2.001449813469121e-06, + "loss": 0.4997, + "step": 325770 + }, + { + "epoch": 2.880001414452165, + "grad_norm": 4.721561431884766, + "learning_rate": 1.99997642579725e-06, + "loss": 0.3759, + "step": 325780 + }, + { + "epoch": 2.880089817712477, + "grad_norm": 1.699264645576477, + "learning_rate": 1.9985030381253793e-06, + "loss": 0.5316, + "step": 325790 + }, + { + "epoch": 2.8801782209727893, + "grad_norm": 1.702093482017517, + "learning_rate": 1.9970296504535085e-06, + "loss": 0.5906, + "step": 325800 + }, + { + "epoch": 2.880266624233102, + "grad_norm": 4.433439254760742, + "learning_rate": 1.995556262781638e-06, + "loss": 0.4671, + "step": 325810 + }, + { + "epoch": 2.880355027493414, + "grad_norm": 3.0465846061706543, + "learning_rate": 1.9940828751097673e-06, + "loss": 0.4701, + "step": 325820 + }, + { + "epoch": 2.880443430753726, + "grad_norm": 2.0529685020446777, + "learning_rate": 1.992609487437897e-06, + "loss": 0.4436, + "step": 325830 + }, + { + "epoch": 2.8805318340140387, + "grad_norm": 5.597789287567139, + "learning_rate": 1.991136099766026e-06, + "loss": 0.5103, + "step": 325840 + }, + { + "epoch": 2.880620237274351, + "grad_norm": 18.998594284057617, + "learning_rate": 1.9896627120941554e-06, + "loss": 0.617, + "step": 325850 + }, + { + "epoch": 2.880708640534663, + "grad_norm": 6.156402111053467, + "learning_rate": 1.988189324422285e-06, + "loss": 0.5683, + "step": 325860 + }, + { + "epoch": 2.880797043794975, + "grad_norm": 3.1226389408111572, + "learning_rate": 1.986715936750414e-06, + "loss": 0.452, + "step": 325870 + }, + { + "epoch": 2.880885447055287, + "grad_norm": 2.986905574798584, + "learning_rate": 1.9852425490785434e-06, + "loss": 0.5398, + "step": 325880 + }, + { + "epoch": 2.8809738503155997, + "grad_norm": 2.011993169784546, + "learning_rate": 1.9837691614066726e-06, + "loss": 0.3376, + "step": 325890 + }, + { + "epoch": 2.881062253575912, + "grad_norm": 2.2322757244110107, + "learning_rate": 1.9822957737348022e-06, + "loss": 0.5326, + "step": 325900 + }, + { + "epoch": 2.881150656836224, + "grad_norm": 3.1550068855285645, + "learning_rate": 1.9808223860629314e-06, + "loss": 0.5195, + "step": 325910 + }, + { + "epoch": 2.8812390600965365, + "grad_norm": 4.038247108459473, + "learning_rate": 1.9793489983910606e-06, + "loss": 0.5763, + "step": 325920 + }, + { + "epoch": 2.8813274633568486, + "grad_norm": 3.6340200901031494, + "learning_rate": 1.97787561071919e-06, + "loss": 0.4801, + "step": 325930 + }, + { + "epoch": 2.8814158666171608, + "grad_norm": 4.134987831115723, + "learning_rate": 1.9764022230473194e-06, + "loss": 0.5634, + "step": 325940 + }, + { + "epoch": 2.881504269877473, + "grad_norm": 3.6943957805633545, + "learning_rate": 1.974928835375449e-06, + "loss": 0.4501, + "step": 325950 + }, + { + "epoch": 2.8815926731377854, + "grad_norm": 1.2163194417953491, + "learning_rate": 1.9734554477035783e-06, + "loss": 0.4823, + "step": 325960 + }, + { + "epoch": 2.8816810763980976, + "grad_norm": 12.499979019165039, + "learning_rate": 1.9719820600317075e-06, + "loss": 0.5657, + "step": 325970 + }, + { + "epoch": 2.8817694796584097, + "grad_norm": 3.762364625930786, + "learning_rate": 1.9705086723598367e-06, + "loss": 0.5919, + "step": 325980 + }, + { + "epoch": 2.8818578829187222, + "grad_norm": 1.4027457237243652, + "learning_rate": 1.969035284687966e-06, + "loss": 0.5436, + "step": 325990 + }, + { + "epoch": 2.8819462861790344, + "grad_norm": 3.1994612216949463, + "learning_rate": 1.9675618970160955e-06, + "loss": 0.5951, + "step": 326000 + }, + { + "epoch": 2.8820346894393465, + "grad_norm": 3.8155875205993652, + "learning_rate": 1.9660885093442247e-06, + "loss": 0.3651, + "step": 326010 + }, + { + "epoch": 2.8821230926996586, + "grad_norm": 4.392480373382568, + "learning_rate": 1.964615121672354e-06, + "loss": 0.4483, + "step": 326020 + }, + { + "epoch": 2.8822114959599707, + "grad_norm": 3.117737293243408, + "learning_rate": 1.963141734000483e-06, + "loss": 0.5179, + "step": 326030 + }, + { + "epoch": 2.8822998992202833, + "grad_norm": 3.031831979751587, + "learning_rate": 1.9616683463286127e-06, + "loss": 0.5715, + "step": 326040 + }, + { + "epoch": 2.8823883024805954, + "grad_norm": 3.6502068042755127, + "learning_rate": 1.960194958656742e-06, + "loss": 0.5167, + "step": 326050 + }, + { + "epoch": 2.882476705740908, + "grad_norm": 0.8420807123184204, + "learning_rate": 1.9587215709848716e-06, + "loss": 0.5735, + "step": 326060 + }, + { + "epoch": 2.88256510900122, + "grad_norm": 7.4903764724731445, + "learning_rate": 1.9572481833130008e-06, + "loss": 0.3984, + "step": 326070 + }, + { + "epoch": 2.882653512261532, + "grad_norm": 6.434947967529297, + "learning_rate": 1.95577479564113e-06, + "loss": 0.5167, + "step": 326080 + }, + { + "epoch": 2.8827419155218443, + "grad_norm": 13.111700057983398, + "learning_rate": 1.9543014079692596e-06, + "loss": 0.5241, + "step": 326090 + }, + { + "epoch": 2.8828303187821565, + "grad_norm": 0.9521735906600952, + "learning_rate": 1.9528280202973888e-06, + "loss": 0.4788, + "step": 326100 + }, + { + "epoch": 2.882918722042469, + "grad_norm": 3.518390655517578, + "learning_rate": 1.951354632625518e-06, + "loss": 0.4683, + "step": 326110 + }, + { + "epoch": 2.883007125302781, + "grad_norm": 13.301849365234375, + "learning_rate": 1.949881244953647e-06, + "loss": 0.5291, + "step": 326120 + }, + { + "epoch": 2.8830955285630933, + "grad_norm": 4.269556045532227, + "learning_rate": 1.9484078572817764e-06, + "loss": 0.4935, + "step": 326130 + }, + { + "epoch": 2.883183931823406, + "grad_norm": 3.0179312229156494, + "learning_rate": 1.946934469609906e-06, + "loss": 0.5331, + "step": 326140 + }, + { + "epoch": 2.883272335083718, + "grad_norm": 1.1473190784454346, + "learning_rate": 1.945461081938035e-06, + "loss": 0.5437, + "step": 326150 + }, + { + "epoch": 2.88336073834403, + "grad_norm": 13.503312110900879, + "learning_rate": 1.9439876942661644e-06, + "loss": 0.5276, + "step": 326160 + }, + { + "epoch": 2.883449141604342, + "grad_norm": 5.612461090087891, + "learning_rate": 1.942514306594294e-06, + "loss": 0.5175, + "step": 326170 + }, + { + "epoch": 2.8835375448646547, + "grad_norm": 3.725609540939331, + "learning_rate": 1.9410409189224232e-06, + "loss": 0.5273, + "step": 326180 + }, + { + "epoch": 2.883625948124967, + "grad_norm": 4.659853458404541, + "learning_rate": 1.939567531250553e-06, + "loss": 0.5551, + "step": 326190 + }, + { + "epoch": 2.883714351385279, + "grad_norm": 14.260411262512207, + "learning_rate": 1.938094143578682e-06, + "loss": 0.5051, + "step": 326200 + }, + { + "epoch": 2.8838027546455915, + "grad_norm": 3.8721582889556885, + "learning_rate": 1.9366207559068113e-06, + "loss": 0.605, + "step": 326210 + }, + { + "epoch": 2.8838911579059037, + "grad_norm": 3.3679375648498535, + "learning_rate": 1.9351473682349405e-06, + "loss": 0.4782, + "step": 326220 + }, + { + "epoch": 2.883979561166216, + "grad_norm": 2.7024624347686768, + "learning_rate": 1.93367398056307e-06, + "loss": 0.3709, + "step": 326230 + }, + { + "epoch": 2.884067964426528, + "grad_norm": 1.5660183429718018, + "learning_rate": 1.9322005928911993e-06, + "loss": 0.4385, + "step": 326240 + }, + { + "epoch": 2.88415636768684, + "grad_norm": 3.0246965885162354, + "learning_rate": 1.9307272052193285e-06, + "loss": 0.5052, + "step": 326250 + }, + { + "epoch": 2.8842447709471526, + "grad_norm": 2.546403646469116, + "learning_rate": 1.9292538175474577e-06, + "loss": 0.5194, + "step": 326260 + }, + { + "epoch": 2.8843331742074647, + "grad_norm": 1.8655433654785156, + "learning_rate": 1.927780429875587e-06, + "loss": 0.5067, + "step": 326270 + }, + { + "epoch": 2.8844215774677773, + "grad_norm": 3.6738781929016113, + "learning_rate": 1.9263070422037165e-06, + "loss": 0.5512, + "step": 326280 + }, + { + "epoch": 2.8845099807280894, + "grad_norm": 4.471560955047607, + "learning_rate": 1.9248336545318457e-06, + "loss": 0.412, + "step": 326290 + }, + { + "epoch": 2.8845983839884015, + "grad_norm": 11.08122444152832, + "learning_rate": 1.9233602668599753e-06, + "loss": 0.5807, + "step": 326300 + }, + { + "epoch": 2.8846867872487136, + "grad_norm": 5.0352630615234375, + "learning_rate": 1.9218868791881045e-06, + "loss": 0.4421, + "step": 326310 + }, + { + "epoch": 2.8847751905090258, + "grad_norm": 4.222518444061279, + "learning_rate": 1.920413491516234e-06, + "loss": 0.5611, + "step": 326320 + }, + { + "epoch": 2.8848635937693383, + "grad_norm": 3.3651766777038574, + "learning_rate": 1.9189401038443634e-06, + "loss": 0.5521, + "step": 326330 + }, + { + "epoch": 2.8849519970296504, + "grad_norm": 2.384138584136963, + "learning_rate": 1.9174667161724926e-06, + "loss": 0.3504, + "step": 326340 + }, + { + "epoch": 2.8850404002899626, + "grad_norm": 13.900847434997559, + "learning_rate": 1.9159933285006218e-06, + "loss": 0.459, + "step": 326350 + }, + { + "epoch": 2.885128803550275, + "grad_norm": 17.737688064575195, + "learning_rate": 1.914519940828751e-06, + "loss": 0.5453, + "step": 326360 + }, + { + "epoch": 2.8852172068105872, + "grad_norm": 6.348208904266357, + "learning_rate": 1.9130465531568806e-06, + "loss": 0.492, + "step": 326370 + }, + { + "epoch": 2.8853056100708994, + "grad_norm": 5.502959251403809, + "learning_rate": 1.91157316548501e-06, + "loss": 0.63, + "step": 326380 + }, + { + "epoch": 2.8853940133312115, + "grad_norm": 3.673905849456787, + "learning_rate": 1.910099777813139e-06, + "loss": 0.5573, + "step": 326390 + }, + { + "epoch": 2.885482416591524, + "grad_norm": 1.4347622394561768, + "learning_rate": 1.9086263901412686e-06, + "loss": 0.4723, + "step": 326400 + }, + { + "epoch": 2.885570819851836, + "grad_norm": 4.196609973907471, + "learning_rate": 1.9071530024693976e-06, + "loss": 0.4752, + "step": 326410 + }, + { + "epoch": 2.8856592231121483, + "grad_norm": 3.4029579162597656, + "learning_rate": 1.9056796147975272e-06, + "loss": 0.6612, + "step": 326420 + }, + { + "epoch": 2.885747626372461, + "grad_norm": 10.368614196777344, + "learning_rate": 1.9042062271256564e-06, + "loss": 0.4703, + "step": 326430 + }, + { + "epoch": 2.885836029632773, + "grad_norm": 8.225127220153809, + "learning_rate": 1.9027328394537859e-06, + "loss": 0.5685, + "step": 326440 + }, + { + "epoch": 2.885924432893085, + "grad_norm": 5.530344009399414, + "learning_rate": 1.901259451781915e-06, + "loss": 0.4353, + "step": 326450 + }, + { + "epoch": 2.886012836153397, + "grad_norm": 1.2510401010513306, + "learning_rate": 1.8997860641100447e-06, + "loss": 0.477, + "step": 326460 + }, + { + "epoch": 2.8861012394137093, + "grad_norm": 1.6959455013275146, + "learning_rate": 1.8983126764381739e-06, + "loss": 0.5747, + "step": 326470 + }, + { + "epoch": 2.886189642674022, + "grad_norm": 5.026007175445557, + "learning_rate": 1.896839288766303e-06, + "loss": 0.5465, + "step": 326480 + }, + { + "epoch": 2.886278045934334, + "grad_norm": 1.6371581554412842, + "learning_rate": 1.8953659010944325e-06, + "loss": 0.4411, + "step": 326490 + }, + { + "epoch": 2.886366449194646, + "grad_norm": 1.2744925022125244, + "learning_rate": 1.8938925134225617e-06, + "loss": 0.4174, + "step": 326500 + }, + { + "epoch": 2.8864548524549587, + "grad_norm": 11.495088577270508, + "learning_rate": 1.8924191257506913e-06, + "loss": 0.5166, + "step": 326510 + }, + { + "epoch": 2.886543255715271, + "grad_norm": 5.464008331298828, + "learning_rate": 1.8909457380788205e-06, + "loss": 0.5084, + "step": 326520 + }, + { + "epoch": 2.886631658975583, + "grad_norm": 3.8465676307678223, + "learning_rate": 1.8894723504069497e-06, + "loss": 0.5868, + "step": 326530 + }, + { + "epoch": 2.886720062235895, + "grad_norm": 1.9184901714324951, + "learning_rate": 1.887998962735079e-06, + "loss": 0.4717, + "step": 326540 + }, + { + "epoch": 2.8868084654962076, + "grad_norm": 4.060492992401123, + "learning_rate": 1.8865255750632083e-06, + "loss": 0.575, + "step": 326550 + }, + { + "epoch": 2.8868968687565197, + "grad_norm": 1.3604817390441895, + "learning_rate": 1.885052187391338e-06, + "loss": 0.6729, + "step": 326560 + }, + { + "epoch": 2.886985272016832, + "grad_norm": 1.531100869178772, + "learning_rate": 1.8835787997194672e-06, + "loss": 0.675, + "step": 326570 + }, + { + "epoch": 2.8870736752771444, + "grad_norm": 4.726518154144287, + "learning_rate": 1.8821054120475964e-06, + "loss": 0.6205, + "step": 326580 + }, + { + "epoch": 2.8871620785374565, + "grad_norm": 1.761904001235962, + "learning_rate": 1.8806320243757256e-06, + "loss": 0.6084, + "step": 326590 + }, + { + "epoch": 2.8872504817977687, + "grad_norm": 4.660722732543945, + "learning_rate": 1.8791586367038552e-06, + "loss": 0.5645, + "step": 326600 + }, + { + "epoch": 2.887338885058081, + "grad_norm": 2.2645962238311768, + "learning_rate": 1.8776852490319844e-06, + "loss": 0.3711, + "step": 326610 + }, + { + "epoch": 2.8874272883183933, + "grad_norm": 2.1565322875976562, + "learning_rate": 1.8762118613601138e-06, + "loss": 0.5057, + "step": 326620 + }, + { + "epoch": 2.8875156915787055, + "grad_norm": 27.963703155517578, + "learning_rate": 1.874738473688243e-06, + "loss": 0.5189, + "step": 326630 + }, + { + "epoch": 2.8876040948390176, + "grad_norm": 3.970757007598877, + "learning_rate": 1.8732650860163722e-06, + "loss": 0.624, + "step": 326640 + }, + { + "epoch": 2.88769249809933, + "grad_norm": 12.724099159240723, + "learning_rate": 1.8717916983445018e-06, + "loss": 0.5794, + "step": 326650 + }, + { + "epoch": 2.8877809013596423, + "grad_norm": 11.973380088806152, + "learning_rate": 1.870318310672631e-06, + "loss": 0.5238, + "step": 326660 + }, + { + "epoch": 2.8878693046199544, + "grad_norm": 2.8171095848083496, + "learning_rate": 1.8688449230007604e-06, + "loss": 0.4868, + "step": 326670 + }, + { + "epoch": 2.8879577078802665, + "grad_norm": 1.1324682235717773, + "learning_rate": 1.8673715353288896e-06, + "loss": 0.588, + "step": 326680 + }, + { + "epoch": 2.8880461111405786, + "grad_norm": 1.1891402006149292, + "learning_rate": 1.8658981476570188e-06, + "loss": 0.51, + "step": 326690 + }, + { + "epoch": 2.888134514400891, + "grad_norm": 7.868343830108643, + "learning_rate": 1.8644247599851485e-06, + "loss": 0.4833, + "step": 326700 + }, + { + "epoch": 2.8882229176612033, + "grad_norm": 9.02088737487793, + "learning_rate": 1.8629513723132777e-06, + "loss": 0.4671, + "step": 326710 + }, + { + "epoch": 2.8883113209215154, + "grad_norm": 1.7964091300964355, + "learning_rate": 1.8614779846414069e-06, + "loss": 0.4607, + "step": 326720 + }, + { + "epoch": 2.888399724181828, + "grad_norm": 1.771501898765564, + "learning_rate": 1.8600045969695363e-06, + "loss": 0.3988, + "step": 326730 + }, + { + "epoch": 2.88848812744214, + "grad_norm": 3.897977828979492, + "learning_rate": 1.858531209297666e-06, + "loss": 0.4744, + "step": 326740 + }, + { + "epoch": 2.8885765307024522, + "grad_norm": 4.06790828704834, + "learning_rate": 1.857057821625795e-06, + "loss": 0.5564, + "step": 326750 + }, + { + "epoch": 2.8886649339627644, + "grad_norm": 2.0174813270568848, + "learning_rate": 1.8555844339539243e-06, + "loss": 0.4589, + "step": 326760 + }, + { + "epoch": 2.888753337223077, + "grad_norm": 2.544478416442871, + "learning_rate": 1.8541110462820535e-06, + "loss": 0.6166, + "step": 326770 + }, + { + "epoch": 2.888841740483389, + "grad_norm": 1.5492795705795288, + "learning_rate": 1.852637658610183e-06, + "loss": 0.4798, + "step": 326780 + }, + { + "epoch": 2.888930143743701, + "grad_norm": 3.2333710193634033, + "learning_rate": 1.8511642709383123e-06, + "loss": 0.5058, + "step": 326790 + }, + { + "epoch": 2.8890185470040137, + "grad_norm": 2.4803762435913086, + "learning_rate": 1.8496908832664417e-06, + "loss": 0.5036, + "step": 326800 + }, + { + "epoch": 2.889106950264326, + "grad_norm": 4.024792194366455, + "learning_rate": 1.848217495594571e-06, + "loss": 0.4837, + "step": 326810 + }, + { + "epoch": 2.889195353524638, + "grad_norm": 4.84702205657959, + "learning_rate": 1.8467441079227001e-06, + "loss": 0.5737, + "step": 326820 + }, + { + "epoch": 2.88928375678495, + "grad_norm": 4.631750583648682, + "learning_rate": 1.8452707202508293e-06, + "loss": 0.5755, + "step": 326830 + }, + { + "epoch": 2.889372160045262, + "grad_norm": 2.9972000122070312, + "learning_rate": 1.843797332578959e-06, + "loss": 0.4552, + "step": 326840 + }, + { + "epoch": 2.8894605633055748, + "grad_norm": 2.997804880142212, + "learning_rate": 1.8423239449070884e-06, + "loss": 0.5334, + "step": 326850 + }, + { + "epoch": 2.889548966565887, + "grad_norm": 5.024291515350342, + "learning_rate": 1.8408505572352176e-06, + "loss": 0.5378, + "step": 326860 + }, + { + "epoch": 2.8896373698261995, + "grad_norm": 6.3948845863342285, + "learning_rate": 1.8393771695633468e-06, + "loss": 0.5573, + "step": 326870 + }, + { + "epoch": 2.8897257730865116, + "grad_norm": 0.70591139793396, + "learning_rate": 1.8379037818914764e-06, + "loss": 0.5385, + "step": 326880 + }, + { + "epoch": 2.8898141763468237, + "grad_norm": 2.17728590965271, + "learning_rate": 1.8364303942196056e-06, + "loss": 0.4925, + "step": 326890 + }, + { + "epoch": 2.889902579607136, + "grad_norm": 0.9488351941108704, + "learning_rate": 1.8349570065477348e-06, + "loss": 0.5436, + "step": 326900 + }, + { + "epoch": 2.889990982867448, + "grad_norm": 13.97898006439209, + "learning_rate": 1.8334836188758642e-06, + "loss": 0.5161, + "step": 326910 + }, + { + "epoch": 2.8900793861277605, + "grad_norm": 1.551556944847107, + "learning_rate": 1.8320102312039934e-06, + "loss": 0.5302, + "step": 326920 + }, + { + "epoch": 2.8901677893880726, + "grad_norm": 4.258837699890137, + "learning_rate": 1.830536843532123e-06, + "loss": 0.4768, + "step": 326930 + }, + { + "epoch": 2.8902561926483847, + "grad_norm": 4.792545318603516, + "learning_rate": 1.8290634558602523e-06, + "loss": 0.3874, + "step": 326940 + }, + { + "epoch": 2.8903445959086973, + "grad_norm": 4.879011631011963, + "learning_rate": 1.8275900681883815e-06, + "loss": 0.5697, + "step": 326950 + }, + { + "epoch": 2.8904329991690094, + "grad_norm": 8.222295761108398, + "learning_rate": 1.8261166805165109e-06, + "loss": 0.6909, + "step": 326960 + }, + { + "epoch": 2.8905214024293215, + "grad_norm": 3.4733941555023193, + "learning_rate": 1.82464329284464e-06, + "loss": 0.5729, + "step": 326970 + }, + { + "epoch": 2.8906098056896337, + "grad_norm": 6.426056861877441, + "learning_rate": 1.8231699051727697e-06, + "loss": 0.6632, + "step": 326980 + }, + { + "epoch": 2.8906982089499462, + "grad_norm": 5.255385398864746, + "learning_rate": 1.821696517500899e-06, + "loss": 0.6441, + "step": 326990 + }, + { + "epoch": 2.8907866122102583, + "grad_norm": 15.399577140808105, + "learning_rate": 1.820223129829028e-06, + "loss": 0.4994, + "step": 327000 + }, + { + "epoch": 2.8908750154705705, + "grad_norm": 1.775965929031372, + "learning_rate": 1.8187497421571575e-06, + "loss": 0.5454, + "step": 327010 + }, + { + "epoch": 2.890963418730883, + "grad_norm": 2.4255259037017822, + "learning_rate": 1.817276354485287e-06, + "loss": 0.4865, + "step": 327020 + }, + { + "epoch": 2.891051821991195, + "grad_norm": 6.713985443115234, + "learning_rate": 1.8158029668134163e-06, + "loss": 0.4309, + "step": 327030 + }, + { + "epoch": 2.8911402252515073, + "grad_norm": 5.003079414367676, + "learning_rate": 1.8143295791415455e-06, + "loss": 0.5354, + "step": 327040 + }, + { + "epoch": 2.8912286285118194, + "grad_norm": 3.119572877883911, + "learning_rate": 1.8128561914696747e-06, + "loss": 0.455, + "step": 327050 + }, + { + "epoch": 2.8913170317721315, + "grad_norm": 2.5747716426849365, + "learning_rate": 1.811382803797804e-06, + "loss": 0.4182, + "step": 327060 + }, + { + "epoch": 2.891405435032444, + "grad_norm": 5.881381034851074, + "learning_rate": 1.8099094161259336e-06, + "loss": 0.5555, + "step": 327070 + }, + { + "epoch": 2.891493838292756, + "grad_norm": 2.2186014652252197, + "learning_rate": 1.808436028454063e-06, + "loss": 0.4829, + "step": 327080 + }, + { + "epoch": 2.8915822415530683, + "grad_norm": 4.007602691650391, + "learning_rate": 1.8069626407821922e-06, + "loss": 0.4563, + "step": 327090 + }, + { + "epoch": 2.891670644813381, + "grad_norm": 2.61755633354187, + "learning_rate": 1.8054892531103214e-06, + "loss": 0.7197, + "step": 327100 + }, + { + "epoch": 2.891759048073693, + "grad_norm": 5.7784905433654785, + "learning_rate": 1.8040158654384506e-06, + "loss": 0.5736, + "step": 327110 + }, + { + "epoch": 2.891847451334005, + "grad_norm": 10.5038423538208, + "learning_rate": 1.8025424777665802e-06, + "loss": 0.5214, + "step": 327120 + }, + { + "epoch": 2.8919358545943172, + "grad_norm": 3.6286122798919678, + "learning_rate": 1.8010690900947094e-06, + "loss": 0.5243, + "step": 327130 + }, + { + "epoch": 2.89202425785463, + "grad_norm": 3.31280779838562, + "learning_rate": 1.7995957024228388e-06, + "loss": 0.5152, + "step": 327140 + }, + { + "epoch": 2.892112661114942, + "grad_norm": 5.9687018394470215, + "learning_rate": 1.798122314750968e-06, + "loss": 0.499, + "step": 327150 + }, + { + "epoch": 2.892201064375254, + "grad_norm": 5.419929027557373, + "learning_rate": 1.7966489270790976e-06, + "loss": 0.5461, + "step": 327160 + }, + { + "epoch": 2.8922894676355666, + "grad_norm": 2.1541998386383057, + "learning_rate": 1.7951755394072268e-06, + "loss": 0.4382, + "step": 327170 + }, + { + "epoch": 2.8923778708958787, + "grad_norm": 5.564290523529053, + "learning_rate": 1.793702151735356e-06, + "loss": 0.5772, + "step": 327180 + }, + { + "epoch": 2.892466274156191, + "grad_norm": 7.288029670715332, + "learning_rate": 1.7922287640634855e-06, + "loss": 0.5587, + "step": 327190 + }, + { + "epoch": 2.892554677416503, + "grad_norm": 4.57809591293335, + "learning_rate": 1.7907553763916147e-06, + "loss": 0.4818, + "step": 327200 + }, + { + "epoch": 2.8926430806768155, + "grad_norm": 4.51658296585083, + "learning_rate": 1.7892819887197443e-06, + "loss": 0.4043, + "step": 327210 + }, + { + "epoch": 2.8927314839371276, + "grad_norm": 7.108700275421143, + "learning_rate": 1.7878086010478735e-06, + "loss": 0.581, + "step": 327220 + }, + { + "epoch": 2.8928198871974398, + "grad_norm": 1.2347233295440674, + "learning_rate": 1.7863352133760027e-06, + "loss": 0.3676, + "step": 327230 + }, + { + "epoch": 2.8929082904577523, + "grad_norm": 4.430157661437988, + "learning_rate": 1.7848618257041319e-06, + "loss": 0.6316, + "step": 327240 + }, + { + "epoch": 2.8929966937180644, + "grad_norm": 2.4726922512054443, + "learning_rate": 1.7833884380322613e-06, + "loss": 0.4947, + "step": 327250 + }, + { + "epoch": 2.8930850969783766, + "grad_norm": 3.0270233154296875, + "learning_rate": 1.781915050360391e-06, + "loss": 0.5593, + "step": 327260 + }, + { + "epoch": 2.8931735002386887, + "grad_norm": 1.3214995861053467, + "learning_rate": 1.7804416626885201e-06, + "loss": 0.5495, + "step": 327270 + }, + { + "epoch": 2.893261903499001, + "grad_norm": 3.1718785762786865, + "learning_rate": 1.7789682750166493e-06, + "loss": 0.6132, + "step": 327280 + }, + { + "epoch": 2.8933503067593134, + "grad_norm": 4.292414665222168, + "learning_rate": 1.7774948873447785e-06, + "loss": 0.4385, + "step": 327290 + }, + { + "epoch": 2.8934387100196255, + "grad_norm": 5.657448768615723, + "learning_rate": 1.7760214996729081e-06, + "loss": 0.6237, + "step": 327300 + }, + { + "epoch": 2.8935271132799376, + "grad_norm": 12.107390403747559, + "learning_rate": 1.7745481120010373e-06, + "loss": 0.4331, + "step": 327310 + }, + { + "epoch": 2.89361551654025, + "grad_norm": 1.4685616493225098, + "learning_rate": 1.7730747243291668e-06, + "loss": 0.5177, + "step": 327320 + }, + { + "epoch": 2.8937039198005623, + "grad_norm": 9.450855255126953, + "learning_rate": 1.771601336657296e-06, + "loss": 0.65, + "step": 327330 + }, + { + "epoch": 2.8937923230608744, + "grad_norm": 3.109663486480713, + "learning_rate": 1.7701279489854252e-06, + "loss": 0.4012, + "step": 327340 + }, + { + "epoch": 2.8938807263211865, + "grad_norm": 6.851868152618408, + "learning_rate": 1.7686545613135548e-06, + "loss": 0.4922, + "step": 327350 + }, + { + "epoch": 2.893969129581499, + "grad_norm": 4.009052276611328, + "learning_rate": 1.767181173641684e-06, + "loss": 0.4852, + "step": 327360 + }, + { + "epoch": 2.894057532841811, + "grad_norm": 6.601598262786865, + "learning_rate": 1.7657077859698134e-06, + "loss": 0.4894, + "step": 327370 + }, + { + "epoch": 2.8941459361021233, + "grad_norm": 1.544348120689392, + "learning_rate": 1.7642343982979426e-06, + "loss": 0.5208, + "step": 327380 + }, + { + "epoch": 2.894234339362436, + "grad_norm": 6.026576042175293, + "learning_rate": 1.7627610106260718e-06, + "loss": 0.6031, + "step": 327390 + }, + { + "epoch": 2.894322742622748, + "grad_norm": 2.861759662628174, + "learning_rate": 1.7612876229542014e-06, + "loss": 0.5187, + "step": 327400 + }, + { + "epoch": 2.89441114588306, + "grad_norm": 2.3204619884490967, + "learning_rate": 1.7598142352823306e-06, + "loss": 0.3996, + "step": 327410 + }, + { + "epoch": 2.8944995491433723, + "grad_norm": 2.0853753089904785, + "learning_rate": 1.7583408476104598e-06, + "loss": 0.4744, + "step": 327420 + }, + { + "epoch": 2.8945879524036844, + "grad_norm": 7.868892192840576, + "learning_rate": 1.7568674599385892e-06, + "loss": 0.5886, + "step": 327430 + }, + { + "epoch": 2.894676355663997, + "grad_norm": 4.921209335327148, + "learning_rate": 1.7553940722667189e-06, + "loss": 0.6301, + "step": 327440 + }, + { + "epoch": 2.894764758924309, + "grad_norm": 1.8665093183517456, + "learning_rate": 1.753920684594848e-06, + "loss": 0.565, + "step": 327450 + }, + { + "epoch": 2.8948531621846216, + "grad_norm": 2.6877830028533936, + "learning_rate": 1.7524472969229773e-06, + "loss": 0.4833, + "step": 327460 + }, + { + "epoch": 2.8949415654449338, + "grad_norm": 4.97327995300293, + "learning_rate": 1.7509739092511065e-06, + "loss": 0.4601, + "step": 327470 + }, + { + "epoch": 2.895029968705246, + "grad_norm": 3.624128580093384, + "learning_rate": 1.7495005215792359e-06, + "loss": 0.4497, + "step": 327480 + }, + { + "epoch": 2.895118371965558, + "grad_norm": 0.9241101741790771, + "learning_rate": 1.7480271339073653e-06, + "loss": 0.5353, + "step": 327490 + }, + { + "epoch": 2.89520677522587, + "grad_norm": 8.415351867675781, + "learning_rate": 1.7465537462354947e-06, + "loss": 0.5045, + "step": 327500 + }, + { + "epoch": 2.8952951784861827, + "grad_norm": 3.3511996269226074, + "learning_rate": 1.745080358563624e-06, + "loss": 0.5127, + "step": 327510 + }, + { + "epoch": 2.895383581746495, + "grad_norm": 4.3214192390441895, + "learning_rate": 1.7436069708917531e-06, + "loss": 0.4797, + "step": 327520 + }, + { + "epoch": 2.895471985006807, + "grad_norm": 1.6557612419128418, + "learning_rate": 1.7421335832198823e-06, + "loss": 0.5881, + "step": 327530 + }, + { + "epoch": 2.8955603882671195, + "grad_norm": 9.113303184509277, + "learning_rate": 1.740660195548012e-06, + "loss": 0.4736, + "step": 327540 + }, + { + "epoch": 2.8956487915274316, + "grad_norm": 0.9232906699180603, + "learning_rate": 1.7391868078761413e-06, + "loss": 0.4433, + "step": 327550 + }, + { + "epoch": 2.8957371947877437, + "grad_norm": 1.5335867404937744, + "learning_rate": 1.7377134202042706e-06, + "loss": 0.467, + "step": 327560 + }, + { + "epoch": 2.895825598048056, + "grad_norm": 2.49711275100708, + "learning_rate": 1.7362400325323998e-06, + "loss": 0.5423, + "step": 327570 + }, + { + "epoch": 2.8959140013083684, + "grad_norm": 2.248927593231201, + "learning_rate": 1.734766644860529e-06, + "loss": 0.4827, + "step": 327580 + }, + { + "epoch": 2.8960024045686805, + "grad_norm": 8.474502563476562, + "learning_rate": 1.7332932571886586e-06, + "loss": 0.443, + "step": 327590 + }, + { + "epoch": 2.8960908078289926, + "grad_norm": 4.29740571975708, + "learning_rate": 1.731819869516788e-06, + "loss": 0.5433, + "step": 327600 + }, + { + "epoch": 2.896179211089305, + "grad_norm": 5.370006561279297, + "learning_rate": 1.7303464818449172e-06, + "loss": 0.5659, + "step": 327610 + }, + { + "epoch": 2.8962676143496173, + "grad_norm": 5.861617565155029, + "learning_rate": 1.7288730941730464e-06, + "loss": 0.4683, + "step": 327620 + }, + { + "epoch": 2.8963560176099294, + "grad_norm": 1.3562874794006348, + "learning_rate": 1.727399706501176e-06, + "loss": 0.4987, + "step": 327630 + }, + { + "epoch": 2.8964444208702416, + "grad_norm": 2.275869846343994, + "learning_rate": 1.7259263188293052e-06, + "loss": 0.5647, + "step": 327640 + }, + { + "epoch": 2.8965328241305537, + "grad_norm": 1.1669151782989502, + "learning_rate": 1.7244529311574344e-06, + "loss": 0.6521, + "step": 327650 + }, + { + "epoch": 2.8966212273908662, + "grad_norm": 8.202418327331543, + "learning_rate": 1.7229795434855638e-06, + "loss": 0.4871, + "step": 327660 + }, + { + "epoch": 2.8967096306511784, + "grad_norm": 4.256260395050049, + "learning_rate": 1.721506155813693e-06, + "loss": 0.5553, + "step": 327670 + }, + { + "epoch": 2.896798033911491, + "grad_norm": 1.4384957551956177, + "learning_rate": 1.7200327681418227e-06, + "loss": 0.4288, + "step": 327680 + }, + { + "epoch": 2.896886437171803, + "grad_norm": 5.006439685821533, + "learning_rate": 1.7185593804699519e-06, + "loss": 0.5192, + "step": 327690 + }, + { + "epoch": 2.896974840432115, + "grad_norm": 2.7222161293029785, + "learning_rate": 1.717085992798081e-06, + "loss": 0.4755, + "step": 327700 + }, + { + "epoch": 2.8970632436924273, + "grad_norm": 1.474289894104004, + "learning_rate": 1.7156126051262105e-06, + "loss": 0.5749, + "step": 327710 + }, + { + "epoch": 2.8971516469527394, + "grad_norm": 4.422641277313232, + "learning_rate": 1.7141392174543397e-06, + "loss": 0.4957, + "step": 327720 + }, + { + "epoch": 2.897240050213052, + "grad_norm": 1.432699203491211, + "learning_rate": 1.7126658297824693e-06, + "loss": 0.4025, + "step": 327730 + }, + { + "epoch": 2.897328453473364, + "grad_norm": 5.244688510894775, + "learning_rate": 1.7111924421105985e-06, + "loss": 0.3917, + "step": 327740 + }, + { + "epoch": 2.897416856733676, + "grad_norm": 2.76688814163208, + "learning_rate": 1.7097190544387277e-06, + "loss": 0.6113, + "step": 327750 + }, + { + "epoch": 2.897505259993989, + "grad_norm": 3.894942045211792, + "learning_rate": 1.708245666766857e-06, + "loss": 0.5117, + "step": 327760 + }, + { + "epoch": 2.897593663254301, + "grad_norm": 26.69508171081543, + "learning_rate": 1.7067722790949865e-06, + "loss": 0.5013, + "step": 327770 + }, + { + "epoch": 2.897682066514613, + "grad_norm": 2.890465021133423, + "learning_rate": 1.705298891423116e-06, + "loss": 0.5215, + "step": 327780 + }, + { + "epoch": 2.897770469774925, + "grad_norm": 2.400305986404419, + "learning_rate": 1.7038255037512451e-06, + "loss": 0.4505, + "step": 327790 + }, + { + "epoch": 2.8978588730352377, + "grad_norm": 2.7318270206451416, + "learning_rate": 1.7023521160793743e-06, + "loss": 0.5375, + "step": 327800 + }, + { + "epoch": 2.89794727629555, + "grad_norm": 35.065303802490234, + "learning_rate": 1.7008787284075035e-06, + "loss": 0.5859, + "step": 327810 + }, + { + "epoch": 2.898035679555862, + "grad_norm": 0.8606184720993042, + "learning_rate": 1.6994053407356332e-06, + "loss": 0.4937, + "step": 327820 + }, + { + "epoch": 2.8981240828161745, + "grad_norm": 5.262023448944092, + "learning_rate": 1.6979319530637624e-06, + "loss": 0.5288, + "step": 327830 + }, + { + "epoch": 2.8982124860764866, + "grad_norm": 5.244312286376953, + "learning_rate": 1.6964585653918918e-06, + "loss": 0.6751, + "step": 327840 + }, + { + "epoch": 2.8983008893367987, + "grad_norm": 1.7673640251159668, + "learning_rate": 1.694985177720021e-06, + "loss": 0.6707, + "step": 327850 + }, + { + "epoch": 2.898389292597111, + "grad_norm": 3.100783109664917, + "learning_rate": 1.6935117900481502e-06, + "loss": 0.5158, + "step": 327860 + }, + { + "epoch": 2.898477695857423, + "grad_norm": 1.3167930841445923, + "learning_rate": 1.6920384023762798e-06, + "loss": 0.4679, + "step": 327870 + }, + { + "epoch": 2.8985660991177356, + "grad_norm": 1.3405158519744873, + "learning_rate": 1.690565014704409e-06, + "loss": 0.4922, + "step": 327880 + }, + { + "epoch": 2.8986545023780477, + "grad_norm": 4.85495138168335, + "learning_rate": 1.6890916270325384e-06, + "loss": 0.4424, + "step": 327890 + }, + { + "epoch": 2.89874290563836, + "grad_norm": 2.747234344482422, + "learning_rate": 1.6876182393606676e-06, + "loss": 0.5305, + "step": 327900 + }, + { + "epoch": 2.8988313088986724, + "grad_norm": 2.1020455360412598, + "learning_rate": 1.6861448516887972e-06, + "loss": 0.4524, + "step": 327910 + }, + { + "epoch": 2.8989197121589845, + "grad_norm": 3.1773316860198975, + "learning_rate": 1.6846714640169264e-06, + "loss": 0.5616, + "step": 327920 + }, + { + "epoch": 2.8990081154192966, + "grad_norm": 1.0971238613128662, + "learning_rate": 1.6831980763450556e-06, + "loss": 0.6018, + "step": 327930 + }, + { + "epoch": 2.8990965186796087, + "grad_norm": 7.88828706741333, + "learning_rate": 1.6817246886731848e-06, + "loss": 0.4983, + "step": 327940 + }, + { + "epoch": 2.8991849219399213, + "grad_norm": 8.606073379516602, + "learning_rate": 1.6802513010013143e-06, + "loss": 0.462, + "step": 327950 + }, + { + "epoch": 2.8992733252002334, + "grad_norm": 3.5622336864471436, + "learning_rate": 1.6787779133294439e-06, + "loss": 0.5155, + "step": 327960 + }, + { + "epoch": 2.8993617284605455, + "grad_norm": 2.9478724002838135, + "learning_rate": 1.677304525657573e-06, + "loss": 0.5521, + "step": 327970 + }, + { + "epoch": 2.899450131720858, + "grad_norm": 1.2052149772644043, + "learning_rate": 1.6758311379857023e-06, + "loss": 0.4586, + "step": 327980 + }, + { + "epoch": 2.89953853498117, + "grad_norm": 3.708150863647461, + "learning_rate": 1.6743577503138315e-06, + "loss": 0.5603, + "step": 327990 + }, + { + "epoch": 2.8996269382414823, + "grad_norm": 2.582475185394287, + "learning_rate": 1.672884362641961e-06, + "loss": 0.678, + "step": 328000 + }, + { + "epoch": 2.8997153415017944, + "grad_norm": 1.4828236103057861, + "learning_rate": 1.6714109749700903e-06, + "loss": 0.4168, + "step": 328010 + }, + { + "epoch": 2.8998037447621066, + "grad_norm": 1.4816983938217163, + "learning_rate": 1.6699375872982197e-06, + "loss": 0.3952, + "step": 328020 + }, + { + "epoch": 2.899892148022419, + "grad_norm": 5.110167980194092, + "learning_rate": 1.668464199626349e-06, + "loss": 0.489, + "step": 328030 + }, + { + "epoch": 2.8999805512827312, + "grad_norm": 0.8023213744163513, + "learning_rate": 1.6669908119544781e-06, + "loss": 0.6194, + "step": 328040 + }, + { + "epoch": 2.900068954543044, + "grad_norm": 5.140817642211914, + "learning_rate": 1.6655174242826078e-06, + "loss": 0.5356, + "step": 328050 + }, + { + "epoch": 2.900157357803356, + "grad_norm": 3.0941174030303955, + "learning_rate": 1.664044036610737e-06, + "loss": 0.6562, + "step": 328060 + }, + { + "epoch": 2.900245761063668, + "grad_norm": 37.155555725097656, + "learning_rate": 1.6625706489388664e-06, + "loss": 0.4644, + "step": 328070 + }, + { + "epoch": 2.90033416432398, + "grad_norm": 4.7336745262146, + "learning_rate": 1.6610972612669956e-06, + "loss": 0.4995, + "step": 328080 + }, + { + "epoch": 2.9004225675842923, + "grad_norm": 1.849342942237854, + "learning_rate": 1.6596238735951248e-06, + "loss": 0.5918, + "step": 328090 + }, + { + "epoch": 2.900510970844605, + "grad_norm": 7.214869499206543, + "learning_rate": 1.6581504859232544e-06, + "loss": 0.5079, + "step": 328100 + }, + { + "epoch": 2.900599374104917, + "grad_norm": 2.2011520862579346, + "learning_rate": 1.6566770982513836e-06, + "loss": 0.5817, + "step": 328110 + }, + { + "epoch": 2.900687777365229, + "grad_norm": 0.7395120859146118, + "learning_rate": 1.655203710579513e-06, + "loss": 0.6244, + "step": 328120 + }, + { + "epoch": 2.9007761806255417, + "grad_norm": 2.9185752868652344, + "learning_rate": 1.6537303229076422e-06, + "loss": 0.4644, + "step": 328130 + }, + { + "epoch": 2.9008645838858538, + "grad_norm": 3.8815386295318604, + "learning_rate": 1.6522569352357714e-06, + "loss": 0.5117, + "step": 328140 + }, + { + "epoch": 2.900952987146166, + "grad_norm": 1.3877207040786743, + "learning_rate": 1.650783547563901e-06, + "loss": 0.5039, + "step": 328150 + }, + { + "epoch": 2.901041390406478, + "grad_norm": 7.275258541107178, + "learning_rate": 1.6493101598920302e-06, + "loss": 0.4575, + "step": 328160 + }, + { + "epoch": 2.9011297936667906, + "grad_norm": 2.7917139530181885, + "learning_rate": 1.6478367722201594e-06, + "loss": 0.3852, + "step": 328170 + }, + { + "epoch": 2.9012181969271027, + "grad_norm": 1.9918378591537476, + "learning_rate": 1.6463633845482888e-06, + "loss": 0.4885, + "step": 328180 + }, + { + "epoch": 2.901306600187415, + "grad_norm": 4.342094898223877, + "learning_rate": 1.6448899968764185e-06, + "loss": 0.5228, + "step": 328190 + }, + { + "epoch": 2.9013950034477274, + "grad_norm": 10.439650535583496, + "learning_rate": 1.6434166092045477e-06, + "loss": 0.6665, + "step": 328200 + }, + { + "epoch": 2.9014834067080395, + "grad_norm": 2.372314214706421, + "learning_rate": 1.6419432215326769e-06, + "loss": 0.5071, + "step": 328210 + }, + { + "epoch": 2.9015718099683516, + "grad_norm": 3.531060218811035, + "learning_rate": 1.640469833860806e-06, + "loss": 0.5421, + "step": 328220 + }, + { + "epoch": 2.9016602132286637, + "grad_norm": 3.1964657306671143, + "learning_rate": 1.6389964461889355e-06, + "loss": 0.4054, + "step": 328230 + }, + { + "epoch": 2.901748616488976, + "grad_norm": 1.2011040449142456, + "learning_rate": 1.637523058517065e-06, + "loss": 0.5215, + "step": 328240 + }, + { + "epoch": 2.9018370197492884, + "grad_norm": 3.751793146133423, + "learning_rate": 1.6360496708451943e-06, + "loss": 0.4729, + "step": 328250 + }, + { + "epoch": 2.9019254230096005, + "grad_norm": 10.641716003417969, + "learning_rate": 1.6345762831733235e-06, + "loss": 0.6186, + "step": 328260 + }, + { + "epoch": 2.902013826269913, + "grad_norm": 6.691748142242432, + "learning_rate": 1.6331028955014527e-06, + "loss": 0.576, + "step": 328270 + }, + { + "epoch": 2.9021022295302252, + "grad_norm": 2.9427895545959473, + "learning_rate": 1.631629507829582e-06, + "loss": 0.4572, + "step": 328280 + }, + { + "epoch": 2.9021906327905374, + "grad_norm": 6.0036163330078125, + "learning_rate": 1.6301561201577115e-06, + "loss": 0.578, + "step": 328290 + }, + { + "epoch": 2.9022790360508495, + "grad_norm": 1.446393370628357, + "learning_rate": 1.628682732485841e-06, + "loss": 0.4222, + "step": 328300 + }, + { + "epoch": 2.9023674393111616, + "grad_norm": 4.944876670837402, + "learning_rate": 1.6272093448139702e-06, + "loss": 0.551, + "step": 328310 + }, + { + "epoch": 2.902455842571474, + "grad_norm": 7.216762065887451, + "learning_rate": 1.6257359571420994e-06, + "loss": 0.4534, + "step": 328320 + }, + { + "epoch": 2.9025442458317863, + "grad_norm": 0.9898166656494141, + "learning_rate": 1.624262569470229e-06, + "loss": 0.4689, + "step": 328330 + }, + { + "epoch": 2.9026326490920984, + "grad_norm": 4.133835315704346, + "learning_rate": 1.6227891817983582e-06, + "loss": 0.5088, + "step": 328340 + }, + { + "epoch": 2.902721052352411, + "grad_norm": 3.188403844833374, + "learning_rate": 1.6213157941264874e-06, + "loss": 0.4619, + "step": 328350 + }, + { + "epoch": 2.902809455612723, + "grad_norm": 10.151366233825684, + "learning_rate": 1.6198424064546168e-06, + "loss": 0.6287, + "step": 328360 + }, + { + "epoch": 2.902897858873035, + "grad_norm": 7.738288879394531, + "learning_rate": 1.618369018782746e-06, + "loss": 0.5762, + "step": 328370 + }, + { + "epoch": 2.9029862621333473, + "grad_norm": 3.6387808322906494, + "learning_rate": 1.6168956311108756e-06, + "loss": 0.5149, + "step": 328380 + }, + { + "epoch": 2.90307466539366, + "grad_norm": 22.582435607910156, + "learning_rate": 1.6154222434390048e-06, + "loss": 0.4323, + "step": 328390 + }, + { + "epoch": 2.903163068653972, + "grad_norm": 9.528203010559082, + "learning_rate": 1.613948855767134e-06, + "loss": 0.4668, + "step": 328400 + }, + { + "epoch": 2.903251471914284, + "grad_norm": 3.143186330795288, + "learning_rate": 1.6124754680952634e-06, + "loss": 0.5325, + "step": 328410 + }, + { + "epoch": 2.9033398751745967, + "grad_norm": 11.212843894958496, + "learning_rate": 1.6110020804233926e-06, + "loss": 0.6813, + "step": 328420 + }, + { + "epoch": 2.903428278434909, + "grad_norm": 53.12631607055664, + "learning_rate": 1.6095286927515223e-06, + "loss": 0.57, + "step": 328430 + }, + { + "epoch": 2.903516681695221, + "grad_norm": 2.775275945663452, + "learning_rate": 1.6080553050796515e-06, + "loss": 0.572, + "step": 328440 + }, + { + "epoch": 2.903605084955533, + "grad_norm": 15.056328773498535, + "learning_rate": 1.6065819174077807e-06, + "loss": 0.5674, + "step": 328450 + }, + { + "epoch": 2.903693488215845, + "grad_norm": 4.331943035125732, + "learning_rate": 1.6051085297359099e-06, + "loss": 0.5309, + "step": 328460 + }, + { + "epoch": 2.9037818914761577, + "grad_norm": 3.224515438079834, + "learning_rate": 1.6036351420640395e-06, + "loss": 0.5794, + "step": 328470 + }, + { + "epoch": 2.90387029473647, + "grad_norm": 5.262458324432373, + "learning_rate": 1.602161754392169e-06, + "loss": 0.455, + "step": 328480 + }, + { + "epoch": 2.903958697996782, + "grad_norm": 3.2678756713867188, + "learning_rate": 1.600688366720298e-06, + "loss": 0.5534, + "step": 328490 + }, + { + "epoch": 2.9040471012570945, + "grad_norm": 4.278739929199219, + "learning_rate": 1.5992149790484273e-06, + "loss": 0.5339, + "step": 328500 + }, + { + "epoch": 2.9041355045174067, + "grad_norm": 6.177495956420898, + "learning_rate": 1.5977415913765565e-06, + "loss": 0.3716, + "step": 328510 + }, + { + "epoch": 2.9042239077777188, + "grad_norm": 4.593690395355225, + "learning_rate": 1.5962682037046861e-06, + "loss": 0.4415, + "step": 328520 + }, + { + "epoch": 2.904312311038031, + "grad_norm": 1.635559320449829, + "learning_rate": 1.5947948160328153e-06, + "loss": 0.3891, + "step": 328530 + }, + { + "epoch": 2.9044007142983435, + "grad_norm": 2.194458484649658, + "learning_rate": 1.5933214283609447e-06, + "loss": 0.4982, + "step": 328540 + }, + { + "epoch": 2.9044891175586556, + "grad_norm": 1.7193539142608643, + "learning_rate": 1.591848040689074e-06, + "loss": 0.4425, + "step": 328550 + }, + { + "epoch": 2.9045775208189677, + "grad_norm": 3.595327377319336, + "learning_rate": 1.5903746530172031e-06, + "loss": 0.538, + "step": 328560 + }, + { + "epoch": 2.9046659240792803, + "grad_norm": 5.78036642074585, + "learning_rate": 1.5889012653453328e-06, + "loss": 0.493, + "step": 328570 + }, + { + "epoch": 2.9047543273395924, + "grad_norm": 2.474874496459961, + "learning_rate": 1.587427877673462e-06, + "loss": 0.5261, + "step": 328580 + }, + { + "epoch": 2.9048427305999045, + "grad_norm": 2.2788569927215576, + "learning_rate": 1.5859544900015914e-06, + "loss": 0.5408, + "step": 328590 + }, + { + "epoch": 2.9049311338602166, + "grad_norm": 8.977019309997559, + "learning_rate": 1.5844811023297206e-06, + "loss": 0.5307, + "step": 328600 + }, + { + "epoch": 2.9050195371205287, + "grad_norm": 2.469297170639038, + "learning_rate": 1.5830077146578502e-06, + "loss": 0.505, + "step": 328610 + }, + { + "epoch": 2.9051079403808413, + "grad_norm": 6.274876594543457, + "learning_rate": 1.5815343269859794e-06, + "loss": 0.5442, + "step": 328620 + }, + { + "epoch": 2.9051963436411534, + "grad_norm": 3.6653316020965576, + "learning_rate": 1.5800609393141086e-06, + "loss": 0.6146, + "step": 328630 + }, + { + "epoch": 2.905284746901466, + "grad_norm": 2.7571046352386475, + "learning_rate": 1.5785875516422378e-06, + "loss": 0.5168, + "step": 328640 + }, + { + "epoch": 2.905373150161778, + "grad_norm": 3.2283074855804443, + "learning_rate": 1.5771141639703672e-06, + "loss": 0.4488, + "step": 328650 + }, + { + "epoch": 2.9054615534220902, + "grad_norm": 1.8260526657104492, + "learning_rate": 1.5756407762984968e-06, + "loss": 0.432, + "step": 328660 + }, + { + "epoch": 2.9055499566824023, + "grad_norm": 4.508037567138672, + "learning_rate": 1.574167388626626e-06, + "loss": 0.5472, + "step": 328670 + }, + { + "epoch": 2.9056383599427145, + "grad_norm": 10.837754249572754, + "learning_rate": 1.5726940009547552e-06, + "loss": 0.506, + "step": 328680 + }, + { + "epoch": 2.905726763203027, + "grad_norm": 16.028837203979492, + "learning_rate": 1.5712206132828845e-06, + "loss": 0.6943, + "step": 328690 + }, + { + "epoch": 2.905815166463339, + "grad_norm": 4.9876837730407715, + "learning_rate": 1.5697472256110139e-06, + "loss": 0.4854, + "step": 328700 + }, + { + "epoch": 2.9059035697236513, + "grad_norm": 1.736808180809021, + "learning_rate": 1.5682738379391435e-06, + "loss": 0.5178, + "step": 328710 + }, + { + "epoch": 2.905991972983964, + "grad_norm": 2.8250670433044434, + "learning_rate": 1.5668004502672727e-06, + "loss": 0.5257, + "step": 328720 + }, + { + "epoch": 2.906080376244276, + "grad_norm": 2.853886604309082, + "learning_rate": 1.5653270625954019e-06, + "loss": 0.4472, + "step": 328730 + }, + { + "epoch": 2.906168779504588, + "grad_norm": 5.088530540466309, + "learning_rate": 1.563853674923531e-06, + "loss": 0.6059, + "step": 328740 + }, + { + "epoch": 2.9062571827649, + "grad_norm": 1.7493921518325806, + "learning_rate": 1.5623802872516605e-06, + "loss": 0.5213, + "step": 328750 + }, + { + "epoch": 2.9063455860252128, + "grad_norm": 1.8456733226776123, + "learning_rate": 1.56090689957979e-06, + "loss": 0.6172, + "step": 328760 + }, + { + "epoch": 2.906433989285525, + "grad_norm": 23.410287857055664, + "learning_rate": 1.5594335119079193e-06, + "loss": 0.5917, + "step": 328770 + }, + { + "epoch": 2.906522392545837, + "grad_norm": 1.5790683031082153, + "learning_rate": 1.5579601242360485e-06, + "loss": 0.5291, + "step": 328780 + }, + { + "epoch": 2.9066107958061496, + "grad_norm": 2.3327839374542236, + "learning_rate": 1.556486736564178e-06, + "loss": 0.5689, + "step": 328790 + }, + { + "epoch": 2.9066991990664617, + "grad_norm": 12.63715648651123, + "learning_rate": 1.5550133488923071e-06, + "loss": 0.5434, + "step": 328800 + }, + { + "epoch": 2.906787602326774, + "grad_norm": 0.9685347676277161, + "learning_rate": 1.5535399612204366e-06, + "loss": 0.4505, + "step": 328810 + }, + { + "epoch": 2.906876005587086, + "grad_norm": 1.920351266860962, + "learning_rate": 1.552066573548566e-06, + "loss": 0.5238, + "step": 328820 + }, + { + "epoch": 2.906964408847398, + "grad_norm": 1.771120548248291, + "learning_rate": 1.5505931858766952e-06, + "loss": 0.5441, + "step": 328830 + }, + { + "epoch": 2.9070528121077106, + "grad_norm": 5.930893421173096, + "learning_rate": 1.5491197982048246e-06, + "loss": 0.4324, + "step": 328840 + }, + { + "epoch": 2.9071412153680227, + "grad_norm": 1.7306873798370361, + "learning_rate": 1.5476464105329538e-06, + "loss": 0.4209, + "step": 328850 + }, + { + "epoch": 2.9072296186283353, + "grad_norm": 0.7223976850509644, + "learning_rate": 1.5461730228610832e-06, + "loss": 0.4493, + "step": 328860 + }, + { + "epoch": 2.9073180218886474, + "grad_norm": 4.770142078399658, + "learning_rate": 1.5446996351892124e-06, + "loss": 0.4703, + "step": 328870 + }, + { + "epoch": 2.9074064251489595, + "grad_norm": 3.5662219524383545, + "learning_rate": 1.5432262475173418e-06, + "loss": 0.5718, + "step": 328880 + }, + { + "epoch": 2.9074948284092716, + "grad_norm": 1.4841954708099365, + "learning_rate": 1.5417528598454712e-06, + "loss": 0.5092, + "step": 328890 + }, + { + "epoch": 2.9075832316695838, + "grad_norm": 1.7242227792739868, + "learning_rate": 1.5402794721736004e-06, + "loss": 0.4397, + "step": 328900 + }, + { + "epoch": 2.9076716349298963, + "grad_norm": 0.7874936461448669, + "learning_rate": 1.5388060845017298e-06, + "loss": 0.6129, + "step": 328910 + }, + { + "epoch": 2.9077600381902085, + "grad_norm": 1.311879277229309, + "learning_rate": 1.537332696829859e-06, + "loss": 0.5632, + "step": 328920 + }, + { + "epoch": 2.9078484414505206, + "grad_norm": 5.113862037658691, + "learning_rate": 1.5358593091579885e-06, + "loss": 0.6092, + "step": 328930 + }, + { + "epoch": 2.907936844710833, + "grad_norm": 7.214711666107178, + "learning_rate": 1.5343859214861179e-06, + "loss": 0.4919, + "step": 328940 + }, + { + "epoch": 2.9080252479711453, + "grad_norm": 6.745513439178467, + "learning_rate": 1.5329125338142473e-06, + "loss": 0.581, + "step": 328950 + }, + { + "epoch": 2.9081136512314574, + "grad_norm": 8.142043113708496, + "learning_rate": 1.5314391461423765e-06, + "loss": 0.4484, + "step": 328960 + }, + { + "epoch": 2.9082020544917695, + "grad_norm": 1.3580362796783447, + "learning_rate": 1.5299657584705057e-06, + "loss": 0.4526, + "step": 328970 + }, + { + "epoch": 2.908290457752082, + "grad_norm": 2.0152060985565186, + "learning_rate": 1.528492370798635e-06, + "loss": 0.435, + "step": 328980 + }, + { + "epoch": 2.908378861012394, + "grad_norm": 2.5619168281555176, + "learning_rate": 1.5270189831267643e-06, + "loss": 0.6051, + "step": 328990 + }, + { + "epoch": 2.9084672642727063, + "grad_norm": 5.287860870361328, + "learning_rate": 1.525545595454894e-06, + "loss": 0.5579, + "step": 329000 + }, + { + "epoch": 2.908555667533019, + "grad_norm": 1.7849082946777344, + "learning_rate": 1.5240722077830231e-06, + "loss": 0.5172, + "step": 329010 + }, + { + "epoch": 2.908644070793331, + "grad_norm": 5.239295959472656, + "learning_rate": 1.5225988201111525e-06, + "loss": 0.6217, + "step": 329020 + }, + { + "epoch": 2.908732474053643, + "grad_norm": 3.2965633869171143, + "learning_rate": 1.5211254324392817e-06, + "loss": 0.5569, + "step": 329030 + }, + { + "epoch": 2.9088208773139552, + "grad_norm": 1.6610771417617798, + "learning_rate": 1.519652044767411e-06, + "loss": 0.5484, + "step": 329040 + }, + { + "epoch": 2.9089092805742673, + "grad_norm": 4.7527360916137695, + "learning_rate": 1.5181786570955403e-06, + "loss": 0.4582, + "step": 329050 + }, + { + "epoch": 2.90899768383458, + "grad_norm": 1.223594307899475, + "learning_rate": 1.5167052694236698e-06, + "loss": 0.4697, + "step": 329060 + }, + { + "epoch": 2.909086087094892, + "grad_norm": 2.7702319622039795, + "learning_rate": 1.5152318817517992e-06, + "loss": 0.4902, + "step": 329070 + }, + { + "epoch": 2.909174490355204, + "grad_norm": 1.584041953086853, + "learning_rate": 1.5137584940799284e-06, + "loss": 0.4731, + "step": 329080 + }, + { + "epoch": 2.9092628936155167, + "grad_norm": 6.989671230316162, + "learning_rate": 1.5122851064080578e-06, + "loss": 0.6858, + "step": 329090 + }, + { + "epoch": 2.909351296875829, + "grad_norm": 5.190520286560059, + "learning_rate": 1.510811718736187e-06, + "loss": 0.5987, + "step": 329100 + }, + { + "epoch": 2.909439700136141, + "grad_norm": 4.429983139038086, + "learning_rate": 1.5093383310643164e-06, + "loss": 0.4567, + "step": 329110 + }, + { + "epoch": 2.909528103396453, + "grad_norm": 2.0876805782318115, + "learning_rate": 1.5078649433924458e-06, + "loss": 0.6511, + "step": 329120 + }, + { + "epoch": 2.9096165066567656, + "grad_norm": 1.728627324104309, + "learning_rate": 1.506391555720575e-06, + "loss": 0.5484, + "step": 329130 + }, + { + "epoch": 2.9097049099170778, + "grad_norm": 2.695943832397461, + "learning_rate": 1.5049181680487044e-06, + "loss": 0.4896, + "step": 329140 + }, + { + "epoch": 2.90979331317739, + "grad_norm": 3.451207160949707, + "learning_rate": 1.5034447803768336e-06, + "loss": 0.5603, + "step": 329150 + }, + { + "epoch": 2.9098817164377024, + "grad_norm": 4.841032028198242, + "learning_rate": 1.501971392704963e-06, + "loss": 0.5041, + "step": 329160 + }, + { + "epoch": 2.9099701196980146, + "grad_norm": 9.09288215637207, + "learning_rate": 1.5004980050330925e-06, + "loss": 0.5712, + "step": 329170 + }, + { + "epoch": 2.9100585229583267, + "grad_norm": 1.7294079065322876, + "learning_rate": 1.4990246173612217e-06, + "loss": 0.5528, + "step": 329180 + }, + { + "epoch": 2.910146926218639, + "grad_norm": 3.1209170818328857, + "learning_rate": 1.497551229689351e-06, + "loss": 0.4228, + "step": 329190 + }, + { + "epoch": 2.910235329478951, + "grad_norm": 4.063177108764648, + "learning_rate": 1.4960778420174803e-06, + "loss": 0.4414, + "step": 329200 + }, + { + "epoch": 2.9103237327392635, + "grad_norm": 1.6286641359329224, + "learning_rate": 1.4946044543456097e-06, + "loss": 0.5433, + "step": 329210 + }, + { + "epoch": 2.9104121359995756, + "grad_norm": 2.489703893661499, + "learning_rate": 1.4931310666737389e-06, + "loss": 0.4624, + "step": 329220 + }, + { + "epoch": 2.910500539259888, + "grad_norm": 9.534822463989258, + "learning_rate": 1.4916576790018683e-06, + "loss": 0.51, + "step": 329230 + }, + { + "epoch": 2.9105889425202003, + "grad_norm": 2.5564489364624023, + "learning_rate": 1.4901842913299977e-06, + "loss": 0.5064, + "step": 329240 + }, + { + "epoch": 2.9106773457805124, + "grad_norm": 2.665325164794922, + "learning_rate": 1.488710903658127e-06, + "loss": 0.5441, + "step": 329250 + }, + { + "epoch": 2.9107657490408245, + "grad_norm": 3.110867738723755, + "learning_rate": 1.4872375159862563e-06, + "loss": 0.4602, + "step": 329260 + }, + { + "epoch": 2.9108541523011366, + "grad_norm": 10.770288467407227, + "learning_rate": 1.4857641283143855e-06, + "loss": 0.5354, + "step": 329270 + }, + { + "epoch": 2.910942555561449, + "grad_norm": 8.88200855255127, + "learning_rate": 1.484290740642515e-06, + "loss": 0.4564, + "step": 329280 + }, + { + "epoch": 2.9110309588217613, + "grad_norm": 4.115960597991943, + "learning_rate": 1.4828173529706443e-06, + "loss": 0.5678, + "step": 329290 + }, + { + "epoch": 2.9111193620820734, + "grad_norm": 2.4364707469940186, + "learning_rate": 1.4813439652987738e-06, + "loss": 0.5319, + "step": 329300 + }, + { + "epoch": 2.911207765342386, + "grad_norm": 1.2015424966812134, + "learning_rate": 1.479870577626903e-06, + "loss": 0.5279, + "step": 329310 + }, + { + "epoch": 2.911296168602698, + "grad_norm": 5.512742042541504, + "learning_rate": 1.4783971899550322e-06, + "loss": 0.5405, + "step": 329320 + }, + { + "epoch": 2.9113845718630103, + "grad_norm": 11.892988204956055, + "learning_rate": 1.4769238022831616e-06, + "loss": 0.59, + "step": 329330 + }, + { + "epoch": 2.9114729751233224, + "grad_norm": 11.554424285888672, + "learning_rate": 1.475450414611291e-06, + "loss": 0.4781, + "step": 329340 + }, + { + "epoch": 2.911561378383635, + "grad_norm": 9.609766006469727, + "learning_rate": 1.4739770269394204e-06, + "loss": 0.5032, + "step": 329350 + }, + { + "epoch": 2.911649781643947, + "grad_norm": 3.62336802482605, + "learning_rate": 1.4725036392675496e-06, + "loss": 0.4796, + "step": 329360 + }, + { + "epoch": 2.911738184904259, + "grad_norm": 4.976653575897217, + "learning_rate": 1.471030251595679e-06, + "loss": 0.5084, + "step": 329370 + }, + { + "epoch": 2.9118265881645717, + "grad_norm": 4.0889177322387695, + "learning_rate": 1.4695568639238082e-06, + "loss": 0.4439, + "step": 329380 + }, + { + "epoch": 2.911914991424884, + "grad_norm": 4.1474809646606445, + "learning_rate": 1.4680834762519374e-06, + "loss": 0.4631, + "step": 329390 + }, + { + "epoch": 2.912003394685196, + "grad_norm": 4.397684574127197, + "learning_rate": 1.4666100885800668e-06, + "loss": 0.4952, + "step": 329400 + }, + { + "epoch": 2.912091797945508, + "grad_norm": 2.8476953506469727, + "learning_rate": 1.4651367009081962e-06, + "loss": 0.4459, + "step": 329410 + }, + { + "epoch": 2.91218020120582, + "grad_norm": 6.041107654571533, + "learning_rate": 1.4636633132363257e-06, + "loss": 0.4827, + "step": 329420 + }, + { + "epoch": 2.912268604466133, + "grad_norm": 5.0420708656311035, + "learning_rate": 1.4621899255644549e-06, + "loss": 0.5377, + "step": 329430 + }, + { + "epoch": 2.912357007726445, + "grad_norm": 3.6274850368499756, + "learning_rate": 1.4607165378925843e-06, + "loss": 0.4284, + "step": 329440 + }, + { + "epoch": 2.9124454109867575, + "grad_norm": 3.307542562484741, + "learning_rate": 1.4592431502207135e-06, + "loss": 0.573, + "step": 329450 + }, + { + "epoch": 2.9125338142470696, + "grad_norm": 1.2595852613449097, + "learning_rate": 1.4577697625488429e-06, + "loss": 0.4474, + "step": 329460 + }, + { + "epoch": 2.9126222175073817, + "grad_norm": 10.757245063781738, + "learning_rate": 1.4562963748769723e-06, + "loss": 0.5142, + "step": 329470 + }, + { + "epoch": 2.912710620767694, + "grad_norm": 4.0857834815979, + "learning_rate": 1.4548229872051015e-06, + "loss": 0.4999, + "step": 329480 + }, + { + "epoch": 2.912799024028006, + "grad_norm": 1.7564442157745361, + "learning_rate": 1.453349599533231e-06, + "loss": 0.5035, + "step": 329490 + }, + { + "epoch": 2.9128874272883185, + "grad_norm": 2.9655346870422363, + "learning_rate": 1.4518762118613601e-06, + "loss": 0.4487, + "step": 329500 + }, + { + "epoch": 2.9129758305486306, + "grad_norm": 4.246185779571533, + "learning_rate": 1.4504028241894895e-06, + "loss": 0.6049, + "step": 329510 + }, + { + "epoch": 2.9130642338089427, + "grad_norm": 2.2162153720855713, + "learning_rate": 1.448929436517619e-06, + "loss": 0.5724, + "step": 329520 + }, + { + "epoch": 2.9131526370692553, + "grad_norm": 2.0173661708831787, + "learning_rate": 1.4474560488457481e-06, + "loss": 0.5929, + "step": 329530 + }, + { + "epoch": 2.9132410403295674, + "grad_norm": 6.050443649291992, + "learning_rate": 1.4459826611738775e-06, + "loss": 0.4794, + "step": 329540 + }, + { + "epoch": 2.9133294435898796, + "grad_norm": 8.620412826538086, + "learning_rate": 1.4445092735020067e-06, + "loss": 0.4216, + "step": 329550 + }, + { + "epoch": 2.9134178468501917, + "grad_norm": 2.282932996749878, + "learning_rate": 1.4430358858301362e-06, + "loss": 0.5115, + "step": 329560 + }, + { + "epoch": 2.9135062501105042, + "grad_norm": 3.291142225265503, + "learning_rate": 1.4415624981582654e-06, + "loss": 0.5134, + "step": 329570 + }, + { + "epoch": 2.9135946533708164, + "grad_norm": 3.229618787765503, + "learning_rate": 1.440089110486395e-06, + "loss": 0.5321, + "step": 329580 + }, + { + "epoch": 2.9136830566311285, + "grad_norm": 4.742890357971191, + "learning_rate": 1.4386157228145242e-06, + "loss": 0.4327, + "step": 329590 + }, + { + "epoch": 2.913771459891441, + "grad_norm": 3.6507625579833984, + "learning_rate": 1.4371423351426534e-06, + "loss": 0.5236, + "step": 329600 + }, + { + "epoch": 2.913859863151753, + "grad_norm": 2.378690004348755, + "learning_rate": 1.4356689474707828e-06, + "loss": 0.5471, + "step": 329610 + }, + { + "epoch": 2.9139482664120653, + "grad_norm": 4.228653430938721, + "learning_rate": 1.434195559798912e-06, + "loss": 0.4822, + "step": 329620 + }, + { + "epoch": 2.9140366696723774, + "grad_norm": 2.713576316833496, + "learning_rate": 1.4327221721270414e-06, + "loss": 0.475, + "step": 329630 + }, + { + "epoch": 2.9141250729326895, + "grad_norm": 4.092193603515625, + "learning_rate": 1.4312487844551708e-06, + "loss": 0.5069, + "step": 329640 + }, + { + "epoch": 2.914213476193002, + "grad_norm": 1.6365801095962524, + "learning_rate": 1.4297753967833002e-06, + "loss": 0.473, + "step": 329650 + }, + { + "epoch": 2.914301879453314, + "grad_norm": 2.581256866455078, + "learning_rate": 1.4283020091114294e-06, + "loss": 0.4854, + "step": 329660 + }, + { + "epoch": 2.9143902827136263, + "grad_norm": 2.1677162647247314, + "learning_rate": 1.4268286214395586e-06, + "loss": 0.3983, + "step": 329670 + }, + { + "epoch": 2.914478685973939, + "grad_norm": 1.6751574277877808, + "learning_rate": 1.425355233767688e-06, + "loss": 0.6127, + "step": 329680 + }, + { + "epoch": 2.914567089234251, + "grad_norm": 4.196102142333984, + "learning_rate": 1.4238818460958175e-06, + "loss": 0.6632, + "step": 329690 + }, + { + "epoch": 2.914655492494563, + "grad_norm": 6.089974880218506, + "learning_rate": 1.4224084584239469e-06, + "loss": 0.5224, + "step": 329700 + }, + { + "epoch": 2.9147438957548752, + "grad_norm": 2.689375638961792, + "learning_rate": 1.420935070752076e-06, + "loss": 0.4671, + "step": 329710 + }, + { + "epoch": 2.914832299015188, + "grad_norm": 4.055395603179932, + "learning_rate": 1.4194616830802055e-06, + "loss": 0.6295, + "step": 329720 + }, + { + "epoch": 2.9149207022755, + "grad_norm": 4.089426040649414, + "learning_rate": 1.4179882954083347e-06, + "loss": 0.514, + "step": 329730 + }, + { + "epoch": 2.915009105535812, + "grad_norm": 4.964166164398193, + "learning_rate": 1.416514907736464e-06, + "loss": 0.5276, + "step": 329740 + }, + { + "epoch": 2.9150975087961246, + "grad_norm": 1.426652193069458, + "learning_rate": 1.4150415200645933e-06, + "loss": 0.6084, + "step": 329750 + }, + { + "epoch": 2.9151859120564367, + "grad_norm": 26.736351013183594, + "learning_rate": 1.4135681323927227e-06, + "loss": 0.42, + "step": 329760 + }, + { + "epoch": 2.915274315316749, + "grad_norm": 5.323127746582031, + "learning_rate": 1.4120947447208521e-06, + "loss": 0.4964, + "step": 329770 + }, + { + "epoch": 2.915362718577061, + "grad_norm": 9.755573272705078, + "learning_rate": 1.4106213570489813e-06, + "loss": 0.5208, + "step": 329780 + }, + { + "epoch": 2.915451121837373, + "grad_norm": 4.633475303649902, + "learning_rate": 1.4091479693771107e-06, + "loss": 0.4452, + "step": 329790 + }, + { + "epoch": 2.9155395250976857, + "grad_norm": 3.692819118499756, + "learning_rate": 1.40767458170524e-06, + "loss": 0.4693, + "step": 329800 + }, + { + "epoch": 2.9156279283579978, + "grad_norm": 3.5296812057495117, + "learning_rate": 1.4062011940333694e-06, + "loss": 0.5982, + "step": 329810 + }, + { + "epoch": 2.9157163316183103, + "grad_norm": 2.1243481636047363, + "learning_rate": 1.4047278063614988e-06, + "loss": 0.5787, + "step": 329820 + }, + { + "epoch": 2.9158047348786225, + "grad_norm": 3.1700568199157715, + "learning_rate": 1.403254418689628e-06, + "loss": 0.4594, + "step": 329830 + }, + { + "epoch": 2.9158931381389346, + "grad_norm": 3.1852781772613525, + "learning_rate": 1.4017810310177574e-06, + "loss": 0.5218, + "step": 329840 + }, + { + "epoch": 2.9159815413992467, + "grad_norm": 1.6509904861450195, + "learning_rate": 1.4003076433458866e-06, + "loss": 0.6203, + "step": 329850 + }, + { + "epoch": 2.916069944659559, + "grad_norm": 10.85154914855957, + "learning_rate": 1.398834255674016e-06, + "loss": 0.5786, + "step": 329860 + }, + { + "epoch": 2.9161583479198714, + "grad_norm": 4.99819278717041, + "learning_rate": 1.3973608680021454e-06, + "loss": 0.6206, + "step": 329870 + }, + { + "epoch": 2.9162467511801835, + "grad_norm": 12.188699722290039, + "learning_rate": 1.3958874803302746e-06, + "loss": 0.5631, + "step": 329880 + }, + { + "epoch": 2.9163351544404956, + "grad_norm": 7.0480756759643555, + "learning_rate": 1.394414092658404e-06, + "loss": 0.6069, + "step": 329890 + }, + { + "epoch": 2.916423557700808, + "grad_norm": 14.71362018585205, + "learning_rate": 1.3929407049865332e-06, + "loss": 0.5297, + "step": 329900 + }, + { + "epoch": 2.9165119609611203, + "grad_norm": 8.139071464538574, + "learning_rate": 1.3914673173146626e-06, + "loss": 0.4588, + "step": 329910 + }, + { + "epoch": 2.9166003642214324, + "grad_norm": 4.872673988342285, + "learning_rate": 1.3899939296427918e-06, + "loss": 0.4332, + "step": 329920 + }, + { + "epoch": 2.9166887674817445, + "grad_norm": 7.577746868133545, + "learning_rate": 1.3885205419709215e-06, + "loss": 0.4407, + "step": 329930 + }, + { + "epoch": 2.916777170742057, + "grad_norm": 3.7110745906829834, + "learning_rate": 1.3870471542990507e-06, + "loss": 0.5653, + "step": 329940 + }, + { + "epoch": 2.9168655740023692, + "grad_norm": 2.9298582077026367, + "learning_rate": 1.3855737666271799e-06, + "loss": 0.6146, + "step": 329950 + }, + { + "epoch": 2.9169539772626814, + "grad_norm": 5.866812229156494, + "learning_rate": 1.3841003789553093e-06, + "loss": 0.575, + "step": 329960 + }, + { + "epoch": 2.917042380522994, + "grad_norm": 1.6243425607681274, + "learning_rate": 1.3826269912834385e-06, + "loss": 0.4393, + "step": 329970 + }, + { + "epoch": 2.917130783783306, + "grad_norm": 1.0695476531982422, + "learning_rate": 1.381153603611568e-06, + "loss": 0.4631, + "step": 329980 + }, + { + "epoch": 2.917219187043618, + "grad_norm": 2.4153153896331787, + "learning_rate": 1.3796802159396973e-06, + "loss": 0.4143, + "step": 329990 + }, + { + "epoch": 2.9173075903039303, + "grad_norm": 3.2688825130462646, + "learning_rate": 1.3782068282678267e-06, + "loss": 0.5107, + "step": 330000 + }, + { + "epoch": 2.9173959935642424, + "grad_norm": 14.340386390686035, + "learning_rate": 1.376733440595956e-06, + "loss": 0.4475, + "step": 330010 + }, + { + "epoch": 2.917484396824555, + "grad_norm": 2.285454511642456, + "learning_rate": 1.3752600529240851e-06, + "loss": 0.3906, + "step": 330020 + }, + { + "epoch": 2.917572800084867, + "grad_norm": 2.5392379760742188, + "learning_rate": 1.3737866652522145e-06, + "loss": 0.6243, + "step": 330030 + }, + { + "epoch": 2.9176612033451796, + "grad_norm": 8.78630542755127, + "learning_rate": 1.372313277580344e-06, + "loss": 0.4419, + "step": 330040 + }, + { + "epoch": 2.9177496066054918, + "grad_norm": 7.105630397796631, + "learning_rate": 1.3708398899084734e-06, + "loss": 0.5354, + "step": 330050 + }, + { + "epoch": 2.917838009865804, + "grad_norm": 7.275388240814209, + "learning_rate": 1.3693665022366026e-06, + "loss": 0.5459, + "step": 330060 + }, + { + "epoch": 2.917926413126116, + "grad_norm": 1.3211523294448853, + "learning_rate": 1.367893114564732e-06, + "loss": 0.6306, + "step": 330070 + }, + { + "epoch": 2.918014816386428, + "grad_norm": 3.9780852794647217, + "learning_rate": 1.3664197268928612e-06, + "loss": 0.585, + "step": 330080 + }, + { + "epoch": 2.9181032196467407, + "grad_norm": 9.1460599899292, + "learning_rate": 1.3649463392209904e-06, + "loss": 0.3577, + "step": 330090 + }, + { + "epoch": 2.918191622907053, + "grad_norm": 0.8949465155601501, + "learning_rate": 1.3634729515491198e-06, + "loss": 0.362, + "step": 330100 + }, + { + "epoch": 2.918280026167365, + "grad_norm": 1.3925037384033203, + "learning_rate": 1.3619995638772492e-06, + "loss": 0.5265, + "step": 330110 + }, + { + "epoch": 2.9183684294276775, + "grad_norm": 1.9608086347579956, + "learning_rate": 1.3605261762053786e-06, + "loss": 0.4192, + "step": 330120 + }, + { + "epoch": 2.9184568326879896, + "grad_norm": 3.777580738067627, + "learning_rate": 1.3590527885335078e-06, + "loss": 0.5282, + "step": 330130 + }, + { + "epoch": 2.9185452359483017, + "grad_norm": 3.1823532581329346, + "learning_rate": 1.3575794008616372e-06, + "loss": 0.5368, + "step": 330140 + }, + { + "epoch": 2.918633639208614, + "grad_norm": 3.429502010345459, + "learning_rate": 1.3561060131897664e-06, + "loss": 0.5938, + "step": 330150 + }, + { + "epoch": 2.9187220424689264, + "grad_norm": 1.2308542728424072, + "learning_rate": 1.3546326255178958e-06, + "loss": 0.4601, + "step": 330160 + }, + { + "epoch": 2.9188104457292385, + "grad_norm": 2.847017288208008, + "learning_rate": 1.3531592378460253e-06, + "loss": 0.5204, + "step": 330170 + }, + { + "epoch": 2.9188988489895507, + "grad_norm": 0.9330363273620605, + "learning_rate": 1.3516858501741545e-06, + "loss": 0.5133, + "step": 330180 + }, + { + "epoch": 2.918987252249863, + "grad_norm": 0.9184470772743225, + "learning_rate": 1.3502124625022839e-06, + "loss": 0.4467, + "step": 330190 + }, + { + "epoch": 2.9190756555101753, + "grad_norm": 3.5515289306640625, + "learning_rate": 1.348739074830413e-06, + "loss": 0.5539, + "step": 330200 + }, + { + "epoch": 2.9191640587704875, + "grad_norm": 9.605733871459961, + "learning_rate": 1.3472656871585425e-06, + "loss": 0.5729, + "step": 330210 + }, + { + "epoch": 2.9192524620307996, + "grad_norm": 5.521965026855469, + "learning_rate": 1.345792299486672e-06, + "loss": 0.5775, + "step": 330220 + }, + { + "epoch": 2.9193408652911117, + "grad_norm": 1.4675289392471313, + "learning_rate": 1.344318911814801e-06, + "loss": 0.4571, + "step": 330230 + }, + { + "epoch": 2.9194292685514243, + "grad_norm": 1.3006490468978882, + "learning_rate": 1.3428455241429305e-06, + "loss": 0.4674, + "step": 330240 + }, + { + "epoch": 2.9195176718117364, + "grad_norm": 25.742706298828125, + "learning_rate": 1.3413721364710597e-06, + "loss": 0.6296, + "step": 330250 + }, + { + "epoch": 2.9196060750720485, + "grad_norm": 3.3130104541778564, + "learning_rate": 1.3398987487991891e-06, + "loss": 0.5219, + "step": 330260 + }, + { + "epoch": 2.919694478332361, + "grad_norm": 4.018000602722168, + "learning_rate": 1.3384253611273183e-06, + "loss": 0.4724, + "step": 330270 + }, + { + "epoch": 2.919782881592673, + "grad_norm": 3.2630064487457275, + "learning_rate": 1.336951973455448e-06, + "loss": 0.6448, + "step": 330280 + }, + { + "epoch": 2.9198712848529853, + "grad_norm": 3.0969645977020264, + "learning_rate": 1.3354785857835772e-06, + "loss": 0.5497, + "step": 330290 + }, + { + "epoch": 2.9199596881132974, + "grad_norm": 3.6482672691345215, + "learning_rate": 1.3340051981117064e-06, + "loss": 0.5711, + "step": 330300 + }, + { + "epoch": 2.92004809137361, + "grad_norm": 3.2991342544555664, + "learning_rate": 1.3325318104398358e-06, + "loss": 0.5726, + "step": 330310 + }, + { + "epoch": 2.920136494633922, + "grad_norm": 3.124720573425293, + "learning_rate": 1.331058422767965e-06, + "loss": 0.4775, + "step": 330320 + }, + { + "epoch": 2.9202248978942342, + "grad_norm": 1.830737590789795, + "learning_rate": 1.3295850350960944e-06, + "loss": 0.558, + "step": 330330 + }, + { + "epoch": 2.920313301154547, + "grad_norm": 4.2046685218811035, + "learning_rate": 1.3281116474242238e-06, + "loss": 0.5766, + "step": 330340 + }, + { + "epoch": 2.920401704414859, + "grad_norm": 1.5004316568374634, + "learning_rate": 1.3266382597523532e-06, + "loss": 0.5781, + "step": 330350 + }, + { + "epoch": 2.920490107675171, + "grad_norm": 1.2128349542617798, + "learning_rate": 1.3251648720804824e-06, + "loss": 0.4151, + "step": 330360 + }, + { + "epoch": 2.920578510935483, + "grad_norm": 1.5415040254592896, + "learning_rate": 1.3236914844086116e-06, + "loss": 0.3888, + "step": 330370 + }, + { + "epoch": 2.9206669141957953, + "grad_norm": 4.809371471405029, + "learning_rate": 1.322218096736741e-06, + "loss": 0.4639, + "step": 330380 + }, + { + "epoch": 2.920755317456108, + "grad_norm": 4.392032146453857, + "learning_rate": 1.3207447090648704e-06, + "loss": 0.5397, + "step": 330390 + }, + { + "epoch": 2.92084372071642, + "grad_norm": 4.9936394691467285, + "learning_rate": 1.3192713213929998e-06, + "loss": 0.465, + "step": 330400 + }, + { + "epoch": 2.9209321239767325, + "grad_norm": 5.274104595184326, + "learning_rate": 1.317797933721129e-06, + "loss": 0.4902, + "step": 330410 + }, + { + "epoch": 2.9210205272370446, + "grad_norm": 7.787070274353027, + "learning_rate": 1.3163245460492585e-06, + "loss": 0.5716, + "step": 330420 + }, + { + "epoch": 2.9211089304973568, + "grad_norm": 1.4577288627624512, + "learning_rate": 1.3148511583773877e-06, + "loss": 0.4737, + "step": 330430 + }, + { + "epoch": 2.921197333757669, + "grad_norm": 7.860651969909668, + "learning_rate": 1.3133777707055169e-06, + "loss": 0.518, + "step": 330440 + }, + { + "epoch": 2.921285737017981, + "grad_norm": 2.45662784576416, + "learning_rate": 1.3119043830336465e-06, + "loss": 0.5557, + "step": 330450 + }, + { + "epoch": 2.9213741402782936, + "grad_norm": 2.4820644855499268, + "learning_rate": 1.3104309953617757e-06, + "loss": 0.5657, + "step": 330460 + }, + { + "epoch": 2.9214625435386057, + "grad_norm": 42.12007141113281, + "learning_rate": 1.308957607689905e-06, + "loss": 0.4979, + "step": 330470 + }, + { + "epoch": 2.921550946798918, + "grad_norm": 2.3781683444976807, + "learning_rate": 1.3074842200180343e-06, + "loss": 0.4801, + "step": 330480 + }, + { + "epoch": 2.9216393500592304, + "grad_norm": 6.862301349639893, + "learning_rate": 1.3060108323461637e-06, + "loss": 0.4763, + "step": 330490 + }, + { + "epoch": 2.9217277533195425, + "grad_norm": 1.6878200769424438, + "learning_rate": 1.304537444674293e-06, + "loss": 0.5676, + "step": 330500 + }, + { + "epoch": 2.9218161565798546, + "grad_norm": 5.286529064178467, + "learning_rate": 1.3030640570024223e-06, + "loss": 0.6344, + "step": 330510 + }, + { + "epoch": 2.9219045598401667, + "grad_norm": 5.928821086883545, + "learning_rate": 1.3015906693305517e-06, + "loss": 0.5662, + "step": 330520 + }, + { + "epoch": 2.9219929631004793, + "grad_norm": 13.034649848937988, + "learning_rate": 1.300117281658681e-06, + "loss": 0.6429, + "step": 330530 + }, + { + "epoch": 2.9220813663607914, + "grad_norm": 2.9098007678985596, + "learning_rate": 1.2986438939868104e-06, + "loss": 0.5508, + "step": 330540 + }, + { + "epoch": 2.9221697696211035, + "grad_norm": 2.388068199157715, + "learning_rate": 1.2971705063149396e-06, + "loss": 0.5316, + "step": 330550 + }, + { + "epoch": 2.922258172881416, + "grad_norm": 5.973031520843506, + "learning_rate": 1.295697118643069e-06, + "loss": 0.6518, + "step": 330560 + }, + { + "epoch": 2.922346576141728, + "grad_norm": 3.1128299236297607, + "learning_rate": 1.2942237309711984e-06, + "loss": 0.5789, + "step": 330570 + }, + { + "epoch": 2.9224349794020403, + "grad_norm": 6.378337383270264, + "learning_rate": 1.2927503432993276e-06, + "loss": 0.6331, + "step": 330580 + }, + { + "epoch": 2.9225233826623525, + "grad_norm": 3.6702232360839844, + "learning_rate": 1.291276955627457e-06, + "loss": 0.502, + "step": 330590 + }, + { + "epoch": 2.9226117859226646, + "grad_norm": 3.4140172004699707, + "learning_rate": 1.2898035679555862e-06, + "loss": 0.5106, + "step": 330600 + }, + { + "epoch": 2.922700189182977, + "grad_norm": 8.286050796508789, + "learning_rate": 1.2883301802837156e-06, + "loss": 0.4854, + "step": 330610 + }, + { + "epoch": 2.9227885924432893, + "grad_norm": 1.275683045387268, + "learning_rate": 1.2868567926118448e-06, + "loss": 0.4857, + "step": 330620 + }, + { + "epoch": 2.922876995703602, + "grad_norm": 12.921470642089844, + "learning_rate": 1.2853834049399744e-06, + "loss": 0.6506, + "step": 330630 + }, + { + "epoch": 2.922965398963914, + "grad_norm": 3.102219343185425, + "learning_rate": 1.2839100172681036e-06, + "loss": 0.5458, + "step": 330640 + }, + { + "epoch": 2.923053802224226, + "grad_norm": 1.839579463005066, + "learning_rate": 1.2824366295962328e-06, + "loss": 0.6523, + "step": 330650 + }, + { + "epoch": 2.923142205484538, + "grad_norm": 1.8763034343719482, + "learning_rate": 1.2809632419243622e-06, + "loss": 0.5542, + "step": 330660 + }, + { + "epoch": 2.9232306087448503, + "grad_norm": 7.24019718170166, + "learning_rate": 1.2794898542524914e-06, + "loss": 0.4757, + "step": 330670 + }, + { + "epoch": 2.923319012005163, + "grad_norm": 2.2821054458618164, + "learning_rate": 1.2780164665806209e-06, + "loss": 0.5302, + "step": 330680 + }, + { + "epoch": 2.923407415265475, + "grad_norm": 2.452557325363159, + "learning_rate": 1.2765430789087503e-06, + "loss": 0.6058, + "step": 330690 + }, + { + "epoch": 2.923495818525787, + "grad_norm": 1.6311081647872925, + "learning_rate": 1.2750696912368795e-06, + "loss": 0.5439, + "step": 330700 + }, + { + "epoch": 2.9235842217860997, + "grad_norm": 2.704763174057007, + "learning_rate": 1.2735963035650089e-06, + "loss": 0.4873, + "step": 330710 + }, + { + "epoch": 2.923672625046412, + "grad_norm": 5.392584800720215, + "learning_rate": 1.272122915893138e-06, + "loss": 0.5179, + "step": 330720 + }, + { + "epoch": 2.923761028306724, + "grad_norm": 9.241643905639648, + "learning_rate": 1.2706495282212675e-06, + "loss": 0.6143, + "step": 330730 + }, + { + "epoch": 2.923849431567036, + "grad_norm": 0.9814544916152954, + "learning_rate": 1.269176140549397e-06, + "loss": 0.5206, + "step": 330740 + }, + { + "epoch": 2.9239378348273486, + "grad_norm": 1.623825192451477, + "learning_rate": 1.2677027528775263e-06, + "loss": 0.5778, + "step": 330750 + }, + { + "epoch": 2.9240262380876607, + "grad_norm": 1.6204280853271484, + "learning_rate": 1.2662293652056555e-06, + "loss": 0.519, + "step": 330760 + }, + { + "epoch": 2.924114641347973, + "grad_norm": 0.8681414723396301, + "learning_rate": 1.2647559775337847e-06, + "loss": 0.504, + "step": 330770 + }, + { + "epoch": 2.9242030446082854, + "grad_norm": 4.071070194244385, + "learning_rate": 1.2632825898619141e-06, + "loss": 0.4108, + "step": 330780 + }, + { + "epoch": 2.9242914478685975, + "grad_norm": 4.412330627441406, + "learning_rate": 1.2618092021900433e-06, + "loss": 0.6303, + "step": 330790 + }, + { + "epoch": 2.9243798511289096, + "grad_norm": 3.2730395793914795, + "learning_rate": 1.260335814518173e-06, + "loss": 0.4226, + "step": 330800 + }, + { + "epoch": 2.9244682543892218, + "grad_norm": 18.364839553833008, + "learning_rate": 1.2588624268463022e-06, + "loss": 0.5793, + "step": 330810 + }, + { + "epoch": 2.924556657649534, + "grad_norm": 5.927890300750732, + "learning_rate": 1.2573890391744316e-06, + "loss": 0.449, + "step": 330820 + }, + { + "epoch": 2.9246450609098464, + "grad_norm": 5.46162223815918, + "learning_rate": 1.2559156515025608e-06, + "loss": 0.591, + "step": 330830 + }, + { + "epoch": 2.9247334641701586, + "grad_norm": 6.90476131439209, + "learning_rate": 1.25444226383069e-06, + "loss": 0.6354, + "step": 330840 + }, + { + "epoch": 2.9248218674304707, + "grad_norm": 61.68587875366211, + "learning_rate": 1.2529688761588194e-06, + "loss": 0.5003, + "step": 330850 + }, + { + "epoch": 2.9249102706907832, + "grad_norm": 9.40803337097168, + "learning_rate": 1.2514954884869488e-06, + "loss": 0.6348, + "step": 330860 + }, + { + "epoch": 2.9249986739510954, + "grad_norm": 4.436063766479492, + "learning_rate": 1.2500221008150782e-06, + "loss": 0.5176, + "step": 330870 + }, + { + "epoch": 2.9250870772114075, + "grad_norm": 2.4095709323883057, + "learning_rate": 1.2485487131432074e-06, + "loss": 0.526, + "step": 330880 + }, + { + "epoch": 2.9251754804717196, + "grad_norm": 7.109973430633545, + "learning_rate": 1.2470753254713368e-06, + "loss": 0.6725, + "step": 330890 + }, + { + "epoch": 2.925263883732032, + "grad_norm": 4.135026931762695, + "learning_rate": 1.245601937799466e-06, + "loss": 0.5882, + "step": 330900 + }, + { + "epoch": 2.9253522869923443, + "grad_norm": 10.400771141052246, + "learning_rate": 1.2441285501275954e-06, + "loss": 0.4135, + "step": 330910 + }, + { + "epoch": 2.9254406902526564, + "grad_norm": 2.863044500350952, + "learning_rate": 1.2426551624557249e-06, + "loss": 0.5639, + "step": 330920 + }, + { + "epoch": 2.925529093512969, + "grad_norm": 5.166355609893799, + "learning_rate": 1.241181774783854e-06, + "loss": 0.4415, + "step": 330930 + }, + { + "epoch": 2.925617496773281, + "grad_norm": 5.8686676025390625, + "learning_rate": 1.2397083871119835e-06, + "loss": 0.4897, + "step": 330940 + }, + { + "epoch": 2.925705900033593, + "grad_norm": 1.0808826684951782, + "learning_rate": 1.2382349994401127e-06, + "loss": 0.3276, + "step": 330950 + }, + { + "epoch": 2.9257943032939053, + "grad_norm": 7.2653069496154785, + "learning_rate": 1.236761611768242e-06, + "loss": 0.5452, + "step": 330960 + }, + { + "epoch": 2.9258827065542174, + "grad_norm": 2.9335598945617676, + "learning_rate": 1.2352882240963713e-06, + "loss": 0.513, + "step": 330970 + }, + { + "epoch": 2.92597110981453, + "grad_norm": 3.0066475868225098, + "learning_rate": 1.2338148364245007e-06, + "loss": 0.5687, + "step": 330980 + }, + { + "epoch": 2.926059513074842, + "grad_norm": 0.7931689023971558, + "learning_rate": 1.2323414487526301e-06, + "loss": 0.4525, + "step": 330990 + }, + { + "epoch": 2.9261479163351547, + "grad_norm": 2.0104458332061768, + "learning_rate": 1.2308680610807593e-06, + "loss": 0.4735, + "step": 331000 + }, + { + "epoch": 2.926236319595467, + "grad_norm": 0.9463402032852173, + "learning_rate": 1.2293946734088887e-06, + "loss": 0.4738, + "step": 331010 + }, + { + "epoch": 2.926324722855779, + "grad_norm": 5.656471252441406, + "learning_rate": 1.227921285737018e-06, + "loss": 0.3483, + "step": 331020 + }, + { + "epoch": 2.926413126116091, + "grad_norm": 7.017066955566406, + "learning_rate": 1.2264478980651473e-06, + "loss": 0.5959, + "step": 331030 + }, + { + "epoch": 2.926501529376403, + "grad_norm": 2.8431577682495117, + "learning_rate": 1.2249745103932768e-06, + "loss": 0.6014, + "step": 331040 + }, + { + "epoch": 2.9265899326367157, + "grad_norm": 1.1029915809631348, + "learning_rate": 1.223501122721406e-06, + "loss": 0.3913, + "step": 331050 + }, + { + "epoch": 2.926678335897028, + "grad_norm": 1.1077812910079956, + "learning_rate": 1.2220277350495354e-06, + "loss": 0.3928, + "step": 331060 + }, + { + "epoch": 2.92676673915734, + "grad_norm": 7.557567119598389, + "learning_rate": 1.2205543473776646e-06, + "loss": 0.545, + "step": 331070 + }, + { + "epoch": 2.9268551424176525, + "grad_norm": 2.6680195331573486, + "learning_rate": 1.219080959705794e-06, + "loss": 0.5003, + "step": 331080 + }, + { + "epoch": 2.9269435456779647, + "grad_norm": 4.845629692077637, + "learning_rate": 1.2176075720339234e-06, + "loss": 0.5341, + "step": 331090 + }, + { + "epoch": 2.927031948938277, + "grad_norm": 3.466409921646118, + "learning_rate": 1.2161341843620528e-06, + "loss": 0.46, + "step": 331100 + }, + { + "epoch": 2.927120352198589, + "grad_norm": 3.9403669834136963, + "learning_rate": 1.214660796690182e-06, + "loss": 0.4765, + "step": 331110 + }, + { + "epoch": 2.9272087554589015, + "grad_norm": 13.24374008178711, + "learning_rate": 1.2131874090183112e-06, + "loss": 0.4755, + "step": 331120 + }, + { + "epoch": 2.9272971587192136, + "grad_norm": 3.2244529724121094, + "learning_rate": 1.2117140213464406e-06, + "loss": 0.4756, + "step": 331130 + }, + { + "epoch": 2.9273855619795257, + "grad_norm": 1.70856773853302, + "learning_rate": 1.2102406336745698e-06, + "loss": 0.4391, + "step": 331140 + }, + { + "epoch": 2.9274739652398383, + "grad_norm": 1.743481993675232, + "learning_rate": 1.2087672460026994e-06, + "loss": 0.5331, + "step": 331150 + }, + { + "epoch": 2.9275623685001504, + "grad_norm": 4.546342849731445, + "learning_rate": 1.2072938583308286e-06, + "loss": 0.5804, + "step": 331160 + }, + { + "epoch": 2.9276507717604625, + "grad_norm": 3.1594274044036865, + "learning_rate": 1.205820470658958e-06, + "loss": 0.5961, + "step": 331170 + }, + { + "epoch": 2.9277391750207746, + "grad_norm": 6.138018608093262, + "learning_rate": 1.2043470829870873e-06, + "loss": 0.5559, + "step": 331180 + }, + { + "epoch": 2.9278275782810868, + "grad_norm": 5.235229969024658, + "learning_rate": 1.2028736953152165e-06, + "loss": 0.5846, + "step": 331190 + }, + { + "epoch": 2.9279159815413993, + "grad_norm": 11.078290939331055, + "learning_rate": 1.2014003076433459e-06, + "loss": 0.4991, + "step": 331200 + }, + { + "epoch": 2.9280043848017114, + "grad_norm": 5.500136852264404, + "learning_rate": 1.1999269199714753e-06, + "loss": 0.4833, + "step": 331210 + }, + { + "epoch": 2.928092788062024, + "grad_norm": 4.4605393409729, + "learning_rate": 1.1984535322996047e-06, + "loss": 0.5626, + "step": 331220 + }, + { + "epoch": 2.928181191322336, + "grad_norm": 2.058415412902832, + "learning_rate": 1.196980144627734e-06, + "loss": 0.514, + "step": 331230 + }, + { + "epoch": 2.9282695945826482, + "grad_norm": 1.492486834526062, + "learning_rate": 1.1955067569558633e-06, + "loss": 0.5385, + "step": 331240 + }, + { + "epoch": 2.9283579978429604, + "grad_norm": 1.5488357543945312, + "learning_rate": 1.1940333692839925e-06, + "loss": 0.6387, + "step": 331250 + }, + { + "epoch": 2.9284464011032725, + "grad_norm": 8.281062126159668, + "learning_rate": 1.192559981612122e-06, + "loss": 0.5222, + "step": 331260 + }, + { + "epoch": 2.928534804363585, + "grad_norm": 1.0774956941604614, + "learning_rate": 1.1910865939402513e-06, + "loss": 0.3636, + "step": 331270 + }, + { + "epoch": 2.928623207623897, + "grad_norm": 5.589380741119385, + "learning_rate": 1.1896132062683805e-06, + "loss": 0.5074, + "step": 331280 + }, + { + "epoch": 2.9287116108842093, + "grad_norm": 5.947958469390869, + "learning_rate": 1.18813981859651e-06, + "loss": 0.6791, + "step": 331290 + }, + { + "epoch": 2.928800014144522, + "grad_norm": 5.728817939758301, + "learning_rate": 1.1866664309246392e-06, + "loss": 0.432, + "step": 331300 + }, + { + "epoch": 2.928888417404834, + "grad_norm": 4.5192551612854, + "learning_rate": 1.1851930432527686e-06, + "loss": 0.4911, + "step": 331310 + }, + { + "epoch": 2.928976820665146, + "grad_norm": 5.5105791091918945, + "learning_rate": 1.1837196555808978e-06, + "loss": 0.6019, + "step": 331320 + }, + { + "epoch": 2.929065223925458, + "grad_norm": 6.089077949523926, + "learning_rate": 1.1822462679090272e-06, + "loss": 0.4945, + "step": 331330 + }, + { + "epoch": 2.9291536271857708, + "grad_norm": 2.0738747119903564, + "learning_rate": 1.1807728802371566e-06, + "loss": 0.5046, + "step": 331340 + }, + { + "epoch": 2.929242030446083, + "grad_norm": 7.334586143493652, + "learning_rate": 1.1792994925652858e-06, + "loss": 0.518, + "step": 331350 + }, + { + "epoch": 2.929330433706395, + "grad_norm": 3.513317346572876, + "learning_rate": 1.1778261048934152e-06, + "loss": 0.564, + "step": 331360 + }, + { + "epoch": 2.9294188369667076, + "grad_norm": 3.3517792224884033, + "learning_rate": 1.1763527172215444e-06, + "loss": 0.4514, + "step": 331370 + }, + { + "epoch": 2.9295072402270197, + "grad_norm": 2.91500186920166, + "learning_rate": 1.1748793295496738e-06, + "loss": 0.5954, + "step": 331380 + }, + { + "epoch": 2.929595643487332, + "grad_norm": 14.326584815979004, + "learning_rate": 1.1734059418778032e-06, + "loss": 0.6128, + "step": 331390 + }, + { + "epoch": 2.929684046747644, + "grad_norm": 3.5692853927612305, + "learning_rate": 1.1719325542059324e-06, + "loss": 0.5433, + "step": 331400 + }, + { + "epoch": 2.929772450007956, + "grad_norm": 12.424131393432617, + "learning_rate": 1.1704591665340618e-06, + "loss": 0.6027, + "step": 331410 + }, + { + "epoch": 2.9298608532682686, + "grad_norm": 12.042518615722656, + "learning_rate": 1.168985778862191e-06, + "loss": 0.4999, + "step": 331420 + }, + { + "epoch": 2.9299492565285807, + "grad_norm": 1.9275249242782593, + "learning_rate": 1.1675123911903205e-06, + "loss": 0.4689, + "step": 331430 + }, + { + "epoch": 2.930037659788893, + "grad_norm": 2.4917354583740234, + "learning_rate": 1.1660390035184499e-06, + "loss": 0.435, + "step": 331440 + }, + { + "epoch": 2.9301260630492054, + "grad_norm": 3.5315346717834473, + "learning_rate": 1.1645656158465793e-06, + "loss": 0.4529, + "step": 331450 + }, + { + "epoch": 2.9302144663095175, + "grad_norm": 3.095319986343384, + "learning_rate": 1.1630922281747085e-06, + "loss": 0.4235, + "step": 331460 + }, + { + "epoch": 2.9303028695698297, + "grad_norm": 2.681783437728882, + "learning_rate": 1.1616188405028377e-06, + "loss": 0.437, + "step": 331470 + }, + { + "epoch": 2.930391272830142, + "grad_norm": 1.851931095123291, + "learning_rate": 1.160145452830967e-06, + "loss": 0.4033, + "step": 331480 + }, + { + "epoch": 2.9304796760904543, + "grad_norm": 6.338001728057861, + "learning_rate": 1.1586720651590963e-06, + "loss": 0.5258, + "step": 331490 + }, + { + "epoch": 2.9305680793507665, + "grad_norm": 1.5490347146987915, + "learning_rate": 1.157198677487226e-06, + "loss": 0.4995, + "step": 331500 + }, + { + "epoch": 2.9306564826110786, + "grad_norm": 14.733625411987305, + "learning_rate": 1.1557252898153551e-06, + "loss": 0.6282, + "step": 331510 + }, + { + "epoch": 2.930744885871391, + "grad_norm": 2.4752585887908936, + "learning_rate": 1.1542519021434845e-06, + "loss": 0.5556, + "step": 331520 + }, + { + "epoch": 2.9308332891317033, + "grad_norm": 1.997862458229065, + "learning_rate": 1.1527785144716137e-06, + "loss": 0.5713, + "step": 331530 + }, + { + "epoch": 2.9309216923920154, + "grad_norm": 3.8597397804260254, + "learning_rate": 1.151305126799743e-06, + "loss": 0.4049, + "step": 331540 + }, + { + "epoch": 2.9310100956523275, + "grad_norm": 11.254827499389648, + "learning_rate": 1.1498317391278724e-06, + "loss": 0.4122, + "step": 331550 + }, + { + "epoch": 2.9310984989126396, + "grad_norm": 1.0696526765823364, + "learning_rate": 1.1483583514560018e-06, + "loss": 0.4317, + "step": 331560 + }, + { + "epoch": 2.931186902172952, + "grad_norm": 3.48330020904541, + "learning_rate": 1.1468849637841312e-06, + "loss": 0.4475, + "step": 331570 + }, + { + "epoch": 2.9312753054332643, + "grad_norm": 8.821818351745605, + "learning_rate": 1.1454115761122604e-06, + "loss": 0.5492, + "step": 331580 + }, + { + "epoch": 2.931363708693577, + "grad_norm": 2.0982322692871094, + "learning_rate": 1.1439381884403898e-06, + "loss": 0.5648, + "step": 331590 + }, + { + "epoch": 2.931452111953889, + "grad_norm": 6.706940650939941, + "learning_rate": 1.142464800768519e-06, + "loss": 0.6707, + "step": 331600 + }, + { + "epoch": 2.931540515214201, + "grad_norm": 1.7745949029922485, + "learning_rate": 1.1409914130966484e-06, + "loss": 0.5315, + "step": 331610 + }, + { + "epoch": 2.9316289184745132, + "grad_norm": 3.367375373840332, + "learning_rate": 1.1395180254247778e-06, + "loss": 0.5676, + "step": 331620 + }, + { + "epoch": 2.9317173217348254, + "grad_norm": 4.88645076751709, + "learning_rate": 1.138044637752907e-06, + "loss": 0.5801, + "step": 331630 + }, + { + "epoch": 2.931805724995138, + "grad_norm": 5.882744312286377, + "learning_rate": 1.1365712500810364e-06, + "loss": 0.3726, + "step": 331640 + }, + { + "epoch": 2.93189412825545, + "grad_norm": 3.4387168884277344, + "learning_rate": 1.1350978624091656e-06, + "loss": 0.5118, + "step": 331650 + }, + { + "epoch": 2.931982531515762, + "grad_norm": 27.405988693237305, + "learning_rate": 1.133624474737295e-06, + "loss": 0.4732, + "step": 331660 + }, + { + "epoch": 2.9320709347760747, + "grad_norm": 3.2351667881011963, + "learning_rate": 1.1321510870654245e-06, + "loss": 0.5645, + "step": 331670 + }, + { + "epoch": 2.932159338036387, + "grad_norm": 6.8590803146362305, + "learning_rate": 1.1306776993935537e-06, + "loss": 0.6036, + "step": 331680 + }, + { + "epoch": 2.932247741296699, + "grad_norm": 4.246060848236084, + "learning_rate": 1.129204311721683e-06, + "loss": 0.4172, + "step": 331690 + }, + { + "epoch": 2.932336144557011, + "grad_norm": 3.3498361110687256, + "learning_rate": 1.1277309240498123e-06, + "loss": 0.5181, + "step": 331700 + }, + { + "epoch": 2.9324245478173236, + "grad_norm": 3.390728235244751, + "learning_rate": 1.1262575363779417e-06, + "loss": 0.4849, + "step": 331710 + }, + { + "epoch": 2.9325129510776358, + "grad_norm": 3.5909297466278076, + "learning_rate": 1.1247841487060709e-06, + "loss": 0.5583, + "step": 331720 + }, + { + "epoch": 2.932601354337948, + "grad_norm": 3.563746213912964, + "learning_rate": 1.1233107610342003e-06, + "loss": 0.5798, + "step": 331730 + }, + { + "epoch": 2.9326897575982604, + "grad_norm": 3.973869800567627, + "learning_rate": 1.1218373733623297e-06, + "loss": 0.4434, + "step": 331740 + }, + { + "epoch": 2.9327781608585726, + "grad_norm": 4.6771721839904785, + "learning_rate": 1.120363985690459e-06, + "loss": 0.4924, + "step": 331750 + }, + { + "epoch": 2.9328665641188847, + "grad_norm": 5.0182600021362305, + "learning_rate": 1.1188905980185883e-06, + "loss": 0.5573, + "step": 331760 + }, + { + "epoch": 2.932954967379197, + "grad_norm": 4.410765171051025, + "learning_rate": 1.1174172103467175e-06, + "loss": 0.4628, + "step": 331770 + }, + { + "epoch": 2.933043370639509, + "grad_norm": 3.29431414604187, + "learning_rate": 1.115943822674847e-06, + "loss": 0.5152, + "step": 331780 + }, + { + "epoch": 2.9331317738998215, + "grad_norm": 2.2562570571899414, + "learning_rate": 1.1144704350029764e-06, + "loss": 0.5713, + "step": 331790 + }, + { + "epoch": 2.9332201771601336, + "grad_norm": 1.7744845151901245, + "learning_rate": 1.1129970473311058e-06, + "loss": 0.4457, + "step": 331800 + }, + { + "epoch": 2.933308580420446, + "grad_norm": 2.8932526111602783, + "learning_rate": 1.111523659659235e-06, + "loss": 0.4357, + "step": 331810 + }, + { + "epoch": 2.9333969836807583, + "grad_norm": 4.740487575531006, + "learning_rate": 1.1100502719873642e-06, + "loss": 0.5231, + "step": 331820 + }, + { + "epoch": 2.9334853869410704, + "grad_norm": 19.424278259277344, + "learning_rate": 1.1085768843154936e-06, + "loss": 0.5154, + "step": 331830 + }, + { + "epoch": 2.9335737902013825, + "grad_norm": 4.028266906738281, + "learning_rate": 1.1071034966436228e-06, + "loss": 0.54, + "step": 331840 + }, + { + "epoch": 2.9336621934616947, + "grad_norm": 5.154417514801025, + "learning_rate": 1.1056301089717524e-06, + "loss": 0.608, + "step": 331850 + }, + { + "epoch": 2.933750596722007, + "grad_norm": 4.226918697357178, + "learning_rate": 1.1041567212998816e-06, + "loss": 0.4549, + "step": 331860 + }, + { + "epoch": 2.9338389999823193, + "grad_norm": 4.094114303588867, + "learning_rate": 1.102683333628011e-06, + "loss": 0.4819, + "step": 331870 + }, + { + "epoch": 2.9339274032426315, + "grad_norm": 5.820641994476318, + "learning_rate": 1.1012099459561402e-06, + "loss": 0.5935, + "step": 331880 + }, + { + "epoch": 2.934015806502944, + "grad_norm": 3.2831220626831055, + "learning_rate": 1.0997365582842694e-06, + "loss": 0.5274, + "step": 331890 + }, + { + "epoch": 2.934104209763256, + "grad_norm": 2.3857738971710205, + "learning_rate": 1.0982631706123988e-06, + "loss": 0.5394, + "step": 331900 + }, + { + "epoch": 2.9341926130235683, + "grad_norm": 5.184228420257568, + "learning_rate": 1.0967897829405283e-06, + "loss": 0.4751, + "step": 331910 + }, + { + "epoch": 2.9342810162838804, + "grad_norm": 3.1267497539520264, + "learning_rate": 1.0953163952686577e-06, + "loss": 0.505, + "step": 331920 + }, + { + "epoch": 2.934369419544193, + "grad_norm": 3.7354633808135986, + "learning_rate": 1.0938430075967869e-06, + "loss": 0.5424, + "step": 331930 + }, + { + "epoch": 2.934457822804505, + "grad_norm": 1.6384035348892212, + "learning_rate": 1.0923696199249163e-06, + "loss": 0.481, + "step": 331940 + }, + { + "epoch": 2.934546226064817, + "grad_norm": 6.104953765869141, + "learning_rate": 1.0908962322530455e-06, + "loss": 0.684, + "step": 331950 + }, + { + "epoch": 2.9346346293251298, + "grad_norm": 2.2098443508148193, + "learning_rate": 1.0894228445811749e-06, + "loss": 0.5344, + "step": 331960 + }, + { + "epoch": 2.934723032585442, + "grad_norm": 2.8066742420196533, + "learning_rate": 1.0879494569093043e-06, + "loss": 0.4164, + "step": 331970 + }, + { + "epoch": 2.934811435845754, + "grad_norm": 3.0891292095184326, + "learning_rate": 1.0864760692374335e-06, + "loss": 0.4778, + "step": 331980 + }, + { + "epoch": 2.934899839106066, + "grad_norm": 7.632856369018555, + "learning_rate": 1.085002681565563e-06, + "loss": 0.4566, + "step": 331990 + }, + { + "epoch": 2.9349882423663782, + "grad_norm": 4.50180721282959, + "learning_rate": 1.0835292938936921e-06, + "loss": 0.609, + "step": 332000 + }, + { + "epoch": 2.935076645626691, + "grad_norm": 2.425632953643799, + "learning_rate": 1.0820559062218215e-06, + "loss": 0.4884, + "step": 332010 + }, + { + "epoch": 2.935165048887003, + "grad_norm": 5.2290778160095215, + "learning_rate": 1.080582518549951e-06, + "loss": 0.5354, + "step": 332020 + }, + { + "epoch": 2.935253452147315, + "grad_norm": 6.078485012054443, + "learning_rate": 1.0791091308780801e-06, + "loss": 0.5318, + "step": 332030 + }, + { + "epoch": 2.9353418554076276, + "grad_norm": 1.4493048191070557, + "learning_rate": 1.0776357432062096e-06, + "loss": 0.5304, + "step": 332040 + }, + { + "epoch": 2.9354302586679397, + "grad_norm": 4.487633228302002, + "learning_rate": 1.0761623555343388e-06, + "loss": 0.5147, + "step": 332050 + }, + { + "epoch": 2.935518661928252, + "grad_norm": 7.509797096252441, + "learning_rate": 1.0746889678624682e-06, + "loss": 0.4763, + "step": 332060 + }, + { + "epoch": 2.935607065188564, + "grad_norm": 9.135674476623535, + "learning_rate": 1.0732155801905974e-06, + "loss": 0.6236, + "step": 332070 + }, + { + "epoch": 2.9356954684488765, + "grad_norm": 3.586003303527832, + "learning_rate": 1.0717421925187268e-06, + "loss": 0.4395, + "step": 332080 + }, + { + "epoch": 2.9357838717091886, + "grad_norm": 1.9376856088638306, + "learning_rate": 1.0702688048468562e-06, + "loss": 0.4414, + "step": 332090 + }, + { + "epoch": 2.9358722749695008, + "grad_norm": 2.4797258377075195, + "learning_rate": 1.0687954171749854e-06, + "loss": 0.5281, + "step": 332100 + }, + { + "epoch": 2.9359606782298133, + "grad_norm": 2.6490156650543213, + "learning_rate": 1.0673220295031148e-06, + "loss": 0.5319, + "step": 332110 + }, + { + "epoch": 2.9360490814901254, + "grad_norm": 3.673570156097412, + "learning_rate": 1.065848641831244e-06, + "loss": 0.5358, + "step": 332120 + }, + { + "epoch": 2.9361374847504376, + "grad_norm": 4.618862628936768, + "learning_rate": 1.0643752541593734e-06, + "loss": 0.5274, + "step": 332130 + }, + { + "epoch": 2.9362258880107497, + "grad_norm": 4.26412296295166, + "learning_rate": 1.0629018664875028e-06, + "loss": 0.5972, + "step": 332140 + }, + { + "epoch": 2.936314291271062, + "grad_norm": 1.5690829753875732, + "learning_rate": 1.0614284788156323e-06, + "loss": 0.5386, + "step": 332150 + }, + { + "epoch": 2.9364026945313744, + "grad_norm": 3.449817180633545, + "learning_rate": 1.0599550911437615e-06, + "loss": 0.4492, + "step": 332160 + }, + { + "epoch": 2.9364910977916865, + "grad_norm": 2.107651472091675, + "learning_rate": 1.0584817034718907e-06, + "loss": 0.5526, + "step": 332170 + }, + { + "epoch": 2.936579501051999, + "grad_norm": 1.3826146125793457, + "learning_rate": 1.05700831580002e-06, + "loss": 0.5619, + "step": 332180 + }, + { + "epoch": 2.936667904312311, + "grad_norm": 2.128535509109497, + "learning_rate": 1.0555349281281493e-06, + "loss": 0.5274, + "step": 332190 + }, + { + "epoch": 2.9367563075726233, + "grad_norm": 10.150843620300293, + "learning_rate": 1.0540615404562789e-06, + "loss": 0.5806, + "step": 332200 + }, + { + "epoch": 2.9368447108329354, + "grad_norm": 1.0913158655166626, + "learning_rate": 1.052588152784408e-06, + "loss": 0.5479, + "step": 332210 + }, + { + "epoch": 2.9369331140932475, + "grad_norm": 2.365037202835083, + "learning_rate": 1.0511147651125375e-06, + "loss": 0.4543, + "step": 332220 + }, + { + "epoch": 2.93702151735356, + "grad_norm": 2.6533005237579346, + "learning_rate": 1.0496413774406667e-06, + "loss": 0.5348, + "step": 332230 + }, + { + "epoch": 2.937109920613872, + "grad_norm": 5.1026177406311035, + "learning_rate": 1.048167989768796e-06, + "loss": 0.5676, + "step": 332240 + }, + { + "epoch": 2.9371983238741843, + "grad_norm": 2.0468616485595703, + "learning_rate": 1.0466946020969253e-06, + "loss": 0.5577, + "step": 332250 + }, + { + "epoch": 2.937286727134497, + "grad_norm": 1.7090802192687988, + "learning_rate": 1.0452212144250547e-06, + "loss": 0.4824, + "step": 332260 + }, + { + "epoch": 2.937375130394809, + "grad_norm": 4.035421848297119, + "learning_rate": 1.0437478267531841e-06, + "loss": 0.4983, + "step": 332270 + }, + { + "epoch": 2.937463533655121, + "grad_norm": 2.8812856674194336, + "learning_rate": 1.0422744390813133e-06, + "loss": 0.5273, + "step": 332280 + }, + { + "epoch": 2.9375519369154333, + "grad_norm": 4.089599132537842, + "learning_rate": 1.0408010514094428e-06, + "loss": 0.5829, + "step": 332290 + }, + { + "epoch": 2.937640340175746, + "grad_norm": 6.103228569030762, + "learning_rate": 1.039327663737572e-06, + "loss": 0.474, + "step": 332300 + }, + { + "epoch": 2.937728743436058, + "grad_norm": 1.7176735401153564, + "learning_rate": 1.0378542760657014e-06, + "loss": 0.4398, + "step": 332310 + }, + { + "epoch": 2.93781714669637, + "grad_norm": 5.897286891937256, + "learning_rate": 1.0363808883938308e-06, + "loss": 0.4504, + "step": 332320 + }, + { + "epoch": 2.9379055499566826, + "grad_norm": 2.126053810119629, + "learning_rate": 1.03490750072196e-06, + "loss": 0.4995, + "step": 332330 + }, + { + "epoch": 2.9379939532169947, + "grad_norm": 4.046904563903809, + "learning_rate": 1.0334341130500894e-06, + "loss": 0.6357, + "step": 332340 + }, + { + "epoch": 2.938082356477307, + "grad_norm": 11.584725379943848, + "learning_rate": 1.0319607253782186e-06, + "loss": 0.5394, + "step": 332350 + }, + { + "epoch": 2.938170759737619, + "grad_norm": 3.5283756256103516, + "learning_rate": 1.030487337706348e-06, + "loss": 0.5035, + "step": 332360 + }, + { + "epoch": 2.938259162997931, + "grad_norm": 3.8177473545074463, + "learning_rate": 1.0290139500344774e-06, + "loss": 0.5078, + "step": 332370 + }, + { + "epoch": 2.9383475662582437, + "grad_norm": 2.6955838203430176, + "learning_rate": 1.0275405623626066e-06, + "loss": 0.3349, + "step": 332380 + }, + { + "epoch": 2.938435969518556, + "grad_norm": 10.545686721801758, + "learning_rate": 1.026067174690736e-06, + "loss": 0.4386, + "step": 332390 + }, + { + "epoch": 2.9385243727788684, + "grad_norm": 3.3737661838531494, + "learning_rate": 1.0245937870188652e-06, + "loss": 0.6348, + "step": 332400 + }, + { + "epoch": 2.9386127760391805, + "grad_norm": 1.6760369539260864, + "learning_rate": 1.0231203993469947e-06, + "loss": 0.4539, + "step": 332410 + }, + { + "epoch": 2.9387011792994926, + "grad_norm": 4.554879188537598, + "learning_rate": 1.0216470116751239e-06, + "loss": 0.572, + "step": 332420 + }, + { + "epoch": 2.9387895825598047, + "grad_norm": 5.877460956573486, + "learning_rate": 1.0201736240032533e-06, + "loss": 0.5887, + "step": 332430 + }, + { + "epoch": 2.938877985820117, + "grad_norm": 7.21752405166626, + "learning_rate": 1.0187002363313827e-06, + "loss": 0.5663, + "step": 332440 + }, + { + "epoch": 2.9389663890804294, + "grad_norm": 3.1777398586273193, + "learning_rate": 1.0172268486595119e-06, + "loss": 0.4906, + "step": 332450 + }, + { + "epoch": 2.9390547923407415, + "grad_norm": 1.618542194366455, + "learning_rate": 1.0157534609876413e-06, + "loss": 0.5273, + "step": 332460 + }, + { + "epoch": 2.9391431956010536, + "grad_norm": 21.029468536376953, + "learning_rate": 1.0142800733157705e-06, + "loss": 0.5309, + "step": 332470 + }, + { + "epoch": 2.939231598861366, + "grad_norm": 1.3133751153945923, + "learning_rate": 1.0128066856439e-06, + "loss": 0.5333, + "step": 332480 + }, + { + "epoch": 2.9393200021216783, + "grad_norm": 3.222263813018799, + "learning_rate": 1.0113332979720293e-06, + "loss": 0.492, + "step": 332490 + }, + { + "epoch": 2.9394084053819904, + "grad_norm": 3.4017038345336914, + "learning_rate": 1.0098599103001587e-06, + "loss": 0.4711, + "step": 332500 + }, + { + "epoch": 2.9394968086423026, + "grad_norm": 4.28942346572876, + "learning_rate": 1.008386522628288e-06, + "loss": 0.5615, + "step": 332510 + }, + { + "epoch": 2.939585211902615, + "grad_norm": 1.6358811855316162, + "learning_rate": 1.0069131349564171e-06, + "loss": 0.6046, + "step": 332520 + }, + { + "epoch": 2.9396736151629272, + "grad_norm": 2.0971052646636963, + "learning_rate": 1.0054397472845465e-06, + "loss": 0.4781, + "step": 332530 + }, + { + "epoch": 2.9397620184232394, + "grad_norm": 7.674422264099121, + "learning_rate": 1.003966359612676e-06, + "loss": 0.651, + "step": 332540 + }, + { + "epoch": 2.939850421683552, + "grad_norm": 3.3353617191314697, + "learning_rate": 1.0024929719408054e-06, + "loss": 0.57, + "step": 332550 + }, + { + "epoch": 2.939938824943864, + "grad_norm": 4.824578285217285, + "learning_rate": 1.0010195842689346e-06, + "loss": 0.4946, + "step": 332560 + }, + { + "epoch": 2.940027228204176, + "grad_norm": 3.3759148120880127, + "learning_rate": 9.99546196597064e-07, + "loss": 0.6086, + "step": 332570 + }, + { + "epoch": 2.9401156314644883, + "grad_norm": 2.1790993213653564, + "learning_rate": 9.980728089251932e-07, + "loss": 0.5365, + "step": 332580 + }, + { + "epoch": 2.9402040347248004, + "grad_norm": 6.781055450439453, + "learning_rate": 9.965994212533224e-07, + "loss": 0.5498, + "step": 332590 + }, + { + "epoch": 2.940292437985113, + "grad_norm": 9.963984489440918, + "learning_rate": 9.951260335814518e-07, + "loss": 0.4857, + "step": 332600 + }, + { + "epoch": 2.940380841245425, + "grad_norm": 2.3545727729797363, + "learning_rate": 9.936526459095812e-07, + "loss": 0.5305, + "step": 332610 + }, + { + "epoch": 2.940469244505737, + "grad_norm": 1.1940432786941528, + "learning_rate": 9.921792582377106e-07, + "loss": 0.4463, + "step": 332620 + }, + { + "epoch": 2.9405576477660498, + "grad_norm": 5.9522600173950195, + "learning_rate": 9.907058705658398e-07, + "loss": 0.5234, + "step": 332630 + }, + { + "epoch": 2.940646051026362, + "grad_norm": 16.055248260498047, + "learning_rate": 9.892324828939692e-07, + "loss": 0.4125, + "step": 332640 + }, + { + "epoch": 2.940734454286674, + "grad_norm": 4.16633415222168, + "learning_rate": 9.877590952220984e-07, + "loss": 0.4662, + "step": 332650 + }, + { + "epoch": 2.940822857546986, + "grad_norm": 12.208285331726074, + "learning_rate": 9.862857075502279e-07, + "loss": 0.4455, + "step": 332660 + }, + { + "epoch": 2.9409112608072987, + "grad_norm": 3.7897250652313232, + "learning_rate": 9.848123198783573e-07, + "loss": 0.4969, + "step": 332670 + }, + { + "epoch": 2.940999664067611, + "grad_norm": 13.477505683898926, + "learning_rate": 9.833389322064865e-07, + "loss": 0.4755, + "step": 332680 + }, + { + "epoch": 2.941088067327923, + "grad_norm": 7.623547554016113, + "learning_rate": 9.818655445346159e-07, + "loss": 0.4869, + "step": 332690 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 0.999243974685669, + "learning_rate": 9.80392156862745e-07, + "loss": 0.4478, + "step": 332700 + }, + { + "epoch": 2.9412648738485476, + "grad_norm": 1.285066843032837, + "learning_rate": 9.789187691908745e-07, + "loss": 0.5197, + "step": 332710 + }, + { + "epoch": 2.9413532771088597, + "grad_norm": 5.773447036743164, + "learning_rate": 9.77445381519004e-07, + "loss": 0.4762, + "step": 332720 + }, + { + "epoch": 2.941441680369172, + "grad_norm": 5.48266077041626, + "learning_rate": 9.759719938471331e-07, + "loss": 0.463, + "step": 332730 + }, + { + "epoch": 2.941530083629484, + "grad_norm": 1.6137820482254028, + "learning_rate": 9.744986061752625e-07, + "loss": 0.6361, + "step": 332740 + }, + { + "epoch": 2.9416184868897965, + "grad_norm": 2.1190803050994873, + "learning_rate": 9.730252185033917e-07, + "loss": 0.4882, + "step": 332750 + }, + { + "epoch": 2.9417068901501087, + "grad_norm": 8.648072242736816, + "learning_rate": 9.715518308315211e-07, + "loss": 0.5304, + "step": 332760 + }, + { + "epoch": 2.9417952934104212, + "grad_norm": 4.358310222625732, + "learning_rate": 9.700784431596503e-07, + "loss": 0.4188, + "step": 332770 + }, + { + "epoch": 2.9418836966707334, + "grad_norm": 9.545794486999512, + "learning_rate": 9.6860505548778e-07, + "loss": 0.4945, + "step": 332780 + }, + { + "epoch": 2.9419720999310455, + "grad_norm": 3.4538331031799316, + "learning_rate": 9.671316678159092e-07, + "loss": 0.4785, + "step": 332790 + }, + { + "epoch": 2.9420605031913576, + "grad_norm": 9.132399559020996, + "learning_rate": 9.656582801440384e-07, + "loss": 0.4963, + "step": 332800 + }, + { + "epoch": 2.9421489064516697, + "grad_norm": 2.2299134731292725, + "learning_rate": 9.641848924721678e-07, + "loss": 0.5284, + "step": 332810 + }, + { + "epoch": 2.9422373097119823, + "grad_norm": 3.733222484588623, + "learning_rate": 9.62711504800297e-07, + "loss": 0.542, + "step": 332820 + }, + { + "epoch": 2.9423257129722944, + "grad_norm": 8.98245906829834, + "learning_rate": 9.612381171284264e-07, + "loss": 0.6149, + "step": 332830 + }, + { + "epoch": 2.9424141162326065, + "grad_norm": 15.770872116088867, + "learning_rate": 9.597647294565558e-07, + "loss": 0.4945, + "step": 332840 + }, + { + "epoch": 2.942502519492919, + "grad_norm": 3.0826592445373535, + "learning_rate": 9.58291341784685e-07, + "loss": 0.5102, + "step": 332850 + }, + { + "epoch": 2.942590922753231, + "grad_norm": 4.801102161407471, + "learning_rate": 9.568179541128144e-07, + "loss": 0.5579, + "step": 332860 + }, + { + "epoch": 2.9426793260135433, + "grad_norm": 0.94375079870224, + "learning_rate": 9.553445664409436e-07, + "loss": 0.4755, + "step": 332870 + }, + { + "epoch": 2.9427677292738554, + "grad_norm": 5.371917247772217, + "learning_rate": 9.53871178769073e-07, + "loss": 0.5181, + "step": 332880 + }, + { + "epoch": 2.942856132534168, + "grad_norm": 5.845797538757324, + "learning_rate": 9.523977910972023e-07, + "loss": 0.4599, + "step": 332890 + }, + { + "epoch": 2.94294453579448, + "grad_norm": 4.666518211364746, + "learning_rate": 9.509244034253317e-07, + "loss": 0.6349, + "step": 332900 + }, + { + "epoch": 2.9430329390547922, + "grad_norm": 1.4262235164642334, + "learning_rate": 9.494510157534611e-07, + "loss": 0.5605, + "step": 332910 + }, + { + "epoch": 2.943121342315105, + "grad_norm": 3.0433619022369385, + "learning_rate": 9.479776280815903e-07, + "loss": 0.5244, + "step": 332920 + }, + { + "epoch": 2.943209745575417, + "grad_norm": 5.918675422668457, + "learning_rate": 9.465042404097197e-07, + "loss": 0.6694, + "step": 332930 + }, + { + "epoch": 2.943298148835729, + "grad_norm": 2.5652616024017334, + "learning_rate": 9.45030852737849e-07, + "loss": 0.4277, + "step": 332940 + }, + { + "epoch": 2.943386552096041, + "grad_norm": 1.1091159582138062, + "learning_rate": 9.435574650659784e-07, + "loss": 0.5324, + "step": 332950 + }, + { + "epoch": 2.9434749553563533, + "grad_norm": 7.666991710662842, + "learning_rate": 9.420840773941076e-07, + "loss": 0.4638, + "step": 332960 + }, + { + "epoch": 2.943563358616666, + "grad_norm": 2.2862911224365234, + "learning_rate": 9.406106897222371e-07, + "loss": 0.4286, + "step": 332970 + }, + { + "epoch": 2.943651761876978, + "grad_norm": 1.9438583850860596, + "learning_rate": 9.391373020503663e-07, + "loss": 0.5401, + "step": 332980 + }, + { + "epoch": 2.9437401651372905, + "grad_norm": 2.360994815826416, + "learning_rate": 9.376639143784956e-07, + "loss": 0.4413, + "step": 332990 + }, + { + "epoch": 2.9438285683976027, + "grad_norm": 7.841155529022217, + "learning_rate": 9.36190526706625e-07, + "loss": 0.5574, + "step": 333000 + }, + { + "epoch": 2.9439169716579148, + "grad_norm": 1.4954890012741089, + "learning_rate": 9.347171390347542e-07, + "loss": 0.5363, + "step": 333010 + }, + { + "epoch": 2.944005374918227, + "grad_norm": 5.310091495513916, + "learning_rate": 9.332437513628836e-07, + "loss": 0.5211, + "step": 333020 + }, + { + "epoch": 2.944093778178539, + "grad_norm": 3.8453562259674072, + "learning_rate": 9.31770363691013e-07, + "loss": 0.4257, + "step": 333030 + }, + { + "epoch": 2.9441821814388516, + "grad_norm": 2.9443719387054443, + "learning_rate": 9.302969760191424e-07, + "loss": 0.5891, + "step": 333040 + }, + { + "epoch": 2.9442705846991637, + "grad_norm": 4.378458499908447, + "learning_rate": 9.288235883472716e-07, + "loss": 0.6031, + "step": 333050 + }, + { + "epoch": 2.944358987959476, + "grad_norm": 9.849979400634766, + "learning_rate": 9.273502006754009e-07, + "loss": 0.6458, + "step": 333060 + }, + { + "epoch": 2.9444473912197884, + "grad_norm": 2.6421234607696533, + "learning_rate": 9.258768130035303e-07, + "loss": 0.7319, + "step": 333070 + }, + { + "epoch": 2.9445357944801005, + "grad_norm": 1.6617428064346313, + "learning_rate": 9.244034253316596e-07, + "loss": 0.4842, + "step": 333080 + }, + { + "epoch": 2.9446241977404126, + "grad_norm": 11.12474250793457, + "learning_rate": 9.22930037659789e-07, + "loss": 0.6069, + "step": 333090 + }, + { + "epoch": 2.9447126010007247, + "grad_norm": 2.212523937225342, + "learning_rate": 9.214566499879182e-07, + "loss": 0.4424, + "step": 333100 + }, + { + "epoch": 2.9448010042610373, + "grad_norm": 2.6969857215881348, + "learning_rate": 9.199832623160476e-07, + "loss": 0.4036, + "step": 333110 + }, + { + "epoch": 2.9448894075213494, + "grad_norm": 1.7578719854354858, + "learning_rate": 9.185098746441769e-07, + "loss": 0.4762, + "step": 333120 + }, + { + "epoch": 2.9449778107816615, + "grad_norm": 6.0614776611328125, + "learning_rate": 9.170364869723061e-07, + "loss": 0.5643, + "step": 333130 + }, + { + "epoch": 2.945066214041974, + "grad_norm": 1.6259849071502686, + "learning_rate": 9.155630993004356e-07, + "loss": 0.6929, + "step": 333140 + }, + { + "epoch": 2.9451546173022862, + "grad_norm": 6.305258274078369, + "learning_rate": 9.140897116285648e-07, + "loss": 0.4915, + "step": 333150 + }, + { + "epoch": 2.9452430205625983, + "grad_norm": 2.2439615726470947, + "learning_rate": 9.126163239566943e-07, + "loss": 0.5649, + "step": 333160 + }, + { + "epoch": 2.9453314238229105, + "grad_norm": 3.912944793701172, + "learning_rate": 9.111429362848236e-07, + "loss": 0.4671, + "step": 333170 + }, + { + "epoch": 2.9454198270832226, + "grad_norm": 3.572465181350708, + "learning_rate": 9.09669548612953e-07, + "loss": 0.4359, + "step": 333180 + }, + { + "epoch": 2.945508230343535, + "grad_norm": 1.6161839962005615, + "learning_rate": 9.081961609410822e-07, + "loss": 0.5815, + "step": 333190 + }, + { + "epoch": 2.9455966336038473, + "grad_norm": 1.1468473672866821, + "learning_rate": 9.067227732692115e-07, + "loss": 0.4831, + "step": 333200 + }, + { + "epoch": 2.9456850368641594, + "grad_norm": 3.3126280307769775, + "learning_rate": 9.052493855973409e-07, + "loss": 0.5276, + "step": 333210 + }, + { + "epoch": 2.945773440124472, + "grad_norm": 3.516404151916504, + "learning_rate": 9.037759979254701e-07, + "loss": 0.5144, + "step": 333220 + }, + { + "epoch": 2.945861843384784, + "grad_norm": 8.798751831054688, + "learning_rate": 9.023026102535996e-07, + "loss": 0.3529, + "step": 333230 + }, + { + "epoch": 2.945950246645096, + "grad_norm": 22.89048957824707, + "learning_rate": 9.008292225817288e-07, + "loss": 0.5646, + "step": 333240 + }, + { + "epoch": 2.9460386499054083, + "grad_norm": 3.773745536804199, + "learning_rate": 8.993558349098582e-07, + "loss": 0.6108, + "step": 333250 + }, + { + "epoch": 2.946127053165721, + "grad_norm": 6.805750846862793, + "learning_rate": 8.978824472379875e-07, + "loss": 0.5413, + "step": 333260 + }, + { + "epoch": 2.946215456426033, + "grad_norm": 2.8167974948883057, + "learning_rate": 8.964090595661167e-07, + "loss": 0.5375, + "step": 333270 + }, + { + "epoch": 2.946303859686345, + "grad_norm": 3.228610038757324, + "learning_rate": 8.949356718942462e-07, + "loss": 0.7201, + "step": 333280 + }, + { + "epoch": 2.9463922629466577, + "grad_norm": 2.6536288261413574, + "learning_rate": 8.934622842223755e-07, + "loss": 0.5638, + "step": 333290 + }, + { + "epoch": 2.94648066620697, + "grad_norm": 2.690816879272461, + "learning_rate": 8.919888965505049e-07, + "loss": 0.4073, + "step": 333300 + }, + { + "epoch": 2.946569069467282, + "grad_norm": 4.327020645141602, + "learning_rate": 8.905155088786341e-07, + "loss": 0.4578, + "step": 333310 + }, + { + "epoch": 2.946657472727594, + "grad_norm": 3.299421787261963, + "learning_rate": 8.890421212067636e-07, + "loss": 0.4913, + "step": 333320 + }, + { + "epoch": 2.946745875987906, + "grad_norm": 2.975013256072998, + "learning_rate": 8.875687335348928e-07, + "loss": 0.5296, + "step": 333330 + }, + { + "epoch": 2.9468342792482187, + "grad_norm": 2.1690027713775635, + "learning_rate": 8.860953458630221e-07, + "loss": 0.5183, + "step": 333340 + }, + { + "epoch": 2.946922682508531, + "grad_norm": 2.833936929702759, + "learning_rate": 8.846219581911515e-07, + "loss": 0.4961, + "step": 333350 + }, + { + "epoch": 2.9470110857688434, + "grad_norm": 2.3511664867401123, + "learning_rate": 8.831485705192807e-07, + "loss": 0.5357, + "step": 333360 + }, + { + "epoch": 2.9470994890291555, + "grad_norm": 2.2639269828796387, + "learning_rate": 8.816751828474101e-07, + "loss": 0.4672, + "step": 333370 + }, + { + "epoch": 2.9471878922894676, + "grad_norm": 3.731318950653076, + "learning_rate": 8.802017951755394e-07, + "loss": 0.5929, + "step": 333380 + }, + { + "epoch": 2.9472762955497798, + "grad_norm": 2.742482900619507, + "learning_rate": 8.787284075036688e-07, + "loss": 0.428, + "step": 333390 + }, + { + "epoch": 2.947364698810092, + "grad_norm": 3.5775883197784424, + "learning_rate": 8.772550198317982e-07, + "loss": 0.5334, + "step": 333400 + }, + { + "epoch": 2.9474531020704045, + "grad_norm": 2.745065689086914, + "learning_rate": 8.757816321599274e-07, + "loss": 0.4658, + "step": 333410 + }, + { + "epoch": 2.9475415053307166, + "grad_norm": 3.0185611248016357, + "learning_rate": 8.743082444880568e-07, + "loss": 0.4586, + "step": 333420 + }, + { + "epoch": 2.9476299085910287, + "grad_norm": 17.968290328979492, + "learning_rate": 8.728348568161861e-07, + "loss": 0.5229, + "step": 333430 + }, + { + "epoch": 2.9477183118513413, + "grad_norm": 2.011587142944336, + "learning_rate": 8.713614691443155e-07, + "loss": 0.3516, + "step": 333440 + }, + { + "epoch": 2.9478067151116534, + "grad_norm": 3.3450372219085693, + "learning_rate": 8.698880814724447e-07, + "loss": 0.3455, + "step": 333450 + }, + { + "epoch": 2.9478951183719655, + "grad_norm": 3.2161145210266113, + "learning_rate": 8.684146938005741e-07, + "loss": 0.5643, + "step": 333460 + }, + { + "epoch": 2.9479835216322776, + "grad_norm": 1.947377324104309, + "learning_rate": 8.669413061287034e-07, + "loss": 0.4889, + "step": 333470 + }, + { + "epoch": 2.94807192489259, + "grad_norm": 2.750883102416992, + "learning_rate": 8.654679184568326e-07, + "loss": 0.5619, + "step": 333480 + }, + { + "epoch": 2.9481603281529023, + "grad_norm": 8.375869750976562, + "learning_rate": 8.639945307849621e-07, + "loss": 0.4758, + "step": 333490 + }, + { + "epoch": 2.9482487314132144, + "grad_norm": 8.448152542114258, + "learning_rate": 8.625211431130913e-07, + "loss": 0.5627, + "step": 333500 + }, + { + "epoch": 2.948337134673527, + "grad_norm": 18.997201919555664, + "learning_rate": 8.610477554412207e-07, + "loss": 0.4383, + "step": 333510 + }, + { + "epoch": 2.948425537933839, + "grad_norm": 2.511136054992676, + "learning_rate": 8.5957436776935e-07, + "loss": 0.4734, + "step": 333520 + }, + { + "epoch": 2.9485139411941512, + "grad_norm": 7.374281883239746, + "learning_rate": 8.581009800974795e-07, + "loss": 0.4134, + "step": 333530 + }, + { + "epoch": 2.9486023444544633, + "grad_norm": 1.5602126121520996, + "learning_rate": 8.566275924256087e-07, + "loss": 0.4992, + "step": 333540 + }, + { + "epoch": 2.9486907477147755, + "grad_norm": 5.371391773223877, + "learning_rate": 8.55154204753738e-07, + "loss": 0.5907, + "step": 333550 + }, + { + "epoch": 2.948779150975088, + "grad_norm": 5.465899467468262, + "learning_rate": 8.536808170818674e-07, + "loss": 0.4476, + "step": 333560 + }, + { + "epoch": 2.9488675542354, + "grad_norm": 1.1193711757659912, + "learning_rate": 8.522074294099966e-07, + "loss": 0.4117, + "step": 333570 + }, + { + "epoch": 2.9489559574957127, + "grad_norm": 1.9738980531692505, + "learning_rate": 8.507340417381261e-07, + "loss": 0.4956, + "step": 333580 + }, + { + "epoch": 2.949044360756025, + "grad_norm": 8.726580619812012, + "learning_rate": 8.492606540662553e-07, + "loss": 0.5148, + "step": 333590 + }, + { + "epoch": 2.949132764016337, + "grad_norm": 2.082930326461792, + "learning_rate": 8.477872663943847e-07, + "loss": 0.5372, + "step": 333600 + }, + { + "epoch": 2.949221167276649, + "grad_norm": 1.2331546545028687, + "learning_rate": 8.46313878722514e-07, + "loss": 0.493, + "step": 333610 + }, + { + "epoch": 2.949309570536961, + "grad_norm": 4.893377304077148, + "learning_rate": 8.448404910506432e-07, + "loss": 0.6248, + "step": 333620 + }, + { + "epoch": 2.9493979737972738, + "grad_norm": 1.7880651950836182, + "learning_rate": 8.433671033787726e-07, + "loss": 0.6502, + "step": 333630 + }, + { + "epoch": 2.949486377057586, + "grad_norm": 5.033793926239014, + "learning_rate": 8.418937157069019e-07, + "loss": 0.4924, + "step": 333640 + }, + { + "epoch": 2.949574780317898, + "grad_norm": 4.1373291015625, + "learning_rate": 8.404203280350314e-07, + "loss": 0.4929, + "step": 333650 + }, + { + "epoch": 2.9496631835782106, + "grad_norm": 2.2214720249176025, + "learning_rate": 8.389469403631606e-07, + "loss": 0.404, + "step": 333660 + }, + { + "epoch": 2.9497515868385227, + "grad_norm": 7.528933048248291, + "learning_rate": 8.374735526912901e-07, + "loss": 0.5115, + "step": 333670 + }, + { + "epoch": 2.949839990098835, + "grad_norm": 2.428839921951294, + "learning_rate": 8.360001650194193e-07, + "loss": 0.6041, + "step": 333680 + }, + { + "epoch": 2.949928393359147, + "grad_norm": 4.893891334533691, + "learning_rate": 8.345267773475486e-07, + "loss": 0.5467, + "step": 333690 + }, + { + "epoch": 2.9500167966194595, + "grad_norm": 5.1482157707214355, + "learning_rate": 8.33053389675678e-07, + "loss": 0.5782, + "step": 333700 + }, + { + "epoch": 2.9501051998797716, + "grad_norm": 5.960176944732666, + "learning_rate": 8.315800020038072e-07, + "loss": 0.5043, + "step": 333710 + }, + { + "epoch": 2.9501936031400837, + "grad_norm": 8.360854148864746, + "learning_rate": 8.301066143319366e-07, + "loss": 0.4148, + "step": 333720 + }, + { + "epoch": 2.9502820064003963, + "grad_norm": 5.775094509124756, + "learning_rate": 8.286332266600659e-07, + "loss": 0.5563, + "step": 333730 + }, + { + "epoch": 2.9503704096607084, + "grad_norm": 3.2711892127990723, + "learning_rate": 8.271598389881953e-07, + "loss": 0.4503, + "step": 333740 + }, + { + "epoch": 2.9504588129210205, + "grad_norm": 7.296770095825195, + "learning_rate": 8.256864513163246e-07, + "loss": 0.5718, + "step": 333750 + }, + { + "epoch": 2.9505472161813326, + "grad_norm": 7.407886028289795, + "learning_rate": 8.242130636444538e-07, + "loss": 0.4638, + "step": 333760 + }, + { + "epoch": 2.9506356194416448, + "grad_norm": 2.9688029289245605, + "learning_rate": 8.227396759725832e-07, + "loss": 0.4263, + "step": 333770 + }, + { + "epoch": 2.9507240227019573, + "grad_norm": 10.361343383789062, + "learning_rate": 8.212662883007126e-07, + "loss": 0.5703, + "step": 333780 + }, + { + "epoch": 2.9508124259622694, + "grad_norm": 6.949721336364746, + "learning_rate": 8.19792900628842e-07, + "loss": 0.6177, + "step": 333790 + }, + { + "epoch": 2.9509008292225816, + "grad_norm": 1.561598300933838, + "learning_rate": 8.183195129569712e-07, + "loss": 0.4503, + "step": 333800 + }, + { + "epoch": 2.950989232482894, + "grad_norm": 1.8667621612548828, + "learning_rate": 8.168461252851006e-07, + "loss": 0.3868, + "step": 333810 + }, + { + "epoch": 2.9510776357432063, + "grad_norm": 10.274473190307617, + "learning_rate": 8.153727376132299e-07, + "loss": 0.497, + "step": 333820 + }, + { + "epoch": 2.9511660390035184, + "grad_norm": 1.7416595220565796, + "learning_rate": 8.138993499413591e-07, + "loss": 0.6172, + "step": 333830 + }, + { + "epoch": 2.9512544422638305, + "grad_norm": 4.961159706115723, + "learning_rate": 8.124259622694886e-07, + "loss": 0.5509, + "step": 333840 + }, + { + "epoch": 2.951342845524143, + "grad_norm": 1.0651915073394775, + "learning_rate": 8.109525745976178e-07, + "loss": 0.5916, + "step": 333850 + }, + { + "epoch": 2.951431248784455, + "grad_norm": 2.633289337158203, + "learning_rate": 8.094791869257472e-07, + "loss": 0.5661, + "step": 333860 + }, + { + "epoch": 2.9515196520447673, + "grad_norm": 6.613982200622559, + "learning_rate": 8.080057992538765e-07, + "loss": 0.5175, + "step": 333870 + }, + { + "epoch": 2.95160805530508, + "grad_norm": 3.4101953506469727, + "learning_rate": 8.065324115820059e-07, + "loss": 0.6054, + "step": 333880 + }, + { + "epoch": 2.951696458565392, + "grad_norm": 3.970576524734497, + "learning_rate": 8.050590239101351e-07, + "loss": 0.4821, + "step": 333890 + }, + { + "epoch": 2.951784861825704, + "grad_norm": 5.610123634338379, + "learning_rate": 8.035856362382644e-07, + "loss": 0.5845, + "step": 333900 + }, + { + "epoch": 2.951873265086016, + "grad_norm": 1.469955325126648, + "learning_rate": 8.021122485663939e-07, + "loss": 0.4818, + "step": 333910 + }, + { + "epoch": 2.9519616683463283, + "grad_norm": 11.562004089355469, + "learning_rate": 8.006388608945231e-07, + "loss": 0.405, + "step": 333920 + }, + { + "epoch": 2.952050071606641, + "grad_norm": 5.010549068450928, + "learning_rate": 7.991654732226526e-07, + "loss": 0.6251, + "step": 333930 + }, + { + "epoch": 2.952138474866953, + "grad_norm": 3.5327095985412598, + "learning_rate": 7.976920855507818e-07, + "loss": 0.4634, + "step": 333940 + }, + { + "epoch": 2.9522268781272656, + "grad_norm": 2.0082502365112305, + "learning_rate": 7.962186978789112e-07, + "loss": 0.5711, + "step": 333950 + }, + { + "epoch": 2.9523152813875777, + "grad_norm": 10.683242797851562, + "learning_rate": 7.947453102070405e-07, + "loss": 0.6095, + "step": 333960 + }, + { + "epoch": 2.95240368464789, + "grad_norm": 3.3387339115142822, + "learning_rate": 7.932719225351697e-07, + "loss": 0.566, + "step": 333970 + }, + { + "epoch": 2.952492087908202, + "grad_norm": 5.607863426208496, + "learning_rate": 7.917985348632991e-07, + "loss": 0.4949, + "step": 333980 + }, + { + "epoch": 2.952580491168514, + "grad_norm": 1.988485336303711, + "learning_rate": 7.903251471914284e-07, + "loss": 0.5265, + "step": 333990 + }, + { + "epoch": 2.9526688944288266, + "grad_norm": 8.47644329071045, + "learning_rate": 7.888517595195578e-07, + "loss": 0.5757, + "step": 334000 + }, + { + "epoch": 2.9527572976891387, + "grad_norm": 5.447516441345215, + "learning_rate": 7.873783718476871e-07, + "loss": 0.4511, + "step": 334010 + }, + { + "epoch": 2.952845700949451, + "grad_norm": 25.611717224121094, + "learning_rate": 7.859049841758166e-07, + "loss": 0.4661, + "step": 334020 + }, + { + "epoch": 2.9529341042097634, + "grad_norm": 7.268472194671631, + "learning_rate": 7.844315965039458e-07, + "loss": 0.5711, + "step": 334030 + }, + { + "epoch": 2.9530225074700756, + "grad_norm": 3.1688613891601562, + "learning_rate": 7.829582088320751e-07, + "loss": 0.5417, + "step": 334040 + }, + { + "epoch": 2.9531109107303877, + "grad_norm": 3.6444308757781982, + "learning_rate": 7.814848211602045e-07, + "loss": 0.4988, + "step": 334050 + }, + { + "epoch": 2.9531993139907, + "grad_norm": 5.53487491607666, + "learning_rate": 7.800114334883338e-07, + "loss": 0.5562, + "step": 334060 + }, + { + "epoch": 2.9532877172510124, + "grad_norm": 1.53546142578125, + "learning_rate": 7.785380458164631e-07, + "loss": 0.5051, + "step": 334070 + }, + { + "epoch": 2.9533761205113245, + "grad_norm": 4.140305995941162, + "learning_rate": 7.770646581445924e-07, + "loss": 0.4633, + "step": 334080 + }, + { + "epoch": 2.9534645237716366, + "grad_norm": 23.085290908813477, + "learning_rate": 7.755912704727217e-07, + "loss": 0.5406, + "step": 334090 + }, + { + "epoch": 2.953552927031949, + "grad_norm": 4.462161540985107, + "learning_rate": 7.741178828008511e-07, + "loss": 0.6629, + "step": 334100 + }, + { + "epoch": 2.9536413302922613, + "grad_norm": 3.2922680377960205, + "learning_rate": 7.726444951289804e-07, + "loss": 0.4981, + "step": 334110 + }, + { + "epoch": 2.9537297335525734, + "grad_norm": 26.30126190185547, + "learning_rate": 7.711711074571097e-07, + "loss": 0.5466, + "step": 334120 + }, + { + "epoch": 2.9538181368128855, + "grad_norm": 7.237841606140137, + "learning_rate": 7.696977197852391e-07, + "loss": 0.368, + "step": 334130 + }, + { + "epoch": 2.9539065400731976, + "grad_norm": 6.514906406402588, + "learning_rate": 7.682243321133683e-07, + "loss": 0.5002, + "step": 334140 + }, + { + "epoch": 2.95399494333351, + "grad_norm": 3.160898208618164, + "learning_rate": 7.667509444414976e-07, + "loss": 0.4471, + "step": 334150 + }, + { + "epoch": 2.9540833465938223, + "grad_norm": 0.9255017042160034, + "learning_rate": 7.652775567696271e-07, + "loss": 0.5742, + "step": 334160 + }, + { + "epoch": 2.954171749854135, + "grad_norm": 2.5488171577453613, + "learning_rate": 7.638041690977564e-07, + "loss": 0.5204, + "step": 334170 + }, + { + "epoch": 2.954260153114447, + "grad_norm": 3.150758743286133, + "learning_rate": 7.623307814258857e-07, + "loss": 0.5383, + "step": 334180 + }, + { + "epoch": 2.954348556374759, + "grad_norm": 1.625661015510559, + "learning_rate": 7.608573937540151e-07, + "loss": 0.453, + "step": 334190 + }, + { + "epoch": 2.9544369596350712, + "grad_norm": 4.003690242767334, + "learning_rate": 7.593840060821444e-07, + "loss": 0.502, + "step": 334200 + }, + { + "epoch": 2.9545253628953834, + "grad_norm": 2.9177732467651367, + "learning_rate": 7.579106184102736e-07, + "loss": 0.5049, + "step": 334210 + }, + { + "epoch": 2.954613766155696, + "grad_norm": 1.399957537651062, + "learning_rate": 7.56437230738403e-07, + "loss": 0.4222, + "step": 334220 + }, + { + "epoch": 2.954702169416008, + "grad_norm": 0.8438954949378967, + "learning_rate": 7.549638430665323e-07, + "loss": 0.6213, + "step": 334230 + }, + { + "epoch": 2.95479057267632, + "grad_norm": 20.533790588378906, + "learning_rate": 7.534904553946616e-07, + "loss": 0.4865, + "step": 334240 + }, + { + "epoch": 2.9548789759366327, + "grad_norm": 5.626286029815674, + "learning_rate": 7.52017067722791e-07, + "loss": 0.5822, + "step": 334250 + }, + { + "epoch": 2.954967379196945, + "grad_norm": 3.055326223373413, + "learning_rate": 7.505436800509203e-07, + "loss": 0.6399, + "step": 334260 + }, + { + "epoch": 2.955055782457257, + "grad_norm": 2.5545554161071777, + "learning_rate": 7.490702923790496e-07, + "loss": 0.5413, + "step": 334270 + }, + { + "epoch": 2.955144185717569, + "grad_norm": 3.986860513687134, + "learning_rate": 7.47596904707179e-07, + "loss": 0.4101, + "step": 334280 + }, + { + "epoch": 2.9552325889778817, + "grad_norm": 4.340065002441406, + "learning_rate": 7.461235170353083e-07, + "loss": 0.5367, + "step": 334290 + }, + { + "epoch": 2.955320992238194, + "grad_norm": 2.8179190158843994, + "learning_rate": 7.446501293634376e-07, + "loss": 0.4524, + "step": 334300 + }, + { + "epoch": 2.955409395498506, + "grad_norm": 3.9451775550842285, + "learning_rate": 7.43176741691567e-07, + "loss": 0.5393, + "step": 334310 + }, + { + "epoch": 2.9554977987588185, + "grad_norm": 3.5236241817474365, + "learning_rate": 7.417033540196963e-07, + "loss": 0.4822, + "step": 334320 + }, + { + "epoch": 2.9555862020191306, + "grad_norm": 2.772582530975342, + "learning_rate": 7.402299663478256e-07, + "loss": 0.439, + "step": 334330 + }, + { + "epoch": 2.9556746052794427, + "grad_norm": 1.7575722932815552, + "learning_rate": 7.38756578675955e-07, + "loss": 0.6527, + "step": 334340 + }, + { + "epoch": 2.955763008539755, + "grad_norm": 3.809518337249756, + "learning_rate": 7.372831910040842e-07, + "loss": 0.4831, + "step": 334350 + }, + { + "epoch": 2.955851411800067, + "grad_norm": 12.59978199005127, + "learning_rate": 7.358098033322136e-07, + "loss": 0.4626, + "step": 334360 + }, + { + "epoch": 2.9559398150603795, + "grad_norm": 1.3109796047210693, + "learning_rate": 7.343364156603429e-07, + "loss": 0.4297, + "step": 334370 + }, + { + "epoch": 2.9560282183206916, + "grad_norm": 2.7201597690582275, + "learning_rate": 7.328630279884722e-07, + "loss": 0.5776, + "step": 334380 + }, + { + "epoch": 2.9561166215810037, + "grad_norm": 2.199281692504883, + "learning_rate": 7.313896403166015e-07, + "loss": 0.5214, + "step": 334390 + }, + { + "epoch": 2.9562050248413163, + "grad_norm": 2.814915895462036, + "learning_rate": 7.29916252644731e-07, + "loss": 0.3882, + "step": 334400 + }, + { + "epoch": 2.9562934281016284, + "grad_norm": 3.8725502490997314, + "learning_rate": 7.284428649728602e-07, + "loss": 0.5641, + "step": 334410 + }, + { + "epoch": 2.9563818313619405, + "grad_norm": 13.94198226928711, + "learning_rate": 7.269694773009896e-07, + "loss": 0.5635, + "step": 334420 + }, + { + "epoch": 2.9564702346222527, + "grad_norm": 2.649346351623535, + "learning_rate": 7.254960896291189e-07, + "loss": 0.4198, + "step": 334430 + }, + { + "epoch": 2.9565586378825652, + "grad_norm": 3.578718900680542, + "learning_rate": 7.240227019572482e-07, + "loss": 0.5553, + "step": 334440 + }, + { + "epoch": 2.9566470411428774, + "grad_norm": 2.2096705436706543, + "learning_rate": 7.225493142853776e-07, + "loss": 0.5251, + "step": 334450 + }, + { + "epoch": 2.9567354444031895, + "grad_norm": 5.267762184143066, + "learning_rate": 7.210759266135069e-07, + "loss": 0.5294, + "step": 334460 + }, + { + "epoch": 2.956823847663502, + "grad_norm": 6.0921783447265625, + "learning_rate": 7.196025389416362e-07, + "loss": 0.5082, + "step": 334470 + }, + { + "epoch": 2.956912250923814, + "grad_norm": 14.21677303314209, + "learning_rate": 7.181291512697655e-07, + "loss": 0.4194, + "step": 334480 + }, + { + "epoch": 2.9570006541841263, + "grad_norm": 3.5184035301208496, + "learning_rate": 7.166557635978948e-07, + "loss": 0.5905, + "step": 334490 + }, + { + "epoch": 2.9570890574444384, + "grad_norm": 1.378141164779663, + "learning_rate": 7.151823759260241e-07, + "loss": 0.5196, + "step": 334500 + }, + { + "epoch": 2.9571774607047505, + "grad_norm": 4.943230628967285, + "learning_rate": 7.137089882541535e-07, + "loss": 0.5086, + "step": 334510 + }, + { + "epoch": 2.957265863965063, + "grad_norm": 2.2014925479888916, + "learning_rate": 7.122356005822829e-07, + "loss": 0.47, + "step": 334520 + }, + { + "epoch": 2.957354267225375, + "grad_norm": 1.7577694654464722, + "learning_rate": 7.107622129104122e-07, + "loss": 0.4388, + "step": 334530 + }, + { + "epoch": 2.9574426704856878, + "grad_norm": 5.862669467926025, + "learning_rate": 7.092888252385416e-07, + "loss": 0.4382, + "step": 334540 + }, + { + "epoch": 2.957531073746, + "grad_norm": 1.3559565544128418, + "learning_rate": 7.078154375666708e-07, + "loss": 0.4103, + "step": 334550 + }, + { + "epoch": 2.957619477006312, + "grad_norm": 5.277124404907227, + "learning_rate": 7.063420498948001e-07, + "loss": 0.5457, + "step": 334560 + }, + { + "epoch": 2.957707880266624, + "grad_norm": 5.90317964553833, + "learning_rate": 7.048686622229295e-07, + "loss": 0.527, + "step": 334570 + }, + { + "epoch": 2.9577962835269362, + "grad_norm": 3.4700894355773926, + "learning_rate": 7.033952745510588e-07, + "loss": 0.556, + "step": 334580 + }, + { + "epoch": 2.957884686787249, + "grad_norm": 3.2960946559906006, + "learning_rate": 7.019218868791881e-07, + "loss": 0.5067, + "step": 334590 + }, + { + "epoch": 2.957973090047561, + "grad_norm": 2.2402281761169434, + "learning_rate": 7.004484992073175e-07, + "loss": 0.5067, + "step": 334600 + }, + { + "epoch": 2.958061493307873, + "grad_norm": 2.073333978652954, + "learning_rate": 6.989751115354468e-07, + "loss": 0.5825, + "step": 334610 + }, + { + "epoch": 2.9581498965681856, + "grad_norm": 4.293914318084717, + "learning_rate": 6.975017238635761e-07, + "loss": 0.5149, + "step": 334620 + }, + { + "epoch": 2.9582382998284977, + "grad_norm": 2.3442301750183105, + "learning_rate": 6.960283361917054e-07, + "loss": 0.3953, + "step": 334630 + }, + { + "epoch": 2.95832670308881, + "grad_norm": 6.900905609130859, + "learning_rate": 6.945549485198347e-07, + "loss": 0.4292, + "step": 334640 + }, + { + "epoch": 2.958415106349122, + "grad_norm": 2.998425245285034, + "learning_rate": 6.93081560847964e-07, + "loss": 0.5202, + "step": 334650 + }, + { + "epoch": 2.9585035096094345, + "grad_norm": 7.410462856292725, + "learning_rate": 6.916081731760935e-07, + "loss": 0.3789, + "step": 334660 + }, + { + "epoch": 2.9585919128697467, + "grad_norm": 2.787964344024658, + "learning_rate": 6.901347855042228e-07, + "loss": 0.3398, + "step": 334670 + }, + { + "epoch": 2.9586803161300588, + "grad_norm": 4.058386325836182, + "learning_rate": 6.886613978323521e-07, + "loss": 0.6186, + "step": 334680 + }, + { + "epoch": 2.9587687193903713, + "grad_norm": 0.9971840977668762, + "learning_rate": 6.871880101604814e-07, + "loss": 0.4516, + "step": 334690 + }, + { + "epoch": 2.9588571226506835, + "grad_norm": 2.1134817600250244, + "learning_rate": 6.857146224886107e-07, + "loss": 0.4451, + "step": 334700 + }, + { + "epoch": 2.9589455259109956, + "grad_norm": 4.027384281158447, + "learning_rate": 6.842412348167401e-07, + "loss": 0.4103, + "step": 334710 + }, + { + "epoch": 2.9590339291713077, + "grad_norm": 3.3291208744049072, + "learning_rate": 6.827678471448694e-07, + "loss": 0.6249, + "step": 334720 + }, + { + "epoch": 2.95912233243162, + "grad_norm": 6.9080095291137695, + "learning_rate": 6.812944594729987e-07, + "loss": 0.5769, + "step": 334730 + }, + { + "epoch": 2.9592107356919324, + "grad_norm": 2.623723268508911, + "learning_rate": 6.798210718011281e-07, + "loss": 0.3833, + "step": 334740 + }, + { + "epoch": 2.9592991389522445, + "grad_norm": 4.221076011657715, + "learning_rate": 6.783476841292574e-07, + "loss": 0.5561, + "step": 334750 + }, + { + "epoch": 2.959387542212557, + "grad_norm": 12.261418342590332, + "learning_rate": 6.768742964573866e-07, + "loss": 0.5161, + "step": 334760 + }, + { + "epoch": 2.959475945472869, + "grad_norm": 1.517185091972351, + "learning_rate": 6.75400908785516e-07, + "loss": 0.5169, + "step": 334770 + }, + { + "epoch": 2.9595643487331813, + "grad_norm": 7.488037109375, + "learning_rate": 6.739275211136454e-07, + "loss": 0.5018, + "step": 334780 + }, + { + "epoch": 2.9596527519934934, + "grad_norm": 4.598194599151611, + "learning_rate": 6.724541334417747e-07, + "loss": 0.4443, + "step": 334790 + }, + { + "epoch": 2.9597411552538055, + "grad_norm": 6.238781929016113, + "learning_rate": 6.709807457699041e-07, + "loss": 0.5963, + "step": 334800 + }, + { + "epoch": 2.959829558514118, + "grad_norm": 3.654684066772461, + "learning_rate": 6.695073580980334e-07, + "loss": 0.6164, + "step": 334810 + }, + { + "epoch": 2.9599179617744302, + "grad_norm": 11.87872314453125, + "learning_rate": 6.680339704261627e-07, + "loss": 0.5461, + "step": 334820 + }, + { + "epoch": 2.9600063650347423, + "grad_norm": 5.230428695678711, + "learning_rate": 6.66560582754292e-07, + "loss": 0.6209, + "step": 334830 + }, + { + "epoch": 2.960094768295055, + "grad_norm": 5.164114475250244, + "learning_rate": 6.650871950824213e-07, + "loss": 0.4802, + "step": 334840 + }, + { + "epoch": 2.960183171555367, + "grad_norm": 5.155570983886719, + "learning_rate": 6.636138074105506e-07, + "loss": 0.3892, + "step": 334850 + }, + { + "epoch": 2.960271574815679, + "grad_norm": 3.9848501682281494, + "learning_rate": 6.6214041973868e-07, + "loss": 0.4386, + "step": 334860 + }, + { + "epoch": 2.9603599780759913, + "grad_norm": 3.590057373046875, + "learning_rate": 6.606670320668093e-07, + "loss": 0.5226, + "step": 334870 + }, + { + "epoch": 2.960448381336304, + "grad_norm": 5.222622871398926, + "learning_rate": 6.591936443949386e-07, + "loss": 0.4988, + "step": 334880 + }, + { + "epoch": 2.960536784596616, + "grad_norm": 5.117331027984619, + "learning_rate": 6.57720256723068e-07, + "loss": 0.4653, + "step": 334890 + }, + { + "epoch": 2.960625187856928, + "grad_norm": 3.8801088333129883, + "learning_rate": 6.562468690511973e-07, + "loss": 0.4035, + "step": 334900 + }, + { + "epoch": 2.9607135911172406, + "grad_norm": 2.6848151683807373, + "learning_rate": 6.547734813793266e-07, + "loss": 0.5261, + "step": 334910 + }, + { + "epoch": 2.9608019943775528, + "grad_norm": 3.9233274459838867, + "learning_rate": 6.53300093707456e-07, + "loss": 0.5381, + "step": 334920 + }, + { + "epoch": 2.960890397637865, + "grad_norm": 2.06052827835083, + "learning_rate": 6.518267060355853e-07, + "loss": 0.4714, + "step": 334930 + }, + { + "epoch": 2.960978800898177, + "grad_norm": 1.5588624477386475, + "learning_rate": 6.503533183637146e-07, + "loss": 0.4673, + "step": 334940 + }, + { + "epoch": 2.961067204158489, + "grad_norm": 3.2827677726745605, + "learning_rate": 6.48879930691844e-07, + "loss": 0.4156, + "step": 334950 + }, + { + "epoch": 2.9611556074188017, + "grad_norm": 4.624691486358643, + "learning_rate": 6.474065430199733e-07, + "loss": 0.6276, + "step": 334960 + }, + { + "epoch": 2.961244010679114, + "grad_norm": 4.168994903564453, + "learning_rate": 6.459331553481026e-07, + "loss": 0.5796, + "step": 334970 + }, + { + "epoch": 2.961332413939426, + "grad_norm": 5.170538425445557, + "learning_rate": 6.444597676762319e-07, + "loss": 0.4909, + "step": 334980 + }, + { + "epoch": 2.9614208171997385, + "grad_norm": 5.093525409698486, + "learning_rate": 6.429863800043612e-07, + "loss": 0.5549, + "step": 334990 + }, + { + "epoch": 2.9615092204600506, + "grad_norm": 1.7706618309020996, + "learning_rate": 6.415129923324906e-07, + "loss": 0.5809, + "step": 335000 + }, + { + "epoch": 2.9615976237203627, + "grad_norm": 7.736911296844482, + "learning_rate": 6.400396046606199e-07, + "loss": 0.5335, + "step": 335010 + }, + { + "epoch": 2.961686026980675, + "grad_norm": 5.823414325714111, + "learning_rate": 6.385662169887493e-07, + "loss": 0.522, + "step": 335020 + }, + { + "epoch": 2.9617744302409874, + "grad_norm": 1.4817625284194946, + "learning_rate": 6.370928293168786e-07, + "loss": 0.5065, + "step": 335030 + }, + { + "epoch": 2.9618628335012995, + "grad_norm": 13.932080268859863, + "learning_rate": 6.356194416450079e-07, + "loss": 0.5401, + "step": 335040 + }, + { + "epoch": 2.9619512367616116, + "grad_norm": 11.767707824707031, + "learning_rate": 6.341460539731372e-07, + "loss": 0.577, + "step": 335050 + }, + { + "epoch": 2.962039640021924, + "grad_norm": 2.97473406791687, + "learning_rate": 6.326726663012666e-07, + "loss": 0.4947, + "step": 335060 + }, + { + "epoch": 2.9621280432822363, + "grad_norm": 2.447044610977173, + "learning_rate": 6.311992786293959e-07, + "loss": 0.4573, + "step": 335070 + }, + { + "epoch": 2.9622164465425485, + "grad_norm": 3.9161767959594727, + "learning_rate": 6.297258909575252e-07, + "loss": 0.5265, + "step": 335080 + }, + { + "epoch": 2.9623048498028606, + "grad_norm": 2.6903018951416016, + "learning_rate": 6.282525032856546e-07, + "loss": 0.492, + "step": 335090 + }, + { + "epoch": 2.9623932530631727, + "grad_norm": 3.307736873626709, + "learning_rate": 6.267791156137839e-07, + "loss": 0.4839, + "step": 335100 + }, + { + "epoch": 2.9624816563234853, + "grad_norm": 8.132790565490723, + "learning_rate": 6.253057279419131e-07, + "loss": 0.6162, + "step": 335110 + }, + { + "epoch": 2.9625700595837974, + "grad_norm": 5.393335819244385, + "learning_rate": 6.238323402700425e-07, + "loss": 0.5316, + "step": 335120 + }, + { + "epoch": 2.96265846284411, + "grad_norm": 4.197139739990234, + "learning_rate": 6.223589525981718e-07, + "loss": 0.4422, + "step": 335130 + }, + { + "epoch": 2.962746866104422, + "grad_norm": 18.481855392456055, + "learning_rate": 6.208855649263011e-07, + "loss": 0.4671, + "step": 335140 + }, + { + "epoch": 2.962835269364734, + "grad_norm": 2.3492848873138428, + "learning_rate": 6.194121772544306e-07, + "loss": 0.5374, + "step": 335150 + }, + { + "epoch": 2.9629236726250463, + "grad_norm": 3.996891498565674, + "learning_rate": 6.179387895825599e-07, + "loss": 0.5845, + "step": 335160 + }, + { + "epoch": 2.9630120758853584, + "grad_norm": 5.440873622894287, + "learning_rate": 6.164654019106892e-07, + "loss": 0.4378, + "step": 335170 + }, + { + "epoch": 2.963100479145671, + "grad_norm": 3.4104530811309814, + "learning_rate": 6.149920142388185e-07, + "loss": 0.6005, + "step": 335180 + }, + { + "epoch": 2.963188882405983, + "grad_norm": 1.7294161319732666, + "learning_rate": 6.135186265669478e-07, + "loss": 0.4784, + "step": 335190 + }, + { + "epoch": 2.9632772856662952, + "grad_norm": 2.2345659732818604, + "learning_rate": 6.120452388950771e-07, + "loss": 0.5373, + "step": 335200 + }, + { + "epoch": 2.963365688926608, + "grad_norm": 5.738230228424072, + "learning_rate": 6.105718512232065e-07, + "loss": 0.5294, + "step": 335210 + }, + { + "epoch": 2.96345409218692, + "grad_norm": 2.948974609375, + "learning_rate": 6.090984635513358e-07, + "loss": 0.4955, + "step": 335220 + }, + { + "epoch": 2.963542495447232, + "grad_norm": 7.1646599769592285, + "learning_rate": 6.076250758794651e-07, + "loss": 0.5583, + "step": 335230 + }, + { + "epoch": 2.963630898707544, + "grad_norm": 1.8126200437545776, + "learning_rate": 6.061516882075945e-07, + "loss": 0.5701, + "step": 335240 + }, + { + "epoch": 2.9637193019678567, + "grad_norm": 1.6794610023498535, + "learning_rate": 6.046783005357237e-07, + "loss": 0.4881, + "step": 335250 + }, + { + "epoch": 2.963807705228169, + "grad_norm": 5.175233364105225, + "learning_rate": 6.03204912863853e-07, + "loss": 0.5308, + "step": 335260 + }, + { + "epoch": 2.963896108488481, + "grad_norm": 2.829876184463501, + "learning_rate": 6.017315251919825e-07, + "loss": 0.6056, + "step": 335270 + }, + { + "epoch": 2.9639845117487935, + "grad_norm": 3.416961669921875, + "learning_rate": 6.002581375201118e-07, + "loss": 0.5564, + "step": 335280 + }, + { + "epoch": 2.9640729150091056, + "grad_norm": 3.59869384765625, + "learning_rate": 5.987847498482411e-07, + "loss": 0.457, + "step": 335290 + }, + { + "epoch": 2.9641613182694178, + "grad_norm": 8.901326179504395, + "learning_rate": 5.973113621763705e-07, + "loss": 0.6351, + "step": 335300 + }, + { + "epoch": 2.96424972152973, + "grad_norm": 4.641615867614746, + "learning_rate": 5.958379745044998e-07, + "loss": 0.5323, + "step": 335310 + }, + { + "epoch": 2.964338124790042, + "grad_norm": 1.0643514394760132, + "learning_rate": 5.943645868326291e-07, + "loss": 0.5469, + "step": 335320 + }, + { + "epoch": 2.9644265280503546, + "grad_norm": 1.893661379814148, + "learning_rate": 5.928911991607584e-07, + "loss": 0.3296, + "step": 335330 + }, + { + "epoch": 2.9645149313106667, + "grad_norm": 8.286412239074707, + "learning_rate": 5.914178114888877e-07, + "loss": 0.5747, + "step": 335340 + }, + { + "epoch": 2.9646033345709792, + "grad_norm": 9.29808521270752, + "learning_rate": 5.899444238170171e-07, + "loss": 0.5977, + "step": 335350 + }, + { + "epoch": 2.9646917378312914, + "grad_norm": 5.291074752807617, + "learning_rate": 5.884710361451464e-07, + "loss": 0.3331, + "step": 335360 + }, + { + "epoch": 2.9647801410916035, + "grad_norm": 2.8429813385009766, + "learning_rate": 5.869976484732757e-07, + "loss": 0.4993, + "step": 335370 + }, + { + "epoch": 2.9648685443519156, + "grad_norm": 9.331207275390625, + "learning_rate": 5.85524260801405e-07, + "loss": 0.5226, + "step": 335380 + }, + { + "epoch": 2.9649569476122277, + "grad_norm": 0.9047555923461914, + "learning_rate": 5.840508731295343e-07, + "loss": 0.5135, + "step": 335390 + }, + { + "epoch": 2.9650453508725403, + "grad_norm": 1.4557123184204102, + "learning_rate": 5.825774854576637e-07, + "loss": 0.5043, + "step": 335400 + }, + { + "epoch": 2.9651337541328524, + "grad_norm": 2.8721766471862793, + "learning_rate": 5.811040977857931e-07, + "loss": 0.4355, + "step": 335410 + }, + { + "epoch": 2.9652221573931645, + "grad_norm": 5.155484676361084, + "learning_rate": 5.796307101139224e-07, + "loss": 0.4172, + "step": 335420 + }, + { + "epoch": 2.965310560653477, + "grad_norm": 7.9188313484191895, + "learning_rate": 5.781573224420517e-07, + "loss": 0.4661, + "step": 335430 + }, + { + "epoch": 2.965398963913789, + "grad_norm": 4.962244033813477, + "learning_rate": 5.766839347701811e-07, + "loss": 0.6675, + "step": 335440 + }, + { + "epoch": 2.9654873671741013, + "grad_norm": 9.028661727905273, + "learning_rate": 5.752105470983103e-07, + "loss": 0.6665, + "step": 335450 + }, + { + "epoch": 2.9655757704344134, + "grad_norm": 6.354589462280273, + "learning_rate": 5.737371594264396e-07, + "loss": 0.3794, + "step": 335460 + }, + { + "epoch": 2.965664173694726, + "grad_norm": 2.871797800064087, + "learning_rate": 5.72263771754569e-07, + "loss": 0.494, + "step": 335470 + }, + { + "epoch": 2.965752576955038, + "grad_norm": 1.4531891345977783, + "learning_rate": 5.707903840826983e-07, + "loss": 0.5813, + "step": 335480 + }, + { + "epoch": 2.9658409802153503, + "grad_norm": 4.408214092254639, + "learning_rate": 5.693169964108276e-07, + "loss": 0.4455, + "step": 335490 + }, + { + "epoch": 2.965929383475663, + "grad_norm": 1.5702086687088013, + "learning_rate": 5.67843608738957e-07, + "loss": 0.5113, + "step": 335500 + }, + { + "epoch": 2.966017786735975, + "grad_norm": 5.3205766677856445, + "learning_rate": 5.663702210670863e-07, + "loss": 0.6595, + "step": 335510 + }, + { + "epoch": 2.966106189996287, + "grad_norm": 2.616098165512085, + "learning_rate": 5.648968333952156e-07, + "loss": 0.6837, + "step": 335520 + }, + { + "epoch": 2.966194593256599, + "grad_norm": 11.2829008102417, + "learning_rate": 5.63423445723345e-07, + "loss": 0.3879, + "step": 335530 + }, + { + "epoch": 2.9662829965169113, + "grad_norm": 1.954250454902649, + "learning_rate": 5.619500580514743e-07, + "loss": 0.5303, + "step": 335540 + }, + { + "epoch": 2.966371399777224, + "grad_norm": 19.011533737182617, + "learning_rate": 5.604766703796036e-07, + "loss": 0.5673, + "step": 335550 + }, + { + "epoch": 2.966459803037536, + "grad_norm": 13.222569465637207, + "learning_rate": 5.59003282707733e-07, + "loss": 0.4658, + "step": 335560 + }, + { + "epoch": 2.966548206297848, + "grad_norm": 10.440057754516602, + "learning_rate": 5.575298950358623e-07, + "loss": 0.6095, + "step": 335570 + }, + { + "epoch": 2.9666366095581607, + "grad_norm": 3.0970606803894043, + "learning_rate": 5.560565073639916e-07, + "loss": 0.5947, + "step": 335580 + }, + { + "epoch": 2.966725012818473, + "grad_norm": 1.778258204460144, + "learning_rate": 5.545831196921209e-07, + "loss": 0.4867, + "step": 335590 + }, + { + "epoch": 2.966813416078785, + "grad_norm": 9.691999435424805, + "learning_rate": 5.531097320202502e-07, + "loss": 0.3988, + "step": 335600 + }, + { + "epoch": 2.966901819339097, + "grad_norm": 1.6112170219421387, + "learning_rate": 5.516363443483796e-07, + "loss": 0.6073, + "step": 335610 + }, + { + "epoch": 2.9669902225994096, + "grad_norm": 29.974735260009766, + "learning_rate": 5.501629566765089e-07, + "loss": 0.5553, + "step": 335620 + }, + { + "epoch": 2.9670786258597217, + "grad_norm": 2.928086757659912, + "learning_rate": 5.486895690046382e-07, + "loss": 0.4776, + "step": 335630 + }, + { + "epoch": 2.967167029120034, + "grad_norm": 3.725994825363159, + "learning_rate": 5.472161813327676e-07, + "loss": 0.4875, + "step": 335640 + }, + { + "epoch": 2.9672554323803464, + "grad_norm": 2.9303770065307617, + "learning_rate": 5.45742793660897e-07, + "loss": 0.5703, + "step": 335650 + }, + { + "epoch": 2.9673438356406585, + "grad_norm": 2.5434539318084717, + "learning_rate": 5.442694059890262e-07, + "loss": 0.473, + "step": 335660 + }, + { + "epoch": 2.9674322389009706, + "grad_norm": 2.7368602752685547, + "learning_rate": 5.427960183171556e-07, + "loss": 0.4814, + "step": 335670 + }, + { + "epoch": 2.9675206421612828, + "grad_norm": 11.854361534118652, + "learning_rate": 5.413226306452849e-07, + "loss": 0.5999, + "step": 335680 + }, + { + "epoch": 2.967609045421595, + "grad_norm": 1.2229604721069336, + "learning_rate": 5.398492429734142e-07, + "loss": 0.4666, + "step": 335690 + }, + { + "epoch": 2.9676974486819074, + "grad_norm": 3.490987539291382, + "learning_rate": 5.383758553015436e-07, + "loss": 0.4904, + "step": 335700 + }, + { + "epoch": 2.9677858519422196, + "grad_norm": 3.0502991676330566, + "learning_rate": 5.369024676296729e-07, + "loss": 0.5675, + "step": 335710 + }, + { + "epoch": 2.967874255202532, + "grad_norm": 1.7462477684020996, + "learning_rate": 5.354290799578022e-07, + "loss": 0.5622, + "step": 335720 + }, + { + "epoch": 2.9679626584628442, + "grad_norm": 1.324082851409912, + "learning_rate": 5.339556922859315e-07, + "loss": 0.39, + "step": 335730 + }, + { + "epoch": 2.9680510617231564, + "grad_norm": 3.5736658573150635, + "learning_rate": 5.324823046140608e-07, + "loss": 0.4669, + "step": 335740 + }, + { + "epoch": 2.9681394649834685, + "grad_norm": 5.691527366638184, + "learning_rate": 5.310089169421901e-07, + "loss": 0.5751, + "step": 335750 + }, + { + "epoch": 2.9682278682437806, + "grad_norm": 1.2006659507751465, + "learning_rate": 5.295355292703196e-07, + "loss": 0.433, + "step": 335760 + }, + { + "epoch": 2.968316271504093, + "grad_norm": 2.624927043914795, + "learning_rate": 5.280621415984489e-07, + "loss": 0.4902, + "step": 335770 + }, + { + "epoch": 2.9684046747644053, + "grad_norm": 5.20295524597168, + "learning_rate": 5.265887539265782e-07, + "loss": 0.4138, + "step": 335780 + }, + { + "epoch": 2.9684930780247174, + "grad_norm": 1.281351923942566, + "learning_rate": 5.251153662547076e-07, + "loss": 0.4795, + "step": 335790 + }, + { + "epoch": 2.96858148128503, + "grad_norm": 3.205597400665283, + "learning_rate": 5.236419785828368e-07, + "loss": 0.4741, + "step": 335800 + }, + { + "epoch": 2.968669884545342, + "grad_norm": 3.043614625930786, + "learning_rate": 5.221685909109661e-07, + "loss": 0.475, + "step": 335810 + }, + { + "epoch": 2.968758287805654, + "grad_norm": 11.478108406066895, + "learning_rate": 5.206952032390955e-07, + "loss": 0.5216, + "step": 335820 + }, + { + "epoch": 2.9688466910659663, + "grad_norm": 1.2169263362884521, + "learning_rate": 5.192218155672248e-07, + "loss": 0.4476, + "step": 335830 + }, + { + "epoch": 2.968935094326279, + "grad_norm": 1.6191961765289307, + "learning_rate": 5.177484278953541e-07, + "loss": 0.408, + "step": 335840 + }, + { + "epoch": 2.969023497586591, + "grad_norm": 8.552638053894043, + "learning_rate": 5.162750402234835e-07, + "loss": 0.5953, + "step": 335850 + }, + { + "epoch": 2.969111900846903, + "grad_norm": 3.517845869064331, + "learning_rate": 5.148016525516128e-07, + "loss": 0.5521, + "step": 335860 + }, + { + "epoch": 2.9692003041072157, + "grad_norm": 2.938711166381836, + "learning_rate": 5.13328264879742e-07, + "loss": 0.4403, + "step": 335870 + }, + { + "epoch": 2.969288707367528, + "grad_norm": 1.6895567178726196, + "learning_rate": 5.118548772078714e-07, + "loss": 0.5288, + "step": 335880 + }, + { + "epoch": 2.96937711062784, + "grad_norm": 0.7480763792991638, + "learning_rate": 5.103814895360008e-07, + "loss": 0.4643, + "step": 335890 + }, + { + "epoch": 2.969465513888152, + "grad_norm": 6.076292991638184, + "learning_rate": 5.089081018641301e-07, + "loss": 0.4809, + "step": 335900 + }, + { + "epoch": 2.969553917148464, + "grad_norm": 5.674091815948486, + "learning_rate": 5.074347141922595e-07, + "loss": 0.4904, + "step": 335910 + }, + { + "epoch": 2.9696423204087767, + "grad_norm": 2.8610410690307617, + "learning_rate": 5.059613265203888e-07, + "loss": 0.5114, + "step": 335920 + }, + { + "epoch": 2.969730723669089, + "grad_norm": 0.9119172096252441, + "learning_rate": 5.044879388485181e-07, + "loss": 0.2757, + "step": 335930 + }, + { + "epoch": 2.9698191269294014, + "grad_norm": 8.15263557434082, + "learning_rate": 5.030145511766474e-07, + "loss": 0.6356, + "step": 335940 + }, + { + "epoch": 2.9699075301897135, + "grad_norm": 8.325189590454102, + "learning_rate": 5.015411635047767e-07, + "loss": 0.4723, + "step": 335950 + }, + { + "epoch": 2.9699959334500257, + "grad_norm": 2.3215248584747314, + "learning_rate": 5.000677758329061e-07, + "loss": 0.4945, + "step": 335960 + }, + { + "epoch": 2.970084336710338, + "grad_norm": 13.845855712890625, + "learning_rate": 4.985943881610354e-07, + "loss": 0.539, + "step": 335970 + }, + { + "epoch": 2.97017273997065, + "grad_norm": 6.5736165046691895, + "learning_rate": 4.971210004891647e-07, + "loss": 0.5157, + "step": 335980 + }, + { + "epoch": 2.9702611432309625, + "grad_norm": 1.2494341135025024, + "learning_rate": 4.95647612817294e-07, + "loss": 0.6362, + "step": 335990 + }, + { + "epoch": 2.9703495464912746, + "grad_norm": 2.234370231628418, + "learning_rate": 4.941742251454234e-07, + "loss": 0.4383, + "step": 336000 + }, + { + "epoch": 2.9704379497515867, + "grad_norm": 4.606790542602539, + "learning_rate": 4.927008374735526e-07, + "loss": 0.5531, + "step": 336010 + }, + { + "epoch": 2.9705263530118993, + "grad_norm": 20.438636779785156, + "learning_rate": 4.912274498016821e-07, + "loss": 0.5454, + "step": 336020 + }, + { + "epoch": 2.9706147562722114, + "grad_norm": 1.1969146728515625, + "learning_rate": 4.897540621298114e-07, + "loss": 0.4805, + "step": 336030 + }, + { + "epoch": 2.9707031595325235, + "grad_norm": 6.894013404846191, + "learning_rate": 4.882806744579407e-07, + "loss": 0.421, + "step": 336040 + }, + { + "epoch": 2.9707915627928356, + "grad_norm": 0.9334472417831421, + "learning_rate": 4.868072867860701e-07, + "loss": 0.4844, + "step": 336050 + }, + { + "epoch": 2.970879966053148, + "grad_norm": 10.937143325805664, + "learning_rate": 4.853338991141994e-07, + "loss": 0.438, + "step": 336060 + }, + { + "epoch": 2.9709683693134603, + "grad_norm": 1.216178297996521, + "learning_rate": 4.838605114423287e-07, + "loss": 0.4316, + "step": 336070 + }, + { + "epoch": 2.9710567725737724, + "grad_norm": 1.977311372756958, + "learning_rate": 4.82387123770458e-07, + "loss": 0.6254, + "step": 336080 + }, + { + "epoch": 2.971145175834085, + "grad_norm": 4.164798259735107, + "learning_rate": 4.809137360985873e-07, + "loss": 0.5817, + "step": 336090 + }, + { + "epoch": 2.971233579094397, + "grad_norm": 6.046334266662598, + "learning_rate": 4.794403484267166e-07, + "loss": 0.5627, + "step": 336100 + }, + { + "epoch": 2.9713219823547092, + "grad_norm": 2.1921021938323975, + "learning_rate": 4.77966960754846e-07, + "loss": 0.7521, + "step": 336110 + }, + { + "epoch": 2.9714103856150214, + "grad_norm": 16.59264373779297, + "learning_rate": 4.7649357308297534e-07, + "loss": 0.5125, + "step": 336120 + }, + { + "epoch": 2.9714987888753335, + "grad_norm": 3.8310930728912354, + "learning_rate": 4.750201854111047e-07, + "loss": 0.4879, + "step": 336130 + }, + { + "epoch": 2.971587192135646, + "grad_norm": 5.2446818351745605, + "learning_rate": 4.73546797739234e-07, + "loss": 0.6651, + "step": 336140 + }, + { + "epoch": 2.971675595395958, + "grad_norm": 2.742725372314453, + "learning_rate": 4.7207341006736326e-07, + "loss": 0.5044, + "step": 336150 + }, + { + "epoch": 2.9717639986562703, + "grad_norm": 5.605286121368408, + "learning_rate": 4.706000223954926e-07, + "loss": 0.5923, + "step": 336160 + }, + { + "epoch": 2.971852401916583, + "grad_norm": 2.3964757919311523, + "learning_rate": 4.69126634723622e-07, + "loss": 0.4648, + "step": 336170 + }, + { + "epoch": 2.971940805176895, + "grad_norm": 0.7872205972671509, + "learning_rate": 4.676532470517513e-07, + "loss": 0.4863, + "step": 336180 + }, + { + "epoch": 2.972029208437207, + "grad_norm": 3.262118339538574, + "learning_rate": 4.6617985937988065e-07, + "loss": 0.4911, + "step": 336190 + }, + { + "epoch": 2.972117611697519, + "grad_norm": 2.855809450149536, + "learning_rate": 4.6470647170800995e-07, + "loss": 0.3615, + "step": 336200 + }, + { + "epoch": 2.9722060149578318, + "grad_norm": 2.570660352706909, + "learning_rate": 4.632330840361393e-07, + "loss": 0.5357, + "step": 336210 + }, + { + "epoch": 2.972294418218144, + "grad_norm": 2.814424753189087, + "learning_rate": 4.6175969636426857e-07, + "loss": 0.5278, + "step": 336220 + }, + { + "epoch": 2.972382821478456, + "grad_norm": 3.3074522018432617, + "learning_rate": 4.602863086923979e-07, + "loss": 0.4977, + "step": 336230 + }, + { + "epoch": 2.9724712247387686, + "grad_norm": 12.23355770111084, + "learning_rate": 4.5881292102052723e-07, + "loss": 0.5104, + "step": 336240 + }, + { + "epoch": 2.9725596279990807, + "grad_norm": 1.8017550706863403, + "learning_rate": 4.573395333486566e-07, + "loss": 0.5206, + "step": 336250 + }, + { + "epoch": 2.972648031259393, + "grad_norm": 6.115889072418213, + "learning_rate": 4.5586614567678595e-07, + "loss": 0.6444, + "step": 336260 + }, + { + "epoch": 2.972736434519705, + "grad_norm": 2.2516112327575684, + "learning_rate": 4.5439275800491526e-07, + "loss": 0.513, + "step": 336270 + }, + { + "epoch": 2.9728248377800175, + "grad_norm": 1.9327976703643799, + "learning_rate": 4.529193703330446e-07, + "loss": 0.4431, + "step": 336280 + }, + { + "epoch": 2.9729132410403296, + "grad_norm": 1.2774971723556519, + "learning_rate": 4.5144598266117387e-07, + "loss": 0.4326, + "step": 336290 + }, + { + "epoch": 2.9730016443006417, + "grad_norm": 3.079462766647339, + "learning_rate": 4.499725949893032e-07, + "loss": 0.4892, + "step": 336300 + }, + { + "epoch": 2.9730900475609543, + "grad_norm": 5.195932388305664, + "learning_rate": 4.4849920731743254e-07, + "loss": 0.6125, + "step": 336310 + }, + { + "epoch": 2.9731784508212664, + "grad_norm": 0.9003970623016357, + "learning_rate": 4.470258196455619e-07, + "loss": 0.42, + "step": 336320 + }, + { + "epoch": 2.9732668540815785, + "grad_norm": 5.624589920043945, + "learning_rate": 4.455524319736912e-07, + "loss": 0.4749, + "step": 336330 + }, + { + "epoch": 2.9733552573418907, + "grad_norm": 10.769453048706055, + "learning_rate": 4.4407904430182057e-07, + "loss": 0.6056, + "step": 336340 + }, + { + "epoch": 2.9734436606022028, + "grad_norm": 7.47599983215332, + "learning_rate": 4.426056566299499e-07, + "loss": 0.4533, + "step": 336350 + }, + { + "epoch": 2.9735320638625153, + "grad_norm": 2.32614803314209, + "learning_rate": 4.411322689580792e-07, + "loss": 0.5861, + "step": 336360 + }, + { + "epoch": 2.9736204671228275, + "grad_norm": 3.096092700958252, + "learning_rate": 4.396588812862085e-07, + "loss": 0.3838, + "step": 336370 + }, + { + "epoch": 2.9737088703831396, + "grad_norm": 2.4714255332946777, + "learning_rate": 4.3818549361433785e-07, + "loss": 0.508, + "step": 336380 + }, + { + "epoch": 2.973797273643452, + "grad_norm": 8.598041534423828, + "learning_rate": 4.367121059424672e-07, + "loss": 0.5619, + "step": 336390 + }, + { + "epoch": 2.9738856769037643, + "grad_norm": 1.917928695678711, + "learning_rate": 4.352387182705965e-07, + "loss": 0.4228, + "step": 336400 + }, + { + "epoch": 2.9739740801640764, + "grad_norm": 5.31523323059082, + "learning_rate": 4.3376533059872587e-07, + "loss": 0.5484, + "step": 336410 + }, + { + "epoch": 2.9740624834243885, + "grad_norm": 1.1817293167114258, + "learning_rate": 4.322919429268552e-07, + "loss": 0.3903, + "step": 336420 + }, + { + "epoch": 2.974150886684701, + "grad_norm": 0.989337146282196, + "learning_rate": 4.3081855525498443e-07, + "loss": 0.5513, + "step": 336430 + }, + { + "epoch": 2.974239289945013, + "grad_norm": 1.669836163520813, + "learning_rate": 4.293451675831138e-07, + "loss": 0.4571, + "step": 336440 + }, + { + "epoch": 2.9743276932053253, + "grad_norm": 7.036063194274902, + "learning_rate": 4.2787177991124315e-07, + "loss": 0.5276, + "step": 336450 + }, + { + "epoch": 2.974416096465638, + "grad_norm": 2.040003538131714, + "learning_rate": 4.2639839223937246e-07, + "loss": 0.5253, + "step": 336460 + }, + { + "epoch": 2.97450449972595, + "grad_norm": 1.2515255212783813, + "learning_rate": 4.249250045675018e-07, + "loss": 0.4505, + "step": 336470 + }, + { + "epoch": 2.974592902986262, + "grad_norm": 4.392385005950928, + "learning_rate": 4.234516168956312e-07, + "loss": 0.5726, + "step": 336480 + }, + { + "epoch": 2.9746813062465742, + "grad_norm": 4.9741668701171875, + "learning_rate": 4.219782292237605e-07, + "loss": 0.5355, + "step": 336490 + }, + { + "epoch": 2.9747697095068864, + "grad_norm": 2.070491075515747, + "learning_rate": 4.2050484155188974e-07, + "loss": 0.5166, + "step": 336500 + }, + { + "epoch": 2.974858112767199, + "grad_norm": 3.9806413650512695, + "learning_rate": 4.190314538800191e-07, + "loss": 0.4318, + "step": 336510 + }, + { + "epoch": 2.974946516027511, + "grad_norm": 1.9831172227859497, + "learning_rate": 4.1755806620814846e-07, + "loss": 0.4593, + "step": 336520 + }, + { + "epoch": 2.9750349192878236, + "grad_norm": 2.565004348754883, + "learning_rate": 4.1608467853627777e-07, + "loss": 0.4622, + "step": 336530 + }, + { + "epoch": 2.9751233225481357, + "grad_norm": 3.98645281791687, + "learning_rate": 4.1461129086440713e-07, + "loss": 0.4923, + "step": 336540 + }, + { + "epoch": 2.975211725808448, + "grad_norm": 1.7545653581619263, + "learning_rate": 4.1313790319253643e-07, + "loss": 0.585, + "step": 336550 + }, + { + "epoch": 2.97530012906876, + "grad_norm": 1.7475825548171997, + "learning_rate": 4.116645155206657e-07, + "loss": 0.454, + "step": 336560 + }, + { + "epoch": 2.975388532329072, + "grad_norm": 9.297995567321777, + "learning_rate": 4.1019112784879505e-07, + "loss": 0.4566, + "step": 336570 + }, + { + "epoch": 2.9754769355893846, + "grad_norm": 2.4986674785614014, + "learning_rate": 4.087177401769244e-07, + "loss": 0.5311, + "step": 336580 + }, + { + "epoch": 2.9755653388496968, + "grad_norm": 5.238139629364014, + "learning_rate": 4.072443525050537e-07, + "loss": 0.5991, + "step": 336590 + }, + { + "epoch": 2.975653742110009, + "grad_norm": 1.5221011638641357, + "learning_rate": 4.057709648331831e-07, + "loss": 0.5854, + "step": 336600 + }, + { + "epoch": 2.9757421453703214, + "grad_norm": 3.9832003116607666, + "learning_rate": 4.0429757716131243e-07, + "loss": 0.5183, + "step": 336610 + }, + { + "epoch": 2.9758305486306336, + "grad_norm": 3.208183765411377, + "learning_rate": 4.0282418948944174e-07, + "loss": 0.4222, + "step": 336620 + }, + { + "epoch": 2.9759189518909457, + "grad_norm": 7.227762222290039, + "learning_rate": 4.01350801817571e-07, + "loss": 0.4856, + "step": 336630 + }, + { + "epoch": 2.976007355151258, + "grad_norm": 6.076148986816406, + "learning_rate": 3.9987741414570035e-07, + "loss": 0.5762, + "step": 336640 + }, + { + "epoch": 2.9760957584115704, + "grad_norm": 2.4983530044555664, + "learning_rate": 3.984040264738297e-07, + "loss": 0.4294, + "step": 336650 + }, + { + "epoch": 2.9761841616718825, + "grad_norm": 3.247638463973999, + "learning_rate": 3.96930638801959e-07, + "loss": 0.517, + "step": 336660 + }, + { + "epoch": 2.9762725649321946, + "grad_norm": 1.7140220403671265, + "learning_rate": 3.954572511300884e-07, + "loss": 0.4867, + "step": 336670 + }, + { + "epoch": 2.976360968192507, + "grad_norm": 8.20927619934082, + "learning_rate": 3.939838634582177e-07, + "loss": 0.5791, + "step": 336680 + }, + { + "epoch": 2.9764493714528193, + "grad_norm": 1.4552934169769287, + "learning_rate": 3.9251047578634705e-07, + "loss": 0.5486, + "step": 336690 + }, + { + "epoch": 2.9765377747131314, + "grad_norm": 6.705673694610596, + "learning_rate": 3.910370881144763e-07, + "loss": 0.4746, + "step": 336700 + }, + { + "epoch": 2.9766261779734435, + "grad_norm": 2.5532350540161133, + "learning_rate": 3.895637004426057e-07, + "loss": 0.4046, + "step": 336710 + }, + { + "epoch": 2.9767145812337557, + "grad_norm": 2.025535821914673, + "learning_rate": 3.8809031277073497e-07, + "loss": 0.4522, + "step": 336720 + }, + { + "epoch": 2.976802984494068, + "grad_norm": 6.961740016937256, + "learning_rate": 3.8661692509886433e-07, + "loss": 0.5013, + "step": 336730 + }, + { + "epoch": 2.9768913877543803, + "grad_norm": 3.5195705890655518, + "learning_rate": 3.851435374269937e-07, + "loss": 0.6611, + "step": 336740 + }, + { + "epoch": 2.9769797910146925, + "grad_norm": 5.241905689239502, + "learning_rate": 3.83670149755123e-07, + "loss": 0.4486, + "step": 336750 + }, + { + "epoch": 2.977068194275005, + "grad_norm": 1.8634693622589111, + "learning_rate": 3.821967620832523e-07, + "loss": 0.407, + "step": 336760 + }, + { + "epoch": 2.977156597535317, + "grad_norm": 2.352658271789551, + "learning_rate": 3.8072337441138166e-07, + "loss": 0.4624, + "step": 336770 + }, + { + "epoch": 2.9772450007956293, + "grad_norm": 1.8698391914367676, + "learning_rate": 3.7924998673951097e-07, + "loss": 0.5632, + "step": 336780 + }, + { + "epoch": 2.9773334040559414, + "grad_norm": 4.504188060760498, + "learning_rate": 3.777765990676403e-07, + "loss": 0.5041, + "step": 336790 + }, + { + "epoch": 2.977421807316254, + "grad_norm": 4.538601398468018, + "learning_rate": 3.7630321139576964e-07, + "loss": 0.5176, + "step": 336800 + }, + { + "epoch": 2.977510210576566, + "grad_norm": 2.7872204780578613, + "learning_rate": 3.7482982372389894e-07, + "loss": 0.5017, + "step": 336810 + }, + { + "epoch": 2.977598613836878, + "grad_norm": 2.10764741897583, + "learning_rate": 3.7335643605202825e-07, + "loss": 0.5372, + "step": 336820 + }, + { + "epoch": 2.9776870170971907, + "grad_norm": 2.479703903198242, + "learning_rate": 3.718830483801576e-07, + "loss": 0.493, + "step": 336830 + }, + { + "epoch": 2.977775420357503, + "grad_norm": 7.918840408325195, + "learning_rate": 3.7040966070828697e-07, + "loss": 0.4247, + "step": 336840 + }, + { + "epoch": 2.977863823617815, + "grad_norm": 1.9515713453292847, + "learning_rate": 3.689362730364163e-07, + "loss": 0.4915, + "step": 336850 + }, + { + "epoch": 2.977952226878127, + "grad_norm": 2.4767987728118896, + "learning_rate": 3.674628853645456e-07, + "loss": 0.409, + "step": 336860 + }, + { + "epoch": 2.9780406301384397, + "grad_norm": 2.884553909301758, + "learning_rate": 3.6598949769267494e-07, + "loss": 0.6124, + "step": 336870 + }, + { + "epoch": 2.978129033398752, + "grad_norm": 3.0353195667266846, + "learning_rate": 3.6451611002080425e-07, + "loss": 0.632, + "step": 336880 + }, + { + "epoch": 2.978217436659064, + "grad_norm": 0.6852558255195618, + "learning_rate": 3.6304272234893356e-07, + "loss": 0.4691, + "step": 336890 + }, + { + "epoch": 2.9783058399193765, + "grad_norm": 5.22422981262207, + "learning_rate": 3.615693346770629e-07, + "loss": 0.48, + "step": 336900 + }, + { + "epoch": 2.9783942431796886, + "grad_norm": 4.467206001281738, + "learning_rate": 3.600959470051922e-07, + "loss": 0.6339, + "step": 336910 + }, + { + "epoch": 2.9784826464400007, + "grad_norm": 2.0897560119628906, + "learning_rate": 3.586225593333216e-07, + "loss": 0.5666, + "step": 336920 + }, + { + "epoch": 2.978571049700313, + "grad_norm": 2.8588829040527344, + "learning_rate": 3.571491716614509e-07, + "loss": 0.5976, + "step": 336930 + }, + { + "epoch": 2.978659452960625, + "grad_norm": 4.500250816345215, + "learning_rate": 3.556757839895802e-07, + "loss": 0.4943, + "step": 336940 + }, + { + "epoch": 2.9787478562209375, + "grad_norm": 6.898009777069092, + "learning_rate": 3.5420239631770956e-07, + "loss": 0.5745, + "step": 336950 + }, + { + "epoch": 2.9788362594812496, + "grad_norm": 5.810487747192383, + "learning_rate": 3.5272900864583886e-07, + "loss": 0.6302, + "step": 336960 + }, + { + "epoch": 2.9789246627415618, + "grad_norm": 2.8616554737091064, + "learning_rate": 3.512556209739682e-07, + "loss": 0.5524, + "step": 336970 + }, + { + "epoch": 2.9790130660018743, + "grad_norm": 9.419011116027832, + "learning_rate": 3.4978223330209753e-07, + "loss": 0.5222, + "step": 336980 + }, + { + "epoch": 2.9791014692621864, + "grad_norm": 2.838074207305908, + "learning_rate": 3.483088456302269e-07, + "loss": 0.5303, + "step": 336990 + }, + { + "epoch": 2.9791898725224986, + "grad_norm": 1.5987352132797241, + "learning_rate": 3.468354579583562e-07, + "loss": 0.5895, + "step": 337000 + }, + { + "epoch": 2.9792782757828107, + "grad_norm": 3.1084442138671875, + "learning_rate": 3.453620702864855e-07, + "loss": 0.4587, + "step": 337010 + }, + { + "epoch": 2.9793666790431232, + "grad_norm": 3.5848143100738525, + "learning_rate": 3.4388868261461486e-07, + "loss": 0.4822, + "step": 337020 + }, + { + "epoch": 2.9794550823034354, + "grad_norm": 6.846616268157959, + "learning_rate": 3.4241529494274417e-07, + "loss": 0.4562, + "step": 337030 + }, + { + "epoch": 2.9795434855637475, + "grad_norm": 3.0091660022735596, + "learning_rate": 3.409419072708735e-07, + "loss": 0.5017, + "step": 337040 + }, + { + "epoch": 2.97963188882406, + "grad_norm": 7.982241153717041, + "learning_rate": 3.3946851959900284e-07, + "loss": 0.5784, + "step": 337050 + }, + { + "epoch": 2.979720292084372, + "grad_norm": 2.4995667934417725, + "learning_rate": 3.379951319271322e-07, + "loss": 0.4056, + "step": 337060 + }, + { + "epoch": 2.9798086953446843, + "grad_norm": 7.977654457092285, + "learning_rate": 3.3652174425526145e-07, + "loss": 0.5686, + "step": 337070 + }, + { + "epoch": 2.9798970986049964, + "grad_norm": 6.548238277435303, + "learning_rate": 3.350483565833908e-07, + "loss": 0.428, + "step": 337080 + }, + { + "epoch": 2.9799855018653085, + "grad_norm": 1.7681078910827637, + "learning_rate": 3.3357496891152017e-07, + "loss": 0.4892, + "step": 337090 + }, + { + "epoch": 2.980073905125621, + "grad_norm": 4.575839996337891, + "learning_rate": 3.321015812396494e-07, + "loss": 0.4789, + "step": 337100 + }, + { + "epoch": 2.980162308385933, + "grad_norm": 10.729985237121582, + "learning_rate": 3.306281935677788e-07, + "loss": 0.5388, + "step": 337110 + }, + { + "epoch": 2.9802507116462458, + "grad_norm": 6.188938140869141, + "learning_rate": 3.2915480589590814e-07, + "loss": 0.4565, + "step": 337120 + }, + { + "epoch": 2.980339114906558, + "grad_norm": 1.4313054084777832, + "learning_rate": 3.2768141822403745e-07, + "loss": 0.5434, + "step": 337130 + }, + { + "epoch": 2.98042751816687, + "grad_norm": 5.512416839599609, + "learning_rate": 3.2620803055216676e-07, + "loss": 0.5022, + "step": 337140 + }, + { + "epoch": 2.980515921427182, + "grad_norm": 4.8639445304870605, + "learning_rate": 3.247346428802961e-07, + "loss": 0.5439, + "step": 337150 + }, + { + "epoch": 2.9806043246874943, + "grad_norm": 1.7227532863616943, + "learning_rate": 3.232612552084254e-07, + "loss": 0.4346, + "step": 337160 + }, + { + "epoch": 2.980692727947807, + "grad_norm": 2.7977075576782227, + "learning_rate": 3.2178786753655473e-07, + "loss": 0.5662, + "step": 337170 + }, + { + "epoch": 2.980781131208119, + "grad_norm": 9.822038650512695, + "learning_rate": 3.203144798646841e-07, + "loss": 0.5129, + "step": 337180 + }, + { + "epoch": 2.980869534468431, + "grad_norm": 1.9352823495864868, + "learning_rate": 3.1884109219281345e-07, + "loss": 0.5013, + "step": 337190 + }, + { + "epoch": 2.9809579377287436, + "grad_norm": 6.079701900482178, + "learning_rate": 3.1736770452094276e-07, + "loss": 0.4815, + "step": 337200 + }, + { + "epoch": 2.9810463409890557, + "grad_norm": 3.627035617828369, + "learning_rate": 3.1589431684907206e-07, + "loss": 0.4352, + "step": 337210 + }, + { + "epoch": 2.981134744249368, + "grad_norm": 14.946281433105469, + "learning_rate": 3.144209291772014e-07, + "loss": 0.456, + "step": 337220 + }, + { + "epoch": 2.98122314750968, + "grad_norm": 4.215475082397461, + "learning_rate": 3.1294754150533073e-07, + "loss": 0.4734, + "step": 337230 + }, + { + "epoch": 2.9813115507699925, + "grad_norm": 12.912647247314453, + "learning_rate": 3.1147415383346004e-07, + "loss": 0.5264, + "step": 337240 + }, + { + "epoch": 2.9813999540303047, + "grad_norm": 2.344305992126465, + "learning_rate": 3.100007661615894e-07, + "loss": 0.5515, + "step": 337250 + }, + { + "epoch": 2.981488357290617, + "grad_norm": 7.37380313873291, + "learning_rate": 3.085273784897187e-07, + "loss": 0.5138, + "step": 337260 + }, + { + "epoch": 2.9815767605509294, + "grad_norm": 2.8141167163848877, + "learning_rate": 3.0705399081784806e-07, + "loss": 0.5185, + "step": 337270 + }, + { + "epoch": 2.9816651638112415, + "grad_norm": 4.648847579956055, + "learning_rate": 3.0558060314597737e-07, + "loss": 0.5596, + "step": 337280 + }, + { + "epoch": 2.9817535670715536, + "grad_norm": 4.028388977050781, + "learning_rate": 3.041072154741067e-07, + "loss": 0.543, + "step": 337290 + }, + { + "epoch": 2.9818419703318657, + "grad_norm": 4.174006462097168, + "learning_rate": 3.0263382780223604e-07, + "loss": 0.5875, + "step": 337300 + }, + { + "epoch": 2.981930373592178, + "grad_norm": 3.836698055267334, + "learning_rate": 3.0116044013036534e-07, + "loss": 0.6504, + "step": 337310 + }, + { + "epoch": 2.9820187768524904, + "grad_norm": 7.135203838348389, + "learning_rate": 2.996870524584947e-07, + "loss": 0.6333, + "step": 337320 + }, + { + "epoch": 2.9821071801128025, + "grad_norm": 14.481913566589355, + "learning_rate": 2.98213664786624e-07, + "loss": 0.6719, + "step": 337330 + }, + { + "epoch": 2.982195583373115, + "grad_norm": 2.1394996643066406, + "learning_rate": 2.967402771147533e-07, + "loss": 0.5346, + "step": 337340 + }, + { + "epoch": 2.982283986633427, + "grad_norm": 1.8744107484817505, + "learning_rate": 2.952668894428827e-07, + "loss": 0.4562, + "step": 337350 + }, + { + "epoch": 2.9823723898937393, + "grad_norm": 4.5122971534729, + "learning_rate": 2.93793501771012e-07, + "loss": 0.3872, + "step": 337360 + }, + { + "epoch": 2.9824607931540514, + "grad_norm": 4.216298580169678, + "learning_rate": 2.9232011409914134e-07, + "loss": 0.5191, + "step": 337370 + }, + { + "epoch": 2.9825491964143636, + "grad_norm": 4.321167469024658, + "learning_rate": 2.9084672642727065e-07, + "loss": 0.6339, + "step": 337380 + }, + { + "epoch": 2.982637599674676, + "grad_norm": 1.7287518978118896, + "learning_rate": 2.8937333875539996e-07, + "loss": 0.4998, + "step": 337390 + }, + { + "epoch": 2.9827260029349882, + "grad_norm": 6.941714286804199, + "learning_rate": 2.878999510835293e-07, + "loss": 0.539, + "step": 337400 + }, + { + "epoch": 2.9828144061953004, + "grad_norm": 9.362208366394043, + "learning_rate": 2.864265634116586e-07, + "loss": 0.5381, + "step": 337410 + }, + { + "epoch": 2.982902809455613, + "grad_norm": 4.622639179229736, + "learning_rate": 2.8495317573978793e-07, + "loss": 0.5199, + "step": 337420 + }, + { + "epoch": 2.982991212715925, + "grad_norm": 10.084921836853027, + "learning_rate": 2.834797880679173e-07, + "loss": 0.534, + "step": 337430 + }, + { + "epoch": 2.983079615976237, + "grad_norm": 1.2790204286575317, + "learning_rate": 2.8200640039604665e-07, + "loss": 0.5051, + "step": 337440 + }, + { + "epoch": 2.9831680192365493, + "grad_norm": 1.3072657585144043, + "learning_rate": 2.8053301272417596e-07, + "loss": 0.3875, + "step": 337450 + }, + { + "epoch": 2.983256422496862, + "grad_norm": 6.622050762176514, + "learning_rate": 2.7905962505230527e-07, + "loss": 0.4362, + "step": 337460 + }, + { + "epoch": 2.983344825757174, + "grad_norm": 4.3263840675354, + "learning_rate": 2.775862373804346e-07, + "loss": 0.5543, + "step": 337470 + }, + { + "epoch": 2.983433229017486, + "grad_norm": 1.5657143592834473, + "learning_rate": 2.7611284970856393e-07, + "loss": 0.5739, + "step": 337480 + }, + { + "epoch": 2.9835216322777987, + "grad_norm": 1.9049638509750366, + "learning_rate": 2.7463946203669324e-07, + "loss": 0.4177, + "step": 337490 + }, + { + "epoch": 2.9836100355381108, + "grad_norm": 5.103707790374756, + "learning_rate": 2.731660743648226e-07, + "loss": 0.4337, + "step": 337500 + }, + { + "epoch": 2.983698438798423, + "grad_norm": 3.8285369873046875, + "learning_rate": 2.7169268669295196e-07, + "loss": 0.4115, + "step": 337510 + }, + { + "epoch": 2.983786842058735, + "grad_norm": 3.29080867767334, + "learning_rate": 2.702192990210812e-07, + "loss": 0.6117, + "step": 337520 + }, + { + "epoch": 2.983875245319047, + "grad_norm": 12.907873153686523, + "learning_rate": 2.6874591134921057e-07, + "loss": 0.5623, + "step": 337530 + }, + { + "epoch": 2.9839636485793597, + "grad_norm": 6.095062255859375, + "learning_rate": 2.6727252367733993e-07, + "loss": 0.4874, + "step": 337540 + }, + { + "epoch": 2.984052051839672, + "grad_norm": 2.5514473915100098, + "learning_rate": 2.657991360054692e-07, + "loss": 0.4384, + "step": 337550 + }, + { + "epoch": 2.984140455099984, + "grad_norm": 2.665731430053711, + "learning_rate": 2.6432574833359855e-07, + "loss": 0.518, + "step": 337560 + }, + { + "epoch": 2.9842288583602965, + "grad_norm": 2.9156081676483154, + "learning_rate": 2.628523606617279e-07, + "loss": 0.5166, + "step": 337570 + }, + { + "epoch": 2.9843172616206086, + "grad_norm": 13.090585708618164, + "learning_rate": 2.613789729898572e-07, + "loss": 0.5869, + "step": 337580 + }, + { + "epoch": 2.9844056648809207, + "grad_norm": 2.786674737930298, + "learning_rate": 2.599055853179865e-07, + "loss": 0.529, + "step": 337590 + }, + { + "epoch": 2.984494068141233, + "grad_norm": 32.34979248046875, + "learning_rate": 2.584321976461159e-07, + "loss": 0.4794, + "step": 337600 + }, + { + "epoch": 2.9845824714015454, + "grad_norm": 1.8788261413574219, + "learning_rate": 2.569588099742452e-07, + "loss": 0.5121, + "step": 337610 + }, + { + "epoch": 2.9846708746618575, + "grad_norm": 2.4850454330444336, + "learning_rate": 2.554854223023745e-07, + "loss": 0.4576, + "step": 337620 + }, + { + "epoch": 2.9847592779221697, + "grad_norm": 2.054478406906128, + "learning_rate": 2.5401203463050385e-07, + "loss": 0.5669, + "step": 337630 + }, + { + "epoch": 2.9848476811824822, + "grad_norm": 4.119994640350342, + "learning_rate": 2.525386469586332e-07, + "loss": 0.4548, + "step": 337640 + }, + { + "epoch": 2.9849360844427943, + "grad_norm": 3.5920863151550293, + "learning_rate": 2.510652592867625e-07, + "loss": 0.5202, + "step": 337650 + }, + { + "epoch": 2.9850244877031065, + "grad_norm": 3.5037713050842285, + "learning_rate": 2.4959187161489183e-07, + "loss": 0.5, + "step": 337660 + }, + { + "epoch": 2.9851128909634186, + "grad_norm": 1.474456787109375, + "learning_rate": 2.481184839430212e-07, + "loss": 0.5085, + "step": 337670 + }, + { + "epoch": 2.9852012942237307, + "grad_norm": 3.715665817260742, + "learning_rate": 2.466450962711505e-07, + "loss": 0.5076, + "step": 337680 + }, + { + "epoch": 2.9852896974840433, + "grad_norm": 3.4692978858947754, + "learning_rate": 2.451717085992798e-07, + "loss": 0.5089, + "step": 337690 + }, + { + "epoch": 2.9853781007443554, + "grad_norm": 1.7489097118377686, + "learning_rate": 2.4369832092740916e-07, + "loss": 0.3638, + "step": 337700 + }, + { + "epoch": 2.985466504004668, + "grad_norm": 9.565556526184082, + "learning_rate": 2.4222493325553847e-07, + "loss": 0.5411, + "step": 337710 + }, + { + "epoch": 2.98555490726498, + "grad_norm": 4.116927146911621, + "learning_rate": 2.4075154558366783e-07, + "loss": 0.4809, + "step": 337720 + }, + { + "epoch": 2.985643310525292, + "grad_norm": 4.679140567779541, + "learning_rate": 2.3927815791179713e-07, + "loss": 0.4679, + "step": 337730 + }, + { + "epoch": 2.9857317137856043, + "grad_norm": 5.09114933013916, + "learning_rate": 2.3780477023992647e-07, + "loss": 0.4447, + "step": 337740 + }, + { + "epoch": 2.9858201170459164, + "grad_norm": 3.078909158706665, + "learning_rate": 2.363313825680558e-07, + "loss": 0.4846, + "step": 337750 + }, + { + "epoch": 2.985908520306229, + "grad_norm": 3.474811315536499, + "learning_rate": 2.348579948961851e-07, + "loss": 0.4208, + "step": 337760 + }, + { + "epoch": 2.985996923566541, + "grad_norm": 1.8490434885025024, + "learning_rate": 2.3338460722431444e-07, + "loss": 0.4263, + "step": 337770 + }, + { + "epoch": 2.9860853268268532, + "grad_norm": 12.788832664489746, + "learning_rate": 2.3191121955244377e-07, + "loss": 0.3843, + "step": 337780 + }, + { + "epoch": 2.986173730087166, + "grad_norm": 2.418153762817383, + "learning_rate": 2.3043783188057313e-07, + "loss": 0.5045, + "step": 337790 + }, + { + "epoch": 2.986262133347478, + "grad_norm": 4.553444862365723, + "learning_rate": 2.2896444420870241e-07, + "loss": 0.6178, + "step": 337800 + }, + { + "epoch": 2.98635053660779, + "grad_norm": 4.284328937530518, + "learning_rate": 2.2749105653683175e-07, + "loss": 0.5541, + "step": 337810 + }, + { + "epoch": 2.986438939868102, + "grad_norm": 3.8286428451538086, + "learning_rate": 2.260176688649611e-07, + "loss": 0.5936, + "step": 337820 + }, + { + "epoch": 2.9865273431284147, + "grad_norm": 4.57826566696167, + "learning_rate": 2.245442811930904e-07, + "loss": 0.6624, + "step": 337830 + }, + { + "epoch": 2.986615746388727, + "grad_norm": 1.4539895057678223, + "learning_rate": 2.2307089352121975e-07, + "loss": 0.4187, + "step": 337840 + }, + { + "epoch": 2.986704149649039, + "grad_norm": 2.7720773220062256, + "learning_rate": 2.2159750584934908e-07, + "loss": 0.504, + "step": 337850 + }, + { + "epoch": 2.9867925529093515, + "grad_norm": 3.524150848388672, + "learning_rate": 2.201241181774784e-07, + "loss": 0.4841, + "step": 337860 + }, + { + "epoch": 2.9868809561696636, + "grad_norm": 6.999786853790283, + "learning_rate": 2.1865073050560772e-07, + "loss": 0.6587, + "step": 337870 + }, + { + "epoch": 2.9869693594299758, + "grad_norm": 2.659257411956787, + "learning_rate": 2.1717734283373705e-07, + "loss": 0.4614, + "step": 337880 + }, + { + "epoch": 2.987057762690288, + "grad_norm": 2.697763204574585, + "learning_rate": 2.157039551618664e-07, + "loss": 0.4615, + "step": 337890 + }, + { + "epoch": 2.9871461659506, + "grad_norm": 8.527664184570312, + "learning_rate": 2.142305674899957e-07, + "loss": 0.5519, + "step": 337900 + }, + { + "epoch": 2.9872345692109126, + "grad_norm": 2.919248104095459, + "learning_rate": 2.1275717981812503e-07, + "loss": 0.4964, + "step": 337910 + }, + { + "epoch": 2.9873229724712247, + "grad_norm": 3.884854793548584, + "learning_rate": 2.112837921462544e-07, + "loss": 0.5026, + "step": 337920 + }, + { + "epoch": 2.9874113757315373, + "grad_norm": 2.8500635623931885, + "learning_rate": 2.0981040447438367e-07, + "loss": 0.4877, + "step": 337930 + }, + { + "epoch": 2.9874997789918494, + "grad_norm": 8.19324779510498, + "learning_rate": 2.08337016802513e-07, + "loss": 0.4761, + "step": 337940 + }, + { + "epoch": 2.9875881822521615, + "grad_norm": 3.189567804336548, + "learning_rate": 2.0686362913064236e-07, + "loss": 0.5, + "step": 337950 + }, + { + "epoch": 2.9876765855124736, + "grad_norm": 9.60926342010498, + "learning_rate": 2.053902414587717e-07, + "loss": 0.557, + "step": 337960 + }, + { + "epoch": 2.9877649887727857, + "grad_norm": 2.0089516639709473, + "learning_rate": 2.03916853786901e-07, + "loss": 0.4896, + "step": 337970 + }, + { + "epoch": 2.9878533920330983, + "grad_norm": 6.226129055023193, + "learning_rate": 2.0244346611503033e-07, + "loss": 0.526, + "step": 337980 + }, + { + "epoch": 2.9879417952934104, + "grad_norm": 2.228928327560425, + "learning_rate": 2.0097007844315967e-07, + "loss": 0.4295, + "step": 337990 + }, + { + "epoch": 2.9880301985537225, + "grad_norm": 3.0420830249786377, + "learning_rate": 1.9949669077128897e-07, + "loss": 0.4502, + "step": 338000 + }, + { + "epoch": 2.988118601814035, + "grad_norm": 1.3948390483856201, + "learning_rate": 1.980233030994183e-07, + "loss": 0.5238, + "step": 338010 + }, + { + "epoch": 2.9882070050743472, + "grad_norm": 7.853383541107178, + "learning_rate": 1.9654991542754764e-07, + "loss": 0.4426, + "step": 338020 + }, + { + "epoch": 2.9882954083346593, + "grad_norm": 10.274785041809082, + "learning_rate": 1.9507652775567697e-07, + "loss": 0.4394, + "step": 338030 + }, + { + "epoch": 2.9883838115949715, + "grad_norm": 3.5778865814208984, + "learning_rate": 1.936031400838063e-07, + "loss": 0.4353, + "step": 338040 + }, + { + "epoch": 2.988472214855284, + "grad_norm": 2.925502300262451, + "learning_rate": 1.9212975241193562e-07, + "loss": 0.3656, + "step": 338050 + }, + { + "epoch": 2.988560618115596, + "grad_norm": 2.559934616088867, + "learning_rate": 1.9065636474006495e-07, + "loss": 0.4761, + "step": 338060 + }, + { + "epoch": 2.9886490213759083, + "grad_norm": 3.7020928859710693, + "learning_rate": 1.8918297706819428e-07, + "loss": 0.3955, + "step": 338070 + }, + { + "epoch": 2.988737424636221, + "grad_norm": 27.109031677246094, + "learning_rate": 1.8770958939632362e-07, + "loss": 0.3664, + "step": 338080 + }, + { + "epoch": 2.988825827896533, + "grad_norm": 1.9491606950759888, + "learning_rate": 1.8623620172445295e-07, + "loss": 0.52, + "step": 338090 + }, + { + "epoch": 2.988914231156845, + "grad_norm": 11.763164520263672, + "learning_rate": 1.8476281405258226e-07, + "loss": 0.685, + "step": 338100 + }, + { + "epoch": 2.989002634417157, + "grad_norm": 1.8241640329360962, + "learning_rate": 1.8328942638071162e-07, + "loss": 0.5627, + "step": 338110 + }, + { + "epoch": 2.9890910376774693, + "grad_norm": 1.706583857536316, + "learning_rate": 1.8181603870884092e-07, + "loss": 0.5689, + "step": 338120 + }, + { + "epoch": 2.989179440937782, + "grad_norm": 1.5819331407546997, + "learning_rate": 1.8034265103697026e-07, + "loss": 0.4636, + "step": 338130 + }, + { + "epoch": 2.989267844198094, + "grad_norm": 4.7823486328125, + "learning_rate": 1.788692633650996e-07, + "loss": 0.4699, + "step": 338140 + }, + { + "epoch": 2.989356247458406, + "grad_norm": 2.1073477268218994, + "learning_rate": 1.773958756932289e-07, + "loss": 0.5617, + "step": 338150 + }, + { + "epoch": 2.9894446507187187, + "grad_norm": 5.353991985321045, + "learning_rate": 1.7592248802135826e-07, + "loss": 0.5314, + "step": 338160 + }, + { + "epoch": 2.989533053979031, + "grad_norm": 4.37054967880249, + "learning_rate": 1.7444910034948756e-07, + "loss": 0.4688, + "step": 338170 + }, + { + "epoch": 2.989621457239343, + "grad_norm": 14.944483757019043, + "learning_rate": 1.729757126776169e-07, + "loss": 0.6489, + "step": 338180 + }, + { + "epoch": 2.989709860499655, + "grad_norm": 2.9047865867614746, + "learning_rate": 1.7150232500574623e-07, + "loss": 0.6127, + "step": 338190 + }, + { + "epoch": 2.9897982637599676, + "grad_norm": 1.9622273445129395, + "learning_rate": 1.7002893733387554e-07, + "loss": 0.5028, + "step": 338200 + }, + { + "epoch": 2.9898866670202797, + "grad_norm": 0.36765018105506897, + "learning_rate": 1.6855554966200487e-07, + "loss": 0.4532, + "step": 338210 + }, + { + "epoch": 2.989975070280592, + "grad_norm": 6.987924098968506, + "learning_rate": 1.670821619901342e-07, + "loss": 0.4967, + "step": 338220 + }, + { + "epoch": 2.9900634735409044, + "grad_norm": 3.1975812911987305, + "learning_rate": 1.6560877431826354e-07, + "loss": 0.5095, + "step": 338230 + }, + { + "epoch": 2.9901518768012165, + "grad_norm": 2.619481325149536, + "learning_rate": 1.6413538664639287e-07, + "loss": 0.454, + "step": 338240 + }, + { + "epoch": 2.9902402800615286, + "grad_norm": 1.5460728406906128, + "learning_rate": 1.6266199897452218e-07, + "loss": 0.5644, + "step": 338250 + }, + { + "epoch": 2.9903286833218408, + "grad_norm": 7.368630409240723, + "learning_rate": 1.611886113026515e-07, + "loss": 0.6279, + "step": 338260 + }, + { + "epoch": 2.990417086582153, + "grad_norm": 4.164431095123291, + "learning_rate": 1.5971522363078084e-07, + "loss": 0.5428, + "step": 338270 + }, + { + "epoch": 2.9905054898424654, + "grad_norm": 0.9661316275596619, + "learning_rate": 1.5824183595891018e-07, + "loss": 0.5391, + "step": 338280 + }, + { + "epoch": 2.9905938931027776, + "grad_norm": 2.4151127338409424, + "learning_rate": 1.567684482870395e-07, + "loss": 0.4128, + "step": 338290 + }, + { + "epoch": 2.99068229636309, + "grad_norm": 2.5646607875823975, + "learning_rate": 1.5529506061516884e-07, + "loss": 0.4167, + "step": 338300 + }, + { + "epoch": 2.9907706996234023, + "grad_norm": 3.670832872390747, + "learning_rate": 1.5382167294329815e-07, + "loss": 0.6453, + "step": 338310 + }, + { + "epoch": 2.9908591028837144, + "grad_norm": 4.045512676239014, + "learning_rate": 1.5234828527142748e-07, + "loss": 0.5188, + "step": 338320 + }, + { + "epoch": 2.9909475061440265, + "grad_norm": 1.8340681791305542, + "learning_rate": 1.5087489759955682e-07, + "loss": 0.5222, + "step": 338330 + }, + { + "epoch": 2.9910359094043386, + "grad_norm": 6.435035705566406, + "learning_rate": 1.4940150992768612e-07, + "loss": 0.5137, + "step": 338340 + }, + { + "epoch": 2.991124312664651, + "grad_norm": 3.4783475399017334, + "learning_rate": 1.4792812225581548e-07, + "loss": 0.4435, + "step": 338350 + }, + { + "epoch": 2.9912127159249633, + "grad_norm": 9.507773399353027, + "learning_rate": 1.464547345839448e-07, + "loss": 0.5072, + "step": 338360 + }, + { + "epoch": 2.9913011191852754, + "grad_norm": 7.145630836486816, + "learning_rate": 1.4498134691207412e-07, + "loss": 0.5871, + "step": 338370 + }, + { + "epoch": 2.991389522445588, + "grad_norm": 3.470972776412964, + "learning_rate": 1.4350795924020346e-07, + "loss": 0.4754, + "step": 338380 + }, + { + "epoch": 2.9914779257059, + "grad_norm": 4.152092933654785, + "learning_rate": 1.4203457156833276e-07, + "loss": 0.4228, + "step": 338390 + }, + { + "epoch": 2.991566328966212, + "grad_norm": 4.496047019958496, + "learning_rate": 1.4056118389646212e-07, + "loss": 0.49, + "step": 338400 + }, + { + "epoch": 2.9916547322265243, + "grad_norm": 2.6030025482177734, + "learning_rate": 1.3908779622459143e-07, + "loss": 0.46, + "step": 338410 + }, + { + "epoch": 2.991743135486837, + "grad_norm": 5.737709045410156, + "learning_rate": 1.3761440855272076e-07, + "loss": 0.523, + "step": 338420 + }, + { + "epoch": 2.991831538747149, + "grad_norm": 1.5194557905197144, + "learning_rate": 1.361410208808501e-07, + "loss": 0.539, + "step": 338430 + }, + { + "epoch": 2.991919942007461, + "grad_norm": 11.498824119567871, + "learning_rate": 1.3466763320897943e-07, + "loss": 0.5544, + "step": 338440 + }, + { + "epoch": 2.9920083452677737, + "grad_norm": 2.0029001235961914, + "learning_rate": 1.3319424553710874e-07, + "loss": 0.4606, + "step": 338450 + }, + { + "epoch": 2.992096748528086, + "grad_norm": 6.460208415985107, + "learning_rate": 1.3172085786523807e-07, + "loss": 0.5079, + "step": 338460 + }, + { + "epoch": 2.992185151788398, + "grad_norm": 11.213284492492676, + "learning_rate": 1.302474701933674e-07, + "loss": 0.4617, + "step": 338470 + }, + { + "epoch": 2.99227355504871, + "grad_norm": 2.2948496341705322, + "learning_rate": 1.2877408252149674e-07, + "loss": 0.6108, + "step": 338480 + }, + { + "epoch": 2.992361958309022, + "grad_norm": 1.533443808555603, + "learning_rate": 1.2730069484962607e-07, + "loss": 0.5535, + "step": 338490 + }, + { + "epoch": 2.9924503615693347, + "grad_norm": 4.955904483795166, + "learning_rate": 1.2582730717775538e-07, + "loss": 0.5033, + "step": 338500 + }, + { + "epoch": 2.992538764829647, + "grad_norm": 2.269049882888794, + "learning_rate": 1.243539195058847e-07, + "loss": 0.5466, + "step": 338510 + }, + { + "epoch": 2.9926271680899594, + "grad_norm": 6.8285369873046875, + "learning_rate": 1.2288053183401404e-07, + "loss": 0.4971, + "step": 338520 + }, + { + "epoch": 2.9927155713502716, + "grad_norm": 3.4520983695983887, + "learning_rate": 1.2140714416214338e-07, + "loss": 0.5319, + "step": 338530 + }, + { + "epoch": 2.9928039746105837, + "grad_norm": 13.792840957641602, + "learning_rate": 1.199337564902727e-07, + "loss": 0.5008, + "step": 338540 + }, + { + "epoch": 2.992892377870896, + "grad_norm": 11.089679718017578, + "learning_rate": 1.1846036881840203e-07, + "loss": 0.5013, + "step": 338550 + }, + { + "epoch": 2.992980781131208, + "grad_norm": 9.119688034057617, + "learning_rate": 1.1698698114653136e-07, + "loss": 0.4936, + "step": 338560 + }, + { + "epoch": 2.9930691843915205, + "grad_norm": 1.2945135831832886, + "learning_rate": 1.1551359347466068e-07, + "loss": 0.559, + "step": 338570 + }, + { + "epoch": 2.9931575876518326, + "grad_norm": 20.096479415893555, + "learning_rate": 1.1404020580279e-07, + "loss": 0.6831, + "step": 338580 + }, + { + "epoch": 2.9932459909121447, + "grad_norm": 4.231954097747803, + "learning_rate": 1.1256681813091934e-07, + "loss": 0.4415, + "step": 338590 + }, + { + "epoch": 2.9933343941724573, + "grad_norm": 1.6186569929122925, + "learning_rate": 1.1109343045904866e-07, + "loss": 0.4968, + "step": 338600 + }, + { + "epoch": 2.9934227974327694, + "grad_norm": 4.36681604385376, + "learning_rate": 1.09620042787178e-07, + "loss": 0.6215, + "step": 338610 + }, + { + "epoch": 2.9935112006930815, + "grad_norm": 4.056987762451172, + "learning_rate": 1.0814665511530732e-07, + "loss": 0.5125, + "step": 338620 + }, + { + "epoch": 2.9935996039533936, + "grad_norm": 3.986854314804077, + "learning_rate": 1.0667326744343666e-07, + "loss": 0.4877, + "step": 338630 + }, + { + "epoch": 2.993688007213706, + "grad_norm": 2.641148567199707, + "learning_rate": 1.0519987977156598e-07, + "loss": 0.5006, + "step": 338640 + }, + { + "epoch": 2.9937764104740183, + "grad_norm": 3.91945481300354, + "learning_rate": 1.037264920996953e-07, + "loss": 0.4762, + "step": 338650 + }, + { + "epoch": 2.9938648137343304, + "grad_norm": 5.998415946960449, + "learning_rate": 1.0225310442782464e-07, + "loss": 0.5226, + "step": 338660 + }, + { + "epoch": 2.993953216994643, + "grad_norm": 11.291964530944824, + "learning_rate": 1.0077971675595396e-07, + "loss": 0.5317, + "step": 338670 + }, + { + "epoch": 2.994041620254955, + "grad_norm": 7.971269130706787, + "learning_rate": 9.93063290840833e-08, + "loss": 0.5268, + "step": 338680 + }, + { + "epoch": 2.9941300235152672, + "grad_norm": 2.1859631538391113, + "learning_rate": 9.783294141221262e-08, + "loss": 0.5694, + "step": 338690 + }, + { + "epoch": 2.9942184267755794, + "grad_norm": 2.5922508239746094, + "learning_rate": 9.635955374034195e-08, + "loss": 0.4604, + "step": 338700 + }, + { + "epoch": 2.9943068300358915, + "grad_norm": 4.04775333404541, + "learning_rate": 9.488616606847127e-08, + "loss": 0.4925, + "step": 338710 + }, + { + "epoch": 2.994395233296204, + "grad_norm": 11.298054695129395, + "learning_rate": 9.34127783966006e-08, + "loss": 0.4718, + "step": 338720 + }, + { + "epoch": 2.994483636556516, + "grad_norm": 22.640928268432617, + "learning_rate": 9.193939072472994e-08, + "loss": 0.5547, + "step": 338730 + }, + { + "epoch": 2.9945720398168283, + "grad_norm": 1.973728060722351, + "learning_rate": 9.046600305285926e-08, + "loss": 0.4164, + "step": 338740 + }, + { + "epoch": 2.994660443077141, + "grad_norm": 4.244152545928955, + "learning_rate": 8.899261538098859e-08, + "loss": 0.5395, + "step": 338750 + }, + { + "epoch": 2.994748846337453, + "grad_norm": 1.7364325523376465, + "learning_rate": 8.751922770911791e-08, + "loss": 0.4632, + "step": 338760 + }, + { + "epoch": 2.994837249597765, + "grad_norm": 2.625222682952881, + "learning_rate": 8.604584003724725e-08, + "loss": 0.4879, + "step": 338770 + }, + { + "epoch": 2.994925652858077, + "grad_norm": 5.204318046569824, + "learning_rate": 8.457245236537658e-08, + "loss": 0.599, + "step": 338780 + }, + { + "epoch": 2.99501405611839, + "grad_norm": 8.929240226745605, + "learning_rate": 8.30990646935059e-08, + "loss": 0.531, + "step": 338790 + }, + { + "epoch": 2.995102459378702, + "grad_norm": 4.245266914367676, + "learning_rate": 8.162567702163523e-08, + "loss": 0.4409, + "step": 338800 + }, + { + "epoch": 2.995190862639014, + "grad_norm": 2.262974500656128, + "learning_rate": 8.015228934976455e-08, + "loss": 0.6167, + "step": 338810 + }, + { + "epoch": 2.9952792658993266, + "grad_norm": 2.645019769668579, + "learning_rate": 7.867890167789389e-08, + "loss": 0.4485, + "step": 338820 + }, + { + "epoch": 2.9953676691596387, + "grad_norm": 2.968596935272217, + "learning_rate": 7.72055140060232e-08, + "loss": 0.5304, + "step": 338830 + }, + { + "epoch": 2.995456072419951, + "grad_norm": 3.6404337882995605, + "learning_rate": 7.573212633415254e-08, + "loss": 0.4991, + "step": 338840 + }, + { + "epoch": 2.995544475680263, + "grad_norm": 7.264307022094727, + "learning_rate": 7.425873866228187e-08, + "loss": 0.4785, + "step": 338850 + }, + { + "epoch": 2.995632878940575, + "grad_norm": 3.702834367752075, + "learning_rate": 7.27853509904112e-08, + "loss": 0.6267, + "step": 338860 + }, + { + "epoch": 2.9957212822008876, + "grad_norm": 3.9097955226898193, + "learning_rate": 7.131196331854053e-08, + "loss": 0.5125, + "step": 338870 + }, + { + "epoch": 2.9958096854611997, + "grad_norm": 7.990666389465332, + "learning_rate": 6.983857564666985e-08, + "loss": 0.4487, + "step": 338880 + }, + { + "epoch": 2.9958980887215123, + "grad_norm": 3.677964687347412, + "learning_rate": 6.836518797479918e-08, + "loss": 0.4947, + "step": 338890 + }, + { + "epoch": 2.9959864919818244, + "grad_norm": 1.8501721620559692, + "learning_rate": 6.689180030292851e-08, + "loss": 0.4357, + "step": 338900 + }, + { + "epoch": 2.9960748952421365, + "grad_norm": 4.673166751861572, + "learning_rate": 6.541841263105783e-08, + "loss": 0.6904, + "step": 338910 + }, + { + "epoch": 2.9961632985024487, + "grad_norm": 2.0012056827545166, + "learning_rate": 6.394502495918717e-08, + "loss": 0.5815, + "step": 338920 + }, + { + "epoch": 2.996251701762761, + "grad_norm": 5.870586395263672, + "learning_rate": 6.24716372873165e-08, + "loss": 0.6381, + "step": 338930 + }, + { + "epoch": 2.9963401050230734, + "grad_norm": 5.706888198852539, + "learning_rate": 6.099824961544582e-08, + "loss": 0.5476, + "step": 338940 + }, + { + "epoch": 2.9964285082833855, + "grad_norm": 6.620737075805664, + "learning_rate": 5.9524861943575146e-08, + "loss": 0.3894, + "step": 338950 + }, + { + "epoch": 2.9965169115436976, + "grad_norm": 2.8109772205352783, + "learning_rate": 5.805147427170447e-08, + "loss": 0.6037, + "step": 338960 + }, + { + "epoch": 2.99660531480401, + "grad_norm": 8.355143547058105, + "learning_rate": 5.6578086599833806e-08, + "loss": 0.5517, + "step": 338970 + }, + { + "epoch": 2.9966937180643223, + "grad_norm": 2.563058376312256, + "learning_rate": 5.510469892796313e-08, + "loss": 0.5426, + "step": 338980 + }, + { + "epoch": 2.9967821213246344, + "grad_norm": 3.9081332683563232, + "learning_rate": 5.3631311256092466e-08, + "loss": 0.439, + "step": 338990 + }, + { + "epoch": 2.9968705245849465, + "grad_norm": 2.0069940090179443, + "learning_rate": 5.2157923584221786e-08, + "loss": 0.4724, + "step": 339000 + }, + { + "epoch": 2.996958927845259, + "grad_norm": 1.0275566577911377, + "learning_rate": 5.068453591235111e-08, + "loss": 0.4637, + "step": 339010 + }, + { + "epoch": 2.997047331105571, + "grad_norm": 6.3268938064575195, + "learning_rate": 4.9211148240480446e-08, + "loss": 0.5233, + "step": 339020 + }, + { + "epoch": 2.9971357343658833, + "grad_norm": 2.2824134826660156, + "learning_rate": 4.773776056860977e-08, + "loss": 0.4657, + "step": 339030 + }, + { + "epoch": 2.997224137626196, + "grad_norm": 8.161417007446289, + "learning_rate": 4.62643728967391e-08, + "loss": 0.3513, + "step": 339040 + }, + { + "epoch": 2.997312540886508, + "grad_norm": 6.218818187713623, + "learning_rate": 4.4790985224868427e-08, + "loss": 0.6232, + "step": 339050 + }, + { + "epoch": 2.99740094414682, + "grad_norm": 3.3830366134643555, + "learning_rate": 4.331759755299775e-08, + "loss": 0.5342, + "step": 339060 + }, + { + "epoch": 2.9974893474071322, + "grad_norm": 3.2778093814849854, + "learning_rate": 4.1844209881127087e-08, + "loss": 0.5869, + "step": 339070 + }, + { + "epoch": 2.9975777506674444, + "grad_norm": 3.6108992099761963, + "learning_rate": 4.0370822209256413e-08, + "loss": 0.4766, + "step": 339080 + }, + { + "epoch": 2.997666153927757, + "grad_norm": 8.890320777893066, + "learning_rate": 3.889743453738574e-08, + "loss": 0.5002, + "step": 339090 + }, + { + "epoch": 2.997754557188069, + "grad_norm": 3.2900781631469727, + "learning_rate": 3.742404686551507e-08, + "loss": 0.4291, + "step": 339100 + }, + { + "epoch": 2.9978429604483816, + "grad_norm": 3.012970447540283, + "learning_rate": 3.5950659193644394e-08, + "loss": 0.5347, + "step": 339110 + }, + { + "epoch": 2.9979313637086937, + "grad_norm": 1.5164387226104736, + "learning_rate": 3.447727152177373e-08, + "loss": 0.5952, + "step": 339120 + }, + { + "epoch": 2.998019766969006, + "grad_norm": 4.753470420837402, + "learning_rate": 3.3003883849903054e-08, + "loss": 0.5527, + "step": 339130 + }, + { + "epoch": 2.998108170229318, + "grad_norm": 2.888798236846924, + "learning_rate": 3.153049617803238e-08, + "loss": 0.5567, + "step": 339140 + }, + { + "epoch": 2.99819657348963, + "grad_norm": 2.872748374938965, + "learning_rate": 3.005710850616171e-08, + "loss": 0.4679, + "step": 339150 + }, + { + "epoch": 2.9982849767499427, + "grad_norm": 1.553507924079895, + "learning_rate": 2.8583720834291037e-08, + "loss": 0.3948, + "step": 339160 + }, + { + "epoch": 2.9983733800102548, + "grad_norm": 4.6815667152404785, + "learning_rate": 2.7110333162420367e-08, + "loss": 0.4639, + "step": 339170 + }, + { + "epoch": 2.998461783270567, + "grad_norm": 1.903071641921997, + "learning_rate": 2.563694549054969e-08, + "loss": 0.5668, + "step": 339180 + }, + { + "epoch": 2.9985501865308795, + "grad_norm": 2.362048625946045, + "learning_rate": 2.416355781867902e-08, + "loss": 0.5222, + "step": 339190 + }, + { + "epoch": 2.9986385897911916, + "grad_norm": 8.438667297363281, + "learning_rate": 2.2690170146808347e-08, + "loss": 0.553, + "step": 339200 + }, + { + "epoch": 2.9987269930515037, + "grad_norm": 4.221798896789551, + "learning_rate": 2.1216782474937677e-08, + "loss": 0.5172, + "step": 339210 + }, + { + "epoch": 2.998815396311816, + "grad_norm": 8.509511947631836, + "learning_rate": 1.9743394803067004e-08, + "loss": 0.5611, + "step": 339220 + }, + { + "epoch": 2.9989037995721284, + "grad_norm": 2.744645833969116, + "learning_rate": 1.8270007131196334e-08, + "loss": 0.5702, + "step": 339230 + }, + { + "epoch": 2.9989922028324405, + "grad_norm": 2.1018950939178467, + "learning_rate": 1.679661945932566e-08, + "loss": 0.5044, + "step": 339240 + }, + { + "epoch": 2.9990806060927526, + "grad_norm": 4.280818462371826, + "learning_rate": 1.5323231787454988e-08, + "loss": 0.5679, + "step": 339250 + }, + { + "epoch": 2.999169009353065, + "grad_norm": 2.8638226985931396, + "learning_rate": 1.3849844115584318e-08, + "loss": 0.6278, + "step": 339260 + }, + { + "epoch": 2.9992574126133773, + "grad_norm": 2.1509952545166016, + "learning_rate": 1.2376456443713644e-08, + "loss": 0.5434, + "step": 339270 + }, + { + "epoch": 2.9993458158736894, + "grad_norm": 1.7755389213562012, + "learning_rate": 1.0903068771842973e-08, + "loss": 0.4363, + "step": 339280 + }, + { + "epoch": 2.9994342191340015, + "grad_norm": 2.8123910427093506, + "learning_rate": 9.429681099972301e-09, + "loss": 0.5666, + "step": 339290 + }, + { + "epoch": 2.9995226223943137, + "grad_norm": 10.404129028320312, + "learning_rate": 7.95629342810163e-09, + "loss": 0.5682, + "step": 339300 + }, + { + "epoch": 2.9996110256546262, + "grad_norm": 2.2805697917938232, + "learning_rate": 6.482905756230956e-09, + "loss": 0.4466, + "step": 339310 + }, + { + "epoch": 2.9996994289149383, + "grad_norm": 1.6643775701522827, + "learning_rate": 5.0095180843602845e-09, + "loss": 0.494, + "step": 339320 + }, + { + "epoch": 2.9997878321752505, + "grad_norm": 2.124772548675537, + "learning_rate": 3.536130412489613e-09, + "loss": 0.4325, + "step": 339330 + }, + { + "epoch": 2.999876235435563, + "grad_norm": 0.8858053684234619, + "learning_rate": 2.062742740618941e-09, + "loss": 0.3654, + "step": 339340 + }, + { + "epoch": 2.999964638695875, + "grad_norm": 13.786726951599121, + "learning_rate": 5.893550687482688e-10, + "loss": 0.5006, + "step": 339350 + }, + { + "epoch": 3.0, + "eval_loss": 0.5747780203819275, + "eval_runtime": 1555.3283, + "eval_samples_per_second": 290.916, + "eval_steps_per_second": 18.183, + "step": 339354 + } + ], + "logging_steps": 10, + "max_steps": 339354, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.625577278664192e+17, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}