{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 88.49557522123894, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08849557522123894, "grad_norm": 6.029814720153809, "learning_rate": 1.8e-07, "loss": 0.8556, "step": 10 }, { "epoch": 0.17699115044247787, "grad_norm": 5.713383674621582, "learning_rate": 3.8e-07, "loss": 0.8595, "step": 20 }, { "epoch": 0.26548672566371684, "grad_norm": 5.548460960388184, "learning_rate": 5.8e-07, "loss": 0.8439, "step": 30 }, { "epoch": 0.35398230088495575, "grad_norm": 4.973753929138184, "learning_rate": 7.8e-07, "loss": 0.8031, "step": 40 }, { "epoch": 0.4424778761061947, "grad_norm": 4.316400051116943, "learning_rate": 9.8e-07, "loss": 0.7054, "step": 50 }, { "epoch": 0.5309734513274337, "grad_norm": 2.757798194885254, "learning_rate": 1.18e-06, "loss": 0.6014, "step": 60 }, { "epoch": 0.6194690265486725, "grad_norm": 2.4811291694641113, "learning_rate": 1.3800000000000001e-06, "loss": 0.4621, "step": 70 }, { "epoch": 0.7079646017699115, "grad_norm": 1.1165071725845337, "learning_rate": 1.5800000000000003e-06, "loss": 0.3772, "step": 80 }, { "epoch": 0.7964601769911505, "grad_norm": 0.6678406596183777, "learning_rate": 1.7800000000000001e-06, "loss": 0.2607, "step": 90 }, { "epoch": 0.8849557522123894, "grad_norm": 0.5095370411872864, "learning_rate": 1.98e-06, "loss": 0.2246, "step": 100 }, { "epoch": 0.9734513274336283, "grad_norm": 0.3825031518936157, "learning_rate": 2.1800000000000003e-06, "loss": 0.1869, "step": 110 }, { "epoch": 1.0619469026548674, "grad_norm": 0.31909793615341187, "learning_rate": 2.38e-06, "loss": 0.1633, "step": 120 }, { "epoch": 1.1504424778761062, "grad_norm": 0.27751412987709045, "learning_rate": 2.5800000000000003e-06, "loss": 0.1487, "step": 130 }, { "epoch": 1.238938053097345, "grad_norm": 0.2241702675819397, "learning_rate": 2.78e-06, "loss": 0.1311, "step": 140 }, { "epoch": 1.3274336283185841, "grad_norm": 0.29508182406425476, "learning_rate": 2.9800000000000003e-06, "loss": 0.1256, "step": 150 }, { "epoch": 1.415929203539823, "grad_norm": 0.20176096260547638, "learning_rate": 3.1800000000000005e-06, "loss": 0.1161, "step": 160 }, { "epoch": 1.504424778761062, "grad_norm": 0.23275262117385864, "learning_rate": 3.38e-06, "loss": 0.1121, "step": 170 }, { "epoch": 1.592920353982301, "grad_norm": 0.21292689442634583, "learning_rate": 3.58e-06, "loss": 0.1022, "step": 180 }, { "epoch": 1.6814159292035398, "grad_norm": 0.352565735578537, "learning_rate": 3.7800000000000002e-06, "loss": 0.0989, "step": 190 }, { "epoch": 1.7699115044247788, "grad_norm": 0.1692201793193817, "learning_rate": 3.98e-06, "loss": 0.0942, "step": 200 }, { "epoch": 1.8584070796460177, "grad_norm": 0.1878710687160492, "learning_rate": 4.18e-06, "loss": 0.0852, "step": 210 }, { "epoch": 1.9469026548672566, "grad_norm": 0.18498444557189941, "learning_rate": 4.38e-06, "loss": 0.0827, "step": 220 }, { "epoch": 2.0353982300884956, "grad_norm": 0.18947617709636688, "learning_rate": 4.58e-06, "loss": 0.0785, "step": 230 }, { "epoch": 2.1238938053097347, "grad_norm": 0.18357056379318237, "learning_rate": 4.780000000000001e-06, "loss": 0.0751, "step": 240 }, { "epoch": 2.2123893805309733, "grad_norm": 0.1946367770433426, "learning_rate": 4.98e-06, "loss": 0.0724, "step": 250 }, { "epoch": 2.3008849557522124, "grad_norm": 0.17680567502975464, "learning_rate": 5.18e-06, "loss": 0.0673, "step": 260 }, { "epoch": 2.3893805309734515, "grad_norm": 0.1475750356912613, "learning_rate": 5.38e-06, "loss": 0.0649, "step": 270 }, { "epoch": 2.47787610619469, "grad_norm": 0.16969521343708038, "learning_rate": 5.580000000000001e-06, "loss": 0.0626, "step": 280 }, { "epoch": 2.566371681415929, "grad_norm": 0.1681053340435028, "learning_rate": 5.78e-06, "loss": 0.0592, "step": 290 }, { "epoch": 2.6548672566371683, "grad_norm": 0.1768426150083542, "learning_rate": 5.98e-06, "loss": 0.058, "step": 300 }, { "epoch": 2.7433628318584073, "grad_norm": 0.21786381304264069, "learning_rate": 6.18e-06, "loss": 0.056, "step": 310 }, { "epoch": 2.831858407079646, "grad_norm": 0.1621713936328888, "learning_rate": 6.38e-06, "loss": 0.0552, "step": 320 }, { "epoch": 2.920353982300885, "grad_norm": 0.19948254525661469, "learning_rate": 6.58e-06, "loss": 0.0514, "step": 330 }, { "epoch": 3.0088495575221237, "grad_norm": 0.2901497185230255, "learning_rate": 6.78e-06, "loss": 0.0499, "step": 340 }, { "epoch": 3.0973451327433628, "grad_norm": 0.21824175119400024, "learning_rate": 6.98e-06, "loss": 0.0469, "step": 350 }, { "epoch": 3.185840707964602, "grad_norm": 0.18174271285533905, "learning_rate": 7.180000000000001e-06, "loss": 0.0481, "step": 360 }, { "epoch": 3.274336283185841, "grad_norm": 0.19068759679794312, "learning_rate": 7.3800000000000005e-06, "loss": 0.0461, "step": 370 }, { "epoch": 3.3628318584070795, "grad_norm": 0.23360292613506317, "learning_rate": 7.580000000000001e-06, "loss": 0.0418, "step": 380 }, { "epoch": 3.4513274336283186, "grad_norm": 0.1604355126619339, "learning_rate": 7.78e-06, "loss": 0.0404, "step": 390 }, { "epoch": 3.5398230088495577, "grad_norm": 0.1779727339744568, "learning_rate": 7.98e-06, "loss": 0.0401, "step": 400 }, { "epoch": 3.6283185840707963, "grad_norm": 0.15471251308918, "learning_rate": 8.18e-06, "loss": 0.0389, "step": 410 }, { "epoch": 3.7168141592920354, "grad_norm": 0.1907050460577011, "learning_rate": 8.380000000000001e-06, "loss": 0.0383, "step": 420 }, { "epoch": 3.8053097345132745, "grad_norm": 0.180426225066185, "learning_rate": 8.580000000000001e-06, "loss": 0.0357, "step": 430 }, { "epoch": 3.893805309734513, "grad_norm": 0.16844674944877625, "learning_rate": 8.78e-06, "loss": 0.0359, "step": 440 }, { "epoch": 3.982300884955752, "grad_norm": 0.17488037049770355, "learning_rate": 8.98e-06, "loss": 0.0353, "step": 450 }, { "epoch": 4.070796460176991, "grad_norm": 0.1792651265859604, "learning_rate": 9.180000000000002e-06, "loss": 0.0327, "step": 460 }, { "epoch": 4.15929203539823, "grad_norm": 0.19573146104812622, "learning_rate": 9.38e-06, "loss": 0.0326, "step": 470 }, { "epoch": 4.247787610619469, "grad_norm": 0.26041167974472046, "learning_rate": 9.58e-06, "loss": 0.0321, "step": 480 }, { "epoch": 4.336283185840708, "grad_norm": 0.18472440540790558, "learning_rate": 9.78e-06, "loss": 0.0304, "step": 490 }, { "epoch": 4.424778761061947, "grad_norm": 0.14232812821865082, "learning_rate": 9.980000000000001e-06, "loss": 0.032, "step": 500 }, { "epoch": 4.513274336283186, "grad_norm": 0.18009839951992035, "learning_rate": 1.018e-05, "loss": 0.0294, "step": 510 }, { "epoch": 4.601769911504425, "grad_norm": 0.24435491859912872, "learning_rate": 1.038e-05, "loss": 0.0296, "step": 520 }, { "epoch": 4.6902654867256635, "grad_norm": 0.2446989119052887, "learning_rate": 1.058e-05, "loss": 0.0278, "step": 530 }, { "epoch": 4.778761061946903, "grad_norm": 0.18773755431175232, "learning_rate": 1.0780000000000002e-05, "loss": 0.0284, "step": 540 }, { "epoch": 4.867256637168142, "grad_norm": 0.21854208409786224, "learning_rate": 1.098e-05, "loss": 0.0284, "step": 550 }, { "epoch": 4.95575221238938, "grad_norm": 0.18152262270450592, "learning_rate": 1.118e-05, "loss": 0.0275, "step": 560 }, { "epoch": 5.04424778761062, "grad_norm": 0.20031234622001648, "learning_rate": 1.1380000000000001e-05, "loss": 0.0261, "step": 570 }, { "epoch": 5.132743362831858, "grad_norm": 0.19933964312076569, "learning_rate": 1.1580000000000001e-05, "loss": 0.0266, "step": 580 }, { "epoch": 5.221238938053097, "grad_norm": 0.18228860199451447, "learning_rate": 1.178e-05, "loss": 0.0259, "step": 590 }, { "epoch": 5.3097345132743365, "grad_norm": 0.224607452750206, "learning_rate": 1.198e-05, "loss": 0.0273, "step": 600 }, { "epoch": 5.398230088495575, "grad_norm": 0.2106594741344452, "learning_rate": 1.2180000000000002e-05, "loss": 0.0261, "step": 610 }, { "epoch": 5.486725663716814, "grad_norm": 0.28607967495918274, "learning_rate": 1.238e-05, "loss": 0.0258, "step": 620 }, { "epoch": 5.575221238938053, "grad_norm": 0.20972339808940887, "learning_rate": 1.258e-05, "loss": 0.026, "step": 630 }, { "epoch": 5.663716814159292, "grad_norm": 0.2404285967350006, "learning_rate": 1.278e-05, "loss": 0.024, "step": 640 }, { "epoch": 5.752212389380531, "grad_norm": 0.24622325599193573, "learning_rate": 1.2980000000000001e-05, "loss": 0.0242, "step": 650 }, { "epoch": 5.84070796460177, "grad_norm": 0.19178038835525513, "learning_rate": 1.3180000000000001e-05, "loss": 0.0238, "step": 660 }, { "epoch": 5.929203539823009, "grad_norm": 0.26031139492988586, "learning_rate": 1.338e-05, "loss": 0.0233, "step": 670 }, { "epoch": 6.017699115044247, "grad_norm": 0.2246655374765396, "learning_rate": 1.358e-05, "loss": 0.0236, "step": 680 }, { "epoch": 6.106194690265487, "grad_norm": 0.2755095064640045, "learning_rate": 1.3780000000000002e-05, "loss": 0.0228, "step": 690 }, { "epoch": 6.1946902654867255, "grad_norm": 0.17769227921962738, "learning_rate": 1.3980000000000002e-05, "loss": 0.024, "step": 700 }, { "epoch": 6.283185840707965, "grad_norm": 0.21013042330741882, "learning_rate": 1.4180000000000001e-05, "loss": 0.0236, "step": 710 }, { "epoch": 6.371681415929204, "grad_norm": 0.22938348352909088, "learning_rate": 1.4380000000000001e-05, "loss": 0.0228, "step": 720 }, { "epoch": 6.460176991150442, "grad_norm": 0.18899478018283844, "learning_rate": 1.4580000000000003e-05, "loss": 0.0221, "step": 730 }, { "epoch": 6.548672566371682, "grad_norm": 0.2541691064834595, "learning_rate": 1.4779999999999999e-05, "loss": 0.0215, "step": 740 }, { "epoch": 6.6371681415929205, "grad_norm": 0.24163757264614105, "learning_rate": 1.4979999999999999e-05, "loss": 0.0225, "step": 750 }, { "epoch": 6.725663716814159, "grad_norm": 0.1695130616426468, "learning_rate": 1.518e-05, "loss": 0.022, "step": 760 }, { "epoch": 6.814159292035399, "grad_norm": 0.20755034685134888, "learning_rate": 1.538e-05, "loss": 0.0223, "step": 770 }, { "epoch": 6.902654867256637, "grad_norm": 0.2123480886220932, "learning_rate": 1.558e-05, "loss": 0.021, "step": 780 }, { "epoch": 6.991150442477876, "grad_norm": 0.1984192132949829, "learning_rate": 1.578e-05, "loss": 0.0204, "step": 790 }, { "epoch": 7.079646017699115, "grad_norm": 0.20947502553462982, "learning_rate": 1.598e-05, "loss": 0.0208, "step": 800 }, { "epoch": 7.168141592920354, "grad_norm": 0.18240587413311005, "learning_rate": 1.618e-05, "loss": 0.0208, "step": 810 }, { "epoch": 7.256637168141593, "grad_norm": 0.2724882960319519, "learning_rate": 1.6380000000000002e-05, "loss": 0.0206, "step": 820 }, { "epoch": 7.345132743362832, "grad_norm": 0.1758035570383072, "learning_rate": 1.658e-05, "loss": 0.0207, "step": 830 }, { "epoch": 7.433628318584071, "grad_norm": 0.22411009669303894, "learning_rate": 1.6780000000000002e-05, "loss": 0.0208, "step": 840 }, { "epoch": 7.522123893805309, "grad_norm": 0.23918718099594116, "learning_rate": 1.698e-05, "loss": 0.0217, "step": 850 }, { "epoch": 7.610619469026549, "grad_norm": 0.17702209949493408, "learning_rate": 1.718e-05, "loss": 0.021, "step": 860 }, { "epoch": 7.699115044247788, "grad_norm": 0.17671605944633484, "learning_rate": 1.7380000000000003e-05, "loss": 0.0201, "step": 870 }, { "epoch": 7.787610619469026, "grad_norm": 0.24670277535915375, "learning_rate": 1.758e-05, "loss": 0.0195, "step": 880 }, { "epoch": 7.876106194690266, "grad_norm": 0.21365919709205627, "learning_rate": 1.7780000000000003e-05, "loss": 0.0194, "step": 890 }, { "epoch": 7.964601769911504, "grad_norm": 0.192793071269989, "learning_rate": 1.798e-05, "loss": 0.0197, "step": 900 }, { "epoch": 8.053097345132743, "grad_norm": 0.21986353397369385, "learning_rate": 1.818e-05, "loss": 0.0192, "step": 910 }, { "epoch": 8.141592920353983, "grad_norm": 0.22282905876636505, "learning_rate": 1.838e-05, "loss": 0.0198, "step": 920 }, { "epoch": 8.230088495575222, "grad_norm": 0.21167707443237305, "learning_rate": 1.858e-05, "loss": 0.02, "step": 930 }, { "epoch": 8.31858407079646, "grad_norm": 0.24645684659481049, "learning_rate": 1.878e-05, "loss": 0.0187, "step": 940 }, { "epoch": 8.4070796460177, "grad_norm": 0.2618500888347626, "learning_rate": 1.898e-05, "loss": 0.0184, "step": 950 }, { "epoch": 8.495575221238939, "grad_norm": 0.2853642404079437, "learning_rate": 1.918e-05, "loss": 0.02, "step": 960 }, { "epoch": 8.584070796460177, "grad_norm": 0.22935572266578674, "learning_rate": 1.938e-05, "loss": 0.0187, "step": 970 }, { "epoch": 8.672566371681416, "grad_norm": 0.18018950521945953, "learning_rate": 1.9580000000000002e-05, "loss": 0.0187, "step": 980 }, { "epoch": 8.761061946902656, "grad_norm": 0.2860763669013977, "learning_rate": 1.978e-05, "loss": 0.019, "step": 990 }, { "epoch": 8.849557522123893, "grad_norm": 0.19469276070594788, "learning_rate": 1.9980000000000002e-05, "loss": 0.0191, "step": 1000 }, { "epoch": 8.938053097345133, "grad_norm": 0.2059198021888733, "learning_rate": 2.0180000000000003e-05, "loss": 0.0177, "step": 1010 }, { "epoch": 9.026548672566372, "grad_norm": 0.22872787714004517, "learning_rate": 2.038e-05, "loss": 0.0187, "step": 1020 }, { "epoch": 9.11504424778761, "grad_norm": 0.21850481629371643, "learning_rate": 2.0580000000000003e-05, "loss": 0.0177, "step": 1030 }, { "epoch": 9.20353982300885, "grad_norm": 0.3010428249835968, "learning_rate": 2.078e-05, "loss": 0.0183, "step": 1040 }, { "epoch": 9.29203539823009, "grad_norm": 0.30720841884613037, "learning_rate": 2.098e-05, "loss": 0.0175, "step": 1050 }, { "epoch": 9.380530973451327, "grad_norm": 0.25102201104164124, "learning_rate": 2.118e-05, "loss": 0.0182, "step": 1060 }, { "epoch": 9.469026548672566, "grad_norm": 0.22691412270069122, "learning_rate": 2.138e-05, "loss": 0.018, "step": 1070 }, { "epoch": 9.557522123893806, "grad_norm": 0.25092026591300964, "learning_rate": 2.158e-05, "loss": 0.0187, "step": 1080 }, { "epoch": 9.646017699115044, "grad_norm": 0.21945276856422424, "learning_rate": 2.178e-05, "loss": 0.0178, "step": 1090 }, { "epoch": 9.734513274336283, "grad_norm": 0.2584983706474304, "learning_rate": 2.198e-05, "loss": 0.0178, "step": 1100 }, { "epoch": 9.823008849557523, "grad_norm": 0.23842653632164001, "learning_rate": 2.218e-05, "loss": 0.0176, "step": 1110 }, { "epoch": 9.91150442477876, "grad_norm": 0.17505544424057007, "learning_rate": 2.2380000000000003e-05, "loss": 0.0182, "step": 1120 }, { "epoch": 10.0, "grad_norm": 0.19007155299186707, "learning_rate": 2.258e-05, "loss": 0.0187, "step": 1130 }, { "epoch": 10.08849557522124, "grad_norm": 0.24683013558387756, "learning_rate": 2.2780000000000002e-05, "loss": 0.0169, "step": 1140 }, { "epoch": 10.176991150442477, "grad_norm": 0.2595357298851013, "learning_rate": 2.298e-05, "loss": 0.0171, "step": 1150 }, { "epoch": 10.265486725663717, "grad_norm": 0.2851751744747162, "learning_rate": 2.318e-05, "loss": 0.0176, "step": 1160 }, { "epoch": 10.353982300884956, "grad_norm": 0.2224997580051422, "learning_rate": 2.3380000000000003e-05, "loss": 0.0175, "step": 1170 }, { "epoch": 10.442477876106194, "grad_norm": 0.2187778502702713, "learning_rate": 2.358e-05, "loss": 0.0164, "step": 1180 }, { "epoch": 10.530973451327434, "grad_norm": 0.21035735309123993, "learning_rate": 2.3780000000000003e-05, "loss": 0.0165, "step": 1190 }, { "epoch": 10.619469026548673, "grad_norm": 0.23563535511493683, "learning_rate": 2.398e-05, "loss": 0.0165, "step": 1200 }, { "epoch": 10.70796460176991, "grad_norm": 0.20609630644321442, "learning_rate": 2.418e-05, "loss": 0.0161, "step": 1210 }, { "epoch": 10.79646017699115, "grad_norm": 0.17000283300876617, "learning_rate": 2.438e-05, "loss": 0.0168, "step": 1220 }, { "epoch": 10.88495575221239, "grad_norm": 0.22076988220214844, "learning_rate": 2.4580000000000002e-05, "loss": 0.0163, "step": 1230 }, { "epoch": 10.973451327433628, "grad_norm": 0.18816152215003967, "learning_rate": 2.478e-05, "loss": 0.0168, "step": 1240 }, { "epoch": 11.061946902654867, "grad_norm": 0.19269531965255737, "learning_rate": 2.498e-05, "loss": 0.0167, "step": 1250 }, { "epoch": 11.150442477876107, "grad_norm": 0.195516899228096, "learning_rate": 2.5180000000000003e-05, "loss": 0.0155, "step": 1260 }, { "epoch": 11.238938053097344, "grad_norm": 0.19087128341197968, "learning_rate": 2.5380000000000004e-05, "loss": 0.0165, "step": 1270 }, { "epoch": 11.327433628318584, "grad_norm": 0.21465033292770386, "learning_rate": 2.5580000000000002e-05, "loss": 0.0157, "step": 1280 }, { "epoch": 11.415929203539823, "grad_norm": 0.21152853965759277, "learning_rate": 2.5779999999999997e-05, "loss": 0.016, "step": 1290 }, { "epoch": 11.504424778761061, "grad_norm": 0.26900339126586914, "learning_rate": 2.598e-05, "loss": 0.0153, "step": 1300 }, { "epoch": 11.5929203539823, "grad_norm": 0.23565775156021118, "learning_rate": 2.618e-05, "loss": 0.0154, "step": 1310 }, { "epoch": 11.68141592920354, "grad_norm": 0.23392872512340546, "learning_rate": 2.6379999999999998e-05, "loss": 0.0162, "step": 1320 }, { "epoch": 11.769911504424778, "grad_norm": 0.23122538626194, "learning_rate": 2.658e-05, "loss": 0.0156, "step": 1330 }, { "epoch": 11.858407079646017, "grad_norm": 0.2104092538356781, "learning_rate": 2.678e-05, "loss": 0.0161, "step": 1340 }, { "epoch": 11.946902654867257, "grad_norm": 0.16211697459220886, "learning_rate": 2.698e-05, "loss": 0.0151, "step": 1350 }, { "epoch": 12.035398230088495, "grad_norm": 0.22679129242897034, "learning_rate": 2.718e-05, "loss": 0.0157, "step": 1360 }, { "epoch": 12.123893805309734, "grad_norm": 0.2528381645679474, "learning_rate": 2.738e-05, "loss": 0.0152, "step": 1370 }, { "epoch": 12.212389380530974, "grad_norm": 0.257403701543808, "learning_rate": 2.758e-05, "loss": 0.0154, "step": 1380 }, { "epoch": 12.300884955752213, "grad_norm": 0.22304107248783112, "learning_rate": 2.778e-05, "loss": 0.0151, "step": 1390 }, { "epoch": 12.389380530973451, "grad_norm": 0.23130843043327332, "learning_rate": 2.798e-05, "loss": 0.0149, "step": 1400 }, { "epoch": 12.47787610619469, "grad_norm": 0.22789767384529114, "learning_rate": 2.818e-05, "loss": 0.0153, "step": 1410 }, { "epoch": 12.56637168141593, "grad_norm": 0.24470973014831543, "learning_rate": 2.8380000000000003e-05, "loss": 0.0143, "step": 1420 }, { "epoch": 12.654867256637168, "grad_norm": 0.14789143204689026, "learning_rate": 2.858e-05, "loss": 0.0151, "step": 1430 }, { "epoch": 12.743362831858407, "grad_norm": 0.19800685346126556, "learning_rate": 2.8780000000000002e-05, "loss": 0.0144, "step": 1440 }, { "epoch": 12.831858407079647, "grad_norm": 0.19041651487350464, "learning_rate": 2.898e-05, "loss": 0.0161, "step": 1450 }, { "epoch": 12.920353982300885, "grad_norm": 0.20260532200336456, "learning_rate": 2.9180000000000002e-05, "loss": 0.016, "step": 1460 }, { "epoch": 13.008849557522124, "grad_norm": 0.25979915261268616, "learning_rate": 2.9380000000000003e-05, "loss": 0.0152, "step": 1470 }, { "epoch": 13.097345132743364, "grad_norm": 0.2235214114189148, "learning_rate": 2.958e-05, "loss": 0.0152, "step": 1480 }, { "epoch": 13.185840707964601, "grad_norm": 0.2452046126127243, "learning_rate": 2.9780000000000003e-05, "loss": 0.0155, "step": 1490 }, { "epoch": 13.274336283185841, "grad_norm": 0.18082256615161896, "learning_rate": 2.998e-05, "loss": 0.0156, "step": 1500 }, { "epoch": 13.36283185840708, "grad_norm": 0.2241387665271759, "learning_rate": 3.0180000000000002e-05, "loss": 0.0152, "step": 1510 }, { "epoch": 13.451327433628318, "grad_norm": 0.19304688274860382, "learning_rate": 3.0380000000000004e-05, "loss": 0.0145, "step": 1520 }, { "epoch": 13.539823008849558, "grad_norm": 0.2725679874420166, "learning_rate": 3.058e-05, "loss": 0.0149, "step": 1530 }, { "epoch": 13.628318584070797, "grad_norm": 0.22139526903629303, "learning_rate": 3.078e-05, "loss": 0.0141, "step": 1540 }, { "epoch": 13.716814159292035, "grad_norm": 0.2034563273191452, "learning_rate": 3.0980000000000005e-05, "loss": 0.0138, "step": 1550 }, { "epoch": 13.805309734513274, "grad_norm": 0.20761550962924957, "learning_rate": 3.118e-05, "loss": 0.014, "step": 1560 }, { "epoch": 13.893805309734514, "grad_norm": 0.21596471965312958, "learning_rate": 3.138e-05, "loss": 0.0141, "step": 1570 }, { "epoch": 13.982300884955752, "grad_norm": 0.19941861927509308, "learning_rate": 3.1580000000000006e-05, "loss": 0.0141, "step": 1580 }, { "epoch": 14.070796460176991, "grad_norm": 0.17475123703479767, "learning_rate": 3.1780000000000004e-05, "loss": 0.0142, "step": 1590 }, { "epoch": 14.15929203539823, "grad_norm": 0.2039012759923935, "learning_rate": 3.198e-05, "loss": 0.0147, "step": 1600 }, { "epoch": 14.247787610619469, "grad_norm": 0.23782622814178467, "learning_rate": 3.218e-05, "loss": 0.0143, "step": 1610 }, { "epoch": 14.336283185840708, "grad_norm": 0.2800450325012207, "learning_rate": 3.238e-05, "loss": 0.0141, "step": 1620 }, { "epoch": 14.424778761061948, "grad_norm": 0.21049116551876068, "learning_rate": 3.2579999999999996e-05, "loss": 0.0135, "step": 1630 }, { "epoch": 14.513274336283185, "grad_norm": 0.22005024552345276, "learning_rate": 3.278e-05, "loss": 0.0143, "step": 1640 }, { "epoch": 14.601769911504425, "grad_norm": 0.22420290112495422, "learning_rate": 3.298e-05, "loss": 0.0139, "step": 1650 }, { "epoch": 14.690265486725664, "grad_norm": 0.19930607080459595, "learning_rate": 3.318e-05, "loss": 0.0137, "step": 1660 }, { "epoch": 14.778761061946902, "grad_norm": 0.19809278845787048, "learning_rate": 3.338e-05, "loss": 0.0141, "step": 1670 }, { "epoch": 14.867256637168142, "grad_norm": 0.16252639889717102, "learning_rate": 3.358e-05, "loss": 0.0142, "step": 1680 }, { "epoch": 14.955752212389381, "grad_norm": 0.18918722867965698, "learning_rate": 3.378e-05, "loss": 0.0143, "step": 1690 }, { "epoch": 15.044247787610619, "grad_norm": 0.19039537012577057, "learning_rate": 3.398e-05, "loss": 0.0138, "step": 1700 }, { "epoch": 15.132743362831858, "grad_norm": 0.14957837760448456, "learning_rate": 3.418e-05, "loss": 0.0137, "step": 1710 }, { "epoch": 15.221238938053098, "grad_norm": 0.16923746466636658, "learning_rate": 3.438e-05, "loss": 0.0143, "step": 1720 }, { "epoch": 15.309734513274336, "grad_norm": 0.16197200119495392, "learning_rate": 3.4580000000000004e-05, "loss": 0.0132, "step": 1730 }, { "epoch": 15.398230088495575, "grad_norm": 0.1780109852552414, "learning_rate": 3.478e-05, "loss": 0.0134, "step": 1740 }, { "epoch": 15.486725663716815, "grad_norm": 0.20671486854553223, "learning_rate": 3.498e-05, "loss": 0.0134, "step": 1750 }, { "epoch": 15.575221238938052, "grad_norm": 0.2506687641143799, "learning_rate": 3.518e-05, "loss": 0.0127, "step": 1760 }, { "epoch": 15.663716814159292, "grad_norm": 0.16043032705783844, "learning_rate": 3.5380000000000003e-05, "loss": 0.0128, "step": 1770 }, { "epoch": 15.752212389380531, "grad_norm": 0.17977970838546753, "learning_rate": 3.558e-05, "loss": 0.013, "step": 1780 }, { "epoch": 15.84070796460177, "grad_norm": 0.21391505002975464, "learning_rate": 3.578e-05, "loss": 0.0132, "step": 1790 }, { "epoch": 15.929203539823009, "grad_norm": 0.1991083323955536, "learning_rate": 3.5980000000000004e-05, "loss": 0.0133, "step": 1800 }, { "epoch": 16.01769911504425, "grad_norm": 0.25892403721809387, "learning_rate": 3.618e-05, "loss": 0.0126, "step": 1810 }, { "epoch": 16.106194690265486, "grad_norm": 0.20870722830295563, "learning_rate": 3.638e-05, "loss": 0.0126, "step": 1820 }, { "epoch": 16.194690265486727, "grad_norm": 0.18789322674274445, "learning_rate": 3.6580000000000006e-05, "loss": 0.0126, "step": 1830 }, { "epoch": 16.283185840707965, "grad_norm": 0.2000027447938919, "learning_rate": 3.6780000000000004e-05, "loss": 0.013, "step": 1840 }, { "epoch": 16.371681415929203, "grad_norm": 0.15799404680728912, "learning_rate": 3.698e-05, "loss": 0.0124, "step": 1850 }, { "epoch": 16.460176991150444, "grad_norm": 0.22756226360797882, "learning_rate": 3.7180000000000007e-05, "loss": 0.0127, "step": 1860 }, { "epoch": 16.548672566371682, "grad_norm": 0.21301455795764923, "learning_rate": 3.7380000000000005e-05, "loss": 0.0134, "step": 1870 }, { "epoch": 16.63716814159292, "grad_norm": 0.1839960813522339, "learning_rate": 3.758e-05, "loss": 0.0128, "step": 1880 }, { "epoch": 16.72566371681416, "grad_norm": 0.16055116057395935, "learning_rate": 3.778000000000001e-05, "loss": 0.0136, "step": 1890 }, { "epoch": 16.8141592920354, "grad_norm": 0.30682259798049927, "learning_rate": 3.7980000000000006e-05, "loss": 0.0127, "step": 1900 }, { "epoch": 16.902654867256636, "grad_norm": 0.22010301053524017, "learning_rate": 3.818e-05, "loss": 0.0134, "step": 1910 }, { "epoch": 16.991150442477878, "grad_norm": 0.18450503051280975, "learning_rate": 3.838e-05, "loss": 0.0125, "step": 1920 }, { "epoch": 17.079646017699115, "grad_norm": 0.14620986580848694, "learning_rate": 3.858e-05, "loss": 0.0133, "step": 1930 }, { "epoch": 17.168141592920353, "grad_norm": 0.17002588510513306, "learning_rate": 3.878e-05, "loss": 0.0124, "step": 1940 }, { "epoch": 17.256637168141594, "grad_norm": 0.17407143115997314, "learning_rate": 3.898e-05, "loss": 0.0131, "step": 1950 }, { "epoch": 17.345132743362832, "grad_norm": 0.25174498558044434, "learning_rate": 3.918e-05, "loss": 0.0119, "step": 1960 }, { "epoch": 17.43362831858407, "grad_norm": 0.2005157768726349, "learning_rate": 3.938e-05, "loss": 0.012, "step": 1970 }, { "epoch": 17.52212389380531, "grad_norm": 0.19487272202968597, "learning_rate": 3.958e-05, "loss": 0.013, "step": 1980 }, { "epoch": 17.61061946902655, "grad_norm": 0.2406047135591507, "learning_rate": 3.978e-05, "loss": 0.0134, "step": 1990 }, { "epoch": 17.699115044247787, "grad_norm": 0.17815567553043365, "learning_rate": 3.998e-05, "loss": 0.0126, "step": 2000 }, { "epoch": 17.787610619469028, "grad_norm": 0.20930008590221405, "learning_rate": 4.018e-05, "loss": 0.0123, "step": 2010 }, { "epoch": 17.876106194690266, "grad_norm": 0.18442749977111816, "learning_rate": 4.038e-05, "loss": 0.0122, "step": 2020 }, { "epoch": 17.964601769911503, "grad_norm": 0.16639980673789978, "learning_rate": 4.058e-05, "loss": 0.0129, "step": 2030 }, { "epoch": 18.053097345132745, "grad_norm": 0.22504822909832, "learning_rate": 4.078e-05, "loss": 0.0131, "step": 2040 }, { "epoch": 18.141592920353983, "grad_norm": 0.24628128111362457, "learning_rate": 4.0980000000000004e-05, "loss": 0.0118, "step": 2050 }, { "epoch": 18.23008849557522, "grad_norm": 0.14474467933177948, "learning_rate": 4.118e-05, "loss": 0.0119, "step": 2060 }, { "epoch": 18.31858407079646, "grad_norm": 0.19983842968940735, "learning_rate": 4.138e-05, "loss": 0.0119, "step": 2070 }, { "epoch": 18.4070796460177, "grad_norm": 0.26629599928855896, "learning_rate": 4.1580000000000005e-05, "loss": 0.0126, "step": 2080 }, { "epoch": 18.495575221238937, "grad_norm": 0.18293090164661407, "learning_rate": 4.178e-05, "loss": 0.0132, "step": 2090 }, { "epoch": 18.58407079646018, "grad_norm": 0.22794148325920105, "learning_rate": 4.198e-05, "loss": 0.0117, "step": 2100 }, { "epoch": 18.672566371681416, "grad_norm": 0.1965721845626831, "learning_rate": 4.2180000000000006e-05, "loss": 0.0126, "step": 2110 }, { "epoch": 18.761061946902654, "grad_norm": 0.21172276139259338, "learning_rate": 4.2380000000000004e-05, "loss": 0.0118, "step": 2120 }, { "epoch": 18.849557522123895, "grad_norm": 0.21038956940174103, "learning_rate": 4.258e-05, "loss": 0.0125, "step": 2130 }, { "epoch": 18.938053097345133, "grad_norm": 0.23481352627277374, "learning_rate": 4.278e-05, "loss": 0.012, "step": 2140 }, { "epoch": 19.02654867256637, "grad_norm": 0.18684646487236023, "learning_rate": 4.2980000000000005e-05, "loss": 0.0121, "step": 2150 }, { "epoch": 19.115044247787612, "grad_norm": 0.23360024392604828, "learning_rate": 4.318e-05, "loss": 0.0119, "step": 2160 }, { "epoch": 19.20353982300885, "grad_norm": 0.2147015929222107, "learning_rate": 4.338e-05, "loss": 0.0118, "step": 2170 }, { "epoch": 19.292035398230087, "grad_norm": 0.20503421127796173, "learning_rate": 4.3580000000000006e-05, "loss": 0.0118, "step": 2180 }, { "epoch": 19.38053097345133, "grad_norm": 0.19846957921981812, "learning_rate": 4.3780000000000004e-05, "loss": 0.0116, "step": 2190 }, { "epoch": 19.469026548672566, "grad_norm": 0.22245913743972778, "learning_rate": 4.398e-05, "loss": 0.0127, "step": 2200 }, { "epoch": 19.557522123893804, "grad_norm": 0.20002296566963196, "learning_rate": 4.418000000000001e-05, "loss": 0.0124, "step": 2210 }, { "epoch": 19.646017699115045, "grad_norm": 0.22656002640724182, "learning_rate": 4.438e-05, "loss": 0.0111, "step": 2220 }, { "epoch": 19.734513274336283, "grad_norm": 0.1582970917224884, "learning_rate": 4.458e-05, "loss": 0.0116, "step": 2230 }, { "epoch": 19.82300884955752, "grad_norm": 0.18759259581565857, "learning_rate": 4.478e-05, "loss": 0.0114, "step": 2240 }, { "epoch": 19.911504424778762, "grad_norm": 0.23023168742656708, "learning_rate": 4.498e-05, "loss": 0.0114, "step": 2250 }, { "epoch": 20.0, "grad_norm": 0.23070260882377625, "learning_rate": 4.518e-05, "loss": 0.0113, "step": 2260 }, { "epoch": 20.088495575221238, "grad_norm": 0.21114951372146606, "learning_rate": 4.538e-05, "loss": 0.0116, "step": 2270 }, { "epoch": 20.17699115044248, "grad_norm": 0.21285273134708405, "learning_rate": 4.558e-05, "loss": 0.0116, "step": 2280 }, { "epoch": 20.265486725663717, "grad_norm": 0.20174440741539001, "learning_rate": 4.578e-05, "loss": 0.0112, "step": 2290 }, { "epoch": 20.353982300884955, "grad_norm": 0.22542431950569153, "learning_rate": 4.5980000000000004e-05, "loss": 0.0123, "step": 2300 }, { "epoch": 20.442477876106196, "grad_norm": 0.1854224056005478, "learning_rate": 4.618e-05, "loss": 0.0121, "step": 2310 }, { "epoch": 20.530973451327434, "grad_norm": 0.2263476550579071, "learning_rate": 4.638e-05, "loss": 0.0116, "step": 2320 }, { "epoch": 20.61946902654867, "grad_norm": 0.18328975141048431, "learning_rate": 4.6580000000000005e-05, "loss": 0.0114, "step": 2330 }, { "epoch": 20.707964601769913, "grad_norm": 0.18268951773643494, "learning_rate": 4.678e-05, "loss": 0.0108, "step": 2340 }, { "epoch": 20.79646017699115, "grad_norm": 0.16564399003982544, "learning_rate": 4.698e-05, "loss": 0.0113, "step": 2350 }, { "epoch": 20.884955752212388, "grad_norm": 0.19969241321086884, "learning_rate": 4.718e-05, "loss": 0.0117, "step": 2360 }, { "epoch": 20.97345132743363, "grad_norm": 0.15618184208869934, "learning_rate": 4.7380000000000004e-05, "loss": 0.0108, "step": 2370 }, { "epoch": 21.061946902654867, "grad_norm": 0.21858179569244385, "learning_rate": 4.758e-05, "loss": 0.0113, "step": 2380 }, { "epoch": 21.150442477876105, "grad_norm": 0.17541509866714478, "learning_rate": 4.778e-05, "loss": 0.0125, "step": 2390 }, { "epoch": 21.238938053097346, "grad_norm": 0.16505980491638184, "learning_rate": 4.7980000000000005e-05, "loss": 0.0119, "step": 2400 }, { "epoch": 21.327433628318584, "grad_norm": 0.23706042766571045, "learning_rate": 4.818e-05, "loss": 0.0121, "step": 2410 }, { "epoch": 21.41592920353982, "grad_norm": 0.16708803176879883, "learning_rate": 4.838e-05, "loss": 0.0114, "step": 2420 }, { "epoch": 21.504424778761063, "grad_norm": 0.15749461948871613, "learning_rate": 4.8580000000000006e-05, "loss": 0.0119, "step": 2430 }, { "epoch": 21.5929203539823, "grad_norm": 0.1923663467168808, "learning_rate": 4.8780000000000004e-05, "loss": 0.0106, "step": 2440 }, { "epoch": 21.68141592920354, "grad_norm": 0.1585606038570404, "learning_rate": 4.898e-05, "loss": 0.0112, "step": 2450 }, { "epoch": 21.76991150442478, "grad_norm": 0.19709140062332153, "learning_rate": 4.918000000000001e-05, "loss": 0.0111, "step": 2460 }, { "epoch": 21.858407079646017, "grad_norm": 0.22520484030246735, "learning_rate": 4.9380000000000005e-05, "loss": 0.0105, "step": 2470 }, { "epoch": 21.946902654867255, "grad_norm": 0.22094905376434326, "learning_rate": 4.958e-05, "loss": 0.0115, "step": 2480 }, { "epoch": 22.035398230088497, "grad_norm": 0.184371218085289, "learning_rate": 4.978e-05, "loss": 0.011, "step": 2490 }, { "epoch": 22.123893805309734, "grad_norm": 0.2175336331129074, "learning_rate": 4.9980000000000006e-05, "loss": 0.0115, "step": 2500 }, { "epoch": 22.212389380530972, "grad_norm": 0.1742861568927765, "learning_rate": 5.0180000000000004e-05, "loss": 0.0106, "step": 2510 }, { "epoch": 22.300884955752213, "grad_norm": 0.1842757761478424, "learning_rate": 5.038e-05, "loss": 0.0106, "step": 2520 }, { "epoch": 22.38938053097345, "grad_norm": 0.1987450271844864, "learning_rate": 5.058000000000001e-05, "loss": 0.0114, "step": 2530 }, { "epoch": 22.47787610619469, "grad_norm": 0.20004890859127045, "learning_rate": 5.0780000000000005e-05, "loss": 0.0112, "step": 2540 }, { "epoch": 22.56637168141593, "grad_norm": 0.19499580562114716, "learning_rate": 5.098e-05, "loss": 0.011, "step": 2550 }, { "epoch": 22.654867256637168, "grad_norm": 0.16874614357948303, "learning_rate": 5.118000000000001e-05, "loss": 0.0104, "step": 2560 }, { "epoch": 22.743362831858406, "grad_norm": 0.19128884375095367, "learning_rate": 5.1380000000000006e-05, "loss": 0.0104, "step": 2570 }, { "epoch": 22.831858407079647, "grad_norm": 0.18968653678894043, "learning_rate": 5.1580000000000004e-05, "loss": 0.0105, "step": 2580 }, { "epoch": 22.920353982300885, "grad_norm": 0.21244198083877563, "learning_rate": 5.178000000000001e-05, "loss": 0.0106, "step": 2590 }, { "epoch": 23.008849557522122, "grad_norm": 0.1860707700252533, "learning_rate": 5.198000000000001e-05, "loss": 0.0106, "step": 2600 }, { "epoch": 23.097345132743364, "grad_norm": 0.17499375343322754, "learning_rate": 5.2180000000000005e-05, "loss": 0.0102, "step": 2610 }, { "epoch": 23.1858407079646, "grad_norm": 0.18945537507534027, "learning_rate": 5.238000000000001e-05, "loss": 0.0109, "step": 2620 }, { "epoch": 23.27433628318584, "grad_norm": 0.19200605154037476, "learning_rate": 5.258000000000001e-05, "loss": 0.0107, "step": 2630 }, { "epoch": 23.36283185840708, "grad_norm": 0.2066267877817154, "learning_rate": 5.2780000000000006e-05, "loss": 0.0102, "step": 2640 }, { "epoch": 23.451327433628318, "grad_norm": 0.21788930892944336, "learning_rate": 5.2980000000000004e-05, "loss": 0.0105, "step": 2650 }, { "epoch": 23.539823008849556, "grad_norm": 0.21332311630249023, "learning_rate": 5.318000000000001e-05, "loss": 0.0104, "step": 2660 }, { "epoch": 23.628318584070797, "grad_norm": 0.21296343207359314, "learning_rate": 5.338000000000001e-05, "loss": 0.0104, "step": 2670 }, { "epoch": 23.716814159292035, "grad_norm": 0.18731948733329773, "learning_rate": 5.3580000000000005e-05, "loss": 0.0107, "step": 2680 }, { "epoch": 23.805309734513273, "grad_norm": 0.20445139706134796, "learning_rate": 5.378e-05, "loss": 0.0105, "step": 2690 }, { "epoch": 23.893805309734514, "grad_norm": 0.2132468819618225, "learning_rate": 5.3979999999999995e-05, "loss": 0.0106, "step": 2700 }, { "epoch": 23.98230088495575, "grad_norm": 0.1869552731513977, "learning_rate": 5.418e-05, "loss": 0.01, "step": 2710 }, { "epoch": 24.07079646017699, "grad_norm": 0.18613892793655396, "learning_rate": 5.438e-05, "loss": 0.0102, "step": 2720 }, { "epoch": 24.15929203539823, "grad_norm": 0.18388274312019348, "learning_rate": 5.4579999999999996e-05, "loss": 0.0102, "step": 2730 }, { "epoch": 24.24778761061947, "grad_norm": 0.20290076732635498, "learning_rate": 5.478e-05, "loss": 0.0102, "step": 2740 }, { "epoch": 24.336283185840706, "grad_norm": 0.17434221506118774, "learning_rate": 5.498e-05, "loss": 0.0103, "step": 2750 }, { "epoch": 24.424778761061948, "grad_norm": 0.16907170414924622, "learning_rate": 5.518e-05, "loss": 0.0097, "step": 2760 }, { "epoch": 24.513274336283185, "grad_norm": 0.12880559265613556, "learning_rate": 5.538e-05, "loss": 0.0101, "step": 2770 }, { "epoch": 24.601769911504427, "grad_norm": 0.19324956834316254, "learning_rate": 5.558e-05, "loss": 0.0106, "step": 2780 }, { "epoch": 24.690265486725664, "grad_norm": 0.18656569719314575, "learning_rate": 5.578e-05, "loss": 0.0097, "step": 2790 }, { "epoch": 24.778761061946902, "grad_norm": 0.15899035334587097, "learning_rate": 5.5979999999999996e-05, "loss": 0.01, "step": 2800 }, { "epoch": 24.86725663716814, "grad_norm": 0.15733803808689117, "learning_rate": 5.618e-05, "loss": 0.0112, "step": 2810 }, { "epoch": 24.95575221238938, "grad_norm": 0.17346592247486115, "learning_rate": 5.638e-05, "loss": 0.0099, "step": 2820 }, { "epoch": 25.04424778761062, "grad_norm": 0.1815352588891983, "learning_rate": 5.658e-05, "loss": 0.0104, "step": 2830 }, { "epoch": 25.13274336283186, "grad_norm": 0.198966845870018, "learning_rate": 5.678e-05, "loss": 0.0109, "step": 2840 }, { "epoch": 25.221238938053098, "grad_norm": 0.17682309448719025, "learning_rate": 5.698e-05, "loss": 0.0106, "step": 2850 }, { "epoch": 25.309734513274336, "grad_norm": 0.22696511447429657, "learning_rate": 5.718e-05, "loss": 0.0096, "step": 2860 }, { "epoch": 25.398230088495577, "grad_norm": 0.16920962929725647, "learning_rate": 5.738e-05, "loss": 0.011, "step": 2870 }, { "epoch": 25.486725663716815, "grad_norm": 0.2623240053653717, "learning_rate": 5.758e-05, "loss": 0.0098, "step": 2880 }, { "epoch": 25.575221238938052, "grad_norm": 0.19749803841114044, "learning_rate": 5.778e-05, "loss": 0.0103, "step": 2890 }, { "epoch": 25.663716814159294, "grad_norm": 0.17247773706912994, "learning_rate": 5.7980000000000004e-05, "loss": 0.0107, "step": 2900 }, { "epoch": 25.75221238938053, "grad_norm": 0.20530232787132263, "learning_rate": 5.818e-05, "loss": 0.0095, "step": 2910 }, { "epoch": 25.84070796460177, "grad_norm": 0.19921380281448364, "learning_rate": 5.838e-05, "loss": 0.0101, "step": 2920 }, { "epoch": 25.92920353982301, "grad_norm": 0.16885505616664886, "learning_rate": 5.858e-05, "loss": 0.0099, "step": 2930 }, { "epoch": 26.01769911504425, "grad_norm": 0.20461390912532806, "learning_rate": 5.878e-05, "loss": 0.0103, "step": 2940 }, { "epoch": 26.106194690265486, "grad_norm": 0.16449180245399475, "learning_rate": 5.898e-05, "loss": 0.0107, "step": 2950 }, { "epoch": 26.194690265486727, "grad_norm": 0.20002399384975433, "learning_rate": 5.918e-05, "loss": 0.0099, "step": 2960 }, { "epoch": 26.283185840707965, "grad_norm": 0.189950630068779, "learning_rate": 5.9380000000000004e-05, "loss": 0.01, "step": 2970 }, { "epoch": 26.371681415929203, "grad_norm": 0.1926291584968567, "learning_rate": 5.958e-05, "loss": 0.0106, "step": 2980 }, { "epoch": 26.460176991150444, "grad_norm": 0.23954054713249207, "learning_rate": 5.978e-05, "loss": 0.0102, "step": 2990 }, { "epoch": 26.548672566371682, "grad_norm": 0.21134018898010254, "learning_rate": 5.9980000000000005e-05, "loss": 0.0101, "step": 3000 }, { "epoch": 26.63716814159292, "grad_norm": 0.18221573531627655, "learning_rate": 6.018e-05, "loss": 0.0095, "step": 3010 }, { "epoch": 26.72566371681416, "grad_norm": 0.15541145205497742, "learning_rate": 6.038e-05, "loss": 0.0098, "step": 3020 }, { "epoch": 26.8141592920354, "grad_norm": 0.20895129442214966, "learning_rate": 6.0580000000000006e-05, "loss": 0.0099, "step": 3030 }, { "epoch": 26.902654867256636, "grad_norm": 0.24560369551181793, "learning_rate": 6.0780000000000004e-05, "loss": 0.0097, "step": 3040 }, { "epoch": 26.991150442477878, "grad_norm": 0.209951251745224, "learning_rate": 6.098e-05, "loss": 0.0102, "step": 3050 }, { "epoch": 27.079646017699115, "grad_norm": 0.23731729388237, "learning_rate": 6.118000000000001e-05, "loss": 0.0107, "step": 3060 }, { "epoch": 27.168141592920353, "grad_norm": 0.21880139410495758, "learning_rate": 6.138e-05, "loss": 0.0095, "step": 3070 }, { "epoch": 27.256637168141594, "grad_norm": 0.20155078172683716, "learning_rate": 6.158e-05, "loss": 0.0096, "step": 3080 }, { "epoch": 27.345132743362832, "grad_norm": 0.18142688274383545, "learning_rate": 6.178000000000001e-05, "loss": 0.0094, "step": 3090 }, { "epoch": 27.43362831858407, "grad_norm": 0.2185787856578827, "learning_rate": 6.198e-05, "loss": 0.0097, "step": 3100 }, { "epoch": 27.52212389380531, "grad_norm": 0.19645309448242188, "learning_rate": 6.218e-05, "loss": 0.0102, "step": 3110 }, { "epoch": 27.61061946902655, "grad_norm": 0.15861794352531433, "learning_rate": 6.238000000000001e-05, "loss": 0.01, "step": 3120 }, { "epoch": 27.699115044247787, "grad_norm": 0.19991379976272583, "learning_rate": 6.258e-05, "loss": 0.0102, "step": 3130 }, { "epoch": 27.787610619469028, "grad_norm": 0.1209021508693695, "learning_rate": 6.278e-05, "loss": 0.01, "step": 3140 }, { "epoch": 27.876106194690266, "grad_norm": 0.16849768161773682, "learning_rate": 6.298000000000001e-05, "loss": 0.0098, "step": 3150 }, { "epoch": 27.964601769911503, "grad_norm": 0.17233285307884216, "learning_rate": 6.318e-05, "loss": 0.0099, "step": 3160 }, { "epoch": 28.053097345132745, "grad_norm": 0.1882430613040924, "learning_rate": 6.338e-05, "loss": 0.0091, "step": 3170 }, { "epoch": 28.141592920353983, "grad_norm": 0.150844544172287, "learning_rate": 6.358000000000001e-05, "loss": 0.009, "step": 3180 }, { "epoch": 28.23008849557522, "grad_norm": 0.16444620490074158, "learning_rate": 6.378e-05, "loss": 0.0096, "step": 3190 }, { "epoch": 28.31858407079646, "grad_norm": 0.15817801654338837, "learning_rate": 6.398000000000001e-05, "loss": 0.0098, "step": 3200 }, { "epoch": 28.4070796460177, "grad_norm": 0.15221363306045532, "learning_rate": 6.418000000000001e-05, "loss": 0.0093, "step": 3210 }, { "epoch": 28.495575221238937, "grad_norm": 0.22582747042179108, "learning_rate": 6.438e-05, "loss": 0.0095, "step": 3220 }, { "epoch": 28.58407079646018, "grad_norm": 0.2243175506591797, "learning_rate": 6.458000000000001e-05, "loss": 0.0099, "step": 3230 }, { "epoch": 28.672566371681416, "grad_norm": 0.22440826892852783, "learning_rate": 6.478000000000001e-05, "loss": 0.0096, "step": 3240 }, { "epoch": 28.761061946902654, "grad_norm": 0.18992920219898224, "learning_rate": 6.498e-05, "loss": 0.0094, "step": 3250 }, { "epoch": 28.849557522123895, "grad_norm": 0.22126157581806183, "learning_rate": 6.518000000000001e-05, "loss": 0.0098, "step": 3260 }, { "epoch": 28.938053097345133, "grad_norm": 0.17253299057483673, "learning_rate": 6.538000000000001e-05, "loss": 0.0093, "step": 3270 }, { "epoch": 29.02654867256637, "grad_norm": 0.15734551846981049, "learning_rate": 6.558e-05, "loss": 0.0088, "step": 3280 }, { "epoch": 29.115044247787612, "grad_norm": 0.18129082024097443, "learning_rate": 6.578000000000001e-05, "loss": 0.0095, "step": 3290 }, { "epoch": 29.20353982300885, "grad_norm": 0.20513580739498138, "learning_rate": 6.598e-05, "loss": 0.0098, "step": 3300 }, { "epoch": 29.292035398230087, "grad_norm": 0.15419155359268188, "learning_rate": 6.618e-05, "loss": 0.0098, "step": 3310 }, { "epoch": 29.38053097345133, "grad_norm": 0.14091163873672485, "learning_rate": 6.638e-05, "loss": 0.01, "step": 3320 }, { "epoch": 29.469026548672566, "grad_norm": 0.19755586981773376, "learning_rate": 6.658e-05, "loss": 0.0101, "step": 3330 }, { "epoch": 29.557522123893804, "grad_norm": 0.16471098363399506, "learning_rate": 6.678e-05, "loss": 0.0094, "step": 3340 }, { "epoch": 29.646017699115045, "grad_norm": 0.15273906290531158, "learning_rate": 6.698e-05, "loss": 0.0098, "step": 3350 }, { "epoch": 29.734513274336283, "grad_norm": 0.18923990428447723, "learning_rate": 6.718e-05, "loss": 0.0096, "step": 3360 }, { "epoch": 29.82300884955752, "grad_norm": 0.16429926455020905, "learning_rate": 6.738e-05, "loss": 0.0089, "step": 3370 }, { "epoch": 29.911504424778762, "grad_norm": 0.18430180847644806, "learning_rate": 6.758e-05, "loss": 0.0088, "step": 3380 }, { "epoch": 30.0, "grad_norm": 0.17353755235671997, "learning_rate": 6.778e-05, "loss": 0.0097, "step": 3390 }, { "epoch": 30.088495575221238, "grad_norm": 0.22288672626018524, "learning_rate": 6.798e-05, "loss": 0.0083, "step": 3400 }, { "epoch": 30.17699115044248, "grad_norm": 0.1450582891702652, "learning_rate": 6.818e-05, "loss": 0.0091, "step": 3410 }, { "epoch": 30.265486725663717, "grad_norm": 0.20816563069820404, "learning_rate": 6.838e-05, "loss": 0.0089, "step": 3420 }, { "epoch": 30.353982300884955, "grad_norm": 0.16875436902046204, "learning_rate": 6.858e-05, "loss": 0.0084, "step": 3430 }, { "epoch": 30.442477876106196, "grad_norm": 0.14342516660690308, "learning_rate": 6.878e-05, "loss": 0.0097, "step": 3440 }, { "epoch": 30.530973451327434, "grad_norm": 0.14947320520877838, "learning_rate": 6.898e-05, "loss": 0.009, "step": 3450 }, { "epoch": 30.61946902654867, "grad_norm": 0.13617642223834991, "learning_rate": 6.918e-05, "loss": 0.0092, "step": 3460 }, { "epoch": 30.707964601769913, "grad_norm": 0.12668970227241516, "learning_rate": 6.938e-05, "loss": 0.0098, "step": 3470 }, { "epoch": 30.79646017699115, "grad_norm": 0.19894877076148987, "learning_rate": 6.958e-05, "loss": 0.0092, "step": 3480 }, { "epoch": 30.884955752212388, "grad_norm": 0.1800524890422821, "learning_rate": 6.978e-05, "loss": 0.0092, "step": 3490 }, { "epoch": 30.97345132743363, "grad_norm": 0.18502476811408997, "learning_rate": 6.998e-05, "loss": 0.0096, "step": 3500 }, { "epoch": 31.061946902654867, "grad_norm": 0.17043296992778778, "learning_rate": 7.018e-05, "loss": 0.0097, "step": 3510 }, { "epoch": 31.150442477876105, "grad_norm": 0.2069322019815445, "learning_rate": 7.038e-05, "loss": 0.0098, "step": 3520 }, { "epoch": 31.238938053097346, "grad_norm": 0.2311137467622757, "learning_rate": 7.058e-05, "loss": 0.0094, "step": 3530 }, { "epoch": 31.327433628318584, "grad_norm": 0.16992923617362976, "learning_rate": 7.078e-05, "loss": 0.0095, "step": 3540 }, { "epoch": 31.41592920353982, "grad_norm": 0.16129949688911438, "learning_rate": 7.098e-05, "loss": 0.0089, "step": 3550 }, { "epoch": 31.504424778761063, "grad_norm": 0.19930200278759003, "learning_rate": 7.118e-05, "loss": 0.0089, "step": 3560 }, { "epoch": 31.5929203539823, "grad_norm": 0.1485586166381836, "learning_rate": 7.138e-05, "loss": 0.009, "step": 3570 }, { "epoch": 31.68141592920354, "grad_norm": 0.2192714512348175, "learning_rate": 7.158e-05, "loss": 0.0094, "step": 3580 }, { "epoch": 31.76991150442478, "grad_norm": 0.185531347990036, "learning_rate": 7.178000000000001e-05, "loss": 0.0093, "step": 3590 }, { "epoch": 31.858407079646017, "grad_norm": 0.19907477498054504, "learning_rate": 7.198e-05, "loss": 0.0094, "step": 3600 }, { "epoch": 31.946902654867255, "grad_norm": 0.1861497461795807, "learning_rate": 7.218e-05, "loss": 0.0096, "step": 3610 }, { "epoch": 32.0353982300885, "grad_norm": 0.14380060136318207, "learning_rate": 7.238000000000001e-05, "loss": 0.0095, "step": 3620 }, { "epoch": 32.123893805309734, "grad_norm": 0.16309955716133118, "learning_rate": 7.258e-05, "loss": 0.0093, "step": 3630 }, { "epoch": 32.21238938053097, "grad_norm": 0.13984215259552002, "learning_rate": 7.278e-05, "loss": 0.0094, "step": 3640 }, { "epoch": 32.30088495575221, "grad_norm": 0.1498541384935379, "learning_rate": 7.298000000000001e-05, "loss": 0.0097, "step": 3650 }, { "epoch": 32.389380530973455, "grad_norm": 0.18653202056884766, "learning_rate": 7.318e-05, "loss": 0.0091, "step": 3660 }, { "epoch": 32.47787610619469, "grad_norm": 0.1598045825958252, "learning_rate": 7.338e-05, "loss": 0.0094, "step": 3670 }, { "epoch": 32.56637168141593, "grad_norm": 0.16379061341285706, "learning_rate": 7.358000000000001e-05, "loss": 0.0088, "step": 3680 }, { "epoch": 32.65486725663717, "grad_norm": 0.1803268939256668, "learning_rate": 7.378e-05, "loss": 0.0093, "step": 3690 }, { "epoch": 32.743362831858406, "grad_norm": 0.14064811170101166, "learning_rate": 7.398e-05, "loss": 0.009, "step": 3700 }, { "epoch": 32.83185840707964, "grad_norm": 0.12818744778633118, "learning_rate": 7.418000000000001e-05, "loss": 0.0092, "step": 3710 }, { "epoch": 32.92035398230089, "grad_norm": 0.1366705447435379, "learning_rate": 7.438e-05, "loss": 0.0085, "step": 3720 }, { "epoch": 33.008849557522126, "grad_norm": 0.18504424393177032, "learning_rate": 7.458000000000001e-05, "loss": 0.0093, "step": 3730 }, { "epoch": 33.097345132743364, "grad_norm": 0.14896735548973083, "learning_rate": 7.478e-05, "loss": 0.009, "step": 3740 }, { "epoch": 33.1858407079646, "grad_norm": 0.13870835304260254, "learning_rate": 7.498e-05, "loss": 0.0093, "step": 3750 }, { "epoch": 33.27433628318584, "grad_norm": 0.1251486837863922, "learning_rate": 7.518000000000001e-05, "loss": 0.0083, "step": 3760 }, { "epoch": 33.36283185840708, "grad_norm": 0.2125852108001709, "learning_rate": 7.538e-05, "loss": 0.0094, "step": 3770 }, { "epoch": 33.45132743362832, "grad_norm": 0.14534027874469757, "learning_rate": 7.558e-05, "loss": 0.0086, "step": 3780 }, { "epoch": 33.53982300884956, "grad_norm": 0.1484474241733551, "learning_rate": 7.578000000000001e-05, "loss": 0.009, "step": 3790 }, { "epoch": 33.6283185840708, "grad_norm": 0.1549052894115448, "learning_rate": 7.598e-05, "loss": 0.0091, "step": 3800 }, { "epoch": 33.716814159292035, "grad_norm": 0.13797472417354584, "learning_rate": 7.618e-05, "loss": 0.0091, "step": 3810 }, { "epoch": 33.80530973451327, "grad_norm": 0.15759176015853882, "learning_rate": 7.638000000000001e-05, "loss": 0.0078, "step": 3820 }, { "epoch": 33.89380530973451, "grad_norm": 0.13823020458221436, "learning_rate": 7.658e-05, "loss": 0.009, "step": 3830 }, { "epoch": 33.982300884955755, "grad_norm": 0.12496250122785568, "learning_rate": 7.678000000000001e-05, "loss": 0.0083, "step": 3840 }, { "epoch": 34.07079646017699, "grad_norm": 0.13213366270065308, "learning_rate": 7.698000000000001e-05, "loss": 0.0091, "step": 3850 }, { "epoch": 34.15929203539823, "grad_norm": 0.18073223531246185, "learning_rate": 7.718e-05, "loss": 0.0081, "step": 3860 }, { "epoch": 34.24778761061947, "grad_norm": 0.1358853280544281, "learning_rate": 7.738000000000001e-05, "loss": 0.0086, "step": 3870 }, { "epoch": 34.336283185840706, "grad_norm": 0.16672301292419434, "learning_rate": 7.758000000000001e-05, "loss": 0.0087, "step": 3880 }, { "epoch": 34.424778761061944, "grad_norm": 0.11548765748739243, "learning_rate": 7.778e-05, "loss": 0.0087, "step": 3890 }, { "epoch": 34.51327433628319, "grad_norm": 0.15918992459774017, "learning_rate": 7.798000000000001e-05, "loss": 0.0088, "step": 3900 }, { "epoch": 34.60176991150443, "grad_norm": 0.17667222023010254, "learning_rate": 7.818000000000001e-05, "loss": 0.0088, "step": 3910 }, { "epoch": 34.690265486725664, "grad_norm": 0.1465061604976654, "learning_rate": 7.838e-05, "loss": 0.0085, "step": 3920 }, { "epoch": 34.7787610619469, "grad_norm": 0.18721789121627808, "learning_rate": 7.858000000000001e-05, "loss": 0.0087, "step": 3930 }, { "epoch": 34.86725663716814, "grad_norm": 0.16090698540210724, "learning_rate": 7.878e-05, "loss": 0.0085, "step": 3940 }, { "epoch": 34.95575221238938, "grad_norm": 0.16545279324054718, "learning_rate": 7.897999999999999e-05, "loss": 0.0082, "step": 3950 }, { "epoch": 35.04424778761062, "grad_norm": 0.16116927564144135, "learning_rate": 7.918e-05, "loss": 0.0084, "step": 3960 }, { "epoch": 35.13274336283186, "grad_norm": 0.19407112896442413, "learning_rate": 7.938e-05, "loss": 0.008, "step": 3970 }, { "epoch": 35.2212389380531, "grad_norm": 0.13435247540473938, "learning_rate": 7.958e-05, "loss": 0.0088, "step": 3980 }, { "epoch": 35.309734513274336, "grad_norm": 0.15754403173923492, "learning_rate": 7.978e-05, "loss": 0.0092, "step": 3990 }, { "epoch": 35.39823008849557, "grad_norm": 0.18012726306915283, "learning_rate": 7.998e-05, "loss": 0.0089, "step": 4000 }, { "epoch": 35.48672566371681, "grad_norm": 0.17743121087551117, "learning_rate": 8.018e-05, "loss": 0.0089, "step": 4010 }, { "epoch": 35.575221238938056, "grad_norm": 0.1524660289287567, "learning_rate": 8.038e-05, "loss": 0.0086, "step": 4020 }, { "epoch": 35.663716814159294, "grad_norm": 0.17956510186195374, "learning_rate": 8.058e-05, "loss": 0.0086, "step": 4030 }, { "epoch": 35.75221238938053, "grad_norm": 0.17635688185691833, "learning_rate": 8.078e-05, "loss": 0.0086, "step": 4040 }, { "epoch": 35.84070796460177, "grad_norm": 0.16003718972206116, "learning_rate": 8.098e-05, "loss": 0.0091, "step": 4050 }, { "epoch": 35.92920353982301, "grad_norm": 0.14748983085155487, "learning_rate": 8.118e-05, "loss": 0.0096, "step": 4060 }, { "epoch": 36.017699115044245, "grad_norm": 0.16579337418079376, "learning_rate": 8.138e-05, "loss": 0.0092, "step": 4070 }, { "epoch": 36.10619469026549, "grad_norm": 0.15083903074264526, "learning_rate": 8.158e-05, "loss": 0.0091, "step": 4080 }, { "epoch": 36.19469026548673, "grad_norm": 0.13731341063976288, "learning_rate": 8.178e-05, "loss": 0.0083, "step": 4090 }, { "epoch": 36.283185840707965, "grad_norm": 0.164901465177536, "learning_rate": 8.198e-05, "loss": 0.0087, "step": 4100 }, { "epoch": 36.3716814159292, "grad_norm": 0.1438867747783661, "learning_rate": 8.218e-05, "loss": 0.0082, "step": 4110 }, { "epoch": 36.46017699115044, "grad_norm": 0.15548557043075562, "learning_rate": 8.238000000000001e-05, "loss": 0.0083, "step": 4120 }, { "epoch": 36.54867256637168, "grad_norm": 0.14919929206371307, "learning_rate": 8.258e-05, "loss": 0.0083, "step": 4130 }, { "epoch": 36.63716814159292, "grad_norm": 0.13696545362472534, "learning_rate": 8.278e-05, "loss": 0.0081, "step": 4140 }, { "epoch": 36.72566371681416, "grad_norm": 0.15954339504241943, "learning_rate": 8.298000000000001e-05, "loss": 0.0078, "step": 4150 }, { "epoch": 36.8141592920354, "grad_norm": 0.1508786976337433, "learning_rate": 8.318e-05, "loss": 0.0093, "step": 4160 }, { "epoch": 36.902654867256636, "grad_norm": 0.1239413172006607, "learning_rate": 8.338e-05, "loss": 0.0086, "step": 4170 }, { "epoch": 36.991150442477874, "grad_norm": 0.15482401847839355, "learning_rate": 8.358e-05, "loss": 0.0084, "step": 4180 }, { "epoch": 37.07964601769911, "grad_norm": 0.16805481910705566, "learning_rate": 8.378e-05, "loss": 0.0091, "step": 4190 }, { "epoch": 37.16814159292036, "grad_norm": 0.21241982281208038, "learning_rate": 8.398e-05, "loss": 0.009, "step": 4200 }, { "epoch": 37.256637168141594, "grad_norm": 0.1799900233745575, "learning_rate": 8.418e-05, "loss": 0.0088, "step": 4210 }, { "epoch": 37.34513274336283, "grad_norm": 0.1303999274969101, "learning_rate": 8.438e-05, "loss": 0.0086, "step": 4220 }, { "epoch": 37.43362831858407, "grad_norm": 0.16632936894893646, "learning_rate": 8.458e-05, "loss": 0.0089, "step": 4230 }, { "epoch": 37.52212389380531, "grad_norm": 0.16269199550151825, "learning_rate": 8.478e-05, "loss": 0.0088, "step": 4240 }, { "epoch": 37.610619469026545, "grad_norm": 0.15422070026397705, "learning_rate": 8.498e-05, "loss": 0.0088, "step": 4250 }, { "epoch": 37.69911504424779, "grad_norm": 0.14053797721862793, "learning_rate": 8.518000000000001e-05, "loss": 0.0086, "step": 4260 }, { "epoch": 37.78761061946903, "grad_norm": 0.15649044513702393, "learning_rate": 8.538e-05, "loss": 0.0086, "step": 4270 }, { "epoch": 37.876106194690266, "grad_norm": 0.16178472340106964, "learning_rate": 8.558e-05, "loss": 0.0092, "step": 4280 }, { "epoch": 37.9646017699115, "grad_norm": 0.19397889077663422, "learning_rate": 8.578000000000001e-05, "loss": 0.0081, "step": 4290 }, { "epoch": 38.05309734513274, "grad_norm": 0.18350431323051453, "learning_rate": 8.598e-05, "loss": 0.0098, "step": 4300 }, { "epoch": 38.14159292035398, "grad_norm": 0.12597690522670746, "learning_rate": 8.618e-05, "loss": 0.0087, "step": 4310 }, { "epoch": 38.230088495575224, "grad_norm": 0.15449459850788116, "learning_rate": 8.638000000000001e-05, "loss": 0.0083, "step": 4320 }, { "epoch": 38.31858407079646, "grad_norm": 0.177949458360672, "learning_rate": 8.658e-05, "loss": 0.0098, "step": 4330 }, { "epoch": 38.4070796460177, "grad_norm": 0.1619928777217865, "learning_rate": 8.678e-05, "loss": 0.0078, "step": 4340 }, { "epoch": 38.49557522123894, "grad_norm": 0.15013732016086578, "learning_rate": 8.698000000000001e-05, "loss": 0.0083, "step": 4350 }, { "epoch": 38.584070796460175, "grad_norm": 0.17102967202663422, "learning_rate": 8.718e-05, "loss": 0.0085, "step": 4360 }, { "epoch": 38.67256637168141, "grad_norm": 0.14002537727355957, "learning_rate": 8.738000000000001e-05, "loss": 0.0083, "step": 4370 }, { "epoch": 38.76106194690266, "grad_norm": 0.14207163453102112, "learning_rate": 8.758000000000001e-05, "loss": 0.0083, "step": 4380 }, { "epoch": 38.849557522123895, "grad_norm": 0.16239385306835175, "learning_rate": 8.778e-05, "loss": 0.0085, "step": 4390 }, { "epoch": 38.93805309734513, "grad_norm": 0.15160299837589264, "learning_rate": 8.798000000000001e-05, "loss": 0.0077, "step": 4400 }, { "epoch": 39.02654867256637, "grad_norm": 0.20175650715827942, "learning_rate": 8.818000000000001e-05, "loss": 0.008, "step": 4410 }, { "epoch": 39.11504424778761, "grad_norm": 0.15453539788722992, "learning_rate": 8.838e-05, "loss": 0.008, "step": 4420 }, { "epoch": 39.203539823008846, "grad_norm": 0.18454639613628387, "learning_rate": 8.858000000000001e-05, "loss": 0.0088, "step": 4430 }, { "epoch": 39.29203539823009, "grad_norm": 0.2145855724811554, "learning_rate": 8.878000000000001e-05, "loss": 0.0086, "step": 4440 }, { "epoch": 39.38053097345133, "grad_norm": 0.24812659621238708, "learning_rate": 8.898e-05, "loss": 0.0086, "step": 4450 }, { "epoch": 39.469026548672566, "grad_norm": 0.14961618185043335, "learning_rate": 8.918000000000001e-05, "loss": 0.0082, "step": 4460 }, { "epoch": 39.557522123893804, "grad_norm": 0.1429668813943863, "learning_rate": 8.938e-05, "loss": 0.0083, "step": 4470 }, { "epoch": 39.64601769911504, "grad_norm": 0.12803004682064056, "learning_rate": 8.958e-05, "loss": 0.0076, "step": 4480 }, { "epoch": 39.73451327433628, "grad_norm": 0.13182350993156433, "learning_rate": 8.978000000000001e-05, "loss": 0.0079, "step": 4490 }, { "epoch": 39.823008849557525, "grad_norm": 0.1274401843547821, "learning_rate": 8.998e-05, "loss": 0.0083, "step": 4500 }, { "epoch": 39.91150442477876, "grad_norm": 0.13591696321964264, "learning_rate": 9.018000000000001e-05, "loss": 0.0084, "step": 4510 }, { "epoch": 40.0, "grad_norm": 0.1654900312423706, "learning_rate": 9.038000000000001e-05, "loss": 0.0082, "step": 4520 }, { "epoch": 40.08849557522124, "grad_norm": 0.1695890575647354, "learning_rate": 9.058e-05, "loss": 0.008, "step": 4530 }, { "epoch": 40.176991150442475, "grad_norm": 0.15948446094989777, "learning_rate": 9.078000000000001e-05, "loss": 0.0088, "step": 4540 }, { "epoch": 40.26548672566372, "grad_norm": 0.12526074051856995, "learning_rate": 9.098000000000001e-05, "loss": 0.0076, "step": 4550 }, { "epoch": 40.35398230088496, "grad_norm": 0.1264461874961853, "learning_rate": 9.118e-05, "loss": 0.009, "step": 4560 }, { "epoch": 40.442477876106196, "grad_norm": 0.1087394431233406, "learning_rate": 9.138e-05, "loss": 0.0082, "step": 4570 }, { "epoch": 40.530973451327434, "grad_norm": 0.12300600856542587, "learning_rate": 9.158e-05, "loss": 0.0081, "step": 4580 }, { "epoch": 40.61946902654867, "grad_norm": 0.2179492563009262, "learning_rate": 9.178e-05, "loss": 0.0085, "step": 4590 }, { "epoch": 40.70796460176991, "grad_norm": 0.1449461728334427, "learning_rate": 9.198e-05, "loss": 0.0079, "step": 4600 }, { "epoch": 40.796460176991154, "grad_norm": 0.16602952778339386, "learning_rate": 9.218e-05, "loss": 0.0083, "step": 4610 }, { "epoch": 40.88495575221239, "grad_norm": 0.1600906252861023, "learning_rate": 9.238e-05, "loss": 0.0085, "step": 4620 }, { "epoch": 40.97345132743363, "grad_norm": 0.1462162286043167, "learning_rate": 9.258e-05, "loss": 0.0081, "step": 4630 }, { "epoch": 41.06194690265487, "grad_norm": 0.12794408202171326, "learning_rate": 9.278e-05, "loss": 0.0089, "step": 4640 }, { "epoch": 41.150442477876105, "grad_norm": 0.1656036525964737, "learning_rate": 9.298e-05, "loss": 0.0079, "step": 4650 }, { "epoch": 41.23893805309734, "grad_norm": 0.17827680706977844, "learning_rate": 9.318e-05, "loss": 0.0086, "step": 4660 }, { "epoch": 41.32743362831859, "grad_norm": 0.18648305535316467, "learning_rate": 9.338e-05, "loss": 0.0088, "step": 4670 }, { "epoch": 41.415929203539825, "grad_norm": 0.14734520018100739, "learning_rate": 9.358e-05, "loss": 0.0079, "step": 4680 }, { "epoch": 41.50442477876106, "grad_norm": 0.148806631565094, "learning_rate": 9.378e-05, "loss": 0.0082, "step": 4690 }, { "epoch": 41.5929203539823, "grad_norm": 0.17897126078605652, "learning_rate": 9.398e-05, "loss": 0.0093, "step": 4700 }, { "epoch": 41.68141592920354, "grad_norm": 0.18038998544216156, "learning_rate": 9.418e-05, "loss": 0.0087, "step": 4710 }, { "epoch": 41.769911504424776, "grad_norm": 0.12283733487129211, "learning_rate": 9.438e-05, "loss": 0.0078, "step": 4720 }, { "epoch": 41.85840707964602, "grad_norm": 0.1232951357960701, "learning_rate": 9.458e-05, "loss": 0.0085, "step": 4730 }, { "epoch": 41.94690265486726, "grad_norm": 0.13569188117980957, "learning_rate": 9.478e-05, "loss": 0.0087, "step": 4740 }, { "epoch": 42.0353982300885, "grad_norm": 0.11398052424192429, "learning_rate": 9.498e-05, "loss": 0.0084, "step": 4750 }, { "epoch": 42.123893805309734, "grad_norm": 0.13281014561653137, "learning_rate": 9.518000000000001e-05, "loss": 0.0076, "step": 4760 }, { "epoch": 42.21238938053097, "grad_norm": 0.17270945012569427, "learning_rate": 9.538e-05, "loss": 0.0082, "step": 4770 }, { "epoch": 42.30088495575221, "grad_norm": 0.15716494619846344, "learning_rate": 9.558e-05, "loss": 0.0086, "step": 4780 }, { "epoch": 42.389380530973455, "grad_norm": 0.12443627417087555, "learning_rate": 9.578000000000001e-05, "loss": 0.0075, "step": 4790 }, { "epoch": 42.47787610619469, "grad_norm": 0.12717044353485107, "learning_rate": 9.598e-05, "loss": 0.008, "step": 4800 }, { "epoch": 42.56637168141593, "grad_norm": 0.14651887118816376, "learning_rate": 9.618e-05, "loss": 0.0076, "step": 4810 }, { "epoch": 42.65486725663717, "grad_norm": 0.13384868204593658, "learning_rate": 9.638000000000001e-05, "loss": 0.0082, "step": 4820 }, { "epoch": 42.743362831858406, "grad_norm": 0.16453304886817932, "learning_rate": 9.658e-05, "loss": 0.0083, "step": 4830 }, { "epoch": 42.83185840707964, "grad_norm": 0.17644605040550232, "learning_rate": 9.678e-05, "loss": 0.0079, "step": 4840 }, { "epoch": 42.92035398230089, "grad_norm": 0.18505771458148956, "learning_rate": 9.698000000000001e-05, "loss": 0.0082, "step": 4850 }, { "epoch": 43.008849557522126, "grad_norm": 0.17535194754600525, "learning_rate": 9.718e-05, "loss": 0.0082, "step": 4860 }, { "epoch": 43.097345132743364, "grad_norm": 0.15071967244148254, "learning_rate": 9.738e-05, "loss": 0.0081, "step": 4870 }, { "epoch": 43.1858407079646, "grad_norm": 0.11829250305891037, "learning_rate": 9.758000000000001e-05, "loss": 0.0077, "step": 4880 }, { "epoch": 43.27433628318584, "grad_norm": 0.1169343814253807, "learning_rate": 9.778e-05, "loss": 0.0075, "step": 4890 }, { "epoch": 43.36283185840708, "grad_norm": 0.12891387939453125, "learning_rate": 9.798000000000001e-05, "loss": 0.0073, "step": 4900 }, { "epoch": 43.45132743362832, "grad_norm": 0.1629272848367691, "learning_rate": 9.818000000000001e-05, "loss": 0.0081, "step": 4910 }, { "epoch": 43.53982300884956, "grad_norm": 0.1464705765247345, "learning_rate": 9.838e-05, "loss": 0.0082, "step": 4920 }, { "epoch": 43.6283185840708, "grad_norm": 0.13340625166893005, "learning_rate": 9.858000000000001e-05, "loss": 0.0091, "step": 4930 }, { "epoch": 43.716814159292035, "grad_norm": 0.1393061727285385, "learning_rate": 9.878e-05, "loss": 0.008, "step": 4940 }, { "epoch": 43.80530973451327, "grad_norm": 0.1344250589609146, "learning_rate": 9.898e-05, "loss": 0.0078, "step": 4950 }, { "epoch": 43.89380530973451, "grad_norm": 0.12341362237930298, "learning_rate": 9.918000000000001e-05, "loss": 0.0083, "step": 4960 }, { "epoch": 43.982300884955755, "grad_norm": 0.11524364352226257, "learning_rate": 9.938e-05, "loss": 0.0083, "step": 4970 }, { "epoch": 44.07079646017699, "grad_norm": 0.1671265959739685, "learning_rate": 9.958e-05, "loss": 0.008, "step": 4980 }, { "epoch": 44.15929203539823, "grad_norm": 0.12943553924560547, "learning_rate": 9.978000000000001e-05, "loss": 0.0084, "step": 4990 }, { "epoch": 44.24778761061947, "grad_norm": 0.12595415115356445, "learning_rate": 9.998e-05, "loss": 0.0077, "step": 5000 }, { "epoch": 44.336283185840706, "grad_norm": 0.1713840663433075, "learning_rate": 9.999999778549045e-05, "loss": 0.0087, "step": 5010 }, { "epoch": 44.424778761061944, "grad_norm": 0.16503138840198517, "learning_rate": 9.999999013039593e-05, "loss": 0.0085, "step": 5020 }, { "epoch": 44.51327433628319, "grad_norm": 0.14200608432292938, "learning_rate": 9.999997700737766e-05, "loss": 0.0082, "step": 5030 }, { "epoch": 44.60176991150443, "grad_norm": 0.13645635545253754, "learning_rate": 9.999995841643709e-05, "loss": 0.0082, "step": 5040 }, { "epoch": 44.690265486725664, "grad_norm": 0.12640200555324554, "learning_rate": 9.999993435757623e-05, "loss": 0.0085, "step": 5050 }, { "epoch": 44.7787610619469, "grad_norm": 0.11563556641340256, "learning_rate": 9.999990483079773e-05, "loss": 0.0083, "step": 5060 }, { "epoch": 44.86725663716814, "grad_norm": 0.1313731074333191, "learning_rate": 9.999986983610481e-05, "loss": 0.0075, "step": 5070 }, { "epoch": 44.95575221238938, "grad_norm": 0.1426715850830078, "learning_rate": 9.99998293735013e-05, "loss": 0.0075, "step": 5080 }, { "epoch": 45.04424778761062, "grad_norm": 0.1411140263080597, "learning_rate": 9.999978344299161e-05, "loss": 0.008, "step": 5090 }, { "epoch": 45.13274336283186, "grad_norm": 0.1875256448984146, "learning_rate": 9.99997320445808e-05, "loss": 0.0083, "step": 5100 }, { "epoch": 45.2212389380531, "grad_norm": 0.1731545478105545, "learning_rate": 9.999967517827444e-05, "loss": 0.0081, "step": 5110 }, { "epoch": 45.309734513274336, "grad_norm": 0.12668772041797638, "learning_rate": 9.999961284407879e-05, "loss": 0.0079, "step": 5120 }, { "epoch": 45.39823008849557, "grad_norm": 0.1660381704568863, "learning_rate": 9.999954504200067e-05, "loss": 0.0085, "step": 5130 }, { "epoch": 45.48672566371681, "grad_norm": 0.159927099943161, "learning_rate": 9.999947177204744e-05, "loss": 0.0079, "step": 5140 }, { "epoch": 45.575221238938056, "grad_norm": 0.08492934703826904, "learning_rate": 9.999939303422718e-05, "loss": 0.0078, "step": 5150 }, { "epoch": 45.663716814159294, "grad_norm": 0.13286644220352173, "learning_rate": 9.999930882854847e-05, "loss": 0.0077, "step": 5160 }, { "epoch": 45.75221238938053, "grad_norm": 0.165427565574646, "learning_rate": 9.999921915502051e-05, "loss": 0.0078, "step": 5170 }, { "epoch": 45.84070796460177, "grad_norm": 0.1565057635307312, "learning_rate": 9.99991240136531e-05, "loss": 0.0084, "step": 5180 }, { "epoch": 45.92920353982301, "grad_norm": 0.13537095487117767, "learning_rate": 9.999902340445668e-05, "loss": 0.0078, "step": 5190 }, { "epoch": 46.017699115044245, "grad_norm": 0.1399421989917755, "learning_rate": 9.999891732744224e-05, "loss": 0.0076, "step": 5200 }, { "epoch": 46.10619469026549, "grad_norm": 0.14480353891849518, "learning_rate": 9.999880578262135e-05, "loss": 0.0076, "step": 5210 }, { "epoch": 46.19469026548673, "grad_norm": 0.15610076487064362, "learning_rate": 9.999868877000624e-05, "loss": 0.0076, "step": 5220 }, { "epoch": 46.283185840707965, "grad_norm": 0.18092435598373413, "learning_rate": 9.99985662896097e-05, "loss": 0.0079, "step": 5230 }, { "epoch": 46.3716814159292, "grad_norm": 0.12604007124900818, "learning_rate": 9.999843834144513e-05, "loss": 0.0078, "step": 5240 }, { "epoch": 46.46017699115044, "grad_norm": 0.21595871448516846, "learning_rate": 9.99983049255265e-05, "loss": 0.008, "step": 5250 }, { "epoch": 46.54867256637168, "grad_norm": 0.16414035856723785, "learning_rate": 9.999816604186843e-05, "loss": 0.0076, "step": 5260 }, { "epoch": 46.63716814159292, "grad_norm": 0.14966443181037903, "learning_rate": 9.999802169048609e-05, "loss": 0.0085, "step": 5270 }, { "epoch": 46.72566371681416, "grad_norm": 0.1894393116235733, "learning_rate": 9.999787187139527e-05, "loss": 0.0078, "step": 5280 }, { "epoch": 46.8141592920354, "grad_norm": 0.1692032665014267, "learning_rate": 9.999771658461234e-05, "loss": 0.0079, "step": 5290 }, { "epoch": 46.902654867256636, "grad_norm": 0.18700039386749268, "learning_rate": 9.999755583015431e-05, "loss": 0.008, "step": 5300 }, { "epoch": 46.991150442477874, "grad_norm": 0.1777140349149704, "learning_rate": 9.999738960803874e-05, "loss": 0.0087, "step": 5310 }, { "epoch": 47.07964601769911, "grad_norm": 0.1733047068119049, "learning_rate": 9.99972179182838e-05, "loss": 0.0079, "step": 5320 }, { "epoch": 47.16814159292036, "grad_norm": 0.16074609756469727, "learning_rate": 9.99970407609083e-05, "loss": 0.0083, "step": 5330 }, { "epoch": 47.256637168141594, "grad_norm": 0.11764027923345566, "learning_rate": 9.999685813593159e-05, "loss": 0.0079, "step": 5340 }, { "epoch": 47.34513274336283, "grad_norm": 0.14405815303325653, "learning_rate": 9.999667004337362e-05, "loss": 0.0075, "step": 5350 }, { "epoch": 47.43362831858407, "grad_norm": 0.13123449683189392, "learning_rate": 9.9996476483255e-05, "loss": 0.0071, "step": 5360 }, { "epoch": 47.52212389380531, "grad_norm": 0.1139673963189125, "learning_rate": 9.999627745559688e-05, "loss": 0.0078, "step": 5370 }, { "epoch": 47.610619469026545, "grad_norm": 0.14024882018566132, "learning_rate": 9.999607296042101e-05, "loss": 0.0072, "step": 5380 }, { "epoch": 47.69911504424779, "grad_norm": 0.14336134493350983, "learning_rate": 9.99958629977498e-05, "loss": 0.0081, "step": 5390 }, { "epoch": 47.78761061946903, "grad_norm": 0.15486717224121094, "learning_rate": 9.999564756760615e-05, "loss": 0.0078, "step": 5400 }, { "epoch": 47.876106194690266, "grad_norm": 0.14090169966220856, "learning_rate": 9.999542667001366e-05, "loss": 0.0085, "step": 5410 }, { "epoch": 47.9646017699115, "grad_norm": 0.13845594227313995, "learning_rate": 9.999520030499647e-05, "loss": 0.0078, "step": 5420 }, { "epoch": 48.05309734513274, "grad_norm": 0.14902648329734802, "learning_rate": 9.999496847257936e-05, "loss": 0.0078, "step": 5430 }, { "epoch": 48.14159292035398, "grad_norm": 0.13788892328739166, "learning_rate": 9.999473117278764e-05, "loss": 0.007, "step": 5440 }, { "epoch": 48.230088495575224, "grad_norm": 0.16489292681217194, "learning_rate": 9.999448840564731e-05, "loss": 0.0078, "step": 5450 }, { "epoch": 48.31858407079646, "grad_norm": 0.13535885512828827, "learning_rate": 9.999424017118488e-05, "loss": 0.0082, "step": 5460 }, { "epoch": 48.4070796460177, "grad_norm": 0.14194975793361664, "learning_rate": 9.999398646942751e-05, "loss": 0.0075, "step": 5470 }, { "epoch": 48.49557522123894, "grad_norm": 0.12652724981307983, "learning_rate": 9.999372730040296e-05, "loss": 0.0076, "step": 5480 }, { "epoch": 48.584070796460175, "grad_norm": 0.17050671577453613, "learning_rate": 9.999346266413953e-05, "loss": 0.0081, "step": 5490 }, { "epoch": 48.67256637168141, "grad_norm": 0.17964069545269012, "learning_rate": 9.99931925606662e-05, "loss": 0.008, "step": 5500 }, { "epoch": 48.76106194690266, "grad_norm": 0.1285768449306488, "learning_rate": 9.99929169900125e-05, "loss": 0.0073, "step": 5510 }, { "epoch": 48.849557522123895, "grad_norm": 0.12192533910274506, "learning_rate": 9.999263595220855e-05, "loss": 0.008, "step": 5520 }, { "epoch": 48.93805309734513, "grad_norm": 0.14079219102859497, "learning_rate": 9.99923494472851e-05, "loss": 0.0081, "step": 5530 }, { "epoch": 49.02654867256637, "grad_norm": 0.11866286396980286, "learning_rate": 9.999205747527348e-05, "loss": 0.0081, "step": 5540 }, { "epoch": 49.11504424778761, "grad_norm": 0.17622506618499756, "learning_rate": 9.999176003620561e-05, "loss": 0.0074, "step": 5550 }, { "epoch": 49.203539823008846, "grad_norm": 0.1561884731054306, "learning_rate": 9.999145713011405e-05, "loss": 0.0071, "step": 5560 }, { "epoch": 49.29203539823009, "grad_norm": 0.15959735214710236, "learning_rate": 9.999114875703186e-05, "loss": 0.0072, "step": 5570 }, { "epoch": 49.38053097345133, "grad_norm": 0.13429850339889526, "learning_rate": 9.999083491699281e-05, "loss": 0.0075, "step": 5580 }, { "epoch": 49.469026548672566, "grad_norm": 0.13098657131195068, "learning_rate": 9.999051561003123e-05, "loss": 0.0068, "step": 5590 }, { "epoch": 49.557522123893804, "grad_norm": 0.1348903477191925, "learning_rate": 9.999019083618202e-05, "loss": 0.008, "step": 5600 }, { "epoch": 49.64601769911504, "grad_norm": 0.14096537232398987, "learning_rate": 9.99898605954807e-05, "loss": 0.0071, "step": 5610 }, { "epoch": 49.73451327433628, "grad_norm": 0.09773388504981995, "learning_rate": 9.998952488796338e-05, "loss": 0.0071, "step": 5620 }, { "epoch": 49.823008849557525, "grad_norm": 0.1066397875547409, "learning_rate": 9.998918371366676e-05, "loss": 0.007, "step": 5630 }, { "epoch": 49.91150442477876, "grad_norm": 0.13253827393054962, "learning_rate": 9.99888370726282e-05, "loss": 0.0081, "step": 5640 }, { "epoch": 50.0, "grad_norm": 0.10580658912658691, "learning_rate": 9.998848496488556e-05, "loss": 0.0074, "step": 5650 }, { "epoch": 50.08849557522124, "grad_norm": 0.12896136939525604, "learning_rate": 9.998812739047736e-05, "loss": 0.0071, "step": 5660 }, { "epoch": 50.176991150442475, "grad_norm": 0.12507116794586182, "learning_rate": 9.99877643494427e-05, "loss": 0.0081, "step": 5670 }, { "epoch": 50.26548672566372, "grad_norm": 0.13797657191753387, "learning_rate": 9.998739584182128e-05, "loss": 0.0067, "step": 5680 }, { "epoch": 50.35398230088496, "grad_norm": 0.1256704032421112, "learning_rate": 9.998702186765342e-05, "loss": 0.0073, "step": 5690 }, { "epoch": 50.442477876106196, "grad_norm": 0.14041030406951904, "learning_rate": 9.998664242698e-05, "loss": 0.0078, "step": 5700 }, { "epoch": 50.530973451327434, "grad_norm": 0.13407254219055176, "learning_rate": 9.998625751984251e-05, "loss": 0.0072, "step": 5710 }, { "epoch": 50.61946902654867, "grad_norm": 0.11884211003780365, "learning_rate": 9.998586714628307e-05, "loss": 0.0073, "step": 5720 }, { "epoch": 50.70796460176991, "grad_norm": 0.10131306946277618, "learning_rate": 9.998547130634432e-05, "loss": 0.0068, "step": 5730 }, { "epoch": 50.796460176991154, "grad_norm": 0.12167004495859146, "learning_rate": 9.99850700000696e-05, "loss": 0.0066, "step": 5740 }, { "epoch": 50.88495575221239, "grad_norm": 0.10352536290884018, "learning_rate": 9.998466322750278e-05, "loss": 0.0071, "step": 5750 }, { "epoch": 50.97345132743363, "grad_norm": 0.10714670270681381, "learning_rate": 9.998425098868834e-05, "loss": 0.0075, "step": 5760 }, { "epoch": 51.06194690265487, "grad_norm": 0.11024036258459091, "learning_rate": 9.998383328367136e-05, "loss": 0.0071, "step": 5770 }, { "epoch": 51.150442477876105, "grad_norm": 0.10746333748102188, "learning_rate": 9.99834101124975e-05, "loss": 0.0071, "step": 5780 }, { "epoch": 51.23893805309734, "grad_norm": 0.11430933326482773, "learning_rate": 9.998298147521309e-05, "loss": 0.0067, "step": 5790 }, { "epoch": 51.32743362831859, "grad_norm": 0.13216055929660797, "learning_rate": 9.998254737186496e-05, "loss": 0.0068, "step": 5800 }, { "epoch": 51.415929203539825, "grad_norm": 0.14904490113258362, "learning_rate": 9.99821078025006e-05, "loss": 0.0075, "step": 5810 }, { "epoch": 51.50442477876106, "grad_norm": 0.1333301067352295, "learning_rate": 9.998166276716807e-05, "loss": 0.0075, "step": 5820 }, { "epoch": 51.5929203539823, "grad_norm": 0.13811925053596497, "learning_rate": 9.998121226591606e-05, "loss": 0.0069, "step": 5830 }, { "epoch": 51.68141592920354, "grad_norm": 0.1468496173620224, "learning_rate": 9.998075629879382e-05, "loss": 0.0072, "step": 5840 }, { "epoch": 51.769911504424776, "grad_norm": 0.14314375817775726, "learning_rate": 9.99802948658512e-05, "loss": 0.0075, "step": 5850 }, { "epoch": 51.85840707964602, "grad_norm": 0.13941283524036407, "learning_rate": 9.99798279671387e-05, "loss": 0.0077, "step": 5860 }, { "epoch": 51.94690265486726, "grad_norm": 0.13589324057102203, "learning_rate": 9.997935560270734e-05, "loss": 0.0074, "step": 5870 }, { "epoch": 52.0353982300885, "grad_norm": 0.14620272815227509, "learning_rate": 9.997887777260879e-05, "loss": 0.0075, "step": 5880 }, { "epoch": 52.123893805309734, "grad_norm": 0.12211687862873077, "learning_rate": 9.997839447689532e-05, "loss": 0.0068, "step": 5890 }, { "epoch": 52.21238938053097, "grad_norm": 0.14531292021274567, "learning_rate": 9.997790571561978e-05, "loss": 0.0073, "step": 5900 }, { "epoch": 52.30088495575221, "grad_norm": 0.19999101758003235, "learning_rate": 9.99774114888356e-05, "loss": 0.0079, "step": 5910 }, { "epoch": 52.389380530973455, "grad_norm": 0.12175704538822174, "learning_rate": 9.997691179659684e-05, "loss": 0.0069, "step": 5920 }, { "epoch": 52.47787610619469, "grad_norm": 0.14504700899124146, "learning_rate": 9.997640663895815e-05, "loss": 0.0071, "step": 5930 }, { "epoch": 52.56637168141593, "grad_norm": 0.11807917058467865, "learning_rate": 9.997589601597477e-05, "loss": 0.0071, "step": 5940 }, { "epoch": 52.65486725663717, "grad_norm": 0.14919745922088623, "learning_rate": 9.997537992770252e-05, "loss": 0.0076, "step": 5950 }, { "epoch": 52.743362831858406, "grad_norm": 0.16913174092769623, "learning_rate": 9.997485837419788e-05, "loss": 0.0079, "step": 5960 }, { "epoch": 52.83185840707964, "grad_norm": 0.1366829127073288, "learning_rate": 9.997433135551786e-05, "loss": 0.007, "step": 5970 }, { "epoch": 52.92035398230089, "grad_norm": 0.18243227899074554, "learning_rate": 9.997379887172009e-05, "loss": 0.0074, "step": 5980 }, { "epoch": 53.008849557522126, "grad_norm": 0.14823484420776367, "learning_rate": 9.997326092286281e-05, "loss": 0.0073, "step": 5990 }, { "epoch": 53.097345132743364, "grad_norm": 0.1373046487569809, "learning_rate": 9.997271750900486e-05, "loss": 0.0075, "step": 6000 }, { "epoch": 53.1858407079646, "grad_norm": 0.1174549013376236, "learning_rate": 9.997216863020565e-05, "loss": 0.007, "step": 6010 }, { "epoch": 53.27433628318584, "grad_norm": 0.14685441553592682, "learning_rate": 9.99716142865252e-05, "loss": 0.0072, "step": 6020 }, { "epoch": 53.36283185840708, "grad_norm": 0.12311328202486038, "learning_rate": 9.997105447802415e-05, "loss": 0.007, "step": 6030 }, { "epoch": 53.45132743362832, "grad_norm": 0.14539910852909088, "learning_rate": 9.997048920476373e-05, "loss": 0.007, "step": 6040 }, { "epoch": 53.53982300884956, "grad_norm": 0.1585048884153366, "learning_rate": 9.996991846680572e-05, "loss": 0.007, "step": 6050 }, { "epoch": 53.6283185840708, "grad_norm": 0.1574210524559021, "learning_rate": 9.996934226421257e-05, "loss": 0.0066, "step": 6060 }, { "epoch": 53.716814159292035, "grad_norm": 0.13777823746204376, "learning_rate": 9.996876059704726e-05, "loss": 0.0073, "step": 6070 }, { "epoch": 53.80530973451327, "grad_norm": 0.1269245743751526, "learning_rate": 9.996817346537343e-05, "loss": 0.0076, "step": 6080 }, { "epoch": 53.89380530973451, "grad_norm": 0.12032314389944077, "learning_rate": 9.996758086925526e-05, "loss": 0.0077, "step": 6090 }, { "epoch": 53.982300884955755, "grad_norm": 0.1316395401954651, "learning_rate": 9.996698280875759e-05, "loss": 0.007, "step": 6100 }, { "epoch": 54.07079646017699, "grad_norm": 0.13027605414390564, "learning_rate": 9.99663792839458e-05, "loss": 0.0082, "step": 6110 }, { "epoch": 54.15929203539823, "grad_norm": 0.09656640887260437, "learning_rate": 9.99657702948859e-05, "loss": 0.0066, "step": 6120 }, { "epoch": 54.24778761061947, "grad_norm": 0.10608144849538803, "learning_rate": 9.996515584164448e-05, "loss": 0.0069, "step": 6130 }, { "epoch": 54.336283185840706, "grad_norm": 0.13142161071300507, "learning_rate": 9.996453592428873e-05, "loss": 0.0078, "step": 6140 }, { "epoch": 54.424778761061944, "grad_norm": 0.15804174542427063, "learning_rate": 9.996391054288646e-05, "loss": 0.0071, "step": 6150 }, { "epoch": 54.51327433628319, "grad_norm": 0.13912999629974365, "learning_rate": 9.996327969750605e-05, "loss": 0.0079, "step": 6160 }, { "epoch": 54.60176991150443, "grad_norm": 0.15766675770282745, "learning_rate": 9.996264338821649e-05, "loss": 0.0073, "step": 6170 }, { "epoch": 54.690265486725664, "grad_norm": 0.11186890304088593, "learning_rate": 9.996200161508735e-05, "loss": 0.0069, "step": 6180 }, { "epoch": 54.7787610619469, "grad_norm": 0.10806110501289368, "learning_rate": 9.996135437818885e-05, "loss": 0.007, "step": 6190 }, { "epoch": 54.86725663716814, "grad_norm": 0.16035513579845428, "learning_rate": 9.996070167759175e-05, "loss": 0.007, "step": 6200 }, { "epoch": 54.95575221238938, "grad_norm": 0.16528643667697906, "learning_rate": 9.996004351336743e-05, "loss": 0.0077, "step": 6210 }, { "epoch": 55.04424778761062, "grad_norm": 0.15115606784820557, "learning_rate": 9.995937988558785e-05, "loss": 0.0075, "step": 6220 }, { "epoch": 55.13274336283186, "grad_norm": 0.13698093593120575, "learning_rate": 9.995871079432561e-05, "loss": 0.0073, "step": 6230 }, { "epoch": 55.2212389380531, "grad_norm": 0.13265031576156616, "learning_rate": 9.995803623965389e-05, "loss": 0.0081, "step": 6240 }, { "epoch": 55.309734513274336, "grad_norm": 0.18100149929523468, "learning_rate": 9.995735622164641e-05, "loss": 0.0072, "step": 6250 }, { "epoch": 55.39823008849557, "grad_norm": 0.1533287614583969, "learning_rate": 9.995667074037758e-05, "loss": 0.0067, "step": 6260 }, { "epoch": 55.48672566371681, "grad_norm": 0.13572955131530762, "learning_rate": 9.995597979592232e-05, "loss": 0.0077, "step": 6270 }, { "epoch": 55.575221238938056, "grad_norm": 0.10485535860061646, "learning_rate": 9.995528338835625e-05, "loss": 0.0071, "step": 6280 }, { "epoch": 55.663716814159294, "grad_norm": 0.10920023918151855, "learning_rate": 9.995458151775547e-05, "loss": 0.0071, "step": 6290 }, { "epoch": 55.75221238938053, "grad_norm": 0.10756077617406845, "learning_rate": 9.995387418419677e-05, "loss": 0.0067, "step": 6300 }, { "epoch": 55.84070796460177, "grad_norm": 0.12476219236850739, "learning_rate": 9.99531613877575e-05, "loss": 0.007, "step": 6310 }, { "epoch": 55.92920353982301, "grad_norm": 0.11445712298154831, "learning_rate": 9.995244312851559e-05, "loss": 0.007, "step": 6320 }, { "epoch": 56.017699115044245, "grad_norm": 0.1060228943824768, "learning_rate": 9.995171940654961e-05, "loss": 0.007, "step": 6330 }, { "epoch": 56.10619469026549, "grad_norm": 0.15434004366397858, "learning_rate": 9.995099022193871e-05, "loss": 0.0075, "step": 6340 }, { "epoch": 56.19469026548673, "grad_norm": 0.10044840723276138, "learning_rate": 9.995025557476261e-05, "loss": 0.0065, "step": 6350 }, { "epoch": 56.283185840707965, "grad_norm": 0.10664913058280945, "learning_rate": 9.994951546510165e-05, "loss": 0.0077, "step": 6360 }, { "epoch": 56.3716814159292, "grad_norm": 0.10567296296358109, "learning_rate": 9.994876989303679e-05, "loss": 0.0067, "step": 6370 }, { "epoch": 56.46017699115044, "grad_norm": 0.12378619611263275, "learning_rate": 9.994801885864955e-05, "loss": 0.0071, "step": 6380 }, { "epoch": 56.54867256637168, "grad_norm": 0.1165599524974823, "learning_rate": 9.994726236202205e-05, "loss": 0.0072, "step": 6390 }, { "epoch": 56.63716814159292, "grad_norm": 0.13224077224731445, "learning_rate": 9.994650040323704e-05, "loss": 0.0064, "step": 6400 }, { "epoch": 56.72566371681416, "grad_norm": 0.12277481704950333, "learning_rate": 9.994573298237784e-05, "loss": 0.0072, "step": 6410 }, { "epoch": 56.8141592920354, "grad_norm": 0.11215825378894806, "learning_rate": 9.994496009952837e-05, "loss": 0.0069, "step": 6420 }, { "epoch": 56.902654867256636, "grad_norm": 0.13313978910446167, "learning_rate": 9.994418175477316e-05, "loss": 0.0068, "step": 6430 }, { "epoch": 56.991150442477874, "grad_norm": 0.1526990532875061, "learning_rate": 9.994339794819733e-05, "loss": 0.0066, "step": 6440 }, { "epoch": 57.07964601769911, "grad_norm": 0.11533765494823456, "learning_rate": 9.994260867988658e-05, "loss": 0.0069, "step": 6450 }, { "epoch": 57.16814159292036, "grad_norm": 0.1275179386138916, "learning_rate": 9.994181394992723e-05, "loss": 0.0072, "step": 6460 }, { "epoch": 57.256637168141594, "grad_norm": 0.12770402431488037, "learning_rate": 9.994101375840618e-05, "loss": 0.007, "step": 6470 }, { "epoch": 57.34513274336283, "grad_norm": 0.13415737450122833, "learning_rate": 9.994020810541098e-05, "loss": 0.0069, "step": 6480 }, { "epoch": 57.43362831858407, "grad_norm": 0.11421578377485275, "learning_rate": 9.99393969910297e-05, "loss": 0.0062, "step": 6490 }, { "epoch": 57.52212389380531, "grad_norm": 0.1381864696741104, "learning_rate": 9.993858041535104e-05, "loss": 0.0067, "step": 6500 }, { "epoch": 57.610619469026545, "grad_norm": 0.12277805805206299, "learning_rate": 9.99377583784643e-05, "loss": 0.0071, "step": 6510 }, { "epoch": 57.69911504424779, "grad_norm": 0.11379420757293701, "learning_rate": 9.993693088045939e-05, "loss": 0.0067, "step": 6520 }, { "epoch": 57.78761061946903, "grad_norm": 0.10273974388837814, "learning_rate": 9.99360979214268e-05, "loss": 0.006, "step": 6530 }, { "epoch": 57.876106194690266, "grad_norm": 0.09692050516605377, "learning_rate": 9.99352595014576e-05, "loss": 0.0068, "step": 6540 }, { "epoch": 57.9646017699115, "grad_norm": 0.10250181704759598, "learning_rate": 9.993441562064354e-05, "loss": 0.0064, "step": 6550 }, { "epoch": 58.05309734513274, "grad_norm": 0.10750356316566467, "learning_rate": 9.993356627907685e-05, "loss": 0.0071, "step": 6560 }, { "epoch": 58.14159292035398, "grad_norm": 0.13370372354984283, "learning_rate": 9.99327114768504e-05, "loss": 0.0067, "step": 6570 }, { "epoch": 58.230088495575224, "grad_norm": 0.14788718521595, "learning_rate": 9.99318512140577e-05, "loss": 0.0067, "step": 6580 }, { "epoch": 58.31858407079646, "grad_norm": 0.15469089150428772, "learning_rate": 9.993098549079284e-05, "loss": 0.0073, "step": 6590 }, { "epoch": 58.4070796460177, "grad_norm": 0.16022972762584686, "learning_rate": 9.993011430715047e-05, "loss": 0.0073, "step": 6600 }, { "epoch": 58.49557522123894, "grad_norm": 0.12116739153862, "learning_rate": 9.992923766322586e-05, "loss": 0.0068, "step": 6610 }, { "epoch": 58.584070796460175, "grad_norm": 0.12725108861923218, "learning_rate": 9.99283555591149e-05, "loss": 0.0065, "step": 6620 }, { "epoch": 58.67256637168141, "grad_norm": 0.11936835944652557, "learning_rate": 9.992746799491404e-05, "loss": 0.0065, "step": 6630 }, { "epoch": 58.76106194690266, "grad_norm": 0.13782432675361633, "learning_rate": 9.992657497072033e-05, "loss": 0.0069, "step": 6640 }, { "epoch": 58.849557522123895, "grad_norm": 0.12118148803710938, "learning_rate": 9.992567648663147e-05, "loss": 0.007, "step": 6650 }, { "epoch": 58.93805309734513, "grad_norm": 0.12604814767837524, "learning_rate": 9.992477254274568e-05, "loss": 0.0061, "step": 6660 }, { "epoch": 59.02654867256637, "grad_norm": 0.13500675559043884, "learning_rate": 9.992386313916183e-05, "loss": 0.0067, "step": 6670 }, { "epoch": 59.11504424778761, "grad_norm": 0.12500520050525665, "learning_rate": 9.992294827597934e-05, "loss": 0.0072, "step": 6680 }, { "epoch": 59.203539823008846, "grad_norm": 0.1168852224946022, "learning_rate": 9.992202795329831e-05, "loss": 0.0062, "step": 6690 }, { "epoch": 59.29203539823009, "grad_norm": 0.12165257334709167, "learning_rate": 9.992110217121936e-05, "loss": 0.0065, "step": 6700 }, { "epoch": 59.38053097345133, "grad_norm": 0.13444724678993225, "learning_rate": 9.992017092984372e-05, "loss": 0.0068, "step": 6710 }, { "epoch": 59.469026548672566, "grad_norm": 0.15891699492931366, "learning_rate": 9.991923422927326e-05, "loss": 0.0071, "step": 6720 }, { "epoch": 59.557522123893804, "grad_norm": 0.1080576628446579, "learning_rate": 9.991829206961037e-05, "loss": 0.0068, "step": 6730 }, { "epoch": 59.64601769911504, "grad_norm": 0.10575453191995621, "learning_rate": 9.991734445095813e-05, "loss": 0.0066, "step": 6740 }, { "epoch": 59.73451327433628, "grad_norm": 0.08989618718624115, "learning_rate": 9.991639137342015e-05, "loss": 0.007, "step": 6750 }, { "epoch": 59.823008849557525, "grad_norm": 0.12456994503736496, "learning_rate": 9.991543283710064e-05, "loss": 0.007, "step": 6760 }, { "epoch": 59.91150442477876, "grad_norm": 0.10034193098545074, "learning_rate": 9.991446884210445e-05, "loss": 0.007, "step": 6770 }, { "epoch": 60.0, "grad_norm": 0.11604133248329163, "learning_rate": 9.9913499388537e-05, "loss": 0.0067, "step": 6780 }, { "epoch": 60.08849557522124, "grad_norm": 0.13283856213092804, "learning_rate": 9.99125244765043e-05, "loss": 0.0067, "step": 6790 }, { "epoch": 60.176991150442475, "grad_norm": 0.11176202446222305, "learning_rate": 9.991154410611296e-05, "loss": 0.0067, "step": 6800 }, { "epoch": 60.26548672566372, "grad_norm": 0.11676081269979477, "learning_rate": 9.99105582774702e-05, "loss": 0.0068, "step": 6810 }, { "epoch": 60.35398230088496, "grad_norm": 0.1053016260266304, "learning_rate": 9.990956699068384e-05, "loss": 0.0066, "step": 6820 }, { "epoch": 60.442477876106196, "grad_norm": 0.08537508547306061, "learning_rate": 9.990857024586224e-05, "loss": 0.0061, "step": 6830 }, { "epoch": 60.530973451327434, "grad_norm": 0.08726681023836136, "learning_rate": 9.990756804311446e-05, "loss": 0.0067, "step": 6840 }, { "epoch": 60.61946902654867, "grad_norm": 0.13327188789844513, "learning_rate": 9.990656038255006e-05, "loss": 0.0065, "step": 6850 }, { "epoch": 60.70796460176991, "grad_norm": 0.16907227039337158, "learning_rate": 9.990554726427926e-05, "loss": 0.0066, "step": 6860 }, { "epoch": 60.796460176991154, "grad_norm": 0.12004328519105911, "learning_rate": 9.990452868841284e-05, "loss": 0.0069, "step": 6870 }, { "epoch": 60.88495575221239, "grad_norm": 0.10381284356117249, "learning_rate": 9.99035046550622e-05, "loss": 0.0066, "step": 6880 }, { "epoch": 60.97345132743363, "grad_norm": 0.11252574622631073, "learning_rate": 9.99024751643393e-05, "loss": 0.0065, "step": 6890 }, { "epoch": 61.06194690265487, "grad_norm": 0.14611081779003143, "learning_rate": 9.990144021635677e-05, "loss": 0.0067, "step": 6900 }, { "epoch": 61.150442477876105, "grad_norm": 0.09365980327129364, "learning_rate": 9.990039981122775e-05, "loss": 0.0068, "step": 6910 }, { "epoch": 61.23893805309734, "grad_norm": 0.13181225955486298, "learning_rate": 9.989935394906602e-05, "loss": 0.0068, "step": 6920 }, { "epoch": 61.32743362831859, "grad_norm": 0.1608838140964508, "learning_rate": 9.989830262998598e-05, "loss": 0.0067, "step": 6930 }, { "epoch": 61.415929203539825, "grad_norm": 0.14466796815395355, "learning_rate": 9.989724585410259e-05, "loss": 0.0068, "step": 6940 }, { "epoch": 61.50442477876106, "grad_norm": 0.15593942999839783, "learning_rate": 9.989618362153139e-05, "loss": 0.0067, "step": 6950 }, { "epoch": 61.5929203539823, "grad_norm": 0.13513709604740143, "learning_rate": 9.989511593238859e-05, "loss": 0.0072, "step": 6960 }, { "epoch": 61.68141592920354, "grad_norm": 0.10657244175672531, "learning_rate": 9.98940427867909e-05, "loss": 0.0067, "step": 6970 }, { "epoch": 61.769911504424776, "grad_norm": 0.11525707691907883, "learning_rate": 9.989296418485573e-05, "loss": 0.0067, "step": 6980 }, { "epoch": 61.85840707964602, "grad_norm": 0.10681252926588058, "learning_rate": 9.989188012670101e-05, "loss": 0.0063, "step": 6990 }, { "epoch": 61.94690265486726, "grad_norm": 0.1459273248910904, "learning_rate": 9.989079061244528e-05, "loss": 0.0065, "step": 7000 }, { "epoch": 62.0353982300885, "grad_norm": 0.1311957687139511, "learning_rate": 9.988969564220769e-05, "loss": 0.0068, "step": 7010 }, { "epoch": 62.123893805309734, "grad_norm": 0.10953950136899948, "learning_rate": 9.988859521610801e-05, "loss": 0.0074, "step": 7020 }, { "epoch": 62.21238938053097, "grad_norm": 0.13867156207561493, "learning_rate": 9.988748933426656e-05, "loss": 0.0069, "step": 7030 }, { "epoch": 62.30088495575221, "grad_norm": 0.1676408052444458, "learning_rate": 9.988637799680428e-05, "loss": 0.007, "step": 7040 }, { "epoch": 62.389380530973455, "grad_norm": 0.18464261293411255, "learning_rate": 9.98852612038427e-05, "loss": 0.0076, "step": 7050 }, { "epoch": 62.47787610619469, "grad_norm": 0.13487857580184937, "learning_rate": 9.988413895550397e-05, "loss": 0.0076, "step": 7060 }, { "epoch": 62.56637168141593, "grad_norm": 0.1034436747431755, "learning_rate": 9.98830112519108e-05, "loss": 0.0069, "step": 7070 }, { "epoch": 62.65486725663717, "grad_norm": 0.09467529505491257, "learning_rate": 9.98818780931865e-05, "loss": 0.0067, "step": 7080 }, { "epoch": 62.743362831858406, "grad_norm": 0.11496268212795258, "learning_rate": 9.988073947945502e-05, "loss": 0.0069, "step": 7090 }, { "epoch": 62.83185840707964, "grad_norm": 0.13266722857952118, "learning_rate": 9.987959541084087e-05, "loss": 0.0066, "step": 7100 }, { "epoch": 62.92035398230089, "grad_norm": 0.12566514313220978, "learning_rate": 9.987844588746915e-05, "loss": 0.0066, "step": 7110 }, { "epoch": 63.008849557522126, "grad_norm": 0.1609344482421875, "learning_rate": 9.987729090946558e-05, "loss": 0.007, "step": 7120 }, { "epoch": 63.097345132743364, "grad_norm": 0.15092191100120544, "learning_rate": 9.987613047695647e-05, "loss": 0.0067, "step": 7130 }, { "epoch": 63.1858407079646, "grad_norm": 0.13600541651248932, "learning_rate": 9.987496459006871e-05, "loss": 0.0064, "step": 7140 }, { "epoch": 63.27433628318584, "grad_norm": 0.12328796088695526, "learning_rate": 9.987379324892982e-05, "loss": 0.0065, "step": 7150 }, { "epoch": 63.36283185840708, "grad_norm": 0.1525561362504959, "learning_rate": 9.987261645366788e-05, "loss": 0.0067, "step": 7160 }, { "epoch": 63.45132743362832, "grad_norm": 0.12145034223794937, "learning_rate": 9.987143420441158e-05, "loss": 0.0067, "step": 7170 }, { "epoch": 63.53982300884956, "grad_norm": 0.11069979518651962, "learning_rate": 9.987024650129022e-05, "loss": 0.0068, "step": 7180 }, { "epoch": 63.6283185840708, "grad_norm": 0.11291412264108658, "learning_rate": 9.986905334443368e-05, "loss": 0.0069, "step": 7190 }, { "epoch": 63.716814159292035, "grad_norm": 0.0968683585524559, "learning_rate": 9.986785473397245e-05, "loss": 0.0061, "step": 7200 }, { "epoch": 63.80530973451327, "grad_norm": 0.10504832863807678, "learning_rate": 9.98666506700376e-05, "loss": 0.0061, "step": 7210 }, { "epoch": 63.89380530973451, "grad_norm": 0.09243866056203842, "learning_rate": 9.986544115276081e-05, "loss": 0.0062, "step": 7220 }, { "epoch": 63.982300884955755, "grad_norm": 0.08823885023593903, "learning_rate": 9.986422618227433e-05, "loss": 0.006, "step": 7230 }, { "epoch": 64.070796460177, "grad_norm": 0.10525842010974884, "learning_rate": 9.986300575871106e-05, "loss": 0.0062, "step": 7240 }, { "epoch": 64.15929203539822, "grad_norm": 0.11518089473247528, "learning_rate": 9.986177988220444e-05, "loss": 0.0058, "step": 7250 }, { "epoch": 64.24778761061947, "grad_norm": 0.10956887900829315, "learning_rate": 9.986054855288856e-05, "loss": 0.0058, "step": 7260 }, { "epoch": 64.33628318584071, "grad_norm": 0.1005554124712944, "learning_rate": 9.985931177089802e-05, "loss": 0.0064, "step": 7270 }, { "epoch": 64.42477876106194, "grad_norm": 0.14025098085403442, "learning_rate": 9.985806953636814e-05, "loss": 0.0063, "step": 7280 }, { "epoch": 64.51327433628319, "grad_norm": 0.11949311941862106, "learning_rate": 9.985682184943471e-05, "loss": 0.0071, "step": 7290 }, { "epoch": 64.60176991150442, "grad_norm": 0.11547757685184479, "learning_rate": 9.98555687102342e-05, "loss": 0.0059, "step": 7300 }, { "epoch": 64.69026548672566, "grad_norm": 0.10911315679550171, "learning_rate": 9.985431011890367e-05, "loss": 0.0064, "step": 7310 }, { "epoch": 64.77876106194691, "grad_norm": 0.11663784086704254, "learning_rate": 9.985304607558075e-05, "loss": 0.0063, "step": 7320 }, { "epoch": 64.86725663716814, "grad_norm": 0.12374185770750046, "learning_rate": 9.985177658040364e-05, "loss": 0.0067, "step": 7330 }, { "epoch": 64.95575221238938, "grad_norm": 0.11252603679895401, "learning_rate": 9.985050163351119e-05, "loss": 0.006, "step": 7340 }, { "epoch": 65.04424778761062, "grad_norm": 0.12345760315656662, "learning_rate": 9.984922123504286e-05, "loss": 0.0067, "step": 7350 }, { "epoch": 65.13274336283186, "grad_norm": 0.10149062424898148, "learning_rate": 9.984793538513862e-05, "loss": 0.0072, "step": 7360 }, { "epoch": 65.22123893805309, "grad_norm": 0.13657146692276, "learning_rate": 9.984664408393912e-05, "loss": 0.0063, "step": 7370 }, { "epoch": 65.30973451327434, "grad_norm": 0.11091844737529755, "learning_rate": 9.984534733158556e-05, "loss": 0.0069, "step": 7380 }, { "epoch": 65.39823008849558, "grad_norm": 0.09856518357992172, "learning_rate": 9.984404512821977e-05, "loss": 0.0065, "step": 7390 }, { "epoch": 65.48672566371681, "grad_norm": 0.1332857757806778, "learning_rate": 9.984273747398411e-05, "loss": 0.0061, "step": 7400 }, { "epoch": 65.57522123893806, "grad_norm": 0.11882749944925308, "learning_rate": 9.984142436902165e-05, "loss": 0.0072, "step": 7410 }, { "epoch": 65.66371681415929, "grad_norm": 0.11369358748197556, "learning_rate": 9.984010581347596e-05, "loss": 0.0061, "step": 7420 }, { "epoch": 65.75221238938053, "grad_norm": 0.13496249914169312, "learning_rate": 9.983878180749121e-05, "loss": 0.0063, "step": 7430 }, { "epoch": 65.84070796460178, "grad_norm": 0.12470993399620056, "learning_rate": 9.983745235121222e-05, "loss": 0.0062, "step": 7440 }, { "epoch": 65.929203539823, "grad_norm": 0.09955979138612747, "learning_rate": 9.983611744478438e-05, "loss": 0.0069, "step": 7450 }, { "epoch": 66.01769911504425, "grad_norm": 0.13134808838367462, "learning_rate": 9.983477708835365e-05, "loss": 0.0066, "step": 7460 }, { "epoch": 66.10619469026548, "grad_norm": 0.10295764356851578, "learning_rate": 9.983343128206664e-05, "loss": 0.0059, "step": 7470 }, { "epoch": 66.19469026548673, "grad_norm": 0.12271054089069366, "learning_rate": 9.983208002607049e-05, "loss": 0.0062, "step": 7480 }, { "epoch": 66.28318584070796, "grad_norm": 0.15199494361877441, "learning_rate": 9.9830723320513e-05, "loss": 0.0064, "step": 7490 }, { "epoch": 66.3716814159292, "grad_norm": 0.10865994542837143, "learning_rate": 9.982936116554254e-05, "loss": 0.0065, "step": 7500 }, { "epoch": 66.46017699115045, "grad_norm": 0.09021376818418503, "learning_rate": 9.982799356130803e-05, "loss": 0.0067, "step": 7510 }, { "epoch": 66.54867256637168, "grad_norm": 0.16522467136383057, "learning_rate": 9.982662050795908e-05, "loss": 0.0059, "step": 7520 }, { "epoch": 66.63716814159292, "grad_norm": 0.10417914390563965, "learning_rate": 9.982524200564583e-05, "loss": 0.0063, "step": 7530 }, { "epoch": 66.72566371681415, "grad_norm": 0.1503831446170807, "learning_rate": 9.982385805451901e-05, "loss": 0.0065, "step": 7540 }, { "epoch": 66.8141592920354, "grad_norm": 0.11018391698598862, "learning_rate": 9.982246865472998e-05, "loss": 0.0059, "step": 7550 }, { "epoch": 66.90265486725664, "grad_norm": 0.11249592155218124, "learning_rate": 9.982107380643069e-05, "loss": 0.0065, "step": 7560 }, { "epoch": 66.99115044247787, "grad_norm": 0.10396026074886322, "learning_rate": 9.981967350977368e-05, "loss": 0.0066, "step": 7570 }, { "epoch": 67.07964601769912, "grad_norm": 0.13432832062244415, "learning_rate": 9.981826776491208e-05, "loss": 0.0062, "step": 7580 }, { "epoch": 67.16814159292035, "grad_norm": 0.11561453342437744, "learning_rate": 9.98168565719996e-05, "loss": 0.0066, "step": 7590 }, { "epoch": 67.2566371681416, "grad_norm": 0.10294978320598602, "learning_rate": 9.98154399311906e-05, "loss": 0.0061, "step": 7600 }, { "epoch": 67.34513274336283, "grad_norm": 0.09747711569070816, "learning_rate": 9.981401784263997e-05, "loss": 0.0067, "step": 7610 }, { "epoch": 67.43362831858407, "grad_norm": 0.10972411185503006, "learning_rate": 9.981259030650326e-05, "loss": 0.0063, "step": 7620 }, { "epoch": 67.52212389380531, "grad_norm": 0.11169084161520004, "learning_rate": 9.981115732293655e-05, "loss": 0.006, "step": 7630 }, { "epoch": 67.61061946902655, "grad_norm": 0.10598007589578629, "learning_rate": 9.980971889209659e-05, "loss": 0.0063, "step": 7640 }, { "epoch": 67.69911504424779, "grad_norm": 0.11927840113639832, "learning_rate": 9.980827501414064e-05, "loss": 0.0059, "step": 7650 }, { "epoch": 67.78761061946902, "grad_norm": 0.10992605984210968, "learning_rate": 9.980682568922663e-05, "loss": 0.0066, "step": 7660 }, { "epoch": 67.87610619469027, "grad_norm": 0.13199156522750854, "learning_rate": 9.980537091751304e-05, "loss": 0.0066, "step": 7670 }, { "epoch": 67.96460176991151, "grad_norm": 0.1461779773235321, "learning_rate": 9.980391069915897e-05, "loss": 0.0064, "step": 7680 }, { "epoch": 68.05309734513274, "grad_norm": 0.14791648089885712, "learning_rate": 9.98024450343241e-05, "loss": 0.0064, "step": 7690 }, { "epoch": 68.14159292035399, "grad_norm": 0.14366358518600464, "learning_rate": 9.980097392316872e-05, "loss": 0.0068, "step": 7700 }, { "epoch": 68.23008849557522, "grad_norm": 0.1535428911447525, "learning_rate": 9.97994973658537e-05, "loss": 0.0059, "step": 7710 }, { "epoch": 68.31858407079646, "grad_norm": 0.1114254891872406, "learning_rate": 9.979801536254054e-05, "loss": 0.0062, "step": 7720 }, { "epoch": 68.40707964601769, "grad_norm": 0.10827835649251938, "learning_rate": 9.979652791339127e-05, "loss": 0.0064, "step": 7730 }, { "epoch": 68.49557522123894, "grad_norm": 0.1307893693447113, "learning_rate": 9.97950350185686e-05, "loss": 0.007, "step": 7740 }, { "epoch": 68.58407079646018, "grad_norm": 0.09643508493900299, "learning_rate": 9.979353667823574e-05, "loss": 0.0059, "step": 7750 }, { "epoch": 68.67256637168141, "grad_norm": 0.1439676731824875, "learning_rate": 9.979203289255658e-05, "loss": 0.0056, "step": 7760 }, { "epoch": 68.76106194690266, "grad_norm": 0.10104666650295258, "learning_rate": 9.979052366169557e-05, "loss": 0.006, "step": 7770 }, { "epoch": 68.84955752212389, "grad_norm": 0.1201610267162323, "learning_rate": 9.978900898581775e-05, "loss": 0.0055, "step": 7780 }, { "epoch": 68.93805309734513, "grad_norm": 0.11689028143882751, "learning_rate": 9.978748886508875e-05, "loss": 0.0062, "step": 7790 }, { "epoch": 69.02654867256638, "grad_norm": 0.12445169687271118, "learning_rate": 9.978596329967484e-05, "loss": 0.0063, "step": 7800 }, { "epoch": 69.11504424778761, "grad_norm": 0.10243967920541763, "learning_rate": 9.978443228974284e-05, "loss": 0.0066, "step": 7810 }, { "epoch": 69.20353982300885, "grad_norm": 0.1291644275188446, "learning_rate": 9.978289583546015e-05, "loss": 0.007, "step": 7820 }, { "epoch": 69.29203539823008, "grad_norm": 0.09051987528800964, "learning_rate": 9.978135393699484e-05, "loss": 0.0064, "step": 7830 }, { "epoch": 69.38053097345133, "grad_norm": 0.10411614924669266, "learning_rate": 9.977980659451548e-05, "loss": 0.0066, "step": 7840 }, { "epoch": 69.46902654867256, "grad_norm": 0.16520874202251434, "learning_rate": 9.977825380819135e-05, "loss": 0.007, "step": 7850 }, { "epoch": 69.5575221238938, "grad_norm": 0.10322892665863037, "learning_rate": 9.97766955781922e-05, "loss": 0.007, "step": 7860 }, { "epoch": 69.64601769911505, "grad_norm": 0.10508093237876892, "learning_rate": 9.977513190468848e-05, "loss": 0.0059, "step": 7870 }, { "epoch": 69.73451327433628, "grad_norm": 0.1304185390472412, "learning_rate": 9.977356278785116e-05, "loss": 0.0061, "step": 7880 }, { "epoch": 69.82300884955752, "grad_norm": 0.1335097700357437, "learning_rate": 9.977198822785184e-05, "loss": 0.0073, "step": 7890 }, { "epoch": 69.91150442477876, "grad_norm": 0.10650800913572311, "learning_rate": 9.977040822486273e-05, "loss": 0.006, "step": 7900 }, { "epoch": 70.0, "grad_norm": 0.12559041380882263, "learning_rate": 9.97688227790566e-05, "loss": 0.0066, "step": 7910 }, { "epoch": 70.08849557522124, "grad_norm": 0.11108136922121048, "learning_rate": 9.976723189060684e-05, "loss": 0.0064, "step": 7920 }, { "epoch": 70.17699115044248, "grad_norm": 0.11059076339006424, "learning_rate": 9.976563555968742e-05, "loss": 0.0063, "step": 7930 }, { "epoch": 70.26548672566372, "grad_norm": 0.10653873533010483, "learning_rate": 9.976403378647292e-05, "loss": 0.0063, "step": 7940 }, { "epoch": 70.35398230088495, "grad_norm": 0.09831434488296509, "learning_rate": 9.97624265711385e-05, "loss": 0.0058, "step": 7950 }, { "epoch": 70.4424778761062, "grad_norm": 0.11119065433740616, "learning_rate": 9.976081391385993e-05, "loss": 0.0064, "step": 7960 }, { "epoch": 70.53097345132744, "grad_norm": 0.09702420979738235, "learning_rate": 9.975919581481356e-05, "loss": 0.0065, "step": 7970 }, { "epoch": 70.61946902654867, "grad_norm": 0.127532497048378, "learning_rate": 9.975757227417634e-05, "loss": 0.0067, "step": 7980 }, { "epoch": 70.70796460176992, "grad_norm": 0.12315838038921356, "learning_rate": 9.975594329212586e-05, "loss": 0.0064, "step": 7990 }, { "epoch": 70.79646017699115, "grad_norm": 0.09994253516197205, "learning_rate": 9.97543088688402e-05, "loss": 0.0062, "step": 8000 }, { "epoch": 70.88495575221239, "grad_norm": 0.10381423681974411, "learning_rate": 9.975266900449814e-05, "loss": 0.0061, "step": 8010 }, { "epoch": 70.97345132743362, "grad_norm": 0.13495273888111115, "learning_rate": 9.975102369927898e-05, "loss": 0.0066, "step": 8020 }, { "epoch": 71.06194690265487, "grad_norm": 0.12326859682798386, "learning_rate": 9.974937295336269e-05, "loss": 0.0065, "step": 8030 }, { "epoch": 71.15044247787611, "grad_norm": 0.09616561233997345, "learning_rate": 9.974771676692975e-05, "loss": 0.0061, "step": 8040 }, { "epoch": 71.23893805309734, "grad_norm": 0.12078515440225601, "learning_rate": 9.974605514016131e-05, "loss": 0.0061, "step": 8050 }, { "epoch": 71.32743362831859, "grad_norm": 0.11471603810787201, "learning_rate": 9.974438807323907e-05, "loss": 0.0064, "step": 8060 }, { "epoch": 71.41592920353982, "grad_norm": 0.10006176680326462, "learning_rate": 9.974271556634535e-05, "loss": 0.0061, "step": 8070 }, { "epoch": 71.50442477876106, "grad_norm": 0.08977751433849335, "learning_rate": 9.974103761966302e-05, "loss": 0.0056, "step": 8080 }, { "epoch": 71.59292035398231, "grad_norm": 0.10365013033151627, "learning_rate": 9.973935423337563e-05, "loss": 0.0058, "step": 8090 }, { "epoch": 71.68141592920354, "grad_norm": 0.11227709800004959, "learning_rate": 9.973766540766722e-05, "loss": 0.006, "step": 8100 }, { "epoch": 71.76991150442478, "grad_norm": 0.1021062508225441, "learning_rate": 9.97359711427225e-05, "loss": 0.0057, "step": 8110 }, { "epoch": 71.85840707964601, "grad_norm": 0.09944093972444534, "learning_rate": 9.973427143872677e-05, "loss": 0.0064, "step": 8120 }, { "epoch": 71.94690265486726, "grad_norm": 0.10755596309900284, "learning_rate": 9.973256629586589e-05, "loss": 0.0052, "step": 8130 }, { "epoch": 72.03539823008849, "grad_norm": 0.12734898924827576, "learning_rate": 9.973085571432632e-05, "loss": 0.0065, "step": 8140 }, { "epoch": 72.12389380530973, "grad_norm": 0.12120959162712097, "learning_rate": 9.972913969429513e-05, "loss": 0.0057, "step": 8150 }, { "epoch": 72.21238938053098, "grad_norm": 0.10789244621992111, "learning_rate": 9.972741823596e-05, "loss": 0.006, "step": 8160 }, { "epoch": 72.30088495575221, "grad_norm": 0.09750421345233917, "learning_rate": 9.972569133950917e-05, "loss": 0.0063, "step": 8170 }, { "epoch": 72.38938053097345, "grad_norm": 0.07650677859783173, "learning_rate": 9.972395900513151e-05, "loss": 0.0063, "step": 8180 }, { "epoch": 72.47787610619469, "grad_norm": 0.08634928613901138, "learning_rate": 9.972222123301645e-05, "loss": 0.0062, "step": 8190 }, { "epoch": 72.56637168141593, "grad_norm": 0.10441482812166214, "learning_rate": 9.972047802335403e-05, "loss": 0.0058, "step": 8200 }, { "epoch": 72.65486725663717, "grad_norm": 0.10933029651641846, "learning_rate": 9.971872937633488e-05, "loss": 0.0055, "step": 8210 }, { "epoch": 72.7433628318584, "grad_norm": 0.07779169827699661, "learning_rate": 9.971697529215024e-05, "loss": 0.006, "step": 8220 }, { "epoch": 72.83185840707965, "grad_norm": 0.09137412160634995, "learning_rate": 9.971521577099192e-05, "loss": 0.0064, "step": 8230 }, { "epoch": 72.92035398230088, "grad_norm": 0.10819313675165176, "learning_rate": 9.971345081305236e-05, "loss": 0.0062, "step": 8240 }, { "epoch": 73.00884955752213, "grad_norm": 0.1135842502117157, "learning_rate": 9.971168041852456e-05, "loss": 0.0062, "step": 8250 }, { "epoch": 73.09734513274336, "grad_norm": 0.11758588254451752, "learning_rate": 9.970990458760215e-05, "loss": 0.0061, "step": 8260 }, { "epoch": 73.1858407079646, "grad_norm": 0.11972816288471222, "learning_rate": 9.970812332047929e-05, "loss": 0.0059, "step": 8270 }, { "epoch": 73.27433628318585, "grad_norm": 0.1334552764892578, "learning_rate": 9.97063366173508e-05, "loss": 0.0063, "step": 8280 }, { "epoch": 73.36283185840708, "grad_norm": 0.13754241168498993, "learning_rate": 9.970454447841207e-05, "loss": 0.006, "step": 8290 }, { "epoch": 73.45132743362832, "grad_norm": 0.14387445151805878, "learning_rate": 9.970274690385909e-05, "loss": 0.0064, "step": 8300 }, { "epoch": 73.53982300884955, "grad_norm": 0.1262485235929489, "learning_rate": 9.970094389388844e-05, "loss": 0.0061, "step": 8310 }, { "epoch": 73.6283185840708, "grad_norm": 0.11576636880636215, "learning_rate": 9.969913544869728e-05, "loss": 0.0058, "step": 8320 }, { "epoch": 73.71681415929204, "grad_norm": 0.1376902163028717, "learning_rate": 9.96973215684834e-05, "loss": 0.0062, "step": 8330 }, { "epoch": 73.80530973451327, "grad_norm": 0.15589570999145508, "learning_rate": 9.969550225344513e-05, "loss": 0.0062, "step": 8340 }, { "epoch": 73.89380530973452, "grad_norm": 0.10897907614707947, "learning_rate": 9.969367750378147e-05, "loss": 0.0059, "step": 8350 }, { "epoch": 73.98230088495575, "grad_norm": 0.09046350419521332, "learning_rate": 9.969184731969194e-05, "loss": 0.0058, "step": 8360 }, { "epoch": 74.070796460177, "grad_norm": 0.09479265660047531, "learning_rate": 9.96900117013767e-05, "loss": 0.0059, "step": 8370 }, { "epoch": 74.15929203539822, "grad_norm": 0.1085423082113266, "learning_rate": 9.96881706490365e-05, "loss": 0.006, "step": 8380 }, { "epoch": 74.24778761061947, "grad_norm": 0.12503911554813385, "learning_rate": 9.968632416287265e-05, "loss": 0.0058, "step": 8390 }, { "epoch": 74.33628318584071, "grad_norm": 0.11865498870611191, "learning_rate": 9.96844722430871e-05, "loss": 0.0057, "step": 8400 }, { "epoch": 74.42477876106194, "grad_norm": 0.1037907749414444, "learning_rate": 9.968261488988235e-05, "loss": 0.0064, "step": 8410 }, { "epoch": 74.51327433628319, "grad_norm": 0.10178150236606598, "learning_rate": 9.968075210346155e-05, "loss": 0.0057, "step": 8420 }, { "epoch": 74.60176991150442, "grad_norm": 0.1001485213637352, "learning_rate": 9.967888388402839e-05, "loss": 0.0061, "step": 8430 }, { "epoch": 74.69026548672566, "grad_norm": 0.11811663210391998, "learning_rate": 9.967701023178717e-05, "loss": 0.006, "step": 8440 }, { "epoch": 74.77876106194691, "grad_norm": 0.12607654929161072, "learning_rate": 9.967513114694282e-05, "loss": 0.0059, "step": 8450 }, { "epoch": 74.86725663716814, "grad_norm": 0.1423610895872116, "learning_rate": 9.967324662970079e-05, "loss": 0.0059, "step": 8460 }, { "epoch": 74.95575221238938, "grad_norm": 0.12265769392251968, "learning_rate": 9.96713566802672e-05, "loss": 0.0059, "step": 8470 }, { "epoch": 75.04424778761062, "grad_norm": 0.08160112053155899, "learning_rate": 9.966946129884873e-05, "loss": 0.0061, "step": 8480 }, { "epoch": 75.13274336283186, "grad_norm": 0.09482406079769135, "learning_rate": 9.966756048565265e-05, "loss": 0.0059, "step": 8490 }, { "epoch": 75.22123893805309, "grad_norm": 0.10568396747112274, "learning_rate": 9.966565424088681e-05, "loss": 0.0061, "step": 8500 }, { "epoch": 75.30973451327434, "grad_norm": 0.10592056810855865, "learning_rate": 9.96637425647597e-05, "loss": 0.0064, "step": 8510 }, { "epoch": 75.39823008849558, "grad_norm": 0.09835847467184067, "learning_rate": 9.966182545748038e-05, "loss": 0.0057, "step": 8520 }, { "epoch": 75.48672566371681, "grad_norm": 0.11802490055561066, "learning_rate": 9.96599029192585e-05, "loss": 0.0065, "step": 8530 }, { "epoch": 75.57522123893806, "grad_norm": 0.12815123796463013, "learning_rate": 9.965797495030428e-05, "loss": 0.0064, "step": 8540 }, { "epoch": 75.66371681415929, "grad_norm": 0.09517830610275269, "learning_rate": 9.96560415508286e-05, "loss": 0.0062, "step": 8550 }, { "epoch": 75.75221238938053, "grad_norm": 0.08576002717018127, "learning_rate": 9.965410272104286e-05, "loss": 0.0059, "step": 8560 }, { "epoch": 75.84070796460178, "grad_norm": 0.09037141501903534, "learning_rate": 9.96521584611591e-05, "loss": 0.0062, "step": 8570 }, { "epoch": 75.929203539823, "grad_norm": 0.11598179489374161, "learning_rate": 9.965020877138994e-05, "loss": 0.0058, "step": 8580 }, { "epoch": 76.01769911504425, "grad_norm": 0.1054382398724556, "learning_rate": 9.964825365194861e-05, "loss": 0.0064, "step": 8590 }, { "epoch": 76.10619469026548, "grad_norm": 0.13920103013515472, "learning_rate": 9.96462931030489e-05, "loss": 0.006, "step": 8600 }, { "epoch": 76.19469026548673, "grad_norm": 0.08667236566543579, "learning_rate": 9.96443271249052e-05, "loss": 0.0061, "step": 8610 }, { "epoch": 76.28318584070796, "grad_norm": 0.09789692610502243, "learning_rate": 9.964235571773255e-05, "loss": 0.0064, "step": 8620 }, { "epoch": 76.3716814159292, "grad_norm": 0.09337849915027618, "learning_rate": 9.96403788817465e-05, "loss": 0.0057, "step": 8630 }, { "epoch": 76.46017699115045, "grad_norm": 0.08435383439064026, "learning_rate": 9.963839661716325e-05, "loss": 0.0062, "step": 8640 }, { "epoch": 76.54867256637168, "grad_norm": 0.1057974249124527, "learning_rate": 9.963640892419958e-05, "loss": 0.0061, "step": 8650 }, { "epoch": 76.63716814159292, "grad_norm": 0.11115610599517822, "learning_rate": 9.963441580307286e-05, "loss": 0.0064, "step": 8660 }, { "epoch": 76.72566371681415, "grad_norm": 0.10767662525177002, "learning_rate": 9.963241725400104e-05, "loss": 0.0063, "step": 8670 }, { "epoch": 76.8141592920354, "grad_norm": 0.12147220969200134, "learning_rate": 9.963041327720271e-05, "loss": 0.0059, "step": 8680 }, { "epoch": 76.90265486725664, "grad_norm": 0.11704272031784058, "learning_rate": 9.962840387289697e-05, "loss": 0.0063, "step": 8690 }, { "epoch": 76.99115044247787, "grad_norm": 0.10685360431671143, "learning_rate": 9.962638904130363e-05, "loss": 0.0063, "step": 8700 }, { "epoch": 77.07964601769912, "grad_norm": 0.12996336817741394, "learning_rate": 9.962436878264298e-05, "loss": 0.0062, "step": 8710 }, { "epoch": 77.16814159292035, "grad_norm": 0.10194937884807587, "learning_rate": 9.962234309713598e-05, "loss": 0.0061, "step": 8720 }, { "epoch": 77.2566371681416, "grad_norm": 0.09662755578756332, "learning_rate": 9.962031198500414e-05, "loss": 0.0062, "step": 8730 }, { "epoch": 77.34513274336283, "grad_norm": 0.09166355431079865, "learning_rate": 9.961827544646958e-05, "loss": 0.0061, "step": 8740 }, { "epoch": 77.43362831858407, "grad_norm": 0.10736475884914398, "learning_rate": 9.961623348175501e-05, "loss": 0.0063, "step": 8750 }, { "epoch": 77.52212389380531, "grad_norm": 0.0980466902256012, "learning_rate": 9.961418609108377e-05, "loss": 0.0059, "step": 8760 }, { "epoch": 77.61061946902655, "grad_norm": 0.09984292089939117, "learning_rate": 9.961213327467971e-05, "loss": 0.0055, "step": 8770 }, { "epoch": 77.69911504424779, "grad_norm": 0.13165168464183807, "learning_rate": 9.961007503276736e-05, "loss": 0.0059, "step": 8780 }, { "epoch": 77.78761061946902, "grad_norm": 0.13526293635368347, "learning_rate": 9.960801136557179e-05, "loss": 0.0063, "step": 8790 }, { "epoch": 77.87610619469027, "grad_norm": 0.12890978157520294, "learning_rate": 9.960594227331866e-05, "loss": 0.0058, "step": 8800 }, { "epoch": 77.96460176991151, "grad_norm": 0.10762617737054825, "learning_rate": 9.960386775623429e-05, "loss": 0.006, "step": 8810 }, { "epoch": 78.05309734513274, "grad_norm": 0.12391962856054306, "learning_rate": 9.96017878145455e-05, "loss": 0.0067, "step": 8820 }, { "epoch": 78.14159292035399, "grad_norm": 0.11198613792657852, "learning_rate": 9.959970244847977e-05, "loss": 0.0057, "step": 8830 }, { "epoch": 78.23008849557522, "grad_norm": 0.12982113659381866, "learning_rate": 9.959761165826518e-05, "loss": 0.0061, "step": 8840 }, { "epoch": 78.31858407079646, "grad_norm": 0.12969951331615448, "learning_rate": 9.959551544413033e-05, "loss": 0.0062, "step": 8850 }, { "epoch": 78.40707964601769, "grad_norm": 0.1363687664270401, "learning_rate": 9.959341380630448e-05, "loss": 0.0058, "step": 8860 }, { "epoch": 78.49557522123894, "grad_norm": 0.1254795491695404, "learning_rate": 9.959130674501746e-05, "loss": 0.0058, "step": 8870 }, { "epoch": 78.58407079646018, "grad_norm": 0.07944416999816895, "learning_rate": 9.958919426049968e-05, "loss": 0.0058, "step": 8880 }, { "epoch": 78.67256637168141, "grad_norm": 0.10879120975732803, "learning_rate": 9.958707635298219e-05, "loss": 0.0056, "step": 8890 }, { "epoch": 78.76106194690266, "grad_norm": 0.12182161211967468, "learning_rate": 9.958495302269657e-05, "loss": 0.0061, "step": 8900 }, { "epoch": 78.84955752212389, "grad_norm": 0.1068938598036766, "learning_rate": 9.958282426987503e-05, "loss": 0.0062, "step": 8910 }, { "epoch": 78.93805309734513, "grad_norm": 0.11309578269720078, "learning_rate": 9.95806900947504e-05, "loss": 0.0057, "step": 8920 }, { "epoch": 79.02654867256638, "grad_norm": 0.09065814316272736, "learning_rate": 9.957855049755604e-05, "loss": 0.0055, "step": 8930 }, { "epoch": 79.11504424778761, "grad_norm": 0.07722936570644379, "learning_rate": 9.957640547852593e-05, "loss": 0.006, "step": 8940 }, { "epoch": 79.20353982300885, "grad_norm": 0.10858945548534393, "learning_rate": 9.957425503789466e-05, "loss": 0.006, "step": 8950 }, { "epoch": 79.29203539823008, "grad_norm": 0.11869582533836365, "learning_rate": 9.957209917589738e-05, "loss": 0.0053, "step": 8960 }, { "epoch": 79.38053097345133, "grad_norm": 0.1038854718208313, "learning_rate": 9.956993789276987e-05, "loss": 0.0053, "step": 8970 }, { "epoch": 79.46902654867256, "grad_norm": 0.10900082439184189, "learning_rate": 9.956777118874847e-05, "loss": 0.0056, "step": 8980 }, { "epoch": 79.5575221238938, "grad_norm": 0.0852367952466011, "learning_rate": 9.956559906407016e-05, "loss": 0.0058, "step": 8990 }, { "epoch": 79.64601769911505, "grad_norm": 0.10679316520690918, "learning_rate": 9.956342151897245e-05, "loss": 0.0054, "step": 9000 }, { "epoch": 79.73451327433628, "grad_norm": 0.08642175048589706, "learning_rate": 9.956123855369346e-05, "loss": 0.0062, "step": 9010 }, { "epoch": 79.82300884955752, "grad_norm": 0.1008024737238884, "learning_rate": 9.955905016847196e-05, "loss": 0.006, "step": 9020 }, { "epoch": 79.91150442477876, "grad_norm": 0.08043470978736877, "learning_rate": 9.955685636354723e-05, "loss": 0.0056, "step": 9030 }, { "epoch": 80.0, "grad_norm": 0.08366107195615768, "learning_rate": 9.95546571391592e-05, "loss": 0.0057, "step": 9040 }, { "epoch": 80.08849557522124, "grad_norm": 0.1145307868719101, "learning_rate": 9.955245249554837e-05, "loss": 0.0061, "step": 9050 }, { "epoch": 80.17699115044248, "grad_norm": 0.09086012840270996, "learning_rate": 9.955024243295582e-05, "loss": 0.0057, "step": 9060 }, { "epoch": 80.26548672566372, "grad_norm": 0.09050711244344711, "learning_rate": 9.954802695162328e-05, "loss": 0.0061, "step": 9070 }, { "epoch": 80.35398230088495, "grad_norm": 0.1267324984073639, "learning_rate": 9.954580605179302e-05, "loss": 0.0057, "step": 9080 }, { "epoch": 80.4424778761062, "grad_norm": 0.10124458372592926, "learning_rate": 9.954357973370788e-05, "loss": 0.006, "step": 9090 }, { "epoch": 80.53097345132744, "grad_norm": 0.10128167271614075, "learning_rate": 9.954134799761135e-05, "loss": 0.0061, "step": 9100 }, { "epoch": 80.61946902654867, "grad_norm": 0.10916615277528763, "learning_rate": 9.953911084374748e-05, "loss": 0.0064, "step": 9110 }, { "epoch": 80.70796460176992, "grad_norm": 0.11554095894098282, "learning_rate": 9.953686827236093e-05, "loss": 0.0055, "step": 9120 }, { "epoch": 80.79646017699115, "grad_norm": 0.1443350613117218, "learning_rate": 9.953462028369695e-05, "loss": 0.0062, "step": 9130 }, { "epoch": 80.88495575221239, "grad_norm": 0.09688980877399445, "learning_rate": 9.953236687800136e-05, "loss": 0.0062, "step": 9140 }, { "epoch": 80.97345132743362, "grad_norm": 0.08535933494567871, "learning_rate": 9.95301080555206e-05, "loss": 0.0066, "step": 9150 }, { "epoch": 81.06194690265487, "grad_norm": 0.08409234136343002, "learning_rate": 9.952784381650171e-05, "loss": 0.006, "step": 9160 }, { "epoch": 81.15044247787611, "grad_norm": 0.12123929709196091, "learning_rate": 9.952557416119226e-05, "loss": 0.0061, "step": 9170 }, { "epoch": 81.23893805309734, "grad_norm": 0.08133456110954285, "learning_rate": 9.95232990898405e-05, "loss": 0.0053, "step": 9180 }, { "epoch": 81.32743362831859, "grad_norm": 0.1189822256565094, "learning_rate": 9.95210186026952e-05, "loss": 0.006, "step": 9190 }, { "epoch": 81.41592920353982, "grad_norm": 0.1182321310043335, "learning_rate": 9.951873270000576e-05, "loss": 0.0059, "step": 9200 }, { "epoch": 81.50442477876106, "grad_norm": 0.16531887650489807, "learning_rate": 9.951644138202216e-05, "loss": 0.0059, "step": 9210 }, { "epoch": 81.59292035398231, "grad_norm": 0.09983012825250626, "learning_rate": 9.951414464899498e-05, "loss": 0.0065, "step": 9220 }, { "epoch": 81.68141592920354, "grad_norm": 0.11131998151540756, "learning_rate": 9.951184250117538e-05, "loss": 0.0056, "step": 9230 }, { "epoch": 81.76991150442478, "grad_norm": 0.11074583977460861, "learning_rate": 9.950953493881513e-05, "loss": 0.0059, "step": 9240 }, { "epoch": 81.85840707964601, "grad_norm": 0.11894248425960541, "learning_rate": 9.950722196216658e-05, "loss": 0.0052, "step": 9250 }, { "epoch": 81.94690265486726, "grad_norm": 0.10545102506875992, "learning_rate": 9.950490357148265e-05, "loss": 0.0058, "step": 9260 }, { "epoch": 82.03539823008849, "grad_norm": 0.1360078603029251, "learning_rate": 9.950257976701692e-05, "loss": 0.0061, "step": 9270 }, { "epoch": 82.12389380530973, "grad_norm": 0.06910306215286255, "learning_rate": 9.950025054902348e-05, "loss": 0.0055, "step": 9280 }, { "epoch": 82.21238938053098, "grad_norm": 0.09763657301664352, "learning_rate": 9.949791591775706e-05, "loss": 0.0056, "step": 9290 }, { "epoch": 82.30088495575221, "grad_norm": 0.07559313625097275, "learning_rate": 9.949557587347298e-05, "loss": 0.0051, "step": 9300 }, { "epoch": 82.38938053097345, "grad_norm": 0.07049795985221863, "learning_rate": 9.949323041642713e-05, "loss": 0.0057, "step": 9310 }, { "epoch": 82.47787610619469, "grad_norm": 0.10614913702011108, "learning_rate": 9.949087954687602e-05, "loss": 0.0055, "step": 9320 }, { "epoch": 82.56637168141593, "grad_norm": 0.10284052789211273, "learning_rate": 9.948852326507672e-05, "loss": 0.0059, "step": 9330 }, { "epoch": 82.65486725663717, "grad_norm": 0.12747420370578766, "learning_rate": 9.948616157128694e-05, "loss": 0.0061, "step": 9340 }, { "epoch": 82.7433628318584, "grad_norm": 0.10097663104534149, "learning_rate": 9.948379446576493e-05, "loss": 0.0065, "step": 9350 }, { "epoch": 82.83185840707965, "grad_norm": 0.1118328720331192, "learning_rate": 9.948142194876952e-05, "loss": 0.0058, "step": 9360 }, { "epoch": 82.92035398230088, "grad_norm": 0.10961694270372391, "learning_rate": 9.947904402056024e-05, "loss": 0.0054, "step": 9370 }, { "epoch": 83.00884955752213, "grad_norm": 0.10540442168712616, "learning_rate": 9.947666068139708e-05, "loss": 0.0059, "step": 9380 }, { "epoch": 83.09734513274336, "grad_norm": 0.08544182032346725, "learning_rate": 9.947427193154071e-05, "loss": 0.0056, "step": 9390 }, { "epoch": 83.1858407079646, "grad_norm": 0.09622497111558914, "learning_rate": 9.947187777125233e-05, "loss": 0.006, "step": 9400 }, { "epoch": 83.27433628318585, "grad_norm": 0.07578998804092407, "learning_rate": 9.946947820079377e-05, "loss": 0.0053, "step": 9410 }, { "epoch": 83.36283185840708, "grad_norm": 0.09491860866546631, "learning_rate": 9.946707322042747e-05, "loss": 0.0051, "step": 9420 }, { "epoch": 83.45132743362832, "grad_norm": 0.08115492761135101, "learning_rate": 9.94646628304164e-05, "loss": 0.0052, "step": 9430 }, { "epoch": 83.53982300884955, "grad_norm": 0.1010664626955986, "learning_rate": 9.946224703102418e-05, "loss": 0.0057, "step": 9440 }, { "epoch": 83.6283185840708, "grad_norm": 0.10234019160270691, "learning_rate": 9.945982582251498e-05, "loss": 0.0059, "step": 9450 }, { "epoch": 83.71681415929204, "grad_norm": 0.1145630031824112, "learning_rate": 9.94573992051536e-05, "loss": 0.0057, "step": 9460 }, { "epoch": 83.80530973451327, "grad_norm": 0.10085400193929672, "learning_rate": 9.94549671792054e-05, "loss": 0.0064, "step": 9470 }, { "epoch": 83.89380530973452, "grad_norm": 0.11343911290168762, "learning_rate": 9.945252974493635e-05, "loss": 0.0056, "step": 9480 }, { "epoch": 83.98230088495575, "grad_norm": 0.12302631139755249, "learning_rate": 9.9450086902613e-05, "loss": 0.006, "step": 9490 }, { "epoch": 84.070796460177, "grad_norm": 0.10083619505167007, "learning_rate": 9.944763865250248e-05, "loss": 0.0053, "step": 9500 }, { "epoch": 84.15929203539822, "grad_norm": 0.0939771980047226, "learning_rate": 9.944518499487254e-05, "loss": 0.0065, "step": 9510 }, { "epoch": 84.24778761061947, "grad_norm": 0.09466631710529327, "learning_rate": 9.944272592999151e-05, "loss": 0.006, "step": 9520 }, { "epoch": 84.33628318584071, "grad_norm": 0.10912278294563293, "learning_rate": 9.94402614581283e-05, "loss": 0.0068, "step": 9530 }, { "epoch": 84.42477876106194, "grad_norm": 0.09015228599309921, "learning_rate": 9.943779157955244e-05, "loss": 0.0055, "step": 9540 }, { "epoch": 84.51327433628319, "grad_norm": 0.08392563462257385, "learning_rate": 9.943531629453403e-05, "loss": 0.0055, "step": 9550 }, { "epoch": 84.60176991150442, "grad_norm": 0.09547723829746246, "learning_rate": 9.943283560334375e-05, "loss": 0.0056, "step": 9560 }, { "epoch": 84.69026548672566, "grad_norm": 0.10921365767717361, "learning_rate": 9.943034950625288e-05, "loss": 0.0051, "step": 9570 }, { "epoch": 84.77876106194691, "grad_norm": 0.09915768355131149, "learning_rate": 9.942785800353332e-05, "loss": 0.0063, "step": 9580 }, { "epoch": 84.86725663716814, "grad_norm": 0.127979576587677, "learning_rate": 9.942536109545751e-05, "loss": 0.0055, "step": 9590 }, { "epoch": 84.95575221238938, "grad_norm": 0.12383809685707092, "learning_rate": 9.942285878229853e-05, "loss": 0.0054, "step": 9600 }, { "epoch": 85.04424778761062, "grad_norm": 0.0929873138666153, "learning_rate": 9.942035106433001e-05, "loss": 0.0054, "step": 9610 }, { "epoch": 85.13274336283186, "grad_norm": 0.10625676810741425, "learning_rate": 9.94178379418262e-05, "loss": 0.0053, "step": 9620 }, { "epoch": 85.22123893805309, "grad_norm": 0.07674217224121094, "learning_rate": 9.941531941506194e-05, "loss": 0.0055, "step": 9630 }, { "epoch": 85.30973451327434, "grad_norm": 0.1119031086564064, "learning_rate": 9.941279548431263e-05, "loss": 0.0062, "step": 9640 }, { "epoch": 85.39823008849558, "grad_norm": 0.07825129479169846, "learning_rate": 9.941026614985431e-05, "loss": 0.0055, "step": 9650 }, { "epoch": 85.48672566371681, "grad_norm": 0.09553010016679764, "learning_rate": 9.940773141196357e-05, "loss": 0.0055, "step": 9660 }, { "epoch": 85.57522123893806, "grad_norm": 0.08578657358884811, "learning_rate": 9.94051912709176e-05, "loss": 0.0059, "step": 9670 }, { "epoch": 85.66371681415929, "grad_norm": 0.13160476088523865, "learning_rate": 9.940264572699421e-05, "loss": 0.0055, "step": 9680 }, { "epoch": 85.75221238938053, "grad_norm": 0.09317557513713837, "learning_rate": 9.940009478047174e-05, "loss": 0.0058, "step": 9690 }, { "epoch": 85.84070796460178, "grad_norm": 0.10249894112348557, "learning_rate": 9.939753843162918e-05, "loss": 0.0052, "step": 9700 }, { "epoch": 85.929203539823, "grad_norm": 0.12268459796905518, "learning_rate": 9.939497668074609e-05, "loss": 0.0063, "step": 9710 }, { "epoch": 86.01769911504425, "grad_norm": 0.10745935142040253, "learning_rate": 9.93924095281026e-05, "loss": 0.006, "step": 9720 }, { "epoch": 86.10619469026548, "grad_norm": 0.13324441015720367, "learning_rate": 9.938983697397948e-05, "loss": 0.0054, "step": 9730 }, { "epoch": 86.19469026548673, "grad_norm": 0.12653110921382904, "learning_rate": 9.938725901865805e-05, "loss": 0.0057, "step": 9740 }, { "epoch": 86.28318584070796, "grad_norm": 0.1267603188753128, "learning_rate": 9.93846756624202e-05, "loss": 0.0053, "step": 9750 }, { "epoch": 86.3716814159292, "grad_norm": 0.1272428333759308, "learning_rate": 9.938208690554849e-05, "loss": 0.0062, "step": 9760 }, { "epoch": 86.46017699115045, "grad_norm": 0.11038095504045486, "learning_rate": 9.9379492748326e-05, "loss": 0.006, "step": 9770 }, { "epoch": 86.54867256637168, "grad_norm": 0.12999387085437775, "learning_rate": 9.937689319103641e-05, "loss": 0.006, "step": 9780 }, { "epoch": 86.63716814159292, "grad_norm": 0.12017898261547089, "learning_rate": 9.937428823396404e-05, "loss": 0.0061, "step": 9790 }, { "epoch": 86.72566371681415, "grad_norm": 0.09171116352081299, "learning_rate": 9.937167787739372e-05, "loss": 0.006, "step": 9800 }, { "epoch": 86.8141592920354, "grad_norm": 0.12420973926782608, "learning_rate": 9.936906212161095e-05, "loss": 0.0053, "step": 9810 }, { "epoch": 86.90265486725664, "grad_norm": 0.1406574696302414, "learning_rate": 9.936644096690176e-05, "loss": 0.0053, "step": 9820 }, { "epoch": 86.99115044247787, "grad_norm": 0.1257728487253189, "learning_rate": 9.936381441355282e-05, "loss": 0.0056, "step": 9830 }, { "epoch": 87.07964601769912, "grad_norm": 0.09478408843278885, "learning_rate": 9.936118246185136e-05, "loss": 0.0057, "step": 9840 }, { "epoch": 87.16814159292035, "grad_norm": 0.1045784205198288, "learning_rate": 9.935854511208518e-05, "loss": 0.0055, "step": 9850 }, { "epoch": 87.2566371681416, "grad_norm": 0.1083742156624794, "learning_rate": 9.935590236454272e-05, "loss": 0.0056, "step": 9860 }, { "epoch": 87.34513274336283, "grad_norm": 0.12038854509592056, "learning_rate": 9.935325421951298e-05, "loss": 0.0058, "step": 9870 }, { "epoch": 87.43362831858407, "grad_norm": 0.127515971660614, "learning_rate": 9.935060067728557e-05, "loss": 0.0053, "step": 9880 }, { "epoch": 87.52212389380531, "grad_norm": 0.10194288939237595, "learning_rate": 9.934794173815067e-05, "loss": 0.0056, "step": 9890 }, { "epoch": 87.61061946902655, "grad_norm": 0.11484695971012115, "learning_rate": 9.934527740239906e-05, "loss": 0.0053, "step": 9900 }, { "epoch": 87.69911504424779, "grad_norm": 0.13571931421756744, "learning_rate": 9.934260767032209e-05, "loss": 0.0054, "step": 9910 }, { "epoch": 87.78761061946902, "grad_norm": 0.09630174189805984, "learning_rate": 9.933993254221172e-05, "loss": 0.005, "step": 9920 }, { "epoch": 87.87610619469027, "grad_norm": 0.11174452304840088, "learning_rate": 9.933725201836053e-05, "loss": 0.0061, "step": 9930 }, { "epoch": 87.96460176991151, "grad_norm": 0.12019488960504532, "learning_rate": 9.933456609906162e-05, "loss": 0.0058, "step": 9940 }, { "epoch": 88.05309734513274, "grad_norm": 0.09590188413858414, "learning_rate": 9.933187478460875e-05, "loss": 0.0055, "step": 9950 }, { "epoch": 88.14159292035399, "grad_norm": 0.13018690049648285, "learning_rate": 9.93291780752962e-05, "loss": 0.0062, "step": 9960 }, { "epoch": 88.23008849557522, "grad_norm": 0.12045933306217194, "learning_rate": 9.932647597141893e-05, "loss": 0.006, "step": 9970 }, { "epoch": 88.31858407079646, "grad_norm": 0.09797903150320053, "learning_rate": 9.932376847327239e-05, "loss": 0.0058, "step": 9980 }, { "epoch": 88.40707964601769, "grad_norm": 0.09055085480213165, "learning_rate": 9.932105558115268e-05, "loss": 0.0056, "step": 9990 }, { "epoch": 88.49557522123894, "grad_norm": 0.07018600404262543, "learning_rate": 9.931833729535651e-05, "loss": 0.0064, "step": 10000 } ], "logging_steps": 10, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 885, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 512, "trial_name": null, "trial_params": null }