type eval | step 0 | loss 62.1168 38.5482 32.7109 16.2987 23.8339 | checkpoint False | ce_loss 2.7305 | sae_losses 62.1168 38.5482 32.7109 16.2987 23.8339 | ce_loss_increases 13.6339 4.8078 3.4410 2.5197 0.4850 | compound_ce_loss_increase 11.0687 | l0s 126.5022 130.4285 132.0811 124.6985 128.5097 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 250 | loss 0.0976 0.1029 0.1535 0.1265 0.1991 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0976 0.1029 0.1535 0.1265 0.1991 | ce_loss_increases 0.9425 1.0777 2.5968 2.3437 6.3653 | compound_ce_loss_increase 7.3942 | l0s 32.3041 19.5469 8.0821 6.7983 0.8884 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 500 | loss 0.0504 0.0698 0.1209 0.1059 0.1879 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0504 0.0698 0.1209 0.1059 0.1879 | ce_loss_increases 0.2568 0.4954 1.5351 1.3955 4.0608 | compound_ce_loss_increase 7.3846 | l0s 32.5219 15.3129 5.9977 5.6847 1.6278 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 750 | loss 0.0443 0.0640 0.1156 0.1027 0.1852 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0443 0.0640 0.1156 0.1027 0.1852 | ce_loss_increases 0.1325 0.4469 1.3722 1.2836 3.5578 | compound_ce_loss_increase 7.4394 | l0s 27.5626 12.8291 4.8814 4.8781 1.6247 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1000 | loss 0.0425 0.0619 0.1140 0.1019 0.1843 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0425 0.0619 0.1140 0.1019 0.1843 | ce_loss_increases 0.1250 0.4374 1.3264 1.2442 3.3534 | compound_ce_loss_increase 7.3618 | l0s 25.6762 11.5295 4.3647 4.4795 1.5408 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1250 | loss 0.0416 0.0609 0.1135 0.1015 0.1840 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0416 0.0609 0.1135 0.1015 0.1840 | ce_loss_increases 0.1196 0.4261 1.3144 1.2372 3.3324 | compound_ce_loss_increase 7.3926 | l0s 24.4143 10.8949 4.1687 4.3115 1.5073 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1500 | loss 0.0410 0.0603 0.1131 0.1013 0.1838 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0410 0.0603 0.1131 0.1013 0.1838 | ce_loss_increases 0.1199 0.4295 1.3123 1.2241 3.2959 | compound_ce_loss_increase 7.3557 | l0s 23.5010 10.4786 4.0506 4.2185 1.4877 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1750 | loss 0.0406 0.0600 0.1130 0.1011 0.1837 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0406 0.0600 0.1130 0.1011 0.1837 | ce_loss_increases 0.1155 0.4251 1.3100 1.2176 3.2816 | compound_ce_loss_increase 7.3658 | l0s 23.1034 10.1729 3.9743 4.1718 1.4905 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2000 | loss 0.0402 0.0598 0.1128 0.1011 0.1837 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0402 0.0598 0.1128 0.1011 0.1837 | ce_loss_increases 0.1156 0.4269 1.3052 1.2221 3.2495 | compound_ce_loss_increase 7.3287 | l0s 22.6448 9.9752 3.9285 4.1169 1.4746 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2250 | loss 0.0400 0.0596 0.1128 0.1010 0.1836 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0400 0.0596 0.1128 0.1010 0.1836 | ce_loss_increases 0.1145 0.4273 1.2986 1.2156 3.2534 | compound_ce_loss_increase 7.3384 | l0s 22.2548 9.8874 3.8997 4.0863 1.4748 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2500 | loss 0.0398 0.0595 0.1127 0.1010 0.1836 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0398 0.0595 0.1127 0.1010 0.1836 | ce_loss_increases 0.1132 0.4292 1.2953 1.2142 3.2785 | compound_ce_loss_increase 7.3939 | l0s 21.9071 9.7915 3.8888 4.0860 1.4736 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2750 | loss 0.0397 0.0594 0.1126 0.1009 0.1836 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0397 0.0594 0.1126 0.1009 0.1836 | ce_loss_increases 0.1130 0.4271 1.2966 1.2176 3.2556 | compound_ce_loss_increase 7.3690 | l0s 21.6792 9.7545 3.8633 4.0458 1.4745 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3000 | loss 0.0396 0.0593 0.1126 0.1009 0.1835 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0396 0.0593 0.1126 0.1009 0.1835 | ce_loss_increases 0.1138 0.4268 1.2927 1.2154 3.2783 | compound_ce_loss_increase 7.4031 | l0s 21.5254 9.6603 3.8601 4.0322 1.4683 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3250 | loss 0.0395 0.0593 0.1126 0.1009 0.1835 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0395 0.0593 0.1126 0.1009 0.1835 | ce_loss_increases 0.1125 0.4230 1.2961 1.2188 3.2590 | compound_ce_loss_increase 7.3798 | l0s 21.5168 9.6156 3.8545 4.0206 1.4718 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3500 | loss 0.0395 0.0593 0.1126 0.1009 0.1835 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0395 0.0593 0.1126 0.1009 0.1835 | ce_loss_increases 0.1131 0.4261 1.2974 1.2128 3.2615 | compound_ce_loss_increase 7.3961 | l0s 21.3164 9.5854 3.8460 4.0333 1.4747 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3750 | loss 0.0394 0.0592 0.1126 0.1009 0.1835 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0394 0.0592 0.1126 0.1009 0.1835 | ce_loss_increases 0.1131 0.4242 1.2988 1.2187 3.2306 | compound_ce_loss_increase 7.3487 | l0s 21.3501 9.5534 3.8388 4.0077 1.4743 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4000 | loss 0.0394 0.0592 0.1125 0.1008 0.1835 | checkpoint False True True True True | ce_loss 2.7305 | sae_losses 0.0394 0.0592 0.1125 0.1008 0.1835 | ce_loss_increases 0.1126 0.4206 1.2931 1.2120 3.2413 | compound_ce_loss_increase 7.3541 | l0s 21.3689 9.5213 3.8385 4.0210 1.4727 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4250 | loss 0.0394 0.0591 0.1125 0.1008 0.1835 | checkpoint True True True False True | ce_loss 2.7305 | sae_losses 0.0394 0.0591 0.1125 0.1008 0.1835 | ce_loss_increases 0.1133 0.4202 1.2919 1.2104 3.2444 | compound_ce_loss_increase 7.3592 | l0s 21.2880 9.5207 3.8392 4.0146 1.4707 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4500 | loss 0.0394 0.0591 0.1125 0.1008 0.1835 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0394 0.0591 0.1125 0.1008 0.1835 | ce_loss_increases 0.1124 0.4227 1.2965 1.2138 3.2618 | compound_ce_loss_increase 7.3984 | l0s 21.2795 9.4830 3.8287 4.0055 1.4733 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4750 | loss 0.0393 0.0591 0.1125 0.1008 0.1835 | checkpoint True True True False False | ce_loss 2.7305 | sae_losses 0.0393 0.0591 0.1125 0.1008 0.1835 | ce_loss_increases 0.1125 0.4232 1.2934 1.2091 3.2448 | compound_ce_loss_increase 7.3566 | l0s 21.2658 9.4830 3.8251 4.0167 1.4686 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 5000 | loss 0.0393 0.0591 0.1125 0.1008 0.1835 | checkpoint True True True False False | ce_loss 2.7305 | sae_losses 0.0393 0.0591 0.1125 0.1008 0.1835 | ce_loss_increases 0.1135 0.4253 1.2955 1.2099 3.2526 | compound_ce_loss_increase 7.3795 | l0s 21.1398 9.4597 3.8206 4.0128 1.4737 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 0 | loss 66.9152 33.4942 26.4795 16.6743 18.6534 | checkpoint False | ce_loss 2.7305 | sae_losses 66.9152 33.4942 26.4795 16.6743 18.6534 | ce_loss_increases 13.2048 4.9146 3.7548 2.4958 0.5724 | compound_ce_loss_increase 9.4137 | l0s 127.4290 126.8114 128.7220 123.8109 129.8279 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 250 | loss 0.2804 0.1630 0.1527 0.1262 0.1479 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2804 0.1630 0.1527 0.1262 0.1479 | ce_loss_increases 1.9628 1.8229 2.5734 2.3130 1.7294 | compound_ce_loss_increase 7.7162 | l0s 20.0368 14.2932 7.8862 6.6719 8.0205 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 500 | loss 0.1865 0.1146 0.1203 0.1058 0.1271 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1865 0.1146 0.1203 0.1058 0.1271 | ce_loss_increases 1.0174 0.9088 1.5520 1.3950 0.9918 | compound_ce_loss_increase 6.7608 | l0s 12.4116 9.4445 5.8227 5.6889 6.5869 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 750 | loss 0.1711 0.1065 0.1152 0.1025 0.1238 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1711 0.1065 0.1152 0.1025 0.1238 | ce_loss_increases 0.9512 0.7821 1.3442 1.2620 0.9081 | compound_ce_loss_increase 6.7964 | l0s 9.8073 7.4841 4.6591 4.7849 5.6608 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1000 | loss 0.1652 0.1039 0.1138 0.1017 0.1229 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1652 0.1039 0.1138 0.1017 0.1229 | ce_loss_increases 0.9266 0.7399 1.3031 1.2388 0.8830 | compound_ce_loss_increase 6.6203 | l0s 8.5332 6.7643 4.2466 4.3820 5.2310 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1250 | loss 0.1621 0.1029 0.1132 0.1014 0.1226 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1621 0.1029 0.1132 0.1014 0.1226 | ce_loss_increases 0.8971 0.7356 1.2786 1.2448 0.8790 | compound_ce_loss_increase 6.8698 | l0s 7.8725 6.4386 4.0380 4.2134 5.0607 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1500 | loss 0.1606 0.1024 0.1129 0.1012 0.1224 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1606 0.1024 0.1129 0.1012 0.1224 | ce_loss_increases 0.8810 0.7355 1.2634 1.2390 0.8704 | compound_ce_loss_increase 6.8387 | l0s 7.4109 6.2097 3.9608 4.1258 4.9957 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1750 | loss 0.1596 0.1021 0.1128 0.1011 0.1223 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1596 0.1021 0.1128 0.1011 0.1223 | ce_loss_increases 0.8683 0.7521 1.2589 1.2291 0.8736 | compound_ce_loss_increase 6.9167 | l0s 7.2302 6.0494 3.8978 4.0824 4.9417 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2000 | loss 0.1590 0.1018 0.1127 0.1010 0.1223 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1590 0.1018 0.1127 0.1010 0.1223 | ce_loss_increases 0.8714 0.7581 1.2711 1.2244 0.8660 | compound_ce_loss_increase 6.8720 | l0s 7.0511 5.9623 3.8312 4.0291 4.8978 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2250 | loss 0.1587 0.1017 0.1126 0.1009 0.1223 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1587 0.1017 0.1126 0.1009 0.1223 | ce_loss_increases 0.8661 0.7624 1.2665 1.2182 0.8689 | compound_ce_loss_increase 6.9136 | l0s 6.9251 5.9107 3.8052 3.9985 4.8895 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2500 | loss 0.1583 0.1015 0.1125 0.1009 0.1222 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1583 0.1015 0.1125 0.1009 0.1222 | ce_loss_increases 0.8709 0.7650 1.2679 1.2168 0.8686 | compound_ce_loss_increase 6.9736 | l0s 6.7599 5.8630 3.7801 4.0060 4.8948 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2750 | loss 0.1581 0.1014 0.1125 0.1009 0.1222 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1581 0.1014 0.1125 0.1009 0.1222 | ce_loss_increases 0.8603 0.7564 1.2647 1.2209 0.8651 | compound_ce_loss_increase 6.8967 | l0s 6.7602 5.8303 3.7468 3.9566 4.8615 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3000 | loss 0.1581 0.1014 0.1124 0.1009 0.1222 | checkpoint True True True True False | ce_loss 2.7305 | sae_losses 0.1581 0.1014 0.1124 0.1009 0.1222 | ce_loss_increases 0.8598 0.7541 1.2635 1.2179 0.8669 | compound_ce_loss_increase 6.8723 | l0s 6.6806 5.8107 3.7328 3.9475 4.8505 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3250 | loss 0.1578 0.1013 0.1124 0.1009 0.1222 | checkpoint True True False True True | ce_loss 2.7305 | sae_losses 0.1578 0.1013 0.1124 0.1009 0.1222 | ce_loss_increases 0.8580 0.7560 1.2695 1.2225 0.8612 | compound_ce_loss_increase 6.8579 | l0s 6.6533 5.7855 3.7133 3.9447 4.8459 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3500 | loss 0.1578 0.1012 0.1124 0.1009 0.1221 | checkpoint True True True False True | ce_loss 2.7305 | sae_losses 0.1578 0.1012 0.1124 0.1009 0.1221 | ce_loss_increases 0.8523 0.7575 1.2686 1.2177 0.8621 | compound_ce_loss_increase 6.8672 | l0s 6.6260 5.7635 3.7126 3.9477 4.8487 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3750 | loss 0.1577 0.1012 0.1124 0.1008 0.1221 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1577 0.1012 0.1124 0.1008 0.1221 | ce_loss_increases 0.8589 0.7598 1.2722 1.2188 0.8550 | compound_ce_loss_increase 6.8171 | l0s 6.5838 5.7624 3.6943 3.9342 4.8471 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4000 | loss 0.1577 0.1012 0.1123 0.1008 0.1221 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1577 0.1012 0.1123 0.1008 0.1221 | ce_loss_increases 0.8520 0.7570 1.2643 1.2169 0.8570 | compound_ce_loss_increase 6.8115 | l0s 6.6477 5.7549 3.7006 3.9389 4.8362 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4250 | loss 0.1577 0.1012 0.1123 0.1008 0.1221 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1577 0.1012 0.1123 0.1008 0.1221 | ce_loss_increases 0.8538 0.7558 1.2662 1.2145 0.8560 | compound_ce_loss_increase 6.7904 | l0s 6.5565 5.7537 3.6962 3.9375 4.8354 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4500 | loss 0.1576 0.1012 0.1123 0.1008 0.1221 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1576 0.1012 0.1123 0.1008 0.1221 | ce_loss_increases 0.8506 0.7604 1.2711 1.2178 0.8616 | compound_ce_loss_increase 6.8377 | l0s 6.5711 5.7536 3.6839 3.9281 4.8309 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4750 | loss 0.1576 0.1011 0.1123 0.1008 0.1221 | checkpoint False True False False False | ce_loss 2.7305 | sae_losses 0.1576 0.1011 0.1123 0.1008 0.1221 | ce_loss_increases 0.8500 0.7600 1.2688 1.2154 0.8585 | compound_ce_loss_increase 6.8023 | l0s 6.5627 5.7389 3.6860 3.9306 4.8321 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 5000 | loss 0.1575 0.1011 0.1123 0.1008 0.1221 | checkpoint True True True False True | ce_loss 2.7305 | sae_losses 0.1575 0.1011 0.1123 0.1008 0.1221 | ce_loss_increases 0.8521 0.7618 1.2702 1.2140 0.8612 | compound_ce_loss_increase 6.8242 | l0s 6.5298 5.7303 3.6797 3.9339 4.8407 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 0 | loss 217.0749 133.9181 84.7643 65.7416 65.9811 | checkpoint False | ce_loss 2.7305 | sae_losses 217.0749 133.9181 84.7643 65.7416 65.9811 | ce_loss_increases 14.6022 5.6782 3.5488 2.4542 0.2608 | compound_ce_loss_increase 11.3118 | l0s 251.6037 254.1876 249.7975 260.2430 258.3506 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 250 | loss 0.2878 0.1750 0.1556 0.1295 0.1481 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2878 0.1750 0.1556 0.1295 0.1481 | ce_loss_increases 1.7752 1.9067 2.5840 2.4058 1.6351 | compound_ce_loss_increase 7.2510 | l0s 22.5019 15.2678 8.6838 7.7088 8.7541 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 500 | loss 0.1716 0.1093 0.1177 0.1041 0.1242 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1716 0.1093 0.1177 0.1041 0.1242 | ce_loss_increases 0.8736 0.8163 1.3476 1.2489 0.8938 | compound_ce_loss_increase 7.2575 | l0s 13.7032 9.8405 6.7115 6.5038 8.1346 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 750 | loss 0.1553 0.0997 0.1118 0.1003 0.1203 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1553 0.0997 0.1118 0.1003 0.1203 | ce_loss_increases 0.7452 0.6947 1.1749 1.1083 0.7912 | compound_ce_loss_increase 7.0751 | l0s 10.3311 8.0425 5.6991 5.7201 7.3129 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1000 | loss 0.1498 0.0966 0.1099 0.0991 0.1190 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1498 0.0966 0.1099 0.0991 0.1190 | ce_loss_increases 0.6984 0.6138 1.1302 1.0606 0.7539 | compound_ce_loss_increase 7.0669 | l0s 8.9906 7.1512 5.1559 5.2188 6.7310 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1250 | loss 0.1471 0.0951 0.1089 0.0985 0.1184 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1471 0.0951 0.1089 0.0985 0.1184 | ce_loss_increases 0.6834 0.6096 1.1350 1.0475 0.7373 | compound_ce_loss_increase 7.0362 | l0s 8.2262 6.4852 4.7994 4.9698 6.4235 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1500 | loss 0.1458 0.0942 0.1084 0.0982 0.1181 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1458 0.0942 0.1084 0.0982 0.1181 | ce_loss_increases 0.6743 0.5788 1.1246 1.0355 0.7329 | compound_ce_loss_increase 7.1809 | l0s 7.8444 6.1932 4.6230 4.8385 6.2194 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1750 | loss 0.1450 0.0937 0.1081 0.0979 0.1179 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1450 0.0937 0.1081 0.0979 0.1179 | ce_loss_increases 0.6672 0.5681 1.0965 1.0226 0.7210 | compound_ce_loss_increase 7.0754 | l0s 7.5208 5.9453 4.5342 4.7276 6.0683 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2000 | loss 0.1443 0.0933 0.1079 0.0977 0.1178 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1443 0.0933 0.1079 0.0977 0.1178 | ce_loss_increases 0.6705 0.5585 1.1108 1.0202 0.7172 | compound_ce_loss_increase 6.9490 | l0s 7.2990 5.8250 4.4249 4.6440 5.9563 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2250 | loss 0.1440 0.0930 0.1077 0.0976 0.1177 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1440 0.0930 0.1077 0.0976 0.1177 | ce_loss_increases 0.6719 0.5592 1.1002 1.0160 0.7204 | compound_ce_loss_increase 7.2182 | l0s 7.1619 5.7361 4.3753 4.6062 5.8781 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2500 | loss 0.1435 0.0928 0.1075 0.0975 0.1176 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1435 0.0928 0.1075 0.0975 0.1176 | ce_loss_increases 0.6727 0.5640 1.0968 1.0176 0.7186 | compound_ce_loss_increase 7.0815 | l0s 7.0032 5.6546 4.3245 4.5460 5.8050 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2750 | loss 0.1433 0.0927 0.1074 0.0975 0.1175 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1433 0.0927 0.1074 0.0975 0.1175 | ce_loss_increases 0.6677 0.5617 1.0977 1.0186 0.7141 | compound_ce_loss_increase 7.0705 | l0s 6.9426 5.6079 4.2750 4.5103 5.8034 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3000 | loss 0.1433 0.0925 0.1074 0.0974 0.1175 | checkpoint False True True True True | ce_loss 2.7305 | sae_losses 0.1433 0.0925 0.1074 0.0974 0.1175 | ce_loss_increases 0.6656 0.5727 1.0942 1.0094 0.7123 | compound_ce_loss_increase 6.9809 | l0s 6.8317 5.5753 4.2554 4.5177 5.7522 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3250 | loss 0.1431 0.0925 0.1073 0.0974 0.1175 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1431 0.0925 0.1073 0.0974 0.1175 | ce_loss_increases 0.6694 0.5671 1.0873 1.0081 0.7138 | compound_ce_loss_increase 7.0122 | l0s 6.7233 5.5430 4.2379 4.4940 5.7304 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3500 | loss 0.1428 0.0924 0.1072 0.0974 0.1174 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1428 0.0924 0.1072 0.0974 0.1174 | ce_loss_increases 0.6625 0.5712 1.0841 1.0136 0.7139 | compound_ce_loss_increase 6.9587 | l0s 6.7020 5.5053 4.2323 4.4818 5.7102 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3750 | loss 0.1428 0.0923 0.1072 0.0973 0.1174 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1428 0.0923 0.1072 0.0973 0.1174 | ce_loss_increases 0.6631 0.5700 1.0859 1.0158 0.7088 | compound_ce_loss_increase 6.9256 | l0s 6.6508 5.4996 4.2133 4.4553 5.6942 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4000 | loss 0.1428 0.0923 0.1072 0.0973 0.1174 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1428 0.0923 0.1072 0.0973 0.1174 | ce_loss_increases 0.6582 0.5752 1.0874 1.0116 0.7106 | compound_ce_loss_increase 6.8789 | l0s 6.6271 5.4772 4.1988 4.4602 5.6748 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4250 | loss 0.1428 0.0922 0.1071 0.0973 0.1174 | checkpoint False True True True True | ce_loss 2.7305 | sae_losses 0.1428 0.0922 0.1071 0.0973 0.1174 | ce_loss_increases 0.6597 0.5722 1.0824 1.0057 0.7125 | compound_ce_loss_increase 6.9523 | l0s 6.5979 5.4733 4.1992 4.4758 5.6949 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4500 | loss 0.1427 0.0922 0.1071 0.0973 0.1174 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1427 0.0922 0.1071 0.0973 0.1174 | ce_loss_increases 0.6576 0.5748 1.0876 1.0074 0.7102 | compound_ce_loss_increase 6.9241 | l0s 6.5901 5.4573 4.1765 4.4566 5.6876 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4750 | loss 0.1426 0.0922 0.1071 0.0973 0.1174 | checkpoint True True True True False | ce_loss 2.7305 | sae_losses 0.1426 0.0922 0.1071 0.0973 0.1174 | ce_loss_increases 0.6594 0.5710 1.0879 1.0077 0.7079 | compound_ce_loss_increase 6.8838 | l0s 6.5463 5.4541 4.1734 4.4478 5.6742 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 5000 | loss 0.1426 0.0922 0.1071 0.0973 0.1174 | checkpoint False True True True True | ce_loss 2.7305 | sae_losses 0.1426 0.0922 0.1071 0.0973 0.1174 | ce_loss_increases 0.6584 0.5763 1.0851 1.0094 0.7111 | compound_ce_loss_increase 6.9385 | l0s 6.5548 5.4330 4.1765 4.4545 5.6792 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 0 | loss 263.4649 165.1268 115.5145 77.1456 67.5597 | checkpoint False | ce_loss 2.7305 | sae_losses 263.4649 165.1268 115.5145 77.1456 67.5597 | ce_loss_increases 14.3452 5.1352 3.6494 2.6231 0.3475 | compound_ce_loss_increase 11.4295 | l0s 256.9329 261.0746 265.2831 254.1063 251.9931 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 250 | loss 0.3948 0.3291 0.2209 0.1679 0.1925 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.3948 0.3291 0.2209 0.1679 0.1925 | ce_loss_increases 2.3141 4.6114 5.0827 5.6332 4.5915 | compound_ce_loss_increase 6.9377 | l0s 18.3972 5.8177 1.4787 0.6369 2.5792 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 500 | loss 0.2491 0.2517 0.1960 0.1588 0.1720 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2491 0.2517 0.1960 0.1588 0.1720 | ce_loss_increases 1.3340 2.6353 3.7622 4.4979 2.4034 | compound_ce_loss_increase 7.4810 | l0s 9.9013 4.1823 2.0910 1.3741 3.2784 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 750 | loss 0.2289 0.2402 0.1914 0.1565 0.1686 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2289 0.2402 0.1914 0.1565 0.1686 | ce_loss_increases 1.1594 2.3567 3.4387 4.1572 2.0690 | compound_ce_loss_increase 7.4601 | l0s 7.2982 3.3578 1.8534 1.4147 2.9780 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1000 | loss 0.2230 0.2364 0.1902 0.1559 0.1675 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2230 0.2364 0.1902 0.1559 0.1675 | ce_loss_increases 1.1209 2.2832 3.3411 4.0981 1.9336 | compound_ce_loss_increase 7.3901 | l0s 6.3587 3.0018 1.7482 1.3543 2.7505 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1250 | loss 0.2202 0.2347 0.1896 0.1557 0.1671 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2202 0.2347 0.1896 0.1557 0.1671 | ce_loss_increases 1.0986 2.2781 3.2989 4.1061 1.8902 | compound_ce_loss_increase 7.4188 | l0s 5.7783 2.8146 1.6990 1.3446 2.6485 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1500 | loss 0.2186 0.2338 0.1892 0.1555 0.1669 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2186 0.2338 0.1892 0.1555 0.1669 | ce_loss_increases 1.0694 2.2246 3.2769 4.0663 1.8950 | compound_ce_loss_increase 7.5581 | l0s 5.4925 2.7274 1.6666 1.3474 2.5990 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1750 | loss 0.2173 0.2333 0.1889 0.1554 0.1667 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2173 0.2333 0.1889 0.1554 0.1667 | ce_loss_increases 1.0707 2.2039 3.2298 4.0152 1.8515 | compound_ce_loss_increase 7.4483 | l0s 5.2419 2.6551 1.6385 1.3464 2.5465 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2000 | loss 0.2159 0.2329 0.1887 0.1553 0.1667 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2159 0.2329 0.1887 0.1553 0.1667 | ce_loss_increases 1.0461 2.2019 3.2239 3.9884 1.8376 | compound_ce_loss_increase 7.3885 | l0s 5.0655 2.6094 1.6175 1.3412 2.5087 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2250 | loss 0.2152 0.2326 0.1886 0.1552 0.1666 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2152 0.2326 0.1886 0.1552 0.1666 | ce_loss_increases 1.0414 2.2190 3.2385 4.0051 1.8528 | compound_ce_loss_increase 7.4748 | l0s 4.9608 2.5686 1.6124 1.3425 2.4927 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2500 | loss 0.2146 0.2325 0.1885 0.1552 0.1666 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2146 0.2325 0.1885 0.1552 0.1666 | ce_loss_increases 1.0426 2.2212 3.2347 4.0195 1.8499 | compound_ce_loss_increase 7.4506 | l0s 4.8939 2.5480 1.6032 1.3433 2.4756 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2750 | loss 0.2142 0.2322 0.1884 0.1551 0.1665 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2142 0.2322 0.1884 0.1551 0.1665 | ce_loss_increases 1.0330 2.2153 3.2293 3.9931 1.8476 | compound_ce_loss_increase 7.4449 | l0s 4.8319 2.5295 1.6004 1.3462 2.4582 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3000 | loss 0.2141 0.2321 0.1884 0.1551 0.1665 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2141 0.2321 0.1884 0.1551 0.1665 | ce_loss_increases 1.0230 2.2129 3.2289 4.0050 1.8319 | compound_ce_loss_increase 7.4335 | l0s 4.7789 2.5160 1.6048 1.3485 2.4520 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3250 | loss 0.2138 0.2320 0.1883 0.1551 0.1665 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2138 0.2320 0.1883 0.1551 0.1665 | ce_loss_increases 1.0254 2.1953 3.2162 3.9982 1.8419 | compound_ce_loss_increase 7.4614 | l0s 4.7259 2.5057 1.6035 1.3508 2.4493 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3500 | loss 0.2136 0.2319 0.1882 0.1551 0.1664 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2136 0.2319 0.1882 0.1551 0.1664 | ce_loss_increases 1.0232 2.2092 3.2196 3.9811 1.8303 | compound_ce_loss_increase 7.4233 | l0s 4.7036 2.4918 1.6071 1.3576 2.4442 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3750 | loss 0.2136 0.2318 0.1882 0.1550 0.1664 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2136 0.2318 0.1882 0.1550 0.1664 | ce_loss_increases 1.0217 2.2007 3.2149 3.9803 1.8254 | compound_ce_loss_increase 7.4245 | l0s 4.7003 2.4876 1.6005 1.3527 2.4318 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4000 | loss 0.2135 0.2318 0.1882 0.1550 0.1664 | checkpoint True True True True False | ce_loss 2.7305 | sae_losses 0.2135 0.2318 0.1882 0.1550 0.1664 | ce_loss_increases 1.0203 2.2034 3.2106 3.9924 1.8233 | compound_ce_loss_increase 7.4014 | l0s 4.6617 2.4762 1.5992 1.3568 2.4272 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4250 | loss 0.2135 0.2317 0.1882 0.1550 0.1664 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2135 0.2317 0.1882 0.1550 0.1664 | ce_loss_increases 1.0200 2.1908 3.2215 3.9936 1.8321 | compound_ce_loss_increase 7.4514 | l0s 4.6718 2.4765 1.5965 1.3583 2.4392 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4500 | loss 0.2134 0.2317 0.1881 0.1550 0.1664 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2134 0.2317 0.1881 0.1550 0.1664 | ce_loss_increases 1.0242 2.1997 3.2088 3.9763 1.8304 | compound_ce_loss_increase 7.4451 | l0s 4.6689 2.4651 1.5947 1.3562 2.4301 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4750 | loss 0.2134 0.2316 0.1881 0.1550 0.1664 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2134 0.2316 0.1881 0.1550 0.1664 | ce_loss_increases 1.0200 2.1879 3.2081 3.9688 1.8213 | compound_ce_loss_increase 7.4252 | l0s 4.6524 2.4669 1.5942 1.3542 2.4242 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 5000 | loss 0.2134 0.2316 0.1881 0.1550 0.1664 | checkpoint True True False True True | ce_loss 2.7305 | sae_losses 0.2134 0.2316 0.1881 0.1550 0.1664 | ce_loss_increases 1.0247 2.1943 3.2218 3.9952 1.8311 | compound_ce_loss_increase 7.4506 | l0s 4.6467 2.4590 1.5956 1.3551 2.4286 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 0 | loss 228.0831 151.7672 101.7318 65.4508 59.5769 | checkpoint False | ce_loss 2.7305 | sae_losses 228.0831 151.7672 101.7318 65.4508 59.5769 | ce_loss_increases 15.1737 5.6176 3.8103 2.4554 0.3263 | compound_ce_loss_increase 13.1763 | l0s 250.7765 262.3879 262.0721 260.5365 253.3260 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 250 | loss 0.2006 0.1266 0.1046 0.0885 0.1042 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.2006 0.1266 0.1046 0.0885 0.1042 | ce_loss_increases 1.3214 1.1681 1.4280 1.1088 0.6645 | compound_ce_loss_increase 6.3830 | l0s 27.5051 20.5368 14.1571 13.6512 15.6874 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 500 | loss 0.1043 0.0667 0.0744 0.0691 0.0841 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1043 0.0667 0.0744 0.0691 0.0841 | ce_loss_increases 0.4811 0.4678 0.7304 0.6437 0.4012 | compound_ce_loss_increase 6.8437 | l0s 20.7711 15.4090 11.3696 12.6329 17.1463 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 750 | loss 0.0916 0.0592 0.0696 0.0658 0.0809 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0916 0.0592 0.0696 0.0658 0.0809 | ce_loss_increases 0.3851 0.3842 0.6208 0.5575 0.3530 | compound_ce_loss_increase 6.4559 | l0s 16.2395 12.8906 9.9856 11.9131 16.7249 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1000 | loss 0.0873 0.0568 0.0680 0.0646 0.0798 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0873 0.0568 0.0680 0.0646 0.0798 | ce_loss_increases 0.3689 0.3561 0.5955 0.5365 0.3418 | compound_ce_loss_increase 6.2818 | l0s 14.1681 11.6093 9.3029 11.3358 16.0011 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1250 | loss 0.0852 0.0555 0.0672 0.0640 0.0793 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0852 0.0555 0.0672 0.0640 0.0793 | ce_loss_increases 0.3696 0.3270 0.5892 0.5309 0.3402 | compound_ce_loss_increase 6.1845 | l0s 12.9116 10.8693 8.8363 10.8791 15.4473 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1500 | loss 0.0839 0.0548 0.0667 0.0637 0.0790 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0839 0.0548 0.0667 0.0637 0.0790 | ce_loss_increases 0.3700 0.3225 0.5748 0.5263 0.3356 | compound_ce_loss_increase 5.9956 | l0s 12.3710 10.4399 8.5786 10.6410 15.1459 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1750 | loss 0.0829 0.0543 0.0664 0.0635 0.0788 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0829 0.0543 0.0664 0.0635 0.0788 | ce_loss_increases 0.3675 0.3078 0.5599 0.5164 0.3304 | compound_ce_loss_increase 5.9405 | l0s 11.9054 10.1585 8.4260 10.4111 14.8904 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2000 | loss 0.0822 0.0539 0.0662 0.0633 0.0786 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0822 0.0539 0.0662 0.0633 0.0786 | ce_loss_increases 0.3682 0.3031 0.5641 0.5186 0.3290 | compound_ce_loss_increase 5.9497 | l0s 11.5908 9.9693 8.2726 10.2526 14.6543 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2250 | loss 0.0818 0.0537 0.0660 0.0632 0.0785 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0818 0.0537 0.0660 0.0632 0.0785 | ce_loss_increases 0.3712 0.3040 0.5615 0.5102 0.3319 | compound_ce_loss_increase 6.0919 | l0s 11.2982 9.7756 8.1782 10.2105 14.5247 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2500 | loss 0.0814 0.0535 0.0659 0.0631 0.0784 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0814 0.0535 0.0659 0.0631 0.0784 | ce_loss_increases 0.3715 0.3044 0.5616 0.5157 0.3301 | compound_ce_loss_increase 6.0922 | l0s 10.9955 9.6578 8.0721 10.0574 14.3246 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2750 | loss 0.0811 0.0533 0.0659 0.0631 0.0783 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0811 0.0533 0.0659 0.0631 0.0783 | ce_loss_increases 0.3675 0.3070 0.5609 0.5101 0.3268 | compound_ce_loss_increase 6.1650 | l0s 10.8388 9.5277 7.9950 10.0474 14.3193 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3000 | loss 0.0809 0.0532 0.0658 0.0631 0.0783 | checkpoint True True True False True | ce_loss 2.7305 | sae_losses 0.0809 0.0532 0.0658 0.0631 0.0783 | ce_loss_increases 0.3645 0.3041 0.5579 0.5121 0.3279 | compound_ce_loss_increase 6.1266 | l0s 10.7384 9.4567 7.9844 10.0067 14.2022 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3250 | loss 0.0807 0.0532 0.0658 0.0630 0.0783 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0807 0.0532 0.0658 0.0630 0.0783 | ce_loss_increases 0.3666 0.3051 0.5585 0.5084 0.3285 | compound_ce_loss_increase 6.1916 | l0s 10.4893 9.3989 7.9178 9.9738 14.1134 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3500 | loss 0.0806 0.0531 0.0657 0.0630 0.0782 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0806 0.0531 0.0657 0.0630 0.0782 | ce_loss_increases 0.3631 0.3016 0.5585 0.5094 0.3295 | compound_ce_loss_increase 6.2096 | l0s 10.4911 9.3348 7.8809 9.9223 14.0264 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3750 | loss 0.0804 0.0530 0.0657 0.0630 0.0782 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0804 0.0530 0.0657 0.0630 0.0782 | ce_loss_increases 0.3644 0.3040 0.5568 0.5113 0.3273 | compound_ce_loss_increase 6.1797 | l0s 10.3669 9.3380 7.8633 9.8749 14.0105 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4000 | loss 0.0804 0.0530 0.0657 0.0630 0.0782 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0804 0.0530 0.0657 0.0630 0.0782 | ce_loss_increases 0.3640 0.3065 0.5587 0.5099 0.3284 | compound_ce_loss_increase 6.1848 | l0s 10.3247 9.2994 7.8337 9.8558 13.9896 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4250 | loss 0.0803 0.0530 0.0657 0.0630 0.0782 | checkpoint True True True False True | ce_loss 2.7305 | sae_losses 0.0803 0.0530 0.0657 0.0630 0.0782 | ce_loss_increases 0.3649 0.3064 0.5589 0.5063 0.3287 | compound_ce_loss_increase 6.2226 | l0s 10.2953 9.2847 7.8302 9.8859 14.0009 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4500 | loss 0.0803 0.0529 0.0656 0.0630 0.0782 | checkpoint True True True False True | ce_loss 2.7305 | sae_losses 0.0803 0.0529 0.0656 0.0630 0.0782 | ce_loss_increases 0.3646 0.3049 0.5590 0.5067 0.3280 | compound_ce_loss_increase 6.2095 | l0s 10.2967 9.2850 7.8124 9.8427 13.9828 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4750 | loss 0.0802 0.0529 0.0656 0.0630 0.0781 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0802 0.0529 0.0656 0.0630 0.0781 | ce_loss_increases 0.3654 0.3063 0.5609 0.5073 0.3266 | compound_ce_loss_increase 6.1940 | l0s 10.2605 9.2456 7.7938 9.8366 13.9347 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 5000 | loss 0.0802 0.0529 0.0656 0.0630 0.0781 | checkpoint True True True False False | ce_loss 2.7305 | sae_losses 0.0802 0.0529 0.0656 0.0630 0.0781 | ce_loss_increases 0.3670 0.3075 0.5580 0.5048 0.3279 | compound_ce_loss_increase 6.2255 | l0s 10.2341 9.2333 7.8070 9.8524 13.9733 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 0 | loss 987.3041 524.5605 337.6251 245.5708 230.6556 | checkpoint False | ce_loss 2.7305 | sae_losses 987.3041 524.5605 337.6251 245.5708 230.6556 | ce_loss_increases 14.7336 5.2044 3.5371 2.4544 0.1771 | compound_ce_loss_increase 13.5463 | l0s 522.5518 514.7168 503.2405 509.2917 504.4014 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 250 | loss 0.1972 0.1396 0.1151 0.1012 0.1138 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1972 0.1396 0.1151 0.1012 0.1138 | ce_loss_increases 1.1611 1.1100 1.4332 1.2219 0.6630 | compound_ce_loss_increase 7.0352 | l0s 29.7715 21.1023 15.4014 14.3351 16.4334 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 500 | loss 0.0966 0.0649 0.0737 0.0696 0.0838 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0966 0.0649 0.0737 0.0696 0.0838 | ce_loss_increases 0.4124 0.4584 0.7189 0.6472 0.4036 | compound_ce_loss_increase 6.0999 | l0s 20.9468 15.4493 11.7687 13.0468 17.5238 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 750 | loss 0.0835 0.0559 0.0677 0.0653 0.0796 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0835 0.0559 0.0677 0.0653 0.0796 | ce_loss_increases 0.3161 0.3527 0.5919 0.5314 0.3422 | compound_ce_loss_increase 5.9040 | l0s 15.7637 12.6997 10.4810 12.6682 17.9152 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1000 | loss 0.0790 0.0530 0.0658 0.0638 0.0781 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0790 0.0530 0.0658 0.0638 0.0781 | ce_loss_increases 0.2998 0.3038 0.5386 0.4943 0.3254 | compound_ce_loss_increase 5.7929 | l0s 13.7214 11.4431 9.8056 12.3201 17.6313 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1250 | loss 0.0767 0.0516 0.0648 0.0630 0.0774 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0767 0.0516 0.0648 0.0630 0.0774 | ce_loss_increases 0.3089 0.3005 0.5312 0.4809 0.3200 | compound_ce_loss_increase 5.9202 | l0s 12.4707 10.5642 9.2647 11.9764 17.2184 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1500 | loss 0.0755 0.0508 0.0642 0.0627 0.0770 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0755 0.0508 0.0642 0.0627 0.0770 | ce_loss_increases 0.2961 0.2934 0.5204 0.4739 0.3158 | compound_ce_loss_increase 6.0876 | l0s 11.8025 10.0521 8.9742 11.7736 16.9861 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1750 | loss 0.0746 0.0503 0.0638 0.0624 0.0767 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0746 0.0503 0.0638 0.0624 0.0767 | ce_loss_increases 0.2933 0.2842 0.4986 0.4606 0.3109 | compound_ce_loss_increase 6.0643 | l0s 11.2739 9.6912 8.8505 11.6013 16.7478 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2000 | loss 0.0739 0.0498 0.0635 0.0621 0.0765 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0739 0.0498 0.0635 0.0621 0.0765 | ce_loss_increases 0.2833 0.2776 0.4990 0.4610 0.3072 | compound_ce_loss_increase 6.0324 | l0s 11.0272 9.4591 8.6675 11.4740 16.5603 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2250 | loss 0.0735 0.0495 0.0633 0.0619 0.0764 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0735 0.0495 0.0633 0.0619 0.0764 | ce_loss_increases 0.2818 0.2716 0.4927 0.4488 0.3115 | compound_ce_loss_increase 6.1538 | l0s 10.6639 9.2630 8.5550 11.4843 16.3902 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2500 | loss 0.0730 0.0493 0.0632 0.0617 0.0762 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0730 0.0493 0.0632 0.0617 0.0762 | ce_loss_increases 0.2805 0.2770 0.5012 0.4559 0.3091 | compound_ce_loss_increase 6.1324 | l0s 10.3822 9.0572 8.3856 11.2489 16.1901 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2750 | loss 0.0727 0.0491 0.0630 0.0616 0.0761 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0727 0.0491 0.0630 0.0616 0.0761 | ce_loss_increases 0.2714 0.2675 0.4957 0.4524 0.3047 | compound_ce_loss_increase 6.1868 | l0s 10.3068 8.9261 8.3173 11.2356 16.2338 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3000 | loss 0.0726 0.0489 0.0629 0.0615 0.0761 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0726 0.0489 0.0629 0.0615 0.0761 | ce_loss_increases 0.2772 0.2625 0.4912 0.4523 0.3058 | compound_ce_loss_increase 6.1439 | l0s 10.1582 8.8404 8.3173 11.2087 16.1265 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3250 | loss 0.0724 0.0488 0.0629 0.0615 0.0761 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0724 0.0488 0.0629 0.0615 0.0761 | ce_loss_increases 0.2754 0.2600 0.4926 0.4478 0.3061 | compound_ce_loss_increase 6.2101 | l0s 9.9763 8.7575 8.2296 11.1575 16.0378 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3500 | loss 0.0723 0.0488 0.0628 0.0614 0.0760 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0723 0.0488 0.0628 0.0614 0.0760 | ce_loss_increases 0.2719 0.2580 0.4897 0.4493 0.3071 | compound_ce_loss_increase 6.2306 | l0s 9.9122 8.6689 8.2061 11.1092 15.9663 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3750 | loss 0.0722 0.0487 0.0628 0.0614 0.0760 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0722 0.0487 0.0628 0.0614 0.0760 | ce_loss_increases 0.2735 0.2576 0.4878 0.4506 0.3045 | compound_ce_loss_increase 6.2379 | l0s 9.8453 8.6378 8.1789 11.0515 15.9512 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4000 | loss 0.0721 0.0487 0.0627 0.0613 0.0760 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0721 0.0487 0.0627 0.0613 0.0760 | ce_loss_increases 0.2728 0.2576 0.4893 0.4474 0.3055 | compound_ce_loss_increase 6.2510 | l0s 9.8062 8.6063 8.1541 11.0512 15.8944 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4250 | loss 0.0721 0.0486 0.0627 0.0613 0.0759 | checkpoint True True True False True | ce_loss 2.7305 | sae_losses 0.0721 0.0486 0.0627 0.0613 0.0759 | ce_loss_increases 0.2714 0.2580 0.4873 0.4456 0.3057 | compound_ce_loss_increase 6.3179 | l0s 9.7692 8.5767 8.1503 11.0892 15.9306 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4500 | loss 0.0720 0.0486 0.0627 0.0613 0.0759 | checkpoint True True True False True | ce_loss 2.7305 | sae_losses 0.0720 0.0486 0.0627 0.0613 0.0759 | ce_loss_increases 0.2696 0.2574 0.4886 0.4450 0.3048 | compound_ce_loss_increase 6.2795 | l0s 9.7471 8.5406 8.1320 11.0344 15.9094 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4750 | loss 0.0720 0.0486 0.0627 0.0613 0.0759 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0720 0.0486 0.0627 0.0613 0.0759 | ce_loss_increases 0.2706 0.2559 0.4882 0.4456 0.3033 | compound_ce_loss_increase 6.2783 | l0s 9.7352 8.5310 8.1163 11.0260 15.8737 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 5000 | loss 0.0720 0.0486 0.0626 0.0613 0.0759 | checkpoint True True True True False | ce_loss 2.7305 | sae_losses 0.0720 0.0486 0.0626 0.0613 0.0759 | ce_loss_increases 0.2727 0.2579 0.4847 0.4434 0.3050 | compound_ce_loss_increase 6.3318 | l0s 9.6803 8.5014 8.1259 11.0453 15.9217 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 0 | loss 227.3096 125.0853 84.5692 65.1142 67.1791 | checkpoint False | ce_loss 2.7305 | sae_losses 227.3096 125.0853 84.5692 65.1142 67.1791 | ce_loss_increases 14.3188 5.3126 3.5227 2.4438 0.3794 | compound_ce_loss_increase 12.2888 | l0s 256.3822 252.1598 257.6640 252.1761 258.2471 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 250 | loss 0.1946 0.1165 0.1058 0.0912 0.1051 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1946 0.1165 0.1058 0.0912 0.1051 | ce_loss_increases 1.3221 1.1455 1.3966 1.1405 0.7271 | compound_ce_loss_increase 6.4669 | l0s 28.2331 20.1459 14.4353 13.4622 15.7307 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 500 | loss 0.1033 0.0671 0.0744 0.0693 0.0842 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.1033 0.0671 0.0744 0.0693 0.0842 | ce_loss_increases 0.5019 0.4648 0.7391 0.6445 0.4026 | compound_ce_loss_increase 6.7294 | l0s 20.9785 15.4434 11.4092 12.7625 17.4459 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 750 | loss 0.0911 0.0592 0.0695 0.0659 0.0809 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0911 0.0592 0.0695 0.0659 0.0809 | ce_loss_increases 0.3855 0.3779 0.6303 0.5564 0.3562 | compound_ce_loss_increase 6.0386 | l0s 16.4252 12.9995 10.0678 12.0255 16.8596 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1000 | loss 0.0873 0.0566 0.0679 0.0647 0.0798 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0873 0.0566 0.0679 0.0647 0.0798 | ce_loss_increases 0.3577 0.3515 0.5893 0.5349 0.3448 | compound_ce_loss_increase 6.0544 | l0s 14.4741 11.8798 9.3197 11.4210 16.0594 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1250 | loss 0.0853 0.0553 0.0672 0.0641 0.0792 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0853 0.0553 0.0672 0.0641 0.0792 | ce_loss_increases 0.3470 0.3268 0.5839 0.5280 0.3418 | compound_ce_loss_increase 6.0430 | l0s 13.4529 11.0981 8.7687 10.9694 15.4237 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1500 | loss 0.0839 0.0546 0.0667 0.0638 0.0790 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0839 0.0546 0.0667 0.0638 0.0790 | ce_loss_increases 0.3545 0.3203 0.5773 0.5221 0.3391 | compound_ce_loss_increase 6.0592 | l0s 12.7778 10.5788 8.5156 10.7386 15.1051 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 1750 | loss 0.0830 0.0542 0.0664 0.0636 0.0787 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0830 0.0542 0.0664 0.0636 0.0787 | ce_loss_increases 0.3419 0.3094 0.5633 0.5101 0.3342 | compound_ce_loss_increase 5.9900 | l0s 12.3649 10.2321 8.3374 10.5805 14.8649 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2000 | loss 0.0822 0.0539 0.0662 0.0634 0.0786 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0822 0.0539 0.0662 0.0634 0.0786 | ce_loss_increases 0.3442 0.3079 0.5649 0.5127 0.3326 | compound_ce_loss_increase 5.9945 | l0s 12.0825 9.9626 8.1846 10.4395 14.6080 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2250 | loss 0.0817 0.0536 0.0660 0.0633 0.0785 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0817 0.0536 0.0660 0.0633 0.0785 | ce_loss_increases 0.3458 0.3092 0.5585 0.5023 0.3349 | compound_ce_loss_increase 6.0645 | l0s 11.7261 9.7345 8.0765 10.4374 14.4803 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2500 | loss 0.0813 0.0535 0.0659 0.0632 0.0784 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0813 0.0535 0.0659 0.0632 0.0784 | ce_loss_increases 0.3526 0.3144 0.5659 0.5087 0.3342 | compound_ce_loss_increase 6.0342 | l0s 11.4629 9.5473 7.9557 10.2523 14.2952 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 2750 | loss 0.0809 0.0533 0.0658 0.0631 0.0783 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0809 0.0533 0.0658 0.0631 0.0783 | ce_loss_increases 0.3478 0.3188 0.5623 0.5037 0.3301 | compound_ce_loss_increase 6.0714 | l0s 11.2984 9.4367 7.8699 10.2199 14.3144 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3000 | loss 0.0808 0.0533 0.0657 0.0631 0.0783 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0808 0.0533 0.0657 0.0631 0.0783 | ce_loss_increases 0.3467 0.3207 0.5589 0.5054 0.3311 | compound_ce_loss_increase 5.9939 | l0s 11.1300 9.3763 7.8633 10.1942 14.1981 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3250 | loss 0.0806 0.0532 0.0657 0.0631 0.0783 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0806 0.0532 0.0657 0.0631 0.0783 | ce_loss_increases 0.3462 0.3220 0.5614 0.5026 0.3320 | compound_ce_loss_increase 6.0097 | l0s 10.9300 9.2972 7.8040 10.1428 14.1252 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3500 | loss 0.0804 0.0531 0.0656 0.0631 0.0782 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0804 0.0531 0.0656 0.0631 0.0782 | ce_loss_increases 0.3419 0.3236 0.5580 0.5025 0.3329 | compound_ce_loss_increase 5.9775 | l0s 10.8840 9.2297 7.7851 10.0913 14.0759 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 3750 | loss 0.0803 0.0531 0.0656 0.0630 0.0782 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0803 0.0531 0.0656 0.0630 0.0782 | ce_loss_increases 0.3430 0.3211 0.5571 0.5061 0.3299 | compound_ce_loss_increase 5.9679 | l0s 10.8471 9.2199 7.7683 10.0314 14.0377 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4000 | loss 0.0803 0.0530 0.0656 0.0630 0.0782 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0803 0.0530 0.0656 0.0630 0.0782 | ce_loss_increases 0.3416 0.3237 0.5580 0.5034 0.3310 | compound_ce_loss_increase 5.9416 | l0s 10.8269 9.1926 7.7378 10.0352 14.0272 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4250 | loss 0.0802 0.0530 0.0655 0.0630 0.0782 | checkpoint True True True False True | ce_loss 2.7305 | sae_losses 0.0802 0.0530 0.0655 0.0630 0.0782 | ce_loss_increases 0.3424 0.3236 0.5598 0.5018 0.3312 | compound_ce_loss_increase 5.9847 | l0s 10.7967 9.1708 7.7368 10.0487 14.0572 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4500 | loss 0.0802 0.0530 0.0655 0.0630 0.0782 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0802 0.0530 0.0655 0.0630 0.0782 | ce_loss_increases 0.3437 0.3254 0.5597 0.5011 0.3304 | compound_ce_loss_increase 5.9399 | l0s 10.8001 9.1407 7.7135 10.0123 14.0348 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 4750 | loss 0.0801 0.0529 0.0655 0.0630 0.0782 | checkpoint True True True True True | ce_loss 2.7305 | sae_losses 0.0801 0.0529 0.0655 0.0630 0.0782 | ce_loss_increases 0.3436 0.3239 0.5609 0.5025 0.3287 | compound_ce_loss_increase 5.9415 | l0s 10.7744 9.1199 7.7017 10.0002 14.0069 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391 type eval | step 5000 | loss 0.0801 0.0529 0.0655 0.0630 0.0782 | checkpoint True True True True False | ce_loss 2.7305 | sae_losses 0.0801 0.0529 0.0655 0.0630 0.0782 | ce_loss_increases 0.3456 0.3268 0.5580 0.5014 0.3306 | compound_ce_loss_increase 5.9570 | l0s 10.6992 9.0957 7.7161 10.0124 14.0342 | stream_l1s 4.4510 3.2427 2.6543 2.1799 2.1391