diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,8 +1,8 @@ { - "best_metric": 6.580160140991211, - "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-2200", - "epoch": 0.02138874035596975, - "global_step": 2800, + "best_metric": 6.423073768615723, + "best_model_checkpoint": "./output_v2/34bCodellama_CodeLlama-34b-Python-hf_unnatural-instructions_standardized/checkpoint-3200", + "epoch": 0.02902757619738752, + "global_step": 3800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -17067,11 +17067,6110 @@ "train_runtime": 7855.0688, "train_samples_per_second": 3.819, "train_steps_per_second": 3.819 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.0757, + "step": 2801 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.8944, + "step": 2802 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.8365, + "step": 2803 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.873, + "step": 2804 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.3793, + "step": 2805 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.1923, + "step": 2806 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.2306, + "step": 2807 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.4439, + "step": 2808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.3377, + "step": 2809 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.8737, + "step": 2810 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.4191, + "step": 2811 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.892, + "step": 2812 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.4568, + "step": 2813 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.0652, + "step": 2814 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.6912, + "step": 2815 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.9033, + "step": 2816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.4433, + "step": 2817 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.7591, + "step": 2818 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.4458, + "step": 2819 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.3721, + "step": 2820 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.4111, + "step": 2821 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.0999, + "step": 2822 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.5154, + "step": 2823 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.1967, + "step": 2824 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.8437, + "step": 2825 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.759, + "step": 2826 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.6223, + "step": 2827 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.3442, + "step": 2828 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.1991, + "step": 2829 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.5302, + "step": 2830 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.1725, + "step": 2831 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.8107, + "step": 2832 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.7891, + "step": 2833 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.5592, + "step": 2834 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.8792, + "step": 2835 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.2387, + "step": 2836 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.9369, + "step": 2837 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.2098, + "step": 2838 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.6645, + "step": 2839 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.2349, + "step": 2840 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.8613, + "step": 2841 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.5482, + "step": 2842 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.5438, + "step": 2843 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.6133, + "step": 2844 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.9983, + "step": 2845 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.8706, + "step": 2846 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.9685, + "step": 2847 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.04, + "step": 2848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.6498, + "step": 2849 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.6786, + "step": 2850 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.789, + "step": 2851 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 10.1116, + "step": 2852 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.7085, + "step": 2853 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.1083, + "step": 2854 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.0795, + "step": 2855 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.8677, + "step": 2856 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.1663, + "step": 2857 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.5379, + "step": 2858 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.4923, + "step": 2859 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.1456, + "step": 2860 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.1686, + "step": 2861 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.4153, + "step": 2862 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.17, + "step": 2863 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.3301, + "step": 2864 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.7106, + "step": 2865 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.463, + "step": 2866 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.614, + "step": 2867 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.1451, + "step": 2868 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.6817, + "step": 2869 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.9577, + "step": 2870 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.6171, + "step": 2871 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.5129, + "step": 2872 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.3386, + "step": 2873 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.1176, + "step": 2874 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.9075, + "step": 2875 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.667, + "step": 2876 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.8097, + "step": 2877 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.7707, + "step": 2878 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.7571, + "step": 2879 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.0732, + "step": 2880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.5586, + "step": 2881 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.8045, + "step": 2882 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.4442, + "step": 2883 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.418, + "step": 2884 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.7253, + "step": 2885 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.4227, + "step": 2886 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.9878, + "step": 2887 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.8351, + "step": 2888 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.1715, + "step": 2889 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.1207, + "step": 2890 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.0396, + "step": 2891 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.7162, + "step": 2892 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.2566, + "step": 2893 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.4087, + "step": 2894 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.4509, + "step": 2895 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.8165, + "step": 2896 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.9842, + "step": 2897 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.574, + "step": 2898 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.4512, + "step": 2899 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.9904, + "step": 2900 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.6141, + "step": 2901 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.9528, + "step": 2902 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.9028, + "step": 2903 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.3868, + "step": 2904 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.0461, + "step": 2905 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.5161, + "step": 2906 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.107, + "step": 2907 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.7174, + "step": 2908 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.7671, + "step": 2909 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.6803, + "step": 2910 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.5357, + "step": 2911 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.6898, + "step": 2912 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.8564, + "step": 2913 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.1457, + "step": 2914 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.3408, + "step": 2915 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.6095, + "step": 2916 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.866, + "step": 2917 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.7324, + "step": 2918 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.4026, + "step": 2919 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.1467, + "step": 2920 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.2496, + "step": 2921 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.5265, + "step": 2922 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.8246, + "step": 2923 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.5775, + "step": 2924 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.2886, + "step": 2925 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.3076, + "step": 2926 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.7975, + "step": 2927 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.9898, + "step": 2928 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.7256, + "step": 2929 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.7971, + "step": 2930 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.5884, + "step": 2931 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.0749, + "step": 2932 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.6183, + "step": 2933 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.0019, + "step": 2934 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.1706, + "step": 2935 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.4075, + "step": 2936 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.4092, + "step": 2937 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.9424, + "step": 2938 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.9025, + "step": 2939 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.7751, + "step": 2940 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.988, + "step": 2941 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.1208, + "step": 2942 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.1999, + "step": 2943 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.2129, + "step": 2944 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.4431, + "step": 2945 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.1979, + "step": 2946 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.8246, + "step": 2947 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.4876, + "step": 2948 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.7158, + "step": 2949 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.3611, + "step": 2950 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.9919, + "step": 2951 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.4136, + "step": 2952 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.801, + "step": 2953 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.6503, + "step": 2954 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.553, + "step": 2955 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.3536, + "step": 2956 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.8516, + "step": 2957 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 9.9344, + "step": 2958 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.8727, + "step": 2959 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.9155, + "step": 2960 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.9684, + "step": 2961 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.0399, + "step": 2962 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.4298, + "step": 2963 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.4559, + "step": 2964 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.0361, + "step": 2965 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.0081, + "step": 2966 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.6698, + "step": 2967 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.3355, + "step": 2968 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.7555, + "step": 2969 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.29, + "step": 2970 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.4079, + "step": 2971 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.0588, + "step": 2972 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.2956, + "step": 2973 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.7318, + "step": 2974 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.8846, + "step": 2975 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.5159, + "step": 2976 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.7629, + "step": 2977 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.2039, + "step": 2978 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.311, + "step": 2979 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.9786, + "step": 2980 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.7138, + "step": 2981 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.4827, + "step": 2982 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.5264, + "step": 2983 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.8153, + "step": 2984 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.3512, + "step": 2985 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.1515, + "step": 2986 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.1689, + "step": 2987 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.8027, + "step": 2988 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.7472, + "step": 2989 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.0236, + "step": 2990 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.1741, + "step": 2991 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.8884, + "step": 2992 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.3912, + "step": 2993 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.2109, + "step": 2994 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.1693, + "step": 2995 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.8166, + "step": 2996 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.4902, + "step": 2997 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.3921, + "step": 2998 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.8787, + "step": 2999 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.1304, + "step": 3000 + }, + { + "epoch": 0.02, + "eval_loss": 6.659167289733887, + "eval_runtime": 22.4512, + "eval_samples_per_second": 2.227, + "eval_steps_per_second": 1.114, + "step": 3000 + }, + { + "epoch": 0.02, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 3.973116703033447, + "step": 3000 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.4366, + "step": 3001 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.1239, + "step": 3002 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.419, + "step": 3003 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.7662, + "step": 3004 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.1246, + "step": 3005 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.3498, + "step": 3006 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.1245, + "step": 3007 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.6488, + "step": 3008 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.3641, + "step": 3009 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.7842, + "step": 3010 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.802, + "step": 3011 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.1015, + "step": 3012 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.9015, + "step": 3013 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.8768, + "step": 3014 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.7296, + "step": 3015 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.4175, + "step": 3016 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.3568, + "step": 3017 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.5869, + "step": 3018 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.5304, + "step": 3019 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.1129, + "step": 3020 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.8128, + "step": 3021 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.1541, + "step": 3022 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.3002, + "step": 3023 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.0616, + "step": 3024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.3152, + "step": 3025 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.4433, + "step": 3026 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.8406, + "step": 3027 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.2653, + "step": 3028 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.7041, + "step": 3029 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.3463, + "step": 3030 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.7164, + "step": 3031 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.9969, + "step": 3032 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.1306, + "step": 3033 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.0204, + "step": 3034 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.6743, + "step": 3035 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.3186, + "step": 3036 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.5673, + "step": 3037 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.1416, + "step": 3038 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.1956, + "step": 3039 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.6836, + "step": 3040 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.0172, + "step": 3041 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.796, + "step": 3042 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.6945, + "step": 3043 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.5079, + "step": 3044 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.399, + "step": 3045 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.0071, + "step": 3046 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.4171, + "step": 3047 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.0925, + "step": 3048 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.6842, + "step": 3049 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.2442, + "step": 3050 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.8023, + "step": 3051 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.7147, + "step": 3052 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.9681, + "step": 3053 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 9.1882, + "step": 3054 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.9869, + "step": 3055 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.0705, + "step": 3056 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.8802, + "step": 3057 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.8399, + "step": 3058 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.6438, + "step": 3059 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.0608, + "step": 3060 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.763, + "step": 3061 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.355, + "step": 3062 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.5504, + "step": 3063 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.1582, + "step": 3064 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.1922, + "step": 3065 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.0706, + "step": 3066 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.807, + "step": 3067 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.0305, + "step": 3068 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.0774, + "step": 3069 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.4469, + "step": 3070 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.1158, + "step": 3071 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.8087, + "step": 3072 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.5672, + "step": 3073 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.5864, + "step": 3074 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.7665, + "step": 3075 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.2434, + "step": 3076 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.3988, + "step": 3077 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.0307, + "step": 3078 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.6561, + "step": 3079 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.8158, + "step": 3080 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.8384, + "step": 3081 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.5515, + "step": 3082 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.8108, + "step": 3083 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.2536, + "step": 3084 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.2506, + "step": 3085 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.1605, + "step": 3086 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.4572, + "step": 3087 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.3312, + "step": 3088 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.1481, + "step": 3089 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.3304, + "step": 3090 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.2617, + "step": 3091 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.3148, + "step": 3092 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.4781, + "step": 3093 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.327, + "step": 3094 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.3288, + "step": 3095 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.2354, + "step": 3096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.4802, + "step": 3097 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 9.1659, + "step": 3098 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 1.9887, + "step": 3099 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 1.9497, + "step": 3100 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.2668, + "step": 3101 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.458, + "step": 3102 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.9919, + "step": 3103 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.0408, + "step": 3104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.9966, + "step": 3105 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 9.1371, + "step": 3106 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.0939, + "step": 3107 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.2532, + "step": 3108 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.7343, + "step": 3109 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.156, + "step": 3110 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.2223, + "step": 3111 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.6182, + "step": 3112 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.4423, + "step": 3113 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.3855, + "step": 3114 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.2115, + "step": 3115 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.6719, + "step": 3116 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.5285, + "step": 3117 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.0171, + "step": 3118 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.2127, + "step": 3119 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.8153, + "step": 3120 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.1838, + "step": 3121 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.711, + "step": 3122 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.1568, + "step": 3123 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.3741, + "step": 3124 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.2741, + "step": 3125 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.1653, + "step": 3126 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.9722, + "step": 3127 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.9181, + "step": 3128 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.384, + "step": 3129 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.1491, + "step": 3130 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.8641, + "step": 3131 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.6125, + "step": 3132 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.1702, + "step": 3133 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.4853, + "step": 3134 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.7929, + "step": 3135 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.8747, + "step": 3136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.2659, + "step": 3137 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.0685, + "step": 3138 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.2297, + "step": 3139 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.0715, + "step": 3140 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.2201, + "step": 3141 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.2134, + "step": 3142 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.6098, + "step": 3143 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.2036, + "step": 3144 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.2481, + "step": 3145 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.4508, + "step": 3146 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.1454, + "step": 3147 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.7573, + "step": 3148 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.2946, + "step": 3149 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.0121, + "step": 3150 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.498, + "step": 3151 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.4016, + "step": 3152 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.5934, + "step": 3153 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.737, + "step": 3154 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.9874, + "step": 3155 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.7714, + "step": 3156 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.3529, + "step": 3157 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.7673, + "step": 3158 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.3835, + "step": 3159 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.0336, + "step": 3160 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.2998, + "step": 3161 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.0446, + "step": 3162 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.5225, + "step": 3163 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.1391, + "step": 3164 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.7669, + "step": 3165 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.118, + "step": 3166 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.7045, + "step": 3167 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.077, + "step": 3168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.0697, + "step": 3169 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.8445, + "step": 3170 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.4044, + "step": 3171 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.9817, + "step": 3172 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.2508, + "step": 3173 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.6302, + "step": 3174 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.46, + "step": 3175 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.8094, + "step": 3176 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.1174, + "step": 3177 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.5756, + "step": 3178 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.4215, + "step": 3179 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.5957, + "step": 3180 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.3534, + "step": 3181 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.9342, + "step": 3182 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.8227, + "step": 3183 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.1404, + "step": 3184 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.9897, + "step": 3185 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.7286, + "step": 3186 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.5414, + "step": 3187 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.7523, + "step": 3188 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.4033, + "step": 3189 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.8593, + "step": 3190 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.6095, + "step": 3191 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.504, + "step": 3192 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.6776, + "step": 3193 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.0174, + "step": 3194 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.7759, + "step": 3195 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.5685, + "step": 3196 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.2786, + "step": 3197 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.7794, + "step": 3198 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.5827, + "step": 3199 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.3282, + "step": 3200 + }, + { + "epoch": 0.02, + "eval_loss": 6.423073768615723, + "eval_runtime": 22.4644, + "eval_samples_per_second": 2.226, + "eval_steps_per_second": 1.113, + "step": 3200 + }, + { + "epoch": 0.02, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 3.9956862831115725, + "step": 3200 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.3005, + "step": 3201 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.0131, + "step": 3202 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.9222, + "step": 3203 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.6755, + "step": 3204 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.8386, + "step": 3205 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.7854, + "step": 3206 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.4073, + "step": 3207 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.4278, + "step": 3208 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.5322, + "step": 3209 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.1508, + "step": 3210 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.4289, + "step": 3211 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 9.0088, + "step": 3212 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.6227, + "step": 3213 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.232, + "step": 3214 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.4584, + "step": 3215 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.8538, + "step": 3216 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.1044, + "step": 3217 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.3851, + "step": 3218 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.7661, + "step": 3219 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.4478, + "step": 3220 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.0658, + "step": 3221 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.0961, + "step": 3222 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.5636, + "step": 3223 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.6945, + "step": 3224 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.5381, + "step": 3225 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.2431, + "step": 3226 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.3755, + "step": 3227 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.4123, + "step": 3228 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.0414, + "step": 3229 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.6732, + "step": 3230 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.8904, + "step": 3231 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.7829, + "step": 3232 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.8394, + "step": 3233 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.2565, + "step": 3234 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.7535, + "step": 3235 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.8878, + "step": 3236 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.9086, + "step": 3237 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.7491, + "step": 3238 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.0775, + "step": 3239 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.3575, + "step": 3240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.7583, + "step": 3241 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.7107, + "step": 3242 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.3212, + "step": 3243 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.5662, + "step": 3244 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.9553, + "step": 3245 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.3713, + "step": 3246 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.4519, + "step": 3247 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.0003, + "step": 3248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.2796, + "step": 3249 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.3262, + "step": 3250 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.0106, + "step": 3251 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.8502, + "step": 3252 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.3491, + "step": 3253 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 2.2728, + "step": 3254 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.822, + "step": 3255 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.6077, + "step": 3256 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.9328, + "step": 3257 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.8518, + "step": 3258 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.6541, + "step": 3259 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.6976, + "step": 3260 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 8.9285, + "step": 3261 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.9624, + "step": 3262 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.6754, + "step": 3263 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.4891, + "step": 3264 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.7557, + "step": 3265 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.9059, + "step": 3266 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 6.2432, + "step": 3267 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 5.9467, + "step": 3268 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 7.6248, + "step": 3269 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 3.6632, + "step": 3270 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.5671, + "step": 3271 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004, + "loss": 4.9754, + "step": 3272 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2562, + "step": 3273 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6304, + "step": 3274 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2176, + "step": 3275 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0867, + "step": 3276 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6769, + "step": 3277 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1474, + "step": 3278 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5615, + "step": 3279 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3285, + "step": 3280 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2536, + "step": 3281 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1851, + "step": 3282 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1866, + "step": 3283 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.26, + "step": 3284 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1826, + "step": 3285 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3643, + "step": 3286 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5651, + "step": 3287 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7203, + "step": 3288 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7083, + "step": 3289 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1364, + "step": 3290 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1367, + "step": 3291 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5968, + "step": 3292 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.287, + "step": 3293 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5756, + "step": 3294 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3841, + "step": 3295 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3323, + "step": 3296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0486, + "step": 3297 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6838, + "step": 3298 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5777, + "step": 3299 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.4593, + "step": 3300 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9308, + "step": 3301 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6334, + "step": 3302 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6559, + "step": 3303 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7031, + "step": 3304 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.2146, + "step": 3305 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.6907, + "step": 3306 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8058, + "step": 3307 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7805, + "step": 3308 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.557, + "step": 3309 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2331, + "step": 3310 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1702, + "step": 3311 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2958, + "step": 3312 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6121, + "step": 3313 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9686, + "step": 3314 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0866, + "step": 3315 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2124, + "step": 3316 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9969, + "step": 3317 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3417, + "step": 3318 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7311, + "step": 3319 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8008, + "step": 3320 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2469, + "step": 3321 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4888, + "step": 3322 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2527, + "step": 3323 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7326, + "step": 3324 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4268, + "step": 3325 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3635, + "step": 3326 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8848, + "step": 3327 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2306, + "step": 3328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4185, + "step": 3329 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0911, + "step": 3330 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0277, + "step": 3331 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4066, + "step": 3332 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9085, + "step": 3333 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2858, + "step": 3334 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4789, + "step": 3335 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8016, + "step": 3336 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9443, + "step": 3337 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8456, + "step": 3338 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0709, + "step": 3339 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7879, + "step": 3340 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6286, + "step": 3341 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.7528, + "step": 3342 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4875, + "step": 3343 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4383, + "step": 3344 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7478, + "step": 3345 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8581, + "step": 3346 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4681, + "step": 3347 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.179, + "step": 3348 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.772, + "step": 3349 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5266, + "step": 3350 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.962, + "step": 3351 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1986, + "step": 3352 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8946, + "step": 3353 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8563, + "step": 3354 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0887, + "step": 3355 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9487, + "step": 3356 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9164, + "step": 3357 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6243, + "step": 3358 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1646, + "step": 3359 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3753, + "step": 3360 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6102, + "step": 3361 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8946, + "step": 3362 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1349, + "step": 3363 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7417, + "step": 3364 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4195, + "step": 3365 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0547, + "step": 3366 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3341, + "step": 3367 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4345, + "step": 3368 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.3088, + "step": 3369 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7109, + "step": 3370 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6865, + "step": 3371 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5466, + "step": 3372 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.4801, + "step": 3373 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7442, + "step": 3374 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0129, + "step": 3375 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2859, + "step": 3376 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0064, + "step": 3377 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1277, + "step": 3378 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9071, + "step": 3379 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3575, + "step": 3380 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7016, + "step": 3381 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.151, + "step": 3382 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4008, + "step": 3383 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3565, + "step": 3384 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3863, + "step": 3385 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7621, + "step": 3386 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3799, + "step": 3387 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8051, + "step": 3388 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1864, + "step": 3389 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4464, + "step": 3390 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.649, + "step": 3391 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7819, + "step": 3392 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4867, + "step": 3393 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0732, + "step": 3394 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2601, + "step": 3395 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8808, + "step": 3396 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9649, + "step": 3397 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3081, + "step": 3398 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7762, + "step": 3399 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0978, + "step": 3400 + }, + { + "epoch": 0.03, + "eval_loss": 6.543734073638916, + "eval_runtime": 22.8771, + "eval_samples_per_second": 2.186, + "eval_steps_per_second": 1.093, + "step": 3400 + }, + { + "epoch": 0.03, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 3.872052774429321, + "step": 3400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7529, + "step": 3401 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2717, + "step": 3402 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.9914, + "step": 3403 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1987, + "step": 3404 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.68, + "step": 3405 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9154, + "step": 3406 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5814, + "step": 3407 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3381, + "step": 3408 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1775, + "step": 3409 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9658, + "step": 3410 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6455, + "step": 3411 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3931, + "step": 3412 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3601, + "step": 3413 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4806, + "step": 3414 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2846, + "step": 3415 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3628, + "step": 3416 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7838, + "step": 3417 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0117, + "step": 3418 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1784, + "step": 3419 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.925, + "step": 3420 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6457, + "step": 3421 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5869, + "step": 3422 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8333, + "step": 3423 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.1381, + "step": 3424 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8135, + "step": 3425 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5245, + "step": 3426 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0547, + "step": 3427 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7663, + "step": 3428 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1778, + "step": 3429 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.4051, + "step": 3430 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1333, + "step": 3431 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3353, + "step": 3432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.3009, + "step": 3433 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.7293, + "step": 3434 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5752, + "step": 3435 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3992, + "step": 3436 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8915, + "step": 3437 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4496, + "step": 3438 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8425, + "step": 3439 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.7227, + "step": 3440 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4708, + "step": 3441 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.923, + "step": 3442 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5959, + "step": 3443 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9186, + "step": 3444 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.2626, + "step": 3445 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2608, + "step": 3446 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9251, + "step": 3447 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0449, + "step": 3448 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0141, + "step": 3449 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3608, + "step": 3450 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9488, + "step": 3451 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7488, + "step": 3452 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6528, + "step": 3453 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9995, + "step": 3454 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.7377, + "step": 3455 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6786, + "step": 3456 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2383, + "step": 3457 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0354, + "step": 3458 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9902, + "step": 3459 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.7258, + "step": 3460 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5172, + "step": 3461 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.988, + "step": 3462 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4772, + "step": 3463 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5068, + "step": 3464 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8593, + "step": 3465 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9398, + "step": 3466 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0332, + "step": 3467 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7405, + "step": 3468 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6347, + "step": 3469 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0263, + "step": 3470 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6087, + "step": 3471 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7333, + "step": 3472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1251, + "step": 3473 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5823, + "step": 3474 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1494, + "step": 3475 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3429, + "step": 3476 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4662, + "step": 3477 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5451, + "step": 3478 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.972, + "step": 3479 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6314, + "step": 3480 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8037, + "step": 3481 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3445, + "step": 3482 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4575, + "step": 3483 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0294, + "step": 3484 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3406, + "step": 3485 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3541, + "step": 3486 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2848, + "step": 3487 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2269, + "step": 3488 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2968, + "step": 3489 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1864, + "step": 3490 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9224, + "step": 3491 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6688, + "step": 3492 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.19, + "step": 3493 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4404, + "step": 3494 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.901, + "step": 3495 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1823, + "step": 3496 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7348, + "step": 3497 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7604, + "step": 3498 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2457, + "step": 3499 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1082, + "step": 3500 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6028, + "step": 3501 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8972, + "step": 3502 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0901, + "step": 3503 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1925, + "step": 3504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2407, + "step": 3505 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2848, + "step": 3506 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1169, + "step": 3507 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2106, + "step": 3508 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7564, + "step": 3509 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6047, + "step": 3510 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3407, + "step": 3511 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9848, + "step": 3512 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7671, + "step": 3513 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6396, + "step": 3514 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.699, + "step": 3515 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2749, + "step": 3516 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5383, + "step": 3517 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.4338, + "step": 3518 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3005, + "step": 3519 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1728, + "step": 3520 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0289, + "step": 3521 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5751, + "step": 3522 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0577, + "step": 3523 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6819, + "step": 3524 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5446, + "step": 3525 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0514, + "step": 3526 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6815, + "step": 3527 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6218, + "step": 3528 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4958, + "step": 3529 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.9706, + "step": 3530 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9316, + "step": 3531 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.479, + "step": 3532 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2855, + "step": 3533 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2544, + "step": 3534 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9837, + "step": 3535 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.978, + "step": 3536 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7618, + "step": 3537 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5827, + "step": 3538 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8986, + "step": 3539 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9653, + "step": 3540 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8958, + "step": 3541 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4007, + "step": 3542 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0079, + "step": 3543 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.1045, + "step": 3544 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0049, + "step": 3545 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7175, + "step": 3546 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.4449, + "step": 3547 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.8293, + "step": 3548 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9874, + "step": 3549 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6799, + "step": 3550 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4073, + "step": 3551 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2746, + "step": 3552 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4299, + "step": 3553 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7892, + "step": 3554 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.123, + "step": 3555 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.6298, + "step": 3556 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3416, + "step": 3557 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.746, + "step": 3558 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8938, + "step": 3559 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6562, + "step": 3560 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7783, + "step": 3561 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.291, + "step": 3562 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1611, + "step": 3563 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4299, + "step": 3564 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.341, + "step": 3565 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0554, + "step": 3566 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3736, + "step": 3567 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4401, + "step": 3568 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0594, + "step": 3569 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1593, + "step": 3570 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3663, + "step": 3571 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6513, + "step": 3572 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0796, + "step": 3573 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.593, + "step": 3574 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9869, + "step": 3575 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4861, + "step": 3576 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4145, + "step": 3577 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9537, + "step": 3578 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3876, + "step": 3579 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0458, + "step": 3580 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1648, + "step": 3581 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0825, + "step": 3582 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1962, + "step": 3583 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9999, + "step": 3584 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8877, + "step": 3585 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.387, + "step": 3586 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3146, + "step": 3587 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2283, + "step": 3588 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1697, + "step": 3589 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1183, + "step": 3590 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7817, + "step": 3591 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.103, + "step": 3592 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2412, + "step": 3593 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9878, + "step": 3594 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3351, + "step": 3595 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7125, + "step": 3596 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0872, + "step": 3597 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.6162, + "step": 3598 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.699, + "step": 3599 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7063, + "step": 3600 + }, + { + "epoch": 0.03, + "eval_loss": 6.505133152008057, + "eval_runtime": 22.6179, + "eval_samples_per_second": 2.211, + "eval_steps_per_second": 1.105, + "step": 3600 + }, + { + "epoch": 0.03, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 4.059268207550049, + "step": 3600 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.79, + "step": 3601 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9946, + "step": 3602 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5461, + "step": 3603 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5258, + "step": 3604 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7346, + "step": 3605 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3883, + "step": 3606 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5788, + "step": 3607 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2204, + "step": 3608 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1632, + "step": 3609 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7381, + "step": 3610 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1473, + "step": 3611 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9152, + "step": 3612 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4622, + "step": 3613 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.6216, + "step": 3614 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6317, + "step": 3615 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4409, + "step": 3616 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0236, + "step": 3617 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8894, + "step": 3618 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0822, + "step": 3619 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.953, + "step": 3620 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1938, + "step": 3621 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3474, + "step": 3622 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4535, + "step": 3623 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3665, + "step": 3624 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.4139, + "step": 3625 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.6508, + "step": 3626 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5838, + "step": 3627 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9303, + "step": 3628 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3722, + "step": 3629 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4449, + "step": 3630 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.3978, + "step": 3631 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5792, + "step": 3632 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.4508, + "step": 3633 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.5883, + "step": 3634 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1239, + "step": 3635 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2981, + "step": 3636 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7042, + "step": 3637 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1361, + "step": 3638 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9068, + "step": 3639 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3976, + "step": 3640 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0685, + "step": 3641 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5028, + "step": 3642 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0945, + "step": 3643 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0476, + "step": 3644 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7249, + "step": 3645 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8524, + "step": 3646 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1725, + "step": 3647 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1051, + "step": 3648 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7044, + "step": 3649 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0646, + "step": 3650 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.5899, + "step": 3651 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2098, + "step": 3652 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2737, + "step": 3653 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7012, + "step": 3654 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.392, + "step": 3655 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1024, + "step": 3656 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8084, + "step": 3657 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0021, + "step": 3658 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.0005, + "step": 3659 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0075, + "step": 3660 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0208, + "step": 3661 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1425, + "step": 3662 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2731, + "step": 3663 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0704, + "step": 3664 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.7608, + "step": 3665 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.126, + "step": 3666 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1159, + "step": 3667 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.301, + "step": 3668 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7234, + "step": 3669 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.9938, + "step": 3670 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1246, + "step": 3671 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2179, + "step": 3672 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4044, + "step": 3673 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0196, + "step": 3674 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5111, + "step": 3675 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6913, + "step": 3676 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3374, + "step": 3677 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7363, + "step": 3678 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.0937, + "step": 3679 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.691, + "step": 3680 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.1534, + "step": 3681 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2765, + "step": 3682 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.635, + "step": 3683 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4933, + "step": 3684 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3922, + "step": 3685 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2925, + "step": 3686 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.646, + "step": 3687 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.2651, + "step": 3688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7927, + "step": 3689 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.7202, + "step": 3690 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5883, + "step": 3691 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1781, + "step": 3692 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5795, + "step": 3693 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.1326, + "step": 3694 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.2378, + "step": 3695 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.462, + "step": 3696 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.7268, + "step": 3697 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9041, + "step": 3698 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8232, + "step": 3699 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 1.7176, + "step": 3700 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.854, + "step": 3701 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 1.6464, + "step": 3702 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.264, + "step": 3703 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8032, + "step": 3704 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.7097, + "step": 3705 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.068, + "step": 3706 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3674, + "step": 3707 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5067, + "step": 3708 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.4841, + "step": 3709 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.2839, + "step": 3710 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0894, + "step": 3711 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5614, + "step": 3712 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.7785, + "step": 3713 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8865, + "step": 3714 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0697, + "step": 3715 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0635, + "step": 3716 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0788, + "step": 3717 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8854, + "step": 3718 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3054, + "step": 3719 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7088, + "step": 3720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8188, + "step": 3721 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4379, + "step": 3722 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8049, + "step": 3723 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4344, + "step": 3724 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.647, + "step": 3725 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5758, + "step": 3726 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9208, + "step": 3727 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.4727, + "step": 3728 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2176, + "step": 3729 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0123, + "step": 3730 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 9.404, + "step": 3731 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.0537, + "step": 3732 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.8559, + "step": 3733 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2705, + "step": 3734 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4536, + "step": 3735 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.5407, + "step": 3736 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7118, + "step": 3737 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.4877, + "step": 3738 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1927, + "step": 3739 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.3252, + "step": 3740 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.9464, + "step": 3741 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.3133, + "step": 3742 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.427, + "step": 3743 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8029, + "step": 3744 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8428, + "step": 3745 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3284, + "step": 3746 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.2628, + "step": 3747 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.466, + "step": 3748 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.3269, + "step": 3749 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.1805, + "step": 3750 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.5579, + "step": 3751 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.581, + "step": 3752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.0164, + "step": 3753 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.9736, + "step": 3754 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.8441, + "step": 3755 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3202, + "step": 3756 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.9395, + "step": 3757 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.2659, + "step": 3758 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5779, + "step": 3759 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.1052, + "step": 3760 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.0075, + "step": 3761 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.5628, + "step": 3762 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7005, + "step": 3763 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.979, + "step": 3764 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.9874, + "step": 3765 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.8137, + "step": 3766 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1964, + "step": 3767 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.9047, + "step": 3768 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2724, + "step": 3769 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.8014, + "step": 3770 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.7738, + "step": 3771 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.926, + "step": 3772 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.94, + "step": 3773 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.1657, + "step": 3774 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9987, + "step": 3775 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5263, + "step": 3776 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.0671, + "step": 3777 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.1628, + "step": 3778 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4194, + "step": 3779 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5225, + "step": 3780 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.3863, + "step": 3781 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.7951, + "step": 3782 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5864, + "step": 3783 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.177, + "step": 3784 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.2334, + "step": 3785 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.8444, + "step": 3786 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.7108, + "step": 3787 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6521, + "step": 3788 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.4604, + "step": 3789 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.12, + "step": 3790 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 6.3275, + "step": 3791 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.9743, + "step": 3792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 5.6064, + "step": 3793 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.6108, + "step": 3794 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 3.0104, + "step": 3795 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.2739, + "step": 3796 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 2.8316, + "step": 3797 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 7.5044, + "step": 3798 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 8.838, + "step": 3799 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004, + "loss": 4.5967, + "step": 3800 + }, + { + "epoch": 0.03, + "eval_loss": 6.464095592498779, + "eval_runtime": 22.5612, + "eval_samples_per_second": 2.216, + "eval_steps_per_second": 1.108, + "step": 3800 + }, + { + "epoch": 0.03, + "mmlu_eval_accuracy": 0.2525477994227994, + "mmlu_eval_accuracy_abstract_algebra": 0.18181818181818182, + "mmlu_eval_accuracy_anatomy": 0.07142857142857142, + "mmlu_eval_accuracy_astronomy": 0.3125, + "mmlu_eval_accuracy_business_ethics": 0.4444444444444444, + "mmlu_loss": 3.589159059524536, + "step": 3800 + }, + { + "epoch": 0.03, + "step": 3800, + "total_flos": 6.198225528943411e+16, + "train_loss": 1.5118552861401908, + "train_runtime": 2842.6735, + "train_samples_per_second": 10.553, + "train_steps_per_second": 10.553 } ], "max_steps": 30000, "num_train_epochs": 1, - "total_flos": 4.660001608148582e+16, + "total_flos": 6.198225528943411e+16, "trial_name": null, "trial_params": null }