Plofski commited on
Commit
50052b6
·
verified ·
1 Parent(s): 6ef2d11

Training in progress, step 3500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6c549f8867ec3aa46fdb16d23e60b8f0ae222fa21f2d19da894e88c1f3b09c3
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f96aaa5e97f3f83387afc0775efd5e922752a17138c7276a9efe7c9ff0bbeee
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53021cc365fb35689cf7935de3e1b4f7d09b54591f9f81b2f7c83736f1ee6045
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:451881f3cab07a4e85e5f970801619f2d6aa94fada708d3b827ca3fafa636054
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:852ff1feb145f352899b6aa5117c88c8890d68604ca6bf2baf1e72eb1508c72e
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cae1361ad95b650252f8194ff20a5669981349cd4f0f59f3528fb4497ea319b8
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6044731009470079,
6
  "eval_steps": 500,
7
- "global_step": 3000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2708,6 +2708,456 @@
2708
  "mean_token_accuracy": 0.7911386549472809,
2709
  "num_tokens": 3316348.0,
2710
  "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2711
  }
2712
  ],
2713
  "logging_steps": 10,
@@ -2727,7 +3177,7 @@
2727
  "attributes": {}
2728
  }
2729
  },
2730
- "total_flos": 4014132187054080.0,
2731
  "train_batch_size": 8,
2732
  "trial_name": null,
2733
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.7052186177715092,
6
  "eval_steps": 500,
7
+ "global_step": 3500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2708
  "mean_token_accuracy": 0.7911386549472809,
2709
  "num_tokens": 3316348.0,
2710
  "step": 3000
2711
+ },
2712
+ {
2713
+ "epoch": 0.6064880112834978,
2714
+ "grad_norm": 11.25,
2715
+ "learning_rate": 1.595808986500101e-05,
2716
+ "loss": 0.9079,
2717
+ "mean_token_accuracy": 0.7789463937282562,
2718
+ "num_tokens": 3326996.0,
2719
+ "step": 3010
2720
+ },
2721
+ {
2722
+ "epoch": 0.6085029216199879,
2723
+ "grad_norm": 11.75,
2724
+ "learning_rate": 1.5944657129424407e-05,
2725
+ "loss": 0.8986,
2726
+ "mean_token_accuracy": 0.7775606334209442,
2727
+ "num_tokens": 3339244.0,
2728
+ "step": 3020
2729
+ },
2730
+ {
2731
+ "epoch": 0.6105178319564779,
2732
+ "grad_norm": 11.1875,
2733
+ "learning_rate": 1.593122439384781e-05,
2734
+ "loss": 0.8756,
2735
+ "mean_token_accuracy": 0.7890569746494294,
2736
+ "num_tokens": 3349636.0,
2737
+ "step": 3030
2738
+ },
2739
+ {
2740
+ "epoch": 0.612532742292968,
2741
+ "grad_norm": 11.9375,
2742
+ "learning_rate": 1.591779165827121e-05,
2743
+ "loss": 0.8937,
2744
+ "mean_token_accuracy": 0.7829440474510193,
2745
+ "num_tokens": 3360711.0,
2746
+ "step": 3040
2747
+ },
2748
+ {
2749
+ "epoch": 0.614547652629458,
2750
+ "grad_norm": 10.25,
2751
+ "learning_rate": 1.5904358922694607e-05,
2752
+ "loss": 0.9303,
2753
+ "mean_token_accuracy": 0.7729782402515412,
2754
+ "num_tokens": 3372520.0,
2755
+ "step": 3050
2756
+ },
2757
+ {
2758
+ "epoch": 0.616562562965948,
2759
+ "grad_norm": 10.625,
2760
+ "learning_rate": 1.5890926187118006e-05,
2761
+ "loss": 0.9247,
2762
+ "mean_token_accuracy": 0.7741464018821717,
2763
+ "num_tokens": 3384007.0,
2764
+ "step": 3060
2765
+ },
2766
+ {
2767
+ "epoch": 0.6185774733024381,
2768
+ "grad_norm": 9.75,
2769
+ "learning_rate": 1.5877493451541408e-05,
2770
+ "loss": 0.7869,
2771
+ "mean_token_accuracy": 0.8034534513950348,
2772
+ "num_tokens": 3395421.0,
2773
+ "step": 3070
2774
+ },
2775
+ {
2776
+ "epoch": 0.6205923836389281,
2777
+ "grad_norm": 12.0,
2778
+ "learning_rate": 1.5864060715964807e-05,
2779
+ "loss": 0.8645,
2780
+ "mean_token_accuracy": 0.7838839650154114,
2781
+ "num_tokens": 3405277.0,
2782
+ "step": 3080
2783
+ },
2784
+ {
2785
+ "epoch": 0.6226072939754181,
2786
+ "grad_norm": 11.375,
2787
+ "learning_rate": 1.5850627980388206e-05,
2788
+ "loss": 0.8441,
2789
+ "mean_token_accuracy": 0.7875894546508789,
2790
+ "num_tokens": 3417672.0,
2791
+ "step": 3090
2792
+ },
2793
+ {
2794
+ "epoch": 0.6246222043119081,
2795
+ "grad_norm": 11.5625,
2796
+ "learning_rate": 1.5837195244811608e-05,
2797
+ "loss": 0.876,
2798
+ "mean_token_accuracy": 0.7891711950302124,
2799
+ "num_tokens": 3428808.0,
2800
+ "step": 3100
2801
+ },
2802
+ {
2803
+ "epoch": 0.6266371146483981,
2804
+ "grad_norm": 10.6875,
2805
+ "learning_rate": 1.5823762509235007e-05,
2806
+ "loss": 0.8585,
2807
+ "mean_token_accuracy": 0.790976220369339,
2808
+ "num_tokens": 3440047.0,
2809
+ "step": 3110
2810
+ },
2811
+ {
2812
+ "epoch": 0.6286520249848881,
2813
+ "grad_norm": 11.8125,
2814
+ "learning_rate": 1.5810329773658406e-05,
2815
+ "loss": 0.8671,
2816
+ "mean_token_accuracy": 0.7851876437664032,
2817
+ "num_tokens": 3450270.0,
2818
+ "step": 3120
2819
+ },
2820
+ {
2821
+ "epoch": 0.6306669353213782,
2822
+ "grad_norm": 13.125,
2823
+ "learning_rate": 1.5796897038081808e-05,
2824
+ "loss": 0.9273,
2825
+ "mean_token_accuracy": 0.7657091677188873,
2826
+ "num_tokens": 3462054.0,
2827
+ "step": 3130
2828
+ },
2829
+ {
2830
+ "epoch": 0.6326818456578682,
2831
+ "grad_norm": 10.9375,
2832
+ "learning_rate": 1.5783464302505207e-05,
2833
+ "loss": 0.8185,
2834
+ "mean_token_accuracy": 0.7917792797088623,
2835
+ "num_tokens": 3472333.0,
2836
+ "step": 3140
2837
+ },
2838
+ {
2839
+ "epoch": 0.6346967559943583,
2840
+ "grad_norm": 12.0625,
2841
+ "learning_rate": 1.5770031566928606e-05,
2842
+ "loss": 0.9939,
2843
+ "mean_token_accuracy": 0.7620590627193451,
2844
+ "num_tokens": 3484753.0,
2845
+ "step": 3150
2846
+ },
2847
+ {
2848
+ "epoch": 0.6367116663308483,
2849
+ "grad_norm": 14.1875,
2850
+ "learning_rate": 1.5756598831352005e-05,
2851
+ "loss": 0.8723,
2852
+ "mean_token_accuracy": 0.7846651554107666,
2853
+ "num_tokens": 3496220.0,
2854
+ "step": 3160
2855
+ },
2856
+ {
2857
+ "epoch": 0.6387265766673383,
2858
+ "grad_norm": 11.6875,
2859
+ "learning_rate": 1.5743166095775407e-05,
2860
+ "loss": 0.9375,
2861
+ "mean_token_accuracy": 0.7770143210887909,
2862
+ "num_tokens": 3508224.0,
2863
+ "step": 3170
2864
+ },
2865
+ {
2866
+ "epoch": 0.6407414870038284,
2867
+ "grad_norm": 11.0625,
2868
+ "learning_rate": 1.5729733360198806e-05,
2869
+ "loss": 0.8789,
2870
+ "mean_token_accuracy": 0.7903493702411651,
2871
+ "num_tokens": 3519271.0,
2872
+ "step": 3180
2873
+ },
2874
+ {
2875
+ "epoch": 0.6427563973403183,
2876
+ "grad_norm": 16.25,
2877
+ "learning_rate": 1.5716300624622204e-05,
2878
+ "loss": 0.9003,
2879
+ "mean_token_accuracy": 0.7797963619232178,
2880
+ "num_tokens": 3530537.0,
2881
+ "step": 3190
2882
+ },
2883
+ {
2884
+ "epoch": 0.6447713076768083,
2885
+ "grad_norm": 10.75,
2886
+ "learning_rate": 1.5702867889045607e-05,
2887
+ "loss": 0.9229,
2888
+ "mean_token_accuracy": 0.7731367945671082,
2889
+ "num_tokens": 3540961.0,
2890
+ "step": 3200
2891
+ },
2892
+ {
2893
+ "epoch": 0.6467862180132984,
2894
+ "grad_norm": 11.75,
2895
+ "learning_rate": 1.5689435153469006e-05,
2896
+ "loss": 0.9519,
2897
+ "mean_token_accuracy": 0.766649729013443,
2898
+ "num_tokens": 3552392.0,
2899
+ "step": 3210
2900
+ },
2901
+ {
2902
+ "epoch": 0.6488011283497884,
2903
+ "grad_norm": 11.375,
2904
+ "learning_rate": 1.5676002417892404e-05,
2905
+ "loss": 0.8958,
2906
+ "mean_token_accuracy": 0.7798868775367737,
2907
+ "num_tokens": 3563665.0,
2908
+ "step": 3220
2909
+ },
2910
+ {
2911
+ "epoch": 0.6508160386862785,
2912
+ "grad_norm": 10.875,
2913
+ "learning_rate": 1.5662569682315803e-05,
2914
+ "loss": 0.9158,
2915
+ "mean_token_accuracy": 0.7784943222999573,
2916
+ "num_tokens": 3575115.0,
2917
+ "step": 3230
2918
+ },
2919
+ {
2920
+ "epoch": 0.6528309490227685,
2921
+ "grad_norm": 10.1875,
2922
+ "learning_rate": 1.5649136946739205e-05,
2923
+ "loss": 0.8092,
2924
+ "mean_token_accuracy": 0.7988557398319245,
2925
+ "num_tokens": 3585453.0,
2926
+ "step": 3240
2927
+ },
2928
+ {
2929
+ "epoch": 0.6548458593592585,
2930
+ "grad_norm": 12.8125,
2931
+ "learning_rate": 1.5635704211162604e-05,
2932
+ "loss": 0.8562,
2933
+ "mean_token_accuracy": 0.7906098127365112,
2934
+ "num_tokens": 3595472.0,
2935
+ "step": 3250
2936
+ },
2937
+ {
2938
+ "epoch": 0.6568607696957486,
2939
+ "grad_norm": 10.9375,
2940
+ "learning_rate": 1.5622271475586003e-05,
2941
+ "loss": 0.9317,
2942
+ "mean_token_accuracy": 0.776879757642746,
2943
+ "num_tokens": 3607704.0,
2944
+ "step": 3260
2945
+ },
2946
+ {
2947
+ "epoch": 0.6588756800322386,
2948
+ "grad_norm": 9.6875,
2949
+ "learning_rate": 1.5608838740009405e-05,
2950
+ "loss": 0.8642,
2951
+ "mean_token_accuracy": 0.7901065409183502,
2952
+ "num_tokens": 3618233.0,
2953
+ "step": 3270
2954
+ },
2955
+ {
2956
+ "epoch": 0.6608905903687285,
2957
+ "grad_norm": 13.8125,
2958
+ "learning_rate": 1.5595406004432804e-05,
2959
+ "loss": 0.9939,
2960
+ "mean_token_accuracy": 0.7686895251274108,
2961
+ "num_tokens": 3628902.0,
2962
+ "step": 3280
2963
+ },
2964
+ {
2965
+ "epoch": 0.6629055007052186,
2966
+ "grad_norm": 12.25,
2967
+ "learning_rate": 1.5581973268856203e-05,
2968
+ "loss": 0.8935,
2969
+ "mean_token_accuracy": 0.7827515482902527,
2970
+ "num_tokens": 3640225.0,
2971
+ "step": 3290
2972
+ },
2973
+ {
2974
+ "epoch": 0.6649204110417086,
2975
+ "grad_norm": 13.8125,
2976
+ "learning_rate": 1.5568540533279605e-05,
2977
+ "loss": 0.8856,
2978
+ "mean_token_accuracy": 0.7820924818515778,
2979
+ "num_tokens": 3651992.0,
2980
+ "step": 3300
2981
+ },
2982
+ {
2983
+ "epoch": 0.6669353213781987,
2984
+ "grad_norm": 11.0,
2985
+ "learning_rate": 1.5555107797703004e-05,
2986
+ "loss": 0.9789,
2987
+ "mean_token_accuracy": 0.7679969072341919,
2988
+ "num_tokens": 3663369.0,
2989
+ "step": 3310
2990
+ },
2991
+ {
2992
+ "epoch": 0.6689502317146887,
2993
+ "grad_norm": 10.625,
2994
+ "learning_rate": 1.5541675062126403e-05,
2995
+ "loss": 0.9536,
2996
+ "mean_token_accuracy": 0.7675111889839172,
2997
+ "num_tokens": 3674969.0,
2998
+ "step": 3320
2999
+ },
3000
+ {
3001
+ "epoch": 0.6709651420511787,
3002
+ "grad_norm": 10.375,
3003
+ "learning_rate": 1.5528242326549802e-05,
3004
+ "loss": 0.917,
3005
+ "mean_token_accuracy": 0.7766897320747376,
3006
+ "num_tokens": 3685794.0,
3007
+ "step": 3330
3008
+ },
3009
+ {
3010
+ "epoch": 0.6729800523876688,
3011
+ "grad_norm": 12.6875,
3012
+ "learning_rate": 1.5514809590973204e-05,
3013
+ "loss": 0.8213,
3014
+ "mean_token_accuracy": 0.798279982805252,
3015
+ "num_tokens": 3698384.0,
3016
+ "step": 3340
3017
+ },
3018
+ {
3019
+ "epoch": 0.6749949627241588,
3020
+ "grad_norm": 14.125,
3021
+ "learning_rate": 1.5501376855396603e-05,
3022
+ "loss": 0.9941,
3023
+ "mean_token_accuracy": 0.7723333060741424,
3024
+ "num_tokens": 3709110.0,
3025
+ "step": 3350
3026
+ },
3027
+ {
3028
+ "epoch": 0.6770098730606487,
3029
+ "grad_norm": 10.875,
3030
+ "learning_rate": 1.548794411982e-05,
3031
+ "loss": 0.9428,
3032
+ "mean_token_accuracy": 0.7793790519237518,
3033
+ "num_tokens": 3720500.0,
3034
+ "step": 3360
3035
+ },
3036
+ {
3037
+ "epoch": 0.6790247833971388,
3038
+ "grad_norm": 10.5625,
3039
+ "learning_rate": 1.5474511384243404e-05,
3040
+ "loss": 0.9055,
3041
+ "mean_token_accuracy": 0.7757111012935638,
3042
+ "num_tokens": 3733649.0,
3043
+ "step": 3370
3044
+ },
3045
+ {
3046
+ "epoch": 0.6810396937336288,
3047
+ "grad_norm": 11.625,
3048
+ "learning_rate": 1.5461078648666803e-05,
3049
+ "loss": 1.0494,
3050
+ "mean_token_accuracy": 0.748576694726944,
3051
+ "num_tokens": 3744082.0,
3052
+ "step": 3380
3053
+ },
3054
+ {
3055
+ "epoch": 0.6830546040701189,
3056
+ "grad_norm": 11.0,
3057
+ "learning_rate": 1.54476459130902e-05,
3058
+ "loss": 0.9308,
3059
+ "mean_token_accuracy": 0.7800273001194,
3060
+ "num_tokens": 3755970.0,
3061
+ "step": 3390
3062
+ },
3063
+ {
3064
+ "epoch": 0.6850695144066089,
3065
+ "grad_norm": 10.4375,
3066
+ "learning_rate": 1.54342131775136e-05,
3067
+ "loss": 0.8138,
3068
+ "mean_token_accuracy": 0.7963871121406555,
3069
+ "num_tokens": 3766720.0,
3070
+ "step": 3400
3071
+ },
3072
+ {
3073
+ "epoch": 0.6870844247430989,
3074
+ "grad_norm": 11.5625,
3075
+ "learning_rate": 1.5420780441937003e-05,
3076
+ "loss": 0.8504,
3077
+ "mean_token_accuracy": 0.7921045780181885,
3078
+ "num_tokens": 3777338.0,
3079
+ "step": 3410
3080
+ },
3081
+ {
3082
+ "epoch": 0.689099335079589,
3083
+ "grad_norm": 11.625,
3084
+ "learning_rate": 1.54073477063604e-05,
3085
+ "loss": 0.9142,
3086
+ "mean_token_accuracy": 0.7711953699588776,
3087
+ "num_tokens": 3788475.0,
3088
+ "step": 3420
3089
+ },
3090
+ {
3091
+ "epoch": 0.691114245416079,
3092
+ "grad_norm": 11.1875,
3093
+ "learning_rate": 1.53939149707838e-05,
3094
+ "loss": 0.9957,
3095
+ "mean_token_accuracy": 0.7668613314628601,
3096
+ "num_tokens": 3800222.0,
3097
+ "step": 3430
3098
+ },
3099
+ {
3100
+ "epoch": 0.6931291557525691,
3101
+ "grad_norm": 13.125,
3102
+ "learning_rate": 1.5380482235207202e-05,
3103
+ "loss": 0.8789,
3104
+ "mean_token_accuracy": 0.7842870116233825,
3105
+ "num_tokens": 3811363.0,
3106
+ "step": 3440
3107
+ },
3108
+ {
3109
+ "epoch": 0.695144066089059,
3110
+ "grad_norm": 14.4375,
3111
+ "learning_rate": 1.53670494996306e-05,
3112
+ "loss": 0.7952,
3113
+ "mean_token_accuracy": 0.8054643094539642,
3114
+ "num_tokens": 3821569.0,
3115
+ "step": 3450
3116
+ },
3117
+ {
3118
+ "epoch": 0.697158976425549,
3119
+ "grad_norm": 9.6875,
3120
+ "learning_rate": 1.5353616764054e-05,
3121
+ "loss": 0.8705,
3122
+ "mean_token_accuracy": 0.7873030543327332,
3123
+ "num_tokens": 3833270.0,
3124
+ "step": 3460
3125
+ },
3126
+ {
3127
+ "epoch": 0.6991738867620391,
3128
+ "grad_norm": 10.6875,
3129
+ "learning_rate": 1.53401840284774e-05,
3130
+ "loss": 0.9286,
3131
+ "mean_token_accuracy": 0.7691307544708252,
3132
+ "num_tokens": 3844114.0,
3133
+ "step": 3470
3134
+ },
3135
+ {
3136
+ "epoch": 0.7011887970985291,
3137
+ "grad_norm": 11.4375,
3138
+ "learning_rate": 1.53267512929008e-05,
3139
+ "loss": 0.993,
3140
+ "mean_token_accuracy": 0.7620323598384857,
3141
+ "num_tokens": 3856040.0,
3142
+ "step": 3480
3143
+ },
3144
+ {
3145
+ "epoch": 0.7032037074350191,
3146
+ "grad_norm": 11.5,
3147
+ "learning_rate": 1.53133185573242e-05,
3148
+ "loss": 0.8668,
3149
+ "mean_token_accuracy": 0.7913073658943176,
3150
+ "num_tokens": 3867207.0,
3151
+ "step": 3490
3152
+ },
3153
+ {
3154
+ "epoch": 0.7052186177715092,
3155
+ "grad_norm": 9.25,
3156
+ "learning_rate": 1.52998858217476e-05,
3157
+ "loss": 0.8715,
3158
+ "mean_token_accuracy": 0.7891253709793091,
3159
+ "num_tokens": 3879065.0,
3160
+ "step": 3500
3161
  }
3162
  ],
3163
  "logging_steps": 10,
 
3177
  "attributes": {}
3178
  }
3179
  },
3180
+ "total_flos": 4699418269335552.0,
3181
  "train_batch_size": 8,
3182
  "trial_name": null,
3183
  "trial_params": null