8BitStudio commited on
Commit
c8501ce
·
verified ·
1 Parent(s): d3ac247

Training in progress, step 22000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:375e4b9cfa9de09d8057f42e98dbc192a0866e06789a8ec7b0e9091572c996e2
3
  size 1520630616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:299b24fe69c89f19141b9f985a9ac826c3a53ad4e1b08b8aba5729be39c93c43
3
  size 1520630616
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cbebb029cca703c9435c8129fccc4b3f8d45e60881ec4b04f2d6acf25bec8c42
3
  size 3041448587
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2578fa210b28417d8f969fa905bceff91b35a10909b4f603355ac6d743992a10
3
  size 3041448587
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d849fd0ed2b4c55b2499c77003ae5987968969429cd3a8cafdd43ae46b463c3e
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59dbdf3564f71a619277fad1d7b29f944b0a8aee767f1ee531e2a42c249a6709
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e17b5806922786f9c39beaa8475b0a348452b2fd43fba768c2f31b4cb13e074a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5b97fc3e9888373aed6e862ae95add028b1c9773804bea656915decaab6270d
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 5.030437158469946,
6
  "eval_steps": 500,
7
- "global_step": 20000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2808,6 +2808,286 @@
2808
  "learning_rate": 0.0002720358859033514,
2809
  "loss": 1.6249,
2810
  "step": 20000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2811
  }
2812
  ],
2813
  "logging_steps": 50,
@@ -2827,7 +3107,7 @@
2827
  "attributes": {}
2828
  }
2829
  },
2830
- "total_flos": 1.0695590988837028e+19,
2831
  "train_batch_size": 16,
2832
  "trial_name": null,
2833
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 6.014666666666667,
6
  "eval_steps": 500,
7
+ "global_step": 22000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2808
  "learning_rate": 0.0002720358859033514,
2809
  "loss": 1.6249,
2810
  "step": 20000
2811
+ },
2812
+ {
2813
+ "epoch": 5.030983606557377,
2814
+ "grad_norm": 0.59375,
2815
+ "learning_rate": 0.00027188193193538625,
2816
+ "loss": 1.6114,
2817
+ "step": 20050
2818
+ },
2819
+ {
2820
+ "epoch": 5.031530054644809,
2821
+ "grad_norm": 0.671875,
2822
+ "learning_rate": 0.00027172759915556504,
2823
+ "loss": 1.6081,
2824
+ "step": 20100
2825
+ },
2826
+ {
2827
+ "epoch": 5.032076502732241,
2828
+ "grad_norm": 0.62890625,
2829
+ "learning_rate": 0.0002715728880435577,
2830
+ "loss": 1.627,
2831
+ "step": 20150
2832
+ },
2833
+ {
2834
+ "epoch": 5.032622950819672,
2835
+ "grad_norm": 0.62890625,
2836
+ "learning_rate": 0.00027141779908020986,
2837
+ "loss": 1.5912,
2838
+ "step": 20200
2839
+ },
2840
+ {
2841
+ "epoch": 5.033169398907104,
2842
+ "grad_norm": 0.65234375,
2843
+ "learning_rate": 0.00027126233274754163,
2844
+ "loss": 1.6476,
2845
+ "step": 20250
2846
+ },
2847
+ {
2848
+ "epoch": 5.033715846994536,
2849
+ "grad_norm": 0.59375,
2850
+ "learning_rate": 0.00027110648952874595,
2851
+ "loss": 1.6051,
2852
+ "step": 20300
2853
+ },
2854
+ {
2855
+ "epoch": 5.034262295081967,
2856
+ "grad_norm": 0.65234375,
2857
+ "learning_rate": 0.0002709502699081871,
2858
+ "loss": 1.5705,
2859
+ "step": 20350
2860
+ },
2861
+ {
2862
+ "epoch": 5.034808743169399,
2863
+ "grad_norm": 0.578125,
2864
+ "learning_rate": 0.00027079367437139935,
2865
+ "loss": 1.6533,
2866
+ "step": 20400
2867
+ },
2868
+ {
2869
+ "epoch": 5.035355191256831,
2870
+ "grad_norm": 0.59765625,
2871
+ "learning_rate": 0.00027063670340508514,
2872
+ "loss": 1.6099,
2873
+ "step": 20450
2874
+ },
2875
+ {
2876
+ "epoch": 5.035901639344262,
2877
+ "grad_norm": 0.5703125,
2878
+ "learning_rate": 0.00027047935749711395,
2879
+ "loss": 1.6018,
2880
+ "step": 20500
2881
+ },
2882
+ {
2883
+ "epoch": 5.036448087431694,
2884
+ "grad_norm": 0.59375,
2885
+ "learning_rate": 0.0002703216371365204,
2886
+ "loss": 1.637,
2887
+ "step": 20550
2888
+ },
2889
+ {
2890
+ "epoch": 5.036994535519126,
2891
+ "grad_norm": 0.56640625,
2892
+ "learning_rate": 0.00027016354281350315,
2893
+ "loss": 1.6394,
2894
+ "step": 20600
2895
+ },
2896
+ {
2897
+ "epoch": 5.037540983606557,
2898
+ "grad_norm": 0.6015625,
2899
+ "learning_rate": 0.00027000507501942283,
2900
+ "loss": 1.6253,
2901
+ "step": 20650
2902
+ },
2903
+ {
2904
+ "epoch": 6.000459016393442,
2905
+ "grad_norm": 0.57421875,
2906
+ "learning_rate": 0.0002698462342468011,
2907
+ "loss": 1.5897,
2908
+ "step": 20700
2909
+ },
2910
+ {
2911
+ "epoch": 6.001005464480874,
2912
+ "grad_norm": 0.56640625,
2913
+ "learning_rate": 0.0002696870209893187,
2914
+ "loss": 1.569,
2915
+ "step": 20750
2916
+ },
2917
+ {
2918
+ "epoch": 6.001551912568306,
2919
+ "grad_norm": 0.59765625,
2920
+ "learning_rate": 0.00026952743574181414,
2921
+ "loss": 1.5482,
2922
+ "step": 20800
2923
+ },
2924
+ {
2925
+ "epoch": 6.002098360655737,
2926
+ "grad_norm": 0.59765625,
2927
+ "learning_rate": 0.00026936747900028205,
2928
+ "loss": 1.5458,
2929
+ "step": 20850
2930
+ },
2931
+ {
2932
+ "epoch": 6.002644808743169,
2933
+ "grad_norm": 0.734375,
2934
+ "learning_rate": 0.00026920715126187167,
2935
+ "loss": 1.4787,
2936
+ "step": 20900
2937
+ },
2938
+ {
2939
+ "epoch": 6.003191256830601,
2940
+ "grad_norm": 0.7109375,
2941
+ "learning_rate": 0.0002690464530248853,
2942
+ "loss": 1.5565,
2943
+ "step": 20950
2944
+ },
2945
+ {
2946
+ "epoch": 6.0037377049180325,
2947
+ "grad_norm": 0.64453125,
2948
+ "learning_rate": 0.00026888538478877675,
2949
+ "loss": 1.5588,
2950
+ "step": 21000
2951
+ },
2952
+ {
2953
+ "epoch": 6.0042841530054645,
2954
+ "grad_norm": 0.6953125,
2955
+ "learning_rate": 0.0002687239470541498,
2956
+ "loss": 1.5347,
2957
+ "step": 21050
2958
+ },
2959
+ {
2960
+ "epoch": 6.0048306010928965,
2961
+ "grad_norm": 0.58984375,
2962
+ "learning_rate": 0.00026856214032275675,
2963
+ "loss": 1.5341,
2964
+ "step": 21100
2965
+ },
2966
+ {
2967
+ "epoch": 6.0053770491803276,
2968
+ "grad_norm": 0.73046875,
2969
+ "learning_rate": 0.00026839996509749655,
2970
+ "loss": 1.5441,
2971
+ "step": 21150
2972
+ },
2973
+ {
2974
+ "epoch": 6.0059234972677595,
2975
+ "grad_norm": 0.58984375,
2976
+ "learning_rate": 0.00026823742188241366,
2977
+ "loss": 1.5405,
2978
+ "step": 21200
2979
+ },
2980
+ {
2981
+ "epoch": 6.0064699453551915,
2982
+ "grad_norm": 0.64453125,
2983
+ "learning_rate": 0.000268074511182696,
2984
+ "loss": 1.5327,
2985
+ "step": 21250
2986
+ },
2987
+ {
2988
+ "epoch": 6.007016393442623,
2989
+ "grad_norm": 0.61328125,
2990
+ "learning_rate": 0.00026791123350467384,
2991
+ "loss": 1.5338,
2992
+ "step": 21300
2993
+ },
2994
+ {
2995
+ "epoch": 6.007562841530055,
2996
+ "grad_norm": 0.58984375,
2997
+ "learning_rate": 0.000267747589355818,
2998
+ "loss": 1.5663,
2999
+ "step": 21350
3000
+ },
3001
+ {
3002
+ "epoch": 6.008109289617487,
3003
+ "grad_norm": 0.67578125,
3004
+ "learning_rate": 0.0002675835792447382,
3005
+ "loss": 1.5519,
3006
+ "step": 21400
3007
+ },
3008
+ {
3009
+ "epoch": 6.008655737704918,
3010
+ "grad_norm": 0.66015625,
3011
+ "learning_rate": 0.0002674192036811818,
3012
+ "loss": 1.5256,
3013
+ "step": 21450
3014
+ },
3015
+ {
3016
+ "epoch": 6.00920218579235,
3017
+ "grad_norm": 0.578125,
3018
+ "learning_rate": 0.0002672544631760317,
3019
+ "loss": 1.5428,
3020
+ "step": 21500
3021
+ },
3022
+ {
3023
+ "epoch": 6.009748633879782,
3024
+ "grad_norm": 0.75,
3025
+ "learning_rate": 0.00026708935824130514,
3026
+ "loss": 1.5806,
3027
+ "step": 21550
3028
+ },
3029
+ {
3030
+ "epoch": 6.010295081967213,
3031
+ "grad_norm": 0.57421875,
3032
+ "learning_rate": 0.00026692388939015226,
3033
+ "loss": 1.5628,
3034
+ "step": 21600
3035
+ },
3036
+ {
3037
+ "epoch": 6.010841530054645,
3038
+ "grad_norm": 0.61328125,
3039
+ "learning_rate": 0.00026675805713685387,
3040
+ "loss": 1.5275,
3041
+ "step": 21650
3042
+ },
3043
+ {
3044
+ "epoch": 6.011387978142077,
3045
+ "grad_norm": 0.61328125,
3046
+ "learning_rate": 0.0002665918619968206,
3047
+ "loss": 1.5547,
3048
+ "step": 21700
3049
+ },
3050
+ {
3051
+ "epoch": 6.011934426229508,
3052
+ "grad_norm": 0.57421875,
3053
+ "learning_rate": 0.0002664253044865907,
3054
+ "loss": 1.575,
3055
+ "step": 21750
3056
+ },
3057
+ {
3058
+ "epoch": 6.01248087431694,
3059
+ "grad_norm": 0.59765625,
3060
+ "learning_rate": 0.0002662583851238287,
3061
+ "loss": 1.5386,
3062
+ "step": 21800
3063
+ },
3064
+ {
3065
+ "epoch": 6.013027322404372,
3066
+ "grad_norm": 0.6484375,
3067
+ "learning_rate": 0.000266091104427324,
3068
+ "loss": 1.5107,
3069
+ "step": 21850
3070
+ },
3071
+ {
3072
+ "epoch": 6.013573770491803,
3073
+ "grad_norm": 0.625,
3074
+ "learning_rate": 0.00026592346291698864,
3075
+ "loss": 1.5516,
3076
+ "step": 21900
3077
+ },
3078
+ {
3079
+ "epoch": 6.014120218579235,
3080
+ "grad_norm": 0.5703125,
3081
+ "learning_rate": 0.00026575546111385647,
3082
+ "loss": 1.5431,
3083
+ "step": 21950
3084
+ },
3085
+ {
3086
+ "epoch": 6.014666666666667,
3087
+ "grad_norm": 0.6015625,
3088
+ "learning_rate": 0.00026558709954008095,
3089
+ "loss": 1.566,
3090
+ "step": 22000
3091
  }
3092
  ],
3093
  "logging_steps": 50,
 
3107
  "attributes": {}
3108
  }
3109
  },
3110
+ "total_flos": 1.1765225285807505e+19,
3111
  "train_batch_size": 16,
3112
  "trial_name": null,
3113
  "trial_params": null