Wilsonwin commited on
Commit
a82ebe9
·
verified ·
1 Parent(s): 3a830f9

Training in progress, step 4500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd5257ed25b3deedcdfbd77b311ce64f39ce97cab4262552a2cce890d0e1ed2f
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6b52d2c4e6f1dc1fc53e1df4ec08ffe7a50c1b6037cc45122a1b5264d5c4b91
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38c6d9ddeda93bf2814232d10b3b4a6111c3ba43c271d5af0fe9ac07ad7bdf8f
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13a58a7f728d5913709f013bfd6cbcb991064242e3075f2b5e93d9b5b184b9f7
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8647979d889bb2b15d0a3e8961a7e547be28d07767d240f858bd959476bb870c
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf45e6f5a33d99139eae20e5be76bd3bf9589da43c06744e1ac55dde6dda87db
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f34721a2fd924d02bdad3691f09e25bcb5ed140f7982be7b710c4ccbd2538c0
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:839b4043be0c777e952526844484b5d7c9eb08d95c6a855198a76f2eb1f08d84
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6757898293630681,
6
  "eval_steps": 500,
7
- "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2879,6 +2879,364 @@
2879
  "eval_samples_per_second": 276.145,
2880
  "eval_steps_per_second": 5.799,
2881
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2882
  }
2883
  ],
2884
  "logging_steps": 10,
@@ -2898,7 +3256,7 @@
2898
  "attributes": {}
2899
  }
2900
  },
2901
- "total_flos": 1.33782728343552e+17,
2902
  "train_batch_size": 48,
2903
  "trial_name": null,
2904
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.7602635580334516,
6
  "eval_steps": 500,
7
+ "global_step": 4500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2879
  "eval_samples_per_second": 276.145,
2880
  "eval_steps_per_second": 5.799,
2881
  "step": 4000
2882
+ },
2883
+ {
2884
+ "epoch": 0.6774793039364757,
2885
+ "grad_norm": 0.5737301111221313,
2886
+ "learning_rate": 0.00027017636818368575,
2887
+ "loss": 4.737479400634766,
2888
+ "step": 4010
2889
+ },
2890
+ {
2891
+ "epoch": 0.6791687785098834,
2892
+ "grad_norm": 0.5806599855422974,
2893
+ "learning_rate": 0.0002698891091982504,
2894
+ "loss": 4.715182876586914,
2895
+ "step": 4020
2896
+ },
2897
+ {
2898
+ "epoch": 0.6808582530832911,
2899
+ "grad_norm": 0.5259511470794678,
2900
+ "learning_rate": 0.00026960062766607135,
2901
+ "loss": 4.735322189331055,
2902
+ "step": 4030
2903
+ },
2904
+ {
2905
+ "epoch": 0.6825477276566988,
2906
+ "grad_norm": 0.5434650182723999,
2907
+ "learning_rate": 0.0002693109265288851,
2908
+ "loss": 4.725672912597656,
2909
+ "step": 4040
2910
+ },
2911
+ {
2912
+ "epoch": 0.6842372022301064,
2913
+ "grad_norm": 0.5198240876197815,
2914
+ "learning_rate": 0.0002690200087408648,
2915
+ "loss": 4.725751113891602,
2916
+ "step": 4050
2917
+ },
2918
+ {
2919
+ "epoch": 0.6859266768035142,
2920
+ "grad_norm": 0.5575292110443115,
2921
+ "learning_rate": 0.00026872787726859004,
2922
+ "loss": 4.715484619140625,
2923
+ "step": 4060
2924
+ },
2925
+ {
2926
+ "epoch": 0.6876161513769218,
2927
+ "grad_norm": 0.561512291431427,
2928
+ "learning_rate": 0.0002684345350910169,
2929
+ "loss": 4.725441360473633,
2930
+ "step": 4070
2931
+ },
2932
+ {
2933
+ "epoch": 0.6893056259503294,
2934
+ "grad_norm": 0.5424394011497498,
2935
+ "learning_rate": 0.0002681399851994472,
2936
+ "loss": 4.7274932861328125,
2937
+ "step": 4080
2938
+ },
2939
+ {
2940
+ "epoch": 0.6909951005237371,
2941
+ "grad_norm": 0.5293362140655518,
2942
+ "learning_rate": 0.00026784423059749845,
2943
+ "loss": 4.7252765655517575,
2944
+ "step": 4090
2945
+ },
2946
+ {
2947
+ "epoch": 0.6926845750971448,
2948
+ "grad_norm": 0.5352376103401184,
2949
+ "learning_rate": 0.0002675472743010727,
2950
+ "loss": 4.697403335571289,
2951
+ "step": 4100
2952
+ },
2953
+ {
2954
+ "epoch": 0.6943740496705525,
2955
+ "grad_norm": 0.5501886010169983,
2956
+ "learning_rate": 0.0002672491193383263,
2957
+ "loss": 4.723195648193359,
2958
+ "step": 4110
2959
+ },
2960
+ {
2961
+ "epoch": 0.6960635242439601,
2962
+ "grad_norm": 0.5308210253715515,
2963
+ "learning_rate": 0.00026694976874963854,
2964
+ "loss": 4.736632919311523,
2965
+ "step": 4120
2966
+ },
2967
+ {
2968
+ "epoch": 0.6977529988173679,
2969
+ "grad_norm": 0.533686101436615,
2970
+ "learning_rate": 0.00026664922558758105,
2971
+ "loss": 4.699850463867188,
2972
+ "step": 4130
2973
+ },
2974
+ {
2975
+ "epoch": 0.6994424733907755,
2976
+ "grad_norm": 0.52994704246521,
2977
+ "learning_rate": 0.00026634749291688646,
2978
+ "loss": 4.7275341033935545,
2979
+ "step": 4140
2980
+ },
2981
+ {
2982
+ "epoch": 0.7011319479641831,
2983
+ "grad_norm": 0.5824037790298462,
2984
+ "learning_rate": 0.00026604457381441715,
2985
+ "loss": 4.705679702758789,
2986
+ "step": 4150
2987
+ },
2988
+ {
2989
+ "epoch": 0.7028214225375908,
2990
+ "grad_norm": 0.6155262589454651,
2991
+ "learning_rate": 0.00026574047136913403,
2992
+ "loss": 4.699795150756836,
2993
+ "step": 4160
2994
+ },
2995
+ {
2996
+ "epoch": 0.7045108971109985,
2997
+ "grad_norm": 0.5350865721702576,
2998
+ "learning_rate": 0.0002654351886820648,
2999
+ "loss": 4.712226867675781,
3000
+ "step": 4170
3001
+ },
3002
+ {
3003
+ "epoch": 0.7062003716844062,
3004
+ "grad_norm": 0.5593312382698059,
3005
+ "learning_rate": 0.0002651287288662724,
3006
+ "loss": 4.721994018554687,
3007
+ "step": 4180
3008
+ },
3009
+ {
3010
+ "epoch": 0.7078898462578138,
3011
+ "grad_norm": 0.5269652605056763,
3012
+ "learning_rate": 0.0002648210950468236,
3013
+ "loss": 4.703836822509766,
3014
+ "step": 4190
3015
+ },
3016
+ {
3017
+ "epoch": 0.7095793208312214,
3018
+ "grad_norm": 0.5680537223815918,
3019
+ "learning_rate": 0.0002645122903607566,
3020
+ "loss": 4.695099258422852,
3021
+ "step": 4200
3022
+ },
3023
+ {
3024
+ "epoch": 0.7112687954046292,
3025
+ "grad_norm": 0.5447277426719666,
3026
+ "learning_rate": 0.0002642023179570493,
3027
+ "loss": 4.695394515991211,
3028
+ "step": 4210
3029
+ },
3030
+ {
3031
+ "epoch": 0.7129582699780368,
3032
+ "grad_norm": 0.5375188589096069,
3033
+ "learning_rate": 0.0002638911809965874,
3034
+ "loss": 4.705070495605469,
3035
+ "step": 4220
3036
+ },
3037
+ {
3038
+ "epoch": 0.7146477445514445,
3039
+ "grad_norm": 0.5439088940620422,
3040
+ "learning_rate": 0.0002635788826521316,
3041
+ "loss": 4.692306900024414,
3042
+ "step": 4230
3043
+ },
3044
+ {
3045
+ "epoch": 0.7163372191248522,
3046
+ "grad_norm": 0.5620496869087219,
3047
+ "learning_rate": 0.00026326542610828597,
3048
+ "loss": 4.7032218933105465,
3049
+ "step": 4240
3050
+ },
3051
+ {
3052
+ "epoch": 0.7180266936982599,
3053
+ "grad_norm": 0.527233362197876,
3054
+ "learning_rate": 0.00026295081456146485,
3055
+ "loss": 4.714799880981445,
3056
+ "step": 4250
3057
+ },
3058
+ {
3059
+ "epoch": 0.7197161682716675,
3060
+ "grad_norm": 0.5378382205963135,
3061
+ "learning_rate": 0.0002626350512198606,
3062
+ "loss": 4.6938121795654295,
3063
+ "step": 4260
3064
+ },
3065
+ {
3066
+ "epoch": 0.7214056428450751,
3067
+ "grad_norm": 0.5405885577201843,
3068
+ "learning_rate": 0.0002623181393034108,
3069
+ "loss": 4.696908950805664,
3070
+ "step": 4270
3071
+ },
3072
+ {
3073
+ "epoch": 0.7230951174184829,
3074
+ "grad_norm": 0.5289508700370789,
3075
+ "learning_rate": 0.00026200008204376525,
3076
+ "loss": 4.715534210205078,
3077
+ "step": 4280
3078
+ },
3079
+ {
3080
+ "epoch": 0.7247845919918905,
3081
+ "grad_norm": 0.79053795337677,
3082
+ "learning_rate": 0.00026168088268425346,
3083
+ "loss": 4.691967391967774,
3084
+ "step": 4290
3085
+ },
3086
+ {
3087
+ "epoch": 0.7264740665652982,
3088
+ "grad_norm": 0.5415652394294739,
3089
+ "learning_rate": 0.00026136054447985105,
3090
+ "loss": 4.698383331298828,
3091
+ "step": 4300
3092
+ },
3093
+ {
3094
+ "epoch": 0.7281635411387058,
3095
+ "grad_norm": 0.5491306781768799,
3096
+ "learning_rate": 0.00026103907069714694,
3097
+ "loss": 4.708710479736328,
3098
+ "step": 4310
3099
+ },
3100
+ {
3101
+ "epoch": 0.7298530157121136,
3102
+ "grad_norm": 0.5362562537193298,
3103
+ "learning_rate": 0.0002607164646143098,
3104
+ "loss": 4.68592643737793,
3105
+ "step": 4320
3106
+ },
3107
+ {
3108
+ "epoch": 0.7315424902855212,
3109
+ "grad_norm": 0.5329167246818542,
3110
+ "learning_rate": 0.0002603927295210547,
3111
+ "loss": 4.681344223022461,
3112
+ "step": 4330
3113
+ },
3114
+ {
3115
+ "epoch": 0.7332319648589288,
3116
+ "grad_norm": 0.5879621505737305,
3117
+ "learning_rate": 0.00026006786871860975,
3118
+ "loss": 4.659723281860352,
3119
+ "step": 4340
3120
+ },
3121
+ {
3122
+ "epoch": 0.7349214394323366,
3123
+ "grad_norm": 0.5552240014076233,
3124
+ "learning_rate": 0.00025974188551968207,
3125
+ "loss": 4.70800552368164,
3126
+ "step": 4350
3127
+ },
3128
+ {
3129
+ "epoch": 0.7366109140057442,
3130
+ "grad_norm": 0.5462090373039246,
3131
+ "learning_rate": 0.0002594147832484243,
3132
+ "loss": 4.6786457061767575,
3133
+ "step": 4360
3134
+ },
3135
+ {
3136
+ "epoch": 0.7383003885791519,
3137
+ "grad_norm": 0.515416145324707,
3138
+ "learning_rate": 0.0002590865652404007,
3139
+ "loss": 4.681232452392578,
3140
+ "step": 4370
3141
+ },
3142
+ {
3143
+ "epoch": 0.7399898631525595,
3144
+ "grad_norm": 0.5645248293876648,
3145
+ "learning_rate": 0.0002587572348425529,
3146
+ "loss": 4.682769775390625,
3147
+ "step": 4380
3148
+ },
3149
+ {
3150
+ "epoch": 0.7416793377259672,
3151
+ "grad_norm": 0.5235434174537659,
3152
+ "learning_rate": 0.0002584267954131659,
3153
+ "loss": 4.673912811279297,
3154
+ "step": 4390
3155
+ },
3156
+ {
3157
+ "epoch": 0.7433688122993749,
3158
+ "grad_norm": 0.5084188580513,
3159
+ "learning_rate": 0.000258095250321834,
3160
+ "loss": 4.675137329101562,
3161
+ "step": 4400
3162
+ },
3163
+ {
3164
+ "epoch": 0.7450582868727825,
3165
+ "grad_norm": 0.5971478819847107,
3166
+ "learning_rate": 0.00025776260294942615,
3167
+ "loss": 4.688092422485352,
3168
+ "step": 4410
3169
+ },
3170
+ {
3171
+ "epoch": 0.7467477614461903,
3172
+ "grad_norm": 0.5779770016670227,
3173
+ "learning_rate": 0.0002574288566880517,
3174
+ "loss": 4.664862823486328,
3175
+ "step": 4420
3176
+ },
3177
+ {
3178
+ "epoch": 0.7484372360195979,
3179
+ "grad_norm": 0.5589803457260132,
3180
+ "learning_rate": 0.0002570940149410256,
3181
+ "loss": 4.665248870849609,
3182
+ "step": 4430
3183
+ },
3184
+ {
3185
+ "epoch": 0.7501267105930056,
3186
+ "grad_norm": 0.4953916072845459,
3187
+ "learning_rate": 0.00025675808112283387,
3188
+ "loss": 4.670894622802734,
3189
+ "step": 4440
3190
+ },
3191
+ {
3192
+ "epoch": 0.7518161851664132,
3193
+ "grad_norm": 0.5200746059417725,
3194
+ "learning_rate": 0.00025642105865909874,
3195
+ "loss": 4.664446258544922,
3196
+ "step": 4450
3197
+ },
3198
+ {
3199
+ "epoch": 0.7535056597398209,
3200
+ "grad_norm": 0.7123140692710876,
3201
+ "learning_rate": 0.0002560829509865437,
3202
+ "loss": 4.660491943359375,
3203
+ "step": 4460
3204
+ },
3205
+ {
3206
+ "epoch": 0.7551951343132286,
3207
+ "grad_norm": 0.5178130865097046,
3208
+ "learning_rate": 0.00025574376155295845,
3209
+ "loss": 4.669913101196289,
3210
+ "step": 4470
3211
+ },
3212
+ {
3213
+ "epoch": 0.7568846088866362,
3214
+ "grad_norm": 0.5300018191337585,
3215
+ "learning_rate": 0.00025540349381716367,
3216
+ "loss": 4.688555145263672,
3217
+ "step": 4480
3218
+ },
3219
+ {
3220
+ "epoch": 0.758574083460044,
3221
+ "grad_norm": 0.6072678565979004,
3222
+ "learning_rate": 0.00025506215124897593,
3223
+ "loss": 4.667338562011719,
3224
+ "step": 4490
3225
+ },
3226
+ {
3227
+ "epoch": 0.7602635580334516,
3228
+ "grad_norm": 0.5844916701316833,
3229
+ "learning_rate": 0.0002547197373291721,
3230
+ "loss": 4.678690719604492,
3231
+ "step": 4500
3232
+ },
3233
+ {
3234
+ "epoch": 0.7602635580334516,
3235
+ "eval_loss": 4.629103660583496,
3236
+ "eval_runtime": 3.5634,
3237
+ "eval_samples_per_second": 280.631,
3238
+ "eval_steps_per_second": 5.893,
3239
+ "step": 4500
3240
  }
3241
  ],
3242
  "logging_steps": 10,
 
3256
  "attributes": {}
3257
  }
3258
  },
3259
+ "total_flos": 1.50505569386496e+17,
3260
  "train_batch_size": 48,
3261
  "trial_name": null,
3262
  "trial_params": null