Wilsonwin commited on
Commit
11d9639
·
verified ·
1 Parent(s): 5a3c27c

Training in progress, step 4500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a45987252e54dc35108e11e93cd15c2f7eff117407dadcc866c536c9fe38d549
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71e25cdc8697039d1202fb4440876be16955562540fb206d7cbbcfc37a7f33da
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1032541ab3e6eca2a68a25836b91b06b42d61052c34cec2e6dfe0544f185dcf0
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a082fad0106d612afcdf0e9dbf262fd1aa3ca7c9a2ef45f2a14751b1d80d165
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:329a377c90ca49d3bcb8c01bcb7bdf9bc769af05915d36720b3201a9c222f867
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3343121e0ab3aeb674ab29d872307564462c4bd82cdd92e6577a4ff26999fc00
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f34721a2fd924d02bdad3691f09e25bcb5ed140f7982be7b710c4ccbd2538c0
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:839b4043be0c777e952526844484b5d7c9eb08d95c6a855198a76f2eb1f08d84
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6757898293630681,
6
  "eval_steps": 500,
7
- "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -2879,6 +2879,364 @@
2879
  "eval_samples_per_second": 263.356,
2880
  "eval_steps_per_second": 5.53,
2881
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2882
  }
2883
  ],
2884
  "logging_steps": 10,
@@ -2898,7 +3256,7 @@
2898
  "attributes": {}
2899
  }
2900
  },
2901
- "total_flos": 1.33782728343552e+17,
2902
  "train_batch_size": 48,
2903
  "trial_name": null,
2904
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.7602635580334516,
6
  "eval_steps": 500,
7
+ "global_step": 4500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
2879
  "eval_samples_per_second": 263.356,
2880
  "eval_steps_per_second": 5.53,
2881
  "step": 4000
2882
+ },
2883
+ {
2884
+ "epoch": 0.6774793039364757,
2885
+ "grad_norm": 0.5953722596168518,
2886
+ "learning_rate": 0.00027017636818368575,
2887
+ "loss": 4.737245559692383,
2888
+ "step": 4010
2889
+ },
2890
+ {
2891
+ "epoch": 0.6791687785098834,
2892
+ "grad_norm": 0.6203189492225647,
2893
+ "learning_rate": 0.0002698891091982504,
2894
+ "loss": 4.716173934936523,
2895
+ "step": 4020
2896
+ },
2897
+ {
2898
+ "epoch": 0.6808582530832911,
2899
+ "grad_norm": 0.5239487886428833,
2900
+ "learning_rate": 0.00026960062766607135,
2901
+ "loss": 4.735467529296875,
2902
+ "step": 4030
2903
+ },
2904
+ {
2905
+ "epoch": 0.6825477276566988,
2906
+ "grad_norm": 0.5474298000335693,
2907
+ "learning_rate": 0.0002693109265288851,
2908
+ "loss": 4.725514984130859,
2909
+ "step": 4040
2910
+ },
2911
+ {
2912
+ "epoch": 0.6842372022301064,
2913
+ "grad_norm": 0.5452102422714233,
2914
+ "learning_rate": 0.0002690200087408648,
2915
+ "loss": 4.726776885986328,
2916
+ "step": 4050
2917
+ },
2918
+ {
2919
+ "epoch": 0.6859266768035142,
2920
+ "grad_norm": 0.6271504759788513,
2921
+ "learning_rate": 0.00026872787726859004,
2922
+ "loss": 4.71842041015625,
2923
+ "step": 4060
2924
+ },
2925
+ {
2926
+ "epoch": 0.6876161513769218,
2927
+ "grad_norm": 0.5585569143295288,
2928
+ "learning_rate": 0.0002684345350910169,
2929
+ "loss": 4.728883361816406,
2930
+ "step": 4070
2931
+ },
2932
+ {
2933
+ "epoch": 0.6893056259503294,
2934
+ "grad_norm": 0.544662594795227,
2935
+ "learning_rate": 0.0002681399851994472,
2936
+ "loss": 4.729270553588867,
2937
+ "step": 4080
2938
+ },
2939
+ {
2940
+ "epoch": 0.6909951005237371,
2941
+ "grad_norm": 0.5363122820854187,
2942
+ "learning_rate": 0.00026784423059749845,
2943
+ "loss": 4.726214599609375,
2944
+ "step": 4090
2945
+ },
2946
+ {
2947
+ "epoch": 0.6926845750971448,
2948
+ "grad_norm": 0.5298801064491272,
2949
+ "learning_rate": 0.0002675472743010727,
2950
+ "loss": 4.697872924804687,
2951
+ "step": 4100
2952
+ },
2953
+ {
2954
+ "epoch": 0.6943740496705525,
2955
+ "grad_norm": 0.5710757374763489,
2956
+ "learning_rate": 0.0002672491193383263,
2957
+ "loss": 4.723146438598633,
2958
+ "step": 4110
2959
+ },
2960
+ {
2961
+ "epoch": 0.6960635242439601,
2962
+ "grad_norm": 0.5484883785247803,
2963
+ "learning_rate": 0.00026694976874963854,
2964
+ "loss": 4.738557052612305,
2965
+ "step": 4120
2966
+ },
2967
+ {
2968
+ "epoch": 0.6977529988173679,
2969
+ "grad_norm": 0.5273333191871643,
2970
+ "learning_rate": 0.00026664922558758105,
2971
+ "loss": 4.700592803955078,
2972
+ "step": 4130
2973
+ },
2974
+ {
2975
+ "epoch": 0.6994424733907755,
2976
+ "grad_norm": 0.5574657320976257,
2977
+ "learning_rate": 0.00026634749291688646,
2978
+ "loss": 4.729513168334961,
2979
+ "step": 4140
2980
+ },
2981
+ {
2982
+ "epoch": 0.7011319479641831,
2983
+ "grad_norm": 0.5571582317352295,
2984
+ "learning_rate": 0.00026604457381441715,
2985
+ "loss": 4.706221389770508,
2986
+ "step": 4150
2987
+ },
2988
+ {
2989
+ "epoch": 0.7028214225375908,
2990
+ "grad_norm": 0.6286988258361816,
2991
+ "learning_rate": 0.00026574047136913403,
2992
+ "loss": 4.701080322265625,
2993
+ "step": 4160
2994
+ },
2995
+ {
2996
+ "epoch": 0.7045108971109985,
2997
+ "grad_norm": 0.5314433574676514,
2998
+ "learning_rate": 0.0002654351886820648,
2999
+ "loss": 4.714921188354492,
3000
+ "step": 4170
3001
+ },
3002
+ {
3003
+ "epoch": 0.7062003716844062,
3004
+ "grad_norm": 0.539644718170166,
3005
+ "learning_rate": 0.0002651287288662724,
3006
+ "loss": 4.722955703735352,
3007
+ "step": 4180
3008
+ },
3009
+ {
3010
+ "epoch": 0.7078898462578138,
3011
+ "grad_norm": 0.5164220333099365,
3012
+ "learning_rate": 0.0002648210950468236,
3013
+ "loss": 4.7029579162597654,
3014
+ "step": 4190
3015
+ },
3016
+ {
3017
+ "epoch": 0.7095793208312214,
3018
+ "grad_norm": 0.5345500111579895,
3019
+ "learning_rate": 0.0002645122903607566,
3020
+ "loss": 4.696025085449219,
3021
+ "step": 4200
3022
+ },
3023
+ {
3024
+ "epoch": 0.7112687954046292,
3025
+ "grad_norm": 0.5561880469322205,
3026
+ "learning_rate": 0.0002642023179570493,
3027
+ "loss": 4.696010971069336,
3028
+ "step": 4210
3029
+ },
3030
+ {
3031
+ "epoch": 0.7129582699780368,
3032
+ "grad_norm": 0.5260653495788574,
3033
+ "learning_rate": 0.0002638911809965874,
3034
+ "loss": 4.705658721923828,
3035
+ "step": 4220
3036
+ },
3037
+ {
3038
+ "epoch": 0.7146477445514445,
3039
+ "grad_norm": 0.517846941947937,
3040
+ "learning_rate": 0.0002635788826521316,
3041
+ "loss": 4.690948104858398,
3042
+ "step": 4230
3043
+ },
3044
+ {
3045
+ "epoch": 0.7163372191248522,
3046
+ "grad_norm": 0.5815365314483643,
3047
+ "learning_rate": 0.00026326542610828597,
3048
+ "loss": 4.702710723876953,
3049
+ "step": 4240
3050
+ },
3051
+ {
3052
+ "epoch": 0.7180266936982599,
3053
+ "grad_norm": 0.5511707067489624,
3054
+ "learning_rate": 0.00026295081456146485,
3055
+ "loss": 4.713930130004883,
3056
+ "step": 4250
3057
+ },
3058
+ {
3059
+ "epoch": 0.7197161682716675,
3060
+ "grad_norm": 0.5390937924385071,
3061
+ "learning_rate": 0.0002626350512198606,
3062
+ "loss": 4.694212341308594,
3063
+ "step": 4260
3064
+ },
3065
+ {
3066
+ "epoch": 0.7214056428450751,
3067
+ "grad_norm": 0.5410081744194031,
3068
+ "learning_rate": 0.0002623181393034108,
3069
+ "loss": 4.696395492553711,
3070
+ "step": 4270
3071
+ },
3072
+ {
3073
+ "epoch": 0.7230951174184829,
3074
+ "grad_norm": 0.5272055268287659,
3075
+ "learning_rate": 0.00026200008204376525,
3076
+ "loss": 4.715652847290039,
3077
+ "step": 4280
3078
+ },
3079
+ {
3080
+ "epoch": 0.7247845919918905,
3081
+ "grad_norm": 0.5485383868217468,
3082
+ "learning_rate": 0.00026168088268425346,
3083
+ "loss": 4.689223861694336,
3084
+ "step": 4290
3085
+ },
3086
+ {
3087
+ "epoch": 0.7264740665652982,
3088
+ "grad_norm": 0.4974030554294586,
3089
+ "learning_rate": 0.00026136054447985105,
3090
+ "loss": 4.6958671569824215,
3091
+ "step": 4300
3092
+ },
3093
+ {
3094
+ "epoch": 0.7281635411387058,
3095
+ "grad_norm": 0.5421763062477112,
3096
+ "learning_rate": 0.00026103907069714694,
3097
+ "loss": 4.706072235107422,
3098
+ "step": 4310
3099
+ },
3100
+ {
3101
+ "epoch": 0.7298530157121136,
3102
+ "grad_norm": 0.5402170419692993,
3103
+ "learning_rate": 0.0002607164646143098,
3104
+ "loss": 4.684348297119141,
3105
+ "step": 4320
3106
+ },
3107
+ {
3108
+ "epoch": 0.7315424902855212,
3109
+ "grad_norm": 0.5388095378875732,
3110
+ "learning_rate": 0.0002603927295210547,
3111
+ "loss": 4.681607818603515,
3112
+ "step": 4330
3113
+ },
3114
+ {
3115
+ "epoch": 0.7332319648589288,
3116
+ "grad_norm": 0.5691295266151428,
3117
+ "learning_rate": 0.00026006786871860975,
3118
+ "loss": 4.659119033813477,
3119
+ "step": 4340
3120
+ },
3121
+ {
3122
+ "epoch": 0.7349214394323366,
3123
+ "grad_norm": 0.5657386183738708,
3124
+ "learning_rate": 0.00025974188551968207,
3125
+ "loss": 4.707662963867188,
3126
+ "step": 4350
3127
+ },
3128
+ {
3129
+ "epoch": 0.7366109140057442,
3130
+ "grad_norm": 0.5887618660926819,
3131
+ "learning_rate": 0.0002594147832484243,
3132
+ "loss": 4.678396606445313,
3133
+ "step": 4360
3134
+ },
3135
+ {
3136
+ "epoch": 0.7383003885791519,
3137
+ "grad_norm": 0.5618587136268616,
3138
+ "learning_rate": 0.0002590865652404007,
3139
+ "loss": 4.6809638977050785,
3140
+ "step": 4370
3141
+ },
3142
+ {
3143
+ "epoch": 0.7399898631525595,
3144
+ "grad_norm": 0.5673303604125977,
3145
+ "learning_rate": 0.0002587572348425529,
3146
+ "loss": 4.683576583862305,
3147
+ "step": 4380
3148
+ },
3149
+ {
3150
+ "epoch": 0.7416793377259672,
3151
+ "grad_norm": 0.5109097361564636,
3152
+ "learning_rate": 0.0002584267954131659,
3153
+ "loss": 4.674320983886719,
3154
+ "step": 4390
3155
+ },
3156
+ {
3157
+ "epoch": 0.7433688122993749,
3158
+ "grad_norm": 0.5133926272392273,
3159
+ "learning_rate": 0.000258095250321834,
3160
+ "loss": 4.676524353027344,
3161
+ "step": 4400
3162
+ },
3163
+ {
3164
+ "epoch": 0.7450582868727825,
3165
+ "grad_norm": 0.5628970265388489,
3166
+ "learning_rate": 0.00025776260294942615,
3167
+ "loss": 4.688607025146484,
3168
+ "step": 4410
3169
+ },
3170
+ {
3171
+ "epoch": 0.7467477614461903,
3172
+ "grad_norm": 0.5761396884918213,
3173
+ "learning_rate": 0.0002574288566880517,
3174
+ "loss": 4.666116333007812,
3175
+ "step": 4420
3176
+ },
3177
+ {
3178
+ "epoch": 0.7484372360195979,
3179
+ "grad_norm": 0.5518139004707336,
3180
+ "learning_rate": 0.0002570940149410256,
3181
+ "loss": 4.665610504150391,
3182
+ "step": 4430
3183
+ },
3184
+ {
3185
+ "epoch": 0.7501267105930056,
3186
+ "grad_norm": 0.5176488757133484,
3187
+ "learning_rate": 0.00025675808112283387,
3188
+ "loss": 4.673014831542969,
3189
+ "step": 4440
3190
+ },
3191
+ {
3192
+ "epoch": 0.7518161851664132,
3193
+ "grad_norm": 0.5482094287872314,
3194
+ "learning_rate": 0.00025642105865909874,
3195
+ "loss": 4.665557098388672,
3196
+ "step": 4450
3197
+ },
3198
+ {
3199
+ "epoch": 0.7535056597398209,
3200
+ "grad_norm": 0.7407347559928894,
3201
+ "learning_rate": 0.0002560829509865437,
3202
+ "loss": 4.660655975341797,
3203
+ "step": 4460
3204
+ },
3205
+ {
3206
+ "epoch": 0.7551951343132286,
3207
+ "grad_norm": 0.5341119766235352,
3208
+ "learning_rate": 0.00025574376155295845,
3209
+ "loss": 4.670759582519532,
3210
+ "step": 4470
3211
+ },
3212
+ {
3213
+ "epoch": 0.7568846088866362,
3214
+ "grad_norm": 0.5163617134094238,
3215
+ "learning_rate": 0.00025540349381716367,
3216
+ "loss": 4.689437484741211,
3217
+ "step": 4480
3218
+ },
3219
+ {
3220
+ "epoch": 0.758574083460044,
3221
+ "grad_norm": 0.6329180598258972,
3222
+ "learning_rate": 0.00025506215124897593,
3223
+ "loss": 4.6677288055419925,
3224
+ "step": 4490
3225
+ },
3226
+ {
3227
+ "epoch": 0.7602635580334516,
3228
+ "grad_norm": 0.5871708393096924,
3229
+ "learning_rate": 0.0002547197373291721,
3230
+ "loss": 4.678330993652343,
3231
+ "step": 4500
3232
+ },
3233
+ {
3234
+ "epoch": 0.7602635580334516,
3235
+ "eval_loss": 4.647042751312256,
3236
+ "eval_runtime": 3.6169,
3237
+ "eval_samples_per_second": 276.482,
3238
+ "eval_steps_per_second": 5.806,
3239
+ "step": 4500
3240
  }
3241
  ],
3242
  "logging_steps": 10,
 
3256
  "attributes": {}
3257
  }
3258
  },
3259
+ "total_flos": 1.50505569386496e+17,
3260
  "train_batch_size": 48,
3261
  "trial_name": null,
3262
  "trial_params": null