Azrail commited on
Commit
2992939
·
verified ·
1 Parent(s): a786784

Training in progress, step 15000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0579f8b01bd92a4b6d4d9542187f9f6be5d525493ee4cacf89313462b0d4fc29
3
  size 150625560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15f637ff72e852c00df336464cba31267a78c2fec942618a4cf3dbc081150cb8
3
  size 150625560
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1ac4e5f1a091d05231fad9fd4f9941afbf6737a4f9256414d7439dd21637791
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11255a9366d03d2ecf115313602ca401e81860858d4e1ecad341feef41b0e95b
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dac96a69b6625532fa7a1849a782b63a79e8d1b28e764bc8297e354d748f16c9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea828a56e17bf773dc8e4fa2c22d13619b805c6b9321028dd494ff57e5daf8e6
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:081dc59c3c452b8ce89bfce5eae0952bf765aeed7903bbba40be0fb195d20006
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1819c72414dd202fc7a5b387187559436ac1d66f4c4de3f13c18065ffbdf0216
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 3.3792567557372846,
6
  "eval_steps": 500,
7
- "global_step": 14000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3060,11 +3060,229 @@
3060
  "eval_steps_per_second": 20.347,
3061
  "num_input_tokens_seen": 6763271617,
3062
  "step": 14000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3063
  }
3064
  ],
3065
  "logging_steps": 50,
3066
  "max_steps": 16568,
3067
- "num_input_tokens_seen": 6763271617,
3068
  "num_train_epochs": 4,
3069
  "save_steps": 1000,
3070
  "stateful_callbacks": {
@@ -3079,7 +3297,7 @@
3079
  "attributes": {}
3080
  }
3081
  },
3082
- "total_flos": 1.809241167078482e+18,
3083
  "train_batch_size": 16,
3084
  "trial_name": null,
3085
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.620667803310349,
6
  "eval_steps": 500,
7
+ "global_step": 15000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3060
  "eval_steps_per_second": 20.347,
3061
  "num_input_tokens_seen": 6763271617,
3062
  "step": 14000
3063
+ },
3064
+ {
3065
+ "epoch": 3.3913273081159376,
3066
+ "grad_norm": 0.25,
3067
+ "learning_rate": 9.499019164025955e-06,
3068
+ "loss": 2.0975,
3069
+ "mean_token_accuracy": 0.5536782286874949,
3070
+ "num_input_tokens_seen": 6787391841,
3071
+ "num_tokens": 2860344977.0,
3072
+ "step": 14050
3073
+ },
3074
+ {
3075
+ "epoch": 3.4033978604945907,
3076
+ "grad_norm": 0.25,
3077
+ "learning_rate": 9.310396861324884e-06,
3078
+ "loss": 2.1022,
3079
+ "mean_token_accuracy": 0.5538700968772173,
3080
+ "num_input_tokens_seen": 6811630961,
3081
+ "num_tokens": 2870526506.0,
3082
+ "step": 14100
3083
+ },
3084
+ {
3085
+ "epoch": 3.415468412873244,
3086
+ "grad_norm": 0.2431640625,
3087
+ "learning_rate": 9.121774558623813e-06,
3088
+ "loss": 2.0934,
3089
+ "mean_token_accuracy": 0.5550757900252938,
3090
+ "num_input_tokens_seen": 6835811825,
3091
+ "num_tokens": 2880722898.0,
3092
+ "step": 14150
3093
+ },
3094
+ {
3095
+ "epoch": 3.4275389652518973,
3096
+ "grad_norm": 0.2578125,
3097
+ "learning_rate": 8.93315225592274e-06,
3098
+ "loss": 2.0875,
3099
+ "mean_token_accuracy": 0.5558257311582565,
3100
+ "num_input_tokens_seen": 6859918049,
3101
+ "num_tokens": 2890925546.0,
3102
+ "step": 14200
3103
+ },
3104
+ {
3105
+ "epoch": 3.439609517630551,
3106
+ "grad_norm": 0.2294921875,
3107
+ "learning_rate": 8.74452995322167e-06,
3108
+ "loss": 2.0969,
3109
+ "mean_token_accuracy": 0.5544555878639221,
3110
+ "num_input_tokens_seen": 6883973409,
3111
+ "num_tokens": 2901007248.0,
3112
+ "step": 14250
3113
+ },
3114
+ {
3115
+ "epoch": 3.451680070009204,
3116
+ "grad_norm": 0.25390625,
3117
+ "learning_rate": 8.555907650520598e-06,
3118
+ "loss": 2.0987,
3119
+ "mean_token_accuracy": 0.5544828617200256,
3120
+ "num_input_tokens_seen": 6908263985,
3121
+ "num_tokens": 2911355124.0,
3122
+ "step": 14300
3123
+ },
3124
+ {
3125
+ "epoch": 3.463750622387857,
3126
+ "grad_norm": 0.271484375,
3127
+ "learning_rate": 8.367285347819527e-06,
3128
+ "loss": 2.0889,
3129
+ "mean_token_accuracy": 0.5557316156104207,
3130
+ "num_input_tokens_seen": 6932344993,
3131
+ "num_tokens": 2921442830.0,
3132
+ "step": 14350
3133
+ },
3134
+ {
3135
+ "epoch": 3.4758211747665104,
3136
+ "grad_norm": 0.255859375,
3137
+ "learning_rate": 8.178663045118456e-06,
3138
+ "loss": 2.0979,
3139
+ "mean_token_accuracy": 0.5547628674656153,
3140
+ "num_input_tokens_seen": 6956417041,
3141
+ "num_tokens": 2931461417.0,
3142
+ "step": 14400
3143
+ },
3144
+ {
3145
+ "epoch": 3.4878917271451635,
3146
+ "grad_norm": 0.234375,
3147
+ "learning_rate": 7.990040742417383e-06,
3148
+ "loss": 2.1005,
3149
+ "mean_token_accuracy": 0.5539160283654928,
3150
+ "num_input_tokens_seen": 6980421889,
3151
+ "num_tokens": 2941531928.0,
3152
+ "step": 14450
3153
+ },
3154
+ {
3155
+ "epoch": 3.4999622795238166,
3156
+ "grad_norm": 0.275390625,
3157
+ "learning_rate": 7.801418439716313e-06,
3158
+ "loss": 2.1017,
3159
+ "num_input_tokens_seen": 7004552193,
3160
+ "step": 14500
3161
+ },
3162
+ {
3163
+ "epoch": 3.4999622795238166,
3164
+ "eval_loss": 1.9681284427642822,
3165
+ "eval_mean_token_accuracy": 0.5785388401566912,
3166
+ "eval_num_tokens": 2951727207.0,
3167
+ "eval_runtime": 131.2276,
3168
+ "eval_samples_per_second": 81.629,
3169
+ "eval_steps_per_second": 20.407,
3170
+ "num_input_tokens_seen": 7004552193,
3171
+ "step": 14500
3172
+ },
3173
+ {
3174
+ "epoch": 3.51203283190247,
3175
+ "grad_norm": 0.267578125,
3176
+ "learning_rate": 7.612796137015241e-06,
3177
+ "loss": 2.09,
3178
+ "mean_token_accuracy": 0.5543626462481916,
3179
+ "num_input_tokens_seen": 7028775953,
3180
+ "num_tokens": 2961945579.0,
3181
+ "step": 14550
3182
+ },
3183
+ {
3184
+ "epoch": 3.524103384281123,
3185
+ "grad_norm": 0.26171875,
3186
+ "learning_rate": 7.42417383431417e-06,
3187
+ "loss": 2.0978,
3188
+ "mean_token_accuracy": 0.5544422981515527,
3189
+ "num_input_tokens_seen": 7052883457,
3190
+ "num_tokens": 2972173798.0,
3191
+ "step": 14600
3192
+ },
3193
+ {
3194
+ "epoch": 3.536173936659776,
3195
+ "grad_norm": 0.251953125,
3196
+ "learning_rate": 7.235551531613098e-06,
3197
+ "loss": 2.0915,
3198
+ "mean_token_accuracy": 0.5559014651551842,
3199
+ "num_input_tokens_seen": 7077135185,
3200
+ "num_tokens": 2982315453.0,
3201
+ "step": 14650
3202
+ },
3203
+ {
3204
+ "epoch": 3.5482444890384297,
3205
+ "grad_norm": 0.310546875,
3206
+ "learning_rate": 7.0469292289120274e-06,
3207
+ "loss": 2.0932,
3208
+ "mean_token_accuracy": 0.5552764968574047,
3209
+ "num_input_tokens_seen": 7101260305,
3210
+ "num_tokens": 2992557355.0,
3211
+ "step": 14700
3212
+ },
3213
+ {
3214
+ "epoch": 3.5603150414170828,
3215
+ "grad_norm": 0.25390625,
3216
+ "learning_rate": 6.858306926210955e-06,
3217
+ "loss": 2.0959,
3218
+ "mean_token_accuracy": 0.555088207796216,
3219
+ "num_input_tokens_seen": 7125198545,
3220
+ "num_tokens": 3002657117.0,
3221
+ "step": 14750
3222
+ },
3223
+ {
3224
+ "epoch": 3.572385593795736,
3225
+ "grad_norm": 0.2314453125,
3226
+ "learning_rate": 6.669684623509884e-06,
3227
+ "loss": 2.0933,
3228
+ "mean_token_accuracy": 0.5554985254630447,
3229
+ "num_input_tokens_seen": 7149297905,
3230
+ "num_tokens": 3012818977.0,
3231
+ "step": 14800
3232
+ },
3233
+ {
3234
+ "epoch": 3.5844561461743893,
3235
+ "grad_norm": 0.23828125,
3236
+ "learning_rate": 6.481062320808813e-06,
3237
+ "loss": 2.0901,
3238
+ "mean_token_accuracy": 0.5556722393259406,
3239
+ "num_input_tokens_seen": 7173408417,
3240
+ "num_tokens": 3022993500.0,
3241
+ "step": 14850
3242
+ },
3243
+ {
3244
+ "epoch": 3.5965266985530424,
3245
+ "grad_norm": 0.279296875,
3246
+ "learning_rate": 6.292440018107741e-06,
3247
+ "loss": 2.0862,
3248
+ "mean_token_accuracy": 0.5560053834319114,
3249
+ "num_input_tokens_seen": 7197689201,
3250
+ "num_tokens": 3033239485.0,
3251
+ "step": 14900
3252
+ },
3253
+ {
3254
+ "epoch": 3.608597250931696,
3255
+ "grad_norm": 0.265625,
3256
+ "learning_rate": 6.10381771540667e-06,
3257
+ "loss": 2.093,
3258
+ "mean_token_accuracy": 0.5550377672165632,
3259
+ "num_input_tokens_seen": 7221805553,
3260
+ "num_tokens": 3043356374.0,
3261
+ "step": 14950
3262
+ },
3263
+ {
3264
+ "epoch": 3.620667803310349,
3265
+ "grad_norm": 0.24609375,
3266
+ "learning_rate": 5.915195412705598e-06,
3267
+ "loss": 2.0994,
3268
+ "num_input_tokens_seen": 7245951473,
3269
+ "step": 15000
3270
+ },
3271
+ {
3272
+ "epoch": 3.620667803310349,
3273
+ "eval_loss": 1.9680702686309814,
3274
+ "eval_mean_token_accuracy": 0.5785124528710748,
3275
+ "eval_num_tokens": 3053564564.0,
3276
+ "eval_runtime": 130.6855,
3277
+ "eval_samples_per_second": 81.968,
3278
+ "eval_steps_per_second": 20.492,
3279
+ "num_input_tokens_seen": 7245951473,
3280
+ "step": 15000
3281
  }
3282
  ],
3283
  "logging_steps": 50,
3284
  "max_steps": 16568,
3285
+ "num_input_tokens_seen": 7245951473,
3286
  "num_train_epochs": 4,
3287
  "save_steps": 1000,
3288
  "stateful_callbacks": {
 
3297
  "attributes": {}
3298
  }
3299
  },
3300
+ "total_flos": 1.9383627395138765e+18,
3301
  "train_batch_size": 16,
3302
  "trial_name": null,
3303
  "trial_params": null