Azrail commited on
Commit
0f94b24
·
verified ·
1 Parent(s): bfff457

Training in progress, step 52000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55cdd483825b29b30f7f81376356d5e2543f5fc8a3afa0e3b843e8665cf2d119
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17f79bfd92c936c07be11debb700728ae4b7e0771937dc9aee38748f4dc80dc3
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e04bcdcff48abed6280a66dda32419f983e084885866d5f14e841f6587fe0aff
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdcb0e96beb98fcdfd50cc3b612cd068e544f01ef0961afbf353f3d6eabba3ce
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:871241677306799dd94bb012f99e77b35a49885274956fc7cf6b8c017fdd6180
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:746267b8ba996549a033d105e363328c635034a7afa0e3070ea8447957aaca5a
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38e628546b6b3793b4db9c04b0c48bd7f457b5c91e760c9c29b133754fb90815
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24b3fcbecd3d55078c913506015bb6e1182f04ee52bf4c0845fc043823a61161
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3430514425817648,
6
  "eval_steps": 500,
7
- "global_step": 51000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9086,11 +9086,189 @@
9086
  "eval_steps_per_second": 23.543,
9087
  "num_input_tokens_seen": 13369344000,
9088
  "step": 51000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9089
  }
9090
  ],
9091
  "logging_steps": 50,
9092
  "max_steps": 60000,
9093
- "num_input_tokens_seen": 13369344000,
9094
  "num_train_epochs": 1,
9095
  "save_steps": 1000,
9096
  "stateful_callbacks": {
@@ -9105,7 +9283,7 @@
9105
  "attributes": {}
9106
  }
9107
  },
9108
- "total_flos": 3.57643000479744e+18,
9109
  "train_batch_size": 64,
9110
  "trial_name": null,
9111
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.34977794145591706,
6
  "eval_steps": 500,
7
+ "global_step": 52000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9086
  "eval_steps_per_second": 23.543,
9087
  "num_input_tokens_seen": 13369344000,
9088
  "step": 51000
9089
+ },
9090
+ {
9091
+ "epoch": 0.3433877675254724,
9092
+ "grad_norm": 0.19399498403072357,
9093
+ "learning_rate": 0.001,
9094
+ "loss": 3.0583,
9095
+ "num_input_tokens_seen": 13382451200,
9096
+ "step": 51050
9097
+ },
9098
+ {
9099
+ "epoch": 0.34372409246918,
9100
+ "grad_norm": 0.19893072545528412,
9101
+ "learning_rate": 0.001,
9102
+ "loss": 3.0505,
9103
+ "num_input_tokens_seen": 13395558400,
9104
+ "step": 51100
9105
+ },
9106
+ {
9107
+ "epoch": 0.3440604174128876,
9108
+ "grad_norm": 0.17791305482387543,
9109
+ "learning_rate": 0.001,
9110
+ "loss": 3.0504,
9111
+ "num_input_tokens_seen": 13408665600,
9112
+ "step": 51150
9113
+ },
9114
+ {
9115
+ "epoch": 0.34439674235659523,
9116
+ "grad_norm": 0.7631425261497498,
9117
+ "learning_rate": 0.001,
9118
+ "loss": 3.0483,
9119
+ "num_input_tokens_seen": 13421772800,
9120
+ "step": 51200
9121
+ },
9122
+ {
9123
+ "epoch": 0.34473306730030284,
9124
+ "grad_norm": 0.22620978951454163,
9125
+ "learning_rate": 0.001,
9126
+ "loss": 3.0512,
9127
+ "num_input_tokens_seen": 13434880000,
9128
+ "step": 51250
9129
+ },
9130
+ {
9131
+ "epoch": 0.34506939224401045,
9132
+ "grad_norm": 0.219919815659523,
9133
+ "learning_rate": 0.001,
9134
+ "loss": 3.0415,
9135
+ "num_input_tokens_seen": 13447987200,
9136
+ "step": 51300
9137
+ },
9138
+ {
9139
+ "epoch": 0.34540571718771806,
9140
+ "grad_norm": 0.21654649078845978,
9141
+ "learning_rate": 0.001,
9142
+ "loss": 3.062,
9143
+ "num_input_tokens_seen": 13461094400,
9144
+ "step": 51350
9145
+ },
9146
+ {
9147
+ "epoch": 0.3457420421314257,
9148
+ "grad_norm": 0.2439095377922058,
9149
+ "learning_rate": 0.001,
9150
+ "loss": 3.0478,
9151
+ "num_input_tokens_seen": 13474201600,
9152
+ "step": 51400
9153
+ },
9154
+ {
9155
+ "epoch": 0.3460783670751333,
9156
+ "grad_norm": 0.19535380601882935,
9157
+ "learning_rate": 0.001,
9158
+ "loss": 3.0444,
9159
+ "num_input_tokens_seen": 13487308800,
9160
+ "step": 51450
9161
+ },
9162
+ {
9163
+ "epoch": 0.3464146920188409,
9164
+ "grad_norm": 0.1964534968137741,
9165
+ "learning_rate": 0.001,
9166
+ "loss": 3.049,
9167
+ "num_input_tokens_seen": 13500416000,
9168
+ "step": 51500
9169
+ },
9170
+ {
9171
+ "epoch": 0.3464146920188409,
9172
+ "eval_loss": 2.945749044418335,
9173
+ "eval_runtime": 53.0447,
9174
+ "eval_samples_per_second": 94.26,
9175
+ "eval_steps_per_second": 23.565,
9176
+ "num_input_tokens_seen": 13500416000,
9177
+ "step": 51500
9178
+ },
9179
+ {
9180
+ "epoch": 0.3467510169625485,
9181
+ "grad_norm": 0.2085062563419342,
9182
+ "learning_rate": 0.001,
9183
+ "loss": 3.0582,
9184
+ "num_input_tokens_seen": 13513523200,
9185
+ "step": 51550
9186
+ },
9187
+ {
9188
+ "epoch": 0.3470873419062562,
9189
+ "grad_norm": 0.1903097778558731,
9190
+ "learning_rate": 0.001,
9191
+ "loss": 3.0488,
9192
+ "num_input_tokens_seen": 13526630400,
9193
+ "step": 51600
9194
+ },
9195
+ {
9196
+ "epoch": 0.3474236668499638,
9197
+ "grad_norm": 0.20101405680179596,
9198
+ "learning_rate": 0.001,
9199
+ "loss": 3.0573,
9200
+ "num_input_tokens_seen": 13539737600,
9201
+ "step": 51650
9202
+ },
9203
+ {
9204
+ "epoch": 0.3477599917936714,
9205
+ "grad_norm": 0.6418889164924622,
9206
+ "learning_rate": 0.001,
9207
+ "loss": 3.0513,
9208
+ "num_input_tokens_seen": 13552844800,
9209
+ "step": 51700
9210
+ },
9211
+ {
9212
+ "epoch": 0.348096316737379,
9213
+ "grad_norm": 0.22524093091487885,
9214
+ "learning_rate": 0.001,
9215
+ "loss": 3.0567,
9216
+ "num_input_tokens_seen": 13565952000,
9217
+ "step": 51750
9218
+ },
9219
+ {
9220
+ "epoch": 0.3484326416810866,
9221
+ "grad_norm": 0.21830599009990692,
9222
+ "learning_rate": 0.001,
9223
+ "loss": 3.0538,
9224
+ "num_input_tokens_seen": 13579059200,
9225
+ "step": 51800
9226
+ },
9227
+ {
9228
+ "epoch": 0.34876896662479423,
9229
+ "grad_norm": 0.6111611127853394,
9230
+ "learning_rate": 0.001,
9231
+ "loss": 3.0581,
9232
+ "num_input_tokens_seen": 13592166400,
9233
+ "step": 51850
9234
+ },
9235
+ {
9236
+ "epoch": 0.34910529156850184,
9237
+ "grad_norm": 0.3782864511013031,
9238
+ "learning_rate": 0.001,
9239
+ "loss": 3.0694,
9240
+ "num_input_tokens_seen": 13605273600,
9241
+ "step": 51900
9242
+ },
9243
+ {
9244
+ "epoch": 0.34944161651220945,
9245
+ "grad_norm": 0.23944802582263947,
9246
+ "learning_rate": 0.001,
9247
+ "loss": 3.0683,
9248
+ "num_input_tokens_seen": 13618380800,
9249
+ "step": 51950
9250
+ },
9251
+ {
9252
+ "epoch": 0.34977794145591706,
9253
+ "grad_norm": 0.20257577300071716,
9254
+ "learning_rate": 0.001,
9255
+ "loss": 3.0509,
9256
+ "num_input_tokens_seen": 13631488000,
9257
+ "step": 52000
9258
+ },
9259
+ {
9260
+ "epoch": 0.34977794145591706,
9261
+ "eval_loss": 2.94769287109375,
9262
+ "eval_runtime": 53.1351,
9263
+ "eval_samples_per_second": 94.1,
9264
+ "eval_steps_per_second": 23.525,
9265
+ "num_input_tokens_seen": 13631488000,
9266
+ "step": 52000
9267
  }
9268
  ],
9269
  "logging_steps": 50,
9270
  "max_steps": 60000,
9271
+ "num_input_tokens_seen": 13631488000,
9272
  "num_train_epochs": 1,
9273
  "save_steps": 1000,
9274
  "stateful_callbacks": {
 
9283
  "attributes": {}
9284
  }
9285
  },
9286
+ "total_flos": 3.64655608332288e+18,
9287
  "train_batch_size": 64,
9288
  "trial_name": null,
9289
  "trial_params": null