Azrail commited on
Commit
0034770
·
verified ·
1 Parent(s): ec5126b

Training in progress, step 131000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67a6c7abe32dd438fb09470397d8599e18c7c6f7d6e5ad7c2ea59aa52e0c0fc9
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18f6247fa697227171786e92b63492b81203ba9ab620eea2a35269c2dc5abc91
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:af9646577ee4ed03ad7c9691e7703d876a8256d338d3a2fb5035f6f80fe627b5
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a79b728b1351b728e46db09ab4e3bda84220fcf605f8e84a1af65a7e98ccf401
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1315ef35a655eddf08abff5aa18ec6897fdbfeff08c3f5d07895fadd41b93070
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13afedcbea29e4911157dfdebca89adaca3015ec55fbe8952619bfb77f49f98b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8acfe6d76758b902ab66b172fa1db8b08d2d4760abe1682738a74d50eadc0c50
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d530307a60624b67b44a38452390579f46394dc6c46c3e7e0b33446906fdcfb9
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.2402089271020904,
6
  "eval_steps": 500,
7
- "global_step": 130000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -23148,11 +23148,189 @@
23148
  "eval_steps_per_second": 15.045,
23149
  "num_input_tokens_seen": 68146442176,
23150
  "step": 130000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23151
  }
23152
  ],
23153
  "logging_steps": 50,
23154
  "max_steps": 140000,
23155
- "num_input_tokens_seen": 68146442176,
23156
  "num_train_epochs": 2,
23157
  "save_steps": 1000,
23158
  "stateful_callbacks": {
@@ -23167,7 +23345,7 @@
23167
  "attributes": {}
23168
  }
23169
  },
23170
- "total_flos": 1.206067727404671e+20,
23171
  "train_batch_size": 32,
23172
  "trial_name": null,
23173
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.249748977425856,
6
  "eval_steps": 500,
7
+ "global_step": 131000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
23148
  "eval_steps_per_second": 15.045,
23149
  "num_input_tokens_seen": 68146442176,
23150
  "step": 130000
23151
+ },
23152
+ {
23153
+ "epoch": 1.2406859296182788,
23154
+ "grad_norm": 0.12637196481227875,
23155
+ "learning_rate": 0.00028053434571568983,
23156
+ "loss": 2.0543,
23157
+ "num_input_tokens_seen": 68172655040,
23158
+ "step": 130050
23159
+ },
23160
+ {
23161
+ "epoch": 1.241162932134467,
23162
+ "grad_norm": 0.1351892203092575,
23163
+ "learning_rate": 0.000278017467984759,
23164
+ "loss": 2.0578,
23165
+ "num_input_tokens_seen": 68198869440,
23166
+ "step": 130100
23167
+ },
23168
+ {
23169
+ "epoch": 1.2416399346506553,
23170
+ "grad_norm": 0.12203965336084366,
23171
+ "learning_rate": 0.00027550757645927764,
23172
+ "loss": 2.0427,
23173
+ "num_input_tokens_seen": 68225083840,
23174
+ "step": 130150
23175
+ },
23176
+ {
23177
+ "epoch": 1.2421169371668435,
23178
+ "grad_norm": 0.13395994901657104,
23179
+ "learning_rate": 0.00027300475013022663,
23180
+ "loss": 2.0488,
23181
+ "num_input_tokens_seen": 68251293952,
23182
+ "step": 130200
23183
+ },
23184
+ {
23185
+ "epoch": 1.242593939683032,
23186
+ "grad_norm": 0.1291465014219284,
23187
+ "learning_rate": 0.0002705090677662311,
23188
+ "loss": 2.0484,
23189
+ "num_input_tokens_seen": 68277498432,
23190
+ "step": 130250
23191
+ },
23192
+ {
23193
+ "epoch": 1.24307094219922,
23194
+ "grad_norm": 0.12472834438085556,
23195
+ "learning_rate": 0.000268020607911083,
23196
+ "loss": 2.0538,
23197
+ "num_input_tokens_seen": 68303709440,
23198
+ "step": 130300
23199
+ },
23200
+ {
23201
+ "epoch": 1.2435479447154083,
23202
+ "grad_norm": 0.1263572871685028,
23203
+ "learning_rate": 0.0002655394488812677,
23204
+ "loss": 2.0487,
23205
+ "num_input_tokens_seen": 68329920512,
23206
+ "step": 130350
23207
+ },
23208
+ {
23209
+ "epoch": 1.2440249472315967,
23210
+ "grad_norm": 0.12614773213863373,
23211
+ "learning_rate": 0.0002630656687635007,
23212
+ "loss": 2.053,
23213
+ "num_input_tokens_seen": 68356112384,
23214
+ "step": 130400
23215
+ },
23216
+ {
23217
+ "epoch": 1.244501949747785,
23218
+ "grad_norm": 0.1241307333111763,
23219
+ "learning_rate": 0.0002605993454122687,
23220
+ "loss": 2.049,
23221
+ "num_input_tokens_seen": 68382320896,
23222
+ "step": 130450
23223
+ },
23224
+ {
23225
+ "epoch": 1.2449789522639731,
23226
+ "grad_norm": 0.12764516472816467,
23227
+ "learning_rate": 0.0002581405564473801,
23228
+ "loss": 2.0338,
23229
+ "num_input_tokens_seen": 68408534464,
23230
+ "step": 130500
23231
+ },
23232
+ {
23233
+ "epoch": 1.2449789522639731,
23234
+ "eval_loss": 1.9643968343734741,
23235
+ "eval_runtime": 82.7385,
23236
+ "eval_samples_per_second": 60.431,
23237
+ "eval_steps_per_second": 15.108,
23238
+ "num_input_tokens_seen": 68408534464,
23239
+ "step": 130500
23240
+ },
23241
+ {
23242
+ "epoch": 1.2454559547801614,
23243
+ "grad_norm": 0.1308233141899109,
23244
+ "learning_rate": 0.0002556893792515227,
23245
+ "loss": 2.0371,
23246
+ "num_input_tokens_seen": 68434747040,
23247
+ "step": 130550
23248
+ },
23249
+ {
23250
+ "epoch": 1.2459329572963498,
23251
+ "grad_norm": 0.12745235860347748,
23252
+ "learning_rate": 0.00025324589096782657,
23253
+ "loss": 2.0373,
23254
+ "num_input_tokens_seen": 68460951616,
23255
+ "step": 130600
23256
+ },
23257
+ {
23258
+ "epoch": 1.246409959812538,
23259
+ "grad_norm": 0.1278812736272812,
23260
+ "learning_rate": 0.0002508101684974387,
23261
+ "loss": 2.0405,
23262
+ "num_input_tokens_seen": 68487165696,
23263
+ "step": 130650
23264
+ },
23265
+ {
23266
+ "epoch": 1.2468869623287262,
23267
+ "grad_norm": 0.12204719334840775,
23268
+ "learning_rate": 0.00024838228849709997,
23269
+ "loss": 2.0424,
23270
+ "num_input_tokens_seen": 68513380096,
23271
+ "step": 130700
23272
+ },
23273
+ {
23274
+ "epoch": 1.2473639648449146,
23275
+ "grad_norm": 0.11976956576108932,
23276
+ "learning_rate": 0.0002459623273767354,
23277
+ "loss": 2.0596,
23278
+ "num_input_tokens_seen": 68539590240,
23279
+ "step": 130750
23280
+ },
23281
+ {
23282
+ "epoch": 1.2478409673611028,
23283
+ "grad_norm": 0.13120809197425842,
23284
+ "learning_rate": 0.000243550361297047,
23285
+ "loss": 2.037,
23286
+ "num_input_tokens_seen": 68565804640,
23287
+ "step": 130800
23288
+ },
23289
+ {
23290
+ "epoch": 1.248317969877291,
23291
+ "grad_norm": 0.12905927002429962,
23292
+ "learning_rate": 0.00024114646616711844,
23293
+ "loss": 2.0341,
23294
+ "num_input_tokens_seen": 68592007552,
23295
+ "step": 130850
23296
+ },
23297
+ {
23298
+ "epoch": 1.2487949723934793,
23299
+ "grad_norm": 0.12697407603263855,
23300
+ "learning_rate": 0.00023875071764202561,
23301
+ "loss": 2.05,
23302
+ "num_input_tokens_seen": 68618221952,
23303
+ "step": 130900
23304
+ },
23305
+ {
23306
+ "epoch": 1.2492719749096677,
23307
+ "grad_norm": 0.12694934010505676,
23308
+ "learning_rate": 0.00023636319112045495,
23309
+ "loss": 2.0436,
23310
+ "num_input_tokens_seen": 68644425984,
23311
+ "step": 130950
23312
+ },
23313
+ {
23314
+ "epoch": 1.249748977425856,
23315
+ "grad_norm": 0.1360025703907013,
23316
+ "learning_rate": 0.00023398396174233177,
23317
+ "loss": 2.0506,
23318
+ "num_input_tokens_seen": 68670633664,
23319
+ "step": 131000
23320
+ },
23321
+ {
23322
+ "epoch": 1.249748977425856,
23323
+ "eval_loss": 1.962631106376648,
23324
+ "eval_runtime": 82.4327,
23325
+ "eval_samples_per_second": 60.656,
23326
+ "eval_steps_per_second": 15.164,
23327
+ "num_input_tokens_seen": 68670633664,
23328
+ "step": 131000
23329
  }
23330
  ],
23331
  "logging_steps": 50,
23332
  "max_steps": 140000,
23333
+ "num_input_tokens_seen": 68670633664,
23334
  "num_train_epochs": 2,
23335
  "save_steps": 1000,
23336
  "stateful_callbacks": {
 
23345
  "attributes": {}
23346
  }
23347
  },
23348
+ "total_flos": 1.2153449606169969e+20,
23349
  "train_batch_size": 32,
23350
  "trial_name": null,
23351
  "trial_params": null