Azrail committed on
Commit
8a1068a
·
verified ·
1 Parent(s): 9e9455e

Training in progress, step 58000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16b35a6c5a2893347ac39200ce6524a1890f21615a98cf260909a1625f36f1c5
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fbbcf8e4efabf5866400ce20d5f64dfe9bcdba3c76105321e75b94424bbdf9a
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b96c4f49154280d995e547e25a75aad825b4ac333aa881c2f7edaa3460a4415
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee28446b68e061d51e2acb6d49ad965661e91bf2d3291a5dc5003af4c9992cc6
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b80a94302b027aba469e721f259f7cea336e0f08145beaf0eef00eec23f3459c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec1bfb0db1c21e8b4cd52af95928aa8366b624cdfe8a7ae4baa053e84325dfb8
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d03f04e05cd70ad1a826e9dcf44af396ac68835a057941493a30d6d09cfeca51
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e04abc75ac3354daa3070b9f9eb5e8a95eba4855d092af143aa714bd01a0140a
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.38341043582667833,
6
  "eval_steps": 500,
7
- "global_step": 57000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -10154,11 +10154,189 @@
10154
  "eval_steps_per_second": 23.699,
10155
  "num_input_tokens_seen": 14942208000,
10156
  "step": 57000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10157
  }
10158
  ],
10159
  "logging_steps": 50,
10160
  "max_steps": 60000,
10161
- "num_input_tokens_seen": 14942208000,
10162
  "num_train_epochs": 1,
10163
  "save_steps": 1000,
10164
  "stateful_callbacks": {
@@ -10173,7 +10351,7 @@
10173
  "attributes": {}
10174
  }
10175
  },
10176
- "total_flos": 3.99718647595008e+18,
10177
  "train_batch_size": 64,
10178
  "trial_name": null,
10179
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.39013693470083055,
6
  "eval_steps": 500,
7
+ "global_step": 58000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
10154
  "eval_steps_per_second": 23.699,
10155
  "num_input_tokens_seen": 14942208000,
10156
  "step": 57000
10157
+ },
10158
+ {
10159
+ "epoch": 0.38374676077038594,
10160
+ "grad_norm": 0.255500853061676,
10161
+ "learning_rate": 0.0004869115258460635,
10162
+ "loss": 3.0102,
10163
+ "num_input_tokens_seen": 14955315200,
10164
+ "step": 57050
10165
+ },
10166
+ {
10167
+ "epoch": 0.38408308571409355,
10168
+ "grad_norm": 0.18287675082683563,
10169
+ "learning_rate": 0.0004738320218785281,
10170
+ "loss": 3.0074,
10171
+ "num_input_tokens_seen": 14968422400,
10172
+ "step": 57100
10173
+ },
10174
+ {
10175
+ "epoch": 0.38441941065780116,
10176
+ "grad_norm": 0.1864452064037323,
10177
+ "learning_rate": 0.0004607704521360776,
10178
+ "loss": 3.0181,
10179
+ "num_input_tokens_seen": 14981529600,
10180
+ "step": 57150
10181
+ },
10182
+ {
10183
+ "epoch": 0.3847557356015088,
10184
+ "grad_norm": 0.17273065447807312,
10185
+ "learning_rate": 0.00044773576836617336,
10186
+ "loss": 3.0077,
10187
+ "num_input_tokens_seen": 14994636800,
10188
+ "step": 57200
10189
+ },
10190
+ {
10191
+ "epoch": 0.3850920605452164,
10192
+ "grad_norm": 0.17590677738189697,
10193
+ "learning_rate": 0.00043473690388997434,
10194
+ "loss": 3.0118,
10195
+ "num_input_tokens_seen": 15007744000,
10196
+ "step": 57250
10197
+ },
10198
+ {
10199
+ "epoch": 0.385428385488924,
10200
+ "grad_norm": 0.16380582749843597,
10201
+ "learning_rate": 0.0004217827674798845,
10202
+ "loss": 3.0074,
10203
+ "num_input_tokens_seen": 15020851200,
10204
+ "step": 57300
10205
+ },
10206
+ {
10207
+ "epoch": 0.3857647104326316,
10208
+ "grad_norm": 0.19464251399040222,
10209
+ "learning_rate": 0.00040888223725392626,
10210
+ "loss": 3.0126,
10211
+ "num_input_tokens_seen": 15033958400,
10212
+ "step": 57350
10213
+ },
10214
+ {
10215
+ "epoch": 0.3861010353763392,
10216
+ "grad_norm": 0.17150136828422546,
10217
+ "learning_rate": 0.0003960441545911204,
10218
+ "loss": 3.0049,
10219
+ "num_input_tokens_seen": 15047065600,
10220
+ "step": 57400
10221
+ },
10222
+ {
10223
+ "epoch": 0.38643736032004683,
10224
+ "grad_norm": 0.1877928376197815,
10225
+ "learning_rate": 0.00038327731807204744,
10226
+ "loss": 3.0089,
10227
+ "num_input_tokens_seen": 15060172800,
10228
+ "step": 57450
10229
+ },
10230
+ {
10231
+ "epoch": 0.38677368526375444,
10232
+ "grad_norm": 0.2605326771736145,
10233
+ "learning_rate": 0.0003705904774487396,
10234
+ "loss": 3.0115,
10235
+ "num_input_tokens_seen": 15073280000,
10236
+ "step": 57500
10237
+ },
10238
+ {
10239
+ "epoch": 0.38677368526375444,
10240
+ "eval_loss": 2.9029135704040527,
10241
+ "eval_runtime": 53.9097,
10242
+ "eval_samples_per_second": 92.748,
10243
+ "eval_steps_per_second": 23.187,
10244
+ "num_input_tokens_seen": 15073280000,
10245
+ "step": 57500
10246
+ },
10247
+ {
10248
+ "epoch": 0.38711001020746205,
10249
+ "grad_norm": 0.21006393432617188,
10250
+ "learning_rate": 0.0003579923276480387,
10251
+ "loss": 3.0044,
10252
+ "num_input_tokens_seen": 15086387200,
10253
+ "step": 57550
10254
+ },
10255
+ {
10256
+ "epoch": 0.38744633515116966,
10257
+ "grad_norm": 0.1743878722190857,
10258
+ "learning_rate": 0.00034549150281252633,
10259
+ "loss": 3.0114,
10260
+ "num_input_tokens_seen": 15099494400,
10261
+ "step": 57600
10262
+ },
10263
+ {
10264
+ "epoch": 0.3877826600948773,
10265
+ "grad_norm": 0.16699257493019104,
10266
+ "learning_rate": 0.00033309657038311456,
10267
+ "loss": 3.0041,
10268
+ "num_input_tokens_seen": 15112601600,
10269
+ "step": 57650
10270
+ },
10271
+ {
10272
+ "epoch": 0.3881189850385849,
10273
+ "grad_norm": 0.17115868628025055,
10274
+ "learning_rate": 0.00032081602522734986,
10275
+ "loss": 3.0051,
10276
+ "num_input_tokens_seen": 15125708800,
10277
+ "step": 57700
10278
+ },
10279
+ {
10280
+ "epoch": 0.3884553099822925,
10281
+ "grad_norm": 0.16885310411453247,
10282
+ "learning_rate": 0.0003086582838174551,
10283
+ "loss": 2.9969,
10284
+ "num_input_tokens_seen": 15138816000,
10285
+ "step": 57750
10286
+ },
10287
+ {
10288
+ "epoch": 0.3887916349260001,
10289
+ "grad_norm": 0.17101123929023743,
10290
+ "learning_rate": 0.0002966316784621,
10291
+ "loss": 2.9947,
10292
+ "num_input_tokens_seen": 15151923200,
10293
+ "step": 57800
10294
+ },
10295
+ {
10296
+ "epoch": 0.3891279598697077,
10297
+ "grad_norm": 0.1529199331998825,
10298
+ "learning_rate": 0.0002847444515958523,
10299
+ "loss": 3.0019,
10300
+ "num_input_tokens_seen": 15165030400,
10301
+ "step": 57850
10302
+ },
10303
+ {
10304
+ "epoch": 0.38946428481341533,
10305
+ "grad_norm": 0.16087768971920013,
10306
+ "learning_rate": 0.00027300475013022663,
10307
+ "loss": 2.9947,
10308
+ "num_input_tokens_seen": 15178137600,
10309
+ "step": 57900
10310
+ },
10311
+ {
10312
+ "epoch": 0.38980060975712294,
10313
+ "grad_norm": 0.16023555397987366,
10314
+ "learning_rate": 0.00026142061987019576,
10315
+ "loss": 3.0022,
10316
+ "num_input_tokens_seen": 15191244800,
10317
+ "step": 57950
10318
+ },
10319
+ {
10320
+ "epoch": 0.39013693470083055,
10321
+ "grad_norm": 0.16161410510540009,
10322
+ "learning_rate": 0.0002500000000000001,
10323
+ "loss": 2.9931,
10324
+ "num_input_tokens_seen": 15204352000,
10325
+ "step": 58000
10326
+ },
10327
+ {
10328
+ "epoch": 0.39013693470083055,
10329
+ "eval_loss": 2.8950610160827637,
10330
+ "eval_runtime": 53.5434,
10331
+ "eval_samples_per_second": 93.382,
10332
+ "eval_steps_per_second": 23.346,
10333
+ "num_input_tokens_seen": 15204352000,
10334
+ "step": 58000
10335
  }
10336
  ],
10337
  "logging_steps": 50,
10338
  "max_steps": 60000,
10339
+ "num_input_tokens_seen": 15204352000,
10340
  "num_train_epochs": 1,
10341
  "save_steps": 1000,
10342
  "stateful_callbacks": {
 
10351
  "attributes": {}
10352
  }
10353
  },
10354
+ "total_flos": 4.06731255447552e+18,
10355
  "train_batch_size": 64,
10356
  "trial_name": null,
10357
  "trial_params": null