Azrail commited on
Commit
5ebe303
·
verified ·
1 Parent(s): 95ba003

Training in progress, step 30000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:496d2b46e7c0c2d415917c3f430a70a0aac599fe885f35c60cc3199532b41d7a
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9965fcf14e783e7e1d55074ea2afa9a825c414e7bb1e05e788c2b6e78b01e868
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5a0ef03792604564acfd0823f03cfd37314bbc8a8eb68b05d8de1d1cfee687a
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e44a1859e7ded3de3d773c5abac76c0fc5f7c6f4fc38577dfe331b1a4c391ab7
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:782fee7d7309ad00bf19a629f420a995596231f63b5af04a7f7244e077883f2d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc95bdc35ebe00877717681894afcd7d44f457b0583fea8b14d22f39dd179eb8
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6038c3966e5acd5e329cd1d75f036dea625d34bb913a8f0d05452e8d1784e0ba
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d0c8a94ae7b3402d9f6c538decfc8292fd64108bb86fd10da3f27734428bf0b
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6370152540441545,
6
  "eval_steps": 500,
7
- "global_step": 29000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5170,11 +5170,189 @@
5170
  "eval_steps_per_second": 18.663,
5171
  "num_input_tokens_seen": 30408700160,
5172
  "step": 29000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5173
  }
5174
  ],
5175
  "logging_steps": 50,
5176
  "max_steps": 200000,
5177
- "num_input_tokens_seen": 30408700160,
5178
  "num_train_epochs": 5,
5179
  "save_steps": 1000,
5180
  "stateful_callbacks": {
@@ -5189,7 +5367,7 @@
5189
  "attributes": {}
5190
  }
5191
  },
5192
- "total_flos": 1.7317977576074772e+19,
5193
  "train_batch_size": 64,
5194
  "trial_name": null,
5195
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.6589812972870563,
6
  "eval_steps": 500,
7
+ "global_step": 30000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5170
  "eval_steps_per_second": 18.663,
5171
  "num_input_tokens_seen": 30408700160,
5172
  "step": 29000
5173
+ },
5174
+ {
5175
+ "epoch": 0.6381135562062996,
5176
+ "grad_norm": 0.13853897154331207,
5177
+ "learning_rate": 0.001,
5178
+ "loss": 2.6719,
5179
+ "num_input_tokens_seen": 30461128960,
5180
+ "step": 29050
5181
+ },
5182
+ {
5183
+ "epoch": 0.6392118583684446,
5184
+ "grad_norm": 0.14228977262973785,
5185
+ "learning_rate": 0.001,
5186
+ "loss": 2.6788,
5187
+ "num_input_tokens_seen": 30513557760,
5188
+ "step": 29100
5189
+ },
5190
+ {
5191
+ "epoch": 0.6403101605305898,
5192
+ "grad_norm": 0.13464143872261047,
5193
+ "learning_rate": 0.001,
5194
+ "loss": 2.6743,
5195
+ "num_input_tokens_seen": 30565986560,
5196
+ "step": 29150
5197
+ },
5198
+ {
5199
+ "epoch": 0.6414084626927349,
5200
+ "grad_norm": 0.15960821509361267,
5201
+ "learning_rate": 0.001,
5202
+ "loss": 2.6729,
5203
+ "num_input_tokens_seen": 30618415360,
5204
+ "step": 29200
5205
+ },
5206
+ {
5207
+ "epoch": 0.64250676485488,
5208
+ "grad_norm": 0.13830585777759552,
5209
+ "learning_rate": 0.001,
5210
+ "loss": 2.6723,
5211
+ "num_input_tokens_seen": 30670844160,
5212
+ "step": 29250
5213
+ },
5214
+ {
5215
+ "epoch": 0.643605067017025,
5216
+ "grad_norm": 0.14440728724002838,
5217
+ "learning_rate": 0.001,
5218
+ "loss": 2.664,
5219
+ "num_input_tokens_seen": 30723272960,
5220
+ "step": 29300
5221
+ },
5222
+ {
5223
+ "epoch": 0.6447033691791701,
5224
+ "grad_norm": 0.14259463548660278,
5225
+ "learning_rate": 0.001,
5226
+ "loss": 2.6675,
5227
+ "num_input_tokens_seen": 30775701760,
5228
+ "step": 29350
5229
+ },
5230
+ {
5231
+ "epoch": 0.6458016713413153,
5232
+ "grad_norm": 0.1462564468383789,
5233
+ "learning_rate": 0.001,
5234
+ "loss": 2.6671,
5235
+ "num_input_tokens_seen": 30828130560,
5236
+ "step": 29400
5237
+ },
5238
+ {
5239
+ "epoch": 0.6468999735034603,
5240
+ "grad_norm": 0.1443469077348709,
5241
+ "learning_rate": 0.001,
5242
+ "loss": 2.6667,
5243
+ "num_input_tokens_seen": 30880559360,
5244
+ "step": 29450
5245
+ },
5246
+ {
5247
+ "epoch": 0.6479982756656054,
5248
+ "grad_norm": 0.143255814909935,
5249
+ "learning_rate": 0.001,
5250
+ "loss": 2.6652,
5251
+ "num_input_tokens_seen": 30932988160,
5252
+ "step": 29500
5253
+ },
5254
+ {
5255
+ "epoch": 0.6479982756656054,
5256
+ "eval_loss": 2.569544792175293,
5257
+ "eval_runtime": 66.8674,
5258
+ "eval_samples_per_second": 74.775,
5259
+ "eval_steps_per_second": 18.694,
5260
+ "num_input_tokens_seen": 30932988160,
5261
+ "step": 29500
5262
+ },
5263
+ {
5264
+ "epoch": 0.6490965778277505,
5265
+ "grad_norm": 0.15149758756160736,
5266
+ "learning_rate": 0.001,
5267
+ "loss": 2.6681,
5268
+ "num_input_tokens_seen": 30985416960,
5269
+ "step": 29550
5270
+ },
5271
+ {
5272
+ "epoch": 0.6501948799898957,
5273
+ "grad_norm": 0.15703468024730682,
5274
+ "learning_rate": 0.001,
5275
+ "loss": 2.6681,
5276
+ "num_input_tokens_seen": 31037845760,
5277
+ "step": 29600
5278
+ },
5279
+ {
5280
+ "epoch": 0.6512931821520407,
5281
+ "grad_norm": 0.14332515001296997,
5282
+ "learning_rate": 0.001,
5283
+ "loss": 2.6622,
5284
+ "num_input_tokens_seen": 31090274560,
5285
+ "step": 29650
5286
+ },
5287
+ {
5288
+ "epoch": 0.6523914843141858,
5289
+ "grad_norm": 0.13763870298862457,
5290
+ "learning_rate": 0.001,
5291
+ "loss": 2.6724,
5292
+ "num_input_tokens_seen": 31142703360,
5293
+ "step": 29700
5294
+ },
5295
+ {
5296
+ "epoch": 0.6534897864763309,
5297
+ "grad_norm": 0.11858976632356644,
5298
+ "learning_rate": 0.001,
5299
+ "loss": 2.6743,
5300
+ "num_input_tokens_seen": 31195132160,
5301
+ "step": 29750
5302
+ },
5303
+ {
5304
+ "epoch": 0.654588088638476,
5305
+ "grad_norm": 0.15627937018871307,
5306
+ "learning_rate": 0.001,
5307
+ "loss": 2.6653,
5308
+ "num_input_tokens_seen": 31247560960,
5309
+ "step": 29800
5310
+ },
5311
+ {
5312
+ "epoch": 0.6556863908006211,
5313
+ "grad_norm": 0.15052759647369385,
5314
+ "learning_rate": 0.001,
5315
+ "loss": 2.6684,
5316
+ "num_input_tokens_seen": 31299989760,
5317
+ "step": 29850
5318
+ },
5319
+ {
5320
+ "epoch": 0.6567846929627662,
5321
+ "grad_norm": 0.1648450791835785,
5322
+ "learning_rate": 0.001,
5323
+ "loss": 2.6783,
5324
+ "num_input_tokens_seen": 31352418560,
5325
+ "step": 29900
5326
+ },
5327
+ {
5328
+ "epoch": 0.6578829951249113,
5329
+ "grad_norm": 0.13318586349487305,
5330
+ "learning_rate": 0.001,
5331
+ "loss": 2.6712,
5332
+ "num_input_tokens_seen": 31404847360,
5333
+ "step": 29950
5334
+ },
5335
+ {
5336
+ "epoch": 0.6589812972870563,
5337
+ "grad_norm": 0.1517287641763687,
5338
+ "learning_rate": 0.001,
5339
+ "loss": 2.6688,
5340
+ "num_input_tokens_seen": 31457276160,
5341
+ "step": 30000
5342
+ },
5343
+ {
5344
+ "epoch": 0.6589812972870563,
5345
+ "eval_loss": 2.5676708221435547,
5346
+ "eval_runtime": 66.0876,
5347
+ "eval_samples_per_second": 75.657,
5348
+ "eval_steps_per_second": 18.914,
5349
+ "num_input_tokens_seen": 31457276160,
5350
+ "step": 30000
5351
  }
5352
  ],
5353
  "logging_steps": 50,
5354
  "max_steps": 200000,
5355
+ "num_input_tokens_seen": 31457276160,
5356
  "num_train_epochs": 5,
5357
  "save_steps": 1000,
5358
  "stateful_callbacks": {
 
5367
  "attributes": {}
5368
  }
5369
  },
5370
+ "total_flos": 1.79151492920397e+19,
5371
  "train_batch_size": 64,
5372
  "trial_name": null,
5373
  "trial_params": null