Azrail commited on
Commit
7117390
·
verified ·
1 Parent(s): 4989521

Training in progress, step 30000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8189eaf6ceb4528bf3c0dab262ec5edb992db88ed01c3b7194410d43f95eccb4
3
  size 563074920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc13ffa23a1f5210f44d10669aa87f3ec7bfb7a2664786f76ce56132b042639e
3
  size 563074920
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e394ade7b3b72772b5bf25ed828cc65b1ad4f5ea3415308e4d9dfe69c0f2dcc1
3
  size 1125916346
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c2acc68a1693942d243837338503be83794d69c0b95c32e490c2e11f4c4406e
3
  size 1125916346
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b35f118514fc0f516065715cf3b60710d0e7202cdecbb2634a8a2ee950df7fdc
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5f9d2ea250bcd3507c62c8571a114db63d14fdd2d31f9df1da7534fe6e55434
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e9da9c4f38246d3405fbdd73a148fe885c434414b1445bf2d8246988599b4525
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:492390519daa872425f50793597ce5e74ef972fc3d656ffa5ca614e3b949a837
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.19506846735041528,
6
  "eval_steps": 500,
7
- "global_step": 29000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5170,11 +5170,189 @@
5170
  "eval_steps_per_second": 8.742,
5171
  "num_input_tokens_seen": 7602176000,
5172
  "step": 29000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5173
  }
5174
  ],
5175
  "logging_steps": 50,
5176
  "max_steps": 30000,
5177
- "num_input_tokens_seen": 7602176000,
5178
  "num_train_epochs": 1,
5179
  "save_steps": 1000,
5180
  "stateful_callbacks": {
@@ -5184,12 +5362,12 @@
5184
  "should_evaluate": false,
5185
  "should_log": false,
5186
  "should_save": true,
5187
- "should_training_stop": false
5188
  },
5189
  "attributes": {}
5190
  }
5191
  },
5192
- "total_flos": 4.84423791280128e+18,
5193
  "train_batch_size": 64,
5194
  "trial_name": null,
5195
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.20179496622456752,
6
  "eval_steps": 500,
7
+ "global_step": 30000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5170
  "eval_steps_per_second": 8.742,
5171
  "num_input_tokens_seen": 7602176000,
5172
  "step": 29000
5173
+ },
5174
+ {
5175
+ "epoch": 0.1954047922941229,
5176
+ "grad_norm": 0.6069262027740479,
5177
+ "learning_rate": 1.5444383180638342e-06,
5178
+ "loss": 11.9314,
5179
+ "num_input_tokens_seen": 7615283200,
5180
+ "step": 29050
5181
+ },
5182
+ {
5183
+ "epoch": 0.1957411172378305,
5184
+ "grad_norm": 0.628108561038971,
5185
+ "learning_rate": 1.3862661152405309e-06,
5186
+ "loss": 11.9151,
5187
+ "num_input_tokens_seen": 7628390400,
5188
+ "step": 29100
5189
+ },
5190
+ {
5191
+ "epoch": 0.1960774421815381,
5192
+ "grad_norm": 0.6232333779335022,
5193
+ "learning_rate": 1.236618116485233e-06,
5194
+ "loss": 11.8887,
5195
+ "num_input_tokens_seen": 7641497600,
5196
+ "step": 29150
5197
+ },
5198
+ {
5199
+ "epoch": 0.19641376712524572,
5200
+ "grad_norm": 0.6372972726821899,
5201
+ "learning_rate": 1.0954985936379223e-06,
5202
+ "loss": 11.8873,
5203
+ "num_input_tokens_seen": 7654604800,
5204
+ "step": 29200
5205
+ },
5206
+ {
5207
+ "epoch": 0.19675009206895333,
5208
+ "grad_norm": 0.5991822481155396,
5209
+ "learning_rate": 9.6291157508529e-07,
5210
+ "loss": 11.9405,
5211
+ "num_input_tokens_seen": 7667712000,
5212
+ "step": 29250
5213
+ },
5214
+ {
5215
+ "epoch": 0.19708641701266094,
5216
+ "grad_norm": 0.6108511686325073,
5217
+ "learning_rate": 8.388608456459612e-07,
5218
+ "loss": 11.9085,
5219
+ "num_input_tokens_seen": 7680819200,
5220
+ "step": 29300
5221
+ },
5222
+ {
5223
+ "epoch": 0.19742274195636855,
5224
+ "grad_norm": 0.6104913949966431,
5225
+ "learning_rate": 7.23349946462215e-07,
5226
+ "loss": 11.8859,
5227
+ "num_input_tokens_seen": 7693926400,
5228
+ "step": 29350
5229
+ },
5230
+ {
5231
+ "epoch": 0.1977590669000762,
5232
+ "grad_norm": 0.6084222197532654,
5233
+ "learning_rate": 6.163821748990994e-07,
5234
+ "loss": 11.9059,
5235
+ "num_input_tokens_seen": 7707033600,
5236
+ "step": 29400
5237
+ },
5238
+ {
5239
+ "epoch": 0.1980953918437838,
5240
+ "grad_norm": 0.633105993270874,
5241
+ "learning_rate": 5.179605844501388e-07,
5242
+ "loss": 11.9174,
5243
+ "num_input_tokens_seen": 7720140800,
5244
+ "step": 29450
5245
+ },
5246
+ {
5247
+ "epoch": 0.1984317167874914,
5248
+ "grad_norm": 0.6088514924049377,
5249
+ "learning_rate": 4.280879846503049e-07,
5250
+ "loss": 11.9125,
5251
+ "num_input_tokens_seen": 7733248000,
5252
+ "step": 29500
5253
+ },
5254
+ {
5255
+ "epoch": 0.1984317167874914,
5256
+ "eval_loss": 2.8849411010742188,
5257
+ "eval_runtime": 143.8146,
5258
+ "eval_samples_per_second": 34.767,
5259
+ "eval_steps_per_second": 8.692,
5260
+ "num_input_tokens_seen": 7733248000,
5261
+ "step": 29500
5262
+ },
5263
+ {
5264
+ "epoch": 0.19876804173119902,
5265
+ "grad_norm": 0.6054402589797974,
5266
+ "learning_rate": 3.467669409957463e-07,
5267
+ "loss": 11.9468,
5268
+ "num_input_tokens_seen": 7746355200,
5269
+ "step": 29550
5270
+ },
5271
+ {
5272
+ "epoch": 0.19910436667490664,
5273
+ "grad_norm": 0.6133595705032349,
5274
+ "learning_rate": 2.7399977487051473e-07,
5275
+ "loss": 11.9368,
5276
+ "num_input_tokens_seen": 7759462400,
5277
+ "step": 29600
5278
+ },
5279
+ {
5280
+ "epoch": 0.19944069161861425,
5281
+ "grad_norm": 0.6098650693893433,
5282
+ "learning_rate": 2.097885634804175e-07,
5283
+ "loss": 11.8971,
5284
+ "num_input_tokens_seen": 7772569600,
5285
+ "step": 29650
5286
+ },
5287
+ {
5288
+ "epoch": 0.19977701656232186,
5289
+ "grad_norm": 0.6231054663658142,
5290
+ "learning_rate": 1.541351397936319e-07,
5291
+ "loss": 11.9546,
5292
+ "num_input_tokens_seen": 7785676800,
5293
+ "step": 29700
5294
+ },
5295
+ {
5296
+ "epoch": 0.20011334150602947,
5297
+ "grad_norm": 0.6323234438896179,
5298
+ "learning_rate": 1.0704109248838022e-07,
5299
+ "loss": 11.8848,
5300
+ "num_input_tokens_seen": 7798784000,
5301
+ "step": 29750
5302
+ },
5303
+ {
5304
+ "epoch": 0.20044966644973708,
5305
+ "grad_norm": 0.6294256448745728,
5306
+ "learning_rate": 6.850776590763274e-08,
5307
+ "loss": 11.9027,
5308
+ "num_input_tokens_seen": 7811891200,
5309
+ "step": 29800
5310
+ },
5311
+ {
5312
+ "epoch": 0.2007859913934447,
5313
+ "grad_norm": 0.6184135675430298,
5314
+ "learning_rate": 3.853626002063848e-08,
5315
+ "loss": 11.9454,
5316
+ "num_input_tokens_seen": 7824998400,
5317
+ "step": 29850
5318
+ },
5319
+ {
5320
+ "epoch": 0.2011223163371523,
5321
+ "grad_norm": 0.6376939415931702,
5322
+ "learning_rate": 1.7127430391683516e-08,
5323
+ "loss": 11.8928,
5324
+ "num_input_tokens_seen": 7838105600,
5325
+ "step": 29900
5326
+ },
5327
+ {
5328
+ "epoch": 0.2014586412808599,
5329
+ "grad_norm": 0.6745944619178772,
5330
+ "learning_rate": 4.281888155543978e-09,
5331
+ "loss": 11.9315,
5332
+ "num_input_tokens_seen": 7851212800,
5333
+ "step": 29950
5334
+ },
5335
+ {
5336
+ "epoch": 0.20179496622456752,
5337
+ "grad_norm": 0.6381050944328308,
5338
+ "learning_rate": 0.0,
5339
+ "loss": 11.9242,
5340
+ "num_input_tokens_seen": 7864320000,
5341
+ "step": 30000
5342
+ },
5343
+ {
5344
+ "epoch": 0.20179496622456752,
5345
+ "eval_loss": 2.8848958015441895,
5346
+ "eval_runtime": 142.697,
5347
+ "eval_samples_per_second": 35.039,
5348
+ "eval_steps_per_second": 8.76,
5349
+ "num_input_tokens_seen": 7864320000,
5350
+ "step": 30000
5351
  }
5352
  ],
5353
  "logging_steps": 50,
5354
  "max_steps": 30000,
5355
+ "num_input_tokens_seen": 7864320000,
5356
  "num_train_epochs": 1,
5357
  "save_steps": 1000,
5358
  "stateful_callbacks": {
 
5362
  "should_evaluate": false,
5363
  "should_log": false,
5364
  "should_save": true,
5365
+ "should_training_stop": true
5366
  },
5367
  "attributes": {}
5368
  }
5369
  },
5370
+ "total_flos": 5.0112805994496e+18,
5371
  "train_batch_size": 64,
5372
  "trial_name": null,
5373
  "trial_params": null