Azrail commited on
Commit
3b48c2d
·
verified ·
1 Parent(s): e2407ba

Training in progress, step 47000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa89571eb3340eba1a67ab65cc95a52de52c688ab135a582ba9671de6b4b9b2b
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f4e08ed2a6d62d28d840192a090317a05ca939879ecf26aa2b319d9c763f735
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee8593e17fbb590b6be9983a2252f2eb629b591782e538eabf2da48b5e3443f7
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b2e15feb0f7f3fe2709a8b7d31a3a5c543a260dee03048851f465de58a0a6ac
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3fdfef9e83b1fd0865026b3e547285feb0ce1b439ee58282cde4fbaa3e21a682
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef6d6c68b31cc97d3a7886b7338b6c21c45d7ba1c6c1b89db7e0a3456d53ecda
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c85b77405559b6f9d3b974ee441baee89ea00505d86e9a6015f23da9cbeb2cb5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30a691323d967d54c1c0f6fb771a9863c3def8ea94c66492bb5dbdffa3e83798
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.21942115744660554,
6
  "eval_steps": 500,
7
- "global_step": 46000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8196,11 +8196,189 @@
8196
  "eval_steps_per_second": 24.661,
8197
  "num_input_tokens_seen": 12058619456,
8198
  "step": 46000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8199
  }
8200
  ],
8201
  "logging_steps": 50,
8202
  "max_steps": 70000,
8203
- "num_input_tokens_seen": 12058619456,
8204
  "num_train_epochs": 1,
8205
  "save_steps": 1000,
8206
  "stateful_callbacks": {
@@ -8215,7 +8393,7 @@
8215
  "attributes": {}
8216
  }
8217
  },
8218
- "total_flos": 3.2257983966058906e+18,
8219
  "train_batch_size": 64,
8220
  "trial_name": null,
8221
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.22419118260848825,
6
  "eval_steps": 500,
7
+ "global_step": 47000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8196
  "eval_steps_per_second": 24.661,
8197
  "num_input_tokens_seen": 12058619456,
8198
  "step": 46000
8199
+ },
8200
+ {
8201
+ "epoch": 0.21965965870469967,
8202
+ "grad_norm": 0.19492709636688232,
8203
+ "learning_rate": 0.001,
8204
+ "loss": 2.6196,
8205
+ "num_input_tokens_seen": 12071726656,
8206
+ "step": 46050
8207
+ },
8208
+ {
8209
+ "epoch": 0.2198981599627938,
8210
+ "grad_norm": 0.19643568992614746,
8211
+ "learning_rate": 0.001,
8212
+ "loss": 2.6108,
8213
+ "num_input_tokens_seen": 12084833856,
8214
+ "step": 46100
8215
+ },
8216
+ {
8217
+ "epoch": 0.22013666122088793,
8218
+ "grad_norm": 0.18720099329948425,
8219
+ "learning_rate": 0.001,
8220
+ "loss": 2.6181,
8221
+ "num_input_tokens_seen": 12097941056,
8222
+ "step": 46150
8223
+ },
8224
+ {
8225
+ "epoch": 0.2203751624789821,
8226
+ "grad_norm": 0.1929876208305359,
8227
+ "learning_rate": 0.001,
8228
+ "loss": 2.6152,
8229
+ "num_input_tokens_seen": 12111048256,
8230
+ "step": 46200
8231
+ },
8232
+ {
8233
+ "epoch": 0.22061366373707622,
8234
+ "grad_norm": 0.19732603430747986,
8235
+ "learning_rate": 0.001,
8236
+ "loss": 2.6267,
8237
+ "num_input_tokens_seen": 12124155456,
8238
+ "step": 46250
8239
+ },
8240
+ {
8241
+ "epoch": 0.22085216499517035,
8242
+ "grad_norm": 0.1964132934808731,
8243
+ "learning_rate": 0.001,
8244
+ "loss": 2.605,
8245
+ "num_input_tokens_seen": 12137262656,
8246
+ "step": 46300
8247
+ },
8248
+ {
8249
+ "epoch": 0.22109066625326448,
8250
+ "grad_norm": 0.1927288919687271,
8251
+ "learning_rate": 0.001,
8252
+ "loss": 2.6178,
8253
+ "num_input_tokens_seen": 12150369856,
8254
+ "step": 46350
8255
+ },
8256
+ {
8257
+ "epoch": 0.22132916751135862,
8258
+ "grad_norm": 0.17873398959636688,
8259
+ "learning_rate": 0.001,
8260
+ "loss": 2.6033,
8261
+ "num_input_tokens_seen": 12163477056,
8262
+ "step": 46400
8263
+ },
8264
+ {
8265
+ "epoch": 0.22156766876945275,
8266
+ "grad_norm": 0.24716190993785858,
8267
+ "learning_rate": 0.001,
8268
+ "loss": 2.6141,
8269
+ "num_input_tokens_seen": 12176584256,
8270
+ "step": 46450
8271
+ },
8272
+ {
8273
+ "epoch": 0.2218061700275469,
8274
+ "grad_norm": 0.2021339386701584,
8275
+ "learning_rate": 0.001,
8276
+ "loss": 2.6259,
8277
+ "num_input_tokens_seen": 12189691456,
8278
+ "step": 46500
8279
+ },
8280
+ {
8281
+ "epoch": 0.2218061700275469,
8282
+ "eval_loss": 2.4975087642669678,
8283
+ "eval_runtime": 50.8921,
8284
+ "eval_samples_per_second": 98.247,
8285
+ "eval_steps_per_second": 24.562,
8286
+ "num_input_tokens_seen": 12189691456,
8287
+ "step": 46500
8288
+ },
8289
+ {
8290
+ "epoch": 0.22204467128564104,
8291
+ "grad_norm": 0.20796166360378265,
8292
+ "learning_rate": 0.001,
8293
+ "loss": 2.6211,
8294
+ "num_input_tokens_seen": 12202798656,
8295
+ "step": 46550
8296
+ },
8297
+ {
8298
+ "epoch": 0.22228317254373517,
8299
+ "grad_norm": 0.20472556352615356,
8300
+ "learning_rate": 0.001,
8301
+ "loss": 2.6123,
8302
+ "num_input_tokens_seen": 12215905856,
8303
+ "step": 46600
8304
+ },
8305
+ {
8306
+ "epoch": 0.2225216738018293,
8307
+ "grad_norm": 0.20017485320568085,
8308
+ "learning_rate": 0.001,
8309
+ "loss": 2.6037,
8310
+ "num_input_tokens_seen": 12229013056,
8311
+ "step": 46650
8312
+ },
8313
+ {
8314
+ "epoch": 0.22276017505992343,
8315
+ "grad_norm": 0.2037762850522995,
8316
+ "learning_rate": 0.001,
8317
+ "loss": 2.6155,
8318
+ "num_input_tokens_seen": 12242120256,
8319
+ "step": 46700
8320
+ },
8321
+ {
8322
+ "epoch": 0.2229986763180176,
8323
+ "grad_norm": 0.19346804916858673,
8324
+ "learning_rate": 0.001,
8325
+ "loss": 2.601,
8326
+ "num_input_tokens_seen": 12255227456,
8327
+ "step": 46750
8328
+ },
8329
+ {
8330
+ "epoch": 0.22323717757611172,
8331
+ "grad_norm": 0.18640096485614777,
8332
+ "learning_rate": 0.001,
8333
+ "loss": 2.6168,
8334
+ "num_input_tokens_seen": 12268334656,
8335
+ "step": 46800
8336
+ },
8337
+ {
8338
+ "epoch": 0.22347567883420585,
8339
+ "grad_norm": 0.20295055210590363,
8340
+ "learning_rate": 0.001,
8341
+ "loss": 2.6221,
8342
+ "num_input_tokens_seen": 12281441856,
8343
+ "step": 46850
8344
+ },
8345
+ {
8346
+ "epoch": 0.22371418009229999,
8347
+ "grad_norm": 0.20705671608448029,
8348
+ "learning_rate": 0.001,
8349
+ "loss": 2.6202,
8350
+ "num_input_tokens_seen": 12294549056,
8351
+ "step": 46900
8352
+ },
8353
+ {
8354
+ "epoch": 0.22395268135039412,
8355
+ "grad_norm": 0.18724282085895538,
8356
+ "learning_rate": 0.001,
8357
+ "loss": 2.6061,
8358
+ "num_input_tokens_seen": 12307656256,
8359
+ "step": 46950
8360
+ },
8361
+ {
8362
+ "epoch": 0.22419118260848825,
8363
+ "grad_norm": 0.18210910260677338,
8364
+ "learning_rate": 0.001,
8365
+ "loss": 2.6045,
8366
+ "num_input_tokens_seen": 12320763456,
8367
+ "step": 47000
8368
+ },
8369
+ {
8370
+ "epoch": 0.22419118260848825,
8371
+ "eval_loss": 2.497344493865967,
8372
+ "eval_runtime": 51.17,
8373
+ "eval_samples_per_second": 97.713,
8374
+ "eval_steps_per_second": 24.428,
8375
+ "num_input_tokens_seen": 12320763456,
8376
+ "step": 47000
8377
  }
8378
  ],
8379
  "logging_steps": 50,
8380
  "max_steps": 70000,
8381
+ "num_input_tokens_seen": 12320763456,
8382
  "num_train_epochs": 1,
8383
  "save_steps": 1000,
8384
  "stateful_callbacks": {
 
8393
  "attributes": {}
8394
  }
8395
  },
8396
+ "total_flos": 3.2959244751313306e+18,
8397
  "train_batch_size": 64,
8398
  "trial_name": null,
8399
  "trial_params": null