Azrail commited on
Commit
c443e10
·
verified ·
1 Parent(s): 55af302

Training in progress, step 36000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39e47431790297c8d1ac0d590138e540ff35b008c08f15b4fec92555b68b3ca0
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efae6f25b472bb0e65dad8b999f6e73025004f81855c74ec54fc8ecdd3a25a3d
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cb4360f6e3ef0a4db7ef43d5c8060cb784d63688538fb77fe4f179313685acd
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcd1e62e2d3104319cb00e159562f9ab40349a35045ca52ca467e6336a9d4925
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3505914cea5cefe31834749326fbe845962aa02c10480cbc9f90524db4d28f1f
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3105f55ffa4117a580fe7ec380b19db2b68da0c57679e9557361f205c3d7ca03
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c432826b41d4d9850a94ad79c80845280b64911bf27c831beef66a783066385f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5a2a9a97378c1b7631d78a28de277749231ad65f077c045df73323c2c2b85da
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7688115135015657,
6
  "eval_steps": 500,
7
- "global_step": 35000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6238,11 +6238,189 @@
6238
  "eval_steps_per_second": 18.399,
6239
  "num_input_tokens_seen": 36700156160,
6240
  "step": 35000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6241
  }
6242
  ],
6243
  "logging_steps": 50,
6244
  "max_steps": 200000,
6245
- "num_input_tokens_seen": 36700156160,
6246
  "num_train_epochs": 5,
6247
  "save_steps": 1000,
6248
  "stateful_callbacks": {
@@ -6257,7 +6435,7 @@
6257
  "attributes": {}
6258
  }
6259
  },
6260
- "total_flos": 2.090100787186434e+19,
6261
  "train_batch_size": 64,
6262
  "trial_name": null,
6263
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.7907775567444676,
6
  "eval_steps": 500,
7
+ "global_step": 36000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6238
  "eval_steps_per_second": 18.399,
6239
  "num_input_tokens_seen": 36700156160,
6240
  "step": 35000
6241
+ },
6242
+ {
6243
+ "epoch": 0.7699098156637109,
6244
+ "grad_norm": 0.14624406397342682,
6245
+ "learning_rate": 0.001,
6246
+ "loss": 2.657,
6247
+ "num_input_tokens_seen": 36752584960,
6248
+ "step": 35050
6249
+ },
6250
+ {
6251
+ "epoch": 0.771008117825856,
6252
+ "grad_norm": 0.16855786740779877,
6253
+ "learning_rate": 0.001,
6254
+ "loss": 2.6585,
6255
+ "num_input_tokens_seen": 36805013760,
6256
+ "step": 35100
6257
+ },
6258
+ {
6259
+ "epoch": 0.772106419988001,
6260
+ "grad_norm": 0.1439932882785797,
6261
+ "learning_rate": 0.001,
6262
+ "loss": 2.6653,
6263
+ "num_input_tokens_seen": 36857442560,
6264
+ "step": 35150
6265
+ },
6266
+ {
6267
+ "epoch": 0.7732047221501461,
6268
+ "grad_norm": 0.16299331188201904,
6269
+ "learning_rate": 0.001,
6270
+ "loss": 2.6621,
6271
+ "num_input_tokens_seen": 36909871360,
6272
+ "step": 35200
6273
+ },
6274
+ {
6275
+ "epoch": 0.7743030243122913,
6276
+ "grad_norm": 0.16961826384067535,
6277
+ "learning_rate": 0.001,
6278
+ "loss": 2.6545,
6279
+ "num_input_tokens_seen": 36962300160,
6280
+ "step": 35250
6281
+ },
6282
+ {
6283
+ "epoch": 0.7754013264744364,
6284
+ "grad_norm": 0.13337954878807068,
6285
+ "learning_rate": 0.001,
6286
+ "loss": 2.652,
6287
+ "num_input_tokens_seen": 37014728960,
6288
+ "step": 35300
6289
+ },
6290
+ {
6291
+ "epoch": 0.7764996286365814,
6292
+ "grad_norm": 0.1728074699640274,
6293
+ "learning_rate": 0.001,
6294
+ "loss": 2.6631,
6295
+ "num_input_tokens_seen": 37067157760,
6296
+ "step": 35350
6297
+ },
6298
+ {
6299
+ "epoch": 0.7775979307987265,
6300
+ "grad_norm": 0.16615192592144012,
6301
+ "learning_rate": 0.001,
6302
+ "loss": 2.6551,
6303
+ "num_input_tokens_seen": 37119586560,
6304
+ "step": 35400
6305
+ },
6306
+ {
6307
+ "epoch": 0.7786962329608716,
6308
+ "grad_norm": 0.1515650749206543,
6309
+ "learning_rate": 0.001,
6310
+ "loss": 2.6529,
6311
+ "num_input_tokens_seen": 37172015360,
6312
+ "step": 35450
6313
+ },
6314
+ {
6315
+ "epoch": 0.7797945351230167,
6316
+ "grad_norm": 0.1534053236246109,
6317
+ "learning_rate": 0.001,
6318
+ "loss": 2.6567,
6319
+ "num_input_tokens_seen": 37224444160,
6320
+ "step": 35500
6321
+ },
6322
+ {
6323
+ "epoch": 0.7797945351230167,
6324
+ "eval_loss": 2.55454683303833,
6325
+ "eval_runtime": 67.0727,
6326
+ "eval_samples_per_second": 74.546,
6327
+ "eval_steps_per_second": 18.637,
6328
+ "num_input_tokens_seen": 37224444160,
6329
+ "step": 35500
6330
+ },
6331
+ {
6332
+ "epoch": 0.7808928372851618,
6333
+ "grad_norm": 0.16377541422843933,
6334
+ "learning_rate": 0.001,
6335
+ "loss": 2.6552,
6336
+ "num_input_tokens_seen": 37276872960,
6337
+ "step": 35550
6338
+ },
6339
+ {
6340
+ "epoch": 0.7819911394473069,
6341
+ "grad_norm": 0.14807477593421936,
6342
+ "learning_rate": 0.001,
6343
+ "loss": 2.6563,
6344
+ "num_input_tokens_seen": 37329301760,
6345
+ "step": 35600
6346
+ },
6347
+ {
6348
+ "epoch": 0.783089441609452,
6349
+ "grad_norm": 0.13599660992622375,
6350
+ "learning_rate": 0.001,
6351
+ "loss": 2.6575,
6352
+ "num_input_tokens_seen": 37381730560,
6353
+ "step": 35650
6354
+ },
6355
+ {
6356
+ "epoch": 0.7841877437715971,
6357
+ "grad_norm": 0.16653482615947723,
6358
+ "learning_rate": 0.001,
6359
+ "loss": 2.6515,
6360
+ "num_input_tokens_seen": 37434159360,
6361
+ "step": 35700
6362
+ },
6363
+ {
6364
+ "epoch": 0.7852860459337422,
6365
+ "grad_norm": 0.15467293560504913,
6366
+ "learning_rate": 0.001,
6367
+ "loss": 2.6548,
6368
+ "num_input_tokens_seen": 37486588160,
6369
+ "step": 35750
6370
+ },
6371
+ {
6372
+ "epoch": 0.7863843480958873,
6373
+ "grad_norm": 0.4751467704772949,
6374
+ "learning_rate": 0.001,
6375
+ "loss": 2.6592,
6376
+ "num_input_tokens_seen": 37539016960,
6377
+ "step": 35800
6378
+ },
6379
+ {
6380
+ "epoch": 0.7874826502580323,
6381
+ "grad_norm": 0.15940867364406586,
6382
+ "learning_rate": 0.001,
6383
+ "loss": 2.6624,
6384
+ "num_input_tokens_seen": 37591445760,
6385
+ "step": 35850
6386
+ },
6387
+ {
6388
+ "epoch": 0.7885809524201775,
6389
+ "grad_norm": 0.137634739279747,
6390
+ "learning_rate": 0.001,
6391
+ "loss": 2.6559,
6392
+ "num_input_tokens_seen": 37643874560,
6393
+ "step": 35900
6394
+ },
6395
+ {
6396
+ "epoch": 0.7896792545823226,
6397
+ "grad_norm": 0.16022460162639618,
6398
+ "learning_rate": 0.001,
6399
+ "loss": 2.6555,
6400
+ "num_input_tokens_seen": 37696303360,
6401
+ "step": 35950
6402
+ },
6403
+ {
6404
+ "epoch": 0.7907775567444676,
6405
+ "grad_norm": 0.147109717130661,
6406
+ "learning_rate": 0.001,
6407
+ "loss": 2.663,
6408
+ "num_input_tokens_seen": 37748732160,
6409
+ "step": 36000
6410
+ },
6411
+ {
6412
+ "epoch": 0.7907775567444676,
6413
+ "eval_loss": 2.556107521057129,
6414
+ "eval_runtime": 67.1814,
6415
+ "eval_samples_per_second": 74.425,
6416
+ "eval_steps_per_second": 18.606,
6417
+ "num_input_tokens_seen": 37748732160,
6418
+ "step": 36000
6419
  }
6420
  ],
6421
  "logging_steps": 50,
6422
  "max_steps": 200000,
6423
+ "num_input_tokens_seen": 37748732160,
6424
  "num_train_epochs": 5,
6425
  "save_steps": 1000,
6426
  "stateful_callbacks": {
 
6435
  "attributes": {}
6436
  }
6437
  },
6438
+ "total_flos": 2.149817958782927e+19,
6439
  "train_batch_size": 64,
6440
  "trial_name": null,
6441
  "trial_params": null