Azrail commited on
Commit
98c49bc
·
verified ·
1 Parent(s): 066d08c

Training in progress, step 48000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f38e32bce356957f430b89897be9ca31be9dda65402b8c04ff406323ca6d70ad
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0699f2befd5f5fe39f37d9992ad71298c6e825af92f8b9997d530b9228219782
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f528592dbd40e729e83c423405897ab615abc4bdf25221b204d2fb8c14ca347
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6975052cf62584f01dc0b92d80322e1defc71e0703e038bfa5340c5530e8e1a
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b4b5e9d9565bd28e61942fdcfd622917c991cba6a677edb92c4e5afeb24b9087
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f916f32ed5bd769a4257bf59e71aa59f0b4e6ba66e2f6069ff1d46ad7cda2db
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:30a691323d967d54c1c0f6fb771a9863c3def8ea94c66492bb5dbdffa3e83798
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de67d78be185ea67aa4ca20dcc37ca7f9d17d76246f8cfa3148b96b4fc56902c
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4483823652169765,
6
  "eval_steps": 500,
7
- "global_step": 47000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -8374,11 +8374,189 @@
8374
  "eval_steps_per_second": 15.296,
8375
  "num_input_tokens_seen": 24637513248,
8376
  "step": 47000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8377
  }
8378
  ],
8379
  "logging_steps": 50,
8380
  "max_steps": 70000,
8381
- "num_input_tokens_seen": 24637513248,
8382
  "num_train_epochs": 1,
8383
  "save_steps": 1000,
8384
  "stateful_callbacks": {
@@ -8393,7 +8571,7 @@
8393
  "attributes": {}
8394
  }
8395
  },
8396
- "total_flos": 4.3603904566543565e+19,
8397
  "train_batch_size": 32,
8398
  "trial_name": null,
8399
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.457922415540742,
6
  "eval_steps": 500,
7
+ "global_step": 48000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
8374
  "eval_steps_per_second": 15.296,
8375
  "num_input_tokens_seen": 24637513248,
8376
  "step": 47000
8377
+ },
8378
+ {
8379
+ "epoch": 0.4488593677331648,
8380
+ "grad_norm": 0.14600330591201782,
8381
+ "learning_rate": 0.001,
8382
+ "loss": 2.1773,
8383
+ "num_input_tokens_seen": 24663726880,
8384
+ "step": 47050
8385
+ },
8386
+ {
8387
+ "epoch": 0.4493363702493531,
8388
+ "grad_norm": 0.13896551728248596,
8389
+ "learning_rate": 0.001,
8390
+ "loss": 2.1699,
8391
+ "num_input_tokens_seen": 24689934976,
8392
+ "step": 47100
8393
+ },
8394
+ {
8395
+ "epoch": 0.44981337276554134,
8396
+ "grad_norm": 0.15189655125141144,
8397
+ "learning_rate": 0.001,
8398
+ "loss": 2.1747,
8399
+ "num_input_tokens_seen": 24716146208,
8400
+ "step": 47150
8401
+ },
8402
+ {
8403
+ "epoch": 0.4502903752817296,
8404
+ "grad_norm": 0.1438799947500229,
8405
+ "learning_rate": 0.001,
8406
+ "loss": 2.1754,
8407
+ "num_input_tokens_seen": 24742351360,
8408
+ "step": 47200
8409
+ },
8410
+ {
8411
+ "epoch": 0.45076737779791787,
8412
+ "grad_norm": 0.14087191224098206,
8413
+ "learning_rate": 0.001,
8414
+ "loss": 2.1659,
8415
+ "num_input_tokens_seen": 24768557056,
8416
+ "step": 47250
8417
+ },
8418
+ {
8419
+ "epoch": 0.45124438031410613,
8420
+ "grad_norm": 0.1569574773311615,
8421
+ "learning_rate": 0.001,
8422
+ "loss": 2.1765,
8423
+ "num_input_tokens_seen": 24794768736,
8424
+ "step": 47300
8425
+ },
8426
+ {
8427
+ "epoch": 0.45172138283029445,
8428
+ "grad_norm": 0.14594893157482147,
8429
+ "learning_rate": 0.001,
8430
+ "loss": 2.1867,
8431
+ "num_input_tokens_seen": 24820973728,
8432
+ "step": 47350
8433
+ },
8434
+ {
8435
+ "epoch": 0.4521983853464827,
8436
+ "grad_norm": 0.13743354380130768,
8437
+ "learning_rate": 0.001,
8438
+ "loss": 2.1671,
8439
+ "num_input_tokens_seen": 24847180800,
8440
+ "step": 47400
8441
+ },
8442
+ {
8443
+ "epoch": 0.452675387862671,
8444
+ "grad_norm": 0.14880713820457458,
8445
+ "learning_rate": 0.001,
8446
+ "loss": 2.1834,
8447
+ "num_input_tokens_seen": 24873395200,
8448
+ "step": 47450
8449
+ },
8450
+ {
8451
+ "epoch": 0.45315239037885924,
8452
+ "grad_norm": 0.13658978044986725,
8453
+ "learning_rate": 0.001,
8454
+ "loss": 2.1608,
8455
+ "num_input_tokens_seen": 24899608000,
8456
+ "step": 47500
8457
+ },
8458
+ {
8459
+ "epoch": 0.45315239037885924,
8460
+ "eval_loss": 2.0886528491973877,
8461
+ "eval_runtime": 82.7799,
8462
+ "eval_samples_per_second": 60.401,
8463
+ "eval_steps_per_second": 15.1,
8464
+ "num_input_tokens_seen": 24899608000,
8465
+ "step": 47500
8466
+ },
8467
+ {
8468
+ "epoch": 0.4536293928950475,
8469
+ "grad_norm": 0.14707359671592712,
8470
+ "learning_rate": 0.001,
8471
+ "loss": 2.172,
8472
+ "num_input_tokens_seen": 24925815680,
8473
+ "step": 47550
8474
+ },
8475
+ {
8476
+ "epoch": 0.4541063954112358,
8477
+ "grad_norm": 0.16340535879135132,
8478
+ "learning_rate": 0.001,
8479
+ "loss": 2.1721,
8480
+ "num_input_tokens_seen": 24952024960,
8481
+ "step": 47600
8482
+ },
8483
+ {
8484
+ "epoch": 0.4545833979274241,
8485
+ "grad_norm": 0.14133617281913757,
8486
+ "learning_rate": 0.001,
8487
+ "loss": 2.1682,
8488
+ "num_input_tokens_seen": 24978238080,
8489
+ "step": 47650
8490
+ },
8491
+ {
8492
+ "epoch": 0.45506040044361235,
8493
+ "grad_norm": 0.14507652819156647,
8494
+ "learning_rate": 0.001,
8495
+ "loss": 2.1717,
8496
+ "num_input_tokens_seen": 25004442496,
8497
+ "step": 47700
8498
+ },
8499
+ {
8500
+ "epoch": 0.4555374029598006,
8501
+ "grad_norm": 0.1635296642780304,
8502
+ "learning_rate": 0.001,
8503
+ "loss": 2.1722,
8504
+ "num_input_tokens_seen": 25030655840,
8505
+ "step": 47750
8506
+ },
8507
+ {
8508
+ "epoch": 0.45601440547598887,
8509
+ "grad_norm": 0.15049296617507935,
8510
+ "learning_rate": 0.001,
8511
+ "loss": 2.1647,
8512
+ "num_input_tokens_seen": 25056870240,
8513
+ "step": 47800
8514
+ },
8515
+ {
8516
+ "epoch": 0.45649140799217713,
8517
+ "grad_norm": 0.14016319811344147,
8518
+ "learning_rate": 0.001,
8519
+ "loss": 2.3042,
8520
+ "num_input_tokens_seen": 25083083712,
8521
+ "step": 47850
8522
+ },
8523
+ {
8524
+ "epoch": 0.45696841050836545,
8525
+ "grad_norm": 0.1369781345129013,
8526
+ "learning_rate": 0.001,
8527
+ "loss": 2.21,
8528
+ "num_input_tokens_seen": 25109294720,
8529
+ "step": 47900
8530
+ },
8531
+ {
8532
+ "epoch": 0.4574454130245537,
8533
+ "grad_norm": 0.13268031179904938,
8534
+ "learning_rate": 0.001,
8535
+ "loss": 2.1809,
8536
+ "num_input_tokens_seen": 25135504256,
8537
+ "step": 47950
8538
+ },
8539
+ {
8540
+ "epoch": 0.457922415540742,
8541
+ "grad_norm": 0.13591749966144562,
8542
+ "learning_rate": 0.001,
8543
+ "loss": 2.1808,
8544
+ "num_input_tokens_seen": 25161718656,
8545
+ "step": 48000
8546
+ },
8547
+ {
8548
+ "epoch": 0.457922415540742,
8549
+ "eval_loss": 2.0938363075256348,
8550
+ "eval_runtime": 81.9703,
8551
+ "eval_samples_per_second": 60.998,
8552
+ "eval_steps_per_second": 15.249,
8553
+ "num_input_tokens_seen": 25161718656,
8554
+ "step": 48000
8555
  }
8556
  ],
8557
  "logging_steps": 50,
8558
  "max_steps": 70000,
8559
+ "num_input_tokens_seen": 25161718656,
8560
  "num_train_epochs": 1,
8561
  "save_steps": 1000,
8562
  "stateful_callbacks": {
 
8571
  "attributes": {}
8572
  }
8573
  },
8574
+ "total_flos": 4.4531652523637146e+19,
8575
  "train_batch_size": 32,
8576
  "trial_name": null,
8577
  "trial_params": null