Azrail committed on
Commit
b93b7cf
·
verified ·
1 Parent(s): fda6f10

Training in progress, step 37000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:efae6f25b472bb0e65dad8b999f6e73025004f81855c74ec54fc8ecdd3a25a3d
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ce938a644f0cf4d10d231b631256c1bcbd8d98d79787b20ca3ed148b88756be
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bcd1e62e2d3104319cb00e159562f9ab40349a35045ca52ca467e6336a9d4925
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64656c8de22e45c2941d2ea854ec0d370243cfeea2920fb181966f363dd14777
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3105f55ffa4117a580fe7ec380b19db2b68da0c57679e9557361f205c3d7ca03
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0451e520bbe84b70e4cd2907956e95cd6d56464539f21e68e26c043e5cf63b1e
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5a2a9a97378c1b7631d78a28de277749231ad65f077c045df73323c2c2b85da
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90815e584013ee668de6d5b656c515902fbacbb32f54a71d2d1d29e05110019f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7907775567444676,
6
  "eval_steps": 500,
7
- "global_step": 36000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6416,11 +6416,189 @@
6416
  "eval_steps_per_second": 18.606,
6417
  "num_input_tokens_seen": 37748732160,
6418
  "step": 36000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6419
  }
6420
  ],
6421
  "logging_steps": 50,
6422
  "max_steps": 200000,
6423
- "num_input_tokens_seen": 37748732160,
6424
  "num_train_epochs": 5,
6425
  "save_steps": 1000,
6426
  "stateful_callbacks": {
@@ -6435,7 +6613,7 @@
6435
  "attributes": {}
6436
  }
6437
  },
6438
- "total_flos": 2.149817958782927e+19,
6439
  "train_batch_size": 64,
6440
  "trial_name": null,
6441
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.8127435999873696,
6
  "eval_steps": 500,
7
+ "global_step": 37000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6416
  "eval_steps_per_second": 18.606,
6417
  "num_input_tokens_seen": 37748732160,
6418
  "step": 36000
6419
+ },
6420
+ {
6421
+ "epoch": 0.7918758589066127,
6422
+ "grad_norm": 0.16054154932498932,
6423
+ "learning_rate": 0.001,
6424
+ "loss": 2.6516,
6425
+ "num_input_tokens_seen": 37801160960,
6426
+ "step": 36050
6427
+ },
6428
+ {
6429
+ "epoch": 0.7929741610687578,
6430
+ "grad_norm": 0.15180550515651703,
6431
+ "learning_rate": 0.001,
6432
+ "loss": 2.6508,
6433
+ "num_input_tokens_seen": 37853589760,
6434
+ "step": 36100
6435
+ },
6436
+ {
6437
+ "epoch": 0.794072463230903,
6438
+ "grad_norm": 0.19564937055110931,
6439
+ "learning_rate": 0.001,
6440
+ "loss": 2.6532,
6441
+ "num_input_tokens_seen": 37906018560,
6442
+ "step": 36150
6443
+ },
6444
+ {
6445
+ "epoch": 0.795170765393048,
6446
+ "grad_norm": 0.15047501027584076,
6447
+ "learning_rate": 0.001,
6448
+ "loss": 2.6567,
6449
+ "num_input_tokens_seen": 37958447360,
6450
+ "step": 36200
6451
+ },
6452
+ {
6453
+ "epoch": 0.7962690675551931,
6454
+ "grad_norm": 0.1420314759016037,
6455
+ "learning_rate": 0.001,
6456
+ "loss": 2.6511,
6457
+ "num_input_tokens_seen": 38010876160,
6458
+ "step": 36250
6459
+ },
6460
+ {
6461
+ "epoch": 0.7973673697173382,
6462
+ "grad_norm": 0.14328153431415558,
6463
+ "learning_rate": 0.001,
6464
+ "loss": 2.6601,
6465
+ "num_input_tokens_seen": 38063304960,
6466
+ "step": 36300
6467
+ },
6468
+ {
6469
+ "epoch": 0.7984656718794833,
6470
+ "grad_norm": 0.15527622401714325,
6471
+ "learning_rate": 0.001,
6472
+ "loss": 2.6598,
6473
+ "num_input_tokens_seen": 38115733760,
6474
+ "step": 36350
6475
+ },
6476
+ {
6477
+ "epoch": 0.7995639740416284,
6478
+ "grad_norm": 0.15956974029541016,
6479
+ "learning_rate": 0.001,
6480
+ "loss": 2.6522,
6481
+ "num_input_tokens_seen": 38168162560,
6482
+ "step": 36400
6483
+ },
6484
+ {
6485
+ "epoch": 0.8006622762037735,
6486
+ "grad_norm": 0.15193034708499908,
6487
+ "learning_rate": 0.001,
6488
+ "loss": 2.6561,
6489
+ "num_input_tokens_seen": 38220591360,
6490
+ "step": 36450
6491
+ },
6492
+ {
6493
+ "epoch": 0.8017605783659186,
6494
+ "grad_norm": 0.1692439615726471,
6495
+ "learning_rate": 0.001,
6496
+ "loss": 2.653,
6497
+ "num_input_tokens_seen": 38273020160,
6498
+ "step": 36500
6499
+ },
6500
+ {
6501
+ "epoch": 0.8017605783659186,
6502
+ "eval_loss": 2.553743362426758,
6503
+ "eval_runtime": 66.3488,
6504
+ "eval_samples_per_second": 75.359,
6505
+ "eval_steps_per_second": 18.84,
6506
+ "num_input_tokens_seen": 38273020160,
6507
+ "step": 36500
6508
+ },
6509
+ {
6510
+ "epoch": 0.8028588805280636,
6511
+ "grad_norm": 0.473707377910614,
6512
+ "learning_rate": 0.001,
6513
+ "loss": 2.6604,
6514
+ "num_input_tokens_seen": 38325448960,
6515
+ "step": 36550
6516
+ },
6517
+ {
6518
+ "epoch": 0.8039571826902088,
6519
+ "grad_norm": 0.16226574778556824,
6520
+ "learning_rate": 0.001,
6521
+ "loss": 2.6615,
6522
+ "num_input_tokens_seen": 38377877760,
6523
+ "step": 36600
6524
+ },
6525
+ {
6526
+ "epoch": 0.8050554848523539,
6527
+ "grad_norm": 0.17274035513401031,
6528
+ "learning_rate": 0.001,
6529
+ "loss": 2.6616,
6530
+ "num_input_tokens_seen": 38430306560,
6531
+ "step": 36650
6532
+ },
6533
+ {
6534
+ "epoch": 0.8061537870144989,
6535
+ "grad_norm": 0.14171990752220154,
6536
+ "learning_rate": 0.001,
6537
+ "loss": 2.6628,
6538
+ "num_input_tokens_seen": 38482735360,
6539
+ "step": 36700
6540
+ },
6541
+ {
6542
+ "epoch": 0.807252089176644,
6543
+ "grad_norm": 0.3828020989894867,
6544
+ "learning_rate": 0.001,
6545
+ "loss": 2.6717,
6546
+ "num_input_tokens_seen": 38535164160,
6547
+ "step": 36750
6548
+ },
6549
+ {
6550
+ "epoch": 0.8083503913387892,
6551
+ "grad_norm": 0.20836575329303741,
6552
+ "learning_rate": 0.001,
6553
+ "loss": 2.685,
6554
+ "num_input_tokens_seen": 38587592960,
6555
+ "step": 36800
6556
+ },
6557
+ {
6558
+ "epoch": 0.8094486935009343,
6559
+ "grad_norm": 0.14613227546215057,
6560
+ "learning_rate": 0.001,
6561
+ "loss": 2.6687,
6562
+ "num_input_tokens_seen": 38640021760,
6563
+ "step": 36850
6564
+ },
6565
+ {
6566
+ "epoch": 0.8105469956630793,
6567
+ "grad_norm": 0.16505028307437897,
6568
+ "learning_rate": 0.001,
6569
+ "loss": 2.6654,
6570
+ "num_input_tokens_seen": 38692450560,
6571
+ "step": 36900
6572
+ },
6573
+ {
6574
+ "epoch": 0.8116452978252244,
6575
+ "grad_norm": 0.15305323898792267,
6576
+ "learning_rate": 0.001,
6577
+ "loss": 2.6612,
6578
+ "num_input_tokens_seen": 38744879360,
6579
+ "step": 36950
6580
+ },
6581
+ {
6582
+ "epoch": 0.8127435999873696,
6583
+ "grad_norm": 0.2416296899318695,
6584
+ "learning_rate": 0.001,
6585
+ "loss": 2.6614,
6586
+ "num_input_tokens_seen": 38797308160,
6587
+ "step": 37000
6588
+ },
6589
+ {
6590
+ "epoch": 0.8127435999873696,
6591
+ "eval_loss": 2.5642571449279785,
6592
+ "eval_runtime": 66.5631,
6593
+ "eval_samples_per_second": 75.117,
6594
+ "eval_steps_per_second": 18.779,
6595
+ "num_input_tokens_seen": 38797308160,
6596
+ "step": 37000
6597
  }
6598
  ],
6599
  "logging_steps": 50,
6600
  "max_steps": 200000,
6601
+ "num_input_tokens_seen": 38797308160,
6602
  "num_train_epochs": 5,
6603
  "save_steps": 1000,
6604
  "stateful_callbacks": {
 
6613
  "attributes": {}
6614
  }
6615
  },
6616
+ "total_flos": 2.2095351303794196e+19,
6617
  "train_batch_size": 64,
6618
  "trial_name": null,
6619
  "trial_params": null