Azrail commited on
Commit
80cdde0
·
verified ·
1 Parent(s): 19a37ff

Training in progress, step 32000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99efb4f925ebae40cd6f793929b87a0ccac0e7b97e6def05084db3705337b811
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f3efdd22645edf9b27968c44325394fb2b759ab91da7c8ce83b3d5624316247
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aba48d7345e1335acdd811f72ad9602a930b00d7d91d9a11216fc53d7f15cb25
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdf80f22e7a0541c733f265b4f922632311af4702895a6489cc3c6583b1b00ec
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17ffd9dd4a600ef00ffe7371c71cf7eaaf39e90e97468b4a36b4cc557b2fc5d1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26f742dd126d572747f29fd7ba88348146ec68ecb2ae0d2effd91de53bff9d0d
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:277f21680b959b596662b48a96a00aaa486d9a86675c2da90af20e0783552321
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b45d7a8e84e284c770af40e442ab0efb2fec2b035c2481cdfd246cdf35d0dd1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6809473405299582,
6
  "eval_steps": 500,
7
- "global_step": 31000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5526,11 +5526,189 @@
5526
  "eval_steps_per_second": 19.005,
5527
  "num_input_tokens_seen": 32505852160,
5528
  "step": 31000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5529
  }
5530
  ],
5531
  "logging_steps": 50,
5532
  "max_steps": 200000,
5533
- "num_input_tokens_seen": 32505852160,
5534
  "num_train_epochs": 5,
5535
  "save_steps": 1000,
5536
  "stateful_callbacks": {
@@ -5545,7 +5723,7 @@
5545
  "attributes": {}
5546
  }
5547
  },
5548
- "total_flos": 1.851232100800463e+19,
5549
  "train_batch_size": 64,
5550
  "trial_name": null,
5551
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.7029133837728602,
6
  "eval_steps": 500,
7
+ "global_step": 32000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5526
  "eval_steps_per_second": 19.005,
5527
  "num_input_tokens_seen": 32505852160,
5528
  "step": 31000
5529
+ },
5530
+ {
5531
+ "epoch": 0.6820456426921033,
5532
+ "grad_norm": 0.143716499209404,
5533
+ "learning_rate": 0.001,
5534
+ "loss": 2.6591,
5535
+ "num_input_tokens_seen": 32558280960,
5536
+ "step": 31050
5537
+ },
5538
+ {
5539
+ "epoch": 0.6831439448542485,
5540
+ "grad_norm": 0.16048283874988556,
5541
+ "learning_rate": 0.001,
5542
+ "loss": 2.659,
5543
+ "num_input_tokens_seen": 32610709760,
5544
+ "step": 31100
5545
+ },
5546
+ {
5547
+ "epoch": 0.6842422470163936,
5548
+ "grad_norm": 0.15203309059143066,
5549
+ "learning_rate": 0.001,
5550
+ "loss": 2.6703,
5551
+ "num_input_tokens_seen": 32663138560,
5552
+ "step": 31150
5553
+ },
5554
+ {
5555
+ "epoch": 0.6853405491785386,
5556
+ "grad_norm": 0.14977113902568817,
5557
+ "learning_rate": 0.001,
5558
+ "loss": 2.6657,
5559
+ "num_input_tokens_seen": 32715567360,
5560
+ "step": 31200
5561
+ },
5562
+ {
5563
+ "epoch": 0.6864388513406837,
5564
+ "grad_norm": 0.15292279422283173,
5565
+ "learning_rate": 0.001,
5566
+ "loss": 2.6629,
5567
+ "num_input_tokens_seen": 32767996160,
5568
+ "step": 31250
5569
+ },
5570
+ {
5571
+ "epoch": 0.6875371535028288,
5572
+ "grad_norm": 0.13721971213817596,
5573
+ "learning_rate": 0.001,
5574
+ "loss": 2.6641,
5575
+ "num_input_tokens_seen": 32820424960,
5576
+ "step": 31300
5577
+ },
5578
+ {
5579
+ "epoch": 0.6886354556649739,
5580
+ "grad_norm": 0.15564891695976257,
5581
+ "learning_rate": 0.001,
5582
+ "loss": 2.6673,
5583
+ "num_input_tokens_seen": 32872853760,
5584
+ "step": 31350
5585
+ },
5586
+ {
5587
+ "epoch": 0.689733757827119,
5588
+ "grad_norm": 0.15267717838287354,
5589
+ "learning_rate": 0.001,
5590
+ "loss": 2.6624,
5591
+ "num_input_tokens_seen": 32925282560,
5592
+ "step": 31400
5593
+ },
5594
+ {
5595
+ "epoch": 0.6908320599892641,
5596
+ "grad_norm": 0.15039384365081787,
5597
+ "learning_rate": 0.001,
5598
+ "loss": 2.6615,
5599
+ "num_input_tokens_seen": 32977711360,
5600
+ "step": 31450
5601
+ },
5602
+ {
5603
+ "epoch": 0.6919303621514092,
5604
+ "grad_norm": 0.14114901423454285,
5605
+ "learning_rate": 0.001,
5606
+ "loss": 2.6663,
5607
+ "num_input_tokens_seen": 33030140160,
5608
+ "step": 31500
5609
+ },
5610
+ {
5611
+ "epoch": 0.6919303621514092,
5612
+ "eval_loss": 2.5618767738342285,
5613
+ "eval_runtime": 66.9611,
5614
+ "eval_samples_per_second": 74.67,
5615
+ "eval_steps_per_second": 18.668,
5616
+ "num_input_tokens_seen": 33030140160,
5617
+ "step": 31500
5618
+ },
5619
+ {
5620
+ "epoch": 0.6930286643135543,
5621
+ "grad_norm": 0.1415725201368332,
5622
+ "learning_rate": 0.001,
5623
+ "loss": 2.6606,
5624
+ "num_input_tokens_seen": 33082568960,
5625
+ "step": 31550
5626
+ },
5627
+ {
5628
+ "epoch": 0.6941269664756994,
5629
+ "grad_norm": 0.14324156939983368,
5630
+ "learning_rate": 0.001,
5631
+ "loss": 2.6616,
5632
+ "num_input_tokens_seen": 33134997760,
5633
+ "step": 31600
5634
+ },
5635
+ {
5636
+ "epoch": 0.6952252686378445,
5637
+ "grad_norm": 0.1544431746006012,
5638
+ "learning_rate": 0.001,
5639
+ "loss": 2.6567,
5640
+ "num_input_tokens_seen": 33187426560,
5641
+ "step": 31650
5642
+ },
5643
+ {
5644
+ "epoch": 0.6963235707999895,
5645
+ "grad_norm": 0.14641186594963074,
5646
+ "learning_rate": 0.001,
5647
+ "loss": 2.6605,
5648
+ "num_input_tokens_seen": 33239855360,
5649
+ "step": 31700
5650
+ },
5651
+ {
5652
+ "epoch": 0.6974218729621346,
5653
+ "grad_norm": 0.13757406175136566,
5654
+ "learning_rate": 0.001,
5655
+ "loss": 2.673,
5656
+ "num_input_tokens_seen": 33292284160,
5657
+ "step": 31750
5658
+ },
5659
+ {
5660
+ "epoch": 0.6985201751242798,
5661
+ "grad_norm": 0.14516425132751465,
5662
+ "learning_rate": 0.001,
5663
+ "loss": 2.6781,
5664
+ "num_input_tokens_seen": 33344712960,
5665
+ "step": 31800
5666
+ },
5667
+ {
5668
+ "epoch": 0.6996184772864249,
5669
+ "grad_norm": 0.15246887505054474,
5670
+ "learning_rate": 0.001,
5671
+ "loss": 2.6683,
5672
+ "num_input_tokens_seen": 33397141760,
5673
+ "step": 31850
5674
+ },
5675
+ {
5676
+ "epoch": 0.7007167794485699,
5677
+ "grad_norm": 0.1413787305355072,
5678
+ "learning_rate": 0.001,
5679
+ "loss": 2.6591,
5680
+ "num_input_tokens_seen": 33449570560,
5681
+ "step": 31900
5682
+ },
5683
+ {
5684
+ "epoch": 0.701815081610715,
5685
+ "grad_norm": 0.16077399253845215,
5686
+ "learning_rate": 0.001,
5687
+ "loss": 2.6628,
5688
+ "num_input_tokens_seen": 33501999360,
5689
+ "step": 31950
5690
+ },
5691
+ {
5692
+ "epoch": 0.7029133837728602,
5693
+ "grad_norm": 0.1555839478969574,
5694
+ "learning_rate": 0.001,
5695
+ "loss": 2.6631,
5696
+ "num_input_tokens_seen": 33554428160,
5697
+ "step": 32000
5698
+ },
5699
+ {
5700
+ "epoch": 0.7029133837728602,
5701
+ "eval_loss": 2.561042547225952,
5702
+ "eval_runtime": 66.7879,
5703
+ "eval_samples_per_second": 74.864,
5704
+ "eval_steps_per_second": 18.716,
5705
+ "num_input_tokens_seen": 33554428160,
5706
+ "step": 32000
5707
  }
5708
  ],
5709
  "logging_steps": 50,
5710
  "max_steps": 200000,
5711
+ "num_input_tokens_seen": 33554428160,
5712
  "num_train_epochs": 5,
5713
  "save_steps": 1000,
5714
  "stateful_callbacks": {
 
5723
  "attributes": {}
5724
  }
5725
  },
5726
+ "total_flos": 1.9109492723969556e+19,
5727
  "train_batch_size": 64,
5728
  "trial_name": null,
5729
  "trial_params": null