Azrail commited on
Commit
a175442
·
verified ·
1 Parent(s): 22dab81

Training in progress, step 128000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9d1695bc9de636b5aaeaf2dd7d5f58cbc5a682eb69ac9b38095e92d54ec5937
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8c3d49b98bfff4ce201de8fd57e1cb46f198541be8429619ddab0ad9d2161b3
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4dc4491cbd42db47871ad0a656d153441e2ea2d0c5e68c9fdfe29f91fdedede3
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29a7f0ec5937d8a36844081698ed35de214589f8d2c33900c6101538c2a4386f
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8542f0951d699465323349728bdecbda5c5f0e8274e699cbba04806de2fddeeb
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:065d5f05cf0a782fa5b97e409b16ef2b4cf8c6102c4a9437ad899a13c927398f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27a3a16e476801029c30325a569467f804e448c3ecc89accd2bd78b3749ec27f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9d00381a4191263086f00c86313941ab13504158fdcedfcfacd5f658d7b3729
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.2115887761307942,
6
  "eval_steps": 500,
7
- "global_step": 127000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -22614,11 +22614,189 @@
22614
  "eval_steps_per_second": 15.182,
22615
  "num_input_tokens_seen": 66573856704,
22616
  "step": 127000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22617
  }
22618
  ],
22619
  "logging_steps": 50,
22620
  "max_steps": 140000,
22621
- "num_input_tokens_seen": 66573856704,
22622
  "num_train_epochs": 2,
22623
  "save_steps": 1000,
22624
  "stateful_callbacks": {
@@ -22633,7 +22811,7 @@
22633
  "attributes": {}
22634
  }
22635
  },
22636
- "total_flos": 1.1782358329461719e+20,
22637
  "train_batch_size": 32,
22638
  "trial_name": null,
22639
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2211288264545597,
6
  "eval_steps": 500,
7
+ "global_step": 128000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
22614
  "eval_steps_per_second": 15.182,
22615
  "num_input_tokens_seen": 66573856704,
22616
  "step": 127000
22617
+ },
22618
+ {
22619
+ "epoch": 1.2120657786469824,
22620
+ "grad_norm": 0.13154049217700958,
22621
+ "learning_rate": 0.00044123130127108126,
22622
+ "loss": 2.0525,
22623
+ "num_input_tokens_seen": 66600067712,
22624
+ "step": 127050
22625
+ },
22626
+ {
22627
+ "epoch": 1.2125427811631706,
22628
+ "grad_norm": 0.13129626214504242,
22629
+ "learning_rate": 0.00043844669015467863,
22630
+ "loss": 2.0411,
22631
+ "num_input_tokens_seen": 66626274400,
22632
+ "step": 127100
22633
+ },
22634
+ {
22635
+ "epoch": 1.2130197836793588,
22636
+ "grad_norm": 0.12721647322177887,
22637
+ "learning_rate": 0.0004356640162360581,
22638
+ "loss": 2.0469,
22639
+ "num_input_tokens_seen": 66652487040,
22640
+ "step": 127150
22641
+ },
22642
+ {
22643
+ "epoch": 1.2134967861955472,
22644
+ "grad_norm": 0.1383296549320221,
22645
+ "learning_rate": 0.0004328833670911724,
22646
+ "loss": 2.0578,
22647
+ "num_input_tokens_seen": 66678700288,
22648
+ "step": 127200
22649
+ },
22650
+ {
22651
+ "epoch": 1.2139737887117354,
22652
+ "grad_norm": 0.12966816127300262,
22653
+ "learning_rate": 0.00043010483023225046,
22654
+ "loss": 2.0544,
22655
+ "num_input_tokens_seen": 66704910336,
22656
+ "step": 127250
22657
+ },
22658
+ {
22659
+ "epoch": 1.2144507912279237,
22660
+ "grad_norm": 0.13144998252391815,
22661
+ "learning_rate": 0.0004273284931050438,
22662
+ "loss": 2.061,
22663
+ "num_input_tokens_seen": 66731122112,
22664
+ "step": 127300
22665
+ },
22666
+ {
22667
+ "epoch": 1.214927793744112,
22668
+ "grad_norm": 0.13422222435474396,
22669
+ "learning_rate": 0.0004245544430860743,
22670
+ "loss": 2.062,
22671
+ "num_input_tokens_seen": 66757331872,
22672
+ "step": 127350
22673
+ },
22674
+ {
22675
+ "epoch": 1.2154047962603003,
22676
+ "grad_norm": 0.1333204060792923,
22677
+ "learning_rate": 0.0004217827674798845,
22678
+ "loss": 2.0538,
22679
+ "num_input_tokens_seen": 66783545248,
22680
+ "step": 127400
22681
+ },
22682
+ {
22683
+ "epoch": 1.2158817987764885,
22684
+ "grad_norm": 0.13239559531211853,
22685
+ "learning_rate": 0.0004190135535162894,
22686
+ "loss": 2.0545,
22687
+ "num_input_tokens_seen": 66809758656,
22688
+ "step": 127450
22689
+ },
22690
+ {
22691
+ "epoch": 1.2163588012926767,
22692
+ "grad_norm": 0.13535359501838684,
22693
+ "learning_rate": 0.00041624688834763184,
22694
+ "loss": 2.0625,
22695
+ "num_input_tokens_seen": 66835970592,
22696
+ "step": 127500
22697
+ },
22698
+ {
22699
+ "epoch": 1.2163588012926767,
22700
+ "eval_loss": 1.9728902578353882,
22701
+ "eval_runtime": 82.272,
22702
+ "eval_samples_per_second": 60.774,
22703
+ "eval_steps_per_second": 15.194,
22704
+ "num_input_tokens_seen": 66835970592,
22705
+ "step": 127500
22706
+ },
22707
+ {
22708
+ "epoch": 1.2168358038088651,
22709
+ "grad_norm": 0.1306886225938797,
22710
+ "learning_rate": 0.0004134828590460387,
22711
+ "loss": 2.0548,
22712
+ "num_input_tokens_seen": 66862174016,
22713
+ "step": 127550
22714
+ },
22715
+ {
22716
+ "epoch": 1.2173128063250533,
22717
+ "grad_norm": 0.1322244554758072,
22718
+ "learning_rate": 0.0004107215526006817,
22719
+ "loss": 2.0544,
22720
+ "num_input_tokens_seen": 66888384224,
22721
+ "step": 127600
22722
+ },
22723
+ {
22724
+ "epoch": 1.2177898088412416,
22725
+ "grad_norm": 0.13241881132125854,
22726
+ "learning_rate": 0.0004079630559150391,
22727
+ "loss": 2.0646,
22728
+ "num_input_tokens_seen": 66914597888,
22729
+ "step": 127650
22730
+ },
22731
+ {
22732
+ "epoch": 1.21826681135743,
22733
+ "grad_norm": 0.12745130062103271,
22734
+ "learning_rate": 0.0004052074558041608,
22735
+ "loss": 2.0554,
22736
+ "num_input_tokens_seen": 66940807552,
22737
+ "step": 127700
22738
+ },
22739
+ {
22740
+ "epoch": 1.2187438138736182,
22741
+ "grad_norm": 0.13167862594127655,
22742
+ "learning_rate": 0.00040245483899193594,
22743
+ "loss": 2.0449,
22744
+ "num_input_tokens_seen": 66967017376,
22745
+ "step": 127750
22746
+ },
22747
+ {
22748
+ "epoch": 1.2192208163898064,
22749
+ "grad_norm": 0.1641312688589096,
22750
+ "learning_rate": 0.00039970529210836363,
22751
+ "loss": 2.0438,
22752
+ "num_input_tokens_seen": 66993229600,
22753
+ "step": 127800
22754
+ },
22755
+ {
22756
+ "epoch": 1.2196978189059948,
22757
+ "grad_norm": 0.1290162205696106,
22758
+ "learning_rate": 0.00039695890168682686,
22759
+ "loss": 2.0633,
22760
+ "num_input_tokens_seen": 67019433984,
22761
+ "step": 127850
22762
+ },
22763
+ {
22764
+ "epoch": 1.220174821422183,
22765
+ "grad_norm": 0.12822365760803223,
22766
+ "learning_rate": 0.0003942157541613686,
22767
+ "loss": 2.0477,
22768
+ "num_input_tokens_seen": 67045643168,
22769
+ "step": 127900
22770
+ },
22771
+ {
22772
+ "epoch": 1.2206518239383712,
22773
+ "grad_norm": 0.13961108028888702,
22774
+ "learning_rate": 0.0003914759358639719,
22775
+ "loss": 2.063,
22776
+ "num_input_tokens_seen": 67071854592,
22777
+ "step": 127950
22778
+ },
22779
+ {
22780
+ "epoch": 1.2211288264545597,
22781
+ "grad_norm": 0.13082347810268402,
22782
+ "learning_rate": 0.00038873953302184284,
22783
+ "loss": 2.0557,
22784
+ "num_input_tokens_seen": 67098059328,
22785
+ "step": 128000
22786
+ },
22787
+ {
22788
+ "epoch": 1.2211288264545597,
22789
+ "eval_loss": 1.9715449810028076,
22790
+ "eval_runtime": 83.7065,
22791
+ "eval_samples_per_second": 59.733,
22792
+ "eval_steps_per_second": 14.933,
22793
+ "num_input_tokens_seen": 67098059328,
22794
+ "step": 128000
22795
  }
22796
  ],
22797
  "logging_steps": 50,
22798
  "max_steps": 140000,
22799
+ "num_input_tokens_seen": 67098059328,
22800
  "num_train_epochs": 2,
22801
  "save_steps": 1000,
22802
  "stateful_callbacks": {
 
22811
  "attributes": {}
22812
  }
22813
  },
22814
+ "total_flos": 1.1875132632453857e+20,
22815
  "train_batch_size": 32,
22816
  "trial_name": null,
22817
  "trial_params": null