Azrail commited on
Commit
200e13b
·
verified ·
1 Parent(s): e2e1f85

Training in progress, step 117000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88eb3f94bc7241f618e5c9770b54c115b258d914f67d481780ad17863ab32c2e
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ccf8d1ee3da4942ba95f7a3a54578d6c16809257e74ad1be0b26812641e3056
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ead16386e0cfae3ee1c925e0e05a55f093ed2c84207e3beb26950f24f2d0edd3
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f1a7487954ffb44d1bab57c681b14f7a5680ded0c52a6c8bb015865beff7ed1
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:315a996739a8cfadd830b0d25c5fc7336620692744591af847d9b45121986328
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48498b576bbabf1971bbdc1b63e18da5e5d6ff6ee2d2893d269ddf346414745c
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f9c807b963b46c441b7e935adcacbb554bdd0c85992b7453ee29eed159b81fb
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f7ac3b8ebf1c0d4bfd4f038411c119a54a5a538a834ebe005f085cdf984be31
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.1066482225693741,
6
  "eval_steps": 500,
7
- "global_step": 116000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -20656,11 +20656,189 @@
20656
  "eval_steps_per_second": 15.098,
20657
  "num_input_tokens_seen": 60807636160,
20658
  "step": 116000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20659
  }
20660
  ],
20661
  "logging_steps": 50,
20662
  "max_steps": 140000,
20663
- "num_input_tokens_seen": 60807636160,
20664
  "num_train_epochs": 2,
20665
  "save_steps": 1000,
20666
  "stateful_callbacks": {
@@ -20675,7 +20853,7 @@
20675
  "attributes": {}
20676
  }
20677
  },
20678
- "total_flos": 1.0761842469036442e+20,
20679
  "train_batch_size": 32,
20680
  "trial_name": null,
20681
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.1161882728931396,
6
  "eval_steps": 500,
7
+ "global_step": 117000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
20656
  "eval_steps_per_second": 15.098,
20657
  "num_input_tokens_seen": 60807636160,
20658
  "step": 116000
20659
+ },
20660
+ {
20661
+ "epoch": 1.1071252250855623,
20662
+ "grad_norm": 0.13944968581199646,
20663
+ "learning_rate": 0.0009492603104988907,
20664
+ "loss": 2.1028,
20665
+ "num_input_tokens_seen": 60833850560,
20666
+ "step": 116050
20667
+ },
20668
+ {
20669
+ "epoch": 1.1076022276017505,
20670
+ "grad_norm": 0.14454355835914612,
20671
+ "learning_rate": 0.0009480220479843627,
20672
+ "loss": 2.0995,
20673
+ "num_input_tokens_seen": 60860064224,
20674
+ "step": 116100
20675
+ },
20676
+ {
20677
+ "epoch": 1.108079230117939,
20678
+ "grad_norm": 0.1737418919801712,
20679
+ "learning_rate": 0.0009467696853780625,
20680
+ "loss": 2.0841,
20681
+ "num_input_tokens_seen": 60886278080,
20682
+ "step": 116150
20683
+ },
20684
+ {
20685
+ "epoch": 1.1085562326341272,
20686
+ "grad_norm": 0.1442703902721405,
20687
+ "learning_rate": 0.0009455032620941839,
20688
+ "loss": 2.0847,
20689
+ "num_input_tokens_seen": 60912488608,
20690
+ "step": 116200
20691
+ },
20692
+ {
20693
+ "epoch": 1.1090332351503154,
20694
+ "grad_norm": 0.14151588082313538,
20695
+ "learning_rate": 0.0009442228179894363,
20696
+ "loss": 2.0939,
20697
+ "num_input_tokens_seen": 60938699264,
20698
+ "step": 116250
20699
+ },
20700
+ {
20701
+ "epoch": 1.1095102376665036,
20702
+ "grad_norm": 0.12823954224586487,
20703
+ "learning_rate": 0.00094292839336179,
20704
+ "loss": 2.0911,
20705
+ "num_input_tokens_seen": 60964913664,
20706
+ "step": 116300
20707
+ },
20708
+ {
20709
+ "epoch": 1.109987240182692,
20710
+ "grad_norm": 0.1551038920879364,
20711
+ "learning_rate": 0.0009416200289492091,
20712
+ "loss": 2.0905,
20713
+ "num_input_tokens_seen": 60991126176,
20714
+ "step": 116350
20715
+ },
20716
+ {
20717
+ "epoch": 1.1104642426988802,
20718
+ "grad_norm": 0.14844666421413422,
20719
+ "learning_rate": 0.000940297765928369,
20720
+ "loss": 2.0853,
20721
+ "num_input_tokens_seen": 61017336640,
20722
+ "step": 116400
20723
+ },
20724
+ {
20725
+ "epoch": 1.1109412452150684,
20726
+ "grad_norm": 0.14786940813064575,
20727
+ "learning_rate": 0.0009389616459133597,
20728
+ "loss": 2.0948,
20729
+ "num_input_tokens_seen": 61043543488,
20730
+ "step": 116450
20731
+ },
20732
+ {
20733
+ "epoch": 1.1114182477312569,
20734
+ "grad_norm": 0.1404752880334854,
20735
+ "learning_rate": 0.0009376117109543769,
20736
+ "loss": 2.0889,
20737
+ "num_input_tokens_seen": 61069752768,
20738
+ "step": 116500
20739
+ },
20740
+ {
20741
+ "epoch": 1.1114182477312569,
20742
+ "eval_loss": 2.007530450820923,
20743
+ "eval_runtime": 83.3145,
20744
+ "eval_samples_per_second": 60.014,
20745
+ "eval_steps_per_second": 15.003,
20746
+ "num_input_tokens_seen": 61069752768,
20747
+ "step": 116500
20748
+ },
20749
+ {
20750
+ "epoch": 1.111895250247445,
20751
+ "grad_norm": 0.14887551963329315,
20752
+ "learning_rate": 0.0009362480035363986,
20753
+ "loss": 2.0906,
20754
+ "num_input_tokens_seen": 61095967168,
20755
+ "step": 116550
20756
+ },
20757
+ {
20758
+ "epoch": 1.1123722527636333,
20759
+ "grad_norm": 0.1436939537525177,
20760
+ "learning_rate": 0.0009348705665778478,
20761
+ "loss": 2.0857,
20762
+ "num_input_tokens_seen": 61122178400,
20763
+ "step": 116600
20764
+ },
20765
+ {
20766
+ "epoch": 1.1128492552798217,
20767
+ "grad_norm": 0.15015645325183868,
20768
+ "learning_rate": 0.0009334794434292415,
20769
+ "loss": 2.0877,
20770
+ "num_input_tokens_seen": 61148383936,
20771
+ "step": 116650
20772
+ },
20773
+ {
20774
+ "epoch": 1.11332625779601,
20775
+ "grad_norm": 0.15639320015907288,
20776
+ "learning_rate": 0.0009320746778718274,
20777
+ "loss": 2.082,
20778
+ "num_input_tokens_seen": 61174590560,
20779
+ "step": 116700
20780
+ },
20781
+ {
20782
+ "epoch": 1.1138032603121981,
20783
+ "grad_norm": 0.1376616209745407,
20784
+ "learning_rate": 0.0009306563141162046,
20785
+ "loss": 2.0893,
20786
+ "num_input_tokens_seen": 61200799104,
20787
+ "step": 116750
20788
+ },
20789
+ {
20790
+ "epoch": 1.1142802628283863,
20791
+ "grad_norm": 0.13897264003753662,
20792
+ "learning_rate": 0.000929224396800933,
20793
+ "loss": 2.0885,
20794
+ "num_input_tokens_seen": 61227004960,
20795
+ "step": 116800
20796
+ },
20797
+ {
20798
+ "epoch": 1.1147572653445748,
20799
+ "grad_norm": 0.16240862011909485,
20800
+ "learning_rate": 0.0009277789709911291,
20801
+ "loss": 2.0772,
20802
+ "num_input_tokens_seen": 61253214976,
20803
+ "step": 116850
20804
+ },
20805
+ {
20806
+ "epoch": 1.115234267860763,
20807
+ "grad_norm": 0.13620969653129578,
20808
+ "learning_rate": 0.0009263200821770461,
20809
+ "loss": 2.0815,
20810
+ "num_input_tokens_seen": 61279425344,
20811
+ "step": 116900
20812
+ },
20813
+ {
20814
+ "epoch": 1.1157112703769512,
20815
+ "grad_norm": 0.13625779747962952,
20816
+ "learning_rate": 0.0009248477762726437,
20817
+ "loss": 2.0834,
20818
+ "num_input_tokens_seen": 61305623936,
20819
+ "step": 116950
20820
+ },
20821
+ {
20822
+ "epoch": 1.1161882728931396,
20823
+ "grad_norm": 0.1379876434803009,
20824
+ "learning_rate": 0.0009233620996141421,
20825
+ "loss": 2.0879,
20826
+ "num_input_tokens_seen": 61331831488,
20827
+ "step": 117000
20828
+ },
20829
+ {
20830
+ "epoch": 1.1161882728931396,
20831
+ "eval_loss": 2.0054421424865723,
20832
+ "eval_runtime": 82.7611,
20833
+ "eval_samples_per_second": 60.415,
20834
+ "eval_steps_per_second": 15.104,
20835
+ "num_input_tokens_seen": 61331831488,
20836
+ "step": 117000
20837
  }
20838
  ],
20839
  "logging_steps": 50,
20840
  "max_steps": 140000,
20841
+ "num_input_tokens_seen": 61331831488,
20842
  "num_train_epochs": 2,
20843
  "save_steps": 1000,
20844
  "stateful_callbacks": {
 
20853
  "attributes": {}
20854
  }
20855
  },
20856
+ "total_flos": 1.0854615480769659e+20,
20857
  "train_batch_size": 32,
20858
  "trial_name": null,
20859
  "trial_params": null