mohammadmahdinouri commited on
Commit
191df2f
·
verified ·
1 Parent(s): a2d738d

Training in progress, step 20000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:01e4827000f30108c5db6d9ab6168d6e7dfecf37eef3edc1465363ee9ea8e490
3
  size 304481530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db7432141f78eaf89762dbfa7cd270e9a33828df0e033550b34c9481463227a6
3
  size 304481530
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:795bb5905ce658a665a647e1035b68562ea8227998cfd6cdd93e835459408e5d
3
  size 402029570
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5277a64b453be07385d84e9f80db45a50e37f4890167c3e4c572e1a6fb7bdaaa
3
  size 402029570
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6872023f654a65ebb855f875663f2550ec7c7270f37183aedc09afdf3151f71c
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:baf78593c218b20d298480993c7fbaf9b2ea100e2a22749e83c5c1aba18f3f4c
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d749134b574c8d566f1f7b1e5e174cfc46c406c32210d882ffb530c2f402814
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65689efd51e6068aa6422e7737ef0148b7583a59986d4d53a6a0a02103bfcb11
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7b2317285b7aac6485bde8423b9bd42301b29e0cd0b6a3f299d06ddf3270099
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90981e208c884dfa861b8ec3fc9badb69e05a78f261183a623615ac5a97c3c95
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a512863aeac154eb9ea09654b5c57fb002e6788836adf8be9c2844cb710adf1
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c76af8694f8d37feee42992c1a0000126f33879d8755e31713c98eb2fdb7b48c
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b73090e5ff4d77e40aae33305c58d2deda13e4f4510f1c076acf40a9f8a97bef
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5601fb269352a3de217d5b9fa42e25567fee4127194adcf0f48431818665f1d8
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.03704949520062789,
6
  "eval_steps": 500,
7
- "global_step": 19000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6658,6 +6658,356 @@
6658
  "learning_rate": 0.0004939859465103925,
6659
  "loss": 19.6594,
6660
  "step": 19000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6661
  }
6662
  ],
6663
  "logging_steps": 20,
@@ -6677,7 +7027,7 @@
6677
  "attributes": {}
6678
  }
6679
  },
6680
- "total_flos": 1.3968203395446604e+19,
6681
  "train_batch_size": 48,
6682
  "trial_name": null,
6683
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.03899946863223989,
6
  "eval_steps": 500,
7
+ "global_step": 20000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6658
  "learning_rate": 0.0004939859465103925,
6659
  "loss": 19.6594,
6660
  "step": 19000
6661
+ },
6662
+ {
6663
+ "epoch": 0.03708849466926013,
6664
+ "grad_norm": 11.0625,
6665
+ "learning_rate": 0.0004939794444794831,
6666
+ "loss": 19.7557,
6667
+ "step": 19020
6668
+ },
6669
+ {
6670
+ "epoch": 0.03712749413789237,
6671
+ "grad_norm": 8.5,
6672
+ "learning_rate": 0.0004939729424485738,
6673
+ "loss": 19.6508,
6674
+ "step": 19040
6675
+ },
6676
+ {
6677
+ "epoch": 0.03716649360652461,
6678
+ "grad_norm": 12.375,
6679
+ "learning_rate": 0.0004939664404176644,
6680
+ "loss": 19.6447,
6681
+ "step": 19060
6682
+ },
6683
+ {
6684
+ "epoch": 0.03720549307515685,
6685
+ "grad_norm": 9.5625,
6686
+ "learning_rate": 0.0004939599383867551,
6687
+ "loss": 19.5852,
6688
+ "step": 19080
6689
+ },
6690
+ {
6691
+ "epoch": 0.037244492543789094,
6692
+ "grad_norm": 9.5625,
6693
+ "learning_rate": 0.0004939534363558458,
6694
+ "loss": 19.6278,
6695
+ "step": 19100
6696
+ },
6697
+ {
6698
+ "epoch": 0.03728349201242133,
6699
+ "grad_norm": 11.25,
6700
+ "learning_rate": 0.0004939469343249364,
6701
+ "loss": 19.7012,
6702
+ "step": 19120
6703
+ },
6704
+ {
6705
+ "epoch": 0.03732249148105357,
6706
+ "grad_norm": 10.9375,
6707
+ "learning_rate": 0.0004939404322940271,
6708
+ "loss": 19.6584,
6709
+ "step": 19140
6710
+ },
6711
+ {
6712
+ "epoch": 0.03736149094968581,
6713
+ "grad_norm": 10.375,
6714
+ "learning_rate": 0.0004939339302631176,
6715
+ "loss": 19.6233,
6716
+ "step": 19160
6717
+ },
6718
+ {
6719
+ "epoch": 0.03740049041831805,
6720
+ "grad_norm": 11.625,
6721
+ "learning_rate": 0.0004939274282322083,
6722
+ "loss": 19.5714,
6723
+ "step": 19180
6724
+ },
6725
+ {
6726
+ "epoch": 0.03743948988695029,
6727
+ "grad_norm": 10.0,
6728
+ "learning_rate": 0.0004939209262012989,
6729
+ "loss": 19.5819,
6730
+ "step": 19200
6731
+ },
6732
+ {
6733
+ "epoch": 0.03747848935558253,
6734
+ "grad_norm": 10.375,
6735
+ "learning_rate": 0.0004939144241703896,
6736
+ "loss": 19.605,
6737
+ "step": 19220
6738
+ },
6739
+ {
6740
+ "epoch": 0.03751748882421477,
6741
+ "grad_norm": 10.6875,
6742
+ "learning_rate": 0.0004939079221394803,
6743
+ "loss": 19.6348,
6744
+ "step": 19240
6745
+ },
6746
+ {
6747
+ "epoch": 0.03755648829284701,
6748
+ "grad_norm": 9.3125,
6749
+ "learning_rate": 0.0004939014201085709,
6750
+ "loss": 19.6152,
6751
+ "step": 19260
6752
+ },
6753
+ {
6754
+ "epoch": 0.03759548776147925,
6755
+ "grad_norm": 10.1875,
6756
+ "learning_rate": 0.0004938949180776616,
6757
+ "loss": 19.6556,
6758
+ "step": 19280
6759
+ },
6760
+ {
6761
+ "epoch": 0.03763448723011149,
6762
+ "grad_norm": 9.125,
6763
+ "learning_rate": 0.0004938884160467522,
6764
+ "loss": 19.603,
6765
+ "step": 19300
6766
+ },
6767
+ {
6768
+ "epoch": 0.03767348669874373,
6769
+ "grad_norm": 10.0625,
6770
+ "learning_rate": 0.0004938819140158429,
6771
+ "loss": 19.6083,
6772
+ "step": 19320
6773
+ },
6774
+ {
6775
+ "epoch": 0.03771248616737597,
6776
+ "grad_norm": 11.8125,
6777
+ "learning_rate": 0.0004938754119849335,
6778
+ "loss": 19.6945,
6779
+ "step": 19340
6780
+ },
6781
+ {
6782
+ "epoch": 0.03775148563600821,
6783
+ "grad_norm": 10.125,
6784
+ "learning_rate": 0.0004938689099540242,
6785
+ "loss": 19.5438,
6786
+ "step": 19360
6787
+ },
6788
+ {
6789
+ "epoch": 0.03779048510464045,
6790
+ "grad_norm": 9.5625,
6791
+ "learning_rate": 0.0004938624079231149,
6792
+ "loss": 19.6158,
6793
+ "step": 19380
6794
+ },
6795
+ {
6796
+ "epoch": 0.03782948457327269,
6797
+ "grad_norm": 10.1875,
6798
+ "learning_rate": 0.0004938559058922054,
6799
+ "loss": 19.6023,
6800
+ "step": 19400
6801
+ },
6802
+ {
6803
+ "epoch": 0.03786848404190493,
6804
+ "grad_norm": 9.75,
6805
+ "learning_rate": 0.0004938494038612961,
6806
+ "loss": 19.6143,
6807
+ "step": 19420
6808
+ },
6809
+ {
6810
+ "epoch": 0.03790748351053717,
6811
+ "grad_norm": 9.875,
6812
+ "learning_rate": 0.0004938429018303867,
6813
+ "loss": 19.5367,
6814
+ "step": 19440
6815
+ },
6816
+ {
6817
+ "epoch": 0.03794648297916941,
6818
+ "grad_norm": 9.375,
6819
+ "learning_rate": 0.0004938363997994774,
6820
+ "loss": 19.5761,
6821
+ "step": 19460
6822
+ },
6823
+ {
6824
+ "epoch": 0.03798548244780165,
6825
+ "grad_norm": 10.1875,
6826
+ "learning_rate": 0.000493829897768568,
6827
+ "loss": 19.5939,
6828
+ "step": 19480
6829
+ },
6830
+ {
6831
+ "epoch": 0.038024481916433886,
6832
+ "grad_norm": 10.0,
6833
+ "learning_rate": 0.0004938233957376587,
6834
+ "loss": 19.5595,
6835
+ "step": 19500
6836
+ },
6837
+ {
6838
+ "epoch": 0.03806348138506613,
6839
+ "grad_norm": 12.75,
6840
+ "learning_rate": 0.0004938168937067493,
6841
+ "loss": 19.5722,
6842
+ "step": 19520
6843
+ },
6844
+ {
6845
+ "epoch": 0.03810248085369837,
6846
+ "grad_norm": 10.375,
6847
+ "learning_rate": 0.00049381039167584,
6848
+ "loss": 19.5889,
6849
+ "step": 19540
6850
+ },
6851
+ {
6852
+ "epoch": 0.038141480322330605,
6853
+ "grad_norm": 10.5,
6854
+ "learning_rate": 0.0004938038896449307,
6855
+ "loss": 19.5379,
6856
+ "step": 19560
6857
+ },
6858
+ {
6859
+ "epoch": 0.03818047979096285,
6860
+ "grad_norm": 9.875,
6861
+ "learning_rate": 0.0004937973876140213,
6862
+ "loss": 19.5243,
6863
+ "step": 19580
6864
+ },
6865
+ {
6866
+ "epoch": 0.03821947925959509,
6867
+ "grad_norm": 9.1875,
6868
+ "learning_rate": 0.0004937908855831119,
6869
+ "loss": 19.6067,
6870
+ "step": 19600
6871
+ },
6872
+ {
6873
+ "epoch": 0.03825847872822733,
6874
+ "grad_norm": 10.25,
6875
+ "learning_rate": 0.0004937843835522025,
6876
+ "loss": 19.6051,
6877
+ "step": 19620
6878
+ },
6879
+ {
6880
+ "epoch": 0.038297478196859566,
6881
+ "grad_norm": 10.5,
6882
+ "learning_rate": 0.0004937778815212932,
6883
+ "loss": 19.5555,
6884
+ "step": 19640
6885
+ },
6886
+ {
6887
+ "epoch": 0.03833647766549181,
6888
+ "grad_norm": 10.75,
6889
+ "learning_rate": 0.0004937713794903838,
6890
+ "loss": 19.601,
6891
+ "step": 19660
6892
+ },
6893
+ {
6894
+ "epoch": 0.03837547713412405,
6895
+ "grad_norm": 9.5,
6896
+ "learning_rate": 0.0004937648774594745,
6897
+ "loss": 19.5818,
6898
+ "step": 19680
6899
+ },
6900
+ {
6901
+ "epoch": 0.038414476602756285,
6902
+ "grad_norm": 10.625,
6903
+ "learning_rate": 0.0004937583754285651,
6904
+ "loss": 19.5565,
6905
+ "step": 19700
6906
+ },
6907
+ {
6908
+ "epoch": 0.03845347607138853,
6909
+ "grad_norm": 9.9375,
6910
+ "learning_rate": 0.0004937518733976558,
6911
+ "loss": 19.5719,
6912
+ "step": 19720
6913
+ },
6914
+ {
6915
+ "epoch": 0.03849247554002077,
6916
+ "grad_norm": 10.3125,
6917
+ "learning_rate": 0.0004937453713667465,
6918
+ "loss": 19.5583,
6919
+ "step": 19740
6920
+ },
6921
+ {
6922
+ "epoch": 0.038531475008653004,
6923
+ "grad_norm": 10.4375,
6924
+ "learning_rate": 0.000493738869335837,
6925
+ "loss": 19.5279,
6926
+ "step": 19760
6927
+ },
6928
+ {
6929
+ "epoch": 0.038570474477285246,
6930
+ "grad_norm": 9.4375,
6931
+ "learning_rate": 0.0004937323673049277,
6932
+ "loss": 19.4711,
6933
+ "step": 19780
6934
+ },
6935
+ {
6936
+ "epoch": 0.03860947394591749,
6937
+ "grad_norm": 11.9375,
6938
+ "learning_rate": 0.0004937258652740183,
6939
+ "loss": 19.5244,
6940
+ "step": 19800
6941
+ },
6942
+ {
6943
+ "epoch": 0.038648473414549724,
6944
+ "grad_norm": 9.8125,
6945
+ "learning_rate": 0.000493719363243109,
6946
+ "loss": 19.5652,
6947
+ "step": 19820
6948
+ },
6949
+ {
6950
+ "epoch": 0.038687472883181966,
6951
+ "grad_norm": 11.4375,
6952
+ "learning_rate": 0.0004937128612121996,
6953
+ "loss": 19.5002,
6954
+ "step": 19840
6955
+ },
6956
+ {
6957
+ "epoch": 0.03872647235181421,
6958
+ "grad_norm": 10.1875,
6959
+ "learning_rate": 0.0004937063591812903,
6960
+ "loss": 19.4699,
6961
+ "step": 19860
6962
+ },
6963
+ {
6964
+ "epoch": 0.03876547182044645,
6965
+ "grad_norm": 10.8125,
6966
+ "learning_rate": 0.000493699857150381,
6967
+ "loss": 19.486,
6968
+ "step": 19880
6969
+ },
6970
+ {
6971
+ "epoch": 0.038804471289078685,
6972
+ "grad_norm": 9.0625,
6973
+ "learning_rate": 0.0004936933551194716,
6974
+ "loss": 19.5072,
6975
+ "step": 19900
6976
+ },
6977
+ {
6978
+ "epoch": 0.03884347075771093,
6979
+ "grad_norm": 9.625,
6980
+ "learning_rate": 0.0004936868530885622,
6981
+ "loss": 19.434,
6982
+ "step": 19920
6983
+ },
6984
+ {
6985
+ "epoch": 0.03888247022634317,
6986
+ "grad_norm": 9.5,
6987
+ "learning_rate": 0.0004936803510576528,
6988
+ "loss": 19.4787,
6989
+ "step": 19940
6990
+ },
6991
+ {
6992
+ "epoch": 0.038921469694975404,
6993
+ "grad_norm": 10.0625,
6994
+ "learning_rate": 0.0004936738490267435,
6995
+ "loss": 19.4656,
6996
+ "step": 19960
6997
+ },
6998
+ {
6999
+ "epoch": 0.038960469163607646,
7000
+ "grad_norm": 9.125,
7001
+ "learning_rate": 0.0004936673469958341,
7002
+ "loss": 19.5858,
7003
+ "step": 19980
7004
+ },
7005
+ {
7006
+ "epoch": 0.03899946863223989,
7007
+ "grad_norm": 8.875,
7008
+ "learning_rate": 0.0004936608449649248,
7009
+ "loss": 19.5272,
7010
+ "step": 20000
7011
  }
7012
  ],
7013
  "logging_steps": 20,
 
7027
  "attributes": {}
7028
  }
7029
  },
7030
+ "total_flos": 1.4703345011961889e+19,
7031
  "train_batch_size": 48,
7032
  "trial_name": null,
7033
  "trial_params": null