mohammadmahdinouri commited on
Commit
b4556af
·
verified ·
1 Parent(s): bb569b5

Training in progress, step 17000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:42301bc164cb007a8e9ffaaebd3b674826efaacc96f02799ea8c54ebdf5beff1
3
  size 304481530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e1f6084c2fd12874836176a807971d304a89f7ecfc63e2081a9bd54f224b13b
3
  size 304481530
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e166c3997353d811bb7375dab7e17cf88064b52029e8056c729ba4ae8d2e8f22
3
  size 402029570
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:729c0d767d06adf4295f1acf80d3c9a43aee84e3de6cc9a899725bd2d9ba998b
3
  size 402029570
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8653c4f16bb3c4531444bd438e2a397c259c928e9f5a96f450fc3aa43ef0f5c
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6775411b7c96ce112db0ff86dbc4c7f4f5876ba69512e78981d49611b5ed959e
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91185d0e7a47d1f7979000c680b3a146a800c2ff31f983b75b24ceb331884072
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c2e00f40f2b965358ee58725a6039af41eeb8a8f4527ae152ec5dad618307fd
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:be0be34d9684d804e2f3030fceca4c7b93603e6596a44aaf270c97cb1740b1da
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37ee15f1c9ceef9e456d1af53da3ed0fd0ec244051b974379f15c285ed42f8b7
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e539799e7e99b66c33c364546118319f901c9765aa17eaf7cf8b17906c00c95a
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e79d7f1dfea25dc4809dc0e5c220d70f3b690693b546131b59ad7f9ed9b129c
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ccc2a52ae0327def30cc40f7f273a4a1537961b9b580753fe57ec7ecdab69b35
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84d957adbd57639a95ced1440a685d29db26c75001a9b3061d2f7af9b9a721b1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.031199574905791908,
6
  "eval_steps": 500,
7
- "global_step": 16000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5608,6 +5608,356 @@
5608
  "learning_rate": 0.0004949612511467957,
5609
  "loss": 20.3333,
5610
  "step": 16000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5611
  }
5612
  ],
5613
  "logging_steps": 20,
@@ -5627,7 +5977,7 @@
5627
  "attributes": {}
5628
  }
5629
  },
5630
- "total_flos": 1.176271382718605e+19,
5631
  "train_batch_size": 48,
5632
  "trial_name": null,
5633
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.033149548337403904,
6
  "eval_steps": 500,
7
+ "global_step": 17000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5608
  "learning_rate": 0.0004949612511467957,
5609
  "loss": 20.3333,
5610
  "step": 16000
5611
+ },
5612
+ {
5613
+ "epoch": 0.03123857437442415,
5614
+ "grad_norm": 11.4375,
5615
+ "learning_rate": 0.0004949547491158863,
5616
+ "loss": 20.2811,
5617
+ "step": 16020
5618
+ },
5619
+ {
5620
+ "epoch": 0.03127757384305639,
5621
+ "grad_norm": 11.1875,
5622
+ "learning_rate": 0.000494948247084977,
5623
+ "loss": 20.367,
5624
+ "step": 16040
5625
+ },
5626
+ {
5627
+ "epoch": 0.03131657331168863,
5628
+ "grad_norm": 11.75,
5629
+ "learning_rate": 0.0004949417450540676,
5630
+ "loss": 20.3134,
5631
+ "step": 16060
5632
+ },
5633
+ {
5634
+ "epoch": 0.03135557278032087,
5635
+ "grad_norm": 10.25,
5636
+ "learning_rate": 0.0004949352430231583,
5637
+ "loss": 20.3922,
5638
+ "step": 16080
5639
+ },
5640
+ {
5641
+ "epoch": 0.03139457224895311,
5642
+ "grad_norm": 10.375,
5643
+ "learning_rate": 0.000494928740992249,
5644
+ "loss": 20.3097,
5645
+ "step": 16100
5646
+ },
5647
+ {
5648
+ "epoch": 0.031433571717585346,
5649
+ "grad_norm": 11.375,
5650
+ "learning_rate": 0.0004949222389613396,
5651
+ "loss": 20.3737,
5652
+ "step": 16120
5653
+ },
5654
+ {
5655
+ "epoch": 0.03147257118621759,
5656
+ "grad_norm": 10.1875,
5657
+ "learning_rate": 0.0004949157369304303,
5658
+ "loss": 20.3886,
5659
+ "step": 16140
5660
+ },
5661
+ {
5662
+ "epoch": 0.03151157065484983,
5663
+ "grad_norm": 11.1875,
5664
+ "learning_rate": 0.0004949092348995209,
5665
+ "loss": 20.2403,
5666
+ "step": 16160
5667
+ },
5668
+ {
5669
+ "epoch": 0.031550570123482065,
5670
+ "grad_norm": 11.625,
5671
+ "learning_rate": 0.0004949027328686116,
5672
+ "loss": 20.3402,
5673
+ "step": 16180
5674
+ },
5675
+ {
5676
+ "epoch": 0.03158956959211431,
5677
+ "grad_norm": 14.6875,
5678
+ "learning_rate": 0.0004948962308377022,
5679
+ "loss": 20.3529,
5680
+ "step": 16200
5681
+ },
5682
+ {
5683
+ "epoch": 0.03162856906074655,
5684
+ "grad_norm": 11.5,
5685
+ "learning_rate": 0.0004948897288067928,
5686
+ "loss": 20.2767,
5687
+ "step": 16220
5688
+ },
5689
+ {
5690
+ "epoch": 0.031667568529378784,
5691
+ "grad_norm": 9.6875,
5692
+ "learning_rate": 0.0004948832267758834,
5693
+ "loss": 20.2271,
5694
+ "step": 16240
5695
+ },
5696
+ {
5697
+ "epoch": 0.031706567998011026,
5698
+ "grad_norm": 11.25,
5699
+ "learning_rate": 0.0004948767247449741,
5700
+ "loss": 20.3672,
5701
+ "step": 16260
5702
+ },
5703
+ {
5704
+ "epoch": 0.03174556746664327,
5705
+ "grad_norm": 11.8125,
5706
+ "learning_rate": 0.0004948702227140648,
5707
+ "loss": 20.3693,
5708
+ "step": 16280
5709
+ },
5710
+ {
5711
+ "epoch": 0.03178456693527551,
5712
+ "grad_norm": 13.6875,
5713
+ "learning_rate": 0.0004948637206831554,
5714
+ "loss": 20.2767,
5715
+ "step": 16300
5716
+ },
5717
+ {
5718
+ "epoch": 0.031823566403907745,
5719
+ "grad_norm": 11.25,
5720
+ "learning_rate": 0.0004948572186522461,
5721
+ "loss": 20.2559,
5722
+ "step": 16320
5723
+ },
5724
+ {
5725
+ "epoch": 0.03186256587253999,
5726
+ "grad_norm": 12.1875,
5727
+ "learning_rate": 0.0004948507166213367,
5728
+ "loss": 20.2962,
5729
+ "step": 16340
5730
+ },
5731
+ {
5732
+ "epoch": 0.03190156534117223,
5733
+ "grad_norm": 11.4375,
5734
+ "learning_rate": 0.0004948442145904274,
5735
+ "loss": 20.2648,
5736
+ "step": 16360
5737
+ },
5738
+ {
5739
+ "epoch": 0.031940564809804464,
5740
+ "grad_norm": 10.9375,
5741
+ "learning_rate": 0.0004948377125595179,
5742
+ "loss": 20.2703,
5743
+ "step": 16380
5744
+ },
5745
+ {
5746
+ "epoch": 0.031979564278436706,
5747
+ "grad_norm": 12.1875,
5748
+ "learning_rate": 0.0004948312105286086,
5749
+ "loss": 20.3281,
5750
+ "step": 16400
5751
+ },
5752
+ {
5753
+ "epoch": 0.03201856374706895,
5754
+ "grad_norm": 12.1875,
5755
+ "learning_rate": 0.0004948247084976992,
5756
+ "loss": 20.2317,
5757
+ "step": 16420
5758
+ },
5759
+ {
5760
+ "epoch": 0.03205756321570118,
5761
+ "grad_norm": 11.375,
5762
+ "learning_rate": 0.0004948182064667899,
5763
+ "loss": 20.2883,
5764
+ "step": 16440
5765
+ },
5766
+ {
5767
+ "epoch": 0.032096562684333425,
5768
+ "grad_norm": 12.8125,
5769
+ "learning_rate": 0.0004948117044358806,
5770
+ "loss": 20.2294,
5771
+ "step": 16460
5772
+ },
5773
+ {
5774
+ "epoch": 0.03213556215296567,
5775
+ "grad_norm": 12.5625,
5776
+ "learning_rate": 0.0004948052024049712,
5777
+ "loss": 20.1226,
5778
+ "step": 16480
5779
+ },
5780
+ {
5781
+ "epoch": 0.0321745616215979,
5782
+ "grad_norm": 11.375,
5783
+ "learning_rate": 0.0004947987003740619,
5784
+ "loss": 20.2422,
5785
+ "step": 16500
5786
+ },
5787
+ {
5788
+ "epoch": 0.032213561090230144,
5789
+ "grad_norm": 11.375,
5790
+ "learning_rate": 0.0004947921983431524,
5791
+ "loss": 20.2142,
5792
+ "step": 16520
5793
+ },
5794
+ {
5795
+ "epoch": 0.032252560558862386,
5796
+ "grad_norm": 12.5,
5797
+ "learning_rate": 0.0004947856963122431,
5798
+ "loss": 20.2658,
5799
+ "step": 16540
5800
+ },
5801
+ {
5802
+ "epoch": 0.03229156002749463,
5803
+ "grad_norm": 9.8125,
5804
+ "learning_rate": 0.0004947791942813337,
5805
+ "loss": 20.1552,
5806
+ "step": 16560
5807
+ },
5808
+ {
5809
+ "epoch": 0.032330559496126864,
5810
+ "grad_norm": 10.0625,
5811
+ "learning_rate": 0.0004947726922504244,
5812
+ "loss": 20.1369,
5813
+ "step": 16580
5814
+ },
5815
+ {
5816
+ "epoch": 0.032369558964759106,
5817
+ "grad_norm": 11.375,
5818
+ "learning_rate": 0.000494766190219515,
5819
+ "loss": 20.1965,
5820
+ "step": 16600
5821
+ },
5822
+ {
5823
+ "epoch": 0.03240855843339135,
5824
+ "grad_norm": 10.875,
5825
+ "learning_rate": 0.0004947596881886057,
5826
+ "loss": 20.2377,
5827
+ "step": 16620
5828
+ },
5829
+ {
5830
+ "epoch": 0.03244755790202358,
5831
+ "grad_norm": 10.375,
5832
+ "learning_rate": 0.0004947531861576964,
5833
+ "loss": 20.2204,
5834
+ "step": 16640
5835
+ },
5836
+ {
5837
+ "epoch": 0.032486557370655825,
5838
+ "grad_norm": 11.375,
5839
+ "learning_rate": 0.000494746684126787,
5840
+ "loss": 20.1081,
5841
+ "step": 16660
5842
+ },
5843
+ {
5844
+ "epoch": 0.03252555683928807,
5845
+ "grad_norm": 11.1875,
5846
+ "learning_rate": 0.0004947401820958777,
5847
+ "loss": 20.3024,
5848
+ "step": 16680
5849
+ },
5850
+ {
5851
+ "epoch": 0.0325645563079203,
5852
+ "grad_norm": 11.3125,
5853
+ "learning_rate": 0.0004947336800649683,
5854
+ "loss": 20.1351,
5855
+ "step": 16700
5856
+ },
5857
+ {
5858
+ "epoch": 0.032603555776552544,
5859
+ "grad_norm": 11.1875,
5860
+ "learning_rate": 0.0004947271780340589,
5861
+ "loss": 20.1989,
5862
+ "step": 16720
5863
+ },
5864
+ {
5865
+ "epoch": 0.032642555245184786,
5866
+ "grad_norm": 9.6875,
5867
+ "learning_rate": 0.0004947206760031495,
5868
+ "loss": 20.1502,
5869
+ "step": 16740
5870
+ },
5871
+ {
5872
+ "epoch": 0.03268155471381702,
5873
+ "grad_norm": 11.125,
5874
+ "learning_rate": 0.0004947141739722402,
5875
+ "loss": 20.0948,
5876
+ "step": 16760
5877
+ },
5878
+ {
5879
+ "epoch": 0.03272055418244926,
5880
+ "grad_norm": 11.4375,
5881
+ "learning_rate": 0.0004947076719413309,
5882
+ "loss": 20.1084,
5883
+ "step": 16780
5884
+ },
5885
+ {
5886
+ "epoch": 0.032759553651081505,
5887
+ "grad_norm": 10.3125,
5888
+ "learning_rate": 0.0004947011699104215,
5889
+ "loss": 20.1207,
5890
+ "step": 16800
5891
+ },
5892
+ {
5893
+ "epoch": 0.03279855311971375,
5894
+ "grad_norm": 11.1875,
5895
+ "learning_rate": 0.0004946946678795122,
5896
+ "loss": 20.0984,
5897
+ "step": 16820
5898
+ },
5899
+ {
5900
+ "epoch": 0.03283755258834598,
5901
+ "grad_norm": 10.8125,
5902
+ "learning_rate": 0.0004946881658486028,
5903
+ "loss": 20.1778,
5904
+ "step": 16840
5905
+ },
5906
+ {
5907
+ "epoch": 0.032876552056978224,
5908
+ "grad_norm": 10.8125,
5909
+ "learning_rate": 0.0004946816638176935,
5910
+ "loss": 20.2415,
5911
+ "step": 16860
5912
+ },
5913
+ {
5914
+ "epoch": 0.032915551525610466,
5915
+ "grad_norm": 10.25,
5916
+ "learning_rate": 0.0004946751617867841,
5917
+ "loss": 20.1135,
5918
+ "step": 16880
5919
+ },
5920
+ {
5921
+ "epoch": 0.0329545509942427,
5922
+ "grad_norm": 10.875,
5923
+ "learning_rate": 0.0004946686597558748,
5924
+ "loss": 20.1361,
5925
+ "step": 16900
5926
+ },
5927
+ {
5928
+ "epoch": 0.03299355046287494,
5929
+ "grad_norm": 11.4375,
5930
+ "learning_rate": 0.0004946621577249655,
5931
+ "loss": 20.0907,
5932
+ "step": 16920
5933
+ },
5934
+ {
5935
+ "epoch": 0.033032549931507185,
5936
+ "grad_norm": 14.5,
5937
+ "learning_rate": 0.0004946556556940561,
5938
+ "loss": 20.1267,
5939
+ "step": 16940
5940
+ },
5941
+ {
5942
+ "epoch": 0.03307154940013942,
5943
+ "grad_norm": 12.875,
5944
+ "learning_rate": 0.0004946491536631467,
5945
+ "loss": 20.0818,
5946
+ "step": 16960
5947
+ },
5948
+ {
5949
+ "epoch": 0.03311054886877166,
5950
+ "grad_norm": 10.875,
5951
+ "learning_rate": 0.0004946426516322373,
5952
+ "loss": 20.1085,
5953
+ "step": 16980
5954
+ },
5955
+ {
5956
+ "epoch": 0.033149548337403904,
5957
+ "grad_norm": 10.5625,
5958
+ "learning_rate": 0.000494636149601328,
5959
+ "loss": 20.0712,
5960
+ "step": 17000
5961
  }
5962
  ],
5963
  "logging_steps": 20,
 
5977
  "attributes": {}
5978
  }
5979
  },
5980
+ "total_flos": 1.2497927616331776e+19,
5981
  "train_batch_size": 48,
5982
  "trial_name": null,
5983
  "trial_params": null