Wilsonwin commited on
Commit
2553ef9
·
verified ·
1 Parent(s): af54ba3

Training in progress, step 8500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a5b1264363800d835097d941071eaf668b648591456cb18035122aa338a30b9
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:663d31a8b6ad2423dc3c0b8759bef8029d3f5914e7b173b5be641f54497bab8c
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bfe3532ddb10671229c77a55f85cca973229a308c2faa98d60ea12da855a7153
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c93129eb07b3c389c642dd3ac521458eb6b0b8b0b4b6634a4a4ec236e73b73dd
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:16ef5699c401ab357753367766bad7490c0997d4f3cbc8e6689c7f21d470f2f2
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95d6f8a42fc11a5f0262b0c737f666f824322b1b030452310cca3fb10ffef9ad
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:578bef6269d270c9ba7be042609ff28604e2fee3538e234c365c9aa652e62f33
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cda9bcc9266ec91d2da20eab50cd7cea609c16666645a54519c40bab7f69f1a
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.3515796587261362,
6
  "eval_steps": 500,
7
- "global_step": 8000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5743,6 +5743,364 @@
5743
  "eval_samples_per_second": 267.812,
5744
  "eval_steps_per_second": 5.624,
5745
  "step": 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5746
  }
5747
  ],
5748
  "logging_steps": 10,
@@ -5762,7 +6120,7 @@
5762
  "attributes": {}
5763
  }
5764
  },
5765
- "total_flos": 2.6756336633197363e+17,
5766
  "train_batch_size": 48,
5767
  "trial_name": null,
5768
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.4360533873965196,
6
  "eval_steps": 500,
7
+ "global_step": 8500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5743
  "eval_samples_per_second": 267.812,
5744
  "eval_steps_per_second": 5.624,
5745
  "step": 8000
5746
+ },
5747
+ {
5748
+ "epoch": 1.353269133299544,
5749
+ "grad_norm": 0.463106632232666,
5750
+ "learning_rate": 9.883703849181374e-05,
5751
+ "loss": 4.368831634521484,
5752
+ "step": 8010
5753
+ },
5754
+ {
5755
+ "epoch": 1.3549586078729514,
5756
+ "grad_norm": 0.4774092137813568,
5757
+ "learning_rate": 9.838702606099289e-05,
5758
+ "loss": 4.350126647949219,
5759
+ "step": 8020
5760
+ },
5761
+ {
5762
+ "epoch": 1.3566480824463591,
5763
+ "grad_norm": 0.5083175897598267,
5764
+ "learning_rate": 9.793753994381003e-05,
5765
+ "loss": 4.375761032104492,
5766
+ "step": 8030
5767
+ },
5768
+ {
5769
+ "epoch": 1.3583375570197669,
5770
+ "grad_norm": 0.493473619222641,
5771
+ "learning_rate": 9.748858472381567e-05,
5772
+ "loss": 4.382857894897461,
5773
+ "step": 8040
5774
+ },
5775
+ {
5776
+ "epoch": 1.3600270315931744,
5777
+ "grad_norm": 0.47200217843055725,
5778
+ "learning_rate": 9.704016497914657e-05,
5779
+ "loss": 4.363901901245117,
5780
+ "step": 8050
5781
+ },
5782
+ {
5783
+ "epoch": 1.3617165061665821,
5784
+ "grad_norm": 0.47423017024993896,
5785
+ "learning_rate": 9.659228528247923e-05,
5786
+ "loss": 4.352508544921875,
5787
+ "step": 8060
5788
+ },
5789
+ {
5790
+ "epoch": 1.3634059807399899,
5791
+ "grad_norm": 0.49032631516456604,
5792
+ "learning_rate": 9.614495020098284e-05,
5793
+ "loss": 4.386605834960937,
5794
+ "step": 8070
5795
+ },
5796
+ {
5797
+ "epoch": 1.3650954553133976,
5798
+ "grad_norm": 0.5129415392875671,
5799
+ "learning_rate": 9.569816429627329e-05,
5800
+ "loss": 4.370170211791992,
5801
+ "step": 8080
5802
+ },
5803
+ {
5804
+ "epoch": 1.3667849298868053,
5805
+ "grad_norm": 0.47328782081604004,
5806
+ "learning_rate": 9.525193212436607e-05,
5807
+ "loss": 4.394309616088867,
5808
+ "step": 8090
5809
+ },
5810
+ {
5811
+ "epoch": 1.3684744044602128,
5812
+ "grad_norm": 0.5091307759284973,
5813
+ "learning_rate": 9.480625823563032e-05,
5814
+ "loss": 4.353821182250977,
5815
+ "step": 8100
5816
+ },
5817
+ {
5818
+ "epoch": 1.3701638790336206,
5819
+ "grad_norm": 0.49530673027038574,
5820
+ "learning_rate": 9.436114717474197e-05,
5821
+ "loss": 4.374178314208985,
5822
+ "step": 8110
5823
+ },
5824
+ {
5825
+ "epoch": 1.3718533536070283,
5826
+ "grad_norm": 0.5062808394432068,
5827
+ "learning_rate": 9.391660348063778e-05,
5828
+ "loss": 4.366446685791016,
5829
+ "step": 8120
5830
+ },
5831
+ {
5832
+ "epoch": 1.3735428281804358,
5833
+ "grad_norm": 0.4893403947353363,
5834
+ "learning_rate": 9.347263168646881e-05,
5835
+ "loss": 4.377128601074219,
5836
+ "step": 8130
5837
+ },
5838
+ {
5839
+ "epoch": 1.3752323027538436,
5840
+ "grad_norm": 0.49352315068244934,
5841
+ "learning_rate": 9.30292363195543e-05,
5842
+ "loss": 4.390756988525391,
5843
+ "step": 8140
5844
+ },
5845
+ {
5846
+ "epoch": 1.3769217773272513,
5847
+ "grad_norm": 0.4956866502761841,
5848
+ "learning_rate": 9.258642190133548e-05,
5849
+ "loss": 4.364201354980469,
5850
+ "step": 8150
5851
+ },
5852
+ {
5853
+ "epoch": 1.3786112519006588,
5854
+ "grad_norm": 0.4806705415248871,
5855
+ "learning_rate": 9.21441929473295e-05,
5856
+ "loss": 4.336410140991211,
5857
+ "step": 8160
5858
+ },
5859
+ {
5860
+ "epoch": 1.3803007264740665,
5861
+ "grad_norm": 0.503070056438446,
5862
+ "learning_rate": 9.170255396708336e-05,
5863
+ "loss": 4.363087463378906,
5864
+ "step": 8170
5865
+ },
5866
+ {
5867
+ "epoch": 1.3819902010474743,
5868
+ "grad_norm": 0.4839601218700409,
5869
+ "learning_rate": 9.126150946412775e-05,
5870
+ "loss": 4.353903961181641,
5871
+ "step": 8180
5872
+ },
5873
+ {
5874
+ "epoch": 1.3836796756208818,
5875
+ "grad_norm": 0.4867366552352905,
5876
+ "learning_rate": 9.082106393593153e-05,
5877
+ "loss": 4.347708892822266,
5878
+ "step": 8190
5879
+ },
5880
+ {
5881
+ "epoch": 1.3853691501942895,
5882
+ "grad_norm": 0.4875339865684509,
5883
+ "learning_rate": 9.038122187385543e-05,
5884
+ "loss": 4.371865844726562,
5885
+ "step": 8200
5886
+ },
5887
+ {
5888
+ "epoch": 1.3870586247676973,
5889
+ "grad_norm": 0.49478384852409363,
5890
+ "learning_rate": 8.994198776310652e-05,
5891
+ "loss": 4.368743133544922,
5892
+ "step": 8210
5893
+ },
5894
+ {
5895
+ "epoch": 1.388748099341105,
5896
+ "grad_norm": 0.4815446734428406,
5897
+ "learning_rate": 8.950336608269243e-05,
5898
+ "loss": 4.383320999145508,
5899
+ "step": 8220
5900
+ },
5901
+ {
5902
+ "epoch": 1.3904375739145125,
5903
+ "grad_norm": 0.4883415997028351,
5904
+ "learning_rate": 8.906536130537566e-05,
5905
+ "loss": 4.368521881103516,
5906
+ "step": 8230
5907
+ },
5908
+ {
5909
+ "epoch": 1.3921270484879202,
5910
+ "grad_norm": 0.5107654929161072,
5911
+ "learning_rate": 8.862797789762785e-05,
5912
+ "loss": 4.353972244262695,
5913
+ "step": 8240
5914
+ },
5915
+ {
5916
+ "epoch": 1.393816523061328,
5917
+ "grad_norm": 0.46853381395339966,
5918
+ "learning_rate": 8.819122031958446e-05,
5919
+ "loss": 4.374198150634766,
5920
+ "step": 8250
5921
+ },
5922
+ {
5923
+ "epoch": 1.3955059976347357,
5924
+ "grad_norm": 0.49264970421791077,
5925
+ "learning_rate": 8.77550930249991e-05,
5926
+ "loss": 4.353750228881836,
5927
+ "step": 8260
5928
+ },
5929
+ {
5930
+ "epoch": 1.3971954722081432,
5931
+ "grad_norm": 0.49197956919670105,
5932
+ "learning_rate": 8.731960046119819e-05,
5933
+ "loss": 4.378075408935547,
5934
+ "step": 8270
5935
+ },
5936
+ {
5937
+ "epoch": 1.398884946781551,
5938
+ "grad_norm": 0.48225274682044983,
5939
+ "learning_rate": 8.688474706903554e-05,
5940
+ "loss": 4.360022735595703,
5941
+ "step": 8280
5942
+ },
5943
+ {
5944
+ "epoch": 1.4005744213549587,
5945
+ "grad_norm": 0.4796869456768036,
5946
+ "learning_rate": 8.645053728284734e-05,
5947
+ "loss": 4.351276779174805,
5948
+ "step": 8290
5949
+ },
5950
+ {
5951
+ "epoch": 1.4022638959283662,
5952
+ "grad_norm": 0.46706125140190125,
5953
+ "learning_rate": 8.601697553040645e-05,
5954
+ "loss": 4.367401885986328,
5955
+ "step": 8300
5956
+ },
5957
+ {
5958
+ "epoch": 1.403953370501774,
5959
+ "grad_norm": 0.4695565104484558,
5960
+ "learning_rate": 8.55840662328778e-05,
5961
+ "loss": 4.338150405883789,
5962
+ "step": 8310
5963
+ },
5964
+ {
5965
+ "epoch": 1.4056428450751817,
5966
+ "grad_norm": 0.4987981915473938,
5967
+ "learning_rate": 8.515181380477273e-05,
5968
+ "loss": 4.369682693481446,
5969
+ "step": 8320
5970
+ },
5971
+ {
5972
+ "epoch": 1.4073323196485892,
5973
+ "grad_norm": 0.4853006899356842,
5974
+ "learning_rate": 8.47202226539046e-05,
5975
+ "loss": 4.392825698852539,
5976
+ "step": 8330
5977
+ },
5978
+ {
5979
+ "epoch": 1.409021794221997,
5980
+ "grad_norm": 0.48891976475715637,
5981
+ "learning_rate": 8.428929718134331e-05,
5982
+ "loss": 4.3820442199707035,
5983
+ "step": 8340
5984
+ },
5985
+ {
5986
+ "epoch": 1.4107112687954046,
5987
+ "grad_norm": 0.48374229669570923,
5988
+ "learning_rate": 8.385904178137061e-05,
5989
+ "loss": 4.367736053466797,
5990
+ "step": 8350
5991
+ },
5992
+ {
5993
+ "epoch": 1.4124007433688122,
5994
+ "grad_norm": 0.4966294765472412,
5995
+ "learning_rate": 8.342946084143546e-05,
5996
+ "loss": 4.336813354492188,
5997
+ "step": 8360
5998
+ },
5999
+ {
6000
+ "epoch": 1.41409021794222,
6001
+ "grad_norm": 0.4939606487751007,
6002
+ "learning_rate": 8.300055874210903e-05,
6003
+ "loss": 4.390798568725586,
6004
+ "step": 8370
6005
+ },
6006
+ {
6007
+ "epoch": 1.4157796925156276,
6008
+ "grad_norm": 0.48403191566467285,
6009
+ "learning_rate": 8.257233985704021e-05,
6010
+ "loss": 4.3521678924560545,
6011
+ "step": 8380
6012
+ },
6013
+ {
6014
+ "epoch": 1.4174691670890354,
6015
+ "grad_norm": 0.4766407012939453,
6016
+ "learning_rate": 8.214480855291084e-05,
6017
+ "loss": 4.337980651855469,
6018
+ "step": 8390
6019
+ },
6020
+ {
6021
+ "epoch": 1.419158641662443,
6022
+ "grad_norm": 0.469018816947937,
6023
+ "learning_rate": 8.171796918939142e-05,
6024
+ "loss": 4.341955184936523,
6025
+ "step": 8400
6026
+ },
6027
+ {
6028
+ "epoch": 1.4208481162358506,
6029
+ "grad_norm": 0.4855271875858307,
6030
+ "learning_rate": 8.129182611909642e-05,
6031
+ "loss": 4.353343963623047,
6032
+ "step": 8410
6033
+ },
6034
+ {
6035
+ "epoch": 1.4225375908092583,
6036
+ "grad_norm": 0.4870193898677826,
6037
+ "learning_rate": 8.086638368753993e-05,
6038
+ "loss": 4.374142074584961,
6039
+ "step": 8420
6040
+ },
6041
+ {
6042
+ "epoch": 1.424227065382666,
6043
+ "grad_norm": 0.4896891415119171,
6044
+ "learning_rate": 8.04416462330916e-05,
6045
+ "loss": 4.367203140258789,
6046
+ "step": 8430
6047
+ },
6048
+ {
6049
+ "epoch": 1.4259165399560736,
6050
+ "grad_norm": 0.46844348311424255,
6051
+ "learning_rate": 8.0017618086932e-05,
6052
+ "loss": 4.35595817565918,
6053
+ "step": 8440
6054
+ },
6055
+ {
6056
+ "epoch": 1.4276060145294813,
6057
+ "grad_norm": 0.4512944519519806,
6058
+ "learning_rate": 7.959430357300885e-05,
6059
+ "loss": 4.3400733947753904,
6060
+ "step": 8450
6061
+ },
6062
+ {
6063
+ "epoch": 1.429295489102889,
6064
+ "grad_norm": 0.4732443392276764,
6065
+ "learning_rate": 7.917170700799256e-05,
6066
+ "loss": 4.333652114868164,
6067
+ "step": 8460
6068
+ },
6069
+ {
6070
+ "epoch": 1.4309849636762966,
6071
+ "grad_norm": 0.4684848487377167,
6072
+ "learning_rate": 7.874983270123254e-05,
6073
+ "loss": 4.352918243408203,
6074
+ "step": 8470
6075
+ },
6076
+ {
6077
+ "epoch": 1.4326744382497043,
6078
+ "grad_norm": 0.506878137588501,
6079
+ "learning_rate": 7.832868495471306e-05,
6080
+ "loss": 4.357436752319336,
6081
+ "step": 8480
6082
+ },
6083
+ {
6084
+ "epoch": 1.434363912823112,
6085
+ "grad_norm": 0.5020336508750916,
6086
+ "learning_rate": 7.790826806300928e-05,
6087
+ "loss": 4.359925079345703,
6088
+ "step": 8490
6089
+ },
6090
+ {
6091
+ "epoch": 1.4360533873965196,
6092
+ "grad_norm": 0.4732269048690796,
6093
+ "learning_rate": 7.748858631324393e-05,
6094
+ "loss": 4.356634902954101,
6095
+ "step": 8500
6096
+ },
6097
+ {
6098
+ "epoch": 1.4360533873965196,
6099
+ "eval_loss": 4.3328938484191895,
6100
+ "eval_runtime": 3.6888,
6101
+ "eval_samples_per_second": 271.089,
6102
+ "eval_steps_per_second": 5.693,
6103
+ "step": 8500
6104
  }
6105
  ],
6106
  "logging_steps": 10,
 
6120
  "attributes": {}
6121
  }
6122
  },
6123
+ "total_flos": 2.8428620737491763e+17,
6124
  "train_batch_size": 48,
6125
  "trial_name": null,
6126
  "trial_params": null