mohammadmahdinouri commited on
Commit
1f1ffde
·
verified ·
1 Parent(s): 97682e3

Training in progress, step 7500, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a619d173a81b959c06c6819e63784a3964cf704234614a53b41a95f8c4ce423b
3
  size 487156538
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85ef9658a737c10a3387f46921cffde7ab5a025ce78736b7108c955a7faeac8a
3
  size 487156538
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc07de972c404bd9d03e65d6a0a8bb8a57f33213d57e57b51c19d276698a2990
3
  size 1059459406
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3eb6ae8fa6329d3a9b6361b73791d9116263499067936e1da0e69f22ad24064a
3
  size 1059459406
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fde34039d7b04934a891fddf8651f7147686cc194dd14ef9c544d9f194e3db54
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea728a187b8c68044774e601a8e864fe3e690cd5a58b87c25fc28b6ccafe83e8
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5d095629c4afecfa399dffed86284dc4231689f617f0e254b3490299c477dd5
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe1466545cc900a21ebf0c37b4514ebc0bde5d4f73811c54ed7c2486869a9cb1
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e6404ff16418ff06858ba815c4899c94a4c015e7870eab3f1b01051d9d511b73
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4318c358ed565a24eac21beca64897e6b4960e90756779dce640fed08ea3eccd
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b933704c82ebaf750aa2519cd157aa39099844e58ed4ac2bed0623c91353a70d
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a07179b14e6d25f457c0ab61baeffe9e5158660fa3b0e9f67490cfa9f8da1124
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cfcd8a09e8e46c589c8638cc20283a9b31e9d60ec45a6122361751489d45607f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7985ffec4b6d44038e4e914003aaad7f6fef0f867a7890a2fafa6d483c9c9580
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.01364981402128396,
6
  "eval_steps": 500,
7
- "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4908,6 +4908,356 @@
4908
  "learning_rate": 0.0004978871650560052,
4909
  "loss": 17.2572,
4910
  "step": 7000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4911
  }
4912
  ],
4913
  "logging_steps": 10,
@@ -4927,7 +5277,7 @@
4927
  "attributes": {}
4928
  }
4929
  },
4930
- "total_flos": 1.557487423730588e+19,
4931
  "train_batch_size": 48,
4932
  "trial_name": null,
4933
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.014624800737089957,
6
  "eval_steps": 500,
7
+ "global_step": 7500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4908
  "learning_rate": 0.0004978871650560052,
4909
  "loss": 17.2572,
4910
  "step": 7000
4911
+ },
4912
+ {
4913
+ "epoch": 0.01366931375560008,
4914
+ "grad_norm": 6.78125,
4915
+ "learning_rate": 0.0004978839140405506,
4916
+ "loss": 17.2407,
4917
+ "step": 7010
4918
+ },
4919
+ {
4920
+ "epoch": 0.0136888134899162,
4921
+ "grad_norm": 8.0625,
4922
+ "learning_rate": 0.0004978806630250959,
4923
+ "loss": 17.2084,
4924
+ "step": 7020
4925
+ },
4926
+ {
4927
+ "epoch": 0.01370831322423232,
4928
+ "grad_norm": 9.0,
4929
+ "learning_rate": 0.0004978774120096412,
4930
+ "loss": 17.2807,
4931
+ "step": 7030
4932
+ },
4933
+ {
4934
+ "epoch": 0.01372781295854844,
4935
+ "grad_norm": 8.25,
4936
+ "learning_rate": 0.0004978741609941866,
4937
+ "loss": 17.3416,
4938
+ "step": 7040
4939
+ },
4940
+ {
4941
+ "epoch": 0.01374731269286456,
4942
+ "grad_norm": 7.6875,
4943
+ "learning_rate": 0.0004978709099787319,
4944
+ "loss": 17.2057,
4945
+ "step": 7050
4946
+ },
4947
+ {
4948
+ "epoch": 0.01376681242718068,
4949
+ "grad_norm": 9.8125,
4950
+ "learning_rate": 0.0004978676589632772,
4951
+ "loss": 17.309,
4952
+ "step": 7060
4953
+ },
4954
+ {
4955
+ "epoch": 0.013786312161496799,
4956
+ "grad_norm": 8.5625,
4957
+ "learning_rate": 0.0004978644079478225,
4958
+ "loss": 17.2685,
4959
+ "step": 7070
4960
+ },
4961
+ {
4962
+ "epoch": 0.01380581189581292,
4963
+ "grad_norm": 8.875,
4964
+ "learning_rate": 0.0004978611569323679,
4965
+ "loss": 17.2111,
4966
+ "step": 7080
4967
+ },
4968
+ {
4969
+ "epoch": 0.013825311630129039,
4970
+ "grad_norm": 8.9375,
4971
+ "learning_rate": 0.0004978579059169132,
4972
+ "loss": 17.1583,
4973
+ "step": 7090
4974
+ },
4975
+ {
4976
+ "epoch": 0.01384481136444516,
4977
+ "grad_norm": 7.46875,
4978
+ "learning_rate": 0.0004978546549014585,
4979
+ "loss": 17.275,
4980
+ "step": 7100
4981
+ },
4982
+ {
4983
+ "epoch": 0.01386431109876128,
4984
+ "grad_norm": 7.65625,
4985
+ "learning_rate": 0.0004978514038860039,
4986
+ "loss": 17.2723,
4987
+ "step": 7110
4988
+ },
4989
+ {
4990
+ "epoch": 0.013883810833077399,
4991
+ "grad_norm": 7.5625,
4992
+ "learning_rate": 0.0004978481528705492,
4993
+ "loss": 17.181,
4994
+ "step": 7120
4995
+ },
4996
+ {
4997
+ "epoch": 0.01390331056739352,
4998
+ "grad_norm": 7.71875,
4999
+ "learning_rate": 0.0004978449018550945,
5000
+ "loss": 17.2703,
5001
+ "step": 7130
5002
+ },
5003
+ {
5004
+ "epoch": 0.013922810301709639,
5005
+ "grad_norm": 7.78125,
5006
+ "learning_rate": 0.0004978416508396397,
5007
+ "loss": 17.1655,
5008
+ "step": 7140
5009
+ },
5010
+ {
5011
+ "epoch": 0.01394231003602576,
5012
+ "grad_norm": 6.21875,
5013
+ "learning_rate": 0.0004978383998241851,
5014
+ "loss": 17.0151,
5015
+ "step": 7150
5016
+ },
5017
+ {
5018
+ "epoch": 0.01396180977034188,
5019
+ "grad_norm": 7.375,
5020
+ "learning_rate": 0.0004978351488087304,
5021
+ "loss": 17.1762,
5022
+ "step": 7160
5023
+ },
5024
+ {
5025
+ "epoch": 0.013981309504657998,
5026
+ "grad_norm": 6.09375,
5027
+ "learning_rate": 0.0004978318977932757,
5028
+ "loss": 17.1404,
5029
+ "step": 7170
5030
+ },
5031
+ {
5032
+ "epoch": 0.01400080923897412,
5033
+ "grad_norm": 7.125,
5034
+ "learning_rate": 0.000497828646777821,
5035
+ "loss": 17.2528,
5036
+ "step": 7180
5037
+ },
5038
+ {
5039
+ "epoch": 0.014020308973290239,
5040
+ "grad_norm": 7.75,
5041
+ "learning_rate": 0.0004978253957623664,
5042
+ "loss": 17.2482,
5043
+ "step": 7190
5044
+ },
5045
+ {
5046
+ "epoch": 0.014039808707606358,
5047
+ "grad_norm": 7.96875,
5048
+ "learning_rate": 0.0004978221447469117,
5049
+ "loss": 17.2275,
5050
+ "step": 7200
5051
+ },
5052
+ {
5053
+ "epoch": 0.014059308441922479,
5054
+ "grad_norm": 7.1875,
5055
+ "learning_rate": 0.000497818893731457,
5056
+ "loss": 17.215,
5057
+ "step": 7210
5058
+ },
5059
+ {
5060
+ "epoch": 0.014078808176238598,
5061
+ "grad_norm": 11.75,
5062
+ "learning_rate": 0.0004978156427160024,
5063
+ "loss": 17.1656,
5064
+ "step": 7220
5065
+ },
5066
+ {
5067
+ "epoch": 0.01409830791055472,
5068
+ "grad_norm": 7.6875,
5069
+ "learning_rate": 0.0004978123917005477,
5070
+ "loss": 17.259,
5071
+ "step": 7230
5072
+ },
5073
+ {
5074
+ "epoch": 0.014117807644870839,
5075
+ "grad_norm": 6.9375,
5076
+ "learning_rate": 0.000497809140685093,
5077
+ "loss": 17.1892,
5078
+ "step": 7240
5079
+ },
5080
+ {
5081
+ "epoch": 0.014137307379186958,
5082
+ "grad_norm": 7.3125,
5083
+ "learning_rate": 0.0004978058896696383,
5084
+ "loss": 17.158,
5085
+ "step": 7250
5086
+ },
5087
+ {
5088
+ "epoch": 0.014156807113503079,
5089
+ "grad_norm": 6.65625,
5090
+ "learning_rate": 0.0004978026386541837,
5091
+ "loss": 17.1097,
5092
+ "step": 7260
5093
+ },
5094
+ {
5095
+ "epoch": 0.014176306847819198,
5096
+ "grad_norm": 7.53125,
5097
+ "learning_rate": 0.000497799387638729,
5098
+ "loss": 17.2728,
5099
+ "step": 7270
5100
+ },
5101
+ {
5102
+ "epoch": 0.01419580658213532,
5103
+ "grad_norm": 6.21875,
5104
+ "learning_rate": 0.0004977961366232743,
5105
+ "loss": 17.218,
5106
+ "step": 7280
5107
+ },
5108
+ {
5109
+ "epoch": 0.014215306316451438,
5110
+ "grad_norm": 7.6875,
5111
+ "learning_rate": 0.0004977928856078195,
5112
+ "loss": 17.2388,
5113
+ "step": 7290
5114
+ },
5115
+ {
5116
+ "epoch": 0.014234806050767558,
5117
+ "grad_norm": 7.65625,
5118
+ "learning_rate": 0.0004977896345923649,
5119
+ "loss": 17.1839,
5120
+ "step": 7300
5121
+ },
5122
+ {
5123
+ "epoch": 0.014254305785083679,
5124
+ "grad_norm": 9.0,
5125
+ "learning_rate": 0.0004977863835769102,
5126
+ "loss": 17.0956,
5127
+ "step": 7310
5128
+ },
5129
+ {
5130
+ "epoch": 0.014273805519399798,
5131
+ "grad_norm": 6.46875,
5132
+ "learning_rate": 0.0004977831325614555,
5133
+ "loss": 17.2038,
5134
+ "step": 7320
5135
+ },
5136
+ {
5137
+ "epoch": 0.014293305253715917,
5138
+ "grad_norm": 6.1875,
5139
+ "learning_rate": 0.0004977798815460009,
5140
+ "loss": 17.2453,
5141
+ "step": 7330
5142
+ },
5143
+ {
5144
+ "epoch": 0.014312804988032038,
5145
+ "grad_norm": 6.125,
5146
+ "learning_rate": 0.0004977766305305462,
5147
+ "loss": 17.2322,
5148
+ "step": 7340
5149
+ },
5150
+ {
5151
+ "epoch": 0.014332304722348158,
5152
+ "grad_norm": 7.46875,
5153
+ "learning_rate": 0.0004977733795150915,
5154
+ "loss": 17.1341,
5155
+ "step": 7350
5156
+ },
5157
+ {
5158
+ "epoch": 0.014351804456664279,
5159
+ "grad_norm": 7.53125,
5160
+ "learning_rate": 0.0004977701284996368,
5161
+ "loss": 17.2389,
5162
+ "step": 7360
5163
+ },
5164
+ {
5165
+ "epoch": 0.014371304190980398,
5166
+ "grad_norm": 7.4375,
5167
+ "learning_rate": 0.0004977668774841822,
5168
+ "loss": 17.2391,
5169
+ "step": 7370
5170
+ },
5171
+ {
5172
+ "epoch": 0.014390803925296517,
5173
+ "grad_norm": 7.8125,
5174
+ "learning_rate": 0.0004977636264687275,
5175
+ "loss": 17.1651,
5176
+ "step": 7380
5177
+ },
5178
+ {
5179
+ "epoch": 0.014410303659612638,
5180
+ "grad_norm": 7.09375,
5181
+ "learning_rate": 0.0004977603754532728,
5182
+ "loss": 17.2448,
5183
+ "step": 7390
5184
+ },
5185
+ {
5186
+ "epoch": 0.014429803393928757,
5187
+ "grad_norm": 7.28125,
5188
+ "learning_rate": 0.0004977571244378182,
5189
+ "loss": 17.168,
5190
+ "step": 7400
5191
+ },
5192
+ {
5193
+ "epoch": 0.014449303128244878,
5194
+ "grad_norm": 7.71875,
5195
+ "learning_rate": 0.0004977538734223635,
5196
+ "loss": 17.2225,
5197
+ "step": 7410
5198
+ },
5199
+ {
5200
+ "epoch": 0.014468802862560998,
5201
+ "grad_norm": 9.0,
5202
+ "learning_rate": 0.0004977506224069088,
5203
+ "loss": 17.1799,
5204
+ "step": 7420
5205
+ },
5206
+ {
5207
+ "epoch": 0.014488302596877117,
5208
+ "grad_norm": 7.25,
5209
+ "learning_rate": 0.0004977473713914541,
5210
+ "loss": 17.1958,
5211
+ "step": 7430
5212
+ },
5213
+ {
5214
+ "epoch": 0.014507802331193238,
5215
+ "grad_norm": 6.5,
5216
+ "learning_rate": 0.0004977441203759995,
5217
+ "loss": 17.1678,
5218
+ "step": 7440
5219
+ },
5220
+ {
5221
+ "epoch": 0.014527302065509357,
5222
+ "grad_norm": 7.3125,
5223
+ "learning_rate": 0.0004977408693605448,
5224
+ "loss": 17.1689,
5225
+ "step": 7450
5226
+ },
5227
+ {
5228
+ "epoch": 0.014546801799825477,
5229
+ "grad_norm": 6.625,
5230
+ "learning_rate": 0.0004977376183450901,
5231
+ "loss": 17.1743,
5232
+ "step": 7460
5233
+ },
5234
+ {
5235
+ "epoch": 0.014566301534141598,
5236
+ "grad_norm": 7.125,
5237
+ "learning_rate": 0.0004977343673296354,
5238
+ "loss": 17.2665,
5239
+ "step": 7470
5240
+ },
5241
+ {
5242
+ "epoch": 0.014585801268457717,
5243
+ "grad_norm": 8.5,
5244
+ "learning_rate": 0.0004977311163141807,
5245
+ "loss": 17.1666,
5246
+ "step": 7480
5247
+ },
5248
+ {
5249
+ "epoch": 0.014605301002773838,
5250
+ "grad_norm": 7.0625,
5251
+ "learning_rate": 0.000497727865298726,
5252
+ "loss": 17.1159,
5253
+ "step": 7490
5254
+ },
5255
+ {
5256
+ "epoch": 0.014624800737089957,
5257
+ "grad_norm": 7.625,
5258
+ "learning_rate": 0.0004977246142832713,
5259
+ "loss": 17.2182,
5260
+ "step": 7500
5261
  }
5262
  ],
5263
  "logging_steps": 10,
 
5277
  "attributes": {}
5278
  }
5279
  },
5280
+ "total_flos": 1.6687308508994994e+19,
5281
  "train_batch_size": 48,
5282
  "trial_name": null,
5283
  "trial_params": null