Wilsonwin commited on
Commit
91d46a6
·
verified ·
1 Parent(s): dfc058e

Training in progress, step 7500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c76cf6da384756da592c5c50d0169c8d71834422387d445a1df34766e643d9bb
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c46250bd52dcffd2137953f30321a4ed3d622b1bca6be15bc5f8f084e4fc31f
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:68bf75b9ec958e92f211abfe579212c04190172b49bdfbcf136eea9aae980133
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71e649d70bbaea3f3c60f2aa0818a879521dffa0038d58ed1695489f8bca966b
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:012c45d165b3369856a4591817420a71a07d3d2cd37f890b655313517015a2fd
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7105421fba4235e8fc90f3dbc4569b85e884f75c3232217a25f8f5042cf8247a
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c87a18ccc821b756f8fecf0a1e33873b3617702f02d6f52c0042644b36bee0d
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1a5b64fb90c999b23793906d64020914f128f72d1523c4f0f8e8ea53ab2425c
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.1826322013853692,
6
  "eval_steps": 500,
7
- "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5027,6 +5027,364 @@
5027
  "eval_samples_per_second": 274.439,
5028
  "eval_steps_per_second": 5.763,
5029
  "step": 7000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5030
  }
5031
  ],
5032
  "logging_steps": 10,
@@ -5046,7 +5404,7 @@
5046
  "attributes": {}
5047
  }
5048
  },
5049
- "total_flos": 2.3411768424608563e+17,
5050
  "train_batch_size": 48,
5051
  "trial_name": null,
5052
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2671059300557528,
6
  "eval_steps": 500,
7
+ "global_step": 7500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5027
  "eval_samples_per_second": 274.439,
5028
  "eval_steps_per_second": 5.763,
5029
  "step": 7000
5030
+ },
5031
+ {
5032
+ "epoch": 1.184321675958777,
5033
+ "grad_norm": 0.4751238524913788,
5034
+ "learning_rate": 0.000145689605353268,
5035
+ "loss": 4.423115158081055,
5036
+ "step": 7010
5037
+ },
5038
+ {
5039
+ "epoch": 1.1860111505321844,
5040
+ "grad_norm": 0.5007295608520508,
5041
+ "learning_rate": 0.00014521082727641152,
5042
+ "loss": 4.394451522827149,
5043
+ "step": 7020
5044
+ },
5045
+ {
5046
+ "epoch": 1.1877006251055922,
5047
+ "grad_norm": 0.48377716541290283,
5048
+ "learning_rate": 0.0001447320980362472,
5049
+ "loss": 4.4401298522949215,
5050
+ "step": 7030
5051
+ },
5052
+ {
5053
+ "epoch": 1.189390099679,
5054
+ "grad_norm": 0.5163672566413879,
5055
+ "learning_rate": 0.00014425342251452679,
5056
+ "loss": 4.39234504699707,
5057
+ "step": 7040
5058
+ },
5059
+ {
5060
+ "epoch": 1.1910795742524076,
5061
+ "grad_norm": 0.4772108793258667,
5062
+ "learning_rate": 0.00014377480559245434,
5063
+ "loss": 4.4317176818847654,
5064
+ "step": 7050
5065
+ },
5066
+ {
5067
+ "epoch": 1.1927690488258151,
5068
+ "grad_norm": 0.5122771263122559,
5069
+ "learning_rate": 0.00014329625215063629,
5070
+ "loss": 4.436024856567383,
5071
+ "step": 7060
5072
+ },
5073
+ {
5074
+ "epoch": 1.1944585233992229,
5075
+ "grad_norm": 0.4967247545719147,
5076
+ "learning_rate": 0.00014281776706903177,
5077
+ "loss": 4.399545288085937,
5078
+ "step": 7070
5079
+ },
5080
+ {
5081
+ "epoch": 1.1961479979726306,
5082
+ "grad_norm": 0.5011969208717346,
5083
+ "learning_rate": 0.0001423393552269028,
5084
+ "loss": 4.415679550170898,
5085
+ "step": 7080
5086
+ },
5087
+ {
5088
+ "epoch": 1.1978374725460381,
5089
+ "grad_norm": 0.5074766874313354,
5090
+ "learning_rate": 0.00014186102150276454,
5091
+ "loss": 4.4365581512451175,
5092
+ "step": 7090
5093
+ },
5094
+ {
5095
+ "epoch": 1.1995269471194459,
5096
+ "grad_norm": 0.48712876439094543,
5097
+ "learning_rate": 0.00014138277077433567,
5098
+ "loss": 4.423890686035156,
5099
+ "step": 7100
5100
+ },
5101
+ {
5102
+ "epoch": 1.2012164216928536,
5103
+ "grad_norm": 0.5180738568305969,
5104
+ "learning_rate": 0.00014090460791848827,
5105
+ "loss": 4.416677474975586,
5106
+ "step": 7110
5107
+ },
5108
+ {
5109
+ "epoch": 1.202905896266261,
5110
+ "grad_norm": 0.45985448360443115,
5111
+ "learning_rate": 0.00014042653781119868,
5112
+ "loss": 4.396572494506836,
5113
+ "step": 7120
5114
+ },
5115
+ {
5116
+ "epoch": 1.2045953708396688,
5117
+ "grad_norm": 0.4647870361804962,
5118
+ "learning_rate": 0.0001399485653274973,
5119
+ "loss": 4.408332824707031,
5120
+ "step": 7130
5121
+ },
5122
+ {
5123
+ "epoch": 1.2062848454130766,
5124
+ "grad_norm": 0.47191551327705383,
5125
+ "learning_rate": 0.00013947069534141904,
5126
+ "loss": 4.4233543395996096,
5127
+ "step": 7140
5128
+ },
5129
+ {
5130
+ "epoch": 1.207974319986484,
5131
+ "grad_norm": 0.481503427028656,
5132
+ "learning_rate": 0.00013899293272595355,
5133
+ "loss": 4.440040969848633,
5134
+ "step": 7150
5135
+ },
5136
+ {
5137
+ "epoch": 1.2096637945598918,
5138
+ "grad_norm": 0.47193533182144165,
5139
+ "learning_rate": 0.0001385152823529957,
5140
+ "loss": 4.411207962036133,
5141
+ "step": 7160
5142
+ },
5143
+ {
5144
+ "epoch": 1.2113532691332995,
5145
+ "grad_norm": 0.4883231520652771,
5146
+ "learning_rate": 0.00013803774909329567,
5147
+ "loss": 4.404615020751953,
5148
+ "step": 7170
5149
+ },
5150
+ {
5151
+ "epoch": 1.2130427437067073,
5152
+ "grad_norm": 0.47556227445602417,
5153
+ "learning_rate": 0.0001375603378164095,
5154
+ "loss": 4.381325149536133,
5155
+ "step": 7180
5156
+ },
5157
+ {
5158
+ "epoch": 1.2147322182801148,
5159
+ "grad_norm": 0.47802919149398804,
5160
+ "learning_rate": 0.00013708305339064933,
5161
+ "loss": 4.414505386352539,
5162
+ "step": 7190
5163
+ },
5164
+ {
5165
+ "epoch": 1.2164216928535225,
5166
+ "grad_norm": 0.4807351529598236,
5167
+ "learning_rate": 0.00013660590068303373,
5168
+ "loss": 4.441463470458984,
5169
+ "step": 7200
5170
+ },
5171
+ {
5172
+ "epoch": 1.2181111674269303,
5173
+ "grad_norm": 0.5178191065788269,
5174
+ "learning_rate": 0.00013612888455923804,
5175
+ "loss": 4.416116714477539,
5176
+ "step": 7210
5177
+ },
5178
+ {
5179
+ "epoch": 1.219800642000338,
5180
+ "grad_norm": 0.48551276326179504,
5181
+ "learning_rate": 0.0001356520098835449,
5182
+ "loss": 4.4417884826660154,
5183
+ "step": 7220
5184
+ },
5185
+ {
5186
+ "epoch": 1.2214901165737455,
5187
+ "grad_norm": 0.4903479218482971,
5188
+ "learning_rate": 0.00013517528151879457,
5189
+ "loss": 4.411055374145508,
5190
+ "step": 7230
5191
+ },
5192
+ {
5193
+ "epoch": 1.2231795911471532,
5194
+ "grad_norm": 0.475599080324173,
5195
+ "learning_rate": 0.0001346987043263352,
5196
+ "loss": 4.41358642578125,
5197
+ "step": 7240
5198
+ },
5199
+ {
5200
+ "epoch": 1.224869065720561,
5201
+ "grad_norm": 0.5013647079467773,
5202
+ "learning_rate": 0.00013422228316597356,
5203
+ "loss": 4.4313819885253904,
5204
+ "step": 7250
5205
+ },
5206
+ {
5207
+ "epoch": 1.2265585402939685,
5208
+ "grad_norm": 0.4796617031097412,
5209
+ "learning_rate": 0.00013374602289592508,
5210
+ "loss": 4.431335830688477,
5211
+ "step": 7260
5212
+ },
5213
+ {
5214
+ "epoch": 1.2282480148673762,
5215
+ "grad_norm": 0.5207120180130005,
5216
+ "learning_rate": 0.00013326992837276494,
5217
+ "loss": 4.4034477233886715,
5218
+ "step": 7270
5219
+ },
5220
+ {
5221
+ "epoch": 1.229937489440784,
5222
+ "grad_norm": 0.48880499601364136,
5223
+ "learning_rate": 0.0001327940044513777,
5224
+ "loss": 4.412507629394531,
5225
+ "step": 7280
5226
+ },
5227
+ {
5228
+ "epoch": 1.2316269640141915,
5229
+ "grad_norm": 0.4934345483779907,
5230
+ "learning_rate": 0.00013231825598490854,
5231
+ "loss": 4.4090087890625,
5232
+ "step": 7290
5233
+ },
5234
+ {
5235
+ "epoch": 1.2333164385875992,
5236
+ "grad_norm": 0.4781767427921295,
5237
+ "learning_rate": 0.0001318426878247133,
5238
+ "loss": 4.418943786621094,
5239
+ "step": 7300
5240
+ },
5241
+ {
5242
+ "epoch": 1.235005913161007,
5243
+ "grad_norm": 0.5096566081047058,
5244
+ "learning_rate": 0.00013136730482030928,
5245
+ "loss": 4.422417449951172,
5246
+ "step": 7310
5247
+ },
5248
+ {
5249
+ "epoch": 1.2366953877344147,
5250
+ "grad_norm": 0.4950323700904846,
5251
+ "learning_rate": 0.0001308921118193257,
5252
+ "loss": 4.429974746704102,
5253
+ "step": 7320
5254
+ },
5255
+ {
5256
+ "epoch": 1.2383848623078222,
5257
+ "grad_norm": 0.513830304145813,
5258
+ "learning_rate": 0.00013041711366745408,
5259
+ "loss": 4.422944641113281,
5260
+ "step": 7330
5261
+ },
5262
+ {
5263
+ "epoch": 1.24007433688123,
5264
+ "grad_norm": 0.48782795667648315,
5265
+ "learning_rate": 0.00012994231520839934,
5266
+ "loss": 4.427762222290039,
5267
+ "step": 7340
5268
+ },
5269
+ {
5270
+ "epoch": 1.2417638114546377,
5271
+ "grad_norm": 0.4940618872642517,
5272
+ "learning_rate": 0.0001294677212838297,
5273
+ "loss": 4.397305297851562,
5274
+ "step": 7350
5275
+ },
5276
+ {
5277
+ "epoch": 1.2434532860280454,
5278
+ "grad_norm": 0.47247400879859924,
5279
+ "learning_rate": 0.00012899333673332795,
5280
+ "loss": 4.438782501220703,
5281
+ "step": 7360
5282
+ },
5283
+ {
5284
+ "epoch": 1.245142760601453,
5285
+ "grad_norm": 0.508912205696106,
5286
+ "learning_rate": 0.00012851916639434164,
5287
+ "loss": 4.381727600097657,
5288
+ "step": 7370
5289
+ },
5290
+ {
5291
+ "epoch": 1.2468322351748606,
5292
+ "grad_norm": 0.47959408164024353,
5293
+ "learning_rate": 0.00012804521510213407,
5294
+ "loss": 4.4016876220703125,
5295
+ "step": 7380
5296
+ },
5297
+ {
5298
+ "epoch": 1.2485217097482684,
5299
+ "grad_norm": 0.4996829330921173,
5300
+ "learning_rate": 0.00012757148768973483,
5301
+ "loss": 4.403145217895508,
5302
+ "step": 7390
5303
+ },
5304
+ {
5305
+ "epoch": 1.2502111843216759,
5306
+ "grad_norm": 0.4780764877796173,
5307
+ "learning_rate": 0.00012709798898789042,
5308
+ "loss": 4.438173294067383,
5309
+ "step": 7400
5310
+ },
5311
+ {
5312
+ "epoch": 1.2519006588950836,
5313
+ "grad_norm": 0.4783228039741516,
5314
+ "learning_rate": 0.00012662472382501524,
5315
+ "loss": 4.408795928955078,
5316
+ "step": 7410
5317
+ },
5318
+ {
5319
+ "epoch": 1.2535901334684914,
5320
+ "grad_norm": 0.5030134320259094,
5321
+ "learning_rate": 0.0001261516970271422,
5322
+ "loss": 4.420645523071289,
5323
+ "step": 7420
5324
+ },
5325
+ {
5326
+ "epoch": 1.2552796080418989,
5327
+ "grad_norm": 0.4808700978755951,
5328
+ "learning_rate": 0.0001256789134178735,
5329
+ "loss": 4.436854553222656,
5330
+ "step": 7430
5331
+ },
5332
+ {
5333
+ "epoch": 1.2569690826153066,
5334
+ "grad_norm": 0.493521511554718,
5335
+ "learning_rate": 0.00012520637781833144,
5336
+ "loss": 4.406418991088867,
5337
+ "step": 7440
5338
+ },
5339
+ {
5340
+ "epoch": 1.2586585571887143,
5341
+ "grad_norm": 0.4652424454689026,
5342
+ "learning_rate": 0.0001247340950471094,
5343
+ "loss": 4.417532348632813,
5344
+ "step": 7450
5345
+ },
5346
+ {
5347
+ "epoch": 1.2603480317621218,
5348
+ "grad_norm": 0.5171105265617371,
5349
+ "learning_rate": 0.0001242620699202224,
5350
+ "loss": 4.399831390380859,
5351
+ "step": 7460
5352
+ },
5353
+ {
5354
+ "epoch": 1.2620375063355296,
5355
+ "grad_norm": 0.49303922057151794,
5356
+ "learning_rate": 0.00012379030725105837,
5357
+ "loss": 4.401795959472656,
5358
+ "step": 7470
5359
+ },
5360
+ {
5361
+ "epoch": 1.2637269809089373,
5362
+ "grad_norm": 0.501957893371582,
5363
+ "learning_rate": 0.00012331881185032872,
5364
+ "loss": 4.388158798217773,
5365
+ "step": 7480
5366
+ },
5367
+ {
5368
+ "epoch": 1.265416455482345,
5369
+ "grad_norm": 0.5417965650558472,
5370
+ "learning_rate": 0.00012284758852601962,
5371
+ "loss": 4.410275650024414,
5372
+ "step": 7490
5373
+ },
5374
+ {
5375
+ "epoch": 1.2671059300557528,
5376
+ "grad_norm": 0.5049046277999878,
5377
+ "learning_rate": 0.00012237664208334263,
5378
+ "loss": 4.402982330322265,
5379
+ "step": 7500
5380
+ },
5381
+ {
5382
+ "epoch": 1.2671059300557528,
5383
+ "eval_loss": 4.3625264167785645,
5384
+ "eval_runtime": 3.7028,
5385
+ "eval_samples_per_second": 270.066,
5386
+ "eval_steps_per_second": 5.671,
5387
+ "step": 7500
5388
  }
5389
  ],
5390
  "logging_steps": 10,
 
5404
  "attributes": {}
5405
  }
5406
  },
5407
+ "total_flos": 2.5084052528902963e+17,
5408
  "train_batch_size": 48,
5409
  "trial_name": null,
5410
  "trial_params": null