Wilsonwin commited on
Commit
ea91019
·
verified ·
1 Parent(s): 66c98a4

Training in progress, step 7500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34c8f104effe1a88e833bb692c7b75c569bc83b156fc0482dcf0ed735fda2945
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:152c34cc1ef8eea86d84f7b0351d9f983b40e24507e8054571349aacd4aba343
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:26c334859cc6eb4b1ef4006976a7f325a89208371148b26da8caf2a6573930ff
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8012a529a81b3f92efa4c79d19d5460d546f7ff16907210ecdb6456891de9745
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2f6f65c0c5e2316b09e8cb46abab96e8f2ae754bdffd662e804a33277263cd9
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b50695bbf99bef39c4d13662a35b1f845a2b2c6b19490939ad9cc39127e32ab1
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3c87a18ccc821b756f8fecf0a1e33873b3617702f02d6f52c0042644b36bee0d
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1a5b64fb90c999b23793906d64020914f128f72d1523c4f0f8e8ea53ab2425c
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.1826322013853692,
6
  "eval_steps": 500,
7
- "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5027,6 +5027,364 @@
5027
  "eval_samples_per_second": 245.6,
5028
  "eval_steps_per_second": 5.158,
5029
  "step": 7000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5030
  }
5031
  ],
5032
  "logging_steps": 10,
@@ -5046,7 +5404,7 @@
5046
  "attributes": {}
5047
  }
5048
  },
5049
- "total_flos": 2.3411768424608563e+17,
5050
  "train_batch_size": 48,
5051
  "trial_name": null,
5052
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2671059300557528,
6
  "eval_steps": 500,
7
+ "global_step": 7500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5027
  "eval_samples_per_second": 245.6,
5028
  "eval_steps_per_second": 5.158,
5029
  "step": 7000
5030
+ },
5031
+ {
5032
+ "epoch": 1.184321675958777,
5033
+ "grad_norm": 0.49278581142425537,
5034
+ "learning_rate": 0.000145689605353268,
5035
+ "loss": 4.424203109741211,
5036
+ "step": 7010
5037
+ },
5038
+ {
5039
+ "epoch": 1.1860111505321844,
5040
+ "grad_norm": 0.49644234776496887,
5041
+ "learning_rate": 0.00014521082727641152,
5042
+ "loss": 4.395336151123047,
5043
+ "step": 7020
5044
+ },
5045
+ {
5046
+ "epoch": 1.1877006251055922,
5047
+ "grad_norm": 0.483456552028656,
5048
+ "learning_rate": 0.0001447320980362472,
5049
+ "loss": 4.440347671508789,
5050
+ "step": 7030
5051
+ },
5052
+ {
5053
+ "epoch": 1.189390099679,
5054
+ "grad_norm": 0.5150992274284363,
5055
+ "learning_rate": 0.00014425342251452679,
5056
+ "loss": 4.393960571289062,
5057
+ "step": 7040
5058
+ },
5059
+ {
5060
+ "epoch": 1.1910795742524076,
5061
+ "grad_norm": 0.47316014766693115,
5062
+ "learning_rate": 0.00014377480559245434,
5063
+ "loss": 4.433261108398438,
5064
+ "step": 7050
5065
+ },
5066
+ {
5067
+ "epoch": 1.1927690488258151,
5068
+ "grad_norm": 0.5043189525604248,
5069
+ "learning_rate": 0.00014329625215063629,
5070
+ "loss": 4.437650680541992,
5071
+ "step": 7060
5072
+ },
5073
+ {
5074
+ "epoch": 1.1944585233992229,
5075
+ "grad_norm": 0.49998390674591064,
5076
+ "learning_rate": 0.00014281776706903177,
5077
+ "loss": 4.40019416809082,
5078
+ "step": 7070
5079
+ },
5080
+ {
5081
+ "epoch": 1.1961479979726306,
5082
+ "grad_norm": 0.5133141279220581,
5083
+ "learning_rate": 0.0001423393552269028,
5084
+ "loss": 4.417116928100586,
5085
+ "step": 7080
5086
+ },
5087
+ {
5088
+ "epoch": 1.1978374725460381,
5089
+ "grad_norm": 0.513031005859375,
5090
+ "learning_rate": 0.00014186102150276454,
5091
+ "loss": 4.438409805297852,
5092
+ "step": 7090
5093
+ },
5094
+ {
5095
+ "epoch": 1.1995269471194459,
5096
+ "grad_norm": 0.4915519058704376,
5097
+ "learning_rate": 0.00014138277077433567,
5098
+ "loss": 4.4253074645996096,
5099
+ "step": 7100
5100
+ },
5101
+ {
5102
+ "epoch": 1.2012164216928536,
5103
+ "grad_norm": 0.5202800035476685,
5104
+ "learning_rate": 0.00014090460791848827,
5105
+ "loss": 4.41809310913086,
5106
+ "step": 7110
5107
+ },
5108
+ {
5109
+ "epoch": 1.202905896266261,
5110
+ "grad_norm": 0.49077826738357544,
5111
+ "learning_rate": 0.00014042653781119868,
5112
+ "loss": 4.397499465942383,
5113
+ "step": 7120
5114
+ },
5115
+ {
5116
+ "epoch": 1.2045953708396688,
5117
+ "grad_norm": 0.4648706912994385,
5118
+ "learning_rate": 0.0001399485653274973,
5119
+ "loss": 4.408271026611328,
5120
+ "step": 7130
5121
+ },
5122
+ {
5123
+ "epoch": 1.2062848454130766,
5124
+ "grad_norm": 0.4614482820034027,
5125
+ "learning_rate": 0.00013947069534141904,
5126
+ "loss": 4.425214004516602,
5127
+ "step": 7140
5128
+ },
5129
+ {
5130
+ "epoch": 1.207974319986484,
5131
+ "grad_norm": 0.4744400084018707,
5132
+ "learning_rate": 0.00013899293272595355,
5133
+ "loss": 4.440077590942383,
5134
+ "step": 7150
5135
+ },
5136
+ {
5137
+ "epoch": 1.2096637945598918,
5138
+ "grad_norm": 0.46556323766708374,
5139
+ "learning_rate": 0.0001385152823529957,
5140
+ "loss": 4.412957382202149,
5141
+ "step": 7160
5142
+ },
5143
+ {
5144
+ "epoch": 1.2113532691332995,
5145
+ "grad_norm": 0.4939349591732025,
5146
+ "learning_rate": 0.00013803774909329567,
5147
+ "loss": 4.405846023559571,
5148
+ "step": 7170
5149
+ },
5150
+ {
5151
+ "epoch": 1.2130427437067073,
5152
+ "grad_norm": 0.47055721282958984,
5153
+ "learning_rate": 0.0001375603378164095,
5154
+ "loss": 4.382000350952149,
5155
+ "step": 7180
5156
+ },
5157
+ {
5158
+ "epoch": 1.2147322182801148,
5159
+ "grad_norm": 0.47987523674964905,
5160
+ "learning_rate": 0.00013708305339064933,
5161
+ "loss": 4.415153121948242,
5162
+ "step": 7190
5163
+ },
5164
+ {
5165
+ "epoch": 1.2164216928535225,
5166
+ "grad_norm": 0.4784037470817566,
5167
+ "learning_rate": 0.00013660590068303373,
5168
+ "loss": 4.4415229797363285,
5169
+ "step": 7200
5170
+ },
5171
+ {
5172
+ "epoch": 1.2181111674269303,
5173
+ "grad_norm": 0.500056803226471,
5174
+ "learning_rate": 0.00013612888455923804,
5175
+ "loss": 4.416479873657226,
5176
+ "step": 7210
5177
+ },
5178
+ {
5179
+ "epoch": 1.219800642000338,
5180
+ "grad_norm": 0.4778987169265747,
5181
+ "learning_rate": 0.0001356520098835449,
5182
+ "loss": 4.442354583740235,
5183
+ "step": 7220
5184
+ },
5185
+ {
5186
+ "epoch": 1.2214901165737455,
5187
+ "grad_norm": 0.5005702376365662,
5188
+ "learning_rate": 0.00013517528151879457,
5189
+ "loss": 4.411639404296875,
5190
+ "step": 7230
5191
+ },
5192
+ {
5193
+ "epoch": 1.2231795911471532,
5194
+ "grad_norm": 0.4689568281173706,
5195
+ "learning_rate": 0.0001346987043263352,
5196
+ "loss": 4.414199447631836,
5197
+ "step": 7240
5198
+ },
5199
+ {
5200
+ "epoch": 1.224869065720561,
5201
+ "grad_norm": 0.4993502199649811,
5202
+ "learning_rate": 0.00013422228316597356,
5203
+ "loss": 4.432155609130859,
5204
+ "step": 7250
5205
+ },
5206
+ {
5207
+ "epoch": 1.2265585402939685,
5208
+ "grad_norm": 0.4782608151435852,
5209
+ "learning_rate": 0.00013374602289592508,
5210
+ "loss": 4.431560897827149,
5211
+ "step": 7260
5212
+ },
5213
+ {
5214
+ "epoch": 1.2282480148673762,
5215
+ "grad_norm": 0.5125144124031067,
5216
+ "learning_rate": 0.00013326992837276494,
5217
+ "loss": 4.405394744873047,
5218
+ "step": 7270
5219
+ },
5220
+ {
5221
+ "epoch": 1.229937489440784,
5222
+ "grad_norm": 0.48408523201942444,
5223
+ "learning_rate": 0.0001327940044513777,
5224
+ "loss": 4.4137004852294925,
5225
+ "step": 7280
5226
+ },
5227
+ {
5228
+ "epoch": 1.2316269640141915,
5229
+ "grad_norm": 0.4888753294944763,
5230
+ "learning_rate": 0.00013231825598490854,
5231
+ "loss": 4.409386062622071,
5232
+ "step": 7290
5233
+ },
5234
+ {
5235
+ "epoch": 1.2333164385875992,
5236
+ "grad_norm": 0.47923538088798523,
5237
+ "learning_rate": 0.0001318426878247133,
5238
+ "loss": 4.4191631317138675,
5239
+ "step": 7300
5240
+ },
5241
+ {
5242
+ "epoch": 1.235005913161007,
5243
+ "grad_norm": 0.46775540709495544,
5244
+ "learning_rate": 0.00013136730482030928,
5245
+ "loss": 4.423541259765625,
5246
+ "step": 7310
5247
+ },
5248
+ {
5249
+ "epoch": 1.2366953877344147,
5250
+ "grad_norm": 0.48620909452438354,
5251
+ "learning_rate": 0.0001308921118193257,
5252
+ "loss": 4.431262969970703,
5253
+ "step": 7320
5254
+ },
5255
+ {
5256
+ "epoch": 1.2383848623078222,
5257
+ "grad_norm": 0.5028111338615417,
5258
+ "learning_rate": 0.00013041711366745408,
5259
+ "loss": 4.423612976074219,
5260
+ "step": 7330
5261
+ },
5262
+ {
5263
+ "epoch": 1.24007433688123,
5264
+ "grad_norm": 0.4982888996601105,
5265
+ "learning_rate": 0.00012994231520839934,
5266
+ "loss": 4.428596878051758,
5267
+ "step": 7340
5268
+ },
5269
+ {
5270
+ "epoch": 1.2417638114546377,
5271
+ "grad_norm": 0.5141102075576782,
5272
+ "learning_rate": 0.0001294677212838297,
5273
+ "loss": 4.398578262329101,
5274
+ "step": 7350
5275
+ },
5276
+ {
5277
+ "epoch": 1.2434532860280454,
5278
+ "grad_norm": 0.48103561997413635,
5279
+ "learning_rate": 0.00012899333673332795,
5280
+ "loss": 4.439675140380859,
5281
+ "step": 7360
5282
+ },
5283
+ {
5284
+ "epoch": 1.245142760601453,
5285
+ "grad_norm": 0.5084096789360046,
5286
+ "learning_rate": 0.00012851916639434164,
5287
+ "loss": 4.3824302673339846,
5288
+ "step": 7370
5289
+ },
5290
+ {
5291
+ "epoch": 1.2468322351748606,
5292
+ "grad_norm": 0.4776511788368225,
5293
+ "learning_rate": 0.00012804521510213407,
5294
+ "loss": 4.402749633789062,
5295
+ "step": 7380
5296
+ },
5297
+ {
5298
+ "epoch": 1.2485217097482684,
5299
+ "grad_norm": 0.499318391084671,
5300
+ "learning_rate": 0.00012757148768973483,
5301
+ "loss": 4.405498886108399,
5302
+ "step": 7390
5303
+ },
5304
+ {
5305
+ "epoch": 1.2502111843216759,
5306
+ "grad_norm": 0.4898117184638977,
5307
+ "learning_rate": 0.00012709798898789042,
5308
+ "loss": 4.4396411895751955,
5309
+ "step": 7400
5310
+ },
5311
+ {
5312
+ "epoch": 1.2519006588950836,
5313
+ "grad_norm": 0.4777224361896515,
5314
+ "learning_rate": 0.00012662472382501524,
5315
+ "loss": 4.409711074829102,
5316
+ "step": 7410
5317
+ },
5318
+ {
5319
+ "epoch": 1.2535901334684914,
5320
+ "grad_norm": 0.48530757427215576,
5321
+ "learning_rate": 0.0001261516970271422,
5322
+ "loss": 4.4214935302734375,
5323
+ "step": 7420
5324
+ },
5325
+ {
5326
+ "epoch": 1.2552796080418989,
5327
+ "grad_norm": 0.48434415459632874,
5328
+ "learning_rate": 0.0001256789134178735,
5329
+ "loss": 4.438081741333008,
5330
+ "step": 7430
5331
+ },
5332
+ {
5333
+ "epoch": 1.2569690826153066,
5334
+ "grad_norm": 0.4974631071090698,
5335
+ "learning_rate": 0.00012520637781833144,
5336
+ "loss": 4.407797622680664,
5337
+ "step": 7440
5338
+ },
5339
+ {
5340
+ "epoch": 1.2586585571887143,
5341
+ "grad_norm": 0.4732743799686432,
5342
+ "learning_rate": 0.0001247340950471094,
5343
+ "loss": 4.418028259277344,
5344
+ "step": 7450
5345
+ },
5346
+ {
5347
+ "epoch": 1.2603480317621218,
5348
+ "grad_norm": 0.5003547072410583,
5349
+ "learning_rate": 0.0001242620699202224,
5350
+ "loss": 4.400883483886719,
5351
+ "step": 7460
5352
+ },
5353
+ {
5354
+ "epoch": 1.2620375063355296,
5355
+ "grad_norm": 0.49987900257110596,
5356
+ "learning_rate": 0.00012379030725105837,
5357
+ "loss": 4.402442169189453,
5358
+ "step": 7470
5359
+ },
5360
+ {
5361
+ "epoch": 1.2637269809089373,
5362
+ "grad_norm": 0.49416637420654297,
5363
+ "learning_rate": 0.00012331881185032872,
5364
+ "loss": 4.388990020751953,
5365
+ "step": 7480
5366
+ },
5367
+ {
5368
+ "epoch": 1.265416455482345,
5369
+ "grad_norm": 0.5343226194381714,
5370
+ "learning_rate": 0.00012284758852601962,
5371
+ "loss": 4.411848449707032,
5372
+ "step": 7490
5373
+ },
5374
+ {
5375
+ "epoch": 1.2671059300557528,
5376
+ "grad_norm": 0.5128340125083923,
5377
+ "learning_rate": 0.00012237664208334263,
5378
+ "loss": 4.403173446655273,
5379
+ "step": 7500
5380
+ },
5381
+ {
5382
+ "epoch": 1.2671059300557528,
5383
+ "eval_loss": 4.382744789123535,
5384
+ "eval_runtime": 3.7472,
5385
+ "eval_samples_per_second": 266.869,
5386
+ "eval_steps_per_second": 5.604,
5387
+ "step": 7500
5388
  }
5389
  ],
5390
  "logging_steps": 10,
 
5404
  "attributes": {}
5405
  }
5406
  },
5407
+ "total_flos": 2.5084052528902963e+17,
5408
  "train_batch_size": 48,
5409
  "trial_name": null,
5410
  "trial_params": null