minpeter commited on
Commit
21e5e6c
·
verified ·
1 Parent(s): 1763d9e

Training in progress, step 19000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:175d3790f4e3bfb8245c7b39dd800f69c0db9fc536965d696ec8288a9a4a9102
3
  size 373077376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f397f4e5642ab9da594752daa8ae50f67bd1c0633a0f55e9742963fc8094fc07
3
  size 373077376
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:788c53706c70a599497c5b67318671aa687876ff14e4cb7b622bd54561a1949e
3
  size 209816139
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2947a98ddfbd91d8b159be483158ff85ad080e24af7608cd8524985d5ff37696
3
  size 209816139
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9a15fe294d83e6dcaaac27e2da3864df4a3093a7010bb98cb24cc0788020fcf
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4bf7692b5f5edcd474e14a77a5f13e3f7c7765bb40a870ecf6eeef166453cdc
3
  size 14917
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bbd4cb683a8704b2a8823c0b9c5dfb0af1604e837d8fd4eb0a6b198fcadf9ed0
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67c08732e07123ac792d039d19d16df7f1963cb3c04d8bb64d087ce8609b973b
3
  size 14917
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3db8ebdfd646609f3b5f0928122cbb46e5703a2f230d68735c07eb7b6e67a5cd
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c27272cd09ade13a826f643a30f8708da8615de25fd4349edd6f9144bb4f5503
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 2000,
3
  "best_metric": 9.218317031860352,
4
  "best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
5
- "epoch": 0.056215919099046205,
6
  "eval_steps": 1000,
7
- "global_step": 18000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5192,6 +5192,294 @@
5192
  "eval_samples_per_second": 50.835,
5193
  "eval_steps_per_second": 3.185,
5194
  "step": 18000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5195
  }
5196
  ],
5197
  "logging_steps": 25,
@@ -5211,7 +5499,7 @@
5211
  "attributes": {}
5212
  }
5213
  },
5214
- "total_flos": 2.282666343213826e+18,
5215
  "train_batch_size": 8,
5216
  "trial_name": null,
5217
  "trial_params": null
 
2
  "best_global_step": 2000,
3
  "best_metric": 9.218317031860352,
4
  "best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
5
+ "epoch": 0.05933902571565988,
6
  "eval_steps": 1000,
7
+ "global_step": 19000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5192
  "eval_samples_per_second": 50.835,
5193
  "eval_steps_per_second": 3.185,
5194
  "step": 18000
5195
+ },
5196
+ {
5197
+ "epoch": 0.05629399676446155,
5198
+ "grad_norm": 43.5,
5199
+ "learning_rate": 0.0009998918390993648,
5200
+ "loss": 30.9913,
5201
+ "step": 18025
5202
+ },
5203
+ {
5204
+ "epoch": 0.056372074429876885,
5205
+ "grad_norm": 42.75,
5206
+ "learning_rate": 0.000999889137307281,
5207
+ "loss": 31.086,
5208
+ "step": 18050
5209
+ },
5210
+ {
5211
+ "epoch": 0.05645015209529223,
5212
+ "grad_norm": 41.0,
5213
+ "learning_rate": 0.0009998864021893864,
5214
+ "loss": 31.0512,
5215
+ "step": 18075
5216
+ },
5217
+ {
5218
+ "epoch": 0.05652822976070757,
5219
+ "grad_norm": 42.75,
5220
+ "learning_rate": 0.0009998836337458629,
5221
+ "loss": 31.2091,
5222
+ "step": 18100
5223
+ },
5224
+ {
5225
+ "epoch": 0.056606307426122916,
5226
+ "grad_norm": 44.25,
5227
+ "learning_rate": 0.0009998808319768954,
5228
+ "loss": 31.1535,
5229
+ "step": 18125
5230
+ },
5231
+ {
5232
+ "epoch": 0.05668438509153825,
5233
+ "grad_norm": 43.5,
5234
+ "learning_rate": 0.0009998779968826707,
5235
+ "loss": 31.3788,
5236
+ "step": 18150
5237
+ },
5238
+ {
5239
+ "epoch": 0.0567624627569536,
5240
+ "grad_norm": 43.75,
5241
+ "learning_rate": 0.0009998751284633779,
5242
+ "loss": 31.3632,
5243
+ "step": 18175
5244
+ },
5245
+ {
5246
+ "epoch": 0.05684054042236894,
5247
+ "grad_norm": 39.0,
5248
+ "learning_rate": 0.0009998722267192076,
5249
+ "loss": 31.101,
5250
+ "step": 18200
5251
+ },
5252
+ {
5253
+ "epoch": 0.056918618087784284,
5254
+ "grad_norm": 38.25,
5255
+ "learning_rate": 0.000999869291650354,
5256
+ "loss": 30.8788,
5257
+ "step": 18225
5258
+ },
5259
+ {
5260
+ "epoch": 0.05699669575319962,
5261
+ "grad_norm": 36.75,
5262
+ "learning_rate": 0.0009998663232570122,
5263
+ "loss": 31.0841,
5264
+ "step": 18250
5265
+ },
5266
+ {
5267
+ "epoch": 0.057074773418614964,
5268
+ "grad_norm": 39.75,
5269
+ "learning_rate": 0.0009998633215393805,
5270
+ "loss": 31.4425,
5271
+ "step": 18275
5272
+ },
5273
+ {
5274
+ "epoch": 0.05715285108403031,
5275
+ "grad_norm": 37.5,
5276
+ "learning_rate": 0.000999860286497659,
5277
+ "loss": 31.6592,
5278
+ "step": 18300
5279
+ },
5280
+ {
5281
+ "epoch": 0.05723092874944565,
5282
+ "grad_norm": 40.0,
5283
+ "learning_rate": 0.0009998572181320496,
5284
+ "loss": 31.3277,
5285
+ "step": 18325
5286
+ },
5287
+ {
5288
+ "epoch": 0.05730900641486099,
5289
+ "grad_norm": 39.75,
5290
+ "learning_rate": 0.0009998541164427575,
5291
+ "loss": 31.3697,
5292
+ "step": 18350
5293
+ },
5294
+ {
5295
+ "epoch": 0.05738708408027633,
5296
+ "grad_norm": 35.0,
5297
+ "learning_rate": 0.0009998509814299888,
5298
+ "loss": 31.2663,
5299
+ "step": 18375
5300
+ },
5301
+ {
5302
+ "epoch": 0.057465161745691676,
5303
+ "grad_norm": 37.25,
5304
+ "learning_rate": 0.000999847813093953,
5305
+ "loss": 31.6682,
5306
+ "step": 18400
5307
+ },
5308
+ {
5309
+ "epoch": 0.05754323941110702,
5310
+ "grad_norm": 38.75,
5311
+ "learning_rate": 0.0009998446114348612,
5312
+ "loss": 31.7364,
5313
+ "step": 18425
5314
+ },
5315
+ {
5316
+ "epoch": 0.057621317076522356,
5317
+ "grad_norm": 48.5,
5318
+ "learning_rate": 0.0009998413764529266,
5319
+ "loss": 31.8273,
5320
+ "step": 18450
5321
+ },
5322
+ {
5323
+ "epoch": 0.0576993947419377,
5324
+ "grad_norm": 39.5,
5325
+ "learning_rate": 0.0009998381081483651,
5326
+ "loss": 32.178,
5327
+ "step": 18475
5328
+ },
5329
+ {
5330
+ "epoch": 0.057777472407353044,
5331
+ "grad_norm": 38.75,
5332
+ "learning_rate": 0.0009998348065213946,
5333
+ "loss": 32.3324,
5334
+ "step": 18500
5335
+ },
5336
+ {
5337
+ "epoch": 0.05785555007276839,
5338
+ "grad_norm": 41.75,
5339
+ "learning_rate": 0.000999831471572235,
5340
+ "loss": 32.6464,
5341
+ "step": 18525
5342
+ },
5343
+ {
5344
+ "epoch": 0.057933627738183724,
5345
+ "grad_norm": 42.0,
5346
+ "learning_rate": 0.0009998281033011091,
5347
+ "loss": 32.1848,
5348
+ "step": 18550
5349
+ },
5350
+ {
5351
+ "epoch": 0.05801170540359907,
5352
+ "grad_norm": 39.75,
5353
+ "learning_rate": 0.000999824701708241,
5354
+ "loss": 32.543,
5355
+ "step": 18575
5356
+ },
5357
+ {
5358
+ "epoch": 0.05808978306901441,
5359
+ "grad_norm": 48.5,
5360
+ "learning_rate": 0.0009998212667938578,
5361
+ "loss": 32.4726,
5362
+ "step": 18600
5363
+ },
5364
+ {
5365
+ "epoch": 0.058167860734429755,
5366
+ "grad_norm": 45.0,
5367
+ "learning_rate": 0.000999817798558188,
5368
+ "loss": 32.2877,
5369
+ "step": 18625
5370
+ },
5371
+ {
5372
+ "epoch": 0.05824593839984509,
5373
+ "grad_norm": 38.25,
5374
+ "learning_rate": 0.0009998142970014633,
5375
+ "loss": 32.4187,
5376
+ "step": 18650
5377
+ },
5378
+ {
5379
+ "epoch": 0.058324016065260435,
5380
+ "grad_norm": 51.5,
5381
+ "learning_rate": 0.0009998107621239168,
5382
+ "loss": 32.6334,
5383
+ "step": 18675
5384
+ },
5385
+ {
5386
+ "epoch": 0.05840209373067578,
5387
+ "grad_norm": 48.5,
5388
+ "learning_rate": 0.0009998071939257842,
5389
+ "loss": 33.0217,
5390
+ "step": 18700
5391
+ },
5392
+ {
5393
+ "epoch": 0.05848017139609112,
5394
+ "grad_norm": 50.0,
5395
+ "learning_rate": 0.0009998035924073036,
5396
+ "loss": 32.839,
5397
+ "step": 18725
5398
+ },
5399
+ {
5400
+ "epoch": 0.05855824906150646,
5401
+ "grad_norm": 41.75,
5402
+ "learning_rate": 0.000999799957568715,
5403
+ "loss": 32.84,
5404
+ "step": 18750
5405
+ },
5406
+ {
5407
+ "epoch": 0.0586363267269218,
5408
+ "grad_norm": 55.5,
5409
+ "learning_rate": 0.0009997962894102608,
5410
+ "loss": 33.0097,
5411
+ "step": 18775
5412
+ },
5413
+ {
5414
+ "epoch": 0.05871440439233715,
5415
+ "grad_norm": 52.5,
5416
+ "learning_rate": 0.0009997925879321854,
5417
+ "loss": 33.0055,
5418
+ "step": 18800
5419
+ },
5420
+ {
5421
+ "epoch": 0.05879248205775249,
5422
+ "grad_norm": 47.25,
5423
+ "learning_rate": 0.0009997888531347358,
5424
+ "loss": 33.3652,
5425
+ "step": 18825
5426
+ },
5427
+ {
5428
+ "epoch": 0.05887055972316783,
5429
+ "grad_norm": 41.25,
5430
+ "learning_rate": 0.0009997850850181605,
5431
+ "loss": 33.1608,
5432
+ "step": 18850
5433
+ },
5434
+ {
5435
+ "epoch": 0.05894863738858317,
5436
+ "grad_norm": 42.75,
5437
+ "learning_rate": 0.000999781283582711,
5438
+ "loss": 33.2872,
5439
+ "step": 18875
5440
+ },
5441
+ {
5442
+ "epoch": 0.059026715053998514,
5443
+ "grad_norm": 43.25,
5444
+ "learning_rate": 0.0009997774488286408,
5445
+ "loss": 33.0581,
5446
+ "step": 18900
5447
+ },
5448
+ {
5449
+ "epoch": 0.05910479271941386,
5450
+ "grad_norm": 48.0,
5451
+ "learning_rate": 0.0009997735807562055,
5452
+ "loss": 33.0212,
5453
+ "step": 18925
5454
+ },
5455
+ {
5456
+ "epoch": 0.059182870384829195,
5457
+ "grad_norm": 39.0,
5458
+ "learning_rate": 0.000999769679365663,
5459
+ "loss": 32.7047,
5460
+ "step": 18950
5461
+ },
5462
+ {
5463
+ "epoch": 0.05926094805024454,
5464
+ "grad_norm": 41.25,
5465
+ "learning_rate": 0.0009997657446572735,
5466
+ "loss": 32.7831,
5467
+ "step": 18975
5468
+ },
5469
+ {
5470
+ "epoch": 0.05933902571565988,
5471
+ "grad_norm": 42.75,
5472
+ "learning_rate": 0.0009997617766312988,
5473
+ "loss": 32.8744,
5474
+ "step": 19000
5475
+ },
5476
+ {
5477
+ "epoch": 0.05933902571565988,
5478
+ "eval_loss": 32.887264251708984,
5479
+ "eval_runtime": 102.2215,
5480
+ "eval_samples_per_second": 50.899,
5481
+ "eval_steps_per_second": 3.189,
5482
+ "step": 19000
5483
  }
5484
  ],
5485
  "logging_steps": 25,
 
5499
  "attributes": {}
5500
  }
5501
  },
5502
+ "total_flos": 2.4094740084298875e+18,
5503
  "train_batch_size": 8,
5504
  "trial_name": null,
5505
  "trial_params": null