Azrail committed on
Commit
ed1f2d3
·
verified ·
1 Parent(s): 5dec0a3

Training in progress, step 31000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9965fcf14e783e7e1d55074ea2afa9a825c414e7bb1e05e788c2b6e78b01e868
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99efb4f925ebae40cd6f793929b87a0ccac0e7b97e6def05084db3705337b811
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e44a1859e7ded3de3d773c5abac76c0fc5f7c6f4fc38577dfe331b1a4c391ab7
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aba48d7345e1335acdd811f72ad9602a930b00d7d91d9a11216fc53d7f15cb25
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc95bdc35ebe00877717681894afcd7d44f457b0583fea8b14d22f39dd179eb8
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17ffd9dd4a600ef00ffe7371c71cf7eaaf39e90e97468b4a36b4cc557b2fc5d1
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d0c8a94ae7b3402d9f6c538decfc8292fd64108bb86fd10da3f27734428bf0b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:277f21680b959b596662b48a96a00aaa486d9a86675c2da90af20e0783552321
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.6589812972870563,
6
  "eval_steps": 500,
7
- "global_step": 30000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5348,11 +5348,189 @@
5348
  "eval_steps_per_second": 18.914,
5349
  "num_input_tokens_seen": 31457276160,
5350
  "step": 30000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5351
  }
5352
  ],
5353
  "logging_steps": 50,
5354
  "max_steps": 200000,
5355
- "num_input_tokens_seen": 31457276160,
5356
  "num_train_epochs": 5,
5357
  "save_steps": 1000,
5358
  "stateful_callbacks": {
@@ -5367,7 +5545,7 @@
5367
  "attributes": {}
5368
  }
5369
  },
5370
- "total_flos": 1.79151492920397e+19,
5371
  "train_batch_size": 64,
5372
  "trial_name": null,
5373
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.6809473405299582,
6
  "eval_steps": 500,
7
+ "global_step": 31000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5348
  "eval_steps_per_second": 18.914,
5349
  "num_input_tokens_seen": 31457276160,
5350
  "step": 30000
5351
+ },
5352
+ {
5353
+ "epoch": 0.6600795994492015,
5354
+ "grad_norm": 0.14465224742889404,
5355
+ "learning_rate": 0.001,
5356
+ "loss": 2.6657,
5357
+ "num_input_tokens_seen": 31509704960,
5358
+ "step": 30050
5359
+ },
5360
+ {
5361
+ "epoch": 0.6611779016113466,
5362
+ "grad_norm": 0.16096332669258118,
5363
+ "learning_rate": 0.001,
5364
+ "loss": 2.6612,
5365
+ "num_input_tokens_seen": 31562133760,
5366
+ "step": 30100
5367
+ },
5368
+ {
5369
+ "epoch": 0.6622762037734916,
5370
+ "grad_norm": 0.1434296816587448,
5371
+ "learning_rate": 0.001,
5372
+ "loss": 2.6695,
5373
+ "num_input_tokens_seen": 31614562560,
5374
+ "step": 30150
5375
+ },
5376
+ {
5377
+ "epoch": 0.6633745059356367,
5378
+ "grad_norm": 0.13844367861747742,
5379
+ "learning_rate": 0.001,
5380
+ "loss": 2.6649,
5381
+ "num_input_tokens_seen": 31666991360,
5382
+ "step": 30200
5383
+ },
5384
+ {
5385
+ "epoch": 0.6644728080977819,
5386
+ "grad_norm": 0.1579446643590927,
5387
+ "learning_rate": 0.001,
5388
+ "loss": 2.6701,
5389
+ "num_input_tokens_seen": 31719420160,
5390
+ "step": 30250
5391
+ },
5392
+ {
5393
+ "epoch": 0.665571110259927,
5394
+ "grad_norm": 0.1585385501384735,
5395
+ "learning_rate": 0.001,
5396
+ "loss": 2.665,
5397
+ "num_input_tokens_seen": 31771848960,
5398
+ "step": 30300
5399
+ },
5400
+ {
5401
+ "epoch": 0.666669412422072,
5402
+ "grad_norm": 0.18768636882305145,
5403
+ "learning_rate": 0.001,
5404
+ "loss": 2.6708,
5405
+ "num_input_tokens_seen": 31824277760,
5406
+ "step": 30350
5407
+ },
5408
+ {
5409
+ "epoch": 0.6677677145842171,
5410
+ "grad_norm": 0.13027966022491455,
5411
+ "learning_rate": 0.001,
5412
+ "loss": 2.6657,
5413
+ "num_input_tokens_seen": 31876706560,
5414
+ "step": 30400
5415
+ },
5416
+ {
5417
+ "epoch": 0.6688660167463623,
5418
+ "grad_norm": 0.13473722338676453,
5419
+ "learning_rate": 0.001,
5420
+ "loss": 2.6658,
5421
+ "num_input_tokens_seen": 31929135360,
5422
+ "step": 30450
5423
+ },
5424
+ {
5425
+ "epoch": 0.6699643189085073,
5426
+ "grad_norm": 0.14617317914962769,
5427
+ "learning_rate": 0.001,
5428
+ "loss": 2.664,
5429
+ "num_input_tokens_seen": 31981564160,
5430
+ "step": 30500
5431
+ },
5432
+ {
5433
+ "epoch": 0.6699643189085073,
5434
+ "eval_loss": 2.5658769607543945,
5435
+ "eval_runtime": 67.5011,
5436
+ "eval_samples_per_second": 74.073,
5437
+ "eval_steps_per_second": 18.518,
5438
+ "num_input_tokens_seen": 31981564160,
5439
+ "step": 30500
5440
+ },
5441
+ {
5442
+ "epoch": 0.6710626210706524,
5443
+ "grad_norm": 0.14581717550754547,
5444
+ "learning_rate": 0.001,
5445
+ "loss": 2.6654,
5446
+ "num_input_tokens_seen": 32033992960,
5447
+ "step": 30550
5448
+ },
5449
+ {
5450
+ "epoch": 0.6721609232327975,
5451
+ "grad_norm": 0.12281567603349686,
5452
+ "learning_rate": 0.001,
5453
+ "loss": 2.6649,
5454
+ "num_input_tokens_seen": 32086421760,
5455
+ "step": 30600
5456
+ },
5457
+ {
5458
+ "epoch": 0.6732592253949425,
5459
+ "grad_norm": 0.14368072152137756,
5460
+ "learning_rate": 0.001,
5461
+ "loss": 2.6605,
5462
+ "num_input_tokens_seen": 32138850560,
5463
+ "step": 30650
5464
+ },
5465
+ {
5466
+ "epoch": 0.6743575275570877,
5467
+ "grad_norm": 0.14596907794475555,
5468
+ "learning_rate": 0.001,
5469
+ "loss": 2.6651,
5470
+ "num_input_tokens_seen": 32191279360,
5471
+ "step": 30700
5472
+ },
5473
+ {
5474
+ "epoch": 0.6754558297192328,
5475
+ "grad_norm": 0.15414392948150635,
5476
+ "learning_rate": 0.001,
5477
+ "loss": 2.6696,
5478
+ "num_input_tokens_seen": 32243708160,
5479
+ "step": 30750
5480
+ },
5481
+ {
5482
+ "epoch": 0.6765541318813779,
5483
+ "grad_norm": 0.14875884354114532,
5484
+ "learning_rate": 0.001,
5485
+ "loss": 2.6662,
5486
+ "num_input_tokens_seen": 32296136960,
5487
+ "step": 30800
5488
+ },
5489
+ {
5490
+ "epoch": 0.6776524340435229,
5491
+ "grad_norm": 0.13774773478507996,
5492
+ "learning_rate": 0.001,
5493
+ "loss": 2.6649,
5494
+ "num_input_tokens_seen": 32348565760,
5495
+ "step": 30850
5496
+ },
5497
+ {
5498
+ "epoch": 0.6787507362056681,
5499
+ "grad_norm": 0.1647578626871109,
5500
+ "learning_rate": 0.001,
5501
+ "loss": 2.6693,
5502
+ "num_input_tokens_seen": 32400994560,
5503
+ "step": 30900
5504
+ },
5505
+ {
5506
+ "epoch": 0.6798490383678132,
5507
+ "grad_norm": 0.1620490700006485,
5508
+ "learning_rate": 0.001,
5509
+ "loss": 2.6726,
5510
+ "num_input_tokens_seen": 32453423360,
5511
+ "step": 30950
5512
+ },
5513
+ {
5514
+ "epoch": 0.6809473405299582,
5515
+ "grad_norm": 0.14238062500953674,
5516
+ "learning_rate": 0.001,
5517
+ "loss": 2.6681,
5518
+ "num_input_tokens_seen": 32505852160,
5519
+ "step": 31000
5520
+ },
5521
+ {
5522
+ "epoch": 0.6809473405299582,
5523
+ "eval_loss": 2.5645763874053955,
5524
+ "eval_runtime": 65.7725,
5525
+ "eval_samples_per_second": 76.02,
5526
+ "eval_steps_per_second": 19.005,
5527
+ "num_input_tokens_seen": 32505852160,
5528
+ "step": 31000
5529
  }
5530
  ],
5531
  "logging_steps": 50,
5532
  "max_steps": 200000,
5533
+ "num_input_tokens_seen": 32505852160,
5534
  "num_train_epochs": 5,
5535
  "save_steps": 1000,
5536
  "stateful_callbacks": {
 
5545
  "attributes": {}
5546
  }
5547
  },
5548
+ "total_flos": 1.851232100800463e+19,
5549
  "train_batch_size": 64,
5550
  "trial_name": null,
5551
  "trial_params": null