Azrail commited on
Commit
1914385
·
verified ·
1 Parent(s): 889071d

Training in progress, step 31000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc13ffa23a1f5210f44d10669aa87f3ec7bfb7a2664786f76ce56132b042639e
3
  size 563074920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5bbb33796637d85d181dd86914f0d0b2932daf04a02e2d42b0e675ffd28388a
3
  size 563074920
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c2acc68a1693942d243837338503be83794d69c0b95c32e490c2e11f4c4406e
3
  size 1125916346
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:404568263d026535dcc44bb135fbb61c7e3760b5962c18e72f460d9b5076b3b1
3
  size 1125916346
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5f9d2ea250bcd3507c62c8571a114db63d14fdd2d31f9df1da7534fe6e55434
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f9232b4b974a65603075b06bb82ca61a1267905abb281ba5363cf0b0ac176db
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:492390519daa872425f50793597ce5e74ef972fc3d656ffa5ca614e3b949a837
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0aa4aa16ed53784eb6010613ed4115c7bfda9657643c7abb9d4d9e40642eb9e
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.20179496622456752,
6
  "eval_steps": 500,
7
- "global_step": 30000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5348,11 +5348,189 @@
5348
  "eval_steps_per_second": 8.76,
5349
  "num_input_tokens_seen": 7864320000,
5350
  "step": 30000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5351
  }
5352
  ],
5353
  "logging_steps": 50,
5354
- "max_steps": 30000,
5355
- "num_input_tokens_seen": 7864320000,
5356
  "num_train_epochs": 1,
5357
  "save_steps": 1000,
5358
  "stateful_callbacks": {
@@ -5362,12 +5540,12 @@
5362
  "should_evaluate": false,
5363
  "should_log": false,
5364
  "should_save": true,
5365
- "should_training_stop": true
5366
  },
5367
  "attributes": {}
5368
  }
5369
  },
5370
- "total_flos": 5.0112805994496e+18,
5371
  "train_batch_size": 64,
5372
  "trial_name": null,
5373
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.20852146509871977,
6
  "eval_steps": 500,
7
+ "global_step": 31000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5348
  "eval_steps_per_second": 8.76,
5349
  "num_input_tokens_seen": 7864320000,
5350
  "step": 30000
5351
+ },
5352
+ {
5353
+ "epoch": 0.20213129116827513,
5354
+ "grad_norm": 0.7275823950767517,
5355
+ "learning_rate": 0.0002881031482247361,
5356
+ "loss": 12.0089,
5357
+ "num_input_tokens_seen": 7877427200,
5358
+ "step": 30050
5359
+ },
5360
+ {
5361
+ "epoch": 0.20246761611198275,
5362
+ "grad_norm": 0.7593051195144653,
5363
+ "learning_rate": 0.0002904816199505797,
5364
+ "loss": 12.0389,
5365
+ "num_input_tokens_seen": 7890534400,
5366
+ "step": 30100
5367
+ },
5368
+ {
5369
+ "epoch": 0.20280394105569036,
5370
+ "grad_norm": 0.7933290004730225,
5371
+ "learning_rate": 0.00029286069073616763,
5372
+ "loss": 12.0537,
5373
+ "num_input_tokens_seen": 7903641600,
5374
+ "step": 30150
5375
+ },
5376
+ {
5377
+ "epoch": 0.20314026599939797,
5378
+ "grad_norm": 0.736951470375061,
5379
+ "learning_rate": 0.0002952402108495577,
5380
+ "loss": 12.0687,
5381
+ "num_input_tokens_seen": 7916748800,
5382
+ "step": 30200
5383
+ },
5384
+ {
5385
+ "epoch": 0.20347659094310558,
5386
+ "grad_norm": 0.7448037266731262,
5387
+ "learning_rate": 0.0002976200305305268,
5388
+ "loss": 12.0549,
5389
+ "num_input_tokens_seen": 7929856000,
5390
+ "step": 30250
5391
+ },
5392
+ {
5393
+ "epoch": 0.2038129158868132,
5394
+ "grad_norm": 0.7063918113708496,
5395
+ "learning_rate": 0.0002999999999999999,
5396
+ "loss": 12.0769,
5397
+ "num_input_tokens_seen": 7942963200,
5398
+ "step": 30300
5399
+ },
5400
+ {
5401
+ "epoch": 0.20414924083052083,
5402
+ "grad_norm": 0.7379609942436218,
5403
+ "learning_rate": 0.000302379969469473,
5404
+ "loss": 12.1145,
5405
+ "num_input_tokens_seen": 7956070400,
5406
+ "step": 30350
5407
+ },
5408
+ {
5409
+ "epoch": 0.20448556577422844,
5410
+ "grad_norm": 0.7159172892570496,
5411
+ "learning_rate": 0.0003047597891504424,
5412
+ "loss": 12.1304,
5413
+ "num_input_tokens_seen": 7969177600,
5414
+ "step": 30400
5415
+ },
5416
+ {
5417
+ "epoch": 0.20482189071793605,
5418
+ "grad_norm": 0.759340226650238,
5419
+ "learning_rate": 0.00030713930926383194,
5420
+ "loss": 12.1011,
5421
+ "num_input_tokens_seen": 7982284800,
5422
+ "step": 30450
5423
+ },
5424
+ {
5425
+ "epoch": 0.20515821566164366,
5426
+ "grad_norm": 0.782768189907074,
5427
+ "learning_rate": 0.00030951838004942016,
5428
+ "loss": 12.1276,
5429
+ "num_input_tokens_seen": 7995392000,
5430
+ "step": 30500
5431
+ },
5432
+ {
5433
+ "epoch": 0.20515821566164366,
5434
+ "eval_loss": 2.9330999851226807,
5435
+ "eval_runtime": 143.3174,
5436
+ "eval_samples_per_second": 34.888,
5437
+ "eval_steps_per_second": 8.722,
5438
+ "num_input_tokens_seen": 7995392000,
5439
+ "step": 30500
5440
+ },
5441
+ {
5442
+ "epoch": 0.20549454060535127,
5443
+ "grad_norm": 0.7521361112594604,
5444
+ "learning_rate": 0.00031189685177526375,
5445
+ "loss": 12.1475,
5446
+ "num_input_tokens_seen": 8008499200,
5447
+ "step": 30550
5448
+ },
5449
+ {
5450
+ "epoch": 0.20583086554905888,
5451
+ "grad_norm": 0.752306342124939,
5452
+ "learning_rate": 0.00031427457474712264,
5453
+ "loss": 12.0914,
5454
+ "num_input_tokens_seen": 8021606400,
5455
+ "step": 30600
5456
+ },
5457
+ {
5458
+ "epoch": 0.2061671904927665,
5459
+ "grad_norm": 0.6963069438934326,
5460
+ "learning_rate": 0.0003166513993178817,
5461
+ "loss": 12.1272,
5462
+ "num_input_tokens_seen": 8034713600,
5463
+ "step": 30650
5464
+ },
5465
+ {
5466
+ "epoch": 0.2065035154364741,
5467
+ "grad_norm": 0.7007436752319336,
5468
+ "learning_rate": 0.0003190271758969692,
5469
+ "loss": 12.1085,
5470
+ "num_input_tokens_seen": 8047820800,
5471
+ "step": 30700
5472
+ },
5473
+ {
5474
+ "epoch": 0.20683984038018172,
5475
+ "grad_norm": 0.7034767270088196,
5476
+ "learning_rate": 0.00032140175495976947,
5477
+ "loss": 12.1114,
5478
+ "num_input_tokens_seen": 8060928000,
5479
+ "step": 30750
5480
+ },
5481
+ {
5482
+ "epoch": 0.20717616532388933,
5483
+ "grad_norm": 0.7317435145378113,
5484
+ "learning_rate": 0.0003237749870570365,
5485
+ "loss": 12.0728,
5486
+ "num_input_tokens_seen": 8074035200,
5487
+ "step": 30800
5488
+ },
5489
+ {
5490
+ "epoch": 0.20751249026759694,
5491
+ "grad_norm": 0.665651261806488,
5492
+ "learning_rate": 0.0003261467228242976,
5493
+ "loss": 12.1099,
5494
+ "num_input_tokens_seen": 8087142400,
5495
+ "step": 30850
5496
+ },
5497
+ {
5498
+ "epoch": 0.20784881521130455,
5499
+ "grad_norm": 0.7023760080337524,
5500
+ "learning_rate": 0.0003285168129912546,
5501
+ "loss": 12.1188,
5502
+ "num_input_tokens_seen": 8100249600,
5503
+ "step": 30900
5504
+ },
5505
+ {
5506
+ "epoch": 0.20818514015501216,
5507
+ "grad_norm": 0.7026780247688293,
5508
+ "learning_rate": 0.00033088510839118004,
5509
+ "loss": 12.0884,
5510
+ "num_input_tokens_seen": 8113356800,
5511
+ "step": 30950
5512
+ },
5513
+ {
5514
+ "epoch": 0.20852146509871977,
5515
+ "grad_norm": 0.7397706508636475,
5516
+ "learning_rate": 0.00033325145997030323,
5517
+ "loss": 12.0894,
5518
+ "num_input_tokens_seen": 8126464000,
5519
+ "step": 31000
5520
+ },
5521
+ {
5522
+ "epoch": 0.20852146509871977,
5523
+ "eval_loss": 2.9383528232574463,
5524
+ "eval_runtime": 144.6078,
5525
+ "eval_samples_per_second": 34.576,
5526
+ "eval_steps_per_second": 8.644,
5527
+ "num_input_tokens_seen": 8126464000,
5528
+ "step": 31000
5529
  }
5530
  ],
5531
  "logging_steps": 50,
5532
+ "max_steps": 60000,
5533
+ "num_input_tokens_seen": 8126464000,
5534
  "num_train_epochs": 1,
5535
  "save_steps": 1000,
5536
  "stateful_callbacks": {
 
5540
  "should_evaluate": false,
5541
  "should_log": false,
5542
  "should_save": true,
5543
+ "should_training_stop": false
5544
  },
5545
  "attributes": {}
5546
  }
5547
  },
5548
+ "total_flos": 5.17832328609792e+18,
5549
  "train_batch_size": 64,
5550
  "trial_name": null,
5551
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c535587179e528588509a5683a599c692165045d10114ebf77f1f94172c77e9
3
- size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37300a576f29a5a8ddf81ea75e13d6c1ee5bf582f11fc6860569d8fcc97499d1
3
+ size 6008