Azrail commited on
Commit
613cf80
·
verified ·
1 Parent(s): aaefb20

Training in progress, step 65000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:567849b3336c60bd2ca86c0e32d8fa276a554db52049aae022ae3912ae149f08
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97f833e77e28bcce2d00fc8f583d642be803be2e4268c16065f001da61ccfb12
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe74b2d737ce2dc3386b2964624b6ffd7d46aa98c026d78df24bca83b7a5f473
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6fb466dd570b07209b2b66d3759663a3b462b568c13bb8f7963bf1191bda0a0
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f03ef68c121377c551657263f23acf972b60bf546b00ad9803912e5c78e5ecd
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5abe0ab18889dbab668e6d9fae1d62109a3226e616d0e681a91c9a668ea4330
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a987661a10dd2abc0dca231a45c2e361e0f28b82da18aba64a79545986bd62dc
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83439c671f875b1f809ad8f03d85b4a006312176c0266e869dc1f2efa804bb73
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.43049592794574404,
6
  "eval_steps": 500,
7
- "global_step": 64000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11400,11 +11400,189 @@
11400
  "eval_steps_per_second": 23.518,
11401
  "num_input_tokens_seen": 16777216000,
11402
  "step": 64000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11403
  }
11404
  ],
11405
  "logging_steps": 50,
11406
  "max_steps": 70000,
11407
- "num_input_tokens_seen": 16777216000,
11408
  "num_train_epochs": 1,
11409
  "save_steps": 1000,
11410
  "stateful_callbacks": {
@@ -11419,7 +11597,7 @@
11419
  "attributes": {}
11420
  }
11421
  },
11422
- "total_flos": 4.48806902562816e+18,
11423
  "train_batch_size": 64,
11424
  "trial_name": null,
11425
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4372224268198963,
6
  "eval_steps": 500,
7
+ "global_step": 65000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11400
  "eval_steps_per_second": 23.518,
11401
  "num_input_tokens_seen": 16777216000,
11402
  "step": 64000
11403
+ },
11404
+ {
11405
+ "epoch": 0.43083225288945165,
11406
+ "grad_norm": 0.15883377194404602,
11407
+ "learning_rate": 0.000304132494574022,
11408
+ "loss": 2.9851,
11409
+ "num_input_tokens_seen": 16790323200,
11410
+ "step": 64050
11411
+ },
11412
+ {
11413
+ "epoch": 0.43116857783315926,
11414
+ "grad_norm": 0.176467627286911,
11415
+ "learning_rate": 0.00029962558344842963,
11416
+ "loss": 2.9865,
11417
+ "num_input_tokens_seen": 16803430400,
11418
+ "step": 64100
11419
+ },
11420
+ {
11421
+ "epoch": 0.43150490277686687,
11422
+ "grad_norm": 0.16392388939857483,
11423
+ "learning_rate": 0.00029513798482615227,
11424
+ "loss": 2.9788,
11425
+ "num_input_tokens_seen": 16816537600,
11426
+ "step": 64150
11427
+ },
11428
+ {
11429
+ "epoch": 0.4318412277205745,
11430
+ "grad_norm": 0.15614169836044312,
11431
+ "learning_rate": 0.0002906701312312861,
11432
+ "loss": 2.9769,
11433
+ "num_input_tokens_seen": 16829644800,
11434
+ "step": 64200
11435
+ },
11436
+ {
11437
+ "epoch": 0.43217755266428215,
11438
+ "grad_norm": 0.16225555539131165,
11439
+ "learning_rate": 0.00028622245328485907,
11440
+ "loss": 2.9881,
11441
+ "num_input_tokens_seen": 16842752000,
11442
+ "step": 64250
11443
+ },
11444
+ {
11445
+ "epoch": 0.43251387760798976,
11446
+ "grad_norm": 0.16419048607349396,
11447
+ "learning_rate": 0.0002817953796633289,
11448
+ "loss": 2.99,
11449
+ "num_input_tokens_seen": 16855859200,
11450
+ "step": 64300
11451
+ },
11452
+ {
11453
+ "epoch": 0.43285020255169737,
11454
+ "grad_norm": 0.16654469072818756,
11455
+ "learning_rate": 0.000277389337057266,
11456
+ "loss": 2.9919,
11457
+ "num_input_tokens_seen": 16868966400,
11458
+ "step": 64350
11459
+ },
11460
+ {
11461
+ "epoch": 0.433186527495405,
11462
+ "grad_norm": 0.1688661277294159,
11463
+ "learning_rate": 0.00027300475013022663,
11464
+ "loss": 2.9844,
11465
+ "num_input_tokens_seen": 16882073600,
11466
+ "step": 64400
11467
+ },
11468
+ {
11469
+ "epoch": 0.4335228524391126,
11470
+ "grad_norm": 0.162180095911026,
11471
+ "learning_rate": 0.000268642041477825,
11472
+ "loss": 2.9847,
11473
+ "num_input_tokens_seen": 16895180800,
11474
+ "step": 64450
11475
+ },
11476
+ {
11477
+ "epoch": 0.4338591773828202,
11478
+ "grad_norm": 0.18244421482086182,
11479
+ "learning_rate": 0.00026430163158700117,
11480
+ "loss": 2.9789,
11481
+ "num_input_tokens_seen": 16908288000,
11482
+ "step": 64500
11483
+ },
11484
+ {
11485
+ "epoch": 0.4338591773828202,
11486
+ "eval_loss": 2.8813860416412354,
11487
+ "eval_runtime": 53.1806,
11488
+ "eval_samples_per_second": 94.019,
11489
+ "eval_steps_per_second": 23.505,
11490
+ "num_input_tokens_seen": 16908288000,
11491
+ "step": 64500
11492
+ },
11493
+ {
11494
+ "epoch": 0.4341955023265278,
11495
+ "grad_norm": 0.15887753665447235,
11496
+ "learning_rate": 0.00025998393879549445,
11497
+ "loss": 2.9723,
11498
+ "num_input_tokens_seen": 16921395200,
11499
+ "step": 64550
11500
+ },
11501
+ {
11502
+ "epoch": 0.4345318272702354,
11503
+ "grad_norm": 0.17573221027851105,
11504
+ "learning_rate": 0.0002556893792515227,
11505
+ "loss": 2.99,
11506
+ "num_input_tokens_seen": 16934502400,
11507
+ "step": 64600
11508
+ },
11509
+ {
11510
+ "epoch": 0.43486815221394304,
11511
+ "grad_norm": 0.1790430247783661,
11512
+ "learning_rate": 0.0002514183668736727,
11513
+ "loss": 2.9887,
11514
+ "num_input_tokens_seen": 16947609600,
11515
+ "step": 64650
11516
+ },
11517
+ {
11518
+ "epoch": 0.43520447715765065,
11519
+ "grad_norm": 0.16031622886657715,
11520
+ "learning_rate": 0.0002471713133110078,
11521
+ "loss": 2.9835,
11522
+ "num_input_tokens_seen": 16960716800,
11523
+ "step": 64700
11524
+ },
11525
+ {
11526
+ "epoch": 0.43554080210135826,
11527
+ "grad_norm": 0.1702345311641693,
11528
+ "learning_rate": 0.0002429486279033892,
11529
+ "loss": 2.9862,
11530
+ "num_input_tokens_seen": 16973824000,
11531
+ "step": 64750
11532
+ },
11533
+ {
11534
+ "epoch": 0.43587712704506587,
11535
+ "grad_norm": 0.16080138087272644,
11536
+ "learning_rate": 0.00023875071764202561,
11537
+ "loss": 2.9785,
11538
+ "num_input_tokens_seen": 16986931200,
11539
+ "step": 64800
11540
+ },
11541
+ {
11542
+ "epoch": 0.4362134519887735,
11543
+ "grad_norm": 0.17694465816020966,
11544
+ "learning_rate": 0.0002345779871302453,
11545
+ "loss": 2.9962,
11546
+ "num_input_tokens_seen": 17000038400,
11547
+ "step": 64850
11548
+ },
11549
+ {
11550
+ "epoch": 0.4365497769324811,
11551
+ "grad_norm": 0.15310978889465332,
11552
+ "learning_rate": 0.00023043083854449987,
11553
+ "loss": 2.98,
11554
+ "num_input_tokens_seen": 17013145600,
11555
+ "step": 64900
11556
+ },
11557
+ {
11558
+ "epoch": 0.4368861018761887,
11559
+ "grad_norm": 0.15505504608154297,
11560
+ "learning_rate": 0.0002263096715956019,
11561
+ "loss": 2.9825,
11562
+ "num_input_tokens_seen": 17026252800,
11563
+ "step": 64950
11564
+ },
11565
+ {
11566
+ "epoch": 0.4372224268198963,
11567
+ "grad_norm": 0.15211448073387146,
11568
+ "learning_rate": 0.00022221488349019903,
11569
+ "loss": 2.9876,
11570
+ "num_input_tokens_seen": 17039360000,
11571
+ "step": 65000
11572
+ },
11573
+ {
11574
+ "epoch": 0.4372224268198963,
11575
+ "eval_loss": 2.8792829513549805,
11576
+ "eval_runtime": 53.0249,
11577
+ "eval_samples_per_second": 94.295,
11578
+ "eval_steps_per_second": 23.574,
11579
+ "num_input_tokens_seen": 17039360000,
11580
+ "step": 65000
11581
  }
11582
  ],
11583
  "logging_steps": 50,
11584
  "max_steps": 70000,
11585
+ "num_input_tokens_seen": 17039360000,
11586
  "num_train_epochs": 1,
11587
  "save_steps": 1000,
11588
  "stateful_callbacks": {
 
11597
  "attributes": {}
11598
  }
11599
  },
11600
+ "total_flos": 4.5581951041536e+18,
11601
  "train_batch_size": 64,
11602
  "trial_name": null,
11603
  "trial_params": null