minpeter commited on
Commit
174293c
·
verified ·
1 Parent(s): b2bc1f3

Training in progress, step 20000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f397f4e5642ab9da594752daa8ae50f67bd1c0633a0f55e9742963fc8094fc07
3
  size 373077376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df03cae2dd432c211456aab943782bf83ba84e08565c4c981659cb89c83a578e
3
  size 373077376
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2947a98ddfbd91d8b159be483158ff85ad080e24af7608cd8524985d5ff37696
3
  size 209816139
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4292287a7fa690fe53e7b389faee8373877f88d995cc45d3321aeb77bf8c4af6
3
  size 209816139
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c4bf7692b5f5edcd474e14a77a5f13e3f7c7765bb40a870ecf6eeef166453cdc
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22e8bb13b8b5cd110e015717953ca96d5c03c35ddfe30ca45c1fab9651d07421
3
  size 14917
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67c08732e07123ac792d039d19d16df7f1963cb3c04d8bb64d087ce8609b973b
3
  size 14917
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76ace0471241ab08ffd32878e593821b741d6b0b68bcb601ea44671e5ef83eef
3
  size 14917
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c27272cd09ade13a826f643a30f8708da8615de25fd4349edd6f9144bb4f5503
3
  size 1401
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba4436ed0869bacf238e760f8e2f2044a22ff86693a77a3015046ef89f00fc7e
3
  size 1401
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": 2000,
3
  "best_metric": 9.218317031860352,
4
  "best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
5
- "epoch": 0.05933902571565988,
6
  "eval_steps": 1000,
7
- "global_step": 19000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5480,6 +5480,294 @@
5480
  "eval_samples_per_second": 50.899,
5481
  "eval_steps_per_second": 3.189,
5482
  "step": 19000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5483
  }
5484
  ],
5485
  "logging_steps": 25,
@@ -5499,7 +5787,7 @@
5499
  "attributes": {}
5500
  }
5501
  },
5502
- "total_flos": 2.4094740084298875e+18,
5503
  "train_batch_size": 8,
5504
  "trial_name": null,
5505
  "trial_params": null
 
2
  "best_global_step": 2000,
3
  "best_metric": 9.218317031860352,
4
  "best_model_checkpoint": "./artifacts/models/base-250725-test/checkpoint-2000",
5
+ "epoch": 0.06246213233227356,
6
  "eval_steps": 1000,
7
+ "global_step": 20000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5480
  "eval_samples_per_second": 50.899,
5481
  "eval_steps_per_second": 3.189,
5482
  "step": 19000
5483
+ },
5484
+ {
5485
+ "epoch": 0.059417103381075226,
5486
+ "grad_norm": 41.5,
5487
+ "learning_rate": 0.0009997577752880041,
5488
+ "loss": 32.8132,
5489
+ "step": 19025
5490
+ },
5491
+ {
5492
+ "epoch": 0.05949518104649056,
5493
+ "grad_norm": 43.75,
5494
+ "learning_rate": 0.0009997537406276557,
5495
+ "loss": 32.9501,
5496
+ "step": 19050
5497
+ },
5498
+ {
5499
+ "epoch": 0.059573258711905906,
5500
+ "grad_norm": 45.25,
5501
+ "learning_rate": 0.0009997496726505228,
5502
+ "loss": 32.7061,
5503
+ "step": 19075
5504
+ },
5505
+ {
5506
+ "epoch": 0.05965133637732125,
5507
+ "grad_norm": 37.5,
5508
+ "learning_rate": 0.0009997455713568763,
5509
+ "loss": 32.7181,
5510
+ "step": 19100
5511
+ },
5512
+ {
5513
+ "epoch": 0.059729414042736594,
5514
+ "grad_norm": 41.0,
5515
+ "learning_rate": 0.00099974143674699,
5516
+ "loss": 32.554,
5517
+ "step": 19125
5518
+ },
5519
+ {
5520
+ "epoch": 0.05980749170815193,
5521
+ "grad_norm": 41.5,
5522
+ "learning_rate": 0.0009997372688211395,
5523
+ "loss": 32.7137,
5524
+ "step": 19150
5525
+ },
5526
+ {
5527
+ "epoch": 0.059885569373567274,
5528
+ "grad_norm": 45.0,
5529
+ "learning_rate": 0.0009997330675796023,
5530
+ "loss": 33.0025,
5531
+ "step": 19175
5532
+ },
5533
+ {
5534
+ "epoch": 0.05996364703898262,
5535
+ "grad_norm": 42.0,
5536
+ "learning_rate": 0.000999728833022659,
5537
+ "loss": 32.9643,
5538
+ "step": 19200
5539
+ },
5540
+ {
5541
+ "epoch": 0.06004172470439796,
5542
+ "grad_norm": 52.5,
5543
+ "learning_rate": 0.0009997245651505915,
5544
+ "loss": 32.8268,
5545
+ "step": 19225
5546
+ },
5547
+ {
5548
+ "epoch": 0.0601198023698133,
5549
+ "grad_norm": 43.0,
5550
+ "learning_rate": 0.0009997202639636844,
5551
+ "loss": 32.8,
5552
+ "step": 19250
5553
+ },
5554
+ {
5555
+ "epoch": 0.06019788003522864,
5556
+ "grad_norm": 56.5,
5557
+ "learning_rate": 0.0009997159294622246,
5558
+ "loss": 32.9133,
5559
+ "step": 19275
5560
+ },
5561
+ {
5562
+ "epoch": 0.060275957700643985,
5563
+ "grad_norm": 44.25,
5564
+ "learning_rate": 0.000999711561646501,
5565
+ "loss": 32.8573,
5566
+ "step": 19300
5567
+ },
5568
+ {
5569
+ "epoch": 0.06035403536605933,
5570
+ "grad_norm": 44.0,
5571
+ "learning_rate": 0.0009997071605168043,
5572
+ "loss": 32.7512,
5573
+ "step": 19325
5574
+ },
5575
+ {
5576
+ "epoch": 0.060432113031474666,
5577
+ "grad_norm": 36.5,
5578
+ "learning_rate": 0.000999702726073429,
5579
+ "loss": 32.9202,
5580
+ "step": 19350
5581
+ },
5582
+ {
5583
+ "epoch": 0.06051019069689001,
5584
+ "grad_norm": 40.0,
5585
+ "learning_rate": 0.0009996982583166695,
5586
+ "loss": 32.942,
5587
+ "step": 19375
5588
+ },
5589
+ {
5590
+ "epoch": 0.06058826836230535,
5591
+ "grad_norm": 39.0,
5592
+ "learning_rate": 0.0009996937572468246,
5593
+ "loss": 32.8775,
5594
+ "step": 19400
5595
+ },
5596
+ {
5597
+ "epoch": 0.0606663460277207,
5598
+ "grad_norm": 37.0,
5599
+ "learning_rate": 0.000999689222864194,
5600
+ "loss": 32.8532,
5601
+ "step": 19425
5602
+ },
5603
+ {
5604
+ "epoch": 0.06074442369313603,
5605
+ "grad_norm": 47.25,
5606
+ "learning_rate": 0.0009996846551690798,
5607
+ "loss": 32.9941,
5608
+ "step": 19450
5609
+ },
5610
+ {
5611
+ "epoch": 0.06082250135855138,
5612
+ "grad_norm": 38.0,
5613
+ "learning_rate": 0.0009996800541617868,
5614
+ "loss": 32.8616,
5615
+ "step": 19475
5616
+ },
5617
+ {
5618
+ "epoch": 0.06090057902396672,
5619
+ "grad_norm": 39.5,
5620
+ "learning_rate": 0.0009996754198426216,
5621
+ "loss": 32.9031,
5622
+ "step": 19500
5623
+ },
5624
+ {
5625
+ "epoch": 0.060978656689382064,
5626
+ "grad_norm": 44.5,
5627
+ "learning_rate": 0.0009996707522118933,
5628
+ "loss": 33.0028,
5629
+ "step": 19525
5630
+ },
5631
+ {
5632
+ "epoch": 0.0610567343547974,
5633
+ "grad_norm": 39.75,
5634
+ "learning_rate": 0.0009996660512699128,
5635
+ "loss": 32.8195,
5636
+ "step": 19550
5637
+ },
5638
+ {
5639
+ "epoch": 0.061134812020212745,
5640
+ "grad_norm": 40.75,
5641
+ "learning_rate": 0.0009996613170169936,
5642
+ "loss": 32.571,
5643
+ "step": 19575
5644
+ },
5645
+ {
5646
+ "epoch": 0.06121288968562809,
5647
+ "grad_norm": 36.75,
5648
+ "learning_rate": 0.0009996565494534517,
5649
+ "loss": 32.5517,
5650
+ "step": 19600
5651
+ },
5652
+ {
5653
+ "epoch": 0.06129096735104343,
5654
+ "grad_norm": 38.0,
5655
+ "learning_rate": 0.0009996517485796044,
5656
+ "loss": 32.5484,
5657
+ "step": 19625
5658
+ },
5659
+ {
5660
+ "epoch": 0.06136904501645877,
5661
+ "grad_norm": 41.75,
5662
+ "learning_rate": 0.000999646914395772,
5663
+ "loss": 32.5895,
5664
+ "step": 19650
5665
+ },
5666
+ {
5667
+ "epoch": 0.06144712268187411,
5668
+ "grad_norm": 42.0,
5669
+ "learning_rate": 0.0009996420469022766,
5670
+ "loss": 32.8765,
5671
+ "step": 19675
5672
+ },
5673
+ {
5674
+ "epoch": 0.061525200347289456,
5675
+ "grad_norm": 38.5,
5676
+ "learning_rate": 0.0009996371460994431,
5677
+ "loss": 32.8793,
5678
+ "step": 19700
5679
+ },
5680
+ {
5681
+ "epoch": 0.0616032780127048,
5682
+ "grad_norm": 40.25,
5683
+ "learning_rate": 0.0009996322119875977,
5684
+ "loss": 33.0708,
5685
+ "step": 19725
5686
+ },
5687
+ {
5688
+ "epoch": 0.06168135567812014,
5689
+ "grad_norm": 38.0,
5690
+ "learning_rate": 0.00099962724456707,
5691
+ "loss": 33.188,
5692
+ "step": 19750
5693
+ },
5694
+ {
5695
+ "epoch": 0.06175943334353548,
5696
+ "grad_norm": 49.0,
5697
+ "learning_rate": 0.0009996222438381904,
5698
+ "loss": 33.2918,
5699
+ "step": 19775
5700
+ },
5701
+ {
5702
+ "epoch": 0.061837511008950824,
5703
+ "grad_norm": 44.75,
5704
+ "learning_rate": 0.0009996172098012928,
5705
+ "loss": 33.4949,
5706
+ "step": 19800
5707
+ },
5708
+ {
5709
+ "epoch": 0.06191558867436617,
5710
+ "grad_norm": 43.25,
5711
+ "learning_rate": 0.0009996121424567126,
5712
+ "loss": 33.8741,
5713
+ "step": 19825
5714
+ },
5715
+ {
5716
+ "epoch": 0.061993666339781504,
5717
+ "grad_norm": 41.75,
5718
+ "learning_rate": 0.0009996070418047877,
5719
+ "loss": 33.6041,
5720
+ "step": 19850
5721
+ },
5722
+ {
5723
+ "epoch": 0.06207174400519685,
5724
+ "grad_norm": 40.25,
5725
+ "learning_rate": 0.000999601907845858,
5726
+ "loss": 33.6722,
5727
+ "step": 19875
5728
+ },
5729
+ {
5730
+ "epoch": 0.06214982167061219,
5731
+ "grad_norm": 40.5,
5732
+ "learning_rate": 0.000999596740580266,
5733
+ "loss": 33.484,
5734
+ "step": 19900
5735
+ },
5736
+ {
5737
+ "epoch": 0.062227899336027535,
5738
+ "grad_norm": 46.25,
5739
+ "learning_rate": 0.000999591540008356,
5740
+ "loss": 33.7352,
5741
+ "step": 19925
5742
+ },
5743
+ {
5744
+ "epoch": 0.06230597700144287,
5745
+ "grad_norm": 48.5,
5746
+ "learning_rate": 0.0009995863061304747,
5747
+ "loss": 33.9541,
5748
+ "step": 19950
5749
+ },
5750
+ {
5751
+ "epoch": 0.062384054666858216,
5752
+ "grad_norm": 44.0,
5753
+ "learning_rate": 0.0009995810389469711,
5754
+ "loss": 34.2383,
5755
+ "step": 19975
5756
+ },
5757
+ {
5758
+ "epoch": 0.06246213233227356,
5759
+ "grad_norm": 40.75,
5760
+ "learning_rate": 0.0009995757384581964,
5761
+ "loss": 33.8251,
5762
+ "step": 20000
5763
+ },
5764
+ {
5765
+ "epoch": 0.06246213233227356,
5766
+ "eval_loss": 34.19303512573242,
5767
+ "eval_runtime": 102.3811,
5768
+ "eval_samples_per_second": 50.82,
5769
+ "eval_steps_per_second": 3.184,
5770
+ "step": 20000
5771
  }
5772
  ],
5773
  "logging_steps": 25,
 
5787
  "attributes": {}
5788
  }
5789
  },
5790
+ "total_flos": 2.53630733446493e+18,
5791
  "train_batch_size": 8,
5792
  "trial_name": null,
5793
  "trial_params": null