Wilsonwin commited on
Commit
7ba2ca8
·
verified ·
1 Parent(s): 20211a8

Training in progress, step 11000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:289c5f7a117bc2146cbc4b2792b4927c7ce3188416b6d12c24b53c92eac18575
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c44d6488e642b06142cd46a4094e9c0a6f469f36493fd58ad5c3d0e96ec36e4
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:78aff07fca8298b71f09331247658b387bcda955f1390e18f38dbc6caf805220
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf4416085db9fad5f4e22c85af55c9af92326a09616b1325abe5de95aa64bef3
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88a0861f9132710b799b6fa2e167a1b0b3b522e3a288bf5f69138ff390819689
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a9c47849ad44860f45019fca12bd8b47e7589be1317a01ad6705b924156a6be
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f96c5626b64f285225e7bd0540a942ee4b22f3baba9f0a0f2189b039b8bf46c
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d77946d2c30708215d82675369c6b0f4ea0ac50e0bfa8851a58c893e34baac40
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.7739483020780538,
6
  "eval_steps": 500,
7
- "global_step": 10500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7533,6 +7533,364 @@
7533
  "eval_samples_per_second": 276.668,
7534
  "eval_steps_per_second": 5.81,
7535
  "step": 10500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7536
  }
7537
  ],
7538
  "logging_steps": 10,
@@ -7552,7 +7910,7 @@
7552
  "attributes": {}
7553
  }
7554
  },
7555
- "total_flos": 3.511775715466936e+17,
7556
  "train_batch_size": 48,
7557
  "trial_name": null,
7558
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.8584220307484371,
6
  "eval_steps": 500,
7
+ "global_step": 11000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7533
  "eval_samples_per_second": 276.668,
7534
  "eval_steps_per_second": 5.81,
7535
  "step": 10500
7536
+ },
7537
+ {
7538
+ "epoch": 1.7756377766514615,
7539
+ "grad_norm": 0.4605439007282257,
7540
+ "learning_rate": 1.330666277084756e-05,
7541
+ "loss": 4.323733139038086,
7542
+ "step": 10510
7543
+ },
7544
+ {
7545
+ "epoch": 1.777327251224869,
7546
+ "grad_norm": 0.45236051082611084,
7547
+ "learning_rate": 1.3110123947820345e-05,
7548
+ "loss": 4.295808410644531,
7549
+ "step": 10520
7550
+ },
7551
+ {
7552
+ "epoch": 1.7790167257982767,
7553
+ "grad_norm": 0.44899049401283264,
7554
+ "learning_rate": 1.2914981033673616e-05,
7555
+ "loss": 4.3012432098388675,
7556
+ "step": 10530
7557
+ },
7558
+ {
7559
+ "epoch": 1.7807062003716845,
7560
+ "grad_norm": 0.45084264874458313,
7561
+ "learning_rate": 1.2721236018340675e-05,
7562
+ "loss": 4.251883316040039,
7563
+ "step": 10540
7564
+ },
7565
+ {
7566
+ "epoch": 1.782395674945092,
7567
+ "grad_norm": 0.4470881521701813,
7568
+ "learning_rate": 1.2528890877500025e-05,
7569
+ "loss": 4.3038276672363285,
7570
+ "step": 10550
7571
+ },
7572
+ {
7573
+ "epoch": 1.7840851495184997,
7574
+ "grad_norm": 0.4608835279941559,
7575
+ "learning_rate": 1.2337947572555257e-05,
7576
+ "loss": 4.313830947875976,
7577
+ "step": 10560
7578
+ },
7579
+ {
7580
+ "epoch": 1.7857746240919075,
7581
+ "grad_norm": 0.4542098045349121,
7582
+ "learning_rate": 1.2148408050614961e-05,
7583
+ "loss": 4.274040985107422,
7584
+ "step": 10570
7585
+ },
7586
+ {
7587
+ "epoch": 1.787464098665315,
7588
+ "grad_norm": 0.47143012285232544,
7589
+ "learning_rate": 1.1960274244472928e-05,
7590
+ "loss": 4.278904724121094,
7591
+ "step": 10580
7592
+ },
7593
+ {
7594
+ "epoch": 1.7891535732387227,
7595
+ "grad_norm": 0.4651215970516205,
7596
+ "learning_rate": 1.1773548072588352e-05,
7597
+ "loss": 4.294227600097656,
7598
+ "step": 10590
7599
+ },
7600
+ {
7601
+ "epoch": 1.7908430478121304,
7602
+ "grad_norm": 0.46851372718811035,
7603
+ "learning_rate": 1.158823143906652e-05,
7604
+ "loss": 4.300647735595703,
7605
+ "step": 10600
7606
+ },
7607
+ {
7608
+ "epoch": 1.792532522385538,
7609
+ "grad_norm": 0.46132445335388184,
7610
+ "learning_rate": 1.1404326233639056e-05,
7611
+ "loss": 4.321317291259765,
7612
+ "step": 10610
7613
+ },
7614
+ {
7615
+ "epoch": 1.794221996958946,
7616
+ "grad_norm": 0.4654752016067505,
7617
+ "learning_rate": 1.1221834331644857e-05,
7618
+ "loss": 4.292523193359375,
7619
+ "step": 10620
7620
+ },
7621
+ {
7622
+ "epoch": 1.7959114715323534,
7623
+ "grad_norm": 0.4643712043762207,
7624
+ "learning_rate": 1.1040757594010908e-05,
7625
+ "loss": 4.292951583862305,
7626
+ "step": 10630
7627
+ },
7628
+ {
7629
+ "epoch": 1.7976009461057612,
7630
+ "grad_norm": 0.4542069435119629,
7631
+ "learning_rate": 1.0861097867233375e-05,
7632
+ "loss": 4.301964187622071,
7633
+ "step": 10640
7634
+ },
7635
+ {
7636
+ "epoch": 1.799290420679169,
7637
+ "grad_norm": 0.4690853953361511,
7638
+ "learning_rate": 1.0682856983358645e-05,
7639
+ "loss": 4.299973678588867,
7640
+ "step": 10650
7641
+ },
7642
+ {
7643
+ "epoch": 1.8009798952525764,
7644
+ "grad_norm": 0.45049384236335754,
7645
+ "learning_rate": 1.050603675996477e-05,
7646
+ "loss": 4.2946735382080075,
7647
+ "step": 10660
7648
+ },
7649
+ {
7650
+ "epoch": 1.8026693698259841,
7651
+ "grad_norm": 0.4500885009765625,
7652
+ "learning_rate": 1.0330639000142877e-05,
7653
+ "loss": 4.296617889404297,
7654
+ "step": 10670
7655
+ },
7656
+ {
7657
+ "epoch": 1.8043588443993919,
7658
+ "grad_norm": 0.45720404386520386,
7659
+ "learning_rate": 1.0156665492478794e-05,
7660
+ "loss": 4.294065856933594,
7661
+ "step": 10680
7662
+ },
7663
+ {
7664
+ "epoch": 1.8060483189727994,
7665
+ "grad_norm": 0.455400675535202,
7666
+ "learning_rate": 9.984118011034787e-06,
7667
+ "loss": 4.294452285766601,
7668
+ "step": 10690
7669
+ },
7670
+ {
7671
+ "epoch": 1.8077377935462071,
7672
+ "grad_norm": 0.4519326388835907,
7673
+ "learning_rate": 9.812998315331449e-06,
7674
+ "loss": 4.294923782348633,
7675
+ "step": 10700
7676
+ },
7677
+ {
7678
+ "epoch": 1.8094272681196149,
7679
+ "grad_norm": 0.4522061049938202,
7680
+ "learning_rate": 9.64330815032991e-06,
7681
+ "loss": 4.294776535034179,
7682
+ "step": 10710
7683
+ },
7684
+ {
7685
+ "epoch": 1.8111167426930224,
7686
+ "grad_norm": 0.44188153743743896,
7687
+ "learning_rate": 9.475049246413801e-06,
7688
+ "loss": 4.282304382324218,
7689
+ "step": 10720
7690
+ },
7691
+ {
7692
+ "epoch": 1.81280621726643,
7693
+ "grad_norm": 0.4490991532802582,
7694
+ "learning_rate": 9.308223319371789e-06,
7695
+ "loss": 4.309776306152344,
7696
+ "step": 10730
7697
+ },
7698
+ {
7699
+ "epoch": 1.8144956918398378,
7700
+ "grad_norm": 0.44931647181510925,
7701
+ "learning_rate": 9.142832070380051e-06,
7702
+ "loss": 4.275448608398437,
7703
+ "step": 10740
7704
+ },
7705
+ {
7706
+ "epoch": 1.8161851664132453,
7707
+ "grad_norm": 0.4667072296142578,
7708
+ "learning_rate": 8.978877185984895e-06,
7709
+ "loss": 4.278246688842773,
7710
+ "step": 10750
7711
+ },
7712
+ {
7713
+ "epoch": 1.8178746409866533,
7714
+ "grad_norm": 0.46589362621307373,
7715
+ "learning_rate": 8.816360338085537e-06,
7716
+ "loss": 4.319537734985351,
7717
+ "step": 10760
7718
+ },
7719
+ {
7720
+ "epoch": 1.8195641155600608,
7721
+ "grad_norm": 0.4552581012248993,
7722
+ "learning_rate": 8.655283183917094e-06,
7723
+ "loss": 4.305705642700195,
7724
+ "step": 10770
7725
+ },
7726
+ {
7727
+ "epoch": 1.8212535901334683,
7728
+ "grad_norm": 0.4621904194355011,
7729
+ "learning_rate": 8.495647366033708e-06,
7730
+ "loss": 4.302457427978515,
7731
+ "step": 10780
7732
+ },
7733
+ {
7734
+ "epoch": 1.8229430647068763,
7735
+ "grad_norm": 0.44105246663093567,
7736
+ "learning_rate": 8.33745451229173e-06,
7737
+ "loss": 4.288130187988282,
7738
+ "step": 10790
7739
+ },
7740
+ {
7741
+ "epoch": 1.8246325392802838,
7742
+ "grad_norm": 0.44687995314598083,
7743
+ "learning_rate": 8.180706235833162e-06,
7744
+ "loss": 4.27890625,
7745
+ "step": 10800
7746
+ },
7747
+ {
7748
+ "epoch": 1.8263220138536915,
7749
+ "grad_norm": 0.45001620054244995,
7750
+ "learning_rate": 8.025404135069207e-06,
7751
+ "loss": 4.305799102783203,
7752
+ "step": 10810
7753
+ },
7754
+ {
7755
+ "epoch": 1.8280114884270993,
7756
+ "grad_norm": 0.4509744942188263,
7757
+ "learning_rate": 7.871549793663985e-06,
7758
+ "loss": 4.28497314453125,
7759
+ "step": 10820
7760
+ },
7761
+ {
7762
+ "epoch": 1.8297009630005068,
7763
+ "grad_norm": 0.4606933295726776,
7764
+ "learning_rate": 7.719144780518315e-06,
7765
+ "loss": 4.279584121704102,
7766
+ "step": 10830
7767
+ },
7768
+ {
7769
+ "epoch": 1.8313904375739145,
7770
+ "grad_norm": 0.453891396522522,
7771
+ "learning_rate": 7.568190649753753e-06,
7772
+ "loss": 4.292636871337891,
7773
+ "step": 10840
7774
+ },
7775
+ {
7776
+ "epoch": 1.8330799121473222,
7777
+ "grad_norm": 0.4472525715827942,
7778
+ "learning_rate": 7.418688940696843e-06,
7779
+ "loss": 4.301625061035156,
7780
+ "step": 10850
7781
+ },
7782
+ {
7783
+ "epoch": 1.8347693867207298,
7784
+ "grad_norm": 0.45282483100891113,
7785
+ "learning_rate": 7.270641177863251e-06,
7786
+ "loss": 4.318436813354492,
7787
+ "step": 10860
7788
+ },
7789
+ {
7790
+ "epoch": 1.8364588612941375,
7791
+ "grad_norm": 0.4516963064670563,
7792
+ "learning_rate": 7.124048870942301e-06,
7793
+ "loss": 4.304822540283203,
7794
+ "step": 10870
7795
+ },
7796
+ {
7797
+ "epoch": 1.8381483358675452,
7798
+ "grad_norm": 0.4429229497909546,
7799
+ "learning_rate": 6.97891351478157e-06,
7800
+ "loss": 4.284355163574219,
7801
+ "step": 10880
7802
+ },
7803
+ {
7804
+ "epoch": 1.8398378104409527,
7805
+ "grad_norm": 0.4634481966495514,
7806
+ "learning_rate": 6.83523658937174e-06,
7807
+ "loss": 4.301011276245117,
7808
+ "step": 10890
7809
+ },
7810
+ {
7811
+ "epoch": 1.8415272850143607,
7812
+ "grad_norm": 0.44028082489967346,
7813
+ "learning_rate": 6.693019559831319e-06,
7814
+ "loss": 4.270964431762695,
7815
+ "step": 10900
7816
+ },
7817
+ {
7818
+ "epoch": 1.8432167595877682,
7819
+ "grad_norm": 0.44995155930519104,
7820
+ "learning_rate": 6.552263876391878e-06,
7821
+ "loss": 4.295645523071289,
7822
+ "step": 10910
7823
+ },
7824
+ {
7825
+ "epoch": 1.8449062341611757,
7826
+ "grad_norm": 0.45434826612472534,
7827
+ "learning_rate": 6.412970974383069e-06,
7828
+ "loss": 4.267873001098633,
7829
+ "step": 10920
7830
+ },
7831
+ {
7832
+ "epoch": 1.8465957087345837,
7833
+ "grad_norm": 0.4576048254966736,
7834
+ "learning_rate": 6.275142274218264e-06,
7835
+ "loss": 4.275784683227539,
7836
+ "step": 10930
7837
+ },
7838
+ {
7839
+ "epoch": 1.8482851833079912,
7840
+ "grad_norm": 0.447792649269104,
7841
+ "learning_rate": 6.138779181379777e-06,
7842
+ "loss": 4.311757659912109,
7843
+ "step": 10940
7844
+ },
7845
+ {
7846
+ "epoch": 1.849974657881399,
7847
+ "grad_norm": 0.45298346877098083,
7848
+ "learning_rate": 6.003883086404709e-06,
7849
+ "loss": 4.295004272460938,
7850
+ "step": 10950
7851
+ },
7852
+ {
7853
+ "epoch": 1.8516641324548067,
7854
+ "grad_norm": 0.4503316879272461,
7855
+ "learning_rate": 5.870455364870747e-06,
7856
+ "loss": 4.2887310028076175,
7857
+ "step": 10960
7858
+ },
7859
+ {
7860
+ "epoch": 1.8533536070282142,
7861
+ "grad_norm": 0.45251592993736267,
7862
+ "learning_rate": 5.738497377382117e-06,
7863
+ "loss": 4.286402893066406,
7864
+ "step": 10970
7865
+ },
7866
+ {
7867
+ "epoch": 1.855043081601622,
7868
+ "grad_norm": 0.44080156087875366,
7869
+ "learning_rate": 5.608010469555674e-06,
7870
+ "loss": 4.308802795410156,
7871
+ "step": 10980
7872
+ },
7873
+ {
7874
+ "epoch": 1.8567325561750296,
7875
+ "grad_norm": 0.4467971622943878,
7876
+ "learning_rate": 5.4789959720071995e-06,
7877
+ "loss": 4.299658584594726,
7878
+ "step": 10990
7879
+ },
7880
+ {
7881
+ "epoch": 1.8584220307484371,
7882
+ "grad_norm": 0.4504829943180084,
7883
+ "learning_rate": 5.3514552003379395e-06,
7884
+ "loss": 4.2919353485107425,
7885
+ "step": 11000
7886
+ },
7887
+ {
7888
+ "epoch": 1.8584220307484371,
7889
+ "eval_loss": 4.247786521911621,
7890
+ "eval_runtime": 3.5852,
7891
+ "eval_samples_per_second": 278.924,
7892
+ "eval_steps_per_second": 5.857,
7893
+ "step": 11000
7894
  }
7895
  ],
7896
  "logging_steps": 10,
 
7910
  "attributes": {}
7911
  }
7912
  },
7913
+ "total_flos": 3.679004125896376e+17,
7914
  "train_batch_size": 48,
7915
  "trial_name": null,
7916
  "trial_params": null