Wilsonwin commited on
Commit
92e1cac
·
verified ·
1 Parent(s): 15cf6a8

Training in progress, step 7000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:539ec21ed2f2d5401b90d0d0b28a43621343b47ec158a5dc912ef7d73a069cdf
3
  size 328277848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34c8f104effe1a88e833bb692c7b75c569bc83b156fc0482dcf0ed735fda2945
3
  size 328277848
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c631e1446372f309276121049f5c8b7603bed555765afc41b0a5db7f194949eb
3
  size 318646859
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26c334859cc6eb4b1ef4006976a7f325a89208371148b26da8caf2a6573930ff
3
  size 318646859
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:59ab6babcc58d5a8a0338e2999283607960e6faa29d71e8d0c3f11e2480b272d
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2f6f65c0c5e2316b09e8cb46abab96e8f2ae754bdffd662e804a33277263cd9
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad5a3c7ee6384cdea60f7a41957135fc1d6a8e0bdd3b9a0dd5c4c46f69d638ec
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c87a18ccc821b756f8fecf0a1e33873b3617702f02d6f52c0042644b36bee0d
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0981584727149856,
6
  "eval_steps": 500,
7
- "global_step": 6500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4669,6 +4669,364 @@
4669
  "eval_samples_per_second": 248.518,
4670
  "eval_steps_per_second": 5.219,
4671
  "step": 6500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4672
  }
4673
  ],
4674
  "logging_steps": 10,
@@ -4688,7 +5046,7 @@
4688
  "attributes": {}
4689
  }
4690
  },
4691
- "total_flos": 2.1739484320314163e+17,
4692
  "train_batch_size": 48,
4693
  "trial_name": null,
4694
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.1826322013853692,
6
  "eval_steps": 500,
7
+ "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4669
  "eval_samples_per_second": 248.518,
4670
  "eval_steps_per_second": 5.219,
4671
  "step": 6500
4672
+ },
4673
+ {
4674
+ "epoch": 1.0998479472883933,
4675
+ "grad_norm": 0.4943171739578247,
4676
+ "learning_rate": 0.0001695828863325459,
4677
+ "loss": 4.467470932006836,
4678
+ "step": 6510
4679
+ },
4680
+ {
4681
+ "epoch": 1.101537421861801,
4682
+ "grad_norm": 0.474933385848999,
4683
+ "learning_rate": 0.00016910788818067434,
4684
+ "loss": 4.4371185302734375,
4685
+ "step": 6520
4686
+ },
4687
+ {
4688
+ "epoch": 1.1032268964352085,
4689
+ "grad_norm": 0.5118041634559631,
4690
+ "learning_rate": 0.0001686326951796907,
4691
+ "loss": 4.451096725463867,
4692
+ "step": 6530
4693
+ },
4694
+ {
4695
+ "epoch": 1.1049163710086163,
4696
+ "grad_norm": 0.5289651155471802,
4697
+ "learning_rate": 0.00016815731217528667,
4698
+ "loss": 4.448075485229492,
4699
+ "step": 6540
4700
+ },
4701
+ {
4702
+ "epoch": 1.106605845582024,
4703
+ "grad_norm": 0.5182890295982361,
4704
+ "learning_rate": 0.00016768174401509143,
4705
+ "loss": 4.467396926879883,
4706
+ "step": 6550
4707
+ },
4708
+ {
4709
+ "epoch": 1.1082953201554318,
4710
+ "grad_norm": 0.5209820866584778,
4711
+ "learning_rate": 0.0001672059955486223,
4712
+ "loss": 4.459186172485351,
4713
+ "step": 6560
4714
+ },
4715
+ {
4716
+ "epoch": 1.1099847947288393,
4717
+ "grad_norm": 0.48584309220314026,
4718
+ "learning_rate": 0.000166730071627235,
4719
+ "loss": 4.46546516418457,
4720
+ "step": 6570
4721
+ },
4722
+ {
4723
+ "epoch": 1.111674269302247,
4724
+ "grad_norm": 0.5017306804656982,
4725
+ "learning_rate": 0.00016625397710407487,
4726
+ "loss": 4.452592086791992,
4727
+ "step": 6580
4728
+ },
4729
+ {
4730
+ "epoch": 1.1133637438756547,
4731
+ "grad_norm": 0.46485376358032227,
4732
+ "learning_rate": 0.00016577771683402647,
4733
+ "loss": 4.46324348449707,
4734
+ "step": 6590
4735
+ },
4736
+ {
4737
+ "epoch": 1.1150532184490622,
4738
+ "grad_norm": 0.5154596567153931,
4739
+ "learning_rate": 0.00016530129567366483,
4740
+ "loss": 4.457768249511719,
4741
+ "step": 6600
4742
+ },
4743
+ {
4744
+ "epoch": 1.11674269302247,
4745
+ "grad_norm": 0.49490463733673096,
4746
+ "learning_rate": 0.0001648247184812054,
4747
+ "loss": 4.427638244628906,
4748
+ "step": 6610
4749
+ },
4750
+ {
4751
+ "epoch": 1.1184321675958777,
4752
+ "grad_norm": 0.4721022844314575,
4753
+ "learning_rate": 0.00016434799011645507,
4754
+ "loss": 4.4389793395996096,
4755
+ "step": 6620
4756
+ },
4757
+ {
4758
+ "epoch": 1.1201216421692854,
4759
+ "grad_norm": 0.4648183286190033,
4760
+ "learning_rate": 0.00016387111544076193,
4761
+ "loss": 4.460124969482422,
4762
+ "step": 6630
4763
+ },
4764
+ {
4765
+ "epoch": 1.121811116742693,
4766
+ "grad_norm": 0.5035665035247803,
4767
+ "learning_rate": 0.00016339409931696625,
4768
+ "loss": 4.439287185668945,
4769
+ "step": 6640
4770
+ },
4771
+ {
4772
+ "epoch": 1.1235005913161007,
4773
+ "grad_norm": 0.4910880923271179,
4774
+ "learning_rate": 0.00016291694660935065,
4775
+ "loss": 4.456634140014648,
4776
+ "step": 6650
4777
+ },
4778
+ {
4779
+ "epoch": 1.1251900658895084,
4780
+ "grad_norm": 0.48906245827674866,
4781
+ "learning_rate": 0.00016243966218359047,
4782
+ "loss": 4.428804016113281,
4783
+ "step": 6660
4784
+ },
4785
+ {
4786
+ "epoch": 1.126879540462916,
4787
+ "grad_norm": 0.5756556391716003,
4788
+ "learning_rate": 0.00016196225090670435,
4789
+ "loss": 4.411157608032227,
4790
+ "step": 6670
4791
+ },
4792
+ {
4793
+ "epoch": 1.1285690150363237,
4794
+ "grad_norm": 0.49011167883872986,
4795
+ "learning_rate": 0.0001614847176470043,
4796
+ "loss": 4.435109329223633,
4797
+ "step": 6680
4798
+ },
4799
+ {
4800
+ "epoch": 1.1302584896097314,
4801
+ "grad_norm": 0.4775542616844177,
4802
+ "learning_rate": 0.00016100706727404645,
4803
+ "loss": 4.428675842285156,
4804
+ "step": 6690
4805
+ },
4806
+ {
4807
+ "epoch": 1.131947964183139,
4808
+ "grad_norm": 0.5201391577720642,
4809
+ "learning_rate": 0.00016052930465858094,
4810
+ "loss": 4.4389808654785154,
4811
+ "step": 6700
4812
+ },
4813
+ {
4814
+ "epoch": 1.1336374387565467,
4815
+ "grad_norm": 0.49004724621772766,
4816
+ "learning_rate": 0.00016005143467250267,
4817
+ "loss": 4.459021377563476,
4818
+ "step": 6710
4819
+ },
4820
+ {
4821
+ "epoch": 1.1353269133299544,
4822
+ "grad_norm": 0.49011871218681335,
4823
+ "learning_rate": 0.00015957346218880124,
4824
+ "loss": 4.455972290039062,
4825
+ "step": 6720
4826
+ },
4827
+ {
4828
+ "epoch": 1.1370163879033621,
4829
+ "grad_norm": 0.5173168182373047,
4830
+ "learning_rate": 0.0001590953920815117,
4831
+ "loss": 4.443459701538086,
4832
+ "step": 6730
4833
+ },
4834
+ {
4835
+ "epoch": 1.1387058624767696,
4836
+ "grad_norm": 0.47700756788253784,
4837
+ "learning_rate": 0.00015861722922566436,
4838
+ "loss": 4.435110473632813,
4839
+ "step": 6740
4840
+ },
4841
+ {
4842
+ "epoch": 1.1403953370501774,
4843
+ "grad_norm": 0.5626063942909241,
4844
+ "learning_rate": 0.00015813897849723544,
4845
+ "loss": 4.432453536987305,
4846
+ "step": 6750
4847
+ },
4848
+ {
4849
+ "epoch": 1.142084811623585,
4850
+ "grad_norm": 0.49542316794395447,
4851
+ "learning_rate": 0.0001576606447730972,
4852
+ "loss": 4.4374950408935545,
4853
+ "step": 6760
4854
+ },
4855
+ {
4856
+ "epoch": 1.1437742861969928,
4857
+ "grad_norm": 0.5116281509399414,
4858
+ "learning_rate": 0.0001571822329309682,
4859
+ "loss": 4.423119354248047,
4860
+ "step": 6770
4861
+ },
4862
+ {
4863
+ "epoch": 1.1454637607704004,
4864
+ "grad_norm": 0.4868847131729126,
4865
+ "learning_rate": 0.00015670374784936371,
4866
+ "loss": 4.4402107238769535,
4867
+ "step": 6780
4868
+ },
4869
+ {
4870
+ "epoch": 1.147153235343808,
4871
+ "grad_norm": 0.4938635230064392,
4872
+ "learning_rate": 0.00015622519440754566,
4873
+ "loss": 4.424631881713867,
4874
+ "step": 6790
4875
+ },
4876
+ {
4877
+ "epoch": 1.1488427099172158,
4878
+ "grad_norm": 0.5740174651145935,
4879
+ "learning_rate": 0.0001557465774854732,
4880
+ "loss": 4.450838470458985,
4881
+ "step": 6800
4882
+ },
4883
+ {
4884
+ "epoch": 1.1505321844906233,
4885
+ "grad_norm": 0.4828670918941498,
4886
+ "learning_rate": 0.0001552679019637528,
4887
+ "loss": 4.438276290893555,
4888
+ "step": 6810
4889
+ },
4890
+ {
4891
+ "epoch": 1.152221659064031,
4892
+ "grad_norm": 0.4659689664840698,
4893
+ "learning_rate": 0.00015478917272358848,
4894
+ "loss": 4.426282501220703,
4895
+ "step": 6820
4896
+ },
4897
+ {
4898
+ "epoch": 1.1539111336374388,
4899
+ "grad_norm": 0.4927656352519989,
4900
+ "learning_rate": 0.000154310394646732,
4901
+ "loss": 4.464373016357422,
4902
+ "step": 6830
4903
+ },
4904
+ {
4905
+ "epoch": 1.1556006082108463,
4906
+ "grad_norm": 0.5161291360855103,
4907
+ "learning_rate": 0.00015383157261543318,
4908
+ "loss": 4.416297531127929,
4909
+ "step": 6840
4910
+ },
4911
+ {
4912
+ "epoch": 1.157290082784254,
4913
+ "grad_norm": 0.4933563768863678,
4914
+ "learning_rate": 0.00015335271151239,
4915
+ "loss": 4.420982742309571,
4916
+ "step": 6850
4917
+ },
4918
+ {
4919
+ "epoch": 1.1589795573576618,
4920
+ "grad_norm": 0.4847005307674408,
4921
+ "learning_rate": 0.00015287381622069892,
4922
+ "loss": 4.416022872924804,
4923
+ "step": 6860
4924
+ },
4925
+ {
4926
+ "epoch": 1.1606690319310695,
4927
+ "grad_norm": 0.4981960654258728,
4928
+ "learning_rate": 0.00015239489162380504,
4929
+ "loss": 4.422767639160156,
4930
+ "step": 6870
4931
+ },
4932
+ {
4933
+ "epoch": 1.162358506504477,
4934
+ "grad_norm": 0.5001937747001648,
4935
+ "learning_rate": 0.0001519159426054522,
4936
+ "loss": 4.4368339538574215,
4937
+ "step": 6880
4938
+ },
4939
+ {
4940
+ "epoch": 1.1640479810778848,
4941
+ "grad_norm": 0.5044972896575928,
4942
+ "learning_rate": 0.0001514369740496334,
4943
+ "loss": 4.411078643798828,
4944
+ "step": 6890
4945
+ },
4946
+ {
4947
+ "epoch": 1.1657374556512925,
4948
+ "grad_norm": 0.4734691083431244,
4949
+ "learning_rate": 0.00015095799084054073,
4950
+ "loss": 4.438079071044922,
4951
+ "step": 6900
4952
+ },
4953
+ {
4954
+ "epoch": 1.1674269302247002,
4955
+ "grad_norm": 0.49377161264419556,
4956
+ "learning_rate": 0.00015047899786251587,
4957
+ "loss": 4.442370986938476,
4958
+ "step": 6910
4959
+ },
4960
+ {
4961
+ "epoch": 1.1691164047981077,
4962
+ "grad_norm": 0.5010132193565369,
4963
+ "learning_rate": 0.00015,
4964
+ "loss": 4.442108917236328,
4965
+ "step": 6920
4966
+ },
4967
+ {
4968
+ "epoch": 1.1708058793715155,
4969
+ "grad_norm": 0.5035766959190369,
4970
+ "learning_rate": 0.0001495210021374841,
4971
+ "loss": 4.430604553222656,
4972
+ "step": 6930
4973
+ },
4974
+ {
4975
+ "epoch": 1.1724953539449232,
4976
+ "grad_norm": 0.4899141788482666,
4977
+ "learning_rate": 0.00014904200915945927,
4978
+ "loss": 4.435578918457031,
4979
+ "step": 6940
4980
+ },
4981
+ {
4982
+ "epoch": 1.1741848285183307,
4983
+ "grad_norm": 0.4718686044216156,
4984
+ "learning_rate": 0.00014856302595036663,
4985
+ "loss": 4.429093551635742,
4986
+ "step": 6950
4987
+ },
4988
+ {
4989
+ "epoch": 1.1758743030917385,
4990
+ "grad_norm": 0.4881162941455841,
4991
+ "learning_rate": 0.00014808405739454776,
4992
+ "loss": 4.408749008178711,
4993
+ "step": 6960
4994
+ },
4995
+ {
4996
+ "epoch": 1.1775637776651462,
4997
+ "grad_norm": 0.46740713715553284,
4998
+ "learning_rate": 0.00014760510837619493,
4999
+ "loss": 4.419464492797852,
5000
+ "step": 6970
5001
+ },
5002
+ {
5003
+ "epoch": 1.1792532522385537,
5004
+ "grad_norm": 0.4737609922885895,
5005
+ "learning_rate": 0.00014712618377930105,
5006
+ "loss": 4.421468353271484,
5007
+ "step": 6980
5008
+ },
5009
+ {
5010
+ "epoch": 1.1809427268119614,
5011
+ "grad_norm": 0.4975055754184723,
5012
+ "learning_rate": 0.00014664728848760996,
5013
+ "loss": 4.422280502319336,
5014
+ "step": 6990
5015
+ },
5016
+ {
5017
+ "epoch": 1.1826322013853692,
5018
+ "grad_norm": 0.4839191734790802,
5019
+ "learning_rate": 0.00014616842738456682,
5020
+ "loss": 4.395424652099609,
5021
+ "step": 7000
5022
+ },
5023
+ {
5024
+ "epoch": 1.1826322013853692,
5025
+ "eval_loss": 4.412718772888184,
5026
+ "eval_runtime": 4.0717,
5027
+ "eval_samples_per_second": 245.6,
5028
+ "eval_steps_per_second": 5.158,
5029
+ "step": 7000
5030
  }
5031
  ],
5032
  "logging_steps": 10,
 
5046
  "attributes": {}
5047
  }
5048
  },
5049
+ "total_flos": 2.3411768424608563e+17,
5050
  "train_batch_size": 48,
5051
  "trial_name": null,
5052
  "trial_params": null