Azrail commited on
Commit
2b17472
·
verified ·
1 Parent(s): b04777b

Training in progress, step 56000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:62e4ec5f596aeddac39f75a6501f66ecd7eb297d85fd39f281237c384adec887
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3d360be7fe2543c78a1f7ac85877b8ebcc55a8fc7ce7ea8871241b28859be01
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6aa82a32a09e79af011cf35188194304359148308b76399c6d5815593f337709
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c586225c37191bdb386336c5aa7eba4c313537c276b8b87dd7fefbcb4a3ca975
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a5eacfa99e53a8a1de73851121ef39f03223e9cc67398ac06a0e84e6dbf4ae3
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f1d7953b9adf97d81c8d5df7c90f2cd3786e196584c751d3c25ee459604bb2b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5df6e1f8ed049732a2e5d49c46b32207c644d0cb43e6b3e615ea32a67128cbab
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56641b065a04f5f757422df636842a91ff2acd7d071b6672db512bd44af71813
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3699574380783738,
6
  "eval_steps": 500,
7
- "global_step": 55000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9798,11 +9798,189 @@
9798
  "eval_steps_per_second": 23.6,
9799
  "num_input_tokens_seen": 14417920000,
9800
  "step": 55000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9801
  }
9802
  ],
9803
  "logging_steps": 50,
9804
  "max_steps": 60000,
9805
- "num_input_tokens_seen": 14417920000,
9806
  "num_train_epochs": 1,
9807
  "save_steps": 1000,
9808
  "stateful_callbacks": {
@@ -9817,7 +9995,7 @@
9817
  "attributes": {}
9818
  }
9819
  },
9820
- "total_flos": 3.8569343188992e+18,
9821
  "train_batch_size": 64,
9822
  "trial_name": null,
9823
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.37668393695252606,
6
  "eval_steps": 500,
7
+ "global_step": 56000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9798
  "eval_steps_per_second": 23.6,
9799
  "num_input_tokens_seen": 14417920000,
9800
  "step": 55000
9801
+ },
9802
+ {
9803
+ "epoch": 0.3702937630220814,
9804
+ "grad_norm": 0.2545956075191498,
9805
+ "learning_rate": 0.0009263200821770461,
9806
+ "loss": 3.0397,
9807
+ "num_input_tokens_seen": 14431027200,
9808
+ "step": 55050
9809
+ },
9810
+ {
9811
+ "epoch": 0.370630087965789,
9812
+ "grad_norm": 0.26363736391067505,
9813
+ "learning_rate": 0.0009193352839727121,
9814
+ "loss": 3.0554,
9815
+ "num_input_tokens_seen": 14444134400,
9816
+ "step": 55100
9817
+ },
9818
+ {
9819
+ "epoch": 0.3709664129094966,
9820
+ "grad_norm": 0.2228112667798996,
9821
+ "learning_rate": 0.0009120630943110077,
9822
+ "loss": 3.0482,
9823
+ "num_input_tokens_seen": 14457241600,
9824
+ "step": 55150
9825
+ },
9826
+ {
9827
+ "epoch": 0.3713027378532043,
9828
+ "grad_norm": 0.2184106856584549,
9829
+ "learning_rate": 0.0009045084971874737,
9830
+ "loss": 3.0368,
9831
+ "num_input_tokens_seen": 14470348800,
9832
+ "step": 55200
9833
+ },
9834
+ {
9835
+ "epoch": 0.3716390627969119,
9836
+ "grad_norm": 0.5658212900161743,
9837
+ "learning_rate": 0.0008966766701456176,
9838
+ "loss": 3.0541,
9839
+ "num_input_tokens_seen": 14483456000,
9840
+ "step": 55250
9841
+ },
9842
+ {
9843
+ "epoch": 0.3719753877406195,
9844
+ "grad_norm": 0.31839439272880554,
9845
+ "learning_rate": 0.0008885729807284854,
9846
+ "loss": 3.0516,
9847
+ "num_input_tokens_seen": 14496563200,
9848
+ "step": 55300
9849
+ },
9850
+ {
9851
+ "epoch": 0.3723117126843271,
9852
+ "grad_norm": 0.2521055042743683,
9853
+ "learning_rate": 0.0008802029828000156,
9854
+ "loss": 3.049,
9855
+ "num_input_tokens_seen": 14509670400,
9856
+ "step": 55350
9857
+ },
9858
+ {
9859
+ "epoch": 0.3726480376280347,
9860
+ "grad_norm": 0.23797062039375305,
9861
+ "learning_rate": 0.0008715724127386971,
9862
+ "loss": 3.0393,
9863
+ "num_input_tokens_seen": 14522777600,
9864
+ "step": 55400
9865
+ },
9866
+ {
9867
+ "epoch": 0.37298436257174233,
9868
+ "grad_norm": 0.26673102378845215,
9869
+ "learning_rate": 0.0008626871855061438,
9870
+ "loss": 3.0535,
9871
+ "num_input_tokens_seen": 14535884800,
9872
+ "step": 55450
9873
+ },
9874
+ {
9875
+ "epoch": 0.37332068751544994,
9876
+ "grad_norm": 0.37754055857658386,
9877
+ "learning_rate": 0.0008535533905932737,
9878
+ "loss": 3.0432,
9879
+ "num_input_tokens_seen": 14548992000,
9880
+ "step": 55500
9881
+ },
9882
+ {
9883
+ "epoch": 0.37332068751544994,
9884
+ "eval_loss": 2.9362170696258545,
9885
+ "eval_runtime": 53.4795,
9886
+ "eval_samples_per_second": 93.494,
9887
+ "eval_steps_per_second": 23.373,
9888
+ "num_input_tokens_seen": 14548992000,
9889
+ "step": 55500
9890
+ },
9891
+ {
9892
+ "epoch": 0.37365701245915756,
9893
+ "grad_norm": 0.2160724252462387,
9894
+ "learning_rate": 0.000844177287846877,
9895
+ "loss": 3.0378,
9896
+ "num_input_tokens_seen": 14562099200,
9897
+ "step": 55550
9898
+ },
9899
+ {
9900
+ "epoch": 0.37399333740286517,
9901
+ "grad_norm": 0.22323860228061676,
9902
+ "learning_rate": 0.0008345653031794292,
9903
+ "loss": 3.0419,
9904
+ "num_input_tokens_seen": 14575206400,
9905
+ "step": 55600
9906
+ },
9907
+ {
9908
+ "epoch": 0.3743296623465728,
9909
+ "grad_norm": 0.19688346982002258,
9910
+ "learning_rate": 0.0008247240241650918,
9911
+ "loss": 3.0297,
9912
+ "num_input_tokens_seen": 14588313600,
9913
+ "step": 55650
9914
+ },
9915
+ {
9916
+ "epoch": 0.3746659872902804,
9917
+ "grad_norm": 0.1972673088312149,
9918
+ "learning_rate": 0.0008146601955249188,
9919
+ "loss": 3.0405,
9920
+ "num_input_tokens_seen": 14601420800,
9921
+ "step": 55700
9922
+ },
9923
+ {
9924
+ "epoch": 0.375002312233988,
9925
+ "grad_norm": 0.44073277711868286,
9926
+ "learning_rate": 0.0008043807145043603,
9927
+ "loss": 3.0343,
9928
+ "num_input_tokens_seen": 14614528000,
9929
+ "step": 55750
9930
+ },
9931
+ {
9932
+ "epoch": 0.3753386371776956,
9933
+ "grad_norm": 0.22042399644851685,
9934
+ "learning_rate": 0.0007938926261462366,
9935
+ "loss": 3.0337,
9936
+ "num_input_tokens_seen": 14627635200,
9937
+ "step": 55800
9938
+ },
9939
+ {
9940
+ "epoch": 0.3756749621214032,
9941
+ "grad_norm": 0.2954588234424591,
9942
+ "learning_rate": 0.0007832031184624164,
9943
+ "loss": 3.0334,
9944
+ "num_input_tokens_seen": 14640742400,
9945
+ "step": 55850
9946
+ },
9947
+ {
9948
+ "epoch": 0.37601128706511083,
9949
+ "grad_norm": 0.5062097907066345,
9950
+ "learning_rate": 0.0007723195175075137,
9951
+ "loss": 3.0385,
9952
+ "num_input_tokens_seen": 14653849600,
9953
+ "step": 55900
9954
+ },
9955
+ {
9956
+ "epoch": 0.37634761200881844,
9957
+ "grad_norm": 0.30344095826148987,
9958
+ "learning_rate": 0.0007612492823579744,
9959
+ "loss": 3.04,
9960
+ "num_input_tokens_seen": 14666956800,
9961
+ "step": 55950
9962
+ },
9963
+ {
9964
+ "epoch": 0.37668393695252606,
9965
+ "grad_norm": 0.21088473498821259,
9966
+ "learning_rate": 0.00075,
9967
+ "loss": 3.0364,
9968
+ "num_input_tokens_seen": 14680064000,
9969
+ "step": 56000
9970
+ },
9971
+ {
9972
+ "epoch": 0.37668393695252606,
9973
+ "eval_loss": 2.9313743114471436,
9974
+ "eval_runtime": 53.142,
9975
+ "eval_samples_per_second": 94.088,
9976
+ "eval_steps_per_second": 23.522,
9977
+ "num_input_tokens_seen": 14680064000,
9978
+ "step": 56000
9979
  }
9980
  ],
9981
  "logging_steps": 50,
9982
  "max_steps": 60000,
9983
+ "num_input_tokens_seen": 14680064000,
9984
  "num_train_epochs": 1,
9985
  "save_steps": 1000,
9986
  "stateful_callbacks": {
 
9995
  "attributes": {}
9996
  }
9997
  },
9998
+ "total_flos": 3.92706039742464e+18,
9999
  "train_batch_size": 64,
10000
  "trial_name": null,
10001
  "trial_params": null