Azrail commited on
Commit
5be6eef
·
verified ·
1 Parent(s): 6fa2215

Training in progress, step 62000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ff3677e2a6c68c6a9bc84018c91a9abb1bcf7c14c1b566d1f4d545783476a72
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbbf7b607c85a5d696bff54af0adb9f239d76d76446306b0d75e85fb86338432
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95defb77fd9d966f9fb370451c779ea88fb6409a7bea604ae57a6a4ab86f381e
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3561f0a9213e3ac9e43eff9c9d946a42b171ff83db0a3806965305d6e1bbe28a
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8ee7735caca4437694ef1fa1c7821cadab81eb5dba9c8318224d8baee7f9384
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b604bf86b8b70beb6e4043604c61f8577f1fbe75a9d1e20249b5622ec5aa2654
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:80725391fd9590c70c1e5ba84487c80bcb26eb7012140d59e753f7bdbcc81863
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68dfbb60d9dcf18c45914087cca91dc6c214da7f11269c4a414921902f313d06
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4103164313232873,
6
  "eval_steps": 500,
7
- "global_step": 61000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -10866,11 +10866,189 @@
10866
  "eval_steps_per_second": 23.996,
10867
  "num_input_tokens_seen": 15990784000,
10868
  "step": 61000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10869
  }
10870
  ],
10871
  "logging_steps": 50,
10872
  "max_steps": 70000,
10873
- "num_input_tokens_seen": 15990784000,
10874
  "num_train_epochs": 1,
10875
  "save_steps": 1000,
10876
  "stateful_callbacks": {
@@ -10885,7 +11063,7 @@
10885
  "attributes": {}
10886
  }
10887
  },
10888
- "total_flos": 4.27769079005184e+18,
10889
  "train_batch_size": 64,
10890
  "trial_name": null,
10891
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.41704293019743954,
6
  "eval_steps": 500,
7
+ "global_step": 62000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
10866
  "eval_steps_per_second": 23.996,
10867
  "num_input_tokens_seen": 15990784000,
10868
  "step": 61000
10869
+ },
10870
+ {
10871
+ "epoch": 0.41065275626699493,
10872
+ "grad_norm": 0.3535885810852051,
10873
+ "learning_rate": 0.0005927261190557954,
10874
+ "loss": 3.0102,
10875
+ "num_input_tokens_seen": 16003891200,
10876
+ "step": 61050
10877
+ },
10878
+ {
10879
+ "epoch": 0.41098908121070254,
10880
+ "grad_norm": 0.2633107304573059,
10881
+ "learning_rate": 0.0005878981399671774,
10882
+ "loss": 3.0424,
10883
+ "num_input_tokens_seen": 16016998400,
10884
+ "step": 61100
10885
+ },
10886
+ {
10887
+ "epoch": 0.41132540615441016,
10888
+ "grad_norm": 0.3054018020629883,
10889
+ "learning_rate": 0.0005830616890728827,
10890
+ "loss": 3.0233,
10891
+ "num_input_tokens_seen": 16030105600,
10892
+ "step": 61150
10893
+ },
10894
+ {
10895
+ "epoch": 0.41166173109811777,
10896
+ "grad_norm": 0.21453993022441864,
10897
+ "learning_rate": 0.0005782172325201155,
10898
+ "loss": 3.018,
10899
+ "num_input_tokens_seen": 16043212800,
10900
+ "step": 61200
10901
+ },
10902
+ {
10903
+ "epoch": 0.4119980560418254,
10904
+ "grad_norm": 0.27815598249435425,
10905
+ "learning_rate": 0.0005733652372276809,
10906
+ "loss": 3.0254,
10907
+ "num_input_tokens_seen": 16056320000,
10908
+ "step": 61250
10909
+ },
10910
+ {
10911
+ "epoch": 0.412334380985533,
10912
+ "grad_norm": 0.20687313377857208,
10913
+ "learning_rate": 0.0005685061708409841,
10914
+ "loss": 3.0165,
10915
+ "num_input_tokens_seen": 16069427200,
10916
+ "step": 61300
10917
+ },
10918
+ {
10919
+ "epoch": 0.4126707059292406,
10920
+ "grad_norm": 0.1985252946615219,
10921
+ "learning_rate": 0.0005636405016869566,
10922
+ "loss": 3.0164,
10923
+ "num_input_tokens_seen": 16082534400,
10924
+ "step": 61350
10925
+ },
10926
+ {
10927
+ "epoch": 0.4130070308729482,
10928
+ "grad_norm": 0.26703181862831116,
10929
+ "learning_rate": 0.0005587686987289189,
10930
+ "loss": 3.0001,
10931
+ "num_input_tokens_seen": 16095641600,
10932
+ "step": 61400
10933
+ },
10934
+ {
10935
+ "epoch": 0.4133433558166558,
10936
+ "grad_norm": 0.1948036104440689,
10937
+ "learning_rate": 0.0005538912315213797,
10938
+ "loss": 3.0058,
10939
+ "num_input_tokens_seen": 16108748800,
10940
+ "step": 61450
10941
+ },
10942
+ {
10943
+ "epoch": 0.41367968076036343,
10944
+ "grad_norm": 0.20653308928012848,
10945
+ "learning_rate": 0.0005490085701647804,
10946
+ "loss": 3.0115,
10947
+ "num_input_tokens_seen": 16121856000,
10948
+ "step": 61500
10949
+ },
10950
+ {
10951
+ "epoch": 0.41367968076036343,
10952
+ "eval_loss": 2.9048781394958496,
10953
+ "eval_runtime": 53.8207,
10954
+ "eval_samples_per_second": 92.901,
10955
+ "eval_steps_per_second": 23.225,
10956
+ "num_input_tokens_seen": 16121856000,
10957
+ "step": 61500
10958
+ },
10959
+ {
10960
+ "epoch": 0.41401600570407104,
10961
+ "grad_norm": 0.19605295360088348,
10962
+ "learning_rate": 0.0005441211852601849,
10963
+ "loss": 3.0225,
10964
+ "num_input_tokens_seen": 16134963200,
10965
+ "step": 61550
10966
+ },
10967
+ {
10968
+ "epoch": 0.41435233064777865,
10969
+ "grad_norm": 0.17526155710220337,
10970
+ "learning_rate": 0.0005392295478639225,
10971
+ "loss": 3.0117,
10972
+ "num_input_tokens_seen": 16148070400,
10973
+ "step": 61600
10974
+ },
10975
+ {
10976
+ "epoch": 0.41468865559148627,
10977
+ "grad_norm": 0.17657403647899628,
10978
+ "learning_rate": 0.0005343341294421868,
10979
+ "loss": 3.0107,
10980
+ "num_input_tokens_seen": 16161177600,
10981
+ "step": 61650
10982
+ },
10983
+ {
10984
+ "epoch": 0.4150249805351939,
10985
+ "grad_norm": 0.18658681213855743,
10986
+ "learning_rate": 0.0005294354018255945,
10987
+ "loss": 3.0085,
10988
+ "num_input_tokens_seen": 16174284800,
10989
+ "step": 61700
10990
+ },
10991
+ {
10992
+ "epoch": 0.4153613054789015,
10993
+ "grad_norm": 0.24781519174575806,
10994
+ "learning_rate": 0.0005245338371637091,
10995
+ "loss": 2.9939,
10996
+ "num_input_tokens_seen": 16187392000,
10997
+ "step": 61750
10998
+ },
10999
+ {
11000
+ "epoch": 0.4156976304226091,
11001
+ "grad_norm": 0.20824941992759705,
11002
+ "learning_rate": 0.0005196299078795343,
11003
+ "loss": 3.0038,
11004
+ "num_input_tokens_seen": 16200499200,
11005
+ "step": 61800
11006
+ },
11007
+ {
11008
+ "epoch": 0.4160339553663167,
11009
+ "grad_norm": 0.38262441754341125,
11010
+ "learning_rate": 0.0005147240866239817,
11011
+ "loss": 3.0141,
11012
+ "num_input_tokens_seen": 16213606400,
11013
+ "step": 61850
11014
+ },
11015
+ {
11016
+ "epoch": 0.4163702803100243,
11017
+ "grad_norm": 0.200628861784935,
11018
+ "learning_rate": 0.0005098168462303141,
11019
+ "loss": 3.0187,
11020
+ "num_input_tokens_seen": 16226713600,
11021
+ "step": 61900
11022
+ },
11023
+ {
11024
+ "epoch": 0.41670660525373193,
11025
+ "grad_norm": 0.18858259916305542,
11026
+ "learning_rate": 0.000504908659668575,
11027
+ "loss": 3.0049,
11028
+ "num_input_tokens_seen": 16239820800,
11029
+ "step": 61950
11030
+ },
11031
+ {
11032
+ "epoch": 0.41704293019743954,
11033
+ "grad_norm": 0.19025108218193054,
11034
+ "learning_rate": 0.0005,
11035
+ "loss": 3.0079,
11036
+ "num_input_tokens_seen": 16252928000,
11037
+ "step": 62000
11038
+ },
11039
+ {
11040
+ "epoch": 0.41704293019743954,
11041
+ "eval_loss": 2.9012608528137207,
11042
+ "eval_runtime": 52.7052,
11043
+ "eval_samples_per_second": 94.867,
11044
+ "eval_steps_per_second": 23.717,
11045
+ "num_input_tokens_seen": 16252928000,
11046
+ "step": 62000
11047
  }
11048
  ],
11049
  "logging_steps": 50,
11050
  "max_steps": 70000,
11051
+ "num_input_tokens_seen": 16252928000,
11052
  "num_train_epochs": 1,
11053
  "save_steps": 1000,
11054
  "stateful_callbacks": {
 
11063
  "attributes": {}
11064
  }
11065
  },
11066
+ "total_flos": 4.34781686857728e+18,
11067
  "train_batch_size": 64,
11068
  "trial_name": null,
11069
  "trial_params": null