Azrail commited on
Commit
c5957cb
·
verified ·
1 Parent(s): 1d46f77

Training in progress, step 124000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cef5b67a6a8ef1b7b03d42987cf14119de3a2a743fc8652bcc28538e2c6f502f
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6da59dcdd189ba50995bfaca6dfb3c1f07cec1d39f2b04e6b589b61aa33bf008
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03925e5e99d9cbfffe2f6300cf8385c7fca65c8ed5a96f6e0b64b1da83665e80
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0a4aebfce2ae0e56c21f66beb3519294df5637c5928eb84133802b9a02f01ec
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:057702d02e4981608a0b19960ab61ff20cc438831297a4986309cdb565b1c450
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98f0ee0fd151b13dc8525e6639746bb04660a2a355f86970459a4f08c593ef0a
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e98c7489b04ae19323aa5fe9264a9e2511b478d8f623351ee3b05babc6a227f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4c196ff888110afc03a5fac8e049987b043db46c6b51b50b9a63aa8569f2b7f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.1734285748357323,
6
  "eval_steps": 500,
7
- "global_step": 123000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -21902,11 +21902,189 @@
21902
  "eval_steps_per_second": 15.061,
21903
  "num_input_tokens_seen": 64477051392,
21904
  "step": 123000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21905
  }
21906
  ],
21907
  "logging_steps": 50,
21908
  "max_steps": 140000,
21909
- "num_input_tokens_seen": 64477051392,
21910
  "num_train_epochs": 2,
21911
  "save_steps": 1000,
21912
  "stateful_callbacks": {
@@ -21921,7 +22099,7 @@
21921
  "attributes": {}
21922
  }
21923
  },
21924
- "total_flos": 1.141126203496661e+20,
21925
  "train_batch_size": 32,
21926
  "trial_name": null,
21927
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.1829686251594977,
6
  "eval_steps": 500,
7
+ "global_step": 124000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
21902
  "eval_steps_per_second": 15.061,
21903
  "num_input_tokens_seen": 64477051392,
21904
  "step": 123000
21905
+ },
21906
+ {
21907
+ "epoch": 1.1739055773519205,
21908
+ "grad_norm": 0.14069771766662598,
21909
+ "learning_rate": 0.0006624893596897613,
21910
+ "loss": 2.0767,
21911
+ "num_input_tokens_seen": 64503259872,
21912
+ "step": 123050
21913
+ },
21914
+ {
21915
+ "epoch": 1.174382579868109,
21916
+ "grad_norm": 0.14180107414722443,
21917
+ "learning_rate": 0.0006598340745578908,
21918
+ "loss": 2.0611,
21919
+ "num_input_tokens_seen": 64529460896,
21920
+ "step": 123100
21921
+ },
21922
+ {
21923
+ "epoch": 1.174859582384297,
21924
+ "grad_norm": 0.14584094285964966,
21925
+ "learning_rate": 0.000657173759148761,
21926
+ "loss": 2.0693,
21927
+ "num_input_tokens_seen": 64555675296,
21928
+ "step": 123150
21929
+ },
21930
+ {
21931
+ "epoch": 1.1753365849004853,
21932
+ "grad_norm": 0.1269799768924713,
21933
+ "learning_rate": 0.0006545084971874737,
21934
+ "loss": 2.0615,
21935
+ "num_input_tokens_seen": 64581882720,
21936
+ "step": 123200
21937
+ },
21938
+ {
21939
+ "epoch": 1.1758135874166737,
21940
+ "grad_norm": 0.15073458850383759,
21941
+ "learning_rate": 0.0006518383725548074,
21942
+ "loss": 2.083,
21943
+ "num_input_tokens_seen": 64608088736,
21944
+ "step": 123250
21945
+ },
21946
+ {
21947
+ "epoch": 1.176290589932862,
21948
+ "grad_norm": 0.12902715802192688,
21949
+ "learning_rate": 0.000649163469284578,
21950
+ "loss": 2.0579,
21951
+ "num_input_tokens_seen": 64634299936,
21952
+ "step": 123300
21953
+ },
21954
+ {
21955
+ "epoch": 1.1767675924490502,
21956
+ "grad_norm": 0.13666096329689026,
21957
+ "learning_rate": 0.0006464838715609945,
21958
+ "loss": 2.0673,
21959
+ "num_input_tokens_seen": 64660511904,
21960
+ "step": 123350
21961
+ },
21962
+ {
21963
+ "epoch": 1.1772445949652384,
21964
+ "grad_norm": 0.13477379083633423,
21965
+ "learning_rate": 0.0006437996637160086,
21966
+ "loss": 2.0752,
21967
+ "num_input_tokens_seen": 64686718272,
21968
+ "step": 123400
21969
+ },
21970
+ {
21971
+ "epoch": 1.1777215974814268,
21972
+ "grad_norm": 0.13596594333648682,
21973
+ "learning_rate": 0.0006411109302266615,
21974
+ "loss": 2.0606,
21975
+ "num_input_tokens_seen": 64712932256,
21976
+ "step": 123450
21977
+ },
21978
+ {
21979
+ "epoch": 1.178198599997615,
21980
+ "grad_norm": 0.1400011032819748,
21981
+ "learning_rate": 0.0006384177557124247,
21982
+ "loss": 2.066,
21983
+ "num_input_tokens_seen": 64739145440,
21984
+ "step": 123500
21985
+ },
21986
+ {
21987
+ "epoch": 1.178198599997615,
21988
+ "eval_loss": 1.986546516418457,
21989
+ "eval_runtime": 82.7963,
21990
+ "eval_samples_per_second": 60.389,
21991
+ "eval_steps_per_second": 15.097,
21992
+ "num_input_tokens_seen": 64739145440,
21993
+ "step": 123500
21994
+ },
21995
+ {
21996
+ "epoch": 1.1786756025138032,
21997
+ "grad_norm": 0.13023069500923157,
21998
+ "learning_rate": 0.0006357202249325371,
21999
+ "loss": 2.0727,
22000
+ "num_input_tokens_seen": 64765359840,
22001
+ "step": 123550
22002
+ },
22003
+ {
22004
+ "epoch": 1.1791526050299916,
22005
+ "grad_norm": 0.13744056224822998,
22006
+ "learning_rate": 0.0006330184227833376,
22007
+ "loss": 2.0603,
22008
+ "num_input_tokens_seen": 64791573504,
22009
+ "step": 123600
22010
+ },
22011
+ {
22012
+ "epoch": 1.1796296075461798,
22013
+ "grad_norm": 0.1399419903755188,
22014
+ "learning_rate": 0.0006303124342955927,
22015
+ "loss": 2.0699,
22016
+ "num_input_tokens_seen": 64817787904,
22017
+ "step": 123650
22018
+ },
22019
+ {
22020
+ "epoch": 1.180106610062368,
22021
+ "grad_norm": 0.13453304767608643,
22022
+ "learning_rate": 0.0006276023446318213,
22023
+ "loss": 2.0764,
22024
+ "num_input_tokens_seen": 64844002304,
22025
+ "step": 123700
22026
+ },
22027
+ {
22028
+ "epoch": 1.1805836125785563,
22029
+ "grad_norm": 0.13495005667209625,
22030
+ "learning_rate": 0.0006248882390836135,
22031
+ "loss": 2.0629,
22032
+ "num_input_tokens_seen": 64870216704,
22033
+ "step": 123750
22034
+ },
22035
+ {
22036
+ "epoch": 1.1810606150947447,
22037
+ "grad_norm": 0.14330346882343292,
22038
+ "learning_rate": 0.000622170203068947,
22039
+ "loss": 2.0677,
22040
+ "num_input_tokens_seen": 64896426784,
22041
+ "step": 123800
22042
+ },
22043
+ {
22044
+ "epoch": 1.181537617610933,
22045
+ "grad_norm": 0.13179130852222443,
22046
+ "learning_rate": 0.0006194483221294988,
22047
+ "loss": 2.0568,
22048
+ "num_input_tokens_seen": 64922636000,
22049
+ "step": 123850
22050
+ },
22051
+ {
22052
+ "epoch": 1.182014620127121,
22053
+ "grad_norm": 0.12518762052059174,
22054
+ "learning_rate": 0.0006167226819279528,
22055
+ "loss": 2.0604,
22056
+ "num_input_tokens_seen": 64948840416,
22057
+ "step": 123900
22058
+ },
22059
+ {
22060
+ "epoch": 1.1824916226433095,
22061
+ "grad_norm": 0.12823528051376343,
22062
+ "learning_rate": 0.0006139933682453035,
22063
+ "loss": 2.0683,
22064
+ "num_input_tokens_seen": 64975054816,
22065
+ "step": 123950
22066
+ },
22067
+ {
22068
+ "epoch": 1.1829686251594977,
22069
+ "grad_norm": 0.1308305859565735,
22070
+ "learning_rate": 0.0006112604669781572,
22071
+ "loss": 2.0639,
22072
+ "num_input_tokens_seen": 65001257824,
22073
+ "step": 124000
22074
+ },
22075
+ {
22076
+ "epoch": 1.1829686251594977,
22077
+ "eval_loss": 1.9843353033065796,
22078
+ "eval_runtime": 82.7751,
22079
+ "eval_samples_per_second": 60.405,
22080
+ "eval_steps_per_second": 15.101,
22081
+ "num_input_tokens_seen": 65001257824,
22082
+ "step": 124000
22083
  }
22084
  ],
22085
  "logging_steps": 50,
22086
  "max_steps": 140000,
22087
+ "num_input_tokens_seen": 65001257824,
22088
  "num_train_epochs": 2,
22089
  "save_steps": 1000,
22090
  "stateful_callbacks": {
 
22099
  "attributes": {}
22100
  }
22101
  },
22102
+ "total_flos": 1.150403701190529e+20,
22103
  "train_batch_size": 32,
22104
  "trial_name": null,
22105
  "trial_params": null