Azrail commited on
Commit
9476241
·
verified ·
1 Parent(s): 9169e0d

Training in progress, step 130000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:857ead76dd55a0ff132114f3566b2633c2c5cdde85ae73d0787d641584b91007
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67a6c7abe32dd438fb09470397d8599e18c7c6f7d6e5ad7c2ea59aa52e0c0fc9
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d51187329bb716afa734f026372750945e338e23b7c661997a4d4207a6fd698
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af9646577ee4ed03ad7c9691e7703d876a8256d338d3a2fb5035f6f80fe627b5
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:01c311980c8b0da96dd9e638e23b1e84aa50fb6a11433bc22a347279b706965b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1315ef35a655eddf08abff5aa18ec6897fdbfeff08c3f5d07895fadd41b93070
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60a157573f4024c9cf3f191281f1d04ef870f25b0126e228157b25abffaa2ebf
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8acfe6d76758b902ab66b172fa1db8b08d2d4760abe1682738a74d50eadc0c50
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.230668876778325,
6
  "eval_steps": 500,
7
- "global_step": 129000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -22970,11 +22970,189 @@
22970
  "eval_steps_per_second": 15.182,
22971
  "num_input_tokens_seen": 67622231264,
22972
  "step": 129000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22973
  }
22974
  ],
22975
  "logging_steps": 50,
22976
  "max_steps": 140000,
22977
- "num_input_tokens_seen": 67622231264,
22978
  "num_train_epochs": 2,
22979
  "save_steps": 1000,
22980
  "stateful_callbacks": {
@@ -22989,7 +23167,7 @@
22989
  "attributes": {}
22990
  }
22991
  },
22992
- "total_flos": 1.1967901504229745e+20,
22993
  "train_batch_size": 32,
22994
  "trial_name": null,
22995
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2402089271020904,
6
  "eval_steps": 500,
7
+ "global_step": 130000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
22970
  "eval_steps_per_second": 15.182,
22971
  "num_input_tokens_seen": 67622231264,
22972
  "step": 129000
22973
+ },
22974
+ {
22975
+ "epoch": 1.2311458792945134,
22976
+ "grad_norm": 0.13078469038009644,
22977
+ "learning_rate": 0.00033221549498448967,
22978
+ "loss": 2.0474,
22979
+ "num_input_tokens_seen": 67648445664,
22980
+ "step": 129050
22981
+ },
22982
+ {
22983
+ "epoch": 1.2316228818107016,
22984
+ "grad_norm": 0.1259986162185669,
22985
+ "learning_rate": 0.0003295758014387375,
22986
+ "loss": 2.0605,
22987
+ "num_input_tokens_seen": 67674654432,
22988
+ "step": 129100
22989
+ },
22990
+ {
22991
+ "epoch": 1.2320998843268898,
22992
+ "grad_norm": 0.13479039072990417,
22993
+ "learning_rate": 0.0003269414714612534,
22994
+ "loss": 2.0499,
22995
+ "num_input_tokens_seen": 67700854208,
22996
+ "step": 129150
22997
+ },
22998
+ {
22999
+ "epoch": 1.232576886843078,
23000
+ "grad_norm": 0.12382933497428894,
23001
+ "learning_rate": 0.0003243125879593286,
23002
+ "loss": 2.0403,
23003
+ "num_input_tokens_seen": 67727067232,
23004
+ "step": 129200
23005
+ },
23006
+ {
23007
+ "epoch": 1.2330538893592664,
23008
+ "grad_norm": 0.13765262067317963,
23009
+ "learning_rate": 0.0003216892336688435,
23010
+ "loss": 2.05,
23011
+ "num_input_tokens_seen": 67753274144,
23012
+ "step": 129250
23013
+ },
23014
+ {
23015
+ "epoch": 1.2335308918754546,
23016
+ "grad_norm": 0.13626757264137268,
23017
+ "learning_rate": 0.000319071491151664,
23018
+ "loss": 2.0533,
23019
+ "num_input_tokens_seen": 67779485312,
23020
+ "step": 129300
23021
+ },
23022
+ {
23023
+ "epoch": 1.2340078943916428,
23024
+ "grad_norm": 0.13541923463344574,
23025
+ "learning_rate": 0.00031645944279304295,
23026
+ "loss": 2.0502,
23027
+ "num_input_tokens_seen": 67805697216,
23028
+ "step": 129350
23029
+ },
23030
+ {
23031
+ "epoch": 1.2344848969078313,
23032
+ "grad_norm": 0.12669889628887177,
23033
+ "learning_rate": 0.00031385317079902743,
23034
+ "loss": 2.0434,
23035
+ "num_input_tokens_seen": 67831908160,
23036
+ "step": 129400
23037
+ },
23038
+ {
23039
+ "epoch": 1.2349618994240195,
23040
+ "grad_norm": 0.12400075793266296,
23041
+ "learning_rate": 0.0003112527571938717,
23042
+ "loss": 2.0556,
23043
+ "num_input_tokens_seen": 67858116736,
23044
+ "step": 129450
23045
+ },
23046
+ {
23047
+ "epoch": 1.2354389019402077,
23048
+ "grad_norm": 0.13263045251369476,
23049
+ "learning_rate": 0.0003086582838174551,
23050
+ "loss": 2.0405,
23051
+ "num_input_tokens_seen": 67884327168,
23052
+ "step": 129500
23053
+ },
23054
+ {
23055
+ "epoch": 1.2354389019402077,
23056
+ "eval_loss": 1.966764211654663,
23057
+ "eval_runtime": 82.4836,
23058
+ "eval_samples_per_second": 60.618,
23059
+ "eval_steps_per_second": 15.155,
23060
+ "num_input_tokens_seen": 67884327168,
23061
+ "step": 129500
23062
+ },
23063
+ {
23064
+ "epoch": 1.235915904456396,
23065
+ "grad_norm": 0.12067709863185883,
23066
+ "learning_rate": 0.00030606983232270746,
23067
+ "loss": 2.0511,
23068
+ "num_input_tokens_seen": 67910538880,
23069
+ "step": 129550
23070
+ },
23071
+ {
23072
+ "epoch": 1.2363929069725843,
23073
+ "grad_norm": 0.13021409511566162,
23074
+ "learning_rate": 0.0003034874841730382,
23075
+ "loss": 2.0525,
23076
+ "num_input_tokens_seen": 67936753280,
23077
+ "step": 129600
23078
+ },
23079
+ {
23080
+ "epoch": 1.2368699094887725,
23081
+ "grad_norm": 0.12661676108837128,
23082
+ "learning_rate": 0.0003009113206397734,
23083
+ "loss": 2.0575,
23084
+ "num_input_tokens_seen": 67962958784,
23085
+ "step": 129650
23086
+ },
23087
+ {
23088
+ "epoch": 1.237346912004961,
23089
+ "grad_norm": 0.12730489671230316,
23090
+ "learning_rate": 0.0002983414227995975,
23091
+ "loss": 2.0552,
23092
+ "num_input_tokens_seen": 67989169536,
23093
+ "step": 129700
23094
+ },
23095
+ {
23096
+ "epoch": 1.2378239145211491,
23097
+ "grad_norm": 0.12583428621292114,
23098
+ "learning_rate": 0.000295777871532002,
23099
+ "loss": 2.0413,
23100
+ "num_input_tokens_seen": 68015382560,
23101
+ "step": 129750
23102
+ },
23103
+ {
23104
+ "epoch": 1.2383009170373374,
23105
+ "grad_norm": 0.12833881378173828,
23106
+ "learning_rate": 0.00029322074751673977,
23107
+ "loss": 2.0456,
23108
+ "num_input_tokens_seen": 68041596960,
23109
+ "step": 129800
23110
+ },
23111
+ {
23112
+ "epoch": 1.2387779195535256,
23113
+ "grad_norm": 0.1263890564441681,
23114
+ "learning_rate": 0.0002906701312312861,
23115
+ "loss": 2.0506,
23116
+ "num_input_tokens_seen": 68067805312,
23117
+ "step": 129850
23118
+ },
23119
+ {
23120
+ "epoch": 1.239254922069714,
23121
+ "grad_norm": 0.1265845000743866,
23122
+ "learning_rate": 0.0002881261029483057,
23123
+ "loss": 2.0376,
23124
+ "num_input_tokens_seen": 68094019712,
23125
+ "step": 129900
23126
+ },
23127
+ {
23128
+ "epoch": 1.2397319245859022,
23129
+ "grad_norm": 0.1379150003194809,
23130
+ "learning_rate": 0.0002855887427331267,
23131
+ "loss": 2.0482,
23132
+ "num_input_tokens_seen": 68120232192,
23133
+ "step": 129950
23134
+ },
23135
+ {
23136
+ "epoch": 1.2402089271020904,
23137
+ "grad_norm": 0.12455019354820251,
23138
+ "learning_rate": 0.00028305813044122096,
23139
+ "loss": 2.038,
23140
+ "num_input_tokens_seen": 68146442176,
23141
+ "step": 130000
23142
+ },
23143
+ {
23144
+ "epoch": 1.2402089271020904,
23145
+ "eval_loss": 1.965224266052246,
23146
+ "eval_runtime": 83.0846,
23147
+ "eval_samples_per_second": 60.18,
23148
+ "eval_steps_per_second": 15.045,
23149
+ "num_input_tokens_seen": 68146442176,
23150
+ "step": 130000
23151
  }
23152
  ],
23153
  "logging_steps": 50,
23154
  "max_steps": 140000,
23155
+ "num_input_tokens_seen": 68146442176,
23156
  "num_train_epochs": 2,
23157
  "save_steps": 1000,
23158
  "stateful_callbacks": {
 
23167
  "attributes": {}
23168
  }
23169
  },
23170
+ "total_flos": 1.206067727404671e+20,
23171
  "train_batch_size": 32,
23172
  "trial_name": null,
23173
  "trial_params": null