mgh6 committed on
Commit
5f54607
·
verified ·
1 Parent(s): 1732277

Training in progress, step 100, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -3,12 +3,12 @@
3
  "architectures": [
4
  "EsmForMaskedLM"
5
  ],
6
- "attention_probs_dropout_prob": 0.01,
7
  "classifier_dropout": null,
8
  "emb_layer_norm_before": false,
9
  "esmfold_config": null,
10
  "hidden_act": "gelu",
11
- "hidden_dropout_prob": 0.01,
12
  "hidden_size": 1280,
13
  "initializer_range": 0.02,
14
  "intermediate_size": 5120,
 
3
  "architectures": [
4
  "EsmForMaskedLM"
5
  ],
6
+ "attention_probs_dropout_prob": 0.1,
7
  "classifier_dropout": null,
8
  "emb_layer_norm_before": false,
9
  "esmfold_config": null,
10
  "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
  "hidden_size": 1280,
13
  "initializer_range": 0.02,
14
  "intermediate_size": 5120,
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:48475f083a5c2dd20122d335f8f1642d2b1704a4a54b55ecc266741d8af4decc
3
  size 2611614300
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a054bde49e1eaba391534cd5d6084e672a214d2bcdb35339a02e7f60b250444a
3
  size 2611614300
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd01a2371483ae6f6e5acd61fd57cc71db67bf7044ba7e1b6b06f33fe8ab5258
3
  size 5213028466
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88cdb9d1903a098ab709a4a31d88f0bc508b9856da309433ddbe56564fc7e4c6
3
  size 5213028466
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5c2bb2c73c38e1a76a17a11eb153818930706a055ece3df18411968d5d527a9
3
- size 14942
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5980b31e2575fd555b9b5d35d9fc68ab477ebc20a6cd235cb9c30ed6d640a78
3
+ size 15006
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd7d798f6dc160785a8da7f98e23574f8db4fbcce75b7717e46f600053dad5a1
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0001f7c15dabc5570df0e50376618b295ab961ad32150f9a463dd24a0d337e21
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,233 +1,25 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.624842568962117,
5
  "eval_steps": 500,
6
- "global_step": 2500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.024993702758484677,
13
- "grad_norm": 0.821007490158081,
14
- "learning_rate": 0.000997500624843789,
15
- "loss": 88799130419.2,
16
  "step": 100
17
- },
18
- {
19
- "epoch": 0.049987405516969355,
20
- "grad_norm": 0.7833845615386963,
21
- "learning_rate": 0.000995001249687578,
22
- "loss": 15026012815.36,
23
- "step": 200
24
- },
25
- {
26
- "epoch": 0.07498110827545404,
27
- "grad_norm": 0.7531446218490601,
28
- "learning_rate": 0.0009925018745313673,
29
- "loss": 34681771786.24,
30
- "step": 300
31
- },
32
- {
33
- "epoch": 0.09997481103393871,
34
- "grad_norm": 0.7915667295455933,
35
- "learning_rate": 0.0009900024993751563,
36
- "loss": 4205931724.8,
37
- "step": 400
38
- },
39
- {
40
- "epoch": 0.1249685137924234,
41
- "grad_norm": 0.7665759921073914,
42
- "learning_rate": 0.0009875031242189453,
43
- "loss": 7206768476.16,
44
- "step": 500
45
- },
46
- {
47
- "epoch": 0.1249685137924234,
48
- "eval_loss": NaN,
49
- "eval_runtime": 451.579,
50
- "eval_samples_per_second": 34.373,
51
- "eval_steps_per_second": 34.373,
52
- "step": 500
53
- },
54
- {
55
- "epoch": 0.14996221655090808,
56
- "grad_norm": 0.918053388595581,
57
- "learning_rate": 0.0009850037490627343,
58
- "loss": 96702698618.88,
59
- "step": 600
60
- },
61
- {
62
- "epoch": 0.17495591930939275,
63
- "grad_norm": 0.7509938478469849,
64
- "learning_rate": 0.0009825043739065233,
65
- "loss": 2674936709.12,
66
- "step": 700
67
- },
68
- {
69
- "epoch": 0.19994962206787742,
70
- "grad_norm": 0.9247342944145203,
71
- "learning_rate": 0.0009800049987503125,
72
- "loss": 24605596057.6,
73
- "step": 800
74
- },
75
- {
76
- "epoch": 0.22494332482636212,
77
- "grad_norm": 0.7432388663291931,
78
- "learning_rate": 0.0009775056235941015,
79
- "loss": 382103401267.2,
80
- "step": 900
81
- },
82
- {
83
- "epoch": 0.2499370275848468,
84
- "grad_norm": 0.6949240565299988,
85
- "learning_rate": 0.0009750062484378905,
86
- "loss": 32236791398.4,
87
- "step": 1000
88
- },
89
- {
90
- "epoch": 0.2499370275848468,
91
- "eval_loss": NaN,
92
- "eval_runtime": 424.2575,
93
- "eval_samples_per_second": 36.586,
94
- "eval_steps_per_second": 36.586,
95
- "step": 1000
96
- },
97
- {
98
- "epoch": 0.2749307303433315,
99
- "grad_norm": 0.6821365356445312,
100
- "learning_rate": 0.0009725068732816797,
101
- "loss": 26628705484.8,
102
- "step": 1100
103
- },
104
- {
105
- "epoch": 0.29992443310181616,
106
- "grad_norm": 0.6461954116821289,
107
- "learning_rate": 0.0009700074981254687,
108
- "loss": 27969317765.12,
109
- "step": 1200
110
- },
111
- {
112
- "epoch": 0.3249181358603008,
113
- "grad_norm": 0.7664767503738403,
114
- "learning_rate": 0.0009675081229692577,
115
- "loss": 64947680706.56,
116
- "step": 1300
117
- },
118
- {
119
- "epoch": 0.3499118386187855,
120
- "grad_norm": 0.7155385613441467,
121
- "learning_rate": 0.0009650087478130467,
122
- "loss": 21440355368.96,
123
- "step": 1400
124
- },
125
- {
126
- "epoch": 0.37490554137727017,
127
- "grad_norm": 0.6681867837905884,
128
- "learning_rate": 0.0009625093726568358,
129
- "loss": 25921771274.24,
130
- "step": 1500
131
- },
132
- {
133
- "epoch": 0.37490554137727017,
134
- "eval_loss": NaN,
135
- "eval_runtime": 423.4797,
136
- "eval_samples_per_second": 36.653,
137
- "eval_steps_per_second": 36.653,
138
- "step": 1500
139
- },
140
- {
141
- "epoch": 0.39989924413575484,
142
- "grad_norm": 0.6754324436187744,
143
- "learning_rate": 0.0009600099975006249,
144
- "loss": 673877977989.12,
145
- "step": 1600
146
- },
147
- {
148
- "epoch": 0.42489294689423956,
149
- "grad_norm": 0.5371769070625305,
150
- "learning_rate": 0.0009575106223444139,
151
- "loss": 445900065341.44,
152
- "step": 1700
153
- },
154
- {
155
- "epoch": 0.44988664965272424,
156
- "grad_norm": 0.750677227973938,
157
- "learning_rate": 0.000955011247188203,
158
- "loss": 35523568599.04,
159
- "step": 1800
160
- },
161
- {
162
- "epoch": 0.4748803524112089,
163
- "grad_norm": 0.7535462975502014,
164
- "learning_rate": 0.000952511872031992,
165
- "loss": 46281166684.16,
166
- "step": 1900
167
- },
168
- {
169
- "epoch": 0.4998740551696936,
170
- "grad_norm": 0.7034441828727722,
171
- "learning_rate": 0.000950012496875781,
172
- "loss": 342942380195.84,
173
- "step": 2000
174
- },
175
- {
176
- "epoch": 0.4998740551696936,
177
- "eval_loss": NaN,
178
- "eval_runtime": 423.9254,
179
- "eval_samples_per_second": 36.615,
180
- "eval_steps_per_second": 36.615,
181
- "step": 2000
182
- },
183
- {
184
- "epoch": 0.5248677579281783,
185
- "grad_norm": 0.7051777839660645,
186
- "learning_rate": 0.0009475131217195702,
187
- "loss": 3791894609.92,
188
- "step": 2100
189
- },
190
- {
191
- "epoch": 0.549861460686663,
192
- "grad_norm": 0.5590131878852844,
193
- "learning_rate": 0.0009450137465633592,
194
- "loss": 41969762959.36,
195
- "step": 2200
196
- },
197
- {
198
- "epoch": 0.5748551634451476,
199
- "grad_norm": 0.6362671256065369,
200
- "learning_rate": 0.0009425143714071483,
201
- "loss": 43926135767.04,
202
- "step": 2300
203
- },
204
- {
205
- "epoch": 0.5998488662036323,
206
- "grad_norm": 0.6453591585159302,
207
- "learning_rate": 0.0009400149962509373,
208
- "loss": 972726877552.64,
209
- "step": 2400
210
- },
211
- {
212
- "epoch": 0.624842568962117,
213
- "grad_norm": 0.6738520264625549,
214
- "learning_rate": 0.0009375156210947263,
215
- "loss": 1207760825876.48,
216
- "step": 2500
217
- },
218
- {
219
- "epoch": 0.624842568962117,
220
- "eval_loss": NaN,
221
- "eval_runtime": 423.912,
222
- "eval_samples_per_second": 36.616,
223
- "eval_steps_per_second": 36.616,
224
- "step": 2500
225
  }
226
  ],
227
  "logging_steps": 100,
228
- "max_steps": 40010,
229
  "num_input_tokens_seen": 0,
230
- "num_train_epochs": 10,
231
  "save_steps": 100,
232
  "stateful_callbacks": {
233
  "TrainerControl": {
@@ -241,7 +33,7 @@
241
  "attributes": {}
242
  }
243
  },
244
- "total_flos": 4.862199331815424e+17,
245
  "train_batch_size": 1,
246
  "trial_name": null,
247
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.024993702758484677,
5
  "eval_steps": 500,
6
+ "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.024993702758484677,
13
+ "grad_norm": 1.2550193071365356,
14
+ "learning_rate": 9.875031242189454e-05,
15
+ "loss": 89312785858.56,
16
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  }
18
  ],
19
  "logging_steps": 100,
20
+ "max_steps": 8002,
21
  "num_input_tokens_seen": 0,
22
+ "num_train_epochs": 2,
23
  "save_steps": 100,
24
  "stateful_callbacks": {
25
  "TrainerControl": {
 
33
  "attributes": {}
34
  }
35
  },
36
+ "total_flos": 1.9448797327261696e+16,
37
  "train_batch_size": 1,
38
  "trial_name": null,
39
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c699965ccc424515c6327ad161112409d54814998bcea28fb2549ae8ad3d0f20
3
  size 5176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e112b2855101d9f6ef3ac1291e6f721e5abc5c131c8763f4579e684682a8ab7
3
  size 5176