Umt06 commited on
Commit
1939d7b
·
verified ·
1 Parent(s): e90a3cc

Training in progress, epoch 4

Browse files
config.json CHANGED
@@ -2,12 +2,12 @@
2
  "architectures": [
3
  "BertForSequenceClassification"
4
  ],
5
- "attention_probs_dropout_prob": 0.27628785152324686,
6
  "classifier_dropout": null,
7
  "dtype": "float32",
8
  "gradient_checkpointing": false,
9
  "hidden_act": "gelu",
10
- "hidden_dropout_prob": 0.27628785152324686,
11
  "hidden_size": 768,
12
  "initializer_range": 0.02,
13
  "intermediate_size": 3072,
 
2
  "architectures": [
3
  "BertForSequenceClassification"
4
  ],
5
+ "attention_probs_dropout_prob": 0.30716938624140044,
6
  "classifier_dropout": null,
7
  "dtype": "float32",
8
  "gradient_checkpointing": false,
9
  "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.30716938624140044,
11
  "hidden_size": 768,
12
  "initializer_range": 0.02,
13
  "intermediate_size": 3072,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ee129af0d3461cda8f43dcc1c04b86b09eae4aa5b5a7d2b62e4e4c660df4068
3
  size 437958648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee88310a0719da5fea2ba4af46040bc655bc67a15f87d4cb735b482610ab3273
3
  size 437958648
run-2/checkpoint-345/config.json CHANGED
@@ -2,12 +2,12 @@
2
  "architectures": [
3
  "BertForSequenceClassification"
4
  ],
5
- "attention_probs_dropout_prob": 0.4355887560595135,
6
  "classifier_dropout": null,
7
  "dtype": "float32",
8
  "gradient_checkpointing": false,
9
  "hidden_act": "gelu",
10
- "hidden_dropout_prob": 0.4355887560595135,
11
  "hidden_size": 768,
12
  "initializer_range": 0.02,
13
  "intermediate_size": 3072,
 
2
  "architectures": [
3
  "BertForSequenceClassification"
4
  ],
5
+ "attention_probs_dropout_prob": 0.30716938624140044,
6
  "classifier_dropout": null,
7
  "dtype": "float32",
8
  "gradient_checkpointing": false,
9
  "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.30716938624140044,
11
  "hidden_size": 768,
12
  "initializer_range": 0.02,
13
  "intermediate_size": 3072,
run-2/checkpoint-345/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a230d945b5f99fde00359bdc41eb4959cace16a3b6c2339f3c64fa30faea227a
3
  size 437958648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59015fd59426bc6b714ac44d0a06544f1151a3bf783d9e726eff1b67087653d5
3
  size 437958648
run-2/checkpoint-345/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd21559bcdc2f5d4f6cb42aa4f3e3c76f702c6afd78c74e23ad7b23f969432e9
3
  size 876038394
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32ae245dad3710d524ee497bccce9c76c9fea8b12796a581c23a9a9744e2cdac
3
  size 876038394
run-2/checkpoint-345/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4e3221eb46f601e64928f5cd3e8e92c2e49ed394afd9ba340d07e78c615ee257
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95a176f4542543fcc0daa5a5add4355639b9c7df2473d5a2a5b271a804478aef
3
  size 1064
run-2/checkpoint-345/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_global_step": 230,
3
- "best_metric": 0.696078431372549,
4
- "best_model_checkpoint": "bert-base-uncased-finetuned-mrpc\\run-2\\checkpoint-230",
5
  "epoch": 3.0,
6
  "eval_steps": 500,
7
  "global_step": 345,
@@ -11,277 +11,277 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.08724100327153762,
14
- "grad_norm": 4.4148850440979,
15
- "learning_rate": 5.3364625125575524e-05,
16
- "loss": 0.7011,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.17448200654307525,
21
- "grad_norm": 4.751035690307617,
22
- "learning_rate": 5.177639223493339e-05,
23
- "loss": 0.642,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.2617230098146129,
28
- "grad_norm": 6.830023288726807,
29
- "learning_rate": 5.018815934429126e-05,
30
- "loss": 0.6572,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.3489640130861505,
35
- "grad_norm": 6.113767147064209,
36
- "learning_rate": 4.8599926453649135e-05,
37
- "loss": 0.6311,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.4362050163576881,
42
- "grad_norm": 4.641395568847656,
43
- "learning_rate": 4.7011693563007e-05,
44
- "loss": 0.5981,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.5234460196292258,
49
- "grad_norm": 3.66355299949646,
50
- "learning_rate": 4.5423460672364874e-05,
51
- "loss": 0.5738,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.6106870229007634,
56
- "grad_norm": 4.72593879699707,
57
- "learning_rate": 4.3835227781722754e-05,
58
- "loss": 0.5866,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.697928026172301,
63
- "grad_norm": 8.480412483215332,
64
- "learning_rate": 4.224699489108062e-05,
65
- "loss": 0.6254,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.7851690294438386,
70
- "grad_norm": 4.573983192443848,
71
- "learning_rate": 4.065876200043849e-05,
72
- "loss": 0.608,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.8724100327153762,
77
- "grad_norm": 4.324202060699463,
78
- "learning_rate": 3.9070529109796365e-05,
79
- "loss": 0.6458,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.9596510359869138,
84
- "grad_norm": 4.329319000244141,
85
- "learning_rate": 3.748229621915423e-05,
86
- "loss": 0.5813,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 1.0,
91
  "eval_accuracy": 0.6838235294117647,
92
  "eval_f1": 0.8122270742358079,
93
- "eval_loss": 0.6114351749420166,
94
- "eval_runtime": 12.643,
95
- "eval_samples_per_second": 32.271,
96
- "eval_steps_per_second": 4.034,
97
  "step": 115
98
  },
99
  {
100
  "epoch": 1.043620501635769,
101
- "grad_norm": 5.093260288238525,
102
- "learning_rate": 3.5894063328512104e-05,
103
- "loss": 0.6107,
104
  "step": 120
105
  },
106
  {
107
  "epoch": 1.1308615049073065,
108
- "grad_norm": 6.72236967086792,
109
- "learning_rate": 3.430583043786998e-05,
110
- "loss": 0.5974,
111
  "step": 130
112
  },
113
  {
114
  "epoch": 1.2181025081788441,
115
- "grad_norm": 4.98067569732666,
116
- "learning_rate": 3.271759754722785e-05,
117
- "loss": 0.5637,
118
  "step": 140
119
  },
120
  {
121
  "epoch": 1.3053435114503817,
122
- "grad_norm": 4.982036590576172,
123
- "learning_rate": 3.112936465658572e-05,
124
- "loss": 0.5754,
125
  "step": 150
126
  },
127
  {
128
  "epoch": 1.3925845147219194,
129
- "grad_norm": 5.333518028259277,
130
- "learning_rate": 2.9541131765943592e-05,
131
- "loss": 0.5781,
132
  "step": 160
133
  },
134
  {
135
  "epoch": 1.479825517993457,
136
- "grad_norm": 6.1204514503479,
137
- "learning_rate": 2.795289887530146e-05,
138
- "loss": 0.5841,
139
  "step": 170
140
  },
141
  {
142
  "epoch": 1.5670665212649946,
143
- "grad_norm": 11.200117111206055,
144
- "learning_rate": 2.6364665984659334e-05,
145
- "loss": 0.5665,
146
  "step": 180
147
  },
148
  {
149
  "epoch": 1.6543075245365322,
150
- "grad_norm": 7.264375686645508,
151
- "learning_rate": 2.4776433094017207e-05,
152
- "loss": 0.6032,
153
  "step": 190
154
  },
155
  {
156
  "epoch": 1.7415485278080698,
157
- "grad_norm": 8.504103660583496,
158
- "learning_rate": 2.3188200203375077e-05,
159
- "loss": 0.5764,
160
  "step": 200
161
  },
162
  {
163
  "epoch": 1.8287895310796074,
164
- "grad_norm": 9.18921184539795,
165
- "learning_rate": 2.159996731273295e-05,
166
- "loss": 0.6016,
167
  "step": 210
168
  },
169
  {
170
  "epoch": 1.916030534351145,
171
- "grad_norm": 13.669305801391602,
172
- "learning_rate": 2.001173442209082e-05,
173
- "loss": 0.6045,
174
  "step": 220
175
  },
176
  {
177
  "epoch": 2.0,
178
- "grad_norm": 16.49704360961914,
179
- "learning_rate": 1.842350153144869e-05,
180
- "loss": 0.6168,
181
  "step": 230
182
  },
183
  {
184
  "epoch": 2.0,
185
- "eval_accuracy": 0.696078431372549,
186
- "eval_f1": 0.8181818181818182,
187
- "eval_loss": 0.5858412981033325,
188
- "eval_runtime": 12.3476,
189
- "eval_samples_per_second": 33.043,
190
- "eval_steps_per_second": 4.13,
191
  "step": 230
192
  },
193
  {
194
  "epoch": 2.087241003271538,
195
- "grad_norm": 10.109269142150879,
196
- "learning_rate": 1.6835268640806564e-05,
197
- "loss": 0.618,
198
  "step": 240
199
  },
200
  {
201
  "epoch": 2.174482006543075,
202
- "grad_norm": 9.174595832824707,
203
- "learning_rate": 1.5247035750164434e-05,
204
- "loss": 0.5813,
205
  "step": 250
206
  },
207
  {
208
  "epoch": 2.261723009814613,
209
- "grad_norm": 7.199938774108887,
210
- "learning_rate": 1.3658802859522307e-05,
211
- "loss": 0.5333,
212
  "step": 260
213
  },
214
  {
215
  "epoch": 2.3489640130861504,
216
- "grad_norm": 10.91189956665039,
217
- "learning_rate": 1.2070569968880176e-05,
218
- "loss": 0.5614,
219
  "step": 270
220
  },
221
  {
222
  "epoch": 2.4362050163576883,
223
- "grad_norm": 8.650083541870117,
224
- "learning_rate": 1.0482337078238049e-05,
225
- "loss": 0.5926,
226
  "step": 280
227
  },
228
  {
229
  "epoch": 2.5234460196292257,
230
- "grad_norm": 14.0360689163208,
231
- "learning_rate": 8.89410418759592e-06,
232
- "loss": 0.5575,
233
  "step": 290
234
  },
235
  {
236
  "epoch": 2.6106870229007635,
237
- "grad_norm": 5.1986308097839355,
238
- "learning_rate": 7.305871296953791e-06,
239
- "loss": 0.578,
240
  "step": 300
241
  },
242
  {
243
  "epoch": 2.697928026172301,
244
- "grad_norm": 6.444606304168701,
245
- "learning_rate": 5.717638406311662e-06,
246
- "loss": 0.5572,
247
  "step": 310
248
  },
249
  {
250
  "epoch": 2.7851690294438387,
251
- "grad_norm": 12.22812271118164,
252
- "learning_rate": 4.129405515669534e-06,
253
- "loss": 0.5741,
254
  "step": 320
255
  },
256
  {
257
  "epoch": 2.872410032715376,
258
- "grad_norm": 6.917333602905273,
259
- "learning_rate": 2.5411726250274058e-06,
260
- "loss": 0.5769,
261
  "step": 330
262
  },
263
  {
264
  "epoch": 2.959651035986914,
265
- "grad_norm": 6.114295482635498,
266
- "learning_rate": 9.529397343852771e-07,
267
- "loss": 0.564,
268
  "step": 340
269
  },
270
  {
271
  "epoch": 3.0,
272
  "eval_accuracy": 0.6887254901960784,
273
  "eval_f1": 0.8145985401459854,
274
- "eval_loss": 0.6323757767677307,
275
- "eval_runtime": 12.4955,
276
- "eval_samples_per_second": 32.652,
277
- "eval_steps_per_second": 4.081,
278
  "step": 345
279
  }
280
  ],
281
  "logging_steps": 10,
282
- "max_steps": 345,
283
  "num_input_tokens_seen": 0,
284
- "num_train_epochs": 3,
285
  "save_steps": 500,
286
  "stateful_callbacks": {
287
  "TrainerControl": {
@@ -290,7 +290,7 @@
290
  "should_evaluate": false,
291
  "should_log": false,
292
  "should_save": true,
293
- "should_training_stop": true
294
  },
295
  "attributes": {}
296
  }
@@ -299,9 +299,9 @@
299
  "train_batch_size": 4,
300
  "trial_name": null,
301
  "trial_params": {
302
- "hidden_dropout_prob": 0.4355887560595135,
303
- "learning_rate": 5.4794034727153435e-05,
304
- "num_train_epochs": 3,
305
  "per_device_train_batch_size": 4
306
  }
307
  }
 
1
  {
2
+ "best_global_step": 345,
3
+ "best_metric": 0.6887254901960784,
4
+ "best_model_checkpoint": "bert-base-uncased-finetuned-mrpc\\run-2\\checkpoint-345",
5
  "epoch": 3.0,
6
  "eval_steps": 500,
7
  "global_step": 345,
 
11
  "log_history": [
12
  {
13
  "epoch": 0.08724100327153762,
14
+ "grad_norm": 2.3152477741241455,
15
+ "learning_rate": 2.396012025677338e-06,
16
+ "loss": 0.6915,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.17448200654307525,
21
+ "grad_norm": 7.123985290527344,
22
+ "learning_rate": 2.342885373223295e-06,
23
+ "loss": 0.6551,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.2617230098146129,
28
+ "grad_norm": 2.969932794570923,
29
+ "learning_rate": 2.2897587207692523e-06,
30
+ "loss": 0.6812,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.3489640130861505,
35
+ "grad_norm": 2.524810791015625,
36
+ "learning_rate": 2.236632068315209e-06,
37
+ "loss": 0.6443,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.4362050163576881,
42
+ "grad_norm": 3.7716894149780273,
43
+ "learning_rate": 2.183505415861166e-06,
44
+ "loss": 0.6306,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.5234460196292258,
49
+ "grad_norm": 6.62515926361084,
50
+ "learning_rate": 2.130378763407123e-06,
51
+ "loss": 0.619,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.6106870229007634,
56
+ "grad_norm": 4.636287689208984,
57
+ "learning_rate": 2.0772521109530803e-06,
58
+ "loss": 0.6566,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.697928026172301,
63
+ "grad_norm": 2.319833755493164,
64
+ "learning_rate": 2.024125458499037e-06,
65
+ "loss": 0.6577,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.7851690294438386,
70
+ "grad_norm": 3.791926145553589,
71
+ "learning_rate": 1.9709988060449944e-06,
72
+ "loss": 0.6468,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.8724100327153762,
77
+ "grad_norm": 4.255199432373047,
78
+ "learning_rate": 1.917872153590951e-06,
79
+ "loss": 0.663,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.9596510359869138,
84
+ "grad_norm": 2.442918300628662,
85
+ "learning_rate": 1.8647455011369084e-06,
86
+ "loss": 0.6114,
87
  "step": 110
88
  },
89
  {
90
  "epoch": 1.0,
91
  "eval_accuracy": 0.6838235294117647,
92
  "eval_f1": 0.8122270742358079,
93
+ "eval_loss": 0.6194218397140503,
94
+ "eval_runtime": 12.4471,
95
+ "eval_samples_per_second": 32.779,
96
+ "eval_steps_per_second": 4.097,
97
  "step": 115
98
  },
99
  {
100
  "epoch": 1.043620501635769,
101
+ "grad_norm": 3.5363006591796875,
102
+ "learning_rate": 1.8116188486828654e-06,
103
+ "loss": 0.6394,
104
  "step": 120
105
  },
106
  {
107
  "epoch": 1.1308615049073065,
108
+ "grad_norm": 4.324810028076172,
109
+ "learning_rate": 1.7584921962288222e-06,
110
+ "loss": 0.6445,
111
  "step": 130
112
  },
113
  {
114
  "epoch": 1.2181025081788441,
115
+ "grad_norm": 4.151200294494629,
116
+ "learning_rate": 1.7053655437747794e-06,
117
+ "loss": 0.6276,
118
  "step": 140
119
  },
120
  {
121
  "epoch": 1.3053435114503817,
122
+ "grad_norm": 2.969996690750122,
123
+ "learning_rate": 1.6522388913207364e-06,
124
+ "loss": 0.6167,
125
  "step": 150
126
  },
127
  {
128
  "epoch": 1.3925845147219194,
129
+ "grad_norm": 2.5971832275390625,
130
+ "learning_rate": 1.5991122388666934e-06,
131
+ "loss": 0.644,
132
  "step": 160
133
  },
134
  {
135
  "epoch": 1.479825517993457,
136
+ "grad_norm": 3.444840431213379,
137
+ "learning_rate": 1.5459855864126504e-06,
138
+ "loss": 0.6292,
139
  "step": 170
140
  },
141
  {
142
  "epoch": 1.5670665212649946,
143
+ "grad_norm": 4.372523784637451,
144
+ "learning_rate": 1.4928589339586077e-06,
145
+ "loss": 0.6357,
146
  "step": 180
147
  },
148
  {
149
  "epoch": 1.6543075245365322,
150
+ "grad_norm": 5.912120342254639,
151
+ "learning_rate": 1.4397322815045647e-06,
152
+ "loss": 0.6113,
153
  "step": 190
154
  },
155
  {
156
  "epoch": 1.7415485278080698,
157
+ "grad_norm": 3.5597641468048096,
158
+ "learning_rate": 1.3866056290505217e-06,
159
+ "loss": 0.6195,
160
  "step": 200
161
  },
162
  {
163
  "epoch": 1.8287895310796074,
164
+ "grad_norm": 2.4704296588897705,
165
+ "learning_rate": 1.3334789765964785e-06,
166
+ "loss": 0.6397,
167
  "step": 210
168
  },
169
  {
170
  "epoch": 1.916030534351145,
171
+ "grad_norm": 4.187549114227295,
172
+ "learning_rate": 1.2803523241424355e-06,
173
+ "loss": 0.6145,
174
  "step": 220
175
  },
176
  {
177
  "epoch": 2.0,
178
+ "grad_norm": 3.662736415863037,
179
+ "learning_rate": 1.2272256716883927e-06,
180
+ "loss": 0.6161,
181
  "step": 230
182
  },
183
  {
184
  "epoch": 2.0,
185
+ "eval_accuracy": 0.6838235294117647,
186
+ "eval_f1": 0.8122270742358079,
187
+ "eval_loss": 0.6124567985534668,
188
+ "eval_runtime": 12.2949,
189
+ "eval_samples_per_second": 33.185,
190
+ "eval_steps_per_second": 4.148,
191
  "step": 230
192
  },
193
  {
194
  "epoch": 2.087241003271538,
195
+ "grad_norm": 4.740890979766846,
196
+ "learning_rate": 1.1740990192343497e-06,
197
+ "loss": 0.6427,
198
  "step": 240
199
  },
200
  {
201
  "epoch": 2.174482006543075,
202
+ "grad_norm": 3.5964860916137695,
203
+ "learning_rate": 1.1209723667803067e-06,
204
+ "loss": 0.6071,
205
  "step": 250
206
  },
207
  {
208
  "epoch": 2.261723009814613,
209
+ "grad_norm": 2.725940465927124,
210
+ "learning_rate": 1.0678457143262637e-06,
211
+ "loss": 0.6001,
212
  "step": 260
213
  },
214
  {
215
  "epoch": 2.3489640130861504,
216
+ "grad_norm": 4.717036724090576,
217
+ "learning_rate": 1.0147190618722207e-06,
218
+ "loss": 0.6061,
219
  "step": 270
220
  },
221
  {
222
  "epoch": 2.4362050163576883,
223
+ "grad_norm": 3.381378173828125,
224
+ "learning_rate": 9.615924094181778e-07,
225
+ "loss": 0.6195,
226
  "step": 280
227
  },
228
  {
229
  "epoch": 2.5234460196292257,
230
+ "grad_norm": 3.877357244491577,
231
+ "learning_rate": 9.084657569641349e-07,
232
+ "loss": 0.5771,
233
  "step": 290
234
  },
235
  {
236
  "epoch": 2.6106870229007635,
237
+ "grad_norm": 5.829881191253662,
238
+ "learning_rate": 8.553391045100918e-07,
239
+ "loss": 0.6278,
240
  "step": 300
241
  },
242
  {
243
  "epoch": 2.697928026172301,
244
+ "grad_norm": 4.983321189880371,
245
+ "learning_rate": 8.022124520560489e-07,
246
+ "loss": 0.5921,
247
  "step": 310
248
  },
249
  {
250
  "epoch": 2.7851690294438387,
251
+ "grad_norm": 3.385566473007202,
252
+ "learning_rate": 7.490857996020059e-07,
253
+ "loss": 0.6355,
254
  "step": 320
255
  },
256
  {
257
  "epoch": 2.872410032715376,
258
+ "grad_norm": 3.0855424404144287,
259
+ "learning_rate": 6.95959147147963e-07,
260
+ "loss": 0.6111,
261
  "step": 330
262
  },
263
  {
264
  "epoch": 2.959651035986914,
265
+ "grad_norm": 3.3900763988494873,
266
+ "learning_rate": 6.428324946939199e-07,
267
+ "loss": 0.6181,
268
  "step": 340
269
  },
270
  {
271
  "epoch": 3.0,
272
  "eval_accuracy": 0.6887254901960784,
273
  "eval_f1": 0.8145985401459854,
274
+ "eval_loss": 0.6093047857284546,
275
+ "eval_runtime": 12.431,
276
+ "eval_samples_per_second": 32.821,
277
+ "eval_steps_per_second": 4.103,
278
  "step": 345
279
  }
280
  ],
281
  "logging_steps": 10,
282
+ "max_steps": 460,
283
  "num_input_tokens_seen": 0,
284
+ "num_train_epochs": 4,
285
  "save_steps": 500,
286
  "stateful_callbacks": {
287
  "TrainerControl": {
 
290
  "should_evaluate": false,
291
  "should_log": false,
292
  "should_save": true,
293
+ "should_training_stop": false
294
  },
295
  "attributes": {}
296
  }
 
299
  "train_batch_size": 4,
300
  "trial_name": null,
301
  "trial_params": {
302
+ "hidden_dropout_prob": 0.30716938624140044,
303
+ "learning_rate": 2.4438260128859767e-06,
304
+ "num_train_epochs": 4,
305
  "per_device_train_batch_size": 4
306
  }
307
  }
run-2/checkpoint-345/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7910ec94831bf816572caa453dd30c002c77ffc3b3e3cdec79d030a73f5e993
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8017655c1a55cbfc9386d425ba50c1e9b58602655bfee37869fd1e3fccbcf104
3
  size 5432
run-2/checkpoint-460/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.30716938624140044,
6
+ "classifier_dropout": null,
7
+ "dtype": "float32",
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.30716938624140044,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "problem_type": "single_label_classification",
22
+ "transformers_version": "4.57.1",
23
+ "type_vocab_size": 2,
24
+ "use_cache": true,
25
+ "vocab_size": 30522
26
+ }
run-2/checkpoint-460/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee88310a0719da5fea2ba4af46040bc655bc67a15f87d4cb735b482610ab3273
3
+ size 437958648
run-2/checkpoint-460/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3054a4e7edf4e84d5e7d24d91760a82233c33c49198264e2a2090efc62822104
3
+ size 876038394
run-2/checkpoint-460/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ac81eae5b6a91a1c2a4e96af771c2481446f42d07f79a97683469ebd698e7ca
3
+ size 14244
run-2/checkpoint-460/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4590d0c77692194fb0c6528622656f06517e46d3862041f2a68656bf5654acb7
3
+ size 1064
run-2/checkpoint-460/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
run-2/checkpoint-460/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
run-2/checkpoint-460/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
run-2/checkpoint-460/trainer_state.json ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 345,
3
+ "best_metric": 0.6887254901960784,
4
+ "best_model_checkpoint": "bert-base-uncased-finetuned-mrpc\\run-2\\checkpoint-345",
5
+ "epoch": 4.0,
6
+ "eval_steps": 500,
7
+ "global_step": 460,
8
+ "is_hyper_param_search": true,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.08724100327153762,
14
+ "grad_norm": 2.3152477741241455,
15
+ "learning_rate": 2.396012025677338e-06,
16
+ "loss": 0.6915,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.17448200654307525,
21
+ "grad_norm": 7.123985290527344,
22
+ "learning_rate": 2.342885373223295e-06,
23
+ "loss": 0.6551,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.2617230098146129,
28
+ "grad_norm": 2.969932794570923,
29
+ "learning_rate": 2.2897587207692523e-06,
30
+ "loss": 0.6812,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.3489640130861505,
35
+ "grad_norm": 2.524810791015625,
36
+ "learning_rate": 2.236632068315209e-06,
37
+ "loss": 0.6443,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.4362050163576881,
42
+ "grad_norm": 3.7716894149780273,
43
+ "learning_rate": 2.183505415861166e-06,
44
+ "loss": 0.6306,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.5234460196292258,
49
+ "grad_norm": 6.62515926361084,
50
+ "learning_rate": 2.130378763407123e-06,
51
+ "loss": 0.619,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.6106870229007634,
56
+ "grad_norm": 4.636287689208984,
57
+ "learning_rate": 2.0772521109530803e-06,
58
+ "loss": 0.6566,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.697928026172301,
63
+ "grad_norm": 2.319833755493164,
64
+ "learning_rate": 2.024125458499037e-06,
65
+ "loss": 0.6577,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.7851690294438386,
70
+ "grad_norm": 3.791926145553589,
71
+ "learning_rate": 1.9709988060449944e-06,
72
+ "loss": 0.6468,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.8724100327153762,
77
+ "grad_norm": 4.255199432373047,
78
+ "learning_rate": 1.917872153590951e-06,
79
+ "loss": 0.663,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.9596510359869138,
84
+ "grad_norm": 2.442918300628662,
85
+ "learning_rate": 1.8647455011369084e-06,
86
+ "loss": 0.6114,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 1.0,
91
+ "eval_accuracy": 0.6838235294117647,
92
+ "eval_f1": 0.8122270742358079,
93
+ "eval_loss": 0.6194218397140503,
94
+ "eval_runtime": 12.4471,
95
+ "eval_samples_per_second": 32.779,
96
+ "eval_steps_per_second": 4.097,
97
+ "step": 115
98
+ },
99
+ {
100
+ "epoch": 1.043620501635769,
101
+ "grad_norm": 3.5363006591796875,
102
+ "learning_rate": 1.8116188486828654e-06,
103
+ "loss": 0.6394,
104
+ "step": 120
105
+ },
106
+ {
107
+ "epoch": 1.1308615049073065,
108
+ "grad_norm": 4.324810028076172,
109
+ "learning_rate": 1.7584921962288222e-06,
110
+ "loss": 0.6445,
111
+ "step": 130
112
+ },
113
+ {
114
+ "epoch": 1.2181025081788441,
115
+ "grad_norm": 4.151200294494629,
116
+ "learning_rate": 1.7053655437747794e-06,
117
+ "loss": 0.6276,
118
+ "step": 140
119
+ },
120
+ {
121
+ "epoch": 1.3053435114503817,
122
+ "grad_norm": 2.969996690750122,
123
+ "learning_rate": 1.6522388913207364e-06,
124
+ "loss": 0.6167,
125
+ "step": 150
126
+ },
127
+ {
128
+ "epoch": 1.3925845147219194,
129
+ "grad_norm": 2.5971832275390625,
130
+ "learning_rate": 1.5991122388666934e-06,
131
+ "loss": 0.644,
132
+ "step": 160
133
+ },
134
+ {
135
+ "epoch": 1.479825517993457,
136
+ "grad_norm": 3.444840431213379,
137
+ "learning_rate": 1.5459855864126504e-06,
138
+ "loss": 0.6292,
139
+ "step": 170
140
+ },
141
+ {
142
+ "epoch": 1.5670665212649946,
143
+ "grad_norm": 4.372523784637451,
144
+ "learning_rate": 1.4928589339586077e-06,
145
+ "loss": 0.6357,
146
+ "step": 180
147
+ },
148
+ {
149
+ "epoch": 1.6543075245365322,
150
+ "grad_norm": 5.912120342254639,
151
+ "learning_rate": 1.4397322815045647e-06,
152
+ "loss": 0.6113,
153
+ "step": 190
154
+ },
155
+ {
156
+ "epoch": 1.7415485278080698,
157
+ "grad_norm": 3.5597641468048096,
158
+ "learning_rate": 1.3866056290505217e-06,
159
+ "loss": 0.6195,
160
+ "step": 200
161
+ },
162
+ {
163
+ "epoch": 1.8287895310796074,
164
+ "grad_norm": 2.4704296588897705,
165
+ "learning_rate": 1.3334789765964785e-06,
166
+ "loss": 0.6397,
167
+ "step": 210
168
+ },
169
+ {
170
+ "epoch": 1.916030534351145,
171
+ "grad_norm": 4.187549114227295,
172
+ "learning_rate": 1.2803523241424355e-06,
173
+ "loss": 0.6145,
174
+ "step": 220
175
+ },
176
+ {
177
+ "epoch": 2.0,
178
+ "grad_norm": 3.662736415863037,
179
+ "learning_rate": 1.2272256716883927e-06,
180
+ "loss": 0.6161,
181
+ "step": 230
182
+ },
183
+ {
184
+ "epoch": 2.0,
185
+ "eval_accuracy": 0.6838235294117647,
186
+ "eval_f1": 0.8122270742358079,
187
+ "eval_loss": 0.6124567985534668,
188
+ "eval_runtime": 12.2949,
189
+ "eval_samples_per_second": 33.185,
190
+ "eval_steps_per_second": 4.148,
191
+ "step": 230
192
+ },
193
+ {
194
+ "epoch": 2.087241003271538,
195
+ "grad_norm": 4.740890979766846,
196
+ "learning_rate": 1.1740990192343497e-06,
197
+ "loss": 0.6427,
198
+ "step": 240
199
+ },
200
+ {
201
+ "epoch": 2.174482006543075,
202
+ "grad_norm": 3.5964860916137695,
203
+ "learning_rate": 1.1209723667803067e-06,
204
+ "loss": 0.6071,
205
+ "step": 250
206
+ },
207
+ {
208
+ "epoch": 2.261723009814613,
209
+ "grad_norm": 2.725940465927124,
210
+ "learning_rate": 1.0678457143262637e-06,
211
+ "loss": 0.6001,
212
+ "step": 260
213
+ },
214
+ {
215
+ "epoch": 2.3489640130861504,
216
+ "grad_norm": 4.717036724090576,
217
+ "learning_rate": 1.0147190618722207e-06,
218
+ "loss": 0.6061,
219
+ "step": 270
220
+ },
221
+ {
222
+ "epoch": 2.4362050163576883,
223
+ "grad_norm": 3.381378173828125,
224
+ "learning_rate": 9.615924094181778e-07,
225
+ "loss": 0.6195,
226
+ "step": 280
227
+ },
228
+ {
229
+ "epoch": 2.5234460196292257,
230
+ "grad_norm": 3.877357244491577,
231
+ "learning_rate": 9.084657569641349e-07,
232
+ "loss": 0.5771,
233
+ "step": 290
234
+ },
235
+ {
236
+ "epoch": 2.6106870229007635,
237
+ "grad_norm": 5.829881191253662,
238
+ "learning_rate": 8.553391045100918e-07,
239
+ "loss": 0.6278,
240
+ "step": 300
241
+ },
242
+ {
243
+ "epoch": 2.697928026172301,
244
+ "grad_norm": 4.983321189880371,
245
+ "learning_rate": 8.022124520560489e-07,
246
+ "loss": 0.5921,
247
+ "step": 310
248
+ },
249
+ {
250
+ "epoch": 2.7851690294438387,
251
+ "grad_norm": 3.385566473007202,
252
+ "learning_rate": 7.490857996020059e-07,
253
+ "loss": 0.6355,
254
+ "step": 320
255
+ },
256
+ {
257
+ "epoch": 2.872410032715376,
258
+ "grad_norm": 3.0855424404144287,
259
+ "learning_rate": 6.95959147147963e-07,
260
+ "loss": 0.6111,
261
+ "step": 330
262
+ },
263
+ {
264
+ "epoch": 2.959651035986914,
265
+ "grad_norm": 3.3900763988494873,
266
+ "learning_rate": 6.428324946939199e-07,
267
+ "loss": 0.6181,
268
+ "step": 340
269
+ },
270
+ {
271
+ "epoch": 3.0,
272
+ "eval_accuracy": 0.6887254901960784,
273
+ "eval_f1": 0.8145985401459854,
274
+ "eval_loss": 0.6093047857284546,
275
+ "eval_runtime": 12.431,
276
+ "eval_samples_per_second": 32.821,
277
+ "eval_steps_per_second": 4.103,
278
+ "step": 345
279
+ },
280
+ {
281
+ "epoch": 3.0436205016357687,
282
+ "grad_norm": 3.486827850341797,
283
+ "learning_rate": 5.897058422398769e-07,
284
+ "loss": 0.561,
285
+ "step": 350
286
+ },
287
+ {
288
+ "epoch": 3.1308615049073065,
289
+ "grad_norm": 3.283555507659912,
290
+ "learning_rate": 5.36579189785834e-07,
291
+ "loss": 0.6097,
292
+ "step": 360
293
+ },
294
+ {
295
+ "epoch": 3.218102508178844,
296
+ "grad_norm": 4.11808443069458,
297
+ "learning_rate": 4.834525373317911e-07,
298
+ "loss": 0.5979,
299
+ "step": 370
300
+ },
301
+ {
302
+ "epoch": 3.3053435114503817,
303
+ "grad_norm": 2.6555957794189453,
304
+ "learning_rate": 4.3032588487774807e-07,
305
+ "loss": 0.5943,
306
+ "step": 380
307
+ },
308
+ {
309
+ "epoch": 3.392584514721919,
310
+ "grad_norm": 2.754282236099243,
311
+ "learning_rate": 3.771992324237051e-07,
312
+ "loss": 0.5948,
313
+ "step": 390
314
+ },
315
+ {
316
+ "epoch": 3.479825517993457,
317
+ "grad_norm": 3.611149787902832,
318
+ "learning_rate": 3.2407257996966214e-07,
319
+ "loss": 0.5928,
320
+ "step": 400
321
+ },
322
+ {
323
+ "epoch": 3.5670665212649943,
324
+ "grad_norm": 4.274903297424316,
325
+ "learning_rate": 2.7094592751561915e-07,
326
+ "loss": 0.5866,
327
+ "step": 410
328
+ },
329
+ {
330
+ "epoch": 3.654307524536532,
331
+ "grad_norm": 3.5177125930786133,
332
+ "learning_rate": 2.178192750615762e-07,
333
+ "loss": 0.6009,
334
+ "step": 420
335
+ },
336
+ {
337
+ "epoch": 3.74154852780807,
338
+ "grad_norm": 3.027575969696045,
339
+ "learning_rate": 1.646926226075332e-07,
340
+ "loss": 0.6174,
341
+ "step": 430
342
+ },
343
+ {
344
+ "epoch": 3.8287895310796074,
345
+ "grad_norm": 2.8280646800994873,
346
+ "learning_rate": 1.1156597015349024e-07,
347
+ "loss": 0.631,
348
+ "step": 440
349
+ },
350
+ {
351
+ "epoch": 3.916030534351145,
352
+ "grad_norm": 3.2580528259277344,
353
+ "learning_rate": 5.843931769944727e-08,
354
+ "loss": 0.6025,
355
+ "step": 450
356
+ },
357
+ {
358
+ "epoch": 4.0,
359
+ "grad_norm": 3.878319501876831,
360
+ "learning_rate": 5.312665245404297e-09,
361
+ "loss": 0.6509,
362
+ "step": 460
363
+ },
364
+ {
365
+ "epoch": 4.0,
366
+ "eval_accuracy": 0.6887254901960784,
367
+ "eval_f1": 0.8145985401459854,
368
+ "eval_loss": 0.6089843511581421,
369
+ "eval_runtime": 12.3068,
370
+ "eval_samples_per_second": 33.153,
371
+ "eval_steps_per_second": 4.144,
372
+ "step": 460
373
+ }
374
+ ],
375
+ "logging_steps": 10,
376
+ "max_steps": 460,
377
+ "num_input_tokens_seen": 0,
378
+ "num_train_epochs": 4,
379
+ "save_steps": 500,
380
+ "stateful_callbacks": {
381
+ "TrainerControl": {
382
+ "args": {
383
+ "should_epoch_stop": false,
384
+ "should_evaluate": false,
385
+ "should_log": false,
386
+ "should_save": true,
387
+ "should_training_stop": true
388
+ },
389
+ "attributes": {}
390
+ }
391
+ },
392
+ "total_flos": 965091351060480.0,
393
+ "train_batch_size": 4,
394
+ "trial_name": null,
395
+ "trial_params": {
396
+ "hidden_dropout_prob": 0.30716938624140044,
397
+ "learning_rate": 2.4438260128859767e-06,
398
+ "num_train_epochs": 4,
399
+ "per_device_train_batch_size": 4
400
+ }
401
+ }
run-2/checkpoint-460/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8017655c1a55cbfc9386d425ba50c1e9b58602655bfee37869fd1e3fccbcf104
3
+ size 5432
run-2/checkpoint-460/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41a153127cb706537432ced8326839888209aadcf2a2673718dc223a43a0dd31
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8017655c1a55cbfc9386d425ba50c1e9b58602655bfee37869fd1e3fccbcf104
3
  size 5432