Ruslan10 committed on
Commit
f07eaae
·
verified ·
1 Parent(s): ca4dc18

Training in progress, epoch 1

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f222110934988e114609565e5e8aba9fc71a59a451caae54b923b9c47a443599
3
  size 437961724
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17a9d1a7c0af34b2662d912bca96b4323fc85be50bfcd854f08488170d35f605
3
  size 437961724
run-2/checkpoint-1359/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "dtype": "float32",
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "LABEL_0",
14
+ "1": "LABEL_1",
15
+ "2": "LABEL_2"
16
+ },
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 3072,
19
+ "label2id": {
20
+ "LABEL_0": 0,
21
+ "LABEL_1": 1,
22
+ "LABEL_2": 2
23
+ },
24
+ "layer_norm_eps": 1e-12,
25
+ "max_position_embeddings": 512,
26
+ "model_type": "bert",
27
+ "num_attention_heads": 12,
28
+ "num_hidden_layers": 12,
29
+ "pad_token_id": 0,
30
+ "position_embedding_type": "absolute",
31
+ "problem_type": "single_label_classification",
32
+ "transformers_version": "4.57.1",
33
+ "type_vocab_size": 2,
34
+ "use_cache": true,
35
+ "vocab_size": 30522
36
+ }
run-2/checkpoint-1359/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17a9d1a7c0af34b2662d912bca96b4323fc85be50bfcd854f08488170d35f605
3
+ size 437961724
run-2/checkpoint-1359/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d46c33389283e208e1decfa83b3515a6379f814c2ff81822e1e0b958a0c8ec9b
3
+ size 876047755
run-2/checkpoint-1359/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bf99a97e55e37df1321c689e62a0643f71031f8b594e08a09e50b16de6a73e3
3
+ size 14709
run-2/checkpoint-1359/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be3415804e0baef86602afc2f77976eabebf5e078b8f5f63b855d510c163c36b
3
+ size 1383
run-2/checkpoint-1359/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8e0bbbf384b81bbdda3c52b04df0b53dfa70b7e88d22ea4d6fa3aa455f7a45d
3
+ size 1465
run-2/checkpoint-1359/special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
run-2/checkpoint-1359/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
run-2/checkpoint-1359/tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "max_length": 256,
50
+ "model_max_length": 512,
51
+ "pad_to_multiple_of": null,
52
+ "pad_token": "[PAD]",
53
+ "pad_token_type_id": 0,
54
+ "padding_side": "right",
55
+ "sep_token": "[SEP]",
56
+ "stride": 0,
57
+ "strip_accents": null,
58
+ "tokenize_chinese_chars": true,
59
+ "tokenizer_class": "BertTokenizer",
60
+ "truncation_side": "right",
61
+ "truncation_strategy": "longest_first",
62
+ "unk_token": "[UNK]"
63
+ }
run-2/checkpoint-1359/trainer_state.json ADDED
@@ -0,0 +1,1013 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1359,
3
+ "best_metric": 0.7880794701986755,
4
+ "best_model_checkpoint": "bert-finetuned-sentiment/run-2/checkpoint-1359",
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1359,
8
+ "is_hyper_param_search": true,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.007358351729212656,
14
+ "grad_norm": 15.73861026763916,
15
+ "learning_rate": 7.661248088138319e-07,
16
+ "loss": 0.1318,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.014716703458425313,
21
+ "grad_norm": 0.6841095685958862,
22
+ "learning_rate": 1.6173745963847562e-06,
23
+ "loss": 0.2228,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.02207505518763797,
28
+ "grad_norm": 20.415630340576172,
29
+ "learning_rate": 2.46862438395568e-06,
30
+ "loss": 0.2499,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.029433406916850625,
35
+ "grad_norm": 2.7358345985412598,
36
+ "learning_rate": 3.319874171526605e-06,
37
+ "loss": 0.2024,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.03679175864606328,
42
+ "grad_norm": 6.356612205505371,
43
+ "learning_rate": 4.171123959097529e-06,
44
+ "loss": 0.1746,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.04415011037527594,
49
+ "grad_norm": 0.4042581021785736,
50
+ "learning_rate": 5.022373746668453e-06,
51
+ "loss": 0.1,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.051508462104488596,
56
+ "grad_norm": 1.1148451566696167,
57
+ "learning_rate": 5.8736235342393774e-06,
58
+ "loss": 0.1023,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.05886681383370125,
63
+ "grad_norm": 39.592811584472656,
64
+ "learning_rate": 6.724873321810302e-06,
65
+ "loss": 0.0813,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.06622516556291391,
70
+ "grad_norm": 19.681055068969727,
71
+ "learning_rate": 7.576123109381227e-06,
72
+ "loss": 0.0931,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.07358351729212656,
77
+ "grad_norm": 10.252055168151855,
78
+ "learning_rate": 8.42737289695215e-06,
79
+ "loss": 0.2333,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.08094186902133922,
84
+ "grad_norm": 33.362030029296875,
85
+ "learning_rate": 8.49814021437465e-06,
86
+ "loss": 0.1284,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.08830022075055188,
91
+ "grad_norm": 20.02435874938965,
92
+ "learning_rate": 8.482187257336215e-06,
93
+ "loss": 0.1234,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.09565857247976453,
98
+ "grad_norm": 66.22127532958984,
99
+ "learning_rate": 8.46623430029778e-06,
100
+ "loss": 0.0565,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.10301692420897719,
105
+ "grad_norm": 0.9394611120223999,
106
+ "learning_rate": 8.450281343259344e-06,
107
+ "loss": 0.0571,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.11037527593818984,
112
+ "grad_norm": 32.92816162109375,
113
+ "learning_rate": 8.434328386220908e-06,
114
+ "loss": 0.202,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.1177336276674025,
119
+ "grad_norm": 30.171003341674805,
120
+ "learning_rate": 8.418375429182472e-06,
121
+ "loss": 0.2109,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.12509197939661515,
126
+ "grad_norm": 0.9405515193939209,
127
+ "learning_rate": 8.402422472144037e-06,
128
+ "loss": 0.1541,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.13245033112582782,
133
+ "grad_norm": 13.217094421386719,
134
+ "learning_rate": 8.386469515105601e-06,
135
+ "loss": 0.206,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.13980868285504047,
140
+ "grad_norm": 0.7528594136238098,
141
+ "learning_rate": 8.370516558067165e-06,
142
+ "loss": 0.0698,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.14716703458425312,
147
+ "grad_norm": 6.840025424957275,
148
+ "learning_rate": 8.35456360102873e-06,
149
+ "loss": 0.1986,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.1545253863134658,
154
+ "grad_norm": 1.212363362312317,
155
+ "learning_rate": 8.338610643990296e-06,
156
+ "loss": 0.1699,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.16188373804267844,
161
+ "grad_norm": 3.122361183166504,
162
+ "learning_rate": 8.322657686951858e-06,
163
+ "loss": 0.1288,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.1692420897718911,
168
+ "grad_norm": 0.28292983770370483,
169
+ "learning_rate": 8.306704729913423e-06,
170
+ "loss": 0.007,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.17660044150110377,
175
+ "grad_norm": 0.11904435604810715,
176
+ "learning_rate": 8.290751772874989e-06,
177
+ "loss": 0.1291,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.18395879323031641,
182
+ "grad_norm": 62.53517532348633,
183
+ "learning_rate": 8.274798815836553e-06,
184
+ "loss": 0.0704,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.19131714495952906,
189
+ "grad_norm": 34.67837142944336,
190
+ "learning_rate": 8.258845858798116e-06,
191
+ "loss": 0.1849,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.1986754966887417,
196
+ "grad_norm": 0.05347510427236557,
197
+ "learning_rate": 8.242892901759682e-06,
198
+ "loss": 0.1584,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.20603384841795438,
203
+ "grad_norm": 15.957563400268555,
204
+ "learning_rate": 8.226939944721246e-06,
205
+ "loss": 0.1044,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.21339220014716703,
210
+ "grad_norm": 0.15385572612285614,
211
+ "learning_rate": 8.21098698768281e-06,
212
+ "loss": 0.0807,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.22075055187637968,
217
+ "grad_norm": 0.08834321796894073,
218
+ "learning_rate": 8.195034030644375e-06,
219
+ "loss": 0.1526,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.22810890360559236,
224
+ "grad_norm": 5.847803115844727,
225
+ "learning_rate": 8.179081073605939e-06,
226
+ "loss": 0.2518,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.235467255334805,
231
+ "grad_norm": 76.60889434814453,
232
+ "learning_rate": 8.163128116567503e-06,
233
+ "loss": 0.3841,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.24282560706401765,
238
+ "grad_norm": 5.798557758331299,
239
+ "learning_rate": 8.147175159529068e-06,
240
+ "loss": 0.2359,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.2501839587932303,
245
+ "grad_norm": 0.20405276119709015,
246
+ "learning_rate": 8.131222202490632e-06,
247
+ "loss": 0.1507,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.257542310522443,
252
+ "grad_norm": 0.35858389735221863,
253
+ "learning_rate": 8.115269245452196e-06,
254
+ "loss": 0.286,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.26490066225165565,
259
+ "grad_norm": 0.10192416608333588,
260
+ "learning_rate": 8.09931628841376e-06,
261
+ "loss": 0.0885,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.27225901398086827,
266
+ "grad_norm": 3.3536927700042725,
267
+ "learning_rate": 8.083363331375325e-06,
268
+ "loss": 0.2359,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.27961736571008095,
273
+ "grad_norm": 0.36128467321395874,
274
+ "learning_rate": 8.06741037433689e-06,
275
+ "loss": 0.2289,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.2869757174392936,
280
+ "grad_norm": 35.29405975341797,
281
+ "learning_rate": 8.051457417298454e-06,
282
+ "loss": 0.1705,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.29433406916850624,
287
+ "grad_norm": 13.290261268615723,
288
+ "learning_rate": 8.035504460260018e-06,
289
+ "loss": 0.1391,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.3016924208977189,
294
+ "grad_norm": 4.086803436279297,
295
+ "learning_rate": 8.019551503221582e-06,
296
+ "loss": 0.3115,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.3090507726269316,
301
+ "grad_norm": 85.40064239501953,
302
+ "learning_rate": 8.003598546183147e-06,
303
+ "loss": 0.1905,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.3164091243561442,
308
+ "grad_norm": 3.496741533279419,
309
+ "learning_rate": 7.987645589144711e-06,
310
+ "loss": 0.1074,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.3237674760853569,
315
+ "grad_norm": 32.27187728881836,
316
+ "learning_rate": 7.971692632106275e-06,
317
+ "loss": 0.1561,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.33112582781456956,
322
+ "grad_norm": 8.242842674255371,
323
+ "learning_rate": 7.95573967506784e-06,
324
+ "loss": 0.2492,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.3384841795437822,
329
+ "grad_norm": 29.650043487548828,
330
+ "learning_rate": 7.939786718029404e-06,
331
+ "loss": 0.2973,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.34584253127299486,
336
+ "grad_norm": 79.97142791748047,
337
+ "learning_rate": 7.923833760990968e-06,
338
+ "loss": 0.1544,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.35320088300220753,
343
+ "grad_norm": 29.046892166137695,
344
+ "learning_rate": 7.907880803952533e-06,
345
+ "loss": 0.1099,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.36055923473142015,
350
+ "grad_norm": 0.17732320725917816,
351
+ "learning_rate": 7.891927846914099e-06,
352
+ "loss": 0.1003,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.36791758646063283,
357
+ "grad_norm": 0.7188624143600464,
358
+ "learning_rate": 7.875974889875661e-06,
359
+ "loss": 0.3034,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.37527593818984545,
364
+ "grad_norm": 21.04668617248535,
365
+ "learning_rate": 7.860021932837226e-06,
366
+ "loss": 0.2042,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.3826342899190581,
371
+ "grad_norm": 5.541590690612793,
372
+ "learning_rate": 7.844068975798792e-06,
373
+ "loss": 0.2426,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.3899926416482708,
378
+ "grad_norm": 0.21457338333129883,
379
+ "learning_rate": 7.828116018760356e-06,
380
+ "loss": 0.2613,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.3973509933774834,
385
+ "grad_norm": 6.815304279327393,
386
+ "learning_rate": 7.812163061721919e-06,
387
+ "loss": 0.1203,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.4047093451066961,
392
+ "grad_norm": 1.4606707096099854,
393
+ "learning_rate": 7.796210104683485e-06,
394
+ "loss": 0.1307,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.41206769683590877,
399
+ "grad_norm": 170.10000610351562,
400
+ "learning_rate": 7.780257147645049e-06,
401
+ "loss": 0.2187,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.4194260485651214,
406
+ "grad_norm": 52.976402282714844,
407
+ "learning_rate": 7.764304190606613e-06,
408
+ "loss": 0.1297,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.42678440029433407,
413
+ "grad_norm": 1.0618321895599365,
414
+ "learning_rate": 7.748351233568176e-06,
415
+ "loss": 0.1972,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.43414275202354674,
420
+ "grad_norm": 0.2524672746658325,
421
+ "learning_rate": 7.732398276529742e-06,
422
+ "loss": 0.2513,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.44150110375275936,
427
+ "grad_norm": 0.7126919627189636,
428
+ "learning_rate": 7.716445319491306e-06,
429
+ "loss": 0.2055,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.44885945548197204,
434
+ "grad_norm": 6.286591529846191,
435
+ "learning_rate": 7.70049236245287e-06,
436
+ "loss": 0.0236,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.4562178072111847,
441
+ "grad_norm": 0.11959370225667953,
442
+ "learning_rate": 7.684539405414435e-06,
443
+ "loss": 0.2016,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.46357615894039733,
448
+ "grad_norm": 6.085186004638672,
449
+ "learning_rate": 7.668586448376e-06,
450
+ "loss": 0.3102,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.47093451066961,
455
+ "grad_norm": 96.2499008178711,
456
+ "learning_rate": 7.652633491337564e-06,
457
+ "loss": 0.2181,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.4782928623988227,
462
+ "grad_norm": 14.058534622192383,
463
+ "learning_rate": 7.636680534299128e-06,
464
+ "loss": 0.2536,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.4856512141280353,
469
+ "grad_norm": 33.89338684082031,
470
+ "learning_rate": 7.620727577260692e-06,
471
+ "loss": 0.2979,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.493009565857248,
476
+ "grad_norm": 0.733513355255127,
477
+ "learning_rate": 7.604774620222257e-06,
478
+ "loss": 0.1619,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.5003679175864606,
483
+ "grad_norm": 0.18061041831970215,
484
+ "learning_rate": 7.588821663183821e-06,
485
+ "loss": 0.1896,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.5077262693156733,
490
+ "grad_norm": 0.1485278606414795,
491
+ "learning_rate": 7.572868706145385e-06,
492
+ "loss": 0.0868,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.515084621044886,
497
+ "grad_norm": 65.20437622070312,
498
+ "learning_rate": 7.55691574910695e-06,
499
+ "loss": 0.1969,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.5224429727740986,
504
+ "grad_norm": 33.507415771484375,
505
+ "learning_rate": 7.540962792068515e-06,
506
+ "loss": 0.2309,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.5298013245033113,
511
+ "grad_norm": 7.292962551116943,
512
+ "learning_rate": 7.525009835030078e-06,
513
+ "loss": 0.3215,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.5371596762325239,
518
+ "grad_norm": 0.141510471701622,
519
+ "learning_rate": 7.509056877991643e-06,
520
+ "loss": 0.1861,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.5445180279617365,
525
+ "grad_norm": 10.76659107208252,
526
+ "learning_rate": 7.493103920953208e-06,
527
+ "loss": 0.1563,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.5518763796909493,
532
+ "grad_norm": 0.535017192363739,
533
+ "learning_rate": 7.477150963914772e-06,
534
+ "loss": 0.1037,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.5592347314201619,
539
+ "grad_norm": 0.12771788239479065,
540
+ "learning_rate": 7.461198006876336e-06,
541
+ "loss": 0.0714,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.5665930831493745,
546
+ "grad_norm": 0.13390083611011505,
547
+ "learning_rate": 7.4452450498379e-06,
548
+ "loss": 0.2959,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.5739514348785872,
553
+ "grad_norm": 56.0723991394043,
554
+ "learning_rate": 7.429292092799465e-06,
555
+ "loss": 0.1132,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.5813097866077999,
560
+ "grad_norm": 0.137408047914505,
561
+ "learning_rate": 7.4133391357610295e-06,
562
+ "loss": 0.2051,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.5886681383370125,
567
+ "grad_norm": 12.899972915649414,
568
+ "learning_rate": 7.397386178722593e-06,
569
+ "loss": 0.3972,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.5960264900662252,
574
+ "grad_norm": 11.240960121154785,
575
+ "learning_rate": 7.381433221684158e-06,
576
+ "loss": 0.3619,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.6033848417954378,
581
+ "grad_norm": 8.278285026550293,
582
+ "learning_rate": 7.3654802646457225e-06,
583
+ "loss": 0.1447,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.6107431935246505,
588
+ "grad_norm": 2.9699766635894775,
589
+ "learning_rate": 7.349527307607287e-06,
590
+ "loss": 0.1212,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.6181015452538632,
595
+ "grad_norm": 12.325569152832031,
596
+ "learning_rate": 7.333574350568851e-06,
597
+ "loss": 0.2392,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.6254598969830758,
602
+ "grad_norm": 1.178353190422058,
603
+ "learning_rate": 7.3176213935304155e-06,
604
+ "loss": 0.193,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.6328182487122884,
609
+ "grad_norm": 42.010711669921875,
610
+ "learning_rate": 7.30166843649198e-06,
611
+ "loss": 0.178,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.6401766004415012,
616
+ "grad_norm": 0.831745445728302,
617
+ "learning_rate": 7.285715479453545e-06,
618
+ "loss": 0.2348,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.6475349521707138,
623
+ "grad_norm": 68.13945007324219,
624
+ "learning_rate": 7.2697625224151084e-06,
625
+ "loss": 0.0788,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.6548933038999264,
630
+ "grad_norm": 33.832366943359375,
631
+ "learning_rate": 7.253809565376673e-06,
632
+ "loss": 0.1783,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.6622516556291391,
637
+ "grad_norm": 1.357936978340149,
638
+ "learning_rate": 7.237856608338238e-06,
639
+ "loss": 0.1938,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.6696100073583517,
644
+ "grad_norm": 53.46693420410156,
645
+ "learning_rate": 7.221903651299802e-06,
646
+ "loss": 0.4283,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.6769683590875644,
651
+ "grad_norm": 4.254789352416992,
652
+ "learning_rate": 7.205950694261366e-06,
653
+ "loss": 0.2616,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.6843267108167771,
658
+ "grad_norm": 75.10453796386719,
659
+ "learning_rate": 7.189997737222931e-06,
660
+ "loss": 0.1924,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.6916850625459897,
665
+ "grad_norm": 0.7422951459884644,
666
+ "learning_rate": 7.174044780184495e-06,
667
+ "loss": 0.13,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.6990434142752023,
672
+ "grad_norm": 18.9062557220459,
673
+ "learning_rate": 7.15809182314606e-06,
674
+ "loss": 0.1283,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.7064017660044151,
679
+ "grad_norm": 6.750290393829346,
680
+ "learning_rate": 7.142138866107623e-06,
681
+ "loss": 0.3134,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.7137601177336277,
686
+ "grad_norm": 16.308557510375977,
687
+ "learning_rate": 7.126185909069188e-06,
688
+ "loss": 0.2575,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.7211184694628403,
693
+ "grad_norm": 67.54432678222656,
694
+ "learning_rate": 7.110232952030753e-06,
695
+ "loss": 0.3646,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.7284768211920529,
700
+ "grad_norm": 20.565406799316406,
701
+ "learning_rate": 7.094279994992317e-06,
702
+ "loss": 0.1607,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.7358351729212657,
707
+ "grad_norm": 9.457584381103516,
708
+ "learning_rate": 7.078327037953881e-06,
709
+ "loss": 0.4157,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.7431935246504783,
714
+ "grad_norm": 1.9208470582962036,
715
+ "learning_rate": 7.062374080915446e-06,
716
+ "loss": 0.288,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.7505518763796909,
721
+ "grad_norm": 0.18186113238334656,
722
+ "learning_rate": 7.04642112387701e-06,
723
+ "loss": 0.1683,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.7579102281089036,
728
+ "grad_norm": 56.17685317993164,
729
+ "learning_rate": 7.030468166838575e-06,
730
+ "loss": 0.1789,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.7652685798381162,
735
+ "grad_norm": 22.048622131347656,
736
+ "learning_rate": 7.014515209800139e-06,
737
+ "loss": 0.2506,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.7726269315673289,
742
+ "grad_norm": 3.697582244873047,
743
+ "learning_rate": 6.998562252761703e-06,
744
+ "loss": 0.1667,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.7799852832965416,
749
+ "grad_norm": 6.335505962371826,
750
+ "learning_rate": 6.982609295723268e-06,
751
+ "loss": 0.1866,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.7873436350257542,
756
+ "grad_norm": 18.40208625793457,
757
+ "learning_rate": 6.9666563386848324e-06,
758
+ "loss": 0.2035,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.7947019867549668,
763
+ "grad_norm": 0.3842657506465912,
764
+ "learning_rate": 6.950703381646396e-06,
765
+ "loss": 0.3858,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.8020603384841796,
770
+ "grad_norm": 0.10957049578428268,
771
+ "learning_rate": 6.934750424607961e-06,
772
+ "loss": 0.1044,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.8094186902133922,
777
+ "grad_norm": 52.896461486816406,
778
+ "learning_rate": 6.9187974675695254e-06,
779
+ "loss": 0.0542,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.8167770419426048,
784
+ "grad_norm": 77.98719787597656,
785
+ "learning_rate": 6.90284451053109e-06,
786
+ "loss": 0.2078,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.8241353936718175,
791
+ "grad_norm": 103.59429931640625,
792
+ "learning_rate": 6.886891553492654e-06,
793
+ "loss": 0.1757,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 0.8314937454010302,
798
+ "grad_norm": 14.622530937194824,
799
+ "learning_rate": 6.8709385964542184e-06,
800
+ "loss": 0.1741,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 0.8388520971302428,
805
+ "grad_norm": 0.8220998048782349,
806
+ "learning_rate": 6.854985639415783e-06,
807
+ "loss": 0.1864,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 0.8462104488594555,
812
+ "grad_norm": 44.293025970458984,
813
+ "learning_rate": 6.839032682377347e-06,
814
+ "loss": 0.3249,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 0.8535688005886681,
819
+ "grad_norm": 45.35190200805664,
820
+ "learning_rate": 6.823079725338911e-06,
821
+ "loss": 0.2564,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 0.8609271523178808,
826
+ "grad_norm": 92.81620025634766,
827
+ "learning_rate": 6.807126768300476e-06,
828
+ "loss": 0.1886,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 0.8682855040470935,
833
+ "grad_norm": 110.62460327148438,
834
+ "learning_rate": 6.79117381126204e-06,
835
+ "loss": 0.2484,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 0.8756438557763061,
840
+ "grad_norm": 25.606109619140625,
841
+ "learning_rate": 6.775220854223605e-06,
842
+ "loss": 0.1544,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 0.8830022075055187,
847
+ "grad_norm": 0.10603518784046173,
848
+ "learning_rate": 6.759267897185169e-06,
849
+ "loss": 0.3421,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 0.8903605592347315,
854
+ "grad_norm": 0.8049039840698242,
855
+ "learning_rate": 6.743314940146733e-06,
856
+ "loss": 0.3171,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 0.8977189109639441,
861
+ "grad_norm": 0.1788051277399063,
862
+ "learning_rate": 6.727361983108298e-06,
863
+ "loss": 0.2324,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 0.9050772626931567,
868
+ "grad_norm": 4.411452293395996,
869
+ "learning_rate": 6.711409026069863e-06,
870
+ "loss": 0.0503,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 0.9124356144223694,
875
+ "grad_norm": 0.08730533719062805,
876
+ "learning_rate": 6.695456069031426e-06,
877
+ "loss": 0.1698,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 0.919793966151582,
882
+ "grad_norm": 14.45964527130127,
883
+ "learning_rate": 6.679503111992991e-06,
884
+ "loss": 0.1356,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 0.9271523178807947,
889
+ "grad_norm": 99.87224578857422,
890
+ "learning_rate": 6.663550154954556e-06,
891
+ "loss": 0.0962,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 0.9345106696100074,
896
+ "grad_norm": 0.4108155369758606,
897
+ "learning_rate": 6.64759719791612e-06,
898
+ "loss": 0.1787,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 0.94186902133922,
903
+ "grad_norm": 108.73641204833984,
904
+ "learning_rate": 6.631644240877684e-06,
905
+ "loss": 0.279,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 0.9492273730684326,
910
+ "grad_norm": 131.88836669921875,
911
+ "learning_rate": 6.6156912838392486e-06,
912
+ "loss": 0.1746,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 0.9565857247976454,
917
+ "grad_norm": 0.547515869140625,
918
+ "learning_rate": 6.599738326800813e-06,
919
+ "loss": 0.0208,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 0.963944076526858,
924
+ "grad_norm": 19.8458251953125,
925
+ "learning_rate": 6.583785369762378e-06,
926
+ "loss": 0.1389,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 0.9713024282560706,
931
+ "grad_norm": 0.5687592625617981,
932
+ "learning_rate": 6.5678324127239416e-06,
933
+ "loss": 0.1856,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 0.9786607799852833,
938
+ "grad_norm": 0.06650309264659882,
939
+ "learning_rate": 6.551879455685506e-06,
940
+ "loss": 0.2146,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 0.986019131714496,
945
+ "grad_norm": 38.21836471557617,
946
+ "learning_rate": 6.53592649864707e-06,
947
+ "loss": 0.2825,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 0.9933774834437086,
952
+ "grad_norm": 2.0989203453063965,
953
+ "learning_rate": 6.519973541608635e-06,
954
+ "loss": 0.341,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 1.0,
959
+ "eval_accuracy": 0.7880794701986755,
960
+ "eval_confusion_matrix": [
961
+ [
962
+ 676,
963
+ 8,
964
+ 78
965
+ ],
966
+ [
967
+ 7,
968
+ 70,
969
+ 34
970
+ ],
971
+ [
972
+ 110,
973
+ 19,
974
+ 206
975
+ ]
976
+ ],
977
+ "eval_f1": 0.7852632048062911,
978
+ "eval_loss": 1.1113489866256714,
979
+ "eval_precision": 0.7836833128242224,
980
+ "eval_recall": 0.7880794701986755,
981
+ "eval_runtime": 10.673,
982
+ "eval_samples_per_second": 113.183,
983
+ "eval_steps_per_second": 3.56,
984
+ "step": 1359
985
+ }
986
+ ],
987
+ "logging_steps": 10,
988
+ "max_steps": 5436,
989
+ "num_input_tokens_seen": 0,
990
+ "num_train_epochs": 4,
991
+ "save_steps": 500,
992
+ "stateful_callbacks": {
993
+ "TrainerControl": {
994
+ "args": {
995
+ "should_epoch_stop": false,
996
+ "should_evaluate": false,
997
+ "should_log": false,
998
+ "should_save": true,
999
+ "should_training_stop": false
1000
+ },
1001
+ "attributes": {}
1002
+ }
1003
+ },
1004
+ "total_flos": 1158870714941520.0,
1005
+ "train_batch_size": 8,
1006
+ "trial_name": null,
1007
+ "trial_params": {
1008
+ "learning_rate": 8.512497875709243e-06,
1009
+ "num_train_epochs": 4,
1010
+ "per_device_train_batch_size": 4,
1011
+ "seed": 14
1012
+ }
1013
+ }
run-2/checkpoint-1359/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1ed213c84afc0c72cb8266b9163d38ffb171814d89789d606f80a887cc280d4
3
+ size 5777
run-2/checkpoint-1359/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b658592e23f2a4cd1a0d40f6dc93783b89faf26708f63097721fc7f888ffd853
3
  size 5777
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1ed213c84afc0c72cb8266b9163d38ffb171814d89789d606f80a887cc280d4
3
  size 5777