Chantland committed on
Commit
c46e3ea
·
verified ·
1 Parent(s): 58fd8ac

Update model to Roberta

Browse files

Use RoBERTa-base as the example model instead of DistilBERT, as it yields marginal improvements and is the model we used when running predictions on further data.

config.json CHANGED
@@ -1,13 +1,15 @@
1
  {
2
- "_name_or_path": "distilbert-base-uncased",
3
- "activation": "gelu",
4
  "architectures": [
5
- "DistilBertForSequenceClassification"
6
  ],
7
- "attention_dropout": 0.1,
8
- "dim": 768,
9
- "dropout": 0.1,
10
- "hidden_dim": 3072,
 
 
 
11
  "id2label": {
12
  "0": "EVENT_Illness",
13
  "1": "EVENT_Accident",
@@ -23,6 +25,7 @@
23
  "11": "ACTION_Priest_High_Religion"
24
  },
25
  "initializer_range": 0.02,
 
26
  "label2id": {
27
  "ACTION_Divination": 9,
28
  "ACTION_Physical_Material": 7,
@@ -37,17 +40,17 @@
37
  "EVENT_Illness": 0,
38
  "EVENT_Other": 2
39
  },
40
- "max_position_embeddings": 512,
41
- "model_type": "distilbert",
42
- "n_heads": 12,
43
- "n_layers": 6,
44
- "pad_token_id": 0,
 
 
45
  "problem_type": "multi_label_classification",
46
- "qa_dropout": 0.1,
47
- "seq_classif_dropout": 0.2,
48
- "sinusoidal_pos_embds": false,
49
- "tie_weights_": true,
50
  "torch_dtype": "float32",
51
  "transformers_version": "4.41.1",
52
- "vocab_size": 30522
 
 
53
  }
 
1
  {
2
+ "_name_or_path": "roberta-base",
 
3
  "architectures": [
4
+ "RobertaForSequenceClassification"
5
  ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
  "id2label": {
14
  "0": "EVENT_Illness",
15
  "1": "EVENT_Accident",
 
25
  "11": "ACTION_Priest_High_Religion"
26
  },
27
  "initializer_range": 0.02,
28
+ "intermediate_size": 3072,
29
  "label2id": {
30
  "ACTION_Divination": 9,
31
  "ACTION_Physical_Material": 7,
 
40
  "EVENT_Illness": 0,
41
  "EVENT_Other": 2
42
  },
43
+ "layer_norm_eps": 1e-05,
44
+ "max_position_embeddings": 514,
45
+ "model_type": "roberta",
46
+ "num_attention_heads": 12,
47
+ "num_hidden_layers": 12,
48
+ "pad_token_id": 1,
49
+ "position_embedding_type": "absolute",
50
  "problem_type": "multi_label_classification",
 
 
 
 
51
  "torch_dtype": "float32",
52
  "transformers_version": "4.41.1",
53
+ "type_vocab_size": 1,
54
+ "use_cache": true,
55
+ "vocab_size": 50265
56
  }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:610010ff6d1820ecda9ccc984dc14b78e65a98750da221e9da71dd4100294c76
3
- size 267863328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5ca05876cf3afcab6dd3663ec1da3c4140beba4d57538335952e4fcbc5726f8
3
+ size 498643584
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:097a9d2dbbb8bd734adf115a34b696de90b89b7e310a93e9fb3c58bd2e7ddceb
3
- size 535788730
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff392e23a8b29f83c080239697cb66c5590f3624a6cd4bdbd3c7bd6e416b9c3a
3
+ size 997406970
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:88ff5bb36b8e29678107a9fe193a35ed023fde1c7b0d9c95341d37f70c2cdbea
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d79ca43ed4de6070e99212deaaf8b64aeb3e2e8e121eced50b7b4583557dcafd
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:10b3b185fea1205fbfcb56f550d303bc0b6cb7d325de973a2d3dc5d3e34c96cf
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66dbd422e8b77089bde7762cdb8a7cd324acef7ce31dd863b7ad1d09d9d144e4
3
  size 1064
special_tokens_map.json CHANGED
@@ -1,7 +1,15 @@
1
  {
2
- "cls_token": "[CLS]",
3
- "mask_token": "[MASK]",
4
- "pad_token": "[PAD]",
5
- "sep_token": "[SEP]",
6
- "unk_token": "[UNK]"
 
 
 
 
 
 
 
 
7
  }
 
1
  {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,55 +1,57 @@
1
  {
 
2
  "added_tokens_decoder": {
3
  "0": {
4
- "content": "[PAD]",
5
  "lstrip": false,
6
- "normalized": false,
7
  "rstrip": false,
8
  "single_word": false,
9
  "special": true
10
  },
11
- "100": {
12
- "content": "[UNK]",
13
  "lstrip": false,
14
- "normalized": false,
15
  "rstrip": false,
16
  "single_word": false,
17
  "special": true
18
  },
19
- "101": {
20
- "content": "[CLS]",
21
  "lstrip": false,
22
- "normalized": false,
23
  "rstrip": false,
24
  "single_word": false,
25
  "special": true
26
  },
27
- "102": {
28
- "content": "[SEP]",
29
  "lstrip": false,
30
- "normalized": false,
31
  "rstrip": false,
32
  "single_word": false,
33
  "special": true
34
  },
35
- "103": {
36
- "content": "[MASK]",
37
- "lstrip": false,
38
  "normalized": false,
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
42
  }
43
  },
 
44
  "clean_up_tokenization_spaces": true,
45
- "cls_token": "[CLS]",
46
- "do_lower_case": true,
47
- "mask_token": "[MASK]",
 
48
  "model_max_length": 512,
49
- "pad_token": "[PAD]",
50
- "sep_token": "[SEP]",
51
- "strip_accents": null,
52
- "tokenize_chinese_chars": true,
53
- "tokenizer_class": "DistilBertTokenizer",
54
- "unk_token": "[UNK]"
55
  }
 
1
  {
2
+ "add_prefix_space": false,
3
  "added_tokens_decoder": {
4
  "0": {
5
+ "content": "<s>",
6
  "lstrip": false,
7
+ "normalized": true,
8
  "rstrip": false,
9
  "single_word": false,
10
  "special": true
11
  },
12
+ "1": {
13
+ "content": "<pad>",
14
  "lstrip": false,
15
+ "normalized": true,
16
  "rstrip": false,
17
  "single_word": false,
18
  "special": true
19
  },
20
+ "2": {
21
+ "content": "</s>",
22
  "lstrip": false,
23
+ "normalized": true,
24
  "rstrip": false,
25
  "single_word": false,
26
  "special": true
27
  },
28
+ "3": {
29
+ "content": "<unk>",
30
  "lstrip": false,
31
+ "normalized": true,
32
  "rstrip": false,
33
  "single_word": false,
34
  "special": true
35
  },
36
+ "50264": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
  "normalized": false,
40
  "rstrip": false,
41
  "single_word": false,
42
  "special": true
43
  }
44
  },
45
+ "bos_token": "<s>",
46
  "clean_up_tokenization_spaces": true,
47
+ "cls_token": "<s>",
48
+ "eos_token": "</s>",
49
+ "errors": "replace",
50
+ "mask_token": "<mask>",
51
  "model_max_length": 512,
52
+ "pad_token": "<pad>",
53
+ "sep_token": "</s>",
54
+ "tokenizer_class": "RobertaTokenizer",
55
+ "trim_offsets": true,
56
+ "unk_token": "<unk>"
 
57
  }
trainer_state.json CHANGED
@@ -1,904 +1,1045 @@
1
  {
2
- "best_metric": 0.6474642162926313,
3
- "best_model_checkpoint": "drive/MyDrive/NLP_HRAF//Models/HRAF_MultiLabel_SubClasses_Kfolds/Model_3_LearningRates/Learning_Rate_2e-05_fold_1/checkpoint-10790",
4
- "epoch": 13.0,
5
  "eval_steps": 500,
6
- "global_step": 10790,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.12048192771084337,
13
- "grad_norm": 0.7108750343322754,
14
  "learning_rate": 1.9839357429718877e-05,
15
- "loss": 0.4589,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.24096385542168675,
20
- "grad_norm": 0.7625552415847778,
21
  "learning_rate": 1.967871485943775e-05,
22
- "loss": 0.3592,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.3614457831325301,
27
- "grad_norm": 0.8666885495185852,
28
  "learning_rate": 1.951807228915663e-05,
29
- "loss": 0.3393,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.4819277108433735,
34
- "grad_norm": 0.9493631720542908,
35
  "learning_rate": 1.9357429718875505e-05,
36
- "loss": 0.3154,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.6024096385542169,
41
- "grad_norm": 0.8787522315979004,
42
  "learning_rate": 1.9196787148594377e-05,
43
- "loss": 0.3001,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.7228915662650602,
48
- "grad_norm": 0.9206348061561584,
49
  "learning_rate": 1.9036144578313255e-05,
50
- "loss": 0.2905,
51
  "step": 600
52
  },
53
  {
54
  "epoch": 0.8433734939759037,
55
- "grad_norm": 1.008158564567566,
56
  "learning_rate": 1.887550200803213e-05,
57
- "loss": 0.2877,
58
  "step": 700
59
  },
60
  {
61
  "epoch": 0.963855421686747,
62
- "grad_norm": 1.307525873184204,
63
  "learning_rate": 1.8714859437751005e-05,
64
- "loss": 0.2888,
65
  "step": 800
66
  },
67
  {
68
  "epoch": 1.0,
69
- "eval_accuracy": 0.36347197106690776,
70
- "eval_f1": 0.5040954832670255,
71
- "eval_loss": 0.2610304355621338,
72
- "eval_roc_auc": 0.67926006705383,
73
- "eval_runtime": 20.823,
74
- "eval_samples_per_second": 79.671,
75
- "eval_steps_per_second": 9.989,
76
  "step": 830
77
  },
78
  {
79
  "epoch": 1.0843373493975903,
80
- "grad_norm": 1.141208529472351,
81
  "learning_rate": 1.855421686746988e-05,
82
- "loss": 0.269,
83
  "step": 900
84
  },
85
  {
86
  "epoch": 1.2048192771084336,
87
- "grad_norm": 0.8844193816184998,
88
  "learning_rate": 1.8393574297188755e-05,
89
- "loss": 0.2579,
90
  "step": 1000
91
  },
92
  {
93
  "epoch": 1.3253012048192772,
94
- "grad_norm": 1.1616642475128174,
95
  "learning_rate": 1.8232931726907634e-05,
96
- "loss": 0.2549,
97
  "step": 1100
98
  },
99
  {
100
  "epoch": 1.4457831325301205,
101
- "grad_norm": 1.3445640802383423,
102
  "learning_rate": 1.807228915662651e-05,
103
- "loss": 0.2551,
104
  "step": 1200
105
  },
106
  {
107
  "epoch": 1.5662650602409638,
108
- "grad_norm": 1.131273865699768,
109
  "learning_rate": 1.7911646586345384e-05,
110
- "loss": 0.2672,
111
  "step": 1300
112
  },
113
  {
114
  "epoch": 1.6867469879518073,
115
- "grad_norm": 1.3358525037765503,
116
  "learning_rate": 1.775100401606426e-05,
117
- "loss": 0.2454,
118
  "step": 1400
119
  },
120
  {
121
  "epoch": 1.8072289156626506,
122
- "grad_norm": 1.2100324630737305,
123
  "learning_rate": 1.7590361445783134e-05,
124
- "loss": 0.2364,
125
  "step": 1500
126
  },
127
  {
128
  "epoch": 1.927710843373494,
129
- "grad_norm": 1.2214767932891846,
130
  "learning_rate": 1.742971887550201e-05,
131
- "loss": 0.2409,
132
  "step": 1600
133
  },
134
  {
135
  "epoch": 2.0,
136
- "eval_accuracy": 0.3767329716696805,
137
- "eval_f1": 0.6076237435994689,
138
- "eval_loss": 0.24850943684577942,
139
- "eval_roc_auc": 0.7578414834576195,
140
- "eval_runtime": 21.0346,
141
- "eval_samples_per_second": 78.87,
142
- "eval_steps_per_second": 9.888,
143
  "step": 1660
144
  },
145
  {
146
  "epoch": 2.0481927710843375,
147
- "grad_norm": 0.8881352543830872,
148
  "learning_rate": 1.7269076305220884e-05,
149
- "loss": 0.2341,
150
  "step": 1700
151
  },
152
  {
153
  "epoch": 2.1686746987951806,
154
- "grad_norm": 1.4135143756866455,
155
  "learning_rate": 1.710843373493976e-05,
156
- "loss": 0.2119,
157
  "step": 1800
158
  },
159
  {
160
  "epoch": 2.289156626506024,
161
- "grad_norm": 1.291266918182373,
162
  "learning_rate": 1.6947791164658637e-05,
163
- "loss": 0.2074,
164
  "step": 1900
165
  },
166
  {
167
  "epoch": 2.4096385542168672,
168
- "grad_norm": 1.311950445175171,
169
  "learning_rate": 1.6787148594377512e-05,
170
- "loss": 0.2063,
171
  "step": 2000
172
  },
173
  {
174
  "epoch": 2.5301204819277108,
175
- "grad_norm": 1.6564782857894897,
176
  "learning_rate": 1.6626506024096387e-05,
177
- "loss": 0.2169,
178
  "step": 2100
179
  },
180
  {
181
  "epoch": 2.6506024096385543,
182
- "grad_norm": 2.071871757507324,
183
  "learning_rate": 1.6465863453815262e-05,
184
- "loss": 0.2041,
185
  "step": 2200
186
  },
187
  {
188
  "epoch": 2.7710843373493974,
189
- "grad_norm": 1.6373318433761597,
190
  "learning_rate": 1.6305220883534137e-05,
191
- "loss": 0.2149,
192
  "step": 2300
193
  },
194
  {
195
  "epoch": 2.891566265060241,
196
- "grad_norm": 1.5182716846466064,
197
  "learning_rate": 1.6144578313253015e-05,
198
- "loss": 0.2057,
199
  "step": 2400
200
  },
201
  {
202
  "epoch": 3.0,
203
- "eval_accuracy": 0.3821579264617239,
204
- "eval_f1": 0.6133793367835921,
205
- "eval_loss": 0.24218665063381195,
206
- "eval_roc_auc": 0.7590708449600018,
207
- "eval_runtime": 20.7829,
208
- "eval_samples_per_second": 79.825,
209
- "eval_steps_per_second": 10.008,
210
  "step": 2490
211
  },
212
  {
213
  "epoch": 3.0120481927710845,
214
- "grad_norm": 1.7013568878173828,
215
  "learning_rate": 1.5983935742971887e-05,
216
- "loss": 0.202,
217
  "step": 2500
218
  },
219
  {
220
  "epoch": 3.1325301204819276,
221
- "grad_norm": 1.2363818883895874,
222
  "learning_rate": 1.5823293172690762e-05,
223
- "loss": 0.1797,
224
  "step": 2600
225
  },
226
  {
227
  "epoch": 3.253012048192771,
228
- "grad_norm": 2.084066390991211,
229
  "learning_rate": 1.566265060240964e-05,
230
  "loss": 0.1788,
231
  "step": 2700
232
  },
233
  {
234
  "epoch": 3.3734939759036147,
235
- "grad_norm": 1.1712193489074707,
236
  "learning_rate": 1.5502008032128516e-05,
237
- "loss": 0.169,
238
  "step": 2800
239
  },
240
  {
241
  "epoch": 3.4939759036144578,
242
- "grad_norm": 1.9433542490005493,
243
  "learning_rate": 1.534136546184739e-05,
244
- "loss": 0.1712,
245
  "step": 2900
246
  },
247
  {
248
  "epoch": 3.6144578313253013,
249
- "grad_norm": 2.300347328186035,
250
  "learning_rate": 1.5180722891566266e-05,
251
- "loss": 0.1749,
252
  "step": 3000
253
  },
254
  {
255
  "epoch": 3.734939759036145,
256
- "grad_norm": 1.4740608930587769,
257
  "learning_rate": 1.5020080321285142e-05,
258
- "loss": 0.1736,
259
  "step": 3100
260
  },
261
  {
262
  "epoch": 3.855421686746988,
263
- "grad_norm": 1.7682331800460815,
264
  "learning_rate": 1.4859437751004017e-05,
265
- "loss": 0.1635,
266
  "step": 3200
267
  },
268
  {
269
  "epoch": 3.9759036144578315,
270
- "grad_norm": 1.4352338314056396,
271
  "learning_rate": 1.4698795180722894e-05,
272
- "loss": 0.1587,
273
  "step": 3300
274
  },
275
  {
276
  "epoch": 4.0,
277
- "eval_accuracy": 0.38276069921639544,
278
- "eval_f1": 0.6340291889894697,
279
- "eval_loss": 0.2462325543165207,
280
- "eval_roc_auc": 0.7771644800479557,
281
- "eval_runtime": 21.0125,
282
- "eval_samples_per_second": 78.953,
283
- "eval_steps_per_second": 9.899,
284
  "step": 3320
285
  },
286
  {
287
  "epoch": 4.096385542168675,
288
- "grad_norm": 3.4992010593414307,
289
  "learning_rate": 1.4538152610441769e-05,
290
- "loss": 0.148,
291
  "step": 3400
292
  },
293
  {
294
  "epoch": 4.216867469879518,
295
- "grad_norm": 1.555429458618164,
296
  "learning_rate": 1.4377510040160642e-05,
297
- "loss": 0.1397,
298
  "step": 3500
299
  },
300
  {
301
  "epoch": 4.337349397590361,
302
- "grad_norm": 2.655567169189453,
303
  "learning_rate": 1.4216867469879519e-05,
304
- "loss": 0.1412,
305
  "step": 3600
306
  },
307
  {
308
  "epoch": 4.457831325301205,
309
- "grad_norm": 1.797630786895752,
310
  "learning_rate": 1.4056224899598394e-05,
311
- "loss": 0.1419,
312
  "step": 3700
313
  },
314
  {
315
  "epoch": 4.578313253012048,
316
- "grad_norm": 1.2415262460708618,
317
  "learning_rate": 1.3895582329317269e-05,
318
- "loss": 0.1405,
319
  "step": 3800
320
  },
321
  {
322
  "epoch": 4.698795180722891,
323
- "grad_norm": 1.4111042022705078,
324
  "learning_rate": 1.3734939759036146e-05,
325
- "loss": 0.1349,
326
  "step": 3900
327
  },
328
  {
329
  "epoch": 4.8192771084337345,
330
- "grad_norm": 2.2596189975738525,
331
  "learning_rate": 1.357429718875502e-05,
332
- "loss": 0.1291,
333
  "step": 4000
334
  },
335
  {
336
  "epoch": 4.9397590361445785,
337
- "grad_norm": 2.264066457748413,
338
  "learning_rate": 1.3413654618473897e-05,
339
- "loss": 0.1412,
340
  "step": 4100
341
  },
342
  {
343
  "epoch": 5.0,
344
- "eval_accuracy": 0.38637733574442434,
345
- "eval_f1": 0.627102627102627,
346
- "eval_loss": 0.25749197602272034,
347
- "eval_roc_auc": 0.7690261217645041,
348
- "eval_runtime": 20.8914,
349
- "eval_samples_per_second": 79.411,
350
- "eval_steps_per_second": 9.956,
351
  "step": 4150
352
  },
353
  {
354
  "epoch": 5.0602409638554215,
355
- "grad_norm": 1.1299407482147217,
356
  "learning_rate": 1.3253012048192772e-05,
357
- "loss": 0.1223,
358
  "step": 4200
359
  },
360
  {
361
  "epoch": 5.180722891566265,
362
- "grad_norm": 1.1912181377410889,
363
  "learning_rate": 1.309236947791165e-05,
364
- "loss": 0.1112,
365
  "step": 4300
366
  },
367
  {
368
  "epoch": 5.301204819277109,
369
- "grad_norm": 2.00722074508667,
370
  "learning_rate": 1.2931726907630524e-05,
371
- "loss": 0.1035,
372
  "step": 4400
373
  },
374
  {
375
  "epoch": 5.421686746987952,
376
- "grad_norm": 1.543757677078247,
377
  "learning_rate": 1.2771084337349398e-05,
378
- "loss": 0.1082,
379
  "step": 4500
380
  },
381
  {
382
  "epoch": 5.542168674698795,
383
- "grad_norm": 1.6844489574432373,
384
  "learning_rate": 1.2610441767068273e-05,
385
- "loss": 0.1113,
386
  "step": 4600
387
  },
388
  {
389
  "epoch": 5.662650602409639,
390
- "grad_norm": 2.3987362384796143,
391
  "learning_rate": 1.244979919678715e-05,
392
- "loss": 0.1054,
393
  "step": 4700
394
  },
395
  {
396
  "epoch": 5.783132530120482,
397
- "grad_norm": 1.196558952331543,
398
  "learning_rate": 1.2289156626506024e-05,
399
- "loss": 0.1076,
400
  "step": 4800
401
  },
402
  {
403
  "epoch": 5.903614457831325,
404
- "grad_norm": 1.7159672975540161,
405
  "learning_rate": 1.2128514056224901e-05,
406
- "loss": 0.1129,
407
  "step": 4900
408
  },
409
  {
410
  "epoch": 6.0,
411
- "eval_accuracy": 0.38396624472573837,
412
- "eval_f1": 0.6401569186875892,
413
- "eval_loss": 0.28615912795066833,
414
- "eval_roc_auc": 0.7876849584660104,
415
- "eval_runtime": 21.0174,
416
- "eval_samples_per_second": 78.935,
417
- "eval_steps_per_second": 9.897,
418
  "step": 4980
419
  },
420
  {
421
  "epoch": 6.024096385542169,
422
- "grad_norm": 1.7499059438705444,
423
  "learning_rate": 1.1967871485943776e-05,
424
- "loss": 0.101,
425
  "step": 5000
426
  },
427
  {
428
  "epoch": 6.144578313253012,
429
- "grad_norm": 1.2233022451400757,
430
  "learning_rate": 1.1807228915662651e-05,
431
- "loss": 0.0835,
432
  "step": 5100
433
  },
434
  {
435
  "epoch": 6.265060240963855,
436
- "grad_norm": 1.219558596611023,
437
  "learning_rate": 1.1646586345381528e-05,
438
- "loss": 0.082,
439
  "step": 5200
440
  },
441
  {
442
  "epoch": 6.385542168674699,
443
- "grad_norm": 2.4673707485198975,
444
  "learning_rate": 1.1485943775100403e-05,
445
- "loss": 0.0808,
446
  "step": 5300
447
  },
448
  {
449
  "epoch": 6.506024096385542,
450
- "grad_norm": 2.749701738357544,
451
  "learning_rate": 1.132530120481928e-05,
452
- "loss": 0.0891,
453
  "step": 5400
454
  },
455
  {
456
  "epoch": 6.626506024096385,
457
- "grad_norm": 2.653024196624756,
458
  "learning_rate": 1.1164658634538153e-05,
459
- "loss": 0.0895,
460
  "step": 5500
461
  },
462
  {
463
  "epoch": 6.746987951807229,
464
- "grad_norm": 1.81606924533844,
465
  "learning_rate": 1.1004016064257028e-05,
466
- "loss": 0.0827,
467
  "step": 5600
468
  },
469
  {
470
  "epoch": 6.867469879518072,
471
- "grad_norm": 2.775585174560547,
472
  "learning_rate": 1.0843373493975904e-05,
473
- "loss": 0.0872,
474
  "step": 5700
475
  },
476
  {
477
  "epoch": 6.9879518072289155,
478
- "grad_norm": 3.0529415607452393,
479
  "learning_rate": 1.068273092369478e-05,
480
- "loss": 0.0754,
481
  "step": 5800
482
  },
483
  {
484
  "epoch": 7.0,
485
- "eval_accuracy": 0.3899939722724533,
486
- "eval_f1": 0.6352293577981651,
487
- "eval_loss": 0.30027899146080017,
488
- "eval_roc_auc": 0.7791627807894616,
489
- "eval_runtime": 20.9855,
490
- "eval_samples_per_second": 79.054,
491
- "eval_steps_per_second": 9.912,
492
  "step": 5810
493
  },
494
  {
495
  "epoch": 7.108433734939759,
496
- "grad_norm": 1.6993205547332764,
497
  "learning_rate": 1.0522088353413654e-05,
498
- "loss": 0.0723,
499
  "step": 5900
500
  },
501
  {
502
  "epoch": 7.228915662650603,
503
- "grad_norm": 2.1551597118377686,
504
  "learning_rate": 1.0361445783132531e-05,
505
- "loss": 0.0685,
506
  "step": 6000
507
  },
508
  {
509
  "epoch": 7.349397590361446,
510
- "grad_norm": 1.4902188777923584,
511
  "learning_rate": 1.0200803212851406e-05,
512
- "loss": 0.0664,
513
  "step": 6100
514
  },
515
  {
516
  "epoch": 7.469879518072289,
517
- "grad_norm": 0.829651951789856,
518
  "learning_rate": 1.0040160642570283e-05,
519
- "loss": 0.0662,
520
  "step": 6200
521
  },
522
  {
523
  "epoch": 7.590361445783133,
524
- "grad_norm": 2.141355037689209,
525
  "learning_rate": 9.879518072289156e-06,
526
- "loss": 0.0654,
527
  "step": 6300
528
  },
529
  {
530
  "epoch": 7.710843373493976,
531
- "grad_norm": 0.7993516325950623,
532
  "learning_rate": 9.718875502008033e-06,
533
- "loss": 0.0669,
534
  "step": 6400
535
  },
536
  {
537
  "epoch": 7.831325301204819,
538
- "grad_norm": 0.8961694836616516,
539
  "learning_rate": 9.558232931726908e-06,
540
- "loss": 0.0677,
541
  "step": 6500
542
  },
543
  {
544
  "epoch": 7.951807228915663,
545
- "grad_norm": 1.8866826295852661,
546
  "learning_rate": 9.397590361445785e-06,
547
- "loss": 0.058,
548
  "step": 6600
549
  },
550
  {
551
  "epoch": 8.0,
552
- "eval_accuracy": 0.3887884267631103,
553
- "eval_f1": 0.6461868190748349,
554
- "eval_loss": 0.3176693618297577,
555
- "eval_roc_auc": 0.790825180731994,
556
- "eval_runtime": 20.9796,
557
- "eval_samples_per_second": 79.077,
558
- "eval_steps_per_second": 9.914,
559
  "step": 6640
560
  },
561
  {
562
  "epoch": 8.072289156626505,
563
- "grad_norm": 0.4812127947807312,
564
  "learning_rate": 9.23694779116466e-06,
565
- "loss": 0.0543,
566
  "step": 6700
567
  },
568
  {
569
  "epoch": 8.19277108433735,
570
- "grad_norm": 1.1170074939727783,
571
  "learning_rate": 9.076305220883535e-06,
572
- "loss": 0.0534,
573
  "step": 6800
574
  },
575
  {
576
  "epoch": 8.313253012048193,
577
- "grad_norm": 2.044552803039551,
578
  "learning_rate": 8.91566265060241e-06,
579
- "loss": 0.0541,
580
  "step": 6900
581
  },
582
  {
583
  "epoch": 8.433734939759036,
584
- "grad_norm": 2.3580517768859863,
585
  "learning_rate": 8.755020080321286e-06,
586
- "loss": 0.046,
587
  "step": 7000
588
  },
589
  {
590
  "epoch": 8.55421686746988,
591
- "grad_norm": 2.568995952606201,
592
  "learning_rate": 8.594377510040161e-06,
593
- "loss": 0.0516,
594
  "step": 7100
595
  },
596
  {
597
  "epoch": 8.674698795180722,
598
- "grad_norm": 0.7591239213943481,
599
  "learning_rate": 8.433734939759038e-06,
600
- "loss": 0.0503,
601
  "step": 7200
602
  },
603
  {
604
  "epoch": 8.795180722891565,
605
- "grad_norm": 1.0098503828048706,
606
  "learning_rate": 8.273092369477911e-06,
607
- "loss": 0.0452,
608
  "step": 7300
609
  },
610
  {
611
  "epoch": 8.91566265060241,
612
- "grad_norm": 2.4211244583129883,
613
  "learning_rate": 8.112449799196788e-06,
614
- "loss": 0.0611,
615
  "step": 7400
616
  },
617
  {
618
  "epoch": 9.0,
619
- "eval_accuracy": 0.38396624472573837,
620
- "eval_f1": 0.6377283414722372,
621
- "eval_loss": 0.33842870593070984,
622
- "eval_roc_auc": 0.783423869627736,
623
- "eval_runtime": 20.9102,
624
- "eval_samples_per_second": 79.339,
625
- "eval_steps_per_second": 9.947,
626
  "step": 7470
627
  },
628
  {
629
  "epoch": 9.036144578313253,
630
- "grad_norm": 1.7786929607391357,
631
  "learning_rate": 7.951807228915663e-06,
632
- "loss": 0.0467,
633
  "step": 7500
634
  },
635
  {
636
  "epoch": 9.156626506024097,
637
- "grad_norm": 0.9288263916969299,
638
  "learning_rate": 7.79116465863454e-06,
639
- "loss": 0.0358,
640
  "step": 7600
641
  },
642
  {
643
  "epoch": 9.27710843373494,
644
- "grad_norm": 1.6899335384368896,
645
  "learning_rate": 7.630522088353415e-06,
646
- "loss": 0.0403,
647
  "step": 7700
648
  },
649
  {
650
  "epoch": 9.397590361445783,
651
- "grad_norm": 0.633351743221283,
652
  "learning_rate": 7.469879518072289e-06,
653
- "loss": 0.0409,
654
  "step": 7800
655
  },
656
  {
657
  "epoch": 9.518072289156626,
658
- "grad_norm": 1.880730152130127,
659
  "learning_rate": 7.309236947791165e-06,
660
- "loss": 0.04,
661
  "step": 7900
662
  },
663
  {
664
  "epoch": 9.638554216867469,
665
- "grad_norm": 0.7761407494544983,
666
  "learning_rate": 7.148594377510041e-06,
667
- "loss": 0.0416,
668
  "step": 8000
669
  },
670
  {
671
  "epoch": 9.759036144578314,
672
- "grad_norm": 1.7540706396102905,
673
  "learning_rate": 6.987951807228917e-06,
674
- "loss": 0.0383,
675
  "step": 8100
676
  },
677
  {
678
  "epoch": 9.879518072289157,
679
- "grad_norm": 1.621785044670105,
680
  "learning_rate": 6.8273092369477925e-06,
681
- "loss": 0.0399,
682
  "step": 8200
683
  },
684
  {
685
  "epoch": 10.0,
686
- "grad_norm": 4.052353382110596,
687
  "learning_rate": 6.666666666666667e-06,
688
- "loss": 0.0418,
689
  "step": 8300
690
  },
691
  {
692
  "epoch": 10.0,
693
- "eval_accuracy": 0.38155515370705245,
694
- "eval_f1": 0.6405322783672002,
695
- "eval_loss": 0.3592796325683594,
696
- "eval_roc_auc": 0.7861850408280293,
697
- "eval_runtime": 20.9788,
698
- "eval_samples_per_second": 79.08,
699
- "eval_steps_per_second": 9.915,
700
  "step": 8300
701
  },
702
  {
703
  "epoch": 10.120481927710843,
704
- "grad_norm": 2.524548053741455,
705
  "learning_rate": 6.5060240963855425e-06,
706
- "loss": 0.0327,
707
  "step": 8400
708
  },
709
  {
710
  "epoch": 10.240963855421686,
711
- "grad_norm": 2.5789272785186768,
712
  "learning_rate": 6.345381526104418e-06,
713
- "loss": 0.0329,
714
  "step": 8500
715
  },
716
  {
717
  "epoch": 10.36144578313253,
718
- "grad_norm": 1.6276124715805054,
719
  "learning_rate": 6.184738955823294e-06,
720
- "loss": 0.0327,
721
  "step": 8600
722
  },
723
  {
724
  "epoch": 10.481927710843374,
725
- "grad_norm": 1.2820446491241455,
726
  "learning_rate": 6.02409638554217e-06,
727
- "loss": 0.0319,
728
  "step": 8700
729
  },
730
  {
731
  "epoch": 10.602409638554217,
732
- "grad_norm": 0.8200409412384033,
733
  "learning_rate": 5.863453815261044e-06,
734
- "loss": 0.0338,
735
  "step": 8800
736
  },
737
  {
738
  "epoch": 10.72289156626506,
739
- "grad_norm": 1.6061540842056274,
740
  "learning_rate": 5.70281124497992e-06,
741
- "loss": 0.0311,
742
  "step": 8900
743
  },
744
  {
745
  "epoch": 10.843373493975903,
746
- "grad_norm": 2.1266753673553467,
747
  "learning_rate": 5.542168674698796e-06,
748
- "loss": 0.0309,
749
  "step": 9000
750
  },
751
  {
752
  "epoch": 10.963855421686747,
753
- "grad_norm": 1.8387219905853271,
754
  "learning_rate": 5.381526104417672e-06,
755
- "loss": 0.0338,
756
  "step": 9100
757
  },
758
  {
759
  "epoch": 11.0,
760
- "eval_accuracy": 0.38155515370705245,
761
- "eval_f1": 0.6427417907769604,
762
- "eval_loss": 0.37010136246681213,
763
- "eval_roc_auc": 0.787888262607272,
764
- "eval_runtime": 20.9246,
765
- "eval_samples_per_second": 79.285,
766
- "eval_steps_per_second": 9.94,
767
  "step": 9130
768
  },
769
  {
770
  "epoch": 11.08433734939759,
771
- "grad_norm": 1.1557573080062866,
772
  "learning_rate": 5.220883534136547e-06,
773
- "loss": 0.0261,
774
  "step": 9200
775
  },
776
  {
777
  "epoch": 11.204819277108435,
778
- "grad_norm": 0.8962405920028687,
779
  "learning_rate": 5.060240963855422e-06,
780
- "loss": 0.0247,
781
  "step": 9300
782
  },
783
  {
784
  "epoch": 11.325301204819278,
785
- "grad_norm": 0.8799346089363098,
786
  "learning_rate": 4.899598393574298e-06,
787
- "loss": 0.0255,
788
  "step": 9400
789
  },
790
  {
791
  "epoch": 11.44578313253012,
792
- "grad_norm": 0.8840754628181458,
793
  "learning_rate": 4.7389558232931736e-06,
794
- "loss": 0.0266,
795
  "step": 9500
796
  },
797
  {
798
  "epoch": 11.566265060240964,
799
- "grad_norm": 1.1683375835418701,
800
  "learning_rate": 4.578313253012049e-06,
801
- "loss": 0.0261,
802
  "step": 9600
803
  },
804
  {
805
  "epoch": 11.686746987951807,
806
- "grad_norm": 0.885728120803833,
807
  "learning_rate": 4.4176706827309244e-06,
808
- "loss": 0.0257,
809
  "step": 9700
810
  },
811
  {
812
  "epoch": 11.80722891566265,
813
- "grad_norm": 0.2890942394733429,
814
  "learning_rate": 4.2570281124497995e-06,
815
- "loss": 0.029,
816
  "step": 9800
817
  },
818
  {
819
  "epoch": 11.927710843373493,
820
- "grad_norm": 2.2087390422821045,
821
  "learning_rate": 4.096385542168675e-06,
822
- "loss": 0.0255,
823
  "step": 9900
824
  },
825
  {
826
  "epoch": 12.0,
827
- "eval_accuracy": 0.3845690174804099,
828
- "eval_f1": 0.6420863309352519,
829
- "eval_loss": 0.3844664394855499,
830
- "eval_roc_auc": 0.7870362182333409,
831
- "eval_runtime": 20.95,
832
- "eval_samples_per_second": 79.189,
833
- "eval_steps_per_second": 9.928,
834
  "step": 9960
835
  },
836
  {
837
  "epoch": 12.048192771084338,
838
- "grad_norm": 1.7163885831832886,
839
  "learning_rate": 3.93574297188755e-06,
840
- "loss": 0.0261,
841
  "step": 10000
842
  },
843
  {
844
  "epoch": 12.168674698795181,
845
- "grad_norm": 1.0416496992111206,
846
  "learning_rate": 3.7751004016064258e-06,
847
- "loss": 0.0215,
848
  "step": 10100
849
  },
850
  {
851
  "epoch": 12.289156626506024,
852
- "grad_norm": 1.3379343748092651,
853
  "learning_rate": 3.6144578313253016e-06,
854
- "loss": 0.0223,
855
  "step": 10200
856
  },
857
  {
858
  "epoch": 12.409638554216867,
859
- "grad_norm": 0.658170759677887,
860
  "learning_rate": 3.453815261044177e-06,
861
- "loss": 0.0192,
862
  "step": 10300
863
  },
864
  {
865
  "epoch": 12.53012048192771,
866
- "grad_norm": 0.9047495722770691,
867
  "learning_rate": 3.2931726907630525e-06,
868
- "loss": 0.0237,
869
  "step": 10400
870
  },
871
  {
872
  "epoch": 12.650602409638553,
873
- "grad_norm": 1.0494842529296875,
874
  "learning_rate": 3.132530120481928e-06,
875
- "loss": 0.0193,
876
  "step": 10500
877
  },
878
  {
879
  "epoch": 12.771084337349398,
880
- "grad_norm": 0.32704225182533264,
881
  "learning_rate": 2.9718875502008034e-06,
882
- "loss": 0.0208,
883
  "step": 10600
884
  },
885
  {
886
  "epoch": 12.891566265060241,
887
- "grad_norm": 0.49072301387786865,
888
  "learning_rate": 2.811244979919679e-06,
889
- "loss": 0.0202,
890
  "step": 10700
891
  },
892
  {
893
  "epoch": 13.0,
894
- "eval_accuracy": 0.38095238095238093,
895
- "eval_f1": 0.6474642162926313,
896
- "eval_loss": 0.3947090208530426,
897
- "eval_roc_auc": 0.7937935574323361,
898
- "eval_runtime": 21.0756,
899
- "eval_samples_per_second": 78.716,
900
- "eval_steps_per_second": 9.869,
901
  "step": 10790
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
902
  }
903
  ],
904
  "logging_steps": 100,
@@ -913,12 +1054,12 @@
913
  "should_evaluate": false,
914
  "should_log": false,
915
  "should_save": true,
916
- "should_training_stop": false
917
  },
918
  "attributes": {}
919
  }
920
  },
921
- "total_flos": 8035320180521232.0,
922
  "train_batch_size": 8,
923
  "trial_name": null,
924
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.6742756804214223,
3
+ "best_model_checkpoint": "drive/MyDrive/NLP_HRAF//Models/HRAF_MultiLabel_SubClasses_Kfolds/Model_5_Roberta/Learning_Rate_2e-05_Weight_Decay_0.01_fold_1/checkpoint-12450",
4
+ "epoch": 15.0,
5
  "eval_steps": 500,
6
+ "global_step": 12450,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.12048192771084337,
13
+ "grad_norm": 0.9642070531845093,
14
  "learning_rate": 1.9839357429718877e-05,
15
+ "loss": 0.4237,
16
  "step": 100
17
  },
18
  {
19
  "epoch": 0.24096385542168675,
20
+ "grad_norm": 0.7253573536872864,
21
  "learning_rate": 1.967871485943775e-05,
22
+ "loss": 0.3297,
23
  "step": 200
24
  },
25
  {
26
  "epoch": 0.3614457831325301,
27
+ "grad_norm": 0.9410437941551208,
28
  "learning_rate": 1.951807228915663e-05,
29
+ "loss": 0.3214,
30
  "step": 300
31
  },
32
  {
33
  "epoch": 0.4819277108433735,
34
+ "grad_norm": 1.9593065977096558,
35
  "learning_rate": 1.9357429718875505e-05,
36
+ "loss": 0.304,
37
  "step": 400
38
  },
39
  {
40
  "epoch": 0.6024096385542169,
41
+ "grad_norm": 1.892052412033081,
42
  "learning_rate": 1.9196787148594377e-05,
43
+ "loss": 0.2951,
44
  "step": 500
45
  },
46
  {
47
  "epoch": 0.7228915662650602,
48
+ "grad_norm": 1.4741196632385254,
49
  "learning_rate": 1.9036144578313255e-05,
50
+ "loss": 0.2843,
51
  "step": 600
52
  },
53
  {
54
  "epoch": 0.8433734939759037,
55
+ "grad_norm": 1.5908536911010742,
56
  "learning_rate": 1.887550200803213e-05,
57
+ "loss": 0.278,
58
  "step": 700
59
  },
60
  {
61
  "epoch": 0.963855421686747,
62
+ "grad_norm": 2.7788383960723877,
63
  "learning_rate": 1.8714859437751005e-05,
64
+ "loss": 0.2809,
65
  "step": 800
66
  },
67
  {
68
  "epoch": 1.0,
69
+ "eval_accuracy": 0.37130801687763715,
70
+ "eval_f1": 0.5538327145664992,
71
+ "eval_loss": 0.25523483753204346,
72
+ "eval_roc_auc": 0.7095420424046738,
73
+ "eval_runtime": 36.0551,
74
+ "eval_samples_per_second": 46.013,
75
+ "eval_steps_per_second": 5.769,
76
  "step": 830
77
  },
78
  {
79
  "epoch": 1.0843373493975903,
80
+ "grad_norm": 2.297464370727539,
81
  "learning_rate": 1.855421686746988e-05,
82
+ "loss": 0.2649,
83
  "step": 900
84
  },
85
  {
86
  "epoch": 1.2048192771084336,
87
+ "grad_norm": 2.1780202388763428,
88
  "learning_rate": 1.8393574297188755e-05,
89
+ "loss": 0.2526,
90
  "step": 1000
91
  },
92
  {
93
  "epoch": 1.3253012048192772,
94
+ "grad_norm": 2.1221420764923096,
95
  "learning_rate": 1.8232931726907634e-05,
96
+ "loss": 0.2516,
97
  "step": 1100
98
  },
99
  {
100
  "epoch": 1.4457831325301205,
101
+ "grad_norm": 2.1369152069091797,
102
  "learning_rate": 1.807228915662651e-05,
103
+ "loss": 0.2512,
104
  "step": 1200
105
  },
106
  {
107
  "epoch": 1.5662650602409638,
108
+ "grad_norm": 1.9198232889175415,
109
  "learning_rate": 1.7911646586345384e-05,
110
+ "loss": 0.2635,
111
  "step": 1300
112
  },
113
  {
114
  "epoch": 1.6867469879518073,
115
+ "grad_norm": 1.4999780654907227,
116
  "learning_rate": 1.775100401606426e-05,
117
+ "loss": 0.2429,
118
  "step": 1400
119
  },
120
  {
121
  "epoch": 1.8072289156626506,
122
+ "grad_norm": 2.2243740558624268,
123
  "learning_rate": 1.7590361445783134e-05,
124
+ "loss": 0.2341,
125
  "step": 1500
126
  },
127
  {
128
  "epoch": 1.927710843373494,
129
+ "grad_norm": 1.936194896697998,
130
  "learning_rate": 1.742971887550201e-05,
131
+ "loss": 0.2358,
132
  "step": 1600
133
  },
134
  {
135
  "epoch": 2.0,
136
+ "eval_accuracy": 0.38396624472573837,
137
+ "eval_f1": 0.6305942773294204,
138
+ "eval_loss": 0.24461282789707184,
139
+ "eval_roc_auc": 0.7766385397275983,
140
+ "eval_runtime": 36.1223,
141
+ "eval_samples_per_second": 45.927,
142
+ "eval_steps_per_second": 5.758,
143
  "step": 1660
144
  },
145
  {
146
  "epoch": 2.0481927710843375,
147
+ "grad_norm": 1.8394912481307983,
148
  "learning_rate": 1.7269076305220884e-05,
149
+ "loss": 0.2262,
150
  "step": 1700
151
  },
152
  {
153
  "epoch": 2.1686746987951806,
154
+ "grad_norm": 1.9453928470611572,
155
  "learning_rate": 1.710843373493976e-05,
156
+ "loss": 0.2099,
157
  "step": 1800
158
  },
159
  {
160
  "epoch": 2.289156626506024,
161
+ "grad_norm": 3.4696056842803955,
162
  "learning_rate": 1.6947791164658637e-05,
163
+ "loss": 0.2055,
164
  "step": 1900
165
  },
166
  {
167
  "epoch": 2.4096385542168672,
168
+ "grad_norm": 3.629636764526367,
169
  "learning_rate": 1.6787148594377512e-05,
170
+ "loss": 0.2048,
171
  "step": 2000
172
  },
173
  {
174
  "epoch": 2.5301204819277108,
175
+ "grad_norm": 3.143533229827881,
176
  "learning_rate": 1.6626506024096387e-05,
177
+ "loss": 0.2149,
178
  "step": 2100
179
  },
180
  {
181
  "epoch": 2.6506024096385543,
182
+ "grad_norm": 4.122682094573975,
183
  "learning_rate": 1.6465863453815262e-05,
184
+ "loss": 0.2027,
185
  "step": 2200
186
  },
187
  {
188
  "epoch": 2.7710843373493974,
189
+ "grad_norm": 2.8808822631835938,
190
  "learning_rate": 1.6305220883534137e-05,
191
+ "loss": 0.2153,
192
  "step": 2300
193
  },
194
  {
195
  "epoch": 2.891566265060241,
196
+ "grad_norm": 2.86367130279541,
197
  "learning_rate": 1.6144578313253015e-05,
198
+ "loss": 0.2038,
199
  "step": 2400
200
  },
201
  {
202
  "epoch": 3.0,
203
+ "eval_accuracy": 0.3990355635925256,
204
+ "eval_f1": 0.648381788261108,
205
+ "eval_loss": 0.23571637272834778,
206
+ "eval_roc_auc": 0.7872360545001245,
207
+ "eval_runtime": 36.0229,
208
+ "eval_samples_per_second": 46.054,
209
+ "eval_steps_per_second": 5.774,
210
  "step": 2490
211
  },
212
  {
213
  "epoch": 3.0120481927710845,
214
+ "grad_norm": 2.759669780731201,
215
  "learning_rate": 1.5983935742971887e-05,
216
+ "loss": 0.2018,
217
  "step": 2500
218
  },
219
  {
220
  "epoch": 3.1325301204819276,
221
+ "grad_norm": 6.346066951751709,
222
  "learning_rate": 1.5823293172690762e-05,
223
+ "loss": 0.179,
224
  "step": 2600
225
  },
226
  {
227
  "epoch": 3.253012048192771,
228
+ "grad_norm": 2.918868064880371,
229
  "learning_rate": 1.566265060240964e-05,
230
  "loss": 0.1788,
231
  "step": 2700
232
  },
233
  {
234
  "epoch": 3.3734939759036147,
235
+ "grad_norm": 3.852792263031006,
236
  "learning_rate": 1.5502008032128516e-05,
237
+ "loss": 0.1727,
238
  "step": 2800
239
  },
240
  {
241
  "epoch": 3.4939759036144578,
242
+ "grad_norm": 6.0479655265808105,
243
  "learning_rate": 1.534136546184739e-05,
244
+ "loss": 0.1791,
245
  "step": 2900
246
  },
247
  {
248
  "epoch": 3.6144578313253013,
249
+ "grad_norm": 3.1924941539764404,
250
  "learning_rate": 1.5180722891566266e-05,
251
+ "loss": 0.1739,
252
  "step": 3000
253
  },
254
  {
255
  "epoch": 3.734939759036145,
256
+ "grad_norm": 2.770388603210449,
257
  "learning_rate": 1.5020080321285142e-05,
258
+ "loss": 0.1723,
259
  "step": 3100
260
  },
261
  {
262
  "epoch": 3.855421686746988,
263
+ "grad_norm": 3.522843360900879,
264
  "learning_rate": 1.4859437751004017e-05,
265
+ "loss": 0.1664,
266
  "step": 3200
267
  },
268
  {
269
  "epoch": 3.9759036144578315,
270
+ "grad_norm": 2.9013149738311768,
271
  "learning_rate": 1.4698795180722894e-05,
272
+ "loss": 0.1591,
273
  "step": 3300
274
  },
275
  {
276
  "epoch": 4.0,
277
+ "eval_accuracy": 0.4213381555153707,
278
+ "eval_f1": 0.6570518823749766,
279
+ "eval_loss": 0.2389475554227829,
280
+ "eval_roc_auc": 0.7871399448360252,
281
+ "eval_runtime": 35.9942,
282
+ "eval_samples_per_second": 46.091,
283
+ "eval_steps_per_second": 5.779,
284
  "step": 3320
285
  },
286
  {
287
  "epoch": 4.096385542168675,
288
+ "grad_norm": 2.389446258544922,
289
  "learning_rate": 1.4538152610441769e-05,
290
+ "loss": 0.1488,
291
  "step": 3400
292
  },
293
  {
294
  "epoch": 4.216867469879518,
295
+ "grad_norm": 2.0893971920013428,
296
  "learning_rate": 1.4377510040160642e-05,
297
+ "loss": 0.1431,
298
  "step": 3500
299
  },
300
  {
301
  "epoch": 4.337349397590361,
302
+ "grad_norm": 4.6121745109558105,
303
  "learning_rate": 1.4216867469879519e-05,
304
+ "loss": 0.1436,
305
  "step": 3600
306
  },
307
  {
308
  "epoch": 4.457831325301205,
309
+ "grad_norm": 3.5218753814697266,
310
  "learning_rate": 1.4056224899598394e-05,
311
+ "loss": 0.152,
312
  "step": 3700
313
  },
314
  {
315
  "epoch": 4.578313253012048,
316
+ "grad_norm": 1.686698317527771,
317
  "learning_rate": 1.3895582329317269e-05,
318
+ "loss": 0.1463,
319
  "step": 3800
320
  },
321
  {
322
  "epoch": 4.698795180722891,
323
+ "grad_norm": 3.079829692840576,
324
  "learning_rate": 1.3734939759036146e-05,
325
+ "loss": 0.1434,
326
  "step": 3900
327
  },
328
  {
329
  "epoch": 4.8192771084337345,
330
+ "grad_norm": 3.5164265632629395,
331
  "learning_rate": 1.357429718875502e-05,
332
+ "loss": 0.1352,
333
  "step": 4000
334
  },
335
  {
336
  "epoch": 4.9397590361445785,
337
+ "grad_norm": 4.049747467041016,
338
  "learning_rate": 1.3413654618473897e-05,
339
+ "loss": 0.1427,
340
  "step": 4100
341
  },
342
  {
343
  "epoch": 5.0,
344
+ "eval_accuracy": 0.4207353827606992,
345
+ "eval_f1": 0.6415022761760243,
346
+ "eval_loss": 0.2513498365879059,
347
+ "eval_roc_auc": 0.7761577437017827,
348
+ "eval_runtime": 36.1323,
349
+ "eval_samples_per_second": 45.915,
350
+ "eval_steps_per_second": 5.757,
351
  "step": 4150
352
  },
353
  {
354
  "epoch": 5.0602409638554215,
355
+ "grad_norm": 3.9669837951660156,
356
  "learning_rate": 1.3253012048192772e-05,
357
+ "loss": 0.1295,
358
  "step": 4200
359
  },
360
  {
361
  "epoch": 5.180722891566265,
362
+ "grad_norm": 2.201209783554077,
363
  "learning_rate": 1.309236947791165e-05,
364
+ "loss": 0.1154,
365
  "step": 4300
366
  },
367
  {
368
  "epoch": 5.301204819277109,
369
+ "grad_norm": 3.613372802734375,
370
  "learning_rate": 1.2931726907630524e-05,
371
+ "loss": 0.1133,
372
  "step": 4400
373
  },
374
  {
375
  "epoch": 5.421686746987952,
376
+ "grad_norm": 4.806926727294922,
377
  "learning_rate": 1.2771084337349398e-05,
378
+ "loss": 0.1173,
379
  "step": 4500
380
  },
381
  {
382
  "epoch": 5.542168674698795,
383
+ "grad_norm": 3.5737357139587402,
384
  "learning_rate": 1.2610441767068273e-05,
385
+ "loss": 0.1202,
386
  "step": 4600
387
  },
388
  {
389
  "epoch": 5.662650602409639,
390
+ "grad_norm": 5.031768798828125,
391
  "learning_rate": 1.244979919678715e-05,
392
+ "loss": 0.1147,
393
  "step": 4700
394
  },
395
  {
396
  "epoch": 5.783132530120482,
397
+ "grad_norm": 2.068950891494751,
398
  "learning_rate": 1.2289156626506024e-05,
399
+ "loss": 0.1168,
400
  "step": 4800
401
  },
402
  {
403
  "epoch": 5.903614457831325,
404
+ "grad_norm": 2.5072097778320312,
405
  "learning_rate": 1.2128514056224901e-05,
406
+ "loss": 0.1226,
407
  "step": 4900
408
  },
409
  {
410
  "epoch": 6.0,
411
+ "eval_accuracy": 0.41350210970464135,
412
+ "eval_f1": 0.6666666666666665,
413
+ "eval_loss": 0.2606567144393921,
414
+ "eval_roc_auc": 0.8057414996823179,
415
+ "eval_runtime": 36.1583,
416
+ "eval_samples_per_second": 45.882,
417
+ "eval_steps_per_second": 5.752,
418
  "step": 4980
419
  },
420
  {
421
  "epoch": 6.024096385542169,
422
+ "grad_norm": 2.0976059436798096,
423
  "learning_rate": 1.1967871485943776e-05,
424
+ "loss": 0.1077,
425
  "step": 5000
426
  },
427
  {
428
  "epoch": 6.144578313253012,
429
+ "grad_norm": 1.4871183633804321,
430
  "learning_rate": 1.1807228915662651e-05,
431
+ "loss": 0.0934,
432
  "step": 5100
433
  },
434
  {
435
  "epoch": 6.265060240963855,
436
+ "grad_norm": 1.9144952297210693,
437
  "learning_rate": 1.1646586345381528e-05,
438
+ "loss": 0.0938,
439
  "step": 5200
440
  },
441
  {
442
  "epoch": 6.385542168674699,
443
+ "grad_norm": 2.3612289428710938,
444
  "learning_rate": 1.1485943775100403e-05,
445
+ "loss": 0.0955,
446
  "step": 5300
447
  },
448
  {
449
  "epoch": 6.506024096385542,
450
+ "grad_norm": 5.222254276275635,
451
  "learning_rate": 1.132530120481928e-05,
452
+ "loss": 0.097,
453
  "step": 5400
454
  },
455
  {
456
  "epoch": 6.626506024096385,
457
+ "grad_norm": 5.138168811798096,
458
  "learning_rate": 1.1164658634538153e-05,
459
+ "loss": 0.0993,
460
  "step": 5500
461
  },
462
  {
463
  "epoch": 6.746987951807229,
464
+ "grad_norm": 5.146157264709473,
465
  "learning_rate": 1.1004016064257028e-05,
466
+ "loss": 0.0954,
467
  "step": 5600
468
  },
469
  {
470
  "epoch": 6.867469879518072,
471
+ "grad_norm": 4.897678375244141,
472
  "learning_rate": 1.0843373493975904e-05,
473
+ "loss": 0.0985,
474
  "step": 5700
475
  },
476
  {
477
  "epoch": 6.9879518072289155,
478
+ "grad_norm": 3.3976993560791016,
479
  "learning_rate": 1.068273092369478e-05,
480
+ "loss": 0.0857,
481
  "step": 5800
482
  },
483
  {
484
  "epoch": 7.0,
485
+ "eval_accuracy": 0.4092827004219409,
486
+ "eval_f1": 0.6649736985307456,
487
+ "eval_loss": 0.2713634967803955,
488
+ "eval_roc_auc": 0.7982755374895809,
489
+ "eval_runtime": 36.3274,
490
+ "eval_samples_per_second": 45.668,
491
+ "eval_steps_per_second": 5.726,
492
  "step": 5810
493
  },
494
  {
495
  "epoch": 7.108433734939759,
496
+ "grad_norm": 6.454195499420166,
497
  "learning_rate": 1.0522088353413654e-05,
498
+ "loss": 0.0817,
499
  "step": 5900
500
  },
501
  {
502
  "epoch": 7.228915662650603,
503
+ "grad_norm": 1.3181122541427612,
504
  "learning_rate": 1.0361445783132531e-05,
505
+ "loss": 0.0816,
506
  "step": 6000
507
  },
508
  {
509
  "epoch": 7.349397590361446,
510
+ "grad_norm": 4.141767501831055,
511
  "learning_rate": 1.0200803212851406e-05,
512
+ "loss": 0.0794,
513
  "step": 6100
514
  },
515
  {
516
  "epoch": 7.469879518072289,
517
+ "grad_norm": 1.5225648880004883,
518
  "learning_rate": 1.0040160642570283e-05,
519
+ "loss": 0.0748,
520
  "step": 6200
521
  },
522
  {
523
  "epoch": 7.590361445783133,
524
+ "grad_norm": 4.940995693206787,
525
  "learning_rate": 9.879518072289156e-06,
526
+ "loss": 0.0733,
527
  "step": 6300
528
  },
529
  {
530
  "epoch": 7.710843373493976,
531
+ "grad_norm": 3.411694288253784,
532
  "learning_rate": 9.718875502008033e-06,
533
+ "loss": 0.0779,
534
  "step": 6400
535
  },
536
  {
537
  "epoch": 7.831325301204819,
538
+ "grad_norm": 4.794209957122803,
539
  "learning_rate": 9.558232931726908e-06,
540
+ "loss": 0.0778,
541
  "step": 6500
542
  },
543
  {
544
  "epoch": 7.951807228915663,
545
+ "grad_norm": 2.7523696422576904,
546
  "learning_rate": 9.397590361445785e-06,
547
+ "loss": 0.0714,
548
  "step": 6600
549
  },
550
  {
551
  "epoch": 8.0,
552
+ "eval_accuracy": 0.4050632911392405,
553
+ "eval_f1": 0.6728395061728395,
554
+ "eval_loss": 0.2908540666103363,
555
+ "eval_roc_auc": 0.8154374909432742,
556
+ "eval_runtime": 36.2563,
557
+ "eval_samples_per_second": 45.758,
558
+ "eval_steps_per_second": 5.737,
559
  "step": 6640
560
  },
561
  {
562
  "epoch": 8.072289156626505,
563
+ "grad_norm": 2.963338851928711,
564
  "learning_rate": 9.23694779116466e-06,
565
+ "loss": 0.066,
566
  "step": 6700
567
  },
568
  {
569
  "epoch": 8.19277108433735,
570
+ "grad_norm": 1.1161987781524658,
571
  "learning_rate": 9.076305220883535e-06,
572
+ "loss": 0.0665,
573
  "step": 6800
574
  },
575
  {
576
  "epoch": 8.313253012048193,
577
+ "grad_norm": 1.9074004888534546,
578
  "learning_rate": 8.91566265060241e-06,
579
+ "loss": 0.0603,
580
  "step": 6900
581
  },
582
  {
583
  "epoch": 8.433734939759036,
584
+ "grad_norm": 4.402090549468994,
585
  "learning_rate": 8.755020080321286e-06,
586
+ "loss": 0.0558,
587
  "step": 7000
588
  },
589
  {
590
  "epoch": 8.55421686746988,
591
+ "grad_norm": 8.068613052368164,
592
  "learning_rate": 8.594377510040161e-06,
593
+ "loss": 0.0596,
594
  "step": 7100
595
  },
596
  {
597
  "epoch": 8.674698795180722,
598
+ "grad_norm": 0.7083752751350403,
599
  "learning_rate": 8.433734939759038e-06,
600
+ "loss": 0.0615,
601
  "step": 7200
602
  },
603
  {
604
  "epoch": 8.795180722891565,
605
+ "grad_norm": 2.8427162170410156,
606
  "learning_rate": 8.273092369477911e-06,
607
+ "loss": 0.056,
608
  "step": 7300
609
  },
610
  {
611
  "epoch": 8.91566265060241,
612
+ "grad_norm": 4.300654888153076,
613
  "learning_rate": 8.112449799196788e-06,
614
+ "loss": 0.073,
615
  "step": 7400
616
  },
617
  {
618
  "epoch": 9.0,
619
+ "eval_accuracy": 0.41832429174201324,
620
+ "eval_f1": 0.6730903994393833,
621
+ "eval_loss": 0.29414018988609314,
622
+ "eval_roc_auc": 0.810645259972926,
623
+ "eval_runtime": 36.2468,
624
+ "eval_samples_per_second": 45.77,
625
+ "eval_steps_per_second": 5.738,
626
  "step": 7470
627
  },
628
  {
629
  "epoch": 9.036144578313253,
630
+ "grad_norm": 3.129971981048584,
631
  "learning_rate": 7.951807228915663e-06,
632
+ "loss": 0.0563,
633
  "step": 7500
634
  },
635
  {
636
  "epoch": 9.156626506024097,
637
+ "grad_norm": 2.5316765308380127,
638
  "learning_rate": 7.79116465863454e-06,
639
+ "loss": 0.048,
640
  "step": 7600
641
  },
642
  {
643
  "epoch": 9.27710843373494,
644
+ "grad_norm": 4.083515167236328,
645
  "learning_rate": 7.630522088353415e-06,
646
+ "loss": 0.0502,
647
  "step": 7700
648
  },
649
  {
650
  "epoch": 9.397590361445783,
651
+ "grad_norm": 1.0832017660140991,
652
  "learning_rate": 7.469879518072289e-06,
653
+ "loss": 0.0521,
654
  "step": 7800
655
  },
656
  {
657
  "epoch": 9.518072289156626,
658
+ "grad_norm": 4.1632304191589355,
659
  "learning_rate": 7.309236947791165e-06,
660
+ "loss": 0.0508,
661
  "step": 7900
662
  },
663
  {
664
  "epoch": 9.638554216867469,
665
+ "grad_norm": 2.0603678226470947,
666
  "learning_rate": 7.148594377510041e-06,
667
+ "loss": 0.053,
668
  "step": 8000
669
  },
670
  {
671
  "epoch": 9.759036144578314,
672
+ "grad_norm": 2.3865954875946045,
673
  "learning_rate": 6.987951807228917e-06,
674
+ "loss": 0.0471,
675
  "step": 8100
676
  },
677
  {
678
  "epoch": 9.879518072289157,
679
+ "grad_norm": 3.4538087844848633,
680
  "learning_rate": 6.8273092369477925e-06,
681
+ "loss": 0.0459,
682
  "step": 8200
683
  },
684
  {
685
  "epoch": 10.0,
686
+ "grad_norm": 1.0237865447998047,
687
  "learning_rate": 6.666666666666667e-06,
688
+ "loss": 0.052,
689
  "step": 8300
690
  },
691
  {
692
  "epoch": 10.0,
693
+ "eval_accuracy": 0.4141048824593128,
694
+ "eval_f1": 0.6719829877724616,
695
+ "eval_loss": 0.30549874901771545,
696
+ "eval_roc_auc": 0.807412395917321,
697
+ "eval_runtime": 36.14,
698
+ "eval_samples_per_second": 45.905,
699
+ "eval_steps_per_second": 5.755,
700
  "step": 8300
701
  },
702
  {
703
  "epoch": 10.120481927710843,
704
+ "grad_norm": 3.1426589488983154,
705
  "learning_rate": 6.5060240963855425e-06,
706
+ "loss": 0.0382,
707
  "step": 8400
708
  },
709
  {
710
  "epoch": 10.240963855421686,
711
+ "grad_norm": 5.971590995788574,
712
  "learning_rate": 6.345381526104418e-06,
713
+ "loss": 0.0403,
714
  "step": 8500
715
  },
716
  {
717
  "epoch": 10.36144578313253,
718
+ "grad_norm": 2.7165796756744385,
719
  "learning_rate": 6.184738955823294e-06,
720
+ "loss": 0.0417,
721
  "step": 8600
722
  },
723
  {
724
  "epoch": 10.481927710843374,
725
+ "grad_norm": 6.249508857727051,
726
  "learning_rate": 6.02409638554217e-06,
727
+ "loss": 0.0409,
728
  "step": 8700
729
  },
730
  {
731
  "epoch": 10.602409638554217,
732
+ "grad_norm": 1.5167735815048218,
733
  "learning_rate": 5.863453815261044e-06,
734
+ "loss": 0.0413,
735
  "step": 8800
736
  },
737
  {
738
  "epoch": 10.72289156626506,
739
+ "grad_norm": 1.3362675905227661,
740
  "learning_rate": 5.70281124497992e-06,
741
+ "loss": 0.0381,
742
  "step": 8900
743
  },
744
  {
745
  "epoch": 10.843373493975903,
746
+ "grad_norm": 3.719500780105591,
747
  "learning_rate": 5.542168674698796e-06,
748
+ "loss": 0.0412,
749
  "step": 9000
750
  },
751
  {
752
  "epoch": 10.963855421686747,
753
+ "grad_norm": 4.197484493255615,
754
  "learning_rate": 5.381526104417672e-06,
755
+ "loss": 0.0429,
756
  "step": 9100
757
  },
758
  {
759
  "epoch": 11.0,
760
+ "eval_accuracy": 0.4165159734779988,
761
+ "eval_f1": 0.6682226211849193,
762
+ "eval_loss": 0.31563234329223633,
763
+ "eval_roc_auc": 0.8023591454661876,
764
+ "eval_runtime": 36.1507,
765
+ "eval_samples_per_second": 45.891,
766
+ "eval_steps_per_second": 5.754,
767
  "step": 9130
768
  },
769
  {
770
  "epoch": 11.08433734939759,
771
+ "grad_norm": 1.9439764022827148,
772
  "learning_rate": 5.220883534136547e-06,
773
+ "loss": 0.0345,
774
  "step": 9200
775
  },
776
  {
777
  "epoch": 11.204819277108435,
778
+ "grad_norm": 5.573112487792969,
779
  "learning_rate": 5.060240963855422e-06,
780
+ "loss": 0.0334,
781
  "step": 9300
782
  },
783
  {
784
  "epoch": 11.325301204819278,
785
+ "grad_norm": 3.091160535812378,
786
  "learning_rate": 4.899598393574298e-06,
787
+ "loss": 0.0331,
788
  "step": 9400
789
  },
790
  {
791
  "epoch": 11.44578313253012,
792
+ "grad_norm": 4.914794445037842,
793
  "learning_rate": 4.7389558232931736e-06,
794
+ "loss": 0.0345,
795
  "step": 9500
796
  },
797
  {
798
  "epoch": 11.566265060240964,
799
+ "grad_norm": 1.9498165845870972,
800
  "learning_rate": 4.578313253012049e-06,
801
+ "loss": 0.0345,
802
  "step": 9600
803
  },
804
  {
805
  "epoch": 11.686746987951807,
806
+ "grad_norm": 2.1993534564971924,
807
  "learning_rate": 4.4176706827309244e-06,
808
+ "loss": 0.0332,
809
  "step": 9700
810
  },
811
  {
812
  "epoch": 11.80722891566265,
813
+ "grad_norm": 0.7553381323814392,
814
  "learning_rate": 4.2570281124497995e-06,
815
+ "loss": 0.0323,
816
  "step": 9800
817
  },
818
  {
819
  "epoch": 11.927710843373493,
820
+ "grad_norm": 1.3014346361160278,
821
  "learning_rate": 4.096385542168675e-06,
822
+ "loss": 0.0323,
823
  "step": 9900
824
  },
825
  {
826
  "epoch": 12.0,
827
+ "eval_accuracy": 0.41350210970464135,
828
+ "eval_f1": 0.6725321133204293,
829
+ "eval_loss": 0.3264513611793518,
830
+ "eval_roc_auc": 0.8093228231966124,
831
+ "eval_runtime": 36.2329,
832
+ "eval_samples_per_second": 45.787,
833
+ "eval_steps_per_second": 5.741,
834
  "step": 9960
835
  },
836
  {
837
  "epoch": 12.048192771084338,
838
+ "grad_norm": 2.3635246753692627,
839
  "learning_rate": 3.93574297188755e-06,
840
+ "loss": 0.0331,
841
  "step": 10000
842
  },
843
  {
844
  "epoch": 12.168674698795181,
845
+ "grad_norm": 3.8707635402679443,
846
  "learning_rate": 3.7751004016064258e-06,
847
+ "loss": 0.0277,
848
  "step": 10100
849
  },
850
  {
851
  "epoch": 12.289156626506024,
852
+ "grad_norm": 3.9427218437194824,
853
  "learning_rate": 3.6144578313253016e-06,
854
+ "loss": 0.0296,
855
  "step": 10200
856
  },
857
  {
858
  "epoch": 12.409638554216867,
859
+ "grad_norm": 0.7694936990737915,
860
  "learning_rate": 3.453815261044177e-06,
861
+ "loss": 0.0268,
862
  "step": 10300
863
  },
864
  {
865
  "epoch": 12.53012048192771,
866
+ "grad_norm": 1.952202558517456,
867
  "learning_rate": 3.2931726907630525e-06,
868
+ "loss": 0.0276,
869
  "step": 10400
870
  },
871
  {
872
  "epoch": 12.650602409638553,
873
+ "grad_norm": 1.1884231567382812,
874
  "learning_rate": 3.132530120481928e-06,
875
+ "loss": 0.0262,
876
  "step": 10500
877
  },
878
  {
879
  "epoch": 12.771084337349398,
880
+ "grad_norm": 0.3486195206642151,
881
  "learning_rate": 2.9718875502008034e-06,
882
+ "loss": 0.03,
883
  "step": 10600
884
  },
885
  {
886
  "epoch": 12.891566265060241,
887
+ "grad_norm": 0.7074311971664429,
888
  "learning_rate": 2.811244979919679e-06,
889
+ "loss": 0.0286,
890
  "step": 10700
891
  },
892
  {
893
  "epoch": 13.0,
894
+ "eval_accuracy": 0.4110910186859554,
895
+ "eval_f1": 0.6732949590092447,
896
+ "eval_loss": 0.3310515284538269,
897
+ "eval_roc_auc": 0.8117622251864293,
898
+ "eval_runtime": 36.1171,
899
+ "eval_samples_per_second": 45.934,
900
+ "eval_steps_per_second": 5.759,
901
  "step": 10790
902
+ },
903
+ {
904
+ "epoch": 13.012048192771084,
905
+ "grad_norm": 7.378662586212158,
906
+ "learning_rate": 2.6506024096385547e-06,
907
+ "loss": 0.0271,
908
+ "step": 10800
909
+ },
910
+ {
911
+ "epoch": 13.132530120481928,
912
+ "grad_norm": 2.406675100326538,
913
+ "learning_rate": 2.4899598393574297e-06,
914
+ "loss": 0.0251,
915
+ "step": 10900
916
+ },
917
+ {
918
+ "epoch": 13.25301204819277,
919
+ "grad_norm": 3.8638405799865723,
920
+ "learning_rate": 2.3293172690763055e-06,
921
+ "loss": 0.0258,
922
+ "step": 11000
923
+ },
924
+ {
925
+ "epoch": 13.373493975903614,
926
+ "grad_norm": 2.06321120262146,
927
+ "learning_rate": 2.168674698795181e-06,
928
+ "loss": 0.0246,
929
+ "step": 11100
930
+ },
931
+ {
932
+ "epoch": 13.493975903614459,
933
+ "grad_norm": 1.5517412424087524,
934
+ "learning_rate": 2.0080321285140564e-06,
935
+ "loss": 0.0249,
936
+ "step": 11200
937
+ },
938
+ {
939
+ "epoch": 13.614457831325302,
940
+ "grad_norm": 0.4058358669281006,
941
+ "learning_rate": 1.8473895582329318e-06,
942
+ "loss": 0.0234,
943
+ "step": 11300
944
+ },
945
+ {
946
+ "epoch": 13.734939759036145,
947
+ "grad_norm": 3.0318214893341064,
948
+ "learning_rate": 1.6867469879518073e-06,
949
+ "loss": 0.0258,
950
+ "step": 11400
951
+ },
952
+ {
953
+ "epoch": 13.855421686746988,
954
+ "grad_norm": 0.8424203991889954,
955
+ "learning_rate": 1.526104417670683e-06,
956
+ "loss": 0.0269,
957
+ "step": 11500
958
+ },
959
+ {
960
+ "epoch": 13.975903614457831,
961
+ "grad_norm": 3.9194679260253906,
962
+ "learning_rate": 1.3654618473895584e-06,
963
+ "loss": 0.0234,
964
+ "step": 11600
965
+ },
966
+ {
967
+ "epoch": 14.0,
968
+ "eval_accuracy": 0.40687160940325495,
969
+ "eval_f1": 0.6735155841894479,
970
+ "eval_loss": 0.33624783158302307,
971
+ "eval_roc_auc": 0.8122911998969546,
972
+ "eval_runtime": 36.1849,
973
+ "eval_samples_per_second": 45.848,
974
+ "eval_steps_per_second": 5.748,
975
+ "step": 11620
976
+ },
977
+ {
978
+ "epoch": 14.096385542168674,
979
+ "grad_norm": 1.4352937936782837,
980
+ "learning_rate": 1.2048192771084338e-06,
981
+ "loss": 0.0209,
982
+ "step": 11700
983
+ },
984
+ {
985
+ "epoch": 14.216867469879517,
986
+ "grad_norm": 0.7450918555259705,
987
+ "learning_rate": 1.0441767068273092e-06,
988
+ "loss": 0.0218,
989
+ "step": 11800
990
+ },
991
+ {
992
+ "epoch": 14.337349397590362,
993
+ "grad_norm": 1.0368732213974,
994
+ "learning_rate": 8.835341365461848e-07,
995
+ "loss": 0.0235,
996
+ "step": 11900
997
+ },
998
+ {
999
+ "epoch": 14.457831325301205,
1000
+ "grad_norm": 6.217952728271484,
1001
+ "learning_rate": 7.228915662650602e-07,
1002
+ "loss": 0.0228,
1003
+ "step": 12000
1004
+ },
1005
+ {
1006
+ "epoch": 14.578313253012048,
1007
+ "grad_norm": 3.589872360229492,
1008
+ "learning_rate": 5.622489959839358e-07,
1009
+ "loss": 0.0222,
1010
+ "step": 12100
1011
+ },
1012
+ {
1013
+ "epoch": 14.698795180722891,
1014
+ "grad_norm": 2.410654306411743,
1015
+ "learning_rate": 4.0160642570281125e-07,
1016
+ "loss": 0.0203,
1017
+ "step": 12200
1018
+ },
1019
+ {
1020
+ "epoch": 14.819277108433734,
1021
+ "grad_norm": 0.5593228340148926,
1022
+ "learning_rate": 2.409638554216868e-07,
1023
+ "loss": 0.0246,
1024
+ "step": 12300
1025
+ },
1026
+ {
1027
+ "epoch": 14.939759036144578,
1028
+ "grad_norm": 1.7539204359054565,
1029
+ "learning_rate": 8.032128514056224e-08,
1030
+ "loss": 0.0237,
1031
+ "step": 12400
1032
+ },
1033
+ {
1034
+ "epoch": 15.0,
1035
+ "eval_accuracy": 0.4153104279686558,
1036
+ "eval_f1": 0.6742756804214223,
1037
+ "eval_loss": 0.3354536294937134,
1038
+ "eval_roc_auc": 0.810820573413045,
1039
+ "eval_runtime": 36.2022,
1040
+ "eval_samples_per_second": 45.826,
1041
+ "eval_steps_per_second": 5.746,
1042
+ "step": 12450
1043
  }
1044
  ],
1045
  "logging_steps": 100,
 
1054
  "should_evaluate": false,
1055
  "should_log": false,
1056
  "should_save": true,
1057
+ "should_training_stop": true
1058
  },
1059
  "attributes": {}
1060
  }
1061
  },
1062
+ "total_flos": 1.857073408472736e+16,
1063
  "train_batch_size": 8,
1064
  "trial_name": null,
1065
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90c00f8b7311babb09b5cfce1bf4c2db61f426d28c044e987c559cbb8c1af657
3
  size 5304
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cfc045024ac566aada5a538769e67be335647460017ad6775bae4db97d36dc8
3
  size 5304
vocab.json ADDED
The diff for this file is too large to render. See raw diff