batoulnn commited on
Commit
4e6a79b
·
verified ·
1 Parent(s): 679d316

Upload 12 files

Browse files
config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "classifier_dropout": null,
7
+ "hidden_act": "gelu",
8
+ "hidden_dropout_prob": 0.1,
9
+ "hidden_size": 768,
10
+ "id2label": {
11
+ "0": "LABEL_0",
12
+ "1": "LABEL_1",
13
+ "2": "LABEL_2",
14
+ "3": "LABEL_3",
15
+ "4": "LABEL_4",
16
+ "5": "LABEL_5",
17
+ "6": "LABEL_6",
18
+ "7": "LABEL_7",
19
+ "8": "LABEL_8",
20
+ "9": "LABEL_9",
21
+ "10": "LABEL_10",
22
+ "11": "LABEL_11",
23
+ "12": "LABEL_12",
24
+ "13": "LABEL_13",
25
+ "14": "LABEL_14",
26
+ "15": "LABEL_15",
27
+ "16": "LABEL_16",
28
+ "17": "LABEL_17",
29
+ "18": "LABEL_18",
30
+ "19": "LABEL_19",
31
+ "20": "LABEL_20"
32
+ },
33
+ "initializer_range": 0.02,
34
+ "intermediate_size": 3072,
35
+ "label2id": {
36
+ "LABEL_0": 0,
37
+ "LABEL_1": 1,
38
+ "LABEL_10": 10,
39
+ "LABEL_11": 11,
40
+ "LABEL_12": 12,
41
+ "LABEL_13": 13,
42
+ "LABEL_14": 14,
43
+ "LABEL_15": 15,
44
+ "LABEL_16": 16,
45
+ "LABEL_17": 17,
46
+ "LABEL_18": 18,
47
+ "LABEL_19": 19,
48
+ "LABEL_2": 2,
49
+ "LABEL_20": 20,
50
+ "LABEL_3": 3,
51
+ "LABEL_4": 4,
52
+ "LABEL_5": 5,
53
+ "LABEL_6": 6,
54
+ "LABEL_7": 7,
55
+ "LABEL_8": 8,
56
+ "LABEL_9": 9
57
+ },
58
+ "layer_norm_eps": 1e-12,
59
+ "max_position_embeddings": 512,
60
+ "model_type": "bert",
61
+ "num_attention_heads": 12,
62
+ "num_hidden_layers": 12,
63
+ "pad_token_id": 0,
64
+ "position_embedding_type": "absolute",
65
+ "problem_type": "single_label_classification",
66
+ "torch_dtype": "float32",
67
+ "transformers_version": "4.52.4",
68
+ "type_vocab_size": 2,
69
+ "use_cache": true,
70
+ "vocab_size": 64000
71
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1c77f56b46a277da3ae1a217d223891bc50fc0fce2f34fdcb57b03af4e96eed
3
+ size 540861516
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63b228a3362fe4f6561ddfd3a414db9a803a07917c60a798bfa42d0c7679eefb
3
+ size 1081844026
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f4b83cd54d128d19dba2d6f5d74d45d60ccd96dc27c0ac8c43f1acfcd124cdf
3
+ size 14244
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b30172cf14f5dbe00280d63e36224a9f28dc7a0e8b38a74ceb5eb284e84da363
3
+ size 988
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2bb3670dbe342b2f63fd0f1586e8d8e6a2203de4db4552d51d74532ab3a05ae1
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "[رابط]",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": true,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "[بريد]",
53
+ "lstrip": false,
54
+ "normalized": true,
55
+ "rstrip": false,
56
+ "single_word": true,
57
+ "special": true
58
+ },
59
+ "7": {
60
+ "content": "[مستخدم]",
61
+ "lstrip": false,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": true,
65
+ "special": true
66
+ }
67
+ },
68
+ "clean_up_tokenization_spaces": true,
69
+ "cls_token": "[CLS]",
70
+ "do_basic_tokenize": true,
71
+ "do_lower_case": false,
72
+ "extra_special_tokens": {},
73
+ "mask_token": "[MASK]",
74
+ "max_len": 512,
75
+ "max_length": 512,
76
+ "model_max_length": 512,
77
+ "never_split": [
78
+ "[بريد]",
79
+ "[مستخدم]",
80
+ "[رابط]"
81
+ ],
82
+ "pad_to_multiple_of": null,
83
+ "pad_token": "[PAD]",
84
+ "pad_token_type_id": 0,
85
+ "padding_side": "right",
86
+ "sep_token": "[SEP]",
87
+ "stride": 0,
88
+ "strip_accents": null,
89
+ "tokenize_chinese_chars": true,
90
+ "tokenizer_class": "BertTokenizer",
91
+ "truncation_side": "right",
92
+ "truncation_strategy": "longest_first",
93
+ "unk_token": "[UNK]"
94
+ }
trainer_state.json ADDED
@@ -0,0 +1,716 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 15500,
3
+ "best_metric": 0.9434096975688787,
4
+ "best_model_checkpoint": "./arabert_author_model_full/checkpoint-15500",
5
+ "epoch": 3.374700631395602,
6
+ "eval_steps": 500,
7
+ "global_step": 15500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.10886131069018071,
14
+ "grad_norm": 745471.1875,
15
+ "learning_rate": 2.171926006528836e-05,
16
+ "loss": 2.2995,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.10886131069018071,
21
+ "eval_accuracy": 0.6985195154777928,
22
+ "eval_f1_macro": 0.601246564201863,
23
+ "eval_f1_micro": 0.6985195154777928,
24
+ "eval_loss": 1.2079353332519531,
25
+ "eval_precision_macro": 0.6412966664769482,
26
+ "eval_precision_micro": 0.6985195154777928,
27
+ "eval_recall_macro": 0.6419387939365965,
28
+ "eval_recall_micro": 0.6985195154777928,
29
+ "eval_runtime": 14.7462,
30
+ "eval_samples_per_second": 50.386,
31
+ "eval_steps_per_second": 3.187,
32
+ "step": 500
33
+ },
34
+ {
35
+ "epoch": 0.21772262138036141,
36
+ "grad_norm": 956772.5625,
37
+ "learning_rate": 4.348204570184984e-05,
38
+ "loss": 0.849,
39
+ "step": 1000
40
+ },
41
+ {
42
+ "epoch": 0.21772262138036141,
43
+ "eval_accuracy": 0.819650067294751,
44
+ "eval_f1_macro": 0.7996069224582287,
45
+ "eval_f1_micro": 0.819650067294751,
46
+ "eval_loss": 0.5631475448608398,
47
+ "eval_precision_macro": 0.80345079706281,
48
+ "eval_precision_micro": 0.819650067294751,
49
+ "eval_recall_macro": 0.82483257704162,
50
+ "eval_recall_micro": 0.819650067294751,
51
+ "eval_runtime": 14.7707,
52
+ "eval_samples_per_second": 50.302,
53
+ "eval_steps_per_second": 3.182,
54
+ "step": 1000
55
+ },
56
+ {
57
+ "epoch": 0.32658393207054215,
58
+ "grad_norm": 326904.5,
59
+ "learning_rate": 6.524483133841132e-05,
60
+ "loss": 0.5868,
61
+ "step": 1500
62
+ },
63
+ {
64
+ "epoch": 0.32658393207054215,
65
+ "eval_accuracy": 0.8021534320323015,
66
+ "eval_f1_macro": 0.793396840762137,
67
+ "eval_f1_micro": 0.8021534320323015,
68
+ "eval_loss": 0.6846649646759033,
69
+ "eval_precision_macro": 0.8435106749075885,
70
+ "eval_precision_micro": 0.8021534320323015,
71
+ "eval_recall_macro": 0.7916833340258262,
72
+ "eval_recall_micro": 0.8021534320323015,
73
+ "eval_runtime": 14.8117,
74
+ "eval_samples_per_second": 50.163,
75
+ "eval_steps_per_second": 3.173,
76
+ "step": 1500
77
+ },
78
+ {
79
+ "epoch": 0.43544524276072283,
80
+ "grad_norm": 218412.828125,
81
+ "learning_rate": 7.998128491699842e-05,
82
+ "loss": 0.5612,
83
+ "step": 2000
84
+ },
85
+ {
86
+ "epoch": 0.43544524276072283,
87
+ "eval_accuracy": 0.8519515477792732,
88
+ "eval_f1_macro": 0.8247069527158585,
89
+ "eval_f1_micro": 0.8519515477792732,
90
+ "eval_loss": 0.4477691948413849,
91
+ "eval_precision_macro": 0.8896978331250329,
92
+ "eval_precision_micro": 0.8519515477792732,
93
+ "eval_recall_macro": 0.8379996548860711,
94
+ "eval_recall_micro": 0.8519515477792732,
95
+ "eval_runtime": 14.7186,
96
+ "eval_samples_per_second": 50.48,
97
+ "eval_steps_per_second": 3.193,
98
+ "step": 2000
99
+ },
100
+ {
101
+ "epoch": 0.5443065534509035,
102
+ "grad_norm": 1928294.625,
103
+ "learning_rate": 7.968493088594472e-05,
104
+ "loss": 0.4929,
105
+ "step": 2500
106
+ },
107
+ {
108
+ "epoch": 0.5443065534509035,
109
+ "eval_accuracy": 0.873485868102288,
110
+ "eval_f1_macro": 0.8688756385026712,
111
+ "eval_f1_micro": 0.873485868102288,
112
+ "eval_loss": 0.3026486039161682,
113
+ "eval_precision_macro": 0.878210989714668,
114
+ "eval_precision_micro": 0.873485868102288,
115
+ "eval_recall_macro": 0.8859921080399548,
116
+ "eval_recall_micro": 0.873485868102288,
117
+ "eval_runtime": 14.7131,
118
+ "eval_samples_per_second": 50.499,
119
+ "eval_steps_per_second": 3.194,
120
+ "step": 2500
121
+ },
122
+ {
123
+ "epoch": 0.6531678641410843,
124
+ "grad_norm": 3023410.5,
125
+ "learning_rate": 7.903065943344406e-05,
126
+ "loss": 0.4618,
127
+ "step": 3000
128
+ },
129
+ {
130
+ "epoch": 0.6531678641410843,
131
+ "eval_accuracy": 0.8613728129205922,
132
+ "eval_f1_macro": 0.8296445269102163,
133
+ "eval_f1_micro": 0.8613728129205922,
134
+ "eval_loss": 0.43775779008865356,
135
+ "eval_precision_macro": 0.8710561256381226,
136
+ "eval_precision_micro": 0.8613728129205922,
137
+ "eval_recall_macro": 0.8541696546910839,
138
+ "eval_recall_micro": 0.8613728129205922,
139
+ "eval_runtime": 14.7062,
140
+ "eval_samples_per_second": 50.523,
141
+ "eval_steps_per_second": 3.196,
142
+ "step": 3000
143
+ },
144
+ {
145
+ "epoch": 0.762029174831265,
146
+ "grad_norm": 211605.15625,
147
+ "learning_rate": 7.802437141773096e-05,
148
+ "loss": 0.4028,
149
+ "step": 3500
150
+ },
151
+ {
152
+ "epoch": 0.762029174831265,
153
+ "eval_accuracy": 0.8950201884253028,
154
+ "eval_f1_macro": 0.8917785158702655,
155
+ "eval_f1_micro": 0.8950201884253028,
156
+ "eval_loss": 0.25510504841804504,
157
+ "eval_precision_macro": 0.9107123575695487,
158
+ "eval_precision_micro": 0.8950201884253028,
159
+ "eval_recall_macro": 0.9057010565367906,
160
+ "eval_recall_micro": 0.8950201884253028,
161
+ "eval_runtime": 14.7188,
162
+ "eval_samples_per_second": 50.48,
163
+ "eval_steps_per_second": 3.193,
164
+ "step": 3500
165
+ },
166
+ {
167
+ "epoch": 0.8708904855214457,
168
+ "grad_norm": 37626.74609375,
169
+ "learning_rate": 7.667514252581752e-05,
170
+ "loss": 0.3747,
171
+ "step": 4000
172
+ },
173
+ {
174
+ "epoch": 0.8708904855214457,
175
+ "eval_accuracy": 0.892328398384926,
176
+ "eval_f1_macro": 0.8948877387080549,
177
+ "eval_f1_micro": 0.892328398384926,
178
+ "eval_loss": 0.2622196674346924,
179
+ "eval_precision_macro": 0.9437605053976897,
180
+ "eval_precision_micro": 0.892328398384926,
181
+ "eval_recall_macro": 0.9063603025064753,
182
+ "eval_recall_micro": 0.892328398384926,
183
+ "eval_runtime": 14.7613,
184
+ "eval_samples_per_second": 50.334,
185
+ "eval_steps_per_second": 3.184,
186
+ "step": 4000
187
+ },
188
+ {
189
+ "epoch": 0.9797517962116263,
190
+ "grad_norm": 341548.65625,
191
+ "learning_rate": 7.499514142009407e-05,
192
+ "loss": 0.3686,
193
+ "step": 4500
194
+ },
195
+ {
196
+ "epoch": 0.9797517962116263,
197
+ "eval_accuracy": 0.901749663526245,
198
+ "eval_f1_macro": 0.9071958475193036,
199
+ "eval_f1_micro": 0.9017496635262451,
200
+ "eval_loss": 0.21770605444908142,
201
+ "eval_precision_macro": 0.9392339212137314,
202
+ "eval_precision_micro": 0.901749663526245,
203
+ "eval_recall_macro": 0.9187280722751042,
204
+ "eval_recall_micro": 0.901749663526245,
205
+ "eval_runtime": 14.7411,
206
+ "eval_samples_per_second": 50.403,
207
+ "eval_steps_per_second": 3.188,
208
+ "step": 4500
209
+ },
210
+ {
211
+ "epoch": 1.088613106901807,
212
+ "grad_norm": 51656.32421875,
213
+ "learning_rate": 7.299951998946065e-05,
214
+ "loss": 0.2762,
215
+ "step": 5000
216
+ },
217
+ {
218
+ "epoch": 1.088613106901807,
219
+ "eval_accuracy": 0.8896366083445492,
220
+ "eval_f1_macro": 0.8803954267807832,
221
+ "eval_f1_micro": 0.8896366083445492,
222
+ "eval_loss": 0.37781140208244324,
223
+ "eval_precision_macro": 0.8980066417509999,
224
+ "eval_precision_micro": 0.8896366083445492,
225
+ "eval_recall_macro": 0.8882222866157216,
226
+ "eval_recall_micro": 0.8896366083445492,
227
+ "eval_runtime": 15.0879,
228
+ "eval_samples_per_second": 49.245,
229
+ "eval_steps_per_second": 3.115,
230
+ "step": 5000
231
+ },
232
+ {
233
+ "epoch": 1.1974744175919878,
234
+ "grad_norm": 1009913.0625,
235
+ "learning_rate": 7.070627669481137e-05,
236
+ "loss": 0.2851,
237
+ "step": 5500
238
+ },
239
+ {
240
+ "epoch": 1.1974744175919878,
241
+ "eval_accuracy": 0.882907133243607,
242
+ "eval_f1_macro": 0.8672894626796113,
243
+ "eval_f1_micro": 0.882907133243607,
244
+ "eval_loss": 0.38583362102508545,
245
+ "eval_precision_macro": 0.9049625152940963,
246
+ "eval_precision_micro": 0.882907133243607,
247
+ "eval_recall_macro": 0.8813935878782198,
248
+ "eval_recall_micro": 0.882907133243607,
249
+ "eval_runtime": 14.7029,
250
+ "eval_samples_per_second": 50.534,
251
+ "eval_steps_per_second": 3.197,
252
+ "step": 5500
253
+ },
254
+ {
255
+ "epoch": 1.3063357282821686,
256
+ "grad_norm": 26227.69140625,
257
+ "learning_rate": 6.813609424135567e-05,
258
+ "loss": 0.2818,
259
+ "step": 6000
260
+ },
261
+ {
262
+ "epoch": 1.3063357282821686,
263
+ "eval_accuracy": 0.9138627187079408,
264
+ "eval_f1_macro": 0.9250807107212078,
265
+ "eval_f1_micro": 0.9138627187079408,
266
+ "eval_loss": 0.1822730302810669,
267
+ "eval_precision_macro": 0.9436200764635643,
268
+ "eval_precision_micro": 0.9138627187079408,
269
+ "eval_recall_macro": 0.9322277636580386,
270
+ "eval_recall_micro": 0.9138627187079408,
271
+ "eval_runtime": 14.7441,
272
+ "eval_samples_per_second": 50.393,
273
+ "eval_steps_per_second": 3.188,
274
+ "step": 6000
275
+ },
276
+ {
277
+ "epoch": 1.4151970389723492,
278
+ "grad_norm": 87145.015625,
279
+ "learning_rate": 6.531215304180572e-05,
280
+ "loss": 0.2539,
281
+ "step": 6500
282
+ },
283
+ {
284
+ "epoch": 1.4151970389723492,
285
+ "eval_accuracy": 0.9044414535666218,
286
+ "eval_f1_macro": 0.9159118265135213,
287
+ "eval_f1_micro": 0.9044414535666218,
288
+ "eval_loss": 0.19744105637073517,
289
+ "eval_precision_macro": 0.9248731430404993,
290
+ "eval_precision_micro": 0.9044414535666218,
291
+ "eval_recall_macro": 0.9361879615931227,
292
+ "eval_recall_micro": 0.9044414535666218,
293
+ "eval_runtime": 14.7205,
294
+ "eval_samples_per_second": 50.474,
295
+ "eval_steps_per_second": 3.193,
296
+ "step": 6500
297
+ },
298
+ {
299
+ "epoch": 1.52405834966253,
300
+ "grad_norm": 4197689.5,
301
+ "learning_rate": 6.22599221528008e-05,
302
+ "loss": 0.2342,
303
+ "step": 7000
304
+ },
305
+ {
306
+ "epoch": 1.52405834966253,
307
+ "eval_accuracy": 0.9152086137281292,
308
+ "eval_f1_macro": 0.9209521774588028,
309
+ "eval_f1_micro": 0.9152086137281292,
310
+ "eval_loss": 0.16721387207508087,
311
+ "eval_precision_macro": 0.9316385374819118,
312
+ "eval_precision_micro": 0.9152086137281292,
313
+ "eval_recall_macro": 0.9305594066426393,
314
+ "eval_recall_micro": 0.9152086137281292,
315
+ "eval_runtime": 14.7185,
316
+ "eval_samples_per_second": 50.481,
317
+ "eval_steps_per_second": 3.193,
318
+ "step": 7000
319
+ },
320
+ {
321
+ "epoch": 1.6329196603527105,
322
+ "grad_norm": 29691.1875,
323
+ "learning_rate": 5.900692957010821e-05,
324
+ "loss": 0.2658,
325
+ "step": 7500
326
+ },
327
+ {
328
+ "epoch": 1.6329196603527105,
329
+ "eval_accuracy": 0.9205921938088829,
330
+ "eval_f1_macro": 0.9292673927082579,
331
+ "eval_f1_micro": 0.9205921938088829,
332
+ "eval_loss": 0.16926071047782898,
333
+ "eval_precision_macro": 0.9467601029387086,
334
+ "eval_precision_micro": 0.9205921938088829,
335
+ "eval_recall_macro": 0.9353857192023052,
336
+ "eval_recall_micro": 0.9205921938088829,
337
+ "eval_runtime": 14.7038,
338
+ "eval_samples_per_second": 50.531,
339
+ "eval_steps_per_second": 3.196,
340
+ "step": 7500
341
+ },
342
+ {
343
+ "epoch": 1.7417809710428913,
344
+ "grad_norm": 82702.546875,
345
+ "learning_rate": 5.5582513954302386e-05,
346
+ "loss": 0.2703,
347
+ "step": 8000
348
+ },
349
+ {
350
+ "epoch": 1.7417809710428913,
351
+ "eval_accuracy": 0.917900403768506,
352
+ "eval_f1_macro": 0.9205592899943698,
353
+ "eval_f1_micro": 0.917900403768506,
354
+ "eval_loss": 0.22037993371486664,
355
+ "eval_precision_macro": 0.9459349396324186,
356
+ "eval_precision_micro": 0.917900403768506,
357
+ "eval_recall_macro": 0.9278516945604416,
358
+ "eval_recall_micro": 0.917900403768506,
359
+ "eval_runtime": 14.7085,
360
+ "eval_samples_per_second": 50.515,
361
+ "eval_steps_per_second": 3.195,
362
+ "step": 8000
363
+ },
364
+ {
365
+ "epoch": 1.850642281733072,
366
+ "grad_norm": 450699.1875,
367
+ "learning_rate": 5.201756002610252e-05,
368
+ "loss": 0.2566,
369
+ "step": 8500
370
+ },
371
+ {
372
+ "epoch": 1.850642281733072,
373
+ "eval_accuracy": 0.9098250336473755,
374
+ "eval_f1_macro": 0.9126391472355347,
375
+ "eval_f1_micro": 0.9098250336473755,
376
+ "eval_loss": 0.26449093222618103,
377
+ "eval_precision_macro": 0.9352643525302922,
378
+ "eval_precision_micro": 0.9098250336473755,
379
+ "eval_recall_macro": 0.931955435163728,
380
+ "eval_recall_micro": 0.9098250336473755,
381
+ "eval_runtime": 14.6939,
382
+ "eval_samples_per_second": 50.565,
383
+ "eval_steps_per_second": 3.199,
384
+ "step": 8500
385
+ },
386
+ {
387
+ "epoch": 1.959503592423253,
388
+ "grad_norm": 37148.73046875,
389
+ "learning_rate": 4.834422001783138e-05,
390
+ "loss": 0.2242,
391
+ "step": 9000
392
+ },
393
+ {
394
+ "epoch": 1.959503592423253,
395
+ "eval_accuracy": 0.9246298788694481,
396
+ "eval_f1_macro": 0.9278695233625198,
397
+ "eval_f1_micro": 0.9246298788694481,
398
+ "eval_loss": 0.20524874329566956,
399
+ "eval_precision_macro": 0.9473174570200222,
400
+ "eval_precision_micro": 0.9246298788694481,
401
+ "eval_recall_macro": 0.9317137486146517,
402
+ "eval_recall_micro": 0.9246298788694481,
403
+ "eval_runtime": 14.65,
404
+ "eval_samples_per_second": 50.717,
405
+ "eval_steps_per_second": 3.208,
406
+ "step": 9000
407
+ },
408
+ {
409
+ "epoch": 2.0683649031134337,
410
+ "grad_norm": 65893.8984375,
411
+ "learning_rate": 4.45956236932181e-05,
412
+ "loss": 0.1672,
413
+ "step": 9500
414
+ },
415
+ {
416
+ "epoch": 2.0683649031134337,
417
+ "eval_accuracy": 0.9165545087483177,
418
+ "eval_f1_macro": 0.9239702133396492,
419
+ "eval_f1_micro": 0.9165545087483177,
420
+ "eval_loss": 0.3571414351463318,
421
+ "eval_precision_macro": 0.9412785975210729,
422
+ "eval_precision_micro": 0.9165545087483177,
423
+ "eval_recall_macro": 0.9173054563259597,
424
+ "eval_recall_micro": 0.9165545087483177,
425
+ "eval_runtime": 14.749,
426
+ "eval_samples_per_second": 50.376,
427
+ "eval_steps_per_second": 3.187,
428
+ "step": 9500
429
+ },
430
+ {
431
+ "epoch": 2.177226213803614,
432
+ "grad_norm": 20243.5546875,
433
+ "learning_rate": 4.0805579550869046e-05,
434
+ "loss": 0.1593,
435
+ "step": 10000
436
+ },
437
+ {
438
+ "epoch": 2.177226213803614,
439
+ "eval_accuracy": 0.9125168236877523,
440
+ "eval_f1_macro": 0.9238184226911409,
441
+ "eval_f1_micro": 0.9125168236877523,
442
+ "eval_loss": 0.30988800525665283,
443
+ "eval_precision_macro": 0.9555289484815556,
444
+ "eval_precision_micro": 0.9125168236877523,
445
+ "eval_recall_macro": 0.9275764985418137,
446
+ "eval_recall_micro": 0.9125168236877523,
447
+ "eval_runtime": 15.0155,
448
+ "eval_samples_per_second": 49.482,
449
+ "eval_steps_per_second": 3.13,
450
+ "step": 10000
451
+ },
452
+ {
453
+ "epoch": 2.286087524493795,
454
+ "grad_norm": 33157.19140625,
455
+ "learning_rate": 3.7008269906245454e-05,
456
+ "loss": 0.1799,
457
+ "step": 10500
458
+ },
459
+ {
460
+ "epoch": 2.286087524493795,
461
+ "eval_accuracy": 0.9246298788694481,
462
+ "eval_f1_macro": 0.9287251727049811,
463
+ "eval_f1_micro": 0.9246298788694481,
464
+ "eval_loss": 0.23414301872253418,
465
+ "eval_precision_macro": 0.959944603131214,
466
+ "eval_precision_micro": 0.9246298788694481,
467
+ "eval_recall_macro": 0.9306134629626335,
468
+ "eval_recall_micro": 0.9246298788694481,
469
+ "eval_runtime": 14.6983,
470
+ "eval_samples_per_second": 50.55,
471
+ "eval_steps_per_second": 3.198,
472
+ "step": 10500
473
+ },
474
+ {
475
+ "epoch": 2.3949488351839756,
476
+ "grad_norm": 48777.84375,
477
+ "learning_rate": 3.323794260219589e-05,
478
+ "loss": 0.166,
479
+ "step": 11000
480
+ },
481
+ {
482
+ "epoch": 2.3949488351839756,
483
+ "eval_accuracy": 0.9057873485868102,
484
+ "eval_f1_macro": 0.9123153410480982,
485
+ "eval_f1_micro": 0.9057873485868102,
486
+ "eval_loss": 0.3453662395477295,
487
+ "eval_precision_macro": 0.9446104426733389,
488
+ "eval_precision_micro": 0.9057873485868102,
489
+ "eval_recall_macro": 0.91935239522038,
490
+ "eval_recall_micro": 0.9057873485868102,
491
+ "eval_runtime": 14.7404,
492
+ "eval_samples_per_second": 50.406,
493
+ "eval_steps_per_second": 3.189,
494
+ "step": 11000
495
+ },
496
+ {
497
+ "epoch": 2.5038101458741564,
498
+ "grad_norm": 33563.56640625,
499
+ "learning_rate": 2.9528602128499004e-05,
500
+ "loss": 0.162,
501
+ "step": 11500
502
+ },
503
+ {
504
+ "epoch": 2.5038101458741564,
505
+ "eval_accuracy": 0.9098250336473755,
506
+ "eval_f1_macro": 0.9212878627631594,
507
+ "eval_f1_micro": 0.9098250336473755,
508
+ "eval_loss": 0.22809743881225586,
509
+ "eval_precision_macro": 0.9389309808956737,
510
+ "eval_precision_micro": 0.9098250336473755,
511
+ "eval_recall_macro": 0.9311247877025975,
512
+ "eval_recall_micro": 0.9098250336473755,
513
+ "eval_runtime": 14.666,
514
+ "eval_samples_per_second": 50.661,
515
+ "eval_steps_per_second": 3.205,
516
+ "step": 11500
517
+ },
518
+ {
519
+ "epoch": 2.612671456564337,
520
+ "grad_norm": 58977.125,
521
+ "learning_rate": 2.591370293620146e-05,
522
+ "loss": 0.1452,
523
+ "step": 12000
524
+ },
525
+ {
526
+ "epoch": 2.612671456564337,
527
+ "eval_accuracy": 0.9219380888290714,
528
+ "eval_f1_macro": 0.9232635700162879,
529
+ "eval_f1_micro": 0.9219380888290714,
530
+ "eval_loss": 0.2860707640647888,
531
+ "eval_precision_macro": 0.9426347574998575,
532
+ "eval_precision_micro": 0.9219380888290714,
533
+ "eval_recall_macro": 0.9262974863930373,
534
+ "eval_recall_micro": 0.9219380888290714,
535
+ "eval_runtime": 14.8095,
536
+ "eval_samples_per_second": 50.171,
537
+ "eval_steps_per_second": 3.174,
538
+ "step": 12000
539
+ },
540
+ {
541
+ "epoch": 2.7215327672545175,
542
+ "grad_norm": 46900.25390625,
543
+ "learning_rate": 2.2425847712741887e-05,
544
+ "loss": 0.1418,
545
+ "step": 12500
546
+ },
547
+ {
548
+ "epoch": 2.7215327672545175,
549
+ "eval_accuracy": 0.9286675639300135,
550
+ "eval_f1_macro": 0.9357990563843356,
551
+ "eval_f1_micro": 0.9286675639300135,
552
+ "eval_loss": 0.15669873356819153,
553
+ "eval_precision_macro": 0.9529768865317036,
554
+ "eval_precision_micro": 0.9286675639300135,
555
+ "eval_recall_macro": 0.9417303559122717,
556
+ "eval_recall_micro": 0.9286675639300135,
557
+ "eval_runtime": 14.7072,
558
+ "eval_samples_per_second": 50.52,
559
+ "eval_steps_per_second": 3.196,
560
+ "step": 12500
561
+ },
562
+ {
563
+ "epoch": 2.8303940779446983,
564
+ "grad_norm": 37592.3515625,
565
+ "learning_rate": 1.9096493339109878e-05,
566
+ "loss": 0.1429,
567
+ "step": 13000
568
+ },
569
+ {
570
+ "epoch": 2.8303940779446983,
571
+ "eval_accuracy": 0.9165545087483177,
572
+ "eval_f1_macro": 0.9295728643158702,
573
+ "eval_f1_micro": 0.9165545087483177,
574
+ "eval_loss": 0.22479559481143951,
575
+ "eval_precision_macro": 0.9605098350591709,
576
+ "eval_precision_micro": 0.9165545087483177,
577
+ "eval_recall_macro": 0.9328126952515738,
578
+ "eval_recall_micro": 0.9165545087483177,
579
+ "eval_runtime": 14.6901,
580
+ "eval_samples_per_second": 50.578,
581
+ "eval_steps_per_second": 3.199,
582
+ "step": 13000
583
+ },
584
+ {
585
+ "epoch": 2.939255388634879,
586
+ "grad_norm": 79597.40625,
587
+ "learning_rate": 1.5955667181005554e-05,
588
+ "loss": 0.1293,
589
+ "step": 13500
590
+ },
591
+ {
592
+ "epoch": 2.939255388634879,
593
+ "eval_accuracy": 0.9246298788694481,
594
+ "eval_f1_macro": 0.9319848397676713,
595
+ "eval_f1_micro": 0.9246298788694481,
596
+ "eval_loss": 0.27543124556541443,
597
+ "eval_precision_macro": 0.9589344708678029,
598
+ "eval_precision_micro": 0.9246298788694481,
599
+ "eval_recall_macro": 0.932925082879603,
600
+ "eval_recall_micro": 0.9246298788694481,
601
+ "eval_runtime": 14.726,
602
+ "eval_samples_per_second": 50.455,
603
+ "eval_steps_per_second": 3.192,
604
+ "step": 13500
605
+ },
606
+ {
607
+ "epoch": 3.04811669932506,
608
+ "grad_norm": 25773.66796875,
609
+ "learning_rate": 1.3031696272762192e-05,
610
+ "loss": 0.1137,
611
+ "step": 14000
612
+ },
613
+ {
614
+ "epoch": 3.04811669932506,
615
+ "eval_accuracy": 0.9246298788694481,
616
+ "eval_f1_macro": 0.937910042741771,
617
+ "eval_f1_micro": 0.9246298788694481,
618
+ "eval_loss": 0.20125848054885864,
619
+ "eval_precision_macro": 0.9546735463378956,
620
+ "eval_precision_micro": 0.9246298788694481,
621
+ "eval_recall_macro": 0.9429177293988182,
622
+ "eval_recall_micro": 0.9246298788694481,
623
+ "eval_runtime": 15.0054,
624
+ "eval_samples_per_second": 49.515,
625
+ "eval_steps_per_second": 3.132,
626
+ "step": 14000
627
+ },
628
+ {
629
+ "epoch": 3.1569780100152407,
630
+ "grad_norm": 17888.46484375,
631
+ "learning_rate": 1.0350951836516297e-05,
632
+ "loss": 0.0987,
633
+ "step": 14500
634
+ },
635
+ {
636
+ "epoch": 3.1569780100152407,
637
+ "eval_accuracy": 0.9232839838492598,
638
+ "eval_f1_macro": 0.9266276405829272,
639
+ "eval_f1_micro": 0.9232839838492598,
640
+ "eval_loss": 0.29369959235191345,
641
+ "eval_precision_macro": 0.9436536313571009,
642
+ "eval_precision_micro": 0.9232839838492598,
643
+ "eval_recall_macro": 0.9283196203410136,
644
+ "eval_recall_micro": 0.9232839838492598,
645
+ "eval_runtime": 14.7764,
646
+ "eval_samples_per_second": 50.283,
647
+ "eval_steps_per_second": 3.181,
648
+ "step": 14500
649
+ },
650
+ {
651
+ "epoch": 3.265839320705421,
652
+ "grad_norm": 85828.9375,
653
+ "learning_rate": 7.9376114407998e-06,
654
+ "loss": 0.0859,
655
+ "step": 15000
656
+ },
657
+ {
658
+ "epoch": 3.265839320705421,
659
+ "eval_accuracy": 0.9246298788694481,
660
+ "eval_f1_macro": 0.9402166974265765,
661
+ "eval_f1_micro": 0.9246298788694481,
662
+ "eval_loss": 0.17889092862606049,
663
+ "eval_precision_macro": 0.9685045177945787,
664
+ "eval_precision_micro": 0.9246298788694481,
665
+ "eval_recall_macro": 0.9463450172046672,
666
+ "eval_recall_micro": 0.9246298788694481,
667
+ "eval_runtime": 14.7495,
668
+ "eval_samples_per_second": 50.375,
669
+ "eval_steps_per_second": 3.187,
670
+ "step": 15000
671
+ },
672
+ {
673
+ "epoch": 3.374700631395602,
674
+ "grad_norm": 146288.75,
675
+ "learning_rate": 5.813440943640527e-06,
676
+ "loss": 0.0857,
677
+ "step": 15500
678
+ },
679
+ {
680
+ "epoch": 3.374700631395602,
681
+ "eval_accuracy": 0.927321668909825,
682
+ "eval_f1_macro": 0.9434096975688787,
683
+ "eval_f1_micro": 0.927321668909825,
684
+ "eval_loss": 0.16961060464382172,
685
+ "eval_precision_macro": 0.9641802881027017,
686
+ "eval_precision_micro": 0.927321668909825,
687
+ "eval_recall_macro": 0.9472331991452233,
688
+ "eval_recall_micro": 0.927321668909825,
689
+ "eval_runtime": 14.7305,
690
+ "eval_samples_per_second": 50.44,
691
+ "eval_steps_per_second": 3.191,
692
+ "step": 15500
693
+ }
694
+ ],
695
+ "logging_steps": 500,
696
+ "max_steps": 18372,
697
+ "num_input_tokens_seen": 0,
698
+ "num_train_epochs": 4,
699
+ "save_steps": 500,
700
+ "stateful_callbacks": {
701
+ "TrainerControl": {
702
+ "args": {
703
+ "should_epoch_stop": false,
704
+ "should_evaluate": false,
705
+ "should_log": false,
706
+ "should_save": true,
707
+ "should_training_stop": false
708
+ },
709
+ "attributes": {}
710
+ }
711
+ },
712
+ "total_flos": 6.52555679969065e+16,
713
+ "train_batch_size": 16,
714
+ "trial_name": null,
715
+ "trial_params": null
716
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1047e61b858553751484fc75261eb0ac4cc7e2f0958fe8dea13f34fba3822fb
3
+ size 5304
vocab.txt ADDED
The diff for this file is too large to render. See raw diff