J7B committed on
Commit
c38e2f4
·
verified ·
1 Parent(s): 8ba70c8

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -39,3 +39,6 @@ xlmr_spam_final/checkpoint-2814/tokenizer.json filter=lfs diff=lfs merge=lfs -te
39
  xlmr_v1/checkpoint-322/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
  xlmr_v1/checkpoint-644/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
  xlmr_v1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
39
  xlmr_v1/checkpoint-322/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
  xlmr_v1/checkpoint-644/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
  xlmr_v1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ checkpoint-322/tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
+ checkpoint-644/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoint-322/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": 2,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 3072,
16
+ "is_decoder": false,
17
+ "layer_norm_eps": 1e-05,
18
+ "max_position_embeddings": 514,
19
+ "model_type": "xlm-roberta",
20
+ "num_attention_heads": 12,
21
+ "num_hidden_layers": 12,
22
+ "output_past": true,
23
+ "pad_token_id": 1,
24
+ "position_embedding_type": "absolute",
25
+ "problem_type": "single_label_classification",
26
+ "tie_word_embeddings": true,
27
+ "transformers_version": "5.1.0",
28
+ "type_vocab_size": 1,
29
+ "use_cache": false,
30
+ "vocab_size": 250002
31
+ }
checkpoint-322/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f741b4925aca4d07257786c0c16d8dd1ab6d8d96730995f7b753f94e423a87bb
3
+ size 1112204984
checkpoint-322/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4942b5a303e11c1e12f1ec3e38cb77f52d813c2a3b6c36af6914aa4b1fbf311e
3
+ size 2224529658
checkpoint-322/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d04f779af80b021b161268ac30c9b56ff5d34fbece9bc553e4ff656cfe52ffdd
3
+ size 14244
checkpoint-322/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2725633a61db79106970482748454641c9161b5d2d378c0df56fe53a583e772d
3
+ size 988
checkpoint-322/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c9d0bfb7e7c3b9b19e70bb550549ae06f83a172d04342c9f707356c25a12f99
3
+ size 1064
checkpoint-322/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2464f9721707cb3d5edcf9a3d73454b13e8a7b3bb8fdba94b3de3d843f30e946
3
+ size 16781584
checkpoint-322/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "cls_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "is_local": true,
8
+ "mask_token": "<mask>",
9
+ "model_max_length": 512,
10
+ "pad_token": "<pad>",
11
+ "sep_token": "</s>",
12
+ "tokenizer_class": "XLMRobertaTokenizer",
13
+ "unk_token": "<unk>"
14
+ }
checkpoint-322/trainer_state.json ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 322,
3
+ "best_metric": 0.18244007229804993,
4
+ "best_model_checkpoint": "D:\\Major Project\\SpamX\\ml\\xlmr\\xlmr_v1\\checkpoint-322",
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 322,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.03108003108003108,
14
+ "grad_norm": 29.43787956237793,
15
+ "learning_rate": 4.945652173913044e-06,
16
+ "loss": 1.2356471061706542,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.06216006216006216,
21
+ "grad_norm": 36.178550720214844,
22
+ "learning_rate": 4.875776397515528e-06,
23
+ "loss": 1.1817050933837892,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.09324009324009325,
28
+ "grad_norm": 29.694974899291992,
29
+ "learning_rate": 4.798136645962733e-06,
30
+ "loss": 1.0811898231506347,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.12432012432012432,
35
+ "grad_norm": 8.485957145690918,
36
+ "learning_rate": 4.7204968944099384e-06,
37
+ "loss": 0.876597785949707,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.1554001554001554,
42
+ "grad_norm": 21.316192626953125,
43
+ "learning_rate": 4.642857142857144e-06,
44
+ "loss": 0.9505236625671387,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.1864801864801865,
49
+ "grad_norm": 34.35721969604492,
50
+ "learning_rate": 4.565217391304348e-06,
51
+ "loss": 1.1337352752685548,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.21756021756021757,
56
+ "grad_norm": 48.13325881958008,
57
+ "learning_rate": 4.487577639751553e-06,
58
+ "loss": 1.063378143310547,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.24864024864024864,
63
+ "grad_norm": 38.45956802368164,
64
+ "learning_rate": 4.4177018633540375e-06,
65
+ "loss": 1.1818448066711427,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.27972027972027974,
70
+ "grad_norm": 18.295886993408203,
71
+ "learning_rate": 4.340062111801243e-06,
72
+ "loss": 0.91037015914917,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.3108003108003108,
77
+ "grad_norm": 13.219426155090332,
78
+ "learning_rate": 4.262422360248447e-06,
79
+ "loss": 0.9483179092407227,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.3418803418803419,
84
+ "grad_norm": 23.118179321289062,
85
+ "learning_rate": 4.184782608695653e-06,
86
+ "loss": 0.992742919921875,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.372960372960373,
91
+ "grad_norm": 13.910191535949707,
92
+ "learning_rate": 4.107142857142857e-06,
93
+ "loss": 0.8459652900695801,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.40404040404040403,
98
+ "grad_norm": 124.82660675048828,
99
+ "learning_rate": 4.0295031055900625e-06,
100
+ "loss": 0.9497438430786133,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.43512043512043513,
105
+ "grad_norm": 46.57713317871094,
106
+ "learning_rate": 3.951863354037268e-06,
107
+ "loss": 1.1179959297180175,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.4662004662004662,
112
+ "grad_norm": 17.421274185180664,
113
+ "learning_rate": 3.874223602484472e-06,
114
+ "loss": 0.7715085983276367,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.4972804972804973,
119
+ "grad_norm": 16.492841720581055,
120
+ "learning_rate": 3.7965838509316772e-06,
121
+ "loss": 0.9310503959655761,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.5283605283605284,
126
+ "grad_norm": 38.246829986572266,
127
+ "learning_rate": 3.718944099378882e-06,
128
+ "loss": 0.89602632522583,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.5594405594405595,
133
+ "grad_norm": 28.985132217407227,
134
+ "learning_rate": 3.6413043478260875e-06,
135
+ "loss": 0.8126945495605469,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.5905205905205905,
140
+ "grad_norm": 14.60274600982666,
141
+ "learning_rate": 3.5636645962732924e-06,
142
+ "loss": 0.7371460914611816,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.6216006216006216,
147
+ "grad_norm": 30.11294937133789,
148
+ "learning_rate": 3.486024844720497e-06,
149
+ "loss": 1.0865836143493652,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.6526806526806527,
154
+ "grad_norm": 17.661558151245117,
155
+ "learning_rate": 3.4083850931677022e-06,
156
+ "loss": 1.0465456008911134,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.6837606837606838,
161
+ "grad_norm": 34.937503814697266,
162
+ "learning_rate": 3.3385093167701865e-06,
163
+ "loss": 0.8159684181213379,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.7148407148407149,
168
+ "grad_norm": 58.97747802734375,
169
+ "learning_rate": 3.2608695652173914e-06,
170
+ "loss": 0.820067310333252,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.745920745920746,
175
+ "grad_norm": 41.07950973510742,
176
+ "learning_rate": 3.1832298136645968e-06,
177
+ "loss": 1.0034560203552245,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.777000777000777,
182
+ "grad_norm": 31.760068893432617,
183
+ "learning_rate": 3.1055900621118013e-06,
184
+ "loss": 0.7825074672698975,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.8080808080808081,
189
+ "grad_norm": 29.330337524414062,
190
+ "learning_rate": 3.027950310559006e-06,
191
+ "loss": 0.8985923767089844,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.8391608391608392,
196
+ "grad_norm": 28.913965225219727,
197
+ "learning_rate": 2.9503105590062115e-06,
198
+ "loss": 0.8792219161987305,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.8702408702408703,
203
+ "grad_norm": 17.811166763305664,
204
+ "learning_rate": 2.8726708074534164e-06,
205
+ "loss": 0.8635202407836914,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.9013209013209014,
210
+ "grad_norm": 19.338523864746094,
211
+ "learning_rate": 2.795031055900621e-06,
212
+ "loss": 0.890287208557129,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.9324009324009324,
217
+ "grad_norm": 18.355663299560547,
218
+ "learning_rate": 2.7173913043478263e-06,
219
+ "loss": 0.9401198387145996,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.9634809634809635,
224
+ "grad_norm": 20.48928451538086,
225
+ "learning_rate": 2.639751552795031e-06,
226
+ "loss": 0.831356430053711,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.9945609945609946,
231
+ "grad_norm": 59.57387161254883,
232
+ "learning_rate": 2.5621118012422365e-06,
233
+ "loss": 0.7133886814117432,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 1.0,
238
+ "eval_loss": 0.18244007229804993,
239
+ "eval_runtime": 4.4097,
240
+ "eval_samples_per_second": 259.426,
241
+ "eval_steps_per_second": 32.428,
242
+ "step": 322
243
+ }
244
+ ],
245
+ "logging_steps": 10,
246
+ "max_steps": 644,
247
+ "num_input_tokens_seen": 0,
248
+ "num_train_epochs": 2,
249
+ "save_steps": 500,
250
+ "stateful_callbacks": {
251
+ "TrainerControl": {
252
+ "args": {
253
+ "should_epoch_stop": false,
254
+ "should_evaluate": false,
255
+ "should_log": false,
256
+ "should_save": true,
257
+ "should_training_stop": false
258
+ },
259
+ "attributes": {}
260
+ }
261
+ },
262
+ "total_flos": 391300611531000.0,
263
+ "train_batch_size": 8,
264
+ "trial_name": null,
265
+ "trial_params": null
266
+ }
checkpoint-322/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a44da1fe3b83e5bc0b568a24f212da51163e6e6058c33c08f7252f28612d3574
3
+ size 4792
checkpoint-644/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": 2,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 3072,
16
+ "is_decoder": false,
17
+ "layer_norm_eps": 1e-05,
18
+ "max_position_embeddings": 514,
19
+ "model_type": "xlm-roberta",
20
+ "num_attention_heads": 12,
21
+ "num_hidden_layers": 12,
22
+ "output_past": true,
23
+ "pad_token_id": 1,
24
+ "position_embedding_type": "absolute",
25
+ "problem_type": "single_label_classification",
26
+ "tie_word_embeddings": true,
27
+ "transformers_version": "5.1.0",
28
+ "type_vocab_size": 1,
29
+ "use_cache": false,
30
+ "vocab_size": 250002
31
+ }
checkpoint-644/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bd8036f3652c5248472577c5be1472e4d09695a46c48a475306888f0ced9d4a
3
+ size 1112204984
checkpoint-644/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a21b7921622201f092ce3cdd38b18e9b48040089269df7f65cd969de677f95e
3
+ size 2224529658
checkpoint-644/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:743aba1687c315dec910b7953f281e6820bd8053009788bfba4856f84534632f
3
+ size 14244
checkpoint-644/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab3be0a5d421e6ee0417ca0578aea14899255b1cd05eb21ef8852f72ca9dee77
3
+ size 988
checkpoint-644/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1184452503e1d36a2645049020c04efce12f9872de36ec791c90dbf0159c4a6d
3
+ size 1064
checkpoint-644/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2464f9721707cb3d5edcf9a3d73454b13e8a7b3bb8fdba94b3de3d843f30e946
3
+ size 16781584
checkpoint-644/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "cls_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "is_local": true,
8
+ "mask_token": "<mask>",
9
+ "model_max_length": 512,
10
+ "pad_token": "<pad>",
11
+ "sep_token": "</s>",
12
+ "tokenizer_class": "XLMRobertaTokenizer",
13
+ "unk_token": "<unk>"
14
+ }
checkpoint-644/trainer_state.json ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 644,
3
+ "best_metric": 0.17823560535907745,
4
+ "best_model_checkpoint": "D:\\Major Project\\SpamX\\ml\\xlmr\\xlmr_v1\\checkpoint-644",
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 644,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.03108003108003108,
14
+ "grad_norm": 29.43787956237793,
15
+ "learning_rate": 4.945652173913044e-06,
16
+ "loss": 1.2356471061706542,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.06216006216006216,
21
+ "grad_norm": 36.178550720214844,
22
+ "learning_rate": 4.875776397515528e-06,
23
+ "loss": 1.1817050933837892,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.09324009324009325,
28
+ "grad_norm": 29.694974899291992,
29
+ "learning_rate": 4.798136645962733e-06,
30
+ "loss": 1.0811898231506347,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.12432012432012432,
35
+ "grad_norm": 8.485957145690918,
36
+ "learning_rate": 4.7204968944099384e-06,
37
+ "loss": 0.876597785949707,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.1554001554001554,
42
+ "grad_norm": 21.316192626953125,
43
+ "learning_rate": 4.642857142857144e-06,
44
+ "loss": 0.9505236625671387,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.1864801864801865,
49
+ "grad_norm": 34.35721969604492,
50
+ "learning_rate": 4.565217391304348e-06,
51
+ "loss": 1.1337352752685548,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.21756021756021757,
56
+ "grad_norm": 48.13325881958008,
57
+ "learning_rate": 4.487577639751553e-06,
58
+ "loss": 1.063378143310547,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.24864024864024864,
63
+ "grad_norm": 38.45956802368164,
64
+ "learning_rate": 4.4177018633540375e-06,
65
+ "loss": 1.1818448066711427,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.27972027972027974,
70
+ "grad_norm": 18.295886993408203,
71
+ "learning_rate": 4.340062111801243e-06,
72
+ "loss": 0.91037015914917,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.3108003108003108,
77
+ "grad_norm": 13.219426155090332,
78
+ "learning_rate": 4.262422360248447e-06,
79
+ "loss": 0.9483179092407227,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.3418803418803419,
84
+ "grad_norm": 23.118179321289062,
85
+ "learning_rate": 4.184782608695653e-06,
86
+ "loss": 0.992742919921875,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.372960372960373,
91
+ "grad_norm": 13.910191535949707,
92
+ "learning_rate": 4.107142857142857e-06,
93
+ "loss": 0.8459652900695801,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.40404040404040403,
98
+ "grad_norm": 124.82660675048828,
99
+ "learning_rate": 4.0295031055900625e-06,
100
+ "loss": 0.9497438430786133,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.43512043512043513,
105
+ "grad_norm": 46.57713317871094,
106
+ "learning_rate": 3.951863354037268e-06,
107
+ "loss": 1.1179959297180175,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.4662004662004662,
112
+ "grad_norm": 17.421274185180664,
113
+ "learning_rate": 3.874223602484472e-06,
114
+ "loss": 0.7715085983276367,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.4972804972804973,
119
+ "grad_norm": 16.492841720581055,
120
+ "learning_rate": 3.7965838509316772e-06,
121
+ "loss": 0.9310503959655761,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.5283605283605284,
126
+ "grad_norm": 38.246829986572266,
127
+ "learning_rate": 3.718944099378882e-06,
128
+ "loss": 0.89602632522583,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.5594405594405595,
133
+ "grad_norm": 28.985132217407227,
134
+ "learning_rate": 3.6413043478260875e-06,
135
+ "loss": 0.8126945495605469,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.5905205905205905,
140
+ "grad_norm": 14.60274600982666,
141
+ "learning_rate": 3.5636645962732924e-06,
142
+ "loss": 0.7371460914611816,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.6216006216006216,
147
+ "grad_norm": 30.11294937133789,
148
+ "learning_rate": 3.486024844720497e-06,
149
+ "loss": 1.0865836143493652,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.6526806526806527,
154
+ "grad_norm": 17.661558151245117,
155
+ "learning_rate": 3.4083850931677022e-06,
156
+ "loss": 1.0465456008911134,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.6837606837606838,
161
+ "grad_norm": 34.937503814697266,
162
+ "learning_rate": 3.3385093167701865e-06,
163
+ "loss": 0.8159684181213379,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.7148407148407149,
168
+ "grad_norm": 58.97747802734375,
169
+ "learning_rate": 3.2608695652173914e-06,
170
+ "loss": 0.820067310333252,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.745920745920746,
175
+ "grad_norm": 41.07950973510742,
176
+ "learning_rate": 3.1832298136645968e-06,
177
+ "loss": 1.0034560203552245,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.777000777000777,
182
+ "grad_norm": 31.760068893432617,
183
+ "learning_rate": 3.1055900621118013e-06,
184
+ "loss": 0.7825074672698975,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.8080808080808081,
189
+ "grad_norm": 29.330337524414062,
190
+ "learning_rate": 3.027950310559006e-06,
191
+ "loss": 0.8985923767089844,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.8391608391608392,
196
+ "grad_norm": 28.913965225219727,
197
+ "learning_rate": 2.9503105590062115e-06,
198
+ "loss": 0.8792219161987305,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.8702408702408703,
203
+ "grad_norm": 17.811166763305664,
204
+ "learning_rate": 2.8726708074534164e-06,
205
+ "loss": 0.8635202407836914,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.9013209013209014,
210
+ "grad_norm": 19.338523864746094,
211
+ "learning_rate": 2.795031055900621e-06,
212
+ "loss": 0.890287208557129,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.9324009324009324,
217
+ "grad_norm": 18.355663299560547,
218
+ "learning_rate": 2.7173913043478263e-06,
219
+ "loss": 0.9401198387145996,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.9634809634809635,
224
+ "grad_norm": 20.48928451538086,
225
+ "learning_rate": 2.639751552795031e-06,
226
+ "loss": 0.831356430053711,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.9945609945609946,
231
+ "grad_norm": 59.57387161254883,
232
+ "learning_rate": 2.5621118012422365e-06,
233
+ "loss": 0.7133886814117432,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 1.0,
238
+ "eval_loss": 0.18244007229804993,
239
+ "eval_runtime": 4.4097,
240
+ "eval_samples_per_second": 259.426,
241
+ "eval_steps_per_second": 32.428,
242
+ "step": 322
243
+ },
244
+ {
245
+ "epoch": 1.0248640248640248,
246
+ "grad_norm": 39.35393524169922,
247
+ "learning_rate": 2.484472049689441e-06,
248
+ "loss": 0.7849094867706299,
249
+ "step": 330
250
+ },
251
+ {
252
+ "epoch": 1.055944055944056,
253
+ "grad_norm": 39.5618896484375,
254
+ "learning_rate": 2.4068322981366464e-06,
255
+ "loss": 0.5631073474884033,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 1.087024087024087,
260
+ "grad_norm": 27.039840698242188,
261
+ "learning_rate": 2.3291925465838513e-06,
262
+ "loss": 0.8551163673400879,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 1.118104118104118,
267
+ "grad_norm": 31.069948196411133,
268
+ "learning_rate": 2.251552795031056e-06,
269
+ "loss": 0.940975284576416,
270
+ "step": 360
271
+ },
272
+ {
273
+ "epoch": 1.1491841491841492,
274
+ "grad_norm": 34.83683776855469,
275
+ "learning_rate": 2.173913043478261e-06,
276
+ "loss": 0.8859931945800781,
277
+ "step": 370
278
+ },
279
+ {
280
+ "epoch": 1.1802641802641802,
281
+ "grad_norm": 43.3201789855957,
282
+ "learning_rate": 2.096273291925466e-06,
283
+ "loss": 0.6784295558929443,
284
+ "step": 380
285
+ },
286
+ {
287
+ "epoch": 1.2113442113442114,
288
+ "grad_norm": 26.327808380126953,
289
+ "learning_rate": 2.018633540372671e-06,
290
+ "loss": 0.7212324619293213,
291
+ "step": 390
292
+ },
293
+ {
294
+ "epoch": 1.2424242424242424,
295
+ "grad_norm": 22.891172409057617,
296
+ "learning_rate": 1.940993788819876e-06,
297
+ "loss": 0.8333398818969726,
298
+ "step": 400
299
+ },
300
+ {
301
+ "epoch": 1.2735042735042734,
302
+ "grad_norm": 56.2291374206543,
303
+ "learning_rate": 1.8633540372670808e-06,
304
+ "loss": 1.0169650077819825,
305
+ "step": 410
306
+ },
307
+ {
308
+ "epoch": 1.3045843045843046,
309
+ "grad_norm": 38.48230743408203,
310
+ "learning_rate": 1.7857142857142859e-06,
311
+ "loss": 0.7382871627807617,
312
+ "step": 420
313
+ },
314
+ {
315
+ "epoch": 1.3356643356643356,
316
+ "grad_norm": 38.05250549316406,
317
+ "learning_rate": 1.7080745341614908e-06,
318
+ "loss": 0.8073366165161133,
319
+ "step": 430
320
+ },
321
+ {
322
+ "epoch": 1.3667443667443666,
323
+ "grad_norm": 30.15036392211914,
324
+ "learning_rate": 1.6304347826086957e-06,
325
+ "loss": 0.8878802299499512,
326
+ "step": 440
327
+ },
328
+ {
329
+ "epoch": 1.3978243978243978,
330
+ "grad_norm": 35.91055679321289,
331
+ "learning_rate": 1.5527950310559006e-06,
332
+ "loss": 0.5926938533782959,
333
+ "step": 450
334
+ },
335
+ {
336
+ "epoch": 1.428904428904429,
337
+ "grad_norm": 18.346158981323242,
338
+ "learning_rate": 1.4751552795031058e-06,
339
+ "loss": 0.590770959854126,
340
+ "step": 460
341
+ },
342
+ {
343
+ "epoch": 1.45998445998446,
344
+ "grad_norm": 31.250991821289062,
345
+ "learning_rate": 1.3975155279503105e-06,
346
+ "loss": 0.7174652099609375,
347
+ "step": 470
348
+ },
349
+ {
350
+ "epoch": 1.491064491064491,
351
+ "grad_norm": 35.001522064208984,
352
+ "learning_rate": 1.3198757763975156e-06,
353
+ "loss": 0.8534024238586426,
354
+ "step": 480
355
+ },
356
+ {
357
+ "epoch": 1.5221445221445222,
358
+ "grad_norm": 32.24079513549805,
359
+ "learning_rate": 1.2422360248447205e-06,
360
+ "loss": 0.8570188522338867,
361
+ "step": 490
362
+ },
363
+ {
364
+ "epoch": 1.5532245532245532,
365
+ "grad_norm": 19.8613224029541,
366
+ "learning_rate": 1.1645962732919256e-06,
367
+ "loss": 0.6434041023254394,
368
+ "step": 500
369
+ },
370
+ {
371
+ "epoch": 1.5843045843045842,
372
+ "grad_norm": 46.252769470214844,
373
+ "learning_rate": 1.0869565217391306e-06,
374
+ "loss": 0.5186689376831055,
375
+ "step": 510
376
+ },
377
+ {
378
+ "epoch": 1.6153846153846154,
379
+ "grad_norm": 20.27726936340332,
380
+ "learning_rate": 1.0093167701863355e-06,
381
+ "loss": 1.0002134323120118,
382
+ "step": 520
383
+ },
384
+ {
385
+ "epoch": 1.6464646464646466,
386
+ "grad_norm": 22.71544075012207,
387
+ "learning_rate": 9.316770186335404e-07,
388
+ "loss": 0.6581085681915283,
389
+ "step": 530
390
+ },
391
+ {
392
+ "epoch": 1.6775446775446774,
393
+ "grad_norm": 34.64213180541992,
394
+ "learning_rate": 8.540372670807454e-07,
395
+ "loss": 1.042281150817871,
396
+ "step": 540
397
+ },
398
+ {
399
+ "epoch": 1.7086247086247086,
400
+ "grad_norm": 17.567914962768555,
401
+ "learning_rate": 7.763975155279503e-07,
402
+ "loss": 0.7002626419067383,
403
+ "step": 550
404
+ },
405
+ {
406
+ "epoch": 1.7397047397047398,
407
+ "grad_norm": 31.270509719848633,
408
+ "learning_rate": 6.987577639751552e-07,
409
+ "loss": 0.7479125022888183,
410
+ "step": 560
411
+ },
412
+ {
413
+ "epoch": 1.7707847707847708,
414
+ "grad_norm": 33.94447708129883,
415
+ "learning_rate": 6.211180124223603e-07,
416
+ "loss": 0.6186141014099121,
417
+ "step": 570
418
+ },
419
+ {
420
+ "epoch": 1.8018648018648018,
421
+ "grad_norm": 18.601036071777344,
422
+ "learning_rate": 5.434782608695653e-07,
423
+ "loss": 0.6978522777557373,
424
+ "step": 580
425
+ },
426
+ {
427
+ "epoch": 1.832944832944833,
428
+ "grad_norm": 29.79970359802246,
429
+ "learning_rate": 4.658385093167702e-07,
430
+ "loss": 0.838288402557373,
431
+ "step": 590
432
+ },
433
+ {
434
+ "epoch": 1.864024864024864,
435
+ "grad_norm": 15.155320167541504,
436
+ "learning_rate": 3.8819875776397516e-07,
437
+ "loss": 0.4178286552429199,
438
+ "step": 600
439
+ },
440
+ {
441
+ "epoch": 1.895104895104895,
442
+ "grad_norm": 32.92871856689453,
443
+ "learning_rate": 3.1055900621118013e-07,
444
+ "loss": 0.8897994041442872,
445
+ "step": 610
446
+ },
447
+ {
448
+ "epoch": 1.9261849261849262,
449
+ "grad_norm": 6.8692827224731445,
450
+ "learning_rate": 2.329192546583851e-07,
451
+ "loss": 0.6488828659057617,
452
+ "step": 620
453
+ },
454
+ {
455
+ "epoch": 1.9572649572649574,
456
+ "grad_norm": 54.483192443847656,
457
+ "learning_rate": 1.5527950310559006e-07,
458
+ "loss": 1.0308164596557616,
459
+ "step": 630
460
+ },
461
+ {
462
+ "epoch": 1.9883449883449882,
463
+ "grad_norm": 30.52878761291504,
464
+ "learning_rate": 7.763975155279503e-08,
465
+ "loss": 0.7207555294036865,
466
+ "step": 640
467
+ },
468
+ {
469
+ "epoch": 2.0,
470
+ "eval_loss": 0.17823560535907745,
471
+ "eval_runtime": 5.6175,
472
+ "eval_samples_per_second": 203.648,
473
+ "eval_steps_per_second": 25.456,
474
+ "step": 644
475
+ }
476
+ ],
477
+ "logging_steps": 10,
478
+ "max_steps": 644,
479
+ "num_input_tokens_seen": 0,
480
+ "num_train_epochs": 2,
481
+ "save_steps": 500,
482
+ "stateful_callbacks": {
483
+ "TrainerControl": {
484
+ "args": {
485
+ "should_epoch_stop": false,
486
+ "should_evaluate": false,
487
+ "should_log": false,
488
+ "should_save": true,
489
+ "should_training_stop": true
490
+ },
491
+ "attributes": {}
492
+ }
493
+ },
494
+ "total_flos": 777428418602520.0,
495
+ "train_batch_size": 8,
496
+ "trial_name": null,
497
+ "trial_params": null
498
+ }
checkpoint-644/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a44da1fe3b83e5bc0b568a24f212da51163e6e6058c33c08f7252f28612d3574
3
+ size 4792
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": 2,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 768,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 3072,
16
+ "is_decoder": false,
17
+ "layer_norm_eps": 1e-05,
18
+ "max_position_embeddings": 514,
19
+ "model_type": "xlm-roberta",
20
+ "num_attention_heads": 12,
21
+ "num_hidden_layers": 12,
22
+ "output_past": true,
23
+ "pad_token_id": 1,
24
+ "position_embedding_type": "absolute",
25
+ "problem_type": "single_label_classification",
26
+ "tie_word_embeddings": true,
27
+ "transformers_version": "5.1.0",
28
+ "type_vocab_size": 1,
29
+ "use_cache": false,
30
+ "vocab_size": 250002
31
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bd8036f3652c5248472577c5be1472e4d09695a46c48a475306888f0ced9d4a
3
+ size 1112204984
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2464f9721707cb3d5edcf9a3d73454b13e8a7b3bb8fdba94b3de3d843f30e946
3
+ size 16781584
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": true,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "cls_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "is_local": true,
8
+ "mask_token": "<mask>",
9
+ "model_max_length": 512,
10
+ "pad_token": "<pad>",
11
+ "sep_token": "</s>",
12
+ "tokenizer_class": "XLMRobertaTokenizer",
13
+ "unk_token": "<unk>"
14
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a44da1fe3b83e5bc0b568a24f212da51163e6e6058c33c08f7252f28612d3574
3
+ size 4792